nokogumbo 1.5.0 → 2.0.0.pre.alpha
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +56 -0
- data/README.md +146 -22
- data/ext/nokogumbo/extconf.rb +116 -0
- data/ext/{nokogumboc → nokogumbo}/nokogumbo.c +174 -71
- data/gumbo-parser/src/ascii.c +33 -0
- data/gumbo-parser/src/ascii.h +31 -0
- data/gumbo-parser/src/attribute.c +26 -28
- data/gumbo-parser/src/attribute.h +3 -23
- data/gumbo-parser/src/char_ref.c +135 -2351
- data/gumbo-parser/src/char_ref.h +13 -29
- data/gumbo-parser/src/error.c +215 -133
- data/gumbo-parser/src/error.h +34 -49
- data/gumbo-parser/src/foreign_attrs.c +104 -0
- data/gumbo-parser/src/gumbo.h +506 -304
- data/gumbo-parser/src/insertion_mode.h +4 -28
- data/gumbo-parser/src/macros.h +91 -0
- data/gumbo-parser/src/parser.c +1989 -1431
- data/gumbo-parser/src/parser.h +6 -22
- data/gumbo-parser/src/replacement.h +33 -0
- data/gumbo-parser/src/string_buffer.c +43 -50
- data/gumbo-parser/src/string_buffer.h +24 -40
- data/gumbo-parser/src/string_piece.c +39 -39
- data/gumbo-parser/src/svg_attrs.c +174 -0
- data/gumbo-parser/src/svg_tags.c +137 -0
- data/gumbo-parser/src/tag.c +186 -59
- data/gumbo-parser/src/tag_lookup.c +382 -0
- data/gumbo-parser/src/tag_lookup.h +13 -0
- data/gumbo-parser/src/token_type.h +1 -25
- data/gumbo-parser/src/tokenizer.c +899 -495
- data/gumbo-parser/src/tokenizer.h +37 -37
- data/gumbo-parser/src/tokenizer_states.h +6 -22
- data/gumbo-parser/src/utf8.c +103 -86
- data/gumbo-parser/src/utf8.h +37 -41
- data/gumbo-parser/src/util.c +48 -38
- data/gumbo-parser/src/util.h +10 -40
- data/gumbo-parser/src/vector.c +45 -57
- data/gumbo-parser/src/vector.h +17 -39
- data/lib/nokogumbo.rb +10 -174
- data/lib/nokogumbo/html5.rb +250 -0
- data/lib/nokogumbo/html5/document.rb +37 -0
- data/lib/nokogumbo/html5/document_fragment.rb +46 -0
- data/lib/nokogumbo/version.rb +3 -0
- data/lib/nokogumbo/xml/node.rb +57 -0
- metadata +32 -19
- data/ext/nokogumboc/extconf.rb +0 -60
- data/gumbo-parser/src/char_ref.rl +0 -2554
- data/gumbo-parser/src/string_piece.h +0 -38
- data/gumbo-parser/src/tag.in +0 -150
- data/gumbo-parser/src/tag_enum.h +0 -153
- data/gumbo-parser/src/tag_gperf.h +0 -105
- data/gumbo-parser/src/tag_sizes.h +0 -4
- data/gumbo-parser/src/tag_strings.h +0 -153
- data/gumbo-parser/visualc/include/strings.h +0 -4
- data/test-nokogumbo.rb +0 -190
data/gumbo-parser/src/util.c
CHANGED
@@ -1,58 +1,68 @@
|
|
1
|
-
|
2
|
-
|
3
|
-
|
4
|
-
// you may not use this file except in compliance with the License.
|
5
|
-
// You may obtain a copy of the License at
|
6
|
-
//
|
7
|
-
// http://www.apache.org/licenses/LICENSE-2.0
|
8
|
-
//
|
9
|
-
// Unless required by applicable law or agreed to in writing, software
|
10
|
-
// distributed under the License is distributed on an "AS IS" BASIS,
|
11
|
-
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12
|
-
// See the License for the specific language governing permissions and
|
13
|
-
// limitations under the License.
|
14
|
-
//
|
15
|
-
// Author: jdtang@google.com (Jonathan Tang)
|
1
|
+
/*
|
2
|
+
Copyright 2017-2018 Craig Barnes.
|
3
|
+
Copyright 2010 Google Inc.
|
16
4
|
|
17
|
-
|
5
|
+
Licensed under the Apache License, Version 2.0 (the "License");
|
6
|
+
you may not use this file except in compliance with the License.
|
7
|
+
You may obtain a copy of the License at
|
8
|
+
|
9
|
+
https://www.apache.org/licenses/LICENSE-2.0
|
18
10
|
|
19
|
-
|
11
|
+
Unless required by applicable law or agreed to in writing, software
|
12
|
+
distributed under the License is distributed on an "AS IS" BASIS,
|
13
|
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
14
|
+
See the License for the specific language governing permissions and
|
15
|
+
limitations under the License.
|
16
|
+
*/
|
17
|
+
|
18
|
+
#include <stdio.h>
|
20
19
|
#include <stdlib.h>
|
21
20
|
#include <string.h>
|
22
|
-
#include
|
23
|
-
#include <stdarg.h>
|
24
|
-
#include <stdio.h>
|
25
|
-
|
21
|
+
#include "util.h"
|
26
22
|
#include "gumbo.h"
|
27
|
-
#include "parser.h"
|
28
23
|
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
24
|
+
void* gumbo_alloc(size_t size) {
|
25
|
+
void* ptr = malloc(size);
|
26
|
+
if (unlikely(ptr == NULL)) {
|
27
|
+
perror(__func__);
|
28
|
+
abort();
|
29
|
+
}
|
30
|
+
return ptr;
|
31
|
+
}
|
33
32
|
|
34
|
-
void*
|
35
|
-
|
33
|
+
void* gumbo_realloc(void* ptr, size_t size) {
|
34
|
+
ptr = realloc(ptr, size);
|
35
|
+
if (unlikely(ptr == NULL)) {
|
36
|
+
perror(__func__);
|
37
|
+
abort();
|
38
|
+
}
|
39
|
+
return ptr;
|
36
40
|
}
|
37
41
|
|
38
|
-
void
|
39
|
-
|
42
|
+
void gumbo_free(void* ptr) {
|
43
|
+
free(ptr);
|
40
44
|
}
|
41
45
|
|
42
|
-
char*
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
+
char* gumbo_strdup(const char* str) {
|
47
|
+
const size_t size = strlen(str) + 1;
|
48
|
+
// The strdup(3) function isn't available in strict "-std=c99" mode
|
49
|
+
// (it's part of POSIX, not C99), so use malloc(3) and memcpy(3)
|
50
|
+
// instead:
|
51
|
+
char* buffer = gumbo_alloc(size);
|
52
|
+
return memcpy(buffer, str, size);
|
46
53
|
}
|
47
54
|
|
48
|
-
// Debug function to trace operation of the parser. Pass --copts=-DGUMBO_DEBUG
|
49
|
-
// to use.
|
50
|
-
void gumbo_debug(const char* format, ...) {
|
51
55
|
#ifdef GUMBO_DEBUG
|
56
|
+
#include <stdarg.h>
|
57
|
+
// Debug function to trace operation of the parser
|
58
|
+
// (define GUMBO_DEBUG to use).
|
59
|
+
void gumbo_debug(const char* format, ...) {
|
52
60
|
va_list args;
|
53
61
|
va_start(args, format);
|
54
62
|
vprintf(format, args);
|
55
63
|
va_end(args);
|
56
64
|
fflush(stdout);
|
57
|
-
#endif
|
58
65
|
}
|
66
|
+
#else
|
67
|
+
void gumbo_debug(const char* UNUSED_ARG(format), ...) {}
|
68
|
+
#endif
|
data/gumbo-parser/src/util.h
CHANGED
@@ -1,60 +1,30 @@
|
|
1
|
-
// Copyright 2010 Google Inc. All Rights Reserved.
|
2
|
-
//
|
3
|
-
// Licensed under the Apache License, Version 2.0 (the "License");
|
4
|
-
// you may not use this file except in compliance with the License.
|
5
|
-
// You may obtain a copy of the License at
|
6
|
-
//
|
7
|
-
// http://www.apache.org/licenses/LICENSE-2.0
|
8
|
-
//
|
9
|
-
// Unless required by applicable law or agreed to in writing, software
|
10
|
-
// distributed under the License is distributed on an "AS IS" BASIS,
|
11
|
-
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12
|
-
// See the License for the specific language governing permissions and
|
13
|
-
// limitations under the License.
|
14
|
-
//
|
15
|
-
// Author: jdtang@google.com (Jonathan Tang)
|
16
|
-
//
|
17
|
-
// This contains some utility functions that didn't fit into any of the other
|
18
|
-
// headers.
|
19
|
-
|
20
1
|
#ifndef GUMBO_UTIL_H_
|
21
2
|
#define GUMBO_UTIL_H_
|
22
|
-
|
23
|
-
#define _CRT_SECURE_NO_WARNINGS
|
24
|
-
#endif
|
3
|
+
|
25
4
|
#include <stdbool.h>
|
26
5
|
#include <stddef.h>
|
6
|
+
#include "macros.h"
|
27
7
|
|
28
8
|
#ifdef __cplusplus
|
29
9
|
extern "C" {
|
30
10
|
#endif
|
31
11
|
|
32
|
-
// Forward declaration since it's passed into some of the functions in this
|
33
|
-
// header.
|
34
|
-
struct GumboInternalParser;
|
35
|
-
|
36
12
|
// Utility function for allocating & copying a null-terminated string into a
|
37
|
-
// freshly-allocated buffer.
|
13
|
+
// freshly-allocated buffer. This is necessary for proper memory management; we
|
38
14
|
// have the convention that all const char* in parse tree structures are
|
39
15
|
// freshly-allocated, so if we didn't copy, we'd try to delete a literal string
|
40
16
|
// when the parse tree is destroyed.
|
41
|
-
char*
|
42
|
-
|
43
|
-
// Allocate a chunk of memory, using the allocator specified in the Parser's
|
44
|
-
// config options.
|
45
|
-
void* gumbo_parser_allocate(
|
46
|
-
struct GumboInternalParser* parser, size_t num_bytes);
|
17
|
+
char* gumbo_strdup(const char* str) XMALLOC NONNULL_ARGS;
|
47
18
|
|
48
|
-
|
49
|
-
|
50
|
-
void
|
19
|
+
void* gumbo_alloc(size_t size) XMALLOC;
|
20
|
+
void* gumbo_realloc(void* ptr, size_t size) RETURNS_NONNULL;
|
21
|
+
void gumbo_free(void* ptr);
|
51
22
|
|
52
|
-
// Debug wrapper for printf
|
53
|
-
|
54
|
-
void gumbo_debug(const char* format, ...);
|
23
|
+
// Debug wrapper for printf
|
24
|
+
void gumbo_debug(const char* format, ...) PRINTF(1);
|
55
25
|
|
56
26
|
#ifdef __cplusplus
|
57
27
|
}
|
58
28
|
#endif
|
59
29
|
|
60
|
-
#endif
|
30
|
+
#endif // GUMBO_UTIL_H_
|
data/gumbo-parser/src/vector.c
CHANGED
@@ -1,81 +1,64 @@
|
|
1
|
-
|
2
|
-
|
3
|
-
|
4
|
-
// you may not use this file except in compliance with the License.
|
5
|
-
// You may obtain a copy of the License at
|
6
|
-
//
|
7
|
-
// http://www.apache.org/licenses/LICENSE-2.0
|
8
|
-
//
|
9
|
-
// Unless required by applicable law or agreed to in writing, software
|
10
|
-
// distributed under the License is distributed on an "AS IS" BASIS,
|
11
|
-
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12
|
-
// See the License for the specific language governing permissions and
|
13
|
-
// limitations under the License.
|
14
|
-
//
|
15
|
-
// Author: jdtang@google.com (Jonathan Tang)
|
1
|
+
/*
|
2
|
+
Copyright 2018 Craig Barnes.
|
3
|
+
Copyright 2010 Google Inc.
|
16
4
|
|
17
|
-
|
5
|
+
Licensed under the Apache License, Version 2.0 (the "License");
|
6
|
+
you may not use this file except in compliance with the License.
|
7
|
+
You may obtain a copy of the License at
|
8
|
+
|
9
|
+
https://www.apache.org/licenses/LICENSE-2.0
|
10
|
+
|
11
|
+
Unless required by applicable law or agreed to in writing, software
|
12
|
+
distributed under the License is distributed on an "AS IS" BASIS,
|
13
|
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
14
|
+
See the License for the specific language governing permissions and
|
15
|
+
limitations under the License.
|
16
|
+
*/
|
18
17
|
|
19
18
|
#include <assert.h>
|
20
19
|
#include <stdlib.h>
|
21
20
|
#include <string.h>
|
22
|
-
#include
|
23
|
-
|
21
|
+
#include "vector.h"
|
24
22
|
#include "util.h"
|
25
23
|
|
26
|
-
|
27
|
-
|
28
|
-
const GumboVector kGumboEmptyVector = {NULL, 0, 0};
|
29
|
-
|
30
|
-
void gumbo_vector_init(struct GumboInternalParser* parser,
|
31
|
-
size_t initial_capacity, GumboVector* vector) {
|
24
|
+
void gumbo_vector_init(unsigned int initial_capacity, GumboVector* vector) {
|
32
25
|
vector->length = 0;
|
33
26
|
vector->capacity = initial_capacity;
|
34
27
|
if (initial_capacity > 0) {
|
35
|
-
vector->data =
|
36
|
-
gumbo_parser_allocate(parser, sizeof(void*) * initial_capacity);
|
28
|
+
vector->data = gumbo_alloc(sizeof(void*) * initial_capacity);
|
37
29
|
} else {
|
38
30
|
vector->data = NULL;
|
39
31
|
}
|
40
32
|
}
|
41
33
|
|
42
|
-
void gumbo_vector_destroy(
|
43
|
-
struct GumboInternalParser* parser, GumboVector* vector) {
|
34
|
+
void gumbo_vector_destroy(GumboVector* vector) {
|
44
35
|
if (vector->capacity > 0) {
|
45
|
-
|
36
|
+
gumbo_free(vector->data);
|
46
37
|
}
|
47
38
|
}
|
48
39
|
|
49
|
-
static void enlarge_vector_if_full(
|
50
|
-
struct GumboInternalParser* parser, GumboVector* vector) {
|
40
|
+
static void enlarge_vector_if_full(GumboVector* vector) {
|
51
41
|
if (vector->length >= vector->capacity) {
|
52
42
|
if (vector->capacity) {
|
53
|
-
size_t old_num_bytes = sizeof(void*) * vector->capacity;
|
54
43
|
vector->capacity *= 2;
|
55
44
|
size_t num_bytes = sizeof(void*) * vector->capacity;
|
56
|
-
|
57
|
-
memcpy(temp, vector->data, old_num_bytes);
|
58
|
-
gumbo_parser_deallocate(parser, vector->data);
|
59
|
-
vector->data = temp;
|
45
|
+
vector->data = gumbo_realloc(vector->data, num_bytes);
|
60
46
|
} else {
|
61
47
|
// 0-capacity vector; no previous array to deallocate.
|
62
48
|
vector->capacity = 2;
|
63
|
-
vector->data =
|
64
|
-
gumbo_parser_allocate(parser, sizeof(void*) * vector->capacity);
|
49
|
+
vector->data = gumbo_alloc(sizeof(void*) * vector->capacity);
|
65
50
|
}
|
66
51
|
}
|
67
52
|
}
|
68
53
|
|
69
|
-
void gumbo_vector_add(
|
70
|
-
|
71
|
-
enlarge_vector_if_full(parser, vector);
|
54
|
+
void gumbo_vector_add(void* element, GumboVector* vector) {
|
55
|
+
enlarge_vector_if_full(vector);
|
72
56
|
assert(vector->data);
|
73
57
|
assert(vector->length < vector->capacity);
|
74
58
|
vector->data[vector->length++] = element;
|
75
59
|
}
|
76
60
|
|
77
|
-
void* gumbo_vector_pop(
|
78
|
-
struct GumboInternalParser* parser, GumboVector* vector) {
|
61
|
+
void* gumbo_vector_pop(GumboVector* vector) {
|
79
62
|
if (vector->length == 0) {
|
80
63
|
return NULL;
|
81
64
|
}
|
@@ -91,33 +74,38 @@ int gumbo_vector_index_of(GumboVector* vector, const void* element) {
|
|
91
74
|
return -1;
|
92
75
|
}
|
93
76
|
|
94
|
-
void gumbo_vector_insert_at(
|
95
|
-
|
96
|
-
|
77
|
+
void gumbo_vector_insert_at (
|
78
|
+
void* element,
|
79
|
+
unsigned int index,
|
80
|
+
GumboVector* vector
|
81
|
+
) {
|
97
82
|
assert(index <= vector->length);
|
98
|
-
enlarge_vector_if_full(
|
83
|
+
enlarge_vector_if_full(vector);
|
99
84
|
++vector->length;
|
100
|
-
memmove(
|
101
|
-
|
85
|
+
memmove (
|
86
|
+
&vector->data[index + 1],
|
87
|
+
&vector->data[index],
|
88
|
+
sizeof(void*) * (vector->length - index - 1)
|
89
|
+
);
|
102
90
|
vector->data[index] = element;
|
103
91
|
}
|
104
92
|
|
105
|
-
void gumbo_vector_remove(
|
106
|
-
struct GumboInternalParser* parser, void* node, GumboVector* vector) {
|
93
|
+
void gumbo_vector_remove(void* node, GumboVector* vector) {
|
107
94
|
int index = gumbo_vector_index_of(vector, node);
|
108
95
|
if (index == -1) {
|
109
96
|
return;
|
110
97
|
}
|
111
|
-
gumbo_vector_remove_at(
|
98
|
+
gumbo_vector_remove_at(index, vector);
|
112
99
|
}
|
113
100
|
|
114
|
-
void* gumbo_vector_remove_at(
|
115
|
-
unsigned int index, GumboVector* vector) {
|
116
|
-
assert(index >= 0);
|
101
|
+
void* gumbo_vector_remove_at(unsigned int index, GumboVector* vector) {
|
117
102
|
assert(index < vector->length);
|
118
103
|
void* result = vector->data[index];
|
119
|
-
memmove(
|
120
|
-
|
104
|
+
memmove (
|
105
|
+
&vector->data[index],
|
106
|
+
&vector->data[index + 1],
|
107
|
+
sizeof(void*) * (vector->length - index - 1)
|
108
|
+
);
|
121
109
|
--vector->length;
|
122
110
|
return result;
|
123
111
|
}
|
data/gumbo-parser/src/vector.h
CHANGED
@@ -1,19 +1,3 @@
|
|
1
|
-
// Copyright 2010 Google Inc. All Rights Reserved.
|
2
|
-
//
|
3
|
-
// Licensed under the Apache License, Version 2.0 (the "License");
|
4
|
-
// you may not use this file except in compliance with the License.
|
5
|
-
// You may obtain a copy of the License at
|
6
|
-
//
|
7
|
-
// http://www.apache.org/licenses/LICENSE-2.0
|
8
|
-
//
|
9
|
-
// Unless required by applicable law or agreed to in writing, software
|
10
|
-
// distributed under the License is distributed on an "AS IS" BASIS,
|
11
|
-
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12
|
-
// See the License for the specific language governing permissions and
|
13
|
-
// limitations under the License.
|
14
|
-
//
|
15
|
-
// Author: jdtang@google.com (Jonathan Tang)
|
16
|
-
|
17
1
|
#ifndef GUMBO_VECTOR_H_
|
18
2
|
#define GUMBO_VECTOR_H_
|
19
3
|
|
@@ -23,45 +7,39 @@
|
|
23
7
|
extern "C" {
|
24
8
|
#endif
|
25
9
|
|
26
|
-
// Forward declaration since it's passed into some of the functions in this
|
27
|
-
// header.
|
28
|
-
struct GumboInternalParser;
|
29
|
-
|
30
10
|
// Initializes a new GumboVector with the specified initial capacity.
|
31
|
-
void gumbo_vector_init(
|
32
|
-
size_t initial_capacity, GumboVector* vector);
|
11
|
+
void gumbo_vector_init(unsigned int initial_capacity, GumboVector* vector);
|
33
12
|
|
34
|
-
// Frees the memory used by
|
13
|
+
// Frees the memory used by a GumboVector. Does not free the contained
|
35
14
|
// pointers.
|
36
|
-
void gumbo_vector_destroy(
|
37
|
-
struct GumboInternalParser* parser, GumboVector* vector);
|
15
|
+
void gumbo_vector_destroy(GumboVector* vector);
|
38
16
|
|
39
|
-
// Adds a new element to
|
40
|
-
void gumbo_vector_add(
|
41
|
-
struct GumboInternalParser* parser, void* element, GumboVector* vector);
|
17
|
+
// Adds a new element to a GumboVector.
|
18
|
+
void gumbo_vector_add(void* element, GumboVector* vector);
|
42
19
|
|
43
20
|
// Removes and returns the element most recently added to the GumboVector.
|
44
|
-
// Ownership is transferred to caller.
|
21
|
+
// Ownership is transferred to caller. Capacity is unchanged. If the vector is
|
45
22
|
// empty, NULL is returned.
|
46
|
-
void* gumbo_vector_pop(
|
23
|
+
void* gumbo_vector_pop(GumboVector* vector);
|
47
24
|
|
48
|
-
// Inserts an element at a specific index.
|
25
|
+
// Inserts an element at a specific index. This is potentially O(N) time, but
|
49
26
|
// is necessary for some of the spec's behavior.
|
50
|
-
void gumbo_vector_insert_at(
|
51
|
-
|
27
|
+
void gumbo_vector_insert_at (
|
28
|
+
void* element,
|
29
|
+
unsigned int index,
|
30
|
+
GumboVector* vector
|
31
|
+
);
|
52
32
|
|
53
33
|
// Removes an element from the vector, or does nothing if the element is not in
|
54
34
|
// the vector.
|
55
|
-
void gumbo_vector_remove(
|
56
|
-
struct GumboInternalParser* parser, void* element, GumboVector* vector);
|
35
|
+
void gumbo_vector_remove(void* element, GumboVector* vector);
|
57
36
|
|
58
|
-
// Removes and returns an element at a specific index.
|
37
|
+
// Removes and returns an element at a specific index. Note that this is
|
59
38
|
// potentially O(N) time and should be used sparingly.
|
60
|
-
void* gumbo_vector_remove_at(
|
61
|
-
unsigned int index, GumboVector* vector);
|
39
|
+
void* gumbo_vector_remove_at(unsigned int index, GumboVector* vector);
|
62
40
|
|
63
41
|
#ifdef __cplusplus
|
64
42
|
}
|
65
43
|
#endif
|
66
44
|
|
67
|
-
#endif
|
45
|
+
#endif // GUMBO_VECTOR_H_
|
data/lib/nokogumbo.rb
CHANGED
@@ -1,179 +1,15 @@
|
|
1
1
|
require 'nokogiri'
|
2
|
-
require '
|
2
|
+
require 'nokogumbo/version'
|
3
|
+
require 'nokogumbo/html5'
|
4
|
+
require 'nokogumbo/xml/node.rb'
|
3
5
|
|
4
|
-
|
5
|
-
# Parse an HTML document. +string+ contains the document. +string+
|
6
|
-
# may also be an IO-like object. Returns a +Nokogiri::HTML::Document+.
|
7
|
-
def self.HTML5(*args)
|
8
|
-
Nokogiri::HTML5.parse(*args)
|
9
|
-
end
|
6
|
+
require 'nokogumbo/nokogumbo'
|
10
7
|
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
def self.parse(string, options={})
|
15
|
-
if string.respond_to? :read
|
16
|
-
string = string.read
|
17
|
-
end
|
8
|
+
module Nokogumbo
|
9
|
+
# The default maximum number of errors for parsing a document or a fragment.
|
10
|
+
DEFAULT_MAX_ERRORS = 0
|
18
11
|
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
end
|
23
|
-
|
24
|
-
Nokogumbo.parse(string.to_s, options[:max_parse_errors] || 0)
|
25
|
-
end
|
26
|
-
|
27
|
-
# Fetch and parse a HTML document from the web, following redirects,
|
28
|
-
# handling https, and determining the character encoding using HTML5
|
29
|
-
# rules. +uri+ may be a +String+ or a +URI+. +options+ contains
|
30
|
-
# http headers and special options. Everything which is not a
|
31
|
-
# special option is considered a header. Special options include:
|
32
|
-
# * :follow_limit => number of redirects which are followed
|
33
|
-
# * :basic_auth => [username, password]
|
34
|
-
def self.get(uri, options={})
|
35
|
-
headers = options.clone
|
36
|
-
headers = {:follow_limit => headers} if Numeric === headers # deprecated
|
37
|
-
limit=headers[:follow_limit] ? headers.delete(:follow_limit).to_i : 10
|
38
|
-
|
39
|
-
require 'net/http'
|
40
|
-
uri = URI(uri) unless URI === uri
|
41
|
-
|
42
|
-
http = Net::HTTP.new(uri.host, uri.port)
|
43
|
-
|
44
|
-
# TLS / SSL support
|
45
|
-
http.use_ssl = true if uri.scheme == 'https'
|
46
|
-
|
47
|
-
# Pass through Net::HTTP override values, which currently include:
|
48
|
-
# :ca_file, :ca_path, :cert, :cert_store, :ciphers,
|
49
|
-
# :close_on_empty_response, :continue_timeout, :key, :open_timeout,
|
50
|
-
# :read_timeout, :ssl_timeout, :ssl_version, :use_ssl,
|
51
|
-
# :verify_callback, :verify_depth, :verify_mode
|
52
|
-
options.each do |key, value|
|
53
|
-
http.send "#{key}=", headers.delete(key) if http.respond_to? "#{key}="
|
54
|
-
end
|
55
|
-
|
56
|
-
request = Net::HTTP::Get.new(uri.request_uri)
|
57
|
-
|
58
|
-
# basic authentication
|
59
|
-
auth = headers.delete(:basic_auth)
|
60
|
-
auth ||= [uri.user, uri.password] if uri.user and uri.password
|
61
|
-
request.basic_auth auth.first, auth.last if auth
|
62
|
-
|
63
|
-
# remaining options are treated as headers
|
64
|
-
headers.each {|key, value| request[key.to_s] = value.to_s}
|
65
|
-
|
66
|
-
response = http.request(request)
|
67
|
-
|
68
|
-
case response
|
69
|
-
when Net::HTTPSuccess
|
70
|
-
doc = parse(reencode(response.body, response['content-type']), options)
|
71
|
-
doc.instance_variable_set('@response', response)
|
72
|
-
doc.class.send(:attr_reader, :response)
|
73
|
-
doc
|
74
|
-
when Net::HTTPRedirection
|
75
|
-
response.value if limit <= 1
|
76
|
-
location = URI.join(uri, response['location'])
|
77
|
-
get(location, options.merge(:follow_limit => limit-1))
|
78
|
-
else
|
79
|
-
response.value
|
80
|
-
end
|
81
|
-
end
|
82
|
-
|
83
|
-
# while fragment is on the Gumbo TODO list, simulate it by doing
|
84
|
-
# a full document parse and ignoring the parent <html>, <head>, and <body>
|
85
|
-
# tags, and collecting up the children of each.
|
86
|
-
def self.fragment(*args)
|
87
|
-
doc = parse(*args)
|
88
|
-
fragment = Nokogiri::HTML::DocumentFragment.new(doc)
|
89
|
-
|
90
|
-
if doc.children.length != 1 or doc.children.first.name != 'html'
|
91
|
-
# no HTML? Return document as is
|
92
|
-
fragment = doc
|
93
|
-
else
|
94
|
-
# examine children of HTML element
|
95
|
-
children = doc.children.first.children
|
96
|
-
|
97
|
-
# head is always first. If present, take children but otherwise
|
98
|
-
# ignore the head element
|
99
|
-
if children.length > 0 and doc.children.first.name = 'head'
|
100
|
-
fragment << children.shift.children
|
101
|
-
end
|
102
|
-
|
103
|
-
# body may be next, or last. If found, take children but otherwise
|
104
|
-
# ignore the body element. Also take any remaining elements, taking
|
105
|
-
# care to preserve order.
|
106
|
-
if children.length > 0 and doc.children.first.name = 'body'
|
107
|
-
fragment << children.shift.children
|
108
|
-
fragment << children
|
109
|
-
elsif children.length > 0 and doc.children.last.name = 'body'
|
110
|
-
body = children.pop
|
111
|
-
fragment << children
|
112
|
-
fragment << body.children
|
113
|
-
else
|
114
|
-
fragment << children
|
115
|
-
end
|
116
|
-
end
|
117
|
-
|
118
|
-
# return result
|
119
|
-
fragment
|
120
|
-
end
|
121
|
-
|
122
|
-
private
|
123
|
-
|
124
|
-
# Charset sniffing is a complex and controversial topic that understandably
|
125
|
-
# isn't done _by default_ by the Ruby Net::HTTP library. This being said,
|
126
|
-
# it is a very real problem for consumers of HTML as the default for HTML
|
127
|
-
# is iso-8859-1, most "good" producers use utf-8, and the Gumbo parser
|
128
|
-
# *only* supports utf-8.
|
129
|
-
#
|
130
|
-
# Accordingly, Nokogiri::HTML::Document.parse provides limited encoding
|
131
|
-
# detection. Following this lead, Nokogiri::HTML5 attempts to do likewise,
|
132
|
-
# while attempting to more closely follow the HTML5 standard.
|
133
|
-
#
|
134
|
-
# http://bugs.ruby-lang.org/issues/2567
|
135
|
-
# http://www.w3.org/TR/html5/syntax.html#determining-the-character-encoding
|
136
|
-
#
|
137
|
-
def self.reencode(body, content_type=nil)
|
138
|
-
return body unless body.respond_to? :encoding
|
139
|
-
|
140
|
-
if body.encoding == Encoding::ASCII_8BIT
|
141
|
-
encoding = nil
|
142
|
-
|
143
|
-
# look for a Byte Order Mark (BOM)
|
144
|
-
if body[0..1] == "\xFE\xFF"
|
145
|
-
encoding = 'utf-16be'
|
146
|
-
elsif body[0..1] == "\xFF\xFE"
|
147
|
-
encoding = 'utf-16le'
|
148
|
-
elsif body[0..2] == "\xEF\xBB\xBF"
|
149
|
-
encoding = 'utf-8'
|
150
|
-
end
|
151
|
-
|
152
|
-
# look for a charset in a content-encoding header
|
153
|
-
if content_type
|
154
|
-
encoding ||= content_type[/charset=["']?(.*?)($|["';\s])/i, 1]
|
155
|
-
end
|
156
|
-
|
157
|
-
# look for a charset in a meta tag in the first 1024 bytes
|
158
|
-
if not encoding
|
159
|
-
data = body[0..1023].gsub(/<!--.*?(-->|\Z)/m, '')
|
160
|
-
data.scan(/<meta.*?>/m).each do |meta|
|
161
|
-
encoding ||= meta[/charset=["']?([^>]*?)($|["'\s>])/im, 1]
|
162
|
-
end
|
163
|
-
end
|
164
|
-
|
165
|
-
# if all else fails, default to the official default encoding for HTML
|
166
|
-
encoding ||= Encoding::ISO_8859_1
|
167
|
-
|
168
|
-
# change the encoding to match the detected or inferred encoding
|
169
|
-
begin
|
170
|
-
body.force_encoding(encoding)
|
171
|
-
rescue ArgumentError
|
172
|
-
body.force_encoding(Encoding::ISO_8859_1)
|
173
|
-
end
|
174
|
-
end
|
175
|
-
|
176
|
-
body.encode(Encoding::UTF_8)
|
177
|
-
end
|
178
|
-
end
|
12
|
+
# The default maximum depth of the DOM tree produced by parsing a document
|
13
|
+
# or fragment.
|
14
|
+
DEFAULT_MAX_TREE_DEPTH = 400
|
179
15
|
end
|