nokogumbo 1.3.0 → 1.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/README.md +8 -2
- data/ext/nokogumboc/extconf.rb +18 -6
- data/ext/nokogumboc/nokogumbo.c +102 -42
- data/gumbo-parser/src/attribute.c +1 -1
- data/gumbo-parser/src/char_ref.c +37 -67
- data/gumbo-parser/src/char_ref.h +3 -4
- data/gumbo-parser/src/char_ref.rl +6 -1
- data/gumbo-parser/src/error.c +51 -51
- data/gumbo-parser/src/error.h +7 -9
- data/gumbo-parser/src/gumbo.h +45 -181
- data/gumbo-parser/src/parser.c +1439 -1172
- data/gumbo-parser/src/string_buffer.c +14 -10
- data/gumbo-parser/src/string_buffer.h +9 -6
- data/gumbo-parser/src/string_piece.c +5 -6
- data/gumbo-parser/src/string_piece.h +2 -3
- data/gumbo-parser/src/tag.c +36 -166
- data/gumbo-parser/src/tag.in +150 -0
- data/gumbo-parser/src/tag_enum.h +153 -0
- data/gumbo-parser/src/tag_gperf.h +105 -0
- data/gumbo-parser/src/tag_sizes.h +4 -0
- data/gumbo-parser/src/tag_strings.h +153 -0
- data/gumbo-parser/src/token_type.h +1 -0
- data/gumbo-parser/src/tokenizer.c +278 -361
- data/gumbo-parser/src/tokenizer.h +2 -2
- data/gumbo-parser/src/utf8.c +53 -52
- data/gumbo-parser/src/utf8.h +1 -2
- data/gumbo-parser/src/util.c +1 -1
- data/gumbo-parser/src/util.h +0 -2
- data/gumbo-parser/src/vector.c +17 -17
- data/gumbo-parser/src/vector.h +6 -8
- data/gumbo-parser/visualc/include/strings.h +2 -1
- data/lib/nokogumbo.rb +8 -8
- data/test-nokogumbo.rb +190 -0
- metadata +19 -17
@@ -64,8 +64,8 @@ typedef struct GumboInternalToken {
|
|
64
64
|
GumboTokenDocType doc_type;
|
65
65
|
GumboTokenStartTag start_tag;
|
66
66
|
GumboTag end_tag;
|
67
|
-
const char* text;
|
68
|
-
int character;
|
67
|
+
const char* text; // For comments.
|
68
|
+
int character; // For character, whitespace, null, and EOF tokens.
|
69
69
|
} v;
|
70
70
|
} GumboToken;
|
71
71
|
|
data/gumbo-parser/src/utf8.c
CHANGED
@@ -19,7 +19,7 @@
|
|
19
19
|
#include <assert.h>
|
20
20
|
#include <stdint.h>
|
21
21
|
#include <string.h>
|
22
|
-
#include <strings.h>
|
22
|
+
#include <strings.h> // For strncasecmp.
|
23
23
|
|
24
24
|
#include "error.h"
|
25
25
|
#include "gumbo.h"
|
@@ -47,9 +47,11 @@ const int kUtf8ReplacementChar = 0xFFFD;
|
|
47
47
|
|
48
48
|
// Permission is hereby granted, free of charge, to any person obtaining a copy
|
49
49
|
// of this software and associated documentation files (the "Software"), to deal
|
50
|
-
// in the Software without restriction, including without limitation the rights
|
50
|
+
// in the Software without restriction, including without limitation the rights
|
51
|
+
// to
|
51
52
|
// use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
|
52
|
-
// of the Software, and to permit persons to whom the Software is furnished to
|
53
|
+
// of the Software, and to permit persons to whom the Software is furnished to
|
54
|
+
// do
|
53
55
|
// so, subject to the following conditions:
|
54
56
|
|
55
57
|
// The above copyright notice and this permission notice shall be included in
|
@@ -59,32 +61,35 @@ const int kUtf8ReplacementChar = 0xFFFD;
|
|
59
61
|
#define UTF8_REJECT 12
|
60
62
|
|
61
63
|
static const uint8_t utf8d[] = {
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
64
|
+
// The first part of the table maps bytes to character classes
|
65
|
+
// to reduce the size of the transition table and create bitmasks.
|
66
|
+
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
67
|
+
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
68
|
+
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
69
|
+
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
70
|
+
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
|
71
|
+
0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 9, 9, 9, 9, 9, 9,
|
72
|
+
9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
|
73
|
+
7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 8, 8, 2, 2, 2, 2, 2, 2,
|
74
|
+
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 10,
|
75
|
+
3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 3, 3, 11, 6, 6, 6, 5, 8, 8, 8, 8, 8,
|
76
|
+
8, 8, 8, 8, 8, 8,
|
77
|
+
|
78
|
+
// The second part is a transition table that maps a combination
|
79
|
+
// of a state of the automaton and a character class to a state.
|
80
|
+
0, 12, 24, 36, 60, 96, 84, 12, 12, 12, 48, 72, 12, 12, 12, 12, 12, 12, 12,
|
81
|
+
12, 12, 12, 12, 12, 12, 0, 12, 12, 12, 12, 12, 0, 12, 0, 12, 12, 12, 24, 12,
|
82
|
+
12, 12, 12, 12, 24, 12, 24, 12, 12, 12, 12, 12, 12, 12, 12, 12, 24, 12, 12,
|
83
|
+
12, 12, 12, 24, 12, 12, 12, 12, 12, 12, 12, 24, 12, 12, 12, 12, 12, 12, 12,
|
84
|
+
12, 12, 36, 12, 36, 12, 12, 12, 36, 12, 12, 12, 12, 12, 36, 12, 36, 12, 12,
|
85
|
+
12, 36, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
|
80
86
|
};
|
81
87
|
|
82
88
|
uint32_t static inline decode(uint32_t* state, uint32_t* codep, uint32_t byte) {
|
83
89
|
uint32_t type = utf8d[byte];
|
84
90
|
|
85
|
-
*codep = (*state != UTF8_ACCEPT) ?
|
86
|
-
|
87
|
-
(0xff >> type) & (byte);
|
91
|
+
*codep = (*state != UTF8_ACCEPT) ? (byte & 0x3fu) | (*codep << 6)
|
92
|
+
: (0xff >> type) & (byte);
|
88
93
|
|
89
94
|
*state = utf8d[256 + *state + type];
|
90
95
|
return *state;
|
@@ -130,13 +135,13 @@ static void read_char(Utf8Iterator* iter) {
|
|
130
135
|
uint32_t code_point = 0;
|
131
136
|
uint32_t state = UTF8_ACCEPT;
|
132
137
|
for (const char* c = iter->_start; c < iter->_end; ++c) {
|
133
|
-
decode(&state, &code_point, (uint32_t)
|
138
|
+
decode(&state, &code_point, (uint32_t)(unsigned char) (*c));
|
134
139
|
if (state == UTF8_ACCEPT) {
|
135
140
|
iter->_width = c - iter->_start + 1;
|
136
|
-
// This is the special handling for carriage returns that is mandated by
|
137
|
-
// HTML5 spec. Since we're looking for particular 7-bit literal
|
138
|
-
// we operate in terms of chars and only need a check for iter
|
139
|
-
// instead of having to read in a full next code point.
|
141
|
+
// This is the special handling for carriage returns that is mandated by
|
142
|
+
// the HTML5 spec. Since we're looking for particular 7-bit literal
|
143
|
+
// characters, we operate in terms of chars and only need a check for iter
|
144
|
+
// overrun, instead of having to read in a full next code point.
|
140
145
|
// http://www.whatwg.org/specs/web-apps/current-work/multipage/parsing.html#preprocessing-the-input-stream
|
141
146
|
if (code_point == '\r') {
|
142
147
|
assert(iter->_width == 1);
|
@@ -165,10 +170,11 @@ static void read_char(Utf8Iterator* iter) {
|
|
165
170
|
return;
|
166
171
|
}
|
167
172
|
}
|
168
|
-
// If we got here without exiting early, then we've reached the end of the
|
169
|
-
// Add an error for truncated input, set the width to consume the
|
170
|
-
// iterator, and emit a replacement character. The next time we
|
171
|
-
// it will detect that there's no input to consume and
|
173
|
+
// If we got here without exiting early, then we've reached the end of the
|
174
|
+
// iterator. Add an error for truncated input, set the width to consume the
|
175
|
+
// rest of the iterator, and emit a replacement character. The next time we
|
176
|
+
// enter this method, it will detect that there's no input to consume and
|
177
|
+
// output an EOF.
|
172
178
|
iter->_current = kUtf8ReplacementChar;
|
173
179
|
iter->_width = iter->_end - iter->_start;
|
174
180
|
add_error(iter, GUMBO_ERR_UTF8_TRUNCATED);
|
@@ -179,10 +185,10 @@ static void update_position(Utf8Iterator* iter) {
|
|
179
185
|
if (iter->_current == '\n') {
|
180
186
|
++iter->_pos.line;
|
181
187
|
iter->_pos.column = 1;
|
182
|
-
} else if(iter->_current == '\t') {
|
188
|
+
} else if (iter->_current == '\t') {
|
183
189
|
int tab_stop = iter->_parser->_options->tab_stop;
|
184
190
|
iter->_pos.column = ((iter->_pos.column / tab_stop) + 1) * tab_stop;
|
185
|
-
} else if(iter->_current != -1) {
|
191
|
+
} else if (iter->_current != -1) {
|
186
192
|
++iter->_pos.column;
|
187
193
|
}
|
188
194
|
}
|
@@ -191,13 +197,12 @@ static void update_position(Utf8Iterator* iter) {
|
|
191
197
|
// forbidden by the HTML5 spec, such as undefined control chars.
|
192
198
|
bool utf8_is_invalid_code_point(int c) {
|
193
199
|
return (c >= 0x1 && c <= 0x8) || c == 0xB || (c >= 0xE && c <= 0x1F) ||
|
194
|
-
|
195
|
-
|
200
|
+
(c >= 0x7F && c <= 0x9F) || (c >= 0xFDD0 && c <= 0xFDEF) ||
|
201
|
+
((c & 0xFFFF) == 0xFFFE) || ((c & 0xFFFF) == 0xFFFF);
|
196
202
|
}
|
197
203
|
|
198
|
-
void utf8iterator_init(
|
199
|
-
|
200
|
-
Utf8Iterator* iter) {
|
204
|
+
void utf8iterator_init(GumboParser* parser, const char* source,
|
205
|
+
size_t source_length, Utf8Iterator* iter) {
|
201
206
|
iter->_start = source;
|
202
207
|
iter->_end = source + source_length;
|
203
208
|
iter->_pos.line = 1;
|
@@ -215,9 +220,7 @@ void utf8iterator_next(Utf8Iterator* iter) {
|
|
215
220
|
read_char(iter);
|
216
221
|
}
|
217
222
|
|
218
|
-
int utf8iterator_current(const Utf8Iterator* iter) {
|
219
|
-
return iter->_current;
|
220
|
-
}
|
223
|
+
int utf8iterator_current(const Utf8Iterator* iter) { return iter->_current; }
|
221
224
|
|
222
225
|
void utf8iterator_get_position(
|
223
226
|
const Utf8Iterator* iter, GumboSourcePosition* output) {
|
@@ -232,14 +235,13 @@ const char* utf8iterator_get_end_pointer(const Utf8Iterator* iter) {
|
|
232
235
|
return iter->_end;
|
233
236
|
}
|
234
237
|
|
235
|
-
bool utf8iterator_maybe_consume_match(
|
236
|
-
|
237
|
-
|
238
|
-
|
239
|
-
|
240
|
-
!strncasecmp(iter->_start, prefix, length));
|
238
|
+
bool utf8iterator_maybe_consume_match(Utf8Iterator* iter, const char* prefix,
|
239
|
+
size_t length, bool case_sensitive) {
|
240
|
+
bool matched = (iter->_start + length <= iter->_end) &&
|
241
|
+
(case_sensitive ? !strncmp(iter->_start, prefix, length)
|
242
|
+
: !strncasecmp(iter->_start, prefix, length));
|
241
243
|
if (matched) {
|
242
|
-
for (int i = 0; i < length; ++i) {
|
244
|
+
for (unsigned int i = 0; i < length; ++i) {
|
243
245
|
utf8iterator_next(iter);
|
244
246
|
}
|
245
247
|
return true;
|
@@ -262,8 +264,7 @@ void utf8iterator_reset(Utf8Iterator* iter) {
|
|
262
264
|
|
263
265
|
// Sets the position and original text fields of an error to the value at the
|
264
266
|
// mark.
|
265
|
-
void utf8iterator_fill_error_at_mark(
|
266
|
-
Utf8Iterator* iter, GumboError* error) {
|
267
|
+
void utf8iterator_fill_error_at_mark(Utf8Iterator* iter, GumboError* error) {
|
267
268
|
error->position = iter->_mark_pos;
|
268
269
|
error->original_text = iter->_mark;
|
269
270
|
}
|
data/gumbo-parser/src/utf8.h
CHANGED
@@ -81,8 +81,7 @@ bool utf8_is_invalid_code_point(int c);
|
|
81
81
|
|
82
82
|
// Initializes a new Utf8Iterator from the given byte buffer. The source does
|
83
83
|
// not have to be NUL-terminated, but the length must be passed in explicitly.
|
84
|
-
void utf8iterator_init(
|
85
|
-
struct GumboInternalParser* parser, const char* source,
|
84
|
+
void utf8iterator_init(struct GumboInternalParser* parser, const char* source,
|
86
85
|
size_t source_length, Utf8Iterator* iter);
|
87
86
|
|
88
87
|
// Advances the current position by one code point.
|
data/gumbo-parser/src/util.c
CHANGED
@@ -29,7 +29,7 @@
|
|
29
29
|
// TODO(jdtang): This should be elsewhere, but there's no .c file for
|
30
30
|
// SourcePositions and yet the constant needs some linkage, so this is as good
|
31
31
|
// as any.
|
32
|
-
const GumboSourcePosition kGumboEmptySourcePosition = {
|
32
|
+
const GumboSourcePosition kGumboEmptySourcePosition = {0, 0, 0};
|
33
33
|
|
34
34
|
void* gumbo_parser_allocate(GumboParser* parser, size_t num_bytes) {
|
35
35
|
return parser->_options->allocator(parser->_options->userdata, num_bytes);
|
data/gumbo-parser/src/util.h
CHANGED
data/gumbo-parser/src/vector.c
CHANGED
@@ -25,21 +25,22 @@
|
|
25
25
|
|
26
26
|
struct GumboInternalParser;
|
27
27
|
|
28
|
-
const GumboVector kGumboEmptyVector = {
|
28
|
+
const GumboVector kGumboEmptyVector = {NULL, 0, 0};
|
29
29
|
|
30
|
-
void gumbo_vector_init(
|
31
|
-
|
30
|
+
void gumbo_vector_init(struct GumboInternalParser* parser,
|
31
|
+
size_t initial_capacity, GumboVector* vector) {
|
32
32
|
vector->length = 0;
|
33
33
|
vector->capacity = initial_capacity;
|
34
34
|
if (initial_capacity > 0) {
|
35
|
-
vector->data =
|
36
|
-
parser, sizeof(void*) * initial_capacity);
|
35
|
+
vector->data =
|
36
|
+
gumbo_parser_allocate(parser, sizeof(void*) * initial_capacity);
|
37
37
|
} else {
|
38
38
|
vector->data = NULL;
|
39
39
|
}
|
40
40
|
}
|
41
41
|
|
42
|
-
void gumbo_vector_destroy(
|
42
|
+
void gumbo_vector_destroy(
|
43
|
+
struct GumboInternalParser* parser, GumboVector* vector) {
|
43
44
|
if (vector->capacity > 0) {
|
44
45
|
gumbo_parser_deallocate(parser, vector->data);
|
45
46
|
}
|
@@ -59,8 +60,8 @@ static void enlarge_vector_if_full(
|
|
59
60
|
} else {
|
60
61
|
// 0-capacity vector; no previous array to deallocate.
|
61
62
|
vector->capacity = 2;
|
62
|
-
vector->data =
|
63
|
-
parser, sizeof(void*) * vector->capacity);
|
63
|
+
vector->data =
|
64
|
+
gumbo_parser_allocate(parser, sizeof(void*) * vector->capacity);
|
64
65
|
}
|
65
66
|
}
|
66
67
|
}
|
@@ -81,8 +82,8 @@ void* gumbo_vector_pop(
|
|
81
82
|
return vector->data[--vector->length];
|
82
83
|
}
|
83
84
|
|
84
|
-
int gumbo_vector_index_of(GumboVector* vector, void* element) {
|
85
|
-
for (int i = 0; i < vector->length; ++i) {
|
85
|
+
int gumbo_vector_index_of(GumboVector* vector, const void* element) {
|
86
|
+
for (unsigned int i = 0; i < vector->length; ++i) {
|
86
87
|
if (vector->data[i] == element) {
|
87
88
|
return i;
|
88
89
|
}
|
@@ -90,15 +91,14 @@ int gumbo_vector_index_of(GumboVector* vector, void* element) {
|
|
90
91
|
return -1;
|
91
92
|
}
|
92
93
|
|
93
|
-
void gumbo_vector_insert_at(
|
94
|
-
|
95
|
-
GumboVector* vector) {
|
94
|
+
void gumbo_vector_insert_at(struct GumboInternalParser* parser, void* element,
|
95
|
+
unsigned int index, GumboVector* vector) {
|
96
96
|
assert(index >= 0);
|
97
97
|
assert(index <= vector->length);
|
98
98
|
enlarge_vector_if_full(parser, vector);
|
99
99
|
++vector->length;
|
100
100
|
memmove(&vector->data[index + 1], &vector->data[index],
|
101
|
-
|
101
|
+
sizeof(void*) * (vector->length - index - 1));
|
102
102
|
vector->data[index] = element;
|
103
103
|
}
|
104
104
|
|
@@ -111,13 +111,13 @@ void gumbo_vector_remove(
|
|
111
111
|
gumbo_vector_remove_at(parser, index, vector);
|
112
112
|
}
|
113
113
|
|
114
|
-
void* gumbo_vector_remove_at(
|
115
|
-
|
114
|
+
void* gumbo_vector_remove_at(struct GumboInternalParser* parser,
|
115
|
+
unsigned int index, GumboVector* vector) {
|
116
116
|
assert(index >= 0);
|
117
117
|
assert(index < vector->length);
|
118
118
|
void* result = vector->data[index];
|
119
119
|
memmove(&vector->data[index], &vector->data[index + 1],
|
120
|
-
|
120
|
+
sizeof(void*) * (vector->length - index - 1));
|
121
121
|
--vector->length;
|
122
122
|
return result;
|
123
123
|
}
|
data/gumbo-parser/src/vector.h
CHANGED
@@ -28,9 +28,8 @@ extern "C" {
|
|
28
28
|
struct GumboInternalParser;
|
29
29
|
|
30
30
|
// Initializes a new GumboVector with the specified initial capacity.
|
31
|
-
void gumbo_vector_init(
|
32
|
-
|
33
|
-
GumboVector* vector);
|
31
|
+
void gumbo_vector_init(struct GumboInternalParser* parser,
|
32
|
+
size_t initial_capacity, GumboVector* vector);
|
34
33
|
|
35
34
|
// Frees the memory used by a GumboVector. Does not free the contained
|
36
35
|
// pointers.
|
@@ -48,9 +47,8 @@ void* gumbo_vector_pop(struct GumboInternalParser* parser, GumboVector* vector);
|
|
48
47
|
|
49
48
|
// Inserts an element at a specific index. This is potentially O(N) time, but
|
50
49
|
// is necessary for some of the spec's behavior.
|
51
|
-
void gumbo_vector_insert_at(
|
52
|
-
|
53
|
-
GumboVector* vector);
|
50
|
+
void gumbo_vector_insert_at(struct GumboInternalParser* parser, void* element,
|
51
|
+
unsigned int index, GumboVector* vector);
|
54
52
|
|
55
53
|
// Removes an element from the vector, or does nothing if the element is not in
|
56
54
|
// the vector.
|
@@ -59,8 +57,8 @@ void gumbo_vector_remove(
|
|
59
57
|
|
60
58
|
// Removes and returns an element at a specific index. Note that this is
|
61
59
|
// potentially O(N) time and should be used sparingly.
|
62
|
-
void* gumbo_vector_remove_at(
|
63
|
-
|
60
|
+
void* gumbo_vector_remove_at(struct GumboInternalParser* parser,
|
61
|
+
unsigned int index, GumboVector* vector);
|
64
62
|
|
65
63
|
#ifdef __cplusplus
|
66
64
|
}
|
data/lib/nokogumbo.rb
CHANGED
@@ -4,14 +4,14 @@ require 'nokogumboc'
|
|
4
4
|
module Nokogiri
|
5
5
|
# Parse an HTML document. +string+ contains the document. +string+
|
6
6
|
# may also be an IO-like object. Returns a +Nokogiri::HTML::Document+.
|
7
|
-
def self.HTML5(
|
8
|
-
Nokogiri::HTML5.parse(
|
7
|
+
def self.HTML5(*args)
|
8
|
+
Nokogiri::HTML5.parse(*args)
|
9
9
|
end
|
10
10
|
|
11
11
|
module HTML5
|
12
12
|
# Parse an HTML document. +string+ contains the document. +string+
|
13
13
|
# may also be an IO-like object. Returns a +Nokogiri::HTML::Document+.
|
14
|
-
def self.parse(string)
|
14
|
+
def self.parse(string, options={})
|
15
15
|
if string.respond_to? :read
|
16
16
|
string = string.read
|
17
17
|
end
|
@@ -21,7 +21,7 @@ module Nokogiri
|
|
21
21
|
string = reencode(string)
|
22
22
|
end
|
23
23
|
|
24
|
-
Nokogumbo.parse(string.to_s)
|
24
|
+
Nokogumbo.parse(string.to_s, options[:max_parse_errors] || 0)
|
25
25
|
end
|
26
26
|
|
27
27
|
# Fetch and parse a HTML document from the web, following redirects,
|
@@ -67,7 +67,7 @@ module Nokogiri
|
|
67
67
|
|
68
68
|
case response
|
69
69
|
when Net::HTTPSuccess
|
70
|
-
doc = parse(reencode(response.body, response['content-type']))
|
70
|
+
doc = parse(reencode(response.body, response['content-type']), options)
|
71
71
|
doc.instance_variable_set('@response', response)
|
72
72
|
doc.class.send(:attr_reader, :response)
|
73
73
|
doc
|
@@ -83,8 +83,8 @@ module Nokogiri
|
|
83
83
|
# while fragment is on the Gumbo TODO list, simulate it by doing
|
84
84
|
# a full document parse and ignoring the parent <html>, <head>, and <body>
|
85
85
|
# tags, and collecting up the children of each.
|
86
|
-
def self.fragment(
|
87
|
-
doc = parse(
|
86
|
+
def self.fragment(*args)
|
87
|
+
doc = parse(*args)
|
88
88
|
fragment = Nokogiri::HTML::DocumentFragment.new(doc)
|
89
89
|
|
90
90
|
if doc.children.length != 1 or doc.children.first.name != 'html'
|
@@ -151,7 +151,7 @@ module Nokogiri
|
|
151
151
|
|
152
152
|
# look for a charset in a content-encoding header
|
153
153
|
if content_type
|
154
|
-
encoding ||= content_type[/charset=(.*?)(
|
154
|
+
encoding ||= content_type[/charset=["']?(.*?)($|["';\s])/i, 1]
|
155
155
|
end
|
156
156
|
|
157
157
|
# look for a charset in a meta tag in the first 1024 bytes
|
data/test-nokogumbo.rb
ADDED
@@ -0,0 +1,190 @@
|
|
1
|
+
$:.unshift('lib')
|
2
|
+
$:.unshift('ext/nokogumboc')
|
3
|
+
|
4
|
+
gem 'minitest'
|
5
|
+
|
6
|
+
require 'nokogumbo'
|
7
|
+
require 'minitest/autorun'
|
8
|
+
|
9
|
+
class TestNokogumbo < Minitest::Test
|
10
|
+
def test_element_text
|
11
|
+
doc = Nokogiri::HTML5(buffer)
|
12
|
+
assert_equal "content", doc.at('span').text
|
13
|
+
end
|
14
|
+
|
15
|
+
def test_element_cdata_textarea
|
16
|
+
doc = Nokogiri::HTML5(buffer)
|
17
|
+
assert_equal "foo<x>bar", doc.at('textarea').text.strip
|
18
|
+
end
|
19
|
+
|
20
|
+
def test_element_cdata_script
|
21
|
+
doc = Nokogiri::HTML5.fragment(buffer)
|
22
|
+
assert_equal true, doc.document.html?
|
23
|
+
assert_equal "<script> if (a < b) alert(1) </script>", doc.at('script').to_s
|
24
|
+
end
|
25
|
+
|
26
|
+
def test_attr_value
|
27
|
+
doc = Nokogiri::HTML5(buffer)
|
28
|
+
assert_equal "utf-8", doc.at('meta')['charset']
|
29
|
+
end
|
30
|
+
|
31
|
+
def test_comment
|
32
|
+
doc = Nokogiri::HTML5(buffer)
|
33
|
+
assert_equal " test comment ", doc.xpath('//comment()').text
|
34
|
+
end
|
35
|
+
|
36
|
+
def test_unknown_element
|
37
|
+
doc = Nokogiri::HTML5(buffer)
|
38
|
+
assert_equal "main", doc.at('main').name
|
39
|
+
end
|
40
|
+
|
41
|
+
def test_IO
|
42
|
+
require 'stringio'
|
43
|
+
doc = Nokogiri::HTML5(StringIO.new(buffer))
|
44
|
+
assert_equal 'textarea', doc.at('form').element_children.first.name
|
45
|
+
end
|
46
|
+
|
47
|
+
def test_nil
|
48
|
+
doc = Nokogiri::HTML5(nil)
|
49
|
+
assert_equal 1, doc.search('body').count
|
50
|
+
end
|
51
|
+
|
52
|
+
if ''.respond_to? 'encoding'
|
53
|
+
def test_macroman_encoding
|
54
|
+
mac="<span>\xCA</span>".force_encoding('macroman')
|
55
|
+
doc = Nokogiri::HTML5(mac)
|
56
|
+
assert_equal '<span> </span>', doc.at('span').to_xml
|
57
|
+
end
|
58
|
+
|
59
|
+
def test_iso8859_encoding
|
60
|
+
iso8859="<span>Se\xF1or</span>".force_encoding(Encoding::ASCII_8BIT)
|
61
|
+
doc = Nokogiri::HTML5(iso8859)
|
62
|
+
assert_equal '<span>Señor</span>', doc.at('span').to_xml
|
63
|
+
end
|
64
|
+
|
65
|
+
def test_charset_encoding
|
66
|
+
utf8="<meta charset='utf-8'><span>Se\xC3\xB1or</span>".
|
67
|
+
force_encoding(Encoding::ASCII_8BIT)
|
68
|
+
doc = Nokogiri::HTML5(utf8)
|
69
|
+
assert_equal '<span>Señor</span>', doc.at('span').to_xml
|
70
|
+
end
|
71
|
+
|
72
|
+
def test_bogus_encoding
|
73
|
+
bogus="<meta charset='bogus'><span>Se\xF1or</span>".
|
74
|
+
force_encoding(Encoding::ASCII_8BIT)
|
75
|
+
doc = Nokogiri::HTML5(bogus)
|
76
|
+
assert_equal '<span>Señor</span>', doc.at('span').to_xml
|
77
|
+
end
|
78
|
+
end
|
79
|
+
|
80
|
+
def test_html5_doctype
|
81
|
+
doc = Nokogiri::HTML5.parse("<!DOCTYPE html><html></html>")
|
82
|
+
assert_match /<!DOCTYPE html>/, doc.to_html
|
83
|
+
end
|
84
|
+
|
85
|
+
def test_fragment_head
|
86
|
+
doc = Nokogiri::HTML5.fragment(buffer[/<head>(.*?)<\/head>/m, 1])
|
87
|
+
assert_equal "hello world", doc.xpath('title').text
|
88
|
+
assert_equal "utf-8", doc.xpath('meta').first['charset']
|
89
|
+
end
|
90
|
+
|
91
|
+
def test_fragment_body
|
92
|
+
doc = Nokogiri::HTML5.fragment(buffer[/<body>(.*?)<\/body>/m, 1])
|
93
|
+
assert_equal '<span>content</span>', doc.xpath('main/span').to_xml
|
94
|
+
assert_equal " test comment ", doc.xpath('comment()').text
|
95
|
+
end
|
96
|
+
|
97
|
+
def test_xlink_attribute
|
98
|
+
source = <<-EOF.gsub(/^ {6}/, '')
|
99
|
+
<svg xmlns="http://www.w3.org/2000/svg">
|
100
|
+
<a xmlns:xlink="http://www.w3.org/1999/xlink" xlink:href="#s1"/>
|
101
|
+
</svg>
|
102
|
+
EOF
|
103
|
+
doc = Nokogiri::HTML5.fragment(source)
|
104
|
+
a = doc.at('a')
|
105
|
+
assert_equal ["xlink:href", "xmlns:xlink"], a.attributes.keys.sort
|
106
|
+
end
|
107
|
+
|
108
|
+
def test_template
|
109
|
+
source = <<-EOF.gsub(/^ {6}/, '')
|
110
|
+
<template id="productrow">
|
111
|
+
<tr>
|
112
|
+
<td class="record"></td>
|
113
|
+
<td></td>
|
114
|
+
</tr>
|
115
|
+
</template>
|
116
|
+
EOF
|
117
|
+
doc = Nokogiri::HTML5.fragment(source)
|
118
|
+
template = doc.at('template')
|
119
|
+
assert_equal "productrow", template['id']
|
120
|
+
assert_equal "record", template.at('td')['class']
|
121
|
+
end
|
122
|
+
|
123
|
+
def test_root_comments
|
124
|
+
doc = Nokogiri::HTML5("<!DOCTYPE html><!-- start --><html></html><!-- -->")
|
125
|
+
assert_equal ["html", "comment", "html", "comment"], doc.children.map(&:name)
|
126
|
+
end
|
127
|
+
|
128
|
+
def test_parse_errors
|
129
|
+
doc = Nokogiri::HTML5("<!DOCTYPE html><html><!-- -- --></a>", max_parse_errors: 10)
|
130
|
+
assert_equal doc.errors.length, 2
|
131
|
+
doc = Nokogiri::HTML5("<!DOCTYPE html><html>", max_parse_errors: 10)
|
132
|
+
assert_empty doc.errors
|
133
|
+
end
|
134
|
+
|
135
|
+
def test_max_parse_errors
|
136
|
+
# This document contains 2 parse errors, but we force limit to 1.
|
137
|
+
doc = Nokogiri::HTML5("<!DOCTYPE html><html><!-- -- --></a>", max_parse_errors: 1)
|
138
|
+
assert_equal 1, doc.errors.length
|
139
|
+
doc = Nokogiri::HTML5("<!DOCTYPE html><html>", max_parse_errors: 1)
|
140
|
+
assert_empty doc.errors
|
141
|
+
end
|
142
|
+
|
143
|
+
def test_default_max_parse_errors
|
144
|
+
# This document contains 200 parse errors, but default limit is 0.
|
145
|
+
doc = Nokogiri::HTML5("<!DOCTYPE html><html>" + "</p>" * 200)
|
146
|
+
assert_equal 0, doc.errors.length
|
147
|
+
end
|
148
|
+
|
149
|
+
def test_parse_fragment_errors
|
150
|
+
doc = Nokogiri::HTML5.fragment("<\r\n", max_parse_errors: 10)
|
151
|
+
refute_empty doc.errors
|
152
|
+
end
|
153
|
+
|
154
|
+
def test_fragment_max_parse_errors
|
155
|
+
# This fragment contains 3 parse errors, but we force limit to 1.
|
156
|
+
doc = Nokogiri::HTML5.fragment("<!-- -- --></a>", max_parse_errors: 1)
|
157
|
+
assert_equal 1, doc.errors.length
|
158
|
+
end
|
159
|
+
|
160
|
+
def test_fragment_default_max_parse_errors
|
161
|
+
# This fragment contains 201 parse errors, but default limit is 0.
|
162
|
+
doc = Nokogiri::HTML5.fragment("</p>" * 200)
|
163
|
+
assert_equal 0, doc.errors.length
|
164
|
+
end
|
165
|
+
|
166
|
+
private
|
167
|
+
|
168
|
+
def buffer
|
169
|
+
<<-EOF.gsub(/^ /, '')
|
170
|
+
<html>
|
171
|
+
<head>
|
172
|
+
<meta charset="utf-8"/>
|
173
|
+
<title>hello world</title>
|
174
|
+
<script> if (a < b) alert(1) </script>
|
175
|
+
</head>
|
176
|
+
<body>
|
177
|
+
<h1>hello world</h1>
|
178
|
+
<main>
|
179
|
+
<span>content</span>
|
180
|
+
</main>
|
181
|
+
<!-- test comment -->
|
182
|
+
<form>
|
183
|
+
<textarea>foo<x>bar</textarea>
|
184
|
+
</form>
|
185
|
+
</body>
|
186
|
+
</html>
|
187
|
+
EOF
|
188
|
+
end
|
189
|
+
|
190
|
+
end
|