nokogumbo 1.1.1 → 1.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +8 -8
- data/README.md +1 -1
- data/gumbo-parser/src/gumbo.h +6 -3
- data/gumbo-parser/src/insertion_mode.h +3 -0
- data/gumbo-parser/src/parser.c +38 -6
- data/gumbo-parser/src/tag.c +6 -3
- data/lib/nokogumbo.rb +2 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,15 +1,15 @@
|
|
1
1
|
---
|
2
2
|
!binary "U0hBMQ==":
|
3
3
|
metadata.gz: !binary |-
|
4
|
-
|
4
|
+
ZmRkOWYwNjM2OTdmNTc1MjIzNjg4MDBhNzJmZmMzNTgwMWNmNjJiNA==
|
5
5
|
data.tar.gz: !binary |-
|
6
|
-
|
6
|
+
ODA3OTkxNDYwZTM3NjI3ZGE1OGYyYjllYWU0ZDk5ODU5M2RlYWZkMA==
|
7
7
|
!binary "U0hBNTEy":
|
8
8
|
metadata.gz: !binary |-
|
9
|
-
|
10
|
-
|
11
|
-
|
9
|
+
ZWJjNGEwMDg2OTVjNjBhYTA5YTM1NTM4ZWNjOTFiN2NjZTM5MmFiYWFmY2U4
|
10
|
+
MDliNGVhMWU3ZWI0ZTMxMjg4ZDRkMTNlMzQ5YzE3ZTg1NGU3MDAyOTcyOWQz
|
11
|
+
ZGYxYjU3YTE4Y2ZmNjNkNDZmN2NlN2Y5Y2MwOWE2MDFiY2E1NzA=
|
12
12
|
data.tar.gz: !binary |-
|
13
|
-
|
14
|
-
|
15
|
-
|
13
|
+
MTU3MjVmNWViNDdiMDJlOWI4MmNiMzU0NTBjZWVkNTg3YWMzZTFlYWYxODVi
|
14
|
+
MzM4YWE4NDdlZjdlNzdlZDc0N2FkMGE2MTY4ZGVmN2ZkYzYzNzdjNTViNDQ3
|
15
|
+
YzNmMzhiODQ0ZGQwMDk3ZDkxMDQ5NTFmNDcyMzNmZmZiOTJiMDc=
|
data/README.md
CHANGED
@@ -34,7 +34,7 @@ Notes
|
|
34
34
|
|
35
35
|
* The `Nokogiri::HTML5.parse` function takes a string and passes it to the
|
36
36
|
<code>gumbo_parse_with_options</code> method, using the default options.
|
37
|
-
The resulting Gumbo parse tree is
|
37
|
+
The resulting Gumbo parse tree is then walked.
|
38
38
|
* If the necessary Nokogiri and [libxml2](http://xmlsoft.org/html/) headers
|
39
39
|
can be found at installation time then an
|
40
40
|
[xmlDoc](http://xmlsoft.org/html/libxml-tree.html#xmlDoc) tree is produced
|
data/gumbo-parser/src/gumbo.h
CHANGED
@@ -164,11 +164,12 @@ typedef enum {
|
|
164
164
|
// http://www.whatwg.org/specs/web-apps/current-work/multipage/scripting-1.html#scripting-1
|
165
165
|
GUMBO_TAG_SCRIPT,
|
166
166
|
GUMBO_TAG_NOSCRIPT,
|
167
|
+
GUMBO_TAG_TEMPLATE,
|
167
168
|
// http://www.whatwg.org/specs/web-apps/current-work/multipage/sections.html#sections
|
168
169
|
GUMBO_TAG_BODY,
|
170
|
+
GUMBO_TAG_ARTICLE,
|
169
171
|
GUMBO_TAG_SECTION,
|
170
172
|
GUMBO_TAG_NAV,
|
171
|
-
GUMBO_TAG_ARTICLE,
|
172
173
|
GUMBO_TAG_ASIDE,
|
173
174
|
GUMBO_TAG_H1,
|
174
175
|
GUMBO_TAG_H2,
|
@@ -193,6 +194,7 @@ typedef enum {
|
|
193
194
|
GUMBO_TAG_DD,
|
194
195
|
GUMBO_TAG_FIGURE,
|
195
196
|
GUMBO_TAG_FIGCAPTION,
|
197
|
+
GUMBO_TAG_MAIN,
|
196
198
|
GUMBO_TAG_DIV,
|
197
199
|
// http://www.whatwg.org/specs/web-apps/current-work/multipage/text-level-semantics.html#text-level-semantics
|
198
200
|
GUMBO_TAG_A,
|
@@ -204,6 +206,7 @@ typedef enum {
|
|
204
206
|
GUMBO_TAG_Q,
|
205
207
|
GUMBO_TAG_DFN,
|
206
208
|
GUMBO_TAG_ABBR,
|
209
|
+
GUMBO_TAG_DATA,
|
207
210
|
GUMBO_TAG_TIME,
|
208
211
|
GUMBO_TAG_CODE,
|
209
212
|
GUMBO_TAG_VAR,
|
@@ -213,6 +216,7 @@ typedef enum {
|
|
213
216
|
GUMBO_TAG_SUP,
|
214
217
|
GUMBO_TAG_I,
|
215
218
|
GUMBO_TAG_B,
|
219
|
+
GUMBO_TAG_U,
|
216
220
|
GUMBO_TAG_MARK,
|
217
221
|
GUMBO_TAG_RUBY,
|
218
222
|
GUMBO_TAG_RT,
|
@@ -284,8 +288,8 @@ typedef enum {
|
|
284
288
|
// http://www.whatwg.org/specs/web-apps/current-work/multipage/interactive-elements.html#interactive-elements
|
285
289
|
GUMBO_TAG_DETAILS,
|
286
290
|
GUMBO_TAG_SUMMARY,
|
287
|
-
GUMBO_TAG_COMMAND,
|
288
291
|
GUMBO_TAG_MENU,
|
292
|
+
GUMBO_TAG_MENUITEM,
|
289
293
|
// Non-conforming elements that nonetheless appear in the HTML5 spec.
|
290
294
|
// http://www.whatwg.org/specs/web-apps/current-work/multipage/obsolete.html#non-conforming-features
|
291
295
|
GUMBO_TAG_APPLET,
|
@@ -313,7 +317,6 @@ typedef enum {
|
|
313
317
|
GUMBO_TAG_NOBR,
|
314
318
|
GUMBO_TAG_SPACER,
|
315
319
|
GUMBO_TAG_TT,
|
316
|
-
GUMBO_TAG_U,
|
317
320
|
// Used for all tags that don't have special handling in HTML.
|
318
321
|
GUMBO_TAG_UNKNOWN,
|
319
322
|
// A marker value to indicate the end of the enum, for iterating over it.
|
@@ -22,6 +22,8 @@ extern "C" {
|
|
22
22
|
#endif
|
23
23
|
|
24
24
|
// http://www.whatwg.org/specs/web-apps/current-work/complete/parsing.html#insertion-mode
|
25
|
+
// If new enum values are added, be sure to update the kTokenHandlers dispatch
|
26
|
+
// table in parser.c.
|
25
27
|
typedef enum {
|
26
28
|
GUMBO_INSERTION_MODE_INITIAL,
|
27
29
|
GUMBO_INSERTION_MODE_BEFORE_HTML,
|
@@ -40,6 +42,7 @@ typedef enum {
|
|
40
42
|
GUMBO_INSERTION_MODE_IN_CELL,
|
41
43
|
GUMBO_INSERTION_MODE_IN_SELECT,
|
42
44
|
GUMBO_INSERTION_MODE_IN_SELECT_IN_TABLE,
|
45
|
+
GUMBO_INSERTION_MODE_IN_TEMPLATE,
|
43
46
|
GUMBO_INSERTION_MODE_AFTER_BODY,
|
44
47
|
GUMBO_INSERTION_MODE_IN_FRAMESET,
|
45
48
|
GUMBO_INSERTION_MODE_AFTER_FRAMESET,
|
data/gumbo-parser/src/parser.c
CHANGED
@@ -354,6 +354,10 @@ typedef struct GumboInternalParserState {
|
|
354
354
|
// http://www.whatwg.org/specs/web-apps/current-work/complete/parsing.html#the-list-of-active-formatting-elements
|
355
355
|
GumboVector /*GumboNode*/ _active_formatting_elements;
|
356
356
|
|
357
|
+
// The stack of template insertion modes.
|
358
|
+
// http://www.whatwg.org/specs/web-apps/current-work/multipage/parsing.html#the-insertion-mode
|
359
|
+
GumboVector /*InsertionMode*/ _template_insertion_modes;
|
360
|
+
|
357
361
|
// http://www.whatwg.org/specs/web-apps/current-work/complete/parsing.html#the-element-pointers
|
358
362
|
GumboNode* _head_element;
|
359
363
|
GumboNode* _form_element;
|
@@ -482,6 +486,7 @@ static void parser_state_init(GumboParser* parser) {
|
|
482
486
|
gumbo_string_buffer_init(parser, &parser_state->_text_node._buffer);
|
483
487
|
gumbo_vector_init(parser, 10, &parser_state->_open_elements);
|
484
488
|
gumbo_vector_init(parser, 5, &parser_state->_active_formatting_elements);
|
489
|
+
gumbo_vector_init(parser, 5, &parser_state->_template_insertion_modes);
|
485
490
|
parser_state->_head_element = NULL;
|
486
491
|
parser_state->_form_element = NULL;
|
487
492
|
parser_state->_current_token = NULL;
|
@@ -494,6 +499,7 @@ static void parser_state_destroy(GumboParser* parser) {
|
|
494
499
|
GumboParserState* state = parser->_parser_state;
|
495
500
|
gumbo_vector_destroy(parser, &state->_active_formatting_elements);
|
496
501
|
gumbo_vector_destroy(parser, &state->_open_elements);
|
502
|
+
gumbo_vector_destroy(parser, &state->_template_insertion_modes);
|
497
503
|
gumbo_string_buffer_destroy(parser, &state->_text_node._buffer);
|
498
504
|
gumbo_parser_deallocate(parser, state);
|
499
505
|
}
|
@@ -531,6 +537,25 @@ static bool is_in_static_list(
|
|
531
537
|
return false;
|
532
538
|
}
|
533
539
|
|
540
|
+
static void push_template_insertion_mode(
|
541
|
+
GumboParser* parser, GumboInsertionMode mode) {
|
542
|
+
gumbo_vector_add(
|
543
|
+
parser, (void*) mode, &parser->_parser_state->_template_insertion_modes);
|
544
|
+
}
|
545
|
+
|
546
|
+
static void pop_template_insertion_mode(GumboParser* parser) {
|
547
|
+
gumbo_vector_pop(parser, &parser->_parser_state->_template_insertion_modes);
|
548
|
+
}
|
549
|
+
|
550
|
+
static GumboInsertionMode get_current_template_insertion_mode(
|
551
|
+
GumboParser* parser) {
|
552
|
+
GumboVector* template_insertion_modes =
|
553
|
+
&parser->_parser_state->_template_insertion_modes;
|
554
|
+
assert(template_insertion_modes->length > 0);
|
555
|
+
return (GumboInsertionMode) template_insertion_modes->data[
|
556
|
+
template_insertion_modes->length - 1];
|
557
|
+
}
|
558
|
+
|
534
559
|
static void set_insertion_mode(GumboParser* parser, GumboInsertionMode mode) {
|
535
560
|
parser->_parser_state->_insertion_mode = mode;
|
536
561
|
}
|
@@ -718,7 +743,7 @@ static bool is_html_integration_point(const GumboNode* node) {
|
|
718
743
|
static void append_node(
|
719
744
|
GumboParser* parser, GumboNode* parent, GumboNode* node) {
|
720
745
|
assert(node->parent == NULL);
|
721
|
-
assert(node->index_within_parent
|
746
|
+
assert(node->index_within_parent == -1);
|
722
747
|
GumboVector* children;
|
723
748
|
if (parent->type == GUMBO_NODE_ELEMENT) {
|
724
749
|
children = &parent->v.element.children;
|
@@ -737,7 +762,7 @@ static void append_node(
|
|
737
762
|
static void insert_node(
|
738
763
|
GumboParser* parser, GumboNode* parent, int index, GumboNode* node) {
|
739
764
|
assert(node->parent == NULL);
|
740
|
-
assert(node->index_within_parent
|
765
|
+
assert(node->index_within_parent == -1);
|
741
766
|
assert(parent->type == GUMBO_NODE_ELEMENT);
|
742
767
|
GumboVector* children = &parent->v.element.children;
|
743
768
|
assert(index >= 0);
|
@@ -1520,7 +1545,7 @@ static bool is_special_node(const GumboNode* node) {
|
|
1520
1545
|
GUMBO_TAG_BASEFONT, GUMBO_TAG_BGSOUND, GUMBO_TAG_BLOCKQUOTE,
|
1521
1546
|
GUMBO_TAG_BODY, GUMBO_TAG_BR, GUMBO_TAG_BUTTON, GUMBO_TAG_CAPTION,
|
1522
1547
|
GUMBO_TAG_CENTER, GUMBO_TAG_COL, GUMBO_TAG_COLGROUP,
|
1523
|
-
|
1548
|
+
GUMBO_TAG_MENUITEM, GUMBO_TAG_DD, GUMBO_TAG_DETAILS, GUMBO_TAG_DIR,
|
1524
1549
|
GUMBO_TAG_DIV, GUMBO_TAG_DL, GUMBO_TAG_DT, GUMBO_TAG_EMBED,
|
1525
1550
|
GUMBO_TAG_FIELDSET, GUMBO_TAG_FIGCAPTION, GUMBO_TAG_FIGURE,
|
1526
1551
|
GUMBO_TAG_FOOTER, GUMBO_TAG_FORM, GUMBO_TAG_FRAME,
|
@@ -2105,7 +2130,7 @@ static bool handle_in_head(GumboParser* parser, GumboToken* token) {
|
|
2105
2130
|
} else if (tag_is(token, kStartTag, GUMBO_TAG_HTML)) {
|
2106
2131
|
return handle_in_body(parser, token);
|
2107
2132
|
} else if (tag_in(token, kStartTag, GUMBO_TAG_BASE, GUMBO_TAG_BASEFONT,
|
2108
|
-
GUMBO_TAG_BGSOUND,
|
2133
|
+
GUMBO_TAG_BGSOUND, GUMBO_TAG_MENUITEM, GUMBO_TAG_LINK,
|
2109
2134
|
GUMBO_TAG_LAST)) {
|
2110
2135
|
insert_element_from_token(parser, token);
|
2111
2136
|
pop_current_node(parser);
|
@@ -2316,7 +2341,7 @@ static bool handle_in_body(GumboParser* parser, GumboToken* token) {
|
|
2316
2341
|
merge_attributes(parser, token, parser->_output->root);
|
2317
2342
|
return false;
|
2318
2343
|
} else if (tag_in(token, kStartTag, GUMBO_TAG_BASE, GUMBO_TAG_BASEFONT,
|
2319
|
-
GUMBO_TAG_BGSOUND,
|
2344
|
+
GUMBO_TAG_BGSOUND, GUMBO_TAG_MENUITEM, GUMBO_TAG_LINK,
|
2320
2345
|
GUMBO_TAG_META, GUMBO_TAG_NOFRAMES, GUMBO_TAG_SCRIPT,
|
2321
2346
|
GUMBO_TAG_STYLE, GUMBO_TAG_TITLE, GUMBO_TAG_LAST)) {
|
2322
2347
|
return handle_in_head(parser, token);
|
@@ -3415,6 +3440,12 @@ static bool handle_in_select_in_table(GumboParser* parser, GumboToken* token) {
|
|
3415
3440
|
}
|
3416
3441
|
}
|
3417
3442
|
|
3443
|
+
// http://www.whatwg.org/specs/web-apps/current-work/multipage/tree-construction.html#parsing-main-intemplate
|
3444
|
+
static bool handle_in_template(GumboParser* parser, GumboToken* token) {
|
3445
|
+
// TODO(jdtang): Implement this.
|
3446
|
+
return true;
|
3447
|
+
}
|
3448
|
+
|
3418
3449
|
// http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#parsing-main-afterbody
|
3419
3450
|
static bool handle_after_body(GumboParser* parser, GumboToken* token) {
|
3420
3451
|
if (token->type == GUMBO_TOKEN_WHITESPACE ||
|
@@ -3586,6 +3617,7 @@ static const TokenHandler kTokenHandlers[] = {
|
|
3586
3617
|
handle_in_cell,
|
3587
3618
|
handle_in_select,
|
3588
3619
|
handle_in_select_in_table,
|
3620
|
+
handle_in_template,
|
3589
3621
|
handle_after_body,
|
3590
3622
|
handle_in_frameset,
|
3591
3623
|
handle_after_frameset,
|
@@ -3594,7 +3626,7 @@ static const TokenHandler kTokenHandlers[] = {
|
|
3594
3626
|
};
|
3595
3627
|
|
3596
3628
|
static bool handle_html_content(GumboParser* parser, GumboToken* token) {
|
3597
|
-
return kTokenHandlers[parser->_parser_state->_insertion_mode](
|
3629
|
+
return kTokenHandlers[(unsigned int) parser->_parser_state->_insertion_mode](
|
3598
3630
|
parser, token);
|
3599
3631
|
}
|
3600
3632
|
|
data/gumbo-parser/src/tag.c
CHANGED
@@ -34,10 +34,11 @@ const char* kGumboTagNames[] = {
|
|
34
34
|
"style",
|
35
35
|
"script",
|
36
36
|
"noscript",
|
37
|
+
"template",
|
37
38
|
"body",
|
39
|
+
"article",
|
38
40
|
"section",
|
39
41
|
"nav",
|
40
|
-
"article",
|
41
42
|
"aside",
|
42
43
|
"h1",
|
43
44
|
"h2",
|
@@ -61,6 +62,7 @@ const char* kGumboTagNames[] = {
|
|
61
62
|
"dd",
|
62
63
|
"figure",
|
63
64
|
"figcaption",
|
65
|
+
"main",
|
64
66
|
"div",
|
65
67
|
"a",
|
66
68
|
"em",
|
@@ -71,6 +73,7 @@ const char* kGumboTagNames[] = {
|
|
71
73
|
"q",
|
72
74
|
"dfn",
|
73
75
|
"abbr",
|
76
|
+
"data",
|
74
77
|
"time",
|
75
78
|
"code",
|
76
79
|
"var",
|
@@ -80,6 +83,7 @@ const char* kGumboTagNames[] = {
|
|
80
83
|
"sup",
|
81
84
|
"i",
|
82
85
|
"b",
|
86
|
+
"u",
|
83
87
|
"mark",
|
84
88
|
"ruby",
|
85
89
|
"rt",
|
@@ -143,8 +147,8 @@ const char* kGumboTagNames[] = {
|
|
143
147
|
"meter",
|
144
148
|
"details",
|
145
149
|
"summary",
|
146
|
-
"command",
|
147
150
|
"menu",
|
151
|
+
"menuitem",
|
148
152
|
"applet",
|
149
153
|
"acronym",
|
150
154
|
"bgsound",
|
@@ -170,7 +174,6 @@ const char* kGumboTagNames[] = {
|
|
170
174
|
"nobr",
|
171
175
|
"spacer",
|
172
176
|
"tt",
|
173
|
-
"u",
|
174
177
|
"", // TAG_UNKNOWN
|
175
178
|
"", // TAG_LAST
|
176
179
|
};
|
data/lib/nokogumbo.rb
CHANGED
@@ -67,7 +67,8 @@ module Nokogiri
|
|
67
67
|
doc
|
68
68
|
when Net::HTTPRedirection
|
69
69
|
response.value if limit <= 1
|
70
|
-
|
70
|
+
location = URI.join(uri, response['location'])
|
71
|
+
get(location, options.merge(:follow_limit => limit-1))
|
71
72
|
else
|
72
73
|
response.value
|
73
74
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: nokogumbo
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.1.1
|
4
|
+
version: 1.1.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Sam Ruby
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2013-
|
11
|
+
date: 2013-10-12 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: nokogiri
|