nokogumbo 1.1.1 → 1.1.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +8 -8
- data/README.md +1 -1
- data/gumbo-parser/src/gumbo.h +6 -3
- data/gumbo-parser/src/insertion_mode.h +3 -0
- data/gumbo-parser/src/parser.c +38 -6
- data/gumbo-parser/src/tag.c +6 -3
- data/lib/nokogumbo.rb +2 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,15 +1,15 @@
|
|
1
1
|
---
|
2
2
|
!binary "U0hBMQ==":
|
3
3
|
metadata.gz: !binary |-
|
4
|
-
|
4
|
+
ZmRkOWYwNjM2OTdmNTc1MjIzNjg4MDBhNzJmZmMzNTgwMWNmNjJiNA==
|
5
5
|
data.tar.gz: !binary |-
|
6
|
-
|
6
|
+
ODA3OTkxNDYwZTM3NjI3ZGE1OGYyYjllYWU0ZDk5ODU5M2RlYWZkMA==
|
7
7
|
!binary "U0hBNTEy":
|
8
8
|
metadata.gz: !binary |-
|
9
|
-
|
10
|
-
|
11
|
-
|
9
|
+
ZWJjNGEwMDg2OTVjNjBhYTA5YTM1NTM4ZWNjOTFiN2NjZTM5MmFiYWFmY2U4
|
10
|
+
MDliNGVhMWU3ZWI0ZTMxMjg4ZDRkMTNlMzQ5YzE3ZTg1NGU3MDAyOTcyOWQz
|
11
|
+
ZGYxYjU3YTE4Y2ZmNjNkNDZmN2NlN2Y5Y2MwOWE2MDFiY2E1NzA=
|
12
12
|
data.tar.gz: !binary |-
|
13
|
-
|
14
|
-
|
15
|
-
|
13
|
+
MTU3MjVmNWViNDdiMDJlOWI4MmNiMzU0NTBjZWVkNTg3YWMzZTFlYWYxODVi
|
14
|
+
MzM4YWE4NDdlZjdlNzdlZDc0N2FkMGE2MTY4ZGVmN2ZkYzYzNzdjNTViNDQ3
|
15
|
+
YzNmMzhiODQ0ZGQwMDk3ZDkxMDQ5NTFmNDcyMzNmZmZiOTJiMDc=
|
data/README.md
CHANGED
@@ -34,7 +34,7 @@ Notes
|
|
34
34
|
|
35
35
|
* The `Nokogiri::HTML5.parse` function takes a string and passes it to the
|
36
36
|
<code>gumbo_parse_with_options</code> method, using the default options.
|
37
|
-
The resulting Gumbo parse tree is
|
37
|
+
The resulting Gumbo parse tree is then walked.
|
38
38
|
* If the necessary Nokogiri and [libxml2](http://xmlsoft.org/html/) headers
|
39
39
|
can be found at installation time then an
|
40
40
|
[xmlDoc](http://xmlsoft.org/html/libxml-tree.html#xmlDoc) tree is produced
|
data/gumbo-parser/src/gumbo.h
CHANGED
@@ -164,11 +164,12 @@ typedef enum {
|
|
164
164
|
// http://www.whatwg.org/specs/web-apps/current-work/multipage/scripting-1.html#scripting-1
|
165
165
|
GUMBO_TAG_SCRIPT,
|
166
166
|
GUMBO_TAG_NOSCRIPT,
|
167
|
+
GUMBO_TAG_TEMPLATE,
|
167
168
|
// http://www.whatwg.org/specs/web-apps/current-work/multipage/sections.html#sections
|
168
169
|
GUMBO_TAG_BODY,
|
170
|
+
GUMBO_TAG_ARTICLE,
|
169
171
|
GUMBO_TAG_SECTION,
|
170
172
|
GUMBO_TAG_NAV,
|
171
|
-
GUMBO_TAG_ARTICLE,
|
172
173
|
GUMBO_TAG_ASIDE,
|
173
174
|
GUMBO_TAG_H1,
|
174
175
|
GUMBO_TAG_H2,
|
@@ -193,6 +194,7 @@ typedef enum {
|
|
193
194
|
GUMBO_TAG_DD,
|
194
195
|
GUMBO_TAG_FIGURE,
|
195
196
|
GUMBO_TAG_FIGCAPTION,
|
197
|
+
GUMBO_TAG_MAIN,
|
196
198
|
GUMBO_TAG_DIV,
|
197
199
|
// http://www.whatwg.org/specs/web-apps/current-work/multipage/text-level-semantics.html#text-level-semantics
|
198
200
|
GUMBO_TAG_A,
|
@@ -204,6 +206,7 @@ typedef enum {
|
|
204
206
|
GUMBO_TAG_Q,
|
205
207
|
GUMBO_TAG_DFN,
|
206
208
|
GUMBO_TAG_ABBR,
|
209
|
+
GUMBO_TAG_DATA,
|
207
210
|
GUMBO_TAG_TIME,
|
208
211
|
GUMBO_TAG_CODE,
|
209
212
|
GUMBO_TAG_VAR,
|
@@ -213,6 +216,7 @@ typedef enum {
|
|
213
216
|
GUMBO_TAG_SUP,
|
214
217
|
GUMBO_TAG_I,
|
215
218
|
GUMBO_TAG_B,
|
219
|
+
GUMBO_TAG_U,
|
216
220
|
GUMBO_TAG_MARK,
|
217
221
|
GUMBO_TAG_RUBY,
|
218
222
|
GUMBO_TAG_RT,
|
@@ -284,8 +288,8 @@ typedef enum {
|
|
284
288
|
// http://www.whatwg.org/specs/web-apps/current-work/multipage/interactive-elements.html#interactive-elements
|
285
289
|
GUMBO_TAG_DETAILS,
|
286
290
|
GUMBO_TAG_SUMMARY,
|
287
|
-
GUMBO_TAG_COMMAND,
|
288
291
|
GUMBO_TAG_MENU,
|
292
|
+
GUMBO_TAG_MENUITEM,
|
289
293
|
// Non-conforming elements that nonetheless appear in the HTML5 spec.
|
290
294
|
// http://www.whatwg.org/specs/web-apps/current-work/multipage/obsolete.html#non-conforming-features
|
291
295
|
GUMBO_TAG_APPLET,
|
@@ -313,7 +317,6 @@ typedef enum {
|
|
313
317
|
GUMBO_TAG_NOBR,
|
314
318
|
GUMBO_TAG_SPACER,
|
315
319
|
GUMBO_TAG_TT,
|
316
|
-
GUMBO_TAG_U,
|
317
320
|
// Used for all tags that don't have special handling in HTML.
|
318
321
|
GUMBO_TAG_UNKNOWN,
|
319
322
|
// A marker value to indicate the end of the enum, for iterating over it.
|
data/gumbo-parser/src/insertion_mode.h
CHANGED
@@ -22,6 +22,8 @@ extern "C" {
|
|
22
22
|
#endif
|
23
23
|
|
24
24
|
// http://www.whatwg.org/specs/web-apps/current-work/complete/parsing.html#insertion-mode
|
25
|
+
// If new enum values are added, be sure to update the kTokenHandlers dispatch
|
26
|
+
// table in parser.c.
|
25
27
|
typedef enum {
|
26
28
|
GUMBO_INSERTION_MODE_INITIAL,
|
27
29
|
GUMBO_INSERTION_MODE_BEFORE_HTML,
|
@@ -40,6 +42,7 @@ typedef enum {
|
|
40
42
|
GUMBO_INSERTION_MODE_IN_CELL,
|
41
43
|
GUMBO_INSERTION_MODE_IN_SELECT,
|
42
44
|
GUMBO_INSERTION_MODE_IN_SELECT_IN_TABLE,
|
45
|
+
GUMBO_INSERTION_MODE_IN_TEMPLATE,
|
43
46
|
GUMBO_INSERTION_MODE_AFTER_BODY,
|
44
47
|
GUMBO_INSERTION_MODE_IN_FRAMESET,
|
45
48
|
GUMBO_INSERTION_MODE_AFTER_FRAMESET,
|
data/gumbo-parser/src/parser.c
CHANGED
@@ -354,6 +354,10 @@ typedef struct GumboInternalParserState {
|
|
354
354
|
// http://www.whatwg.org/specs/web-apps/current-work/complete/parsing.html#the-list-of-active-formatting-elements
|
355
355
|
GumboVector /*GumboNode*/ _active_formatting_elements;
|
356
356
|
|
357
|
+
// The stack of template insertion modes.
|
358
|
+
// http://www.whatwg.org/specs/web-apps/current-work/multipage/parsing.html#the-insertion-mode
|
359
|
+
GumboVector /*InsertionMode*/ _template_insertion_modes;
|
360
|
+
|
357
361
|
// http://www.whatwg.org/specs/web-apps/current-work/complete/parsing.html#the-element-pointers
|
358
362
|
GumboNode* _head_element;
|
359
363
|
GumboNode* _form_element;
|
@@ -482,6 +486,7 @@ static void parser_state_init(GumboParser* parser) {
|
|
482
486
|
gumbo_string_buffer_init(parser, &parser_state->_text_node._buffer);
|
483
487
|
gumbo_vector_init(parser, 10, &parser_state->_open_elements);
|
484
488
|
gumbo_vector_init(parser, 5, &parser_state->_active_formatting_elements);
|
489
|
+
gumbo_vector_init(parser, 5, &parser_state->_template_insertion_modes);
|
485
490
|
parser_state->_head_element = NULL;
|
486
491
|
parser_state->_form_element = NULL;
|
487
492
|
parser_state->_current_token = NULL;
|
@@ -494,6 +499,7 @@ static void parser_state_destroy(GumboParser* parser) {
|
|
494
499
|
GumboParserState* state = parser->_parser_state;
|
495
500
|
gumbo_vector_destroy(parser, &state->_active_formatting_elements);
|
496
501
|
gumbo_vector_destroy(parser, &state->_open_elements);
|
502
|
+
gumbo_vector_destroy(parser, &state->_template_insertion_modes);
|
497
503
|
gumbo_string_buffer_destroy(parser, &state->_text_node._buffer);
|
498
504
|
gumbo_parser_deallocate(parser, state);
|
499
505
|
}
|
@@ -531,6 +537,25 @@ static bool is_in_static_list(
|
|
531
537
|
return false;
|
532
538
|
}
|
533
539
|
|
540
|
+
static void push_template_insertion_mode(
|
541
|
+
GumboParser* parser, GumboInsertionMode mode) {
|
542
|
+
gumbo_vector_add(
|
543
|
+
parser, (void*) mode, &parser->_parser_state->_template_insertion_modes);
|
544
|
+
}
|
545
|
+
|
546
|
+
static void pop_template_insertion_mode(GumboParser* parser) {
|
547
|
+
gumbo_vector_pop(parser, &parser->_parser_state->_template_insertion_modes);
|
548
|
+
}
|
549
|
+
|
550
|
+
static GumboInsertionMode get_current_template_insertion_mode(
|
551
|
+
GumboParser* parser) {
|
552
|
+
GumboVector* template_insertion_modes =
|
553
|
+
&parser->_parser_state->_template_insertion_modes;
|
554
|
+
assert(template_insertion_modes->length > 0);
|
555
|
+
return (GumboInsertionMode) template_insertion_modes->data[
|
556
|
+
template_insertion_modes->length - 1];
|
557
|
+
}
|
558
|
+
|
534
559
|
static void set_insertion_mode(GumboParser* parser, GumboInsertionMode mode) {
|
535
560
|
parser->_parser_state->_insertion_mode = mode;
|
536
561
|
}
|
@@ -718,7 +743,7 @@ static bool is_html_integration_point(const GumboNode* node) {
|
|
718
743
|
static void append_node(
|
719
744
|
GumboParser* parser, GumboNode* parent, GumboNode* node) {
|
720
745
|
assert(node->parent == NULL);
|
721
|
-
assert(node->index_within_parent = -1);
|
746
|
+
assert(node->index_within_parent == -1);
|
722
747
|
GumboVector* children;
|
723
748
|
if (parent->type == GUMBO_NODE_ELEMENT) {
|
724
749
|
children = &parent->v.element.children;
|
@@ -737,7 +762,7 @@ static void append_node(
|
|
737
762
|
static void insert_node(
|
738
763
|
GumboParser* parser, GumboNode* parent, int index, GumboNode* node) {
|
739
764
|
assert(node->parent == NULL);
|
740
|
-
assert(node->index_within_parent = -1);
|
765
|
+
assert(node->index_within_parent == -1);
|
741
766
|
assert(parent->type == GUMBO_NODE_ELEMENT);
|
742
767
|
GumboVector* children = &parent->v.element.children;
|
743
768
|
assert(index >= 0);
|
@@ -1520,7 +1545,7 @@ static bool is_special_node(const GumboNode* node) {
|
|
1520
1545
|
GUMBO_TAG_BASEFONT, GUMBO_TAG_BGSOUND, GUMBO_TAG_BLOCKQUOTE,
|
1521
1546
|
GUMBO_TAG_BODY, GUMBO_TAG_BR, GUMBO_TAG_BUTTON, GUMBO_TAG_CAPTION,
|
1522
1547
|
GUMBO_TAG_CENTER, GUMBO_TAG_COL, GUMBO_TAG_COLGROUP,
|
1523
|
-
GUMBO_TAG_COMMAND, GUMBO_TAG_DD, GUMBO_TAG_DETAILS, GUMBO_TAG_DIR,
|
1548
|
+
GUMBO_TAG_MENUITEM, GUMBO_TAG_DD, GUMBO_TAG_DETAILS, GUMBO_TAG_DIR,
|
1524
1549
|
GUMBO_TAG_DIV, GUMBO_TAG_DL, GUMBO_TAG_DT, GUMBO_TAG_EMBED,
|
1525
1550
|
GUMBO_TAG_FIELDSET, GUMBO_TAG_FIGCAPTION, GUMBO_TAG_FIGURE,
|
1526
1551
|
GUMBO_TAG_FOOTER, GUMBO_TAG_FORM, GUMBO_TAG_FRAME,
|
@@ -2105,7 +2130,7 @@ static bool handle_in_head(GumboParser* parser, GumboToken* token) {
|
|
2105
2130
|
} else if (tag_is(token, kStartTag, GUMBO_TAG_HTML)) {
|
2106
2131
|
return handle_in_body(parser, token);
|
2107
2132
|
} else if (tag_in(token, kStartTag, GUMBO_TAG_BASE, GUMBO_TAG_BASEFONT,
|
2108
|
-
GUMBO_TAG_BGSOUND, GUMBO_TAG_COMMAND, GUMBO_TAG_LINK,
|
2133
|
+
GUMBO_TAG_BGSOUND, GUMBO_TAG_MENUITEM, GUMBO_TAG_LINK,
|
2109
2134
|
GUMBO_TAG_LAST)) {
|
2110
2135
|
insert_element_from_token(parser, token);
|
2111
2136
|
pop_current_node(parser);
|
@@ -2316,7 +2341,7 @@ static bool handle_in_body(GumboParser* parser, GumboToken* token) {
|
|
2316
2341
|
merge_attributes(parser, token, parser->_output->root);
|
2317
2342
|
return false;
|
2318
2343
|
} else if (tag_in(token, kStartTag, GUMBO_TAG_BASE, GUMBO_TAG_BASEFONT,
|
2319
|
-
GUMBO_TAG_BGSOUND, GUMBO_TAG_COMMAND, GUMBO_TAG_LINK,
|
2344
|
+
GUMBO_TAG_BGSOUND, GUMBO_TAG_MENUITEM, GUMBO_TAG_LINK,
|
2320
2345
|
GUMBO_TAG_META, GUMBO_TAG_NOFRAMES, GUMBO_TAG_SCRIPT,
|
2321
2346
|
GUMBO_TAG_STYLE, GUMBO_TAG_TITLE, GUMBO_TAG_LAST)) {
|
2322
2347
|
return handle_in_head(parser, token);
|
@@ -3415,6 +3440,12 @@ static bool handle_in_select_in_table(GumboParser* parser, GumboToken* token) {
|
|
3415
3440
|
}
|
3416
3441
|
}
|
3417
3442
|
|
3443
|
+
// http://www.whatwg.org/specs/web-apps/current-work/multipage/tree-construction.html#parsing-main-intemplate
|
3444
|
+
static bool handle_in_template(GumboParser* parser, GumboToken* token) {
|
3445
|
+
// TODO(jdtang): Implement this.
|
3446
|
+
return true;
|
3447
|
+
}
|
3448
|
+
|
3418
3449
|
// http://www.whatwg.org/specs/web-apps/current-work/complete/tokenization.html#parsing-main-afterbody
|
3419
3450
|
static bool handle_after_body(GumboParser* parser, GumboToken* token) {
|
3420
3451
|
if (token->type == GUMBO_TOKEN_WHITESPACE ||
|
@@ -3586,6 +3617,7 @@ static const TokenHandler kTokenHandlers[] = {
|
|
3586
3617
|
handle_in_cell,
|
3587
3618
|
handle_in_select,
|
3588
3619
|
handle_in_select_in_table,
|
3620
|
+
handle_in_template,
|
3589
3621
|
handle_after_body,
|
3590
3622
|
handle_in_frameset,
|
3591
3623
|
handle_after_frameset,
|
@@ -3594,7 +3626,7 @@ static const TokenHandler kTokenHandlers[] = {
|
|
3594
3626
|
};
|
3595
3627
|
|
3596
3628
|
static bool handle_html_content(GumboParser* parser, GumboToken* token) {
|
3597
|
-
return kTokenHandlers[parser->_parser_state->_insertion_mode](
|
3629
|
+
return kTokenHandlers[(unsigned int) parser->_parser_state->_insertion_mode](
|
3598
3630
|
parser, token);
|
3599
3631
|
}
|
3600
3632
|
|
data/gumbo-parser/src/tag.c
CHANGED
@@ -34,10 +34,11 @@ const char* kGumboTagNames[] = {
|
|
34
34
|
"style",
|
35
35
|
"script",
|
36
36
|
"noscript",
|
37
|
+
"template",
|
37
38
|
"body",
|
39
|
+
"article",
|
38
40
|
"section",
|
39
41
|
"nav",
|
40
|
-
"article",
|
41
42
|
"aside",
|
42
43
|
"h1",
|
43
44
|
"h2",
|
@@ -61,6 +62,7 @@ const char* kGumboTagNames[] = {
|
|
61
62
|
"dd",
|
62
63
|
"figure",
|
63
64
|
"figcaption",
|
65
|
+
"main",
|
64
66
|
"div",
|
65
67
|
"a",
|
66
68
|
"em",
|
@@ -71,6 +73,7 @@ const char* kGumboTagNames[] = {
|
|
71
73
|
"q",
|
72
74
|
"dfn",
|
73
75
|
"abbr",
|
76
|
+
"data",
|
74
77
|
"time",
|
75
78
|
"code",
|
76
79
|
"var",
|
@@ -80,6 +83,7 @@ const char* kGumboTagNames[] = {
|
|
80
83
|
"sup",
|
81
84
|
"i",
|
82
85
|
"b",
|
86
|
+
"u",
|
83
87
|
"mark",
|
84
88
|
"ruby",
|
85
89
|
"rt",
|
@@ -143,8 +147,8 @@ const char* kGumboTagNames[] = {
|
|
143
147
|
"meter",
|
144
148
|
"details",
|
145
149
|
"summary",
|
146
|
-
"command",
|
147
150
|
"menu",
|
151
|
+
"menuitem",
|
148
152
|
"applet",
|
149
153
|
"acronym",
|
150
154
|
"bgsound",
|
@@ -170,7 +174,6 @@ const char* kGumboTagNames[] = {
|
|
170
174
|
"nobr",
|
171
175
|
"spacer",
|
172
176
|
"tt",
|
173
|
-
"u",
|
174
177
|
"", // TAG_UNKNOWN
|
175
178
|
"", // TAG_LAST
|
176
179
|
};
|
data/lib/nokogumbo.rb
CHANGED
@@ -67,7 +67,8 @@ module Nokogiri
|
|
67
67
|
doc
|
68
68
|
when Net::HTTPRedirection
|
69
69
|
response.value if limit <= 1
|
70
|
-
|
70
|
+
location = URI.join(uri, response['location'])
|
71
|
+
get(location, options.merge(:follow_limit => limit-1))
|
71
72
|
else
|
72
73
|
response.value
|
73
74
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: nokogumbo
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.1.1
|
4
|
+
version: 1.1.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Sam Ruby
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2013-
|
11
|
+
date: 2013-10-12 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: nokogiri
|