nokogumbo 1.4.1 → 1.4.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/ext/nokogumboc/nokogumbo.c +1 -1
- data/gumbo-parser/src/error.c +3 -5
- data/gumbo-parser/src/gumbo.h +170 -36
- data/gumbo-parser/src/parser.c +403 -795
- data/gumbo-parser/src/string_buffer.c +1 -8
- data/gumbo-parser/src/string_buffer.h +0 -5
- data/gumbo-parser/src/tag.c +162 -35
- data/gumbo-parser/src/tokenizer.c +18 -13
- data/gumbo-parser/src/vector.c +1 -1
- data/test-nokogumbo.rb +1 -1
- metadata +15 -24
- data/gumbo-parser/src/tag.in +0 -150
- data/gumbo-parser/src/tag_enum.h +0 -150
- data/gumbo-parser/src/tag_gperf.h +0 -343
- data/gumbo-parser/src/tag_sizes.h +0 -1
- data/gumbo-parser/src/tag_strings.h +0 -150
data/gumbo-parser/src/string_buffer.c
CHANGED
@@ -26,9 +26,7 @@
|
|
26
26
|
|
27
27
|
struct GumboInternalParser;
|
28
28
|
|
29
|
-
|
30
|
-
// 99% of text nodes and 98% of attribute names/values fit in this initial size.
|
31
|
-
static const size_t kDefaultStringBufferSize = 5;
|
29
|
+
static const size_t kDefaultStringBufferSize = 10;
|
32
30
|
|
33
31
|
static void maybe_resize_string_buffer(
|
34
32
|
struct GumboInternalParser* parser, size_t additional_chars,
|
@@ -102,11 +100,6 @@ char* gumbo_string_buffer_to_string(
|
|
102
100
|
return buffer;
|
103
101
|
}
|
104
102
|
|
105
|
-
void gumbo_string_buffer_clear(
|
106
|
-
struct GumboInternalParser* parser, GumboStringBuffer* input) {
|
107
|
-
input->length = 0;
|
108
|
-
}
|
109
|
-
|
110
103
|
void gumbo_string_buffer_destroy(
|
111
104
|
struct GumboInternalParser* parser, GumboStringBuffer* buffer) {
|
112
105
|
gumbo_parser_deallocate(parser, buffer->data);
|
data/gumbo-parser/src/string_buffer.h
CHANGED
@@ -70,11 +70,6 @@ void gumbo_string_buffer_append_string(
|
|
70
70
|
char* gumbo_string_buffer_to_string(
|
71
71
|
struct GumboInternalParser* parser, GumboStringBuffer* input);
|
72
72
|
|
73
|
-
// Reinitialize this string buffer. This clears it by setting length=0. It
|
74
|
-
// does not zero out the buffer itself.
|
75
|
-
void gumbo_string_buffer_clear(
|
76
|
-
struct GumboInternalParser* parser, GumboStringBuffer* input);
|
77
|
-
|
78
73
|
// Deallocates this GumboStringBuffer.
|
79
74
|
void gumbo_string_buffer_destroy(
|
80
75
|
struct GumboInternalParser* parser, GumboStringBuffer* buffer);
|
data/gumbo-parser/src/tag.c
CHANGED
@@ -18,25 +18,172 @@
|
|
18
18
|
|
19
19
|
#include <assert.h>
|
20
20
|
#include <ctype.h>
|
21
|
-
#include <
|
21
|
+
#include <strings.h> // For strcasecmp.
|
22
22
|
|
23
|
+
// NOTE(jdtang): Keep this in sync with the GumboTag enum in the header.
|
24
|
+
// TODO(jdtang): Investigate whether there're efficiency benefits to putting the
|
25
|
+
// most common tag names first, or to putting them in alphabetical order and
|
26
|
+
// using a binary search.
|
23
27
|
const char* kGumboTagNames[] = {
|
24
|
-
|
28
|
+
"html",
|
29
|
+
"head",
|
30
|
+
"title",
|
31
|
+
"base",
|
32
|
+
"link",
|
33
|
+
"meta",
|
34
|
+
"style",
|
35
|
+
"script",
|
36
|
+
"noscript",
|
37
|
+
"template",
|
38
|
+
"body",
|
39
|
+
"article",
|
40
|
+
"section",
|
41
|
+
"nav",
|
42
|
+
"aside",
|
43
|
+
"h1",
|
44
|
+
"h2",
|
45
|
+
"h3",
|
46
|
+
"h4",
|
47
|
+
"h5",
|
48
|
+
"h6",
|
49
|
+
"hgroup",
|
50
|
+
"header",
|
51
|
+
"footer",
|
52
|
+
"address",
|
53
|
+
"p",
|
54
|
+
"hr",
|
55
|
+
"pre",
|
56
|
+
"blockquote",
|
57
|
+
"ol",
|
58
|
+
"ul",
|
59
|
+
"li",
|
60
|
+
"dl",
|
61
|
+
"dt",
|
62
|
+
"dd",
|
63
|
+
"figure",
|
64
|
+
"figcaption",
|
65
|
+
"main",
|
66
|
+
"div",
|
67
|
+
"a",
|
68
|
+
"em",
|
69
|
+
"strong",
|
70
|
+
"small",
|
71
|
+
"s",
|
72
|
+
"cite",
|
73
|
+
"q",
|
74
|
+
"dfn",
|
75
|
+
"abbr",
|
76
|
+
"data",
|
77
|
+
"time",
|
78
|
+
"code",
|
79
|
+
"var",
|
80
|
+
"samp",
|
81
|
+
"kbd",
|
82
|
+
"sub",
|
83
|
+
"sup",
|
84
|
+
"i",
|
85
|
+
"b",
|
86
|
+
"u",
|
87
|
+
"mark",
|
88
|
+
"ruby",
|
89
|
+
"rt",
|
90
|
+
"rp",
|
91
|
+
"bdi",
|
92
|
+
"bdo",
|
93
|
+
"span",
|
94
|
+
"br",
|
95
|
+
"wbr",
|
96
|
+
"ins",
|
97
|
+
"del",
|
98
|
+
"image",
|
99
|
+
"img",
|
100
|
+
"iframe",
|
101
|
+
"embed",
|
102
|
+
"object",
|
103
|
+
"param",
|
104
|
+
"video",
|
105
|
+
"audio",
|
106
|
+
"source",
|
107
|
+
"track",
|
108
|
+
"canvas",
|
109
|
+
"map",
|
110
|
+
"area",
|
111
|
+
"math",
|
112
|
+
"mi",
|
113
|
+
"mo",
|
114
|
+
"mn",
|
115
|
+
"ms",
|
116
|
+
"mtext",
|
117
|
+
"mglyph",
|
118
|
+
"malignmark",
|
119
|
+
"annotation-xml",
|
120
|
+
"svg",
|
121
|
+
"foreignobject",
|
122
|
+
"desc",
|
123
|
+
"table",
|
124
|
+
"caption",
|
125
|
+
"colgroup",
|
126
|
+
"col",
|
127
|
+
"tbody",
|
128
|
+
"thead",
|
129
|
+
"tfoot",
|
130
|
+
"tr",
|
131
|
+
"td",
|
132
|
+
"th",
|
133
|
+
"form",
|
134
|
+
"fieldset",
|
135
|
+
"legend",
|
136
|
+
"label",
|
137
|
+
"input",
|
138
|
+
"button",
|
139
|
+
"select",
|
140
|
+
"datalist",
|
141
|
+
"optgroup",
|
142
|
+
"option",
|
143
|
+
"textarea",
|
144
|
+
"keygen",
|
145
|
+
"output",
|
146
|
+
"progress",
|
147
|
+
"meter",
|
148
|
+
"details",
|
149
|
+
"summary",
|
150
|
+
"menu",
|
151
|
+
"menuitem",
|
152
|
+
"applet",
|
153
|
+
"acronym",
|
154
|
+
"bgsound",
|
155
|
+
"dir",
|
156
|
+
"frame",
|
157
|
+
"frameset",
|
158
|
+
"noframes",
|
159
|
+
"isindex",
|
160
|
+
"listing",
|
161
|
+
"xmp",
|
162
|
+
"nextid",
|
163
|
+
"noembed",
|
164
|
+
"plaintext",
|
165
|
+
"rb",
|
166
|
+
"strike",
|
167
|
+
"basefont",
|
168
|
+
"big",
|
169
|
+
"blink",
|
170
|
+
"center",
|
171
|
+
"font",
|
172
|
+
"marquee",
|
173
|
+
"multicol",
|
174
|
+
"nobr",
|
175
|
+
"spacer",
|
176
|
+
"tt",
|
25
177
|
"", // TAG_UNKNOWN
|
26
178
|
"", // TAG_LAST
|
27
179
|
};
|
28
180
|
|
29
|
-
static const unsigned char kGumboTagSizes[] = {
|
30
|
-
# include "tag_sizes.h"
|
31
|
-
0, // TAG_UNKNOWN
|
32
|
-
0, // TAG_LAST
|
33
|
-
};
|
34
|
-
|
35
181
|
const char* gumbo_normalized_tagname(GumboTag tag) {
|
36
182
|
assert(tag <= GUMBO_TAG_LAST);
|
37
183
|
return kGumboTagNames[tag];
|
38
184
|
}
|
39
185
|
|
186
|
+
// TODO(jdtang): Add test for this.
|
40
187
|
void gumbo_tag_from_original_text(GumboStringPiece* text) {
|
41
188
|
if (text->data == NULL) {
|
42
189
|
return;
|
@@ -65,34 +212,14 @@ void gumbo_tag_from_original_text(GumboStringPiece* text) {
|
|
65
212
|
}
|
66
213
|
}
|
67
214
|
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
return (int)c1 - (int)c2;
|
76
|
-
}
|
77
|
-
return 0;
|
78
|
-
}
|
79
|
-
|
80
|
-
#include "tag_gperf.h"
|
81
|
-
#define TAG_MAP_SIZE (sizeof(kGumboTagMap)/sizeof(kGumboTagMap[0]))
|
82
|
-
|
83
|
-
GumboTag gumbo_tagn_enum(const char* tagname, unsigned int length) {
|
84
|
-
if (length) {
|
85
|
-
unsigned int key = tag_hash(tagname, length);
|
86
|
-
if (key < TAG_MAP_SIZE) {
|
87
|
-
GumboTag tag = kGumboTagMap[key];
|
88
|
-
if (length == kGumboTagSizes[(int)tag] &&
|
89
|
-
!case_memcmp(tagname, kGumboTagNames[(int)tag], length))
|
90
|
-
return tag;
|
215
|
+
GumboTag gumbo_tag_enum(const char* tagname) {
|
216
|
+
for (int i = 0; i < GUMBO_TAG_LAST; ++i) {
|
217
|
+
// TODO(jdtang): strcasecmp is non-portable, so if we want to support
|
218
|
+
// non-GCC compilers, we'll need some #ifdef magic. This source already has
|
219
|
+
// pretty significant issues with MSVC6 anyway.
|
220
|
+
if (strcasecmp(tagname, kGumboTagNames[i]) == 0) {
|
221
|
+
return i;
|
91
222
|
}
|
92
223
|
}
|
93
224
|
return GUMBO_TAG_UNKNOWN;
|
94
225
|
}
|
95
|
-
|
96
|
-
GumboTag gumbo_tag_enum(const char* tagname) {
|
97
|
-
return gumbo_tagn_enum(tagname, strlen(tagname));
|
98
|
-
}
|
data/gumbo-parser/src/tokenizer.c
CHANGED
@@ -356,10 +356,12 @@ static void clear_temporary_buffer(GumboParser* parser) {
|
|
356
356
|
GumboTokenizerState* tokenizer = parser->_tokenizer_state;
|
357
357
|
assert(!tokenizer->_temporary_buffer_emit);
|
358
358
|
utf8iterator_mark(&tokenizer->_input);
|
359
|
-
|
359
|
+
gumbo_string_buffer_destroy(parser, &tokenizer->_temporary_buffer);
|
360
|
+
gumbo_string_buffer_init(parser, &tokenizer->_temporary_buffer);
|
360
361
|
// The temporary buffer and script data buffer are the same object in the
|
361
362
|
// spec, so the script data buffer should be cleared as well.
|
362
|
-
|
363
|
+
gumbo_string_buffer_destroy(parser, &tokenizer->_script_data_buffer);
|
364
|
+
gumbo_string_buffer_init(parser, &tokenizer->_script_data_buffer);
|
363
365
|
}
|
364
366
|
|
365
367
|
// Appends a codepoint to the temporary buffer.
|
@@ -695,11 +697,7 @@ static void start_new_tag(GumboParser* parser, bool is_start_tag) {
|
|
695
697
|
gumbo_string_buffer_append_codepoint(parser, c, &tag_state->_buffer);
|
696
698
|
|
697
699
|
assert(tag_state->_attributes.data == NULL);
|
698
|
-
|
699
|
-
// 99.5% of elements have 0 attributes, 93% of the remainder have 1. These
|
700
|
-
// numbers are a bit higher for more modern websites (eg. ~45% = 0, ~40% = 1
|
701
|
-
// for the HTML5 Spec), but still have basically 99% of nodes with <= 2 attrs.
|
702
|
-
gumbo_vector_init(parser, 1, &tag_state->_attributes);
|
700
|
+
gumbo_vector_init(parser, 4, &tag_state->_attributes);
|
703
701
|
tag_state->_drop_next_attr_value = false;
|
704
702
|
tag_state->_is_start_tag = is_start_tag;
|
705
703
|
tag_state->_is_self_closing = false;
|
@@ -753,9 +751,11 @@ static void finish_tag_name(GumboParser* parser) {
|
|
753
751
|
GumboTokenizerState* tokenizer = parser->_tokenizer_state;
|
754
752
|
GumboTagState* tag_state = &tokenizer->_tag_state;
|
755
753
|
|
756
|
-
|
757
|
-
|
754
|
+
const char* temp;
|
755
|
+
copy_over_tag_buffer(parser, &temp);
|
756
|
+
tag_state->_tag = gumbo_tag_enum(temp);
|
758
757
|
reinitialize_tag_buffer(parser);
|
758
|
+
gumbo_parser_deallocate(parser, (void*) temp);
|
759
759
|
}
|
760
760
|
|
761
761
|
// Adds an ERR_DUPLICATE_ATTR parse error to the parser's error struct.
|
@@ -841,9 +841,13 @@ static void finish_attribute_value(GumboParser* parser) {
|
|
841
841
|
static bool is_appropriate_end_tag(GumboParser* parser) {
|
842
842
|
GumboTagState* tag_state = &parser->_tokenizer_state->_tag_state;
|
843
843
|
assert(!tag_state->_is_start_tag);
|
844
|
+
// Null terminate the current string buffer, so it can be passed to
|
845
|
+
// gumbo_tag_enum, but don't increment the length in case we need to dump the
|
846
|
+
// buffer as character tokens.
|
847
|
+
gumbo_string_buffer_append_codepoint(parser, '\0', &tag_state->_buffer);
|
848
|
+
--tag_state->_buffer.length;
|
844
849
|
return tag_state->_last_start_tag != GUMBO_TAG_LAST &&
|
845
|
-
tag_state->_last_start_tag ==
|
846
|
-
gumbo_tagn_enum(tag_state->_buffer.data, tag_state->_buffer.length);
|
850
|
+
tag_state->_last_start_tag == gumbo_tag_enum(tag_state->_buffer.data);
|
847
851
|
}
|
848
852
|
|
849
853
|
void gumbo_tokenizer_state_init(
|
@@ -1593,7 +1597,8 @@ static StateResult handle_script_double_escaped_lt_state(
|
|
1593
1597
|
int c, GumboToken* output) {
|
1594
1598
|
if (c == '/') {
|
1595
1599
|
gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_END);
|
1596
|
-
|
1600
|
+
gumbo_string_buffer_destroy(parser, &tokenizer->_script_data_buffer);
|
1601
|
+
gumbo_string_buffer_init(parser, &tokenizer->_script_data_buffer);
|
1597
1602
|
return emit_current_char(parser, output);
|
1598
1603
|
} else {
|
1599
1604
|
gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED);
|
@@ -2819,7 +2824,7 @@ static StateResult handle_cdata_state(
|
|
2819
2824
|
tokenizer->_reconsume_current_input = true;
|
2820
2825
|
reset_token_start_point(tokenizer);
|
2821
2826
|
gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
|
2822
|
-
tokenizer->_is_in_cdata =
|
2827
|
+
tokenizer->_is_in_cdata = true;
|
2823
2828
|
return NEXT_CHAR;
|
2824
2829
|
} else {
|
2825
2830
|
return emit_current_char(parser, output);
|
data/gumbo-parser/src/vector.c
CHANGED
@@ -81,7 +81,7 @@ void* gumbo_vector_pop(
|
|
81
81
|
return vector->data[--vector->length];
|
82
82
|
}
|
83
83
|
|
84
|
-
int gumbo_vector_index_of(GumboVector* vector,
|
84
|
+
int gumbo_vector_index_of(GumboVector* vector, void* element) {
|
85
85
|
for (int i = 0; i < vector->length; ++i) {
|
86
86
|
if (vector->data[i] == element) {
|
87
87
|
return i;
|
data/test-nokogumbo.rb
CHANGED
metadata
CHANGED
@@ -1,32 +1,29 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: nokogumbo
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
|
5
|
-
version: 1.4.1
|
4
|
+
version: 1.4.2
|
6
5
|
platform: ruby
|
7
6
|
authors:
|
8
7
|
- Sam Ruby
|
9
8
|
autorequire:
|
10
9
|
bindir: bin
|
11
10
|
cert_chain: []
|
12
|
-
date: 2015-
|
11
|
+
date: 2015-05-12 00:00:00.000000000 Z
|
13
12
|
dependencies:
|
14
13
|
- !ruby/object:Gem::Dependency
|
15
|
-
|
14
|
+
name: nokogiri
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
16
|
requirements:
|
17
|
-
- -
|
17
|
+
- - ">="
|
18
18
|
- !ruby/object:Gem::Version
|
19
19
|
version: '0'
|
20
|
-
none: false
|
21
20
|
type: :runtime
|
22
|
-
name: nokogiri
|
23
21
|
prerelease: false
|
24
|
-
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
25
23
|
requirements:
|
26
|
-
- -
|
24
|
+
- - ">="
|
27
25
|
- !ruby/object:Gem::Version
|
28
26
|
version: '0'
|
29
|
-
none: false
|
30
27
|
description: Nokogumbo allows a Ruby program to invoke the Gumbo HTML5 parser and
|
31
28
|
access the result as a Nokogiri parsed document.
|
32
29
|
email: rubys@intertwingly.net
|
@@ -35,11 +32,10 @@ extensions:
|
|
35
32
|
- ext/nokogumboc/extconf.rb
|
36
33
|
extra_rdoc_files: []
|
37
34
|
files:
|
38
|
-
- ext/nokogumboc/extconf.rb
|
39
|
-
- ext/nokogumboc/nokogumbo.c
|
40
|
-
- lib/nokogumbo.rb
|
41
35
|
- LICENSE.txt
|
42
36
|
- README.md
|
37
|
+
- ext/nokogumboc/extconf.rb
|
38
|
+
- ext/nokogumboc/nokogumbo.c
|
43
39
|
- gumbo-parser/src/attribute.c
|
44
40
|
- gumbo-parser/src/attribute.h
|
45
41
|
- gumbo-parser/src/char_ref.c
|
@@ -56,11 +52,6 @@ files:
|
|
56
52
|
- gumbo-parser/src/string_piece.c
|
57
53
|
- gumbo-parser/src/string_piece.h
|
58
54
|
- gumbo-parser/src/tag.c
|
59
|
-
- gumbo-parser/src/tag.in
|
60
|
-
- gumbo-parser/src/tag_enum.h
|
61
|
-
- gumbo-parser/src/tag_gperf.h
|
62
|
-
- gumbo-parser/src/tag_sizes.h
|
63
|
-
- gumbo-parser/src/tag_strings.h
|
64
55
|
- gumbo-parser/src/token_type.h
|
65
56
|
- gumbo-parser/src/tokenizer.c
|
66
57
|
- gumbo-parser/src/tokenizer.h
|
@@ -72,30 +63,30 @@ files:
|
|
72
63
|
- gumbo-parser/src/vector.c
|
73
64
|
- gumbo-parser/src/vector.h
|
74
65
|
- gumbo-parser/visualc/include/strings.h
|
66
|
+
- lib/nokogumbo.rb
|
75
67
|
- test-nokogumbo.rb
|
76
68
|
homepage: https://github.com/rubys/nokogumbo/#readme
|
77
69
|
licenses:
|
78
70
|
- Apache 2.0
|
71
|
+
metadata: {}
|
79
72
|
post_install_message:
|
80
73
|
rdoc_options: []
|
81
74
|
require_paths:
|
82
75
|
- lib
|
83
76
|
required_ruby_version: !ruby/object:Gem::Requirement
|
84
77
|
requirements:
|
85
|
-
- -
|
78
|
+
- - ">="
|
86
79
|
- !ruby/object:Gem::Version
|
87
80
|
version: '0'
|
88
|
-
none: false
|
89
81
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
90
82
|
requirements:
|
91
|
-
- -
|
83
|
+
- - ">="
|
92
84
|
- !ruby/object:Gem::Version
|
93
85
|
version: '0'
|
94
|
-
none: false
|
95
86
|
requirements: []
|
96
87
|
rubyforge_project:
|
97
|
-
rubygems_version:
|
88
|
+
rubygems_version: 2.4.5
|
98
89
|
signing_key:
|
99
|
-
specification_version:
|
90
|
+
specification_version: 4
|
100
91
|
summary: Nokogiri interface to the Gumbo HTML5 parser
|
101
92
|
test_files: []
|