nokogumbo 1.4.1 → 1.4.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/ext/nokogumboc/nokogumbo.c +1 -1
- data/gumbo-parser/src/error.c +3 -5
- data/gumbo-parser/src/gumbo.h +170 -36
- data/gumbo-parser/src/parser.c +403 -795
- data/gumbo-parser/src/string_buffer.c +1 -8
- data/gumbo-parser/src/string_buffer.h +0 -5
- data/gumbo-parser/src/tag.c +162 -35
- data/gumbo-parser/src/tokenizer.c +18 -13
- data/gumbo-parser/src/vector.c +1 -1
- data/test-nokogumbo.rb +1 -1
- metadata +15 -24
- data/gumbo-parser/src/tag.in +0 -150
- data/gumbo-parser/src/tag_enum.h +0 -150
- data/gumbo-parser/src/tag_gperf.h +0 -343
- data/gumbo-parser/src/tag_sizes.h +0 -1
- data/gumbo-parser/src/tag_strings.h +0 -150
@@ -26,9 +26,7 @@
|
|
26
26
|
|
27
27
|
struct GumboInternalParser;
|
28
28
|
|
29
|
-
|
30
|
-
// 99% of text nodes and 98% of attribute names/values fit in this initial size.
|
31
|
-
static const size_t kDefaultStringBufferSize = 5;
|
29
|
+
static const size_t kDefaultStringBufferSize = 10;
|
32
30
|
|
33
31
|
static void maybe_resize_string_buffer(
|
34
32
|
struct GumboInternalParser* parser, size_t additional_chars,
|
@@ -102,11 +100,6 @@ char* gumbo_string_buffer_to_string(
|
|
102
100
|
return buffer;
|
103
101
|
}
|
104
102
|
|
105
|
-
void gumbo_string_buffer_clear(
|
106
|
-
struct GumboInternalParser* parser, GumboStringBuffer* input) {
|
107
|
-
input->length = 0;
|
108
|
-
}
|
109
|
-
|
110
103
|
void gumbo_string_buffer_destroy(
|
111
104
|
struct GumboInternalParser* parser, GumboStringBuffer* buffer) {
|
112
105
|
gumbo_parser_deallocate(parser, buffer->data);
|
@@ -70,11 +70,6 @@ void gumbo_string_buffer_append_string(
|
|
70
70
|
char* gumbo_string_buffer_to_string(
|
71
71
|
struct GumboInternalParser* parser, GumboStringBuffer* input);
|
72
72
|
|
73
|
-
// Reinitialize this string buffer. This clears it by setting length=0. It
|
74
|
-
// does not zero out the buffer itself.
|
75
|
-
void gumbo_string_buffer_clear(
|
76
|
-
struct GumboInternalParser* parser, GumboStringBuffer* input);
|
77
|
-
|
78
73
|
// Deallocates this GumboStringBuffer.
|
79
74
|
void gumbo_string_buffer_destroy(
|
80
75
|
struct GumboInternalParser* parser, GumboStringBuffer* buffer);
|
data/gumbo-parser/src/tag.c
CHANGED
@@ -18,25 +18,172 @@
|
|
18
18
|
|
19
19
|
#include <assert.h>
|
20
20
|
#include <ctype.h>
|
21
|
-
#include <
|
21
|
+
#include <strings.h> // For strcasecmp.
|
22
22
|
|
23
|
+
// NOTE(jdtang): Keep this in sync with the GumboTag enum in the header.
|
24
|
+
// TODO(jdtang): Investigate whether there're efficiency benefits to putting the
|
25
|
+
// most common tag names first, or to putting them in alphabetical order and
|
26
|
+
// using a binary search.
|
23
27
|
const char* kGumboTagNames[] = {
|
24
|
-
|
28
|
+
"html",
|
29
|
+
"head",
|
30
|
+
"title",
|
31
|
+
"base",
|
32
|
+
"link",
|
33
|
+
"meta",
|
34
|
+
"style",
|
35
|
+
"script",
|
36
|
+
"noscript",
|
37
|
+
"template",
|
38
|
+
"body",
|
39
|
+
"article",
|
40
|
+
"section",
|
41
|
+
"nav",
|
42
|
+
"aside",
|
43
|
+
"h1",
|
44
|
+
"h2",
|
45
|
+
"h3",
|
46
|
+
"h4",
|
47
|
+
"h5",
|
48
|
+
"h6",
|
49
|
+
"hgroup",
|
50
|
+
"header",
|
51
|
+
"footer",
|
52
|
+
"address",
|
53
|
+
"p",
|
54
|
+
"hr",
|
55
|
+
"pre",
|
56
|
+
"blockquote",
|
57
|
+
"ol",
|
58
|
+
"ul",
|
59
|
+
"li",
|
60
|
+
"dl",
|
61
|
+
"dt",
|
62
|
+
"dd",
|
63
|
+
"figure",
|
64
|
+
"figcaption",
|
65
|
+
"main",
|
66
|
+
"div",
|
67
|
+
"a",
|
68
|
+
"em",
|
69
|
+
"strong",
|
70
|
+
"small",
|
71
|
+
"s",
|
72
|
+
"cite",
|
73
|
+
"q",
|
74
|
+
"dfn",
|
75
|
+
"abbr",
|
76
|
+
"data",
|
77
|
+
"time",
|
78
|
+
"code",
|
79
|
+
"var",
|
80
|
+
"samp",
|
81
|
+
"kbd",
|
82
|
+
"sub",
|
83
|
+
"sup",
|
84
|
+
"i",
|
85
|
+
"b",
|
86
|
+
"u",
|
87
|
+
"mark",
|
88
|
+
"ruby",
|
89
|
+
"rt",
|
90
|
+
"rp",
|
91
|
+
"bdi",
|
92
|
+
"bdo",
|
93
|
+
"span",
|
94
|
+
"br",
|
95
|
+
"wbr",
|
96
|
+
"ins",
|
97
|
+
"del",
|
98
|
+
"image",
|
99
|
+
"img",
|
100
|
+
"iframe",
|
101
|
+
"embed",
|
102
|
+
"object",
|
103
|
+
"param",
|
104
|
+
"video",
|
105
|
+
"audio",
|
106
|
+
"source",
|
107
|
+
"track",
|
108
|
+
"canvas",
|
109
|
+
"map",
|
110
|
+
"area",
|
111
|
+
"math",
|
112
|
+
"mi",
|
113
|
+
"mo",
|
114
|
+
"mn",
|
115
|
+
"ms",
|
116
|
+
"mtext",
|
117
|
+
"mglyph",
|
118
|
+
"malignmark",
|
119
|
+
"annotation-xml",
|
120
|
+
"svg",
|
121
|
+
"foreignobject",
|
122
|
+
"desc",
|
123
|
+
"table",
|
124
|
+
"caption",
|
125
|
+
"colgroup",
|
126
|
+
"col",
|
127
|
+
"tbody",
|
128
|
+
"thead",
|
129
|
+
"tfoot",
|
130
|
+
"tr",
|
131
|
+
"td",
|
132
|
+
"th",
|
133
|
+
"form",
|
134
|
+
"fieldset",
|
135
|
+
"legend",
|
136
|
+
"label",
|
137
|
+
"input",
|
138
|
+
"button",
|
139
|
+
"select",
|
140
|
+
"datalist",
|
141
|
+
"optgroup",
|
142
|
+
"option",
|
143
|
+
"textarea",
|
144
|
+
"keygen",
|
145
|
+
"output",
|
146
|
+
"progress",
|
147
|
+
"meter",
|
148
|
+
"details",
|
149
|
+
"summary",
|
150
|
+
"menu",
|
151
|
+
"menuitem",
|
152
|
+
"applet",
|
153
|
+
"acronym",
|
154
|
+
"bgsound",
|
155
|
+
"dir",
|
156
|
+
"frame",
|
157
|
+
"frameset",
|
158
|
+
"noframes",
|
159
|
+
"isindex",
|
160
|
+
"listing",
|
161
|
+
"xmp",
|
162
|
+
"nextid",
|
163
|
+
"noembed",
|
164
|
+
"plaintext",
|
165
|
+
"rb",
|
166
|
+
"strike",
|
167
|
+
"basefont",
|
168
|
+
"big",
|
169
|
+
"blink",
|
170
|
+
"center",
|
171
|
+
"font",
|
172
|
+
"marquee",
|
173
|
+
"multicol",
|
174
|
+
"nobr",
|
175
|
+
"spacer",
|
176
|
+
"tt",
|
25
177
|
"", // TAG_UNKNOWN
|
26
178
|
"", // TAG_LAST
|
27
179
|
};
|
28
180
|
|
29
|
-
static const unsigned char kGumboTagSizes[] = {
|
30
|
-
# include "tag_sizes.h"
|
31
|
-
0, // TAG_UNKNOWN
|
32
|
-
0, // TAG_LAST
|
33
|
-
};
|
34
|
-
|
35
181
|
const char* gumbo_normalized_tagname(GumboTag tag) {
|
36
182
|
assert(tag <= GUMBO_TAG_LAST);
|
37
183
|
return kGumboTagNames[tag];
|
38
184
|
}
|
39
185
|
|
186
|
+
// TODO(jdtang): Add test for this.
|
40
187
|
void gumbo_tag_from_original_text(GumboStringPiece* text) {
|
41
188
|
if (text->data == NULL) {
|
42
189
|
return;
|
@@ -65,34 +212,14 @@ void gumbo_tag_from_original_text(GumboStringPiece* text) {
|
|
65
212
|
}
|
66
213
|
}
|
67
214
|
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
return (int)c1 - (int)c2;
|
76
|
-
}
|
77
|
-
return 0;
|
78
|
-
}
|
79
|
-
|
80
|
-
#include "tag_gperf.h"
|
81
|
-
#define TAG_MAP_SIZE (sizeof(kGumboTagMap)/sizeof(kGumboTagMap[0]))
|
82
|
-
|
83
|
-
GumboTag gumbo_tagn_enum(const char* tagname, unsigned int length) {
|
84
|
-
if (length) {
|
85
|
-
unsigned int key = tag_hash(tagname, length);
|
86
|
-
if (key < TAG_MAP_SIZE) {
|
87
|
-
GumboTag tag = kGumboTagMap[key];
|
88
|
-
if (length == kGumboTagSizes[(int)tag] &&
|
89
|
-
!case_memcmp(tagname, kGumboTagNames[(int)tag], length))
|
90
|
-
return tag;
|
215
|
+
GumboTag gumbo_tag_enum(const char* tagname) {
|
216
|
+
for (int i = 0; i < GUMBO_TAG_LAST; ++i) {
|
217
|
+
// TODO(jdtang): strcasecmp is non-portable, so if we want to support
|
218
|
+
// non-GCC compilers, we'll need some #ifdef magic. This source already has
|
219
|
+
// pretty significant issues with MSVC6 anyway.
|
220
|
+
if (strcasecmp(tagname, kGumboTagNames[i]) == 0) {
|
221
|
+
return i;
|
91
222
|
}
|
92
223
|
}
|
93
224
|
return GUMBO_TAG_UNKNOWN;
|
94
225
|
}
|
95
|
-
|
96
|
-
GumboTag gumbo_tag_enum(const char* tagname) {
|
97
|
-
return gumbo_tagn_enum(tagname, strlen(tagname));
|
98
|
-
}
|
@@ -356,10 +356,12 @@ static void clear_temporary_buffer(GumboParser* parser) {
|
|
356
356
|
GumboTokenizerState* tokenizer = parser->_tokenizer_state;
|
357
357
|
assert(!tokenizer->_temporary_buffer_emit);
|
358
358
|
utf8iterator_mark(&tokenizer->_input);
|
359
|
-
|
359
|
+
gumbo_string_buffer_destroy(parser, &tokenizer->_temporary_buffer);
|
360
|
+
gumbo_string_buffer_init(parser, &tokenizer->_temporary_buffer);
|
360
361
|
// The temporary buffer and script data buffer are the same object in the
|
361
362
|
// spec, so the script data buffer should be cleared as well.
|
362
|
-
|
363
|
+
gumbo_string_buffer_destroy(parser, &tokenizer->_script_data_buffer);
|
364
|
+
gumbo_string_buffer_init(parser, &tokenizer->_script_data_buffer);
|
363
365
|
}
|
364
366
|
|
365
367
|
// Appends a codepoint to the temporary buffer.
|
@@ -695,11 +697,7 @@ static void start_new_tag(GumboParser* parser, bool is_start_tag) {
|
|
695
697
|
gumbo_string_buffer_append_codepoint(parser, c, &tag_state->_buffer);
|
696
698
|
|
697
699
|
assert(tag_state->_attributes.data == NULL);
|
698
|
-
|
699
|
-
// 99.5% of elements have 0 attributes, 93% of the remainder have 1. These
|
700
|
-
// numbers are a bit higher for more modern websites (eg. ~45% = 0, ~40% = 1
|
701
|
-
// for the HTML5 Spec), but still have basically 99% of nodes with <= 2 attrs.
|
702
|
-
gumbo_vector_init(parser, 1, &tag_state->_attributes);
|
700
|
+
gumbo_vector_init(parser, 4, &tag_state->_attributes);
|
703
701
|
tag_state->_drop_next_attr_value = false;
|
704
702
|
tag_state->_is_start_tag = is_start_tag;
|
705
703
|
tag_state->_is_self_closing = false;
|
@@ -753,9 +751,11 @@ static void finish_tag_name(GumboParser* parser) {
|
|
753
751
|
GumboTokenizerState* tokenizer = parser->_tokenizer_state;
|
754
752
|
GumboTagState* tag_state = &tokenizer->_tag_state;
|
755
753
|
|
756
|
-
|
757
|
-
|
754
|
+
const char* temp;
|
755
|
+
copy_over_tag_buffer(parser, &temp);
|
756
|
+
tag_state->_tag = gumbo_tag_enum(temp);
|
758
757
|
reinitialize_tag_buffer(parser);
|
758
|
+
gumbo_parser_deallocate(parser, (void*) temp);
|
759
759
|
}
|
760
760
|
|
761
761
|
// Adds an ERR_DUPLICATE_ATTR parse error to the parser's error struct.
|
@@ -841,9 +841,13 @@ static void finish_attribute_value(GumboParser* parser) {
|
|
841
841
|
static bool is_appropriate_end_tag(GumboParser* parser) {
|
842
842
|
GumboTagState* tag_state = &parser->_tokenizer_state->_tag_state;
|
843
843
|
assert(!tag_state->_is_start_tag);
|
844
|
+
// Null terminate the current string buffer, so it can be passed to
|
845
|
+
// gumbo_tag_enum, but don't increment the length in case we need to dump the
|
846
|
+
// buffer as character tokens.
|
847
|
+
gumbo_string_buffer_append_codepoint(parser, '\0', &tag_state->_buffer);
|
848
|
+
--tag_state->_buffer.length;
|
844
849
|
return tag_state->_last_start_tag != GUMBO_TAG_LAST &&
|
845
|
-
tag_state->_last_start_tag ==
|
846
|
-
gumbo_tagn_enum(tag_state->_buffer.data, tag_state->_buffer.length);
|
850
|
+
tag_state->_last_start_tag == gumbo_tag_enum(tag_state->_buffer.data);
|
847
851
|
}
|
848
852
|
|
849
853
|
void gumbo_tokenizer_state_init(
|
@@ -1593,7 +1597,8 @@ static StateResult handle_script_double_escaped_lt_state(
|
|
1593
1597
|
int c, GumboToken* output) {
|
1594
1598
|
if (c == '/') {
|
1595
1599
|
gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED_END);
|
1596
|
-
|
1600
|
+
gumbo_string_buffer_destroy(parser, &tokenizer->_script_data_buffer);
|
1601
|
+
gumbo_string_buffer_init(parser, &tokenizer->_script_data_buffer);
|
1597
1602
|
return emit_current_char(parser, output);
|
1598
1603
|
} else {
|
1599
1604
|
gumbo_tokenizer_set_state(parser, GUMBO_LEX_SCRIPT_DOUBLE_ESCAPED);
|
@@ -2819,7 +2824,7 @@ static StateResult handle_cdata_state(
|
|
2819
2824
|
tokenizer->_reconsume_current_input = true;
|
2820
2825
|
reset_token_start_point(tokenizer);
|
2821
2826
|
gumbo_tokenizer_set_state(parser, GUMBO_LEX_DATA);
|
2822
|
-
tokenizer->_is_in_cdata =
|
2827
|
+
tokenizer->_is_in_cdata = true;
|
2823
2828
|
return NEXT_CHAR;
|
2824
2829
|
} else {
|
2825
2830
|
return emit_current_char(parser, output);
|
data/gumbo-parser/src/vector.c
CHANGED
@@ -81,7 +81,7 @@ void* gumbo_vector_pop(
|
|
81
81
|
return vector->data[--vector->length];
|
82
82
|
}
|
83
83
|
|
84
|
-
int gumbo_vector_index_of(GumboVector* vector,
|
84
|
+
int gumbo_vector_index_of(GumboVector* vector, void* element) {
|
85
85
|
for (int i = 0; i < vector->length; ++i) {
|
86
86
|
if (vector->data[i] == element) {
|
87
87
|
return i;
|
data/test-nokogumbo.rb
CHANGED
metadata
CHANGED
@@ -1,32 +1,29 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: nokogumbo
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
|
5
|
-
version: 1.4.1
|
4
|
+
version: 1.4.2
|
6
5
|
platform: ruby
|
7
6
|
authors:
|
8
7
|
- Sam Ruby
|
9
8
|
autorequire:
|
10
9
|
bindir: bin
|
11
10
|
cert_chain: []
|
12
|
-
date: 2015-
|
11
|
+
date: 2015-05-12 00:00:00.000000000 Z
|
13
12
|
dependencies:
|
14
13
|
- !ruby/object:Gem::Dependency
|
15
|
-
|
14
|
+
name: nokogiri
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
16
|
requirements:
|
17
|
-
- -
|
17
|
+
- - ">="
|
18
18
|
- !ruby/object:Gem::Version
|
19
19
|
version: '0'
|
20
|
-
none: false
|
21
20
|
type: :runtime
|
22
|
-
name: nokogiri
|
23
21
|
prerelease: false
|
24
|
-
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
25
23
|
requirements:
|
26
|
-
- -
|
24
|
+
- - ">="
|
27
25
|
- !ruby/object:Gem::Version
|
28
26
|
version: '0'
|
29
|
-
none: false
|
30
27
|
description: Nokogumbo allows a Ruby program to invoke the Gumbo HTML5 parser and
|
31
28
|
access the result as a Nokogiri parsed document.
|
32
29
|
email: rubys@intertwingly.net
|
@@ -35,11 +32,10 @@ extensions:
|
|
35
32
|
- ext/nokogumboc/extconf.rb
|
36
33
|
extra_rdoc_files: []
|
37
34
|
files:
|
38
|
-
- ext/nokogumboc/extconf.rb
|
39
|
-
- ext/nokogumboc/nokogumbo.c
|
40
|
-
- lib/nokogumbo.rb
|
41
35
|
- LICENSE.txt
|
42
36
|
- README.md
|
37
|
+
- ext/nokogumboc/extconf.rb
|
38
|
+
- ext/nokogumboc/nokogumbo.c
|
43
39
|
- gumbo-parser/src/attribute.c
|
44
40
|
- gumbo-parser/src/attribute.h
|
45
41
|
- gumbo-parser/src/char_ref.c
|
@@ -56,11 +52,6 @@ files:
|
|
56
52
|
- gumbo-parser/src/string_piece.c
|
57
53
|
- gumbo-parser/src/string_piece.h
|
58
54
|
- gumbo-parser/src/tag.c
|
59
|
-
- gumbo-parser/src/tag.in
|
60
|
-
- gumbo-parser/src/tag_enum.h
|
61
|
-
- gumbo-parser/src/tag_gperf.h
|
62
|
-
- gumbo-parser/src/tag_sizes.h
|
63
|
-
- gumbo-parser/src/tag_strings.h
|
64
55
|
- gumbo-parser/src/token_type.h
|
65
56
|
- gumbo-parser/src/tokenizer.c
|
66
57
|
- gumbo-parser/src/tokenizer.h
|
@@ -72,30 +63,30 @@ files:
|
|
72
63
|
- gumbo-parser/src/vector.c
|
73
64
|
- gumbo-parser/src/vector.h
|
74
65
|
- gumbo-parser/visualc/include/strings.h
|
66
|
+
- lib/nokogumbo.rb
|
75
67
|
- test-nokogumbo.rb
|
76
68
|
homepage: https://github.com/rubys/nokogumbo/#readme
|
77
69
|
licenses:
|
78
70
|
- Apache 2.0
|
71
|
+
metadata: {}
|
79
72
|
post_install_message:
|
80
73
|
rdoc_options: []
|
81
74
|
require_paths:
|
82
75
|
- lib
|
83
76
|
required_ruby_version: !ruby/object:Gem::Requirement
|
84
77
|
requirements:
|
85
|
-
- -
|
78
|
+
- - ">="
|
86
79
|
- !ruby/object:Gem::Version
|
87
80
|
version: '0'
|
88
|
-
none: false
|
89
81
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
90
82
|
requirements:
|
91
|
-
- -
|
83
|
+
- - ">="
|
92
84
|
- !ruby/object:Gem::Version
|
93
85
|
version: '0'
|
94
|
-
none: false
|
95
86
|
requirements: []
|
96
87
|
rubyforge_project:
|
97
|
-
rubygems_version:
|
88
|
+
rubygems_version: 2.4.5
|
98
89
|
signing_key:
|
99
|
-
specification_version:
|
90
|
+
specification_version: 4
|
100
91
|
summary: Nokogiri interface to the Gumbo HTML5 parser
|
101
92
|
test_files: []
|