nokogumbo 1.3.0 → 1.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/README.md +8 -2
- data/ext/nokogumboc/extconf.rb +18 -6
- data/ext/nokogumboc/nokogumbo.c +102 -42
- data/gumbo-parser/src/attribute.c +1 -1
- data/gumbo-parser/src/char_ref.c +37 -67
- data/gumbo-parser/src/char_ref.h +3 -4
- data/gumbo-parser/src/char_ref.rl +6 -1
- data/gumbo-parser/src/error.c +51 -51
- data/gumbo-parser/src/error.h +7 -9
- data/gumbo-parser/src/gumbo.h +45 -181
- data/gumbo-parser/src/parser.c +1439 -1172
- data/gumbo-parser/src/string_buffer.c +14 -10
- data/gumbo-parser/src/string_buffer.h +9 -6
- data/gumbo-parser/src/string_piece.c +5 -6
- data/gumbo-parser/src/string_piece.h +2 -3
- data/gumbo-parser/src/tag.c +36 -166
- data/gumbo-parser/src/tag.in +150 -0
- data/gumbo-parser/src/tag_enum.h +153 -0
- data/gumbo-parser/src/tag_gperf.h +105 -0
- data/gumbo-parser/src/tag_sizes.h +4 -0
- data/gumbo-parser/src/tag_strings.h +153 -0
- data/gumbo-parser/src/token_type.h +1 -0
- data/gumbo-parser/src/tokenizer.c +278 -361
- data/gumbo-parser/src/tokenizer.h +2 -2
- data/gumbo-parser/src/utf8.c +53 -52
- data/gumbo-parser/src/utf8.h +1 -2
- data/gumbo-parser/src/util.c +1 -1
- data/gumbo-parser/src/util.h +0 -2
- data/gumbo-parser/src/vector.c +17 -17
- data/gumbo-parser/src/vector.h +6 -8
- data/gumbo-parser/visualc/include/strings.h +2 -1
- data/lib/nokogumbo.rb +8 -8
- data/test-nokogumbo.rb +190 -0
- metadata +19 -17
@@ -26,11 +26,12 @@
|
|
26
26
|
|
27
27
|
struct GumboInternalParser;
|
28
28
|
|
29
|
-
|
29
|
+
// Size chosen via statistical analysis of ~60K websites.
|
30
|
+
// 99% of text nodes and 98% of attribute names/values fit in this initial size.
|
31
|
+
static const size_t kDefaultStringBufferSize = 5;
|
30
32
|
|
31
|
-
static void maybe_resize_string_buffer(
|
32
|
-
|
33
|
-
GumboStringBuffer* buffer) {
|
33
|
+
static void maybe_resize_string_buffer(struct GumboInternalParser* parser,
|
34
|
+
size_t additional_chars, GumboStringBuffer* buffer) {
|
34
35
|
size_t new_length = buffer->length + additional_chars;
|
35
36
|
size_t new_capacity = buffer->capacity;
|
36
37
|
while (new_capacity < new_length) {
|
@@ -52,9 +53,8 @@ void gumbo_string_buffer_init(
|
|
52
53
|
output->capacity = kDefaultStringBufferSize;
|
53
54
|
}
|
54
55
|
|
55
|
-
void gumbo_string_buffer_reserve(
|
56
|
-
|
57
|
-
GumboStringBuffer* output) {
|
56
|
+
void gumbo_string_buffer_reserve(struct GumboInternalParser* parser,
|
57
|
+
size_t min_capacity, GumboStringBuffer* output) {
|
58
58
|
maybe_resize_string_buffer(parser, min_capacity - output->length, output);
|
59
59
|
}
|
60
60
|
|
@@ -84,9 +84,8 @@ void gumbo_string_buffer_append_codepoint(
|
|
84
84
|
}
|
85
85
|
}
|
86
86
|
|
87
|
-
void gumbo_string_buffer_append_string(
|
88
|
-
|
89
|
-
GumboStringBuffer* output) {
|
87
|
+
void gumbo_string_buffer_append_string(struct GumboInternalParser* parser,
|
88
|
+
GumboStringPiece* str, GumboStringBuffer* output) {
|
90
89
|
maybe_resize_string_buffer(parser, str->length, output);
|
91
90
|
memcpy(output->data + output->length, str->data, str->length);
|
92
91
|
output->length += str->length;
|
@@ -100,6 +99,11 @@ char* gumbo_string_buffer_to_string(
|
|
100
99
|
return buffer;
|
101
100
|
}
|
102
101
|
|
102
|
+
void gumbo_string_buffer_clear(
|
103
|
+
struct GumboInternalParser* parser, GumboStringBuffer* input) {
|
104
|
+
input->length = 0;
|
105
|
+
}
|
106
|
+
|
103
107
|
void gumbo_string_buffer_destroy(
|
104
108
|
struct GumboInternalParser* parser, GumboStringBuffer* buffer) {
|
105
109
|
gumbo_parser_deallocate(parser, buffer->data);
|
@@ -51,9 +51,8 @@ void gumbo_string_buffer_init(
|
|
51
51
|
// Ensures that the buffer contains at least a certain amount of space. Most
|
52
52
|
// useful with snprintf and the other length-delimited string functions, which
|
53
53
|
// may want to write directly into the buffer.
|
54
|
-
void gumbo_string_buffer_reserve(
|
55
|
-
|
56
|
-
GumboStringBuffer* output);
|
54
|
+
void gumbo_string_buffer_reserve(struct GumboInternalParser* parser,
|
55
|
+
size_t min_capacity, GumboStringBuffer* output);
|
57
56
|
|
58
57
|
// Appends a single Unicode codepoint onto the end of the GumboStringBuffer.
|
59
58
|
// This is essentially a UTF-8 encoder, and may add 1-4 bytes depending on the
|
@@ -62,14 +61,18 @@ void gumbo_string_buffer_append_codepoint(
|
|
62
61
|
struct GumboInternalParser* parser, int c, GumboStringBuffer* output);
|
63
62
|
|
64
63
|
// Appends a string onto the end of the GumboStringBuffer.
|
65
|
-
void gumbo_string_buffer_append_string(
|
66
|
-
|
67
|
-
GumboStringBuffer* output);
|
64
|
+
void gumbo_string_buffer_append_string(struct GumboInternalParser* parser,
|
65
|
+
GumboStringPiece* str, GumboStringBuffer* output);
|
68
66
|
|
69
67
|
// Converts this string buffer to const char*, allocating a new buffer for it.
|
70
68
|
char* gumbo_string_buffer_to_string(
|
71
69
|
struct GumboInternalParser* parser, GumboStringBuffer* input);
|
72
70
|
|
71
|
+
// Reinitialize this string buffer. This clears it by setting length=0. It
|
72
|
+
// does not zero out the buffer itself.
|
73
|
+
void gumbo_string_buffer_clear(
|
74
|
+
struct GumboInternalParser* parser, GumboStringBuffer* input);
|
75
|
+
|
73
76
|
// Deallocates this GumboStringBuffer.
|
74
77
|
void gumbo_string_buffer_destroy(
|
75
78
|
struct GumboInternalParser* parser, GumboStringBuffer* buffer);
|
@@ -25,23 +25,22 @@
|
|
25
25
|
|
26
26
|
struct GumboInternalParser;
|
27
27
|
|
28
|
-
const GumboStringPiece kGumboEmptyString = {
|
28
|
+
const GumboStringPiece kGumboEmptyString = {NULL, 0};
|
29
29
|
|
30
30
|
bool gumbo_string_equals(
|
31
31
|
const GumboStringPiece* str1, const GumboStringPiece* str2) {
|
32
32
|
return str1->length == str2->length &&
|
33
|
-
|
33
|
+
!memcmp(str1->data, str2->data, str1->length);
|
34
34
|
}
|
35
35
|
|
36
36
|
bool gumbo_string_equals_ignore_case(
|
37
37
|
const GumboStringPiece* str1, const GumboStringPiece* str2) {
|
38
38
|
return str1->length == str2->length &&
|
39
|
-
|
39
|
+
!strncasecmp(str1->data, str2->data, str1->length);
|
40
40
|
}
|
41
41
|
|
42
|
-
void gumbo_string_copy(
|
43
|
-
|
44
|
-
const GumboStringPiece* source) {
|
42
|
+
void gumbo_string_copy(struct GumboInternalParser* parser,
|
43
|
+
GumboStringPiece* dest, const GumboStringPiece* source) {
|
45
44
|
dest->length = source->length;
|
46
45
|
char* buffer = gumbo_parser_allocate(parser, source->length);
|
47
46
|
memcpy(buffer, source->data, source->length);
|
@@ -28,9 +28,8 @@ struct GumboInternalParser;
|
|
28
28
|
// Performs a deep-copy of a GumboStringPiece, allocating a fresh buffer in the
|
29
29
|
// destination and copying over the characters from source. Dest should be
|
30
30
|
// empty, with no buffer allocated; otherwise, this leaks it.
|
31
|
-
void gumbo_string_copy(
|
32
|
-
|
33
|
-
const GumboStringPiece* source);
|
31
|
+
void gumbo_string_copy(struct GumboInternalParser* parser,
|
32
|
+
GumboStringPiece* dest, const GumboStringPiece* source);
|
34
33
|
|
35
34
|
#ifdef __cplusplus
|
36
35
|
}
|
data/gumbo-parser/src/tag.c
CHANGED
@@ -18,164 +18,18 @@
|
|
18
18
|
|
19
19
|
#include <assert.h>
|
20
20
|
#include <ctype.h>
|
21
|
-
#include <
|
21
|
+
#include <string.h>
|
22
22
|
|
23
|
-
// NOTE(jdtang): Keep this in sync with the GumboTag enum in the header.
|
24
|
-
// TODO(jdtang): Investigate whether there're efficiency benefits to putting the
|
25
|
-
// most common tag names first, or to putting them in alphabetical order and
|
26
|
-
// using a binary search.
|
27
23
|
const char* kGumboTagNames[] = {
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
"template",
|
38
|
-
"body",
|
39
|
-
"article",
|
40
|
-
"section",
|
41
|
-
"nav",
|
42
|
-
"aside",
|
43
|
-
"h1",
|
44
|
-
"h2",
|
45
|
-
"h3",
|
46
|
-
"h4",
|
47
|
-
"h5",
|
48
|
-
"h6",
|
49
|
-
"hgroup",
|
50
|
-
"header",
|
51
|
-
"footer",
|
52
|
-
"address",
|
53
|
-
"p",
|
54
|
-
"hr",
|
55
|
-
"pre",
|
56
|
-
"blockquote",
|
57
|
-
"ol",
|
58
|
-
"ul",
|
59
|
-
"li",
|
60
|
-
"dl",
|
61
|
-
"dt",
|
62
|
-
"dd",
|
63
|
-
"figure",
|
64
|
-
"figcaption",
|
65
|
-
"main",
|
66
|
-
"div",
|
67
|
-
"a",
|
68
|
-
"em",
|
69
|
-
"strong",
|
70
|
-
"small",
|
71
|
-
"s",
|
72
|
-
"cite",
|
73
|
-
"q",
|
74
|
-
"dfn",
|
75
|
-
"abbr",
|
76
|
-
"data",
|
77
|
-
"time",
|
78
|
-
"code",
|
79
|
-
"var",
|
80
|
-
"samp",
|
81
|
-
"kbd",
|
82
|
-
"sub",
|
83
|
-
"sup",
|
84
|
-
"i",
|
85
|
-
"b",
|
86
|
-
"u",
|
87
|
-
"mark",
|
88
|
-
"ruby",
|
89
|
-
"rt",
|
90
|
-
"rp",
|
91
|
-
"bdi",
|
92
|
-
"bdo",
|
93
|
-
"span",
|
94
|
-
"br",
|
95
|
-
"wbr",
|
96
|
-
"ins",
|
97
|
-
"del",
|
98
|
-
"image",
|
99
|
-
"img",
|
100
|
-
"iframe",
|
101
|
-
"embed",
|
102
|
-
"object",
|
103
|
-
"param",
|
104
|
-
"video",
|
105
|
-
"audio",
|
106
|
-
"source",
|
107
|
-
"track",
|
108
|
-
"canvas",
|
109
|
-
"map",
|
110
|
-
"area",
|
111
|
-
"math",
|
112
|
-
"mi",
|
113
|
-
"mo",
|
114
|
-
"mn",
|
115
|
-
"ms",
|
116
|
-
"mtext",
|
117
|
-
"mglyph",
|
118
|
-
"malignmark",
|
119
|
-
"annotation-xml",
|
120
|
-
"svg",
|
121
|
-
"foreignobject",
|
122
|
-
"desc",
|
123
|
-
"table",
|
124
|
-
"caption",
|
125
|
-
"colgroup",
|
126
|
-
"col",
|
127
|
-
"tbody",
|
128
|
-
"thead",
|
129
|
-
"tfoot",
|
130
|
-
"tr",
|
131
|
-
"td",
|
132
|
-
"th",
|
133
|
-
"form",
|
134
|
-
"fieldset",
|
135
|
-
"legend",
|
136
|
-
"label",
|
137
|
-
"input",
|
138
|
-
"button",
|
139
|
-
"select",
|
140
|
-
"datalist",
|
141
|
-
"optgroup",
|
142
|
-
"option",
|
143
|
-
"textarea",
|
144
|
-
"keygen",
|
145
|
-
"output",
|
146
|
-
"progress",
|
147
|
-
"meter",
|
148
|
-
"details",
|
149
|
-
"summary",
|
150
|
-
"menu",
|
151
|
-
"menuitem",
|
152
|
-
"applet",
|
153
|
-
"acronym",
|
154
|
-
"bgsound",
|
155
|
-
"dir",
|
156
|
-
"frame",
|
157
|
-
"frameset",
|
158
|
-
"noframes",
|
159
|
-
"isindex",
|
160
|
-
"listing",
|
161
|
-
"xmp",
|
162
|
-
"nextid",
|
163
|
-
"noembed",
|
164
|
-
"plaintext",
|
165
|
-
"rb",
|
166
|
-
"strike",
|
167
|
-
"basefont",
|
168
|
-
"big",
|
169
|
-
"blink",
|
170
|
-
"center",
|
171
|
-
"font",
|
172
|
-
"marquee",
|
173
|
-
"multicol",
|
174
|
-
"nobr",
|
175
|
-
"spacer",
|
176
|
-
"tt",
|
177
|
-
"", // TAG_UNKNOWN
|
178
|
-
"", // TAG_LAST
|
24
|
+
#include "tag_strings.h"
|
25
|
+
"", // TAG_UNKNOWN
|
26
|
+
"", // TAG_LAST
|
27
|
+
};
|
28
|
+
|
29
|
+
static const unsigned char kGumboTagSizes[] = {
|
30
|
+
#include "tag_sizes.h"
|
31
|
+
0, // TAG_UNKNOWN
|
32
|
+
0, // TAG_LAST
|
179
33
|
};
|
180
34
|
|
181
35
|
const char* gumbo_normalized_tagname(GumboTag tag) {
|
@@ -183,7 +37,6 @@ const char* gumbo_normalized_tagname(GumboTag tag) {
|
|
183
37
|
return kGumboTagNames[tag];
|
184
38
|
}
|
185
39
|
|
186
|
-
// TODO(jdtang): Add test for this.
|
187
40
|
void gumbo_tag_from_original_text(GumboStringPiece* text) {
|
188
41
|
if (text->data == NULL) {
|
189
42
|
return;
|
@@ -195,11 +48,11 @@ void gumbo_tag_from_original_text(GumboStringPiece* text) {
|
|
195
48
|
if (text->data[1] == '/') {
|
196
49
|
// End tag.
|
197
50
|
assert(text->length >= 3);
|
198
|
-
text->data += 2;
|
51
|
+
text->data += 2; // Move past </
|
199
52
|
text->length -= 3;
|
200
53
|
} else {
|
201
54
|
// Start tag.
|
202
|
-
text->data += 1;
|
55
|
+
text->data += 1; // Move past <
|
203
56
|
text->length -= 2;
|
204
57
|
// strnchr is apparently not a standard C library function, so I loop
|
205
58
|
// explicitly looking for whitespace or other illegal tag characters.
|
@@ -212,14 +65,31 @@ void gumbo_tag_from_original_text(GumboStringPiece* text) {
|
|
212
65
|
}
|
213
66
|
}
|
214
67
|
|
215
|
-
|
216
|
-
|
217
|
-
|
218
|
-
|
219
|
-
|
220
|
-
|
221
|
-
|
68
|
+
static int case_memcmp(const char* s1, const char* s2, unsigned int n) {
|
69
|
+
while (n--) {
|
70
|
+
unsigned char c1 = tolower(*s1++);
|
71
|
+
unsigned char c2 = tolower(*s2++);
|
72
|
+
if (c1 != c2) return (int) c1 - (int) c2;
|
73
|
+
}
|
74
|
+
return 0;
|
75
|
+
}
|
76
|
+
|
77
|
+
#include "tag_gperf.h"
|
78
|
+
#define TAG_MAP_SIZE (sizeof(kGumboTagMap) / sizeof(kGumboTagMap[0]))
|
79
|
+
|
80
|
+
GumboTag gumbo_tagn_enum(const char* tagname, unsigned int length) {
|
81
|
+
if (length) {
|
82
|
+
unsigned int key = tag_hash(tagname, length);
|
83
|
+
if (key < TAG_MAP_SIZE) {
|
84
|
+
GumboTag tag = kGumboTagMap[key];
|
85
|
+
if (length == kGumboTagSizes[(int) tag] &&
|
86
|
+
!case_memcmp(tagname, kGumboTagNames[(int) tag], length))
|
87
|
+
return tag;
|
222
88
|
}
|
223
89
|
}
|
224
90
|
return GUMBO_TAG_UNKNOWN;
|
225
91
|
}
|
92
|
+
|
93
|
+
GumboTag gumbo_tag_enum(const char* tagname) {
|
94
|
+
return gumbo_tagn_enum(tagname, strlen(tagname));
|
95
|
+
}
|
@@ -0,0 +1,150 @@
|
|
1
|
+
html
|
2
|
+
head
|
3
|
+
title
|
4
|
+
base
|
5
|
+
link
|
6
|
+
meta
|
7
|
+
style
|
8
|
+
script
|
9
|
+
noscript
|
10
|
+
template
|
11
|
+
body
|
12
|
+
article
|
13
|
+
section
|
14
|
+
nav
|
15
|
+
aside
|
16
|
+
h1
|
17
|
+
h2
|
18
|
+
h3
|
19
|
+
h4
|
20
|
+
h5
|
21
|
+
h6
|
22
|
+
hgroup
|
23
|
+
header
|
24
|
+
footer
|
25
|
+
address
|
26
|
+
p
|
27
|
+
hr
|
28
|
+
pre
|
29
|
+
blockquote
|
30
|
+
ol
|
31
|
+
ul
|
32
|
+
li
|
33
|
+
dl
|
34
|
+
dt
|
35
|
+
dd
|
36
|
+
figure
|
37
|
+
figcaption
|
38
|
+
main
|
39
|
+
div
|
40
|
+
a
|
41
|
+
em
|
42
|
+
strong
|
43
|
+
small
|
44
|
+
s
|
45
|
+
cite
|
46
|
+
q
|
47
|
+
dfn
|
48
|
+
abbr
|
49
|
+
data
|
50
|
+
time
|
51
|
+
code
|
52
|
+
var
|
53
|
+
samp
|
54
|
+
kbd
|
55
|
+
sub
|
56
|
+
sup
|
57
|
+
i
|
58
|
+
b
|
59
|
+
u
|
60
|
+
mark
|
61
|
+
ruby
|
62
|
+
rt
|
63
|
+
rp
|
64
|
+
bdi
|
65
|
+
bdo
|
66
|
+
span
|
67
|
+
br
|
68
|
+
wbr
|
69
|
+
ins
|
70
|
+
del
|
71
|
+
image
|
72
|
+
img
|
73
|
+
iframe
|
74
|
+
embed
|
75
|
+
object
|
76
|
+
param
|
77
|
+
video
|
78
|
+
audio
|
79
|
+
source
|
80
|
+
track
|
81
|
+
canvas
|
82
|
+
map
|
83
|
+
area
|
84
|
+
math
|
85
|
+
mi
|
86
|
+
mo
|
87
|
+
mn
|
88
|
+
ms
|
89
|
+
mtext
|
90
|
+
mglyph
|
91
|
+
malignmark
|
92
|
+
annotation-xml
|
93
|
+
svg
|
94
|
+
foreignobject
|
95
|
+
desc
|
96
|
+
table
|
97
|
+
caption
|
98
|
+
colgroup
|
99
|
+
col
|
100
|
+
tbody
|
101
|
+
thead
|
102
|
+
tfoot
|
103
|
+
tr
|
104
|
+
td
|
105
|
+
th
|
106
|
+
form
|
107
|
+
fieldset
|
108
|
+
legend
|
109
|
+
label
|
110
|
+
input
|
111
|
+
button
|
112
|
+
select
|
113
|
+
datalist
|
114
|
+
optgroup
|
115
|
+
option
|
116
|
+
textarea
|
117
|
+
keygen
|
118
|
+
output
|
119
|
+
progress
|
120
|
+
meter
|
121
|
+
details
|
122
|
+
summary
|
123
|
+
menu
|
124
|
+
menuitem
|
125
|
+
applet
|
126
|
+
acronym
|
127
|
+
bgsound
|
128
|
+
dir
|
129
|
+
frame
|
130
|
+
frameset
|
131
|
+
noframes
|
132
|
+
isindex
|
133
|
+
listing
|
134
|
+
xmp
|
135
|
+
nextid
|
136
|
+
noembed
|
137
|
+
plaintext
|
138
|
+
rb
|
139
|
+
strike
|
140
|
+
basefont
|
141
|
+
big
|
142
|
+
blink
|
143
|
+
center
|
144
|
+
font
|
145
|
+
marquee
|
146
|
+
multicol
|
147
|
+
nobr
|
148
|
+
spacer
|
149
|
+
tt
|
150
|
+
rtc
|