nokogumbo 1.4.2 → 1.4.3
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/gumbo-parser/src/attribute.c +1 -1
- data/gumbo-parser/src/char_ref.c +37 -67
- data/gumbo-parser/src/char_ref.h +3 -4
- data/gumbo-parser/src/char_ref.rl +6 -1
- data/gumbo-parser/src/error.c +50 -51
- data/gumbo-parser/src/error.h +7 -9
- data/gumbo-parser/src/gumbo.h +45 -181
- data/gumbo-parser/src/parser.c +1397 -989
- data/gumbo-parser/src/string_buffer.c +14 -10
- data/gumbo-parser/src/string_buffer.h +9 -6
- data/gumbo-parser/src/string_piece.c +5 -6
- data/gumbo-parser/src/string_piece.h +2 -3
- data/gumbo-parser/src/tag.c +36 -166
- data/gumbo-parser/src/tag.in +150 -0
- data/gumbo-parser/src/tag_enum.h +153 -0
- data/gumbo-parser/src/tag_gperf.h +105 -0
- data/gumbo-parser/src/tag_sizes.h +4 -0
- data/gumbo-parser/src/tag_strings.h +153 -0
- data/gumbo-parser/src/tokenizer.c +264 -360
- data/gumbo-parser/src/tokenizer.h +2 -2
- data/gumbo-parser/src/utf8.c +44 -44
- data/gumbo-parser/src/utf8.h +1 -2
- data/gumbo-parser/src/util.c +1 -1
- data/gumbo-parser/src/util.h +0 -2
- data/gumbo-parser/src/vector.c +17 -17
- data/gumbo-parser/src/vector.h +6 -8
- metadata +8 -3
@@ -26,11 +26,12 @@
|
|
26
26
|
|
27
27
|
struct GumboInternalParser;
|
28
28
|
|
29
|
-
|
29
|
+
// Size chosen via statistical analysis of ~60K websites.
|
30
|
+
// 99% of text nodes and 98% of attribute names/values fit in this initial size.
|
31
|
+
static const size_t kDefaultStringBufferSize = 5;
|
30
32
|
|
31
|
-
static void maybe_resize_string_buffer(
|
32
|
-
|
33
|
-
GumboStringBuffer* buffer) {
|
33
|
+
static void maybe_resize_string_buffer(struct GumboInternalParser* parser,
|
34
|
+
size_t additional_chars, GumboStringBuffer* buffer) {
|
34
35
|
size_t new_length = buffer->length + additional_chars;
|
35
36
|
size_t new_capacity = buffer->capacity;
|
36
37
|
while (new_capacity < new_length) {
|
@@ -52,9 +53,8 @@ void gumbo_string_buffer_init(
|
|
52
53
|
output->capacity = kDefaultStringBufferSize;
|
53
54
|
}
|
54
55
|
|
55
|
-
void gumbo_string_buffer_reserve(
|
56
|
-
|
57
|
-
GumboStringBuffer* output) {
|
56
|
+
// Grows `output` (if needed) so that it can hold `min_capacity` characters.
// The request is forwarded to maybe_resize_string_buffer as the number of
// characters wanted beyond the current length. The subtraction is done in
// unsigned size_t arithmetic.
void gumbo_string_buffer_reserve(struct GumboInternalParser* parser,
    size_t min_capacity, GumboStringBuffer* output) {
  const size_t extra_chars = min_capacity - output->length;
  maybe_resize_string_buffer(parser, extra_chars, output);
}
|
60
60
|
|
@@ -84,9 +84,8 @@ void gumbo_string_buffer_append_codepoint(
|
|
84
84
|
}
|
85
85
|
}
|
86
86
|
|
87
|
-
void gumbo_string_buffer_append_string(
|
88
|
-
|
89
|
-
GumboStringBuffer* output) {
|
87
|
+
void gumbo_string_buffer_append_string(struct GumboInternalParser* parser,
|
88
|
+
GumboStringPiece* str, GumboStringBuffer* output) {
|
90
89
|
maybe_resize_string_buffer(parser, str->length, output);
|
91
90
|
memcpy(output->data + output->length, str->data, str->length);
|
92
91
|
output->length += str->length;
|
@@ -100,6 +99,11 @@ char* gumbo_string_buffer_to_string(
|
|
100
99
|
return buffer;
|
101
100
|
}
|
102
101
|
|
102
|
+
// Empties the buffer by resetting its length to zero. Only the length field
// is written: the data pointer, the capacity and the bytes already stored are
// left as they are. The parser argument is not used by this function.
void gumbo_string_buffer_clear(
    struct GumboInternalParser* parser, GumboStringBuffer* input) {
  (void) parser;
  input->length = 0;
}
|
106
|
+
|
103
107
|
void gumbo_string_buffer_destroy(
|
104
108
|
struct GumboInternalParser* parser, GumboStringBuffer* buffer) {
|
105
109
|
gumbo_parser_deallocate(parser, buffer->data);
|
@@ -51,9 +51,8 @@ void gumbo_string_buffer_init(
|
|
51
51
|
// Ensures that the buffer contains at least a certain amount of space. Most
|
52
52
|
// useful with snprintf and the other length-delimited string functions, which
|
53
53
|
// may want to write directly into the buffer.
|
54
|
-
void gumbo_string_buffer_reserve(
|
55
|
-
|
56
|
-
GumboStringBuffer* output);
|
54
|
+
void gumbo_string_buffer_reserve(struct GumboInternalParser* parser,
|
55
|
+
size_t min_capacity, GumboStringBuffer* output);
|
57
56
|
|
58
57
|
// Appends a single Unicode codepoint onto the end of the GumboStringBuffer.
|
59
58
|
// This is essentially a UTF-8 encoder, and may add 1-4 bytes depending on the
|
@@ -62,14 +61,18 @@ void gumbo_string_buffer_append_codepoint(
|
|
62
61
|
struct GumboInternalParser* parser, int c, GumboStringBuffer* output);
|
63
62
|
|
64
63
|
// Appends a string onto the end of the GumboStringBuffer.
|
65
|
-
void gumbo_string_buffer_append_string(
|
66
|
-
|
67
|
-
GumboStringBuffer* output);
|
64
|
+
void gumbo_string_buffer_append_string(struct GumboInternalParser* parser,
|
65
|
+
GumboStringPiece* str, GumboStringBuffer* output);
|
68
66
|
|
69
67
|
// Converts this string buffer to const char*, allocating a new buffer for it.
|
70
68
|
char* gumbo_string_buffer_to_string(
|
71
69
|
struct GumboInternalParser* parser, GumboStringBuffer* input);
|
72
70
|
|
71
|
+
// Reinitialize this string buffer. This clears it by setting length=0. It
|
72
|
+
// does not zero out the buffer itself.
|
73
|
+
void gumbo_string_buffer_clear(
|
74
|
+
struct GumboInternalParser* parser, GumboStringBuffer* input);
|
75
|
+
|
73
76
|
// Deallocates this GumboStringBuffer.
|
74
77
|
void gumbo_string_buffer_destroy(
|
75
78
|
struct GumboInternalParser* parser, GumboStringBuffer* buffer);
|
@@ -25,23 +25,22 @@
|
|
25
25
|
|
26
26
|
struct GumboInternalParser;
|
27
27
|
|
28
|
-
const GumboStringPiece kGumboEmptyString = {
|
28
|
+
const GumboStringPiece kGumboEmptyString = {NULL, 0};
|
29
29
|
|
30
30
|
bool gumbo_string_equals(
|
31
31
|
const GumboStringPiece* str1, const GumboStringPiece* str2) {
|
32
32
|
return str1->length == str2->length &&
|
33
|
-
|
33
|
+
!memcmp(str1->data, str2->data, str1->length);
|
34
34
|
}
|
35
35
|
|
36
36
|
bool gumbo_string_equals_ignore_case(
|
37
37
|
const GumboStringPiece* str1, const GumboStringPiece* str2) {
|
38
38
|
return str1->length == str2->length &&
|
39
|
-
|
39
|
+
!strncasecmp(str1->data, str2->data, str1->length);
|
40
40
|
}
|
41
41
|
|
42
|
-
void gumbo_string_copy(
|
43
|
-
|
44
|
-
const GumboStringPiece* source) {
|
42
|
+
void gumbo_string_copy(struct GumboInternalParser* parser,
|
43
|
+
GumboStringPiece* dest, const GumboStringPiece* source) {
|
45
44
|
dest->length = source->length;
|
46
45
|
char* buffer = gumbo_parser_allocate(parser, source->length);
|
47
46
|
memcpy(buffer, source->data, source->length);
|
@@ -28,9 +28,8 @@ struct GumboInternalParser;
|
|
28
28
|
// Performs a deep-copy of a GumboStringPiece, allocating a fresh buffer in the
|
29
29
|
// destination and copying over the characters from source. Dest should be
|
30
30
|
// empty, with no buffer allocated; otherwise, this leaks it.
|
31
|
-
void gumbo_string_copy(
|
32
|
-
|
33
|
-
const GumboStringPiece* source);
|
31
|
+
void gumbo_string_copy(struct GumboInternalParser* parser,
|
32
|
+
GumboStringPiece* dest, const GumboStringPiece* source);
|
34
33
|
|
35
34
|
#ifdef __cplusplus
|
36
35
|
}
|
data/gumbo-parser/src/tag.c
CHANGED
@@ -18,164 +18,18 @@
|
|
18
18
|
|
19
19
|
#include <assert.h>
|
20
20
|
#include <ctype.h>
|
21
|
-
#include <
|
21
|
+
#include <string.h>
|
22
22
|
|
23
|
-
// NOTE(jdtang): Keep this in sync with the GumboTag enum in the header.
|
24
|
-
// TODO(jdtang): Investigate whether there're efficiency benefits to putting the
|
25
|
-
// most common tag names first, or to putting them in alphabetical order and
|
26
|
-
// using a binary search.
|
27
23
|
const char* kGumboTagNames[] = {
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
"template",
|
38
|
-
"body",
|
39
|
-
"article",
|
40
|
-
"section",
|
41
|
-
"nav",
|
42
|
-
"aside",
|
43
|
-
"h1",
|
44
|
-
"h2",
|
45
|
-
"h3",
|
46
|
-
"h4",
|
47
|
-
"h5",
|
48
|
-
"h6",
|
49
|
-
"hgroup",
|
50
|
-
"header",
|
51
|
-
"footer",
|
52
|
-
"address",
|
53
|
-
"p",
|
54
|
-
"hr",
|
55
|
-
"pre",
|
56
|
-
"blockquote",
|
57
|
-
"ol",
|
58
|
-
"ul",
|
59
|
-
"li",
|
60
|
-
"dl",
|
61
|
-
"dt",
|
62
|
-
"dd",
|
63
|
-
"figure",
|
64
|
-
"figcaption",
|
65
|
-
"main",
|
66
|
-
"div",
|
67
|
-
"a",
|
68
|
-
"em",
|
69
|
-
"strong",
|
70
|
-
"small",
|
71
|
-
"s",
|
72
|
-
"cite",
|
73
|
-
"q",
|
74
|
-
"dfn",
|
75
|
-
"abbr",
|
76
|
-
"data",
|
77
|
-
"time",
|
78
|
-
"code",
|
79
|
-
"var",
|
80
|
-
"samp",
|
81
|
-
"kbd",
|
82
|
-
"sub",
|
83
|
-
"sup",
|
84
|
-
"i",
|
85
|
-
"b",
|
86
|
-
"u",
|
87
|
-
"mark",
|
88
|
-
"ruby",
|
89
|
-
"rt",
|
90
|
-
"rp",
|
91
|
-
"bdi",
|
92
|
-
"bdo",
|
93
|
-
"span",
|
94
|
-
"br",
|
95
|
-
"wbr",
|
96
|
-
"ins",
|
97
|
-
"del",
|
98
|
-
"image",
|
99
|
-
"img",
|
100
|
-
"iframe",
|
101
|
-
"embed",
|
102
|
-
"object",
|
103
|
-
"param",
|
104
|
-
"video",
|
105
|
-
"audio",
|
106
|
-
"source",
|
107
|
-
"track",
|
108
|
-
"canvas",
|
109
|
-
"map",
|
110
|
-
"area",
|
111
|
-
"math",
|
112
|
-
"mi",
|
113
|
-
"mo",
|
114
|
-
"mn",
|
115
|
-
"ms",
|
116
|
-
"mtext",
|
117
|
-
"mglyph",
|
118
|
-
"malignmark",
|
119
|
-
"annotation-xml",
|
120
|
-
"svg",
|
121
|
-
"foreignobject",
|
122
|
-
"desc",
|
123
|
-
"table",
|
124
|
-
"caption",
|
125
|
-
"colgroup",
|
126
|
-
"col",
|
127
|
-
"tbody",
|
128
|
-
"thead",
|
129
|
-
"tfoot",
|
130
|
-
"tr",
|
131
|
-
"td",
|
132
|
-
"th",
|
133
|
-
"form",
|
134
|
-
"fieldset",
|
135
|
-
"legend",
|
136
|
-
"label",
|
137
|
-
"input",
|
138
|
-
"button",
|
139
|
-
"select",
|
140
|
-
"datalist",
|
141
|
-
"optgroup",
|
142
|
-
"option",
|
143
|
-
"textarea",
|
144
|
-
"keygen",
|
145
|
-
"output",
|
146
|
-
"progress",
|
147
|
-
"meter",
|
148
|
-
"details",
|
149
|
-
"summary",
|
150
|
-
"menu",
|
151
|
-
"menuitem",
|
152
|
-
"applet",
|
153
|
-
"acronym",
|
154
|
-
"bgsound",
|
155
|
-
"dir",
|
156
|
-
"frame",
|
157
|
-
"frameset",
|
158
|
-
"noframes",
|
159
|
-
"isindex",
|
160
|
-
"listing",
|
161
|
-
"xmp",
|
162
|
-
"nextid",
|
163
|
-
"noembed",
|
164
|
-
"plaintext",
|
165
|
-
"rb",
|
166
|
-
"strike",
|
167
|
-
"basefont",
|
168
|
-
"big",
|
169
|
-
"blink",
|
170
|
-
"center",
|
171
|
-
"font",
|
172
|
-
"marquee",
|
173
|
-
"multicol",
|
174
|
-
"nobr",
|
175
|
-
"spacer",
|
176
|
-
"tt",
|
177
|
-
"", // TAG_UNKNOWN
|
178
|
-
"", // TAG_LAST
|
24
|
+
#include "tag_strings.h"
|
25
|
+
"", // TAG_UNKNOWN
|
26
|
+
"", // TAG_LAST
|
27
|
+
};
|
28
|
+
|
29
|
+
static const unsigned char kGumboTagSizes[] = {
|
30
|
+
#include "tag_sizes.h"
|
31
|
+
0, // TAG_UNKNOWN
|
32
|
+
0, // TAG_LAST
|
179
33
|
};
|
180
34
|
|
181
35
|
const char* gumbo_normalized_tagname(GumboTag tag) {
|
@@ -183,7 +37,6 @@ const char* gumbo_normalized_tagname(GumboTag tag) {
|
|
183
37
|
return kGumboTagNames[tag];
|
184
38
|
}
|
185
39
|
|
186
|
-
// TODO(jdtang): Add test for this.
|
187
40
|
void gumbo_tag_from_original_text(GumboStringPiece* text) {
|
188
41
|
if (text->data == NULL) {
|
189
42
|
return;
|
@@ -195,11 +48,11 @@ void gumbo_tag_from_original_text(GumboStringPiece* text) {
|
|
195
48
|
if (text->data[1] == '/') {
|
196
49
|
// End tag.
|
197
50
|
assert(text->length >= 3);
|
198
|
-
text->data += 2;
|
51
|
+
text->data += 2; // Move past </
|
199
52
|
text->length -= 3;
|
200
53
|
} else {
|
201
54
|
// Start tag.
|
202
|
-
text->data += 1;
|
55
|
+
text->data += 1; // Move past <
|
203
56
|
text->length -= 2;
|
204
57
|
// strnchr is apparently not a standard C library function, so I loop
|
205
58
|
// explicitly looking for whitespace or other illegal tag characters.
|
@@ -212,14 +65,31 @@ void gumbo_tag_from_original_text(GumboStringPiece* text) {
|
|
212
65
|
}
|
213
66
|
}
|
214
67
|
|
215
|
-
|
216
|
-
|
217
|
-
|
218
|
-
|
219
|
-
|
220
|
-
|
221
|
-
|
68
|
+
// Compares the first n bytes of s1 and s2, ignoring case as folded by
// tolower(). Unlike strncasecmp, a NUL byte does not end the comparison:
// exactly n bytes are examined unless a mismatch is found first.
//
// Returns 0 when the ranges match; otherwise the difference between the first
// pair of lowercased bytes that differ (negative if s1 orders first, positive
// if s2 does). Returns 0 when n is 0.
static int case_memcmp(const char* s1, const char* s2, unsigned int n) {
  while (n--) {
    // <ctype.h> functions require an argument representable as unsigned char
    // (or EOF). Plain char may be signed, so bytes >= 0x80 must be converted
    // first; passing them through as negative ints is undefined behavior.
    unsigned char c1 = (unsigned char) tolower((unsigned char) *s1++);
    unsigned char c2 = (unsigned char) tolower((unsigned char) *s2++);
    if (c1 != c2) return (int) c1 - (int) c2;
  }
  return 0;
}
|
76
|
+
|
77
|
+
#include "tag_gperf.h"
|
78
|
+
#define TAG_MAP_SIZE (sizeof(kGumboTagMap) / sizeof(kGumboTagMap[0]))
|
79
|
+
|
80
|
+
GumboTag gumbo_tagn_enum(const char* tagname, unsigned int length) {
|
81
|
+
if (length) {
|
82
|
+
unsigned int key = tag_hash(tagname, length);
|
83
|
+
if (key < TAG_MAP_SIZE) {
|
84
|
+
GumboTag tag = kGumboTagMap[key];
|
85
|
+
if (length == kGumboTagSizes[(int) tag] &&
|
86
|
+
!case_memcmp(tagname, kGumboTagNames[(int) tag], length))
|
87
|
+
return tag;
|
222
88
|
}
|
223
89
|
}
|
224
90
|
return GUMBO_TAG_UNKNOWN;
|
225
91
|
}
|
92
|
+
|
93
|
+
// Convenience wrapper around gumbo_tagn_enum for NUL-terminated names: the
// length passed on is strlen(tagname), converted to unsigned int.
GumboTag gumbo_tag_enum(const char* tagname) {
  const unsigned int name_length = (unsigned int) strlen(tagname);
  return gumbo_tagn_enum(tagname, name_length);
}
|
@@ -0,0 +1,150 @@
|
|
1
|
+
html
|
2
|
+
head
|
3
|
+
title
|
4
|
+
base
|
5
|
+
link
|
6
|
+
meta
|
7
|
+
style
|
8
|
+
script
|
9
|
+
noscript
|
10
|
+
template
|
11
|
+
body
|
12
|
+
article
|
13
|
+
section
|
14
|
+
nav
|
15
|
+
aside
|
16
|
+
h1
|
17
|
+
h2
|
18
|
+
h3
|
19
|
+
h4
|
20
|
+
h5
|
21
|
+
h6
|
22
|
+
hgroup
|
23
|
+
header
|
24
|
+
footer
|
25
|
+
address
|
26
|
+
p
|
27
|
+
hr
|
28
|
+
pre
|
29
|
+
blockquote
|
30
|
+
ol
|
31
|
+
ul
|
32
|
+
li
|
33
|
+
dl
|
34
|
+
dt
|
35
|
+
dd
|
36
|
+
figure
|
37
|
+
figcaption
|
38
|
+
main
|
39
|
+
div
|
40
|
+
a
|
41
|
+
em
|
42
|
+
strong
|
43
|
+
small
|
44
|
+
s
|
45
|
+
cite
|
46
|
+
q
|
47
|
+
dfn
|
48
|
+
abbr
|
49
|
+
data
|
50
|
+
time
|
51
|
+
code
|
52
|
+
var
|
53
|
+
samp
|
54
|
+
kbd
|
55
|
+
sub
|
56
|
+
sup
|
57
|
+
i
|
58
|
+
b
|
59
|
+
u
|
60
|
+
mark
|
61
|
+
ruby
|
62
|
+
rt
|
63
|
+
rp
|
64
|
+
bdi
|
65
|
+
bdo
|
66
|
+
span
|
67
|
+
br
|
68
|
+
wbr
|
69
|
+
ins
|
70
|
+
del
|
71
|
+
image
|
72
|
+
img
|
73
|
+
iframe
|
74
|
+
embed
|
75
|
+
object
|
76
|
+
param
|
77
|
+
video
|
78
|
+
audio
|
79
|
+
source
|
80
|
+
track
|
81
|
+
canvas
|
82
|
+
map
|
83
|
+
area
|
84
|
+
math
|
85
|
+
mi
|
86
|
+
mo
|
87
|
+
mn
|
88
|
+
ms
|
89
|
+
mtext
|
90
|
+
mglyph
|
91
|
+
malignmark
|
92
|
+
annotation-xml
|
93
|
+
svg
|
94
|
+
foreignobject
|
95
|
+
desc
|
96
|
+
table
|
97
|
+
caption
|
98
|
+
colgroup
|
99
|
+
col
|
100
|
+
tbody
|
101
|
+
thead
|
102
|
+
tfoot
|
103
|
+
tr
|
104
|
+
td
|
105
|
+
th
|
106
|
+
form
|
107
|
+
fieldset
|
108
|
+
legend
|
109
|
+
label
|
110
|
+
input
|
111
|
+
button
|
112
|
+
select
|
113
|
+
datalist
|
114
|
+
optgroup
|
115
|
+
option
|
116
|
+
textarea
|
117
|
+
keygen
|
118
|
+
output
|
119
|
+
progress
|
120
|
+
meter
|
121
|
+
details
|
122
|
+
summary
|
123
|
+
menu
|
124
|
+
menuitem
|
125
|
+
applet
|
126
|
+
acronym
|
127
|
+
bgsound
|
128
|
+
dir
|
129
|
+
frame
|
130
|
+
frameset
|
131
|
+
noframes
|
132
|
+
isindex
|
133
|
+
listing
|
134
|
+
xmp
|
135
|
+
nextid
|
136
|
+
noembed
|
137
|
+
plaintext
|
138
|
+
rb
|
139
|
+
strike
|
140
|
+
basefont
|
141
|
+
big
|
142
|
+
blink
|
143
|
+
center
|
144
|
+
font
|
145
|
+
marquee
|
146
|
+
multicol
|
147
|
+
nobr
|
148
|
+
spacer
|
149
|
+
tt
|
150
|
+
rtc
|