nokogumbo 1.3.0 → 1.4.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.md +1 -1
- data/ext/nokogumboc/nokogumbo.c +1 -0
- data/gumbo-parser/src/error.c +6 -3
- data/gumbo-parser/src/gumbo.h +36 -170
- data/gumbo-parser/src/parser.c +1030 -779
- data/gumbo-parser/src/string_buffer.c +8 -1
- data/gumbo-parser/src/string_buffer.h +5 -0
- data/gumbo-parser/src/tag.c +35 -162
- data/gumbo-parser/src/tag.in +150 -0
- data/gumbo-parser/src/tag_enum.h +150 -0
- data/gumbo-parser/src/tag_gperf.h +343 -0
- data/gumbo-parser/src/tag_sizes.h +1 -0
- data/gumbo-parser/src/tag_strings.h +150 -0
- data/gumbo-parser/src/token_type.h +1 -0
- data/gumbo-parser/src/tokenizer.c +29 -21
- data/gumbo-parser/src/utf8.c +9 -8
- data/gumbo-parser/src/vector.c +1 -1
- data/gumbo-parser/visualc/include/strings.h +2 -1
- data/test-nokogumbo.rb +140 -0
- metadata +16 -10
@@ -26,7 +26,9 @@
|
|
26
26
|
|
27
27
|
struct GumboInternalParser;
|
28
28
|
|
29
|
-
|
29
|
+
// Size chosen via statistical analysis of ~60K websites.
|
30
|
+
// 99% of text nodes and 98% of attribute names/values fit in this initial size.
|
31
|
+
static const size_t kDefaultStringBufferSize = 5;
|
30
32
|
|
31
33
|
static void maybe_resize_string_buffer(
|
32
34
|
struct GumboInternalParser* parser, size_t additional_chars,
|
@@ -100,6 +102,11 @@ char* gumbo_string_buffer_to_string(
|
|
100
102
|
return buffer;
|
101
103
|
}
|
102
104
|
|
105
|
+
// Resets the buffer to the empty state by setting its length to zero. The
// stored bytes are left untouched; only the length field is written.
void gumbo_string_buffer_clear(
    struct GumboInternalParser* parser, GumboStringBuffer* input) {
  // The parser argument is not needed to reset the length.
  (void)parser;
  input->length = 0;
}
|
109
|
+
|
103
110
|
void gumbo_string_buffer_destroy(
|
104
111
|
struct GumboInternalParser* parser, GumboStringBuffer* buffer) {
|
105
112
|
gumbo_parser_deallocate(parser, buffer->data);
|
@@ -70,6 +70,11 @@ void gumbo_string_buffer_append_string(
|
|
70
70
|
char* gumbo_string_buffer_to_string(
|
71
71
|
struct GumboInternalParser* parser, GumboStringBuffer* input);
|
72
72
|
|
73
|
+
// Reinitialize this string buffer. This clears it by setting length=0. It
|
74
|
+
// does not zero out the buffer itself.
|
75
|
+
void gumbo_string_buffer_clear(
|
76
|
+
struct GumboInternalParser* parser, GumboStringBuffer* input);
|
77
|
+
|
73
78
|
// Deallocates this GumboStringBuffer.
|
74
79
|
void gumbo_string_buffer_destroy(
|
75
80
|
struct GumboInternalParser* parser, GumboStringBuffer* buffer);
|
data/gumbo-parser/src/tag.c
CHANGED
@@ -18,172 +18,25 @@
|
|
18
18
|
|
19
19
|
#include <assert.h>
|
20
20
|
#include <ctype.h>
|
21
|
-
#include <strings.h>
|
21
|
+
#include <string.h>
|
22
22
|
|
23
|
-
// NOTE(jdtang): Keep this in sync with the GumboTag enum in the header.
|
24
|
-
// TODO(jdtang): Investigate whether there're efficiency benefits to putting the
|
25
|
-
// most common tag names first, or to putting them in alphabetical order and
|
26
|
-
// using a binary search.
|
27
23
|
const char* kGumboTagNames[] = {
|
28
|
-
|
29
|
-
"head",
|
30
|
-
"title",
|
31
|
-
"base",
|
32
|
-
"link",
|
33
|
-
"meta",
|
34
|
-
"style",
|
35
|
-
"script",
|
36
|
-
"noscript",
|
37
|
-
"template",
|
38
|
-
"body",
|
39
|
-
"article",
|
40
|
-
"section",
|
41
|
-
"nav",
|
42
|
-
"aside",
|
43
|
-
"h1",
|
44
|
-
"h2",
|
45
|
-
"h3",
|
46
|
-
"h4",
|
47
|
-
"h5",
|
48
|
-
"h6",
|
49
|
-
"hgroup",
|
50
|
-
"header",
|
51
|
-
"footer",
|
52
|
-
"address",
|
53
|
-
"p",
|
54
|
-
"hr",
|
55
|
-
"pre",
|
56
|
-
"blockquote",
|
57
|
-
"ol",
|
58
|
-
"ul",
|
59
|
-
"li",
|
60
|
-
"dl",
|
61
|
-
"dt",
|
62
|
-
"dd",
|
63
|
-
"figure",
|
64
|
-
"figcaption",
|
65
|
-
"main",
|
66
|
-
"div",
|
67
|
-
"a",
|
68
|
-
"em",
|
69
|
-
"strong",
|
70
|
-
"small",
|
71
|
-
"s",
|
72
|
-
"cite",
|
73
|
-
"q",
|
74
|
-
"dfn",
|
75
|
-
"abbr",
|
76
|
-
"data",
|
77
|
-
"time",
|
78
|
-
"code",
|
79
|
-
"var",
|
80
|
-
"samp",
|
81
|
-
"kbd",
|
82
|
-
"sub",
|
83
|
-
"sup",
|
84
|
-
"i",
|
85
|
-
"b",
|
86
|
-
"u",
|
87
|
-
"mark",
|
88
|
-
"ruby",
|
89
|
-
"rt",
|
90
|
-
"rp",
|
91
|
-
"bdi",
|
92
|
-
"bdo",
|
93
|
-
"span",
|
94
|
-
"br",
|
95
|
-
"wbr",
|
96
|
-
"ins",
|
97
|
-
"del",
|
98
|
-
"image",
|
99
|
-
"img",
|
100
|
-
"iframe",
|
101
|
-
"embed",
|
102
|
-
"object",
|
103
|
-
"param",
|
104
|
-
"video",
|
105
|
-
"audio",
|
106
|
-
"source",
|
107
|
-
"track",
|
108
|
-
"canvas",
|
109
|
-
"map",
|
110
|
-
"area",
|
111
|
-
"math",
|
112
|
-
"mi",
|
113
|
-
"mo",
|
114
|
-
"mn",
|
115
|
-
"ms",
|
116
|
-
"mtext",
|
117
|
-
"mglyph",
|
118
|
-
"malignmark",
|
119
|
-
"annotation-xml",
|
120
|
-
"svg",
|
121
|
-
"foreignobject",
|
122
|
-
"desc",
|
123
|
-
"table",
|
124
|
-
"caption",
|
125
|
-
"colgroup",
|
126
|
-
"col",
|
127
|
-
"tbody",
|
128
|
-
"thead",
|
129
|
-
"tfoot",
|
130
|
-
"tr",
|
131
|
-
"td",
|
132
|
-
"th",
|
133
|
-
"form",
|
134
|
-
"fieldset",
|
135
|
-
"legend",
|
136
|
-
"label",
|
137
|
-
"input",
|
138
|
-
"button",
|
139
|
-
"select",
|
140
|
-
"datalist",
|
141
|
-
"optgroup",
|
142
|
-
"option",
|
143
|
-
"textarea",
|
144
|
-
"keygen",
|
145
|
-
"output",
|
146
|
-
"progress",
|
147
|
-
"meter",
|
148
|
-
"details",
|
149
|
-
"summary",
|
150
|
-
"menu",
|
151
|
-
"menuitem",
|
152
|
-
"applet",
|
153
|
-
"acronym",
|
154
|
-
"bgsound",
|
155
|
-
"dir",
|
156
|
-
"frame",
|
157
|
-
"frameset",
|
158
|
-
"noframes",
|
159
|
-
"isindex",
|
160
|
-
"listing",
|
161
|
-
"xmp",
|
162
|
-
"nextid",
|
163
|
-
"noembed",
|
164
|
-
"plaintext",
|
165
|
-
"rb",
|
166
|
-
"strike",
|
167
|
-
"basefont",
|
168
|
-
"big",
|
169
|
-
"blink",
|
170
|
-
"center",
|
171
|
-
"font",
|
172
|
-
"marquee",
|
173
|
-
"multicol",
|
174
|
-
"nobr",
|
175
|
-
"spacer",
|
176
|
-
"tt",
|
24
|
+
# include "tag_strings.h"
|
177
25
|
"", // TAG_UNKNOWN
|
178
26
|
"", // TAG_LAST
|
179
27
|
};
|
180
28
|
|
29
|
+
static const unsigned char kGumboTagSizes[] = {
|
30
|
+
# include "tag_sizes.h"
|
31
|
+
0, // TAG_UNKNOWN
|
32
|
+
0, // TAG_LAST
|
33
|
+
};
|
34
|
+
|
181
35
|
// Returns the entry of kGumboTagNames for the given tag. The tag is asserted
// to be no greater than GUMBO_TAG_LAST; the table carries empty-string
// entries for the unknown and last sentinels.
const char* gumbo_normalized_tagname(GumboTag tag) {
  assert(tag <= GUMBO_TAG_LAST);
  const char* const normalized_name = kGumboTagNames[tag];
  return normalized_name;
}
|
185
39
|
|
186
|
-
// TODO(jdtang): Add test for this.
|
187
40
|
void gumbo_tag_from_original_text(GumboStringPiece* text) {
|
188
41
|
if (text->data == NULL) {
|
189
42
|
return;
|
@@ -212,14 +65,34 @@ void gumbo_tag_from_original_text(GumboStringPiece* text) {
|
|
212
65
|
}
|
213
66
|
}
|
214
67
|
|
215
|
-
|
216
|
-
|
217
|
-
|
218
|
-
|
219
|
-
|
220
|
-
|
221
|
-
|
68
|
+
/*
 * Case-insensitively compares the first n bytes of s1 and s2.
 *
 * Each byte is folded with tolower() before comparison. Returns 0 when all n
 * folded bytes match (including n == 0); otherwise returns the difference
 * between the first pair of folded bytes that differ, computed on their
 * unsigned char values (negative if s1 sorts first, positive if s2 does).
 *
 * Both pointers must reference at least n readable bytes; no NUL terminator
 * is looked for.
 */
static int
case_memcmp(const char *s1, const char *s2, unsigned int n)
{
  while (n--) {
    /*
     * The <ctype.h> functions are only defined for EOF and values
     * representable as unsigned char. Plain char may be signed, so bytes
     * >= 0x80 must be converted first to avoid undefined behavior.
     */
    unsigned char c1 = (unsigned char)tolower((unsigned char)*s1++);
    unsigned char c2 = (unsigned char)tolower((unsigned char)*s2++);
    if (c1 != c2)
      return (int)c1 - (int)c2;
  }
  return 0;
}
|
79
|
+
|
80
|
+
#include "tag_gperf.h"
|
81
|
+
#define TAG_MAP_SIZE (sizeof(kGumboTagMap)/sizeof(kGumboTagMap[0]))
|
82
|
+
|
83
|
+
GumboTag gumbo_tagn_enum(const char* tagname, unsigned int length) {
|
84
|
+
if (length) {
|
85
|
+
unsigned int key = tag_hash(tagname, length);
|
86
|
+
if (key < TAG_MAP_SIZE) {
|
87
|
+
GumboTag tag = kGumboTagMap[key];
|
88
|
+
if (length == kGumboTagSizes[(int)tag] &&
|
89
|
+
!case_memcmp(tagname, kGumboTagNames[(int)tag], length))
|
90
|
+
return tag;
|
222
91
|
}
|
223
92
|
}
|
224
93
|
return GUMBO_TAG_UNKNOWN;
|
225
94
|
}
|
95
|
+
|
96
|
+
// Looks up the GumboTag for a NUL-terminated tag name by measuring it with
// strlen and delegating to gumbo_tagn_enum.
GumboTag gumbo_tag_enum(const char* tagname) {
  unsigned int name_length = (unsigned int)strlen(tagname);
  return gumbo_tagn_enum(tagname, name_length);
}
|
@@ -0,0 +1,150 @@
|
|
1
|
+
html
|
2
|
+
head
|
3
|
+
title
|
4
|
+
base
|
5
|
+
link
|
6
|
+
meta
|
7
|
+
style
|
8
|
+
script
|
9
|
+
noscript
|
10
|
+
template
|
11
|
+
body
|
12
|
+
article
|
13
|
+
section
|
14
|
+
nav
|
15
|
+
aside
|
16
|
+
h1
|
17
|
+
h2
|
18
|
+
h3
|
19
|
+
h4
|
20
|
+
h5
|
21
|
+
h6
|
22
|
+
hgroup
|
23
|
+
header
|
24
|
+
footer
|
25
|
+
address
|
26
|
+
p
|
27
|
+
hr
|
28
|
+
pre
|
29
|
+
blockquote
|
30
|
+
ol
|
31
|
+
ul
|
32
|
+
li
|
33
|
+
dl
|
34
|
+
dt
|
35
|
+
dd
|
36
|
+
figure
|
37
|
+
figcaption
|
38
|
+
main
|
39
|
+
div
|
40
|
+
a
|
41
|
+
em
|
42
|
+
strong
|
43
|
+
small
|
44
|
+
s
|
45
|
+
cite
|
46
|
+
q
|
47
|
+
dfn
|
48
|
+
abbr
|
49
|
+
data
|
50
|
+
time
|
51
|
+
code
|
52
|
+
var
|
53
|
+
samp
|
54
|
+
kbd
|
55
|
+
sub
|
56
|
+
sup
|
57
|
+
i
|
58
|
+
b
|
59
|
+
u
|
60
|
+
mark
|
61
|
+
ruby
|
62
|
+
rt
|
63
|
+
rp
|
64
|
+
bdi
|
65
|
+
bdo
|
66
|
+
span
|
67
|
+
br
|
68
|
+
wbr
|
69
|
+
ins
|
70
|
+
del
|
71
|
+
image
|
72
|
+
img
|
73
|
+
iframe
|
74
|
+
embed
|
75
|
+
object
|
76
|
+
param
|
77
|
+
video
|
78
|
+
audio
|
79
|
+
source
|
80
|
+
track
|
81
|
+
canvas
|
82
|
+
map
|
83
|
+
area
|
84
|
+
math
|
85
|
+
mi
|
86
|
+
mo
|
87
|
+
mn
|
88
|
+
ms
|
89
|
+
mtext
|
90
|
+
mglyph
|
91
|
+
malignmark
|
92
|
+
annotation-xml
|
93
|
+
svg
|
94
|
+
foreignobject
|
95
|
+
desc
|
96
|
+
table
|
97
|
+
caption
|
98
|
+
colgroup
|
99
|
+
col
|
100
|
+
tbody
|
101
|
+
thead
|
102
|
+
tfoot
|
103
|
+
tr
|
104
|
+
td
|
105
|
+
th
|
106
|
+
form
|
107
|
+
fieldset
|
108
|
+
legend
|
109
|
+
label
|
110
|
+
input
|
111
|
+
button
|
112
|
+
select
|
113
|
+
datalist
|
114
|
+
optgroup
|
115
|
+
option
|
116
|
+
textarea
|
117
|
+
keygen
|
118
|
+
output
|
119
|
+
progress
|
120
|
+
meter
|
121
|
+
details
|
122
|
+
summary
|
123
|
+
menu
|
124
|
+
menuitem
|
125
|
+
applet
|
126
|
+
acronym
|
127
|
+
bgsound
|
128
|
+
dir
|
129
|
+
frame
|
130
|
+
frameset
|
131
|
+
noframes
|
132
|
+
isindex
|
133
|
+
listing
|
134
|
+
xmp
|
135
|
+
nextid
|
136
|
+
noembed
|
137
|
+
plaintext
|
138
|
+
rb
|
139
|
+
strike
|
140
|
+
basefont
|
141
|
+
big
|
142
|
+
blink
|
143
|
+
center
|
144
|
+
font
|
145
|
+
marquee
|
146
|
+
multicol
|
147
|
+
nobr
|
148
|
+
spacer
|
149
|
+
tt
|
150
|
+
rtc
|
@@ -0,0 +1,150 @@
|
|
1
|
+
GUMBO_TAG_HTML,
|
2
|
+
GUMBO_TAG_HEAD,
|
3
|
+
GUMBO_TAG_TITLE,
|
4
|
+
GUMBO_TAG_BASE,
|
5
|
+
GUMBO_TAG_LINK,
|
6
|
+
GUMBO_TAG_META,
|
7
|
+
GUMBO_TAG_STYLE,
|
8
|
+
GUMBO_TAG_SCRIPT,
|
9
|
+
GUMBO_TAG_NOSCRIPT,
|
10
|
+
GUMBO_TAG_TEMPLATE,
|
11
|
+
GUMBO_TAG_BODY,
|
12
|
+
GUMBO_TAG_ARTICLE,
|
13
|
+
GUMBO_TAG_SECTION,
|
14
|
+
GUMBO_TAG_NAV,
|
15
|
+
GUMBO_TAG_ASIDE,
|
16
|
+
GUMBO_TAG_H1,
|
17
|
+
GUMBO_TAG_H2,
|
18
|
+
GUMBO_TAG_H3,
|
19
|
+
GUMBO_TAG_H4,
|
20
|
+
GUMBO_TAG_H5,
|
21
|
+
GUMBO_TAG_H6,
|
22
|
+
GUMBO_TAG_HGROUP,
|
23
|
+
GUMBO_TAG_HEADER,
|
24
|
+
GUMBO_TAG_FOOTER,
|
25
|
+
GUMBO_TAG_ADDRESS,
|
26
|
+
GUMBO_TAG_P,
|
27
|
+
GUMBO_TAG_HR,
|
28
|
+
GUMBO_TAG_PRE,
|
29
|
+
GUMBO_TAG_BLOCKQUOTE,
|
30
|
+
GUMBO_TAG_OL,
|
31
|
+
GUMBO_TAG_UL,
|
32
|
+
GUMBO_TAG_LI,
|
33
|
+
GUMBO_TAG_DL,
|
34
|
+
GUMBO_TAG_DT,
|
35
|
+
GUMBO_TAG_DD,
|
36
|
+
GUMBO_TAG_FIGURE,
|
37
|
+
GUMBO_TAG_FIGCAPTION,
|
38
|
+
GUMBO_TAG_MAIN,
|
39
|
+
GUMBO_TAG_DIV,
|
40
|
+
GUMBO_TAG_A,
|
41
|
+
GUMBO_TAG_EM,
|
42
|
+
GUMBO_TAG_STRONG,
|
43
|
+
GUMBO_TAG_SMALL,
|
44
|
+
GUMBO_TAG_S,
|
45
|
+
GUMBO_TAG_CITE,
|
46
|
+
GUMBO_TAG_Q,
|
47
|
+
GUMBO_TAG_DFN,
|
48
|
+
GUMBO_TAG_ABBR,
|
49
|
+
GUMBO_TAG_DATA,
|
50
|
+
GUMBO_TAG_TIME,
|
51
|
+
GUMBO_TAG_CODE,
|
52
|
+
GUMBO_TAG_VAR,
|
53
|
+
GUMBO_TAG_SAMP,
|
54
|
+
GUMBO_TAG_KBD,
|
55
|
+
GUMBO_TAG_SUB,
|
56
|
+
GUMBO_TAG_SUP,
|
57
|
+
GUMBO_TAG_I,
|
58
|
+
GUMBO_TAG_B,
|
59
|
+
GUMBO_TAG_U,
|
60
|
+
GUMBO_TAG_MARK,
|
61
|
+
GUMBO_TAG_RUBY,
|
62
|
+
GUMBO_TAG_RT,
|
63
|
+
GUMBO_TAG_RP,
|
64
|
+
GUMBO_TAG_BDI,
|
65
|
+
GUMBO_TAG_BDO,
|
66
|
+
GUMBO_TAG_SPAN,
|
67
|
+
GUMBO_TAG_BR,
|
68
|
+
GUMBO_TAG_WBR,
|
69
|
+
GUMBO_TAG_INS,
|
70
|
+
GUMBO_TAG_DEL,
|
71
|
+
GUMBO_TAG_IMAGE,
|
72
|
+
GUMBO_TAG_IMG,
|
73
|
+
GUMBO_TAG_IFRAME,
|
74
|
+
GUMBO_TAG_EMBED,
|
75
|
+
GUMBO_TAG_OBJECT,
|
76
|
+
GUMBO_TAG_PARAM,
|
77
|
+
GUMBO_TAG_VIDEO,
|
78
|
+
GUMBO_TAG_AUDIO,
|
79
|
+
GUMBO_TAG_SOURCE,
|
80
|
+
GUMBO_TAG_TRACK,
|
81
|
+
GUMBO_TAG_CANVAS,
|
82
|
+
GUMBO_TAG_MAP,
|
83
|
+
GUMBO_TAG_AREA,
|
84
|
+
GUMBO_TAG_MATH,
|
85
|
+
GUMBO_TAG_MI,
|
86
|
+
GUMBO_TAG_MO,
|
87
|
+
GUMBO_TAG_MN,
|
88
|
+
GUMBO_TAG_MS,
|
89
|
+
GUMBO_TAG_MTEXT,
|
90
|
+
GUMBO_TAG_MGLYPH,
|
91
|
+
GUMBO_TAG_MALIGNMARK,
|
92
|
+
GUMBO_TAG_ANNOTATION_XML,
|
93
|
+
GUMBO_TAG_SVG,
|
94
|
+
GUMBO_TAG_FOREIGNOBJECT,
|
95
|
+
GUMBO_TAG_DESC,
|
96
|
+
GUMBO_TAG_TABLE,
|
97
|
+
GUMBO_TAG_CAPTION,
|
98
|
+
GUMBO_TAG_COLGROUP,
|
99
|
+
GUMBO_TAG_COL,
|
100
|
+
GUMBO_TAG_TBODY,
|
101
|
+
GUMBO_TAG_THEAD,
|
102
|
+
GUMBO_TAG_TFOOT,
|
103
|
+
GUMBO_TAG_TR,
|
104
|
+
GUMBO_TAG_TD,
|
105
|
+
GUMBO_TAG_TH,
|
106
|
+
GUMBO_TAG_FORM,
|
107
|
+
GUMBO_TAG_FIELDSET,
|
108
|
+
GUMBO_TAG_LEGEND,
|
109
|
+
GUMBO_TAG_LABEL,
|
110
|
+
GUMBO_TAG_INPUT,
|
111
|
+
GUMBO_TAG_BUTTON,
|
112
|
+
GUMBO_TAG_SELECT,
|
113
|
+
GUMBO_TAG_DATALIST,
|
114
|
+
GUMBO_TAG_OPTGROUP,
|
115
|
+
GUMBO_TAG_OPTION,
|
116
|
+
GUMBO_TAG_TEXTAREA,
|
117
|
+
GUMBO_TAG_KEYGEN,
|
118
|
+
GUMBO_TAG_OUTPUT,
|
119
|
+
GUMBO_TAG_PROGRESS,
|
120
|
+
GUMBO_TAG_METER,
|
121
|
+
GUMBO_TAG_DETAILS,
|
122
|
+
GUMBO_TAG_SUMMARY,
|
123
|
+
GUMBO_TAG_MENU,
|
124
|
+
GUMBO_TAG_MENUITEM,
|
125
|
+
GUMBO_TAG_APPLET,
|
126
|
+
GUMBO_TAG_ACRONYM,
|
127
|
+
GUMBO_TAG_BGSOUND,
|
128
|
+
GUMBO_TAG_DIR,
|
129
|
+
GUMBO_TAG_FRAME,
|
130
|
+
GUMBO_TAG_FRAMESET,
|
131
|
+
GUMBO_TAG_NOFRAMES,
|
132
|
+
GUMBO_TAG_ISINDEX,
|
133
|
+
GUMBO_TAG_LISTING,
|
134
|
+
GUMBO_TAG_XMP,
|
135
|
+
GUMBO_TAG_NEXTID,
|
136
|
+
GUMBO_TAG_NOEMBED,
|
137
|
+
GUMBO_TAG_PLAINTEXT,
|
138
|
+
GUMBO_TAG_RB,
|
139
|
+
GUMBO_TAG_STRIKE,
|
140
|
+
GUMBO_TAG_BASEFONT,
|
141
|
+
GUMBO_TAG_BIG,
|
142
|
+
GUMBO_TAG_BLINK,
|
143
|
+
GUMBO_TAG_CENTER,
|
144
|
+
GUMBO_TAG_FONT,
|
145
|
+
GUMBO_TAG_MARQUEE,
|
146
|
+
GUMBO_TAG_MULTICOL,
|
147
|
+
GUMBO_TAG_NOBR,
|
148
|
+
GUMBO_TAG_SPACER,
|
149
|
+
GUMBO_TAG_TT,
|
150
|
+
GUMBO_TAG_RTC,
|