nokogumbo 1.3.0 → 1.4.1

This diff shows the changes between two publicly available versions of a package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
@@ -26,7 +26,9 @@
26
26
 
27
27
  struct GumboInternalParser;
28
28
 
29
- static const size_t kDefaultStringBufferSize = 10;
29
+ // Size chosen via statistical analysis of ~60K websites.
30
+ // 99% of text nodes and 98% of attribute names/values fit in this initial size.
31
+ static const size_t kDefaultStringBufferSize = 5;
30
32
 
31
33
  static void maybe_resize_string_buffer(
32
34
  struct GumboInternalParser* parser, size_t additional_chars,
@@ -100,6 +102,11 @@ char* gumbo_string_buffer_to_string(
100
102
  return buffer;
101
103
  }
102
104
 
105
+ void gumbo_string_buffer_clear(
106
+ struct GumboInternalParser* parser, GumboStringBuffer* input) {
107
+ input->length = 0;
108
+ }
109
+
103
110
  void gumbo_string_buffer_destroy(
104
111
  struct GumboInternalParser* parser, GumboStringBuffer* buffer) {
105
112
  gumbo_parser_deallocate(parser, buffer->data);
@@ -70,6 +70,11 @@ void gumbo_string_buffer_append_string(
70
70
  char* gumbo_string_buffer_to_string(
71
71
  struct GumboInternalParser* parser, GumboStringBuffer* input);
72
72
 
73
+ // Reinitialize this string buffer. This clears it by setting length=0. It
74
+ // does not zero out the buffer itself.
75
+ void gumbo_string_buffer_clear(
76
+ struct GumboInternalParser* parser, GumboStringBuffer* input);
77
+
73
78
  // Deallocates this GumboStringBuffer.
74
79
  void gumbo_string_buffer_destroy(
75
80
  struct GumboInternalParser* parser, GumboStringBuffer* buffer);
@@ -18,172 +18,25 @@
18
18
 
19
19
  #include <assert.h>
20
20
  #include <ctype.h>
21
- #include <strings.h> // For strcasecmp.
21
+ #include <string.h>
22
22
 
23
- // NOTE(jdtang): Keep this in sync with the GumboTag enum in the header.
24
- // TODO(jdtang): Investigate whether there're efficiency benefits to putting the
25
- // most common tag names first, or to putting them in alphabetical order and
26
- // using a binary search.
27
23
  const char* kGumboTagNames[] = {
28
- "html",
29
- "head",
30
- "title",
31
- "base",
32
- "link",
33
- "meta",
34
- "style",
35
- "script",
36
- "noscript",
37
- "template",
38
- "body",
39
- "article",
40
- "section",
41
- "nav",
42
- "aside",
43
- "h1",
44
- "h2",
45
- "h3",
46
- "h4",
47
- "h5",
48
- "h6",
49
- "hgroup",
50
- "header",
51
- "footer",
52
- "address",
53
- "p",
54
- "hr",
55
- "pre",
56
- "blockquote",
57
- "ol",
58
- "ul",
59
- "li",
60
- "dl",
61
- "dt",
62
- "dd",
63
- "figure",
64
- "figcaption",
65
- "main",
66
- "div",
67
- "a",
68
- "em",
69
- "strong",
70
- "small",
71
- "s",
72
- "cite",
73
- "q",
74
- "dfn",
75
- "abbr",
76
- "data",
77
- "time",
78
- "code",
79
- "var",
80
- "samp",
81
- "kbd",
82
- "sub",
83
- "sup",
84
- "i",
85
- "b",
86
- "u",
87
- "mark",
88
- "ruby",
89
- "rt",
90
- "rp",
91
- "bdi",
92
- "bdo",
93
- "span",
94
- "br",
95
- "wbr",
96
- "ins",
97
- "del",
98
- "image",
99
- "img",
100
- "iframe",
101
- "embed",
102
- "object",
103
- "param",
104
- "video",
105
- "audio",
106
- "source",
107
- "track",
108
- "canvas",
109
- "map",
110
- "area",
111
- "math",
112
- "mi",
113
- "mo",
114
- "mn",
115
- "ms",
116
- "mtext",
117
- "mglyph",
118
- "malignmark",
119
- "annotation-xml",
120
- "svg",
121
- "foreignobject",
122
- "desc",
123
- "table",
124
- "caption",
125
- "colgroup",
126
- "col",
127
- "tbody",
128
- "thead",
129
- "tfoot",
130
- "tr",
131
- "td",
132
- "th",
133
- "form",
134
- "fieldset",
135
- "legend",
136
- "label",
137
- "input",
138
- "button",
139
- "select",
140
- "datalist",
141
- "optgroup",
142
- "option",
143
- "textarea",
144
- "keygen",
145
- "output",
146
- "progress",
147
- "meter",
148
- "details",
149
- "summary",
150
- "menu",
151
- "menuitem",
152
- "applet",
153
- "acronym",
154
- "bgsound",
155
- "dir",
156
- "frame",
157
- "frameset",
158
- "noframes",
159
- "isindex",
160
- "listing",
161
- "xmp",
162
- "nextid",
163
- "noembed",
164
- "plaintext",
165
- "rb",
166
- "strike",
167
- "basefont",
168
- "big",
169
- "blink",
170
- "center",
171
- "font",
172
- "marquee",
173
- "multicol",
174
- "nobr",
175
- "spacer",
176
- "tt",
24
+ # include "tag_strings.h"
177
25
  "", // TAG_UNKNOWN
178
26
  "", // TAG_LAST
179
27
  };
180
28
 
29
+ static const unsigned char kGumboTagSizes[] = {
30
+ # include "tag_sizes.h"
31
+ 0, // TAG_UNKNOWN
32
+ 0, // TAG_LAST
33
+ };
34
+
181
35
  const char* gumbo_normalized_tagname(GumboTag tag) {
182
36
  assert(tag <= GUMBO_TAG_LAST);
183
37
  return kGumboTagNames[tag];
184
38
  }
185
39
 
186
- // TODO(jdtang): Add test for this.
187
40
  void gumbo_tag_from_original_text(GumboStringPiece* text) {
188
41
  if (text->data == NULL) {
189
42
  return;
@@ -212,14 +65,34 @@ void gumbo_tag_from_original_text(GumboStringPiece* text) {
212
65
  }
213
66
  }
214
67
 
215
- GumboTag gumbo_tag_enum(const char* tagname) {
216
- for (int i = 0; i < GUMBO_TAG_LAST; ++i) {
217
- // TODO(jdtang): strcasecmp is non-portable, so if we want to support
218
- // non-GCC compilers, we'll need some #ifdef magic. This source already has
219
- // pretty significant issues with MSVC6 anyway.
220
- if (strcasecmp(tagname, kGumboTagNames[i]) == 0) {
221
- return i;
68
+ static int
69
+ case_memcmp(const char *s1, const char *s2, unsigned int n)
70
+ {
71
+ while (n--) {
72
+ unsigned char c1 = tolower(*s1++);
73
+ unsigned char c2 = tolower(*s2++);
74
+ if (c1 != c2)
75
+ return (int)c1 - (int)c2;
76
+ }
77
+ return 0;
78
+ }
79
+
80
+ #include "tag_gperf.h"
81
+ #define TAG_MAP_SIZE (sizeof(kGumboTagMap)/sizeof(kGumboTagMap[0]))
82
+
83
+ GumboTag gumbo_tagn_enum(const char* tagname, unsigned int length) {
84
+ if (length) {
85
+ unsigned int key = tag_hash(tagname, length);
86
+ if (key < TAG_MAP_SIZE) {
87
+ GumboTag tag = kGumboTagMap[key];
88
+ if (length == kGumboTagSizes[(int)tag] &&
89
+ !case_memcmp(tagname, kGumboTagNames[(int)tag], length))
90
+ return tag;
222
91
  }
223
92
  }
224
93
  return GUMBO_TAG_UNKNOWN;
225
94
  }
95
+
96
+ GumboTag gumbo_tag_enum(const char* tagname) {
97
+ return gumbo_tagn_enum(tagname, strlen(tagname));
98
+ }
@@ -0,0 +1,150 @@
1
+ html
2
+ head
3
+ title
4
+ base
5
+ link
6
+ meta
7
+ style
8
+ script
9
+ noscript
10
+ template
11
+ body
12
+ article
13
+ section
14
+ nav
15
+ aside
16
+ h1
17
+ h2
18
+ h3
19
+ h4
20
+ h5
21
+ h6
22
+ hgroup
23
+ header
24
+ footer
25
+ address
26
+ p
27
+ hr
28
+ pre
29
+ blockquote
30
+ ol
31
+ ul
32
+ li
33
+ dl
34
+ dt
35
+ dd
36
+ figure
37
+ figcaption
38
+ main
39
+ div
40
+ a
41
+ em
42
+ strong
43
+ small
44
+ s
45
+ cite
46
+ q
47
+ dfn
48
+ abbr
49
+ data
50
+ time
51
+ code
52
+ var
53
+ samp
54
+ kbd
55
+ sub
56
+ sup
57
+ i
58
+ b
59
+ u
60
+ mark
61
+ ruby
62
+ rt
63
+ rp
64
+ bdi
65
+ bdo
66
+ span
67
+ br
68
+ wbr
69
+ ins
70
+ del
71
+ image
72
+ img
73
+ iframe
74
+ embed
75
+ object
76
+ param
77
+ video
78
+ audio
79
+ source
80
+ track
81
+ canvas
82
+ map
83
+ area
84
+ math
85
+ mi
86
+ mo
87
+ mn
88
+ ms
89
+ mtext
90
+ mglyph
91
+ malignmark
92
+ annotation-xml
93
+ svg
94
+ foreignobject
95
+ desc
96
+ table
97
+ caption
98
+ colgroup
99
+ col
100
+ tbody
101
+ thead
102
+ tfoot
103
+ tr
104
+ td
105
+ th
106
+ form
107
+ fieldset
108
+ legend
109
+ label
110
+ input
111
+ button
112
+ select
113
+ datalist
114
+ optgroup
115
+ option
116
+ textarea
117
+ keygen
118
+ output
119
+ progress
120
+ meter
121
+ details
122
+ summary
123
+ menu
124
+ menuitem
125
+ applet
126
+ acronym
127
+ bgsound
128
+ dir
129
+ frame
130
+ frameset
131
+ noframes
132
+ isindex
133
+ listing
134
+ xmp
135
+ nextid
136
+ noembed
137
+ plaintext
138
+ rb
139
+ strike
140
+ basefont
141
+ big
142
+ blink
143
+ center
144
+ font
145
+ marquee
146
+ multicol
147
+ nobr
148
+ spacer
149
+ tt
150
+ rtc
@@ -0,0 +1,150 @@
1
+ GUMBO_TAG_HTML,
2
+ GUMBO_TAG_HEAD,
3
+ GUMBO_TAG_TITLE,
4
+ GUMBO_TAG_BASE,
5
+ GUMBO_TAG_LINK,
6
+ GUMBO_TAG_META,
7
+ GUMBO_TAG_STYLE,
8
+ GUMBO_TAG_SCRIPT,
9
+ GUMBO_TAG_NOSCRIPT,
10
+ GUMBO_TAG_TEMPLATE,
11
+ GUMBO_TAG_BODY,
12
+ GUMBO_TAG_ARTICLE,
13
+ GUMBO_TAG_SECTION,
14
+ GUMBO_TAG_NAV,
15
+ GUMBO_TAG_ASIDE,
16
+ GUMBO_TAG_H1,
17
+ GUMBO_TAG_H2,
18
+ GUMBO_TAG_H3,
19
+ GUMBO_TAG_H4,
20
+ GUMBO_TAG_H5,
21
+ GUMBO_TAG_H6,
22
+ GUMBO_TAG_HGROUP,
23
+ GUMBO_TAG_HEADER,
24
+ GUMBO_TAG_FOOTER,
25
+ GUMBO_TAG_ADDRESS,
26
+ GUMBO_TAG_P,
27
+ GUMBO_TAG_HR,
28
+ GUMBO_TAG_PRE,
29
+ GUMBO_TAG_BLOCKQUOTE,
30
+ GUMBO_TAG_OL,
31
+ GUMBO_TAG_UL,
32
+ GUMBO_TAG_LI,
33
+ GUMBO_TAG_DL,
34
+ GUMBO_TAG_DT,
35
+ GUMBO_TAG_DD,
36
+ GUMBO_TAG_FIGURE,
37
+ GUMBO_TAG_FIGCAPTION,
38
+ GUMBO_TAG_MAIN,
39
+ GUMBO_TAG_DIV,
40
+ GUMBO_TAG_A,
41
+ GUMBO_TAG_EM,
42
+ GUMBO_TAG_STRONG,
43
+ GUMBO_TAG_SMALL,
44
+ GUMBO_TAG_S,
45
+ GUMBO_TAG_CITE,
46
+ GUMBO_TAG_Q,
47
+ GUMBO_TAG_DFN,
48
+ GUMBO_TAG_ABBR,
49
+ GUMBO_TAG_DATA,
50
+ GUMBO_TAG_TIME,
51
+ GUMBO_TAG_CODE,
52
+ GUMBO_TAG_VAR,
53
+ GUMBO_TAG_SAMP,
54
+ GUMBO_TAG_KBD,
55
+ GUMBO_TAG_SUB,
56
+ GUMBO_TAG_SUP,
57
+ GUMBO_TAG_I,
58
+ GUMBO_TAG_B,
59
+ GUMBO_TAG_U,
60
+ GUMBO_TAG_MARK,
61
+ GUMBO_TAG_RUBY,
62
+ GUMBO_TAG_RT,
63
+ GUMBO_TAG_RP,
64
+ GUMBO_TAG_BDI,
65
+ GUMBO_TAG_BDO,
66
+ GUMBO_TAG_SPAN,
67
+ GUMBO_TAG_BR,
68
+ GUMBO_TAG_WBR,
69
+ GUMBO_TAG_INS,
70
+ GUMBO_TAG_DEL,
71
+ GUMBO_TAG_IMAGE,
72
+ GUMBO_TAG_IMG,
73
+ GUMBO_TAG_IFRAME,
74
+ GUMBO_TAG_EMBED,
75
+ GUMBO_TAG_OBJECT,
76
+ GUMBO_TAG_PARAM,
77
+ GUMBO_TAG_VIDEO,
78
+ GUMBO_TAG_AUDIO,
79
+ GUMBO_TAG_SOURCE,
80
+ GUMBO_TAG_TRACK,
81
+ GUMBO_TAG_CANVAS,
82
+ GUMBO_TAG_MAP,
83
+ GUMBO_TAG_AREA,
84
+ GUMBO_TAG_MATH,
85
+ GUMBO_TAG_MI,
86
+ GUMBO_TAG_MO,
87
+ GUMBO_TAG_MN,
88
+ GUMBO_TAG_MS,
89
+ GUMBO_TAG_MTEXT,
90
+ GUMBO_TAG_MGLYPH,
91
+ GUMBO_TAG_MALIGNMARK,
92
+ GUMBO_TAG_ANNOTATION_XML,
93
+ GUMBO_TAG_SVG,
94
+ GUMBO_TAG_FOREIGNOBJECT,
95
+ GUMBO_TAG_DESC,
96
+ GUMBO_TAG_TABLE,
97
+ GUMBO_TAG_CAPTION,
98
+ GUMBO_TAG_COLGROUP,
99
+ GUMBO_TAG_COL,
100
+ GUMBO_TAG_TBODY,
101
+ GUMBO_TAG_THEAD,
102
+ GUMBO_TAG_TFOOT,
103
+ GUMBO_TAG_TR,
104
+ GUMBO_TAG_TD,
105
+ GUMBO_TAG_TH,
106
+ GUMBO_TAG_FORM,
107
+ GUMBO_TAG_FIELDSET,
108
+ GUMBO_TAG_LEGEND,
109
+ GUMBO_TAG_LABEL,
110
+ GUMBO_TAG_INPUT,
111
+ GUMBO_TAG_BUTTON,
112
+ GUMBO_TAG_SELECT,
113
+ GUMBO_TAG_DATALIST,
114
+ GUMBO_TAG_OPTGROUP,
115
+ GUMBO_TAG_OPTION,
116
+ GUMBO_TAG_TEXTAREA,
117
+ GUMBO_TAG_KEYGEN,
118
+ GUMBO_TAG_OUTPUT,
119
+ GUMBO_TAG_PROGRESS,
120
+ GUMBO_TAG_METER,
121
+ GUMBO_TAG_DETAILS,
122
+ GUMBO_TAG_SUMMARY,
123
+ GUMBO_TAG_MENU,
124
+ GUMBO_TAG_MENUITEM,
125
+ GUMBO_TAG_APPLET,
126
+ GUMBO_TAG_ACRONYM,
127
+ GUMBO_TAG_BGSOUND,
128
+ GUMBO_TAG_DIR,
129
+ GUMBO_TAG_FRAME,
130
+ GUMBO_TAG_FRAMESET,
131
+ GUMBO_TAG_NOFRAMES,
132
+ GUMBO_TAG_ISINDEX,
133
+ GUMBO_TAG_LISTING,
134
+ GUMBO_TAG_XMP,
135
+ GUMBO_TAG_NEXTID,
136
+ GUMBO_TAG_NOEMBED,
137
+ GUMBO_TAG_PLAINTEXT,
138
+ GUMBO_TAG_RB,
139
+ GUMBO_TAG_STRIKE,
140
+ GUMBO_TAG_BASEFONT,
141
+ GUMBO_TAG_BIG,
142
+ GUMBO_TAG_BLINK,
143
+ GUMBO_TAG_CENTER,
144
+ GUMBO_TAG_FONT,
145
+ GUMBO_TAG_MARQUEE,
146
+ GUMBO_TAG_MULTICOL,
147
+ GUMBO_TAG_NOBR,
148
+ GUMBO_TAG_SPACER,
149
+ GUMBO_TAG_TT,
150
+ GUMBO_TAG_RTC,