nokogumbo 1.3.0 → 1.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -26,11 +26,12 @@
26
26
 
27
27
  struct GumboInternalParser;
28
28
 
29
- static const size_t kDefaultStringBufferSize = 10;
29
+ // Size chosen via statistical analysis of ~60K websites.
30
+ // 99% of text nodes and 98% of attribute names/values fit in this initial size.
31
+ static const size_t kDefaultStringBufferSize = 5;
30
32
 
31
- static void maybe_resize_string_buffer(
32
- struct GumboInternalParser* parser, size_t additional_chars,
33
- GumboStringBuffer* buffer) {
33
+ static void maybe_resize_string_buffer(struct GumboInternalParser* parser,
34
+ size_t additional_chars, GumboStringBuffer* buffer) {
34
35
  size_t new_length = buffer->length + additional_chars;
35
36
  size_t new_capacity = buffer->capacity;
36
37
  while (new_capacity < new_length) {
@@ -52,9 +53,8 @@ void gumbo_string_buffer_init(
52
53
  output->capacity = kDefaultStringBufferSize;
53
54
  }
54
55
 
55
- void gumbo_string_buffer_reserve(
56
- struct GumboInternalParser* parser, size_t min_capacity,
57
- GumboStringBuffer* output) {
56
+ void gumbo_string_buffer_reserve(struct GumboInternalParser* parser,
57
+ size_t min_capacity, GumboStringBuffer* output) {
58
58
  maybe_resize_string_buffer(parser, min_capacity - output->length, output);
59
59
  }
60
60
 
@@ -84,9 +84,8 @@ void gumbo_string_buffer_append_codepoint(
84
84
  }
85
85
  }
86
86
 
87
- void gumbo_string_buffer_append_string(
88
- struct GumboInternalParser* parser, GumboStringPiece* str,
89
- GumboStringBuffer* output) {
87
+ void gumbo_string_buffer_append_string(struct GumboInternalParser* parser,
88
+ GumboStringPiece* str, GumboStringBuffer* output) {
90
89
  maybe_resize_string_buffer(parser, str->length, output);
91
90
  memcpy(output->data + output->length, str->data, str->length);
92
91
  output->length += str->length;
@@ -100,6 +99,11 @@ char* gumbo_string_buffer_to_string(
100
99
  return buffer;
101
100
  }
102
101
 
102
+ void gumbo_string_buffer_clear(
103
+ struct GumboInternalParser* parser, GumboStringBuffer* input) {
104
+ input->length = 0;
105
+ }
106
+
103
107
  void gumbo_string_buffer_destroy(
104
108
  struct GumboInternalParser* parser, GumboStringBuffer* buffer) {
105
109
  gumbo_parser_deallocate(parser, buffer->data);
@@ -51,9 +51,8 @@ void gumbo_string_buffer_init(
51
51
  // Ensures that the buffer contains at least a certain amount of space. Most
52
52
  // useful with snprintf and the other length-delimited string functions, which
53
53
  // may want to write directly into the buffer.
54
- void gumbo_string_buffer_reserve(
55
- struct GumboInternalParser* parser, size_t min_capacity,
56
- GumboStringBuffer* output);
54
+ void gumbo_string_buffer_reserve(struct GumboInternalParser* parser,
55
+ size_t min_capacity, GumboStringBuffer* output);
57
56
 
58
57
  // Appends a single Unicode codepoint onto the end of the GumboStringBuffer.
59
58
  // This is essentially a UTF-8 encoder, and may add 1-4 bytes depending on the
@@ -62,14 +61,18 @@ void gumbo_string_buffer_append_codepoint(
62
61
  struct GumboInternalParser* parser, int c, GumboStringBuffer* output);
63
62
 
64
63
  // Appends a string onto the end of the GumboStringBuffer.
65
- void gumbo_string_buffer_append_string(
66
- struct GumboInternalParser* parser, GumboStringPiece* str,
67
- GumboStringBuffer* output);
64
+ void gumbo_string_buffer_append_string(struct GumboInternalParser* parser,
65
+ GumboStringPiece* str, GumboStringBuffer* output);
68
66
 
69
67
  // Converts this string buffer to const char*, allocating a new buffer for it.
70
68
  char* gumbo_string_buffer_to_string(
71
69
  struct GumboInternalParser* parser, GumboStringBuffer* input);
72
70
 
71
+ // Reinitialize this string buffer. This clears it by setting length=0. It
72
+ // does not zero out the buffer itself.
73
+ void gumbo_string_buffer_clear(
74
+ struct GumboInternalParser* parser, GumboStringBuffer* input);
75
+
73
76
  // Deallocates this GumboStringBuffer.
74
77
  void gumbo_string_buffer_destroy(
75
78
  struct GumboInternalParser* parser, GumboStringBuffer* buffer);
@@ -25,23 +25,22 @@
25
25
 
26
26
  struct GumboInternalParser;
27
27
 
28
- const GumboStringPiece kGumboEmptyString = { NULL, 0 };
28
+ const GumboStringPiece kGumboEmptyString = {NULL, 0};
29
29
 
30
30
  bool gumbo_string_equals(
31
31
  const GumboStringPiece* str1, const GumboStringPiece* str2) {
32
32
  return str1->length == str2->length &&
33
- !memcmp(str1->data, str2->data, str1->length);
33
+ !memcmp(str1->data, str2->data, str1->length);
34
34
  }
35
35
 
36
36
  bool gumbo_string_equals_ignore_case(
37
37
  const GumboStringPiece* str1, const GumboStringPiece* str2) {
38
38
  return str1->length == str2->length &&
39
- !strncasecmp(str1->data, str2->data, str1->length);
39
+ !strncasecmp(str1->data, str2->data, str1->length);
40
40
  }
41
41
 
42
- void gumbo_string_copy(
43
- struct GumboInternalParser* parser, GumboStringPiece* dest,
44
- const GumboStringPiece* source) {
42
+ void gumbo_string_copy(struct GumboInternalParser* parser,
43
+ GumboStringPiece* dest, const GumboStringPiece* source) {
45
44
  dest->length = source->length;
46
45
  char* buffer = gumbo_parser_allocate(parser, source->length);
47
46
  memcpy(buffer, source->data, source->length);
@@ -28,9 +28,8 @@ struct GumboInternalParser;
28
28
  // Performs a deep-copy of a GumboStringPiece, allocating a fresh buffer in the
29
29
  // destination and copying over the characters from source. Dest should be
30
30
  // empty, with no buffer allocated; otherwise, this leaks it.
31
- void gumbo_string_copy(
32
- struct GumboInternalParser* parser, GumboStringPiece* dest,
33
- const GumboStringPiece* source);
31
+ void gumbo_string_copy(struct GumboInternalParser* parser,
32
+ GumboStringPiece* dest, const GumboStringPiece* source);
34
33
 
35
34
  #ifdef __cplusplus
36
35
  }
@@ -18,164 +18,18 @@
18
18
 
19
19
  #include <assert.h>
20
20
  #include <ctype.h>
21
- #include <strings.h> // For strcasecmp.
21
+ #include <string.h>
22
22
 
23
- // NOTE(jdtang): Keep this in sync with the GumboTag enum in the header.
24
- // TODO(jdtang): Investigate whether there're efficiency benefits to putting the
25
- // most common tag names first, or to putting them in alphabetical order and
26
- // using a binary search.
27
23
  const char* kGumboTagNames[] = {
28
- "html",
29
- "head",
30
- "title",
31
- "base",
32
- "link",
33
- "meta",
34
- "style",
35
- "script",
36
- "noscript",
37
- "template",
38
- "body",
39
- "article",
40
- "section",
41
- "nav",
42
- "aside",
43
- "h1",
44
- "h2",
45
- "h3",
46
- "h4",
47
- "h5",
48
- "h6",
49
- "hgroup",
50
- "header",
51
- "footer",
52
- "address",
53
- "p",
54
- "hr",
55
- "pre",
56
- "blockquote",
57
- "ol",
58
- "ul",
59
- "li",
60
- "dl",
61
- "dt",
62
- "dd",
63
- "figure",
64
- "figcaption",
65
- "main",
66
- "div",
67
- "a",
68
- "em",
69
- "strong",
70
- "small",
71
- "s",
72
- "cite",
73
- "q",
74
- "dfn",
75
- "abbr",
76
- "data",
77
- "time",
78
- "code",
79
- "var",
80
- "samp",
81
- "kbd",
82
- "sub",
83
- "sup",
84
- "i",
85
- "b",
86
- "u",
87
- "mark",
88
- "ruby",
89
- "rt",
90
- "rp",
91
- "bdi",
92
- "bdo",
93
- "span",
94
- "br",
95
- "wbr",
96
- "ins",
97
- "del",
98
- "image",
99
- "img",
100
- "iframe",
101
- "embed",
102
- "object",
103
- "param",
104
- "video",
105
- "audio",
106
- "source",
107
- "track",
108
- "canvas",
109
- "map",
110
- "area",
111
- "math",
112
- "mi",
113
- "mo",
114
- "mn",
115
- "ms",
116
- "mtext",
117
- "mglyph",
118
- "malignmark",
119
- "annotation-xml",
120
- "svg",
121
- "foreignobject",
122
- "desc",
123
- "table",
124
- "caption",
125
- "colgroup",
126
- "col",
127
- "tbody",
128
- "thead",
129
- "tfoot",
130
- "tr",
131
- "td",
132
- "th",
133
- "form",
134
- "fieldset",
135
- "legend",
136
- "label",
137
- "input",
138
- "button",
139
- "select",
140
- "datalist",
141
- "optgroup",
142
- "option",
143
- "textarea",
144
- "keygen",
145
- "output",
146
- "progress",
147
- "meter",
148
- "details",
149
- "summary",
150
- "menu",
151
- "menuitem",
152
- "applet",
153
- "acronym",
154
- "bgsound",
155
- "dir",
156
- "frame",
157
- "frameset",
158
- "noframes",
159
- "isindex",
160
- "listing",
161
- "xmp",
162
- "nextid",
163
- "noembed",
164
- "plaintext",
165
- "rb",
166
- "strike",
167
- "basefont",
168
- "big",
169
- "blink",
170
- "center",
171
- "font",
172
- "marquee",
173
- "multicol",
174
- "nobr",
175
- "spacer",
176
- "tt",
177
- "", // TAG_UNKNOWN
178
- "", // TAG_LAST
24
+ #include "tag_strings.h"
25
+ "", // TAG_UNKNOWN
26
+ "", // TAG_LAST
27
+ };
28
+
29
+ static const unsigned char kGumboTagSizes[] = {
30
+ #include "tag_sizes.h"
31
+ 0, // TAG_UNKNOWN
32
+ 0, // TAG_LAST
179
33
  };
180
34
 
181
35
  const char* gumbo_normalized_tagname(GumboTag tag) {
@@ -183,7 +37,6 @@ const char* gumbo_normalized_tagname(GumboTag tag) {
183
37
  return kGumboTagNames[tag];
184
38
  }
185
39
 
186
- // TODO(jdtang): Add test for this.
187
40
  void gumbo_tag_from_original_text(GumboStringPiece* text) {
188
41
  if (text->data == NULL) {
189
42
  return;
@@ -195,11 +48,11 @@ void gumbo_tag_from_original_text(GumboStringPiece* text) {
195
48
  if (text->data[1] == '/') {
196
49
  // End tag.
197
50
  assert(text->length >= 3);
198
- text->data += 2; // Move past </
51
+ text->data += 2; // Move past </
199
52
  text->length -= 3;
200
53
  } else {
201
54
  // Start tag.
202
- text->data += 1; // Move past <
55
+ text->data += 1; // Move past <
203
56
  text->length -= 2;
204
57
  // strnchr is apparently not a standard C library function, so I loop
205
58
  // explicitly looking for whitespace or other illegal tag characters.
@@ -212,14 +65,31 @@ void gumbo_tag_from_original_text(GumboStringPiece* text) {
212
65
  }
213
66
  }
214
67
 
215
- GumboTag gumbo_tag_enum(const char* tagname) {
216
- for (int i = 0; i < GUMBO_TAG_LAST; ++i) {
217
- // TODO(jdtang): strcasecmp is non-portable, so if we want to support
218
- // non-GCC compilers, we'll need some #ifdef magic. This source already has
219
- // pretty significant issues with MSVC6 anyway.
220
- if (strcasecmp(tagname, kGumboTagNames[i]) == 0) {
221
- return i;
68
+ static int case_memcmp(const char* s1, const char* s2, unsigned int n) {
69
+ while (n--) {
70
+ unsigned char c1 = tolower(*s1++);
71
+ unsigned char c2 = tolower(*s2++);
72
+ if (c1 != c2) return (int) c1 - (int) c2;
73
+ }
74
+ return 0;
75
+ }
76
+
77
+ #include "tag_gperf.h"
78
+ #define TAG_MAP_SIZE (sizeof(kGumboTagMap) / sizeof(kGumboTagMap[0]))
79
+
80
+ GumboTag gumbo_tagn_enum(const char* tagname, unsigned int length) {
81
+ if (length) {
82
+ unsigned int key = tag_hash(tagname, length);
83
+ if (key < TAG_MAP_SIZE) {
84
+ GumboTag tag = kGumboTagMap[key];
85
+ if (length == kGumboTagSizes[(int) tag] &&
86
+ !case_memcmp(tagname, kGumboTagNames[(int) tag], length))
87
+ return tag;
222
88
  }
223
89
  }
224
90
  return GUMBO_TAG_UNKNOWN;
225
91
  }
92
+
93
+ GumboTag gumbo_tag_enum(const char* tagname) {
94
+ return gumbo_tagn_enum(tagname, strlen(tagname));
95
+ }
@@ -0,0 +1,150 @@
1
+ html
2
+ head
3
+ title
4
+ base
5
+ link
6
+ meta
7
+ style
8
+ script
9
+ noscript
10
+ template
11
+ body
12
+ article
13
+ section
14
+ nav
15
+ aside
16
+ h1
17
+ h2
18
+ h3
19
+ h4
20
+ h5
21
+ h6
22
+ hgroup
23
+ header
24
+ footer
25
+ address
26
+ p
27
+ hr
28
+ pre
29
+ blockquote
30
+ ol
31
+ ul
32
+ li
33
+ dl
34
+ dt
35
+ dd
36
+ figure
37
+ figcaption
38
+ main
39
+ div
40
+ a
41
+ em
42
+ strong
43
+ small
44
+ s
45
+ cite
46
+ q
47
+ dfn
48
+ abbr
49
+ data
50
+ time
51
+ code
52
+ var
53
+ samp
54
+ kbd
55
+ sub
56
+ sup
57
+ i
58
+ b
59
+ u
60
+ mark
61
+ ruby
62
+ rt
63
+ rp
64
+ bdi
65
+ bdo
66
+ span
67
+ br
68
+ wbr
69
+ ins
70
+ del
71
+ image
72
+ img
73
+ iframe
74
+ embed
75
+ object
76
+ param
77
+ video
78
+ audio
79
+ source
80
+ track
81
+ canvas
82
+ map
83
+ area
84
+ math
85
+ mi
86
+ mo
87
+ mn
88
+ ms
89
+ mtext
90
+ mglyph
91
+ malignmark
92
+ annotation-xml
93
+ svg
94
+ foreignobject
95
+ desc
96
+ table
97
+ caption
98
+ colgroup
99
+ col
100
+ tbody
101
+ thead
102
+ tfoot
103
+ tr
104
+ td
105
+ th
106
+ form
107
+ fieldset
108
+ legend
109
+ label
110
+ input
111
+ button
112
+ select
113
+ datalist
114
+ optgroup
115
+ option
116
+ textarea
117
+ keygen
118
+ output
119
+ progress
120
+ meter
121
+ details
122
+ summary
123
+ menu
124
+ menuitem
125
+ applet
126
+ acronym
127
+ bgsound
128
+ dir
129
+ frame
130
+ frameset
131
+ noframes
132
+ isindex
133
+ listing
134
+ xmp
135
+ nextid
136
+ noembed
137
+ plaintext
138
+ rb
139
+ strike
140
+ basefont
141
+ big
142
+ blink
143
+ center
144
+ font
145
+ marquee
146
+ multicol
147
+ nobr
148
+ spacer
149
+ tt
150
+ rtc