nokogumbo 1.3.0 → 1.5.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -26,11 +26,12 @@
26
26
 
27
27
  struct GumboInternalParser;
28
28
 
29
- static const size_t kDefaultStringBufferSize = 10;
29
+ // Size chosen via statistical analysis of ~60K websites.
30
+ // 99% of text nodes and 98% of attribute names/values fit in this initial size.
31
+ static const size_t kDefaultStringBufferSize = 5;
30
32
 
31
- static void maybe_resize_string_buffer(
32
- struct GumboInternalParser* parser, size_t additional_chars,
33
- GumboStringBuffer* buffer) {
33
+ static void maybe_resize_string_buffer(struct GumboInternalParser* parser,
34
+ size_t additional_chars, GumboStringBuffer* buffer) {
34
35
  size_t new_length = buffer->length + additional_chars;
35
36
  size_t new_capacity = buffer->capacity;
36
37
  while (new_capacity < new_length) {
@@ -52,9 +53,8 @@ void gumbo_string_buffer_init(
52
53
  output->capacity = kDefaultStringBufferSize;
53
54
  }
54
55
 
55
- void gumbo_string_buffer_reserve(
56
- struct GumboInternalParser* parser, size_t min_capacity,
57
- GumboStringBuffer* output) {
56
+ void gumbo_string_buffer_reserve(struct GumboInternalParser* parser,
57
+ size_t min_capacity, GumboStringBuffer* output) {
58
58
  maybe_resize_string_buffer(parser, min_capacity - output->length, output);
59
59
  }
60
60
 
@@ -84,9 +84,8 @@ void gumbo_string_buffer_append_codepoint(
84
84
  }
85
85
  }
86
86
 
87
- void gumbo_string_buffer_append_string(
88
- struct GumboInternalParser* parser, GumboStringPiece* str,
89
- GumboStringBuffer* output) {
87
+ void gumbo_string_buffer_append_string(struct GumboInternalParser* parser,
88
+ GumboStringPiece* str, GumboStringBuffer* output) {
90
89
  maybe_resize_string_buffer(parser, str->length, output);
91
90
  memcpy(output->data + output->length, str->data, str->length);
92
91
  output->length += str->length;
@@ -100,6 +99,11 @@ char* gumbo_string_buffer_to_string(
100
99
  return buffer;
101
100
  }
102
101
 
102
+ void gumbo_string_buffer_clear(
103
+ struct GumboInternalParser* parser, GumboStringBuffer* input) {
104
+ input->length = 0;
105
+ }
106
+
103
107
  void gumbo_string_buffer_destroy(
104
108
  struct GumboInternalParser* parser, GumboStringBuffer* buffer) {
105
109
  gumbo_parser_deallocate(parser, buffer->data);
@@ -51,9 +51,8 @@ void gumbo_string_buffer_init(
51
51
  // Ensures that the buffer contains at least a certain amount of space. Most
52
52
  // useful with snprintf and the other length-delimited string functions, which
53
53
  // may want to write directly into the buffer.
54
- void gumbo_string_buffer_reserve(
55
- struct GumboInternalParser* parser, size_t min_capacity,
56
- GumboStringBuffer* output);
54
+ void gumbo_string_buffer_reserve(struct GumboInternalParser* parser,
55
+ size_t min_capacity, GumboStringBuffer* output);
57
56
 
58
57
  // Appends a single Unicode codepoint onto the end of the GumboStringBuffer.
59
58
  // This is essentially a UTF-8 encoder, and may add 1-4 bytes depending on the
@@ -62,14 +61,18 @@ void gumbo_string_buffer_append_codepoint(
62
61
  struct GumboInternalParser* parser, int c, GumboStringBuffer* output);
63
62
 
64
63
  // Appends a string onto the end of the GumboStringBuffer.
65
- void gumbo_string_buffer_append_string(
66
- struct GumboInternalParser* parser, GumboStringPiece* str,
67
- GumboStringBuffer* output);
64
+ void gumbo_string_buffer_append_string(struct GumboInternalParser* parser,
65
+ GumboStringPiece* str, GumboStringBuffer* output);
68
66
 
69
67
  // Converts this string buffer to const char*, allocating a new buffer for it.
70
68
  char* gumbo_string_buffer_to_string(
71
69
  struct GumboInternalParser* parser, GumboStringBuffer* input);
72
70
 
71
+ // Reinitialize this string buffer. This clears it by setting length=0. It
72
+ // does not zero out the buffer itself.
73
+ void gumbo_string_buffer_clear(
74
+ struct GumboInternalParser* parser, GumboStringBuffer* input);
75
+
73
76
  // Deallocates this GumboStringBuffer.
74
77
  void gumbo_string_buffer_destroy(
75
78
  struct GumboInternalParser* parser, GumboStringBuffer* buffer);
@@ -25,23 +25,22 @@
25
25
 
26
26
  struct GumboInternalParser;
27
27
 
28
- const GumboStringPiece kGumboEmptyString = { NULL, 0 };
28
+ const GumboStringPiece kGumboEmptyString = {NULL, 0};
29
29
 
30
30
  bool gumbo_string_equals(
31
31
  const GumboStringPiece* str1, const GumboStringPiece* str2) {
32
32
  return str1->length == str2->length &&
33
- !memcmp(str1->data, str2->data, str1->length);
33
+ !memcmp(str1->data, str2->data, str1->length);
34
34
  }
35
35
 
36
36
  bool gumbo_string_equals_ignore_case(
37
37
  const GumboStringPiece* str1, const GumboStringPiece* str2) {
38
38
  return str1->length == str2->length &&
39
- !strncasecmp(str1->data, str2->data, str1->length);
39
+ !strncasecmp(str1->data, str2->data, str1->length);
40
40
  }
41
41
 
42
- void gumbo_string_copy(
43
- struct GumboInternalParser* parser, GumboStringPiece* dest,
44
- const GumboStringPiece* source) {
42
+ void gumbo_string_copy(struct GumboInternalParser* parser,
43
+ GumboStringPiece* dest, const GumboStringPiece* source) {
45
44
  dest->length = source->length;
46
45
  char* buffer = gumbo_parser_allocate(parser, source->length);
47
46
  memcpy(buffer, source->data, source->length);
@@ -28,9 +28,8 @@ struct GumboInternalParser;
28
28
  // Performs a deep-copy of a GumboStringPiece, allocating a fresh buffer in the
29
29
  // destination and copying over the characters from source. Dest should be
30
30
  // empty, with no buffer allocated; otherwise, this leaks it.
31
- void gumbo_string_copy(
32
- struct GumboInternalParser* parser, GumboStringPiece* dest,
33
- const GumboStringPiece* source);
31
+ void gumbo_string_copy(struct GumboInternalParser* parser,
32
+ GumboStringPiece* dest, const GumboStringPiece* source);
34
33
 
35
34
  #ifdef __cplusplus
36
35
  }
@@ -18,164 +18,18 @@
18
18
 
19
19
  #include <assert.h>
20
20
  #include <ctype.h>
21
- #include <strings.h> // For strcasecmp.
21
+ #include <string.h>
22
22
 
23
- // NOTE(jdtang): Keep this in sync with the GumboTag enum in the header.
24
- // TODO(jdtang): Investigate whether there're efficiency benefits to putting the
25
- // most common tag names first, or to putting them in alphabetical order and
26
- // using a binary search.
27
23
  const char* kGumboTagNames[] = {
28
- "html",
29
- "head",
30
- "title",
31
- "base",
32
- "link",
33
- "meta",
34
- "style",
35
- "script",
36
- "noscript",
37
- "template",
38
- "body",
39
- "article",
40
- "section",
41
- "nav",
42
- "aside",
43
- "h1",
44
- "h2",
45
- "h3",
46
- "h4",
47
- "h5",
48
- "h6",
49
- "hgroup",
50
- "header",
51
- "footer",
52
- "address",
53
- "p",
54
- "hr",
55
- "pre",
56
- "blockquote",
57
- "ol",
58
- "ul",
59
- "li",
60
- "dl",
61
- "dt",
62
- "dd",
63
- "figure",
64
- "figcaption",
65
- "main",
66
- "div",
67
- "a",
68
- "em",
69
- "strong",
70
- "small",
71
- "s",
72
- "cite",
73
- "q",
74
- "dfn",
75
- "abbr",
76
- "data",
77
- "time",
78
- "code",
79
- "var",
80
- "samp",
81
- "kbd",
82
- "sub",
83
- "sup",
84
- "i",
85
- "b",
86
- "u",
87
- "mark",
88
- "ruby",
89
- "rt",
90
- "rp",
91
- "bdi",
92
- "bdo",
93
- "span",
94
- "br",
95
- "wbr",
96
- "ins",
97
- "del",
98
- "image",
99
- "img",
100
- "iframe",
101
- "embed",
102
- "object",
103
- "param",
104
- "video",
105
- "audio",
106
- "source",
107
- "track",
108
- "canvas",
109
- "map",
110
- "area",
111
- "math",
112
- "mi",
113
- "mo",
114
- "mn",
115
- "ms",
116
- "mtext",
117
- "mglyph",
118
- "malignmark",
119
- "annotation-xml",
120
- "svg",
121
- "foreignobject",
122
- "desc",
123
- "table",
124
- "caption",
125
- "colgroup",
126
- "col",
127
- "tbody",
128
- "thead",
129
- "tfoot",
130
- "tr",
131
- "td",
132
- "th",
133
- "form",
134
- "fieldset",
135
- "legend",
136
- "label",
137
- "input",
138
- "button",
139
- "select",
140
- "datalist",
141
- "optgroup",
142
- "option",
143
- "textarea",
144
- "keygen",
145
- "output",
146
- "progress",
147
- "meter",
148
- "details",
149
- "summary",
150
- "menu",
151
- "menuitem",
152
- "applet",
153
- "acronym",
154
- "bgsound",
155
- "dir",
156
- "frame",
157
- "frameset",
158
- "noframes",
159
- "isindex",
160
- "listing",
161
- "xmp",
162
- "nextid",
163
- "noembed",
164
- "plaintext",
165
- "rb",
166
- "strike",
167
- "basefont",
168
- "big",
169
- "blink",
170
- "center",
171
- "font",
172
- "marquee",
173
- "multicol",
174
- "nobr",
175
- "spacer",
176
- "tt",
177
- "", // TAG_UNKNOWN
178
- "", // TAG_LAST
24
+ #include "tag_strings.h"
25
+ "", // TAG_UNKNOWN
26
+ "", // TAG_LAST
27
+ };
28
+
29
+ static const unsigned char kGumboTagSizes[] = {
30
+ #include "tag_sizes.h"
31
+ 0, // TAG_UNKNOWN
32
+ 0, // TAG_LAST
179
33
  };
180
34
 
181
35
  const char* gumbo_normalized_tagname(GumboTag tag) {
@@ -183,7 +37,6 @@ const char* gumbo_normalized_tagname(GumboTag tag) {
183
37
  return kGumboTagNames[tag];
184
38
  }
185
39
 
186
- // TODO(jdtang): Add test for this.
187
40
  void gumbo_tag_from_original_text(GumboStringPiece* text) {
188
41
  if (text->data == NULL) {
189
42
  return;
@@ -195,11 +48,11 @@ void gumbo_tag_from_original_text(GumboStringPiece* text) {
195
48
  if (text->data[1] == '/') {
196
49
  // End tag.
197
50
  assert(text->length >= 3);
198
- text->data += 2; // Move past </
51
+ text->data += 2; // Move past </
199
52
  text->length -= 3;
200
53
  } else {
201
54
  // Start tag.
202
- text->data += 1; // Move past <
55
+ text->data += 1; // Move past <
203
56
  text->length -= 2;
204
57
  // strnchr is apparently not a standard C library function, so I loop
205
58
  // explicitly looking for whitespace or other illegal tag characters.
@@ -212,14 +65,31 @@ void gumbo_tag_from_original_text(GumboStringPiece* text) {
212
65
  }
213
66
  }
214
67
 
215
- GumboTag gumbo_tag_enum(const char* tagname) {
216
- for (int i = 0; i < GUMBO_TAG_LAST; ++i) {
217
- // TODO(jdtang): strcasecmp is non-portable, so if we want to support
218
- // non-GCC compilers, we'll need some #ifdef magic. This source already has
219
- // pretty significant issues with MSVC6 anyway.
220
- if (strcasecmp(tagname, kGumboTagNames[i]) == 0) {
221
- return i;
68
+ static int case_memcmp(const char* s1, const char* s2, unsigned int n) {
69
+ while (n--) {
70
+ unsigned char c1 = tolower(*s1++);
71
+ unsigned char c2 = tolower(*s2++);
72
+ if (c1 != c2) return (int) c1 - (int) c2;
73
+ }
74
+ return 0;
75
+ }
76
+
77
+ #include "tag_gperf.h"
78
+ #define TAG_MAP_SIZE (sizeof(kGumboTagMap) / sizeof(kGumboTagMap[0]))
79
+
80
+ GumboTag gumbo_tagn_enum(const char* tagname, unsigned int length) {
81
+ if (length) {
82
+ unsigned int key = tag_hash(tagname, length);
83
+ if (key < TAG_MAP_SIZE) {
84
+ GumboTag tag = kGumboTagMap[key];
85
+ if (length == kGumboTagSizes[(int) tag] &&
86
+ !case_memcmp(tagname, kGumboTagNames[(int) tag], length))
87
+ return tag;
222
88
  }
223
89
  }
224
90
  return GUMBO_TAG_UNKNOWN;
225
91
  }
92
+
93
+ GumboTag gumbo_tag_enum(const char* tagname) {
94
+ return gumbo_tagn_enum(tagname, strlen(tagname));
95
+ }
@@ -0,0 +1,150 @@
1
+ html
2
+ head
3
+ title
4
+ base
5
+ link
6
+ meta
7
+ style
8
+ script
9
+ noscript
10
+ template
11
+ body
12
+ article
13
+ section
14
+ nav
15
+ aside
16
+ h1
17
+ h2
18
+ h3
19
+ h4
20
+ h5
21
+ h6
22
+ hgroup
23
+ header
24
+ footer
25
+ address
26
+ p
27
+ hr
28
+ pre
29
+ blockquote
30
+ ol
31
+ ul
32
+ li
33
+ dl
34
+ dt
35
+ dd
36
+ figure
37
+ figcaption
38
+ main
39
+ div
40
+ a
41
+ em
42
+ strong
43
+ small
44
+ s
45
+ cite
46
+ q
47
+ dfn
48
+ abbr
49
+ data
50
+ time
51
+ code
52
+ var
53
+ samp
54
+ kbd
55
+ sub
56
+ sup
57
+ i
58
+ b
59
+ u
60
+ mark
61
+ ruby
62
+ rt
63
+ rp
64
+ bdi
65
+ bdo
66
+ span
67
+ br
68
+ wbr
69
+ ins
70
+ del
71
+ image
72
+ img
73
+ iframe
74
+ embed
75
+ object
76
+ param
77
+ video
78
+ audio
79
+ source
80
+ track
81
+ canvas
82
+ map
83
+ area
84
+ math
85
+ mi
86
+ mo
87
+ mn
88
+ ms
89
+ mtext
90
+ mglyph
91
+ malignmark
92
+ annotation-xml
93
+ svg
94
+ foreignobject
95
+ desc
96
+ table
97
+ caption
98
+ colgroup
99
+ col
100
+ tbody
101
+ thead
102
+ tfoot
103
+ tr
104
+ td
105
+ th
106
+ form
107
+ fieldset
108
+ legend
109
+ label
110
+ input
111
+ button
112
+ select
113
+ datalist
114
+ optgroup
115
+ option
116
+ textarea
117
+ keygen
118
+ output
119
+ progress
120
+ meter
121
+ details
122
+ summary
123
+ menu
124
+ menuitem
125
+ applet
126
+ acronym
127
+ bgsound
128
+ dir
129
+ frame
130
+ frameset
131
+ noframes
132
+ isindex
133
+ listing
134
+ xmp
135
+ nextid
136
+ noembed
137
+ plaintext
138
+ rb
139
+ strike
140
+ basefont
141
+ big
142
+ blink
143
+ center
144
+ font
145
+ marquee
146
+ multicol
147
+ nobr
148
+ spacer
149
+ tt
150
+ rtc