nokogumbo 0.3 → 0.4

Sign up to get free protection for your applications and to get access to all the features.
data/work/parser.h ADDED
@@ -0,0 +1,57 @@
1
+ // Copyright 2010 Google Inc. All Rights Reserved.
2
+ //
3
+ // Licensed under the Apache License, Version 2.0 (the "License");
4
+ // you may not use this file except in compliance with the License.
5
+ // You may obtain a copy of the License at
6
+ //
7
+ // http://www.apache.org/licenses/LICENSE-2.0
8
+ //
9
+ // Unless required by applicable law or agreed to in writing, software
10
+ // distributed under the License is distributed on an "AS IS" BASIS,
11
+ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ // See the License for the specific language governing permissions and
13
+ // limitations under the License.
14
+ //
15
+ // Author: jdtang@google.com (Jonathan Tang)
16
+ //
17
+ // Contains the definition of the top-level GumboParser structure that's
18
+ // threaded through basically every internal function in the library.
19
+
20
+ #ifndef GUMBO_PARSER_H_
21
+ #define GUMBO_PARSER_H_
22
+
23
+ #ifdef __cplusplus
24
+ extern "C" {
25
+ #endif
26
+
27
+ struct _GumboParserState;
28
+ struct _GumboOutput;
29
+ struct _GumboOptions;
30
+ struct _GumboTokenizerState;
31
+
32
// Top-level parse context.  A pointer to this struct is handed to (nearly)
// every function in the library, OOP-style, so that each one can reach the
// options, the output, and whatever internal state the parse needs.
typedef struct _GumboParser {
  // Settings for this parse run (read-only through this pointer).
  const struct _GumboOptions* _options;

  // Output for the parse.
  struct _GumboOutput* _output;

  // Internal tokenizer state.  Held by pointer to avoid a cyclic dependency
  // on html5tokenizer.h.  The main parse routine initializes it when the
  // parse starts and destroys it when the parse ends, so end-users never see
  // a meaningful value here.
  struct _GumboTokenizerState* _tokenizer_state;

  // Internal parser state.  Initialized on parse start and destroyed on parse
  // end; end-users never see a meaningful value here either.
  struct _GumboParserState* _parser_state;
} GumboParser;
52
+
53
+ #ifdef __cplusplus
54
+ }
55
+ #endif
56
+
57
+ #endif // GUMBO_PARSER_H_
@@ -0,0 +1,106 @@
1
+ // Copyright 2010 Google Inc. All Rights Reserved.
2
+ //
3
+ // Licensed under the Apache License, Version 2.0 (the "License");
4
+ // you may not use this file except in compliance with the License.
5
+ // You may obtain a copy of the License at
6
+ //
7
+ // http://www.apache.org/licenses/LICENSE-2.0
8
+ //
9
+ // Unless required by applicable law or agreed to in writing, software
10
+ // distributed under the License is distributed on an "AS IS" BASIS,
11
+ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ // See the License for the specific language governing permissions and
13
+ // limitations under the License.
14
+ //
15
+ // Author: jdtang@google.com (Jonathan Tang)
16
+
17
+ #include "string_buffer.h"
18
+
19
+ #include <assert.h>
20
+ #include <stdlib.h>
21
+ #include <string.h>
22
+ #include <strings.h>
23
+
24
+ #include "string_piece.h"
25
+ #include "util.h"
26
+
27
+ struct _GumboParser;
28
+
29
// Capacity, in bytes, given to a string buffer by gumbo_string_buffer_init.
static const size_t kDefaultStringBufferSize = 10;
30
+
31
+ static void maybe_resize_string_buffer(
32
+ struct _GumboParser* parser, size_t additional_chars,
33
+ GumboStringBuffer* buffer) {
34
+ size_t new_length = buffer->length + additional_chars;
35
+ size_t new_capacity = buffer->capacity;
36
+ while (new_capacity < new_length) {
37
+ new_capacity *= 2;
38
+ }
39
+ if (new_capacity != buffer->capacity) {
40
+ char* new_data = gumbo_parser_allocate(parser, new_capacity);
41
+ memcpy(new_data, buffer->data, buffer->length);
42
+ gumbo_parser_deallocate(parser, buffer->data);
43
+ buffer->data = new_data;
44
+ buffer->capacity = new_capacity;
45
+ }
46
+ }
47
+
48
+ void gumbo_string_buffer_init(
49
+ struct _GumboParser* parser, GumboStringBuffer* output) {
50
+ output->data = gumbo_parser_allocate(parser, kDefaultStringBufferSize);
51
+ output->length = 0;
52
+ output->capacity = kDefaultStringBufferSize;
53
+ }
54
+
55
+ void gumbo_string_buffer_reserve(
56
+ struct _GumboParser* parser, size_t min_capacity,
57
+ GumboStringBuffer* output) {
58
+ maybe_resize_string_buffer(parser, min_capacity - output->length, output);
59
+ }
60
+
61
+ void gumbo_string_buffer_append_codepoint(
62
+ struct _GumboParser* parser, int c, GumboStringBuffer* output) {
63
+ // num_bytes is actually the number of continuation bytes, 1 less than the
64
+ // total number of bytes. This is done to keep the loop below simple and
65
+ // should probably change if we unroll it.
66
+ int num_bytes, prefix;
67
+ if (c <= 0x7f) {
68
+ num_bytes = 0;
69
+ prefix = 0;
70
+ } else if (c <= 0x7ff) {
71
+ num_bytes = 1;
72
+ prefix = 0xc0;
73
+ } else if (c <= 0xffff) {
74
+ num_bytes = 2;
75
+ prefix = 0xe0;
76
+ } else {
77
+ num_bytes = 3;
78
+ prefix = 0xf0;
79
+ }
80
+ maybe_resize_string_buffer(parser, num_bytes + 1, output);
81
+ output->data[output->length++] = prefix | (c >> (num_bytes * 6));
82
+ for (int i = num_bytes - 1; i >= 0; --i) {
83
+ output->data[output->length++] = 0x80 | (0x3f & (c >> (i * 6)));
84
+ }
85
+ }
86
+
87
+ void gumbo_string_buffer_append_string(
88
+ struct _GumboParser* parser, GumboStringPiece* str,
89
+ GumboStringBuffer* output) {
90
+ maybe_resize_string_buffer(parser, str->length, output);
91
+ memcpy(output->data + output->length, str->data, str->length);
92
+ output->length += str->length;
93
+ }
94
+
95
+ char* gumbo_string_buffer_to_string(
96
+ struct _GumboParser* parser, GumboStringBuffer* input) {
97
+ char* buffer = gumbo_parser_allocate(parser, input->length + 1);
98
+ memcpy(buffer, input->data, input->length);
99
+ buffer[input->length] = '\0';
100
+ return buffer;
101
+ }
102
+
103
+ void gumbo_string_buffer_destroy(
104
+ struct _GumboParser* parser, GumboStringBuffer* buffer) {
105
+ gumbo_parser_deallocate(parser, buffer->data);
106
+ }
@@ -0,0 +1,82 @@
1
+ // Copyright 2010 Google Inc. All Rights Reserved.
2
+ //
3
+ // Licensed under the Apache License, Version 2.0 (the "License");
4
+ // you may not use this file except in compliance with the License.
5
+ // You may obtain a copy of the License at
6
+ //
7
+ // http://www.apache.org/licenses/LICENSE-2.0
8
+ //
9
+ // Unless required by applicable law or agreed to in writing, software
10
+ // distributed under the License is distributed on an "AS IS" BASIS,
11
+ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ // See the License for the specific language governing permissions and
13
+ // limitations under the License.
14
+ //
15
+ // Author: jdtang@google.com (Jonathan Tang)
16
+ //
17
+ #ifndef GUMBO_STRING_BUFFER_H_
18
+ #define GUMBO_STRING_BUFFER_H_
19
+
20
+ #include <stdbool.h>
21
+ #include <stddef.h>
22
+
23
+ #ifdef __cplusplus
24
+ extern "C" {
25
+ #endif
26
+
27
+ // Forward declaration since it's passed into some of the functions in this
28
+ // header.
29
+ struct _GumboParser;
30
+ struct _GumboStringPiece;
31
+
32
// A struct representing a mutable, growable string.  This consists of a
// heap-allocated buffer that may grow (by doubling) as necessary.  When
// converting to a string, a new buffer is allocated that is only as long as
// it needs to be.  Note that the internal buffer here is *not* nul-terminated,
// so be sure not to use ordinary string manipulation functions on it.
typedef struct _GumboStringBuffer {
  // A pointer to the beginning of the buffer.  gumbo_string_buffer_init
  // allocates storage up front, so this points at allocated memory even while
  // length == 0.
  char* data;

  // The number of bytes of |data| currently in use.  May be zero.
  size_t length;

  // The number of bytes allocated for |data|.
  size_t capacity;
} GumboStringBuffer;
47
+
48
+ // Initializes a new GumboStringBuffer.
49
+ void gumbo_string_buffer_init(
50
+ struct _GumboParser* parser, GumboStringBuffer* output);
51
+
52
+ // Ensures that the buffer contains at least a certain amount of space. Most
53
+ // useful with snprintf and the other length-delimited string functions, which
54
+ // may want to write directly into the buffer.
55
+ void gumbo_string_buffer_reserve(
56
+ struct _GumboParser* parser, size_t min_capacity,
57
+ GumboStringBuffer* output);
58
+
59
+ // Appends a single Unicode codepoint onto the end of the GumboStringBuffer.
60
+ // This is essentially a UTF-8 encoder, and may add 1-4 bytes depending on the
61
+ // value of the codepoint.
62
+ void gumbo_string_buffer_append_codepoint(
63
+ struct _GumboParser* parser, int c, GumboStringBuffer* output);
64
+
65
+ // Appends a string onto the end of the GumboStringBuffer.
66
+ void gumbo_string_buffer_append_string(
67
+ struct _GumboParser* parser, struct _GumboStringPiece* str,
68
+ GumboStringBuffer* output);
69
+
70
+ // Converts this string buffer to a char* string, allocating a new buffer for it.
71
+ char* gumbo_string_buffer_to_string(
72
+ struct _GumboParser* parser, GumboStringBuffer* input);
73
+
74
+ // Deallocates this GumboStringBuffer.
75
+ void gumbo_string_buffer_destroy(
76
+ struct _GumboParser* parser, GumboStringBuffer* buffer);
77
+
78
+ #ifdef __cplusplus
79
+ }
80
+ #endif
81
+
82
+ #endif // GUMBO_STRING_BUFFER_H_
@@ -0,0 +1,49 @@
1
+ // Copyright 2010 Google Inc. All Rights Reserved.
2
+ //
3
+ // Licensed under the Apache License, Version 2.0 (the "License");
4
+ // you may not use this file except in compliance with the License.
5
+ // You may obtain a copy of the License at
6
+ //
7
+ // http://www.apache.org/licenses/LICENSE-2.0
8
+ //
9
+ // Unless required by applicable law or agreed to in writing, software
10
+ // distributed under the License is distributed on an "AS IS" BASIS,
11
+ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ // See the License for the specific language governing permissions and
13
+ // limitations under the License.
14
+ //
15
+ // Author: jdtang@google.com (Jonathan Tang)
16
+
17
+ #include "string_piece.h"
18
+
19
+ #include <assert.h>
20
+ #include <stdlib.h>
21
+ #include <string.h>
22
+ #include <strings.h>
23
+
24
+ #include "util.h"
25
+
26
+ struct _GumboParser;
27
+
28
+ const GumboStringPiece kGumboEmptyString = { NULL, 0 };
29
+
30
+ bool gumbo_string_equals(
31
+ const GumboStringPiece* str1, const GumboStringPiece* str2) {
32
+ return str1->length == str2->length &&
33
+ !memcmp(str1->data, str2->data, str1->length);
34
+ }
35
+
36
+ bool gumbo_string_equals_ignore_case(
37
+ const GumboStringPiece* str1, const GumboStringPiece* str2) {
38
+ return str1->length == str2->length &&
39
+ !strncasecmp(str1->data, str2->data, str1->length);
40
+ }
41
+
42
+ void gumbo_string_copy(
43
+ struct _GumboParser* parser, GumboStringPiece* dest,
44
+ const GumboStringPiece* source) {
45
+ dest->length = source->length;
46
+ char* buffer = gumbo_parser_allocate(parser, source->length);
47
+ memcpy(buffer, source->data, source->length);
48
+ dest->data = buffer;
49
+ }
@@ -0,0 +1,39 @@
1
+ // Copyright 2010 Google Inc. All Rights Reserved.
2
+ //
3
+ // Licensed under the Apache License, Version 2.0 (the "License");
4
+ // you may not use this file except in compliance with the License.
5
+ // You may obtain a copy of the License at
6
+ //
7
+ // http://www.apache.org/licenses/LICENSE-2.0
8
+ //
9
+ // Unless required by applicable law or agreed to in writing, software
10
+ // distributed under the License is distributed on an "AS IS" BASIS,
11
+ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ // See the License for the specific language governing permissions and
13
+ // limitations under the License.
14
+ //
15
+ // Author: jdtang@google.com (Jonathan Tang)
16
+
17
+ #ifndef GUMBO_STRING_PIECE_H_
18
+ #define GUMBO_STRING_PIECE_H_
19
+
20
+ #include "gumbo.h"
21
+
22
+ #ifdef __cplusplus
23
+ extern "C" {
24
+ #endif
25
+
26
+ struct _GumboParser;
27
+
28
+ // Performs a deep-copy of a GumboStringPiece, allocating a fresh buffer in the
29
+ // destination and copying over the characters from source. Dest should be
30
+ // empty, with no buffer allocated; otherwise, this leaks it.
31
+ void gumbo_string_copy(
32
+ struct _GumboParser* parser, GumboStringPiece* dest,
33
+ const GumboStringPiece* source);
34
+
35
+ #ifdef __cplusplus
36
+ }
37
+ #endif
38
+
39
+ #endif // GUMBO_STRING_PIECE_H_
data/work/tag.c ADDED
@@ -0,0 +1,222 @@
1
+ // Copyright 2011 Google Inc. All Rights Reserved.
2
+ //
3
+ // Licensed under the Apache License, Version 2.0 (the "License");
4
+ // you may not use this file except in compliance with the License.
5
+ // You may obtain a copy of the License at
6
+ //
7
+ // http://www.apache.org/licenses/LICENSE-2.0
8
+ //
9
+ // Unless required by applicable law or agreed to in writing, software
10
+ // distributed under the License is distributed on an "AS IS" BASIS,
11
+ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ // See the License for the specific language governing permissions and
13
+ // limitations under the License.
14
+ //
15
+ // Author: jdtang@google.com (Jonathan Tang)
16
+
17
+ #include "gumbo.h"
18
+
19
+ #include <assert.h>
20
+ #include <ctype.h>
21
+ #include <strings.h> // For strcasecmp.
22
+
23
// Tag-name strings, indexed by GumboTag value.
// NOTE(jdtang): Keep this in sync with the GumboTag enum in the header.
// TODO(jdtang): Investigate whether there're efficiency benefits to putting the
// most common tag names first, or to putting them in alphabetical order and
// using a binary search.
const char* kGumboTagNames[] = {
  // 0 - 9
  "html", "head", "title", "base", "link",
  "meta", "style", "script", "noscript", "body",
  // 10 - 19
  "section", "nav", "article", "aside", "h1",
  "h2", "h3", "h4", "h5", "h6",
  // 20 - 29
  "hgroup", "header", "footer", "address", "p",
  "hr", "pre", "blockquote", "ol", "ul",
  // 30 - 39
  "li", "dl", "dt", "dd", "figure",
  "figcaption", "div", "a", "em", "strong",
  // 40 - 49
  "small", "s", "cite", "q", "dfn",
  "abbr", "time", "code", "var", "samp",
  // 50 - 59
  "kbd", "sub", "sup", "i", "b",
  "mark", "ruby", "rt", "rp", "bdi",
  // 60 - 69
  "bdo", "span", "br", "wbr", "ins",
  "del", "image", "img", "iframe", "embed",
  // 70 - 79
  "object", "param", "video", "audio", "source",
  "track", "canvas", "map", "area", "math",
  // 80 - 89
  "mi", "mo", "mn", "ms", "mtext",
  "mglyph", "malignmark", "annotation-xml", "svg", "foreignobject",
  // 90 - 99
  "desc", "table", "caption", "colgroup", "col",
  "tbody", "thead", "tfoot", "tr", "td",
  // 100 - 109
  "th", "form", "fieldset", "legend", "label",
  "input", "button", "select", "datalist", "optgroup",
  // 110 - 119
  "option", "textarea", "keygen", "output", "progress",
  "meter", "details", "summary", "command", "menu",
  // 120 - 129
  "applet", "acronym", "bgsound", "dir", "frame",
  "frameset", "noframes", "isindex", "listing", "xmp",
  // 130 - 139
  "nextid", "noembed", "plaintext", "rb", "strike",
  "basefont", "big", "blink", "center", "font",
  // 140 - 145
  "marquee", "multicol", "nobr", "spacer", "tt",
  "u",
  // Sentinels: neither has a printable name.
  "",  // TAG_UNKNOWN
  "",  // TAG_LAST
};
177
+
178
+ const char* gumbo_normalized_tagname(GumboTag tag) {
179
+ assert(tag <= GUMBO_TAG_LAST);
180
+ return kGumboTagNames[tag];
181
+ }
182
+
183
+ // TODO(jdtang): Add test for this.
184
+ void gumbo_tag_from_original_text(GumboStringPiece* text) {
185
+ if (text->data == NULL) {
186
+ return;
187
+ }
188
+
189
+ assert(text->length >= 2);
190
+ assert(text->data[0] == '<');
191
+ assert(text->data[text->length - 1] == '>');
192
+ if (text->data[1] == '/') {
193
+ // End tag.
194
+ assert(text->length >= 3);
195
+ text->data += 2; // Move past </
196
+ text->length -= 3;
197
+ } else {
198
+ // Start tag.
199
+ text->data += 1; // Move past <
200
+ text->length -= 2;
201
+ // strnchr is apparently not a standard C library function, so I loop
202
+ // explicitly looking for whitespace or other illegal tag characters.
203
+ for (const char* c = text->data; c != text->data + text->length; ++c) {
204
+ if (isspace(*c) || *c == '/') {
205
+ text->length = c - text->data;
206
+ break;
207
+ }
208
+ }
209
+ }
210
+ }
211
+
212
+ GumboTag gumbo_tag_enum(const char* tagname) {
213
+ for (int i = 0; i < GUMBO_TAG_LAST; ++i) {
214
+ // TODO(jdtang): strcasecmp is non-portable, so if we want to support
215
+ // non-GCC compilers, we'll need some #ifdef magic. This source already has
216
+ // pretty significant issues with MSVC6 anyway.
217
+ if (strcasecmp(tagname, kGumboTagNames[i]) == 0) {
218
+ return i;
219
+ }
220
+ }
221
+ return GUMBO_TAG_UNKNOWN;
222
+ }