nokogumbo 0.4 → 0.5

Sign up to get free protection for your applications and to get access to all the features.
data/work/parser.h DELETED
@@ -1,57 +0,0 @@
1
- // Copyright 2010 Google Inc. All Rights Reserved.
2
- //
3
- // Licensed under the Apache License, Version 2.0 (the "License");
4
- // you may not use this file except in compliance with the License.
5
- // You may obtain a copy of the License at
6
- //
7
- // http://www.apache.org/licenses/LICENSE-2.0
8
- //
9
- // Unless required by applicable law or agreed to in writing, software
10
- // distributed under the License is distributed on an "AS IS" BASIS,
11
- // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
- // See the License for the specific language governing permissions and
13
- // limitations under the License.
14
- //
15
- // Author: jdtang@google.com (Jonathan Tang)
16
- //
17
- // Contains the definition of the top-level GumboParser structure that's
18
- // threaded through basically every internal function in the library.
19
-
20
- #ifndef GUMBO_PARSER_H_
21
- #define GUMBO_PARSER_H_
22
-
23
- #ifdef __cplusplus
24
- extern "C" {
25
- #endif
26
-
27
- struct _GumboParserState;
28
- struct _GumboOutput;
29
- struct _GumboOptions;
30
- struct _GumboTokenizerState;
31
-
32
- // An overarching struct that's threaded through (nearly) all functions in the
33
- // library, OOP-style. This gives each function access to the options and
34
- // output, along with any internal state needed for the parse.
35
- typedef struct _GumboParser {
36
- // Settings for this parse run.
37
- const struct _GumboOptions* _options;
38
-
39
- // Output for the parse.
40
- struct _GumboOutput* _output;
41
-
42
- // The internal tokenizer state, defined as a pointer to avoid a cyclic
43
- // dependency on html5tokenizer.h. The main parse routine is responsible for
44
- // initializing this on parse start, and destroying it on parse end.
45
- // End-users will never see a non-garbage value in this pointer.
46
- struct _GumboTokenizerState* _tokenizer_state;
47
-
48
- // The internal parser state. Initialized on parse start and destroyed on
49
- // parse end; end-users will never see a non-garbage value in this pointer.
50
- struct _GumboParserState* _parser_state;
51
- } GumboParser;
52
-
53
- #ifdef __cplusplus
54
- }
55
- #endif
56
-
57
- #endif // GUMBO_PARSER_H_
data/work/string_buffer.c DELETED
@@ -1,106 +0,0 @@
1
- // Copyright 2010 Google Inc. All Rights Reserved.
2
- //
3
- // Licensed under the Apache License, Version 2.0 (the "License");
4
- // you may not use this file except in compliance with the License.
5
- // You may obtain a copy of the License at
6
- //
7
- // http://www.apache.org/licenses/LICENSE-2.0
8
- //
9
- // Unless required by applicable law or agreed to in writing, software
10
- // distributed under the License is distributed on an "AS IS" BASIS,
11
- // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
- // See the License for the specific language governing permissions and
13
- // limitations under the License.
14
- //
15
- // Author: jdtang@google.com (Jonathan Tang)
16
-
17
- #include "string_buffer.h"
18
-
19
- #include <assert.h>
20
- #include <stdlib.h>
21
- #include <string.h>
22
- #include <strings.h>
23
-
24
- #include "string_piece.h"
25
- #include "util.h"
26
-
27
- struct _GumboParser;
28
-
29
- static const size_t kDefaultStringBufferSize = 10;
30
-
31
- static void maybe_resize_string_buffer(
32
- struct _GumboParser* parser, size_t additional_chars,
33
- GumboStringBuffer* buffer) {
34
- size_t new_length = buffer->length + additional_chars;
35
- size_t new_capacity = buffer->capacity;
36
- while (new_capacity < new_length) {
37
- new_capacity *= 2;
38
- }
39
- if (new_capacity != buffer->capacity) {
40
- char* new_data = gumbo_parser_allocate(parser, new_capacity);
41
- memcpy(new_data, buffer->data, buffer->length);
42
- gumbo_parser_deallocate(parser, buffer->data);
43
- buffer->data = new_data;
44
- buffer->capacity = new_capacity;
45
- }
46
- }
47
-
48
- void gumbo_string_buffer_init(
49
- struct _GumboParser* parser, GumboStringBuffer* output) {
50
- output->data = gumbo_parser_allocate(parser, kDefaultStringBufferSize);
51
- output->length = 0;
52
- output->capacity = kDefaultStringBufferSize;
53
- }
54
-
55
- void gumbo_string_buffer_reserve(
56
- struct _GumboParser* parser, size_t min_capacity,
57
- GumboStringBuffer* output) {
58
- maybe_resize_string_buffer(parser, min_capacity - output->length, output);
59
- }
60
-
61
- void gumbo_string_buffer_append_codepoint(
62
- struct _GumboParser* parser, int c, GumboStringBuffer* output) {
63
- // num_bytes is actually the number of continuation bytes, 1 less than the
64
- // total number of bytes. This is done to keep the loop below simple and
65
- // should probably change if we unroll it.
66
- int num_bytes, prefix;
67
- if (c <= 0x7f) {
68
- num_bytes = 0;
69
- prefix = 0;
70
- } else if (c <= 0x7ff) {
71
- num_bytes = 1;
72
- prefix = 0xc0;
73
- } else if (c <= 0xffff) {
74
- num_bytes = 2;
75
- prefix = 0xe0;
76
- } else {
77
- num_bytes = 3;
78
- prefix = 0xf0;
79
- }
80
- maybe_resize_string_buffer(parser, num_bytes + 1, output);
81
- output->data[output->length++] = prefix | (c >> (num_bytes * 6));
82
- for (int i = num_bytes - 1; i >= 0; --i) {
83
- output->data[output->length++] = 0x80 | (0x3f & (c >> (i * 6)));
84
- }
85
- }
86
-
87
- void gumbo_string_buffer_append_string(
88
- struct _GumboParser* parser, GumboStringPiece* str,
89
- GumboStringBuffer* output) {
90
- maybe_resize_string_buffer(parser, str->length, output);
91
- memcpy(output->data + output->length, str->data, str->length);
92
- output->length += str->length;
93
- }
94
-
95
- char* gumbo_string_buffer_to_string(
96
- struct _GumboParser* parser, GumboStringBuffer* input) {
97
- char* buffer = gumbo_parser_allocate(parser, input->length + 1);
98
- memcpy(buffer, input->data, input->length);
99
- buffer[input->length] = '\0';
100
- return buffer;
101
- }
102
-
103
- void gumbo_string_buffer_destroy(
104
- struct _GumboParser* parser, GumboStringBuffer* buffer) {
105
- gumbo_parser_deallocate(parser, buffer->data);
106
- }
data/work/string_buffer.h DELETED
@@ -1,82 +0,0 @@
1
- // Copyright 2010 Google Inc. All Rights Reserved.
2
- //
3
- // Licensed under the Apache License, Version 2.0 (the "License");
4
- // you may not use this file except in compliance with the License.
5
- // You may obtain a copy of the License at
6
- //
7
- // http://www.apache.org/licenses/LICENSE-2.0
8
- //
9
- // Unless required by applicable law or agreed to in writing, software
10
- // distributed under the License is distributed on an "AS IS" BASIS,
11
- // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
- // See the License for the specific language governing permissions and
13
- // limitations under the License.
14
- //
15
- // Author: jdtang@google.com (Jonathan Tang)
16
- //
17
- #ifndef GUMBO_STRING_BUFFER_H_
18
- #define GUMBO_STRING_BUFFER_H_
19
-
20
- #include <stdbool.h>
21
- #include <stddef.h>
22
-
23
- #ifdef __cplusplus
24
- extern "C" {
25
- #endif
26
-
27
- // Forward declarations since they're passed into some of the functions in this
28
- // header.
29
- struct _GumboParser;
30
- struct _GumboStringPiece;
31
-
32
- // A struct representing a mutable, growable string. This consists of a
33
- // heap-allocated buffer that may grow (by doubling) as necessary. When
34
- // converting to a string, this allocates a new buffer that is only as long as
35
- // it needs to be. Note that the internal buffer here is *not* nul-terminated,
36
- // so be sure not to use ordinary string manipulation functions on it.
37
- typedef struct _GumboStringBuffer {
38
- // A pointer to the beginning of the buffer. gumbo_string_buffer_init allocates it up front, so it is set even when length == 0.
39
- char* data;
40
-
41
- // The length of the string fragment, in bytes. May be zero.
42
- size_t length;
43
-
44
- // The capacity of the buffer, in bytes.
45
- size_t capacity;
46
- } GumboStringBuffer;
47
-
48
- // Initializes a new GumboStringBuffer.
49
- void gumbo_string_buffer_init(
50
- struct _GumboParser* parser, GumboStringBuffer* output);
51
-
52
- // Ensures that the buffer's capacity is at least min_capacity bytes. Most
53
- // useful with snprintf and the other length-delimited string functions, which
54
- // may want to write directly into the buffer.
55
- void gumbo_string_buffer_reserve(
56
- struct _GumboParser* parser, size_t min_capacity,
57
- GumboStringBuffer* output);
58
-
59
- // Appends a single Unicode codepoint onto the end of the GumboStringBuffer.
60
- // This is essentially a UTF-8 encoder, and may add 1-4 bytes depending on the
61
- // value of the codepoint.
62
- void gumbo_string_buffer_append_codepoint(
63
- struct _GumboParser* parser, int c, GumboStringBuffer* output);
64
-
65
- // Appends a string onto the end of the GumboStringBuffer.
66
- void gumbo_string_buffer_append_string(
67
- struct _GumboParser* parser, struct _GumboStringPiece* str,
68
- GumboStringBuffer* output);
69
-
70
- // Converts this string buffer to a nul-terminated char*, allocating a new buffer for it.
71
- char* gumbo_string_buffer_to_string(
72
- struct _GumboParser* parser, GumboStringBuffer* input);
73
-
74
- // Deallocates this GumboStringBuffer.
75
- void gumbo_string_buffer_destroy(
76
- struct _GumboParser* parser, GumboStringBuffer* buffer);
77
-
78
- #ifdef __cplusplus
79
- }
80
- #endif
81
-
82
- #endif // GUMBO_STRING_BUFFER_H_
data/work/string_piece.c DELETED
@@ -1,49 +0,0 @@
1
- // Copyright 2010 Google Inc. All Rights Reserved.
2
- //
3
- // Licensed under the Apache License, Version 2.0 (the "License");
4
- // you may not use this file except in compliance with the License.
5
- // You may obtain a copy of the License at
6
- //
7
- // http://www.apache.org/licenses/LICENSE-2.0
8
- //
9
- // Unless required by applicable law or agreed to in writing, software
10
- // distributed under the License is distributed on an "AS IS" BASIS,
11
- // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
- // See the License for the specific language governing permissions and
13
- // limitations under the License.
14
- //
15
- // Author: jdtang@google.com (Jonathan Tang)
16
-
17
- #include "string_piece.h"
18
-
19
- #include <assert.h>
20
- #include <stdlib.h>
21
- #include <string.h>
22
- #include <strings.h>
23
-
24
- #include "util.h"
25
-
26
- struct _GumboParser;
27
-
28
- const GumboStringPiece kGumboEmptyString = { NULL, 0 };
29
-
30
- bool gumbo_string_equals(
31
- const GumboStringPiece* str1, const GumboStringPiece* str2) {
32
- return str1->length == str2->length &&
33
- !memcmp(str1->data, str2->data, str1->length);
34
- }
35
-
36
- bool gumbo_string_equals_ignore_case(
37
- const GumboStringPiece* str1, const GumboStringPiece* str2) {
38
- return str1->length == str2->length &&
39
- !strncasecmp(str1->data, str2->data, str1->length);
40
- }
41
-
42
- void gumbo_string_copy(
43
- struct _GumboParser* parser, GumboStringPiece* dest,
44
- const GumboStringPiece* source) {
45
- dest->length = source->length;
46
- char* buffer = gumbo_parser_allocate(parser, source->length);
47
- memcpy(buffer, source->data, source->length);
48
- dest->data = buffer;
49
- }
data/work/string_piece.h DELETED
@@ -1,39 +0,0 @@
1
- // Copyright 2010 Google Inc. All Rights Reserved.
2
- //
3
- // Licensed under the Apache License, Version 2.0 (the "License");
4
- // you may not use this file except in compliance with the License.
5
- // You may obtain a copy of the License at
6
- //
7
- // http://www.apache.org/licenses/LICENSE-2.0
8
- //
9
- // Unless required by applicable law or agreed to in writing, software
10
- // distributed under the License is distributed on an "AS IS" BASIS,
11
- // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
- // See the License for the specific language governing permissions and
13
- // limitations under the License.
14
- //
15
- // Author: jdtang@google.com (Jonathan Tang)
16
-
17
- #ifndef GUMBO_STRING_PIECE_H_
18
- #define GUMBO_STRING_PIECE_H_
19
-
20
- #include "gumbo.h"
21
-
22
- #ifdef __cplusplus
23
- extern "C" {
24
- #endif
25
-
26
- struct _GumboParser;
27
-
28
- // Performs a deep-copy of a GumboStringPiece, allocating a fresh buffer in the
29
- // destination and copying over the characters from source. Dest should be
30
- // empty, with no buffer allocated; otherwise, this leaks it.
31
- void gumbo_string_copy(
32
- struct _GumboParser* parser, GumboStringPiece* dest,
33
- const GumboStringPiece* source);
34
-
35
- #ifdef __cplusplus
36
- }
37
- #endif
38
-
39
- #endif // GUMBO_STRING_PIECE_H_
data/work/tag.c DELETED
@@ -1,222 +0,0 @@
1
- // Copyright 2011 Google Inc. All Rights Reserved.
2
- //
3
- // Licensed under the Apache License, Version 2.0 (the "License");
4
- // you may not use this file except in compliance with the License.
5
- // You may obtain a copy of the License at
6
- //
7
- // http://www.apache.org/licenses/LICENSE-2.0
8
- //
9
- // Unless required by applicable law or agreed to in writing, software
10
- // distributed under the License is distributed on an "AS IS" BASIS,
11
- // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
- // See the License for the specific language governing permissions and
13
- // limitations under the License.
14
- //
15
- // Author: jdtang@google.com (Jonathan Tang)
16
-
17
- #include "gumbo.h"
18
-
19
- #include <assert.h>
20
- #include <ctype.h>
21
- #include <strings.h> // For strcasecmp.
22
-
23
- // NOTE(jdtang): Keep this in sync with the GumboTag enum in the header.
24
- // TODO(jdtang): Investigate whether there're efficiency benefits to putting the
25
- // most common tag names first, or to putting them in alphabetical order and
26
- // using a binary search.
27
- const char* kGumboTagNames[] = {
28
- "html",
29
- "head",
30
- "title",
31
- "base",
32
- "link",
33
- "meta",
34
- "style",
35
- "script",
36
- "noscript",
37
- "body",
38
- "section",
39
- "nav",
40
- "article",
41
- "aside",
42
- "h1",
43
- "h2",
44
- "h3",
45
- "h4",
46
- "h5",
47
- "h6",
48
- "hgroup",
49
- "header",
50
- "footer",
51
- "address",
52
- "p",
53
- "hr",
54
- "pre",
55
- "blockquote",
56
- "ol",
57
- "ul",
58
- "li",
59
- "dl",
60
- "dt",
61
- "dd",
62
- "figure",
63
- "figcaption",
64
- "div",
65
- "a",
66
- "em",
67
- "strong",
68
- "small",
69
- "s",
70
- "cite",
71
- "q",
72
- "dfn",
73
- "abbr",
74
- "time",
75
- "code",
76
- "var",
77
- "samp",
78
- "kbd",
79
- "sub",
80
- "sup",
81
- "i",
82
- "b",
83
- "mark",
84
- "ruby",
85
- "rt",
86
- "rp",
87
- "bdi",
88
- "bdo",
89
- "span",
90
- "br",
91
- "wbr",
92
- "ins",
93
- "del",
94
- "image",
95
- "img",
96
- "iframe",
97
- "embed",
98
- "object",
99
- "param",
100
- "video",
101
- "audio",
102
- "source",
103
- "track",
104
- "canvas",
105
- "map",
106
- "area",
107
- "math",
108
- "mi",
109
- "mo",
110
- "mn",
111
- "ms",
112
- "mtext",
113
- "mglyph",
114
- "malignmark",
115
- "annotation-xml",
116
- "svg",
117
- "foreignobject",
118
- "desc",
119
- "table",
120
- "caption",
121
- "colgroup",
122
- "col",
123
- "tbody",
124
- "thead",
125
- "tfoot",
126
- "tr",
127
- "td",
128
- "th",
129
- "form",
130
- "fieldset",
131
- "legend",
132
- "label",
133
- "input",
134
- "button",
135
- "select",
136
- "datalist",
137
- "optgroup",
138
- "option",
139
- "textarea",
140
- "keygen",
141
- "output",
142
- "progress",
143
- "meter",
144
- "details",
145
- "summary",
146
- "command",
147
- "menu",
148
- "applet",
149
- "acronym",
150
- "bgsound",
151
- "dir",
152
- "frame",
153
- "frameset",
154
- "noframes",
155
- "isindex",
156
- "listing",
157
- "xmp",
158
- "nextid",
159
- "noembed",
160
- "plaintext",
161
- "rb",
162
- "strike",
163
- "basefont",
164
- "big",
165
- "blink",
166
- "center",
167
- "font",
168
- "marquee",
169
- "multicol",
170
- "nobr",
171
- "spacer",
172
- "tt",
173
- "u",
174
- "", // TAG_UNKNOWN
175
- "", // TAG_LAST
176
- };
177
-
178
- const char* gumbo_normalized_tagname(GumboTag tag) {
179
- assert(tag <= GUMBO_TAG_LAST);
180
- return kGumboTagNames[tag];
181
- }
182
-
183
- // TODO(jdtang): Add test for this.
184
- void gumbo_tag_from_original_text(GumboStringPiece* text) {
185
- if (text->data == NULL) {
186
- return;
187
- }
188
-
189
- assert(text->length >= 2);
190
- assert(text->data[0] == '<');
191
- assert(text->data[text->length - 1] == '>');
192
- if (text->data[1] == '/') {
193
- // End tag.
194
- assert(text->length >= 3);
195
- text->data += 2; // Move past </
196
- text->length -= 3;
197
- } else {
198
- // Start tag.
199
- text->data += 1; // Move past <
200
- text->length -= 2;
201
- // strnchr is apparently not a standard C library function, so I loop
202
- // explicitly looking for whitespace or other illegal tag characters.
203
- for (const char* c = text->data; c != text->data + text->length; ++c) {
204
- if (isspace(*c) || *c == '/') {
205
- text->length = c - text->data;
206
- break;
207
- }
208
- }
209
- }
210
- }
211
-
212
- GumboTag gumbo_tag_enum(const char* tagname) {
213
- for (int i = 0; i < GUMBO_TAG_LAST; ++i) {
214
- // TODO(jdtang): strcasecmp is non-portable, so if we want to support
215
- // non-GCC compilers, we'll need some #ifdef magic. This source already has
216
- // pretty significant issues with MSVC6 anyway.
217
- if (strcasecmp(tagname, kGumboTagNames[i]) == 0) {
218
- return i;
219
- }
220
- }
221
- return GUMBO_TAG_UNKNOWN;
222
- }