nokogumbo 0.4 → 0.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/work/parser.h DELETED
@@ -1,57 +0,0 @@
1
- // Copyright 2010 Google Inc. All Rights Reserved.
2
- //
3
- // Licensed under the Apache License, Version 2.0 (the "License");
4
- // you may not use this file except in compliance with the License.
5
- // You may obtain a copy of the License at
6
- //
7
- // http://www.apache.org/licenses/LICENSE-2.0
8
- //
9
- // Unless required by applicable law or agreed to in writing, software
10
- // distributed under the License is distributed on an "AS IS" BASIS,
11
- // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
- // See the License for the specific language governing permissions and
13
- // limitations under the License.
14
- //
15
- // Author: jdtang@google.com (Jonathan Tang)
16
- //
17
- // Contains the definition of the top-level GumboParser structure that's
18
- // threaded through basically every internal function in the library.
19
-
20
- #ifndef GUMBO_PARSER_H_
21
- #define GUMBO_PARSER_H_
22
-
23
- #ifdef __cplusplus
24
- extern "C" {
25
- #endif
26
-
27
- struct _GumboParserState;
28
- struct _GumboOutput;
29
- struct _GumboOptions;
30
- struct _GumboTokenizerState;
31
-
32
- // An overarching struct that's threaded through (nearly) all functions in the
33
- // library, OOP-style. This gives each function access to the options and
34
- // output, along with any internal state needed for the parse.
35
typedef struct _GumboParser {
  // Options in effect for this parse run.  Held through a pointer-to-const,
  // so the options cannot be modified through this field.
  const struct _GumboOptions* _options;

  // Destination for the results of the parse.
  struct _GumboOutput* _output;

  // Tokenizer state.  Kept as a pointer to an incomplete type so this header
  // does not need html5tokenizer.h (which would create a cyclic dependency).
  // The main parse routine sets this up when a parse starts and tears it down
  // when the parse ends, so callers outside the library never observe a
  // meaningful value here.
  struct _GumboTokenizerState* _tokenizer_state;

  // Parser state.  Same lifetime as the tokenizer state: created at parse
  // start, destroyed at parse end, never meaningful to end-users.
  struct _GumboParserState* _parser_state;
} GumboParser;
52
-
53
- #ifdef __cplusplus
54
- }
55
- #endif
56
-
57
- #endif // GUMBO_PARSER_H_
data/work/string_buffer.c DELETED
@@ -1,106 +0,0 @@
1
- // Copyright 2010 Google Inc. All Rights Reserved.
2
- //
3
- // Licensed under the Apache License, Version 2.0 (the "License");
4
- // you may not use this file except in compliance with the License.
5
- // You may obtain a copy of the License at
6
- //
7
- // http://www.apache.org/licenses/LICENSE-2.0
8
- //
9
- // Unless required by applicable law or agreed to in writing, software
10
- // distributed under the License is distributed on an "AS IS" BASIS,
11
- // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
- // See the License for the specific language governing permissions and
13
- // limitations under the License.
14
- //
15
- // Author: jdtang@google.com (Jonathan Tang)
16
-
17
- #include "string_buffer.h"
18
-
19
- #include <assert.h>
20
- #include <stdlib.h>
21
- #include <string.h>
22
- #include <strings.h>
23
-
24
- #include "string_piece.h"
25
- #include "util.h"
26
-
27
- struct _GumboParser;
28
-
29
- static const size_t kDefaultStringBufferSize = 10;
30
-
31
- static void maybe_resize_string_buffer(
32
- struct _GumboParser* parser, size_t additional_chars,
33
- GumboStringBuffer* buffer) {
34
- size_t new_length = buffer->length + additional_chars;
35
- size_t new_capacity = buffer->capacity;
36
- while (new_capacity < new_length) {
37
- new_capacity *= 2;
38
- }
39
- if (new_capacity != buffer->capacity) {
40
- char* new_data = gumbo_parser_allocate(parser, new_capacity);
41
- memcpy(new_data, buffer->data, buffer->length);
42
- gumbo_parser_deallocate(parser, buffer->data);
43
- buffer->data = new_data;
44
- buffer->capacity = new_capacity;
45
- }
46
- }
47
-
48
- void gumbo_string_buffer_init(
49
- struct _GumboParser* parser, GumboStringBuffer* output) {
50
- output->data = gumbo_parser_allocate(parser, kDefaultStringBufferSize);
51
- output->length = 0;
52
- output->capacity = kDefaultStringBufferSize;
53
- }
54
-
55
- void gumbo_string_buffer_reserve(
56
- struct _GumboParser* parser, size_t min_capacity,
57
- GumboStringBuffer* output) {
58
- maybe_resize_string_buffer(parser, min_capacity - output->length, output);
59
- }
60
-
61
- void gumbo_string_buffer_append_codepoint(
62
- struct _GumboParser* parser, int c, GumboStringBuffer* output) {
63
- // num_bytes is actually the number of continuation bytes, 1 less than the
64
- // total number of bytes. This is done to keep the loop below simple and
65
- // should probably change if we unroll it.
66
- int num_bytes, prefix;
67
- if (c <= 0x7f) {
68
- num_bytes = 0;
69
- prefix = 0;
70
- } else if (c <= 0x7ff) {
71
- num_bytes = 1;
72
- prefix = 0xc0;
73
- } else if (c <= 0xffff) {
74
- num_bytes = 2;
75
- prefix = 0xe0;
76
- } else {
77
- num_bytes = 3;
78
- prefix = 0xf0;
79
- }
80
- maybe_resize_string_buffer(parser, num_bytes + 1, output);
81
- output->data[output->length++] = prefix | (c >> (num_bytes * 6));
82
- for (int i = num_bytes - 1; i >= 0; --i) {
83
- output->data[output->length++] = 0x80 | (0x3f & (c >> (i * 6)));
84
- }
85
- }
86
-
87
- void gumbo_string_buffer_append_string(
88
- struct _GumboParser* parser, GumboStringPiece* str,
89
- GumboStringBuffer* output) {
90
- maybe_resize_string_buffer(parser, str->length, output);
91
- memcpy(output->data + output->length, str->data, str->length);
92
- output->length += str->length;
93
- }
94
-
95
- char* gumbo_string_buffer_to_string(
96
- struct _GumboParser* parser, GumboStringBuffer* input) {
97
- char* buffer = gumbo_parser_allocate(parser, input->length + 1);
98
- memcpy(buffer, input->data, input->length);
99
- buffer[input->length] = '\0';
100
- return buffer;
101
- }
102
-
103
- void gumbo_string_buffer_destroy(
104
- struct _GumboParser* parser, GumboStringBuffer* buffer) {
105
- gumbo_parser_deallocate(parser, buffer->data);
106
- }
data/work/string_buffer.h DELETED
@@ -1,82 +0,0 @@
1
- // Copyright 2010 Google Inc. All Rights Reserved.
2
- //
3
- // Licensed under the Apache License, Version 2.0 (the "License");
4
- // you may not use this file except in compliance with the License.
5
- // You may obtain a copy of the License at
6
- //
7
- // http://www.apache.org/licenses/LICENSE-2.0
8
- //
9
- // Unless required by applicable law or agreed to in writing, software
10
- // distributed under the License is distributed on an "AS IS" BASIS,
11
- // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
- // See the License for the specific language governing permissions and
13
- // limitations under the License.
14
- //
15
- // Author: jdtang@google.com (Jonathan Tang)
16
- //
17
- #ifndef GUMBO_STRING_BUFFER_H_
18
- #define GUMBO_STRING_BUFFER_H_
19
-
20
- #include <stdbool.h>
21
- #include <stddef.h>
22
-
23
- #ifdef __cplusplus
24
- extern "C" {
25
- #endif
26
-
27
- // Forward declaration since it's passed into some of the functions in this
28
- // header.
29
- struct _GumboParser;
30
- struct _GumboStringPiece;
31
-
32
- // A struct representing a mutable, growable string. This consists of a
33
- // heap-allocated buffer that may grow (by doubling) as necessary. When
34
- // converting to a string, this allocates a new buffer that is only as long as
35
- // it needs to be. Note that the internal buffer here is *not* nul-terminated,
36
- // so be sure not to use ordinary string manipulation functions on it.
37
typedef struct _GumboStringBuffer {
  // Start of the backing store.  After gumbo_string_buffer_init() this points
  // at allocated storage even while |length| is still 0.  The contents are
  // not NUL-terminated.
  char* data;

  // Number of bytes of |data| currently in use.  May be zero.
  size_t length;

  // Total number of bytes allocated for |data|.
  size_t capacity;
} GumboStringBuffer;
47
-
48
- // Initializes a new GumboStringBuffer.
49
- void gumbo_string_buffer_init(
50
- struct _GumboParser* parser, GumboStringBuffer* output);
51
-
52
- // Ensures that the buffer contains at least a certain amount of space. Most
53
- // useful with snprintf and the other length-delimited string functions, which
54
- // may want to write directly into the buffer.
55
- void gumbo_string_buffer_reserve(
56
- struct _GumboParser* parser, size_t min_capacity,
57
- GumboStringBuffer* output);
58
-
59
- // Appends a single Unicode codepoint onto the end of the GumboStringBuffer.
60
- // This is essentially a UTF-8 encoder, and may add 1-4 bytes depending on the
61
- // value of the codepoint.
62
- void gumbo_string_buffer_append_codepoint(
63
- struct _GumboParser* parser, int c, GumboStringBuffer* output);
64
-
65
- // Appends a string onto the end of the GumboStringBuffer.
66
- void gumbo_string_buffer_append_string(
67
- struct _GumboParser* parser, struct _GumboStringPiece* str,
68
- GumboStringBuffer* output);
69
-
70
- // Converts this string buffer to a NUL-terminated char*, allocating a new buffer for it.
71
- char* gumbo_string_buffer_to_string(
72
- struct _GumboParser* parser, GumboStringBuffer* input);
73
-
74
- // Deallocates this GumboStringBuffer.
75
- void gumbo_string_buffer_destroy(
76
- struct _GumboParser* parser, GumboStringBuffer* buffer);
77
-
78
- #ifdef __cplusplus
79
- }
80
- #endif
81
-
82
- #endif // GUMBO_STRING_BUFFER_H_
data/work/string_piece.c DELETED
@@ -1,49 +0,0 @@
1
- // Copyright 2010 Google Inc. All Rights Reserved.
2
- //
3
- // Licensed under the Apache License, Version 2.0 (the "License");
4
- // you may not use this file except in compliance with the License.
5
- // You may obtain a copy of the License at
6
- //
7
- // http://www.apache.org/licenses/LICENSE-2.0
8
- //
9
- // Unless required by applicable law or agreed to in writing, software
10
- // distributed under the License is distributed on an "AS IS" BASIS,
11
- // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
- // See the License for the specific language governing permissions and
13
- // limitations under the License.
14
- //
15
- // Author: jdtang@google.com (Jonathan Tang)
16
-
17
- #include "string_piece.h"
18
-
19
- #include <assert.h>
20
- #include <stdlib.h>
21
- #include <string.h>
22
- #include <strings.h>
23
-
24
- #include "util.h"
25
-
26
- struct _GumboParser;
27
-
28
- const GumboStringPiece kGumboEmptyString = { NULL, 0 };
29
-
30
- bool gumbo_string_equals(
31
- const GumboStringPiece* str1, const GumboStringPiece* str2) {
32
- return str1->length == str2->length &&
33
- !memcmp(str1->data, str2->data, str1->length);
34
- }
35
-
36
- bool gumbo_string_equals_ignore_case(
37
- const GumboStringPiece* str1, const GumboStringPiece* str2) {
38
- return str1->length == str2->length &&
39
- !strncasecmp(str1->data, str2->data, str1->length);
40
- }
41
-
42
- void gumbo_string_copy(
43
- struct _GumboParser* parser, GumboStringPiece* dest,
44
- const GumboStringPiece* source) {
45
- dest->length = source->length;
46
- char* buffer = gumbo_parser_allocate(parser, source->length);
47
- memcpy(buffer, source->data, source->length);
48
- dest->data = buffer;
49
- }
data/work/string_piece.h DELETED
@@ -1,39 +0,0 @@
1
- // Copyright 2010 Google Inc. All Rights Reserved.
2
- //
3
- // Licensed under the Apache License, Version 2.0 (the "License");
4
- // you may not use this file except in compliance with the License.
5
- // You may obtain a copy of the License at
6
- //
7
- // http://www.apache.org/licenses/LICENSE-2.0
8
- //
9
- // Unless required by applicable law or agreed to in writing, software
10
- // distributed under the License is distributed on an "AS IS" BASIS,
11
- // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
- // See the License for the specific language governing permissions and
13
- // limitations under the License.
14
- //
15
- // Author: jdtang@google.com (Jonathan Tang)
16
-
17
- #ifndef GUMBO_STRING_PIECE_H_
18
- #define GUMBO_STRING_PIECE_H_
19
-
20
- #include "gumbo.h"
21
-
22
- #ifdef __cplusplus
23
- extern "C" {
24
- #endif
25
-
26
- struct _GumboParser;
27
-
28
- // Performs a deep copy of a GumboStringPiece, allocating a fresh buffer in the
29
- // destination and copying over the characters from source. Dest should be
30
- // empty, with no buffer allocated; otherwise, this leaks it.
31
- void gumbo_string_copy(
32
- struct _GumboParser* parser, GumboStringPiece* dest,
33
- const GumboStringPiece* source);
34
-
35
- #ifdef __cplusplus
36
- }
37
- #endif
38
-
39
- #endif // GUMBO_STRING_PIECE_H_
data/work/tag.c DELETED
@@ -1,222 +0,0 @@
1
- // Copyright 2011 Google Inc. All Rights Reserved.
2
- //
3
- // Licensed under the Apache License, Version 2.0 (the "License");
4
- // you may not use this file except in compliance with the License.
5
- // You may obtain a copy of the License at
6
- //
7
- // http://www.apache.org/licenses/LICENSE-2.0
8
- //
9
- // Unless required by applicable law or agreed to in writing, software
10
- // distributed under the License is distributed on an "AS IS" BASIS,
11
- // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
- // See the License for the specific language governing permissions and
13
- // limitations under the License.
14
- //
15
- // Author: jdtang@google.com (Jonathan Tang)
16
-
17
- #include "gumbo.h"
18
-
19
- #include <assert.h>
20
- #include <ctype.h>
21
- #include <strings.h> // For strcasecmp.
22
-
23
- // NOTE(jdtang): Keep this in sync with the GumboTag enum in the header.
24
- // TODO(jdtang): Investigate whether there're efficiency benefits to putting the
25
- // most common tag names first, or to putting them in alphabetical order and
26
- // using a binary search.
27
// Lower-case tag names, indexed by tag value.  The order of the entries is
// significant and must not be changed; rows hold eight entries each so that
// the index of an entry is (row * 8 + column).
const char* kGumboTagNames[] = {
  "html", "head", "title", "base", "link", "meta", "style", "script",
  "noscript", "body", "section", "nav", "article", "aside", "h1", "h2",
  "h3", "h4", "h5", "h6", "hgroup", "header", "footer", "address",
  "p", "hr", "pre", "blockquote", "ol", "ul", "li", "dl",
  "dt", "dd", "figure", "figcaption", "div", "a", "em", "strong",
  "small", "s", "cite", "q", "dfn", "abbr", "time", "code",
  "var", "samp", "kbd", "sub", "sup", "i", "b", "mark",
  "ruby", "rt", "rp", "bdi", "bdo", "span", "br", "wbr",
  "ins", "del", "image", "img", "iframe", "embed", "object", "param",
  "video", "audio", "source", "track", "canvas", "map", "area", "math",
  "mi", "mo", "mn", "ms", "mtext", "mglyph", "malignmark", "annotation-xml",
  "svg", "foreignobject", "desc", "table", "caption", "colgroup", "col", "tbody",
  "thead", "tfoot", "tr", "td", "th", "form", "fieldset", "legend",
  "label", "input", "button", "select", "datalist", "optgroup", "option", "textarea",
  "keygen", "output", "progress", "meter", "details", "summary", "command", "menu",
  "applet", "acronym", "bgsound", "dir", "frame", "frameset", "noframes", "isindex",
  "listing", "xmp", "nextid", "noembed", "plaintext", "rb", "strike", "basefont",
  "big", "blink", "center", "font", "marquee", "multicol", "nobr", "spacer",
  "tt", "u",
  "",  // TAG_UNKNOWN
  "",  // TAG_LAST
};
177
-
178
- const char* gumbo_normalized_tagname(GumboTag tag) {
179
- assert(tag <= GUMBO_TAG_LAST);
180
- return kGumboTagNames[tag];
181
- }
182
-
183
- // TODO(jdtang): Add test for this.
184
- void gumbo_tag_from_original_text(GumboStringPiece* text) {
185
- if (text->data == NULL) {
186
- return;
187
- }
188
-
189
- assert(text->length >= 2);
190
- assert(text->data[0] == '<');
191
- assert(text->data[text->length - 1] == '>');
192
- if (text->data[1] == '/') {
193
- // End tag.
194
- assert(text->length >= 3);
195
- text->data += 2; // Move past </
196
- text->length -= 3;
197
- } else {
198
- // Start tag.
199
- text->data += 1; // Move past <
200
- text->length -= 2;
201
- // strnchr is apparently not a standard C library function, so I loop
202
- // explicitly looking for whitespace or other illegal tag characters.
203
- for (const char* c = text->data; c != text->data + text->length; ++c) {
204
- if (isspace(*c) || *c == '/') {
205
- text->length = c - text->data;
206
- break;
207
- }
208
- }
209
- }
210
- }
211
-
212
- GumboTag gumbo_tag_enum(const char* tagname) {
213
- for (int i = 0; i < GUMBO_TAG_LAST; ++i) {
214
- // TODO(jdtang): strcasecmp is non-portable, so if we want to support
215
- // non-GCC compilers, we'll need some #ifdef magic. This source already has
216
- // pretty significant issues with MSVC6 anyway.
217
- if (strcasecmp(tagname, kGumboTagNames[i]) == 0) {
218
- return i;
219
- }
220
- }
221
- return GUMBO_TAG_UNKNOWN;
222
- }