ruby-gumbo 1.0.2 → 1.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (37) hide show
  1. checksums.yaml +4 -4
  2. data/LICENSE +1 -1
  3. data/README.mkd +28 -31
  4. data/Rakefile +60 -59
  5. data/ext/extconf.rb +17 -9
  6. data/ext/{gumbo.c → ruby_gumbo_ext.c} +29 -28
  7. data/lib/gumbo.rb +19 -0
  8. data/lib/gumbo/element.rb +52 -0
  9. data/lib/gumbo/{extra.rb → node.rb} +19 -22
  10. data/lib/gumbo/text.rb +29 -0
  11. data/vendor/gumbo-parser/src/attribute.c +44 -0
  12. data/vendor/gumbo-parser/src/attribute.h +37 -0
  13. data/vendor/gumbo-parser/src/char_ref.c +2561 -0
  14. data/vendor/gumbo-parser/src/char_ref.h +61 -0
  15. data/vendor/gumbo-parser/src/error.c +258 -0
  16. data/vendor/gumbo-parser/src/error.h +227 -0
  17. data/vendor/gumbo-parser/src/gumbo.h +807 -0
  18. data/vendor/gumbo-parser/src/insertion_mode.h +57 -0
  19. data/vendor/gumbo-parser/src/parser.c +3917 -0
  20. data/vendor/gumbo-parser/src/parser.h +57 -0
  21. data/vendor/gumbo-parser/src/string_buffer.c +106 -0
  22. data/vendor/gumbo-parser/src/string_buffer.h +81 -0
  23. data/vendor/gumbo-parser/src/string_piece.c +49 -0
  24. data/vendor/gumbo-parser/src/string_piece.h +39 -0
  25. data/vendor/gumbo-parser/src/tag.c +225 -0
  26. data/vendor/gumbo-parser/src/token_type.h +40 -0
  27. data/vendor/gumbo-parser/src/tokenizer.c +2980 -0
  28. data/vendor/gumbo-parser/src/tokenizer.h +123 -0
  29. data/vendor/gumbo-parser/src/tokenizer_states.h +103 -0
  30. data/vendor/gumbo-parser/src/utf8.c +275 -0
  31. data/vendor/gumbo-parser/src/utf8.h +127 -0
  32. data/vendor/gumbo-parser/src/util.c +58 -0
  33. data/vendor/gumbo-parser/src/util.h +62 -0
  34. data/vendor/gumbo-parser/src/vector.c +123 -0
  35. data/vendor/gumbo-parser/src/vector.h +69 -0
  36. metadata +40 -10
  37. data/ext/extconf.h +0 -3
@@ -0,0 +1,57 @@
1
+ // Copyright 2010 Google Inc. All Rights Reserved.
2
+ //
3
+ // Licensed under the Apache License, Version 2.0 (the "License");
4
+ // you may not use this file except in compliance with the License.
5
+ // You may obtain a copy of the License at
6
+ //
7
+ // http://www.apache.org/licenses/LICENSE-2.0
8
+ //
9
+ // Unless required by applicable law or agreed to in writing, software
10
+ // distributed under the License is distributed on an "AS IS" BASIS,
11
+ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ // See the License for the specific language governing permissions and
13
+ // limitations under the License.
14
+ //
15
+ // Author: jdtang@google.com (Jonathan Tang)
16
+ //
17
+ // Contains the definition of the top-level GumboParser structure that's
18
+ // threaded through basically every internal function in the library.
19
+
20
+ #ifndef GUMBO_PARSER_H_
21
+ #define GUMBO_PARSER_H_
22
+
23
+ #ifdef __cplusplus
24
+ extern "C" {
25
+ #endif
26
+
27
+ struct GumboInternalParserState;
28
+ struct GumboInternalOutput;
29
+ struct GumboInternalOptions;
30
+ struct GumboInternalTokenizerState;
31
+
32
+ // An overarching struct that's threaded through (nearly) all functions in the
33
+ // library, OOP-style. This gives each function access to the options and
34
+ // output, along with any internal state needed for the parse.
35
+ typedef struct GumboInternalParser {
36
+ // Settings for this parse run; read-only through this pointer-to-const.
37
+ const struct GumboInternalOptions* _options;
38
+
39
+ // Output for the parse; its struct type is only forward-declared in this header.
40
+ struct GumboInternalOutput* _output;
41
+
42
+ // The internal tokenizer state, defined as a pointer to avoid a cyclic
43
+ // dependency on html5tokenizer.h. The main parse routine is responsible for
44
+ // initializing this on parse start, and destroying it on parse end, so
45
+ // end-users never see a meaningful value here and must not dereference it.
46
+ struct GumboInternalTokenizerState* _tokenizer_state;
47
+
48
+ // The internal parser state. Initialized on parse start and destroyed on
49
+ // parse end; end-users never see a meaningful value in this pointer either.
50
+ struct GumboInternalParserState* _parser_state;
51
+ } GumboParser;
52
+
53
+ #ifdef __cplusplus
54
+ }
55
+ #endif
56
+
57
+ #endif // GUMBO_PARSER_H_
@@ -0,0 +1,106 @@
1
+ // Copyright 2010 Google Inc. All Rights Reserved.
2
+ //
3
+ // Licensed under the Apache License, Version 2.0 (the "License");
4
+ // you may not use this file except in compliance with the License.
5
+ // You may obtain a copy of the License at
6
+ //
7
+ // http://www.apache.org/licenses/LICENSE-2.0
8
+ //
9
+ // Unless required by applicable law or agreed to in writing, software
10
+ // distributed under the License is distributed on an "AS IS" BASIS,
11
+ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ // See the License for the specific language governing permissions and
13
+ // limitations under the License.
14
+ //
15
+ // Author: jdtang@google.com (Jonathan Tang)
16
+
17
+ #include "string_buffer.h"
18
+
19
+ #include <assert.h>
20
+ #include <stdlib.h>
21
+ #include <string.h>
22
+ #include <strings.h>
23
+
24
+ #include "string_piece.h"
25
+ #include "util.h"
26
+
27
+ struct GumboInternalParser;
28
+
29
+ static const size_t kDefaultStringBufferSize = 10;
30
+
31
+ static void maybe_resize_string_buffer(
32
+ struct GumboInternalParser* parser, size_t additional_chars,
33
+ GumboStringBuffer* buffer) {
34
+ size_t new_length = buffer->length + additional_chars;
35
+ size_t new_capacity = buffer->capacity;
36
+ while (new_capacity < new_length) {
37
+ new_capacity *= 2;
38
+ }
39
+ if (new_capacity != buffer->capacity) {
40
+ char* new_data = gumbo_parser_allocate(parser, new_capacity);
41
+ memcpy(new_data, buffer->data, buffer->length);
42
+ gumbo_parser_deallocate(parser, buffer->data);
43
+ buffer->data = new_data;
44
+ buffer->capacity = new_capacity;
45
+ }
46
+ }
47
+
48
+ void gumbo_string_buffer_init(
49
+ struct GumboInternalParser* parser, GumboStringBuffer* output) {
50
+ output->data = gumbo_parser_allocate(parser, kDefaultStringBufferSize);
51
+ output->length = 0;
52
+ output->capacity = kDefaultStringBufferSize;
53
+ }
54
+
55
+ void gumbo_string_buffer_reserve(
56
+ struct GumboInternalParser* parser, size_t min_capacity,
57
+ GumboStringBuffer* output) {
58
+ maybe_resize_string_buffer(parser, min_capacity - output->length, output);
59
+ }
60
+
61
+ void gumbo_string_buffer_append_codepoint(
62
+ struct GumboInternalParser* parser, int c, GumboStringBuffer* output) {
63
+ // num_bytes is actually the number of continuation bytes, 1 less than the
64
+ // total number of bytes. This is done to keep the loop below simple and
65
+ // should probably change if we unroll it.
66
+ int num_bytes, prefix;
67
+ if (c <= 0x7f) {
68
+ num_bytes = 0;
69
+ prefix = 0;
70
+ } else if (c <= 0x7ff) {
71
+ num_bytes = 1;
72
+ prefix = 0xc0;
73
+ } else if (c <= 0xffff) {
74
+ num_bytes = 2;
75
+ prefix = 0xe0;
76
+ } else {
77
+ num_bytes = 3;
78
+ prefix = 0xf0;
79
+ }
80
+ maybe_resize_string_buffer(parser, num_bytes + 1, output);
81
+ output->data[output->length++] = prefix | (c >> (num_bytes * 6));
82
+ for (int i = num_bytes - 1; i >= 0; --i) {
83
+ output->data[output->length++] = 0x80 | (0x3f & (c >> (i * 6)));
84
+ }
85
+ }
86
+
87
+ void gumbo_string_buffer_append_string(
88
+ struct GumboInternalParser* parser, GumboStringPiece* str,
89
+ GumboStringBuffer* output) {
90
+ maybe_resize_string_buffer(parser, str->length, output);
91
+ memcpy(output->data + output->length, str->data, str->length);
92
+ output->length += str->length;
93
+ }
94
+
95
+ char* gumbo_string_buffer_to_string(
96
+ struct GumboInternalParser* parser, GumboStringBuffer* input) {
97
+ char* buffer = gumbo_parser_allocate(parser, input->length + 1);
98
+ memcpy(buffer, input->data, input->length);
99
+ buffer[input->length] = '\0';
100
+ return buffer;
101
+ }
102
+
103
+ void gumbo_string_buffer_destroy(
104
+ struct GumboInternalParser* parser, GumboStringBuffer* buffer) {
105
+ gumbo_parser_deallocate(parser, buffer->data);
106
+ }
@@ -0,0 +1,81 @@
1
+ // Copyright 2010 Google Inc. All Rights Reserved.
2
+ //
3
+ // Licensed under the Apache License, Version 2.0 (the "License");
4
+ // you may not use this file except in compliance with the License.
5
+ // You may obtain a copy of the License at
6
+ //
7
+ // http://www.apache.org/licenses/LICENSE-2.0
8
+ //
9
+ // Unless required by applicable law or agreed to in writing, software
10
+ // distributed under the License is distributed on an "AS IS" BASIS,
11
+ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ // See the License for the specific language governing permissions and
13
+ // limitations under the License.
14
+ //
15
+ // Author: jdtang@google.com (Jonathan Tang)
16
+ //
17
+ #ifndef GUMBO_STRING_BUFFER_H_
18
+ #define GUMBO_STRING_BUFFER_H_
19
+
20
+ #include <stdbool.h>
21
+ #include <stddef.h>
22
+
23
+ #include "gumbo.h"
24
+
25
+ #ifdef __cplusplus
26
+ extern "C" {
27
+ #endif
28
+
29
+ struct GumboInternalParser;
30
+
31
+ // A struct representing a mutable, growable string. This consists of a
32
+ // heap-allocated buffer that may grow (by doubling) as necessary. When
33
+ // converting to a string, this allocates a new buffer that is only as long as
34
+ // it needs to be. Note that the internal buffer here is *not* nul-terminated,
35
+ // so be sure not to use ordinary string manipulation functions on it.
36
+ typedef struct {
37
+ // Start of the buffer; gumbo_string_buffer_init assigns it an allocation even while length == 0.
38
+ char* data;
39
+
40
+ // The number of bytes currently stored in data. May be zero.
41
+ size_t length;
42
+
43
+ // The number of bytes allocated for data; grown by doubling when more room is needed.
44
+ size_t capacity;
45
+ } GumboStringBuffer;
46
+
47
+ // Initializes a new GumboStringBuffer.
48
+ void gumbo_string_buffer_init(
49
+ struct GumboInternalParser* parser, GumboStringBuffer* output);
50
+
51
+ // Ensures that the buffer contains at least a certain amount of space. Most
52
+ // useful with snprintf and the other length-delimited string functions, which
53
+ // may want to write directly into the buffer.
54
+ void gumbo_string_buffer_reserve(
55
+ struct GumboInternalParser* parser, size_t min_capacity,
56
+ GumboStringBuffer* output);
57
+
58
+ // Appends a single Unicode codepoint onto the end of the GumboStringBuffer.
59
+ // This is essentially a UTF-8 encoder, and may add 1-4 bytes depending on the
60
+ // value of the codepoint.
61
+ void gumbo_string_buffer_append_codepoint(
62
+ struct GumboInternalParser* parser, int c, GumboStringBuffer* output);
63
+
64
+ // Appends a string onto the end of the GumboStringBuffer.
65
+ void gumbo_string_buffer_append_string(
66
+ struct GumboInternalParser* parser, GumboStringPiece* str,
67
+ GumboStringBuffer* output);
68
+
69
+ // Converts this string buffer to a nul-terminated char*, allocating a new buffer for it.
70
+ char* gumbo_string_buffer_to_string(
71
+ struct GumboInternalParser* parser, GumboStringBuffer* input);
72
+
73
+ // Deallocates this GumboStringBuffer.
74
+ void gumbo_string_buffer_destroy(
75
+ struct GumboInternalParser* parser, GumboStringBuffer* buffer);
76
+
77
+ #ifdef __cplusplus
78
+ }
79
+ #endif
80
+
81
+ #endif // GUMBO_STRING_BUFFER_H_
@@ -0,0 +1,49 @@
1
+ // Copyright 2010 Google Inc. All Rights Reserved.
2
+ //
3
+ // Licensed under the Apache License, Version 2.0 (the "License");
4
+ // you may not use this file except in compliance with the License.
5
+ // You may obtain a copy of the License at
6
+ //
7
+ // http://www.apache.org/licenses/LICENSE-2.0
8
+ //
9
+ // Unless required by applicable law or agreed to in writing, software
10
+ // distributed under the License is distributed on an "AS IS" BASIS,
11
+ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ // See the License for the specific language governing permissions and
13
+ // limitations under the License.
14
+ //
15
+ // Author: jdtang@google.com (Jonathan Tang)
16
+
17
+ #include "string_piece.h"
18
+
19
+ #include <assert.h>
20
+ #include <stdlib.h>
21
+ #include <string.h>
22
+ #include <strings.h>
23
+
24
+ #include "util.h"
25
+
26
+ struct GumboInternalParser;
27
+
28
+ const GumboStringPiece kGumboEmptyString = { NULL, 0 };
29
+
30
+ bool gumbo_string_equals(
31
+ const GumboStringPiece* str1, const GumboStringPiece* str2) {
32
+ return str1->length == str2->length &&
33
+ !memcmp(str1->data, str2->data, str1->length);
34
+ }
35
+
36
+ bool gumbo_string_equals_ignore_case(
37
+ const GumboStringPiece* str1, const GumboStringPiece* str2) {
38
+ return str1->length == str2->length &&
39
+ !strncasecmp(str1->data, str2->data, str1->length);
40
+ }
41
+
42
+ void gumbo_string_copy(
43
+ struct GumboInternalParser* parser, GumboStringPiece* dest,
44
+ const GumboStringPiece* source) {
45
+ dest->length = source->length;
46
+ char* buffer = gumbo_parser_allocate(parser, source->length);
47
+ memcpy(buffer, source->data, source->length);
48
+ dest->data = buffer;
49
+ }
@@ -0,0 +1,39 @@
1
+ // Copyright 2010 Google Inc. All Rights Reserved.
2
+ //
3
+ // Licensed under the Apache License, Version 2.0 (the "License");
4
+ // you may not use this file except in compliance with the License.
5
+ // You may obtain a copy of the License at
6
+ //
7
+ // http://www.apache.org/licenses/LICENSE-2.0
8
+ //
9
+ // Unless required by applicable law or agreed to in writing, software
10
+ // distributed under the License is distributed on an "AS IS" BASIS,
11
+ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ // See the License for the specific language governing permissions and
13
+ // limitations under the License.
14
+ //
15
+ // Author: jdtang@google.com (Jonathan Tang)
16
+
17
+ #ifndef GUMBO_STRING_PIECE_H_
18
+ #define GUMBO_STRING_PIECE_H_
19
+
20
+ #include "gumbo.h"
21
+
22
+ #ifdef __cplusplus
23
+ extern "C" {
24
+ #endif
25
+
26
+ struct GumboInternalParser;
27
+
28
+ // Performs a deep-copy of a GumboStringPiece, allocating a fresh buffer in the
29
+ // destination and copying over the characters from source. Dest should be
30
+ // empty, with no buffer allocated; otherwise, this leaks it.
31
+ void gumbo_string_copy(
32
+ struct GumboInternalParser* parser, GumboStringPiece* dest,
33
+ const GumboStringPiece* source);
34
+
35
+ #ifdef __cplusplus
36
+ }
37
+ #endif
38
+
39
+ #endif // GUMBO_STRING_PIECE_H_
@@ -0,0 +1,225 @@
1
+ // Copyright 2011 Google Inc. All Rights Reserved.
2
+ //
3
+ // Licensed under the Apache License, Version 2.0 (the "License");
4
+ // you may not use this file except in compliance with the License.
5
+ // You may obtain a copy of the License at
6
+ //
7
+ // http://www.apache.org/licenses/LICENSE-2.0
8
+ //
9
+ // Unless required by applicable law or agreed to in writing, software
10
+ // distributed under the License is distributed on an "AS IS" BASIS,
11
+ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ // See the License for the specific language governing permissions and
13
+ // limitations under the License.
14
+ //
15
+ // Author: jdtang@google.com (Jonathan Tang)
16
+
17
+ #include "gumbo.h"
18
+
19
+ #include <assert.h>
20
+ #include <ctype.h>
21
+ #include <strings.h> // For strcasecmp.
22
+
23
+ // NOTE(jdtang): Keep this in sync with the GumboTag enum in the header.
24
+ // TODO(jdtang): Investigate whether there're efficiency benefits to putting the
25
+ // most common tag names first, or to putting them in alphabetical order and
26
+ // using a binary search.
27
// Normalized tag names, indexed by tag value. Each pair of rows below is
// labelled with the index range it covers; the order of the entries is
// significant and must not be changed.
const char* kGumboTagNames[] = {
  // 0 - 9
  "html", "head", "title", "base", "link",
  "meta", "style", "script", "noscript", "template",
  // 10 - 19
  "body", "article", "section", "nav", "aside",
  "h1", "h2", "h3", "h4", "h5",
  // 20 - 29
  "h6", "hgroup", "header", "footer", "address",
  "p", "hr", "pre", "blockquote", "ol",
  // 30 - 39
  "ul", "li", "dl", "dt", "dd",
  "figure", "figcaption", "main", "div", "a",
  // 40 - 49
  "em", "strong", "small", "s", "cite",
  "q", "dfn", "abbr", "data", "time",
  // 50 - 59
  "code", "var", "samp", "kbd", "sub",
  "sup", "i", "b", "u", "mark",
  // 60 - 69
  "ruby", "rt", "rp", "bdi", "bdo",
  "span", "br", "wbr", "ins", "del",
  // 70 - 79
  "image", "img", "iframe", "embed", "object",
  "param", "video", "audio", "source", "track",
  // 80 - 89
  "canvas", "map", "area", "math", "mi",
  "mo", "mn", "ms", "mtext", "mglyph",
  // 90 - 99
  "malignmark", "annotation-xml", "svg", "foreignobject", "desc",
  "table", "caption", "colgroup", "col", "tbody",
  // 100 - 109
  "thead", "tfoot", "tr", "td", "th",
  "form", "fieldset", "legend", "label", "input",
  // 110 - 119
  "button", "select", "datalist", "optgroup", "option",
  "textarea", "keygen", "output", "progress", "meter",
  // 120 - 129
  "details", "summary", "menu", "menuitem", "applet",
  "acronym", "bgsound", "dir", "frame", "frameset",
  // 130 - 139
  "noframes", "isindex", "listing", "xmp", "nextid",
  "noembed", "plaintext", "rb", "strike", "basefont",
  // 140 - 148
  "big", "blink", "center", "font", "marquee",
  "multicol", "nobr", "spacer", "tt",
  // 149 - 150: placeholder entries with empty names
  "",  // TAG_UNKNOWN
  "",  // TAG_LAST
};
180
+
181
+ const char* gumbo_normalized_tagname(GumboTag tag) {
182
+ assert(tag <= GUMBO_TAG_LAST);
183
+ return kGumboTagNames[tag];
184
+ }
185
+
186
+ // TODO(jdtang): Add test for this.
187
+ void gumbo_tag_from_original_text(GumboStringPiece* text) {
188
+ if (text->data == NULL) {
189
+ return;
190
+ }
191
+
192
+ assert(text->length >= 2);
193
+ assert(text->data[0] == '<');
194
+ assert(text->data[text->length - 1] == '>');
195
+ if (text->data[1] == '/') {
196
+ // End tag.
197
+ assert(text->length >= 3);
198
+ text->data += 2; // Move past </
199
+ text->length -= 3;
200
+ } else {
201
+ // Start tag.
202
+ text->data += 1; // Move past <
203
+ text->length -= 2;
204
+ // strnchr is apparently not a standard C library function, so I loop
205
+ // explicitly looking for whitespace or other illegal tag characters.
206
+ for (const char* c = text->data; c != text->data + text->length; ++c) {
207
+ if (isspace(*c) || *c == '/') {
208
+ text->length = c - text->data;
209
+ break;
210
+ }
211
+ }
212
+ }
213
+ }
214
+
215
+ GumboTag gumbo_tag_enum(const char* tagname) {
216
+ for (int i = 0; i < GUMBO_TAG_LAST; ++i) {
217
+ // TODO(jdtang): strcasecmp is non-portable, so if we want to support
218
+ // non-GCC compilers, we'll need some #ifdef magic. This source already has
219
+ // pretty significant issues with MSVC6 anyway.
220
+ if (strcasecmp(tagname, kGumboTagNames[i]) == 0) {
221
+ return i;
222
+ }
223
+ }
224
+ return GUMBO_TAG_UNKNOWN;
225
+ }