ruby-gumbo 1.0.2 → 1.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (37) hide show
  1. checksums.yaml +4 -4
  2. data/LICENSE +1 -1
  3. data/README.mkd +28 -31
  4. data/Rakefile +60 -59
  5. data/ext/extconf.rb +17 -9
  6. data/ext/{gumbo.c → ruby_gumbo_ext.c} +29 -28
  7. data/lib/gumbo.rb +19 -0
  8. data/lib/gumbo/element.rb +52 -0
  9. data/lib/gumbo/{extra.rb → node.rb} +19 -22
  10. data/lib/gumbo/text.rb +29 -0
  11. data/vendor/gumbo-parser/src/attribute.c +44 -0
  12. data/vendor/gumbo-parser/src/attribute.h +37 -0
  13. data/vendor/gumbo-parser/src/char_ref.c +2561 -0
  14. data/vendor/gumbo-parser/src/char_ref.h +61 -0
  15. data/vendor/gumbo-parser/src/error.c +258 -0
  16. data/vendor/gumbo-parser/src/error.h +227 -0
  17. data/vendor/gumbo-parser/src/gumbo.h +807 -0
  18. data/vendor/gumbo-parser/src/insertion_mode.h +57 -0
  19. data/vendor/gumbo-parser/src/parser.c +3917 -0
  20. data/vendor/gumbo-parser/src/parser.h +57 -0
  21. data/vendor/gumbo-parser/src/string_buffer.c +106 -0
  22. data/vendor/gumbo-parser/src/string_buffer.h +81 -0
  23. data/vendor/gumbo-parser/src/string_piece.c +49 -0
  24. data/vendor/gumbo-parser/src/string_piece.h +39 -0
  25. data/vendor/gumbo-parser/src/tag.c +225 -0
  26. data/vendor/gumbo-parser/src/token_type.h +40 -0
  27. data/vendor/gumbo-parser/src/tokenizer.c +2980 -0
  28. data/vendor/gumbo-parser/src/tokenizer.h +123 -0
  29. data/vendor/gumbo-parser/src/tokenizer_states.h +103 -0
  30. data/vendor/gumbo-parser/src/utf8.c +275 -0
  31. data/vendor/gumbo-parser/src/utf8.h +127 -0
  32. data/vendor/gumbo-parser/src/util.c +58 -0
  33. data/vendor/gumbo-parser/src/util.h +62 -0
  34. data/vendor/gumbo-parser/src/vector.c +123 -0
  35. data/vendor/gumbo-parser/src/vector.h +69 -0
  36. metadata +40 -10
  37. data/ext/extconf.h +0 -3
@@ -0,0 +1,57 @@
1
+ // Copyright 2010 Google Inc. All Rights Reserved.
2
+ //
3
+ // Licensed under the Apache License, Version 2.0 (the "License");
4
+ // you may not use this file except in compliance with the License.
5
+ // You may obtain a copy of the License at
6
+ //
7
+ // http://www.apache.org/licenses/LICENSE-2.0
8
+ //
9
+ // Unless required by applicable law or agreed to in writing, software
10
+ // distributed under the License is distributed on an "AS IS" BASIS,
11
+ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ // See the License for the specific language governing permissions and
13
+ // limitations under the License.
14
+ //
15
+ // Author: jdtang@google.com (Jonathan Tang)
16
+ //
17
+ // Contains the definition of the top-level GumboParser structure that's
18
+ // threaded through basically every internal function in the library.
19
+
20
+ #ifndef GUMBO_PARSER_H_
21
+ #define GUMBO_PARSER_H_
22
+
23
+ #ifdef __cplusplus
24
+ extern "C" {
25
+ #endif
26
+
27
+ struct GumboInternalParserState;
28
+ struct GumboInternalOutput;
29
+ struct GumboInternalOptions;
30
+ struct GumboInternalTokenizerState;
31
+
32
+ // An overarching struct that's threaded through (nearly) all functions in the
33
+ // library, OOP-style. This gives each function access to the options and
34
+ // output, along with any internal state needed for the parse.
35
+ typedef struct GumboInternalParser {
36
+ // Settings for this parse run. Held through a pointer-to-const.
37
+ const struct GumboInternalOptions* _options;
38
+
39
+ // Output for the parse.
40
+ struct GumboInternalOutput* _output;
41
+
42
+ // The internal tokenizer state, held as a pointer to an incomplete type to
43
+ // avoid a cyclic header dependency. The main parse routine is responsible for
44
+ // initializing this on parse start, and destroying it on parse end.
45
+ // End-users will never see a non-garbage value in this pointer.
46
+ struct GumboInternalTokenizerState* _tokenizer_state;
47
+
48
+ // The internal parser state. Initialized on parse start and destroyed on
49
+ // parse end; end-users will never see a non-garbage value in this pointer.
50
+ struct GumboInternalParserState* _parser_state;
51
+ } GumboParser;
52
+
53
+ #ifdef __cplusplus
54
+ }
55
+ #endif
56
+
57
+ #endif // GUMBO_PARSER_H_
@@ -0,0 +1,106 @@
1
+ // Copyright 2010 Google Inc. All Rights Reserved.
2
+ //
3
+ // Licensed under the Apache License, Version 2.0 (the "License");
4
+ // you may not use this file except in compliance with the License.
5
+ // You may obtain a copy of the License at
6
+ //
7
+ // http://www.apache.org/licenses/LICENSE-2.0
8
+ //
9
+ // Unless required by applicable law or agreed to in writing, software
10
+ // distributed under the License is distributed on an "AS IS" BASIS,
11
+ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ // See the License for the specific language governing permissions and
13
+ // limitations under the License.
14
+ //
15
+ // Author: jdtang@google.com (Jonathan Tang)
16
+
17
+ #include "string_buffer.h"
18
+
19
+ #include <assert.h>
20
+ #include <stdlib.h>
21
+ #include <string.h>
22
+ #include <strings.h>
23
+
24
+ #include "string_piece.h"
25
+ #include "util.h"
26
+
27
+ struct GumboInternalParser;
28
+
29
+ static const size_t kDefaultStringBufferSize = 10;
30
+
31
+ static void maybe_resize_string_buffer(
32
+ struct GumboInternalParser* parser, size_t additional_chars,
33
+ GumboStringBuffer* buffer) {
34
+ size_t new_length = buffer->length + additional_chars;
35
+ size_t new_capacity = buffer->capacity;
36
+ while (new_capacity < new_length) {
37
+ new_capacity *= 2;
38
+ }
39
+ if (new_capacity != buffer->capacity) {
40
+ char* new_data = gumbo_parser_allocate(parser, new_capacity);
41
+ memcpy(new_data, buffer->data, buffer->length);
42
+ gumbo_parser_deallocate(parser, buffer->data);
43
+ buffer->data = new_data;
44
+ buffer->capacity = new_capacity;
45
+ }
46
+ }
47
+
48
+ void gumbo_string_buffer_init(
49
+ struct GumboInternalParser* parser, GumboStringBuffer* output) {
50
+ output->data = gumbo_parser_allocate(parser, kDefaultStringBufferSize);
51
+ output->length = 0;
52
+ output->capacity = kDefaultStringBufferSize;
53
+ }
54
+
55
+ void gumbo_string_buffer_reserve(
56
+ struct GumboInternalParser* parser, size_t min_capacity,
57
+ GumboStringBuffer* output) {
58
+ maybe_resize_string_buffer(parser, min_capacity - output->length, output);
59
+ }
60
+
61
+ void gumbo_string_buffer_append_codepoint(
62
+ struct GumboInternalParser* parser, int c, GumboStringBuffer* output) {
63
+ // num_bytes is actually the number of continuation bytes, 1 less than the
64
+ // total number of bytes. This is done to keep the loop below simple and
65
+ // should probably change if we unroll it.
66
+ int num_bytes, prefix;
67
+ if (c <= 0x7f) {
68
+ num_bytes = 0;
69
+ prefix = 0;
70
+ } else if (c <= 0x7ff) {
71
+ num_bytes = 1;
72
+ prefix = 0xc0;
73
+ } else if (c <= 0xffff) {
74
+ num_bytes = 2;
75
+ prefix = 0xe0;
76
+ } else {
77
+ num_bytes = 3;
78
+ prefix = 0xf0;
79
+ }
80
+ maybe_resize_string_buffer(parser, num_bytes + 1, output);
81
+ output->data[output->length++] = prefix | (c >> (num_bytes * 6));
82
+ for (int i = num_bytes - 1; i >= 0; --i) {
83
+ output->data[output->length++] = 0x80 | (0x3f & (c >> (i * 6)));
84
+ }
85
+ }
86
+
87
+ void gumbo_string_buffer_append_string(
88
+ struct GumboInternalParser* parser, GumboStringPiece* str,
89
+ GumboStringBuffer* output) {
90
+ maybe_resize_string_buffer(parser, str->length, output);
91
+ memcpy(output->data + output->length, str->data, str->length);
92
+ output->length += str->length;
93
+ }
94
+
95
+ char* gumbo_string_buffer_to_string(
96
+ struct GumboInternalParser* parser, GumboStringBuffer* input) {
97
+ char* buffer = gumbo_parser_allocate(parser, input->length + 1);
98
+ memcpy(buffer, input->data, input->length);
99
+ buffer[input->length] = '\0';
100
+ return buffer;
101
+ }
102
+
103
+ void gumbo_string_buffer_destroy(
104
+ struct GumboInternalParser* parser, GumboStringBuffer* buffer) {
105
+ gumbo_parser_deallocate(parser, buffer->data);
106
+ }
@@ -0,0 +1,81 @@
1
+ // Copyright 2010 Google Inc. All Rights Reserved.
2
+ //
3
+ // Licensed under the Apache License, Version 2.0 (the "License");
4
+ // you may not use this file except in compliance with the License.
5
+ // You may obtain a copy of the License at
6
+ //
7
+ // http://www.apache.org/licenses/LICENSE-2.0
8
+ //
9
+ // Unless required by applicable law or agreed to in writing, software
10
+ // distributed under the License is distributed on an "AS IS" BASIS,
11
+ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ // See the License for the specific language governing permissions and
13
+ // limitations under the License.
14
+ //
15
+ // Author: jdtang@google.com (Jonathan Tang)
16
+ //
17
+ #ifndef GUMBO_STRING_BUFFER_H_
18
+ #define GUMBO_STRING_BUFFER_H_
19
+
20
+ #include <stdbool.h>
21
+ #include <stddef.h>
22
+
23
+ #include "gumbo.h"
24
+
25
+ #ifdef __cplusplus
26
+ extern "C" {
27
+ #endif
28
+
29
+ struct GumboInternalParser;
30
+
31
+ // A struct representing a mutable, growable string. This consists of a
32
+ // heap-allocated buffer that may grow (by doubling) as necessary. When
33
+ // converting to a string, this allocates a new buffer that is only as long as
34
+ // it needs to be. Note that the internal buffer here is *not* nul-terminated,
35
+ // so be sure not to use ordinary string manipulation functions on it.
36
+ typedef struct {
37
+ // A pointer to the beginning of the buffer, allocated by the init function.
38
+ char* data;
39
+
40
+ // The number of bytes of data currently in use. May be zero.
41
+ size_t length;
42
+
43
+ // The capacity of the buffer, in bytes.
44
+ size_t capacity;
45
+ } GumboStringBuffer;
46
+
47
+ // Initializes a new GumboStringBuffer.
48
+ void gumbo_string_buffer_init(
49
+ struct GumboInternalParser* parser, GumboStringBuffer* output);
50
+
51
+ // Ensures that the buffer contains at least a certain amount of space. Most
52
+ // useful with snprintf and the other length-delimited string functions, which
53
+ // may want to write directly into the buffer.
54
+ void gumbo_string_buffer_reserve(
55
+ struct GumboInternalParser* parser, size_t min_capacity,
56
+ GumboStringBuffer* output);
57
+
58
+ // Appends a single Unicode codepoint onto the end of the GumboStringBuffer.
59
+ // This is essentially a UTF-8 encoder, and may add 1-4 bytes depending on the
60
+ // value of the codepoint.
61
+ void gumbo_string_buffer_append_codepoint(
62
+ struct GumboInternalParser* parser, int c, GumboStringBuffer* output);
63
+
64
+ // Appends a string onto the end of the GumboStringBuffer.
65
+ void gumbo_string_buffer_append_string(
66
+ struct GumboInternalParser* parser, GumboStringPiece* str,
67
+ GumboStringBuffer* output);
68
+
69
+ // Converts this string buffer to a nul-terminated char*, allocating a new buffer for it.
70
+ char* gumbo_string_buffer_to_string(
71
+ struct GumboInternalParser* parser, GumboStringBuffer* input);
72
+
73
+ // Deallocates this GumboStringBuffer.
74
+ void gumbo_string_buffer_destroy(
75
+ struct GumboInternalParser* parser, GumboStringBuffer* buffer);
76
+
77
+ #ifdef __cplusplus
78
+ }
79
+ #endif
80
+
81
+ #endif // GUMBO_STRING_BUFFER_H_
@@ -0,0 +1,49 @@
1
+ // Copyright 2010 Google Inc. All Rights Reserved.
2
+ //
3
+ // Licensed under the Apache License, Version 2.0 (the "License");
4
+ // you may not use this file except in compliance with the License.
5
+ // You may obtain a copy of the License at
6
+ //
7
+ // http://www.apache.org/licenses/LICENSE-2.0
8
+ //
9
+ // Unless required by applicable law or agreed to in writing, software
10
+ // distributed under the License is distributed on an "AS IS" BASIS,
11
+ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ // See the License for the specific language governing permissions and
13
+ // limitations under the License.
14
+ //
15
+ // Author: jdtang@google.com (Jonathan Tang)
16
+
17
+ #include "string_piece.h"
18
+
19
+ #include <assert.h>
20
+ #include <stdlib.h>
21
+ #include <string.h>
22
+ #include <strings.h>
23
+
24
+ #include "util.h"
25
+
26
+ struct GumboInternalParser;
27
+
28
+ const GumboStringPiece kGumboEmptyString = { NULL, 0 };
29
+
30
+ bool gumbo_string_equals(
31
+ const GumboStringPiece* str1, const GumboStringPiece* str2) {
32
+ return str1->length == str2->length &&
33
+ !memcmp(str1->data, str2->data, str1->length);
34
+ }
35
+
36
+ bool gumbo_string_equals_ignore_case(
37
+ const GumboStringPiece* str1, const GumboStringPiece* str2) {
38
+ return str1->length == str2->length &&
39
+ !strncasecmp(str1->data, str2->data, str1->length);
40
+ }
41
+
42
+ void gumbo_string_copy(
43
+ struct GumboInternalParser* parser, GumboStringPiece* dest,
44
+ const GumboStringPiece* source) {
45
+ dest->length = source->length;
46
+ char* buffer = gumbo_parser_allocate(parser, source->length);
47
+ memcpy(buffer, source->data, source->length);
48
+ dest->data = buffer;
49
+ }
@@ -0,0 +1,39 @@
1
+ // Copyright 2010 Google Inc. All Rights Reserved.
2
+ //
3
+ // Licensed under the Apache License, Version 2.0 (the "License");
4
+ // you may not use this file except in compliance with the License.
5
+ // You may obtain a copy of the License at
6
+ //
7
+ // http://www.apache.org/licenses/LICENSE-2.0
8
+ //
9
+ // Unless required by applicable law or agreed to in writing, software
10
+ // distributed under the License is distributed on an "AS IS" BASIS,
11
+ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ // See the License for the specific language governing permissions and
13
+ // limitations under the License.
14
+ //
15
+ // Author: jdtang@google.com (Jonathan Tang)
16
+
17
+ #ifndef GUMBO_STRING_PIECE_H_
18
+ #define GUMBO_STRING_PIECE_H_
19
+
20
+ #include "gumbo.h"
21
+
22
+ #ifdef __cplusplus
23
+ extern "C" {
24
+ #endif
25
+
26
+ struct GumboInternalParser;
27
+
28
+ // Performs a deep-copy of a GumboStringPiece, allocating a fresh buffer in the
29
+ // destination and copying over the characters from source. Dest should be
30
+ // empty, with no buffer allocated; otherwise, this leaks it.
31
+ void gumbo_string_copy(
32
+ struct GumboInternalParser* parser, GumboStringPiece* dest,
33
+ const GumboStringPiece* source);
34
+
35
+ #ifdef __cplusplus
36
+ }
37
+ #endif
38
+
39
+ #endif // GUMBO_STRING_PIECE_H_
@@ -0,0 +1,225 @@
1
+ // Copyright 2011 Google Inc. All Rights Reserved.
2
+ //
3
+ // Licensed under the Apache License, Version 2.0 (the "License");
4
+ // you may not use this file except in compliance with the License.
5
+ // You may obtain a copy of the License at
6
+ //
7
+ // http://www.apache.org/licenses/LICENSE-2.0
8
+ //
9
+ // Unless required by applicable law or agreed to in writing, software
10
+ // distributed under the License is distributed on an "AS IS" BASIS,
11
+ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ // See the License for the specific language governing permissions and
13
+ // limitations under the License.
14
+ //
15
+ // Author: jdtang@google.com (Jonathan Tang)
16
+
17
+ #include "gumbo.h"
18
+
19
+ #include <assert.h>
20
+ #include <ctype.h>
21
+ #include <strings.h> // For strcasecmp.
22
+
23
+ // NOTE(jdtang): Keep this in sync with the GumboTag enum in the header.
24
+ // TODO(jdtang): Investigate whether there're efficiency benefits to putting the
25
+ // most common tag names first, or to putting them in alphabetical order and
26
+ // using a binary search.
27
// Lower-case tag names, indexed by tag enum value.  The order of the entries
// is significant and must not be changed independently of the enum.  The
// trailing comment on each row gives the index range it covers.
const char* kGumboTagNames[] = {
  "html", "head", "title", "base", "link",                      // 0-4
  "meta", "style", "script", "noscript", "template",            // 5-9
  "body", "article", "section", "nav", "aside",                 // 10-14
  "h1", "h2", "h3", "h4", "h5", "h6",                           // 15-20
  "hgroup", "header", "footer", "address",                      // 21-24
  "p", "hr", "pre", "blockquote", "ol", "ul", "li",             // 25-31
  "dl", "dt", "dd", "figure", "figcaption", "main", "div",      // 32-38
  "a", "em", "strong", "small", "s", "cite", "q",               // 39-45
  "dfn", "abbr", "data", "time", "code", "var", "samp",         // 46-52
  "kbd", "sub", "sup", "i", "b", "u", "mark",                   // 53-59
  "ruby", "rt", "rp", "bdi", "bdo", "span", "br", "wbr",        // 60-67
  "ins", "del",                                                 // 68-69
  "image", "img", "iframe", "embed", "object", "param",         // 70-75
  "video", "audio", "source", "track", "canvas", "map", "area", // 76-82
  "math", "mi", "mo", "mn", "ms", "mtext",                      // 83-88
  "mglyph", "malignmark", "annotation-xml",                     // 89-91
  "svg", "foreignobject", "desc",                               // 92-94
  "table", "caption", "colgroup", "col", "tbody",               // 95-99
  "thead", "tfoot", "tr", "td", "th",                           // 100-104
  "form", "fieldset", "legend", "label", "input",               // 105-109
  "button", "select", "datalist", "optgroup", "option",         // 110-114
  "textarea", "keygen", "output", "progress", "meter",          // 115-119
  "details", "summary", "menu", "menuitem",                     // 120-123
  "applet", "acronym", "bgsound", "dir",                        // 124-127
  "frame", "frameset", "noframes", "isindex",                   // 128-131
  "listing", "xmp", "nextid", "noembed", "plaintext",           // 132-136
  "rb", "strike", "basefont", "big", "blink",                   // 137-141
  "center", "font", "marquee", "multicol",                      // 142-145
  "nobr", "spacer", "tt",                                       // 146-148
  "",  // TAG_UNKNOWN
  "",  // TAG_LAST
};
180
+
181
+ const char* gumbo_normalized_tagname(GumboTag tag) {
182
+ assert(tag <= GUMBO_TAG_LAST);
183
+ return kGumboTagNames[tag];
184
+ }
185
+
186
+ // TODO(jdtang): Add test for this.
187
+ void gumbo_tag_from_original_text(GumboStringPiece* text) {
188
+ if (text->data == NULL) {
189
+ return;
190
+ }
191
+
192
+ assert(text->length >= 2);
193
+ assert(text->data[0] == '<');
194
+ assert(text->data[text->length - 1] == '>');
195
+ if (text->data[1] == '/') {
196
+ // End tag.
197
+ assert(text->length >= 3);
198
+ text->data += 2; // Move past </
199
+ text->length -= 3;
200
+ } else {
201
+ // Start tag.
202
+ text->data += 1; // Move past <
203
+ text->length -= 2;
204
+ // strnchr is apparently not a standard C library function, so I loop
205
+ // explicitly looking for whitespace or other illegal tag characters.
206
+ for (const char* c = text->data; c != text->data + text->length; ++c) {
207
+ if (isspace(*c) || *c == '/') {
208
+ text->length = c - text->data;
209
+ break;
210
+ }
211
+ }
212
+ }
213
+ }
214
+
215
+ GumboTag gumbo_tag_enum(const char* tagname) {
216
+ for (int i = 0; i < GUMBO_TAG_LAST; ++i) {
217
+ // TODO(jdtang): strcasecmp is non-portable, so if we want to support
218
+ // non-GCC compilers, we'll need some #ifdef magic. This source already has
219
+ // pretty significant issues with MSVC6 anyway.
220
+ if (strcasecmp(tagname, kGumboTagNames[i]) == 0) {
221
+ return i;
222
+ }
223
+ }
224
+ return GUMBO_TAG_UNKNOWN;
225
+ }