ruby-gumbo 1.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 9f5c5ba70ffb487659a2aadc0e9cf5677beef932
4
+ data.tar.gz: 6c4be899d6d729cf8070978c959ec076b2f2f155
5
+ SHA512:
6
+ metadata.gz: 0005f69394bedd851e92092ddf18169322b9c3748ecbdf478d9f15af1d2a5200cbc8334caa1c60a9cb3fd3d4de74f5d3b2d8d82360eb562db8c636da17ccc8ed
7
+ data.tar.gz: 31444aaed773d14e08862350b5a6af88284e96267d89ba03732eeaf5203f1351cfee18a2e36eb58e681d1f0c0d93aeb6bfe39b139fd5c82544f6f257a5f77f2b
data/LICENSE ADDED
@@ -0,0 +1,13 @@
1
+ Copyright (c) 2013 Nicolas Martyanoff
2
+
3
+ Permission to use, copy, modify, and distribute this software for any
4
+ purpose with or without fee is hereby granted, provided that the above
5
+ copyright notice and this permission notice appear in all copies.
6
+
7
+ THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
8
+ WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
9
+ MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
10
+ ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
11
+ WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
12
+ ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
13
+ OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
@@ -0,0 +1,15 @@
1
+ # ruby-gumbo
2
+
3
+ ## Description
4
+
5
+ `ruby-gumbo` is a Ruby binding for the Gumbo HTML5 parser.
6
+
7
+ ## Installation
8
+
9
+ Create the gem with `rake package`, then install the resulting file from the
10
+ `pkg` directory with `gem install pkg/ruby-gumbo-<version>.gem`.
11
+
12
+ ## Contact
13
+
14
+ If you have found a bug, have an idea or a question, email me at
15
+ <khaelin@gmail.com>.
@@ -0,0 +1,79 @@
1
+
2
+ require 'rake/clean'
3
+
4
+ require 'rdoc/task'
5
+
6
+ require 'rubygems/package_task'
7
+
8
+
9
# Gem identity, shared with the gem specification.
PKG_NAME    = "ruby-gumbo"
PKG_VERSION = "1.0.1"

# Files involved in building the native extension.
EXT_CONF = "ext/extconf.rb"
MAKEFILE = "ext/Makefile"
MODULE   = "ext/gumbo.so"
# Prerequisites of the shared object: every C source plus the Makefile.
SRC      = Dir["ext/*.c"] + [MAKEFILE]

# Build products go on the clean list; files generated by extconf.rb go on
# the clobber list.
CLEAN.include(MODULE, "ext/*.o")
CLOBBER.include("ext/mkmf.log", "ext/extconf.h", MAKEFILE)
19
+
20
# Build

# Generate the Makefile by running extconf.rb from inside the ext directory.
# The block receives the outcome of the command and fails the task with a
# short message, so that nothing depending on the Makefile runs after a
# failed configuration.
file MAKEFILE => EXT_CONF do
  Dir.chdir(File.dirname(EXT_CONF)) do
    sh "ruby #{File.basename(EXT_CONF)}" do |ok, _status|
      fail "extconf.rb failed" unless ok
    end
  end
end

# Compile the shared object with the generated Makefile. A failing make
# fails the task instead of being reported as a successful build.
file MODULE => SRC do
  Dir.chdir(File.dirname(EXT_CONF)) do
    sh "make" do |ok, _status|
      fail "make failed" unless ok
    end
  end
end

desc "Build the native library"
task :build => MODULE
41
+
42
# Documentation
RDOC_FILES = FileList["ext/gumbo.c", "lib/gumbo/extra.rb"]

# One RDoc task per output format: the task name, the directory it writes to
# and the extra command line options it needs. No main page is configured.
[
  [:rdoc, "doc/api", []],
  [:ri,   "doc/ri",  ["--ri-system"]],
].each do |task_name, output_dir, extra_options|
  Rake::RDocTask.new(task_name) do |rdoc|
    rdoc.rdoc_dir = output_dir
    rdoc.options.concat(extra_options)
    rdoc.rdoc_files.include(RDOC_FILES)
  end
end
57
+
58
# Packaging

# Everything shipped in the gem: project files, the Ruby helpers and the
# sources of the native extension.
PKG_FILES = FileList[
  "Rakefile", "LICENSE", "README.mkd",
  "lib/gumbo/*.rb",
  "ext/extconf.rb", "ext/*.[hc]"
]

SPEC = Gem::Specification.new do |gem|
  gem.name    = PKG_NAME
  gem.version = PKG_VERSION
  gem.summary = "Ruby bindings for the gumbo html5 parser"
  gem.author  = "Nicolas Martyanoff"
  gem.email   = "khaelin@gmail.com"
  gem.license = "ISC"

  gem.files      = PKG_FILES
  # The extension is compiled when the gem is installed.
  gem.extensions = "ext/extconf.rb"

  gem.required_ruby_version = ">= 1.9.3"
end

# Build a tarball in addition to the gem file.
Gem::PackageTask.new(SPEC) { |package| package.need_tar = true }
@@ -0,0 +1,15 @@
1
+
2
+ require "mkmf"
3
+
4
# Let the CC environment variable override the compiler used by mkmf.
RbConfig::MAKEFILE_CONFIG["CC"] = ENV["CC"] if ENV["CC"]

extension_name = "gumbo"

# Ask pkg-config for the compiler and linker flags. The package is looked up
# as "gumbo" first (assumption: this is the name of the .pc file installed by
# the gumbo distribution -- confirm against the installed library), then as
# "libgumbo". When pkg-config knows neither, link with -lgumbo and rely on
# the default search paths.
unless pkg_config("gumbo") || pkg_config("libgumbo")
  $libs << " -lgumbo"
end

# ext/gumbo.c declares loop variables inside for statements.
$CFLAGS << " -std=c99"

create_header
create_makefile(extension_name)
@@ -0,0 +1,534 @@
1
+ /*
2
+ * Copyright (c) 2013 Nicolas Martyanoff
3
+ *
4
+ * Permission to use, copy, modify, and distribute this software for any
5
+ * purpose with or without fee is hereby granted, provided that the above
6
+ * copyright notice and this permission notice appear in all copies.
7
+ *
8
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
9
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
10
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
11
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
12
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
13
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
14
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
15
+ */
16
+
17
+ #include <ruby.h>
18
+ #include <ruby/encoding.h>
19
+
20
+ #include <gumbo.h>
21
+
22
+ void Init_gumbo(void);
23
+
24
+ VALUE r_gumbo_parse(VALUE module, VALUE input);
25
+ VALUE r_document_has_doctype(VALUE self);
26
+ VALUE r_element_attribute(VALUE self, VALUE name);
27
+ VALUE r_element_has_attribute(VALUE self, VALUE name);
28
+
29
+
30
+ static VALUE r_bool_new(bool val);
31
+ static VALUE r_sym_new(const char *str);
32
+ static VALUE r_str_new(const char *str, long len);
33
+ static VALUE r_tainted_str_new(const char *str, long len);
34
+ static VALUE r_cstr_new(const char *str);
35
+ static VALUE r_tainted_cstr_new(const char *str);
36
+
37
+ static VALUE r_gumbo_destroy_output(VALUE value);
38
+
39
+ static VALUE r_gumbo_source_position_to_value(GumboSourcePosition position);
40
+ static VALUE r_gumbo_node_type_to_symbol(GumboNodeType type);
41
+ static VALUE r_gumbo_parse_flags_to_symbol_array(GumboParseFlags flags);
42
+ static VALUE r_gumbo_quirks_mode_to_symbol(GumboQuirksModeEnum mode);
43
+ static VALUE r_gumbo_namespace_to_symbol(GumboNamespaceEnum ns);
44
+ static VALUE r_gumbo_tag_to_symbol(GumboTag tag);
45
+ static VALUE r_gumbo_node_to_value(GumboNode *node);
46
+
47
+ static VALUE r_gumbo_attribute_namespace_to_symbol(GumboAttributeNamespaceEnum ns);
48
+ static VALUE r_gumbo_attribute_to_value(GumboAttribute *attribute);
49
+
50
+ static VALUE m_gumbo;
51
+ static VALUE c_node, c_document, c_element;
52
+ static VALUE c_text, c_cdata, c_comment, c_whitespace;
53
+ static VALUE c_attribute;
54
+ static VALUE c_source_position;
55
+
56
+
57
+ void
58
+ Init_gumbo(void) {
59
+ m_gumbo = rb_define_module("Gumbo");
60
+ rb_define_module_function(m_gumbo, "parse", r_gumbo_parse, 1);
61
+
62
+ c_node = rb_define_class_under(m_gumbo, "Node", rb_cObject);
63
+ rb_define_attr(c_node, "type", 1, 0);
64
+ rb_define_attr(c_node, "parent", 1, 0);
65
+ rb_define_attr(c_node, "parse_flags", 1, 0);
66
+
67
+ c_document = rb_define_class_under(m_gumbo, "Document", c_node);
68
+ rb_define_attr(c_document, "name", 1, 0);
69
+ rb_define_attr(c_document, "public_identifier", 1, 0);
70
+ rb_define_attr(c_document, "system_identifier", 1, 0);
71
+ rb_define_attr(c_document, "quirks_mode", 1, 0);
72
+ rb_define_attr(c_document, "children", 1, 0);
73
+ rb_define_method(c_document, "has_doctype?", r_document_has_doctype, 0);
74
+
75
+ c_element = rb_define_class_under(m_gumbo, "Element", c_node);
76
+ rb_define_attr(c_element, "tag", 1, 0);
77
+ rb_define_attr(c_element, "original_tag", 1, 0);
78
+ rb_define_attr(c_element, "tag_namespace", 1, 0);
79
+ rb_define_attr(c_element, "attributes", 1, 0);
80
+ rb_define_attr(c_element, "children", 1, 0);
81
+ rb_define_attr(c_element, "start_pos", 1, 0);
82
+ rb_define_attr(c_element, "end_pos", 1, 0);
83
+ rb_define_method(c_element, "attribute", r_element_attribute, 1);
84
+ rb_define_method(c_element, "has_attribute?", r_element_has_attribute, 1);
85
+
86
+ c_text = rb_define_class_under(m_gumbo, "Text", c_node);
87
+ rb_define_attr(c_text, "text", 1, 0);
88
+ rb_define_attr(c_text, "original_text", 1, 0);
89
+ rb_define_attr(c_text, "start_pos", 1, 0);
90
+
91
+ c_cdata = rb_define_class_under(m_gumbo, "CData", c_node);
92
+ rb_define_attr(c_cdata, "text", 1, 0);
93
+ rb_define_attr(c_cdata, "original_text", 1, 0);
94
+ rb_define_attr(c_cdata, "start_pos", 1, 0);
95
+
96
+ c_comment = rb_define_class_under(m_gumbo, "Comment", c_node);
97
+ rb_define_attr(c_comment, "text", 1, 0);
98
+ rb_define_attr(c_comment, "original_text", 1, 0);
99
+ rb_define_attr(c_comment, "start_pos", 1, 0);
100
+
101
+ c_whitespace = rb_define_class_under(m_gumbo, "Whitespace", c_node);
102
+ rb_define_attr(c_whitespace, "text", 1, 0);
103
+ rb_define_attr(c_whitespace, "original_text", 1, 0);
104
+ rb_define_attr(c_whitespace, "start_pos", 1, 0);
105
+
106
+ c_attribute = rb_define_class_under(m_gumbo, "Attribute", rb_cObject);
107
+ rb_define_attr(c_attribute, "namespace", 1, 0);
108
+ rb_define_attr(c_attribute, "name", 1, 0);
109
+ rb_define_attr(c_attribute, "original_name", 1, 0);
110
+ rb_define_attr(c_attribute, "value", 1, 0);
111
+ rb_define_attr(c_attribute, "original_value", 1, 0);
112
+ rb_define_attr(c_attribute, "name_start", 1, 0);
113
+ rb_define_attr(c_attribute, "name_end", 1, 0);
114
+ rb_define_attr(c_attribute, "value_start", 1, 0);
115
+ rb_define_attr(c_attribute, "value_end", 1, 0);
116
+
117
+ c_source_position = rb_define_class_under(m_gumbo, "SourcePosition",
118
+ rb_cObject);
119
+ rb_define_attr(c_source_position, "line", 1, 0);
120
+ rb_define_attr(c_source_position, "column", 1, 0);
121
+ rb_define_attr(c_source_position, "offset", 1, 0);
122
+ }
123
+
124
+ /*
125
+ * call-seq:
126
+ * Gumbo::parse(input) {|document| ...}
127
+ * Gumbo::parse(input) -> document
128
+ *
129
+ * Parse a HTML document from a string. If the document cannot be created, a
130
+ * runtime error is raised.
131
+ *
132
+ * The input string must be UTF-8 encoded.
133
+ */
134
+ VALUE
135
+ r_gumbo_parse(VALUE module, VALUE input) {
136
+ GumboOutput *output;
137
+ GumboDocument *document;
138
+ VALUE r_document, r_root;
139
+ VALUE result;
140
+
141
+ rb_check_type(input, T_STRING);
142
+
143
+ if (rb_enc_get_index(input) != rb_utf8_encindex())
144
+ rb_raise(rb_eArgError, "input is not UTF-8 encoded");
145
+
146
+ output = gumbo_parse_with_options(&kGumboDefaultOptions,
147
+ StringValueCStr(input),
148
+ RSTRING_LEN(input));
149
+ if (!output)
150
+ rb_raise(rb_eRuntimeError, "cannot parse input");
151
+
152
+ r_document = rb_ensure(r_gumbo_node_to_value, (VALUE)output->document,
153
+ r_gumbo_destroy_output, (VALUE)output);
154
+
155
+ if (rb_block_given_p()) {
156
+ result = rb_yield(r_document);
157
+ } else {
158
+ result = r_document;
159
+ }
160
+
161
+ return result;
162
+ }
163
+
164
+ /*
165
+ * call-seq:
166
+ * document.has_doctype? -> boolean
167
+ *
168
+ * Return +true+ if the document has a doctype or +false+ else.
169
+ */
170
+ VALUE
171
+ r_document_has_doctype(VALUE self) {
172
+ return rb_iv_get(self, "@has_doctype");
173
+ }
174
+
175
+ /*
176
+ * call-seq:
177
+ * element.attribute(name) -> attribute
178
+ *
179
+ * If +element+ has an attribute with the name +name+, return it. If not,
180
+ * return +nil+.
181
+ */
182
+ VALUE
183
+ r_element_attribute(VALUE self, VALUE name) {
184
+ VALUE attributes;
185
+ const char *name_str;
186
+
187
+ name_str = StringValueCStr(name);
188
+
189
+ attributes = rb_iv_get(self, "@attributes");
190
+ for (long i = 0; i < RARRAY_LEN(attributes); i++) {
191
+ VALUE attribute;
192
+ VALUE r_attr_name;
193
+ const char *attr_name;
194
+
195
+ attribute = rb_ary_entry(attributes, i);
196
+ r_attr_name = rb_iv_get(attribute, "@name");
197
+ attr_name = StringValueCStr(r_attr_name);
198
+
199
+ if (strcasecmp(attr_name, name_str) == 0)
200
+ return attribute;
201
+ }
202
+
203
+ return Qnil;
204
+ }
205
+
206
+ /*
207
+ * call-seq:
208
+ * element.has_attribute?(name) -> boolean
209
+ *
210
+ * Return +true+ if +element+ has an attribute with the name +name+ or
211
+ * +false+ else.
212
+ */
213
+ VALUE
214
+ r_element_has_attribute(VALUE self, VALUE name) {
215
+ VALUE attribute;
216
+
217
+ attribute = r_element_attribute(self, name);
218
+ return (attribute == Qnil) ? Qfalse : Qtrue;
219
+ }
220
+
221
+ static VALUE
222
+ r_bool_new(bool val) {
223
+ return val ? Qtrue : Qfalse;
224
+ }
225
+
226
+ static VALUE
227
+ r_sym_new(const char *str) {
228
+ return ID2SYM(rb_intern(str));
229
+ }
230
+
231
+ static VALUE
232
+ r_str_new(const char *str, long len) {
233
+ return str ? rb_enc_str_new(str, len, rb_utf8_encoding()) : Qnil;
234
+ }
235
+
236
+ static VALUE
237
+ r_tainted_str_new(const char *str, long len) {
238
+ VALUE val;
239
+
240
+ if (str) {
241
+ val = rb_enc_str_new(str, len, rb_utf8_encoding());
242
+ OBJ_TAINT(str);
243
+ } else {
244
+ val = Qnil;
245
+ }
246
+
247
+ return val;
248
+ }
249
+
250
+ static VALUE
251
+ r_cstr_new(const char *str) {
252
+ return r_str_new(str, strlen(str));
253
+ }
254
+
255
+ static VALUE
256
+ r_tainted_cstr_new(const char *str) {
257
+ return r_tainted_str_new(str, strlen(str));
258
+ }
259
+
260
+ static VALUE
261
+ r_gumbo_destroy_output(VALUE value) {
262
+ GumboOutput *output;
263
+
264
+ output = (GumboOutput*)value;
265
+ gumbo_destroy_output(&kGumboDefaultOptions, output);
266
+
267
+ return Qnil;
268
+ }
269
+
270
+ static VALUE
271
+ r_gumbo_source_position_to_value(GumboSourcePosition position) {
272
+ VALUE r_position;
273
+
274
+ r_position = rb_class_new_instance(0, NULL, c_source_position);
275
+
276
+ rb_iv_set(r_position, "@line", UINT2NUM(position.line));
277
+ rb_iv_set(r_position, "@column", UINT2NUM(position.column));
278
+ rb_iv_set(r_position, "@offset", UINT2NUM(position.offset));
279
+
280
+ return r_position;
281
+ }
282
+
283
+ static VALUE
284
+ r_gumbo_node_type_to_symbol(GumboNodeType type) {
285
+ switch (type) {
286
+ case GUMBO_NODE_DOCUMENT:
287
+ return r_sym_new("document");
288
+ case GUMBO_NODE_ELEMENT:
289
+ return r_sym_new("element");
290
+ case GUMBO_NODE_TEXT:
291
+ return r_sym_new("text");
292
+ case GUMBO_NODE_CDATA:
293
+ return r_sym_new("cdata");
294
+ case GUMBO_NODE_COMMENT:
295
+ return r_sym_new("comment");
296
+ case GUMBO_NODE_WHITESPACE:
297
+ return r_sym_new("whitespace");
298
+ default:
299
+ rb_raise(rb_eArgError, "unknown node type %d", type);
300
+ }
301
+ }
302
+
303
+ static VALUE
304
+ r_gumbo_parse_flags_to_symbol_array(GumboParseFlags flags) {
305
+ VALUE array;
306
+
307
+ array = rb_ary_new();
308
+
309
+ if (flags & GUMBO_INSERTION_NORMAL)
310
+ rb_ary_push(array, r_sym_new("insertion_normal"));
311
+ if (flags & GUMBO_INSERTION_BY_PARSER)
312
+ rb_ary_push(array, r_sym_new("insertion_by_parser"));
313
+ if (flags & GUMBO_INSERTION_IMPLICIT_END_TAG)
314
+ rb_ary_push(array, r_sym_new("insertion_implicit_end_tag"));
315
+ if (flags & GUMBO_INSERTION_IMPLIED)
316
+ rb_ary_push(array, r_sym_new("insertion_implied"));
317
+ if (flags & GUMBO_INSERTION_CONVERTED_FROM_END_TAG)
318
+ rb_ary_push(array, r_sym_new("insertion_converted_from_end_tag"));
319
+ if (flags & GUMBO_INSERTION_FROM_ISINDEX)
320
+ rb_ary_push(array, r_sym_new("insertion_from_isindex"));
321
+ if (flags & GUMBO_INSERTION_FROM_IMAGE)
322
+ rb_ary_push(array, r_sym_new("insertion_from_image"));
323
+ if (flags & GUMBO_INSERTION_RECONSTRUCTED_FORMATTING_ELEMENT)
324
+ rb_ary_push(array, r_sym_new("insertion_reconstructed_formatting_element"));
325
+ if (flags & GUMBO_INSERTION_ADOPTION_AGENCY_CLONED)
326
+ rb_ary_push(array, r_sym_new("insertion_adoption_agency_cloned"));
327
+ if (flags & GUMBO_INSERTION_ADOPTION_AGENCY_MOVED)
328
+ rb_ary_push(array, r_sym_new("insertion_adoption_agency_moved"));
329
+ if (flags & GUMBO_INSERTION_FOSTER_PARENTED)
330
+ rb_ary_push(array, r_sym_new("insertion_foster_parented"));
331
+
332
+ return array;
333
+ }
334
+
335
+ static VALUE
336
+ r_gumbo_quirks_mode_to_symbol(GumboQuirksModeEnum mode) {
337
+ switch (mode) {
338
+ case GUMBO_DOCTYPE_NO_QUIRKS:
339
+ return r_sym_new("no_quirks");
340
+ case GUMBO_DOCTYPE_QUIRKS:
341
+ return r_sym_new("quirks");
342
+ case GUMBO_DOCTYPE_LIMITED_QUIRKS:
343
+ return r_sym_new("limited_quirks");
344
+ default:
345
+ rb_raise(rb_eArgError, "unknown quirks mode %d", mode);
346
+ }
347
+ }
348
+
349
+ static VALUE
350
+ r_gumbo_namespace_to_symbol(GumboNamespaceEnum ns) {
351
+ switch (ns) {
352
+ case GUMBO_NAMESPACE_HTML:
353
+ return r_sym_new("html");
354
+ case GUMBO_NAMESPACE_SVG:
355
+ return r_sym_new("svg");
356
+ case GUMBO_NAMESPACE_MATHML:
357
+ return r_sym_new("mathml");
358
+ default:
359
+ rb_raise(rb_eArgError, "unknown namespace %d", ns);
360
+ }
361
+ }
362
+
363
+ static VALUE
364
+ r_gumbo_tag_to_symbol(GumboTag tag) {
365
+ const char *name;
366
+
367
+ if (tag < 0 || tag >= GUMBO_TAG_LAST)
368
+ rb_raise(rb_eArgError, "unknown tag %d", tag);
369
+
370
+ if (tag == GUMBO_TAG_UNKNOWN) {
371
+ name = "unknown";
372
+ } else {
373
+ name = gumbo_normalized_tagname(tag);
374
+ }
375
+
376
+ return r_sym_new(name);
377
+ }
378
+
379
+ static VALUE
380
+ r_gumbo_node_to_value(GumboNode *node) {
381
+ VALUE class;
382
+ VALUE r_node;
383
+ GumboVector *children;
384
+
385
+ if (node->type == GUMBO_NODE_DOCUMENT) {
386
+ class = c_document;
387
+ } else if (node->type == GUMBO_NODE_ELEMENT) {
388
+ class = c_element;
389
+ } else if (node->type == GUMBO_NODE_TEXT) {
390
+ class = c_text;
391
+ } else if (node->type == GUMBO_NODE_CDATA) {
392
+ class = c_cdata;
393
+ } else if (node->type == GUMBO_NODE_COMMENT) {
394
+ class = c_comment;
395
+ } else if (node->type == GUMBO_NODE_WHITESPACE) {
396
+ class = c_whitespace;
397
+ } else {
398
+ rb_raise(rb_eArgError, "unknown node type %d", node->type);
399
+ }
400
+
401
+ r_node = rb_class_new_instance(0, NULL, class);
402
+ rb_iv_set(r_node, "@type", r_gumbo_node_type_to_symbol(node->type));
403
+ rb_iv_set(r_node, "@parent", Qnil);
404
+ rb_iv_set(r_node, "@parse_flags",
405
+ r_gumbo_parse_flags_to_symbol_array(node->parse_flags));
406
+
407
+ children = NULL;
408
+
409
+ if (node->type == GUMBO_NODE_DOCUMENT) {
410
+ GumboDocument *document;
411
+
412
+ document = &node->v.document;
413
+ children = &document->children;
414
+
415
+ rb_iv_set(r_node, "@name", r_tainted_cstr_new(document->name));
416
+ rb_iv_set(r_node, "@public_identifier",
417
+ r_tainted_cstr_new(document->public_identifier));
418
+ rb_iv_set(r_node, "@system_identifier",
419
+ r_tainted_cstr_new(document->system_identifier));
420
+ rb_iv_set(r_node, "@quirks_mode",
421
+ r_gumbo_quirks_mode_to_symbol(document->doc_type_quirks_mode));
422
+ rb_iv_set(r_node, "@has_doctype", r_bool_new(document->has_doctype));
423
+ } else if (node->type == GUMBO_NODE_ELEMENT) {
424
+ GumboElement *element;
425
+ VALUE r_attributes;
426
+
427
+ element = &node->v.element;
428
+ children = &element->children;
429
+
430
+ rb_iv_set(r_node, "@tag",
431
+ r_gumbo_tag_to_symbol(element->tag));
432
+ rb_iv_set(r_node, "@original_tag",
433
+ r_tainted_str_new(element->original_tag.data,
434
+ element->original_tag.length));
435
+ rb_iv_set(r_node, "@tag_namespace",
436
+ r_gumbo_namespace_to_symbol(element->tag_namespace));
437
+ rb_iv_set(r_node, "@start_pos",
438
+ r_gumbo_source_position_to_value(element->start_pos));
439
+ rb_iv_set(r_node, "@end_pos",
440
+ r_gumbo_source_position_to_value(element->end_pos));
441
+
442
+ r_attributes = rb_ary_new2(element->attributes.length);
443
+ rb_iv_set(r_node, "@attributes", r_attributes);
444
+
445
+ for (unsigned int i = 0; i < element->attributes.length; i++) {
446
+ GumboAttribute *attribute;
447
+ VALUE r_attribute;
448
+
449
+ attribute = element->attributes.data[i];
450
+ r_attribute = r_gumbo_attribute_to_value(attribute);
451
+
452
+ rb_ary_store(r_attributes, i, r_attribute);
453
+ }
454
+ } else if (node->type == GUMBO_NODE_TEXT
455
+ || node->type == GUMBO_NODE_CDATA
456
+ || node->type == GUMBO_NODE_COMMENT
457
+ || node->type == GUMBO_NODE_WHITESPACE) {
458
+ GumboText *text;
459
+
460
+ text = &node->v.text;
461
+
462
+ rb_iv_set(r_node, "@text", r_tainted_cstr_new(text->text));
463
+ rb_iv_set(r_node, "@original_text",
464
+ r_tainted_str_new(text->original_text.data,
465
+ text->original_text.length));
466
+ rb_iv_set(r_node, "@start_pos",
467
+ r_gumbo_source_position_to_value(text->start_pos));
468
+ }
469
+
470
+ if (children) {
471
+ VALUE r_children;
472
+
473
+ r_children = rb_ary_new2(children->length);
474
+ rb_iv_set(r_node, "@children", r_children);
475
+
476
+ for (unsigned int i = 0; i < children->length; i++) {
477
+ GumboNode *child;
478
+ VALUE r_child;
479
+
480
+ child = children->data[i];
481
+ r_child = r_gumbo_node_to_value(child);
482
+
483
+ rb_iv_set(r_child, "@parent", r_node);
484
+
485
+ rb_ary_store(r_children, i, r_child);
486
+ }
487
+ }
488
+
489
+ return r_node;
490
+ }
491
+
492
+ static VALUE
493
+ r_gumbo_attribute_namespace_to_symbol(GumboAttributeNamespaceEnum ns) {
494
+ switch (ns) {
495
+ case GUMBO_ATTR_NAMESPACE_NONE:
496
+ return Qnil;
497
+ case GUMBO_ATTR_NAMESPACE_XLINK:
498
+ return r_sym_new("xlink");
499
+ case GUMBO_ATTR_NAMESPACE_XML:
500
+ return r_sym_new("xml");
501
+ case GUMBO_ATTR_NAMESPACE_XMLNS:
502
+ return r_sym_new("xmlns");
503
+ default:
504
+ rb_raise(rb_eArgError, "unknown namespace %d", ns);
505
+ }
506
+ }
507
+
508
+ static VALUE
509
+ r_gumbo_attribute_to_value(GumboAttribute *attribute) {
510
+ VALUE r_attribute;
511
+
512
+ r_attribute = rb_class_new_instance(0, NULL, c_attribute);
513
+
514
+ rb_iv_set(r_attribute, "@namespace",
515
+ r_gumbo_attribute_namespace_to_symbol(attribute->attr_namespace));
516
+ rb_iv_set(r_attribute, "@name", r_tainted_cstr_new(attribute->name));
517
+ rb_iv_set(r_attribute, "@original_name",
518
+ r_tainted_str_new(attribute->original_name.data,
519
+ attribute->original_name.length));
520
+ rb_iv_set(r_attribute, "@value", r_tainted_cstr_new(attribute->value));
521
+ rb_iv_set(r_attribute, "@original_value",
522
+ r_tainted_str_new(attribute->original_value.data,
523
+ attribute->original_value.length));
524
+ rb_iv_set(r_attribute, "@name_start",
525
+ r_gumbo_source_position_to_value(attribute->name_start));
526
+ rb_iv_set(r_attribute, "@name_end",
527
+ r_gumbo_source_position_to_value(attribute->name_end));
528
+ rb_iv_set(r_attribute, "@value_start",
529
+ r_gumbo_source_position_to_value(attribute->value_start));
530
+ rb_iv_set(r_attribute, "@value_end",
531
+ r_gumbo_source_position_to_value(attribute->value_end));
532
+
533
+ return r_attribute;
534
+ }
@@ -0,0 +1,44 @@
1
+
2
+ # Copyright (c) 2013 Nicolas Martyanoff
3
+ #
4
+ # Permission to use, copy, modify, and distribute this software for any
5
+ # purpose with or without fee is hereby granted, provided that the above
6
+ # copyright notice and this permission notice appear in all copies.
7
+ #
8
+ # THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
9
+ # WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
10
+ # MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
11
+ # ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
12
+ # WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
13
+ # ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
14
+ # OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
15
+
16
+ require 'gumbo'
17
+
18
module Gumbo
  class Node
    # Recursively dump an indented representation of a HTML tree to +output+.
    #
    # Only document and element nodes are visited; other nodes (text, cdata,
    # comments, whitespace) are not printed. Each element is written on its
    # own line as its upper-cased tag followed by the names of its attributes,
    # indented by two spaces per enclosing element. For an element whose tag
    # is +:unknown+, the original tag text is used instead of the tag symbol.
    #
    # +output+ is any object responding to +write+ and +puts+; it defaults to
    # the standard output.
    def dump_tree(output = $stdout)
      process_node = lambda do |node, indent|
        next unless [:document, :element].include?(node.type)

        # The document node itself prints nothing but an empty indentation.
        output.write(" " * indent)

        if node.type == :element
          tag = node.tag == :unknown ? node.original_tag : node.tag.to_s
          attribute_names = node.attributes.map(&:name)
          output.puts "<#{tag.upcase} #{attribute_names.join(' ')}>"

          # Children of an element are indented one level deeper.
          indent += 2
        end

        node.children.each { |child| process_node.call(child, indent) }
      end

      process_node.call(self, 0)
    end
  end
end
metadata ADDED
@@ -0,0 +1,50 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: ruby-gumbo
3
+ version: !ruby/object:Gem::Version
4
+ version: 1.0.1
5
+ platform: ruby
6
+ authors:
7
+ - Nicolas Martyanoff
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2013-08-18 00:00:00.000000000 Z
12
+ dependencies: []
13
+ description:
14
+ email: khaelin@gmail.com
15
+ executables: []
16
+ extensions:
17
+ - ext/extconf.rb
18
+ extra_rdoc_files: []
19
+ files:
20
+ - Rakefile
21
+ - LICENSE
22
+ - README.mkd
23
+ - lib/gumbo/extra.rb
24
+ - ext/extconf.rb
25
+ - ext/gumbo.c
26
+ homepage:
27
+ licenses:
28
+ - ISC
29
+ metadata: {}
30
+ post_install_message:
31
+ rdoc_options: []
32
+ require_paths:
33
+ - lib
34
+ required_ruby_version: !ruby/object:Gem::Requirement
35
+ requirements:
36
+ - - '>='
37
+ - !ruby/object:Gem::Version
38
+ version: 1.9.3
39
+ required_rubygems_version: !ruby/object:Gem::Requirement
40
+ requirements:
41
+ - - '>='
42
+ - !ruby/object:Gem::Version
43
+ version: '0'
44
+ requirements: []
45
+ rubyforge_project:
46
+ rubygems_version: 2.0.3
47
+ signing_key:
48
+ specification_version: 4
49
+ summary: Ruby bindings for the gumbo html5 parser
50
+ test_files: []