ruby-gumbo 1.0.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 9f5c5ba70ffb487659a2aadc0e9cf5677beef932
4
+ data.tar.gz: 6c4be899d6d729cf8070978c959ec076b2f2f155
5
+ SHA512:
6
+ metadata.gz: 0005f69394bedd851e92092ddf18169322b9c3748ecbdf478d9f15af1d2a5200cbc8334caa1c60a9cb3fd3d4de74f5d3b2d8d82360eb562db8c636da17ccc8ed
7
+ data.tar.gz: 31444aaed773d14e08862350b5a6af88284e96267d89ba03732eeaf5203f1351cfee18a2e36eb58e681d1f0c0d93aeb6bfe39b139fd5c82544f6f257a5f77f2b
data/LICENSE ADDED
@@ -0,0 +1,13 @@
1
+ Copyright (c) 2013 Nicolas Martyanoff
2
+
3
+ Permission to use, copy, modify, and distribute this software for any
4
+ purpose with or without fee is hereby granted, provided that the above
5
+ copyright notice and this permission notice appear in all copies.
6
+
7
+ THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
8
+ WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
9
+ MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
10
+ ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
11
+ WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
12
+ ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
13
+ OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
@@ -0,0 +1,15 @@
1
+ # ruby-gumbo
2
+
3
+ ## Description
4
+
5
+ `ruby-gumbo` is a Ruby binding for the Gumbo HTML5 parser.
6
+
7
+ ## Installation
8
+
9
+ Create the gem with `rake package`, then install it with `gem install` (the
10
+ gem file is in the `pkg` directory).
11
+
12
+ ## Contact
13
+
14
+ If you have found a bug, have an idea or a question, email me at
15
+ <khaelin@gmail.com>.
@@ -0,0 +1,79 @@
1
+
2
+ require 'rake/clean'
3
+
4
+ require 'rdoc/task'
5
+
6
+ require 'rubygems/package_task'
7
+
8
+
9
# Gem identity, used by the gem specification further down.
PKG_NAME = "ruby-gumbo"
PKG_VERSION = "1.0.1"

# Build inputs and outputs of the native extension.
EXT_CONF = "ext/extconf.rb"
MAKEFILE = "ext/Makefile"
# mkmf names the compiled extension after the platform's dynamic library
# suffix ("so" on Linux, "bundle" on macOS), so the suffix must not be
# hard-coded or the file task below never matches what make produces.
MODULE = "ext/gumbo.#{RbConfig::CONFIG['DLEXT']}"
# The library is rebuilt when any C source or the Makefile changes.
SRC = Dir.glob("ext/*.c") << MAKEFILE

CLEAN.include [MODULE, "ext/*.o"]
CLOBBER.include ["ext/mkmf.log", "ext/extconf.h", MAKEFILE]
19
+
20
# Build

# Generate the extension Makefile by running extconf.rb from inside ext/.
# `ruby` is Rake's helper that runs the interpreter currently executing rake
# (rather than whichever "ruby" comes first in PATH). Like `sh`, it raises
# when the command exits with a non-zero status, which aborts the task, so no
# explicit failure check is needed.
file MAKEFILE => EXT_CONF do
  Dir.chdir(File.dirname(EXT_CONF)) do
    ruby File.basename(EXT_CONF)
  end
end

# Compile the native library whenever a C source or the Makefile changes.
file MODULE => SRC do
  Dir.chdir(File.dirname(EXT_CONF)) do
    sh "make"
  end
end

desc "Build the native library"
task :build => MODULE
41
+
42
# Documentation
RDOC_FILES = FileList["ext/gumbo.c", "lib/gumbo/extra.rb"]

# `rake rdoc`: HTML API documentation written to doc/api.
# RDoc::Task is the class provided by 'rdoc/task'; Rake::RDocTask is only a
# backward-compatibility alias for it.
RDoc::Task.new do |task|
  #task.main = "README.rdoc"
  task.rdoc_dir = "doc/api"
  task.rdoc_files.include(RDOC_FILES)
end

# `rake ri`: same sources, run with the --ri-system option, output in doc/ri.
RDoc::Task.new(:ri) do |task|
  #task.main = "README.rdoc"
  task.rdoc_dir = "doc/ri"
  task.options << "--ri-system"
  task.rdoc_files.include(RDOC_FILES)
end
57
+
58
# Packaging

# Files shipped in the gem: project metadata, the Ruby library and the
# sources of the native extension.
PKG_FILES = FileList[
  "Rakefile", "LICENSE", "README.mkd",
  "lib/gumbo/*.rb",
  "ext/extconf.rb", "ext/*.[hc]"
]

# Gem metadata; name and version come from the constants defined at the top
# of this Rakefile.
SPEC = Gem::Specification.new do |gem|
  gem.name = PKG_NAME
  gem.version = PKG_VERSION
  gem.summary = "Ruby bindings for the gumbo html5 parser"
  gem.author = "Nicolas Martyanoff"
  gem.email = "khaelin@gmail.com"
  gem.license = "ISC"

  gem.files = PKG_FILES
  gem.extensions = "ext/extconf.rb"

  gem.required_ruby_version = ">= 1.9.3"
end

# Define the packaging tasks; a tar archive is requested as well.
Gem::PackageTask.new(SPEC) { |package| package.need_tar = true }
@@ -0,0 +1,15 @@
1
+
2
+ require "mkmf"
3
+
4
# Let the caller choose the C compiler through the CC environment variable.
RbConfig::MAKEFILE_CONFIG["CC"] = ENV["CC"] if ENV["CC"]

extension_name = "gumbo"

# Gumbo installs its pkg-config module as "gumbo"; "libgumbo" is still tried
# for setups that provide it under that name. Without pkg-config data, probe
# the default search paths: have_library adds -lgumbo itself when the probe
# links, and failing here gives a clearer message than a link error in make.
unless pkg_config("gumbo") || pkg_config("libgumbo") ||
       have_library("gumbo", "gumbo_parse")
  abort "libgumbo not found: install the Gumbo parser library and headers"
end

# The extension declares loop variables inside for statements.
$CFLAGS << " -std=c99"

create_header
create_makefile(extension_name)
@@ -0,0 +1,534 @@
1
+ /*
2
+ * Copyright (c) 2013 Nicolas Martyanoff
3
+ *
4
+ * Permission to use, copy, modify, and distribute this software for any
5
+ * purpose with or without fee is hereby granted, provided that the above
6
+ * copyright notice and this permission notice appear in all copies.
7
+ *
8
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
9
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
10
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
11
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
12
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
13
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
14
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
15
+ */
16
+
17
+ #include <ruby.h>
18
+ #include <ruby/encoding.h>
19
+
20
+ #include <gumbo.h>
21
+
22
/* Extension initialization function. */
void Init_gumbo(void);

/* Functions registered as Ruby methods in Init_gumbo. */
VALUE r_gumbo_parse(VALUE module, VALUE input);
VALUE r_document_has_doctype(VALUE self);
VALUE r_element_attribute(VALUE self, VALUE name);
VALUE r_element_has_attribute(VALUE self, VALUE name);


/* Helpers building basic Ruby values from C values. */
static VALUE r_bool_new(bool val);
static VALUE r_sym_new(const char *str);
static VALUE r_str_new(const char *str, long len);
static VALUE r_tainted_str_new(const char *str, long len);
static VALUE r_cstr_new(const char *str);
static VALUE r_tainted_cstr_new(const char *str);

/* Cleanup callback handed to rb_ensure by r_gumbo_parse. */
static VALUE r_gumbo_destroy_output(VALUE value);

/* Converters from Gumbo structures and enumerations to Ruby values. */
static VALUE r_gumbo_source_position_to_value(GumboSourcePosition position);
static VALUE r_gumbo_node_type_to_symbol(GumboNodeType type);
static VALUE r_gumbo_parse_flags_to_symbol_array(GumboParseFlags flags);
static VALUE r_gumbo_quirks_mode_to_symbol(GumboQuirksModeEnum mode);
static VALUE r_gumbo_namespace_to_symbol(GumboNamespaceEnum ns);
static VALUE r_gumbo_tag_to_symbol(GumboTag tag);
static VALUE r_gumbo_node_to_value(GumboNode *node);

static VALUE r_gumbo_attribute_namespace_to_symbol(GumboAttributeNamespaceEnum ns);
static VALUE r_gumbo_attribute_to_value(GumboAttribute *attribute);

/* Ruby module and classes, all assigned in Init_gumbo. */
static VALUE m_gumbo;
static VALUE c_node, c_document, c_element;
static VALUE c_text, c_cdata, c_comment, c_whitespace;
static VALUE c_attribute;
static VALUE c_source_position;
55
+
56
+
57
/*
 * Define the Gumbo module, its parse function and the classes used to
 * represent a parsed tree: Node and its subclasses (Document, Element, Text,
 * CData, Comment, Whitespace), Attribute and SourcePosition. All attributes
 * are declared read-only (reader enabled, writer disabled).
 */
void
Init_gumbo(void) {
    m_gumbo = rb_define_module("Gumbo");
    rb_define_module_function(m_gumbo, "parse", r_gumbo_parse, 1);

    /* Base class of every node of the tree. */
    c_node = rb_define_class_under(m_gumbo, "Node", rb_cObject);
    rb_define_attr(c_node, "type", 1, 0);
    rb_define_attr(c_node, "parent", 1, 0);
    rb_define_attr(c_node, "parse_flags", 1, 0);

    c_document = rb_define_class_under(m_gumbo, "Document", c_node);
    rb_define_attr(c_document, "name", 1, 0);
    rb_define_attr(c_document, "public_identifier", 1, 0);
    rb_define_attr(c_document, "system_identifier", 1, 0);
    rb_define_attr(c_document, "quirks_mode", 1, 0);
    rb_define_attr(c_document, "children", 1, 0);
    rb_define_method(c_document, "has_doctype?", r_document_has_doctype, 0);

    c_element = rb_define_class_under(m_gumbo, "Element", c_node);
    rb_define_attr(c_element, "tag", 1, 0);
    rb_define_attr(c_element, "original_tag", 1, 0);
    rb_define_attr(c_element, "tag_namespace", 1, 0);
    rb_define_attr(c_element, "attributes", 1, 0);
    rb_define_attr(c_element, "children", 1, 0);
    rb_define_attr(c_element, "start_pos", 1, 0);
    rb_define_attr(c_element, "end_pos", 1, 0);
    rb_define_method(c_element, "attribute", r_element_attribute, 1);
    rb_define_method(c_element, "has_attribute?", r_element_has_attribute, 1);

    /* The four text-like node classes expose the same three attributes. */
    c_text = rb_define_class_under(m_gumbo, "Text", c_node);
    rb_define_attr(c_text, "text", 1, 0);
    rb_define_attr(c_text, "original_text", 1, 0);
    rb_define_attr(c_text, "start_pos", 1, 0);

    c_cdata = rb_define_class_under(m_gumbo, "CData", c_node);
    rb_define_attr(c_cdata, "text", 1, 0);
    rb_define_attr(c_cdata, "original_text", 1, 0);
    rb_define_attr(c_cdata, "start_pos", 1, 0);

    c_comment = rb_define_class_under(m_gumbo, "Comment", c_node);
    rb_define_attr(c_comment, "text", 1, 0);
    rb_define_attr(c_comment, "original_text", 1, 0);
    rb_define_attr(c_comment, "start_pos", 1, 0);

    c_whitespace = rb_define_class_under(m_gumbo, "Whitespace", c_node);
    rb_define_attr(c_whitespace, "text", 1, 0);
    rb_define_attr(c_whitespace, "original_text", 1, 0);
    rb_define_attr(c_whitespace, "start_pos", 1, 0);

    /* Attributes and source positions are plain objects, not nodes. */
    c_attribute = rb_define_class_under(m_gumbo, "Attribute", rb_cObject);
    rb_define_attr(c_attribute, "namespace", 1, 0);
    rb_define_attr(c_attribute, "name", 1, 0);
    rb_define_attr(c_attribute, "original_name", 1, 0);
    rb_define_attr(c_attribute, "value", 1, 0);
    rb_define_attr(c_attribute, "original_value", 1, 0);
    rb_define_attr(c_attribute, "name_start", 1, 0);
    rb_define_attr(c_attribute, "name_end", 1, 0);
    rb_define_attr(c_attribute, "value_start", 1, 0);
    rb_define_attr(c_attribute, "value_end", 1, 0);

    c_source_position = rb_define_class_under(m_gumbo, "SourcePosition",
                                              rb_cObject);
    rb_define_attr(c_source_position, "line", 1, 0);
    rb_define_attr(c_source_position, "column", 1, 0);
    rb_define_attr(c_source_position, "offset", 1, 0);
}
123
+
124
+ /*
125
+ * call-seq:
126
+ * Gumbo::parse(input) {|document| ...}
127
+ * Gumbo::parse(input) -> document
128
+ *
129
+ * Parse a HTML document from a string. If the document cannot be created, a
130
+ * runtime error is raised.
131
+ *
132
+ * The input string must be UTF-8 encoded.
133
+ */
134
+ VALUE
135
+ r_gumbo_parse(VALUE module, VALUE input) {
136
+ GumboOutput *output;
137
+ GumboDocument *document;
138
+ VALUE r_document, r_root;
139
+ VALUE result;
140
+
141
+ rb_check_type(input, T_STRING);
142
+
143
+ if (rb_enc_get_index(input) != rb_utf8_encindex())
144
+ rb_raise(rb_eArgError, "input is not UTF-8 encoded");
145
+
146
+ output = gumbo_parse_with_options(&kGumboDefaultOptions,
147
+ StringValueCStr(input),
148
+ RSTRING_LEN(input));
149
+ if (!output)
150
+ rb_raise(rb_eRuntimeError, "cannot parse input");
151
+
152
+ r_document = rb_ensure(r_gumbo_node_to_value, (VALUE)output->document,
153
+ r_gumbo_destroy_output, (VALUE)output);
154
+
155
+ if (rb_block_given_p()) {
156
+ result = rb_yield(r_document);
157
+ } else {
158
+ result = r_document;
159
+ }
160
+
161
+ return result;
162
+ }
163
+
164
+ /*
165
+ * call-seq:
166
+ * document.has_doctype? -> boolean
167
+ *
168
+ * Return +true+ if the document has a doctype or +false+ else.
169
+ */
170
+ VALUE
171
+ r_document_has_doctype(VALUE self) {
172
+ return rb_iv_get(self, "@has_doctype");
173
+ }
174
+
175
+ /*
176
+ * call-seq:
177
+ * element.attribute(name) -> attribute
178
+ *
179
+ * If +element+ has an attribute with the name +name+, return it. If not,
180
+ * return +nil+.
181
+ */
182
+ VALUE
183
+ r_element_attribute(VALUE self, VALUE name) {
184
+ VALUE attributes;
185
+ const char *name_str;
186
+
187
+ name_str = StringValueCStr(name);
188
+
189
+ attributes = rb_iv_get(self, "@attributes");
190
+ for (long i = 0; i < RARRAY_LEN(attributes); i++) {
191
+ VALUE attribute;
192
+ VALUE r_attr_name;
193
+ const char *attr_name;
194
+
195
+ attribute = rb_ary_entry(attributes, i);
196
+ r_attr_name = rb_iv_get(attribute, "@name");
197
+ attr_name = StringValueCStr(r_attr_name);
198
+
199
+ if (strcasecmp(attr_name, name_str) == 0)
200
+ return attribute;
201
+ }
202
+
203
+ return Qnil;
204
+ }
205
+
206
+ /*
207
+ * call-seq:
208
+ * element.has_attribute?(name) -> boolean
209
+ *
210
+ * Return +true+ if +element+ has an attribute with the name +name+ or
211
+ * +false+ else.
212
+ */
213
+ VALUE
214
+ r_element_has_attribute(VALUE self, VALUE name) {
215
+ VALUE attribute;
216
+
217
+ attribute = r_element_attribute(self, name);
218
+ return (attribute == Qnil) ? Qfalse : Qtrue;
219
+ }
220
+
221
+ static VALUE
222
+ r_bool_new(bool val) {
223
+ return val ? Qtrue : Qfalse;
224
+ }
225
+
226
+ static VALUE
227
+ r_sym_new(const char *str) {
228
+ return ID2SYM(rb_intern(str));
229
+ }
230
+
231
+ static VALUE
232
+ r_str_new(const char *str, long len) {
233
+ return str ? rb_enc_str_new(str, len, rb_utf8_encoding()) : Qnil;
234
+ }
235
+
236
+ static VALUE
237
+ r_tainted_str_new(const char *str, long len) {
238
+ VALUE val;
239
+
240
+ if (str) {
241
+ val = rb_enc_str_new(str, len, rb_utf8_encoding());
242
+ OBJ_TAINT(str);
243
+ } else {
244
+ val = Qnil;
245
+ }
246
+
247
+ return val;
248
+ }
249
+
250
+ static VALUE
251
+ r_cstr_new(const char *str) {
252
+ return r_str_new(str, strlen(str));
253
+ }
254
+
255
+ static VALUE
256
+ r_tainted_cstr_new(const char *str) {
257
+ return r_tainted_str_new(str, strlen(str));
258
+ }
259
+
260
+ static VALUE
261
+ r_gumbo_destroy_output(VALUE value) {
262
+ GumboOutput *output;
263
+
264
+ output = (GumboOutput*)value;
265
+ gumbo_destroy_output(&kGumboDefaultOptions, output);
266
+
267
+ return Qnil;
268
+ }
269
+
270
+ static VALUE
271
+ r_gumbo_source_position_to_value(GumboSourcePosition position) {
272
+ VALUE r_position;
273
+
274
+ r_position = rb_class_new_instance(0, NULL, c_source_position);
275
+
276
+ rb_iv_set(r_position, "@line", UINT2NUM(position.line));
277
+ rb_iv_set(r_position, "@column", UINT2NUM(position.column));
278
+ rb_iv_set(r_position, "@offset", UINT2NUM(position.offset));
279
+
280
+ return r_position;
281
+ }
282
+
283
+ static VALUE
284
+ r_gumbo_node_type_to_symbol(GumboNodeType type) {
285
+ switch (type) {
286
+ case GUMBO_NODE_DOCUMENT:
287
+ return r_sym_new("document");
288
+ case GUMBO_NODE_ELEMENT:
289
+ return r_sym_new("element");
290
+ case GUMBO_NODE_TEXT:
291
+ return r_sym_new("text");
292
+ case GUMBO_NODE_CDATA:
293
+ return r_sym_new("cdata");
294
+ case GUMBO_NODE_COMMENT:
295
+ return r_sym_new("comment");
296
+ case GUMBO_NODE_WHITESPACE:
297
+ return r_sym_new("whitespace");
298
+ default:
299
+ rb_raise(rb_eArgError, "unknown node type %d", type);
300
+ }
301
+ }
302
+
303
+ static VALUE
304
+ r_gumbo_parse_flags_to_symbol_array(GumboParseFlags flags) {
305
+ VALUE array;
306
+
307
+ array = rb_ary_new();
308
+
309
+ if (flags & GUMBO_INSERTION_NORMAL)
310
+ rb_ary_push(array, r_sym_new("insertion_normal"));
311
+ if (flags & GUMBO_INSERTION_BY_PARSER)
312
+ rb_ary_push(array, r_sym_new("insertion_by_parser"));
313
+ if (flags & GUMBO_INSERTION_IMPLICIT_END_TAG)
314
+ rb_ary_push(array, r_sym_new("insertion_implicit_end_tag"));
315
+ if (flags & GUMBO_INSERTION_IMPLIED)
316
+ rb_ary_push(array, r_sym_new("insertion_implied"));
317
+ if (flags & GUMBO_INSERTION_CONVERTED_FROM_END_TAG)
318
+ rb_ary_push(array, r_sym_new("insertion_converted_from_end_tag"));
319
+ if (flags & GUMBO_INSERTION_FROM_ISINDEX)
320
+ rb_ary_push(array, r_sym_new("insertion_from_isindex"));
321
+ if (flags & GUMBO_INSERTION_FROM_IMAGE)
322
+ rb_ary_push(array, r_sym_new("insertion_from_image"));
323
+ if (flags & GUMBO_INSERTION_RECONSTRUCTED_FORMATTING_ELEMENT)
324
+ rb_ary_push(array, r_sym_new("insertion_reconstructed_formatting_element"));
325
+ if (flags & GUMBO_INSERTION_ADOPTION_AGENCY_CLONED)
326
+ rb_ary_push(array, r_sym_new("insertion_adoption_agency_cloned"));
327
+ if (flags & GUMBO_INSERTION_ADOPTION_AGENCY_MOVED)
328
+ rb_ary_push(array, r_sym_new("insertion_adoption_agency_moved"));
329
+ if (flags & GUMBO_INSERTION_FOSTER_PARENTED)
330
+ rb_ary_push(array, r_sym_new("insertion_foster_parented"));
331
+
332
+ return array;
333
+ }
334
+
335
+ static VALUE
336
+ r_gumbo_quirks_mode_to_symbol(GumboQuirksModeEnum mode) {
337
+ switch (mode) {
338
+ case GUMBO_DOCTYPE_NO_QUIRKS:
339
+ return r_sym_new("no_quirks");
340
+ case GUMBO_DOCTYPE_QUIRKS:
341
+ return r_sym_new("quirks");
342
+ case GUMBO_DOCTYPE_LIMITED_QUIRKS:
343
+ return r_sym_new("limited_quirks");
344
+ default:
345
+ rb_raise(rb_eArgError, "unknown quirks mode %d", mode);
346
+ }
347
+ }
348
+
349
+ static VALUE
350
+ r_gumbo_namespace_to_symbol(GumboNamespaceEnum ns) {
351
+ switch (ns) {
352
+ case GUMBO_NAMESPACE_HTML:
353
+ return r_sym_new("html");
354
+ case GUMBO_NAMESPACE_SVG:
355
+ return r_sym_new("svg");
356
+ case GUMBO_NAMESPACE_MATHML:
357
+ return r_sym_new("mathml");
358
+ default:
359
+ rb_raise(rb_eArgError, "unknown namespace %d", ns);
360
+ }
361
+ }
362
+
363
+ static VALUE
364
+ r_gumbo_tag_to_symbol(GumboTag tag) {
365
+ const char *name;
366
+
367
+ if (tag < 0 || tag >= GUMBO_TAG_LAST)
368
+ rb_raise(rb_eArgError, "unknown tag %d", tag);
369
+
370
+ if (tag == GUMBO_TAG_UNKNOWN) {
371
+ name = "unknown";
372
+ } else {
373
+ name = gumbo_normalized_tagname(tag);
374
+ }
375
+
376
+ return r_sym_new(name);
377
+ }
378
+
379
+ static VALUE
380
+ r_gumbo_node_to_value(GumboNode *node) {
381
+ VALUE class;
382
+ VALUE r_node;
383
+ GumboVector *children;
384
+
385
+ if (node->type == GUMBO_NODE_DOCUMENT) {
386
+ class = c_document;
387
+ } else if (node->type == GUMBO_NODE_ELEMENT) {
388
+ class = c_element;
389
+ } else if (node->type == GUMBO_NODE_TEXT) {
390
+ class = c_text;
391
+ } else if (node->type == GUMBO_NODE_CDATA) {
392
+ class = c_cdata;
393
+ } else if (node->type == GUMBO_NODE_COMMENT) {
394
+ class = c_comment;
395
+ } else if (node->type == GUMBO_NODE_WHITESPACE) {
396
+ class = c_whitespace;
397
+ } else {
398
+ rb_raise(rb_eArgError, "unknown node type %d", node->type);
399
+ }
400
+
401
+ r_node = rb_class_new_instance(0, NULL, class);
402
+ rb_iv_set(r_node, "@type", r_gumbo_node_type_to_symbol(node->type));
403
+ rb_iv_set(r_node, "@parent", Qnil);
404
+ rb_iv_set(r_node, "@parse_flags",
405
+ r_gumbo_parse_flags_to_symbol_array(node->parse_flags));
406
+
407
+ children = NULL;
408
+
409
+ if (node->type == GUMBO_NODE_DOCUMENT) {
410
+ GumboDocument *document;
411
+
412
+ document = &node->v.document;
413
+ children = &document->children;
414
+
415
+ rb_iv_set(r_node, "@name", r_tainted_cstr_new(document->name));
416
+ rb_iv_set(r_node, "@public_identifier",
417
+ r_tainted_cstr_new(document->public_identifier));
418
+ rb_iv_set(r_node, "@system_identifier",
419
+ r_tainted_cstr_new(document->system_identifier));
420
+ rb_iv_set(r_node, "@quirks_mode",
421
+ r_gumbo_quirks_mode_to_symbol(document->doc_type_quirks_mode));
422
+ rb_iv_set(r_node, "@has_doctype", r_bool_new(document->has_doctype));
423
+ } else if (node->type == GUMBO_NODE_ELEMENT) {
424
+ GumboElement *element;
425
+ VALUE r_attributes;
426
+
427
+ element = &node->v.element;
428
+ children = &element->children;
429
+
430
+ rb_iv_set(r_node, "@tag",
431
+ r_gumbo_tag_to_symbol(element->tag));
432
+ rb_iv_set(r_node, "@original_tag",
433
+ r_tainted_str_new(element->original_tag.data,
434
+ element->original_tag.length));
435
+ rb_iv_set(r_node, "@tag_namespace",
436
+ r_gumbo_namespace_to_symbol(element->tag_namespace));
437
+ rb_iv_set(r_node, "@start_pos",
438
+ r_gumbo_source_position_to_value(element->start_pos));
439
+ rb_iv_set(r_node, "@end_pos",
440
+ r_gumbo_source_position_to_value(element->end_pos));
441
+
442
+ r_attributes = rb_ary_new2(element->attributes.length);
443
+ rb_iv_set(r_node, "@attributes", r_attributes);
444
+
445
+ for (unsigned int i = 0; i < element->attributes.length; i++) {
446
+ GumboAttribute *attribute;
447
+ VALUE r_attribute;
448
+
449
+ attribute = element->attributes.data[i];
450
+ r_attribute = r_gumbo_attribute_to_value(attribute);
451
+
452
+ rb_ary_store(r_attributes, i, r_attribute);
453
+ }
454
+ } else if (node->type == GUMBO_NODE_TEXT
455
+ || node->type == GUMBO_NODE_CDATA
456
+ || node->type == GUMBO_NODE_COMMENT
457
+ || node->type == GUMBO_NODE_WHITESPACE) {
458
+ GumboText *text;
459
+
460
+ text = &node->v.text;
461
+
462
+ rb_iv_set(r_node, "@text", r_tainted_cstr_new(text->text));
463
+ rb_iv_set(r_node, "@original_text",
464
+ r_tainted_str_new(text->original_text.data,
465
+ text->original_text.length));
466
+ rb_iv_set(r_node, "@start_pos",
467
+ r_gumbo_source_position_to_value(text->start_pos));
468
+ }
469
+
470
+ if (children) {
471
+ VALUE r_children;
472
+
473
+ r_children = rb_ary_new2(children->length);
474
+ rb_iv_set(r_node, "@children", r_children);
475
+
476
+ for (unsigned int i = 0; i < children->length; i++) {
477
+ GumboNode *child;
478
+ VALUE r_child;
479
+
480
+ child = children->data[i];
481
+ r_child = r_gumbo_node_to_value(child);
482
+
483
+ rb_iv_set(r_child, "@parent", r_node);
484
+
485
+ rb_ary_store(r_children, i, r_child);
486
+ }
487
+ }
488
+
489
+ return r_node;
490
+ }
491
+
492
+ static VALUE
493
+ r_gumbo_attribute_namespace_to_symbol(GumboAttributeNamespaceEnum ns) {
494
+ switch (ns) {
495
+ case GUMBO_ATTR_NAMESPACE_NONE:
496
+ return Qnil;
497
+ case GUMBO_ATTR_NAMESPACE_XLINK:
498
+ return r_sym_new("xlink");
499
+ case GUMBO_ATTR_NAMESPACE_XML:
500
+ return r_sym_new("xml");
501
+ case GUMBO_ATTR_NAMESPACE_XMLNS:
502
+ return r_sym_new("xmlns");
503
+ default:
504
+ rb_raise(rb_eArgError, "unknown namespace %d", ns);
505
+ }
506
+ }
507
+
508
+ static VALUE
509
+ r_gumbo_attribute_to_value(GumboAttribute *attribute) {
510
+ VALUE r_attribute;
511
+
512
+ r_attribute = rb_class_new_instance(0, NULL, c_attribute);
513
+
514
+ rb_iv_set(r_attribute, "@namespace",
515
+ r_gumbo_attribute_namespace_to_symbol(attribute->attr_namespace));
516
+ rb_iv_set(r_attribute, "@name", r_tainted_cstr_new(attribute->name));
517
+ rb_iv_set(r_attribute, "@original_name",
518
+ r_tainted_str_new(attribute->original_name.data,
519
+ attribute->original_name.length));
520
+ rb_iv_set(r_attribute, "@value", r_tainted_cstr_new(attribute->value));
521
+ rb_iv_set(r_attribute, "@original_value",
522
+ r_tainted_str_new(attribute->original_value.data,
523
+ attribute->original_value.length));
524
+ rb_iv_set(r_attribute, "@name_start",
525
+ r_gumbo_source_position_to_value(attribute->name_start));
526
+ rb_iv_set(r_attribute, "@name_end",
527
+ r_gumbo_source_position_to_value(attribute->name_end));
528
+ rb_iv_set(r_attribute, "@value_start",
529
+ r_gumbo_source_position_to_value(attribute->value_start));
530
+ rb_iv_set(r_attribute, "@value_end",
531
+ r_gumbo_source_position_to_value(attribute->value_end));
532
+
533
+ return r_attribute;
534
+ }
@@ -0,0 +1,44 @@
1
+
2
+ # Copyright (c) 2013 Nicolas Martyanoff
3
+ #
4
+ # Permission to use, copy, modify, and distribute this software for any
5
+ # purpose with or without fee is hereby granted, provided that the above
6
+ # copyright notice and this permission notice appear in all copies.
7
+ #
8
+ # THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
9
+ # WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
10
+ # MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
11
+ # ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
12
+ # WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
13
+ # ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
14
+ # OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
15
+
16
+ require 'gumbo'
17
+
18
module Gumbo
  class Node
    # Recursively dump an indented representation of a HTML tree to +output+.
    #
    # Only document and element nodes are visited: text nodes are not
    # printed. Each element is written on its own line as its upper-case tag
    # name followed by the names of its attributes; the original tag text is
    # used when the tag is unknown. Children of an element are indented by
    # two more spaces than the element itself.
    #
    # +output+ must respond to +write+ and +puts+.
    def dump_tree(output = $stdout)
      process_node = lambda do |node, indent|
        return unless node.type == :document || node.type == :element

        output.write(" " * indent)

        if node.type == :element
          tag = node.tag == :unknown ? node.original_tag : node.tag.to_s
          attributes = node.attributes.map(&:name)
          output.puts "<#{tag.upcase} #{attributes.join(' ')}>"

          indent += 2
        end

        node.children.each { |child| process_node.call(child, indent) }
      end

      process_node.call(self, 0)
    end
  end
end
metadata ADDED
@@ -0,0 +1,50 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: ruby-gumbo
3
+ version: !ruby/object:Gem::Version
4
+ version: 1.0.1
5
+ platform: ruby
6
+ authors:
7
+ - Nicolas Martyanoff
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2013-08-18 00:00:00.000000000 Z
12
+ dependencies: []
13
+ description:
14
+ email: khaelin@gmail.com
15
+ executables: []
16
+ extensions:
17
+ - ext/extconf.rb
18
+ extra_rdoc_files: []
19
+ files:
20
+ - Rakefile
21
+ - LICENSE
22
+ - README.mkd
23
+ - lib/gumbo/extra.rb
24
+ - ext/extconf.rb
25
+ - ext/gumbo.c
26
+ homepage:
27
+ licenses:
28
+ - ISC
29
+ metadata: {}
30
+ post_install_message:
31
+ rdoc_options: []
32
+ require_paths:
33
+ - lib
34
+ required_ruby_version: !ruby/object:Gem::Requirement
35
+ requirements:
36
+ - - '>='
37
+ - !ruby/object:Gem::Version
38
+ version: 1.9.3
39
+ required_rubygems_version: !ruby/object:Gem::Requirement
40
+ requirements:
41
+ - - '>='
42
+ - !ruby/object:Gem::Version
43
+ version: '0'
44
+ requirements: []
45
+ rubyforge_project:
46
+ rubygems_version: 2.0.3
47
+ signing_key:
48
+ specification_version: 4
49
+ summary: Ruby bindings for the gumbo html5 parser
50
+ test_files: []