ruby-gumbo 1.0.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/LICENSE +13 -0
- data/README.mkd +15 -0
- data/Rakefile +79 -0
- data/ext/extconf.rb +15 -0
- data/ext/gumbo.c +534 -0
- data/lib/gumbo/extra.rb +44 -0
- metadata +50 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: 9f5c5ba70ffb487659a2aadc0e9cf5677beef932
|
4
|
+
data.tar.gz: 6c4be899d6d729cf8070978c959ec076b2f2f155
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 0005f69394bedd851e92092ddf18169322b9c3748ecbdf478d9f15af1d2a5200cbc8334caa1c60a9cb3fd3d4de74f5d3b2d8d82360eb562db8c636da17ccc8ed
|
7
|
+
data.tar.gz: 31444aaed773d14e08862350b5a6af88284e96267d89ba03732eeaf5203f1351cfee18a2e36eb58e681d1f0c0d93aeb6bfe39b139fd5c82544f6f257a5f77f2b
|
data/LICENSE
ADDED
@@ -0,0 +1,13 @@
|
|
1
|
+
Copyright (c) 2013 Nicolas Martyanoff
|
2
|
+
|
3
|
+
Permission to use, copy, modify, and distribute this software for any
|
4
|
+
purpose with or without fee is hereby granted, provided that the above
|
5
|
+
copyright notice and this permission notice appear in all copies.
|
6
|
+
|
7
|
+
THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
|
8
|
+
WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
|
9
|
+
MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
|
10
|
+
ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
|
11
|
+
WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
|
12
|
+
ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
|
13
|
+
OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
|
data/README.mkd
ADDED
@@ -0,0 +1,15 @@
|
|
1
|
+
# ruby-gumbo
|
2
|
+
|
3
|
+
## Description
|
4
|
+
|
5
|
+
`ruby-gumbo` is a ruby binding for the Gumbo HTML5 parser.
|
6
|
+
|
7
|
+
## Installation
|
8
|
+
|
9
|
+
Create the gem with `rake package`, then install it with `gem install` (the
|
10
|
+
gem file is in the `pkg` directory).
|
11
|
+
|
12
|
+
## Contact
|
13
|
+
|
14
|
+
If you have found a bug, have an idea or a question, email me at
|
15
|
+
<khaelin@gmail.com>.
|
data/Rakefile
ADDED
@@ -0,0 +1,79 @@
|
|
1
|
+
|
2
|
+
require 'rake/clean'
|
3
|
+
|
4
|
+
require 'rdoc/task'
|
5
|
+
|
6
|
+
require 'rubygems/package_task'
|
7
|
+
|
8
|
+
|
9
|
+
# RbConfig supplies the platform-specific suffix of compiled extensions.
require 'rbconfig'

# Gem identity.
PKG_NAME = "ruby-gumbo"
PKG_VERSION = "1.0.1"

# Locations of the extension's configure script and build products.
EXT_CONF = "ext/extconf.rb"
MAKEFILE = "ext/Makefile"
# The shared-object suffix differs between platforms (for example "so" or
# "bundle"), so ask RbConfig instead of hard-coding ".so".
MODULE = "ext/gumbo.#{RbConfig::CONFIG['DLEXT']}"
# The native library depends on every C source file and on the Makefile.
SRC = Dir.glob("ext/*.c") << MAKEFILE

# `rake clean` removes compiled objects; `rake clobber` additionally removes
# the files generated by extconf.rb.
CLEAN.include [MODULE, "ext/*.o"]
CLOBBER.include ["ext/mkmf.log", "ext/extconf.h", MAKEFILE]
|
19
|
+
|
20
|
+
# Build
#
# Both commands are run from inside the extension directory. `sh` and `ruby`
# raise when the command exits with a non-zero status, which aborts the rake
# run, so no manual check of their return value is needed.

# Generate ext/Makefile by running extconf.rb with the interpreter that is
# running rake (rather than whichever `ruby` happens to be first in PATH).
file MAKEFILE => EXT_CONF do
  Dir.chdir(File.dirname(EXT_CONF)) do
    ruby File.basename(EXT_CONF)
  end
end

# Compile the native library whenever a C source or the Makefile changes.
file MODULE => SRC do
  Dir.chdir(File.dirname(EXT_CONF)) do
    sh "make"
  end
end

desc "Build the native library"
task :build => MODULE
|
41
|
+
|
42
|
+
# Documentation
#
# Sources scanned by both documentation tasks.
RDOC_FILES = FileList.new("ext/gumbo.c", "lib/gumbo/extra.rb")

# Default rdoc task: HTML API documentation written to doc/api.
# No main page is configured (a "README.rdoc" main page was left disabled).
Rake::RDocTask.new do |rdoc|
  rdoc.rdoc_dir = "doc/api"
  rdoc.rdoc_files.include(RDOC_FILES)
end

# :ri task: same sources, written to doc/ri with the --ri-system option.
Rake::RDocTask.new(:ri) do |rdoc|
  rdoc.rdoc_dir = "doc/ri"
  rdoc.options << "--ri-system"
  rdoc.rdoc_files.include(RDOC_FILES)
end
|
57
|
+
|
58
|
+
# Packaging
#
# Files shipped in the gem: project metadata, the Ruby helpers and the
# sources of the native extension.
PKG_FILES = FileList[
  "Rakefile",
  "LICENSE",
  "README.mkd",
  "lib/gumbo/*.rb",
  "ext/extconf.rb",
  "ext/*.[hc]"
]

# Gem specification consumed by the package task below.
SPEC = Gem::Specification.new do |gem|
  gem.name    = PKG_NAME
  gem.version = PKG_VERSION
  gem.summary = "Ruby bindings for the gumbo html5 parser"
  gem.author  = "Nicolas Martyanoff"
  gem.email   = "khaelin@gmail.com"
  gem.license = "ISC"

  gem.files      = PKG_FILES
  gem.extensions = "ext/extconf.rb"

  gem.required_ruby_version = ">= 1.9.3"
end

# Defines the packaging tasks; a tarball is requested in addition to the gem.
Gem::PackageTask.new(SPEC) { |package| package.need_tar = true }
|
data/ext/extconf.rb
ADDED
@@ -0,0 +1,15 @@
|
|
1
|
+
|
2
|
+
require "mkmf"
|
3
|
+
|
4
|
+
# Let the CC environment variable override the compiler recorded in RbConfig.
compiler = ENV["CC"]
RbConfig::MAKEFILE_CONFIG["CC"] = compiler if compiler

extension_name = "gumbo"

# Take the flags from pkg-config when it knows libgumbo; otherwise just link
# against -lgumbo.
$libs << " -lgumbo" unless pkg_config("libgumbo")

# The extension sources are compiled as C99.
$CFLAGS << " -std=c99"

create_header
create_makefile(extension_name)
|
data/ext/gumbo.c
ADDED
@@ -0,0 +1,534 @@
|
|
1
|
+
/*
|
2
|
+
* Copyright (c) 2013 Nicolas Martyanoff
|
3
|
+
*
|
4
|
+
* Permission to use, copy, modify, and distribute this software for any
|
5
|
+
* purpose with or without fee is hereby granted, provided that the above
|
6
|
+
* copyright notice and this permission notice appear in all copies.
|
7
|
+
*
|
8
|
+
* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
|
9
|
+
* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
|
10
|
+
* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
|
11
|
+
* ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
|
12
|
+
* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
|
13
|
+
* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
|
14
|
+
* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
|
15
|
+
*/
|
16
|
+
|
17
|
+
#include <ruby.h>
|
18
|
+
#include <ruby/encoding.h>
|
19
|
+
|
20
|
+
#include <gumbo.h>
|
21
|
+
|
22
|
+
void Init_gumbo(void);
|
23
|
+
|
24
|
+
VALUE r_gumbo_parse(VALUE module, VALUE input);
|
25
|
+
VALUE r_document_has_doctype(VALUE self);
|
26
|
+
VALUE r_element_attribute(VALUE self, VALUE name);
|
27
|
+
VALUE r_element_has_attribute(VALUE self, VALUE name);
|
28
|
+
|
29
|
+
|
30
|
+
static VALUE r_bool_new(bool val);
|
31
|
+
static VALUE r_sym_new(const char *str);
|
32
|
+
static VALUE r_str_new(const char *str, long len);
|
33
|
+
static VALUE r_tainted_str_new(const char *str, long len);
|
34
|
+
static VALUE r_cstr_new(const char *str);
|
35
|
+
static VALUE r_tainted_cstr_new(const char *str);
|
36
|
+
|
37
|
+
static VALUE r_gumbo_destroy_output(VALUE value);
|
38
|
+
|
39
|
+
static VALUE r_gumbo_source_position_to_value(GumboSourcePosition position);
|
40
|
+
static VALUE r_gumbo_node_type_to_symbol(GumboNodeType type);
|
41
|
+
static VALUE r_gumbo_parse_flags_to_symbol_array(GumboParseFlags flags);
|
42
|
+
static VALUE r_gumbo_quirks_mode_to_symbol(GumboQuirksModeEnum mode);
|
43
|
+
static VALUE r_gumbo_namespace_to_symbol(GumboNamespaceEnum ns);
|
44
|
+
static VALUE r_gumbo_tag_to_symbol(GumboTag tag);
|
45
|
+
static VALUE r_gumbo_node_to_value(GumboNode *node);
|
46
|
+
|
47
|
+
static VALUE r_gumbo_attribute_namespace_to_symbol(GumboAttributeNamespaceEnum ns);
|
48
|
+
static VALUE r_gumbo_attribute_to_value(GumboAttribute *attribute);
|
49
|
+
|
50
|
+
static VALUE m_gumbo;
|
51
|
+
static VALUE c_node, c_document, c_element;
|
52
|
+
static VALUE c_text, c_cdata, c_comment, c_whitespace;
|
53
|
+
static VALUE c_attribute;
|
54
|
+
static VALUE c_source_position;
|
55
|
+
|
56
|
+
|
57
|
+
void
|
58
|
+
Init_gumbo(void) {
|
59
|
+
m_gumbo = rb_define_module("Gumbo");
|
60
|
+
rb_define_module_function(m_gumbo, "parse", r_gumbo_parse, 1);
|
61
|
+
|
62
|
+
c_node = rb_define_class_under(m_gumbo, "Node", rb_cObject);
|
63
|
+
rb_define_attr(c_node, "type", 1, 0);
|
64
|
+
rb_define_attr(c_node, "parent", 1, 0);
|
65
|
+
rb_define_attr(c_node, "parse_flags", 1, 0);
|
66
|
+
|
67
|
+
c_document = rb_define_class_under(m_gumbo, "Document", c_node);
|
68
|
+
rb_define_attr(c_document, "name", 1, 0);
|
69
|
+
rb_define_attr(c_document, "public_identifier", 1, 0);
|
70
|
+
rb_define_attr(c_document, "system_identifier", 1, 0);
|
71
|
+
rb_define_attr(c_document, "quirks_mode", 1, 0);
|
72
|
+
rb_define_attr(c_document, "children", 1, 0);
|
73
|
+
rb_define_method(c_document, "has_doctype?", r_document_has_doctype, 0);
|
74
|
+
|
75
|
+
c_element = rb_define_class_under(m_gumbo, "Element", c_node);
|
76
|
+
rb_define_attr(c_element, "tag", 1, 0);
|
77
|
+
rb_define_attr(c_element, "original_tag", 1, 0);
|
78
|
+
rb_define_attr(c_element, "tag_namespace", 1, 0);
|
79
|
+
rb_define_attr(c_element, "attributes", 1, 0);
|
80
|
+
rb_define_attr(c_element, "children", 1, 0);
|
81
|
+
rb_define_attr(c_element, "start_pos", 1, 0);
|
82
|
+
rb_define_attr(c_element, "end_pos", 1, 0);
|
83
|
+
rb_define_method(c_element, "attribute", r_element_attribute, 1);
|
84
|
+
rb_define_method(c_element, "has_attribute?", r_element_has_attribute, 1);
|
85
|
+
|
86
|
+
c_text = rb_define_class_under(m_gumbo, "Text", c_node);
|
87
|
+
rb_define_attr(c_text, "text", 1, 0);
|
88
|
+
rb_define_attr(c_text, "original_text", 1, 0);
|
89
|
+
rb_define_attr(c_text, "start_pos", 1, 0);
|
90
|
+
|
91
|
+
c_cdata = rb_define_class_under(m_gumbo, "CData", c_node);
|
92
|
+
rb_define_attr(c_cdata, "text", 1, 0);
|
93
|
+
rb_define_attr(c_cdata, "original_text", 1, 0);
|
94
|
+
rb_define_attr(c_cdata, "start_pos", 1, 0);
|
95
|
+
|
96
|
+
c_comment = rb_define_class_under(m_gumbo, "Comment", c_node);
|
97
|
+
rb_define_attr(c_comment, "text", 1, 0);
|
98
|
+
rb_define_attr(c_comment, "original_text", 1, 0);
|
99
|
+
rb_define_attr(c_comment, "start_pos", 1, 0);
|
100
|
+
|
101
|
+
c_whitespace = rb_define_class_under(m_gumbo, "Whitespace", c_node);
|
102
|
+
rb_define_attr(c_whitespace, "text", 1, 0);
|
103
|
+
rb_define_attr(c_whitespace, "original_text", 1, 0);
|
104
|
+
rb_define_attr(c_whitespace, "start_pos", 1, 0);
|
105
|
+
|
106
|
+
c_attribute = rb_define_class_under(m_gumbo, "Attribute", rb_cObject);
|
107
|
+
rb_define_attr(c_attribute, "namespace", 1, 0);
|
108
|
+
rb_define_attr(c_attribute, "name", 1, 0);
|
109
|
+
rb_define_attr(c_attribute, "original_name", 1, 0);
|
110
|
+
rb_define_attr(c_attribute, "value", 1, 0);
|
111
|
+
rb_define_attr(c_attribute, "original_value", 1, 0);
|
112
|
+
rb_define_attr(c_attribute, "name_start", 1, 0);
|
113
|
+
rb_define_attr(c_attribute, "name_end", 1, 0);
|
114
|
+
rb_define_attr(c_attribute, "value_start", 1, 0);
|
115
|
+
rb_define_attr(c_attribute, "value_end", 1, 0);
|
116
|
+
|
117
|
+
c_source_position = rb_define_class_under(m_gumbo, "SourcePosition",
|
118
|
+
rb_cObject);
|
119
|
+
rb_define_attr(c_source_position, "line", 1, 0);
|
120
|
+
rb_define_attr(c_source_position, "column", 1, 0);
|
121
|
+
rb_define_attr(c_source_position, "offset", 1, 0);
|
122
|
+
}
|
123
|
+
|
124
|
+
/*
|
125
|
+
* call-seq:
|
126
|
+
* Gumbo::parse(input) {|document| ...}
|
127
|
+
* Gumbo::parse(input) -> document
|
128
|
+
*
|
129
|
+
* Parse a HTML document from a string. If the document cannot be created, a
|
130
|
+
* runtime error is raised.
|
131
|
+
*
|
132
|
+
* The input string must be UTF-8 encoded.
|
133
|
+
*/
|
134
|
+
VALUE
|
135
|
+
r_gumbo_parse(VALUE module, VALUE input) {
|
136
|
+
GumboOutput *output;
|
137
|
+
GumboDocument *document;
|
138
|
+
VALUE r_document, r_root;
|
139
|
+
VALUE result;
|
140
|
+
|
141
|
+
rb_check_type(input, T_STRING);
|
142
|
+
|
143
|
+
if (rb_enc_get_index(input) != rb_utf8_encindex())
|
144
|
+
rb_raise(rb_eArgError, "input is not UTF-8 encoded");
|
145
|
+
|
146
|
+
output = gumbo_parse_with_options(&kGumboDefaultOptions,
|
147
|
+
StringValueCStr(input),
|
148
|
+
RSTRING_LEN(input));
|
149
|
+
if (!output)
|
150
|
+
rb_raise(rb_eRuntimeError, "cannot parse input");
|
151
|
+
|
152
|
+
r_document = rb_ensure(r_gumbo_node_to_value, (VALUE)output->document,
|
153
|
+
r_gumbo_destroy_output, (VALUE)output);
|
154
|
+
|
155
|
+
if (rb_block_given_p()) {
|
156
|
+
result = rb_yield(r_document);
|
157
|
+
} else {
|
158
|
+
result = r_document;
|
159
|
+
}
|
160
|
+
|
161
|
+
return result;
|
162
|
+
}
|
163
|
+
|
164
|
+
/*
|
165
|
+
* call-seq:
|
166
|
+
* document.has_doctype? -> boolean
|
167
|
+
*
|
168
|
+
* Return +true+ if the document has a doctype or +false+ else.
|
169
|
+
*/
|
170
|
+
VALUE
|
171
|
+
r_document_has_doctype(VALUE self) {
|
172
|
+
return rb_iv_get(self, "@has_doctype");
|
173
|
+
}
|
174
|
+
|
175
|
+
/*
|
176
|
+
* call-seq:
|
177
|
+
* element.attribute(name) -> attribute
|
178
|
+
*
|
179
|
+
* If +element+ has an attribute with the name +name+, return it. If not,
|
180
|
+
* return +nil+.
|
181
|
+
*/
|
182
|
+
VALUE
|
183
|
+
r_element_attribute(VALUE self, VALUE name) {
|
184
|
+
VALUE attributes;
|
185
|
+
const char *name_str;
|
186
|
+
|
187
|
+
name_str = StringValueCStr(name);
|
188
|
+
|
189
|
+
attributes = rb_iv_get(self, "@attributes");
|
190
|
+
for (long i = 0; i < RARRAY_LEN(attributes); i++) {
|
191
|
+
VALUE attribute;
|
192
|
+
VALUE r_attr_name;
|
193
|
+
const char *attr_name;
|
194
|
+
|
195
|
+
attribute = rb_ary_entry(attributes, i);
|
196
|
+
r_attr_name = rb_iv_get(attribute, "@name");
|
197
|
+
attr_name = StringValueCStr(r_attr_name);
|
198
|
+
|
199
|
+
if (strcasecmp(attr_name, name_str) == 0)
|
200
|
+
return attribute;
|
201
|
+
}
|
202
|
+
|
203
|
+
return Qnil;
|
204
|
+
}
|
205
|
+
|
206
|
+
/*
|
207
|
+
* call-seq:
|
208
|
+
* element.has_attribute?(name) -> boolean
|
209
|
+
*
|
210
|
+
* Return +true+ if +element+ has an attribute with the name +name+ or
|
211
|
+
* +false+ else.
|
212
|
+
*/
|
213
|
+
VALUE
|
214
|
+
r_element_has_attribute(VALUE self, VALUE name) {
|
215
|
+
VALUE attribute;
|
216
|
+
|
217
|
+
attribute = r_element_attribute(self, name);
|
218
|
+
return (attribute == Qnil) ? Qfalse : Qtrue;
|
219
|
+
}
|
220
|
+
|
221
|
+
static VALUE
|
222
|
+
r_bool_new(bool val) {
|
223
|
+
return val ? Qtrue : Qfalse;
|
224
|
+
}
|
225
|
+
|
226
|
+
static VALUE
|
227
|
+
r_sym_new(const char *str) {
|
228
|
+
return ID2SYM(rb_intern(str));
|
229
|
+
}
|
230
|
+
|
231
|
+
static VALUE
|
232
|
+
r_str_new(const char *str, long len) {
|
233
|
+
return str ? rb_enc_str_new(str, len, rb_utf8_encoding()) : Qnil;
|
234
|
+
}
|
235
|
+
|
236
|
+
static VALUE
|
237
|
+
r_tainted_str_new(const char *str, long len) {
|
238
|
+
VALUE val;
|
239
|
+
|
240
|
+
if (str) {
|
241
|
+
val = rb_enc_str_new(str, len, rb_utf8_encoding());
|
242
|
+
OBJ_TAINT(str);
|
243
|
+
} else {
|
244
|
+
val = Qnil;
|
245
|
+
}
|
246
|
+
|
247
|
+
return val;
|
248
|
+
}
|
249
|
+
|
250
|
+
static VALUE
|
251
|
+
r_cstr_new(const char *str) {
|
252
|
+
return r_str_new(str, strlen(str));
|
253
|
+
}
|
254
|
+
|
255
|
+
static VALUE
|
256
|
+
r_tainted_cstr_new(const char *str) {
|
257
|
+
return r_tainted_str_new(str, strlen(str));
|
258
|
+
}
|
259
|
+
|
260
|
+
static VALUE
|
261
|
+
r_gumbo_destroy_output(VALUE value) {
|
262
|
+
GumboOutput *output;
|
263
|
+
|
264
|
+
output = (GumboOutput*)value;
|
265
|
+
gumbo_destroy_output(&kGumboDefaultOptions, output);
|
266
|
+
|
267
|
+
return Qnil;
|
268
|
+
}
|
269
|
+
|
270
|
+
static VALUE
|
271
|
+
r_gumbo_source_position_to_value(GumboSourcePosition position) {
|
272
|
+
VALUE r_position;
|
273
|
+
|
274
|
+
r_position = rb_class_new_instance(0, NULL, c_source_position);
|
275
|
+
|
276
|
+
rb_iv_set(r_position, "@line", UINT2NUM(position.line));
|
277
|
+
rb_iv_set(r_position, "@column", UINT2NUM(position.column));
|
278
|
+
rb_iv_set(r_position, "@offset", UINT2NUM(position.offset));
|
279
|
+
|
280
|
+
return r_position;
|
281
|
+
}
|
282
|
+
|
283
|
+
static VALUE
|
284
|
+
r_gumbo_node_type_to_symbol(GumboNodeType type) {
|
285
|
+
switch (type) {
|
286
|
+
case GUMBO_NODE_DOCUMENT:
|
287
|
+
return r_sym_new("document");
|
288
|
+
case GUMBO_NODE_ELEMENT:
|
289
|
+
return r_sym_new("element");
|
290
|
+
case GUMBO_NODE_TEXT:
|
291
|
+
return r_sym_new("text");
|
292
|
+
case GUMBO_NODE_CDATA:
|
293
|
+
return r_sym_new("cdata");
|
294
|
+
case GUMBO_NODE_COMMENT:
|
295
|
+
return r_sym_new("comment");
|
296
|
+
case GUMBO_NODE_WHITESPACE:
|
297
|
+
return r_sym_new("whitespace");
|
298
|
+
default:
|
299
|
+
rb_raise(rb_eArgError, "unknown node type %d", type);
|
300
|
+
}
|
301
|
+
}
|
302
|
+
|
303
|
+
static VALUE
|
304
|
+
r_gumbo_parse_flags_to_symbol_array(GumboParseFlags flags) {
|
305
|
+
VALUE array;
|
306
|
+
|
307
|
+
array = rb_ary_new();
|
308
|
+
|
309
|
+
if (flags & GUMBO_INSERTION_NORMAL)
|
310
|
+
rb_ary_push(array, r_sym_new("insertion_normal"));
|
311
|
+
if (flags & GUMBO_INSERTION_BY_PARSER)
|
312
|
+
rb_ary_push(array, r_sym_new("insertion_by_parser"));
|
313
|
+
if (flags & GUMBO_INSERTION_IMPLICIT_END_TAG)
|
314
|
+
rb_ary_push(array, r_sym_new("insertion_implicit_end_tag"));
|
315
|
+
if (flags & GUMBO_INSERTION_IMPLIED)
|
316
|
+
rb_ary_push(array, r_sym_new("insertion_implied"));
|
317
|
+
if (flags & GUMBO_INSERTION_CONVERTED_FROM_END_TAG)
|
318
|
+
rb_ary_push(array, r_sym_new("insertion_converted_from_end_tag"));
|
319
|
+
if (flags & GUMBO_INSERTION_FROM_ISINDEX)
|
320
|
+
rb_ary_push(array, r_sym_new("insertion_from_isindex"));
|
321
|
+
if (flags & GUMBO_INSERTION_FROM_IMAGE)
|
322
|
+
rb_ary_push(array, r_sym_new("insertion_from_image"));
|
323
|
+
if (flags & GUMBO_INSERTION_RECONSTRUCTED_FORMATTING_ELEMENT)
|
324
|
+
rb_ary_push(array, r_sym_new("insertion_reconstructed_formatting_element"));
|
325
|
+
if (flags & GUMBO_INSERTION_ADOPTION_AGENCY_CLONED)
|
326
|
+
rb_ary_push(array, r_sym_new("insertion_adoption_agency_cloned"));
|
327
|
+
if (flags & GUMBO_INSERTION_ADOPTION_AGENCY_MOVED)
|
328
|
+
rb_ary_push(array, r_sym_new("insertion_adoption_agency_moved"));
|
329
|
+
if (flags & GUMBO_INSERTION_FOSTER_PARENTED)
|
330
|
+
rb_ary_push(array, r_sym_new("insertion_foster_parented"));
|
331
|
+
|
332
|
+
return array;
|
333
|
+
}
|
334
|
+
|
335
|
+
static VALUE
|
336
|
+
r_gumbo_quirks_mode_to_symbol(GumboQuirksModeEnum mode) {
|
337
|
+
switch (mode) {
|
338
|
+
case GUMBO_DOCTYPE_NO_QUIRKS:
|
339
|
+
return r_sym_new("no_quirks");
|
340
|
+
case GUMBO_DOCTYPE_QUIRKS:
|
341
|
+
return r_sym_new("quirks");
|
342
|
+
case GUMBO_DOCTYPE_LIMITED_QUIRKS:
|
343
|
+
return r_sym_new("limited_quirks");
|
344
|
+
default:
|
345
|
+
rb_raise(rb_eArgError, "unknown quirks mode %d", mode);
|
346
|
+
}
|
347
|
+
}
|
348
|
+
|
349
|
+
static VALUE
|
350
|
+
r_gumbo_namespace_to_symbol(GumboNamespaceEnum ns) {
|
351
|
+
switch (ns) {
|
352
|
+
case GUMBO_NAMESPACE_HTML:
|
353
|
+
return r_sym_new("html");
|
354
|
+
case GUMBO_NAMESPACE_SVG:
|
355
|
+
return r_sym_new("svg");
|
356
|
+
case GUMBO_NAMESPACE_MATHML:
|
357
|
+
return r_sym_new("mathml");
|
358
|
+
default:
|
359
|
+
rb_raise(rb_eArgError, "unknown namespace %d", ns);
|
360
|
+
}
|
361
|
+
}
|
362
|
+
|
363
|
+
static VALUE
|
364
|
+
r_gumbo_tag_to_symbol(GumboTag tag) {
|
365
|
+
const char *name;
|
366
|
+
|
367
|
+
if (tag < 0 || tag >= GUMBO_TAG_LAST)
|
368
|
+
rb_raise(rb_eArgError, "unknown tag %d", tag);
|
369
|
+
|
370
|
+
if (tag == GUMBO_TAG_UNKNOWN) {
|
371
|
+
name = "unknown";
|
372
|
+
} else {
|
373
|
+
name = gumbo_normalized_tagname(tag);
|
374
|
+
}
|
375
|
+
|
376
|
+
return r_sym_new(name);
|
377
|
+
}
|
378
|
+
|
379
|
+
static VALUE
|
380
|
+
r_gumbo_node_to_value(GumboNode *node) {
|
381
|
+
VALUE class;
|
382
|
+
VALUE r_node;
|
383
|
+
GumboVector *children;
|
384
|
+
|
385
|
+
if (node->type == GUMBO_NODE_DOCUMENT) {
|
386
|
+
class = c_document;
|
387
|
+
} else if (node->type == GUMBO_NODE_ELEMENT) {
|
388
|
+
class = c_element;
|
389
|
+
} else if (node->type == GUMBO_NODE_TEXT) {
|
390
|
+
class = c_text;
|
391
|
+
} else if (node->type == GUMBO_NODE_CDATA) {
|
392
|
+
class = c_cdata;
|
393
|
+
} else if (node->type == GUMBO_NODE_COMMENT) {
|
394
|
+
class = c_comment;
|
395
|
+
} else if (node->type == GUMBO_NODE_WHITESPACE) {
|
396
|
+
class = c_whitespace;
|
397
|
+
} else {
|
398
|
+
rb_raise(rb_eArgError, "unknown node type %d", node->type);
|
399
|
+
}
|
400
|
+
|
401
|
+
r_node = rb_class_new_instance(0, NULL, class);
|
402
|
+
rb_iv_set(r_node, "@type", r_gumbo_node_type_to_symbol(node->type));
|
403
|
+
rb_iv_set(r_node, "@parent", Qnil);
|
404
|
+
rb_iv_set(r_node, "@parse_flags",
|
405
|
+
r_gumbo_parse_flags_to_symbol_array(node->parse_flags));
|
406
|
+
|
407
|
+
children = NULL;
|
408
|
+
|
409
|
+
if (node->type == GUMBO_NODE_DOCUMENT) {
|
410
|
+
GumboDocument *document;
|
411
|
+
|
412
|
+
document = &node->v.document;
|
413
|
+
children = &document->children;
|
414
|
+
|
415
|
+
rb_iv_set(r_node, "@name", r_tainted_cstr_new(document->name));
|
416
|
+
rb_iv_set(r_node, "@public_identifier",
|
417
|
+
r_tainted_cstr_new(document->public_identifier));
|
418
|
+
rb_iv_set(r_node, "@system_identifier",
|
419
|
+
r_tainted_cstr_new(document->system_identifier));
|
420
|
+
rb_iv_set(r_node, "@quirks_mode",
|
421
|
+
r_gumbo_quirks_mode_to_symbol(document->doc_type_quirks_mode));
|
422
|
+
rb_iv_set(r_node, "@has_doctype", r_bool_new(document->has_doctype));
|
423
|
+
} else if (node->type == GUMBO_NODE_ELEMENT) {
|
424
|
+
GumboElement *element;
|
425
|
+
VALUE r_attributes;
|
426
|
+
|
427
|
+
element = &node->v.element;
|
428
|
+
children = &element->children;
|
429
|
+
|
430
|
+
rb_iv_set(r_node, "@tag",
|
431
|
+
r_gumbo_tag_to_symbol(element->tag));
|
432
|
+
rb_iv_set(r_node, "@original_tag",
|
433
|
+
r_tainted_str_new(element->original_tag.data,
|
434
|
+
element->original_tag.length));
|
435
|
+
rb_iv_set(r_node, "@tag_namespace",
|
436
|
+
r_gumbo_namespace_to_symbol(element->tag_namespace));
|
437
|
+
rb_iv_set(r_node, "@start_pos",
|
438
|
+
r_gumbo_source_position_to_value(element->start_pos));
|
439
|
+
rb_iv_set(r_node, "@end_pos",
|
440
|
+
r_gumbo_source_position_to_value(element->end_pos));
|
441
|
+
|
442
|
+
r_attributes = rb_ary_new2(element->attributes.length);
|
443
|
+
rb_iv_set(r_node, "@attributes", r_attributes);
|
444
|
+
|
445
|
+
for (unsigned int i = 0; i < element->attributes.length; i++) {
|
446
|
+
GumboAttribute *attribute;
|
447
|
+
VALUE r_attribute;
|
448
|
+
|
449
|
+
attribute = element->attributes.data[i];
|
450
|
+
r_attribute = r_gumbo_attribute_to_value(attribute);
|
451
|
+
|
452
|
+
rb_ary_store(r_attributes, i, r_attribute);
|
453
|
+
}
|
454
|
+
} else if (node->type == GUMBO_NODE_TEXT
|
455
|
+
|| node->type == GUMBO_NODE_CDATA
|
456
|
+
|| node->type == GUMBO_NODE_COMMENT
|
457
|
+
|| node->type == GUMBO_NODE_WHITESPACE) {
|
458
|
+
GumboText *text;
|
459
|
+
|
460
|
+
text = &node->v.text;
|
461
|
+
|
462
|
+
rb_iv_set(r_node, "@text", r_tainted_cstr_new(text->text));
|
463
|
+
rb_iv_set(r_node, "@original_text",
|
464
|
+
r_tainted_str_new(text->original_text.data,
|
465
|
+
text->original_text.length));
|
466
|
+
rb_iv_set(r_node, "@start_pos",
|
467
|
+
r_gumbo_source_position_to_value(text->start_pos));
|
468
|
+
}
|
469
|
+
|
470
|
+
if (children) {
|
471
|
+
VALUE r_children;
|
472
|
+
|
473
|
+
r_children = rb_ary_new2(children->length);
|
474
|
+
rb_iv_set(r_node, "@children", r_children);
|
475
|
+
|
476
|
+
for (unsigned int i = 0; i < children->length; i++) {
|
477
|
+
GumboNode *child;
|
478
|
+
VALUE r_child;
|
479
|
+
|
480
|
+
child = children->data[i];
|
481
|
+
r_child = r_gumbo_node_to_value(child);
|
482
|
+
|
483
|
+
rb_iv_set(r_child, "@parent", r_node);
|
484
|
+
|
485
|
+
rb_ary_store(r_children, i, r_child);
|
486
|
+
}
|
487
|
+
}
|
488
|
+
|
489
|
+
return r_node;
|
490
|
+
}
|
491
|
+
|
492
|
+
static VALUE
|
493
|
+
r_gumbo_attribute_namespace_to_symbol(GumboAttributeNamespaceEnum ns) {
|
494
|
+
switch (ns) {
|
495
|
+
case GUMBO_ATTR_NAMESPACE_NONE:
|
496
|
+
return Qnil;
|
497
|
+
case GUMBO_ATTR_NAMESPACE_XLINK:
|
498
|
+
return r_sym_new("xlink");
|
499
|
+
case GUMBO_ATTR_NAMESPACE_XML:
|
500
|
+
return r_sym_new("xml");
|
501
|
+
case GUMBO_ATTR_NAMESPACE_XMLNS:
|
502
|
+
return r_sym_new("xmlns");
|
503
|
+
default:
|
504
|
+
rb_raise(rb_eArgError, "unknown namespace %d", ns);
|
505
|
+
}
|
506
|
+
}
|
507
|
+
|
508
|
+
static VALUE
|
509
|
+
r_gumbo_attribute_to_value(GumboAttribute *attribute) {
|
510
|
+
VALUE r_attribute;
|
511
|
+
|
512
|
+
r_attribute = rb_class_new_instance(0, NULL, c_attribute);
|
513
|
+
|
514
|
+
rb_iv_set(r_attribute, "@namespace",
|
515
|
+
r_gumbo_attribute_namespace_to_symbol(attribute->attr_namespace));
|
516
|
+
rb_iv_set(r_attribute, "@name", r_tainted_cstr_new(attribute->name));
|
517
|
+
rb_iv_set(r_attribute, "@original_name",
|
518
|
+
r_tainted_str_new(attribute->original_name.data,
|
519
|
+
attribute->original_name.length));
|
520
|
+
rb_iv_set(r_attribute, "@value", r_tainted_cstr_new(attribute->value));
|
521
|
+
rb_iv_set(r_attribute, "@original_value",
|
522
|
+
r_tainted_str_new(attribute->original_value.data,
|
523
|
+
attribute->original_value.length));
|
524
|
+
rb_iv_set(r_attribute, "@name_start",
|
525
|
+
r_gumbo_source_position_to_value(attribute->name_start));
|
526
|
+
rb_iv_set(r_attribute, "@name_end",
|
527
|
+
r_gumbo_source_position_to_value(attribute->name_end));
|
528
|
+
rb_iv_set(r_attribute, "@value_start",
|
529
|
+
r_gumbo_source_position_to_value(attribute->value_start));
|
530
|
+
rb_iv_set(r_attribute, "@value_end",
|
531
|
+
r_gumbo_source_position_to_value(attribute->value_end));
|
532
|
+
|
533
|
+
return r_attribute;
|
534
|
+
}
|
data/lib/gumbo/extra.rb
ADDED
@@ -0,0 +1,44 @@
|
|
1
|
+
|
2
|
+
# Copyright (c) 2013 Nicolas Martyanoff
|
3
|
+
#
|
4
|
+
# Permission to use, copy, modify, and distribute this software for any
|
5
|
+
# purpose with or without fee is hereby granted, provided that the above
|
6
|
+
# copyright notice and this permission notice appear in all copies.
|
7
|
+
#
|
8
|
+
# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
|
9
|
+
# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
|
10
|
+
# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
|
11
|
+
# ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
|
12
|
+
# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
|
13
|
+
# ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
|
14
|
+
# OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
|
15
|
+
|
16
|
+
require 'gumbo'
|
17
|
+
|
18
|
+
module Gumbo
  class Node
    # Write an indented outline of the tree rooted at this node to +output+
    # (any object responding to +write+ and +puts+; defaults to $stdout).
    #
    # Only document and element nodes are visited. Each element is printed on
    # its own line as its upper-cased tag name followed by its attribute
    # names; elements with an unknown tag use their original tag text. The
    # children of an element are indented two more columns than the element.
    def dump_tree(output = $stdout)
      walk = lambda do |current, depth|
        next unless [:document, :element].include?(current.type)

        output.write(" " * depth)

        child_depth = depth
        if current.type == :element
          label = current.tag == :unknown ? current.original_tag : current.tag.to_s
          names = current.attributes.map { |attribute| attribute.name }
          output.puts "<#{label.upcase} #{names.join(' ')}>"

          child_depth += 2
        end

        current.children.each { |child| walk.call(child, child_depth) }
      end

      walk.call(self, 0)
    end
  end
end
|
metadata
ADDED
@@ -0,0 +1,50 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: ruby-gumbo
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 1.0.1
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Nicolas Martyanoff
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2013-08-18 00:00:00.000000000 Z
|
12
|
+
dependencies: []
|
13
|
+
description:
|
14
|
+
email: khaelin@gmail.com
|
15
|
+
executables: []
|
16
|
+
extensions:
|
17
|
+
- ext/extconf.rb
|
18
|
+
extra_rdoc_files: []
|
19
|
+
files:
|
20
|
+
- Rakefile
|
21
|
+
- LICENSE
|
22
|
+
- README.mkd
|
23
|
+
- lib/gumbo/extra.rb
|
24
|
+
- ext/extconf.rb
|
25
|
+
- ext/gumbo.c
|
26
|
+
homepage:
|
27
|
+
licenses:
|
28
|
+
- ISC
|
29
|
+
metadata: {}
|
30
|
+
post_install_message:
|
31
|
+
rdoc_options: []
|
32
|
+
require_paths:
|
33
|
+
- lib
|
34
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
35
|
+
requirements:
|
36
|
+
- - '>='
|
37
|
+
- !ruby/object:Gem::Version
|
38
|
+
version: 1.9.3
|
39
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
40
|
+
requirements:
|
41
|
+
- - '>='
|
42
|
+
- !ruby/object:Gem::Version
|
43
|
+
version: '0'
|
44
|
+
requirements: []
|
45
|
+
rubyforge_project:
|
46
|
+
rubygems_version: 2.0.3
|
47
|
+
signing_key:
|
48
|
+
specification_version: 4
|
49
|
+
summary: Ruby bindings for the gumbo html5 parser
|
50
|
+
test_files: []
|