ruby-gumbo 1.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/LICENSE +13 -0
- data/README.mkd +15 -0
- data/Rakefile +79 -0
- data/ext/extconf.rb +15 -0
- data/ext/gumbo.c +534 -0
- data/lib/gumbo/extra.rb +44 -0
- metadata +50 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: 9f5c5ba70ffb487659a2aadc0e9cf5677beef932
|
4
|
+
data.tar.gz: 6c4be899d6d729cf8070978c959ec076b2f2f155
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 0005f69394bedd851e92092ddf18169322b9c3748ecbdf478d9f15af1d2a5200cbc8334caa1c60a9cb3fd3d4de74f5d3b2d8d82360eb562db8c636da17ccc8ed
|
7
|
+
data.tar.gz: 31444aaed773d14e08862350b5a6af88284e96267d89ba03732eeaf5203f1351cfee18a2e36eb58e681d1f0c0d93aeb6bfe39b139fd5c82544f6f257a5f77f2b
|
data/LICENSE
ADDED
@@ -0,0 +1,13 @@
|
|
1
|
+
Copyright (c) 2013 Nicolas Martyanoff
|
2
|
+
|
3
|
+
Permission to use, copy, modify, and distribute this software for any
|
4
|
+
purpose with or without fee is hereby granted, provided that the above
|
5
|
+
copyright notice and this permission notice appear in all copies.
|
6
|
+
|
7
|
+
THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
|
8
|
+
WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
|
9
|
+
MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
|
10
|
+
ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
|
11
|
+
WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
|
12
|
+
ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
|
13
|
+
OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
|
data/README.mkd
ADDED
@@ -0,0 +1,15 @@
|
|
1
|
+
# ruby-gumbo
|
2
|
+
|
3
|
+
## Description
|
4
|
+
|
5
|
+
`ruby-gumbo` is a Ruby binding for the Gumbo HTML5 parser.
|
6
|
+
|
7
|
+
## Installation
|
8
|
+
|
9
|
+
Create the gem with `rake package`, then install it with `gem install` (the
|
10
|
+
gem file is in the `pkg` directory).
|
11
|
+
|
12
|
+
## Contact
|
13
|
+
|
14
|
+
If you have found a bug, have an idea or a question, email me at
|
15
|
+
<khaelin@gmail.com>.
|
data/Rakefile
ADDED
@@ -0,0 +1,79 @@
|
|
1
|
+
|
2
|
+
require 'rake/clean'
|
3
|
+
|
4
|
+
require 'rdoc/task'
|
5
|
+
|
6
|
+
require 'rubygems/package_task'
|
7
|
+
|
8
|
+
|
9
|
+
PKG_NAME = "ruby-gumbo"
PKG_VERSION = "1.0.1"

EXT_CONF = "ext/extconf.rb"
MAKEFILE = "ext/Makefile"
MODULE = "ext/gumbo.so"
SRC = Dir.glob("ext/*.c") << MAKEFILE

CLEAN.include [MODULE, "ext/*.o"]
CLOBBER.include ["ext/mkmf.log", "ext/extconf.h", MAKEFILE]

# Build

# Generate ext/Makefile by running extconf.rb from inside the ext directory.
# Rake's `ruby` helper runs the script with the interpreter that is running
# rake and raises when the command fails, so no manual status check is needed
# (the old check also printed a bogus failure message during `rake -n`).
file MAKEFILE => EXT_CONF do
  Dir.chdir(File.dirname(EXT_CONF)) do
    ruby File.basename(EXT_CONF)
  end
end

# Compile the native extension. `sh` aborts the task on a non-zero exit.
file MODULE => SRC do
  Dir.chdir(File.dirname(EXT_CONF)) do
    sh "make"
  end
end

desc "Build the native library"
task :build => MODULE
|
41
|
+
|
42
|
+
# Documentation
|
43
|
+
# Sources scanned by both documentation tasks.
RDOC_FILES = FileList["ext/gumbo.c", "lib/gumbo/extra.rb"]

# HTML API documentation (tasks: rdoc, clobber_rdoc, rerdoc).
# RDoc::Task is the class provided by 'rdoc/task'; Rake::RDocTask is only a
# backwards-compatibility alias for it.
RDoc::Task.new do |task|
  #task.main = "README.rdoc"
  task.rdoc_dir = "doc/api"
  task.rdoc_files.include(RDOC_FILES)
end

# ri documentation (tasks: ri, clobber_ri, reri).
RDoc::Task.new(:ri) do |task|
  #task.main = "README.rdoc"
  task.rdoc_dir = "doc/ri"
  task.options << "--ri-system"
  task.rdoc_files.include(RDOC_FILES)
end
|
57
|
+
|
58
|
+
# Packaging
|
59
|
+
# Everything that ships inside the gem.
PKG_FILES = FileList[
  "Rakefile", "LICENSE", "README.mkd",
  "lib/gumbo/*.rb",
  "ext/extconf.rb", "ext/*.[hc]"
]

# Gem description consumed by the package task below.
SPEC = Gem::Specification.new do |s|
  s.name    = PKG_NAME
  s.version = PKG_VERSION
  s.summary = "Ruby bindings for the gumbo html5 parser"
  s.author  = "Nicolas Martyanoff"
  s.email   = "khaelin@gmail.com"
  s.license = "ISC"

  s.files      = PKG_FILES
  s.extensions = "ext/extconf.rb"

  s.required_ruby_version = ">= 1.9.3"
end

# Defines the package/gem tasks; a tarball is produced next to the gem.
Gem::PackageTask.new(SPEC) { |package| package.need_tar = true }
|
data/ext/extconf.rb
ADDED
@@ -0,0 +1,15 @@
|
|
1
|
+
|
2
|
+
require "mkmf"
|
3
|
+
|
4
|
+
# Honor a compiler override taken from the environment.
RbConfig::MAKEFILE_CONFIG["CC"] = ENV["CC"] if ENV["CC"]

extension_name = "gumbo"

# Upstream gumbo-parser installs its pkg-config file as "gumbo.pc"; the
# historical "libgumbo" name is still tried for compatibility. When neither
# is known to pkg-config, fall back to linking with a plain -lgumbo.
unless pkg_config("gumbo") || pkg_config("libgumbo")
  $libs << " -lgumbo"
end

# gumbo.c uses C99 features (declarations inside for loops, mixed declarations).
$CFLAGS << " -std=c99"

create_header
create_makefile(extension_name)
|
data/ext/gumbo.c
ADDED
@@ -0,0 +1,534 @@
|
|
1
|
+
/*
|
2
|
+
* Copyright (c) 2013 Nicolas Martyanoff
|
3
|
+
*
|
4
|
+
* Permission to use, copy, modify, and distribute this software for any
|
5
|
+
* purpose with or without fee is hereby granted, provided that the above
|
6
|
+
* copyright notice and this permission notice appear in all copies.
|
7
|
+
*
|
8
|
+
* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
|
9
|
+
* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
|
10
|
+
* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
|
11
|
+
* ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
|
12
|
+
* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
|
13
|
+
* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
|
14
|
+
* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
|
15
|
+
*/
|
16
|
+
|
17
|
+
#include <ruby.h>
|
18
|
+
#include <ruby/encoding.h>
|
19
|
+
|
20
|
+
#include <gumbo.h>
|
21
|
+
|
22
|
+
void Init_gumbo(void);
|
23
|
+
|
24
|
+
VALUE r_gumbo_parse(VALUE module, VALUE input);
|
25
|
+
VALUE r_document_has_doctype(VALUE self);
|
26
|
+
VALUE r_element_attribute(VALUE self, VALUE name);
|
27
|
+
VALUE r_element_has_attribute(VALUE self, VALUE name);
|
28
|
+
|
29
|
+
|
30
|
+
static VALUE r_bool_new(bool val);
|
31
|
+
static VALUE r_sym_new(const char *str);
|
32
|
+
static VALUE r_str_new(const char *str, long len);
|
33
|
+
static VALUE r_tainted_str_new(const char *str, long len);
|
34
|
+
static VALUE r_cstr_new(const char *str);
|
35
|
+
static VALUE r_tainted_cstr_new(const char *str);
|
36
|
+
|
37
|
+
static VALUE r_gumbo_destroy_output(VALUE value);
|
38
|
+
|
39
|
+
static VALUE r_gumbo_source_position_to_value(GumboSourcePosition position);
|
40
|
+
static VALUE r_gumbo_node_type_to_symbol(GumboNodeType type);
|
41
|
+
static VALUE r_gumbo_parse_flags_to_symbol_array(GumboParseFlags flags);
|
42
|
+
static VALUE r_gumbo_quirks_mode_to_symbol(GumboQuirksModeEnum mode);
|
43
|
+
static VALUE r_gumbo_namespace_to_symbol(GumboNamespaceEnum ns);
|
44
|
+
static VALUE r_gumbo_tag_to_symbol(GumboTag tag);
|
45
|
+
static VALUE r_gumbo_node_to_value(GumboNode *node);
|
46
|
+
|
47
|
+
static VALUE r_gumbo_attribute_namespace_to_symbol(GumboAttributeNamespaceEnum ns);
|
48
|
+
static VALUE r_gumbo_attribute_to_value(GumboAttribute *attribute);
|
49
|
+
|
50
|
+
static VALUE m_gumbo;
|
51
|
+
static VALUE c_node, c_document, c_element;
|
52
|
+
static VALUE c_text, c_cdata, c_comment, c_whitespace;
|
53
|
+
static VALUE c_attribute;
|
54
|
+
static VALUE c_source_position;
|
55
|
+
|
56
|
+
|
57
|
+
/*
 * Extension entry point: defines the Gumbo module, its parse function and
 * the classes (with read-only attributes) used to expose a parsed tree.
 * The rb_define_* calls are kept literal, one per attribute.
 */
void
Init_gumbo(void) {
    m_gumbo = rb_define_module("Gumbo");
    rb_define_module_function(m_gumbo, "parse", r_gumbo_parse, 1);

    /* Gumbo::Node: base class of every tree node. */
    c_node = rb_define_class_under(m_gumbo, "Node", rb_cObject);
    rb_define_attr(c_node, "type", 1, 0);
    rb_define_attr(c_node, "parent", 1, 0);
    rb_define_attr(c_node, "parse_flags", 1, 0);

    /* Gumbo::Document */
    c_document = rb_define_class_under(m_gumbo, "Document", c_node);
    rb_define_attr(c_document, "name", 1, 0);
    rb_define_attr(c_document, "public_identifier", 1, 0);
    rb_define_attr(c_document, "system_identifier", 1, 0);
    rb_define_attr(c_document, "quirks_mode", 1, 0);
    rb_define_attr(c_document, "children", 1, 0);
    rb_define_method(c_document, "has_doctype?", r_document_has_doctype, 0);

    /* Gumbo::Element */
    c_element = rb_define_class_under(m_gumbo, "Element", c_node);
    rb_define_attr(c_element, "tag", 1, 0);
    rb_define_attr(c_element, "original_tag", 1, 0);
    rb_define_attr(c_element, "tag_namespace", 1, 0);
    rb_define_attr(c_element, "attributes", 1, 0);
    rb_define_attr(c_element, "children", 1, 0);
    rb_define_attr(c_element, "start_pos", 1, 0);
    rb_define_attr(c_element, "end_pos", 1, 0);
    rb_define_method(c_element, "attribute", r_element_attribute, 1);
    rb_define_method(c_element, "has_attribute?", r_element_has_attribute, 1);

    /* Text-like nodes: Text, CData, Comment and Whitespace share one shape. */
    c_text = rb_define_class_under(m_gumbo, "Text", c_node);
    rb_define_attr(c_text, "text", 1, 0);
    rb_define_attr(c_text, "original_text", 1, 0);
    rb_define_attr(c_text, "start_pos", 1, 0);

    c_cdata = rb_define_class_under(m_gumbo, "CData", c_node);
    rb_define_attr(c_cdata, "text", 1, 0);
    rb_define_attr(c_cdata, "original_text", 1, 0);
    rb_define_attr(c_cdata, "start_pos", 1, 0);

    c_comment = rb_define_class_under(m_gumbo, "Comment", c_node);
    rb_define_attr(c_comment, "text", 1, 0);
    rb_define_attr(c_comment, "original_text", 1, 0);
    rb_define_attr(c_comment, "start_pos", 1, 0);

    c_whitespace = rb_define_class_under(m_gumbo, "Whitespace", c_node);
    rb_define_attr(c_whitespace, "text", 1, 0);
    rb_define_attr(c_whitespace, "original_text", 1, 0);
    rb_define_attr(c_whitespace, "start_pos", 1, 0);

    /* Gumbo::Attribute: one attribute of an element. */
    c_attribute = rb_define_class_under(m_gumbo, "Attribute", rb_cObject);
    rb_define_attr(c_attribute, "namespace", 1, 0);
    rb_define_attr(c_attribute, "name", 1, 0);
    rb_define_attr(c_attribute, "original_name", 1, 0);
    rb_define_attr(c_attribute, "value", 1, 0);
    rb_define_attr(c_attribute, "original_value", 1, 0);
    rb_define_attr(c_attribute, "name_start", 1, 0);
    rb_define_attr(c_attribute, "name_end", 1, 0);
    rb_define_attr(c_attribute, "value_start", 1, 0);
    rb_define_attr(c_attribute, "value_end", 1, 0);

    /* Gumbo::SourcePosition: line/column/offset triple. */
    c_source_position = rb_define_class_under(m_gumbo, "SourcePosition",
                                              rb_cObject);
    rb_define_attr(c_source_position, "line", 1, 0);
    rb_define_attr(c_source_position, "column", 1, 0);
    rb_define_attr(c_source_position, "offset", 1, 0);
}
|
123
|
+
|
124
|
+
/*
|
125
|
+
* call-seq:
|
126
|
+
* Gumbo::parse(input) {|document| ...}
|
127
|
+
* Gumbo::parse(input) -> document
|
128
|
+
*
|
129
|
+
* Parse a HTML document from a string. If the document cannot be created, a
|
130
|
+
* runtime error is raised.
|
131
|
+
*
|
132
|
+
* The input string must be UTF-8 encoded.
|
133
|
+
*/
|
134
|
+
VALUE
|
135
|
+
r_gumbo_parse(VALUE module, VALUE input) {
|
136
|
+
GumboOutput *output;
|
137
|
+
GumboDocument *document;
|
138
|
+
VALUE r_document, r_root;
|
139
|
+
VALUE result;
|
140
|
+
|
141
|
+
rb_check_type(input, T_STRING);
|
142
|
+
|
143
|
+
if (rb_enc_get_index(input) != rb_utf8_encindex())
|
144
|
+
rb_raise(rb_eArgError, "input is not UTF-8 encoded");
|
145
|
+
|
146
|
+
output = gumbo_parse_with_options(&kGumboDefaultOptions,
|
147
|
+
StringValueCStr(input),
|
148
|
+
RSTRING_LEN(input));
|
149
|
+
if (!output)
|
150
|
+
rb_raise(rb_eRuntimeError, "cannot parse input");
|
151
|
+
|
152
|
+
r_document = rb_ensure(r_gumbo_node_to_value, (VALUE)output->document,
|
153
|
+
r_gumbo_destroy_output, (VALUE)output);
|
154
|
+
|
155
|
+
if (rb_block_given_p()) {
|
156
|
+
result = rb_yield(r_document);
|
157
|
+
} else {
|
158
|
+
result = r_document;
|
159
|
+
}
|
160
|
+
|
161
|
+
return result;
|
162
|
+
}
|
163
|
+
|
164
|
+
/*
|
165
|
+
* call-seq:
|
166
|
+
* document.has_doctype? -> boolean
|
167
|
+
*
|
168
|
+
* Return +true+ if the document has a doctype or +false+ else.
|
169
|
+
*/
|
170
|
+
VALUE
|
171
|
+
r_document_has_doctype(VALUE self) {
|
172
|
+
return rb_iv_get(self, "@has_doctype");
|
173
|
+
}
|
174
|
+
|
175
|
+
/*
|
176
|
+
* call-seq:
|
177
|
+
* element.attribute(name) -> attribute
|
178
|
+
*
|
179
|
+
* If +element+ has an attribute with the name +name+, return it. If not,
|
180
|
+
* return +nil+.
|
181
|
+
*/
|
182
|
+
VALUE
|
183
|
+
r_element_attribute(VALUE self, VALUE name) {
|
184
|
+
VALUE attributes;
|
185
|
+
const char *name_str;
|
186
|
+
|
187
|
+
name_str = StringValueCStr(name);
|
188
|
+
|
189
|
+
attributes = rb_iv_get(self, "@attributes");
|
190
|
+
for (long i = 0; i < RARRAY_LEN(attributes); i++) {
|
191
|
+
VALUE attribute;
|
192
|
+
VALUE r_attr_name;
|
193
|
+
const char *attr_name;
|
194
|
+
|
195
|
+
attribute = rb_ary_entry(attributes, i);
|
196
|
+
r_attr_name = rb_iv_get(attribute, "@name");
|
197
|
+
attr_name = StringValueCStr(r_attr_name);
|
198
|
+
|
199
|
+
if (strcasecmp(attr_name, name_str) == 0)
|
200
|
+
return attribute;
|
201
|
+
}
|
202
|
+
|
203
|
+
return Qnil;
|
204
|
+
}
|
205
|
+
|
206
|
+
/*
|
207
|
+
* call-seq:
|
208
|
+
* element.has_attribute?(name) -> boolean
|
209
|
+
*
|
210
|
+
* Return +true+ if +element+ has an attribute with the name +name+ or
|
211
|
+
* +false+ else.
|
212
|
+
*/
|
213
|
+
VALUE
|
214
|
+
r_element_has_attribute(VALUE self, VALUE name) {
|
215
|
+
VALUE attribute;
|
216
|
+
|
217
|
+
attribute = r_element_attribute(self, name);
|
218
|
+
return (attribute == Qnil) ? Qfalse : Qtrue;
|
219
|
+
}
|
220
|
+
|
221
|
+
static VALUE
|
222
|
+
r_bool_new(bool val) {
|
223
|
+
return val ? Qtrue : Qfalse;
|
224
|
+
}
|
225
|
+
|
226
|
+
static VALUE
|
227
|
+
r_sym_new(const char *str) {
|
228
|
+
return ID2SYM(rb_intern(str));
|
229
|
+
}
|
230
|
+
|
231
|
+
static VALUE
|
232
|
+
r_str_new(const char *str, long len) {
|
233
|
+
return str ? rb_enc_str_new(str, len, rb_utf8_encoding()) : Qnil;
|
234
|
+
}
|
235
|
+
|
236
|
+
static VALUE
|
237
|
+
r_tainted_str_new(const char *str, long len) {
|
238
|
+
VALUE val;
|
239
|
+
|
240
|
+
if (str) {
|
241
|
+
val = rb_enc_str_new(str, len, rb_utf8_encoding());
|
242
|
+
OBJ_TAINT(str);
|
243
|
+
} else {
|
244
|
+
val = Qnil;
|
245
|
+
}
|
246
|
+
|
247
|
+
return val;
|
248
|
+
}
|
249
|
+
|
250
|
+
static VALUE
|
251
|
+
r_cstr_new(const char *str) {
|
252
|
+
return r_str_new(str, strlen(str));
|
253
|
+
}
|
254
|
+
|
255
|
+
static VALUE
|
256
|
+
r_tainted_cstr_new(const char *str) {
|
257
|
+
return r_tainted_str_new(str, strlen(str));
|
258
|
+
}
|
259
|
+
|
260
|
+
static VALUE
|
261
|
+
r_gumbo_destroy_output(VALUE value) {
|
262
|
+
GumboOutput *output;
|
263
|
+
|
264
|
+
output = (GumboOutput*)value;
|
265
|
+
gumbo_destroy_output(&kGumboDefaultOptions, output);
|
266
|
+
|
267
|
+
return Qnil;
|
268
|
+
}
|
269
|
+
|
270
|
+
static VALUE
|
271
|
+
r_gumbo_source_position_to_value(GumboSourcePosition position) {
|
272
|
+
VALUE r_position;
|
273
|
+
|
274
|
+
r_position = rb_class_new_instance(0, NULL, c_source_position);
|
275
|
+
|
276
|
+
rb_iv_set(r_position, "@line", UINT2NUM(position.line));
|
277
|
+
rb_iv_set(r_position, "@column", UINT2NUM(position.column));
|
278
|
+
rb_iv_set(r_position, "@offset", UINT2NUM(position.offset));
|
279
|
+
|
280
|
+
return r_position;
|
281
|
+
}
|
282
|
+
|
283
|
+
static VALUE
|
284
|
+
r_gumbo_node_type_to_symbol(GumboNodeType type) {
|
285
|
+
switch (type) {
|
286
|
+
case GUMBO_NODE_DOCUMENT:
|
287
|
+
return r_sym_new("document");
|
288
|
+
case GUMBO_NODE_ELEMENT:
|
289
|
+
return r_sym_new("element");
|
290
|
+
case GUMBO_NODE_TEXT:
|
291
|
+
return r_sym_new("text");
|
292
|
+
case GUMBO_NODE_CDATA:
|
293
|
+
return r_sym_new("cdata");
|
294
|
+
case GUMBO_NODE_COMMENT:
|
295
|
+
return r_sym_new("comment");
|
296
|
+
case GUMBO_NODE_WHITESPACE:
|
297
|
+
return r_sym_new("whitespace");
|
298
|
+
default:
|
299
|
+
rb_raise(rb_eArgError, "unknown node type %d", type);
|
300
|
+
}
|
301
|
+
}
|
302
|
+
|
303
|
+
static VALUE
|
304
|
+
r_gumbo_parse_flags_to_symbol_array(GumboParseFlags flags) {
|
305
|
+
VALUE array;
|
306
|
+
|
307
|
+
array = rb_ary_new();
|
308
|
+
|
309
|
+
if (flags & GUMBO_INSERTION_NORMAL)
|
310
|
+
rb_ary_push(array, r_sym_new("insertion_normal"));
|
311
|
+
if (flags & GUMBO_INSERTION_BY_PARSER)
|
312
|
+
rb_ary_push(array, r_sym_new("insertion_by_parser"));
|
313
|
+
if (flags & GUMBO_INSERTION_IMPLICIT_END_TAG)
|
314
|
+
rb_ary_push(array, r_sym_new("insertion_implicit_end_tag"));
|
315
|
+
if (flags & GUMBO_INSERTION_IMPLIED)
|
316
|
+
rb_ary_push(array, r_sym_new("insertion_implied"));
|
317
|
+
if (flags & GUMBO_INSERTION_CONVERTED_FROM_END_TAG)
|
318
|
+
rb_ary_push(array, r_sym_new("insertion_converted_from_end_tag"));
|
319
|
+
if (flags & GUMBO_INSERTION_FROM_ISINDEX)
|
320
|
+
rb_ary_push(array, r_sym_new("insertion_from_isindex"));
|
321
|
+
if (flags & GUMBO_INSERTION_FROM_IMAGE)
|
322
|
+
rb_ary_push(array, r_sym_new("insertion_from_image"));
|
323
|
+
if (flags & GUMBO_INSERTION_RECONSTRUCTED_FORMATTING_ELEMENT)
|
324
|
+
rb_ary_push(array, r_sym_new("insertion_reconstructed_formatting_element"));
|
325
|
+
if (flags & GUMBO_INSERTION_ADOPTION_AGENCY_CLONED)
|
326
|
+
rb_ary_push(array, r_sym_new("insertion_adoption_agency_cloned"));
|
327
|
+
if (flags & GUMBO_INSERTION_ADOPTION_AGENCY_MOVED)
|
328
|
+
rb_ary_push(array, r_sym_new("insertion_adoption_agency_moved"));
|
329
|
+
if (flags & GUMBO_INSERTION_FOSTER_PARENTED)
|
330
|
+
rb_ary_push(array, r_sym_new("insertion_foster_parented"));
|
331
|
+
|
332
|
+
return array;
|
333
|
+
}
|
334
|
+
|
335
|
+
static VALUE
|
336
|
+
r_gumbo_quirks_mode_to_symbol(GumboQuirksModeEnum mode) {
|
337
|
+
switch (mode) {
|
338
|
+
case GUMBO_DOCTYPE_NO_QUIRKS:
|
339
|
+
return r_sym_new("no_quirks");
|
340
|
+
case GUMBO_DOCTYPE_QUIRKS:
|
341
|
+
return r_sym_new("quirks");
|
342
|
+
case GUMBO_DOCTYPE_LIMITED_QUIRKS:
|
343
|
+
return r_sym_new("limited_quirks");
|
344
|
+
default:
|
345
|
+
rb_raise(rb_eArgError, "unknown quirks mode %d", mode);
|
346
|
+
}
|
347
|
+
}
|
348
|
+
|
349
|
+
static VALUE
|
350
|
+
r_gumbo_namespace_to_symbol(GumboNamespaceEnum ns) {
|
351
|
+
switch (ns) {
|
352
|
+
case GUMBO_NAMESPACE_HTML:
|
353
|
+
return r_sym_new("html");
|
354
|
+
case GUMBO_NAMESPACE_SVG:
|
355
|
+
return r_sym_new("svg");
|
356
|
+
case GUMBO_NAMESPACE_MATHML:
|
357
|
+
return r_sym_new("mathml");
|
358
|
+
default:
|
359
|
+
rb_raise(rb_eArgError, "unknown namespace %d", ns);
|
360
|
+
}
|
361
|
+
}
|
362
|
+
|
363
|
+
static VALUE
|
364
|
+
r_gumbo_tag_to_symbol(GumboTag tag) {
|
365
|
+
const char *name;
|
366
|
+
|
367
|
+
if (tag < 0 || tag >= GUMBO_TAG_LAST)
|
368
|
+
rb_raise(rb_eArgError, "unknown tag %d", tag);
|
369
|
+
|
370
|
+
if (tag == GUMBO_TAG_UNKNOWN) {
|
371
|
+
name = "unknown";
|
372
|
+
} else {
|
373
|
+
name = gumbo_normalized_tagname(tag);
|
374
|
+
}
|
375
|
+
|
376
|
+
return r_sym_new(name);
|
377
|
+
}
|
378
|
+
|
379
|
+
static VALUE
|
380
|
+
r_gumbo_node_to_value(GumboNode *node) {
|
381
|
+
VALUE class;
|
382
|
+
VALUE r_node;
|
383
|
+
GumboVector *children;
|
384
|
+
|
385
|
+
if (node->type == GUMBO_NODE_DOCUMENT) {
|
386
|
+
class = c_document;
|
387
|
+
} else if (node->type == GUMBO_NODE_ELEMENT) {
|
388
|
+
class = c_element;
|
389
|
+
} else if (node->type == GUMBO_NODE_TEXT) {
|
390
|
+
class = c_text;
|
391
|
+
} else if (node->type == GUMBO_NODE_CDATA) {
|
392
|
+
class = c_cdata;
|
393
|
+
} else if (node->type == GUMBO_NODE_COMMENT) {
|
394
|
+
class = c_comment;
|
395
|
+
} else if (node->type == GUMBO_NODE_WHITESPACE) {
|
396
|
+
class = c_whitespace;
|
397
|
+
} else {
|
398
|
+
rb_raise(rb_eArgError, "unknown node type %d", node->type);
|
399
|
+
}
|
400
|
+
|
401
|
+
r_node = rb_class_new_instance(0, NULL, class);
|
402
|
+
rb_iv_set(r_node, "@type", r_gumbo_node_type_to_symbol(node->type));
|
403
|
+
rb_iv_set(r_node, "@parent", Qnil);
|
404
|
+
rb_iv_set(r_node, "@parse_flags",
|
405
|
+
r_gumbo_parse_flags_to_symbol_array(node->parse_flags));
|
406
|
+
|
407
|
+
children = NULL;
|
408
|
+
|
409
|
+
if (node->type == GUMBO_NODE_DOCUMENT) {
|
410
|
+
GumboDocument *document;
|
411
|
+
|
412
|
+
document = &node->v.document;
|
413
|
+
children = &document->children;
|
414
|
+
|
415
|
+
rb_iv_set(r_node, "@name", r_tainted_cstr_new(document->name));
|
416
|
+
rb_iv_set(r_node, "@public_identifier",
|
417
|
+
r_tainted_cstr_new(document->public_identifier));
|
418
|
+
rb_iv_set(r_node, "@system_identifier",
|
419
|
+
r_tainted_cstr_new(document->system_identifier));
|
420
|
+
rb_iv_set(r_node, "@quirks_mode",
|
421
|
+
r_gumbo_quirks_mode_to_symbol(document->doc_type_quirks_mode));
|
422
|
+
rb_iv_set(r_node, "@has_doctype", r_bool_new(document->has_doctype));
|
423
|
+
} else if (node->type == GUMBO_NODE_ELEMENT) {
|
424
|
+
GumboElement *element;
|
425
|
+
VALUE r_attributes;
|
426
|
+
|
427
|
+
element = &node->v.element;
|
428
|
+
children = &element->children;
|
429
|
+
|
430
|
+
rb_iv_set(r_node, "@tag",
|
431
|
+
r_gumbo_tag_to_symbol(element->tag));
|
432
|
+
rb_iv_set(r_node, "@original_tag",
|
433
|
+
r_tainted_str_new(element->original_tag.data,
|
434
|
+
element->original_tag.length));
|
435
|
+
rb_iv_set(r_node, "@tag_namespace",
|
436
|
+
r_gumbo_namespace_to_symbol(element->tag_namespace));
|
437
|
+
rb_iv_set(r_node, "@start_pos",
|
438
|
+
r_gumbo_source_position_to_value(element->start_pos));
|
439
|
+
rb_iv_set(r_node, "@end_pos",
|
440
|
+
r_gumbo_source_position_to_value(element->end_pos));
|
441
|
+
|
442
|
+
r_attributes = rb_ary_new2(element->attributes.length);
|
443
|
+
rb_iv_set(r_node, "@attributes", r_attributes);
|
444
|
+
|
445
|
+
for (unsigned int i = 0; i < element->attributes.length; i++) {
|
446
|
+
GumboAttribute *attribute;
|
447
|
+
VALUE r_attribute;
|
448
|
+
|
449
|
+
attribute = element->attributes.data[i];
|
450
|
+
r_attribute = r_gumbo_attribute_to_value(attribute);
|
451
|
+
|
452
|
+
rb_ary_store(r_attributes, i, r_attribute);
|
453
|
+
}
|
454
|
+
} else if (node->type == GUMBO_NODE_TEXT
|
455
|
+
|| node->type == GUMBO_NODE_CDATA
|
456
|
+
|| node->type == GUMBO_NODE_COMMENT
|
457
|
+
|| node->type == GUMBO_NODE_WHITESPACE) {
|
458
|
+
GumboText *text;
|
459
|
+
|
460
|
+
text = &node->v.text;
|
461
|
+
|
462
|
+
rb_iv_set(r_node, "@text", r_tainted_cstr_new(text->text));
|
463
|
+
rb_iv_set(r_node, "@original_text",
|
464
|
+
r_tainted_str_new(text->original_text.data,
|
465
|
+
text->original_text.length));
|
466
|
+
rb_iv_set(r_node, "@start_pos",
|
467
|
+
r_gumbo_source_position_to_value(text->start_pos));
|
468
|
+
}
|
469
|
+
|
470
|
+
if (children) {
|
471
|
+
VALUE r_children;
|
472
|
+
|
473
|
+
r_children = rb_ary_new2(children->length);
|
474
|
+
rb_iv_set(r_node, "@children", r_children);
|
475
|
+
|
476
|
+
for (unsigned int i = 0; i < children->length; i++) {
|
477
|
+
GumboNode *child;
|
478
|
+
VALUE r_child;
|
479
|
+
|
480
|
+
child = children->data[i];
|
481
|
+
r_child = r_gumbo_node_to_value(child);
|
482
|
+
|
483
|
+
rb_iv_set(r_child, "@parent", r_node);
|
484
|
+
|
485
|
+
rb_ary_store(r_children, i, r_child);
|
486
|
+
}
|
487
|
+
}
|
488
|
+
|
489
|
+
return r_node;
|
490
|
+
}
|
491
|
+
|
492
|
+
static VALUE
|
493
|
+
r_gumbo_attribute_namespace_to_symbol(GumboAttributeNamespaceEnum ns) {
|
494
|
+
switch (ns) {
|
495
|
+
case GUMBO_ATTR_NAMESPACE_NONE:
|
496
|
+
return Qnil;
|
497
|
+
case GUMBO_ATTR_NAMESPACE_XLINK:
|
498
|
+
return r_sym_new("xlink");
|
499
|
+
case GUMBO_ATTR_NAMESPACE_XML:
|
500
|
+
return r_sym_new("xml");
|
501
|
+
case GUMBO_ATTR_NAMESPACE_XMLNS:
|
502
|
+
return r_sym_new("xmlns");
|
503
|
+
default:
|
504
|
+
rb_raise(rb_eArgError, "unknown namespace %d", ns);
|
505
|
+
}
|
506
|
+
}
|
507
|
+
|
508
|
+
static VALUE
|
509
|
+
r_gumbo_attribute_to_value(GumboAttribute *attribute) {
|
510
|
+
VALUE r_attribute;
|
511
|
+
|
512
|
+
r_attribute = rb_class_new_instance(0, NULL, c_attribute);
|
513
|
+
|
514
|
+
rb_iv_set(r_attribute, "@namespace",
|
515
|
+
r_gumbo_attribute_namespace_to_symbol(attribute->attr_namespace));
|
516
|
+
rb_iv_set(r_attribute, "@name", r_tainted_cstr_new(attribute->name));
|
517
|
+
rb_iv_set(r_attribute, "@original_name",
|
518
|
+
r_tainted_str_new(attribute->original_name.data,
|
519
|
+
attribute->original_name.length));
|
520
|
+
rb_iv_set(r_attribute, "@value", r_tainted_cstr_new(attribute->value));
|
521
|
+
rb_iv_set(r_attribute, "@original_value",
|
522
|
+
r_tainted_str_new(attribute->original_value.data,
|
523
|
+
attribute->original_value.length));
|
524
|
+
rb_iv_set(r_attribute, "@name_start",
|
525
|
+
r_gumbo_source_position_to_value(attribute->name_start));
|
526
|
+
rb_iv_set(r_attribute, "@name_end",
|
527
|
+
r_gumbo_source_position_to_value(attribute->name_end));
|
528
|
+
rb_iv_set(r_attribute, "@value_start",
|
529
|
+
r_gumbo_source_position_to_value(attribute->value_start));
|
530
|
+
rb_iv_set(r_attribute, "@value_end",
|
531
|
+
r_gumbo_source_position_to_value(attribute->value_end));
|
532
|
+
|
533
|
+
return r_attribute;
|
534
|
+
}
|
data/lib/gumbo/extra.rb
ADDED
@@ -0,0 +1,44 @@
|
|
1
|
+
|
2
|
+
# Copyright (c) 2013 Nicolas Martyanoff
|
3
|
+
#
|
4
|
+
# Permission to use, copy, modify, and distribute this software for any
|
5
|
+
# purpose with or without fee is hereby granted, provided that the above
|
6
|
+
# copyright notice and this permission notice appear in all copies.
|
7
|
+
#
|
8
|
+
# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
|
9
|
+
# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
|
10
|
+
# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
|
11
|
+
# ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
|
12
|
+
# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
|
13
|
+
# ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
|
14
|
+
# OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
|
15
|
+
|
16
|
+
require 'gumbo'
|
17
|
+
|
18
|
+
module Gumbo
  class Node
    # Write an indented outline of the tree rooted at this node to +output+.
    #
    # Only document and element nodes are visited; any other node (text,
    # comment, ...) is skipped. Each element is printed on its own line as
    # its upper-cased tag name followed by its attribute names, and the
    # children of an element are indented two more spaces than the element.
    def dump_tree(output = $stdout)
      visit = lambda do |node, depth|
        kind = node.type
        next unless kind == :document || kind == :element

        output.write(" " * depth)

        if kind == :element
          # Unknown tags have no normalized name; fall back to the source text.
          label = node.tag == :unknown ? node.original_tag : node.tag.to_s
          names = node.attributes.map { |attribute| attribute.name }
          output.puts "<#{label.upcase} #{names.join(' ')}>"

          depth += 2
        end

        node.children.each { |child| visit.call(child, depth) }
      end

      visit.call(self, 0)
    end
  end
end
|
metadata
ADDED
@@ -0,0 +1,50 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: ruby-gumbo
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 1.0.1
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Nicolas Martyanoff
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2013-08-18 00:00:00.000000000 Z
|
12
|
+
dependencies: []
|
13
|
+
description:
|
14
|
+
email: khaelin@gmail.com
|
15
|
+
executables: []
|
16
|
+
extensions:
|
17
|
+
- ext/extconf.rb
|
18
|
+
extra_rdoc_files: []
|
19
|
+
files:
|
20
|
+
- Rakefile
|
21
|
+
- LICENSE
|
22
|
+
- README.mkd
|
23
|
+
- lib/gumbo/extra.rb
|
24
|
+
- ext/extconf.rb
|
25
|
+
- ext/gumbo.c
|
26
|
+
homepage:
|
27
|
+
licenses:
|
28
|
+
- ISC
|
29
|
+
metadata: {}
|
30
|
+
post_install_message:
|
31
|
+
rdoc_options: []
|
32
|
+
require_paths:
|
33
|
+
- lib
|
34
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
35
|
+
requirements:
|
36
|
+
- - '>='
|
37
|
+
- !ruby/object:Gem::Version
|
38
|
+
version: 1.9.3
|
39
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
40
|
+
requirements:
|
41
|
+
- - '>='
|
42
|
+
- !ruby/object:Gem::Version
|
43
|
+
version: '0'
|
44
|
+
requirements: []
|
45
|
+
rubyforge_project:
|
46
|
+
rubygems_version: 2.0.3
|
47
|
+
signing_key:
|
48
|
+
specification_version: 4
|
49
|
+
summary: Ruby bindings for the gumbo html5 parser
|
50
|
+
test_files: []
|