nokogumbo 0.1 → 0.2
Sign up to get free protection for your applications and to get access to all the features.
- data/README.md +2 -3
- data/Rakefile +25 -15
- data/{ext → work}/extconf.rb +1 -1
- metadata +7 -8
- data/ext/nokogumbo.c +0 -131
data/README.md
CHANGED
@@ -29,12 +29,11 @@ original tag name is returned verbatim.
|
|
29
29
|
* Nothing meaningful is done with the `GumboDocument` struct, i.e., no
|
30
30
|
Nokogiri `EntityDecl` is produced.
|
31
31
|
|
32
|
+
* The gem itself includes a copy of the Gumbo HTML5 parser.
|
33
|
+
|
32
34
|
Installation:
|
33
35
|
============
|
34
36
|
|
35
|
-
* Build and install the
|
36
|
-
[gumbo-parser](https://github.com/google/gumbo-parser#readme) C library
|
37
|
-
|
38
37
|
* Execute `rake gem`
|
39
38
|
|
40
39
|
* Execute `[sudo] gem install pkg/nokogumbo*.gem`
|
data/Rakefile
CHANGED
@@ -3,44 +3,54 @@ require 'rake/clean'
|
|
3
3
|
|
4
4
|
task 'default' => 'test'
|
5
5
|
|
6
|
-
file '
|
7
|
-
|
6
|
+
file 'gumbo-parser' do
|
7
|
+
sh 'git clone https://github.com/google/gumbo-parser.git'
|
8
|
+
end
|
9
|
+
|
10
|
+
file 'work/extconf.rb' => 'gumbo-parser' do
|
11
|
+
sh 'mkdir work'
|
12
|
+
sh 'cp gumbo-parser/src/* work'
|
13
|
+
sh 'cp ext/* work'
|
14
|
+
end
|
15
|
+
|
16
|
+
file 'work/Makefile' => 'work/extconf.rb' do
|
17
|
+
Dir.chdir 'work' do
|
8
18
|
ruby 'extconf.rb'
|
9
19
|
end
|
10
20
|
end
|
11
21
|
|
12
|
-
task 'test' => 'Makefile' do
|
13
|
-
Dir.chdir '
|
22
|
+
task 'test' => 'work/Makefile' do
|
23
|
+
Dir.chdir 'work' do
|
14
24
|
sh 'make -s'
|
15
25
|
end
|
16
26
|
ruby 'test-nokogumbo.rb'
|
17
27
|
end
|
18
28
|
|
19
|
-
CLEAN.include
|
29
|
+
CLEAN.include 'pkg', 'gumbo-parser', 'work'
|
20
30
|
|
21
|
-
MANIFEST =
|
22
|
-
|
23
|
-
|
31
|
+
MANIFEST = FileList[*%w(
|
32
|
+
work/*.rb
|
33
|
+
work/*.c
|
34
|
+
work/*.h
|
24
35
|
lib/nokogumbo.rb
|
25
36
|
Rakefile
|
26
37
|
README.md
|
27
|
-
)
|
38
|
+
)]
|
28
39
|
|
29
40
|
SPEC = Gem::Specification.new do |gem|
|
30
41
|
gem.name = 'nokogumbo'
|
31
|
-
gem.version = '0.
|
42
|
+
gem.version = '0.2'
|
32
43
|
gem.email = 'rubys@intertwingly.net'
|
33
44
|
gem.homepage = 'https://github.com/rubys/nokogumbo/tree/master/ruby#readme'
|
34
45
|
gem.summary = 'Nokogiri interface to the Gumbo HTML5 parser'
|
35
46
|
gem.files = MANIFEST
|
36
|
-
gem.extensions = '
|
47
|
+
gem.extensions = 'work/extconf.rb'
|
37
48
|
gem.author = 'Sam Ruby'
|
38
49
|
gem.add_dependency 'nokogiri'
|
39
|
-
gem.license = '
|
50
|
+
gem.license = 'Apache 2.0'
|
40
51
|
gem.description = %q(
|
41
|
-
|
42
|
-
|
43
|
-
parsed document.).strip.gsub(/\s+/, ' ')
|
52
|
+
Nokogumbo allows a Ruby program to invoke the Gumbo HTML5 parser and
|
53
|
+
access the result as a Nokogiri parsed document.).strip.gsub(/\s+/, ' ')
|
44
54
|
end
|
45
55
|
|
46
56
|
task 'gem' => 'test'
|
data/{ext → work}/extconf.rb
RENAMED
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: nokogumbo
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: '0.
|
4
|
+
version: '0.2'
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2013-08-
|
12
|
+
date: 2013-08-18 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: nokogiri
|
@@ -27,22 +27,21 @@ dependencies:
|
|
27
27
|
- - ! '>='
|
28
28
|
- !ruby/object:Gem::Version
|
29
29
|
version: '0'
|
30
|
-
description:
|
31
|
-
|
30
|
+
description: Nokogumbo allows a Ruby program to invoke the Gumbo HTML5 parser and
|
31
|
+
access the result as a Nokogiri parsed document.
|
32
32
|
email: rubys@intertwingly.net
|
33
33
|
executables: []
|
34
34
|
extensions:
|
35
|
-
-
|
35
|
+
- work/extconf.rb
|
36
36
|
extra_rdoc_files: []
|
37
37
|
files:
|
38
|
-
- ext/extconf.rb
|
39
|
-
- ext/nokogumbo.c
|
40
38
|
- lib/nokogumbo.rb
|
41
39
|
- Rakefile
|
42
40
|
- README.md
|
41
|
+
- work/extconf.rb
|
43
42
|
homepage: https://github.com/rubys/nokogumbo/tree/master/ruby#readme
|
44
43
|
licenses:
|
45
|
-
-
|
44
|
+
- Apache 2.0
|
46
45
|
post_install_message:
|
47
46
|
rdoc_options: []
|
48
47
|
require_paths:
|
data/ext/nokogumbo.c
DELETED
@@ -1,131 +0,0 @@
|
|
1
|
-
#include "ruby.h"
|
2
|
-
#include "gumbo.h"
|
3
|
-
|
4
|
-
// class constants
|
5
|
-
static VALUE Nokogiri;
|
6
|
-
static VALUE HTML;
|
7
|
-
static VALUE XML;
|
8
|
-
static VALUE Document;
|
9
|
-
static VALUE Element;
|
10
|
-
static VALUE Text;
|
11
|
-
static VALUE CDATA;
|
12
|
-
static VALUE Comment;
|
13
|
-
static VALUE TAGS=0;
|
14
|
-
static int Unknown=0;
|
15
|
-
|
16
|
-
// interned symbols
|
17
|
-
static VALUE new;
|
18
|
-
static VALUE set_attribute;
|
19
|
-
static VALUE add_child;
|
20
|
-
|
21
|
-
// determine tag name for a given node
|
22
|
-
static VALUE _name(GumboElement *node) {
|
23
|
-
if (!TAGS) {
|
24
|
-
// Deferred initialization of "Unknown" as the GumboParser class is
|
25
|
-
// defined *after* the Nokogumbo class is.
|
26
|
-
VALUE HTML5 = rb_const_get(Nokogiri, rb_intern("HTML5"));
|
27
|
-
TAGS = rb_const_get(HTML5, rb_intern("TAGS"));
|
28
|
-
Unknown = NUM2INT(rb_const_get(HTML5, rb_intern("Unknown")));
|
29
|
-
}
|
30
|
-
|
31
|
-
if (node->tag != Unknown) {
|
32
|
-
return rb_ary_entry(TAGS, (long) node->tag);
|
33
|
-
} else {
|
34
|
-
// Gumbo doesn't provide unknown tags, so we need to parse it ourselves:
|
35
|
-
// http://www.w3.org/html/wg/drafts/html/CR/syntax.html#tag-name-state
|
36
|
-
GumboStringPiece *tag = &node->original_tag;
|
37
|
-
int length;
|
38
|
-
for (length = 1; length < tag->length-1; length++) {
|
39
|
-
if (strchr(" \t\r\n<", *((char*)tag->data+length))) break;
|
40
|
-
}
|
41
|
-
return rb_str_new(1+(char *)tag->data, length-1);
|
42
|
-
}
|
43
|
-
}
|
44
|
-
|
45
|
-
// Build a Nokogiri Element for a given GumboElement (recursively)
|
46
|
-
static VALUE _element(VALUE document, GumboElement *node) {
|
47
|
-
int i;
|
48
|
-
VALUE element = rb_funcall(Element, new, 2, _name(node), document);
|
49
|
-
|
50
|
-
// add in the attributes
|
51
|
-
GumboVector* attrs = &node->attributes;
|
52
|
-
for (i=0; i < attrs->length; i++) {
|
53
|
-
GumboAttribute *attr = attrs->data[i];
|
54
|
-
VALUE name = rb_str_new2(attr->name);
|
55
|
-
rb_funcall(element, set_attribute, 2, name, rb_str_new2(attr->value));
|
56
|
-
}
|
57
|
-
|
58
|
-
// add in the children
|
59
|
-
GumboVector* children = &node->children;
|
60
|
-
for (i=0; i < children->length; i++) {
|
61
|
-
GumboNode* child = children->data[i];
|
62
|
-
|
63
|
-
VALUE node = 0;
|
64
|
-
VALUE text;
|
65
|
-
|
66
|
-
switch (child->type) {
|
67
|
-
case GUMBO_NODE_ELEMENT:
|
68
|
-
node = _element(document, &child->v.element);
|
69
|
-
break;
|
70
|
-
case GUMBO_NODE_WHITESPACE:
|
71
|
-
case GUMBO_NODE_TEXT:
|
72
|
-
text = rb_str_new2(child->v.text.text);
|
73
|
-
node = rb_funcall(Text, new, 2, text, document);
|
74
|
-
break;
|
75
|
-
case GUMBO_NODE_CDATA:
|
76
|
-
text = rb_str_new2(child->v.text.text);
|
77
|
-
node = rb_funcall(CDATA, new, 2, text, document);
|
78
|
-
break;
|
79
|
-
case GUMBO_NODE_COMMENT:
|
80
|
-
text = rb_str_new2(child->v.text.text);
|
81
|
-
node = rb_funcall(Comment, new, 2, document, text);
|
82
|
-
break;
|
83
|
-
case GUMBO_NODE_DOCUMENT:
|
84
|
-
break; // should never happen -- ignore
|
85
|
-
}
|
86
|
-
|
87
|
-
if (node) rb_funcall(element, add_child, 1, node);
|
88
|
-
}
|
89
|
-
|
90
|
-
return element;
|
91
|
-
}
|
92
|
-
|
93
|
-
// Parse a string using gumbo_parse into a Nokogiri document
|
94
|
-
static VALUE t_parse(VALUE self, VALUE string) {
|
95
|
-
VALUE document = rb_funcall(Document, new, 0);
|
96
|
-
|
97
|
-
GumboOutput *output = gumbo_parse_with_options(
|
98
|
-
&kGumboDefaultOptions, RSTRING_PTR(string), RSTRING_LEN(string)
|
99
|
-
);
|
100
|
-
VALUE root = _element(document, (GumboElement*)&output->root->v.element);
|
101
|
-
rb_funcall(document, add_child, 1, root);
|
102
|
-
gumbo_destroy_output(&kGumboDefaultOptions, output);
|
103
|
-
|
104
|
-
return document;
|
105
|
-
}
|
106
|
-
|
107
|
-
// Initialize the Nokogumbo class and fetch constants we will use later
|
108
|
-
void Init_nokogumboc() {
|
109
|
-
rb_funcall(rb_mKernel, rb_intern("gem"), 1, rb_str_new2("nokogiri"));
|
110
|
-
rb_require("nokogiri");
|
111
|
-
|
112
|
-
// class constants
|
113
|
-
Nokogiri = rb_const_get(rb_cObject, rb_intern("Nokogiri"));
|
114
|
-
HTML = rb_const_get(Nokogiri, rb_intern("HTML"));
|
115
|
-
XML = rb_const_get(Nokogiri, rb_intern("XML"));
|
116
|
-
Document = rb_const_get(HTML, rb_intern("Document"));
|
117
|
-
Element = rb_const_get(XML, rb_intern("Element"));
|
118
|
-
Text = rb_const_get(XML, rb_intern("Text"));
|
119
|
-
CDATA = rb_const_get(XML, rb_intern("CDATA"));
|
120
|
-
Comment = rb_const_get(XML, rb_intern("Comment"));
|
121
|
-
|
122
|
-
// interned symbols
|
123
|
-
new = rb_intern("new");
|
124
|
-
set_attribute = rb_intern("set_attribute");
|
125
|
-
add_child = rb_intern("add_child");
|
126
|
-
|
127
|
-
// define Nokogumbo class with a singleton parse method
|
128
|
-
VALUE Gumbo = rb_define_class("Nokogumbo", rb_cObject);
|
129
|
-
rb_define_singleton_method(Gumbo, "parse", t_parse, 1);
|
130
|
-
}
|
131
|
-
|