nokogumbo 0.1 → 0.2
This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the differences between package versions as they appear in their respective public registries.
- data/README.md +2 -3
- data/Rakefile +25 -15
- data/{ext → work}/extconf.rb +1 -1
- metadata +7 -8
- data/ext/nokogumbo.c +0 -131
data/README.md
CHANGED
@@ -29,12 +29,11 @@ original tag name is returned verbatim.
|
|
29
29
|
* Nothing meaningful is done with the `GumboDocument` struct, i.e., no
|
30
30
|
Nokogiri `EntityDecl` is produced.
|
31
31
|
|
32
|
+
* The gem itself includes a copy of the Nokogumbo HTML5 parser.
|
33
|
+
|
32
34
|
Installation:
|
33
35
|
============
|
34
36
|
|
35
|
-
* Build and install the
|
36
|
-
[gumbo-parser](https://github.com/google/gumbo-parser#readme) C library
|
37
|
-
|
38
37
|
* Execute `rake gem`
|
39
38
|
|
40
39
|
* [sudo] gem install pkg/nokogumbo*.gem
|
data/Rakefile
CHANGED
@@ -3,44 +3,54 @@ require 'rake/clean'
|
|
3
3
|
|
4
4
|
task 'default' => 'test'
|
5
5
|
|
6
|
-
file '
|
7
|
-
|
6
|
+
file 'gumbo-parser' do
|
7
|
+
sh 'git clone https://github.com/google/gumbo-parser.git'
|
8
|
+
end
|
9
|
+
|
10
|
+
file 'work/extconf.rb' => 'gumbo-parser' do
|
11
|
+
sh 'mkdir work'
|
12
|
+
sh 'cp gumbo-parser/src/* work'
|
13
|
+
sh 'cp ext/* work'
|
14
|
+
end
|
15
|
+
|
16
|
+
file 'work/Makefile' => 'work/extconf.rb' do
|
17
|
+
Dir.chdir 'work' do
|
8
18
|
ruby 'extconf.rb'
|
9
19
|
end
|
10
20
|
end
|
11
21
|
|
12
|
-
task 'test' => 'Makefile' do
|
13
|
-
Dir.chdir '
|
22
|
+
task 'test' => 'work/Makefile' do
|
23
|
+
Dir.chdir 'work' do
|
14
24
|
sh 'make -s'
|
15
25
|
end
|
16
26
|
ruby 'test-nokogumbo.rb'
|
17
27
|
end
|
18
28
|
|
19
|
-
CLEAN.include
|
29
|
+
CLEAN.include 'pkg', 'gumbo-parser', 'work'
|
20
30
|
|
21
|
-
MANIFEST =
|
22
|
-
|
23
|
-
|
31
|
+
MANIFEST = FileList[*%w(
|
32
|
+
work/*.rb
|
33
|
+
work/*.c
|
34
|
+
work/*.h
|
24
35
|
lib/nokogumbo.rb
|
25
36
|
Rakefile
|
26
37
|
README.md
|
27
|
-
)
|
38
|
+
)]
|
28
39
|
|
29
40
|
SPEC = Gem::Specification.new do |gem|
|
30
41
|
gem.name = 'nokogumbo'
|
31
|
-
gem.version = '0.
|
42
|
+
gem.version = '0.2'
|
32
43
|
gem.email = 'rubys@intertwingly.net'
|
33
44
|
gem.homepage = 'https://github.com/rubys/nokogumbo/tree/master/ruby#readme'
|
34
45
|
gem.summary = 'Nokogiri interface to the Gumbo HTML5 parser'
|
35
46
|
gem.files = MANIFEST
|
36
|
-
gem.extensions = '
|
47
|
+
gem.extensions = 'work/extconf.rb'
|
37
48
|
gem.author = 'Sam Ruby'
|
38
49
|
gem.add_dependency 'nokogiri'
|
39
|
-
gem.license = '
|
50
|
+
gem.license = 'Apache 2.0'
|
40
51
|
gem.description = %q(
|
41
|
-
|
42
|
-
|
43
|
-
parsed document.).strip.gsub(/\s+/, ' ')
|
52
|
+
Nokogumbo allows a Ruby program to invoke the Gumbo HTML5 parser and
|
53
|
+
access the result as a Nokogiri parsed document.).strip.gsub(/\s+/, ' ')
|
44
54
|
end
|
45
55
|
|
46
56
|
task 'gem' => 'test'
|
data/{ext → work}/extconf.rb
RENAMED
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: nokogumbo
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: '0.
|
4
|
+
version: '0.2'
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,7 +9,7 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2013-08-
|
12
|
+
date: 2013-08-18 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: nokogiri
|
@@ -27,22 +27,21 @@ dependencies:
|
|
27
27
|
- - ! '>='
|
28
28
|
- !ruby/object:Gem::Version
|
29
29
|
version: '0'
|
30
|
-
description:
|
31
|
-
|
30
|
+
description: Nokogumbo allows a Ruby program to invoke the Gumbo HTML5 parser and
|
31
|
+
access the result as a Nokogiri parsed document.
|
32
32
|
email: rubys@intertwingly.net
|
33
33
|
executables: []
|
34
34
|
extensions:
|
35
|
-
-
|
35
|
+
- work/extconf.rb
|
36
36
|
extra_rdoc_files: []
|
37
37
|
files:
|
38
|
-
- ext/extconf.rb
|
39
|
-
- ext/nokogumbo.c
|
40
38
|
- lib/nokogumbo.rb
|
41
39
|
- Rakefile
|
42
40
|
- README.md
|
41
|
+
- work/extconf.rb
|
43
42
|
homepage: https://github.com/rubys/nokogumbo/tree/master/ruby#readme
|
44
43
|
licenses:
|
45
|
-
-
|
44
|
+
- Apache 2.0
|
46
45
|
post_install_message:
|
47
46
|
rdoc_options: []
|
48
47
|
require_paths:
|
data/ext/nokogumbo.c
DELETED
@@ -1,131 +0,0 @@
|
|
1
|
-
#include "ruby.h"
|
2
|
-
#include "gumbo.h"
|
3
|
-
|
4
|
-
// class constants
|
5
|
-
static VALUE Nokogiri;
|
6
|
-
static VALUE HTML;
|
7
|
-
static VALUE XML;
|
8
|
-
static VALUE Document;
|
9
|
-
static VALUE Element;
|
10
|
-
static VALUE Text;
|
11
|
-
static VALUE CDATA;
|
12
|
-
static VALUE Comment;
|
13
|
-
static VALUE TAGS=0;
|
14
|
-
static int Unknown=0;
|
15
|
-
|
16
|
-
// interned symbols
|
17
|
-
static VALUE new;
|
18
|
-
static VALUE set_attribute;
|
19
|
-
static VALUE add_child;
|
20
|
-
|
21
|
-
// determine tag name for a given node
|
22
|
-
static VALUE _name(GumboElement *node) {
|
23
|
-
if (!TAGS) {
|
24
|
-
// Deferred initialization of "Unknown" as the GumboParser class is
|
25
|
-
// defined *after* the Nokogumbo class is.
|
26
|
-
VALUE HTML5 = rb_const_get(Nokogiri, rb_intern("HTML5"));
|
27
|
-
TAGS = rb_const_get(HTML5, rb_intern("TAGS"));
|
28
|
-
Unknown = NUM2INT(rb_const_get(HTML5, rb_intern("Unknown")));
|
29
|
-
}
|
30
|
-
|
31
|
-
if (node->tag != Unknown) {
|
32
|
-
return rb_ary_entry(TAGS, (long) node->tag);
|
33
|
-
} else {
|
34
|
-
// Gumbo doesn't provide unknown tags, so we need to parse it ourselves:
|
35
|
-
// http://www.w3.org/html/wg/drafts/html/CR/syntax.html#tag-name-state
|
36
|
-
GumboStringPiece *tag = &node->original_tag;
|
37
|
-
int length;
|
38
|
-
for (length = 1; length < tag->length-1; length++) {
|
39
|
-
if (strchr(" \t\r\n<", *((char*)tag->data+length))) break;
|
40
|
-
}
|
41
|
-
return rb_str_new(1+(char *)tag->data, length-1);
|
42
|
-
}
|
43
|
-
}
|
44
|
-
|
45
|
-
// Build a Nokogiri Element for a given GumboElement (recursively)
|
46
|
-
static VALUE _element(VALUE document, GumboElement *node) {
|
47
|
-
int i;
|
48
|
-
VALUE element = rb_funcall(Element, new, 2, _name(node), document);
|
49
|
-
|
50
|
-
// add in the attributes
|
51
|
-
GumboVector* attrs = &node->attributes;
|
52
|
-
for (i=0; i < attrs->length; i++) {
|
53
|
-
GumboAttribute *attr = attrs->data[i];
|
54
|
-
VALUE name = rb_str_new2(attr->name);
|
55
|
-
rb_funcall(element, set_attribute, 2, name, rb_str_new2(attr->value));
|
56
|
-
}
|
57
|
-
|
58
|
-
// add in the children
|
59
|
-
GumboVector* children = &node->children;
|
60
|
-
for (i=0; i < children->length; i++) {
|
61
|
-
GumboNode* child = children->data[i];
|
62
|
-
|
63
|
-
VALUE node = 0;
|
64
|
-
VALUE text;
|
65
|
-
|
66
|
-
switch (child->type) {
|
67
|
-
case GUMBO_NODE_ELEMENT:
|
68
|
-
node = _element(document, &child->v.element);
|
69
|
-
break;
|
70
|
-
case GUMBO_NODE_WHITESPACE:
|
71
|
-
case GUMBO_NODE_TEXT:
|
72
|
-
text = rb_str_new2(child->v.text.text);
|
73
|
-
node = rb_funcall(Text, new, 2, text, document);
|
74
|
-
break;
|
75
|
-
case GUMBO_NODE_CDATA:
|
76
|
-
text = rb_str_new2(child->v.text.text);
|
77
|
-
node = rb_funcall(CDATA, new, 2, text, document);
|
78
|
-
break;
|
79
|
-
case GUMBO_NODE_COMMENT:
|
80
|
-
text = rb_str_new2(child->v.text.text);
|
81
|
-
node = rb_funcall(Comment, new, 2, document, text);
|
82
|
-
break;
|
83
|
-
case GUMBO_NODE_DOCUMENT:
|
84
|
-
break; // should never happen -- ignore
|
85
|
-
}
|
86
|
-
|
87
|
-
if (node) rb_funcall(element, add_child, 1, node);
|
88
|
-
}
|
89
|
-
|
90
|
-
return element;
|
91
|
-
}
|
92
|
-
|
93
|
-
// Parse a string using gumbo_parse into a Nokogiri document
|
94
|
-
static VALUE t_parse(VALUE self, VALUE string) {
|
95
|
-
VALUE document = rb_funcall(Document, new, 0);
|
96
|
-
|
97
|
-
GumboOutput *output = gumbo_parse_with_options(
|
98
|
-
&kGumboDefaultOptions, RSTRING_PTR(string), RSTRING_LEN(string)
|
99
|
-
);
|
100
|
-
VALUE root = _element(document, (GumboElement*)&output->root->v.element);
|
101
|
-
rb_funcall(document, add_child, 1, root);
|
102
|
-
gumbo_destroy_output(&kGumboDefaultOptions, output);
|
103
|
-
|
104
|
-
return document;
|
105
|
-
}
|
106
|
-
|
107
|
-
// Initialize the Nokogumbo class and fetch constants we will use later
|
108
|
-
void Init_nokogumboc() {
|
109
|
-
rb_funcall(rb_mKernel, rb_intern("gem"), 1, rb_str_new2("nokogiri"));
|
110
|
-
rb_require("nokogiri");
|
111
|
-
|
112
|
-
// class constants
|
113
|
-
Nokogiri = rb_const_get(rb_cObject, rb_intern("Nokogiri"));
|
114
|
-
HTML = rb_const_get(Nokogiri, rb_intern("HTML"));
|
115
|
-
XML = rb_const_get(Nokogiri, rb_intern("XML"));
|
116
|
-
Document = rb_const_get(HTML, rb_intern("Document"));
|
117
|
-
Element = rb_const_get(XML, rb_intern("Element"));
|
118
|
-
Text = rb_const_get(XML, rb_intern("Text"));
|
119
|
-
CDATA = rb_const_get(XML, rb_intern("CDATA"));
|
120
|
-
Comment = rb_const_get(XML, rb_intern("Comment"));
|
121
|
-
|
122
|
-
// interned symbols
|
123
|
-
new = rb_intern("new");
|
124
|
-
set_attribute = rb_intern("set_attribute");
|
125
|
-
add_child = rb_intern("add_child");
|
126
|
-
|
127
|
-
// define Nokogumbo class with a singleton parse method
|
128
|
-
VALUE Gumbo = rb_define_class("Nokogumbo", rb_cObject);
|
129
|
-
rb_define_singleton_method(Gumbo, "parse", t_parse, 1);
|
130
|
-
}
|
131
|
-
|