nokogumbo 0.1
Sign up to get free protection for your applications and to get access to all the features.
- data/README.md +46 -0
- data/Rakefile +50 -0
- data/ext/extconf.rb +3 -0
- data/ext/nokogumbo.c +131 -0
- data/lib/nokogumbo.rb +166 -0
- metadata +68 -0
data/README.md
ADDED
@@ -0,0 +1,46 @@
|
|
1
|
+
Nokogumbo - a Nokogiri interface to the Gumbo HTML5 parser.
|
2
|
+
===========
|
3
|
+
|
4
|
+
At the moment, this is a proof of concept, allowing a Ruby program to invoke
|
5
|
+
the Gumbo HTML5 parser and access the result as a Nokogiri parsed document.
|
6
|
+
|
7
|
+
Usage:
|
8
|
+
-----
|
9
|
+
|
10
|
+
```ruby
|
11
|
+
require 'nokogumbo'
|
12
|
+
doc = Nokogiri::HTML5(string)
|
13
|
+
```
|
14
|
+
|
15
|
+
Notes:
|
16
|
+
-----
|
17
|
+
|
18
|
+
* The `Nokogumbo.parse` function takes a string and passes it to the
|
19
|
+
<code>gumbo_parse_with_options</code> method, using the default options.
|
20
|
+
The resulting Gumbo parse tree is then walked, producing a Nokogiri parse tree.
|
21
|
+
The original Gumbo parse tree is then destroyed, and the Nokogiri parse tree
|
22
|
+
is returned.
|
23
|
+
|
24
|
+
* Instead of uppercase element names, lowercase element names are produced.
|
25
|
+
|
26
|
+
* Instead of returning 'unknown' as the element name for unknown tags, the
|
27
|
+
original tag name is returned verbatim.
|
28
|
+
|
29
|
+
* Nothing meaningful is done with the `GumboDocument` struct, i.e., no
|
30
|
+
Nokogiri `EntityDecl` is produced.
|
31
|
+
|
32
|
+
Installation:
|
33
|
+
============
|
34
|
+
|
35
|
+
* Build and install the
|
36
|
+
[gumbo-parser](https://github.com/google/gumbo-parser#readme) C library
|
37
|
+
|
38
|
+
* Execute `rake gem`
|
39
|
+
|
40
|
+
* Execute `[sudo] gem install pkg/nokogumbo*.gem`
|
41
|
+
|
42
|
+
Related efforts:
|
43
|
+
============
|
44
|
+
|
45
|
+
* [ruby-gumbo](https://github.com/galdor/ruby-gumbo#readme) - a ruby binding
|
46
|
+
for the Gumbo HTML5 parser.
|
data/Rakefile
ADDED
@@ -0,0 +1,50 @@
|
|
1
|
+
require 'rubygems/package_task'
require 'rake/clean'

# Plain `rake` builds the extension and runs the tests.
task 'default' => 'test'

# Generate the extension Makefile by running extconf.rb from inside ext/.
# NOTE(review): extconf.rb runs in ext/, so the file it produces is presumably
# ext/Makefile rather than ./Makefile; if so this file task is never
# satisfied and re-runs on every invocation -- confirm before changing.
file 'Makefile' => 'ext/extconf.rb' do
  Dir.chdir('ext') { ruby 'extconf.rb' }
end

# Compile the C extension quietly, then run the test script.
task 'test' => 'Makefile' do
  Dir.chdir('ext') { sh 'make -s' }
  ruby 'test-nokogumbo.rb'
end

# Build products removed by `rake clean`.
CLEAN.include('ext/*.o', 'ext/*.so', 'ext/*.log', 'ext/Makefile', 'pkg')

# Files shipped in the gem.
MANIFEST = %w[
  ext/extconf.rb
  ext/nokogumbo.c
  lib/nokogumbo.rb
  Rakefile
  README.md
]

SPEC = Gem::Specification.new do |gem|
  gem.name        = 'nokogumbo'
  gem.version     = '0.1'
  gem.author      = 'Sam Ruby'
  gem.email       = 'rubys@intertwingly.net'
  gem.homepage    = 'https://github.com/rubys/nokogumbo/tree/master/ruby#readme'
  gem.license     = 'MIT'
  gem.summary     = 'Nokogiri interface to the Gumbo HTML5 parser'
  gem.description =
    'At the moment, this is a proof of concept, allowing a Ruby ' \
    'program to invoke the Gumbo HTML5 parser and access the result as a ' \
    'Nokogiri parsed document.'
  gem.files       = MANIFEST
  gem.extensions  = 'ext/extconf.rb'
  gem.add_dependency 'nokogiri'
end

# Packaging runs the tests first; produce .gem, .tgz and .zip archives.
task 'gem' => 'test'
Gem::PackageTask.new(SPEC) do |pkg|
  pkg.need_tar = true
  pkg.need_zip = true
end
|
data/ext/extconf.rb
ADDED
data/ext/nokogumbo.c
ADDED
@@ -0,0 +1,131 @@
|
|
1
|
+
#include "ruby.h"
|
2
|
+
#include "gumbo.h"
|
3
|
+
|
4
|
+
// class constants
|
5
|
+
static VALUE Nokogiri;
|
6
|
+
static VALUE HTML;
|
7
|
+
static VALUE XML;
|
8
|
+
static VALUE Document;
|
9
|
+
static VALUE Element;
|
10
|
+
static VALUE Text;
|
11
|
+
static VALUE CDATA;
|
12
|
+
static VALUE Comment;
|
13
|
+
static VALUE TAGS=0;
|
14
|
+
static int Unknown=0;
|
15
|
+
|
16
|
+
// interned symbols
|
17
|
+
static VALUE new;
|
18
|
+
static VALUE set_attribute;
|
19
|
+
static VALUE add_child;
|
20
|
+
|
21
|
+
// determine tag name for a given node
|
22
|
+
static VALUE _name(GumboElement *node) {
|
23
|
+
if (!TAGS) {
|
24
|
+
// Deferred initialization of "Unknown" as the GumboParser class is
|
25
|
+
// defined *after* the Nokogumbo class is.
|
26
|
+
VALUE HTML5 = rb_const_get(Nokogiri, rb_intern("HTML5"));
|
27
|
+
TAGS = rb_const_get(HTML5, rb_intern("TAGS"));
|
28
|
+
Unknown = NUM2INT(rb_const_get(HTML5, rb_intern("Unknown")));
|
29
|
+
}
|
30
|
+
|
31
|
+
if (node->tag != Unknown) {
|
32
|
+
return rb_ary_entry(TAGS, (long) node->tag);
|
33
|
+
} else {
|
34
|
+
// Gumbo doesn't provide unknown tags, so we need to parse it ourselves:
|
35
|
+
// http://www.w3.org/html/wg/drafts/html/CR/syntax.html#tag-name-state
|
36
|
+
GumboStringPiece *tag = &node->original_tag;
|
37
|
+
int length;
|
38
|
+
for (length = 1; length < tag->length-1; length++) {
|
39
|
+
if (strchr(" \t\r\n<", *((char*)tag->data+length))) break;
|
40
|
+
}
|
41
|
+
return rb_str_new(1+(char *)tag->data, length-1);
|
42
|
+
}
|
43
|
+
}
|
44
|
+
|
45
|
+
// Build a Nokogiri Element for a given GumboElement (recursively)
|
46
|
+
static VALUE _element(VALUE document, GumboElement *node) {
|
47
|
+
int i;
|
48
|
+
VALUE element = rb_funcall(Element, new, 2, _name(node), document);
|
49
|
+
|
50
|
+
// add in the attributes
|
51
|
+
GumboVector* attrs = &node->attributes;
|
52
|
+
for (i=0; i < attrs->length; i++) {
|
53
|
+
GumboAttribute *attr = attrs->data[i];
|
54
|
+
VALUE name = rb_str_new2(attr->name);
|
55
|
+
rb_funcall(element, set_attribute, 2, name, rb_str_new2(attr->value));
|
56
|
+
}
|
57
|
+
|
58
|
+
// add in the children
|
59
|
+
GumboVector* children = &node->children;
|
60
|
+
for (i=0; i < children->length; i++) {
|
61
|
+
GumboNode* child = children->data[i];
|
62
|
+
|
63
|
+
VALUE node = 0;
|
64
|
+
VALUE text;
|
65
|
+
|
66
|
+
switch (child->type) {
|
67
|
+
case GUMBO_NODE_ELEMENT:
|
68
|
+
node = _element(document, &child->v.element);
|
69
|
+
break;
|
70
|
+
case GUMBO_NODE_WHITESPACE:
|
71
|
+
case GUMBO_NODE_TEXT:
|
72
|
+
text = rb_str_new2(child->v.text.text);
|
73
|
+
node = rb_funcall(Text, new, 2, text, document);
|
74
|
+
break;
|
75
|
+
case GUMBO_NODE_CDATA:
|
76
|
+
text = rb_str_new2(child->v.text.text);
|
77
|
+
node = rb_funcall(CDATA, new, 2, text, document);
|
78
|
+
break;
|
79
|
+
case GUMBO_NODE_COMMENT:
|
80
|
+
text = rb_str_new2(child->v.text.text);
|
81
|
+
node = rb_funcall(Comment, new, 2, document, text);
|
82
|
+
break;
|
83
|
+
case GUMBO_NODE_DOCUMENT:
|
84
|
+
break; // should never happen -- ignore
|
85
|
+
}
|
86
|
+
|
87
|
+
if (node) rb_funcall(element, add_child, 1, node);
|
88
|
+
}
|
89
|
+
|
90
|
+
return element;
|
91
|
+
}
|
92
|
+
|
93
|
+
// Parse a string using gumbo_parse into a Nokogiri document
|
94
|
+
static VALUE t_parse(VALUE self, VALUE string) {
|
95
|
+
VALUE document = rb_funcall(Document, new, 0);
|
96
|
+
|
97
|
+
GumboOutput *output = gumbo_parse_with_options(
|
98
|
+
&kGumboDefaultOptions, RSTRING_PTR(string), RSTRING_LEN(string)
|
99
|
+
);
|
100
|
+
VALUE root = _element(document, (GumboElement*)&output->root->v.element);
|
101
|
+
rb_funcall(document, add_child, 1, root);
|
102
|
+
gumbo_destroy_output(&kGumboDefaultOptions, output);
|
103
|
+
|
104
|
+
return document;
|
105
|
+
}
|
106
|
+
|
107
|
+
// Initialize the Nokogumbo class and fetch constants we will use later
|
108
|
+
void Init_nokogumboc() {
|
109
|
+
rb_funcall(rb_mKernel, rb_intern("gem"), 1, rb_str_new2("nokogiri"));
|
110
|
+
rb_require("nokogiri");
|
111
|
+
|
112
|
+
// class constants
|
113
|
+
Nokogiri = rb_const_get(rb_cObject, rb_intern("Nokogiri"));
|
114
|
+
HTML = rb_const_get(Nokogiri, rb_intern("HTML"));
|
115
|
+
XML = rb_const_get(Nokogiri, rb_intern("XML"));
|
116
|
+
Document = rb_const_get(HTML, rb_intern("Document"));
|
117
|
+
Element = rb_const_get(XML, rb_intern("Element"));
|
118
|
+
Text = rb_const_get(XML, rb_intern("Text"));
|
119
|
+
CDATA = rb_const_get(XML, rb_intern("CDATA"));
|
120
|
+
Comment = rb_const_get(XML, rb_intern("Comment"));
|
121
|
+
|
122
|
+
// interned symbols
|
123
|
+
new = rb_intern("new");
|
124
|
+
set_attribute = rb_intern("set_attribute");
|
125
|
+
add_child = rb_intern("add_child");
|
126
|
+
|
127
|
+
// define Nokogumbo class with a singleton parse method
|
128
|
+
VALUE Gumbo = rb_define_class("Nokogumbo", rb_cObject);
|
129
|
+
rb_define_singleton_method(Gumbo, "parse", t_parse, 1);
|
130
|
+
}
|
131
|
+
|
data/lib/nokogumbo.rb
ADDED
@@ -0,0 +1,166 @@
|
|
1
|
+
require 'nokogiri'
|
2
|
+
require 'nokogumboc'
|
3
|
+
|
4
|
+
module Nokogiri
  # Parse +string+ as HTML5 with the Gumbo parser and return the resulting
  # Nokogiri document.
  def self.HTML5(string)
    Nokogumbo.parse(string)
  end

  module HTML5
    # Element names indexed by Gumbo tag number; the C extension looks names
    # up by position, so the ORDER of this list must not change.  Entries are
    # written in upper case and lowercased (and frozen) below.
    TAGS = %w[
      HTML HEAD TITLE BASE LINK META STYLE SCRIPT NOSCRIPT BODY
      SECTION NAV ARTICLE ASIDE H1 H2 H3 H4 H5 H6
      HGROUP HEADER FOOTER ADDRESS P HR PRE BLOCKQUOTE OL UL
      LI DL DT DD FIGURE FIGCAPTION DIV A EM STRONG
      SMALL S CITE Q DFN ABBR TIME CODE VAR SAMP
      KBD SUB SUP I B MARK RUBY RT RP BDI
      BDO SPAN BR WBR INS DEL IMAGE IMG IFRAME EMBED
      OBJECT PARAM VIDEO AUDIO SOURCE TRACK CANVAS MAP AREA MATH
      MI MO MN MS MTEXT MGLYPH MALIGNMARK ANNOTATION-XML SVG FOREIGNOBJECT
      DESC TABLE CAPTION COLGROUP COL TBODY THEAD TFOOT TR TD
      TH FORM FIELDSET LEGEND LABEL INPUT BUTTON SELECT DATALIST OPTGROUP
      OPTION TEXTAREA KEYGEN OUTPUT PROGRESS METER DETAILS SUMMARY COMMAND MENU
      APPLET ACRONYM BGSOUND DIR FRAME FRAMESET NOFRAMES ISINDEX LISTING XMP
      NEXTID NOEMBED PLAINTEXT RB STRIKE BASEFONT BIG BLINK CENTER FONT
      MARQUEE MULTICOL NOBR SPACER TT U UNKNOWN
    ].map(&:downcase).map(&:freeze).freeze
    # The MathML element is named "annotation-xml" (hyphen); the enum-style
    # spelling ANNOTATION_XML would have produced the invalid "annotation_xml".

    # Index of the catch-all 'unknown' entry (always the last one).
    Unknown = TAGS.length - 1

    # Parse +string+ as HTML5; callable as Nokogiri::HTML5.parse(string).
    def self.parse(string)
      Nokogumbo.parse(string)
    end

    # Instance-method form, kept for code that mixes this module in.
    def parse(string)
      HTML5.parse(string)
    end
  end
end
|
metadata
ADDED
@@ -0,0 +1,68 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: nokogumbo
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: '0.1'
|
5
|
+
prerelease:
|
6
|
+
platform: ruby
|
7
|
+
authors:
|
8
|
+
- Sam Ruby
|
9
|
+
autorequire:
|
10
|
+
bindir: bin
|
11
|
+
cert_chain: []
|
12
|
+
date: 2013-08-17 00:00:00.000000000 Z
|
13
|
+
dependencies:
|
14
|
+
- !ruby/object:Gem::Dependency
|
15
|
+
name: nokogiri
|
16
|
+
requirement: !ruby/object:Gem::Requirement
|
17
|
+
none: false
|
18
|
+
requirements:
|
19
|
+
- - ! '>='
|
20
|
+
- !ruby/object:Gem::Version
|
21
|
+
version: '0'
|
22
|
+
type: :runtime
|
23
|
+
prerelease: false
|
24
|
+
version_requirements: !ruby/object:Gem::Requirement
|
25
|
+
none: false
|
26
|
+
requirements:
|
27
|
+
- - ! '>='
|
28
|
+
- !ruby/object:Gem::Version
|
29
|
+
version: '0'
|
30
|
+
description: At the moment, this is a proof of concept, allowing a Ruby program to
|
31
|
+
invoke the Gumbo HTML5 parser and access the result as a Nokogiri parsed document.
|
32
|
+
email: rubys@intertwingly.net
|
33
|
+
executables: []
|
34
|
+
extensions:
|
35
|
+
- ext/extconf.rb
|
36
|
+
extra_rdoc_files: []
|
37
|
+
files:
|
38
|
+
- ext/extconf.rb
|
39
|
+
- ext/nokogumbo.c
|
40
|
+
- lib/nokogumbo.rb
|
41
|
+
- Rakefile
|
42
|
+
- README.md
|
43
|
+
homepage: https://github.com/rubys/nokogumbo/tree/master/ruby#readme
|
44
|
+
licenses:
|
45
|
+
- MIT
|
46
|
+
post_install_message:
|
47
|
+
rdoc_options: []
|
48
|
+
require_paths:
|
49
|
+
- lib
|
50
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
51
|
+
none: false
|
52
|
+
requirements:
|
53
|
+
- - ! '>='
|
54
|
+
- !ruby/object:Gem::Version
|
55
|
+
version: '0'
|
56
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
57
|
+
none: false
|
58
|
+
requirements:
|
59
|
+
- - ! '>='
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: '0'
|
62
|
+
requirements: []
|
63
|
+
rubyforge_project:
|
64
|
+
rubygems_version: 1.8.23
|
65
|
+
signing_key:
|
66
|
+
specification_version: 3
|
67
|
+
summary: Nokogiri interface to the Gumbo HTML5 parser
|
68
|
+
test_files: []
|