nokogumbo 0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.md +46 -0
- data/Rakefile +50 -0
- data/ext/extconf.rb +3 -0
- data/ext/nokogumbo.c +131 -0
- data/lib/nokogumbo.rb +166 -0
- metadata +68 -0
data/README.md
ADDED
@@ -0,0 +1,46 @@
|
|
1
|
+
Nokogumbo - a Nokogiri interface to the Gumbo HTML5 parser.
|
2
|
+
===========
|
3
|
+
|
4
|
+
At the moment, this is a proof of concept, allowing a Ruby program to invoke
|
5
|
+
the Gumbo HTML5 parser and access the result as a Nokogiri parsed document.
|
6
|
+
|
7
|
+
Usage:
|
8
|
+
-----
|
9
|
+
|
10
|
+
```ruby
|
11
|
+
require 'nokogumbo'
|
12
|
+
doc = Nokogiri::HTML5(string)
|
13
|
+
```
|
14
|
+
|
15
|
+
Notes:
|
16
|
+
-----
|
17
|
+
|
18
|
+
* The `Nokogumbo.parse` function takes a string and passes it to the
|
19
|
+
<code>gumbo_parse_with_options</code> method, using the default options.
|
20
|
+
The resulting Gumbo parse tree is then walked, producing a Nokogiri parse tree.
|
21
|
+
The original Gumbo parse tree is then destroyed, and the Nokogiri parse tree
|
22
|
+
is returned.
|
23
|
+
|
24
|
+
* Instead of uppercase element names, lowercase element names are produced.
|
25
|
+
|
26
|
+
* Instead of returning 'unknown' as the element name for unknown tags, the
|
27
|
+
original tag name is returned verbatim.
|
28
|
+
|
29
|
+
* Nothing meaningful is done with the `GumboDocument` struct, i.e., no
|
30
|
+
Nokogiri `EntityDecl` is produced.
|
31
|
+
|
32
|
+
Installation:
|
33
|
+
============
|
34
|
+
|
35
|
+
* Build and install the
|
36
|
+
[gumbo-parser](https://github.com/google/gumbo-parser#readme) C library
|
37
|
+
|
38
|
+
* Execute `rake gem`
|
39
|
+
|
40
|
+
* Execute `[sudo] gem install pkg/nokogumbo*.gem`
|
41
|
+
|
42
|
+
Related efforts:
|
43
|
+
============
|
44
|
+
|
45
|
+
* [ruby-gumbo](https://github.com/galdor/ruby-gumbo#readme) - a ruby binding
|
46
|
+
for the Gumbo HTML5 parser.
|
data/Rakefile
ADDED
@@ -0,0 +1,50 @@
|
|
1
|
+
require 'rubygems/package_task'
require 'rake/clean'

task 'default' => 'test'

# Generate the extension Makefile.  The file task is named after the path
# that extconf.rb actually writes (ext/Makefile) so that rake can tell when
# it is already up to date instead of re-running extconf on every build.
file 'ext/Makefile' => 'ext/extconf.rb' do
  Dir.chdir 'ext' do
    ruby 'extconf.rb'
  end
end

# Keep the previous task name working for anyone invoking `rake Makefile`.
task 'Makefile' => 'ext/Makefile'

# Compile the extension, then run the test script.
task 'test' => 'ext/Makefile' do
  Dir.chdir 'ext' do
    sh 'make -s'
  end
  ruby 'test-nokogumbo.rb'
end

CLEAN.include('ext/*.o', 'ext/*.so', 'ext/*.log', 'ext/Makefile', 'pkg')

# Files shipped in the gem.
MANIFEST = %w(
  ext/extconf.rb
  ext/nokogumbo.c
  lib/nokogumbo.rb
  Rakefile
  README.md
)

SPEC = Gem::Specification.new do |gem|
  gem.name = 'nokogumbo'
  gem.version = '0.1'
  gem.email = 'rubys@intertwingly.net'
  gem.homepage = 'https://github.com/rubys/nokogumbo/tree/master/ruby#readme'
  gem.summary = 'Nokogiri interface to the Gumbo HTML5 parser'
  gem.files = MANIFEST
  gem.extensions = 'ext/extconf.rb'
  gem.author = 'Sam Ruby'
  gem.add_dependency 'nokogiri'
  gem.license = 'MIT'
  # The multi-line text is collapsed to a single line of single-spaced words.
  gem.description = %q(
    At the moment, this is a proof of concept, allowing a Ruby
    program to invoke the Gumbo HTML5 parser and access the result as a Nokogiri
    parsed document.).strip.gsub(/\s+/, ' ')
end

# Packaging only happens after the tests pass.
task 'gem' => 'test'
Gem::PackageTask.new(SPEC) do |pkg|
  pkg.need_tar = true
  pkg.need_zip = true
end
|
data/ext/extconf.rb
ADDED
data/ext/nokogumbo.c
ADDED
@@ -0,0 +1,131 @@
|
|
1
|
+
#include "ruby.h"
|
2
|
+
#include "gumbo.h"
|
3
|
+
|
4
|
+
// class constants
|
5
|
+
static VALUE Nokogiri;
|
6
|
+
static VALUE HTML;
|
7
|
+
static VALUE XML;
|
8
|
+
static VALUE Document;
|
9
|
+
static VALUE Element;
|
10
|
+
static VALUE Text;
|
11
|
+
static VALUE CDATA;
|
12
|
+
static VALUE Comment;
|
13
|
+
static VALUE TAGS=0;
|
14
|
+
static int Unknown=0;
|
15
|
+
|
16
|
+
// interned symbols
|
17
|
+
static VALUE new;
|
18
|
+
static VALUE set_attribute;
|
19
|
+
static VALUE add_child;
|
20
|
+
|
21
|
+
// determine tag name for a given node
|
22
|
+
static VALUE _name(GumboElement *node) {
|
23
|
+
if (!TAGS) {
|
24
|
+
// Deferred initialization of "Unknown" as the GumboParser class is
|
25
|
+
// defined *after* the Nokogumbo class is.
|
26
|
+
VALUE HTML5 = rb_const_get(Nokogiri, rb_intern("HTML5"));
|
27
|
+
TAGS = rb_const_get(HTML5, rb_intern("TAGS"));
|
28
|
+
Unknown = NUM2INT(rb_const_get(HTML5, rb_intern("Unknown")));
|
29
|
+
}
|
30
|
+
|
31
|
+
if (node->tag != Unknown) {
|
32
|
+
return rb_ary_entry(TAGS, (long) node->tag);
|
33
|
+
} else {
|
34
|
+
// Gumbo doesn't provide unknown tags, so we need to parse it ourselves:
|
35
|
+
// http://www.w3.org/html/wg/drafts/html/CR/syntax.html#tag-name-state
|
36
|
+
GumboStringPiece *tag = &node->original_tag;
|
37
|
+
int length;
|
38
|
+
for (length = 1; length < tag->length-1; length++) {
|
39
|
+
if (strchr(" \t\r\n<", *((char*)tag->data+length))) break;
|
40
|
+
}
|
41
|
+
return rb_str_new(1+(char *)tag->data, length-1);
|
42
|
+
}
|
43
|
+
}
|
44
|
+
|
45
|
+
// Build a Nokogiri Element for a given GumboElement (recursively)
|
46
|
+
static VALUE _element(VALUE document, GumboElement *node) {
|
47
|
+
int i;
|
48
|
+
VALUE element = rb_funcall(Element, new, 2, _name(node), document);
|
49
|
+
|
50
|
+
// add in the attributes
|
51
|
+
GumboVector* attrs = &node->attributes;
|
52
|
+
for (i=0; i < attrs->length; i++) {
|
53
|
+
GumboAttribute *attr = attrs->data[i];
|
54
|
+
VALUE name = rb_str_new2(attr->name);
|
55
|
+
rb_funcall(element, set_attribute, 2, name, rb_str_new2(attr->value));
|
56
|
+
}
|
57
|
+
|
58
|
+
// add in the children
|
59
|
+
GumboVector* children = &node->children;
|
60
|
+
for (i=0; i < children->length; i++) {
|
61
|
+
GumboNode* child = children->data[i];
|
62
|
+
|
63
|
+
VALUE node = 0;
|
64
|
+
VALUE text;
|
65
|
+
|
66
|
+
switch (child->type) {
|
67
|
+
case GUMBO_NODE_ELEMENT:
|
68
|
+
node = _element(document, &child->v.element);
|
69
|
+
break;
|
70
|
+
case GUMBO_NODE_WHITESPACE:
|
71
|
+
case GUMBO_NODE_TEXT:
|
72
|
+
text = rb_str_new2(child->v.text.text);
|
73
|
+
node = rb_funcall(Text, new, 2, text, document);
|
74
|
+
break;
|
75
|
+
case GUMBO_NODE_CDATA:
|
76
|
+
text = rb_str_new2(child->v.text.text);
|
77
|
+
node = rb_funcall(CDATA, new, 2, text, document);
|
78
|
+
break;
|
79
|
+
case GUMBO_NODE_COMMENT:
|
80
|
+
text = rb_str_new2(child->v.text.text);
|
81
|
+
node = rb_funcall(Comment, new, 2, document, text);
|
82
|
+
break;
|
83
|
+
case GUMBO_NODE_DOCUMENT:
|
84
|
+
break; // should never happen -- ignore
|
85
|
+
}
|
86
|
+
|
87
|
+
if (node) rb_funcall(element, add_child, 1, node);
|
88
|
+
}
|
89
|
+
|
90
|
+
return element;
|
91
|
+
}
|
92
|
+
|
93
|
+
// Parse a string using gumbo_parse into a Nokogiri document
|
94
|
+
static VALUE t_parse(VALUE self, VALUE string) {
|
95
|
+
VALUE document = rb_funcall(Document, new, 0);
|
96
|
+
|
97
|
+
GumboOutput *output = gumbo_parse_with_options(
|
98
|
+
&kGumboDefaultOptions, RSTRING_PTR(string), RSTRING_LEN(string)
|
99
|
+
);
|
100
|
+
VALUE root = _element(document, (GumboElement*)&output->root->v.element);
|
101
|
+
rb_funcall(document, add_child, 1, root);
|
102
|
+
gumbo_destroy_output(&kGumboDefaultOptions, output);
|
103
|
+
|
104
|
+
return document;
|
105
|
+
}
|
106
|
+
|
107
|
+
// Initialize the Nokogumbo class and fetch constants we will use later
|
108
|
+
void Init_nokogumboc() {
|
109
|
+
rb_funcall(rb_mKernel, rb_intern("gem"), 1, rb_str_new2("nokogiri"));
|
110
|
+
rb_require("nokogiri");
|
111
|
+
|
112
|
+
// class constants
|
113
|
+
Nokogiri = rb_const_get(rb_cObject, rb_intern("Nokogiri"));
|
114
|
+
HTML = rb_const_get(Nokogiri, rb_intern("HTML"));
|
115
|
+
XML = rb_const_get(Nokogiri, rb_intern("XML"));
|
116
|
+
Document = rb_const_get(HTML, rb_intern("Document"));
|
117
|
+
Element = rb_const_get(XML, rb_intern("Element"));
|
118
|
+
Text = rb_const_get(XML, rb_intern("Text"));
|
119
|
+
CDATA = rb_const_get(XML, rb_intern("CDATA"));
|
120
|
+
Comment = rb_const_get(XML, rb_intern("Comment"));
|
121
|
+
|
122
|
+
// interned symbols
|
123
|
+
new = rb_intern("new");
|
124
|
+
set_attribute = rb_intern("set_attribute");
|
125
|
+
add_child = rb_intern("add_child");
|
126
|
+
|
127
|
+
// define Nokogumbo class with a singleton parse method
|
128
|
+
VALUE Gumbo = rb_define_class("Nokogumbo", rb_cObject);
|
129
|
+
rb_define_singleton_method(Gumbo, "parse", t_parse, 1);
|
130
|
+
}
|
131
|
+
|
data/lib/nokogumbo.rb
ADDED
@@ -0,0 +1,166 @@
|
|
1
|
+
require 'nokogiri'
|
2
|
+
require 'nokogumboc'
|
3
|
+
|
4
|
+
module Nokogiri
  # Parse an HTML5 string with the Gumbo parser; returns whatever
  # Nokogumbo.parse returns (a Nokogiri document).
  def self.HTML5(string)
    Nokogumbo.parse(string)
  end

  module HTML5
    # Element names indexed by Gumbo tag number; the order must match the
    # order of Gumbo's tag enum.  Names are written in upper case with "_"
    # where the element name has "-", and are converted to the real
    # lower-case element names below (ANNOTATION_XML => "annotation-xml").
    TAGS = %w(
      HTML HEAD TITLE BASE LINK META STYLE SCRIPT NOSCRIPT BODY
      SECTION NAV ARTICLE ASIDE H1 H2 H3 H4 H5 H6
      HGROUP HEADER FOOTER ADDRESS P HR PRE BLOCKQUOTE OL UL
      LI DL DT DD FIGURE FIGCAPTION DIV A EM STRONG
      SMALL S CITE Q DFN ABBR TIME CODE VAR SAMP
      KBD SUB SUP I B MARK RUBY RT RP BDI
      BDO SPAN BR WBR INS DEL IMAGE IMG IFRAME EMBED
      OBJECT PARAM VIDEO AUDIO SOURCE TRACK CANVAS MAP AREA MATH
      MI MO MN MS MTEXT MGLYPH MALIGNMARK ANNOTATION_XML SVG FOREIGNOBJECT
      DESC TABLE CAPTION COLGROUP COL TBODY THEAD TFOOT TR TD
      TH FORM FIELDSET LEGEND LABEL INPUT BUTTON SELECT DATALIST OPTGROUP
      OPTION TEXTAREA KEYGEN OUTPUT PROGRESS METER DETAILS SUMMARY COMMAND MENU
      APPLET ACRONYM BGSOUND DIR FRAME FRAMESET NOFRAMES ISINDEX LISTING XMP
      NEXTID NOEMBED PLAINTEXT RB STRIKE BASEFONT BIG BLINK CENTER FONT
      MARQUEE MULTICOL NOBR SPACER TT U UNKNOWN
    ).map { |tag| tag.downcase.tr('_', '-').freeze }.freeze

    # Index of the placeholder entry used for tags Gumbo does not know.
    Unknown = TAGS.length - 1

    # Parse an HTML5 string: Nokogiri::HTML5.parse(string).
    def self.parse(string)
      Nokogumbo.parse(string)
    end

    # Same as HTML5.parse, kept for code that includes this module.
    def parse(string)
      HTML5.parse(string)
    end
  end
end
|
metadata
ADDED
@@ -0,0 +1,68 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: nokogumbo
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: '0.1'
|
5
|
+
prerelease:
|
6
|
+
platform: ruby
|
7
|
+
authors:
|
8
|
+
- Sam Ruby
|
9
|
+
autorequire:
|
10
|
+
bindir: bin
|
11
|
+
cert_chain: []
|
12
|
+
date: 2013-08-17 00:00:00.000000000 Z
|
13
|
+
dependencies:
|
14
|
+
- !ruby/object:Gem::Dependency
|
15
|
+
name: nokogiri
|
16
|
+
requirement: !ruby/object:Gem::Requirement
|
17
|
+
none: false
|
18
|
+
requirements:
|
19
|
+
- - ! '>='
|
20
|
+
- !ruby/object:Gem::Version
|
21
|
+
version: '0'
|
22
|
+
type: :runtime
|
23
|
+
prerelease: false
|
24
|
+
version_requirements: !ruby/object:Gem::Requirement
|
25
|
+
none: false
|
26
|
+
requirements:
|
27
|
+
- - ! '>='
|
28
|
+
- !ruby/object:Gem::Version
|
29
|
+
version: '0'
|
30
|
+
description: At the moment, this is a proof of concept, allowing a Ruby program to
|
31
|
+
invoke the Gumbo HTML5 parser and access the result as a Nokogiri parsed document.
|
32
|
+
email: rubys@intertwingly.net
|
33
|
+
executables: []
|
34
|
+
extensions:
|
35
|
+
- ext/extconf.rb
|
36
|
+
extra_rdoc_files: []
|
37
|
+
files:
|
38
|
+
- ext/extconf.rb
|
39
|
+
- ext/nokogumbo.c
|
40
|
+
- lib/nokogumbo.rb
|
41
|
+
- Rakefile
|
42
|
+
- README.md
|
43
|
+
homepage: https://github.com/rubys/nokogumbo/tree/master/ruby#readme
|
44
|
+
licenses:
|
45
|
+
- MIT
|
46
|
+
post_install_message:
|
47
|
+
rdoc_options: []
|
48
|
+
require_paths:
|
49
|
+
- lib
|
50
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
51
|
+
none: false
|
52
|
+
requirements:
|
53
|
+
- - ! '>='
|
54
|
+
- !ruby/object:Gem::Version
|
55
|
+
version: '0'
|
56
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
57
|
+
none: false
|
58
|
+
requirements:
|
59
|
+
- - ! '>='
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: '0'
|
62
|
+
requirements: []
|
63
|
+
rubyforge_project:
|
64
|
+
rubygems_version: 1.8.23
|
65
|
+
signing_key:
|
66
|
+
specification_version: 3
|
67
|
+
summary: Nokogiri interface to the Gumbo HTML5 parser
|
68
|
+
test_files: []
|