nokogumbo 0.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (6) hide show
  1. data/README.md +46 -0
  2. data/Rakefile +50 -0
  3. data/ext/extconf.rb +3 -0
  4. data/ext/nokogumbo.c +131 -0
  5. data/lib/nokogumbo.rb +166 -0
  6. metadata +68 -0
data/README.md ADDED
@@ -0,0 +1,46 @@
1
+ Nokogumbo - a Nokogiri interface to the Gumbo HTML5 parser.
2
+ ===========
3
+
4
+ At the moment, this is a proof of concept, allowing a Ruby program to invoke
5
+ the Gumbo HTML5 parser and access the result as a Nokogiri parsed document.
6
+
7
+ Usage:
8
+ -----
9
+
10
+ ```ruby
11
+ require 'nokogumbo'
12
+ doc = Nokogiri::HTML5(string)
13
+ ```
14
+
15
+ Notes:
16
+ -----
17
+
18
+ * The `Nokogumbo.parse` function takes a string and passes it to the
19
+ `gumbo_parse_with_options` function, using the default options.
20
+ The resulting Gumbo parse tree is then walked, producing a Nokogiri parse tree.
21
+ The original Gumbo parse tree is then destroyed, and the Nokogiri parse tree
22
+ is returned.
23
+
24
+ * Instead of uppercase element names, lowercase element names are produced.
25
+
26
+ * Instead of returning 'unknown' as the element name for unknown tags, the
27
+ original tag name is returned verbatim.
28
+
29
+ * Nothing meaningful is done with the `GumboDocument` struct, i.e., no
30
+ Nokogiri `EntityDecl` is produced.
31
+
32
+ Installation:
33
+ ============
34
+
35
+ * Build and install the
36
+ [gumbo-parser](https://github.com/google/gumbo-parser#readme) C library
37
+
38
+ * Execute `rake gem`
39
+
40
+ * Execute `[sudo] gem install pkg/nokogumbo*.gem`
41
+
42
+ Related efforts:
43
+ ============
44
+
45
+ * [ruby-gumbo](https://github.com/galdor/ruby-gumbo#readme) - a ruby binding
46
+ for the Gumbo HTML5 parser.
data/Rakefile ADDED
@@ -0,0 +1,50 @@
1
require 'rubygems/package_task'
require 'rake/clean'

# Plain `rake` builds the extension and runs the test script.
task 'default' => 'test'

# extconf.rb is run from inside ext/, so the Makefile it generates lives at
# ext/Makefile.  The file task has to track that path: a task named plain
# 'Makefile' points at a file that is never created, which makes rake rerun
# extconf.rb on every invocation.
file 'ext/Makefile' => 'ext/extconf.rb' do
  Dir.chdir 'ext' do
    ruby 'extconf.rb'
  end
end

# Compile the C extension, then run the test script from the project root.
task 'test' => 'ext/Makefile' do
  Dir.chdir 'ext' do
    sh 'make -s'
  end
  ruby 'test-nokogumbo.rb'
end

CLEAN.include('ext/*.o', 'ext/*.so', 'ext/*.log', 'ext/Makefile', 'pkg')

# Files packaged into the gem.
MANIFEST = %w(
  ext/extconf.rb
  ext/nokogumbo.c
  lib/nokogumbo.rb
  Rakefile
  README.md
)

SPEC = Gem::Specification.new do |gem|
  gem.name = 'nokogumbo'
  gem.version = '0.1'
  gem.email = 'rubys@intertwingly.net'
  gem.homepage = 'https://github.com/rubys/nokogumbo/tree/master/ruby#readme'
  gem.summary = 'Nokogiri interface to the Gumbo HTML5 parser'
  gem.files = MANIFEST
  gem.extensions = 'ext/extconf.rb'
  gem.author = 'Sam Ruby'
  gem.add_dependency 'nokogiri'
  gem.license = 'MIT'
  # Written as a wrapped paragraph; runs of whitespace are collapsed so the
  # packaged description is a single line.
  gem.description = %q(
    At the moment, this is a proof of concept, allowing a Ruby
    program to invoke the Gumbo HTML5 parser and access the result as a Nokogiri
    parsed document.).strip.gsub(/\s+/, ' ')
end

# Only package a gem whose extension builds and whose tests pass.
task 'gem' => 'test'
Gem::PackageTask.new(SPEC) do |pkg|
  pkg.need_tar = true
  pkg.need_zip = true
end
data/ext/extconf.rb ADDED
@@ -0,0 +1,3 @@
1
require 'mkmf'

# Stop with a clear message when libgumbo is missing instead of writing a
# Makefile whose build would only fail later, at compile/link time.
unless have_library('gumbo', 'gumbo_parse')
  abort 'nokogumbo: the gumbo-parser library (libgumbo) was not found; ' \
        'build and install it before installing this gem'
end

create_makefile('nokogumboc')
data/ext/nokogumbo.c ADDED
@@ -0,0 +1,131 @@
1
+ #include "ruby.h"
2
+ #include "gumbo.h"
3
+
4
+ // class constants
5
+ static VALUE Nokogiri;
6
+ static VALUE HTML;
7
+ static VALUE XML;
8
+ static VALUE Document;
9
+ static VALUE Element;
10
+ static VALUE Text;
11
+ static VALUE CDATA;
12
+ static VALUE Comment;
13
+ static VALUE TAGS=0;
14
+ static int Unknown=0;
15
+
16
+ // interned symbols
17
+ static VALUE new;
18
+ static VALUE set_attribute;
19
+ static VALUE add_child;
20
+
21
+ // determine tag name for a given node
22
+ static VALUE _name(GumboElement *node) {
23
+ if (!TAGS) {
24
+ // Deferred initialization of "Unknown" as the GumboParser class is
25
+ // defined *after* the Nokogumbo class is.
26
+ VALUE HTML5 = rb_const_get(Nokogiri, rb_intern("HTML5"));
27
+ TAGS = rb_const_get(HTML5, rb_intern("TAGS"));
28
+ Unknown = NUM2INT(rb_const_get(HTML5, rb_intern("Unknown")));
29
+ }
30
+
31
+ if (node->tag != Unknown) {
32
+ return rb_ary_entry(TAGS, (long) node->tag);
33
+ } else {
34
+ // Gumbo doesn't provide unknown tags, so we need to parse it ourselves:
35
+ // http://www.w3.org/html/wg/drafts/html/CR/syntax.html#tag-name-state
36
+ GumboStringPiece *tag = &node->original_tag;
37
+ int length;
38
+ for (length = 1; length < tag->length-1; length++) {
39
+ if (strchr(" \t\r\n<", *((char*)tag->data+length))) break;
40
+ }
41
+ return rb_str_new(1+(char *)tag->data, length-1);
42
+ }
43
+ }
44
+
45
+ // Build a Nokogiri Element for a given GumboElement (recursively)
46
+ static VALUE _element(VALUE document, GumboElement *node) {
47
+ int i;
48
+ VALUE element = rb_funcall(Element, new, 2, _name(node), document);
49
+
50
+ // add in the attributes
51
+ GumboVector* attrs = &node->attributes;
52
+ for (i=0; i < attrs->length; i++) {
53
+ GumboAttribute *attr = attrs->data[i];
54
+ VALUE name = rb_str_new2(attr->name);
55
+ rb_funcall(element, set_attribute, 2, name, rb_str_new2(attr->value));
56
+ }
57
+
58
+ // add in the children
59
+ GumboVector* children = &node->children;
60
+ for (i=0; i < children->length; i++) {
61
+ GumboNode* child = children->data[i];
62
+
63
+ VALUE node = 0;
64
+ VALUE text;
65
+
66
+ switch (child->type) {
67
+ case GUMBO_NODE_ELEMENT:
68
+ node = _element(document, &child->v.element);
69
+ break;
70
+ case GUMBO_NODE_WHITESPACE:
71
+ case GUMBO_NODE_TEXT:
72
+ text = rb_str_new2(child->v.text.text);
73
+ node = rb_funcall(Text, new, 2, text, document);
74
+ break;
75
+ case GUMBO_NODE_CDATA:
76
+ text = rb_str_new2(child->v.text.text);
77
+ node = rb_funcall(CDATA, new, 2, text, document);
78
+ break;
79
+ case GUMBO_NODE_COMMENT:
80
+ text = rb_str_new2(child->v.text.text);
81
+ node = rb_funcall(Comment, new, 2, document, text);
82
+ break;
83
+ case GUMBO_NODE_DOCUMENT:
84
+ break; // should never happen -- ignore
85
+ }
86
+
87
+ if (node) rb_funcall(element, add_child, 1, node);
88
+ }
89
+
90
+ return element;
91
+ }
92
+
93
+ // Parse a string using gumbo_parse into a Nokogiri document
94
+ static VALUE t_parse(VALUE self, VALUE string) {
95
+ VALUE document = rb_funcall(Document, new, 0);
96
+
97
+ GumboOutput *output = gumbo_parse_with_options(
98
+ &kGumboDefaultOptions, RSTRING_PTR(string), RSTRING_LEN(string)
99
+ );
100
+ VALUE root = _element(document, (GumboElement*)&output->root->v.element);
101
+ rb_funcall(document, add_child, 1, root);
102
+ gumbo_destroy_output(&kGumboDefaultOptions, output);
103
+
104
+ return document;
105
+ }
106
+
107
+ // Initialize the Nokogumbo class and fetch constants we will use later
108
+ void Init_nokogumboc() {
109
+ rb_funcall(rb_mKernel, rb_intern("gem"), 1, rb_str_new2("nokogiri"));
110
+ rb_require("nokogiri");
111
+
112
+ // class constants
113
+ Nokogiri = rb_const_get(rb_cObject, rb_intern("Nokogiri"));
114
+ HTML = rb_const_get(Nokogiri, rb_intern("HTML"));
115
+ XML = rb_const_get(Nokogiri, rb_intern("XML"));
116
+ Document = rb_const_get(HTML, rb_intern("Document"));
117
+ Element = rb_const_get(XML, rb_intern("Element"));
118
+ Text = rb_const_get(XML, rb_intern("Text"));
119
+ CDATA = rb_const_get(XML, rb_intern("CDATA"));
120
+ Comment = rb_const_get(XML, rb_intern("Comment"));
121
+
122
+ // interned symbols
123
+ new = rb_intern("new");
124
+ set_attribute = rb_intern("set_attribute");
125
+ add_child = rb_intern("add_child");
126
+
127
+ // define Nokogumbo class with a singleton parse method
128
+ VALUE Gumbo = rb_define_class("Nokogumbo", rb_cObject);
129
+ rb_define_singleton_method(Gumbo, "parse", t_parse, 1);
130
+ }
131
+
data/lib/nokogumbo.rb ADDED
@@ -0,0 +1,166 @@
1
+ require 'nokogiri'
2
+ require 'nokogumboc'
3
+
4
module Nokogiri
  # Parse +string+ with the Gumbo HTML5 parser and return the resulting
  # Nokogiri document.
  def self.HTML5(string)
    Nokogumbo.parse(string)
  end

  module HTML5
    # Lowercase element names.  The C extension indexes this array with the
    # Gumbo tag number, so the order of the entries must not change.
    TAGS = %w(
      HTML HEAD TITLE BASE LINK META STYLE SCRIPT NOSCRIPT BODY
      SECTION NAV ARTICLE ASIDE H1 H2 H3 H4 H5 H6
      HGROUP HEADER FOOTER ADDRESS P HR PRE BLOCKQUOTE OL UL
      LI DL DT DD FIGURE FIGCAPTION DIV A EM STRONG
      SMALL S CITE Q DFN ABBR TIME CODE VAR SAMP
      KBD SUB SUP I B MARK RUBY RT RP BDI
      BDO SPAN BR WBR INS DEL IMAGE IMG IFRAME EMBED
      OBJECT PARAM VIDEO AUDIO SOURCE TRACK CANVAS MAP AREA MATH
      MI MO MN MS MTEXT MGLYPH MALIGNMARK ANNOTATION_XML SVG FOREIGNOBJECT
      DESC TABLE CAPTION COLGROUP COL TBODY THEAD TFOOT TR TD
      TH FORM FIELDSET LEGEND LABEL INPUT BUTTON SELECT DATALIST OPTGROUP
      OPTION TEXTAREA KEYGEN OUTPUT PROGRESS METER DETAILS SUMMARY COMMAND MENU
      APPLET ACRONYM BGSOUND DIR FRAME FRAMESET NOFRAMES ISINDEX LISTING XMP
      NEXTID NOEMBED PLAINTEXT RB STRIKE BASEFONT BIG BLINK CENTER FONT
      MARQUEE MULTICOL NOBR SPACER TT U UNKNOWN
    ).map { |tag| tag.downcase.freeze }.freeze

    # Index of the 'unknown' entry, which is always the last one.
    Unknown = TAGS.length - 1

    # Parse +string+ with the Gumbo HTML5 parser and return the resulting
    # Nokogiri document.
    def parse(string)
      Nokogumbo.parse(string)
    end

    # Without this, +parse+ is only an instance method of the module and
    # Nokogiri::HTML5.parse raises NoMethodError.  Extending keeps the
    # instance method for anything that includes the module.
    extend self
  end
end
metadata ADDED
@@ -0,0 +1,68 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: nokogumbo
3
+ version: !ruby/object:Gem::Version
4
+ version: '0.1'
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - Sam Ruby
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2013-08-17 00:00:00.000000000 Z
13
+ dependencies:
14
+ - !ruby/object:Gem::Dependency
15
+ name: nokogiri
16
+ requirement: !ruby/object:Gem::Requirement
17
+ none: false
18
+ requirements:
19
+ - - ! '>='
20
+ - !ruby/object:Gem::Version
21
+ version: '0'
22
+ type: :runtime
23
+ prerelease: false
24
+ version_requirements: !ruby/object:Gem::Requirement
25
+ none: false
26
+ requirements:
27
+ - - ! '>='
28
+ - !ruby/object:Gem::Version
29
+ version: '0'
30
+ description: At the moment, this is a proof of concept, allowing a Ruby program to
31
+ invoke the Gumbo HTML5 parser and access the result as a Nokogiri parsed document.
32
+ email: rubys@intertwingly.net
33
+ executables: []
34
+ extensions:
35
+ - ext/extconf.rb
36
+ extra_rdoc_files: []
37
+ files:
38
+ - ext/extconf.rb
39
+ - ext/nokogumbo.c
40
+ - lib/nokogumbo.rb
41
+ - Rakefile
42
+ - README.md
43
+ homepage: https://github.com/rubys/nokogumbo/tree/master/ruby#readme
44
+ licenses:
45
+ - MIT
46
+ post_install_message:
47
+ rdoc_options: []
48
+ require_paths:
49
+ - lib
50
+ required_ruby_version: !ruby/object:Gem::Requirement
51
+ none: false
52
+ requirements:
53
+ - - ! '>='
54
+ - !ruby/object:Gem::Version
55
+ version: '0'
56
+ required_rubygems_version: !ruby/object:Gem::Requirement
57
+ none: false
58
+ requirements:
59
+ - - ! '>='
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
62
+ requirements: []
63
+ rubyforge_project:
64
+ rubygems_version: 1.8.23
65
+ signing_key:
66
+ specification_version: 3
67
+ summary: Nokogiri interface to the Gumbo HTML5 parser
68
+ test_files: []