nokogumbo 0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (6) hide show
  1. data/README.md +46 -0
  2. data/Rakefile +50 -0
  3. data/ext/extconf.rb +3 -0
  4. data/ext/nokogumbo.c +131 -0
  5. data/lib/nokogumbo.rb +166 -0
  6. metadata +68 -0
data/README.md ADDED
@@ -0,0 +1,46 @@
1
+ Nokogumbo - a Nokogiri interface to the Gumbo HTML5 parser.
2
+ ===========
3
+
4
+ At the moment, this is a proof of concept, allowing a Ruby program to invoke
5
+ the Gumbo HTML5 parser and access the result as a Nokogiri parsed document.
6
+
7
+ Usage:
8
+ -----
9
+
10
+ ```ruby
11
+ require 'nokogumbo'
12
+ doc = Nokogiri::HTML5(string)
13
+ ```
14
+
15
+ Notes:
16
+ -----
17
+
18
+ * The `Nokogumbo.parse` function takes a string and passes it to the
19
+ <code>gumbo_parse_with_options</code> method, using the default options.
20
+ The resulting Gumbo parse tree is then walked, producing a Nokogiri parse tree.
21
+ The original Gumbo parse tree is then destroyed, and the Nokogiri parse tree
22
+ is returned.
23
+
24
+ * Instead of uppercase element names, lowercase element names are produced.
25
+
26
+ * Instead of returning 'unknown' as the element name for unknown tags, the
27
+ original tag name is returned verbatim.
28
+
29
+ * Nothing meaningful is done with the `GumboDocument` struct, i.e., no
30
+ Nokogiri `EntityDecl` is produced.
31
+
32
+ Installation:
33
+ ------------
34
+
35
+ * Build and install the
36
+ [gumbo-parser](https://github.com/google/gumbo-parser#readme) C library
37
+
38
+ * Execute `rake gem`
39
+
40
+ * [sudo] gem install pkg/nokogumbo*.gem
41
+
42
+ Related efforts:
43
+ ---------------
44
+
45
+ * [ruby-gumbo](https://github.com/galdor/ruby-gumbo#readme) - a ruby binding
46
+ for the Gumbo HTML5 parser.
data/Rakefile ADDED
@@ -0,0 +1,50 @@
1
+ require 'rubygems/package_task'
2
+ require 'rake/clean'
3
+
4
task 'default' => 'test'

# extconf.rb is run from inside ext/, so that is where the Makefile is
# written.  Naming the real path lets rake skip this step when the Makefile
# is already newer than extconf.rb.
file 'ext/Makefile' => 'ext/extconf.rb' do
  Dir.chdir 'ext' do
    ruby 'extconf.rb'
  end
end

# Keep the previous task name working for anyone invoking `rake Makefile`.
task 'Makefile' => 'ext/Makefile'

# Build the extension, then run the test script.
task 'test' => 'ext/Makefile' do
  Dir.chdir 'ext' do
    sh 'make -s'
  end
  ruby 'test-nokogumbo.rb'
end
18
+
19
+ CLEAN.include('ext/*.o', 'ext/*.so', 'ext/*.log', 'ext/Makefile', 'pkg')
20
+
21
+ MANIFEST = %w(
22
+ ext/extconf.rb
23
+ ext/nokogumbo.c
24
+ lib/nokogumbo.rb
25
+ Rakefile
26
+ README.md
27
+ )
28
+
29
+ SPEC = Gem::Specification.new do |gem|
30
+ gem.name = 'nokogumbo'
31
+ gem.version = '0.1'
32
+ gem.email = 'rubys@intertwingly.net'
33
+ gem.homepage = 'https://github.com/rubys/nokogumbo/tree/master/ruby#readme'
34
+ gem.summary = 'Nokogiri interface to the Gumbo HTML5 parser'
35
+ gem.files = MANIFEST
36
+ gem.extensions = 'ext/extconf.rb'
37
+ gem.author = 'Sam Ruby'
38
+ gem.add_dependency 'nokogiri'
39
+ gem.license = 'MIT'
40
+ gem.description = %q(
41
+ At the moment, this is a proof of concept, allowing a Ruby
42
+ program to invoke the Gumbo HTML5 parser and access the result as a Nokogiri
43
+ parsed document.).strip.gsub(/\s+/, ' ')
44
+ end
45
+
46
+ task 'gem' => 'test'
47
+ Gem::PackageTask.new(SPEC) do |pkg|
48
+ pkg.need_tar = true
49
+ pkg.need_zip = true
50
+ end
data/ext/extconf.rb ADDED
@@ -0,0 +1,3 @@
1
require 'mkmf'

# Stop with a clear message when libgumbo cannot be found, instead of
# generating a Makefile for an extension that cannot be built.
unless have_library('gumbo', 'gumbo_parse')
  abort 'nokogumbo requires the gumbo-parser library (libgumbo) to be installed'
end

create_makefile('nokogumboc')
data/ext/nokogumbo.c ADDED
@@ -0,0 +1,131 @@
1
+ #include "ruby.h"
2
+ #include "gumbo.h"
3
+
4
+ // class constants
5
+ static VALUE Nokogiri;
6
+ static VALUE HTML;
7
+ static VALUE XML;
8
+ static VALUE Document;
9
+ static VALUE Element;
10
+ static VALUE Text;
11
+ static VALUE CDATA;
12
+ static VALUE Comment;
13
+ static VALUE TAGS=0;
14
+ static int Unknown=0;
15
+
16
+ // interned symbols
17
+ static VALUE new;
18
+ static VALUE set_attribute;
19
+ static VALUE add_child;
20
+
21
+ // determine tag name for a given node
22
+ static VALUE _name(GumboElement *node) {
23
+ if (!TAGS) {
24
+ // Deferred initialization of "Unknown" as the GumboParser class is
25
+ // defined *after* the Nokogumbo class is.
26
+ VALUE HTML5 = rb_const_get(Nokogiri, rb_intern("HTML5"));
27
+ TAGS = rb_const_get(HTML5, rb_intern("TAGS"));
28
+ Unknown = NUM2INT(rb_const_get(HTML5, rb_intern("Unknown")));
29
+ }
30
+
31
+ if (node->tag != Unknown) {
32
+ return rb_ary_entry(TAGS, (long) node->tag);
33
+ } else {
34
+ // Gumbo doesn't provide unknown tags, so we need to parse it ourselves:
35
+ // http://www.w3.org/html/wg/drafts/html/CR/syntax.html#tag-name-state
36
+ GumboStringPiece *tag = &node->original_tag;
37
+ int length;
38
+ for (length = 1; length < tag->length-1; length++) {
39
+ if (strchr(" \t\r\n<", *((char*)tag->data+length))) break;
40
+ }
41
+ return rb_str_new(1+(char *)tag->data, length-1);
42
+ }
43
+ }
44
+
45
+ // Build a Nokogiri Element for a given GumboElement (recursively)
46
+ static VALUE _element(VALUE document, GumboElement *node) {
47
+ int i;
48
+ VALUE element = rb_funcall(Element, new, 2, _name(node), document);
49
+
50
+ // add in the attributes
51
+ GumboVector* attrs = &node->attributes;
52
+ for (i=0; i < attrs->length; i++) {
53
+ GumboAttribute *attr = attrs->data[i];
54
+ VALUE name = rb_str_new2(attr->name);
55
+ rb_funcall(element, set_attribute, 2, name, rb_str_new2(attr->value));
56
+ }
57
+
58
+ // add in the children
59
+ GumboVector* children = &node->children;
60
+ for (i=0; i < children->length; i++) {
61
+ GumboNode* child = children->data[i];
62
+
63
+ VALUE node = 0;
64
+ VALUE text;
65
+
66
+ switch (child->type) {
67
+ case GUMBO_NODE_ELEMENT:
68
+ node = _element(document, &child->v.element);
69
+ break;
70
+ case GUMBO_NODE_WHITESPACE:
71
+ case GUMBO_NODE_TEXT:
72
+ text = rb_str_new2(child->v.text.text);
73
+ node = rb_funcall(Text, new, 2, text, document);
74
+ break;
75
+ case GUMBO_NODE_CDATA:
76
+ text = rb_str_new2(child->v.text.text);
77
+ node = rb_funcall(CDATA, new, 2, text, document);
78
+ break;
79
+ case GUMBO_NODE_COMMENT:
80
+ text = rb_str_new2(child->v.text.text);
81
+ node = rb_funcall(Comment, new, 2, document, text);
82
+ break;
83
+ case GUMBO_NODE_DOCUMENT:
84
+ break; // should never happen -- ignore
85
+ }
86
+
87
+ if (node) rb_funcall(element, add_child, 1, node);
88
+ }
89
+
90
+ return element;
91
+ }
92
+
93
+ // Parse a string using gumbo_parse into a Nokogiri document
94
+ static VALUE t_parse(VALUE self, VALUE string) {
95
+ VALUE document = rb_funcall(Document, new, 0);
96
+
97
+ GumboOutput *output = gumbo_parse_with_options(
98
+ &kGumboDefaultOptions, RSTRING_PTR(string), RSTRING_LEN(string)
99
+ );
100
+ VALUE root = _element(document, (GumboElement*)&output->root->v.element);
101
+ rb_funcall(document, add_child, 1, root);
102
+ gumbo_destroy_output(&kGumboDefaultOptions, output);
103
+
104
+ return document;
105
+ }
106
+
107
+ // Initialize the Nokogumbo class and fetch constants we will use later
108
+ void Init_nokogumboc() {
109
+ rb_funcall(rb_mKernel, rb_intern("gem"), 1, rb_str_new2("nokogiri"));
110
+ rb_require("nokogiri");
111
+
112
+ // class constants
113
+ Nokogiri = rb_const_get(rb_cObject, rb_intern("Nokogiri"));
114
+ HTML = rb_const_get(Nokogiri, rb_intern("HTML"));
115
+ XML = rb_const_get(Nokogiri, rb_intern("XML"));
116
+ Document = rb_const_get(HTML, rb_intern("Document"));
117
+ Element = rb_const_get(XML, rb_intern("Element"));
118
+ Text = rb_const_get(XML, rb_intern("Text"));
119
+ CDATA = rb_const_get(XML, rb_intern("CDATA"));
120
+ Comment = rb_const_get(XML, rb_intern("Comment"));
121
+
122
+ // interned symbols
123
+ new = rb_intern("new");
124
+ set_attribute = rb_intern("set_attribute");
125
+ add_child = rb_intern("add_child");
126
+
127
+ // define Nokogumbo class with a singleton parse method
128
+ VALUE Gumbo = rb_define_class("Nokogumbo", rb_cObject);
129
+ rb_define_singleton_method(Gumbo, "parse", t_parse, 1);
130
+ }
131
+
data/lib/nokogumbo.rb ADDED
@@ -0,0 +1,166 @@
1
+ require 'nokogiri'
2
+ require 'nokogumboc'
3
+
4
+ module Nokogiri
5
+ def self.HTML5(string)
6
+ Nokogumbo.parse(string)
7
+ end
8
+
9
+ module HTML5
10
+ TAGS = [
11
+ 'HTML',
12
+ 'HEAD',
13
+ 'TITLE',
14
+ 'BASE',
15
+ 'LINK',
16
+ 'META',
17
+ 'STYLE',
18
+ 'SCRIPT',
19
+ 'NOSCRIPT',
20
+ 'BODY',
21
+ 'SECTION',
22
+ 'NAV',
23
+ 'ARTICLE',
24
+ 'ASIDE',
25
+ 'H1',
26
+ 'H2',
27
+ 'H3',
28
+ 'H4',
29
+ 'H5',
30
+ 'H6',
31
+ 'HGROUP',
32
+ 'HEADER',
33
+ 'FOOTER',
34
+ 'ADDRESS',
35
+ 'P',
36
+ 'HR',
37
+ 'PRE',
38
+ 'BLOCKQUOTE',
39
+ 'OL',
40
+ 'UL',
41
+ 'LI',
42
+ 'DL',
43
+ 'DT',
44
+ 'DD',
45
+ 'FIGURE',
46
+ 'FIGCAPTION',
47
+ 'DIV',
48
+ 'A',
49
+ 'EM',
50
+ 'STRONG',
51
+ 'SMALL',
52
+ 'S',
53
+ 'CITE',
54
+ 'Q',
55
+ 'DFN',
56
+ 'ABBR',
57
+ 'TIME',
58
+ 'CODE',
59
+ 'VAR',
60
+ 'SAMP',
61
+ 'KBD',
62
+ 'SUB',
63
+ 'SUP',
64
+ 'I',
65
+ 'B',
66
+ 'MARK',
67
+ 'RUBY',
68
+ 'RT',
69
+ 'RP',
70
+ 'BDI',
71
+ 'BDO',
72
+ 'SPAN',
73
+ 'BR',
74
+ 'WBR',
75
+ 'INS',
76
+ 'DEL',
77
+ 'IMAGE',
78
+ 'IMG',
79
+ 'IFRAME',
80
+ 'EMBED',
81
+ 'OBJECT',
82
+ 'PARAM',
83
+ 'VIDEO',
84
+ 'AUDIO',
85
+ 'SOURCE',
86
+ 'TRACK',
87
+ 'CANVAS',
88
+ 'MAP',
89
+ 'AREA',
90
+ 'MATH',
91
+ 'MI',
92
+ 'MO',
93
+ 'MN',
94
+ 'MS',
95
+ 'MTEXT',
96
+ 'MGLYPH',
97
+ 'MALIGNMARK',
98
+ 'ANNOTATION_XML',
99
+ 'SVG',
100
+ 'FOREIGNOBJECT',
101
+ 'DESC',
102
+ 'TABLE',
103
+ 'CAPTION',
104
+ 'COLGROUP',
105
+ 'COL',
106
+ 'TBODY',
107
+ 'THEAD',
108
+ 'TFOOT',
109
+ 'TR',
110
+ 'TD',
111
+ 'TH',
112
+ 'FORM',
113
+ 'FIELDSET',
114
+ 'LEGEND',
115
+ 'LABEL',
116
+ 'INPUT',
117
+ 'BUTTON',
118
+ 'SELECT',
119
+ 'DATALIST',
120
+ 'OPTGROUP',
121
+ 'OPTION',
122
+ 'TEXTAREA',
123
+ 'KEYGEN',
124
+ 'OUTPUT',
125
+ 'PROGRESS',
126
+ 'METER',
127
+ 'DETAILS',
128
+ 'SUMMARY',
129
+ 'COMMAND',
130
+ 'MENU',
131
+ 'APPLET',
132
+ 'ACRONYM',
133
+ 'BGSOUND',
134
+ 'DIR',
135
+ 'FRAME',
136
+ 'FRAMESET',
137
+ 'NOFRAMES',
138
+ 'ISINDEX',
139
+ 'LISTING',
140
+ 'XMP',
141
+ 'NEXTID',
142
+ 'NOEMBED',
143
+ 'PLAINTEXT',
144
+ 'RB',
145
+ 'STRIKE',
146
+ 'BASEFONT',
147
+ 'BIG',
148
+ 'BLINK',
149
+ 'CENTER',
150
+ 'FONT',
151
+ 'MARQUEE',
152
+ 'MULTICOL',
153
+ 'NOBR',
154
+ 'SPACER',
155
+ 'TT',
156
+ 'U',
157
+ 'UNKNOWN',
158
+ ].map(&:downcase).map(&:freeze).freeze
159
+
160
+ Unknown = TAGS.length - 1
161
+
162
# Parse an HTML5 string; callable directly as Nokogiri::HTML5.parse(string).
# Delegates to Nokogumbo.parse and returns its result.
def self.parse(string)
  Nokogumbo.parse(string)
end

# Instance-level variant, kept for code that mixes this module in.
def parse(string)
  Nokogumbo.parse(string)
end
165
+ end
166
+ end
metadata ADDED
@@ -0,0 +1,68 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: nokogumbo
3
+ version: !ruby/object:Gem::Version
4
+ version: '0.1'
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - Sam Ruby
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2013-08-17 00:00:00.000000000 Z
13
+ dependencies:
14
+ - !ruby/object:Gem::Dependency
15
+ name: nokogiri
16
+ requirement: !ruby/object:Gem::Requirement
17
+ none: false
18
+ requirements:
19
+ - - ! '>='
20
+ - !ruby/object:Gem::Version
21
+ version: '0'
22
+ type: :runtime
23
+ prerelease: false
24
+ version_requirements: !ruby/object:Gem::Requirement
25
+ none: false
26
+ requirements:
27
+ - - ! '>='
28
+ - !ruby/object:Gem::Version
29
+ version: '0'
30
+ description: At the moment, this is a proof of concept, allowing a Ruby program to
31
+ invoke the Gumbo HTML5 parser and access the result as a Nokogiri parsed document.
32
+ email: rubys@intertwingly.net
33
+ executables: []
34
+ extensions:
35
+ - ext/extconf.rb
36
+ extra_rdoc_files: []
37
+ files:
38
+ - ext/extconf.rb
39
+ - ext/nokogumbo.c
40
+ - lib/nokogumbo.rb
41
+ - Rakefile
42
+ - README.md
43
+ homepage: https://github.com/rubys/nokogumbo/tree/master/ruby#readme
44
+ licenses:
45
+ - MIT
46
+ post_install_message:
47
+ rdoc_options: []
48
+ require_paths:
49
+ - lib
50
+ required_ruby_version: !ruby/object:Gem::Requirement
51
+ none: false
52
+ requirements:
53
+ - - ! '>='
54
+ - !ruby/object:Gem::Version
55
+ version: '0'
56
+ required_rubygems_version: !ruby/object:Gem::Requirement
57
+ none: false
58
+ requirements:
59
+ - - ! '>='
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
62
+ requirements: []
63
+ rubyforge_project:
64
+ rubygems_version: 1.8.23
65
+ signing_key:
66
+ specification_version: 3
67
+ summary: Nokogiri interface to the Gumbo HTML5 parser
68
+ test_files: []