nokogumbo 0.1 → 0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/README.md CHANGED
@@ -29,12 +29,11 @@ original tag name is returned verbatim.
29
29
  * Nothing meaningful is done with the `GumboDocument` struct, i.e., no
30
30
  Nokogiri `EntityDecl` is produced.
31
31
 
32
+ * The gem itself includes a copy of the Gumbo HTML5 parser.
33
+
32
34
  Installation:
33
35
  ============
34
36
 
35
- * Build and install the
36
- [gumbo-parser](https://github.com/google/gumbo-parser#readme) C library
37
-
38
37
  * Execute `rake gem`
39
38
 
40
39
  * [sudo] gem install pkg/nokogumbo*.gem
data/Rakefile CHANGED
@@ -3,44 +3,54 @@ require 'rake/clean'
3
3
 
4
4
  task 'default' => 'test'
5
5
 
6
- file 'Makefile' => 'ext/extconf.rb' do
7
- Dir.chdir 'ext' do
6
+ file 'gumbo-parser' do
7
+ sh 'git clone https://github.com/google/gumbo-parser.git'
8
+ end
9
+
10
+ file 'work/extconf.rb' => 'gumbo-parser' do
11
+ sh 'mkdir work'
12
+ sh 'cp gumbo-parser/src/* work'
13
+ sh 'cp ext/* work'
14
+ end
15
+
16
+ file 'work/Makefile' => 'work/extconf.rb' do
17
+ Dir.chdir 'work' do
8
18
  ruby 'extconf.rb'
9
19
  end
10
20
  end
11
21
 
12
- task 'test' => 'Makefile' do
13
- Dir.chdir 'ext' do
22
+ task 'test' => 'work/Makefile' do
23
+ Dir.chdir 'work' do
14
24
  sh 'make -s'
15
25
  end
16
26
  ruby 'test-nokogumbo.rb'
17
27
  end
18
28
 
19
- CLEAN.include('ext/*.o', 'ext/*.so', 'ext/*.log', 'ext/Makefile', 'pkg')
29
+ CLEAN.include 'pkg', 'gumbo-parser', 'work'
20
30
 
21
- MANIFEST = %w(
22
- ext/extconf.rb
23
- ext/nokogumbo.c
31
+ MANIFEST = FileList[*%w(
32
+ work/*.rb
33
+ work/*.c
34
+ work/*.h
24
35
  lib/nokogumbo.rb
25
36
  Rakefile
26
37
  README.md
27
- )
38
+ )]
28
39
 
29
40
  SPEC = Gem::Specification.new do |gem|
30
41
  gem.name = 'nokogumbo'
31
- gem.version = '0.1'
42
+ gem.version = '0.2'
32
43
  gem.email = 'rubys@intertwingly.net'
33
44
  gem.homepage = 'https://github.com/rubys/nokogumbo/tree/master/ruby#readme'
34
45
  gem.summary = 'Nokogiri interface to the Gumbo HTML5 parser'
35
46
  gem.files = MANIFEST
36
- gem.extensions = 'ext/extconf.rb'
47
+ gem.extensions = 'work/extconf.rb'
37
48
  gem.author = 'Sam Ruby'
38
49
  gem.add_dependency 'nokogiri'
39
- gem.license = 'MIT'
50
+ gem.license = 'Apache 2.0'
40
51
  gem.description = %q(
41
- At the moment, this is a proof of concept, allowing a Ruby
42
- program to invoke the Gumbo HTML5 parser and access the result as a Nokogiri
43
- parsed document.).strip.gsub(/\s+/, ' ')
52
+ Nokogumbo allows a Ruby program to invoke the Gumbo HTML5 parser and
53
+ access the result as a Nokogiri parsed document.).strip.gsub(/\s+/, ' ')
44
54
  end
45
55
 
46
56
  task 'gem' => 'test'
@@ -1,3 +1,3 @@
1
1
  require 'mkmf'
2
- have_library('gumbo', 'gumbo_parse')
2
+ $CFLAGS << " -std=c99"
3
3
  create_makefile('nokogumboc')
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: nokogumbo
3
3
  version: !ruby/object:Gem::Version
4
- version: '0.1'
4
+ version: '0.2'
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2013-08-17 00:00:00.000000000 Z
12
+ date: 2013-08-18 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: nokogiri
@@ -27,22 +27,21 @@ dependencies:
27
27
  - - ! '>='
28
28
  - !ruby/object:Gem::Version
29
29
  version: '0'
30
- description: At the moment, this is a proof of concept, allowing a Ruby program to
31
- invoke the Gumbo HTML5 parser and access the result as a Nokogiri parsed document.
30
+ description: Nokogumbo allows a Ruby program to invoke the Gumbo HTML5 parser and
31
+ access the result as a Nokogiri parsed document.
32
32
  email: rubys@intertwingly.net
33
33
  executables: []
34
34
  extensions:
35
- - ext/extconf.rb
35
+ - work/extconf.rb
36
36
  extra_rdoc_files: []
37
37
  files:
38
- - ext/extconf.rb
39
- - ext/nokogumbo.c
40
38
  - lib/nokogumbo.rb
41
39
  - Rakefile
42
40
  - README.md
41
+ - work/extconf.rb
43
42
  homepage: https://github.com/rubys/nokogumbo/tree/master/ruby#readme
44
43
  licenses:
45
- - MIT
44
+ - Apache 2.0
46
45
  post_install_message:
47
46
  rdoc_options: []
48
47
  require_paths:
@@ -1,131 +0,0 @@
1
- #include "ruby.h"
2
- #include "gumbo.h"
3
-
4
- // class constants
5
- static VALUE Nokogiri;
6
- static VALUE HTML;
7
- static VALUE XML;
8
- static VALUE Document;
9
- static VALUE Element;
10
- static VALUE Text;
11
- static VALUE CDATA;
12
- static VALUE Comment;
13
- static VALUE TAGS=0;
14
- static int Unknown=0;
15
-
16
- // interned symbols
17
- static VALUE new;
18
- static VALUE set_attribute;
19
- static VALUE add_child;
20
-
21
- // determine tag name for a given node
22
- static VALUE _name(GumboElement *node) {
23
- if (!TAGS) {
24
- // Deferred initialization of "Unknown" as the GumboParser class is
25
- // defined *after* the Nokogumbo class is.
26
- VALUE HTML5 = rb_const_get(Nokogiri, rb_intern("HTML5"));
27
- TAGS = rb_const_get(HTML5, rb_intern("TAGS"));
28
- Unknown = NUM2INT(rb_const_get(HTML5, rb_intern("Unknown")));
29
- }
30
-
31
- if (node->tag != Unknown) {
32
- return rb_ary_entry(TAGS, (long) node->tag);
33
- } else {
34
- // Gumbo doesn't provide unknown tags, so we need to parse it ourselves:
35
- // http://www.w3.org/html/wg/drafts/html/CR/syntax.html#tag-name-state
36
- GumboStringPiece *tag = &node->original_tag;
37
- int length;
38
- for (length = 1; length < tag->length-1; length++) {
39
- if (strchr(" \t\r\n<", *((char*)tag->data+length))) break;
40
- }
41
- return rb_str_new(1+(char *)tag->data, length-1);
42
- }
43
- }
44
-
45
- // Build a Nokogiri Element for a given GumboElement (recursively)
46
- static VALUE _element(VALUE document, GumboElement *node) {
47
- int i;
48
- VALUE element = rb_funcall(Element, new, 2, _name(node), document);
49
-
50
- // add in the attributes
51
- GumboVector* attrs = &node->attributes;
52
- for (i=0; i < attrs->length; i++) {
53
- GumboAttribute *attr = attrs->data[i];
54
- VALUE name = rb_str_new2(attr->name);
55
- rb_funcall(element, set_attribute, 2, name, rb_str_new2(attr->value));
56
- }
57
-
58
- // add in the children
59
- GumboVector* children = &node->children;
60
- for (i=0; i < children->length; i++) {
61
- GumboNode* child = children->data[i];
62
-
63
- VALUE node = 0;
64
- VALUE text;
65
-
66
- switch (child->type) {
67
- case GUMBO_NODE_ELEMENT:
68
- node = _element(document, &child->v.element);
69
- break;
70
- case GUMBO_NODE_WHITESPACE:
71
- case GUMBO_NODE_TEXT:
72
- text = rb_str_new2(child->v.text.text);
73
- node = rb_funcall(Text, new, 2, text, document);
74
- break;
75
- case GUMBO_NODE_CDATA:
76
- text = rb_str_new2(child->v.text.text);
77
- node = rb_funcall(CDATA, new, 2, text, document);
78
- break;
79
- case GUMBO_NODE_COMMENT:
80
- text = rb_str_new2(child->v.text.text);
81
- node = rb_funcall(Comment, new, 2, document, text);
82
- break;
83
- case GUMBO_NODE_DOCUMENT:
84
- break; // should never happen -- ignore
85
- }
86
-
87
- if (node) rb_funcall(element, add_child, 1, node);
88
- }
89
-
90
- return element;
91
- }
92
-
93
- // Parse a string using gumbo_parse into a Nokogiri document
94
- static VALUE t_parse(VALUE self, VALUE string) {
95
- VALUE document = rb_funcall(Document, new, 0);
96
-
97
- GumboOutput *output = gumbo_parse_with_options(
98
- &kGumboDefaultOptions, RSTRING_PTR(string), RSTRING_LEN(string)
99
- );
100
- VALUE root = _element(document, (GumboElement*)&output->root->v.element);
101
- rb_funcall(document, add_child, 1, root);
102
- gumbo_destroy_output(&kGumboDefaultOptions, output);
103
-
104
- return document;
105
- }
106
-
107
- // Initialize the Nokogumbo class and fetch constants we will use later
108
- void Init_nokogumboc() {
109
- rb_funcall(rb_mKernel, rb_intern("gem"), 1, rb_str_new2("nokogiri"));
110
- rb_require("nokogiri");
111
-
112
- // class constants
113
- Nokogiri = rb_const_get(rb_cObject, rb_intern("Nokogiri"));
114
- HTML = rb_const_get(Nokogiri, rb_intern("HTML"));
115
- XML = rb_const_get(Nokogiri, rb_intern("XML"));
116
- Document = rb_const_get(HTML, rb_intern("Document"));
117
- Element = rb_const_get(XML, rb_intern("Element"));
118
- Text = rb_const_get(XML, rb_intern("Text"));
119
- CDATA = rb_const_get(XML, rb_intern("CDATA"));
120
- Comment = rb_const_get(XML, rb_intern("Comment"));
121
-
122
- // interned symbols
123
- new = rb_intern("new");
124
- set_attribute = rb_intern("set_attribute");
125
- add_child = rb_intern("add_child");
126
-
127
- // define Nokogumbo class with a singleton parse method
128
- VALUE Gumbo = rb_define_class("Nokogumbo", rb_cObject);
129
- rb_define_singleton_method(Gumbo, "parse", t_parse, 1);
130
- }
131
-