nokogumbo 0.1 → 0.2

Sign up to get free protection for your applications and to get access to all the features.
data/README.md CHANGED
@@ -29,12 +29,11 @@ original tag name is returned verbatim.
29
29
  * Nothing meaningful is done with the `GumboDocument` struct, i.e., no
30
30
  Nokogiri `EntityDecl` is produced.
31
31
 
32
+ * The gem itself includes a copy of the Gumbo HTML5 parser.
33
+
32
34
  Installation:
33
35
  ============
34
36
 
35
- * Build and install the
36
- [gumbo-parser](https://github.com/google/gumbo-parser#readme) C library
37
-
38
37
  * Execute `rake gem`
39
38
 
40
39
  * [sudo] gem install pkg/nokogumbo*.gem
data/Rakefile CHANGED
@@ -3,44 +3,54 @@ require 'rake/clean'
3
3
 
4
4
  task 'default' => 'test'
5
5
 
6
- file 'Makefile' => 'ext/extconf.rb' do
7
- Dir.chdir 'ext' do
6
+ file 'gumbo-parser' do
7
+ sh 'git clone https://github.com/google/gumbo-parser.git'
8
+ end
9
+
10
+ file 'work/extconf.rb' => 'gumbo-parser' do
11
+ sh 'mkdir work'
12
+ sh 'cp gumbo-parser/src/* work'
13
+ sh 'cp ext/* work'
14
+ end
15
+
16
+ file 'work/Makefile' => 'work/extconf.rb' do
17
+ Dir.chdir 'work' do
8
18
  ruby 'extconf.rb'
9
19
  end
10
20
  end
11
21
 
12
- task 'test' => 'Makefile' do
13
- Dir.chdir 'ext' do
22
+ task 'test' => 'work/Makefile' do
23
+ Dir.chdir 'work' do
14
24
  sh 'make -s'
15
25
  end
16
26
  ruby 'test-nokogumbo.rb'
17
27
  end
18
28
 
19
- CLEAN.include('ext/*.o', 'ext/*.so', 'ext/*.log', 'ext/Makefile', 'pkg')
29
+ CLEAN.include 'pkg', 'gumbo-parser', 'work'
20
30
 
21
- MANIFEST = %w(
22
- ext/extconf.rb
23
- ext/nokogumbo.c
31
+ MANIFEST = FileList[*%w(
32
+ work/*.rb
33
+ work/*.c
34
+ work/*.h
24
35
  lib/nokogumbo.rb
25
36
  Rakefile
26
37
  README.md
27
- )
38
+ )]
28
39
 
29
40
  SPEC = Gem::Specification.new do |gem|
30
41
  gem.name = 'nokogumbo'
31
- gem.version = '0.1'
42
+ gem.version = '0.2'
32
43
  gem.email = 'rubys@intertwingly.net'
33
44
  gem.homepage = 'https://github.com/rubys/nokogumbo/tree/master/ruby#readme'
34
45
  gem.summary = 'Nokogiri interface to the Gumbo HTML5 parser'
35
46
  gem.files = MANIFEST
36
- gem.extensions = 'ext/extconf.rb'
47
+ gem.extensions = 'work/extconf.rb'
37
48
  gem.author = 'Sam Ruby'
38
49
  gem.add_dependency 'nokogiri'
39
- gem.license = 'MIT'
50
+ gem.license = 'Apache 2.0'
40
51
  gem.description = %q(
41
- At the moment, this is a proof of concept, allowing a Ruby
42
- program to invoke the Gumbo HTML5 parser and access the result as a Nokogiri
43
- parsed document.).strip.gsub(/\s+/, ' ')
52
+ Nokogumbo allows a Ruby program to invoke the Gumbo HTML5 parser and
53
+ access the result as a Nokogiri parsed document.).strip.gsub(/\s+/, ' ')
44
54
  end
45
55
 
46
56
  task 'gem' => 'test'
@@ -1,3 +1,3 @@
1
1
  require 'mkmf'
2
- have_library('gumbo', 'gumbo_parse')
2
+ $CFLAGS << " -std=c99"
3
3
  create_makefile('nokogumboc')
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: nokogumbo
3
3
  version: !ruby/object:Gem::Version
4
- version: '0.1'
4
+ version: '0.2'
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2013-08-17 00:00:00.000000000 Z
12
+ date: 2013-08-18 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: nokogiri
@@ -27,22 +27,21 @@ dependencies:
27
27
  - - ! '>='
28
28
  - !ruby/object:Gem::Version
29
29
  version: '0'
30
- description: At the moment, this is a proof of concept, allowing a Ruby program to
31
- invoke the Gumbo HTML5 parser and access the result as a Nokogiri parsed document.
30
+ description: Nokogumbo allows a Ruby program to invoke the Gumbo HTML5 parser and
31
+ access the result as a Nokogiri parsed document.
32
32
  email: rubys@intertwingly.net
33
33
  executables: []
34
34
  extensions:
35
- - ext/extconf.rb
35
+ - work/extconf.rb
36
36
  extra_rdoc_files: []
37
37
  files:
38
- - ext/extconf.rb
39
- - ext/nokogumbo.c
40
38
  - lib/nokogumbo.rb
41
39
  - Rakefile
42
40
  - README.md
41
+ - work/extconf.rb
43
42
  homepage: https://github.com/rubys/nokogumbo/tree/master/ruby#readme
44
43
  licenses:
45
- - MIT
44
+ - Apache 2.0
46
45
  post_install_message:
47
46
  rdoc_options: []
48
47
  require_paths:
@@ -1,131 +0,0 @@
1
- #include "ruby.h"
2
- #include "gumbo.h"
3
-
4
- // class constants
5
- static VALUE Nokogiri;
6
- static VALUE HTML;
7
- static VALUE XML;
8
- static VALUE Document;
9
- static VALUE Element;
10
- static VALUE Text;
11
- static VALUE CDATA;
12
- static VALUE Comment;
13
- static VALUE TAGS=0;
14
- static int Unknown=0;
15
-
16
- // interned symbols
17
- static VALUE new;
18
- static VALUE set_attribute;
19
- static VALUE add_child;
20
-
21
- // determine tag name for a given node
22
- static VALUE _name(GumboElement *node) {
23
- if (!TAGS) {
24
- // Deferred initialization of "Unknown" as the GumboParser class is
25
- // defined *after* the Nokogumbo class is.
26
- VALUE HTML5 = rb_const_get(Nokogiri, rb_intern("HTML5"));
27
- TAGS = rb_const_get(HTML5, rb_intern("TAGS"));
28
- Unknown = NUM2INT(rb_const_get(HTML5, rb_intern("Unknown")));
29
- }
30
-
31
- if (node->tag != Unknown) {
32
- return rb_ary_entry(TAGS, (long) node->tag);
33
- } else {
34
- // Gumbo doesn't provide unknown tags, so we need to parse it ourselves:
35
- // http://www.w3.org/html/wg/drafts/html/CR/syntax.html#tag-name-state
36
- GumboStringPiece *tag = &node->original_tag;
37
- int length;
38
- for (length = 1; length < tag->length-1; length++) {
39
- if (strchr(" \t\r\n<", *((char*)tag->data+length))) break;
40
- }
41
- return rb_str_new(1+(char *)tag->data, length-1);
42
- }
43
- }
44
-
45
- // Build a Nokogiri Element for a given GumboElement (recursively)
46
- static VALUE _element(VALUE document, GumboElement *node) {
47
- int i;
48
- VALUE element = rb_funcall(Element, new, 2, _name(node), document);
49
-
50
- // add in the attributes
51
- GumboVector* attrs = &node->attributes;
52
- for (i=0; i < attrs->length; i++) {
53
- GumboAttribute *attr = attrs->data[i];
54
- VALUE name = rb_str_new2(attr->name);
55
- rb_funcall(element, set_attribute, 2, name, rb_str_new2(attr->value));
56
- }
57
-
58
- // add in the children
59
- GumboVector* children = &node->children;
60
- for (i=0; i < children->length; i++) {
61
- GumboNode* child = children->data[i];
62
-
63
- VALUE node = 0;
64
- VALUE text;
65
-
66
- switch (child->type) {
67
- case GUMBO_NODE_ELEMENT:
68
- node = _element(document, &child->v.element);
69
- break;
70
- case GUMBO_NODE_WHITESPACE:
71
- case GUMBO_NODE_TEXT:
72
- text = rb_str_new2(child->v.text.text);
73
- node = rb_funcall(Text, new, 2, text, document);
74
- break;
75
- case GUMBO_NODE_CDATA:
76
- text = rb_str_new2(child->v.text.text);
77
- node = rb_funcall(CDATA, new, 2, text, document);
78
- break;
79
- case GUMBO_NODE_COMMENT:
80
- text = rb_str_new2(child->v.text.text);
81
- node = rb_funcall(Comment, new, 2, document, text);
82
- break;
83
- case GUMBO_NODE_DOCUMENT:
84
- break; // should never happen -- ignore
85
- }
86
-
87
- if (node) rb_funcall(element, add_child, 1, node);
88
- }
89
-
90
- return element;
91
- }
92
-
93
- // Parse a string using gumbo_parse into a Nokogiri document
94
- static VALUE t_parse(VALUE self, VALUE string) {
95
- VALUE document = rb_funcall(Document, new, 0);
96
-
97
- GumboOutput *output = gumbo_parse_with_options(
98
- &kGumboDefaultOptions, RSTRING_PTR(string), RSTRING_LEN(string)
99
- );
100
- VALUE root = _element(document, (GumboElement*)&output->root->v.element);
101
- rb_funcall(document, add_child, 1, root);
102
- gumbo_destroy_output(&kGumboDefaultOptions, output);
103
-
104
- return document;
105
- }
106
-
107
- // Initialize the Nokogumbo class and fetch constants we will use later
108
- void Init_nokogumboc() {
109
- rb_funcall(rb_mKernel, rb_intern("gem"), 1, rb_str_new2("nokogiri"));
110
- rb_require("nokogiri");
111
-
112
- // class constants
113
- Nokogiri = rb_const_get(rb_cObject, rb_intern("Nokogiri"));
114
- HTML = rb_const_get(Nokogiri, rb_intern("HTML"));
115
- XML = rb_const_get(Nokogiri, rb_intern("XML"));
116
- Document = rb_const_get(HTML, rb_intern("Document"));
117
- Element = rb_const_get(XML, rb_intern("Element"));
118
- Text = rb_const_get(XML, rb_intern("Text"));
119
- CDATA = rb_const_get(XML, rb_intern("CDATA"));
120
- Comment = rb_const_get(XML, rb_intern("Comment"));
121
-
122
- // interned symbols
123
- new = rb_intern("new");
124
- set_attribute = rb_intern("set_attribute");
125
- add_child = rb_intern("add_child");
126
-
127
- // define Nokogumbo class with a singleton parse method
128
- VALUE Gumbo = rb_define_class("Nokogumbo", rb_cObject);
129
- rb_define_singleton_method(Gumbo, "parse", t_parse, 1);
130
- }
131
-