nokogumbo 0.9 → 0.10
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +15 -0
- data/README.md +15 -10
- data/ext/nokogumboc/extconf.rb +34 -0
- data/ext/nokogumboc/nokogumbo.c +192 -0
- data/gumbo-parser/src/attribute.c +3 -3
- data/gumbo-parser/src/attribute.h +2 -2
- data/gumbo-parser/src/char_ref.c +8 -8
- data/gumbo-parser/src/char_ref.h +3 -3
- data/gumbo-parser/src/error.h +13 -13
- data/gumbo-parser/src/gumbo.h +21 -22
- data/gumbo-parser/src/insertion_mode.h +1 -1
- data/gumbo-parser/src/parser.c +1 -1
- data/gumbo-parser/src/parser.h +9 -9
- data/gumbo-parser/src/string_buffer.c +8 -8
- data/gumbo-parser/src/string_buffer.h +10 -11
- data/gumbo-parser/src/string_piece.c +2 -2
- data/gumbo-parser/src/string_piece.h +2 -2
- data/gumbo-parser/src/token_type.h +1 -1
- data/gumbo-parser/src/tokenizer.c +2 -2
- data/gumbo-parser/src/tokenizer.h +12 -12
- data/gumbo-parser/src/tokenizer_states.h +1 -1
- data/gumbo-parser/src/utf8.h +7 -7
- data/gumbo-parser/src/util.h +5 -4
- data/gumbo-parser/src/vector.c +11 -9
- data/gumbo-parser/src/vector.h +11 -8
- metadata +27 -31
- data/work/extconf.rb +0 -21
- data/work/nokogumbo.c +0 -100
metadata
CHANGED
@@ -1,20 +1,18 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: nokogumbo
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: '0.9'
|
5
|
-
prerelease:
|
4
|
+
version: '0.10'
|
6
5
|
platform: ruby
|
7
6
|
authors:
|
8
7
|
- Sam Ruby
|
9
8
|
autorequire:
|
10
9
|
bindir: bin
|
11
10
|
cert_chain: []
|
12
|
-
date: 2013-
|
11
|
+
date: 2013-09-03 00:00:00.000000000 Z
|
13
12
|
dependencies:
|
14
13
|
- !ruby/object:Gem::Dependency
|
15
14
|
name: nokogiri
|
16
15
|
requirement: !ruby/object:Gem::Requirement
|
17
|
-
none: false
|
18
16
|
requirements:
|
19
17
|
- - ! '>='
|
20
18
|
- !ruby/object:Gem::Version
|
@@ -22,7 +20,6 @@ dependencies:
|
|
22
20
|
type: :runtime
|
23
21
|
prerelease: false
|
24
22
|
version_requirements: !ruby/object:Gem::Requirement
|
25
|
-
none: false
|
26
23
|
requirements:
|
27
24
|
- - ! '>='
|
28
25
|
- !ruby/object:Gem::Version
|
@@ -32,62 +29,61 @@ description: Nokogumbo allows a Ruby program to invoke the Gumbo HTML5 parser an
|
|
32
29
|
email: rubys@intertwingly.net
|
33
30
|
executables: []
|
34
31
|
extensions:
|
35
|
-
-
|
32
|
+
- ext/nokogumboc/extconf.rb
|
36
33
|
extra_rdoc_files: []
|
37
34
|
files:
|
35
|
+
- ext/nokogumboc/extconf.rb
|
36
|
+
- ext/nokogumboc/nokogumbo.c
|
38
37
|
- lib/nokogumbo.rb
|
39
38
|
- LICENSE.txt
|
40
39
|
- README.md
|
41
|
-
-
|
42
|
-
-
|
43
|
-
- gumbo-parser/src/
|
44
|
-
- gumbo-parser/src/
|
45
|
-
- gumbo-parser/src/
|
46
|
-
- gumbo-parser/src/vector.c
|
47
|
-
- gumbo-parser/src/string_buffer.c
|
48
|
-
- gumbo-parser/src/tokenizer_states.h
|
40
|
+
- gumbo-parser/src/attribute.c
|
41
|
+
- gumbo-parser/src/attribute.h
|
42
|
+
- gumbo-parser/src/char_ref.c
|
43
|
+
- gumbo-parser/src/char_ref.h
|
44
|
+
- gumbo-parser/src/error.c
|
49
45
|
- gumbo-parser/src/error.h
|
46
|
+
- gumbo-parser/src/gumbo.h
|
47
|
+
- gumbo-parser/src/insertion_mode.h
|
48
|
+
- gumbo-parser/src/parser.c
|
50
49
|
- gumbo-parser/src/parser.h
|
51
|
-
- gumbo-parser/src/
|
52
|
-
- gumbo-parser/src/tokenizer.h
|
50
|
+
- gumbo-parser/src/string_buffer.c
|
53
51
|
- gumbo-parser/src/string_buffer.h
|
54
|
-
- gumbo-parser/src/vector.h
|
55
|
-
- gumbo-parser/src/string_piece.h
|
56
|
-
- gumbo-parser/src/attribute.c
|
57
|
-
- gumbo-parser/src/char_ref.c
|
58
52
|
- gumbo-parser/src/string_piece.c
|
59
|
-
- gumbo-parser/src/
|
53
|
+
- gumbo-parser/src/string_piece.h
|
60
54
|
- gumbo-parser/src/tag.c
|
61
|
-
- gumbo-parser/src/
|
62
|
-
- gumbo-parser/src/parser.c
|
63
|
-
- gumbo-parser/src/utf8.c
|
64
|
-
- gumbo-parser/src/attribute.h
|
65
|
-
- gumbo-parser/src/char_ref.h
|
66
|
-
- gumbo-parser/src/insertion_mode.h
|
55
|
+
- gumbo-parser/src/token_type.h
|
67
56
|
- gumbo-parser/src/tokenizer.c
|
57
|
+
- gumbo-parser/src/tokenizer.h
|
58
|
+
- gumbo-parser/src/tokenizer_states.h
|
59
|
+
- gumbo-parser/src/utf8.c
|
60
|
+
- gumbo-parser/src/utf8.h
|
61
|
+
- gumbo-parser/src/util.c
|
62
|
+
- gumbo-parser/src/util.h
|
63
|
+
- gumbo-parser/src/vector.c
|
64
|
+
- gumbo-parser/src/vector.h
|
68
65
|
homepage: https://github.com/rubys/nokogumbo/#readme
|
69
66
|
licenses:
|
70
67
|
- Apache 2.0
|
68
|
+
metadata: {}
|
71
69
|
post_install_message:
|
72
70
|
rdoc_options: []
|
73
71
|
require_paths:
|
74
72
|
- lib
|
75
73
|
required_ruby_version: !ruby/object:Gem::Requirement
|
76
|
-
none: false
|
77
74
|
requirements:
|
78
75
|
- - ! '>='
|
79
76
|
- !ruby/object:Gem::Version
|
80
77
|
version: '0'
|
81
78
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
82
|
-
none: false
|
83
79
|
requirements:
|
84
80
|
- - ! '>='
|
85
81
|
- !ruby/object:Gem::Version
|
86
82
|
version: '0'
|
87
83
|
requirements: []
|
88
84
|
rubyforge_project:
|
89
|
-
rubygems_version:
|
85
|
+
rubygems_version: 2.0.7
|
90
86
|
signing_key:
|
91
|
-
specification_version:
|
87
|
+
specification_version: 4
|
92
88
|
summary: Nokogiri interface to the Gumbo HTML5 parser
|
93
89
|
test_files: []
|
data/work/extconf.rb
DELETED
@@ -1,21 +0,0 @@
|
|
1
|
-
require 'mkmf'
|
2
|
-
$CFLAGS = " -std=c99"
|
3
|
-
|
4
|
-
# libxml2 libraries from http://www.xmlsoft.org/
|
5
|
-
pkg_config('libxml-2.0')
|
6
|
-
|
7
|
-
# nokogiri configuration from gem install
|
8
|
-
nokogiri_lib = Gem.find_files('nokogiri').sort.last or gem 'nokogiri'
|
9
|
-
nokogiri_ext = nokogiri_lib.sub(%r(lib/nokogiri(.rb)?$), 'ext/nokogiri')
|
10
|
-
unless find_header('nokogiri.h', nokogiri_ext)
|
11
|
-
require "#{nokogiri_ext}/extconf.rb"
|
12
|
-
find_header('nokogiri.h', nokogiri_ext)
|
13
|
-
end
|
14
|
-
|
15
|
-
# add in gumbo-parser source from github if not already installed
|
16
|
-
unless have_library('gumbo', 'gumbo_parse') or File.exist? 'work/gumbo.h'
|
17
|
-
require 'fileutils'
|
18
|
-
FileUtils.cp Dir['../gumbo-parser/src/*'], '.'
|
19
|
-
end
|
20
|
-
|
21
|
-
create_makefile('nokogumboc')
|
data/work/nokogumbo.c
DELETED
@@ -1,100 +0,0 @@
|
|
1
|
-
#include <ruby.h>
|
2
|
-
#include <gumbo.h>
|
3
|
-
#include <nokogiri.h>
|
4
|
-
#include <libxml/tree.h>
|
5
|
-
|
6
|
-
#define CONST_CAST (xmlChar const*)
|
7
|
-
|
8
|
-
// class constants
|
9
|
-
static VALUE Document;
|
10
|
-
|
11
|
-
// Build a Nokogiri Element for a given GumboElement (recursively)
|
12
|
-
static xmlNodePtr walk_tree(xmlDocPtr document, GumboElement *node) {
|
13
|
-
// determine tag name for a given node
|
14
|
-
xmlNodePtr element;
|
15
|
-
if (node->tag != GUMBO_TAG_UNKNOWN) {
|
16
|
-
element = xmlNewNode(NULL, CONST_CAST gumbo_normalized_tagname(node->tag));
|
17
|
-
} else {
|
18
|
-
GumboStringPiece tag = node->original_tag;
|
19
|
-
gumbo_tag_from_original_text(&tag);
|
20
|
-
char name[tag.length+1];
|
21
|
-
strncpy(name, tag.data, tag.length);
|
22
|
-
name[tag.length] = '\0';
|
23
|
-
element = xmlNewNode(NULL, BAD_CAST name);
|
24
|
-
}
|
25
|
-
|
26
|
-
// add in the attributes
|
27
|
-
GumboVector* attrs = &node->attributes;
|
28
|
-
for (int i=0; i < attrs->length; i++) {
|
29
|
-
GumboAttribute *attr = attrs->data[i];
|
30
|
-
xmlNewProp(element, CONST_CAST attr->name, CONST_CAST attr->value);
|
31
|
-
}
|
32
|
-
|
33
|
-
// add in the children
|
34
|
-
GumboVector* children = &node->children;
|
35
|
-
for (int i=0; i < children->length; i++) {
|
36
|
-
GumboNode* child = children->data[i];
|
37
|
-
|
38
|
-
xmlNodePtr node = NULL;
|
39
|
-
|
40
|
-
switch (child->type) {
|
41
|
-
case GUMBO_NODE_ELEMENT:
|
42
|
-
node = walk_tree(document, &child->v.element);
|
43
|
-
break;
|
44
|
-
case GUMBO_NODE_WHITESPACE:
|
45
|
-
case GUMBO_NODE_TEXT:
|
46
|
-
node = xmlNewText(CONST_CAST child->v.text.text);
|
47
|
-
break;
|
48
|
-
case GUMBO_NODE_CDATA:
|
49
|
-
node = xmlNewCDataBlock(document,
|
50
|
-
CONST_CAST child->v.text.original_text.data,
|
51
|
-
(int) child->v.text.original_text.length);
|
52
|
-
break;
|
53
|
-
case GUMBO_NODE_COMMENT:
|
54
|
-
node = xmlNewComment(CONST_CAST child->v.text.text);
|
55
|
-
break;
|
56
|
-
case GUMBO_NODE_DOCUMENT:
|
57
|
-
break; // should never happen -- ignore
|
58
|
-
}
|
59
|
-
|
60
|
-
if (node) xmlAddChild(element, node);
|
61
|
-
}
|
62
|
-
|
63
|
-
return element;
|
64
|
-
}
|
65
|
-
|
66
|
-
// Parse a string using gumbo_parse into a Nokogiri document
|
67
|
-
static VALUE parse(VALUE self, VALUE string) {
|
68
|
-
GumboOutput *output = gumbo_parse_with_options(
|
69
|
-
&kGumboDefaultOptions, RSTRING_PTR(string),
|
70
|
-
(size_t) RSTRING_LEN(string)
|
71
|
-
);
|
72
|
-
xmlDocPtr doc = xmlNewDoc(CONST_CAST "1.0");
|
73
|
-
xmlNodePtr root = walk_tree(doc, &output->root->v.element);
|
74
|
-
xmlDocSetRootElement(doc, root);
|
75
|
-
if (output->document->v.document.has_doctype) {
|
76
|
-
const char *public = output->document->v.document.public_identifier;
|
77
|
-
const char *system = output->document->v.document.system_identifier;
|
78
|
-
xmlCreateIntSubset(doc, CONST_CAST "html",
|
79
|
-
(strlen(public) ? CONST_CAST public : NULL),
|
80
|
-
(strlen(system) ? CONST_CAST system : NULL));
|
81
|
-
}
|
82
|
-
gumbo_destroy_output(&kGumboDefaultOptions, output);
|
83
|
-
|
84
|
-
return Nokogiri_wrap_xml_document(Document, doc);
|
85
|
-
}
|
86
|
-
|
87
|
-
// Initialize the Nokogumbo class and fetch constants we will use later
|
88
|
-
void Init_nokogumboc() {
|
89
|
-
rb_funcall(rb_mKernel, rb_intern("gem"), 1, rb_str_new2("nokogiri"));
|
90
|
-
rb_require("nokogiri");
|
91
|
-
|
92
|
-
// class constants
|
93
|
-
VALUE Nokogiri = rb_const_get(rb_cObject, rb_intern("Nokogiri"));
|
94
|
-
VALUE HTML = rb_const_get(Nokogiri, rb_intern("HTML"));
|
95
|
-
Document = rb_const_get(HTML, rb_intern("Document"));
|
96
|
-
|
97
|
-
// define Nokogumbo class with a singleton parse method
|
98
|
-
VALUE Gumbo = rb_define_class("Nokogumbo", rb_cObject);
|
99
|
-
rb_define_singleton_method(Gumbo, "parse", parse, 1);
|
100
|
-
}
|