nokogumbo 0.9 → 0.10
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +15 -0
- data/README.md +15 -10
- data/ext/nokogumboc/extconf.rb +34 -0
- data/ext/nokogumboc/nokogumbo.c +192 -0
- data/gumbo-parser/src/attribute.c +3 -3
- data/gumbo-parser/src/attribute.h +2 -2
- data/gumbo-parser/src/char_ref.c +8 -8
- data/gumbo-parser/src/char_ref.h +3 -3
- data/gumbo-parser/src/error.h +13 -13
- data/gumbo-parser/src/gumbo.h +21 -22
- data/gumbo-parser/src/insertion_mode.h +1 -1
- data/gumbo-parser/src/parser.c +1 -1
- data/gumbo-parser/src/parser.h +9 -9
- data/gumbo-parser/src/string_buffer.c +8 -8
- data/gumbo-parser/src/string_buffer.h +10 -11
- data/gumbo-parser/src/string_piece.c +2 -2
- data/gumbo-parser/src/string_piece.h +2 -2
- data/gumbo-parser/src/token_type.h +1 -1
- data/gumbo-parser/src/tokenizer.c +2 -2
- data/gumbo-parser/src/tokenizer.h +12 -12
- data/gumbo-parser/src/tokenizer_states.h +1 -1
- data/gumbo-parser/src/utf8.h +7 -7
- data/gumbo-parser/src/util.h +5 -4
- data/gumbo-parser/src/vector.c +11 -9
- data/gumbo-parser/src/vector.h +11 -8
- metadata +27 -31
- data/work/extconf.rb +0 -21
- data/work/nokogumbo.c +0 -100
metadata
CHANGED
@@ -1,20 +1,18 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: nokogumbo
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: '0.9'
|
5
|
-
prerelease:
|
4
|
+
version: '0.10'
|
6
5
|
platform: ruby
|
7
6
|
authors:
|
8
7
|
- Sam Ruby
|
9
8
|
autorequire:
|
10
9
|
bindir: bin
|
11
10
|
cert_chain: []
|
12
|
-
date: 2013-
|
11
|
+
date: 2013-09-03 00:00:00.000000000 Z
|
13
12
|
dependencies:
|
14
13
|
- !ruby/object:Gem::Dependency
|
15
14
|
name: nokogiri
|
16
15
|
requirement: !ruby/object:Gem::Requirement
|
17
|
-
none: false
|
18
16
|
requirements:
|
19
17
|
- - ! '>='
|
20
18
|
- !ruby/object:Gem::Version
|
@@ -22,7 +20,6 @@ dependencies:
|
|
22
20
|
type: :runtime
|
23
21
|
prerelease: false
|
24
22
|
version_requirements: !ruby/object:Gem::Requirement
|
25
|
-
none: false
|
26
23
|
requirements:
|
27
24
|
- - ! '>='
|
28
25
|
- !ruby/object:Gem::Version
|
@@ -32,62 +29,61 @@ description: Nokogumbo allows a Ruby program to invoke the Gumbo HTML5 parser an
|
|
32
29
|
email: rubys@intertwingly.net
|
33
30
|
executables: []
|
34
31
|
extensions:
|
35
|
-
- work/extconf.rb
|
32
|
+
- ext/nokogumboc/extconf.rb
|
36
33
|
extra_rdoc_files: []
|
37
34
|
files:
|
35
|
+
- ext/nokogumboc/extconf.rb
|
36
|
+
- ext/nokogumboc/nokogumbo.c
|
38
37
|
- lib/nokogumbo.rb
|
39
38
|
- LICENSE.txt
|
40
39
|
- README.md
|
41
|
-
-
|
42
|
-
-
|
43
|
-
- gumbo-parser/src/
|
44
|
-
- gumbo-parser/src/
|
45
|
-
- gumbo-parser/src/
|
46
|
-
- gumbo-parser/src/vector.c
|
47
|
-
- gumbo-parser/src/string_buffer.c
|
48
|
-
- gumbo-parser/src/tokenizer_states.h
|
40
|
+
- gumbo-parser/src/attribute.c
|
41
|
+
- gumbo-parser/src/attribute.h
|
42
|
+
- gumbo-parser/src/char_ref.c
|
43
|
+
- gumbo-parser/src/char_ref.h
|
44
|
+
- gumbo-parser/src/error.c
|
49
45
|
- gumbo-parser/src/error.h
|
46
|
+
- gumbo-parser/src/gumbo.h
|
47
|
+
- gumbo-parser/src/insertion_mode.h
|
48
|
+
- gumbo-parser/src/parser.c
|
50
49
|
- gumbo-parser/src/parser.h
|
51
|
-
- gumbo-parser/src/
|
52
|
-
- gumbo-parser/src/tokenizer.h
|
50
|
+
- gumbo-parser/src/string_buffer.c
|
53
51
|
- gumbo-parser/src/string_buffer.h
|
54
|
-
- gumbo-parser/src/vector.h
|
55
|
-
- gumbo-parser/src/string_piece.h
|
56
|
-
- gumbo-parser/src/attribute.c
|
57
|
-
- gumbo-parser/src/char_ref.c
|
58
52
|
- gumbo-parser/src/string_piece.c
|
59
|
-
- gumbo-parser/src/
|
53
|
+
- gumbo-parser/src/string_piece.h
|
60
54
|
- gumbo-parser/src/tag.c
|
61
|
-
- gumbo-parser/src/
|
62
|
-
- gumbo-parser/src/parser.c
|
63
|
-
- gumbo-parser/src/utf8.c
|
64
|
-
- gumbo-parser/src/attribute.h
|
65
|
-
- gumbo-parser/src/char_ref.h
|
66
|
-
- gumbo-parser/src/insertion_mode.h
|
55
|
+
- gumbo-parser/src/token_type.h
|
67
56
|
- gumbo-parser/src/tokenizer.c
|
57
|
+
- gumbo-parser/src/tokenizer.h
|
58
|
+
- gumbo-parser/src/tokenizer_states.h
|
59
|
+
- gumbo-parser/src/utf8.c
|
60
|
+
- gumbo-parser/src/utf8.h
|
61
|
+
- gumbo-parser/src/util.c
|
62
|
+
- gumbo-parser/src/util.h
|
63
|
+
- gumbo-parser/src/vector.c
|
64
|
+
- gumbo-parser/src/vector.h
|
68
65
|
homepage: https://github.com/rubys/nokogumbo/#readme
|
69
66
|
licenses:
|
70
67
|
- Apache 2.0
|
68
|
+
metadata: {}
|
71
69
|
post_install_message:
|
72
70
|
rdoc_options: []
|
73
71
|
require_paths:
|
74
72
|
- lib
|
75
73
|
required_ruby_version: !ruby/object:Gem::Requirement
|
76
|
-
none: false
|
77
74
|
requirements:
|
78
75
|
- - ! '>='
|
79
76
|
- !ruby/object:Gem::Version
|
80
77
|
version: '0'
|
81
78
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
82
|
-
none: false
|
83
79
|
requirements:
|
84
80
|
- - ! '>='
|
85
81
|
- !ruby/object:Gem::Version
|
86
82
|
version: '0'
|
87
83
|
requirements: []
|
88
84
|
rubyforge_project:
|
89
|
-
rubygems_version:
|
85
|
+
rubygems_version: 2.0.7
|
90
86
|
signing_key:
|
91
|
-
specification_version:
|
87
|
+
specification_version: 4
|
92
88
|
summary: Nokogiri interface to the Gumbo HTML5 parser
|
93
89
|
test_files: []
|
data/work/extconf.rb
DELETED
@@ -1,21 +0,0 @@
|
|
1
|
-
require 'mkmf'
|
2
|
-
$CFLAGS = " -std=c99"
|
3
|
-
|
4
|
-
# libxml2 libraries from http://www.xmlsoft.org/
|
5
|
-
pkg_config('libxml-2.0')
|
6
|
-
|
7
|
-
# nokogiri configuration from gem install
|
8
|
-
nokogiri_lib = Gem.find_files('nokogiri').sort.last or gem 'nokogiri'
|
9
|
-
nokogiri_ext = nokogiri_lib.sub(%r(lib/nokogiri(.rb)?$), 'ext/nokogiri')
|
10
|
-
unless find_header('nokogiri.h', nokogiri_ext)
|
11
|
-
require "#{nokogiri_ext}/extconf.rb"
|
12
|
-
find_header('nokogiri.h', nokogiri_ext)
|
13
|
-
end
|
14
|
-
|
15
|
-
# add in gumbo-parser source from github if not already installed
|
16
|
-
unless have_library('gumbo', 'gumbo_parse') or File.exist? 'work/gumbo.h'
|
17
|
-
require 'fileutils'
|
18
|
-
FileUtils.cp Dir['../gumbo-parser/src/*'], '.'
|
19
|
-
end
|
20
|
-
|
21
|
-
create_makefile('nokogumboc')
|
data/work/nokogumbo.c
DELETED
@@ -1,100 +0,0 @@
|
|
1
|
-
#include <ruby.h>
|
2
|
-
#include <gumbo.h>
|
3
|
-
#include <nokogiri.h>
|
4
|
-
#include <libxml/tree.h>
|
5
|
-
|
6
|
-
#define CONST_CAST (xmlChar const*)
|
7
|
-
|
8
|
-
// class constants
|
9
|
-
static VALUE Document;
|
10
|
-
|
11
|
-
// Build a Nokogiri Element for a given GumboElement (recursively)
|
12
|
-
static xmlNodePtr walk_tree(xmlDocPtr document, GumboElement *node) {
|
13
|
-
// determine tag name for a given node
|
14
|
-
xmlNodePtr element;
|
15
|
-
if (node->tag != GUMBO_TAG_UNKNOWN) {
|
16
|
-
element = xmlNewNode(NULL, CONST_CAST gumbo_normalized_tagname(node->tag));
|
17
|
-
} else {
|
18
|
-
GumboStringPiece tag = node->original_tag;
|
19
|
-
gumbo_tag_from_original_text(&tag);
|
20
|
-
char name[tag.length+1];
|
21
|
-
strncpy(name, tag.data, tag.length);
|
22
|
-
name[tag.length] = '\0';
|
23
|
-
element = xmlNewNode(NULL, BAD_CAST name);
|
24
|
-
}
|
25
|
-
|
26
|
-
// add in the attributes
|
27
|
-
GumboVector* attrs = &node->attributes;
|
28
|
-
for (int i=0; i < attrs->length; i++) {
|
29
|
-
GumboAttribute *attr = attrs->data[i];
|
30
|
-
xmlNewProp(element, CONST_CAST attr->name, CONST_CAST attr->value);
|
31
|
-
}
|
32
|
-
|
33
|
-
// add in the children
|
34
|
-
GumboVector* children = &node->children;
|
35
|
-
for (int i=0; i < children->length; i++) {
|
36
|
-
GumboNode* child = children->data[i];
|
37
|
-
|
38
|
-
xmlNodePtr node = NULL;
|
39
|
-
|
40
|
-
switch (child->type) {
|
41
|
-
case GUMBO_NODE_ELEMENT:
|
42
|
-
node = walk_tree(document, &child->v.element);
|
43
|
-
break;
|
44
|
-
case GUMBO_NODE_WHITESPACE:
|
45
|
-
case GUMBO_NODE_TEXT:
|
46
|
-
node = xmlNewText(CONST_CAST child->v.text.text);
|
47
|
-
break;
|
48
|
-
case GUMBO_NODE_CDATA:
|
49
|
-
node = xmlNewCDataBlock(document,
|
50
|
-
CONST_CAST child->v.text.original_text.data,
|
51
|
-
(int) child->v.text.original_text.length);
|
52
|
-
break;
|
53
|
-
case GUMBO_NODE_COMMENT:
|
54
|
-
node = xmlNewComment(CONST_CAST child->v.text.text);
|
55
|
-
break;
|
56
|
-
case GUMBO_NODE_DOCUMENT:
|
57
|
-
break; // should never happen -- ignore
|
58
|
-
}
|
59
|
-
|
60
|
-
if (node) xmlAddChild(element, node);
|
61
|
-
}
|
62
|
-
|
63
|
-
return element;
|
64
|
-
}
|
65
|
-
|
66
|
-
// Parse a string using gumbo_parse into a Nokogiri document
|
67
|
-
static VALUE parse(VALUE self, VALUE string) {
|
68
|
-
GumboOutput *output = gumbo_parse_with_options(
|
69
|
-
&kGumboDefaultOptions, RSTRING_PTR(string),
|
70
|
-
(size_t) RSTRING_LEN(string)
|
71
|
-
);
|
72
|
-
xmlDocPtr doc = xmlNewDoc(CONST_CAST "1.0");
|
73
|
-
xmlNodePtr root = walk_tree(doc, &output->root->v.element);
|
74
|
-
xmlDocSetRootElement(doc, root);
|
75
|
-
if (output->document->v.document.has_doctype) {
|
76
|
-
const char *public = output->document->v.document.public_identifier;
|
77
|
-
const char *system = output->document->v.document.system_identifier;
|
78
|
-
xmlCreateIntSubset(doc, CONST_CAST "html",
|
79
|
-
(strlen(public) ? CONST_CAST public : NULL),
|
80
|
-
(strlen(system) ? CONST_CAST system : NULL));
|
81
|
-
}
|
82
|
-
gumbo_destroy_output(&kGumboDefaultOptions, output);
|
83
|
-
|
84
|
-
return Nokogiri_wrap_xml_document(Document, doc);
|
85
|
-
}
|
86
|
-
|
87
|
-
// Initialize the Nokogumbo class and fetch constants we will use later
|
88
|
-
void Init_nokogumboc() {
|
89
|
-
rb_funcall(rb_mKernel, rb_intern("gem"), 1, rb_str_new2("nokogiri"));
|
90
|
-
rb_require("nokogiri");
|
91
|
-
|
92
|
-
// class constants
|
93
|
-
VALUE Nokogiri = rb_const_get(rb_cObject, rb_intern("Nokogiri"));
|
94
|
-
VALUE HTML = rb_const_get(Nokogiri, rb_intern("HTML"));
|
95
|
-
Document = rb_const_get(HTML, rb_intern("Document"));
|
96
|
-
|
97
|
-
// define Nokogumbo class with a singleton parse method
|
98
|
-
VALUE Gumbo = rb_define_class("Nokogumbo", rb_cObject);
|
99
|
-
rb_define_singleton_method(Gumbo, "parse", parse, 1);
|
100
|
-
}
|