nokogumbo 0.5 → 0.5.1
Sign up to get free protection for your applications and to get access to all the features.
- data/work/Makefile +213 -0
- data/work/attribute.c +44 -0
- data/work/attribute.h +37 -0
- data/work/attribute.o +0 -0
- data/work/char_ref.c +2561 -0
- data/work/char_ref.h +61 -0
- data/work/char_ref.o +0 -0
- data/work/error.c +258 -0
- data/work/error.h +225 -0
- data/work/error.o +0 -0
- data/work/gumbo.h +800 -0
- data/work/insertion_mode.h +54 -0
- data/work/mkmf.log +41 -0
- data/work/nokogumbo.c +97 -0
- data/work/nokogumbo.o +0 -0
- data/work/nokogumboc.so +0 -0
- data/work/parser.c +3893 -0
- data/work/parser.h +57 -0
- data/work/parser.o +0 -0
- data/work/string_buffer.c +106 -0
- data/work/string_buffer.h +82 -0
- data/work/string_buffer.o +0 -0
- data/work/string_piece.c +49 -0
- data/work/string_piece.h +39 -0
- data/work/string_piece.o +0 -0
- data/work/tag.c +222 -0
- data/work/tag.o +0 -0
- data/work/token_type.h +40 -0
- data/work/tokenizer.c +2978 -0
- data/work/tokenizer.h +123 -0
- data/work/tokenizer.o +0 -0
- data/work/tokenizer_states.h +103 -0
- data/work/utf8.c +268 -0
- data/work/utf8.h +127 -0
- data/work/utf8.o +0 -0
- data/work/util.c +58 -0
- data/work/util.h +57 -0
- data/work/util.o +0 -0
- data/work/vector.c +121 -0
- data/work/vector.h +66 -0
- data/work/vector.o +0 -0
- metadata +42 -2
- data/Rakefile +0 -68
@@ -0,0 +1,54 @@
|
|
1
|
+
// Copyright 2011 Google Inc. All Rights Reserved.
|
2
|
+
//
|
3
|
+
// Licensed under the Apache License, Version 2.0 (the "License");
|
4
|
+
// you may not use this file except in compliance with the License.
|
5
|
+
// You may obtain a copy of the License at
|
6
|
+
//
|
7
|
+
// http://www.apache.org/licenses/LICENSE-2.0
|
8
|
+
//
|
9
|
+
// Unless required by applicable law or agreed to in writing, software
|
10
|
+
// distributed under the License is distributed on an "AS IS" BASIS,
|
11
|
+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12
|
+
// See the License for the specific language governing permissions and
|
13
|
+
// limitations under the License.
|
14
|
+
//
|
15
|
+
// Author: jdtang@google.com (Jonathan Tang)
|
16
|
+
|
17
|
+
#ifndef GUMBO_INSERTION_MODE_H_
|
18
|
+
#define GUMBO_INSERTION_MODE_H_
|
19
|
+
|
20
|
+
#ifdef __cplusplus
|
21
|
+
extern "C" {
|
22
|
+
#endif
|
23
|
+
|
24
|
+
// Insertion modes used by the tree-construction stage of the parser, as
// enumerated at
// http://www.whatwg.org/specs/web-apps/current-work/complete/parsing.html#insertion-mode
//
// Enumerators are numbered consecutively from zero in declaration order.
typedef enum _GumboInsertionMode {
  GUMBO_INSERTION_MODE_INITIAL = 0,
  GUMBO_INSERTION_MODE_BEFORE_HTML,
  GUMBO_INSERTION_MODE_BEFORE_HEAD,

  // head-related modes
  GUMBO_INSERTION_MODE_IN_HEAD,
  GUMBO_INSERTION_MODE_IN_HEAD_NOSCRIPT,
  GUMBO_INSERTION_MODE_AFTER_HEAD,

  // body and text modes
  GUMBO_INSERTION_MODE_IN_BODY,
  GUMBO_INSERTION_MODE_TEXT,

  // table-related modes
  GUMBO_INSERTION_MODE_IN_TABLE,
  GUMBO_INSERTION_MODE_IN_TABLE_TEXT,
  GUMBO_INSERTION_MODE_IN_CAPTION,
  GUMBO_INSERTION_MODE_IN_COLUMN_GROUP,
  GUMBO_INSERTION_MODE_IN_TABLE_BODY,
  GUMBO_INSERTION_MODE_IN_ROW,
  GUMBO_INSERTION_MODE_IN_CELL,

  // select-related modes
  GUMBO_INSERTION_MODE_IN_SELECT,
  GUMBO_INSERTION_MODE_IN_SELECT_IN_TABLE,

  // trailing modes
  GUMBO_INSERTION_MODE_AFTER_BODY,
  GUMBO_INSERTION_MODE_IN_FRAMESET,
  GUMBO_INSERTION_MODE_AFTER_FRAMESET,
  GUMBO_INSERTION_MODE_AFTER_AFTER_BODY,
  GUMBO_INSERTION_MODE_AFTER_AFTER_FRAMESET
} GumboInsertionMode;
|
49
|
+
|
50
|
+
#ifdef __cplusplus
|
51
|
+
} // extern C
|
52
|
+
#endif
|
53
|
+
|
54
|
+
#endif // GUMBO_INSERTION_MODE_H_
|
data/work/mkmf.log
ADDED
@@ -0,0 +1,41 @@
|
|
1
|
+
package configuration for libxml-2.0
|
2
|
+
cflags: -I/usr/include/libxml2
|
3
|
+
ldflags:
|
4
|
+
libs: -lxml2
|
5
|
+
|
6
|
+
find_header: checking for nokogiri.h in /var/lib/gems/1.9.1/gems/nokogiri-1.5.5/ext/nokogiri... -------------------- yes
|
7
|
+
|
8
|
+
"gcc -o conftest -I/usr/include/ruby-1.9.1/x86_64-linux -I/usr/include/ruby-1.9.1/ruby/backward -I/usr/include/ruby-1.9.1 -I. -D_FORTIFY_SOURCE=2 -D_FORTIFY_SOURCE=2 -std=c99 -I/usr/include/libxml2 conftest.c -L. -L/usr/lib -L. -Wl,-Bsymbolic-functions -Wl,-z,relro -rdynamic -Wl,-export-dynamic -lxml2 -lruby-1.9.1 -lpthread -lrt -ldl -lcrypt -lm -lc"
|
9
|
+
checked program was:
|
10
|
+
/* begin */
|
11
|
+
1: #include "ruby.h"
|
12
|
+
2:
|
13
|
+
3: int main() {return 0;}
|
14
|
+
/* end */
|
15
|
+
|
16
|
+
"gcc -E -I/usr/include/ruby-1.9.1/x86_64-linux -I/usr/include/ruby-1.9.1/ruby/backward -I/usr/include/ruby-1.9.1 -I. -D_FORTIFY_SOURCE=2 -D_FORTIFY_SOURCE=2 -std=c99 -I/usr/include/libxml2 conftest.c -o conftest.i"
|
17
|
+
conftest.c:3:22: fatal error: nokogiri.h: No such file or directory
|
18
|
+
compilation terminated.
|
19
|
+
checked program was:
|
20
|
+
/* begin */
|
21
|
+
1: #include "ruby.h"
|
22
|
+
2:
|
23
|
+
3: #include <nokogiri.h>
|
24
|
+
/* end */
|
25
|
+
|
26
|
+
"gcc -E -I/usr/include/ruby-1.9.1/x86_64-linux -I/usr/include/ruby-1.9.1/ruby/backward -I/usr/include/ruby-1.9.1 -I. -D_FORTIFY_SOURCE=2 -D_FORTIFY_SOURCE=2 -std=c99 -I/usr/include/libxml2 -I/var/lib/gems/1.9.1/gems/nokogiri-1.5.5/ext/nokogiri conftest.c -o conftest.i"
|
27
|
+
In file included from conftest.c:3:0:
|
28
|
+
/var/lib/gems/1.9.1/gems/nokogiri-1.5.5/ext/nokogiri/nokogiri.h:13:0: warning: "_GNU_SOURCE" redefined [enabled by default]
|
29
|
+
In file included from /usr/include/ruby-1.9.1/ruby/ruby.h:24:0,
|
30
|
+
from /usr/include/ruby-1.9.1/ruby.h:32,
|
31
|
+
from conftest.c:1:
|
32
|
+
/usr/include/ruby-1.9.1/x86_64-linux/ruby/config.h:17:0: note: this is the location of the previous definition
|
33
|
+
checked program was:
|
34
|
+
/* begin */
|
35
|
+
1: #include "ruby.h"
|
36
|
+
2:
|
37
|
+
3: #include <nokogiri.h>
|
38
|
+
/* end */
|
39
|
+
|
40
|
+
--------------------
|
41
|
+
|
data/work/nokogumbo.c
ADDED
@@ -0,0 +1,97 @@
|
|
1
|
+
#include <ruby.h>
|
2
|
+
#include <gumbo.h>
|
3
|
+
#include <nokogiri.h>
|
4
|
+
#include <libxml/tree.h>
|
5
|
+
|
6
|
+
// class constants
|
7
|
+
static VALUE Document;
|
8
|
+
|
9
|
+
// Build a Nokogiri Element for a given GumboElement (recursively)
|
10
|
+
static xmlNodePtr walk_tree(xmlDocPtr document, GumboElement *node) {
|
11
|
+
// determine tag name for a given node
|
12
|
+
xmlNodePtr element;
|
13
|
+
if (node->tag != GUMBO_TAG_UNKNOWN) {
|
14
|
+
element = xmlNewNode(NULL, BAD_CAST gumbo_normalized_tagname(node->tag));
|
15
|
+
} else {
|
16
|
+
GumboStringPiece tag = node->original_tag;
|
17
|
+
gumbo_tag_from_original_text(&tag);
|
18
|
+
char name[tag.length+1];
|
19
|
+
strncpy(name, tag.data, tag.length);
|
20
|
+
name[tag.length] = '\0';
|
21
|
+
element = xmlNewNode(NULL, BAD_CAST name);
|
22
|
+
}
|
23
|
+
|
24
|
+
// add in the attributes
|
25
|
+
GumboVector* attrs = &node->attributes;
|
26
|
+
for (int i=0; i < attrs->length; i++) {
|
27
|
+
GumboAttribute *attr = attrs->data[i];
|
28
|
+
xmlNewProp(element, BAD_CAST attr->name, BAD_CAST attr->value);
|
29
|
+
}
|
30
|
+
|
31
|
+
// add in the children
|
32
|
+
GumboVector* children = &node->children;
|
33
|
+
for (int i=0; i < children->length; i++) {
|
34
|
+
GumboNode* child = children->data[i];
|
35
|
+
|
36
|
+
xmlNodePtr node = NULL;
|
37
|
+
|
38
|
+
switch (child->type) {
|
39
|
+
case GUMBO_NODE_ELEMENT:
|
40
|
+
node = walk_tree(document, &child->v.element);
|
41
|
+
break;
|
42
|
+
case GUMBO_NODE_WHITESPACE:
|
43
|
+
case GUMBO_NODE_TEXT:
|
44
|
+
node = xmlNewText(BAD_CAST child->v.text.text);
|
45
|
+
break;
|
46
|
+
case GUMBO_NODE_CDATA:
|
47
|
+
node = xmlNewCDataBlock(document,
|
48
|
+
BAD_CAST child->v.text.original_text.data,
|
49
|
+
child->v.text.original_text.length);
|
50
|
+
break;
|
51
|
+
case GUMBO_NODE_COMMENT:
|
52
|
+
node = xmlNewComment(BAD_CAST child->v.text.text);
|
53
|
+
break;
|
54
|
+
case GUMBO_NODE_DOCUMENT:
|
55
|
+
break; // should never happen -- ignore
|
56
|
+
}
|
57
|
+
|
58
|
+
if (node) xmlAddChild(element, node);
|
59
|
+
}
|
60
|
+
|
61
|
+
return element;
|
62
|
+
}
|
63
|
+
|
64
|
+
// Parse a string using gumbo_parse into a Nokogiri document
|
65
|
+
static VALUE parse(VALUE self, VALUE string) {
|
66
|
+
GumboOutput *output = gumbo_parse_with_options(
|
67
|
+
&kGumboDefaultOptions, RSTRING_PTR(string), RSTRING_LEN(string)
|
68
|
+
);
|
69
|
+
xmlDocPtr doc = xmlNewDoc(BAD_CAST "1.0");
|
70
|
+
xmlNodePtr root = walk_tree(doc, &output->root->v.element);
|
71
|
+
xmlDocSetRootElement(doc, root);
|
72
|
+
if (output->document->v.document.has_doctype) {
|
73
|
+
const char *public = output->document->v.document.public_identifier;
|
74
|
+
const char *system = output->document->v.document.system_identifier;
|
75
|
+
xmlCreateIntSubset(doc, BAD_CAST "html",
|
76
|
+
(strlen(public) ? public : NULL),
|
77
|
+
(strlen(system) ? system : NULL));
|
78
|
+
}
|
79
|
+
gumbo_destroy_output(&kGumboDefaultOptions, output);
|
80
|
+
|
81
|
+
return Nokogiri_wrap_xml_document(Document, doc);
|
82
|
+
}
|
83
|
+
|
84
|
+
// Initialize the Nokogumbo class and fetch constants we will use later
|
85
|
+
void Init_nokogumboc() {
|
86
|
+
rb_funcall(rb_mKernel, rb_intern("gem"), 1, rb_str_new2("nokogiri"));
|
87
|
+
rb_require("nokogiri");
|
88
|
+
|
89
|
+
// class constants
|
90
|
+
VALUE Nokogiri = rb_const_get(rb_cObject, rb_intern("Nokogiri"));
|
91
|
+
VALUE HTML = rb_const_get(Nokogiri, rb_intern("HTML"));
|
92
|
+
Document = rb_const_get(HTML, rb_intern("Document"));
|
93
|
+
|
94
|
+
// define Nokogumbo class with a singleton parse method
|
95
|
+
VALUE Gumbo = rb_define_class("Nokogumbo", rb_cObject);
|
96
|
+
rb_define_singleton_method(Gumbo, "parse", parse, 1);
|
97
|
+
}
|
data/work/nokogumbo.o
ADDED
Binary file
|
data/work/nokogumboc.so
ADDED
Binary file
|