nokogumbo 0.5 → 0.5.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,54 @@
1
+ // Copyright 2011 Google Inc. All Rights Reserved.
2
+ //
3
+ // Licensed under the Apache License, Version 2.0 (the "License");
4
+ // you may not use this file except in compliance with the License.
5
+ // You may obtain a copy of the License at
6
+ //
7
+ // http://www.apache.org/licenses/LICENSE-2.0
8
+ //
9
+ // Unless required by applicable law or agreed to in writing, software
10
+ // distributed under the License is distributed on an "AS IS" BASIS,
11
+ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ // See the License for the specific language governing permissions and
13
+ // limitations under the License.
14
+ //
15
+ // Author: jdtang@google.com (Jonathan Tang)
16
+
17
+ #ifndef GUMBO_INSERTION_MODE_H_
18
+ #define GUMBO_INSERTION_MODE_H_
19
+
20
+ #ifdef __cplusplus
21
+ extern "C" {
22
+ #endif
23
+
24
+ // http://www.whatwg.org/specs/web-apps/current-work/complete/parsing.html#insertion-mode
25
// Insertion modes of the HTML tree-construction stage.  The constants are
// numbered sequentially from zero in declaration order, so their relative
// order must not be changed.
enum _GumboInsertionMode {
  GUMBO_INSERTION_MODE_INITIAL = 0,
  GUMBO_INSERTION_MODE_BEFORE_HTML,

  // <head> handling
  GUMBO_INSERTION_MODE_BEFORE_HEAD,
  GUMBO_INSERTION_MODE_IN_HEAD,
  GUMBO_INSERTION_MODE_IN_HEAD_NOSCRIPT,
  GUMBO_INSERTION_MODE_AFTER_HEAD,

  // <body> and raw text handling
  GUMBO_INSERTION_MODE_IN_BODY,
  GUMBO_INSERTION_MODE_TEXT,

  // table handling
  GUMBO_INSERTION_MODE_IN_TABLE,
  GUMBO_INSERTION_MODE_IN_TABLE_TEXT,
  GUMBO_INSERTION_MODE_IN_CAPTION,
  GUMBO_INSERTION_MODE_IN_COLUMN_GROUP,
  GUMBO_INSERTION_MODE_IN_TABLE_BODY,
  GUMBO_INSERTION_MODE_IN_ROW,
  GUMBO_INSERTION_MODE_IN_CELL,

  // <select> handling
  GUMBO_INSERTION_MODE_IN_SELECT,
  GUMBO_INSERTION_MODE_IN_SELECT_IN_TABLE,

  // end-of-document and frameset handling
  GUMBO_INSERTION_MODE_AFTER_BODY,
  GUMBO_INSERTION_MODE_IN_FRAMESET,
  GUMBO_INSERTION_MODE_AFTER_FRAMESET,
  GUMBO_INSERTION_MODE_AFTER_AFTER_BODY,
  GUMBO_INSERTION_MODE_AFTER_AFTER_FRAMESET
};

typedef enum _GumboInsertionMode GumboInsertionMode;
49
+
50
+ #ifdef __cplusplus
51
+ } // extern C
52
+ #endif
53
+
54
+ #endif // GUMBO_INSERTION_MODE_H_
data/work/mkmf.log ADDED
@@ -0,0 +1,41 @@
1
+ package configuration for libxml-2.0
2
+ cflags: -I/usr/include/libxml2
3
+ ldflags:
4
+ libs: -lxml2
5
+
6
+ find_header: checking for nokogiri.h in /var/lib/gems/1.9.1/gems/nokogiri-1.5.5/ext/nokogiri... -------------------- yes
7
+
8
+ "gcc -o conftest -I/usr/include/ruby-1.9.1/x86_64-linux -I/usr/include/ruby-1.9.1/ruby/backward -I/usr/include/ruby-1.9.1 -I. -D_FORTIFY_SOURCE=2 -D_FORTIFY_SOURCE=2 -std=c99 -I/usr/include/libxml2 conftest.c -L. -L/usr/lib -L. -Wl,-Bsymbolic-functions -Wl,-z,relro -rdynamic -Wl,-export-dynamic -lxml2 -lruby-1.9.1 -lpthread -lrt -ldl -lcrypt -lm -lc"
9
+ checked program was:
10
+ /* begin */
11
+ 1: #include "ruby.h"
12
+ 2:
13
+ 3: int main() {return 0;}
14
+ /* end */
15
+
16
+ "gcc -E -I/usr/include/ruby-1.9.1/x86_64-linux -I/usr/include/ruby-1.9.1/ruby/backward -I/usr/include/ruby-1.9.1 -I. -D_FORTIFY_SOURCE=2 -D_FORTIFY_SOURCE=2 -std=c99 -I/usr/include/libxml2 conftest.c -o conftest.i"
17
+ conftest.c:3:22: fatal error: nokogiri.h: No such file or directory
18
+ compilation terminated.
19
+ checked program was:
20
+ /* begin */
21
+ 1: #include "ruby.h"
22
+ 2:
23
+ 3: #include <nokogiri.h>
24
+ /* end */
25
+
26
+ "gcc -E -I/usr/include/ruby-1.9.1/x86_64-linux -I/usr/include/ruby-1.9.1/ruby/backward -I/usr/include/ruby-1.9.1 -I. -D_FORTIFY_SOURCE=2 -D_FORTIFY_SOURCE=2 -std=c99 -I/usr/include/libxml2 -I/var/lib/gems/1.9.1/gems/nokogiri-1.5.5/ext/nokogiri conftest.c -o conftest.i"
27
+ In file included from conftest.c:3:0:
28
+ /var/lib/gems/1.9.1/gems/nokogiri-1.5.5/ext/nokogiri/nokogiri.h:13:0: warning: "_GNU_SOURCE" redefined [enabled by default]
29
+ In file included from /usr/include/ruby-1.9.1/ruby/ruby.h:24:0,
30
+ from /usr/include/ruby-1.9.1/ruby.h:32,
31
+ from conftest.c:1:
32
+ /usr/include/ruby-1.9.1/x86_64-linux/ruby/config.h:17:0: note: this is the location of the previous definition
33
+ checked program was:
34
+ /* begin */
35
+ 1: #include "ruby.h"
36
+ 2:
37
+ 3: #include <nokogiri.h>
38
+ /* end */
39
+
40
+ --------------------
41
+
data/work/nokogumbo.c ADDED
@@ -0,0 +1,97 @@
1
+ #include <ruby.h>
2
+ #include <gumbo.h>
3
+ #include <nokogiri.h>
4
+ #include <libxml/tree.h>
5
+
6
+ // class constants
7
+ static VALUE Document;
8
+
9
+ // Build a Nokogiri Element for a given GumboElement (recursively)
10
+ static xmlNodePtr walk_tree(xmlDocPtr document, GumboElement *node) {
11
+ // determine tag name for a given node
12
+ xmlNodePtr element;
13
+ if (node->tag != GUMBO_TAG_UNKNOWN) {
14
+ element = xmlNewNode(NULL, BAD_CAST gumbo_normalized_tagname(node->tag));
15
+ } else {
16
+ GumboStringPiece tag = node->original_tag;
17
+ gumbo_tag_from_original_text(&tag);
18
+ char name[tag.length+1];
19
+ strncpy(name, tag.data, tag.length);
20
+ name[tag.length] = '\0';
21
+ element = xmlNewNode(NULL, BAD_CAST name);
22
+ }
23
+
24
+ // add in the attributes
25
+ GumboVector* attrs = &node->attributes;
26
+ for (int i=0; i < attrs->length; i++) {
27
+ GumboAttribute *attr = attrs->data[i];
28
+ xmlNewProp(element, BAD_CAST attr->name, BAD_CAST attr->value);
29
+ }
30
+
31
+ // add in the children
32
+ GumboVector* children = &node->children;
33
+ for (int i=0; i < children->length; i++) {
34
+ GumboNode* child = children->data[i];
35
+
36
+ xmlNodePtr node = NULL;
37
+
38
+ switch (child->type) {
39
+ case GUMBO_NODE_ELEMENT:
40
+ node = walk_tree(document, &child->v.element);
41
+ break;
42
+ case GUMBO_NODE_WHITESPACE:
43
+ case GUMBO_NODE_TEXT:
44
+ node = xmlNewText(BAD_CAST child->v.text.text);
45
+ break;
46
+ case GUMBO_NODE_CDATA:
47
+ node = xmlNewCDataBlock(document,
48
+ BAD_CAST child->v.text.original_text.data,
49
+ child->v.text.original_text.length);
50
+ break;
51
+ case GUMBO_NODE_COMMENT:
52
+ node = xmlNewComment(BAD_CAST child->v.text.text);
53
+ break;
54
+ case GUMBO_NODE_DOCUMENT:
55
+ break; // should never happen -- ignore
56
+ }
57
+
58
+ if (node) xmlAddChild(element, node);
59
+ }
60
+
61
+ return element;
62
+ }
63
+
64
+ // Parse a string using gumbo_parse into a Nokogiri document
65
+ static VALUE parse(VALUE self, VALUE string) {
66
+ GumboOutput *output = gumbo_parse_with_options(
67
+ &kGumboDefaultOptions, RSTRING_PTR(string), RSTRING_LEN(string)
68
+ );
69
+ xmlDocPtr doc = xmlNewDoc(BAD_CAST "1.0");
70
+ xmlNodePtr root = walk_tree(doc, &output->root->v.element);
71
+ xmlDocSetRootElement(doc, root);
72
+ if (output->document->v.document.has_doctype) {
73
+ const char *public = output->document->v.document.public_identifier;
74
+ const char *system = output->document->v.document.system_identifier;
75
+ xmlCreateIntSubset(doc, BAD_CAST "html",
76
+ (strlen(public) ? public : NULL),
77
+ (strlen(system) ? system : NULL));
78
+ }
79
+ gumbo_destroy_output(&kGumboDefaultOptions, output);
80
+
81
+ return Nokogiri_wrap_xml_document(Document, doc);
82
+ }
83
+
84
+ // Initialize the Nokogumbo class and fetch constants we will use later
85
+ void Init_nokogumboc() {
86
+ rb_funcall(rb_mKernel, rb_intern("gem"), 1, rb_str_new2("nokogiri"));
87
+ rb_require("nokogiri");
88
+
89
+ // class constants
90
+ VALUE Nokogiri = rb_const_get(rb_cObject, rb_intern("Nokogiri"));
91
+ VALUE HTML = rb_const_get(Nokogiri, rb_intern("HTML"));
92
+ Document = rb_const_get(HTML, rb_intern("Document"));
93
+
94
+ // define Nokogumbo class with a singleton parse method
95
+ VALUE Gumbo = rb_define_class("Nokogumbo", rb_cObject);
96
+ rb_define_singleton_method(Gumbo, "parse", parse, 1);
97
+ }
data/work/nokogumbo.o ADDED
Binary file
Binary file