nokogumbo 0.5 → 0.5.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,54 @@
1
+ // Copyright 2011 Google Inc. All Rights Reserved.
2
+ //
3
+ // Licensed under the Apache License, Version 2.0 (the "License");
4
+ // you may not use this file except in compliance with the License.
5
+ // You may obtain a copy of the License at
6
+ //
7
+ // http://www.apache.org/licenses/LICENSE-2.0
8
+ //
9
+ // Unless required by applicable law or agreed to in writing, software
10
+ // distributed under the License is distributed on an "AS IS" BASIS,
11
+ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ // See the License for the specific language governing permissions and
13
+ // limitations under the License.
14
+ //
15
+ // Author: jdtang@google.com (Jonathan Tang)
16
+
17
+ #ifndef GUMBO_INSERTION_MODE_H_
18
+ #define GUMBO_INSERTION_MODE_H_
19
+
20
+ #ifdef __cplusplus
21
+ extern "C" {
22
+ #endif
23
+
24
// Enumerates the parser insertion modes listed in the spec section at
// http://www.whatwg.org/specs/web-apps/current-work/complete/parsing.html#insertion-mode
//
// The numeric values are written out explicitly; they are exactly the values
// the compiler assigns implicitly when enumerators are listed in this order.
typedef enum _GumboInsertionMode {
  GUMBO_INSERTION_MODE_INITIAL = 0,
  GUMBO_INSERTION_MODE_BEFORE_HTML = 1,
  GUMBO_INSERTION_MODE_BEFORE_HEAD = 2,
  GUMBO_INSERTION_MODE_IN_HEAD = 3,
  GUMBO_INSERTION_MODE_IN_HEAD_NOSCRIPT = 4,
  GUMBO_INSERTION_MODE_AFTER_HEAD = 5,
  GUMBO_INSERTION_MODE_IN_BODY = 6,
  GUMBO_INSERTION_MODE_TEXT = 7,
  GUMBO_INSERTION_MODE_IN_TABLE = 8,
  GUMBO_INSERTION_MODE_IN_TABLE_TEXT = 9,
  GUMBO_INSERTION_MODE_IN_CAPTION = 10,
  GUMBO_INSERTION_MODE_IN_COLUMN_GROUP = 11,
  GUMBO_INSERTION_MODE_IN_TABLE_BODY = 12,
  GUMBO_INSERTION_MODE_IN_ROW = 13,
  GUMBO_INSERTION_MODE_IN_CELL = 14,
  GUMBO_INSERTION_MODE_IN_SELECT = 15,
  GUMBO_INSERTION_MODE_IN_SELECT_IN_TABLE = 16,
  GUMBO_INSERTION_MODE_AFTER_BODY = 17,
  GUMBO_INSERTION_MODE_IN_FRAMESET = 18,
  GUMBO_INSERTION_MODE_AFTER_FRAMESET = 19,
  GUMBO_INSERTION_MODE_AFTER_AFTER_BODY = 20,
  GUMBO_INSERTION_MODE_AFTER_AFTER_FRAMESET = 21
} GumboInsertionMode;
49
+
50
+ #ifdef __cplusplus
51
+ } // extern C
52
+ #endif
53
+
54
+ #endif // GUMBO_INSERTION_MODE_H_
data/work/mkmf.log ADDED
@@ -0,0 +1,41 @@
1
+ package configuration for libxml-2.0
2
+ cflags: -I/usr/include/libxml2
3
+ ldflags:
4
+ libs: -lxml2
5
+
6
+ find_header: checking for nokogiri.h in /var/lib/gems/1.9.1/gems/nokogiri-1.5.5/ext/nokogiri... -------------------- yes
7
+
8
+ "gcc -o conftest -I/usr/include/ruby-1.9.1/x86_64-linux -I/usr/include/ruby-1.9.1/ruby/backward -I/usr/include/ruby-1.9.1 -I. -D_FORTIFY_SOURCE=2 -D_FORTIFY_SOURCE=2 -std=c99 -I/usr/include/libxml2 conftest.c -L. -L/usr/lib -L. -Wl,-Bsymbolic-functions -Wl,-z,relro -rdynamic -Wl,-export-dynamic -lxml2 -lruby-1.9.1 -lpthread -lrt -ldl -lcrypt -lm -lc"
9
+ checked program was:
10
+ /* begin */
11
+ 1: #include "ruby.h"
12
+ 2:
13
+ 3: int main() {return 0;}
14
+ /* end */
15
+
16
+ "gcc -E -I/usr/include/ruby-1.9.1/x86_64-linux -I/usr/include/ruby-1.9.1/ruby/backward -I/usr/include/ruby-1.9.1 -I. -D_FORTIFY_SOURCE=2 -D_FORTIFY_SOURCE=2 -std=c99 -I/usr/include/libxml2 conftest.c -o conftest.i"
17
+ conftest.c:3:22: fatal error: nokogiri.h: No such file or directory
18
+ compilation terminated.
19
+ checked program was:
20
+ /* begin */
21
+ 1: #include "ruby.h"
22
+ 2:
23
+ 3: #include <nokogiri.h>
24
+ /* end */
25
+
26
+ "gcc -E -I/usr/include/ruby-1.9.1/x86_64-linux -I/usr/include/ruby-1.9.1/ruby/backward -I/usr/include/ruby-1.9.1 -I. -D_FORTIFY_SOURCE=2 -D_FORTIFY_SOURCE=2 -std=c99 -I/usr/include/libxml2 -I/var/lib/gems/1.9.1/gems/nokogiri-1.5.5/ext/nokogiri conftest.c -o conftest.i"
27
+ In file included from conftest.c:3:0:
28
+ /var/lib/gems/1.9.1/gems/nokogiri-1.5.5/ext/nokogiri/nokogiri.h:13:0: warning: "_GNU_SOURCE" redefined [enabled by default]
29
+ In file included from /usr/include/ruby-1.9.1/ruby/ruby.h:24:0,
30
+ from /usr/include/ruby-1.9.1/ruby.h:32,
31
+ from conftest.c:1:
32
+ /usr/include/ruby-1.9.1/x86_64-linux/ruby/config.h:17:0: note: this is the location of the previous definition
33
+ checked program was:
34
+ /* begin */
35
+ 1: #include "ruby.h"
36
+ 2:
37
+ 3: #include <nokogiri.h>
38
+ /* end */
39
+
40
+ --------------------
41
+
data/work/nokogumbo.c ADDED
@@ -0,0 +1,97 @@
1
+ #include <ruby.h>
2
+ #include <gumbo.h>
3
+ #include <nokogiri.h>
4
+ #include <libxml/tree.h>
5
+
6
+ // class constants
7
+ static VALUE Document;
8
+
9
+ // Build a Nokogiri Element for a given GumboElement (recursively)
10
+ static xmlNodePtr walk_tree(xmlDocPtr document, GumboElement *node) {
11
+ // determine tag name for a given node
12
+ xmlNodePtr element;
13
+ if (node->tag != GUMBO_TAG_UNKNOWN) {
14
+ element = xmlNewNode(NULL, BAD_CAST gumbo_normalized_tagname(node->tag));
15
+ } else {
16
+ GumboStringPiece tag = node->original_tag;
17
+ gumbo_tag_from_original_text(&tag);
18
+ char name[tag.length+1];
19
+ strncpy(name, tag.data, tag.length);
20
+ name[tag.length] = '\0';
21
+ element = xmlNewNode(NULL, BAD_CAST name);
22
+ }
23
+
24
+ // add in the attributes
25
+ GumboVector* attrs = &node->attributes;
26
+ for (int i=0; i < attrs->length; i++) {
27
+ GumboAttribute *attr = attrs->data[i];
28
+ xmlNewProp(element, BAD_CAST attr->name, BAD_CAST attr->value);
29
+ }
30
+
31
+ // add in the children
32
+ GumboVector* children = &node->children;
33
+ for (int i=0; i < children->length; i++) {
34
+ GumboNode* child = children->data[i];
35
+
36
+ xmlNodePtr node = NULL;
37
+
38
+ switch (child->type) {
39
+ case GUMBO_NODE_ELEMENT:
40
+ node = walk_tree(document, &child->v.element);
41
+ break;
42
+ case GUMBO_NODE_WHITESPACE:
43
+ case GUMBO_NODE_TEXT:
44
+ node = xmlNewText(BAD_CAST child->v.text.text);
45
+ break;
46
+ case GUMBO_NODE_CDATA:
47
+ node = xmlNewCDataBlock(document,
48
+ BAD_CAST child->v.text.original_text.data,
49
+ child->v.text.original_text.length);
50
+ break;
51
+ case GUMBO_NODE_COMMENT:
52
+ node = xmlNewComment(BAD_CAST child->v.text.text);
53
+ break;
54
+ case GUMBO_NODE_DOCUMENT:
55
+ break; // should never happen -- ignore
56
+ }
57
+
58
+ if (node) xmlAddChild(element, node);
59
+ }
60
+
61
+ return element;
62
+ }
63
+
64
+ // Parse a string using gumbo_parse into a Nokogiri document
65
+ static VALUE parse(VALUE self, VALUE string) {
66
+ GumboOutput *output = gumbo_parse_with_options(
67
+ &kGumboDefaultOptions, RSTRING_PTR(string), RSTRING_LEN(string)
68
+ );
69
+ xmlDocPtr doc = xmlNewDoc(BAD_CAST "1.0");
70
+ xmlNodePtr root = walk_tree(doc, &output->root->v.element);
71
+ xmlDocSetRootElement(doc, root);
72
+ if (output->document->v.document.has_doctype) {
73
+ const char *public = output->document->v.document.public_identifier;
74
+ const char *system = output->document->v.document.system_identifier;
75
+ xmlCreateIntSubset(doc, BAD_CAST "html",
76
+ (strlen(public) ? public : NULL),
77
+ (strlen(system) ? system : NULL));
78
+ }
79
+ gumbo_destroy_output(&kGumboDefaultOptions, output);
80
+
81
+ return Nokogiri_wrap_xml_document(Document, doc);
82
+ }
83
+
84
+ // Initialize the Nokogumbo class and fetch constants we will use later
85
+ void Init_nokogumboc() {
86
+ rb_funcall(rb_mKernel, rb_intern("gem"), 1, rb_str_new2("nokogiri"));
87
+ rb_require("nokogiri");
88
+
89
+ // class constants
90
+ VALUE Nokogiri = rb_const_get(rb_cObject, rb_intern("Nokogiri"));
91
+ VALUE HTML = rb_const_get(Nokogiri, rb_intern("HTML"));
92
+ Document = rb_const_get(HTML, rb_intern("Document"));
93
+
94
+ // define Nokogumbo class with a singleton parse method
95
+ VALUE Gumbo = rb_define_class("Nokogumbo", rb_cObject);
96
+ rb_define_singleton_method(Gumbo, "parse", parse, 1);
97
+ }
data/work/nokogumbo.o ADDED
Binary file
Binary file