nokogumbo 1.1.14 → 1.2.0
Sign up to get free protection for your applications and to get access to all the features.
- data/README.md +12 -0
- data/ext/nokogumboc/nokogumbo.c +40 -1
- data/lib/nokogumbo.rb +39 -0
- metadata +1 -1
data/README.md
CHANGED
@@ -16,6 +16,14 @@ require 'nokogumbo'
|
|
16
16
|
doc = Nokogiri::HTML5(string)
|
17
17
|
```
|
18
18
|
|
19
|
+
An experimental _fragment_ method is also provided. While not HTML5
|
20
|
+
compliant, it may be useful:
|
21
|
+
|
22
|
+
```ruby
|
23
|
+
require 'nokogumbo'
|
24
|
+
doc = Nokogiri::HTML5.fragment(string)
|
25
|
+
```
|
26
|
+
|
19
27
|
Because HTML is often fetched via the web, a convenience interface to
|
20
28
|
HTTP get is also provided:
|
21
29
|
|
@@ -34,6 +42,10 @@ puts Nokogiri::HTML5.get('http://nokogiri.org').at('h1 abbr')['title']
|
|
34
42
|
Notes
|
35
43
|
-----
|
36
44
|
|
45
|
+
* The `Nokogiri::HTML5.fragment` function takes a string and parses it
|
46
|
+
as an HTML5 document. The `<html>`, `<head>`, and `<body>` elements are
|
47
|
+
removed from this document, and any children of these elements that remain
|
48
|
+
are returned as a `Nokogiri::HTML::DocumentFragment`.
|
37
49
|
* The `Nokogiri::HTML5.parse` function takes a string and passes it to the
|
38
50
|
<code>gumbo_parse_with_options</code> method, using the default options.
|
39
51
|
The resulting Gumbo parse tree is then walked.
|
data/ext/nokogumboc/nokogumbo.c
CHANGED
@@ -104,10 +104,49 @@ static xmlNodePtr walk_tree(xmlDocPtr document, GumboElement *node) {
|
|
104
104
|
|
105
105
|
// add in the attributes
|
106
106
|
GumboVector* attrs = &node->attributes;
|
107
|
+
char *name = NULL;
|
108
|
+
int namelen = 0;
|
109
|
+
char *ns;
|
107
110
|
for (int i=0; i < attrs->length; i++) {
|
108
111
|
GumboAttribute *attr = attrs->data[i];
|
109
|
-
|
112
|
+
|
113
|
+
switch (attr->attr_namespace) {
|
114
|
+
case GUMBO_ATTR_NAMESPACE_XLINK:
|
115
|
+
ns = "xlink:";
|
116
|
+
break;
|
117
|
+
|
118
|
+
case GUMBO_ATTR_NAMESPACE_XML:
|
119
|
+
ns = "xml:";
|
120
|
+
break;
|
121
|
+
|
122
|
+
case GUMBO_ATTR_NAMESPACE_XMLNS:
|
123
|
+
ns = "xmlns:";
|
124
|
+
if (!strcmp(attr->name, "xmlns")) ns = NULL;
|
125
|
+
break;
|
126
|
+
|
127
|
+
default:
|
128
|
+
ns = NULL;
|
129
|
+
}
|
130
|
+
|
131
|
+
if (ns) {
|
132
|
+
if (strlen(ns) + strlen(attr->name) + 1 > namelen) {
|
133
|
+
free(name);
|
134
|
+
name = NULL;
|
135
|
+
}
|
136
|
+
|
137
|
+
if (!name) {
|
138
|
+
namelen = strlen(ns) + strlen(attr->name) + 1;
|
139
|
+
name = malloc(namelen);
|
140
|
+
}
|
141
|
+
|
142
|
+
strcpy(name, ns);
|
143
|
+
strcat(name, attr->name);
|
144
|
+
xmlNewProp(element, CONST_CAST name, CONST_CAST attr->value);
|
145
|
+
} else {
|
146
|
+
xmlNewProp(element, CONST_CAST attr->name, CONST_CAST attr->value);
|
147
|
+
}
|
110
148
|
}
|
149
|
+
if (name) free(name);
|
111
150
|
|
112
151
|
// add in the children
|
113
152
|
GumboVector* children = &node->children;
|
data/lib/nokogumbo.rb
CHANGED
@@ -74,6 +74,45 @@ module Nokogiri
|
|
74
74
|
end
|
75
75
|
end
|
76
76
|
|
77
|
+
# While fragment parsing is on the Gumbo TODO list, simulate it by doing
# a full document parse, ignoring the parent <html>, <head>, and <body>
# tags, and collecting up the children of each.
#
# string:: the HTML source, handed unchanged to +parse+.
#
# Returns a Nokogiri::HTML::DocumentFragment containing the collected
# nodes. If the parsed document does not have exactly one top-level child
# named 'html', the parsed document itself is returned instead.
def self.fragment(string)
  doc = parse(string)
  fragment = Nokogiri::HTML::DocumentFragment.new(doc)

  if doc.children.length != 1 or doc.children.first.name != 'html'
    # no HTML? Return document as is
    fragment = doc
  else
    # examine children of HTML element
    children = doc.children.first.children

    # head is always first.  If present, take children but otherwise
    # ignore the head element.  The name is compared (==) on the first
    # child of <html>, so the <html> element itself is never modified.
    if children.length > 0 and children.first.name == 'head'
      fragment << children.shift.children
    end

    # body may be next, or last.  If found, take children but otherwise
    # ignore the body element.  Also take any remaining elements, taking
    # care to preserve order.
    if children.length > 0 and children.first.name == 'body'
      fragment << children.shift.children
      fragment << children
    elsif children.length > 0 and children.last.name == 'body'
      body = children.pop
      fragment << children
      fragment << body.children
    else
      fragment << children
    end
  end

  # return result
  fragment
end
|
115
|
+
|
77
116
|
private
|
78
117
|
|
79
118
|
# Charset sniffing is a complex and controversial topic that understandably
|