nokogumbo 1.1.14 → 1.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.md +12 -0
- data/ext/nokogumboc/nokogumbo.c +40 -1
- data/lib/nokogumbo.rb +39 -0
- metadata +1 -1
data/README.md
CHANGED
@@ -16,6 +16,14 @@ require 'nokogumbo'
|
|
16
16
|
doc = Nokogiri::HTML5(string)
|
17
17
|
```
|
18
18
|
|
19
|
+
An experimental _fragment_ method is also provided. While not HTML5
|
20
|
+
compliant, it may be useful:
|
21
|
+
|
22
|
+
```ruby
|
23
|
+
require 'nokogumbo'
|
24
|
+
doc = Nokogiri::HTML5.fragment(string)
|
25
|
+
```
|
26
|
+
|
19
27
|
Because HTML is often fetched via the web, a convenience interface to
|
20
28
|
HTTP get is also provided:
|
21
29
|
|
@@ -34,6 +42,10 @@ puts Nokogiri::HTML5.get('http://nokogiri.org').at('h1 abbr')['title']
|
|
34
42
|
Notes
|
35
43
|
-----
|
36
44
|
|
45
|
+
* The `Nokogiri::HTML5.fragment` function takes a string and parses it
|
46
|
+
as an HTML5 document. The `<html>`, `<head>`, and `<body>` elements are
|
47
|
+
removed from this document, and any children of these elements that remain
|
48
|
+
are returned as a `Nokogiri::HTML::DocumentFragment`.
|
37
49
|
* The `Nokogiri::HTML5.parse` function takes a string and passes it to the
|
38
50
|
<code>gumbo_parse_with_options</code> method, using the default options.
|
39
51
|
The resulting Gumbo parse tree is then walked.
|
data/ext/nokogumboc/nokogumbo.c
CHANGED
@@ -104,10 +104,49 @@ static xmlNodePtr walk_tree(xmlDocPtr document, GumboElement *node) {
|
|
104
104
|
|
105
105
|
// add in the attributes
|
106
106
|
GumboVector* attrs = &node->attributes;
|
107
|
+
char *name = NULL;
|
108
|
+
int namelen = 0;
|
109
|
+
char *ns;
|
107
110
|
for (int i=0; i < attrs->length; i++) {
|
108
111
|
GumboAttribute *attr = attrs->data[i];
|
109
|
-
|
112
|
+
|
113
|
+
switch (attr->attr_namespace) {
|
114
|
+
case GUMBO_ATTR_NAMESPACE_XLINK:
|
115
|
+
ns = "xlink:";
|
116
|
+
break;
|
117
|
+
|
118
|
+
case GUMBO_ATTR_NAMESPACE_XML:
|
119
|
+
ns = "xml:";
|
120
|
+
break;
|
121
|
+
|
122
|
+
case GUMBO_ATTR_NAMESPACE_XMLNS:
|
123
|
+
ns = "xmlns:";
|
124
|
+
if (!strcmp(attr->name, "xmlns")) ns = NULL;
|
125
|
+
break;
|
126
|
+
|
127
|
+
default:
|
128
|
+
ns = NULL;
|
129
|
+
}
|
130
|
+
|
131
|
+
if (ns) {
|
132
|
+
if (strlen(ns) + strlen(attr->name) + 1 > namelen) {
|
133
|
+
free(name);
|
134
|
+
name = NULL;
|
135
|
+
}
|
136
|
+
|
137
|
+
if (!name) {
|
138
|
+
namelen = strlen(ns) + strlen(attr->name) + 1;
|
139
|
+
name = malloc(namelen);
|
140
|
+
}
|
141
|
+
|
142
|
+
strcpy(name, ns);
|
143
|
+
strcat(name, attr->name);
|
144
|
+
xmlNewProp(element, CONST_CAST name, CONST_CAST attr->value);
|
145
|
+
} else {
|
146
|
+
xmlNewProp(element, CONST_CAST attr->name, CONST_CAST attr->value);
|
147
|
+
}
|
110
148
|
}
|
149
|
+
if (name) free(name);
|
111
150
|
|
112
151
|
// add in the children
|
113
152
|
GumboVector* children = &node->children;
|
data/lib/nokogumbo.rb
CHANGED
@@ -74,6 +74,45 @@ module Nokogiri
|
|
74
74
|
end
|
75
75
|
end
|
76
76
|
|
77
|
+
# While fragment parsing is on the Gumbo TODO list, simulate it by doing
# a full document parse, discarding the wrapping <html>, <head>, and <body>
# elements, and collecting the children of each in document order.
#
# string - the HTML source text handed to +parse+.
#
# Returns a Nokogiri::HTML::DocumentFragment holding the collected nodes.
# If the parsed document does not consist of exactly one top-level <html>
# element, the parsed document itself is returned unchanged instead.
def self.fragment(string)
  doc = parse(string)
  fragment = Nokogiri::HTML::DocumentFragment.new(doc)

  top_level = doc.children
  if top_level.length != 1 || top_level.first.name != 'html'
    # no HTML? Return document as is
    return doc
  end

  # examine children of the HTML element
  children = top_level.first.children

  # head is always first. If present, take its children but otherwise
  # ignore the head element itself.
  # NOTE: the test must be a comparison (==) against the first *child* of
  # <html>; an assignment here would rename a node and always be truthy.
  if children.length > 0 && children.first.name == 'head'
    fragment << children.shift.children
  end

  # body may be next, or last. If found, take its children but otherwise
  # ignore the body element itself. Also take any remaining elements,
  # taking care to preserve order.
  if children.length > 0 && children.first.name == 'body'
    fragment << children.shift.children
    fragment << children
  elsif children.length > 0 && children.last.name == 'body'
    body = children.pop
    fragment << children
    fragment << body.children
  else
    fragment << children
  end

  # return result
  fragment
end
|
115
|
+
|
77
116
|
private
|
78
117
|
|
79
118
|
# Charset sniffing is a complex and controversial topic that understandably
|