nokogumbo 1.1.14 → 1.2.0

Sign up to get free protection for your applications and to get access to all the features.
data/README.md CHANGED
@@ -16,6 +16,14 @@ require 'nokogumbo'
16
16
  doc = Nokogiri::HTML5(string)
17
17
  ```
18
18
 
19
+ An experimental _fragment_ method is also provided. While not HTML5
20
+ compliant, it may be useful:
21
+
22
+ ```ruby
23
+ require 'nokogumbo'
24
+ doc = Nokogiri::HTML5.fragment(string)
25
+ ```
26
+
19
27
  Because HTML is often fetched via the web, a convenience interface to
20
28
  HTTP get is also provided:
21
29
 
@@ -34,6 +42,10 @@ puts Nokogiri::HTML5.get('http://nokogiri.org').at('h1 abbr')['title']
34
42
  Notes
35
43
  -----
36
44
 
45
+ * The `Nokogiri::HTML5.fragment` function takes a string and parses it
46
+ as an HTML5 document. The `<html>`, `<head>`, and `<body>` elements are
47
+ removed from this document, and any children of these elements that remain
48
+ are returned as a `Nokogiri::HTML::DocumentFragment`.
37
49
  * The `Nokogiri::HTML5.parse` function takes a string and passes it to the
38
50
  <code>gumbo_parse_with_options</code> method, using the default options.
39
51
  The resulting Gumbo parse tree is then walked.
@@ -104,10 +104,49 @@ static xmlNodePtr walk_tree(xmlDocPtr document, GumboElement *node) {
104
104
 
105
105
  // add in the attributes
106
106
  GumboVector* attrs = &node->attributes;
107
+ char *name = NULL;
108
+ int namelen = 0;
109
+ char *ns;
107
110
  for (int i=0; i < attrs->length; i++) {
108
111
  GumboAttribute *attr = attrs->data[i];
109
- xmlNewProp(element, CONST_CAST attr->name, CONST_CAST attr->value);
112
+
113
+ switch (attr->attr_namespace) {
114
+ case GUMBO_ATTR_NAMESPACE_XLINK:
115
+ ns = "xlink:";
116
+ break;
117
+
118
+ case GUMBO_ATTR_NAMESPACE_XML:
119
+ ns = "xml:";
120
+ break;
121
+
122
+ case GUMBO_ATTR_NAMESPACE_XMLNS:
123
+ ns = "xmlns:";
124
+ if (!strcmp(attr->name, "xmlns")) ns = NULL;
125
+ break;
126
+
127
+ default:
128
+ ns = NULL;
129
+ }
130
+
131
+ if (ns) {
132
+ if (strlen(ns) + strlen(attr->name) + 1 > namelen) {
133
+ free(name);
134
+ name = NULL;
135
+ }
136
+
137
+ if (!name) {
138
+ namelen = strlen(ns) + strlen(attr->name) + 1;
139
+ name = malloc(namelen);
140
+ }
141
+
142
+ strcpy(name, ns);
143
+ strcat(name, attr->name);
144
+ xmlNewProp(element, CONST_CAST name, CONST_CAST attr->value);
145
+ } else {
146
+ xmlNewProp(element, CONST_CAST attr->name, CONST_CAST attr->value);
147
+ }
110
148
  }
149
+ if (name) free(name);
111
150
 
112
151
  // add in the children
113
152
  GumboVector* children = &node->children;
data/lib/nokogumbo.rb CHANGED
@@ -74,6 +74,45 @@ module Nokogiri
74
74
  end
75
75
  end
76
76
 
77
# Parse +string+ and return its content without the wrapping <html>,
# <head>, and <body> elements.
#
# While fragment parsing is on the Gumbo TODO list, simulate it by doing
# a full document parse, ignoring the parent <html>, <head>, and <body>
# tags, and collecting up the children of each.
#
# string:: the markup to parse (handed unchanged to +parse+).
#
# Returns a Nokogiri::HTML::DocumentFragment holding the collected nodes,
# or the parsed document itself when its only child is not an <html>
# element.
def self.fragment(string)
  doc = parse(string)
  root = doc.children.first

  # no HTML? Return document as is
  return doc unless doc.children.length == 1 && root.name == 'html'

  fragment = Nokogiri::HTML::DocumentFragment.new(doc)

  # examine children of HTML element
  children = root.children

  # head is always first. If present, take children but otherwise
  # ignore the head element.  The test must look at the first child of
  # <html> (not at <html> itself) and must compare with ==; assigning
  # to #name here would rename the <html> element and always be truthy.
  if children.length > 0 && children.first.name == 'head'
    fragment << children.shift.children
  end

  # body may be next, or last. If found, take children but otherwise
  # ignore the body element. Also take any remaining elements, taking
  # care to preserve order.
  if children.length > 0 && children.first.name == 'body'
    fragment << children.shift.children
    fragment << children
  elsif children.length > 0 && children.last.name == 'body'
    body = children.pop
    fragment << children
    fragment << body.children
  else
    fragment << children
  end

  # return result
  fragment
end
115
+
77
116
  private
78
117
 
79
118
  # Charset sniffing is a complex and controversial topic that understandably
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: nokogumbo
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.1.14
4
+ version: 1.2.0
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors: