nokogumbo 1.1.14 → 1.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/README.md CHANGED
@@ -16,6 +16,14 @@ require 'nokogumbo'
16
16
  doc = Nokogiri::HTML5(string)
17
17
  ```
18
18
 
19
+ An experimental _fragment_ method is also provided. While not HTML5
20
+ compliant, it may be useful:
21
+
22
+ ```ruby
23
+ require 'nokogumbo'
24
+ doc = Nokogiri::HTML5.fragment(string)
25
+ ```
26
+
19
27
  Because HTML is often fetched via the web, a convenience interface to
20
28
  HTTP get is also provided:
21
29
 
@@ -34,6 +42,10 @@ puts Nokogiri::HTML5.get('http://nokogiri.org').at('h1 abbr')['title']
34
42
  Notes
35
43
  -----
36
44
 
45
+ * The `Nokogiri::HTML5.fragment` function takes a string and parses it
46
+ as an HTML5 document. The `<html>`, `<head>`, and `<body>` elements are
47
+ removed from this document, and any children of these elements that remain
48
+ are returned as a `Nokogiri::HTML::DocumentFragment`.
37
49
  * The `Nokogiri::HTML5.parse` function takes a string and passes it to the
38
50
  <code>gumbo_parse_with_options</code> method, using the default options.
39
51
  The resulting Gumbo parse tree is then walked.
@@ -104,10 +104,49 @@ static xmlNodePtr walk_tree(xmlDocPtr document, GumboElement *node) {
104
104
 
105
105
  // add in the attributes
106
106
  GumboVector* attrs = &node->attributes;
107
+ char *name = NULL;
108
+ int namelen = 0;
109
+ char *ns;
107
110
  for (int i=0; i < attrs->length; i++) {
108
111
  GumboAttribute *attr = attrs->data[i];
109
- xmlNewProp(element, CONST_CAST attr->name, CONST_CAST attr->value);
112
+
113
+ switch (attr->attr_namespace) {
114
+ case GUMBO_ATTR_NAMESPACE_XLINK:
115
+ ns = "xlink:";
116
+ break;
117
+
118
+ case GUMBO_ATTR_NAMESPACE_XML:
119
+ ns = "xml:";
120
+ break;
121
+
122
+ case GUMBO_ATTR_NAMESPACE_XMLNS:
123
+ ns = "xmlns:";
124
+ if (!strcmp(attr->name, "xmlns")) ns = NULL;
125
+ break;
126
+
127
+ default:
128
+ ns = NULL;
129
+ }
130
+
131
+ if (ns) {
132
+ if (strlen(ns) + strlen(attr->name) + 1 > namelen) {
133
+ free(name);
134
+ name = NULL;
135
+ }
136
+
137
+ if (!name) {
138
+ namelen = strlen(ns) + strlen(attr->name) + 1;
139
+ name = malloc(namelen);
140
+ }
141
+
142
+ strcpy(name, ns);
143
+ strcat(name, attr->name);
144
+ xmlNewProp(element, CONST_CAST name, CONST_CAST attr->value);
145
+ } else {
146
+ xmlNewProp(element, CONST_CAST attr->name, CONST_CAST attr->value);
147
+ }
110
148
  }
149
+ if (name) free(name);
111
150
 
112
151
  // add in the children
113
152
  GumboVector* children = &node->children;
data/lib/nokogumbo.rb CHANGED
@@ -74,6 +74,45 @@ module Nokogiri
74
74
  end
75
75
  end
76
76
 
77
+ # while fragment is on the Gumbo TODO list, simulate it by doing
78
+ # a full document parse and ignoring the parent <html>, <head>, and <body>
79
+ # tags, and collecting up the children of each.
80
+ def self.fragment(string)
81
+ doc = parse(string)
82
+ fragment = Nokogiri::HTML::DocumentFragment.new(doc)
83
+
84
+ if doc.children.length != 1 or doc.children.first.name != 'html'
85
+ # no HTML? Return document as is
86
+ fragment = doc
87
+ else
88
+ # examine children of HTML element
89
+ children = doc.children.first.children
90
+
91
+ # head is always first. If present, take children but otherwise
92
+ # ignore the head element
93
+ if children.length > 0 and doc.children.first.name = 'head'
94
+ fragment << children.shift.children
95
+ end
96
+
97
+ # body may be next, or last. If found, take children but otherwise
98
+ # ignore the body element. Also take any remaining elements, taking
99
+ # care to preserve order.
100
+ if children.length > 0 and doc.children.first.name = 'body'
101
+ fragment << children.shift.children
102
+ fragment << children
103
+ elsif children.length > 0 and doc.children.last.name = 'body'
104
+ body = children.pop
105
+ fragment << children
106
+ fragment << body.children
107
+ else
108
+ fragment << children
109
+ end
110
+ end
111
+
112
+ # return result
113
+ fragment
114
+ end
115
+
77
116
  private
78
117
 
79
118
  # Charset sniffing is a complex and controversial topic that understandably
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: nokogumbo
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.1.14
4
+ version: 1.2.0
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors: