nokogiri 1.18.0.rc1-x86_64-linux-musl

Sign up to get free protection for your applications and to get access to all the features.
Files changed (203) hide show
  1. checksums.yaml +7 -0
  2. data/Gemfile +38 -0
  3. data/LICENSE-DEPENDENCIES.md +2224 -0
  4. data/LICENSE.md +9 -0
  5. data/README.md +293 -0
  6. data/bin/nokogiri +131 -0
  7. data/dependencies.yml +42 -0
  8. data/ext/nokogiri/depend +38 -0
  9. data/ext/nokogiri/extconf.rb +1173 -0
  10. data/ext/nokogiri/gumbo.c +610 -0
  11. data/ext/nokogiri/html4_document.c +171 -0
  12. data/ext/nokogiri/html4_element_description.c +299 -0
  13. data/ext/nokogiri/html4_entity_lookup.c +37 -0
  14. data/ext/nokogiri/html4_sax_parser.c +40 -0
  15. data/ext/nokogiri/html4_sax_parser_context.c +98 -0
  16. data/ext/nokogiri/html4_sax_push_parser.c +96 -0
  17. data/ext/nokogiri/include/libexslt/exslt.h +108 -0
  18. data/ext/nokogiri/include/libexslt/exsltconfig.h +70 -0
  19. data/ext/nokogiri/include/libexslt/exsltexports.h +63 -0
  20. data/ext/nokogiri/include/libxml2/libxml/HTMLparser.h +336 -0
  21. data/ext/nokogiri/include/libxml2/libxml/HTMLtree.h +147 -0
  22. data/ext/nokogiri/include/libxml2/libxml/SAX.h +202 -0
  23. data/ext/nokogiri/include/libxml2/libxml/SAX2.h +171 -0
  24. data/ext/nokogiri/include/libxml2/libxml/c14n.h +115 -0
  25. data/ext/nokogiri/include/libxml2/libxml/catalog.h +182 -0
  26. data/ext/nokogiri/include/libxml2/libxml/chvalid.h +230 -0
  27. data/ext/nokogiri/include/libxml2/libxml/debugXML.h +217 -0
  28. data/ext/nokogiri/include/libxml2/libxml/dict.h +82 -0
  29. data/ext/nokogiri/include/libxml2/libxml/encoding.h +244 -0
  30. data/ext/nokogiri/include/libxml2/libxml/entities.h +166 -0
  31. data/ext/nokogiri/include/libxml2/libxml/globals.h +41 -0
  32. data/ext/nokogiri/include/libxml2/libxml/hash.h +251 -0
  33. data/ext/nokogiri/include/libxml2/libxml/list.h +137 -0
  34. data/ext/nokogiri/include/libxml2/libxml/nanoftp.h +186 -0
  35. data/ext/nokogiri/include/libxml2/libxml/nanohttp.h +98 -0
  36. data/ext/nokogiri/include/libxml2/libxml/parser.h +1390 -0
  37. data/ext/nokogiri/include/libxml2/libxml/parserInternals.h +671 -0
  38. data/ext/nokogiri/include/libxml2/libxml/pattern.h +106 -0
  39. data/ext/nokogiri/include/libxml2/libxml/relaxng.h +219 -0
  40. data/ext/nokogiri/include/libxml2/libxml/schemasInternals.h +959 -0
  41. data/ext/nokogiri/include/libxml2/libxml/schematron.h +143 -0
  42. data/ext/nokogiri/include/libxml2/libxml/threads.h +87 -0
  43. data/ext/nokogiri/include/libxml2/libxml/tree.h +1382 -0
  44. data/ext/nokogiri/include/libxml2/libxml/uri.h +106 -0
  45. data/ext/nokogiri/include/libxml2/libxml/valid.h +477 -0
  46. data/ext/nokogiri/include/libxml2/libxml/xinclude.h +136 -0
  47. data/ext/nokogiri/include/libxml2/libxml/xlink.h +189 -0
  48. data/ext/nokogiri/include/libxml2/libxml/xmlIO.h +438 -0
  49. data/ext/nokogiri/include/libxml2/libxml/xmlautomata.h +146 -0
  50. data/ext/nokogiri/include/libxml2/libxml/xmlerror.h +962 -0
  51. data/ext/nokogiri/include/libxml2/libxml/xmlexports.h +146 -0
  52. data/ext/nokogiri/include/libxml2/libxml/xmlmemory.h +188 -0
  53. data/ext/nokogiri/include/libxml2/libxml/xmlmodule.h +57 -0
  54. data/ext/nokogiri/include/libxml2/libxml/xmlreader.h +436 -0
  55. data/ext/nokogiri/include/libxml2/libxml/xmlregexp.h +215 -0
  56. data/ext/nokogiri/include/libxml2/libxml/xmlsave.h +102 -0
  57. data/ext/nokogiri/include/libxml2/libxml/xmlschemas.h +249 -0
  58. data/ext/nokogiri/include/libxml2/libxml/xmlschemastypes.h +152 -0
  59. data/ext/nokogiri/include/libxml2/libxml/xmlstring.h +140 -0
  60. data/ext/nokogiri/include/libxml2/libxml/xmlunicode.h +366 -0
  61. data/ext/nokogiri/include/libxml2/libxml/xmlversion.h +347 -0
  62. data/ext/nokogiri/include/libxml2/libxml/xmlwriter.h +489 -0
  63. data/ext/nokogiri/include/libxml2/libxml/xpath.h +579 -0
  64. data/ext/nokogiri/include/libxml2/libxml/xpathInternals.h +633 -0
  65. data/ext/nokogiri/include/libxml2/libxml/xpointer.h +138 -0
  66. data/ext/nokogiri/include/libxslt/attributes.h +39 -0
  67. data/ext/nokogiri/include/libxslt/documents.h +93 -0
  68. data/ext/nokogiri/include/libxslt/extensions.h +262 -0
  69. data/ext/nokogiri/include/libxslt/extra.h +72 -0
  70. data/ext/nokogiri/include/libxslt/functions.h +78 -0
  71. data/ext/nokogiri/include/libxslt/imports.h +75 -0
  72. data/ext/nokogiri/include/libxslt/keys.h +53 -0
  73. data/ext/nokogiri/include/libxslt/namespaces.h +68 -0
  74. data/ext/nokogiri/include/libxslt/numbersInternals.h +73 -0
  75. data/ext/nokogiri/include/libxslt/pattern.h +84 -0
  76. data/ext/nokogiri/include/libxslt/preproc.h +43 -0
  77. data/ext/nokogiri/include/libxslt/security.h +104 -0
  78. data/ext/nokogiri/include/libxslt/templates.h +77 -0
  79. data/ext/nokogiri/include/libxslt/transform.h +207 -0
  80. data/ext/nokogiri/include/libxslt/variables.h +118 -0
  81. data/ext/nokogiri/include/libxslt/xslt.h +110 -0
  82. data/ext/nokogiri/include/libxslt/xsltInternals.h +1995 -0
  83. data/ext/nokogiri/include/libxslt/xsltconfig.h +146 -0
  84. data/ext/nokogiri/include/libxslt/xsltexports.h +64 -0
  85. data/ext/nokogiri/include/libxslt/xsltlocale.h +44 -0
  86. data/ext/nokogiri/include/libxslt/xsltutils.h +343 -0
  87. data/ext/nokogiri/libxml2_polyfill.c +114 -0
  88. data/ext/nokogiri/nokogiri.c +294 -0
  89. data/ext/nokogiri/nokogiri.h +238 -0
  90. data/ext/nokogiri/test_global_handlers.c +40 -0
  91. data/ext/nokogiri/xml_attr.c +103 -0
  92. data/ext/nokogiri/xml_attribute_decl.c +70 -0
  93. data/ext/nokogiri/xml_cdata.c +62 -0
  94. data/ext/nokogiri/xml_comment.c +57 -0
  95. data/ext/nokogiri/xml_document.c +784 -0
  96. data/ext/nokogiri/xml_document_fragment.c +29 -0
  97. data/ext/nokogiri/xml_dtd.c +208 -0
  98. data/ext/nokogiri/xml_element_content.c +131 -0
  99. data/ext/nokogiri/xml_element_decl.c +69 -0
  100. data/ext/nokogiri/xml_encoding_handler.c +112 -0
  101. data/ext/nokogiri/xml_entity_decl.c +112 -0
  102. data/ext/nokogiri/xml_entity_reference.c +50 -0
  103. data/ext/nokogiri/xml_namespace.c +181 -0
  104. data/ext/nokogiri/xml_node.c +2459 -0
  105. data/ext/nokogiri/xml_node_set.c +518 -0
  106. data/ext/nokogiri/xml_processing_instruction.c +54 -0
  107. data/ext/nokogiri/xml_reader.c +777 -0
  108. data/ext/nokogiri/xml_relax_ng.c +149 -0
  109. data/ext/nokogiri/xml_sax_parser.c +403 -0
  110. data/ext/nokogiri/xml_sax_parser_context.c +390 -0
  111. data/ext/nokogiri/xml_sax_push_parser.c +206 -0
  112. data/ext/nokogiri/xml_schema.c +226 -0
  113. data/ext/nokogiri/xml_syntax_error.c +93 -0
  114. data/ext/nokogiri/xml_text.c +59 -0
  115. data/ext/nokogiri/xml_xpath_context.c +502 -0
  116. data/ext/nokogiri/xslt_stylesheet.c +421 -0
  117. data/gumbo-parser/CHANGES.md +63 -0
  118. data/gumbo-parser/Makefile +129 -0
  119. data/gumbo-parser/THANKS +27 -0
  120. data/lib/nokogiri/3.1/nokogiri.so +0 -0
  121. data/lib/nokogiri/3.2/nokogiri.so +0 -0
  122. data/lib/nokogiri/3.3/nokogiri.so +0 -0
  123. data/lib/nokogiri/3.4/nokogiri.so +0 -0
  124. data/lib/nokogiri/class_resolver.rb +67 -0
  125. data/lib/nokogiri/css/node.rb +58 -0
  126. data/lib/nokogiri/css/parser.rb +772 -0
  127. data/lib/nokogiri/css/parser.y +277 -0
  128. data/lib/nokogiri/css/parser_extras.rb +36 -0
  129. data/lib/nokogiri/css/selector_cache.rb +38 -0
  130. data/lib/nokogiri/css/syntax_error.rb +9 -0
  131. data/lib/nokogiri/css/tokenizer.rb +155 -0
  132. data/lib/nokogiri/css/tokenizer.rex +57 -0
  133. data/lib/nokogiri/css/xpath_visitor.rb +375 -0
  134. data/lib/nokogiri/css.rb +132 -0
  135. data/lib/nokogiri/decorators/slop.rb +42 -0
  136. data/lib/nokogiri/encoding_handler.rb +57 -0
  137. data/lib/nokogiri/extension.rb +32 -0
  138. data/lib/nokogiri/gumbo.rb +15 -0
  139. data/lib/nokogiri/html.rb +48 -0
  140. data/lib/nokogiri/html4/builder.rb +37 -0
  141. data/lib/nokogiri/html4/document.rb +235 -0
  142. data/lib/nokogiri/html4/document_fragment.rb +166 -0
  143. data/lib/nokogiri/html4/element_description.rb +25 -0
  144. data/lib/nokogiri/html4/element_description_defaults.rb +2040 -0
  145. data/lib/nokogiri/html4/encoding_reader.rb +121 -0
  146. data/lib/nokogiri/html4/entity_lookup.rb +15 -0
  147. data/lib/nokogiri/html4/sax/parser.rb +48 -0
  148. data/lib/nokogiri/html4/sax/parser_context.rb +15 -0
  149. data/lib/nokogiri/html4/sax/push_parser.rb +37 -0
  150. data/lib/nokogiri/html4.rb +42 -0
  151. data/lib/nokogiri/html5/builder.rb +40 -0
  152. data/lib/nokogiri/html5/document.rb +199 -0
  153. data/lib/nokogiri/html5/document_fragment.rb +200 -0
  154. data/lib/nokogiri/html5/node.rb +103 -0
  155. data/lib/nokogiri/html5.rb +368 -0
  156. data/lib/nokogiri/jruby/dependencies.rb +3 -0
  157. data/lib/nokogiri/jruby/nokogiri_jars.rb +43 -0
  158. data/lib/nokogiri/syntax_error.rb +6 -0
  159. data/lib/nokogiri/version/constant.rb +6 -0
  160. data/lib/nokogiri/version/info.rb +224 -0
  161. data/lib/nokogiri/version.rb +4 -0
  162. data/lib/nokogiri/xml/attr.rb +66 -0
  163. data/lib/nokogiri/xml/attribute_decl.rb +22 -0
  164. data/lib/nokogiri/xml/builder.rb +494 -0
  165. data/lib/nokogiri/xml/cdata.rb +13 -0
  166. data/lib/nokogiri/xml/character_data.rb +9 -0
  167. data/lib/nokogiri/xml/document.rb +514 -0
  168. data/lib/nokogiri/xml/document_fragment.rb +276 -0
  169. data/lib/nokogiri/xml/dtd.rb +34 -0
  170. data/lib/nokogiri/xml/element_content.rb +46 -0
  171. data/lib/nokogiri/xml/element_decl.rb +17 -0
  172. data/lib/nokogiri/xml/entity_decl.rb +23 -0
  173. data/lib/nokogiri/xml/entity_reference.rb +20 -0
  174. data/lib/nokogiri/xml/namespace.rb +57 -0
  175. data/lib/nokogiri/xml/node/save_options.rb +76 -0
  176. data/lib/nokogiri/xml/node.rb +1650 -0
  177. data/lib/nokogiri/xml/node_set.rb +449 -0
  178. data/lib/nokogiri/xml/notation.rb +19 -0
  179. data/lib/nokogiri/xml/parse_options.rb +213 -0
  180. data/lib/nokogiri/xml/pp/character_data.rb +21 -0
  181. data/lib/nokogiri/xml/pp/node.rb +73 -0
  182. data/lib/nokogiri/xml/pp.rb +4 -0
  183. data/lib/nokogiri/xml/processing_instruction.rb +11 -0
  184. data/lib/nokogiri/xml/reader.rb +139 -0
  185. data/lib/nokogiri/xml/relax_ng.rb +75 -0
  186. data/lib/nokogiri/xml/sax/document.rb +258 -0
  187. data/lib/nokogiri/xml/sax/parser.rb +199 -0
  188. data/lib/nokogiri/xml/sax/parser_context.rb +129 -0
  189. data/lib/nokogiri/xml/sax/push_parser.rb +64 -0
  190. data/lib/nokogiri/xml/sax.rb +54 -0
  191. data/lib/nokogiri/xml/schema.rb +140 -0
  192. data/lib/nokogiri/xml/searchable.rb +297 -0
  193. data/lib/nokogiri/xml/syntax_error.rb +94 -0
  194. data/lib/nokogiri/xml/text.rb +11 -0
  195. data/lib/nokogiri/xml/xpath/syntax_error.rb +13 -0
  196. data/lib/nokogiri/xml/xpath.rb +21 -0
  197. data/lib/nokogiri/xml/xpath_context.rb +49 -0
  198. data/lib/nokogiri/xml.rb +65 -0
  199. data/lib/nokogiri/xslt/stylesheet.rb +49 -0
  200. data/lib/nokogiri/xslt.rb +129 -0
  201. data/lib/nokogiri.rb +128 -0
  202. data/lib/xsd/xmlparser/nokogiri.rb +105 -0
  203. metadata +324 -0
@@ -0,0 +1,610 @@
1
+ //
2
+ // Copyright 2013-2021 Sam Ruby, Stephen Checkoway
3
+ //
4
+ // Licensed under the Apache License, Version 2.0 (the "License");
5
+ // you may not use this file except in compliance with the License.
6
+ // You may obtain a copy of the License at
7
+ //
8
+ // http://www.apache.org/licenses/LICENSE-2.0
9
+ //
10
+ // Unless required by applicable law or agreed to in writing, software
11
+ // distributed under the License is distributed on an "AS IS" BASIS,
12
+ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ // See the License for the specific language governing permissions and
14
+ // limitations under the License.
15
+ //
16
+
17
+ //
18
+ // nokogumbo.c defines the following:
19
+ //
20
+ // class Nokogumbo
21
+ // def parse(utf8_string) # returns Nokogiri::HTML5::Document
22
+ // end
23
+ //
24
+ // Processing starts by calling gumbo_parse_with_options. The resulting document tree
25
+ // is then walked, a parallel libxml2 tree is constructed, and the final document is
26
+ // then wrapped using noko_xml_document_wrap. This approach reduces memory and CPU
27
+ // requirements as Ruby objects are only built when necessary.
28
+ //
29
+
30
+ #include <nokogiri.h>
31
+
32
+ #include "nokogiri_gumbo.h"
33
+
34
+ VALUE cNokogiriHtml5Document;
35
+
36
+ // Interned symbols
37
+ static ID internal_subset;
38
+ static ID parent;
39
+
40
+ #include <nokogiri.h>
41
+ #include <libxml/tree.h>
42
+ #include <libxml/HTMLtree.h>
43
+
44
+ // URI = system id
45
+ // external id = public id
46
+ static xmlDocPtr
47
+ new_html_doc(const char *dtd_name, const char *system, const char *public)
48
+ {
49
+ // These two libxml2 functions take the public and system ids in
50
+ // opposite orders.
51
+ htmlDocPtr doc = htmlNewDocNoDtD(/* URI */ NULL, /* ExternalID */NULL);
52
+ assert(doc);
53
+ if (dtd_name) {
54
+ xmlCreateIntSubset(doc, (const xmlChar *)dtd_name, (const xmlChar *)public, (const xmlChar *)system);
55
+ }
56
+ return doc;
57
+ }
58
+
59
+ static xmlNodePtr
60
+ get_parent(xmlNodePtr node)
61
+ {
62
+ return node->parent;
63
+ }
64
+
65
+ static GumboOutput *
66
+ perform_parse(const GumboOptions *options, VALUE input)
67
+ {
68
+ assert(RTEST(input));
69
+ Check_Type(input, T_STRING);
70
+ GumboOutput *output = gumbo_parse_with_options(
71
+ options,
72
+ RSTRING_PTR(input),
73
+ (size_t)RSTRING_LEN(input)
74
+ );
75
+
76
+ const char *status_string = gumbo_status_to_string(output->status);
77
+ switch (output->status) {
78
+ case GUMBO_STATUS_OK:
79
+ break;
80
+ case GUMBO_STATUS_TOO_MANY_ATTRIBUTES:
81
+ case GUMBO_STATUS_TREE_TOO_DEEP:
82
+ gumbo_destroy_output(output);
83
+ rb_raise(rb_eArgError, "%s", status_string);
84
+ case GUMBO_STATUS_OUT_OF_MEMORY:
85
+ gumbo_destroy_output(output);
86
+ rb_raise(rb_eNoMemError, "%s", status_string);
87
+ }
88
+ return output;
89
+ }
90
+
91
+ static xmlNsPtr
92
+ lookup_or_add_ns(
93
+ xmlDocPtr doc,
94
+ xmlNodePtr root,
95
+ const char *href,
96
+ const char *prefix
97
+ )
98
+ {
99
+ xmlNsPtr ns = xmlSearchNs(doc, root, (const xmlChar *)prefix);
100
+ if (ns) {
101
+ return ns;
102
+ }
103
+ return xmlNewNs(root, (const xmlChar *)href, (const xmlChar *)prefix);
104
+ }
105
+
106
+ static void
107
+ set_line(xmlNodePtr node, size_t line)
108
+ {
109
+ // libxml2 uses 65535 to mean look elsewhere for the line number on some
110
+ // nodes.
111
+ if (line < 65535) {
112
+ node->line = (unsigned short)line;
113
+ }
114
+ }
115
+
116
+ // Construct an XML tree rooted at xml_output_node from the Gumbo tree rooted
117
+ // at gumbo_node.
118
+ static void
119
+ build_tree(
120
+ xmlDocPtr doc,
121
+ xmlNodePtr xml_output_node,
122
+ const GumboNode *gumbo_node
123
+ )
124
+ {
125
+ xmlNodePtr xml_root = NULL;
126
+ xmlNodePtr xml_node = xml_output_node;
127
+ size_t child_index = 0;
128
+
129
+ while (true) {
130
+ assert(gumbo_node != NULL);
131
+ const GumboVector *children = gumbo_node->type == GUMBO_NODE_DOCUMENT ?
132
+ &gumbo_node->v.document.children : &gumbo_node->v.element.children;
133
+ if (child_index >= children->length) {
134
+ // Move up the tree and to the next child.
135
+ if (xml_node == xml_output_node) {
136
+ // We've built as much of the tree as we can.
137
+ return;
138
+ }
139
+ child_index = gumbo_node->index_within_parent + 1;
140
+ gumbo_node = gumbo_node->parent;
141
+ xml_node = get_parent(xml_node);
142
+ // Children of fragments don't share the same root, so reset it and
143
+ // it'll be set below. In the non-fragment case, this will only happen
144
+ // after the html element has been finished at which point there are no
145
+ // further elements.
146
+ if (xml_node == xml_output_node) {
147
+ xml_root = NULL;
148
+ }
149
+ continue;
150
+ }
151
+ const GumboNode *gumbo_child = children->data[child_index++];
152
+ xmlNodePtr xml_child;
153
+
154
+ switch (gumbo_child->type) {
155
+ case GUMBO_NODE_DOCUMENT:
156
+ abort(); // Bug in Gumbo.
157
+
158
+ case GUMBO_NODE_TEXT:
159
+ case GUMBO_NODE_WHITESPACE:
160
+ xml_child = xmlNewDocText(doc, (const xmlChar *)gumbo_child->v.text.text);
161
+ set_line(xml_child, gumbo_child->v.text.start_pos.line);
162
+ xmlAddChild(xml_node, xml_child);
163
+ break;
164
+
165
+ case GUMBO_NODE_CDATA:
166
+ xml_child = xmlNewCDataBlock(doc, (const xmlChar *)gumbo_child->v.text.text,
167
+ (int) strlen(gumbo_child->v.text.text));
168
+ set_line(xml_child, gumbo_child->v.text.start_pos.line);
169
+ xmlAddChild(xml_node, xml_child);
170
+ break;
171
+
172
+ case GUMBO_NODE_COMMENT:
173
+ xml_child = xmlNewDocComment(doc, (const xmlChar *)gumbo_child->v.text.text);
174
+ set_line(xml_child, gumbo_child->v.text.start_pos.line);
175
+ xmlAddChild(xml_node, xml_child);
176
+ break;
177
+
178
+ case GUMBO_NODE_TEMPLATE:
179
+ // XXX: Should create a template element and a new DocumentFragment
180
+ case GUMBO_NODE_ELEMENT: {
181
+ xml_child = xmlNewDocNode(doc, NULL, (const xmlChar *)gumbo_child->v.element.name, NULL);
182
+ set_line(xml_child, gumbo_child->v.element.start_pos.line);
183
+ if (xml_root == NULL) {
184
+ xml_root = xml_child;
185
+ }
186
+ xmlNsPtr ns = NULL;
187
+ switch (gumbo_child->v.element.tag_namespace) {
188
+ case GUMBO_NAMESPACE_HTML:
189
+ break;
190
+ case GUMBO_NAMESPACE_SVG:
191
+ ns = lookup_or_add_ns(doc, xml_root, "http://www.w3.org/2000/svg", "svg");
192
+ break;
193
+ case GUMBO_NAMESPACE_MATHML:
194
+ ns = lookup_or_add_ns(doc, xml_root, "http://www.w3.org/1998/Math/MathML", "math");
195
+ break;
196
+ }
197
+ if (ns != NULL) {
198
+ xmlSetNs(xml_child, ns);
199
+ }
200
+ xmlAddChild(xml_node, xml_child);
201
+
202
+ // Add the attributes.
203
+ const GumboVector *attrs = &gumbo_child->v.element.attributes;
204
+ for (size_t i = 0; i < attrs->length; i++) {
205
+ const GumboAttribute *attr = attrs->data[i];
206
+
207
+ switch (attr->attr_namespace) {
208
+ case GUMBO_ATTR_NAMESPACE_XLINK:
209
+ ns = lookup_or_add_ns(doc, xml_root, "http://www.w3.org/1999/xlink", "xlink");
210
+ break;
211
+
212
+ case GUMBO_ATTR_NAMESPACE_XML:
213
+ ns = lookup_or_add_ns(doc, xml_root, "http://www.w3.org/XML/1998/namespace", "xml");
214
+ break;
215
+
216
+ case GUMBO_ATTR_NAMESPACE_XMLNS:
217
+ ns = lookup_or_add_ns(doc, xml_root, "http://www.w3.org/2000/xmlns/", "xmlns");
218
+ break;
219
+
220
+ default:
221
+ ns = NULL;
222
+ }
223
+ xmlNewNsProp(xml_child, ns, (const xmlChar *)attr->name, (const xmlChar *)attr->value);
224
+ }
225
+
226
+ // Add children for this element.
227
+ child_index = 0;
228
+ gumbo_node = gumbo_child;
229
+ xml_node = xml_child;
230
+ }
231
+ }
232
+ }
233
+ }
234
+
235
+ static void
236
+ add_errors(const GumboOutput *output, VALUE rdoc, VALUE input, VALUE url)
237
+ {
238
+ const char *input_str = RSTRING_PTR(input);
239
+ size_t input_len = (size_t)RSTRING_LEN(input);
240
+
241
+ // Add parse errors to rdoc.
242
+ if (output->errors.length) {
243
+ const GumboVector *errors = &output->errors;
244
+ VALUE rerrors = rb_ary_new2(errors->length);
245
+
246
+ for (size_t i = 0; i < errors->length; i++) {
247
+ GumboError *err = errors->data[i];
248
+ GumboSourcePosition position = gumbo_error_position(err);
249
+ char *msg;
250
+ size_t size = gumbo_caret_diagnostic_to_string(err, input_str, input_len, &msg);
251
+ VALUE err_str = rb_utf8_str_new(msg, (int)size);
252
+ free(msg);
253
+ VALUE syntax_error = rb_class_new_instance(1, &err_str, cNokogiriXmlSyntaxError);
254
+ const char *error_code = gumbo_error_code(err);
255
+ VALUE str1 = error_code ? rb_utf8_str_new_static(error_code, (int)strlen(error_code)) : Qnil;
256
+ rb_iv_set(syntax_error, "@domain", INT2NUM(1)); // XML_FROM_PARSER
257
+ rb_iv_set(syntax_error, "@code", INT2NUM(1)); // XML_ERR_INTERNAL_ERROR
258
+ rb_iv_set(syntax_error, "@level", INT2NUM(2)); // XML_ERR_ERROR
259
+ rb_iv_set(syntax_error, "@file", url);
260
+ rb_iv_set(syntax_error, "@line", SIZET2NUM(position.line));
261
+ rb_iv_set(syntax_error, "@str1", str1);
262
+ rb_iv_set(syntax_error, "@str2", Qnil);
263
+ rb_iv_set(syntax_error, "@str3", Qnil);
264
+ rb_iv_set(syntax_error, "@int1", INT2NUM(0));
265
+ rb_iv_set(syntax_error, "@column", SIZET2NUM(position.column));
266
+ rb_ary_push(rerrors, syntax_error);
267
+ }
268
+ rb_iv_set(rdoc, "@errors", rerrors);
269
+ }
270
+ }
271
+
272
+ typedef struct {
273
+ GumboOutput *output;
274
+ VALUE input;
275
+ VALUE url_or_frag;
276
+ VALUE klass;
277
+ xmlDocPtr doc;
278
+ } ParseArgs;
279
+
280
+ static VALUE
281
+ parse_cleanup(VALUE parse_args)
282
+ {
283
+ ParseArgs *args = (ParseArgs *)parse_args;
284
+ gumbo_destroy_output(args->output);
285
+ // Make sure garbage collection doesn't mark the objects as being live based
286
+ // on references from the ParseArgs. This may be unnecessary.
287
+ args->input = Qnil;
288
+ args->url_or_frag = Qnil;
289
+ if (args->doc != NULL) {
290
+ xmlFreeDoc(args->doc);
291
+ }
292
+ return Qnil;
293
+ }
294
+
295
+ // Scan the keyword arguments for options common to the document and fragment
296
+ // parse.
297
+ static GumboOptions
298
+ common_options(VALUE kwargs)
299
+ {
300
+ // The order of the keywords determines the order of the values below.
301
+ // If this order is changed, then setting the options below must change as
302
+ // well.
303
+ ID keywords[] = {
304
+ // Required keywords.
305
+ rb_intern_const("max_attributes"),
306
+ rb_intern_const("max_errors"),
307
+ rb_intern_const("max_tree_depth"),
308
+
309
+ // Optional keywords.
310
+ rb_intern_const("parse_noscript_content_as_text"),
311
+ };
312
+ VALUE values[sizeof keywords / sizeof keywords[0]];
313
+
314
+ // Extract the values coresponding to the required keywords. Raise an error
315
+ // if required arguments are missing.
316
+ rb_get_kwargs(kwargs, keywords, 3, 1, values);
317
+
318
+ GumboOptions options = kGumboDefaultOptions;
319
+ options.max_attributes = NUM2INT(values[0]);
320
+ options.max_errors = NUM2INT(values[1]);
321
+
322
+ // handle negative values
323
+ int depth = NUM2INT(values[2]);
324
+ options.max_tree_depth = depth < 0 ? UINT_MAX : (unsigned int)depth;
325
+
326
+ options.parse_noscript_content_as_text = values[3] != Qundef && RTEST(values[3]);
327
+
328
+ return options;
329
+ }
330
+
331
+ static VALUE parse_continue(VALUE parse_args);
332
+
333
+ /*
334
+ * @!visibility protected
335
+ */
336
+ static VALUE
337
+ noko_gumbo_s_parse(int argc, VALUE *argv, VALUE _self)
338
+ {
339
+ VALUE input, url, klass, kwargs;
340
+
341
+ rb_scan_args(argc, argv, "3:", &input, &url, &klass, &kwargs);
342
+ if (NIL_P(kwargs)) {
343
+ kwargs = rb_hash_new();
344
+ }
345
+
346
+ GumboOptions options = common_options(kwargs);
347
+
348
+ GumboOutput *output = perform_parse(&options, input);
349
+ ParseArgs args = {
350
+ .output = output,
351
+ .input = input,
352
+ .url_or_frag = url,
353
+ .klass = klass,
354
+ .doc = NULL,
355
+ };
356
+
357
+ return rb_ensure(parse_continue, (VALUE)(&args), parse_cleanup, (VALUE)(&args));
358
+ }
359
+
360
+ static VALUE
361
+ parse_continue(VALUE parse_args)
362
+ {
363
+ ParseArgs *args = (ParseArgs *)parse_args;
364
+ GumboOutput *output = args->output;
365
+ xmlDocPtr doc;
366
+ if (output->document->v.document.has_doctype) {
367
+ const char *name = output->document->v.document.name;
368
+ const char *public = output->document->v.document.public_identifier;
369
+ const char *system = output->document->v.document.system_identifier;
370
+ public = public[0] ? public : NULL;
371
+ system = system[0] ? system : NULL;
372
+ doc = new_html_doc(name, system, public);
373
+ } else {
374
+ doc = new_html_doc(NULL, NULL, NULL);
375
+ }
376
+ args->doc = doc; // Make sure doc gets cleaned up if an error is thrown.
377
+ build_tree(doc, (xmlNodePtr)doc, output->document);
378
+ VALUE rdoc = noko_xml_document_wrap(args->klass, doc);
379
+ rb_iv_set(rdoc, "@url", args->url_or_frag);
380
+ rb_iv_set(rdoc, "@quirks_mode", INT2NUM(output->document->v.document.doc_type_quirks_mode));
381
+ args->doc = NULL; // The Ruby runtime now owns doc so don't delete it.
382
+ add_errors(output, rdoc, args->input, args->url_or_frag);
383
+ return rdoc;
384
+ }
385
+
386
+ static int
387
+ lookup_namespace(VALUE node, bool require_known_ns)
388
+ {
389
+ ID namespace, href;
390
+ CONST_ID(namespace, "namespace");
391
+ CONST_ID(href, "href");
392
+ VALUE ns = rb_funcall(node, namespace, 0);
393
+
394
+ if (NIL_P(ns)) {
395
+ return GUMBO_NAMESPACE_HTML;
396
+ }
397
+ ns = rb_funcall(ns, href, 0);
398
+ assert(RTEST(ns));
399
+ Check_Type(ns, T_STRING);
400
+
401
+ const char *href_ptr = RSTRING_PTR(ns);
402
+ size_t href_len = (size_t)RSTRING_LEN(ns);
403
+ #define NAMESPACE_P(uri) (href_len == sizeof uri - 1 && !memcmp(href_ptr, uri, href_len))
404
+ if (NAMESPACE_P("http://www.w3.org/1999/xhtml")) {
405
+ return GUMBO_NAMESPACE_HTML;
406
+ }
407
+ if (NAMESPACE_P("http://www.w3.org/1998/Math/MathML")) {
408
+ return GUMBO_NAMESPACE_MATHML;
409
+ }
410
+ if (NAMESPACE_P("http://www.w3.org/2000/svg")) {
411
+ return GUMBO_NAMESPACE_SVG;
412
+ }
413
+ #undef NAMESPACE_P
414
+ if (require_known_ns) {
415
+ rb_raise(rb_eArgError, "Unexpected namespace URI \"%*s\"", (int)href_len, href_ptr);
416
+ }
417
+ return -1;
418
+ }
419
+
420
+ static xmlNodePtr
421
+ extract_xml_node(VALUE node)
422
+ {
423
+ xmlNodePtr xml_node;
424
+ Noko_Node_Get_Struct(node, xmlNode, xml_node);
425
+ return xml_node;
426
+ }
427
+
428
+ static VALUE fragment_continue(VALUE parse_args);
429
+
430
+ /*
431
+ * @!visibility protected
432
+ */
433
+ static VALUE
434
+ noko_gumbo_s_fragment(int argc, VALUE *argv, VALUE _self)
435
+ {
436
+ VALUE doc_fragment;
437
+ VALUE tags;
438
+ VALUE ctx;
439
+ VALUE kwargs;
440
+ ID name = rb_intern_const("name");
441
+ const char *ctx_tag;
442
+ GumboNamespaceEnum ctx_ns;
443
+ GumboQuirksModeEnum quirks_mode;
444
+ bool form = false;
445
+ const char *encoding = NULL;
446
+
447
+ rb_scan_args(argc, argv, "3:", &doc_fragment, &tags, &ctx, &kwargs);
448
+ if (NIL_P(kwargs)) {
449
+ kwargs = rb_hash_new();
450
+ }
451
+
452
+ GumboOptions options = common_options(kwargs);
453
+
454
+ if (NIL_P(ctx)) {
455
+ ctx_tag = "body";
456
+ ctx_ns = GUMBO_NAMESPACE_HTML;
457
+ } else if (TYPE(ctx) == T_STRING) {
458
+ ctx_tag = StringValueCStr(ctx);
459
+ ctx_ns = GUMBO_NAMESPACE_HTML;
460
+ size_t len = (size_t)RSTRING_LEN(ctx);
461
+ const char *colon = memchr(ctx_tag, ':', len);
462
+ if (colon) {
463
+ switch (colon - ctx_tag) {
464
+ case 3:
465
+ if (st_strncasecmp(ctx_tag, "svg", 3) != 0) {
466
+ goto error;
467
+ }
468
+ ctx_ns = GUMBO_NAMESPACE_SVG;
469
+ break;
470
+ case 4:
471
+ if (st_strncasecmp(ctx_tag, "html", 4) == 0) {
472
+ ctx_ns = GUMBO_NAMESPACE_HTML;
473
+ } else if (st_strncasecmp(ctx_tag, "math", 4) == 0) {
474
+ ctx_ns = GUMBO_NAMESPACE_MATHML;
475
+ } else {
476
+ goto error;
477
+ }
478
+ break;
479
+ default:
480
+ error:
481
+ rb_raise(rb_eArgError, "Invalid context namespace '%*s'", (int)(colon - ctx_tag), ctx_tag);
482
+ }
483
+ ctx_tag = colon + 1;
484
+ } else {
485
+ // For convenience, put 'svg' and 'math' in their namespaces.
486
+ if (len == 3 && st_strncasecmp(ctx_tag, "svg", 3) == 0) {
487
+ ctx_ns = GUMBO_NAMESPACE_SVG;
488
+ } else if (len == 4 && st_strncasecmp(ctx_tag, "math", 4) == 0) {
489
+ ctx_ns = GUMBO_NAMESPACE_MATHML;
490
+ }
491
+ }
492
+
493
+ // Check if it's a form.
494
+ form = ctx_ns == GUMBO_NAMESPACE_HTML && st_strcasecmp(ctx_tag, "form") == 0;
495
+ } else {
496
+ ID element_ = rb_intern_const("element?");
497
+
498
+ // Context fragment name.
499
+ VALUE tag_name = rb_funcall(ctx, name, 0);
500
+ assert(RTEST(tag_name));
501
+ Check_Type(tag_name, T_STRING);
502
+ ctx_tag = StringValueCStr(tag_name);
503
+
504
+ // Context fragment namespace.
505
+ ctx_ns = lookup_namespace(ctx, true);
506
+
507
+ // Check for a form ancestor, including self.
508
+ for (VALUE node = ctx;
509
+ !NIL_P(node);
510
+ node = rb_respond_to(node, parent) ? rb_funcall(node, parent, 0) : Qnil) {
511
+ if (!RTEST(rb_funcall(node, element_, 0))) {
512
+ continue;
513
+ }
514
+ VALUE element_name = rb_funcall(node, name, 0);
515
+ if (RSTRING_LEN(element_name) == 4
516
+ && !st_strcasecmp(RSTRING_PTR(element_name), "form")
517
+ && lookup_namespace(node, false) == GUMBO_NAMESPACE_HTML) {
518
+ form = true;
519
+ break;
520
+ }
521
+ }
522
+
523
+ // Encoding.
524
+ if (ctx_ns == GUMBO_NAMESPACE_MATHML
525
+ && RSTRING_LEN(tag_name) == 14
526
+ && !st_strcasecmp(ctx_tag, "annotation-xml")) {
527
+ VALUE enc = rb_funcall(ctx, rb_intern_const("[]"),
528
+ 1,
529
+ rb_utf8_str_new_static("encoding", 8));
530
+ if (RTEST(enc)) {
531
+ Check_Type(enc, T_STRING);
532
+ encoding = StringValueCStr(enc);
533
+ }
534
+ }
535
+ }
536
+
537
+ // Quirks mode.
538
+ VALUE doc = rb_funcall(doc_fragment, rb_intern_const("document"), 0);
539
+ VALUE dtd = rb_funcall(doc, internal_subset, 0);
540
+ VALUE doc_quirks_mode = rb_iv_get(doc, "@quirks_mode");
541
+ if (NIL_P(ctx) || (TYPE(ctx) == T_STRING) || NIL_P(doc_quirks_mode)) {
542
+ quirks_mode = GUMBO_DOCTYPE_NO_QUIRKS;
543
+ } else if (NIL_P(dtd)) {
544
+ quirks_mode = GUMBO_DOCTYPE_QUIRKS;
545
+ } else {
546
+ VALUE dtd_name = rb_funcall(dtd, name, 0);
547
+ VALUE pubid = rb_funcall(dtd, rb_intern_const("external_id"), 0);
548
+ VALUE sysid = rb_funcall(dtd, rb_intern_const("system_id"), 0);
549
+ quirks_mode = gumbo_compute_quirks_mode(
550
+ NIL_P(dtd_name) ? NULL : StringValueCStr(dtd_name),
551
+ NIL_P(pubid) ? NULL : StringValueCStr(pubid),
552
+ NIL_P(sysid) ? NULL : StringValueCStr(sysid)
553
+ );
554
+ }
555
+
556
+ // Perform a fragment parse.
557
+ options.fragment_context = ctx_tag;
558
+ options.fragment_namespace = ctx_ns;
559
+ options.fragment_encoding = encoding;
560
+ options.quirks_mode = quirks_mode;
561
+ options.fragment_context_has_form_ancestor = form;
562
+
563
+ // Add one to the max tree depth to account for the HTML element.
564
+ if (options.max_tree_depth < UINT_MAX) { options.max_tree_depth++; }
565
+
566
+ GumboOutput *output = perform_parse(&options, tags);
567
+ ParseArgs args = {
568
+ .output = output,
569
+ .input = tags,
570
+ .url_or_frag = doc_fragment,
571
+ .doc = (xmlDocPtr)extract_xml_node(doc),
572
+ };
573
+ rb_ensure(fragment_continue, (VALUE)(&args), parse_cleanup, (VALUE)(&args));
574
+ return Qnil;
575
+ }
576
+
577
+ static VALUE
578
+ fragment_continue(VALUE parse_args)
579
+ {
580
+ ParseArgs *args = (ParseArgs *)parse_args;
581
+ GumboOutput *output = args->output;
582
+ VALUE doc_fragment = args->url_or_frag;
583
+ xmlDocPtr xml_doc = args->doc;
584
+
585
+ args->doc = NULL; // The Ruby runtime owns doc so make sure we don't delete it.
586
+ xmlNodePtr xml_frag = extract_xml_node(doc_fragment);
587
+ build_tree(xml_doc, xml_frag, output->root);
588
+ rb_iv_set(doc_fragment, "@quirks_mode", INT2NUM(output->document->v.document.doc_type_quirks_mode));
589
+ add_errors(output, doc_fragment, args->input, rb_utf8_str_new_static("#fragment", 9));
590
+ return Qnil;
591
+ }
592
+
593
+ // Initialize the Nokogumbo class and fetch constants we will use later.
594
+ void
595
+ noko_init_gumbo(void)
596
+ {
597
+ // Class constants.
598
+ cNokogiriHtml5Document = rb_define_class_under(mNokogiriHtml5, "Document", cNokogiriHtml4Document);
599
+ rb_gc_register_mark_object(cNokogiriHtml5Document);
600
+
601
+ // Interned symbols.
602
+ internal_subset = rb_intern_const("internal_subset");
603
+ parent = rb_intern_const("parent");
604
+
605
+ // Define Nokogumbo module with parse and fragment methods.
606
+ rb_define_singleton_method(mNokogiriGumbo, "parse", noko_gumbo_s_parse, -1);
607
+ rb_define_singleton_method(mNokogiriGumbo, "fragment", noko_gumbo_s_fragment, -1);
608
+ }
609
+
610
+ // vim: set shiftwidth=2 softtabstop=2 tabstop=8 expandtab: