nokogiri 1.6.0 → 1.13.2

Sign up to get free protection for your applications and to get access to all the features.

Potentially problematic release.


This version of nokogiri might be problematic. Click here for more details.

Files changed (340) hide show
  1. checksums.yaml +7 -0
  2. data/Gemfile +3 -19
  3. data/LICENSE-DEPENDENCIES.md +1903 -0
  4. data/LICENSE.md +9 -0
  5. data/README.md +280 -0
  6. data/bin/nokogiri +84 -31
  7. data/dependencies.yml +23 -4
  8. data/ext/nokogiri/depend +38 -358
  9. data/ext/nokogiri/extconf.rb +952 -132
  10. data/ext/nokogiri/gumbo.c +584 -0
  11. data/ext/nokogiri/html4_document.c +166 -0
  12. data/ext/nokogiri/html4_element_description.c +294 -0
  13. data/ext/nokogiri/html4_entity_lookup.c +37 -0
  14. data/ext/nokogiri/html4_sax_parser_context.c +120 -0
  15. data/ext/nokogiri/html4_sax_push_parser.c +95 -0
  16. data/ext/nokogiri/libxml2_backwards_compat.c +121 -0
  17. data/ext/nokogiri/nokogiri.c +231 -96
  18. data/ext/nokogiri/nokogiri.h +188 -129
  19. data/ext/nokogiri/test_global_handlers.c +40 -0
  20. data/ext/nokogiri/xml_attr.c +49 -40
  21. data/ext/nokogiri/xml_attribute_decl.c +18 -18
  22. data/ext/nokogiri/xml_cdata.c +24 -23
  23. data/ext/nokogiri/xml_comment.c +29 -21
  24. data/ext/nokogiri/xml_document.c +327 -223
  25. data/ext/nokogiri/xml_document_fragment.c +12 -16
  26. data/ext/nokogiri/xml_dtd.c +56 -50
  27. data/ext/nokogiri/xml_element_content.c +31 -26
  28. data/ext/nokogiri/xml_element_decl.c +22 -22
  29. data/ext/nokogiri/xml_encoding_handler.c +45 -20
  30. data/ext/nokogiri/xml_entity_decl.c +32 -30
  31. data/ext/nokogiri/xml_entity_reference.c +16 -18
  32. data/ext/nokogiri/xml_namespace.c +74 -32
  33. data/ext/nokogiri/xml_node.c +1290 -680
  34. data/ext/nokogiri/xml_node_set.c +239 -208
  35. data/ext/nokogiri/xml_processing_instruction.c +17 -19
  36. data/ext/nokogiri/xml_reader.c +227 -189
  37. data/ext/nokogiri/xml_relax_ng.c +52 -28
  38. data/ext/nokogiri/xml_sax_parser.c +123 -125
  39. data/ext/nokogiri/xml_sax_parser_context.c +138 -79
  40. data/ext/nokogiri/xml_sax_push_parser.c +88 -35
  41. data/ext/nokogiri/xml_schema.c +112 -33
  42. data/ext/nokogiri/xml_syntax_error.c +50 -23
  43. data/ext/nokogiri/xml_text.c +14 -18
  44. data/ext/nokogiri/xml_xpath_context.c +227 -140
  45. data/ext/nokogiri/xslt_stylesheet.c +269 -177
  46. data/gumbo-parser/CHANGES.md +63 -0
  47. data/gumbo-parser/Makefile +101 -0
  48. data/gumbo-parser/THANKS +27 -0
  49. data/gumbo-parser/src/Makefile +34 -0
  50. data/gumbo-parser/src/README.md +41 -0
  51. data/gumbo-parser/src/ascii.c +75 -0
  52. data/gumbo-parser/src/ascii.h +115 -0
  53. data/gumbo-parser/src/attribute.c +42 -0
  54. data/gumbo-parser/src/attribute.h +17 -0
  55. data/gumbo-parser/src/char_ref.c +22225 -0
  56. data/gumbo-parser/src/char_ref.h +29 -0
  57. data/gumbo-parser/src/char_ref.rl +2154 -0
  58. data/gumbo-parser/src/error.c +626 -0
  59. data/gumbo-parser/src/error.h +148 -0
  60. data/gumbo-parser/src/foreign_attrs.c +104 -0
  61. data/gumbo-parser/src/foreign_attrs.gperf +27 -0
  62. data/gumbo-parser/src/gumbo.h +943 -0
  63. data/gumbo-parser/src/insertion_mode.h +33 -0
  64. data/gumbo-parser/src/macros.h +91 -0
  65. data/gumbo-parser/src/parser.c +4875 -0
  66. data/gumbo-parser/src/parser.h +41 -0
  67. data/gumbo-parser/src/replacement.h +33 -0
  68. data/gumbo-parser/src/string_buffer.c +103 -0
  69. data/gumbo-parser/src/string_buffer.h +68 -0
  70. data/gumbo-parser/src/string_piece.c +48 -0
  71. data/gumbo-parser/src/svg_attrs.c +174 -0
  72. data/gumbo-parser/src/svg_attrs.gperf +77 -0
  73. data/gumbo-parser/src/svg_tags.c +137 -0
  74. data/gumbo-parser/src/svg_tags.gperf +55 -0
  75. data/gumbo-parser/src/tag.c +222 -0
  76. data/gumbo-parser/src/tag_lookup.c +382 -0
  77. data/gumbo-parser/src/tag_lookup.gperf +169 -0
  78. data/gumbo-parser/src/tag_lookup.h +13 -0
  79. data/gumbo-parser/src/token_buffer.c +79 -0
  80. data/gumbo-parser/src/token_buffer.h +71 -0
  81. data/gumbo-parser/src/token_type.h +17 -0
  82. data/gumbo-parser/src/tokenizer.c +3463 -0
  83. data/gumbo-parser/src/tokenizer.h +112 -0
  84. data/gumbo-parser/src/tokenizer_states.h +339 -0
  85. data/gumbo-parser/src/utf8.c +245 -0
  86. data/gumbo-parser/src/utf8.h +164 -0
  87. data/gumbo-parser/src/util.c +68 -0
  88. data/gumbo-parser/src/util.h +30 -0
  89. data/gumbo-parser/src/vector.c +111 -0
  90. data/gumbo-parser/src/vector.h +45 -0
  91. data/lib/nokogiri/class_resolver.rb +67 -0
  92. data/lib/nokogiri/css/node.rb +10 -58
  93. data/lib/nokogiri/css/parser.rb +407 -357
  94. data/lib/nokogiri/css/parser.y +265 -246
  95. data/lib/nokogiri/css/parser_extras.rb +52 -49
  96. data/lib/nokogiri/css/syntax_error.rb +3 -1
  97. data/lib/nokogiri/css/tokenizer.rb +107 -104
  98. data/lib/nokogiri/css/tokenizer.rex +8 -7
  99. data/lib/nokogiri/css/xpath_visitor.rb +266 -80
  100. data/lib/nokogiri/css.rb +50 -17
  101. data/lib/nokogiri/decorators/slop.rb +17 -8
  102. data/lib/nokogiri/extension.rb +31 -0
  103. data/lib/nokogiri/gumbo.rb +15 -0
  104. data/lib/nokogiri/html.rb +38 -27
  105. data/lib/nokogiri/{html → html4}/builder.rb +4 -2
  106. data/lib/nokogiri/html4/document.rb +331 -0
  107. data/lib/nokogiri/html4/document_fragment.rb +54 -0
  108. data/lib/nokogiri/{html → html4}/element_description.rb +3 -1
  109. data/lib/nokogiri/html4/element_description_defaults.rb +578 -0
  110. data/lib/nokogiri/{html → html4}/entity_lookup.rb +4 -2
  111. data/lib/nokogiri/{html → html4}/sax/parser.rb +24 -15
  112. data/lib/nokogiri/html4/sax/parser_context.rb +20 -0
  113. data/lib/nokogiri/html4/sax/push_parser.rb +37 -0
  114. data/lib/nokogiri/html4.rb +46 -0
  115. data/lib/nokogiri/html5/document.rb +88 -0
  116. data/lib/nokogiri/html5/document_fragment.rb +83 -0
  117. data/lib/nokogiri/html5/node.rb +96 -0
  118. data/lib/nokogiri/html5.rb +477 -0
  119. data/lib/nokogiri/jruby/dependencies.rb +21 -0
  120. data/lib/nokogiri/syntax_error.rb +2 -0
  121. data/lib/nokogiri/version/constant.rb +6 -0
  122. data/lib/nokogiri/version/info.rb +221 -0
  123. data/lib/nokogiri/version.rb +3 -105
  124. data/lib/nokogiri/xml/attr.rb +6 -3
  125. data/lib/nokogiri/xml/attribute_decl.rb +3 -1
  126. data/lib/nokogiri/xml/builder.rb +96 -54
  127. data/lib/nokogiri/xml/cdata.rb +3 -1
  128. data/lib/nokogiri/xml/character_data.rb +2 -0
  129. data/lib/nokogiri/xml/document.rb +234 -95
  130. data/lib/nokogiri/xml/document_fragment.rb +86 -36
  131. data/lib/nokogiri/xml/dtd.rb +16 -4
  132. data/lib/nokogiri/xml/element_content.rb +2 -0
  133. data/lib/nokogiri/xml/element_decl.rb +3 -1
  134. data/lib/nokogiri/xml/entity_decl.rb +4 -2
  135. data/lib/nokogiri/xml/entity_reference.rb +20 -0
  136. data/lib/nokogiri/xml/namespace.rb +3 -0
  137. data/lib/nokogiri/xml/node/save_options.rb +8 -4
  138. data/lib/nokogiri/xml/node.rb +947 -502
  139. data/lib/nokogiri/xml/node_set.rb +168 -159
  140. data/lib/nokogiri/xml/notation.rb +13 -0
  141. data/lib/nokogiri/xml/parse_options.rb +40 -5
  142. data/lib/nokogiri/xml/pp/character_data.rb +9 -6
  143. data/lib/nokogiri/xml/pp/node.rb +25 -26
  144. data/lib/nokogiri/xml/pp.rb +4 -2
  145. data/lib/nokogiri/xml/processing_instruction.rb +3 -1
  146. data/lib/nokogiri/xml/reader.rb +23 -28
  147. data/lib/nokogiri/xml/relax_ng.rb +8 -2
  148. data/lib/nokogiri/xml/sax/document.rb +45 -49
  149. data/lib/nokogiri/xml/sax/parser.rb +43 -41
  150. data/lib/nokogiri/xml/sax/parser_context.rb +8 -3
  151. data/lib/nokogiri/xml/sax/push_parser.rb +6 -5
  152. data/lib/nokogiri/xml/sax.rb +6 -4
  153. data/lib/nokogiri/xml/schema.rb +19 -9
  154. data/lib/nokogiri/xml/searchable.rb +270 -0
  155. data/lib/nokogiri/xml/syntax_error.rb +25 -1
  156. data/lib/nokogiri/xml/text.rb +2 -0
  157. data/lib/nokogiri/xml/xpath/syntax_error.rb +4 -2
  158. data/lib/nokogiri/xml/xpath.rb +15 -4
  159. data/lib/nokogiri/xml/xpath_context.rb +3 -3
  160. data/lib/nokogiri/xml.rb +38 -36
  161. data/lib/nokogiri/xslt/stylesheet.rb +3 -1
  162. data/lib/nokogiri/xslt.rb +29 -20
  163. data/lib/nokogiri.rb +69 -69
  164. data/lib/xsd/xmlparser/nokogiri.rb +26 -24
  165. data/patches/libxml2/0001-Remove-script-macro-support.patch +40 -0
  166. data/patches/libxml2/0002-Update-entities-to-remove-handling-of-ssi.patch +44 -0
  167. data/patches/libxml2/0003-libxml2.la-is-in-top_builddir.patch +25 -0
  168. data/patches/libxml2/0004-use-glibc-strlen.patch +53 -0
  169. data/patches/libxml2/0005-avoid-isnan-isinf.patch +81 -0
  170. data/patches/libxml2/0006-update-automake-files-for-arm64.patch +3040 -0
  171. data/patches/libxml2/0008-htmlParseComment-handle-abruptly-closed-comments.patch +61 -0
  172. data/patches/libxml2/0009-allow-wildcard-namespaces.patch +77 -0
  173. data/patches/libxslt/0001-update-automake-files-for-arm64.patch +3037 -0
  174. data/ports/archives/libxml2-2.9.13.tar.xz +0 -0
  175. data/ports/archives/libxslt-1.1.35.tar.xz +0 -0
  176. metadata +278 -362
  177. data/.autotest +0 -26
  178. data/.gemtest +0 -0
  179. data/.travis.yml +0 -27
  180. data/CHANGELOG.ja.rdoc +0 -819
  181. data/CHANGELOG.rdoc +0 -819
  182. data/C_CODING_STYLE.rdoc +0 -33
  183. data/Manifest.txt +0 -315
  184. data/README.ja.rdoc +0 -106
  185. data/README.rdoc +0 -175
  186. data/ROADMAP.md +0 -90
  187. data/Rakefile +0 -246
  188. data/STANDARD_RESPONSES.md +0 -47
  189. data/Y_U_NO_GEMSPEC.md +0 -155
  190. data/build_all +0 -105
  191. data/ext/nokogiri/html_document.c +0 -170
  192. data/ext/nokogiri/html_document.h +0 -10
  193. data/ext/nokogiri/html_element_description.c +0 -279
  194. data/ext/nokogiri/html_element_description.h +0 -10
  195. data/ext/nokogiri/html_entity_lookup.c +0 -32
  196. data/ext/nokogiri/html_entity_lookup.h +0 -8
  197. data/ext/nokogiri/html_sax_parser_context.c +0 -116
  198. data/ext/nokogiri/html_sax_parser_context.h +0 -11
  199. data/ext/nokogiri/html_sax_push_parser.c +0 -87
  200. data/ext/nokogiri/html_sax_push_parser.h +0 -9
  201. data/ext/nokogiri/xml_attr.h +0 -9
  202. data/ext/nokogiri/xml_attribute_decl.h +0 -9
  203. data/ext/nokogiri/xml_cdata.h +0 -9
  204. data/ext/nokogiri/xml_comment.h +0 -9
  205. data/ext/nokogiri/xml_document.h +0 -23
  206. data/ext/nokogiri/xml_document_fragment.h +0 -10
  207. data/ext/nokogiri/xml_dtd.h +0 -10
  208. data/ext/nokogiri/xml_element_content.h +0 -10
  209. data/ext/nokogiri/xml_element_decl.h +0 -9
  210. data/ext/nokogiri/xml_encoding_handler.h +0 -8
  211. data/ext/nokogiri/xml_entity_decl.h +0 -10
  212. data/ext/nokogiri/xml_entity_reference.h +0 -9
  213. data/ext/nokogiri/xml_io.c +0 -56
  214. data/ext/nokogiri/xml_io.h +0 -11
  215. data/ext/nokogiri/xml_libxml2_hacks.c +0 -112
  216. data/ext/nokogiri/xml_libxml2_hacks.h +0 -12
  217. data/ext/nokogiri/xml_namespace.h +0 -13
  218. data/ext/nokogiri/xml_node.h +0 -13
  219. data/ext/nokogiri/xml_node_set.h +0 -14
  220. data/ext/nokogiri/xml_processing_instruction.h +0 -9
  221. data/ext/nokogiri/xml_reader.h +0 -10
  222. data/ext/nokogiri/xml_relax_ng.h +0 -9
  223. data/ext/nokogiri/xml_sax_parser.h +0 -39
  224. data/ext/nokogiri/xml_sax_parser_context.h +0 -10
  225. data/ext/nokogiri/xml_sax_push_parser.h +0 -9
  226. data/ext/nokogiri/xml_schema.h +0 -9
  227. data/ext/nokogiri/xml_syntax_error.h +0 -13
  228. data/ext/nokogiri/xml_text.h +0 -9
  229. data/ext/nokogiri/xml_xpath_context.h +0 -10
  230. data/ext/nokogiri/xslt_stylesheet.h +0 -14
  231. data/lib/nokogiri/html/document.rb +0 -254
  232. data/lib/nokogiri/html/document_fragment.rb +0 -41
  233. data/lib/nokogiri/html/element_description_defaults.rb +0 -671
  234. data/lib/nokogiri/html/sax/parser_context.rb +0 -16
  235. data/lib/nokogiri/html/sax/push_parser.rb +0 -16
  236. data/ports/archives/libxml2-2.8.0.tar.gz +0 -0
  237. data/ports/archives/libxslt-1.1.26.tar.gz +0 -0
  238. data/tasks/cross_compile.rb +0 -132
  239. data/tasks/nokogiri.org.rb +0 -24
  240. data/tasks/test.rb +0 -95
  241. data/test/css/test_nthiness.rb +0 -159
  242. data/test/css/test_parser.rb +0 -341
  243. data/test/css/test_tokenizer.rb +0 -198
  244. data/test/css/test_xpath_visitor.rb +0 -91
  245. data/test/decorators/test_slop.rb +0 -16
  246. data/test/files/2ch.html +0 -108
  247. data/test/files/address_book.rlx +0 -12
  248. data/test/files/address_book.xml +0 -10
  249. data/test/files/bar/bar.xsd +0 -4
  250. data/test/files/bogus.xml +0 -0
  251. data/test/files/dont_hurt_em_why.xml +0 -422
  252. data/test/files/encoding.html +0 -82
  253. data/test/files/encoding.xhtml +0 -84
  254. data/test/files/exslt.xml +0 -8
  255. data/test/files/exslt.xslt +0 -35
  256. data/test/files/foo/foo.xsd +0 -4
  257. data/test/files/metacharset.html +0 -10
  258. data/test/files/noencoding.html +0 -47
  259. data/test/files/po.xml +0 -32
  260. data/test/files/po.xsd +0 -66
  261. data/test/files/saml/saml20assertion_schema.xsd +0 -283
  262. data/test/files/saml/saml20protocol_schema.xsd +0 -302
  263. data/test/files/saml/xenc_schema.xsd +0 -146
  264. data/test/files/saml/xmldsig_schema.xsd +0 -318
  265. data/test/files/shift_jis.html +0 -10
  266. data/test/files/shift_jis.xml +0 -5
  267. data/test/files/snuggles.xml +0 -3
  268. data/test/files/staff.dtd +0 -10
  269. data/test/files/staff.xml +0 -59
  270. data/test/files/staff.xslt +0 -32
  271. data/test/files/test_document_url/bar.xml +0 -2
  272. data/test/files/test_document_url/document.dtd +0 -4
  273. data/test/files/test_document_url/document.xml +0 -6
  274. data/test/files/tlm.html +0 -850
  275. data/test/files/to_be_xincluded.xml +0 -2
  276. data/test/files/valid_bar.xml +0 -2
  277. data/test/files/xinclude.xml +0 -4
  278. data/test/helper.rb +0 -154
  279. data/test/html/sax/test_parser.rb +0 -141
  280. data/test/html/sax/test_parser_context.rb +0 -46
  281. data/test/html/test_builder.rb +0 -164
  282. data/test/html/test_document.rb +0 -552
  283. data/test/html/test_document_encoding.rb +0 -138
  284. data/test/html/test_document_fragment.rb +0 -261
  285. data/test/html/test_element_description.rb +0 -105
  286. data/test/html/test_named_characters.rb +0 -14
  287. data/test/html/test_node.rb +0 -196
  288. data/test/html/test_node_encoding.rb +0 -27
  289. data/test/namespaces/test_additional_namespaces_in_builder_doc.rb +0 -14
  290. data/test/namespaces/test_namespaces_in_builder_doc.rb +0 -75
  291. data/test/namespaces/test_namespaces_in_created_doc.rb +0 -75
  292. data/test/namespaces/test_namespaces_in_parsed_doc.rb +0 -66
  293. data/test/test_convert_xpath.rb +0 -135
  294. data/test/test_css_cache.rb +0 -45
  295. data/test/test_encoding_handler.rb +0 -46
  296. data/test/test_memory_leak.rb +0 -156
  297. data/test/test_nokogiri.rb +0 -132
  298. data/test/test_reader.rb +0 -555
  299. data/test/test_soap4r_sax.rb +0 -52
  300. data/test/test_xslt_transforms.rb +0 -254
  301. data/test/xml/node/test_save_options.rb +0 -28
  302. data/test/xml/node/test_subclass.rb +0 -44
  303. data/test/xml/sax/test_parser.rb +0 -366
  304. data/test/xml/sax/test_parser_context.rb +0 -106
  305. data/test/xml/sax/test_push_parser.rb +0 -157
  306. data/test/xml/test_attr.rb +0 -64
  307. data/test/xml/test_attribute_decl.rb +0 -86
  308. data/test/xml/test_builder.rb +0 -306
  309. data/test/xml/test_c14n.rb +0 -151
  310. data/test/xml/test_cdata.rb +0 -48
  311. data/test/xml/test_comment.rb +0 -29
  312. data/test/xml/test_document.rb +0 -828
  313. data/test/xml/test_document_encoding.rb +0 -28
  314. data/test/xml/test_document_fragment.rb +0 -223
  315. data/test/xml/test_dtd.rb +0 -103
  316. data/test/xml/test_dtd_encoding.rb +0 -33
  317. data/test/xml/test_element_content.rb +0 -56
  318. data/test/xml/test_element_decl.rb +0 -73
  319. data/test/xml/test_entity_decl.rb +0 -122
  320. data/test/xml/test_entity_reference.rb +0 -245
  321. data/test/xml/test_namespace.rb +0 -95
  322. data/test/xml/test_node.rb +0 -1137
  323. data/test/xml/test_node_attributes.rb +0 -96
  324. data/test/xml/test_node_encoding.rb +0 -107
  325. data/test/xml/test_node_inheritance.rb +0 -32
  326. data/test/xml/test_node_reparenting.rb +0 -374
  327. data/test/xml/test_node_set.rb +0 -755
  328. data/test/xml/test_parse_options.rb +0 -64
  329. data/test/xml/test_processing_instruction.rb +0 -30
  330. data/test/xml/test_reader_encoding.rb +0 -142
  331. data/test/xml/test_relax_ng.rb +0 -60
  332. data/test/xml/test_schema.rb +0 -103
  333. data/test/xml/test_syntax_error.rb +0 -12
  334. data/test/xml/test_text.rb +0 -45
  335. data/test/xml/test_unparented_node.rb +0 -422
  336. data/test/xml/test_xinclude.rb +0 -83
  337. data/test/xml/test_xpath.rb +0 -295
  338. data/test/xslt/test_custom_functions.rb +0 -133
  339. data/test/xslt/test_exception_handling.rb +0 -37
  340. data/test_all +0 -81
@@ -0,0 +1,584 @@
1
+ //
2
+ // Copyright 2013-2021 Sam Ruby, Stephen Checkoway
3
+ //
4
+ // Licensed under the Apache License, Version 2.0 (the "License");
5
+ // you may not use this file except in compliance with the License.
6
+ // You may obtain a copy of the License at
7
+ //
8
+ // http://www.apache.org/licenses/LICENSE-2.0
9
+ //
10
+ // Unless required by applicable law or agreed to in writing, software
11
+ // distributed under the License is distributed on an "AS IS" BASIS,
12
+ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ // See the License for the specific language governing permissions and
14
+ // limitations under the License.
15
+ //
16
+
17
+ //
18
+ // gumbo.c defines the following:
19
+ //
20
+ // class Nokogumbo
21
+ // def parse(utf8_string) # returns Nokogiri::HTML5::Document
22
+ // end
23
+ //
24
+ // Processing starts by calling gumbo_parse_with_options. The resulting document tree
25
+ // is then walked, a parallel libxml2 tree is constructed, and the final document is
26
+ // then wrapped using Nokogiri_wrap_xml_document. This approach reduces memory and CPU
27
+ // requirements as Ruby objects are only built when necessary.
28
+ //
29
+
30
+ #include <nokogiri.h>
31
+
32
+ #include "gumbo.h"
33
+
34
+ VALUE cNokogiriHtml5Document;
35
+
36
+ // Interned symbols
37
+ static ID internal_subset;
38
+ static ID parent;
39
+
40
+ /* Backwards compatibility to Ruby 2.1.0 */
41
+ #if RUBY_API_VERSION_CODE < 20200
42
+ #define ONIG_ESCAPE_UCHAR_COLLISION 1
43
+ #include <ruby/encoding.h>
44
+
45
+ static VALUE
46
+ rb_utf8_str_new(const char *str, long length)
47
+ {
48
+ return rb_enc_str_new(str, length, rb_utf8_encoding());
49
+ }
50
+
51
+ static VALUE
52
+ rb_utf8_str_new_cstr(const char *str)
53
+ {
54
+ return rb_enc_str_new_cstr(str, rb_utf8_encoding());
55
+ }
56
+
57
+ static VALUE
58
+ rb_utf8_str_new_static(const char *str, long length)
59
+ {
60
+ return rb_enc_str_new(str, length, rb_utf8_encoding());
61
+ }
62
+ #endif
63
+
64
+ #include <nokogiri.h>
65
+ #include <libxml/tree.h>
66
+ #include <libxml/HTMLtree.h>
67
+
68
+ // URI = system id
69
+ // external id = public id
70
+ static xmlDocPtr
71
+ new_html_doc(const char *dtd_name, const char *system, const char *public)
72
+ {
73
+ // These two libxml2 functions take the public and system ids in
74
+ // opposite orders.
75
+ htmlDocPtr doc = htmlNewDocNoDtD(/* URI */ NULL, /* ExternalID */NULL);
76
+ assert(doc);
77
+ if (dtd_name) {
78
+ xmlCreateIntSubset(doc, (const xmlChar *)dtd_name, (const xmlChar *)public, (const xmlChar *)system);
79
+ }
80
+ return doc;
81
+ }
82
+
83
+ static xmlNodePtr
84
+ get_parent(xmlNodePtr node)
85
+ {
86
+ return node->parent;
87
+ }
88
+
89
+ static GumboOutput *
90
+ perform_parse(const GumboOptions *options, VALUE input)
91
+ {
92
+ assert(RTEST(input));
93
+ Check_Type(input, T_STRING);
94
+ GumboOutput *output = gumbo_parse_with_options(
95
+ options,
96
+ RSTRING_PTR(input),
97
+ RSTRING_LEN(input)
98
+ );
99
+
100
+ const char *status_string = gumbo_status_to_string(output->status);
101
+ switch (output->status) {
102
+ case GUMBO_STATUS_OK:
103
+ break;
104
+ case GUMBO_STATUS_TOO_MANY_ATTRIBUTES:
105
+ case GUMBO_STATUS_TREE_TOO_DEEP:
106
+ gumbo_destroy_output(output);
107
+ rb_raise(rb_eArgError, "%s", status_string);
108
+ case GUMBO_STATUS_OUT_OF_MEMORY:
109
+ gumbo_destroy_output(output);
110
+ rb_raise(rb_eNoMemError, "%s", status_string);
111
+ }
112
+ return output;
113
+ }
114
+
115
+ static xmlNsPtr
116
+ lookup_or_add_ns(
117
+ xmlDocPtr doc,
118
+ xmlNodePtr root,
119
+ const char *href,
120
+ const char *prefix
121
+ )
122
+ {
123
+ xmlNsPtr ns = xmlSearchNs(doc, root, (const xmlChar *)prefix);
124
+ if (ns) {
125
+ return ns;
126
+ }
127
+ return xmlNewNs(root, (const xmlChar *)href, (const xmlChar *)prefix);
128
+ }
129
+
130
+ static void
131
+ set_line(xmlNodePtr node, size_t line)
132
+ {
133
+ // libxml2 uses 65535 to mean look elsewhere for the line number on some
134
+ // nodes.
135
+ if (line < 65535) {
136
+ node->line = (unsigned short)line;
137
+ }
138
+ }
139
+
140
+ // Construct an XML tree rooted at xml_output_node from the Gumbo tree rooted
141
+ // at gumbo_node.
142
+ static void
143
+ build_tree(
144
+ xmlDocPtr doc,
145
+ xmlNodePtr xml_output_node,
146
+ const GumboNode *gumbo_node
147
+ )
148
+ {
149
+ xmlNodePtr xml_root = NULL;
150
+ xmlNodePtr xml_node = xml_output_node;
151
+ size_t child_index = 0;
152
+
153
+ while (true) {
154
+ assert(gumbo_node != NULL);
155
+ const GumboVector *children = gumbo_node->type == GUMBO_NODE_DOCUMENT ?
156
+ &gumbo_node->v.document.children : &gumbo_node->v.element.children;
157
+ if (child_index >= children->length) {
158
+ // Move up the tree and to the next child.
159
+ if (xml_node == xml_output_node) {
160
+ // We've built as much of the tree as we can.
161
+ return;
162
+ }
163
+ child_index = gumbo_node->index_within_parent + 1;
164
+ gumbo_node = gumbo_node->parent;
165
+ xml_node = get_parent(xml_node);
166
+ // Children of fragments don't share the same root, so reset it and
167
+ // it'll be set below. In the non-fragment case, this will only happen
168
+ // after the html element has been finished at which point there are no
169
+ // further elements.
170
+ if (xml_node == xml_output_node) {
171
+ xml_root = NULL;
172
+ }
173
+ continue;
174
+ }
175
+ const GumboNode *gumbo_child = children->data[child_index++];
176
+ xmlNodePtr xml_child;
177
+
178
+ switch (gumbo_child->type) {
179
+ case GUMBO_NODE_DOCUMENT:
180
+ abort(); // Bug in Gumbo.
181
+
182
+ case GUMBO_NODE_TEXT:
183
+ case GUMBO_NODE_WHITESPACE:
184
+ xml_child = xmlNewDocText(doc, (const xmlChar *)gumbo_child->v.text.text);
185
+ set_line(xml_child, gumbo_child->v.text.start_pos.line);
186
+ xmlAddChild(xml_node, xml_child);
187
+ break;
188
+
189
+ case GUMBO_NODE_CDATA:
190
+ xml_child = xmlNewCDataBlock(doc, (const xmlChar *)gumbo_child->v.text.text,
191
+ (int) strlen(gumbo_child->v.text.text));
192
+ set_line(xml_child, gumbo_child->v.text.start_pos.line);
193
+ xmlAddChild(xml_node, xml_child);
194
+ break;
195
+
196
+ case GUMBO_NODE_COMMENT:
197
+ xml_child = xmlNewDocComment(doc, (const xmlChar *)gumbo_child->v.text.text);
198
+ set_line(xml_child, gumbo_child->v.text.start_pos.line);
199
+ xmlAddChild(xml_node, xml_child);
200
+ break;
201
+
202
+ case GUMBO_NODE_TEMPLATE:
203
+ // XXX: Should create a template element and a new DocumentFragment
204
+ case GUMBO_NODE_ELEMENT: {
205
+ xml_child = xmlNewDocNode(doc, NULL, (const xmlChar *)gumbo_child->v.element.name, NULL);
206
+ set_line(xml_child, gumbo_child->v.element.start_pos.line);
207
+ if (xml_root == NULL) {
208
+ xml_root = xml_child;
209
+ }
210
+ xmlNsPtr ns = NULL;
211
+ switch (gumbo_child->v.element.tag_namespace) {
212
+ case GUMBO_NAMESPACE_HTML:
213
+ break;
214
+ case GUMBO_NAMESPACE_SVG:
215
+ ns = lookup_or_add_ns(doc, xml_root, "http://www.w3.org/2000/svg", "svg");
216
+ break;
217
+ case GUMBO_NAMESPACE_MATHML:
218
+ ns = lookup_or_add_ns(doc, xml_root, "http://www.w3.org/1998/Math/MathML", "math");
219
+ break;
220
+ }
221
+ if (ns != NULL) {
222
+ xmlSetNs(xml_child, ns);
223
+ }
224
+ xmlAddChild(xml_node, xml_child);
225
+
226
+ // Add the attributes.
227
+ const GumboVector *attrs = &gumbo_child->v.element.attributes;
228
+ for (size_t i = 0; i < attrs->length; i++) {
229
+ const GumboAttribute *attr = attrs->data[i];
230
+
231
+ switch (attr->attr_namespace) {
232
+ case GUMBO_ATTR_NAMESPACE_XLINK:
233
+ ns = lookup_or_add_ns(doc, xml_root, "http://www.w3.org/1999/xlink", "xlink");
234
+ break;
235
+
236
+ case GUMBO_ATTR_NAMESPACE_XML:
237
+ ns = lookup_or_add_ns(doc, xml_root, "http://www.w3.org/XML/1998/namespace", "xml");
238
+ break;
239
+
240
+ case GUMBO_ATTR_NAMESPACE_XMLNS:
241
+ ns = lookup_or_add_ns(doc, xml_root, "http://www.w3.org/2000/xmlns/", "xmlns");
242
+ break;
243
+
244
+ default:
245
+ ns = NULL;
246
+ }
247
+ xmlNewNsProp(xml_child, ns, (const xmlChar *)attr->name, (const xmlChar *)attr->value);
248
+ }
249
+
250
+ // Add children for this element.
251
+ child_index = 0;
252
+ gumbo_node = gumbo_child;
253
+ xml_node = xml_child;
254
+ }
255
+ }
256
+ }
257
+ }
258
+
259
+ static void
260
+ add_errors(const GumboOutput *output, VALUE rdoc, VALUE input, VALUE url)
261
+ {
262
+ const char *input_str = RSTRING_PTR(input);
263
+ size_t input_len = RSTRING_LEN(input);
264
+
265
+ // Add parse errors to rdoc.
266
+ if (output->errors.length) {
267
+ const GumboVector *errors = &output->errors;
268
+ VALUE rerrors = rb_ary_new2(errors->length);
269
+
270
+ for (size_t i = 0; i < errors->length; i++) {
271
+ GumboError *err = errors->data[i];
272
+ GumboSourcePosition position = gumbo_error_position(err);
273
+ char *msg;
274
+ size_t size = gumbo_caret_diagnostic_to_string(err, input_str, input_len, &msg);
275
+ VALUE err_str = rb_utf8_str_new(msg, size);
276
+ free(msg);
277
+ VALUE syntax_error = rb_class_new_instance(1, &err_str, cNokogiriXmlSyntaxError);
278
+ const char *error_code = gumbo_error_code(err);
279
+ VALUE str1 = error_code ? rb_utf8_str_new_static(error_code, strlen(error_code)) : Qnil;
280
+ rb_iv_set(syntax_error, "@domain", INT2NUM(1)); // XML_FROM_PARSER
281
+ rb_iv_set(syntax_error, "@code", INT2NUM(1)); // XML_ERR_INTERNAL_ERROR
282
+ rb_iv_set(syntax_error, "@level", INT2NUM(2)); // XML_ERR_ERROR
283
+ rb_iv_set(syntax_error, "@file", url);
284
+ rb_iv_set(syntax_error, "@line", INT2NUM(position.line));
285
+ rb_iv_set(syntax_error, "@str1", str1);
286
+ rb_iv_set(syntax_error, "@str2", Qnil);
287
+ rb_iv_set(syntax_error, "@str3", Qnil);
288
+ rb_iv_set(syntax_error, "@int1", INT2NUM(0));
289
+ rb_iv_set(syntax_error, "@column", INT2NUM(position.column));
290
+ rb_ary_push(rerrors, syntax_error);
291
+ }
292
+ rb_iv_set(rdoc, "@errors", rerrors);
293
+ }
294
+ }
295
+
296
+ typedef struct {
297
+ GumboOutput *output;
298
+ VALUE input;
299
+ VALUE url_or_frag;
300
+ xmlDocPtr doc;
301
+ } ParseArgs;
302
+
303
+ static VALUE
304
+ parse_cleanup(VALUE parse_args)
305
+ {
306
+ ParseArgs *args = (ParseArgs *)parse_args;
307
+ gumbo_destroy_output(args->output);
308
+ // Make sure garbage collection doesn't mark the objects as being live based
309
+ // on references from the ParseArgs. This may be unnecessary.
310
+ args->input = Qnil;
311
+ args->url_or_frag = Qnil;
312
+ if (args->doc != NULL) {
313
+ xmlFreeDoc(args->doc);
314
+ }
315
+ return Qnil;
316
+ }
317
+
318
+ static VALUE parse_continue(VALUE parse_args);
319
+
320
+ /*
321
+ * @!visibility protected
322
+ */
323
+ static VALUE
324
+ parse(VALUE self, VALUE input, VALUE url, VALUE max_attributes, VALUE max_errors, VALUE max_depth)
325
+ {
326
+ GumboOptions options = kGumboDefaultOptions;
327
+ options.max_attributes = NUM2INT(max_attributes);
328
+ options.max_errors = NUM2INT(max_errors);
329
+ options.max_tree_depth = NUM2INT(max_depth);
330
+
331
+ GumboOutput *output = perform_parse(&options, input);
332
+ ParseArgs args = {
333
+ .output = output,
334
+ .input = input,
335
+ .url_or_frag = url,
336
+ .doc = NULL,
337
+ };
338
+
339
+ return rb_ensure(parse_continue, (VALUE)(&args), parse_cleanup, (VALUE)(&args));
340
+ }
341
+
342
+ static VALUE
343
+ parse_continue(VALUE parse_args)
344
+ {
345
+ ParseArgs *args = (ParseArgs *)parse_args;
346
+ GumboOutput *output = args->output;
347
+ xmlDocPtr doc;
348
+ if (output->document->v.document.has_doctype) {
349
+ const char *name = output->document->v.document.name;
350
+ const char *public = output->document->v.document.public_identifier;
351
+ const char *system = output->document->v.document.system_identifier;
352
+ public = public[0] ? public : NULL;
353
+ system = system[0] ? system : NULL;
354
+ doc = new_html_doc(name, system, public);
355
+ } else {
356
+ doc = new_html_doc(NULL, NULL, NULL);
357
+ }
358
+ args->doc = doc; // Make sure doc gets cleaned up if an error is thrown.
359
+ build_tree(doc, (xmlNodePtr)doc, output->document);
360
+ VALUE rdoc = Nokogiri_wrap_xml_document(cNokogiriHtml5Document, doc);
361
+ args->doc = NULL; // The Ruby runtime now owns doc so don't delete it.
362
+ add_errors(output, rdoc, args->input, args->url_or_frag);
363
+ return rdoc;
364
+ }
365
+
366
+ static int
367
+ lookup_namespace(VALUE node, bool require_known_ns)
368
+ {
369
+ ID namespace, href;
370
+ CONST_ID(namespace, "namespace");
371
+ CONST_ID(href, "href");
372
+ VALUE ns = rb_funcall(node, namespace, 0);
373
+
374
+ if (NIL_P(ns)) {
375
+ return GUMBO_NAMESPACE_HTML;
376
+ }
377
+ ns = rb_funcall(ns, href, 0);
378
+ assert(RTEST(ns));
379
+ Check_Type(ns, T_STRING);
380
+
381
+ const char *href_ptr = RSTRING_PTR(ns);
382
+ size_t href_len = RSTRING_LEN(ns);
383
+ #define NAMESPACE_P(uri) (href_len == sizeof uri - 1 && !memcmp(href_ptr, uri, href_len))
384
+ if (NAMESPACE_P("http://www.w3.org/1999/xhtml")) {
385
+ return GUMBO_NAMESPACE_HTML;
386
+ }
387
+ if (NAMESPACE_P("http://www.w3.org/1998/Math/MathML")) {
388
+ return GUMBO_NAMESPACE_MATHML;
389
+ }
390
+ if (NAMESPACE_P("http://www.w3.org/2000/svg")) {
391
+ return GUMBO_NAMESPACE_SVG;
392
+ }
393
+ #undef NAMESPACE_P
394
+ if (require_known_ns) {
395
+ rb_raise(rb_eArgError, "Unexpected namespace URI \"%*s\"", (int)href_len, href_ptr);
396
+ }
397
+ return -1;
398
+ }
399
+
400
+ static xmlNodePtr
401
+ extract_xml_node(VALUE node)
402
+ {
403
+ xmlNodePtr xml_node;
404
+ Data_Get_Struct(node, xmlNode, xml_node);
405
+ return xml_node;
406
+ }
407
+
408
+ static VALUE fragment_continue(VALUE parse_args);
409
+
410
+ /*
411
+ * @!visibility protected
412
+ */
413
+ static VALUE
414
+ fragment(
415
+ VALUE self,
416
+ VALUE doc_fragment,
417
+ VALUE tags,
418
+ VALUE ctx,
419
+ VALUE max_attributes,
420
+ VALUE max_errors,
421
+ VALUE max_depth
422
+ )
423
+ {
424
+ ID name = rb_intern_const("name");
425
+ const char *ctx_tag;
426
+ GumboNamespaceEnum ctx_ns;
427
+ GumboQuirksModeEnum quirks_mode;
428
+ bool form = false;
429
+ const char *encoding = NULL;
430
+
431
+ if (NIL_P(ctx)) {
432
+ ctx_tag = "body";
433
+ ctx_ns = GUMBO_NAMESPACE_HTML;
434
+ } else if (TYPE(ctx) == T_STRING) {
435
+ ctx_tag = StringValueCStr(ctx);
436
+ ctx_ns = GUMBO_NAMESPACE_HTML;
437
+ size_t len = RSTRING_LEN(ctx);
438
+ const char *colon = memchr(ctx_tag, ':', len);
439
+ if (colon) {
440
+ switch (colon - ctx_tag) {
441
+ case 3:
442
+ if (st_strncasecmp(ctx_tag, "svg", 3) != 0) {
443
+ goto error;
444
+ }
445
+ ctx_ns = GUMBO_NAMESPACE_SVG;
446
+ break;
447
+ case 4:
448
+ if (st_strncasecmp(ctx_tag, "html", 4) == 0) {
449
+ ctx_ns = GUMBO_NAMESPACE_HTML;
450
+ } else if (st_strncasecmp(ctx_tag, "math", 4) == 0) {
451
+ ctx_ns = GUMBO_NAMESPACE_MATHML;
452
+ } else {
453
+ goto error;
454
+ }
455
+ break;
456
+ default:
457
+ error:
458
+ rb_raise(rb_eArgError, "Invalid context namespace '%*s'", (int)(colon - ctx_tag), ctx_tag);
459
+ }
460
+ ctx_tag = colon + 1;
461
+ } else {
462
+ // For convenience, put 'svg' and 'math' in their namespaces.
463
+ if (len == 3 && st_strncasecmp(ctx_tag, "svg", 3) == 0) {
464
+ ctx_ns = GUMBO_NAMESPACE_SVG;
465
+ } else if (len == 4 && st_strncasecmp(ctx_tag, "math", 4) == 0) {
466
+ ctx_ns = GUMBO_NAMESPACE_MATHML;
467
+ }
468
+ }
469
+
470
+ // Check if it's a form.
471
+ form = ctx_ns == GUMBO_NAMESPACE_HTML && st_strcasecmp(ctx_tag, "form") == 0;
472
+ } else {
473
+ ID element_ = rb_intern_const("element?");
474
+
475
+ // Context fragment name.
476
+ VALUE tag_name = rb_funcall(ctx, name, 0);
477
+ assert(RTEST(tag_name));
478
+ Check_Type(tag_name, T_STRING);
479
+ ctx_tag = StringValueCStr(tag_name);
480
+
481
+ // Context fragment namespace.
482
+ ctx_ns = lookup_namespace(ctx, true);
483
+
484
+ // Check for a form ancestor, including self.
485
+ for (VALUE node = ctx;
486
+ !NIL_P(node);
487
+ node = rb_respond_to(node, parent) ? rb_funcall(node, parent, 0) : Qnil) {
488
+ if (!RTEST(rb_funcall(node, element_, 0))) {
489
+ continue;
490
+ }
491
+ VALUE element_name = rb_funcall(node, name, 0);
492
+ if (RSTRING_LEN(element_name) == 4
493
+ && !st_strcasecmp(RSTRING_PTR(element_name), "form")
494
+ && lookup_namespace(node, false) == GUMBO_NAMESPACE_HTML) {
495
+ form = true;
496
+ break;
497
+ }
498
+ }
499
+
500
+ // Encoding.
501
+ if (RSTRING_LEN(tag_name) == 14
502
+ && !st_strcasecmp(ctx_tag, "annotation-xml")) {
503
+ VALUE enc = rb_funcall(ctx, rb_intern_const("[]"),
504
+ rb_utf8_str_new_static("encoding", 8));
505
+ if (RTEST(enc)) {
506
+ Check_Type(enc, T_STRING);
507
+ encoding = StringValueCStr(enc);
508
+ }
509
+ }
510
+ }
511
+
512
+ // Quirks mode.
513
+ VALUE doc = rb_funcall(doc_fragment, rb_intern_const("document"), 0);
514
+ VALUE dtd = rb_funcall(doc, internal_subset, 0);
515
+ if (NIL_P(dtd)) {
516
+ quirks_mode = GUMBO_DOCTYPE_NO_QUIRKS;
517
+ } else {
518
+ VALUE dtd_name = rb_funcall(dtd, name, 0);
519
+ VALUE pubid = rb_funcall(dtd, rb_intern_const("external_id"), 0);
520
+ VALUE sysid = rb_funcall(dtd, rb_intern_const("system_id"), 0);
521
+ quirks_mode = gumbo_compute_quirks_mode(
522
+ NIL_P(dtd_name) ? NULL : StringValueCStr(dtd_name),
523
+ NIL_P(pubid) ? NULL : StringValueCStr(pubid),
524
+ NIL_P(sysid) ? NULL : StringValueCStr(sysid)
525
+ );
526
+ }
527
+
528
+ // Perform a fragment parse.
529
+ int depth = NUM2INT(max_depth);
530
+ GumboOptions options = kGumboDefaultOptions;
531
+ options.max_attributes = NUM2INT(max_attributes);
532
+ options.max_errors = NUM2INT(max_errors);
533
+ // Add one to account for the HTML element.
534
+ options.max_tree_depth = depth < 0 ? -1 : (depth + 1);
535
+ options.fragment_context = ctx_tag;
536
+ options.fragment_namespace = ctx_ns;
537
+ options.fragment_encoding = encoding;
538
+ options.quirks_mode = quirks_mode;
539
+ options.fragment_context_has_form_ancestor = form;
540
+
541
+ GumboOutput *output = perform_parse(&options, tags);
542
+ ParseArgs args = {
543
+ .output = output,
544
+ .input = tags,
545
+ .url_or_frag = doc_fragment,
546
+ .doc = (xmlDocPtr)extract_xml_node(doc),
547
+ };
548
+ rb_ensure(fragment_continue, (VALUE)(&args), parse_cleanup, (VALUE)(&args));
549
+ return Qnil;
550
+ }
551
+
552
+ static VALUE
553
+ fragment_continue(VALUE parse_args)
554
+ {
555
+ ParseArgs *args = (ParseArgs *)parse_args;
556
+ GumboOutput *output = args->output;
557
+ VALUE doc_fragment = args->url_or_frag;
558
+ xmlDocPtr xml_doc = args->doc;
559
+
560
+ args->doc = NULL; // The Ruby runtime owns doc so make sure we don't delete it.
561
+ xmlNodePtr xml_frag = extract_xml_node(doc_fragment);
562
+ build_tree(xml_doc, xml_frag, output->root);
563
+ add_errors(output, doc_fragment, args->input, rb_utf8_str_new_static("#fragment", 9));
564
+ return Qnil;
565
+ }
566
+
567
+ // Initialize the Nokogumbo class and fetch constants we will use later.
568
+ void
569
+ noko_init_gumbo()
570
+ {
571
+ // Class constants.
572
+ cNokogiriHtml5Document = rb_define_class_under(mNokogiriHtml5, "Document", cNokogiriHtml4Document);
573
+ rb_gc_register_mark_object(cNokogiriHtml5Document);
574
+
575
+ // Interned symbols.
576
+ internal_subset = rb_intern_const("internal_subset");
577
+ parent = rb_intern_const("parent");
578
+
579
+ // Define Nokogumbo module with parse and fragment methods.
580
+ rb_define_singleton_method(mNokogiriGumbo, "parse", parse, 5);
581
+ rb_define_singleton_method(mNokogiriGumbo, "fragment", fragment, 6);
582
+ }
583
+
584
+ // vim: set shiftwidth=2 softtabstop=2 tabstop=8 expandtab: