nokogiri 1.6.0 → 1.13.2

Sign up to get free protection for your applications and to get access to all the features.

Potentially problematic release.


This version of nokogiri might be problematic. Click here for more details.

Files changed (340)
  1. checksums.yaml +7 -0
  2. data/Gemfile +3 -19
  3. data/LICENSE-DEPENDENCIES.md +1903 -0
  4. data/LICENSE.md +9 -0
  5. data/README.md +280 -0
  6. data/bin/nokogiri +84 -31
  7. data/dependencies.yml +23 -4
  8. data/ext/nokogiri/depend +38 -358
  9. data/ext/nokogiri/extconf.rb +952 -132
  10. data/ext/nokogiri/gumbo.c +584 -0
  11. data/ext/nokogiri/html4_document.c +166 -0
  12. data/ext/nokogiri/html4_element_description.c +294 -0
  13. data/ext/nokogiri/html4_entity_lookup.c +37 -0
  14. data/ext/nokogiri/html4_sax_parser_context.c +120 -0
  15. data/ext/nokogiri/html4_sax_push_parser.c +95 -0
  16. data/ext/nokogiri/libxml2_backwards_compat.c +121 -0
  17. data/ext/nokogiri/nokogiri.c +231 -96
  18. data/ext/nokogiri/nokogiri.h +188 -129
  19. data/ext/nokogiri/test_global_handlers.c +40 -0
  20. data/ext/nokogiri/xml_attr.c +49 -40
  21. data/ext/nokogiri/xml_attribute_decl.c +18 -18
  22. data/ext/nokogiri/xml_cdata.c +24 -23
  23. data/ext/nokogiri/xml_comment.c +29 -21
  24. data/ext/nokogiri/xml_document.c +327 -223
  25. data/ext/nokogiri/xml_document_fragment.c +12 -16
  26. data/ext/nokogiri/xml_dtd.c +56 -50
  27. data/ext/nokogiri/xml_element_content.c +31 -26
  28. data/ext/nokogiri/xml_element_decl.c +22 -22
  29. data/ext/nokogiri/xml_encoding_handler.c +45 -20
  30. data/ext/nokogiri/xml_entity_decl.c +32 -30
  31. data/ext/nokogiri/xml_entity_reference.c +16 -18
  32. data/ext/nokogiri/xml_namespace.c +74 -32
  33. data/ext/nokogiri/xml_node.c +1290 -680
  34. data/ext/nokogiri/xml_node_set.c +239 -208
  35. data/ext/nokogiri/xml_processing_instruction.c +17 -19
  36. data/ext/nokogiri/xml_reader.c +227 -189
  37. data/ext/nokogiri/xml_relax_ng.c +52 -28
  38. data/ext/nokogiri/xml_sax_parser.c +123 -125
  39. data/ext/nokogiri/xml_sax_parser_context.c +138 -79
  40. data/ext/nokogiri/xml_sax_push_parser.c +88 -35
  41. data/ext/nokogiri/xml_schema.c +112 -33
  42. data/ext/nokogiri/xml_syntax_error.c +50 -23
  43. data/ext/nokogiri/xml_text.c +14 -18
  44. data/ext/nokogiri/xml_xpath_context.c +227 -140
  45. data/ext/nokogiri/xslt_stylesheet.c +269 -177
  46. data/gumbo-parser/CHANGES.md +63 -0
  47. data/gumbo-parser/Makefile +101 -0
  48. data/gumbo-parser/THANKS +27 -0
  49. data/gumbo-parser/src/Makefile +34 -0
  50. data/gumbo-parser/src/README.md +41 -0
  51. data/gumbo-parser/src/ascii.c +75 -0
  52. data/gumbo-parser/src/ascii.h +115 -0
  53. data/gumbo-parser/src/attribute.c +42 -0
  54. data/gumbo-parser/src/attribute.h +17 -0
  55. data/gumbo-parser/src/char_ref.c +22225 -0
  56. data/gumbo-parser/src/char_ref.h +29 -0
  57. data/gumbo-parser/src/char_ref.rl +2154 -0
  58. data/gumbo-parser/src/error.c +626 -0
  59. data/gumbo-parser/src/error.h +148 -0
  60. data/gumbo-parser/src/foreign_attrs.c +104 -0
  61. data/gumbo-parser/src/foreign_attrs.gperf +27 -0
  62. data/gumbo-parser/src/gumbo.h +943 -0
  63. data/gumbo-parser/src/insertion_mode.h +33 -0
  64. data/gumbo-parser/src/macros.h +91 -0
  65. data/gumbo-parser/src/parser.c +4875 -0
  66. data/gumbo-parser/src/parser.h +41 -0
  67. data/gumbo-parser/src/replacement.h +33 -0
  68. data/gumbo-parser/src/string_buffer.c +103 -0
  69. data/gumbo-parser/src/string_buffer.h +68 -0
  70. data/gumbo-parser/src/string_piece.c +48 -0
  71. data/gumbo-parser/src/svg_attrs.c +174 -0
  72. data/gumbo-parser/src/svg_attrs.gperf +77 -0
  73. data/gumbo-parser/src/svg_tags.c +137 -0
  74. data/gumbo-parser/src/svg_tags.gperf +55 -0
  75. data/gumbo-parser/src/tag.c +222 -0
  76. data/gumbo-parser/src/tag_lookup.c +382 -0
  77. data/gumbo-parser/src/tag_lookup.gperf +169 -0
  78. data/gumbo-parser/src/tag_lookup.h +13 -0
  79. data/gumbo-parser/src/token_buffer.c +79 -0
  80. data/gumbo-parser/src/token_buffer.h +71 -0
  81. data/gumbo-parser/src/token_type.h +17 -0
  82. data/gumbo-parser/src/tokenizer.c +3463 -0
  83. data/gumbo-parser/src/tokenizer.h +112 -0
  84. data/gumbo-parser/src/tokenizer_states.h +339 -0
  85. data/gumbo-parser/src/utf8.c +245 -0
  86. data/gumbo-parser/src/utf8.h +164 -0
  87. data/gumbo-parser/src/util.c +68 -0
  88. data/gumbo-parser/src/util.h +30 -0
  89. data/gumbo-parser/src/vector.c +111 -0
  90. data/gumbo-parser/src/vector.h +45 -0
  91. data/lib/nokogiri/class_resolver.rb +67 -0
  92. data/lib/nokogiri/css/node.rb +10 -58
  93. data/lib/nokogiri/css/parser.rb +407 -357
  94. data/lib/nokogiri/css/parser.y +265 -246
  95. data/lib/nokogiri/css/parser_extras.rb +52 -49
  96. data/lib/nokogiri/css/syntax_error.rb +3 -1
  97. data/lib/nokogiri/css/tokenizer.rb +107 -104
  98. data/lib/nokogiri/css/tokenizer.rex +8 -7
  99. data/lib/nokogiri/css/xpath_visitor.rb +266 -80
  100. data/lib/nokogiri/css.rb +50 -17
  101. data/lib/nokogiri/decorators/slop.rb +17 -8
  102. data/lib/nokogiri/extension.rb +31 -0
  103. data/lib/nokogiri/gumbo.rb +15 -0
  104. data/lib/nokogiri/html.rb +38 -27
  105. data/lib/nokogiri/{html → html4}/builder.rb +4 -2
  106. data/lib/nokogiri/html4/document.rb +331 -0
  107. data/lib/nokogiri/html4/document_fragment.rb +54 -0
  108. data/lib/nokogiri/{html → html4}/element_description.rb +3 -1
  109. data/lib/nokogiri/html4/element_description_defaults.rb +578 -0
  110. data/lib/nokogiri/{html → html4}/entity_lookup.rb +4 -2
  111. data/lib/nokogiri/{html → html4}/sax/parser.rb +24 -15
  112. data/lib/nokogiri/html4/sax/parser_context.rb +20 -0
  113. data/lib/nokogiri/html4/sax/push_parser.rb +37 -0
  114. data/lib/nokogiri/html4.rb +46 -0
  115. data/lib/nokogiri/html5/document.rb +88 -0
  116. data/lib/nokogiri/html5/document_fragment.rb +83 -0
  117. data/lib/nokogiri/html5/node.rb +96 -0
  118. data/lib/nokogiri/html5.rb +477 -0
  119. data/lib/nokogiri/jruby/dependencies.rb +21 -0
  120. data/lib/nokogiri/syntax_error.rb +2 -0
  121. data/lib/nokogiri/version/constant.rb +6 -0
  122. data/lib/nokogiri/version/info.rb +221 -0
  123. data/lib/nokogiri/version.rb +3 -105
  124. data/lib/nokogiri/xml/attr.rb +6 -3
  125. data/lib/nokogiri/xml/attribute_decl.rb +3 -1
  126. data/lib/nokogiri/xml/builder.rb +96 -54
  127. data/lib/nokogiri/xml/cdata.rb +3 -1
  128. data/lib/nokogiri/xml/character_data.rb +2 -0
  129. data/lib/nokogiri/xml/document.rb +234 -95
  130. data/lib/nokogiri/xml/document_fragment.rb +86 -36
  131. data/lib/nokogiri/xml/dtd.rb +16 -4
  132. data/lib/nokogiri/xml/element_content.rb +2 -0
  133. data/lib/nokogiri/xml/element_decl.rb +3 -1
  134. data/lib/nokogiri/xml/entity_decl.rb +4 -2
  135. data/lib/nokogiri/xml/entity_reference.rb +20 -0
  136. data/lib/nokogiri/xml/namespace.rb +3 -0
  137. data/lib/nokogiri/xml/node/save_options.rb +8 -4
  138. data/lib/nokogiri/xml/node.rb +947 -502
  139. data/lib/nokogiri/xml/node_set.rb +168 -159
  140. data/lib/nokogiri/xml/notation.rb +13 -0
  141. data/lib/nokogiri/xml/parse_options.rb +40 -5
  142. data/lib/nokogiri/xml/pp/character_data.rb +9 -6
  143. data/lib/nokogiri/xml/pp/node.rb +25 -26
  144. data/lib/nokogiri/xml/pp.rb +4 -2
  145. data/lib/nokogiri/xml/processing_instruction.rb +3 -1
  146. data/lib/nokogiri/xml/reader.rb +23 -28
  147. data/lib/nokogiri/xml/relax_ng.rb +8 -2
  148. data/lib/nokogiri/xml/sax/document.rb +45 -49
  149. data/lib/nokogiri/xml/sax/parser.rb +43 -41
  150. data/lib/nokogiri/xml/sax/parser_context.rb +8 -3
  151. data/lib/nokogiri/xml/sax/push_parser.rb +6 -5
  152. data/lib/nokogiri/xml/sax.rb +6 -4
  153. data/lib/nokogiri/xml/schema.rb +19 -9
  154. data/lib/nokogiri/xml/searchable.rb +270 -0
  155. data/lib/nokogiri/xml/syntax_error.rb +25 -1
  156. data/lib/nokogiri/xml/text.rb +2 -0
  157. data/lib/nokogiri/xml/xpath/syntax_error.rb +4 -2
  158. data/lib/nokogiri/xml/xpath.rb +15 -4
  159. data/lib/nokogiri/xml/xpath_context.rb +3 -3
  160. data/lib/nokogiri/xml.rb +38 -36
  161. data/lib/nokogiri/xslt/stylesheet.rb +3 -1
  162. data/lib/nokogiri/xslt.rb +29 -20
  163. data/lib/nokogiri.rb +69 -69
  164. data/lib/xsd/xmlparser/nokogiri.rb +26 -24
  165. data/patches/libxml2/0001-Remove-script-macro-support.patch +40 -0
  166. data/patches/libxml2/0002-Update-entities-to-remove-handling-of-ssi.patch +44 -0
  167. data/patches/libxml2/0003-libxml2.la-is-in-top_builddir.patch +25 -0
  168. data/patches/libxml2/0004-use-glibc-strlen.patch +53 -0
  169. data/patches/libxml2/0005-avoid-isnan-isinf.patch +81 -0
  170. data/patches/libxml2/0006-update-automake-files-for-arm64.patch +3040 -0
  171. data/patches/libxml2/0008-htmlParseComment-handle-abruptly-closed-comments.patch +61 -0
  172. data/patches/libxml2/0009-allow-wildcard-namespaces.patch +77 -0
  173. data/patches/libxslt/0001-update-automake-files-for-arm64.patch +3037 -0
  174. data/ports/archives/libxml2-2.9.13.tar.xz +0 -0
  175. data/ports/archives/libxslt-1.1.35.tar.xz +0 -0
  176. metadata +278 -362
  177. data/.autotest +0 -26
  178. data/.gemtest +0 -0
  179. data/.travis.yml +0 -27
  180. data/CHANGELOG.ja.rdoc +0 -819
  181. data/CHANGELOG.rdoc +0 -819
  182. data/C_CODING_STYLE.rdoc +0 -33
  183. data/Manifest.txt +0 -315
  184. data/README.ja.rdoc +0 -106
  185. data/README.rdoc +0 -175
  186. data/ROADMAP.md +0 -90
  187. data/Rakefile +0 -246
  188. data/STANDARD_RESPONSES.md +0 -47
  189. data/Y_U_NO_GEMSPEC.md +0 -155
  190. data/build_all +0 -105
  191. data/ext/nokogiri/html_document.c +0 -170
  192. data/ext/nokogiri/html_document.h +0 -10
  193. data/ext/nokogiri/html_element_description.c +0 -279
  194. data/ext/nokogiri/html_element_description.h +0 -10
  195. data/ext/nokogiri/html_entity_lookup.c +0 -32
  196. data/ext/nokogiri/html_entity_lookup.h +0 -8
  197. data/ext/nokogiri/html_sax_parser_context.c +0 -116
  198. data/ext/nokogiri/html_sax_parser_context.h +0 -11
  199. data/ext/nokogiri/html_sax_push_parser.c +0 -87
  200. data/ext/nokogiri/html_sax_push_parser.h +0 -9
  201. data/ext/nokogiri/xml_attr.h +0 -9
  202. data/ext/nokogiri/xml_attribute_decl.h +0 -9
  203. data/ext/nokogiri/xml_cdata.h +0 -9
  204. data/ext/nokogiri/xml_comment.h +0 -9
  205. data/ext/nokogiri/xml_document.h +0 -23
  206. data/ext/nokogiri/xml_document_fragment.h +0 -10
  207. data/ext/nokogiri/xml_dtd.h +0 -10
  208. data/ext/nokogiri/xml_element_content.h +0 -10
  209. data/ext/nokogiri/xml_element_decl.h +0 -9
  210. data/ext/nokogiri/xml_encoding_handler.h +0 -8
  211. data/ext/nokogiri/xml_entity_decl.h +0 -10
  212. data/ext/nokogiri/xml_entity_reference.h +0 -9
  213. data/ext/nokogiri/xml_io.c +0 -56
  214. data/ext/nokogiri/xml_io.h +0 -11
  215. data/ext/nokogiri/xml_libxml2_hacks.c +0 -112
  216. data/ext/nokogiri/xml_libxml2_hacks.h +0 -12
  217. data/ext/nokogiri/xml_namespace.h +0 -13
  218. data/ext/nokogiri/xml_node.h +0 -13
  219. data/ext/nokogiri/xml_node_set.h +0 -14
  220. data/ext/nokogiri/xml_processing_instruction.h +0 -9
  221. data/ext/nokogiri/xml_reader.h +0 -10
  222. data/ext/nokogiri/xml_relax_ng.h +0 -9
  223. data/ext/nokogiri/xml_sax_parser.h +0 -39
  224. data/ext/nokogiri/xml_sax_parser_context.h +0 -10
  225. data/ext/nokogiri/xml_sax_push_parser.h +0 -9
  226. data/ext/nokogiri/xml_schema.h +0 -9
  227. data/ext/nokogiri/xml_syntax_error.h +0 -13
  228. data/ext/nokogiri/xml_text.h +0 -9
  229. data/ext/nokogiri/xml_xpath_context.h +0 -10
  230. data/ext/nokogiri/xslt_stylesheet.h +0 -14
  231. data/lib/nokogiri/html/document.rb +0 -254
  232. data/lib/nokogiri/html/document_fragment.rb +0 -41
  233. data/lib/nokogiri/html/element_description_defaults.rb +0 -671
  234. data/lib/nokogiri/html/sax/parser_context.rb +0 -16
  235. data/lib/nokogiri/html/sax/push_parser.rb +0 -16
  236. data/ports/archives/libxml2-2.8.0.tar.gz +0 -0
  237. data/ports/archives/libxslt-1.1.26.tar.gz +0 -0
  238. data/tasks/cross_compile.rb +0 -132
  239. data/tasks/nokogiri.org.rb +0 -24
  240. data/tasks/test.rb +0 -95
  241. data/test/css/test_nthiness.rb +0 -159
  242. data/test/css/test_parser.rb +0 -341
  243. data/test/css/test_tokenizer.rb +0 -198
  244. data/test/css/test_xpath_visitor.rb +0 -91
  245. data/test/decorators/test_slop.rb +0 -16
  246. data/test/files/2ch.html +0 -108
  247. data/test/files/address_book.rlx +0 -12
  248. data/test/files/address_book.xml +0 -10
  249. data/test/files/bar/bar.xsd +0 -4
  250. data/test/files/bogus.xml +0 -0
  251. data/test/files/dont_hurt_em_why.xml +0 -422
  252. data/test/files/encoding.html +0 -82
  253. data/test/files/encoding.xhtml +0 -84
  254. data/test/files/exslt.xml +0 -8
  255. data/test/files/exslt.xslt +0 -35
  256. data/test/files/foo/foo.xsd +0 -4
  257. data/test/files/metacharset.html +0 -10
  258. data/test/files/noencoding.html +0 -47
  259. data/test/files/po.xml +0 -32
  260. data/test/files/po.xsd +0 -66
  261. data/test/files/saml/saml20assertion_schema.xsd +0 -283
  262. data/test/files/saml/saml20protocol_schema.xsd +0 -302
  263. data/test/files/saml/xenc_schema.xsd +0 -146
  264. data/test/files/saml/xmldsig_schema.xsd +0 -318
  265. data/test/files/shift_jis.html +0 -10
  266. data/test/files/shift_jis.xml +0 -5
  267. data/test/files/snuggles.xml +0 -3
  268. data/test/files/staff.dtd +0 -10
  269. data/test/files/staff.xml +0 -59
  270. data/test/files/staff.xslt +0 -32
  271. data/test/files/test_document_url/bar.xml +0 -2
  272. data/test/files/test_document_url/document.dtd +0 -4
  273. data/test/files/test_document_url/document.xml +0 -6
  274. data/test/files/tlm.html +0 -850
  275. data/test/files/to_be_xincluded.xml +0 -2
  276. data/test/files/valid_bar.xml +0 -2
  277. data/test/files/xinclude.xml +0 -4
  278. data/test/helper.rb +0 -154
  279. data/test/html/sax/test_parser.rb +0 -141
  280. data/test/html/sax/test_parser_context.rb +0 -46
  281. data/test/html/test_builder.rb +0 -164
  282. data/test/html/test_document.rb +0 -552
  283. data/test/html/test_document_encoding.rb +0 -138
  284. data/test/html/test_document_fragment.rb +0 -261
  285. data/test/html/test_element_description.rb +0 -105
  286. data/test/html/test_named_characters.rb +0 -14
  287. data/test/html/test_node.rb +0 -196
  288. data/test/html/test_node_encoding.rb +0 -27
  289. data/test/namespaces/test_additional_namespaces_in_builder_doc.rb +0 -14
  290. data/test/namespaces/test_namespaces_in_builder_doc.rb +0 -75
  291. data/test/namespaces/test_namespaces_in_created_doc.rb +0 -75
  292. data/test/namespaces/test_namespaces_in_parsed_doc.rb +0 -66
  293. data/test/test_convert_xpath.rb +0 -135
  294. data/test/test_css_cache.rb +0 -45
  295. data/test/test_encoding_handler.rb +0 -46
  296. data/test/test_memory_leak.rb +0 -156
  297. data/test/test_nokogiri.rb +0 -132
  298. data/test/test_reader.rb +0 -555
  299. data/test/test_soap4r_sax.rb +0 -52
  300. data/test/test_xslt_transforms.rb +0 -254
  301. data/test/xml/node/test_save_options.rb +0 -28
  302. data/test/xml/node/test_subclass.rb +0 -44
  303. data/test/xml/sax/test_parser.rb +0 -366
  304. data/test/xml/sax/test_parser_context.rb +0 -106
  305. data/test/xml/sax/test_push_parser.rb +0 -157
  306. data/test/xml/test_attr.rb +0 -64
  307. data/test/xml/test_attribute_decl.rb +0 -86
  308. data/test/xml/test_builder.rb +0 -306
  309. data/test/xml/test_c14n.rb +0 -151
  310. data/test/xml/test_cdata.rb +0 -48
  311. data/test/xml/test_comment.rb +0 -29
  312. data/test/xml/test_document.rb +0 -828
  313. data/test/xml/test_document_encoding.rb +0 -28
  314. data/test/xml/test_document_fragment.rb +0 -223
  315. data/test/xml/test_dtd.rb +0 -103
  316. data/test/xml/test_dtd_encoding.rb +0 -33
  317. data/test/xml/test_element_content.rb +0 -56
  318. data/test/xml/test_element_decl.rb +0 -73
  319. data/test/xml/test_entity_decl.rb +0 -122
  320. data/test/xml/test_entity_reference.rb +0 -245
  321. data/test/xml/test_namespace.rb +0 -95
  322. data/test/xml/test_node.rb +0 -1137
  323. data/test/xml/test_node_attributes.rb +0 -96
  324. data/test/xml/test_node_encoding.rb +0 -107
  325. data/test/xml/test_node_inheritance.rb +0 -32
  326. data/test/xml/test_node_reparenting.rb +0 -374
  327. data/test/xml/test_node_set.rb +0 -755
  328. data/test/xml/test_parse_options.rb +0 -64
  329. data/test/xml/test_processing_instruction.rb +0 -30
  330. data/test/xml/test_reader_encoding.rb +0 -142
  331. data/test/xml/test_relax_ng.rb +0 -60
  332. data/test/xml/test_schema.rb +0 -103
  333. data/test/xml/test_syntax_error.rb +0 -12
  334. data/test/xml/test_text.rb +0 -45
  335. data/test/xml/test_unparented_node.rb +0 -422
  336. data/test/xml/test_xinclude.rb +0 -83
  337. data/test/xml/test_xpath.rb +0 -295
  338. data/test/xslt/test_custom_functions.rb +0 -133
  339. data/test/xslt/test_exception_handling.rb +0 -37
  340. data/test_all +0 -81
data/lib/nokogiri.rb CHANGED
@@ -1,91 +1,67 @@
1
- # -*- coding: utf-8 -*-
2
- # Modify the PATH on windows so that the external DLLs will get loaded.
1
+ # coding: utf-8
2
+ # frozen_string_literal: true
3
3
 
4
- require 'rbconfig'
5
- ENV['PATH'] = [File.expand_path(
6
- File.join(File.dirname(__FILE__), "..", "ext", "nokogiri")
7
- ), ENV['PATH']].compact.join(';') if RbConfig::CONFIG['host_os'] =~ /(mswin|mingw)/i
4
+ require "rbconfig"
8
5
 
9
6
  if defined?(RUBY_ENGINE) && RUBY_ENGINE == "jruby"
10
- # The line below caused a problem on non-GAE rack environment.
11
- # unless defined?(JRuby::Rack::VERSION) || defined?(AppEngine::ApiProxy)
12
- #
13
- # However, simply cutting defined?(JRuby::Rack::VERSION) off resulted in
14
- # an unable-to-load-nokogiri problem. Thus, now, Nokogiri checks the presense
15
- # of appengine-rack.jar in $LOAD_PATH. If Nokogiri is on GAE, Nokogiri
16
- # should skip loading xml jars. This is because those are in WEB-INF/lib and
17
- # already set in the classpath.
18
- unless $LOAD_PATH.to_s.include?("appengine-rack")
19
- require 'stringio'
20
- require 'isorelax.jar'
21
- require 'jing.jar'
22
- require 'nekohtml.jar'
23
- require 'nekodtd.jar'
24
- require 'xercesImpl.jar'
25
- end
7
+ require_relative "nokogiri/jruby/dependencies"
26
8
  end
27
9
 
28
- require 'nokogiri/nokogiri'
29
- require 'nokogiri/version'
30
- require 'nokogiri/syntax_error'
31
- require 'nokogiri/xml'
32
- require 'nokogiri/xslt'
33
- require 'nokogiri/html'
34
- require 'nokogiri/decorators/slop'
35
- require 'nokogiri/css'
36
- require 'nokogiri/html/builder'
10
+ require_relative "nokogiri/extension"
37
11
 
38
12
  # Nokogiri parses and searches XML/HTML very quickly, and also has
39
- # correctly implemented CSS3 selector support as well as XPath support.
13
+ # correctly implemented CSS3 selector support as well as XPath 1.0
14
+ # support.
40
15
  #
41
16
  # Parsing a document returns either a Nokogiri::XML::Document, or a
42
- # Nokogiri::HTML::Document depending on the kind of document you parse.
17
+ # Nokogiri::HTML4::Document depending on the kind of document you parse.
43
18
  #
44
19
  # Here is an example:
45
20
  #
46
- # require 'nokogiri'
47
- # require 'open-uri'
21
+ # require 'nokogiri'
22
+ # require 'open-uri'
23
+ #
24
+ # # Get a Nokogiri::HTML4::Document for the page we’re interested in...
48
25
  #
49
- # # Get a Nokogiri::HTML:Document for the page we’re interested in...
26
+ # doc = Nokogiri::HTML4(URI.open('http://www.google.com/search?q=tenderlove'))
50
27
  #
51
- # doc = Nokogiri::HTML(open('http://www.google.com/search?q=tenderlove'))
28
+ # # Do funky things with it using Nokogiri::XML::Node methods...
52
29
  #
53
- # # Do funky things with it using Nokogiri::XML::Node methods...
30
+ # ####
31
+ # # Search for nodes by css
32
+ # doc.css('h3.r a.l').each do |link|
33
+ # puts link.content
34
+ # end
54
35
  #
55
- # ####
56
- # # Search for nodes by css
57
- # doc.css('h3.r a.l').each do |link|
58
- # puts link.content
59
- # end
36
+ # See also:
60
37
  #
61
- # See Nokogiri::XML::Node#css for more information about CSS searching.
62
- # See Nokogiri::XML::Node#xpath for more information about XPath searching.
38
+ # - Nokogiri::XML::Searchable#css for more information about CSS searching
39
+ # - Nokogiri::XML::Searchable#xpath for more information about XPath searching
63
40
  module Nokogiri
64
41
  class << self
65
42
  ###
66
43
  # Parse an HTML or XML document. +string+ contains the document.
67
- def parse string, url = nil, encoding = nil, options = nil
68
- doc =
69
- if string.respond_to?(:read) ||
70
- string =~ /^\s*<[^Hh>]*html/i # Probably html
71
- Nokogiri.HTML(
72
- string,
73
- url,
74
- encoding, options || XML::ParseOptions::DEFAULT_HTML
75
- )
76
- else
77
- Nokogiri.XML(string, url, encoding,
78
- options || XML::ParseOptions::DEFAULT_XML)
79
- end
80
- yield doc if block_given?
81
- doc
44
+ def parse(string, url = nil, encoding = nil, options = nil)
45
+ if string.respond_to?(:read) ||
46
+ /^\s*<(?:!DOCTYPE\s+)?html[\s>]/i.match?(string[0, 512])
47
+ # Expect an HTML indicator to appear within the first 512
48
+ # characters of a document. (<?xml ?> + <?xml-stylesheet ?>
49
+ # shouldn't be that long)
50
+ Nokogiri.HTML4(string, url, encoding,
51
+ options || XML::ParseOptions::DEFAULT_HTML)
52
+ else
53
+ Nokogiri.XML(string, url, encoding,
54
+ options || XML::ParseOptions::DEFAULT_XML)
55
+ end.tap do |doc|
56
+ yield doc if block_given?
57
+ end
82
58
  end
83
59
 
84
60
  ###
85
61
  # Create a new Nokogiri::XML::DocumentFragment
86
- def make input = nil, opts = {}, &blk
62
+ def make(input = nil, opts = {}, &blk)
87
63
  if input
88
- Nokogiri::HTML.fragment(input).children.first
64
+ Nokogiri::HTML4.fragment(input).children.first
89
65
  else
90
66
  Nokogiri(&blk)
91
67
  end
@@ -109,20 +85,44 @@ module Nokogiri
109
85
  def Slop(*args, &block)
110
86
  Nokogiri(*args, &block).slop!
111
87
  end
88
+
89
+ # :nodoc:
90
+ def install_default_aliases
91
+ # Make sure to support some popular encoding aliases not known by
92
+ # all iconv implementations.
93
+ {
94
+ "Windows-31J" => "CP932", # Windows-31J is the IANA registered name of CP932.
95
+ }.each do |alias_name, name|
96
+ EncodingHandler.alias(name, alias_name) if EncodingHandler[alias_name].nil?
97
+ end
98
+ end
112
99
  end
100
+
101
+ Nokogiri.install_default_aliases
113
102
  end
114
103
 
115
104
  ###
116
- # Parser a document contained in +args+. Nokogiri will try to guess what
117
- # type of document you are attempting to parse. For more information, see
118
- # Nokogiri.parse
105
+ # Parse a document contained in +args+. Nokogiri will try to guess what type of document you are
106
+ # attempting to parse. For more information, see Nokogiri.parse
119
107
  #
120
- # To specify the type of document, use Nokogiri.XML or Nokogiri.HTML.
108
+ # To specify the type of document, use {Nokogiri.XML}, {Nokogiri.HTML4}, or {Nokogiri.HTML5}.
121
109
  def Nokogiri(*args, &block)
122
- if block_given?
123
- builder = Nokogiri::HTML::Builder.new(&block)
124
- return builder.doc.root
110
+ if block
111
+ Nokogiri::HTML4::Builder.new(&block).doc.root
125
112
  else
126
113
  Nokogiri.parse(*args)
127
114
  end
128
115
  end
116
+
117
+ require_relative "nokogiri/version"
118
+ require_relative "nokogiri/class_resolver"
119
+ require_relative "nokogiri/syntax_error"
120
+ require_relative "nokogiri/xml"
121
+ require_relative "nokogiri/xslt"
122
+ require_relative "nokogiri/html4"
123
+ require_relative "nokogiri/html"
124
+ require_relative "nokogiri/decorators/slop"
125
+ require_relative "nokogiri/css"
126
+ require_relative "nokogiri/html4/builder"
127
+
128
+ require_relative "nokogiri/html5" if Nokogiri.uses_gumbo?
data/lib/xsd/xmlparser/nokogiri.rb CHANGED
@@ -1,7 +1,9 @@
1
- require 'nokogiri'
1
+ # frozen_string_literal: true
2
2
 
3
- module XSD # :nodoc:
4
- module XMLParser # :nodoc:
3
+ require "nokogiri"
4
+
5
+ module XSD
6
+ module XMLParser
5
7
  ###
6
8
  # Nokogiri XML parser for soap4r.
7
9
  #
@@ -26,40 +28,40 @@ module XSD # :nodoc:
26
28
  class Nokogiri < XSD::XMLParser::Parser
27
29
  ###
28
30
  # Create a new XSD parser with +host+ and +opt+
29
- def initialize host, opt = {}
31
+ def initialize(host, opt = {})
30
32
  super
31
- @parser = ::Nokogiri::XML::SAX::Parser.new(self, @charset || 'UTF-8')
33
+ @parser = ::Nokogiri::XML::SAX::Parser.new(self, @charset || "UTF-8")
32
34
  end
33
35
 
34
36
  ###
35
37
  # Start parsing +string_or_readable+
36
- def do_parse string_or_readable
38
+ def do_parse(string_or_readable)
37
39
  @parser.parse(string_or_readable)
38
40
  end
39
41
 
40
42
  ###
41
43
  # Handle the start_element event with +name+ and +attrs+
42
- def start_element name, attrs = []
44
+ def start_element(name, attrs = [])
43
45
  super(name, Hash[*attrs.flatten])
44
46
  end
45
47
 
46
48
  ###
47
49
  # Handle the end_element event with +name+
48
- def end_element name
50
+ def end_element(name)
49
51
  super
50
52
  end
51
53
 
52
54
  ###
53
55
  # Handle errors with message +msg+
54
- def error msg
55
- raise ParseError.new(msg)
56
+ def error(msg)
57
+ raise ParseError, msg
56
58
  end
57
- alias :warning :error
59
+ alias_method :warning, :error
58
60
 
59
61
  ###
60
62
  # Handle cdata_blocks containing +string+
61
- def cdata_block string
62
- characters string
63
+ def cdata_block(string)
64
+ characters(string)
63
65
  end
64
66
 
65
67
  ###
@@ -69,16 +71,16 @@ module XSD # :nodoc:
69
71
  # +prefix+ is the namespace prefix for the element
70
72
  # +uri+ is the associated namespace URI
71
73
  # +ns+ is a hash of namespace prefix:urls associated with the element
72
- def start_element_namespace name, attrs = [], prefix = nil, uri = nil, ns = []
74
+ def start_element_namespace(name, attrs = [], prefix = nil, uri = nil, ns = [])
73
75
  ###
74
76
  # Deal with SAX v1 interface
75
- name = [prefix, name].compact.join(':')
76
- attributes = ns.map { |ns_prefix,ns_uri|
77
- [['xmlns', ns_prefix].compact.join(':'), ns_uri]
78
- } + attrs.map { |attr|
79
- [[attr.prefix, attr.localname].compact.join(':'), attr.value]
80
- }.flatten
81
- start_element name, attributes
77
+ name = [prefix, name].compact.join(":")
78
+ attributes = ns.map do |ns_prefix, ns_uri|
79
+ [["xmlns", ns_prefix].compact.join(":"), ns_uri]
80
+ end + attrs.map do |attr|
81
+ [[attr.prefix, attr.localname].compact.join(":"), attr.value]
82
+ end.flatten
83
+ start_element(name, attributes)
82
84
  end
83
85
 
84
86
  ###
@@ -86,13 +88,13 @@ module XSD # :nodoc:
86
88
  # +name+ is the element's name
87
89
  # +prefix+ is the namespace prefix associated with the element
88
90
  # +uri+ is the associated namespace URI
89
- def end_element_namespace name, prefix = nil, uri = nil
91
+ def end_element_namespace(name, prefix = nil, uri = nil)
90
92
  ###
91
93
  # Deal with SAX v1 interface
92
- end_element [prefix, name].compact.join(':')
94
+ end_element([prefix, name].compact.join(":"))
93
95
  end
94
96
 
95
- %w{ xmldecl start_document end_document comment }.each do |name|
97
+ ["xmldecl", "start_document", "end_document", "comment"].each do |name|
96
98
  class_eval %{ def #{name}(*args); end }
97
99
  end
98
100
 
data/patches/libxml2/0001-Remove-script-macro-support.patch ADDED
@@ -0,0 +1,40 @@
1
+ From 27e4aa8d885e47a296ea78d114dbbe8fc7aa3508 Mon Sep 17 00:00:00 2001
2
+ From: Kevin Solorio <soloriok@gmail.com>
3
+ Date: Fri, 1 Feb 2019 14:32:42 -0800
4
+ Subject: [PATCH] Revert-support-html-h-b-7-1
5
+
6
+ ---
7
+ entities.c | 17 -----------------
8
+ 1 file changed, 17 deletions(-)
9
+
10
+ diff --git a/entities.c b/entities.c
11
+ index 43549bc5..82652f6d 100644
12
+ --- a/entities.c
13
+ +++ b/entities.c
14
+ @@ -623,23 +623,6 @@ xmlEncodeEntitiesInternal(xmlDocPtr doc, const xmlChar *input, int attr) {
15
+ *out++ = 't';
16
+ *out++ = ';';
17
+ } else if (*cur == '&') {
18
+ - /*
19
+ - * Special handling of &{...} construct from HTML 4, see
20
+ - * http://www.w3.org/TR/html401/appendix/notes.html#h-B.7.1
21
+ - */
22
+ - if (html && attr && (cur[1] == '{') &&
23
+ - (strchr((const char *) cur, '}'))) {
24
+ - while (*cur != '}') {
25
+ - *out++ = *cur++;
26
+ - indx = out - buffer;
27
+ - if (indx + 100 > buffer_size) {
28
+ - growBufferReentrant();
29
+ - out = &buffer[indx];
30
+ - }
31
+ - }
32
+ - *out++ = *cur++;
33
+ - continue;
34
+ - }
35
+ *out++ = '&';
36
+ *out++ = 'a';
37
+ *out++ = 'm';
38
+ --
39
+ 2.16.2
40
+
data/patches/libxml2/0002-Update-entities-to-remove-handling-of-ssi.patch ADDED
@@ -0,0 +1,44 @@
1
+ From ffc08467744bd2305d41ca882c37fa30adf3a067 Mon Sep 17 00:00:00 2001
2
+ From: Kevin Solorio <soloriok@gmail.com>
3
+ Date: Wed, 27 Feb 2019 14:34:17 -0800
4
+ Subject: [PATCH 2/2] update entities.c to remove handling of ssi
5
+
6
+ ---
7
+ entities.c | 21 ---------------------
8
+ 1 file changed, 21 deletions(-)
9
+
10
+ diff --git a/entities.c b/entities.c
11
+ index 43549bc5..5c4a2a60 100644
12
+ --- a/entities.c
13
+ +++ b/entities.c
14
+ @@ -592,27 +592,6 @@ xmlEncodeEntitiesInternal(xmlDocPtr doc, const xmlChar *input, int attr) {
15
+ * By default one have to encode at least '<', '>', '"' and '&' !
16
+ */
17
+ if (*cur == '<') {
18
+ - const xmlChar *end;
19
+ -
20
+ - /*
21
+ - * Special handling of server side include in HTML attributes
22
+ - */
23
+ - if (html && attr &&
24
+ - (cur[1] == '!') && (cur[2] == '-') && (cur[3] == '-') &&
25
+ - ((end = xmlStrstr(cur, BAD_CAST "-->")) != NULL)) {
26
+ - while (cur != end) {
27
+ - *out++ = *cur++;
28
+ - indx = out - buffer;
29
+ - if (indx + 100 > buffer_size) {
30
+ - growBufferReentrant();
31
+ - out = &buffer[indx];
32
+ - }
33
+ - }
34
+ - *out++ = *cur++;
35
+ - *out++ = *cur++;
36
+ - *out++ = *cur++;
37
+ - continue;
38
+ - }
39
+ *out++ = '&';
40
+ *out++ = 'l';
41
+ *out++ = 't';
42
+ --
43
+ 2.16.2
44
+
data/patches/libxml2/0003-libxml2.la-is-in-top_builddir.patch ADDED
@@ -0,0 +1,25 @@
1
+ From 0b6ae484761fa01242fe8b67b54e3eb2d282d83d Mon Sep 17 00:00:00 2001
2
+ From: Mike Dalessio <mike.dalessio@gmail.com>
3
+ Date: Wed, 4 Dec 2019 08:43:51 -0500
4
+ Subject: [PATCH] fix libxml2.la's path
5
+
6
+ ---
7
+ Makefile.in | 2 +-
8
+ 1 file changed, 1 insertion(+), 1 deletion(-)
9
+
10
+ diff --git a/Makefile.in b/Makefile.in
11
+ index cf96d41..1372d8b 100644
12
+ --- a/Makefile.in
13
+ +++ b/Makefile.in
14
+ @@ -1057,7 +1057,7 @@ clean-noinstLTLIBRARIES:
15
+ rm -f $${locs}; \
16
+ }
17
+
18
+ -libxml2.la: $(libxml2_la_OBJECTS) $(libxml2_la_DEPENDENCIES) $(EXTRA_libxml2_la_DEPENDENCIES)
19
+ +$(top_builddir)/libxml2.la: $(libxml2_la_OBJECTS) $(libxml2_la_DEPENDENCIES) $(EXTRA_libxml2_la_DEPENDENCIES)
20
+ $(AM_V_CCLD)$(libxml2_la_LINK) -rpath $(libdir) $(libxml2_la_OBJECTS) $(libxml2_la_LIBADD) $(LIBS)
21
+
22
+ testdso.la: $(testdso_la_OBJECTS) $(testdso_la_DEPENDENCIES) $(EXTRA_testdso_la_DEPENDENCIES)
23
+ --
24
+ 2.17.1
25
+
data/patches/libxml2/0004-use-glibc-strlen.patch ADDED
@@ -0,0 +1,53 @@
1
+ From c94172d2a4451368530db2186190d70be8a1d9e5 Mon Sep 17 00:00:00 2001
2
+ From: Ilya Zub <ilya@serpapi.com>
3
+ Date: Wed, 23 Dec 2020 12:45:29 +0200
4
+ Subject: Use glibc strlen to speed up xmlStrlen
5
+ MIME-Version: 1.0
6
+ Content-Type: text/plain; charset=UTF-8
7
+ Content-Transfer-Encoding: 8bit
8
+
9
+ xmlStrlen (entire HTML file): 926171.936981 μs
10
+ glibc_xmlStrlen (entire HTML file): 36905.903992 μs
11
+ delta (xmlStrlen ÷ glibc_xmlStrlen): 25.094584 times
12
+
13
+ xmlStrlen (average string): 57479.204010 μs
14
+ glibc_xmlStrlen (average string): 5802.069000 μs
15
+ delta (xmlStrlen ÷ glibc_xmlStrlen): 9.905937 times
16
+
17
+ xmlStrlen (bigger string): 388056.315979 μs
18
+ glibc_xmlStrlen (bigger string): 12797.856995 μs
19
+ delta (xmlStrlen ÷ glibc_xmlStrlen): 30.318382 times
20
+
21
+ xmlStrlen (smallest string): 15870.046021 μs
22
+ glibc_xmlStrlen (smallest string): 6282.208984 μs
23
+ delta (xmlStrlen ÷ glibc_xmlStrlen): 2.527903 times
24
+
25
+ See https://gitlab.gnome.org/GNOME/libxml2/-/issues/212 for reference.
26
+ ---
27
+ xmlstring.c | 9 ++-------
28
+ 1 file changed, 2 insertions(+), 7 deletions(-)
29
+
30
+ diff --git a/xmlstring.c b/xmlstring.c
31
+ index e8a1e45d..df247dff 100644
32
+ --- a/xmlstring.c
33
+ +++ b/xmlstring.c
34
+ @@ -423,12 +423,7 @@ xmlStrsub(const xmlChar *str, int start, int len) {
35
+
36
+ int
37
+ xmlStrlen(const xmlChar *str) {
38
+ - size_t len = 0;
39
+ -
40
+ if (str == NULL) return(0);
41
+ - while (*str != 0) { /* non input consuming */
42
+ - str++;
43
+ - len++;
44
+ - }
45
+ - return(len > INT_MAX ? 0 : len);
46
+ +
47
+ + return strlen((const char*)str);
48
+ }
49
+
50
+ /**
51
+ --
52
+ 2.29.2
53
+
data/patches/libxml2/0005-avoid-isnan-isinf.patch ADDED
@@ -0,0 +1,81 @@
1
+ This patch is a result of rake-compiler-dock using centos 7 (manylinux2014) to cross-compile.
2
+
3
+ Centos, for reasons I have not been able to discern, implements `isnan` and `isinf` as a function
4
+ and not as a macro. Debian knows how to resolve that function at dynamic-link time (despite using a
5
+ macro at compile time), but musl-based systems (like alpine) do not. Running `nm` on nokogiri.so
6
+ created on such a centos system shows:
7
+
8
+ ```
9
+ U __isinf@@GLIBC_2.2.5
10
+ U __isnan@@GLIBC_2.2.5
11
+ ```
12
+
13
+ (see https://github.com/sparklemotion/nokogiri/pull/2142 for more info)
14
+
15
+ This patch avoids using glibc's `isnan` and `isinf` calls, instead using libxml2's fallback
16
+ implementation. There's history here, see libxml2 commit 8813f39:
17
+
18
+ commit 8813f39
19
+ Author: Nick Wellnhofer <wellnhofer@aevum.de>
20
+ Date: 2017-09-21 00:11:26 +0200
21
+
22
+ Simplify XPath NaN, inf and -0 handling
23
+
24
+ Use C99 macros NAN, INFINITY, isnan, isinf. If they're not available:
25
+
26
+ - Assume that (0.0 / 0.0) generates a NaN and !(x == x) tests for NaN.
27
+ - Use C89's HUGE_VAL for INFINITY.
28
+
29
+ Remove manual handling of NaN, infinity and negative zero in functions
30
+ xmlXPathValueFlipSign and xmlXPathDivValues.
31
+
32
+ Remove xmlXPathGetSign. All the tests for negative zero can be replaced
33
+ with a test for negative or positive zero.
34
+
35
+ Simplify xmlXPathRoundFunction.
36
+
37
+ Remove Trio dependency.
38
+
39
+ This should work on IEEE 754 compliant implementations even if the C99
40
+ macros aren't available, but will likely break some ancient platforms.
41
+ If problems arise, my plan is to port the relevant trionan.c solution
42
+ to xpath.c. Note that non-compliant implementations are impossible
43
+ to fully support, anyway, since XPath requires IEEE 754.
44
+
45
+ This patch would be unnecessary if any of the following was true:
46
+
47
+ * centos implements these as macros, and doesn't generate an unresolved symbol for either in the shared library
48
+ * we had a way to ensure `__isinf` and `__isnan` resolve on musl (e.g., we implement them locally)
49
+
50
+ diff --git a/xpath.c b/xpath.c
51
+ index 9f64ab9..5b6d999 100644
52
+ --- a/xpath.c
53
+ +++ b/xpath.c
54
+ @@ -515,11 +515,7 @@ xmlXPathInit(void) {
55
+ */
56
+ int
57
+ xmlXPathIsNaN(double val) {
58
+ -#ifdef isnan
59
+ - return isnan(val);
60
+ -#else
61
+ return !(val == val);
62
+ -#endif
63
+ }
64
+
65
+ /**
66
+ @@ -530,15 +530,11 @@ xmlXPathIsNaN(double val) {
67
+ */
68
+ int
69
+ xmlXPathIsInf(double val) {
70
+ -#ifdef isinf
71
+ - return isinf(val) ? (val > 0 ? 1 : -1) : 0;
72
+ -#else
73
+ if (val >= xmlXPathPINF)
74
+ return 1;
75
+ if (val <= -xmlXPathPINF)
76
+ return -1;
77
+ return 0;
78
+ -#endif
79
+ }
80
+
81
+ #endif /* SCHEMAS or XPATH */