nokogiri 1.5.10 → 1.12.5

Sign up to get free protection for your applications and to get access to all the features.

Potentially problematic release.


This version of nokogiri might be problematic. Click here for more details.

Files changed (328) hide show
  1. checksums.yaml +7 -0
  2. data/Gemfile +3 -0
  3. data/LICENSE-DEPENDENCIES.md +1903 -0
  4. data/LICENSE.md +9 -0
  5. data/README.md +278 -0
  6. data/bin/nokogiri +50 -10
  7. data/dependencies.yml +74 -0
  8. data/ext/nokogiri/depend +38 -358
  9. data/ext/nokogiri/extconf.rb +944 -100
  10. data/ext/nokogiri/gumbo.c +584 -0
  11. data/ext/nokogiri/html4_document.c +166 -0
  12. data/ext/nokogiri/html4_element_description.c +294 -0
  13. data/ext/nokogiri/html4_entity_lookup.c +37 -0
  14. data/ext/nokogiri/html4_sax_parser_context.c +120 -0
  15. data/ext/nokogiri/html4_sax_push_parser.c +95 -0
  16. data/ext/nokogiri/libxml2_backwards_compat.c +121 -0
  17. data/ext/nokogiri/nokogiri.c +232 -87
  18. data/ext/nokogiri/nokogiri.h +188 -129
  19. data/ext/nokogiri/test_global_handlers.c +40 -0
  20. data/ext/nokogiri/xml_attr.c +49 -40
  21. data/ext/nokogiri/xml_attribute_decl.c +18 -18
  22. data/ext/nokogiri/xml_cdata.c +24 -23
  23. data/ext/nokogiri/xml_comment.c +29 -21
  24. data/ext/nokogiri/xml_document.c +305 -201
  25. data/ext/nokogiri/xml_document_fragment.c +13 -15
  26. data/ext/nokogiri/xml_dtd.c +54 -48
  27. data/ext/nokogiri/xml_element_content.c +31 -26
  28. data/ext/nokogiri/xml_element_decl.c +22 -22
  29. data/ext/nokogiri/xml_encoding_handler.c +30 -19
  30. data/ext/nokogiri/xml_entity_decl.c +32 -30
  31. data/ext/nokogiri/xml_entity_reference.c +16 -18
  32. data/ext/nokogiri/xml_namespace.c +74 -32
  33. data/ext/nokogiri/xml_node.c +808 -503
  34. data/ext/nokogiri/xml_node_set.c +239 -208
  35. data/ext/nokogiri/xml_processing_instruction.c +17 -19
  36. data/ext/nokogiri/xml_reader.c +198 -186
  37. data/ext/nokogiri/xml_relax_ng.c +52 -28
  38. data/ext/nokogiri/xml_sax_parser.c +123 -125
  39. data/ext/nokogiri/xml_sax_parser_context.c +138 -79
  40. data/ext/nokogiri/xml_sax_push_parser.c +88 -35
  41. data/ext/nokogiri/xml_schema.c +112 -33
  42. data/ext/nokogiri/xml_syntax_error.c +50 -23
  43. data/ext/nokogiri/xml_text.c +14 -18
  44. data/ext/nokogiri/xml_xpath_context.c +162 -98
  45. data/ext/nokogiri/xslt_stylesheet.c +162 -168
  46. data/gumbo-parser/CHANGES.md +63 -0
  47. data/gumbo-parser/Makefile +101 -0
  48. data/gumbo-parser/THANKS +27 -0
  49. data/gumbo-parser/src/Makefile +34 -0
  50. data/gumbo-parser/src/README.md +41 -0
  51. data/gumbo-parser/src/ascii.c +75 -0
  52. data/gumbo-parser/src/ascii.h +115 -0
  53. data/gumbo-parser/src/attribute.c +42 -0
  54. data/gumbo-parser/src/attribute.h +17 -0
  55. data/gumbo-parser/src/char_ref.c +22225 -0
  56. data/gumbo-parser/src/char_ref.h +29 -0
  57. data/gumbo-parser/src/char_ref.rl +2154 -0
  58. data/gumbo-parser/src/error.c +626 -0
  59. data/gumbo-parser/src/error.h +148 -0
  60. data/gumbo-parser/src/foreign_attrs.c +104 -0
  61. data/gumbo-parser/src/foreign_attrs.gperf +27 -0
  62. data/gumbo-parser/src/gumbo.h +943 -0
  63. data/gumbo-parser/src/insertion_mode.h +33 -0
  64. data/gumbo-parser/src/macros.h +91 -0
  65. data/gumbo-parser/src/parser.c +4886 -0
  66. data/gumbo-parser/src/parser.h +41 -0
  67. data/gumbo-parser/src/replacement.h +33 -0
  68. data/gumbo-parser/src/string_buffer.c +103 -0
  69. data/gumbo-parser/src/string_buffer.h +68 -0
  70. data/gumbo-parser/src/string_piece.c +48 -0
  71. data/gumbo-parser/src/svg_attrs.c +174 -0
  72. data/gumbo-parser/src/svg_attrs.gperf +77 -0
  73. data/gumbo-parser/src/svg_tags.c +137 -0
  74. data/gumbo-parser/src/svg_tags.gperf +55 -0
  75. data/gumbo-parser/src/tag.c +222 -0
  76. data/gumbo-parser/src/tag_lookup.c +382 -0
  77. data/gumbo-parser/src/tag_lookup.gperf +169 -0
  78. data/gumbo-parser/src/tag_lookup.h +13 -0
  79. data/gumbo-parser/src/token_buffer.c +79 -0
  80. data/gumbo-parser/src/token_buffer.h +71 -0
  81. data/gumbo-parser/src/token_type.h +17 -0
  82. data/gumbo-parser/src/tokenizer.c +3463 -0
  83. data/gumbo-parser/src/tokenizer.h +112 -0
  84. data/gumbo-parser/src/tokenizer_states.h +339 -0
  85. data/gumbo-parser/src/utf8.c +245 -0
  86. data/gumbo-parser/src/utf8.h +164 -0
  87. data/gumbo-parser/src/util.c +68 -0
  88. data/gumbo-parser/src/util.h +30 -0
  89. data/gumbo-parser/src/vector.c +111 -0
  90. data/gumbo-parser/src/vector.h +45 -0
  91. data/lib/nokogiri/css/node.rb +1 -50
  92. data/lib/nokogiri/css/parser.rb +317 -286
  93. data/lib/nokogiri/css/parser.y +57 -43
  94. data/lib/nokogiri/css/parser_extras.rb +39 -36
  95. data/lib/nokogiri/css/syntax_error.rb +2 -1
  96. data/lib/nokogiri/css/tokenizer.rb +105 -103
  97. data/lib/nokogiri/css/tokenizer.rex +5 -5
  98. data/lib/nokogiri/css/xpath_visitor.rb +137 -48
  99. data/lib/nokogiri/css.rb +15 -14
  100. data/lib/nokogiri/decorators/slop.rb +13 -5
  101. data/lib/nokogiri/extension.rb +31 -0
  102. data/lib/nokogiri/gumbo.rb +14 -0
  103. data/lib/nokogiri/html.rb +32 -27
  104. data/lib/nokogiri/{html → html4}/builder.rb +3 -2
  105. data/lib/nokogiri/{html → html4}/document.rb +118 -50
  106. data/lib/nokogiri/{html → html4}/document_fragment.rb +20 -11
  107. data/lib/nokogiri/{html → html4}/element_description.rb +2 -1
  108. data/lib/nokogiri/{html → html4}/element_description_defaults.rb +2 -1
  109. data/lib/nokogiri/{html → html4}/entity_lookup.rb +2 -1
  110. data/lib/nokogiri/{html → html4}/sax/parser.rb +22 -14
  111. data/lib/nokogiri/html4/sax/parser_context.rb +19 -0
  112. data/lib/nokogiri/html4/sax/push_parser.rb +37 -0
  113. data/lib/nokogiri/html4.rb +40 -0
  114. data/lib/nokogiri/html5/document.rb +74 -0
  115. data/lib/nokogiri/html5/document_fragment.rb +80 -0
  116. data/lib/nokogiri/html5/node.rb +93 -0
  117. data/lib/nokogiri/html5.rb +473 -0
  118. data/lib/nokogiri/jruby/dependencies.rb +20 -0
  119. data/lib/nokogiri/syntax_error.rb +1 -0
  120. data/lib/nokogiri/version/constant.rb +5 -0
  121. data/lib/nokogiri/version/info.rb +215 -0
  122. data/lib/nokogiri/version.rb +3 -91
  123. data/lib/nokogiri/xml/attr.rb +1 -0
  124. data/lib/nokogiri/xml/attribute_decl.rb +1 -0
  125. data/lib/nokogiri/xml/builder.rb +75 -33
  126. data/lib/nokogiri/xml/cdata.rb +1 -0
  127. data/lib/nokogiri/xml/character_data.rb +1 -0
  128. data/lib/nokogiri/xml/document.rb +157 -54
  129. data/lib/nokogiri/xml/document_fragment.rb +55 -8
  130. data/lib/nokogiri/xml/dtd.rb +15 -4
  131. data/lib/nokogiri/xml/element_content.rb +1 -0
  132. data/lib/nokogiri/xml/element_decl.rb +1 -0
  133. data/lib/nokogiri/xml/entity_decl.rb +1 -0
  134. data/lib/nokogiri/xml/entity_reference.rb +19 -0
  135. data/lib/nokogiri/xml/namespace.rb +1 -0
  136. data/lib/nokogiri/xml/node/save_options.rb +2 -1
  137. data/lib/nokogiri/xml/node.rb +712 -431
  138. data/lib/nokogiri/xml/node_set.rb +140 -123
  139. data/lib/nokogiri/xml/notation.rb +1 -0
  140. data/lib/nokogiri/xml/parse_options.rb +31 -0
  141. data/lib/nokogiri/xml/pp/character_data.rb +1 -0
  142. data/lib/nokogiri/xml/pp/node.rb +1 -0
  143. data/lib/nokogiri/xml/pp.rb +3 -2
  144. data/lib/nokogiri/xml/processing_instruction.rb +1 -0
  145. data/lib/nokogiri/xml/reader.rb +9 -12
  146. data/lib/nokogiri/xml/relax_ng.rb +7 -2
  147. data/lib/nokogiri/xml/sax/document.rb +25 -30
  148. data/lib/nokogiri/xml/sax/parser.rb +8 -8
  149. data/lib/nokogiri/xml/sax/parser_context.rb +1 -0
  150. data/lib/nokogiri/xml/sax/push_parser.rb +1 -0
  151. data/lib/nokogiri/xml/sax.rb +5 -4
  152. data/lib/nokogiri/xml/schema.rb +13 -4
  153. data/lib/nokogiri/xml/searchable.rb +239 -0
  154. data/lib/nokogiri/xml/syntax_error.rb +25 -1
  155. data/lib/nokogiri/xml/text.rb +1 -0
  156. data/lib/nokogiri/xml/xpath/syntax_error.rb +2 -1
  157. data/lib/nokogiri/xml/xpath.rb +4 -5
  158. data/lib/nokogiri/xml/xpath_context.rb +1 -0
  159. data/lib/nokogiri/xml.rb +37 -35
  160. data/lib/nokogiri/xslt/stylesheet.rb +2 -1
  161. data/lib/nokogiri/xslt.rb +17 -16
  162. data/lib/nokogiri.rb +55 -58
  163. data/lib/xsd/xmlparser/nokogiri.rb +1 -0
  164. data/patches/libxml2/0001-Remove-script-macro-support.patch +40 -0
  165. data/patches/libxml2/0002-Update-entities-to-remove-handling-of-ssi.patch +44 -0
  166. data/patches/libxml2/0003-libxml2.la-is-in-top_builddir.patch +25 -0
  167. data/patches/libxml2/0004-use-glibc-strlen.patch +53 -0
  168. data/patches/libxml2/0005-avoid-isnan-isinf.patch +81 -0
  169. data/patches/libxml2/0006-update-automake-files-for-arm64.patch +2511 -0
  170. data/patches/libxml2/0007-Fix-XPath-recursion-limit.patch +31 -0
  171. data/patches/libxslt/0001-update-automake-files-for-arm64.patch +2511 -0
  172. data/patches/libxslt/0002-Fix-xml2-config-check-in-configure-script.patch +19 -0
  173. data/ports/archives/libxml2-2.9.12.tar.gz +0 -0
  174. data/ports/archives/libxslt-1.1.34.tar.gz +0 -0
  175. metadata +307 -459
  176. data/.autotest +0 -26
  177. data/.gemtest +0 -0
  178. data/CHANGELOG.ja.rdoc +0 -785
  179. data/CHANGELOG.rdoc +0 -783
  180. data/C_CODING_STYLE.rdoc +0 -33
  181. data/Manifest.txt +0 -303
  182. data/README.ja.rdoc +0 -106
  183. data/README.rdoc +0 -175
  184. data/ROADMAP.md +0 -90
  185. data/Rakefile +0 -228
  186. data/STANDARD_RESPONSES.md +0 -47
  187. data/Y_U_NO_GEMSPEC.md +0 -155
  188. data/build_all +0 -105
  189. data/ext/nokogiri/html_document.c +0 -170
  190. data/ext/nokogiri/html_document.h +0 -10
  191. data/ext/nokogiri/html_element_description.c +0 -279
  192. data/ext/nokogiri/html_element_description.h +0 -10
  193. data/ext/nokogiri/html_entity_lookup.c +0 -32
  194. data/ext/nokogiri/html_entity_lookup.h +0 -8
  195. data/ext/nokogiri/html_sax_parser_context.c +0 -116
  196. data/ext/nokogiri/html_sax_parser_context.h +0 -11
  197. data/ext/nokogiri/html_sax_push_parser.c +0 -87
  198. data/ext/nokogiri/html_sax_push_parser.h +0 -9
  199. data/ext/nokogiri/xml_attr.h +0 -9
  200. data/ext/nokogiri/xml_attribute_decl.h +0 -9
  201. data/ext/nokogiri/xml_cdata.h +0 -9
  202. data/ext/nokogiri/xml_comment.h +0 -9
  203. data/ext/nokogiri/xml_document.h +0 -23
  204. data/ext/nokogiri/xml_document_fragment.h +0 -10
  205. data/ext/nokogiri/xml_dtd.h +0 -10
  206. data/ext/nokogiri/xml_element_content.h +0 -10
  207. data/ext/nokogiri/xml_element_decl.h +0 -9
  208. data/ext/nokogiri/xml_encoding_handler.h +0 -8
  209. data/ext/nokogiri/xml_entity_decl.h +0 -10
  210. data/ext/nokogiri/xml_entity_reference.h +0 -9
  211. data/ext/nokogiri/xml_io.c +0 -56
  212. data/ext/nokogiri/xml_io.h +0 -11
  213. data/ext/nokogiri/xml_libxml2_hacks.c +0 -112
  214. data/ext/nokogiri/xml_libxml2_hacks.h +0 -12
  215. data/ext/nokogiri/xml_namespace.h +0 -13
  216. data/ext/nokogiri/xml_node.h +0 -13
  217. data/ext/nokogiri/xml_node_set.h +0 -14
  218. data/ext/nokogiri/xml_processing_instruction.h +0 -9
  219. data/ext/nokogiri/xml_reader.h +0 -10
  220. data/ext/nokogiri/xml_relax_ng.h +0 -9
  221. data/ext/nokogiri/xml_sax_parser.h +0 -39
  222. data/ext/nokogiri/xml_sax_parser_context.h +0 -10
  223. data/ext/nokogiri/xml_sax_push_parser.h +0 -9
  224. data/ext/nokogiri/xml_schema.h +0 -9
  225. data/ext/nokogiri/xml_syntax_error.h +0 -13
  226. data/ext/nokogiri/xml_text.h +0 -9
  227. data/ext/nokogiri/xml_xpath_context.h +0 -10
  228. data/ext/nokogiri/xslt_stylesheet.h +0 -14
  229. data/lib/nokogiri/html/sax/parser_context.rb +0 -16
  230. data/lib/nokogiri/html/sax/push_parser.rb +0 -16
  231. data/tasks/cross_compile.rb +0 -150
  232. data/tasks/nokogiri.org.rb +0 -24
  233. data/tasks/test.rb +0 -95
  234. data/test/css/test_nthiness.rb +0 -159
  235. data/test/css/test_parser.rb +0 -341
  236. data/test/css/test_tokenizer.rb +0 -198
  237. data/test/css/test_xpath_visitor.rb +0 -91
  238. data/test/decorators/test_slop.rb +0 -16
  239. data/test/files/2ch.html +0 -108
  240. data/test/files/address_book.rlx +0 -12
  241. data/test/files/address_book.xml +0 -10
  242. data/test/files/bar/bar.xsd +0 -4
  243. data/test/files/dont_hurt_em_why.xml +0 -422
  244. data/test/files/encoding.html +0 -82
  245. data/test/files/encoding.xhtml +0 -84
  246. data/test/files/exslt.xml +0 -8
  247. data/test/files/exslt.xslt +0 -35
  248. data/test/files/foo/foo.xsd +0 -4
  249. data/test/files/metacharset.html +0 -10
  250. data/test/files/noencoding.html +0 -47
  251. data/test/files/po.xml +0 -32
  252. data/test/files/po.xsd +0 -66
  253. data/test/files/shift_jis.html +0 -10
  254. data/test/files/shift_jis.xml +0 -5
  255. data/test/files/snuggles.xml +0 -3
  256. data/test/files/staff.dtd +0 -10
  257. data/test/files/staff.xml +0 -59
  258. data/test/files/staff.xslt +0 -32
  259. data/test/files/test_document_url/bar.xml +0 -2
  260. data/test/files/test_document_url/document.dtd +0 -4
  261. data/test/files/test_document_url/document.xml +0 -6
  262. data/test/files/tlm.html +0 -850
  263. data/test/files/to_be_xincluded.xml +0 -2
  264. data/test/files/valid_bar.xml +0 -2
  265. data/test/files/xinclude.xml +0 -4
  266. data/test/helper.rb +0 -154
  267. data/test/html/sax/test_parser.rb +0 -141
  268. data/test/html/sax/test_parser_context.rb +0 -46
  269. data/test/html/test_builder.rb +0 -164
  270. data/test/html/test_document.rb +0 -552
  271. data/test/html/test_document_encoding.rb +0 -138
  272. data/test/html/test_document_fragment.rb +0 -261
  273. data/test/html/test_element_description.rb +0 -105
  274. data/test/html/test_named_characters.rb +0 -14
  275. data/test/html/test_node.rb +0 -196
  276. data/test/html/test_node_encoding.rb +0 -27
  277. data/test/namespaces/test_additional_namespaces_in_builder_doc.rb +0 -14
  278. data/test/namespaces/test_namespaces_in_builder_doc.rb +0 -75
  279. data/test/namespaces/test_namespaces_in_created_doc.rb +0 -75
  280. data/test/namespaces/test_namespaces_in_parsed_doc.rb +0 -66
  281. data/test/test_convert_xpath.rb +0 -135
  282. data/test/test_css_cache.rb +0 -45
  283. data/test/test_encoding_handler.rb +0 -46
  284. data/test/test_memory_leak.rb +0 -156
  285. data/test/test_nokogiri.rb +0 -132
  286. data/test/test_reader.rb +0 -555
  287. data/test/test_soap4r_sax.rb +0 -52
  288. data/test/test_xslt_transforms.rb +0 -254
  289. data/test/xml/node/test_save_options.rb +0 -28
  290. data/test/xml/node/test_subclass.rb +0 -44
  291. data/test/xml/sax/test_parser.rb +0 -366
  292. data/test/xml/sax/test_parser_context.rb +0 -106
  293. data/test/xml/sax/test_push_parser.rb +0 -157
  294. data/test/xml/test_attr.rb +0 -64
  295. data/test/xml/test_attribute_decl.rb +0 -86
  296. data/test/xml/test_builder.rb +0 -306
  297. data/test/xml/test_c14n.rb +0 -151
  298. data/test/xml/test_cdata.rb +0 -48
  299. data/test/xml/test_comment.rb +0 -29
  300. data/test/xml/test_document.rb +0 -828
  301. data/test/xml/test_document_encoding.rb +0 -28
  302. data/test/xml/test_document_fragment.rb +0 -223
  303. data/test/xml/test_dtd.rb +0 -103
  304. data/test/xml/test_dtd_encoding.rb +0 -33
  305. data/test/xml/test_element_content.rb +0 -56
  306. data/test/xml/test_element_decl.rb +0 -73
  307. data/test/xml/test_entity_decl.rb +0 -122
  308. data/test/xml/test_entity_reference.rb +0 -245
  309. data/test/xml/test_namespace.rb +0 -95
  310. data/test/xml/test_node.rb +0 -1137
  311. data/test/xml/test_node_attributes.rb +0 -96
  312. data/test/xml/test_node_encoding.rb +0 -107
  313. data/test/xml/test_node_inheritance.rb +0 -32
  314. data/test/xml/test_node_reparenting.rb +0 -374
  315. data/test/xml/test_node_set.rb +0 -755
  316. data/test/xml/test_parse_options.rb +0 -64
  317. data/test/xml/test_processing_instruction.rb +0 -30
  318. data/test/xml/test_reader_encoding.rb +0 -142
  319. data/test/xml/test_relax_ng.rb +0 -60
  320. data/test/xml/test_schema.rb +0 -103
  321. data/test/xml/test_syntax_error.rb +0 -12
  322. data/test/xml/test_text.rb +0 -45
  323. data/test/xml/test_unparented_node.rb +0 -422
  324. data/test/xml/test_xinclude.rb +0 -83
  325. data/test/xml/test_xpath.rb +0 -295
  326. data/test/xslt/test_custom_functions.rb +0 -133
  327. data/test/xslt/test_exception_handling.rb +0 -37
  328. data/test_all +0 -81
data/LICENSE.md ADDED
@@ -0,0 +1,9 @@
1
+ The MIT License
2
+
3
+ Copyright 2008 -- 2021 by Mike Dalessio, Aaron Patterson, Yoko Harada, Akinori MUSHA, John Shahid, Karol Bucek, Sam Ruby, Craig Barnes, Stephen Checkoway, Lars Kanis, Sergio Arbeo, Timothy Elliott, Nobuyoshi Nakada, Charles Nutter, Patrick Mahoney.
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
6
+
7
+ The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
8
+
9
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,278 @@
1
+ <div><img src="https://nokogiri.org/images/nokogiri-serif-black.png" align="right"/></div>
2
+
3
+ # Nokogiri
4
+
5
+ Nokogiri (鋸) makes it easy and painless to work with XML and HTML from Ruby. It provides a sensible, easy-to-understand API for [reading](https://nokogiri.org/tutorials/parsing_an_html_xml_document.html), writing, [modifying](https://nokogiri.org/tutorials/modifying_an_html_xml_document.html), and [querying](https://nokogiri.org/tutorials/searching_a_xml_html_document.html) documents. It is fast and standards-compliant by relying on native parsers like libxml2 (C) and xerces (Java).
6
+
7
+ ## Guiding Principles
8
+
9
+ Some guiding principles Nokogiri tries to follow:
10
+
11
+ - be secure-by-default by treating all documents as **untrusted** by default
12
+ - be a **thin-as-reasonable layer** on top of the underlying parsers, and don't attempt to fix behavioral differences between the parsers
13
+
14
+
15
+ ## Features Overview
16
+
17
+ - DOM Parser for XML, HTML4, and HTML5
18
+ - SAX Parser for XML and HTML4
19
+ - Push Parser for XML and HTML4
20
+ - Document search via XPath 1.0
21
+ - Document search via CSS3 selectors, with some jquery-like extensions
22
+ - XSD Schema validation
23
+ - XSLT transformation
24
+ - "Builder" DSL for XML and HTML documents
25
+
26
+
27
+ ## Status
28
+
29
+ [![Github Actions CI](https://github.com/sparklemotion/nokogiri/actions/workflows/ci.yml/badge.svg?branch=main)](https://github.com/sparklemotion/nokogiri/actions/workflows/ci.yml)
30
+ [![Appveyor CI](https://ci.appveyor.com/api/projects/status/xj2pqwvlxwuwgr06/branch/main?svg=true)](https://ci.appveyor.com/project/flavorjones/nokogiri/branch/main)
31
+
32
+ [![Gem Version](https://badge.fury.io/rb/nokogiri.svg)](https://rubygems.org/gems/nokogiri)
33
+ [![SemVer compatibility](https://api.dependabot.com/badges/compatibility_score?dependency-name=nokogiri&package-manager=bundler&version-scheme=semver)](https://dependabot.com/compatibility-score/?dependency-name=nokogiri&package-manager=bundler)
34
+ [![Tidelift dependencies](https://tidelift.com/badges/package/rubygems/nokogiri)](https://tidelift.com/subscription/pkg/rubygems-nokogiri?utm_source=rubygems-nokogiri&utm_medium=referral&utm_campaign=readme)
35
+
36
+
37
+ ## Support, Getting Help, and Reporting Issues
38
+
39
+ All official documentation is posted at https://nokogiri.org (the source for which is at https://github.com/sparklemotion/nokogiri.org/, and we welcome contributions).
40
+
41
+ Consider subscribing to [Tidelift][tidelift] which provides license assurances and timely security notifications for your open source dependencies, including Nokogiri. [Tidelift][tidelift] subscriptions also help the Nokogiri maintainers fund our [automated testing](https://ci.nokogiri.org) which in turn allows us to ship releases, bugfixes, and security updates more often.
42
+
43
+ [tidelift]: https://tidelift.com/subscription/pkg/rubygems-nokogiri?utm_source=rubygems-nokogiri&utm_medium=referral&utm_campaign=readme
44
+
45
+ ### Reading
46
+
47
+ Your first stops for learning more about Nokogiri should be:
48
+
49
+ - [API Documentation](https://nokogiri.org/rdoc/index.html)
50
+ - [Tutorials](https://nokogiri.org/tutorials/toc.html)
51
+ - An excellent community-maintained [Cheat Sheet](https://github.com/sparklemotion/nokogiri/wiki/Cheat-sheet)
52
+
53
+
54
+ ### Ask For Help
55
+
56
+ There are a few ways to ask exploratory questions:
57
+
58
+ - The Ruby Discord chat server is active at https://discord.gg/UyQnKrT
59
+ - The Nokogiri mailing list is active at https://groups.google.com/group/nokogiri-talk
60
+ - Open an issue using the "Help Request" template at https://github.com/sparklemotion/nokogiri/issues
61
+
62
+ Please do not mail the maintainers at their personal addresses.
63
+
64
+
65
+ ### Report A Bug
66
+
67
+ The Nokogiri bug tracker is at https://github.com/sparklemotion/nokogiri/issues
68
+
69
+ Please use the "Bug Report" or "Installation Difficulties" templates.
70
+
71
+
72
+ ### Security and Vulnerability Reporting
73
+
74
+ Please report vulnerabilities at https://hackerone.com/nokogiri
75
+
76
+ Full information and description of our security policy is in [`SECURITY.md`](SECURITY.md)
77
+
78
+
79
+ ### Semantic Versioning Policy
80
+
81
+ Nokogiri follows [Semantic Versioning](https://semver.org/) (since 2017 or so). [![Dependabot's SemVer compatibility score for Nokogiri](https://api.dependabot.com/badges/compatibility_score?dependency-name=nokogiri&package-manager=bundler&version-scheme=semver)](https://dependabot.com/compatibility-score/?dependency-name=nokogiri&package-manager=bundler)
82
+
83
+ We bump `Major.Minor.Patch` versions following this guidance:
84
+
85
+ `Major`: (we've never done this)
86
+
87
+ - Significant backwards-incompatible changes to the public API that would require rewriting existing application code.
88
+ - Some examples of backwards-incompatible changes we might someday consider for a Major release are at [`ROADMAP.md`](ROADMAP.md).
89
+
90
+ `Minor`:
91
+
92
+ - Features and bugfixes.
93
+ - Updating packaged libraries for non-security-related reasons.
94
+ - Dropping support for EOLed Ruby versions. [Some folks find this objectionable](https://github.com/sparklemotion/nokogiri/issues/1568), but [SemVer says this is OK if the public API hasn't changed](https://semver.org/#what-should-i-do-if-i-update-my-own-dependencies-without-changing-the-public-api).
95
+ - Backwards-incompatible changes to internal or private methods and constants. These are detailed in the "Changes" section of each changelog entry.
96
+
97
+ `Patch`:
98
+
99
+ - Bugfixes.
100
+ - Security updates.
101
+ - Updating packaged libraries for security-related reasons.
102
+
103
+
104
+ ## Installation
105
+
106
+ Requirements:
107
+
108
+ - Ruby >= 2.5
109
+ - JRuby >= 9.2.0.0
110
+
111
+
112
+ ### Native Gems: Faster, more reliable installation
113
+
114
+ "Native gems" contain pre-compiled libraries for a specific machine architecture. On supported platforms, this removes the need for compiling the C extension and the packaged libraries, or for system dependencies to exist. This results in **much faster installation** and **more reliable installation**, which as you probably know are the biggest headaches for Nokogiri users.
115
+
116
+ ### Supported Platforms
117
+
118
+ As of v1.11.0, Nokogiri ships pre-compiled, "native" gems for the following platforms:
119
+
120
+ - Linux: `x86-linux` and `x86_64-linux` (req: `glibc >= 2.17`), including musl platforms like Alpine
121
+ - Darwin/MacOS: `x86_64-darwin` and `arm64-darwin`
122
+ - Windows: `x86-mingw32` and `x64-mingw32`
123
+ - Java: any platform running JRuby 9.2 or higher
124
+
125
+ To determine whether your system supports one of these gems, look at the output of `bundle platform` or `ruby -e 'puts Gem::Platform.local.to_s'`.
126
+
127
+ If you're on a supported platform, either `gem install` or `bundle install` should install a native gem without any additional action on your part. This installation should only take a few seconds, and your output should look something like:
128
+
129
+ ``` sh
130
+ $ gem install nokogiri
131
+ Fetching nokogiri-1.11.0-x86_64-linux.gem
132
+ Successfully installed nokogiri-1.11.0-x86_64-linux
133
+ 1 gem installed
134
+ ```
135
+
136
+
137
+ ### Other Installation Options
138
+
139
+ Because Nokogiri is a C extension, it requires that you have a C compiler toolchain, Ruby development header files, and some system dependencies installed.
140
+
141
+ The following may work for you if you have an appropriately-configured system:
142
+
143
+ ``` bash
144
+ gem install nokogiri
145
+ ```
146
+
147
+ If you have any issues, please visit [Installing Nokogiri](https://nokogiri.org/tutorials/installing_nokogiri.html) for more complete instructions and troubleshooting.
148
+
149
+
150
+ ## How To Use Nokogiri
151
+
152
+ Nokogiri is a large library, and so it's challenging to briefly summarize it. We've tried to provide long, real-world examples at [Tutorials](https://nokogiri.org/tutorials/toc.html).
153
+
154
+ ### Parsing and Querying
155
+
156
+ Here is example usage for parsing and querying a document:
157
+
158
+ ```ruby
159
+ #! /usr/bin/env ruby
160
+
161
+ require 'nokogiri'
162
+ require 'open-uri'
163
+
164
+ # Fetch and parse HTML document
165
+ doc = Nokogiri::HTML(URI.open('https://nokogiri.org/tutorials/installing_nokogiri.html'))
166
+
167
+ # Search for nodes by css
168
+ doc.css('nav ul.menu li a', 'article h2').each do |link|
169
+ puts link.content
170
+ end
171
+
172
+ # Search for nodes by xpath
173
+ doc.xpath('//nav//ul//li/a', '//article//h2').each do |link|
174
+ puts link.content
175
+ end
176
+
177
+ # Or mix and match
178
+ doc.search('nav ul.menu li a', '//article//h2').each do |link|
179
+ puts link.content
180
+ end
181
+ ```
182
+
183
+
184
+ ### Encoding
185
+
186
+ Strings are always stored as UTF-8 internally. Methods that return
187
+ text values will always return UTF-8 encoded strings. Methods that
188
+ return a string containing markup (like `to_xml`, `to_html` and
189
+ `inner_html`) will return a string encoded like the source document.
190
+
191
+ __WARNING__
192
+
193
+ Some documents declare one encoding, but actually use a different
194
+ one. In these cases, which encoding should the parser choose?
195
+
196
+ Data is just a stream of bytes. Humans add meaning to that stream. Any
197
+ particular set of bytes could be valid characters in multiple
198
+ encodings, so detecting encoding with 100% accuracy is not
199
+ possible. `libxml2` does its best, but it can't be right all the time.
200
+
201
+ If you want Nokogiri to handle the document encoding properly, your
202
+ best bet is to explicitly set the encoding. Here is an example of
203
+ explicitly setting the encoding to EUC-JP on the parser:
204
+
205
+ ```ruby
206
+ doc = Nokogiri.XML('<foo><bar /></foo>', nil, 'EUC-JP')
207
+ ```
208
+
209
+
210
+ ## Technical Overview
211
+
212
+ ### Guiding Principles
213
+
214
+ As noted above, two guiding principles of the software are:
215
+
216
+ - be secure-by-default by treating all documents as **untrusted** by default
217
+ - be a **thin-as-reasonable layer** on top of the underlying parsers, and don't attempt to fix behavioral differences between the parsers
218
+
219
+ Notably, despite all parsers being standards-compliant, there are behavioral inconsistencies between the parsers used in the CRuby and JRuby implementations, and Nokogiri does not and should not attempt to remove these inconsistencies. Instead, we surface these differences in the test suite when they are important/semantic; or we intentionally write tests to depend only on the important/semantic bits (omitting whitespace from regex matchers on results, for example).
220
+
221
+
222
+ ### CRuby
223
+
224
+ The Ruby (a.k.a., CRuby, MRI, YARV) implementation is a C extension that depends on libxml2 and libxslt (which in turn depend on zlib and possibly libiconv).
225
+
226
+ These dependencies are met by default by Nokogiri's packaged versions of the libxml2 and libxslt source code, but a configuration option `--use-system-libraries` is provided to allow specification of alternative library locations. See [Installing Nokogiri](https://nokogiri.org/tutorials/installing_nokogiri.html) for full documentation.
227
+
228
+ We provide native gems by pre-compiling libxml2 and libxslt (and potentially zlib and libiconv) and packaging them into the gem file. In this case, no compilation is necessary at installation time, which leads to faster and more reliable installation.
229
+
230
+ See [`LICENSE-DEPENDENCIES.md`](LICENSE-DEPENDENCIES.md) for more information on which dependencies are provided in which native and source gems.
231
+
232
+
233
+ ### JRuby
234
+
235
+ The Java (a.k.a. JRuby) implementation is a Java extension that depends primarily on Xerces and NekoHTML for parsing, though additional dependencies are on `isorelax`, `nekodtd`, `jing`, `serializer`, `xalan-j`, and `xml-apis`.
236
+
237
+ These dependencies are provided by pre-compiled jar files packaged in the `java` platform gem.
238
+
239
+ See [`LICENSE-DEPENDENCIES.md`](LICENSE-DEPENDENCIES.md) for more information on which dependencies are provided in which native and source gems.
240
+
241
+
242
+ ## Contributing
243
+
244
+ See [`CONTRIBUTING.md`](CONTRIBUTING.md) for an intro guide to developing Nokogiri.
245
+
246
+
247
+ ## Code of Conduct
248
+
249
+ We've adopted the Contributor Covenant code of conduct, which you can read in full in [`CODE_OF_CONDUCT.md`](CODE_OF_CONDUCT.md).
250
+
251
+
252
+ ## License
253
+
254
+ This project is licensed under the terms of the MIT license.
255
+
256
+ See this license at [`LICENSE.md`](LICENSE.md).
257
+
258
+
259
+ ### Dependencies
260
+
261
+ Some additional libraries may be distributed with your version of Nokogiri. Please see [`LICENSE-DEPENDENCIES.md`](LICENSE-DEPENDENCIES.md) for a discussion of the variations as well as the licenses thereof.
262
+
263
+
264
+ ## Authors
265
+
266
+ - Mike Dalessio
267
+ - Aaron Patterson
268
+ - Yoko Harada
269
+ - Akinori MUSHA
270
+ - John Shahid
271
+ - Karol Bucek
272
+ - Sam Ruby
273
+ - Craig Barnes
274
+ - Stephen Checkoway
275
+ - Lars Kanis
276
+ - Sergio Arbeo
277
+ - Timothy Elliott
278
+ - Nobuyoshi Nakada
data/bin/nokogiri CHANGED
@@ -1,30 +1,61 @@
1
1
  #!/usr/bin/env ruby
2
2
  require 'optparse'
3
3
  require 'open-uri'
4
- require 'irb'
5
4
  require 'uri'
6
5
  require 'rubygems'
7
6
  require 'nokogiri'
7
+ autoload :IRB, 'irb'
8
8
 
9
9
  parse_class = Nokogiri
10
10
  encoding = nil
11
11
 
12
+ # This module provides some tunables with the nokogiri CLI for use in
13
+ # your ~/.nokogirirc.
14
+ module Nokogiri::CLI
15
+ class << self
16
+ # Specify the console engine, defaulted to IRB.
17
+ #
18
+ # call-seq:
19
+ # require 'pry'
20
+ # Nokogiri::CLI.console = Pry
21
+ attr_writer :console
22
+
23
+ def console
24
+ case @console
25
+ when Symbol
26
+ Kernel.const_get(@console)
27
+ else
28
+ @console
29
+ end
30
+ end
31
+
32
+ attr_accessor :rcfile
33
+ end
34
+
35
+ self.rcfile = File.expand_path('~/.nokogirirc')
36
+ self.console = :IRB
37
+ end
38
+
12
39
  opts = OptionParser.new do |opts|
13
40
  opts.banner = "Nokogiri: an HTML, XML, SAX, and Reader parser"
14
41
  opts.define_head "Usage: nokogiri <uri|path> [options]"
15
42
  opts.separator ""
16
43
  opts.separator "Examples:"
17
- opts.separator " nokogiri http://www.ruby-lang.org/"
44
+ opts.separator " nokogiri https://www.ruby-lang.org/"
18
45
  opts.separator " nokogiri ./public/index.html"
19
- opts.separator " curl -s http://nokogiri.org | nokogiri -e'p $_.css(\"h1\").length'"
46
+ opts.separator " curl -s http://www.nokogiri.org | nokogiri -e'p $_.css(\"h1\").length'"
20
47
  opts.separator ""
21
48
  opts.separator "Options:"
22
49
 
23
- opts.on("--type [TYPE]", [:xml, :html]) do |v|
50
+ opts.on("--type type", "Parse as type: xml or html (default: auto)", [:xml, :html]) do |v|
24
51
  parse_class = {:xml => Nokogiri::XML, :html => Nokogiri::HTML}[v]
25
52
  end
26
53
 
27
- opts.on("-E", "--encoding encoding", "Read as encoding (default #{encoding})") do |v|
54
+ opts.on("-C file", "Specifies initialization file to load (default #{Nokogiri::CLI.rcfile})") do |v|
55
+ Nokogiri::CLI.rcfile = v
56
+ end
57
+
58
+ opts.on("-E", "--encoding encoding", "Read as encoding (default: #{encoding || 'none'})") do |v|
28
59
  encoding = v
29
60
  end
30
61
 
@@ -48,15 +79,24 @@ opts = OptionParser.new do |opts|
48
79
  end
49
80
  opts.parse!
50
81
 
51
- uri = ARGV.shift
82
+ url = ARGV.shift
52
83
 
53
- if uri.to_s.strip.empty? && $stdin.tty?
84
+ if url.to_s.strip.empty? && $stdin.tty?
54
85
  puts opts
55
86
  exit 1
56
87
  end
57
88
 
58
- if $stdin.tty?
59
- @doc = parse_class.parse(open(uri).read, nil, encoding)
89
+ if File.file?(Nokogiri::CLI.rcfile)
90
+ load Nokogiri::CLI.rcfile
91
+ end
92
+
93
+ if url || $stdin.tty?
94
+ case uri = (URI(url) rescue url)
95
+ when URI::HTTP
96
+ @doc = parse_class.parse(uri.read, url, encoding)
97
+ else
98
+ @doc = parse_class.parse(open(url).read, nil, encoding)
99
+ end
60
100
  else
61
101
  @doc = parse_class.parse($stdin, nil, encoding)
62
102
  end
@@ -72,7 +112,7 @@ else
72
112
  eval @script, binding, '<main>'
73
113
  else
74
114
  puts "Your document is stored in @doc..."
75
- IRB.start
115
+ Nokogiri::CLI.console.start
76
116
  end
77
117
  end
78
118
 
data/dependencies.yml ADDED
@@ -0,0 +1,74 @@
1
+ libxml2:
2
+ version: "2.9.12"
3
+ sha256: "c8d6681e38c56f172892c85ddc0852e1fd4b53b4209e7f4ebf17f7e2eae71d92"
4
+ # manually verified checksum:
5
+ #
6
+ # $ gpg --verify libxml2-2.9.12.tar.gz.asc ports/archives/libxml2-2.9.12.tar.gz
7
+ # gpg: Signature made Thu 13 May 2021 02:59:16 PM EDT
8
+ # gpg: using RSA key DB46681BB91ADCEA170FA2D415588B26596BEA5D
9
+ # gpg: Good signature from "Daniel Veillard (Red Hat work email) <veillard@redhat.com>" [unknown]
10
+ # gpg: aka "Daniel Veillard <Daniel.Veillard@w3.org>" [unknown]
11
+ # gpg: WARNING: This key is not certified with a trusted signature!
12
+ # gpg: There is no indication that the signature belongs to the owner.
13
+ # Primary key fingerprint: C744 15BA 7C9C 7F78 F02E 1DC3 4606 B8A5 DE95 BC1F
14
+ # Subkey fingerprint: DB46 681B B91A DCEA 170F A2D4 1558 8B26 596B EA5D
15
+ #
16
+ # using this pgp signature:
17
+ #
18
+ # -----BEGIN PGP SIGNATURE-----
19
+ #
20
+ # iQEzBAABCAAdFiEE20ZoG7ka3OoXD6LUFViLJllr6l0FAmCddwQACgkQFViLJllr
21
+ # 6l11LQgAioRTdfmcC+uK/7+6HPtF/3c5zkX6j8VGYuvFBwZ0jayqMRBAl++fcpjE
22
+ # JUU/JKebSZ/KCYjzyeOWK/i3Gq77iqm3UbZFB85rqu4a5P3gmj/4STWVyAx0KU3z
23
+ # G3jKqDhJOt7c0acXb5lh2DngfDa1dn/VGcQcIXsqplNxNr4ET7MnSJjZ3nlxYfW2
24
+ # E5vWBdPCMUeXDBl6MjYvw9XnGGBLUAaEJWoFToG6jKmVf4GAd9nza20jj5dtbcJq
25
+ # QEOaSDKDr+f9h2NS8haOhJ9vOpy52PdeGzaFlbRkXarGXuAr8kITgATVs8FAqcgv
26
+ # MoVhmrO5r2hJf0dCM9fZoYqzpMfmNA==
27
+ # =KfJ9
28
+ # -----END PGP SIGNATURE-----
29
+ #
30
+
31
+ libxslt:
32
+ version: "1.1.34"
33
+ sha256: "98b1bd46d6792925ad2dfe9a87452ea2adebf69dcb9919ffd55bf926a7f93f7f"
34
+ # manually verified checksum:
35
+ #
36
+ # $ gpg --verify ~/Downloads/libxslt-1.1.34.tar.gz.asc ports/archives/libxslt-1.1.34.tar.gz
37
+ # gpg: Signature made Wed 30 Oct 2019 04:02:48 PM EDT
38
+ # gpg: using RSA key DB46681BB91ADCEA170FA2D415588B26596BEA5D
39
+ # gpg: Good signature from "Daniel Veillard (Red Hat work email) <veillard@redhat.com>" [unknown]
40
+ # gpg: aka "Daniel Veillard <Daniel.Veillard@w3.org>" [unknown]
41
+ # gpg: WARNING: This key is not certified with a trusted signature!
42
+ # gpg: There is no indication that the signature belongs to the owner.
43
+ # Primary key fingerprint: C744 15BA 7C9C 7F78 F02E 1DC3 4606 B8A5 DE95 BC1F
44
+ # Subkey fingerprint: DB46 681B B91A DCEA 170F A2D4 1558 8B26 596B EA5D
45
+ #
46
+ # using this pgp signature:
47
+ #
48
+ # -----BEGIN PGP SIGNATURE-----
49
+ #
50
+ # iQEzBAABCAAdFiEE20ZoG7ka3OoXD6LUFViLJllr6l0FAl257GgACgkQFViLJllr
51
+ # 6l2vVggAjJEHmASiS56SxhPOsGqbfBihM66gQFoIymQfMu2430N1GSTkLsfbkJO8
52
+ # 8yBX11NjzK/m9uxwshMW3rVCU7EpL3PUimN3reXdPiQj9hAOAWF1V3BZNevbQC2E
53
+ # FCIraioukaidf8sjUG4/sGpK/gOcP/3hYoN0HUoBigCNJjDqhijxM3M3GJJtCASp
54
+ # jL4CQbs2OmxW8ixOZbuWEESvFFHUgYRsdZjRVN+GRfSOvJjxypurmYwQ3RjO7JxL
55
+ # 2FY8qKQ+xpeID8NV8F5OUEvWBjk1QS133VTqBZNlONdnEtV/og6jNu5k0O/Kvhup
56
+ # caR+8TMErOcLr9OgDklO6DoYyAsf9Q==
57
+ # =g4i4
58
+ # -----END PGP SIGNATURE-----
59
+ #
60
+
61
+ zlib:
62
+ version: "1.2.11"
63
+ sha256: "c3e5e9fdd5004dcb542feda5ee4f0ff0744628baf8ed2dd5d66f8ca1197cb1a1"
64
+ # SHA-256 hash provided on http://zlib.net/
65
+
66
+ libiconv:
67
+ version: "1.15"
68
+ sha256: "ccf536620a45458d26ba83887a983b96827001e92a13847b45e4925cc8913178"
69
+ # gpg: Signature made Fri Feb 3 00:38:12 2017 CET
70
+ # gpg: using RSA key 4F494A942E4616C2
71
+ # gpg: Good signature from "Bruno Haible (Open Source Development) <bruno@clisp.org>" [unknown]
72
+ # gpg: WARNING: This key is not certified with a trusted signature!
73
+ # gpg: There is no indication that the signature belongs to the owner.
74
+ # Primary key fingerprint: 68D9 4D8A AEEA D48A E7DC 5B90 4F49 4A94 2E46 16C2