nokogiri 1.5.10 → 1.12.5

Sign up to get free protection for your applications and to get access to all the features.

Potentially problematic release.


This version of nokogiri might be problematic. Click here for more details.

Files changed (328) hide show
  1. checksums.yaml +7 -0
  2. data/Gemfile +3 -0
  3. data/LICENSE-DEPENDENCIES.md +1903 -0
  4. data/LICENSE.md +9 -0
  5. data/README.md +278 -0
  6. data/bin/nokogiri +50 -10
  7. data/dependencies.yml +74 -0
  8. data/ext/nokogiri/depend +38 -358
  9. data/ext/nokogiri/extconf.rb +944 -100
  10. data/ext/nokogiri/gumbo.c +584 -0
  11. data/ext/nokogiri/html4_document.c +166 -0
  12. data/ext/nokogiri/html4_element_description.c +294 -0
  13. data/ext/nokogiri/html4_entity_lookup.c +37 -0
  14. data/ext/nokogiri/html4_sax_parser_context.c +120 -0
  15. data/ext/nokogiri/html4_sax_push_parser.c +95 -0
  16. data/ext/nokogiri/libxml2_backwards_compat.c +121 -0
  17. data/ext/nokogiri/nokogiri.c +232 -87
  18. data/ext/nokogiri/nokogiri.h +188 -129
  19. data/ext/nokogiri/test_global_handlers.c +40 -0
  20. data/ext/nokogiri/xml_attr.c +49 -40
  21. data/ext/nokogiri/xml_attribute_decl.c +18 -18
  22. data/ext/nokogiri/xml_cdata.c +24 -23
  23. data/ext/nokogiri/xml_comment.c +29 -21
  24. data/ext/nokogiri/xml_document.c +305 -201
  25. data/ext/nokogiri/xml_document_fragment.c +13 -15
  26. data/ext/nokogiri/xml_dtd.c +54 -48
  27. data/ext/nokogiri/xml_element_content.c +31 -26
  28. data/ext/nokogiri/xml_element_decl.c +22 -22
  29. data/ext/nokogiri/xml_encoding_handler.c +30 -19
  30. data/ext/nokogiri/xml_entity_decl.c +32 -30
  31. data/ext/nokogiri/xml_entity_reference.c +16 -18
  32. data/ext/nokogiri/xml_namespace.c +74 -32
  33. data/ext/nokogiri/xml_node.c +808 -503
  34. data/ext/nokogiri/xml_node_set.c +239 -208
  35. data/ext/nokogiri/xml_processing_instruction.c +17 -19
  36. data/ext/nokogiri/xml_reader.c +198 -186
  37. data/ext/nokogiri/xml_relax_ng.c +52 -28
  38. data/ext/nokogiri/xml_sax_parser.c +123 -125
  39. data/ext/nokogiri/xml_sax_parser_context.c +138 -79
  40. data/ext/nokogiri/xml_sax_push_parser.c +88 -35
  41. data/ext/nokogiri/xml_schema.c +112 -33
  42. data/ext/nokogiri/xml_syntax_error.c +50 -23
  43. data/ext/nokogiri/xml_text.c +14 -18
  44. data/ext/nokogiri/xml_xpath_context.c +162 -98
  45. data/ext/nokogiri/xslt_stylesheet.c +162 -168
  46. data/gumbo-parser/CHANGES.md +63 -0
  47. data/gumbo-parser/Makefile +101 -0
  48. data/gumbo-parser/THANKS +27 -0
  49. data/gumbo-parser/src/Makefile +34 -0
  50. data/gumbo-parser/src/README.md +41 -0
  51. data/gumbo-parser/src/ascii.c +75 -0
  52. data/gumbo-parser/src/ascii.h +115 -0
  53. data/gumbo-parser/src/attribute.c +42 -0
  54. data/gumbo-parser/src/attribute.h +17 -0
  55. data/gumbo-parser/src/char_ref.c +22225 -0
  56. data/gumbo-parser/src/char_ref.h +29 -0
  57. data/gumbo-parser/src/char_ref.rl +2154 -0
  58. data/gumbo-parser/src/error.c +626 -0
  59. data/gumbo-parser/src/error.h +148 -0
  60. data/gumbo-parser/src/foreign_attrs.c +104 -0
  61. data/gumbo-parser/src/foreign_attrs.gperf +27 -0
  62. data/gumbo-parser/src/gumbo.h +943 -0
  63. data/gumbo-parser/src/insertion_mode.h +33 -0
  64. data/gumbo-parser/src/macros.h +91 -0
  65. data/gumbo-parser/src/parser.c +4886 -0
  66. data/gumbo-parser/src/parser.h +41 -0
  67. data/gumbo-parser/src/replacement.h +33 -0
  68. data/gumbo-parser/src/string_buffer.c +103 -0
  69. data/gumbo-parser/src/string_buffer.h +68 -0
  70. data/gumbo-parser/src/string_piece.c +48 -0
  71. data/gumbo-parser/src/svg_attrs.c +174 -0
  72. data/gumbo-parser/src/svg_attrs.gperf +77 -0
  73. data/gumbo-parser/src/svg_tags.c +137 -0
  74. data/gumbo-parser/src/svg_tags.gperf +55 -0
  75. data/gumbo-parser/src/tag.c +222 -0
  76. data/gumbo-parser/src/tag_lookup.c +382 -0
  77. data/gumbo-parser/src/tag_lookup.gperf +169 -0
  78. data/gumbo-parser/src/tag_lookup.h +13 -0
  79. data/gumbo-parser/src/token_buffer.c +79 -0
  80. data/gumbo-parser/src/token_buffer.h +71 -0
  81. data/gumbo-parser/src/token_type.h +17 -0
  82. data/gumbo-parser/src/tokenizer.c +3463 -0
  83. data/gumbo-parser/src/tokenizer.h +112 -0
  84. data/gumbo-parser/src/tokenizer_states.h +339 -0
  85. data/gumbo-parser/src/utf8.c +245 -0
  86. data/gumbo-parser/src/utf8.h +164 -0
  87. data/gumbo-parser/src/util.c +68 -0
  88. data/gumbo-parser/src/util.h +30 -0
  89. data/gumbo-parser/src/vector.c +111 -0
  90. data/gumbo-parser/src/vector.h +45 -0
  91. data/lib/nokogiri/css/node.rb +1 -50
  92. data/lib/nokogiri/css/parser.rb +317 -286
  93. data/lib/nokogiri/css/parser.y +57 -43
  94. data/lib/nokogiri/css/parser_extras.rb +39 -36
  95. data/lib/nokogiri/css/syntax_error.rb +2 -1
  96. data/lib/nokogiri/css/tokenizer.rb +105 -103
  97. data/lib/nokogiri/css/tokenizer.rex +5 -5
  98. data/lib/nokogiri/css/xpath_visitor.rb +137 -48
  99. data/lib/nokogiri/css.rb +15 -14
  100. data/lib/nokogiri/decorators/slop.rb +13 -5
  101. data/lib/nokogiri/extension.rb +31 -0
  102. data/lib/nokogiri/gumbo.rb +14 -0
  103. data/lib/nokogiri/html.rb +32 -27
  104. data/lib/nokogiri/{html → html4}/builder.rb +3 -2
  105. data/lib/nokogiri/{html → html4}/document.rb +118 -50
  106. data/lib/nokogiri/{html → html4}/document_fragment.rb +20 -11
  107. data/lib/nokogiri/{html → html4}/element_description.rb +2 -1
  108. data/lib/nokogiri/{html → html4}/element_description_defaults.rb +2 -1
  109. data/lib/nokogiri/{html → html4}/entity_lookup.rb +2 -1
  110. data/lib/nokogiri/{html → html4}/sax/parser.rb +22 -14
  111. data/lib/nokogiri/html4/sax/parser_context.rb +19 -0
  112. data/lib/nokogiri/html4/sax/push_parser.rb +37 -0
  113. data/lib/nokogiri/html4.rb +40 -0
  114. data/lib/nokogiri/html5/document.rb +74 -0
  115. data/lib/nokogiri/html5/document_fragment.rb +80 -0
  116. data/lib/nokogiri/html5/node.rb +93 -0
  117. data/lib/nokogiri/html5.rb +473 -0
  118. data/lib/nokogiri/jruby/dependencies.rb +20 -0
  119. data/lib/nokogiri/syntax_error.rb +1 -0
  120. data/lib/nokogiri/version/constant.rb +5 -0
  121. data/lib/nokogiri/version/info.rb +215 -0
  122. data/lib/nokogiri/version.rb +3 -91
  123. data/lib/nokogiri/xml/attr.rb +1 -0
  124. data/lib/nokogiri/xml/attribute_decl.rb +1 -0
  125. data/lib/nokogiri/xml/builder.rb +75 -33
  126. data/lib/nokogiri/xml/cdata.rb +1 -0
  127. data/lib/nokogiri/xml/character_data.rb +1 -0
  128. data/lib/nokogiri/xml/document.rb +157 -54
  129. data/lib/nokogiri/xml/document_fragment.rb +55 -8
  130. data/lib/nokogiri/xml/dtd.rb +15 -4
  131. data/lib/nokogiri/xml/element_content.rb +1 -0
  132. data/lib/nokogiri/xml/element_decl.rb +1 -0
  133. data/lib/nokogiri/xml/entity_decl.rb +1 -0
  134. data/lib/nokogiri/xml/entity_reference.rb +19 -0
  135. data/lib/nokogiri/xml/namespace.rb +1 -0
  136. data/lib/nokogiri/xml/node/save_options.rb +2 -1
  137. data/lib/nokogiri/xml/node.rb +712 -431
  138. data/lib/nokogiri/xml/node_set.rb +140 -123
  139. data/lib/nokogiri/xml/notation.rb +1 -0
  140. data/lib/nokogiri/xml/parse_options.rb +31 -0
  141. data/lib/nokogiri/xml/pp/character_data.rb +1 -0
  142. data/lib/nokogiri/xml/pp/node.rb +1 -0
  143. data/lib/nokogiri/xml/pp.rb +3 -2
  144. data/lib/nokogiri/xml/processing_instruction.rb +1 -0
  145. data/lib/nokogiri/xml/reader.rb +9 -12
  146. data/lib/nokogiri/xml/relax_ng.rb +7 -2
  147. data/lib/nokogiri/xml/sax/document.rb +25 -30
  148. data/lib/nokogiri/xml/sax/parser.rb +8 -8
  149. data/lib/nokogiri/xml/sax/parser_context.rb +1 -0
  150. data/lib/nokogiri/xml/sax/push_parser.rb +1 -0
  151. data/lib/nokogiri/xml/sax.rb +5 -4
  152. data/lib/nokogiri/xml/schema.rb +13 -4
  153. data/lib/nokogiri/xml/searchable.rb +239 -0
  154. data/lib/nokogiri/xml/syntax_error.rb +25 -1
  155. data/lib/nokogiri/xml/text.rb +1 -0
  156. data/lib/nokogiri/xml/xpath/syntax_error.rb +2 -1
  157. data/lib/nokogiri/xml/xpath.rb +4 -5
  158. data/lib/nokogiri/xml/xpath_context.rb +1 -0
  159. data/lib/nokogiri/xml.rb +37 -35
  160. data/lib/nokogiri/xslt/stylesheet.rb +2 -1
  161. data/lib/nokogiri/xslt.rb +17 -16
  162. data/lib/nokogiri.rb +55 -58
  163. data/lib/xsd/xmlparser/nokogiri.rb +1 -0
  164. data/patches/libxml2/0001-Remove-script-macro-support.patch +40 -0
  165. data/patches/libxml2/0002-Update-entities-to-remove-handling-of-ssi.patch +44 -0
  166. data/patches/libxml2/0003-libxml2.la-is-in-top_builddir.patch +25 -0
  167. data/patches/libxml2/0004-use-glibc-strlen.patch +53 -0
  168. data/patches/libxml2/0005-avoid-isnan-isinf.patch +81 -0
  169. data/patches/libxml2/0006-update-automake-files-for-arm64.patch +2511 -0
  170. data/patches/libxml2/0007-Fix-XPath-recursion-limit.patch +31 -0
  171. data/patches/libxslt/0001-update-automake-files-for-arm64.patch +2511 -0
  172. data/patches/libxslt/0002-Fix-xml2-config-check-in-configure-script.patch +19 -0
  173. data/ports/archives/libxml2-2.9.12.tar.gz +0 -0
  174. data/ports/archives/libxslt-1.1.34.tar.gz +0 -0
  175. metadata +307 -459
  176. data/.autotest +0 -26
  177. data/.gemtest +0 -0
  178. data/CHANGELOG.ja.rdoc +0 -785
  179. data/CHANGELOG.rdoc +0 -783
  180. data/C_CODING_STYLE.rdoc +0 -33
  181. data/Manifest.txt +0 -303
  182. data/README.ja.rdoc +0 -106
  183. data/README.rdoc +0 -175
  184. data/ROADMAP.md +0 -90
  185. data/Rakefile +0 -228
  186. data/STANDARD_RESPONSES.md +0 -47
  187. data/Y_U_NO_GEMSPEC.md +0 -155
  188. data/build_all +0 -105
  189. data/ext/nokogiri/html_document.c +0 -170
  190. data/ext/nokogiri/html_document.h +0 -10
  191. data/ext/nokogiri/html_element_description.c +0 -279
  192. data/ext/nokogiri/html_element_description.h +0 -10
  193. data/ext/nokogiri/html_entity_lookup.c +0 -32
  194. data/ext/nokogiri/html_entity_lookup.h +0 -8
  195. data/ext/nokogiri/html_sax_parser_context.c +0 -116
  196. data/ext/nokogiri/html_sax_parser_context.h +0 -11
  197. data/ext/nokogiri/html_sax_push_parser.c +0 -87
  198. data/ext/nokogiri/html_sax_push_parser.h +0 -9
  199. data/ext/nokogiri/xml_attr.h +0 -9
  200. data/ext/nokogiri/xml_attribute_decl.h +0 -9
  201. data/ext/nokogiri/xml_cdata.h +0 -9
  202. data/ext/nokogiri/xml_comment.h +0 -9
  203. data/ext/nokogiri/xml_document.h +0 -23
  204. data/ext/nokogiri/xml_document_fragment.h +0 -10
  205. data/ext/nokogiri/xml_dtd.h +0 -10
  206. data/ext/nokogiri/xml_element_content.h +0 -10
  207. data/ext/nokogiri/xml_element_decl.h +0 -9
  208. data/ext/nokogiri/xml_encoding_handler.h +0 -8
  209. data/ext/nokogiri/xml_entity_decl.h +0 -10
  210. data/ext/nokogiri/xml_entity_reference.h +0 -9
  211. data/ext/nokogiri/xml_io.c +0 -56
  212. data/ext/nokogiri/xml_io.h +0 -11
  213. data/ext/nokogiri/xml_libxml2_hacks.c +0 -112
  214. data/ext/nokogiri/xml_libxml2_hacks.h +0 -12
  215. data/ext/nokogiri/xml_namespace.h +0 -13
  216. data/ext/nokogiri/xml_node.h +0 -13
  217. data/ext/nokogiri/xml_node_set.h +0 -14
  218. data/ext/nokogiri/xml_processing_instruction.h +0 -9
  219. data/ext/nokogiri/xml_reader.h +0 -10
  220. data/ext/nokogiri/xml_relax_ng.h +0 -9
  221. data/ext/nokogiri/xml_sax_parser.h +0 -39
  222. data/ext/nokogiri/xml_sax_parser_context.h +0 -10
  223. data/ext/nokogiri/xml_sax_push_parser.h +0 -9
  224. data/ext/nokogiri/xml_schema.h +0 -9
  225. data/ext/nokogiri/xml_syntax_error.h +0 -13
  226. data/ext/nokogiri/xml_text.h +0 -9
  227. data/ext/nokogiri/xml_xpath_context.h +0 -10
  228. data/ext/nokogiri/xslt_stylesheet.h +0 -14
  229. data/lib/nokogiri/html/sax/parser_context.rb +0 -16
  230. data/lib/nokogiri/html/sax/push_parser.rb +0 -16
  231. data/tasks/cross_compile.rb +0 -150
  232. data/tasks/nokogiri.org.rb +0 -24
  233. data/tasks/test.rb +0 -95
  234. data/test/css/test_nthiness.rb +0 -159
  235. data/test/css/test_parser.rb +0 -341
  236. data/test/css/test_tokenizer.rb +0 -198
  237. data/test/css/test_xpath_visitor.rb +0 -91
  238. data/test/decorators/test_slop.rb +0 -16
  239. data/test/files/2ch.html +0 -108
  240. data/test/files/address_book.rlx +0 -12
  241. data/test/files/address_book.xml +0 -10
  242. data/test/files/bar/bar.xsd +0 -4
  243. data/test/files/dont_hurt_em_why.xml +0 -422
  244. data/test/files/encoding.html +0 -82
  245. data/test/files/encoding.xhtml +0 -84
  246. data/test/files/exslt.xml +0 -8
  247. data/test/files/exslt.xslt +0 -35
  248. data/test/files/foo/foo.xsd +0 -4
  249. data/test/files/metacharset.html +0 -10
  250. data/test/files/noencoding.html +0 -47
  251. data/test/files/po.xml +0 -32
  252. data/test/files/po.xsd +0 -66
  253. data/test/files/shift_jis.html +0 -10
  254. data/test/files/shift_jis.xml +0 -5
  255. data/test/files/snuggles.xml +0 -3
  256. data/test/files/staff.dtd +0 -10
  257. data/test/files/staff.xml +0 -59
  258. data/test/files/staff.xslt +0 -32
  259. data/test/files/test_document_url/bar.xml +0 -2
  260. data/test/files/test_document_url/document.dtd +0 -4
  261. data/test/files/test_document_url/document.xml +0 -6
  262. data/test/files/tlm.html +0 -850
  263. data/test/files/to_be_xincluded.xml +0 -2
  264. data/test/files/valid_bar.xml +0 -2
  265. data/test/files/xinclude.xml +0 -4
  266. data/test/helper.rb +0 -154
  267. data/test/html/sax/test_parser.rb +0 -141
  268. data/test/html/sax/test_parser_context.rb +0 -46
  269. data/test/html/test_builder.rb +0 -164
  270. data/test/html/test_document.rb +0 -552
  271. data/test/html/test_document_encoding.rb +0 -138
  272. data/test/html/test_document_fragment.rb +0 -261
  273. data/test/html/test_element_description.rb +0 -105
  274. data/test/html/test_named_characters.rb +0 -14
  275. data/test/html/test_node.rb +0 -196
  276. data/test/html/test_node_encoding.rb +0 -27
  277. data/test/namespaces/test_additional_namespaces_in_builder_doc.rb +0 -14
  278. data/test/namespaces/test_namespaces_in_builder_doc.rb +0 -75
  279. data/test/namespaces/test_namespaces_in_created_doc.rb +0 -75
  280. data/test/namespaces/test_namespaces_in_parsed_doc.rb +0 -66
  281. data/test/test_convert_xpath.rb +0 -135
  282. data/test/test_css_cache.rb +0 -45
  283. data/test/test_encoding_handler.rb +0 -46
  284. data/test/test_memory_leak.rb +0 -156
  285. data/test/test_nokogiri.rb +0 -132
  286. data/test/test_reader.rb +0 -555
  287. data/test/test_soap4r_sax.rb +0 -52
  288. data/test/test_xslt_transforms.rb +0 -254
  289. data/test/xml/node/test_save_options.rb +0 -28
  290. data/test/xml/node/test_subclass.rb +0 -44
  291. data/test/xml/sax/test_parser.rb +0 -366
  292. data/test/xml/sax/test_parser_context.rb +0 -106
  293. data/test/xml/sax/test_push_parser.rb +0 -157
  294. data/test/xml/test_attr.rb +0 -64
  295. data/test/xml/test_attribute_decl.rb +0 -86
  296. data/test/xml/test_builder.rb +0 -306
  297. data/test/xml/test_c14n.rb +0 -151
  298. data/test/xml/test_cdata.rb +0 -48
  299. data/test/xml/test_comment.rb +0 -29
  300. data/test/xml/test_document.rb +0 -828
  301. data/test/xml/test_document_encoding.rb +0 -28
  302. data/test/xml/test_document_fragment.rb +0 -223
  303. data/test/xml/test_dtd.rb +0 -103
  304. data/test/xml/test_dtd_encoding.rb +0 -33
  305. data/test/xml/test_element_content.rb +0 -56
  306. data/test/xml/test_element_decl.rb +0 -73
  307. data/test/xml/test_entity_decl.rb +0 -122
  308. data/test/xml/test_entity_reference.rb +0 -245
  309. data/test/xml/test_namespace.rb +0 -95
  310. data/test/xml/test_node.rb +0 -1137
  311. data/test/xml/test_node_attributes.rb +0 -96
  312. data/test/xml/test_node_encoding.rb +0 -107
  313. data/test/xml/test_node_inheritance.rb +0 -32
  314. data/test/xml/test_node_reparenting.rb +0 -374
  315. data/test/xml/test_node_set.rb +0 -755
  316. data/test/xml/test_parse_options.rb +0 -64
  317. data/test/xml/test_processing_instruction.rb +0 -30
  318. data/test/xml/test_reader_encoding.rb +0 -142
  319. data/test/xml/test_relax_ng.rb +0 -60
  320. data/test/xml/test_schema.rb +0 -103
  321. data/test/xml/test_syntax_error.rb +0 -12
  322. data/test/xml/test_text.rb +0 -45
  323. data/test/xml/test_unparented_node.rb +0 -422
  324. data/test/xml/test_xinclude.rb +0 -83
  325. data/test/xml/test_xpath.rb +0 -295
  326. data/test/xslt/test_custom_functions.rb +0 -133
  327. data/test/xslt/test_exception_handling.rb +0 -37
  328. data/test_all +0 -81
@@ -1,28 +1,67 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'pathname'
4
+
1
5
  module Nokogiri
2
- module HTML
6
+ module HTML4
3
7
  class Document < Nokogiri::XML::Document
4
8
  ###
5
9
  # Get the meta tag encoding for this document. If there is no meta tag,
6
10
  # then nil is returned.
7
11
  def meta_encoding
8
- meta = meta_content_type and
9
- match = /charset\s*=\s*([\w-]+)/i.match(meta['content']) and
10
- match[1]
12
+ case
13
+ when meta = at('//meta[@charset]')
14
+ meta[:charset]
15
+ when meta = meta_content_type
16
+ meta['content'][/charset\s*=\s*([\w-]+)/i, 1]
17
+ end
11
18
  end
12
19
 
13
20
  ###
14
- # Set the meta tag encoding for this document. If there is no meta
15
- # content tag, the encoding is not set.
21
+ # Set the meta tag encoding for this document.
22
+ #
23
+ # If a meta encoding tag is already present, its content is
24
+ # replaced with the given text.
25
+ #
26
+ # Otherwise, this method tries to create one at an appropriate
27
+ # place supplying head and/or html elements as necessary, which
28
+ # is inside a head element if any, and before any text node or
29
+ # content element (typically <body>) if any.
30
+ #
31
+ # The result when trying to set an encoding that is different
32
+ # from the document encoding is undefined.
33
+ #
34
+ # Beware in CRuby, that libxml2 automatically inserts a meta tag
35
+ # into a head element.
16
36
  def meta_encoding= encoding
17
- meta = meta_content_type and
18
- meta['content'] = "text/html; charset=%s" % encoding
37
+ case
38
+ when meta = meta_content_type
39
+ meta['content'] = 'text/html; charset=%s' % encoding
40
+ encoding
41
+ when meta = at('//meta[@charset]')
42
+ meta['charset'] = encoding
43
+ else
44
+ meta = XML::Node.new('meta', self)
45
+ if dtd = internal_subset and dtd.html5_dtd?
46
+ meta['charset'] = encoding
47
+ else
48
+ meta['http-equiv'] = 'Content-Type'
49
+ meta['content'] = 'text/html; charset=%s' % encoding
50
+ end
51
+
52
+ case
53
+ when head = at('//head')
54
+ head.prepend_child(meta)
55
+ else
56
+ set_metadata_element(meta)
57
+ end
58
+ encoding
59
+ end
19
60
  end
20
61
 
21
62
  def meta_content_type
22
- css('meta[@http-equiv]').find { |node|
23
- node['http-equiv'] =~ /\AContent-Type\z/i and
24
- !node['content'].nil? and
25
- !node['content'].empty?
63
+ xpath('//meta[@http-equiv and boolean(@content)]').find { |node|
64
+ node['http-equiv'] =~ /\AContent-Type\z/i
26
65
  }
27
66
  end
28
67
  private :meta_content_type
@@ -31,21 +70,65 @@ module Nokogiri
31
70
  # Get the title string of this document. Return nil if there is
32
71
  # no title tag.
33
72
  def title
34
- title = at('title') and title.inner_text
73
+ title = at('//title') and title.inner_text
35
74
  end
36
75
 
37
76
  ###
38
- # Set the title string of this document. If there is no head
39
- # element, the title is not set.
77
+ # Set the title string of this document.
78
+ #
79
+ # If a title element is already present, its content is replaced
80
+ # with the given text.
81
+ #
82
+ # Otherwise, this method tries to create one at an appropriate
83
+ # place supplying head and/or html elements as necessary, which
84
+ # is inside a head element if any, right after a meta
85
+ # encoding/charset tag if any, and before any text node or
86
+ # content element (typically <body>) if any.
40
87
  def title=(text)
41
- unless title = at('title')
42
- head = at('head') or return nil
43
- title = Nokogiri::XML::Node.new('title', self)
88
+ tnode = XML::Text.new(text, self)
89
+ if title = at('//title')
90
+ title.children = tnode
91
+ return text
92
+ end
93
+
94
+ title = XML::Node.new('title', self) << tnode
95
+ case
96
+ when head = at('//head')
44
97
  head << title
98
+ when meta = at('//meta[@charset]') || meta_content_type
99
+ # better put after charset declaration
100
+ meta.add_next_sibling(title)
101
+ else
102
+ set_metadata_element(title)
45
103
  end
46
- title.children = XML::Text.new(text, self)
104
+ text
47
105
  end
48
106
 
107
+ def set_metadata_element(element)
108
+ case
109
+ when head = at('//head')
110
+ head << element
111
+ when html = at('//html')
112
+ head = html.prepend_child(XML::Node.new('head', self))
113
+ head.prepend_child(element)
114
+ when first = children.find { |node|
115
+ case node
116
+ when XML::Element, XML::Text
117
+ true
118
+ end
119
+ }
120
+ # We reach here only if the underlying document model
121
+ # allows <html>/<head> elements to be omitted and does not
122
+ # automatically supply them.
123
+ first.add_previous_sibling(element)
124
+ else
125
+ html = add_child(XML::Node.new('html', self))
126
+ head = html.add_child(XML::Node.new('head', self))
127
+ head.prepend_child(element)
128
+ end
129
+ end
130
+ private :set_metadata_element
131
+
49
132
  ####
50
133
  # Serialize Node using +options+. Save options can also be set using a
51
134
  # block. See SaveOptions.
@@ -81,11 +164,12 @@ module Nokogiri
81
164
  # Nokogiri::XML::ParseOptions::RECOVER. See the constants in
82
165
  # Nokogiri::XML::ParseOptions.
83
166
  def parse string_or_io, url = nil, encoding = nil, options = XML::ParseOptions::DEFAULT_HTML
167
+ options = Nokogiri::XML::ParseOptions.new(options) if Integer === options
84
168
 
85
- options = Nokogiri::XML::ParseOptions.new(options) if Fixnum === options
86
- # Give the options to the user
87
169
  yield options if block_given?
88
170
 
171
+ url ||= string_or_io.respond_to?(:path) ? string_or_io.path : nil
172
+
89
173
  if string_or_io.respond_to?(:encoding)
90
174
  unless string_or_io.encoding.name == "ASCII-8BIT"
91
175
  encoding ||= string_or_io.encoding.name
@@ -93,8 +177,13 @@ module Nokogiri
93
177
  end
94
178
 
95
179
  if string_or_io.respond_to?(:read)
96
- url ||= string_or_io.respond_to?(:path) ? string_or_io.path : nil
97
- if !encoding
180
+ if string_or_io.is_a?(Pathname)
181
+ # resolve the Pathname to the file and open it as an IO object, see #2110
182
+ string_or_io = string_or_io.expand_path.open
183
+ url ||= string_or_io.path
184
+ end
185
+
186
+ unless encoding
98
187
  # Libxml2's parser has poor support for encoding
99
188
  # detection. First, it does not recognize the HTML5
100
189
  # style meta charset declaration. Secondly, even if it
@@ -117,7 +206,9 @@ module Nokogiri
117
206
  end
118
207
 
119
208
  # read_memory pukes on empty docs
120
- return new if string_or_io.nil? or string_or_io.empty?
209
+ if string_or_io.nil? or string_or_io.empty?
210
+ return encoding ? new.tap { |i| i.encoding = encoding } : new
211
+ end
121
212
 
122
213
  encoding ||= EncodingReader.detect_encoding(string_or_io)
123
214
 
@@ -170,9 +261,6 @@ module Nokogiri
170
261
  end
171
262
 
172
263
  def self.detect_encoding(chunk)
173
- if Nokogiri.jruby? && EncodingReader.is_jruby_without_fix?
174
- return EncodingReader.detect_encoding_for_jruby_without_fix(chunk)
175
- end
176
264
  m = chunk.match(/\A(<\?xml[ \t\r\n]+[^>]*>)/) and
177
265
  return Nokogiri.XML(m[1]).encoding
178
266
 
@@ -180,37 +268,17 @@ module Nokogiri
180
268
  m = chunk.match(/(<meta\s)(.*)(charset\s*=\s*([\w-]+))(.*)/i) and
181
269
  return m[4]
182
270
  catch(:encoding_found) {
183
- Nokogiri::HTML::SAX::Parser.new(JumpSAXHandler.new(:encoding_found)).parse(chunk)
271
+ Nokogiri::HTML4::SAX::Parser.new(JumpSAXHandler.new(:encoding_found)).parse(chunk)
184
272
  nil
185
273
  }
186
274
  else
187
275
  handler = SAXHandler.new
188
- parser = Nokogiri::HTML::SAX::PushParser.new(handler)
276
+ parser = Nokogiri::HTML4::SAX::PushParser.new(handler)
189
277
  parser << chunk rescue Nokogiri::SyntaxError
190
278
  handler.encoding
191
279
  end
192
280
  end
193
281
 
194
- def self.is_jruby_without_fix?
195
- JRUBY_VERSION.split('.').join.to_i < 165
196
- end
197
-
198
- def self.detect_encoding_for_jruby_without_fix(chunk)
199
- m = chunk.match(/\A(<\?xml[ \t\r\n]+[^>]*>)/) and
200
- return Nokogiri.XML(m[1]).encoding
201
-
202
- m = chunk.match(/(<meta\s)(.*)(charset\s*=\s*([\w-]+))(.*)/i) and
203
- return m[4]
204
-
205
- catch(:encoding_found) {
206
- Nokogiri::HTML::SAX::Parser.new(JumpSAXHandler.new(:encoding_found.to_s)).parse(chunk)
207
- nil
208
- }
209
- rescue Nokogiri::SyntaxError, RuntimeError
210
- # Ignore parser errors that nokogiri may raise
211
- nil
212
- end
213
-
214
282
  def initialize(io)
215
283
  @io = io
216
284
  @firstchunk = nil
@@ -218,7 +286,7 @@ module Nokogiri
218
286
  end
219
287
 
220
288
  # This method is used by the C extension so that
221
- # Nokogiri::HTML::Document#read_io() does not leak memory when
289
+ # Nokogiri::HTML4::Document#read_io() does not leak memory when
222
290
  # EncodingFound is raised.
223
291
  attr_reader :encoding_found
224
292
 
@@ -1,20 +1,29 @@
1
+ # frozen_string_literal: true
1
2
  module Nokogiri
2
- module HTML
3
+ module HTML4
3
4
  class DocumentFragment < Nokogiri::XML::DocumentFragment
4
- attr_accessor :errors
5
-
6
5
  ####
7
6
  # Create a Nokogiri::XML::DocumentFragment from +tags+, using +encoding+
8
- def self.parse tags, encoding = nil
9
- doc = HTML::Document.new
7
+ def self.parse(tags, encoding = nil)
8
+ doc = HTML4::Document.new
9
+
10
+ encoding ||= if tags.respond_to?(:encoding)
11
+ encoding = tags.encoding
12
+ if encoding == ::Encoding::ASCII_8BIT
13
+ 'UTF-8'
14
+ else
15
+ encoding.name
16
+ end
17
+ else
18
+ 'UTF-8'
19
+ end
10
20
 
11
- encoding ||= tags.respond_to?(:encoding) ? tags.encoding.name : 'UTF-8'
12
21
  doc.encoding = encoding
13
22
 
14
23
  new(doc, tags)
15
24
  end
16
25
 
17
- def initialize document, tags = nil, ctx = nil
26
+ def initialize(document, tags = nil, ctx = nil)
18
27
  return self unless tags
19
28
 
20
29
  if ctx
@@ -24,13 +33,13 @@ module Nokogiri
24
33
  self.errors = document.errors - preexisting_errors
25
34
  else
26
35
  # This is a horrible hack, but I don't care
27
- if tags.strip =~ /^<body/i
28
- path = "/html/body"
36
+ path = if /^\s*?<body/i.match?(tags)
37
+ "/html/body"
29
38
  else
30
- path = "/html/body/node()"
39
+ "/html/body/node()"
31
40
  end
32
41
 
33
- temp_doc = HTML::Document.parse "<html><body>#{tags}", nil, document.encoding
42
+ temp_doc = HTML4::Document.parse("<html><body>#{tags}", nil, document.encoding)
34
43
  temp_doc.xpath(path).each { |child| child.parent = self }
35
44
  self.errors = temp_doc.errors
36
45
  end
@@ -1,5 +1,6 @@
1
+ # frozen_string_literal: true
1
2
  module Nokogiri
2
- module HTML
3
+ module HTML4
3
4
  class ElementDescription
4
5
  ###
5
6
  # Is this element a block element?
@@ -1,5 +1,6 @@
1
+ # frozen_string_literal: true
1
2
  module Nokogiri
2
- module HTML
3
+ module HTML4
3
4
  class ElementDescription
4
5
 
5
6
  # Methods are defined protected by method_defined? because at
@@ -1,5 +1,6 @@
1
+ # frozen_string_literal: true
1
2
  module Nokogiri
2
- module HTML
3
+ module HTML4
3
4
  class EntityDescription < Struct.new(:value, :name, :description); end
4
5
 
5
6
  class EntityLookup
@@ -1,17 +1,15 @@
1
+ # frozen_string_literal: true
1
2
  module Nokogiri
2
- module HTML
3
+ module HTML4
3
4
  ###
4
- # Nokogiri lets you write a SAX parser to process HTML but get HTML
5
- # correction features.
5
+ # Nokogiri lets you write a SAX parser to process HTML but get HTML correction features.
6
6
  #
7
- # See Nokogiri::HTML::SAX::Parser for a basic example of using a
8
- # SAX parser with HTML.
7
+ # See Nokogiri::HTML4::SAX::Parser for a basic example of using a SAX parser with HTML.
9
8
  #
10
9
  # For more information on SAX parsers, see Nokogiri::XML::SAX
11
10
  module SAX
12
11
  ###
13
- # This class lets you perform SAX style parsing on HTML with HTML
14
- # error correction.
12
+ # This class lets you perform SAX style parsing on HTML with HTML error correction.
15
13
  #
16
14
  # Here is a basic usage example:
17
15
  #
@@ -21,30 +19,40 @@ module Nokogiri
21
19
  # end
22
20
  # end
23
21
  #
24
- # parser = Nokogiri::HTML::SAX::Parser.new(MyDoc.new)
25
- # parser.parse(File.read(ARGV[0], 'rb'))
22
+ # parser = Nokogiri::HTML4::SAX::Parser.new(MyDoc.new)
23
+ # parser.parse(File.read(ARGV[0], mode: 'rb'))
26
24
  #
27
25
  # For more information on SAX parsers, see Nokogiri::XML::SAX
28
26
  class Parser < Nokogiri::XML::SAX::Parser
29
27
  ###
30
28
  # Parse html stored in +data+ using +encoding+
31
- def parse_memory data, encoding = 'UTF-8'
29
+ def parse_memory(data, encoding = "UTF-8")
32
30
  raise ArgumentError unless data
33
31
  return unless data.length > 0
34
32
  ctx = ParserContext.memory(data, encoding)
35
33
  yield ctx if block_given?
36
- ctx.parse_with self
34
+ ctx.parse_with(self)
35
+ end
36
+
37
+ ###
38
+ # Parse given +io+
39
+ def parse_io(io, encoding = "UTF-8")
40
+ check_encoding(encoding)
41
+ @encoding = encoding
42
+ ctx = ParserContext.io(io, ENCODINGS[encoding])
43
+ yield ctx if block_given?
44
+ ctx.parse_with(self)
37
45
  end
38
46
 
39
47
  ###
40
48
  # Parse a file with +filename+
41
- def parse_file filename, encoding = 'UTF-8'
49
+ def parse_file(filename, encoding = "UTF-8")
42
50
  raise ArgumentError unless filename
43
- raise Errno::ENOENT unless File.exists?(filename)
51
+ raise Errno::ENOENT unless File.exist?(filename)
44
52
  raise Errno::EISDIR if File.directory?(filename)
45
53
  ctx = ParserContext.file(filename, encoding)
46
54
  yield ctx if block_given?
47
- ctx.parse_with self
55
+ ctx.parse_with(self)
48
56
  end
49
57
  end
50
58
  end
@@ -0,0 +1,19 @@
1
+ # frozen_string_literal: true
2
+ module Nokogiri
3
+ module HTML4
4
+ module SAX
5
+ ###
6
+ # Context for HTML SAX parsers. This class is usually not instantiated by the user. Instead,
7
+ # you should be looking at Nokogiri::HTML4::SAX::Parser
8
+ class ParserContext < Nokogiri::XML::SAX::ParserContext
9
+ def self.new(thing, encoding = "UTF-8")
10
+ if [:read, :close].all? { |x| thing.respond_to?(x) }
11
+ super
12
+ else
13
+ memory(thing, encoding)
14
+ end
15
+ end
16
+ end
17
+ end
18
+ end
19
+ end
@@ -0,0 +1,37 @@
1
+ # frozen_string_literal: true
2
+ module Nokogiri
3
+ module HTML4
4
+ module SAX
5
+ class PushParser
6
+
7
+ # The Nokogiri::HTML4::SAX::Document on which the PushParser will be
8
+ # operating
9
+ attr_accessor :document
10
+
11
+ def initialize(doc = HTML4::SAX::Document.new, file_name = nil, encoding = 'UTF-8')
12
+ @document = doc
13
+ @encoding = encoding
14
+ @sax_parser = HTML4::SAX::Parser.new(doc, @encoding)
15
+
16
+ ## Create our push parser context
17
+ initialize_native(@sax_parser, file_name, encoding)
18
+ end
19
+
20
+ ###
21
+ # Write a +chunk+ of HTML to the PushParser. Any callback methods
22
+ # that can be called will be called immediately.
23
+ def write chunk, last_chunk = false
24
+ native_write(chunk, last_chunk)
25
+ end
26
+ alias :<< :write
27
+
28
+ ###
29
+ # Finish the parsing. This method is only necessary for
30
+ # Nokogiri::HTML4::SAX::Document#end_document to be called.
31
+ def finish
32
+ write '', true
33
+ end
34
+ end
35
+ end
36
+ end
37
+ end
@@ -0,0 +1,40 @@
1
+ # frozen_string_literal: true
2
module Nokogiri
  ###
  # Parse HTML. Shortcut that forwards every argument, and the block, to
  # Nokogiri::HTML4::Document.parse
  def self.HTML4(input, url = nil, encoding = nil, options = XML::ParseOptions::DEFAULT_HTML, &block)
    Nokogiri::HTML4::Document.parse(input, url, encoding, options, &block)
  end

  # @since v1.12.0
  # @note Before v1.12.0, {Nokogiri::HTML4} did not exist, and {Nokogiri::HTML} was the module/namespace for parsing HTML.
  module HTML4
    ###
    # Parse HTML. Shortcut that forwards every argument, and the block, to
    # Nokogiri::HTML4::Document.parse
    def self.parse(input, url = nil, encoding = nil, options = XML::ParseOptions::DEFAULT_HTML, &block)
      Document.parse(input, url, encoding, options, &block)
    end

    ####
    # Parse +string+ as an HTML fragment by delegating to
    # HTML4::DocumentFragment.parse with the given +encoding+.
    def self.fragment(string, encoding = nil)
      HTML4::DocumentFragment.parse(string, encoding)
    end

    # Instance of Nokogiri::HTML4::EntityLookup
    NamedCharacters = EntityLookup.new
  end
end
32
+
33
+ require_relative "html4/entity_lookup"
34
+ require_relative "html4/document"
35
+ require_relative "html4/document_fragment"
36
+ require_relative "html4/sax/parser_context"
37
+ require_relative "html4/sax/parser"
38
+ require_relative "html4/sax/push_parser"
39
+ require_relative "html4/element_description"
40
+ require_relative "html4/element_description_defaults"
@@ -0,0 +1,74 @@
1
+ # frozen_string_literal: true
2
+ #
3
+ # Copyright 2013-2021 Sam Ruby, Stephen Checkoway
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+ #
17
+
18
+ require_relative "../html4/document"
19
+
20
module Nokogiri
  module HTML5
    # @since v1.12.0
    # @note HTML5 functionality is not available when running JRuby.
    class Document < Nokogiri::HTML4::Document
      # Parse an HTML5 document from a string or an IO-like object.
      #
      # +string_or_io+ may be nil or false (treated as an empty string), an
      # object responding to +to_str+, or an object responding to +read+;
      # anything else raises ArgumentError. When +encoding+ is not given it
      # defaults to the input's own encoding name, unless that is ASCII-8BIT.
      # When +url+ is not given and the input responds to both +read+ and
      # +path+, the path is used. If a block is given it receives the
      # +options+ hash before parsing. See +do_parse+ for the option keys
      # that are read.
      def self.parse(string_or_io, url = nil, encoding = nil, **options, &block)
        yield options if block_given?
        string_or_io ||= ""

        if string_or_io.respond_to?(:encoding) && string_or_io.encoding.name != "ASCII-8BIT"
          encoding ||= string_or_io.encoding.name
        end

        if string_or_io.respond_to?(:read) && string_or_io.respond_to?(:path)
          url ||= string_or_io.path
        end

        unless string_or_io.respond_to?(:read) || string_or_io.respond_to?(:to_str)
          raise ArgumentError, "not a string or IO object"
        end

        do_parse(string_or_io, url, encoding, options)
      end

      # Parse a document from +io+. Raises ArgumentError unless +io+
      # responds to +read+.
      def self.read_io(io, url = nil, encoding = nil, **options)
        raise ArgumentError, "io object doesn't respond to :read" unless io.respond_to?(:read)

        do_parse(io, url, encoding, options)
      end

      # Parse a document from +string+. Raises ArgumentError unless
      # +string+ responds to +to_str+.
      def self.read_memory(string, url = nil, encoding = nil, **options)
        raise ArgumentError, "string object doesn't respond to :to_str" unless string.respond_to?(:to_str)

        do_parse(string, url, encoding, options)
      end

      # Create a DocumentFragment belonging to this document from +tags+,
      # using the document root as the context node.
      def fragment(tags = nil)
        DocumentFragment.new(self, tags, root)
      end

      def to_xml(options = {}, &block)
        # Bypass XML::Document#to_xml which doesn't add
        # XML::Node::SaveOptions::AS_XML like XML::Node#to_xml does.
        XML::Node.instance_method(:to_xml).bind(self).call(options, &block)
      end

      # Shared implementation behind +parse+, +read_io+ and +read_memory+.
      #
      # Reads and encodes the input with HTML5.read_and_encode, resolves the
      # parser limits from +options+ (:max_attributes, :max_errors or its
      # alias :max_parse_errors, :max_tree_depth), each falling back to the
      # matching Nokogiri::Gumbo default, and parses with Nokogiri::Gumbo.
      # The resulting document's encoding is set to UTF-8.
      def self.do_parse(string_or_io, url, encoding, options)
        string = HTML5.read_and_encode(string_or_io, encoding)
        max_attributes = options[:max_attributes] || Nokogiri::Gumbo::DEFAULT_MAX_ATTRIBUTES
        max_errors = options[:max_errors] || options[:max_parse_errors] || Nokogiri::Gumbo::DEFAULT_MAX_ERRORS
        max_depth = options[:max_tree_depth] || Nokogiri::Gumbo::DEFAULT_MAX_TREE_DEPTH
        doc = Nokogiri::Gumbo.parse(string, url, max_attributes, max_errors, max_depth)
        doc.encoding = "UTF-8"
        doc
      end
      # A bare `private` does not apply to methods defined with `def self.`,
      # so the helper has to be hidden explicitly.
      private_class_method :do_parse
    end
  end
end
@@ -0,0 +1,80 @@
1
+ # frozen_string_literal: true
2
+ #
3
+ # Copyright 2013-2021 Sam Ruby, Stephen Checkoway
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+ #
17
+
18
+ require_relative "../html4/document_fragment"
19
+
20
module Nokogiri
  module HTML5
    # @since v1.12.0
    # @note HTML5 functionality is not available when running JRuby.
    class DocumentFragment < Nokogiri::HTML4::DocumentFragment
      attr_accessor :document
      attr_accessor :errors

      # Create a document fragment.
      #
      # The fragment is attached to +doc+ and starts with an empty +errors+
      # list. When +tags+ is given it is read and encoded with
      # HTML5.read_and_encode and parsed by Nokogiri::Gumbo.fragment using
      # +ctx+ as the context. +options+ may carry :max_attributes,
      # :max_errors (or its alias :max_parse_errors, as accepted by
      # HTML5::Document) and :max_tree_depth; each falls back to the
      # matching Nokogiri::Gumbo default.
      def initialize(doc, tags = nil, ctx = nil, options = {})
        self.document = doc
        self.errors = []
        return unless tags

        max_attributes = options[:max_attributes] || Nokogiri::Gumbo::DEFAULT_MAX_ATTRIBUTES
        max_errors = options[:max_errors] || options[:max_parse_errors] || Nokogiri::Gumbo::DEFAULT_MAX_ERRORS
        max_depth = options[:max_tree_depth] || Nokogiri::Gumbo::DEFAULT_MAX_TREE_DEPTH
        tags = Nokogiri::HTML5.read_and_encode(tags, nil)
        Nokogiri::Gumbo.fragment(self, tags, ctx, max_attributes, max_errors, max_depth)
      end

      def serialize(options = {}, &block)
        # Bypass XML::Document.serialize which doesn't support options even
        # though XML::Node.serialize does!
        XML::Node.instance_method(:serialize).bind(self).call(options, &block)
      end

      # Parse a document fragment from +tags+.
      #
      # A fresh HTML5::Document with UTF-8 encoding is created to own the
      # fragment, +tags+ is read and encoded using +encoding+, and the new
      # fragment built from them is returned.
      def self.parse(tags, encoding = nil, options = {})
        doc = HTML5::Document.new
        tags = HTML5.read_and_encode(tags, encoding)
        doc.encoding = "UTF-8"
        new(doc, tags, nil, options)
      end

      # Split +params+ into [remaining params, handler, ns, binds].
      #
      # The handler is the first entry whose class is not exactly Hash,
      # String or Symbol. Trailing Hash or nil entries are popped off; in
      # their original order the first becomes +ns+ and the second +binds+.
      # When no +ns+ was supplied, the namespaces of all children are merged
      # and used instead.
      def extract_params(params) # :nodoc:
        handler = params.find do |param|
          ![Hash, String, Symbol].include?(param.class)
        end
        params -= [handler] if handler

        hashes = []
        while Hash === params.last || params.last.nil?
          hashes << params.pop
          # An empty array also has a nil +last+, so stop once it is drained.
          break if params.empty?
        end
        ns, binds = hashes.reverse

        ns ||= children.each_with_object({}) { |child, merged| merged.merge!(child.namespaces) }

        [params, handler, ns, binds]
      end
    end
  end
end
80
+ # vim: set shiftwidth=2 softtabstop=2 tabstop=8 expandtab: