nokogiri 1.5.10 → 1.13.0

Sign up to get free protection for your applications and to get access to all the features.

Potentially problematic release.


This version of nokogiri might be problematic. Click here for more details.

Files changed (334) hide show
  1. checksums.yaml +7 -0
  2. data/Gemfile +5 -0
  3. data/LICENSE-DEPENDENCIES.md +1903 -0
  4. data/LICENSE.md +9 -0
  5. data/README.md +280 -0
  6. data/bin/nokogiri +84 -31
  7. data/dependencies.yml +73 -0
  8. data/ext/nokogiri/depend +38 -358
  9. data/ext/nokogiri/extconf.rb +956 -100
  10. data/ext/nokogiri/gumbo.c +584 -0
  11. data/ext/nokogiri/html4_document.c +166 -0
  12. data/ext/nokogiri/html4_element_description.c +294 -0
  13. data/ext/nokogiri/html4_entity_lookup.c +37 -0
  14. data/ext/nokogiri/html4_sax_parser_context.c +120 -0
  15. data/ext/nokogiri/html4_sax_push_parser.c +95 -0
  16. data/ext/nokogiri/libxml2_backwards_compat.c +121 -0
  17. data/ext/nokogiri/nokogiri.c +232 -87
  18. data/ext/nokogiri/nokogiri.h +188 -129
  19. data/ext/nokogiri/test_global_handlers.c +40 -0
  20. data/ext/nokogiri/xml_attr.c +49 -40
  21. data/ext/nokogiri/xml_attribute_decl.c +18 -18
  22. data/ext/nokogiri/xml_cdata.c +24 -23
  23. data/ext/nokogiri/xml_comment.c +29 -21
  24. data/ext/nokogiri/xml_document.c +327 -223
  25. data/ext/nokogiri/xml_document_fragment.c +12 -16
  26. data/ext/nokogiri/xml_dtd.c +56 -50
  27. data/ext/nokogiri/xml_element_content.c +31 -26
  28. data/ext/nokogiri/xml_element_decl.c +22 -22
  29. data/ext/nokogiri/xml_encoding_handler.c +45 -20
  30. data/ext/nokogiri/xml_entity_decl.c +32 -30
  31. data/ext/nokogiri/xml_entity_reference.c +16 -18
  32. data/ext/nokogiri/xml_namespace.c +74 -32
  33. data/ext/nokogiri/xml_node.c +1290 -680
  34. data/ext/nokogiri/xml_node_set.c +239 -208
  35. data/ext/nokogiri/xml_processing_instruction.c +17 -19
  36. data/ext/nokogiri/xml_reader.c +227 -189
  37. data/ext/nokogiri/xml_relax_ng.c +52 -28
  38. data/ext/nokogiri/xml_sax_parser.c +123 -125
  39. data/ext/nokogiri/xml_sax_parser_context.c +138 -79
  40. data/ext/nokogiri/xml_sax_push_parser.c +88 -35
  41. data/ext/nokogiri/xml_schema.c +112 -33
  42. data/ext/nokogiri/xml_syntax_error.c +50 -23
  43. data/ext/nokogiri/xml_text.c +14 -18
  44. data/ext/nokogiri/xml_xpath_context.c +227 -140
  45. data/ext/nokogiri/xslt_stylesheet.c +162 -168
  46. data/gumbo-parser/CHANGES.md +63 -0
  47. data/gumbo-parser/Makefile +101 -0
  48. data/gumbo-parser/THANKS +27 -0
  49. data/gumbo-parser/src/Makefile +34 -0
  50. data/gumbo-parser/src/README.md +41 -0
  51. data/gumbo-parser/src/ascii.c +75 -0
  52. data/gumbo-parser/src/ascii.h +115 -0
  53. data/gumbo-parser/src/attribute.c +42 -0
  54. data/gumbo-parser/src/attribute.h +17 -0
  55. data/gumbo-parser/src/char_ref.c +22225 -0
  56. data/gumbo-parser/src/char_ref.h +29 -0
  57. data/gumbo-parser/src/char_ref.rl +2154 -0
  58. data/gumbo-parser/src/error.c +626 -0
  59. data/gumbo-parser/src/error.h +148 -0
  60. data/gumbo-parser/src/foreign_attrs.c +104 -0
  61. data/gumbo-parser/src/foreign_attrs.gperf +27 -0
  62. data/gumbo-parser/src/gumbo.h +943 -0
  63. data/gumbo-parser/src/insertion_mode.h +33 -0
  64. data/gumbo-parser/src/macros.h +91 -0
  65. data/gumbo-parser/src/parser.c +4875 -0
  66. data/gumbo-parser/src/parser.h +41 -0
  67. data/gumbo-parser/src/replacement.h +33 -0
  68. data/gumbo-parser/src/string_buffer.c +103 -0
  69. data/gumbo-parser/src/string_buffer.h +68 -0
  70. data/gumbo-parser/src/string_piece.c +48 -0
  71. data/gumbo-parser/src/svg_attrs.c +174 -0
  72. data/gumbo-parser/src/svg_attrs.gperf +77 -0
  73. data/gumbo-parser/src/svg_tags.c +137 -0
  74. data/gumbo-parser/src/svg_tags.gperf +55 -0
  75. data/gumbo-parser/src/tag.c +222 -0
  76. data/gumbo-parser/src/tag_lookup.c +382 -0
  77. data/gumbo-parser/src/tag_lookup.gperf +169 -0
  78. data/gumbo-parser/src/tag_lookup.h +13 -0
  79. data/gumbo-parser/src/token_buffer.c +79 -0
  80. data/gumbo-parser/src/token_buffer.h +71 -0
  81. data/gumbo-parser/src/token_type.h +17 -0
  82. data/gumbo-parser/src/tokenizer.c +3463 -0
  83. data/gumbo-parser/src/tokenizer.h +112 -0
  84. data/gumbo-parser/src/tokenizer_states.h +339 -0
  85. data/gumbo-parser/src/utf8.c +245 -0
  86. data/gumbo-parser/src/utf8.h +164 -0
  87. data/gumbo-parser/src/util.c +68 -0
  88. data/gumbo-parser/src/util.h +30 -0
  89. data/gumbo-parser/src/vector.c +111 -0
  90. data/gumbo-parser/src/vector.h +45 -0
  91. data/lib/nokogiri/class_resolver.rb +67 -0
  92. data/lib/nokogiri/css/node.rb +10 -58
  93. data/lib/nokogiri/css/parser.rb +327 -288
  94. data/lib/nokogiri/css/parser.y +67 -45
  95. data/lib/nokogiri/css/parser_extras.rb +52 -49
  96. data/lib/nokogiri/css/syntax_error.rb +3 -1
  97. data/lib/nokogiri/css/tokenizer.rb +107 -104
  98. data/lib/nokogiri/css/tokenizer.rex +7 -6
  99. data/lib/nokogiri/css/xpath_visitor.rb +263 -75
  100. data/lib/nokogiri/css.rb +50 -17
  101. data/lib/nokogiri/decorators/slop.rb +17 -8
  102. data/lib/nokogiri/extension.rb +31 -0
  103. data/lib/nokogiri/gumbo.rb +15 -0
  104. data/lib/nokogiri/html.rb +38 -27
  105. data/lib/nokogiri/{html → html4}/builder.rb +4 -2
  106. data/lib/nokogiri/html4/document.rb +331 -0
  107. data/lib/nokogiri/html4/document_fragment.rb +54 -0
  108. data/lib/nokogiri/{html → html4}/element_description.rb +3 -1
  109. data/lib/nokogiri/html4/element_description_defaults.rb +578 -0
  110. data/lib/nokogiri/{html → html4}/entity_lookup.rb +4 -2
  111. data/lib/nokogiri/{html → html4}/sax/parser.rb +24 -15
  112. data/lib/nokogiri/html4/sax/parser_context.rb +20 -0
  113. data/lib/nokogiri/html4/sax/push_parser.rb +37 -0
  114. data/lib/nokogiri/html4.rb +46 -0
  115. data/lib/nokogiri/html5/document.rb +88 -0
  116. data/lib/nokogiri/html5/document_fragment.rb +83 -0
  117. data/lib/nokogiri/html5/node.rb +96 -0
  118. data/lib/nokogiri/html5.rb +477 -0
  119. data/lib/nokogiri/jruby/dependencies.rb +21 -0
  120. data/lib/nokogiri/syntax_error.rb +2 -0
  121. data/lib/nokogiri/version/constant.rb +6 -0
  122. data/lib/nokogiri/version/info.rb +221 -0
  123. data/lib/nokogiri/version.rb +3 -90
  124. data/lib/nokogiri/xml/attr.rb +6 -3
  125. data/lib/nokogiri/xml/attribute_decl.rb +3 -1
  126. data/lib/nokogiri/xml/builder.rb +96 -54
  127. data/lib/nokogiri/xml/cdata.rb +3 -1
  128. data/lib/nokogiri/xml/character_data.rb +2 -0
  129. data/lib/nokogiri/xml/document.rb +234 -95
  130. data/lib/nokogiri/xml/document_fragment.rb +86 -36
  131. data/lib/nokogiri/xml/dtd.rb +16 -4
  132. data/lib/nokogiri/xml/element_content.rb +2 -0
  133. data/lib/nokogiri/xml/element_decl.rb +3 -1
  134. data/lib/nokogiri/xml/entity_decl.rb +4 -2
  135. data/lib/nokogiri/xml/entity_reference.rb +20 -0
  136. data/lib/nokogiri/xml/namespace.rb +3 -0
  137. data/lib/nokogiri/xml/node/save_options.rb +8 -4
  138. data/lib/nokogiri/xml/node.rb +947 -502
  139. data/lib/nokogiri/xml/node_set.rb +168 -159
  140. data/lib/nokogiri/xml/notation.rb +13 -0
  141. data/lib/nokogiri/xml/parse_options.rb +40 -5
  142. data/lib/nokogiri/xml/pp/character_data.rb +9 -6
  143. data/lib/nokogiri/xml/pp/node.rb +25 -26
  144. data/lib/nokogiri/xml/pp.rb +4 -2
  145. data/lib/nokogiri/xml/processing_instruction.rb +3 -1
  146. data/lib/nokogiri/xml/reader.rb +23 -28
  147. data/lib/nokogiri/xml/relax_ng.rb +8 -2
  148. data/lib/nokogiri/xml/sax/document.rb +45 -49
  149. data/lib/nokogiri/xml/sax/parser.rb +43 -41
  150. data/lib/nokogiri/xml/sax/parser_context.rb +8 -3
  151. data/lib/nokogiri/xml/sax/push_parser.rb +6 -5
  152. data/lib/nokogiri/xml/sax.rb +6 -4
  153. data/lib/nokogiri/xml/schema.rb +19 -9
  154. data/lib/nokogiri/xml/searchable.rb +259 -0
  155. data/lib/nokogiri/xml/syntax_error.rb +25 -1
  156. data/lib/nokogiri/xml/text.rb +2 -0
  157. data/lib/nokogiri/xml/xpath/syntax_error.rb +4 -2
  158. data/lib/nokogiri/xml/xpath.rb +15 -4
  159. data/lib/nokogiri/xml/xpath_context.rb +3 -3
  160. data/lib/nokogiri/xml.rb +38 -36
  161. data/lib/nokogiri/xslt/stylesheet.rb +3 -1
  162. data/lib/nokogiri/xslt.rb +18 -16
  163. data/lib/nokogiri.rb +69 -69
  164. data/lib/xsd/xmlparser/nokogiri.rb +26 -24
  165. data/patches/libxml2/0001-Remove-script-macro-support.patch +40 -0
  166. data/patches/libxml2/0002-Update-entities-to-remove-handling-of-ssi.patch +44 -0
  167. data/patches/libxml2/0003-libxml2.la-is-in-top_builddir.patch +25 -0
  168. data/patches/libxml2/0004-use-glibc-strlen.patch +53 -0
  169. data/patches/libxml2/0005-avoid-isnan-isinf.patch +81 -0
  170. data/patches/libxml2/0006-update-automake-files-for-arm64.patch +2511 -0
  171. data/patches/libxml2/0007-Fix-XPath-recursion-limit.patch +31 -0
  172. data/patches/libxml2/0008-htmlParseComment-handle-abruptly-closed-comments.patch +61 -0
  173. data/patches/libxml2/0009-allow-wildcard-namespaces.patch +77 -0
  174. data/patches/libxslt/0001-update-automake-files-for-arm64.patch +2511 -0
  175. data/patches/libxslt/0002-Fix-xml2-config-check-in-configure-script.patch +19 -0
  176. data/ports/archives/libxml2-2.9.12.tar.gz +0 -0
  177. data/ports/archives/libxslt-1.1.34.tar.gz +0 -0
  178. metadata +382 -460
  179. data/.autotest +0 -26
  180. data/.gemtest +0 -0
  181. data/CHANGELOG.ja.rdoc +0 -785
  182. data/CHANGELOG.rdoc +0 -783
  183. data/C_CODING_STYLE.rdoc +0 -33
  184. data/Manifest.txt +0 -303
  185. data/README.ja.rdoc +0 -106
  186. data/README.rdoc +0 -175
  187. data/ROADMAP.md +0 -90
  188. data/Rakefile +0 -228
  189. data/STANDARD_RESPONSES.md +0 -47
  190. data/Y_U_NO_GEMSPEC.md +0 -155
  191. data/build_all +0 -105
  192. data/ext/nokogiri/html_document.c +0 -170
  193. data/ext/nokogiri/html_document.h +0 -10
  194. data/ext/nokogiri/html_element_description.c +0 -279
  195. data/ext/nokogiri/html_element_description.h +0 -10
  196. data/ext/nokogiri/html_entity_lookup.c +0 -32
  197. data/ext/nokogiri/html_entity_lookup.h +0 -8
  198. data/ext/nokogiri/html_sax_parser_context.c +0 -116
  199. data/ext/nokogiri/html_sax_parser_context.h +0 -11
  200. data/ext/nokogiri/html_sax_push_parser.c +0 -87
  201. data/ext/nokogiri/html_sax_push_parser.h +0 -9
  202. data/ext/nokogiri/xml_attr.h +0 -9
  203. data/ext/nokogiri/xml_attribute_decl.h +0 -9
  204. data/ext/nokogiri/xml_cdata.h +0 -9
  205. data/ext/nokogiri/xml_comment.h +0 -9
  206. data/ext/nokogiri/xml_document.h +0 -23
  207. data/ext/nokogiri/xml_document_fragment.h +0 -10
  208. data/ext/nokogiri/xml_dtd.h +0 -10
  209. data/ext/nokogiri/xml_element_content.h +0 -10
  210. data/ext/nokogiri/xml_element_decl.h +0 -9
  211. data/ext/nokogiri/xml_encoding_handler.h +0 -8
  212. data/ext/nokogiri/xml_entity_decl.h +0 -10
  213. data/ext/nokogiri/xml_entity_reference.h +0 -9
  214. data/ext/nokogiri/xml_io.c +0 -56
  215. data/ext/nokogiri/xml_io.h +0 -11
  216. data/ext/nokogiri/xml_libxml2_hacks.c +0 -112
  217. data/ext/nokogiri/xml_libxml2_hacks.h +0 -12
  218. data/ext/nokogiri/xml_namespace.h +0 -13
  219. data/ext/nokogiri/xml_node.h +0 -13
  220. data/ext/nokogiri/xml_node_set.h +0 -14
  221. data/ext/nokogiri/xml_processing_instruction.h +0 -9
  222. data/ext/nokogiri/xml_reader.h +0 -10
  223. data/ext/nokogiri/xml_relax_ng.h +0 -9
  224. data/ext/nokogiri/xml_sax_parser.h +0 -39
  225. data/ext/nokogiri/xml_sax_parser_context.h +0 -10
  226. data/ext/nokogiri/xml_sax_push_parser.h +0 -9
  227. data/ext/nokogiri/xml_schema.h +0 -9
  228. data/ext/nokogiri/xml_syntax_error.h +0 -13
  229. data/ext/nokogiri/xml_text.h +0 -9
  230. data/ext/nokogiri/xml_xpath_context.h +0 -10
  231. data/ext/nokogiri/xslt_stylesheet.h +0 -14
  232. data/lib/nokogiri/html/document.rb +0 -254
  233. data/lib/nokogiri/html/document_fragment.rb +0 -41
  234. data/lib/nokogiri/html/element_description_defaults.rb +0 -671
  235. data/lib/nokogiri/html/sax/parser_context.rb +0 -16
  236. data/lib/nokogiri/html/sax/push_parser.rb +0 -16
  237. data/tasks/cross_compile.rb +0 -150
  238. data/tasks/nokogiri.org.rb +0 -24
  239. data/tasks/test.rb +0 -95
  240. data/test/css/test_nthiness.rb +0 -159
  241. data/test/css/test_parser.rb +0 -341
  242. data/test/css/test_tokenizer.rb +0 -198
  243. data/test/css/test_xpath_visitor.rb +0 -91
  244. data/test/decorators/test_slop.rb +0 -16
  245. data/test/files/2ch.html +0 -108
  246. data/test/files/address_book.rlx +0 -12
  247. data/test/files/address_book.xml +0 -10
  248. data/test/files/bar/bar.xsd +0 -4
  249. data/test/files/dont_hurt_em_why.xml +0 -422
  250. data/test/files/encoding.html +0 -82
  251. data/test/files/encoding.xhtml +0 -84
  252. data/test/files/exslt.xml +0 -8
  253. data/test/files/exslt.xslt +0 -35
  254. data/test/files/foo/foo.xsd +0 -4
  255. data/test/files/metacharset.html +0 -10
  256. data/test/files/noencoding.html +0 -47
  257. data/test/files/po.xml +0 -32
  258. data/test/files/po.xsd +0 -66
  259. data/test/files/shift_jis.html +0 -10
  260. data/test/files/shift_jis.xml +0 -5
  261. data/test/files/snuggles.xml +0 -3
  262. data/test/files/staff.dtd +0 -10
  263. data/test/files/staff.xml +0 -59
  264. data/test/files/staff.xslt +0 -32
  265. data/test/files/test_document_url/bar.xml +0 -2
  266. data/test/files/test_document_url/document.dtd +0 -4
  267. data/test/files/test_document_url/document.xml +0 -6
  268. data/test/files/tlm.html +0 -850
  269. data/test/files/to_be_xincluded.xml +0 -2
  270. data/test/files/valid_bar.xml +0 -2
  271. data/test/files/xinclude.xml +0 -4
  272. data/test/helper.rb +0 -154
  273. data/test/html/sax/test_parser.rb +0 -141
  274. data/test/html/sax/test_parser_context.rb +0 -46
  275. data/test/html/test_builder.rb +0 -164
  276. data/test/html/test_document.rb +0 -552
  277. data/test/html/test_document_encoding.rb +0 -138
  278. data/test/html/test_document_fragment.rb +0 -261
  279. data/test/html/test_element_description.rb +0 -105
  280. data/test/html/test_named_characters.rb +0 -14
  281. data/test/html/test_node.rb +0 -196
  282. data/test/html/test_node_encoding.rb +0 -27
  283. data/test/namespaces/test_additional_namespaces_in_builder_doc.rb +0 -14
  284. data/test/namespaces/test_namespaces_in_builder_doc.rb +0 -75
  285. data/test/namespaces/test_namespaces_in_created_doc.rb +0 -75
  286. data/test/namespaces/test_namespaces_in_parsed_doc.rb +0 -66
  287. data/test/test_convert_xpath.rb +0 -135
  288. data/test/test_css_cache.rb +0 -45
  289. data/test/test_encoding_handler.rb +0 -46
  290. data/test/test_memory_leak.rb +0 -156
  291. data/test/test_nokogiri.rb +0 -132
  292. data/test/test_reader.rb +0 -555
  293. data/test/test_soap4r_sax.rb +0 -52
  294. data/test/test_xslt_transforms.rb +0 -254
  295. data/test/xml/node/test_save_options.rb +0 -28
  296. data/test/xml/node/test_subclass.rb +0 -44
  297. data/test/xml/sax/test_parser.rb +0 -366
  298. data/test/xml/sax/test_parser_context.rb +0 -106
  299. data/test/xml/sax/test_push_parser.rb +0 -157
  300. data/test/xml/test_attr.rb +0 -64
  301. data/test/xml/test_attribute_decl.rb +0 -86
  302. data/test/xml/test_builder.rb +0 -306
  303. data/test/xml/test_c14n.rb +0 -151
  304. data/test/xml/test_cdata.rb +0 -48
  305. data/test/xml/test_comment.rb +0 -29
  306. data/test/xml/test_document.rb +0 -828
  307. data/test/xml/test_document_encoding.rb +0 -28
  308. data/test/xml/test_document_fragment.rb +0 -223
  309. data/test/xml/test_dtd.rb +0 -103
  310. data/test/xml/test_dtd_encoding.rb +0 -33
  311. data/test/xml/test_element_content.rb +0 -56
  312. data/test/xml/test_element_decl.rb +0 -73
  313. data/test/xml/test_entity_decl.rb +0 -122
  314. data/test/xml/test_entity_reference.rb +0 -245
  315. data/test/xml/test_namespace.rb +0 -95
  316. data/test/xml/test_node.rb +0 -1137
  317. data/test/xml/test_node_attributes.rb +0 -96
  318. data/test/xml/test_node_encoding.rb +0 -107
  319. data/test/xml/test_node_inheritance.rb +0 -32
  320. data/test/xml/test_node_reparenting.rb +0 -374
  321. data/test/xml/test_node_set.rb +0 -755
  322. data/test/xml/test_parse_options.rb +0 -64
  323. data/test/xml/test_processing_instruction.rb +0 -30
  324. data/test/xml/test_reader_encoding.rb +0 -142
  325. data/test/xml/test_relax_ng.rb +0 -60
  326. data/test/xml/test_schema.rb +0 -103
  327. data/test/xml/test_syntax_error.rb +0 -12
  328. data/test/xml/test_text.rb +0 -45
  329. data/test/xml/test_unparented_node.rb +0 -422
  330. data/test/xml/test_xinclude.rb +0 -83
  331. data/test/xml/test_xpath.rb +0 -295
  332. data/test/xslt/test_custom_functions.rb +0 -133
  333. data/test/xslt/test_exception_handling.rb +0 -37
  334. data/test_all +0 -81
@@ -0,0 +1,20 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Nokogiri
4
+ module HTML4
5
+ module SAX
6
+ ###
7
+ # Context for HTML SAX parsers. This class is usually not instantiated by the user. Instead,
8
+ # you should be looking at Nokogiri::HTML4::SAX::Parser
9
+ class ParserContext < Nokogiri::XML::SAX::ParserContext
10
+ def self.new(thing, encoding = "UTF-8")
11
+ if [:read, :close].all? { |x| thing.respond_to?(x) }
12
+ super
13
+ else
14
+ memory(thing, encoding)
15
+ end
16
+ end
17
+ end
18
+ end
19
+ end
20
+ end
@@ -0,0 +1,37 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Nokogiri
4
+ module HTML4
5
+ module SAX
6
+ class PushParser
7
+ # The Nokogiri::HTML4::SAX::Document on which the PushParser will be
8
+ # operating
9
+ attr_accessor :document
10
+
11
+ def initialize(doc = HTML4::SAX::Document.new, file_name = nil, encoding = "UTF-8")
12
+ @document = doc
13
+ @encoding = encoding
14
+ @sax_parser = HTML4::SAX::Parser.new(doc, @encoding)
15
+
16
+ ## Create our push parser context
17
+ initialize_native(@sax_parser, file_name, encoding)
18
+ end
19
+
20
+ ###
21
+ # Write a +chunk+ of HTML to the PushParser. Any callback methods
22
+ # that can be called will be called immediately.
23
+ def write(chunk, last_chunk = false)
24
+ native_write(chunk, last_chunk)
25
+ end
26
+ alias_method :<<, :write
27
+
28
+ ###
29
+ # Finish the parsing. This method is only necessary for
30
+ # Nokogiri::HTML4::SAX::Document#end_document to be called.
31
+ def finish
32
+ write("", true)
33
+ end
34
+ end
35
+ end
36
+ end
37
+ end
@@ -0,0 +1,46 @@
1
+ # coding: utf-8
2
+ # frozen_string_literal: true
3
+
4
+ module Nokogiri
5
+ class << self
6
+ # :call-seq:
7
+ # HTML4(input, url = nil, encoding = nil, options = XML::ParseOptions::DEFAULT_HTML, &block) → Nokogiri::HTML4::Document
8
+ #
9
+ # Parse HTML. Convenience method for Nokogiri::HTML4::Document.parse
10
+ def HTML4(input, url = nil, encoding = nil, options = XML::ParseOptions::DEFAULT_HTML, &block)
11
+ Nokogiri::HTML4::Document.parse(input, url, encoding, options, &block)
12
+ end
13
+ end
14
+
15
+ # Since v1.12.0
16
+ #
17
+ # 💡 Before v1.12.0, Nokogiri::HTML4 did not exist, and Nokogiri::HTML was the module/namespace
18
+ # for parsing HTML.
19
+ module HTML4
20
+ class << self
21
+ ###
22
+ # Parse HTML. Convenience method for Nokogiri::HTML4::Document.parse
23
+ def parse(input, url = nil, encoding = nil, options = XML::ParseOptions::DEFAULT_HTML, &block)
24
+ Document.parse(input, url, encoding, options, &block)
25
+ end
26
+
27
+ ####
28
+ # Parse a fragment from +string+ in to a NodeSet.
29
+ def fragment(string, encoding = nil, options = XML::ParseOptions::DEFAULT_HTML, &block)
30
+ HTML4::DocumentFragment.parse(string, encoding, options, &block)
31
+ end
32
+ end
33
+
34
+ # Instance of Nokogiri::HTML4::EntityLookup
35
+ NamedCharacters = EntityLookup.new
36
+ end
37
+ end
38
+
39
+ require_relative "html4/entity_lookup"
40
+ require_relative "html4/document"
41
+ require_relative "html4/document_fragment"
42
+ require_relative "html4/sax/parser_context"
43
+ require_relative "html4/sax/parser"
44
+ require_relative "html4/sax/push_parser"
45
+ require_relative "html4/element_description"
46
+ require_relative "html4/element_description_defaults"
@@ -0,0 +1,88 @@
1
+ # coding: utf-8
2
+ # frozen_string_literal: true
3
+
4
+ #
5
+ # Copyright 2013-2021 Sam Ruby, Stephen Checkoway
6
+ #
7
+ # Licensed under the Apache License, Version 2.0 (the "License");
8
+ # you may not use this file except in compliance with the License.
9
+ # You may obtain a copy of the License at
10
+ #
11
+ # http://www.apache.org/licenses/LICENSE-2.0
12
+ #
13
+ # Unless required by applicable law or agreed to in writing, software
14
+ # distributed under the License is distributed on an "AS IS" BASIS,
15
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16
+ # See the License for the specific language governing permissions and
17
+ # limitations under the License.
18
+ #
19
+
20
+ require_relative "../html4/document"
21
+
22
+ module Nokogiri
23
+ module HTML5
24
+ # Since v1.12.0
25
+ #
26
+ # 💡 HTML5 functionality is not available when running JRuby.
27
+ class Document < Nokogiri::HTML4::Document
28
+ def self.parse(string_or_io, url = nil, encoding = nil, **options, &block)
29
+ yield options if block
30
+ string_or_io = "" unless string_or_io
31
+
32
+ if string_or_io.respond_to?(:encoding) && string_or_io.encoding.name != "ASCII-8BIT"
33
+ encoding ||= string_or_io.encoding.name
34
+ end
35
+
36
+ if string_or_io.respond_to?(:read) && string_or_io.respond_to?(:path)
37
+ url ||= string_or_io.path
38
+ end
39
+ unless string_or_io.respond_to?(:read) || string_or_io.respond_to?(:to_str)
40
+ raise ArgumentError, "not a string or IO object"
41
+ end
42
+ do_parse(string_or_io, url, encoding, options)
43
+ end
44
+
45
+ def self.read_io(io, url = nil, encoding = nil, **options)
46
+ raise ArgumentError, "io object doesn't respond to :read" unless io.respond_to?(:read)
47
+ do_parse(io, url, encoding, options)
48
+ end
49
+
50
+ def self.read_memory(string, url = nil, encoding = nil, **options)
51
+ raise ArgumentError, "string object doesn't respond to :to_str" unless string.respond_to?(:to_str)
52
+ do_parse(string, url, encoding, options)
53
+ end
54
+
55
+ def fragment(tags = nil)
56
+ DocumentFragment.new(self, tags, root)
57
+ end
58
+
59
+ def to_xml(options = {}, &block)
60
+ # Bypass XML::Document#to_xml which doesn't add
61
+ # XML::Node::SaveOptions::AS_XML like XML::Node#to_xml does.
62
+ XML::Node.instance_method(:to_xml).bind(self).call(options, &block)
63
+ end
64
+
65
+ # :call-seq:
66
+ # xpath_doctype() → Nokogiri::CSS::XPathVisitor::DoctypeConfig
67
+ #
68
+ # [Returns] The document type which determines CSS-to-XPath translation.
69
+ #
70
+ # See XPathVisitor for more information.
71
+ def xpath_doctype
72
+ Nokogiri::CSS::XPathVisitor::DoctypeConfig::HTML5
73
+ end
74
+
75
+ private
76
+
77
+ def self.do_parse(string_or_io, url, encoding, options)
78
+ string = HTML5.read_and_encode(string_or_io, encoding)
79
+ max_attributes = options[:max_attributes] || Nokogiri::Gumbo::DEFAULT_MAX_ATTRIBUTES
80
+ max_errors = options[:max_errors] || options[:max_parse_errors] || Nokogiri::Gumbo::DEFAULT_MAX_ERRORS
81
+ max_depth = options[:max_tree_depth] || Nokogiri::Gumbo::DEFAULT_MAX_TREE_DEPTH
82
+ doc = Nokogiri::Gumbo.parse(string, url, max_attributes, max_errors, max_depth)
83
+ doc.encoding = "UTF-8"
84
+ doc
85
+ end
86
+ end
87
+ end
88
+ end
@@ -0,0 +1,83 @@
1
+ # coding: utf-8
2
+ # frozen_string_literal: true
3
+
4
+ #
5
+ # Copyright 2013-2021 Sam Ruby, Stephen Checkoway
6
+ #
7
+ # Licensed under the Apache License, Version 2.0 (the "License");
8
+ # you may not use this file except in compliance with the License.
9
+ # You may obtain a copy of the License at
10
+ #
11
+ # http://www.apache.org/licenses/LICENSE-2.0
12
+ #
13
+ # Unless required by applicable law or agreed to in writing, software
14
+ # distributed under the License is distributed on an "AS IS" BASIS,
15
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16
+ # See the License for the specific language governing permissions and
17
+ # limitations under the License.
18
+ #
19
+
20
+ require_relative "../html4/document_fragment"
21
+
22
+ module Nokogiri
23
+ module HTML5
24
+ # Since v1.12.0
25
+ #
26
+ # 💡 HTML5 functionality is not available when running JRuby.
27
+ class DocumentFragment < Nokogiri::HTML4::DocumentFragment
28
+ attr_accessor :document
29
+ attr_accessor :errors
30
+
31
+ # Create a document fragment.
32
+ def initialize(doc, tags = nil, ctx = nil, options = {})
33
+ self.document = doc
34
+ self.errors = []
35
+ return self unless tags
36
+
37
+ max_attributes = options[:max_attributes] || Nokogiri::Gumbo::DEFAULT_MAX_ATTRIBUTES
38
+ max_errors = options[:max_errors] || Nokogiri::Gumbo::DEFAULT_MAX_ERRORS
39
+ max_depth = options[:max_tree_depth] || Nokogiri::Gumbo::DEFAULT_MAX_TREE_DEPTH
40
+ tags = Nokogiri::HTML5.read_and_encode(tags, nil)
41
+ Nokogiri::Gumbo.fragment(self, tags, ctx, max_attributes, max_errors, max_depth)
42
+ end
43
+
44
+ def serialize(options = {}, &block)
45
+ # Bypass XML::Document.serialize which doesn't support options even
46
+ # though XML::Node.serialize does!
47
+ XML::Node.instance_method(:serialize).bind(self).call(options, &block)
48
+ end
49
+
50
+ # Parse a document fragment from +tags+, returning a Nodeset.
51
+ def self.parse(tags, encoding = nil, options = {})
52
+ doc = HTML5::Document.new
53
+ tags = HTML5.read_and_encode(tags, encoding)
54
+ doc.encoding = "UTF-8"
55
+ new(doc, tags, nil, options)
56
+ end
57
+
58
+ def extract_params(params) # :nodoc:
59
+ handler = params.find do |param|
60
+ ![Hash, String, Symbol].include?(param.class)
61
+ end
62
+ params -= [handler] if handler
63
+
64
+ hashes = []
65
+ while Hash === params.last || params.last.nil?
66
+ hashes << params.pop
67
+ break if params.empty?
68
+ end
69
+ ns, binds = hashes.reverse
70
+
71
+ ns ||=
72
+ begin
73
+ ns = {}
74
+ children.each { |child| ns.merge!(child.namespaces) }
75
+ ns
76
+ end
77
+
78
+ [params, handler, ns, binds]
79
+ end
80
+ end
81
+ end
82
+ end
83
+ # vim: set shiftwidth=2 softtabstop=2 tabstop=8 expandtab:
@@ -0,0 +1,96 @@
1
+ # coding: utf-8
2
+ # frozen_string_literal: true
3
+
4
+ #
5
+ # Copyright 2013-2021 Sam Ruby, Stephen Checkoway
6
+ #
7
+ # Licensed under the Apache License, Version 2.0 (the "License");
8
+ # you may not use this file except in compliance with the License.
9
+ # You may obtain a copy of the License at
10
+ #
11
+ # http://www.apache.org/licenses/LICENSE-2.0
12
+ #
13
+ # Unless required by applicable law or agreed to in writing, software
14
+ # distributed under the License is distributed on an "AS IS" BASIS,
15
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16
+ # See the License for the specific language governing permissions and
17
+ # limitations under the License.
18
+ #
19
+
20
+ require_relative "../xml/node"
21
+
22
+ module Nokogiri
23
+ module HTML5
24
+ # Since v1.12.0
25
+ #
26
+ # 💡 HTML5 functionality is not available when running JRuby.
27
+ module Node
28
+ def inner_html(options = {})
29
+ return super(options) unless document.is_a?(HTML5::Document)
30
+ result = options[:preserve_newline] && HTML5.prepend_newline?(self) ? +"\n" : +""
31
+ result << children.map { |child| child.to_html(options) }.join
32
+ result
33
+ end
34
+
35
+ def write_to(io, *options)
36
+ return super(io, *options) unless document.is_a?(HTML5::Document)
37
+ options = options.first.is_a?(Hash) ? options.shift : {}
38
+ encoding = options[:encoding] || options[0]
39
+ if Nokogiri.jruby?
40
+ save_options = options[:save_with] || options[1]
41
+ indent_times = options[:indent] || 0
42
+ else
43
+ save_options = options[:save_with] || options[1] || XML::Node::SaveOptions::FORMAT
44
+ indent_times = options[:indent] || 2
45
+ end
46
+ indent_string = (options[:indent_text] || " ") * indent_times
47
+
48
+ config = XML::Node::SaveOptions.new(save_options.to_i)
49
+ yield config if block_given?
50
+
51
+ config_options = config.options
52
+ if config_options & (XML::Node::SaveOptions::AS_XML | XML::Node::SaveOptions::AS_XHTML) != 0
53
+ # Use Nokogiri's serializing code.
54
+ native_write_to(io, encoding, indent_string, config_options)
55
+ else
56
+ # Serialize including the current node.
57
+ encoding ||= document.encoding || Encoding::UTF_8
58
+ internal_ops = {
59
+ preserve_newline: options[:preserve_newline] || false,
60
+ }
61
+ HTML5.serialize_node_internal(self, io, encoding, internal_ops)
62
+ end
63
+ end
64
+
65
+ def fragment(tags)
66
+ return super(tags) unless document.is_a?(HTML5::Document)
67
+ DocumentFragment.new(document, tags, self)
68
+ end
69
+
70
+ private
71
+
72
+ # HTML elements can have attributes that contain colons.
73
+ # Nokogiri::XML::Node#[]= treats names with colons as a prefixed QName
74
+ # and tries to create an attribute in a namespace. This is especially
75
+ # annoying with attribute names like xml:lang since libxml2 will
76
+ # actually create the xml namespace if it doesn't exist already.
77
+ def add_child_node_and_reparent_attrs(node)
78
+ return super(node) unless document.is_a?(HTML5::Document)
79
+ # I'm not sure what this method is supposed to do. Reparenting
80
+ # namespaces is handled by libxml2, including child namespaces which
81
+ # this method wouldn't handle.
82
+ # https://github.com/sparklemotion/nokogiri/issues/1790
83
+ add_child_node(node)
84
+ # node.attribute_nodes.find_all { |a| a.namespace }.each do |attr|
85
+ # attr.remove
86
+ # ns = attr.namespace
87
+ # a["#{ns.prefix}:#{attr.name}"] = attr.value
88
+ # end
89
+ end
90
+ end
91
+ # Monkey patch
92
+ XML::Node.prepend(HTML5::Node)
93
+ end
94
+ end
95
+
96
+ # vim: set shiftwidth=2 softtabstop=2 tabstop=8 expandtab: