Nokogiri_precompiled_aarch64_dedshit 1.14.5

Sign up to get free protection for your applications and to get access to all the features.
Files changed (263) hide show
  1. checksums.yaml +7 -0
  2. data/Gemfile +44 -0
  3. data/LICENSE-DEPENDENCIES.md +2224 -0
  4. data/LICENSE.md +9 -0
  5. data/README.md +287 -0
  6. data/bin/nokogiri +131 -0
  7. data/dependencies.yml +41 -0
  8. data/ext/java/nokogiri/Html4Document.java +157 -0
  9. data/ext/java/nokogiri/Html4ElementDescription.java +133 -0
  10. data/ext/java/nokogiri/Html4EntityLookup.java +63 -0
  11. data/ext/java/nokogiri/Html4SaxParserContext.java +289 -0
  12. data/ext/java/nokogiri/Html4SaxPushParser.java +213 -0
  13. data/ext/java/nokogiri/NokogiriService.java +613 -0
  14. data/ext/java/nokogiri/XmlAttr.java +154 -0
  15. data/ext/java/nokogiri/XmlAttributeDecl.java +119 -0
  16. data/ext/java/nokogiri/XmlCdata.java +60 -0
  17. data/ext/java/nokogiri/XmlComment.java +77 -0
  18. data/ext/java/nokogiri/XmlDocument.java +705 -0
  19. data/ext/java/nokogiri/XmlDocumentFragment.java +163 -0
  20. data/ext/java/nokogiri/XmlDtd.java +516 -0
  21. data/ext/java/nokogiri/XmlElement.java +44 -0
  22. data/ext/java/nokogiri/XmlElementContent.java +412 -0
  23. data/ext/java/nokogiri/XmlElementDecl.java +148 -0
  24. data/ext/java/nokogiri/XmlEntityDecl.java +151 -0
  25. data/ext/java/nokogiri/XmlEntityReference.java +79 -0
  26. data/ext/java/nokogiri/XmlNamespace.java +193 -0
  27. data/ext/java/nokogiri/XmlNode.java +1938 -0
  28. data/ext/java/nokogiri/XmlNodeSet.java +463 -0
  29. data/ext/java/nokogiri/XmlProcessingInstruction.java +79 -0
  30. data/ext/java/nokogiri/XmlReader.java +615 -0
  31. data/ext/java/nokogiri/XmlRelaxng.java +133 -0
  32. data/ext/java/nokogiri/XmlSaxParserContext.java +329 -0
  33. data/ext/java/nokogiri/XmlSaxPushParser.java +288 -0
  34. data/ext/java/nokogiri/XmlSchema.java +423 -0
  35. data/ext/java/nokogiri/XmlSyntaxError.java +137 -0
  36. data/ext/java/nokogiri/XmlText.java +90 -0
  37. data/ext/java/nokogiri/XmlXpathContext.java +305 -0
  38. data/ext/java/nokogiri/XsltStylesheet.java +368 -0
  39. data/ext/java/nokogiri/internals/ClosedStreamException.java +13 -0
  40. data/ext/java/nokogiri/internals/HtmlDomParserContext.java +252 -0
  41. data/ext/java/nokogiri/internals/IgnoreSchemaErrorsErrorHandler.java +27 -0
  42. data/ext/java/nokogiri/internals/NokogiriBlockingQueueInputStream.java +178 -0
  43. data/ext/java/nokogiri/internals/NokogiriDomParser.java +99 -0
  44. data/ext/java/nokogiri/internals/NokogiriEntityResolver.java +140 -0
  45. data/ext/java/nokogiri/internals/NokogiriErrorHandler.java +65 -0
  46. data/ext/java/nokogiri/internals/NokogiriHandler.java +339 -0
  47. data/ext/java/nokogiri/internals/NokogiriHelpers.java +817 -0
  48. data/ext/java/nokogiri/internals/NokogiriNamespaceCache.java +228 -0
  49. data/ext/java/nokogiri/internals/NokogiriNamespaceContext.java +110 -0
  50. data/ext/java/nokogiri/internals/NokogiriNonStrictErrorHandler.java +86 -0
  51. data/ext/java/nokogiri/internals/NokogiriNonStrictErrorHandler4NekoHtml.java +107 -0
  52. data/ext/java/nokogiri/internals/NokogiriStrictErrorHandler.java +62 -0
  53. data/ext/java/nokogiri/internals/NokogiriXPathFunction.java +165 -0
  54. data/ext/java/nokogiri/internals/NokogiriXPathFunctionResolver.java +50 -0
  55. data/ext/java/nokogiri/internals/NokogiriXPathVariableResolver.java +37 -0
  56. data/ext/java/nokogiri/internals/NokogiriXsltErrorListener.java +70 -0
  57. data/ext/java/nokogiri/internals/ParserContext.java +262 -0
  58. data/ext/java/nokogiri/internals/ReaderNode.java +564 -0
  59. data/ext/java/nokogiri/internals/SaveContextVisitor.java +865 -0
  60. data/ext/java/nokogiri/internals/SchemaErrorHandler.java +50 -0
  61. data/ext/java/nokogiri/internals/XalanDTMManagerPatch.java +174 -0
  62. data/ext/java/nokogiri/internals/XmlDeclHandler.java +11 -0
  63. data/ext/java/nokogiri/internals/XmlDomParserContext.java +265 -0
  64. data/ext/java/nokogiri/internals/XmlSaxParser.java +40 -0
  65. data/ext/java/nokogiri/internals/c14n/AttrCompare.java +122 -0
  66. data/ext/java/nokogiri/internals/c14n/C14nHelper.java +178 -0
  67. data/ext/java/nokogiri/internals/c14n/CanonicalFilter.java +43 -0
  68. data/ext/java/nokogiri/internals/c14n/CanonicalizationException.java +106 -0
  69. data/ext/java/nokogiri/internals/c14n/Canonicalizer.java +278 -0
  70. data/ext/java/nokogiri/internals/c14n/Canonicalizer11.java +664 -0
  71. data/ext/java/nokogiri/internals/c14n/Canonicalizer11_OmitComments.java +45 -0
  72. data/ext/java/nokogiri/internals/c14n/Canonicalizer11_WithComments.java +45 -0
  73. data/ext/java/nokogiri/internals/c14n/Canonicalizer20010315.java +388 -0
  74. data/ext/java/nokogiri/internals/c14n/Canonicalizer20010315Excl.java +308 -0
  75. data/ext/java/nokogiri/internals/c14n/Canonicalizer20010315ExclOmitComments.java +47 -0
  76. data/ext/java/nokogiri/internals/c14n/Canonicalizer20010315ExclWithComments.java +51 -0
  77. data/ext/java/nokogiri/internals/c14n/Canonicalizer20010315OmitComments.java +51 -0
  78. data/ext/java/nokogiri/internals/c14n/Canonicalizer20010315WithComments.java +50 -0
  79. data/ext/java/nokogiri/internals/c14n/CanonicalizerBase.java +660 -0
  80. data/ext/java/nokogiri/internals/c14n/CanonicalizerPhysical.java +194 -0
  81. data/ext/java/nokogiri/internals/c14n/CanonicalizerSpi.java +77 -0
  82. data/ext/java/nokogiri/internals/c14n/Constants.java +45 -0
  83. data/ext/java/nokogiri/internals/c14n/ElementProxy.java +325 -0
  84. data/ext/java/nokogiri/internals/c14n/HelperNodeList.java +106 -0
  85. data/ext/java/nokogiri/internals/c14n/IgnoreAllErrorHandler.java +86 -0
  86. data/ext/java/nokogiri/internals/c14n/InclusiveNamespaces.java +181 -0
  87. data/ext/java/nokogiri/internals/c14n/InvalidCanonicalizerException.java +87 -0
  88. data/ext/java/nokogiri/internals/c14n/NameSpaceSymbTable.java +452 -0
  89. data/ext/java/nokogiri/internals/c14n/NodeFilter.java +52 -0
  90. data/ext/java/nokogiri/internals/c14n/UtfHelpper.java +190 -0
  91. data/ext/java/nokogiri/internals/c14n/XMLUtils.java +540 -0
  92. data/ext/java/nokogiri/internals/dom2dtm/DOM2DTM.java +1712 -0
  93. data/ext/java/nokogiri/internals/dom2dtm/DOM2DTMdefaultNamespaceDeclarationNode.java +737 -0
  94. data/ext/nokogiri/depend +38 -0
  95. data/ext/nokogiri/extconf.rb +1086 -0
  96. data/ext/nokogiri/gumbo.c +594 -0
  97. data/ext/nokogiri/html4_document.c +167 -0
  98. data/ext/nokogiri/html4_element_description.c +294 -0
  99. data/ext/nokogiri/html4_entity_lookup.c +37 -0
  100. data/ext/nokogiri/html4_sax_parser_context.c +116 -0
  101. data/ext/nokogiri/html4_sax_push_parser.c +95 -0
  102. data/ext/nokogiri/libxml2_backwards_compat.c +121 -0
  103. data/ext/nokogiri/nokogiri.c +265 -0
  104. data/ext/nokogiri/nokogiri.h +235 -0
  105. data/ext/nokogiri/test_global_handlers.c +42 -0
  106. data/ext/nokogiri/xml_attr.c +103 -0
  107. data/ext/nokogiri/xml_attribute_decl.c +70 -0
  108. data/ext/nokogiri/xml_cdata.c +57 -0
  109. data/ext/nokogiri/xml_comment.c +62 -0
  110. data/ext/nokogiri/xml_document.c +689 -0
  111. data/ext/nokogiri/xml_document_fragment.c +44 -0
  112. data/ext/nokogiri/xml_dtd.c +210 -0
  113. data/ext/nokogiri/xml_element_content.c +128 -0
  114. data/ext/nokogiri/xml_element_decl.c +69 -0
  115. data/ext/nokogiri/xml_encoding_handler.c +104 -0
  116. data/ext/nokogiri/xml_entity_decl.c +112 -0
  117. data/ext/nokogiri/xml_entity_reference.c +50 -0
  118. data/ext/nokogiri/xml_namespace.c +186 -0
  119. data/ext/nokogiri/xml_node.c +2426 -0
  120. data/ext/nokogiri/xml_node_set.c +496 -0
  121. data/ext/nokogiri/xml_processing_instruction.c +54 -0
  122. data/ext/nokogiri/xml_reader.c +794 -0
  123. data/ext/nokogiri/xml_relax_ng.c +164 -0
  124. data/ext/nokogiri/xml_sax_parser.c +316 -0
  125. data/ext/nokogiri/xml_sax_parser_context.c +283 -0
  126. data/ext/nokogiri/xml_sax_push_parser.c +166 -0
  127. data/ext/nokogiri/xml_schema.c +260 -0
  128. data/ext/nokogiri/xml_syntax_error.c +85 -0
  129. data/ext/nokogiri/xml_text.c +48 -0
  130. data/ext/nokogiri/xml_xpath_context.c +415 -0
  131. data/ext/nokogiri/xslt_stylesheet.c +363 -0
  132. data/gumbo-parser/CHANGES.md +63 -0
  133. data/gumbo-parser/Makefile +111 -0
  134. data/gumbo-parser/THANKS +27 -0
  135. data/gumbo-parser/src/Makefile +34 -0
  136. data/gumbo-parser/src/README.md +41 -0
  137. data/gumbo-parser/src/ascii.c +75 -0
  138. data/gumbo-parser/src/ascii.h +115 -0
  139. data/gumbo-parser/src/attribute.c +42 -0
  140. data/gumbo-parser/src/attribute.h +17 -0
  141. data/gumbo-parser/src/char_ref.c +22225 -0
  142. data/gumbo-parser/src/char_ref.h +29 -0
  143. data/gumbo-parser/src/char_ref.rl +2154 -0
  144. data/gumbo-parser/src/error.c +626 -0
  145. data/gumbo-parser/src/error.h +148 -0
  146. data/gumbo-parser/src/foreign_attrs.c +104 -0
  147. data/gumbo-parser/src/foreign_attrs.gperf +27 -0
  148. data/gumbo-parser/src/insertion_mode.h +33 -0
  149. data/gumbo-parser/src/macros.h +91 -0
  150. data/gumbo-parser/src/nokogiri_gumbo.h +944 -0
  151. data/gumbo-parser/src/parser.c +4878 -0
  152. data/gumbo-parser/src/parser.h +41 -0
  153. data/gumbo-parser/src/replacement.h +33 -0
  154. data/gumbo-parser/src/string_buffer.c +103 -0
  155. data/gumbo-parser/src/string_buffer.h +68 -0
  156. data/gumbo-parser/src/string_piece.c +48 -0
  157. data/gumbo-parser/src/svg_attrs.c +174 -0
  158. data/gumbo-parser/src/svg_attrs.gperf +77 -0
  159. data/gumbo-parser/src/svg_tags.c +137 -0
  160. data/gumbo-parser/src/svg_tags.gperf +55 -0
  161. data/gumbo-parser/src/tag.c +223 -0
  162. data/gumbo-parser/src/tag_lookup.c +382 -0
  163. data/gumbo-parser/src/tag_lookup.gperf +170 -0
  164. data/gumbo-parser/src/tag_lookup.h +13 -0
  165. data/gumbo-parser/src/token_buffer.c +79 -0
  166. data/gumbo-parser/src/token_buffer.h +71 -0
  167. data/gumbo-parser/src/token_type.h +17 -0
  168. data/gumbo-parser/src/tokenizer.c +3463 -0
  169. data/gumbo-parser/src/tokenizer.h +112 -0
  170. data/gumbo-parser/src/tokenizer_states.h +339 -0
  171. data/gumbo-parser/src/utf8.c +245 -0
  172. data/gumbo-parser/src/utf8.h +164 -0
  173. data/gumbo-parser/src/util.c +66 -0
  174. data/gumbo-parser/src/util.h +34 -0
  175. data/gumbo-parser/src/vector.c +111 -0
  176. data/gumbo-parser/src/vector.h +45 -0
  177. data/lib/nokogiri/class_resolver.rb +67 -0
  178. data/lib/nokogiri/css/node.rb +54 -0
  179. data/lib/nokogiri/css/parser.rb +770 -0
  180. data/lib/nokogiri/css/parser.y +277 -0
  181. data/lib/nokogiri/css/parser_extras.rb +96 -0
  182. data/lib/nokogiri/css/syntax_error.rb +9 -0
  183. data/lib/nokogiri/css/tokenizer.rb +155 -0
  184. data/lib/nokogiri/css/tokenizer.rex +56 -0
  185. data/lib/nokogiri/css/xpath_visitor.rb +359 -0
  186. data/lib/nokogiri/css.rb +66 -0
  187. data/lib/nokogiri/decorators/slop.rb +44 -0
  188. data/lib/nokogiri/encoding_handler.rb +57 -0
  189. data/lib/nokogiri/extension.rb +32 -0
  190. data/lib/nokogiri/gumbo.rb +15 -0
  191. data/lib/nokogiri/html.rb +48 -0
  192. data/lib/nokogiri/html4/builder.rb +37 -0
  193. data/lib/nokogiri/html4/document.rb +214 -0
  194. data/lib/nokogiri/html4/document_fragment.rb +54 -0
  195. data/lib/nokogiri/html4/element_description.rb +25 -0
  196. data/lib/nokogiri/html4/element_description_defaults.rb +572 -0
  197. data/lib/nokogiri/html4/encoding_reader.rb +121 -0
  198. data/lib/nokogiri/html4/entity_lookup.rb +15 -0
  199. data/lib/nokogiri/html4/sax/parser.rb +63 -0
  200. data/lib/nokogiri/html4/sax/parser_context.rb +20 -0
  201. data/lib/nokogiri/html4/sax/push_parser.rb +37 -0
  202. data/lib/nokogiri/html4.rb +47 -0
  203. data/lib/nokogiri/html5/document.rb +168 -0
  204. data/lib/nokogiri/html5/document_fragment.rb +90 -0
  205. data/lib/nokogiri/html5/node.rb +98 -0
  206. data/lib/nokogiri/html5.rb +389 -0
  207. data/lib/nokogiri/jruby/dependencies.rb +3 -0
  208. data/lib/nokogiri/jruby/isorelax/isorelax/20030108/isorelax-20030108.jar +0 -0
  209. data/lib/nokogiri/jruby/net/sf/saxon/Saxon-HE/9.6.0-4/Saxon-HE-9.6.0-4.jar +0 -0
  210. data/lib/nokogiri/jruby/net/sourceforge/htmlunit/neko-htmlunit/2.63.0/neko-htmlunit-2.63.0.jar +0 -0
  211. data/lib/nokogiri/jruby/nokogiri_jars.rb +43 -0
  212. data/lib/nokogiri/jruby/nu/validator/jing/20200702VNU/jing-20200702VNU.jar +0 -0
  213. data/lib/nokogiri/jruby/org/nokogiri/nekodtd/0.1.11.noko2/nekodtd-0.1.11.noko2.jar +0 -0
  214. data/lib/nokogiri/jruby/xalan/serializer/2.7.3/serializer-2.7.3.jar +0 -0
  215. data/lib/nokogiri/jruby/xalan/xalan/2.7.3/xalan-2.7.3.jar +0 -0
  216. data/lib/nokogiri/jruby/xerces/xercesImpl/2.12.2/xercesImpl-2.12.2.jar +0 -0
  217. data/lib/nokogiri/jruby/xml-apis/xml-apis/1.4.01/xml-apis-1.4.01.jar +0 -0
  218. data/lib/nokogiri/syntax_error.rb +6 -0
  219. data/lib/nokogiri/version/constant.rb +6 -0
  220. data/lib/nokogiri/version/info.rb +223 -0
  221. data/lib/nokogiri/version.rb +4 -0
  222. data/lib/nokogiri/xml/attr.rb +66 -0
  223. data/lib/nokogiri/xml/attribute_decl.rb +20 -0
  224. data/lib/nokogiri/xml/builder.rb +487 -0
  225. data/lib/nokogiri/xml/cdata.rb +13 -0
  226. data/lib/nokogiri/xml/character_data.rb +9 -0
  227. data/lib/nokogiri/xml/document.rb +471 -0
  228. data/lib/nokogiri/xml/document_fragment.rb +205 -0
  229. data/lib/nokogiri/xml/dtd.rb +34 -0
  230. data/lib/nokogiri/xml/element_content.rb +38 -0
  231. data/lib/nokogiri/xml/element_decl.rb +15 -0
  232. data/lib/nokogiri/xml/entity_decl.rb +21 -0
  233. data/lib/nokogiri/xml/entity_reference.rb +20 -0
  234. data/lib/nokogiri/xml/namespace.rb +58 -0
  235. data/lib/nokogiri/xml/node/save_options.rb +68 -0
  236. data/lib/nokogiri/xml/node.rb +1563 -0
  237. data/lib/nokogiri/xml/node_set.rb +447 -0
  238. data/lib/nokogiri/xml/notation.rb +19 -0
  239. data/lib/nokogiri/xml/parse_options.rb +213 -0
  240. data/lib/nokogiri/xml/pp/character_data.rb +21 -0
  241. data/lib/nokogiri/xml/pp/node.rb +57 -0
  242. data/lib/nokogiri/xml/pp.rb +4 -0
  243. data/lib/nokogiri/xml/processing_instruction.rb +11 -0
  244. data/lib/nokogiri/xml/reader.rb +105 -0
  245. data/lib/nokogiri/xml/relax_ng.rb +38 -0
  246. data/lib/nokogiri/xml/sax/document.rb +167 -0
  247. data/lib/nokogiri/xml/sax/parser.rb +125 -0
  248. data/lib/nokogiri/xml/sax/parser_context.rb +21 -0
  249. data/lib/nokogiri/xml/sax/push_parser.rb +61 -0
  250. data/lib/nokogiri/xml/sax.rb +6 -0
  251. data/lib/nokogiri/xml/schema.rb +73 -0
  252. data/lib/nokogiri/xml/searchable.rb +270 -0
  253. data/lib/nokogiri/xml/syntax_error.rb +72 -0
  254. data/lib/nokogiri/xml/text.rb +11 -0
  255. data/lib/nokogiri/xml/xpath/syntax_error.rb +13 -0
  256. data/lib/nokogiri/xml/xpath.rb +21 -0
  257. data/lib/nokogiri/xml/xpath_context.rb +16 -0
  258. data/lib/nokogiri/xml.rb +76 -0
  259. data/lib/nokogiri/xslt/stylesheet.rb +27 -0
  260. data/lib/nokogiri/xslt.rb +65 -0
  261. data/lib/nokogiri.rb +120 -0
  262. data/lib/xsd/xmlparser/nokogiri.rb +106 -0
  263. metadata +391 -0
@@ -0,0 +1,389 @@
1
+ # coding: utf-8
2
+ # frozen_string_literal: true
3
+
4
+ #
5
+ # Copyright 2013-2021 Sam Ruby, Stephen Checkoway
6
+ #
7
+ # Licensed under the Apache License, Version 2.0 (the "License");
8
+ # you may not use this file except in compliance with the License.
9
+ # You may obtain a copy of the License at
10
+ #
11
+ # http://www.apache.org/licenses/LICENSE-2.0
12
+ #
13
+ # Unless required by applicable law or agreed to in writing, software
14
+ # distributed under the License is distributed on an "AS IS" BASIS,
15
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16
+ # See the License for the specific language governing permissions and
17
+ # limitations under the License.
18
+ #
19
+
20
+ require_relative "html5/document"
21
+ require_relative "html5/document_fragment"
22
+ require_relative "html5/node"
23
+
24
+ module Nokogiri
25
+ # Since v1.12.0
26
+ #
27
+ # ⚠ HTML5 functionality is not available when running JRuby.
28
+ #
29
+ # Parse an HTML5 document. Convenience method for {Nokogiri::HTML5::Document.parse}: +input+, +url+, +encoding+, the keyword +options+ and the block are all forwarded unchanged.
30
+ def self.HTML5(input, url = nil, encoding = nil, **options, &block)
31
+ Nokogiri::HTML5::Document.parse(input, url, encoding, **options, &block)
32
+ end
33
+
34
+ # == Usage
35
+ #
36
+ # ⚠ HTML5 functionality is not available when running JRuby.
37
+ #
38
+ # Parse an HTML5 document:
39
+ #
40
+ # doc = Nokogiri.HTML5(string)
41
+ #
42
+ # Parse an HTML5 fragment:
43
+ #
44
+ # fragment = Nokogiri::HTML5.fragment(string)
45
+ #
46
+ # == Parsing options
47
+ #
48
+ # The document and fragment parsing methods support options that are different from Nokogiri's.
49
+ #
50
+ # - <tt>Nokogiri.HTML5(html, url = nil, encoding = nil, options = {})</tt>
51
+ # - <tt>Nokogiri::HTML5.parse(html, url = nil, encoding = nil, options = {})</tt>
52
+ # - <tt>Nokogiri::HTML5::Document.parse(html, url = nil, encoding = nil, options = {})</tt>
53
+ # - <tt>Nokogiri::HTML5.fragment(html, encoding = nil, options = {})</tt>
54
+ # - <tt>Nokogiri::HTML5::DocumentFragment.parse(html, encoding = nil, options = {})</tt>
55
+ #
56
+ # The three currently supported options are +:max_errors+, +:max_tree_depth+ and
57
+ # +:max_attributes+, described below.
58
+ #
59
+ # === Error reporting
60
+ #
61
+ # Nokogiri contains an experimental HTML5 parse error reporting facility. By default, no parse
62
+ # errors are reported but this can be configured by passing the +:max_errors+ option to
63
+ # {HTML5.parse} or {HTML5.fragment}.
64
+ #
65
+ # For example, this script:
66
+ #
67
+ # doc = Nokogiri::HTML5.parse('<span/>Hi there!</span foo=bar />', max_errors: 10)
68
+ # doc.errors.each do |err|
69
+ # puts(err)
70
+ # end
71
+ #
72
+ # Emits:
73
+ #
74
+ # 1:1: ERROR: Expected a doctype token
75
+ # <span/>Hi there!</span foo=bar />
76
+ # ^
77
+ # 1:1: ERROR: Start tag of nonvoid HTML element ends with '/>', use '>'.
78
+ # <span/>Hi there!</span foo=bar />
79
+ # ^
80
+ # 1:17: ERROR: End tag ends with '/>', use '>'.
81
+ # <span/>Hi there!</span foo=bar />
82
+ # ^
83
+ # 1:17: ERROR: End tag contains attributes.
84
+ # <span/>Hi there!</span foo=bar />
85
+ # ^
86
+ #
87
+ # Using <tt>max_errors: -1</tt> results in an unlimited number of errors being returned.
88
+ #
89
+ # The errors returned by {HTML5::Document#errors} are instances of {Nokogiri::XML::SyntaxError}.
90
+ #
91
+ # The {https://html.spec.whatwg.org/multipage/parsing.html#parse-errors HTML standard} defines a
92
+ # number of standard parse error codes. These error codes only cover the "tokenization" stage of
93
+ # parsing HTML. The parse errors in the "tree construction" stage do not have standardized error
94
+ # codes (yet).
95
+ #
96
+ # As a convenience to Nokogiri users, the defined error codes are available via the
97
+ # {Nokogiri::XML::SyntaxError#str1} method.
98
+ #
99
+ # doc = Nokogiri::HTML5.parse('<span/>Hi there!</span foo=bar />', max_errors: 10)
100
+ # doc.errors.each do |err|
101
+ # puts("#{err.line}:#{err.column}: #{err.str1}")
102
+ # end
103
+ # # => 1:1: generic-parser
104
+ # # 1:1: non-void-html-element-start-tag-with-trailing-solidus
105
+ # # 1:17: end-tag-with-trailing-solidus
106
+ # # 1:17: end-tag-with-attributes
107
+ #
108
+ # Note that the first error is +generic-parser+ because it's an error from the tree construction
109
+ # stage and doesn't have a standardized error code.
110
+ #
111
+ # For the purposes of semantic versioning, the error messages, error locations, and error codes
112
+ # are not part of Nokogiri's public API. That is, these are subject to change without Nokogiri's
113
+ # major version number changing. These may be stabilized in the future.
114
+ #
115
+ # === Maximum tree depth
116
+ #
117
+ # The maximum depth of the DOM tree parsed by the various parsing methods is configurable by the
118
+ # +:max_tree_depth+ option. If the depth of the tree would exceed this limit, then an
119
+ # {::ArgumentError} is raised.
120
+ #
121
+ # This limit (which defaults to <tt>Nokogiri::Gumbo::DEFAULT_MAX_TREE_DEPTH = 400</tt>) can be
122
+ # removed by giving the option <tt>max_tree_depth: -1</tt>.
123
+ #
124
+ # html = '<!DOCTYPE html>' + '<div>' * 1000
125
+ # doc = Nokogiri.HTML5(html)
126
+ # # raises ArgumentError: Document tree depth limit exceeded
127
+ # doc = Nokogiri.HTML5(html, max_tree_depth: -1)
128
+ #
129
+ # === Attribute limit per element
130
+ #
131
+ # The maximum number of attributes per DOM element is configurable by the +:max_attributes+
132
+ # option. If a given element would exceed this limit, then an {::ArgumentError} is raised.
133
+ #
134
+ # This limit (which defaults to <tt>Nokogiri::Gumbo::DEFAULT_MAX_ATTRIBUTES = 400</tt>) can be
135
+ # removed by giving the option <tt>max_attributes: -1</tt>.
136
+ #
137
+ # html = '<!DOCTYPE html><div ' + (1..1000).map { |x| "attr-#{x}" }.join(' ') + '>'
138
+ # # "<!DOCTYPE html><div attr-1 attr-2 attr-3 ... attr-1000>"
139
+ # doc = Nokogiri.HTML5(html)
140
+ # # raises ArgumentError: Attributes per element limit exceeded
141
+ # doc = Nokogiri.HTML5(html, max_attributes: -1)
142
+ #
143
+ # == HTML Serialization
144
+ #
145
+ # After parsing HTML, it may be serialized using any of the {Nokogiri::XML::Node} serialization
146
+ # methods. In particular, {XML::Node#serialize}, {XML::Node#to_html}, and {XML::Node#to_s} will
147
+ # serialize a given node and its children. (This is the equivalent of JavaScript's
148
+ # +Element.outerHTML+.) Similarly, {XML::Node#inner_html} will serialize the children of a given
149
+ # node. (This is the equivalent of JavaScript's +Element.innerHTML+.)
150
+ #
151
+ # doc = Nokogiri::HTML5("<!DOCTYPE html><span>Hello world!</span>")
152
+ # puts doc.serialize
153
+ # # => <!DOCTYPE html><html><head></head><body><span>Hello world!</span></body></html>
154
+ #
155
+ # Due to quirks in how HTML is parsed and serialized, it's possible for a DOM tree to be
156
+ # serialized and then re-parsed, resulting in a different DOM. Mostly, this happens with DOMs
157
+ # produced from invalid HTML. Unfortunately, even valid HTML may not survive serialization and
158
+ # re-parsing.
159
+ #
160
+ # In particular, a newline at the start of +pre+, +listing+, and +textarea+ elements is ignored by
161
+ # the parser.
162
+ #
163
+ # doc = Nokogiri::HTML5(<<-EOF)
164
+ # <!DOCTYPE html>
165
+ # <pre>
166
+ # Content</pre>
167
+ # EOF
168
+ # puts doc.at('/html/body/pre').serialize
169
+ # # => <pre>Content</pre>
170
+ #
171
+ # In this case, the original HTML is semantically equivalent to the serialized version. If the
172
+ # +pre+, +listing+, or +textarea+ content starts with two newlines, the first newline will be
173
+ # stripped on the first parse and the second newline will be stripped on the second, leading to
174
+ # semantically different DOMs. Passing the parameter <tt>preserve_newline: true</tt> will cause
175
+ # two or more newlines to be preserved. (A single leading newline will still be removed.)
176
+ #
177
+ # doc = Nokogiri::HTML5(<<-EOF)
178
+ # <!DOCTYPE html>
179
+ # <listing>
180
+ #
181
+ # Content</listing>
182
+ # EOF
183
+ # puts doc.at('/html/body/listing').serialize(preserve_newline: true)
184
+ # # => <listing>
185
+ # #
186
+ # # Content</listing>
187
+ #
188
+ # == Encodings
189
+ #
190
+ # Nokogiri always parses HTML5 using {https://en.wikipedia.org/wiki/UTF-8 UTF-8}; however, the
191
+ # encoding of the input can be explicitly selected via the optional +encoding+ parameter. This is
192
+ # most useful when the input comes not from a string but from an IO object.
193
+ #
194
+ # When serializing a document or node, the encoding of the output string can be specified via the
195
+ # +:encoding+ option. Characters that cannot be encoded in the selected encoding will be encoded
196
+ # as {https://en.wikipedia.org/wiki/List_of_XML_and_HTML_character_entity_references HTML numeric
197
+ # entities}.
198
+ #
199
+ # frag = Nokogiri::HTML5.fragment('<span>아는 길도 물어가라</span>')
200
+ # html = frag.serialize(encoding: 'US-ASCII')
201
+ # puts html
202
+ # # => <span>&#xc544;&#xb294; &#xae38;&#xb3c4; &#xbb3c;&#xc5b4;&#xac00;&#xb77c;</span>
203
+ # frag = Nokogiri::HTML5.fragment(html)
204
+ # puts frag.serialize
205
+ # # => <span>아는 길도 물어가라</span>
206
+ #
207
+ # (There's a {https://bugs.ruby-lang.org/issues/15033 bug} in all current versions of Ruby that
208
+ # can cause the entity encoding to fail. Of the mandated supported encodings for HTML, the only
209
+ # encoding I'm aware of that has this bug is <tt>'ISO-2022-JP'</tt>. We recommend avoiding this
210
+ # encoding.)
211
+ #
212
+ # == Notes
213
+ #
214
+ # * The {Nokogiri::HTML5.fragment} function takes a string and parses it
215
+ # as an HTML5 document. The +<html>+, +<head>+, and +<body>+ elements are
216
+ # removed from this document, and any children of these elements that remain
217
+ # are returned as a {Nokogiri::HTML5::DocumentFragment}.
218
+ #
219
+ # * The {Nokogiri::HTML5.parse} function takes a string and passes it to the
220
+ # <code>gumbo_parse_with_options</code> method, using the default options.
221
+ # The resulting Gumbo parse tree is then walked.
222
+ #
223
+ # * Instead of uppercase element names, lowercase element names are produced.
224
+ #
225
+ # * Instead of returning +unknown+ as the element name for unknown tags, the
226
+ # original tag name is returned verbatim.
227
+ #
228
+ # Since v1.12.0
229
+ module HTML5
230
+ class << self
231
+ # Parse an HTML5 document. Convenience method for {Nokogiri::HTML5::Document.parse}; every argument and the block are forwarded as-is.
232
+ def parse(string, url = nil, encoding = nil, **options, &block)
233
+ Document.parse(string, url, encoding, **options, &block)
234
+ end
235
+
236
+ # Parse a fragment from +string+. Convenience method for
237
+ # {Nokogiri::HTML5::DocumentFragment.parse}. The keyword +options+ are handed over as one positional Hash.
238
+ def fragment(string, encoding = nil, **options)
239
+ DocumentFragment.parse(string, encoding, options)
240
+ end
241
+
242
+ # Fetch and parse an HTML document from the web, following redirects,
243
+ # handling HTTPS, and determining the character encoding using HTML5
244
+ # rules. +uri+ may be a +String+ or a +URI+. +options+ contains
245
+ # HTTP headers and special options. Everything which is not a
246
+ # special option is considered a header. Special options include:
247
+ # * :follow_limit => number of redirects which are followed (10 when not given)
248
+ # * :basic_auth => [username, password]
249
+ def get(uri, options = {})
250
+ # Deprecated entry point: warn the caller, then delegate to the private get_impl.
251
+ warn("Nokogiri::HTML5.get is deprecated and will be removed in a future version of Nokogiri.",
252
+ uplevel: 1, category: :deprecated)
253
+ get_impl(uri, options)
254
+ end
255
+
256
+ # :nodoc:
256
+ def read_and_encode(string, encoding)
257
+ # IO-like input (anything responding to #read): read it, passing +encoding+ along only when one was given.
258
+ if string.respond_to?(:read)
259
+ string = if encoding.nil?
260
+ string.read
261
+ else
262
+ string.read(encoding: encoding)
263
+ end
264
+ else
265
+ # Otherwise coerce with #to_s; when an encoding was given, relabel a copy (dup) rather than the caller's object.
266
+ string = string.to_s
267
+ if encoding
268
+ string = string.dup
269
+ string.force_encoding(encoding)
270
+ end
271
+ end
272
+
273
+ # Anything not already tagged as UTF-8 is converted to UTF-8 through reencode.
274
+ if string.encoding != Encoding::UTF_8
275
+ string = reencode(string)
276
+ end
277
+ string
278
+ end
280
+
281
+ private
282
+
283
+ def get_impl(uri, options = {}) # performs the GET for #get, recursing on redirects with a decremented follow limit
283
+ headers = options.clone
284
+ headers = { follow_limit: headers } if Numeric === headers # deprecated: a bare number is taken as the follow limit
285
+ limit = headers[:follow_limit] ? headers.delete(:follow_limit).to_i : 10
286
+
287
+ require "net/http"
288
+ uri = URI(uri) unless URI === uri
289
+
290
+ http = Net::HTTP.new(uri.host, uri.port)
291
+
292
+ # TLS / SSL support
293
+ http.use_ssl = true if uri.scheme == "https"
294
+
295
+ # Pass through Net::HTTP override values (any option for which the Net::HTTP object has a "key=" setter), which currently include:
296
+ # :ca_file, :ca_path, :cert, :cert_store, :ciphers,
297
+ # :close_on_empty_response, :continue_timeout, :key, :open_timeout,
298
+ # :read_timeout, :ssl_timeout, :ssl_version, :use_ssl,
299
+ # :verify_callback, :verify_depth, :verify_mode
300
+ options.each do |key, _value|
301
+ http.send("#{key}=", headers.delete(key)) if http.respond_to?("#{key}=")
302
+ end
303
+
304
+ request = Net::HTTP::Get.new(uri.request_uri)
305
+
306
+ # basic authentication: an explicit :basic_auth option wins, otherwise credentials embedded in the URI are used
307
+ auth = headers.delete(:basic_auth)
308
+ auth ||= [uri.user, uri.password] if uri.user && uri.password
309
+ request.basic_auth(auth.first, auth.last) if auth
310
+
311
+ # remaining options are treated as headers (keys and values are stringified)
312
+ headers.each { |key, value| request[key.to_s] = value.to_s }
313
+
314
+ response = http.request(request)
315
+
316
+ case response
317
+ when Net::HTTPSuccess
318
+ doc = parse(reencode(response.body, response["content-type"]), options) # NOTE(review): options is passed positionally, so on Ruby 3+ it binds to parse's +url+ parameter, not **options - confirm intent
319
+ doc.instance_variable_set(:@response, response)
320
+ doc.class.send(:attr_reader, :response)
321
+ doc
322
+ when Net::HTTPRedirection
323
+ response.value if limit <= 1 # Net::HTTPResponse#value raises for a non-2xx response, ending the redirect chain
324
+ location = URI.join(uri, response["location"])
325
+ get_impl(location, options.merge(follow_limit: limit - 1))
326
+ else
327
+ response.value
328
+ end
329
+ end
331
+
332
+ # Charset sniffing is a complex and controversial topic that understandably isn't done _by
333
+ # default_ by the Ruby Net::HTTP library. This being said, it is a very real problem for
334
+ # consumers of HTML as the default for HTML is iso-8859-1, most "good" producers use utf-8, and
335
+ # the Gumbo parser *only* supports utf-8.
336
+ #
337
+ # Accordingly, Nokogiri::HTML4::Document.parse provides limited encoding detection. Following
338
+ # this lead, Nokogiri::HTML5 attempts to do likewise, while attempting to more closely follow
339
+ # the HTML5 standard.
340
+ #
341
+ # http://bugs.ruby-lang.org/issues/2567
342
+ # http://www.w3.org/TR/html5/syntax.html#determining-the-character-encoding
343
+ #
344
+ def reencode(body, content_type = nil)
345
+ if body.encoding == Encoding::ASCII_8BIT # only binary-tagged input is sniffed; anything else is transcoded directly below
346
+ encoding = nil
347
+
348
+ # look for a Byte Order Mark (BOM): UTF-8, then UTF-16 big-endian, then UTF-16 little-endian
349
+ initial_bytes = body[0..2].bytes
350
+ if initial_bytes[0..2] == [0xEF, 0xBB, 0xBF]
351
+ encoding = Encoding::UTF_8
352
+ elsif initial_bytes[0..1] == [0xFE, 0xFF]
353
+ encoding = Encoding::UTF_16BE
354
+ elsif initial_bytes[0..1] == [0xFF, 0xFE]
355
+ encoding = Encoding::UTF_16LE
356
+ end
357
+
358
+ # look for a charset parameter in the Content-Type header value (a BOM, if found, takes precedence)
359
+ if content_type
360
+ encoding ||= content_type[/charset=["']?(.*?)($|["';\s])/i, 1]
361
+ end
362
+
363
+ # look for a charset in a meta tag in the first 1024 bytes, ignoring anything inside HTML comments
364
+ unless encoding
365
+ data = body[0..1023].gsub(/<!--.*?(-->|\Z)/m, "")
366
+ data.scan(/<meta.*?>/im).each do |meta|
367
+ encoding ||= meta[/charset=["']?([^>]*?)($|["'\s>])/im, 1]
368
+ end
369
+ end
370
+
371
+ # if all else fails, default to the official default encoding for HTML
372
+ encoding ||= Encoding::ISO_8859_1
373
+
374
+ # relabel a copy of the body with the detected or inferred encoding; an unknown encoding name falls back to ISO-8859-1
375
+ body = body.dup
376
+ begin
377
+ body.force_encoding(encoding)
378
+ rescue ArgumentError
379
+ body.force_encoding(Encoding::ISO_8859_1)
380
+ end
381
+ end
382
+
383
+ body.encode(Encoding::UTF_8)
384
+ end
385
+ end
386
+ end
387
+ end
388
+
389
+ require_relative "gumbo"
@@ -0,0 +1,3 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative "nokogiri_jars"
@@ -0,0 +1,43 @@
1
+ # this is a generated file, to avoid over-writing it just delete this comment
2
+ begin
3
+ require 'jar_dependencies'
4
+ rescue LoadError
5
+ require 'xalan/serializer/2.7.3/serializer-2.7.3.jar'
6
+ require 'net/sourceforge/htmlunit/neko-htmlunit/2.63.0/neko-htmlunit-2.63.0.jar'
7
+ require 'nu/validator/jing/20200702VNU/jing-20200702VNU.jar'
8
+ require 'xerces/xercesImpl/2.12.2/xercesImpl-2.12.2.jar'
9
+ require 'net/sf/saxon/Saxon-HE/9.6.0-4/Saxon-HE-9.6.0-4.jar'
10
+ require 'xalan/xalan/2.7.3/xalan-2.7.3.jar'
11
+ require 'xml-apis/xml-apis/1.4.01/xml-apis-1.4.01.jar'
12
+ require 'org/nokogiri/nekodtd/0.1.11.noko2/nekodtd-0.1.11.noko2.jar'
13
+ require 'isorelax/isorelax/20030108/isorelax-20030108.jar'
14
+ end
15
+
16
+ if defined? Jars
17
+ require_jar 'xalan', 'serializer', '2.7.3'
18
+ require_jar 'net.sourceforge.htmlunit', 'neko-htmlunit', '2.63.0'
19
+ require_jar 'nu.validator', 'jing', '20200702VNU'
20
+ require_jar 'xerces', 'xercesImpl', '2.12.2'
21
+ require_jar 'net.sf.saxon', 'Saxon-HE', '9.6.0-4'
22
+ require_jar 'xalan', 'xalan', '2.7.3'
23
+ require_jar 'xml-apis', 'xml-apis', '1.4.01'
24
+ require_jar 'org.nokogiri', 'nekodtd', '0.1.11.noko2'
25
+ require_jar 'isorelax', 'isorelax', '20030108'
26
+ end
27
+
28
+ module Nokogiri
29
+ # generated by the :vendor_jars rake task; maps "group:artifact" coordinates to the jar version
30
+ JAR_DEPENDENCIES = {
31
+ "isorelax:isorelax" => "20030108",
32
+ "net.sf.saxon:Saxon-HE" => "9.6.0-4",
33
+ "net.sourceforge.htmlunit:neko-htmlunit" => "2.63.0",
34
+ "nu.validator:jing" => "20200702VNU",
35
+ "org.nokogiri:nekodtd" => "0.1.11.noko2",
36
+ "xalan:serializer" => "2.7.3",
37
+ "xalan:xalan" => "2.7.3",
38
+ "xerces:xercesImpl" => "2.12.2",
39
+ "xml-apis:xml-apis" => "1.4.01",
40
+ }.freeze
41
+ XERCES_VERSION = JAR_DEPENDENCIES["xerces:xercesImpl"] # looked up from the table above
42
+ NEKO_VERSION = JAR_DEPENDENCIES["net.sourceforge.htmlunit:neko-htmlunit"] # looked up from the table above
43
+ end
@@ -0,0 +1,6 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Nokogiri
4
+ class SyntaxError < ::StandardError # plain StandardError subclass; no extra behavior is declared here
5
+ end
6
+ end
+ end
@@ -0,0 +1,6 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Nokogiri
4
+ # The version of Nokogiri you are using, as a version string
5
+ VERSION = "1.14.5"
6
+ end