nokogiri 1.18.0.rc1-aarch64-linux-musl

Sign up to get free protection for your applications and to get access to all the features.
Files changed (203) hide show
  1. checksums.yaml +7 -0
  2. data/Gemfile +38 -0
  3. data/LICENSE-DEPENDENCIES.md +2224 -0
  4. data/LICENSE.md +9 -0
  5. data/README.md +293 -0
  6. data/bin/nokogiri +131 -0
  7. data/dependencies.yml +42 -0
  8. data/ext/nokogiri/depend +38 -0
  9. data/ext/nokogiri/extconf.rb +1173 -0
  10. data/ext/nokogiri/gumbo.c +610 -0
  11. data/ext/nokogiri/html4_document.c +171 -0
  12. data/ext/nokogiri/html4_element_description.c +299 -0
  13. data/ext/nokogiri/html4_entity_lookup.c +37 -0
  14. data/ext/nokogiri/html4_sax_parser.c +40 -0
  15. data/ext/nokogiri/html4_sax_parser_context.c +98 -0
  16. data/ext/nokogiri/html4_sax_push_parser.c +96 -0
  17. data/ext/nokogiri/include/libexslt/exslt.h +108 -0
  18. data/ext/nokogiri/include/libexslt/exsltconfig.h +70 -0
  19. data/ext/nokogiri/include/libexslt/exsltexports.h +63 -0
  20. data/ext/nokogiri/include/libxml2/libxml/HTMLparser.h +336 -0
  21. data/ext/nokogiri/include/libxml2/libxml/HTMLtree.h +147 -0
  22. data/ext/nokogiri/include/libxml2/libxml/SAX.h +202 -0
  23. data/ext/nokogiri/include/libxml2/libxml/SAX2.h +171 -0
  24. data/ext/nokogiri/include/libxml2/libxml/c14n.h +115 -0
  25. data/ext/nokogiri/include/libxml2/libxml/catalog.h +182 -0
  26. data/ext/nokogiri/include/libxml2/libxml/chvalid.h +230 -0
  27. data/ext/nokogiri/include/libxml2/libxml/debugXML.h +217 -0
  28. data/ext/nokogiri/include/libxml2/libxml/dict.h +82 -0
  29. data/ext/nokogiri/include/libxml2/libxml/encoding.h +244 -0
  30. data/ext/nokogiri/include/libxml2/libxml/entities.h +166 -0
  31. data/ext/nokogiri/include/libxml2/libxml/globals.h +41 -0
  32. data/ext/nokogiri/include/libxml2/libxml/hash.h +251 -0
  33. data/ext/nokogiri/include/libxml2/libxml/list.h +137 -0
  34. data/ext/nokogiri/include/libxml2/libxml/nanoftp.h +186 -0
  35. data/ext/nokogiri/include/libxml2/libxml/nanohttp.h +98 -0
  36. data/ext/nokogiri/include/libxml2/libxml/parser.h +1390 -0
  37. data/ext/nokogiri/include/libxml2/libxml/parserInternals.h +671 -0
  38. data/ext/nokogiri/include/libxml2/libxml/pattern.h +106 -0
  39. data/ext/nokogiri/include/libxml2/libxml/relaxng.h +219 -0
  40. data/ext/nokogiri/include/libxml2/libxml/schemasInternals.h +959 -0
  41. data/ext/nokogiri/include/libxml2/libxml/schematron.h +143 -0
  42. data/ext/nokogiri/include/libxml2/libxml/threads.h +87 -0
  43. data/ext/nokogiri/include/libxml2/libxml/tree.h +1382 -0
  44. data/ext/nokogiri/include/libxml2/libxml/uri.h +106 -0
  45. data/ext/nokogiri/include/libxml2/libxml/valid.h +477 -0
  46. data/ext/nokogiri/include/libxml2/libxml/xinclude.h +136 -0
  47. data/ext/nokogiri/include/libxml2/libxml/xlink.h +189 -0
  48. data/ext/nokogiri/include/libxml2/libxml/xmlIO.h +438 -0
  49. data/ext/nokogiri/include/libxml2/libxml/xmlautomata.h +146 -0
  50. data/ext/nokogiri/include/libxml2/libxml/xmlerror.h +962 -0
  51. data/ext/nokogiri/include/libxml2/libxml/xmlexports.h +146 -0
  52. data/ext/nokogiri/include/libxml2/libxml/xmlmemory.h +188 -0
  53. data/ext/nokogiri/include/libxml2/libxml/xmlmodule.h +57 -0
  54. data/ext/nokogiri/include/libxml2/libxml/xmlreader.h +436 -0
  55. data/ext/nokogiri/include/libxml2/libxml/xmlregexp.h +215 -0
  56. data/ext/nokogiri/include/libxml2/libxml/xmlsave.h +102 -0
  57. data/ext/nokogiri/include/libxml2/libxml/xmlschemas.h +249 -0
  58. data/ext/nokogiri/include/libxml2/libxml/xmlschemastypes.h +152 -0
  59. data/ext/nokogiri/include/libxml2/libxml/xmlstring.h +140 -0
  60. data/ext/nokogiri/include/libxml2/libxml/xmlunicode.h +366 -0
  61. data/ext/nokogiri/include/libxml2/libxml/xmlversion.h +347 -0
  62. data/ext/nokogiri/include/libxml2/libxml/xmlwriter.h +489 -0
  63. data/ext/nokogiri/include/libxml2/libxml/xpath.h +579 -0
  64. data/ext/nokogiri/include/libxml2/libxml/xpathInternals.h +633 -0
  65. data/ext/nokogiri/include/libxml2/libxml/xpointer.h +138 -0
  66. data/ext/nokogiri/include/libxslt/attributes.h +39 -0
  67. data/ext/nokogiri/include/libxslt/documents.h +93 -0
  68. data/ext/nokogiri/include/libxslt/extensions.h +262 -0
  69. data/ext/nokogiri/include/libxslt/extra.h +72 -0
  70. data/ext/nokogiri/include/libxslt/functions.h +78 -0
  71. data/ext/nokogiri/include/libxslt/imports.h +75 -0
  72. data/ext/nokogiri/include/libxslt/keys.h +53 -0
  73. data/ext/nokogiri/include/libxslt/namespaces.h +68 -0
  74. data/ext/nokogiri/include/libxslt/numbersInternals.h +73 -0
  75. data/ext/nokogiri/include/libxslt/pattern.h +84 -0
  76. data/ext/nokogiri/include/libxslt/preproc.h +43 -0
  77. data/ext/nokogiri/include/libxslt/security.h +104 -0
  78. data/ext/nokogiri/include/libxslt/templates.h +77 -0
  79. data/ext/nokogiri/include/libxslt/transform.h +207 -0
  80. data/ext/nokogiri/include/libxslt/variables.h +118 -0
  81. data/ext/nokogiri/include/libxslt/xslt.h +110 -0
  82. data/ext/nokogiri/include/libxslt/xsltInternals.h +1995 -0
  83. data/ext/nokogiri/include/libxslt/xsltconfig.h +146 -0
  84. data/ext/nokogiri/include/libxslt/xsltexports.h +64 -0
  85. data/ext/nokogiri/include/libxslt/xsltlocale.h +44 -0
  86. data/ext/nokogiri/include/libxslt/xsltutils.h +343 -0
  87. data/ext/nokogiri/libxml2_polyfill.c +114 -0
  88. data/ext/nokogiri/nokogiri.c +294 -0
  89. data/ext/nokogiri/nokogiri.h +238 -0
  90. data/ext/nokogiri/test_global_handlers.c +40 -0
  91. data/ext/nokogiri/xml_attr.c +103 -0
  92. data/ext/nokogiri/xml_attribute_decl.c +70 -0
  93. data/ext/nokogiri/xml_cdata.c +62 -0
  94. data/ext/nokogiri/xml_comment.c +57 -0
  95. data/ext/nokogiri/xml_document.c +784 -0
  96. data/ext/nokogiri/xml_document_fragment.c +29 -0
  97. data/ext/nokogiri/xml_dtd.c +208 -0
  98. data/ext/nokogiri/xml_element_content.c +131 -0
  99. data/ext/nokogiri/xml_element_decl.c +69 -0
  100. data/ext/nokogiri/xml_encoding_handler.c +112 -0
  101. data/ext/nokogiri/xml_entity_decl.c +112 -0
  102. data/ext/nokogiri/xml_entity_reference.c +50 -0
  103. data/ext/nokogiri/xml_namespace.c +181 -0
  104. data/ext/nokogiri/xml_node.c +2459 -0
  105. data/ext/nokogiri/xml_node_set.c +518 -0
  106. data/ext/nokogiri/xml_processing_instruction.c +54 -0
  107. data/ext/nokogiri/xml_reader.c +777 -0
  108. data/ext/nokogiri/xml_relax_ng.c +149 -0
  109. data/ext/nokogiri/xml_sax_parser.c +403 -0
  110. data/ext/nokogiri/xml_sax_parser_context.c +390 -0
  111. data/ext/nokogiri/xml_sax_push_parser.c +206 -0
  112. data/ext/nokogiri/xml_schema.c +226 -0
  113. data/ext/nokogiri/xml_syntax_error.c +93 -0
  114. data/ext/nokogiri/xml_text.c +59 -0
  115. data/ext/nokogiri/xml_xpath_context.c +502 -0
  116. data/ext/nokogiri/xslt_stylesheet.c +421 -0
  117. data/gumbo-parser/CHANGES.md +63 -0
  118. data/gumbo-parser/Makefile +129 -0
  119. data/gumbo-parser/THANKS +27 -0
  120. data/lib/nokogiri/3.1/nokogiri.so +0 -0
  121. data/lib/nokogiri/3.2/nokogiri.so +0 -0
  122. data/lib/nokogiri/3.3/nokogiri.so +0 -0
  123. data/lib/nokogiri/3.4/nokogiri.so +0 -0
  124. data/lib/nokogiri/class_resolver.rb +67 -0
  125. data/lib/nokogiri/css/node.rb +58 -0
  126. data/lib/nokogiri/css/parser.rb +772 -0
  127. data/lib/nokogiri/css/parser.y +277 -0
  128. data/lib/nokogiri/css/parser_extras.rb +36 -0
  129. data/lib/nokogiri/css/selector_cache.rb +38 -0
  130. data/lib/nokogiri/css/syntax_error.rb +9 -0
  131. data/lib/nokogiri/css/tokenizer.rb +155 -0
  132. data/lib/nokogiri/css/tokenizer.rex +57 -0
  133. data/lib/nokogiri/css/xpath_visitor.rb +375 -0
  134. data/lib/nokogiri/css.rb +132 -0
  135. data/lib/nokogiri/decorators/slop.rb +42 -0
  136. data/lib/nokogiri/encoding_handler.rb +57 -0
  137. data/lib/nokogiri/extension.rb +32 -0
  138. data/lib/nokogiri/gumbo.rb +15 -0
  139. data/lib/nokogiri/html.rb +48 -0
  140. data/lib/nokogiri/html4/builder.rb +37 -0
  141. data/lib/nokogiri/html4/document.rb +235 -0
  142. data/lib/nokogiri/html4/document_fragment.rb +166 -0
  143. data/lib/nokogiri/html4/element_description.rb +25 -0
  144. data/lib/nokogiri/html4/element_description_defaults.rb +2040 -0
  145. data/lib/nokogiri/html4/encoding_reader.rb +121 -0
  146. data/lib/nokogiri/html4/entity_lookup.rb +15 -0
  147. data/lib/nokogiri/html4/sax/parser.rb +48 -0
  148. data/lib/nokogiri/html4/sax/parser_context.rb +15 -0
  149. data/lib/nokogiri/html4/sax/push_parser.rb +37 -0
  150. data/lib/nokogiri/html4.rb +42 -0
  151. data/lib/nokogiri/html5/builder.rb +40 -0
  152. data/lib/nokogiri/html5/document.rb +199 -0
  153. data/lib/nokogiri/html5/document_fragment.rb +200 -0
  154. data/lib/nokogiri/html5/node.rb +103 -0
  155. data/lib/nokogiri/html5.rb +368 -0
  156. data/lib/nokogiri/jruby/dependencies.rb +3 -0
  157. data/lib/nokogiri/jruby/nokogiri_jars.rb +43 -0
  158. data/lib/nokogiri/syntax_error.rb +6 -0
  159. data/lib/nokogiri/version/constant.rb +6 -0
  160. data/lib/nokogiri/version/info.rb +224 -0
  161. data/lib/nokogiri/version.rb +4 -0
  162. data/lib/nokogiri/xml/attr.rb +66 -0
  163. data/lib/nokogiri/xml/attribute_decl.rb +22 -0
  164. data/lib/nokogiri/xml/builder.rb +494 -0
  165. data/lib/nokogiri/xml/cdata.rb +13 -0
  166. data/lib/nokogiri/xml/character_data.rb +9 -0
  167. data/lib/nokogiri/xml/document.rb +514 -0
  168. data/lib/nokogiri/xml/document_fragment.rb +276 -0
  169. data/lib/nokogiri/xml/dtd.rb +34 -0
  170. data/lib/nokogiri/xml/element_content.rb +46 -0
  171. data/lib/nokogiri/xml/element_decl.rb +17 -0
  172. data/lib/nokogiri/xml/entity_decl.rb +23 -0
  173. data/lib/nokogiri/xml/entity_reference.rb +20 -0
  174. data/lib/nokogiri/xml/namespace.rb +57 -0
  175. data/lib/nokogiri/xml/node/save_options.rb +76 -0
  176. data/lib/nokogiri/xml/node.rb +1650 -0
  177. data/lib/nokogiri/xml/node_set.rb +449 -0
  178. data/lib/nokogiri/xml/notation.rb +19 -0
  179. data/lib/nokogiri/xml/parse_options.rb +213 -0
  180. data/lib/nokogiri/xml/pp/character_data.rb +21 -0
  181. data/lib/nokogiri/xml/pp/node.rb +73 -0
  182. data/lib/nokogiri/xml/pp.rb +4 -0
  183. data/lib/nokogiri/xml/processing_instruction.rb +11 -0
  184. data/lib/nokogiri/xml/reader.rb +139 -0
  185. data/lib/nokogiri/xml/relax_ng.rb +75 -0
  186. data/lib/nokogiri/xml/sax/document.rb +258 -0
  187. data/lib/nokogiri/xml/sax/parser.rb +199 -0
  188. data/lib/nokogiri/xml/sax/parser_context.rb +129 -0
  189. data/lib/nokogiri/xml/sax/push_parser.rb +64 -0
  190. data/lib/nokogiri/xml/sax.rb +54 -0
  191. data/lib/nokogiri/xml/schema.rb +140 -0
  192. data/lib/nokogiri/xml/searchable.rb +297 -0
  193. data/lib/nokogiri/xml/syntax_error.rb +94 -0
  194. data/lib/nokogiri/xml/text.rb +11 -0
  195. data/lib/nokogiri/xml/xpath/syntax_error.rb +13 -0
  196. data/lib/nokogiri/xml/xpath.rb +21 -0
  197. data/lib/nokogiri/xml/xpath_context.rb +49 -0
  198. data/lib/nokogiri/xml.rb +65 -0
  199. data/lib/nokogiri/xslt/stylesheet.rb +49 -0
  200. data/lib/nokogiri/xslt.rb +129 -0
  201. data/lib/nokogiri.rb +128 -0
  202. data/lib/xsd/xmlparser/nokogiri.rb +105 -0
  203. metadata +324 -0
@@ -0,0 +1,200 @@
1
+ # coding: utf-8
2
+ # frozen_string_literal: true
3
+
4
+ #
5
+ # Copyright 2013-2021 Sam Ruby, Stephen Checkoway
6
+ #
7
+ # Licensed under the Apache License, Version 2.0 (the "License");
8
+ # you may not use this file except in compliance with the License.
9
+ # You may obtain a copy of the License at
10
+ #
11
+ # http://www.apache.org/licenses/LICENSE-2.0
12
+ #
13
+ # Unless required by applicable law or agreed to in writing, software
14
+ # distributed under the License is distributed on an "AS IS" BASIS,
15
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16
+ # See the License for the specific language governing permissions and
17
+ # limitations under the License.
18
+ #
19
+
20
+ require_relative "../html4/document_fragment"
21
+
22
+ module Nokogiri
23
+ module HTML5
24
+ # Since v1.12.0
25
+ #
26
+ # 💡 HTML5 functionality is not available when running JRuby.
27
+ class DocumentFragment < Nokogiri::HTML4::DocumentFragment
28
+ class << self
29
+ # :call-seq:
30
+ # parse(input, **options) → HTML5::DocumentFragment
31
+ #
32
+ # Parse \HTML5 fragment input from a String, and return a new HTML5::DocumentFragment. This
33
+ # method creates a new, empty HTML5::Document to contain the fragment.
34
+ #
35
+ # [Parameters]
36
+ # - +input+ (String | IO) The HTML5 document fragment to parse.
37
+ #
38
+ # [Optional Keyword Arguments]
39
+ # - +encoding:+ (String | Encoding) The encoding, or name of the encoding, that should be
40
+ # used when processing the document. When not provided, the encoding will be determined
41
+ # based on the document content. Also see Nokogiri::HTML5 for a longer explanation of how
42
+ # encoding is handled by the parser.
43
+ #
44
+ # - +context:+ (String | Nokogiri::XML::Node) The node, or the name of an HTML5 element, "in
45
+ # context" of which to parse the document fragment. See below for more
46
+ # information. (default +"body"+)
47
+ #
48
+ # - +max_errors:+ (Integer) The maximum number of parse errors to record. (default
49
+ # +Nokogiri::Gumbo::DEFAULT_MAX_ERRORS+ which is currently 0)
50
+ #
51
+ # - +max_tree_depth:+ (Integer) The maximum depth of the parse tree. (default
52
+ # +Nokogiri::Gumbo::DEFAULT_MAX_TREE_DEPTH+)
53
+ #
54
+ # - +max_attributes:+ (Integer) The maximum number of attributes allowed on an
55
+ # element. (default +Nokogiri::Gumbo::DEFAULT_MAX_ATTRIBUTES+)
56
+ #
57
+ # - +parse_noscript_content_as_text:+ (Boolean) Whether to parse the content of +noscript+
58
+ # elements as text. (default +false+)
59
+ #
60
+ # See rdoc-ref:HTML5@Parsing+options for a complete description of these parsing options.
61
+ #
62
+ # [Returns] Nokogiri::HTML5::DocumentFragment
63
+ #
64
+ # === Context \Node
65
+ #
66
+ # If a context node is specified using +context:+, then the parser will behave as if that
67
+ # Node, or a hypothetical tag named as specified, is the parent of the fragment subtree.
68
+ #
69
+ def parse(
70
+ input,
71
+ encoding_ = nil, positional_options_hash = nil,
72
+ encoding: encoding_, **options
73
+ )
74
+ unless positional_options_hash.nil? || positional_options_hash.empty?
75
+ options.merge!(positional_options_hash)
76
+ end
77
+
78
+ context = options.delete(:context)
79
+
80
+ document = HTML5::Document.new
81
+ document.encoding = "UTF-8"
82
+ input = HTML5.read_and_encode(input, encoding)
83
+
84
+ new(document, input, context, options)
85
+ end
86
+ end
87
+
88
+ attr_accessor :document
89
+ attr_accessor :errors
90
+
91
+ # Get the parser's quirks mode value. See HTML5::QuirksMode.
92
+ #
93
+ # This method returns `nil` if the parser was not invoked (e.g.,
94
+ # `Nokogiri::HTML5::DocumentFragment.new(doc)`).
95
+ #
96
+ # Since v1.14.0
97
+ attr_reader :quirks_mode
98
+
99
+ #
100
+ # :call-seq:
101
+ # new(document, input, **options) → HTML5::DocumentFragment
102
+ #
103
+ # Parse \HTML5 fragment input from a String, and return a new HTML5::DocumentFragment.
104
+ #
105
+ # 💡 It's recommended to use either HTML5::DocumentFragment.parse or HTML5::Node#fragment
106
+ # rather than call this method directly.
107
+ #
108
+ # [Required Parameters]
109
+ # - +document+ (HTML5::Document) The parent document to associate the returned fragment with.
110
+ #
111
+ # [Optional Parameters]
112
+ # - +input+ (String) The content to be parsed.
113
+ #
114
+ # [Optional Keyword Arguments]
115
+ # - +encoding:+ (String | Encoding) The encoding, or name of the encoding, that should be
116
+ # used when processing the document. When not provided, the encoding will be determined
117
+ # based on the document content. Also see Nokogiri::HTML5 for a longer explanation of how
118
+ # encoding is handled by the parser.
119
+ #
120
+ # - +context:+ (String | Nokogiri::XML::Node) The node, or the name of an HTML5 element, in
121
+ # which to parse the document fragment. (default +"body"+)
122
+ #
123
+ # - +max_errors:+ (Integer) The maximum number of parse errors to record. (default
124
+ # +Nokogiri::Gumbo::DEFAULT_MAX_ERRORS+ which is currently 0)
125
+ #
126
+ # - +max_tree_depth:+ (Integer) The maximum depth of the parse tree. (default
127
+ # +Nokogiri::Gumbo::DEFAULT_MAX_TREE_DEPTH+)
128
+ #
129
+ # - +max_attributes:+ (Integer) The maximum number of attributes allowed on an
130
+ # element. (default +Nokogiri::Gumbo::DEFAULT_MAX_ATTRIBUTES+)
131
+ #
132
+ # - +parse_noscript_content_as_text:+ (Boolean) Whether to parse the content of +noscript+
133
+ # elements as text. (default +false+)
134
+ #
135
+ # See rdoc-ref:HTML5@Parsing+options for a complete description of these parsing options.
136
+ #
137
+ # [Returns] HTML5::DocumentFragment
138
+ #
139
+ # === Context \Node
140
+ #
141
+ # If a context node is specified using +context:+, then the parser will behave as if that
142
+ # Node, or a hypothetical tag named as specified, is the parent of the fragment subtree.
143
+ #
144
+ def initialize(
145
+ doc, input = nil,
146
+ context_ = nil, positional_options_hash = nil,
147
+ context: context_,
148
+ **options
149
+ ) # rubocop:disable Lint/MissingSuper
150
+ unless positional_options_hash.nil? || positional_options_hash.empty?
151
+ options.merge!(positional_options_hash)
152
+ end
153
+
154
+ @document = doc
155
+ @errors = []
156
+ return self unless input
157
+
158
+ input = Nokogiri::HTML5.read_and_encode(input, nil)
159
+
160
+ context = options.delete(:context) if options.key?(:context)
161
+
162
+ options[:max_attributes] ||= Nokogiri::Gumbo::DEFAULT_MAX_ATTRIBUTES
163
+ options[:max_errors] ||= options.delete(:max_parse_errors) || Nokogiri::Gumbo::DEFAULT_MAX_ERRORS
164
+ options[:max_tree_depth] ||= Nokogiri::Gumbo::DEFAULT_MAX_TREE_DEPTH
165
+
166
+ Nokogiri::Gumbo.fragment(self, input, context, **options)
167
+ end
168
+
169
+ def serialize(options = {}, &block) # :nodoc:
170
+ # Bypass XML::Document.serialize which doesn't support options even
171
+ # though XML::Node.serialize does!
172
+ XML::Node.instance_method(:serialize).bind_call(self, options, &block)
173
+ end
174
+
175
+ def extract_params(params) # :nodoc:
176
+ handler = params.find do |param|
177
+ ![Hash, String, Symbol].include?(param.class)
178
+ end
179
+ params -= [handler] if handler
180
+
181
+ hashes = []
182
+ while Hash === params.last || params.last.nil?
183
+ hashes << params.pop
184
+ break if params.empty?
185
+ end
186
+ ns, binds = hashes.reverse
187
+
188
+ ns ||=
189
+ begin
190
+ ns = {}
191
+ children.each { |child| ns.merge!(child.namespaces) }
192
+ ns
193
+ end
194
+
195
+ [params, handler, ns, binds]
196
+ end
197
+ end
198
+ end
199
+ end
200
+ # vim: set shiftwidth=2 softtabstop=2 tabstop=8 expandtab:
@@ -0,0 +1,103 @@
1
+ # coding: utf-8
2
+ # frozen_string_literal: true
3
+
4
+ #
5
+ # Copyright 2013-2021 Sam Ruby, Stephen Checkoway
6
+ #
7
+ # Licensed under the Apache License, Version 2.0 (the "License");
8
+ # you may not use this file except in compliance with the License.
9
+ # You may obtain a copy of the License at
10
+ #
11
+ # http://www.apache.org/licenses/LICENSE-2.0
12
+ #
13
+ # Unless required by applicable law or agreed to in writing, software
14
+ # distributed under the License is distributed on an "AS IS" BASIS,
15
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16
+ # See the License for the specific language governing permissions and
17
+ # limitations under the License.
18
+ #
19
+
20
+ #
21
+ # TODO: this whole file should go away. maybe make it a decorator?
22
+ #
23
+ require_relative "../xml/node"
24
+
25
+ module Nokogiri
26
+ module HTML5
27
+ # Since v1.12.0
28
+ #
29
+ # 💡 HTML5 functionality is not available when running JRuby.
30
+ module Node
31
+ def inner_html(options = {})
32
+ return super unless document.is_a?(HTML5::Document)
33
+
34
+ result = options[:preserve_newline] && prepend_newline? ? +"\n" : +""
35
+ result << children.map { |child| child.to_html(options) }.join
36
+ result
37
+ end
38
+
39
+ def write_to(io, *options)
40
+ return super unless document.is_a?(HTML5::Document)
41
+
42
+ options = options.first.is_a?(Hash) ? options.shift : {}
43
+ encoding = options[:encoding] || options[0]
44
+ if Nokogiri.jruby?
45
+ save_options = options[:save_with] || options[1]
46
+ indent_times = options[:indent] || 0
47
+ else
48
+ save_options = options[:save_with] || options[1] || XML::Node::SaveOptions::FORMAT
49
+ indent_times = options[:indent] || 2
50
+ end
51
+ indent_string = (options[:indent_text] || " ") * indent_times
52
+
53
+ config = XML::Node::SaveOptions.new(save_options.to_i)
54
+ yield config if block_given?
55
+
56
+ encoding = encoding.is_a?(Encoding) ? encoding.name : encoding
57
+
58
+ config_options = config.options
59
+ if config_options & (XML::Node::SaveOptions::AS_XML | XML::Node::SaveOptions::AS_XHTML) != 0
60
+ # Use Nokogiri's serializing code.
61
+ native_write_to(io, encoding, indent_string, config_options)
62
+ else
63
+ # Serialize including the current node.
64
+ html = html_standard_serialize(options[:preserve_newline] || false)
65
+ encoding ||= document.encoding || Encoding::UTF_8
66
+ io << html.encode(encoding, fallback: lambda { |c| "&#x#{c.ord.to_s(16)};" })
67
+ end
68
+ end
69
+
70
+ def fragment(tags)
71
+ return super unless document.is_a?(HTML5::Document)
72
+
73
+ DocumentFragment.new(document, tags, self)
74
+ end
75
+
76
+ private
77
+
78
+ # HTML elements can have attributes that contain colons.
79
+ # Nokogiri::XML::Node#[]= treats names with colons as a prefixed QName
80
+ # and tries to create an attribute in a namespace. This is especially
81
+ # annoying with attribute names like xml:lang since libxml2 will
82
+ # actually create the xml namespace if it doesn't exist already.
83
+ def add_child_node_and_reparent_attrs(node)
84
+ return super unless document.is_a?(HTML5::Document)
85
+
86
+ # I'm not sure what this method is supposed to do. Reparenting
87
+ # namespaces is handled by libxml2, including child namespaces which
88
+ # this method wouldn't handle.
89
+ # https://github.com/sparklemotion/nokogiri/issues/1790
90
+ add_child_node(node)
91
+ # node.attribute_nodes.find_all { |a| a.namespace }.each do |attr|
92
+ # attr.remove
93
+ # ns = attr.namespace
94
+ # a["#{ns.prefix}:#{attr.name}"] = attr.value
95
+ # end
96
+ end
97
+ end
98
+ # Monkey patch
99
+ XML::Node.prepend(HTML5::Node)
100
+ end
101
+ end
102
+
103
+ # vim: set shiftwidth=2 softtabstop=2 tabstop=8 expandtab:
@@ -0,0 +1,368 @@
1
+ # coding: utf-8
2
+ # frozen_string_literal: true
3
+
4
+ # This file includes code from the Nokogumbo project, whose license follows.
5
+ #
6
+ # Copyright 2013-2021 Sam Ruby, Stephen Checkoway
7
+ #
8
+ # Licensed under the Apache License, Version 2.0 (the "License");
9
+ # you may not use this file except in compliance with the License.
10
+ # You may obtain a copy of the License at
11
+ #
12
+ # http://www.apache.org/licenses/LICENSE-2.0
13
+ #
14
+ # Unless required by applicable law or agreed to in writing, software
15
+ # distributed under the License is distributed on an "AS IS" BASIS,
16
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17
+ # See the License for the specific language governing permissions and
18
+ # limitations under the License.
19
+ #
20
+
21
+ require_relative "html5/document"
22
+ require_relative "html5/document_fragment"
23
+ require_relative "html5/node"
24
+ require_relative "html5/builder"
25
+
26
+ module Nokogiri
27
+ # Convenience method for Nokogiri::HTML5::Document.parse
28
+ def self.HTML5(...)
29
+ Nokogiri::HTML5::Document.parse(...)
30
+ end
31
+
32
+ # == Usage
33
+ #
34
+ # Parse an HTML5 document:
35
+ #
36
+ # doc = Nokogiri.HTML5(input)
37
+ #
38
+ # Parse an HTML5 fragment:
39
+ #
40
+ # fragment = Nokogiri::HTML5.fragment(input)
41
+ #
42
+ # ⚠ HTML5 functionality is not available when running JRuby.
43
+ #
44
+ # == Parsing options
45
+ #
46
+ # The document and fragment parsing methods support options that are different from
47
+ # Nokogiri::HTML4::Document or Nokogiri::XML::Document.
48
+ #
49
+ # - <tt>Nokogiri.HTML5(input, url:, encoding:, **parse_options)</tt>
50
+ # - <tt>Nokogiri::HTML5.parse(input, url:, encoding:, **parse_options)</tt>
51
+ # - <tt>Nokogiri::HTML5::Document.parse(input, url:, encoding:, **parse_options)</tt>
52
+ # - <tt>Nokogiri::HTML5.fragment(input, encoding:, **parse_options)</tt>
53
+ # - <tt>Nokogiri::HTML5::DocumentFragment.parse(input, encoding:, **parse_options)</tt>
54
+ #
55
+ # The four currently supported parse options are
56
+ #
57
+ # - +max_errors:+ (Integer, default 0) Maximum number of parse errors to report in HTML5::Document#errors.
58
+ # - +max_tree_depth:+ (Integer, default +Nokogiri::Gumbo::DEFAULT_MAX_TREE_DEPTH+) Maximum tree depth to parse.
59
+ # - +max_attributes:+ (Integer, default +Nokogiri::Gumbo::DEFAULT_MAX_ATTRIBUTES+) Maximum number of attributes to parse per element.
60
+ # - +parse_noscript_content_as_text:+ (Boolean, default false) When enabled, parse +noscript+ tag content as text, mimicking the behavior of web browsers.
61
+ #
62
+ # These options are explained in the following sections.
63
+ #
64
+ # === Error reporting: +max_errors:+
65
+ #
66
+ # Nokogiri contains an experimental HTML5 parse error reporting facility. By default, no parse
67
+ # errors are reported but this can be configured by passing the +:max_errors+ option to
68
+ # HTML5.parse or HTML5.fragment.
69
+ #
70
+ # For example, this script:
71
+ #
72
+ # doc = Nokogiri::HTML5.parse('<span/>Hi there!</span foo=bar />', max_errors: 10)
73
+ # doc.errors.each do |err|
74
+ # puts(err)
75
+ # end
76
+ #
77
+ # Emits:
78
+ #
79
+ # 1:1: ERROR: Expected a doctype token
80
+ # <span/>Hi there!</span foo=bar />
81
+ # ^
82
+ # 1:1: ERROR: Start tag of nonvoid HTML element ends with '/>', use '>'.
83
+ # <span/>Hi there!</span foo=bar />
84
+ # ^
85
+ # 1:17: ERROR: End tag ends with '/>', use '>'.
86
+ # <span/>Hi there!</span foo=bar />
87
+ # ^
88
+ # 1:17: ERROR: End tag contains attributes.
89
+ # <span/>Hi there!</span foo=bar />
90
+ # ^
91
+ #
92
+ # Using <tt>max_errors: -1</tt> results in an unlimited number of errors being returned.
93
+ #
94
+ # The errors returned by HTML5::Document#errors are instances of Nokogiri::XML::SyntaxError.
95
+ #
96
+ # The {HTML standard}[https://html.spec.whatwg.org/multipage/parsing.html#parse-errors] defines a
97
+ # number of standard parse error codes. These error codes only cover the "tokenization" stage of
98
+ # parsing HTML. The parse errors in the "tree construction" stage do not have standardized error
99
+ # codes (yet).
100
+ #
101
+ # As a convenience to Nokogiri users, the defined error codes are available
102
+ # via the Nokogiri::XML::SyntaxError#str1 method.
103
+ #
104
+ # doc = Nokogiri::HTML5.parse('<span/>Hi there!</span foo=bar />', max_errors: 10)
105
+ # doc.errors.each do |err|
106
+ # puts("#{err.line}:#{err.column}: #{err.str1}")
107
+ # end
108
+ #
109
+ # # => 1:1: generic-parser
110
+ # # 1:1: non-void-html-element-start-tag-with-trailing-solidus
111
+ # # 1:17: end-tag-with-trailing-solidus
112
+ # # 1:17: end-tag-with-attributes
113
+ #
114
+ # Note that the first error is +generic-parser+ because it's an error from the tree construction
115
+ # stage and doesn't have a standardized error code.
116
+ #
117
+ # For the purposes of semantic versioning, the error messages, error locations, and error codes
118
+ # are not part of Nokogiri's public API. That is, these are subject to change without Nokogiri's
119
+ # major version number changing. These may be stabilized in the future.
120
+ #
121
+ # === Maximum tree depth: +max_tree_depth:+
122
+ #
123
+ # The maximum depth of the DOM tree parsed by the various parsing methods is configurable by the
124
+ # +:max_tree_depth+ option. If the depth of the tree would exceed this limit, then an
125
+ # +ArgumentError+ is raised.
126
+ #
127
+ # This limit (which defaults to +Nokogiri::Gumbo::DEFAULT_MAX_TREE_DEPTH+) can be removed by
128
+ # giving the option <tt>max_tree_depth: -1</tt>.
129
+ #
130
+ # html = '<!DOCTYPE html>' + '<div>' * 1000
131
+ # doc = Nokogiri.HTML5(html)
132
+ # # raises ArgumentError: Document tree depth limit exceeded
133
+ # doc = Nokogiri.HTML5(html, max_tree_depth: -1)
134
+ #
135
+ # === Attribute limit per element: +max_attributes:+
136
+ #
137
+ # The maximum number of attributes per DOM element is configurable by the +:max_attributes+
138
+ # option. If a given element would exceed this limit, then an +ArgumentError+ is raised.
139
+ #
140
+ # This limit (which defaults to +Nokogiri::Gumbo::DEFAULT_MAX_ATTRIBUTES+) can be removed by
141
+ # giving the option <tt>max_attributes: -1</tt>.
142
+ #
143
+ # html = '<!DOCTYPE html><div ' + (1..1000).map { |x| "attr-#{x}" }.join(' ') + '>'
144
+ # # "<!DOCTYPE html><div attr-1 attr-2 attr-3 ... attr-1000>"
145
+ # doc = Nokogiri.HTML5(html)
146
+ # # raises ArgumentError: Attributes per element limit exceeded
147
+ #
148
+ # doc = Nokogiri.HTML5(html, max_attributes: -1)
149
+ # # parses successfully
150
+ #
151
+ # === Parse +noscript+ elements' content as text: +parse_noscript_content_as_text:+
152
+ #
153
+ # By default, the content of +noscript+ elements is parsed as HTML elements. Browsers that
154
+ # support scripting parse the content of +noscript+ elements as raw text.
155
+ #
156
+ # The +:parse_noscript_content_as_text+ option causes Nokogiri to parse the content of +noscript+
157
+ # elements as a single text node.
158
+ #
159
+ # html = "<!DOCTYPE html><noscript><meta charset='UTF-8'><link rel=stylesheet href=!></noscript>"
160
+ # doc = Nokogiri::HTML5.parse(html, parse_noscript_content_as_text: true)
161
+ # pp doc.at_xpath("/html/head/noscript")
162
+ # # => #(Element:0x878c {
163
+ # # name = "noscript",
164
+ # # children = [ #(Text "<meta charset='UTF-8'><link rel=stylesheet href=!>")]
165
+ # # })
166
+ #
167
+ # In contrast, <tt>parse_noscript_content_as_text: false</tt> (the default) causes the +noscript+
168
+ # element in the previous example to have two children, a +meta+ element and a +link+ element.
169
+ #
170
+ # doc = Nokogiri::HTML5.parse(html)
171
+ # pp doc.at_xpath("/html/head/noscript")
172
+ # # => #(Element:0x96b4 {
173
+ # # name = "noscript",
174
+ # # children = [
175
+ # # #(Element:0x97e0 { name = "meta", attribute_nodes = [ #(Attr:0x990c { name = "charset", value = "UTF-8" })] }),
176
+ # # #(Element:0x9b00 {
177
+ # # name = "link",
178
+ # # attribute_nodes = [
179
+ # # #(Attr:0x9c2c { name = "rel", value = "stylesheet" }),
180
+ # # #(Attr:0x9dd0 { name = "href", value = "!" })]
181
+ # # })]
182
+ # # })
183
+ #
184
+ # == HTML Serialization
185
+ #
186
+ # After parsing HTML, it may be serialized using any of the Nokogiri::XML::Node serialization
187
+ # methods. In particular, XML::Node#serialize, XML::Node#to_html, and XML::Node#to_s will
188
+ # serialize a given node and its children. (This is the equivalent of JavaScript's
189
+ # +Element.outerHTML+.) Similarly, XML::Node#inner_html will serialize the children of a given
190
+ # node. (This is the equivalent of JavaScript's +Element.innerHTML+.)
191
+ #
192
+ # doc = Nokogiri::HTML5("<!DOCTYPE html><span>Hello world!</span>")
193
+ # puts doc.serialize
194
+ # # => <!DOCTYPE html><html><head></head><body><span>Hello world!</span></body></html>
195
+ #
196
+ # Due to quirks in how HTML is parsed and serialized, it's possible for a DOM tree to be
197
+ # serialized and then re-parsed, resulting in a different DOM. Mostly, this happens with DOMs
198
+ # produced from invalid HTML. Unfortunately, even valid HTML may not survive serialization and
199
+ # re-parsing.
200
+ #
201
+ # In particular, a newline at the start of +pre+, +listing+, and +textarea+
202
+ # elements is ignored by the parser.
203
+ #
204
+ # doc = Nokogiri::HTML5(<<-EOF)
205
+ # <!DOCTYPE html>
206
+ # <pre>
207
+ # Content</pre>
208
+ # EOF
209
+ # puts doc.at('/html/body/pre').serialize
210
+ # # => <pre>Content</pre>
211
+ #
212
+ # In this case, the original HTML is semantically equivalent to the serialized version. If the
213
+ # +pre+, +listing+, or +textarea+ content starts with two newlines, the first newline will be
214
+ # stripped on the first parse and the second newline will be stripped on the second, leading to
215
+ # semantically different DOMs. Passing the parameter <tt>preserve_newline: true</tt> will cause
216
+ # two or more newlines to be preserved. (A single leading newline will still be removed.)
217
+ #
218
+ # doc = Nokogiri::HTML5(<<-EOF)
219
+ # <!DOCTYPE html>
220
+ # <listing>
221
+ #
222
+ # Content</listing>
223
+ # EOF
224
+ # puts doc.at('/html/body/listing').serialize(preserve_newline: true)
225
+ # # => <listing>
226
+ # #
227
+ # # Content</listing>
228
+ #
229
+ # == Encodings
230
+ #
231
+ # Nokogiri always parses HTML5 using {UTF-8}[https://en.wikipedia.org/wiki/UTF-8]; however, the
232
+ # encoding of the input can be explicitly selected via the optional +encoding+ parameter. This is
233
+ # most useful when the input comes not from a string but from an IO object.
234
+ #
235
+ # When serializing a document or node, the encoding of the output string can be specified via the
236
+ # +:encoding+ option. Characters that cannot be encoded in the selected encoding will be encoded
237
+ # as {HTML numeric
238
+ # entities}[https://en.wikipedia.org/wiki/List_of_XML_and_HTML_character_entity_references].
239
+ #
240
+ # frag = Nokogiri::HTML5.fragment('<span>아는 길도 물어가라</span>')
241
+ # html = frag.serialize(encoding: 'US-ASCII')
242
+ # puts html
243
+ # # => <span>&#xc544;&#xb294; &#xae38;&#xb3c4; &#xbb3c;&#xc5b4;&#xac00;&#xb77c;</span>
244
+ #
245
+ # frag = Nokogiri::HTML5.fragment(html)
246
+ # puts frag.serialize
247
+ # # => <span>아는 길도 물어가라</span>
248
+ #
249
+ # (There's a {bug}[https://bugs.ruby-lang.org/issues/15033] in all current versions of Ruby that
250
+ # can cause the entity encoding to fail. Of the mandated supported encodings for HTML, the only
251
+ # encoding I'm aware of that has this bug is <tt>'ISO-2022-JP'</tt>. We recommend avoiding this
252
+ # encoding.)
253
+ #
254
+ # == Notes
255
+ #
256
+ # * The Nokogiri::HTML5.fragment function takes a String or IO and parses it as an HTML5 document
257
+ # in a +body+ context. As a result, the +html+, +head+, and +body+ elements are removed from
258
+ # this document, and any children of these elements that remain are returned as a
259
+ # Nokogiri::HTML5::DocumentFragment; but you can pass in a different context (e.g., "html" to
260
+ # get +head+ and +body+ tags in the result).
261
+ #
262
+ # * The Nokogiri::HTML5.parse function takes a String or IO and passes it to the
263
+ # <code>gumbo_parse_with_options</code> method, using the default options. The resulting Gumbo
264
+ # parse tree is then walked.
265
+ #
266
+ # * Instead of uppercase element names, lowercase element names are produced.
267
+ #
268
+ # * Instead of returning +unknown+ as the element name for unknown tags, the original tag name is
269
+ # returned verbatim.
270
+ #
271
+ # Since v1.12.0
272
+ module HTML5
273
+ class << self
274
# Shorthand for Nokogiri::HTML5::Document.parse: every positional argument, keyword argument,
# and block is forwarded untouched, and whatever Document.parse returns is returned.
def parse(...) = Document.parse(...)
278
+
279
# Shorthand for Nokogiri::HTML5::DocumentFragment.parse: every positional argument, keyword
# argument, and block is forwarded untouched, and whatever DocumentFragment.parse returns is
# returned.
def fragment(...) = DocumentFragment.parse(...)
283
+
284
# :nodoc:
#
# Turns parser input into a UTF-8 String.
#
# [string] Either an object responding to +read+ (its whole content is read), or any other
#          object, which is converted with +to_s+.
# [encoding] The encoding the input is in, or +nil+ to keep whatever encoding the input
#            already carries.
#
# When the resulting string is not tagged as UTF-8 it is handed to +reencode+. The caller's
# string is never mutated: it is duplicated before its encoding tag is changed.
def read_and_encode(string, encoding)
  if string.respond_to?(:read)
    source = string
    string = if encoding.nil?
      source.read
    else
      begin
        # Readers whose +read+ accepts open options (Pathname#read, for instance) can apply
        # the encoding while reading.
        source.read(encoding: encoding)
      rescue TypeError, ArgumentError
        # IO#read and StringIO#read take (length, outbuf): the keyword hash ends up in the
        # +length+ slot and is rejected before any data is consumed. Read everything and tag
        # the result instead, exactly as is done for plain strings below.
        source.read.force_encoding(encoding)
      end
    end
  else
    # Not readable: the string itself is declared to be in the given encoding.
    string = string.to_s
    string = string.dup.force_encoding(encoding) if encoding
  end

  # The parser only consumes UTF-8, so transcode anything else.
  string.encoding == Encoding::UTF_8 ? string : reencode(string)
end
308
+
309
+ private
310
+
311
# Charset sniffing is a complex and controversial topic that understandably isn't done _by
# default_ by the Ruby Net::HTTP library. It is nevertheless a very real problem for consumers
# of HTML: the default for HTML is iso-8859-1, most "good" producers use utf-8, and the Gumbo
# parser *only* supports utf-8.
#
# Nokogiri::HTML4::Document.parse provides limited encoding detection, and Nokogiri::HTML5
# attempts to do likewise while following the HTML5 standard more closely.
#
# http://bugs.ruby-lang.org/issues/2567
# http://www.w3.org/TR/html5/syntax.html#determining-the-character-encoding
#
# [body] The document as a String. Only binary (ASCII-8BIT) strings are sniffed; any other
#        string is transcoded from the encoding it is already tagged with.
# [content_type] Optional Content-Type header value whose +charset+ parameter is honored.
#
# Returns a new UTF-8 String; +body+ itself is not modified.
def reencode(body, content_type = nil)
  # A string that already carries a real encoding only needs transcoding.
  return body.encode(Encoding::UTF_8) unless body.encoding == Encoding::ASCII_8BIT

  # 1. A Byte Order Mark (BOM) wins over every other hint.
  lead = body[0, 3].bytes
  charset =
    if lead == [0xEF, 0xBB, 0xBF]
      Encoding::UTF_8
    elsif lead.first(2) == [0xFE, 0xFF]
      Encoding::UTF_16BE
    elsif lead.first(2) == [0xFF, 0xFE]
      Encoding::UTF_16LE
    end

  # 2. Next, the charset parameter of the Content-Type header, when one was supplied.
  charset ||= content_type[/charset=["']?(.*?)($|["';\s])/i, 1] if content_type

  # 3. Then the first <meta> tag declaring a charset within the first 1024 bytes, with
  #    comments (terminated or not) stripped out beforehand.
  unless charset
    prefix = body[0..1023].gsub(/<!--.*?(-->|\Z)/m, "")
    charset = prefix.scan(/<meta.*?>/im).lazy.filter_map do |tag|
      tag[/charset=["']?([^>]*?)($|["'\s>])/im, 1]
    end.first
  end

  # 4. If all else fails, use the official default encoding for HTML.
  charset ||= Encoding::ISO_8859_1

  # Tag a copy with the detected encoding; a name Ruby does not recognize falls back to the
  # default as well.
  decoded = body.dup
  begin
    decoded.force_encoding(charset)
  rescue ArgumentError
    decoded.force_encoding(Encoding::ISO_8859_1)
  end

  decoded.encode(Encoding::UTF_8)
end
364
+ end
365
+ end
366
+ end
367
+
368
+ require_relative "gumbo"
@@ -0,0 +1,3 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative "nokogiri_jars"