nokogiri 1.13.0-x64-mingw-ucrt

Sign up to get free protection for your applications and to get access to all the features.

Potentially problematic release.


This version of nokogiri might be problematic. Click here for more details.

Files changed (195) hide show
  1. checksums.yaml +7 -0
  2. data/Gemfile +5 -0
  3. data/LICENSE-DEPENDENCIES.md +1903 -0
  4. data/LICENSE.md +9 -0
  5. data/README.md +280 -0
  6. data/bin/nokogiri +131 -0
  7. data/dependencies.yml +73 -0
  8. data/ext/nokogiri/depend +38 -0
  9. data/ext/nokogiri/extconf.rb +1000 -0
  10. data/ext/nokogiri/gumbo.c +584 -0
  11. data/ext/nokogiri/html4_document.c +166 -0
  12. data/ext/nokogiri/html4_element_description.c +294 -0
  13. data/ext/nokogiri/html4_entity_lookup.c +37 -0
  14. data/ext/nokogiri/html4_sax_parser_context.c +120 -0
  15. data/ext/nokogiri/html4_sax_push_parser.c +95 -0
  16. data/ext/nokogiri/include/libexslt/exslt.h +102 -0
  17. data/ext/nokogiri/include/libexslt/exsltconfig.h +70 -0
  18. data/ext/nokogiri/include/libexslt/exsltexports.h +140 -0
  19. data/ext/nokogiri/include/libxml2/libxml/DOCBparser.h +96 -0
  20. data/ext/nokogiri/include/libxml2/libxml/HTMLparser.h +306 -0
  21. data/ext/nokogiri/include/libxml2/libxml/HTMLtree.h +147 -0
  22. data/ext/nokogiri/include/libxml2/libxml/SAX.h +173 -0
  23. data/ext/nokogiri/include/libxml2/libxml/SAX2.h +178 -0
  24. data/ext/nokogiri/include/libxml2/libxml/c14n.h +128 -0
  25. data/ext/nokogiri/include/libxml2/libxml/catalog.h +182 -0
  26. data/ext/nokogiri/include/libxml2/libxml/chvalid.h +230 -0
  27. data/ext/nokogiri/include/libxml2/libxml/debugXML.h +217 -0
  28. data/ext/nokogiri/include/libxml2/libxml/dict.h +79 -0
  29. data/ext/nokogiri/include/libxml2/libxml/encoding.h +245 -0
  30. data/ext/nokogiri/include/libxml2/libxml/entities.h +151 -0
  31. data/ext/nokogiri/include/libxml2/libxml/globals.h +508 -0
  32. data/ext/nokogiri/include/libxml2/libxml/hash.h +236 -0
  33. data/ext/nokogiri/include/libxml2/libxml/list.h +137 -0
  34. data/ext/nokogiri/include/libxml2/libxml/nanoftp.h +163 -0
  35. data/ext/nokogiri/include/libxml2/libxml/nanohttp.h +81 -0
  36. data/ext/nokogiri/include/libxml2/libxml/parser.h +1243 -0
  37. data/ext/nokogiri/include/libxml2/libxml/parserInternals.h +644 -0
  38. data/ext/nokogiri/include/libxml2/libxml/pattern.h +100 -0
  39. data/ext/nokogiri/include/libxml2/libxml/relaxng.h +217 -0
  40. data/ext/nokogiri/include/libxml2/libxml/schemasInternals.h +958 -0
  41. data/ext/nokogiri/include/libxml2/libxml/schematron.h +142 -0
  42. data/ext/nokogiri/include/libxml2/libxml/threads.h +89 -0
  43. data/ext/nokogiri/include/libxml2/libxml/tree.h +1311 -0
  44. data/ext/nokogiri/include/libxml2/libxml/uri.h +94 -0
  45. data/ext/nokogiri/include/libxml2/libxml/valid.h +458 -0
  46. data/ext/nokogiri/include/libxml2/libxml/xinclude.h +129 -0
  47. data/ext/nokogiri/include/libxml2/libxml/xlink.h +189 -0
  48. data/ext/nokogiri/include/libxml2/libxml/xmlIO.h +368 -0
  49. data/ext/nokogiri/include/libxml2/libxml/xmlautomata.h +146 -0
  50. data/ext/nokogiri/include/libxml2/libxml/xmlerror.h +946 -0
  51. data/ext/nokogiri/include/libxml2/libxml/xmlexports.h +77 -0
  52. data/ext/nokogiri/include/libxml2/libxml/xmlmemory.h +224 -0
  53. data/ext/nokogiri/include/libxml2/libxml/xmlmodule.h +57 -0
  54. data/ext/nokogiri/include/libxml2/libxml/xmlreader.h +428 -0
  55. data/ext/nokogiri/include/libxml2/libxml/xmlregexp.h +222 -0
  56. data/ext/nokogiri/include/libxml2/libxml/xmlsave.h +88 -0
  57. data/ext/nokogiri/include/libxml2/libxml/xmlschemas.h +246 -0
  58. data/ext/nokogiri/include/libxml2/libxml/xmlschemastypes.h +151 -0
  59. data/ext/nokogiri/include/libxml2/libxml/xmlstring.h +140 -0
  60. data/ext/nokogiri/include/libxml2/libxml/xmlunicode.h +202 -0
  61. data/ext/nokogiri/include/libxml2/libxml/xmlversion.h +485 -0
  62. data/ext/nokogiri/include/libxml2/libxml/xmlwriter.h +488 -0
  63. data/ext/nokogiri/include/libxml2/libxml/xpath.h +564 -0
  64. data/ext/nokogiri/include/libxml2/libxml/xpathInternals.h +632 -0
  65. data/ext/nokogiri/include/libxml2/libxml/xpointer.h +114 -0
  66. data/ext/nokogiri/include/libxslt/attributes.h +38 -0
  67. data/ext/nokogiri/include/libxslt/documents.h +93 -0
  68. data/ext/nokogiri/include/libxslt/extensions.h +262 -0
  69. data/ext/nokogiri/include/libxslt/extra.h +72 -0
  70. data/ext/nokogiri/include/libxslt/functions.h +78 -0
  71. data/ext/nokogiri/include/libxslt/imports.h +75 -0
  72. data/ext/nokogiri/include/libxslt/keys.h +53 -0
  73. data/ext/nokogiri/include/libxslt/namespaces.h +68 -0
  74. data/ext/nokogiri/include/libxslt/numbersInternals.h +73 -0
  75. data/ext/nokogiri/include/libxslt/pattern.h +84 -0
  76. data/ext/nokogiri/include/libxslt/preproc.h +43 -0
  77. data/ext/nokogiri/include/libxslt/security.h +104 -0
  78. data/ext/nokogiri/include/libxslt/templates.h +77 -0
  79. data/ext/nokogiri/include/libxslt/transform.h +207 -0
  80. data/ext/nokogiri/include/libxslt/variables.h +118 -0
  81. data/ext/nokogiri/include/libxslt/xslt.h +110 -0
  82. data/ext/nokogiri/include/libxslt/xsltInternals.h +1978 -0
  83. data/ext/nokogiri/include/libxslt/xsltconfig.h +180 -0
  84. data/ext/nokogiri/include/libxslt/xsltexports.h +142 -0
  85. data/ext/nokogiri/include/libxslt/xsltlocale.h +76 -0
  86. data/ext/nokogiri/include/libxslt/xsltutils.h +313 -0
  87. data/ext/nokogiri/libxml2_backwards_compat.c +121 -0
  88. data/ext/nokogiri/nokogiri.c +278 -0
  89. data/ext/nokogiri/nokogiri.h +223 -0
  90. data/ext/nokogiri/test_global_handlers.c +40 -0
  91. data/ext/nokogiri/xml_attr.c +103 -0
  92. data/ext/nokogiri/xml_attribute_decl.c +70 -0
  93. data/ext/nokogiri/xml_cdata.c +57 -0
  94. data/ext/nokogiri/xml_comment.c +62 -0
  95. data/ext/nokogiri/xml_document.c +680 -0
  96. data/ext/nokogiri/xml_document_fragment.c +44 -0
  97. data/ext/nokogiri/xml_dtd.c +208 -0
  98. data/ext/nokogiri/xml_element_content.c +128 -0
  99. data/ext/nokogiri/xml_element_decl.c +69 -0
  100. data/ext/nokogiri/xml_encoding_handler.c +104 -0
  101. data/ext/nokogiri/xml_entity_decl.c +112 -0
  102. data/ext/nokogiri/xml_entity_reference.c +50 -0
  103. data/ext/nokogiri/xml_namespace.c +120 -0
  104. data/ext/nokogiri/xml_node.c +2144 -0
  105. data/ext/nokogiri/xml_node_set.c +498 -0
  106. data/ext/nokogiri/xml_processing_instruction.c +54 -0
  107. data/ext/nokogiri/xml_reader.c +719 -0
  108. data/ext/nokogiri/xml_relax_ng.c +185 -0
  109. data/ext/nokogiri/xml_sax_parser.c +310 -0
  110. data/ext/nokogiri/xml_sax_parser_context.c +281 -0
  111. data/ext/nokogiri/xml_sax_push_parser.c +168 -0
  112. data/ext/nokogiri/xml_schema.c +284 -0
  113. data/ext/nokogiri/xml_syntax_error.c +85 -0
  114. data/ext/nokogiri/xml_text.c +48 -0
  115. data/ext/nokogiri/xml_xpath_context.c +406 -0
  116. data/ext/nokogiri/xslt_stylesheet.c +264 -0
  117. data/gumbo-parser/CHANGES.md +63 -0
  118. data/gumbo-parser/Makefile +101 -0
  119. data/gumbo-parser/THANKS +27 -0
  120. data/lib/nokogiri/3.1/nokogiri.so +0 -0
  121. data/lib/nokogiri/class_resolver.rb +67 -0
  122. data/lib/nokogiri/css/node.rb +54 -0
  123. data/lib/nokogiri/css/parser.rb +759 -0
  124. data/lib/nokogiri/css/parser.y +280 -0
  125. data/lib/nokogiri/css/parser_extras.rb +94 -0
  126. data/lib/nokogiri/css/syntax_error.rb +9 -0
  127. data/lib/nokogiri/css/tokenizer.rb +155 -0
  128. data/lib/nokogiri/css/tokenizer.rex +56 -0
  129. data/lib/nokogiri/css/xpath_visitor.rb +359 -0
  130. data/lib/nokogiri/css.rb +60 -0
  131. data/lib/nokogiri/decorators/slop.rb +44 -0
  132. data/lib/nokogiri/extension.rb +31 -0
  133. data/lib/nokogiri/gumbo.rb +15 -0
  134. data/lib/nokogiri/html.rb +48 -0
  135. data/lib/nokogiri/html4/builder.rb +37 -0
  136. data/lib/nokogiri/html4/document.rb +331 -0
  137. data/lib/nokogiri/html4/document_fragment.rb +54 -0
  138. data/lib/nokogiri/html4/element_description.rb +25 -0
  139. data/lib/nokogiri/html4/element_description_defaults.rb +578 -0
  140. data/lib/nokogiri/html4/entity_lookup.rb +15 -0
  141. data/lib/nokogiri/html4/sax/parser.rb +61 -0
  142. data/lib/nokogiri/html4/sax/parser_context.rb +20 -0
  143. data/lib/nokogiri/html4/sax/push_parser.rb +37 -0
  144. data/lib/nokogiri/html4.rb +46 -0
  145. data/lib/nokogiri/html5/document.rb +88 -0
  146. data/lib/nokogiri/html5/document_fragment.rb +83 -0
  147. data/lib/nokogiri/html5/node.rb +96 -0
  148. data/lib/nokogiri/html5.rb +477 -0
  149. data/lib/nokogiri/jruby/dependencies.rb +21 -0
  150. data/lib/nokogiri/syntax_error.rb +6 -0
  151. data/lib/nokogiri/version/constant.rb +6 -0
  152. data/lib/nokogiri/version/info.rb +221 -0
  153. data/lib/nokogiri/version.rb +4 -0
  154. data/lib/nokogiri/xml/attr.rb +17 -0
  155. data/lib/nokogiri/xml/attribute_decl.rb +20 -0
  156. data/lib/nokogiri/xml/builder.rb +485 -0
  157. data/lib/nokogiri/xml/cdata.rb +13 -0
  158. data/lib/nokogiri/xml/character_data.rb +9 -0
  159. data/lib/nokogiri/xml/document.rb +418 -0
  160. data/lib/nokogiri/xml/document_fragment.rb +162 -0
  161. data/lib/nokogiri/xml/dtd.rb +34 -0
  162. data/lib/nokogiri/xml/element_content.rb +38 -0
  163. data/lib/nokogiri/xml/element_decl.rb +15 -0
  164. data/lib/nokogiri/xml/entity_decl.rb +21 -0
  165. data/lib/nokogiri/xml/entity_reference.rb +20 -0
  166. data/lib/nokogiri/xml/namespace.rb +16 -0
  167. data/lib/nokogiri/xml/node/save_options.rb +65 -0
  168. data/lib/nokogiri/xml/node.rb +1402 -0
  169. data/lib/nokogiri/xml/node_set.rb +364 -0
  170. data/lib/nokogiri/xml/notation.rb +19 -0
  171. data/lib/nokogiri/xml/parse_options.rb +133 -0
  172. data/lib/nokogiri/xml/pp/character_data.rb +21 -0
  173. data/lib/nokogiri/xml/pp/node.rb +55 -0
  174. data/lib/nokogiri/xml/pp.rb +4 -0
  175. data/lib/nokogiri/xml/processing_instruction.rb +10 -0
  176. data/lib/nokogiri/xml/reader.rb +107 -0
  177. data/lib/nokogiri/xml/relax_ng.rb +38 -0
  178. data/lib/nokogiri/xml/sax/document.rb +167 -0
  179. data/lib/nokogiri/xml/sax/parser.rb +125 -0
  180. data/lib/nokogiri/xml/sax/parser_context.rb +21 -0
  181. data/lib/nokogiri/xml/sax/push_parser.rb +61 -0
  182. data/lib/nokogiri/xml/sax.rb +6 -0
  183. data/lib/nokogiri/xml/schema.rb +73 -0
  184. data/lib/nokogiri/xml/searchable.rb +259 -0
  185. data/lib/nokogiri/xml/syntax_error.rb +71 -0
  186. data/lib/nokogiri/xml/text.rb +11 -0
  187. data/lib/nokogiri/xml/xpath/syntax_error.rb +13 -0
  188. data/lib/nokogiri/xml/xpath.rb +21 -0
  189. data/lib/nokogiri/xml/xpath_context.rb +16 -0
  190. data/lib/nokogiri/xml.rb +75 -0
  191. data/lib/nokogiri/xslt/stylesheet.rb +27 -0
  192. data/lib/nokogiri/xslt.rb +58 -0
  193. data/lib/nokogiri.rb +128 -0
  194. data/lib/xsd/xmlparser/nokogiri.rb +104 -0
  195. metadata +536 -0
@@ -0,0 +1,331 @@
1
+ # coding: utf-8
2
+ # frozen_string_literal: true
3
+
4
+ require "pathname"
5
+
6
+ module Nokogiri
7
+ module HTML4
8
+ class Document < Nokogiri::XML::Document
9
###
# Look up the encoding declared by this document's meta tags.
#
# A <meta charset="..."> element takes precedence. Otherwise the charset
# parameter found in the content of the Content-Type meta element is used.
# Returns nil when no charset can be found.
def meta_encoding
  charset_node = at_xpath("//meta[@charset]")
  return charset_node[:charset] if charset_node

  content_type_node = meta_content_type
  return unless content_type_node

  # Pull the value of the charset parameter out of e.g.
  # "text/html; charset=UTF-8".
  content_type_node["content"][/charset\s*=\s*([\w-]+)/i, 1]
end
19
+
20
###
# Set the meta tag encoding for this document.
#
# If a Content-Type meta element is already present, its content is
# rewritten to carry the given encoding. Failing that, an existing
# <meta charset> element has its charset attribute replaced.
#
# Otherwise a new meta element is created: a <meta charset> when the
# document's internal subset is an HTML5 DTD, or an http-equiv
# Content-Type meta element in every other case. It is placed first inside
# the head element if there is one; if not, head and/or html elements are
# supplied as necessary, before any text node or content element
# (typically <body>).
#
# The result when trying to set an encoding that is different
# from the document encoding is undefined.
#
# Beware in CRuby, that libxml2 automatically inserts a meta tag
# into a head element.
def meta_encoding=(encoding)
  content_type_meta = meta_content_type
  if content_type_meta
    content_type_meta["content"] = format("text/html; charset=%s", encoding)
    return encoding
  end

  charset_meta = at_xpath("//meta[@charset]")
  if charset_meta
    charset_meta["charset"] = encoding
    return encoding
  end

  # No declaration exists yet: build one that suits the doctype.
  new_meta = XML::Node.new("meta", self)
  dtd = internal_subset
  if dtd && dtd.html5_dtd?
    new_meta["charset"] = encoding
  else
    new_meta["http-equiv"] = "Content-Type"
    new_meta["content"] = format("text/html; charset=%s", encoding)
  end

  head = at_xpath("//head")
  if head
    head.prepend_child(new_meta)
  else
    set_metadata_element(new_meta)
  end
  encoding
end
59
+
60
# Find the first meta element that has a non-empty content attribute and
# whose http-equiv attribute equals "Content-Type" (compared
# case-insensitively). Returns nil when there is no such element.
def meta_content_type
  candidates = xpath("//meta[@http-equiv and boolean(@content)]")
  candidates.find { |meta| /\AContent-Type\z/i.match?(meta["http-equiv"]) }
end
private :meta_content_type
66
+
67
###
# Return the text of the first title element in this document, or nil if
# the document has no title element.
def title
  title_node = at_xpath("//title")
  title_node.inner_text if title_node
end
73
+
74
###
# Set the title string of this document.
#
# When a title element already exists, its children are replaced with a
# text node holding +text+.
#
# Otherwise a new title element is created and placed, in order of
# preference: at the end of the head element if there is one; right after
# a meta charset / Content-Type element if there is one; or wherever
# set_metadata_element decides, which supplies head and/or html elements
# as necessary.
def title=(text)
  text_node = XML::Text.new(text, self)

  existing_title = at_xpath("//title")
  if existing_title
    existing_title.children = text_node
    return text
  end

  new_title = XML::Node.new("title", self) << text_node

  head = at_xpath("//head")
  if head
    head << new_title
  else
    encoding_meta = at_xpath("//meta[@charset]") || meta_content_type
    if encoding_meta
      # better put after charset declaration
      encoding_meta.add_next_sibling(new_title)
    else
      set_metadata_element(new_title)
    end
  end
end
102
+
103
# Insert +element+ (a metadata element such as meta or title) at an
# appropriate place in the document:
#
# 1. appended to the head element if one exists;
# 2. otherwise, first inside a new head element that is prepended to the
#    html element;
# 3. otherwise, immediately before the first element or text node among
#    the document's children;
# 4. otherwise, inside a newly created html/head pair.
def set_metadata_element(element) # rubocop:disable Naming/AccessorMethodName
  head = at_xpath("//head")
  return head << element if head

  html = at_xpath("//html")
  if html
    created_head = html.prepend_child(XML::Node.new("head", self))
    return created_head.prepend_child(element)
  end

  # We reach here only if the underlying document model
  # allows <html>/<head> elements to be omitted and does not
  # automatically supply them.
  first_content = children.find { |node| XML::Element === node || XML::Text === node }
  return first_content.add_previous_sibling(element) if first_content

  created_html = add_child(XML::Node.new("html", self))
  created_head = created_html.add_child(XML::Node.new("head", self))
  created_head.prepend_child(element)
end
private :set_metadata_element
126
+
127
####
# Serialize this document using +options+. When the caller does not supply
# a +:save_with+ value, XML::Node::SaveOptions::DEFAULT_HTML is used. Save
# options can also be set using a block. See SaveOptions.
#
# These two statements are equivalent:
#
#   node.serialize(:encoding => 'UTF-8', :save_with => FORMAT | AS_XML)
#
# or
#
#   node.serialize(:encoding => 'UTF-8') do |config|
#     config.format.as_xml
#   end
#
# The +options+ hash passed by the caller is left untouched: the default is
# applied to a copy, so frozen or shared hashes can be passed safely.
def serialize(options = {})
  options = options.dup
  options[:save_with] ||= XML::Node::SaveOptions::DEFAULT_HTML
  # An explicit argument list still forwards the caller's block implicitly.
  super(options)
end
145
+
146
####
# Build a DocumentFragment belonging to this document from +tags+, passing
# this document's root node as the fragment's context node. +tags+ may be
# omitted (nil).
def fragment(tags = nil)
  DocumentFragment.new(self, tags, root)
end
151
+
152
# :call-seq:
#   xpath_doctype() → Nokogiri::CSS::XPathVisitor::DoctypeConfig
#
# [Returns] The doctype configuration used for CSS-to-XPath translation;
# for this class it is always DoctypeConfig::HTML4.
#
# See XPathVisitor for more information.
def xpath_doctype
  Nokogiri::CSS::XPathVisitor::DoctypeConfig::HTML4
end
161
+
162
class << self
  ###
  # Parse HTML. +string_or_io+ may be a String, or any object that
  # responds to _read_ and _close_ such as an IO, or StringIO. A Pathname
  # is also accepted: it is expanded, opened for the duration of the parse
  # and closed again before this method returns.
  #
  # +url+ is resource where this document is located; when omitted it
  # defaults to +string_or_io.path+ if the source responds to +path+.
  # +encoding+ is the encoding that should be used when processing the
  # document; when omitted it is taken from the source's own encoding
  # (unless that is ASCII-8BIT) or detected from the content. +options+
  # is a number that sets options in the parser, such as
  # Nokogiri::XML::ParseOptions::RECOVER. See the constants in
  # Nokogiri::XML::ParseOptions. If a block is given, the ParseOptions
  # object is yielded to it before parsing.
  def parse(string_or_io, url = nil, encoding = nil, options = XML::ParseOptions::DEFAULT_HTML)
    options = Nokogiri::XML::ParseOptions.new(options) if Integer === options
    yield options if block_given?

    url ||= string_or_io.respond_to?(:path) ? string_or_io.path : nil

    if string_or_io.respond_to?(:encoding)
      unless string_or_io.encoding.name == "ASCII-8BIT"
        encoding ||= string_or_io.encoding.name
      end
    end

    if string_or_io.respond_to?(:read)
      # Only a file that this method opened itself is closed afterwards;
      # IO objects supplied by the caller remain the caller's responsibility.
      opened_file = nil
      begin
        if string_or_io.is_a?(Pathname)
          # resolve the Pathname to the file and open it as an IO object, see #2110
          opened_file = string_or_io.expand_path.open
          string_or_io = opened_file
          url ||= string_or_io.path
        end

        unless encoding
          # Libxml2's parser has poor support for encoding
          # detection.  First, it does not recognize the HTML5
          # style meta charset declaration.  Secondly, even if it
          # successfully detects an encoding hint, it does not
          # re-decode or re-parse the preceding part which may be
          # garbled.
          #
          # EncodingReader aims to perform advanced encoding
          # detection beyond what Libxml2 does, and to emulate
          # rewinding of a stream and make Libxml2 redo parsing
          # from the start when an encoding hint is found.
          string_or_io = EncodingReader.new(string_or_io)
          begin
            return read_io(string_or_io, url, encoding, options.to_i)
          rescue EncodingFound => e
            # Retry below with the detected encoding; the reader kept the
            # first chunk so no input is lost.
            encoding = e.found_encoding
          end
        end
        return read_io(string_or_io, url, encoding, options.to_i)
      ensure
        opened_file&.close
      end
    end

    # read_memory pukes on empty docs
    if string_or_io.nil? || string_or_io.empty?
      return encoding ? new.tap { |i| i.encoding = encoding } : new
    end

    encoding ||= EncodingReader.detect_encoding(string_or_io)

    read_memory(string_or_io, url, encoding, options.to_i)
  end
end
222
+
223
# Raised to signal that an encoding has been detected while reading a
# document. The detected encoding is available via #found_encoding and is
# also included in the exception message.
class EncodingFound < StandardError # :nodoc: all
  attr_reader :found_encoding

  def initialize(encoding)
    message = format("encoding found: %s", encoding)
    super(message)
    @found_encoding = encoding
  end
end
231
+
232
+ # :nodoc: all
233
+ class EncodingReader
234
# SAX handler that records the encoding declared by meta elements.
# After parsing, #encoding holds the last declaration seen, or nil.
class SAXHandler < Nokogiri::XML::SAX::Document
  attr_reader :encoding

  def initialize
    @encoding = nil
    super()
  end

  # Inspect each meta element for either a charset attribute or an
  # http-equiv="Content-Type" attribute whose content carries a
  # "; charset=..." parameter. When both are present on the same element
  # the Content-Type value is stored last and therefore wins.
  def start_element(name, attrs = [])
    return unless name == "meta"

    attributes = Hash[attrs]

    charset = attributes["charset"]
    @encoding = charset if charset

    http_equiv = attributes["http-equiv"]
    return unless http_equiv && http_equiv.match(/\AContent-Type\z/i)

    content = attributes["content"]
    return unless content

    declared = content.match(/;\s*charset\s*=\s*([\w-]+)/)
    @encoding = declared[1] if declared
  end
end
254
+
255
# SAXHandler variant that aborts parsing early by throwing to +jumptag+:
# with the encoding as soon as one has been recorded, or with nil as soon
# as a div, h1, img, p or br element starts.
class JumpSAXHandler < SAXHandler
  def initialize(jumptag)
    @jumptag = jumptag
    super()
  end

  def start_element(name, attrs = [])
    super
    if @encoding
      throw(@jumptag, @encoding)
    elsif /\A(?:div|h1|img|p|br)\z/.match?(name)
      throw(@jumptag, nil)
    end
  end
end
267
+
268
# Try to determine the encoding declared inside +chunk+ (a String holding
# the beginning of a document). Returns the encoding name, or nil when
# none is found.
#
# Detection order:
# 1. If the chunk starts with an XML declaration, that declaration alone
#    is parsed with Nokogiri.XML and its encoding is returned.
# 2. On JRuby, a regular expression looks for a meta element carrying a
#    charset; failing that, the chunk is SAX-parsed with a JumpSAXHandler
#    that throws :encoding_found as soon as it has an answer.
# 3. Elsewhere, the chunk is fed to an HTML4 push parser with a SAXHandler
#    and whatever encoding the handler recorded is returned.
def self.detect_encoding(chunk)
  xml_declaration = chunk.match(/\A(<\?xml[ \t\r\n]+[^>]*>)/)
  return Nokogiri.XML(xml_declaration[1]).encoding if xml_declaration

  if Nokogiri.jruby?
    meta = chunk.match(/(<meta\s)(.*)(charset\s*=\s*([\w-]+))(.*)/i)
    return meta[4] if meta

    catch(:encoding_found) do
      Nokogiri::HTML4::SAX::Parser.new(JumpSAXHandler.new(:encoding_found)).parse(chunk)
      nil
    end
  else
    handler = SAXHandler.new
    parser = Nokogiri::HTML4::SAX::PushParser.new(handler)
    begin
      parser << chunk
    rescue Nokogiri::SyntaxError
      # The chunk is only the head of a document, so syntax errors are
      # expected here; the handler may still have seen a declaration
      # before the error. Anything else is a genuine failure and is
      # allowed to propagate.
    end
    handler.encoding
  end
end
290
+
291
# Wrap +io+, the object whose #read supplies the document. No chunk has
# been buffered and no encoding has been found yet.
def initialize(io)
  @io = io
  @firstchunk = @encoding_found = nil
end

# This method is used by the C extension so that
# Nokogiri::HTML4::Document#read_io() does not leak memory when
# EncodingFound is raised.
attr_reader :encoding_found
301
+
302
# Read up to +len+ bytes from the wrapped IO. Returns nil at end of input.
# A call without +len+ is not supported.
#
# On the very first read the chunk is kept and scanned for an encoding
# declaration. If one is found, EncodingFound is raised (and remembered
# in @encoding_found); the buffered chunk is then served again by the
# next call, so the caller can restart parsing with the right encoding.
def read(len)
  unless @firstchunk
    @firstchunk = @io.read(len)
    return nil unless @firstchunk

    # This implementation expects that the first call from
    # htmlReadIO() is made with a length long enough (~1KB) to
    # achieve advanced encoding detection.
    detected = EncodingReader.detect_encoding(@firstchunk)
    if detected
      # The first chunk is stored for the next read in retry.
      @encoding_found = EncodingFound.new(detected)
      raise @encoding_found
    end
  end
  @encoding_found = nil

  # Serve buffered bytes first, then top up from the underlying IO.
  buffer = @firstchunk.slice!(0, len)
  remaining = len - buffer.length
  if remaining > 0
    tail = @io.read(remaining)
    buffer << tail if tail
  end

  buffer.empty? ? nil : buffer
end
328
+ end
329
+ end
330
+ end
331
+ end
@@ -0,0 +1,54 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Nokogiri
4
+ module HTML4
5
+ class DocumentFragment < Nokogiri::XML::DocumentFragment
6
####
# Create a fragment from +tags+ inside a fresh HTML4::Document, using
# +encoding+ for that document.
#
# When +encoding+ is not given it is derived from +tags+: the name of
# +tags.encoding+ if +tags+ responds to +encoding+, except that
# ASCII-8BIT is treated as "UTF-8"; "UTF-8" is also used when +tags+ has
# no encoding at all. +options+ and the optional block are passed on to
# the constructor.
def self.parse(tags, encoding = nil, options = XML::ParseOptions::DEFAULT_HTML, &block)
  unless encoding
    encoding =
      if !tags.respond_to?(:encoding)
        "UTF-8"
      else
        tags_encoding = tags.encoding
        tags_encoding == ::Encoding::ASCII_8BIT ? "UTF-8" : tags_encoding.name
      end
  end

  document = HTML4::Document.new
  document.encoding = encoding

  new(document, tags, nil, options, &block)
end
26
+
27
# Populate this fragment from +tags+. Nothing is parsed when +tags+ is nil.
#
# +options+ may be an Integer or a ParseOptions object; if a block is
# given, the ParseOptions object is yielded to it before parsing.
#
# With a context node +ctx+, the tags are wrapped in a <div> and parsed by
# that node; the children of the first resulting node are adopted, and only
# the errors that parsing added to +document+ are recorded on the fragment.
#
# Without a context node, the tags are parsed as the body of a temporary
# HTML4 document that uses +document+'s encoding, and that document's
# errors are recorded on the fragment.
def initialize(document, tags = nil, ctx = nil, options = XML::ParseOptions::DEFAULT_HTML)
  return self unless tags

  options = Nokogiri::XML::ParseOptions.new(options) if options.is_a?(Integer)
  yield options if block_given?

  if ctx
    errors_before = document.errors.dup
    parsed = ctx.parse("<div>#{tags}</div>", options)
    unless parsed.empty?
      parsed.first.children.each { |node| node.parent = self }
    end
    self.errors = document.errors - errors_before
  else
    # When the input itself begins a line with a <body> tag, adopt the
    # whole body element; otherwise adopt the nodes inside the body.
    body_given = /^\s*?<body/i.match?(tags)
    adopted_path = body_given ? "/html/body" : "/html/body/node()"

    scratch_doc = HTML4::Document.parse("<html><body>#{tags}", nil, document.encoding, options)
    scratch_doc.xpath(adopted_path).each { |node| node.parent = self }
    self.errors = scratch_doc.errors
  end
  children
end
52
+ end
53
+ end
54
+ end
@@ -0,0 +1,25 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Nokogiri
4
+ module HTML4
5
class ElementDescription
  ###
  # True when this element is not an inline element.
  def block?
    !inline?
  end

  ###
  # String form: the element name and its description, separated by ": ".
  def to_s
    label = name
    details = description
    "#{label}: #{details}"
  end

  ###
  # Inspection string containing the class name, element name and
  # description.
  def inspect
    class_name = self.class.name
    label = name
    details = description
    "#<#{class_name}: #{label} #{details}>"
  end
end
24
+ end
25
+ end