nokogiri 1.13.8 → 1.15.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of nokogiri might be problematic. Click here for more details.

Files changed (120)
  1. checksums.yaml +4 -4
  2. data/Gemfile +40 -0
  3. data/LICENSE-DEPENDENCIES.md +830 -509
  4. data/LICENSE.md +1 -1
  5. data/README.md +18 -11
  6. data/dependencies.yml +33 -15
  7. data/ext/nokogiri/extconf.rb +164 -46
  8. data/ext/nokogiri/gumbo.c +20 -10
  9. data/ext/nokogiri/html4_document.c +3 -4
  10. data/ext/nokogiri/html4_element_description.c +20 -15
  11. data/ext/nokogiri/html4_entity_lookup.c +2 -2
  12. data/ext/nokogiri/html4_sax_parser_context.c +11 -22
  13. data/ext/nokogiri/html4_sax_push_parser.c +3 -3
  14. data/ext/nokogiri/nokogiri.c +84 -75
  15. data/ext/nokogiri/nokogiri.h +31 -16
  16. data/ext/nokogiri/test_global_handlers.c +1 -1
  17. data/ext/nokogiri/xml_attr.c +2 -2
  18. data/ext/nokogiri/xml_attribute_decl.c +2 -2
  19. data/ext/nokogiri/xml_cdata.c +32 -18
  20. data/ext/nokogiri/xml_comment.c +2 -2
  21. data/ext/nokogiri/xml_document.c +127 -34
  22. data/ext/nokogiri/xml_document_fragment.c +2 -2
  23. data/ext/nokogiri/xml_dtd.c +2 -2
  24. data/ext/nokogiri/xml_element_content.c +34 -31
  25. data/ext/nokogiri/xml_element_decl.c +7 -7
  26. data/ext/nokogiri/xml_encoding_handler.c +15 -7
  27. data/ext/nokogiri/xml_entity_decl.c +1 -1
  28. data/ext/nokogiri/xml_entity_reference.c +2 -2
  29. data/ext/nokogiri/xml_namespace.c +79 -14
  30. data/ext/nokogiri/xml_node.c +300 -34
  31. data/ext/nokogiri/xml_node_set.c +125 -107
  32. data/ext/nokogiri/xml_processing_instruction.c +2 -2
  33. data/ext/nokogiri/xml_reader.c +81 -48
  34. data/ext/nokogiri/xml_relax_ng.c +66 -81
  35. data/ext/nokogiri/xml_sax_parser.c +45 -20
  36. data/ext/nokogiri/xml_sax_parser_context.c +46 -30
  37. data/ext/nokogiri/xml_sax_push_parser.c +30 -11
  38. data/ext/nokogiri/xml_schema.c +95 -117
  39. data/ext/nokogiri/xml_syntax_error.c +1 -1
  40. data/ext/nokogiri/xml_text.c +28 -14
  41. data/ext/nokogiri/xml_xpath_context.c +216 -136
  42. data/ext/nokogiri/xslt_stylesheet.c +118 -64
  43. data/gumbo-parser/Makefile +10 -0
  44. data/gumbo-parser/src/attribute.h +1 -1
  45. data/gumbo-parser/src/error.c +10 -6
  46. data/gumbo-parser/src/error.h +1 -1
  47. data/gumbo-parser/src/foreign_attrs.c +15 -16
  48. data/gumbo-parser/src/foreign_attrs.gperf +1 -1
  49. data/gumbo-parser/src/{gumbo.h → nokogiri_gumbo.h} +1 -0
  50. data/gumbo-parser/src/parser.c +21 -5
  51. data/gumbo-parser/src/replacement.h +1 -1
  52. data/gumbo-parser/src/string_buffer.h +1 -1
  53. data/gumbo-parser/src/string_piece.c +1 -1
  54. data/gumbo-parser/src/svg_attrs.c +2 -2
  55. data/gumbo-parser/src/svg_tags.c +2 -2
  56. data/gumbo-parser/src/tag.c +2 -1
  57. data/gumbo-parser/src/tag_lookup.c +7 -7
  58. data/gumbo-parser/src/tag_lookup.gperf +1 -0
  59. data/gumbo-parser/src/tag_lookup.h +1 -1
  60. data/gumbo-parser/src/token_buffer.h +1 -1
  61. data/gumbo-parser/src/tokenizer.c +1 -1
  62. data/gumbo-parser/src/tokenizer.h +1 -1
  63. data/gumbo-parser/src/utf8.c +1 -1
  64. data/gumbo-parser/src/utf8.h +1 -1
  65. data/gumbo-parser/src/util.c +1 -3
  66. data/gumbo-parser/src/util.h +4 -0
  67. data/gumbo-parser/src/vector.h +1 -1
  68. data/lib/nokogiri/css/node.rb +2 -2
  69. data/lib/nokogiri/css/xpath_visitor.rb +7 -5
  70. data/lib/nokogiri/css.rb +6 -0
  71. data/lib/nokogiri/decorators/slop.rb +1 -1
  72. data/lib/nokogiri/encoding_handler.rb +57 -0
  73. data/lib/nokogiri/extension.rb +4 -3
  74. data/lib/nokogiri/html4/document.rb +2 -121
  75. data/lib/nokogiri/html4/document_fragment.rb +1 -1
  76. data/lib/nokogiri/html4/element_description_defaults.rb +1827 -365
  77. data/lib/nokogiri/html4/encoding_reader.rb +121 -0
  78. data/lib/nokogiri/html4.rb +1 -0
  79. data/lib/nokogiri/html5/document.rb +113 -36
  80. data/lib/nokogiri/html5/document_fragment.rb +10 -3
  81. data/lib/nokogiri/html5/node.rb +8 -5
  82. data/lib/nokogiri/html5.rb +130 -216
  83. data/lib/nokogiri/jruby/dependencies.rb +1 -19
  84. data/lib/nokogiri/jruby/nokogiri_jars.rb +43 -0
  85. data/lib/nokogiri/version/constant.rb +1 -1
  86. data/lib/nokogiri/version/info.rb +11 -10
  87. data/lib/nokogiri/xml/attr.rb +49 -0
  88. data/lib/nokogiri/xml/attribute_decl.rb +4 -2
  89. data/lib/nokogiri/xml/builder.rb +1 -1
  90. data/lib/nokogiri/xml/document.rb +102 -55
  91. data/lib/nokogiri/xml/document_fragment.rb +50 -7
  92. data/lib/nokogiri/xml/element_content.rb +10 -2
  93. data/lib/nokogiri/xml/element_decl.rb +4 -2
  94. data/lib/nokogiri/xml/entity_decl.rb +4 -2
  95. data/lib/nokogiri/xml/namespace.rb +42 -0
  96. data/lib/nokogiri/xml/node/save_options.rb +14 -4
  97. data/lib/nokogiri/xml/node.rb +212 -48
  98. data/lib/nokogiri/xml/node_set.rb +88 -9
  99. data/lib/nokogiri/xml/parse_options.rb +129 -50
  100. data/lib/nokogiri/xml/pp/node.rb +28 -15
  101. data/lib/nokogiri/xml/processing_instruction.rb +2 -1
  102. data/lib/nokogiri/xml/sax/document.rb +1 -1
  103. data/lib/nokogiri/xml/sax/parser.rb +2 -3
  104. data/lib/nokogiri/xml/searchable.rb +18 -10
  105. data/lib/nokogiri/xslt.rb +74 -4
  106. data/lib/nokogiri.rb +15 -15
  107. data/lib/xsd/xmlparser/nokogiri.rb +4 -2
  108. data/patches/libxml2/0010-update-config.guess-and-config.sub-for-libxml2.patch +224 -0
  109. data/patches/libxml2/0011-rip-out-libxml2-s-libc_single_threaded-support.patch +30 -0
  110. data/patches/libxslt/0001-update-config.guess-and-config.sub-for-libxslt.patch +224 -0
  111. data/ports/archives/libxml2-2.11.7.tar.xz +0 -0
  112. data/ports/archives/libxslt-1.1.39.tar.xz +0 -0
  113. metadata +19 -242
  114. data/patches/libxml2/0004-use-glibc-strlen.patch +0 -53
  115. data/patches/libxml2/0005-avoid-isnan-isinf.patch +0 -81
  116. data/patches/libxml2/0006-update-automake-files-for-arm64.patch +0 -3040
  117. data/patches/libxml2/0008-htmlParseComment-handle-abruptly-closed-comments.patch +0 -61
  118. data/patches/libxslt/0001-update-automake-files-for-arm64.patch +0 -3037
  119. data/ports/archives/libxml2-2.9.14.tar.xz +0 -0
  120. data/ports/archives/libxslt-1.1.35.tar.xz +0 -0
@@ -1,14 +1,14 @@
1
1
  /* ANSI-C code produced by gperf version 3.1 */
2
- /* Command-line: gperf -m100 lib/tag_lookup.gperf */
2
+ /* Command-line: gperf -m100 src/tag_lookup.gperf */
3
3
  /* Computed positions: -k'1-2,$' */
4
- /* Filtered by: mk/gperf-filter.sed */
4
+ /* Filtered by: gperf-filter.sed */
5
5
 
6
6
  #include "tag_lookup.h"
7
7
  #include "macros.h"
8
8
  #include "ascii.h"
9
9
  #include <string.h>
10
10
 
11
- #define TOTAL_KEYWORDS 150
11
+ #define TOTAL_KEYWORDS 151
12
12
  #define MIN_WORD_LENGTH 1
13
13
  #define MAX_WORD_LENGTH 14
14
14
  #define MIN_HASH_VALUE 9
@@ -26,7 +26,7 @@ hash (register const char *str, register size_t len)
26
26
  272, 272, 272, 272, 272, 272, 272, 272, 272, 272,
27
27
  272, 272, 272, 272, 272, 272, 272, 272, 272, 272,
28
28
  272, 272, 272, 272, 272, 272, 272, 272, 272, 272,
29
- 272, 272, 272, 272, 272, 272, 272, 272, 272, 9,
29
+ 272, 272, 272, 272, 272, 272, 272, 272, 272, 11,
30
30
  7, 6, 4, 4, 3, 4, 3, 3, 272, 272,
31
31
  272, 272, 272, 272, 272, 70, 83, 152, 7, 16,
32
32
  61, 98, 5, 76, 102, 126, 12, 19, 54, 54,
@@ -69,7 +69,7 @@ gumbo_tag_lookup (register const char *str, register size_t len)
69
69
  static const unsigned char lengthtable[] =
70
70
  {
71
71
  0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 2,
72
- 2, 2, 2, 6, 2, 6, 2, 4, 0, 7, 6, 3, 0, 3,
72
+ 2, 2, 2, 6, 2, 6, 6, 4, 2, 7, 6, 3, 0, 3,
73
73
  0, 6, 6, 8, 5, 0, 0, 4, 5, 5, 8, 0, 2, 4,
74
74
  5, 2, 0, 5, 4, 2, 0, 7, 0, 8, 5, 0, 0, 0,
75
75
  0, 0, 0, 5, 3, 4, 5, 1, 4, 0, 4, 1, 2, 8,
@@ -111,9 +111,9 @@ gumbo_tag_lookup (register const char *str, register size_t len)
111
111
  {"spacer", GUMBO_TAG_SPACER},
112
112
  {"h2", GUMBO_TAG_H2},
113
113
  {"header", GUMBO_TAG_HEADER},
114
- {"h1", GUMBO_TAG_H1},
114
+ {"search", GUMBO_TAG_SEARCH},
115
115
  {"head", GUMBO_TAG_HEAD},
116
- {(char*)0,GUMBO_TAG_UNKNOWN},
116
+ {"h1", GUMBO_TAG_H1},
117
117
  {"details", GUMBO_TAG_DETAILS},
118
118
  {"select", GUMBO_TAG_SELECT},
119
119
  {"dir", GUMBO_TAG_DIR},
@@ -167,3 +167,4 @@ spacer, GUMBO_TAG_SPACER
167
167
  tt, GUMBO_TAG_TT
168
168
  rtc, GUMBO_TAG_RTC
169
169
  dialog, GUMBO_TAG_DIALOG
170
+ search, GUMBO_TAG_SEARCH
@@ -1,7 +1,7 @@
1
1
  #ifndef GUMBO_TAG_LOOKUP_H_
2
2
  #define GUMBO_TAG_LOOKUP_H_
3
3
 
4
- #include "gumbo.h"
4
+ #include "nokogiri_gumbo.h"
5
5
 
6
6
  typedef struct {
7
7
  const char *key;
@@ -20,7 +20,7 @@
20
20
  #include <stdbool.h>
21
21
  #include <stddef.h>
22
22
 
23
- #include "gumbo.h"
23
+ #include "nokogiri_gumbo.h"
24
24
 
25
25
  #ifdef __cplusplus
26
26
  extern "C" {
@@ -50,7 +50,7 @@
50
50
  #include "attribute.h"
51
51
  #include "char_ref.h"
52
52
  #include "error.h"
53
- #include "gumbo.h"
53
+ #include "nokogiri_gumbo.h"
54
54
  #include "parser.h"
55
55
  #include "string_buffer.h"
56
56
  #include "token_type.h"
@@ -7,7 +7,7 @@
7
7
  #include <stdbool.h>
8
8
  #include <stddef.h>
9
9
 
10
- #include "gumbo.h"
10
+ #include "nokogiri_gumbo.h"
11
11
  #include "token_type.h"
12
12
  #include "tokenizer_states.h"
13
13
 
@@ -22,7 +22,7 @@
22
22
  #include <string.h>
23
23
 
24
24
  #include "error.h"
25
- #include "gumbo.h"
25
+ #include "nokogiri_gumbo.h"
26
26
  #include "parser.h"
27
27
  #include "ascii.h"
28
28
  #include "vector.h"
@@ -19,7 +19,7 @@
19
19
  #include <stdbool.h>
20
20
  #include <stddef.h>
21
21
 
22
- #include "gumbo.h"
22
+ #include "nokogiri_gumbo.h"
23
23
  #include "macros.h"
24
24
 
25
25
  #ifdef __cplusplus
@@ -19,7 +19,7 @@
19
19
  #include <stdlib.h>
20
20
  #include <string.h>
21
21
  #include "util.h"
22
- #include "gumbo.h"
22
+ #include "nokogiri_gumbo.h"
23
23
 
24
24
  void* gumbo_alloc(size_t size) {
25
25
  void* ptr = malloc(size);
@@ -63,6 +63,4 @@ void gumbo_debug(const char* format, ...) {
63
63
  va_end(args);
64
64
  fflush(stdout);
65
65
  }
66
- #else
67
- void gumbo_debug(const char* UNUSED_ARG(format), ...) {}
68
66
  #endif
@@ -21,7 +21,11 @@ void* gumbo_realloc(void* ptr, size_t size) RETURNS_NONNULL;
21
21
  void gumbo_free(void* ptr);
22
22
 
23
23
  // Debug wrapper for printf
24
+ #ifdef GUMBO_DEBUG
24
25
  void gumbo_debug(const char* format, ...) PRINTF(1);
26
+ #else
27
+ static inline void PRINTF(1) gumbo_debug(const char* UNUSED_ARG(format), ...) {};
28
+ #endif
25
29
 
26
30
  #ifdef __cplusplus
27
31
  }
@@ -1,7 +1,7 @@
1
1
  #ifndef GUMBO_VECTOR_H_
2
2
  #define GUMBO_VECTOR_H_
3
3
 
4
- #include "gumbo.h"
4
+ #include "nokogiri_gumbo.h"
5
5
 
6
6
  #ifdef __cplusplus
7
7
  extern "C" {
@@ -40,9 +40,9 @@ module Nokogiri
40
40
 
41
41
  # Convert to_type
42
42
  def to_type
43
- [@type] + @value.map do |n|
43
+ [@type] + @value.filter_map do |n|
44
44
  n.to_type if n.respond_to?(:to_type)
45
- end.compact
45
+ end
46
46
  end
47
47
 
48
48
  # Convert to array
@@ -133,7 +133,7 @@ module Nokogiri
133
133
  args += node.value[1..-1].map do |n|
134
134
  n.is_a?(Nokogiri::CSS::Node) ? n.accept(self) : n
135
135
  end
136
- "#{node.value.first}#{args.join(",")})"
136
+ "nokogiri:#{node.value.first}#{args.join(",")})"
137
137
  end
138
138
  end
139
139
 
@@ -207,7 +207,7 @@ module Nokogiri
207
207
  when "parent" then "node()"
208
208
  when "root" then "not(parent::*)"
209
209
  else
210
- node.value.first + "(.)"
210
+ "nokogiri:#{node.value.first}(.)"
211
211
  end
212
212
  end
213
213
  end
@@ -230,11 +230,11 @@ module Nokogiri
230
230
  "descendant_selector" => "//",
231
231
  "child_selector" => "/",
232
232
  }.each do |k, v|
233
- class_eval %{
233
+ class_eval <<~RUBY, __FILE__, __LINE__ + 1
234
234
  def visit_#{k} node
235
235
  "\#{node.value.first.accept(self) if node.value.first}#{v}\#{node.value.last.accept(self)}"
236
236
  end
237
- }
237
+ RUBY
238
238
  end
239
239
 
240
240
  def visit_conditional_selector(node)
@@ -278,7 +278,9 @@ module Nokogiri
278
278
  end
279
279
 
280
280
  def nth(node, options = {})
281
- raise ArgumentError, "expected an+b node to contain 4 tokens, but is #{node.value.inspect}" unless node.value.size == 4
281
+ unless node.value.size == 4
282
+ raise(ArgumentError, "expected an+b node to contain 4 tokens, but is #{node.value.inspect}")
283
+ end
282
284
 
283
285
  a, b = read_a_and_positive_b(node.value)
284
286
  position = if options[:child]
data/lib/nokogiri/css.rb CHANGED
@@ -40,9 +40,15 @@ module Nokogiri
40
40
  # 💡 Note that translated queries are cached for performance concerns.
41
41
  #
42
42
  def xpath_for(selector, options = {})
43
+ raise TypeError, "no implicit conversion of #{selector.inspect} to String" unless selector.respond_to?(:to_str)
44
+
45
+ selector = selector.to_str
46
+ raise Nokogiri::CSS::SyntaxError, "empty CSS selector" if selector.empty?
47
+
43
48
  prefix = options.fetch(:prefix, Nokogiri::XML::XPath::GLOBAL_SEARCH_PREFIX)
44
49
  visitor = options.fetch(:visitor) { Nokogiri::CSS::XPathVisitor.new }
45
50
  ns = options.fetch(:ns, {})
51
+
46
52
  Parser.new(ns).xpath_for(selector, prefix, visitor)
47
53
  end
48
54
  end
@@ -25,7 +25,7 @@ module Nokogiri
25
25
  else
26
26
  CSS::Parser.without_cache do
27
27
  list = xpath(
28
- *CSS.xpath_for("#{name}#{args.first}", prefix: XPATH_PREFIX)
28
+ *CSS.xpath_for("#{name}#{args.first}", prefix: XPATH_PREFIX),
29
29
  )
30
30
  end
31
31
  end
@@ -0,0 +1,57 @@
1
+ # encoding: utf-8
2
+ # frozen_string_literal: true
3
+
4
+ module Nokogiri
5
+ class EncodingHandler
6
+ # Popular encoding aliases not known by all iconv implementations that Nokogiri should support.
7
+ USEFUL_ALIASES = {
8
+ # alias_name => true_name
9
+ "NOKOGIRI-SENTINEL" => "UTF-8", # indicating the Nokogiri has installed aliases
10
+ "Windows-31J" => "CP932", # Windows-31J is the IANA registered name of CP932.
11
+ "UTF-8" => "UTF-8", # for JRuby tests, this is a no-op in CRuby
12
+ }
13
+
14
+ class << self
15
+ def install_default_aliases
16
+ USEFUL_ALIASES.each do |alias_name, name|
17
+ EncodingHandler.alias(name, alias_name) if EncodingHandler[alias_name].nil?
18
+ end
19
+ end
20
+ end
21
+
22
+ # :stopdoc:
23
+ if Nokogiri.jruby?
24
+ class << self
25
+ def [](name)
26
+ storage.key?(name) ? new(storage[name]) : nil
27
+ end
28
+
29
+ def alias(name, alias_name)
30
+ storage[alias_name] = name
31
+ end
32
+
33
+ def delete(name)
34
+ storage.delete(name)
35
+ end
36
+
37
+ def clear_aliases!
38
+ storage.clear
39
+ end
40
+
41
+ private
42
+
43
+ def storage
44
+ @storage ||= {}
45
+ end
46
+ end
47
+
48
+ def initialize(name)
49
+ @name = name
50
+ end
51
+
52
+ attr_reader :name
53
+ end
54
+ end
55
+ end
56
+
57
+ Nokogiri::EncodingHandler.install_default_aliases
@@ -3,13 +3,14 @@
3
3
  # load the C or Java extension
4
4
  begin
5
5
  # native precompiled gems package shared libraries in <gem_dir>/lib/nokogiri/<ruby_version>
6
- ::RUBY_VERSION =~ /(\d+\.\d+)/
6
+ RUBY_VERSION =~ /(\d+\.\d+)/
7
7
  require_relative "#{Regexp.last_match(1)}/nokogiri"
8
8
  rescue LoadError => e
9
- if /GLIBC/.match?(e.message)
9
+ if e.message.include?("GLIBC")
10
10
  warn(<<~EOM)
11
11
 
12
- ERROR: It looks like you're trying to use Nokogiri as a precompiled native gem on a system with glibc < 2.17:
12
+ ERROR: It looks like you're trying to use Nokogiri as a precompiled native gem on a system
13
+ with an unsupported version of glibc.
13
14
 
14
15
  #{e.message}
15
16
 
@@ -176,7 +176,7 @@ module Nokogiri
176
176
  url ||= string_or_io.respond_to?(:path) ? string_or_io.path : nil
177
177
 
178
178
  if string_or_io.respond_to?(:encoding)
179
- unless string_or_io.encoding.name == "ASCII-8BIT"
179
+ unless string_or_io.encoding == Encoding::ASCII_8BIT
180
180
  encoding ||= string_or_io.encoding.name
181
181
  end
182
182
  end
@@ -189,21 +189,10 @@ module Nokogiri
189
189
  end
190
190
 
191
191
  unless encoding
192
- # Libxml2's parser has poor support for encoding
193
- # detection. First, it does not recognize the HTML5
194
- # style meta charset declaration. Secondly, even if it
195
- # successfully detects an encoding hint, it does not
196
- # re-decode or re-parse the preceding part which may be
197
- # garbled.
198
- #
199
- # EncodingReader aims to perform advanced encoding
200
- # detection beyond what Libxml2 does, and to emulate
201
- # rewinding of a stream and make Libxml2 redo parsing
202
- # from the start when an encoding hint is found.
203
192
  string_or_io = EncodingReader.new(string_or_io)
204
193
  begin
205
194
  return read_io(string_or_io, url, encoding, options.to_i)
206
- rescue EncodingFound => e
195
+ rescue EncodingReader::EncodingFound => e
207
196
  encoding = e.found_encoding
208
197
  end
209
198
  end
@@ -220,114 +209,6 @@ module Nokogiri
220
209
  read_memory(string_or_io, url, encoding, options.to_i)
221
210
  end
222
211
  end
223
-
224
- class EncodingFound < StandardError # :nodoc: all
225
- attr_reader :found_encoding
226
-
227
- def initialize(encoding)
228
- @found_encoding = encoding
229
- super(format("encoding found: %s", encoding))
230
- end
231
- end
232
-
233
- # :nodoc: all
234
- class EncodingReader
235
- class SAXHandler < Nokogiri::XML::SAX::Document
236
- attr_reader :encoding
237
-
238
- def initialize
239
- @encoding = nil
240
- super()
241
- end
242
-
243
- def start_element(name, attrs = [])
244
- return unless name == "meta"
245
-
246
- attr = Hash[attrs]
247
- (charset = attr["charset"]) &&
248
- (@encoding = charset)
249
- (http_equiv = attr["http-equiv"]) &&
250
- http_equiv.match(/\AContent-Type\z/i) &&
251
- (content = attr["content"]) &&
252
- (m = content.match(/;\s*charset\s*=\s*([\w-]+)/)) &&
253
- (@encoding = m[1])
254
- end
255
- end
256
-
257
- class JumpSAXHandler < SAXHandler
258
- def initialize(jumptag)
259
- @jumptag = jumptag
260
- super()
261
- end
262
-
263
- def start_element(name, attrs = [])
264
- super
265
- throw(@jumptag, @encoding) if @encoding
266
- throw(@jumptag, nil) if /\A(?:div|h1|img|p|br)\z/.match?(name)
267
- end
268
- end
269
-
270
- def self.detect_encoding(chunk)
271
- (m = chunk.match(/\A(<\?xml[ \t\r\n][^>]*>)/)) &&
272
- (return Nokogiri.XML(m[1]).encoding)
273
-
274
- if Nokogiri.jruby?
275
- (m = chunk.match(/(<meta\s)(.*)(charset\s*=\s*([\w-]+))(.*)/i)) &&
276
- (return m[4])
277
- catch(:encoding_found) do
278
- Nokogiri::HTML4::SAX::Parser.new(JumpSAXHandler.new(:encoding_found)).parse(chunk)
279
- nil
280
- end
281
- else
282
- handler = SAXHandler.new
283
- parser = Nokogiri::HTML4::SAX::PushParser.new(handler)
284
- begin
285
- parser << chunk
286
- rescue
287
- Nokogiri::SyntaxError
288
- end
289
- handler.encoding
290
- end
291
- end
292
-
293
- def initialize(io)
294
- @io = io
295
- @firstchunk = nil
296
- @encoding_found = nil
297
- end
298
-
299
- # This method is used by the C extension so that
300
- # Nokogiri::HTML4::Document#read_io() does not leak memory when
301
- # EncodingFound is raised.
302
- attr_reader :encoding_found
303
-
304
- def read(len)
305
- # no support for a call without len
306
-
307
- unless @firstchunk
308
- (@firstchunk = @io.read(len)) || (return nil)
309
-
310
- # This implementation expects that the first call from
311
- # htmlReadIO() is made with a length long enough (~1KB) to
312
- # achieve advanced encoding detection.
313
- if (encoding = EncodingReader.detect_encoding(@firstchunk))
314
- # The first chunk is stored for the next read in retry.
315
- raise @encoding_found = EncodingFound.new(encoding)
316
- end
317
- end
318
- @encoding_found = nil
319
-
320
- ret = @firstchunk.slice!(0, len)
321
- if (len -= ret.length) > 0
322
- (rest = @io.read(len)) && ret << (rest)
323
- end
324
- if ret.empty?
325
- nil
326
- else
327
- ret
328
- end
329
- end
330
- end
331
212
  end
332
213
  end
333
214
  end
@@ -24,7 +24,7 @@ module Nokogiri
24
24
  new(doc, tags, nil, options, &block)
25
25
  end
26
26
 
27
- def initialize(document, tags = nil, ctx = nil, options = XML::ParseOptions::DEFAULT_HTML)
27
+ def initialize(document, tags = nil, ctx = nil, options = XML::ParseOptions::DEFAULT_HTML) # rubocop:disable Lint/MissingSuper
28
28
  return self unless tags
29
29
 
30
30
  options = Nokogiri::XML::ParseOptions.new(options) if Integer === options