nokogiri 1.13.10 → 1.14.0.rc1
Sign up to get free protection for your applications and to get access to all the features.
Potentially problematic release.
This version of nokogiri might be problematic. Click here for more details.
- checksums.yaml +4 -4
- data/Gemfile +33 -0
- data/LICENSE-DEPENDENCIES.md +830 -509
- data/README.md +18 -11
- data/dependencies.yml +25 -7
- data/ext/nokogiri/extconf.rb +79 -20
- data/ext/nokogiri/gumbo.c +19 -9
- data/ext/nokogiri/html4_document.c +1 -1
- data/ext/nokogiri/html4_entity_lookup.c +1 -1
- data/ext/nokogiri/html4_sax_parser_context.c +0 -5
- data/ext/nokogiri/nokogiri.c +32 -51
- data/ext/nokogiri/nokogiri.h +17 -14
- data/ext/nokogiri/xml_attribute_decl.c +1 -1
- data/ext/nokogiri/xml_cdata.c +1 -1
- data/ext/nokogiri/xml_document.c +16 -11
- data/ext/nokogiri/xml_element_content.c +2 -2
- data/ext/nokogiri/xml_element_decl.c +1 -1
- data/ext/nokogiri/xml_encoding_handler.c +2 -2
- data/ext/nokogiri/xml_namespace.c +38 -8
- data/ext/nokogiri/xml_node.c +286 -26
- data/ext/nokogiri/xml_node_set.c +0 -2
- data/ext/nokogiri/xml_reader.c +40 -20
- data/ext/nokogiri/xml_relax_ng.c +0 -2
- data/ext/nokogiri/xml_sax_parser.c +22 -16
- data/ext/nokogiri/xml_sax_parser_context.c +0 -5
- data/ext/nokogiri/xml_sax_push_parser.c +0 -2
- data/ext/nokogiri/xml_schema.c +0 -2
- data/ext/nokogiri/xml_xpath_context.c +87 -83
- data/ext/nokogiri/xslt_stylesheet.c +14 -13
- data/gumbo-parser/Makefile +10 -0
- data/gumbo-parser/src/attribute.h +1 -1
- data/gumbo-parser/src/error.c +1 -1
- data/gumbo-parser/src/error.h +1 -1
- data/gumbo-parser/src/foreign_attrs.c +2 -2
- data/gumbo-parser/src/{gumbo.h → nokogiri_gumbo.h} +1 -0
- data/gumbo-parser/src/parser.c +7 -4
- data/gumbo-parser/src/replacement.h +1 -1
- data/gumbo-parser/src/string_buffer.h +1 -1
- data/gumbo-parser/src/string_piece.c +1 -1
- data/gumbo-parser/src/svg_attrs.c +2 -2
- data/gumbo-parser/src/svg_tags.c +2 -2
- data/gumbo-parser/src/tag.c +2 -1
- data/gumbo-parser/src/tag_lookup.c +7 -7
- data/gumbo-parser/src/tag_lookup.gperf +1 -0
- data/gumbo-parser/src/tag_lookup.h +1 -1
- data/gumbo-parser/src/token_buffer.h +1 -1
- data/gumbo-parser/src/tokenizer.c +1 -1
- data/gumbo-parser/src/tokenizer.h +1 -1
- data/gumbo-parser/src/utf8.c +1 -1
- data/gumbo-parser/src/utf8.h +1 -1
- data/gumbo-parser/src/util.c +1 -3
- data/gumbo-parser/src/util.h +4 -0
- data/gumbo-parser/src/vector.h +1 -1
- data/lib/nokogiri/css/node.rb +2 -2
- data/lib/nokogiri/css/xpath_visitor.rb +3 -1
- data/lib/nokogiri/css.rb +6 -0
- data/lib/nokogiri/encoding_handler.rb +57 -0
- data/lib/nokogiri/extension.rb +3 -2
- data/lib/nokogiri/html4/document.rb +2 -121
- data/lib/nokogiri/html4/element_description_defaults.rb +6 -12
- data/lib/nokogiri/html4/encoding_reader.rb +121 -0
- data/lib/nokogiri/html4.rb +1 -0
- data/lib/nokogiri/html5/document.rb +113 -36
- data/lib/nokogiri/html5/document_fragment.rb +9 -2
- data/lib/nokogiri/html5/node.rb +3 -5
- data/lib/nokogiri/html5.rb +127 -216
- data/lib/nokogiri/jruby/dependencies.rb +1 -19
- data/lib/nokogiri/jruby/nokogiri_jars.rb +43 -0
- data/lib/nokogiri/version/constant.rb +1 -1
- data/lib/nokogiri/version/info.rb +11 -10
- data/lib/nokogiri/xml/attr.rb +49 -0
- data/lib/nokogiri/xml/builder.rb +1 -1
- data/lib/nokogiri/xml/document.rb +102 -54
- data/lib/nokogiri/xml/document_fragment.rb +49 -6
- data/lib/nokogiri/xml/namespace.rb +42 -0
- data/lib/nokogiri/xml/node/save_options.rb +4 -2
- data/lib/nokogiri/xml/node.rb +190 -35
- data/lib/nokogiri/xml/node_set.rb +87 -9
- data/lib/nokogiri/xml/parse_options.rb +127 -48
- data/lib/nokogiri/xml/pp/node.rb +6 -4
- data/lib/nokogiri/xml/processing_instruction.rb +2 -1
- data/lib/nokogiri/xml/sax/parser.rb +2 -3
- data/lib/nokogiri/xslt.rb +1 -1
- data/lib/nokogiri.rb +3 -11
- metadata +11 -247
- data/patches/libxml2/0005-avoid-isnan-isinf.patch +0 -81
data/gumbo-parser/Makefile
CHANGED
@@ -13,8 +13,18 @@ LDFLAGS := -pthread
|
|
13
13
|
|
14
14
|
all: check
|
15
15
|
|
16
|
+
# don't try to regenerate ragel or gperf files in CI, that should be a development-only action and
|
17
|
+
# the generated files should be committed to SCM
|
18
|
+
ifneq ($(CI),true)
|
19
|
+
src/foreign_attrs.c: src/foreign_attrs.gperf
|
20
|
+
gperf -m100 -n $< | ./gperf-filter.sed > $@
|
21
|
+
|
22
|
+
src/%.c: src/%.gperf
|
23
|
+
gperf -m100 $< | ./gperf-filter.sed > $@
|
24
|
+
|
16
25
|
src/%.c: src/%.rl
|
17
26
|
ragel -F1 -o $@ $<
|
27
|
+
endif
|
18
28
|
|
19
29
|
build/src:
|
20
30
|
mkdir -p $@
|
data/gumbo-parser/src/error.c
CHANGED
data/gumbo-parser/src/error.h
CHANGED
data/gumbo-parser/src/foreign_attrs.c
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
/* ANSI-C code produced by gperf version 3.1 */
|
2
|
-
/* Command-line: gperf -m100 -n
|
2
|
+
/* Command-line: gperf -m100 -n src/foreign_attrs.gperf */
|
3
3
|
/* Computed positions: -k'2,8' */
|
4
|
-
/* Filtered by:
|
4
|
+
/* Filtered by: gperf-filter.sed */
|
5
5
|
|
6
6
|
#include "replacement.h"
|
7
7
|
#include "macros.h"
|
@@ -292,6 +292,7 @@ typedef enum {
|
|
292
292
|
GUMBO_TAG_TT,
|
293
293
|
GUMBO_TAG_RTC,
|
294
294
|
GUMBO_TAG_DIALOG,
|
295
|
+
GUMBO_TAG_SEARCH,
|
295
296
|
// Used for all tags that don't have special handling in HTML.
|
296
297
|
GUMBO_TAG_UNKNOWN,
|
297
298
|
// A marker value to indicate the end of the enum, for iterating over it.
|
data/gumbo-parser/src/parser.c
CHANGED
@@ -24,7 +24,7 @@
|
|
24
24
|
#include "ascii.h"
|
25
25
|
#include "attribute.h"
|
26
26
|
#include "error.h"
|
27
|
-
#include "gumbo.h"
|
27
|
+
#include "nokogiri_gumbo.h"
|
28
28
|
#include "insertion_mode.h"
|
29
29
|
#include "macros.h"
|
30
30
|
#include "parser.h"
|
@@ -2940,7 +2940,7 @@ static void handle_in_body(GumboParser* parser, GumboToken* token) {
|
|
2940
2940
|
TAG(DETAILS), TAG(DIALOG), TAG(DIR), TAG(DIV), TAG(DL), TAG(FIELDSET),
|
2941
2941
|
TAG(FIGCAPTION), TAG(FIGURE), TAG(FOOTER), TAG(HEADER), TAG(HGROUP),
|
2942
2942
|
TAG(MAIN), TAG(MENU), TAG(NAV), TAG(OL), TAG(P), TAG(SECTION),
|
2943
|
-
TAG(SUMMARY), TAG(UL)
|
2943
|
+
TAG(SUMMARY), TAG(UL), TAG(SEARCH)
|
2944
2944
|
})
|
2945
2945
|
) {
|
2946
2946
|
maybe_implicitly_close_p_tag(parser, token);
|
@@ -3018,7 +3018,7 @@ static void handle_in_body(GumboParser* parser, GumboToken* token) {
|
|
3018
3018
|
TAG(CENTER), TAG(DETAILS), TAG(DIALOG), TAG(DIR), TAG(DIV), TAG(DL),
|
3019
3019
|
TAG(FIELDSET), TAG(FIGCAPTION), TAG(FIGURE), TAG(FOOTER), TAG(HEADER),
|
3020
3020
|
TAG(HGROUP), TAG(LISTING), TAG(MAIN), TAG(MENU), TAG(NAV), TAG(OL),
|
3021
|
-
TAG(PRE), TAG(SECTION), TAG(SUMMARY), TAG(UL)
|
3021
|
+
TAG(PRE), TAG(SECTION), TAG(SUMMARY), TAG(UL), TAG(SEARCH)
|
3022
3022
|
})
|
3023
3023
|
) {
|
3024
3024
|
GumboTag tag = token->v.end_tag.tag;
|
@@ -3057,6 +3057,9 @@ static void handle_in_body(GumboParser* parser, GumboToken* token) {
|
|
3057
3057
|
ignore_token(parser);
|
3058
3058
|
return;
|
3059
3059
|
}
|
3060
|
+
// Since we remove the form node without popping, we need to make sure
|
3061
|
+
// that we flush any text nodes at the end of the form.
|
3062
|
+
maybe_flush_text_node_buffer(parser);
|
3060
3063
|
// This differs from implicitly_close_tags because we remove *only* the
|
3061
3064
|
// <form> element; other nodes are left in scope.
|
3062
3065
|
generate_implied_end_tags(parser, GUMBO_TAG_LAST, NULL);
|
@@ -3434,7 +3437,7 @@ static void handle_in_table(GumboParser* parser, GumboToken* token) {
|
|
3434
3437
|
|| token->type == GUMBO_TOKEN_WHITESPACE
|
3435
3438
|
|| token->type == GUMBO_TOKEN_NULL)
|
3436
3439
|
&& node_tag_in_set(get_current_node(parser), &(const TagSet) {
|
3437
|
-
TAG(TABLE), TAG(TBODY), TAG(TFOOT), TAG(THEAD), TAG(TR)
|
3440
|
+
TAG(TABLE), TAG(TBODY), TAG(TEMPLATE), TAG(TFOOT), TAG(THEAD), TAG(TR)
|
3438
3441
|
})
|
3439
3442
|
) {
|
3440
3443
|
// The "pending table character tokens" list described in the spec is
|
data/gumbo-parser/src/svg_attrs.c
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
/* ANSI-C code produced by gperf version 3.1 */
|
2
|
-
/* Command-line: gperf -m100
|
2
|
+
/* Command-line: gperf -m100 src/svg_attrs.gperf */
|
3
3
|
/* Computed positions: -k'1,10,$' */
|
4
|
-
/* Filtered by:
|
4
|
+
/* Filtered by: gperf-filter.sed */
|
5
5
|
|
6
6
|
#include "replacement.h"
|
7
7
|
#include "macros.h"
|
data/gumbo-parser/src/svg_tags.c
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
/* ANSI-C code produced by gperf version 3.1 */
|
2
|
-
/* Command-line: gperf -m100
|
2
|
+
/* Command-line: gperf -m100 src/svg_tags.gperf */
|
3
3
|
/* Computed positions: -k'3,7' */
|
4
|
-
/* Filtered by:
|
4
|
+
/* Filtered by: gperf-filter.sed */
|
5
5
|
|
6
6
|
#include "replacement.h"
|
7
7
|
#include "macros.h"
|
data/gumbo-parser/src/tag.c
CHANGED
@@ -14,7 +14,7 @@
|
|
14
14
|
limitations under the License.
|
15
15
|
*/
|
16
16
|
|
17
|
-
#include "gumbo.h"
|
17
|
+
#include "nokogiri_gumbo.h"
|
18
18
|
#include "util.h"
|
19
19
|
#include "tag_lookup.h"
|
20
20
|
|
@@ -172,6 +172,7 @@ static const char kGumboTagNames[GUMBO_TAG_LAST+1][15] = {
|
|
172
172
|
[GUMBO_TAG_TT] = "tt",
|
173
173
|
[GUMBO_TAG_RTC] = "rtc",
|
174
174
|
[GUMBO_TAG_DIALOG] = "dialog",
|
175
|
+
[GUMBO_TAG_SEARCH] = "search",
|
175
176
|
|
176
177
|
[GUMBO_TAG_UNKNOWN] = "",
|
177
178
|
[GUMBO_TAG_LAST] = "",
|
data/gumbo-parser/src/tag_lookup.c
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
/* ANSI-C code produced by gperf version 3.1 */
|
2
|
-
/* Command-line: gperf -m100
|
2
|
+
/* Command-line: gperf -m100 src/tag_lookup.gperf */
|
3
3
|
/* Computed positions: -k'1-2,$' */
|
4
|
-
/* Filtered by:
|
4
|
+
/* Filtered by: gperf-filter.sed */
|
5
5
|
|
6
6
|
#include "tag_lookup.h"
|
7
7
|
#include "macros.h"
|
8
8
|
#include "ascii.h"
|
9
9
|
#include <string.h>
|
10
10
|
|
11
|
-
#define TOTAL_KEYWORDS 150
|
11
|
+
#define TOTAL_KEYWORDS 151
|
12
12
|
#define MIN_WORD_LENGTH 1
|
13
13
|
#define MAX_WORD_LENGTH 14
|
14
14
|
#define MIN_HASH_VALUE 9
|
@@ -26,7 +26,7 @@ hash (register const char *str, register size_t len)
|
|
26
26
|
272, 272, 272, 272, 272, 272, 272, 272, 272, 272,
|
27
27
|
272, 272, 272, 272, 272, 272, 272, 272, 272, 272,
|
28
28
|
272, 272, 272, 272, 272, 272, 272, 272, 272, 272,
|
29
|
-
272, 272, 272, 272, 272, 272, 272, 272, 272,
|
29
|
+
272, 272, 272, 272, 272, 272, 272, 272, 272, 11,
|
30
30
|
7, 6, 4, 4, 3, 4, 3, 3, 272, 272,
|
31
31
|
272, 272, 272, 272, 272, 70, 83, 152, 7, 16,
|
32
32
|
61, 98, 5, 76, 102, 126, 12, 19, 54, 54,
|
@@ -69,7 +69,7 @@ gumbo_tag_lookup (register const char *str, register size_t len)
|
|
69
69
|
static const unsigned char lengthtable[] =
|
70
70
|
{
|
71
71
|
0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 2,
|
72
|
-
2, 2, 2, 6, 2, 6,
|
72
|
+
2, 2, 2, 6, 2, 6, 6, 4, 2, 7, 6, 3, 0, 3,
|
73
73
|
0, 6, 6, 8, 5, 0, 0, 4, 5, 5, 8, 0, 2, 4,
|
74
74
|
5, 2, 0, 5, 4, 2, 0, 7, 0, 8, 5, 0, 0, 0,
|
75
75
|
0, 0, 0, 5, 3, 4, 5, 1, 4, 0, 4, 1, 2, 8,
|
@@ -111,9 +111,9 @@ gumbo_tag_lookup (register const char *str, register size_t len)
|
|
111
111
|
{"spacer", GUMBO_TAG_SPACER},
|
112
112
|
{"h2", GUMBO_TAG_H2},
|
113
113
|
{"header", GUMBO_TAG_HEADER},
|
114
|
-
{"
|
114
|
+
{"search", GUMBO_TAG_SEARCH},
|
115
115
|
{"head", GUMBO_TAG_HEAD},
|
116
|
-
{
|
116
|
+
{"h1", GUMBO_TAG_H1},
|
117
117
|
{"details", GUMBO_TAG_DETAILS},
|
118
118
|
{"select", GUMBO_TAG_SELECT},
|
119
119
|
{"dir", GUMBO_TAG_DIR},
|
data/gumbo-parser/src/utf8.c
CHANGED
data/gumbo-parser/src/utf8.h
CHANGED
data/gumbo-parser/src/util.c
CHANGED
@@ -19,7 +19,7 @@
|
|
19
19
|
#include <stdlib.h>
|
20
20
|
#include <string.h>
|
21
21
|
#include "util.h"
|
22
|
-
#include "gumbo.h"
|
22
|
+
#include "nokogiri_gumbo.h"
|
23
23
|
|
24
24
|
void* gumbo_alloc(size_t size) {
|
25
25
|
void* ptr = malloc(size);
|
@@ -63,6 +63,4 @@ void gumbo_debug(const char* format, ...) {
|
|
63
63
|
va_end(args);
|
64
64
|
fflush(stdout);
|
65
65
|
}
|
66
|
-
#else
|
67
|
-
void gumbo_debug(const char* UNUSED_ARG(format), ...) {}
|
68
66
|
#endif
|
data/gumbo-parser/src/util.h
CHANGED
@@ -21,7 +21,11 @@ void* gumbo_realloc(void* ptr, size_t size) RETURNS_NONNULL;
|
|
21
21
|
void gumbo_free(void* ptr);
|
22
22
|
|
23
23
|
// Debug wrapper for printf
|
24
|
+
#ifdef GUMBO_DEBUG
|
24
25
|
void gumbo_debug(const char* format, ...) PRINTF(1);
|
26
|
+
#else
|
27
|
+
static inline void PRINTF(1) gumbo_debug(const char* UNUSED_ARG(format), ...) {};
|
28
|
+
#endif
|
25
29
|
|
26
30
|
#ifdef __cplusplus
|
27
31
|
}
|
data/gumbo-parser/src/vector.h
CHANGED
data/lib/nokogiri/css/node.rb
CHANGED
@@ -278,7 +278,9 @@ module Nokogiri
|
|
278
278
|
end
|
279
279
|
|
280
280
|
def nth(node, options = {})
|
281
|
-
|
281
|
+
unless node.value.size == 4
|
282
|
+
raise(ArgumentError, "expected an+b node to contain 4 tokens, but is #{node.value.inspect}")
|
283
|
+
end
|
282
284
|
|
283
285
|
a, b = read_a_and_positive_b(node.value)
|
284
286
|
position = if options[:child]
|
data/lib/nokogiri/css.rb
CHANGED
@@ -40,9 +40,15 @@ module Nokogiri
|
|
40
40
|
# 💡 Note that translated queries are cached for performance concerns.
|
41
41
|
#
|
42
42
|
def xpath_for(selector, options = {})
|
43
|
+
raise TypeError, "no implicit conversion of #{selector.inspect} to String" unless selector.respond_to?(:to_str)
|
44
|
+
|
45
|
+
selector = selector.to_str
|
46
|
+
raise Nokogiri::CSS::SyntaxError, "empty CSS selector" if selector.empty?
|
47
|
+
|
43
48
|
prefix = options.fetch(:prefix, Nokogiri::XML::XPath::GLOBAL_SEARCH_PREFIX)
|
44
49
|
visitor = options.fetch(:visitor) { Nokogiri::CSS::XPathVisitor.new }
|
45
50
|
ns = options.fetch(:ns, {})
|
51
|
+
|
46
52
|
Parser.new(ns).xpath_for(selector, prefix, visitor)
|
47
53
|
end
|
48
54
|
end
|
@@ -0,0 +1,57 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
# frozen_string_literal: true
|
3
|
+
|
4
|
+
module Nokogiri
|
5
|
+
class EncodingHandler
|
6
|
+
# Popular encoding aliases not known by all iconv implementations that Nokogiri should support.
|
7
|
+
USEFUL_ALIASES = {
|
8
|
+
# alias_name => true_name
|
9
|
+
"NOKOGIRI-SENTINEL" => "UTF-8", # indicating the Nokogiri has installed aliases
|
10
|
+
"Windows-31J" => "CP932", # Windows-31J is the IANA registered name of CP932.
|
11
|
+
"UTF-8" => "UTF-8", # for JRuby tests, this is a no-op in CRuby
|
12
|
+
}
|
13
|
+
|
14
|
+
class << self
|
15
|
+
def install_default_aliases
|
16
|
+
USEFUL_ALIASES.each do |alias_name, name|
|
17
|
+
EncodingHandler.alias(name, alias_name) if EncodingHandler[alias_name].nil?
|
18
|
+
end
|
19
|
+
end
|
20
|
+
end
|
21
|
+
|
22
|
+
# :stopdoc:
|
23
|
+
if Nokogiri.jruby?
|
24
|
+
class << self
|
25
|
+
def [](name)
|
26
|
+
storage.key?(name) ? new(storage[name]) : nil
|
27
|
+
end
|
28
|
+
|
29
|
+
def alias(name, alias_name)
|
30
|
+
storage[alias_name] = name
|
31
|
+
end
|
32
|
+
|
33
|
+
def delete(name)
|
34
|
+
storage.delete(name)
|
35
|
+
end
|
36
|
+
|
37
|
+
def clear_aliases!
|
38
|
+
storage.clear
|
39
|
+
end
|
40
|
+
|
41
|
+
private
|
42
|
+
|
43
|
+
def storage
|
44
|
+
@storage ||= {}
|
45
|
+
end
|
46
|
+
end
|
47
|
+
|
48
|
+
def initialize(name)
|
49
|
+
@name = name
|
50
|
+
end
|
51
|
+
|
52
|
+
attr_reader :name
|
53
|
+
end
|
54
|
+
end
|
55
|
+
end
|
56
|
+
|
57
|
+
Nokogiri::EncodingHandler.install_default_aliases
|
data/lib/nokogiri/extension.rb
CHANGED
@@ -3,13 +3,14 @@
|
|
3
3
|
# load the C or Java extension
|
4
4
|
begin
|
5
5
|
# native precompiled gems package shared libraries in <gem_dir>/lib/nokogiri/<ruby_version>
|
6
|
-
|
6
|
+
RUBY_VERSION =~ /(\d+\.\d+)/
|
7
7
|
require_relative "#{Regexp.last_match(1)}/nokogiri"
|
8
8
|
rescue LoadError => e
|
9
9
|
if /GLIBC/.match?(e.message)
|
10
10
|
warn(<<~EOM)
|
11
11
|
|
12
|
-
ERROR: It looks like you're trying to use Nokogiri as a precompiled native gem on a system
|
12
|
+
ERROR: It looks like you're trying to use Nokogiri as a precompiled native gem on a system
|
13
|
+
with an unsupported version of glibc.
|
13
14
|
|
14
15
|
#{e.message}
|
15
16
|
|
@@ -176,7 +176,7 @@ module Nokogiri
|
|
176
176
|
url ||= string_or_io.respond_to?(:path) ? string_or_io.path : nil
|
177
177
|
|
178
178
|
if string_or_io.respond_to?(:encoding)
|
179
|
-
unless string_or_io.encoding
|
179
|
+
unless string_or_io.encoding == Encoding::ASCII_8BIT
|
180
180
|
encoding ||= string_or_io.encoding.name
|
181
181
|
end
|
182
182
|
end
|
@@ -189,21 +189,10 @@ module Nokogiri
|
|
189
189
|
end
|
190
190
|
|
191
191
|
unless encoding
|
192
|
-
# Libxml2's parser has poor support for encoding
|
193
|
-
# detection. First, it does not recognize the HTML5
|
194
|
-
# style meta charset declaration. Secondly, even if it
|
195
|
-
# successfully detects an encoding hint, it does not
|
196
|
-
# re-decode or re-parse the preceding part which may be
|
197
|
-
# garbled.
|
198
|
-
#
|
199
|
-
# EncodingReader aims to perform advanced encoding
|
200
|
-
# detection beyond what Libxml2 does, and to emulate
|
201
|
-
# rewinding of a stream and make Libxml2 redo parsing
|
202
|
-
# from the start when an encoding hint is found.
|
203
192
|
string_or_io = EncodingReader.new(string_or_io)
|
204
193
|
begin
|
205
194
|
return read_io(string_or_io, url, encoding, options.to_i)
|
206
|
-
rescue EncodingFound => e
|
195
|
+
rescue EncodingReader::EncodingFound => e
|
207
196
|
encoding = e.found_encoding
|
208
197
|
end
|
209
198
|
end
|
@@ -220,114 +209,6 @@ module Nokogiri
|
|
220
209
|
read_memory(string_or_io, url, encoding, options.to_i)
|
221
210
|
end
|
222
211
|
end
|
223
|
-
|
224
|
-
class EncodingFound < StandardError # :nodoc: all
|
225
|
-
attr_reader :found_encoding
|
226
|
-
|
227
|
-
def initialize(encoding)
|
228
|
-
@found_encoding = encoding
|
229
|
-
super(format("encoding found: %s", encoding))
|
230
|
-
end
|
231
|
-
end
|
232
|
-
|
233
|
-
# :nodoc: all
|
234
|
-
class EncodingReader
|
235
|
-
class SAXHandler < Nokogiri::XML::SAX::Document
|
236
|
-
attr_reader :encoding
|
237
|
-
|
238
|
-
def initialize
|
239
|
-
@encoding = nil
|
240
|
-
super()
|
241
|
-
end
|
242
|
-
|
243
|
-
def start_element(name, attrs = [])
|
244
|
-
return unless name == "meta"
|
245
|
-
|
246
|
-
attr = Hash[attrs]
|
247
|
-
(charset = attr["charset"]) &&
|
248
|
-
(@encoding = charset)
|
249
|
-
(http_equiv = attr["http-equiv"]) &&
|
250
|
-
http_equiv.match(/\AContent-Type\z/i) &&
|
251
|
-
(content = attr["content"]) &&
|
252
|
-
(m = content.match(/;\s*charset\s*=\s*([\w-]+)/)) &&
|
253
|
-
(@encoding = m[1])
|
254
|
-
end
|
255
|
-
end
|
256
|
-
|
257
|
-
class JumpSAXHandler < SAXHandler
|
258
|
-
def initialize(jumptag)
|
259
|
-
@jumptag = jumptag
|
260
|
-
super()
|
261
|
-
end
|
262
|
-
|
263
|
-
def start_element(name, attrs = [])
|
264
|
-
super
|
265
|
-
throw(@jumptag, @encoding) if @encoding
|
266
|
-
throw(@jumptag, nil) if /\A(?:div|h1|img|p|br)\z/.match?(name)
|
267
|
-
end
|
268
|
-
end
|
269
|
-
|
270
|
-
def self.detect_encoding(chunk)
|
271
|
-
(m = chunk.match(/\A(<\?xml[ \t\r\n][^>]*>)/)) &&
|
272
|
-
(return Nokogiri.XML(m[1]).encoding)
|
273
|
-
|
274
|
-
if Nokogiri.jruby?
|
275
|
-
(m = chunk.match(/(<meta\s)(.*)(charset\s*=\s*([\w-]+))(.*)/i)) &&
|
276
|
-
(return m[4])
|
277
|
-
catch(:encoding_found) do
|
278
|
-
Nokogiri::HTML4::SAX::Parser.new(JumpSAXHandler.new(:encoding_found)).parse(chunk)
|
279
|
-
nil
|
280
|
-
end
|
281
|
-
else
|
282
|
-
handler = SAXHandler.new
|
283
|
-
parser = Nokogiri::HTML4::SAX::PushParser.new(handler)
|
284
|
-
begin
|
285
|
-
parser << chunk
|
286
|
-
rescue
|
287
|
-
Nokogiri::SyntaxError
|
288
|
-
end
|
289
|
-
handler.encoding
|
290
|
-
end
|
291
|
-
end
|
292
|
-
|
293
|
-
def initialize(io)
|
294
|
-
@io = io
|
295
|
-
@firstchunk = nil
|
296
|
-
@encoding_found = nil
|
297
|
-
end
|
298
|
-
|
299
|
-
# This method is used by the C extension so that
|
300
|
-
# Nokogiri::HTML4::Document#read_io() does not leak memory when
|
301
|
-
# EncodingFound is raised.
|
302
|
-
attr_reader :encoding_found
|
303
|
-
|
304
|
-
def read(len)
|
305
|
-
# no support for a call without len
|
306
|
-
|
307
|
-
unless @firstchunk
|
308
|
-
(@firstchunk = @io.read(len)) || (return nil)
|
309
|
-
|
310
|
-
# This implementation expects that the first call from
|
311
|
-
# htmlReadIO() is made with a length long enough (~1KB) to
|
312
|
-
# achieve advanced encoding detection.
|
313
|
-
if (encoding = EncodingReader.detect_encoding(@firstchunk))
|
314
|
-
# The first chunk is stored for the next read in retry.
|
315
|
-
raise @encoding_found = EncodingFound.new(encoding)
|
316
|
-
end
|
317
|
-
end
|
318
|
-
@encoding_found = nil
|
319
|
-
|
320
|
-
ret = @firstchunk.slice!(0, len)
|
321
|
-
if (len -= ret.length) > 0
|
322
|
-
(rest = @io.read(len)) && ret << (rest)
|
323
|
-
end
|
324
|
-
if ret.empty?
|
325
|
-
nil
|
326
|
-
else
|
327
|
-
ret
|
328
|
-
end
|
329
|
-
end
|
330
|
-
end
|
331
212
|
end
|
332
213
|
end
|
333
214
|
end
|
@@ -25,43 +25,37 @@ module Nokogiri
|
|
25
25
|
|
26
26
|
unless method_defined?(:implied_start_tag?)
|
27
27
|
def implied_start_tag?
|
28
|
-
|
29
|
-
d ? d.startTag : nil
|
28
|
+
default_desc&.startTag
|
30
29
|
end
|
31
30
|
end
|
32
31
|
|
33
32
|
unless method_defined?(:implied_end_tag?)
|
34
33
|
def implied_end_tag?
|
35
|
-
|
36
|
-
d ? d.endTag : nil
|
34
|
+
default_desc&.endTag
|
37
35
|
end
|
38
36
|
end
|
39
37
|
|
40
38
|
unless method_defined?(:save_end_tag?)
|
41
39
|
def save_end_tag?
|
42
|
-
|
43
|
-
d ? d.saveEndTag : nil
|
40
|
+
default_desc&.saveEndTag
|
44
41
|
end
|
45
42
|
end
|
46
43
|
|
47
44
|
unless method_defined?(:deprecated?)
|
48
45
|
def deprecated?
|
49
|
-
|
50
|
-
d ? d.depr : nil
|
46
|
+
default_desc&.depr
|
51
47
|
end
|
52
48
|
end
|
53
49
|
|
54
50
|
unless method_defined?(:description)
|
55
51
|
def description
|
56
|
-
|
57
|
-
d ? d.desc : nil
|
52
|
+
default_desc&.desc
|
58
53
|
end
|
59
54
|
end
|
60
55
|
|
61
56
|
unless method_defined?(:default_sub_element)
|
62
57
|
def default_sub_element
|
63
|
-
|
64
|
-
d ? d.defaultsubelt : nil
|
58
|
+
default_desc&.defaultsubelt
|
65
59
|
end
|
66
60
|
end
|
67
61
|
|