nokogiri 1.10.10 → 1.13.9
Sign up to get free protection for your applications and to get access to all the features.
Potentially problematic release.
This version of nokogiri might be problematic. Click here for more details.
- checksums.yaml +4 -4
- data/Gemfile +5 -0
- data/LICENSE-DEPENDENCIES.md +1173 -884
- data/LICENSE.md +1 -1
- data/README.md +178 -96
- data/bin/nokogiri +63 -50
- data/dependencies.yml +13 -64
- data/ext/nokogiri/depend +38 -358
- data/ext/nokogiri/extconf.rb +761 -424
- data/ext/nokogiri/gumbo.c +584 -0
- data/ext/nokogiri/html4_document.c +166 -0
- data/ext/nokogiri/html4_element_description.c +294 -0
- data/ext/nokogiri/html4_entity_lookup.c +37 -0
- data/ext/nokogiri/html4_sax_parser_context.c +119 -0
- data/ext/nokogiri/html4_sax_push_parser.c +95 -0
- data/ext/nokogiri/libxml2_backwards_compat.c +121 -0
- data/ext/nokogiri/nokogiri.c +228 -91
- data/ext/nokogiri/nokogiri.h +199 -88
- data/ext/nokogiri/test_global_handlers.c +40 -0
- data/ext/nokogiri/xml_attr.c +17 -17
- data/ext/nokogiri/xml_attribute_decl.c +21 -21
- data/ext/nokogiri/xml_cdata.c +14 -19
- data/ext/nokogiri/xml_comment.c +19 -26
- data/ext/nokogiri/xml_document.c +296 -220
- data/ext/nokogiri/xml_document_fragment.c +12 -16
- data/ext/nokogiri/xml_dtd.c +64 -58
- data/ext/nokogiri/xml_element_content.c +31 -26
- data/ext/nokogiri/xml_element_decl.c +25 -25
- data/ext/nokogiri/xml_encoding_handler.c +43 -18
- data/ext/nokogiri/xml_entity_decl.c +37 -35
- data/ext/nokogiri/xml_entity_reference.c +16 -18
- data/ext/nokogiri/xml_namespace.c +98 -53
- data/ext/nokogiri/xml_node.c +1065 -653
- data/ext/nokogiri/xml_node_set.c +178 -166
- data/ext/nokogiri/xml_processing_instruction.c +17 -19
- data/ext/nokogiri/xml_reader.c +277 -175
- data/ext/nokogiri/xml_relax_ng.c +52 -28
- data/ext/nokogiri/xml_sax_parser.c +112 -112
- data/ext/nokogiri/xml_sax_parser_context.c +112 -86
- data/ext/nokogiri/xml_sax_push_parser.c +36 -27
- data/ext/nokogiri/xml_schema.c +98 -48
- data/ext/nokogiri/xml_syntax_error.c +42 -21
- data/ext/nokogiri/xml_text.c +14 -18
- data/ext/nokogiri/xml_xpath_context.c +226 -115
- data/ext/nokogiri/xslt_stylesheet.c +265 -173
- data/gumbo-parser/CHANGES.md +63 -0
- data/gumbo-parser/Makefile +101 -0
- data/gumbo-parser/THANKS +27 -0
- data/gumbo-parser/src/Makefile +34 -0
- data/gumbo-parser/src/README.md +41 -0
- data/gumbo-parser/src/ascii.c +75 -0
- data/gumbo-parser/src/ascii.h +115 -0
- data/gumbo-parser/src/attribute.c +42 -0
- data/gumbo-parser/src/attribute.h +17 -0
- data/gumbo-parser/src/char_ref.c +22225 -0
- data/gumbo-parser/src/char_ref.h +29 -0
- data/gumbo-parser/src/char_ref.rl +2154 -0
- data/gumbo-parser/src/error.c +626 -0
- data/gumbo-parser/src/error.h +148 -0
- data/gumbo-parser/src/foreign_attrs.c +104 -0
- data/gumbo-parser/src/foreign_attrs.gperf +27 -0
- data/gumbo-parser/src/gumbo.h +943 -0
- data/gumbo-parser/src/insertion_mode.h +33 -0
- data/gumbo-parser/src/macros.h +91 -0
- data/gumbo-parser/src/parser.c +4875 -0
- data/gumbo-parser/src/parser.h +41 -0
- data/gumbo-parser/src/replacement.h +33 -0
- data/gumbo-parser/src/string_buffer.c +103 -0
- data/gumbo-parser/src/string_buffer.h +68 -0
- data/gumbo-parser/src/string_piece.c +48 -0
- data/gumbo-parser/src/svg_attrs.c +174 -0
- data/gumbo-parser/src/svg_attrs.gperf +77 -0
- data/gumbo-parser/src/svg_tags.c +137 -0
- data/gumbo-parser/src/svg_tags.gperf +55 -0
- data/gumbo-parser/src/tag.c +222 -0
- data/gumbo-parser/src/tag_lookup.c +382 -0
- data/gumbo-parser/src/tag_lookup.gperf +169 -0
- data/gumbo-parser/src/tag_lookup.h +13 -0
- data/gumbo-parser/src/token_buffer.c +79 -0
- data/gumbo-parser/src/token_buffer.h +71 -0
- data/gumbo-parser/src/token_type.h +17 -0
- data/gumbo-parser/src/tokenizer.c +3463 -0
- data/gumbo-parser/src/tokenizer.h +112 -0
- data/gumbo-parser/src/tokenizer_states.h +339 -0
- data/gumbo-parser/src/utf8.c +245 -0
- data/gumbo-parser/src/utf8.h +164 -0
- data/gumbo-parser/src/util.c +68 -0
- data/gumbo-parser/src/util.h +30 -0
- data/gumbo-parser/src/vector.c +111 -0
- data/gumbo-parser/src/vector.h +45 -0
- data/lib/nokogiri/class_resolver.rb +67 -0
- data/lib/nokogiri/css/node.rb +10 -8
- data/lib/nokogiri/css/parser.rb +397 -377
- data/lib/nokogiri/css/parser.y +250 -245
- data/lib/nokogiri/css/parser_extras.rb +54 -49
- data/lib/nokogiri/css/syntax_error.rb +3 -1
- data/lib/nokogiri/css/tokenizer.rb +5 -3
- data/lib/nokogiri/css/tokenizer.rex +3 -2
- data/lib/nokogiri/css/xpath_visitor.rb +218 -91
- data/lib/nokogiri/css.rb +50 -17
- data/lib/nokogiri/decorators/slop.rb +9 -7
- data/lib/nokogiri/extension.rb +31 -0
- data/lib/nokogiri/gumbo.rb +15 -0
- data/lib/nokogiri/html.rb +38 -27
- data/lib/nokogiri/{html → html4}/builder.rb +4 -2
- data/lib/nokogiri/{html → html4}/document.rb +103 -105
- data/lib/nokogiri/html4/document_fragment.rb +54 -0
- data/lib/nokogiri/{html → html4}/element_description.rb +3 -1
- data/lib/nokogiri/html4/element_description_defaults.rb +578 -0
- data/lib/nokogiri/{html → html4}/entity_lookup.rb +4 -2
- data/lib/nokogiri/{html → html4}/sax/parser.rb +17 -16
- data/lib/nokogiri/html4/sax/parser_context.rb +20 -0
- data/lib/nokogiri/{html → html4}/sax/push_parser.rb +12 -11
- data/lib/nokogiri/html4.rb +46 -0
- data/lib/nokogiri/html5/document.rb +91 -0
- data/lib/nokogiri/html5/document_fragment.rb +83 -0
- data/lib/nokogiri/html5/node.rb +100 -0
- data/lib/nokogiri/html5.rb +478 -0
- data/lib/nokogiri/jruby/dependencies.rb +21 -0
- data/lib/nokogiri/syntax_error.rb +2 -0
- data/lib/nokogiri/version/constant.rb +6 -0
- data/lib/nokogiri/version/info.rb +222 -0
- data/lib/nokogiri/version.rb +3 -108
- data/lib/nokogiri/xml/attr.rb +6 -3
- data/lib/nokogiri/xml/attribute_decl.rb +3 -1
- data/lib/nokogiri/xml/builder.rb +74 -33
- data/lib/nokogiri/xml/cdata.rb +3 -1
- data/lib/nokogiri/xml/character_data.rb +2 -0
- data/lib/nokogiri/xml/document.rb +224 -86
- data/lib/nokogiri/xml/document_fragment.rb +46 -44
- data/lib/nokogiri/xml/dtd.rb +4 -2
- data/lib/nokogiri/xml/element_content.rb +2 -0
- data/lib/nokogiri/xml/element_decl.rb +3 -1
- data/lib/nokogiri/xml/entity_decl.rb +4 -2
- data/lib/nokogiri/xml/entity_reference.rb +2 -0
- data/lib/nokogiri/xml/namespace.rb +3 -0
- data/lib/nokogiri/xml/node/save_options.rb +10 -5
- data/lib/nokogiri/xml/node.rb +884 -378
- data/lib/nokogiri/xml/node_set.rb +51 -54
- data/lib/nokogiri/xml/notation.rb +13 -0
- data/lib/nokogiri/xml/parse_options.rb +22 -8
- data/lib/nokogiri/xml/pp/character_data.rb +9 -6
- data/lib/nokogiri/xml/pp/node.rb +25 -26
- data/lib/nokogiri/xml/pp.rb +4 -2
- data/lib/nokogiri/xml/processing_instruction.rb +3 -1
- data/lib/nokogiri/xml/reader.rb +21 -28
- data/lib/nokogiri/xml/relax_ng.rb +8 -2
- data/lib/nokogiri/xml/sax/document.rb +45 -49
- data/lib/nokogiri/xml/sax/parser.rb +38 -34
- data/lib/nokogiri/xml/sax/parser_context.rb +8 -3
- data/lib/nokogiri/xml/sax/push_parser.rb +6 -5
- data/lib/nokogiri/xml/sax.rb +6 -4
- data/lib/nokogiri/xml/schema.rb +19 -9
- data/lib/nokogiri/xml/searchable.rb +112 -72
- data/lib/nokogiri/xml/syntax_error.rb +6 -4
- data/lib/nokogiri/xml/text.rb +2 -0
- data/lib/nokogiri/xml/xpath/syntax_error.rb +4 -2
- data/lib/nokogiri/xml/xpath.rb +15 -4
- data/lib/nokogiri/xml/xpath_context.rb +3 -3
- data/lib/nokogiri/xml.rb +38 -37
- data/lib/nokogiri/xslt/stylesheet.rb +3 -1
- data/lib/nokogiri/xslt.rb +29 -20
- data/lib/nokogiri.rb +49 -65
- data/lib/xsd/xmlparser/nokogiri.rb +26 -24
- data/patches/libxml2/{0002-Remove-script-macro-support.patch → 0001-Remove-script-macro-support.patch} +0 -0
- data/patches/libxml2/{0003-Update-entities-to-remove-handling-of-ssi.patch → 0002-Update-entities-to-remove-handling-of-ssi.patch} +0 -0
- data/patches/libxml2/{0004-libxml2.la-is-in-top_builddir.patch → 0003-libxml2.la-is-in-top_builddir.patch} +1 -1
- data/patches/libxml2/0005-avoid-isnan-isinf.patch +81 -0
- data/patches/libxml2/0009-allow-wildcard-namespaces.patch +77 -0
- data/patches/libxslt/0001-update-automake-files-for-arm64.patch +3037 -0
- data/ports/archives/libxml2-2.10.3.tar.xz +0 -0
- data/ports/archives/libxslt-1.1.37.tar.xz +0 -0
- metadata +189 -142
- data/ext/nokogiri/html_document.c +0 -170
- data/ext/nokogiri/html_document.h +0 -10
- data/ext/nokogiri/html_element_description.c +0 -279
- data/ext/nokogiri/html_element_description.h +0 -10
- data/ext/nokogiri/html_entity_lookup.c +0 -32
- data/ext/nokogiri/html_entity_lookup.h +0 -8
- data/ext/nokogiri/html_sax_parser_context.c +0 -116
- data/ext/nokogiri/html_sax_parser_context.h +0 -11
- data/ext/nokogiri/html_sax_push_parser.c +0 -87
- data/ext/nokogiri/html_sax_push_parser.h +0 -9
- data/ext/nokogiri/xml_attr.h +0 -9
- data/ext/nokogiri/xml_attribute_decl.h +0 -9
- data/ext/nokogiri/xml_cdata.h +0 -9
- data/ext/nokogiri/xml_comment.h +0 -9
- data/ext/nokogiri/xml_document.h +0 -23
- data/ext/nokogiri/xml_document_fragment.h +0 -10
- data/ext/nokogiri/xml_dtd.h +0 -10
- data/ext/nokogiri/xml_element_content.h +0 -10
- data/ext/nokogiri/xml_element_decl.h +0 -9
- data/ext/nokogiri/xml_encoding_handler.h +0 -8
- data/ext/nokogiri/xml_entity_decl.h +0 -10
- data/ext/nokogiri/xml_entity_reference.h +0 -9
- data/ext/nokogiri/xml_io.c +0 -61
- data/ext/nokogiri/xml_io.h +0 -11
- data/ext/nokogiri/xml_libxml2_hacks.c +0 -112
- data/ext/nokogiri/xml_libxml2_hacks.h +0 -12
- data/ext/nokogiri/xml_namespace.h +0 -14
- data/ext/nokogiri/xml_node.h +0 -13
- data/ext/nokogiri/xml_node_set.h +0 -12
- data/ext/nokogiri/xml_processing_instruction.h +0 -9
- data/ext/nokogiri/xml_reader.h +0 -10
- data/ext/nokogiri/xml_relax_ng.h +0 -9
- data/ext/nokogiri/xml_sax_parser.h +0 -39
- data/ext/nokogiri/xml_sax_parser_context.h +0 -10
- data/ext/nokogiri/xml_sax_push_parser.h +0 -9
- data/ext/nokogiri/xml_schema.h +0 -9
- data/ext/nokogiri/xml_syntax_error.h +0 -13
- data/ext/nokogiri/xml_text.h +0 -9
- data/ext/nokogiri/xml_xpath_context.h +0 -10
- data/ext/nokogiri/xslt_stylesheet.h +0 -14
- data/lib/nokogiri/html/document_fragment.rb +0 -49
- data/lib/nokogiri/html/element_description_defaults.rb +0 -671
- data/lib/nokogiri/html/sax/parser_context.rb +0 -16
- data/patches/libxml2/0001-Revert-Do-not-URI-escape-in-server-side-includes.patch +0 -78
- data/patches/libxml2/0005-Fix-infinite-loop-in-xmlStringLenDecodeEntities.patch +0 -32
- data/ports/archives/libxml2-2.9.10.tar.gz +0 -0
- data/ports/archives/libxslt-1.1.34.tar.gz +0 -0
data/lib/nokogiri/html.rb
CHANGED
@@ -1,37 +1,48 @@
|
|
1
|
-
|
2
|
-
|
3
|
-
|
4
|
-
|
5
|
-
require 'nokogiri/html/sax/parser'
|
6
|
-
require 'nokogiri/html/sax/push_parser'
|
7
|
-
require 'nokogiri/html/element_description'
|
8
|
-
require 'nokogiri/html/element_description_defaults'
|
1
|
+
# coding: utf-8
|
2
|
+
# frozen_string_literal: true
|
3
|
+
|
4
|
+
require_relative "html4"
|
9
5
|
|
10
6
|
module Nokogiri
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
7
|
+
# Alias for Nokogiri::HTML4
|
8
|
+
HTML = Nokogiri::HTML4
|
9
|
+
|
10
|
+
# :singleton-method: HTML
|
11
|
+
# :call-seq: HTML(input, url = nil, encoding = nil, options = XML::ParseOptions::DEFAULT_HTML, &block) → Nokogiri::HTML4::Document
|
12
|
+
#
|
13
|
+
# Parse HTML. Convenience method for Nokogiri::HTML4::Document.parse
|
18
14
|
|
15
|
+
# :nodoc:
|
16
|
+
define_singleton_method(:HTML, Nokogiri.method(:HTML4))
|
17
|
+
|
18
|
+
# 💡 This module/namespace is an alias for Nokogiri::HTML4 as of v1.12.0. Before v1.12.0,
|
19
|
+
# Nokogiri::HTML4 did not exist, and this was the module/namespace for all HTML-related
|
20
|
+
# classes.
|
19
21
|
module HTML
|
20
|
-
class
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
22
|
+
# 💡 This class is an alias for Nokogiri::HTML4::Document as of v1.12.0.
|
23
|
+
class Document < Nokogiri::XML::Document
|
24
|
+
end
|
25
|
+
|
26
|
+
# 💡 This class is an alias for Nokogiri::HTML4::DocumentFragment as of v1.12.0.
|
27
|
+
class DocumentFragment < Nokogiri::XML::DocumentFragment
|
28
|
+
end
|
29
|
+
|
30
|
+
# 💡 This class is an alias for Nokogiri::HTML4::Builder as of v1.12.0.
|
31
|
+
class Builder < Nokogiri::XML::Builder
|
32
|
+
end
|
33
|
+
|
34
|
+
module SAX
|
35
|
+
# 💡 This class is an alias for Nokogiri::HTML4::SAX::Parser as of v1.12.0.
|
36
|
+
class Parser < Nokogiri::XML::SAX::Parser
|
25
37
|
end
|
26
38
|
|
27
|
-
|
28
|
-
|
29
|
-
def fragment string, encoding = nil
|
30
|
-
HTML::DocumentFragment.parse string, encoding
|
39
|
+
# 💡 This class is an alias for Nokogiri::HTML4::SAX::ParserContext as of v1.12.0.
|
40
|
+
class ParserContext < Nokogiri::XML::SAX::ParserContext
|
31
41
|
end
|
32
|
-
end
|
33
42
|
|
34
|
-
|
35
|
-
|
43
|
+
# 💡 This class is an alias for Nokogiri::HTML4::SAX::PushParser as of v1.12.0.
|
44
|
+
class PushParser
|
45
|
+
end
|
46
|
+
end
|
36
47
|
end
|
37
48
|
end
|
@@ -1,5 +1,7 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
module Nokogiri
|
2
|
-
module
|
4
|
+
module HTML4
|
3
5
|
###
|
4
6
|
# Nokogiri HTML builder is used for building HTML documents. It is very
|
5
7
|
# similar to the Nokogiri::XML::Builder. In fact, you should go read the
|
@@ -11,7 +13,7 @@ module Nokogiri
|
|
11
13
|
# Create an HTML document with a body that has an onload attribute, and a
|
12
14
|
# span tag with a class of "bold" that has content of "Hello world".
|
13
15
|
#
|
14
|
-
# builder = Nokogiri::
|
16
|
+
# builder = Nokogiri::HTML4::Builder.new do |doc|
|
15
17
|
# doc.html {
|
16
18
|
# doc.body(:onload => 'some_func();') {
|
17
19
|
# doc.span.bold {
|
@@ -1,15 +1,19 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
# frozen_string_literal: true
|
3
|
+
|
4
|
+
require "pathname"
|
5
|
+
|
1
6
|
module Nokogiri
|
2
|
-
module
|
7
|
+
module HTML4
|
3
8
|
class Document < Nokogiri::XML::Document
|
4
9
|
###
|
5
10
|
# Get the meta tag encoding for this document. If there is no meta tag,
|
6
11
|
# then nil is returned.
|
7
12
|
def meta_encoding
|
8
|
-
|
9
|
-
when meta = at('//meta[@charset]')
|
13
|
+
if (meta = at_xpath("//meta[@charset]"))
|
10
14
|
meta[:charset]
|
11
|
-
|
12
|
-
meta[
|
15
|
+
elsif (meta = meta_content_type)
|
16
|
+
meta["content"][/charset\s*=\s*([\w-]+)/i, 1]
|
13
17
|
end
|
14
18
|
end
|
15
19
|
|
@@ -29,24 +33,22 @@ module Nokogiri
|
|
29
33
|
#
|
30
34
|
# Beware in CRuby, that libxml2 automatically inserts a meta tag
|
31
35
|
# into a head element.
|
32
|
-
def meta_encoding=
|
33
|
-
|
34
|
-
|
35
|
-
meta['content'] = 'text/html; charset=%s' % encoding
|
36
|
+
def meta_encoding=(encoding)
|
37
|
+
if (meta = meta_content_type)
|
38
|
+
meta["content"] = format("text/html; charset=%s", encoding)
|
36
39
|
encoding
|
37
|
-
|
38
|
-
meta[
|
40
|
+
elsif (meta = at_xpath("//meta[@charset]"))
|
41
|
+
meta["charset"] = encoding
|
39
42
|
else
|
40
|
-
meta = XML::Node.new(
|
41
|
-
if dtd = internal_subset
|
42
|
-
meta[
|
43
|
+
meta = XML::Node.new("meta", self)
|
44
|
+
if (dtd = internal_subset) && dtd.html5_dtd?
|
45
|
+
meta["charset"] = encoding
|
43
46
|
else
|
44
|
-
meta[
|
45
|
-
meta[
|
47
|
+
meta["http-equiv"] = "Content-Type"
|
48
|
+
meta["content"] = format("text/html; charset=%s", encoding)
|
46
49
|
end
|
47
50
|
|
48
|
-
|
49
|
-
when head = at('//head')
|
51
|
+
if (head = at_xpath("//head"))
|
50
52
|
head.prepend_child(meta)
|
51
53
|
else
|
52
54
|
set_metadata_element(meta)
|
@@ -56,9 +58,9 @@ module Nokogiri
|
|
56
58
|
end
|
57
59
|
|
58
60
|
def meta_content_type
|
59
|
-
xpath(
|
60
|
-
node[
|
61
|
-
|
61
|
+
xpath("//meta[@http-equiv and boolean(@content)]").find do |node|
|
62
|
+
node["http-equiv"] =~ /\AContent-Type\z/i
|
63
|
+
end
|
62
64
|
end
|
63
65
|
private :meta_content_type
|
64
66
|
|
@@ -66,7 +68,7 @@ module Nokogiri
|
|
66
68
|
# Get the title string of this document. Return nil if there is
|
67
69
|
# no title tag.
|
68
70
|
def title
|
69
|
-
title =
|
71
|
+
(title = at_xpath("//title")) && title.inner_text
|
70
72
|
end
|
71
73
|
|
72
74
|
###
|
@@ -82,52 +84,50 @@ module Nokogiri
|
|
82
84
|
# content element (typically <body>) if any.
|
83
85
|
def title=(text)
|
84
86
|
tnode = XML::Text.new(text, self)
|
85
|
-
if title =
|
87
|
+
if (title = at_xpath("//title"))
|
86
88
|
title.children = tnode
|
87
89
|
return text
|
88
90
|
end
|
89
91
|
|
90
|
-
title = XML::Node.new(
|
91
|
-
|
92
|
-
when head = at('//head')
|
92
|
+
title = XML::Node.new("title", self) << tnode
|
93
|
+
if (head = at_xpath("//head"))
|
93
94
|
head << title
|
94
|
-
|
95
|
+
elsif (meta = (at_xpath("//meta[@charset]") || meta_content_type))
|
95
96
|
# better put after charset declaration
|
96
97
|
meta.add_next_sibling(title)
|
97
98
|
else
|
98
99
|
set_metadata_element(title)
|
99
100
|
end
|
100
|
-
text
|
101
101
|
end
|
102
102
|
|
103
|
-
def set_metadata_element(element)
|
104
|
-
|
105
|
-
when head = at('//head')
|
103
|
+
def set_metadata_element(element) # rubocop:disable Naming/AccessorMethodName
|
104
|
+
if (head = at_xpath("//head"))
|
106
105
|
head << element
|
107
|
-
|
108
|
-
head = html.prepend_child(XML::Node.new(
|
106
|
+
elsif (html = at_xpath("//html"))
|
107
|
+
head = html.prepend_child(XML::Node.new("head", self))
|
109
108
|
head.prepend_child(element)
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
109
|
+
elsif (first = children.find do |node|
|
110
|
+
case node
|
111
|
+
when XML::Element, XML::Text
|
112
|
+
true
|
113
|
+
end
|
114
|
+
end)
|
116
115
|
# We reach here only if the underlying document model
|
117
116
|
# allows <html>/<head> elements to be omitted and does not
|
118
117
|
# automatically supply them.
|
119
118
|
first.add_previous_sibling(element)
|
120
119
|
else
|
121
|
-
html = add_child(XML::Node.new(
|
122
|
-
head = html.add_child(XML::Node.new(
|
120
|
+
html = add_child(XML::Node.new("html", self))
|
121
|
+
head = html.add_child(XML::Node.new("head", self))
|
123
122
|
head.prepend_child(element)
|
124
123
|
end
|
125
124
|
end
|
126
125
|
private :set_metadata_element
|
127
126
|
|
128
127
|
####
|
129
|
-
# Serialize Node using +options+.
|
130
|
-
#
|
128
|
+
# Serialize Node using +options+. Save options can also be set using a block.
|
129
|
+
#
|
130
|
+
# See also Nokogiri::XML::Node::SaveOptions and Node@Serialization+and+Generating+Output.
|
131
131
|
#
|
132
132
|
# These two statements are equivalent:
|
133
133
|
#
|
@@ -139,15 +139,25 @@ module Nokogiri
|
|
139
139
|
# config.format.as_xml
|
140
140
|
# end
|
141
141
|
#
|
142
|
-
def serialize
|
142
|
+
def serialize(options = {})
|
143
143
|
options[:save_with] ||= XML::Node::SaveOptions::DEFAULT_HTML
|
144
144
|
super
|
145
145
|
end
|
146
146
|
|
147
147
|
####
|
148
148
|
# Create a Nokogiri::XML::DocumentFragment from +tags+
|
149
|
-
def fragment
|
150
|
-
DocumentFragment.new(self, tags,
|
149
|
+
def fragment(tags = nil)
|
150
|
+
DocumentFragment.new(self, tags, root)
|
151
|
+
end
|
152
|
+
|
153
|
+
# :call-seq:
|
154
|
+
# xpath_doctype() → Nokogiri::CSS::XPathVisitor::DoctypeConfig
|
155
|
+
#
|
156
|
+
# [Returns] The document type which determines CSS-to-XPath translation.
|
157
|
+
#
|
158
|
+
# See XPathVisitor for more information.
|
159
|
+
def xpath_doctype
|
160
|
+
Nokogiri::CSS::XPathVisitor::DoctypeConfig::HTML4
|
151
161
|
end
|
152
162
|
|
153
163
|
class << self
|
@@ -159,12 +169,12 @@ module Nokogiri
|
|
159
169
|
# is a number that sets options in the parser, such as
|
160
170
|
# Nokogiri::XML::ParseOptions::RECOVER. See the constants in
|
161
171
|
# Nokogiri::XML::ParseOptions.
|
162
|
-
def parse
|
163
|
-
|
172
|
+
def parse(string_or_io, url = nil, encoding = nil, options = XML::ParseOptions::DEFAULT_HTML)
|
164
173
|
options = Nokogiri::XML::ParseOptions.new(options) if Integer === options
|
165
|
-
# Give the options to the user
|
166
174
|
yield options if block_given?
|
167
175
|
|
176
|
+
url ||= string_or_io.respond_to?(:path) ? string_or_io.path : nil
|
177
|
+
|
168
178
|
if string_or_io.respond_to?(:encoding)
|
169
179
|
unless string_or_io.encoding.name == "ASCII-8BIT"
|
170
180
|
encoding ||= string_or_io.encoding.name
|
@@ -172,7 +182,12 @@ module Nokogiri
|
|
172
182
|
end
|
173
183
|
|
174
184
|
if string_or_io.respond_to?(:read)
|
175
|
-
|
185
|
+
if string_or_io.is_a?(Pathname)
|
186
|
+
# resolve the Pathname to the file and open it as an IO object, see #2110
|
187
|
+
string_or_io = string_or_io.expand_path.open
|
188
|
+
url ||= string_or_io.path
|
189
|
+
end
|
190
|
+
|
176
191
|
unless encoding
|
177
192
|
# Libxml2's parser has poor support for encoding
|
178
193
|
# detection. First, it does not recognize the HTML5
|
@@ -196,7 +211,7 @@ module Nokogiri
|
|
196
211
|
end
|
197
212
|
|
198
213
|
# read_memory pukes on empty docs
|
199
|
-
if string_or_io.nil?
|
214
|
+
if string_or_io.nil? || string_or_io.empty?
|
200
215
|
return encoding ? new.tap { |i| i.encoding = encoding } : new
|
201
216
|
end
|
202
217
|
|
@@ -206,37 +221,39 @@ module Nokogiri
|
|
206
221
|
end
|
207
222
|
end
|
208
223
|
|
209
|
-
class EncodingFound < StandardError # :nodoc:
|
224
|
+
class EncodingFound < StandardError # :nodoc: all
|
210
225
|
attr_reader :found_encoding
|
211
226
|
|
212
227
|
def initialize(encoding)
|
213
228
|
@found_encoding = encoding
|
214
|
-
super("encoding found: %s"
|
229
|
+
super(format("encoding found: %s", encoding))
|
215
230
|
end
|
216
231
|
end
|
217
232
|
|
218
|
-
|
219
|
-
|
233
|
+
# :nodoc: all
|
234
|
+
class EncodingReader
|
235
|
+
class SAXHandler < Nokogiri::XML::SAX::Document
|
220
236
|
attr_reader :encoding
|
221
|
-
|
237
|
+
|
222
238
|
def initialize
|
223
239
|
@encoding = nil
|
224
240
|
super()
|
225
241
|
end
|
226
|
-
|
242
|
+
|
227
243
|
def start_element(name, attrs = [])
|
228
|
-
return unless name ==
|
244
|
+
return unless name == "meta"
|
245
|
+
|
229
246
|
attr = Hash[attrs]
|
230
|
-
charset = attr[
|
231
|
-
@encoding = charset
|
232
|
-
http_equiv = attr[
|
233
|
-
http_equiv.match(/\AContent-Type\z/i)
|
234
|
-
content = attr[
|
235
|
-
m = content.match(/;\s*charset\s*=\s*([\w-]+)/)
|
236
|
-
@encoding = m[1]
|
247
|
+
(charset = attr["charset"]) &&
|
248
|
+
(@encoding = charset)
|
249
|
+
(http_equiv = attr["http-equiv"]) &&
|
250
|
+
http_equiv.match(/\AContent-Type\z/i) &&
|
251
|
+
(content = attr["content"]) &&
|
252
|
+
(m = content.match(/;\s*charset\s*=\s*([\w-]+)/)) &&
|
253
|
+
(@encoding = m[1])
|
237
254
|
end
|
238
255
|
end
|
239
|
-
|
256
|
+
|
240
257
|
class JumpSAXHandler < SAXHandler
|
241
258
|
def initialize(jumptag)
|
242
259
|
@jumptag = jumptag
|
@@ -245,53 +262,34 @@ module Nokogiri
|
|
245
262
|
|
246
263
|
def start_element(name, attrs = [])
|
247
264
|
super
|
248
|
-
throw
|
249
|
-
throw
|
265
|
+
throw(@jumptag, @encoding) if @encoding
|
266
|
+
throw(@jumptag, nil) if /\A(?:div|h1|img|p|br)\z/.match?(name)
|
250
267
|
end
|
251
268
|
end
|
252
269
|
|
253
270
|
def self.detect_encoding(chunk)
|
254
|
-
|
255
|
-
return
|
256
|
-
end
|
257
|
-
m = chunk.match(/\A(<\?xml[ \t\r\n]+[^>]*>)/) and
|
258
|
-
return Nokogiri.XML(m[1]).encoding
|
271
|
+
(m = chunk.match(/\A(<\?xml[ \t\r\n][^>]*>)/)) &&
|
272
|
+
(return Nokogiri.XML(m[1]).encoding)
|
259
273
|
|
260
274
|
if Nokogiri.jruby?
|
261
|
-
m = chunk.match(/(<meta\s)(.*)(charset\s*=\s*([\w-]+))(.*)/i)
|
262
|
-
return m[4]
|
263
|
-
catch(:encoding_found)
|
264
|
-
Nokogiri::
|
275
|
+
(m = chunk.match(/(<meta\s)(.*)(charset\s*=\s*([\w-]+))(.*)/i)) &&
|
276
|
+
(return m[4])
|
277
|
+
catch(:encoding_found) do
|
278
|
+
Nokogiri::HTML4::SAX::Parser.new(JumpSAXHandler.new(:encoding_found)).parse(chunk)
|
265
279
|
nil
|
266
|
-
|
280
|
+
end
|
267
281
|
else
|
268
282
|
handler = SAXHandler.new
|
269
|
-
parser = Nokogiri::
|
270
|
-
|
283
|
+
parser = Nokogiri::HTML4::SAX::PushParser.new(handler)
|
284
|
+
begin
|
285
|
+
parser << chunk
|
286
|
+
rescue
|
287
|
+
Nokogiri::SyntaxError
|
288
|
+
end
|
271
289
|
handler.encoding
|
272
290
|
end
|
273
291
|
end
|
274
292
|
|
275
|
-
def self.is_jruby_without_fix?
|
276
|
-
JRUBY_VERSION.split('.').join.to_i < 165
|
277
|
-
end
|
278
|
-
|
279
|
-
def self.detect_encoding_for_jruby_without_fix(chunk)
|
280
|
-
m = chunk.match(/\A(<\?xml[ \t\r\n]+[^>]*>)/) and
|
281
|
-
return Nokogiri.XML(m[1]).encoding
|
282
|
-
|
283
|
-
m = chunk.match(/(<meta\s)(.*)(charset\s*=\s*([\w-]+))(.*)/i) and
|
284
|
-
return m[4]
|
285
|
-
|
286
|
-
catch(:encoding_found) {
|
287
|
-
Nokogiri::HTML::SAX::Parser.new(JumpSAXHandler.new(:encoding_found.to_s)).parse(chunk)
|
288
|
-
nil
|
289
|
-
}
|
290
|
-
rescue Nokogiri::SyntaxError, RuntimeError
|
291
|
-
# Ignore parser errors that nokogiri may raise
|
292
|
-
nil
|
293
|
-
end
|
294
|
-
|
295
293
|
def initialize(io)
|
296
294
|
@io = io
|
297
295
|
@firstchunk = nil
|
@@ -299,20 +297,20 @@ module Nokogiri
|
|
299
297
|
end
|
300
298
|
|
301
299
|
# This method is used by the C extension so that
|
302
|
-
# Nokogiri::
|
300
|
+
# Nokogiri::HTML4::Document#read_io() does not leak memory when
|
303
301
|
# EncodingFound is raised.
|
304
302
|
attr_reader :encoding_found
|
305
303
|
|
306
304
|
def read(len)
|
307
305
|
# no support for a call without len
|
308
306
|
|
309
|
-
|
310
|
-
@firstchunk = @io.read(len)
|
307
|
+
unless @firstchunk
|
308
|
+
(@firstchunk = @io.read(len)) || (return nil)
|
311
309
|
|
312
310
|
# This implementation expects that the first call from
|
313
311
|
# htmlReadIO() is made with a length long enough (~1KB) to
|
314
312
|
# achieve advanced encoding detection.
|
315
|
-
if encoding = EncodingReader.detect_encoding(@firstchunk)
|
313
|
+
if (encoding = EncodingReader.detect_encoding(@firstchunk))
|
316
314
|
# The first chunk is stored for the next read in retry.
|
317
315
|
raise @encoding_found = EncodingFound.new(encoding)
|
318
316
|
end
|
@@ -321,7 +319,7 @@ module Nokogiri
|
|
321
319
|
|
322
320
|
ret = @firstchunk.slice!(0, len)
|
323
321
|
if (len -= ret.length) > 0
|
324
|
-
rest = @io.read(len)
|
322
|
+
(rest = @io.read(len)) && ret << (rest)
|
325
323
|
end
|
326
324
|
if ret.empty?
|
327
325
|
nil
|
@@ -0,0 +1,54 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Nokogiri
|
4
|
+
module HTML4
|
5
|
+
class DocumentFragment < Nokogiri::XML::DocumentFragment
|
6
|
+
####
|
7
|
+
# Create a Nokogiri::XML::DocumentFragment from +tags+, using +encoding+
|
8
|
+
def self.parse(tags, encoding = nil, options = XML::ParseOptions::DEFAULT_HTML, &block)
|
9
|
+
doc = HTML4::Document.new
|
10
|
+
|
11
|
+
encoding ||= if tags.respond_to?(:encoding)
|
12
|
+
encoding = tags.encoding
|
13
|
+
if encoding == ::Encoding::ASCII_8BIT
|
14
|
+
"UTF-8"
|
15
|
+
else
|
16
|
+
encoding.name
|
17
|
+
end
|
18
|
+
else
|
19
|
+
"UTF-8"
|
20
|
+
end
|
21
|
+
|
22
|
+
doc.encoding = encoding
|
23
|
+
|
24
|
+
new(doc, tags, nil, options, &block)
|
25
|
+
end
|
26
|
+
|
27
|
+
def initialize(document, tags = nil, ctx = nil, options = XML::ParseOptions::DEFAULT_HTML)
|
28
|
+
return self unless tags
|
29
|
+
|
30
|
+
options = Nokogiri::XML::ParseOptions.new(options) if Integer === options
|
31
|
+
yield options if block_given?
|
32
|
+
|
33
|
+
if ctx
|
34
|
+
preexisting_errors = document.errors.dup
|
35
|
+
node_set = ctx.parse("<div>#{tags}</div>", options)
|
36
|
+
node_set.first.children.each { |child| child.parent = self } unless node_set.empty?
|
37
|
+
self.errors = document.errors - preexisting_errors
|
38
|
+
else
|
39
|
+
# This is a horrible hack, but I don't care
|
40
|
+
path = if /^\s*?<body/i.match?(tags)
|
41
|
+
"/html/body"
|
42
|
+
else
|
43
|
+
"/html/body/node()"
|
44
|
+
end
|
45
|
+
|
46
|
+
temp_doc = HTML4::Document.parse("<html><body>#{tags}", nil, document.encoding, options)
|
47
|
+
temp_doc.xpath(path).each { |child| child.parent = self }
|
48
|
+
self.errors = temp_doc.errors
|
49
|
+
end
|
50
|
+
children
|
51
|
+
end
|
52
|
+
end
|
53
|
+
end
|
54
|
+
end
|