nokogiri 1.9.1 → 1.15.3
Sign up to get free protection for your applications and to get access to all the features.
Potentially problematic release.
This version of nokogiri might be problematic. Click here for more details.
- checksums.yaml +4 -4
- data/Gemfile +45 -0
- data/LICENSE-DEPENDENCIES.md +1636 -1024
- data/LICENSE.md +5 -28
- data/README.md +203 -89
- data/bin/nokogiri +63 -50
- data/dependencies.yml +33 -61
- data/ext/nokogiri/depend +38 -358
- data/ext/nokogiri/extconf.rb +864 -418
- data/ext/nokogiri/gumbo.c +594 -0
- data/ext/nokogiri/html4_document.c +165 -0
- data/ext/nokogiri/html4_element_description.c +299 -0
- data/ext/nokogiri/html4_entity_lookup.c +37 -0
- data/ext/nokogiri/html4_sax_parser_context.c +108 -0
- data/ext/nokogiri/html4_sax_push_parser.c +95 -0
- data/ext/nokogiri/libxml2_backwards_compat.c +121 -0
- data/ext/nokogiri/nokogiri.c +251 -105
- data/ext/nokogiri/nokogiri.h +215 -90
- data/ext/nokogiri/test_global_handlers.c +40 -0
- data/ext/nokogiri/xml_attr.c +17 -17
- data/ext/nokogiri/xml_attribute_decl.c +22 -22
- data/ext/nokogiri/xml_cdata.c +40 -31
- data/ext/nokogiri/xml_comment.c +20 -27
- data/ext/nokogiri/xml_document.c +401 -240
- data/ext/nokogiri/xml_document_fragment.c +13 -17
- data/ext/nokogiri/xml_dtd.c +64 -58
- data/ext/nokogiri/xml_element_content.c +63 -55
- data/ext/nokogiri/xml_element_decl.c +31 -31
- data/ext/nokogiri/xml_encoding_handler.c +54 -21
- data/ext/nokogiri/xml_entity_decl.c +37 -35
- data/ext/nokogiri/xml_entity_reference.c +17 -19
- data/ext/nokogiri/xml_namespace.c +135 -61
- data/ext/nokogiri/xml_node.c +1346 -677
- data/ext/nokogiri/xml_node_set.c +246 -216
- data/ext/nokogiri/xml_processing_instruction.c +18 -20
- data/ext/nokogiri/xml_reader.c +347 -212
- data/ext/nokogiri/xml_relax_ng.c +86 -77
- data/ext/nokogiri/xml_sax_parser.c +149 -124
- data/ext/nokogiri/xml_sax_parser_context.c +145 -103
- data/ext/nokogiri/xml_sax_push_parser.c +64 -36
- data/ext/nokogiri/xml_schema.c +138 -81
- data/ext/nokogiri/xml_syntax_error.c +42 -21
- data/ext/nokogiri/xml_text.c +36 -26
- data/ext/nokogiri/xml_xpath_context.c +366 -178
- data/ext/nokogiri/xslt_stylesheet.c +335 -189
- data/gumbo-parser/CHANGES.md +63 -0
- data/gumbo-parser/Makefile +111 -0
- data/gumbo-parser/THANKS +27 -0
- data/gumbo-parser/src/Makefile +34 -0
- data/gumbo-parser/src/README.md +41 -0
- data/gumbo-parser/src/ascii.c +75 -0
- data/gumbo-parser/src/ascii.h +115 -0
- data/gumbo-parser/src/attribute.c +42 -0
- data/gumbo-parser/src/attribute.h +17 -0
- data/gumbo-parser/src/char_ref.c +22225 -0
- data/gumbo-parser/src/char_ref.h +29 -0
- data/gumbo-parser/src/char_ref.rl +2154 -0
- data/gumbo-parser/src/error.c +630 -0
- data/gumbo-parser/src/error.h +148 -0
- data/gumbo-parser/src/foreign_attrs.c +103 -0
- data/gumbo-parser/src/foreign_attrs.gperf +27 -0
- data/gumbo-parser/src/insertion_mode.h +33 -0
- data/gumbo-parser/src/macros.h +91 -0
- data/gumbo-parser/src/nokogiri_gumbo.h +944 -0
- data/gumbo-parser/src/parser.c +4891 -0
- data/gumbo-parser/src/parser.h +41 -0
- data/gumbo-parser/src/replacement.h +33 -0
- data/gumbo-parser/src/string_buffer.c +103 -0
- data/gumbo-parser/src/string_buffer.h +68 -0
- data/gumbo-parser/src/string_piece.c +48 -0
- data/gumbo-parser/src/svg_attrs.c +174 -0
- data/gumbo-parser/src/svg_attrs.gperf +77 -0
- data/gumbo-parser/src/svg_tags.c +137 -0
- data/gumbo-parser/src/svg_tags.gperf +55 -0
- data/gumbo-parser/src/tag.c +223 -0
- data/gumbo-parser/src/tag_lookup.c +382 -0
- data/gumbo-parser/src/tag_lookup.gperf +170 -0
- data/gumbo-parser/src/tag_lookup.h +13 -0
- data/gumbo-parser/src/token_buffer.c +79 -0
- data/gumbo-parser/src/token_buffer.h +71 -0
- data/gumbo-parser/src/token_type.h +17 -0
- data/gumbo-parser/src/tokenizer.c +3463 -0
- data/gumbo-parser/src/tokenizer.h +112 -0
- data/gumbo-parser/src/tokenizer_states.h +339 -0
- data/gumbo-parser/src/utf8.c +245 -0
- data/gumbo-parser/src/utf8.h +164 -0
- data/gumbo-parser/src/util.c +66 -0
- data/gumbo-parser/src/util.h +34 -0
- data/gumbo-parser/src/vector.c +111 -0
- data/gumbo-parser/src/vector.h +45 -0
- data/lib/nokogiri/class_resolver.rb +67 -0
- data/lib/nokogiri/css/node.rb +10 -8
- data/lib/nokogiri/css/parser.rb +397 -377
- data/lib/nokogiri/css/parser.y +250 -245
- data/lib/nokogiri/css/parser_extras.rb +54 -49
- data/lib/nokogiri/css/syntax_error.rb +3 -1
- data/lib/nokogiri/css/tokenizer.rb +107 -104
- data/lib/nokogiri/css/tokenizer.rex +3 -2
- data/lib/nokogiri/css/xpath_visitor.rb +224 -95
- data/lib/nokogiri/css.rb +56 -17
- data/lib/nokogiri/decorators/slop.rb +9 -7
- data/lib/nokogiri/encoding_handler.rb +57 -0
- data/lib/nokogiri/extension.rb +32 -0
- data/lib/nokogiri/gumbo.rb +15 -0
- data/lib/nokogiri/html.rb +38 -27
- data/lib/nokogiri/{html → html4}/builder.rb +4 -2
- data/lib/nokogiri/html4/document.rb +214 -0
- data/lib/nokogiri/html4/document_fragment.rb +54 -0
- data/lib/nokogiri/{html → html4}/element_description.rb +3 -1
- data/lib/nokogiri/html4/element_description_defaults.rb +2040 -0
- data/lib/nokogiri/html4/encoding_reader.rb +121 -0
- data/lib/nokogiri/{html → html4}/entity_lookup.rb +4 -2
- data/lib/nokogiri/{html → html4}/sax/parser.rb +17 -16
- data/lib/nokogiri/html4/sax/parser_context.rb +20 -0
- data/lib/nokogiri/{html → html4}/sax/push_parser.rb +12 -11
- data/lib/nokogiri/html4.rb +47 -0
- data/lib/nokogiri/html5/document.rb +168 -0
- data/lib/nokogiri/html5/document_fragment.rb +90 -0
- data/lib/nokogiri/html5/node.rb +103 -0
- data/lib/nokogiri/html5.rb +392 -0
- data/lib/nokogiri/jruby/dependencies.rb +3 -0
- data/lib/nokogiri/jruby/nokogiri_jars.rb +43 -0
- data/lib/nokogiri/syntax_error.rb +2 -0
- data/lib/nokogiri/version/constant.rb +6 -0
- data/lib/nokogiri/version/info.rb +223 -0
- data/lib/nokogiri/version.rb +3 -108
- data/lib/nokogiri/xml/attr.rb +55 -3
- data/lib/nokogiri/xml/attribute_decl.rb +6 -2
- data/lib/nokogiri/xml/builder.rb +98 -54
- data/lib/nokogiri/xml/cdata.rb +3 -1
- data/lib/nokogiri/xml/character_data.rb +2 -0
- data/lib/nokogiri/xml/document.rb +312 -126
- data/lib/nokogiri/xml/document_fragment.rb +93 -48
- data/lib/nokogiri/xml/dtd.rb +4 -2
- data/lib/nokogiri/xml/element_content.rb +12 -2
- data/lib/nokogiri/xml/element_decl.rb +6 -2
- data/lib/nokogiri/xml/entity_decl.rb +7 -3
- data/lib/nokogiri/xml/entity_reference.rb +2 -0
- data/lib/nokogiri/xml/namespace.rb +45 -0
- data/lib/nokogiri/xml/node/save_options.rb +23 -8
- data/lib/nokogiri/xml/node.rb +1088 -418
- data/lib/nokogiri/xml/node_set.rb +173 -63
- data/lib/nokogiri/xml/notation.rb +13 -0
- data/lib/nokogiri/xml/parse_options.rb +145 -52
- data/lib/nokogiri/xml/pp/character_data.rb +9 -6
- data/lib/nokogiri/xml/pp/node.rb +42 -30
- data/lib/nokogiri/xml/pp.rb +4 -2
- data/lib/nokogiri/xml/processing_instruction.rb +4 -1
- data/lib/nokogiri/xml/reader.rb +21 -28
- data/lib/nokogiri/xml/relax_ng.rb +8 -2
- data/lib/nokogiri/xml/sax/document.rb +45 -49
- data/lib/nokogiri/xml/sax/parser.rb +39 -36
- data/lib/nokogiri/xml/sax/parser_context.rb +8 -3
- data/lib/nokogiri/xml/sax/push_parser.rb +6 -5
- data/lib/nokogiri/xml/sax.rb +6 -4
- data/lib/nokogiri/xml/schema.rb +19 -9
- data/lib/nokogiri/xml/searchable.rb +120 -72
- data/lib/nokogiri/xml/syntax_error.rb +6 -4
- data/lib/nokogiri/xml/text.rb +2 -0
- data/lib/nokogiri/xml/xpath/syntax_error.rb +4 -2
- data/lib/nokogiri/xml/xpath.rb +15 -4
- data/lib/nokogiri/xml/xpath_context.rb +3 -3
- data/lib/nokogiri/xml.rb +38 -37
- data/lib/nokogiri/xslt/stylesheet.rb +3 -1
- data/lib/nokogiri/xslt.rb +101 -22
- data/lib/nokogiri.rb +59 -75
- data/lib/xsd/xmlparser/nokogiri.rb +29 -25
- data/patches/libxml2/0001-Remove-script-macro-support.patch +40 -0
- data/patches/libxml2/0002-Update-entities-to-remove-handling-of-ssi.patch +44 -0
- data/patches/libxml2/0003-libxml2.la-is-in-top_builddir.patch +25 -0
- data/patches/libxml2/0009-allow-wildcard-namespaces.patch +77 -0
- data/patches/libxml2/0010-update-config.guess-and-config.sub-for-libxml2.patch +224 -0
- data/patches/libxml2/0011-rip-out-libxml2-s-libc_single_threaded-support.patch +30 -0
- data/patches/libxslt/0001-update-config.guess-and-config.sub-for-libxslt.patch +224 -0
- data/ports/archives/libxml2-2.11.4.tar.xz +0 -0
- data/ports/archives/libxslt-1.1.38.tar.xz +0 -0
- metadata +128 -265
- data/ext/nokogiri/html_document.c +0 -170
- data/ext/nokogiri/html_document.h +0 -10
- data/ext/nokogiri/html_element_description.c +0 -279
- data/ext/nokogiri/html_element_description.h +0 -10
- data/ext/nokogiri/html_entity_lookup.c +0 -32
- data/ext/nokogiri/html_entity_lookup.h +0 -8
- data/ext/nokogiri/html_sax_parser_context.c +0 -116
- data/ext/nokogiri/html_sax_parser_context.h +0 -11
- data/ext/nokogiri/html_sax_push_parser.c +0 -87
- data/ext/nokogiri/html_sax_push_parser.h +0 -9
- data/ext/nokogiri/xml_attr.h +0 -9
- data/ext/nokogiri/xml_attribute_decl.h +0 -9
- data/ext/nokogiri/xml_cdata.h +0 -9
- data/ext/nokogiri/xml_comment.h +0 -9
- data/ext/nokogiri/xml_document.h +0 -23
- data/ext/nokogiri/xml_document_fragment.h +0 -10
- data/ext/nokogiri/xml_dtd.h +0 -10
- data/ext/nokogiri/xml_element_content.h +0 -10
- data/ext/nokogiri/xml_element_decl.h +0 -9
- data/ext/nokogiri/xml_encoding_handler.h +0 -8
- data/ext/nokogiri/xml_entity_decl.h +0 -10
- data/ext/nokogiri/xml_entity_reference.h +0 -9
- data/ext/nokogiri/xml_io.c +0 -61
- data/ext/nokogiri/xml_io.h +0 -11
- data/ext/nokogiri/xml_libxml2_hacks.c +0 -112
- data/ext/nokogiri/xml_libxml2_hacks.h +0 -12
- data/ext/nokogiri/xml_namespace.h +0 -14
- data/ext/nokogiri/xml_node.h +0 -13
- data/ext/nokogiri/xml_node_set.h +0 -12
- data/ext/nokogiri/xml_processing_instruction.h +0 -9
- data/ext/nokogiri/xml_reader.h +0 -10
- data/ext/nokogiri/xml_relax_ng.h +0 -9
- data/ext/nokogiri/xml_sax_parser.h +0 -39
- data/ext/nokogiri/xml_sax_parser_context.h +0 -10
- data/ext/nokogiri/xml_sax_push_parser.h +0 -9
- data/ext/nokogiri/xml_schema.h +0 -9
- data/ext/nokogiri/xml_syntax_error.h +0 -13
- data/ext/nokogiri/xml_text.h +0 -9
- data/ext/nokogiri/xml_xpath_context.h +0 -10
- data/ext/nokogiri/xslt_stylesheet.h +0 -14
- data/lib/nokogiri/html/document.rb +0 -335
- data/lib/nokogiri/html/document_fragment.rb +0 -49
- data/lib/nokogiri/html/element_description_defaults.rb +0 -671
- data/lib/nokogiri/html/sax/parser_context.rb +0 -16
- data/patches/libxml2/0001-Revert-Do-not-URI-escape-in-server-side-includes.patch +0 -78
- data/patches/libxml2/0002-Fix-nullptr-deref-with-XPath-logic-ops.patch +0 -54
- data/patches/libxml2/0003-Fix-infinite-loop-in-LZMA-decompression.patch +0 -50
- data/ports/archives/libxml2-2.9.8.tar.gz +0 -0
- data/ports/archives/libxslt-1.1.32.tar.gz +0 -0
@@ -0,0 +1,121 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Nokogiri
|
4
|
+
module HTML4
|
5
|
+
# Libxml2's parser has poor support for encoding detection. First, it does not recognize the
|
6
|
+
# HTML5 style meta charset declaration. Secondly, even if it successfully detects an encoding
|
7
|
+
# hint, it does not re-decode or re-parse the preceding part which may be garbled.
|
8
|
+
#
|
9
|
+
# EncodingReader aims to perform advanced encoding detection beyond what Libxml2 does, and to
|
10
|
+
# emulate rewinding of a stream and make Libxml2 redo parsing from the start when an encoding
|
11
|
+
# hint is found.
|
12
|
+
|
13
|
+
# :nodoc: all
|
14
|
+
class EncodingReader
|
15
|
+
class EncodingFound < StandardError
|
16
|
+
attr_reader :found_encoding
|
17
|
+
|
18
|
+
def initialize(encoding)
|
19
|
+
@found_encoding = encoding
|
20
|
+
super(format("encoding found: %s", encoding))
|
21
|
+
end
|
22
|
+
end
|
23
|
+
|
24
|
+
class SAXHandler < Nokogiri::XML::SAX::Document
|
25
|
+
attr_reader :encoding
|
26
|
+
|
27
|
+
def initialize
|
28
|
+
@encoding = nil
|
29
|
+
super()
|
30
|
+
end
|
31
|
+
|
32
|
+
def start_element(name, attrs = [])
|
33
|
+
return unless name == "meta"
|
34
|
+
|
35
|
+
attr = Hash[attrs]
|
36
|
+
(charset = attr["charset"]) &&
|
37
|
+
(@encoding = charset)
|
38
|
+
(http_equiv = attr["http-equiv"]) &&
|
39
|
+
http_equiv.match(/\AContent-Type\z/i) &&
|
40
|
+
(content = attr["content"]) &&
|
41
|
+
(m = content.match(/;\s*charset\s*=\s*([\w-]+)/)) &&
|
42
|
+
(@encoding = m[1])
|
43
|
+
end
|
44
|
+
end
|
45
|
+
|
46
|
+
class JumpSAXHandler < SAXHandler
|
47
|
+
def initialize(jumptag)
|
48
|
+
@jumptag = jumptag
|
49
|
+
super()
|
50
|
+
end
|
51
|
+
|
52
|
+
def start_element(name, attrs = [])
|
53
|
+
super
|
54
|
+
throw(@jumptag, @encoding) if @encoding
|
55
|
+
throw(@jumptag, nil) if /\A(?:div|h1|img|p|br)\z/.match?(name)
|
56
|
+
end
|
57
|
+
end
|
58
|
+
|
59
|
+
def self.detect_encoding(chunk)
|
60
|
+
(m = chunk.match(/\A(<\?xml[ \t\r\n][^>]*>)/)) &&
|
61
|
+
(return Nokogiri.XML(m[1]).encoding)
|
62
|
+
|
63
|
+
if Nokogiri.jruby?
|
64
|
+
(m = chunk.match(/(<meta\s)(.*)(charset\s*=\s*([\w-]+))(.*)/i)) &&
|
65
|
+
(return m[4])
|
66
|
+
catch(:encoding_found) do
|
67
|
+
Nokogiri::HTML4::SAX::Parser.new(JumpSAXHandler.new(:encoding_found)).parse(chunk)
|
68
|
+
nil
|
69
|
+
end
|
70
|
+
else
|
71
|
+
handler = SAXHandler.new
|
72
|
+
parser = Nokogiri::HTML4::SAX::PushParser.new(handler)
|
73
|
+
begin
|
74
|
+
parser << chunk
|
75
|
+
rescue
|
76
|
+
Nokogiri::SyntaxError
|
77
|
+
end
|
78
|
+
handler.encoding
|
79
|
+
end
|
80
|
+
end
|
81
|
+
|
82
|
+
def initialize(io)
|
83
|
+
@io = io
|
84
|
+
@firstchunk = nil
|
85
|
+
@encoding_found = nil
|
86
|
+
end
|
87
|
+
|
88
|
+
# This method is used by the C extension so that
|
89
|
+
# Nokogiri::HTML4::Document#read_io() does not leak memory when
|
90
|
+
# EncodingFound is raised.
|
91
|
+
attr_reader :encoding_found
|
92
|
+
|
93
|
+
def read(len)
|
94
|
+
# no support for a call without len
|
95
|
+
|
96
|
+
unless @firstchunk
|
97
|
+
(@firstchunk = @io.read(len)) || (return nil)
|
98
|
+
|
99
|
+
# This implementation expects that the first call from
|
100
|
+
# htmlReadIO() is made with a length long enough (~1KB) to
|
101
|
+
# achieve advanced encoding detection.
|
102
|
+
if (encoding = EncodingReader.detect_encoding(@firstchunk))
|
103
|
+
# The first chunk is stored for the next read in retry.
|
104
|
+
raise @encoding_found = EncodingFound.new(encoding)
|
105
|
+
end
|
106
|
+
end
|
107
|
+
@encoding_found = nil
|
108
|
+
|
109
|
+
ret = @firstchunk.slice!(0, len)
|
110
|
+
if (len -= ret.length) > 0
|
111
|
+
(rest = @io.read(len)) && ret << (rest)
|
112
|
+
end
|
113
|
+
if ret.empty?
|
114
|
+
nil
|
115
|
+
else
|
116
|
+
ret
|
117
|
+
end
|
118
|
+
end
|
119
|
+
end
|
120
|
+
end
|
121
|
+
end
|
@@ -1,11 +1,13 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
module Nokogiri
|
2
|
-
module
|
4
|
+
module HTML4
|
3
5
|
class EntityDescription < Struct.new(:value, :name, :description); end
|
4
6
|
|
5
7
|
class EntityLookup
|
6
8
|
###
|
7
9
|
# Look up entity with +name+
|
8
|
-
def []
|
10
|
+
def [](name)
|
9
11
|
(val = get(name)) && val.value
|
10
12
|
end
|
11
13
|
end
|
@@ -1,17 +1,16 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
module Nokogiri
|
2
|
-
module
|
4
|
+
module HTML4
|
3
5
|
###
|
4
|
-
# Nokogiri lets you write a SAX parser to process HTML but get HTML
|
5
|
-
# correction features.
|
6
|
+
# Nokogiri lets you write a SAX parser to process HTML but get HTML correction features.
|
6
7
|
#
|
7
|
-
# See Nokogiri::HTML::SAX::Parser for a basic example of using a
|
8
|
-
# SAX parser with HTML.
|
8
|
+
# See Nokogiri::HTML4::SAX::Parser for a basic example of using a SAX parser with HTML.
|
9
9
|
#
|
10
10
|
# For more information on SAX parsers, see Nokogiri::XML::SAX
|
11
11
|
module SAX
|
12
12
|
###
|
13
|
-
# This class lets you perform SAX style parsing on HTML with HTML
|
14
|
-
# error correction.
|
13
|
+
# This class lets you perform SAX style parsing on HTML with HTML error correction.
|
15
14
|
#
|
16
15
|
# Here is a basic usage example:
|
17
16
|
#
|
@@ -21,40 +20,42 @@ module Nokogiri
|
|
21
20
|
# end
|
22
21
|
# end
|
23
22
|
#
|
24
|
-
# parser = Nokogiri::HTML::SAX::Parser.new(MyDoc.new)
|
23
|
+
# parser = Nokogiri::HTML4::SAX::Parser.new(MyDoc.new)
|
25
24
|
# parser.parse(File.read(ARGV[0], mode: 'rb'))
|
26
25
|
#
|
27
26
|
# For more information on SAX parsers, see Nokogiri::XML::SAX
|
28
27
|
class Parser < Nokogiri::XML::SAX::Parser
|
29
28
|
###
|
30
29
|
# Parse html stored in +data+ using +encoding+
|
31
|
-
def parse_memory
|
32
|
-
raise
|
33
|
-
return
|
30
|
+
def parse_memory(data, encoding = "UTF-8")
|
31
|
+
raise TypeError unless String === data
|
32
|
+
return if data.empty?
|
33
|
+
|
34
34
|
ctx = ParserContext.memory(data, encoding)
|
35
35
|
yield ctx if block_given?
|
36
|
-
ctx.parse_with
|
36
|
+
ctx.parse_with(self)
|
37
37
|
end
|
38
38
|
|
39
39
|
###
|
40
40
|
# Parse given +io+
|
41
|
-
def parse_io
|
41
|
+
def parse_io(io, encoding = "UTF-8")
|
42
42
|
check_encoding(encoding)
|
43
43
|
@encoding = encoding
|
44
44
|
ctx = ParserContext.io(io, ENCODINGS[encoding])
|
45
45
|
yield ctx if block_given?
|
46
|
-
ctx.parse_with
|
46
|
+
ctx.parse_with(self)
|
47
47
|
end
|
48
48
|
|
49
49
|
###
|
50
50
|
# Parse a file with +filename+
|
51
|
-
def parse_file
|
51
|
+
def parse_file(filename, encoding = "UTF-8")
|
52
52
|
raise ArgumentError unless filename
|
53
53
|
raise Errno::ENOENT unless File.exist?(filename)
|
54
54
|
raise Errno::EISDIR if File.directory?(filename)
|
55
|
+
|
55
56
|
ctx = ParserContext.file(filename, encoding)
|
56
57
|
yield ctx if block_given?
|
57
|
-
ctx.parse_with
|
58
|
+
ctx.parse_with(self)
|
58
59
|
end
|
59
60
|
end
|
60
61
|
end
|
@@ -0,0 +1,20 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Nokogiri
|
4
|
+
module HTML4
|
5
|
+
module SAX
|
6
|
+
###
|
7
|
+
# Context for HTML SAX parsers. This class is usually not instantiated by the user. Instead,
|
8
|
+
# you should be looking at Nokogiri::HTML4::SAX::Parser
|
9
|
+
class ParserContext < Nokogiri::XML::SAX::ParserContext
|
10
|
+
def self.new(thing, encoding = "UTF-8")
|
11
|
+
if [:read, :close].all? { |x| thing.respond_to?(x) }
|
12
|
+
super
|
13
|
+
else
|
14
|
+
memory(thing, encoding)
|
15
|
+
end
|
16
|
+
end
|
17
|
+
end
|
18
|
+
end
|
19
|
+
end
|
20
|
+
end
|
@@ -1,34 +1,35 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
module Nokogiri
|
2
|
-
module
|
4
|
+
module HTML4
|
3
5
|
module SAX
|
4
6
|
class PushParser
|
5
|
-
|
6
|
-
# The Nokogiri::HTML::SAX::Document on which the PushParser will be
|
7
|
+
# The Nokogiri::HTML4::SAX::Document on which the PushParser will be
|
7
8
|
# operating
|
8
9
|
attr_accessor :document
|
9
|
-
|
10
|
-
def initialize(doc =
|
10
|
+
|
11
|
+
def initialize(doc = HTML4::SAX::Document.new, file_name = nil, encoding = "UTF-8")
|
11
12
|
@document = doc
|
12
13
|
@encoding = encoding
|
13
|
-
@sax_parser =
|
14
|
+
@sax_parser = HTML4::SAX::Parser.new(doc, @encoding)
|
14
15
|
|
15
16
|
## Create our push parser context
|
16
17
|
initialize_native(@sax_parser, file_name, encoding)
|
17
18
|
end
|
18
|
-
|
19
|
+
|
19
20
|
###
|
20
21
|
# Write a +chunk+ of HTML to the PushParser. Any callback methods
|
21
22
|
# that can be called will be called immediately.
|
22
|
-
def write
|
23
|
+
def write(chunk, last_chunk = false)
|
23
24
|
native_write(chunk, last_chunk)
|
24
25
|
end
|
25
|
-
|
26
|
+
alias_method :<<, :write
|
26
27
|
|
27
28
|
###
|
28
29
|
# Finish the parsing. This method is only necessary for
|
29
|
-
# Nokogiri::HTML::SAX::Document#end_document to be called.
|
30
|
+
# Nokogiri::HTML4::SAX::Document#end_document to be called.
|
30
31
|
def finish
|
31
|
-
write
|
32
|
+
write("", true)
|
32
33
|
end
|
33
34
|
end
|
34
35
|
end
|
@@ -0,0 +1,47 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
# frozen_string_literal: true
|
3
|
+
|
4
|
+
module Nokogiri
|
5
|
+
class << self
|
6
|
+
# :call-seq:
|
7
|
+
# HTML4(input, url = nil, encoding = nil, options = XML::ParseOptions::DEFAULT_HTML, &block) → Nokogiri::HTML4::Document
|
8
|
+
#
|
9
|
+
# Parse HTML. Convenience method for Nokogiri::HTML4::Document.parse
|
10
|
+
def HTML4(input, url = nil, encoding = nil, options = XML::ParseOptions::DEFAULT_HTML, &block)
|
11
|
+
Nokogiri::HTML4::Document.parse(input, url, encoding, options, &block)
|
12
|
+
end
|
13
|
+
end
|
14
|
+
|
15
|
+
# Since v1.12.0
|
16
|
+
#
|
17
|
+
# 💡 Before v1.12.0, Nokogiri::HTML4 did not exist, and Nokogiri::HTML was the module/namespace
|
18
|
+
# for parsing HTML.
|
19
|
+
module HTML4
|
20
|
+
class << self
|
21
|
+
###
|
22
|
+
# Parse HTML. Convenience method for Nokogiri::HTML4::Document.parse
|
23
|
+
def parse(input, url = nil, encoding = nil, options = XML::ParseOptions::DEFAULT_HTML, &block)
|
24
|
+
Document.parse(input, url, encoding, options, &block)
|
25
|
+
end
|
26
|
+
|
27
|
+
####
|
28
|
+
# Parse a fragment from +string+ into a NodeSet.
|
29
|
+
def fragment(string, encoding = nil, options = XML::ParseOptions::DEFAULT_HTML, &block)
|
30
|
+
HTML4::DocumentFragment.parse(string, encoding, options, &block)
|
31
|
+
end
|
32
|
+
end
|
33
|
+
|
34
|
+
# Instance of Nokogiri::HTML4::EntityLookup
|
35
|
+
NamedCharacters = EntityLookup.new
|
36
|
+
end
|
37
|
+
end
|
38
|
+
|
39
|
+
require_relative "html4/entity_lookup"
|
40
|
+
require_relative "html4/document"
|
41
|
+
require_relative "html4/document_fragment"
|
42
|
+
require_relative "html4/encoding_reader"
|
43
|
+
require_relative "html4/sax/parser_context"
|
44
|
+
require_relative "html4/sax/parser"
|
45
|
+
require_relative "html4/sax/push_parser"
|
46
|
+
require_relative "html4/element_description"
|
47
|
+
require_relative "html4/element_description_defaults"
|
@@ -0,0 +1,168 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
# frozen_string_literal: true
|
3
|
+
|
4
|
+
#
|
5
|
+
# Copyright 2013-2021 Sam Ruby, Stephen Checkoway
|
6
|
+
#
|
7
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
8
|
+
# you may not use this file except in compliance with the License.
|
9
|
+
# You may obtain a copy of the License at
|
10
|
+
#
|
11
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
12
|
+
#
|
13
|
+
# Unless required by applicable law or agreed to in writing, software
|
14
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
15
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
16
|
+
# See the License for the specific language governing permissions and
|
17
|
+
# limitations under the License.
|
18
|
+
#
|
19
|
+
|
20
|
+
require_relative "../html4/document"
|
21
|
+
|
22
|
+
module Nokogiri
|
23
|
+
module HTML5
|
24
|
+
# Enum for the HTML5 parser quirks mode values. Values returned by HTML5::Document#quirks_mode
|
25
|
+
#
|
26
|
+
# See https://dom.spec.whatwg.org/#concept-document-quirks for more information on HTML5 quirks
|
27
|
+
# mode.
|
28
|
+
#
|
29
|
+
# Since v1.14.0
|
30
|
+
module QuirksMode
|
31
|
+
NO_QUIRKS = 0 # The document was parsed in "no-quirks" mode
|
32
|
+
QUIRKS = 1 # The document was parsed in "quirks" mode
|
33
|
+
LIMITED_QUIRKS = 2 # The document was parsed in "limited-quirks" mode
|
34
|
+
end
|
35
|
+
|
36
|
+
# Since v1.12.0
|
37
|
+
#
|
38
|
+
# 💡 HTML5 functionality is not available when running JRuby.
|
39
|
+
class Document < Nokogiri::HTML4::Document
|
40
|
+
# Get the url name for this document, as passed into Document.parse, Document.read_io, or
|
41
|
+
# Document.read_memory
|
42
|
+
attr_reader :url
|
43
|
+
|
44
|
+
# Get the parser's quirks mode value. See HTML5::QuirksMode.
|
45
|
+
#
|
46
|
+
# This method returns `nil` if the parser was not invoked (e.g., `Nokogiri::HTML5::Document.new`).
|
47
|
+
#
|
48
|
+
# Since v1.14.0
|
49
|
+
attr_reader :quirks_mode
|
50
|
+
|
51
|
+
class << self
|
52
|
+
# :call-seq:
|
53
|
+
# parse(input)
|
54
|
+
# parse(input, url=nil, encoding=nil, **options)
|
55
|
+
# parse(input, url=nil, encoding=nil) { |options| ... }
|
56
|
+
#
|
57
|
+
# Parse HTML5 input.
|
58
|
+
#
|
59
|
+
# [Parameters]
|
60
|
+
# - +input+ may be a String, or any object that responds to _read_ and _close_ such as an
|
61
|
+
# IO, or StringIO.
|
62
|
+
#
|
63
|
+
# - +url+ (optional) is a String indicating the canonical URI where this document is located.
|
64
|
+
#
|
65
|
+
# - +encoding+ (optional) is the encoding that should be used when processing
|
66
|
+
# the document.
|
67
|
+
#
|
68
|
+
# - +options+ (optional) is a configuration Hash (or keyword arguments) to set options
|
69
|
+
# during parsing. The three currently supported options are +:max_errors+,
|
70
|
+
# +:max_tree_depth+ and +:max_attributes+, described at Nokogiri::HTML5.
|
71
|
+
#
|
72
|
+
# ⚠ Note that these options are different than those made available by
|
73
|
+
# Nokogiri::XML::Document and Nokogiri::HTML4::Document.
|
74
|
+
#
|
75
|
+
# - +block+ (optional) is passed a configuration Hash on which parse options may be set. See
|
76
|
+
# Nokogiri::HTML5 for more information and usage.
|
77
|
+
#
|
78
|
+
# [Returns] Nokogiri::HTML5::Document
|
79
|
+
#
|
80
|
+
def parse(string_or_io, url = nil, encoding = nil, **options, &block)
|
81
|
+
yield options if block
|
82
|
+
string_or_io = "" unless string_or_io
|
83
|
+
|
84
|
+
if string_or_io.respond_to?(:encoding) && string_or_io.encoding != Encoding::ASCII_8BIT
|
85
|
+
encoding ||= string_or_io.encoding.name
|
86
|
+
end
|
87
|
+
|
88
|
+
if string_or_io.respond_to?(:read) && string_or_io.respond_to?(:path)
|
89
|
+
url ||= string_or_io.path
|
90
|
+
end
|
91
|
+
unless string_or_io.respond_to?(:read) || string_or_io.respond_to?(:to_str)
|
92
|
+
raise ArgumentError, "not a string or IO object"
|
93
|
+
end
|
94
|
+
|
95
|
+
do_parse(string_or_io, url, encoding, options)
|
96
|
+
end
|
97
|
+
|
98
|
+
# Create a new document from an IO object.
|
99
|
+
#
|
100
|
+
# 💡 Most users should prefer Document.parse to this method.
|
101
|
+
def read_io(io, url = nil, encoding = nil, **options)
|
102
|
+
raise ArgumentError, "io object doesn't respond to :read" unless io.respond_to?(:read)
|
103
|
+
|
104
|
+
do_parse(io, url, encoding, options)
|
105
|
+
end
|
106
|
+
|
107
|
+
# Create a new document from a String.
|
108
|
+
#
|
109
|
+
# 💡 Most users should prefer Document.parse to this method.
|
110
|
+
def read_memory(string, url = nil, encoding = nil, **options)
|
111
|
+
raise ArgumentError, "string object doesn't respond to :to_str" unless string.respond_to?(:to_str)
|
112
|
+
|
113
|
+
do_parse(string, url, encoding, options)
|
114
|
+
end
|
115
|
+
|
116
|
+
private
|
117
|
+
|
118
|
+
def do_parse(string_or_io, url, encoding, options)
|
119
|
+
string = HTML5.read_and_encode(string_or_io, encoding)
|
120
|
+
max_attributes = options[:max_attributes] || Nokogiri::Gumbo::DEFAULT_MAX_ATTRIBUTES
|
121
|
+
max_errors = options[:max_errors] || options[:max_parse_errors] || Nokogiri::Gumbo::DEFAULT_MAX_ERRORS
|
122
|
+
max_depth = options[:max_tree_depth] || Nokogiri::Gumbo::DEFAULT_MAX_TREE_DEPTH
|
123
|
+
doc = Nokogiri::Gumbo.parse(string, url, max_attributes, max_errors, max_depth, self)
|
124
|
+
doc.encoding = "UTF-8"
|
125
|
+
doc
|
126
|
+
end
|
127
|
+
end
|
128
|
+
|
129
|
+
def initialize(*args) # :nodoc:
|
130
|
+
super
|
131
|
+
@url = nil
|
132
|
+
@quirks_mode = nil
|
133
|
+
end
|
134
|
+
|
135
|
+
# :call-seq:
|
136
|
+
# fragment() → Nokogiri::HTML5::DocumentFragment
|
137
|
+
# fragment(markup) → Nokogiri::HTML5::DocumentFragment
|
138
|
+
#
|
139
|
+
# Parse an HTML5 document fragment from +markup+, returning a Nokogiri::HTML5::DocumentFragment.
|
140
|
+
#
|
141
|
+
# [Parameters]
|
142
|
+
# - +markup+ (String) The HTML5 markup fragment to be parsed
|
143
|
+
#
|
144
|
+
# [Returns]
|
145
|
+
# Nokogiri::HTML5::DocumentFragment. This object's children will be empty if `markup` is not passed, is empty, or is `nil`.
|
146
|
+
#
|
147
|
+
def fragment(markup = nil)
|
148
|
+
DocumentFragment.new(self, markup)
|
149
|
+
end
|
150
|
+
|
151
|
+
def to_xml(options = {}, &block) # :nodoc:
|
152
|
+
# Bypass XML::Document#to_xml which doesn't add
|
153
|
+
# XML::Node::SaveOptions::AS_XML like XML::Node#to_xml does.
|
154
|
+
XML::Node.instance_method(:to_xml).bind_call(self, options, &block)
|
155
|
+
end
|
156
|
+
|
157
|
+
# :call-seq:
|
158
|
+
# xpath_doctype() → Nokogiri::CSS::XPathVisitor::DoctypeConfig
|
159
|
+
#
|
160
|
+
# [Returns] The document type which determines CSS-to-XPath translation.
|
161
|
+
#
|
162
|
+
# See CSS::XPathVisitor for more information.
|
163
|
+
def xpath_doctype
|
164
|
+
Nokogiri::CSS::XPathVisitor::DoctypeConfig::HTML5
|
165
|
+
end
|
166
|
+
end
|
167
|
+
end
|
168
|
+
end
|
@@ -0,0 +1,90 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
# frozen_string_literal: true
|
3
|
+
|
4
|
+
#
|
5
|
+
# Copyright 2013-2021 Sam Ruby, Stephen Checkoway
|
6
|
+
#
|
7
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
8
|
+
# you may not use this file except in compliance with the License.
|
9
|
+
# You may obtain a copy of the License at
|
10
|
+
#
|
11
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
12
|
+
#
|
13
|
+
# Unless required by applicable law or agreed to in writing, software
|
14
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
15
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
16
|
+
# See the License for the specific language governing permissions and
|
17
|
+
# limitations under the License.
|
18
|
+
#
|
19
|
+
|
20
|
+
require_relative "../html4/document_fragment"
|
21
|
+
|
22
|
+
module Nokogiri
|
23
|
+
module HTML5
|
24
|
+
# Since v1.12.0
|
25
|
+
#
|
26
|
+
# 💡 HTML5 functionality is not available when running JRuby.
|
27
|
+
class DocumentFragment < Nokogiri::HTML4::DocumentFragment
|
28
|
+
attr_accessor :document
|
29
|
+
attr_accessor :errors
|
30
|
+
|
31
|
+
# Get the parser's quirks mode value. See HTML5::QuirksMode.
|
32
|
+
#
|
33
|
+
# This method returns `nil` if the parser was not invoked (e.g., `Nokogiri::HTML5::DocumentFragment.new(doc)`).
|
34
|
+
#
|
35
|
+
# Since v1.14.0
|
36
|
+
attr_reader :quirks_mode
|
37
|
+
|
38
|
+
# Create a document fragment.
|
39
|
+
def initialize(doc, tags = nil, ctx = nil, options = {}) # rubocop:disable Lint/MissingSuper
|
40
|
+
self.document = doc
|
41
|
+
self.errors = []
|
42
|
+
return self unless tags
|
43
|
+
|
44
|
+
max_attributes = options[:max_attributes] || Nokogiri::Gumbo::DEFAULT_MAX_ATTRIBUTES
|
45
|
+
max_errors = options[:max_errors] || Nokogiri::Gumbo::DEFAULT_MAX_ERRORS
|
46
|
+
max_depth = options[:max_tree_depth] || Nokogiri::Gumbo::DEFAULT_MAX_TREE_DEPTH
|
47
|
+
tags = Nokogiri::HTML5.read_and_encode(tags, nil)
|
48
|
+
Nokogiri::Gumbo.fragment(self, tags, ctx, max_attributes, max_errors, max_depth)
|
49
|
+
end
|
50
|
+
|
51
|
+
def serialize(options = {}, &block) # :nodoc:
|
52
|
+
# Bypass XML::Document.serialize which doesn't support options even
|
53
|
+
# though XML::Node.serialize does!
|
54
|
+
XML::Node.instance_method(:serialize).bind_call(self, options, &block)
|
55
|
+
end
|
56
|
+
|
57
|
+
# Parse a document fragment from +tags+, returning a Nokogiri::HTML5::DocumentFragment.
|
58
|
+
def self.parse(tags, encoding = nil, options = {})
|
59
|
+
doc = HTML5::Document.new
|
60
|
+
tags = HTML5.read_and_encode(tags, encoding)
|
61
|
+
doc.encoding = "UTF-8"
|
62
|
+
new(doc, tags, nil, options)
|
63
|
+
end
|
64
|
+
|
65
|
+
def extract_params(params) # :nodoc:
|
66
|
+
handler = params.find do |param|
|
67
|
+
![Hash, String, Symbol].include?(param.class)
|
68
|
+
end
|
69
|
+
params -= [handler] if handler
|
70
|
+
|
71
|
+
hashes = []
|
72
|
+
while Hash === params.last || params.last.nil?
|
73
|
+
hashes << params.pop
|
74
|
+
break if params.empty?
|
75
|
+
end
|
76
|
+
ns, binds = hashes.reverse
|
77
|
+
|
78
|
+
ns ||=
|
79
|
+
begin
|
80
|
+
ns = {}
|
81
|
+
children.each { |child| ns.merge!(child.namespaces) }
|
82
|
+
ns
|
83
|
+
end
|
84
|
+
|
85
|
+
[params, handler, ns, binds]
|
86
|
+
end
|
87
|
+
end
|
88
|
+
end
|
89
|
+
end
|
90
|
+
# vim: set shiftwidth=2 softtabstop=2 tabstop=8 expandtab:
|
@@ -0,0 +1,103 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
# frozen_string_literal: true
|
3
|
+
|
4
|
+
#
|
5
|
+
# Copyright 2013-2021 Sam Ruby, Stephen Checkoway
|
6
|
+
#
|
7
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
8
|
+
# you may not use this file except in compliance with the License.
|
9
|
+
# You may obtain a copy of the License at
|
10
|
+
#
|
11
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
12
|
+
#
|
13
|
+
# Unless required by applicable law or agreed to in writing, software
|
14
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
15
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
16
|
+
# See the License for the specific language governing permissions and
|
17
|
+
# limitations under the License.
|
18
|
+
#
|
19
|
+
|
20
|
+
#
|
21
|
+
# TODO: this whole file should go away. maybe make it a decorator?
|
22
|
+
#
|
23
|
+
require_relative "../xml/node"
|
24
|
+
|
25
|
+
module Nokogiri
  module HTML5
    # Since v1.12.0
    #
    # 💡 HTML5 functionality is not available when running JRuby.
    #
    # Overrides that are prepended onto XML::Node. Each method hands off to
    # the XML::Node implementation (via +super+) unless the node's document
    # is an HTML5::Document.
    module Node
      # Returns the concatenated +to_html+ output of every child. A leading
      # newline is emitted when +options[:preserve_newline]+ is set and
      # +prepend_newline?+ is true.
      def inner_html(options = {})
        return super(options) unless document.is_a?(HTML5::Document)

        html = if options[:preserve_newline] && prepend_newline?
          +"\n"
        else
          +""
        end
        html << children.map { |node| node.to_html(options) }.join
      end

      # Writes this node to +io+.
      #
      # Recognised option keys: :encoding, :save_with, :indent, :indent_text
      # and :preserve_newline. If a block is given it receives the
      # XML::Node::SaveOptions object before serialization.
      def write_to(io, *options)
        return super(io, *options) unless document.is_a?(HTML5::Document)

        opts = options.first.is_a?(Hash) ? options.shift : {}
        # NOTE(review): +opts+ is always a Hash here, so opts[0] / opts[1]
        # look up the literal keys 0 and 1, not positional arguments.
        encoding = opts[:encoding] || opts[0]
        if Nokogiri.jruby?
          save_options = opts[:save_with] || opts[1]
          indent_times = opts[:indent] || 0
        else
          save_options = opts[:save_with] || opts[1] || XML::Node::SaveOptions::FORMAT
          indent_times = opts[:indent] || 2
        end
        indent_string = (opts[:indent_text] || " ") * indent_times

        config = XML::Node::SaveOptions.new(save_options.to_i)
        yield config if block_given?

        # Encoding objects are reduced to their name; anything else is kept.
        encoding = encoding.name if encoding.is_a?(Encoding)

        flags = config.options
        xml_flags = XML::Node::SaveOptions::AS_XML | XML::Node::SaveOptions::AS_XHTML
        if (flags & xml_flags).zero?
          # Serialize including the current node.
          html = html_standard_serialize(opts[:preserve_newline] || false)
          encoding ||= document.encoding || Encoding::UTF_8
          # Characters the target encoding cannot represent become
          # hexadecimal character references.
          io << html.encode(encoding, fallback: ->(char) { "&#x#{char.ord.to_s(16)};" })
        else
          # XML / XHTML output was requested: use Nokogiri's serializing code.
          native_write_to(io, encoding, indent_string, flags)
        end
      end

      # Builds a DocumentFragment from +tags+ in this node's document, with
      # this node passed as the third constructor argument.
      def fragment(tags)
        if document.is_a?(HTML5::Document)
          DocumentFragment.new(document, tags, self)
        else
          super(tags)
        end
      end

      private

      # HTML elements can have attributes that contain colons.
      # Nokogiri::XML::Node#[]= treats names with colons as a prefixed QName
      # and tries to create an attribute in a namespace. This is especially
      # annoying with attribute names like xml:lang since libxml2 will
      # actually create the xml namespace if it doesn't exist already.
      #
      # For HTML5 documents the node is therefore only added as a child; no
      # attribute reparenting is performed.
      def add_child_node_and_reparent_attrs(node)
        if document.is_a?(HTML5::Document)
          # The original author was unsure what the reparenting step is
          # meant to do: libxml2 already reparents namespaces, including
          # child namespaces that this method would not handle.
          # https://github.com/sparklemotion/nokogiri/issues/1790
          #
          # Previously considered (left disabled):
          #   node.attribute_nodes.find_all { |a| a.namespace }.each do |attr|
          #     attr.remove
          #     ns = attr.namespace
          #     a["#{ns.prefix}:#{attr.name}"] = attr.value
          #   end
          add_child_node(node)
        else
          super(node)
        end
      end
    end
    # Monkey patch
    XML::Node.prepend(HTML5::Node)
  end
end
|
102
|
+
|
103
|
+
# vim: set shiftwidth=2 softtabstop=2 tabstop=8 expandtab:
|