nokogiri 1.10.3 → 1.12.5
Sign up to get free protection for your applications and to get access to all the features.
Potentially problematic release.
This version of nokogiri might be problematic. Click here for more details.
- checksums.yaml +4 -4
- data/Gemfile +3 -0
- data/LICENSE-DEPENDENCIES.md +1173 -884
- data/LICENSE.md +1 -1
- data/README.md +176 -96
- data/dependencies.yml +28 -26
- data/ext/nokogiri/depend +38 -358
- data/ext/nokogiri/extconf.rb +716 -414
- data/ext/nokogiri/gumbo.c +584 -0
- data/ext/nokogiri/html4_document.c +166 -0
- data/ext/nokogiri/html4_element_description.c +294 -0
- data/ext/nokogiri/html4_entity_lookup.c +37 -0
- data/ext/nokogiri/html4_sax_parser_context.c +120 -0
- data/ext/nokogiri/html4_sax_push_parser.c +95 -0
- data/ext/nokogiri/libxml2_backwards_compat.c +121 -0
- data/ext/nokogiri/nokogiri.c +228 -91
- data/ext/nokogiri/nokogiri.h +191 -89
- data/ext/nokogiri/test_global_handlers.c +40 -0
- data/ext/nokogiri/xml_attr.c +15 -15
- data/ext/nokogiri/xml_attribute_decl.c +18 -18
- data/ext/nokogiri/xml_cdata.c +13 -18
- data/ext/nokogiri/xml_comment.c +19 -26
- data/ext/nokogiri/xml_document.c +267 -195
- data/ext/nokogiri/xml_document_fragment.c +13 -15
- data/ext/nokogiri/xml_dtd.c +54 -48
- data/ext/nokogiri/xml_element_content.c +31 -26
- data/ext/nokogiri/xml_element_decl.c +22 -22
- data/ext/nokogiri/xml_encoding_handler.c +28 -17
- data/ext/nokogiri/xml_entity_decl.c +32 -30
- data/ext/nokogiri/xml_entity_reference.c +16 -18
- data/ext/nokogiri/xml_namespace.c +60 -51
- data/ext/nokogiri/xml_node.c +493 -407
- data/ext/nokogiri/xml_node_set.c +174 -162
- data/ext/nokogiri/xml_processing_instruction.c +17 -19
- data/ext/nokogiri/xml_reader.c +197 -172
- data/ext/nokogiri/xml_relax_ng.c +52 -28
- data/ext/nokogiri/xml_sax_parser.c +112 -112
- data/ext/nokogiri/xml_sax_parser_context.c +105 -86
- data/ext/nokogiri/xml_sax_push_parser.c +36 -27
- data/ext/nokogiri/xml_schema.c +112 -33
- data/ext/nokogiri/xml_syntax_error.c +42 -21
- data/ext/nokogiri/xml_text.c +13 -17
- data/ext/nokogiri/xml_xpath_context.c +158 -73
- data/ext/nokogiri/xslt_stylesheet.c +158 -164
- data/gumbo-parser/CHANGES.md +63 -0
- data/gumbo-parser/Makefile +101 -0
- data/gumbo-parser/THANKS +27 -0
- data/gumbo-parser/src/Makefile +34 -0
- data/gumbo-parser/src/README.md +41 -0
- data/gumbo-parser/src/ascii.c +75 -0
- data/gumbo-parser/src/ascii.h +115 -0
- data/gumbo-parser/src/attribute.c +42 -0
- data/gumbo-parser/src/attribute.h +17 -0
- data/gumbo-parser/src/char_ref.c +22225 -0
- data/gumbo-parser/src/char_ref.h +29 -0
- data/gumbo-parser/src/char_ref.rl +2154 -0
- data/gumbo-parser/src/error.c +626 -0
- data/gumbo-parser/src/error.h +148 -0
- data/gumbo-parser/src/foreign_attrs.c +104 -0
- data/gumbo-parser/src/foreign_attrs.gperf +27 -0
- data/gumbo-parser/src/gumbo.h +943 -0
- data/gumbo-parser/src/insertion_mode.h +33 -0
- data/gumbo-parser/src/macros.h +91 -0
- data/gumbo-parser/src/parser.c +4886 -0
- data/gumbo-parser/src/parser.h +41 -0
- data/gumbo-parser/src/replacement.h +33 -0
- data/gumbo-parser/src/string_buffer.c +103 -0
- data/gumbo-parser/src/string_buffer.h +68 -0
- data/gumbo-parser/src/string_piece.c +48 -0
- data/gumbo-parser/src/svg_attrs.c +174 -0
- data/gumbo-parser/src/svg_attrs.gperf +77 -0
- data/gumbo-parser/src/svg_tags.c +137 -0
- data/gumbo-parser/src/svg_tags.gperf +55 -0
- data/gumbo-parser/src/tag.c +222 -0
- data/gumbo-parser/src/tag_lookup.c +382 -0
- data/gumbo-parser/src/tag_lookup.gperf +169 -0
- data/gumbo-parser/src/tag_lookup.h +13 -0
- data/gumbo-parser/src/token_buffer.c +79 -0
- data/gumbo-parser/src/token_buffer.h +71 -0
- data/gumbo-parser/src/token_type.h +17 -0
- data/gumbo-parser/src/tokenizer.c +3463 -0
- data/gumbo-parser/src/tokenizer.h +112 -0
- data/gumbo-parser/src/tokenizer_states.h +339 -0
- data/gumbo-parser/src/utf8.c +245 -0
- data/gumbo-parser/src/utf8.h +164 -0
- data/gumbo-parser/src/util.c +68 -0
- data/gumbo-parser/src/util.h +30 -0
- data/gumbo-parser/src/vector.c +111 -0
- data/gumbo-parser/src/vector.h +45 -0
- data/lib/nokogiri/css/node.rb +1 -0
- data/lib/nokogiri/css/parser.rb +64 -63
- data/lib/nokogiri/css/parser.y +3 -3
- data/lib/nokogiri/css/parser_extras.rb +39 -36
- data/lib/nokogiri/css/syntax_error.rb +2 -1
- data/lib/nokogiri/css/tokenizer.rb +105 -103
- data/lib/nokogiri/css/xpath_visitor.rb +73 -43
- data/lib/nokogiri/css.rb +15 -14
- data/lib/nokogiri/decorators/slop.rb +1 -0
- data/lib/nokogiri/extension.rb +31 -0
- data/lib/nokogiri/gumbo.rb +14 -0
- data/lib/nokogiri/html.rb +32 -27
- data/lib/nokogiri/{html → html4}/builder.rb +3 -2
- data/lib/nokogiri/{html → html4}/document.rb +17 -30
- data/lib/nokogiri/{html → html4}/document_fragment.rb +18 -17
- data/lib/nokogiri/{html → html4}/element_description.rb +2 -1
- data/lib/nokogiri/{html → html4}/element_description_defaults.rb +2 -1
- data/lib/nokogiri/{html → html4}/entity_lookup.rb +2 -1
- data/lib/nokogiri/{html → html4}/sax/parser.rb +12 -14
- data/lib/nokogiri/html4/sax/parser_context.rb +19 -0
- data/lib/nokogiri/{html → html4}/sax/push_parser.rb +6 -5
- data/lib/nokogiri/html4.rb +40 -0
- data/lib/nokogiri/html5/document.rb +74 -0
- data/lib/nokogiri/html5/document_fragment.rb +80 -0
- data/lib/nokogiri/html5/node.rb +93 -0
- data/lib/nokogiri/html5.rb +473 -0
- data/lib/nokogiri/jruby/dependencies.rb +20 -0
- data/lib/nokogiri/syntax_error.rb +1 -0
- data/lib/nokogiri/version/constant.rb +5 -0
- data/lib/nokogiri/version/info.rb +215 -0
- data/lib/nokogiri/version.rb +3 -109
- data/lib/nokogiri/xml/attr.rb +1 -0
- data/lib/nokogiri/xml/attribute_decl.rb +1 -0
- data/lib/nokogiri/xml/builder.rb +74 -32
- data/lib/nokogiri/xml/cdata.rb +1 -0
- data/lib/nokogiri/xml/character_data.rb +1 -0
- data/lib/nokogiri/xml/document.rb +138 -41
- data/lib/nokogiri/xml/document_fragment.rb +5 -6
- data/lib/nokogiri/xml/dtd.rb +1 -0
- data/lib/nokogiri/xml/element_content.rb +1 -0
- data/lib/nokogiri/xml/element_decl.rb +1 -0
- data/lib/nokogiri/xml/entity_decl.rb +1 -0
- data/lib/nokogiri/xml/entity_reference.rb +1 -0
- data/lib/nokogiri/xml/namespace.rb +1 -0
- data/lib/nokogiri/xml/node/save_options.rb +2 -1
- data/lib/nokogiri/xml/node.rb +629 -293
- data/lib/nokogiri/xml/node_set.rb +1 -0
- data/lib/nokogiri/xml/notation.rb +1 -0
- data/lib/nokogiri/xml/parse_options.rb +12 -3
- data/lib/nokogiri/xml/pp/character_data.rb +1 -0
- data/lib/nokogiri/xml/pp/node.rb +1 -0
- data/lib/nokogiri/xml/pp.rb +3 -2
- data/lib/nokogiri/xml/processing_instruction.rb +1 -0
- data/lib/nokogiri/xml/reader.rb +9 -12
- data/lib/nokogiri/xml/relax_ng.rb +7 -2
- data/lib/nokogiri/xml/sax/document.rb +25 -30
- data/lib/nokogiri/xml/sax/parser.rb +1 -0
- data/lib/nokogiri/xml/sax/parser_context.rb +1 -0
- data/lib/nokogiri/xml/sax/push_parser.rb +1 -0
- data/lib/nokogiri/xml/sax.rb +5 -4
- data/lib/nokogiri/xml/schema.rb +13 -4
- data/lib/nokogiri/xml/searchable.rb +25 -16
- data/lib/nokogiri/xml/syntax_error.rb +1 -0
- data/lib/nokogiri/xml/text.rb +1 -0
- data/lib/nokogiri/xml/xpath/syntax_error.rb +2 -1
- data/lib/nokogiri/xml/xpath.rb +4 -5
- data/lib/nokogiri/xml/xpath_context.rb +1 -0
- data/lib/nokogiri/xml.rb +36 -36
- data/lib/nokogiri/xslt/stylesheet.rb +2 -1
- data/lib/nokogiri/xslt.rb +17 -16
- data/lib/nokogiri.rb +32 -51
- data/lib/xsd/xmlparser/nokogiri.rb +1 -0
- data/patches/libxml2/{0002-Remove-script-macro-support.patch → 0001-Remove-script-macro-support.patch} +0 -0
- data/patches/libxml2/{0003-Update-entities-to-remove-handling-of-ssi.patch → 0002-Update-entities-to-remove-handling-of-ssi.patch} +0 -0
- data/patches/libxml2/0003-libxml2.la-is-in-top_builddir.patch +25 -0
- data/patches/libxml2/0004-use-glibc-strlen.patch +53 -0
- data/patches/libxml2/0005-avoid-isnan-isinf.patch +81 -0
- data/patches/libxml2/0006-update-automake-files-for-arm64.patch +2511 -0
- data/patches/libxml2/0007-Fix-XPath-recursion-limit.patch +31 -0
- data/patches/libxslt/0001-update-automake-files-for-arm64.patch +2511 -0
- data/patches/libxslt/0002-Fix-xml2-config-check-in-configure-script.patch +19 -0
- data/ports/archives/libxml2-2.9.12.tar.gz +0 -0
- data/ports/archives/libxslt-1.1.34.tar.gz +0 -0
- metadata +151 -153
- data/ext/nokogiri/html_document.c +0 -170
- data/ext/nokogiri/html_document.h +0 -10
- data/ext/nokogiri/html_element_description.c +0 -279
- data/ext/nokogiri/html_element_description.h +0 -10
- data/ext/nokogiri/html_entity_lookup.c +0 -32
- data/ext/nokogiri/html_entity_lookup.h +0 -8
- data/ext/nokogiri/html_sax_parser_context.c +0 -116
- data/ext/nokogiri/html_sax_parser_context.h +0 -11
- data/ext/nokogiri/html_sax_push_parser.c +0 -87
- data/ext/nokogiri/html_sax_push_parser.h +0 -9
- data/ext/nokogiri/xml_attr.h +0 -9
- data/ext/nokogiri/xml_attribute_decl.h +0 -9
- data/ext/nokogiri/xml_cdata.h +0 -9
- data/ext/nokogiri/xml_comment.h +0 -9
- data/ext/nokogiri/xml_document.h +0 -23
- data/ext/nokogiri/xml_document_fragment.h +0 -10
- data/ext/nokogiri/xml_dtd.h +0 -10
- data/ext/nokogiri/xml_element_content.h +0 -10
- data/ext/nokogiri/xml_element_decl.h +0 -9
- data/ext/nokogiri/xml_encoding_handler.h +0 -8
- data/ext/nokogiri/xml_entity_decl.h +0 -10
- data/ext/nokogiri/xml_entity_reference.h +0 -9
- data/ext/nokogiri/xml_io.c +0 -61
- data/ext/nokogiri/xml_io.h +0 -11
- data/ext/nokogiri/xml_libxml2_hacks.c +0 -112
- data/ext/nokogiri/xml_libxml2_hacks.h +0 -12
- data/ext/nokogiri/xml_namespace.h +0 -14
- data/ext/nokogiri/xml_node.h +0 -13
- data/ext/nokogiri/xml_node_set.h +0 -12
- data/ext/nokogiri/xml_processing_instruction.h +0 -9
- data/ext/nokogiri/xml_reader.h +0 -10
- data/ext/nokogiri/xml_relax_ng.h +0 -9
- data/ext/nokogiri/xml_sax_parser.h +0 -39
- data/ext/nokogiri/xml_sax_parser_context.h +0 -10
- data/ext/nokogiri/xml_sax_push_parser.h +0 -9
- data/ext/nokogiri/xml_schema.h +0 -9
- data/ext/nokogiri/xml_syntax_error.h +0 -13
- data/ext/nokogiri/xml_text.h +0 -9
- data/ext/nokogiri/xml_xpath_context.h +0 -10
- data/ext/nokogiri/xslt_stylesheet.h +0 -14
- data/lib/nokogiri/html/sax/parser_context.rb +0 -16
- data/patches/libxml2/0001-Revert-Do-not-URI-escape-in-server-side-includes.patch +0 -78
- data/patches/libxslt/0001-Fix-security-framework-bypass.patch +0 -120
- data/ports/archives/libxml2-2.9.9.tar.gz +0 -0
- data/ports/archives/libxslt-1.1.33.tar.gz +0 -0
@@ -1,5 +1,6 @@
|
|
1
|
+
# frozen_string_literal: true
|
1
2
|
module Nokogiri
|
2
|
-
module HTML
|
3
|
+
module HTML4
|
3
4
|
###
|
4
5
|
# Nokogiri HTML builder is used for building HTML documents. It is very
|
5
6
|
# similar to the Nokogiri::XML::Builder. In fact, you should go read the
|
@@ -11,7 +12,7 @@ module Nokogiri
|
|
11
12
|
# Create an HTML document with a body that has an onload attribute, and a
|
12
13
|
# span tag with a class of "bold" that has content of "Hello world".
|
13
14
|
#
|
14
|
-
# builder = Nokogiri::HTML::Builder.new do |doc|
|
15
|
+
# builder = Nokogiri::HTML4::Builder.new do |doc|
|
15
16
|
# doc.html {
|
16
17
|
# doc.body(:onload => 'some_func();') {
|
17
18
|
# doc.span.bold {
|
@@ -1,5 +1,9 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'pathname'
|
4
|
+
|
1
5
|
module Nokogiri
|
2
|
-
module HTML
|
6
|
+
module HTML4
|
3
7
|
class Document < Nokogiri::XML::Document
|
4
8
|
###
|
5
9
|
# Get the meta tag encoding for this document. If there is no meta tag,
|
@@ -160,11 +164,12 @@ module Nokogiri
|
|
160
164
|
# Nokogiri::XML::ParseOptions::RECOVER. See the constants in
|
161
165
|
# Nokogiri::XML::ParseOptions.
|
162
166
|
def parse string_or_io, url = nil, encoding = nil, options = XML::ParseOptions::DEFAULT_HTML
|
163
|
-
|
164
167
|
options = Nokogiri::XML::ParseOptions.new(options) if Integer === options
|
165
|
-
|
168
|
+
|
166
169
|
yield options if block_given?
|
167
170
|
|
171
|
+
url ||= string_or_io.respond_to?(:path) ? string_or_io.path : nil
|
172
|
+
|
168
173
|
if string_or_io.respond_to?(:encoding)
|
169
174
|
unless string_or_io.encoding.name == "ASCII-8BIT"
|
170
175
|
encoding ||= string_or_io.encoding.name
|
@@ -172,7 +177,12 @@ module Nokogiri
|
|
172
177
|
end
|
173
178
|
|
174
179
|
if string_or_io.respond_to?(:read)
|
175
|
-
|
180
|
+
if string_or_io.is_a?(Pathname)
|
181
|
+
# resolve the Pathname to the file and open it as an IO object, see #2110
|
182
|
+
string_or_io = string_or_io.expand_path.open
|
183
|
+
url ||= string_or_io.path
|
184
|
+
end
|
185
|
+
|
176
186
|
unless encoding
|
177
187
|
# Libxml2's parser has poor support for encoding
|
178
188
|
# detection. First, it does not recognize the HTML5
|
@@ -251,9 +261,6 @@ module Nokogiri
|
|
251
261
|
end
|
252
262
|
|
253
263
|
def self.detect_encoding(chunk)
|
254
|
-
if Nokogiri.jruby? && EncodingReader.is_jruby_without_fix?
|
255
|
-
return EncodingReader.detect_encoding_for_jruby_without_fix(chunk)
|
256
|
-
end
|
257
264
|
m = chunk.match(/\A(<\?xml[ \t\r\n]+[^>]*>)/) and
|
258
265
|
return Nokogiri.XML(m[1]).encoding
|
259
266
|
|
@@ -261,37 +268,17 @@ module Nokogiri
|
|
261
268
|
m = chunk.match(/(<meta\s)(.*)(charset\s*=\s*([\w-]+))(.*)/i) and
|
262
269
|
return m[4]
|
263
270
|
catch(:encoding_found) {
|
264
|
-
Nokogiri::HTML::SAX::Parser.new(JumpSAXHandler.new(:encoding_found)).parse(chunk)
|
271
|
+
Nokogiri::HTML4::SAX::Parser.new(JumpSAXHandler.new(:encoding_found)).parse(chunk)
|
265
272
|
nil
|
266
273
|
}
|
267
274
|
else
|
268
275
|
handler = SAXHandler.new
|
269
|
-
parser = Nokogiri::HTML::SAX::PushParser.new(handler)
|
276
|
+
parser = Nokogiri::HTML4::SAX::PushParser.new(handler)
|
270
277
|
parser << chunk rescue Nokogiri::SyntaxError
|
271
278
|
handler.encoding
|
272
279
|
end
|
273
280
|
end
|
274
281
|
|
275
|
-
def self.is_jruby_without_fix?
|
276
|
-
JRUBY_VERSION.split('.').join.to_i < 165
|
277
|
-
end
|
278
|
-
|
279
|
-
def self.detect_encoding_for_jruby_without_fix(chunk)
|
280
|
-
m = chunk.match(/\A(<\?xml[ \t\r\n]+[^>]*>)/) and
|
281
|
-
return Nokogiri.XML(m[1]).encoding
|
282
|
-
|
283
|
-
m = chunk.match(/(<meta\s)(.*)(charset\s*=\s*([\w-]+))(.*)/i) and
|
284
|
-
return m[4]
|
285
|
-
|
286
|
-
catch(:encoding_found) {
|
287
|
-
Nokogiri::HTML::SAX::Parser.new(JumpSAXHandler.new(:encoding_found.to_s)).parse(chunk)
|
288
|
-
nil
|
289
|
-
}
|
290
|
-
rescue Nokogiri::SyntaxError, RuntimeError
|
291
|
-
# Ignore parser errors that nokogiri may raise
|
292
|
-
nil
|
293
|
-
end
|
294
|
-
|
295
282
|
def initialize(io)
|
296
283
|
@io = io
|
297
284
|
@firstchunk = nil
|
@@ -299,7 +286,7 @@ module Nokogiri
|
|
299
286
|
end
|
300
287
|
|
301
288
|
# This method is used by the C extension so that
|
302
|
-
# Nokogiri::HTML::Document#read_io() does not leak memory when
|
289
|
+
# Nokogiri::HTML4::Document#read_io() does not leak memory when
|
303
290
|
# EncodingFound is raised.
|
304
291
|
attr_reader :encoding_found
|
305
292
|
|
@@ -1,28 +1,29 @@
|
|
1
|
+
# frozen_string_literal: true
|
1
2
|
module Nokogiri
|
2
|
-
module HTML
|
3
|
+
module HTML4
|
3
4
|
class DocumentFragment < Nokogiri::XML::DocumentFragment
|
4
5
|
####
|
5
6
|
# Create a Nokogiri::XML::DocumentFragment from +tags+, using +encoding+
|
6
|
-
def self.parse tags, encoding = nil
|
7
|
-
doc = HTML::Document.new
|
7
|
+
def self.parse(tags, encoding = nil)
|
8
|
+
doc = HTML4::Document.new
|
8
9
|
|
9
10
|
encoding ||= if tags.respond_to?(:encoding)
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
11
|
+
encoding = tags.encoding
|
12
|
+
if encoding == ::Encoding::ASCII_8BIT
|
13
|
+
'UTF-8'
|
14
|
+
else
|
15
|
+
encoding.name
|
16
|
+
end
|
17
|
+
else
|
18
|
+
'UTF-8'
|
19
|
+
end
|
19
20
|
|
20
21
|
doc.encoding = encoding
|
21
22
|
|
22
23
|
new(doc, tags)
|
23
24
|
end
|
24
25
|
|
25
|
-
def initialize document, tags = nil, ctx = nil
|
26
|
+
def initialize(document, tags = nil, ctx = nil)
|
26
27
|
return self unless tags
|
27
28
|
|
28
29
|
if ctx
|
@@ -32,13 +33,13 @@ module Nokogiri
|
|
32
33
|
self.errors = document.errors - preexisting_errors
|
33
34
|
else
|
34
35
|
# This is a horrible hack, but I don't care
|
35
|
-
|
36
|
-
|
36
|
+
path = if /^\s*?<body/i.match?(tags)
|
37
|
+
"/html/body"
|
37
38
|
else
|
38
|
-
|
39
|
+
"/html/body/node()"
|
39
40
|
end
|
40
41
|
|
41
|
-
temp_doc = HTML::Document.parse "<html><body>#{tags}", nil, document.encoding
|
42
|
+
temp_doc = HTML4::Document.parse("<html><body>#{tags}", nil, document.encoding)
|
42
43
|
temp_doc.xpath(path).each { |child| child.parent = self }
|
43
44
|
self.errors = temp_doc.errors
|
44
45
|
end
|
@@ -1,17 +1,15 @@
|
|
1
|
+
# frozen_string_literal: true
|
1
2
|
module Nokogiri
|
2
|
-
module HTML
|
3
|
+
module HTML4
|
3
4
|
###
|
4
|
-
# Nokogiri lets you write a SAX parser to process HTML but get HTML
|
5
|
-
# correction features.
|
5
|
+
# Nokogiri lets you write a SAX parser to process HTML but get HTML correction features.
|
6
6
|
#
|
7
|
-
# See Nokogiri::HTML::SAX::Parser for a basic example of using a
|
8
|
-
# SAX parser with HTML.
|
7
|
+
# See Nokogiri::HTML4::SAX::Parser for a basic example of using a SAX parser with HTML.
|
9
8
|
#
|
10
9
|
# For more information on SAX parsers, see Nokogiri::XML::SAX
|
11
10
|
module SAX
|
12
11
|
###
|
13
|
-
# This class lets you perform SAX style parsing on HTML with HTML
|
14
|
-
# error correction.
|
12
|
+
# This class lets you perform SAX style parsing on HTML with HTML error correction.
|
15
13
|
#
|
16
14
|
# Here is a basic usage example:
|
17
15
|
#
|
@@ -21,40 +19,40 @@ module Nokogiri
|
|
21
19
|
# end
|
22
20
|
# end
|
23
21
|
#
|
24
|
-
# parser = Nokogiri::HTML::SAX::Parser.new(MyDoc.new)
|
22
|
+
# parser = Nokogiri::HTML4::SAX::Parser.new(MyDoc.new)
|
25
23
|
# parser.parse(File.read(ARGV[0], mode: 'rb'))
|
26
24
|
#
|
27
25
|
# For more information on SAX parsers, see Nokogiri::XML::SAX
|
28
26
|
class Parser < Nokogiri::XML::SAX::Parser
|
29
27
|
###
|
30
28
|
# Parse html stored in +data+ using +encoding+
|
31
|
-
def parse_memory data, encoding = 'UTF-8'
|
29
|
+
def parse_memory(data, encoding = "UTF-8")
|
32
30
|
raise ArgumentError unless data
|
33
31
|
return unless data.length > 0
|
34
32
|
ctx = ParserContext.memory(data, encoding)
|
35
33
|
yield ctx if block_given?
|
36
|
-
ctx.parse_with self
|
34
|
+
ctx.parse_with(self)
|
37
35
|
end
|
38
36
|
|
39
37
|
###
|
40
38
|
# Parse given +io+
|
41
|
-
def parse_io io, encoding = 'UTF-8'
|
39
|
+
def parse_io(io, encoding = "UTF-8")
|
42
40
|
check_encoding(encoding)
|
43
41
|
@encoding = encoding
|
44
42
|
ctx = ParserContext.io(io, ENCODINGS[encoding])
|
45
43
|
yield ctx if block_given?
|
46
|
-
ctx.parse_with self
|
44
|
+
ctx.parse_with(self)
|
47
45
|
end
|
48
46
|
|
49
47
|
###
|
50
48
|
# Parse a file with +filename+
|
51
|
-
def parse_file filename, encoding = 'UTF-8'
|
49
|
+
def parse_file(filename, encoding = "UTF-8")
|
52
50
|
raise ArgumentError unless filename
|
53
51
|
raise Errno::ENOENT unless File.exist?(filename)
|
54
52
|
raise Errno::EISDIR if File.directory?(filename)
|
55
53
|
ctx = ParserContext.file(filename, encoding)
|
56
54
|
yield ctx if block_given?
|
57
|
-
ctx.parse_with self
|
55
|
+
ctx.parse_with(self)
|
58
56
|
end
|
59
57
|
end
|
60
58
|
end
|
@@ -0,0 +1,19 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
module Nokogiri
|
3
|
+
module HTML4
|
4
|
+
module SAX
|
5
|
+
###
|
6
|
+
# Context for HTML SAX parsers. This class is usually not instantiated by the user. Instead,
|
7
|
+
# you should be looking at Nokogiri::HTML4::SAX::Parser
|
8
|
+
class ParserContext < Nokogiri::XML::SAX::ParserContext
|
9
|
+
def self.new(thing, encoding = "UTF-8")
|
10
|
+
if [:read, :close].all? { |x| thing.respond_to?(x) }
|
11
|
+
super
|
12
|
+
else
|
13
|
+
memory(thing, encoding)
|
14
|
+
end
|
15
|
+
end
|
16
|
+
end
|
17
|
+
end
|
18
|
+
end
|
19
|
+
end
|
@@ -1,16 +1,17 @@
|
|
1
|
+
# frozen_string_literal: true
|
1
2
|
module Nokogiri
|
2
|
-
module HTML
|
3
|
+
module HTML4
|
3
4
|
module SAX
|
4
5
|
class PushParser
|
5
6
|
|
6
|
-
# The Nokogiri::HTML::SAX::Document on which the PushParser will be
|
7
|
+
# The Nokogiri::HTML4::SAX::Document on which the PushParser will be
|
7
8
|
# operating
|
8
9
|
attr_accessor :document
|
9
10
|
|
10
|
-
def initialize(doc = HTML::SAX::Document.new, file_name = nil, encoding = 'UTF-8')
|
11
|
+
def initialize(doc = HTML4::SAX::Document.new, file_name = nil, encoding = 'UTF-8')
|
11
12
|
@document = doc
|
12
13
|
@encoding = encoding
|
13
|
-
@sax_parser = HTML::SAX::Parser.new(doc, @encoding)
|
14
|
+
@sax_parser = HTML4::SAX::Parser.new(doc, @encoding)
|
14
15
|
|
15
16
|
## Create our push parser context
|
16
17
|
initialize_native(@sax_parser, file_name, encoding)
|
@@ -26,7 +27,7 @@ module Nokogiri
|
|
26
27
|
|
27
28
|
###
|
28
29
|
# Finish the parsing. This method is only necessary for
|
29
|
-
# Nokogiri::HTML::SAX::Document#end_document to be called.
|
30
|
+
# Nokogiri::HTML4::SAX::Document#end_document to be called.
|
30
31
|
def finish
|
31
32
|
write '', true
|
32
33
|
end
|
@@ -0,0 +1,40 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
module Nokogiri
|
3
|
+
class << self
|
4
|
+
###
|
5
|
+
# Parse HTML. Convenience method for Nokogiri::HTML4::Document.parse
|
6
|
+
def HTML4(input, url = nil, encoding = nil, options = XML::ParseOptions::DEFAULT_HTML, &block)
|
7
|
+
Nokogiri::HTML4::Document.parse(input, url, encoding, options, &block)
|
8
|
+
end
|
9
|
+
end
|
10
|
+
|
11
|
+
# @since v1.12.0
|
12
|
+
# @note Before v1.12.0, {Nokogiri::HTML4} did not exist, and {Nokogiri::HTML} was the module/namespace for parsing HTML.
|
13
|
+
module HTML4
|
14
|
+
class << self
|
15
|
+
###
|
16
|
+
# Parse HTML. Convenience method for Nokogiri::HTML4::Document.parse
|
17
|
+
def parse(input, url = nil, encoding = nil, options = XML::ParseOptions::DEFAULT_HTML, &block)
|
18
|
+
Document.parse(input, url, encoding, options, &block)
|
19
|
+
end
|
20
|
+
|
21
|
+
####
|
22
|
+
# Parse a fragment from +string+ in to a NodeSet.
|
23
|
+
def fragment(string, encoding = nil)
|
24
|
+
HTML4::DocumentFragment.parse(string, encoding)
|
25
|
+
end
|
26
|
+
end
|
27
|
+
|
28
|
+
# Instance of Nokogiri::HTML4::EntityLookup
|
29
|
+
NamedCharacters = EntityLookup.new
|
30
|
+
end
|
31
|
+
end
|
32
|
+
|
33
|
+
require_relative "html4/entity_lookup"
|
34
|
+
require_relative "html4/document"
|
35
|
+
require_relative "html4/document_fragment"
|
36
|
+
require_relative "html4/sax/parser_context"
|
37
|
+
require_relative "html4/sax/parser"
|
38
|
+
require_relative "html4/sax/push_parser"
|
39
|
+
require_relative "html4/element_description"
|
40
|
+
require_relative "html4/element_description_defaults"
|
@@ -0,0 +1,74 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
#
|
3
|
+
# Copyright 2013-2021 Sam Ruby, Stephen Checkoway
|
4
|
+
#
|
5
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
6
|
+
# you may not use this file except in compliance with the License.
|
7
|
+
# You may obtain a copy of the License at
|
8
|
+
#
|
9
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
10
|
+
#
|
11
|
+
# Unless required by applicable law or agreed to in writing, software
|
12
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
13
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
14
|
+
# See the License for the specific language governing permissions and
|
15
|
+
# limitations under the License.
|
16
|
+
#
|
17
|
+
|
18
|
+
require_relative "../html4/document"
|
19
|
+
|
20
|
+
module Nokogiri
|
21
|
+
module HTML5
|
22
|
+
# @since v1.12.0
|
23
|
+
# @note HTML5 functionality is not available when running JRuby.
|
24
|
+
class Document < Nokogiri::HTML4::Document
|
25
|
+
def self.parse(string_or_io, url = nil, encoding = nil, **options, &block)
|
26
|
+
yield options if block_given?
|
27
|
+
string_or_io = '' unless string_or_io
|
28
|
+
|
29
|
+
if string_or_io.respond_to?(:encoding) && string_or_io.encoding.name != 'ASCII-8BIT'
|
30
|
+
encoding ||= string_or_io.encoding.name
|
31
|
+
end
|
32
|
+
|
33
|
+
if string_or_io.respond_to?(:read) && string_or_io.respond_to?(:path)
|
34
|
+
url ||= string_or_io.path
|
35
|
+
end
|
36
|
+
unless string_or_io.respond_to?(:read) || string_or_io.respond_to?(:to_str)
|
37
|
+
raise ArgumentError.new("not a string or IO object")
|
38
|
+
end
|
39
|
+
do_parse(string_or_io, url, encoding, options)
|
40
|
+
end
|
41
|
+
|
42
|
+
def self.read_io(io, url = nil, encoding = nil, **options)
|
43
|
+
raise ArgumentError.new("io object doesn't respond to :read") unless io.respond_to?(:read)
|
44
|
+
do_parse(io, url, encoding, options)
|
45
|
+
end
|
46
|
+
|
47
|
+
def self.read_memory(string, url = nil, encoding = nil, **options)
|
48
|
+
raise ArgumentError.new("string object doesn't respond to :to_str") unless string.respond_to?(:to_str)
|
49
|
+
do_parse(string, url, encoding, options)
|
50
|
+
end
|
51
|
+
|
52
|
+
def fragment(tags = nil)
|
53
|
+
DocumentFragment.new(self, tags, self.root)
|
54
|
+
end
|
55
|
+
|
56
|
+
def to_xml(options = {}, &block)
|
57
|
+
# Bypass XML::Document#to_xml which doesn't add
|
58
|
+
# XML::Node::SaveOptions::AS_XML like XML::Node#to_xml does.
|
59
|
+
XML::Node.instance_method(:to_xml).bind(self).call(options, &block)
|
60
|
+
end
|
61
|
+
|
62
|
+
private
|
63
|
+
def self.do_parse(string_or_io, url, encoding, options)
|
64
|
+
string = HTML5.read_and_encode(string_or_io, encoding)
|
65
|
+
max_attributes = options[:max_attributes] || Nokogiri::Gumbo::DEFAULT_MAX_ATTRIBUTES
|
66
|
+
max_errors = options[:max_errors] || options[:max_parse_errors] || Nokogiri::Gumbo::DEFAULT_MAX_ERRORS
|
67
|
+
max_depth = options[:max_tree_depth] || Nokogiri::Gumbo::DEFAULT_MAX_TREE_DEPTH
|
68
|
+
doc = Nokogiri::Gumbo.parse(string, url, max_attributes, max_errors, max_depth)
|
69
|
+
doc.encoding = 'UTF-8'
|
70
|
+
doc
|
71
|
+
end
|
72
|
+
end
|
73
|
+
end
|
74
|
+
end
|
@@ -0,0 +1,80 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
#
|
3
|
+
# Copyright 2013-2021 Sam Ruby, Stephen Checkoway
|
4
|
+
#
|
5
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
6
|
+
# you may not use this file except in compliance with the License.
|
7
|
+
# You may obtain a copy of the License at
|
8
|
+
#
|
9
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
10
|
+
#
|
11
|
+
# Unless required by applicable law or agreed to in writing, software
|
12
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
13
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
14
|
+
# See the License for the specific language governing permissions and
|
15
|
+
# limitations under the License.
|
16
|
+
#
|
17
|
+
|
18
|
+
require_relative "../html4/document_fragment"
|
19
|
+
|
20
|
+
module Nokogiri
|
21
|
+
module HTML5
|
22
|
+
# @since v1.12.0
|
23
|
+
# @note HTML5 functionality is not available when running JRuby.
|
24
|
+
class DocumentFragment < Nokogiri::HTML4::DocumentFragment
|
25
|
+
attr_accessor :document
|
26
|
+
attr_accessor :errors
|
27
|
+
|
28
|
+
# Create a document fragment.
|
29
|
+
def initialize(doc, tags = nil, ctx = nil, options = {})
|
30
|
+
self.document = doc
|
31
|
+
self.errors = []
|
32
|
+
return self unless tags
|
33
|
+
|
34
|
+
max_attributes = options[:max_attributes] || Nokogiri::Gumbo::DEFAULT_MAX_ATTRIBUTES
|
35
|
+
max_errors = options[:max_errors] || Nokogiri::Gumbo::DEFAULT_MAX_ERRORS
|
36
|
+
max_depth = options[:max_tree_depth] || Nokogiri::Gumbo::DEFAULT_MAX_TREE_DEPTH
|
37
|
+
tags = Nokogiri::HTML5.read_and_encode(tags, nil)
|
38
|
+
Nokogiri::Gumbo.fragment(self, tags, ctx, max_attributes, max_errors, max_depth)
|
39
|
+
end
|
40
|
+
|
41
|
+
def serialize(options = {}, &block)
|
42
|
+
# Bypass XML::Document.serialize which doesn't support options even
|
43
|
+
# though XML::Node.serialize does!
|
44
|
+
XML::Node.instance_method(:serialize).bind(self).call(options, &block)
|
45
|
+
end
|
46
|
+
|
47
|
+
# Parse a document fragment from +tags+, returning a Nodeset.
|
48
|
+
def self.parse(tags, encoding = nil, options = {})
|
49
|
+
doc = HTML5::Document.new
|
50
|
+
tags = HTML5.read_and_encode(tags, encoding)
|
51
|
+
doc.encoding = "UTF-8"
|
52
|
+
new(doc, tags, nil, options)
|
53
|
+
end
|
54
|
+
|
55
|
+
def extract_params(params) # :nodoc:
|
56
|
+
handler = params.find do |param|
|
57
|
+
![Hash, String, Symbol].include?(param.class)
|
58
|
+
end
|
59
|
+
params -= [handler] if handler
|
60
|
+
|
61
|
+
hashes = []
|
62
|
+
while Hash === params.last || params.last.nil?
|
63
|
+
hashes << params.pop
|
64
|
+
break if params.empty?
|
65
|
+
end
|
66
|
+
ns, binds = hashes.reverse
|
67
|
+
|
68
|
+
ns ||=
|
69
|
+
begin
|
70
|
+
ns = {}
|
71
|
+
children.each { |child| ns.merge!(child.namespaces) }
|
72
|
+
ns
|
73
|
+
end
|
74
|
+
|
75
|
+
[params, handler, ns, binds]
|
76
|
+
end
|
77
|
+
end
|
78
|
+
end
|
79
|
+
end
|
80
|
+
# vim: set shiftwidth=2 softtabstop=2 tabstop=8 expandtab:
|
@@ -0,0 +1,93 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
#
|
3
|
+
# Copyright 2013-2021 Sam Ruby, Stephen Checkoway
|
4
|
+
#
|
5
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
6
|
+
# you may not use this file except in compliance with the License.
|
7
|
+
# You may obtain a copy of the License at
|
8
|
+
#
|
9
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
10
|
+
#
|
11
|
+
# Unless required by applicable law or agreed to in writing, software
|
12
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
13
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
14
|
+
# See the License for the specific language governing permissions and
|
15
|
+
# limitations under the License.
|
16
|
+
#
|
17
|
+
|
18
|
+
require_relative "../xml/node"
|
19
|
+
|
20
|
+
module Nokogiri
|
21
|
+
module HTML5
|
22
|
+
# @since v1.12.0
|
23
|
+
# @note HTML5 functionality is not available when running JRuby.
|
24
|
+
module Node
|
25
|
+
def inner_html(options = {})
|
26
|
+
return super(options) unless document.is_a?(HTML5::Document)
|
27
|
+
result = options[:preserve_newline] && HTML5.prepend_newline?(self) ? String.new("\n") : String.new
|
28
|
+
result << children.map { |child| child.to_html(options) }.join
|
29
|
+
result
|
30
|
+
end
|
31
|
+
|
32
|
+
def write_to(io, *options)
|
33
|
+
return super(io, *options) unless document.is_a?(HTML5::Document)
|
34
|
+
options = options.first.is_a?(Hash) ? options.shift : {}
|
35
|
+
encoding = options[:encoding] || options[0]
|
36
|
+
if Nokogiri.jruby?
|
37
|
+
save_options = options[:save_with] || options[1]
|
38
|
+
indent_times = options[:indent] || 0
|
39
|
+
else
|
40
|
+
save_options = options[:save_with] || options[1] || XML::Node::SaveOptions::FORMAT
|
41
|
+
indent_times = options[:indent] || 2
|
42
|
+
end
|
43
|
+
indent_string = (options[:indent_text] || " ") * indent_times
|
44
|
+
|
45
|
+
config = XML::Node::SaveOptions.new(save_options.to_i)
|
46
|
+
yield config if block_given?
|
47
|
+
|
48
|
+
config_options = config.options
|
49
|
+
if config_options & (XML::Node::SaveOptions::AS_XML | XML::Node::SaveOptions::AS_XHTML) != 0
|
50
|
+
# Use Nokogiri's serializing code.
|
51
|
+
native_write_to(io, encoding, indent_string, config_options)
|
52
|
+
else
|
53
|
+
# Serialize including the current node.
|
54
|
+
encoding ||= document.encoding || Encoding::UTF_8
|
55
|
+
internal_ops = {
|
56
|
+
preserve_newline: options[:preserve_newline] || false,
|
57
|
+
}
|
58
|
+
HTML5.serialize_node_internal(self, io, encoding, internal_ops)
|
59
|
+
end
|
60
|
+
end
|
61
|
+
|
62
|
+
def fragment(tags)
|
63
|
+
return super(tags) unless document.is_a?(HTML5::Document)
|
64
|
+
DocumentFragment.new(document, tags, self)
|
65
|
+
end
|
66
|
+
|
67
|
+
private
|
68
|
+
|
69
|
+
# HTML elements can have attributes that contain colons.
|
70
|
+
# Nokogiri::XML::Node#[]= treats names with colons as a prefixed QName
|
71
|
+
# and tries to create an attribute in a namespace. This is especially
|
72
|
+
# annoying with attribute names like xml:lang since libxml2 will
|
73
|
+
# actually create the xml namespace if it doesn't exist already.
|
74
|
+
def add_child_node_and_reparent_attrs(node)
|
75
|
+
return super(node) unless document.is_a?(HTML5::Document)
|
76
|
+
# I'm not sure what this method is supposed to do. Reparenting
|
77
|
+
# namespaces is handled by libxml2, including child namespaces which
|
78
|
+
# this method wouldn't handle.
|
79
|
+
# https://github.com/sparklemotion/nokogiri/issues/1790
|
80
|
+
add_child_node(node)
|
81
|
+
# node.attribute_nodes.find_all { |a| a.namespace }.each do |attr|
|
82
|
+
# attr.remove
|
83
|
+
# ns = attr.namespace
|
84
|
+
# a["#{ns.prefix}:#{attr.name}"] = attr.value
|
85
|
+
# end
|
86
|
+
end
|
87
|
+
end
|
88
|
+
# Monkey patch
|
89
|
+
XML::Node.prepend(HTML5::Node)
|
90
|
+
end
|
91
|
+
end
|
92
|
+
|
93
|
+
# vim: set shiftwidth=2 softtabstop=2 tabstop=8 expandtab:
|