nokogiri 1.18.0-aarch64-linux-gnu
Sign up to get free protection for your applications and to get access to all the features.
Potentially problematic release.
This version of nokogiri might be problematic. Click here for more details.
- checksums.yaml +7 -0
- data/Gemfile +39 -0
- data/LICENSE-DEPENDENCIES.md +2224 -0
- data/LICENSE.md +9 -0
- data/README.md +293 -0
- data/bin/nokogiri +131 -0
- data/dependencies.yml +42 -0
- data/ext/nokogiri/depend +38 -0
- data/ext/nokogiri/extconf.rb +1173 -0
- data/ext/nokogiri/gumbo.c +610 -0
- data/ext/nokogiri/html4_document.c +171 -0
- data/ext/nokogiri/html4_element_description.c +299 -0
- data/ext/nokogiri/html4_entity_lookup.c +37 -0
- data/ext/nokogiri/html4_sax_parser.c +40 -0
- data/ext/nokogiri/html4_sax_parser_context.c +98 -0
- data/ext/nokogiri/html4_sax_push_parser.c +96 -0
- data/ext/nokogiri/include/libexslt/exslt.h +108 -0
- data/ext/nokogiri/include/libexslt/exsltconfig.h +70 -0
- data/ext/nokogiri/include/libexslt/exsltexports.h +63 -0
- data/ext/nokogiri/include/libxml2/libxml/HTMLparser.h +336 -0
- data/ext/nokogiri/include/libxml2/libxml/HTMLtree.h +147 -0
- data/ext/nokogiri/include/libxml2/libxml/SAX.h +202 -0
- data/ext/nokogiri/include/libxml2/libxml/SAX2.h +171 -0
- data/ext/nokogiri/include/libxml2/libxml/c14n.h +115 -0
- data/ext/nokogiri/include/libxml2/libxml/catalog.h +182 -0
- data/ext/nokogiri/include/libxml2/libxml/chvalid.h +230 -0
- data/ext/nokogiri/include/libxml2/libxml/debugXML.h +217 -0
- data/ext/nokogiri/include/libxml2/libxml/dict.h +82 -0
- data/ext/nokogiri/include/libxml2/libxml/encoding.h +244 -0
- data/ext/nokogiri/include/libxml2/libxml/entities.h +166 -0
- data/ext/nokogiri/include/libxml2/libxml/globals.h +41 -0
- data/ext/nokogiri/include/libxml2/libxml/hash.h +251 -0
- data/ext/nokogiri/include/libxml2/libxml/list.h +137 -0
- data/ext/nokogiri/include/libxml2/libxml/nanoftp.h +186 -0
- data/ext/nokogiri/include/libxml2/libxml/nanohttp.h +98 -0
- data/ext/nokogiri/include/libxml2/libxml/parser.h +1390 -0
- data/ext/nokogiri/include/libxml2/libxml/parserInternals.h +671 -0
- data/ext/nokogiri/include/libxml2/libxml/pattern.h +106 -0
- data/ext/nokogiri/include/libxml2/libxml/relaxng.h +219 -0
- data/ext/nokogiri/include/libxml2/libxml/schemasInternals.h +959 -0
- data/ext/nokogiri/include/libxml2/libxml/schematron.h +143 -0
- data/ext/nokogiri/include/libxml2/libxml/threads.h +87 -0
- data/ext/nokogiri/include/libxml2/libxml/tree.h +1382 -0
- data/ext/nokogiri/include/libxml2/libxml/uri.h +106 -0
- data/ext/nokogiri/include/libxml2/libxml/valid.h +477 -0
- data/ext/nokogiri/include/libxml2/libxml/xinclude.h +136 -0
- data/ext/nokogiri/include/libxml2/libxml/xlink.h +189 -0
- data/ext/nokogiri/include/libxml2/libxml/xmlIO.h +438 -0
- data/ext/nokogiri/include/libxml2/libxml/xmlautomata.h +146 -0
- data/ext/nokogiri/include/libxml2/libxml/xmlerror.h +962 -0
- data/ext/nokogiri/include/libxml2/libxml/xmlexports.h +146 -0
- data/ext/nokogiri/include/libxml2/libxml/xmlmemory.h +188 -0
- data/ext/nokogiri/include/libxml2/libxml/xmlmodule.h +57 -0
- data/ext/nokogiri/include/libxml2/libxml/xmlreader.h +436 -0
- data/ext/nokogiri/include/libxml2/libxml/xmlregexp.h +215 -0
- data/ext/nokogiri/include/libxml2/libxml/xmlsave.h +102 -0
- data/ext/nokogiri/include/libxml2/libxml/xmlschemas.h +249 -0
- data/ext/nokogiri/include/libxml2/libxml/xmlschemastypes.h +152 -0
- data/ext/nokogiri/include/libxml2/libxml/xmlstring.h +140 -0
- data/ext/nokogiri/include/libxml2/libxml/xmlunicode.h +366 -0
- data/ext/nokogiri/include/libxml2/libxml/xmlversion.h +347 -0
- data/ext/nokogiri/include/libxml2/libxml/xmlwriter.h +489 -0
- data/ext/nokogiri/include/libxml2/libxml/xpath.h +579 -0
- data/ext/nokogiri/include/libxml2/libxml/xpathInternals.h +633 -0
- data/ext/nokogiri/include/libxml2/libxml/xpointer.h +138 -0
- data/ext/nokogiri/include/libxslt/attributes.h +39 -0
- data/ext/nokogiri/include/libxslt/documents.h +93 -0
- data/ext/nokogiri/include/libxslt/extensions.h +262 -0
- data/ext/nokogiri/include/libxslt/extra.h +72 -0
- data/ext/nokogiri/include/libxslt/functions.h +78 -0
- data/ext/nokogiri/include/libxslt/imports.h +75 -0
- data/ext/nokogiri/include/libxslt/keys.h +53 -0
- data/ext/nokogiri/include/libxslt/namespaces.h +68 -0
- data/ext/nokogiri/include/libxslt/numbersInternals.h +73 -0
- data/ext/nokogiri/include/libxslt/pattern.h +84 -0
- data/ext/nokogiri/include/libxslt/preproc.h +43 -0
- data/ext/nokogiri/include/libxslt/security.h +104 -0
- data/ext/nokogiri/include/libxslt/templates.h +77 -0
- data/ext/nokogiri/include/libxslt/transform.h +207 -0
- data/ext/nokogiri/include/libxslt/variables.h +118 -0
- data/ext/nokogiri/include/libxslt/xslt.h +110 -0
- data/ext/nokogiri/include/libxslt/xsltInternals.h +1995 -0
- data/ext/nokogiri/include/libxslt/xsltconfig.h +146 -0
- data/ext/nokogiri/include/libxslt/xsltexports.h +64 -0
- data/ext/nokogiri/include/libxslt/xsltlocale.h +44 -0
- data/ext/nokogiri/include/libxslt/xsltutils.h +343 -0
- data/ext/nokogiri/libxml2_polyfill.c +114 -0
- data/ext/nokogiri/nokogiri.c +294 -0
- data/ext/nokogiri/nokogiri.h +238 -0
- data/ext/nokogiri/test_global_handlers.c +40 -0
- data/ext/nokogiri/xml_attr.c +103 -0
- data/ext/nokogiri/xml_attribute_decl.c +70 -0
- data/ext/nokogiri/xml_cdata.c +62 -0
- data/ext/nokogiri/xml_comment.c +57 -0
- data/ext/nokogiri/xml_document.c +784 -0
- data/ext/nokogiri/xml_document_fragment.c +29 -0
- data/ext/nokogiri/xml_dtd.c +208 -0
- data/ext/nokogiri/xml_element_content.c +131 -0
- data/ext/nokogiri/xml_element_decl.c +69 -0
- data/ext/nokogiri/xml_encoding_handler.c +112 -0
- data/ext/nokogiri/xml_entity_decl.c +112 -0
- data/ext/nokogiri/xml_entity_reference.c +50 -0
- data/ext/nokogiri/xml_namespace.c +181 -0
- data/ext/nokogiri/xml_node.c +2459 -0
- data/ext/nokogiri/xml_node_set.c +518 -0
- data/ext/nokogiri/xml_processing_instruction.c +54 -0
- data/ext/nokogiri/xml_reader.c +777 -0
- data/ext/nokogiri/xml_relax_ng.c +149 -0
- data/ext/nokogiri/xml_sax_parser.c +403 -0
- data/ext/nokogiri/xml_sax_parser_context.c +390 -0
- data/ext/nokogiri/xml_sax_push_parser.c +206 -0
- data/ext/nokogiri/xml_schema.c +226 -0
- data/ext/nokogiri/xml_syntax_error.c +93 -0
- data/ext/nokogiri/xml_text.c +59 -0
- data/ext/nokogiri/xml_xpath_context.c +486 -0
- data/ext/nokogiri/xslt_stylesheet.c +421 -0
- data/gumbo-parser/CHANGES.md +63 -0
- data/gumbo-parser/Makefile +129 -0
- data/gumbo-parser/THANKS +27 -0
- data/lib/nokogiri/3.1/nokogiri.so +0 -0
- data/lib/nokogiri/3.2/nokogiri.so +0 -0
- data/lib/nokogiri/3.3/nokogiri.so +0 -0
- data/lib/nokogiri/3.4/nokogiri.so +0 -0
- data/lib/nokogiri/class_resolver.rb +67 -0
- data/lib/nokogiri/css/node.rb +58 -0
- data/lib/nokogiri/css/parser.rb +772 -0
- data/lib/nokogiri/css/parser.y +277 -0
- data/lib/nokogiri/css/parser_extras.rb +36 -0
- data/lib/nokogiri/css/selector_cache.rb +38 -0
- data/lib/nokogiri/css/syntax_error.rb +9 -0
- data/lib/nokogiri/css/tokenizer.rb +155 -0
- data/lib/nokogiri/css/tokenizer.rex +57 -0
- data/lib/nokogiri/css/xpath_visitor.rb +375 -0
- data/lib/nokogiri/css.rb +132 -0
- data/lib/nokogiri/decorators/slop.rb +42 -0
- data/lib/nokogiri/encoding_handler.rb +57 -0
- data/lib/nokogiri/extension.rb +32 -0
- data/lib/nokogiri/gumbo.rb +15 -0
- data/lib/nokogiri/html.rb +48 -0
- data/lib/nokogiri/html4/builder.rb +37 -0
- data/lib/nokogiri/html4/document.rb +235 -0
- data/lib/nokogiri/html4/document_fragment.rb +166 -0
- data/lib/nokogiri/html4/element_description.rb +25 -0
- data/lib/nokogiri/html4/element_description_defaults.rb +2040 -0
- data/lib/nokogiri/html4/encoding_reader.rb +121 -0
- data/lib/nokogiri/html4/entity_lookup.rb +15 -0
- data/lib/nokogiri/html4/sax/parser.rb +48 -0
- data/lib/nokogiri/html4/sax/parser_context.rb +15 -0
- data/lib/nokogiri/html4/sax/push_parser.rb +37 -0
- data/lib/nokogiri/html4.rb +42 -0
- data/lib/nokogiri/html5/builder.rb +40 -0
- data/lib/nokogiri/html5/document.rb +199 -0
- data/lib/nokogiri/html5/document_fragment.rb +200 -0
- data/lib/nokogiri/html5/node.rb +103 -0
- data/lib/nokogiri/html5.rb +368 -0
- data/lib/nokogiri/jruby/dependencies.rb +3 -0
- data/lib/nokogiri/jruby/nokogiri_jars.rb +43 -0
- data/lib/nokogiri/syntax_error.rb +6 -0
- data/lib/nokogiri/version/constant.rb +6 -0
- data/lib/nokogiri/version/info.rb +224 -0
- data/lib/nokogiri/version.rb +4 -0
- data/lib/nokogiri/xml/attr.rb +66 -0
- data/lib/nokogiri/xml/attribute_decl.rb +22 -0
- data/lib/nokogiri/xml/builder.rb +494 -0
- data/lib/nokogiri/xml/cdata.rb +13 -0
- data/lib/nokogiri/xml/character_data.rb +9 -0
- data/lib/nokogiri/xml/document.rb +514 -0
- data/lib/nokogiri/xml/document_fragment.rb +276 -0
- data/lib/nokogiri/xml/dtd.rb +34 -0
- data/lib/nokogiri/xml/element_content.rb +46 -0
- data/lib/nokogiri/xml/element_decl.rb +17 -0
- data/lib/nokogiri/xml/entity_decl.rb +23 -0
- data/lib/nokogiri/xml/entity_reference.rb +20 -0
- data/lib/nokogiri/xml/namespace.rb +57 -0
- data/lib/nokogiri/xml/node/save_options.rb +76 -0
- data/lib/nokogiri/xml/node.rb +1650 -0
- data/lib/nokogiri/xml/node_set.rb +449 -0
- data/lib/nokogiri/xml/notation.rb +19 -0
- data/lib/nokogiri/xml/parse_options.rb +213 -0
- data/lib/nokogiri/xml/pp/character_data.rb +21 -0
- data/lib/nokogiri/xml/pp/node.rb +73 -0
- data/lib/nokogiri/xml/pp.rb +4 -0
- data/lib/nokogiri/xml/processing_instruction.rb +11 -0
- data/lib/nokogiri/xml/reader.rb +139 -0
- data/lib/nokogiri/xml/relax_ng.rb +75 -0
- data/lib/nokogiri/xml/sax/document.rb +258 -0
- data/lib/nokogiri/xml/sax/parser.rb +199 -0
- data/lib/nokogiri/xml/sax/parser_context.rb +129 -0
- data/lib/nokogiri/xml/sax/push_parser.rb +64 -0
- data/lib/nokogiri/xml/sax.rb +54 -0
- data/lib/nokogiri/xml/schema.rb +140 -0
- data/lib/nokogiri/xml/searchable.rb +274 -0
- data/lib/nokogiri/xml/syntax_error.rb +94 -0
- data/lib/nokogiri/xml/text.rb +11 -0
- data/lib/nokogiri/xml/xpath/syntax_error.rb +13 -0
- data/lib/nokogiri/xml/xpath.rb +21 -0
- data/lib/nokogiri/xml/xpath_context.rb +27 -0
- data/lib/nokogiri/xml.rb +65 -0
- data/lib/nokogiri/xslt/stylesheet.rb +49 -0
- data/lib/nokogiri/xslt.rb +129 -0
- data/lib/nokogiri.rb +128 -0
- data/lib/xsd/xmlparser/nokogiri.rb +105 -0
- metadata +321 -0
@@ -0,0 +1,121 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Nokogiri
|
4
|
+
module HTML4
|
5
|
+
# Libxml2's parser has poor support for encoding detection. First, it does not recognize the
|
6
|
+
# HTML5 style meta charset declaration. Secondly, even if it successfully detects an encoding
|
7
|
+
# hint, it does not re-decode or re-parse the preceding part which may be garbled.
|
8
|
+
#
|
9
|
+
# EncodingReader aims to perform advanced encoding detection beyond what Libxml2 does, and to
|
10
|
+
# emulate rewinding of a stream and make Libxml2 redo parsing from the start when an encoding
|
11
|
+
# hint is found.
|
12
|
+
|
13
|
+
# :nodoc: all
|
14
|
+
class EncodingReader
|
15
|
+
# Exception used to report that an encoding declaration was detected; it carries the encoding.
class EncodingFound < StandardError
  # The encoding that was detected.
  attr_reader :found_encoding

  def initialize(encoding)
    super(format("encoding found: %s", encoding))
    @found_encoding = encoding
  end
end
|
23
|
+
|
24
|
+
# SAX handler that records the charset declared by <meta> elements.
class SAXHandler < Nokogiri::XML::SAX::Document
  # The most recently seen declared charset, or nil when none has been seen.
  attr_reader :encoding

  def initialize
    @encoding = nil
    super
  end

  # Called for each opening tag; only <meta> elements are examined.
  def start_element(name, attrs = [])
    return unless name == "meta"

    attributes = Hash[attrs]

    # HTML5 style: <meta charset="...">
    charset = attributes["charset"]
    @encoding = charset if charset

    # HTML4 style: <meta http-equiv="Content-Type" content="...; charset=...">
    http_equiv = attributes["http-equiv"]
    return unless http_equiv && http_equiv.match(/\AContent-Type\z/i)

    content = attributes["content"]
    declared = content && content.match(/;\s*charset\s*=\s*([\w-]+)/)
    @encoding = declared[1] if declared
  end
end
|
45
|
+
|
46
|
+
# SAXHandler variant that aborts parsing early by throwing to the tag given at construction.
class JumpSAXHandler < SAXHandler
  def initialize(jumptag)
    @jumptag = jumptag
    super()
  end

  def start_element(name, attrs = [])
    super

    if @encoding
      # An encoding has been recorded: hand it to the enclosing catch.
      throw(@jumptag, @encoding)
    elsif /\A(?:div|h1|img|p|br)\z/.match?(name)
      # One of these elements started without an encoding being seen: stop looking.
      throw(@jumptag, nil)
    end
  end
end
|
58
|
+
|
59
|
+
# Inspect +chunk+ (the leading part of a document) for an encoding declaration.
#
# Returns the declared encoding, or +nil+ when none is found.
def self.detect_encoding(chunk)
  # A leading XML declaration wins: hand only the declaration to the XML parser and report the
  # encoding it sees.
  (m = chunk.match(/\A(<\?xml[ \t\r\n][^>]*>)/)) &&
    (return Nokogiri.XML(m[1]).encoding)

  if Nokogiri.jruby?
    # First try a plain regexp scan for a charset inside a <meta> tag.
    (m = chunk.match(/(<meta\s)(.*)(charset\s*=\s*([\w-]+))(.*)/i)) &&
      (return m[4])

    # Otherwise run the SAX parser. JumpSAXHandler throws :encoding_found (with the encoding, or
    # nil) to stop early; the trailing nil is the result when nothing was thrown.
    catch(:encoding_found) do
      Nokogiri::HTML4::SAX::Parser.new(JumpSAXHandler.new(:encoding_found)).parse(chunk)
      nil
    end
  else
    handler = SAXHandler.new
    parser = Nokogiri::HTML4::SAX::PushParser.new(handler)
    begin
      parser << chunk
    rescue StandardError
      # Detection is best-effort: whatever the handler recorded before the failure is still used.
      # This was previously a bare `rescue` whose body was the lone expression
      # `Nokogiri::SyntaxError`, which reads like a filtered rescue but swallows every
      # StandardError. That behavior is kept here, now stated explicitly.
      # NOTE(review): confirm whether narrowing to Nokogiri::SyntaxError was intended.
    end
    handler.encoding
  end
end
|
81
|
+
|
82
|
+
# Wrap +io+. No chunk has been read and no encoding has been detected yet.
def initialize(io)
  @io = io
  @firstchunk = @encoding_found = nil
end
|
87
|
+
|
88
|
+
# This method is used by the C extension so that
|
89
|
+
# Nokogiri::HTML4::Document#read_io() does not leak memory when
|
90
|
+
# EncodingFound is raised.
|
91
|
+
attr_reader :encoding_found
|
92
|
+
|
93
|
+
# Read up to +len+ bytes, serving the buffered first chunk before reading further from the
# wrapped IO. Returns nil when there is nothing left.
#
# Raises EncodingFound on the very first read if an encoding is detected in the first chunk.
def read(len)
  # no support for a call without len

  unless @firstchunk
    @firstchunk = @io.read(len)
    return unless @firstchunk

    # This implementation expects that the first call from
    # htmlReadIO() is made with a length long enough (~1KB) to
    # achieve advanced encoding detection.
    detected = EncodingReader.detect_encoding(@firstchunk)
    if detected
      # The first chunk is stored for the next read in retry.
      @encoding_found = EncodingFound.new(detected)
      raise @encoding_found
    end
  end
  @encoding_found = nil

  # Take what the buffered first chunk can supply, then top up from the IO if that was short.
  buffered = @firstchunk.slice!(0, len)
  missing = len - buffered.length
  if missing > 0
    tail = @io.read(missing)
    buffered << tail if tail
  end

  buffered.empty? ? nil : buffered
end
|
119
|
+
end
|
120
|
+
end
|
121
|
+
end
|
@@ -0,0 +1,15 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Nokogiri
|
4
|
+
module HTML4
|
5
|
+
# Struct holding the +value+, +name+ and +description+ of an entity.
class EntityDescription < Struct.new(:value, :name, :description)
end
|
6
|
+
|
7
|
+
class EntityLookup
  ###
  # Look up the entity called +name+ and return its value, or a falsy result when #get finds
  # nothing.
  def [](name)
    entity = get(name)
    entity && entity.value
  end
end
|
14
|
+
end
|
15
|
+
end
|
@@ -0,0 +1,48 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Nokogiri
|
4
|
+
module HTML4
|
5
|
+
###
|
6
|
+
# Nokogiri provides a SAX parser to process HTML4 which will provide HTML recovery
|
7
|
+
# ("autocorrection") features.
|
8
|
+
#
|
9
|
+
# See Nokogiri::HTML4::SAX::Parser for a basic example of using a SAX parser with HTML.
|
10
|
+
#
|
11
|
+
# For more information on SAX parsers, see Nokogiri::XML::SAX
|
12
|
+
#
|
13
|
+
module SAX
|
14
|
+
###
|
15
|
+
# This parser is a SAX style parser that reads its input as it deems necessary. The parser
|
16
|
+
# takes a Nokogiri::XML::SAX::Document, an optional encoding, then given an HTML input, sends
|
17
|
+
# messages to the Nokogiri::XML::SAX::Document.
|
18
|
+
#
|
19
|
+
# ⚠ This is an HTML4 parser and so may not support some HTML5 features and behaviors.
|
20
|
+
#
|
21
|
+
# Here is a basic usage example:
|
22
|
+
#
|
23
|
+
# class MyHandler < Nokogiri::XML::SAX::Document
|
24
|
+
# def start_element name, attributes = []
|
25
|
+
# puts "found a #{name}"
|
26
|
+
# end
|
27
|
+
# end
|
28
|
+
#
|
29
|
+
# parser = Nokogiri::HTML4::SAX::Parser.new(MyHandler.new)
|
30
|
+
#
|
31
|
+
# # Hand an IO object to the parser, which will read the HTML from the IO.
|
32
|
+
# File.open(path_to_html) do |f|
|
33
|
+
# parser.parse(f)
|
34
|
+
# end
|
35
|
+
#
|
36
|
+
# For more information on \SAX parsers, see Nokogiri::XML::SAX or the parent class
|
37
|
+
# Nokogiri::XML::SAX::Parser.
|
38
|
+
#
|
39
|
+
# Also see Nokogiri::XML::SAX::Document for the available events.
|
40
|
+
#
|
41
|
+
class Parser < Nokogiri::XML::SAX::Parser
  # All behavior is inherited from Nokogiri::XML::SAX::Parser. Note that the superclass uses
  # Nokogiri::ClassResolver to pick HTML4::SAX::ParserContext as the context class for this
  # class, and that context class is where the real behavioral differences are implemented.
end
|
46
|
+
end
|
47
|
+
end
|
48
|
+
end
|
@@ -0,0 +1,15 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Nokogiri
|
4
|
+
module HTML4
|
5
|
+
module SAX
|
6
|
+
###
|
7
|
+
# Context object to invoke the HTML4 SAX parser on the SAX::Document handler.
|
8
|
+
#
|
9
|
+
# 💡 This class is usually not instantiated by the user. Use Nokogiri::HTML4::SAX::Parser
|
10
|
+
# instead.
|
11
|
+
class ParserContext < Nokogiri::XML::SAX::ParserContext
  # Nothing is defined at the Ruby level in this file.
end
|
13
|
+
end
|
14
|
+
end
|
15
|
+
end
|
@@ -0,0 +1,37 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Nokogiri
|
4
|
+
module HTML4
|
5
|
+
module SAX
|
6
|
+
class PushParser
  # The Nokogiri::XML::SAX::Document on which the PushParser will be
  # operating
  attr_accessor :document

  # Build a push parser that reports events to +doc+.
  #
  # The default handler is a plain Nokogiri::XML::SAX::Document. The previous default named
  # HTML4::SAX::Document, a constant that none of the html4/sax files loaded by html4.rb
  # defines, so calling PushParser.new without arguments could not work.
  def initialize(doc = Nokogiri::XML::SAX::Document.new, file_name = nil, encoding = "UTF-8")
    @document = doc
    @encoding = encoding
    @sax_parser = HTML4::SAX::Parser.new(doc, @encoding)

    ## Create our push parser context
    initialize_native(@sax_parser, file_name, encoding)
  end

  ###
  # Write a +chunk+ of HTML to the PushParser. Any callback methods
  # that can be called will be called immediately.
  def write(chunk, last_chunk = false)
    native_write(chunk, last_chunk)
  end
  alias_method :<<, :write

  ###
  # Finish the parsing. This method is only necessary for
  # Nokogiri::XML::SAX::Document#end_document to be called.
  def finish
    write("", true)
  end
end
|
35
|
+
end
|
36
|
+
end
|
37
|
+
end
|
@@ -0,0 +1,42 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
# frozen_string_literal: true
|
3
|
+
|
4
|
+
module Nokogiri
|
5
|
+
class << self
  # Shorthand for Nokogiri::HTML4::Document.parse; every argument (positional, keyword and
  # block) is forwarded unchanged.
  def HTML4(...)
    ::Nokogiri::HTML4::Document.parse(...)
  end
end
|
11
|
+
|
12
|
+
# Since v1.12.0
|
13
|
+
#
|
14
|
+
# 💡 Before v1.12.0, Nokogiri::HTML4 did not exist, and Nokogiri::HTML was the module/namespace
|
15
|
+
# for parsing HTML.
|
16
|
+
module HTML4
  class << self
    # Shorthand for Nokogiri::HTML4::Document.parse; all arguments are forwarded.
    def parse(...)
      HTML4::Document.parse(...)
    end

    # Shorthand for Nokogiri::HTML4::DocumentFragment.parse; all arguments are forwarded.
    def fragment(...)
      HTML4::DocumentFragment.parse(...)
    end
  end

  # Instance of Nokogiri::HTML4::EntityLookup
  NamedCharacters = EntityLookup.new
end
|
32
|
+
end
|
33
|
+
|
34
|
+
require_relative "html4/entity_lookup"
|
35
|
+
require_relative "html4/document"
|
36
|
+
require_relative "html4/document_fragment"
|
37
|
+
require_relative "html4/encoding_reader"
|
38
|
+
require_relative "html4/sax/parser_context"
|
39
|
+
require_relative "html4/sax/parser"
|
40
|
+
require_relative "html4/sax/push_parser"
|
41
|
+
require_relative "html4/element_description"
|
42
|
+
require_relative "html4/element_description_defaults"
|
@@ -0,0 +1,40 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Nokogiri
|
4
|
+
module HTML5
|
5
|
+
###
|
6
|
+
# Nokogiri HTML5 builder is used for building HTML documents. It is very similar to the
|
7
|
+
# Nokogiri::XML::Builder. In fact, you should go read the documentation for
|
8
|
+
# Nokogiri::XML::Builder before reading this documentation.
|
9
|
+
#
|
10
|
+
# The construction behavior is identical to HTML4::Builder, but HTML5 documents implement the
|
11
|
+
# [HTML5 standard's serialization
|
12
|
+
# algorithm](https://www.w3.org/TR/2008/WD-html5-20080610/serializing.html).
|
13
|
+
#
|
14
|
+
# == Synopsis:
|
15
|
+
#
|
16
|
+
# Create an HTML5 document with a body that has an onload attribute, and a
|
17
|
+
# span tag with a class of "bold" that has content of "Hello world".
|
18
|
+
#
|
19
|
+
# builder = Nokogiri::HTML5::Builder.new do |doc|
|
20
|
+
# doc.html {
|
21
|
+
# doc.body(:onload => 'some_func();') {
|
22
|
+
# doc.span.bold {
|
23
|
+
# doc.text "Hello world"
|
24
|
+
# }
|
25
|
+
# }
|
26
|
+
# }
|
27
|
+
# end
|
28
|
+
# puts builder.to_html
|
29
|
+
#
|
30
|
+
# The HTML5 builder inherits from the XML builder, so make sure to read the
|
31
|
+
# Nokogiri::XML::Builder documentation.
|
32
|
+
class Builder < Nokogiri::XML::Builder
  ###
  # Convert the builder to HTML by returning the result of #to_html on the builder's @doc.
  def to_html
    @doc.to_html
  end
end
|
39
|
+
end
|
40
|
+
end
|
@@ -0,0 +1,199 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
# frozen_string_literal: true
|
3
|
+
|
4
|
+
#
|
5
|
+
# Copyright 2013-2021 Sam Ruby, Stephen Checkoway
|
6
|
+
#
|
7
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
8
|
+
# you may not use this file except in compliance with the License.
|
9
|
+
# You may obtain a copy of the License at
|
10
|
+
#
|
11
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
12
|
+
#
|
13
|
+
# Unless required by applicable law or agreed to in writing, software
|
14
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
15
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
16
|
+
# See the License for the specific language governing permissions and
|
17
|
+
# limitations under the License.
|
18
|
+
#
|
19
|
+
|
20
|
+
require_relative "../html4/document"
|
21
|
+
|
22
|
+
module Nokogiri
|
23
|
+
module HTML5
|
24
|
+
# Enum for the HTML5 parser quirks mode values. Values returned by HTML5::Document#quirks_mode
|
25
|
+
#
|
26
|
+
# See https://dom.spec.whatwg.org/#concept-document-quirks for more information on HTML5 quirks
|
27
|
+
# mode.
|
28
|
+
#
|
29
|
+
# Since v1.14.0
|
30
|
+
module QuirksMode
  # The document was parsed in "no-quirks" mode
  NO_QUIRKS = 0

  # The document was parsed in "quirks" mode
  QUIRKS = 1

  # The document was parsed in "limited-quirks" mode
  LIMITED_QUIRKS = 2
end
|
35
|
+
|
36
|
+
# Since v1.12.0
|
37
|
+
#
|
38
|
+
# 💡 HTML5 functionality is not available when running JRuby.
|
39
|
+
class Document < Nokogiri::HTML4::Document
|
40
|
+
# Get the url name for this document, as passed into Document.parse, Document.read_io, or
|
41
|
+
# Document.read_memory
|
42
|
+
attr_reader :url
|
43
|
+
|
44
|
+
# Get the parser's quirks mode value. See HTML5::QuirksMode.
|
45
|
+
#
|
46
|
+
# This method returns +nil+ if the parser was not invoked (e.g., Nokogiri::HTML5::Document.new).
|
47
|
+
#
|
48
|
+
# Since v1.14.0
|
49
|
+
attr_reader :quirks_mode
|
50
|
+
|
51
|
+
class << self
|
52
|
+
# :call-seq:
|
53
|
+
# parse(input) { |options| ... } → HTML5::Document
|
54
|
+
# parse(input, url:, encoding:) { |options| ... } → HTML5::Document
|
55
|
+
# parse(input, **options) → HTML5::Document
|
56
|
+
#
|
57
|
+
# Parse \HTML input with a parser compliant with the HTML5 spec. This method uses the
|
58
|
+
# encoding of +input+ if it can be determined, or else falls back to the +encoding:+
|
59
|
+
# parameter.
|
60
|
+
#
|
61
|
+
# [Required Parameters]
|
62
|
+
# - +input+ (String | IO) the \HTML content to be parsed.
|
63
|
+
#
|
64
|
+
# [Optional Parameters]
|
65
|
+
# - +url:+ (String) the base URI of the document.
|
66
|
+
#
|
67
|
+
# [Optional Keyword Arguments]
|
68
|
+
# - +encoding:+ (Encoding) The name of the encoding that should be used when processing the
|
69
|
+
# document. When not provided, the encoding will be determined based on the document
|
70
|
+
# content.
|
71
|
+
#
|
72
|
+
# - +max_errors:+ (Integer) The maximum number of parse errors to record. (default
|
73
|
+
# +Nokogiri::Gumbo::DEFAULT_MAX_ERRORS+ which is currently 0)
|
74
|
+
#
|
75
|
+
# - +max_tree_depth:+ (Integer) The maximum depth of the parse tree. (default
|
76
|
+
# +Nokogiri::Gumbo::DEFAULT_MAX_TREE_DEPTH+)
|
77
|
+
#
|
78
|
+
# - +max_attributes:+ (Integer) The maximum number of attributes allowed on an
|
79
|
+
# element. (default +Nokogiri::Gumbo::DEFAULT_MAX_ATTRIBUTES+)
|
80
|
+
#
|
81
|
+
# - +parse_noscript_content_as_text:+ (Boolean) Whether to parse the content of +noscript+
|
82
|
+
# elements as text. (default +false+)
|
83
|
+
#
|
84
|
+
# See rdoc-ref:HTML5@Parsing+options for a complete description of these parsing options.
|
85
|
+
#
|
86
|
+
# [Yields]
|
87
|
+
# If present, the block will be passed a Hash object to modify with parse options before the
|
88
|
+
# input is parsed. See rdoc-ref:HTML5@Parsing+options for a list of available options.
|
89
|
+
#
|
90
|
+
# ⚠ Note that +url:+ and +encoding:+ cannot be set by the configuration block.
|
91
|
+
#
|
92
|
+
# [Returns] Nokogiri::HTML5::Document
|
93
|
+
#
|
94
|
+
# *Example:* Parse input from a socket with a specific encoding and a custom max errors limit.
|
95
|
+
#
|
96
|
+
# Nokogiri::HTML5::Document.parse(socket, encoding: "ISO-8859-1", max_errors: 10)
|
97
|
+
#
|
98
|
+
# *Example:* Parse a string setting the +:parse_noscript_content_as_text+ option using the
|
99
|
+
# configuration block parameter.
|
100
|
+
#
|
101
|
+
# Nokogiri::HTML5::Document.parse(input) { |c| c[:parse_noscript_content_as_text] = true }
|
102
|
+
#
|
103
|
+
def parse(string_or_io, url_ = nil, encoding_ = nil, url: url_, encoding: encoding_, **options, &block)
  # Let the caller adjust the parse options before anything else happens.
  block&.call(options)
  string_or_io ||= ""

  # Input that exposes a non-binary #encoding supplies the encoding when none was given.
  has_text_encoding = string_or_io.respond_to?(:encoding) && string_or_io.encoding != Encoding::ASCII_8BIT
  encoding ||= string_or_io.encoding.name if has_text_encoding

  # Readable input that also has a #path supplies the url when none was given.
  readable = string_or_io.respond_to?(:read)
  url ||= string_or_io.path if readable && string_or_io.respond_to?(:path)

  raise ArgumentError, "not a string or IO object" unless readable || string_or_io.respond_to?(:to_str)

  do_parse(string_or_io, url, encoding, **options)
end
|
125
|
+
|
126
|
+
# Create a new document from an IO object.
|
127
|
+
#
|
128
|
+
# 💡 Most users should prefer Document.parse to this method.
|
129
|
+
def read_io(io, url_ = nil, encoding_ = nil, url: url_, encoding: encoding_, **options)
  # Reject anything that cannot be read from before handing off to the shared parse path.
  unless io.respond_to?(:read)
    raise ArgumentError, "io object doesn't respond to :read"
  end

  do_parse(io, url, encoding, **options)
end
|
134
|
+
|
135
|
+
# Create a new document from a String.
|
136
|
+
#
|
137
|
+
# 💡 Most users should prefer Document.parse to this method.
|
138
|
+
def read_memory(string, url_ = nil, encoding_ = nil, url: url_, encoding: encoding_, **options)
  # Reject anything that is not string-like before handing off to the shared parse path.
  unless string.respond_to?(:to_str)
    raise ArgumentError, "string object doesn't respond to :to_str"
  end

  do_parse(string, url, encoding, **options)
end
|
143
|
+
|
144
|
+
private
|
145
|
+
|
146
|
+
# Shared implementation behind parse, read_io and read_memory: read the input via
# HTML5.read_and_encode, fill in default limits, and parse it with Nokogiri::Gumbo.
def do_parse(string_or_io, url, encoding, **options)
  string = HTML5.read_and_encode(string_or_io, encoding)

  options[:max_attributes] ||= Nokogiri::Gumbo::DEFAULT_MAX_ATTRIBUTES
  unless options[:max_errors]
    # :max_parse_errors is accepted as an alternative key when :max_errors is not given.
    options[:max_errors] = options.delete(:max_parse_errors) || Nokogiri::Gumbo::DEFAULT_MAX_ERRORS
  end
  options[:max_tree_depth] ||= Nokogiri::Gumbo::DEFAULT_MAX_TREE_DEPTH

  # The returned document's encoding is always set to UTF-8.
  Nokogiri::Gumbo.parse(string, url, self, **options).tap do |doc|
    doc.encoding = "UTF-8"
  end
end
|
157
|
+
end
|
158
|
+
|
159
|
+
def initialize(*args) # :nodoc:
  super
  # Both start out unset.
  @url = @quirks_mode = nil
end
|
164
|
+
|
165
|
+
# :call-seq:
|
166
|
+
# fragment() → Nokogiri::HTML5::DocumentFragment
|
167
|
+
# fragment(markup) → Nokogiri::HTML5::DocumentFragment
|
168
|
+
#
|
169
|
+
# Parse an HTML5 document fragment from +markup+, returning a Nokogiri::HTML5::DocumentFragment.
|
170
|
+
#
|
171
|
+
# [Properties]
|
172
|
+
# - +markup+ (String) The HTML5 markup fragment to be parsed
|
173
|
+
#
|
174
|
+
# [Returns]
|
175
|
+
# Nokogiri::HTML5::DocumentFragment. This object's children will be empty if +markup+ is not
|
176
|
+
# passed, is empty, or is +nil+.
|
177
|
+
#
|
178
|
+
def fragment(markup = nil)
  # The new fragment is created against this document.
  DocumentFragment.new(self, markup)
end
|
181
|
+
|
182
|
+
def to_xml(options = {}, &block) # :nodoc:
  # Bypass XML::Document#to_xml which doesn't add
  # XML::Node::SaveOptions::AS_XML like XML::Node#to_xml does.
  node_to_xml = XML::Node.instance_method(:to_xml)
  node_to_xml.bind_call(self, options, &block)
end
|
187
|
+
|
188
|
+
# :call-seq:
|
189
|
+
# xpath_doctype() → Nokogiri::CSS::XPathVisitor::DoctypeConfig
|
190
|
+
#
|
191
|
+
# [Returns] The document type which determines CSS-to-XPath translation.
|
192
|
+
#
|
193
|
+
# See CSS::XPathVisitor for more information.
|
194
|
+
def xpath_doctype
  # Always the HTML5 doctype configuration.
  Nokogiri::CSS::XPathVisitor::DoctypeConfig::HTML5
end
|
197
|
+
end
|
198
|
+
end
|
199
|
+
end
|