nokogiri 1.18.0-aarch64-linux-gnu

Sign up to get free protection for your applications and to get access to all the features.

Potentially problematic release.


This version of nokogiri might be problematic. Click here for more details.

Files changed (203) hide show
  1. checksums.yaml +7 -0
  2. data/Gemfile +39 -0
  3. data/LICENSE-DEPENDENCIES.md +2224 -0
  4. data/LICENSE.md +9 -0
  5. data/README.md +293 -0
  6. data/bin/nokogiri +131 -0
  7. data/dependencies.yml +42 -0
  8. data/ext/nokogiri/depend +38 -0
  9. data/ext/nokogiri/extconf.rb +1173 -0
  10. data/ext/nokogiri/gumbo.c +610 -0
  11. data/ext/nokogiri/html4_document.c +171 -0
  12. data/ext/nokogiri/html4_element_description.c +299 -0
  13. data/ext/nokogiri/html4_entity_lookup.c +37 -0
  14. data/ext/nokogiri/html4_sax_parser.c +40 -0
  15. data/ext/nokogiri/html4_sax_parser_context.c +98 -0
  16. data/ext/nokogiri/html4_sax_push_parser.c +96 -0
  17. data/ext/nokogiri/include/libexslt/exslt.h +108 -0
  18. data/ext/nokogiri/include/libexslt/exsltconfig.h +70 -0
  19. data/ext/nokogiri/include/libexslt/exsltexports.h +63 -0
  20. data/ext/nokogiri/include/libxml2/libxml/HTMLparser.h +336 -0
  21. data/ext/nokogiri/include/libxml2/libxml/HTMLtree.h +147 -0
  22. data/ext/nokogiri/include/libxml2/libxml/SAX.h +202 -0
  23. data/ext/nokogiri/include/libxml2/libxml/SAX2.h +171 -0
  24. data/ext/nokogiri/include/libxml2/libxml/c14n.h +115 -0
  25. data/ext/nokogiri/include/libxml2/libxml/catalog.h +182 -0
  26. data/ext/nokogiri/include/libxml2/libxml/chvalid.h +230 -0
  27. data/ext/nokogiri/include/libxml2/libxml/debugXML.h +217 -0
  28. data/ext/nokogiri/include/libxml2/libxml/dict.h +82 -0
  29. data/ext/nokogiri/include/libxml2/libxml/encoding.h +244 -0
  30. data/ext/nokogiri/include/libxml2/libxml/entities.h +166 -0
  31. data/ext/nokogiri/include/libxml2/libxml/globals.h +41 -0
  32. data/ext/nokogiri/include/libxml2/libxml/hash.h +251 -0
  33. data/ext/nokogiri/include/libxml2/libxml/list.h +137 -0
  34. data/ext/nokogiri/include/libxml2/libxml/nanoftp.h +186 -0
  35. data/ext/nokogiri/include/libxml2/libxml/nanohttp.h +98 -0
  36. data/ext/nokogiri/include/libxml2/libxml/parser.h +1390 -0
  37. data/ext/nokogiri/include/libxml2/libxml/parserInternals.h +671 -0
  38. data/ext/nokogiri/include/libxml2/libxml/pattern.h +106 -0
  39. data/ext/nokogiri/include/libxml2/libxml/relaxng.h +219 -0
  40. data/ext/nokogiri/include/libxml2/libxml/schemasInternals.h +959 -0
  41. data/ext/nokogiri/include/libxml2/libxml/schematron.h +143 -0
  42. data/ext/nokogiri/include/libxml2/libxml/threads.h +87 -0
  43. data/ext/nokogiri/include/libxml2/libxml/tree.h +1382 -0
  44. data/ext/nokogiri/include/libxml2/libxml/uri.h +106 -0
  45. data/ext/nokogiri/include/libxml2/libxml/valid.h +477 -0
  46. data/ext/nokogiri/include/libxml2/libxml/xinclude.h +136 -0
  47. data/ext/nokogiri/include/libxml2/libxml/xlink.h +189 -0
  48. data/ext/nokogiri/include/libxml2/libxml/xmlIO.h +438 -0
  49. data/ext/nokogiri/include/libxml2/libxml/xmlautomata.h +146 -0
  50. data/ext/nokogiri/include/libxml2/libxml/xmlerror.h +962 -0
  51. data/ext/nokogiri/include/libxml2/libxml/xmlexports.h +146 -0
  52. data/ext/nokogiri/include/libxml2/libxml/xmlmemory.h +188 -0
  53. data/ext/nokogiri/include/libxml2/libxml/xmlmodule.h +57 -0
  54. data/ext/nokogiri/include/libxml2/libxml/xmlreader.h +436 -0
  55. data/ext/nokogiri/include/libxml2/libxml/xmlregexp.h +215 -0
  56. data/ext/nokogiri/include/libxml2/libxml/xmlsave.h +102 -0
  57. data/ext/nokogiri/include/libxml2/libxml/xmlschemas.h +249 -0
  58. data/ext/nokogiri/include/libxml2/libxml/xmlschemastypes.h +152 -0
  59. data/ext/nokogiri/include/libxml2/libxml/xmlstring.h +140 -0
  60. data/ext/nokogiri/include/libxml2/libxml/xmlunicode.h +366 -0
  61. data/ext/nokogiri/include/libxml2/libxml/xmlversion.h +347 -0
  62. data/ext/nokogiri/include/libxml2/libxml/xmlwriter.h +489 -0
  63. data/ext/nokogiri/include/libxml2/libxml/xpath.h +579 -0
  64. data/ext/nokogiri/include/libxml2/libxml/xpathInternals.h +633 -0
  65. data/ext/nokogiri/include/libxml2/libxml/xpointer.h +138 -0
  66. data/ext/nokogiri/include/libxslt/attributes.h +39 -0
  67. data/ext/nokogiri/include/libxslt/documents.h +93 -0
  68. data/ext/nokogiri/include/libxslt/extensions.h +262 -0
  69. data/ext/nokogiri/include/libxslt/extra.h +72 -0
  70. data/ext/nokogiri/include/libxslt/functions.h +78 -0
  71. data/ext/nokogiri/include/libxslt/imports.h +75 -0
  72. data/ext/nokogiri/include/libxslt/keys.h +53 -0
  73. data/ext/nokogiri/include/libxslt/namespaces.h +68 -0
  74. data/ext/nokogiri/include/libxslt/numbersInternals.h +73 -0
  75. data/ext/nokogiri/include/libxslt/pattern.h +84 -0
  76. data/ext/nokogiri/include/libxslt/preproc.h +43 -0
  77. data/ext/nokogiri/include/libxslt/security.h +104 -0
  78. data/ext/nokogiri/include/libxslt/templates.h +77 -0
  79. data/ext/nokogiri/include/libxslt/transform.h +207 -0
  80. data/ext/nokogiri/include/libxslt/variables.h +118 -0
  81. data/ext/nokogiri/include/libxslt/xslt.h +110 -0
  82. data/ext/nokogiri/include/libxslt/xsltInternals.h +1995 -0
  83. data/ext/nokogiri/include/libxslt/xsltconfig.h +146 -0
  84. data/ext/nokogiri/include/libxslt/xsltexports.h +64 -0
  85. data/ext/nokogiri/include/libxslt/xsltlocale.h +44 -0
  86. data/ext/nokogiri/include/libxslt/xsltutils.h +343 -0
  87. data/ext/nokogiri/libxml2_polyfill.c +114 -0
  88. data/ext/nokogiri/nokogiri.c +294 -0
  89. data/ext/nokogiri/nokogiri.h +238 -0
  90. data/ext/nokogiri/test_global_handlers.c +40 -0
  91. data/ext/nokogiri/xml_attr.c +103 -0
  92. data/ext/nokogiri/xml_attribute_decl.c +70 -0
  93. data/ext/nokogiri/xml_cdata.c +62 -0
  94. data/ext/nokogiri/xml_comment.c +57 -0
  95. data/ext/nokogiri/xml_document.c +784 -0
  96. data/ext/nokogiri/xml_document_fragment.c +29 -0
  97. data/ext/nokogiri/xml_dtd.c +208 -0
  98. data/ext/nokogiri/xml_element_content.c +131 -0
  99. data/ext/nokogiri/xml_element_decl.c +69 -0
  100. data/ext/nokogiri/xml_encoding_handler.c +112 -0
  101. data/ext/nokogiri/xml_entity_decl.c +112 -0
  102. data/ext/nokogiri/xml_entity_reference.c +50 -0
  103. data/ext/nokogiri/xml_namespace.c +181 -0
  104. data/ext/nokogiri/xml_node.c +2459 -0
  105. data/ext/nokogiri/xml_node_set.c +518 -0
  106. data/ext/nokogiri/xml_processing_instruction.c +54 -0
  107. data/ext/nokogiri/xml_reader.c +777 -0
  108. data/ext/nokogiri/xml_relax_ng.c +149 -0
  109. data/ext/nokogiri/xml_sax_parser.c +403 -0
  110. data/ext/nokogiri/xml_sax_parser_context.c +390 -0
  111. data/ext/nokogiri/xml_sax_push_parser.c +206 -0
  112. data/ext/nokogiri/xml_schema.c +226 -0
  113. data/ext/nokogiri/xml_syntax_error.c +93 -0
  114. data/ext/nokogiri/xml_text.c +59 -0
  115. data/ext/nokogiri/xml_xpath_context.c +486 -0
  116. data/ext/nokogiri/xslt_stylesheet.c +421 -0
  117. data/gumbo-parser/CHANGES.md +63 -0
  118. data/gumbo-parser/Makefile +129 -0
  119. data/gumbo-parser/THANKS +27 -0
  120. data/lib/nokogiri/3.1/nokogiri.so +0 -0
  121. data/lib/nokogiri/3.2/nokogiri.so +0 -0
  122. data/lib/nokogiri/3.3/nokogiri.so +0 -0
  123. data/lib/nokogiri/3.4/nokogiri.so +0 -0
  124. data/lib/nokogiri/class_resolver.rb +67 -0
  125. data/lib/nokogiri/css/node.rb +58 -0
  126. data/lib/nokogiri/css/parser.rb +772 -0
  127. data/lib/nokogiri/css/parser.y +277 -0
  128. data/lib/nokogiri/css/parser_extras.rb +36 -0
  129. data/lib/nokogiri/css/selector_cache.rb +38 -0
  130. data/lib/nokogiri/css/syntax_error.rb +9 -0
  131. data/lib/nokogiri/css/tokenizer.rb +155 -0
  132. data/lib/nokogiri/css/tokenizer.rex +57 -0
  133. data/lib/nokogiri/css/xpath_visitor.rb +375 -0
  134. data/lib/nokogiri/css.rb +132 -0
  135. data/lib/nokogiri/decorators/slop.rb +42 -0
  136. data/lib/nokogiri/encoding_handler.rb +57 -0
  137. data/lib/nokogiri/extension.rb +32 -0
  138. data/lib/nokogiri/gumbo.rb +15 -0
  139. data/lib/nokogiri/html.rb +48 -0
  140. data/lib/nokogiri/html4/builder.rb +37 -0
  141. data/lib/nokogiri/html4/document.rb +235 -0
  142. data/lib/nokogiri/html4/document_fragment.rb +166 -0
  143. data/lib/nokogiri/html4/element_description.rb +25 -0
  144. data/lib/nokogiri/html4/element_description_defaults.rb +2040 -0
  145. data/lib/nokogiri/html4/encoding_reader.rb +121 -0
  146. data/lib/nokogiri/html4/entity_lookup.rb +15 -0
  147. data/lib/nokogiri/html4/sax/parser.rb +48 -0
  148. data/lib/nokogiri/html4/sax/parser_context.rb +15 -0
  149. data/lib/nokogiri/html4/sax/push_parser.rb +37 -0
  150. data/lib/nokogiri/html4.rb +42 -0
  151. data/lib/nokogiri/html5/builder.rb +40 -0
  152. data/lib/nokogiri/html5/document.rb +199 -0
  153. data/lib/nokogiri/html5/document_fragment.rb +200 -0
  154. data/lib/nokogiri/html5/node.rb +103 -0
  155. data/lib/nokogiri/html5.rb +368 -0
  156. data/lib/nokogiri/jruby/dependencies.rb +3 -0
  157. data/lib/nokogiri/jruby/nokogiri_jars.rb +43 -0
  158. data/lib/nokogiri/syntax_error.rb +6 -0
  159. data/lib/nokogiri/version/constant.rb +6 -0
  160. data/lib/nokogiri/version/info.rb +224 -0
  161. data/lib/nokogiri/version.rb +4 -0
  162. data/lib/nokogiri/xml/attr.rb +66 -0
  163. data/lib/nokogiri/xml/attribute_decl.rb +22 -0
  164. data/lib/nokogiri/xml/builder.rb +494 -0
  165. data/lib/nokogiri/xml/cdata.rb +13 -0
  166. data/lib/nokogiri/xml/character_data.rb +9 -0
  167. data/lib/nokogiri/xml/document.rb +514 -0
  168. data/lib/nokogiri/xml/document_fragment.rb +276 -0
  169. data/lib/nokogiri/xml/dtd.rb +34 -0
  170. data/lib/nokogiri/xml/element_content.rb +46 -0
  171. data/lib/nokogiri/xml/element_decl.rb +17 -0
  172. data/lib/nokogiri/xml/entity_decl.rb +23 -0
  173. data/lib/nokogiri/xml/entity_reference.rb +20 -0
  174. data/lib/nokogiri/xml/namespace.rb +57 -0
  175. data/lib/nokogiri/xml/node/save_options.rb +76 -0
  176. data/lib/nokogiri/xml/node.rb +1650 -0
  177. data/lib/nokogiri/xml/node_set.rb +449 -0
  178. data/lib/nokogiri/xml/notation.rb +19 -0
  179. data/lib/nokogiri/xml/parse_options.rb +213 -0
  180. data/lib/nokogiri/xml/pp/character_data.rb +21 -0
  181. data/lib/nokogiri/xml/pp/node.rb +73 -0
  182. data/lib/nokogiri/xml/pp.rb +4 -0
  183. data/lib/nokogiri/xml/processing_instruction.rb +11 -0
  184. data/lib/nokogiri/xml/reader.rb +139 -0
  185. data/lib/nokogiri/xml/relax_ng.rb +75 -0
  186. data/lib/nokogiri/xml/sax/document.rb +258 -0
  187. data/lib/nokogiri/xml/sax/parser.rb +199 -0
  188. data/lib/nokogiri/xml/sax/parser_context.rb +129 -0
  189. data/lib/nokogiri/xml/sax/push_parser.rb +64 -0
  190. data/lib/nokogiri/xml/sax.rb +54 -0
  191. data/lib/nokogiri/xml/schema.rb +140 -0
  192. data/lib/nokogiri/xml/searchable.rb +274 -0
  193. data/lib/nokogiri/xml/syntax_error.rb +94 -0
  194. data/lib/nokogiri/xml/text.rb +11 -0
  195. data/lib/nokogiri/xml/xpath/syntax_error.rb +13 -0
  196. data/lib/nokogiri/xml/xpath.rb +21 -0
  197. data/lib/nokogiri/xml/xpath_context.rb +27 -0
  198. data/lib/nokogiri/xml.rb +65 -0
  199. data/lib/nokogiri/xslt/stylesheet.rb +49 -0
  200. data/lib/nokogiri/xslt.rb +129 -0
  201. data/lib/nokogiri.rb +128 -0
  202. data/lib/xsd/xmlparser/nokogiri.rb +105 -0
  203. metadata +321 -0
@@ -0,0 +1,121 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Nokogiri
4
+ module HTML4
5
+ # Libxml2's parser has poor support for encoding detection. First, it does not recognize the
6
+ # HTML5 style meta charset declaration. Secondly, even if it successfully detects an encoding
7
+ # hint, it does not re-decode or re-parse the preceding part which may be garbled.
8
+ #
9
+ # EncodingReader aims to perform advanced encoding detection beyond what Libxml2 does, and to
10
+ # emulate rewinding of a stream and make Libxml2 redo parsing from the start when an encoding
11
+ # hint is found.
12
+
13
+ # :nodoc: all
14
+ class EncodingReader
15
# Raised by EncodingReader#read to signal that an encoding was detected in the
# first chunk. The detected value travels with the exception.
class EncodingFound < StandardError
  # The encoding value handed to the constructor.
  attr_reader :found_encoding

  # encoding:: the detected encoding; stored on the exception and interpolated
  #            into its message.
  def initialize(encoding)
    super(format("encoding found: %s", encoding))
    @found_encoding = encoding
  end
end
23
+
24
# SAX handler that records the encoding announced by <meta> elements.
class SAXHandler < Nokogiri::XML::SAX::Document
  # The most recently seen encoding name, or +nil+ if none has been seen.
  attr_reader :encoding

  def initialize
    @encoding = nil
    super
  end

  # Inspects <meta> start tags only; every other element is ignored.
  #
  # name::  element name.
  # attrs:: attribute name/value pairs, converted with Hash[].
  def start_element(name, attrs = [])
    return unless name == "meta"

    attributes = Hash[attrs]

    # <meta charset="...">
    charset = attributes["charset"]
    @encoding = charset if charset

    # <meta http-equiv="Content-Type" content="...; charset=...">
    # Checked after the charset attribute, so it wins when both are present.
    http_equiv = attributes["http-equiv"]
    return unless http_equiv && http_equiv.match(/\AContent-Type\z/i)

    content = attributes["content"]
    return unless content

    declared = content.match(/;\s*charset\s*=\s*([\w-]+)/)
    @encoding = declared[1] if declared
  end
end
45
+
46
# SAXHandler variant that stops parsing early by throwing to a catch tag.
class JumpSAXHandler < SAXHandler
  # jumptag:: the tag passed to +throw+ when parsing should stop.
  def initialize(jumptag)
    @jumptag = jumptag
    super()
  end

  # Runs the inherited <meta> inspection, then throws the detected encoding as
  # soon as there is one, or +nil+ once one of the listed body-level elements
  # is reached.
  def start_element(name, attrs = [])
    super
    if @encoding
      throw(@jumptag, @encoding)
    elsif /\A(?:div|h1|img|p|br)\z/.match?(name)
      throw(@jumptag, nil)
    end
  end
end
58
+
59
# Best-effort detection of the encoding declared in +chunk+.
#
# chunk:: the leading portion of a document, as a String.
#
# Returns whatever encoding value the XML declaration or <meta> inspection
# reports, or +nil+ when nothing was found.
def self.detect_encoding(chunk)
  # A leading XML declaration is handed to the XML parser and its answer wins.
  xml_decl = chunk.match(/\A(<\?xml[ \t\r\n][^>]*>)/)
  return Nokogiri.XML(xml_decl[1]).encoding if xml_decl

  if Nokogiri.jruby?
    # Cheap textual scan first; fall back to a SAX parse that jumps out via
    # throw/catch as soon as JumpSAXHandler has an answer.
    meta = chunk.match(/(<meta\s)(.*)(charset\s*=\s*([\w-]+))(.*)/i)
    return meta[4] if meta

    catch(:encoding_found) do
      Nokogiri::HTML4::SAX::Parser.new(JumpSAXHandler.new(:encoding_found)).parse(chunk)
      nil
    end
  else
    handler = SAXHandler.new
    parser = Nokogiri::HTML4::SAX::PushParser.new(handler)
    begin
      parser << chunk
    rescue StandardError
      # Deliberately ignored: detection is best effort, and whatever the
      # handler captured before the failure is still returned below.
      # NOTE(review): the previous code was a bare `rescue` whose body was the
      # expression `Nokogiri::SyntaxError`, i.e. it never filtered by class.
      # The catch-all is kept to preserve behavior; narrow it only after
      # confirming which errors the push parser can raise here.
    end
    handler.encoding
  end
end
81
+
82
# io:: the wrapped stream; this class only ever calls +read+ on it.
#
# The first-chunk buffer and the pending EncodingFound both start out unset.
def initialize(io)
  @encoding_found = nil
  @firstchunk = nil
  @io = io
end
87
+
88
+ # This method is used by the C extension so that
89
+ # Nokogiri::HTML4::Document#read_io() does not leak memory when
90
+ # EncodingFound is raised.
91
+ attr_reader :encoding_found
92
+
93
# Reads up to +len+ bytes, serving the buffered first chunk before touching the
# underlying IO again. Calling without +len+ is not supported.
#
# On the very first call the chunk is run through
# EncodingReader.detect_encoding; when that reports an encoding, an
# EncodingFound is stored in @encoding_found and raised. The chunk stays
# buffered, so the next call replays it.
#
# Returns the bytes read, or +nil+ when nothing is left.
def read(len)
  unless @firstchunk
    @firstchunk = @io.read(len)
    return unless @firstchunk

    # This implementation expects the first call from htmlReadIO() to ask for
    # enough data (~1KB) for the advanced encoding detection to work.
    detected = EncodingReader.detect_encoding(@firstchunk)
    raise(@encoding_found = EncodingFound.new(detected)) if detected
  end
  @encoding_found = nil

  # Drain the buffered chunk first, then top up from the IO if the caller
  # asked for more than the buffer still held.
  buffer = @firstchunk.slice!(0, len)
  remaining = len - buffer.length
  if remaining > 0
    tail = @io.read(remaining)
    buffer << tail if tail
  end

  buffer.empty? ? nil : buffer
end
119
+ end
120
+ end
121
+ end
@@ -0,0 +1,15 @@
1
+ # frozen_string_literal: true
2
+
3
module Nokogiri
  module HTML4
    # Plain record with +value+, +name+ and +description+ members.
    class EntityDescription < Struct.new(:value, :name, :description); end

    class EntityLookup
      ###
      # Look up entity with +name+
      #
      # Returns the +value+ of whatever +get+ yields for +name+, or the falsy
      # result of +get+ itself when there is no match. (+get+ is not defined
      # in this file.)
      def [](name)
        entity = get(name)
        entity && entity.value
      end
    end
  end
end
@@ -0,0 +1,48 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Nokogiri
4
+ module HTML4
5
+ ###
6
+ # Nokogiri provides a SAX parser to process HTML4 which will provide HTML recovery
7
+ # ("autocorrection") features.
8
+ #
9
+ # See Nokogiri::HTML4::SAX::Parser for a basic example of using a SAX parser with HTML.
10
+ #
11
+ # For more information on SAX parsers, see Nokogiri::XML::SAX
12
+ #
13
+ module SAX
14
+ ###
15
+ # This parser is a SAX style parser that reads its input as it deems necessary. The parser
16
+ # takes a Nokogiri::XML::SAX::Document, an optional encoding, then given an HTML input, sends
17
+ # messages to the Nokogiri::XML::SAX::Document.
18
+ #
19
+ # ⚠ This is an HTML4 parser and so may not support some HTML5 features and behaviors.
20
+ #
21
+ # Here is a basic usage example:
22
+ #
23
+ # class MyHandler < Nokogiri::XML::SAX::Document
24
+ # def start_element name, attributes = []
25
+ # puts "found a #{name}"
26
+ # end
27
+ # end
28
+ #
29
+ # parser = Nokogiri::HTML4::SAX::Parser.new(MyHandler.new)
30
+ #
31
+ # # Hand an IO object to the parser, which will read the HTML from the IO.
32
+ # File.open(path_to_html) do |f|
33
+ # parser.parse(f)
34
+ # end
35
+ #
36
+ # For more information on \SAX parsers, see Nokogiri::XML::SAX or the parent class
37
+ # Nokogiri::XML::SAX::Parser.
38
+ #
39
+ # Also see Nokogiri::XML::SAX::Document for the available events.
40
+ #
41
+ class Parser < Nokogiri::XML::SAX::Parser
42
+ # This class defines no methods of its own here; it inherits its behavior from
+ # Nokogiri::XML::SAX::Parser. Note that the superclass
43
+ # uses Nokogiri::ClassResolver to select HTML4::SAX::ParserContext as the context class for
44
+ # this class, which is where the real behavioral differences are implemented.
45
+ end
46
+ end
47
+ end
48
+ end
@@ -0,0 +1,15 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Nokogiri
4
+ module HTML4
5
+ module SAX
6
+ ###
7
+ # Context object to invoke the HTML4 SAX parser on the SAX::Document handler.
8
+ #
9
+ # 💡 This class is usually not instantiated by the user. Use Nokogiri::HTML4::SAX::Parser
10
+ # instead.
11
+ class ParserContext < Nokogiri::XML::SAX::ParserContext
+ # No methods are defined in this file; everything visible here is inherited from
+ # Nokogiri::XML::SAX::ParserContext.
+ # NOTE(review): HTML4-specific behavior is presumably supplied elsewhere (e.g. the native
+ # extension) - confirm.
12
+ end
13
+ end
14
+ end
15
+ end
@@ -0,0 +1,37 @@
1
+ # frozen_string_literal: true
2
+
3
module Nokogiri
  module HTML4
    module SAX
      class PushParser
        # The Nokogiri::XML::SAX::Document on which the PushParser will be
        # operating
        attr_accessor :document

        # doc::       the SAX document handler that receives the callbacks.
        #             Defaults to a plain Nokogiri::XML::SAX::Document. The old
        #             default referenced HTML4::SAX::Document, a constant that
        #             is not defined, so a no-argument call raised NameError.
        # file_name:: passed through to +initialize_native+ unchanged.
        # encoding::  passed to the wrapped HTML4::SAX::Parser and to
        #             +initialize_native+.
        #
        # +initialize_native+ is not defined in this file.
        # NOTE(review): presumably provided by the native extension - confirm.
        def initialize(doc = Nokogiri::XML::SAX::Document.new, file_name = nil, encoding = "UTF-8")
          @document = doc
          @encoding = encoding
          @sax_parser = HTML4::SAX::Parser.new(doc, @encoding)

          ## Create our push parser context
          initialize_native(@sax_parser, file_name, encoding)
        end

        ###
        # Write a +chunk+ of HTML to the PushParser. Any callback methods
        # that can be called will be called immediately.
        #
        # last_chunk:: forwarded to +native_write+ unchanged; #finish passes +true+.
        def write(chunk, last_chunk = false)
          native_write(chunk, last_chunk)
        end
        alias_method :<<, :write

        ###
        # Finish the parsing. This method is only necessary for
        # Nokogiri::XML::SAX::Document#end_document to be called.
        def finish
          write("", true)
        end
      end
    end
  end
end
@@ -0,0 +1,42 @@
1
+ # coding: utf-8
2
+ # frozen_string_literal: true
3
+
4
+ module Nokogiri
5
# Convenience method for Nokogiri::HTML4::Document.parse. Every positional
# argument, keyword argument and block is forwarded untouched.
def self.HTML4(...)
  Nokogiri::HTML4::Document.parse(...)
end
11
+
12
+ # Since v1.12.0
13
+ #
14
+ # 💡 Before v1.12.0, Nokogiri::HTML4 did not exist, and Nokogiri::HTML was the module/namespace
15
+ # for parsing HTML.
16
+ module HTML4
17
# Convenience method for Nokogiri::HTML4::Document.parse; all arguments and
# any block are forwarded as-is.
def self.parse(...)
  Document.parse(...)
end

# Convenience method for Nokogiri::HTML4::DocumentFragment.parse; all
# arguments and any block are forwarded as-is.
def self.fragment(...)
  HTML4::DocumentFragment.parse(...)
end
28
+
29
+ # Instance of Nokogiri::HTML4::EntityLookup
30
+ NamedCharacters = EntityLookup.new
31
+ end
32
+ end
33
+
34
+ require_relative "html4/entity_lookup"
35
+ require_relative "html4/document"
36
+ require_relative "html4/document_fragment"
37
+ require_relative "html4/encoding_reader"
38
+ require_relative "html4/sax/parser_context"
39
+ require_relative "html4/sax/parser"
40
+ require_relative "html4/sax/push_parser"
41
+ require_relative "html4/element_description"
42
+ require_relative "html4/element_description_defaults"
@@ -0,0 +1,40 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Nokogiri
4
+ module HTML5
5
+ ###
6
+ # Nokogiri HTML5 builder is used for building HTML documents. It is very similar to the
7
+ # Nokogiri::XML::Builder. In fact, you should go read the documentation for
8
+ # Nokogiri::XML::Builder before reading this documentation.
9
+ #
10
+ # The construction behavior is identical to HTML4::Builder, but HTML5 documents implement the
11
+ # [HTML5 standard's serialization
12
+ # algorithm](https://www.w3.org/TR/2008/WD-html5-20080610/serializing.html).
13
+ #
14
+ # == Synopsis:
15
+ #
16
+ # Create an HTML5 document with a body that has an onload attribute, and a
17
+ # span tag with a class of "bold" that has content of "Hello world".
18
+ #
19
+ # builder = Nokogiri::HTML5::Builder.new do |doc|
20
+ # doc.html {
21
+ # doc.body(:onload => 'some_func();') {
22
+ # doc.span.bold {
23
+ # doc.text "Hello world"
24
+ # }
25
+ # }
26
+ # }
27
+ # end
28
+ # puts builder.to_html
29
+ #
30
+ # The HTML5 builder inherits from the XML builder, so make sure to read the
31
+ # Nokogiri::XML::Builder documentation.
32
+ class Builder < Nokogiri::XML::Builder
33
+ ###
34
+ # Convert the builder to HTML
+ #
+ # Returns the result of calling +to_html+ on the document held in +@doc+.
+ # NOTE(review): +@doc+ is not assigned in this file; presumably set up by
+ # Nokogiri::XML::Builder - confirm.
35
+ def to_html
36
+ @doc.to_html
37
+ end
38
+ end
39
+ end
40
+ end
@@ -0,0 +1,199 @@
1
+ # coding: utf-8
2
+ # frozen_string_literal: true
3
+
4
+ #
5
+ # Copyright 2013-2021 Sam Ruby, Stephen Checkoway
6
+ #
7
+ # Licensed under the Apache License, Version 2.0 (the "License");
8
+ # you may not use this file except in compliance with the License.
9
+ # You may obtain a copy of the License at
10
+ #
11
+ # http://www.apache.org/licenses/LICENSE-2.0
12
+ #
13
+ # Unless required by applicable law or agreed to in writing, software
14
+ # distributed under the License is distributed on an "AS IS" BASIS,
15
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16
+ # See the License for the specific language governing permissions and
17
+ # limitations under the License.
18
+ #
19
+
20
+ require_relative "../html4/document"
21
+
22
+ module Nokogiri
23
+ module HTML5
24
+ # Enum for the HTML5 parser quirks mode values. Values returned by HTML5::Document#quirks_mode
25
+ #
26
+ # See https://dom.spec.whatwg.org/#concept-document-quirks for more information on HTML5 quirks
27
+ # mode.
28
+ #
29
+ # Since v1.14.0
30
module QuirksMode
  # The document was parsed in "no-quirks" mode
  NO_QUIRKS = 0

  # The document was parsed in "quirks" mode
  QUIRKS = 1

  # The document was parsed in "limited-quirks" mode
  LIMITED_QUIRKS = 2
end
35
+
36
+ # Since v1.12.0
37
+ #
38
+ # 💡 HTML5 functionality is not available when running JRuby.
39
+ class Document < Nokogiri::HTML4::Document
40
+ # Get the url name for this document, as passed into Document.parse, Document.read_io, or
41
+ # Document.read_memory
42
+ attr_reader :url
43
+
44
+ # Get the parser's quirks mode value. See HTML5::QuirksMode.
45
+ #
46
+ # This method returns +nil+ if the parser was not invoked (e.g., Nokogiri::HTML5::Document.new).
47
+ #
48
+ # Since v1.14.0
49
+ attr_reader :quirks_mode
50
+
51
+ class << self
52
+ # :call-seq:
53
+ # parse(input) { |options| ... } → HTML5::Document
54
+ # parse(input, url:, encoding:) { |options| ... } → HTML5::Document
55
+ # parse(input, **options) → HTML5::Document
56
+ #
57
+ # Parse \HTML input with a parser compliant with the HTML5 spec. This method uses the
58
+ # encoding of +input+ if it can be determined, or else falls back to the +encoding:+
59
+ # parameter.
60
+ #
61
+ # [Required Parameters]
62
+ # - +input+ (String | IO) the \HTML content to be parsed.
63
+ #
64
+ # [Optional Parameters]
65
+ # - +url:+ (String) the base URI of the document.
66
+ #
67
+ # [Optional Keyword Arguments]
68
+ # - +encoding:+ (Encoding) The name of the encoding that should be used when processing the
69
+ # document. When not provided, the encoding will be determined based on the document
70
+ # content.
71
+ #
72
+ # - +max_errors:+ (Integer) The maximum number of parse errors to record. (default
73
+ # +Nokogiri::Gumbo::DEFAULT_MAX_ERRORS+ which is currently 0)
74
+ #
75
+ # - +max_tree_depth:+ (Integer) The maximum depth of the parse tree. (default
76
+ # +Nokogiri::Gumbo::DEFAULT_MAX_TREE_DEPTH+)
77
+ #
78
+ # - +max_attributes:+ (Integer) The maximum number of attributes allowed on an
79
+ # element. (default +Nokogiri::Gumbo::DEFAULT_MAX_ATTRIBUTES+)
80
+ #
81
+ # - +parse_noscript_content_as_text:+ (Boolean) Whether to parse the content of +noscript+
82
+ # elements as text. (default +false+)
83
+ #
84
+ # See rdoc-ref:HTML5@Parsing+options for a complete description of these parsing options.
85
+ #
86
+ # [Yields]
87
+ # If present, the block will be passed a Hash object to modify with parse options before the
88
+ # input is parsed. See rdoc-ref:HTML5@Parsing+options for a list of available options.
89
+ #
90
+ # ⚠ Note that +url:+ and +encoding:+ cannot be set by the configuration block.
91
+ #
92
+ # [Returns] Nokogiri::HTML5::Document
93
+ #
94
+ # *Example:* Parse input read from an IO (here, a socket) with a specific encoding and custom max errors limit.
95
+ #
96
+ # Nokogiri::HTML5::Document.parse(socket, encoding: "ISO-8859-1", max_errors: 10)
97
+ #
98
+ # *Example:* Parse a string setting the +:parse_noscript_content_as_text+ option using the
99
+ # configuration block parameter.
100
+ #
101
+ # Nokogiri::HTML5::Document.parse(input) { |c| c[:parse_noscript_content_as_text] = true }
102
+ #
103
# Implementation notes:
# - the block, if given, is yielded the options Hash before anything else;
# - a +nil+/+false+ input is treated as an empty String;
# - +encoding+ falls back to the input's own encoding unless that encoding is
#   ASCII-8BIT, and +url+ falls back to the input's +path+ when the input
#   responds to both +read+ and +path+;
# - ArgumentError is raised when the input responds to neither +read+ nor
#   +to_str+.
def parse(
  string_or_io,
  url_ = nil, encoding_ = nil,
  url: url_, encoding: encoding_,
  **options, &block
)
  yield options if block
  string_or_io ||= ""

  if string_or_io.respond_to?(:encoding) && string_or_io.encoding != Encoding::ASCII_8BIT
    encoding ||= string_or_io.encoding.name
  end

  readable = string_or_io.respond_to?(:read)
  url ||= string_or_io.path if readable && string_or_io.respond_to?(:path)

  raise ArgumentError, "not a string or IO object" unless readable || string_or_io.respond_to?(:to_str)

  do_parse(string_or_io, url, encoding, **options)
end
125
+
126
+ # Create a new document from an IO object.
127
+ #
128
+ # 💡 Most users should prefer Document.parse to this method.
129
# io:: must respond to +read+, otherwise ArgumentError is raised.
#
# +url+ and +encoding+ may be given positionally or as keywords; remaining
# keywords are handed to +do_parse+ unchanged.
def read_io(io, url_ = nil, encoding_ = nil, url: url_, encoding: encoding_, **options)
  unless io.respond_to?(:read)
    raise ArgumentError, "io object doesn't respond to :read"
  end

  do_parse(io, url, encoding, **options)
end
134
+
135
+ # Create a new document from a String.
136
+ #
137
+ # 💡 Most users should prefer Document.parse to this method.
138
# string:: must respond to +to_str+, otherwise ArgumentError is raised.
#
# +url+ and +encoding+ may be given positionally or as keywords; remaining
# keywords are handed to +do_parse+ unchanged.
def read_memory(string, url_ = nil, encoding_ = nil, url: url_, encoding: encoding_, **options)
  unless string.respond_to?(:to_str)
    raise ArgumentError, "string object doesn't respond to :to_str"
  end

  do_parse(string, url, encoding, **options)
end
143
+
144
+ private
145
+
146
# Shared backend for parse, read_io and read_memory.
#
# string_or_io:: the input; converted by HTML5.read_and_encode.
# url::          handed to Nokogiri::Gumbo.parse unchanged.
# encoding::     handed to HTML5.read_and_encode.
# options::      parser limits; any that are missing (or falsy) are filled in
#                from the Nokogiri::Gumbo::DEFAULT_* constants.
#
# Returns the document produced by Nokogiri::Gumbo.parse, with its encoding
# set to "UTF-8".
def do_parse(string_or_io, url, encoding, **options)
  string = HTML5.read_and_encode(string_or_io, encoding)

  # +:max_parse_errors+ is accepted as an alias for +:max_errors+. Always take
  # it out of the Hash so it is never forwarded to Gumbo.parse; previously it
  # was only removed when +:max_errors+ was absent. An explicit +:max_errors+
  # still takes precedence.
  legacy_max_errors = options.delete(:max_parse_errors)

  options[:max_attributes] ||= Nokogiri::Gumbo::DEFAULT_MAX_ATTRIBUTES
  options[:max_errors] ||= legacy_max_errors || Nokogiri::Gumbo::DEFAULT_MAX_ERRORS
  options[:max_tree_depth] ||= Nokogiri::Gumbo::DEFAULT_MAX_TREE_DEPTH

  doc = Nokogiri::Gumbo.parse(string, url, self, **options)
  doc.encoding = "UTF-8"
  doc
end
157
+ end
158
+
159
def initialize(*args) # :nodoc:
  super
  # Start both readers off as nil; nothing in this constructor derives them
  # from +args+.
  @quirks_mode = nil
  @url = nil
end
164
+
165
+ # :call-seq:
166
+ # fragment() → Nokogiri::HTML5::DocumentFragment
167
+ # fragment(markup) → Nokogiri::HTML5::DocumentFragment
168
+ #
169
+ # Parse a HTML5 document fragment from +markup+, returning a Nokogiri::HTML5::DocumentFragment.
170
+ #
171
+ # [Properties]
172
+ # - +markup+ (String) The HTML5 markup fragment to be parsed
173
+ #
174
+ # [Returns]
175
+ # Nokogiri::HTML5::DocumentFragment. This object's children will be empty if +markup+ is not
176
+ # passed, is empty, or is +nil+.
177
+ #
178
+ def fragment(markup = nil)
+ # Hand this document and the (possibly nil) markup straight to DocumentFragment.new.
179
+ DocumentFragment.new(self, markup)
180
+ end
181
+
182
def to_xml(options = {}, &block) # :nodoc:
  # Bypass XML::Document#to_xml which doesn't add
  # XML::Node::SaveOptions::AS_XML like XML::Node#to_xml does.
  # The XML::Node implementation is looked up explicitly and invoked on this
  # object, skipping normal method resolution.
  node_to_xml = XML::Node.instance_method(:to_xml)
  node_to_xml.bind_call(self, options, &block)
end
187
+
188
+ # :call-seq:
189
+ # xpath_doctype() → Nokogiri::CSS::XPathVisitor::DoctypeConfig
190
+ #
191
+ # [Returns] The document type which determines CSS-to-XPath translation.
192
+ #
193
+ # See CSS::XPathVisitor for more information.
194
+ def xpath_doctype
+ # Always returns the same constant; no instance state is consulted.
195
+ Nokogiri::CSS::XPathVisitor::DoctypeConfig::HTML5
196
+ end
197
+ end
198
+ end
199
+ end