nokogiri 1.10.10 → 1.14.3-aarch64-linux

Sign up to get free protection for your applications and to get access to all the features.

Potentially problematic release.


This version of nokogiri might be problematic. Click here for more details.

Files changed (251)
  1. checksums.yaml +4 -4
  2. data/Gemfile +44 -0
  3. data/LICENSE-DEPENDENCIES.md +1632 -1022
  4. data/LICENSE.md +1 -1
  5. data/README.md +185 -96
  6. data/bin/nokogiri +63 -50
  7. data/dependencies.yml +33 -66
  8. data/ext/nokogiri/depend +38 -358
  9. data/ext/nokogiri/extconf.rb +819 -421
  10. data/ext/nokogiri/gumbo.c +594 -0
  11. data/ext/nokogiri/html4_document.c +166 -0
  12. data/ext/nokogiri/html4_element_description.c +294 -0
  13. data/ext/nokogiri/html4_entity_lookup.c +37 -0
  14. data/ext/nokogiri/html4_sax_parser_context.c +114 -0
  15. data/ext/nokogiri/html4_sax_push_parser.c +95 -0
  16. data/ext/nokogiri/include/libexslt/exslt.h +108 -0
  17. data/ext/nokogiri/include/libexslt/exsltconfig.h +70 -0
  18. data/ext/nokogiri/include/libexslt/exsltexports.h +63 -0
  19. data/ext/nokogiri/include/libxml2/libxml/HTMLparser.h +306 -0
  20. data/ext/nokogiri/include/libxml2/libxml/HTMLtree.h +147 -0
  21. data/ext/nokogiri/include/libxml2/libxml/SAX.h +204 -0
  22. data/ext/nokogiri/include/libxml2/libxml/SAX2.h +172 -0
  23. data/ext/nokogiri/include/libxml2/libxml/c14n.h +128 -0
  24. data/ext/nokogiri/include/libxml2/libxml/catalog.h +182 -0
  25. data/ext/nokogiri/include/libxml2/libxml/chvalid.h +230 -0
  26. data/ext/nokogiri/include/libxml2/libxml/debugXML.h +217 -0
  27. data/ext/nokogiri/include/libxml2/libxml/dict.h +81 -0
  28. data/ext/nokogiri/include/libxml2/libxml/encoding.h +232 -0
  29. data/ext/nokogiri/include/libxml2/libxml/entities.h +153 -0
  30. data/ext/nokogiri/include/libxml2/libxml/globals.h +499 -0
  31. data/ext/nokogiri/include/libxml2/libxml/hash.h +236 -0
  32. data/ext/nokogiri/include/libxml2/libxml/list.h +137 -0
  33. data/ext/nokogiri/include/libxml2/libxml/nanoftp.h +186 -0
  34. data/ext/nokogiri/include/libxml2/libxml/nanohttp.h +81 -0
  35. data/ext/nokogiri/include/libxml2/libxml/parser.h +1244 -0
  36. data/ext/nokogiri/include/libxml2/libxml/parserInternals.h +656 -0
  37. data/ext/nokogiri/include/libxml2/libxml/pattern.h +100 -0
  38. data/ext/nokogiri/include/libxml2/libxml/relaxng.h +218 -0
  39. data/ext/nokogiri/include/libxml2/libxml/schemasInternals.h +958 -0
  40. data/ext/nokogiri/include/libxml2/libxml/schematron.h +142 -0
  41. data/ext/nokogiri/include/libxml2/libxml/threads.h +91 -0
  42. data/ext/nokogiri/include/libxml2/libxml/tree.h +1312 -0
  43. data/ext/nokogiri/include/libxml2/libxml/uri.h +94 -0
  44. data/ext/nokogiri/include/libxml2/libxml/valid.h +463 -0
  45. data/ext/nokogiri/include/libxml2/libxml/xinclude.h +129 -0
  46. data/ext/nokogiri/include/libxml2/libxml/xlink.h +189 -0
  47. data/ext/nokogiri/include/libxml2/libxml/xmlIO.h +368 -0
  48. data/ext/nokogiri/include/libxml2/libxml/xmlautomata.h +146 -0
  49. data/ext/nokogiri/include/libxml2/libxml/xmlerror.h +947 -0
  50. data/ext/nokogiri/include/libxml2/libxml/xmlexports.h +77 -0
  51. data/ext/nokogiri/include/libxml2/libxml/xmlmemory.h +226 -0
  52. data/ext/nokogiri/include/libxml2/libxml/xmlmodule.h +57 -0
  53. data/ext/nokogiri/include/libxml2/libxml/xmlreader.h +428 -0
  54. data/ext/nokogiri/include/libxml2/libxml/xmlregexp.h +222 -0
  55. data/ext/nokogiri/include/libxml2/libxml/xmlsave.h +88 -0
  56. data/ext/nokogiri/include/libxml2/libxml/xmlschemas.h +246 -0
  57. data/ext/nokogiri/include/libxml2/libxml/xmlschemastypes.h +152 -0
  58. data/ext/nokogiri/include/libxml2/libxml/xmlstring.h +140 -0
  59. data/ext/nokogiri/include/libxml2/libxml/xmlunicode.h +202 -0
  60. data/ext/nokogiri/include/libxml2/libxml/xmlversion.h +503 -0
  61. data/ext/nokogiri/include/libxml2/libxml/xmlwriter.h +488 -0
  62. data/ext/nokogiri/include/libxml2/libxml/xpath.h +575 -0
  63. data/ext/nokogiri/include/libxml2/libxml/xpathInternals.h +632 -0
  64. data/ext/nokogiri/include/libxml2/libxml/xpointer.h +137 -0
  65. data/ext/nokogiri/include/libxslt/attributes.h +38 -0
  66. data/ext/nokogiri/include/libxslt/documents.h +93 -0
  67. data/ext/nokogiri/include/libxslt/extensions.h +262 -0
  68. data/ext/nokogiri/include/libxslt/extra.h +72 -0
  69. data/ext/nokogiri/include/libxslt/functions.h +78 -0
  70. data/ext/nokogiri/include/libxslt/imports.h +75 -0
  71. data/ext/nokogiri/include/libxslt/keys.h +53 -0
  72. data/ext/nokogiri/include/libxslt/namespaces.h +68 -0
  73. data/ext/nokogiri/include/libxslt/numbersInternals.h +73 -0
  74. data/ext/nokogiri/include/libxslt/pattern.h +84 -0
  75. data/ext/nokogiri/include/libxslt/preproc.h +43 -0
  76. data/ext/nokogiri/include/libxslt/security.h +104 -0
  77. data/ext/nokogiri/include/libxslt/templates.h +77 -0
  78. data/ext/nokogiri/include/libxslt/transform.h +207 -0
  79. data/ext/nokogiri/include/libxslt/variables.h +118 -0
  80. data/ext/nokogiri/include/libxslt/xslt.h +110 -0
  81. data/ext/nokogiri/include/libxslt/xsltInternals.h +1982 -0
  82. data/ext/nokogiri/include/libxslt/xsltconfig.h +179 -0
  83. data/ext/nokogiri/include/libxslt/xsltexports.h +64 -0
  84. data/ext/nokogiri/include/libxslt/xsltlocale.h +76 -0
  85. data/ext/nokogiri/include/libxslt/xsltutils.h +310 -0
  86. data/ext/nokogiri/libxml2_backwards_compat.c +121 -0
  87. data/ext/nokogiri/nokogiri.c +228 -104
  88. data/ext/nokogiri/nokogiri.h +204 -90
  89. data/ext/nokogiri/test_global_handlers.c +40 -0
  90. data/ext/nokogiri/xml_attr.c +17 -17
  91. data/ext/nokogiri/xml_attribute_decl.c +22 -22
  92. data/ext/nokogiri/xml_cdata.c +15 -20
  93. data/ext/nokogiri/xml_comment.c +19 -26
  94. data/ext/nokogiri/xml_document.c +306 -225
  95. data/ext/nokogiri/xml_document_fragment.c +12 -16
  96. data/ext/nokogiri/xml_dtd.c +64 -58
  97. data/ext/nokogiri/xml_element_content.c +33 -28
  98. data/ext/nokogiri/xml_element_decl.c +26 -26
  99. data/ext/nokogiri/xml_encoding_handler.c +45 -20
  100. data/ext/nokogiri/xml_entity_decl.c +37 -35
  101. data/ext/nokogiri/xml_entity_reference.c +16 -18
  102. data/ext/nokogiri/xml_namespace.c +136 -61
  103. data/ext/nokogiri/xml_node.c +1344 -672
  104. data/ext/nokogiri/xml_node_set.c +178 -168
  105. data/ext/nokogiri/xml_processing_instruction.c +17 -19
  106. data/ext/nokogiri/xml_reader.c +316 -190
  107. data/ext/nokogiri/xml_relax_ng.c +52 -30
  108. data/ext/nokogiri/xml_sax_parser.c +130 -124
  109. data/ext/nokogiri/xml_sax_parser_context.c +110 -89
  110. data/ext/nokogiri/xml_sax_push_parser.c +36 -29
  111. data/ext/nokogiri/xml_schema.c +98 -50
  112. data/ext/nokogiri/xml_syntax_error.c +42 -21
  113. data/ext/nokogiri/xml_text.c +14 -18
  114. data/ext/nokogiri/xml_xpath_context.c +263 -148
  115. data/ext/nokogiri/xslt_stylesheet.c +271 -178
  116. data/gumbo-parser/CHANGES.md +63 -0
  117. data/gumbo-parser/Makefile +111 -0
  118. data/gumbo-parser/THANKS +27 -0
  119. data/lib/nokogiri/2.7/nokogiri.so +0 -0
  120. data/lib/nokogiri/3.0/nokogiri.so +0 -0
  121. data/lib/nokogiri/3.1/nokogiri.so +0 -0
  122. data/lib/nokogiri/3.2/nokogiri.so +0 -0
  123. data/lib/nokogiri/class_resolver.rb +67 -0
  124. data/lib/nokogiri/css/node.rb +10 -8
  125. data/lib/nokogiri/css/parser.rb +397 -377
  126. data/lib/nokogiri/css/parser.y +250 -245
  127. data/lib/nokogiri/css/parser_extras.rb +54 -49
  128. data/lib/nokogiri/css/syntax_error.rb +3 -1
  129. data/lib/nokogiri/css/tokenizer.rb +5 -3
  130. data/lib/nokogiri/css/tokenizer.rex +3 -2
  131. data/lib/nokogiri/css/xpath_visitor.rb +223 -94
  132. data/lib/nokogiri/css.rb +56 -17
  133. data/lib/nokogiri/decorators/slop.rb +9 -7
  134. data/lib/nokogiri/encoding_handler.rb +57 -0
  135. data/lib/nokogiri/extension.rb +32 -0
  136. data/lib/nokogiri/gumbo.rb +15 -0
  137. data/lib/nokogiri/html.rb +38 -27
  138. data/lib/nokogiri/{html → html4}/builder.rb +4 -2
  139. data/lib/nokogiri/html4/document.rb +214 -0
  140. data/lib/nokogiri/html4/document_fragment.rb +54 -0
  141. data/lib/nokogiri/{html → html4}/element_description.rb +3 -1
  142. data/lib/nokogiri/html4/element_description_defaults.rb +572 -0
  143. data/lib/nokogiri/html4/encoding_reader.rb +121 -0
  144. data/lib/nokogiri/{html → html4}/entity_lookup.rb +4 -2
  145. data/lib/nokogiri/{html → html4}/sax/parser.rb +17 -16
  146. data/lib/nokogiri/html4/sax/parser_context.rb +20 -0
  147. data/lib/nokogiri/{html → html4}/sax/push_parser.rb +12 -11
  148. data/lib/nokogiri/html4.rb +47 -0
  149. data/lib/nokogiri/html5/document.rb +168 -0
  150. data/lib/nokogiri/html5/document_fragment.rb +90 -0
  151. data/lib/nokogiri/html5/node.rb +98 -0
  152. data/lib/nokogiri/html5.rb +389 -0
  153. data/lib/nokogiri/jruby/dependencies.rb +3 -0
  154. data/lib/nokogiri/jruby/nokogiri_jars.rb +43 -0
  155. data/lib/nokogiri/syntax_error.rb +2 -0
  156. data/lib/nokogiri/version/constant.rb +6 -0
  157. data/lib/nokogiri/version/info.rb +223 -0
  158. data/lib/nokogiri/version.rb +3 -108
  159. data/lib/nokogiri/xml/attr.rb +55 -3
  160. data/lib/nokogiri/xml/attribute_decl.rb +3 -1
  161. data/lib/nokogiri/xml/builder.rb +75 -34
  162. data/lib/nokogiri/xml/cdata.rb +3 -1
  163. data/lib/nokogiri/xml/character_data.rb +2 -0
  164. data/lib/nokogiri/xml/document.rb +312 -126
  165. data/lib/nokogiri/xml/document_fragment.rb +93 -48
  166. data/lib/nokogiri/xml/dtd.rb +4 -2
  167. data/lib/nokogiri/xml/element_content.rb +2 -0
  168. data/lib/nokogiri/xml/element_decl.rb +3 -1
  169. data/lib/nokogiri/xml/entity_decl.rb +4 -2
  170. data/lib/nokogiri/xml/entity_reference.rb +2 -0
  171. data/lib/nokogiri/xml/namespace.rb +45 -0
  172. data/lib/nokogiri/xml/node/save_options.rb +15 -8
  173. data/lib/nokogiri/xml/node.rb +1067 -406
  174. data/lib/nokogiri/xml/node_set.rb +135 -59
  175. data/lib/nokogiri/xml/notation.rb +13 -0
  176. data/lib/nokogiri/xml/parse_options.rb +145 -52
  177. data/lib/nokogiri/xml/pp/character_data.rb +9 -6
  178. data/lib/nokogiri/xml/pp/node.rb +27 -26
  179. data/lib/nokogiri/xml/pp.rb +4 -2
  180. data/lib/nokogiri/xml/processing_instruction.rb +4 -1
  181. data/lib/nokogiri/xml/reader.rb +21 -28
  182. data/lib/nokogiri/xml/relax_ng.rb +8 -2
  183. data/lib/nokogiri/xml/sax/document.rb +45 -49
  184. data/lib/nokogiri/xml/sax/parser.rb +39 -36
  185. data/lib/nokogiri/xml/sax/parser_context.rb +8 -3
  186. data/lib/nokogiri/xml/sax/push_parser.rb +6 -5
  187. data/lib/nokogiri/xml/sax.rb +6 -4
  188. data/lib/nokogiri/xml/schema.rb +19 -9
  189. data/lib/nokogiri/xml/searchable.rb +112 -72
  190. data/lib/nokogiri/xml/syntax_error.rb +6 -4
  191. data/lib/nokogiri/xml/text.rb +2 -0
  192. data/lib/nokogiri/xml/xpath/syntax_error.rb +4 -2
  193. data/lib/nokogiri/xml/xpath.rb +15 -4
  194. data/lib/nokogiri/xml/xpath_context.rb +3 -3
  195. data/lib/nokogiri/xml.rb +38 -37
  196. data/lib/nokogiri/xslt/stylesheet.rb +3 -1
  197. data/lib/nokogiri/xslt.rb +29 -20
  198. data/lib/nokogiri.rb +48 -72
  199. data/lib/xsd/xmlparser/nokogiri.rb +29 -25
  200. metadata +146 -307
  201. data/ext/nokogiri/html_document.c +0 -170
  202. data/ext/nokogiri/html_document.h +0 -10
  203. data/ext/nokogiri/html_element_description.c +0 -279
  204. data/ext/nokogiri/html_element_description.h +0 -10
  205. data/ext/nokogiri/html_entity_lookup.c +0 -32
  206. data/ext/nokogiri/html_entity_lookup.h +0 -8
  207. data/ext/nokogiri/html_sax_parser_context.c +0 -116
  208. data/ext/nokogiri/html_sax_parser_context.h +0 -11
  209. data/ext/nokogiri/html_sax_push_parser.c +0 -87
  210. data/ext/nokogiri/html_sax_push_parser.h +0 -9
  211. data/ext/nokogiri/xml_attr.h +0 -9
  212. data/ext/nokogiri/xml_attribute_decl.h +0 -9
  213. data/ext/nokogiri/xml_cdata.h +0 -9
  214. data/ext/nokogiri/xml_comment.h +0 -9
  215. data/ext/nokogiri/xml_document.h +0 -23
  216. data/ext/nokogiri/xml_document_fragment.h +0 -10
  217. data/ext/nokogiri/xml_dtd.h +0 -10
  218. data/ext/nokogiri/xml_element_content.h +0 -10
  219. data/ext/nokogiri/xml_element_decl.h +0 -9
  220. data/ext/nokogiri/xml_encoding_handler.h +0 -8
  221. data/ext/nokogiri/xml_entity_decl.h +0 -10
  222. data/ext/nokogiri/xml_entity_reference.h +0 -9
  223. data/ext/nokogiri/xml_io.c +0 -61
  224. data/ext/nokogiri/xml_io.h +0 -11
  225. data/ext/nokogiri/xml_libxml2_hacks.c +0 -112
  226. data/ext/nokogiri/xml_libxml2_hacks.h +0 -12
  227. data/ext/nokogiri/xml_namespace.h +0 -14
  228. data/ext/nokogiri/xml_node.h +0 -13
  229. data/ext/nokogiri/xml_node_set.h +0 -12
  230. data/ext/nokogiri/xml_processing_instruction.h +0 -9
  231. data/ext/nokogiri/xml_reader.h +0 -10
  232. data/ext/nokogiri/xml_relax_ng.h +0 -9
  233. data/ext/nokogiri/xml_sax_parser.h +0 -39
  234. data/ext/nokogiri/xml_sax_parser_context.h +0 -10
  235. data/ext/nokogiri/xml_sax_push_parser.h +0 -9
  236. data/ext/nokogiri/xml_schema.h +0 -9
  237. data/ext/nokogiri/xml_syntax_error.h +0 -13
  238. data/ext/nokogiri/xml_text.h +0 -9
  239. data/ext/nokogiri/xml_xpath_context.h +0 -10
  240. data/ext/nokogiri/xslt_stylesheet.h +0 -14
  241. data/lib/nokogiri/html/document.rb +0 -335
  242. data/lib/nokogiri/html/document_fragment.rb +0 -49
  243. data/lib/nokogiri/html/element_description_defaults.rb +0 -671
  244. data/lib/nokogiri/html/sax/parser_context.rb +0 -16
  245. data/patches/libxml2/0001-Revert-Do-not-URI-escape-in-server-side-includes.patch +0 -78
  246. data/patches/libxml2/0002-Remove-script-macro-support.patch +0 -40
  247. data/patches/libxml2/0003-Update-entities-to-remove-handling-of-ssi.patch +0 -44
  248. data/patches/libxml2/0004-libxml2.la-is-in-top_builddir.patch +0 -25
  249. data/patches/libxml2/0005-Fix-infinite-loop-in-xmlStringLenDecodeEntities.patch +0 -32
  250. data/ports/archives/libxml2-2.9.10.tar.gz +0 -0
  251. data/ports/archives/libxslt-1.1.34.tar.gz +0 -0
@@ -1,335 +0,0 @@
1
- module Nokogiri
2
- module HTML
3
- class Document < Nokogiri::XML::Document
4
- ###
5
- # Get the meta tag encoding for this document. If there is no meta tag,
6
- # then nil is returned.
7
- def meta_encoding
8
- case
9
- when meta = at('//meta[@charset]')
10
- meta[:charset]
11
- when meta = meta_content_type
12
- meta['content'][/charset\s*=\s*([\w-]+)/i, 1]
13
- end
14
- end
15
-
16
- ###
17
- # Set the meta tag encoding for this document.
18
- #
19
- # If an meta encoding tag is already present, its content is
20
- # replaced with the given text.
21
- #
22
- # Otherwise, this method tries to create one at an appropriate
23
- # place supplying head and/or html elements as necessary, which
24
- # is inside a head element if any, and before any text node or
25
- # content element (typically <body>) if any.
26
- #
27
- # The result when trying to set an encoding that is different
28
- # from the document encoding is undefined.
29
- #
30
- # Beware in CRuby, that libxml2 automatically inserts a meta tag
31
- # into a head element.
32
- def meta_encoding= encoding
33
- case
34
- when meta = meta_content_type
35
- meta['content'] = 'text/html; charset=%s' % encoding
36
- encoding
37
- when meta = at('//meta[@charset]')
38
- meta['charset'] = encoding
39
- else
40
- meta = XML::Node.new('meta', self)
41
- if dtd = internal_subset and dtd.html5_dtd?
42
- meta['charset'] = encoding
43
- else
44
- meta['http-equiv'] = 'Content-Type'
45
- meta['content'] = 'text/html; charset=%s' % encoding
46
- end
47
-
48
- case
49
- when head = at('//head')
50
- head.prepend_child(meta)
51
- else
52
- set_metadata_element(meta)
53
- end
54
- encoding
55
- end
56
- end
57
-
58
- def meta_content_type
59
- xpath('//meta[@http-equiv and boolean(@content)]').find { |node|
60
- node['http-equiv'] =~ /\AContent-Type\z/i
61
- }
62
- end
63
- private :meta_content_type
64
-
65
- ###
66
- # Get the title string of this document. Return nil if there is
67
- # no title tag.
68
- def title
69
- title = at('//title') and title.inner_text
70
- end
71
-
72
- ###
73
- # Set the title string of this document.
74
- #
75
- # If a title element is already present, its content is replaced
76
- # with the given text.
77
- #
78
- # Otherwise, this method tries to create one at an appropriate
79
- # place supplying head and/or html elements as necessary, which
80
- # is inside a head element if any, right after a meta
81
- # encoding/charset tag if any, and before any text node or
82
- # content element (typically <body>) if any.
83
- def title=(text)
84
- tnode = XML::Text.new(text, self)
85
- if title = at('//title')
86
- title.children = tnode
87
- return text
88
- end
89
-
90
- title = XML::Node.new('title', self) << tnode
91
- case
92
- when head = at('//head')
93
- head << title
94
- when meta = at('//meta[@charset]') || meta_content_type
95
- # better put after charset declaration
96
- meta.add_next_sibling(title)
97
- else
98
- set_metadata_element(title)
99
- end
100
- text
101
- end
102
-
103
- def set_metadata_element(element)
104
- case
105
- when head = at('//head')
106
- head << element
107
- when html = at('//html')
108
- head = html.prepend_child(XML::Node.new('head', self))
109
- head.prepend_child(element)
110
- when first = children.find { |node|
111
- case node
112
- when XML::Element, XML::Text
113
- true
114
- end
115
- }
116
- # We reach here only if the underlying document model
117
- # allows <html>/<head> elements to be omitted and does not
118
- # automatically supply them.
119
- first.add_previous_sibling(element)
120
- else
121
- html = add_child(XML::Node.new('html', self))
122
- head = html.add_child(XML::Node.new('head', self))
123
- head.prepend_child(element)
124
- end
125
- end
126
- private :set_metadata_element
127
-
128
- ####
129
- # Serialize Node using +options+. Save options can also be set using a
130
- # block. See SaveOptions.
131
- #
132
- # These two statements are equivalent:
133
- #
134
- # node.serialize(:encoding => 'UTF-8', :save_with => FORMAT | AS_XML)
135
- #
136
- # or
137
- #
138
- # node.serialize(:encoding => 'UTF-8') do |config|
139
- # config.format.as_xml
140
- # end
141
- #
142
- def serialize options = {}
143
- options[:save_with] ||= XML::Node::SaveOptions::DEFAULT_HTML
144
- super
145
- end
146
-
147
- ####
148
- # Create a Nokogiri::XML::DocumentFragment from +tags+
149
- def fragment tags = nil
150
- DocumentFragment.new(self, tags, self.root)
151
- end
152
-
153
- class << self
154
- ###
155
- # Parse HTML. +string_or_io+ may be a String, or any object that
156
- # responds to _read_ and _close_ such as an IO, or StringIO.
157
- # +url+ is resource where this document is located. +encoding+ is the
158
- # encoding that should be used when processing the document. +options+
159
- # is a number that sets options in the parser, such as
160
- # Nokogiri::XML::ParseOptions::RECOVER. See the constants in
161
- # Nokogiri::XML::ParseOptions.
162
- def parse string_or_io, url = nil, encoding = nil, options = XML::ParseOptions::DEFAULT_HTML
163
-
164
- options = Nokogiri::XML::ParseOptions.new(options) if Integer === options
165
- # Give the options to the user
166
- yield options if block_given?
167
-
168
- if string_or_io.respond_to?(:encoding)
169
- unless string_or_io.encoding.name == "ASCII-8BIT"
170
- encoding ||= string_or_io.encoding.name
171
- end
172
- end
173
-
174
- if string_or_io.respond_to?(:read)
175
- url ||= string_or_io.respond_to?(:path) ? string_or_io.path : nil
176
- unless encoding
177
- # Libxml2's parser has poor support for encoding
178
- # detection. First, it does not recognize the HTML5
179
- # style meta charset declaration. Secondly, even if it
180
- # successfully detects an encoding hint, it does not
181
- # re-decode or re-parse the preceding part which may be
182
- # garbled.
183
- #
184
- # EncodingReader aims to perform advanced encoding
185
- # detection beyond what Libxml2 does, and to emulate
186
- # rewinding of a stream and make Libxml2 redo parsing
187
- # from the start when an encoding hint is found.
188
- string_or_io = EncodingReader.new(string_or_io)
189
- begin
190
- return read_io(string_or_io, url, encoding, options.to_i)
191
- rescue EncodingFound => e
192
- encoding = e.found_encoding
193
- end
194
- end
195
- return read_io(string_or_io, url, encoding, options.to_i)
196
- end
197
-
198
- # read_memory pukes on empty docs
199
- if string_or_io.nil? or string_or_io.empty?
200
- return encoding ? new.tap { |i| i.encoding = encoding } : new
201
- end
202
-
203
- encoding ||= EncodingReader.detect_encoding(string_or_io)
204
-
205
- read_memory(string_or_io, url, encoding, options.to_i)
206
- end
207
- end
208
-
209
- class EncodingFound < StandardError # :nodoc:
210
- attr_reader :found_encoding
211
-
212
- def initialize(encoding)
213
- @found_encoding = encoding
214
- super("encoding found: %s" % encoding)
215
- end
216
- end
217
-
218
- class EncodingReader # :nodoc:
219
- class SAXHandler < Nokogiri::XML::SAX::Document # :nodoc:
220
- attr_reader :encoding
221
-
222
- def initialize
223
- @encoding = nil
224
- super()
225
- end
226
-
227
- def start_element(name, attrs = [])
228
- return unless name == 'meta'
229
- attr = Hash[attrs]
230
- charset = attr['charset'] and
231
- @encoding = charset
232
- http_equiv = attr['http-equiv'] and
233
- http_equiv.match(/\AContent-Type\z/i) and
234
- content = attr['content'] and
235
- m = content.match(/;\s*charset\s*=\s*([\w-]+)/) and
236
- @encoding = m[1]
237
- end
238
- end
239
-
240
- class JumpSAXHandler < SAXHandler
241
- def initialize(jumptag)
242
- @jumptag = jumptag
243
- super()
244
- end
245
-
246
- def start_element(name, attrs = [])
247
- super
248
- throw @jumptag, @encoding if @encoding
249
- throw @jumptag, nil if name =~ /\A(?:div|h1|img|p|br)\z/
250
- end
251
- end
252
-
253
- def self.detect_encoding(chunk)
254
- if Nokogiri.jruby? && EncodingReader.is_jruby_without_fix?
255
- return EncodingReader.detect_encoding_for_jruby_without_fix(chunk)
256
- end
257
- m = chunk.match(/\A(<\?xml[ \t\r\n]+[^>]*>)/) and
258
- return Nokogiri.XML(m[1]).encoding
259
-
260
- if Nokogiri.jruby?
261
- m = chunk.match(/(<meta\s)(.*)(charset\s*=\s*([\w-]+))(.*)/i) and
262
- return m[4]
263
- catch(:encoding_found) {
264
- Nokogiri::HTML::SAX::Parser.new(JumpSAXHandler.new(:encoding_found)).parse(chunk)
265
- nil
266
- }
267
- else
268
- handler = SAXHandler.new
269
- parser = Nokogiri::HTML::SAX::PushParser.new(handler)
270
- parser << chunk rescue Nokogiri::SyntaxError
271
- handler.encoding
272
- end
273
- end
274
-
275
- def self.is_jruby_without_fix?
276
- JRUBY_VERSION.split('.').join.to_i < 165
277
- end
278
-
279
- def self.detect_encoding_for_jruby_without_fix(chunk)
280
- m = chunk.match(/\A(<\?xml[ \t\r\n]+[^>]*>)/) and
281
- return Nokogiri.XML(m[1]).encoding
282
-
283
- m = chunk.match(/(<meta\s)(.*)(charset\s*=\s*([\w-]+))(.*)/i) and
284
- return m[4]
285
-
286
- catch(:encoding_found) {
287
- Nokogiri::HTML::SAX::Parser.new(JumpSAXHandler.new(:encoding_found.to_s)).parse(chunk)
288
- nil
289
- }
290
- rescue Nokogiri::SyntaxError, RuntimeError
291
- # Ignore parser errors that nokogiri may raise
292
- nil
293
- end
294
-
295
- def initialize(io)
296
- @io = io
297
- @firstchunk = nil
298
- @encoding_found = nil
299
- end
300
-
301
- # This method is used by the C extension so that
302
- # Nokogiri::HTML::Document#read_io() does not leak memory when
303
- # EncodingFound is raised.
304
- attr_reader :encoding_found
305
-
306
- def read(len)
307
- # no support for a call without len
308
-
309
- if !@firstchunk
310
- @firstchunk = @io.read(len) or return nil
311
-
312
- # This implementation expects that the first call from
313
- # htmlReadIO() is made with a length long enough (~1KB) to
314
- # achieve advanced encoding detection.
315
- if encoding = EncodingReader.detect_encoding(@firstchunk)
316
- # The first chunk is stored for the next read in retry.
317
- raise @encoding_found = EncodingFound.new(encoding)
318
- end
319
- end
320
- @encoding_found = nil
321
-
322
- ret = @firstchunk.slice!(0, len)
323
- if (len -= ret.length) > 0
324
- rest = @io.read(len) and ret << rest
325
- end
326
- if ret.empty?
327
- nil
328
- else
329
- ret
330
- end
331
- end
332
- end
333
- end
334
- end
335
- end
@@ -1,49 +0,0 @@
1
- module Nokogiri
2
- module HTML
3
- class DocumentFragment < Nokogiri::XML::DocumentFragment
4
- ####
5
- # Create a Nokogiri::XML::DocumentFragment from +tags+, using +encoding+
6
- def self.parse tags, encoding = nil
7
- doc = HTML::Document.new
8
-
9
- encoding ||= if tags.respond_to?(:encoding)
10
- encoding = tags.encoding
11
- if encoding == ::Encoding::ASCII_8BIT
12
- 'UTF-8'
13
- else
14
- encoding.name
15
- end
16
- else
17
- 'UTF-8'
18
- end
19
-
20
- doc.encoding = encoding
21
-
22
- new(doc, tags)
23
- end
24
-
25
- def initialize document, tags = nil, ctx = nil
26
- return self unless tags
27
-
28
- if ctx
29
- preexisting_errors = document.errors.dup
30
- node_set = ctx.parse("<div>#{tags}</div>")
31
- node_set.first.children.each { |child| child.parent = self } unless node_set.empty?
32
- self.errors = document.errors - preexisting_errors
33
- else
34
- # This is a horrible hack, but I don't care
35
- if tags.strip =~ /^<body/i
36
- path = "/html/body"
37
- else
38
- path = "/html/body/node()"
39
- end
40
-
41
- temp_doc = HTML::Document.parse "<html><body>#{tags}", nil, document.encoding
42
- temp_doc.xpath(path).each { |child| child.parent = self }
43
- self.errors = temp_doc.errors
44
- end
45
- children
46
- end
47
- end
48
- end
49
- end