nokogiri 1.5.0.beta.3 → 1.5.0.beta.4
Sign up to get free protection for your applications and to get access to all the features.
Potentially problematic release.
This version of nokogiri might be problematic. Click here for more details.
- data/CHANGELOG.ja.rdoc +32 -16
- data/CHANGELOG.rdoc +18 -0
- data/Manifest.txt +1 -2
- data/README.rdoc +27 -3
- data/Rakefile +39 -83
- data/ext/java/nokogiri/EncodingHandler.java +1 -1
- data/ext/java/nokogiri/HtmlDocument.java +11 -14
- data/ext/java/nokogiri/HtmlElementDescription.java +1 -1
- data/ext/java/nokogiri/HtmlEntityLookup.java +1 -1
- data/ext/java/nokogiri/HtmlSaxParserContext.java +13 -10
- data/ext/java/nokogiri/NokogiriService.java +103 -34
- data/ext/java/nokogiri/XmlAttr.java +14 -6
- data/ext/java/nokogiri/XmlAttributeDecl.java +1 -1
- data/ext/java/nokogiri/XmlCdata.java +3 -1
- data/ext/java/nokogiri/XmlComment.java +3 -1
- data/ext/java/nokogiri/XmlDocument.java +29 -8
- data/ext/java/nokogiri/XmlDocumentFragment.java +14 -13
- data/ext/java/nokogiri/XmlDtd.java +5 -2
- data/ext/java/nokogiri/XmlElement.java +2 -1
- data/ext/java/nokogiri/XmlElementContent.java +1 -1
- data/ext/java/nokogiri/XmlElementDecl.java +2 -1
- data/ext/java/nokogiri/XmlEntityDecl.java +2 -1
- data/ext/java/nokogiri/XmlEntityReference.java +1 -1
- data/ext/java/nokogiri/XmlNamespace.java +3 -2
- data/ext/java/nokogiri/XmlNode.java +17 -10
- data/ext/java/nokogiri/XmlNodeSet.java +40 -13
- data/ext/java/nokogiri/XmlProcessingInstruction.java +1 -1
- data/ext/java/nokogiri/XmlReader.java +3 -1
- data/ext/java/nokogiri/XmlRelaxng.java +37 -92
- data/ext/java/nokogiri/XmlSaxParserContext.java +25 -11
- data/ext/java/nokogiri/XmlSaxPushParser.java +6 -4
- data/ext/java/nokogiri/XmlSchema.java +190 -46
- data/ext/java/nokogiri/XmlSyntaxError.java +42 -37
- data/ext/java/nokogiri/XmlText.java +3 -2
- data/ext/java/nokogiri/XmlXpathContext.java +8 -4
- data/ext/java/nokogiri/XsltStylesheet.java +12 -10
- data/ext/java/nokogiri/internals/HtmlDomParserContext.java +8 -7
- data/ext/java/nokogiri/internals/NokogiriDocumentCache.java +1 -1
- data/ext/java/nokogiri/internals/NokogiriErrorHandler.java +11 -5
- data/ext/java/nokogiri/internals/NokogiriHandler.java +36 -9
- data/ext/java/nokogiri/internals/NokogiriHelpers.java +21 -22
- data/ext/java/nokogiri/internals/NokogiriNamespaceCache.java +5 -4
- data/ext/java/nokogiri/internals/NokogiriNamespaceContext.java +1 -1
- data/ext/java/nokogiri/internals/NokogiriNonStrictErrorHandler.java +2 -1
- data/ext/java/nokogiri/internals/NokogiriNonStrictErrorHandler4NekoHtml.java +1 -1
- data/ext/java/nokogiri/internals/NokogiriStrictErrorHandler.java +2 -1
- data/ext/java/nokogiri/internals/NokogiriXPathFunction.java +15 -9
- data/ext/java/nokogiri/internals/NokogiriXPathFunctionResolver.java +1 -1
- data/ext/java/nokogiri/internals/ParserContext.java +18 -7
- data/ext/java/nokogiri/internals/PushInputStream.java +1 -1
- data/ext/java/nokogiri/internals/ReaderNode.java +7 -6
- data/ext/java/nokogiri/internals/SaveContext.java +16 -10
- data/ext/java/nokogiri/internals/SchemaErrorHandler.java +13 -5
- data/ext/java/nokogiri/internals/XmlDeclHandler.java +1 -1
- data/ext/java/nokogiri/internals/XmlDomParser.java +1 -1
- data/ext/java/nokogiri/internals/XmlDomParserContext.java +13 -8
- data/ext/java/nokogiri/internals/XmlSaxParser.java +1 -1
- data/ext/java/nokogiri/internals/XsltExtensionFunction.java +1 -1
- data/ext/nokogiri/extconf.rb +3 -3
- data/ext/nokogiri/xml_document.c +9 -0
- data/ext/nokogiri/xml_sax_parser.c +4 -2
- data/lib/nokogiri.rb +9 -6
- data/lib/nokogiri/css.rb +1 -3
- data/lib/nokogiri/css/parser.rb +665 -70
- data/lib/nokogiri/css/parser.y +3 -1
- data/lib/nokogiri/css/parser_extras.rb +91 -0
- data/lib/nokogiri/css/tokenizer.rb +148 -3
- data/lib/nokogiri/css/tokenizer.rex +1 -1
- data/lib/nokogiri/html/document.rb +138 -11
- data/lib/nokogiri/html/sax/parser.rb +6 -2
- data/lib/nokogiri/version.rb +1 -1
- data/lib/nokogiri/xml/node.rb +2 -2
- data/lib/nokogiri/xml/node/save_options.rb +3 -0
- data/lib/nokogiri/xml/node_set.rb +1 -1
- data/test/css/test_tokenizer.rb +8 -0
- data/test/helper.rb +2 -0
- data/test/html/sax/test_parser.rb +43 -0
- data/test/html/test_document.rb +59 -0
- data/test/html/test_document_encoding.rb +48 -0
- data/test/html/test_element_description.rb +1 -1
- data/test/xml/sax/test_parser.rb +16 -0
- data/test/xml/test_document.rb +3 -1
- data/test/xml/test_node.rb +4 -1
- data/test/xml/test_node_set.rb +10 -0
- metadata +90 -107
- data/lib/nokogiri/css/generated_parser.rb +0 -676
- data/lib/nokogiri/css/generated_tokenizer.rb +0 -145
data/lib/nokogiri/css/parser.y
CHANGED
@@ -1,4 +1,4 @@
|
|
1
|
-
class Nokogiri::CSS::
|
1
|
+
class Nokogiri::CSS::Parser
|
2
2
|
|
3
3
|
token FUNCTION INCLUDES DASHMATCH LBRACE HASH PLUS GREATER S STRING IDENT
|
4
4
|
token COMMA NUMBER PREFIXMATCH SUFFIXMATCH SUBSTRINGMATCH TILDE NOT_EQUAL
|
@@ -233,3 +233,5 @@ end
|
|
233
233
|
|
234
234
|
---- header
|
235
235
|
|
236
|
+
require 'nokogiri/css/parser_extras'
|
237
|
+
|
@@ -0,0 +1,91 @@
|
|
1
|
+
require 'thread'
|
2
|
+
|
3
|
+
module Nokogiri
  module CSS
    ###
    # Hand-written additions to the Racc-based CSS selector parser: a
    # class-level cache of previously computed results and the glue that
    # feeds tokens from a Tokenizer into the parser.
    class Parser < Racc::Parser
      # Class-level state. @mutex guards reads and writes of @cache;
      # the @cache_on flag itself is read and written without locking.
      @cache_on = true
      @cache    = {}
      @mutex    = Mutex.new

      class << self
        # Turn on CSS parse caching
        attr_accessor :cache_on
        alias :cache_on? :cache_on
        alias :set_cache :cache_on=

        # Get the css selector in +string+ from the cache.
        # Returns nil when caching is switched off.
        def [] string
          return unless @cache_on
          @mutex.synchronize { @cache[string] }
        end

        # Set the css selector in +string+ in the cache to +value+.
        # When caching is switched off nothing is stored and +value+ is
        # returned untouched.
        def []= string, value
          return value unless @cache_on
          @mutex.synchronize { @cache[string] = value }
        end

        # Clear the cache
        def clear_cache
          @mutex.synchronize { @cache = {} }
        end

        # Execute +block+ without cache and return the block's result.
        #
        # The previous caching flag is restored in an +ensure+ clause so
        # that an exception raised by +block+ cannot leave caching
        # switched off for the rest of the process.
        def without_cache &block
          tmp = @cache_on
          @cache_on = false
          block.call
        ensure
          @cache_on = tmp
        end

        ###
        # Parse this CSS selector in +selector+.  Returns an AST.
        #
        # Deprecated entry point: prints a warning to $stderr the first
        # time it is called, then delegates to a fresh parser instance.
        def parse selector
          @warned ||= false
          unless @warned
            $stderr.puts('Nokogiri::CSS::Parser.parse is deprecated, call Nokogiri::CSS.parse(), this will be removed August 1st or version 1.4.0 (whichever is first)')
            @warned = true
          end
          new.parse selector
        end
      end

      # Create a new CSS parser with respect to +namespaces+
      def initialize namespaces = {}
        @tokenizer  = Tokenizer.new
        @namespaces = namespaces
        super()
      end

      # Load +string+ into the tokenizer and run the generated parser
      # over it.
      def parse string
        @tokenizer.scan_setup string
        do_parse
      end

      # Token source for the parser: hands out the tokenizer's next token.
      def next_token
        @tokenizer.next_token
      end

      # Get the xpath for +string+ using +options+
      #
      # Recognised options are :prefix (defaults to '//'), :visitor
      # (defaults to a new XPathVisitor) and :ns.  Results are cached at
      # class level under a key built from +string+, :ns and :prefix.
      def xpath_for string, options={}
        key = "#{string}#{options[:ns]}#{options[:prefix]}"
        v = self.class[key]
        return v if v

        args = [
          options[:prefix] || '//',
          options[:visitor] || XPathVisitor.new
        ]
        self.class[key] = parse(string).map { |ast|
          ast.to_xpath(*args)
        }
      end

      # On CSS parser error, raise an exception naming the unexpected
      # value and the last non-nil entry of the value stack.
      def on_error error_token_id, error_value, value_stack
        after = value_stack.compact.last
        raise SyntaxError.new("unexpected '#{error_value}' after '#{after}'")
      end
    end
  end
end
|
@@ -1,7 +1,152 @@
|
|
1
|
+
#--
|
2
|
+
# DO NOT MODIFY!!!!
|
3
|
+
# This file is automatically generated by rex 1.0.5
|
4
|
+
# from lexical definition file "lib/nokogiri/css/tokenizer.rex".
|
5
|
+
#++
|
6
|
+
|
1
7
|
module Nokogiri
|
2
|
-
|
3
|
-
|
4
|
-
|
8
|
+
module CSS
|
9
|
+
class Tokenizer
|
10
|
+
require 'strscan'
|
11
|
+
|
12
|
+
class ScanError < StandardError ; end
|
13
|
+
|
14
|
+
attr_reader :lineno
|
15
|
+
attr_reader :filename
|
16
|
+
attr_accessor :state
|
17
|
+
|
18
|
+
def scan_setup(str)
|
19
|
+
@ss = StringScanner.new(str)
|
20
|
+
@lineno = 1
|
21
|
+
@state = nil
|
22
|
+
end
|
23
|
+
|
24
|
+
def action
|
25
|
+
yield
|
26
|
+
end
|
27
|
+
|
28
|
+
def scan_str(str)
|
29
|
+
scan_setup(str)
|
30
|
+
do_parse
|
31
|
+
end
|
32
|
+
alias :scan :scan_str
|
33
|
+
|
34
|
+
def load_file( filename )
|
35
|
+
@filename = filename
|
36
|
+
open(filename, "r") do |f|
|
37
|
+
scan_setup(f.read)
|
5
38
|
end
|
6
39
|
end
|
40
|
+
|
41
|
+
def scan_file( filename )
|
42
|
+
load_file(filename)
|
43
|
+
do_parse
|
44
|
+
end
|
45
|
+
|
46
|
+
|
47
|
+
def next_token
|
48
|
+
return if @ss.eos?
|
49
|
+
|
50
|
+
# skips empty actions
|
51
|
+
until token = _next_token or @ss.eos?; end
|
52
|
+
token
|
53
|
+
end
|
54
|
+
|
55
|
+
def _next_token
|
56
|
+
text = @ss.peek(1)
|
57
|
+
@lineno += 1 if text == "\n"
|
58
|
+
token = case @state
|
59
|
+
when nil
|
60
|
+
case
|
61
|
+
when (text = @ss.scan(/has\([\s]*/))
|
62
|
+
action { [:HAS, text] }
|
63
|
+
|
64
|
+
when (text = @ss.scan(/[-@]?([_A-Za-z]|[^\0-\177]|\\[0-9A-Fa-f]{1,6}(\r\n|[\s])?|\\[^\n\r\f0-9A-Fa-f])([_A-Za-z0-9-]|[^\0-\177]|\\[0-9A-Fa-f]{1,6}(\r\n|[\s])?|\\[^\n\r\f0-9A-Fa-f])*\([\s]*/))
|
65
|
+
action { [:FUNCTION, text] }
|
66
|
+
|
67
|
+
when (text = @ss.scan(/[-@]?([_A-Za-z]|[^\0-\177]|\\[0-9A-Fa-f]{1,6}(\r\n|[\s])?|\\[^\n\r\f0-9A-Fa-f])([_A-Za-z0-9-]|[^\0-\177]|\\[0-9A-Fa-f]{1,6}(\r\n|[\s])?|\\[^\n\r\f0-9A-Fa-f])*/))
|
68
|
+
action { [:IDENT, text] }
|
69
|
+
|
70
|
+
when (text = @ss.scan(/\#([_A-Za-z0-9-]|[^\0-\177]|\\[0-9A-Fa-f]{1,6}(\r\n|[\s])?|\\[^\n\r\f0-9A-Fa-f])+/))
|
71
|
+
action { [:HASH, text] }
|
72
|
+
|
73
|
+
when (text = @ss.scan(/[\s]*~=[\s]*/))
|
74
|
+
action { [:INCLUDES, text] }
|
75
|
+
|
76
|
+
when (text = @ss.scan(/[\s]*\|=[\s]*/))
|
77
|
+
action { [:DASHMATCH, text] }
|
78
|
+
|
79
|
+
when (text = @ss.scan(/[\s]*\^=[\s]*/))
|
80
|
+
action { [:PREFIXMATCH, text] }
|
81
|
+
|
82
|
+
when (text = @ss.scan(/[\s]*\$=[\s]*/))
|
83
|
+
action { [:SUFFIXMATCH, text] }
|
84
|
+
|
85
|
+
when (text = @ss.scan(/[\s]*\*=[\s]*/))
|
86
|
+
action { [:SUBSTRINGMATCH, text] }
|
87
|
+
|
88
|
+
when (text = @ss.scan(/[\s]*!=[\s]*/))
|
89
|
+
action { [:NOT_EQUAL, text] }
|
90
|
+
|
91
|
+
when (text = @ss.scan(/[\s]*=[\s]*/))
|
92
|
+
action { [:EQUAL, text] }
|
93
|
+
|
94
|
+
when (text = @ss.scan(/[\s]*\)/))
|
95
|
+
action { [:RPAREN, text] }
|
96
|
+
|
97
|
+
when (text = @ss.scan(/[\s]*\[[\s]*/))
|
98
|
+
action { [:LSQUARE, text] }
|
99
|
+
|
100
|
+
when (text = @ss.scan(/[\s]*\]/))
|
101
|
+
action { [:RSQUARE, text] }
|
102
|
+
|
103
|
+
when (text = @ss.scan(/[\s]*\+[\s]*/))
|
104
|
+
action { [:PLUS, text] }
|
105
|
+
|
106
|
+
when (text = @ss.scan(/[\s]*>[\s]*/))
|
107
|
+
action { [:GREATER, text] }
|
108
|
+
|
109
|
+
when (text = @ss.scan(/[\s]*,[\s]*/))
|
110
|
+
action { [:COMMA, text] }
|
111
|
+
|
112
|
+
when (text = @ss.scan(/[\s]*~[\s]*/))
|
113
|
+
action { [:TILDE, text] }
|
114
|
+
|
115
|
+
when (text = @ss.scan(/\:not\([\s]*/))
|
116
|
+
action { [:NOT, text] }
|
117
|
+
|
118
|
+
when (text = @ss.scan(/-?([0-9]+|[0-9]*\.[0-9]+)/))
|
119
|
+
action { [:NUMBER, text] }
|
120
|
+
|
121
|
+
when (text = @ss.scan(/[\s]*\/\/[\s]*/))
|
122
|
+
action { [:DOUBLESLASH, text] }
|
123
|
+
|
124
|
+
when (text = @ss.scan(/[\s]*\/[\s]*/))
|
125
|
+
action { [:SLASH, text] }
|
126
|
+
|
127
|
+
when (text = @ss.scan(/U\+[0-9a-f?]{1,6}(-[0-9a-f]{1,6})?/))
|
128
|
+
action {[:UNICODE_RANGE, text] }
|
129
|
+
|
130
|
+
when (text = @ss.scan(/[\s]+/))
|
131
|
+
action { [:S, text] }
|
132
|
+
|
133
|
+
when (text = @ss.scan(/"([^\n\r\f"]|\n|\r\n|\r|\f|[^\0-\177]|\\[0-9A-Fa-f]{1,6}(\r\n|[\s])?|\\[^\n\r\f0-9A-Fa-f])*"|'([^\n\r\f']|\n|\r\n|\r|\f|[^\0-\177]|\\[0-9A-Fa-f]{1,6}(\r\n|[\s])?|\\[^\n\r\f0-9A-Fa-f])*'/))
|
134
|
+
action { [:STRING, text] }
|
135
|
+
|
136
|
+
when (text = @ss.scan(/./))
|
137
|
+
action { [text, text] }
|
138
|
+
|
139
|
+
else
|
140
|
+
text = @ss.string[@ss.pos .. -1]
|
141
|
+
raise ScanError, "can not match: '" + text + "'"
|
142
|
+
end # if
|
143
|
+
|
144
|
+
else
|
145
|
+
raise ScanError, "undefined state: '" + state.to_s + "'"
|
146
|
+
end # case state
|
147
|
+
token
|
148
|
+
end # def _next_token
|
149
|
+
|
150
|
+
end # class
|
151
|
+
end
|
7
152
|
end
|
@@ -3,25 +3,44 @@ module Nokogiri
|
|
3
3
|
class Document < Nokogiri::XML::Document
|
4
4
|
###
# Get the meta tag encoding for this document.  If there is no meta tag,
# or the tag carries no charset declaration, then nil is returned.
def meta_encoding
  meta = meta_content_type or return nil
  content = meta['content'] or return nil
  # String#[] with a capture index yields nil when the pattern does not
  # match, rather than raising on a nil MatchData.
  content[/charset\s*=\s*([\w-]+)/i, 1]
end
|
14
11
|
|
15
12
|
###
# Set the meta tag encoding for this document.  If there is no meta
# content tag, the encoding is not set.
def meta_encoding=(encoding)
  content_type_tag = meta_content_type
  return nil unless content_type_tag

  content_type_tag['content'] = "text/html; charset=%s" % encoding
end
|
19
|
+
|
20
|
+
# Search the <meta> elements that have an http-equiv attribute and
# return the first one whose value is "Content-Type", compared
# case-insensitively.
def meta_content_type
  candidates = css('meta[@http-equiv]')
  candidates.find do |meta|
    meta['http-equiv'] =~ /\AContent-Type\z/i
  end
end
private :meta_content_type
|
22
26
|
|
23
|
-
|
24
|
-
|
27
|
+
###
# Get the title string of this document.  Return nil if there is
# no title tag.
def title
  node = at('title')
  node && node.inner_text
end
|
33
|
+
|
34
|
+
###
# Set the title string of this document.  If there is no head
# element, the title is not set.
def title=(text)
  node = at('title')
  unless node
    # No <title> yet: create one under <head>, or give up when the
    # document has no <head> either.
    head = at('head')
    return nil unless head

    node = Nokogiri::XML::Node.new('title', self)
    head << node
  end
  node.children = XML::Text.new(text, self)
end
|
26
45
|
|
27
46
|
####
|
@@ -75,16 +94,124 @@ module Nokogiri
|
|
75
94
|
|
76
95
|
if string_or_io.respond_to?(:read)
|
77
96
|
url ||= string_or_io.respond_to?(:path) ? string_or_io.path : nil
|
97
|
+
if !encoding
|
98
|
+
# Perform further encoding detection that libxml2 does
|
99
|
+
# not do.
|
100
|
+
string_or_io = EncodingReader.new(string_or_io)
|
101
|
+
begin
|
102
|
+
return read_io(string_or_io, url, encoding, options.to_i)
|
103
|
+
rescue EncodingFoundException => e
|
104
|
+
# A retry is required because libxml2 has a problem in
|
105
|
+
# that it cannot switch encoding well in the middle of
|
106
|
+
# parsing, especially if it has already seen a
|
107
|
+
# non-ASCII character when it finds an encoding hint.
|
108
|
+
encoding = e.encoding
|
109
|
+
end
|
110
|
+
end
|
78
111
|
return read_io(string_or_io, url, encoding, options.to_i)
|
79
112
|
end
|
80
113
|
|
81
114
|
# read_memory pukes on empty docs
|
82
115
|
return new if string_or_io.nil? or string_or_io.empty?
|
83
116
|
|
117
|
+
if !encoding
|
118
|
+
encoding = EncodingReader.detect_encoding(string_or_io)
|
119
|
+
end
|
120
|
+
|
84
121
|
read_memory(string_or_io, url, encoding, options.to_i)
|
85
122
|
end
|
86
123
|
end
|
87
124
|
|
125
|
+
# Raised by EncodingReader#read to report a detected encoding; the
# detected value is exposed through #encoding.
class EncodingFoundException < Exception # :nodoc:
  attr_reader :encoding

  def initialize(encoding)
    super("encoding found: %s" % encoding)
    @encoding = encoding
  end
end
|
133
|
+
|
134
|
+
# IO wrapper that inspects the first chunk read from the underlying
# stream for an encoding declaration before any data is handed on.
class EncodingReader # :nodoc:
  # SAX handler that stops parsing (via throw :found) as soon as the
  # encoding question is settled one way or the other.
  class SAXHandler < Nokogiri::XML::SAX::Document # :nodoc:
    attr_reader :encoding

    # Record +encoding+ (possibly nil) and unwind to the enclosing
    # catch(:found).
    def found(encoding)
      @encoding = encoding
      throw :found
    end

    # Give up on detection.  The argument is unused; it is optional so
    # that the argument-less call in #start_element no longer raises
    # ArgumentError.
    def not_found(encoding = nil)
      found nil
    end

    # Stop searching once one of the listed body-level elements starts;
    # on <meta http-equiv="Content-Type"> extract the charset from the
    # content attribute.
    def start_element(name, attrs = [])
      case name
      when /\A(?:div|h1|img|p|br)\z/
        not_found
      when 'meta'
        attr = Hash[attrs]
        http_equiv = attr['http-equiv'] and
          http_equiv.match(/\AContent-Type\z/i) and
          content = attr['content'] and
          m = content.match(/;\s*charset\s*=\s*([\w-]+)/) and
          found m[1]
      end
    end
  end

  # Return the encoding declared in +chunk+, or nil.
  #
  # An XML declaration at the very start of the chunk takes precedence;
  # on JRuby a regexp scan for a meta charset is tried next; otherwise
  # the chunk is run through the HTML SAX parser with SAXHandler.
  # Detection is best effort: any StandardError yields nil.
  def self.detect_encoding(chunk)
    m = chunk.match(/\A(<\?xml[ \t\r\n]+[^>]*>)/) and
      return Nokogiri.XML(m[1]).encoding

    if Nokogiri.jruby?
      m = chunk.match(/(<meta\s)(.*)(charset\s*=\s*([\w-]+))(.*)/i) and
        return m[4]
    end

    handler = SAXHandler.new
    parser = Nokogiri::HTML::SAX::Parser.new(handler)
    catch(:found) {
      parser.parse(chunk)
    }
    handler.encoding
  rescue
    nil
  end

  def initialize(io)
    @io = io
    @firstchunk = nil
  end

  # Read up to +len+ bytes.  Returns nil once no data is left.
  #
  # On the first call the chunk is buffered and inspected; if an
  # encoding is detected EncodingFoundException is raised and the
  # buffered chunk is served again by the next call (the retry).
  def read(len)
    # no support for a call without len

    if !@firstchunk
      @firstchunk = @io.read(len) or return nil

      # This implementation expects and assumes that the first
      # call from htmlReadIO() is made with a length long enough
      # (~1KB) to achieve further encoding detection that
      # libxml2 does not do.
      if encoding = EncodingReader.detect_encoding(@firstchunk)
        # The chunk stays buffered for the next read in retry.
        raise EncodingFoundException, encoding
      end

      # No early return here: fall through so the buffered chunk is
      # consumed by slice! below.  Returning it directly would leave
      # the buffer full and the next call would serve the same bytes
      # a second time.
    end

    ret = @firstchunk.slice!(0, len)
    if (len -= ret.length) > 0
      rest = @io.read(len) and ret << rest
    end
    if ret.empty?
      nil
    else
      ret
    end
  end
end
|
88
215
|
end
|
89
216
|
end
|
90
217
|
end
|
@@ -31,7 +31,9 @@ module Nokogiri
|
|
31
31
|
# Parse the in-memory string +data+ using +encoding+.
#
# Raises ArgumentError when +data+ is nil or false and returns nil
# without parsing when its length is not greater than zero.  The
# parser context is yielded to the optional block before parsing.
def parse_memory(data, encoding = 'UTF-8')
  raise ArgumentError unless data
  return unless data.length > 0

  context = ParserContext.memory(data, encoding)
  yield context if block_given?
  context.parse_with(self)
end
|
36
38
|
|
37
39
|
###
|
@@ -40,7 +42,9 @@ module Nokogiri
|
|
40
42
|
raise ArgumentError unless filename
|
41
43
|
raise Errno::ENOENT unless File.exists?(filename)
|
42
44
|
raise Errno::EISDIR if File.directory?(filename)
|
43
|
-
ParserContext.file(filename, encoding)
|
45
|
+
ctx = ParserContext.file(filename, encoding)
|
46
|
+
yield ctx if block_given?
|
47
|
+
ctx.parse_with self
|
44
48
|
end
|
45
49
|
end
|
46
50
|
end
|