nokogiri 1.4.4.2-java → 1.4.5-java
Sign up to get free protection for your applications and to get access to all the features.
Potentially problematic release.
This version of nokogiri might be problematic. Click here for more details.
- data/.gemtest +0 -0
- data/CHANGELOG.ja.rdoc +16 -0
- data/CHANGELOG.rdoc +22 -0
- data/Manifest.txt +4 -3
- data/Rakefile +11 -36
- data/ext/nokogiri/xml_document.c +9 -0
- data/ext/nokogiri/xml_io.c +32 -7
- data/ext/nokogiri/xml_node.c +14 -13
- data/ext/nokogiri/xml_sax_parser.c +4 -2
- data/ext/nokogiri/xslt_stylesheet.c +9 -3
- data/lib/nokogiri/css.rb +6 -3
- data/lib/nokogiri/css/parser.rb +665 -70
- data/lib/nokogiri/css/parser.y +3 -1
- data/lib/nokogiri/css/parser_extras.rb +91 -0
- data/lib/nokogiri/css/tokenizer.rb +148 -3
- data/lib/nokogiri/css/tokenizer.rex +1 -1
- data/lib/nokogiri/ffi/structs/xml_attr.rb +2 -1
- data/lib/nokogiri/ffi/structs/xml_node_set.rb +1 -1
- data/lib/nokogiri/ffi/weak_bucket.rb +10 -10
- data/lib/nokogiri/ffi/xml/document.rb +8 -0
- data/lib/nokogiri/ffi/xml/node_set.rb +1 -0
- data/lib/nokogiri/ffi/xml/sax/parser.rb +9 -1
- data/lib/nokogiri/ffi/xslt/stylesheet.rb +4 -0
- data/lib/nokogiri/html/document.rb +134 -15
- data/lib/nokogiri/html/sax/parser.rb +6 -2
- data/lib/nokogiri/version.rb +6 -1
- data/lib/nokogiri/xml/node.rb +8 -23
- data/lib/nokogiri/xml/node/save_options.rb +10 -0
- data/lib/nokogiri/xml/node_set.rb +1 -1
- data/lib/nokogiri/xml/parse_options.rb +8 -0
- data/lib/nokogiri/xml/reader.rb +6 -6
- data/lib/nokogiri/xml/sax/document.rb +2 -2
- data/lib/nokogiri/xml/schema.rb +7 -1
- data/tasks/cross_compile.rb +1 -5
- data/test/css/test_tokenizer.rb +8 -0
- data/test/files/encoding.html +82 -0
- data/test/files/encoding.xhtml +84 -0
- data/test/helper.rb +2 -0
- data/test/html/sax/test_parser.rb +45 -0
- data/test/html/test_document.rb +55 -0
- data/test/html/test_document_encoding.rb +46 -0
- data/test/html/test_element_description.rb +1 -1
- data/test/test_memory_leak.rb +20 -0
- data/test/test_reader.rb +13 -0
- data/test/test_xslt_transforms.rb +6 -2
- data/test/xml/sax/test_parser.rb +16 -0
- data/test/xml/test_document.rb +3 -1
- data/test/xml/test_node.rb +13 -1
- data/test/xml/test_node_set.rb +10 -0
- data/test/xml/test_schema.rb +5 -0
- metadata +89 -130
- data/deps.rip +0 -5
- data/lib/nokogiri/css/generated_parser.rb +0 -676
- data/lib/nokogiri/css/generated_tokenizer.rb +0 -145
data/lib/nokogiri/css/parser.y
CHANGED
@@ -1,4 +1,4 @@
|
|
1
|
-
class Nokogiri::CSS::GeneratedParser
|
1
|
+
class Nokogiri::CSS::Parser
|
2
2
|
|
3
3
|
token FUNCTION INCLUDES DASHMATCH LBRACE HASH PLUS GREATER S STRING IDENT
|
4
4
|
token COMMA NUMBER PREFIXMATCH SUFFIXMATCH SUBSTRINGMATCH TILDE NOT_EQUAL
|
@@ -233,3 +233,5 @@ end
|
|
233
233
|
|
234
234
|
---- header
|
235
235
|
|
236
|
+
require 'nokogiri/css/parser_extras'
|
237
|
+
|
@@ -0,0 +1,91 @@
|
|
1
|
+
require 'thread'
|
2
|
+
|
3
|
+
module Nokogiri
|
4
|
+
module CSS
|
5
|
+
class Parser < Racc::Parser
|
6
|
+
@cache_on = true
|
7
|
+
@cache = {}
|
8
|
+
@mutex = Mutex.new
|
9
|
+
|
10
|
+
class << self
|
11
|
+
# Turn on CSS parse caching
|
12
|
+
attr_accessor :cache_on
|
13
|
+
alias :cache_on? :cache_on
|
14
|
+
alias :set_cache :cache_on=
|
15
|
+
|
16
|
+
# Get the css selector in +string+ from the cache
|
17
|
+
def [] string
|
18
|
+
return unless @cache_on
|
19
|
+
@mutex.synchronize { @cache[string] }
|
20
|
+
end
|
21
|
+
|
22
|
+
# Set the css selector in +string+ in the cache to +value+
|
23
|
+
def []= string, value
|
24
|
+
return value unless @cache_on
|
25
|
+
@mutex.synchronize { @cache[string] = value }
|
26
|
+
end
|
27
|
+
|
28
|
+
# Clear the cache
|
29
|
+
def clear_cache
|
30
|
+
@mutex.synchronize { @cache = {} }
|
31
|
+
end
|
32
|
+
|
33
|
+
# Execute +block+ without cache
|
34
|
+
def without_cache &block
|
35
|
+
tmp = @cache_on
|
36
|
+
@cache_on = false
|
37
|
+
block.call
|
38
|
+
@cache_on = tmp
|
39
|
+
end
|
40
|
+
|
41
|
+
###
|
42
|
+
# Parse this CSS selector in +selector+. Returns an AST.
|
43
|
+
def parse selector
|
44
|
+
@warned ||= false
|
45
|
+
unless @warned
|
46
|
+
$stderr.puts('Nokogiri::CSS::Parser.parse is deprecated, call Nokogiri::CSS.parse(), this will be removed August 1st or version 1.4.0 (whichever is first)')
|
47
|
+
@warned = true
|
48
|
+
end
|
49
|
+
new.parse selector
|
50
|
+
end
|
51
|
+
end
|
52
|
+
|
53
|
+
# Create a new CSS parser with respect to +namespaces+
|
54
|
+
def initialize namespaces = {}
|
55
|
+
@tokenizer = Tokenizer.new
|
56
|
+
@namespaces = namespaces
|
57
|
+
super()
|
58
|
+
end
|
59
|
+
|
60
|
+
def parse string
|
61
|
+
@tokenizer.scan_setup string
|
62
|
+
do_parse
|
63
|
+
end
|
64
|
+
|
65
|
+
def next_token
|
66
|
+
@tokenizer.next_token
|
67
|
+
end
|
68
|
+
|
69
|
+
# Get the xpath for +string+ using +options+
|
70
|
+
def xpath_for string, options={}
|
71
|
+
key = "#{string}#{options[:ns]}#{options[:prefix]}"
|
72
|
+
v = self.class[key]
|
73
|
+
return v if v
|
74
|
+
|
75
|
+
args = [
|
76
|
+
options[:prefix] || '//',
|
77
|
+
options[:visitor] || XPathVisitor.new
|
78
|
+
]
|
79
|
+
self.class[key] = parse(string).map { |ast|
|
80
|
+
ast.to_xpath(*args)
|
81
|
+
}
|
82
|
+
end
|
83
|
+
|
84
|
+
# On CSS parser error, raise an exception
|
85
|
+
def on_error error_token_id, error_value, value_stack
|
86
|
+
after = value_stack.compact.last
|
87
|
+
raise SyntaxError.new("unexpected '#{error_value}' after '#{after}'")
|
88
|
+
end
|
89
|
+
end
|
90
|
+
end
|
91
|
+
end
|
@@ -1,7 +1,152 @@
|
|
1
|
+
#--
|
2
|
+
# DO NOT MODIFY!!!!
|
3
|
+
# This file is automatically generated by rex 1.0.5
|
4
|
+
# from lexical definition file "lib/nokogiri/css/tokenizer.rex".
|
5
|
+
#++
|
6
|
+
|
1
7
|
module Nokogiri
|
2
|
-
|
3
|
-
|
4
|
-
|
8
|
+
module CSS
|
9
|
+
class Tokenizer
|
10
|
+
require 'strscan'
|
11
|
+
|
12
|
+
class ScanError < StandardError ; end
|
13
|
+
|
14
|
+
attr_reader :lineno
|
15
|
+
attr_reader :filename
|
16
|
+
attr_accessor :state
|
17
|
+
|
18
|
+
def scan_setup(str)
|
19
|
+
@ss = StringScanner.new(str)
|
20
|
+
@lineno = 1
|
21
|
+
@state = nil
|
22
|
+
end
|
23
|
+
|
24
|
+
def action
|
25
|
+
yield
|
26
|
+
end
|
27
|
+
|
28
|
+
def scan_str(str)
|
29
|
+
scan_setup(str)
|
30
|
+
do_parse
|
31
|
+
end
|
32
|
+
alias :scan :scan_str
|
33
|
+
|
34
|
+
def load_file( filename )
|
35
|
+
@filename = filename
|
36
|
+
open(filename, "r") do |f|
|
37
|
+
scan_setup(f.read)
|
5
38
|
end
|
6
39
|
end
|
40
|
+
|
41
|
+
def scan_file( filename )
|
42
|
+
load_file(filename)
|
43
|
+
do_parse
|
44
|
+
end
|
45
|
+
|
46
|
+
|
47
|
+
def next_token
|
48
|
+
return if @ss.eos?
|
49
|
+
|
50
|
+
# skips empty actions
|
51
|
+
until token = _next_token or @ss.eos?; end
|
52
|
+
token
|
53
|
+
end
|
54
|
+
|
55
|
+
def _next_token
|
56
|
+
text = @ss.peek(1)
|
57
|
+
@lineno += 1 if text == "\n"
|
58
|
+
token = case @state
|
59
|
+
when nil
|
60
|
+
case
|
61
|
+
when (text = @ss.scan(/has\([\s]*/))
|
62
|
+
action { [:HAS, text] }
|
63
|
+
|
64
|
+
when (text = @ss.scan(/[-@]?([_A-Za-z]|[^\0-\177]|\\[0-9A-Fa-f]{1,6}(\r\n|[\s])?|\\[^\n\r\f0-9A-Fa-f])([_A-Za-z0-9-]|[^\0-\177]|\\[0-9A-Fa-f]{1,6}(\r\n|[\s])?|\\[^\n\r\f0-9A-Fa-f])*\([\s]*/))
|
65
|
+
action { [:FUNCTION, text] }
|
66
|
+
|
67
|
+
when (text = @ss.scan(/[-@]?([_A-Za-z]|[^\0-\177]|\\[0-9A-Fa-f]{1,6}(\r\n|[\s])?|\\[^\n\r\f0-9A-Fa-f])([_A-Za-z0-9-]|[^\0-\177]|\\[0-9A-Fa-f]{1,6}(\r\n|[\s])?|\\[^\n\r\f0-9A-Fa-f])*/))
|
68
|
+
action { [:IDENT, text] }
|
69
|
+
|
70
|
+
when (text = @ss.scan(/\#([_A-Za-z0-9-]|[^\0-\177]|\\[0-9A-Fa-f]{1,6}(\r\n|[\s])?|\\[^\n\r\f0-9A-Fa-f])+/))
|
71
|
+
action { [:HASH, text] }
|
72
|
+
|
73
|
+
when (text = @ss.scan(/[\s]*~=[\s]*/))
|
74
|
+
action { [:INCLUDES, text] }
|
75
|
+
|
76
|
+
when (text = @ss.scan(/[\s]*\|=[\s]*/))
|
77
|
+
action { [:DASHMATCH, text] }
|
78
|
+
|
79
|
+
when (text = @ss.scan(/[\s]*\^=[\s]*/))
|
80
|
+
action { [:PREFIXMATCH, text] }
|
81
|
+
|
82
|
+
when (text = @ss.scan(/[\s]*\$=[\s]*/))
|
83
|
+
action { [:SUFFIXMATCH, text] }
|
84
|
+
|
85
|
+
when (text = @ss.scan(/[\s]*\*=[\s]*/))
|
86
|
+
action { [:SUBSTRINGMATCH, text] }
|
87
|
+
|
88
|
+
when (text = @ss.scan(/[\s]*!=[\s]*/))
|
89
|
+
action { [:NOT_EQUAL, text] }
|
90
|
+
|
91
|
+
when (text = @ss.scan(/[\s]*=[\s]*/))
|
92
|
+
action { [:EQUAL, text] }
|
93
|
+
|
94
|
+
when (text = @ss.scan(/[\s]*\)/))
|
95
|
+
action { [:RPAREN, text] }
|
96
|
+
|
97
|
+
when (text = @ss.scan(/[\s]*\[[\s]*/))
|
98
|
+
action { [:LSQUARE, text] }
|
99
|
+
|
100
|
+
when (text = @ss.scan(/[\s]*\]/))
|
101
|
+
action { [:RSQUARE, text] }
|
102
|
+
|
103
|
+
when (text = @ss.scan(/[\s]*\+[\s]*/))
|
104
|
+
action { [:PLUS, text] }
|
105
|
+
|
106
|
+
when (text = @ss.scan(/[\s]*>[\s]*/))
|
107
|
+
action { [:GREATER, text] }
|
108
|
+
|
109
|
+
when (text = @ss.scan(/[\s]*,[\s]*/))
|
110
|
+
action { [:COMMA, text] }
|
111
|
+
|
112
|
+
when (text = @ss.scan(/[\s]*~[\s]*/))
|
113
|
+
action { [:TILDE, text] }
|
114
|
+
|
115
|
+
when (text = @ss.scan(/\:not\([\s]*/))
|
116
|
+
action { [:NOT, text] }
|
117
|
+
|
118
|
+
when (text = @ss.scan(/-?([0-9]+|[0-9]*\.[0-9]+)/))
|
119
|
+
action { [:NUMBER, text] }
|
120
|
+
|
121
|
+
when (text = @ss.scan(/[\s]*\/\/[\s]*/))
|
122
|
+
action { [:DOUBLESLASH, text] }
|
123
|
+
|
124
|
+
when (text = @ss.scan(/[\s]*\/[\s]*/))
|
125
|
+
action { [:SLASH, text] }
|
126
|
+
|
127
|
+
when (text = @ss.scan(/U\+[0-9a-f?]{1,6}(-[0-9a-f]{1,6})?/))
|
128
|
+
action {[:UNICODE_RANGE, text] }
|
129
|
+
|
130
|
+
when (text = @ss.scan(/[\s]+/))
|
131
|
+
action { [:S, text] }
|
132
|
+
|
133
|
+
when (text = @ss.scan(/"([^\n\r\f"]|\n|\r\n|\r|\f|[^\0-\177]|\\[0-9A-Fa-f]{1,6}(\r\n|[\s])?|\\[^\n\r\f0-9A-Fa-f])*"|'([^\n\r\f']|\n|\r\n|\r|\f|[^\0-\177]|\\[0-9A-Fa-f]{1,6}(\r\n|[\s])?|\\[^\n\r\f0-9A-Fa-f])*'/))
|
134
|
+
action { [:STRING, text] }
|
135
|
+
|
136
|
+
when (text = @ss.scan(/./))
|
137
|
+
action { [text, text] }
|
138
|
+
|
139
|
+
else
|
140
|
+
text = @ss.string[@ss.pos .. -1]
|
141
|
+
raise ScanError, "can not match: '" + text + "'"
|
142
|
+
end # if
|
143
|
+
|
144
|
+
else
|
145
|
+
raise ScanError, "undefined state: '" + state.to_s + "'"
|
146
|
+
end # case state
|
147
|
+
token
|
148
|
+
end # def _next_token
|
149
|
+
|
150
|
+
end # class
|
151
|
+
end
|
7
152
|
end
|
@@ -5,27 +5,27 @@ else
|
|
5
5
|
require 'weakling'
|
6
6
|
Nokogiri::VERSION_INFO['refs'] = "weakling"
|
7
7
|
end
|
8
|
-
require 'singleton'
|
9
8
|
|
10
9
|
module Nokogiri
|
11
10
|
class WeakBucket
|
12
|
-
include Singleton
|
13
|
-
|
14
11
|
if Nokogiri::VERSION_INFO['refs'] == "weakling"
|
15
|
-
|
16
|
-
|
17
|
-
def initialize
|
18
|
-
@bucket = Weakling::IdHash.new
|
19
|
-
end
|
12
|
+
@@bucket = Weakling::IdHash.new
|
13
|
+
@@semaphore = Mutex.new
|
20
14
|
|
21
15
|
def WeakBucket.get_object(cstruct)
|
22
|
-
|
16
|
+
@@semaphore.synchronize do
|
17
|
+
@@bucket[cstruct.ruby_node_pointer]
|
18
|
+
end
|
23
19
|
end
|
24
20
|
|
25
21
|
def WeakBucket.set_object(cstruct, object)
|
26
|
-
|
22
|
+
@@semaphore.synchronize do
|
23
|
+
cstruct.ruby_node_pointer = @@bucket.add(object)
|
24
|
+
end
|
27
25
|
end
|
26
|
+
|
28
27
|
else
|
28
|
+
|
29
29
|
def WeakBucket.get_object(cstruct)
|
30
30
|
ptr = cstruct.ruby_node_pointer
|
31
31
|
ptr != 0 ? ObjectSpace._id2ref(ptr) : nil
|
@@ -157,6 +157,14 @@ module Nokogiri
|
|
157
157
|
LibXML.xmlFreeNsList(node.cstruct[:nsDef])
|
158
158
|
node.cstruct[:nsDef] = nil
|
159
159
|
end
|
160
|
+
unless node.cstruct[:properties].nil?
|
161
|
+
prop_ptr = node.cstruct[:properties]
|
162
|
+
while ! prop_ptr.null?
|
163
|
+
prop_cstruct = LibXML::XmlAttr.new(node.cstruct[:properties])
|
164
|
+
prop_cstruct[:ns] = nil unless prop_cstruct[:ns].nil?
|
165
|
+
prop_ptr = prop_cstruct[:next]
|
166
|
+
end
|
167
|
+
end
|
160
168
|
end
|
161
169
|
end
|
162
170
|
|
@@ -48,7 +48,15 @@ module Nokogiri
|
|
48
48
|
end
|
49
49
|
|
50
50
|
def __internal__startElement(_, name, attributes)
|
51
|
-
attrs =
|
51
|
+
attrs = []
|
52
|
+
offset = 0
|
53
|
+
if ! attributes.null?
|
54
|
+
while ! attributes.get_pointer(LibXML.pointer_offset(offset)).null? do
|
55
|
+
cons = attributes.get_array_of_string(LibXML.pointer_offset(offset), 2)
|
56
|
+
attrs << cons
|
57
|
+
offset += 2
|
58
|
+
end
|
59
|
+
end
|
52
60
|
@document.start_element name, attrs
|
53
61
|
end
|
54
62
|
|
@@ -51,6 +51,10 @@ module Nokogiri
|
|
51
51
|
end
|
52
52
|
|
53
53
|
def transform(document, params=[]) # :nodoc:
|
54
|
+
unless document.kind_of? Nokogiri::XML::Document
|
55
|
+
raise ArgumentError, "argument must be a Nokogiri::XML::Document"
|
56
|
+
end
|
57
|
+
|
54
58
|
params = params.to_a.flatten if params.is_a?(Hash)
|
55
59
|
raise(TypeError) unless params.is_a?(Array)
|
56
60
|
|
@@ -3,25 +3,44 @@ module Nokogiri
|
|
3
3
|
class Document < Nokogiri::XML::Document
|
4
4
|
###
|
5
5
|
# Get the meta tag encoding for this document. If there is no meta tag,
|
6
|
-
# then nil is returned
|
6
|
+
# then nil is returned.
|
7
7
|
def meta_encoding
|
8
|
-
|
9
|
-
|
10
|
-
}
|
11
|
-
|
12
|
-
/charset\s*=\s*([\w-]+)/i.match(meta['content'])[1]
|
8
|
+
meta = meta_content_type and
|
9
|
+
/charset\s*=\s*([\w-]+)/i.match(meta['content'])[1]
|
13
10
|
end
|
14
11
|
|
15
12
|
###
|
16
13
|
# Set the meta tag encoding for this document. If there is no meta
|
17
|
-
# content tag,
|
14
|
+
# content tag, the encoding is not set.
|
18
15
|
def meta_encoding= encoding
|
19
|
-
|
20
|
-
|
16
|
+
meta = meta_content_type and
|
17
|
+
meta['content'] = "text/html; charset=%s" % encoding
|
18
|
+
end
|
19
|
+
|
20
|
+
def meta_content_type
|
21
|
+
css('meta[@http-equiv]').find { |node|
|
22
|
+
node['http-equiv'] =~ /\AContent-Type\z/i
|
21
23
|
}
|
24
|
+
end
|
25
|
+
private :meta_content_type
|
22
26
|
|
23
|
-
|
24
|
-
|
27
|
+
###
|
28
|
+
# Get the title string of this document. Return nil if there is
|
29
|
+
# no title tag.
|
30
|
+
def title
|
31
|
+
title = at('title') and title.inner_text
|
32
|
+
end
|
33
|
+
|
34
|
+
###
|
35
|
+
# Set the title string of this document. If there is no head
|
36
|
+
# element, the title is not set.
|
37
|
+
def title=(text)
|
38
|
+
unless title = at('title')
|
39
|
+
head = at('head') or return nil
|
40
|
+
title = Nokogiri::XML::Node.new('title', self)
|
41
|
+
head << title
|
42
|
+
end
|
43
|
+
title.children = XML::Text.new(text, self)
|
25
44
|
end
|
26
45
|
|
27
46
|
####
|
@@ -39,10 +58,7 @@ module Nokogiri
|
|
39
58
|
# end
|
40
59
|
#
|
41
60
|
def serialize options = {}
|
42
|
-
options[:save_with] ||= XML::Node::SaveOptions::FORMAT |
|
43
|
-
XML::Node::SaveOptions::AS_HTML |
|
44
|
-
XML::Node::SaveOptions::NO_DECLARATION |
|
45
|
-
XML::Node::SaveOptions::NO_EMPTY_TAGS
|
61
|
+
options[:save_with] ||= XML::Node::SaveOptions::DEFAULT_HTML
|
46
62
|
super
|
47
63
|
end
|
48
64
|
|
@@ -75,16 +91,119 @@ module Nokogiri
|
|
75
91
|
|
76
92
|
if string_or_io.respond_to?(:read)
|
77
93
|
url ||= string_or_io.respond_to?(:path) ? string_or_io.path : nil
|
94
|
+
if !encoding
|
95
|
+
# Perform further encoding detection that libxml2 does
|
96
|
+
# not do.
|
97
|
+
string_or_io = EncodingReader.new(string_or_io)
|
98
|
+
begin
|
99
|
+
return read_io(string_or_io, url, encoding, options.to_i)
|
100
|
+
rescue EncodingFoundException => e
|
101
|
+
# A retry is required because libxml2 has a problem in
|
102
|
+
# that it cannot switch encoding well in the middle of
|
103
|
+
# parsing, especially if it has already seen a
|
104
|
+
# non-ASCII character when it finds an encoding hint.
|
105
|
+
encoding = e.encoding
|
106
|
+
end
|
107
|
+
end
|
78
108
|
return read_io(string_or_io, url, encoding, options.to_i)
|
79
109
|
end
|
80
110
|
|
81
111
|
# read_memory pukes on empty docs
|
82
112
|
return new if string_or_io.nil? or string_or_io.empty?
|
83
113
|
|
114
|
+
if !encoding
|
115
|
+
encoding = EncodingReader.detect_encoding(string_or_io)
|
116
|
+
end
|
117
|
+
|
84
118
|
read_memory(string_or_io, url, encoding, options.to_i)
|
85
119
|
end
|
86
120
|
end
|
87
121
|
|
122
|
+
class EncodingFoundException < Exception # :nodoc:
|
123
|
+
attr_reader :encoding
|
124
|
+
|
125
|
+
def initialize(encoding)
|
126
|
+
@encoding = encoding
|
127
|
+
super("encoding found: %s" % encoding)
|
128
|
+
end
|
129
|
+
end
|
130
|
+
|
131
|
+
class EncodingReader # :nodoc:
|
132
|
+
class SAXHandler < Nokogiri::XML::SAX::Document # :nodoc:
|
133
|
+
attr_reader :encoding
|
134
|
+
|
135
|
+
def found(encoding)
|
136
|
+
@encoding = encoding
|
137
|
+
throw :found
|
138
|
+
end
|
139
|
+
|
140
|
+
def not_found(encoding)
|
141
|
+
found nil
|
142
|
+
end
|
143
|
+
|
144
|
+
def start_element(name, attrs = [])
|
145
|
+
case name
|
146
|
+
when /\A(?:div|h1|img|p|br)\z/
|
147
|
+
not_found
|
148
|
+
when 'meta'
|
149
|
+
attr = Hash[attrs]
|
150
|
+
http_equiv = attr['http-equiv'] and
|
151
|
+
http_equiv.match(/\AContent-Type\z/i) and
|
152
|
+
content = attr['content'] and
|
153
|
+
m = content.match(/;\s*charset\s*=\s*([\w-]+)/) and
|
154
|
+
found m[1]
|
155
|
+
end
|
156
|
+
end
|
157
|
+
end
|
158
|
+
|
159
|
+
def self.detect_encoding(chunk)
|
160
|
+
m = chunk.match(/\A(<\?xml[ \t\r\n]+[^>]*>)/) and
|
161
|
+
return Nokogiri.XML(m[1]).encoding
|
162
|
+
|
163
|
+
handler = SAXHandler.new
|
164
|
+
parser = Nokogiri::HTML::SAX::Parser.new(handler)
|
165
|
+
catch(:found) {
|
166
|
+
parser.parse(chunk)
|
167
|
+
}
|
168
|
+
handler.encoding
|
169
|
+
rescue => e
|
170
|
+
nil
|
171
|
+
end
|
172
|
+
|
173
|
+
def initialize(io)
|
174
|
+
@io = io
|
175
|
+
@firstchunk = nil
|
176
|
+
end
|
177
|
+
|
178
|
+
def read(len)
|
179
|
+
# no support for a call without len
|
180
|
+
|
181
|
+
if !@firstchunk
|
182
|
+
@firstchunk = @io.read(len) or return nil
|
183
|
+
|
184
|
+
# This implementation expects and assumes that the first
|
185
|
+
# call from htmlReadIO() is made with a length long enough
|
186
|
+
# (~1KB) to achieve further encoding detection that
|
187
|
+
# libxml2 does not do.
|
188
|
+
if encoding = EncodingReader.detect_encoding(@firstchunk)
|
189
|
+
raise EncodingFoundException, encoding
|
190
|
+
end
|
191
|
+
|
192
|
+
# This chunk is stored for the next read in retry.
|
193
|
+
return @firstchunk
|
194
|
+
end
|
195
|
+
|
196
|
+
ret = @firstchunk.slice!(0, len)
|
197
|
+
if (len -= ret.length) > 0
|
198
|
+
rest = @io.read(len) and ret << rest
|
199
|
+
end
|
200
|
+
if ret.empty?
|
201
|
+
nil
|
202
|
+
else
|
203
|
+
ret
|
204
|
+
end
|
205
|
+
end
|
206
|
+
end
|
88
207
|
end
|
89
208
|
end
|
90
209
|
end
|