nokogiri 1.5.0.beta.3-java → 1.5.0.beta.4-java

Sign up to get free protection for your applications and to get access to all the features.

Potentially problematic release.


This version of nokogiri might be problematic (further details are linked from the original release page).

Files changed (88)
  1. data/CHANGELOG.ja.rdoc +32 -16
  2. data/CHANGELOG.rdoc +18 -0
  3. data/Manifest.txt +1 -2
  4. data/README.rdoc +27 -3
  5. data/Rakefile +39 -83
  6. data/ext/java/nokogiri/EncodingHandler.java +1 -1
  7. data/ext/java/nokogiri/HtmlDocument.java +11 -14
  8. data/ext/java/nokogiri/HtmlElementDescription.java +1 -1
  9. data/ext/java/nokogiri/HtmlEntityLookup.java +1 -1
  10. data/ext/java/nokogiri/HtmlSaxParserContext.java +13 -10
  11. data/ext/java/nokogiri/NokogiriService.java +103 -34
  12. data/ext/java/nokogiri/XmlAttr.java +14 -6
  13. data/ext/java/nokogiri/XmlAttributeDecl.java +1 -1
  14. data/ext/java/nokogiri/XmlCdata.java +3 -1
  15. data/ext/java/nokogiri/XmlComment.java +3 -1
  16. data/ext/java/nokogiri/XmlDocument.java +29 -8
  17. data/ext/java/nokogiri/XmlDocumentFragment.java +14 -13
  18. data/ext/java/nokogiri/XmlDtd.java +5 -2
  19. data/ext/java/nokogiri/XmlElement.java +2 -1
  20. data/ext/java/nokogiri/XmlElementContent.java +1 -1
  21. data/ext/java/nokogiri/XmlElementDecl.java +2 -1
  22. data/ext/java/nokogiri/XmlEntityDecl.java +2 -1
  23. data/ext/java/nokogiri/XmlEntityReference.java +1 -1
  24. data/ext/java/nokogiri/XmlNamespace.java +3 -2
  25. data/ext/java/nokogiri/XmlNode.java +17 -10
  26. data/ext/java/nokogiri/XmlNodeSet.java +40 -13
  27. data/ext/java/nokogiri/XmlProcessingInstruction.java +1 -1
  28. data/ext/java/nokogiri/XmlReader.java +3 -1
  29. data/ext/java/nokogiri/XmlRelaxng.java +37 -92
  30. data/ext/java/nokogiri/XmlSaxParserContext.java +25 -11
  31. data/ext/java/nokogiri/XmlSaxPushParser.java +6 -4
  32. data/ext/java/nokogiri/XmlSchema.java +190 -46
  33. data/ext/java/nokogiri/XmlSyntaxError.java +42 -37
  34. data/ext/java/nokogiri/XmlText.java +3 -2
  35. data/ext/java/nokogiri/XmlXpathContext.java +8 -4
  36. data/ext/java/nokogiri/XsltStylesheet.java +12 -10
  37. data/ext/java/nokogiri/internals/HtmlDomParserContext.java +8 -7
  38. data/ext/java/nokogiri/internals/NokogiriDocumentCache.java +1 -1
  39. data/ext/java/nokogiri/internals/NokogiriErrorHandler.java +11 -5
  40. data/ext/java/nokogiri/internals/NokogiriHandler.java +36 -9
  41. data/ext/java/nokogiri/internals/NokogiriHelpers.java +21 -22
  42. data/ext/java/nokogiri/internals/NokogiriNamespaceCache.java +5 -4
  43. data/ext/java/nokogiri/internals/NokogiriNamespaceContext.java +1 -1
  44. data/ext/java/nokogiri/internals/NokogiriNonStrictErrorHandler.java +2 -1
  45. data/ext/java/nokogiri/internals/NokogiriNonStrictErrorHandler4NekoHtml.java +1 -1
  46. data/ext/java/nokogiri/internals/NokogiriStrictErrorHandler.java +2 -1
  47. data/ext/java/nokogiri/internals/NokogiriXPathFunction.java +15 -9
  48. data/ext/java/nokogiri/internals/NokogiriXPathFunctionResolver.java +1 -1
  49. data/ext/java/nokogiri/internals/ParserContext.java +18 -7
  50. data/ext/java/nokogiri/internals/PushInputStream.java +1 -1
  51. data/ext/java/nokogiri/internals/ReaderNode.java +7 -6
  52. data/ext/java/nokogiri/internals/SaveContext.java +16 -10
  53. data/ext/java/nokogiri/internals/SchemaErrorHandler.java +13 -5
  54. data/ext/java/nokogiri/internals/XmlDeclHandler.java +1 -1
  55. data/ext/java/nokogiri/internals/XmlDomParser.java +1 -1
  56. data/ext/java/nokogiri/internals/XmlDomParserContext.java +13 -8
  57. data/ext/java/nokogiri/internals/XmlSaxParser.java +1 -1
  58. data/ext/java/nokogiri/internals/XsltExtensionFunction.java +1 -1
  59. data/ext/nokogiri/extconf.rb +3 -3
  60. data/ext/nokogiri/xml_document.c +9 -0
  61. data/ext/nokogiri/xml_sax_parser.c +4 -2
  62. data/lib/nokogiri.rb +9 -6
  63. data/lib/nokogiri/css.rb +1 -3
  64. data/lib/nokogiri/css/parser.rb +665 -70
  65. data/lib/nokogiri/css/parser.y +3 -1
  66. data/lib/nokogiri/css/parser_extras.rb +91 -0
  67. data/lib/nokogiri/css/tokenizer.rb +148 -3
  68. data/lib/nokogiri/css/tokenizer.rex +1 -1
  69. data/lib/nokogiri/html/document.rb +138 -11
  70. data/lib/nokogiri/html/sax/parser.rb +6 -2
  71. data/lib/nokogiri/nokogiri.jar +0 -0
  72. data/lib/nokogiri/version.rb +1 -1
  73. data/lib/nokogiri/xml/node.rb +2 -2
  74. data/lib/nokogiri/xml/node/save_options.rb +3 -0
  75. data/lib/nokogiri/xml/node_set.rb +1 -1
  76. data/test/css/test_tokenizer.rb +8 -0
  77. data/test/helper.rb +2 -0
  78. data/test/html/sax/test_parser.rb +43 -0
  79. data/test/html/test_document.rb +59 -0
  80. data/test/html/test_document_encoding.rb +48 -0
  81. data/test/html/test_element_description.rb +1 -1
  82. data/test/xml/sax/test_parser.rb +16 -0
  83. data/test/xml/test_document.rb +3 -1
  84. data/test/xml/test_node.rb +4 -1
  85. data/test/xml/test_node_set.rb +10 -0
  86. metadata +73 -82
  87. data/lib/nokogiri/css/generated_parser.rb +0 -676
  88. data/lib/nokogiri/css/generated_tokenizer.rb +0 -145
@@ -1,4 +1,4 @@
1
- class Nokogiri::CSS::GeneratedParser
1
+ class Nokogiri::CSS::Parser
2
2
 
3
3
  token FUNCTION INCLUDES DASHMATCH LBRACE HASH PLUS GREATER S STRING IDENT
4
4
  token COMMA NUMBER PREFIXMATCH SUFFIXMATCH SUBSTRINGMATCH TILDE NOT_EQUAL
@@ -233,3 +233,5 @@ end
233
233
 
234
234
  ---- header
235
235
 
236
+ require 'nokogiri/css/parser_extras'
237
+
@@ -0,0 +1,91 @@
1
+ require 'thread'
2
+
3
+ module Nokogiri
4
+ module CSS
5
+ class Parser < Racc::Parser
6
+ @cache_on = true
7
+ @cache = {}
8
+ @mutex = Mutex.new
9
+
10
+ class << self
11
+ # Turn on CSS parse caching
12
+ attr_accessor :cache_on
13
+ alias :cache_on? :cache_on
14
+ alias :set_cache :cache_on=
15
+
16
+ # Get the css selector in +string+ from the cache
17
+ def [] string
18
+ return unless @cache_on
19
+ @mutex.synchronize { @cache[string] }
20
+ end
21
+
22
+ # Set the css selector in +string+ in the cache to +value+
23
+ def []= string, value
24
+ return value unless @cache_on
25
+ @mutex.synchronize { @cache[string] = value }
26
+ end
27
+
28
+ # Clear the cache
29
+ def clear_cache
30
+ @mutex.synchronize { @cache = {} }
31
+ end
32
+
33
+ # Execute +block+ without cache
34
+ def without_cache &block
35
+ tmp = @cache_on
36
+ @cache_on = false
37
+ block.call
38
+ @cache_on = tmp
39
+ end
40
+
41
+ ###
42
+ # Parse this CSS selector in +selector+. Returns an AST.
43
+ def parse selector
44
+ @warned ||= false
45
+ unless @warned
46
+ $stderr.puts('Nokogiri::CSS::Parser.parse is deprecated, call Nokogiri::CSS.parse(), this will be removed August 1st or version 1.4.0 (whichever is first)')
47
+ @warned = true
48
+ end
49
+ new.parse selector
50
+ end
51
+ end
52
+
53
+ # Create a new CSS parser with respect to +namespaces+
54
+ def initialize namespaces = {}
55
+ @tokenizer = Tokenizer.new
56
+ @namespaces = namespaces
57
+ super()
58
+ end
59
+
60
+ def parse string
61
+ @tokenizer.scan_setup string
62
+ do_parse
63
+ end
64
+
65
+ def next_token
66
+ @tokenizer.next_token
67
+ end
68
+
69
+ # Get the xpath for +string+ using +options+
70
+ def xpath_for string, options={}
71
+ key = "#{string}#{options[:ns]}#{options[:prefix]}"
72
+ v = self.class[key]
73
+ return v if v
74
+
75
+ args = [
76
+ options[:prefix] || '//',
77
+ options[:visitor] || XPathVisitor.new
78
+ ]
79
+ self.class[key] = parse(string).map { |ast|
80
+ ast.to_xpath(*args)
81
+ }
82
+ end
83
+
84
+ # On CSS parser error, raise an exception
85
+ def on_error error_token_id, error_value, value_stack
86
+ after = value_stack.compact.last
87
+ raise SyntaxError.new("unexpected '#{error_value}' after '#{after}'")
88
+ end
89
+ end
90
+ end
91
+ end
@@ -1,7 +1,152 @@
1
+ #--
2
+ # DO NOT MODIFY!!!!
3
+ # This file is automatically generated by rex 1.0.5
4
+ # from lexical definition file "lib/nokogiri/css/tokenizer.rex".
5
+ #++
6
+
1
7
  module Nokogiri
2
- module CSS
3
- class Tokenizer < GeneratedTokenizer
4
- alias :scan :scan_setup
8
+ module CSS
9
+ class Tokenizer
10
+ require 'strscan'
11
+
12
+ class ScanError < StandardError ; end
13
+
14
+ attr_reader :lineno
15
+ attr_reader :filename
16
+ attr_accessor :state
17
+
18
+ def scan_setup(str)
19
+ @ss = StringScanner.new(str)
20
+ @lineno = 1
21
+ @state = nil
22
+ end
23
+
24
+ def action
25
+ yield
26
+ end
27
+
28
+ def scan_str(str)
29
+ scan_setup(str)
30
+ do_parse
31
+ end
32
+ alias :scan :scan_str
33
+
34
+ def load_file( filename )
35
+ @filename = filename
36
+ open(filename, "r") do |f|
37
+ scan_setup(f.read)
5
38
  end
6
39
  end
40
+
41
+ def scan_file( filename )
42
+ load_file(filename)
43
+ do_parse
44
+ end
45
+
46
+
47
+ def next_token
48
+ return if @ss.eos?
49
+
50
+ # skips empty actions
51
+ until token = _next_token or @ss.eos?; end
52
+ token
53
+ end
54
+
55
+ def _next_token
56
+ text = @ss.peek(1)
57
+ @lineno += 1 if text == "\n"
58
+ token = case @state
59
+ when nil
60
+ case
61
+ when (text = @ss.scan(/has\([\s]*/))
62
+ action { [:HAS, text] }
63
+
64
+ when (text = @ss.scan(/[-@]?([_A-Za-z]|[^\0-\177]|\\[0-9A-Fa-f]{1,6}(\r\n|[\s])?|\\[^\n\r\f0-9A-Fa-f])([_A-Za-z0-9-]|[^\0-\177]|\\[0-9A-Fa-f]{1,6}(\r\n|[\s])?|\\[^\n\r\f0-9A-Fa-f])*\([\s]*/))
65
+ action { [:FUNCTION, text] }
66
+
67
+ when (text = @ss.scan(/[-@]?([_A-Za-z]|[^\0-\177]|\\[0-9A-Fa-f]{1,6}(\r\n|[\s])?|\\[^\n\r\f0-9A-Fa-f])([_A-Za-z0-9-]|[^\0-\177]|\\[0-9A-Fa-f]{1,6}(\r\n|[\s])?|\\[^\n\r\f0-9A-Fa-f])*/))
68
+ action { [:IDENT, text] }
69
+
70
+ when (text = @ss.scan(/\#([_A-Za-z0-9-]|[^\0-\177]|\\[0-9A-Fa-f]{1,6}(\r\n|[\s])?|\\[^\n\r\f0-9A-Fa-f])+/))
71
+ action { [:HASH, text] }
72
+
73
+ when (text = @ss.scan(/[\s]*~=[\s]*/))
74
+ action { [:INCLUDES, text] }
75
+
76
+ when (text = @ss.scan(/[\s]*\|=[\s]*/))
77
+ action { [:DASHMATCH, text] }
78
+
79
+ when (text = @ss.scan(/[\s]*\^=[\s]*/))
80
+ action { [:PREFIXMATCH, text] }
81
+
82
+ when (text = @ss.scan(/[\s]*\$=[\s]*/))
83
+ action { [:SUFFIXMATCH, text] }
84
+
85
+ when (text = @ss.scan(/[\s]*\*=[\s]*/))
86
+ action { [:SUBSTRINGMATCH, text] }
87
+
88
+ when (text = @ss.scan(/[\s]*!=[\s]*/))
89
+ action { [:NOT_EQUAL, text] }
90
+
91
+ when (text = @ss.scan(/[\s]*=[\s]*/))
92
+ action { [:EQUAL, text] }
93
+
94
+ when (text = @ss.scan(/[\s]*\)/))
95
+ action { [:RPAREN, text] }
96
+
97
+ when (text = @ss.scan(/[\s]*\[[\s]*/))
98
+ action { [:LSQUARE, text] }
99
+
100
+ when (text = @ss.scan(/[\s]*\]/))
101
+ action { [:RSQUARE, text] }
102
+
103
+ when (text = @ss.scan(/[\s]*\+[\s]*/))
104
+ action { [:PLUS, text] }
105
+
106
+ when (text = @ss.scan(/[\s]*>[\s]*/))
107
+ action { [:GREATER, text] }
108
+
109
+ when (text = @ss.scan(/[\s]*,[\s]*/))
110
+ action { [:COMMA, text] }
111
+
112
+ when (text = @ss.scan(/[\s]*~[\s]*/))
113
+ action { [:TILDE, text] }
114
+
115
+ when (text = @ss.scan(/\:not\([\s]*/))
116
+ action { [:NOT, text] }
117
+
118
+ when (text = @ss.scan(/-?([0-9]+|[0-9]*\.[0-9]+)/))
119
+ action { [:NUMBER, text] }
120
+
121
+ when (text = @ss.scan(/[\s]*\/\/[\s]*/))
122
+ action { [:DOUBLESLASH, text] }
123
+
124
+ when (text = @ss.scan(/[\s]*\/[\s]*/))
125
+ action { [:SLASH, text] }
126
+
127
+ when (text = @ss.scan(/U\+[0-9a-f?]{1,6}(-[0-9a-f]{1,6})?/))
128
+ action {[:UNICODE_RANGE, text] }
129
+
130
+ when (text = @ss.scan(/[\s]+/))
131
+ action { [:S, text] }
132
+
133
+ when (text = @ss.scan(/"([^\n\r\f"]|\n|\r\n|\r|\f|[^\0-\177]|\\[0-9A-Fa-f]{1,6}(\r\n|[\s])?|\\[^\n\r\f0-9A-Fa-f])*"|'([^\n\r\f']|\n|\r\n|\r|\f|[^\0-\177]|\\[0-9A-Fa-f]{1,6}(\r\n|[\s])?|\\[^\n\r\f0-9A-Fa-f])*'/))
134
+ action { [:STRING, text] }
135
+
136
+ when (text = @ss.scan(/./))
137
+ action { [text, text] }
138
+
139
+ else
140
+ text = @ss.string[@ss.pos .. -1]
141
+ raise ScanError, "can not match: '" + text + "'"
142
+ end # if
143
+
144
+ else
145
+ raise ScanError, "undefined state: '" + state.to_s + "'"
146
+ end # case state
147
+ token
148
+ end # def _next_token
149
+
150
+ end # class
151
+ end
7
152
  end
@@ -1,6 +1,6 @@
1
1
  module Nokogiri
2
2
  module CSS
3
- class GeneratedTokenizer < GeneratedParser
3
+ class Tokenizer
4
4
 
5
5
  macro
6
6
  nl \n|\r\n|\r|\f
@@ -3,25 +3,44 @@ module Nokogiri
3
3
  class Document < Nokogiri::XML::Document
4
4
  ###
5
5
  # Get the meta tag encoding for this document. If there is no meta tag,
6
- # then nil is returned
6
+ # then nil is returned.
7
7
  def meta_encoding
8
- return nil unless meta = css('meta').find { |node|
9
- node['http-equiv'] =~ /Content-Type/i
10
- }
11
-
12
- /charset\s*=\s*([\w-]+)/i.match(meta['content'])[1]
8
+ meta = meta_content_type and
9
+ /charset\s*=\s*([\w-]+)/i.match(meta['content'])[1]
13
10
  end
14
11
 
15
12
  ###
16
13
  # Set the meta tag encoding for this document. If there is no meta
17
- # content tag, nil is returned and the encoding is not set.
14
+ # content tag, the encoding is not set.
18
15
  def meta_encoding= encoding
19
- return nil unless meta = css('meta').find { |node|
20
- node['http-equiv'] =~ /Content-Type/i
16
+ meta = meta_content_type and
17
+ meta['content'] = "text/html; charset=%s" % encoding
18
+ end
19
+
20
+ def meta_content_type
21
+ css('meta[@http-equiv]').find { |node|
22
+ node['http-equiv'] =~ /\AContent-Type\z/i
21
23
  }
24
+ end
25
+ private :meta_content_type
22
26
 
23
- meta['content'] = "text/html; charset=%s" % encoding
24
- encoding
27
+ ###
28
+ # Get the title string of this document. Return nil if there is
29
+ # no title tag.
30
+ def title
31
+ title = at('title') and title.inner_text
32
+ end
33
+
34
+ ###
35
+ # Set the title string of this document. If there is no head
36
+ # element, the title is not set.
37
+ def title=(text)
38
+ unless title = at('title')
39
+ head = at('head') or return nil
40
+ title = Nokogiri::XML::Node.new('title', self)
41
+ head << title
42
+ end
43
+ title.children = XML::Text.new(text, self)
25
44
  end
26
45
 
27
46
  ####
@@ -75,16 +94,124 @@ module Nokogiri
75
94
 
76
95
  if string_or_io.respond_to?(:read)
77
96
  url ||= string_or_io.respond_to?(:path) ? string_or_io.path : nil
97
+ if !encoding
98
+ # Perform further encoding detection that libxml2 does
99
+ # not do.
100
+ string_or_io = EncodingReader.new(string_or_io)
101
+ begin
102
+ return read_io(string_or_io, url, encoding, options.to_i)
103
+ rescue EncodingFoundException => e
104
+ # A retry is required because libxml2 has a problem in
105
+ # that it cannot switch encoding well in the middle of
106
+ # parsing, especially if it has already seen a
107
+ # non-ASCII character when it finds an encoding hint.
108
+ encoding = e.encoding
109
+ end
110
+ end
78
111
  return read_io(string_or_io, url, encoding, options.to_i)
79
112
  end
80
113
 
81
114
  # read_memory pukes on empty docs
82
115
  return new if string_or_io.nil? or string_or_io.empty?
83
116
 
117
+ if !encoding
118
+ encoding = EncodingReader.detect_encoding(string_or_io)
119
+ end
120
+
84
121
  read_memory(string_or_io, url, encoding, options.to_i)
85
122
  end
86
123
  end
87
124
 
125
+ class EncodingFoundException < Exception # :nodoc:
126
+ attr_reader :encoding
127
+
128
+ def initialize(encoding)
129
+ @encoding = encoding
130
+ super("encoding found: %s" % encoding)
131
+ end
132
+ end
133
+
134
+ class EncodingReader # :nodoc:
135
+ class SAXHandler < Nokogiri::XML::SAX::Document # :nodoc:
136
+ attr_reader :encoding
137
+
138
+ def found(encoding)
139
+ @encoding = encoding
140
+ throw :found
141
+ end
142
+
143
+ def not_found(encoding)
144
+ found nil
145
+ end
146
+
147
+ def start_element(name, attrs = [])
148
+ case name
149
+ when /\A(?:div|h1|img|p|br)\z/
150
+ not_found
151
+ when 'meta'
152
+ attr = Hash[attrs]
153
+ http_equiv = attr['http-equiv'] and
154
+ http_equiv.match(/\AContent-Type\z/i) and
155
+ content = attr['content'] and
156
+ m = content.match(/;\s*charset\s*=\s*([\w-]+)/) and
157
+ found m[1]
158
+ end
159
+ end
160
+ end
161
+
162
+ def self.detect_encoding(chunk)
163
+ m = chunk.match(/\A(<\?xml[ \t\r\n]+[^>]*>)/) and
164
+ return Nokogiri.XML(m[1]).encoding
165
+
166
+ if Nokogiri.jruby?
167
+ m = chunk.match(/(<meta\s)(.*)(charset\s*=\s*([\w-]+))(.*)/i) and
168
+ return m[4]
169
+ end
170
+
171
+ handler = SAXHandler.new
172
+ parser = Nokogiri::HTML::SAX::Parser.new(handler)
173
+ catch(:found) {
174
+ parser.parse(chunk)
175
+ }
176
+ handler.encoding
177
+ rescue => e
178
+ nil
179
+ end
180
+
181
+ def initialize(io)
182
+ @io = io
183
+ @firstchunk = nil
184
+ end
185
+
186
+ def read(len)
187
+ # no support for a call without len
188
+
189
+ if !@firstchunk
190
+ @firstchunk = @io.read(len) or return nil
191
+
192
+ # This implementation expects and assumes that the first
193
+ # call from htmlReadIO() is made with a length long enough
194
+ # (~1KB) to achieve further encoding detection that
195
+ # libxml2 does not do.
196
+ if encoding = EncodingReader.detect_encoding(@firstchunk)
197
+ raise EncodingFoundException, encoding
198
+ end
199
+
200
+ # This chunk is stored for the next read in retry.
201
+ return @firstchunk
202
+ end
203
+
204
+ ret = @firstchunk.slice!(0, len)
205
+ if (len -= ret.length) > 0
206
+ rest = @io.read(len) and ret << rest
207
+ end
208
+ if ret.empty?
209
+ nil
210
+ else
211
+ ret
212
+ end
213
+ end
214
+ end
88
215
  end
89
216
  end
90
217
  end
@@ -31,7 +31,9 @@ module Nokogiri
31
31
  def parse_memory data, encoding = 'UTF-8'
32
32
  raise ArgumentError unless data
33
33
  return unless data.length > 0
34
- ParserContext.memory(data, encoding).parse_with self
34
+ ctx = ParserContext.memory(data, encoding)
35
+ yield ctx if block_given?
36
+ ctx.parse_with self
35
37
  end
36
38
 
37
39
  ###
@@ -40,7 +42,9 @@ module Nokogiri
40
42
  raise ArgumentError unless filename
41
43
  raise Errno::ENOENT unless File.exists?(filename)
42
44
  raise Errno::EISDIR if File.directory?(filename)
43
- ParserContext.file(filename, encoding).parse_with self
45
+ ctx = ParserContext.file(filename, encoding)
46
+ yield ctx if block_given?
47
+ ctx.parse_with self
44
48
  end
45
49
  end
46
50
  end