nokogiri 1.4.4 → 1.4.5

Sign up to get free protection for your applications and to get access to all the features.

Potentially problematic release.


This version of nokogiri might be problematic. Click here for more details.

Files changed (54)
  1. data/.gemtest +0 -0
  2. data/CHANGELOG.ja.rdoc +16 -0
  3. data/CHANGELOG.rdoc +23 -1
  4. data/Manifest.txt +4 -3
  5. data/Rakefile +41 -37
  6. data/ext/nokogiri/xml_document.c +9 -0
  7. data/ext/nokogiri/xml_io.c +32 -7
  8. data/ext/nokogiri/xml_node.c +14 -13
  9. data/ext/nokogiri/xml_sax_parser.c +4 -2
  10. data/ext/nokogiri/xslt_stylesheet.c +9 -3
  11. data/lib/nokogiri/css.rb +6 -3
  12. data/lib/nokogiri/css/parser.rb +665 -70
  13. data/lib/nokogiri/css/parser.y +3 -1
  14. data/lib/nokogiri/css/parser_extras.rb +91 -0
  15. data/lib/nokogiri/css/tokenizer.rb +148 -3
  16. data/lib/nokogiri/css/tokenizer.rex +1 -1
  17. data/lib/nokogiri/ffi/structs/xml_attr.rb +2 -1
  18. data/lib/nokogiri/ffi/structs/xml_node_set.rb +1 -1
  19. data/lib/nokogiri/ffi/weak_bucket.rb +10 -10
  20. data/lib/nokogiri/ffi/xml/document.rb +8 -0
  21. data/lib/nokogiri/ffi/xml/node_set.rb +1 -0
  22. data/lib/nokogiri/ffi/xml/sax/parser.rb +9 -1
  23. data/lib/nokogiri/ffi/xslt/stylesheet.rb +4 -0
  24. data/lib/nokogiri/html/document.rb +134 -15
  25. data/lib/nokogiri/html/sax/parser.rb +6 -2
  26. data/lib/nokogiri/version.rb +6 -1
  27. data/lib/nokogiri/xml/node.rb +8 -23
  28. data/lib/nokogiri/xml/node/save_options.rb +10 -0
  29. data/lib/nokogiri/xml/node_set.rb +1 -1
  30. data/lib/nokogiri/xml/parse_options.rb +8 -0
  31. data/lib/nokogiri/xml/reader.rb +6 -6
  32. data/lib/nokogiri/xml/sax/document.rb +2 -2
  33. data/lib/nokogiri/xml/schema.rb +7 -1
  34. data/tasks/cross_compile.rb +8 -15
  35. data/test/css/test_tokenizer.rb +8 -0
  36. data/test/files/encoding.html +82 -0
  37. data/test/files/encoding.xhtml +84 -0
  38. data/test/helper.rb +2 -0
  39. data/test/html/sax/test_parser.rb +45 -0
  40. data/test/html/test_document.rb +55 -0
  41. data/test/html/test_document_encoding.rb +46 -0
  42. data/test/html/test_element_description.rb +1 -1
  43. data/test/test_memory_leak.rb +20 -0
  44. data/test/test_reader.rb +13 -0
  45. data/test/test_xslt_transforms.rb +6 -2
  46. data/test/xml/sax/test_parser.rb +16 -0
  47. data/test/xml/test_document.rb +3 -1
  48. data/test/xml/test_node.rb +13 -1
  49. data/test/xml/test_node_set.rb +10 -0
  50. data/test/xml/test_schema.rb +5 -0
  51. metadata +94 -109
  52. data/deps.rip +0 -5
  53. data/lib/nokogiri/css/generated_parser.rb +0 -676
  54. data/lib/nokogiri/css/generated_tokenizer.rb +0 -145
@@ -1,4 +1,4 @@
1
- class Nokogiri::CSS::GeneratedParser
1
+ class Nokogiri::CSS::Parser
2
2
 
3
3
  token FUNCTION INCLUDES DASHMATCH LBRACE HASH PLUS GREATER S STRING IDENT
4
4
  token COMMA NUMBER PREFIXMATCH SUFFIXMATCH SUBSTRINGMATCH TILDE NOT_EQUAL
@@ -233,3 +233,5 @@ end
233
233
 
234
234
  ---- header
235
235
 
236
+ require 'nokogiri/css/parser_extras'
237
+
@@ -0,0 +1,91 @@
1
+ require 'thread'
2
+
3
+ module Nokogiri
4
+ module CSS
5
+ class Parser < Racc::Parser
6
+ @cache_on = true
7
+ @cache = {}
8
+ @mutex = Mutex.new
9
+
10
+ class << self
11
+ # Turn on CSS parse caching
12
+ attr_accessor :cache_on
13
+ alias :cache_on? :cache_on
14
+ alias :set_cache :cache_on=
15
+
16
+ # Get the css selector in +string+ from the cache
17
+ def [] string
18
+ return unless @cache_on
19
+ @mutex.synchronize { @cache[string] }
20
+ end
21
+
22
+ # Set the css selector in +string+ in the cache to +value+
23
+ def []= string, value
24
+ return value unless @cache_on
25
+ @mutex.synchronize { @cache[string] = value }
26
+ end
27
+
28
+ # Clear the cache
29
+ def clear_cache
30
+ @mutex.synchronize { @cache = {} }
31
+ end
32
+
33
+ # Execute +block+ without cache
34
+ def without_cache &block
35
+ tmp = @cache_on
36
+ @cache_on = false
37
+ block.call
38
+ @cache_on = tmp
39
+ end
40
+
41
+ ###
42
+ # Parse this CSS selector in +selector+. Returns an AST.
43
+ def parse selector
44
+ @warned ||= false
45
+ unless @warned
46
+ $stderr.puts('Nokogiri::CSS::Parser.parse is deprecated, call Nokogiri::CSS.parse(), this will be removed August 1st or version 1.4.0 (whichever is first)')
47
+ @warned = true
48
+ end
49
+ new.parse selector
50
+ end
51
+ end
52
+
53
+ # Create a new CSS parser with respect to +namespaces+
54
+ def initialize namespaces = {}
55
+ @tokenizer = Tokenizer.new
56
+ @namespaces = namespaces
57
+ super()
58
+ end
59
+
60
+ def parse string
61
+ @tokenizer.scan_setup string
62
+ do_parse
63
+ end
64
+
65
+ def next_token
66
+ @tokenizer.next_token
67
+ end
68
+
69
+ # Get the xpath for +string+ using +options+
70
+ def xpath_for string, options={}
71
+ key = "#{string}#{options[:ns]}#{options[:prefix]}"
72
+ v = self.class[key]
73
+ return v if v
74
+
75
+ args = [
76
+ options[:prefix] || '//',
77
+ options[:visitor] || XPathVisitor.new
78
+ ]
79
+ self.class[key] = parse(string).map { |ast|
80
+ ast.to_xpath(*args)
81
+ }
82
+ end
83
+
84
+ # On CSS parser error, raise an exception
85
+ def on_error error_token_id, error_value, value_stack
86
+ after = value_stack.compact.last
87
+ raise SyntaxError.new("unexpected '#{error_value}' after '#{after}'")
88
+ end
89
+ end
90
+ end
91
+ end
@@ -1,7 +1,152 @@
1
+ #--
2
+ # DO NOT MODIFY!!!!
3
+ # This file is automatically generated by rex 1.0.5
4
+ # from lexical definition file "lib/nokogiri/css/tokenizer.rex".
5
+ #++
6
+
1
7
  module Nokogiri
2
- module CSS
3
- class Tokenizer < GeneratedTokenizer
4
- alias :scan :scan_setup
8
+ module CSS
9
+ class Tokenizer
10
+ require 'strscan'
11
+
12
+ class ScanError < StandardError ; end
13
+
14
+ attr_reader :lineno
15
+ attr_reader :filename
16
+ attr_accessor :state
17
+
18
+ def scan_setup(str)
19
+ @ss = StringScanner.new(str)
20
+ @lineno = 1
21
+ @state = nil
22
+ end
23
+
24
+ def action
25
+ yield
26
+ end
27
+
28
+ def scan_str(str)
29
+ scan_setup(str)
30
+ do_parse
31
+ end
32
+ alias :scan :scan_str
33
+
34
+ def load_file( filename )
35
+ @filename = filename
36
+ open(filename, "r") do |f|
37
+ scan_setup(f.read)
5
38
  end
6
39
  end
40
+
41
+ def scan_file( filename )
42
+ load_file(filename)
43
+ do_parse
44
+ end
45
+
46
+
47
+ def next_token
48
+ return if @ss.eos?
49
+
50
+ # skips empty actions
51
+ until token = _next_token or @ss.eos?; end
52
+ token
53
+ end
54
+
55
+ def _next_token
56
+ text = @ss.peek(1)
57
+ @lineno += 1 if text == "\n"
58
+ token = case @state
59
+ when nil
60
+ case
61
+ when (text = @ss.scan(/has\([\s]*/))
62
+ action { [:HAS, text] }
63
+
64
+ when (text = @ss.scan(/[-@]?([_A-Za-z]|[^\0-\177]|\\[0-9A-Fa-f]{1,6}(\r\n|[\s])?|\\[^\n\r\f0-9A-Fa-f])([_A-Za-z0-9-]|[^\0-\177]|\\[0-9A-Fa-f]{1,6}(\r\n|[\s])?|\\[^\n\r\f0-9A-Fa-f])*\([\s]*/))
65
+ action { [:FUNCTION, text] }
66
+
67
+ when (text = @ss.scan(/[-@]?([_A-Za-z]|[^\0-\177]|\\[0-9A-Fa-f]{1,6}(\r\n|[\s])?|\\[^\n\r\f0-9A-Fa-f])([_A-Za-z0-9-]|[^\0-\177]|\\[0-9A-Fa-f]{1,6}(\r\n|[\s])?|\\[^\n\r\f0-9A-Fa-f])*/))
68
+ action { [:IDENT, text] }
69
+
70
+ when (text = @ss.scan(/\#([_A-Za-z0-9-]|[^\0-\177]|\\[0-9A-Fa-f]{1,6}(\r\n|[\s])?|\\[^\n\r\f0-9A-Fa-f])+/))
71
+ action { [:HASH, text] }
72
+
73
+ when (text = @ss.scan(/[\s]*~=[\s]*/))
74
+ action { [:INCLUDES, text] }
75
+
76
+ when (text = @ss.scan(/[\s]*\|=[\s]*/))
77
+ action { [:DASHMATCH, text] }
78
+
79
+ when (text = @ss.scan(/[\s]*\^=[\s]*/))
80
+ action { [:PREFIXMATCH, text] }
81
+
82
+ when (text = @ss.scan(/[\s]*\$=[\s]*/))
83
+ action { [:SUFFIXMATCH, text] }
84
+
85
+ when (text = @ss.scan(/[\s]*\*=[\s]*/))
86
+ action { [:SUBSTRINGMATCH, text] }
87
+
88
+ when (text = @ss.scan(/[\s]*!=[\s]*/))
89
+ action { [:NOT_EQUAL, text] }
90
+
91
+ when (text = @ss.scan(/[\s]*=[\s]*/))
92
+ action { [:EQUAL, text] }
93
+
94
+ when (text = @ss.scan(/[\s]*\)/))
95
+ action { [:RPAREN, text] }
96
+
97
+ when (text = @ss.scan(/[\s]*\[[\s]*/))
98
+ action { [:LSQUARE, text] }
99
+
100
+ when (text = @ss.scan(/[\s]*\]/))
101
+ action { [:RSQUARE, text] }
102
+
103
+ when (text = @ss.scan(/[\s]*\+[\s]*/))
104
+ action { [:PLUS, text] }
105
+
106
+ when (text = @ss.scan(/[\s]*>[\s]*/))
107
+ action { [:GREATER, text] }
108
+
109
+ when (text = @ss.scan(/[\s]*,[\s]*/))
110
+ action { [:COMMA, text] }
111
+
112
+ when (text = @ss.scan(/[\s]*~[\s]*/))
113
+ action { [:TILDE, text] }
114
+
115
+ when (text = @ss.scan(/\:not\([\s]*/))
116
+ action { [:NOT, text] }
117
+
118
+ when (text = @ss.scan(/-?([0-9]+|[0-9]*\.[0-9]+)/))
119
+ action { [:NUMBER, text] }
120
+
121
+ when (text = @ss.scan(/[\s]*\/\/[\s]*/))
122
+ action { [:DOUBLESLASH, text] }
123
+
124
+ when (text = @ss.scan(/[\s]*\/[\s]*/))
125
+ action { [:SLASH, text] }
126
+
127
+ when (text = @ss.scan(/U\+[0-9a-f?]{1,6}(-[0-9a-f]{1,6})?/))
128
+ action {[:UNICODE_RANGE, text] }
129
+
130
+ when (text = @ss.scan(/[\s]+/))
131
+ action { [:S, text] }
132
+
133
+ when (text = @ss.scan(/"([^\n\r\f"]|\n|\r\n|\r|\f|[^\0-\177]|\\[0-9A-Fa-f]{1,6}(\r\n|[\s])?|\\[^\n\r\f0-9A-Fa-f])*"|'([^\n\r\f']|\n|\r\n|\r|\f|[^\0-\177]|\\[0-9A-Fa-f]{1,6}(\r\n|[\s])?|\\[^\n\r\f0-9A-Fa-f])*'/))
134
+ action { [:STRING, text] }
135
+
136
+ when (text = @ss.scan(/./))
137
+ action { [text, text] }
138
+
139
+ else
140
+ text = @ss.string[@ss.pos .. -1]
141
+ raise ScanError, "can not match: '" + text + "'"
142
+ end # if
143
+
144
+ else
145
+ raise ScanError, "undefined state: '" + state.to_s + "'"
146
+ end # case state
147
+ token
148
+ end # def _next_token
149
+
150
+ end # class
151
+ end
7
152
  end
@@ -1,6 +1,6 @@
1
1
  module Nokogiri
2
2
  module CSS
3
- class GeneratedTokenizer < GeneratedParser
3
+ class Tokenizer
4
4
 
5
5
  macro
6
6
  nl \n|\r\n|\r|\f
@@ -11,7 +11,8 @@ module Nokogiri
11
11
  :parent, :pointer,
12
12
  :next, :pointer,
13
13
  :prev, :pointer,
14
- :doc, :pointer
14
+ :doc, :pointer,
15
+ :ns, :pointer
15
16
  )
16
17
 
17
18
  end
@@ -24,7 +24,7 @@ module Nokogiri
24
24
  end
25
25
 
26
26
  def nodeTab
27
- self[:nodeTab].read_array_of_pointer(self[:nodeNr])
27
+ self[:nodeTab].null? ? [] : self[:nodeTab].read_array_of_pointer(self[:nodeNr])
28
28
  end
29
29
 
30
30
  def nodeTab=(array)
@@ -5,27 +5,27 @@ else
5
5
  require 'weakling'
6
6
  Nokogiri::VERSION_INFO['refs'] = "weakling"
7
7
  end
8
- require 'singleton'
9
8
 
10
9
  module Nokogiri
11
10
  class WeakBucket
12
- include Singleton
13
-
14
11
  if Nokogiri::VERSION_INFO['refs'] == "weakling"
15
- attr_accessor :bucket
16
-
17
- def initialize
18
- @bucket = Weakling::IdHash.new
19
- end
12
+ @@bucket = Weakling::IdHash.new
13
+ @@semaphore = Mutex.new
20
14
 
21
15
  def WeakBucket.get_object(cstruct)
22
- instance.bucket[cstruct.ruby_node_pointer]
16
+ @@semaphore.synchronize do
17
+ @@bucket[cstruct.ruby_node_pointer]
18
+ end
23
19
  end
24
20
 
25
21
  def WeakBucket.set_object(cstruct, object)
26
- cstruct.ruby_node_pointer = instance.bucket.add(object)
22
+ @@semaphore.synchronize do
23
+ cstruct.ruby_node_pointer = @@bucket.add(object)
24
+ end
27
25
  end
26
+
28
27
  else
28
+
29
29
  def WeakBucket.get_object(cstruct)
30
30
  ptr = cstruct.ruby_node_pointer
31
31
  ptr != 0 ? ObjectSpace._id2ref(ptr) : nil
@@ -157,6 +157,14 @@ module Nokogiri
157
157
  LibXML.xmlFreeNsList(node.cstruct[:nsDef])
158
158
  node.cstruct[:nsDef] = nil
159
159
  end
160
+ unless node.cstruct[:properties].nil?
161
+ prop_ptr = node.cstruct[:properties]
162
+ while ! prop_ptr.null?
163
+ prop_cstruct = LibXML::XmlAttr.new(node.cstruct[:properties])
164
+ prop_cstruct[:ns] = nil unless prop_cstruct[:ns].nil?
165
+ prop_ptr = prop_cstruct[:next]
166
+ end
167
+ end
160
168
  end
161
169
  end
162
170
 
@@ -87,6 +87,7 @@ module Nokogiri
87
87
  end
88
88
 
89
89
  def unlink # :nodoc:
90
+ return if cstruct[:nodeNr] == 0
90
91
  nodetab = cstruct.nodeTab
91
92
  cstruct[:nodeNr].times do |j|
92
93
  node_cstruct = LibXML::XmlNode.new(nodetab[j])
@@ -48,7 +48,15 @@ module Nokogiri
48
48
  end
49
49
 
50
50
  def __internal__startElement(_, name, attributes)
51
- attrs = attributes.null? ? [] : attributes.get_array_of_string(0)
51
+ attrs = []
52
+ offset = 0
53
+ if ! attributes.null?
54
+ while ! attributes.get_pointer(LibXML.pointer_offset(offset)).null? do
55
+ cons = attributes.get_array_of_string(LibXML.pointer_offset(offset), 2)
56
+ attrs << cons
57
+ offset += 2
58
+ end
59
+ end
52
60
  @document.start_element name, attrs
53
61
  end
54
62
 
@@ -51,6 +51,10 @@ module Nokogiri
51
51
  end
52
52
 
53
53
  def transform(document, params=[]) # :nodoc:
54
+ unless document.kind_of? Nokogiri::XML::Document
55
+ raise ArgumentError, "argument must be a Nokogiri::XML::Document"
56
+ end
57
+
54
58
  params = params.to_a.flatten if params.is_a?(Hash)
55
59
  raise(TypeError) unless params.is_a?(Array)
56
60
 
@@ -3,25 +3,44 @@ module Nokogiri
3
3
  class Document < Nokogiri::XML::Document
4
4
  ###
5
5
  # Get the meta tag encoding for this document. If there is no meta tag,
6
- # then nil is returned
6
+ # then nil is returned.
7
7
  def meta_encoding
8
- return nil unless meta = css('meta').find { |node|
9
- node['http-equiv'] =~ /Content-Type/i
10
- }
11
-
12
- /charset\s*=\s*([\w-]+)/i.match(meta['content'])[1]
8
+ meta = meta_content_type and
9
+ /charset\s*=\s*([\w-]+)/i.match(meta['content'])[1]
13
10
  end
14
11
 
15
12
  ###
16
13
  # Set the meta tag encoding for this document. If there is no meta
17
- # content tag, nil is returned and the encoding is not set.
14
+ # content tag, the encoding is not set.
18
15
  def meta_encoding= encoding
19
- return nil unless meta = css('meta').find { |node|
20
- node['http-equiv'] =~ /Content-Type/i
16
+ meta = meta_content_type and
17
+ meta['content'] = "text/html; charset=%s" % encoding
18
+ end
19
+
20
+ def meta_content_type
21
+ css('meta[@http-equiv]').find { |node|
22
+ node['http-equiv'] =~ /\AContent-Type\z/i
21
23
  }
24
+ end
25
+ private :meta_content_type
22
26
 
23
- meta['content'] = "text/html; charset=%s" % encoding
24
- encoding
27
+ ###
28
+ # Get the title string of this document. Return nil if there is
29
+ # no title tag.
30
+ def title
31
+ title = at('title') and title.inner_text
32
+ end
33
+
34
+ ###
35
+ # Set the title string of this document. If there is no head
36
+ # element, the title is not set.
37
+ def title=(text)
38
+ unless title = at('title')
39
+ head = at('head') or return nil
40
+ title = Nokogiri::XML::Node.new('title', self)
41
+ head << title
42
+ end
43
+ title.children = XML::Text.new(text, self)
25
44
  end
26
45
 
27
46
  ####
@@ -39,10 +58,7 @@ module Nokogiri
39
58
  # end
40
59
  #
41
60
  def serialize options = {}
42
- options[:save_with] ||= XML::Node::SaveOptions::FORMAT |
43
- XML::Node::SaveOptions::AS_HTML |
44
- XML::Node::SaveOptions::NO_DECLARATION |
45
- XML::Node::SaveOptions::NO_EMPTY_TAGS
61
+ options[:save_with] ||= XML::Node::SaveOptions::DEFAULT_HTML
46
62
  super
47
63
  end
48
64
 
@@ -75,16 +91,119 @@ module Nokogiri
75
91
 
76
92
  if string_or_io.respond_to?(:read)
77
93
  url ||= string_or_io.respond_to?(:path) ? string_or_io.path : nil
94
+ if !encoding
95
+ # Perform further encoding detection that libxml2 does
96
+ # not do.
97
+ string_or_io = EncodingReader.new(string_or_io)
98
+ begin
99
+ return read_io(string_or_io, url, encoding, options.to_i)
100
+ rescue EncodingFoundException => e
101
+ # A retry is required because libxml2 has a problem in
102
+ # that it cannot switch encoding well in the middle of
103
+ # parsing, especially if it has already seen a
104
+ # non-ASCII character when it finds an encoding hint.
105
+ encoding = e.encoding
106
+ end
107
+ end
78
108
  return read_io(string_or_io, url, encoding, options.to_i)
79
109
  end
80
110
 
81
111
  # read_memory pukes on empty docs
82
112
  return new if string_or_io.nil? or string_or_io.empty?
83
113
 
114
+ if !encoding
115
+ encoding = EncodingReader.detect_encoding(string_or_io)
116
+ end
117
+
84
118
  read_memory(string_or_io, url, encoding, options.to_i)
85
119
  end
86
120
  end
87
121
 
122
+ class EncodingFoundException < Exception # :nodoc:
123
+ attr_reader :encoding
124
+
125
+ def initialize(encoding)
126
+ @encoding = encoding
127
+ super("encoding found: %s" % encoding)
128
+ end
129
+ end
130
+
131
+ class EncodingReader # :nodoc:
132
+ class SAXHandler < Nokogiri::XML::SAX::Document # :nodoc:
133
+ attr_reader :encoding
134
+
135
+ def found(encoding)
136
+ @encoding = encoding
137
+ throw :found
138
+ end
139
+
140
+ def not_found(encoding)
141
+ found nil
142
+ end
143
+
144
+ def start_element(name, attrs = [])
145
+ case name
146
+ when /\A(?:div|h1|img|p|br)\z/
147
+ not_found
148
+ when 'meta'
149
+ attr = Hash[attrs]
150
+ http_equiv = attr['http-equiv'] and
151
+ http_equiv.match(/\AContent-Type\z/i) and
152
+ content = attr['content'] and
153
+ m = content.match(/;\s*charset\s*=\s*([\w-]+)/) and
154
+ found m[1]
155
+ end
156
+ end
157
+ end
158
+
159
+ def self.detect_encoding(chunk)
160
+ m = chunk.match(/\A(<\?xml[ \t\r\n]+[^>]*>)/) and
161
+ return Nokogiri.XML(m[1]).encoding
162
+
163
+ handler = SAXHandler.new
164
+ parser = Nokogiri::HTML::SAX::Parser.new(handler)
165
+ catch(:found) {
166
+ parser.parse(chunk)
167
+ }
168
+ handler.encoding
169
+ rescue => e
170
+ nil
171
+ end
172
+
173
+ def initialize(io)
174
+ @io = io
175
+ @firstchunk = nil
176
+ end
177
+
178
+ def read(len)
179
+ # no support for a call without len
180
+
181
+ if !@firstchunk
182
+ @firstchunk = @io.read(len) or return nil
183
+
184
+ # This implementation expects and assumes that the first
185
+ # call from htmlReadIO() is made with a length long enough
186
+ # (~1KB) to achieve further encoding detection that
187
+ # libxml2 does not do.
188
+ if encoding = EncodingReader.detect_encoding(@firstchunk)
189
+ raise EncodingFoundException, encoding
190
+ end
191
+
192
+ # This chunk is stored for the next read in retry.
193
+ return @firstchunk
194
+ end
195
+
196
+ ret = @firstchunk.slice!(0, len)
197
+ if (len -= ret.length) > 0
198
+ rest = @io.read(len) and ret << rest
199
+ end
200
+ if ret.empty?
201
+ nil
202
+ else
203
+ ret
204
+ end
205
+ end
206
+ end
88
207
  end
89
208
  end
90
209
  end