nokogiri 1.4.7 → 1.5.0.beta.1

Sign up to get free protection for your applications and to get access to all the features.

Potentially problematic release.


This version of nokogiri might be problematic. Click here for more details.

Files changed (164) hide show
  1. data/CHANGELOG.ja.rdoc +8 -83
  2. data/CHANGELOG.rdoc +6 -80
  3. data/Manifest.txt +4 -74
  4. data/README.ja.rdoc +5 -1
  5. data/README.rdoc +8 -22
  6. data/Rakefile +79 -60
  7. data/bin/nokogiri +1 -6
  8. data/deps.rip +5 -0
  9. data/ext/nokogiri/extconf.rb +32 -53
  10. data/ext/nokogiri/nokogiri.c +0 -2
  11. data/ext/nokogiri/nokogiri.h +0 -9
  12. data/ext/nokogiri/xml_document.c +0 -14
  13. data/ext/nokogiri/xml_dtd.c +2 -2
  14. data/ext/nokogiri/xml_io.c +7 -32
  15. data/ext/nokogiri/xml_node.c +31 -103
  16. data/ext/nokogiri/xml_node_set.c +8 -8
  17. data/ext/nokogiri/xml_reader.c +1 -20
  18. data/ext/nokogiri/xml_sax_parser.c +3 -5
  19. data/ext/nokogiri/xml_sax_parser_context.c +0 -40
  20. data/ext/nokogiri/xml_xpath_context.c +2 -35
  21. data/ext/nokogiri/xslt_stylesheet.c +6 -124
  22. data/lib/nokogiri.rb +7 -3
  23. data/lib/nokogiri/css.rb +3 -6
  24. data/lib/nokogiri/css/generated_parser.rb +669 -0
  25. data/lib/nokogiri/css/generated_tokenizer.rb +145 -0
  26. data/lib/nokogiri/css/parser.rb +70 -665
  27. data/lib/nokogiri/css/parser.y +1 -6
  28. data/lib/nokogiri/css/tokenizer.rb +3 -148
  29. data/lib/nokogiri/css/tokenizer.rex +1 -1
  30. data/lib/nokogiri/css/xpath_visitor.rb +14 -16
  31. data/lib/nokogiri/decorators/slop.rb +3 -5
  32. data/lib/nokogiri/html.rb +3 -2
  33. data/lib/nokogiri/html/document.rb +18 -134
  34. data/lib/nokogiri/html/document_fragment.rb +21 -26
  35. data/lib/nokogiri/html/element_description_defaults.rb +671 -0
  36. data/lib/nokogiri/html/sax/parser.rb +2 -6
  37. data/lib/nokogiri/version.rb +4 -9
  38. data/lib/nokogiri/xml/attribute_decl.rb +1 -1
  39. data/lib/nokogiri/xml/builder.rb +1 -1
  40. data/lib/nokogiri/xml/document.rb +3 -27
  41. data/lib/nokogiri/xml/document_fragment.rb +2 -9
  42. data/lib/nokogiri/xml/dtd.rb +1 -12
  43. data/lib/nokogiri/xml/element_decl.rb +1 -1
  44. data/lib/nokogiri/xml/entity_decl.rb +1 -1
  45. data/lib/nokogiri/xml/node.rb +75 -172
  46. data/lib/nokogiri/xml/node/save_options.rb +0 -10
  47. data/lib/nokogiri/xml/node_set.rb +3 -28
  48. data/lib/nokogiri/xml/parse_options.rb +0 -8
  49. data/lib/nokogiri/xml/reader.rb +6 -44
  50. data/lib/nokogiri/xml/sax/document.rb +5 -9
  51. data/lib/nokogiri/xml/schema.rb +1 -7
  52. data/lib/nokogiri/xslt.rb +5 -9
  53. data/tasks/cross_compile.rb +12 -27
  54. data/tasks/test.rb +0 -0
  55. data/test/css/test_parser.rb +19 -40
  56. data/test/css/test_tokenizer.rb +0 -8
  57. data/test/helper.rb +1 -4
  58. data/test/html/sax/test_parser.rb +21 -47
  59. data/test/html/sax/test_parser_context.rb +2 -2
  60. data/test/html/test_document.rb +3 -58
  61. data/test/html/test_document_encoding.rb +0 -53
  62. data/test/html/test_document_fragment.rb +13 -82
  63. data/test/html/test_element_description.rb +4 -2
  64. data/test/html/test_node.rb +0 -9
  65. data/test/test_memory_leak.rb +2 -57
  66. data/test/test_nokogiri.rb +14 -20
  67. data/test/test_reader.rb +7 -47
  68. data/test/test_xslt_transforms.rb +5 -8
  69. data/test/xml/sax/test_parser.rb +17 -34
  70. data/test/xml/sax/test_parser_context.rb +0 -50
  71. data/test/xml/sax/test_push_parser.rb +1 -18
  72. data/test/xml/test_attr.rb +4 -31
  73. data/test/xml/test_attribute_decl.rb +7 -3
  74. data/test/xml/test_builder.rb +5 -5
  75. data/test/xml/test_cdata.rb +3 -3
  76. data/test/xml/test_document.rb +18 -15
  77. data/test/xml/test_document_fragment.rb +20 -19
  78. data/test/xml/test_dtd.rb +13 -18
  79. data/test/xml/test_element_content.rb +1 -1
  80. data/test/xml/test_element_decl.rb +1 -1
  81. data/test/xml/test_entity_decl.rb +12 -10
  82. data/test/xml/test_namespace.rb +7 -5
  83. data/test/xml/test_node.rb +15 -54
  84. data/test/xml/test_node_reparenting.rb +42 -85
  85. data/test/xml/test_node_set.rb +2 -61
  86. data/test/xml/test_schema.rb +0 -5
  87. data/test/xml/test_text.rb +2 -11
  88. data/test/xml/test_unparented_node.rb +1 -1
  89. data/test/xml/test_xpath.rb +7 -43
  90. metadata +131 -155
  91. data/.gemtest +0 -0
  92. data/ext/nokogiri/depend +0 -358
  93. data/ext/nokogiri/xml_libxml2_hacks.c +0 -112
  94. data/ext/nokogiri/xml_libxml2_hacks.h +0 -12
  95. data/lib/nokogiri/css/parser_extras.rb +0 -91
  96. data/lib/nokogiri/ffi/encoding_handler.rb +0 -42
  97. data/lib/nokogiri/ffi/html/document.rb +0 -28
  98. data/lib/nokogiri/ffi/html/element_description.rb +0 -81
  99. data/lib/nokogiri/ffi/html/entity_lookup.rb +0 -16
  100. data/lib/nokogiri/ffi/html/sax/parser_context.rb +0 -38
  101. data/lib/nokogiri/ffi/io_callbacks.rb +0 -42
  102. data/lib/nokogiri/ffi/libxml.rb +0 -420
  103. data/lib/nokogiri/ffi/structs/common_node.rb +0 -38
  104. data/lib/nokogiri/ffi/structs/html_elem_desc.rb +0 -24
  105. data/lib/nokogiri/ffi/structs/html_entity_desc.rb +0 -13
  106. data/lib/nokogiri/ffi/structs/xml_alloc.rb +0 -16
  107. data/lib/nokogiri/ffi/structs/xml_attr.rb +0 -20
  108. data/lib/nokogiri/ffi/structs/xml_attribute.rb +0 -27
  109. data/lib/nokogiri/ffi/structs/xml_buffer.rb +0 -16
  110. data/lib/nokogiri/ffi/structs/xml_char_encoding_handler.rb +0 -11
  111. data/lib/nokogiri/ffi/structs/xml_document.rb +0 -117
  112. data/lib/nokogiri/ffi/structs/xml_dtd.rb +0 -28
  113. data/lib/nokogiri/ffi/structs/xml_element.rb +0 -26
  114. data/lib/nokogiri/ffi/structs/xml_element_content.rb +0 -17
  115. data/lib/nokogiri/ffi/structs/xml_entity.rb +0 -32
  116. data/lib/nokogiri/ffi/structs/xml_enumeration.rb +0 -12
  117. data/lib/nokogiri/ffi/structs/xml_node.rb +0 -28
  118. data/lib/nokogiri/ffi/structs/xml_node_set.rb +0 -53
  119. data/lib/nokogiri/ffi/structs/xml_notation.rb +0 -11
  120. data/lib/nokogiri/ffi/structs/xml_ns.rb +0 -15
  121. data/lib/nokogiri/ffi/structs/xml_parser_context.rb +0 -20
  122. data/lib/nokogiri/ffi/structs/xml_parser_input.rb +0 -19
  123. data/lib/nokogiri/ffi/structs/xml_relax_ng.rb +0 -14
  124. data/lib/nokogiri/ffi/structs/xml_sax_handler.rb +0 -51
  125. data/lib/nokogiri/ffi/structs/xml_sax_push_parser_context.rb +0 -124
  126. data/lib/nokogiri/ffi/structs/xml_schema.rb +0 -13
  127. data/lib/nokogiri/ffi/structs/xml_syntax_error.rb +0 -31
  128. data/lib/nokogiri/ffi/structs/xml_text_reader.rb +0 -12
  129. data/lib/nokogiri/ffi/structs/xml_xpath_context.rb +0 -38
  130. data/lib/nokogiri/ffi/structs/xml_xpath_object.rb +0 -35
  131. data/lib/nokogiri/ffi/structs/xml_xpath_parser_context.rb +0 -20
  132. data/lib/nokogiri/ffi/structs/xslt_stylesheet.rb +0 -13
  133. data/lib/nokogiri/ffi/weak_bucket.rb +0 -40
  134. data/lib/nokogiri/ffi/xml/attr.rb +0 -41
  135. data/lib/nokogiri/ffi/xml/attribute_decl.rb +0 -27
  136. data/lib/nokogiri/ffi/xml/cdata.rb +0 -19
  137. data/lib/nokogiri/ffi/xml/comment.rb +0 -18
  138. data/lib/nokogiri/ffi/xml/document.rb +0 -174
  139. data/lib/nokogiri/ffi/xml/document_fragment.rb +0 -21
  140. data/lib/nokogiri/ffi/xml/dtd.rb +0 -67
  141. data/lib/nokogiri/ffi/xml/element_content.rb +0 -43
  142. data/lib/nokogiri/ffi/xml/element_decl.rb +0 -19
  143. data/lib/nokogiri/ffi/xml/entity_decl.rb +0 -36
  144. data/lib/nokogiri/ffi/xml/entity_reference.rb +0 -19
  145. data/lib/nokogiri/ffi/xml/namespace.rb +0 -44
  146. data/lib/nokogiri/ffi/xml/node.rb +0 -559
  147. data/lib/nokogiri/ffi/xml/node_set.rb +0 -150
  148. data/lib/nokogiri/ffi/xml/processing_instruction.rb +0 -20
  149. data/lib/nokogiri/ffi/xml/reader.rb +0 -236
  150. data/lib/nokogiri/ffi/xml/relax_ng.rb +0 -85
  151. data/lib/nokogiri/ffi/xml/sax/parser.rb +0 -143
  152. data/lib/nokogiri/ffi/xml/sax/parser_context.rb +0 -79
  153. data/lib/nokogiri/ffi/xml/sax/push_parser.rb +0 -51
  154. data/lib/nokogiri/ffi/xml/schema.rb +0 -109
  155. data/lib/nokogiri/ffi/xml/syntax_error.rb +0 -98
  156. data/lib/nokogiri/ffi/xml/text.rb +0 -18
  157. data/lib/nokogiri/ffi/xml/xpath.rb +0 -9
  158. data/lib/nokogiri/ffi/xml/xpath_context.rb +0 -153
  159. data/lib/nokogiri/ffi/xslt/stylesheet.rb +0 -77
  160. data/test/decorators/test_slop.rb +0 -16
  161. data/test/ffi/test_document.rb +0 -35
  162. data/test/files/encoding.html +0 -82
  163. data/test/files/encoding.xhtml +0 -84
  164. data/test/xslt/test_custom_functions.rb +0 -94
@@ -1,4 +1,4 @@
1
- class Nokogiri::CSS::Parser
1
+ class Nokogiri::CSS::GeneratedParser
2
2
 
3
3
  token FUNCTION INCLUDES DASHMATCH LBRACE HASH PLUS GREATER S STRING IDENT
4
4
  token COMMA NUMBER PREFIXMATCH SUFFIXMATCH SUBSTRINGMATCH TILDE NOT_EQUAL
@@ -39,9 +39,6 @@ rule
39
39
  result = Node.new(:CONDITIONAL_SELECTOR, val)
40
40
  }
41
41
  | function
42
- | function pseudo {
43
- result = Node.new(:CONDITIONAL_SELECTOR, val)
44
- }
45
42
  | function attrib {
46
43
  result = Node.new(:CONDITIONAL_SELECTOR, val)
47
44
  }
@@ -233,5 +230,3 @@ end
233
230
 
234
231
  ---- header
235
232
 
236
- require 'nokogiri/css/parser_extras'
237
-
@@ -1,152 +1,7 @@
1
- #--
2
- # DO NOT MODIFY!!!!
3
- # This file is automatically generated by rex 1.0.5
4
- # from lexical definition file "lib/nokogiri/css/tokenizer.rex".
5
- #++
6
-
7
1
  module Nokogiri
8
- module CSS
9
- class Tokenizer
10
- require 'strscan'
11
-
12
- class ScanError < StandardError ; end
13
-
14
- attr_reader :lineno
15
- attr_reader :filename
16
- attr_accessor :state
17
-
18
- def scan_setup(str)
19
- @ss = StringScanner.new(str)
20
- @lineno = 1
21
- @state = nil
22
- end
23
-
24
- def action
25
- yield
26
- end
27
-
28
- def scan_str(str)
29
- scan_setup(str)
30
- do_parse
31
- end
32
- alias :scan :scan_str
33
-
34
- def load_file( filename )
35
- @filename = filename
36
- open(filename, "r") do |f|
37
- scan_setup(f.read)
2
+ module CSS
3
+ class Tokenizer < GeneratedTokenizer
4
+ alias :scan :scan_setup
38
5
  end
39
6
  end
40
-
41
- def scan_file( filename )
42
- load_file(filename)
43
- do_parse
44
- end
45
-
46
-
47
- def next_token
48
- return if @ss.eos?
49
-
50
- # skips empty actions
51
- until token = _next_token or @ss.eos?; end
52
- token
53
- end
54
-
55
- def _next_token
56
- text = @ss.peek(1)
57
- @lineno += 1 if text == "\n"
58
- token = case @state
59
- when nil
60
- case
61
- when (text = @ss.scan(/has\([\s]*/))
62
- action { [:HAS, text] }
63
-
64
- when (text = @ss.scan(/[-@]?([_A-Za-z]|[^\0-\177]|\\[0-9A-Fa-f]{1,6}(\r\n|[\s])?|\\[^\n\r\f0-9A-Fa-f])([_A-Za-z0-9-]|[^\0-\177]|\\[0-9A-Fa-f]{1,6}(\r\n|[\s])?|\\[^\n\r\f0-9A-Fa-f])*\([\s]*/))
65
- action { [:FUNCTION, text] }
66
-
67
- when (text = @ss.scan(/[-@]?([_A-Za-z]|[^\0-\177]|\\[0-9A-Fa-f]{1,6}(\r\n|[\s])?|\\[^\n\r\f0-9A-Fa-f])([_A-Za-z0-9-]|[^\0-\177]|\\[0-9A-Fa-f]{1,6}(\r\n|[\s])?|\\[^\n\r\f0-9A-Fa-f])*/))
68
- action { [:IDENT, text] }
69
-
70
- when (text = @ss.scan(/\#([_A-Za-z0-9-]|[^\0-\177]|\\[0-9A-Fa-f]{1,6}(\r\n|[\s])?|\\[^\n\r\f0-9A-Fa-f])+/))
71
- action { [:HASH, text] }
72
-
73
- when (text = @ss.scan(/[\s]*~=[\s]*/))
74
- action { [:INCLUDES, text] }
75
-
76
- when (text = @ss.scan(/[\s]*\|=[\s]*/))
77
- action { [:DASHMATCH, text] }
78
-
79
- when (text = @ss.scan(/[\s]*\^=[\s]*/))
80
- action { [:PREFIXMATCH, text] }
81
-
82
- when (text = @ss.scan(/[\s]*\$=[\s]*/))
83
- action { [:SUFFIXMATCH, text] }
84
-
85
- when (text = @ss.scan(/[\s]*\*=[\s]*/))
86
- action { [:SUBSTRINGMATCH, text] }
87
-
88
- when (text = @ss.scan(/[\s]*!=[\s]*/))
89
- action { [:NOT_EQUAL, text] }
90
-
91
- when (text = @ss.scan(/[\s]*=[\s]*/))
92
- action { [:EQUAL, text] }
93
-
94
- when (text = @ss.scan(/[\s]*\)/))
95
- action { [:RPAREN, text] }
96
-
97
- when (text = @ss.scan(/[\s]*\[[\s]*/))
98
- action { [:LSQUARE, text] }
99
-
100
- when (text = @ss.scan(/[\s]*\]/))
101
- action { [:RSQUARE, text] }
102
-
103
- when (text = @ss.scan(/[\s]*\+[\s]*/))
104
- action { [:PLUS, text] }
105
-
106
- when (text = @ss.scan(/[\s]*>[\s]*/))
107
- action { [:GREATER, text] }
108
-
109
- when (text = @ss.scan(/[\s]*,[\s]*/))
110
- action { [:COMMA, text] }
111
-
112
- when (text = @ss.scan(/[\s]*~[\s]*/))
113
- action { [:TILDE, text] }
114
-
115
- when (text = @ss.scan(/\:not\([\s]*/))
116
- action { [:NOT, text] }
117
-
118
- when (text = @ss.scan(/-?([0-9]+|[0-9]*\.[0-9]+)/))
119
- action { [:NUMBER, text] }
120
-
121
- when (text = @ss.scan(/[\s]*\/\/[\s]*/))
122
- action { [:DOUBLESLASH, text] }
123
-
124
- when (text = @ss.scan(/[\s]*\/[\s]*/))
125
- action { [:SLASH, text] }
126
-
127
- when (text = @ss.scan(/U\+[0-9a-f?]{1,6}(-[0-9a-f]{1,6})?/))
128
- action {[:UNICODE_RANGE, text] }
129
-
130
- when (text = @ss.scan(/[\s]+/))
131
- action { [:S, text] }
132
-
133
- when (text = @ss.scan(/"([^\n\r\f"]|\n|\r\n|\r|\f|[^\0-\177]|\\[0-9A-Fa-f]{1,6}(\r\n|[\s])?|\\[^\n\r\f0-9A-Fa-f])*"|'([^\n\r\f']|\n|\r\n|\r|\f|[^\0-\177]|\\[0-9A-Fa-f]{1,6}(\r\n|[\s])?|\\[^\n\r\f0-9A-Fa-f])*'/))
134
- action { [:STRING, text] }
135
-
136
- when (text = @ss.scan(/./))
137
- action { [text, text] }
138
-
139
- else
140
- text = @ss.string[@ss.pos .. -1]
141
- raise ScanError, "can not match: '" + text + "'"
142
- end # if
143
-
144
- else
145
- raise ScanError, "undefined state: '" + state.to_s + "'"
146
- end # case state
147
- token
148
- end # def _next_token
149
-
150
- end # class
151
- end
152
7
  end
@@ -1,6 +1,6 @@
1
1
  module Nokogiri
2
2
  module CSS
3
- class Tokenizer
3
+ class GeneratedTokenizer < GeneratedParser
4
4
 
5
5
  macro
6
6
  nl \n|\r\n|\r|\f
@@ -11,25 +11,18 @@ module Nokogiri
11
11
  'child::text()'
12
12
  when /^self\(/
13
13
  "self::#{node.value[1]}"
14
- when /^eq\(/
15
- "position() = #{node.value[1]}"
16
- when /^(nth|nth-of-type|nth-child)\(/
14
+ when /^(eq|nth|nth-of-type|nth-child)\(/
17
15
  if node.value[1].is_a?(Nokogiri::CSS::Node) and node.value[1].type == :AN_PLUS_B
18
16
  an_plus_b(node.value[1])
19
17
  else
20
- "position() = #{node.value[1]}"
21
- end
22
- when /^(nth-last-child|nth-last-of-type)\(/
23
- if node.value[1].is_a?(Nokogiri::CSS::Node) and node.value[1].type == :AN_PLUS_B
24
- an_plus_b(node.value[1], :last => true)
25
- else
26
- index = node.value[1].to_i - 1
27
- index == 0 ? "position() = last()" : "position() = last() - #{index}"
18
+ "position() = " + node.value[1]
28
19
  end
29
20
  when /^(first|first-of-type)\(/
30
21
  "position() = 1"
31
22
  when /^(last|last-of-type)\(/
32
23
  "position() = last()"
24
+ when /^(nth-last-child|nth-last-of-type)\(/
25
+ "position() = last() - #{node.value[1]}"
33
26
  when /^contains\(/
34
27
  "contains(., #{node.value[1]})"
35
28
  when /^gt\(/
@@ -55,6 +48,13 @@ module Nokogiri
55
48
  end
56
49
  end
57
50
 
51
+ def visit_preceding_selector node
52
+ node.value.last.accept(self) +
53
+ '[preceding-sibling::' +
54
+ node.value.first.accept(self) +
55
+ ']'
56
+ end
57
+
58
58
  def visit_id node
59
59
  node.value.first =~ /^#(.*)$/
60
60
  "@id = '#{$1}'"
@@ -126,7 +126,6 @@ module Nokogiri
126
126
  {
127
127
  'combinator' => ' and ',
128
128
  'direct_adjacent_selector' => "/following-sibling::*[1]/self::",
129
- 'preceding_selector' => "/following-sibling::",
130
129
  'descendant_selector' => '//',
131
130
  'child_selector' => '/',
132
131
  }.each do |k,v|
@@ -151,18 +150,17 @@ module Nokogiri
151
150
  end
152
151
 
153
152
  private
154
- def an_plus_b node, options={}
153
+ def an_plus_b node
155
154
  raise ArgumentError, "expected an+b node to contain 4 tokens, but is #{node.value.inspect}" unless node.value.size == 4
156
155
 
157
156
  a = node.value[0].to_i
158
157
  b = node.value[3].to_i
159
- position = options[:last] ? "(last()-position()+1)" : "position()"
160
158
 
161
159
  if (b == 0)
162
- return "(#{position} mod #{a}) = 0"
160
+ return "(position() mod #{a}) = 0"
163
161
  else
164
162
  compare = (a < 0) ? "<=" : ">="
165
- return "(#{position} #{compare} #{b}) and (((#{position}-#{b}) mod #{a.abs}) = 0)"
163
+ return "(position() #{compare} #{b}) and (((position()-#{b}) mod #{a.abs}) = 0)"
166
164
  end
167
165
  end
168
166
 
@@ -7,22 +7,20 @@ module Nokogiri
7
7
  ###
8
8
  # look for node with +name+. See Nokogiri.Slop
9
9
  def method_missing name, *args, &block
10
- prefix = implied_xpath_context
11
-
12
10
  if args.empty?
13
- list = xpath("#{prefix}#{name.to_s.sub(/^_/, '')}")
11
+ list = xpath("./#{name}")
14
12
  elsif args.first.is_a? Hash
15
13
  hash = args.first
16
14
  if hash[:css]
17
15
  list = css("#{name}#{hash[:css]}")
18
16
  elsif hash[:xpath]
19
17
  conds = Array(hash[:xpath]).join(' and ')
20
- list = xpath("#{prefix}#{name}[#{conds}]")
18
+ list = xpath("./#{name}[#{conds}]")
21
19
  end
22
20
  else
23
21
  CSS::Parser.without_cache do
24
22
  list = xpath(
25
- *CSS.xpath_for("#{name}#{args.first}", :prefix => prefix)
23
+ *CSS.xpath_for("#{name}#{args.first}", :prefix => "./")
26
24
  )
27
25
  end
28
26
  end
data/lib/nokogiri/html.rb CHANGED
@@ -4,6 +4,7 @@ require 'nokogiri/html/document_fragment'
4
4
  require 'nokogiri/html/sax/parser_context'
5
5
  require 'nokogiri/html/sax/parser'
6
6
  require 'nokogiri/html/element_description'
7
+ require 'nokogiri/html/element_description_defaults'
7
8
 
8
9
  module Nokogiri
9
10
  class << self
@@ -24,8 +25,8 @@ module Nokogiri
24
25
 
25
26
  ####
26
27
  # Parse a fragment from +string+ in to a NodeSet.
27
- def fragment string, encoding = nil
28
- HTML::DocumentFragment.parse string, encoding
28
+ def fragment string
29
+ HTML::DocumentFragment.parse(string)
29
30
  end
30
31
  end
31
32
 
@@ -3,44 +3,25 @@ module Nokogiri
3
3
  class Document < Nokogiri::XML::Document
4
4
  ###
5
5
  # Get the meta tag encoding for this document. If there is no meta tag,
6
- # then nil is returned.
6
+ # then nil is returned
7
7
  def meta_encoding
8
- meta = meta_content_type and
9
- /charset\s*=\s*([\w-]+)/i.match(meta['content'])[1]
8
+ return nil unless meta = css('meta').find { |node|
9
+ node['http-equiv'] =~ /Content-Type/i
10
+ }
11
+
12
+ /charset\s*=\s*([\w-]+)/i.match(meta['content'])[1]
10
13
  end
11
14
 
12
15
  ###
13
- # Set the meta tag encoding for this document. If there is no meta
14
- # content tag, the encoding is not set.
16
+ # Set the meta tag encoding for this document. If there is no meta
17
+ # content tag, nil is returned and the encoding is not set.
15
18
  def meta_encoding= encoding
16
- meta = meta_content_type and
17
- meta['content'] = "text/html; charset=%s" % encoding
18
- end
19
-
20
- def meta_content_type
21
- css('meta[@http-equiv]').find { |node|
22
- node['http-equiv'] =~ /\AContent-Type\z/i
19
+ return nil unless meta = css('meta').find { |node|
20
+ node['http-equiv'] =~ /Content-Type/i
23
21
  }
24
- end
25
- private :meta_content_type
26
22
 
27
- ###
28
- # Get the title string of this document. Return nil if there is
29
- # no title tag.
30
- def title
31
- title = at('title') and title.inner_text
32
- end
33
-
34
- ###
35
- # Set the title string of this document. If there is no head
36
- # element, the title is not set.
37
- def title=(text)
38
- unless title = at('title')
39
- head = at('head') or return nil
40
- title = Nokogiri::XML::Node.new('title', self)
41
- head << title
42
- end
43
- title.children = XML::Text.new(text, self)
23
+ meta['content'] = "text/html; charset=%s" % encoding
24
+ encoding
44
25
  end
45
26
 
46
27
  ####
@@ -57,8 +38,11 @@ module Nokogiri
57
38
  # config.format.as_xml
58
39
  # end
59
40
  #
60
- def serialize options = {}
61
- options[:save_with] ||= XML::Node::SaveOptions::DEFAULT_HTML
41
+ def serialize options = {}, &block
42
+ options[:save_with] ||= XML::Node::SaveOptions::FORMAT |
43
+ XML::Node::SaveOptions::AS_HTML |
44
+ XML::Node::SaveOptions::NO_DECLARATION |
45
+ XML::Node::SaveOptions::NO_EMPTY_TAGS
62
46
  super
63
47
  end
64
48
 
@@ -77,7 +61,7 @@ module Nokogiri
77
61
  # is a number that sets options in the parser, such as
78
62
  # Nokogiri::XML::ParseOptions::RECOVER. See the constants in
79
63
  # Nokogiri::XML::ParseOptions.
80
- def parse string_or_io, url = nil, encoding = nil, options = XML::ParseOptions::DEFAULT_HTML
64
+ def parse string_or_io, url = nil, encoding = nil, options = XML::ParseOptions::DEFAULT_HTML, &block
81
65
 
82
66
  options = Nokogiri::XML::ParseOptions.new(options) if Fixnum === options
83
67
  # Give the options to the user
@@ -91,116 +75,16 @@ module Nokogiri
91
75
 
92
76
  if string_or_io.respond_to?(:read)
93
77
  url ||= string_or_io.respond_to?(:path) ? string_or_io.path : nil
94
- if !encoding
95
- # Perform advanced encoding detection that libxml2 does
96
- # not do.
97
- string_or_io = EncodingReader.new(string_or_io)
98
- begin
99
- return read_io(string_or_io, url, encoding, options.to_i)
100
- rescue EncodingFoundException => e
101
- # A retry is required because libxml2 has a problem in
102
- # that it cannot switch encoding well in the middle of
103
- # parsing, especially if it has already seen a
104
- # non-ASCII character when it finds an encoding hint.
105
- encoding = e.encoding
106
- end
107
- end
108
78
  return read_io(string_or_io, url, encoding, options.to_i)
109
79
  end
110
80
 
111
81
  # read_memory pukes on empty docs
112
82
  return new if string_or_io.nil? or string_or_io.empty?
113
83
 
114
- if !encoding
115
- encoding = EncodingReader.detect_encoding(string_or_io)
116
- end
117
-
118
84
  read_memory(string_or_io, url, encoding, options.to_i)
119
85
  end
120
86
  end
121
87
 
122
- class EncodingFoundException < Exception # :nodoc:
123
- attr_reader :encoding
124
-
125
- def initialize(encoding)
126
- @encoding = encoding
127
- super("encoding found: %s" % encoding)
128
- end
129
- end
130
-
131
- class EncodingReader # :nodoc:
132
- class SAXHandler < Nokogiri::XML::SAX::Document # :nodoc:
133
- attr_reader :encoding
134
-
135
- def found(encoding)
136
- @encoding = encoding
137
- throw :found
138
- end
139
-
140
- def not_found(encoding)
141
- found nil
142
- end
143
-
144
- def start_element(name, attrs = [])
145
- case name
146
- when /\A(?:div|h1|img|p|br)\z/
147
- not_found
148
- when 'meta'
149
- attr = Hash[attrs]
150
- http_equiv = attr['http-equiv'] and
151
- http_equiv.match(/\AContent-Type\z/i) and
152
- content = attr['content'] and
153
- m = content.match(/;\s*charset\s*=\s*([\w-]+)/) and
154
- found m[1]
155
- end
156
- end
157
- end
158
-
159
- def self.detect_encoding(chunk)
160
- m = chunk.match(/\A(<\?xml[ \t\r\n]+[^>]*>)/) and
161
- return Nokogiri.XML(m[1]).encoding
162
-
163
- handler = SAXHandler.new
164
- parser = Nokogiri::HTML::SAX::Parser.new(handler)
165
- catch(:found) {
166
- parser.parse(chunk)
167
- }
168
- handler.encoding
169
- rescue => e
170
- nil
171
- end
172
-
173
- def initialize(io)
174
- @io = io
175
- @firstchunk = nil
176
- end
177
-
178
- def read(len)
179
- # no support for a call without len
180
-
181
- if !@firstchunk
182
- @firstchunk = @io.read(len) or return nil
183
-
184
- # This implementation expects that the first call from
185
- # htmlReadIO() is made with a length long enough (~1KB) to
186
- # achieve advanced encoding detection.
187
- if encoding = EncodingReader.detect_encoding(@firstchunk)
188
- # The first chunk is stored for the next read in retry.
189
- raise EncodingFoundException, encoding
190
- end
191
- end
192
-
193
- ret = @firstchunk.slice!(0, len)
194
- if (len -= ret.length) > 0
195
- rest = @io.read(len) and ret << rest
196
- end
197
- if ret.empty?
198
- nil
199
- else
200
- ret
201
- end
202
- end
203
- end
204
88
  end
205
89
  end
206
90
  end