nokogiri 1.0.0 → 1.6.8.1

Sign up to get free protection for your applications and to get access to all the features.

Potentially problematic release.


This version of nokogiri might be problematic. Click here for more details.

Files changed (309) hide show
  1. checksums.yaml +7 -0
  2. data/.autotest +26 -0
  3. data/.cross_rubies +9 -0
  4. data/.editorconfig +17 -0
  5. data/.gemtest +0 -0
  6. data/.travis.yml +51 -0
  7. data/CHANGELOG.rdoc +1160 -0
  8. data/CONTRIBUTING.md +42 -0
  9. data/C_CODING_STYLE.rdoc +33 -0
  10. data/Gemfile +22 -0
  11. data/LICENSE.txt +31 -0
  12. data/Manifest.txt +284 -40
  13. data/README.md +166 -0
  14. data/ROADMAP.md +111 -0
  15. data/Rakefile +310 -199
  16. data/STANDARD_RESPONSES.md +47 -0
  17. data/Y_U_NO_GEMSPEC.md +155 -0
  18. data/appveyor.yml +22 -0
  19. data/bin/nokogiri +118 -0
  20. data/build_all +45 -0
  21. data/dependencies.yml +29 -0
  22. data/ext/nokogiri/depend +358 -0
  23. data/ext/nokogiri/extconf.rb +664 -34
  24. data/ext/nokogiri/html_document.c +120 -33
  25. data/ext/nokogiri/html_document.h +1 -1
  26. data/ext/nokogiri/html_element_description.c +279 -0
  27. data/ext/nokogiri/html_element_description.h +10 -0
  28. data/ext/nokogiri/html_entity_lookup.c +32 -0
  29. data/ext/nokogiri/html_entity_lookup.h +8 -0
  30. data/ext/nokogiri/html_sax_parser_context.c +116 -0
  31. data/ext/nokogiri/html_sax_parser_context.h +11 -0
  32. data/ext/nokogiri/html_sax_push_parser.c +87 -0
  33. data/ext/nokogiri/html_sax_push_parser.h +9 -0
  34. data/ext/nokogiri/nokogiri.c +145 -0
  35. data/ext/nokogiri/nokogiri.h +131 -0
  36. data/ext/nokogiri/xml_attr.c +94 -0
  37. data/ext/nokogiri/xml_attr.h +9 -0
  38. data/ext/nokogiri/xml_attribute_decl.c +70 -0
  39. data/ext/nokogiri/xml_attribute_decl.h +9 -0
  40. data/ext/nokogiri/xml_cdata.c +23 -19
  41. data/ext/nokogiri/xml_cdata.h +1 -1
  42. data/ext/nokogiri/xml_comment.c +69 -0
  43. data/ext/nokogiri/xml_comment.h +9 -0
  44. data/ext/nokogiri/xml_document.c +501 -54
  45. data/ext/nokogiri/xml_document.h +14 -1
  46. data/ext/nokogiri/xml_document_fragment.c +48 -0
  47. data/ext/nokogiri/xml_document_fragment.h +10 -0
  48. data/ext/nokogiri/xml_dtd.c +109 -24
  49. data/ext/nokogiri/xml_dtd.h +3 -1
  50. data/ext/nokogiri/xml_element_content.c +123 -0
  51. data/ext/nokogiri/xml_element_content.h +10 -0
  52. data/ext/nokogiri/xml_element_decl.c +69 -0
  53. data/ext/nokogiri/xml_element_decl.h +9 -0
  54. data/ext/nokogiri/xml_encoding_handler.c +79 -0
  55. data/ext/nokogiri/xml_encoding_handler.h +8 -0
  56. data/ext/nokogiri/xml_entity_decl.c +110 -0
  57. data/ext/nokogiri/xml_entity_decl.h +10 -0
  58. data/ext/nokogiri/xml_entity_reference.c +52 -0
  59. data/ext/nokogiri/xml_entity_reference.h +9 -0
  60. data/ext/nokogiri/xml_io.c +60 -0
  61. data/ext/nokogiri/xml_io.h +11 -0
  62. data/ext/nokogiri/xml_libxml2_hacks.c +112 -0
  63. data/ext/nokogiri/xml_libxml2_hacks.h +12 -0
  64. data/ext/nokogiri/xml_namespace.c +117 -0
  65. data/ext/nokogiri/xml_namespace.h +13 -0
  66. data/ext/nokogiri/xml_node.c +1285 -315
  67. data/ext/nokogiri/xml_node.h +4 -6
  68. data/ext/nokogiri/xml_node_set.c +415 -54
  69. data/ext/nokogiri/xml_node_set.h +6 -2
  70. data/ext/nokogiri/xml_processing_instruction.c +56 -0
  71. data/ext/nokogiri/xml_processing_instruction.h +9 -0
  72. data/ext/nokogiri/xml_reader.c +316 -77
  73. data/ext/nokogiri/xml_reader.h +1 -1
  74. data/ext/nokogiri/xml_relax_ng.c +161 -0
  75. data/ext/nokogiri/xml_relax_ng.h +9 -0
  76. data/ext/nokogiri/xml_sax_parser.c +215 -80
  77. data/ext/nokogiri/xml_sax_parser.h +30 -1
  78. data/ext/nokogiri/xml_sax_parser_context.c +262 -0
  79. data/ext/nokogiri/xml_sax_parser_context.h +10 -0
  80. data/ext/nokogiri/xml_sax_push_parser.c +115 -0
  81. data/ext/nokogiri/xml_sax_push_parser.h +9 -0
  82. data/ext/nokogiri/xml_schema.c +205 -0
  83. data/ext/nokogiri/xml_schema.h +9 -0
  84. data/ext/nokogiri/xml_syntax_error.c +45 -175
  85. data/ext/nokogiri/xml_syntax_error.h +4 -2
  86. data/ext/nokogiri/xml_text.c +37 -14
  87. data/ext/nokogiri/xml_text.h +1 -1
  88. data/ext/nokogiri/xml_xpath_context.c +230 -13
  89. data/ext/nokogiri/xml_xpath_context.h +2 -1
  90. data/ext/nokogiri/xslt_stylesheet.c +196 -34
  91. data/ext/nokogiri/xslt_stylesheet.h +6 -1
  92. data/lib/nokogiri/css/node.rb +18 -61
  93. data/lib/nokogiri/css/parser.rb +725 -17
  94. data/lib/nokogiri/css/parser.y +126 -63
  95. data/lib/nokogiri/css/parser_extras.rb +91 -0
  96. data/lib/nokogiri/css/syntax_error.rb +7 -0
  97. data/lib/nokogiri/css/tokenizer.rb +148 -5
  98. data/lib/nokogiri/css/tokenizer.rex +31 -39
  99. data/lib/nokogiri/css/xpath_visitor.rb +109 -51
  100. data/lib/nokogiri/css.rb +24 -3
  101. data/lib/nokogiri/decorators/slop.rb +42 -0
  102. data/lib/nokogiri/html/builder.rb +27 -1
  103. data/lib/nokogiri/html/document.rb +329 -3
  104. data/lib/nokogiri/html/document_fragment.rb +39 -0
  105. data/lib/nokogiri/html/element_description.rb +23 -0
  106. data/lib/nokogiri/html/element_description_defaults.rb +671 -0
  107. data/lib/nokogiri/html/entity_lookup.rb +13 -0
  108. data/lib/nokogiri/html/sax/parser.rb +35 -4
  109. data/lib/nokogiri/html/sax/parser_context.rb +16 -0
  110. data/lib/nokogiri/html/sax/push_parser.rb +36 -0
  111. data/lib/nokogiri/html.rb +18 -76
  112. data/lib/nokogiri/syntax_error.rb +4 -0
  113. data/lib/nokogiri/version.rb +106 -1
  114. data/lib/nokogiri/xml/attr.rb +14 -0
  115. data/lib/nokogiri/xml/attribute_decl.rb +18 -0
  116. data/lib/nokogiri/xml/builder.rb +395 -31
  117. data/lib/nokogiri/xml/cdata.rb +4 -2
  118. data/lib/nokogiri/xml/character_data.rb +7 -0
  119. data/lib/nokogiri/xml/document.rb +267 -12
  120. data/lib/nokogiri/xml/document_fragment.rb +149 -0
  121. data/lib/nokogiri/xml/dtd.rb +27 -1
  122. data/lib/nokogiri/xml/element_content.rb +36 -0
  123. data/lib/nokogiri/xml/element_decl.rb +13 -0
  124. data/lib/nokogiri/xml/entity_decl.rb +19 -0
  125. data/lib/nokogiri/xml/namespace.rb +13 -0
  126. data/lib/nokogiri/xml/node/save_options.rb +61 -0
  127. data/lib/nokogiri/xml/node.rb +748 -109
  128. data/lib/nokogiri/xml/node_set.rb +200 -72
  129. data/lib/nokogiri/xml/parse_options.rb +120 -0
  130. data/lib/nokogiri/xml/pp/character_data.rb +18 -0
  131. data/lib/nokogiri/xml/pp/node.rb +56 -0
  132. data/lib/nokogiri/xml/pp.rb +2 -0
  133. data/lib/nokogiri/xml/processing_instruction.rb +8 -0
  134. data/lib/nokogiri/xml/reader.rb +102 -4
  135. data/lib/nokogiri/xml/relax_ng.rb +32 -0
  136. data/lib/nokogiri/xml/sax/document.rb +114 -2
  137. data/lib/nokogiri/xml/sax/parser.rb +97 -7
  138. data/lib/nokogiri/xml/sax/parser_context.rb +16 -0
  139. data/lib/nokogiri/xml/sax/push_parser.rb +60 -0
  140. data/lib/nokogiri/xml/sax.rb +2 -7
  141. data/lib/nokogiri/xml/schema.rb +63 -0
  142. data/lib/nokogiri/xml/searchable.rb +221 -0
  143. data/lib/nokogiri/xml/syntax_error.rb +27 -1
  144. data/lib/nokogiri/xml/text.rb +4 -1
  145. data/lib/nokogiri/xml/xpath/syntax_error.rb +11 -0
  146. data/lib/nokogiri/xml/xpath.rb +4 -0
  147. data/lib/nokogiri/xml/xpath_context.rb +3 -1
  148. data/lib/nokogiri/xml.rb +45 -38
  149. data/lib/nokogiri/xslt/stylesheet.rb +19 -0
  150. data/lib/nokogiri/xslt.rb +47 -2
  151. data/lib/nokogiri.rb +117 -24
  152. data/lib/xsd/xmlparser/nokogiri.rb +102 -0
  153. data/patches/sort-patches-by-date +25 -0
  154. data/ports/archives/libxml2-2.9.4.tar.gz +0 -0
  155. data/ports/archives/libxslt-1.1.29.tar.gz +0 -0
  156. data/suppressions/README.txt +1 -0
  157. data/suppressions/nokogiri_ree-1.8.7.358.supp +61 -0
  158. data/suppressions/nokogiri_ruby-1.8.7.370.supp +0 -0
  159. data/suppressions/nokogiri_ruby-1.9.2.320.supp +28 -0
  160. data/suppressions/nokogiri_ruby-1.9.3.327.supp +28 -0
  161. data/tasks/test.rb +100 -0
  162. data/test/css/test_nthiness.rb +73 -6
  163. data/test/css/test_parser.rb +184 -39
  164. data/test/css/test_tokenizer.rb +72 -19
  165. data/test/css/test_xpath_visitor.rb +44 -2
  166. data/test/decorators/test_slop.rb +20 -0
  167. data/test/files/2ch.html +108 -0
  168. data/test/files/GH_1042.html +18 -0
  169. data/test/files/address_book.rlx +12 -0
  170. data/test/files/address_book.xml +10 -0
  171. data/test/files/atom.xml +344 -0
  172. data/test/files/bar/bar.xsd +4 -0
  173. data/test/files/bogus.xml +0 -0
  174. data/test/files/dont_hurt_em_why.xml +422 -0
  175. data/test/files/encoding.html +82 -0
  176. data/test/files/encoding.xhtml +84 -0
  177. data/test/files/exslt.xml +8 -0
  178. data/test/files/exslt.xslt +35 -0
  179. data/test/files/foo/foo.xsd +4 -0
  180. data/test/files/metacharset.html +10 -0
  181. data/test/files/namespace_pressure_test.xml +1684 -0
  182. data/test/files/noencoding.html +47 -0
  183. data/test/files/po.xml +32 -0
  184. data/test/files/po.xsd +66 -0
  185. data/test/files/saml/saml20assertion_schema.xsd +283 -0
  186. data/test/files/saml/saml20protocol_schema.xsd +302 -0
  187. data/test/files/saml/xenc_schema.xsd +146 -0
  188. data/test/files/saml/xmldsig_schema.xsd +318 -0
  189. data/test/files/shift_jis.html +10 -0
  190. data/test/files/shift_jis.xml +5 -0
  191. data/test/files/shift_jis_no_charset.html +9 -0
  192. data/test/files/slow-xpath.xml +25509 -0
  193. data/test/files/snuggles.xml +3 -0
  194. data/test/files/staff.dtd +10 -0
  195. data/test/files/test_document_url/bar.xml +2 -0
  196. data/test/files/test_document_url/document.dtd +4 -0
  197. data/test/files/test_document_url/document.xml +6 -0
  198. data/test/files/tlm.html +2 -1
  199. data/test/files/to_be_xincluded.xml +2 -0
  200. data/test/files/valid_bar.xml +2 -0
  201. data/test/files/xinclude.xml +4 -0
  202. data/test/helper.rb +124 -13
  203. data/test/html/sax/test_parser.rb +118 -4
  204. data/test/html/sax/test_parser_context.rb +46 -0
  205. data/test/html/sax/test_push_parser.rb +87 -0
  206. data/test/html/test_builder.rb +94 -8
  207. data/test/html/test_document.rb +626 -11
  208. data/test/html/test_document_encoding.rb +145 -0
  209. data/test/html/test_document_fragment.rb +301 -0
  210. data/test/html/test_element_description.rb +105 -0
  211. data/test/html/test_named_characters.rb +14 -0
  212. data/test/html/test_node.rb +212 -0
  213. data/test/html/test_node_encoding.rb +85 -0
  214. data/test/namespaces/test_additional_namespaces_in_builder_doc.rb +14 -0
  215. data/test/namespaces/test_namespaces_aliased_default.rb +24 -0
  216. data/test/namespaces/test_namespaces_in_builder_doc.rb +75 -0
  217. data/test/namespaces/test_namespaces_in_cloned_doc.rb +31 -0
  218. data/test/namespaces/test_namespaces_in_created_doc.rb +75 -0
  219. data/test/namespaces/test_namespaces_in_parsed_doc.rb +80 -0
  220. data/test/namespaces/test_namespaces_preservation.rb +31 -0
  221. data/test/test_convert_xpath.rb +2 -47
  222. data/test/test_css_cache.rb +45 -0
  223. data/test/test_encoding_handler.rb +48 -0
  224. data/test/test_memory_leak.rb +156 -0
  225. data/test/test_nokogiri.rb +103 -1
  226. data/test/test_soap4r_sax.rb +52 -0
  227. data/test/test_xslt_transforms.rb +293 -8
  228. data/test/xml/node/test_save_options.rb +28 -0
  229. data/test/xml/node/test_subclass.rb +44 -0
  230. data/test/xml/sax/test_parser.rb +309 -8
  231. data/test/xml/sax/test_parser_context.rb +115 -0
  232. data/test/xml/sax/test_push_parser.rb +157 -0
  233. data/test/xml/test_attr.rb +67 -0
  234. data/test/xml/test_attribute_decl.rb +86 -0
  235. data/test/xml/test_builder.rb +327 -2
  236. data/test/xml/test_c14n.rb +180 -0
  237. data/test/xml/test_cdata.rb +32 -2
  238. data/test/xml/test_comment.rb +40 -0
  239. data/test/xml/test_document.rb +846 -35
  240. data/test/xml/test_document_encoding.rb +31 -0
  241. data/test/xml/test_document_fragment.rb +271 -0
  242. data/test/xml/test_dtd.rb +153 -9
  243. data/test/xml/test_dtd_encoding.rb +31 -0
  244. data/test/xml/test_element_content.rb +56 -0
  245. data/test/xml/test_element_decl.rb +73 -0
  246. data/test/xml/test_entity_decl.rb +122 -0
  247. data/test/xml/test_entity_reference.rb +251 -0
  248. data/test/xml/test_namespace.rb +96 -0
  249. data/test/xml/test_node.rb +1126 -105
  250. data/test/xml/test_node_attributes.rb +115 -0
  251. data/test/xml/test_node_encoding.rb +69 -0
  252. data/test/xml/test_node_inheritance.rb +32 -0
  253. data/test/xml/test_node_reparenting.rb +549 -0
  254. data/test/xml/test_node_set.rb +668 -9
  255. data/test/xml/test_parse_options.rb +64 -0
  256. data/test/xml/test_processing_instruction.rb +30 -0
  257. data/test/xml/test_reader.rb +589 -0
  258. data/test/xml/test_reader_encoding.rb +134 -0
  259. data/test/xml/test_relax_ng.rb +60 -0
  260. data/test/xml/test_schema.rb +142 -0
  261. data/test/xml/test_syntax_error.rb +30 -0
  262. data/test/xml/test_text.rb +49 -2
  263. data/test/xml/test_unparented_node.rb +440 -0
  264. data/test/xml/test_xinclude.rb +83 -0
  265. data/test/xml/test_xpath.rb +445 -0
  266. data/test/xslt/test_custom_functions.rb +133 -0
  267. data/test/xslt/test_exception_handling.rb +37 -0
  268. data/test_all +107 -0
  269. metadata +459 -115
  270. data/History.txt +0 -6
  271. data/README.ja.txt +0 -86
  272. data/README.txt +0 -87
  273. data/ext/nokogiri/html_sax_parser.c +0 -32
  274. data/ext/nokogiri/html_sax_parser.h +0 -11
  275. data/ext/nokogiri/native.c +0 -40
  276. data/ext/nokogiri/native.h +0 -51
  277. data/ext/nokogiri/xml_xpath.c +0 -46
  278. data/ext/nokogiri/xml_xpath.h +0 -11
  279. data/lib/nokogiri/css/generated_parser.rb +0 -653
  280. data/lib/nokogiri/css/generated_tokenizer.rb +0 -159
  281. data/lib/nokogiri/decorators/hpricot/node.rb +0 -58
  282. data/lib/nokogiri/decorators/hpricot/node_set.rb +0 -14
  283. data/lib/nokogiri/decorators/hpricot/xpath_visitor.rb +0 -17
  284. data/lib/nokogiri/decorators/hpricot.rb +0 -3
  285. data/lib/nokogiri/decorators.rb +0 -1
  286. data/lib/nokogiri/hpricot.rb +0 -47
  287. data/lib/nokogiri/xml/after_handler.rb +0 -18
  288. data/lib/nokogiri/xml/before_handler.rb +0 -32
  289. data/lib/nokogiri/xml/element.rb +0 -6
  290. data/lib/nokogiri/xml/entity_declaration.rb +0 -9
  291. data/nokogiri.gemspec +0 -34
  292. data/test/hpricot/files/basic.xhtml +0 -17
  293. data/test/hpricot/files/boingboing.html +0 -2266
  294. data/test/hpricot/files/cy0.html +0 -3653
  295. data/test/hpricot/files/immob.html +0 -400
  296. data/test/hpricot/files/pace_application.html +0 -1320
  297. data/test/hpricot/files/tenderlove.html +0 -16
  298. data/test/hpricot/files/uswebgen.html +0 -220
  299. data/test/hpricot/files/utf8.html +0 -1054
  300. data/test/hpricot/files/week9.html +0 -1723
  301. data/test/hpricot/files/why.xml +0 -19
  302. data/test/hpricot/load_files.rb +0 -7
  303. data/test/hpricot/test_alter.rb +0 -67
  304. data/test/hpricot/test_builder.rb +0 -27
  305. data/test/hpricot/test_parser.rb +0 -423
  306. data/test/hpricot/test_paths.rb +0 -15
  307. data/test/hpricot/test_preserved.rb +0 -78
  308. data/test/hpricot/test_xml.rb +0 -30
  309. data/test/test_reader.rb +0 -222
@@ -1,8 +1,8 @@
1
1
  module Nokogiri
2
2
  module CSS
3
- class XPathVisitor
3
+ class XPathVisitor # :nodoc:
4
4
  def visit_function node
5
- # note that nth-child and nth-last-child are preprocessed in css/node.rb.
5
+
6
6
  msg = :"visit_function_#{node.value.first.gsub(/[(]/, '')}"
7
7
  return self.send(msg, node) if self.respond_to?(msg)
8
8
 
@@ -11,44 +11,60 @@ module Nokogiri
11
11
  'child::text()'
12
12
  when /^self\(/
13
13
  "self::#{node.value[1]}"
14
- when /^(eq|nth|nth-of-type|nth-child)\(/
15
- if node.value[1].is_a?(Nokogiri::CSS::Node) and node.value[1].type == :AN_PLUS_B
16
- an_plus_b(node.value[1])
14
+ when /^eq\(/
15
+ "position() = #{node.value[1]}"
16
+ when /^(nth|nth-of-type)\(/
17
+ if node.value[1].is_a?(Nokogiri::CSS::Node) and node.value[1].type == :NTH
18
+ nth(node.value[1])
19
+ else
20
+ "position() = #{node.value[1]}"
21
+ end
22
+ when /^nth-child\(/
23
+ if node.value[1].is_a?(Nokogiri::CSS::Node) and node.value[1].type == :NTH
24
+ nth(node.value[1], :child => true)
25
+ else
26
+ "count(preceding-sibling::*) = #{node.value[1].to_i-1}"
27
+ end
28
+ when /^nth-last-of-type\(/
29
+ if node.value[1].is_a?(Nokogiri::CSS::Node) and node.value[1].type == :NTH
30
+ nth(node.value[1], :last => true)
31
+ else
32
+ index = node.value[1].to_i - 1
33
+ index == 0 ? "position() = last()" : "position() = last() - #{index}"
34
+ end
35
+ when /^nth-last-child\(/
36
+ if node.value[1].is_a?(Nokogiri::CSS::Node) and node.value[1].type == :NTH
37
+ nth(node.value[1], :last => true, :child => true)
17
38
  else
18
- "position() = " + node.value[1]
39
+ "count(following-sibling::*) = #{node.value[1].to_i-1}"
19
40
  end
20
41
  when /^(first|first-of-type)\(/
21
42
  "position() = 1"
22
43
  when /^(last|last-of-type)\(/
23
44
  "position() = last()"
24
- when /^(nth-last-child|nth-last-of-type)\(/
25
- "position() = last() - #{node.value[1]}"
26
45
  when /^contains\(/
27
46
  "contains(., #{node.value[1]})"
28
47
  when /^gt\(/
29
48
  "position() > #{node.value[1]}"
30
49
  when /^only-child\(/
31
50
  "last() = 1"
51
+ when /^comment\(/
52
+ "comment()"
53
+ when /^has\(/
54
+ node.value[1].accept(self)
32
55
  else
33
- node.value.first + ')'
56
+ args = ['.'] + node.value[1..-1]
57
+ "#{node.value.first}#{args.join(', ')})"
34
58
  end
35
59
  end
36
60
 
37
61
  def visit_not node
38
- 'not(' + node.value.first.accept(self) + ')'
39
- end
40
-
41
- def visit_preceding_selector node
42
- node.value.last.accept(self) +
43
- '[preceding-sibling::' +
44
- node.value.first.accept(self) +
45
- ']'
46
- end
47
-
48
- def visit_direct_adjacent_selector node
49
- node.value.first.accept(self) +
50
- "/following-sibling::*[1]/self::" +
51
- node.value.last.accept(self)
62
+ child = node.value.first
63
+ if :ELEMENT_NAME == child.type
64
+ "not(self::#{child.accept(self)})"
65
+ else
66
+ "not(#{child.accept(self)})"
67
+ end
52
68
  end
53
69
 
54
70
  def visit_id node
@@ -73,15 +89,19 @@ module Nokogiri
73
89
  value = "'#{value}'" if value !~ /^['"]/
74
90
 
75
91
  case node.value[1]
76
- when '*='
92
+ when :equal
93
+ attribute + " = " + "#{value}"
94
+ when :not_equal
95
+ attribute + " != " + "#{value}"
96
+ when :substring_match
77
97
  "contains(#{attribute}, #{value})"
78
- when '^='
98
+ when :prefix_match
79
99
  "starts-with(#{attribute}, #{value})"
80
- when '|='
100
+ when :dash_match
81
101
  "#{attribute} = #{value} or starts-with(#{attribute}, concat(#{value}, '-'))"
82
- when '~='
102
+ when :includes
83
103
  "contains(concat(\" \", #{attribute}, \" \"),concat(\" \", #{value}, \" \"))"
84
- when '$='
104
+ when :suffix_match
85
105
  "substring(#{attribute}, string-length(#{attribute}) - " +
86
106
  "string-length(#{value}) + 1, string-length(#{value})) = #{value}"
87
107
  else
@@ -98,26 +118,45 @@ module Nokogiri
98
118
 
99
119
  case node.value.first
100
120
  when "first" then "position() = 1"
121
+ when "first-child" then "count(preceding-sibling::*) = 0"
101
122
  when "last" then "position() = last()"
123
+ when "last-child" then "count(following-sibling::*) = 0"
102
124
  when "first-of-type" then "position() = 1"
103
125
  when "last-of-type" then "position() = last()"
126
+ when "only-child" then "count(preceding-sibling::*) = 0 and count(following-sibling::*) = 0"
104
127
  when "only-of-type" then "last() = 1"
105
128
  when "empty" then "not(node())"
106
129
  when "parent" then "node()"
107
130
  when "root" then "not(parent::*)"
108
131
  else
109
- '1 = 1'
132
+ node.value.first + "(.)"
110
133
  end
111
134
  end
112
135
  end
113
136
 
114
137
  def visit_class_condition node
115
- "contains(concat(' ', @class, ' '), ' #{node.value.first} ')"
138
+ "contains(concat(' ', normalize-space(@class), ' '), ' #{node.value.first} ')"
116
139
  end
117
140
 
118
141
  def visit_combinator node
119
- node.value.first.accept(self) + ' and ' +
120
- node.value.last.accept(self)
142
+ if is_of_type_pseudo_class?(node.value.last)
143
+ "#{node.value.first.accept(self) if node.value.first}][#{node.value.last.accept(self)}"
144
+ else
145
+ "#{node.value.first.accept(self) if node.value.first} and #{node.value.last.accept(self)}"
146
+ end
147
+ end
148
+
149
+ {
150
+ 'direct_adjacent_selector' => "/following-sibling::*[1]/self::",
151
+ 'following_selector' => "/following-sibling::",
152
+ 'descendant_selector' => '//',
153
+ 'child_selector' => '/',
154
+ }.each do |k,v|
155
+ class_eval %{
156
+ def visit_#{k} node
157
+ "\#{node.value.first.accept(self) if node.value.first}#{v}\#{node.value.last.accept(self)}"
158
+ end
159
+ }
121
160
  end
122
161
 
123
162
  def visit_conditional_selector node
@@ -125,18 +164,6 @@ module Nokogiri
125
164
  node.value.last.accept(self) + ']'
126
165
  end
127
166
 
128
- def visit_descendant_selector node
129
- node.value.first.accept(self) +
130
- '//' +
131
- node.value.last.accept(self)
132
- end
133
-
134
- def visit_child_selector node
135
- node.value.first.accept(self) +
136
- '/' +
137
- node.value.last.accept(self)
138
- end
139
-
140
167
  def visit_element_name node
141
168
  node.value.first
142
169
  end
@@ -146,20 +173,51 @@ module Nokogiri
146
173
  end
147
174
 
148
175
  private
149
- def an_plus_b node
176
+ def nth node, options={}
150
177
  raise ArgumentError, "expected an+b node to contain 4 tokens, but is #{node.value.inspect}" unless node.value.size == 4
151
178
 
152
- a = node.value[0].to_i
153
- b = node.value[3].to_i
179
+ a, b = read_a_and_positive_b node.value
180
+ position = if options[:child]
181
+ options[:last] ? "(count(following-sibling::*) + 1)" : "(count(preceding-sibling::*) + 1)"
182
+ else
183
+ options[:last] ? "(last()-position()+1)" : "position()"
184
+ end
154
185
 
155
- if (b == 0)
156
- return "(position() mod #{a}) = 0"
186
+ if b.zero?
187
+ "(#{position} mod #{a}) = 0"
157
188
  else
158
- compare = (a < 0) ? "<=" : ">="
159
- return "(position() #{compare} #{b}) and (((position()-#{b}) mod #{a.abs}) = 0)"
189
+ compare = a < 0 ? "<=" : ">="
190
+ if a.abs == 1
191
+ "#{position} #{compare} #{b}"
192
+ else
193
+ "(#{position} #{compare} #{b}) and (((#{position}-#{b}) mod #{a.abs}) = 0)"
194
+ end
160
195
  end
161
196
  end
162
197
 
198
+ def read_a_and_positive_b values
199
+ op = values[2]
200
+ if op == "+"
201
+ a = values[0].to_i
202
+ b = values[3].to_i
203
+ elsif op == "-"
204
+ a = values[0].to_i
205
+ b = a - (values[3].to_i % a)
206
+ else
207
+ raise ArgumentError, "expected an+b node to have either + or - as the operator, but is #{op.inspect}"
208
+ end
209
+ [a, b]
210
+ end
211
+
212
+ def is_of_type_pseudo_class? node
213
+ if node.type==:PSEUDO_CLASS
214
+ if node.value[0].is_a?(Nokogiri::CSS::Node) and node.value[0].type == :FUNCTION
215
+ node.value[0].value[0]
216
+ else
217
+ node.value[0]
218
+ end =~ /(nth|first|last|only)-of-type(\()?/
219
+ end
220
+ end
163
221
  end
164
222
  end
165
223
  end
data/lib/nokogiri/css.rb CHANGED
@@ -1,6 +1,27 @@
1
1
  require 'nokogiri/css/node'
2
2
  require 'nokogiri/css/xpath_visitor'
3
- require 'nokogiri/css/generated_tokenizer'
4
- require 'nokogiri/css/generated_parser'
5
- require 'nokogiri/css/tokenizer'
3
+ x = $-w
4
+ $-w = false
6
5
  require 'nokogiri/css/parser'
6
+ $-w = x
7
+
8
+ require 'nokogiri/css/tokenizer'
9
+ require 'nokogiri/css/syntax_error'
10
+
11
+ module Nokogiri
12
+ module CSS
13
+ class << self
14
+ ###
15
+ # Parse this CSS selector in +selector+. Returns an AST.
16
+ def parse selector
17
+ Parser.new.parse selector
18
+ end
19
+
20
+ ###
21
+ # Get the XPath for +selector+.
22
+ def xpath_for selector, options={}
23
+ Parser.new(options[:ns] || {}).xpath_for selector, options
24
+ end
25
+ end
26
+ end
27
+ end
@@ -0,0 +1,42 @@
1
+ module Nokogiri
2
+ module Decorators
3
+ ###
4
+ # The Slop decorator implements method missing such that methods may be
5
+ # used instead of XPath or CSS. See Nokogiri.Slop
6
+ module Slop
7
+ # The default XPath search context for Slop
8
+ XPATH_PREFIX = "./"
9
+
10
+ ###
11
+ # look for node with +name+. See Nokogiri.Slop
12
+ def method_missing name, *args, &block
13
+ if args.empty?
14
+ list = xpath("#{XPATH_PREFIX}#{name.to_s.sub(/^_/, '')}")
15
+ elsif args.first.is_a? Hash
16
+ hash = args.first
17
+ if hash[:css]
18
+ list = css("#{name}#{hash[:css]}")
19
+ elsif hash[:xpath]
20
+ conds = Array(hash[:xpath]).join(' and ')
21
+ list = xpath("#{XPATH_PREFIX}#{name}[#{conds}]")
22
+ end
23
+ else
24
+ CSS::Parser.without_cache do
25
+ list = xpath(
26
+ *CSS.xpath_for("#{name}#{args.first}", :prefix => XPATH_PREFIX)
27
+ )
28
+ end
29
+ end
30
+
31
+ super if list.empty?
32
+ list.length == 1 ? list.first : list
33
+ end
34
+
35
+ def respond_to_missing? name, include_private = false
36
+ list = xpath("#{XPATH_PREFIX}#{name.to_s.sub(/^_/, '')}")
37
+
38
+ !list.empty?
39
+ end
40
+ end
41
+ end
42
+ end
@@ -1,6 +1,32 @@
1
1
  module Nokogiri
2
2
  module HTML
3
- class Builder < XML::Builder
3
+ ###
4
+ # Nokogiri HTML builder is used for building HTML documents. It is very
5
+ # similar to the Nokogiri::XML::Builder. In fact, you should go read the
6
+ # documentation for Nokogiri::XML::Builder before reading this
7
+ # documentation.
8
+ #
9
+ # == Synopsis:
10
+ #
11
+ # Create an HTML document with a body that has an onload attribute, and a
12
+ # span tag with a class of "bold" that has content of "Hello world".
13
+ #
14
+ # builder = Nokogiri::HTML::Builder.new do |doc|
15
+ # doc.html {
16
+ # doc.body(:onload => 'some_func();') {
17
+ # doc.span.bold {
18
+ # doc.text "Hello world"
19
+ # }
20
+ # }
21
+ # }
22
+ # end
23
+ # puts builder.to_html
24
+ #
25
+ # The HTML builder inherits from the XML builder, so make sure to read the
26
+ # Nokogiri::XML::Builder documentation.
27
+ class Builder < Nokogiri::XML::Builder
28
+ ###
29
+ # Convert the builder to HTML
4
30
  def to_html
5
31
  @doc.to_html
6
32
  end
@@ -1,8 +1,334 @@
1
1
  module Nokogiri
2
2
  module HTML
3
- class Document < XML::Document
4
- def to_html
5
- serialize
3
+ class Document < Nokogiri::XML::Document
4
+ ###
5
+ # Get the meta tag encoding for this document. If there is no meta tag,
6
+ # then nil is returned.
7
+ def meta_encoding
8
+ case
9
+ when meta = at('//meta[@charset]')
10
+ meta[:charset]
11
+ when meta = meta_content_type
12
+ meta['content'][/charset\s*=\s*([\w-]+)/i, 1]
13
+ end
14
+ end
15
+
16
+ ###
17
+ # Set the meta tag encoding for this document.
18
+ #
19
+ # If a meta encoding tag is already present, its content is
20
+ # replaced with the given text.
21
+ #
22
+ # Otherwise, this method tries to create one at an appropriate
23
+ # place supplying head and/or html elements as necessary, which
24
+ # is inside a head element if any, and before any text node or
25
+ # content element (typically <body>) if any.
26
+ #
27
+ # The result when trying to set an encoding that is different
28
+ # from the document encoding is undefined.
29
+ #
30
+ # Beware in CRuby, that libxml2 automatically inserts a meta tag
31
+ # into a head element.
32
+ def meta_encoding= encoding
33
+ case
34
+ when meta = meta_content_type
35
+ meta['content'] = 'text/html; charset=%s' % encoding
36
+ encoding
37
+ when meta = at('//meta[@charset]')
38
+ meta['charset'] = encoding
39
+ else
40
+ meta = XML::Node.new('meta', self)
41
+ if dtd = internal_subset and dtd.html5_dtd?
42
+ meta['charset'] = encoding
43
+ else
44
+ meta['http-equiv'] = 'Content-Type'
45
+ meta['content'] = 'text/html; charset=%s' % encoding
46
+ end
47
+
48
+ case
49
+ when head = at('//head')
50
+ head.prepend_child(meta)
51
+ else
52
+ set_metadata_element(meta)
53
+ end
54
+ encoding
55
+ end
56
+ end
57
+
58
+ def meta_content_type
59
+ xpath('//meta[@http-equiv and boolean(@content)]').find { |node|
60
+ node['http-equiv'] =~ /\AContent-Type\z/i
61
+ }
62
+ end
63
+ private :meta_content_type
64
+
65
+ ###
66
+ # Get the title string of this document. Return nil if there is
67
+ # no title tag.
68
+ def title
69
+ title = at('//title') and title.inner_text
70
+ end
71
+
72
+ ###
73
+ # Set the title string of this document.
74
+ #
75
+ # If a title element is already present, its content is replaced
76
+ # with the given text.
77
+ #
78
+ # Otherwise, this method tries to create one at an appropriate
79
+ # place supplying head and/or html elements as necessary, which
80
+ # is inside a head element if any, right after a meta
81
+ # encoding/charset tag if any, and before any text node or
82
+ # content element (typically <body>) if any.
83
+ def title=(text)
84
+ tnode = XML::Text.new(text, self)
85
+ if title = at('//title')
86
+ title.children = tnode
87
+ return text
88
+ end
89
+
90
+ title = XML::Node.new('title', self) << tnode
91
+ case
92
+ when head = at('//head')
93
+ head << title
94
+ when meta = at('//meta[@charset]') || meta_content_type
95
+ # better put after charset declaration
96
+ meta.add_next_sibling(title)
97
+ else
98
+ set_metadata_element(title)
99
+ end
100
+ text
101
+ end
102
+
103
+ def set_metadata_element(element)
104
+ case
105
+ when head = at('//head')
106
+ head << element
107
+ when html = at('//html')
108
+ head = html.prepend_child(XML::Node.new('head', self))
109
+ head.prepend_child(element)
110
+ when first = children.find { |node|
111
+ case node
112
+ when XML::Element, XML::Text
113
+ true
114
+ end
115
+ }
116
+ # We reach here only if the underlying document model
117
+ # allows <html>/<head> elements to be omitted and does not
118
+ # automatically supply them.
119
+ first.add_previous_sibling(element)
120
+ else
121
+ html = add_child(XML::Node.new('html', self))
122
+ head = html.add_child(XML::Node.new('head', self))
123
+ head.prepend_child(element)
124
+ end
125
+ end
126
+ private :set_metadata_element
127
+
128
+ ####
129
+ # Serialize Node using +options+. Save options can also be set using a
130
+ # block. See SaveOptions.
131
+ #
132
+ # These two statements are equivalent:
133
+ #
134
+ # node.serialize(:encoding => 'UTF-8', :save_with => FORMAT | AS_XML)
135
+ #
136
+ # or
137
+ #
138
+ # node.serialize(:encoding => 'UTF-8') do |config|
139
+ # config.format.as_xml
140
+ # end
141
+ #
142
+ def serialize options = {}
143
+ options[:save_with] ||= XML::Node::SaveOptions::DEFAULT_HTML
144
+ super
145
+ end
146
+
147
+ ####
148
+ # Create a Nokogiri::XML::DocumentFragment from +tags+
149
+ def fragment tags = nil
150
+ DocumentFragment.new(self, tags, self.root)
151
+ end
152
+
153
+ class << self
154
+ ###
155
+ # Parse HTML. +string_or_io+ may be a String, or any object that
156
+ # responds to _read_ and _close_ such as an IO, or StringIO.
157
+ # +url+ is the resource where this document is located. +encoding+ is the
158
+ # encoding that should be used when processing the document. +options+
159
+ # is a number that sets options in the parser, such as
160
+ # Nokogiri::XML::ParseOptions::RECOVER. See the constants in
161
+ # Nokogiri::XML::ParseOptions.
162
+ def parse string_or_io, url = nil, encoding = nil, options = XML::ParseOptions::DEFAULT_HTML
163
+
164
+ options = Nokogiri::XML::ParseOptions.new(options) if Fixnum === options
165
+ # Give the options to the user
166
+ yield options if block_given?
167
+
168
+ if string_or_io.respond_to?(:encoding)
169
+ unless string_or_io.encoding.name == "ASCII-8BIT"
170
+ encoding ||= string_or_io.encoding.name
171
+ end
172
+ end
173
+
174
+ if string_or_io.respond_to?(:read)
175
+ url ||= string_or_io.respond_to?(:path) ? string_or_io.path : nil
176
+ unless encoding
177
+ # Libxml2's parser has poor support for encoding
178
+ # detection. First, it does not recognize the HTML5
179
+ # style meta charset declaration. Secondly, even if it
180
+ # successfully detects an encoding hint, it does not
181
+ # re-decode or re-parse the preceding part which may be
182
+ # garbled.
183
+ #
184
+ # EncodingReader aims to perform advanced encoding
185
+ # detection beyond what Libxml2 does, and to emulate
186
+ # rewinding of a stream and make Libxml2 redo parsing
187
+ # from the start when an encoding hint is found.
188
+ string_or_io = EncodingReader.new(string_or_io)
189
+ begin
190
+ return read_io(string_or_io, url, encoding, options.to_i)
191
+ rescue EncodingFound => e
192
+ encoding = e.found_encoding
193
+ end
194
+ end
195
+ return read_io(string_or_io, url, encoding, options.to_i)
196
+ end
197
+
198
+ # read_memory pukes on empty docs
199
+ if string_or_io.nil? or string_or_io.empty?
200
+ return encoding ? new.tap { |i| i.encoding = encoding } : new
201
+ end
202
+
203
+ encoding ||= EncodingReader.detect_encoding(string_or_io)
204
+
205
+ read_memory(string_or_io, url, encoding, options.to_i)
206
+ end
207
+ end
208
+
209
# Raised to report the encoding detected for a document; the encoding is
# exposed through #found_encoding and is part of the message.
class EncodingFound < StandardError # :nodoc:
  attr_reader :found_encoding

  def initialize(encoding)
    @found_encoding = encoding
    description = "encoding found: %s" % encoding
    super(description)
  end
end
217
+
218
+ class EncodingReader # :nodoc:
219
# SAX document handler that remembers the character encoding declared by
# <meta> elements; the result is available through #encoding.
class SAXHandler < Nokogiri::XML::SAX::Document # :nodoc:
  attr_reader :encoding

  def initialize
    @encoding = nil
    super()
  end

  # Inspect a start tag. Only <meta> elements are considered:
  #
  # * <meta charset="..."> sets the encoding to that value;
  # * <meta http-equiv="Content-Type" content="...; charset=..."> sets it
  #   to the charset named in +content+ (and wins when both forms appear
  #   on the same element, because it is evaluated last).
  def start_element(name, attrs = [])
    return unless name == 'meta'

    attributes = Hash[attrs]

    declared = attributes['charset']
    @encoding = declared if declared

    http_equiv = attributes['http-equiv']
    return unless http_equiv && http_equiv.match(/\AContent-Type\z/i)

    content = attributes['content']
    return unless content

    found = content.match(/;\s*charset\s*=\s*([\w-]+)/)
    @encoding = found[1] if found
  end
end
239
+
240
# SAXHandler variant that aborts parsing with +throw+: it throws the
# given tag with the encoding as soon as one has been recorded, or with
# nil when a div, h1, img, p or br element starts first.
class JumpSAXHandler < SAXHandler
  # +jumptag+ is the tag passed to +throw+.
  def initialize(jumptag)
    super()
    @jumptag = jumptag
  end

  def start_element(name, attrs = [])
    # Let the parent record any encoding declared by this element.
    super

    if @encoding
      throw @jumptag, @encoding
    elsif name =~ /\A(?:div|h1|img|p|br)\z/
      throw @jumptag, nil
    end
  end
end
252
+
253
# Detect the encoding declared in +chunk+, the leading part of a
# document. Returns the encoding found, or nil when none is detected.
def self.detect_encoding(chunk)
  if Nokogiri.jruby? && EncodingReader.is_jruby_without_fix?
    return EncodingReader.detect_encoding_for_jruby_without_fix(chunk)
  end

  # A leading XML declaration is parsed on its own and its encoding used.
  xml_decl = chunk.match(/\A(<\?xml[ \t\r\n]+[^>]*>)/)
  return Nokogiri.XML(xml_decl[1]).encoding if xml_decl

  unless Nokogiri.jruby?
    handler = SAXHandler.new
    parser = Nokogiri::HTML::SAX::PushParser.new(handler)
    begin
      parser << chunk
    rescue StandardError
      # Best effort: whatever was recorded before the failure is used.
      # NOTE(review): this was written as `rescue Nokogiri::SyntaxError`
      # in modifier form, which rescues every StandardError; that
      # behaviour is kept. Confirm whether only syntax errors were meant.
    end
    return handler.encoding
  end

  # On JRuby: look for a charset inside a <meta ...> tag first, then fall
  # back to a SAX parse that jumps out through +throw+.
  meta = chunk.match(/(<meta\s)(.*)(charset\s*=\s*([\w-]+))(.*)/i)
  return meta[4] if meta

  catch(:encoding_found) do
    Nokogiri::HTML::SAX::Parser.new(JumpSAXHandler.new(:encoding_found)).parse(chunk)
    nil
  end
end
274
+
275
# True when JRUBY_VERSION is older than 1.6.5.
#
# The version is compared segment by segment as integers. Concatenating
# the digits, as was done before, misorders versions whose segments have
# more than one digit (e.g. "1.5.10" would read as 1510 and count as
# newer than 1.6.5).
def self.is_jruby_without_fix?
  segments = JRUBY_VERSION.split('.').map(&:to_i)
  (segments <=> [1, 6, 5]) < 0
end
278
+
279
# Encoding detection used when is_jruby_without_fix? is true. Returns the
# encoding found in +chunk+, or nil when none is detected or when the
# parser raises Nokogiri::SyntaxError or RuntimeError.
def self.detect_encoding_for_jruby_without_fix(chunk)
  # A leading XML declaration is parsed on its own and its encoding used.
  if (xml_decl = chunk.match(/\A(<\?xml[ \t\r\n]+[^>]*>)/))
    return Nokogiri.XML(xml_decl[1]).encoding
  end

  # Otherwise look for a charset inside a <meta ...> tag.
  if (meta = chunk.match(/(<meta\s)(.*)(charset\s*=\s*([\w-]+))(.*)/i))
    return meta[4]
  end

  catch(:encoding_found) do
    # NOTE(review): the handler receives the tag as a String while catch
    # uses the Symbol; presumably deliberate for the old JRuby builds this
    # method targets -- confirm before changing.
    handler = JumpSAXHandler.new(:encoding_found.to_s)
    Nokogiri::HTML::SAX::Parser.new(handler).parse(chunk)
    nil
  end
rescue Nokogiri::SyntaxError, RuntimeError
  # Ignore parser errors that nokogiri may raise
  nil
end
294
+
295
# Wrap +io+. No chunk has been read and no encoding has been found yet.
def initialize(io)
  @io = io
  @firstchunk = @encoding_found = nil
end
300
+
301
+ # This method is used by the C extension so that
302
+ # Nokogiri::HTML::Document#read_io() does not leak memory when
303
+ # EncodingFound is raised.
304
+ attr_reader :encoding_found
305
+
306
# Read up to +len+ bytes from the wrapped IO. A call without +len+ is not
# supported.
#
# The first call runs encoding detection on the chunk it reads; when an
# encoding is detected, EncodingFound is raised and the chunk is kept so
# that the next call returns it again. Returns nil when nothing is left.
def read(len)
  unless @firstchunk
    @firstchunk = @io.read(len)
    return nil unless @firstchunk

    # This implementation expects that the first call from
    # htmlReadIO() is made with a length long enough (~1KB) to
    # achieve advanced encoding detection.
    detected = EncodingReader.detect_encoding(@firstchunk)
    if detected
      # The first chunk is stored for the next read in retry.
      @encoding_found = EncodingFound.new(detected)
      raise @encoding_found
    end
  end
  @encoding_found = nil

  # Serve the buffered first chunk first, then top up from the IO.
  buffer = @firstchunk.slice!(0, len)
  remaining = len - buffer.length
  if remaining > 0
    tail = @io.read(remaining)
    buffer << tail if tail
  end

  buffer.empty? ? nil : buffer
end
6
332
  end
7
333
  end
8
334
  end