nokogiri 1.0.0 → 1.6.8.1

Sign up to get free protection for your applications and to get access to all the features.

Potentially problematic release.


This version of nokogiri might be problematic. Click here for more details.

Files changed (309) hide show
  1. checksums.yaml +7 -0
  2. data/.autotest +26 -0
  3. data/.cross_rubies +9 -0
  4. data/.editorconfig +17 -0
  5. data/.gemtest +0 -0
  6. data/.travis.yml +51 -0
  7. data/CHANGELOG.rdoc +1160 -0
  8. data/CONTRIBUTING.md +42 -0
  9. data/C_CODING_STYLE.rdoc +33 -0
  10. data/Gemfile +22 -0
  11. data/LICENSE.txt +31 -0
  12. data/Manifest.txt +284 -40
  13. data/README.md +166 -0
  14. data/ROADMAP.md +111 -0
  15. data/Rakefile +310 -199
  16. data/STANDARD_RESPONSES.md +47 -0
  17. data/Y_U_NO_GEMSPEC.md +155 -0
  18. data/appveyor.yml +22 -0
  19. data/bin/nokogiri +118 -0
  20. data/build_all +45 -0
  21. data/dependencies.yml +29 -0
  22. data/ext/nokogiri/depend +358 -0
  23. data/ext/nokogiri/extconf.rb +664 -34
  24. data/ext/nokogiri/html_document.c +120 -33
  25. data/ext/nokogiri/html_document.h +1 -1
  26. data/ext/nokogiri/html_element_description.c +279 -0
  27. data/ext/nokogiri/html_element_description.h +10 -0
  28. data/ext/nokogiri/html_entity_lookup.c +32 -0
  29. data/ext/nokogiri/html_entity_lookup.h +8 -0
  30. data/ext/nokogiri/html_sax_parser_context.c +116 -0
  31. data/ext/nokogiri/html_sax_parser_context.h +11 -0
  32. data/ext/nokogiri/html_sax_push_parser.c +87 -0
  33. data/ext/nokogiri/html_sax_push_parser.h +9 -0
  34. data/ext/nokogiri/nokogiri.c +145 -0
  35. data/ext/nokogiri/nokogiri.h +131 -0
  36. data/ext/nokogiri/xml_attr.c +94 -0
  37. data/ext/nokogiri/xml_attr.h +9 -0
  38. data/ext/nokogiri/xml_attribute_decl.c +70 -0
  39. data/ext/nokogiri/xml_attribute_decl.h +9 -0
  40. data/ext/nokogiri/xml_cdata.c +23 -19
  41. data/ext/nokogiri/xml_cdata.h +1 -1
  42. data/ext/nokogiri/xml_comment.c +69 -0
  43. data/ext/nokogiri/xml_comment.h +9 -0
  44. data/ext/nokogiri/xml_document.c +501 -54
  45. data/ext/nokogiri/xml_document.h +14 -1
  46. data/ext/nokogiri/xml_document_fragment.c +48 -0
  47. data/ext/nokogiri/xml_document_fragment.h +10 -0
  48. data/ext/nokogiri/xml_dtd.c +109 -24
  49. data/ext/nokogiri/xml_dtd.h +3 -1
  50. data/ext/nokogiri/xml_element_content.c +123 -0
  51. data/ext/nokogiri/xml_element_content.h +10 -0
  52. data/ext/nokogiri/xml_element_decl.c +69 -0
  53. data/ext/nokogiri/xml_element_decl.h +9 -0
  54. data/ext/nokogiri/xml_encoding_handler.c +79 -0
  55. data/ext/nokogiri/xml_encoding_handler.h +8 -0
  56. data/ext/nokogiri/xml_entity_decl.c +110 -0
  57. data/ext/nokogiri/xml_entity_decl.h +10 -0
  58. data/ext/nokogiri/xml_entity_reference.c +52 -0
  59. data/ext/nokogiri/xml_entity_reference.h +9 -0
  60. data/ext/nokogiri/xml_io.c +60 -0
  61. data/ext/nokogiri/xml_io.h +11 -0
  62. data/ext/nokogiri/xml_libxml2_hacks.c +112 -0
  63. data/ext/nokogiri/xml_libxml2_hacks.h +12 -0
  64. data/ext/nokogiri/xml_namespace.c +117 -0
  65. data/ext/nokogiri/xml_namespace.h +13 -0
  66. data/ext/nokogiri/xml_node.c +1285 -315
  67. data/ext/nokogiri/xml_node.h +4 -6
  68. data/ext/nokogiri/xml_node_set.c +415 -54
  69. data/ext/nokogiri/xml_node_set.h +6 -2
  70. data/ext/nokogiri/xml_processing_instruction.c +56 -0
  71. data/ext/nokogiri/xml_processing_instruction.h +9 -0
  72. data/ext/nokogiri/xml_reader.c +316 -77
  73. data/ext/nokogiri/xml_reader.h +1 -1
  74. data/ext/nokogiri/xml_relax_ng.c +161 -0
  75. data/ext/nokogiri/xml_relax_ng.h +9 -0
  76. data/ext/nokogiri/xml_sax_parser.c +215 -80
  77. data/ext/nokogiri/xml_sax_parser.h +30 -1
  78. data/ext/nokogiri/xml_sax_parser_context.c +262 -0
  79. data/ext/nokogiri/xml_sax_parser_context.h +10 -0
  80. data/ext/nokogiri/xml_sax_push_parser.c +115 -0
  81. data/ext/nokogiri/xml_sax_push_parser.h +9 -0
  82. data/ext/nokogiri/xml_schema.c +205 -0
  83. data/ext/nokogiri/xml_schema.h +9 -0
  84. data/ext/nokogiri/xml_syntax_error.c +45 -175
  85. data/ext/nokogiri/xml_syntax_error.h +4 -2
  86. data/ext/nokogiri/xml_text.c +37 -14
  87. data/ext/nokogiri/xml_text.h +1 -1
  88. data/ext/nokogiri/xml_xpath_context.c +230 -13
  89. data/ext/nokogiri/xml_xpath_context.h +2 -1
  90. data/ext/nokogiri/xslt_stylesheet.c +196 -34
  91. data/ext/nokogiri/xslt_stylesheet.h +6 -1
  92. data/lib/nokogiri/css/node.rb +18 -61
  93. data/lib/nokogiri/css/parser.rb +725 -17
  94. data/lib/nokogiri/css/parser.y +126 -63
  95. data/lib/nokogiri/css/parser_extras.rb +91 -0
  96. data/lib/nokogiri/css/syntax_error.rb +7 -0
  97. data/lib/nokogiri/css/tokenizer.rb +148 -5
  98. data/lib/nokogiri/css/tokenizer.rex +31 -39
  99. data/lib/nokogiri/css/xpath_visitor.rb +109 -51
  100. data/lib/nokogiri/css.rb +24 -3
  101. data/lib/nokogiri/decorators/slop.rb +42 -0
  102. data/lib/nokogiri/html/builder.rb +27 -1
  103. data/lib/nokogiri/html/document.rb +329 -3
  104. data/lib/nokogiri/html/document_fragment.rb +39 -0
  105. data/lib/nokogiri/html/element_description.rb +23 -0
  106. data/lib/nokogiri/html/element_description_defaults.rb +671 -0
  107. data/lib/nokogiri/html/entity_lookup.rb +13 -0
  108. data/lib/nokogiri/html/sax/parser.rb +35 -4
  109. data/lib/nokogiri/html/sax/parser_context.rb +16 -0
  110. data/lib/nokogiri/html/sax/push_parser.rb +36 -0
  111. data/lib/nokogiri/html.rb +18 -76
  112. data/lib/nokogiri/syntax_error.rb +4 -0
  113. data/lib/nokogiri/version.rb +106 -1
  114. data/lib/nokogiri/xml/attr.rb +14 -0
  115. data/lib/nokogiri/xml/attribute_decl.rb +18 -0
  116. data/lib/nokogiri/xml/builder.rb +395 -31
  117. data/lib/nokogiri/xml/cdata.rb +4 -2
  118. data/lib/nokogiri/xml/character_data.rb +7 -0
  119. data/lib/nokogiri/xml/document.rb +267 -12
  120. data/lib/nokogiri/xml/document_fragment.rb +149 -0
  121. data/lib/nokogiri/xml/dtd.rb +27 -1
  122. data/lib/nokogiri/xml/element_content.rb +36 -0
  123. data/lib/nokogiri/xml/element_decl.rb +13 -0
  124. data/lib/nokogiri/xml/entity_decl.rb +19 -0
  125. data/lib/nokogiri/xml/namespace.rb +13 -0
  126. data/lib/nokogiri/xml/node/save_options.rb +61 -0
  127. data/lib/nokogiri/xml/node.rb +748 -109
  128. data/lib/nokogiri/xml/node_set.rb +200 -72
  129. data/lib/nokogiri/xml/parse_options.rb +120 -0
  130. data/lib/nokogiri/xml/pp/character_data.rb +18 -0
  131. data/lib/nokogiri/xml/pp/node.rb +56 -0
  132. data/lib/nokogiri/xml/pp.rb +2 -0
  133. data/lib/nokogiri/xml/processing_instruction.rb +8 -0
  134. data/lib/nokogiri/xml/reader.rb +102 -4
  135. data/lib/nokogiri/xml/relax_ng.rb +32 -0
  136. data/lib/nokogiri/xml/sax/document.rb +114 -2
  137. data/lib/nokogiri/xml/sax/parser.rb +97 -7
  138. data/lib/nokogiri/xml/sax/parser_context.rb +16 -0
  139. data/lib/nokogiri/xml/sax/push_parser.rb +60 -0
  140. data/lib/nokogiri/xml/sax.rb +2 -7
  141. data/lib/nokogiri/xml/schema.rb +63 -0
  142. data/lib/nokogiri/xml/searchable.rb +221 -0
  143. data/lib/nokogiri/xml/syntax_error.rb +27 -1
  144. data/lib/nokogiri/xml/text.rb +4 -1
  145. data/lib/nokogiri/xml/xpath/syntax_error.rb +11 -0
  146. data/lib/nokogiri/xml/xpath.rb +4 -0
  147. data/lib/nokogiri/xml/xpath_context.rb +3 -1
  148. data/lib/nokogiri/xml.rb +45 -38
  149. data/lib/nokogiri/xslt/stylesheet.rb +19 -0
  150. data/lib/nokogiri/xslt.rb +47 -2
  151. data/lib/nokogiri.rb +117 -24
  152. data/lib/xsd/xmlparser/nokogiri.rb +102 -0
  153. data/patches/sort-patches-by-date +25 -0
  154. data/ports/archives/libxml2-2.9.4.tar.gz +0 -0
  155. data/ports/archives/libxslt-1.1.29.tar.gz +0 -0
  156. data/suppressions/README.txt +1 -0
  157. data/suppressions/nokogiri_ree-1.8.7.358.supp +61 -0
  158. data/suppressions/nokogiri_ruby-1.8.7.370.supp +0 -0
  159. data/suppressions/nokogiri_ruby-1.9.2.320.supp +28 -0
  160. data/suppressions/nokogiri_ruby-1.9.3.327.supp +28 -0
  161. data/tasks/test.rb +100 -0
  162. data/test/css/test_nthiness.rb +73 -6
  163. data/test/css/test_parser.rb +184 -39
  164. data/test/css/test_tokenizer.rb +72 -19
  165. data/test/css/test_xpath_visitor.rb +44 -2
  166. data/test/decorators/test_slop.rb +20 -0
  167. data/test/files/2ch.html +108 -0
  168. data/test/files/GH_1042.html +18 -0
  169. data/test/files/address_book.rlx +12 -0
  170. data/test/files/address_book.xml +10 -0
  171. data/test/files/atom.xml +344 -0
  172. data/test/files/bar/bar.xsd +4 -0
  173. data/test/files/bogus.xml +0 -0
  174. data/test/files/dont_hurt_em_why.xml +422 -0
  175. data/test/files/encoding.html +82 -0
  176. data/test/files/encoding.xhtml +84 -0
  177. data/test/files/exslt.xml +8 -0
  178. data/test/files/exslt.xslt +35 -0
  179. data/test/files/foo/foo.xsd +4 -0
  180. data/test/files/metacharset.html +10 -0
  181. data/test/files/namespace_pressure_test.xml +1684 -0
  182. data/test/files/noencoding.html +47 -0
  183. data/test/files/po.xml +32 -0
  184. data/test/files/po.xsd +66 -0
  185. data/test/files/saml/saml20assertion_schema.xsd +283 -0
  186. data/test/files/saml/saml20protocol_schema.xsd +302 -0
  187. data/test/files/saml/xenc_schema.xsd +146 -0
  188. data/test/files/saml/xmldsig_schema.xsd +318 -0
  189. data/test/files/shift_jis.html +10 -0
  190. data/test/files/shift_jis.xml +5 -0
  191. data/test/files/shift_jis_no_charset.html +9 -0
  192. data/test/files/slow-xpath.xml +25509 -0
  193. data/test/files/snuggles.xml +3 -0
  194. data/test/files/staff.dtd +10 -0
  195. data/test/files/test_document_url/bar.xml +2 -0
  196. data/test/files/test_document_url/document.dtd +4 -0
  197. data/test/files/test_document_url/document.xml +6 -0
  198. data/test/files/tlm.html +2 -1
  199. data/test/files/to_be_xincluded.xml +2 -0
  200. data/test/files/valid_bar.xml +2 -0
  201. data/test/files/xinclude.xml +4 -0
  202. data/test/helper.rb +124 -13
  203. data/test/html/sax/test_parser.rb +118 -4
  204. data/test/html/sax/test_parser_context.rb +46 -0
  205. data/test/html/sax/test_push_parser.rb +87 -0
  206. data/test/html/test_builder.rb +94 -8
  207. data/test/html/test_document.rb +626 -11
  208. data/test/html/test_document_encoding.rb +145 -0
  209. data/test/html/test_document_fragment.rb +301 -0
  210. data/test/html/test_element_description.rb +105 -0
  211. data/test/html/test_named_characters.rb +14 -0
  212. data/test/html/test_node.rb +212 -0
  213. data/test/html/test_node_encoding.rb +85 -0
  214. data/test/namespaces/test_additional_namespaces_in_builder_doc.rb +14 -0
  215. data/test/namespaces/test_namespaces_aliased_default.rb +24 -0
  216. data/test/namespaces/test_namespaces_in_builder_doc.rb +75 -0
  217. data/test/namespaces/test_namespaces_in_cloned_doc.rb +31 -0
  218. data/test/namespaces/test_namespaces_in_created_doc.rb +75 -0
  219. data/test/namespaces/test_namespaces_in_parsed_doc.rb +80 -0
  220. data/test/namespaces/test_namespaces_preservation.rb +31 -0
  221. data/test/test_convert_xpath.rb +2 -47
  222. data/test/test_css_cache.rb +45 -0
  223. data/test/test_encoding_handler.rb +48 -0
  224. data/test/test_memory_leak.rb +156 -0
  225. data/test/test_nokogiri.rb +103 -1
  226. data/test/test_soap4r_sax.rb +52 -0
  227. data/test/test_xslt_transforms.rb +293 -8
  228. data/test/xml/node/test_save_options.rb +28 -0
  229. data/test/xml/node/test_subclass.rb +44 -0
  230. data/test/xml/sax/test_parser.rb +309 -8
  231. data/test/xml/sax/test_parser_context.rb +115 -0
  232. data/test/xml/sax/test_push_parser.rb +157 -0
  233. data/test/xml/test_attr.rb +67 -0
  234. data/test/xml/test_attribute_decl.rb +86 -0
  235. data/test/xml/test_builder.rb +327 -2
  236. data/test/xml/test_c14n.rb +180 -0
  237. data/test/xml/test_cdata.rb +32 -2
  238. data/test/xml/test_comment.rb +40 -0
  239. data/test/xml/test_document.rb +846 -35
  240. data/test/xml/test_document_encoding.rb +31 -0
  241. data/test/xml/test_document_fragment.rb +271 -0
  242. data/test/xml/test_dtd.rb +153 -9
  243. data/test/xml/test_dtd_encoding.rb +31 -0
  244. data/test/xml/test_element_content.rb +56 -0
  245. data/test/xml/test_element_decl.rb +73 -0
  246. data/test/xml/test_entity_decl.rb +122 -0
  247. data/test/xml/test_entity_reference.rb +251 -0
  248. data/test/xml/test_namespace.rb +96 -0
  249. data/test/xml/test_node.rb +1126 -105
  250. data/test/xml/test_node_attributes.rb +115 -0
  251. data/test/xml/test_node_encoding.rb +69 -0
  252. data/test/xml/test_node_inheritance.rb +32 -0
  253. data/test/xml/test_node_reparenting.rb +549 -0
  254. data/test/xml/test_node_set.rb +668 -9
  255. data/test/xml/test_parse_options.rb +64 -0
  256. data/test/xml/test_processing_instruction.rb +30 -0
  257. data/test/xml/test_reader.rb +589 -0
  258. data/test/xml/test_reader_encoding.rb +134 -0
  259. data/test/xml/test_relax_ng.rb +60 -0
  260. data/test/xml/test_schema.rb +142 -0
  261. data/test/xml/test_syntax_error.rb +30 -0
  262. data/test/xml/test_text.rb +49 -2
  263. data/test/xml/test_unparented_node.rb +440 -0
  264. data/test/xml/test_xinclude.rb +83 -0
  265. data/test/xml/test_xpath.rb +445 -0
  266. data/test/xslt/test_custom_functions.rb +133 -0
  267. data/test/xslt/test_exception_handling.rb +37 -0
  268. data/test_all +107 -0
  269. metadata +459 -115
  270. data/History.txt +0 -6
  271. data/README.ja.txt +0 -86
  272. data/README.txt +0 -87
  273. data/ext/nokogiri/html_sax_parser.c +0 -32
  274. data/ext/nokogiri/html_sax_parser.h +0 -11
  275. data/ext/nokogiri/native.c +0 -40
  276. data/ext/nokogiri/native.h +0 -51
  277. data/ext/nokogiri/xml_xpath.c +0 -46
  278. data/ext/nokogiri/xml_xpath.h +0 -11
  279. data/lib/nokogiri/css/generated_parser.rb +0 -653
  280. data/lib/nokogiri/css/generated_tokenizer.rb +0 -159
  281. data/lib/nokogiri/decorators/hpricot/node.rb +0 -58
  282. data/lib/nokogiri/decorators/hpricot/node_set.rb +0 -14
  283. data/lib/nokogiri/decorators/hpricot/xpath_visitor.rb +0 -17
  284. data/lib/nokogiri/decorators/hpricot.rb +0 -3
  285. data/lib/nokogiri/decorators.rb +0 -1
  286. data/lib/nokogiri/hpricot.rb +0 -47
  287. data/lib/nokogiri/xml/after_handler.rb +0 -18
  288. data/lib/nokogiri/xml/before_handler.rb +0 -32
  289. data/lib/nokogiri/xml/element.rb +0 -6
  290. data/lib/nokogiri/xml/entity_declaration.rb +0 -9
  291. data/nokogiri.gemspec +0 -34
  292. data/test/hpricot/files/basic.xhtml +0 -17
  293. data/test/hpricot/files/boingboing.html +0 -2266
  294. data/test/hpricot/files/cy0.html +0 -3653
  295. data/test/hpricot/files/immob.html +0 -400
  296. data/test/hpricot/files/pace_application.html +0 -1320
  297. data/test/hpricot/files/tenderlove.html +0 -16
  298. data/test/hpricot/files/uswebgen.html +0 -220
  299. data/test/hpricot/files/utf8.html +0 -1054
  300. data/test/hpricot/files/week9.html +0 -1723
  301. data/test/hpricot/files/why.xml +0 -19
  302. data/test/hpricot/load_files.rb +0 -7
  303. data/test/hpricot/test_alter.rb +0 -67
  304. data/test/hpricot/test_builder.rb +0 -27
  305. data/test/hpricot/test_parser.rb +0 -423
  306. data/test/hpricot/test_paths.rb +0 -15
  307. data/test/hpricot/test_preserved.rb +0 -78
  308. data/test/hpricot/test_xml.rb +0 -30
  309. data/test/test_reader.rb +0 -222
@@ -0,0 +1,32 @@
1
+ module Nokogiri
2
+ module XML
3
+ class << self
4
+ ###
5
+ # Create a new Nokogiri::XML::RelaxNG document from +string_or_io+.
6
+ # See Nokogiri::XML::RelaxNG for an example.
7
+ def RelaxNG string_or_io
8
+ RelaxNG.new(string_or_io)
9
+ end
10
+ end
11
+
12
+ ###
13
+ # Nokogiri::XML::RelaxNG is used for validating XML against a
14
+ # RelaxNG schema.
15
+ #
16
+ # == Synopsis
17
+ #
18
+ # Validate an XML document against a RelaxNG schema. Loop over the errors
19
+ # that are returned and print them out:
20
+ #
21
+ # schema = Nokogiri::XML::RelaxNG(File.open(ADDRESS_SCHEMA_FILE))
22
+ # doc = Nokogiri::XML(File.open(ADDRESS_XML_FILE))
23
+ #
24
+ # schema.validate(doc).each do |error|
25
+ # puts error.message
26
+ # end
27
+ #
28
+ # The list of errors are Nokogiri::XML::SyntaxError objects.
29
+ class RelaxNG < Nokogiri::XML::Schema
30
+ end
31
+ end
32
+ end
@@ -1,7 +1,78 @@
1
1
  module Nokogiri
2
2
  module XML
3
+ ###
4
+ # SAX Parsers are event driven parsers. Nokogiri provides two different
5
+ # event based parsers when dealing with XML. If you want to do SAX style
6
+ # parsing using HTML, check out Nokogiri::HTML::SAX.
7
+ #
8
+ # The basic way a SAX style parser works is by creating a parser,
9
+ # telling the parser about the events we're interested in, then giving
10
+ # the parser some XML to process. The parser will notify you when
11
+ # it encounters events you said you would like to know about.
12
+ #
13
+ # To register for events, you simply subclass Nokogiri::XML::SAX::Document,
14
+ # and implement the methods for which you would like notification.
15
+ #
16
+ # For example, if I want to be notified when a document ends, and when an
17
+ # element starts, I would write a class like this:
18
+ #
19
+ # class MyDocument < Nokogiri::XML::SAX::Document
20
+ # def end_document
21
+ # puts "the document has ended"
22
+ # end
23
+ #
24
+ # def start_element name, attributes = []
25
+ # puts "#{name} started"
26
+ # end
27
+ # end
28
+ #
29
+ # Then I would instantiate a SAX parser with this document, and feed the
30
+ # parser some XML:
31
+ #
32
+ # # Create a new parser
33
+ # parser = Nokogiri::XML::SAX::Parser.new(MyDocument.new)
34
+ #
35
+ # # Feed the parser some XML
36
+ # parser.parse(File.open(ARGV[0]))
37
+ #
38
+ # Now my document handler will be called when each node starts, and when
39
+ # the document ends. To see what kinds of events are available, take
40
+ # a look at Nokogiri::XML::SAX::Document.
41
+ #
42
+ # Two SAX parsers for XML are available, a parser that reads from a string
43
+ # or IO object as it feels necessary, and a parser that lets you spoon
44
+ # feed it XML. If you want to let Nokogiri deal with reading your XML,
45
+ # use the Nokogiri::XML::SAX::Parser. If you want to have fine-grained
46
+ # control over the XML input, use the Nokogiri::XML::SAX::PushParser.
3
47
  module SAX
48
+ ###
49
+ # This class is used for registering types of events you are interested
50
+ # in handling. All of the methods on this class are available as
51
+ # possible events while parsing an XML document. To register for any
52
+ # particular event, just subclass this class and implement the methods
53
+ # you are interested in knowing about.
54
+ #
55
+ # To only be notified about start and end element events, write a class
56
+ # like this:
57
+ #
58
+ # class MyDocument < Nokogiri::XML::SAX::Document
59
+ # def start_element name, attrs = []
60
+ # puts "#{name} started!"
61
+ # end
62
+ #
63
+ # def end_element name
64
+ # puts "#{name} ended"
65
+ # end
66
+ # end
67
+ #
68
+ # You can use this event handler for any SAX style parser included with
69
+ # Nokogiri. See Nokogiri::XML::SAX, and Nokogiri::HTML::SAX.
4
70
  class Document
71
+ ###
72
+ # Called when an XML declaration is parsed
73
+ def xmldecl version, encoding, standalone
74
+ end
75
+
5
76
  ###
6
77
  # Called when document starts parsing
7
78
  def start_document
@@ -14,7 +85,9 @@ module Nokogiri
14
85
 
15
86
  ###
16
87
  # Called at the beginning of an element
17
- # +name+ is the name of the tag with +attrs+ as attributes
88
+ # * +name+ is the name of the tag
89
+ # * +attrs+ are an assoc list of namespaces and attributes, e.g.:
90
+ # [ ["xmlns:foo", "http://sample.net"], ["size", "large"] ]
18
91
  def start_element name, attrs = []
19
92
  end
20
93
 
@@ -25,7 +98,39 @@ module Nokogiri
25
98
  end
26
99
 
27
100
  ###
28
- # Characters read between a tag
101
+ # Called at the beginning of an element
102
+ # +name+ is the element name
103
+ # +attrs+ is a list of attributes
104
+ # +prefix+ is the namespace prefix for the element
105
+ # +uri+ is the associated namespace URI
106
+ # +ns+ is a list of namespace [prefix, uri] pairs associated with the element
107
+ def start_element_namespace name, attrs = [], prefix = nil, uri = nil, ns = []
108
+ ###
109
+ # Deal with SAX v1 interface
110
+ name = [prefix, name].compact.join(':')
111
+ attributes = ns.map { |ns_prefix,ns_uri|
112
+ [['xmlns', ns_prefix].compact.join(':'), ns_uri]
113
+ } + attrs.map { |attr|
114
+ [[attr.prefix, attr.localname].compact.join(':'), attr.value]
115
+ }
116
+ start_element name, attributes
117
+ end
118
+
119
+ ###
120
+ # Called at the end of an element
121
+ # +name+ is the element's name
122
+ # +prefix+ is the namespace prefix associated with the element
123
+ # +uri+ is the associated namespace URI
124
+ def end_element_namespace name, prefix = nil, uri = nil
125
+ ###
126
+ # Deal with SAX v1 interface
127
+ end_element [prefix, name].compact.join(':')
128
+ end
129
+
130
+ ###
131
+ # Characters read between a tag. This method might be called multiple
132
+ # times given one contiguous string of characters.
133
+ #
29
134
  # +string+ contains the character data
30
135
  def characters string
31
136
  end
@@ -53,6 +158,13 @@ module Nokogiri
53
158
  # +string+ contains the cdata content
54
159
  def cdata_block string
55
160
  end
161
+
162
+ ###
163
+ # Called when processing instructions are found
164
+ # +name+ is the target of the instruction
165
+ # +content+ is the value of the instruction
166
+ def processing_instruction name, content
167
+ end
56
168
  end
57
169
  end
58
170
  end
@@ -1,31 +1,121 @@
1
1
  module Nokogiri
2
2
  module XML
3
3
  module SAX
4
+ ###
5
+ # This parser is a SAX style parser that reads its input as it
6
+ # deems necessary. The parser takes a Nokogiri::XML::SAX::Document,
7
+ # an optional encoding, then given an XML input, sends messages to
8
+ # the Nokogiri::XML::SAX::Document.
9
+ #
10
+ # Here is an example of using this parser:
11
+ #
12
+ # # Create a subclass of Nokogiri::XML::SAX::Document and implement
13
+ # # the events we care about:
14
+ # class MyDoc < Nokogiri::XML::SAX::Document
15
+ # def start_element name, attrs = []
16
+ # puts "starting: #{name}"
17
+ # end
18
+ #
19
+ # def end_element name
20
+ # puts "ending: #{name}"
21
+ # end
22
+ # end
23
+ #
24
+ # # Create our parser
25
+ # parser = Nokogiri::XML::SAX::Parser.new(MyDoc.new)
26
+ #
27
+ # # Send some XML to the parser
28
+ # parser.parse(File.open(ARGV[0]))
29
+ #
30
+ # For more information about SAX parsers, see Nokogiri::XML::SAX. Also
31
+ # see Nokogiri::XML::SAX::Document for the available events.
4
32
  class Parser
33
+ class Attribute < Struct.new(:localname, :prefix, :uri, :value)
34
+ end
35
+
36
+ # Encodings this parser supports
37
+ ENCODINGS = {
38
+ 'NONE' => 0, # No char encoding detected
39
+ 'UTF-8' => 1, # UTF-8
40
+ 'UTF16LE' => 2, # UTF-16 little endian
41
+ 'UTF16BE' => 3, # UTF-16 big endian
42
+ 'UCS4LE' => 4, # UCS-4 little endian
43
+ 'UCS4BE' => 5, # UCS-4 big endian
44
+ 'EBCDIC' => 6, # EBCDIC uh!
45
+ 'UCS4-2143' => 7, # UCS-4 unusual ordering
46
+ 'UCS4-3412' => 8, # UCS-4 unusual ordering
47
+ 'UCS2' => 9, # UCS-2
48
+ 'ISO-8859-1' => 10, # ISO-8859-1 ISO Latin 1
49
+ 'ISO-8859-2' => 11, # ISO-8859-2 ISO Latin 2
50
+ 'ISO-8859-3' => 12, # ISO-8859-3
51
+ 'ISO-8859-4' => 13, # ISO-8859-4
52
+ 'ISO-8859-5' => 14, # ISO-8859-5
53
+ 'ISO-8859-6' => 15, # ISO-8859-6
54
+ 'ISO-8859-7' => 16, # ISO-8859-7
55
+ 'ISO-8859-8' => 17, # ISO-8859-8
56
+ 'ISO-8859-9' => 18, # ISO-8859-9
57
+ 'ISO-2022-JP' => 19, # ISO-2022-JP
58
+ 'SHIFT-JIS' => 20, # Shift_JIS
59
+ 'EUC-JP' => 21, # EUC-JP
60
+ 'ASCII' => 22, # pure ASCII
61
+ }
62
+
63
+ # The Nokogiri::XML::SAX::Document where events will be sent.
5
64
  attr_accessor :document
6
- def initialize(doc = XML::SAX::Document.new)
65
+
66
+ # The encoding being used for this document.
67
+ attr_accessor :encoding
68
+
69
+ # Create a new Parser with +doc+ and +encoding+
70
+ def initialize doc = Nokogiri::XML::SAX::Document.new, encoding = 'UTF-8'
71
+ check_encoding(encoding)
72
+ @encoding = encoding
7
73
  @document = doc
74
+ @warned = false
8
75
  end
9
76
 
10
77
  ###
11
78
  # Parse given +thing+ which may be a string containing xml, or an
12
79
  # IO object.
13
- def parse thing
14
- parse_memory(thing.is_a?(IO) ? thing.read : thing)
80
+ def parse thing, &block
81
+ if thing.respond_to?(:read) && thing.respond_to?(:close)
82
+ parse_io(thing, &block)
83
+ else
84
+ parse_memory(thing, &block)
85
+ end
15
86
  end
16
87
 
17
88
  ###
18
89
  # Parse given +io+
19
- def parse_io io
20
- parse_memory io.read
90
+ def parse_io io, encoding = 'ASCII'
91
+ check_encoding(encoding)
92
+ @encoding = encoding
93
+ ctx = ParserContext.io(io, ENCODINGS[encoding])
94
+ yield ctx if block_given?
95
+ ctx.parse_with self
21
96
  end
22
97
 
23
98
  ###
24
99
  # Parse a file with +filename+
25
100
  def parse_file filename
26
- raise Errno::ENOENT unless File.exists?(filename)
101
+ raise ArgumentError unless filename
102
+ raise Errno::ENOENT unless File.exist?(filename)
27
103
  raise Errno::EISDIR if File.directory?(filename)
28
- native_parse_file filename
104
+ ctx = ParserContext.file filename
105
+ yield ctx if block_given?
106
+ ctx.parse_with self
107
+ end
108
+
109
+ def parse_memory data
110
+ ctx = ParserContext.memory data
111
+ yield ctx if block_given?
112
+ ctx.parse_with self
113
+ end
114
+
115
+ private
116
+ def check_encoding(encoding)
117
+ encoding.upcase!
118
+ raise ArgumentError.new("'#{encoding}' is not a valid encoding") unless ENCODINGS[encoding]
29
119
  end
30
120
  end
31
121
  end
@@ -0,0 +1,16 @@
1
+ module Nokogiri
2
+ module XML
3
+ module SAX
4
+ ###
5
+ # Context for XML SAX parsers. This class is usually not instantiated
6
+ # by the user. Instead, you should be looking at
7
+ # Nokogiri::XML::SAX::Parser
8
+ class ParserContext
9
+ def self.new thing, encoding = 'UTF-8'
10
+ [:read, :close].all? { |x| thing.respond_to?(x) } ?
11
+ io(thing, Parser::ENCODINGS[encoding]) : memory(thing)
12
+ end
13
+ end
14
+ end
15
+ end
16
+ end
@@ -0,0 +1,60 @@
1
+ module Nokogiri
2
+ module XML
3
+ module SAX
4
+ ###
5
+ # PushParser can parse a document that is fed to it manually. It
6
+ # must be given a SAX::Document object which will be called with
7
+ # SAX events as the document is being parsed.
8
+ #
9
+ # Calling PushParser#<< writes XML to the parser, calling any SAX
10
+ # callbacks it can.
11
+ #
12
+ # PushParser#finish tells the parser that the document is finished
13
+ # and calls the end_document SAX method.
14
+ #
15
+ # Example:
16
+ #
17
+ # parser = PushParser.new(Class.new(XML::SAX::Document) {
18
+ # def start_document
19
+ # puts "start document called"
20
+ # end
21
+ # }.new)
22
+ # parser << "<div>hello<"
23
+ # parser << "/div>"
24
+ # parser.finish
25
+ class PushParser
26
+
27
+ # The Nokogiri::XML::SAX::Document on which the PushParser will be
28
+ # operating
29
+ attr_accessor :document
30
+
31
+ ###
32
+ # Create a new PushParser with +doc+ as the SAX Document, providing
33
+ # an optional +file_name+ and +encoding+
34
+ def initialize(doc = XML::SAX::Document.new, file_name = nil, encoding = 'UTF-8')
35
+ @document = doc
36
+ @encoding = encoding
37
+ @sax_parser = XML::SAX::Parser.new(doc)
38
+
39
+ ## Create our push parser context
40
+ initialize_native(@sax_parser, file_name)
41
+ end
42
+
43
+ ###
44
+ # Write a +chunk+ of XML to the PushParser. Any callback methods
45
+ # that can be called will be called immediately.
46
+ def write chunk, last_chunk = false
47
+ native_write(chunk, last_chunk)
48
+ end
49
+ alias :<< :write
50
+
51
+ ###
52
+ # Finish the parsing. This method is only necessary for
53
+ # Nokogiri::XML::SAX::Document#end_document to be called.
54
+ def finish
55
+ write '', true
56
+ end
57
+ end
58
+ end
59
+ end
60
+ end
@@ -1,9 +1,4 @@
1
1
  require 'nokogiri/xml/sax/document'
2
+ require 'nokogiri/xml/sax/parser_context'
2
3
  require 'nokogiri/xml/sax/parser'
3
-
4
- module Nokogiri
5
- module XML
6
- module SAX
7
- end
8
- end
9
- end
4
+ require 'nokogiri/xml/sax/push_parser'
@@ -0,0 +1,63 @@
1
+ module Nokogiri
2
+ module XML
3
+ class << self
4
+ ###
5
+ # Create a new Nokogiri::XML::Schema object using a +string_or_io+
6
+ # object.
7
+ def Schema string_or_io
8
+ Schema.new(string_or_io)
9
+ end
10
+ end
11
+
12
+ ###
13
+ # Nokogiri::XML::Schema is used for validating XML against a schema
14
+ # (usually from an xsd file).
15
+ #
16
+ # == Synopsis
17
+ #
18
+ # Validate an XML document against a Schema. Loop over the errors that
19
+ # are returned and print them out:
20
+ #
21
+ # xsd = Nokogiri::XML::Schema(File.read(PO_SCHEMA_FILE))
22
+ # doc = Nokogiri::XML(File.read(PO_XML_FILE))
23
+ #
24
+ # xsd.validate(doc).each do |error|
25
+ # puts error.message
26
+ # end
27
+ #
28
+ # The list of errors are Nokogiri::XML::SyntaxError objects.
29
+ class Schema
30
+ # Errors while parsing the schema file
31
+ attr_accessor :errors
32
+
33
+ ###
34
+ # Create a new Nokogiri::XML::Schema object using a +string_or_io+
35
+ # object.
36
+ def self.new string_or_io
37
+ from_document Nokogiri::XML(string_or_io)
38
+ end
39
+
40
+ ###
41
+ # Validate +thing+ against this schema. +thing+ can be a
42
+ # Nokogiri::XML::Document object, or a filename. An Array of
43
+ # Nokogiri::XML::SyntaxError objects found while validating the
44
+ # +thing+ is returned.
45
+ def validate thing
46
+ if thing.is_a?(Nokogiri::XML::Document)
47
+ validate_document(thing)
48
+ elsif File.file?(thing)
49
+ validate_file(thing)
50
+ else
51
+ raise ArgumentError, "Must provide Nokogiri::Xml::Document or the name of an existing file"
52
+ end
53
+ end
54
+
55
+ ###
56
+ # Returns true if +thing+ is a valid Nokogiri::XML::Document or
57
+ # file.
58
+ def valid? thing
59
+ validate(thing).length == 0
60
+ end
61
+ end
62
+ end
63
+ end
@@ -0,0 +1,221 @@
1
module Nokogiri
  module XML
    #
    #  The Searchable module declares the interface used for searching your DOM.
    #
    #  It implements the public methods `search`, `css`, and `xpath`,
    #  as well as allowing specific implementations to specialize some
    #  of the important behaviors.
    #
    #  Including classes are expected to provide +document+ and
    #  +implied_xpath_contexts+, which the methods below call.
    #
    module Searchable
      # Regular expression used by Searchable#search to determine if a query
      # string is CSS or XPath
      LOOKS_LIKE_XPATH = /^(\.\/|\/|\.\.|\.$)/

      ###
      # call-seq: search *paths, [namespace-bindings, xpath-variable-bindings, custom-handler-class]
      #
      # Search this object for +paths+. +paths+ must be one or more XPath or CSS queries:
      #
      #   node.search("div.employee", ".//title")
      #
      # A hash of namespace bindings may be appended:
      #
      #   node.search('.//bike:tire', {'bike' => 'http://schwinn.com/'})
      #   node.search('bike|tire', {'bike' => 'http://schwinn.com/'})
      #
      # For XPath queries, a hash of variable bindings may also be
      # appended to the namespace bindings. For example:
      #
      #   node.search('.//address[@domestic=$value]', nil, {:value => 'Yes'})
      #
      # Custom XPath functions and CSS pseudo-selectors may also be
      # defined.  To define custom functions create a class and
      # implement the function you want to define.  The first argument
      # to the method will be the current matching NodeSet.  Any other
      # arguments are ones that you pass in.  Note that this class may
      # appear anywhere in the argument list.  For example:
      #
      #   node.search('.//title[regex(., "\w+")]', 'div.employee:regex("[0-9]+")',
      #     Class.new {
      #       def regex node_set, regex
      #         node_set.find_all { |node| node['some_attribute'] =~ /#{regex}/ }
      #       end
      #     }.new
      #   )
      #
      # See Searchable#xpath and Searchable#css for further usage help.
      def search *args
        paths, handler, ns, binds = extract_params(args)

        # Queries matching LOOKS_LIKE_XPATH are passed through untouched;
        # everything else is treated as CSS and translated to XPath.
        xpaths = paths.map do |path|
          query = path.to_s
          (query =~ LOOKS_LIKE_XPATH) ? query : xpath_query_from_css_rule(query, ns)
        end.flatten.uniq

        xpath(*(xpaths + [ns, handler, binds].compact))
      end
      alias :/ :search

      ###
      # call-seq: at *paths, [namespace-bindings, xpath-variable-bindings, custom-handler-class]
      #
      # Search this object for +paths+, and return only the first
      # result. +paths+ must be one or more XPath or CSS queries.
      #
      # See Searchable#search for more information.
      def at *args
        search(*args).first
      end
      alias :% :at

      ###
      # call-seq: css *rules, [namespace-bindings, custom-pseudo-class]
      #
      # Search this object for CSS +rules+. +rules+ must be one or more CSS
      # selectors. For example:
      #
      #   node.css('title')
      #   node.css('body h1.bold')
      #   node.css('div + p.green', 'div#one')
      #
      # A hash of namespace bindings may be appended. For example:
      #
      #   node.css('bike|tire', {'bike' => 'http://schwinn.com/'})
      #
      # Custom CSS pseudo classes may also be defined.  To define
      # custom pseudo classes, create a class and implement the custom
      # pseudo class you want defined.  The first argument to the
      # method will be the current matching NodeSet.  Any other
      # arguments are ones that you pass in.  For example:
      #
      #   node.css('title:regex("\w+")', Class.new {
      #     def regex node_set, regex
      #       node_set.find_all { |node| node['some_attribute'] =~ /#{regex}/ }
      #     end
      #   }.new)
      #
      # Note that the CSS query string is case-sensitive with regards
      # to your document type. That is, if you're looking for "H1" in
      # an HTML document, you'll never find anything, since HTML tags
      # will match only lowercase CSS queries. However, "H1" might be
      # found in an XML document, where tag names are case-sensitive
      # (e.g., "H1" is distinct from "h1").
      #
      def css *args
        rules, handler, ns, _ = extract_params(args)

        css_internal self, rules, handler, ns
      end

      ##
      # call-seq: at_css *rules, [namespace-bindings, custom-pseudo-class]
      #
      # Search this object for CSS +rules+, and return only the first
      # match. +rules+ must be one or more CSS selectors.
      #
      # See Searchable#css for more information.
      def at_css *args
        css(*args).first
      end

      ###
      # call-seq: xpath *paths, [namespace-bindings, variable-bindings, custom-handler-class]
      #
      # Search this node for XPath +paths+. +paths+ must be one or more XPath
      # queries.
      #
      #   node.xpath('.//title')
      #
      # A hash of namespace bindings may be appended. For example:
      #
      #   node.xpath('.//foo:name', {'foo' => 'http://example.org/'})
      #   node.xpath('.//xmlns:name', node.root.namespaces)
      #
      # A hash of variable bindings may also be appended to the namespace bindings. For example:
      #
      #   node.xpath('.//address[@domestic=$value]', nil, {:value => 'Yes'})
      #
      # Custom XPath functions may also be defined.  To define custom
      # functions create a class and implement the function you want
      # to define.  The first argument to the method will be the
      # current matching NodeSet.  Any other arguments are ones that
      # you pass in.  Note that this class may appear anywhere in the
      # argument list.  For example:
      #
      #   node.xpath('.//title[regex(., "\w+")]', Class.new {
      #     def regex node_set, regex
      #       node_set.find_all { |node| node['some_attribute'] =~ /#{regex}/ }
      #     end
      #   }.new)
      #
      def xpath *args
        return NodeSet.new(document) unless document

        paths, handler, ns, binds = extract_params(args)

        # Each path is evaluated in its own freshly configured context.
        sets = paths.map do |path|
          ctx = XPathContext.new(self)
          ctx.register_namespaces(ns)
          # When not running on libxml, the 'xmlns:' prefix is rewritten
          # before evaluation.
          path = path.gsub(/xmlns:/, ' :') unless Nokogiri.uses_libxml?

          binds.each do |key,value|
            ctx.register_variable key.to_s, value
          end if binds

          ctx.evaluate(path, handler)
        end
        # A single query returns its result as-is; several queries are
        # merged into one new NodeSet, in query order.
        return sets.first if sets.length == 1

        NodeSet.new(document) do |combined|
          sets.each do |set|
            set.each do |node|
              combined << node
            end
          end
        end
      end

      ##
      # call-seq: at_xpath *paths, [namespace-bindings, variable-bindings, custom-handler-class]
      #
      # Search this node for XPath +paths+, and return only the first
      # match. +paths+ must be one or more XPath queries.
      #
      # See Searchable#xpath for more information.
      def at_xpath *args
        xpath(*args).first
      end

      private

      # Translate every CSS rule in +rules+ to XPath and run the resulting
      # queries against +node+, forwarding +ns+ and +handler+ when present.
      def css_internal node, rules, handler, ns
        xpaths = rules.map { |rule| xpath_query_from_css_rule(rule, ns) }
        node.xpath(*(xpaths + [ns, handler].compact))
      end

      # Build one XPath union expression for +rule+: the rule is translated
      # once per implied XPath context and the results are joined with ' | '.
      def xpath_query_from_css_rule rule, ns
        implied_xpath_contexts.map do |implied_xpath_context|
          CSS.xpath_for(rule.to_s, :prefix => implied_xpath_context, :ns => ns)
        end.join(' | ')
      end

      # Split a raw argument list into [queries, handler, ns, binds].
      #
      # * handler - the first argument that is not a Hash, String, Symbol
      #   or nil; it may appear anywhere in the list.
      # * ns, binds - up to two trailing Hash-or-nil arguments, in that
      #   order.  A nil placeholder may stand in for +ns+.
      # * ns falls back to the namespaces of the document root, or to an
      #   empty Hash when there is no document or no root.
      def extract_params params # :nodoc:
        # nil is only ever a placeholder for a missing hash, and subclasses
        # of Hash/String/Symbol are still hashes and queries, so none of
        # them may be mistaken for the handler object.
        handler = params.find do |param|
          case param
          when Hash, String, Symbol, nil then false
          else true
          end
        end
        params -= [handler] if handler

        hashes = []
        while Hash === params.last || params.last.nil?
          hashes << params.pop
          break if params.empty?
        end
        ns, binds = hashes.reverse

        ns ||= begin
          # xpath tolerates a missing document, so do the same here
          # instead of calling +root+ on nil.
          root = document && document.root
          root ? root.namespaces : {}
        end

        [params, handler, ns, binds]
      end
    end
  end
end