shanna-xml-sax-machines 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/LICENSE ADDED
@@ -0,0 +1,20 @@
1
+ Copyright (c) 2009 "Shane Hanna"
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining
4
+ a copy of this software and associated documentation files (the
5
+ "Software"), to deal in the Software without restriction, including
6
+ without limitation the rights to use, copy, modify, merge, publish,
7
+ distribute, sublicense, and/or sell copies of the Software, and to
8
+ permit persons to whom the Software is furnished to do so, subject to
9
+ the following conditions:
10
+
11
+ The above copyright notice and this permission notice shall be
12
+ included in all copies or substantial portions of the Software.
13
+
14
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
18
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
19
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
20
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.rdoc ADDED
@@ -0,0 +1,43 @@
1
+ = XML SAX Machines
2
+
3
+ * http://github.com/shanna/xml-sax-machines/tree/master
4
+
5
+ == Description
6
+
7
+ Assorted XML SAX readers, filters and writers for nokogiri.
8
+
9
+ == Dependencies
10
+
11
+ Ruby::
12
+ * nokogiri ~> 1.2.2
13
+
14
+ == Install
15
+
16
+ * Via git: git clone git://github.com/shanna/xml-sax-machines.git
17
+ * Via gem: gem install shanna-xml-sax-machines -s http://gems.github.com
18
+
19
+ == Filters
20
+
21
+ === XML::SAX::Filter
22
+ Base class for creating chainable SAX filters.
23
+
24
+ === XML::SAX::Debug
25
+ Debugging output for SAX events.
26
+
27
+ === XML::SAX::Builder
28
+ Build in-memory document trees from SAX streams.
29
+
30
+ === XML::SAX::FragmentBuilder
31
+ Process in-memory record based document fragments. Builds well balanced XML chunks matching an XPath into a partial
32
+ in-memory document tree for processing by a callback block.
33
+
34
+ == TODO
35
+
36
+ * Pipeline and Tee filters.
37
+ * Json reader/writer?
38
+ * Any other Ruby parsers that can generate a SAX stream?
39
+ * Namespace handling in XML::SAX::Builder and XML::SAX::FragmentBuilder.
40
+
41
+ == Copyright
42
+
43
+ Copyright (c) 2009 "Shane Hanna". See LICENSE for details.
data/VERSION.yml ADDED
@@ -0,0 +1,4 @@
1
+ ---
2
+ :major: 0
3
+ :minor: 1
4
+ :patch: 1
@@ -0,0 +1,73 @@
1
+ module XML
2
+ module SAX
3
+
4
+ # Build a Nokogiri::XML::Document from a SAX stream.
5
+ #
6
+ # ==== Example
7
+ #
8
+ # builder = XML::SAX::Builder.new
9
+ # parser = Nokogiri::XML::SAX::PushParser.new(builder)
10
+ # parser << %q{<root>xml content</root>}
11
+ # parser.finish
12
+ #
13
+ # puts builder.document.root.children.to_s #=> xml content
14
+ #
15
+ # ==== See
16
+ # * XML::SAX::Filter
17
+ #
18
+ # --
19
+ # TODO:
20
+ # * Namespaces.
21
+ class Builder < Filter
22
+
23
+ # The document object.
24
+ #
25
+ # ==== Returns
26
+ # Nokogiri::XML::Document
27
+ attr_reader :document
28
+
29
+ def start_document #:nodoc:
30
+ super
31
+ @document = Nokogiri::XML::Document.new
32
+ @context = @document
33
+ end
34
+
35
+ def start_element(name, attributes = []) #:nodoc:
36
+ super
37
+
38
+ el = Nokogiri::XML::Element.new(name, @document)
39
+ Hash[*attributes].each_pair{|k, v| el[k] = v}
40
+ @context = @context.add_child(el)
41
+ end
42
+
43
+ def end_element(name) #:nodoc:
44
+ super
45
+ raise "Unmatched closing element. Got '#{name}' but expected '#{@context.name}'" \
46
+ unless name == @context.name
47
+ @context = @context.parent
48
+ end
49
+
50
+ def characters(string) #:nodoc:
51
+ super
52
+ # http://nokogiri.lighthouseapp.com/projects/19607-nokogiri/tickets/68-xpath-incorrect-when-text-siblings-exist#ticket-68-1
53
+ sibling = @context.children.last
54
+ if sibling.kind_of?(Nokogiri::XML::Text)
55
+ sibling.content += string
56
+ else
57
+ @context.add_child(Nokogiri::XML::Text.new(string, @document))
58
+ end
59
+ end
60
+
61
+ def cdata_block(string) #:nodoc:
62
+ super
63
+ @context.add_child(Nokogiri::XML::CDATA.new(@document, string))
64
+ end
65
+
66
+ def comment(string) #:nodoc:
67
+ super
68
+ @context.add_child(Nokogiri::XML::Comment.new(@document, string))
69
+ end
70
+
71
+ end # Builder
72
+ end # SAX
73
+ end # XML
@@ -0,0 +1,33 @@
1
+ module XML
2
+ module SAX
3
+
4
+ # SAX Debug filter.
5
+ #
6
+ # Warn all SAX event methods before calling the next filter in the chain. Handy as it can be placed anywhere in a
7
+ # pipeline to see what events are being passed to the next filter.
8
+ #
9
+ # ==== See
10
+ # * XML::SAX::Filter
11
+ #
12
+ class Debug < Filter
13
+
14
+ %w{
15
+ cdata_block
16
+ characters
17
+ comment
18
+ end_document
19
+ end_element
20
+ error
21
+ start_document
22
+ start_element
23
+ warning
24
+ }.each do |method|
25
+ define_method(method.to_sym) do |*args|
26
+ warn "#{method}: #{args.inspect}"
27
+ super(*args)
28
+ end
29
+ end
30
+
31
+ end # Debug
32
+ end # SAX
33
+ end # XML
@@ -0,0 +1,85 @@
1
+ module XML
2
+ module SAX
3
+
4
+ # SAX Filter base class.
5
+ #
6
+ # Chain SAX filters together by delegating missing SAX event methods to the next filter in the chain. Simply call
7
+ # super in any SAX event methods you overload to pass the call to the next filter in the chain.
8
+ #
9
+ # Extend this Class rather than <tt>Nokogiri::XML::SAX::Document</tt> which acts as a final filter.
10
+ #
11
+ # ==== See
12
+ # * Nokogiri::XML::SAX::Document
13
+ #
14
+ #--
15
+ # TODO:
16
+ # * Examples.
17
+ class Filter < Nokogiri::XML::SAX::Document
18
+
19
+ # The next filter in the chain.
20
+ attr_accessor :filter
21
+
22
+ # New filter instance.
23
+ #
24
+ # ==== Notes
25
+ # Filter chains are built in reverse, the filter passed during construction is called *after* the current
26
+ # filter.
27
+ #
28
+ # ==== See
29
+ # * XML::SAX::Pipeline
30
+ #
31
+ # ==== Parameters
32
+ # filter<Nokogiri::XML::SAX::Document>::
33
+ # Optional next <tt>XML::SAX::Filter</tt> or <tt>Nokogiri::XML::SAX::Document</tt> (final) in the chain.
34
+ # By default no next filter is set (+nil+), so events are not forwarded and the chain ends at this filter.
35
+ #
36
+ # options<Hash>::
37
+ # Optional per-filter arguments.
38
+ #
39
+ #--
40
+ # TODO:
41
+ # * Barf if the filter isn't a Nokogiri::XML::SAX::Document or XML::SAX::Filter.
42
+ def initialize(filter = nil, options = {})
43
+ @filter = filter
44
+ end
45
+
46
+ def cdata_block(string) #:nodoc:
47
+ @filter.cdata_block(string) if @filter
48
+ end
49
+
50
+ def characters(string) #:nodoc:
51
+ @filter.characters(string) if @filter
52
+ end
53
+
54
+ def comment(string) #:nodoc:
55
+ @filter.comment(string) if @filter
56
+ end
57
+
58
+ def end_document #:nodoc:
59
+ @filter.end_document if @filter
60
+ end
61
+
62
+ def end_element(name) #:nodoc:
63
+ @filter.end_element(name) if @filter
64
+ end
65
+
66
+ def error(string) #:nodoc:
67
+ @filter.error(string) if @filter
68
+ end
69
+
70
+ def start_document #:nodoc:
71
+ @filter.start_document if @filter
72
+ end
73
+
74
+ def start_element(name, attributes = []) #:nodoc:
75
+ @filter.start_element(name, attributes = []) if @filter
76
+ end
77
+
78
+ def warning(string) #:nodoc:
79
+ @filter.warning(string) if @filter
80
+ end
81
+
82
+ end # Filter
83
+ end # SAX
84
+ end # XML
85
+
@@ -0,0 +1,86 @@
1
+ module XML
2
+ module SAX
3
+
4
+ # Build Nokogiri::XML::Document fragments that match an XPath.
5
+ #
6
+ # Stream large (or small) record based XML documents building each matching XPath into a document fragment making
7
+ # further manipulation of each record easier.
8
+ #
9
+ # ==== Notes
10
+ # * In order to save memory well balanced elements that do not match any XPath are unlinked. This means you *cannot*
11
+ # match records by position in relation to siblings.
12
+ # * Because we are parsing a SAX stream there is no read ahead. You *cannot* match records by any children the
13
+ # element may have once further events are pushed.
14
+ # * You can match by attributes of an element.
15
+ #
16
+ # ==== Example
17
+ #
18
+ # builder = XML::SAX::FragmentBuilder.new(nil, {
19
+ # '//record' => lambda{|record| puts record.to_s} # Process each matched record element.
20
+ # })
21
+ # parser = Nokogiri::XML::SAX::PushParser.new(builder)
22
+ # parser << %q{
23
+ # <root>
24
+ # <record id="1">record one</record>
25
+ # <record id="2">record two</record>
26
+ # </root>
27
+ # }
28
+ # #=> <record id="1">record one</record>
29
+ # #=> <record id="2">record two</record>
30
+ # parser.finish
31
+ #
32
+ # ==== See
33
+ # * XML::SAX::Builder
34
+ # * XML::SAX::Filter
35
+ #
36
+ # --
37
+ # TODO:
38
+ # * Namespaces.
39
+ class FragmentBuilder < Builder
40
+ private :document # Would return an empty/partial document you really shouldn't mess with.
41
+
42
+ # ==== Parameters
43
+ # filter<Nokogiri::XML::SAX::Document>::
44
+ # Optional next <tt>XML::SAX::Filter</tt> or <tt>Nokogiri::XML::SAX::Document</tt> (final) in the chain.
45
+ # By default no next filter is set (+nil+), so events are not forwarded and the chain ends at this filter.
46
+ #
47
+ # options<Hash>::
48
+ # {xpath<String> => &block<Proc>} pairs. The first element passed to the block will be the matching
49
+ # Nokogiri::XML::Node. Keep in mind the node will be unlinked after your block returns.
50
+ def initialize(filter = nil, options = {})
51
+ super(filter)
52
+ @find = options
53
+ @found = {}
54
+ @buffer = 0
55
+ end
56
+
57
+ def start_element(name, attributes = []) #:nodoc:
58
+ super
59
+ @find.each_pair do |xpath, block|
60
+ if match = @document.at(xpath)
61
+ unless @found[match.path]
62
+ @buffer += 1
63
+ @found[match.path] = block
64
+ end
65
+ end
66
+ end
67
+ end
68
+
69
+ def end_element(name) #:nodoc:
70
+ path = @context.path
71
+ if @buffer > 0 && block = @found.delete(path)
72
+ @buffer -= 1
73
+ block.call(@context) # Copy @context fragment tree to actual fragment?
74
+ end
75
+ super
76
+
77
+ if @buffer == 0 && !(path == '/')
78
+ # Unlink children if position context []
79
+ # @doc.at(path).children.unlink unless path == '/'
80
+ @document.at(path).unlink
81
+ end
82
+ end
83
+
84
+ end # FragmentBuilder
85
+ end # SAX
86
+ end # XML
@@ -0,0 +1,15 @@
1
+ begin
2
+ require 'nokogiri'
3
+ rescue LoadError
4
+ require 'rubygems'
5
+ require 'nokogiri'
6
+ end
7
+
8
+ module XML
9
+ module SAX
10
+ end # SAX
11
+ end # XML
12
+
13
+ # TODO: Conditionally load some machines?
14
+ base = File.join(File.dirname(__FILE__), 'xml-sax-machines')
15
+ %w{filter debug builder fragment_builder}.each{|r| require File.join(base, r)}
@@ -0,0 +1,54 @@
1
+ require File.join(File.dirname(__FILE__), 'test_helper')
2
+
3
+ class BuilderTest < Test::Unit::TestCase
4
+ context 'XML::SAX::Builder' do
5
+
6
+ should 'create root' do
7
+ assert_equal 'r', build('<r/>').root.name
8
+ end
9
+
10
+ should 'create comments' do
11
+ assert_equal '<!-- woot -->', build('<r><!-- woot --></r>').root.children.to_s
12
+ end
13
+
14
+ should 'create cdata_blocks' do
15
+ assert_equal '<![CDATA[ woot ]]>', build('<r><![CDATA[ woot ]]></r>').root.children.to_s
16
+ end
17
+
18
+ should 'create characters' do
19
+ assert_equal 'woot', build('<r>woot</r>').root.children.to_s
20
+ end
21
+
22
+ should 'create empty element' do
23
+ assert build('<r><foo/></r>').at('/r/foo')
24
+ end
25
+
26
+ should 'create element with attributes' do
27
+ el = build('<r><foo id="1"/></r>').at('/r/foo')
28
+ assert_equal '1', el['id']
29
+ end
30
+
31
+ should 'create element with child element' do
32
+ assert build('<r><foo><bar/></foo></r>').at('/r/foo/bar')
33
+ end
34
+
35
+ should 'create element with mixed content' do
36
+ el = build('<r><foo>text<bar/></foo></r>').at('/r/foo')
37
+ assert_equal 'text<bar/>', el.children.to_s
38
+ end
39
+
40
+ should 'create element siblings' do
41
+ el = build('<r><foo/><bar/></r>').root
42
+ assert_equal 2, el.children.length
43
+ end
44
+ end
45
+
46
+ protected
47
+ def build(string)
48
+ builder = XML::SAX::Builder.new
49
+ parser = Nokogiri::XML::SAX::PushParser.new(builder)
50
+ parser << string
51
+ parser.finish
52
+ builder.document
53
+ end
54
+ end # BuilderTest
@@ -0,0 +1,59 @@
1
+ require File.join(File.dirname(__FILE__), 'test_helper')
2
+
3
+ class DebugTest < Test::Unit::TestCase
4
+ context 'XML::SAX::Debug event method warning' do
5
+
6
+ should 'warn #start_document' do
7
+ assert_match regexp('start_document: []'), parse('<r/>')
8
+ end
9
+
10
+ should 'warn #end_document' do
11
+ assert_match regexp('end_document: []'), parse('<r/>')
12
+ end
13
+
14
+ should 'warn #start_element' do
15
+ assert_match regexp('start_element: ["r", []]'), parse('<r/>')
16
+ end
17
+
18
+ should 'warn #start_element with attributes' do
19
+ assert_match regexp('start_element: ["r", ["id", "1"]]'), parse('<r id="1"/>')
20
+ end
21
+
22
+ should 'warn #end_element' do
23
+ assert_match regexp('end_element: ["r"]'), parse('<r/>')
24
+ end
25
+
26
+ should 'warn #characters' do
27
+ assert_match regexp('characters: ["woot"]'), parse('<r>woot</r>')
28
+ end
29
+
30
+ should 'warn #comment' do
31
+ assert_match regexp('comment: [" woot "]'), parse('<r><!-- woot --></r>')
32
+ end
33
+
34
+ should 'warn #cdata_block' do
35
+ assert_match regexp('cdata_block: [" woot "]'), parse('<r><![CDATA[ woot ]]></r>')
36
+ end
37
+ end
38
+
39
+ protected
40
+ def parse(xml)
41
+ parser = Nokogiri::XML::SAX::PushParser.new(XML::SAX::Debug.new)
42
+ capture_stderr do
43
+ parser << xml
44
+ parser.finish
45
+ end
46
+ end
47
+
48
+ def regexp(string)
49
+ Regexp.compile('^' + Regexp.escape(string))
50
+ end
51
+
52
+ def capture_stderr(&block)
53
+ $stderr = StringIO.new
54
+ yield
55
+ result = $stderr.rewind && $stderr.read
56
+ $stderr = STDERR
57
+ result
58
+ end
59
+ end # DebugTest
@@ -0,0 +1,15 @@
1
+ require File.join(File.dirname(__FILE__), 'test_helper')
2
+
3
+ class FilterTest < Test::Unit::TestCase
4
+ context 'XML::SAX::Filter' do
5
+
6
+ should 'run base filter without error' do
7
+ assert_nothing_thrown do
8
+ parser = Nokogiri::XML::SAX::PushParser.new(XML::SAX::Filter.new)
9
+ parser << '<r/>'
10
+ parser.finish
11
+ end
12
+ end
13
+
14
+ end
15
+ end # FilterTest
@@ -0,0 +1,28 @@
1
+ require File.join(File.dirname(__FILE__), 'test_helper')
2
+
3
+ class FragmentBuilderTest < Test::Unit::TestCase
4
+ context 'XML::SAX::FragmentBuilder' do
5
+
6
+ should 'call callback for record' do
7
+ builder = XML::SAX::FragmentBuilder.new(nil, {
8
+ '//foo' => lambda do |el|
9
+ assert_equal 'foo', el.name, 'foo element'
10
+ assert_equal 1, el.parent.children.length, 'no siblings'
11
+ end
12
+ })
13
+ parser = Nokogiri::XML::SAX::PushParser.new(builder)
14
+ parser << '<r><foo/><foo/><foo/></r>'
15
+ parser.finish
16
+ end
17
+
18
+ should 'have buffered children for record' do
19
+ builder = XML::SAX::FragmentBuilder.new(nil, {
20
+ '//foo' => lambda{|el| assert_equal 2, el.children.length}
21
+ })
22
+ parser = Nokogiri::XML::SAX::PushParser.new(builder)
23
+ parser << '<r><foo>text<el>el</el></foo></r>'
24
+ parser.finish
25
+ end
26
+
27
+ end
28
+ end # FragmentBuilderTest
@@ -0,0 +1,10 @@
1
+ require 'rubygems'
2
+ require 'test/unit'
3
+ require 'shoulda'
4
+
5
+ $LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
6
+ $LOAD_PATH.unshift(File.dirname(__FILE__))
7
+ require 'xml-sax-machines'
8
+
9
+ class Test::Unit::TestCase
10
+ end
metadata ADDED
@@ -0,0 +1,77 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: shanna-xml-sax-machines
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.1
5
+ platform: ruby
6
+ authors:
7
+ - Shane Hanna
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+
12
+ date: 2009-03-17 00:00:00 -07:00
13
+ default_executable:
14
+ dependencies:
15
+ - !ruby/object:Gem::Dependency
16
+ name: nokogiri
17
+ type: :runtime
18
+ version_requirement:
19
+ version_requirements: !ruby/object:Gem::Requirement
20
+ requirements:
21
+ - - ~>
22
+ - !ruby/object:Gem::Version
23
+ version: 1.2.2
24
+ version:
25
+ description: XML SAX Machines
26
+ email: shane.hanna@gmail.com
27
+ executables: []
28
+
29
+ extensions: []
30
+
31
+ extra_rdoc_files:
32
+ - README.rdoc
33
+ - LICENSE
34
+ files:
35
+ - README.rdoc
36
+ - VERSION.yml
37
+ - lib/xml-sax-machines
38
+ - lib/xml-sax-machines/builder.rb
39
+ - lib/xml-sax-machines/debug.rb
40
+ - lib/xml-sax-machines/filter.rb
41
+ - lib/xml-sax-machines/fragment_builder.rb
42
+ - lib/xml-sax-machines.rb
43
+ - test/builder_test.rb
44
+ - test/debug_test.rb
45
+ - test/filter_test.rb
46
+ - test/fragment_builder_test.rb
47
+ - test/test_helper.rb
48
+ - LICENSE
49
+ has_rdoc: true
50
+ homepage: http://github.com/shanna/xml-sax-machines
51
+ post_install_message:
52
+ rdoc_options:
53
+ - --inline-source
54
+ - --charset=UTF-8
55
+ require_paths:
56
+ - lib
57
+ required_ruby_version: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - ">="
60
+ - !ruby/object:Gem::Version
61
+ version: "0"
62
+ version:
63
+ required_rubygems_version: !ruby/object:Gem::Requirement
64
+ requirements:
65
+ - - ">="
66
+ - !ruby/object:Gem::Version
67
+ version: "0"
68
+ version:
69
+ requirements: []
70
+
71
+ rubyforge_project:
72
+ rubygems_version: 1.2.0
73
+ signing_key:
74
+ specification_version: 2
75
+ summary: Assorted XML SAX readers, filters and writers.
76
+ test_files: []
77
+