sawtooth 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,4 @@
1
+ *.gem
2
+ .bundle
3
+ Gemfile.lock
4
+ pkg/*
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
# Fetch gems over HTTPS so that downloads cannot be tampered with in transit.
source 'https://rubygems.org'

# Specify your gem's dependencies in sawtooth.gemspec
gemspec
@@ -0,0 +1,38 @@
1
+
2
+ __
3
+ _____....--' .'
4
+ _..___...---'._ o -`(
5
+ _ | | _ \ .--. `\
6
+ ___ __ ___ _| |_ ___ ___ | |_| |__ | \ \ `|
7
+ / __|/ _` \ \ /\ / / __/ _ \ / _ \| __| '_ \ |o o | | |
8
+ \__ \ (_| |\ V V /| || (_) | (_) | |_| | | | \___'.-`. '.
9
+ |___/\__,_| \_/\_/ \__\___/ \___/ \__|_| |_| | `---'
10
+ '^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^'
11
+
12
+ A companion for [nokogiri](http://nokogiri.org) to parse XML files by rules,
13
+ similar to [Apache Commons Digester](http://commons.apache.org/digester/).
14
+
15
+ Converting XML structures into Ruby is most often an unsatisfying task, having
16
+ to choose between implementing a SAX parser (for speed) or using _nokogiri_
17
+ features like CSS selectors for ease of use. At its base _sawtooth_ is parsing
18
+ documents using SAX, but provides an interface to specify rules for handling
19
+ the document.
20
+
21
+ require 'open-uri'
22
+ require 'sawtooth'
23
+
24
+ rules = Sawtooth.rules do
25
+ before { |doc| doc << [] } # 1. create an array for all news items
26
+
27
+ on 'rss/channel/item' do
28
+ on_start { |doc| doc << Hash.new } # 2. on an item create hash
29
+ on_finish { |doc| doc.parent << doc.pop } # 3. when closing an item, pop from stack and
30
+ end # append to parent array (from step 1.)
31
+
32
+ on_text 'rss/channel/item/*' # 4. add contents to hash
33
+ end
34
+
35
+ result = rules.parse(open('http://rss.cnn.com/rss/edition.rss')).root
36
+ p result #=> [{ 'title' => 'Some CNN News...', 'guid' => '...', ... }, ...]
37
+
38
+ This sample shows the DSL exposed to create the XML parsing rules for an RSS feed.
@@ -0,0 +1,12 @@
1
+ require "bundler/gem_tasks"
2
+ require "rake/testtask"
3
+
4
# Running plain `rake` executes the unit tests.
desc 'Default: run unit tests.'
task(:default => [:test])

# Defines the `test` task: adds the test directory to the load path and
# runs every file matching the pattern below.
desc 'Test the sawtooth gem.'
Rake::TestTask.new(:test) do |test_task|
  test_task.libs.push('test')
  test_task.pattern = 'test/**/*_test.rb'
  test_task.verbose = true
end
@@ -0,0 +1,8 @@
1
+ require "sawtooth/rules"
2
+ require "sawtooth/document"
3
+ require "sawtooth/parser"
4
+ require "sawtooth/builder"
5
+
6
# Top-level namespace of the sawtooth gem.
module Sawtooth
  # The version constant is registered for autoloading from
  # "sawtooth/version" instead of being required eagerly.
  autoload(:VERSION, 'sawtooth/version')
end
@@ -0,0 +1,106 @@
1
+ require 'sawtooth/parser'
2
+ require 'sawtooth/rules'
3
+
4
module Sawtooth

  # Creates a new Sawtooth::Builder and evaluates the given block in the
  # context of that builder, so the DSL methods (`on`, `on_text`, ...) can be
  # called directly inside the block.
  #
  #     rules = Sawtooth.rules do
  #       on_text 'rss/channel/item/*'
  #     end
  #
  # Returns the builder instance.
  def self.rules(&block)
    Sawtooth::Builder.new(&block)
  end

  # Provides a nice and hopefully easy to use DSL to build rules and start
  # parsing XML documents with ease.
  #
  class Builder

    # The `Sawtooth::Rules::Set` all DSL methods add their rules to.
    attr_reader :rules

    # Creates a new instance with an empty rule set. If a block is given it
    # is evaluated with `instance_eval`, i.e. `self` inside the block is the
    # builder.
    def initialize(&block)
      @rules = Sawtooth::Rules::Set.new
      instance_eval(&block) if block_given?
    end

    # Get a parser instance operating on the same set of rules. The parser
    # is created on first access and then reused.
    def parser
      @parser ||= Sawtooth::Parser.new(:rules => rules)
    end

    # Shortcut method to parse some input, delegates to the parser.
    def parse(thingy)
      parser.parse(thingy)
    end

    # Registers the block as `:start` callback under the special
    # `@document:before` path, i.e. it is called before the document starts.
    # Does nothing without a block.
    def before(&block)
      rules.add('@document:before', Sawtooth::Rules::CallRule.new(:start => block)) if block_given?
    end

    # Registers the block as `:finish` callback under the special
    # `@document:after` path, i.e. it is called after the document has ended.
    # Does nothing without a block.
    def after(&block)
      rules.add('@document:after', Sawtooth::Rules::CallRule.new(:finish => block)) if block_given?
    end

    # Registers the block as `:start` callback for `path`, i.e. it is called
    # when a matching node starts. Does nothing without a block.
    def on_start(path, &block)
      rules.add(path, Sawtooth::Rules::CallRule.new(:start => block)) if block_given?
    end

    # Registers the block as `:finish` callback for `path`, i.e. it is called
    # when the node has finished parsing and its text and everything else is
    # available. Does nothing without a block. Also available as `on_node`.
    def on_finish(path, &block)
      rules.add(path, Sawtooth::Rules::CallRule.new(:finish => block)) if block_given?
    end
    alias_method :on_node, :on_finish

    # Perform a rule on a path, optionally pass in a custom rule instance.
    #
    # - With a block that takes no parameters, the block itself is handed to
    #   `CallRule.new` (the README uses this form to nest `on_start` and
    #   `on_finish` calls).
    # - With a block that takes parameters, the block becomes the `:start`
    #   callback of a new `CallRule`.
    # - Without a block, the supplied `rule` is added as is.
    #
    # A given block always takes precedence over the `rule` argument; when
    # neither is supplied nothing is added.
    def on(path, rule = nil, &block)
      if block_given?
        rule = if block.arity <= 0
                 Sawtooth::Rules::CallRule.new(&block)
               else
                 Sawtooth::Rules::CallRule.new(:start => block)
               end
      end
      rules.add(path, rule) if rule
    end

    # Use and set a nodes text to the top object in the stack.
    #
    #     # Simple mapping, sets "name"
    #     on_text 'Person/Name'
    #
    #     # Custom mapping
    #     on_text 'Person/Name' => :lastname
    #
    #     # Data Conversion
    #     on_text('Person/Age') { |str| str.to_i }
    #
    #     # Multiple Mappings
    #     on_text 'Person/Name' => :lastname, 'Person/FirstName' => :firstname
    #
    # The `TextRule` tries to set the value using a setter, or a hash
    # accessor and the `document.top` object.
    def on_text(mappings = {}, &block)
      if mappings.respond_to?(:to_str)
        # A single path: let the rule pick its own name.
        rules.add(mappings.to_str, Sawtooth::Rules::TextRule.new(&block))
      else
        # A hash of path => name pairs: one rule per pair.
        mappings.each do |path, name|
          rules.add(path, Sawtooth::Rules::TextRule.new(name, &block))
        end
      end
    end

    # Delegates a whole subtree to another set of rules.
    #
    #     delegate 'Root/Items/*' => other_builder
    #     delegate 'Root/Items/*' => other_builder, :prefix => 'Root'
    #
    # The hash must contain one key ending in `/*` or `/**`; its value is
    # either an object responding to `rules` (e.g. another builder) or the
    # rules themselves. The optional `:prefix` defaults to the path with its
    # last segment and the wildcard removed.
    #
    # Two rules are added: the delegate rule for the wildcard path, and its
    # `before_after_callbacks_rule` for the path without the wildcard.
    #
    # Raises ArgumentError if no key ends in `/*` or `/**`.
    def delegate(delegation = {})
      wildcard = %r{/\*\*?\z}
      path = delegation.keys.find { |k| k.to_s =~ wildcard }
      # Fail with a helpful message instead of a NoMethodError on nil below.
      raise ArgumentError, "delegate requires a path ending in '/*' or '/**'" unless path

      cb_path = path.gsub(wildcard, '')
      to = delegation[path]
      prefix = delegation[:prefix] || path.gsub(%r{/?[^/]+/\*\*?\z}, '')

      rule = Sawtooth::Rules::DelegateRule.new(:path => path, :rules => to.respond_to?(:rules) ? to.rules : to, :prefix => prefix)
      rules.add(cb_path, rule.before_after_callbacks_rule)
      rules.add(path, rule)
    end

    # Pretty print rules, delegates to the rule set.
    def to_pretty_s; rules.print_rules end
  end
end
@@ -0,0 +1,158 @@
1
+ require 'nokogiri/xml/sax'
2
+
3
module Sawtooth

  # SAX document which keeps track of the current parsing state (node path,
  # object stack and text) and forwards basically all calls to the supplied
  # delegate.
  #
  # The document also exposes methods which can be used to directly interact
  # with the stack.
  class Document < ::Nokogiri::XML::SAX::Document

    # A simple document node representation, used for the node (path) stack.
    Node = Struct.new(:namespace, :name, :attributes, :text) do
      # A node is printed as its name.
      def to_s; name end
    end

    # Array with a few stack helpers, used for the named stacks which are
    # available through `Document#[]`.
    class Stack < Array
      # Peek at an element counted from the top, i.e. `peek(0)` is the last
      # element and `peek(1)` the second last element.
      def peek(n = 0)
        self[-(n + 1)]
      end

      # The element on top of the stack, same as `peek(0)`.
      def current; peek(0) end
      alias_method :top, :current

      # The element directly below the top, same as `peek(1)`.
      def parent; peek(1) end

      # The bottom element of the stack.
      def root; first end
    end

    # Special path used for document level callbacks and the name used for
    # comment nodes.
    DOCUMENT_NODE = [Node.new(nil, '@document')]
    COMMENT_NAME = '@comment'

    # Both the stack and the named stacks can be read, the delegate can be
    # read and replaced.
    attr_reader :stack, :stacks
    attr_accessor :delegate

    # Creates a new Document instance with an empty stack
    # and the supplied delegate. The delegate is required to
    # apply the rules.
    def initialize(delegate = nil)
      @delegate = delegate
      reset!
    end

    # Push an element onto the stack, returns the document so calls can be
    # chained.
    def <<(obj)
      stack << obj
      self
    end
    alias_method :push, :<<

    # Pop an element off the stack.
    def pop
      stack.pop
    end

    # Peek at an element in the stack, i.e. element 0 is the last
    # element.
    #
    #     doc.peek     # => returns last element
    #     doc.peek(1)  # => returns second last element
    #
    def peek(n = 0)
      stack[-(n + 1)]
    end

    # Shortcut method for current, i.e. an alias of peek without
    # an argument.
    def current; peek(0) end
    alias_method :top, :current

    # Alias for `peek(1)`.
    def parent; peek(1) end

    # Alias for `stack.first`
    def root; stack.first end

    # Get current path stack, an array with one Node per open element.
    def path; @path_stack end

    # Get current node, i.e. the last node of the path stack.
    def node; @path_stack.last end

    # Direct access to the customizable stacks. An unknown key creates a new,
    # empty Stack.
    def [](key)
      stacks[key]
    end

    # Resets path, stack, the named stacks and the current text.
    def reset!
      @path_stack = []
      @stack = []
      @stacks = Hash.new { |hsh, k| hsh[k] = Stack.new }
      @text = nil
    end

    # Characters and CDATA will be appended to the current text block, a new
    # text block is started if there is none.
    def characters(str)
      @text ||= ""
      @text << str
    end
    alias_method :cdata_block, :characters

    # Called when a comment is encountered. The comment is wrapped in a node
    # named `@comment` and passed to the delegate, together with the full
    # path (`@document`, the open elements and the comment node itself).
    def comment(str)
      cnode = Node.new(nil, COMMENT_NAME, {}, str)
      delegate.comment((DOCUMENT_NODE + path + [cnode]).compact, self, cnode) if delegate.respond_to?(:comment)
    end

    # Called when document starts parsing, clears path and stack
    # and calls the delegate with the special @document path.
    def start_document
      reset!
      delegate.start_document(DOCUMENT_NODE, self) if delegate.respond_to?(:start_document)
    end

    # Called when document ends parsing, calls the delegate with the
    # special @document path.
    def end_document
      delegate.end_document(DOCUMENT_NODE, self) if delegate.respond_to?(:end_document)
    end

    # Called at the beginning of an element. Discards any pending text,
    # pushes a new Node (attributes keyed by their local name, empty text)
    # onto the path and notifies the delegate.
    def start_element_namespace(name, attrs_ary = [], prefix = nil, uri = nil, ns = [])
      @text = nil
      attributes = attrs_ary.inject({}) { |hsh, a| hsh[a.localname] = a.value; hsh }
      element = Node.new(uri, name, attributes, '')
      path << element

      # call delegate
      delegate.start_element(path, self, element) if delegate.respond_to?(:start_element)
    end

    # Called at the end of an element. Stores the collected (stripped) text
    # in the current node, notifies the delegate and finally removes the
    # node from the path.
    def end_element_namespace(name, prefix = nil, uri = nil)
      # fill text
      node.text = @text.to_s.strip if @text

      # call delegate
      delegate.end_element(path, self, node) if delegate.respond_to?(:end_element)

      # clear stack
      @path_stack.pop
      @text = nil
    end

    # Pass a warning along to the delegate.
    def warning(string)
      delegate.warning(path, self, string) if delegate.respond_to?(:warning)
    end

    # Pass an error along to the delegate, the delegate should handle
    # whether to continue or abort parsing.
    def error(string)
      delegate.error(path, self, string) if delegate.respond_to?(:error)
    end
  end
end
@@ -0,0 +1,67 @@
1
+ require 'nokogiri'
2
+
3
+ require 'sawtooth/document'
4
+ require 'sawtooth/rules/set'
5
+
6
module Sawtooth

  # Default Parser implementation, can be used as a
  # starting point for custom implementations.
  #
  # A parser acts as delegate of a `Sawtooth::Document`: for every callback
  # it looks up the matching rule and invokes it.
  class Parser

    # The set of rules this parser works with.
    attr_reader :rules

    # Creates a new instance, uses the rules supplied as `options[:rules]`
    # or an empty `Sawtooth::Rules::Set` otherwise.
    def initialize(options = {})
      @rules = options[:rules] || Sawtooth::Rules::Set.new
    end

    # Delegates to `Rules::Set#add`.
    def add(path, rule)
      rules.add(path, rule)
    end

    # Received a comment node, ignored by default.
    def comment(path, doc, str)
    end

    # Start document callback, invokes `start` on the rule registered as
    # `@document:before`, if any.
    def start_document(path, doc)
      before_rule = rules.find('@document:before')
      return unless before_rule && before_rule.respond_to?(:start)

      before_rule.start(path.join('/'), doc, nil)
    end

    # End document callback, invokes `finish` on the rule registered as
    # `@document:after`, if any.
    def end_document(path, doc)
      after_rule = rules.find('@document:after')
      return unless after_rule && after_rule.respond_to?(:finish)

      after_rule.finish(path.join('/'), doc, nil)
    end

    # Start element callback, invokes `start` on the rule found for the
    # current path, if any.
    def start_element(path, doc, node)
      matching = rules.find(path)
      return unless matching && matching.respond_to?(:start)

      matching.start(path.join('/'), doc, node)
    end

    # End element callback, invokes `finish` on the rule found for the
    # current path, if any.
    def end_element(path, doc, node)
      matching = rules.find(path)
      return unless matching && matching.respond_to?(:finish)

      matching.finish(path.join('/'), doc, node)
    end

    # Error callback, aborts parsing by raising the message.
    def error(path, doc, message)
      raise message
    end

    # Parses an XML thingy, which is handed as is to
    # `Nokogiri::XML::SAX::Parser#parse`. Provides an optional encoding,
    # which defaults to `UTF-8`.
    #
    # Returns the `Sawtooth::Document` used for parsing.
    def parse(thing, encoding = 'UTF-8')
      document = Sawtooth::Document.new(self)
      Nokogiri::XML::SAX::Parser.new(document, encoding).parse(thing)
      document
    end
  end
end
@@ -0,0 +1,6 @@
1
+ require 'sawtooth/rules/base'
2
+ require 'sawtooth/rules/set'
3
+
4
# Load every concrete rule implementation, i.e. all files named `*_rule.rb`
# in the rules/ directory next to this file.
Dir["#{File.dirname(__FILE__)}/rules/*_rule.rb"].each { |rule_file| require rule_file }
@@ -0,0 +1,32 @@
1
module Sawtooth
  module Rules

    # Base rule, provides the two empty callbacks `start` and `finish`,
    # which can be overridden by more specific rules - like the create
    # or call rule etc.
    #
    class Base

      # Called when the beginning of a matching XML node is encountered.
      # Does nothing by default.
      #
      # - path, current (maybe rewritten) path
      # - document, the current sawtooth parser stack (`Sawtooth::Document`)
      # - node, the current node to process
      def start(path, document, node); end

      # Called when the end of a matching XML node is encountered.
      # Does nothing by default.
      #
      # - path, current (maybe rewritten) path
      # - document, the current sawtooth parser stack (`Sawtooth::Document`)
      # - node, the current node
      def finish(path, document, node); end

      # Printable representation of the rule, the name of its class.
      def print_rule
        self.class.name
      end
    end
  end
end