sawtooth 0.2.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,4 @@
1
+ *.gem
2
+ .bundle
3
+ Gemfile.lock
4
+ pkg/*
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
# Fetch gems over TLS so that downloads cannot be tampered with in transit.
source "https://rubygems.org"

# Specify your gem's dependencies in sawtooth.gemspec
gemspec
@@ -0,0 +1,38 @@
1
+
2
+ __
3
+ _____....--' .'
4
+ _..___...---'._ o -`(
5
+ _ | | _ \ .--. `\
6
+ ___ __ ___ _| |_ ___ ___ | |_| |__ | \ \ `|
7
+ / __|/ _` \ \ /\ / / __/ _ \ / _ \| __| '_ \ |o o | | |
8
+ \__ \ (_| |\ V V /| || (_) | (_) | |_| | | | \___'.-`. '.
9
+ |___/\__,_| \_/\_/ \__\___/ \___/ \__|_| |_| | `---'
10
+ '^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^'
11
+
12
+ A companion for [nokogiri](http://nokogiri.org) to parse XML files by rules,
13
+ similar to [Apache Commons Digester](http://commons.apache.org/digester/).
14
+
15
+ Converting XML structures into Ruby is most often an unsatisfying task, having
16
+ to choose between implementing a SAX parser (for speed) or using _nokogiri_
17
+ features like CSS selectors for ease of use. At its base _sawtooth_ is parsing
18
+ documents using SAX, but provides an interface to specify rules for handling
19
+ the document.
20
+
21
+ require 'open-uri'
22
+ require 'sawtooth'
23
+
24
+ rules = Sawtooth.rules do
25
+ before { |doc| doc << [] } # 1. create an array for all news items
26
+
27
+ on 'rss/channel/item' do
28
+ on_start { |doc| doc << Hash.new } # 2. on an item create hash
29
+ on_finish { |doc| doc.parent << doc.pop } # 3. when closing an item, pop from stack and
30
+ end # append to parent array (from step 1.)
31
+
32
+ on_text 'rss/channel/item/*' # 4. add contents to hash
33
+ end
34
+
35
+ result = rules.parse(open('http://rss.cnn.com/rss/edition.rss')).root
36
+ p result #=> [{ 'title' => 'Some CNN News...', 'guid' => '...', ... }, ...]
37
+
38
+ This sample shows the DSL exposed to create the XML parsing rules for an RSS feed.
@@ -0,0 +1,12 @@
1
+ require "bundler/gem_tasks"
2
+ require "rake/testtask"
3
+
4
# Running `rake` without a task name executes the test suite.
desc 'Default: run unit tests.'
task :default => [:test]

# Runs every file matching test/**/*_test.rb with `test` on the load path.
desc 'Test the sawtooth gem.'
Rake::TestTask.new(:test) do |test_task|
  test_task.verbose = true
  test_task.pattern = 'test/**/*_test.rb'
  test_task.libs.push('test')
end
@@ -0,0 +1,8 @@
1
+ require "sawtooth/rules"
2
+ require "sawtooth/document"
3
+ require "sawtooth/parser"
4
+ require "sawtooth/builder"
5
+
6
# Top-level namespace of the sawtooth gem.
module Sawtooth
  # The version constant is loaded lazily, on first reference.
  autoload(:VERSION, 'sawtooth/version')
end
@@ -0,0 +1,106 @@
1
+ require 'sawtooth/parser'
2
+ require 'sawtooth/rules'
3
+
4
+ module Sawtooth
5
+
6
# Entry point of the DSL: creates a `Sawtooth::Builder` and hands the
# given block over to it, then returns the builder.
def self.rules(&block)
  ::Sawtooth::Builder.new(&block)
end
12
+
13
# Provides a DSL to build a set of rules and to parse XML documents
# with them.
#
class Builder

  # The `Sawtooth::Rules::Set` all DSL methods add their rules to.
  attr_reader :rules

  # Creates a new instance with an empty rule set. A given block is
  # evaluated in the context of the builder (`instance_eval`), so the DSL
  # methods can be called without an explicit receiver.
  def initialize(&block)
    @rules = Sawtooth::Rules::Set.new
    instance_eval(&block) if block
  end

  # Get a (memoized) parser instance which shares this builder's rules.
  def parser
    @parser ||= Sawtooth::Parser.new(:rules => rules)
  end

  # Shortcut method to parse some input, delegates to the parser.
  def parse(thingy)
    parser.parse(thingy)
  end

  # Called before the document starts. Does nothing without a block.
  def before(&block)
    rules.add('@document:before', Sawtooth::Rules::CallRule.new(:start => block)) if block
  end

  # Called after the document has ended. Does nothing without a block.
  def after(&block)
    rules.add('@document:after', Sawtooth::Rules::CallRule.new(:finish => block)) if block
  end

  # Called when a node matching `path` is opened. Does nothing without a
  # block.
  def on_start(path, &block)
    rules.add(path, Sawtooth::Rules::CallRule.new(:start => block)) if block
  end

  # Called when the node has finished parsing, i.e. text and everything
  # is available. Does nothing without a block.
  def on_finish(path, &block)
    rules.add(path, Sawtooth::Rules::CallRule.new(:finish => block)) if block
  end
  alias_method :on_node, :on_finish

  # Perform a rule on a path, optionally pass in a custom rule instance.
  #
  # A given block takes precedence over `rule`: a block without
  # parameters is handed to `CallRule.new` itself, a block with parameters
  # is registered as the rule's `:start` callback.
  def on(path, rule = nil, &block)
    if block
      rule = if block.arity <= 0
        Sawtooth::Rules::CallRule.new(&block)
      else
        Sawtooth::Rules::CallRule.new(:start => block)
      end
    end
    rules.add(path, rule) if rule
  end

  # Use and set a node's text to the top object in the stack.
  #
  #     # Simple mapping, sets "name"
  #     on_text 'Person/Name'
  #
  #     # Custom mapping
  #     on_text 'Person/Name' => :lastname
  #
  #     # Data Conversion
  #     on_text('Person/Age') { |str| str.to_i }
  #
  #     # Multiple Mappings
  #     on_text 'Person/Name' => :lastname, 'Person/FirstName' => :firstname
  #
  # The `TextRule` tries to set the value using a setter, or a hash
  # accessor and the `document.top` object.
  def on_text(mappings = {}, &block)
    if mappings.respond_to?(:to_str)
      rules.add(mappings.to_str, Sawtooth::Rules::TextRule.new(&block))
    else
      mappings.each do |path, name|
        rules.add(path, Sawtooth::Rules::TextRule.new(name, &block))
      end
    end
  end

  # Hands a whole sub tree over to another set of rules.
  #
  # `delegation` must contain one key ending in `/*` or `/**`; its value
  # is the target, either an object responding to `rules` (its rules are
  # used) or the rules themselves. An optional `:prefix` entry overrides
  # the prefix, which otherwise is the path without its last element and
  # the wildcard.
  #
  # Two rules are added: the delegate rule's before/after callbacks rule
  # on the path without the wildcard, and the delegate rule itself on the
  # wildcard path.
  #
  # Raises ArgumentError when no wildcard path is supplied.
  def delegate(delegation = {})
    wildcard = %r{/\*\*?\z}
    key = delegation.keys.find { |k| k.to_s =~ wildcard }
    raise ArgumentError, "delegate requires a path ending in '/*' or '/**'" unless key

    # keys are matched via `to_s`, so work on the string form from here on
    path = key.to_s
    cb_path = path.sub(wildcard, '')
    to = delegation[key]
    prefix = delegation[:prefix] || path.gsub(%r{/?[^/]+/\*\*?\z}, '')

    rule = Sawtooth::Rules::DelegateRule.new(:path => path, :rules => to.respond_to?(:rules) ? to.rules : to, :prefix => prefix)
    rules.add(cb_path, rule.before_after_callbacks_rule)
    rules.add(path, rule)
  end

  # Pretty print rules.
  def to_pretty_s
    rules.print_rules
  end
end
106
+ end
@@ -0,0 +1,158 @@
1
+ require 'nokogiri/xml/sax'
2
+
3
+ module Sawtooth
4
+
5
+ # Provides the current parser stack, delegates
6
+ # basically all calls to the supplied parser.
7
+ #
8
+ # Also the document exposes methods which can be
9
+ # used to directly interact with the stack.
10
+ class Document < ::Nokogiri::XML::SAX::Document
11
+
12
# Minimal representation of a document node, these are the entries of
# the node (path) stack.
Node = Struct.new(:namespace, :name, :attributes, :text) do
  # A node is printed as its name.
  def to_s
    name
  end
end
16
+
17
# Array based stack offering the same navigation helpers as the
# document: `peek`, `current` / `top`, `parent` and `root`.
class Stack < Array
  # Look at an element counted from the top of the stack without
  # removing it, i.e. `peek(0)` is the last element and `peek(1)` the
  # second last. Returns nil when there is no such element.
  def peek(n = 0)
    self[-(n + 1)]
  end

  # The element on top of the stack.
  def current
    peek(0)
  end
  alias_method :top, :current

  # The element directly below the top, same as `peek(1)`.
  #
  # This used to be `peek(-1)`, which resolves to `self[0]` and hence
  # returned the root instead of the parent.
  def parent
    peek(1)
  end

  # The bottom element of the stack.
  def root
    first
  end
end
29
+
30
# Synthetic path used for the document level callbacks, it consists of a
# single node named '@document'.
DOCUMENT_NODE = [Node.new(nil, '@document')]
# Name given to the nodes created for comments.
COMMENT_NAME = '@comment'

# The object stack and the hash of named stacks are readable, the
# delegate receiving the callbacks can be read and replaced.
attr_reader :stack
attr_reader :stacks
attr_accessor :delegate
37
+
38
# Creates a new Document instance which forwards its callbacks to the
# supplied delegate and starts with empty stacks (see `reset!`). The
# delegate is required to apply the rules.
def initialize(delegate = nil)
  reset!
  @delegate = delegate
end
45
+
46
# Pushes an object onto the stack, returns the document itself so calls
# can be chained. Also available as `push`.
def <<(obj)
  stack.push(obj)
  self
end
alias_method :push, :<<
52
+
53
# Removes the top element from the stack and returns it.
def pop
  removed = stack.pop
  removed
end
57
+
58
# Look at an element of the stack without removing it, counted from the
# top, i.e. element 0 is the last element.
#
#     doc.peek     # => returns last element
#     doc.peek(1)  # => returns second last element
#
def peek(n = 0)
  stack[-(n + 1)]
end
67
+
68
# The element on top of the stack, same as `peek` without an argument.
# Also available as `top`.
def current
  peek(0)
end
alias_method :top, :current
72
+
73
# The element directly below the top of the stack, i.e. `peek(1)`.
def parent
  peek(1)
end
75
+
76
# The bottom element of the stack, i.e. `stack.first`.
def root
  stack.first
end
78
+
79
# The stack of nodes leading to the node currently being parsed.
def path
  @path_stack
end
81
+
82
# The node currently being parsed, i.e. the last entry of the path stack.
def node
  @path_stack[-1]
end
84
+
85
# Direct access to the customizable, named stacks kept in `stacks`.
def [](key)
  named_stacks = stacks
  named_stacks[key]
end
89
+
90
# Resets the path stack, the object stack, the named stacks and the
# current text. Named stacks are created on first access.
def reset!
  @stacks = Hash.new { |hsh, k| hsh[k] = Stack.new }
  @stack = []
  @path_stack = []
  @text = nil
end
97
+
98
# Characters and CDATA are appended to the current text block, which is
# created when there is none yet.
def characters(str)
  @text = ''.dup unless @text
  @text << str
end
alias_method :cdata_block, :characters
104
+
105
# Called when comments are encountered. The comment is wrapped into a
# node named `COMMENT_NAME` and handed to the delegate (if it responds to
# `comment`) together with the path of the comment, which starts at the
# special @document node.
def comment(str)
  return unless delegate.respond_to?(:comment)

  cnode = Node.new(nil, COMMENT_NAME, {}, str)
  comment_path = (DOCUMENT_NODE + path + [cnode]).compact
  delegate.comment(comment_path, self, cnode)
end
110
+
111
# Called when the document starts parsing: resets path and stacks, then
# notifies the delegate using the special @document path.
def start_document
  reset!
  return unless delegate.respond_to?(:start_document)

  delegate.start_document(DOCUMENT_NODE, self)
end
117
+
118
# Called when the document ends parsing, notifies the delegate using the
# special @document path.
def end_document
  return unless delegate.respond_to?(:end_document)

  delegate.end_document(DOCUMENT_NODE, self)
end
123
+
124
# Called at the beginning of an element. Discards pending text, pushes a
# node for the element (attributes keyed by their local name, empty
# text) onto the path and notifies the delegate.
def start_element_namespace(name, attrs_ary = [], prefix = nil, uri = nil, ns = [])
  @text = nil

  attributes = {}
  attrs_ary.each { |attr| attributes[attr.localname] = attr.value }
  opened = Node.new(uri, name, attributes, '')
  path << opened

  # call delegate
  return unless delegate.respond_to?(:start_element)

  delegate.start_element(path, self, opened)
end
133
+
134
# Called at the end of an element. Stores the collected (stripped) text
# on the current node, notifies the delegate and only then removes the
# node from the path.
def end_element_namespace(name, prefix = nil, uri = nil)
  # fill text
  if @text
    node.text = @text.to_s.strip
  end

  # call delegate
  if delegate.respond_to?(:end_element)
    delegate.end_element(path, self, node)
  end

  # clear stack
  @path_stack.pop
  @text = nil
end
146
+
147
# Pass a warning along to the delegate (the parser), together with the
# current path and the document.
def warning(string)
  return unless delegate.respond_to?(:warning)

  delegate.warning(path, self, string)
end
151
+
152
# Pass an error along to the delegate (the parser), which should decide
# whether to continue or abort parsing.
def error(string)
  return unless delegate.respond_to?(:error)

  delegate.error(path, self, string)
end
157
+ end
158
+ end
@@ -0,0 +1,67 @@
1
+ require 'nokogiri'
2
+
3
+ require 'sawtooth/document'
4
+ require 'sawtooth/rules/set'
5
+
6
+ module Sawtooth
7
+
8
# Default Parser implementation: receives the callbacks of a
# `Sawtooth::Document` and applies the matching rule. Can be used as a
# starting point for custom implementations.
#
class Parser

  # The set of rules consulted for every callback.
  attr_reader :rules

  # Creates a new instance, uses `options[:rules]` if supplied and an
  # empty `Sawtooth::Rules::Set` otherwise.
  def initialize(options = {})
    supplied = options[:rules]
    @rules = supplied ? supplied : Sawtooth::Rules::Set.new
  end

  # Delegates to `Rules::Set#add`.
  def add(path, rule)
    rules.add(path, rule)
  end

  # Received a comment node, ignored by default.
  def comment(path, doc, str)
  end

  # Start document callback, runs the '@document:before' rule (if any).
  def start_document(path, doc)
    rule = rules.find('@document:before')
    return unless rule && rule.respond_to?(:start)

    rule.start(path.join('/'), doc, nil)
  end

  # End document callback, runs the '@document:after' rule (if any).
  def end_document(path, doc)
    rule = rules.find('@document:after')
    return unless rule && rule.respond_to?(:finish)

    rule.finish(path.join('/'), doc, nil)
  end

  # Start element callback, starts the rule found for the path (if any).
  def start_element(path, doc, node)
    rule = rules.find(path)
    return unless rule && rule.respond_to?(:start)

    rule.start(path.join('/'), doc, node)
  end

  # End element callback, finishes the rule found for the path (if any).
  def end_element(path, doc, node)
    rule = rules.find(path)
    return unless rule && rule.respond_to?(:finish)

    rule.finish(path.join('/'), doc, node)
  end

  # Errors reported by the document abort parsing by raising the message.
  def error(path, doc, message)
    raise message
  end

  # Parses an XML thingy, a filename, path, IO or content from memory.
  # Takes an optional encoding, which defaults to `UTF-8`. Returns the
  # `Sawtooth::Document` used for parsing.
  def parse(thing, encoding = 'UTF-8')
    doc = Sawtooth::Document.new(self)
    Nokogiri::XML::SAX::Parser.new(doc, encoding).parse(thing)
    doc
  end
end
67
+ end
@@ -0,0 +1,6 @@
1
+ require 'sawtooth/rules/base'
2
+ require 'sawtooth/rules/set'
3
+
4
# Load every rule implementation shipped in `rules/`. The file list is
# sorted, because the order of `Dir[]` results depends on the file system
# on older Rubies and the load order should be deterministic.
Dir[File.join(File.dirname(__FILE__), 'rules', '*_rule.rb')].sort.each do |rule|
  require rule
end
@@ -0,0 +1,32 @@
1
+ module Sawtooth
2
+ module Rules
3
+
4
# Base Rule, provides empty `start` and `finish` callbacks, which can be
# overridden by more specific rules - like the create or call rule etc.
#
class Base

  # Called when the beginning of a matching XML node is encountered.
  # Does nothing by default.
  #
  # - path, current (maybe rewritten) path
  # - document, the current sawtooth parser stack (`Sawtooth::Document`)
  # - node, the current node to process
  def start(path, document, node); end

  # Called when the end of a matching XML node is encountered.
  # Does nothing by default.
  #
  # - path, current (maybe rewritten) path
  # - document, the current sawtooth parser stack (`Sawtooth::Document`)
  # - node, the current node
  def finish(path, document, node); end

  # Describes the rule for printing, which is the name of its class.
  def print_rule
    self.class.name
  end
end
31
+ end
32
+ end