sawtooth 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +4 -0
- data/Gemfile +4 -0
- data/README.md +38 -0
- data/Rakefile +12 -0
- data/lib/sawtooth.rb +8 -0
- data/lib/sawtooth/builder.rb +106 -0
- data/lib/sawtooth/document.rb +158 -0
- data/lib/sawtooth/parser.rb +67 -0
- data/lib/sawtooth/rules.rb +6 -0
- data/lib/sawtooth/rules/base.rb +32 -0
- data/lib/sawtooth/rules/call_rule.rb +60 -0
- data/lib/sawtooth/rules/delegate_rule.rb +75 -0
- data/lib/sawtooth/rules/set.rb +64 -0
- data/lib/sawtooth/rules/text_rule.rb +55 -0
- data/lib/sawtooth/version.rb +3 -0
- data/sawtooth.gemspec +25 -0
- data/test/files/delegate.xml +34 -0
- data/test/files/statuses.xml +422 -0
- data/test/sawtooth/builder_test.rb +102 -0
- data/test/sawtooth/document_test.rb +119 -0
- data/test/sawtooth/parser_test.rb +17 -0
- data/test/sawtooth/readme_test.rb +21 -0
- data/test/sawtooth/rules/delegate_rule_test.rb +44 -0
- data/test/sawtooth/rules/text_rule_test.rb +59 -0
- data/test/sawtooth/rules_set_test.rb +49 -0
- data/test/test_helper.rb +25 -0
- metadata +161 -0
data/.gitignore
ADDED
data/Gemfile
ADDED
data/README.md
ADDED
@@ -0,0 +1,38 @@
|
|
1
|
+
|
2
|
+
__
|
3
|
+
_____....--' .'
|
4
|
+
_..___...---'._ o -`(
|
5
|
+
_ | | _ \ .--. `\
|
6
|
+
___ __ ___ _| |_ ___ ___ | |_| |__ | \ \ `|
|
7
|
+
/ __|/ _` \ \ /\ / / __/ _ \ / _ \| __| '_ \ |o o | | |
|
8
|
+
\__ \ (_| |\ V V /| || (_) | (_) | |_| | | | \___'.-`. '.
|
9
|
+
|___/\__,_| \_/\_/ \__\___/ \___/ \__|_| |_| | `---'
|
10
|
+
'^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^'
|
11
|
+
|
12
|
+
A companion for [nokogiri](http://nokogiri.org) to parse XML files by rules,
|
13
|
+
similar to [Apache Commons Digester](http://commons.apache.org/digester/).
|
14
|
+
|
15
|
+
Converting XML structures into Ruby is most often an unsatisfying task, having
|
16
|
+
to choose between implementing a SAX parser (for speed) or using _nokogiri_
|
17
|
+
features like CSS selectors for ease of use. At its base _sawtooth_ is parsing
|
18
|
+
documents using SAX, but provides an interface to specify rules for handling
|
19
|
+
the document.
|
20
|
+
|
21
|
+
require 'open-uri'
|
22
|
+
require 'sawtooth'
|
23
|
+
|
24
|
+
rules = Sawtooth.rules do
|
25
|
+
before { |doc| doc << [] } # 1. create an array for all news items
|
26
|
+
|
27
|
+
on 'rss/channel/item' do
|
28
|
+
on_start { |doc| doc << Hash.new } # 2. on an item create hash
|
29
|
+
on_finish { |doc| doc.parent << doc.pop } # 3. when closing an item, pop from stack and
|
30
|
+
end # append to parent array (from step 1.)
|
31
|
+
|
32
|
+
on_text 'rss/channel/item/*' # 4. add contents to hash
|
33
|
+
end
|
34
|
+
|
35
|
+
result = rules.parse(open('http://rss.cnn.com/rss/edition.rss')).root
|
36
|
+
p result #=> [{ 'title' => 'Some CNN News...', 'guid' =>, ...}, ...]
|
37
|
+
|
38
|
+
This sample shows the DSL exposed to create the XML parsing rules for an RSS feed.
|
data/Rakefile
ADDED
@@ -0,0 +1,12 @@
|
|
1
|
+
require "bundler/gem_tasks"
|
2
|
+
require "rake/testtask"
|
3
|
+
|
4
|
+
desc 'Default: run unit tests.'
|
5
|
+
task :default => :test
|
6
|
+
|
7
|
+
desc 'Test the sawtooth gem.'
|
8
|
+
Rake::TestTask.new(:test) do |t|
|
9
|
+
t.libs << 'test'
|
10
|
+
t.pattern = 'test/**/*_test.rb'
|
11
|
+
t.verbose = true
|
12
|
+
end
|
data/lib/sawtooth.rb
ADDED
@@ -0,0 +1,106 @@
|
|
1
|
+
require 'sawtooth/parser'
|
2
|
+
require 'sawtooth/rules'
|
3
|
+
|
4
|
+
module Sawtooth
|
5
|
+
|
6
|
+
# Yield a builder instance and start working on pushing
|
7
|
+
# rules around like crazy.
|
8
|
+
#
|
9
|
+
def self.rules(&block)
|
10
|
+
Sawtooth::Builder.new(&block)
|
11
|
+
end
|
12
|
+
|
13
|
+
# Provides a nice and hopefully easy to use DSL to build rules and start
|
14
|
+
# parsing XML documents with ease.
|
15
|
+
#
|
16
|
+
class Builder
|
17
|
+
|
18
|
+
# Has access to a set of rules.
|
19
|
+
attr_reader :rules
|
20
|
+
|
21
|
+
# Creates a new instance.
|
22
|
+
def initialize(&block)
|
23
|
+
@rules = Sawtooth::Rules::Set.new
|
24
|
+
self.instance_eval(&block) if block_given?
|
25
|
+
end
|
26
|
+
|
27
|
+
# Get a parser instance with the same set of
|
28
|
+
# rules.
|
29
|
+
def parser
|
30
|
+
@parser ||= Sawtooth::Parser.new(:rules => self.rules)
|
31
|
+
end
|
32
|
+
|
33
|
+
# Shortcut method to parse some input, delegates to the
|
34
|
+
# parser.
|
35
|
+
def parse(thingy)
|
36
|
+
parser.parse(thingy)
|
37
|
+
end
|
38
|
+
|
39
|
+
# Called before the document starts.
|
40
|
+
def before(&block)
|
41
|
+
rules.add('@document:before', Sawtooth::Rules::CallRule.new(:start => block)) if block_given?
|
42
|
+
end
|
43
|
+
|
44
|
+
# Called after the document has ended.
|
45
|
+
def after(&block)
|
46
|
+
rules.add('@document:after', Sawtooth::Rules::CallRule.new(:finish => block)) if block_given?
|
47
|
+
end
|
48
|
+
|
49
|
+
def on_start(path, &block)
|
50
|
+
rules.add(path, Sawtooth::Rules::CallRule.new(:start => block)) if block_given?
|
51
|
+
end
|
52
|
+
|
53
|
+
# Called when the node has finished parsing, i.e. text and everything is available.
|
54
|
+
#
|
55
|
+
def on_finish(path, &block)
|
56
|
+
rules.add(path, Sawtooth::Rules::CallRule.new(:finish => block)) if block_given?
|
57
|
+
end
|
58
|
+
alias_method :on_node, :on_finish
|
59
|
+
|
60
|
+
# Perform a rule on a block, optionally pass in a custom rule instance.
|
61
|
+
def on(path, rule = nil, &block)
|
62
|
+
rule = block.arity <= 0 ? Sawtooth::Rules::CallRule.new(&block) : Sawtooth::Rules::CallRule.new(:start => block) if block_given?
|
63
|
+
rules.add(path, rule) if rule
|
64
|
+
end
|
65
|
+
|
66
|
+
# Use a node's text and set it on the top object in the stack.
|
67
|
+
#
|
68
|
+
# # Simple mapping, sets "name"
|
69
|
+
# on_text 'Person/Name'
|
70
|
+
#
|
71
|
+
# # Custom mapping
|
72
|
+
# on_text 'Person/Name' => :lastname
|
73
|
+
#
|
74
|
+
# # Data Conversion
|
75
|
+
# on_text('Person/Age') { |str| str.to_i }
|
76
|
+
#
|
77
|
+
# # Multiple Mappings
|
78
|
+
# on_text 'Person/Name' => :lastname, 'Person/FirstName' => :firstname
|
79
|
+
#
|
80
|
+
# The `TextRule` tries to set the value using a setter, or a hash
|
81
|
+
# accessor and the `document.top` object.
|
82
|
+
def on_text(mappings = {}, &block)
|
83
|
+
if mappings.respond_to?(:to_str)
|
84
|
+
rules.add(mappings.to_str, Sawtooth::Rules::TextRule.new(&block))
|
85
|
+
else
|
86
|
+
mappings.each do |path, name|
|
87
|
+
rules.add(path, Sawtooth::Rules::TextRule.new(name, &block))
|
88
|
+
end
|
89
|
+
end
|
90
|
+
end
|
91
|
+
|
92
|
+
def delegate(delegation = {})
|
93
|
+
path = delegation.keys.find { |k| k.to_s =~ %r{/\*\*?\z} }
|
94
|
+
cb_path = path.gsub(%r{/\*\*?\z}, '')
|
95
|
+
to = delegation[path]
|
96
|
+
prefix = delegation[:prefix] || path.gsub(%r{/?[^/]+/\*\*?\z}, '')
|
97
|
+
|
98
|
+
rule = Sawtooth::Rules::DelegateRule.new(:path => path, :rules => to.respond_to?(:rules) ? to.rules : to, :prefix => prefix)
|
99
|
+
rules.add(cb_path, rule.before_after_callbacks_rule)
|
100
|
+
rules.add(path, rule)
|
101
|
+
end
|
102
|
+
|
103
|
+
# Pretty print rules.
|
104
|
+
def to_pretty_s; rules.print_rules end
|
105
|
+
end
|
106
|
+
end
|
@@ -0,0 +1,158 @@
|
|
1
|
+
require 'nokogiri/xml/sax'
|
2
|
+
|
3
|
+
module Sawtooth
|
4
|
+
|
5
|
+
# Provides the current parser stack, delegates
|
6
|
+
# basically all calls to the supplied parser.
|
7
|
+
#
|
8
|
+
# Also the document exposes methods which can be
|
9
|
+
# used to directly interact with the stack.
|
10
|
+
class Document < ::Nokogiri::XML::SAX::Document
|
11
|
+
|
12
|
+
# A simple Document Node representation, for the node stack.
|
13
|
+
Node = Struct.new(:namespace, :name, :attributes, :text) do
|
14
|
+
def to_s; name end
|
15
|
+
end
|
16
|
+
|
17
|
+
class Stack < Array
|
18
|
+
def peek(n = 0)
|
19
|
+
self[(n + 1) * -1]
|
20
|
+
end
|
21
|
+
|
22
|
+
def current; peek(0) end
|
23
|
+
alias_method :top, :current
|
24
|
+
|
25
|
+
# Element directly below the top of the stack (second-to-last), i.e. `peek(1)`.
# Returns nil when the stack holds fewer than two elements.
def parent; peek(1) end
|
26
|
+
|
27
|
+
def root; first end
|
28
|
+
end
|
29
|
+
|
30
|
+
# Special freaky node for the Document and Comments
|
31
|
+
DOCUMENT_NODE = [Node.new(nil, '@document')]
|
32
|
+
COMMENT_NAME = '@comment'
|
33
|
+
|
34
|
+
# Both the stack and the delegate can be accessed.
|
35
|
+
attr_reader :stack, :stacks
|
36
|
+
attr_accessor :delegate
|
37
|
+
|
38
|
+
# Creates a new Document instance with an empty stack
|
39
|
+
# and the supplied delegate. The delegate is required to
|
40
|
+
# apply the rules.
|
41
|
+
def initialize(delegate = nil)
|
42
|
+
@delegate = delegate
|
43
|
+
reset!
|
44
|
+
end
|
45
|
+
|
46
|
+
# Allow an element to be pushed onto the stack
|
47
|
+
def <<(obj)
|
48
|
+
stack << obj
|
49
|
+
self
|
50
|
+
end
|
51
|
+
alias_method :push, :<<
|
52
|
+
|
53
|
+
# Pop an element off the stack
|
54
|
+
def pop
|
55
|
+
stack.pop
|
56
|
+
end
|
57
|
+
|
58
|
+
# Peek at an element in the stack, i.e. element 0 is the last
|
59
|
+
# element.
|
60
|
+
#
|
61
|
+
# doc.peek # => returns last element
|
62
|
+
# doc.peek(1) # => returns second last element
|
63
|
+
#
|
64
|
+
def peek(n = 0)
|
65
|
+
stack[(n + 1) * -1]
|
66
|
+
end
|
67
|
+
|
68
|
+
# Shortcut method for current, i.e. an alias of peek without
|
69
|
+
# an argument.
|
70
|
+
def current; peek(0) end
|
71
|
+
alias_method :top, :current
|
72
|
+
|
73
|
+
# Alias for `peek(1)`.
|
74
|
+
def parent; peek(1); end
|
75
|
+
|
76
|
+
# Alias for `stack.first`
|
77
|
+
def root; stack.first end
|
78
|
+
|
79
|
+
# Get current path stack.
|
80
|
+
def path; @path_stack end
|
81
|
+
|
82
|
+
# Get current node.
|
83
|
+
def node; @path_stack.last end
|
84
|
+
|
85
|
+
# Direct access to customizable stacks
|
86
|
+
def [](key)
|
87
|
+
stacks[key]
|
88
|
+
end
|
89
|
+
|
90
|
+
# Resets path, stack and the current text.
|
91
|
+
def reset!
|
92
|
+
@path_stack = []
|
93
|
+
@stack = []
|
94
|
+
@stacks = Hash.new { |hsh, k| hsh[k] = Stack.new }
|
95
|
+
@text = nil
|
96
|
+
end
|
97
|
+
|
98
|
+
# Characters and CDATA will be appended to the current text block, if any
|
99
|
+
def characters(str)
|
100
|
+
@text ||= ""
|
101
|
+
@text << str
|
102
|
+
end
|
103
|
+
alias_method :cdata_block, :characters
|
104
|
+
|
105
|
+
# Called when comments are encountered; passes the comment node to the delegate.
|
106
|
+
def comment(str)
|
107
|
+
cnode = Node.new(nil, COMMENT_NAME, {}, str)
|
108
|
+
delegate.comment((DOCUMENT_NODE + path + [cnode]).compact, self, cnode) if delegate.respond_to?(:comment)
|
109
|
+
end
|
110
|
+
|
111
|
+
# Called when document starts parsing, clears path and stack
|
112
|
+
# and calls with special @document path.
|
113
|
+
def start_document
|
114
|
+
reset!
|
115
|
+
delegate.start_document(DOCUMENT_NODE, self) if delegate.respond_to?(:start_document)
|
116
|
+
end
|
117
|
+
|
118
|
+
# Called when the document ends parsing; calls the delegate with the
|
119
|
+
# special @document path.
|
120
|
+
def end_document
|
121
|
+
delegate.end_document(DOCUMENT_NODE, self) if delegate.respond_to?(:end_document)
|
122
|
+
end
|
123
|
+
|
124
|
+
# Called at the beginning of an element.
|
125
|
+
def start_element_namespace(name, attrs_ary = [], prefix = nil, uri = nil, ns = [])
|
126
|
+
@text = nil
|
127
|
+
node = Node.new(uri, name, attrs_ary.inject({}) { |hsh, a| hsh[a.localname] = a.value; hsh }, '')
|
128
|
+
path << node
|
129
|
+
|
130
|
+
# call delegate
|
131
|
+
delegate.start_element(path, self, node) if delegate.respond_to?(:start_element)
|
132
|
+
end
|
133
|
+
|
134
|
+
# Called at the end of an element.
|
135
|
+
def end_element_namespace(name, prefix = nil, uri = nil)
|
136
|
+
# fill text
|
137
|
+
node.text = @text.to_s.strip if @text
|
138
|
+
|
139
|
+
# call delegate
|
140
|
+
delegate.end_element(path, self, node) if delegate.respond_to?(:end_element)
|
141
|
+
|
142
|
+
# clear stack
|
143
|
+
@path_stack.pop
|
144
|
+
@text = nil
|
145
|
+
end
|
146
|
+
|
147
|
+
# Pass a warning along to the parser
|
148
|
+
def warning(string)
|
149
|
+
delegate.warning(path, self, string) if delegate.respond_to?(:warning)
|
150
|
+
end
|
151
|
+
|
152
|
+
# Pass an error along to the parser, parser should handle
|
153
|
+
# whether to continue or abort parsing.
|
154
|
+
def error(string)
|
155
|
+
delegate.error(path, self, string) if delegate.respond_to?(:error)
|
156
|
+
end
|
157
|
+
end
|
158
|
+
end
|
@@ -0,0 +1,67 @@
|
|
1
|
+
require 'nokogiri'
|
2
|
+
|
3
|
+
require 'sawtooth/document'
|
4
|
+
require 'sawtooth/rules/set'
|
5
|
+
|
6
|
+
module Sawtooth
|
7
|
+
|
8
|
+
# Default Parser implementation, can be used as a
|
9
|
+
# starting point for custom implementations.
|
10
|
+
#
|
11
|
+
class Parser
|
12
|
+
|
13
|
+
# Array of accessible rules.
|
14
|
+
attr_reader :rules
|
15
|
+
|
16
|
+
# Creates a new instance.
|
17
|
+
def initialize(options = {})
|
18
|
+
@rules = options[:rules] || Sawtooth::Rules::Set.new
|
19
|
+
end
|
20
|
+
|
21
|
+
# Delegates to `Rules::Set#add`.
|
22
|
+
def add(path, rule)
|
23
|
+
rules.add(path, rule)
|
24
|
+
end
|
25
|
+
|
26
|
+
# Received a comment node.
|
27
|
+
def comment(path, doc, str); end
|
28
|
+
|
29
|
+
# Start document callback
|
30
|
+
def start_document(path, doc)
|
31
|
+
rule = rules.find('@document:before')
|
32
|
+
rule.start(path.join('/'), doc, nil) if rule && rule.respond_to?(:start)
|
33
|
+
end
|
34
|
+
|
35
|
+
# End document callback
|
36
|
+
def end_document(path, doc)
|
37
|
+
rule = rules.find('@document:after')
|
38
|
+
rule.finish(path.join('/'), doc, nil) if rule && rule.respond_to?(:finish)
|
39
|
+
end
|
40
|
+
|
41
|
+
# Start element callback
|
42
|
+
def start_element(path, doc, node)
|
43
|
+
rule = rules.find(path)
|
44
|
+
rule.start(path.join('/'), doc, node) if rule && rule.respond_to?(:start)
|
45
|
+
end
|
46
|
+
|
47
|
+
# End element callback
|
48
|
+
def end_element(path, doc, node)
|
49
|
+
rule = rules.find(path)
|
50
|
+
rule.finish(path.join('/'), doc, node) if rule && rule.respond_to?(:finish)
|
51
|
+
end
|
52
|
+
|
53
|
+
def error(path, doc, message)
|
54
|
+
raise message
|
55
|
+
end
|
56
|
+
|
57
|
+
# Parses an XML thingy, a filename, path, IO or content
|
58
|
+
# from memory. Accepts an optional encoding, which defaults
|
59
|
+
# to `UTF-8`.
|
60
|
+
def parse(thing, encoding = 'UTF-8')
|
61
|
+
Sawtooth::Document.new(self).tap do |doc|
|
62
|
+
sax_parser = Nokogiri::XML::SAX::Parser.new(doc, encoding)
|
63
|
+
sax_parser.parse(thing)
|
64
|
+
end
|
65
|
+
end
|
66
|
+
end
|
67
|
+
end
|
@@ -0,0 +1,32 @@
|
|
1
|
+
module Sawtooth
|
2
|
+
module Rules
|
3
|
+
|
4
|
+
# Base Rule, provides empty `start` and `finish` callbacks, which
|
5
|
+
# can be overridden by more specific rules - like the create
|
6
|
+
# or call rule etc.
|
7
|
+
#
|
8
|
+
class Base
|
9
|
+
|
10
|
+
# Called when the beginning of a matching XML node is encountered.
|
11
|
+
#
|
12
|
+
# - path, current (maybe rewritten) path
|
13
|
+
# - document, the current sawtooth parser stack (`Sawtooth::Document`)
|
14
|
+
# - node, the current node to process
|
15
|
+
def start(path, document, node)
|
16
|
+
end
|
17
|
+
|
18
|
+
# Called when the end of a matching XML node is encountered.
|
19
|
+
# If an element has no body, this method is called with an empty
|
20
|
+
# string instead.
|
21
|
+
#
|
22
|
+
# - path, current (maybe rewritten) path
|
23
|
+
# - document, the current sawtooth parser stack (`Sawtooth::Document`)
|
24
|
+
# - node, the current node
|
25
|
+
def finish(path, document, node)
|
26
|
+
end
|
27
|
+
|
28
|
+
# Returns the name of the rule's class.
|
29
|
+
def print_rule; self.class.name end
|
30
|
+
end
|
31
|
+
end
|
32
|
+
end
|