sawtooth 0.2.0
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +4 -0
- data/Gemfile +4 -0
- data/README.md +38 -0
- data/Rakefile +12 -0
- data/lib/sawtooth.rb +8 -0
- data/lib/sawtooth/builder.rb +106 -0
- data/lib/sawtooth/document.rb +158 -0
- data/lib/sawtooth/parser.rb +67 -0
- data/lib/sawtooth/rules.rb +6 -0
- data/lib/sawtooth/rules/base.rb +32 -0
- data/lib/sawtooth/rules/call_rule.rb +60 -0
- data/lib/sawtooth/rules/delegate_rule.rb +75 -0
- data/lib/sawtooth/rules/set.rb +64 -0
- data/lib/sawtooth/rules/text_rule.rb +55 -0
- data/lib/sawtooth/version.rb +3 -0
- data/sawtooth.gemspec +25 -0
- data/test/files/delegate.xml +34 -0
- data/test/files/statuses.xml +422 -0
- data/test/sawtooth/builder_test.rb +102 -0
- data/test/sawtooth/document_test.rb +119 -0
- data/test/sawtooth/parser_test.rb +17 -0
- data/test/sawtooth/readme_test.rb +21 -0
- data/test/sawtooth/rules/delegate_rule_test.rb +44 -0
- data/test/sawtooth/rules/text_rule_test.rb +59 -0
- data/test/sawtooth/rules_set_test.rb +49 -0
- data/test/test_helper.rb +25 -0
- metadata +161 -0
data/.gitignore
ADDED
data/Gemfile
ADDED
data/README.md
ADDED
@@ -0,0 +1,38 @@
|
|
1
|
+
|
2
|
+
__
|
3
|
+
_____....--' .'
|
4
|
+
_..___...---'._ o -`(
|
5
|
+
_ | | _ \ .--. `\
|
6
|
+
___ __ ___ _| |_ ___ ___ | |_| |__ | \ \ `|
|
7
|
+
/ __|/ _` \ \ /\ / / __/ _ \ / _ \| __| '_ \ |o o | | |
|
8
|
+
\__ \ (_| |\ V V /| || (_) | (_) | |_| | | | \___'.-`. '.
|
9
|
+
|___/\__,_| \_/\_/ \__\___/ \___/ \__|_| |_| | `---'
|
10
|
+
'^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^'
|
11
|
+
|
12
|
+
A companion for [nokogiri](http://nokogiri.org) to parse XML files by rules,
|
13
|
+
similar to [Apache Commons Digester](http://commons.apache.org/digester/).
|
14
|
+
|
15
|
+
Converting XML structures into Ruby is most often an unsatisfying task, having
|
16
|
+
to choose between implementing a SAX parser (for speed) or using _nokogiri_
|
17
|
+
features like CSS selectors for ease of use. At its base _sawtooth_ is parsing
|
18
|
+
documents using SAX, but provides an interface to specify rules for handling
|
19
|
+
the document.
|
20
|
+
|
21
|
+
require 'open-uri'
|
22
|
+
require 'sawtooth'
|
23
|
+
|
24
|
+
rules = Sawtooth.rules do
|
25
|
+
before { |doc| doc << [] } # 1. create an array for all news items
|
26
|
+
|
27
|
+
on 'rss/channel/item' do
|
28
|
+
on_start { |doc| doc << Hash.new } # 2. on an item create hash
|
29
|
+
on_finish { |doc| doc.parent << doc.pop } # 3. when closing an item, pop from stack and
|
30
|
+
end # append to parent array (from step 1.)
|
31
|
+
|
32
|
+
on_text 'rss/channel/item/*' # 4. add contents to hash
|
33
|
+
end
|
34
|
+
|
35
|
+
result = rules.parse(open('http://rss.cnn.com/rss/edition.rss')).root
|
36
|
+
p result #=> [{ 'title' => 'Some CNN News...', 'guid' => '...', ...}, ...]
|
37
|
+
|
38
|
+
This sample shows the DSL exposed to create the XML parsing rules for an RSS feed.
|
data/Rakefile
ADDED
@@ -0,0 +1,12 @@
|
|
1
|
+
require "bundler/gem_tasks"
|
2
|
+
require "rake/testtask"
|
3
|
+
|
4
|
+
# Unit tests: every *_test.rb file below test/, with test/ on the load path.
desc 'Test the sawtooth gem.'
Rake::TestTask.new(:test) do |test_task|
  test_task.libs << 'test'
  test_task.pattern = 'test/**/*_test.rb'
  test_task.verbose = true
end

# A bare `rake` runs the test task.
desc 'Default: run unit tests.'
task :default => :test
|
data/lib/sawtooth.rb
ADDED
@@ -0,0 +1,106 @@
|
|
1
|
+
require 'sawtooth/parser'
|
2
|
+
require 'sawtooth/rules'
|
3
|
+
|
4
|
+
module Sawtooth
|
5
|
+
|
6
|
+
# Yield a builder instance and start working on pushing
|
7
|
+
# rules around like crazy.
|
8
|
+
#
|
9
|
+
def self.rules(&block)
  # The block is handed to the builder untouched; the builder evaluates it
  # in its own context so the DSL methods are available inside.
  Builder.new(&block)
end
|
12
|
+
|
13
|
+
# Provides a nice and hopefully easy to use DSL to build rules and start
|
14
|
+
# parsing XML documents with ease.
|
15
|
+
#
|
16
|
+
class Builder

  # The rule set all DSL methods register their rules with.
  attr_reader :rules

  # Creates a new instance with an empty rule set. A given block is
  # evaluated in the context of the builder (`instance_eval`), so the
  # DSL methods can be called without an explicit receiver.
  def initialize(&block)
    @rules = Sawtooth::Rules::Set.new
    instance_eval(&block) if block_given?
  end

  # Get a parser instance with the same set of rules. The parser is
  # created on first use and reused afterwards.
  def parser
    @parser ||= Sawtooth::Parser.new(:rules => rules)
  end

  # Shortcut method to parse some input, delegates to the parser.
  def parse(thingy)
    parser.parse(thingy)
  end

  # Called before the document starts. Registers the block as start
  # callback under the special `@document:before` path; does nothing
  # without a block.
  def before(&block)
    rules.add('@document:before', Sawtooth::Rules::CallRule.new(:start => block)) if block_given?
  end

  # Called after the document has ended. Registers the block as finish
  # callback under the special `@document:after` path; does nothing
  # without a block.
  def after(&block)
    rules.add('@document:after', Sawtooth::Rules::CallRule.new(:finish => block)) if block_given?
  end

  # Registers the block as start callback for `path`; does nothing
  # without a block.
  def on_start(path, &block)
    rules.add(path, Sawtooth::Rules::CallRule.new(:start => block)) if block_given?
  end

  # Called when the node has finished parsing, i.e. text and everything is available.
  # Does nothing without a block.
  def on_finish(path, &block)
    rules.add(path, Sawtooth::Rules::CallRule.new(:finish => block)) if block_given?
  end
  alias_method :on_node, :on_finish

  # Perform a rule on a block, optionally pass in a custom rule instance.
  #
  # A block takes precedence over `rule`. A block without parameters is
  # passed to `CallRule.new` as its block (the README uses this form to
  # declare `on_start`/`on_finish` callbacks for the path), a block with
  # parameters is registered as the start callback. Nothing is added when
  # neither a block nor a rule is given.
  def on(path, rule = nil, &block)
    if block_given?
      rule = if block.arity <= 0
        Sawtooth::Rules::CallRule.new(&block)
      else
        Sawtooth::Rules::CallRule.new(:start => block)
      end
    end
    rules.add(path, rule) if rule
  end

  # Use and set a nodes text to the top object in the stack.
  #
  #     # Simple mapping, sets "name"
  #     on_text 'Person/Name'
  #
  #     # Custom mapping
  #     on_text 'Person/Name' => :lastname
  #
  #     # Data Conversion
  #     on_text('Person/Age') { |str| str.to_i }
  #
  #     # Multiple Mappings
  #     on_text 'Person/Name' => :lastname, 'Person/FirstName' => :firstname
  #
  # The `TextRule` tries to set the value using a setter, or a hash
  # accessor and the `document.top` object.
  def on_text(mappings = {}, &block)
    if mappings.respond_to?(:to_str)
      # A single path given as string, no custom attribute name.
      rules.add(mappings.to_str, Sawtooth::Rules::TextRule.new(&block))
    else
      mappings.each do |path, name|
        rules.add(path, Sawtooth::Rules::TextRule.new(name, &block))
      end
    end
  end

  # Delegates everything below a path to another set of rules.
  #
  # - delegation, hash with exactly one key ending in `/*` or `/**` (the
  #   delegated path) whose value is either a rule set or an object
  #   responding to `rules` (e.g. another builder). The optional `:prefix`
  #   entry is passed to the `DelegateRule`; it defaults to the delegated
  #   path without its last segment and wildcard (`'a/b/**'` => `'a'`).
  #
  # Two rules are registered: the delegate rule's before/after callbacks
  # rule on the path without the wildcard, and the delegate rule itself on
  # the wildcard path.
  #
  # Raises `ArgumentError` when no key ends in `/*` or `/**`.
  def delegate(delegation = {})
    key = delegation.keys.find { |k| k.to_s =~ %r{/\*\*?\z} }
    raise ArgumentError, "delegate requires a path ending in '/*' or '/**'" unless key

    # Normalize to a string so symbol keys work with the substitutions below.
    path    = key.to_s
    cb_path = path.gsub(%r{/\*\*?\z}, '')
    to      = delegation[key]
    prefix  = delegation[:prefix] || path.gsub(%r{/?[^/]+/\*\*?\z}, '')

    rule = Sawtooth::Rules::DelegateRule.new(:path => path, :rules => to.respond_to?(:rules) ? to.rules : to, :prefix => prefix)
    rules.add(cb_path, rule.before_after_callbacks_rule)
    rules.add(path, rule)
  end

  # Pretty print rules.
  def to_pretty_s
    rules.print_rules
  end
end
|
106
|
+
end
|
@@ -0,0 +1,158 @@
|
|
1
|
+
require 'nokogiri/xml/sax'
|
2
|
+
|
3
|
+
module Sawtooth
|
4
|
+
|
5
|
+
# Provides the current parser stack, delegates
|
6
|
+
# basically all calls to the supplied parser.
|
7
|
+
#
|
8
|
+
# Also the document exposes methods which can be
|
9
|
+
# used to directly interact with the stack.
|
10
|
+
class Document < ::Nokogiri::XML::SAX::Document
|
11
|
+
|
12
|
+
# A simple Document Node representation, for the node stack.
|
13
|
+
Node = Struct.new(:namespace, :name, :attributes, :text) do
  # A node stringifies to its element name, so an array of nodes can be
  # joined into a path like "rss/channel/item". Always returns a String,
  # even when no name has been set.
  def to_s
    name.to_s
  end
end
|
16
|
+
|
17
|
+
class Stack < Array
  # Peek at an element counted from the end of the stack: `peek(0)` is the
  # last element, `peek(1)` the second last. Returns nil when the stack is
  # not that deep.
  def peek(n = 0)
    self[-(n + 1)]
  end

  # The last (top-most) element.
  def current
    peek(0)
  end
  alias_method :top, :current

  # The element directly below the top, i.e. `peek(1)`.
  def parent
    peek(1)
  end

  # The first (bottom-most) element.
  def root
    first
  end
end
|
29
|
+
|
30
|
+
# Special freaky node for the Document and Comments
|
31
|
+
DOCUMENT_NODE = [Node.new(nil, '@document')]
|
32
|
+
COMMENT_NAME = '@comment'
|
33
|
+
|
34
|
+
# Both the stack and the delegate can be accessed.
|
35
|
+
attr_reader :stack, :stacks
|
36
|
+
attr_accessor :delegate
|
37
|
+
|
38
|
+
# Creates a new Document instance with an empty stack
|
39
|
+
# and the supplied delegate. The delegate is required to
|
40
|
+
# apply the rules.
|
41
|
+
def initialize(delegate = nil)
  # The delegate receives the parse events; without one they are dropped.
  @delegate = delegate

  # Start out with an empty path, stack, named stacks and text buffer.
  reset!
end
|
45
|
+
|
46
|
+
# Allow an element to be pushed onto the stack
|
47
|
+
def <<(obj)
  stack.push(obj)
  # Return the document rather than the stack, so pushes can be chained.
  self
end
alias push <<
|
52
|
+
|
53
|
+
# Pop an element off the stack
|
54
|
+
def pop
  # Removes and returns the top-most object (nil when the stack is empty).
  stack.pop
end
|
57
|
+
|
58
|
+
# Peek at an element in the stack, i.e. element 0 is the last
|
59
|
+
# element.
|
60
|
+
#
|
61
|
+
# doc.peek # => returns last element
|
62
|
+
# doc.peek(1) # => returns second last element
|
63
|
+
#
|
64
|
+
def peek(n = 0)
  # Count from the end: n = 0 maps to index -1, n = 1 to index -2, ...
  stack[-(n + 1)]
end
|
67
|
+
|
68
|
+
# Shortcut method for current, i.e. an alias of peek without
|
69
|
+
# an argument.
|
70
|
+
def current
  peek(0)
end
alias top current
|
72
|
+
|
73
|
+
# Alias for `peek(1)`.
|
74
|
+
def parent
  # Second last object on the stack, nil if there is none.
  peek(1)
end
|
75
|
+
|
76
|
+
# Alias for `stack.first`
|
77
|
+
def root
  # Bottom-most object on the stack, nil if the stack is empty.
  stack.first
end
|
78
|
+
|
79
|
+
# Get current path stack.
|
80
|
+
def path
  # Nodes of all currently open elements, outermost first.
  @path_stack
end
|
81
|
+
|
82
|
+
# Get current node.
|
83
|
+
def node
  # Innermost open element, nil outside of any element.
  @path_stack.last
end
|
84
|
+
|
85
|
+
# Direct access to customizable stacks
|
86
|
+
def [](key)
  # Named stacks are created on first access (see the default block in reset!).
  stacks[key]
end
|
89
|
+
|
90
|
+
# Resets path, stack and the current text.
|
91
|
+
def reset!
  @path_stack = []
  @stack      = []
  # Unknown keys lazily get their own, empty Stack.
  @stacks     = Hash.new { |named, key| named[key] = Stack.new }
  @text       = nil
end
|
97
|
+
|
98
|
+
# Characters and CDATA will be appended to the current text block, if any
|
99
|
+
def characters(str)
  # Start a fresh buffer when no text has been collected for the current
  # element yet. The buffer is an explicitly mutable string, so appending
  # keeps working when string literals are frozen.
  @text ||= ''.dup
  @text << str
end
alias cdata_block characters
|
104
|
+
|
105
|
+
# Called when comments are encountered, passes a comment node on to the delegate.
|
106
|
+
def comment(str)
  return unless delegate.respond_to?(:comment)

  # The comment is reported as a pseudo node below the current path,
  # with the special document node in front.
  cnode = Node.new(nil, COMMENT_NAME, {}, str)
  comment_path = (DOCUMENT_NODE + path + [cnode]).compact
  delegate.comment(comment_path, self, cnode)
end
|
110
|
+
|
111
|
+
# Called when document starts parsing, clears path and stack
|
112
|
+
# and calls with special @document path.
|
113
|
+
def start_document
  # Always start from a clean state, even if the delegate is not interested.
  reset!
  return unless delegate.respond_to?(:start_document)

  delegate.start_document(DOCUMENT_NODE, self)
end
|
117
|
+
|
118
|
+
# Called when document ends parsing, does call with
|
119
|
+
# special @document path.
|
120
|
+
def end_document
  return unless delegate.respond_to?(:end_document)

  delegate.end_document(DOCUMENT_NODE, self)
end
|
123
|
+
|
124
|
+
# Called at the beginning of an element.
|
125
|
+
def start_element_namespace(name, attrs_ary = [], prefix = nil, uri = nil, ns = [])
  # Text collected so far is discarded when a new element opens.
  @text = nil

  # Map attributes by local name.
  attributes = {}
  attrs_ary.each { |attr| attributes[attr.localname] = attr.value }

  element = Node.new(uri, name, attributes, '')
  path << element

  # call delegate
  return unless delegate.respond_to?(:start_element)

  delegate.start_element(path, self, element)
end
|
133
|
+
|
134
|
+
# Called at the end of an element.
|
135
|
+
def end_element_namespace(name, prefix = nil, uri = nil)
  closing = node

  # fill text, surrounding whitespace is removed
  closing.text = @text.to_s.strip if @text

  # call delegate while the element is still part of the path
  delegate.end_element(path, self, closing) if delegate.respond_to?(:end_element)

  # clear stack
  path.pop
  @text = nil
end
|
146
|
+
|
147
|
+
# Pass a warning along to the parser
|
148
|
+
def warning(string)
  # Warnings are dropped when the delegate does not handle them.
  return unless delegate.respond_to?(:warning)

  delegate.warning(path, self, string)
end
|
151
|
+
|
152
|
+
# Pass an error along to the parser, parser should handle
|
153
|
+
# whether to continue or abort parsing.
|
154
|
+
def error(string)
  # Errors are dropped when the delegate does not handle them.
  return unless delegate.respond_to?(:error)

  delegate.error(path, self, string)
end
|
157
|
+
end
|
158
|
+
end
|
@@ -0,0 +1,67 @@
|
|
1
|
+
require 'nokogiri'
|
2
|
+
|
3
|
+
require 'sawtooth/document'
|
4
|
+
require 'sawtooth/rules/set'
|
5
|
+
|
6
|
+
module Sawtooth
|
7
|
+
|
8
|
+
# Default Parser implementation, can be used as a
|
9
|
+
# starting point for custom implementations.
|
10
|
+
#
|
11
|
+
class Parser

  # The rule set consulted for every parse event.
  attr_reader :rules

  # Creates a new instance.
  #
  # - options, hash; `:rules` is the rule set to use, a new and empty
  #   `Sawtooth::Rules::Set` is created when it is missing.
  def initialize(options = {})
    @rules = options[:rules]
    @rules ||= Sawtooth::Rules::Set.new
  end

  # Delegates to `Rules::Set#add`.
  def add(path, rule)
    rules.add(path, rule)
  end

  # Received a comment node, ignored by default.
  def comment(path, doc, str)
  end

  # Start document callback, runs the start callback of the rule
  # registered as `@document:before`, if there is one.
  def start_document(path, doc)
    before_rule = rules.find('@document:before')
    return unless before_rule && before_rule.respond_to?(:start)

    before_rule.start(path.join('/'), doc, nil)
  end

  # End document callback, runs the finish callback of the rule
  # registered as `@document:after`, if there is one.
  def end_document(path, doc)
    after_rule = rules.find('@document:after')
    return unless after_rule && after_rule.respond_to?(:finish)

    after_rule.finish(path.join('/'), doc, nil)
  end

  # Start element callback, runs the start callback of the rule found
  # for the path. The rule receives the path joined by slashes.
  def start_element(path, doc, node)
    matching = rules.find(path)
    return unless matching && matching.respond_to?(:start)

    matching.start(path.join('/'), doc, node)
  end

  # End element callback, runs the finish callback of the rule found
  # for the path. The rule receives the path joined by slashes.
  def end_element(path, doc, node)
    matching = rules.find(path)
    return unless matching && matching.respond_to?(:finish)

    matching.finish(path.join('/'), doc, node)
  end

  # Error callback, aborts parsing by raising the message.
  def error(path, doc, message)
    raise message
  end

  # Parses an XML thingy, a filename, path, IO or content
  # from memory. Provides an optional encoding, which defaults
  # to `UTF-8`. Returns the `Sawtooth::Document` used for parsing.
  def parse(thing, encoding = 'UTF-8')
    doc = Sawtooth::Document.new(self)
    Nokogiri::XML::SAX::Parser.new(doc, encoding).parse(thing)
    doc
  end
end
|
67
|
+
end
|
@@ -0,0 +1,32 @@
|
|
1
|
+
module Sawtooth
|
2
|
+
module Rules
|
3
|
+
|
4
|
+
# Base Rule, provides two unimplemented callbacks (`start` and `finish`), which
|
5
|
+
# can be overridden by more specific rules - like the create
|
6
|
+
# or call rule etc.
|
7
|
+
#
|
8
|
+
class Base

  # Called when the beginning of a matching XML node is encountered.
  # Does nothing by default.
  #
  # - path, current (maybe rewritten) path
  # - document, the current sawtooth parser stack (`Sawtooth::Document`)
  # - node, the current node to process
  def start(path, document, node); end

  # Called when the end of a matching XML node is encountered.
  # Does nothing by default.
  #
  # - path, current (maybe rewritten) path
  # - document, the current sawtooth parser stack (`Sawtooth::Document`)
  # - node, the current node
  def finish(path, document, node); end

  # Label for this rule: the name of its class.
  def print_rule
    self.class.name
  end
end
|
31
|
+
end
|
32
|
+
end
|