saxxy 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +22 -0
- data/.travis.yml +5 -0
- data/Gemfile +13 -0
- data/LICENSE +22 -0
- data/README.md +117 -0
- data/Rakefile +12 -0
- data/lib/saxxy.rb +2 -0
- data/lib/saxxy/activatable.rb +160 -0
- data/lib/saxxy/callbacks/libxml.rb +26 -0
- data/lib/saxxy/callbacks/nokogiri.rb +30 -0
- data/lib/saxxy/callbacks/ox.rb +66 -0
- data/lib/saxxy/callbacks/sax.rb +86 -0
- data/lib/saxxy/context.rb +88 -0
- data/lib/saxxy/context_tree.rb +85 -0
- data/lib/saxxy/event.rb +83 -0
- data/lib/saxxy/event_registry.rb +122 -0
- data/lib/saxxy/node_action.rb +59 -0
- data/lib/saxxy/node_rule.rb +90 -0
- data/lib/saxxy/parsers/base.rb +28 -0
- data/lib/saxxy/parsers/libxml.rb +52 -0
- data/lib/saxxy/parsers/nokogiri.rb +28 -0
- data/lib/saxxy/parsers/ox.rb +30 -0
- data/lib/saxxy/service.rb +47 -0
- data/lib/saxxy/utils/agent.rb +66 -0
- data/lib/saxxy/utils/callback_array.rb +27 -0
- data/lib/saxxy/utils/helpers.rb +13 -0
- data/lib/saxxy/version.rb +3 -0
- data/saxxy.gemspec +21 -0
- data/spec/saxxy/activatable_spec.rb +344 -0
- data/spec/saxxy/callbacks/sax_spec.rb +456 -0
- data/spec/saxxy/context_spec.rb +51 -0
- data/spec/saxxy/context_tree_spec.rb +68 -0
- data/spec/saxxy/event_registry_spec.rb +137 -0
- data/spec/saxxy/event_spec.rb +49 -0
- data/spec/saxxy/node_action_spec.rb +46 -0
- data/spec/saxxy/node_rule_spec.rb +99 -0
- data/spec/saxxy/parsers/libxml_spec.rb +104 -0
- data/spec/saxxy/parsers/nokogiri_spec.rb +200 -0
- data/spec/saxxy/parsers/ox_spec.rb +175 -0
- data/spec/saxxy/utils/agent_spec.rb +63 -0
- data/spec/spec_helper.rb +28 -0
- data/spec/support/agent_macros.rb +24 -0
- metadata +155 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: bd0c6946f13251ffc9209c9762cacb8aebb5aa11
|
4
|
+
data.tar.gz: c2780c576c899e594286099289a3062b69be1cb9
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: fb7a6ebd8808db1d6265845e5b3128213abf67779edc78ef934047952933fbd21b7b387157696d2c02e4f587130300672506050443dffdb433239adfde9cdd00
|
7
|
+
data.tar.gz: 02c47da0ab97345dba9ddb5ced8edc1a18af56ea2a04e8848ed1637e37d038fd28b4026d60de1a54ae477a383d63afcbea54a26dff8eee0496b0e9554cfeec77
|
data/.gitignore
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
*.gem
|
2
|
+
*.rbc
|
3
|
+
.bundle
|
4
|
+
.config
|
5
|
+
.yardoc
|
6
|
+
.bin
|
7
|
+
.rspec
|
8
|
+
.rbenv-version
|
9
|
+
.DS_Store
|
10
|
+
Gemfile.lock
|
11
|
+
InstalledFiles
|
12
|
+
_yardoc
|
13
|
+
coverage
|
14
|
+
doc/
|
15
|
+
docs/
|
16
|
+
lib/bundler/man
|
17
|
+
pkg
|
18
|
+
rdoc
|
19
|
+
spec/reports
|
20
|
+
test/tmp
|
21
|
+
test/version_tmp
|
22
|
+
tmp
|
data/.travis.yml
ADDED
data/Gemfile
ADDED
data/LICENSE
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
Copyright (c) 2015 rubymaniac
|
2
|
+
|
3
|
+
MIT License
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
6
|
+
a copy of this software and associated documentation files (the
|
7
|
+
"Software"), to deal in the Software without restriction, including
|
8
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
9
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
10
|
+
permit persons to whom the Software is furnished to do so, subject to
|
11
|
+
the following conditions:
|
12
|
+
|
13
|
+
The above copyright notice and this permission notice shall be
|
14
|
+
included in all copies or substantial portions of the Software.
|
15
|
+
|
16
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
17
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
18
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
19
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
20
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
21
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
22
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,117 @@
|
|
1
|
+
# `Saxxy` A Ruby DSL for SAX parsers [![Build Status](https://travis-ci.org/rubymaniac/saxxy.png?branch=master)](https://travis-ci.org/rubymaniac/saxxy)
|
2
|
+
|
3
|
+
Saxxy is designed to be a DSL for creating SAX parsers. If anyone tells you that you are a masochist 'cause you are SAX parsing HTML, show her `Saxxy`.
|
4
|
+
|
5
|
+
It currently supports [Nokogiri](https://github.com/sparklemotion/nokogiri), [Ox](https://github.com/ohler55/ox) and [LibXML](https://github.com/xml4r/libxml-ruby), and it is really easy to implement your own parser bindings. It can parse XML out of the box, but HTML SAX parsing heavily depends on how the parser handles HTML. Libxml cannot handle malformed HTML at all. Ox and Nokogiri handle the parsing of HTML (even malformed) really well and thus I recommend them.
|
6
|
+
|
7
|
+
|
8
|
+
## Dependencies
|
9
|
+
|
10
|
+
`Saxxy` requires Ruby >=1.9 or JRuby with JRUBY_OPTS=--1.9
|
11
|
+
|
12
|
+
|
13
|
+
## Installation
|
14
|
+
|
15
|
+
Add this line to your application's Gemfile:
|
16
|
+
|
17
|
+
gem 'saxxy'
|
18
|
+
|
19
|
+
Or install it independently of Bundler
|
20
|
+
|
21
|
+
$ gem install saxxy
|
22
|
+
|
23
|
+
|
24
|
+
## Getting started
|
25
|
+
|
26
|
+
### Overview
|
27
|
+
First you must create a service object with a specified parser. It accepts a symbol (`:nokogiri`, `:libxml`, `:ox`) or a class if you made your own parser implementation. It will create a context tree (see `Saxxy::ContextTree` for more details) and will register the callbacks it will call when parsing, as soon as you provide a block. E.g.
|
28
|
+
|
29
|
+
```ruby
|
30
|
+
require "saxxy/parsers/nokogiri"
|
31
|
+
|
32
|
+
service = Saxxy::Service.new(:nokogiri) do
|
33
|
+
under("div", class: /cool$/) do
|
34
|
+
on(/span|div/, rel: "foo") do |inner_text, element, attributes|
|
35
|
+
puts "Under a #{element} found some text: " + inner_text
|
36
|
+
end
|
37
|
+
|
38
|
+
under("table", class: "main") do
|
39
|
+
under("tr", class: "header") do
|
40
|
+
on("td") do |inner_text, element, attributes|
|
41
|
+
puts "Found some other text in a table cell: " + inner_text
|
42
|
+
end
|
43
|
+
end
|
44
|
+
end
|
45
|
+
end
|
46
|
+
end
|
47
|
+
```
|
48
|
+
The service provides the `parse_file`, `parse_string` and `parse_io` methods; use whichever fits your needs. Every method accepts its corresponding source (with the respective source type) as the first argument and an optional encoding as the second argument.
|
49
|
+
|
50
|
+
```ruby
|
51
|
+
service.parse_string <<-eos
|
52
|
+
<html>
|
53
|
+
<span>
|
54
|
+
Hey I am in a span! <em>And I am nested in a span!</em>
|
55
|
+
</span>
|
56
|
+
<div>
|
57
|
+
Hey I am in a div!
|
58
|
+
</div>
|
59
|
+
</html>
|
60
|
+
eos
|
61
|
+
|
62
|
+
# => Under a span found some text: Hey I am in a span! And I am nested in a span!
|
63
|
+
# => Under a div found some text: Hey I am in a div!
|
64
|
+
```
|
65
|
+
If the parser doesn't raise some funny error you should be seeing your registered callbacks getting called with the
|
66
|
+
text, the element name and the attributes found at the matching node.
|
67
|
+
|
68
|
+
|
69
|
+
|
70
|
+
### The DSL
|
71
|
+
Saxxy uses a DSL in order to create a context tree and register callbacks. The two most significant methods for doing so are `on` and `under`. The `on` method is used to signify a specific condition, and the block it accepts is the callback it will run when the condition is met on a node.
|
72
|
+
|
73
|
+
The following example shows a callback that is run when the parser encounters a header element with a class that matches `/foo$/`
|
74
|
+
|
75
|
+
```ruby
|
76
|
+
on(/^h[1-6]{1}/, class: /foo$/) do |text, element, attributes|
|
77
|
+
p "Element name is: #{element} and the inner text is: #{text}"
|
78
|
+
end
|
79
|
+
```
|
80
|
+
There is now the case where you want to restrict the range of the `on` call only, say, to headers inside a div element with a class footer. To do that you nest the `on` in an `under` call which is used for restricting callbacks' range. E.g.
|
81
|
+
|
82
|
+
```ruby
|
83
|
+
under("div", class: "footer") do
|
84
|
+
on(/^h[1-6]{1}/, class: /foo$/) do |text, element, attributes|
|
85
|
+
p "Element name is: #{element} and the inner text is: #{text}"
|
86
|
+
end
|
87
|
+
end
|
88
|
+
```
|
89
|
+
|
90
|
+
## Documentation
|
91
|
+
You can find the documentation [here](http://rdoc.info/github/rubymaniac/saxxy/frames).
|
92
|
+
|
93
|
+
## TODO
|
94
|
+
1. Add support for a clean DSL for easily constructing highly nested contexts
|
95
|
+
2. Switch to a lazy evaluated context tree
|
96
|
+
3. Add more integration tests
|
97
|
+
|
98
|
+
## Known Issues
|
99
|
+
### Nokogiri
|
100
|
+
No issues
|
101
|
+
|
102
|
+
### Ox
|
103
|
+
No issues
|
104
|
+
|
105
|
+
### Libxml
|
106
|
+
1. Does not handle malformed HTML (raises exceptions)
|
107
|
+
2. Triggers the callbacks twice on the nodes
|
108
|
+
|
109
|
+
|
110
|
+
## Contributing
|
111
|
+
|
112
|
+
1. Fork it
|
113
|
+
2. Create your feature branch (`git checkout -b my-new-feature`)
|
114
|
+
3. Commit your changes (`git commit -am 'Added some feature'`)
|
115
|
+
4. Push to the branch (`git push origin my-new-feature`)
|
116
|
+
5. Create new Pull Request
|
117
|
+
[![githalytics.com alpha](https://cruel-carlota.pagodabox.com/c6bbeb377f74da9f3e282fa2fbf4b6a3 "githalytics.com")](http://githalytics.com/rubymaniac/saxxy)
|
data/Rakefile
ADDED
@@ -0,0 +1,12 @@
|
|
1
|
+
#!/usr/bin/env rake
|
2
|
+
require "bundler/gem_tasks"
|
3
|
+
require "rspec/core/rake_task"
|
4
|
+
|
5
|
+
RSpec::Core::RakeTask.new(:spec)
|
6
|
+
|
7
|
+
task default: :spec
|
8
|
+
|
9
|
+
desc "Open an irb session preloaded with this gem"
task :console do
  # RubyGems is loaded automatically on every Ruby this gem supports (>= 1.9),
  # so the old `-rubygems` switch (shorthand for `-r ubygems`) is unnecessary.
  # It also breaks on Ruby >= 2.5, where ubygems.rb no longer exists.
  sh "irb -I lib -r saxxy.rb"
end
|
data/lib/saxxy.rb
ADDED
@@ -0,0 +1,160 @@
|
|
1
|
+
module Saxxy

  ##
  # @author rubymaniac
  #
  # Activatable contains all the logic for handling the activation of an
  # object, whether this is a Context, an Event or any other object that
  # needs to be activated / deactivated multiple times.
  #
  # Any Activatable object possesses an activation_rule (a nil
  # activation_rule means the object starts out active). Every time the
  # activation_rule matches an opening node the internal
  # @deactivation_level is incremented, and every time it matches a
  # closing node the @deactivation_level is decremented, so the object can
  # be activated and deactivated many times (e.g. for nested nodes).
  #
  # An Activatable object becomes inactive when its @deactivation_level
  # drops back to DLEVEL_MIN, i.e. when it has been deactivated as many
  # times as it has been activated.
  #
  # @!attribute [r] activation_rule
  #   @return [NodeRule, nil] this object's activation rule
  ##
  module Activatable

    # The lowest integer the deactivation level can reach; an object whose
    # level equals this value is considered closed.
    #
    DLEVEL_MIN = -1

    # Defines an attribute reader for the activation rule on the class
    # that includes this module.
    #
    # @param receiver [Module] the class / module including Activatable
    #
    def self.included(receiver)
      receiver.send(:attr_reader, :activation_rule)
    end

    # Initiates the activatable by storing the activation rule and setting
    # the deactivation level to DLEVEL_MIN. The object starts inactive
    # when a rule is given and active when the rule is nil.
    #
    # @param rule [NodeRule, nil] an instance of NodeRule, or nil to
    #   declare that this object is automatically active.
    #
    # @return [Symbol] its state (:active or :inactive)
    #
    def initialize_activatable(rule)
      @activation_rule = rule
      @deactivation_level = DLEVEL_MIN
      switch_to(rule ? :inactive : :active)
    end

    # Sets the callback to run when this object gets deactivated.
    # The callback is called with the object itself.
    #
    # @param block [Proc] the code to be executed upon deactivation
    #
    # @return self
    #
    def on_deactivation(&block)
      @deactivation_callback = block
      self
    end

    # Sets the callback to run when this object gets activated.
    # The callback is called with the object itself.
    #
    # @param block [Proc] the code to be executed upon activation
    #
    # @return self
    #
    def on_activation(&block)
      @activation_callback = block
      self
    end

    # Handles an opening node. If the object is inactive and can be
    # activated on the node it gets activated; if it is already active and
    # the activation_rule matches the element name, only the
    # @deactivation_level is incremented (a nested matching node).
    #
    # @param element_name [String] the node's element name
    # @param attributes [Hash<String, String>] the node's attributes
    #
    # @return self
    #
    def activate_on(element_name, attributes)
      if is(:inactive) && can_be_activated_on(element_name, attributes)
        activate!
      elsif is(:active) && rule_matches_element_name(element_name)
        increment_level
      end
      self
    end

    # Handles a closing node. If the object is inactive it does nothing;
    # otherwise it decrements the @deactivation_level when the
    # activation_rule matches the element name and deactivates the object
    # once the @deactivation_level is back at DLEVEL_MIN.
    #
    # @param element_name [String] the node's element name
    #
    # @return self
    #
    def deactivate_on(element_name)
      # Return self (not nil) on the no-op path too, so the documented
      # contract holds and the call chains the same way as activate_on.
      return self unless is(:active)
      decrement_level if rule_matches_element_name(element_name)
      deactivate! if closed?
      self
    end

    private

    # Runs the activation callback, raises the level and marks the object
    # active. The callback runs before the state changes.
    def activate!
      run_activation_callback
      increment_level
      switch_to(:active)
    end

    # Runs the deactivation callback and marks the object inactive.
    def deactivate!
      run_deactivation_callback
      switch_to(:inactive)
    end

    # @return [Boolean] whether the current mode equals +mode+
    def is(mode)
      @mode == mode
    end

    # Stores +mode+ (:active or :inactive) as the current mode.
    def switch_to(mode)
      @mode = mode
    end

    # @return [Boolean] whether the level is back at DLEVEL_MIN
    def closed?
      @deactivation_level == DLEVEL_MIN
    end

    def increment_level
      @deactivation_level += 1
    end

    def decrement_level
      @deactivation_level -= 1
    end

    def run_activation_callback
      @activation_callback.call(self) if @activation_callback
    end

    def run_deactivation_callback
      @deactivation_callback.call(self) if @deactivation_callback
    end

    # A nil rule matches every element name.
    def rule_matches_element_name(element_name)
      activation_rule.nil? || activation_rule.match_element_name(element_name)
    end

    # Activation requires a full rule match (name and attributes) and a
    # closed level.
    def can_be_activated_on(element_name, attributes)
      (activation_rule.nil? || activation_rule.matches(element_name, attributes)) && closed?
    end
  end

end
|
@@ -0,0 +1,26 @@
|
|
1
|
+
require "libxml"
|
2
|
+
require "saxxy/callbacks/sax"
|
3
|
+
|
4
|
+
|
5
|
+
module Saxxy
  module Callbacks

    # Callback object for the LibXML SAX parser. It mixes in LibXML's
    # callback interface together with SAX and forwards the
    # namespace-aware element events to on_start_element / on_end_element
    # (presumably provided by the SAX mixin, which is defined elsewhere).
    class Libxml
      include LibXML::XML::SaxParser::Callbacks
      include SAX

      # Forwards an opening element; the prefix, uri and namespaces
      # arguments are ignored.
      def on_start_element_ns(name, attributes, _prefix, _uri, _namespaces)
        on_start_element(name, attributes)
      end

      # Forwards a closing element; the prefix and uri arguments are
      # ignored.
      def on_end_element_ns(name, _prefix, _uri)
        on_end_element(name)
      end

      # Intentionally empty: the error passed in is ignored.
      def on_error(_error); end
    end

  end
end
|
26
|
+
|
@@ -0,0 +1,30 @@
|
|
1
|
+
require "nokogiri"
|
2
|
+
require "saxxy/callbacks/sax"
|
3
|
+
|
4
|
+
|
5
|
+
module Saxxy
  module Callbacks

    # Callback document for the Nokogiri SAX parser. It translates
    # Nokogiri's SAX document events into the on_* callbacks (presumably
    # provided by the SAX mixin, which is defined elsewhere).
    #
    # The superclass is referenced with a leading `::` because this class
    # is itself named Nokogiri: once Saxxy::Callbacks::Nokogiri exists, a
    # relative `Nokogiri::XML` inside this namespace would resolve to this
    # class instead of the gem (e.g. when the file is loaded again).
    class Nokogiri < ::Nokogiri::XML::SAX::Document
      include SAX

      # @param name [String] the element name
      # @param attrs [Array<Array>] attribute pairs, converted to a Hash
      #   before being forwarded
      def start_element(name, attrs)
        on_start_element(name, Hash[attrs])
      end

      # @param string [String] a chunk of character data
      def characters(string)
        on_characters(string)
      end

      # @param name [String] the element name
      def end_element(name)
        on_end_element(name)
      end

      # Forwards the end-of-document event.
      def end_document
        on_end_document
      end
    end

  end
end
|
30
|
+
|
@@ -0,0 +1,66 @@
|
|
1
|
+
require "ox"
|
2
|
+
require "saxxy/callbacks/sax"
|
3
|
+
|
4
|
+
|
5
|
+
module Saxxy
  module Callbacks

    # Callback handler for the Ox SAX parser.
    #
    # Attributes arrive through separate #attr calls, so the opening
    # element is buffered in @__state (its name plus the attributes
    # collected so far) and only forwarded to on_start_element when the
    # next event (another element, text or the closing tag) shows up.
    # NOTE(review): presumably Ox always emits the attr events right after
    # start_element -- confirm against the Ox::Sax documentation.
    class Ox < ::Ox::Sax
      include SAX

      def initialize(context)
        super(context)
        reset_state!
      end

      # Flushes a still-pending opening element, then starts buffering the
      # new one.
      def start_element(name)
        on_start_element_after_attr_parsing
        reset_state!
        set_name(name)
      end

      # Collects one attribute of the element currently being buffered.
      def attr(name, value)
        push_attr(name, value)
      end

      # Flushes the pending opening element (if any), forwards the text and
      # forgets the element name so it is not flushed a second time.
      def text(string)
        on_start_element_after_attr_parsing
        on_characters(string)
        unset_name
      end

      # Flushes the pending opening element (covers elements without
      # text), forwards the closing element and clears the buffer.
      def end_element(name)
        on_start_element_after_attr_parsing
        on_end_element(name.to_s)
        reset_state!
      end

      private

      # Starts over with no pending name and no attributes.
      def reset_state!
        @__state = { :attrs => {} }
      end

      def unset_name
        @__state.delete(:name)
      end

      def set_name(name)
        @__state[:name] = name.to_s
      end

      def push_attr(name, value)
        @__state[:attrs].update(name.to_s => value)
      end

      # Forwards the buffered opening element, but only when one is pending.
      def on_start_element_after_attr_parsing
        return unless start_element_found?
        on_start_element(@__state[:name], @__state[:attrs])
      end

      # @return [Boolean] whether an opening element is waiting to be flushed
      def start_element_found?
        !@__state[:name].nil?
      end
    end

  end
end
|
66
|
+
|