saxxy 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +22 -0
- data/.travis.yml +5 -0
- data/Gemfile +13 -0
- data/LICENSE +22 -0
- data/README.md +117 -0
- data/Rakefile +12 -0
- data/lib/saxxy.rb +2 -0
- data/lib/saxxy/activatable.rb +160 -0
- data/lib/saxxy/callbacks/libxml.rb +26 -0
- data/lib/saxxy/callbacks/nokogiri.rb +30 -0
- data/lib/saxxy/callbacks/ox.rb +66 -0
- data/lib/saxxy/callbacks/sax.rb +86 -0
- data/lib/saxxy/context.rb +88 -0
- data/lib/saxxy/context_tree.rb +85 -0
- data/lib/saxxy/event.rb +83 -0
- data/lib/saxxy/event_registry.rb +122 -0
- data/lib/saxxy/node_action.rb +59 -0
- data/lib/saxxy/node_rule.rb +90 -0
- data/lib/saxxy/parsers/base.rb +28 -0
- data/lib/saxxy/parsers/libxml.rb +52 -0
- data/lib/saxxy/parsers/nokogiri.rb +28 -0
- data/lib/saxxy/parsers/ox.rb +30 -0
- data/lib/saxxy/service.rb +47 -0
- data/lib/saxxy/utils/agent.rb +66 -0
- data/lib/saxxy/utils/callback_array.rb +27 -0
- data/lib/saxxy/utils/helpers.rb +13 -0
- data/lib/saxxy/version.rb +3 -0
- data/saxxy.gemspec +21 -0
- data/spec/saxxy/activatable_spec.rb +344 -0
- data/spec/saxxy/callbacks/sax_spec.rb +456 -0
- data/spec/saxxy/context_spec.rb +51 -0
- data/spec/saxxy/context_tree_spec.rb +68 -0
- data/spec/saxxy/event_registry_spec.rb +137 -0
- data/spec/saxxy/event_spec.rb +49 -0
- data/spec/saxxy/node_action_spec.rb +46 -0
- data/spec/saxxy/node_rule_spec.rb +99 -0
- data/spec/saxxy/parsers/libxml_spec.rb +104 -0
- data/spec/saxxy/parsers/nokogiri_spec.rb +200 -0
- data/spec/saxxy/parsers/ox_spec.rb +175 -0
- data/spec/saxxy/utils/agent_spec.rb +63 -0
- data/spec/spec_helper.rb +28 -0
- data/spec/support/agent_macros.rb +24 -0
- metadata +155 -0
@@ -0,0 +1,59 @@
|
|
1
|
+
module Saxxy

  ##
  # @author rubymaniac
  #
  # A NodeAction couples a block of code with the rule that decides
  # whether that block applies to a given node. The rule is consulted
  # through {#matches}; the block is executed through {#call}.
  #
  # @!attribute [r] activation_rule
  #   @return [NodeRule] the rule consulted by {#matches}
  #
  # @!attribute [r] action
  #   @return [Proc] the code that {#call} runs
  ##
  class NodeAction
    attr_reader :activation_rule, :action

    # Builds a NodeAction.
    #
    # @param activation_rule [NodeRule] object whose `matches` method
    #   decides whether this action applies to a node
    #
    # @param context [Object] receiver the block is evaluated against;
    #   defaults to the NodeAction instance being created
    #
    # @param block [Proc] code to evaluate on the context; when no block
    #   is given, a lambda returning its single argument is stored instead
    #
    def initialize(activation_rule, context = self, &block)
      @activation_rule = activation_rule
      @ctx = context
      @action = block || ->(e) { e }
    end

    # Asks the activation rule whether it matches the described node.
    #
    # @param element_name [String] the name of a node
    #
    # @param attributes [Hash<String, String>] the attributes of a node
    #
    # @return [Boolean] whatever the activation rule's `matches` returns
    #
    def matches(element_name, attributes)
      @activation_rule.matches(element_name, attributes)
    end

    # Evaluates the stored action with `self` bound to the context.
    #
    # The collected arguments are handed to the action as ONE array (they
    # are not splatted), which is what lets the default one-argument
    # lambda accept any number of call arguments.
    #
    # @param args [Array] arguments collected for the action
    #
    # @return [Object] the value returned by the action
    #
    def call(*args)
      @ctx.instance_exec(args, &@action)
    end
  end

end
|
@@ -0,0 +1,90 @@
|
|
1
|
+
module Saxxy

  ##
  # @author rubymaniac
  #
  # A NodeRule is a predicate over XML nodes: given a node's name and
  # attributes it answers whether the node satisfies the rule.
  #
  # It has two parts. The `element` part constrains the node's name and
  # is either a String (the name must be equal to it) or a Regexp (the
  # name must match it).
  #
  # The `attributes` part constrains the node's attributes. It is a set
  # of key-value pairs: the key names the attribute to check and the
  # value (String, Regexp or nil) states what must hold for it. A nil
  # value is satisfied only when the node does not carry that attribute.
  #
  # @!attribute [r] element
  #   @return [String|Regexp] node's name rule
  #
  # @!attribute [r] attributes
  #   @return [Hash<String, String|Regexp>] node's attributes rule
  ##
  class NodeRule
    attr_reader :element, :attributes

    # Builds a NodeRule from its `element` and `attributes` parts.
    # The attribute keys are passed through Saxxy::Helpers.stringify_keys.
    #
    # @param element [String|Regexp] what should hold for the node name
    # @param attributes [Hash<String, String|Regexp>]
    #   what should hold for node's attributes
    #
    def initialize(element, attributes = {})
      @element = element
      @attributes = Saxxy::Helpers.stringify_keys(attributes)
    end

    # Tests the rule against a node: both the name part and the
    # attributes part must hold.
    #
    # @param element_name [String] node's name
    # @param attrs [Hash<String, String>] node's attributes
    #
    # @return [Boolean] whether this NodeRule matches the node
    #
    def matches(element_name, attrs = {})
      return false unless match_element_name(element_name)

      match_attributes(attrs)
    end

    # Compares this rule with another one, part by part.
    #
    # @param rule [NodeRule] the other NodeRule
    #
    # @return [Boolean] whether both parts are equal to the other rule's
    #
    def equals(rule)
      return false unless element == rule.element

      attributes == rule.attributes
    end

    # Tests only the name part of the rule.
    #
    # @param element_name [String] node's name
    #
    # @return [Boolean] whether this NodeRule matches node's name
    #
    def match_element_name(element_name)
      match(@element, element_name)
    end

    # Tests only the attributes part of the rule. Every attribute named
    # by the rule must hold; attributes the rule does not mention are
    # ignored.
    #
    # @param attrs [Hash<String, String>] node's attributes
    #
    # @return [Boolean] whether this NodeRule matches node's attributes
    #
    def match_attributes(attrs)
      actual = Saxxy::Helpers.stringify_keys(attrs)
      attributes.all? do |name, expected|
        found = actual[name]
        # A missing attribute satisfies the rule only when nil is expected.
        found.nil? ? expected.nil? : match(expected, found)
      end
    end

    private
    # Regexp rules are matched against the value; any other rule is
    # compared with ==.
    def match(obj, value)
      return obj == value unless obj.is_a?(Regexp)

      !obj.match(value).nil?
    end
  end

end
|
@@ -0,0 +1,28 @@
|
|
1
|
+
module Saxxy
  module Parsers

    # Raised by the placeholder parse methods of Base.
    NotImplemented = Class.new(StandardError)

    # Common ancestor of the parser backends. It stores the context tree
    # and the options it was built with; each of its three parse entry
    # points only raises NotImplemented.
    class Base
      attr_reader :context_tree, :options

      # @param context_tree [Object] the context tree the parser works on
      # @param options [Hash] backend options, stored as given
      def initialize(context_tree, options = {})
        @context_tree, @options = context_tree, options
      end

      # Placeholder for parsing the file at `path_to_file`.
      #
      # @raise [NotImplemented] always
      def parse_file(path_to_file)
        raise NotImplemented
      end

      # Placeholder for parsing the markup held in `string`.
      #
      # @raise [NotImplemented] always
      def parse_string(string)
        raise NotImplemented
      end

      # Placeholder for parsing the markup read from `io`.
      #
      # @raise [NotImplemented] always
      def parse_io(io)
        raise NotImplemented
      end
    end

  end
end
|
@@ -0,0 +1,52 @@
|
|
1
|
+
require "saxxy/parsers/base"
|
2
|
+
require "saxxy/callbacks/libxml"
|
3
|
+
|
4
|
+
|
5
|
+
module Saxxy
  module Parsers

    # Parser backend built on LibXML's SAX parser. Events are delivered
    # to a Saxxy::Callbacks::Libxml created from the context tree's root.
    class Libxml < Base
      # @param context_tree [Object] forwarded to Base
      # @param options [Hash] forwarded to Base; `:mode` (:html, :xml or
      #   nil) selects the LibXML parser flags used for every parse
      def initialize(context_tree, options = {})
        super
        @__internal_context_options = context_options_for(options[:mode])
      end

      # Parses markup held in a String.
      #
      # @param string [String] the markup
      # @param encoding the encoding set on the LibXML parser context
      def parse_string(string, encoding = LibXML::XML::Encoding::UTF_8)
        sax_parse(:string, string, encoding)
      end

      # Parses the file found at the given path.
      #
      # @param path_to_file [String] path of the file
      # @param encoding the encoding set on the LibXML parser context
      def parse_file(path_to_file, encoding = LibXML::XML::Encoding::UTF_8)
        sax_parse(:file, path_to_file, encoding)
      end

      # Parses markup read from an IO object.
      #
      # @param io [IO] the source
      # @param encoding the encoding set on the LibXML parser context
      def parse_io(io, encoding = LibXML::XML::Encoding::UTF_8)
        sax_parse(:io, io, encoding)
      end

      private
      # Translates the :mode option into LibXML parser flags. Modes other
      # than :html, :xml and nil produce nil.
      def context_options_for(mode)
        case mode
        when :html, nil
          LibXML::XML::Parser::Options::RECOVER |
            LibXML::XML::Parser::Options::NOERROR |
            LibXML::XML::Parser::Options::NOWARNING |
            LibXML::XML::Parser::Options::NONET
        when :xml
          LibXML::XML::Parser::Options::RECOVER |
            LibXML::XML::Parser::Options::NONET
        end
      end

      # Builds a parser context for the source and runs a SAX parse on it.
      def sax_parse(method, obj, encoding)
        context = build_context(method, obj, encoding)
        parse_with LibXML::XML::SaxParser.new(context)
      end

      # Creates a LibXML parser context through the factory named by
      # `method` (:string, :file or :io) and configures it.
      def build_context(method, obj, encoding)
        ctx = LibXML::XML::Parser::Context.public_send(method, obj)
        ctx.options = @__internal_context_options
        ctx.encoding = encoding
        ctx.recovery = true
        ctx
      end

      # Attaches fresh callbacks to the parser and runs it.
      def parse_with(parser)
        parser.callbacks = Saxxy::Callbacks::Libxml.new(context_tree.root)
        parser.parse
      end
    end

  end
end
|
@@ -0,0 +1,28 @@
|
|
1
|
+
require "saxxy/parsers/base"
|
2
|
+
require "saxxy/callbacks/nokogiri"
|
3
|
+
|
4
|
+
|
5
|
+
module Saxxy
  module Parsers

    # Parser backend built on Nokogiri's HTML SAX parser. Every parse
    # call uses a new parser whose document handler is a
    # Saxxy::Callbacks::Nokogiri created from the context tree's root.
    class Nokogiri < Base
      # Parses markup held in a String.
      #
      # @param string [String] the markup
      # @param encoding [String] encoding name handed to Nokogiri
      # @param block forwarded to Nokogiri's `parse_memory`
      def parse_string(string, encoding = 'UTF-8', &block)
        sax = new_parser
        sax.parse_memory(string, encoding, &block)
      end

      # Parses the file found at the given path.
      #
      # @param path_to_file [String] path of the file
      # @param encoding [String] encoding name handed to Nokogiri
      # @param block forwarded to Nokogiri's `parse_file`
      def parse_file(path_to_file, encoding = 'UTF-8', &block)
        sax = new_parser
        sax.parse_file(path_to_file, encoding, &block)
      end

      # Parses markup read from an IO object.
      #
      # @param io [IO] the source
      # @param encoding [String] encoding name handed to Nokogiri
      # @param block forwarded to Nokogiri's `parse_io`
      def parse_io(io, encoding = 'UTF-8', &block)
        sax = new_parser
        sax.parse_io(io, encoding, &block)
      end

      private
      # Builds a SAX parser wired to fresh callbacks. The leading `::` is
      # needed because this class is itself named Nokogiri.
      def new_parser
        handler = Saxxy::Callbacks::Nokogiri.new(context_tree.root)
        ::Nokogiri::HTML::SAX::Parser.new(handler)
      end
    end

  end
end
|
@@ -0,0 +1,30 @@
|
|
1
|
+
require "saxxy/parsers/base"
|
2
|
+
require "saxxy/callbacks/ox"
|
3
|
+
|
4
|
+
|
5
|
+
module Saxxy
  module Parsers

    # Parser backend built on Ox's SAX parser. Events are delivered to a
    # Saxxy::Callbacks::Ox created from the context tree's root.
    class Ox < Base
      # Parses markup held in a String by wrapping it in a StringIO.
      #
      # @param string [String] the markup
      # @param encoding [String, Encoding, nil] when given, set on the
      #   wrapping StringIO before parsing
      def parse_string(string, encoding = nil)
        parse(StringIO.new(string), encoding)
      end

      # Parses the file found at the given path. The file is opened with
      # the block form of File.open so the handle is closed once parsing
      # finishes, including when the parser raises.
      #
      # @param path_to_file [String] path of the file
      # @param encoding [String, Encoding, nil] when given, set on the
      #   opened file before parsing
      def parse_file(path_to_file, encoding = nil)
        File.open(path_to_file) { |file| parse(file, encoding) }
      end

      # Parses markup read from an IO object supplied by the caller. The
      # IO is not closed here; note that a given encoding is set on it.
      #
      # @param io [IO] the source
      # @param encoding [String, Encoding, nil] when given, set on `io`
      def parse_io(io, encoding = nil)
        parse(io, encoding)
      end

      private
      # Runs Ox's SAX parser over `io`. The parser's own options are
      # merged over the `smart: true` default, so they take precedence.
      def parse(io, encoding)
        io.set_encoding(encoding) if encoding
        callbacks = Saxxy::Callbacks::Ox.new(context_tree.root)
        ::Ox.sax_parse(callbacks, io, {smart: true}.merge(options))
      end
    end

  end
end
|
@@ -0,0 +1,47 @@
|
|
1
|
+
require "saxxy/context_tree"
|
2
|
+
|
3
|
+
|
4
|
+
module Saxxy

  # Parser backends are registered with autoload, so the file of a
  # backend is required only when its constant is first referenced.
  module Parsers
    autoload :Nokogiri, "saxxy/parsers/nokogiri"
    autoload :Ox, "saxxy/parsers/ox"
    autoload :Libxml, "saxxy/parsers/libxml"
  end

  # Entry point that builds a parser backend around a context tree made
  # from the given block, and forwards the parse calls to that backend.
  class Service
    attr_reader :parser

    # @param parser [Symbol, String, Class] a backend name (camelized and
    #   looked up under Saxxy::Parsers) or the backend class itself
    # @param options [Hash] handed to the backend's constructor
    # @param block the block handed to Saxxy::ContextTree.new
    #
    # @raise [ArgumentError] when no block is given
    def initialize(parser, options = {}, &block)
      @parser = build_parser(parser, options, &block)
    end

    # Forwards all arguments to the backend's `parse_file`.
    def parse_file(*args)
      @parser.parse_file(*args)
    end

    # Forwards all arguments to the backend's `parse_string`.
    def parse_string(*args)
      @parser.parse_string(*args)
    end

    # Forwards all arguments to the backend's `parse_io`.
    def parse_io(*args)
      @parser.parse_io(*args)
    end

    private
    # Instantiates the backend with a ContextTree built from the block.
    def build_parser(parser, options, &block)
      # Without a block there is no binding to inspect; fail with a clear
      # message instead of a NoMethodError raised on nil.
      raise ArgumentError, "Saxxy::Service requires a block" unless block

      # The object that was `self` where the block was written; it is
      # passed to the context tree together with the block.
      ctx = eval("self", block.binding)
      parser_class_from(parser).new(Saxxy::ContextTree.new(ctx, &block), options)
    end

    # Resolves a backend name to its class; anything that is not a Symbol
    # or String is returned untouched.
    def parser_class_from(obj)
      case obj
      when Symbol, String
        Saxxy::Parsers.const_get(Saxxy::Helpers.camelize(obj))
      else
        obj
      end
    end
  end

end
|
@@ -0,0 +1,66 @@
|
|
1
|
+
require "net/http"
|
2
|
+
require "uri"
|
3
|
+
|
4
|
+
|
5
|
+
module Saxxy

  # The Agent is a thin wrapper over Net::HTTP (or the class returned by
  # Net::HTTP::Proxy when proxy options are given) meant for crawling.
  # It supports GET and POST through its `get` and `post` methods and
  # keeps the last response it received.
  class Agent
    attr_reader :url, :uri, :proxy, :agent, :response

    # Initializes an agent with optional proxy options.
    # Url: the url the agent is going to use for issuing requests. It can
    #      be replaced later through the `uri=` writer.
    # Options:
    #   - proxy:
    #     - address: The address of the proxy.
    #     - port: The port the proxy will use.
    #     - username: The username if the proxy needs auth.
    #     - password: The password if the proxy needs auth.
    def initialize(url, opts = {})
      @proxy = opts[:proxy] || {}
      @agent = proxy.empty? ? Net::HTTP : Net::HTTP::Proxy(proxy[:address], proxy[:port], proxy[:username], proxy[:password])
      self.uri = url
    end

    # Sets both `uri` and `url` from the argument, which is either a
    # String holding a valid URL or a URI object.
    def uri=(url_or_uri)
      @uri = url_or_uri.is_a?(URI) ? url_or_uri : URI(url_or_uri)
      @url = uri.to_s
    end

    # Issues a GET request, either to the url given as an argument or to
    # the one the agent currently holds, and returns the response body.
    # Note: a given url that differs from the agent's replaces it.
    def get(url = nil)
      issue_request(url, :get_response)
    end

    # Issues a POST request through the underlying agent's `post_form`,
    # either to the url given as an argument or to the one the agent
    # currently holds, and returns the response body. `opts` is forwarded
    # to `post_form` as the form parameters.
    # Note: a given url that differs from the agent's replaces it.
    def post(url = nil, opts = {})
      issue_request(url, :post_form, opts)
    end

    private
    # Replaces the agent's uri when a url is given; does nothing for nil.
    def set_uri_for(url = nil)
      self.uri = url if url
    end

    # Expects (url_or_nil, http_method, *extra) and returns the body of
    # the agent's response.
    #
    # A request is issued when the agent has no response yet, when no url
    # is given, or when the given url differs from the current one. The
    # first condition prevents calling `body` on nil when the very first
    # request names the url the agent was created with.
    #
    # NOTE(review): a call naming the current url after a response exists
    # returns the stored response's body without a new request, for POST
    # as well as GET - confirm this reuse is intended.
    def issue_request(*args)
      new_url_or_uri = args.shift
      http_method = args.shift
      if response.nil? || new_url_or_uri.to_s != url
        set_uri_for(new_url_or_uri)
        @response = agent.public_send(http_method, uri, *args)
      end
      response.body
    end
  end

end
|
@@ -0,0 +1,27 @@
|
|
1
|
+
module Saxxy

  # An Array that can notify observers: one optional callback runs after
  # an object is appended with `<<`, another after `>>` is used to delete
  # an object.
  class CallbackArray < Array
    # Registers the block to run after every `<<`.
    #
    # @return [CallbackArray] self, so registrations can be chained
    def on_add(&block)
      @add_callback = block
      self
    end

    # Registers the block to run after every `>>`.
    #
    # @return [CallbackArray] self, so registrations can be chained
    def on_remove(&block)
      @remove_callback = block
      self
    end

    # Appends `obj`, then runs the add callback (if any) with it.
    #
    # @return [CallbackArray] self
    def <<(obj)
      super
      notify(@add_callback, obj)
    end

    # Deletes every element equal to `obj`, then runs the remove callback
    # (if any) with it. The callback runs whether or not anything was
    # actually deleted.
    #
    # @return [CallbackArray] self
    def >>(obj)
      delete(obj)
      notify(@remove_callback, obj)
    end

    private
    # Invokes the callback when one is registered and returns self.
    def notify(callback, obj)
      callback.call(obj) if callback
      self
    end
  end

end
|