saxxy 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +22 -0
- data/.travis.yml +5 -0
- data/Gemfile +13 -0
- data/LICENSE +22 -0
- data/README.md +117 -0
- data/Rakefile +12 -0
- data/lib/saxxy.rb +2 -0
- data/lib/saxxy/activatable.rb +160 -0
- data/lib/saxxy/callbacks/libxml.rb +26 -0
- data/lib/saxxy/callbacks/nokogiri.rb +30 -0
- data/lib/saxxy/callbacks/ox.rb +66 -0
- data/lib/saxxy/callbacks/sax.rb +86 -0
- data/lib/saxxy/context.rb +88 -0
- data/lib/saxxy/context_tree.rb +85 -0
- data/lib/saxxy/event.rb +83 -0
- data/lib/saxxy/event_registry.rb +122 -0
- data/lib/saxxy/node_action.rb +59 -0
- data/lib/saxxy/node_rule.rb +90 -0
- data/lib/saxxy/parsers/base.rb +28 -0
- data/lib/saxxy/parsers/libxml.rb +52 -0
- data/lib/saxxy/parsers/nokogiri.rb +28 -0
- data/lib/saxxy/parsers/ox.rb +30 -0
- data/lib/saxxy/service.rb +47 -0
- data/lib/saxxy/utils/agent.rb +66 -0
- data/lib/saxxy/utils/callback_array.rb +27 -0
- data/lib/saxxy/utils/helpers.rb +13 -0
- data/lib/saxxy/version.rb +3 -0
- data/saxxy.gemspec +21 -0
- data/spec/saxxy/activatable_spec.rb +344 -0
- data/spec/saxxy/callbacks/sax_spec.rb +456 -0
- data/spec/saxxy/context_spec.rb +51 -0
- data/spec/saxxy/context_tree_spec.rb +68 -0
- data/spec/saxxy/event_registry_spec.rb +137 -0
- data/spec/saxxy/event_spec.rb +49 -0
- data/spec/saxxy/node_action_spec.rb +46 -0
- data/spec/saxxy/node_rule_spec.rb +99 -0
- data/spec/saxxy/parsers/libxml_spec.rb +104 -0
- data/spec/saxxy/parsers/nokogiri_spec.rb +200 -0
- data/spec/saxxy/parsers/ox_spec.rb +175 -0
- data/spec/saxxy/utils/agent_spec.rb +63 -0
- data/spec/spec_helper.rb +28 -0
- data/spec/support/agent_macros.rb +24 -0
- metadata +155 -0
@@ -0,0 +1,59 @@
|
|
1
|
+
module Saxxy

  ##
  # @author rubymaniac
  #
  # A NodeAction couples a piece of code with the rule that decides
  # whether that code applies to a given node. The rule is handed in as
  # the first constructor argument and is consulted through #matches.
  #
  # @!attribute [r] activation_rule
  #   @return [NodeRule] the rule that decides whether this action applies
  #
  # @!attribute [r] action
  #   @return [Proc] the code that will run on a node
  ##
  class NodeAction
    attr_reader :activation_rule, :action

    # Builds a NodeAction out of a rule, an evaluation context and a block.
    #
    # @param activation_rule [NodeRule] the object #matches delegates to
    #
    # @param context [Object] the receiver the block is evaluated on;
    #   when omitted it is the NodeAction being constructed
    #
    # @param block [Proc] the code to evaluate on the context; when no
    #   block is supplied an identity lambda (returns its one argument)
    #   is stored instead
    #
    def initialize(activation_rule, context = self, &block)
      @activation_rule = activation_rule
      @ctx = context
      @action = block || lambda { |e| e }
    end

    # Asks the `activation_rule` whether it matches the node.
    #
    # @param element_name [String] the name of a node
    #
    # @param attributes [Hash<String, String>] the attributes of a node
    #
    # @return [Boolean] whatever the rule's `matches` returns
    #
    def matches(element_name, attributes)
      rule = activation_rule
      rule.matches(element_name, attributes)
    end

    # Runs the stored block with the context as `self`.
    #
    # The collected arguments are handed to the block as ONE array. A
    # regular (non-lambda) block that declares several parameters has that
    # array spread over them by Ruby's block-argument rules.
    #
    # @param args [Array] the arguments forwarded to the block
    #
    # @return [Object] the value the block evaluates to
    #
    def call(*args)
      receiver = @ctx
      receiver.instance_exec(args, &action)
    end
  end

end
|
@@ -0,0 +1,90 @@
|
|
1
|
+
module Saxxy

  ##
  # @author rubymaniac
  #
  # A NodeRule is a predicate over XML nodes: given a node's name and
  # attributes it tells whether the node satisfies the rule.
  #
  # It is made of two parts. The `element` part constrains the node's
  # name and is either a String (compared with ==) or a Regexp (which
  # must match the name).
  #
  # The `attributes` part constrains the node's attributes. It is a set
  # of key-value pairs: the key names the attribute to inspect and the
  # value is a String or Regexp the attribute must satisfy, or nil to
  # require that the attribute is absent.
  #
  # @!attribute [r] element
  #   @return [String|Regexp] node's name rule
  #
  # @!attribute [r] attributes
  #   @return [Hash<String, String|Regexp>] node's attributes rule
  ##
  class NodeRule
    attr_reader :element, :attributes

    # Creates a NodeRule from its `element` and `attributes` parts.
    #
    # @param element [String|Regexp] constraint on the node name
    # @param attributes [Hash<String, String|Regexp>]
    #   constraints on the node's attributes; passed through
    #   Saxxy::Helpers.stringify_keys before being stored
    #
    def initialize(element, attributes = {})
      @element = element
      @attributes = Saxxy::Helpers.stringify_keys(attributes)
    end

    # Tells whether both the name and the attributes of a node satisfy
    # this rule. Attributes are only inspected once the name has matched.
    #
    # @param element_name [String] node's name
    # @param attrs [Hash<String, String>] node's attributes
    #
    # @return [Boolean] whether this NodeRule matches the node
    #
    def matches(element_name, attrs = {})
      return false unless match_element_name(element_name)

      match_attributes(attrs)
    end

    # Tells whether another rule has the same element and attributes parts.
    #
    # @param rule [NodeRule] the other NodeRule
    #
    # @return [Boolean] whether this NodeRule equals rule
    #
    def equals(rule)
      same_element = element == rule.element
      same_element && attributes == rule.attributes
    end

    # Tells whether the `element` part alone accepts a node name.
    #
    # @param element_name [String] node's name
    #
    # @return [Boolean] whether this NodeRule matches node's name
    #
    def match_element_name(element_name)
      match(element, element_name)
    end

    # Tells whether the `attributes` part alone accepts a node's attributes.
    #
    # Every pair of the rule must hold: when the node carries the
    # attribute, its value must satisfy the expectation; when it does not,
    # the expectation itself must be nil.
    #
    # @param attrs [Hash<String, String>] node's attributes
    #
    # @return [Boolean] whether this NodeRule matches node's attributes
    #
    def match_attributes(attrs)
      node_attrs = Saxxy::Helpers.stringify_keys(attrs)
      attributes.all? do |name, expected|
        actual = node_attrs[name]
        if actual.nil?
          expected.nil?
        else
          match(expected, actual)
        end
      end
    end

    private

    # Regexp expectations must match the value; anything else is compared
    # with ==.
    def match(obj, value)
      return obj == value unless obj.is_a?(Regexp)

      !obj.match(value).nil?
    end
  end

end
|
@@ -0,0 +1,28 @@
|
|
1
|
+
module Saxxy
  module Parsers

    # Raised by the Base entry points, which concrete parsers are expected
    # to override.
    NotImplemented = Class.new(StandardError)

    # Common skeleton of the parser backends: it stores the context tree and
    # the options, and declares the three parsing entry points. Each entry
    # point raises NotImplemented here.
    class Base
      attr_reader :context_tree, :options

      # @param context_tree [Object] stored as-is and exposed via #context_tree
      # @param options [Hash] stored as-is and exposed via #options
      def initialize(context_tree, options = {})
        @options = options
        @context_tree = context_tree
      end

      # @param path_to_file [String] unused here
      # @raise [NotImplemented] always
      def parse_file(path_to_file)
        raise NotImplemented
      end

      # @param string [String] unused here
      # @raise [NotImplemented] always
      def parse_string(string)
        raise NotImplemented
      end

      # @param io [IO] unused here
      # @raise [NotImplemented] always
      def parse_io(io)
        raise NotImplemented
      end
    end

  end
end
|
@@ -0,0 +1,52 @@
|
|
1
|
+
require "saxxy/parsers/base"
|
2
|
+
require "saxxy/callbacks/libxml"
|
3
|
+
|
4
|
+
|
5
|
+
module Saxxy
  module Parsers

    # Parser backend that drives LibXML::XML::SaxParser with Saxxy's LibXML
    # callbacks.
    #
    # The `:mode` entry of the options picks the LibXML parser option flags:
    # `:html` (or no mode at all) selects RECOVER, NOERROR, NOWARNING and
    # NONET; `:xml` selects RECOVER and NONET. Any other mode leaves the
    # flags as nil.
    class Libxml < Base
      # @param context_tree [Object] object whose `root` is handed to the
      #   callbacks on every parse
      # @param options [Hash] parser options; only `:mode` is read here
      def initialize(context_tree, options = {})
        super
        @__internal_context_options = context_options_for(options[:mode])
      end

      # Parses XML/HTML held in a String.
      #
      # @param string [String] the document
      # @param encoding [Integer] a LibXML::XML::Encoding constant
      def parse_string(string, encoding = LibXML::XML::Encoding::UTF_8)
        parse_source(:string, string, encoding)
      end

      # Parses the document stored at a path.
      #
      # @param path_to_file [String] path of the document
      # @param encoding [Integer] a LibXML::XML::Encoding constant
      def parse_file(path_to_file, encoding = LibXML::XML::Encoding::UTF_8)
        parse_source(:file, path_to_file, encoding)
      end

      # Parses a document read from an IO object.
      #
      # @param io [IO] the source stream
      # @param encoding [Integer] a LibXML::XML::Encoding constant
      def parse_io(io, encoding = LibXML::XML::Encoding::UTF_8)
        parse_source(:io, io, encoding)
      end

      private

      # Translates the requested mode into LibXML parser option flags.
      def context_options_for(mode)
        case mode
        when nil, :html
          [
            LibXML::XML::Parser::Options::RECOVER,
            LibXML::XML::Parser::Options::NOERROR,
            LibXML::XML::Parser::Options::NOWARNING,
            LibXML::XML::Parser::Options::NONET
          ].reduce(:|)
        when :xml
          [
            LibXML::XML::Parser::Options::RECOVER,
            LibXML::XML::Parser::Options::NONET
          ].reduce(:|)
        end
      end

      # Shared path of the three public entry points: builds the parser
      # context for the given kind of source and runs a SAX parser over it.
      def parse_source(kind, source, encoding)
        parse_with LibXML::XML::SaxParser.new(build_context(kind, source, encoding))
      end

      # Creates a LibXML parser context through the factory named by
      # `method` (:string, :file or :io) and configures flags, encoding and
      # recovery on it.
      def build_context(method, obj, encoding)
        ctx = LibXML::XML::Parser::Context.public_send(method, obj)
        ctx.options = @__internal_context_options
        ctx.encoding = encoding
        ctx.recovery = true
        ctx
      end

      # Attaches fresh callbacks bound to the context tree's root and parses.
      def parse_with(parser)
        parser.callbacks = Saxxy::Callbacks::Libxml.new(context_tree.root)
        parser.parse
      end
    end

  end
end
|
@@ -0,0 +1,28 @@
|
|
1
|
+
require "saxxy/parsers/base"
|
2
|
+
require "saxxy/callbacks/nokogiri"
|
3
|
+
|
4
|
+
|
5
|
+
module Saxxy
  module Parsers

    # Parser backend that feeds documents to Nokogiri's HTML SAX parser,
    # using Saxxy's Nokogiri callbacks as the SAX document handler.
    #
    # Every entry point builds a fresh parser and forwards an optional block
    # to the corresponding Nokogiri method.
    class Nokogiri < Base
      # Parses a document held in memory.
      #
      # @param string [String] the document
      # @param encoding [String] encoding name passed on to Nokogiri
      def parse_string(string, encoding = 'UTF-8', &block)
        sax_parser = new_parser
        sax_parser.parse_memory(string, encoding, &block)
      end

      # Parses the document stored at a path.
      #
      # @param path_to_file [String] path of the document
      # @param encoding [String] encoding name passed on to Nokogiri
      def parse_file(path_to_file, encoding = 'UTF-8', &block)
        sax_parser = new_parser
        sax_parser.parse_file(path_to_file, encoding, &block)
      end

      # Parses a document read from an IO object.
      #
      # @param io [IO] the source stream
      # @param encoding [String] encoding name passed on to Nokogiri
      def parse_io(io, encoding = 'UTF-8', &block)
        sax_parser = new_parser
        sax_parser.parse_io(io, encoding, &block)
      end

      private

      # Builds an HTML SAX parser whose handler is bound to the root of the
      # context tree.
      def new_parser
        ::Nokogiri::HTML::SAX::Parser.new(
          Saxxy::Callbacks::Nokogiri.new(context_tree.root)
        )
      end
    end

  end
end
|
@@ -0,0 +1,30 @@
|
|
1
|
+
require "saxxy/parsers/base"
|
2
|
+
require "saxxy/callbacks/ox"
|
3
|
+
|
4
|
+
|
5
|
+
module Saxxy
  module Parsers

    # Parser backend that runs Ox's SAX parser with Saxxy's Ox callbacks.
    #
    # All entry points end up in #parse, which works on an IO-like object.
    # The options given to the constructor are merged over the default
    # `{smart: true}` and handed to Ox.sax_parse, so callers can override it.
    class Ox < Base
      # Parses a document held in a String by wrapping it in a StringIO.
      #
      # @param string [String] the document
      # @param encoding [String, Encoding, nil] when given, applied to the
      #   stream with `set_encoding` before parsing
      def parse_string(string, encoding = nil)
        parse(StringIO.new(string), encoding)
      end

      # Parses the document stored at a path.
      #
      # The file is opened with the block form of File.open so that the
      # descriptor is closed as soon as parsing finishes, including when the
      # parser or a callback raises.
      #
      # @param path_to_file [String] path of the document
      # @param encoding [String, Encoding, nil] when given, applied to the
      #   stream with `set_encoding` before parsing
      # @return [Object] whatever Ox.sax_parse returns
      def parse_file(path_to_file, encoding = nil)
        File.open(path_to_file) { |file| parse(file, encoding) }
      end

      # Parses a document read from an IO object supplied by the caller.
      # The stream is not closed here; it stays under the caller's control.
      #
      # @param io [IO] the source stream
      # @param encoding [String, Encoding, nil] when given, applied to the
      #   stream with `set_encoding` before parsing
      def parse_io(io, encoding = nil)
        parse(io, encoding)
      end

      private

      # Applies the optional encoding to the stream, then runs Ox's SAX
      # parser with callbacks bound to the root of the context tree.
      def parse(io, encoding)
        io.set_encoding(encoding) if encoding
        callbacks = Saxxy::Callbacks::Ox.new(context_tree.root)
        ::Ox.sax_parse(callbacks, io, {smart: true}.merge(options))
      end
    end

  end
end
|
@@ -0,0 +1,47 @@
|
|
1
|
+
require "saxxy/context_tree"
|
2
|
+
|
3
|
+
|
4
|
+
module Saxxy

  module Parsers
    # Parser backends are loaded lazily, on first reference to the constant.
    autoload :Nokogiri, "saxxy/parsers/nokogiri"
    autoload :Ox, "saxxy/parsers/ox"
    autoload :Libxml, "saxxy/parsers/libxml"
  end

  # Entry point that wires a parser backend to a context tree built from a
  # block, and delegates the three parsing calls to that backend.
  #
  # @!attribute [r] parser
  #   @return [Object] the parser instance built by the constructor
  class Service
    attr_reader :parser

    # @param parser [Symbol, String, Class] either a name resolved inside
    #   Saxxy::Parsers (after Saxxy::Helpers.camelize) or an object used
    #   directly as the parser class
    # @param options [Hash] passed untouched to the parser's constructor
    # @param block [Proc] handed to Saxxy::ContextTree.new together with
    #   the `self` of the place where the block was written
    # @raise [ArgumentError] when no block is given
    def initialize(parser, options = {}, &block)
      @parser = build_parser(parser, options, &block)
    end

    # Delegates to the parser's `parse_file`, forwarding arguments and an
    # optional block.
    def parse_file(*args, &block)
      @parser.parse_file(*args, &block)
    end

    # Delegates to the parser's `parse_string`, forwarding arguments and an
    # optional block.
    def parse_string(*args, &block)
      @parser.parse_string(*args, &block)
    end

    # Delegates to the parser's `parse_io`, forwarding arguments and an
    # optional block.
    def parse_io(*args, &block)
      @parser.parse_io(*args, &block)
    end

    private

    # Builds the context tree and instantiates the parser class with it.
    def build_parser(parser, options, &block)
      # Without a block there is no binding to read `self` from; fail with a
      # clear message instead of a NoMethodError on nil.
      raise ArgumentError, "a block describing the context tree is required" unless block

      # `self` as seen where the block was defined becomes the tree's context.
      ctx = eval("self", block.binding)
      parser_class_from(parser).new(Saxxy::ContextTree.new(ctx, &block), options)
    end

    # Resolves a Symbol/String to a constant under Saxxy::Parsers; any other
    # object is returned unchanged and used as the parser class.
    def parser_class_from(obj)
      case obj
      when Symbol, String
        Saxxy::Parsers.const_get(Saxxy::Helpers.camelize(obj))
      else
        obj
      end
    end
  end

end
|
@@ -0,0 +1,66 @@
|
|
1
|
+
require "net/http"
|
2
|
+
require "uri"
|
3
|
+
|
4
|
+
|
5
|
+
module Saxxy

  # The Agent is a thin wrapper over Net::HTTP (or a Net::HTTP::Proxy class
  # when proxy options are given) meant for crawling. It supports GET and
  # POST through its #get and #post methods and remembers the last URL and
  # response.
  #
  # @!attribute [r] url
  #   @return [String] string form of the current URI
  # @!attribute [r] uri
  #   @return [URI] the URI requests are sent to
  # @!attribute [r] proxy
  #   @return [Hash] the proxy options given to the constructor (or {})
  # @!attribute [r] agent
  #   @return [Class] Net::HTTP or the proxy class built from the options
  # @!attribute [r] response
  #   @return [Object, nil] the response of the last request, nil before
  #     the first one
  class Agent
    attr_reader :url, :uri, :proxy, :agent, :response

    # Initializes an agent with optional proxy options.
    #
    # @param url [String, URI] the URL the agent uses for issuing requests;
    #   it can be replaced later through #uri=
    # @param opts [Hash] options:
    #   - proxy:
    #     - address: The address of the proxy.
    #     - port: The port the proxy will use.
    #     - username: The username if the proxy needs auth.
    #     - password: The password if the proxy needs auth.
    def initialize(url, opts = {})
      @proxy = opts[:proxy] || {}
      @agent = proxy.empty? ? Net::HTTP : Net::HTTP::Proxy(proxy[:address], proxy[:port], proxy[:username], proxy[:password])
      self.uri = url
    end

    # Sets both `uri` and `url` from the argument.
    #
    # @param url_or_uri [String, URI] a valid URL string or a URI object
    def uri=(url_or_uri)
      @uri = url_or_uri.is_a?(URI) ? url_or_uri : URI(url_or_uri)
      @url = uri.to_s
    end

    # Issues a GET request to the given URL, or to the one the agent
    # currently holds when none is given. A URL that differs from the
    # agent's replaces it.
    #
    # @param url [String, URI, nil] optional target
    # @return [String] the body of the response
    def get(url = nil)
      issue_request(url, :get_response)
    end

    # Issues a POST request through the agent's `post_form`, to the given
    # URL or to the one the agent currently holds. A URL that differs from
    # the agent's replaces it.
    #
    # @param url [String, URI, nil] optional target
    # @param opts [Hash] form parameters forwarded to `post_form`
    # @return [String] the body of the response
    def post(url = nil, opts = {})
      issue_request(url, :post_form, opts)
    end

    private

    # Replaces the agent's URI unless the argument is nil/false.
    def set_uri_for(url = nil)
      self.uri = url if url
    end

    # Sends the request named by `http_method` and returns the response body.
    #
    # Only the URI update depends on whether a different URL was given; the
    # request itself is always sent. Tying the request to the URL change
    # made `get(url)` / `post(url, ...)` with the agent's current URL skip
    # the request, so it returned a stale body or failed on a nil response.
    def issue_request(new_url_or_uri, http_method, *args)
      set_uri_for(new_url_or_uri) if new_url_or_uri.to_s != url
      @response = agent.public_send(http_method, uri, *args)
      response.body
    end
  end

end
|
@@ -0,0 +1,27 @@
|
|
1
|
+
module Saxxy

  # An Array that can notify observers: one optional hook runs after an
  # item is appended with `<<`, another after `>>` is asked to remove an
  # item. All four methods return the array itself so calls can be chained.
  class CallbackArray < Array
    # Registers the hook invoked with each object appended through `<<`.
    #
    # @return [CallbackArray] self
    def on_add(&block)
      tap { @add_callback = block }
    end

    # Registers the hook invoked with each object passed to `>>`.
    #
    # @return [CallbackArray] self
    def on_remove(&block)
      tap { @remove_callback = block }
    end

    # Appends the object, then runs the add hook if one is registered.
    #
    # @param obj [Object] the item to append
    # @return [CallbackArray] self
    def <<(obj)
      super(obj)
      hook = @add_callback
      hook.call(obj) if hook
      self
    end

    # Deletes the object (every element equal to it), then runs the remove
    # hook if one is registered. The hook runs whether or not anything was
    # actually deleted.
    #
    # @param obj [Object] the item to remove
    # @return [CallbackArray] self
    def >>(obj)
      delete(obj)
      hook = @remove_callback
      hook.call(obj) if hook
      self
    end
  end

end
|