saxxy 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +22 -0
  3. data/.travis.yml +5 -0
  4. data/Gemfile +13 -0
  5. data/LICENSE +22 -0
  6. data/README.md +117 -0
  7. data/Rakefile +12 -0
  8. data/lib/saxxy.rb +2 -0
  9. data/lib/saxxy/activatable.rb +160 -0
  10. data/lib/saxxy/callbacks/libxml.rb +26 -0
  11. data/lib/saxxy/callbacks/nokogiri.rb +30 -0
  12. data/lib/saxxy/callbacks/ox.rb +66 -0
  13. data/lib/saxxy/callbacks/sax.rb +86 -0
  14. data/lib/saxxy/context.rb +88 -0
  15. data/lib/saxxy/context_tree.rb +85 -0
  16. data/lib/saxxy/event.rb +83 -0
  17. data/lib/saxxy/event_registry.rb +122 -0
  18. data/lib/saxxy/node_action.rb +59 -0
  19. data/lib/saxxy/node_rule.rb +90 -0
  20. data/lib/saxxy/parsers/base.rb +28 -0
  21. data/lib/saxxy/parsers/libxml.rb +52 -0
  22. data/lib/saxxy/parsers/nokogiri.rb +28 -0
  23. data/lib/saxxy/parsers/ox.rb +30 -0
  24. data/lib/saxxy/service.rb +47 -0
  25. data/lib/saxxy/utils/agent.rb +66 -0
  26. data/lib/saxxy/utils/callback_array.rb +27 -0
  27. data/lib/saxxy/utils/helpers.rb +13 -0
  28. data/lib/saxxy/version.rb +3 -0
  29. data/saxxy.gemspec +21 -0
  30. data/spec/saxxy/activatable_spec.rb +344 -0
  31. data/spec/saxxy/callbacks/sax_spec.rb +456 -0
  32. data/spec/saxxy/context_spec.rb +51 -0
  33. data/spec/saxxy/context_tree_spec.rb +68 -0
  34. data/spec/saxxy/event_registry_spec.rb +137 -0
  35. data/spec/saxxy/event_spec.rb +49 -0
  36. data/spec/saxxy/node_action_spec.rb +46 -0
  37. data/spec/saxxy/node_rule_spec.rb +99 -0
  38. data/spec/saxxy/parsers/libxml_spec.rb +104 -0
  39. data/spec/saxxy/parsers/nokogiri_spec.rb +200 -0
  40. data/spec/saxxy/parsers/ox_spec.rb +175 -0
  41. data/spec/saxxy/utils/agent_spec.rb +63 -0
  42. data/spec/spec_helper.rb +28 -0
  43. data/spec/support/agent_macros.rb +24 -0
  44. metadata +155 -0
@@ -0,0 +1,59 @@
1
module Saxxy

  ##
  # @author rubymaniac
  #
  # A NodeAction couples an activation rule with a block of code. The rule
  # answers the question "does this action apply to the node?", and the
  # block is what gets run through {#call}.
  #
  # @!attribute [r] activation_rule
  #   @return [NodeRule] the rule consulted by {#matches}
  #
  # @!attribute [r] action
  #   @return [Proc] the block of code that will run on a node
  ##
  class NodeAction
    attr_reader :activation_rule, :action

    # Builds an action guarded by `activation_rule`.
    #
    # @param activation_rule [NodeRule] rule used to decide whether this
    #   action applies to a node (it only needs to respond to `matches`)
    #
    # @param context [Object] the object the block is evaluated against;
    #   when omitted it is the NodeAction being constructed
    #
    # @param block [Proc] the code to run; when no block is given an
    #   identity lambda (returning its single argument) is stored instead
    #
    def initialize(activation_rule, context = self, &block)
      @activation_rule = activation_rule
      @ctx = context
      @action = block || ->(e) { e }
    end

    # Asks the `activation_rule` whether it matches the described node.
    #
    # @param element_name [String] the name of a node
    #
    # @param attributes [Hash<String, String>] the attributes of a node
    #
    # @return [Boolean] whatever the activation rule answers
    #
    def matches(element_name, attributes)
      rule = activation_rule
      rule.matches(element_name, attributes)
    end

    # Runs the stored block with `self` rebound to the context given to
    # the constructor.
    #
    # The arguments are handed to the block as ONE array. A regular
    # (non-lambda) block declaring several parameters destructures that
    # array; a block with a single parameter receives the whole array.
    #
    # @param args [Array] variable arguments that are passed to the block
    #
    # @return [Object] the value returned by the block
    #
    def call(*args)
      receiver = @ctx
      receiver.instance_exec(args, &action)
    end
  end

end
@@ -0,0 +1,90 @@
1
module Saxxy

  ##
  # @author rubymaniac
  #
  # A NodeRule is a predicate over an XML node: given the node's name and
  # attributes it tells whether the node satisfies the rule.
  #
  # It has two parts. The `element` part constrains the node's name and is
  # either a String (the name must be equal to it) or a Regexp (the name
  # must match it).
  #
  # The `attributes` part constrains the node's attributes. It is a set of
  # key-value pairs where the key names the attribute to check and the
  # value (String, Regexp or nil) is what must hold for that attribute.
  # A nil value means the attribute must be absent.
  #
  # @!attribute [r] element
  #   @return [String|Regexp] node's name rule
  #
  # @!attribute [r] attributes
  #   @return [Hash<String, String|Regexp>] node's attributes rule
  ##
  class NodeRule
    attr_reader :element, :attributes

    # Creates a rule from an `element` part and an `attributes` part.
    # Attribute keys are converted to strings.
    #
    # @param element [String|Regexp] what should hold for the node name
    # @param attributes [Hash<String, String|Regexp>]
    #   what should hold for node's attributes
    #
    def initialize(element, attributes = {})
      @element = element
      @attributes = Saxxy::Helpers.stringify_keys(attributes)
    end

    # Tells whether both the name and the attributes of a node satisfy
    # this rule.
    #
    # @param element_name [String] node's name
    # @param attrs [Hash<String, String>] node's attributes
    #
    # @return [Boolean] whether this NodeRule matches the node
    #
    def matches(element_name, attrs = {})
      return false unless match_element_name(element_name)

      match_attributes(attrs)
    end

    # Tells whether another rule has the same element and attributes parts.
    #
    # @param rule [NodeRule] the other NodeRule
    #
    # @return [Boolean] whether this NodeRule equals rule
    #
    def equals(rule)
      same_element = element == rule.element
      same_element && attributes == rule.attributes
    end

    # Checks only the name part of the rule against a node's name.
    #
    # @param element_name [String] node's name
    #
    # @return [Boolean] whether this NodeRule matches node's name
    #
    def match_element_name(element_name)
      match(element, element_name)
    end

    # Checks only the attributes part of the rule against a node's
    # attributes. Every rule attribute must hold; attributes of the node
    # that the rule does not mention are ignored.
    #
    # @param attrs [Hash<String, String>] node's attributes
    #
    # @return [Boolean] whether this NodeRule matches node's attributes
    #
    def match_attributes(attrs)
      actual = Saxxy::Helpers.stringify_keys(attrs)
      attributes.all? do |name, expected|
        found = actual[name]
        # A missing attribute only satisfies a rule that expects nil.
        found.nil? ? expected.nil? : match(expected, found)
      end
    end

    private

    # Regexp rules are matched against the value, anything else is
    # compared with ==.
    def match(obj, value)
      return obj == value unless obj.is_a?(Regexp)

      !obj.match(value).nil?
    end
  end

end
@@ -0,0 +1,28 @@
1
module Saxxy
  module Parsers

    # Raised by the Base parser for every parsing entry point that a
    # concrete parser has not overridden.
    NotImplemented = Class.new(StandardError)

    # Common superclass of the concrete parsers. It stores the context
    # tree and the options, and declares the three parsing entry points,
    # each of which raises NotImplemented here.
    class Base
      attr_reader :context_tree, :options

      # @param context_tree [Object] the context tree the parser works on
      # @param options [Hash] parser specific options
      def initialize(context_tree, options = {})
        @context_tree = context_tree
        @options = options
      end

      # @param path_to_file [String] path of the file to parse
      # @raise [NotImplemented] always, in this base class
      def parse_file(path_to_file)
        fail NotImplemented
      end

      # @param string [String] the markup to parse
      # @raise [NotImplemented] always, in this base class
      def parse_string(string)
        fail NotImplemented
      end

      # @param io [IO] the stream to parse
      # @raise [NotImplemented] always, in this base class
      def parse_io(io)
        fail NotImplemented
      end
    end

  end
end
@@ -0,0 +1,52 @@
1
+ require "saxxy/parsers/base"
2
+ require "saxxy/callbacks/libxml"
3
+
4
+
5
module Saxxy
  module Parsers

    # Parser that drives LibXML's SAX parser with Saxxy's Libxml callbacks.
    #
    # The `:mode` option selects the libxml parser flags:
    # * `:html` (also used when no mode is given) - RECOVER, NOERROR,
    #   NOWARNING and NONET
    # * `:xml` - RECOVER and NONET
    # Any other mode leaves the flags as nil.
    class Libxml < Base
      # @param context_tree [Object] the context tree whose root receives
      #   the SAX events
      # @param options [Hash] parser options; only `:mode` is read here
      def initialize(context_tree, options = {})
        super
        @__internal_context_options = context_options_for(options[:mode])
      end

      # Parses markup held in a string.
      #
      # @param string [String] the markup
      # @param encoding [Object] a LibXML::XML::Encoding constant
      def parse_string(string, encoding = LibXML::XML::Encoding::UTF_8)
        parse_with sax_parser_for(:string, string, encoding)
      end

      # Parses the file found at the given path.
      #
      # @param path_to_file [String] path of the file
      # @param encoding [Object] a LibXML::XML::Encoding constant
      def parse_file(path_to_file, encoding = LibXML::XML::Encoding::UTF_8)
        parse_with sax_parser_for(:file, path_to_file, encoding)
      end

      # Parses markup read from an IO.
      #
      # @param io [IO] the stream to read from
      # @param encoding [Object] a LibXML::XML::Encoding constant
      def parse_io(io, encoding = LibXML::XML::Encoding::UTF_8)
        parse_with sax_parser_for(:io, io, encoding)
      end

      private

      # Combines the libxml parser flags for the requested mode.
      def context_options_for(mode)
        flags = LibXML::XML::Parser::Options
        case mode
        when :html, nil
          [flags::RECOVER, flags::NOERROR, flags::NOWARNING, flags::NONET].reduce(:|)
        when :xml
          flags::RECOVER | flags::NONET
        end
      end

      # Wraps a freshly built parser context in a SAX parser.
      def sax_parser_for(kind, source, encoding)
        LibXML::XML::SaxParser.new(build_context(kind, source, encoding))
      end

      # Builds a parser context through the Context factory named by
      # `method` (:string, :file or :io) and configures it with the
      # flags, the encoding and recovery turned on.
      def build_context(method, obj, encoding)
        ctx = LibXML::XML::Parser::Context.public_send(method, obj)
        ctx.options = @__internal_context_options
        ctx.encoding = encoding
        ctx.recovery = true
        ctx
      end

      # Attaches Saxxy's callbacks (built on the context tree's root) to
      # the parser and runs it.
      def parse_with(parser)
        parser.callbacks = Saxxy::Callbacks::Libxml.new(context_tree.root)
        parser.parse
      end
    end

  end
end
@@ -0,0 +1,28 @@
1
+ require "saxxy/parsers/base"
2
+ require "saxxy/callbacks/nokogiri"
3
+
4
+
5
module Saxxy
  module Parsers

    # Parser that feeds Nokogiri's HTML SAX parser with Saxxy's Nokogiri
    # callbacks. A new SAX parser is created for every parse call, and
    # any block given is forwarded to the underlying Nokogiri call.
    class Nokogiri < Base
      # Parses markup held in a string.
      #
      # @param string [String] the markup
      # @param encoding [String] name of the encoding
      def parse_string(string, encoding = 'UTF-8', &block)
        sax = new_parser
        sax.parse_memory(string, encoding, &block)
      end

      # Parses the file found at the given path.
      #
      # @param path_to_file [String] path of the file
      # @param encoding [String] name of the encoding
      def parse_file(path_to_file, encoding = 'UTF-8', &block)
        sax = new_parser
        sax.parse_file(path_to_file, encoding, &block)
      end

      # Parses markup read from an IO.
      #
      # @param io [IO] the stream to read from
      # @param encoding [String] name of the encoding
      def parse_io(io, encoding = 'UTF-8', &block)
        sax = new_parser
        sax.parse_io(io, encoding, &block)
      end

      private

      # Builds a SAX parser whose document handler is a Saxxy callbacks
      # object created on the context tree's root.
      def new_parser
        handler = Saxxy::Callbacks::Nokogiri.new(context_tree.root)
        ::Nokogiri::HTML::SAX::Parser.new(handler)
      end
    end

  end
end
@@ -0,0 +1,30 @@
1
+ require "saxxy/parsers/base"
2
+ require "saxxy/callbacks/ox"
3
+
4
+
5
module Saxxy
  module Parsers

    # Parser that runs Ox's SAX parser with Saxxy's Ox callbacks. Every
    # input kind is turned into an IO and handed to the private #parse.
    class Ox < Base
      # Parses markup held in a string (wrapped in a StringIO).
      #
      # @param string [String] the markup
      # @param encoding [String, Encoding, nil] when given, it is set on
      #   the IO before parsing
      def parse_string(string, encoding = nil)
        parse(StringIO.new(string), encoding)
      end

      # Parses the file found at the given path.
      #
      # The file is opened with the block form of File.open so that the
      # descriptor is closed as soon as parsing finishes, even when the
      # parser raises.
      #
      # @param path_to_file [String] path of the file
      # @param encoding [String, Encoding, nil] when given, it is set on
      #   the IO before parsing
      def parse_file(path_to_file, encoding = nil)
        File.open(path_to_file) { |file| parse(file, encoding) }
      end

      # Parses markup read from an IO. The IO is left open; closing it is
      # up to the caller.
      #
      # @param io [IO] the stream to read from
      # @param encoding [String, Encoding, nil] when given, it is set on
      #   the IO before parsing (this mutates the caller's IO)
      def parse_io(io, encoding = nil)
        parse(io, encoding)
      end

      private

      # Runs Ox.sax_parse over the IO. The parser options given to the
      # constructor are merged over the `smart: true` default.
      def parse(io, encoding)
        io.set_encoding(encoding) if encoding
        callbacks = Saxxy::Callbacks::Ox.new(context_tree.root)
        ::Ox.sax_parse(callbacks, io, {smart: true}.merge(options))
      end
    end

  end
end
@@ -0,0 +1,47 @@
1
+ require "saxxy/context_tree"
2
+
3
+
4
module Saxxy

  module Parsers
    autoload :Nokogiri, "saxxy/parsers/nokogiri"
    autoload :Ox, "saxxy/parsers/ox"
    autoload :Libxml, "saxxy/parsers/libxml"
  end

  # Entry point that ties a parser to a context tree. The block given to
  # the constructor is used to build a Saxxy::ContextTree, and the parsing
  # calls are forwarded to the chosen parser.
  class Service
    attr_reader :parser

    # @param parser [Symbol, String, Class] either the name of a parser
    #   under Saxxy::Parsers (e.g. :nokogiri, :ox, :libxml) or a parser
    #   class
    # @param options [Hash] options handed to the parser's constructor
    # @param block [Proc] the block from which the context tree is built
    #
    # @raise [ArgumentError] when no block is given
    def initialize(parser, options = {}, &block)
      @parser = build_parser(parser, options, &block)
    end

    # Forwards all arguments to the parser's parse_file.
    def parse_file(*args)
      @parser.parse_file(*args)
    end

    # Forwards all arguments to the parser's parse_string.
    def parse_string(*args)
      @parser.parse_string(*args)
    end

    # Forwards all arguments to the parser's parse_io.
    def parse_io(*args)
      @parser.parse_io(*args)
    end

    private

    # Instantiates the parser class with a context tree built from the
    # block and the caller's `self`.
    def build_parser(parser, options, &block)
      # Without a block there is no binding to take the caller's self
      # from; fail with a clear message instead of NoMethodError on nil.
      raise ArgumentError, "a block describing the context tree is required" unless block

      # The evaluated code is the constant string "self"; it only serves
      # to recover the object the block was written in.
      ctx = eval("self", block.binding)
      parser_class_from(parser).new(Saxxy::ContextTree.new(ctx, &block), options)
    end

    # Resolves a Symbol/String to the constant of that (camelized) name
    # looked up through Saxxy::Parsers; any other object is returned
    # as is.
    def parser_class_from(obj)
      case obj
      when Symbol, String
        Saxxy::Parsers.const_get(Saxxy::Helpers.camelize(obj))
      else
        obj
      end
    end
  end

end
@@ -0,0 +1,66 @@
1
+ require "net/http"
2
+ require "uri"
3
+
4
+
5
module Saxxy

  # The Agent is a thin wrapper over Net::HTTP (or the class returned by
  # Net::HTTP::Proxy when proxy options are given), meant for crawling
  # purposes. It supports GET and POST via its get and post methods.
  class Agent
    attr_reader :url, :uri, :proxy, :agent, :response

    # Initializes an agent with optional proxy options.
    #
    # @param url [String, URI] the url the agent will issue requests to.
    #   It can be replaced later through #uri=.
    # @param opts [Hash] options:
    #   - proxy:
    #     - address: The address of the proxy.
    #     - port: The port the proxy will use.
    #     - username: The username if the proxy needs auth.
    #     - password: The password if the proxy needs auth.
    def initialize(url, opts = {})
      @proxy = opts[:proxy] || {}
      @agent = proxy.empty? ? Net::HTTP : Net::HTTP::Proxy(proxy[:address], proxy[:port], proxy[:username], proxy[:password])
      self.uri = url
    end

    # Sets both the uri and the url from the argument, which is either a
    # URI object or a string that must be a valid URL.
    def uri=(url_or_uri)
      @uri = url_or_uri.is_a?(URI) ? url_or_uri : URI(url_or_uri)
      @url = uri.to_s
    end

    # Issues a GET request (through the agent's get_response) to the url
    # given as argument or, when none is given, to the one the agent
    # currently holds.
    # Note: a url different from the agent's also becomes the agent's url.
    #
    # @return [String] the body of the response
    def get(url = nil)
      issue_request(url, :get_response)
    end

    # Issues a POST request (through the agent's post_form) to the url
    # given as argument or, when none is given, to the one the agent
    # currently holds. `opts` is forwarded to post_form as the form data.
    # Note: a url different from the agent's also becomes the agent's url.
    #
    # @return [String] the body of the response
    def post(url = nil, opts = {})
      issue_request(url, :post_form, opts)
    end

    private

    # Replaces the agent's uri, unless url is nil.
    def set_uri_for(url = nil)
      self.uri = url if url
    end

    # Points the agent to the new url when one is given and it differs
    # from the current one, then performs the request and returns the
    # body of the response.
    #
    # The request is always performed. It used to be skipped when the url
    # was unchanged, which made get(current_url) fail on a fresh agent
    # and made post return the previous response without posting.
    def issue_request(new_url_or_uri, request_method, *params)
      set_uri_for(new_url_or_uri) if new_url_or_uri.to_s != url
      @response = agent.public_send(request_method, uri, *params)
      response.body
    end
  end

end
@@ -0,0 +1,27 @@
1
module Saxxy

  # An Array that can notify observers: one optional callback runs after
  # an object is appended with <<, another one after an object is removed
  # with >>. Only these two operators trigger the callbacks.
  class CallbackArray < Array
    # Appends obj, then runs the on_add callback (if any) with it.
    #
    # @return [CallbackArray] self
    def <<(obj)
      super
      @add_callback && @add_callback.call(obj)
      self
    end

    # Deletes obj from the array, then runs the on_remove callback (if
    # any) with it. The callback runs whether or not obj was present.
    #
    # @return [CallbackArray] self
    def >>(obj)
      delete(obj)
      @remove_callback && @remove_callback.call(obj)
      self
    end

    # Registers the block to run after each removal done with >>,
    # replacing any block registered before.
    #
    # @return [CallbackArray] self
    def on_remove(&block)
      tap { @remove_callback = block }
    end

    # Registers the block to run after each append done with <<,
    # replacing any block registered before.
    #
    # @return [CallbackArray] self
    def on_add(&block)
      tap { @add_callback = block }
    end
  end

end
@@ -0,0 +1,13 @@
1
module Saxxy
  # Small, stateless utilities used across Saxxy.
  class Helpers
    class << self
      # Turns an object's string form into a CamelCase name: the string
      # is split on every character that is not a letter or a digit and
      # each piece is capitalized (first character upcased, the rest
      # downcased) before the pieces are glued together.
      #
      # @param obj [#to_s] e.g. :nokogiri or "lib_xml"
      # @return [String] e.g. "Nokogiri" or "LibXml"
      def camelize(obj)
        pieces = obj.to_s.split(/[^a-z0-9]/i)
        pieces.map { |piece| piece.capitalize }.join
      end

      # Returns a new Hash with the same values whose keys have been
      # converted with to_s.
      #
      # @param hash [Hash] the hash whose keys should become strings
      # @return [Hash<String, Object>]
      def stringify_keys(hash)
        hash.each_with_object({}) do |(key, value), result|
          result[key.to_s] = value
        end
      end
    end
  end
end