saxxy 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (44) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +22 -0
  3. data/.travis.yml +5 -0
  4. data/Gemfile +13 -0
  5. data/LICENSE +22 -0
  6. data/README.md +117 -0
  7. data/Rakefile +12 -0
  8. data/lib/saxxy.rb +2 -0
  9. data/lib/saxxy/activatable.rb +160 -0
  10. data/lib/saxxy/callbacks/libxml.rb +26 -0
  11. data/lib/saxxy/callbacks/nokogiri.rb +30 -0
  12. data/lib/saxxy/callbacks/ox.rb +66 -0
  13. data/lib/saxxy/callbacks/sax.rb +86 -0
  14. data/lib/saxxy/context.rb +88 -0
  15. data/lib/saxxy/context_tree.rb +85 -0
  16. data/lib/saxxy/event.rb +83 -0
  17. data/lib/saxxy/event_registry.rb +122 -0
  18. data/lib/saxxy/node_action.rb +59 -0
  19. data/lib/saxxy/node_rule.rb +90 -0
  20. data/lib/saxxy/parsers/base.rb +28 -0
  21. data/lib/saxxy/parsers/libxml.rb +52 -0
  22. data/lib/saxxy/parsers/nokogiri.rb +28 -0
  23. data/lib/saxxy/parsers/ox.rb +30 -0
  24. data/lib/saxxy/service.rb +47 -0
  25. data/lib/saxxy/utils/agent.rb +66 -0
  26. data/lib/saxxy/utils/callback_array.rb +27 -0
  27. data/lib/saxxy/utils/helpers.rb +13 -0
  28. data/lib/saxxy/version.rb +3 -0
  29. data/saxxy.gemspec +21 -0
  30. data/spec/saxxy/activatable_spec.rb +344 -0
  31. data/spec/saxxy/callbacks/sax_spec.rb +456 -0
  32. data/spec/saxxy/context_spec.rb +51 -0
  33. data/spec/saxxy/context_tree_spec.rb +68 -0
  34. data/spec/saxxy/event_registry_spec.rb +137 -0
  35. data/spec/saxxy/event_spec.rb +49 -0
  36. data/spec/saxxy/node_action_spec.rb +46 -0
  37. data/spec/saxxy/node_rule_spec.rb +99 -0
  38. data/spec/saxxy/parsers/libxml_spec.rb +104 -0
  39. data/spec/saxxy/parsers/nokogiri_spec.rb +200 -0
  40. data/spec/saxxy/parsers/ox_spec.rb +175 -0
  41. data/spec/saxxy/utils/agent_spec.rb +63 -0
  42. data/spec/spec_helper.rb +28 -0
  43. data/spec/support/agent_macros.rb +24 -0
  44. metadata +155 -0
@@ -0,0 +1,59 @@
1
module Saxxy

  ##
  # @author rubymaniac
  #
  # A NodeAction couples an activation rule with a block of code.
  # The rule answers "does this action apply to the node?" and the
  # block is what gets evaluated when the action is called.
  #
  # @!attribute [r] activation_rule
  #   @return [NodeRule] the rule that {#matches} delegates to
  #
  # @!attribute [r] action
  #   @return [Proc] the block evaluated by {#call}
  ##
  class NodeAction
    attr_reader :activation_rule, :action

    # Builds an action from a rule, an evaluation context and a block.
    #
    # @param activation_rule [NodeRule] rule used to decide whether this
    #   action applies to a node
    #
    # @param context [Object] object on which the block is evaluated;
    #   defaults to the NodeAction instance itself
    #
    # @param block [Proc] code to evaluate on `context`; when omitted an
    #   identity lambda (returns its single argument) is used
    #
    def initialize(activation_rule, context = self, &block)
      @activation_rule = activation_rule
      @ctx = context
      @action = block || lambda { |e| e }
    end

    # Asks the activation rule whether it matches the given node.
    #
    # @param element_name [String] the name of a node
    #
    # @param attributes [Hash<String, String>] the attributes of a node
    #
    # @return [Boolean] whatever the activation rule's `matches` returns
    #
    def matches(element_name, attributes)
      @activation_rule.matches(element_name, attributes)
    end

    # Evaluates the stored block on the context.
    #
    # The collected arguments are handed to the block as ONE array. A
    # plain block with several parameters destructures that array; the
    # default identity lambda simply returns it.
    #
    # @param args [Array] arguments forwarded (as a single array) to the block
    #
    # @return [Object] the value produced by the block
    #
    def call(*args)
      @ctx.instance_exec(args, &@action)
    end
  end

end
@@ -0,0 +1,90 @@
1
module Saxxy

  ##
  # @author rubymaniac
  #
  # NodeRule describes a rule that is tested against an XML node to
  # decide whether the node satisfies it.
  #
  # A NodeRule has two parts. The `element` part constrains the node's
  # name: a String must be strictly equal to the name, a Regexp must
  # match it.
  #
  # The `attributes` part constrains the node's attributes. It is a set
  # of key-value pairs where the key is the attribute to check and the
  # value is what must hold for that attribute (String equality, Regexp
  # match, or nil meaning "the attribute must be absent").
  #
  # @!attribute [r] element
  #   @return [String|Regexp] node's name rule
  #
  # @!attribute [r] attributes
  #   @return [Hash<String, String|Regexp>] node's attributes rule
  ##
  class NodeRule
    attr_reader :element, :attributes

    # Initializes a NodeRule with an `element` part and an `attributes` part.
    #
    # @param element [String|Regexp] what should hold for the node name
    # @param attributes [Hash<String, String|Regexp>]
    #   what should hold for node's attributes (keys are stringified)
    #
    def initialize(element, attributes = {})
      @element = element
      @attributes = Saxxy::Helpers.stringify_keys(attributes)
    end

    # Checks whether this NodeRule matches a node.
    #
    # @param element_name [String] node's name
    # @param attrs [Hash<String, String>, nil] node's attributes
    #
    # @return [Boolean] whether this NodeRule matches the node
    #
    def matches(element_name, attrs = {})
      match_element_name(element_name) && match_attributes(attrs)
    end

    # Checks whether this NodeRule is equal to another.
    #
    # @param rule [NodeRule] the other NodeRule
    #
    # @return [Boolean] whether both the element and attributes parts are equal
    #
    def equals(rule)
      element == rule.element && attributes == rule.attributes
    end

    # Checks whether this NodeRule matches only the name of a node.
    #
    # @param element_name [String] node's name
    #
    # @return [Boolean] whether this NodeRule matches node's name
    #
    def match_element_name(element_name)
      match(element, element_name)
    end

    # Checks whether this NodeRule matches only the attributes of a node.
    #
    # Every rule attribute must be satisfied: a nil expectation requires
    # the attribute to be missing, any other expectation requires the
    # attribute to be present and to match. Attributes of the node that
    # the rule does not mention are ignored.
    #
    # @param attrs [Hash<String, String>, nil] node's attributes; nil is
    #   treated as "no attributes"
    #
    # @return [Boolean] whether this NodeRule matches node's attributes
    #
    def match_attributes(attrs)
      # A node without attributes may be reported as nil; treat it as empty
      # instead of failing inside stringify_keys.
      attrs = Saxxy::Helpers.stringify_keys(attrs || {})
      attributes.all? do |key, expected|
        actual = attrs[key]
        actual.nil? ? expected.nil? : match(expected, actual)
      end
    end

    private

    # Regexp expectations are matched, anything else is compared with ==.
    def match(obj, value)
      obj.is_a?(Regexp) ? !obj.match(value).nil? : obj == value
    end
  end

end
@@ -0,0 +1,28 @@
1
module Saxxy
  module Parsers

    # Raised by Base when a parsing entry point has not been overridden.
    class NotImplemented < StandardError; end

    # Common superclass for the concrete parsers. It stores the context
    # tree and the options, and declares the three parsing entry points
    # that subclasses are expected to override.
    class Base
      attr_reader :context_tree, :options

      # @param context_tree [Object] the context tree the parser works with
      # @param options [Hash] parser specific options
      def initialize(context_tree, options = {})
        @context_tree = context_tree
        @options = options
      end

      # @param path_to_file [String] path of the file to parse
      # @raise [NotImplemented] always, unless overridden
      def parse_file(path_to_file)
        raise NotImplemented, "#{self.class}##{__method__} is not implemented"
      end

      # @param string [String] the markup to parse
      # @raise [NotImplemented] always, unless overridden
      def parse_string(string)
        raise NotImplemented, "#{self.class}##{__method__} is not implemented"
      end

      # @param io [IO] the stream to parse
      # @raise [NotImplemented] always, unless overridden
      def parse_io(io)
        raise NotImplemented, "#{self.class}##{__method__} is not implemented"
      end
    end

  end
end
@@ -0,0 +1,52 @@
1
+ require "saxxy/parsers/base"
2
+ require "saxxy/callbacks/libxml"
3
+
4
+
5
module Saxxy
  module Parsers

    # Parser that drives LibXML's SAX parser with the Saxxy callbacks
    # built from the context tree's root.
    class Libxml < Base
      # Stores the LibXML parser option flags selected by `options[:mode]`:
      # `:html` (or no mode) recovers and silences errors and warnings,
      # `:xml` only recovers; both disable network access. Any other mode
      # leaves the flags unset (nil).
      #
      # @param context_tree [Object] the context tree the parser works with
      # @param options [Hash] parser options; only `:mode` is read here
      def initialize(context_tree, options = {})
        super
        @__internal_context_options = context_flags_for(options[:mode])
      end

      # @param string [String] the markup to parse
      # @param encoding [Object] a LibXML encoding constant
      def parse_string(string, encoding = LibXML::XML::Encoding::UTF_8)
        sax_parse(:string, string, encoding)
      end

      # @param path_to_file [String] path of the file to parse
      # @param encoding [Object] a LibXML encoding constant
      def parse_file(path_to_file, encoding = LibXML::XML::Encoding::UTF_8)
        sax_parse(:file, path_to_file, encoding)
      end

      # @param io [IO] the stream to parse
      # @param encoding [Object] a LibXML encoding constant
      def parse_io(io, encoding = LibXML::XML::Encoding::UTF_8)
        sax_parse(:io, io, encoding)
      end

      private

      # Maps a mode onto the OR-ed LibXML option flags (nil when unknown).
      def context_flags_for(mode)
        case mode
        when :html, nil
          [
            LibXML::XML::Parser::Options::RECOVER,
            LibXML::XML::Parser::Options::NOERROR,
            LibXML::XML::Parser::Options::NOWARNING,
            LibXML::XML::Parser::Options::NONET
          ].inject(:|)
        when :xml
          [
            LibXML::XML::Parser::Options::RECOVER,
            LibXML::XML::Parser::Options::NONET
          ].inject(:|)
        end
      end

      # Builds a parser context through the factory named by `source_kind`
      # (:string, :file or :io), configures it, then runs a SAX parse with
      # the Saxxy callbacks attached.
      def sax_parse(source_kind, source, encoding)
        context = LibXML::XML::Parser::Context.public_send(source_kind, source)
        context.options = @__internal_context_options
        context.encoding = encoding
        context.recovery = true

        sax = LibXML::XML::SaxParser.new(context)
        sax.callbacks = Saxxy::Callbacks::Libxml.new(context_tree.root)
        sax.parse
      end
    end

  end
end
@@ -0,0 +1,28 @@
1
+ require "saxxy/parsers/base"
2
+ require "saxxy/callbacks/nokogiri"
3
+
4
+
5
module Saxxy
  module Parsers

    # Parser that feeds Nokogiri's HTML SAX parser with the Saxxy
    # callbacks built from the context tree's root. Every call creates a
    # fresh Nokogiri parser; any block given is forwarded to Nokogiri.
    class Nokogiri < Base
      # @param string [String] the markup to parse
      # @param encoding [String] encoding name handed to Nokogiri
      def parse_string(string, encoding = 'UTF-8', &block)
        sax = new_parser
        sax.parse_memory(string, encoding, &block)
      end

      # @param path_to_file [String] path of the file to parse
      # @param encoding [String] encoding name handed to Nokogiri
      def parse_file(path_to_file, encoding = 'UTF-8', &block)
        sax = new_parser
        sax.parse_file(path_to_file, encoding, &block)
      end

      # @param io [IO] the stream to parse
      # @param encoding [String] encoding name handed to Nokogiri
      def parse_io(io, encoding = 'UTF-8', &block)
        sax = new_parser
        sax.parse_io(io, encoding, &block)
      end

      private

      # Wraps the context tree's root in the Saxxy callbacks document and
      # hands it to a new Nokogiri HTML SAX parser.
      def new_parser
        handler = Saxxy::Callbacks::Nokogiri.new(context_tree.root)
        ::Nokogiri::HTML::SAX::Parser.new(handler)
      end
    end

  end
end
@@ -0,0 +1,30 @@
1
+ require "saxxy/parsers/base"
2
+ require "saxxy/callbacks/ox"
3
+
4
+
5
module Saxxy
  module Parsers

    # Parser that runs Ox's SAX parser with the Saxxy callbacks built
    # from the context tree's root. Every source is parsed as an IO.
    class Ox < Base
      # @param string [String] the markup to parse (wrapped in a StringIO)
      # @param encoding [String, Encoding, nil] encoding set on the IO when given
      def parse_string(string, encoding = nil)
        parse(StringIO.new(string), encoding)
      end

      # @param path_to_file [String] path of the file to parse
      # @param encoding [String, Encoding, nil] encoding set on the IO when given
      def parse_file(path_to_file, encoding = nil)
        # Block form so the file handle is closed once parsing finishes or
        # raises, instead of being left open until garbage collection.
        File.open(path_to_file) { |file| parse(file, encoding) }
      end

      # @param io [IO] the stream to parse; its encoding is changed in place
      #   when `encoding` is given
      # @param encoding [String, Encoding, nil] encoding set on the IO when given
      def parse_io(io, encoding = nil)
        parse(io, encoding)
      end

      private

      # Runs Ox.sax_parse on `io`. The parser options default to
      # `smart: true` and are overridden by the options given to the
      # constructor.
      def parse(io, encoding)
        io.set_encoding(encoding) if encoding
        callbacks = Saxxy::Callbacks::Ox.new(context_tree.root)
        ::Ox.sax_parse(callbacks, io, {smart: true}.merge(options))
      end
    end

  end
end
+ end
@@ -0,0 +1,47 @@
1
+ require "saxxy/context_tree"
2
+
3
+
4
module Saxxy

  module Parsers
    autoload :Nokogiri, "saxxy/parsers/nokogiri"
    autoload :Ox, "saxxy/parsers/ox"
    autoload :Libxml, "saxxy/parsers/libxml"
  end

  # Entry point that builds a parser around a context tree described by
  # a block, and forwards the parsing calls to that parser.
  class Service
    attr_reader :parser

    # @param parser [Symbol, String, Class] either the name of a parser
    #   under Saxxy::Parsers (e.g. `:nokogiri`) or a parser class
    # @param options [Hash] options handed to the parser's constructor
    # @param block [Proc] block handed to Saxxy::ContextTree.new; the
    #   object that is `self` where the block was written becomes the
    #   context tree's context
    #
    # @raise [ArgumentError] when no block is given
    def initialize(parser, options = {}, &block)
      # Without a block there is no binding to take the context from; fail
      # with a clear message rather than a NoMethodError on nil.
      raise ArgumentError, "Saxxy::Service requires a block" unless block

      @parser = build_parser(parser, options, &block)
    end

    # Forwards all arguments to the parser's `parse_file`.
    def parse_file(*args)
      @parser.parse_file(*args)
    end

    # Forwards all arguments to the parser's `parse_string`.
    def parse_string(*args)
      @parser.parse_string(*args)
    end

    # Forwards all arguments to the parser's `parse_io`.
    def parse_io(*args)
      @parser.parse_io(*args)
    end

    private

    # Instantiates the parser class with a context tree built from the
    # block and the block's own `self`.
    def build_parser(parser, options, &block)
      # Fixed string evaluated in the block's binding: yields the object
      # that is `self` at the place the block was defined.
      ctx = eval("self", block.binding)
      parser_class_from(parser).new(Saxxy::ContextTree.new(ctx, &block), options)
    end

    # Resolves a Symbol/String to a constant under Saxxy::Parsers (after
    # camelizing it); anything else is returned untouched.
    def parser_class_from(obj)
      case obj
      when Symbol, String
        Saxxy::Parsers.const_get(Saxxy::Helpers.camelize(obj))
      else
        obj
      end
    end
  end

end
@@ -0,0 +1,66 @@
1
+ require "net/http"
2
+ require "uri"
3
+
4
+
5
module Saxxy

  # The Agent is a thin wrapper over Net::HTTP (or Net::HTTP::Proxy when
  # proxy options are given) meant for crawling. It supports GET and POST
  # through its `get` and `post` methods and keeps the last response.
  class Agent
    attr_reader :url, :uri, :proxy, :agent, :response

    # Initializes an agent with optional proxy options.
    #
    # @param url [String, URI] the url the agent uses for issuing
    #   requests. It can be changed later through `uri=`.
    # @param opts [Hash] options
    #   - proxy:
    #     - address: The address of the proxy.
    #     - port: The port the proxy will use.
    #     - username: The username if the proxy needs auth.
    #     - password: The password if the proxy needs auth.
    def initialize(url, opts = {})
      @proxy = opts[:proxy] || {}
      @agent = proxy.empty? ? Net::HTTP : Net::HTTP::Proxy(proxy[:address], proxy[:port], proxy[:username], proxy[:password])
      self.uri = url
    end

    # Sets both `uri` and `url` from the argument, which is either a
    # string holding a valid URL or a URI object.
    def uri=(url_or_uri)
      @uri = url_or_uri.is_a?(URI) ? url_or_uri : URI(url_or_uri)
      @url = uri.to_s
    end

    # Issues a GET request to the given url, or to the url the agent
    # currently holds when none is given.
    #
    # When the given url equals the agent's current url and a response is
    # already held, no new request is made and the held response's body
    # is returned. A different url also becomes the agent's url.
    #
    # @return [String] the body of the response
    def get(url = nil)
      issue_request(url, :get_response)
    end

    # Issues a POST request through the underlying agent's `post_form`,
    # forwarding `opts` to it. The url handling is the same as in `get`.
    #
    # @return [String] the body of the response
    def post(url = nil, opts = {})
      issue_request(url, :post_form, opts)
    end

    private

    # Updates the agent's uri/url only when a url is actually given.
    def set_uri_for(url = nil)
      self.uri = url if url
    end

    # args is [url_or_nil, http_method_name, *extra_arguments].
    def issue_request(*args)
      new_url_or_uri = args.shift
      # The request is skipped only when the same url is asked for again
      # AND a response is already held; without the nil check the very
      # first request for the constructor's url was never sent and
      # `response.body` failed on nil.
      if response.nil? || new_url_or_uri.to_s != url
        set_uri_for(new_url_or_uri)
        @response = agent.public_send(args.shift, uri, *args)
      end
      response.body
    end
  end

end
@@ -0,0 +1,27 @@
1
module Saxxy

  # An Array that can notify listeners: one block may be registered to
  # run after `<<` and one to run after `>>`. All four methods return the
  # array itself so calls can be chained.
  class CallbackArray < Array
    # Appends `obj`, then runs the add callback (if any) with it.
    #
    # @return [CallbackArray] self
    def <<(obj)
      super(obj)
      @add_callback.call(obj) unless @add_callback.nil?
      self
    end

    # Deletes `obj` from the array, then runs the remove callback (if
    # any) with it.
    #
    # @return [CallbackArray] self
    def >>(obj)
      delete(obj)
      @remove_callback.call(obj) unless @remove_callback.nil?
      self
    end

    # Registers the block run by `>>`, replacing any previous one.
    #
    # @return [CallbackArray] self
    def on_remove(&listener)
      @remove_callback = listener
      self
    end

    # Registers the block run by `<<`, replacing any previous one.
    #
    # @return [CallbackArray] self
    def on_add(&listener)
      @add_callback = listener
      self
    end
  end

end
@@ -0,0 +1,13 @@
1
module Saxxy
  # Small stateless helpers, exposed as class methods.
  class Helpers

    # Turns a name into CamelCase: the string form of `obj` is split on
    # every character that is not a letter or digit, each piece is
    # capitalized (first letter upper case, the rest lower case) and the
    # pieces are joined.
    #
    # @param obj [#to_s] e.g. `:my_parser`
    # @return [String] e.g. `"MyParser"`
    def self.camelize(obj)
      obj.to_s.split(/[^a-z0-9]/i).map(&:capitalize).join
    end

    # Returns a new Hash with every key converted through `to_s`; the
    # values are kept untouched.
    #
    # @param hash [Hash, nil] the hash to convert; nil is treated as an
    #   empty hash instead of raising
    # @return [Hash<String, Object>]
    def self.stringify_keys(hash)
      return {} if hash.nil?

      hash.each_with_object({}) do |(key, value), stringified|
        stringified[key.to_s] = value
      end
    end

  end
end
+ end