extraloop 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,36 @@
+ require 'pry'
+ class JsonExtractor < ExtractorBase
+
+   def initialize(*args)
+     @path = args[2] && args[2].is_a?(Array) ? args[2] : nil
+     super(*args)
+   end
+
+   def extract_field(node, record=nil)
+     output = node = node.is_a?(String) ? parse(node) : node
+     output = node.get_in(@path) if @path
+     output = node[@attribute.to_s] if @attribute
+     output = @environment.run(output, record, &@callback) if @callback
+
+     # when no attribute and no callback is provided, try fetching by field name
+     if !@attribute && !@callback
+       output = node[@field_name.to_s] if node[@field_name.to_s]
+     end
+     output
+   end
+
+   def extract_list(input)
+     # TODO: implement more clever stuff here after looking
+     # into possible hash traversal techniques
+
+     input = input.is_a?(String) ? parse(input) : input
+     input = input.get_in(@path) if @path
+
+     @callback && Array(@environment.run(input, &@callback)) || input
+   end
+
+   def parse(input)
+     super(input)
+     @environment.document = (Yajl::Parser.new).parse(input).extend(Utils::DeepFetchable)
+   end
+ end
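A hedged illustration of the fallback branch in extract_field above (the JSON snippet is made up, and `env` is assumed to be an ExtractionEnvironment built elsewhere, e.g. ExtractionEnvironment.new(scraper) as in the specs further down):

    extractor = JsonExtractor.new(:title, env)
    extractor.extract_field('{"title": "Hello", "id": 1}')
    # => "Hello" -- with no attribute, path, or callback given, the field name itself is used as the lookup key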
@@ -0,0 +1,64 @@
+ autoload :Logging, "logging"
+
+ # Decorates a class with an instance of Logging.logger and a convenient
+ # helper method to log messages.
+
+ module Loggable
+   protected
+
+   #
+   # Initializes the incorporated logger object.
+   #
+   # Returns nothing.
+   #
+
+   def init_log!
+     return unless @options[:log]
+
+     @options[:log] ||= {
+       :appenders => [ Logging.appenders.stderr ],
+       :log_level => :info
+     }
+
+     if @options[:log] && @options[:log][:appenders] && @options[:log][:appenders].any?
+       @log = Logging.logger["#{self}"]
+       @log.add_appenders(@options[:log][:appenders])
+       @log.level = @options[:log] && @options[:log][:log_level] || :info
+     end
+   end
+
+   #
+   # Convenience method for logging messages.
+   #
+   # message   - the message content
+   # log_level - the message's log level (one of :info, :debug, :error, :warning; defaults to :info)
+   #
+   # Returns nothing.
+   #
+
+   def log(message, log_level = :info)
+     @log.send(log_level, message) if @log
+   end
+ end
+
+
+ #
+ # Monkey patches ScraperBase.
+ #
+ class ScraperBase
+   include Loggable
+   alias_method :base_initialize, :initialize
+
+   #
+   # Wraps the ScraperBase#initialize method so that Loggable#init_log! runs after it.
+   #
+   # args - The arguments to be passed over to the ScraperBase#initialize method.
+   #
+   # Returns itself.
+   #
+   def initialize(*args)
+     base_initialize(*args)
+     init_log!
+     self
+   end
+ end
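A hedged sketch of turning logging on when constructing a scraper, using the option keys read by init_log! above (the URL is illustrative):

    scraper = ScraperBase.new("http://example.com/archive",
      :log => {
        :appenders => [Logging.appenders.stderr],
        :log_level => :debug
      })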
@@ -0,0 +1,166 @@
+ class ScraperBase
+   include Hookable
+   include Utils::Support
+
+   attr_reader :results, :options
+
+   #
+   # Public: Initializes a web scraper.
+   #
+   # urls      - One or several urls.
+   # options   - Hash of scraper options:
+   #   async     : whether the scraper should issue HTTP requests in series or in parallel (defaults to false).
+   #   log       : logging options (set to false to suppress logging completely).
+   #     appenders : specifies where the log messages should be appended to (defaults to standard error).
+   #     log_level : specifies the log level (defaults to info).
+   # arguments - Hash of arguments to be passed to the Typhoeus HTTP client (optional).
+   #
+   # Returns itself.
+   #
+
+   def initialize(urls, options = {}, arguments = {})
+     @urls = Array(urls)
+     @loop_extractor_args = nil
+     @extractor_args = []
+     @loop = nil
+
+     @request_arguments = arguments
+
+     @options = {
+       :async => false
+     }.merge(options)
+
+     @response_count = 0
+     @queued_count = 0
+
+     @hooks = {}
+     @failed_requests = []
+
+     hydra_options = @options[:hydra] && @options[:hydra][:max_concurrency] || {:max_concurrency => 10}
+     @hydra = Typhoeus::Hydra.new hydra_options
+     self
+   end
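A hedged construction sketch based on the options documented above (URLs and parameter values are illustrative):

    scraper = ScraperBase.new(
      ["http://example.com/page1", "http://example.com/page2"],
      { :async => true },
      { :params => { :format => "json" } }   # forwarded to the Typhoeus HTTP client
    )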
+
+
+   # Public: Sets the scraper extraction loop.
+   #
+   # Delegates to Extractor; an exception is raised if neither a selector, a callback, nor an attribute name is provided.
+   #
+   # selector  - The CSS3 selector identifying the node list over which to iterate (optional).
+   # callback  - A block of code (optional).
+   # attribute - An attribute name (optional).
+   #
+   # Returns itself.
+   #
+
+   def loop_on(*args)
+     @loop_extractor_args = args.insert(0, nil, ExtractionEnvironment.new(self))
+     self
+   end
+
+   # Public: Registers a new extractor to be added to the loop.
+   #
+   # Delegates to Extractor; an exception is raised if neither a selector, a callback, nor an attribute name is provided.
+   #
+   # field_name - The name of the field to be extracted.
+   # selector   - The CSS3 selector identifying the node list over which to iterate (optional).
+   # callback   - A block of code (optional).
+   # attribute  - An attribute name (optional).
+   #
+   # Returns itself.
+   #
+
+   def extract(*args)
+     @extractor_args << args.insert(1, ExtractionEnvironment.new(self))
+     self
+   end
+
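A hedged sketch of the extractor styles described above, assuming `scraper` is a ScraperBase instance (field names and selectors are made up; callbacks are passed as procs, mirroring the DomExtractor specs further down):

    scraper.loop_on("div.entry")
    scraper.extract(:title, "h2.title")                          # CSS selector
    scraper.extract(:url, "a.permalink", :href)                  # selector plus attribute
    scraper.extract(:slug, proc { |node| node.text.downcase })   # callback only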
+
+   #
+   # Public: Runs the main scraping loop.
+   #
+   # Returns itself.
+   #
+   def run
+     @urls.each do |url|
+       issue_request(url)
+
+       # If the scraper is asynchronous, start processing the Hydra HTTP queue
+       # only after the last url has been appended to the queue (see #issue_request).
+       #
+       if @options[:async]
+         if url == @urls.last
+           @hydra.run
+         end
+       else
+         @hydra.run
+       end
+     end
+     self
+   end
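Since #loop_on, #extract, and #run all return the scraper itself, calls can be chained. A hedged end-to-end sketch (URL and selectors are illustrative):

    scraper = ScraperBase.new("http://example.com/archive", :async => false)
    scraper.loop_on("div.entry").
            extract(:title, "h2").
            extract(:url, "a", :href).
            run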
+
+   protected
+
+   def issue_request(url)
+     @request_arguments[:params] = merge_request_parameters(url)
+     url_without_params = url.gsub(/\?.*/, "")
+
+     arguments = {
+       'headers' => [
+         'User-Agent: Mozilla/5.0 (Windows NT 6.1; WOW64; rv:6.0a2) Gecko/20110613 Firefox/6.0a2',
+         'accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'
+       ].join("\n")
+     }
+
+     arguments.merge!(@request_arguments)
+     request = Typhoeus::Request.new(*[url_without_params, arguments])
+
+     request.on_complete do |response|
+       handle_response(response)
+     end
+
+     log("queueing url: #{url}, params #{arguments[:params]}", :debug)
+     @queued_count += 1
+     @hydra.queue(request)
+   end
+
+   def merge_request_parameters(url)
+     url_params = URI::parse(url).extend(Utils::URIAddition).query_hash
+     return @request_arguments[:params] || {} unless url_params && url_params.respond_to?(:merge)
+
+     params = symbolize_keys(@request_arguments[:params] ||= {})
+     url_params.merge(params)
+   end
+
+   def handle_response(response)
+     @response_count += 1
+     @loop = prepare_loop(response)
+     log("response ##{@response_count} of #{@queued_count}, status code: [#{response.code}], URL fragment: ...#{response.effective_url.split('/').last if response.effective_url}")
+     @loop.run
+
+     @environment = @loop.environment
+     run_hook(:data, [@loop.records, response])
+   end
+
+   def prepare_loop(response)
+     format = @options[:format] || detect_format(response.headers_hash.fetch('Content-Type', nil))
+     extractor_class = format == :json ? JsonExtractor : DomExtractor
+     loop_extractor = extractor_class.new(*@loop_extractor_args)
+     extractors = @extractor_args.map { |args| extractor_class.new(*args) }
+     ExtractionLoop.new(loop_extractor, extractors, response.body, @hooks, self)
+   end
+
+   def detect_format(content_type)
+     # TODO: add support for xml/rdf documents
+     if content_type && content_type =~ /json$/
+       :json
+     else
+       :html
+     end
+   end
+
+ end
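A hedged illustration of the Content-Type switch performed by the protected #detect_format method above (header values are made up; the regexp is anchored at the end of the string):

    # detect_format("application/json")          # => :json  -> JsonExtractor is used
    # detect_format("text/html; charset=utf-8")  # => :html  -> DomExtractor is used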
@@ -0,0 +1,75 @@
+ module Utils
+   module ScrapingHelpers
+     #
+     # Generates a proc that iterates over a list of anchors
+     # and collects the value of the specified parameter.
+     #
+     def values_for_param(param)
+       lambda { |nodeList|
+         nodeList.collect { |node|
+           query = URI::parse(node.attr(:href)).query
+           query.split("&").collect { |token| token.split("=") }.
+             detect { |chunks| chunks.first == param.to_s }.last
+         }.uniq
+       }
+     end
+   end
+
+   module URIAddition
+     #
+     # Public:
+     #
+     # Generates a hash representation of a URI's query string.
+     #
+     # Returns a hash mapping the URL query parameters to their respective values.
+     #
+     # NOTE: this is intended as a decorator method for instances of URI::HTTP.
+     #
+     # Example:
+     #
+     #   URI::parse(url).extend(URIAddition).query_hash
+     #
+
+     def query_hash
+       return unless self.query
+       self.query.split("&").reduce({}) do |memo, item|
+         param, value = *item.split("=")
+         memo.merge(param.to_sym => value)
+       end
+     end
+   end
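For instance, under the behaviour sketched above (URL made up), the decorated URI would yield:

    URI::parse("http://example.com/search?q=ruby&page=2").extend(Utils::URIAddition).query_hash
    # => {:q => "ruby", :page => "2"}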
+
+   module DeepFetchable
+     def get_in(path)
+       keys, node = Array(path), self
+
+       keys.each_with_index do |key, index|
+         node = node[key]
+         next_key = keys[index + 1]
+         break unless node
+       end
+
+       node
+     end
+   end
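A hedged sketch of DeepFetchable#get_in walking a nested structure (the data is made up; JsonExtractor relies on this when a path array is given):

    doc = { "response" => { "docs" => [ { "title" => "Hello" } ] } }.extend(Utils::DeepFetchable)
    doc.get_in(["response", "docs", 0, "title"])  # => "Hello"
    doc.get_in(["response", "missing", "title"])  # => nil (traversal stops at the first missing key)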
+
+   module Support
+     def symbolize_keys(hash)
+       hash.reduce({}) { |memo, (k,v)| memo.merge(k => v) }
+     end
+
+     #
+     # Creates instance variables from a hash.
+     #
+     # hash     - A hash of instance variables to be created (keys become the variable names).
+     # defaults - A hash of the attributes' default values (optional).
+     #
+     protected
+     def set_attributes(hash, defaults = {})
+       allowed = defaults.keys
+       hash.each { |key, value| self.instance_variable_set("@#{key}", value) }
+       defaults.each do |key, value|
+         self.instance_variable_set("@#{key}", value) unless self.instance_variable_get("@#{key}")
+       end
+     end
+   end
+ end
data/lib/extraloop.rb ADDED
@@ -0,0 +1,43 @@
+ base_path = File.expand_path(File.dirname(__FILE__) + "/extraloop")
+
+ # Standard library
+ autoload :OpenStruct, "ostruct"
+
+ # Rubygems
+
+ gem "yajl-ruby"
+ gem "nokogiri"
+ gem "typhoeus"
+ gem "logging"
+
+ autoload :Nokogiri, "nokogiri"
+ autoload :Yajl,     "yajl"
+ autoload :Typhoeus, "typhoeus"
+
+ # Extraloop components
+
+ autoload :Utils,                 "#{base_path}/utils"
+ autoload :ExtractionEnvironment, "#{base_path}/extraction_environment"
+ autoload :ExtractorBase,         "#{base_path}/extractor_base"
+ autoload :DomExtractor,          "#{base_path}/dom_extractor"
+ autoload :JsonExtractor,         "#{base_path}/json_extractor"
+ autoload :ExtractionLoop,        "#{base_path}/extraction_loop"
+ autoload :ScraperBase,           "#{base_path}/scraper_base"
+ autoload :Loggable,              "#{base_path}/loggable"
+ autoload :Hookable,              "#{base_path}/hookable"
+ autoload :IterativeScraper,      "#{base_path}/iterative_scraper"
+
+ # Monkey-patch ScraperBase with the Loggable module.
+ #
+ # Referencing the constants here triggers their autoloads; it is the equivalent of adding
+ # extraloop/ to the load path and requiring both ScraperBase and Loggable.
+ #
+ ScraperBase
+ Loggable
+
+ class ExtraLoop
+   VERSION = '0.0.1'
+ end
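A hedged usage sketch: requiring the gem registers the autoloads above, and individual components are then loaded lazily on first constant reference.

    require 'extraloop'
    ExtraLoop::VERSION  # => "0.0.1"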
@@ -0,0 +1,165 @@
+ require 'helpers/spec_helper'
+
+ describe DomExtractor do
+   before(:each) do
+     stub(scraper = Object.new).options
+     stub(scraper).results
+     @env = ExtractionEnvironment.new(scraper)
+     @html ||= <<-EOF
+       <div class="entry">
+         <p><a href="http://example.com">my dummy link</a></p>
+       </div>
+       <div class="entry exclude" />
+       <div class="entry" />
+     EOF
+
+     @xml ||= <<-EOF
+       <?xml version="1.0"?>
+       <StandardDataObject xmlns="myns">
+         <InteractionElements>
+           <TargetCenter>92f4-MPA</TargetCenter>
+           <Trace>7.19879</Trace>
+         </InteractionElements>
+       </StandardDataObject>
+     EOF
+   end
+
+   describe "#new" do
+     subject { DomExtractor.new(:my_field, @env, "p a", :href) }
+     it { subject.field_name.should eql(:my_field) }
+   end
+
+   context "when no attribute is provided" do
+     before do
+       @extractor = DomExtractor.new(:anchor, @env, "p a")
+       @node = @extractor.parse(@html)
+     end
+
+     describe "#extract_field" do
+       subject { @extractor.extract_field(@node) }
+       it { should eql("my dummy link") }
+     end
+   end
+
+   context "when an attribute is provided" do
+     before do
+       @extractor = DomExtractor.new(:anchor, @env, "p a", :href)
+       @node = @extractor.parse(@html)
+     end
+
+     describe "#extract_field" do
+       subject { @extractor.extract_field(@node) }
+       it { should eql("http://example.com") }
+     end
+   end
+
+   context "when a selector and a block are provided" do
+     before do
+       @extractor = DomExtractor.new(:anchor, @env, "p a", proc { |node|
+         node.text.gsub("dummy", "fancy")
+       })
+       @node = @extractor.parse(@html)
+     end
+
+     describe "#extract_field" do
+       subject { @extractor.extract_field(@node) }
+       it { should match(/my fancy/) }
+     end
+   end
+
+   context "when only a block is provided" do
+     before do
+       @extractor = DomExtractor.new(:anchor, @env, proc { |document|
+         document.at_css("p a").text.gsub(/dummy/, 'fancy')
+       })
+       @node = @extractor.parse(@html)
+     end
+
+     describe "#extract_field" do
+       subject { @extractor.extract_field(@node) }
+       it { should match(/my fancy/) }
+     end
+   end
+
+   context "when only an attribute is provided" do
+     before do
+       @extractor = DomExtractor.new(:url, @env, :href)
+       @node = @extractor.parse('<a href="hello-world">Hello</a>').at_css("a")
+     end
+     describe "#extract_field" do
+       subject { @extractor.extract_field(@node) }
+       it { should eql("hello-world") }
+     end
+   end
+
+   context "when nothing but a field name is provided" do
+     before do
+       @extractor = DomExtractor.new(:url, @env)
+       @node = @extractor.parse('<a href="hello-world">Hello</a>').at_css("a")
+     end
+     describe "#extract_field" do
+       subject { @extractor.extract_field(@node) }
+       it { should eql("Hello") }
+     end
+   end
+
+   describe "extract_list" do
+     context "no block provided" do
+       before do
+         @extractor = DomExtractor.new(nil, @env, "div.entry")
+         @node = @extractor.parse(@html)
+       end
+
+       subject { @extractor.extract_list(@node) }
+       it { subject.should have(3).items }
+     end
+
+     context "block provided" do
+       before do
+         @extractor = DomExtractor.new(nil, @env, "div.entry", lambda { |nodeList|
+           nodeList.reject { |node| node.attr(:class).split(" ").include?('exclude') }
+         })
+       end
+
+       subject { @extractor.extract_list(@html) }
+       it { subject.should have(2).items }
+     end
+   end
+
+   context "xml input" do
+     describe "#parse" do
+       before do
+         @extractor = DomExtractor.new(nil, @env, "entry")
+       end
+
+       subject { @extractor.parse(@xml) }
+       it { should be_an_instance_of(Nokogiri::XML::Document) }
+     end
+   end
+
+   context "html input" do
+     describe "#parse" do
+       before do
+         @extractor = DomExtractor.new(nil, @env, "entry")
+       end
+
+       subject { @extractor.parse(@html) }
+       it { should be_an_instance_of(Nokogiri::HTML::Document) }
+     end
+   end
+
+   context "non-string input" do
+     describe "#parse" do
+       before do
+         @extractor = DomExtractor.new(nil, @env, "entry")
+       end
+
+       it "should raise an exception" do
+         expect { @extractor.parse(Nokogiri::HTML(@html)) }.to raise_exception(ExtractorBase::Exceptions::ExtractorParseError)
+       end
+     end
+   end
+ end
@@ -0,0 +1,76 @@
+ require 'helpers/spec_helper'
+
+ describe ExtractionLoop do
+
+   before(:each) do
+     @fake_scraper = stub!.options
+     stub(@fake_scraper).results
+   end
+
+   describe "#new" do
+     before do
+       @mock_loop = Object.new
+       stub(@mock_loop).parse {}
+     end
+
+     subject { ExtractionLoop.new(@mock_loop) }
+
+     it "should allow read/write access to public attributes" do
+       { :extractors => [:fake, :fake],
+         :document   => nil,
+         :hooks      => {}
+       }.each do |k, v|
+         subject.send("#{k}=", v)
+         subject.send(k).should eql(v)
+       end
+     end
+   end
+
+   describe "run" do
+     before(:each) do
+       @extractors = [:a, :b].map do |field_name|
+         object = Object.new
+         stub(object).extract_field { |node, record| node[field_name] }
+         stub(object).field_name { field_name }
+         object
+       end
+
+       @loop_extractor = Object.new
+
+       stub(@loop_extractor).parse { |input| Nokogiri::HTML("<html><body>Hello test!</body></html>") }
+
+       stub(@loop_extractor).extract_list { |document|
+         # list of fake DOM elements
+         (0..9).to_a.map { |n| { :a => n, :b => n * n } }
+       }
+
+       before, before_extract, after_extract, after = *(1..4).to_a.map { proc {} }
+       hooks = { before: [before], before_extract: [before_extract], after_extract: [after_extract], after: [after] }
+
+       any_instance_of(ExtractionEnvironment) do |env|
+         mock(env).run.with_any_args.times(20 + 2)
+       end
+
+       @extraction_loop = ExtractionLoop.new(@loop_extractor, @extractors, "fake document", hooks, @fake_scraper).run
+     end
+
+     subject { @extraction_loop.run }
+
+     it "should produce 10 records" do
+       @extraction_loop.records.size.should eql(10)
+     end
+
+     it "should run extractors" do
+       @extraction_loop.records.all? { |record| record.a && record.b && record.b == record.a ** 2 }
+     end
+
+     it "should convert extracted records into OpenStruct instances" do
+       @extraction_loop.records.all? { |record| record.is_a?(OpenStruct) }
+     end
+
+   end
+ end