extraloop 0.0.1

@@ -0,0 +1,36 @@
+ require 'pry'
+ class JsonExtractor < ExtractorBase
+
+   def initialize(*args)
+     @path = args[2] && args[2].is_a?(Array) ? args[2] : nil
+     super(*args)
+   end
+
+   def extract_field(node, record=nil)
+     output = node = node.is_a?(String) ? parse(node) : node
+     output = node.get_in(@path) if @path
+     output = node[@attribute.to_s] if @attribute
+     output = @environment.run(output, record, &@callback) if @callback
+
+     # when neither an attribute nor a callback is provided, try fetching by field name
+     if !@attribute && !@callback
+       output = node[@field_name.to_s] if node[@field_name.to_s]
+     end
+     output
+   end
+
+   def extract_list(input)
+     #TODO: implement more clever stuff here after looking
+     # into possible hash traversal techniques
+
+     input = input.is_a?(String) ? parse(input) : input
+     input = input.get_in(@path) if @path
+
+     @callback && Array(@environment.run(input, &@callback)) || input
+   end
+
+   def parse(input)
+     super(input)
+     @environment.document = (Yajl::Parser.new).parse(input).extend(Utils::DeepFetchable)
+   end
+ end
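
For orientation, here is a hedged usage sketch of the extractor above (the field name, environment, and JSON payload are illustrative, not taken from the gem's docs): when the third constructor argument is an Array it is stored as @path and resolved against the parsed document through Utils::DeepFetchable#get_in.

    require "extraloop"

    # Hypothetical example: `scraper` is assumed to be an existing ScraperBase instance.
    env       = ExtractionEnvironment.new(scraper)
    extractor = JsonExtractor.new(:count, env, ["meta", "count"])

    # Parses the string with Yajl, then walks the ["meta", "count"] path.
    extractor.extract_field('{"meta": {"count": 42}}') # => 42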
@@ -0,0 +1,64 @@
+ autoload :Logging, "logging"
+
+ # Decorates a class with an instance of Logging.logger and a convenient
+ # helper method to log messages.
+
+ module Loggable
+   protected
+
+   #
+   # Initializes the incorporated logger object.
+   #
+   # Returns nothing.
+   #
+
+   def init_log!
+     return unless @options[:log]
+
+     @options[:log] ||= {
+       :appenders => [ Logging.appenders.stderr ],
+       :log_level => :info
+     }
+
+     if @options[:log] && @options[:log][:appenders] && @options[:log][:appenders].any?
+       @log = Logging.logger["#{self}"]
+       @log.add_appenders(@options[:log][:appenders])
+       @log.level = @options[:log] && @options[:log][:log_level] || :info
+     end
+   end
+
+   #
+   # Convenience method for logging messages.
+   #
+   # message   - the message content
+   # log_level - the message's log level (one of :info, :debug, :error, :warning; defaults to :info)
+   #
+   # Returns nothing.
+   #
+
+   def log(message, log_level = :info)
+     @log.send(log_level, message) if @log
+   end
+ end
+
+
+ #
+ # Monkey patches ScraperBase.
+ #
+ class ScraperBase
+   include Loggable
+   alias_method :base_initialize, :initialize
+
+   #
+   # Wraps the original ScraperBase#initialize method and initializes the logger.
+   #
+   # args - The arguments to be passed over to the original ScraperBase#initialize method.
+   #
+   # Returns itself.
+   #
+   def initialize(*args)
+     base_initialize(*args)
+     init_log!
+     self
+   end
+ end
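
Based on how init_log! reads @options[:log] above, a minimal sketch of enabling logging when constructing a scraper (the URL and log level are placeholders; this assumes loggable.rb has been loaded, as extraloop.rb ensures, so that ScraperBase#initialize calls init_log!):

    require "extraloop"

    # Passing :log => false instead would skip logger initialization entirely.
    scraper = ScraperBase.new("http://example.com/items", :log => {
      :appenders => [Logging.appenders.stderr],
      :log_level => :debug
    })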
@@ -0,0 +1,166 @@
+ class ScraperBase
+   include Hookable
+   include Utils::Support
+
+   attr_reader :results, :options
+
+   #
+   # Public: Initializes a web scraper.
+   #
+   # urls      - One or several urls.
+   # options   - Hash of scraper options
+   #   async     : Whether the scraper should issue HTTP requests in series or in parallel.
+   #   log       : logging options (set to false to suppress logging completely).
+   #     appenders : specifies where the log messages should be appended to (defaults to standard error).
+   #     log_level : specifies the log level (defaults to :info).
+   # arguments - Hash of arguments to be passed to the Typhoeus HTTP client (optional).
+   #
+   #
+   #
+   # Returns itself.
+   #
+
+   def initialize(urls, options = {}, arguments = {})
+     @urls = Array(urls)
+     @loop_extractor_args = nil
+     @extractor_args = []
+     @loop = nil
+
+     @request_arguments = arguments
+
+     @options = {
+       :async => false
+     }.merge(options)
+
+
+     @response_count = 0
+     @queued_count = 0
+
+     @hooks = {}
+     @failed_requests = []
+
+     hydra_options = {:max_concurrency => (@options[:hydra] && @options[:hydra][:max_concurrency]) || 10}
+     @hydra = Typhoeus::Hydra.new hydra_options
+     self
+   end
+
+
+   # Public: Sets the scraper extraction loop.
+   #
+   # Delegates to Extractor; raises an exception unless at least a selector, a block, or an attribute name is provided.
+   #
+   #
+   # selector  - The CSS3 selector identifying the node list over which to iterate (optional).
+   # callback  - A block of code (optional).
+   # attribute - An attribute name (optional).
+   #
+   # Returns itself.
+   #
+
+   def loop_on(*args)
+     @loop_extractor_args = args.insert(0, nil, ExtractionEnvironment.new(self))
+     self
+   end
+
+   # Public: Registers a new extractor to be added to the loop.
+   #
+   # Delegates to Extractor; raises an exception unless at least a selector, a block, or an attribute name is provided.
+   #
+   # selector  - The CSS3 selector identifying the node list over which to iterate (optional).
+   # callback  - A block of code (optional).
+   # attribute - An attribute name (optional).
+   #
+   # Returns itself.
+   #
+   #
+
+   def extract(*args)
+     @extractor_args << args.insert(1, ExtractionEnvironment.new(self))
+     self
+   end
+
+   #
+   # Public: Runs the main scraping loop.
+   #
+   # Returns nothing
+   #
+   def run
+     @urls.each do |url|
+       issue_request(url)
+
+       # if the scraper is asynchronous, start processing the Hydra HTTP queue
+       # only after the last url has been appended to the queue (see #issue_request).
+       #
+       if @options[:async]
+         if url == @urls.last
+           @hydra.run
+         end
+       else
+         @hydra.run
+       end
+     end
+     self
+   end
+
+   protected
+
+   def issue_request(url)
+
+     @request_arguments[:params] = merge_request_parameters(url)
+     url_without_params = url.gsub(/\?.*/,"")
+
+     arguments = {
+       'headers' => [
+         'User-Agent: Mozilla/5.0 (Windows NT 6.1; WOW64; rv:6.0a2) Gecko/20110613 Firefox/6.0a2',
+         'accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'
+       ].join("\n")
+     }
+
+     arguments.merge!(@request_arguments)
+     request = Typhoeus::Request.new(*[url_without_params, arguments])
+
+     request.on_complete do |response|
+       handle_response(response)
+     end
+
+     log("queueing url: #{url}, params #{arguments[:params]}", :debug)
+     @queued_count += 1
+     @hydra.queue(request)
+   end
+
+   def merge_request_parameters(url)
+     url_params = URI::parse(url).extend(Utils::URIAddition).query_hash
+     return @request_arguments[:params] || {} unless url_params && url_params.respond_to?(:merge)
+
+     params = symbolize_keys(@request_arguments[:params] ||= {})
+     url_params.merge(params)
+   end
+
+   def handle_response(response)
+     @response_count += 1
+     @loop = prepare_loop(response)
+     log("response ##{@response_count} of #{@queued_count}, status code: [#{response.code}], URL fragment: ...#{response.effective_url.split('/').last if response.effective_url}")
+     @loop.run
+
+     @environment = @loop.environment
+     run_hook(:data, [@loop.records, response])
+   end
+
+   def prepare_loop(response)
+     format = @options[:format] || detect_format(response.headers_hash.fetch('Content-Type', nil))
+     extractor_class = format == :json ? JsonExtractor : DomExtractor
+     loop_extractor = extractor_class.new(*@loop_extractor_args)
+     extractors = @extractor_args.map { |args| extractor_class.new(*args) }
+     ExtractionLoop.new(loop_extractor, extractors, response.body, @hooks, self)
+   end
+
+   def detect_format(content_type)
+     #TODO: add support for xml/rdf documents
+     if content_type && content_type =~ /json$/
+       :json
+     else
+       :html
+     end
+   end
+
+ end
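
Putting the public API above together, a hypothetical end-to-end example (the URL, selectors, and field names are made up; the #on hook registration is an assumption, since Hookable's interface is not shown in this section):

    require "extraloop"

    scraper = ScraperBase.new("http://example.com/archive?page=1", :async => false)

    scraper.loop_on("div.entry")                          # node list to iterate over
    scraper.extract(:title, "h2")                         # one extractor per record field
    scraper.extract(:url, "a", :href)
    scraper.on(:data) { |records, response| p records }   # assumes Hookable registers hooks via #on
    scraper.run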
@@ -0,0 +1,75 @@
+ module Utils
+   module ScrapingHelpers
+     #
+     # Generates a proc that iterates over a list of anchors
+     # and collects the value of the specified parameter
+     #
+     def values_for_param(param)
+       lambda { |nodeList|
+         nodeList.collect {|node|
+           query = URI::parse(node.attr(:href)).query
+           query.split("&").collect { |token| token.split("=") }.
+             detect{ |chunks| chunks.first == param.to_s }.last
+         }.uniq
+       }
+     end
+   end
+
+   module URIAddition
+     #
+     # Public
+     #
+     # Generates a hash representation of a uri's query string.
+     #
+     # Returns a hash mapping the URL query parameters to their respective values
+     #
+     # NOTE: this is intended as a decorator method for instances of URI::HTTP.
+     #
+     # examples:
+     #
+     #   URI::parse(url).extend(URIAddition).query_hash
+     #
+
+     def query_hash
+       return unless self.query
+       self.query.split("&").reduce({}) do |memo, item|
+         param, value = *item.split("=")
+         memo.merge(param.to_sym => value)
+       end
+     end
+   end
+
+   module DeepFetchable
+     def get_in(path)
+       keys, node = Array(path), self
+
+       keys.each_with_index do |key, index|
+         node = node[key]
+         next_key = keys[index + 1]
+         break unless node
+       end
+
+       node
+     end
+   end
+
+   module Support
+     def symbolize_keys(hash)
+       hash.reduce({}) { |memo, (k,v)| memo.merge(k.to_sym => v) }
+     end
+     #
+     # Creates instance variables from a hash.
+     #
+     # hash     - A hash of instance variables to be created.
+     # defaults - A hash of the attributes' default values (optional).
+     #
+     protected
+     def set_attributes(hash, defaults={})
+       allowed = defaults.keys
+       hash.each { |key, value| self.instance_variable_set("@#{key}", value)}
+       defaults.each do |key, value|
+         self.instance_variable_set("@#{key}", value) unless self.instance_variable_get("@#{key}")
+       end
+     end
+   end
+ end
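
To illustrate the two decorator modules above, a short sketch (the URL and hash contents are made up):

    require "uri"
    require "extraloop"

    # URIAddition#query_hash: read a URL's query string as a hash.
    uri = URI::parse("http://example.com/search?q=ruby&page=2").extend(Utils::URIAddition)
    uri.query_hash                  # => {:q => "ruby", :page => "2"}

    # DeepFetchable#get_in: walk a nested hash along a path of keys.
    data = { "meta" => { "count" => 42 } }.extend(Utils::DeepFetchable)
    data.get_in(["meta", "count"])  # => 42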
data/lib/extraloop.rb ADDED
@@ -0,0 +1,43 @@
+ base_path = File.expand_path(File.dirname(__FILE__) + "/extraloop")
+
+ # Standard library
+ autoload :OpenStruct, "ostruct"
+
+ # Rubygems
+
+ gem "yajl-ruby"
+ gem "nokogiri"
+ gem "typhoeus"
+ gem "logging"
+
+
+ autoload :Nokogiri, "nokogiri"
+ autoload :Yajl,     "yajl"
+ autoload :Typhoeus, "typhoeus"
+
+
+ # Extraloop components
+
+ autoload :Utils,                 "#{base_path}/utils"
+ autoload :ExtractionEnvironment, "#{base_path}/extraction_environment"
+ autoload :ExtractorBase,         "#{base_path}/extractor_base"
+ autoload :DomExtractor,          "#{base_path}/dom_extractor"
+ autoload :JsonExtractor,         "#{base_path}/json_extractor"
+ autoload :ExtractionLoop,        "#{base_path}/extraction_loop"
+ autoload :ScraperBase,           "#{base_path}/scraper_base"
+ autoload :Loggable,              "#{base_path}/loggable"
+ autoload :Hookable,              "#{base_path}/hookable"
+ autoload :IterativeScraper,      "#{base_path}/iterative_scraper"
+
+
+ # Monkey patch ScraperBase with the Loggable module.
+ #
+ # Referencing the constants below is the equivalent of adding extraloop/ to the load path and requiring both ScraperBase and Loggable.
+ #
+ ScraperBase
+ Loggable
+
+
+ class ExtraLoop
+   VERSION = '0.0.1'
+ end
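
As a side note on the two bare constant references near the end: Kernel#autoload registers a file to be required the first time its constant is used, so simply naming ScraperBase and then Loggable forces both files to load in that order and applies the logging monkey patch. A standalone sketch of the same idea (paths here are illustrative):

    autoload :ScraperBase, "extraloop/scraper_base"
    autoload :Loggable,    "extraloop/loggable"

    ScraperBase # first reference: requires scraper_base.rb
    Loggable    # requires loggable.rb, which reopens ScraperBase and includes Loggable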
@@ -0,0 +1,165 @@
+ require 'helpers/spec_helper'
+
+ describe DomExtractor do
+   before(:each) do
+     stub(scraper = Object.new).options
+     stub(scraper).results
+     @env = ExtractionEnvironment.new(scraper)
+     @html ||= <<-EOF
+       <div class="entry">
+         <p><a href="http://example.com">my dummy link</a></p>
+       </div>
+       <div class="entry exclude" />
+       <div class="entry" />
+     EOF
+
+     @xml ||= <<-EOF
+       <?xml version="1.0"?>
+       <StandardDataObject xmlns="myns">
+         <InteractionElements>
+           <TargetCenter>92f4-MPA</TargetCenter>
+           <Trace>7.19879</Trace>
+         </InteractionElements>
+       </StandardDataObject>
+     EOF
+   end
+
+   describe "#new" do
+     subject { DomExtractor.new(:my_field, @env, "p a", :href) }
+     it { subject.field_name.should eql(:my_field) }
+   end
+
+   context "when no attribute is provided" do
+     before do
+       @extractor = DomExtractor.new(:anchor, @env, "p a")
+       @node = @extractor.parse(@html)
+     end
+
+     describe "#extract_field" do
+       subject { @extractor.extract_field(@node) }
+       it { should eql("my dummy link") }
+     end
+   end
+
+
+   context "when an attribute is provided" do
+     before do
+       @extractor = DomExtractor.new(:anchor, @env, "p a", :href)
+       @node = @extractor.parse(@html)
+     end
+
+     describe "#extract_field" do
+       subject { @extractor.extract_field(@node) }
+       it { should eql("http://example.com") }
+     end
+   end
+
+   context "when a selector and a block are provided" do
+     before do
+       @extractor = DomExtractor.new(:anchor, @env, "p a", proc { |node|
+         node.text.gsub("dummy", "fancy")
+       })
+       @node = @extractor.parse(@html)
+     end
+
+     describe "#extract_field" do
+       subject { @extractor.extract_field(@node) }
+       it { should match(/my fancy/) }
+     end
+   end
+
+   context "when only a block is provided" do
+     before do
+       @extractor = DomExtractor.new(:anchor, @env, proc { |document|
+         document.at_css("p a").text.gsub(/dummy/,'fancy')
+       })
+       @node = @extractor.parse(@html)
+     end
+
+     describe "#extract_field" do
+       subject { @extractor.extract_field(@node) }
+       it { should match(/my fancy/) }
+     end
+   end
+
+   context "when only an attribute is provided" do
+     before do
+       @extractor = DomExtractor.new(:url, @env, :href)
+       @node = @extractor.parse('<a href="hello-world">Hello</a>').at_css("a")
+     end
+     describe "#extract_field" do
+       subject { @extractor.extract_field(@node) }
+       it { should eql("hello-world") }
+     end
+   end
+
+   context "when nothing but a field name is provided" do
+     before do
+       @extractor = DomExtractor.new(:url, @env)
+       @node = @extractor.parse('<a href="hello-world">Hello</a>').at_css("a")
+     end
+     describe "#extract_field" do
+       subject { @extractor.extract_field(@node) }
+       it { should eql("Hello") }
+     end
+   end
+
+   describe "extract_list" do
+     context "no block provided" do
+       before do
+         @extractor = DomExtractor.new(nil, @env, "div.entry")
+         @node = @extractor.parse(@html)
+       end
+
+       subject { @extractor.extract_list(@node) }
+       it { subject.should have(3).items }
+     end
+
+     context "block provided" do
+       before do
+         @extractor = DomExtractor.new(nil, @env, "div.entry", lambda { |nodeList|
+           nodeList.reject {|node| node.attr(:class).split(" ").include?('exclude') }
+         })
+       end
+
+       subject { @extractor.extract_list(@html) }
+       it { subject.should have(2).items }
+     end
+   end
+
+   context "xml input" do
+     describe "#parse" do
+       before do
+         @extractor = DomExtractor.new(nil, @env, "entry")
+       end
+
+       subject { @extractor.parse(@xml) }
+       it { should be_an_instance_of(Nokogiri::XML::Document) }
+     end
+   end
+
+
+   context "html input" do
+     describe "#parse" do
+       before do
+         @extractor = DomExtractor.new(nil, @env, "entry")
+       end
+
+       subject { @extractor.parse(@html) }
+       it { should be_an_instance_of(Nokogiri::HTML::Document) }
+     end
+   end
+
+   context "non-string input" do
+     describe "#parse" do
+       before do
+         @extractor = DomExtractor.new(nil, @env, "entry")
+       end
+
+       it "should raise an exception" do
+         expect { @extractor.parse(Nokogiri::HTML(@html)) }.to raise_exception(ExtractorBase::Exceptions::ExtractorParseError)
+       end
+
+     end
+   end
+ end
@@ -0,0 +1,76 @@
+ require 'helpers/spec_helper'
+
+ describe ExtractionLoop do
+
+   before(:each) do
+     @fake_scraper = stub!.options
+     stub(@fake_scraper).results
+   end
+
+   describe "#new" do
+     before do
+       @mock_loop = Object.new
+       stub(@mock_loop).parse {}
+
+     end
+
+     subject { ExtractionLoop.new(@mock_loop) }
+
+     it "should allow read/write access to public attributes" do
+
+       {:extractors => [:fake, :fake],
+        :document => nil,
+        :hooks => { }
+       }.each do |k, v|
+         subject.send("#{k}=", v)
+         subject.send(k).should eql(v)
+       end
+     end
+   end
+
+   describe "run" do
+     before(:each) do
+
+       @extractors = [:a, :b].map do |field_name|
+         object = Object.new
+         stub(object).extract_field { |node, record| node[field_name] }
+         stub(object).field_name { field_name }
+         object
+       end
+
+       @loop_extractor = Object.new
+
+       stub(@loop_extractor).parse { |input| Nokogiri::HTML("<html><body>Hello test!</body></html>") }
+
+       stub(@loop_extractor).extract_list { |document|
+         # list of fake dom elements
+         (0..9).to_a.map { |n| {:a => n, :b => n*n } }
+       }
+
+
+       before, before_extract, after_extract, after = *(1..4).to_a.map { proc {} }
+       hooks = {before: [before], before_extract: [before_extract], after_extract: [after_extract], after: [after]}
+
+       any_instance_of(ExtractionEnvironment) do |env|
+         mock(env).run.with_any_args.times(20 + 2)
+       end
+
+       @extraction_loop = ExtractionLoop.new(@loop_extractor, @extractors, "fake document", hooks, @fake_scraper).run
+     end
+
+     subject { @extraction_loop.run }
+
+     it "should produce 10 records" do
+       @extraction_loop.records.size.should eql(10)
+     end
+
+     it "should run extractors" do
+       @extraction_loop.records.all? { |record| record.a && record.b && record.b == record.a ** 2 }
+     end
+
+     it "should convert extracted records into OpenStruct instances" do
+       @extraction_loop.records.all? { |record| record.is_a?(OpenStruct) }
+     end
+
+   end
+ end