extraloop 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1 @@
+ {"completed_in":0.241,"max_id":149792397398786048,"max_id_str":"149792397398786048","next_page":"?page=2&max_id=149792397398786048&q=lolcatz","page":1,"query":"lolcatz","refresh_url":"?since_id=149792397398786048&q=lolcatz","results":[{"created_at":"Thu, 22 Dec 2011 10:04:25 +0000","from_user":"ludovickohn","from_user_id":389760832,"from_user_id_str":"389760832","from_user_name":"Ludovic kohn","geo":null,"id":149792397398786048,"id_str":"149792397398786048","iso_language_code":"fr","metadata":{"result_type":"recent"},"profile_image_url":"http://a2.twimg.com/profile_images/1598373174/image_normal.jpg","profile_image_url_https":"https://si0.twimg.com/profile_images/1598373174/image_normal.jpg","source":"<a href="http://www.samsungmobile.com" rel="nofollow">Samsung Mobile</a>","text":"J'ai ramen\u00E9e mon #rat \u00E0 paris pour les vacances, mon #chat kiffe il a enfin la t\u00E9l\u00E9! #lolcatz http://t.co/DIvnew3o","to_user":null,"to_user_id":null,"to_user_id_str":null,"to_user_name":null},{"created_at":"Thu, 22 Dec 2011 07:43:25 +0000","from_user":"proud_stang5","from_user_id":138298648,"from_user_id_str":"138298648","from_user_name":"Hunter Byrnes","geo":null,"id":149756915742806016,"id_str":"149756915742806016","iso_language_code":"de","metadata":{"result_type":"recent"},"profile_image_url":"http://a2.twimg.com/profile_images/1629863431/image_normal.jpg","profile_image_url_https":"https://si0.twimg.com/profile_images/1629863431/image_normal.jpg","source":"<a href="http://twitter.com/#!/download/iphone" rel="nofollow">Twitter for iPhone</a>","text":"@kenferrera88 #lolcatz girl","to_user":"kenferrera88","to_user_id":330123906,"to_user_id_str":"330123906","to_user_name":"McKenna Ferrrera","in_reply_to_status_id":149756483284893696,"in_reply_to_status_id_str":"149756483284893696"},{"created_at":"Thu, 22 Dec 2011 06:01:27 +0000","from_user":"CatherineSwa","from_user_id":28416785,"from_user_id_str":"28416785","from_user_name":"Catherine Swantner","geo":null,"id":149731252528889856,"id_str":"149731252528889856","iso_language_code":"en","metadata":{"result_type":"recent"},"profile_image_url":"http://a3.twimg.com/profile_images/1462050284/5920354189_e564722964_b_normal.jpg","profile_image_url_https":"https://si0.twimg.com/profile_images/1462050284/5920354189_e564722964_b_normal.jpg","source":"<a href="http://twitter.com/">web</a>","text":"@nicole1brown lolcatz meow","to_user":"nicole1brown","to_user_id":328187799,"to_user_id_str":"328187799","to_user_name":"Nicole Brown","in_reply_to_status_id":149730450024308736,"in_reply_to_status_id_str":"149730450024308736"},{"created_at":"Thu, 22 Dec 2011 04:34:51 +0000","from_user":"DanielleKim3","from_user_id":442392597,"from_user_id_str":"442392597","from_user_name":"Danielle Kim","geo":null,"id":149709461068578816,"id_str":"149709461068578816","iso_language_code":"it","metadata":{"result_type":"recent"},"profile_image_url":"http://a0.twimg.com/profile_images/1705391173/319281_10150395530514216_619154215_8228458_1986790910_n_normal.jpg","profile_image_url_https":"https://si0.twimg.com/profile_images/1705391173/319281_10150395530514216_619154215_8228458_1986790910_n_normal.jpg","source":"<a href="http://twitter.com/#!/download/iphone" rel="nofollow">Twitter for iPhone</a>","text":"RT @LizzieMcCaffrey: LOLCATZ. 
#Iamscum http://t.co/EYU6xS4f","to_user":null,"to_user_id":null,"to_user_id_str":null,"to_user_name":null},{"created_at":"Thu, 22 Dec 2011 03:48:42 +0000","from_user":"megg_xxo","from_user_id":360964239,"from_user_id_str":"360964239","from_user_name":"Megan Brown","geo":null,"id":149697846000619520,"id_str":"149697846000619520","iso_language_code":"en","metadata":{"result_type":"recent"},"profile_image_url":"http://a2.twimg.com/profile_images/1610806404/image_normal.jpg","profile_image_url_https":"https://si0.twimg.com/profile_images/1610806404/image_normal.jpg","source":"<a href="http://twitter.com/#!/download/iphone" rel="nofollow">Twitter for iPhone</a>","text":"RT @sydney_berger96: rocking out to karaoke with @megg_xxo oh my god why are we so good #lolcatz","to_user":null,"to_user_id":null,"to_user_id_str":null,"to_user_name":null},{"created_at":"Thu, 22 Dec 2011 03:36:46 +0000","from_user":"TeighanDavies","from_user_id":382347005,"from_user_id_str":"382347005","from_user_name":"Teighan Davies","geo":null,"id":149694844325990400,"id_str":"149694844325990400","iso_language_code":"en","metadata":{"result_type":"recent"},"profile_image_url":"http://a2.twimg.com/profile_images/1613021110/Picture0298_normal.jpg","profile_image_url_https":"https://si0.twimg.com/profile_images/1613021110/Picture0298_normal.jpg","source":"<a href="http://twitter.com/#!/download/iphone" rel="nofollow">Twitter for iPhone</a>","text":"@stephmcduff Lolcatz no #butilove #DALLASGREEN","to_user":"stephmcduff","to_user_id":233011266,"to_user_id_str":"233011266","to_user_name":"Stephanie McDuff","in_reply_to_status_id":149694211095142401,"in_reply_to_status_id_str":"149694211095142401"},{"created_at":"Thu, 22 Dec 2011 03:15:32 +0000","from_user":"wolfdawg69","from_user_id":376549124,"from_user_id_str":"376549124","from_user_name":"Delilahhhhh","geo":null,"id":149689499847364608,"id_str":"149689499847364608","iso_language_code":"it","metadata":{"result_type":"recent"},"profile_image_url":"http://a3.twimg.com/profile_images/1551014969/bubbababby_normal.jpg","profile_image_url_https":"https://si0.twimg.com/profile_images/1551014969/bubbababby_normal.jpg","source":"<a href="http://twitter.com/devices" rel="nofollow">txt</a>","text":"LOLCATZ","to_user":null,"to_user_id":null,"to_user_id_str":null,"to_user_name":null},{"created_at":"Thu, 22 Dec 2011 02:52:38 +0000","from_user":"PagingDrDre","from_user_id":335938237,"from_user_id_str":"335938237","from_user_name":"Andrea Ratzlaff","geo":null,"id":149683737448284162,"id_str":"149683737448284162","iso_language_code":"en","metadata":{"result_type":"recent"},"profile_image_url":"http://a2.twimg.com/profile_images/1444343446/image_normal.jpg","profile_image_url_https":"https://si0.twimg.com/profile_images/1444343446/image_normal.jpg","source":"<a href="http://twitter.com/#!/download/iphone" rel="nofollow">Twitter for iPhone</a>","text":"LAWL lolcatz @t_chapstick14","to_user":null,"to_user_id":null,"to_user_id_str":null,"to_user_name":null},{"created_at":"Thu, 22 Dec 2011 01:48:22 +0000","from_user":"BigBoiRono","from_user_id":341301684,"from_user_id_str":"341301684","from_user_name":"Donovan Huyck","geo":null,"id":149667564874768385,"id_str":"149667564874768385","iso_language_code":"en","metadata":{"result_type":"recent"},"profile_image_url":"http://a3.twimg.com/profile_images/1457795278/synchro_meet_normal.jpg","profile_image_url_https":"https://si0.twimg.com/profile_images/1457795278/synchro_meet_normal.jpg","source":"<a href="http://twitter.com/devices" 
rel="nofollow">txt</a>","text":"Bai hai wut awks o SALT lolcatz #fuckyou","to_user":null,"to_user_id":null,"to_user_id_str":null,"to_user_name":null},{"created_at":"Thu, 22 Dec 2011 00:11:38 +0000","from_user":"StephanieZOO","from_user_id":379177126,"from_user_id_str":"379177126","from_user_name":"Stephanie Zamora","geo":null,"id":149643220807192576,"id_str":"149643220807192576","iso_language_code":"en","metadata":{"result_type":"recent"},"profile_image_url":"http://a1.twimg.com/profile_images/1557641924/035_06__s_normal.jpg","profile_image_url_https":"https://si0.twimg.com/profile_images/1557641924/035_06__s_normal.jpg","source":"<a href="http://dlvr.it" rel="nofollow">dlvr.it</a>","text":"LolCatz Navy Chooses Two Women To Share Traditional First Kiss","to_user":null,"to_user_id":null,"to_user_id_str":null,"to_user_name":null},{"created_at":"Thu, 22 Dec 2011 00:11:02 +0000","from_user":"MissTawdry","from_user_id":284260713,"from_user_id_str":"284260713","from_user_name":"MissTawdry","geo":null,"id":149643068751093760,"id_str":"149643068751093760","iso_language_code":"en","metadata":{"result_type":"recent"},"profile_image_url":"http://a0.twimg.com/profile_images/1431052668/bod_normal.jpg","profile_image_url_https":"https://si0.twimg.com/profile_images/1431052668/bod_normal.jpg","source":"<a href="http://twitterfeed.com" rel="nofollow">twitterfeed</a>","text":"#LolCatz Navy Chooses Two Women To Share Traditional First Kiss http://t.co/7VqFgQl0 @misslindadee","to_user":null,"to_user_id":null,"to_user_id_str":null,"to_user_name":null},{"created_at":"Wed, 21 Dec 2011 23:51:17 +0000","from_user":"LIVInmylife6","from_user_id":266053733,"from_user_id_str":"266053733","from_user_name":"Olivia Lake$","geo":null,"id":149638097905258497,"id_str":"149638097905258497","iso_language_code":"en","metadata":{"result_type":"recent"},"profile_image_url":"http://a1.twimg.com/profile_images/1703355659/IMG00351-20110523-2253_normal.jpg","profile_image_url_https":"https://si0.twimg.com/profile_images/1703355659/IMG00351-20110523-2253_normal.jpg","source":"<a href="http://blackberry.com/twitter" rel="nofollow">Twitter for BlackBerry\u00AE</a>","text":"@CelioAraujo10 lolcatz I guesss its ok...ps u should fight ross LOL","to_user":"CelioAraujo10","to_user_id":379857020,"to_user_id_str":"379857020","to_user_name":"Celio Araujo","in_reply_to_status_id":149637652952518657,"in_reply_to_status_id_str":"149637652952518657"},{"created_at":"Wed, 21 Dec 2011 19:05:48 +0000","from_user":"rl1971","from_user_id":390991730,"from_user_id_str":"390991730","from_user_name":"RL","geo":null,"id":149566253621723136,"id_str":"149566253621723136","iso_language_code":"en","metadata":{"result_type":"recent"},"profile_image_url":"http://a3.twimg.com/profile_images/1588515141/twittericon_normal.jpg","profile_image_url_https":"https://si0.twimg.com/profile_images/1588515141/twittericon_normal.jpg","source":"<a href="http://www.tweetdeck.com" rel="nofollow">TweetDeck</a>","text":"@w1zz http://t.co/4aIMUK1E #lolcatz #liev Met credits naar @gnesmu :)","to_user":"w1zz","to_user_id":194056505,"to_user_id_str":"194056505","to_user_name":"WiZZarD","in_reply_to_status_id":149503807187984385,"in_reply_to_status_id_str":"149503807187984385"},{"created_at":"Wed, 21 Dec 2011 18:51:52 +0000","from_user":"xTalieee1171x","from_user_id":66912646,"from_user_id_str":"66912646","from_user_name":"TC 
McCullough","geo":null,"id":149562749293297666,"id_str":"149562749293297666","iso_language_code":"en","metadata":{"result_type":"recent"},"profile_image_url":"http://a2.twimg.com/profile_images/1631481911/DSCN9149_normal.JPG","profile_image_url_https":"https://si0.twimg.com/profile_images/1631481911/DSCN9149_normal.JPG","source":"<a href="http://red-badger.com" rel="nofollow">Birdsong for Windows Phone</a>","text":"Cuuuuuute! RT @Skyrawr All of my animals chillin out! Lolcatz http://t.co/inRSK1VM","to_user":null,"to_user_id":null,"to_user_id_str":null,"to_user_name":null},{"created_at":"Wed, 21 Dec 2011 18:12:26 +0000","from_user":"gentin123","from_user_id":279705653,"from_user_id_str":"279705653","from_user_name":"morgan booker","geo":null,"id":149552824877973504,"id_str":"149552824877973504","iso_language_code":"it","metadata":{"result_type":"recent"},"profile_image_url":"http://a1.twimg.com/profile_images/1664758777/39216_421872978366_729553366_5014440_2895898_n_normal.jpg","profile_image_url_https":"https://si0.twimg.com/profile_images/1664758777/39216_421872978366_729553366_5014440_2895898_n_normal.jpg","source":"<a href="http://blackberry.com/twitter" rel="nofollow">Twitter for BlackBerry\u00AE</a>","text":"@mollyjayne_x everiiwon luffz LOLcatz http://t.co/UaGzxNtV","to_user":"mollyjayne_x","to_user_id":408582032,"to_user_id_str":"408582032","to_user_name":"molly","in_reply_to_status_id":149549826068066305,"in_reply_to_status_id_str":"149549826068066305"}],"results_per_page":15,"since_id":0,"since_id_str":"0"}
@@ -0,0 +1,46 @@
+ module Helpers
+   module Scrapers
+
+     # Public:
+     #
+     # Stubs an HTTP request/response pair.
+     #
+     # request_args  - Hash of options to be passed to Typhoeus::Request.
+     # response_args - Hash of options to be passed to Typhoeus::Response (and to Hydra#stub).
+     #
+     # Returns nothing.
+     #
+
+     def stub_http(request_args={}, response_args={})
+
+       response_args = {
+         :code => 200,
+         :headers => "Content-Type: text/html",
+         :body => "response stub"
+       }.merge(response_args)
+
+       request_args = {
+         :method => :get,
+         :url => anything,
+         :options => anything
+       }.merge(request_args)
+
+       @hydra ||= Typhoeus::Hydra.new
+       stub(Typhoeus::Hydra).new { @hydra }
+       response = Typhoeus::Response.new(response_args)
+
+       stub.proxy(Typhoeus::Request).new(request_args[:url], request_args[:options]) do |request|
+         #
+         # This allows stubbing several requests by handing control to a block.
+         #
+
+         if block_given?
+           yield(@hydra, request, response)
+         else
+           @hydra.stub(request_args[:method], request_args[:url]).and_return(response)
+         end
+         request
+       end
+     end
+   end
+ end
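
The specs below exercise this helper in two ways: with explicit response arguments, and with a block that receives the Hydra instance so several requests can be stubbed at once. A condensed example, lifted from the scraper specs further down (nothing here beyond what those specs already do; @fixture_doc and @values_sent are their variables):

    # Stub a single response body for whatever URL the scraper requests.
    stub_http({}, :body => @fixture_doc) do |hydra, request, response|
      hydra.stub(:get, request.url).and_return(response)
    end

    # Or stub a JSON response and record a parameter of each request as it is issued.
    stub_http({}, :headers => "Content-Type: application/json", :body => '{"hello":"test"}') do |hydra, request, response|
      @values_sent << request.params[:continue]
      hydra.stub(:get, request.url).and_return(response)
    end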
@@ -0,0 +1,12 @@
+ require 'rr'
+ require 'pry'
+
+ base_path = File.expand_path(File.dirname(File.dirname(File.dirname(__FILE__))))
+
+ load base_path + "/lib/extraloop.rb"
+
+ require 'helpers/scraper_helper'
+
+ RSpec.configure do |config|
+   config.mock_with :rr
+ end
@@ -0,0 +1,175 @@
+ require 'helpers/spec_helper'
+ include Helpers::Scrapers
+
+ describe IterativeScraper do
+   before(:each) do
+     @fixture_doc ||= proc {
+       file = File.open("fixtures/doc.html", "r")
+       file_content = file.read
+       file.close
+       file_content
+     }.call
+   end
+
+   describe "#initialize" do
+     subject { IterativeScraper.new("http://whatever.net/search") }
+
+     it { subject.should be_a(ScraperBase) }
+     it { subject.should respond_to(:log) }
+   end
+
+   describe "#set_iteration" do
+     before do
+       @scraper = IterativeScraper.new("http://whatever.net/search")
+     end
+
+     it "should allow passing a range and return itself" do
+       @scraper.set_iteration((0..10).step(5)).should be_an_instance_of(IterativeScraper)
+     end
+     it "should allow passing an array and return itself" do
+       @scraper.set_iteration([1, 2, 3, 4]).should be_an_instance_of(IterativeScraper)
+     end
+     it "should allow passing a string and a proc and return itself" do
+       @scraper.set_iteration("#pagination a", proc {}).should be_an_instance_of(IterativeScraper)
+     end
+   end
+
+   describe "#continue_with" do
+     before do
+       @scraper = IterativeScraper.new("http://whatever.net/search")
+     end
+
+     subject { @scraper.continue_with(proc { |result| result['continue'] }) }
+     it { should be_an_instance_of(IterativeScraper) }
+   end
+
+
+   context "(single url pattern, iteration_set is range, async => false)" do
+     before(:each) do
+       @scraper = IterativeScraper.new("http://whatever.net/search")
+       mock(@scraper).run_super(:run).times(10) {}
+       @scraper.set_iteration(:p, (1..10))
+     end
+
+     describe "#run" do
+       it "super#run should be called 10 times" do
+         @scraper.run
+       end
+     end
+   end
+
+   context "(single url pattern, iteration_set is extractor, async => false)" do
+     before(:each) do
+
+       iteration_count = 0
+       @params_sent = []
+       iteration_proc = proc { [2, 3, 4] }
+
+       any_instance_of(ExtractionLoop) do |eloop|
+         stub(eloop).run {}
+       end
+
+       stub_http do |hydra, request, response|
+         hydra.stub(:get, /http:\/\/whatever\.net\/search/).and_return(response)
+         @params_sent << request.params[:p]
+       end
+
+       @scraper = IterativeScraper.
+         new("http://whatever.net/search-stuff").
+         set_iteration(:p, iteration_proc).
+         loop_on(".whatever").
+         set_hook(:data, proc { iteration_count += 1 }).
+         run()
+
+       @iteration_count = iteration_count
+     end
+
+     describe "#run" do
+       it "The :data hook should be called 4 times" do
+         @iteration_count.should eql(4)
+       end
+
+       it "should have sent p=1, p=2, p=3, p=4 as request parameters" do
+         @params_sent.should eql(["1", "2", "3", "4"])
+       end
+     end
+   end
+
+   context "(single url pattern, iteration_set is range, async => true)" do
+
+     before do
+       @params_sent = []
+       any_instance_of(ExtractionLoop) do |eloop|
+         stub(eloop).run {}
+       end
+
+       stub_http do |hydra, request, response|
+         hydra.stub(:get, request.url).and_return(response)
+         @params_sent << request.params[:p]
+       end
+
+       @scraper = IterativeScraper.
+         new("http://whatever.net/search", { :async => true }, { :params => { :format => "json" } }).
+         set_iteration(:p, (0..20).step(5)).
+         loop_on(".whatever").
+         run()
+     end
+
+
+     describe "#run" do
+       it "params sent should be p=0, p=5, p=10, p=15, p=20" do
+         @params_sent.should eql([0, 5, 10, 15, 20].map(&:to_s))
+       end
+     end
+   end
+
+   context "using #continue_with" do
+
+     describe "#run" do
+       before do
+         continue_values = (5..10).to_a
+         @values_sent = []
+         shift_values = proc { |data| continue_values.shift }
+
+
+         stub_http({}, { :headers => "Content-Type: application/json", :body => '{"hello":"test"}' }) do |hydra, request, response|
+           @values_sent << request.params[:continue]
+           hydra.stub(:get, request.url).and_return(response)
+         end
+
+         IterativeScraper.
+           new("http://twizzer.net/timeline").
+           loop_on(proc {}).
+           continue_with(:continue, shift_values).
+           run()
+
+         @continue_values = continue_values
+
+       end
+
+       # TODO:
+       #
+       # When #continue_with is used, it would be better to avoid sending
+       # an empty iteration_parameter.
+       #
+       it "Should run 5 times", :failing => 'true' do
+         @continue_values.all? { |val| @values_sent.include? val.to_s }.should be_true
+       end
+
+     end
+   end
+
+   context "using #continue_with with async = true" do
+     describe "#run" do
+       before do
+         @scraper = IterativeScraper.
+           new("http://twizzer.net/timeline", :async => true)
+       end
+
+       it "should raise an exception" do
+         expect { @scraper.continue_with(:continue, proc {}) }.to raise_exception(IterativeScraper::Exceptions::NonGetAsyncRequestNotYetImplemented)
+       end
+     end
+   end
+
+ end
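
Read together, the examples above pin down the IterativeScraper surface this gem version ships: iteration over a range, an array, or a selector plus proc, and continuation-style paging, which is rejected when :async => true. A condensed recap of the call forms exercised above (nothing new, just the spec's own usage side by side):

    scraper = IterativeScraper.new("http://whatever.net/search")

    scraper.set_iteration(:p, (1..10))               # range of values for the :p parameter
    scraper.set_iteration([1, 2, 3, 4])              # explicit array
    scraper.set_iteration("#pagination a", proc {})  # selector plus extraction proc

    # Continuation-style paging; raises NonGetAsyncRequestNotYetImplemented with :async => true.
    scraper.continue_with(:continue, proc { |result| result['continue'] })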
@@ -0,0 +1,146 @@
+ require 'helpers/spec_helper'
+
+ describe JsonExtractor do
+   before(:each) do
+     stub(scraper = Object.new).options
+     stub(scraper).results
+     @env = ExtractionEnvironment.new(scraper)
+     @json ||= lambda {
+       file = File.open('fixtures/doc.json', 'r')
+       content = file.read
+       file.close
+       content
+     }.call()
+   end
+
+   describe "#initialize" do
+     context("argument is a block") do
+       subject { JsonExtractor.new(:thing, @env, proc {}) }
+
+       it { subject.field_name.should eql(:thing) }
+       it { should_not respond_to(:callback) }
+     end
+   end
+
+
+   describe "#extract_field" do
+
+     context "field_name only" do
+       before do
+         @extractor = JsonExtractor.new(:from_user, @env)
+         @node = @extractor.parse(@json)['results'].first
+       end
+
+       subject { @extractor.extract_field(@node) }
+       it { should eql("ludovickohn") }
+     end
+
+     context "field_name and callback" do
+       before do
+         @extractor = JsonExtractor.new(:from_user, @env, proc { |node| node['from_user_name'] })
+         @node = @extractor.parse(@json)['results'].first
+       end
+
+       subject { @extractor.extract_field(@node) }
+       it { should eql("Ludovic kohn") }
+     end
+
+     context "field_name and attribute" do
+       before do
+         @extractor = JsonExtractor.new(:from_user, @env, :from_user_name)
+         @node = @extractor.parse(@json)['results'].first
+       end
+
+       subject { @extractor.extract_field(@node) }
+       it { should eql("Ludovic kohn") }
+     end
+
+     context "field name, attribute, and callback" do
+       before do
+         @extractor = JsonExtractor.new(:from_user, @env, :from_user_name, proc { |username| username.downcase.gsub("\s", "-") })
+         @node = @extractor.parse(@json)['results'].first
+       end
+
+       subject { @extractor.extract_field(@node) }
+       it { should eql("ludovic-kohn") }
+     end
+
+     context("field name and array (see Utils::DeepFetchable)") do
+       before do
+         @extractor = JsonExtractor.new(:from_user, @env, ['results', 0, 'from_user'])
+       end
+       subject { @extractor.extract_field(@json) }
+       it { should eql("ludovickohn") }
+     end
+
+     context("field name, array, and callback") do
+       before do
+         @extractor = JsonExtractor.new(:from_user, @env, ['results', 0, 'from_user'], proc { |username| username.gsub("ckohn", 'co') })
+       end
+       subject { @extractor.extract_field(@json) }
+       it { should eql("ludovico") }
+     end
+
+   end
+
+   describe "#extract_list" do
+
+
+     context "using #get_in" do
+       before do
+         @extractor = JsonExtractor.new(nil, @env, ['results', 0..5])
+       end
+
+       subject { @extractor.extract_list(@json) }
+
+       it { subject.size.should eql(6) }
+     end
+
+     context "with json string input" do
+       before do
+         @extractor = JsonExtractor.new(nil, @env, proc { |data| data['results'] })
+       end
+
+       subject { @extractor.extract_list(@json) }
+       it { subject.size.should eql(15) }
+       it { should be_an_instance_of(Array) }
+     end
+
+     context "with pre-parsed input" do
+       before do
+         @extractor = JsonExtractor.new(nil, @env, proc { |data| data['results'] })
+       end
+
+       subject { @extractor.extract_list((Yajl::Parser.new).parse(@json)) }
+       it { subject.size.should eql(15) }
+       it { should be_an_instance_of(Array) }
+     end
+
+   end
+
+   context "non-string input" do
+     describe "#parse" do
+       before do
+         @extractor = JsonExtractor.new(nil, @env, proc {})
+       end
+
+       it "Should raise an exception" do
+         expect { @extractor.parse(Nokogiri::HTML(@html)) }.to raise_exception(ExtractorBase::Exceptions::ExtractorParseError)
+       end
+     end
+   end
+
+   context "#json input" do
+     describe "#parse" do
+       before do
+         @extractor = JsonExtractor.new(nil, @env, proc {})
+       end
+
+       subject { @extractor.parse(@json) }
+
+       it { should respond_to(:get_in) }
+       it { should be_an_instance_of(Hash) }
+       it { should_not be_empty }
+     end
+   end
+ end
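
For reference, the constructor variants exercised above, side by side (each taken from an example in this file; @env is the environment set up in the before block):

    JsonExtractor.new(:from_user, @env)                                           # field name only
    JsonExtractor.new(:from_user, @env, proc { |node| node['from_user_name'] })   # callback
    JsonExtractor.new(:from_user, @env, :from_user_name)                          # attribute
    JsonExtractor.new(:from_user, @env, ['results', 0, 'from_user'])              # DeepFetchable path
    JsonExtractor.new(nil, @env, ['results', 0..5])                               # path with a Range, for #extract_list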
@@ -0,0 +1,25 @@
+ require 'helpers/spec_helper'
+
+ class LoggableClass
+   attr_reader :super_called
+   include Loggable
+
+   def initialize
+     @super_called = true
+   end
+ end
+
+ describe Loggable do
+   describe "#initialize" do
+
+     subject { LoggableClass.new }
+
+     it "should execute the class' #initialize method" do
+       subject.super_called.should eql(true)
+     end
+
+     it "should respond to the #log method" do
+       subject.respond_to?(:log).should be_true
+     end
+   end
+ end
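
The two assertions above pass as long as the including class's own #initialize still runs (whether or not the mixin wraps it) and a #log instance method becomes available. The simplest module satisfying them looks roughly like this (an illustrative sketch, not extraloop's actual Loggable implementation):

    module Loggable
      # Instance-level logging; including this module leaves the host
      # class's #initialize untouched.
      def log(message, level = :info)
        $stderr.puts("[#{level}] #{message}")
      end
    end

    class LoggableClass
      attr_reader :super_called
      include Loggable

      def initialize
        @super_called = true
      end
    end

    LoggableClass.new.respond_to?(:log)  # => true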
@@ -0,0 +1,178 @@
+ require 'helpers/spec_helper'
+ include Helpers::Scrapers
+
+ describe ScraperBase do
+   before do
+     @fixture_doc = File.open("fixtures/doc.html", 'r') do |file|
+       file.read
+     end
+   end
+
+   before(:each) do
+     @scraper = ScraperBase.new("http://localhost/fixture")
+   end
+
+
+   describe "#loop_on" do
+     subject { @scraper.loop_on("bla.bla") }
+     it { should be_an_instance_of(ScraperBase) }
+   end
+
+   describe "#extract" do
+     subject { @scraper.extract("fieldname", "bla.bla") }
+     it { should be_an_instance_of(ScraperBase) }
+   end
+
+   describe "#set_hook" do
+     subject { @scraper.set_hook(:after, proc {}) }
+     it { should be_an_instance_of(ScraperBase) }
+   end
+
+   describe "#set_hook" do
+     it "should raise exception if no proc is provided" do
+       expect { @scraper.set_hook(:after, :method) }.to raise_exception(ScraperBase::Exceptions::HookArgumentError)
+     end
+   end
+
+   context "request params in both the url and the arguments hash" do
+     describe "#run" do
+       before do
+
+         @request_args = {}
+         url = "http://localhost/whatever?q=stuff&p=1&limit=100"
+         stub_http do |hydra, request, response|
+           @request_args = request.params
+           hydra.stub(:get, request.url).and_return(response)
+         end
+
+         any_instance_of(ExtractionLoop) do |extraloop|
+           stub(extraloop).run {}
+         end
+
+         @scraper = ScraperBase.new(url, {}, {
+           :params => { :limit => 250 }
+         }).loop_on(".stuff").run
+       end
+
+       it "should merge URL and request parameters" do
+         @request_args[:p].to_s.should eql("1")
+         @request_args[:q].to_s.should eql("stuff")
+         @request_args[:limit].to_s.should eql("250")
+       end
+     end
+   end
+
+   context "single url, no options provided (async => false)" do
+     describe "#run" do
+       before do
+         @url = "http://localhost/fixture"
+         results = []
+
+         stub_http({}, :body => @fixture_doc) do |hydra, request, response|
+           hydra.stub(:get, request.url).and_return(response)
+         end
+
+         @scraper = ScraperBase.new(@url).
+           loop_on("ul li.file a").
+           extract(:url, :href).
+           extract(:filename).
+           set_hook(:data, proc { |records| records.each { |record| results << record } })
+
+         @results = results
+       end
+
+
+       it "Should handle response" do
+         @scraper.run
+         @results.should_not be_empty
+         @results.all? { |record| record.extracted_at && record.url && record.filename }.should be_true
+       end
+     end
+   end
+
+   context "multiple urls (async => false)" do
+     describe "#run" do
+       before do
+         @urls = [
+           "http://localhost/fixture1",
+           "http://localhost/fixture2",
+           "http://localhost/fixture3",
+         ]
+         results = []
+         @hydra_run_call_count = 0
+
+         stub_http do |hydra, request, response|
+           @urls.each { |url| hydra.stub(:get, url).and_return(response) }
+           stub.proxy(hydra).run { @hydra_run_call_count += 1 }
+         end
+
+         @scraper = ScraperBase.new(@urls, :log => false).
+           loop_on("ul li.file a").
+           extract(:url, :href).
+           extract(:filename).
+           set_hook(:data, proc { |records| records.each { |record| results << record } })
+
+         @results = results
+
+         @fake_loop = Object.new
+         stub(@fake_loop).run { }
+         stub(@fake_loop).environment { ExtractionEnvironment.new }
+         stub(@fake_loop).records { Array(1..3).map { |n| Object.new } }
+
+         mock(ExtractionLoop).new(is_a(DomExtractor), is_a(Array), is_a(String), is_a(Hash), is_a(ScraperBase)).times(3) { @fake_loop }
+       end
+
+
+       it "Should handle response" do
+         @scraper.run
+         @results.size.should eql(9)
+         @hydra_run_call_count.should eql(@urls.size)
+       end
+     end
+   end
+
+
+   context "multiple urls (async => true)" do
+     describe "#run" do
+       before do
+         @urls = [
+           "http://localhost/fixture1",
+           "http://localhost/fixture2",
+           "http://localhost/fixture3",
+           "http://localhost/fixture4",
+           "http://localhost/fixture5",
+         ]
+         results = []
+         @hydra_run_call_count = 0
+
+         stub_http({}, :body => @fixture_doc) do |hydra, request, response|
+           @urls.each { |url| hydra.stub(:get, url).and_return(response) }
+           stub.proxy(hydra).run { @hydra_run_call_count += 1 }
+         end
+
+         @scraper = ScraperBase.new(@urls, :async => true).
+           loop_on("ul li.file a").
+           extract(:url, :href).
+           extract(:filename).
+           set_hook(:data, proc { |records| records.each { |record| results << record } })
+
+         @results = results
+
+         @fake_loop = Object.new
+         stub(@fake_loop).run { }
+         stub(@fake_loop).environment { ExtractionEnvironment.new }
+         stub(@fake_loop).records { Array(1..3).map { |n| Object.new } }
+
+         mock(ExtractionLoop).new(is_a(DomExtractor), is_a(Array), is_a(String), is_a(Hash), is_a(ScraperBase)).times(@urls.size) { @fake_loop }
+       end
+
+
+       it "Should handle response" do
+         @scraper.run
+         @results.size.should eql(@urls.size * 3)
+         @hydra_run_call_count.should eql(1)
+       end
+     end
+   end
+
+ end
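
The last two contexts isolate what the :async option changes: with async off, Hydra#run is observed once per URL; with async on, every request is queued and Hydra#run fires exactly once. In Typhoeus terms the difference amounts to roughly the following (a sketch of the pattern the expectations describe, not ScraperBase's actual code; the URLs and options are placeholders):

    require 'typhoeus'

    urls    = ["http://localhost/fixture1", "http://localhost/fixture2"]
    options = { :params => { :format => "json" } }
    async   = true

    hydra = Typhoeus::Hydra.new
    requests = urls.map { |url| Typhoeus::Request.new(url, options) }

    if async
      requests.each { |request| hydra.queue(request) }
      hydra.run                 # a single run; requests are performed in parallel
    else
      requests.each do |request|
        hydra.queue(request)
        hydra.run               # one run per URL, effectively sequential
      end
    end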
@@ -0,0 +1,44 @@
+ require 'helpers/spec_helper'
+
+ describe Utils do
+   describe "AugmentedHash" do
+     describe "#get_in" do
+       context "extending a hash object" do
+         before do
+           @hash = {
+             :a => {
+               :b => {
+                 :c => [
+                   :x, :y, :z
+                 ]
+               }
+             }
+           }.extend(Utils::DeepFetchable)
+         end
+
+         subject { @hash.get_in [:a, :b, :c, 2] }
+
+         it { should eql(:z) }
+
+
+         context "trying to fetch a key that does not exist" do
+           subject { @hash.get_in [:a, :b, :wrong, :even_worst] }
+
+           it { should eql(nil) }
+         end
+
+       end
+
+       context "extending an Array object" do
+         before do
+           @array = [1, 2, 3, 4, [5.1, 5.2, 5.3]].extend(Utils::DeepFetchable)
+         end
+
+
+         subject { @array.get_in [4, -1] }
+
+         it { should eql(5.3) }
+       end
+     end
+   end
+ end
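
The #get_in behaviour pinned down above (nested Hash and Array access, Range and negative indices, nil for missing keys) can be reproduced in a few lines. A minimal sketch of a DeepFetchable-style module, offered for illustration rather than as the gem's actual implementation:

    module DeepFetchable
      # Walk a list of keys/indices into nested hashes and arrays,
      # returning nil as soon as the path cannot be followed further.
      def get_in(keys)
        keys.reduce(self) do |node, key|
          break nil unless node.respond_to?(:[])
          node[key]
        end
      end
    end

    data = { :a => { :b => { :c => [:x, :y, :z] } } }.extend(DeepFetchable)
    data.get_in([:a, :b, :c, 2])               # => :z
    data.get_in([:a, :b, :wrong, :even_worst]) # => nil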