extraloop 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/History.txt +2 -0
- data/README.md +135 -0
- data/examples/google_news_scraper.rb +22 -0
- data/examples/wikipedia_categories.rb +49 -0
- data/lib/extraloop/dom_extractor.rb +45 -0
- data/lib/extraloop/extraction_environment.rb +20 -0
- data/lib/extraloop/extraction_loop.rb +46 -0
- data/lib/extraloop/extractor_base.rb +40 -0
- data/lib/extraloop/hookable.rb +26 -0
- data/lib/extraloop/iterative_scraper.rb +291 -0
- data/lib/extraloop/json_extractor.rb +36 -0
- data/lib/extraloop/loggable.rb +64 -0
- data/lib/extraloop/scraper_base.rb +166 -0
- data/lib/extraloop/utils.rb +75 -0
- data/lib/extraloop.rb +43 -0
- data/spec/dom_extractor_spec.rb +165 -0
- data/spec/extraction_loop_spec.rb +76 -0
- data/spec/fixtures/doc.html +1324 -0
- data/spec/fixtures/doc.json +1 -0
- data/spec/helpers/scraper_helper.rb +46 -0
- data/spec/helpers/spec_helper.rb +12 -0
- data/spec/iterative_scraper_spec.rb +175 -0
- data/spec/json_extractor_spec.rb +146 -0
- data/spec/loggable_spec.rb +25 -0
- data/spec/scraper_base_spec.rb +178 -0
- data/spec/utils_spec.rb +44 -0
- metadata +140 -0
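For orientation before the diffs: the spec files in this release exercise a small chainable scraping DSL. Below is a minimal sketch assembled from calls that appear in scraper_base_spec.rb further down; the URL, selector, and printed field are illustrative placeholders, not taken from the package itself.

# Hypothetical usage pieced together from the specs below.
ScraperBase.new("http://example.com/files").
  loop_on("ul li.file a").          # iterate over the matched nodes
  extract(:url, :href).             # pull the href attribute into :url
  extract(:filename).               # a field with no explicit source, as in the specs
  set_hook(:data, proc { |records| records.each { |r| puts r.filename } }).
  run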
data/spec/fixtures/doc.json
ADDED
@@ -0,0 +1 @@
+{"completed_in":0.241,"max_id":149792397398786048,"max_id_str":"149792397398786048","next_page":"?page=2&max_id=149792397398786048&q=lolcatz","page":1,"query":"lolcatz","refresh_url":"?since_id=149792397398786048&q=lolcatz","results":[{"created_at":"Thu, 22 Dec 2011 10:04:25 +0000","from_user":"ludovickohn","from_user_id":389760832,"from_user_id_str":"389760832","from_user_name":"Ludovic kohn","geo":null,"id":149792397398786048,"id_str":"149792397398786048","iso_language_code":"fr","metadata":{"result_type":"recent"},"profile_image_url":"http://a2.twimg.com/profile_images/1598373174/image_normal.jpg","profile_image_url_https":"https://si0.twimg.com/profile_images/1598373174/image_normal.jpg","source":"<a href=\"http://www.samsungmobile.com\" rel=\"nofollow\">Samsung Mobile</a>","text":"J'ai ramen\u00E9e mon #rat \u00E0 paris pour les vacances, mon #chat kiffe il a enfin la t\u00E9l\u00E9! #lolcatz http://t.co/DIvnew3o","to_user":null,"to_user_id":null,"to_user_id_str":null,"to_user_name":null},{"created_at":"Thu, 22 Dec 2011 07:43:25 +0000","from_user":"proud_stang5","from_user_id":138298648,"from_user_id_str":"138298648","from_user_name":"Hunter Byrnes","geo":null,"id":149756915742806016,"id_str":"149756915742806016","iso_language_code":"de","metadata":{"result_type":"recent"},"profile_image_url":"http://a2.twimg.com/profile_images/1629863431/image_normal.jpg","profile_image_url_https":"https://si0.twimg.com/profile_images/1629863431/image_normal.jpg","source":"<a href=\"http://twitter.com/#!/download/iphone\" rel=\"nofollow\">Twitter for iPhone</a>","text":"@kenferrera88 #lolcatz girl","to_user":"kenferrera88","to_user_id":330123906,"to_user_id_str":"330123906","to_user_name":"McKenna Ferrrera","in_reply_to_status_id":149756483284893696,"in_reply_to_status_id_str":"149756483284893696"},{"created_at":"Thu, 22 Dec 2011 06:01:27 +0000","from_user":"CatherineSwa","from_user_id":28416785,"from_user_id_str":"28416785","from_user_name":"Catherine Swantner","geo":null,"id":149731252528889856,"id_str":"149731252528889856","iso_language_code":"en","metadata":{"result_type":"recent"},"profile_image_url":"http://a3.twimg.com/profile_images/1462050284/5920354189_e564722964_b_normal.jpg","profile_image_url_https":"https://si0.twimg.com/profile_images/1462050284/5920354189_e564722964_b_normal.jpg","source":"<a href=\"http://twitter.com/\">web</a>","text":"@nicole1brown lolcatz meow","to_user":"nicole1brown","to_user_id":328187799,"to_user_id_str":"328187799","to_user_name":"Nicole Brown","in_reply_to_status_id":149730450024308736,"in_reply_to_status_id_str":"149730450024308736"},{"created_at":"Thu, 22 Dec 2011 04:34:51 +0000","from_user":"DanielleKim3","from_user_id":442392597,"from_user_id_str":"442392597","from_user_name":"Danielle Kim","geo":null,"id":149709461068578816,"id_str":"149709461068578816","iso_language_code":"it","metadata":{"result_type":"recent"},"profile_image_url":"http://a0.twimg.com/profile_images/1705391173/319281_10150395530514216_619154215_8228458_1986790910_n_normal.jpg","profile_image_url_https":"https://si0.twimg.com/profile_images/1705391173/319281_10150395530514216_619154215_8228458_1986790910_n_normal.jpg","source":"<a href=\"http://twitter.com/#!/download/iphone\" rel=\"nofollow\">Twitter for iPhone</a>","text":"RT @LizzieMcCaffrey: LOLCATZ. #Iamscum http://t.co/EYU6xS4f","to_user":null,"to_user_id":null,"to_user_id_str":null,"to_user_name":null},{"created_at":"Thu, 22 Dec 2011 03:48:42 +0000","from_user":"megg_xxo","from_user_id":360964239,"from_user_id_str":"360964239","from_user_name":"Megan Brown","geo":null,"id":149697846000619520,"id_str":"149697846000619520","iso_language_code":"en","metadata":{"result_type":"recent"},"profile_image_url":"http://a2.twimg.com/profile_images/1610806404/image_normal.jpg","profile_image_url_https":"https://si0.twimg.com/profile_images/1610806404/image_normal.jpg","source":"<a href=\"http://twitter.com/#!/download/iphone\" rel=\"nofollow\">Twitter for iPhone</a>","text":"RT @sydney_berger96: rocking out to karaoke with @megg_xxo oh my god why are we so good #lolcatz","to_user":null,"to_user_id":null,"to_user_id_str":null,"to_user_name":null},{"created_at":"Thu, 22 Dec 2011 03:36:46 +0000","from_user":"TeighanDavies","from_user_id":382347005,"from_user_id_str":"382347005","from_user_name":"Teighan Davies","geo":null,"id":149694844325990400,"id_str":"149694844325990400","iso_language_code":"en","metadata":{"result_type":"recent"},"profile_image_url":"http://a2.twimg.com/profile_images/1613021110/Picture0298_normal.jpg","profile_image_url_https":"https://si0.twimg.com/profile_images/1613021110/Picture0298_normal.jpg","source":"<a href=\"http://twitter.com/#!/download/iphone\" rel=\"nofollow\">Twitter for iPhone</a>","text":"@stephmcduff Lolcatz no #butilove #DALLASGREEN","to_user":"stephmcduff","to_user_id":233011266,"to_user_id_str":"233011266","to_user_name":"Stephanie McDuff","in_reply_to_status_id":149694211095142401,"in_reply_to_status_id_str":"149694211095142401"},{"created_at":"Thu, 22 Dec 2011 03:15:32 +0000","from_user":"wolfdawg69","from_user_id":376549124,"from_user_id_str":"376549124","from_user_name":"Delilahhhhh","geo":null,"id":149689499847364608,"id_str":"149689499847364608","iso_language_code":"it","metadata":{"result_type":"recent"},"profile_image_url":"http://a3.twimg.com/profile_images/1551014969/bubbababby_normal.jpg","profile_image_url_https":"https://si0.twimg.com/profile_images/1551014969/bubbababby_normal.jpg","source":"<a href=\"http://twitter.com/devices\" rel=\"nofollow\">txt</a>","text":"LOLCATZ","to_user":null,"to_user_id":null,"to_user_id_str":null,"to_user_name":null},{"created_at":"Thu, 22 Dec 2011 02:52:38 +0000","from_user":"PagingDrDre","from_user_id":335938237,"from_user_id_str":"335938237","from_user_name":"Andrea Ratzlaff","geo":null,"id":149683737448284162,"id_str":"149683737448284162","iso_language_code":"en","metadata":{"result_type":"recent"},"profile_image_url":"http://a2.twimg.com/profile_images/1444343446/image_normal.jpg","profile_image_url_https":"https://si0.twimg.com/profile_images/1444343446/image_normal.jpg","source":"<a href=\"http://twitter.com/#!/download/iphone\" rel=\"nofollow\">Twitter for iPhone</a>","text":"LAWL lolcatz @t_chapstick14","to_user":null,"to_user_id":null,"to_user_id_str":null,"to_user_name":null},{"created_at":"Thu, 22 Dec 2011 01:48:22 +0000","from_user":"BigBoiRono","from_user_id":341301684,"from_user_id_str":"341301684","from_user_name":"Donovan Huyck","geo":null,"id":149667564874768385,"id_str":"149667564874768385","iso_language_code":"en","metadata":{"result_type":"recent"},"profile_image_url":"http://a3.twimg.com/profile_images/1457795278/synchro_meet_normal.jpg","profile_image_url_https":"https://si0.twimg.com/profile_images/1457795278/synchro_meet_normal.jpg","source":"<a href=\"http://twitter.com/devices\" rel=\"nofollow\">txt</a>","text":"Bai hai wut awks o SALT lolcatz #fuckyou","to_user":null,"to_user_id":null,"to_user_id_str":null,"to_user_name":null},{"created_at":"Thu, 22 Dec 2011 00:11:38 +0000","from_user":"StephanieZOO","from_user_id":379177126,"from_user_id_str":"379177126","from_user_name":"Stephanie Zamora","geo":null,"id":149643220807192576,"id_str":"149643220807192576","iso_language_code":"en","metadata":{"result_type":"recent"},"profile_image_url":"http://a1.twimg.com/profile_images/1557641924/035_06__s_normal.jpg","profile_image_url_https":"https://si0.twimg.com/profile_images/1557641924/035_06__s_normal.jpg","source":"<a href=\"http://dlvr.it\" rel=\"nofollow\">dlvr.it</a>","text":"LolCatz Navy Chooses Two Women To Share Traditional First Kiss","to_user":null,"to_user_id":null,"to_user_id_str":null,"to_user_name":null},{"created_at":"Thu, 22 Dec 2011 00:11:02 +0000","from_user":"MissTawdry","from_user_id":284260713,"from_user_id_str":"284260713","from_user_name":"MissTawdry","geo":null,"id":149643068751093760,"id_str":"149643068751093760","iso_language_code":"en","metadata":{"result_type":"recent"},"profile_image_url":"http://a0.twimg.com/profile_images/1431052668/bod_normal.jpg","profile_image_url_https":"https://si0.twimg.com/profile_images/1431052668/bod_normal.jpg","source":"<a href=\"http://twitterfeed.com\" rel=\"nofollow\">twitterfeed</a>","text":"#LolCatz Navy Chooses Two Women To Share Traditional First Kiss http://t.co/7VqFgQl0 @misslindadee","to_user":null,"to_user_id":null,"to_user_id_str":null,"to_user_name":null},{"created_at":"Wed, 21 Dec 2011 23:51:17 +0000","from_user":"LIVInmylife6","from_user_id":266053733,"from_user_id_str":"266053733","from_user_name":"Olivia Lake$","geo":null,"id":149638097905258497,"id_str":"149638097905258497","iso_language_code":"en","metadata":{"result_type":"recent"},"profile_image_url":"http://a1.twimg.com/profile_images/1703355659/IMG00351-20110523-2253_normal.jpg","profile_image_url_https":"https://si0.twimg.com/profile_images/1703355659/IMG00351-20110523-2253_normal.jpg","source":"<a href=\"http://blackberry.com/twitter\" rel=\"nofollow\">Twitter for BlackBerry\u00AE</a>","text":"@CelioAraujo10 lolcatz I guesss its ok...ps u should fight ross LOL","to_user":"CelioAraujo10","to_user_id":379857020,"to_user_id_str":"379857020","to_user_name":"Celio Araujo","in_reply_to_status_id":149637652952518657,"in_reply_to_status_id_str":"149637652952518657"},{"created_at":"Wed, 21 Dec 2011 19:05:48 +0000","from_user":"rl1971","from_user_id":390991730,"from_user_id_str":"390991730","from_user_name":"RL","geo":null,"id":149566253621723136,"id_str":"149566253621723136","iso_language_code":"en","metadata":{"result_type":"recent"},"profile_image_url":"http://a3.twimg.com/profile_images/1588515141/twittericon_normal.jpg","profile_image_url_https":"https://si0.twimg.com/profile_images/1588515141/twittericon_normal.jpg","source":"<a href=\"http://www.tweetdeck.com\" rel=\"nofollow\">TweetDeck</a>","text":"@w1zz http://t.co/4aIMUK1E #lolcatz #liev Met credits naar @gnesmu :)","to_user":"w1zz","to_user_id":194056505,"to_user_id_str":"194056505","to_user_name":"WiZZarD","in_reply_to_status_id":149503807187984385,"in_reply_to_status_id_str":"149503807187984385"},{"created_at":"Wed, 21 Dec 2011 18:51:52 +0000","from_user":"xTalieee1171x","from_user_id":66912646,"from_user_id_str":"66912646","from_user_name":"TC McCullough","geo":null,"id":149562749293297666,"id_str":"149562749293297666","iso_language_code":"en","metadata":{"result_type":"recent"},"profile_image_url":"http://a2.twimg.com/profile_images/1631481911/DSCN9149_normal.JPG","profile_image_url_https":"https://si0.twimg.com/profile_images/1631481911/DSCN9149_normal.JPG","source":"<a href=\"http://red-badger.com\" rel=\"nofollow\">Birdsong for Windows Phone</a>","text":"Cuuuuuute! RT @Skyrawr All of my animals chillin out! Lolcatz http://t.co/inRSK1VM","to_user":null,"to_user_id":null,"to_user_id_str":null,"to_user_name":null},{"created_at":"Wed, 21 Dec 2011 18:12:26 +0000","from_user":"gentin123","from_user_id":279705653,"from_user_id_str":"279705653","from_user_name":"morgan booker","geo":null,"id":149552824877973504,"id_str":"149552824877973504","iso_language_code":"it","metadata":{"result_type":"recent"},"profile_image_url":"http://a1.twimg.com/profile_images/1664758777/39216_421872978366_729553366_5014440_2895898_n_normal.jpg","profile_image_url_https":"https://si0.twimg.com/profile_images/1664758777/39216_421872978366_729553366_5014440_2895898_n_normal.jpg","source":"<a href=\"http://blackberry.com/twitter\" rel=\"nofollow\">Twitter for BlackBerry\u00AE</a>","text":"@mollyjayne_x everiiwon luffz LOLcatz http://t.co/UaGzxNtV","to_user":"mollyjayne_x","to_user_id":408582032,"to_user_id_str":"408582032","to_user_name":"molly","in_reply_to_status_id":149549826068066305,"in_reply_to_status_id_str":"149549826068066305"}],"results_per_page":15,"since_id":0,"since_id_str":"0"}
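The fixture above is a canned Twitter search API response; json_extractor_spec.rb further down reads it via Yajl. A minimal sketch of that access pattern, with the values taken from the spec expectations:

require 'yajl'

json = File.read('spec/fixtures/doc.json')   # fixture added in this release
data = Yajl::Parser.new.parse(json)          # same parser the specs use
data['results'].size                         # => 15
data['results'].first['from_user']           # => "ludovickohn"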
data/spec/helpers/scraper_helper.rb
ADDED
@@ -0,0 +1,46 @@
+module Helpers
+  module Scrapers
+
+    # Public:
+    #
+    # Stubs a HTTP request/response pair.
+    #
+    # request_args  - Hash of options to be passed to Typhoeus::Request.
+    # response_args - Hash of options to be passed to Typhoeus::Response (and to Hydra#stub).
+    #
+    # Returns nothing.
+    #
+
+    def stub_http(request_args={}, response_args={})
+
+      response_args = {
+        :code => 200,
+        :headers => "Content-Type: text/html",
+        :body => "response stub"
+      }.merge(response_args)
+
+      request_args = {
+        :method => :get,
+        :url => anything,
+        :options => anything
+      }.merge(request_args)
+
+      @hydra ||= Typhoeus::Hydra.new
+      stub(Typhoeus::Hydra).new { @hydra }
+      response = Typhoeus::Response.new(response_args)
+
+      stub.proxy(Typhoeus::Request).new(request_args[:url], request_args[:options]) do |request|
+        #
+        # this allows stubbing several requests by handing control to a block
+        #
+
+        if block_given?
+          yield(@hydra, request, response)
+        else
+          @hydra.stub(request_args[:method], request_args[:url]).and_return(response)
+        end
+        request
+      end
+    end
+  end
+end
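The spec files below drive this helper in two styles; a brief illustration of both, mirroring calls that appear verbatim in the specs:

# Simple form: every GET request is answered with a canned response body.
stub_http({}, :body => "<html><body>stub</body></html>")

# Block form: the block receives the shared Hydra instance, the stubbed
# request, and the canned response, and wires up per-URL stubs itself.
stub_http do |hydra, request, response|
  hydra.stub(:get, request.url).and_return(response)
end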
data/spec/iterative_scraper_spec.rb
ADDED
@@ -0,0 +1,175 @@
+require 'helpers/spec_helper'
+include Helpers::Scrapers
+
+describe IterativeScraper do
+  before(:each) do
+    @fixture_doc ||= proc {
+      file = File.open("fixtures/doc.html", "r")
+      file_content = file.read
+      file.close
+      file_content
+    }.call
+  end
+
+  describe "#initialize" do
+    subject { IterativeScraper.new("http://whatever.net/search") }
+
+    it { subject.should be_a(ScraperBase) }
+    it { subject.should respond_to(:log) }
+  end
+
+  describe "#set_iteration" do
+    before do
+      @scraper = IterativeScraper.new("http://whatever.net/search")
+    end
+
+    it "should allow passing a range and return itself" do
+      @scraper.set_iteration((0..10).step(5)).should be_an_instance_of(IterativeScraper)
+    end
+    it "should allow passing an array and return itself" do
+      @scraper.set_iteration([1, 2, 3, 4]).should be_an_instance_of(IterativeScraper)
+    end
+    it "should allow passing a string and a proc and return itself" do
+      @scraper.set_iteration("#pagination a", proc {}).should be_an_instance_of(IterativeScraper)
+    end
+  end
+
+  describe "#continue_with" do
+    before do
+      @scraper = IterativeScraper.new("http://whatever.net/search")
+    end
+
+    subject { @scraper.continue_with(proc { |result| result['continue'] }) }
+    it { should be_an_instance_of(IterativeScraper) }
+  end
+
+
+  context "(single url pattern, iteration_set is range, async => false)" do
+    before(:each) do
+      @scraper = IterativeScraper.new("http://whatever.net/search")
+      mock(@scraper).run_super(:run).times(10) {}
+      @scraper.set_iteration(:p, (1..10))
+    end
+
+    describe "#run" do
+      it "super#run should be called 10 times" do
+        @scraper.run
+      end
+    end
+  end
+
+  context "(single url pattern, iteration_set is extractor, async => false)" do
+    before(:each) do
+
+      iteration_count = 0
+      @params_sent = []
+      iteration_proc = proc { [2, 3, 4] }
+
+      any_instance_of(ExtractionLoop) do |eloop|
+        stub(eloop).run {}
+      end
+
+      stub_http do |hydra, request, response|
+        hydra.stub(:get, /http:\/\/whatever\.net\/search/).and_return(response)
+        @params_sent << request.params[:p]
+      end
+
+      @scraper = IterativeScraper.
+        new("http://whatever.net/search-stuff").
+        set_iteration(:p, iteration_proc).
+        loop_on(".whatever").
+        set_hook(:data, proc { iteration_count += 1 }).
+        run()
+
+      @iteration_count = iteration_count
+    end
+
+    describe "#run" do
+      it "The :data hook should be called 4 times" do
+        @iteration_count.should eql(4)
+      end
+
+      it "should have sent p=1, p=2, p=3, p=4 as request parameters" do
+        @params_sent.should eql(["1", "2", "3", "4"])
+      end
+    end
+  end
+
+  context "(single url pattern, iteration_set is range, async => true)" do
+
+    before do
+      @params_sent = []
+      any_instance_of(ExtractionLoop) do |eloop|
+        stub(eloop).run {}
+      end
+
+      stub_http do |hydra, request, response|
+        hydra.stub(:get, request.url).and_return(response)
+        @params_sent << request.params[:p]
+      end
+
+      @scraper = IterativeScraper.
+        new("http://whatever.net/search", {:async => true}, {:params => {:format => "json"}}).
+        set_iteration(:p, (0..20).step(5)).
+        loop_on(".whatever").
+        run()
+    end
+
+
+    describe "#run" do
+      it "params sent should be p=0, p=5, p=10, p=15, p=20" do
+        @params_sent.should eql([0, 5, 10, 15, 20].map(&:to_s))
+      end
+    end
+  end
+
+  context "using #continue_with" do
+
+    describe "#run" do
+      before do
+        continue_values = (5..10).to_a
+        @values_sent = []
+        shift_values = proc { |data| continue_values.shift }
+
+
+        stub_http({}, {:headers => "Content-Type: application/json", :body => '{"hello":"test"}'}) do |hydra, request, response|
+          @values_sent << request.params[:continue]
+          hydra.stub(:get, request.url).and_return(response)
+        end
+
+        IterativeScraper.
+          new("http://twizzer.net/timeline").
+          loop_on(proc {}).
+          continue_with(:continue, shift_values).
+          run()
+
+        @continue_values = continue_values
+
+      end
+
+      # TODO:
+      #
+      # When #continue_with is used, it would be better to avoid sending
+      # an empty iteration_parameter
+      #
+      it "Should run 5 times", :failing => 'true' do
+        @continue_values.all? { |val| @values_sent.include? val.to_s }.should be_true
+      end
+
+    end
+  end
+
+  context "using #continue_with with async = true" do
+    describe "#run" do
+      before do
+        @scraper = IterativeScraper.
+          new("http://twizzer.net/timeline", :async => true)
+      end
+
+      it "should raise an exception" do
+        expect { @scraper.continue_with(:continue, proc {}) }.to raise_exception(IterativeScraper::Exceptions::NonGetAsyncRequestNotYetImplemented)
+      end
+    end
+  end
+
+end
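Taken together, the spec above covers two pagination styles; a compact sketch of each (the URLs, selectors, and the 'next_cursor' key are illustrative placeholders, not from the package):

# Offset pagination: iterate the :p query parameter over a range.
IterativeScraper.new("http://example.com/search").
  set_iteration(:p, (1..10)).
  loop_on(".result").
  run

# Cursor pagination: compute the next :continue value from each response.
IterativeScraper.new("http://example.com/timeline").
  loop_on(".item").
  continue_with(:continue, proc { |data| data['next_cursor'] }).
  run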
data/spec/json_extractor_spec.rb
ADDED
@@ -0,0 +1,146 @@
+require 'helpers/spec_helper'
+
+describe JsonExtractor do
+  before(:each) do
+    stub(scraper = Object.new).options
+    stub(scraper).results
+    @env = ExtractionEnvironment.new(scraper)
+    @json ||= lambda {
+      file = File.open('fixtures/doc.json', 'r')
+      content = file.read
+      file.close
+      content
+    }.call()
+  end
+
+  describe "#initialize" do
+    context("argument is a block") do
+      subject { JsonExtractor.new(:thing, @env, proc {}) }
+
+      it { subject.field_name.should eql(:thing) }
+      it { should_not respond_to(:callback) }
+    end
+  end
+
+
+  describe "#extract_field" do
+
+    context "field_name only" do
+      before do
+        @extractor = JsonExtractor.new(:from_user, @env)
+        @node = @extractor.parse(@json)['results'].first
+      end
+
+      subject { @extractor.extract_field(@node) }
+      it { should eql("ludovickohn") }
+    end
+
+    context "field_name and callback" do
+      before do
+        @extractor = JsonExtractor.new(:from_user, @env, proc { |node| node['from_user_name'] })
+        @node = @extractor.parse(@json)['results'].first
+      end
+
+      subject { @extractor.extract_field(@node) }
+      it { should eql("Ludovic kohn") }
+    end
+
+    context "field_name and attribute" do
+      before do
+        @extractor = JsonExtractor.new(:from_user, @env, :from_user_name)
+        @node = @extractor.parse(@json)['results'].first
+      end
+
+      subject { @extractor.extract_field(@node) }
+      it { should eql("Ludovic kohn") }
+    end
+
+    context "field name, attribute, and callback" do
+      before do
+        @extractor = JsonExtractor.new(:from_user, @env, :from_user_name, proc { |username| username.downcase.gsub("\s", "-") })
+        @node = @extractor.parse(@json)['results'].first
+      end
+
+      subject { @extractor.extract_field(@node) }
+      it { should eql("ludovic-kohn") }
+    end
+
+    context("field name and array (see Utils::DeepFetchable)") do
+      before do
+        @extractor = JsonExtractor.new(:from_user, @env, ['results', 0, 'from_user'])
+      end
+      subject { @extractor.extract_field(@json) }
+      it { should eql("ludovickohn") }
+    end
+
+    context("field name, array, and callback") do
+      before do
+        @extractor = JsonExtractor.new(:from_user, @env, ['results', 0, 'from_user'], proc { |username| username.gsub("ckohn", 'co') })
+      end
+      subject { @extractor.extract_field(@json) }
+      it { should eql("ludovico") }
+    end
+
+  end
+
+  describe "#extract_list" do
+
+
+    context "using #get_in" do
+      before do
+        @extractor = JsonExtractor.new(nil, @env, ['results', 0..5])
+      end
+
+      subject { @extractor.extract_list(@json) }
+
+      it { subject.size.should eql(6) }
+    end
+
+    context "with json string input" do
+      before do
+        @extractor = JsonExtractor.new(nil, @env, proc { |data| data['results'] })
+      end
+
+      subject { @extractor.extract_list(@json) }
+      it { subject.size.should eql(15) }
+      it { should be_an_instance_of(Array) }
+    end
+
+    context "with pre-parsed input" do
+      before do
+        @extractor = JsonExtractor.new(nil, @env, proc { |data| data['results'] })
+      end
+
+      subject { @extractor.extract_list(Yajl::Parser.new.parse(@json)) }
+      it { subject.size.should eql(15) }
+      it { should be_an_instance_of(Array) }
+    end
+
+  end
+
+  context "non-string input" do
+    describe "#parse" do
+      before do
+        @extractor = JsonExtractor.new(nil, @env, proc {})
+      end
+
+      it "Should raise an exception" do
+        expect { @extractor.parse(Nokogiri::HTML(@html)) }.to raise_exception(ExtractorBase::Exceptions::ExtractorParseError)
+      end
+    end
+  end
+
+  context "json input" do
+    describe "#parse" do
+      before do
+        @extractor = JsonExtractor.new(nil, @env, proc {})
+      end
+
+      subject { @extractor.parse(@json) }
+
+      it { should respond_to(:get_in) }
+      it { should be_an_instance_of(Hash) }
+      it { should_not be_empty }
+    end
+  end
+end
data/spec/loggable_spec.rb
ADDED
@@ -0,0 +1,25 @@
+require 'helpers/spec_helper'
+
+class LoggableClass
+  attr_reader :super_called
+  include Loggable
+
+  def initialize
+    @super_called = true
+  end
+end
+
+describe Loggable do
+  describe "#initialize" do
+
+    subject { LoggableClass.new }
+
+    it "should execute the class' #initialize method" do
+      subject.super_called.should eql(true)
+    end
+
+    it "should respond to the #log method" do
+      subject.respond_to?(:log).should be_true
+    end
+  end
+end
data/spec/scraper_base_spec.rb
ADDED
@@ -0,0 +1,178 @@
+require 'helpers/spec_helper'
+include Helpers::Scrapers
+
+describe ScraperBase do
+  before do
+    @fixture_doc = File.open("fixtures/doc.html", 'r') do |file|
+      file.read
+    end
+  end
+
+  before(:each) do
+    @scraper = ScraperBase.new("http://localhost/fixture")
+  end
+
+
+  describe "#loop_on" do
+    subject { @scraper.loop_on("bla.bla") }
+    it { should be_an_instance_of(ScraperBase) }
+  end
+
+  describe "#extract" do
+    subject { @scraper.extract("fieldname", "bla.bla") }
+    it { should be_an_instance_of(ScraperBase) }
+  end
+
+  describe "#set_hook" do
+    subject { @scraper.set_hook(:after, proc {}) }
+    it { should be_an_instance_of(ScraperBase) }
+  end
+
+  describe "#set_hook" do
+    it "should raise exception if no proc is provided" do
+      expect { @scraper.set_hook(:after, :method) }.to raise_exception(ScraperBase::Exceptions::HookArgumentError)
+    end
+  end
+
+  context "request params in both the url and the arguments hash" do
+    describe "#run" do
+      before do
+
+        @request_args = {}
+        url = "http://localhost/whatever?q=stuff&p=1&limit=100"
+        stub_http do |hydra, request, response|
+          @request_args = request.params
+          hydra.stub(:get, request.url).and_return(response)
+        end
+
+        any_instance_of(ExtractionLoop) do |extraloop|
+          stub(extraloop).run {}
+        end
+
+        @scraper = ScraperBase.new(url, {}, {
+          :params => { :limit => 250 }
+        }).loop_on(".stuff").run
+      end
+
+      it "should merge URL and request parameters" do
+        @request_args[:p].to_s.should eql("1")
+        @request_args[:q].to_s.should eql("stuff")
+        @request_args[:limit].to_s.should eql("250")
+      end
+    end
+  end
+
+  context "single url, no options provided (async => false)" do
+    describe "#run" do
+      before do
+        @url = "http://localhost/fixture"
+        results = []
+
+        stub_http({}, :body => @fixture_doc) do |hydra, request, response|
+          hydra.stub(:get, request.url).and_return(response)
+        end
+
+        @scraper = ScraperBase.new(@url).
+          loop_on("ul li.file a").
+          extract(:url, :href).
+          extract(:filename).
+          set_hook(:data, proc { |records| records.each { |record| results << record } })
+
+        @results = results
+      end
+
+
+      it "Should handle response" do
+        @scraper.run
+        @results.should_not be_empty
+        @results.all? { |record| record.extracted_at && record.url && record.filename }.should be_true
+      end
+    end
+  end
+
+  context "multiple urls (async => false)" do
+    describe "#run" do
+      before do
+        @urls = [
+          "http://localhost/fixture1",
+          "http://localhost/fixture2",
+          "http://localhost/fixture3",
+        ]
+        results = []
+        @hydra_run_call_count = 0
+
+        stub_http do |hydra, request, response|
+          @urls.each { |url| hydra.stub(:get, url).and_return(response) }
+          stub.proxy(hydra).run { @hydra_run_call_count += 1 }
+        end
+
+        @scraper = ScraperBase.new(@urls, :log => false).
+          loop_on("ul li.file a").
+          extract(:url, :href).
+          extract(:filename).
+          set_hook(:data, proc { |records| records.each { |record| results << record } })
+
+        @results = results
+
+        @fake_loop = Object.new
+        stub(@fake_loop).run { }
+        stub(@fake_loop).environment { ExtractionEnvironment.new }
+        stub(@fake_loop).records { Array(1..3).map { |n| Object.new } }
+
+        mock(ExtractionLoop).new(is_a(DomExtractor), is_a(Array), is_a(String), is_a(Hash), is_a(ScraperBase)).times(3) { @fake_loop }
+      end
+
+
+      it "Should handle response" do
+        @scraper.run
+        @results.size.should eql(9)
+        @hydra_run_call_count.should eql(@urls.size)
+      end
+    end
+  end
+
+
+  context "multiple urls (async => true)" do
+    describe "#run" do
+      before do
+        @urls = [
+          "http://localhost/fixture1",
+          "http://localhost/fixture2",
+          "http://localhost/fixture3",
+          "http://localhost/fixture4",
+          "http://localhost/fixture5",
+        ]
+        results = []
+        @hydra_run_call_count = 0
+
+        stub_http({}, :body => @fixture_doc) do |hydra, request, response|
+          @urls.each { |url| hydra.stub(:get, url).and_return(response) }
+          stub.proxy(hydra).run { @hydra_run_call_count += 1 }
+        end
+
+        @scraper = ScraperBase.new(@urls, :async => true).
+          loop_on("ul li.file a").
+          extract(:url, :href).
+          extract(:filename).
+          set_hook(:data, proc { |records| records.each { |record| results << record } })
+
+        @results = results
+
+        @fake_loop = Object.new
+        stub(@fake_loop).run { }
+        stub(@fake_loop).environment { ExtractionEnvironment.new }
+        stub(@fake_loop).records { Array(1..3).map { |n| Object.new } }
+
+        mock(ExtractionLoop).new(is_a(DomExtractor), is_a(Array), is_a(String), is_a(Hash), is_a(ScraperBase)).times(@urls.size) { @fake_loop }
+      end
+
+
+      it "Should handle response" do
+        @scraper.run
+        @results.size.should eql(@urls.size * 3)
+        @hydra_run_call_count.should eql(1)
+      end
+    end
+  end
+
+end
data/spec/utils_spec.rb
ADDED
@@ -0,0 +1,44 @@
+require 'helpers/spec_helper'
+
+describe Utils do
+  describe "AugmentedHash" do
+    describe "#get_in" do
+      context "extending a hash object" do
+        before do
+          @hash = {
+            :a => {
+              :b => {
+                :c => [
+                  :x, :y, :z
+                ]
+              }
+            }
+          }.extend(Utils::DeepFetchable)
+        end
+
+        subject { @hash.get_in [:a, :b, :c, 2] }
+
+        it { should eql(:z) }
+
+
+        context "trying to fetch a key that does not exist" do
+          subject { @hash.get_in [:a, :b, :wrong, :even_worst] }
+
+          it { should eql(nil) }
+        end
+
+      end
+
+      context "extending an Array object" do
+        before do
+          @array = [1, 2, 3, 4, [5.1, 5.2, 5.3]].extend(Utils::DeepFetchable)
+        end
+
+
+        subject { @array.get_in [4, -1] }
+
+        it { should eql(5.3) }
+      end
+    end
+  end
+end
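For quick reference, the behaviour pinned down above amounts to the following (a hedged summary; the :oops key is an illustrative stand-in for any missing key):

# DeepFetchable#get_in walks nested hashes/arrays along a list of keys/indices.
hash = { :a => { :b => [:x, :y, :z] } }.extend(Utils::DeepFetchable)
hash.get_in([:a, :b, 2])     # => :z
hash.get_in([:a, :oops, 0])  # => nil (missing keys yield nil, not an error)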