doctor_scrape 0.0.5 → 0.0.6
Sign up to get free protection for your applications and to get access to all the features.
- data/doctor_scrape.gemspec +3 -2
- data/lib/doctor_scrape.rb +1 -2
- data/lib/doctor_scrape/search.rb +1 -1
- data/lib/doctor_scrape/version.rb +1 -1
- data/spec/search_spec.rb +2 -2
- metadata +45 -37
- data/lib/doctor_scrape/redirect_follower.rb +0 -29
- data/spec/redirect_follower_spec.rb +0 -37
data/doctor_scrape.gemspec
CHANGED
@@ -15,7 +15,9 @@ Gem::Specification.new do |gem|
|
|
15
15
|
gem.require_paths = ["lib"]
|
16
16
|
gem.version = DoctorScrape::VERSION
|
17
17
|
|
18
|
-
gem.add_dependency "mechanize", ["~> 2.3"]
|
18
|
+
# gem.add_dependency "mechanize", ["~> 2.3"]
|
19
|
+
gem.add_dependency "nokogiri", ["~> 1.5.0"]
|
20
|
+
gem.add_dependency "unwind", ["~> 0.9.6"]
|
19
21
|
# gem.add_dependency "text", ["~> 1.0.3"]
|
20
22
|
|
21
23
|
gem.add_development_dependency "rspec", ["~> 2.8.0"]
|
@@ -31,5 +33,4 @@ Gem::Specification.new do |gem|
|
|
31
33
|
gem.add_development_dependency "hirb", ["~> 0.6.0"]
|
32
34
|
gem.add_development_dependency "awesome_print", ["~> 1.0.2"]
|
33
35
|
gem.add_development_dependency "simplecov", ["~> 0.6.0"]
|
34
|
-
# gem.add_development_dependency "psych", ["~> 1.2.2"]
|
35
36
|
end
|
data/lib/doctor_scrape.rb
CHANGED
@@ -1,14 +1,13 @@
|
|
1
1
|
# encoding: utf-8
|
2
2
|
require 'doctor_scrape/version'
|
3
|
-
require 'mechanize'
|
4
3
|
require 'nokogiri'
|
5
4
|
require 'ostruct'
|
6
5
|
require 'open-uri'
|
6
|
+
require 'unwind'
|
7
7
|
|
8
8
|
module DoctorScrape
|
9
9
|
autoload :Data, 'doctor_scrape/data'
|
10
10
|
autoload :Search, 'doctor_scrape/search'
|
11
|
-
autoload :RedirectFollower, 'doctor_scrape/redirect_follower'
|
12
11
|
|
13
12
|
module Scraper
|
14
13
|
autoload :Base, 'doctor_scrape/scraper/base'
|
data/lib/doctor_scrape/search.rb
CHANGED
data/spec/search_spec.rb
CHANGED
@@ -89,9 +89,9 @@ describe DoctorScrape::Search do
|
|
89
89
|
let(:resolved) { ["http://bitly.com", "http://example.com", "http://googl.com"] }
|
90
90
|
|
91
91
|
it "returns array with resolved urls" do
|
92
|
-
resolvers = resolved.map { |r| double "resolver", :
|
92
|
+
resolvers = resolved.map { |r| double "resolver", :final_url => r }
|
93
93
|
urls.each_with_index do |url, i|
|
94
|
-
|
94
|
+
Unwind::RedirectFollower.should_receive(:resolve).with(url) { resolvers[i] }
|
95
95
|
end
|
96
96
|
|
97
97
|
DoctorScrape::Search.resolve_urls(urls).should == resolved
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: doctor_scrape
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.5
|
4
|
+
version: 0.0.6
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,22 +9,33 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2012-
|
12
|
+
date: 2012-03-01 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
|
-
name:
|
16
|
-
requirement: &
|
15
|
+
name: nokogiri
|
16
|
+
requirement: &70186149359280 !ruby/object:Gem::Requirement
|
17
17
|
none: false
|
18
18
|
requirements:
|
19
19
|
- - ~>
|
20
20
|
- !ruby/object:Gem::Version
|
21
|
-
version:
|
21
|
+
version: 1.5.0
|
22
22
|
type: :runtime
|
23
23
|
prerelease: false
|
24
|
-
version_requirements: *
|
24
|
+
version_requirements: *70186149359280
|
25
|
+
- !ruby/object:Gem::Dependency
|
26
|
+
name: unwind
|
27
|
+
requirement: &70186149358760 !ruby/object:Gem::Requirement
|
28
|
+
none: false
|
29
|
+
requirements:
|
30
|
+
- - ~>
|
31
|
+
- !ruby/object:Gem::Version
|
32
|
+
version: 0.9.6
|
33
|
+
type: :runtime
|
34
|
+
prerelease: false
|
35
|
+
version_requirements: *70186149358760
|
25
36
|
- !ruby/object:Gem::Dependency
|
26
37
|
name: rspec
|
27
|
-
requirement: &
|
38
|
+
requirement: &70186149358280 !ruby/object:Gem::Requirement
|
28
39
|
none: false
|
29
40
|
requirements:
|
30
41
|
- - ~>
|
@@ -32,10 +43,10 @@ dependencies:
|
|
32
43
|
version: 2.8.0
|
33
44
|
type: :development
|
34
45
|
prerelease: false
|
35
|
-
version_requirements: *
|
46
|
+
version_requirements: *70186149358280
|
36
47
|
- !ruby/object:Gem::Dependency
|
37
48
|
name: vcr
|
38
|
-
requirement: &
|
49
|
+
requirement: &70186149357800 !ruby/object:Gem::Requirement
|
39
50
|
none: false
|
40
51
|
requirements:
|
41
52
|
- - ~>
|
@@ -43,10 +54,10 @@ dependencies:
|
|
43
54
|
version: 2.0.0.rc1
|
44
55
|
type: :development
|
45
56
|
prerelease: false
|
46
|
-
version_requirements: *
|
57
|
+
version_requirements: *70186149357800
|
47
58
|
- !ruby/object:Gem::Dependency
|
48
59
|
name: webmock
|
49
|
-
requirement: &
|
60
|
+
requirement: &70186149357320 !ruby/object:Gem::Requirement
|
50
61
|
none: false
|
51
62
|
requirements:
|
52
63
|
- - <
|
@@ -54,10 +65,10 @@ dependencies:
|
|
54
65
|
version: '1.8'
|
55
66
|
type: :development
|
56
67
|
prerelease: false
|
57
|
-
version_requirements: *
|
68
|
+
version_requirements: *70186149357320
|
58
69
|
- !ruby/object:Gem::Dependency
|
59
70
|
name: guard
|
60
|
-
requirement: &
|
71
|
+
requirement: &70186149356840 !ruby/object:Gem::Requirement
|
61
72
|
none: false
|
62
73
|
requirements:
|
63
74
|
- - ~>
|
@@ -65,10 +76,10 @@ dependencies:
|
|
65
76
|
version: 1.0.0
|
66
77
|
type: :development
|
67
78
|
prerelease: false
|
68
|
-
version_requirements: *
|
79
|
+
version_requirements: *70186149356840
|
69
80
|
- !ruby/object:Gem::Dependency
|
70
81
|
name: guard-rspec
|
71
|
-
requirement: &
|
82
|
+
requirement: &70186149356360 !ruby/object:Gem::Requirement
|
72
83
|
none: false
|
73
84
|
requirements:
|
74
85
|
- - ~>
|
@@ -76,10 +87,10 @@ dependencies:
|
|
76
87
|
version: 0.6.0
|
77
88
|
type: :development
|
78
89
|
prerelease: false
|
79
|
-
version_requirements: *
|
90
|
+
version_requirements: *70186149356360
|
80
91
|
- !ruby/object:Gem::Dependency
|
81
92
|
name: ruby_gntp
|
82
|
-
requirement: &
|
93
|
+
requirement: &70186149355880 !ruby/object:Gem::Requirement
|
83
94
|
none: false
|
84
95
|
requirements:
|
85
96
|
- - ~>
|
@@ -87,10 +98,10 @@ dependencies:
|
|
87
98
|
version: 0.3.4
|
88
99
|
type: :development
|
89
100
|
prerelease: false
|
90
|
-
version_requirements: *
|
101
|
+
version_requirements: *70186149355880
|
91
102
|
- !ruby/object:Gem::Dependency
|
92
103
|
name: rb-fsevent
|
93
|
-
requirement: &
|
104
|
+
requirement: &70186149355400 !ruby/object:Gem::Requirement
|
94
105
|
none: false
|
95
106
|
requirements:
|
96
107
|
- - ~>
|
@@ -98,10 +109,10 @@ dependencies:
|
|
98
109
|
version: 0.9.0
|
99
110
|
type: :development
|
100
111
|
prerelease: false
|
101
|
-
version_requirements: *
|
112
|
+
version_requirements: *70186149355400
|
102
113
|
- !ruby/object:Gem::Dependency
|
103
114
|
name: pry
|
104
|
-
requirement: &
|
115
|
+
requirement: &70186149354920 !ruby/object:Gem::Requirement
|
105
116
|
none: false
|
106
117
|
requirements:
|
107
118
|
- - ~>
|
@@ -109,10 +120,10 @@ dependencies:
|
|
109
120
|
version: 0.9.8.2
|
110
121
|
type: :development
|
111
122
|
prerelease: false
|
112
|
-
version_requirements: *
|
123
|
+
version_requirements: *70186149354920
|
113
124
|
- !ruby/object:Gem::Dependency
|
114
125
|
name: pry-doc
|
115
|
-
requirement: &
|
126
|
+
requirement: &70186149354440 !ruby/object:Gem::Requirement
|
116
127
|
none: false
|
117
128
|
requirements:
|
118
129
|
- - ~>
|
@@ -120,10 +131,10 @@ dependencies:
|
|
120
131
|
version: 0.4.0
|
121
132
|
type: :development
|
122
133
|
prerelease: false
|
123
|
-
version_requirements: *
|
134
|
+
version_requirements: *70186149354440
|
124
135
|
- !ruby/object:Gem::Dependency
|
125
136
|
name: pry-editline
|
126
|
-
requirement: &
|
137
|
+
requirement: &70186149353960 !ruby/object:Gem::Requirement
|
127
138
|
none: false
|
128
139
|
requirements:
|
129
140
|
- - ~>
|
@@ -131,10 +142,10 @@ dependencies:
|
|
131
142
|
version: 1.1.1
|
132
143
|
type: :development
|
133
144
|
prerelease: false
|
134
|
-
version_requirements: *
|
145
|
+
version_requirements: *70186149353960
|
135
146
|
- !ruby/object:Gem::Dependency
|
136
147
|
name: hirb
|
137
|
-
requirement: &
|
148
|
+
requirement: &70186149353480 !ruby/object:Gem::Requirement
|
138
149
|
none: false
|
139
150
|
requirements:
|
140
151
|
- - ~>
|
@@ -142,10 +153,10 @@ dependencies:
|
|
142
153
|
version: 0.6.0
|
143
154
|
type: :development
|
144
155
|
prerelease: false
|
145
|
-
version_requirements: *
|
156
|
+
version_requirements: *70186149353480
|
146
157
|
- !ruby/object:Gem::Dependency
|
147
158
|
name: awesome_print
|
148
|
-
requirement: &
|
159
|
+
requirement: &70186149353000 !ruby/object:Gem::Requirement
|
149
160
|
none: false
|
150
161
|
requirements:
|
151
162
|
- - ~>
|
@@ -153,10 +164,10 @@ dependencies:
|
|
153
164
|
version: 1.0.2
|
154
165
|
type: :development
|
155
166
|
prerelease: false
|
156
|
-
version_requirements: *
|
167
|
+
version_requirements: *70186149353000
|
157
168
|
- !ruby/object:Gem::Dependency
|
158
169
|
name: simplecov
|
159
|
-
requirement: &
|
170
|
+
requirement: &70186149352520 !ruby/object:Gem::Requirement
|
160
171
|
none: false
|
161
172
|
requirements:
|
162
173
|
- - ~>
|
@@ -164,7 +175,7 @@ dependencies:
|
|
164
175
|
version: 0.6.0
|
165
176
|
type: :development
|
166
177
|
prerelease: false
|
167
|
-
version_requirements: *
|
178
|
+
version_requirements: *70186149352520
|
168
179
|
description: Library for scraping norwegian doctoral dissertations
|
169
180
|
email:
|
170
181
|
- gudleik@gmail.com
|
@@ -183,7 +194,6 @@ files:
|
|
183
194
|
- doctor_scrape.gemspec
|
184
195
|
- lib/doctor_scrape.rb
|
185
196
|
- lib/doctor_scrape/data.rb
|
186
|
-
- lib/doctor_scrape/redirect_follower.rb
|
187
197
|
- lib/doctor_scrape/scraper/base.rb
|
188
198
|
- lib/doctor_scrape/scraper/bora.rb
|
189
199
|
- lib/doctor_scrape/scraper/diva.rb
|
@@ -214,7 +224,6 @@ files:
|
|
214
224
|
- spec/parse/diva_spec.rb
|
215
225
|
- spec/parse/duo_spec.rb
|
216
226
|
- spec/parse/munin_spec.rb
|
217
|
-
- spec/redirect_follower_spec.rb
|
218
227
|
- spec/scraper_spec.rb
|
219
228
|
- spec/scrapers/base_spec.rb
|
220
229
|
- spec/scrapers/bora_spec.rb
|
@@ -240,7 +249,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
240
249
|
version: '0'
|
241
250
|
segments:
|
242
251
|
- 0
|
243
|
-
hash: -
|
252
|
+
hash: -2449353312367516367
|
244
253
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
245
254
|
none: false
|
246
255
|
requirements:
|
@@ -249,7 +258,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
249
258
|
version: '0'
|
250
259
|
segments:
|
251
260
|
- 0
|
252
|
-
hash: -
|
261
|
+
hash: -2449353312367516367
|
253
262
|
requirements: []
|
254
263
|
rubyforge_project:
|
255
264
|
rubygems_version: 1.8.11
|
@@ -279,7 +288,6 @@ test_files:
|
|
279
288
|
- spec/parse/diva_spec.rb
|
280
289
|
- spec/parse/duo_spec.rb
|
281
290
|
- spec/parse/munin_spec.rb
|
282
|
-
- spec/redirect_follower_spec.rb
|
283
291
|
- spec/scraper_spec.rb
|
284
292
|
- spec/scrapers/base_spec.rb
|
285
293
|
- spec/scrapers/bora_spec.rb
|
@@ -1,29 +0,0 @@
|
|
1
|
-
# encoding: utf-8
|
2
|
-
require 'net/https'
|
3
|
-
module DoctorScrape
|
4
|
-
class TooManyRedirects < StandardError; end
|
5
|
-
|
6
|
-
class RedirectFollower
|
7
|
-
attr_accessor :url
|
8
|
-
|
9
|
-
def initialize(url)
|
10
|
-
@url = url
|
11
|
-
end
|
12
|
-
|
13
|
-
def resolve(limit=5)
|
14
|
-
raise TooManyRedirects if limit == 0
|
15
|
-
|
16
|
-
response = Net::HTTP.get_response URI.parse(@url)
|
17
|
-
|
18
|
-
if response.is_a? Net::HTTPRedirection
|
19
|
-
@url = response['location']
|
20
|
-
resolve limit - 1
|
21
|
-
end
|
22
|
-
|
23
|
-
@url
|
24
|
-
rescue Net::HTTPBadResponse => error
|
25
|
-
# This can safely be ignored
|
26
|
-
end
|
27
|
-
|
28
|
-
end
|
29
|
-
end
|
@@ -1,37 +0,0 @@
|
|
1
|
-
# encoding: utf-8
|
2
|
-
require 'spec_helper'
|
3
|
-
|
4
|
-
describe DoctorScrape::RedirectFollower do
|
5
|
-
let(:url) { "http://bit.ly/foobar" }
|
6
|
-
let(:endpoint) { "http://example.com" }
|
7
|
-
let(:resolver) { DoctorScrape::RedirectFollower.new url }
|
8
|
-
before { stub_request(:any, endpoint).to_return(:body => "You found me!") }
|
9
|
-
|
10
|
-
context "when url doesn't redirect" do
|
11
|
-
before { stub_request(:any, url).to_return(:body => "Ok") }
|
12
|
-
specify { resolver.resolve.should eq url }
|
13
|
-
end
|
14
|
-
|
15
|
-
context "when url redirects" do
|
16
|
-
before { stub_request(:any, url).to_return(:status => [ 302, "Moved Temporarily" ], headers: { location: endpoint }) }
|
17
|
-
specify { resolver.resolve.should eq endpoint }
|
18
|
-
end
|
19
|
-
|
20
|
-
context "too many redirects" do
|
21
|
-
before { stub_request(:any, url).to_return(:status => [ 302, "Moved Temporarily" ], headers: { location: url }) }
|
22
|
-
|
23
|
-
it "raises error after 5 redirects" do
|
24
|
-
expect { resolver.resolve }.to raise_error
|
25
|
-
a_request(:get, url).should have_been_made.times(5)
|
26
|
-
end
|
27
|
-
end
|
28
|
-
|
29
|
-
context "when exception occurs" do
|
30
|
-
it "returns the last url" do
|
31
|
-
stub_request(:get, url).to_return(:status => [ 302, "Moved Temporarily" ], headers: { location: endpoint })
|
32
|
-
stub_request(:get, endpoint).to_raise Net::HTTPBadResponse
|
33
|
-
resolver.resolve.should eq(endpoint)
|
34
|
-
end
|
35
|
-
end
|
36
|
-
|
37
|
-
end
|