doctor_scrape 0.0.5 → 0.0.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/doctor_scrape.gemspec +3 -2
- data/lib/doctor_scrape.rb +1 -2
- data/lib/doctor_scrape/search.rb +1 -1
- data/lib/doctor_scrape/version.rb +1 -1
- data/spec/search_spec.rb +2 -2
- metadata +45 -37
- data/lib/doctor_scrape/redirect_follower.rb +0 -29
- data/spec/redirect_follower_spec.rb +0 -37
data/doctor_scrape.gemspec
CHANGED
@@ -15,7 +15,9 @@ Gem::Specification.new do |gem|
|
|
15
15
|
gem.require_paths = ["lib"]
|
16
16
|
gem.version = DoctorScrape::VERSION
|
17
17
|
|
18
|
-
gem.add_dependency "mechanize", ["~> 2.3"]
|
18
|
+
# gem.add_dependency "mechanize", ["~> 2.3"]
|
19
|
+
gem.add_dependency "nokogiri", ["~> 1.5.0"]
|
20
|
+
gem.add_dependency "unwind", ["~> 0.9.6"]
|
19
21
|
# gem.add_dependency "text", ["~> 1.0.3"]
|
20
22
|
|
21
23
|
gem.add_development_dependency "rspec", ["~> 2.8.0"]
|
@@ -31,5 +33,4 @@ Gem::Specification.new do |gem|
|
|
31
33
|
gem.add_development_dependency "hirb", ["~> 0.6.0"]
|
32
34
|
gem.add_development_dependency "awesome_print", ["~> 1.0.2"]
|
33
35
|
gem.add_development_dependency "simplecov", ["~> 0.6.0"]
|
34
|
-
# gem.add_development_dependency "psych", ["~> 1.2.2"]
|
35
36
|
end
|
data/lib/doctor_scrape.rb
CHANGED
@@ -1,14 +1,13 @@
|
|
1
1
|
# encoding: utf-8
|
2
2
|
require 'doctor_scrape/version'
|
3
|
-
require 'mechanize'
|
4
3
|
require 'nokogiri'
|
5
4
|
require 'ostruct'
|
6
5
|
require 'open-uri'
|
6
|
+
require 'unwind'
|
7
7
|
|
8
8
|
module DoctorScrape
|
9
9
|
autoload :Data, 'doctor_scrape/data'
|
10
10
|
autoload :Search, 'doctor_scrape/search'
|
11
|
-
autoload :RedirectFollower, 'doctor_scrape/redirect_follower'
|
12
11
|
|
13
12
|
module Scraper
|
14
13
|
autoload :Base, 'doctor_scrape/scraper/base'
|
data/lib/doctor_scrape/search.rb
CHANGED
data/spec/search_spec.rb
CHANGED
@@ -89,9 +89,9 @@ describe DoctorScrape::Search do
|
|
89
89
|
let(:resolved) { ["http://bitly.com", "http://example.com", "http://googl.com"] }
|
90
90
|
|
91
91
|
it "returns array with resolved urls" do
|
92
|
-
resolvers = resolved.map { |r| double "resolver", :
|
92
|
+
resolvers = resolved.map { |r| double "resolver", :final_url => r }
|
93
93
|
urls.each_with_index do |url, i|
|
94
|
-
|
94
|
+
Unwind::RedirectFollower.should_receive(:resolve).with(url) { resolvers[i] }
|
95
95
|
end
|
96
96
|
|
97
97
|
DoctorScrape::Search.resolve_urls(urls).should == resolved
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: doctor_scrape
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.5
|
4
|
+
version: 0.0.6
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,22 +9,33 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2012-
|
12
|
+
date: 2012-03-01 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
|
-
name: mechanize
|
16
|
-
requirement: &
|
15
|
+
name: nokogiri
|
16
|
+
requirement: &70186149359280 !ruby/object:Gem::Requirement
|
17
17
|
none: false
|
18
18
|
requirements:
|
19
19
|
- - ~>
|
20
20
|
- !ruby/object:Gem::Version
|
21
|
-
version:
|
21
|
+
version: 1.5.0
|
22
22
|
type: :runtime
|
23
23
|
prerelease: false
|
24
|
-
version_requirements: *
|
24
|
+
version_requirements: *70186149359280
|
25
|
+
- !ruby/object:Gem::Dependency
|
26
|
+
name: unwind
|
27
|
+
requirement: &70186149358760 !ruby/object:Gem::Requirement
|
28
|
+
none: false
|
29
|
+
requirements:
|
30
|
+
- - ~>
|
31
|
+
- !ruby/object:Gem::Version
|
32
|
+
version: 0.9.6
|
33
|
+
type: :runtime
|
34
|
+
prerelease: false
|
35
|
+
version_requirements: *70186149358760
|
25
36
|
- !ruby/object:Gem::Dependency
|
26
37
|
name: rspec
|
27
|
-
requirement: &
|
38
|
+
requirement: &70186149358280 !ruby/object:Gem::Requirement
|
28
39
|
none: false
|
29
40
|
requirements:
|
30
41
|
- - ~>
|
@@ -32,10 +43,10 @@ dependencies:
|
|
32
43
|
version: 2.8.0
|
33
44
|
type: :development
|
34
45
|
prerelease: false
|
35
|
-
version_requirements: *
|
46
|
+
version_requirements: *70186149358280
|
36
47
|
- !ruby/object:Gem::Dependency
|
37
48
|
name: vcr
|
38
|
-
requirement: &
|
49
|
+
requirement: &70186149357800 !ruby/object:Gem::Requirement
|
39
50
|
none: false
|
40
51
|
requirements:
|
41
52
|
- - ~>
|
@@ -43,10 +54,10 @@ dependencies:
|
|
43
54
|
version: 2.0.0.rc1
|
44
55
|
type: :development
|
45
56
|
prerelease: false
|
46
|
-
version_requirements: *
|
57
|
+
version_requirements: *70186149357800
|
47
58
|
- !ruby/object:Gem::Dependency
|
48
59
|
name: webmock
|
49
|
-
requirement: &
|
60
|
+
requirement: &70186149357320 !ruby/object:Gem::Requirement
|
50
61
|
none: false
|
51
62
|
requirements:
|
52
63
|
- - <
|
@@ -54,10 +65,10 @@ dependencies:
|
|
54
65
|
version: '1.8'
|
55
66
|
type: :development
|
56
67
|
prerelease: false
|
57
|
-
version_requirements: *
|
68
|
+
version_requirements: *70186149357320
|
58
69
|
- !ruby/object:Gem::Dependency
|
59
70
|
name: guard
|
60
|
-
requirement: &
|
71
|
+
requirement: &70186149356840 !ruby/object:Gem::Requirement
|
61
72
|
none: false
|
62
73
|
requirements:
|
63
74
|
- - ~>
|
@@ -65,10 +76,10 @@ dependencies:
|
|
65
76
|
version: 1.0.0
|
66
77
|
type: :development
|
67
78
|
prerelease: false
|
68
|
-
version_requirements: *
|
79
|
+
version_requirements: *70186149356840
|
69
80
|
- !ruby/object:Gem::Dependency
|
70
81
|
name: guard-rspec
|
71
|
-
requirement: &
|
82
|
+
requirement: &70186149356360 !ruby/object:Gem::Requirement
|
72
83
|
none: false
|
73
84
|
requirements:
|
74
85
|
- - ~>
|
@@ -76,10 +87,10 @@ dependencies:
|
|
76
87
|
version: 0.6.0
|
77
88
|
type: :development
|
78
89
|
prerelease: false
|
79
|
-
version_requirements: *
|
90
|
+
version_requirements: *70186149356360
|
80
91
|
- !ruby/object:Gem::Dependency
|
81
92
|
name: ruby_gntp
|
82
|
-
requirement: &
|
93
|
+
requirement: &70186149355880 !ruby/object:Gem::Requirement
|
83
94
|
none: false
|
84
95
|
requirements:
|
85
96
|
- - ~>
|
@@ -87,10 +98,10 @@ dependencies:
|
|
87
98
|
version: 0.3.4
|
88
99
|
type: :development
|
89
100
|
prerelease: false
|
90
|
-
version_requirements: *
|
101
|
+
version_requirements: *70186149355880
|
91
102
|
- !ruby/object:Gem::Dependency
|
92
103
|
name: rb-fsevent
|
93
|
-
requirement: &
|
104
|
+
requirement: &70186149355400 !ruby/object:Gem::Requirement
|
94
105
|
none: false
|
95
106
|
requirements:
|
96
107
|
- - ~>
|
@@ -98,10 +109,10 @@ dependencies:
|
|
98
109
|
version: 0.9.0
|
99
110
|
type: :development
|
100
111
|
prerelease: false
|
101
|
-
version_requirements: *
|
112
|
+
version_requirements: *70186149355400
|
102
113
|
- !ruby/object:Gem::Dependency
|
103
114
|
name: pry
|
104
|
-
requirement: &
|
115
|
+
requirement: &70186149354920 !ruby/object:Gem::Requirement
|
105
116
|
none: false
|
106
117
|
requirements:
|
107
118
|
- - ~>
|
@@ -109,10 +120,10 @@ dependencies:
|
|
109
120
|
version: 0.9.8.2
|
110
121
|
type: :development
|
111
122
|
prerelease: false
|
112
|
-
version_requirements: *
|
123
|
+
version_requirements: *70186149354920
|
113
124
|
- !ruby/object:Gem::Dependency
|
114
125
|
name: pry-doc
|
115
|
-
requirement: &
|
126
|
+
requirement: &70186149354440 !ruby/object:Gem::Requirement
|
116
127
|
none: false
|
117
128
|
requirements:
|
118
129
|
- - ~>
|
@@ -120,10 +131,10 @@ dependencies:
|
|
120
131
|
version: 0.4.0
|
121
132
|
type: :development
|
122
133
|
prerelease: false
|
123
|
-
version_requirements: *
|
134
|
+
version_requirements: *70186149354440
|
124
135
|
- !ruby/object:Gem::Dependency
|
125
136
|
name: pry-editline
|
126
|
-
requirement: &
|
137
|
+
requirement: &70186149353960 !ruby/object:Gem::Requirement
|
127
138
|
none: false
|
128
139
|
requirements:
|
129
140
|
- - ~>
|
@@ -131,10 +142,10 @@ dependencies:
|
|
131
142
|
version: 1.1.1
|
132
143
|
type: :development
|
133
144
|
prerelease: false
|
134
|
-
version_requirements: *
|
145
|
+
version_requirements: *70186149353960
|
135
146
|
- !ruby/object:Gem::Dependency
|
136
147
|
name: hirb
|
137
|
-
requirement: &
|
148
|
+
requirement: &70186149353480 !ruby/object:Gem::Requirement
|
138
149
|
none: false
|
139
150
|
requirements:
|
140
151
|
- - ~>
|
@@ -142,10 +153,10 @@ dependencies:
|
|
142
153
|
version: 0.6.0
|
143
154
|
type: :development
|
144
155
|
prerelease: false
|
145
|
-
version_requirements: *
|
156
|
+
version_requirements: *70186149353480
|
146
157
|
- !ruby/object:Gem::Dependency
|
147
158
|
name: awesome_print
|
148
|
-
requirement: &
|
159
|
+
requirement: &70186149353000 !ruby/object:Gem::Requirement
|
149
160
|
none: false
|
150
161
|
requirements:
|
151
162
|
- - ~>
|
@@ -153,10 +164,10 @@ dependencies:
|
|
153
164
|
version: 1.0.2
|
154
165
|
type: :development
|
155
166
|
prerelease: false
|
156
|
-
version_requirements: *
|
167
|
+
version_requirements: *70186149353000
|
157
168
|
- !ruby/object:Gem::Dependency
|
158
169
|
name: simplecov
|
159
|
-
requirement: &
|
170
|
+
requirement: &70186149352520 !ruby/object:Gem::Requirement
|
160
171
|
none: false
|
161
172
|
requirements:
|
162
173
|
- - ~>
|
@@ -164,7 +175,7 @@ dependencies:
|
|
164
175
|
version: 0.6.0
|
165
176
|
type: :development
|
166
177
|
prerelease: false
|
167
|
-
version_requirements: *
|
178
|
+
version_requirements: *70186149352520
|
168
179
|
description: Library for scraping norwegian doctoral dissertations
|
169
180
|
email:
|
170
181
|
- gudleik@gmail.com
|
@@ -183,7 +194,6 @@ files:
|
|
183
194
|
- doctor_scrape.gemspec
|
184
195
|
- lib/doctor_scrape.rb
|
185
196
|
- lib/doctor_scrape/data.rb
|
186
|
-
- lib/doctor_scrape/redirect_follower.rb
|
187
197
|
- lib/doctor_scrape/scraper/base.rb
|
188
198
|
- lib/doctor_scrape/scraper/bora.rb
|
189
199
|
- lib/doctor_scrape/scraper/diva.rb
|
@@ -214,7 +224,6 @@ files:
|
|
214
224
|
- spec/parse/diva_spec.rb
|
215
225
|
- spec/parse/duo_spec.rb
|
216
226
|
- spec/parse/munin_spec.rb
|
217
|
-
- spec/redirect_follower_spec.rb
|
218
227
|
- spec/scraper_spec.rb
|
219
228
|
- spec/scrapers/base_spec.rb
|
220
229
|
- spec/scrapers/bora_spec.rb
|
@@ -240,7 +249,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
240
249
|
version: '0'
|
241
250
|
segments:
|
242
251
|
- 0
|
243
|
-
hash: -
|
252
|
+
hash: -2449353312367516367
|
244
253
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
245
254
|
none: false
|
246
255
|
requirements:
|
@@ -249,7 +258,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
249
258
|
version: '0'
|
250
259
|
segments:
|
251
260
|
- 0
|
252
|
-
hash: -
|
261
|
+
hash: -2449353312367516367
|
253
262
|
requirements: []
|
254
263
|
rubyforge_project:
|
255
264
|
rubygems_version: 1.8.11
|
@@ -279,7 +288,6 @@ test_files:
|
|
279
288
|
- spec/parse/diva_spec.rb
|
280
289
|
- spec/parse/duo_spec.rb
|
281
290
|
- spec/parse/munin_spec.rb
|
282
|
-
- spec/redirect_follower_spec.rb
|
283
291
|
- spec/scraper_spec.rb
|
284
292
|
- spec/scrapers/base_spec.rb
|
285
293
|
- spec/scrapers/bora_spec.rb
|
@@ -1,29 +0,0 @@
|
|
1
|
-
# encoding: utf-8
|
2
|
-
require 'net/https'
|
3
|
-
module DoctorScrape
|
4
|
-
class TooManyRedirects < StandardError; end
|
5
|
-
|
6
|
-
class RedirectFollower
|
7
|
-
attr_accessor :url
|
8
|
-
|
9
|
-
def initialize(url)
|
10
|
-
@url = url
|
11
|
-
end
|
12
|
-
|
13
|
-
def resolve(limit=5)
|
14
|
-
raise TooManyRedirects if limit == 0
|
15
|
-
|
16
|
-
response = Net::HTTP.get_response URI.parse(@url)
|
17
|
-
|
18
|
-
if response.is_a? Net::HTTPRedirection
|
19
|
-
@url = response['location']
|
20
|
-
resolve limit - 1
|
21
|
-
end
|
22
|
-
|
23
|
-
@url
|
24
|
-
rescue Net::HTTPBadResponse => error
|
25
|
-
# This can safely be ignored
|
26
|
-
end
|
27
|
-
|
28
|
-
end
|
29
|
-
end
|
@@ -1,37 +0,0 @@
|
|
1
|
-
# encoding: utf-8
|
2
|
-
require 'spec_helper'
|
3
|
-
|
4
|
-
describe DoctorScrape::RedirectFollower do
|
5
|
-
let(:url) { "http://bit.ly/foobar" }
|
6
|
-
let(:endpoint) { "http://example.com" }
|
7
|
-
let(:resolver) { DoctorScrape::RedirectFollower.new url }
|
8
|
-
before { stub_request(:any, endpoint).to_return(:body => "You found me!") }
|
9
|
-
|
10
|
-
context "when url doesn't redirect" do
|
11
|
-
before { stub_request(:any, url).to_return(:body => "Ok") }
|
12
|
-
specify { resolver.resolve.should eq url }
|
13
|
-
end
|
14
|
-
|
15
|
-
context "when url redirects" do
|
16
|
-
before { stub_request(:any, url).to_return(:status => [ 302, "Moved Temporarily" ], headers: { location: endpoint }) }
|
17
|
-
specify { resolver.resolve.should eq endpoint }
|
18
|
-
end
|
19
|
-
|
20
|
-
context "too many redirects" do
|
21
|
-
before { stub_request(:any, url).to_return(:status => [ 302, "Moved Temporarily" ], headers: { location: url }) }
|
22
|
-
|
23
|
-
it "raises error after 5 redirects" do
|
24
|
-
expect { resolver.resolve }.to raise_error
|
25
|
-
a_request(:get, url).should have_been_made.times(5)
|
26
|
-
end
|
27
|
-
end
|
28
|
-
|
29
|
-
context "when exception occurs" do
|
30
|
-
it "returns the last url" do
|
31
|
-
stub_request(:get, url).to_return(:status => [ 302, "Moved Temporarily" ], headers: { location: endpoint })
|
32
|
-
stub_request(:get, endpoint).to_raise Net::HTTPBadResponse
|
33
|
-
resolver.resolve.should eq(endpoint)
|
34
|
-
end
|
35
|
-
end
|
36
|
-
|
37
|
-
end
|