dubdubdub 0.2.7 → 0.3.0

Sign up to get free protection for your applications and to get access to all the features.
data/Gemfile CHANGED
@@ -1,6 +1,6 @@
1
1
  source "http://rubygems.org"
2
2
 
3
- gem 'rest-client'
3
+ gem 'rest-client', git: 'https://github.com/daveola/rest-client'
4
4
  gem 'nokogiri'
5
5
  gem 'mechanize'
6
6
 
data/Gemfile.lock CHANGED
@@ -1,3 +1,11 @@
1
+ GIT
2
+ remote: https://github.com/daveola/rest-client
3
+ revision: 9bbe538aa2003e172f818eeff5c2e2e6828a8453
4
+ specs:
5
+ rest-client (1.6.7)
6
+ mime-types (>= 1.16)
7
+ netrc
8
+
1
9
  GEM
2
10
  remote: http://rubygems.org/
3
11
  specs:
@@ -25,6 +33,7 @@ GEM
25
33
  mime-types (1.19)
26
34
  net-http-digest_auth (1.2.1)
27
35
  net-http-persistent (2.8)
36
+ netrc (0.7.7)
28
37
  nokogiri (1.5.5)
29
38
  ntlm-http (0.1.1)
30
39
  pry (0.9.10)
@@ -34,8 +43,6 @@ GEM
34
43
  rake (10.0.2)
35
44
  rdoc (3.12)
36
45
  json (~> 1.4)
37
- rest-client (1.6.7)
38
- mime-types (>= 1.16)
39
46
  rspec (2.8.0)
40
47
  rspec-core (~> 2.8.0)
41
48
  rspec-expectations (~> 2.8.0)
@@ -61,6 +68,6 @@ DEPENDENCIES
61
68
  mechanize
62
69
  nokogiri
63
70
  pry
64
- rest-client
71
+ rest-client!
65
72
  rspec (~> 2.8.0)
66
73
  vcr (~> 2.3.0)
data/dubdubdub.gemspec CHANGED
@@ -5,7 +5,7 @@
5
5
 
6
6
  Gem::Specification.new do |s|
7
7
  s.name = "dubdubdub"
8
- s.version = "0.2.7"
8
+ s.version = "0.3.0"
9
9
 
10
10
  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
11
11
  s.authors = ["James Hu"]
@@ -33,17 +33,17 @@ Gem::Specification.new do |s|
33
33
  "spec/support/vcr.rb",
34
34
  "spec/vcr/browse/basic.yml",
35
35
  "spec/vcr/crawl/basic.yml",
36
- "spec/vcr/follow_url/alias_link.yml",
37
- "spec/vcr/follow_url/base.yml",
38
- "spec/vcr/follow_url/block_base_url.yml",
39
- "spec/vcr/follow_url/eoferror.yml",
40
- "spec/vcr/follow_url/https.yml",
41
- "spec/vcr/follow_url/pass_block.yml",
42
- "spec/vcr/follow_url/pass_block_iteration.yml",
43
- "spec/vcr/follow_url/proxied.yml",
44
- "spec/vcr/follow_url/proxy.yml",
45
- "spec/vcr/follow_url/proxy_forbidden.yml",
46
- "spec/vcr/follow_url/relative_redirects.yml",
36
+ "spec/vcr/follow/alias_link.yml",
37
+ "spec/vcr/follow/all_the_way.yml",
38
+ "spec/vcr/follow/base.yml",
39
+ "spec/vcr/follow/eoferror.yml",
40
+ "spec/vcr/follow/https.yml",
41
+ "spec/vcr/follow/pass_block.yml",
42
+ "spec/vcr/follow/pass_block_iteration.yml",
43
+ "spec/vcr/follow/proxy.yml",
44
+ "spec/vcr/follow/proxy_forbidden.yml",
45
+ "spec/vcr/follow/relative_redirects.yml",
46
+ "spec/vcr/follow/uri_error.yml",
47
47
  "spec/vcr/get/basic.yml",
48
48
  "spec/vcr/get/params.yml",
49
49
  "spec/vcr/get/proxy.yml"
data/lib/dubdubdub.rb CHANGED
@@ -1,6 +1,6 @@
1
1
  class DubDubDub
2
2
  # Version
3
- VERSION = "0.2.7"
3
+ VERSION = "0.3.0"
4
4
 
5
5
  attr_accessor :client
6
6
 
@@ -19,7 +19,7 @@ class DubDubDub::Client
19
19
  unless DubDubDub.configuration.ignore_proxy?
20
20
  proxy = DubDubDub.configuration.proxy
21
21
 
22
- raise DubDubDub::Exception, "No proxy has been configured or provided!" if proxy.nil?
22
+ raise ArgumentError, "No proxy has been configured or provided!" if proxy.nil?
23
23
 
24
24
  self.proxy = proxy
25
25
  end
@@ -43,36 +43,19 @@ class DubDubDub::Client
43
43
  end
44
44
 
45
45
  def proxy
46
- "#{proxy_host}:#{proxy_port}"
46
+ "#{proxy_host}:#{proxy_port}" if proxy_host and proxy_port
47
47
  end
48
48
 
49
49
  def proxy?
50
- return false if DubDubDub.configuration.ignore_proxy
50
+ return false if DubDubDub.configuration.ignore_proxy?
51
51
 
52
52
  !!proxy
53
53
  end
54
54
 
55
- # Returns a Net::HTTP object
56
- def net_http(uri)
57
- raise ArgumentError, "A URI must be provided!" unless uri.kind_of? URI::Generic
58
-
59
- net_http_class = if proxy?
60
- Net::HTTP.Proxy(proxy_host, proxy_port, proxy_user, proxy_password)
61
- else
62
- Net::HTTP
63
- end
64
-
65
- http = net_http_class.new(uri.host, uri.port)
66
- http.verify_mode = OpenSSL::SSL::VERIFY_NONE # ssl certificate doesn't need to be verified, otherwise a OpenSSL::SSL::SSLError might get thrown
67
- http.use_ssl = true if uri.scheme == "https"
68
-
69
- http
70
- end
71
-
72
55
  # Returns a RestClient::Resource
73
56
  def rest_client_resource(url)
74
57
  options = {}
75
- options[:proxy] = proxy if proxy?
58
+ options[:proxy] = "http://#{proxy}" if proxy?
76
59
 
77
60
  RestClient::Resource.new(url, options)
78
61
  end
@@ -109,93 +92,34 @@ class DubDubDub::Client
109
92
 
110
93
  # Helper method to browse by using a GET request via Mechanize
111
94
  def browse(url, *args)
112
- mechanize.get(url, *args)
113
- end
114
-
115
- # Follow a url to the end until it can no longer go any further
116
- # Even if it times out, it will return the url that it times out on!
117
- def follow_url(url, options = {}, &block)
118
- default_options = { limit: 20, attempts: 5, timeout: 5 }
119
- options = default_options.merge(options)
120
-
121
- at_base = false
122
- previous_uri = nil # Keep track of previous uri for relative path redirects
123
- response = nil
124
- user_agents = [
125
- 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.79 Safari/535.11',
126
- ''
127
- ]
128
- urls = [] # the url history
129
-
130
- raise ArgumentError if options[:until] and !options[:until].is_a?(Proc)
131
-
132
- # before we begin, let's yield the initial url if a block was given
133
- yield(url) if block_given?
134
-
135
- options[:limit].downto(1).each do |i|
136
- begin
137
- at_base = true if options[:until] and options[:until].call(url)
138
-
139
- uri = URI.parse(url)
140
- net_http = net_http(uri)
141
- at_base = true unless uri.respond_to?(:request_uri) # make sure its a proper url
142
-
143
- unless at_base
144
- request = Net::HTTP::Get.new(uri.request_uri)
145
- request_attempts = 0
146
-
147
- # we make a certain amount of attempts in case we timeout
148
- while request_attempts < options[:attempts]
149
- begin
150
- request_attempts += 1
151
-
152
- # Don't let the request take too long
153
- response = Timeout::timeout(options[:timeout]) do
154
- net_http.request(request)
155
- end
156
-
157
- break # if it reaches this, that means the request was successful do break out!
158
- # If any of these exceptions are thrown, it has timed out, so keep trying depending on how many attempts we have
159
- rescue Timeout::Error, Errno::ETIMEDOUT, Errno::EHOSTUNREACH
160
- # do another attempt if we are allowed one, or stop
161
- at_base = true and break if request_attempts == options[:attempts]
162
- rescue SocketError # doesn't exist
163
- at_base = true and break
164
- end
165
- end
166
-
167
- case response
168
- when Net::HTTPSuccess then at_base = true
169
- when Net::HTTPRedirection then url = response['location']
170
- when Net::HTTPForbidden then raise DubDubDub::Forbidden
171
- # Couldn't resolve, just return url
172
- else at_base = true
173
- end if response
174
- end
175
-
176
- # If any of these exceptions get thrown, return the current url
177
- rescue SocketError, EOFError
178
- at_base = true
179
- rescue URI::InvalidURIError
180
- return url # Just return it
95
+ handle_net_http_exceptions do
96
+ handle_mechanize_exceptions do
97
+ mechanize.get(url, *args)
181
98
  end
182
-
183
- urls << url
184
-
185
- break if at_base
186
-
187
- previous_uri = uri # Keep track of previous uri
188
- yield(url) if block_given?
189
99
  end
100
+ end
190
101
 
191
- end_uri = URI.parse(url)
102
+ # Follow a URL to the end
103
+ def follow(url)
104
+ browse(url).uri.to_s
105
+ end
192
106
 
193
- # If there is no host, it's due to a relative 301 redirect. Use previous uri's host, port, etc
194
- if !end_uri.host and previous_uri
195
- end_uri = previous_uri
196
- end_uri.path = url
107
+ private
108
+ def handle_net_http_exceptions(&block)
109
+ begin
110
+ yield
111
+ rescue Timeout::Error, Errno::ETIMEDOUT, Errno::EHOSTUNREACH
112
+ raise DubDubDub::ResponseError.new(e, 408) # Timeout
113
+ rescue SocketError, EOFError => e
114
+ raise DubDubDub::ResponseError.new(e, 404) # Not found
197
115
  end
116
+ end
198
117
 
199
- end_uri.to_s
118
+ def handle_mechanize_exceptions(&block)
119
+ begin
120
+ yield
121
+ rescue Mechanize::ResponseCodeError => e
122
+ raise DubDubDub::ResponseError.new(e, e.response_code)
123
+ end
200
124
  end
201
125
  end
@@ -1,2 +1,88 @@
1
- class DubDubDub::Exception < RuntimeError; end
2
- class DubDubDub::Forbidden < DubDubDub::Exception; end
1
+ class DubDubDub::Error < RuntimeError; end
2
+
3
+ # URL not properly formatted
4
+ class DubDubDub::URLFormatError < DubDubDub::Error; end
5
+
6
+ # There was an unhandled response. Contains
7
+ # a reference to the originating error.
8
+ class DubDubDub::ResponseError < DubDubDub::Error
9
+ attr_reader :error
10
+ attr_reader :code
11
+
12
+ # Pulled from rest-client, thanks!
13
+ STATUSES = {
14
+ 100 => 'Continue',
15
+ 101 => 'Switching Protocols',
16
+ 102 => 'Processing', #WebDAV
17
+
18
+ 200 => 'OK',
19
+ 201 => 'Created',
20
+ 202 => 'Accepted',
21
+ 203 => 'Non-Authoritative Information', # http/1.1
22
+ 204 => 'No Content',
23
+ 205 => 'Reset Content',
24
+ 206 => 'Partial Content',
25
+ 207 => 'Multi-Status', #WebDAV
26
+
27
+ 300 => 'Multiple Choices',
28
+ 301 => 'Moved Permanently',
29
+ 302 => 'Found',
30
+ 303 => 'See Other', # http/1.1
31
+ 304 => 'Not Modified',
32
+ 305 => 'Use Proxy', # http/1.1
33
+ 306 => 'Switch Proxy', # no longer used
34
+ 307 => 'Temporary Redirect', # http/1.1
35
+
36
+ 400 => 'Bad Request',
37
+ 401 => 'Unauthorized',
38
+ 402 => 'Payment Required',
39
+ 403 => 'Forbidden',
40
+ 404 => 'Resource Not Found',
41
+ 405 => 'Method Not Allowed',
42
+ 406 => 'Not Acceptable',
43
+ 407 => 'Proxy Authentication Required',
44
+ 408 => 'Request Timeout',
45
+ 409 => 'Conflict',
46
+ 410 => 'Gone',
47
+ 411 => 'Length Required',
48
+ 412 => 'Precondition Failed',
49
+ 413 => 'Request Entity Too Large',
50
+ 414 => 'Request-URI Too Long',
51
+ 415 => 'Unsupported Media Type',
52
+ 416 => 'Requested Range Not Satisfiable',
53
+ 417 => 'Expectation Failed',
54
+ 418 => 'I\'m A Teapot',
55
+ 421 => 'Too Many Connections From This IP',
56
+ 422 => 'Unprocessable Entity', #WebDAV
57
+ 423 => 'Locked', #WebDAV
58
+ 424 => 'Failed Dependency', #WebDAV
59
+ 425 => 'Unordered Collection', #WebDAV
60
+ 426 => 'Upgrade Required',
61
+ 449 => 'Retry With', #Microsoft
62
+ 450 => 'Blocked By Windows Parental Controls', #Microsoft
63
+
64
+ 500 => 'Internal Server Error',
65
+ 501 => 'Not Implemented',
66
+ 502 => 'Bad Gateway',
67
+ 503 => 'Service Unavailable',
68
+ 504 => 'Gateway Timeout',
69
+ 505 => 'HTTP Version Not Supported',
70
+ 506 => 'Variant Also Negotiates',
71
+ 507 => 'Insufficient Storage', #WebDAV
72
+ 509 => 'Bandwidth Limit Exceeded', #Apache
73
+ 510 => 'Not Extended'
74
+ }
75
+
76
+ def initialize(error, code)
77
+ @error = error
78
+ @code = code.to_i
79
+ end
80
+
81
+ def message
82
+ STATUSES[code.to_i]
83
+ end
84
+
85
+ def to_s
86
+ "#{code}: #{message} => #{error.class.name}: #{error.message}"
87
+ end
88
+ end
@@ -57,7 +57,7 @@ describe DubDubDub do
57
57
  config.proxy = nil
58
58
  end
59
59
 
60
- lambda { DubDubDub.new(proxy: true) }.should raise_error(DubDubDub::Exception)
60
+ lambda { DubDubDub.new(proxy: true) }.should raise_error(ArgumentError)
61
61
  end
62
62
 
63
63
  it "doesn't raise an error if configured to ignore proxies and we have specified to use a global proxy that hasn't been set" do
@@ -66,7 +66,7 @@ describe DubDubDub do
66
66
  config.proxy = nil
67
67
  end
68
68
 
69
- lambda { DubDubDub.new(proxy: true) }.should_not raise_error(DubDubDub::Exception)
69
+ lambda { DubDubDub.new(proxy: true) }.should_not raise_error(ArgumentError)
70
70
  end
71
71
 
72
72
  it "does not pass the method to client if that method doesn't exist within the client" do
@@ -168,8 +168,11 @@ describe DubDubDub do
168
168
  end
169
169
 
170
170
  it "works with a proxy", vcr: { cassette_name: "get/proxy", record: :once } do
171
- www.proxy = "203.131.212.166"
172
- response = www.get "http://www.google.com"
171
+ www.proxy = "173.234.181.64:8800"
172
+ response = www.get "http://www.whatismyipaddress.com"
173
+ html = Nokogiri::HTML(response)
174
+
175
+ html.css('.ip').text.strip.should == "173.234.181.64"
173
176
  end
174
177
  end
175
178
 
@@ -187,78 +190,62 @@ describe DubDubDub do
187
190
  end
188
191
  end
189
192
 
190
- describe '#follow_url' do
191
- it "follows url to the end", vcr: { cassette_name: "follow_url/base", record: :once } do
192
- www.follow_url("http://say.ly/TCc1CEp").should == "http://www.whosay.com/TomHanks/photos/148406"
193
- www.follow_url("http://t.co//qbJx26r").should == "http://twitter.com/twitter/status/76360760606986241/photo/1"
194
- www.follow_url("http://mypict.me/mMgLU").should == "http://mypict.me/mobile.php?id=336583610"
193
+ describe '#follow' do
194
+ it "follows url to the end", vcr: { cassette_name: "follow/base", record: :once } do
195
+ www.follow("http://say.ly/TCc1CEp").should == "http://www.whosay.com/TomHanks/photos/148406"
196
+ www.follow("http://t.co//qbJx26r").should == "http://twitter.com/twitter/status/76360760606986241/photo/1"
197
+ www.follow("http://mypict.me/mMgLU").should == "http://mypict.me/mobile.php?id=336583610"
195
198
  end
196
199
 
197
- it "returns the base url if it meets a passed in block", vcr: { cassette_name: "follow_url/block_base_url", record: :once } do
198
- www.follow_url("http://ow.ly/9Rp7p", until: lambda { |url| url =~ /ow\.ly/ }).should == "http://ow.ly/9Rp7p"
199
- www.follow_url("http://ow.ly/9Rp7p", until: lambda { |url| url =~ /bit\.ly/ }).should == "http://bit.ly/GMx5lu"
200
- www.follow_url("http://ow.ly/9Rp7p", until: lambda { |url| url =~ /bit\.lyyy/ }).should == "http://instagram.com/p/IbhSB6EKRQ/"
200
+ it "handles invalid uris", vcr: { cassette_name: "follow/invalid_uris", record: :once } do
201
+ lambda { www.follow("http://rank.1new.biz/sharp-紙パック式クリーナー-床用吸い込み口タイプ-オ/") }.should_not raise_error(DubDubDub::URLFormatError)
201
202
  end
202
203
 
203
- it "can pass in a block to get the url every step of the way", vcr: { cassette_name: "follow_url/pass_block_iteration", record: :once } do
204
- urls = []
205
-
206
- www.follow_url("http://ow.ly/9Rp7p") do |url|
207
- urls << url
208
- end
209
-
210
- urls.first.should == "http://ow.ly/9Rp7p" # first url should be the initial one
211
- urls.count.should == 4
204
+ it "handles https", vcr: { cassette_name: "follow/https", record: :once } do
205
+ lambda { www.follow("https://www.youtube.com/watch?v=DM58Zdk7el0&feature=youtube_gdata_player") }.should_not raise_error(EOFError)
212
206
  end
213
207
 
214
- it "can pass in a block with the last url being the base url", vcr: { cassette_name: "follow_url/pass_block", record: :once } do
215
- urls = []
208
+ it "raises an exception if doesn't exist", vcr: { cassette_name: "follow/doesnt_exist", record: :once } do
209
+ lambda { www.follow("http://cnnsadasdasdasdasdasd.com/asd") }.should raise_error(DubDubDub::ResponseError)
216
210
 
217
- www.follow_url("http://twitpic.com/92a2p5") do |url|
218
- urls << url
211
+ begin
212
+ www.follow("http://cnnsadasdasdasdasdasd.com/asd")
213
+ rescue DubDubDub::ResponseError => e
214
+ e.code.should == 404
215
+ e.error.should_not be_nil
216
+ e.message.should_not be_nil
219
217
  end
220
-
221
- urls.count.should == 1
222
- urls.last.should == "http://twitpic.com/92a2p5"
223
- end
224
-
225
- it "handles invalid uris", vcr: { cassette_name: "follow_url/invalid_uris", record: :once } do
226
- lambda { www.follow_url("http://rank.1new.biz/sharp-紙パック式クリーナー-床用吸い込み口タイプ-オ/") }.should_not raise_error(URI::InvalidURIError)
227
- www.follow_url("http://rank.1new.biz/sharp-紙パック式クリーナー-床用吸い込み口タイプ-オ/").should == "http://rank.1new.biz/sharp-紙パック式クリーナー-床用吸い込み口タイプ-オ/"
228
218
  end
229
219
 
230
- it "handles https", vcr: { cassette_name: "follow_url/https", record: :once } do
231
- lambda { www.follow_url("https://www.youtube.com/watch?v=DM58Zdk7el0&feature=youtube_gdata_player") }.should_not raise_error(EOFError)
220
+ it "returns actual asset link for an alias link", vcr: { cassette_name: "follow/alias_link", record: :once } do
221
+ www.follow("http://yfrog.us/evlb0z:medium").should == "http://img535.imageshack.us/img535/9845/lb0.mp4"
232
222
  end
233
223
 
234
- it "returns the same url if the name or service doesn't exist", vcr: { cassette_name: "follow_url/doesnt_exist", record: :once } do
235
- www.follow_url("http://cnnsadasdasdasdasdasd.com/asd").should == "http://cnnsadasdasdasdasdasd.com/asd"
224
+ it "does not raise a EOFError", vcr: { cassette_name: "follow/eoferror", record: :once } do
225
+ lambda { www.follow("http://www.soulpancake.com/post/1607/whats-your-beautiful-mess.html") }.should_not raise_error
236
226
  end
237
227
 
238
- it "returns actual asset link for an alias link", vcr: { cassette_name: "follow_url/alias_link", record: :once } do
239
- www.follow_url("http://yfrog.us/evlb0z:medium").should == "http://img535.imageshack.us/img535/9845/lb0.mp4"
228
+ it 'works with a proxy', vcr: { cassette_name: "follow/proxy", record: :once } do
229
+ www.proxy = "198.154.114.100:8080"
230
+ www.follow("http://yfrog.us/evlb0z:medium").should == "http://img535.imageshack.us/img535/9845/lb0.mp4"
240
231
  end
241
232
 
242
- it "does not raise a EOFError", vcr: { cassette_name: "follow_url/eoferror", record: :once } do
243
- lambda { www.follow_url("http://www.soulpancake.com/post/1607/whats-your-beautiful-mess.html") }.should_not raise_error
233
+ it "works with relative path redirects", vcr: { cassette_name: "follow/relative_redirects", record: :once } do
234
+ www.follow("http://www.retailmenot.com/out/4223117").should == "http://www.papajohns.com/index.html"
244
235
  end
245
236
 
246
- it 'works with a proxy', vcr: { cassette_name: "follow_url/proxy", record: :once } do
247
- www.proxy = "198.154.114.100:8080"
248
- www.follow_url("http://yfrog.us/evlb0z:medium").should == "http://img535.imageshack.us/img535/9845/lb0.mp4"
249
- end
250
-
251
- it 'works for domains', vcr: { cassette_name: "follow_url/domains", record: :once } do
252
- www.follow_url("google.com").should == "google.com"
237
+ it "raises response error on a bad proxy", vcr: { cassette_name: "follow/proxy_forbidden", record: :once } do
238
+ www.proxy = "190.202.116.101:3128"
239
+ lambda { www.follow("http://yfrog.us/evlb0z:medium").should }.should raise_error(DubDubDub::ResponseError)
253
240
  end
254
241
 
255
- it "works with relative path redirects", vcr: { cassette_name: "follow_url/relative_redirects", record: :once } do
256
- www.follow_url("http://www.retailmenot.com/out/4223117").should == "http://www.papajohns.com/index.html"
242
+ it "follows to the end for some types of urls", vcr: { cassette_name: "follow/all_the_way", record: :once } do
243
+ www.follow("http://www.apmebf.com/fo122tenm4/elq/32A39898/4432424/2/2/2").should == "http://www.bedbathandbeyond.com/default.asp?utm_source=WhaleShark+Media%3A+RetailMeNot%2Ecom&utm_medium=affiliate&utm_term=&utm_campaign=Bed+Bath+and+Beyond+Product+Catalog&aid=10817676&pid=2210202&sid=&"
257
244
  end
258
245
 
259
- it "raises forbidden properly on a bad proxy", vcr: { cassette_name: "follow_url/proxy_forbidden", record: :once } do
260
- www.proxy = "190.202.116.101:3128"
261
- lambda { www.follow_url("http://yfrog.us/evlb0z:medium").should }.should raise_error(DubDubDub::Forbidden)
246
+ it "handles doesn't error out due to URI", vcr: { cassette_name: "follow/uri_error", record: :once } do
247
+ url = www.follow "http://retailmenot.com/out/4231224"
248
+ url.should == "http://www.toysrus.com/category/index.jsp?categoryId=3999911"
262
249
  end
263
250
  end
264
251
  end