gogetter 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.document +5 -0
- data/.rspec +1 -0
- data/Gemfile +9 -0
- data/Gemfile.lock +30 -0
- data/LICENSE.txt +20 -0
- data/README.rdoc +28 -0
- data/Rakefile +35 -0
- data/VERSION +1 -0
- data/gogetter.gemspec +69 -0
- data/lib/go_getter/go_getter.rb +76 -0
- data/lib/go_getter/response.rb +16 -0
- data/lib/go_getter/utils.rb +17 -0
- data/lib/go_getter.rb +3 -0
- data/lib/gogetter.rb +2 -0
- data/spec/fixtures/google.html +3 -0
- data/spec/fixtures/google.redirect.html +6 -0
- data/spec/go_getter/go_getter_spec.rb +151 -0
- data/spec/rspec.opts +3 -0
- data/spec/spec_helper.rb +21 -0
- metadata +124 -0
data/.document
ADDED
data/.rspec
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
--color
|
data/Gemfile
ADDED
data/Gemfile.lock
ADDED
@@ -0,0 +1,30 @@
|
|
1
|
+
GEM
|
2
|
+
remote: http://rubygems.org/
|
3
|
+
specs:
|
4
|
+
diff-lcs (1.1.2)
|
5
|
+
fakeweb (1.3.0)
|
6
|
+
git (1.2.5)
|
7
|
+
jeweler (1.6.4)
|
8
|
+
bundler (~> 1.0)
|
9
|
+
git (>= 1.2.5)
|
10
|
+
rake
|
11
|
+
rake (0.9.2)
|
12
|
+
rspec (2.6.0)
|
13
|
+
rspec-core (~> 2.6.0)
|
14
|
+
rspec-expectations (~> 2.6.0)
|
15
|
+
rspec-mocks (~> 2.6.0)
|
16
|
+
rspec-core (2.6.4)
|
17
|
+
rspec-expectations (2.6.0)
|
18
|
+
diff-lcs (~> 1.1.2)
|
19
|
+
rspec-mocks (2.6.0)
|
20
|
+
yard (0.7.2)
|
21
|
+
|
22
|
+
PLATFORMS
|
23
|
+
ruby
|
24
|
+
|
25
|
+
DEPENDENCIES
|
26
|
+
bundler (~> 1.0.0)
|
27
|
+
fakeweb (~> 1.3.0)
|
28
|
+
jeweler (~> 1.6.4)
|
29
|
+
rspec (~> 2.6.0)
|
30
|
+
yard (~> 0.7.2)
|
data/LICENSE.txt
ADDED
@@ -0,0 +1,20 @@
|
|
1
|
+
Copyright (c) 2011 Elad Kehat
|
2
|
+
|
3
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
4
|
+
a copy of this software and associated documentation files (the
|
5
|
+
"Software"), to deal in the Software without restriction, including
|
6
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
7
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
8
|
+
permit persons to whom the Software is furnished to do so, subject to
|
9
|
+
the following conditions:
|
10
|
+
|
11
|
+
The above copyright notice and this permission notice shall be
|
12
|
+
included in all copies or substantial portions of the Software.
|
13
|
+
|
14
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
15
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
16
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
17
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
18
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
19
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
20
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.rdoc
ADDED
@@ -0,0 +1,28 @@
|
|
1
|
+
= GoGetter
|
2
|
+
|
3
|
+
Easily send HTTP GET requests, with a little more sophistication than Net::HTTP.get
|
4
|
+
|
5
|
+
Unlike HTTParty, which is a great gem if your class works against a specific website, GoGetter fits the use case where
|
6
|
+
you need to send a bunch of HTTP GETs to several domains, and don't want to wrap each one in its own class.
|
7
|
+
|
8
|
+
Think of it as an alternative to open-uri that doesn't create any temporary files.
|
9
|
+
|
10
|
+
It handles proxies, basic authentication, and HTTP redirects.
|
11
|
+
|
12
|
+
Before releasing this code I used it extensively in a proprietary web crawler that sent around a billion GET requests
|
13
|
+
so far, so you could say that it's quite robust :)
|
14
|
+
|
15
|
+
== Contributing to gogetter
|
16
|
+
|
17
|
+
* Check out the latest master to make sure the feature hasn't been implemented or the bug hasn't been fixed yet
|
18
|
+
* Check out the issue tracker to make sure someone hasn't already requested it and/or contributed it
|
19
|
+
* Fork the project
|
20
|
+
* Start a feature/bugfix branch
|
21
|
+
* Commit and push until you are happy with your contribution
|
22
|
+
* Make sure to add tests for it. This is important so I don't break it in a future version unintentionally.
|
23
|
+
* Please try not to mess with the Rakefile, version, or history. If you want to have your own version, or if changing them is otherwise
|
24
|
+
necessary, that is fine, but please isolate to its own commit so I can cherry-pick around it.
|
25
|
+
|
26
|
+
== Copyright
|
27
|
+
|
28
|
+
Copyright (c) 2011 Elad Kehat. See LICENSE.txt for further details.
|
data/Rakefile
ADDED
@@ -0,0 +1,35 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
require 'rubygems'
|
4
|
+
require 'bundler'
|
5
|
+
begin
|
6
|
+
Bundler.setup(:default, :development)
|
7
|
+
rescue Bundler::BundlerError => e
|
8
|
+
$stderr.puts e.message
|
9
|
+
$stderr.puts "Run `bundle install` to install missing gems"
|
10
|
+
exit e.status_code
|
11
|
+
end
|
12
|
+
require 'rake'
|
13
|
+
|
14
|
+
require 'jeweler'
|
15
|
+
Jeweler::Tasks.new do |gem|
|
16
|
+
gem.name = "gogetter"
|
17
|
+
gem.homepage = "http://github.com/eladkehat/gogetter"
|
18
|
+
gem.license = "MIT"
|
19
|
+
gem.summary = %Q{Go get something over HTTP}
|
20
|
+
gem.email = "eladkehat@gmail.com"
|
21
|
+
gem.authors = ["Elad Kehat"]
|
22
|
+
end
|
23
|
+
Jeweler::RubygemsDotOrgTasks.new
|
24
|
+
|
25
|
+
require 'rspec/core'
|
26
|
+
require 'rspec/core/rake_task'
|
27
|
+
RSpec::Core::RakeTask.new(:spec) do |spec|
|
28
|
+
spec.pattern = FileList['spec/**/*_spec.rb']
|
29
|
+
spec.rspec_opts = ['--options', 'spec/rspec.opts']
|
30
|
+
end
|
31
|
+
|
32
|
+
task :default => :spec
|
33
|
+
|
34
|
+
require 'yard'
|
35
|
+
YARD::Rake::YardocTask.new
|
data/VERSION
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
0.1.0
|
data/gogetter.gemspec
ADDED
@@ -0,0 +1,69 @@
|
|
1
|
+
# Generated by jeweler
|
2
|
+
# DO NOT EDIT THIS FILE DIRECTLY
|
3
|
+
# Instead, edit Jeweler::Tasks in Rakefile, and run 'rake gemspec'
|
4
|
+
# -*- encoding: utf-8 -*-
|
5
|
+
|
6
|
+
Gem::Specification.new do |s|
|
7
|
+
s.name = %q{gogetter}
|
8
|
+
s.version = "0.1.0"
|
9
|
+
|
10
|
+
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
11
|
+
s.authors = ["Elad Kehat"]
|
12
|
+
s.date = %q{2011-07-25}
|
13
|
+
s.email = %q{eladkehat@gmail.com}
|
14
|
+
s.extra_rdoc_files = [
|
15
|
+
"LICENSE.txt",
|
16
|
+
"README.rdoc"
|
17
|
+
]
|
18
|
+
s.files = [
|
19
|
+
".document",
|
20
|
+
".rspec",
|
21
|
+
"Gemfile",
|
22
|
+
"Gemfile.lock",
|
23
|
+
"LICENSE.txt",
|
24
|
+
"README.rdoc",
|
25
|
+
"Rakefile",
|
26
|
+
"VERSION",
|
27
|
+
"gogetter.gemspec",
|
28
|
+
"lib/go_getter.rb",
|
29
|
+
"lib/go_getter/go_getter.rb",
|
30
|
+
"lib/go_getter/response.rb",
|
31
|
+
"lib/go_getter/utils.rb",
|
32
|
+
"lib/gogetter.rb",
|
33
|
+
"spec/fixtures/google.html",
|
34
|
+
"spec/fixtures/google.redirect.html",
|
35
|
+
"spec/go_getter/go_getter_spec.rb",
|
36
|
+
"spec/rspec.opts",
|
37
|
+
"spec/spec_helper.rb"
|
38
|
+
]
|
39
|
+
s.homepage = %q{http://github.com/eladkehat/gogetter}
|
40
|
+
s.licenses = ["MIT"]
|
41
|
+
s.require_paths = ["lib"]
|
42
|
+
s.rubygems_version = %q{1.7.2}
|
43
|
+
s.summary = %q{Go get something over HTTP}
|
44
|
+
|
45
|
+
if s.respond_to? :specification_version then
|
46
|
+
s.specification_version = 3
|
47
|
+
|
48
|
+
if Gem::Version.new(Gem::VERSION) >= Gem::Version.new('1.2.0') then
|
49
|
+
s.add_development_dependency(%q<rspec>, ["~> 2.6.0"])
|
50
|
+
s.add_development_dependency(%q<yard>, ["~> 0.7.2"])
|
51
|
+
s.add_development_dependency(%q<bundler>, ["~> 1.0.0"])
|
52
|
+
s.add_development_dependency(%q<jeweler>, ["~> 1.6.4"])
|
53
|
+
s.add_development_dependency(%q<fakeweb>, ["~> 1.3.0"])
|
54
|
+
else
|
55
|
+
s.add_dependency(%q<rspec>, ["~> 2.6.0"])
|
56
|
+
s.add_dependency(%q<yard>, ["~> 0.7.2"])
|
57
|
+
s.add_dependency(%q<bundler>, ["~> 1.0.0"])
|
58
|
+
s.add_dependency(%q<jeweler>, ["~> 1.6.4"])
|
59
|
+
s.add_dependency(%q<fakeweb>, ["~> 1.3.0"])
|
60
|
+
end
|
61
|
+
else
|
62
|
+
s.add_dependency(%q<rspec>, ["~> 2.6.0"])
|
63
|
+
s.add_dependency(%q<yard>, ["~> 0.7.2"])
|
64
|
+
s.add_dependency(%q<bundler>, ["~> 1.0.0"])
|
65
|
+
s.add_dependency(%q<jeweler>, ["~> 1.6.4"])
|
66
|
+
s.add_dependency(%q<fakeweb>, ["~> 1.3.0"])
|
67
|
+
end
|
68
|
+
end
|
69
|
+
|
@@ -0,0 +1,76 @@
|
|
1
|
+
# Monkey-patch Net::HTTPResponse
|
2
|
+
# Add a final_uri attribute, which we use in handling HTTP redirections to determine
|
3
|
+
# the ultimate URI that the response was retrieved from
|
4
|
+
class Net::HTTPResponse
|
5
|
+
attr_accessor :final_uri
|
6
|
+
end
|
7
|
+
|
8
|
+
|
9
|
+
module GoGetter

  # Sends an HTTP GET request and returns the response, following redirects
  # up to a configurable limit.
  #
  # uri          - a URI, or any object whose #to_s is a URL (normalized by parse_url).
  # http_headers - Hash of header name => value; each pair is added to the request.
  # params       - Hash of options. It is never modified by this method.
  #                :auth_user, :auth_pass   - basic authentication (used only when both are set)
  #                :proxy_host, :proxy_port - proxy to go through (used only when both are set)
  #                :proxy_user, :proxy_pass - proxy credentials
  #                :read_timeout            - assigned to the connection's read_timeout (default 600)
  #                :max_redirects           - number of redirects to follow (default 1)
  #                :uris_seen               - URIs already visited in the current redirect
  #                                           chain; maintained by handle_redirection
  #
  # Returns the response. For a non-redirect response, final_uri is set to the
  # URI it was fetched from; redirect responses are passed to handle_redirection.
  def GoGetter.get(uri, http_headers = {}, params = {})
    uri = parse_url(uri.to_s) unless uri.is_a? URI

    # Build the request path in a new string. Appending the query to uri.path
    # itself would corrupt the URI (and therefore final_uri and redirect-loop
    # detection). An empty path is requested as "/" because an HTTP request
    # path may not be empty.
    request_path = uri.path.to_s.empty? ? '/' : uri.path
    request_path = "#{request_path}?#{uri.query}" if uri.query
    request = Net::HTTP::Get.new(request_path)
    http_headers.each { |key, value| request.add_field key, value }

    # basic authentication
    if params[:auth_user] && params[:auth_pass]
      request.basic_auth(params[:auth_user], params[:auth_pass])
    end

    # proxy
    klass = if params[:proxy_host] && params[:proxy_port]
      Net::HTTP::Proxy(params[:proxy_host], params[:proxy_port], params[:proxy_user], params[:proxy_pass])
    else
      Net::HTTP
    end

    # parse_url accepts https URLs, so TLS has to be switched on for them.
    # The option is only passed for https, leaving plain http calls unchanged.
    start_args = [uri.host, uri.port]
    start_args << { :use_ssl => true } if uri.scheme.to_s.downcase == 'https'

    response = klass.start(*start_args) do |http|
      http.read_timeout = params.fetch(:read_timeout, 600)
      http.request(request)
    end

    if response.is_a?(Net::HTTPRedirection) # Redirect
      # allow for a single redirection by default (without touching the caller's hash)
      redirect_params = params.has_key?(:max_redirects) ? params : params.merge(:max_redirects => 1)
      response = handle_redirection(uri, response, http_headers, redirect_params)
    else
      response.final_uri = uri
    end

    response
  end

  # Given a URL, which may not be formatted properly, parse a URI.
  #
  # "http://" is prepended unless the string already starts with http:// or
  # https:// (case-insensitive), and a URL with neither path nor query gets
  # the path "/".
  #
  # Returns the URI produced by URI.parse.
  def GoGetter.parse_url(url)
    url = "http://#{url}" unless url =~ %r{\Ahttps?://}i
    uri = URI.parse(url)
    uri.path = "/" if uri.path.empty? && uri.query.nil?
    uri
  end

  # Follows the redirect described by response, if that is still allowed.
  #
  # from_uri     - the URI whose request produced response.
  # response     - the redirect response; its Location header names the target.
  # http_headers - headers to send with the follow-up request.
  # params       - options as for GoGetter.get. :max_redirects is the number of
  #                redirects that may still be followed; :uris_seen holds the
  #                URIs already visited. Neither the hash nor the passed-in
  #                collection is modified; the follow-up request receives
  #                updated copies.
  #
  # Returns the response of the follow-up GoGetter.get, or the given response
  # (with final_uri set to from_uri) when the redirect is not followed: limit
  # exhausted, no Location header, or a target that was already visited.
  def GoGetter.handle_redirection(from_uri, response, http_headers, params)
    remaining = params.fetch(:max_redirects, 0)
    location  = response['Location']

    if remaining > 0 && location
      seen = Set.new(params[:uris_seen]) << from_uri
      new_uri = URI.parse(location)
      # new uri may be just the path, w/o host and port; if so, copy from old
      unless new_uri.host
        new_uri.host = from_uri.host
        new_uri.port = from_uri.port
      end
      new_uri.scheme = from_uri.scheme unless new_uri.scheme

      # avoid infinite redirect loops
      unless seen.member?(new_uri)
        # The limit is enforced solely by counting :max_redirects down; also
        # comparing it against the number of URIs seen would count every hop twice.
        next_params = params.merge(:max_redirects => remaining - 1, :uris_seen => seen)
        return GoGetter.get(new_uri, http_headers, next_params)
      end
    end

    response.final_uri = from_uri
    response
  end

end
|
@@ -0,0 +1,16 @@
|
|
1
|
+
# Monkey-patch for Net::HTTPResponse
# This file isn't required by the gem by default, so require it in your code
require 'zlib'
require 'stringio'

class Net::HTTPResponse

  # Keep the stock #body reachable as #body_asis. The guard matters when this
  # file is loaded more than once: aliasing again would point body_asis at the
  # override below and make #body call itself forever.
  alias_method :body_asis, :body unless method_defined?(:body_asis)

  # New version of #body unzips a gzipped body before returning it.
  # Call GoGetter.get with the following in http_headers: "Accept-Encoding" => "gzip"
  #
  # Returns the decompressed body when the Content-Encoding header is gzip or
  # x-gzip (compared case-insensitively); otherwise returns the body as
  # received. A nil or empty body is returned untouched, since there is
  # nothing to decompress.
  def body
    raw = body_asis
    return raw if raw.nil? || raw.empty?

    encoding = self['Content-Encoding'].to_s.strip.downcase
    return raw unless encoding == 'gzip' || encoding == 'x-gzip'

    reader = Zlib::GzipReader.new(StringIO.new(raw))
    begin
      reader.read
    ensure
      # release the zlib stream even if the payload turns out to be corrupt
      reader.close
    end
  end
end
|
@@ -0,0 +1,17 @@
|
|
1
|
+
module GoGetter

  # Some user agents for use with websites that change their behavior according to your browser
  # Set by adding to http_headers: "User-Agent" => USER_AGENTS[:chrome10_linux]
  # Use http://www.useragentstring.com/pages/useragentstring.php to find more user agent strings
  #
  # The hash is frozen so that this shared constant cannot be altered by
  # accident; to use extra agents, build your own hash with USER_AGENTS.merge(...).
  USER_AGENTS = {
    :chrome10_win => "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.16 (KHTML, like Gecko) Chrome/10.0.638.0 Safari/534.16",
    :chrome10_linux => "Mozilla/5.0 (X11; U; Linux x86_64; en-US) AppleWebKit/534.16 (KHTML, like Gecko) Ubuntu/10.10 Chromium/10.0.648.0 Chrome/10.0.648.0 Safari/534.16",
    :firefox36_win => "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.2.8) Gecko/20100722 Firefox/3.6.8 ( .NET CLR 3.5.30729; .NET4.0C)",
    :firefox36_linux => "Mozilla/5.0 (X11; U; Linux x86_64; en-US; rv:1.9.2.8) Gecko/20100804 Gentoo Firefox/3.6.8",
    :ie8 => "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; WOW64; Trident/4.0; SLCC2; Media Center PC 6.0; InfoPath.2; MS-RTC LM 8)",
    :ie7 => "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; WOW64; SLCC2; .NET CLR 2.0.50727; InfoPath.3; .NET4.0C; .NET4.0E; .NET CLR 3.5.30729; .NET CLR 3.0.30729; MS-RTC LM 8)",
    :opera11_win => "Opera/9.80 (Windows NT 6.0; U; en) Presto/2.7.39 Version/11.00",
    :safari5_mac => "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_3; en-us) AppleWebKit/534.1+ (KHTML, like Gecko) Version/5.0 Safari/533.16",
  }.freeze

end
|
data/lib/go_getter.rb
ADDED
data/lib/gogetter.rb
ADDED
@@ -0,0 +1,3 @@
|
|
1
|
+
<html><head><meta http-equiv="content-type" content="text/html; charset=ISO-8859-1"><title>Google</title><style>body,td,a,p,.h{font-family:arial,sans-serif}.h{color:#36c;font-size:20px}.q{color:#00c}.ts td{padding:0}.ts{border-collapse:collapse}#gbar{height:22px;padding-left:2px}.gbh,.gbd{border-top:1px solid #c9d7f1;font-size:1px}.gbh{height:0;position:absolute;top:24px;width:100%}#gbi,#gbs{background:#fff;left:0;position:absolute;top:24px;visibility:hidden;z-index:1000}#gbi{border:1px solid;border-color:#c9d7f1 #36c #36c #a2bae7;z-index:1001}#guser{padding-bottom:7px !important}#gbar,#guser{font-size:13px;padding-top:1px !important}@media all{.gb1,.gb3{height:22px;margin-right:.73em;vertical-align:top}#gbar{float:left}}.gb2{display:block;padding:.2em .5em}a.gb1,a.gb2,a.gb3{color:#00c !important}.gb2,.gb3{text-decoration:none}a.gb2:hover{background:#36c;color:#fff !important}</style><script>window.google={kEI:"Zuk6ScOkLKHCMrrttckF",kEXPI:"17259,19124,19314",kHL:"en"};
|
2
|
+
google.y={};google.x=function(e,g){google.y[e.id]=[e,g];return false};function sf(){document.f.q.focus()}
|
3
|
+
window.gbar={};(function(){var b=window.gbar,f,h;b.qs=function(a){var c=window.encodeURIComponent&&(document.forms[0].q||"").value;if(c)a.href=a.href.replace(/([?&])q=[^&]*|$/,function(i,g){return(g||"&")+"q="+encodeURIComponent(c)})};function j(a,c){a.visibility=h?"hidden":"visible";a.left=c+"px"}b.tg=function(a){a=a||window.event;var c=0,i,g=window.navExtra,d=document.getElementById("gbi"),e=a.target||a.srcElement;a.cancelBubble=true;if(!f){f=document.createElement(Array.every||window.createPopup?"iframe":"div");f.frameBorder="0";f.src="#";d.parentNode.appendChild(f).id="gbs";if(g)for(i in g)d.insertBefore(g[i],d.firstChild).className="gb2";document.onclick=b.close}if(e.className!="gb3")e=e.parentNode;do c+=e.offsetLeft;while(e=e.offsetParent);j(d.style,c);f.style.width=d.offsetWidth+"px";f.style.height=d.offsetHeight+"px";j(f.style,c);h=!h};b.close=function(a){h&&b.tg(a)}})();</script></head><body bgcolor=#ffffff text=#000000 link=#0000cc vlink=#551a8b alink=#ff0000 onload="sf();if(document.images)new Image().src='/images/nav_logo3.png'" topmargin=3 marginheight=3><div id=gbar><nobr><b class=gb1>Web</b> <a href="http://images.google.com/imghp?hl=en&tab=wi" onclick=gbar.qs(this) class=gb1>Images</a> <a href="http://maps.google.com/maps?hl=en&tab=wl" onclick=gbar.qs(this) class=gb1>Maps</a> <a href="http://news.google.com/nwshp?hl=en&tab=wn" onclick=gbar.qs(this) class=gb1>News</a> <a href="http://www.google.com/prdhp?hl=en&tab=wf" onclick=gbar.qs(this) class=gb1>Shopping</a> <a href="http://mail.google.com/mail/?hl=en&tab=wm" class=gb1>Gmail</a> <a href="http://www.google.com/intl/en/options/" onclick="this.blur();gbar.tg(event);return !1" class=gb3><u>more</u> <small>▼</small></a><div id=gbi> <a href="http://video.google.com/?hl=en&tab=wv" onclick=gbar.qs(this) class=gb2>Video</a> <a href="http://groups.google.com/grphp?hl=en&tab=wg" onclick=gbar.qs(this) class=gb2>Groups</a> <a href="http://books.google.com/bkshp?hl=en&tab=wp" onclick=gbar.qs(this) 
class=gb2>Books</a> <a href="http://scholar.google.com/schhp?hl=en&tab=ws" onclick=gbar.qs(this) class=gb2>Scholar</a> <a href="http://finance.google.com/finance?hl=en&tab=we" onclick=gbar.qs(this) class=gb2>Finance</a> <a href="http://blogsearch.google.com/?hl=en&tab=wb" onclick=gbar.qs(this) class=gb2>Blogs</a> <div class=gb2><div class=gbd></div></div> <a href="http://www.youtube.com/?hl=en&tab=w1" onclick=gbar.qs(this) class=gb2>YouTube</a> <a href="http://www.google.com/calendar/render?hl=en&tab=wc" class=gb2>Calendar</a> <a href="http://picasaweb.google.com/home?hl=en&tab=wq" onclick=gbar.qs(this) class=gb2>Photos</a> <a href="http://docs.google.com/?hl=en&tab=wo" class=gb2>Documents</a> <a href="http://www.google.com/reader/view/?hl=en&tab=wy" class=gb2>Reader</a> <a href="http://sites.google.com/?hl=en&tab=w3" class=gb2>Sites</a> <div class=gb2><div class=gbd></div></div> <a href="http://www.google.com/intl/en/options/" class=gb2>even more »</a></div> </nobr></div><div class=gbh style=left:0></div><div class=gbh style=right:0></div><div align=right id=guser style="font-size:84%;padding:0 0 4px" width=100%><nobr><a href="/url?sa=p&pref=ig&pval=3&q=http://www.google.com/ig%3Fhl%3Den%26source%3Diglk&usg=AFQjCNFA18XPfgb7dKnXfKz7x7g1GDH1tg">iGoogle</a> | <a href="https://www.google.com/accounts/Login?continue=http://www.google.com/&hl=en">Sign in</a></nobr></div><center><br clear=all id=lgpd><img alt="Google" height=110 src="/intl/en_ALL/images/logo.gif" width=276><br><br><form action="/search" name=f><table cellpadding=0 cellspacing=0><tr valign=top><td width=25%> </td><td align=center nowrap><input name=hl type=hidden value=en><input type=hidden name=ie value="ISO-8859-1"><input autocomplete="off" maxlength=2048 name=q size=55 title="Google Search" value=""><br><input name=btnG type=submit value="Google Search"><input name=btnI type=submit value="I'm Feeling Lucky"></td><td nowrap width=25%><font size=-2> <a href=/advanced_search?hl=en>Advanced Search</a><br> 
<a href=/preferences?hl=en>Preferences</a><br> <a href=/language_tools?hl=en>Language Tools</a></font></td></tr></table></form><br><br><font size=-1><a href="/intl/en/ads/">Advertising Programs</a> - <a href="/services/">Business Solutions</a> - <a href="/intl/en/about.html">About Google</a></font><p><font size=-2>©2008 - <a href="/intl/en/privacy.html">Privacy</a></font></p></center></body><script>if(google.y)google.y.first=[];window.setTimeout(function(){var xjs=document.createElement('script');xjs.src='/extern_js/f/CgJlbhICdXMgACswCjgMLCswDjgCLCswGDgDLA/8MIofMT_4o8.js';document.getElementsByTagName('head')[0].appendChild(xjs)},0);google.y.first.push(function(){google.ac.i(document.f,document.f.q,'','')})</script></html>
|
@@ -0,0 +1,151 @@
|
|
1
|
+
require File.expand_path(File.dirname(__FILE__) + '/../spec_helper')
|
2
|
+
|
3
|
+
describe "GoGetter" do
|
4
|
+
|
5
|
+
describe "#parse_url" do
|
6
|
+
it "returns a URI instance, given a URL string" do
|
7
|
+
uri = GoGetter.parse_url('http://www.google.com/')
|
8
|
+
uri.scheme.should == 'http'
|
9
|
+
uri.host.should == 'www.google.com'
|
10
|
+
uri.port.should == 80
|
11
|
+
uri.path.should == '/'
|
12
|
+
end
|
13
|
+
|
14
|
+
context "when the URL has no scheme" do
|
15
|
+
it "prepends http" do
|
16
|
+
uri = GoGetter.parse_url('www.google.com')
|
17
|
+
uri.scheme.should == 'http'
|
18
|
+
end
|
19
|
+
end
|
20
|
+
|
21
|
+
context "when the URL has no path" do
|
22
|
+
it "appends '/'" do
|
23
|
+
uri = GoGetter.parse_url('www.google.com')
|
24
|
+
uri.path.should == '/'
|
25
|
+
end
|
26
|
+
end
|
27
|
+
|
28
|
+
context "when the URL has a path" do
|
29
|
+
it "retains the path" do
|
30
|
+
uri = GoGetter.parse_url('www.google.com/search')
|
31
|
+
uri.path.should == '/search'
|
32
|
+
end
|
33
|
+
end
|
34
|
+
|
35
|
+
context "when the URL has a query part" do
|
36
|
+
it "retains the query" do
|
37
|
+
uri = GoGetter.parse_url('www.google.com/search?q=gogetter&hl=en')
|
38
|
+
uri.query.should == 'q=gogetter&hl=en'
|
39
|
+
end
|
40
|
+
end
|
41
|
+
end
|
42
|
+
|
43
|
+
describe "#handle_redirection" do
|
44
|
+
context "when the max_redirects param is 0" do
|
45
|
+
it "should not send another request" do
|
46
|
+
GoGetter.should_not_receive(:get)
|
47
|
+
response = Net::HTTPRedirection.new('1.1', '302', 'Found')
|
48
|
+
response['Location'] = 'http://www.google.co.il/'
|
49
|
+
GoGetter.handle_redirection(URI.parse('http://www.google.com/'), response, {}, {max_redirects: 0}).should == response
|
50
|
+
end
|
51
|
+
end
|
52
|
+
|
53
|
+
context "when the max_redirects param > 0" do
|
54
|
+
it "should send another get request to the new location" do
|
55
|
+
response = Net::HTTPRedirection.new('1.1', '302', 'Found')
|
56
|
+
response['Location'] = 'http://www.google.co.il/'
|
57
|
+
GoGetter.should_receive(:get).with(URI.parse('http://www.google.co.il'), {}, hash_including(:max_redirects=>0))
|
58
|
+
GoGetter.handle_redirection(URI.parse('http://www.google.com/'), response, {}, {max_redirects: 1})
|
59
|
+
end
|
60
|
+
end
|
61
|
+
|
62
|
+
context "when the response location is only a path (no host)" do
|
63
|
+
it "uses the host from the original URI" do
|
64
|
+
response = Net::HTTPRedirection.new('1.1', '302', 'Found')
|
65
|
+
response['Location'] = '/new_target'
|
66
|
+
exp_uri = URI.parse('/new_target')
|
67
|
+
exp_uri.host = 'www.google.com'; exp_uri.port = 80; exp_uri.scheme = 'http'
|
68
|
+
GoGetter.should_receive(:get).with(exp_uri, {}, hash_including(:max_redirects=>0))
|
69
|
+
GoGetter.handle_redirection(URI.parse('http://www.google.com/'), response, {}, {max_redirects: 1})
|
70
|
+
end
|
71
|
+
end
|
72
|
+
|
73
|
+
context "when the response redirects to a location that had already been got in the recursive chain" do
|
74
|
+
it "should not send another request" do
|
75
|
+
GoGetter.should_not_receive(:get)
|
76
|
+
response = Net::HTTPRedirection.new('1.1', '302', 'Found')
|
77
|
+
response['Location'] = 'http://www.google.co.il/'
|
78
|
+
GoGetter.handle_redirection(
|
79
|
+
URI.parse('http://www.google.com/'),
|
80
|
+
response, {},
|
81
|
+
{:max_redirects=>4,
|
82
|
+
:uris_seen => Set.new([URI.parse('http://www.google.co.il/'), URI.parse('http://www.yahoo.com/')])}
|
83
|
+
).should == response
|
84
|
+
end
|
85
|
+
end
|
86
|
+
end
|
87
|
+
|
88
|
+
describe "#get" do
|
89
|
+
context "when given a URL" do
|
90
|
+
it "should get it" do
|
91
|
+
url = "http://google.html/"
|
92
|
+
body = file_fixture('google.html')
|
93
|
+
FakeWeb.register_uri(:get, url, :status => ['200', 'OK'], :body => body)
|
94
|
+
response = GoGetter.get url
|
95
|
+
response.should be_a(Net::HTTPOK)
|
96
|
+
response.body.should == body
|
97
|
+
end
|
98
|
+
end
|
99
|
+
|
100
|
+
context "when given a URI" do
|
101
|
+
it "should get that too" do
|
102
|
+
url = "http://google.html/"
|
103
|
+
body = file_fixture('google.html')
|
104
|
+
FakeWeb.register_uri(:get, url, :status => ['200', 'OK'], :body => body)
|
105
|
+
response = GoGetter.get URI.parse(url)
|
106
|
+
response.should be_a(Net::HTTPOK)
|
107
|
+
response.body.should == body
|
108
|
+
end
|
109
|
+
end
|
110
|
+
|
111
|
+
context "when given basic auth params" do
|
112
|
+
it "should do basic authentication" do
|
113
|
+
url = 'http://example.com/secret'
|
114
|
+
url_auth = 'http://user:pass@example.com/secret'
|
115
|
+
FakeWeb.register_uri(:get, url, :body => "Unauthorized", :status => ["401", "Unauthorized"])
|
116
|
+
FakeWeb.register_uri(:get, url_auth, :status => ['200', 'OK'], :body => "Authorized")
|
117
|
+
GoGetter.get(url).should be_a(Net::HTTPUnauthorized)
|
118
|
+
GoGetter.get(url, {}, {:auth_user => 'user', :auth_pass => 'pass'}).should be_a(Net::HTTPOK)
|
119
|
+
end
|
120
|
+
end
|
121
|
+
|
122
|
+
context "when given proxy params" do
|
123
|
+
it "should use a proxy" do
|
124
|
+
url = "http://google.html/"
|
125
|
+
#proxy:
|
126
|
+
host = 'proxy.example.com'; port = '8080'
|
127
|
+
user = 'user'; pass = 'pass'
|
128
|
+
#proxy_class = Net::HTTP::Proxy(proxy_host, proxy_port, proxy_user, proxy_pass)
|
129
|
+
FakeWeb.register_uri(:get, url, :status => ['200', 'OK'], :body => file_fixture('google.html'))
|
130
|
+
Net::HTTP.should_receive(:Proxy).with(host, port, user, pass)
|
131
|
+
# I was unable to mock proxy behavior properly and #get keeps raising errors
|
132
|
+
# However, this code still tests that a proxy class is created, which is the whole point
|
133
|
+
expect {
|
134
|
+
GoGetter.get(url, {}, {proxy_host: host,proxy_port: port,proxy_user: user,proxy_pass: pass})
|
135
|
+
}.to raise_error
|
136
|
+
end
|
137
|
+
end
|
138
|
+
|
139
|
+
context "when the response is a redirect" do
|
140
|
+
it "does redirection" do
|
141
|
+
url1 = "http://www.google.com/"
|
142
|
+
url2 = "http://www.google.co.il/"
|
143
|
+
body = file_fixture('google.redirect.html')
|
144
|
+
FakeWeb.register_uri(:get, url1, :status => ['302','Found'], :headers => {'Location'=>url2},:body => body)
|
145
|
+
params = {max_redirects: 1}
|
146
|
+
GoGetter.should_receive(:handle_redirection).with(URI.parse(url1), an_instance_of(Net::HTTPFound), {}, params)
|
147
|
+
GoGetter.get(url1, {}, params)
|
148
|
+
end
|
149
|
+
end
|
150
|
+
end
|
151
|
+
end
|
data/spec/rspec.opts
ADDED
data/spec/spec_helper.rb
ADDED
@@ -0,0 +1,21 @@
|
|
1
|
+
$LOAD_PATH.unshift(File.join(File.dirname(__FILE__), '..', 'lib'))
|
2
|
+
$LOAD_PATH.unshift(File.dirname(__FILE__))
|
3
|
+
require 'rspec'
|
4
|
+
require 'gogetter'
|
5
|
+
require 'fakeweb'
|
6
|
+
|
7
|
+
# Reads a fixture file and returns its contents as a String.
#
# filename - name of a file inside the "fixtures" directory that sits next to
#            this file.
#
# File.read opens, reads and closes the file in one call, so no file handle is
# left open, and it does not go through Kernel#open's special handling of
# names that start with "|".
def file_fixture(filename)
  File.read(File.join(File.dirname(__FILE__), 'fixtures', "#{filename}"))
end
|
10
|
+
# Requires supporting files with custom matchers and macros, etc,
|
11
|
+
# in ./support/ and its subdirectories.
|
12
|
+
Dir["#{File.dirname(__FILE__)}/support/**/*.rb"].each {|f| require f}
|
13
|
+
|
14
|
+
RSpec.configure do |config|
|
15
|
+
config.before(:suite) do
|
16
|
+
FakeWeb.allow_net_connect = false
|
17
|
+
end
|
18
|
+
config.after(:suite) do
|
19
|
+
FakeWeb.allow_net_connect = true
|
20
|
+
end
|
21
|
+
end
|
metadata
ADDED
@@ -0,0 +1,124 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: gogetter
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.1.0
|
5
|
+
prerelease:
|
6
|
+
platform: ruby
|
7
|
+
authors:
|
8
|
+
- Elad Kehat
|
9
|
+
autorequire:
|
10
|
+
bindir: bin
|
11
|
+
cert_chain: []
|
12
|
+
date: 2011-07-25 00:00:00.000000000Z
|
13
|
+
dependencies:
|
14
|
+
- !ruby/object:Gem::Dependency
|
15
|
+
name: rspec
|
16
|
+
requirement: &74419330 !ruby/object:Gem::Requirement
|
17
|
+
none: false
|
18
|
+
requirements:
|
19
|
+
- - ~>
|
20
|
+
- !ruby/object:Gem::Version
|
21
|
+
version: 2.6.0
|
22
|
+
type: :development
|
23
|
+
prerelease: false
|
24
|
+
version_requirements: *74419330
|
25
|
+
- !ruby/object:Gem::Dependency
|
26
|
+
name: yard
|
27
|
+
requirement: &74418990 !ruby/object:Gem::Requirement
|
28
|
+
none: false
|
29
|
+
requirements:
|
30
|
+
- - ~>
|
31
|
+
- !ruby/object:Gem::Version
|
32
|
+
version: 0.7.2
|
33
|
+
type: :development
|
34
|
+
prerelease: false
|
35
|
+
version_requirements: *74418990
|
36
|
+
- !ruby/object:Gem::Dependency
|
37
|
+
name: bundler
|
38
|
+
requirement: &74418500 !ruby/object:Gem::Requirement
|
39
|
+
none: false
|
40
|
+
requirements:
|
41
|
+
- - ~>
|
42
|
+
- !ruby/object:Gem::Version
|
43
|
+
version: 1.0.0
|
44
|
+
type: :development
|
45
|
+
prerelease: false
|
46
|
+
version_requirements: *74418500
|
47
|
+
- !ruby/object:Gem::Dependency
|
48
|
+
name: jeweler
|
49
|
+
requirement: &74417980 !ruby/object:Gem::Requirement
|
50
|
+
none: false
|
51
|
+
requirements:
|
52
|
+
- - ~>
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: 1.6.4
|
55
|
+
type: :development
|
56
|
+
prerelease: false
|
57
|
+
version_requirements: *74417980
|
58
|
+
- !ruby/object:Gem::Dependency
|
59
|
+
name: fakeweb
|
60
|
+
requirement: &74417470 !ruby/object:Gem::Requirement
|
61
|
+
none: false
|
62
|
+
requirements:
|
63
|
+
- - ~>
|
64
|
+
- !ruby/object:Gem::Version
|
65
|
+
version: 1.3.0
|
66
|
+
type: :development
|
67
|
+
prerelease: false
|
68
|
+
version_requirements: *74417470
|
69
|
+
description:
|
70
|
+
email: eladkehat@gmail.com
|
71
|
+
executables: []
|
72
|
+
extensions: []
|
73
|
+
extra_rdoc_files:
|
74
|
+
- LICENSE.txt
|
75
|
+
- README.rdoc
|
76
|
+
files:
|
77
|
+
- .document
|
78
|
+
- .rspec
|
79
|
+
- Gemfile
|
80
|
+
- Gemfile.lock
|
81
|
+
- LICENSE.txt
|
82
|
+
- README.rdoc
|
83
|
+
- Rakefile
|
84
|
+
- VERSION
|
85
|
+
- gogetter.gemspec
|
86
|
+
- lib/go_getter.rb
|
87
|
+
- lib/go_getter/go_getter.rb
|
88
|
+
- lib/go_getter/response.rb
|
89
|
+
- lib/go_getter/utils.rb
|
90
|
+
- lib/gogetter.rb
|
91
|
+
- spec/fixtures/google.html
|
92
|
+
- spec/fixtures/google.redirect.html
|
93
|
+
- spec/go_getter/go_getter_spec.rb
|
94
|
+
- spec/rspec.opts
|
95
|
+
- spec/spec_helper.rb
|
96
|
+
homepage: http://github.com/eladkehat/gogetter
|
97
|
+
licenses:
|
98
|
+
- MIT
|
99
|
+
post_install_message:
|
100
|
+
rdoc_options: []
|
101
|
+
require_paths:
|
102
|
+
- lib
|
103
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
104
|
+
none: false
|
105
|
+
requirements:
|
106
|
+
- - ! '>='
|
107
|
+
- !ruby/object:Gem::Version
|
108
|
+
version: '0'
|
109
|
+
segments:
|
110
|
+
- 0
|
111
|
+
hash: 740502813
|
112
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
113
|
+
none: false
|
114
|
+
requirements:
|
115
|
+
- - ! '>='
|
116
|
+
- !ruby/object:Gem::Version
|
117
|
+
version: '0'
|
118
|
+
requirements: []
|
119
|
+
rubyforge_project:
|
120
|
+
rubygems_version: 1.7.2
|
121
|
+
signing_key:
|
122
|
+
specification_version: 3
|
123
|
+
summary: Go get something over HTTP
|
124
|
+
test_files: []
|