cobweb 1.0.4 → 1.0.5

Sign up to get free protection for your applications and to get access to all the features.
data/README.textile CHANGED
@@ -1,5 +1,5 @@
1
1
 
2
- h1. Cobweb v1.0.4
2
+ h1. Cobweb v1.0.5
3
3
 
4
4
  "@cobweb_gem":https://twitter.com/cobweb_gem
5
5
 
data/lib/cobweb.rb CHANGED
@@ -139,21 +139,20 @@ class Cobweb
139
139
  @http.read_timeout = @options[:timeout].to_i
140
140
  @http.open_timeout = @options[:timeout].to_i
141
141
  begin
142
- print "Retrieving #{url }... " unless @options[:quiet]
142
+ puts "Retrieving #{url }... " unless @options[:quiet]
143
143
  request_options={}
144
144
  request_options['Cookie']= options[:cookies] if options.has_key?(:cookies)
145
145
  request_options['User-Agent']= options[:user_agent] if options.has_key?(:user_agent)
146
146
 
147
147
  request = Net::HTTP::Get.new uri.request_uri, request_options
148
-
149
148
  response = @http.request request
150
-
149
+
151
150
  if @options[:follow_redirects] and response.code.to_i >= 300 and response.code.to_i < 400
152
151
  puts "redirected... " unless @options[:quiet]
153
-
152
+
154
153
  # get location to redirect to
155
154
  uri = UriHelper.join_no_fragment(uri, response['location'])
156
-
155
+
157
156
  # decrement redirect limit
158
157
  redirect_limit = redirect_limit - 1
159
158
 
@@ -162,9 +161,10 @@ class Cobweb
162
161
 
163
162
  # get the content from redirect location
164
163
  content = get(uri, options.merge(:redirect_limit => redirect_limit, :cookies => cookies))
165
- content[:url] = uri.to_s
166
- content[:redirect_through] = [] if content[:redirect_through].nil?
164
+
165
+ content[:redirect_through] = [uri.to_s] if content[:redirect_through].nil?
167
166
  content[:redirect_through].insert(0, url)
167
+ content[:url] = content[:redirect_through].last
168
168
 
169
169
  content[:response_time] = Time.now.to_f - request_time
170
170
  else
@@ -3,7 +3,7 @@ class CobwebVersion
3
3
 
4
4
  # Returns a string of the current version
5
5
  def self.version
6
- "1.0.4"
6
+ "1.0.5"
7
7
  end
8
8
 
9
9
  end
@@ -3,8 +3,8 @@ require File.expand_path(File.dirname(__FILE__) + '/../spec_helper')
3
3
  describe CobwebModule::Crawl, :local_only => true do
4
4
 
5
5
  before(:each) do
6
- @local_redis = {:host => "127.0.0.1", :port => 6379}
7
- @remote_redis = {:host => "192.168.100.16", :port => 6379}
6
+ @local_redis = {:host => "localhost", :port => 6379}
7
+ @remote_redis = {:host => "remote-redis", :port => 6379}
8
8
 
9
9
  @request = {:crawl_id => "test_crawl_id"}
10
10
  end
@@ -9,6 +9,8 @@ describe Cobweb, :local_only => true, :disabled => true do
9
9
  # START WORKERS ONLY FOR CRAWL QUEUE SO WE CAN COUNT ENQUEUED PROCESS AND FINISH QUEUES
10
10
  puts "Starting Workers... Please Wait..."
11
11
  `mkdir log`
12
+ `mkdir tmp`
13
+ `mkdir tmp/pids`
12
14
  io = IO.popen("nohup rake resque:workers PIDFILE=./tmp/pids/resque.pid COUNT=10 QUEUE=cobweb_crawl_job > log/output.log &")
13
15
  puts "Workers Started."
14
16
 
@@ -90,34 +90,21 @@ describe Cobweb do
90
90
  before(:each) do
91
91
  @base_url = "http://redirect-me.com/redirect.html"
92
92
  @cobweb = Cobweb.new(:follow_redirects => true, :quiet => true, :cache => nil)
93
-
94
- @mock_http_response.stub!(:[]).with("location").and_return("http://google.com/")
95
- @mock_http_redirect_response.stub!(:[]).with("location").and_return("http://redirected-to.com/redirect2.html")
96
- @mock_http_redirect_response2.stub!(:[]).with("location").and_return("http://redirected-to.com/redirected.html")
97
-
98
93
  end
99
94
 
100
- it "should flow through redirect" #do
101
-
102
- #@mock_http_client.should_receive(:request).with(@mock_http_redirect_request).and_return(@mock_http_redirect_response)
103
- #@mock_http_client.should_receive(:request).with(@mock_http_redirect_request).and_return(@mock_http_redirect_response)
104
- #
105
- #content = @cobweb.get(@base_url)
106
- #content.should be_an_instance_of HashHelper
107
- #ap content
108
- #content[:url].should == "http://redirect-me.com/redirect.html"
109
- #content[:redirect_through].length.should == 2
110
- #content[:mime_type].should == "text/html"
111
- #content[:body].should == "asdf"
95
+ it "should return final page from redirects" do
96
+ content = @cobweb.get(@base_url)
97
+ content.should be_an_instance_of Hash
98
+ content[:url].should == "http://redirected-to.com/redirected.html"
99
+ content[:mime_type].should == "text/html"
100
+ content[:body].should == "asdf"
101
+ end
102
+ it "should return the path followed" do
112
103
 
113
- #end
114
- it "should return the path followed" #do
115
- #@mock_http_client.should_receive(:request).with(@mock_http_redirect_request).and_return(@mock_http_redirect_response)
116
- #
117
- #content = @cobweb.get(@base_url)
118
- #content[:redirect_through].should == ["http://redirected-to.com/redirect2.html", "http://redirected-to.com/redirected.html"]
104
+ content = @cobweb.get(@base_url)
105
+ content[:redirect_through].should == ["http://redirect-me.com/redirect.html", "http://redirected-to.com/redirect2.html", "http://redirected-to.com/redirected.html"]
119
106
 
120
- #end
107
+ end
121
108
  it "should not follow with redirect disabled" do
122
109
  @cobweb = Cobweb.new(:follow_redirects => false, :cache => 3)
123
110
  @mock_http_client.should_receive(:request).with(@mock_http_redirect_request).and_return(@mock_http_redirect_response)
data/spec/spec_helper.rb CHANGED
@@ -9,7 +9,7 @@ APP_ROOT = File.expand_path(File.dirname(__FILE__) + '/../')
9
9
 
10
10
  RSpec.configure do |config|
11
11
 
12
- unless ENV["TRAVIS_RUBY_VERSION"].nil?
12
+ if ENV["TRAVIS_RUBY_VERSION"] || ENV['CI']
13
13
  config.filter_run_excluding :local_only => true
14
14
  end
15
15
 
@@ -63,9 +63,10 @@ RSpec.configure do |config|
63
63
 
64
64
  Net::HTTP.stub!(:new).and_return(@mock_http_client)
65
65
  Net::HTTP::Get.stub!(:new).and_return(@mock_http_request)
66
- Net::HTTP::Get.stub!(:new).with("/redirect.html", {"User-Agent"=>"cobweb/#{CobwebVersion.version} (ruby/#{RUBY_VERSION} nokogiri/#{Nokogiri::VERSION})"}).and_return(@mock_http_redirect_request)
67
- Net::HTTP::Get.stub!(:new).with("/robots.txt", {"User-Agent"=>"cobweb/#{CobwebVersion.version} (ruby/#{RUBY_VERSION} nokogiri/#{Nokogiri::VERSION})"}).and_return(@mock_http_robot_request)
68
- Net::HTTP::Get.stub!(:new).with("/redirect2.html", {"User-Agent"=>"cobweb/#{CobwebVersion.version} (ruby/#{RUBY_VERSION} nokogiri/#{Nokogiri::VERSION})"}).and_return(@mock_http_redirect_request2)
66
+ Net::HTTP::Get.stub!(:new).with("/redirect.html", an_instance_of(Hash)).and_return(@mock_http_redirect_request)
67
+ Net::HTTP::Get.stub!(:new).with("/robots.txt", an_instance_of(Hash)).and_return(@mock_http_robot_request)
68
+ Net::HTTP::Get.stub!(:new).with("/redirect2.html", an_instance_of(Hash)).and_return(@mock_http_redirect_request2)
69
+ Net::HTTP::Get.stub!(:new).with("/redirected.html", an_instance_of(Hash)).and_return(@mock_http_request)
69
70
 
70
71
  Net::HTTP::Head.stub!(:new).and_return(@mock_http_request)
71
72
 
@@ -77,7 +78,7 @@ RSpec.configure do |config|
77
78
  @mock_http_client.stub!(:open_timeout=).and_return(nil)
78
79
  @mock_http_client.stub!(:start).and_return(@mock_http_response)
79
80
  @mock_http_client.stub!(:address).and_return("www.baseurl.com")
80
- @mock_http_client.stub!(:port).and_return("80 ")
81
+ @mock_http_client.stub!(:port).and_return("80")
81
82
 
82
83
  @mock_http_robot_response.stub!(:code).and_return(200)
83
84
  @mock_http_robot_response.stub!(:body).and_return(File.open(File.dirname(__FILE__) + '/../spec/samples/robots.txt', "r").read)
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: cobweb
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.0.4
4
+ version: 1.0.5
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,11 +9,11 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2013-01-15 00:00:00.000000000 Z
12
+ date: 2013-02-08 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: resque
16
- requirement: &70312117054720 !ruby/object:Gem::Requirement
16
+ requirement: !ruby/object:Gem::Requirement
17
17
  none: false
18
18
  requirements:
19
19
  - - ! '>='
@@ -21,10 +21,15 @@ dependencies:
21
21
  version: '0'
22
22
  type: :runtime
23
23
  prerelease: false
24
- version_requirements: *70312117054720
24
+ version_requirements: !ruby/object:Gem::Requirement
25
+ none: false
26
+ requirements:
27
+ - - ! '>='
28
+ - !ruby/object:Gem::Version
29
+ version: '0'
25
30
  - !ruby/object:Gem::Dependency
26
31
  name: redis
27
- requirement: &70312117053180 !ruby/object:Gem::Requirement
32
+ requirement: !ruby/object:Gem::Requirement
28
33
  none: false
29
34
  requirements:
30
35
  - - ! '>='
@@ -32,10 +37,15 @@ dependencies:
32
37
  version: '0'
33
38
  type: :runtime
34
39
  prerelease: false
35
- version_requirements: *70312117053180
40
+ version_requirements: !ruby/object:Gem::Requirement
41
+ none: false
42
+ requirements:
43
+ - - ! '>='
44
+ - !ruby/object:Gem::Version
45
+ version: '0'
36
46
  - !ruby/object:Gem::Dependency
37
47
  name: nokogiri
38
- requirement: &70312117051880 !ruby/object:Gem::Requirement
48
+ requirement: !ruby/object:Gem::Requirement
39
49
  none: false
40
50
  requirements:
41
51
  - - ! '>='
@@ -43,10 +53,15 @@ dependencies:
43
53
  version: '0'
44
54
  type: :runtime
45
55
  prerelease: false
46
- version_requirements: *70312117051880
56
+ version_requirements: !ruby/object:Gem::Requirement
57
+ none: false
58
+ requirements:
59
+ - - ! '>='
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
47
62
  - !ruby/object:Gem::Dependency
48
63
  name: addressable
49
- requirement: &70312117050680 !ruby/object:Gem::Requirement
64
+ requirement: !ruby/object:Gem::Requirement
50
65
  none: false
51
66
  requirements:
52
67
  - - ! '>='
@@ -54,10 +69,15 @@ dependencies:
54
69
  version: '0'
55
70
  type: :runtime
56
71
  prerelease: false
57
- version_requirements: *70312117050680
72
+ version_requirements: !ruby/object:Gem::Requirement
73
+ none: false
74
+ requirements:
75
+ - - ! '>='
76
+ - !ruby/object:Gem::Version
77
+ version: '0'
58
78
  - !ruby/object:Gem::Dependency
59
79
  name: rspec
60
- requirement: &70312117049660 !ruby/object:Gem::Requirement
80
+ requirement: !ruby/object:Gem::Requirement
61
81
  none: false
62
82
  requirements:
63
83
  - - ! '>='
@@ -65,10 +85,15 @@ dependencies:
65
85
  version: '0'
66
86
  type: :runtime
67
87
  prerelease: false
68
- version_requirements: *70312117049660
88
+ version_requirements: !ruby/object:Gem::Requirement
89
+ none: false
90
+ requirements:
91
+ - - ! '>='
92
+ - !ruby/object:Gem::Version
93
+ version: '0'
69
94
  - !ruby/object:Gem::Dependency
70
95
  name: awesome_print
71
- requirement: &70312117048900 !ruby/object:Gem::Requirement
96
+ requirement: !ruby/object:Gem::Requirement
72
97
  none: false
73
98
  requirements:
74
99
  - - ! '>='
@@ -76,10 +101,15 @@ dependencies:
76
101
  version: '0'
77
102
  type: :runtime
78
103
  prerelease: false
79
- version_requirements: *70312117048900
104
+ version_requirements: !ruby/object:Gem::Requirement
105
+ none: false
106
+ requirements:
107
+ - - ! '>='
108
+ - !ruby/object:Gem::Version
109
+ version: '0'
80
110
  - !ruby/object:Gem::Dependency
81
111
  name: sinatra
82
- requirement: &70312117047160 !ruby/object:Gem::Requirement
112
+ requirement: !ruby/object:Gem::Requirement
83
113
  none: false
84
114
  requirements:
85
115
  - - ! '>='
@@ -87,10 +117,15 @@ dependencies:
87
117
  version: '0'
88
118
  type: :runtime
89
119
  prerelease: false
90
- version_requirements: *70312117047160
120
+ version_requirements: !ruby/object:Gem::Requirement
121
+ none: false
122
+ requirements:
123
+ - - ! '>='
124
+ - !ruby/object:Gem::Version
125
+ version: '0'
91
126
  - !ruby/object:Gem::Dependency
92
127
  name: thin
93
- requirement: &70312117043520 !ruby/object:Gem::Requirement
128
+ requirement: !ruby/object:Gem::Requirement
94
129
  none: false
95
130
  requirements:
96
131
  - - ! '>='
@@ -98,10 +133,15 @@ dependencies:
98
133
  version: '0'
99
134
  type: :runtime
100
135
  prerelease: false
101
- version_requirements: *70312117043520
136
+ version_requirements: !ruby/object:Gem::Requirement
137
+ none: false
138
+ requirements:
139
+ - - ! '>='
140
+ - !ruby/object:Gem::Version
141
+ version: '0'
102
142
  - !ruby/object:Gem::Dependency
103
143
  name: haml
104
- requirement: &70312117041060 !ruby/object:Gem::Requirement
144
+ requirement: !ruby/object:Gem::Requirement
105
145
  none: false
106
146
  requirements:
107
147
  - - ! '>='
@@ -109,10 +149,15 @@ dependencies:
109
149
  version: '0'
110
150
  type: :runtime
111
151
  prerelease: false
112
- version_requirements: *70312117041060
152
+ version_requirements: !ruby/object:Gem::Requirement
153
+ none: false
154
+ requirements:
155
+ - - ! '>='
156
+ - !ruby/object:Gem::Version
157
+ version: '0'
113
158
  - !ruby/object:Gem::Dependency
114
159
  name: namespaced_redis
115
- requirement: &70312117038160 !ruby/object:Gem::Requirement
160
+ requirement: !ruby/object:Gem::Requirement
116
161
  none: false
117
162
  requirements:
118
163
  - - ! '>='
@@ -120,10 +165,15 @@ dependencies:
120
165
  version: '0'
121
166
  type: :runtime
122
167
  prerelease: false
123
- version_requirements: *70312117038160
168
+ version_requirements: !ruby/object:Gem::Requirement
169
+ none: false
170
+ requirements:
171
+ - - ! '>='
172
+ - !ruby/object:Gem::Version
173
+ version: '0'
124
174
  - !ruby/object:Gem::Dependency
125
175
  name: json
126
- requirement: &70312117032620 !ruby/object:Gem::Requirement
176
+ requirement: !ruby/object:Gem::Requirement
127
177
  none: false
128
178
  requirements:
129
179
  - - ! '>='
@@ -131,7 +181,12 @@ dependencies:
131
181
  version: '0'
132
182
  type: :runtime
133
183
  prerelease: false
134
- version_requirements: *70312117032620
184
+ version_requirements: !ruby/object:Gem::Requirement
185
+ none: false
186
+ requirements:
187
+ - - ! '>='
188
+ - !ruby/object:Gem::Version
189
+ version: '0'
135
190
  description: Cobweb is a web crawler that can use resque to cluster crawls to quickly
136
191
  crawl extremely large sites which is much more performant than multi-threaded crawlers. It
137
192
  is also a standalone crawler that has a sophisticated statistics monitoring interface
@@ -514,7 +569,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
514
569
  version: '0'
515
570
  requirements: []
516
571
  rubyforge_project:
517
- rubygems_version: 1.8.10
572
+ rubygems_version: 1.8.24
518
573
  signing_key:
519
574
  specification_version: 3
520
575
  summary: Cobweb is a web crawler that can use resque to cluster crawls to quickly