cobweb 1.0.4 → 1.0.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/README.textile CHANGED
@@ -1,5 +1,5 @@
1
1
 
2
- h1. Cobweb v1.0.4
2
+ h1. Cobweb v1.0.5
3
3
 
4
4
  "@cobweb_gem":https://twitter.com/cobweb_gem
5
5
 
data/lib/cobweb.rb CHANGED
@@ -139,21 +139,20 @@ class Cobweb
139
139
  @http.read_timeout = @options[:timeout].to_i
140
140
  @http.open_timeout = @options[:timeout].to_i
141
141
  begin
142
- print "Retrieving #{url }... " unless @options[:quiet]
142
+ puts "Retrieving #{url }... " unless @options[:quiet]
143
143
  request_options={}
144
144
  request_options['Cookie']= options[:cookies] if options.has_key?(:cookies)
145
145
  request_options['User-Agent']= options[:user_agent] if options.has_key?(:user_agent)
146
146
 
147
147
  request = Net::HTTP::Get.new uri.request_uri, request_options
148
-
149
148
  response = @http.request request
150
-
149
+
151
150
  if @options[:follow_redirects] and response.code.to_i >= 300 and response.code.to_i < 400
152
151
  puts "redirected... " unless @options[:quiet]
153
-
152
+
154
153
  # get location to redirect to
155
154
  uri = UriHelper.join_no_fragment(uri, response['location'])
156
-
155
+
157
156
  # decrement redirect limit
158
157
  redirect_limit = redirect_limit - 1
159
158
 
@@ -162,9 +161,10 @@ class Cobweb
162
161
 
163
162
  # get the content from redirect location
164
163
  content = get(uri, options.merge(:redirect_limit => redirect_limit, :cookies => cookies))
165
- content[:url] = uri.to_s
166
- content[:redirect_through] = [] if content[:redirect_through].nil?
164
+
165
+ content[:redirect_through] = [uri.to_s] if content[:redirect_through].nil?
167
166
  content[:redirect_through].insert(0, url)
167
+ content[:url] = content[:redirect_through].last
168
168
 
169
169
  content[:response_time] = Time.now.to_f - request_time
170
170
  else
@@ -3,7 +3,7 @@ class CobwebVersion
3
3
 
4
4
  # Returns a string of the current version
5
5
  def self.version
6
- "1.0.4"
6
+ "1.0.5"
7
7
  end
8
8
 
9
9
  end
@@ -3,8 +3,8 @@ require File.expand_path(File.dirname(__FILE__) + '/../spec_helper')
3
3
  describe CobwebModule::Crawl, :local_only => true do
4
4
 
5
5
  before(:each) do
6
- @local_redis = {:host => "127.0.0.1", :port => 6379}
7
- @remote_redis = {:host => "192.168.100.16", :port => 6379}
6
+ @local_redis = {:host => "localhost", :port => 6379}
7
+ @remote_redis = {:host => "remote-redis", :port => 6379}
8
8
 
9
9
  @request = {:crawl_id => "test_crawl_id"}
10
10
  end
@@ -9,6 +9,8 @@ describe Cobweb, :local_only => true, :disabled => true do
9
9
  # START WORKERS ONLY FOR CRAWL QUEUE SO WE CAN COUNT ENQUEUED PROCESS AND FINISH QUEUES
10
10
  puts "Starting Workers... Please Wait..."
11
11
  `mkdir log`
12
+ `mkdir tmp`
13
+ `mkdir tmp/pids`
12
14
  io = IO.popen("nohup rake resque:workers PIDFILE=./tmp/pids/resque.pid COUNT=10 QUEUE=cobweb_crawl_job > log/output.log &")
13
15
  puts "Workers Started."
14
16
 
@@ -90,34 +90,21 @@ describe Cobweb do
90
90
  before(:each) do
91
91
  @base_url = "http://redirect-me.com/redirect.html"
92
92
  @cobweb = Cobweb.new(:follow_redirects => true, :quiet => true, :cache => nil)
93
-
94
- @mock_http_response.stub!(:[]).with("location").and_return("http://google.com/")
95
- @mock_http_redirect_response.stub!(:[]).with("location").and_return("http://redirected-to.com/redirect2.html")
96
- @mock_http_redirect_response2.stub!(:[]).with("location").and_return("http://redirected-to.com/redirected.html")
97
-
98
93
  end
99
94
 
100
- it "should flow through redirect" #do
101
-
102
- #@mock_http_client.should_receive(:request).with(@mock_http_redirect_request).and_return(@mock_http_redirect_response)
103
- #@mock_http_client.should_receive(:request).with(@mock_http_redirect_request).and_return(@mock_http_redirect_response)
104
- #
105
- #content = @cobweb.get(@base_url)
106
- #content.should be_an_instance_of HashHelper
107
- #ap content
108
- #content[:url].should == "http://redirect-me.com/redirect.html"
109
- #content[:redirect_through].length.should == 2
110
- #content[:mime_type].should == "text/html"
111
- #content[:body].should == "asdf"
95
+ it "should return final page from redirects" do
96
+ content = @cobweb.get(@base_url)
97
+ content.should be_an_instance_of Hash
98
+ content[:url].should == "http://redirected-to.com/redirected.html"
99
+ content[:mime_type].should == "text/html"
100
+ content[:body].should == "asdf"
101
+ end
102
+ it "should return the path followed" do
112
103
 
113
- #end
114
- it "should return the path followed" #do
115
- #@mock_http_client.should_receive(:request).with(@mock_http_redirect_request).and_return(@mock_http_redirect_response)
116
- #
117
- #content = @cobweb.get(@base_url)
118
- #content[:redirect_through].should == ["http://redirected-to.com/redirect2.html", "http://redirected-to.com/redirected.html"]
104
+ content = @cobweb.get(@base_url)
105
+ content[:redirect_through].should == ["http://redirect-me.com/redirect.html", "http://redirected-to.com/redirect2.html", "http://redirected-to.com/redirected.html"]
119
106
 
120
- #end
107
+ end
121
108
  it "should not follow with redirect disabled" do
122
109
  @cobweb = Cobweb.new(:follow_redirects => false, :cache => 3)
123
110
  @mock_http_client.should_receive(:request).with(@mock_http_redirect_request).and_return(@mock_http_redirect_response)
data/spec/spec_helper.rb CHANGED
@@ -9,7 +9,7 @@ APP_ROOT = File.expand_path(File.dirname(__FILE__) + '/../')
9
9
 
10
10
  RSpec.configure do |config|
11
11
 
12
- unless ENV["TRAVIS_RUBY_VERSION"].nil?
12
+ if ENV["TRAVIS_RUBY_VERSION"] || ENV['CI']
13
13
  config.filter_run_excluding :local_only => true
14
14
  end
15
15
 
@@ -63,9 +63,10 @@ RSpec.configure do |config|
63
63
 
64
64
  Net::HTTP.stub!(:new).and_return(@mock_http_client)
65
65
  Net::HTTP::Get.stub!(:new).and_return(@mock_http_request)
66
- Net::HTTP::Get.stub!(:new).with("/redirect.html", {"User-Agent"=>"cobweb/#{CobwebVersion.version} (ruby/#{RUBY_VERSION} nokogiri/#{Nokogiri::VERSION})"}).and_return(@mock_http_redirect_request)
67
- Net::HTTP::Get.stub!(:new).with("/robots.txt", {"User-Agent"=>"cobweb/#{CobwebVersion.version} (ruby/#{RUBY_VERSION} nokogiri/#{Nokogiri::VERSION})"}).and_return(@mock_http_robot_request)
68
- Net::HTTP::Get.stub!(:new).with("/redirect2.html", {"User-Agent"=>"cobweb/#{CobwebVersion.version} (ruby/#{RUBY_VERSION} nokogiri/#{Nokogiri::VERSION})"}).and_return(@mock_http_redirect_request2)
66
+ Net::HTTP::Get.stub!(:new).with("/redirect.html", an_instance_of(Hash)).and_return(@mock_http_redirect_request)
67
+ Net::HTTP::Get.stub!(:new).with("/robots.txt", an_instance_of(Hash)).and_return(@mock_http_robot_request)
68
+ Net::HTTP::Get.stub!(:new).with("/redirect2.html", an_instance_of(Hash)).and_return(@mock_http_redirect_request2)
69
+ Net::HTTP::Get.stub!(:new).with("/redirected.html", an_instance_of(Hash)).and_return(@mock_http_request)
69
70
 
70
71
  Net::HTTP::Head.stub!(:new).and_return(@mock_http_request)
71
72
 
@@ -77,7 +78,7 @@ RSpec.configure do |config|
77
78
  @mock_http_client.stub!(:open_timeout=).and_return(nil)
78
79
  @mock_http_client.stub!(:start).and_return(@mock_http_response)
79
80
  @mock_http_client.stub!(:address).and_return("www.baseurl.com")
80
- @mock_http_client.stub!(:port).and_return("80 ")
81
+ @mock_http_client.stub!(:port).and_return("80")
81
82
 
82
83
  @mock_http_robot_response.stub!(:code).and_return(200)
83
84
  @mock_http_robot_response.stub!(:body).and_return(File.open(File.dirname(__FILE__) + '/../spec/samples/robots.txt', "r").read)
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: cobweb
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.0.4
4
+ version: 1.0.5
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,11 +9,11 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2013-01-15 00:00:00.000000000 Z
12
+ date: 2013-02-08 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: resque
16
- requirement: &70312117054720 !ruby/object:Gem::Requirement
16
+ requirement: !ruby/object:Gem::Requirement
17
17
  none: false
18
18
  requirements:
19
19
  - - ! '>='
@@ -21,10 +21,15 @@ dependencies:
21
21
  version: '0'
22
22
  type: :runtime
23
23
  prerelease: false
24
- version_requirements: *70312117054720
24
+ version_requirements: !ruby/object:Gem::Requirement
25
+ none: false
26
+ requirements:
27
+ - - ! '>='
28
+ - !ruby/object:Gem::Version
29
+ version: '0'
25
30
  - !ruby/object:Gem::Dependency
26
31
  name: redis
27
- requirement: &70312117053180 !ruby/object:Gem::Requirement
32
+ requirement: !ruby/object:Gem::Requirement
28
33
  none: false
29
34
  requirements:
30
35
  - - ! '>='
@@ -32,10 +37,15 @@ dependencies:
32
37
  version: '0'
33
38
  type: :runtime
34
39
  prerelease: false
35
- version_requirements: *70312117053180
40
+ version_requirements: !ruby/object:Gem::Requirement
41
+ none: false
42
+ requirements:
43
+ - - ! '>='
44
+ - !ruby/object:Gem::Version
45
+ version: '0'
36
46
  - !ruby/object:Gem::Dependency
37
47
  name: nokogiri
38
- requirement: &70312117051880 !ruby/object:Gem::Requirement
48
+ requirement: !ruby/object:Gem::Requirement
39
49
  none: false
40
50
  requirements:
41
51
  - - ! '>='
@@ -43,10 +53,15 @@ dependencies:
43
53
  version: '0'
44
54
  type: :runtime
45
55
  prerelease: false
46
- version_requirements: *70312117051880
56
+ version_requirements: !ruby/object:Gem::Requirement
57
+ none: false
58
+ requirements:
59
+ - - ! '>='
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
47
62
  - !ruby/object:Gem::Dependency
48
63
  name: addressable
49
- requirement: &70312117050680 !ruby/object:Gem::Requirement
64
+ requirement: !ruby/object:Gem::Requirement
50
65
  none: false
51
66
  requirements:
52
67
  - - ! '>='
@@ -54,10 +69,15 @@ dependencies:
54
69
  version: '0'
55
70
  type: :runtime
56
71
  prerelease: false
57
- version_requirements: *70312117050680
72
+ version_requirements: !ruby/object:Gem::Requirement
73
+ none: false
74
+ requirements:
75
+ - - ! '>='
76
+ - !ruby/object:Gem::Version
77
+ version: '0'
58
78
  - !ruby/object:Gem::Dependency
59
79
  name: rspec
60
- requirement: &70312117049660 !ruby/object:Gem::Requirement
80
+ requirement: !ruby/object:Gem::Requirement
61
81
  none: false
62
82
  requirements:
63
83
  - - ! '>='
@@ -65,10 +85,15 @@ dependencies:
65
85
  version: '0'
66
86
  type: :runtime
67
87
  prerelease: false
68
- version_requirements: *70312117049660
88
+ version_requirements: !ruby/object:Gem::Requirement
89
+ none: false
90
+ requirements:
91
+ - - ! '>='
92
+ - !ruby/object:Gem::Version
93
+ version: '0'
69
94
  - !ruby/object:Gem::Dependency
70
95
  name: awesome_print
71
- requirement: &70312117048900 !ruby/object:Gem::Requirement
96
+ requirement: !ruby/object:Gem::Requirement
72
97
  none: false
73
98
  requirements:
74
99
  - - ! '>='
@@ -76,10 +101,15 @@ dependencies:
76
101
  version: '0'
77
102
  type: :runtime
78
103
  prerelease: false
79
- version_requirements: *70312117048900
104
+ version_requirements: !ruby/object:Gem::Requirement
105
+ none: false
106
+ requirements:
107
+ - - ! '>='
108
+ - !ruby/object:Gem::Version
109
+ version: '0'
80
110
  - !ruby/object:Gem::Dependency
81
111
  name: sinatra
82
- requirement: &70312117047160 !ruby/object:Gem::Requirement
112
+ requirement: !ruby/object:Gem::Requirement
83
113
  none: false
84
114
  requirements:
85
115
  - - ! '>='
@@ -87,10 +117,15 @@ dependencies:
87
117
  version: '0'
88
118
  type: :runtime
89
119
  prerelease: false
90
- version_requirements: *70312117047160
120
+ version_requirements: !ruby/object:Gem::Requirement
121
+ none: false
122
+ requirements:
123
+ - - ! '>='
124
+ - !ruby/object:Gem::Version
125
+ version: '0'
91
126
  - !ruby/object:Gem::Dependency
92
127
  name: thin
93
- requirement: &70312117043520 !ruby/object:Gem::Requirement
128
+ requirement: !ruby/object:Gem::Requirement
94
129
  none: false
95
130
  requirements:
96
131
  - - ! '>='
@@ -98,10 +133,15 @@ dependencies:
98
133
  version: '0'
99
134
  type: :runtime
100
135
  prerelease: false
101
- version_requirements: *70312117043520
136
+ version_requirements: !ruby/object:Gem::Requirement
137
+ none: false
138
+ requirements:
139
+ - - ! '>='
140
+ - !ruby/object:Gem::Version
141
+ version: '0'
102
142
  - !ruby/object:Gem::Dependency
103
143
  name: haml
104
- requirement: &70312117041060 !ruby/object:Gem::Requirement
144
+ requirement: !ruby/object:Gem::Requirement
105
145
  none: false
106
146
  requirements:
107
147
  - - ! '>='
@@ -109,10 +149,15 @@ dependencies:
109
149
  version: '0'
110
150
  type: :runtime
111
151
  prerelease: false
112
- version_requirements: *70312117041060
152
+ version_requirements: !ruby/object:Gem::Requirement
153
+ none: false
154
+ requirements:
155
+ - - ! '>='
156
+ - !ruby/object:Gem::Version
157
+ version: '0'
113
158
  - !ruby/object:Gem::Dependency
114
159
  name: namespaced_redis
115
- requirement: &70312117038160 !ruby/object:Gem::Requirement
160
+ requirement: !ruby/object:Gem::Requirement
116
161
  none: false
117
162
  requirements:
118
163
  - - ! '>='
@@ -120,10 +165,15 @@ dependencies:
120
165
  version: '0'
121
166
  type: :runtime
122
167
  prerelease: false
123
- version_requirements: *70312117038160
168
+ version_requirements: !ruby/object:Gem::Requirement
169
+ none: false
170
+ requirements:
171
+ - - ! '>='
172
+ - !ruby/object:Gem::Version
173
+ version: '0'
124
174
  - !ruby/object:Gem::Dependency
125
175
  name: json
126
- requirement: &70312117032620 !ruby/object:Gem::Requirement
176
+ requirement: !ruby/object:Gem::Requirement
127
177
  none: false
128
178
  requirements:
129
179
  - - ! '>='
@@ -131,7 +181,12 @@ dependencies:
131
181
  version: '0'
132
182
  type: :runtime
133
183
  prerelease: false
134
- version_requirements: *70312117032620
184
+ version_requirements: !ruby/object:Gem::Requirement
185
+ none: false
186
+ requirements:
187
+ - - ! '>='
188
+ - !ruby/object:Gem::Version
189
+ version: '0'
135
190
  description: Cobweb is a web crawler that can use resque to cluster crawls to quickly
136
191
  crawl extremely large sites which is much more performant than multi-threaded crawlers. It
137
192
  is also a standalone crawler that has a sophisticated statistics monitoring interface
@@ -514,7 +569,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
514
569
  version: '0'
515
570
  requirements: []
516
571
  rubyforge_project:
517
- rubygems_version: 1.8.10
572
+ rubygems_version: 1.8.24
518
573
  signing_key:
519
574
  specification_version: 3
520
575
  summary: Cobweb is a web crawler that can use resque to cluster crawls to quickly