cobweb 0.0.19 → 0.0.20
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/lib/cobweb.rb +32 -23
- data/lib/cobweb/version.rb +1 -0
- data/lib/crawl_job.rb +3 -2
- data/lib/robots.rb +9 -0
- data/lib/stats.rb +9 -6
- data/spec/cobweb/cobweb_spec.rb +62 -13
- data/spec/spec_helper.rb +12 -0
- metadata +24 -22
data/lib/cobweb.rb
CHANGED
@@ -29,8 +29,7 @@ class Cobweb
|
|
29
29
|
@options[:debug] = false unless @options.has_key?(:debug)
|
30
30
|
@options[:cache] = 300 unless @options.has_key?(:cache)
|
31
31
|
@options[:timeout] = 10 unless @options.has_key?(:timeout)
|
32
|
-
@options[:redis_options] = {} unless @options.has_key?(:redis_options)
|
33
|
-
|
32
|
+
@options[:redis_options] = {} unless @options.has_key?(:redis_options)
|
34
33
|
end
|
35
34
|
|
36
35
|
def start(base_url)
|
@@ -41,33 +40,40 @@ class Cobweb
|
|
41
40
|
}
|
42
41
|
|
43
42
|
request.merge!(@options)
|
44
|
-
redis = NamespacedRedis.new(Redis.new(request[:redis_options]), "cobweb-#{request[:crawl_id]}")
|
45
|
-
redis.hset "statistics", "queued_at", DateTime.now
|
43
|
+
@redis = NamespacedRedis.new(Redis.new(request[:redis_options]), "cobweb-#{VERSION}-#{request[:crawl_id]}")
|
44
|
+
@redis.hset "statistics", "queued_at", DateTime.now
|
46
45
|
|
47
46
|
Resque.enqueue(CrawlJob, request)
|
48
47
|
end
|
49
48
|
|
50
|
-
def get(url,
|
49
|
+
def get(url, options = @options)
|
51
50
|
|
52
|
-
raise "url cannot be nil" if url.nil?
|
51
|
+
raise "url cannot be nil" if url.nil?
|
53
52
|
|
54
53
|
absolutize = Absolutize.new(url, :output_debug => false, :raise_exceptions => true, :force_escaping => false, :remove_anchors => true)
|
55
54
|
|
56
55
|
# get the unique id for this request
|
57
56
|
unique_id = Digest::SHA1.hexdigest(url.to_s)
|
57
|
+
redirect_limit = options[:redirect_limit]
|
58
58
|
|
59
59
|
# connect to redis
|
60
|
-
|
60
|
+
ap options
|
61
|
+
if options.has_key? :crawl_id
|
62
|
+
redis = NamespacedRedis.new(Redis.new(@options[:redis_options]), "cobweb-#{VERSION}-#{options[:crawl_id]}")
|
63
|
+
else
|
64
|
+
redis = NamespacedRedis.new(Redis.new(@options[:redis_options]), "cobweb-#{VERSION}")
|
65
|
+
end
|
61
66
|
|
67
|
+
ap "===== HEAD NAMESPACE ====="
|
68
|
+
ap redis.namespace
|
69
|
+
ap "===== HEAD NAMESPACE ====="
|
70
|
+
|
62
71
|
content = {}
|
63
72
|
|
64
73
|
# check if it has already been cached
|
65
74
|
if redis.get(unique_id) and @options[:cache]
|
66
75
|
puts "Cache hit for #{url}" unless @options[:quiet]
|
67
|
-
content =
|
68
|
-
content[:body] = Base64.decode64(content[:body])
|
69
|
-
|
70
|
-
content
|
76
|
+
content = Marshal.load(redis.get(unique_id)).deep_symbolize_keys
|
71
77
|
else
|
72
78
|
# this url is valid for processing so lets get on with it
|
73
79
|
uri = Addressable::URI.parse(url.strip)
|
@@ -143,8 +149,7 @@ class Cobweb
|
|
143
149
|
end
|
144
150
|
# add content to cache if required
|
145
151
|
if @options[:cache]
|
146
|
-
|
147
|
-
redis.set(unique_id, content.to_json)
|
152
|
+
redis.set(unique_id, Marshal.dump(content))
|
148
153
|
redis.expire unique_id, @options[:cache].to_i
|
149
154
|
end
|
150
155
|
rescue RedirectError => e
|
@@ -196,28 +201,32 @@ class Cobweb
|
|
196
201
|
content
|
197
202
|
end
|
198
203
|
|
199
|
-
def head(url,
|
204
|
+
def head(url, options = @options)
|
200
205
|
raise "url cannot be nil" if url.nil?
|
201
206
|
|
202
207
|
absolutize = Absolutize.new(url, :output_debug => false, :raise_exceptions => false, :force_escaping => false, :remove_anchors => true)
|
203
208
|
|
204
209
|
# get the unique id for this request
|
205
210
|
unique_id = Digest::SHA1.hexdigest(url)
|
211
|
+
redirect_limit = options[:redirect_limit]
|
206
212
|
|
207
213
|
# connect to redis
|
208
|
-
|
214
|
+
if options.has_key? :crawl_id
|
215
|
+
redis = NamespacedRedis.new(Redis.new(@options[:redis_options]), "cobweb-#{VERSION}-#{options[:crawl_id]}")
|
216
|
+
else
|
217
|
+
redis = NamespacedRedis.new(Redis.new(@options[:redis_options]), "cobweb-#{VERSION}")
|
218
|
+
end
|
209
219
|
|
220
|
+
ap "===== HEAD NAMESPACE ====="
|
221
|
+
ap redis.namespace
|
222
|
+
ap "===== HEAD NAMESPACE ====="
|
223
|
+
|
210
224
|
content = {}
|
211
225
|
|
212
226
|
# check if it has already been cached
|
213
|
-
if
|
227
|
+
if redis.get("head-#{unique_id}") and @options[:cache]
|
214
228
|
puts "Cache hit for #{url}" unless @options[:quiet]
|
215
|
-
|
216
|
-
content = JSON.parse(redis.get(unique_id)).deep_symbolize_keys
|
217
|
-
else
|
218
|
-
content = JSON.parse(redis.get("head-#{unique_id}")).deep_symbolize_keys
|
219
|
-
end
|
220
|
-
content
|
229
|
+
Marshal.load(redis.get("head-#{unique_id}")).deep_symbolize_keys
|
221
230
|
else
|
222
231
|
print "Retrieving #{url }... " unless @options[:quiet]
|
223
232
|
uri = Addressable::URI.parse(url.strip)
|
@@ -259,7 +268,7 @@ class Cobweb
|
|
259
268
|
# add content to cache if required
|
260
269
|
if @options[:cache]
|
261
270
|
puts "Stored in cache [head-#{unique_id}]" if @options[:debug]
|
262
|
-
redis.set("head-#{unique_id}", content
|
271
|
+
redis.set("head-#{unique_id}", Marshal.dump(content))
|
263
272
|
redis.expire "head-#{unique_id}", @options[:cache].to_i
|
264
273
|
else
|
265
274
|
puts "Not storing in cache as cache disabled" if @options[:debug]
|
@@ -0,0 +1 @@
|
|
1
|
+
VERSION = "0.0.20"
|
data/lib/crawl_job.rb
CHANGED
@@ -28,7 +28,8 @@ class CrawlJob
|
|
28
28
|
def self.perform(content_request)
|
29
29
|
# change all hash keys to symbols
|
30
30
|
content_request.deep_symbolize_keys
|
31
|
-
redis = NamespacedRedis.new(Redis.new(content_request[:redis_options]), "cobweb-#{content_request[:crawl_id]}")
|
31
|
+
redis = NamespacedRedis.new(Redis.new(content_request[:redis_options]), "cobweb-#{VERSION}-#{content_request[:crawl_id]}")
|
32
|
+
ap redis.namespace
|
32
33
|
@absolutize = Absolutize.new(content_request[:url], :output_debug => false, :raise_exceptions => false, :force_escaping => false, :remove_anchors => true)
|
33
34
|
|
34
35
|
# check we haven't crawled this url before
|
@@ -40,7 +41,7 @@ class CrawlJob
|
|
40
41
|
redis.incr "crawl-counter"
|
41
42
|
crawl_counter += 1
|
42
43
|
if crawl_counter <= content_request[:crawl_limit].to_i
|
43
|
-
content = Cobweb.new(content_request).get(content_request[:url])
|
44
|
+
content = Cobweb.new(content_request).get(content_request[:url], content_request)
|
44
45
|
|
45
46
|
## update statistics
|
46
47
|
if redis.hexists "statistics", "average_response_time"
|
data/lib/robots.rb
ADDED
@@ -0,0 +1,9 @@
|
|
1
|
+
class Robots
|
2
|
+
|
3
|
+
def initialize(url, file_name="robots.txt")
|
4
|
+
uri = URI.parse(url)
|
5
|
+
[uri.scheme, "://", uri.host, ":", uri.port, "/", file_name].join
|
6
|
+
Cobweb.new(:cache => 6000).get([uri.scheme, "://", uri.host, ":", uri.port, "/", file_name].join)
|
7
|
+
|
8
|
+
end
|
9
|
+
end
|
data/lib/stats.rb
CHANGED
@@ -20,12 +20,15 @@ class Stats < Sinatra::Base
|
|
20
20
|
haml :statistics
|
21
21
|
end
|
22
22
|
|
23
|
-
end
|
24
|
-
|
25
|
-
thread = Thread.new do
|
26
|
-
Stats.run!
|
27
23
|
|
28
|
-
|
29
|
-
|
24
|
+
def self.start
|
25
|
+
thread = Thread.new do
|
26
|
+
Stats.run!
|
27
|
+
|
28
|
+
## we need to manually kill the main thread as sinatra traps the interrupts
|
29
|
+
Thread.main.kill
|
30
|
+
end
|
31
|
+
end
|
30
32
|
end
|
31
33
|
|
34
|
+
|
data/spec/cobweb/cobweb_spec.rb
CHANGED
@@ -169,19 +169,68 @@ describe Cobweb do
|
|
169
169
|
|
170
170
|
end
|
171
171
|
end
|
172
|
+
|
173
|
+
describe "with cache" do
|
174
|
+
|
175
|
+
before(:each) do
|
176
|
+
@cobweb = Cobweb.new :quiet => true, :cache => 200
|
177
|
+
end
|
178
|
+
|
179
|
+
describe "content object" do
|
180
|
+
it "should return the url" do
|
181
|
+
@cobweb.get(@base_url)[:url].should == @base_url
|
182
|
+
@cobweb.get(@base_url)[:url].should == @base_url
|
183
|
+
end
|
184
|
+
it "should return correct content-type" do
|
185
|
+
@mock_http_response.stub!(:content_type).and_return("image/jpeg")
|
186
|
+
@cobweb.get(@base_url)[:mime_type].should == "image/jpeg"
|
187
|
+
@cobweb.get(@base_url)[:mime_type].should == "image/jpeg"
|
188
|
+
end
|
189
|
+
it "should return correct status-code" do
|
190
|
+
@mock_http_response.stub!(:code).and_return(404)
|
191
|
+
@cobweb.get(@base_url)[:status_code].should == 404
|
192
|
+
@cobweb.get(@base_url)[:status_code].should == 404
|
193
|
+
end
|
194
|
+
it "should return correct status-code" do
|
195
|
+
@mock_http_response.stub!(:code).and_return(404)
|
196
|
+
@cobweb.get(@base_url)[:status_code].should == 404
|
197
|
+
@cobweb.get(@base_url)[:status_code].should == 404
|
198
|
+
end
|
199
|
+
it "should return correct character_set" do
|
200
|
+
@cobweb.get(@base_url)[:character_set].should == "UTF-8"
|
201
|
+
@cobweb.get(@base_url)[:character_set].should == "UTF-8"
|
202
|
+
end
|
203
|
+
it "should return correct content_length" do
|
204
|
+
@cobweb.get(@base_url)[:length].should == 1024
|
205
|
+
@cobweb.get(@base_url)[:length].should == 1024
|
206
|
+
end
|
207
|
+
it "should return correct content_body" do
|
208
|
+
@cobweb.get(@base_url)[:body].should == "asdf"
|
209
|
+
@cobweb.get(@base_url)[:body].should == "asdf"
|
210
|
+
end
|
211
|
+
it "should return correct location" do
|
212
|
+
@cobweb.get(@base_url)[:location].should == nil
|
213
|
+
@cobweb.get(@base_url)[:location].should == nil
|
214
|
+
|
215
|
+
@mock_http_response.stub!(:[]).with("location").and_return("http://google.com/")
|
216
|
+
@cobweb.get(@base_url)[:location].should == "http://google.com/"
|
217
|
+
@cobweb.get(@base_url)[:location].should == "http://google.com/"
|
218
|
+
end
|
219
|
+
it "should return correct headers" do
|
220
|
+
@cobweb.get(@base_url)[:headers].should == @default_headers
|
221
|
+
@cobweb.get(@base_url)[:headers].should == @default_headers
|
222
|
+
end
|
223
|
+
it "should return correct a hash of links" do
|
224
|
+
@cobweb.get(@base_url)[:links].should be_an_instance_of Hash
|
225
|
+
@cobweb.get(@base_url)[:links].should be_an_instance_of Hash
|
226
|
+
end
|
227
|
+
it "should return the response time for the url" do
|
228
|
+
@cobweb.get(@base_url)[:response_time].should be_an_instance_of Float
|
229
|
+
@cobweb.get(@base_url)[:response_time].should be_an_instance_of Float
|
230
|
+
end
|
231
|
+
end
|
232
|
+
|
233
|
+
end
|
172
234
|
end
|
173
235
|
end
|
174
|
-
|
175
|
-
describe "without mock" do
|
176
|
-
|
177
|
-
it "should throw exception when server is unavailable" #do
|
178
|
-
# lambda {@cobweb.get({:url => "http://www.oasdjgoisadjgoisdiog.com"})}.should raise_error URI::InvalidURIError
|
179
|
-
#end
|
180
|
-
|
181
|
-
it "should return a valid content hash when url doesn't exist on a live server" do
|
182
|
-
status_code = @cobweb.get("http://test.com/laskdjflsdajf")[:status_code]
|
183
|
-
status_code.should == 404
|
184
|
-
end
|
185
|
-
|
186
|
-
end
|
187
236
|
end
|
data/spec/spec_helper.rb
CHANGED
@@ -1 +1,13 @@
|
|
1
1
|
require File.expand_path(File.dirname(__FILE__) + '/../lib/cobweb')
|
2
|
+
require 'mock_redis'
|
3
|
+
|
4
|
+
RSpec.configure do |config|
|
5
|
+
config.before(:each) {
|
6
|
+
#redis_mock = double("redis")
|
7
|
+
#ap redis_mock
|
8
|
+
#redis_mock.stub(:new).and_return(MockRedis.new)
|
9
|
+
|
10
|
+
Redis.new.flushdb
|
11
|
+
}
|
12
|
+
|
13
|
+
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: cobweb
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.
|
4
|
+
version: 0.0.20
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,11 +9,11 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2012-03-
|
12
|
+
date: 2012-03-09 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: resque
|
16
|
-
requirement: &
|
16
|
+
requirement: &70234753393720 !ruby/object:Gem::Requirement
|
17
17
|
none: false
|
18
18
|
requirements:
|
19
19
|
- - ! '>='
|
@@ -21,10 +21,10 @@ dependencies:
|
|
21
21
|
version: '0'
|
22
22
|
type: :runtime
|
23
23
|
prerelease: false
|
24
|
-
version_requirements: *
|
24
|
+
version_requirements: *70234753393720
|
25
25
|
- !ruby/object:Gem::Dependency
|
26
26
|
name: redis
|
27
|
-
requirement: &
|
27
|
+
requirement: &70234753392840 !ruby/object:Gem::Requirement
|
28
28
|
none: false
|
29
29
|
requirements:
|
30
30
|
- - ! '>='
|
@@ -32,10 +32,10 @@ dependencies:
|
|
32
32
|
version: '0'
|
33
33
|
type: :runtime
|
34
34
|
prerelease: false
|
35
|
-
version_requirements: *
|
35
|
+
version_requirements: *70234753392840
|
36
36
|
- !ruby/object:Gem::Dependency
|
37
37
|
name: absolutize
|
38
|
-
requirement: &
|
38
|
+
requirement: &70234753392180 !ruby/object:Gem::Requirement
|
39
39
|
none: false
|
40
40
|
requirements:
|
41
41
|
- - ! '>='
|
@@ -43,10 +43,10 @@ dependencies:
|
|
43
43
|
version: '0'
|
44
44
|
type: :runtime
|
45
45
|
prerelease: false
|
46
|
-
version_requirements: *
|
46
|
+
version_requirements: *70234753392180
|
47
47
|
- !ruby/object:Gem::Dependency
|
48
48
|
name: nokogiri
|
49
|
-
requirement: &
|
49
|
+
requirement: &70234753391520 !ruby/object:Gem::Requirement
|
50
50
|
none: false
|
51
51
|
requirements:
|
52
52
|
- - ! '>='
|
@@ -54,10 +54,10 @@ dependencies:
|
|
54
54
|
version: '0'
|
55
55
|
type: :runtime
|
56
56
|
prerelease: false
|
57
|
-
version_requirements: *
|
57
|
+
version_requirements: *70234753391520
|
58
58
|
- !ruby/object:Gem::Dependency
|
59
59
|
name: addressable
|
60
|
-
requirement: &
|
60
|
+
requirement: &70234753390840 !ruby/object:Gem::Requirement
|
61
61
|
none: false
|
62
62
|
requirements:
|
63
63
|
- - ! '>='
|
@@ -65,10 +65,10 @@ dependencies:
|
|
65
65
|
version: '0'
|
66
66
|
type: :runtime
|
67
67
|
prerelease: false
|
68
|
-
version_requirements: *
|
68
|
+
version_requirements: *70234753390840
|
69
69
|
- !ruby/object:Gem::Dependency
|
70
70
|
name: rspec
|
71
|
-
requirement: &
|
71
|
+
requirement: &70234753390200 !ruby/object:Gem::Requirement
|
72
72
|
none: false
|
73
73
|
requirements:
|
74
74
|
- - ! '>='
|
@@ -76,10 +76,10 @@ dependencies:
|
|
76
76
|
version: '0'
|
77
77
|
type: :runtime
|
78
78
|
prerelease: false
|
79
|
-
version_requirements: *
|
79
|
+
version_requirements: *70234753390200
|
80
80
|
- !ruby/object:Gem::Dependency
|
81
81
|
name: awesome_print
|
82
|
-
requirement: &
|
82
|
+
requirement: &70234753389280 !ruby/object:Gem::Requirement
|
83
83
|
none: false
|
84
84
|
requirements:
|
85
85
|
- - ! '>='
|
@@ -87,10 +87,10 @@ dependencies:
|
|
87
87
|
version: '0'
|
88
88
|
type: :runtime
|
89
89
|
prerelease: false
|
90
|
-
version_requirements: *
|
90
|
+
version_requirements: *70234753389280
|
91
91
|
- !ruby/object:Gem::Dependency
|
92
92
|
name: sinatra
|
93
|
-
requirement: &
|
93
|
+
requirement: &70234753388360 !ruby/object:Gem::Requirement
|
94
94
|
none: false
|
95
95
|
requirements:
|
96
96
|
- - ! '>='
|
@@ -98,10 +98,10 @@ dependencies:
|
|
98
98
|
version: '0'
|
99
99
|
type: :runtime
|
100
100
|
prerelease: false
|
101
|
-
version_requirements: *
|
101
|
+
version_requirements: *70234753388360
|
102
102
|
- !ruby/object:Gem::Dependency
|
103
103
|
name: thin
|
104
|
-
requirement: &
|
104
|
+
requirement: &70234753387580 !ruby/object:Gem::Requirement
|
105
105
|
none: false
|
106
106
|
requirements:
|
107
107
|
- - ! '>='
|
@@ -109,10 +109,10 @@ dependencies:
|
|
109
109
|
version: '0'
|
110
110
|
type: :runtime
|
111
111
|
prerelease: false
|
112
|
-
version_requirements: *
|
112
|
+
version_requirements: *70234753387580
|
113
113
|
- !ruby/object:Gem::Dependency
|
114
114
|
name: haml
|
115
|
-
requirement: &
|
115
|
+
requirement: &70234753386980 !ruby/object:Gem::Requirement
|
116
116
|
none: false
|
117
117
|
requirements:
|
118
118
|
- - ! '>='
|
@@ -120,7 +120,7 @@ dependencies:
|
|
120
120
|
version: '0'
|
121
121
|
type: :runtime
|
122
122
|
prerelease: false
|
123
|
-
version_requirements: *
|
123
|
+
version_requirements: *70234753386980
|
124
124
|
description:
|
125
125
|
email: stewart@rockwellcottage.com
|
126
126
|
executables: []
|
@@ -134,6 +134,7 @@ files:
|
|
134
134
|
- spec/samples/sample_html_links.html
|
135
135
|
- spec/spec.opts
|
136
136
|
- spec/spec_helper.rb
|
137
|
+
- lib/cobweb/version.rb
|
137
138
|
- lib/cobweb.rb
|
138
139
|
- lib/cobweb_crawler.rb
|
139
140
|
- lib/cobweb_finished_job.rb
|
@@ -143,6 +144,7 @@ files:
|
|
143
144
|
- lib/hash.rb
|
144
145
|
- lib/namespaced_redis.rb
|
145
146
|
- lib/redirect_error.rb
|
147
|
+
- lib/robots.rb
|
146
148
|
- lib/stats.rb
|
147
149
|
- views/statistics.haml
|
148
150
|
- README.textile
|