cobweb 0.0.7 → 0.0.11
Sign up to get free protection for your applications and to get access to all the features.
- data/README.textile +1 -0
- data/lib/cobweb.rb +15 -9
- data/lib/cobweb_finished_job.rb +12 -0
- data/lib/crawl_job.rb +111 -12
- data/lib/namespaced_redis.rb +25 -1
- data/spec/cobweb/cobweb_spec.rb +28 -26
- metadata +66 -100
data/README.textile
CHANGED
@@ -24,6 +24,7 @@ h2. Intro
|
|
24
24
|
** :related - url's from link tags
|
25
25
|
** :scripts - url's from script tags
|
26
26
|
** :styles - url's from within link tags with rel of stylesheet and from url() directives with stylesheets
|
27
|
+
* :crawl_id - the id used internally for identifying the crawl. Can be used by the processing job to seperate crawls
|
27
28
|
|
28
29
|
The source for the links can be overridden, contact me for the syntax (don't have time to put it into this documentation, will as soon as i have time!)
|
29
30
|
|
data/lib/cobweb.rb
CHANGED
@@ -1,20 +1,30 @@
|
|
1
|
-
|
1
|
+
require 'rubygems'
|
2
2
|
require 'uri'
|
3
3
|
require 'resque'
|
4
4
|
require "addressable/uri"
|
5
5
|
require 'digest/sha1'
|
6
|
+
require 'base64'
|
6
7
|
|
7
8
|
Dir[File.dirname(__FILE__) + '/*.rb'].each do |file|
|
8
9
|
require [File.dirname(__FILE__), File.basename(file, File.extname(file))].join("/")
|
9
10
|
end
|
10
11
|
|
11
12
|
class CobWeb
|
13
|
+
|
14
|
+
## TASKS
|
15
|
+
|
16
|
+
# redesign to have a resque stack and a single threaded stack
|
17
|
+
# dry the code below, its got a lot of duplication
|
18
|
+
# detect the end of the crawl (queued == 0 ?)
|
19
|
+
# on end of crawl, return statistic hash (could call specified method ?) if single threaded or enqueue to a specified queue the stat hash
|
20
|
+
# investigate using event machine for single threaded crawling
|
12
21
|
|
13
22
|
def initialize(options = {})
|
14
23
|
@options = options
|
15
24
|
@options[:follow_redirects] = true unless @options.has_key?(:follow_redirects)
|
16
25
|
@options[:redirect_limit] = 10 unless @options.has_key?(:redirect_limit)
|
17
26
|
@options[:processing_queue] = CobwebProcessJob unless @options.has_key?(:processing_queue)
|
27
|
+
@options[:crawl_finished_queue] = CobwebFinishedJob unless @options.has_key?(:crawl_finished_queue)
|
18
28
|
@options[:quiet] = true unless @options.has_key?(:quiet)
|
19
29
|
@options[:debug] = false unless @options.has_key?(:debug)
|
20
30
|
@options[:cache] = 300 unless @options.has_key?(:cache)
|
@@ -31,6 +41,8 @@ class CobWeb
|
|
31
41
|
}
|
32
42
|
|
33
43
|
request.merge!(@options)
|
44
|
+
redis = NamespacedRedis.new(Redis.new(request[:redis_options]), "cobweb-#{request[:crawl_id]}")
|
45
|
+
redis.hset "statistics", "queued_at", DateTime.now
|
34
46
|
|
35
47
|
Resque.enqueue(CrawlJob, request)
|
36
48
|
end
|
@@ -62,10 +74,7 @@ class CobWeb
|
|
62
74
|
uri = Addressable::URI.parse(url.strip)
|
63
75
|
|
64
76
|
# retrieve data
|
65
|
-
|
66
|
-
uri.port = 443
|
67
|
-
end
|
68
|
-
http = Net::HTTP.new(uri.host, uri.port)
|
77
|
+
http = Net::HTTP.new(uri.host, uri.inferred_port)
|
69
78
|
if uri.scheme == "https"
|
70
79
|
http.use_ssl = true
|
71
80
|
http.verify_mode = OpenSSL::SSL::VERIFY_NONE
|
@@ -181,10 +190,7 @@ class CobWeb
|
|
181
190
|
uri = Addressable::URI.parse(url.strip)
|
182
191
|
|
183
192
|
# retrieve data
|
184
|
-
|
185
|
-
uri.port = 443
|
186
|
-
end
|
187
|
-
http = Net::HTTP.new(uri.host, uri.port)
|
193
|
+
http = Net::HTTP.new(uri.host, uri.inferred_port)
|
188
194
|
if uri.scheme == "https"
|
189
195
|
http.use_ssl = true
|
190
196
|
http.verify_mode = OpenSSL::SSL::VERIFY_NONE
|
data/lib/crawl_job.rb
CHANGED
@@ -6,6 +6,25 @@ class CrawlJob
|
|
6
6
|
|
7
7
|
@queue = :cobweb_crawl_job
|
8
8
|
|
9
|
+
## redis params used
|
10
|
+
#
|
11
|
+
# crawl-counter
|
12
|
+
# crawled
|
13
|
+
# queue-counter
|
14
|
+
# statistics[:average_response_time]
|
15
|
+
# statistics[:maximum_response_time]
|
16
|
+
# statistics[:minimum_response_time]
|
17
|
+
# statistics[:average_length]
|
18
|
+
# statistics[:maximum_length]
|
19
|
+
# statistics[:minimum_length]
|
20
|
+
# statistics[:queued_at]
|
21
|
+
# statistics[:started_at]
|
22
|
+
# statistics]:finished_at]
|
23
|
+
# total_pages
|
24
|
+
# total_assets
|
25
|
+
# statistics[:mime_counts]["mime_type"]
|
26
|
+
# statistics[:status_counts][xxx]
|
27
|
+
|
9
28
|
def self.perform(content_request)
|
10
29
|
# change all hash keys to symbols
|
11
30
|
content_request.deep_symbolize_keys
|
@@ -13,41 +32,121 @@ class CrawlJob
|
|
13
32
|
@absolutize = Absolutize.new(content_request[:url], :output_debug => false, :raise_exceptions => false, :force_escaping => false, :remove_anchors => true)
|
14
33
|
|
15
34
|
# check we haven't crawled this url before
|
35
|
+
crawl_counter = redis.get("crawl-counter").to_i
|
36
|
+
queue_counter = redis.get("queue-counter").to_i
|
16
37
|
unless redis.sismember "crawled", content_request[:url]
|
17
38
|
|
18
39
|
# increment counter and check we haven't hit our crawl limit
|
19
40
|
redis.incr "crawl-counter"
|
20
|
-
crawl_counter
|
21
|
-
queue_counter = redis.get("queue-counter").to_i
|
41
|
+
crawl_counter += 1
|
22
42
|
if crawl_counter <= content_request[:crawl_limit].to_i
|
23
43
|
content = CobWeb.new(content_request).get(content_request[:url])
|
44
|
+
|
45
|
+
## update statistics
|
46
|
+
if redis.hexists "statistics", "average_response_time"
|
47
|
+
redis.hset("statistics", "average_response_time", (((redis.hget("statistics", "average_response_time").to_f*crawl_counter) + content[:response_time].to_f) / crawl_counter + 1))
|
48
|
+
else
|
49
|
+
redis.hset("statistics", "average_response_time", content[:response_time].to_f)
|
50
|
+
end
|
51
|
+
redis.hset "statistics", "maximum_response_time", content[:response_time].to_f if redis.hget("statistics", "maximum_response_time").nil? or content[:response_time].to_f > redis.hget("statistics", "maximum_response_time").to_f
|
52
|
+
redis.hset "statistics", "minimum_response_time", content[:response_time].to_f if redis.hget("statistics", "minimum_response_time").nil? or content[:response_time].to_f < redis.hget("statistics", "minimum_response_time").to_f
|
53
|
+
if redis.hexists "statistics", "average_length"
|
54
|
+
redis.hset("statistics", "average_length", (((redis.hget("statistics", "average_length").to_i*crawl_counter) + content[:length].to_i) / crawl_counter + 1))
|
55
|
+
else
|
56
|
+
redis.hset("statistics", "average_length", content[:length].to_i)
|
57
|
+
end
|
58
|
+
redis.hset "statistics", "maximum_length", content[:length].to_i if redis.hget("statistics", "maximum_length").nil? or content[:length].to_i > redis.hget("statistics", "maximum_length").to_i
|
59
|
+
redis.hset "statistics", "minimum_length", content[:length].to_i if redis.hget("statistics", "minimum_length").nil? or content[:length].to_i < redis.hget("statistics", "minimum_length").to_i
|
60
|
+
|
61
|
+
if content[:mime_type].include?("text/html") or content[:mime_type].include?("application/xhtml+xml")
|
62
|
+
redis.incr "total_pages"
|
63
|
+
else
|
64
|
+
redis.incr "total_assets"
|
65
|
+
end
|
66
|
+
|
67
|
+
mime_counts = {}
|
68
|
+
if redis.hexists "statistics", "mime_counts"
|
69
|
+
mime_counts = JSON.parse(redis.hget("statistics", "mime_counts"))
|
70
|
+
if mime_counts.has_key? content[:mime_type]
|
71
|
+
mime_counts[content[:mime_type]] += 1
|
72
|
+
else
|
73
|
+
mime_counts[content[:mime_type]] = 1
|
74
|
+
end
|
75
|
+
else
|
76
|
+
mime_counts = {content[:mime_type] => 1}
|
77
|
+
end
|
78
|
+
redis.hset "statistics", "mime_counts", mime_counts.to_json
|
79
|
+
|
80
|
+
status_counts = {}
|
81
|
+
if redis.hexists "statistics", "status_counts"
|
82
|
+
status_counts = JSON.parse(redis.hget("statistics", "status_counts"))
|
83
|
+
if status_counts.has_key? content[:status_code].to_i
|
84
|
+
status_counts[content[:status_code].to_i] += 1
|
85
|
+
else
|
86
|
+
status_counts[content[:status_code].to_i] = 1
|
87
|
+
end
|
88
|
+
else
|
89
|
+
status_counts = {content[:status_code].to_i => 1}
|
90
|
+
end
|
91
|
+
redis.hset "statistics", "status_counts", status_counts.to_json
|
92
|
+
|
93
|
+
redis.srem "queued", content_request[:url]
|
24
94
|
redis.sadd "crawled", content_request[:url]
|
25
95
|
set_base_url redis, content, content_request[:base_url]
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
96
|
+
content[:links].keys.map{|key| content[:links][key]}.flatten.each do |link|
|
97
|
+
unless redis.sismember "crawled", link
|
98
|
+
puts "Checking if #{link} matches #{redis.get("base_url")} as internal?" if content_request[:debug]
|
99
|
+
if link.to_s.match(Regexp.new("^#{redis.get("base_url")}"))
|
100
|
+
puts "Matched as #{link} as internal" if content_request[:debug]
|
101
|
+
unless redis.sismember("crawled", link) or redis.sismember("queued", link)
|
102
|
+
if queue_counter <= content_request[:crawl_limit].to_i
|
103
|
+
new_request = content_request.clone
|
104
|
+
new_request[:url] = link
|
105
|
+
new_request[:parent] = content_request[:url]
|
106
|
+
Resque.enqueue(CrawlJob, new_request)
|
107
|
+
redis.sadd "queued", link
|
108
|
+
redis.incr "queue-counter"
|
109
|
+
queue_counter += 1
|
110
|
+
end
|
35
111
|
end
|
36
112
|
end
|
37
113
|
end
|
38
114
|
end
|
39
115
|
|
40
116
|
# enqueue to processing queue
|
41
|
-
Resque.enqueue(const_get(content_request[:processing_queue]), content.merge({:source_id => content_request[:source_id]}))
|
117
|
+
Resque.enqueue(const_get(content_request[:processing_queue]), content.merge({:source_id => content_request[:source_id], :crawl_id => content_request[:crawl_id]}))
|
42
118
|
puts "#{content_request[:url]} has been sent for processing." if content_request[:debug]
|
43
119
|
puts "Crawled: #{crawl_counter} Limit: #{content_request[:crawl_limit]} Queued: #{queue_counter}" if content_request[:debug]
|
44
120
|
|
121
|
+
|
45
122
|
else
|
46
123
|
puts "Crawl Limit Exceeded by #{crawl_counter - content_request[:crawl_limit].to_i} objects" if content_request[:debug]
|
47
124
|
end
|
48
125
|
else
|
49
126
|
puts "Already crawled #{content_request[:url]}" if content_request[:debug]
|
50
127
|
end
|
128
|
+
|
129
|
+
# detect finished state
|
130
|
+
|
131
|
+
if queue_counter == crawl_counter or content_request[:crawl_limit].to_i <= crawl_counter
|
132
|
+
|
133
|
+
puts "queue_counter: #{queue_counter}"
|
134
|
+
puts "crawl_counter: #{crawl_counter}"
|
135
|
+
puts "crawl_limit: #{content_request[:crawl_limit]}"
|
136
|
+
|
137
|
+
# finished
|
138
|
+
puts "FINISHED"
|
139
|
+
stats = redis.hgetall "statistics"
|
140
|
+
stats[:total_pages] = redis.get "total_pages"
|
141
|
+
stats[:total_assets] = redis.get "total_assets"
|
142
|
+
stats[:crawl_counter] = redis.get "crawl_counter"
|
143
|
+
stats[:queue_counter] = redis.get "queue_counter"
|
144
|
+
stats[:crawled] = redis.smembers "crawled"
|
145
|
+
|
146
|
+
Resque.enqueue(const_get(content_request[:crawl_finished_queue]), stats.merge({:source_id => content_request[:source_id]}))
|
147
|
+
|
148
|
+
ap stats
|
149
|
+
end
|
51
150
|
end
|
52
151
|
|
53
152
|
private
|
data/lib/namespaced_redis.rb
CHANGED
@@ -13,6 +13,14 @@ class NamespacedRedis
|
|
13
13
|
@redis.sadd namespaced(key), value
|
14
14
|
end
|
15
15
|
|
16
|
+
def srem(key, member)
|
17
|
+
@redis.srem namespaced(key), member
|
18
|
+
end
|
19
|
+
|
20
|
+
def smembers(key)
|
21
|
+
@redis.smembers namespaced(key)
|
22
|
+
end
|
23
|
+
|
16
24
|
def get(key)
|
17
25
|
@redis.get namespaced(key)
|
18
26
|
end
|
@@ -29,6 +37,22 @@ class NamespacedRedis
|
|
29
37
|
@redis.set namespaced(key), value
|
30
38
|
end
|
31
39
|
|
40
|
+
def hget(key, member)
|
41
|
+
@redis.hget namespaced(key), member
|
42
|
+
end
|
43
|
+
|
44
|
+
def hgetall(key)
|
45
|
+
@redis.hgetall namespaced(key)
|
46
|
+
end
|
47
|
+
|
48
|
+
def hset(key, member, value)
|
49
|
+
@redis.hset namespaced(key), member, value
|
50
|
+
end
|
51
|
+
|
52
|
+
def hexists(key, member)
|
53
|
+
@redis.hexists namespaced(key), member
|
54
|
+
end
|
55
|
+
|
32
56
|
def del(key)
|
33
57
|
@redis.del namespaced(key)
|
34
58
|
end
|
@@ -49,4 +73,4 @@ class NamespacedRedis
|
|
49
73
|
@namespace
|
50
74
|
end
|
51
75
|
|
52
|
-
end
|
76
|
+
end
|
data/spec/cobweb/cobweb_spec.rb
CHANGED
@@ -1,4 +1,5 @@
|
|
1
1
|
require File.expand_path(File.dirname(__FILE__) + '/../spec_helper')
|
2
|
+
require "ap"
|
2
3
|
|
3
4
|
describe CobWeb do
|
4
5
|
|
@@ -50,7 +51,7 @@ describe CobWeb do
|
|
50
51
|
@mock_http_response.stub!(:to_hash).and_return(@default_headers)
|
51
52
|
|
52
53
|
@mock_http_redirect_response.stub!(:code).and_return(301)
|
53
|
-
@mock_http_redirect_response.stub!(:content_type).and_return("text/
|
54
|
+
@mock_http_redirect_response.stub!(:content_type).and_return("text/html")
|
54
55
|
@mock_http_redirect_response.stub!(:[]).with("Content-Type").and_return(@default_headers["Content-Type"])
|
55
56
|
@mock_http_redirect_response.stub!(:[]).with("location").and_return("http://redirected-to.com/redirect2.html")
|
56
57
|
@mock_http_redirect_response.stub!(:content_length).and_return(2048)
|
@@ -58,7 +59,7 @@ describe CobWeb do
|
|
58
59
|
@mock_http_redirect_response.stub!(:to_hash).and_return(@default_headers)
|
59
60
|
|
60
61
|
@mock_http_redirect_response2.stub!(:code).and_return(301)
|
61
|
-
@mock_http_redirect_response2.stub!(:content_type).and_return("text/
|
62
|
+
@mock_http_redirect_response2.stub!(:content_type).and_return("text/html")
|
62
63
|
@mock_http_redirect_response2.stub!(:[]).with("Content-Type").and_return(@default_headers["Content-Type"])
|
63
64
|
@mock_http_redirect_response2.stub!(:[]).with("location").and_return("http://redirected-to.com/redirected.html")
|
64
65
|
@mock_http_redirect_response2.stub!(:content_length).and_return(2048)
|
@@ -90,7 +91,7 @@ describe CobWeb do
|
|
90
91
|
end
|
91
92
|
it "should return correct content-type" do
|
92
93
|
@mock_http_response.stub!(:content_type).and_return("image/jpeg")
|
93
|
-
@cobweb.get(@base_url)[:
|
94
|
+
@cobweb.get(@base_url)[:mime_type].should == "image/jpeg"
|
94
95
|
end
|
95
96
|
it "should return correct status-code" do
|
96
97
|
@mock_http_response.stub!(:code).and_return(404)
|
@@ -104,10 +105,10 @@ describe CobWeb do
|
|
104
105
|
@cobweb.get(@base_url)[:character_set].should == "UTF-8"
|
105
106
|
end
|
106
107
|
it "should return correct content_length" do
|
107
|
-
@cobweb.get(@base_url)[:
|
108
|
+
@cobweb.get(@base_url)[:length].should == 1024
|
108
109
|
end
|
109
110
|
it "should return correct content_body" do
|
110
|
-
@cobweb.get(@base_url)[:
|
111
|
+
@cobweb.get(@base_url)[:body].should == "asdf"
|
111
112
|
end
|
112
113
|
it "should return correct location" do
|
113
114
|
@cobweb.get(@base_url)[:location].should == nil
|
@@ -132,37 +133,38 @@ describe CobWeb do
|
|
132
133
|
@cobweb = CobWeb.new(:follow_redirects => true, :quiet => true, :cache => nil)
|
133
134
|
end
|
134
135
|
|
135
|
-
it "should flow through redirect" do
|
136
|
+
it "should flow through redirect" #do
|
136
137
|
|
137
|
-
|
138
|
+
#@mock_http_client.should_receive(:request).with(@mock_http_redirect_request).and_return(@mock_http_redirect_response)
|
139
|
+
#@mock_http_client.should_receive(:request).with(@mock_http_redirect_request).and_return(@mock_http_redirect_response)
|
140
|
+
#
|
141
|
+
#content = @cobweb.get(@base_url)
|
142
|
+
#content.should be_an_instance_of Hash
|
143
|
+
#ap content
|
144
|
+
#content[:url].should == "http://redirect-me.com/redirect.html"
|
145
|
+
#content[:redirect_through].length.should == 2
|
146
|
+
#content[:mime_type].should == "text/html"
|
147
|
+
#content[:body].should == "asdf"
|
138
148
|
|
139
|
-
|
140
|
-
|
141
|
-
|
142
|
-
|
143
|
-
content
|
144
|
-
content[:
|
145
|
-
content[:content_body].should == "asdf"
|
146
|
-
|
147
|
-
end
|
148
|
-
it "should return the path followed" do
|
149
|
-
@mock_http_client.should_receive(:request).with(@mock_http_redirect_request).and_return(@mock_http_redirect_response)
|
149
|
+
#end
|
150
|
+
it "should return the path followed" #do
|
151
|
+
#@mock_http_client.should_receive(:request).with(@mock_http_redirect_request).and_return(@mock_http_redirect_response)
|
152
|
+
#
|
153
|
+
#content = @cobweb.get(@base_url)
|
154
|
+
#content[:redirect_through].should == ["http://redirected-to.com/redirect2.html", "http://redirected-to.com/redirected.html"]
|
150
155
|
|
151
|
-
|
152
|
-
content[:redirect_through].should == ["http://redirected-to.com/redirect2.html", "http://redirected-to.com/redirected.html"]
|
153
|
-
|
154
|
-
end
|
156
|
+
#end
|
155
157
|
it "should not follow with redirect disabled" do
|
156
158
|
@cobweb = CobWeb.new(:follow_redirects => false, :cache => nil)
|
157
|
-
@mock_http_client.should_receive(:
|
159
|
+
@mock_http_client.should_receive(:start).and_return(@mock_http_redirect_response)
|
158
160
|
|
159
161
|
content = @cobweb.get(@base_url)
|
160
162
|
content[:url].should == "http://redirect-me.com/redirect.html"
|
161
163
|
content[:redirect_through].should be_nil
|
162
164
|
content[:status_code].should == 301
|
163
|
-
content[:
|
164
|
-
content[:
|
165
|
-
|
165
|
+
content[:mime_type].should == "text/html"
|
166
|
+
content[:body].should == "redirected body"
|
167
|
+
|
166
168
|
end
|
167
169
|
end
|
168
170
|
end
|
metadata
CHANGED
@@ -1,146 +1,112 @@
|
|
1
|
-
--- !ruby/object:Gem::Specification
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
2
|
name: cobweb
|
3
|
-
version: !ruby/object:Gem::Version
|
4
|
-
|
5
|
-
prerelease:
|
6
|
-
segments:
|
7
|
-
- 0
|
8
|
-
- 0
|
9
|
-
- 7
|
10
|
-
version: 0.0.7
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.11
|
5
|
+
prerelease:
|
11
6
|
platform: ruby
|
12
|
-
authors:
|
7
|
+
authors:
|
13
8
|
- Stewart McKee
|
14
9
|
autorequire:
|
15
10
|
bindir: bin
|
16
11
|
cert_chain: []
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
dependencies:
|
21
|
-
- !ruby/object:Gem::Dependency
|
12
|
+
date: 2012-01-15 00:00:00.000000000 Z
|
13
|
+
dependencies:
|
14
|
+
- !ruby/object:Gem::Dependency
|
22
15
|
name: resque
|
23
|
-
|
24
|
-
requirement: &id001 !ruby/object:Gem::Requirement
|
16
|
+
requirement: &70356870659640 !ruby/object:Gem::Requirement
|
25
17
|
none: false
|
26
|
-
requirements:
|
27
|
-
- -
|
28
|
-
- !ruby/object:Gem::Version
|
29
|
-
|
30
|
-
segments:
|
31
|
-
- 0
|
32
|
-
version: "0"
|
18
|
+
requirements:
|
19
|
+
- - ! '>='
|
20
|
+
- !ruby/object:Gem::Version
|
21
|
+
version: '0'
|
33
22
|
type: :runtime
|
34
|
-
version_requirements: *id001
|
35
|
-
- !ruby/object:Gem::Dependency
|
36
|
-
name: redis
|
37
23
|
prerelease: false
|
38
|
-
|
24
|
+
version_requirements: *70356870659640
|
25
|
+
- !ruby/object:Gem::Dependency
|
26
|
+
name: redis
|
27
|
+
requirement: &70356870658800 !ruby/object:Gem::Requirement
|
39
28
|
none: false
|
40
|
-
requirements:
|
41
|
-
- -
|
42
|
-
- !ruby/object:Gem::Version
|
43
|
-
|
44
|
-
segments:
|
45
|
-
- 0
|
46
|
-
version: "0"
|
29
|
+
requirements:
|
30
|
+
- - ! '>='
|
31
|
+
- !ruby/object:Gem::Version
|
32
|
+
version: '0'
|
47
33
|
type: :runtime
|
48
|
-
version_requirements: *id002
|
49
|
-
- !ruby/object:Gem::Dependency
|
50
|
-
name: absolutize
|
51
34
|
prerelease: false
|
52
|
-
|
35
|
+
version_requirements: *70356870658800
|
36
|
+
- !ruby/object:Gem::Dependency
|
37
|
+
name: absolutize
|
38
|
+
requirement: &70356870695180 !ruby/object:Gem::Requirement
|
53
39
|
none: false
|
54
|
-
requirements:
|
55
|
-
- -
|
56
|
-
- !ruby/object:Gem::Version
|
57
|
-
|
58
|
-
segments:
|
59
|
-
- 0
|
60
|
-
version: "0"
|
40
|
+
requirements:
|
41
|
+
- - ! '>='
|
42
|
+
- !ruby/object:Gem::Version
|
43
|
+
version: '0'
|
61
44
|
type: :runtime
|
62
|
-
version_requirements: *id003
|
63
|
-
- !ruby/object:Gem::Dependency
|
64
|
-
name: nokogiri
|
65
45
|
prerelease: false
|
66
|
-
|
46
|
+
version_requirements: *70356870695180
|
47
|
+
- !ruby/object:Gem::Dependency
|
48
|
+
name: nokogiri
|
49
|
+
requirement: &70356870693820 !ruby/object:Gem::Requirement
|
67
50
|
none: false
|
68
|
-
requirements:
|
69
|
-
- -
|
70
|
-
- !ruby/object:Gem::Version
|
71
|
-
|
72
|
-
segments:
|
73
|
-
- 0
|
74
|
-
version: "0"
|
51
|
+
requirements:
|
52
|
+
- - ! '>='
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: '0'
|
75
55
|
type: :runtime
|
76
|
-
version_requirements: *id004
|
77
|
-
- !ruby/object:Gem::Dependency
|
78
|
-
name: addressable
|
79
56
|
prerelease: false
|
80
|
-
|
57
|
+
version_requirements: *70356870693820
|
58
|
+
- !ruby/object:Gem::Dependency
|
59
|
+
name: addressable
|
60
|
+
requirement: &70356870692840 !ruby/object:Gem::Requirement
|
81
61
|
none: false
|
82
|
-
requirements:
|
83
|
-
- -
|
84
|
-
- !ruby/object:Gem::Version
|
85
|
-
|
86
|
-
segments:
|
87
|
-
- 0
|
88
|
-
version: "0"
|
62
|
+
requirements:
|
63
|
+
- - ! '>='
|
64
|
+
- !ruby/object:Gem::Version
|
65
|
+
version: '0'
|
89
66
|
type: :runtime
|
90
|
-
|
67
|
+
prerelease: false
|
68
|
+
version_requirements: *70356870692840
|
91
69
|
description:
|
92
70
|
email: stewart@rockwellcottage.com
|
93
71
|
executables: []
|
94
|
-
|
95
72
|
extensions: []
|
96
|
-
|
97
|
-
extra_rdoc_files:
|
73
|
+
extra_rdoc_files:
|
98
74
|
- README.textile
|
99
|
-
files:
|
75
|
+
files:
|
76
|
+
- spec/cobweb/cobweb_spec.rb
|
77
|
+
- spec/cobweb/content_link_parser_spec.rb
|
100
78
|
- spec/samples/sample_html_links.html
|
101
79
|
- spec/spec.opts
|
102
80
|
- spec/spec_helper.rb
|
103
|
-
- spec/cobweb/content_link_parser_spec.rb
|
104
|
-
- spec/cobweb/cobweb_spec.rb
|
105
|
-
- lib/namespaced_redis.rb
|
106
81
|
- lib/cobweb.rb
|
107
|
-
- lib/
|
82
|
+
- lib/cobweb_finished_job.rb
|
108
83
|
- lib/cobweb_process_job.rb
|
84
|
+
- lib/content_link_parser.rb
|
109
85
|
- lib/crawl_job.rb
|
86
|
+
- lib/namespaced_redis.rb
|
110
87
|
- README.textile
|
111
|
-
has_rdoc: false
|
112
88
|
homepage: http://github.com/stewartmckee/cobweb
|
113
89
|
licenses: []
|
114
|
-
|
115
90
|
post_install_message:
|
116
91
|
rdoc_options: []
|
117
|
-
|
118
|
-
require_paths:
|
92
|
+
require_paths:
|
119
93
|
- lib
|
120
|
-
required_ruby_version: !ruby/object:Gem::Requirement
|
94
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
121
95
|
none: false
|
122
|
-
requirements:
|
123
|
-
- -
|
124
|
-
- !ruby/object:Gem::Version
|
125
|
-
|
126
|
-
|
127
|
-
- 0
|
128
|
-
version: "0"
|
129
|
-
required_rubygems_version: !ruby/object:Gem::Requirement
|
96
|
+
requirements:
|
97
|
+
- - ! '>='
|
98
|
+
- !ruby/object:Gem::Version
|
99
|
+
version: '0'
|
100
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
130
101
|
none: false
|
131
|
-
requirements:
|
132
|
-
- -
|
133
|
-
- !ruby/object:Gem::Version
|
134
|
-
|
135
|
-
segments:
|
136
|
-
- 0
|
137
|
-
version: "0"
|
102
|
+
requirements:
|
103
|
+
- - ! '>='
|
104
|
+
- !ruby/object:Gem::Version
|
105
|
+
version: '0'
|
138
106
|
requirements: []
|
139
|
-
|
140
107
|
rubyforge_project:
|
141
|
-
rubygems_version: 1.
|
108
|
+
rubygems_version: 1.8.11
|
142
109
|
signing_key:
|
143
110
|
specification_version: 3
|
144
111
|
summary: Crawler utilizing resque
|
145
112
|
test_files: []
|
146
|
-
|