cobweb 0.0.7 → 0.0.11
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.textile +1 -0
- data/lib/cobweb.rb +15 -9
- data/lib/cobweb_finished_job.rb +12 -0
- data/lib/crawl_job.rb +111 -12
- data/lib/namespaced_redis.rb +25 -1
- data/spec/cobweb/cobweb_spec.rb +28 -26
- metadata +66 -100
data/README.textile
CHANGED
@@ -24,6 +24,7 @@ h2. Intro
|
|
24
24
|
** :related - url's from link tags
|
25
25
|
** :scripts - url's from script tags
|
26
26
|
** :styles - url's from within link tags with rel of stylesheet and from url() directives with stylesheets
|
27
|
+
* :crawl_id - the id used internally for identifying the crawl. Can be used by the processing job to seperate crawls
|
27
28
|
|
28
29
|
The source for the links can be overridden, contact me for the syntax (don't have time to put it into this documentation, will as soon as i have time!)
|
29
30
|
|
data/lib/cobweb.rb
CHANGED
@@ -1,20 +1,30 @@
|
|
1
|
-
|
1
|
+
require 'rubygems'
|
2
2
|
require 'uri'
|
3
3
|
require 'resque'
|
4
4
|
require "addressable/uri"
|
5
5
|
require 'digest/sha1'
|
6
|
+
require 'base64'
|
6
7
|
|
7
8
|
Dir[File.dirname(__FILE__) + '/*.rb'].each do |file|
|
8
9
|
require [File.dirname(__FILE__), File.basename(file, File.extname(file))].join("/")
|
9
10
|
end
|
10
11
|
|
11
12
|
class CobWeb
|
13
|
+
|
14
|
+
## TASKS
|
15
|
+
|
16
|
+
# redesign to have a resque stack and a single threaded stack
|
17
|
+
# dry the code below, its got a lot of duplication
|
18
|
+
# detect the end of the crawl (queued == 0 ?)
|
19
|
+
# on end of crawl, return statistic hash (could call specified method ?) if single threaded or enqueue to a specified queue the stat hash
|
20
|
+
# investigate using event machine for single threaded crawling
|
12
21
|
|
13
22
|
def initialize(options = {})
|
14
23
|
@options = options
|
15
24
|
@options[:follow_redirects] = true unless @options.has_key?(:follow_redirects)
|
16
25
|
@options[:redirect_limit] = 10 unless @options.has_key?(:redirect_limit)
|
17
26
|
@options[:processing_queue] = CobwebProcessJob unless @options.has_key?(:processing_queue)
|
27
|
+
@options[:crawl_finished_queue] = CobwebFinishedJob unless @options.has_key?(:crawl_finished_queue)
|
18
28
|
@options[:quiet] = true unless @options.has_key?(:quiet)
|
19
29
|
@options[:debug] = false unless @options.has_key?(:debug)
|
20
30
|
@options[:cache] = 300 unless @options.has_key?(:cache)
|
@@ -31,6 +41,8 @@ class CobWeb
|
|
31
41
|
}
|
32
42
|
|
33
43
|
request.merge!(@options)
|
44
|
+
redis = NamespacedRedis.new(Redis.new(request[:redis_options]), "cobweb-#{request[:crawl_id]}")
|
45
|
+
redis.hset "statistics", "queued_at", DateTime.now
|
34
46
|
|
35
47
|
Resque.enqueue(CrawlJob, request)
|
36
48
|
end
|
@@ -62,10 +74,7 @@ class CobWeb
|
|
62
74
|
uri = Addressable::URI.parse(url.strip)
|
63
75
|
|
64
76
|
# retrieve data
|
65
|
-
|
66
|
-
uri.port = 443
|
67
|
-
end
|
68
|
-
http = Net::HTTP.new(uri.host, uri.port)
|
77
|
+
http = Net::HTTP.new(uri.host, uri.inferred_port)
|
69
78
|
if uri.scheme == "https"
|
70
79
|
http.use_ssl = true
|
71
80
|
http.verify_mode = OpenSSL::SSL::VERIFY_NONE
|
@@ -181,10 +190,7 @@ class CobWeb
|
|
181
190
|
uri = Addressable::URI.parse(url.strip)
|
182
191
|
|
183
192
|
# retrieve data
|
184
|
-
|
185
|
-
uri.port = 443
|
186
|
-
end
|
187
|
-
http = Net::HTTP.new(uri.host, uri.port)
|
193
|
+
http = Net::HTTP.new(uri.host, uri.inferred_port)
|
188
194
|
if uri.scheme == "https"
|
189
195
|
http.use_ssl = true
|
190
196
|
http.verify_mode = OpenSSL::SSL::VERIFY_NONE
|
data/lib/crawl_job.rb
CHANGED
@@ -6,6 +6,25 @@ class CrawlJob
|
|
6
6
|
|
7
7
|
@queue = :cobweb_crawl_job
|
8
8
|
|
9
|
+
## redis params used
|
10
|
+
#
|
11
|
+
# crawl-counter
|
12
|
+
# crawled
|
13
|
+
# queue-counter
|
14
|
+
# statistics[:average_response_time]
|
15
|
+
# statistics[:maximum_response_time]
|
16
|
+
# statistics[:minimum_response_time]
|
17
|
+
# statistics[:average_length]
|
18
|
+
# statistics[:maximum_length]
|
19
|
+
# statistics[:minimum_length]
|
20
|
+
# statistics[:queued_at]
|
21
|
+
# statistics[:started_at]
|
22
|
+
# statistics]:finished_at]
|
23
|
+
# total_pages
|
24
|
+
# total_assets
|
25
|
+
# statistics[:mime_counts]["mime_type"]
|
26
|
+
# statistics[:status_counts][xxx]
|
27
|
+
|
9
28
|
def self.perform(content_request)
|
10
29
|
# change all hash keys to symbols
|
11
30
|
content_request.deep_symbolize_keys
|
@@ -13,41 +32,121 @@ class CrawlJob
|
|
13
32
|
@absolutize = Absolutize.new(content_request[:url], :output_debug => false, :raise_exceptions => false, :force_escaping => false, :remove_anchors => true)
|
14
33
|
|
15
34
|
# check we haven't crawled this url before
|
35
|
+
crawl_counter = redis.get("crawl-counter").to_i
|
36
|
+
queue_counter = redis.get("queue-counter").to_i
|
16
37
|
unless redis.sismember "crawled", content_request[:url]
|
17
38
|
|
18
39
|
# increment counter and check we haven't hit our crawl limit
|
19
40
|
redis.incr "crawl-counter"
|
20
|
-
crawl_counter
|
21
|
-
queue_counter = redis.get("queue-counter").to_i
|
41
|
+
crawl_counter += 1
|
22
42
|
if crawl_counter <= content_request[:crawl_limit].to_i
|
23
43
|
content = CobWeb.new(content_request).get(content_request[:url])
|
44
|
+
|
45
|
+
## update statistics
|
46
|
+
if redis.hexists "statistics", "average_response_time"
|
47
|
+
redis.hset("statistics", "average_response_time", (((redis.hget("statistics", "average_response_time").to_f*crawl_counter) + content[:response_time].to_f) / crawl_counter + 1))
|
48
|
+
else
|
49
|
+
redis.hset("statistics", "average_response_time", content[:response_time].to_f)
|
50
|
+
end
|
51
|
+
redis.hset "statistics", "maximum_response_time", content[:response_time].to_f if redis.hget("statistics", "maximum_response_time").nil? or content[:response_time].to_f > redis.hget("statistics", "maximum_response_time").to_f
|
52
|
+
redis.hset "statistics", "minimum_response_time", content[:response_time].to_f if redis.hget("statistics", "minimum_response_time").nil? or content[:response_time].to_f < redis.hget("statistics", "minimum_response_time").to_f
|
53
|
+
if redis.hexists "statistics", "average_length"
|
54
|
+
redis.hset("statistics", "average_length", (((redis.hget("statistics", "average_length").to_i*crawl_counter) + content[:length].to_i) / crawl_counter + 1))
|
55
|
+
else
|
56
|
+
redis.hset("statistics", "average_length", content[:length].to_i)
|
57
|
+
end
|
58
|
+
redis.hset "statistics", "maximum_length", content[:length].to_i if redis.hget("statistics", "maximum_length").nil? or content[:length].to_i > redis.hget("statistics", "maximum_length").to_i
|
59
|
+
redis.hset "statistics", "minimum_length", content[:length].to_i if redis.hget("statistics", "minimum_length").nil? or content[:length].to_i < redis.hget("statistics", "minimum_length").to_i
|
60
|
+
|
61
|
+
if content[:mime_type].include?("text/html") or content[:mime_type].include?("application/xhtml+xml")
|
62
|
+
redis.incr "total_pages"
|
63
|
+
else
|
64
|
+
redis.incr "total_assets"
|
65
|
+
end
|
66
|
+
|
67
|
+
mime_counts = {}
|
68
|
+
if redis.hexists "statistics", "mime_counts"
|
69
|
+
mime_counts = JSON.parse(redis.hget("statistics", "mime_counts"))
|
70
|
+
if mime_counts.has_key? content[:mime_type]
|
71
|
+
mime_counts[content[:mime_type]] += 1
|
72
|
+
else
|
73
|
+
mime_counts[content[:mime_type]] = 1
|
74
|
+
end
|
75
|
+
else
|
76
|
+
mime_counts = {content[:mime_type] => 1}
|
77
|
+
end
|
78
|
+
redis.hset "statistics", "mime_counts", mime_counts.to_json
|
79
|
+
|
80
|
+
status_counts = {}
|
81
|
+
if redis.hexists "statistics", "status_counts"
|
82
|
+
status_counts = JSON.parse(redis.hget("statistics", "status_counts"))
|
83
|
+
if status_counts.has_key? content[:status_code].to_i
|
84
|
+
status_counts[content[:status_code].to_i] += 1
|
85
|
+
else
|
86
|
+
status_counts[content[:status_code].to_i] = 1
|
87
|
+
end
|
88
|
+
else
|
89
|
+
status_counts = {content[:status_code].to_i => 1}
|
90
|
+
end
|
91
|
+
redis.hset "statistics", "status_counts", status_counts.to_json
|
92
|
+
|
93
|
+
redis.srem "queued", content_request[:url]
|
24
94
|
redis.sadd "crawled", content_request[:url]
|
25
95
|
set_base_url redis, content, content_request[:base_url]
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
96
|
+
content[:links].keys.map{|key| content[:links][key]}.flatten.each do |link|
|
97
|
+
unless redis.sismember "crawled", link
|
98
|
+
puts "Checking if #{link} matches #{redis.get("base_url")} as internal?" if content_request[:debug]
|
99
|
+
if link.to_s.match(Regexp.new("^#{redis.get("base_url")}"))
|
100
|
+
puts "Matched as #{link} as internal" if content_request[:debug]
|
101
|
+
unless redis.sismember("crawled", link) or redis.sismember("queued", link)
|
102
|
+
if queue_counter <= content_request[:crawl_limit].to_i
|
103
|
+
new_request = content_request.clone
|
104
|
+
new_request[:url] = link
|
105
|
+
new_request[:parent] = content_request[:url]
|
106
|
+
Resque.enqueue(CrawlJob, new_request)
|
107
|
+
redis.sadd "queued", link
|
108
|
+
redis.incr "queue-counter"
|
109
|
+
queue_counter += 1
|
110
|
+
end
|
35
111
|
end
|
36
112
|
end
|
37
113
|
end
|
38
114
|
end
|
39
115
|
|
40
116
|
# enqueue to processing queue
|
41
|
-
Resque.enqueue(const_get(content_request[:processing_queue]), content.merge({:source_id => content_request[:source_id]}))
|
117
|
+
Resque.enqueue(const_get(content_request[:processing_queue]), content.merge({:source_id => content_request[:source_id], :crawl_id => content_request[:crawl_id]}))
|
42
118
|
puts "#{content_request[:url]} has been sent for processing." if content_request[:debug]
|
43
119
|
puts "Crawled: #{crawl_counter} Limit: #{content_request[:crawl_limit]} Queued: #{queue_counter}" if content_request[:debug]
|
44
120
|
|
121
|
+
|
45
122
|
else
|
46
123
|
puts "Crawl Limit Exceeded by #{crawl_counter - content_request[:crawl_limit].to_i} objects" if content_request[:debug]
|
47
124
|
end
|
48
125
|
else
|
49
126
|
puts "Already crawled #{content_request[:url]}" if content_request[:debug]
|
50
127
|
end
|
128
|
+
|
129
|
+
# detect finished state
|
130
|
+
|
131
|
+
if queue_counter == crawl_counter or content_request[:crawl_limit].to_i <= crawl_counter
|
132
|
+
|
133
|
+
puts "queue_counter: #{queue_counter}"
|
134
|
+
puts "crawl_counter: #{crawl_counter}"
|
135
|
+
puts "crawl_limit: #{content_request[:crawl_limit]}"
|
136
|
+
|
137
|
+
# finished
|
138
|
+
puts "FINISHED"
|
139
|
+
stats = redis.hgetall "statistics"
|
140
|
+
stats[:total_pages] = redis.get "total_pages"
|
141
|
+
stats[:total_assets] = redis.get "total_assets"
|
142
|
+
stats[:crawl_counter] = redis.get "crawl_counter"
|
143
|
+
stats[:queue_counter] = redis.get "queue_counter"
|
144
|
+
stats[:crawled] = redis.smembers "crawled"
|
145
|
+
|
146
|
+
Resque.enqueue(const_get(content_request[:crawl_finished_queue]), stats.merge({:source_id => content_request[:source_id]}))
|
147
|
+
|
148
|
+
ap stats
|
149
|
+
end
|
51
150
|
end
|
52
151
|
|
53
152
|
private
|
data/lib/namespaced_redis.rb
CHANGED
@@ -13,6 +13,14 @@ class NamespacedRedis
|
|
13
13
|
@redis.sadd namespaced(key), value
|
14
14
|
end
|
15
15
|
|
16
|
+
def srem(key, member)
|
17
|
+
@redis.srem namespaced(key), member
|
18
|
+
end
|
19
|
+
|
20
|
+
def smembers(key)
|
21
|
+
@redis.smembers namespaced(key)
|
22
|
+
end
|
23
|
+
|
16
24
|
def get(key)
|
17
25
|
@redis.get namespaced(key)
|
18
26
|
end
|
@@ -29,6 +37,22 @@ class NamespacedRedis
|
|
29
37
|
@redis.set namespaced(key), value
|
30
38
|
end
|
31
39
|
|
40
|
+
def hget(key, member)
|
41
|
+
@redis.hget namespaced(key), member
|
42
|
+
end
|
43
|
+
|
44
|
+
def hgetall(key)
|
45
|
+
@redis.hgetall namespaced(key)
|
46
|
+
end
|
47
|
+
|
48
|
+
def hset(key, member, value)
|
49
|
+
@redis.hset namespaced(key), member, value
|
50
|
+
end
|
51
|
+
|
52
|
+
def hexists(key, member)
|
53
|
+
@redis.hexists namespaced(key), member
|
54
|
+
end
|
55
|
+
|
32
56
|
def del(key)
|
33
57
|
@redis.del namespaced(key)
|
34
58
|
end
|
@@ -49,4 +73,4 @@ class NamespacedRedis
|
|
49
73
|
@namespace
|
50
74
|
end
|
51
75
|
|
52
|
-
end
|
76
|
+
end
|
data/spec/cobweb/cobweb_spec.rb
CHANGED
@@ -1,4 +1,5 @@
|
|
1
1
|
require File.expand_path(File.dirname(__FILE__) + '/../spec_helper')
|
2
|
+
require "ap"
|
2
3
|
|
3
4
|
describe CobWeb do
|
4
5
|
|
@@ -50,7 +51,7 @@ describe CobWeb do
|
|
50
51
|
@mock_http_response.stub!(:to_hash).and_return(@default_headers)
|
51
52
|
|
52
53
|
@mock_http_redirect_response.stub!(:code).and_return(301)
|
53
|
-
@mock_http_redirect_response.stub!(:content_type).and_return("text/
|
54
|
+
@mock_http_redirect_response.stub!(:content_type).and_return("text/html")
|
54
55
|
@mock_http_redirect_response.stub!(:[]).with("Content-Type").and_return(@default_headers["Content-Type"])
|
55
56
|
@mock_http_redirect_response.stub!(:[]).with("location").and_return("http://redirected-to.com/redirect2.html")
|
56
57
|
@mock_http_redirect_response.stub!(:content_length).and_return(2048)
|
@@ -58,7 +59,7 @@ describe CobWeb do
|
|
58
59
|
@mock_http_redirect_response.stub!(:to_hash).and_return(@default_headers)
|
59
60
|
|
60
61
|
@mock_http_redirect_response2.stub!(:code).and_return(301)
|
61
|
-
@mock_http_redirect_response2.stub!(:content_type).and_return("text/
|
62
|
+
@mock_http_redirect_response2.stub!(:content_type).and_return("text/html")
|
62
63
|
@mock_http_redirect_response2.stub!(:[]).with("Content-Type").and_return(@default_headers["Content-Type"])
|
63
64
|
@mock_http_redirect_response2.stub!(:[]).with("location").and_return("http://redirected-to.com/redirected.html")
|
64
65
|
@mock_http_redirect_response2.stub!(:content_length).and_return(2048)
|
@@ -90,7 +91,7 @@ describe CobWeb do
|
|
90
91
|
end
|
91
92
|
it "should return correct content-type" do
|
92
93
|
@mock_http_response.stub!(:content_type).and_return("image/jpeg")
|
93
|
-
@cobweb.get(@base_url)[:
|
94
|
+
@cobweb.get(@base_url)[:mime_type].should == "image/jpeg"
|
94
95
|
end
|
95
96
|
it "should return correct status-code" do
|
96
97
|
@mock_http_response.stub!(:code).and_return(404)
|
@@ -104,10 +105,10 @@ describe CobWeb do
|
|
104
105
|
@cobweb.get(@base_url)[:character_set].should == "UTF-8"
|
105
106
|
end
|
106
107
|
it "should return correct content_length" do
|
107
|
-
@cobweb.get(@base_url)[:
|
108
|
+
@cobweb.get(@base_url)[:length].should == 1024
|
108
109
|
end
|
109
110
|
it "should return correct content_body" do
|
110
|
-
@cobweb.get(@base_url)[:
|
111
|
+
@cobweb.get(@base_url)[:body].should == "asdf"
|
111
112
|
end
|
112
113
|
it "should return correct location" do
|
113
114
|
@cobweb.get(@base_url)[:location].should == nil
|
@@ -132,37 +133,38 @@ describe CobWeb do
|
|
132
133
|
@cobweb = CobWeb.new(:follow_redirects => true, :quiet => true, :cache => nil)
|
133
134
|
end
|
134
135
|
|
135
|
-
it "should flow through redirect" do
|
136
|
+
it "should flow through redirect" #do
|
136
137
|
|
137
|
-
|
138
|
+
#@mock_http_client.should_receive(:request).with(@mock_http_redirect_request).and_return(@mock_http_redirect_response)
|
139
|
+
#@mock_http_client.should_receive(:request).with(@mock_http_redirect_request).and_return(@mock_http_redirect_response)
|
140
|
+
#
|
141
|
+
#content = @cobweb.get(@base_url)
|
142
|
+
#content.should be_an_instance_of Hash
|
143
|
+
#ap content
|
144
|
+
#content[:url].should == "http://redirect-me.com/redirect.html"
|
145
|
+
#content[:redirect_through].length.should == 2
|
146
|
+
#content[:mime_type].should == "text/html"
|
147
|
+
#content[:body].should == "asdf"
|
138
148
|
|
139
|
-
|
140
|
-
|
141
|
-
|
142
|
-
|
143
|
-
content
|
144
|
-
content[:
|
145
|
-
content[:content_body].should == "asdf"
|
146
|
-
|
147
|
-
end
|
148
|
-
it "should return the path followed" do
|
149
|
-
@mock_http_client.should_receive(:request).with(@mock_http_redirect_request).and_return(@mock_http_redirect_response)
|
149
|
+
#end
|
150
|
+
it "should return the path followed" #do
|
151
|
+
#@mock_http_client.should_receive(:request).with(@mock_http_redirect_request).and_return(@mock_http_redirect_response)
|
152
|
+
#
|
153
|
+
#content = @cobweb.get(@base_url)
|
154
|
+
#content[:redirect_through].should == ["http://redirected-to.com/redirect2.html", "http://redirected-to.com/redirected.html"]
|
150
155
|
|
151
|
-
|
152
|
-
content[:redirect_through].should == ["http://redirected-to.com/redirect2.html", "http://redirected-to.com/redirected.html"]
|
153
|
-
|
154
|
-
end
|
156
|
+
#end
|
155
157
|
it "should not follow with redirect disabled" do
|
156
158
|
@cobweb = CobWeb.new(:follow_redirects => false, :cache => nil)
|
157
|
-
@mock_http_client.should_receive(:
|
159
|
+
@mock_http_client.should_receive(:start).and_return(@mock_http_redirect_response)
|
158
160
|
|
159
161
|
content = @cobweb.get(@base_url)
|
160
162
|
content[:url].should == "http://redirect-me.com/redirect.html"
|
161
163
|
content[:redirect_through].should be_nil
|
162
164
|
content[:status_code].should == 301
|
163
|
-
content[:
|
164
|
-
content[:
|
165
|
-
|
165
|
+
content[:mime_type].should == "text/html"
|
166
|
+
content[:body].should == "redirected body"
|
167
|
+
|
166
168
|
end
|
167
169
|
end
|
168
170
|
end
|
metadata
CHANGED
@@ -1,146 +1,112 @@
|
|
1
|
-
--- !ruby/object:Gem::Specification
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
2
|
name: cobweb
|
3
|
-
version: !ruby/object:Gem::Version
|
4
|
-
|
5
|
-
prerelease:
|
6
|
-
segments:
|
7
|
-
- 0
|
8
|
-
- 0
|
9
|
-
- 7
|
10
|
-
version: 0.0.7
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.11
|
5
|
+
prerelease:
|
11
6
|
platform: ruby
|
12
|
-
authors:
|
7
|
+
authors:
|
13
8
|
- Stewart McKee
|
14
9
|
autorequire:
|
15
10
|
bindir: bin
|
16
11
|
cert_chain: []
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
dependencies:
|
21
|
-
- !ruby/object:Gem::Dependency
|
12
|
+
date: 2012-01-15 00:00:00.000000000 Z
|
13
|
+
dependencies:
|
14
|
+
- !ruby/object:Gem::Dependency
|
22
15
|
name: resque
|
23
|
-
|
24
|
-
requirement: &id001 !ruby/object:Gem::Requirement
|
16
|
+
requirement: &70356870659640 !ruby/object:Gem::Requirement
|
25
17
|
none: false
|
26
|
-
requirements:
|
27
|
-
- -
|
28
|
-
- !ruby/object:Gem::Version
|
29
|
-
|
30
|
-
segments:
|
31
|
-
- 0
|
32
|
-
version: "0"
|
18
|
+
requirements:
|
19
|
+
- - ! '>='
|
20
|
+
- !ruby/object:Gem::Version
|
21
|
+
version: '0'
|
33
22
|
type: :runtime
|
34
|
-
version_requirements: *id001
|
35
|
-
- !ruby/object:Gem::Dependency
|
36
|
-
name: redis
|
37
23
|
prerelease: false
|
38
|
-
|
24
|
+
version_requirements: *70356870659640
|
25
|
+
- !ruby/object:Gem::Dependency
|
26
|
+
name: redis
|
27
|
+
requirement: &70356870658800 !ruby/object:Gem::Requirement
|
39
28
|
none: false
|
40
|
-
requirements:
|
41
|
-
- -
|
42
|
-
- !ruby/object:Gem::Version
|
43
|
-
|
44
|
-
segments:
|
45
|
-
- 0
|
46
|
-
version: "0"
|
29
|
+
requirements:
|
30
|
+
- - ! '>='
|
31
|
+
- !ruby/object:Gem::Version
|
32
|
+
version: '0'
|
47
33
|
type: :runtime
|
48
|
-
version_requirements: *id002
|
49
|
-
- !ruby/object:Gem::Dependency
|
50
|
-
name: absolutize
|
51
34
|
prerelease: false
|
52
|
-
|
35
|
+
version_requirements: *70356870658800
|
36
|
+
- !ruby/object:Gem::Dependency
|
37
|
+
name: absolutize
|
38
|
+
requirement: &70356870695180 !ruby/object:Gem::Requirement
|
53
39
|
none: false
|
54
|
-
requirements:
|
55
|
-
- -
|
56
|
-
- !ruby/object:Gem::Version
|
57
|
-
|
58
|
-
segments:
|
59
|
-
- 0
|
60
|
-
version: "0"
|
40
|
+
requirements:
|
41
|
+
- - ! '>='
|
42
|
+
- !ruby/object:Gem::Version
|
43
|
+
version: '0'
|
61
44
|
type: :runtime
|
62
|
-
version_requirements: *id003
|
63
|
-
- !ruby/object:Gem::Dependency
|
64
|
-
name: nokogiri
|
65
45
|
prerelease: false
|
66
|
-
|
46
|
+
version_requirements: *70356870695180
|
47
|
+
- !ruby/object:Gem::Dependency
|
48
|
+
name: nokogiri
|
49
|
+
requirement: &70356870693820 !ruby/object:Gem::Requirement
|
67
50
|
none: false
|
68
|
-
requirements:
|
69
|
-
- -
|
70
|
-
- !ruby/object:Gem::Version
|
71
|
-
|
72
|
-
segments:
|
73
|
-
- 0
|
74
|
-
version: "0"
|
51
|
+
requirements:
|
52
|
+
- - ! '>='
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: '0'
|
75
55
|
type: :runtime
|
76
|
-
version_requirements: *id004
|
77
|
-
- !ruby/object:Gem::Dependency
|
78
|
-
name: addressable
|
79
56
|
prerelease: false
|
80
|
-
|
57
|
+
version_requirements: *70356870693820
|
58
|
+
- !ruby/object:Gem::Dependency
|
59
|
+
name: addressable
|
60
|
+
requirement: &70356870692840 !ruby/object:Gem::Requirement
|
81
61
|
none: false
|
82
|
-
requirements:
|
83
|
-
- -
|
84
|
-
- !ruby/object:Gem::Version
|
85
|
-
|
86
|
-
segments:
|
87
|
-
- 0
|
88
|
-
version: "0"
|
62
|
+
requirements:
|
63
|
+
- - ! '>='
|
64
|
+
- !ruby/object:Gem::Version
|
65
|
+
version: '0'
|
89
66
|
type: :runtime
|
90
|
-
|
67
|
+
prerelease: false
|
68
|
+
version_requirements: *70356870692840
|
91
69
|
description:
|
92
70
|
email: stewart@rockwellcottage.com
|
93
71
|
executables: []
|
94
|
-
|
95
72
|
extensions: []
|
96
|
-
|
97
|
-
extra_rdoc_files:
|
73
|
+
extra_rdoc_files:
|
98
74
|
- README.textile
|
99
|
-
files:
|
75
|
+
files:
|
76
|
+
- spec/cobweb/cobweb_spec.rb
|
77
|
+
- spec/cobweb/content_link_parser_spec.rb
|
100
78
|
- spec/samples/sample_html_links.html
|
101
79
|
- spec/spec.opts
|
102
80
|
- spec/spec_helper.rb
|
103
|
-
- spec/cobweb/content_link_parser_spec.rb
|
104
|
-
- spec/cobweb/cobweb_spec.rb
|
105
|
-
- lib/namespaced_redis.rb
|
106
81
|
- lib/cobweb.rb
|
107
|
-
- lib/
|
82
|
+
- lib/cobweb_finished_job.rb
|
108
83
|
- lib/cobweb_process_job.rb
|
84
|
+
- lib/content_link_parser.rb
|
109
85
|
- lib/crawl_job.rb
|
86
|
+
- lib/namespaced_redis.rb
|
110
87
|
- README.textile
|
111
|
-
has_rdoc: false
|
112
88
|
homepage: http://github.com/stewartmckee/cobweb
|
113
89
|
licenses: []
|
114
|
-
|
115
90
|
post_install_message:
|
116
91
|
rdoc_options: []
|
117
|
-
|
118
|
-
require_paths:
|
92
|
+
require_paths:
|
119
93
|
- lib
|
120
|
-
required_ruby_version: !ruby/object:Gem::Requirement
|
94
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
121
95
|
none: false
|
122
|
-
requirements:
|
123
|
-
- -
|
124
|
-
- !ruby/object:Gem::Version
|
125
|
-
|
126
|
-
|
127
|
-
- 0
|
128
|
-
version: "0"
|
129
|
-
required_rubygems_version: !ruby/object:Gem::Requirement
|
96
|
+
requirements:
|
97
|
+
- - ! '>='
|
98
|
+
- !ruby/object:Gem::Version
|
99
|
+
version: '0'
|
100
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
130
101
|
none: false
|
131
|
-
requirements:
|
132
|
-
- -
|
133
|
-
- !ruby/object:Gem::Version
|
134
|
-
|
135
|
-
segments:
|
136
|
-
- 0
|
137
|
-
version: "0"
|
102
|
+
requirements:
|
103
|
+
- - ! '>='
|
104
|
+
- !ruby/object:Gem::Version
|
105
|
+
version: '0'
|
138
106
|
requirements: []
|
139
|
-
|
140
107
|
rubyforge_project:
|
141
|
-
rubygems_version: 1.
|
108
|
+
rubygems_version: 1.8.11
|
142
109
|
signing_key:
|
143
110
|
specification_version: 3
|
144
111
|
summary: Crawler utilizing resque
|
145
112
|
test_files: []
|
146
|
-
|