cobweb 1.0.19 → 1.0.20
- checksums.yaml +15 -0
- data/README.textile +3 -4
- data/lib/cobweb.rb +7 -1
- data/lib/cobweb_crawl_helper.rb +18 -13
- data/lib/cobweb_crawler.rb +1 -1
- data/lib/cobweb_version.rb +1 -1
- data/lib/crawl.rb +49 -42
- data/lib/crawl_helper.rb +7 -1
- data/lib/crawl_worker.rb +9 -8
- data/lib/sidekiq/cobweb_helper.rb +7 -0
- data/lib/stats.rb +31 -30
- data/spec/cobweb/cobweb_crawl_helper_spec.rb +4 -2
- data/spec/cobweb/cobweb_crawler_spec.rb +2 -1
- data/spec/cobweb/crawl_job_spec.rb +37 -34
- data/spec/cobweb/crawl_worker_spec.rb +2 -2
- data/spec/cobweb/robots_spec.rb +1 -0
- data/spec/spec_helper.rb +9 -4
- metadata +6 -58
checksums.yaml
ADDED
@@ -0,0 +1,15 @@
+---
+!binary "U0hBMQ==":
+  metadata.gz: !binary |-
+    MDE5MzU3NzI2MTRhYzM5NzIwMDZlMTJjMTg5NzNiMzAyMjFkMjcxOQ==
+  data.tar.gz: !binary |-
+    MWE3ZTAwYjExZjc4NzU2MDYzOTlhOTQwMTNlNTcyZjNmZTYwNmU3Zg==
+SHA512:
+  metadata.gz: !binary |-
+    N2U2MDk1MmI3ZTU3OTFmZDI5YjY4YTEyNDNkMTE4MmJjOTFkNTZiYzNhY2Q4
+    Mjk0ZjM1YThhNzhkMGNjNjJiZTJkNjM1OWQ1MGMzZmVlMDI5MzUyOTU5YTRk
+    NzEzZjBiZjM2OTUxZTc2NzZjZDIyOWQ4ZmVlYzYyOGViMDIyYzY=
+  data.tar.gz: !binary |-
+    ZTJiYmRlNDY0M2FkNTdlN2I0ZjNiODYxOGQyN2MxZGZlMGViMWIxZDA4YmY1
+    ZDc2ZDU3NDc4ODg1YmExYjFmYjMyY2U0MDU4MGQ0OTJkZjRmNjAyYmQ3NWVl
+    ODY0ZTE5MGUzNzAzZWFlMzdmZmY1YzNhMmEzNWE1NzVkYzAwZDE=
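The new checksums.yaml stores base64-encoded hex digests of the two gem archives. A minimal sketch of how one such entry could be checked (the encoded value is the SHA1 data.tar.gz entry above; the verification helper itself is illustrative, not part of the gem):

    require 'base64'
    require 'digest'

    encoded  = "MWE3ZTAwYjExZjc4NzU2MDYzOTlhOTQwMTNlNTcyZjNmZTYwNmU3Zg=="
    expected = Base64.decode64(encoded)   # => "1a7e00b11f78..." (hex string)
    actual   = Digest::SHA1.hexdigest(File.binread("data.tar.gz"))
    puts(actual == expected ? "checksum ok" : "checksum mismatch")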
data/README.textile
CHANGED
@@ -1,5 +1,4 @@
-
-h1. Cobweb v1.0.19
+h1. Cobweb v1.0.20
 
 "@cobweb_gem":https://twitter.com/cobweb_gem
 !https://badge.fury.io/rb/cobweb.png!:http://badge.fury.io/rb/cobweb
@@ -155,7 +154,7 @@ The :processing_queue option is used to specify the class that contains the resq
 
 h3. CobwebCrawler
 
-CobwebCrawler is the standalone crawling class. If you don't want to use
+CobwebCrawler is the standalone crawling class. If you don't want to use resque or sidekiq and just want to crawl the site within your ruby process, you can use this class.
 
 bc. crawler = CobwebCrawler.new(:cache => 600)
 statistics = crawler.crawl("http://www.pepsico.com")
@@ -207,7 +206,7 @@ h2. License
 
 h3. The MIT License
 
-Copyright (c)
+Copyright (c) 2013 Active Information Design
 
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
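The reworded README paragraph covers in-process crawling without resque or sidekiq. A short sketch expanding the `bc.` example above (the block form follows the gem's README; option values and the block signature are assumptions, not verified against this release):

    require 'cobweb'

    crawler    = CobwebCrawler.new(:cache => 600)
    statistics = crawler.crawl("http://www.pepsico.com") do |content, stats|
      # called per crawled page, assuming the documented block arguments
      puts "crawled #{content[:url]}"
    end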
data/lib/cobweb.rb
CHANGED
@@ -1,6 +1,5 @@
 require 'rubygems'
 require 'uri'
-require 'resque'
 require "addressable/uri"
 require 'digest/sha1'
 require 'base64'
@@ -178,6 +177,9 @@ class Cobweb
       raise ":username and :password are required if using basic authentication" unless @options[:username] && @options[:password]
       request.basic_auth @options[:username], @options[:password]
     end
+    if @options[:range]
+      request.set_range(@options[:range])
+    end
 
     response = @http.request request
 
@@ -451,6 +453,10 @@ class Cobweb
     pattern = pattern.gsub("*", ".*?")
     pattern
   end
+
+  def clear_cache
+
+  end
 
   private
   # checks if the mime_type is textual
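The new :range handling passes the option straight through to Net::HTTP's Range support (request.set_range), so a Range of byte offsets can be supplied when building the client. A hedged usage sketch (URL and option value are illustrative):

    require 'cobweb'

    # request only the first kilobyte of the body; servers that honour the
    # Range header reply with 206 Partial Content
    cobweb  = Cobweb.new(:range => 0..1023, :cache => false)
    content = cobweb.get("http://example.com/large_file.bin")
    puts content[:status_code]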
data/lib/cobweb_crawl_helper.rb
CHANGED
@@ -15,15 +15,18 @@ class CobwebCrawlHelper
     @stats = Stats.new(data)
   end
 
-  def destroy
-
+  def destroy
+    options = @data
     options[:queue_name] = "cobweb_crawl_job" unless options.has_key?(:queue_name)
-
+    if RESQUE_INSTALLED
+      options[:finished_resque_queue] = CobwebFinishedJob unless options.has_key?(:finished_resque_queue)
+    end
 
     # set status as cancelled now so that we don't enqueue any further pages
     self.statistics.end_crawl(@data, true)
 
-
+
+    if options[:finished_resque_queue] && options[:queue_system] == :resque && RESQUE_INSTALLED
 
       additional_stats = {:crawl_id => id, :crawled_base_url => @stats.redis.get("crawled_base_url")}
       additional_stats[:redis_options] = @data[:redis_options] unless @data[:redis_options] == {}
@@ -38,15 +41,17 @@ class CobwebCrawlHelper
       sleep 1
       counter += 1
     end
-
-
-      position
-
-
-
-
-
-
+    if options[:queue_system] == :resque && RESQUE_INSTALLED
+      position = Resque.size(options[:queue_name])
+      until position == 0
+        position-=BATCH_SIZE
+        position = 0 if position < 0
+        job_items = Resque.peek(options[:queue_name], position, BATCH_SIZE)
+        job_items.each do |item|
+          if item["args"][0]["crawl_id"] == id
+            # remove this job from the queue
+            Resque.dequeue(CrawlJob, item["args"][0])
+          end
         end
       end
     end
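The rewritten destroy walks the Resque queue backwards in fixed-size windows, peeking each batch and dequeuing only jobs that carry this crawl's id. A condensed sketch of the same pattern (method name and BATCH_SIZE value are illustrative; Resque.size, Resque.peek and Resque.dequeue are the real API):

    require 'resque'

    BATCH_SIZE = 200  # assumed; the gem defines its own constant

    def remove_crawl_jobs(queue_name, crawl_id)
      position = Resque.size(queue_name)
      until position == 0
        position -= BATCH_SIZE
        position = 0 if position < 0
        Resque.peek(queue_name, position, BATCH_SIZE).each do |item|
          # each queued payload carries its crawl_id as the first argument
          Resque.dequeue(CrawlJob, item["args"][0]) if item["args"][0]["crawl_id"] == crawl_id
        end
      end
    end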
data/lib/cobweb_crawler.rb
CHANGED
@@ -122,7 +122,7 @@ class CobwebCrawler
 
         if @options[:store_inbound_links]
           document_links.each do |target_link|
-            target_uri = UriHelper.parse(target_link)
+            target_uri = UriHelper.parse(target_link).normalize
            @redis.sadd("inbound_links_#{Digest::MD5.hexdigest(target_uri.to_s)}", UriHelper.parse(url).to_s)
           end
         end
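The added .normalize call matters because inbound links are keyed by the MD5 of the URI string; without normalisation, equivalent spellings of one URL hash to different keys and the link sets are split. Assuming UriHelper wraps Addressable::URI (which cobweb.rb requires), the effect looks like:

    require 'addressable/uri'
    require 'digest/md5'

    a = Addressable::URI.parse("HTTP://Example.COM/a%62c").normalize
    b = Addressable::URI.parse("http://example.com/abc").normalize
    a.to_s                                                          # => "http://example.com/abc"
    Digest::MD5.hexdigest(a.to_s) == Digest::MD5.hexdigest(b.to_s)  # => true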
data/lib/cobweb_version.rb
CHANGED
data/lib/crawl.rb
CHANGED
@@ -30,7 +30,6 @@ module CobwebModule
       already_crawled?(link) || already_queued?(link) || already_running?(link)
     end
 
-
     # Returns true if the crawl count is within limits
     def within_crawl_limits?
       @options[:crawl_limit].nil? || crawl_counter < @options[:crawl_limit].to_i
@@ -62,15 +61,15 @@ module CobwebModule
 
       unless already_running? @options[:url]
         unless already_crawled? @options[:url]
-
+          update_queues
           if within_crawl_limits?
+            @redis.sadd("currently_running", @options[:url])
             @stats.update_status("Retrieving #{@options[:url]}...")
-
-
-
-
-
-            update_queues
+            @content = Cobweb.new(@options).get(@options[:url], @options)
+            update_counters
+
+            if @options[:url] == @redis.get("original_base_url")
+              @redis.set("crawled_base_url", @content[:base_url])
             end
 
             if content.permitted_type?
@@ -80,9 +79,15 @@ module CobwebModule
             return true
           end
         else
+          puts "======================================="
+          puts "OUTWITH CRAWL LIMITS"
+          puts "======================================="
           decrement_queue_counter
         end
       else
+        puts "======================================="
+        puts "ALREADY CRAWLED"
+        puts "======================================="
         decrement_queue_counter
       end
     else
@@ -108,26 +113,28 @@ module CobwebModule
       internal_links = document_links.select{ |link| @cobweb_links.internal?(link) }
 
       # reject the link if we've crawled it or queued it
-
-      internal_links.reject! { |link|
-
-
-
-      if
-
-
-
-
+
+      internal_links.reject! { |link| already_handled?(link)}
+
+      lock("internal-links") do
+        internal_links.each do |link|
+          if within_queue_limits? && !already_handled?(link)
+            if status != CobwebCrawlHelper::CANCELLED
+              yield link if block_given?
+              unless link.nil?
+                @redis.sadd "queued", link
+                increment_queue_counter
+              end
+            else
+              debug_puts "Cannot enqueue new content as crawl has been cancelled."
             end
-        else
-          debug_puts "Cannot enqueue new content as crawl has been cancelled."
           end
         end
       end
 
      if @options[:store_inbound_links]
        document_links.each do |link|
-          uri = URI.parse(link)
+          uri = URI.parse(link).normalize
          @redis.sadd("inbound_links_#{Digest::MD5.hexdigest(uri.to_s)}", url)
        end
      end
@@ -140,25 +147,25 @@ module CobwebModule
     end
 
     def update_queues
-
+      lock("update_queues") do
       #@redis.incr "inprogress"
       # move the url from the queued list to the crawled list - for both the original url, and the content url (to handle redirects)
       @redis.srem "queued", @options[:url]
       @redis.sadd "crawled", @options[:url]
-
-      @redis.srem "queued", content.url
-      @redis.sadd "crawled", content.url
-      end
+
       # increment the counter if we are not limiting by page only || we are limiting count by page and it is a page
-
-
-
-
-
+      end
+    end
+
+    def update_counters
+      if @options[:crawl_limit_by_page]
+        if content.mime_type.match("text/html")
          increment_crawl_counter
        end
-
-
+      else
+        increment_crawl_counter
+      end
+      decrement_queue_counter
     end
 
     def to_be_processed?
@@ -166,7 +173,7 @@ module CobwebModule
     end
 
     def process(&block)
-      lock("process") do
+      lock("process-count") do
        if @options[:crawl_limit_by_page]
          if content.mime_type.match("text/html")
            increment_process_counter
@@ -175,10 +182,10 @@ module CobwebModule
           increment_process_counter
         end
         #@redis.sadd "queued", @options[:url]
-
-        yield if block_given?
-        @redis.incr("crawl_job_enqueued_count")
       end
+
+      yield if block_given?
+      @redis.incr("crawl_job_enqueued_count")
     end
 
     def finished_processing
@@ -250,17 +257,17 @@ module CobwebModule
     end
 
     def lock(key, &block)
-
+      debug_puts "REQUESTING LOCK [#{key}]"
       set_nx = @redis.setnx("#{key}_lock", "locked")
-
+      debug_puts "LOCK:#{key}:#{set_nx}"
       while !set_nx
-
+        debug_puts "===== WAITING FOR LOCK [#{key}] ====="
         sleep 0.01
         set_nx = @redis.setnx("#{key}_lock", "locked")
       end
 
-
-      @redis.expire("#{key}_lock",
+      debug_puts "RECEIVED LOCK [#{key}]"
+      @redis.expire("#{key}_lock", 30)
       begin
         result = yield
       ensure
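The lock helper instrumented above is a simple Redis spin lock: SETNX acquires, a 30-second EXPIRE bounds a holder that dies, and the ensure clause (outside the visible hunk) presumably releases the key. A stripped-down sketch of the same pattern under those assumptions:

    require 'redis'

    def with_lock(redis, key)
      sleep 0.01 until redis.setnx("#{key}_lock", "locked")
      redis.expire("#{key}_lock", 30)   # safety net if the holder crashes
      begin
        yield
      ensure
        redis.del("#{key}_lock")        # assumed release, mirroring the setnx above
      end
    end

    # with_lock(Redis.new, "update_queues") { ...critical section... }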
data/lib/crawl_helper.rb
CHANGED
@@ -150,7 +150,13 @@ class CrawlHelper
       elsif content_request[:use_encoding_safe_process_job]
         content_to_send[:body] = Base64.encode64(content[:body])
         content_to_send[:processing_queue] = content_request[:processing_queue]
-
+        if content_request[:queue_system] == :resque
+          Resque.enqueue(EncodingSafeProcessJob, content_to_send)
+        elsif content_request[:queue_system] == :sidekiq
+          const_get(content_request[:processing_queue]).perform_async(content_to_send)
+        else
+          raise "Unknown queue system: #{content_request[:queue_system]}"
+        end
       else
         if content_request[:queue_system] == :resque
           Resque.enqueue(const_get(content_request[:processing_queue]), content_to_send)
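For the new :sidekiq branch, :processing_queue is resolved with const_get and sent perform_async, so it must name a Sidekiq worker class. A minimal sketch of such a class (class name and queue name are illustrative, not from the gem):

    require 'sidekiq'

    class ContentProcessJob
      include Sidekiq::Worker
      sidekiq_options :queue => :cobweb_process_job

      # receives the serialised content hash enqueued by the crawl
      def perform(content_to_send)
        puts "processing #{content_to_send['url']}"
      end
    end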
data/lib/crawl_worker.rb
CHANGED
@@ -25,17 +25,15 @@ class CrawlWorker
 
     # if the crawled object is an object type we are interested
     if @crawl.content.permitted_type?
-
-
-
-      @crawl.
-
+
+      # extract links from content and process them if we are still within queue limits (block will not run if we are outwith limits)
+      @crawl.process_links do |link|
+        @crawl.lock("queue_links") do
         if @crawl.within_crawl_limits? && !@crawl.already_handled?(link)
           # enqueue the links to sidekiq
           @crawl.debug_puts "QUEUED LINK: #{link}"
           enqueue_content(content_request, link)
         end
-
       end
     end
 
@@ -64,7 +62,7 @@ class CrawlWorker
       end
     end
 
-
+    #@crawl.lock("finished") do
       # let the crawl know we're finished with this object
       @crawl.finished_processing
 
@@ -74,7 +72,7 @@ class CrawlWorker
       @crawl.debug_puts "Calling crawl_job finished"
       finished(content_request)
     end
-    end
+    #end
   end
   def self.jobs
     Sidekiq.redis do |conn|
@@ -99,6 +97,9 @@ class CrawlWorker
   # Enqueues the content to the processing queue setup in options
   def send_to_processing_queue(content, content_request)
     content_to_send = content.merge({:internal_urls => content_request[:internal_urls], :redis_options => content_request[:redis_options], :source_id => content_request[:source_id], :crawl_id => content_request[:crawl_id]})
+    content_to_send.keys.each do |key|
+      content_to_send[key] = content_to_send[key].force_encoding('UTF-8') if content_to_send[key].kind_of?(String)
+    end
     if content_request[:direct_call_process_job]
       clazz = content_request[:processing_queue].constantize
       clazz.perform(content_to_send)
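The new loop in send_to_processing_queue retags every string value as UTF-8 before the hash reaches the queue serialiser; binary-tagged strings from HTTP bodies otherwise tend to break JSON serialisation. A self-contained illustration of the same pass (values are made up):

    content_to_send = {
      :url         => "http://example.com/caf\xC3\xA9".force_encoding("ASCII-8BIT"),
      :status_code => 200
    }
    content_to_send.keys.each do |key|
      value = content_to_send[key]
      content_to_send[key] = value.force_encoding('UTF-8') if value.kind_of?(String)
    end
    content_to_send[:url].encoding   # => #<Encoding:UTF-8>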
data/lib/sidekiq/cobweb_helper.rb
CHANGED
@@ -6,6 +6,13 @@ else
   SIDEKIQ_INSTALLED = false
   puts "sidekiq gem not installed, skipping crawl_worker specs"
 end
+if Gem::Specification.find_all_by_name("resque", ">=1.0.0").count >= 1
+  RESQUE_INSTALLED = true
+  require 'resque'
+else
+  RESQUE_INSTALLED = false
+  puts "resque gem not installed, skipping crawl_job specs"
+end
 
 module Sidekiq
   module Worker
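The guard added here mirrors the existing sidekiq check: probe the installed gem list once at load time and expose a constant for specs and runtime branches. A generalised version of the same idea (helper name is illustrative):

    def gem_available?(name, requirement = ">= 0")
      Gem::Specification.find_all_by_name(name, requirement).any?
    end

    RESQUE_INSTALLED = gem_available?("resque", ">= 1.0.0")
    require 'resque' if RESQUE_INSTALLED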
data/lib/stats.rb
CHANGED
@@ -1,17 +1,21 @@
 # Stats class is the main statisitics hub for monitoring crawls. Either can be viewed through the Sinatra interface, or returned from the CobwebCrawler.crawl method or block
 class Stats
   require 'json'
-
+
   attr_reader :redis
-
+
   # Sets up redis usage for statistics
   def initialize(options)
     options[:redis_options] = {} unless options.has_key? :redis_options
-
+    if options[:redis]
+      @full_redis = options[:redis]
+    else
+      @full_redis = Redis.new(options[:redis_options])
+    end
     @lock = Mutex.new
     @redis = Redis::Namespace.new("cobweb-#{Cobweb.version}-#{options[:crawl_id]}", :redis => @full_redis)
   end
-
+
   # Sets up the crawl in statistics
   def start_crawl(options)
     unless @full_redis.sismember "cobweb_crawls", options[:crawl_id]
@@ -23,7 +27,7 @@ class Stats
     @redis.hset "statistics", "crawl_started_at", DateTime.now
     @redis.hset "statistics", "current_status", CobwebCrawlHelper::STARTING
   end
-
+
   # Removes the crawl from the running crawls and updates status
   def end_crawl(options, cancelled=false)
     #@full_redis.srem "cobweb_crawls", options[:crawl_id]
@@ -35,21 +39,21 @@ class Stats
     @redis.hset "statistics", "crawl_finished_at", DateTime.now
     #@redis.del "crawl_details"
   end
-
+
   def get_crawled
     @redis.smembers "crawled"
   end
 
   def inbound_links_for(url)
-    uri = UriHelper.parse(url)
+    uri = UriHelper.parse(url).normalize
     @redis.smembers("inbound_links_#{Digest::MD5.hexdigest(uri.to_s)}")
   end
 
-  # Returns statistics hash. update_statistics takes the content hash, extracts statistics from it and updates redis with the data.
+  # Returns statistics hash.  update_statistics takes the content hash, extracts statistics from it and updates redis with the data.
   def update_statistics(content, crawl_counter=@redis.scard("crawled").to_i, queue_counter=@redis.scard("queued").to_i)
     @lock.synchronize {
       @statistics = get_statistics
-
+
       if @statistics.has_key? :average_response_time
         @statistics[:average_response_time] = (((@redis.hget("statistics", "average_response_time").to_f*crawl_counter) + content[:response_time].to_f) / (crawl_counter + 1))
       else
@@ -64,7 +68,7 @@ class Stats
       end
       @statistics[:maximum_length] = content[:length].to_i if @redis.hget("statistics", "maximum_length").nil? or content[:length].to_i > @statistics[:maximum_length].to_i
       @statistics[:minimum_length] = content[:length].to_i if @redis.hget("statistics", "minimum_length").nil? or content[:length].to_i < @statistics[:minimum_length].to_i
-
+
       if content[:mime_type].include?("text/html") or content[:mime_type].include?("application/xhtml+xml")
         @statistics[:page_count] = @statistics[:page_count].to_i + 1
         @statistics[:page_size] = @statistics[:page_size].to_i + content[:length].to_i
@@ -74,14 +78,14 @@ class Stats
         @statistics[:asset_size] = @statistics[:asset_size].to_i + content[:length].to_i
         increment_time_stat("assets_count")
       end
-
+
       total_redirects = @statistics[:total_redirects].to_i
       @statistics[:total_redirects] = 0 if total_redirects.nil?
       @statistics[:total_redirects] = total_redirects += content[:redirect_through].count unless content[:redirect_through].nil?
 
       @statistics[:crawl_counter] = crawl_counter
       @statistics[:queue_counter] = queue_counter
-
+
       total_length = @statistics[:total_length].to_i
       @statistics[:total_length] = total_length + content[:length].to_i
 
@@ -117,7 +121,7 @@ class Stats
       elsif content[:mime_type].cobweb_starts_with? "video"
         increment_time_stat("mime_video_count")
       end
-
+
       status_counts = {}
       if @statistics.has_key? :status_counts
         status_counts = @statistics[:status_counts]
@@ -126,11 +130,11 @@ class Stats
           status_counts[status_code] += 1
         else
           status_counts[status_code] = 1
-          end
+        end
       else
         status_counts = {status_code => 1}
       end
-
+
       # record statistics by status type
       if content[:status_code] >= 200 && content[:status_code] < 300
         increment_time_stat("status_200_count")
@@ -139,21 +143,21 @@ class Stats
       elsif content[:status_code] >= 500 && content[:status_code] < 600
         increment_time_stat("status_500_count")
       end
-
+
       @statistics[:status_counts] = status_counts.to_json
-
+
       ## time based statistics
       increment_time_stat("minute_totals", "minute", 60)
-
+
       redis_command = "@redis.hmset 'statistics', #{@statistics.keys.map{|key| "'#{key}', '#{@statistics[key].to_s.gsub("'","''")}'"}.join(", ")}"
       instance_eval redis_command
     }
     @statistics
   end
-
+
   # Returns the statistics hash
   def get_statistics
-
+
     statistics = HashUtil.deep_symbolize_keys(@redis.hgetall("statistics"))
     if statistics[:status_counts].nil?
       statistics[:status_counts]
@@ -167,23 +171,23 @@ class Stats
     end
     statistics
   end
-
+
   # Sets the current status of the crawl
   def update_status(status)
     @redis.hset("statistics", "current_status", status) unless get_status == CobwebCrawlHelper::CANCELLED
   end
-
+
   # Returns the current status of the crawl
   def get_status
     @redis.hget "statistics", "current_status"
   end
-
+
   # Sets totals for the end of the crawl (Not Used)
   def set_totals
     stats = get_statistics
     stats[:crawled] = @redis.smembers "crawled"
   end
-
+
   private
   # Records a time based statistic
   def record_time_stat(stat_name, value, type="minute", duration=60)
@@ -193,7 +197,7 @@ class Stats
     end
     stat_value = @redis.hget(stat_name, key).to_i
     stat_count = @redis.hget("#{stat_name}-count", key).to_i
-
+
     if minute_count.nil?
       @redis.hset stat_name, key, value
       @redis.hset "#{stat_name}-count", key, 1
@@ -202,7 +206,7 @@ class Stats
       @redis.hset "#{stat_name}-count", key, stat_count+1
     end
   end
-
+
   # Increments a time based statistic (eg pages per minute)
   def increment_time_stat(stat_name, type="minute", duration=60)
     key = DateTime.now.strftime("%Y-%m-%d %H:%M")
@@ -218,12 +222,9 @@ class Stats
     #clear up older data
     @redis.hgetall(stat_name).keys.each do |key|
       if DateTime.parse(key) < DateTime.now-(duration/1440.0)
-        puts "Deleting #{stat_name} - #{key}"
         @redis.hdel(stat_name, key)
       end
     end
   end
-
-  end
-
 
+end
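Stats#initialize now accepts an already-open connection via :redis, falling back to Redis.new(options[:redis_options]) otherwise, so a pooled or mock connection can be reused across crawls. A usage sketch (host and port are illustrative):

    require 'redis'

    shared = Redis.new(:host => "localhost", :port => 6379)
    stats  = Stats.new(:crawl_id => "crawl_0_id", :redis => shared)
    puts stats.get_status   # reads from the cobweb-namespaced keys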
data/spec/cobweb/cobweb_crawl_helper_spec.rb
CHANGED
@@ -3,6 +3,8 @@ require File.expand_path(File.dirname(__FILE__) + '/../spec_helper')
 describe CobwebCrawlHelper do
   include HttpStubs
   before(:each) do
+    pending("not enabled for non resque installs") unless RESQUE_INSTALLED
+
     setup_stubs
   end
   # this spec tests the crawl object
@@ -42,14 +44,14 @@ describe CobwebCrawlHelper do
       end
     end
     after(:each) do
-      Resque.remove_queue("cobweb_crawl_job")
+      Resque.remove_queue("cobweb_crawl_job") if RESQUE_INSTALLED
     end
     it "should have a queue length of 210" do
       Resque.size("cobweb_crawl_job").should == 210
     end
     describe "after called" do
       before(:each) do
-        @crawl = CobwebCrawlHelper.new({:crawl_id => "crawl_0_id"})
+        @crawl = CobwebCrawlHelper.new({:crawl_id => "crawl_0_id", :queue_system => :resque})
         @crawl.destroy
       end
       it "should delete only the crawl specified" do
data/spec/cobweb/cobweb_crawler_spec.rb
CHANGED
@@ -3,6 +3,7 @@ require File.expand_path(File.dirname(__FILE__) + '/../spec_helper')
 describe CobwebCrawler do
 
   before(:each) do
+    pending("thin not installed") unless THIN_INSTALLED
 
     @base_url = "http://localhost:3532/"
 
@@ -55,12 +56,12 @@ describe CobwebCrawler do
   context "storing inbound links" do
 
     before(:each) do
+      pending("thin not installed") unless THIN_INSTALLED
       @crawler = CobwebCrawler.new({:cache => false, :quiet => true, :debug => false, :store_inbound_links => true})
       @statistics = @crawler.crawl(@base_url)
     end
 
     it "should store inbound links" do
-
       @statistics.inbound_links_for("http://localhost:3532/typography.html").should_not be_empty
       @statistics.inbound_links_for("http://localhost:3532/typography.html").sort.should == ["http://localhost:3532/gallery.html", "http://localhost:3532/boxgrid%3Ewithsillyname.html", "http://localhost:3532/more.html", "http://localhost:3532/", "http://localhost:3532/tables.html", "http://localhost:3532/typography.html", "http://localhost:3532/forms.html", "http://localhost:3532/dashboard.html"].sort
     end
data/spec/cobweb/crawl_job_spec.rb
CHANGED
@@ -6,48 +6,51 @@ describe CrawlJob, :local_only => true, :disabled => true do
 
   before(:all) do
     #store all existing resque process ids so we don't kill them afterwards
+    if RESQUE_INSTALLED && THIN_INSTALLED
+
+      @existing_processes = `ps aux | grep resque | grep -v grep | grep -v resque-web | awk '{print $2}'`.split("\n")
+      if Resque.workers.count > 0 && @existing_processes.empty?
+        raise "Ghost workers present in resque, please clear before running specs (Resque::Worker.all.first.prune_dead_workers)"
+      elsif Resque.workers.count == 0 && !@existing_processes.empty?
+        raise "Ghost worker processes present (#{@existing_processes.join(',')})"
+      elsif Resque.workers.count > 0 && !@existing_processes.empty?
+        raise "Resque workers present, please end other resque processes before running this spec"
+      end
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-      io = IO.popen("nohup rake resque:workers INTERVAL=1 PIDFILE=./tmp/pids/resque.pid COUNT=#{RESQUE_WORKER_COUNT} QUEUE=cobweb_crawl_job > log/output.log &")
-
-      counter = 0
-      until counter > 10 || workers_processes_started?
-        print "\rStarting Resque Processes... #{10-counter} "
-        counter += 1
-        sleep 1
-      end
-      puts ""
+      # START WORKERS ONLY FOR CRAWL QUEUE SO WE CAN COUNT ENQUEUED PROCESS AND FINISH QUEUES
+      `mkdir log` unless Dir.exist?(File.expand_path(File.dirname(__FILE__) + '/../../log'))
+      `mkdir tmp` unless Dir.exist?(File.expand_path(File.dirname(__FILE__) + '/../../tmp'))
+      `mkdir tmp/pids` unless Dir.exist?(File.expand_path(File.dirname(__FILE__) + '/../../tmp/pids'))
+      io = IO.popen("nohup rake resque:workers INTERVAL=1 PIDFILE=./tmp/pids/resque.pid COUNT=#{RESQUE_WORKER_COUNT} QUEUE=cobweb_crawl_job > log/output.log &")
+
+      counter = 0
+      until counter > 10 || workers_processes_started?
+        print "\rStarting Resque Processes... #{10-counter} "
+        counter += 1
+        sleep 1
+      end
+      puts ""
 
 
-
-
-
-
-
-
+      counter = 0
+      until counter > 30 || workers_running?
+        print "\rWaiting for Resque Workers... #{30-counter} "
+        counter += 1
+        sleep 1
+      end
+      puts ""
 
-
-
-
+      if workers_running?
+        puts "Workers Running."
+      else
+        raise "Workers didn't appear, please check environment"
+      end
     end
-
   end
 
   before(:each) do
+    pending("Resque not installed") unless RESQUE_INSTALLED
+    pending("thin not installed") unless THIN_INSTALLED
     @base_url = "http://localhost:3532/"
     @base_page_count = 77
 
data/spec/cobweb/crawl_worker_spec.rb
CHANGED
@@ -4,8 +4,7 @@ require File.expand_path(File.dirname(__FILE__) + '/../spec_helper')
 describe CrawlWorker, :local_only => true do
 
   before(:all) do
-
-    if SIDEKIQ_INSTALLED
+    if SIDEKIQ_INSTALLED && THIN_INSTALLED
       #store all existing resque process ids so we don't kill them afterwards
       @existing_processes = `ps aux | grep sidekiq | grep -v grep | awk '{print $2}'`.split("\n")
 
@@ -22,6 +21,7 @@ describe CrawlWorker, :local_only => true do
 
   before(:each) do
     pending("Sidkiq not installed") unless SIDEKIQ_INSTALLED
+    pending("thin not installed") unless THIN_INSTALLED
     @base_url = "http://localhost:3532/"
     @base_page_count = 77
 
data/spec/cobweb/robots_spec.rb
CHANGED
data/spec/spec_helper.rb
CHANGED
@@ -3,7 +3,7 @@ require File.expand_path(File.dirname(__FILE__) + '/../lib/cobweb')
 require File.expand_path(File.dirname(__FILE__) + '/../spec/samples/sample_server')
 require File.expand_path(File.dirname(__FILE__) + '/../spec/http_stubs')
 require 'mock_redis'
-
+
 
 require 'coveralls'
 Coveralls.wear!
@@ -17,9 +17,14 @@ RSpec.configure do |config|
   if ENV["TRAVIS_RUBY_VERSION"] || ENV['CI']
     config.filter_run_excluding :local_only => true
   end
-
-
-
+
+  THIN_INSTALLED = false
+  if Gem::Specification.find_all_by_name("thin", ">=1.0.0").count >= 1
+    require 'thin'
+    THIN_INSTALLED = true
+    Thread.new do
+      @thin ||= Thin::Server.start("0.0.0.0", 3532, SampleServer.app)
+    end
   end
 
   # WAIT FOR START TO COMPLETE
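The spec helper now boots the Thin sample server on port 3532 in a background thread only when the gem is present, and the "WAIT FOR START TO COMPLETE" step that follows needs the port to be accepting connections. One illustrative way to implement such a wait (this helper is not from the gem):

    require 'socket'

    def wait_for_server(host, port, timeout = 10)
      deadline = Time.now + timeout
      begin
        TCPSocket.new(host, port).close
      rescue Errno::ECONNREFUSED
        retry if Time.now < deadline
        raise "sample server did not start on #{host}:#{port}"
      end
    end

    # wait_for_server("127.0.0.1", 3532)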
metadata
CHANGED
@@ -1,20 +1,18 @@
 --- !ruby/object:Gem::Specification
 name: cobweb
 version: !ruby/object:Gem::Version
-  version: 1.0.19
-  prerelease:
+  version: 1.0.20
 platform: ruby
 authors:
 - Stewart McKee
 autorequire:
 bindir: bin
 cert_chain: []
-date:
+date: 2014-08-23 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: redis
   requirement: !ruby/object:Gem::Requirement
-    none: false
     requirements:
     - - ! '>='
       - !ruby/object:Gem::Version
@@ -22,7 +20,6 @@ dependencies:
   type: :runtime
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
-    none: false
     requirements:
     - - ! '>='
       - !ruby/object:Gem::Version
@@ -30,7 +27,6 @@ dependencies:
 - !ruby/object:Gem::Dependency
   name: nokogiri
   requirement: !ruby/object:Gem::Requirement
-    none: false
     requirements:
     - - ! '>='
       - !ruby/object:Gem::Version
@@ -38,7 +34,6 @@ dependencies:
   type: :runtime
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
-    none: false
     requirements:
     - - ! '>='
       - !ruby/object:Gem::Version
@@ -46,7 +41,6 @@ dependencies:
 - !ruby/object:Gem::Dependency
   name: addressable
   requirement: !ruby/object:Gem::Requirement
-    none: false
     requirements:
     - - ! '>='
       - !ruby/object:Gem::Version
@@ -54,23 +48,6 @@ dependencies:
   type: :runtime
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
-    none: false
-    requirements:
-    - - ! '>='
-      - !ruby/object:Gem::Version
-        version: '0'
-- !ruby/object:Gem::Dependency
-  name: rspec
-  requirement: !ruby/object:Gem::Requirement
-    none: false
-    requirements:
-    - - ! '>='
-      - !ruby/object:Gem::Version
-        version: '0'
-  type: :runtime
-  prerelease: false
-  version_requirements: !ruby/object:Gem::Requirement
-    none: false
     requirements:
     - - ! '>='
       - !ruby/object:Gem::Version
@@ -78,7 +55,6 @@ dependencies:
 - !ruby/object:Gem::Dependency
   name: awesome_print
   requirement: !ruby/object:Gem::Requirement
-    none: false
     requirements:
     - - ! '>='
       - !ruby/object:Gem::Version
@@ -86,7 +62,6 @@ dependencies:
   type: :runtime
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
-    none: false
     requirements:
     - - ! '>='
       - !ruby/object:Gem::Version
@@ -94,23 +69,6 @@ dependencies:
 - !ruby/object:Gem::Dependency
   name: sinatra
   requirement: !ruby/object:Gem::Requirement
-    none: false
-    requirements:
-    - - ! '>='
-      - !ruby/object:Gem::Version
-        version: '0'
-  type: :runtime
-  prerelease: false
-  version_requirements: !ruby/object:Gem::Requirement
-    none: false
-    requirements:
-    - - ! '>='
-      - !ruby/object:Gem::Version
-        version: '0'
-- !ruby/object:Gem::Dependency
-  name: thin
-  requirement: !ruby/object:Gem::Requirement
-    none: false
     requirements:
     - - ! '>='
       - !ruby/object:Gem::Version
@@ -118,7 +76,6 @@ dependencies:
   type: :runtime
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
-    none: false
     requirements:
     - - ! '>='
       - !ruby/object:Gem::Version
@@ -126,7 +83,6 @@ dependencies:
 - !ruby/object:Gem::Dependency
   name: haml
   requirement: !ruby/object:Gem::Requirement
-    none: false
     requirements:
     - - ! '>='
      - !ruby/object:Gem::Version
@@ -134,15 +90,13 @@ dependencies:
   type: :runtime
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
-    none: false
     requirements:
     - - ! '>='
       - !ruby/object:Gem::Version
         version: '0'
 - !ruby/object:Gem::Dependency
-  name:
+  name: redis-namespace
   requirement: !ruby/object:Gem::Requirement
-    none: false
     requirements:
     - - ! '>='
       - !ruby/object:Gem::Version
@@ -150,7 +104,6 @@ dependencies:
   type: :runtime
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
-    none: false
     requirements:
     - - ! '>='
       - !ruby/object:Gem::Version
@@ -158,7 +111,6 @@ dependencies:
 - !ruby/object:Gem::Dependency
   name: json
   requirement: !ruby/object:Gem::Requirement
-    none: false
     requirements:
     - - ! '>='
       - !ruby/object:Gem::Version
@@ -166,7 +118,6 @@ dependencies:
   type: :runtime
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
-    none: false
     requirements:
     - - ! '>='
       - !ruby/object:Gem::Version
@@ -174,7 +125,6 @@ dependencies:
 - !ruby/object:Gem::Dependency
   name: slop
   requirement: !ruby/object:Gem::Requirement
-    none: false
     requirements:
     - - ! '>='
       - !ruby/object:Gem::Version
@@ -182,7 +132,6 @@ dependencies:
   type: :runtime
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
-    none: false
     requirements:
     - - ! '>='
       - !ruby/object:Gem::Version
@@ -564,27 +513,26 @@ files:
 homepage: http://github.com/stewartmckee/cobweb
 licenses:
 - MIT
+metadata: {}
 post_install_message:
 rdoc_options: []
 require_paths:
 - lib
 required_ruby_version: !ruby/object:Gem::Requirement
-  none: false
   requirements:
   - - ! '>='
     - !ruby/object:Gem::Version
      version: '0'
 required_rubygems_version: !ruby/object:Gem::Requirement
-  none: false
   requirements:
   - - ! '>='
     - !ruby/object:Gem::Version
      version: '0'
 requirements: []
 rubyforge_project:
-rubygems_version: 1.
+rubygems_version: 2.1.11
 signing_key:
-specification_version:
+specification_version: 4
 summary: Cobweb is a web crawler that can use resque to cluster crawls to quickly
   crawl extremely large sites faster than multi-threaded crawlers. It is also a standalone
   crawler that has a sophisticated statistics monitoring interface to monitor the