cobweb 0.0.54 → 0.0.55
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.textile +4 -1
- data/lib/cobweb.rb +24 -35
- data/lib/cobweb_crawler.rb +5 -0
- data/lib/cobweb_finished_job.rb +2 -0
- data/lib/cobweb_links.rb +8 -1
- data/lib/cobweb_process_job.rb +2 -0
- data/lib/cobweb_version.rb +4 -1
- data/lib/content_link_parser.rb +9 -2
- data/lib/crawl_job.rb +17 -13
- data/lib/encoding_safe_process_job.rb +2 -0
- data/lib/hash_util.rb +16 -0
- data/lib/redirect_error.rb +1 -0
- data/lib/robots.rb +2 -0
- data/lib/server.rb +7 -14
- data/lib/stats.rb +42 -29
- data/lib/uri_helper.rb +2 -0
- data/spec/spec_helper.rb +1 -0
- metadata +22 -21
data/README.textile
CHANGED
@@ -1,5 +1,5 @@
 
-h1. Cobweb v0.0.54
+h1. Cobweb v0.0.55
 
 h2. Intro
 
@@ -14,6 +14,7 @@ h3. Standalone
 CobwebCrawler takes the same options as cobweb itself, so you can use any of the options available for that. An example is listed below.
 
 bq. crawler = CobwebCrawler.new(:cache => 600);
+
 bq. stats = crawler.crawl("http://www.pepsico.com")
 
 While the crawler is running, you can view statistics on http://localhost:4567
@@ -111,6 +112,8 @@ h2. Todo
 * Add ability to start and stop crawls from web interface
 * Allow crawler to start as web interface only (ie not run crawls at start)
 * Fix content encoding issue requiring separate process job
+* DRY the cobweb get/head calls, its got a lot of duplication
+* Investigate using event machine for single threaded crawling
 
 h3. Big changes
 
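For readers skimming this diff, the README usage above condenses to the following standalone sketch. It assumes the cobweb gem is installed and a local Redis instance is available; the :quiet option is taken from the library hunks further down rather than from the README itself:

    require 'cobweb'

    # standalone crawl with response caching for 600 seconds
    crawler = CobwebCrawler.new(:cache => 600, :quiet => true)
    stats = crawler.crawl("http://www.pepsico.com")

    # stats is the statistics hash; the Sinatra monitor runs on
    # http://localhost:4567 while the crawl is in progress
    puts stats.inspect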
data/lib/cobweb.rb
CHANGED
@@ -10,19 +10,15 @@ Dir[File.dirname(__FILE__) + '/**/*.rb'].each do |file|
   require file
 end
 
+# Cobweb class is used to perform get and head requests. You can use this on its own if you wish without the crawler
 class Cobweb
-  ## TASKS
-
-  # redesign to have a resque stack and a single threaded stack
-  # dry the code below, its got a lot of duplication
-  # detect the end of the crawl (queued == 0 ?)
-  # on end of crawl, return statistic hash (could call specified method ?) if single threaded or enqueue to a specified queue the stat hash
-  # investigate using event machine for single threaded crawling
 
+  # retrieves current version
   def self.version
     CobwebVersion.version
   end
 
+  # used for setting default options
   def method_missing(method_sym, *arguments, &block)
     if method_sym.to_s =~ /^default_(.*)_to$/
       tag_name = method_sym.to_s.split("_")[1..-2].join("_").to_sym
@@ -32,6 +28,7 @@ class Cobweb
     end
   end
 
+  # See readme for more information on options available
   def initialize(options = {})
     @options = options
     default_use_encoding_safe_process_job_to false
@@ -49,6 +46,7 @@ class Cobweb
 
   end
 
+  # This method starts the resque based crawl and enqueues the base_url
   def start(base_url)
     raise ":base_url is required" unless base_url
     request = {
@@ -75,7 +73,20 @@ class Cobweb
 
     Resque.enqueue(CrawlJob, request)
   end
+
+  # Returns array of cookies from content
+  def get_cookies(response)
+    all_cookies = response.get_fields('set-cookie')
+    unless all_cookies.nil?
+      cookies_array = Array.new
+      all_cookies.each { |cookie|
+        cookies_array.push(cookie.split('; ')[0])
+      }
+      cookies = cookies_array.join('; ')
+    end
+  end
 
+  # Performs a HTTP GET request to the specified url applying the options supplied
   def get(url, options = @options)
     raise "url cannot be nil" if url.nil?
     uri = Addressable::URI.parse(url)
@@ -103,7 +114,7 @@ class Cobweb
     # check if it has already been cached
     if redis.get(unique_id) and @options[:cache]
       puts "Cache hit for #{url}" unless @options[:quiet]
-      content = deep_symbolize_keys(Marshal.load(redis.get(unique_id)))
+      content = HashUtil.deep_symbolize_keys(Marshal.load(redis.get(unique_id)))
     else
       # retrieve data
       unless @http && @http.address == uri.host && @http.port == uri.inferred_port
@@ -173,7 +184,7 @@ class Cobweb
       content[:body] = Base64.encode64(response.body)
       end
       content[:location] = response["location"]
-      content[:headers] = deep_symbolize_keys(response.to_hash)
+      content[:headers] = HashUtil.deep_symbolize_keys(response.to_hash)
       # parse data for links
       link_parser = ContentLinkParser.new(content[:url], content[:body])
       content[:links] = link_parser.link_data
@@ -233,17 +244,7 @@ class Cobweb
     content
   end
 
-
-    all_cookies = response.get_fields('set-cookie')
-    unless all_cookies.nil?
-      cookies_array = Array.new
-      all_cookies.each { |cookie|
-        cookies_array.push(cookie.split('; ')[0])
-      }
-      cookies = cookies_array.join('; ')
-    end
-  end
-
+  # Performs a HTTP HEAD request to the specified url applying the options supplied
   def head(url, options = @options)
     raise "url cannot be nil" if url.nil?
     uri = Addressable::URI.parse(url)
@@ -271,7 +272,7 @@ class Cobweb
     # check if it has already been cached
     if redis.get("head-#{unique_id}") and @options[:cache]
       puts "Cache hit for #{url}" unless @options[:quiet]
-      content = deep_symbolize_keys(Marshal.load(redis.get("head-#{unique_id}")))
+      content = HashUtil.deep_symbolize_keys(Marshal.load(redis.get("head-#{unique_id}")))
     else
       # retrieve data
       unless @http && @http.address == uri.host && @http.port == uri.inferred_port
@@ -379,18 +380,6 @@ class Cobweb
 
     content
   end
-
-
-  def deep_symbolize_keys(hash)
-    hash.keys.each do |key|
-      value = hash[key]
-      hash.delete(key)
-      hash[key.to_sym] = value
-      if hash[key.to_sym].instance_of? Hash
-        hash[key.to_sym] = deep_symbolize_keys(hash[key.to_sym])
-      end
-    end
-    hash
-  end
-
+
+
 end
data/lib/cobweb_crawler.rb
CHANGED
@@ -3,8 +3,10 @@ require 'date'
 require 'ap'
 #require 'namespaced_redis'
 
+# CobwebCrawler is a standalone crawler, it includes a built in statistics monitor using Sinatra.
 class CobwebCrawler
 
+  # See README for more information on options available
   def initialize(options={})
     @options = options
 
@@ -31,6 +33,7 @@ class CobwebCrawler
     @cobweb = Cobweb.new(@options)
   end
 
+  # Initiates a crawl starting at the base_url and applying the options supplied. Can also take a block that is executed and passed content hash and statistic hash'
   def crawl(base_url, crawl_options = {}, &block)
     @options[:base_url] = base_url unless @options.has_key? :base_url
 
@@ -107,7 +110,9 @@ class CobwebCrawler
 
 end
 
+# Monkey patch into String a starts_with method
 class String
+  # Monkey patch into String a starts_with method
   def cobweb_starts_with?(val)
     if self.length >= val.length
       self[0..val.length-1] == val
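The new comment on crawl documents a block form that receives the content hash and the statistics hash. A hedged sketch of that form (only the :url key of the content hash is confirmed elsewhere in this diff; everything else is illustrative):

    require 'cobweb'

    crawler = CobwebCrawler.new(:cache => 600)

    # the block runs as pages are processed; both arguments are hashes
    statistics = crawler.crawl("http://www.pepsico.com") do |content, stats|
      puts "crawled #{content[:url]}"
      puts "statistics so far: #{stats.inspect}"
    end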
data/lib/cobweb_finished_job.rb
CHANGED
data/lib/cobweb_links.rb
CHANGED
@@ -1,6 +1,8 @@
+
+# CobwebLinks processes links to determine whether they are internal or external links
 class CobwebLinks
 
-  #
+  # Initalise's internal and external patterns and sets up regular expressions
   def initialize(options={})
     @options = options
 
@@ -15,6 +17,7 @@ class CobwebLinks
 
   end
 
+  # Returns true if the link is matched to an internal_url and not matched to an external_url
   def internal?(link)
     if @options[:debug]
       puts "--------------------------------"
@@ -27,6 +30,7 @@ class CobwebLinks
     !@internal_patterns.select{|pattern| link.match(pattern)}.empty? && @external_patterns.select{|pattern| link.match(pattern)}.empty?
   end
 
+  # Returns true if the link is matched to an external_url or not matched to an internal_url
   def external?(link)
     if @options[:debug]
       puts "--------------------------------"
@@ -40,6 +44,7 @@ class CobwebLinks
   end
 
   private
+  # escapes characters with meaning in regular expressions and adds wildcard expression
   def escape_pattern_for_regex(pattern)
     pattern = pattern.gsub(".", "\\.")
     pattern = pattern.gsub("?", "\\?")
@@ -49,8 +54,10 @@ class CobwebLinks
   end
 end
 
+# Exception raised for :internal_urls missing from CobwebLinks
 class InternalUrlsMissingError < Exception
 end
+# Exception raised for :internal_urls being invalid from CobwebLinks
 class InvalidUrlsError < Exception
 end
 
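Based on the comments and exception names added above, CobwebLinks is driven by :internal_urls and :external_urls patterns; the wildcard style below is an assumption inferred from escape_pattern_for_regex, so treat the exact pattern syntax as illustrative:

    require 'cobweb'

    links = CobwebLinks.new(
      :internal_urls => ["http://www.pepsico.com/*"],
      :external_urls => ["http://www.pepsico.com/piwik/*"]
    )

    links.internal?("http://www.pepsico.com/brands")   # expected true
    links.external?("http://www.google.com/")          # expected true
    # CobwebLinks.new({}) would raise InternalUrlsMissingError, per the new comment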
data/lib/cobweb_process_job.rb
CHANGED
@@ -1,8 +1,10 @@
+# Dummy resque process job that is ran if none are specified
 class CobwebProcessJob
   require "ap"
 
   @queue = :cobweb_process_job
 
+  # Resque perform method
   def self.perform(content)
     content = HashHelper.symbolize_keys(content)
     puts "Dummy Processing for #{content[:url]}"
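CobwebProcessJob is only the fallback; the usual pattern with Resque-style jobs like this is to point the crawl at your own class exposing self.perform(content). A hedged sketch, where the class name, queue name and hash keys are assumptions modelled on the dummy job above:

    # hypothetical custom processing job, modelled on CobwebProcessJob
    class MyContentProcessJob
      @queue = :my_content_process_job

      def self.perform(content)
        # content arrives with string keys after the Resque round trip,
        # which is why the dummy job symbolizes them first
        puts "Processing #{content['url']}"
      end
    end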
data/lib/cobweb_version.rb
CHANGED
data/lib/content_link_parser.rb
CHANGED
@@ -1,8 +1,10 @@
+require "nokogiri"
 
+# ContentLinkParser extracts links from HTML content and assigns them to a hash based on the location the link was found. The has contents can be configured in options, however, defaults to a pretty sensible default.
+# Links can also be returned regardless of the location they were located and can be filtered by the scheme
 class ContentLinkParser
 
-
-
+  # Parses the content and absolutizes the urls based on url. Options can be setup to determine the links that are extracted.
   def initialize(url, content, options = {})
     @options = options
     @url = url
@@ -29,6 +31,7 @@ class ContentLinkParser
 
   end
 
+  # Returns a hash with arrays of links
   def link_data
     data = {}
     @options[:tags].keys.each do |key|
@@ -37,6 +40,7 @@ class ContentLinkParser
     data
   end
 
+  # Returns an array of all absolutized links, specify :valid_schemes in options to limit to certain schemes. Also filters repeating folders (ie if the crawler got in a link loop situation)
   def all_links(options = {})
     options[:valid_schemes] = [:http, :https] unless options.has_key? :valid_schemes
     data = link_data
@@ -47,6 +51,7 @@ class ContentLinkParser
     links
   end
 
+  # Returns the type of links as a method rather than using the hash e.g. 'content_link_parser.images'
   def method_missing(m)
     if @options[:tags].keys.include?(m)
       links = []
@@ -60,6 +65,8 @@ class ContentLinkParser
     end
   end
 
+  private
+  # Processes the content to find links based on options[:tags]
   def find_matches(array, selector, attribute)
     if attribute.kind_of? String or attribute.kind_of? Symbol
       @doc.css(selector).each do |tag|
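A short sketch of ContentLinkParser as described by the new comments; the images accessor follows the 'content_link_parser.images' example in the method_missing comment, and the group names ultimately depend on the default :tags configuration, which is not shown in this diff:

    require 'cobweb'

    html   = "<html><body><a href='/about'>About</a><img src='/logo.png'/></body></html>"
    parser = ContentLinkParser.new("http://www.example.com/", html)

    parser.link_data   # hash of link arrays keyed by where each link was found
    parser.all_links   # flat array, restricted to http/https unless :valid_schemes is overridden
    parser.images      # per-group accessor provided through method_missing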
data/lib/crawl_job.rb
CHANGED
@@ -1,3 +1,5 @@
+
+# CrawlJob defines a resque job to perform the crawl
 class CrawlJob
 
   require "net/https"
@@ -7,10 +9,11 @@ class CrawlJob
 
   @queue = :cobweb_crawl_job
 
+  # Resque perform method to maintain the crawl, enqueue found links and detect the end of crawl
   def self.perform(content_request)
 
     # change all hash keys to symbols
-    content_request =
+    content_request = HashUtil.deep_symbolize_keys(content_request)
 
     content_request[:redis_options] = {} unless content_request.has_key? :redis_options
     @redis = NamespacedRedis.new(content_request[:redis_options], "cobweb-#{Cobweb.version}-#{content_request[:crawl_id]}")
@@ -81,12 +84,14 @@ class CrawlJob
 
   end
 
+  # Sets the crawl status to 'Crawl Stopped' and enqueues the crawl finished job
   def self.finished(content_request)
     # finished
     @stats.end_crawl(content_request)
     Resque.enqueue(const_get(content_request[:crawl_finished_queue]), @stats.get_statistics.merge({:redis_options => content_request[:redis_options], :crawl_id => content_request[:crawl_id], :source_id => content_request[:source_id]}))
   end
-
+
+  # Enqueues the content to the processing queue setup in options
   def self.send_to_processing_queue(content, content_request)
     content_to_send = content.merge({:internal_urls => content_request[:internal_urls], :redis_options => content_request[:redis_options], :source_id => content_request[:source_id], :crawl_id => content_request[:crawl_id]})
     if content_request[:use_encoding_safe_process_job]
@@ -102,14 +107,17 @@ class CrawlJob
 
   private
 
+  # Returns true if the crawl count is within limits
   def self.within_crawl_limits?(crawl_limit)
     crawl_limit.nil? or @crawl_counter < crawl_limit.to_i
   end
 
+  # Returns true if the queue count is calculated to be still within limits when complete
   def self.within_queue_limits?(crawl_limit)
     within_crawl_limits?(crawl_limit) and (crawl_limit.nil? or (@queue_counter + @crawl_counter) < crawl_limit.to_i)
   end
 
+  # Sets the base url in redis. If the first page is a redirect, it sets the base_url to the destination
   def self.set_base_url(redis, content, content_request)
     if redis.get("base_url").nil?
       unless content[:redirect_through].nil? || content[:redirect_through].empty? || !content_request[:first_page_redirect_internal]
@@ -120,6 +128,7 @@ class CrawlJob
     end
   end
 
+  # Enqueues content to the crawl_job queue
   def self.enqueue_content(content_request, link)
     new_request = content_request.clone
     new_request[:url] = link
@@ -129,37 +138,32 @@ class CrawlJob
     increment_queue_counter
   end
 
+  # Increments the queue counter and refreshes crawl counters
  def self.increment_queue_counter
     @redis.incr "queue-counter"
     refresh_counters
   end
+  # Increments the crawl counter and refreshes crawl counters
   def self.increment_crawl_counter
     @redis.incr "crawl-counter"
     refresh_counters
   end
+  # Decrements the queue counter and refreshes crawl counters
   def self.decrement_queue_counter
     @redis.decr "queue-counter"
     refresh_counters
   end
+  # Refreshes the crawl counters
   def self.refresh_counters
     @crawl_counter = @redis.get("crawl-counter").to_i
     @queue_counter = @redis.get("queue-counter").to_i
   end
+  # Sets the crawl counters based on the crawled and queued queues
   def self.reset_counters
     @redis.set("crawl-counter", @redis.smembers("crawled").count)
     @redis.set("queue-counter", @redis.smembers("queued").count)
     @crawl_counter = @redis.get("crawl-counter").to_i
     @queue_counter = @redis.get("queue-counter").to_i
   end
-
-    hash.keys.each do |key|
-      value = hash[key]
-      hash.delete(key)
-      hash[key.to_sym] = value
-      if hash[key.to_sym].instance_of? Hash
-        hash[key.to_sym] = self.deep_symbolize_keys(hash[key.to_sym])
-      end
-    end
-    hash
-  end
+
 end
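The two limit predicates documented above read as simple counter checks against an optional crawl_limit. A standalone illustration of the same arithmetic, outside Redis and Resque, with made-up counter values:

    crawl_limit   = 100
    crawl_counter = 60   # pages already crawled
    queue_counter = 45   # links currently queued

    within_crawl_limits = crawl_limit.nil? || crawl_counter < crawl_limit.to_i
    # => true, because 60 < 100

    within_queue_limits = within_crawl_limits &&
      (crawl_limit.nil? || (queue_counter + crawl_counter) < crawl_limit.to_i)
    # => false, because 45 + 60 = 105 is not below 100, so the job would not queue further links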
data/lib/encoding_safe_process_job.rb
CHANGED
@@ -1,7 +1,9 @@
+# Process Job to resolve encoding issue
 class EncodingSafeProcessJob
 
   @queue = :encoding_safe_process_job
 
+  # Resque perform method
   def self.perform(content)
     clazz = const_get(content["processing_queue"])
     content["body"] = Base64.decode64(content["body"])
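This job pairs with the Base64.encode64 call visible in the cobweb.rb hunks: the body is shipped through the queue as plain ASCII and decoded here before the real processing class receives it. A tiny round-trip illustration in plain Ruby:

    require 'base64'

    body    = "caf\xC3\xA9".force_encoding("BINARY")   # arbitrary bytes standing in for a response body
    encoded = Base64.encode64(body)                    # safe to serialise into the Resque payload
    decoded = Base64.decode64(encoded)
    decoded == body                                    # => true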
data/lib/hash_util.rb
ADDED
@@ -0,0 +1,16 @@
+# Collection of utility methods for the Hash object
+class HashUtil
+
+  # Returns a hash with the keys converted to symbols
+  def self.deep_symbolize_keys(hash)
+    hash.keys.each do |key|
+      value = hash[key]
+      hash.delete(key)
+      hash[key.to_sym] = value
+      if hash[key.to_sym].instance_of? Hash
+        hash[key.to_sym] = HashUtil.deep_symbolize_keys(hash[key.to_sym])
+      end
+    end
+    hash
+  end
+end
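Since hash_util.rb is new in this release and now backs the cache and header handling in cobweb.rb, crawl_job.rb and stats.rb, a quick usage sketch (plain Ruby, no dependencies beyond the class above):

    h = { "url" => "http://example.com", "headers" => { "content-type" => ["text/html"] } }
    HashUtil.deep_symbolize_keys(h)
    # => {:url=>"http://example.com", :headers=>{:"content-type"=>["text/html"]}}

Note that the conversion mutates the hash it is given and only recurses into values that are themselves Hash instances.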
data/lib/redirect_error.rb
CHANGED
data/lib/robots.rb
CHANGED
data/lib/server.rb
CHANGED
@@ -1,6 +1,7 @@
 require 'sinatra'
 require 'haml'
 
+# Sinatra server to host the statistics for the CobwebCrawler
 class Server < Sinatra::Base
 
   set :views, settings.root + '/../views'
@@ -8,6 +9,7 @@ class Server < Sinatra::Base
   set :public_folder, settings.root + '/../public'
   enable :static
 
+  # Sinatra Dashboard
   get '/' do
     @full_redis = Redis.new
 
@@ -27,6 +29,7 @@ class Server < Sinatra::Base
     haml :home
   end
 
+  # Sinatra Crawl Detail
   get '/statistics/:crawl_id' do
     redis = NamespacedRedis.new({}, "cobweb-#{params[:crawl_id]}")
 
@@ -58,6 +61,7 @@ class Server < Sinatra::Base
     haml :statistics
   end
 
+  # Starts the Sinatra server, and kills the processes when shutdown
   def self.start
     unless Server.running?
       thread = Thread.new do
@@ -72,21 +76,10 @@ class Server < Sinatra::Base
 
   end
 
-class
-  def self.deep_symbolize_keys(hash)
-    hash.keys.each do |key|
-      value = hash[key]
-      hash.delete(key)
-      hash[key.to_sym] = value
-      if hash[key.to_sym].instance_of? Hash
-        hash[key.to_sym] = HashUtil.deep_symbolize_keys(hash[key.to_sym])
-      end
-    end
-    hash
-  end
-end
-
+# Monkey Patch of the Numeric class
 class Numeric
+
+  #Returns a human readable format for a number representing a data size
   def to_human
     units = %w{B KB MB GB TB}
     ap self
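The Numeric#to_human monkey patch above formats byte counts for the dashboard using the B/KB/MB/GB/TB units shown. The exact output string is not visible in this hunk, so the results below are only indicative:

    1024.to_human        # roughly "1.0 KB" (exact formatting not shown in the diff)
    3_500_000.to_human   # roughly "3.3 MB"

The ap self call in the hunk also means each formatted number is echoed with awesome_print while the server renders.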
data/lib/stats.rb
CHANGED
@@ -1,10 +1,14 @@
+
+# Stats class is the main statisitics hub for monitoring crawls. Either can be viewed through the Sinatra interface, or returned from the CobwebCrawler.crawl method or block
 class Stats
 
+  # Sets up redis usage for statistics
   def initialize(options)
     @full_redis = Redis.new(options[:redis_options])
     @redis = NamespacedRedis.new(options[:redis_options], "cobweb-#{options[:crawl_id]}")
   end
 
+  # Sets up the crawl in statistics
   def start_crawl(options)
     unless @full_redis.sismember "cobweb_crawls", options[:crawl_id]
       @full_redis.sadd "cobweb_crawls", options[:crawl_id]
@@ -15,12 +19,14 @@ class Stats
       @redis.hset "statistics", "current_status", "Crawl Starting..."
     end
 
+  # Removes the crawl from the running crawls and updates status
   def end_crawl(options)
     @full_redis.srem "cobweb_crawls", options[:crawl_id]
     @redis.hset "statistics", "current_status", "Crawl Stopped"
     @redis.del "crawl_details"
   end
 
+  # Returns statistics hash. update_statistics takes the content hash, extracts statistics from it and updates redis with the data.
   def update_statistics(content, crawl_counter=@redis.scard("crawled").to_i, queue_counter=@redis.scard("queued").to_i)
 
     @statistics = get_statistics
@@ -125,6 +131,41 @@ class Stats
     @statistics
   end
 
+  # Returns the statistics hash
+  def get_statistics
+
+    @statistics = HashUtil.deep_symbolize_keys(@redis.hgetall("statistics"))
+    if @statistics[:status_counts].nil?
+      @statistics[:status_counts]
+    else
+      @statistics[:status_counts] = JSON.parse(@statistics[:status_counts])
+    end
+    if @statistics[:mime_counts].nil?
+      @statistics[:mime_counts]
+    else
+      @statistics[:mime_counts] = JSON.parse(@statistics[:mime_counts])
+    end
+    @statistics
+  end
+
+  # Sets the current status of the crawl
+  def update_status(status)
+    @redis.hset "statistics", "current_status", status
+  end
+
+  # Returns the current status of the crawl
+  def get_status
+    @redis.hget "statistics", "current_status"
+  end
+
+  # Sets totals for the end of the crawl (Not Used)
+  def set_totals
+    stats = get_statistics
+    stats[:crawled] = @redis.smembers "crawled"
+  end
+
+  private
+  # Records a time based statistic
   def record_time_stat(stat_name, value, type="minute", duration=60)
     key = DateTime.now.strftime("%Y-%m-%d %H:%M")
     if type == "hour"
@@ -142,6 +183,7 @@ class Stats
     end
   end
 
+  # Increments a time based statistic (eg pages per minute)
   def increment_time_stat(stat_name, type="minute", duration=60)
     key = DateTime.now.strftime("%Y-%m-%d %H:%M")
     if type == "hour"
@@ -162,35 +204,6 @@ class Stats
     end
   end
 
-  def get_statistics
-
-    @statistics = HashUtil.deep_symbolize_keys(@redis.hgetall("statistics"))
-    if @statistics[:status_counts].nil?
-      @statistics[:status_counts]
-    else
-      @statistics[:status_counts] = JSON.parse(@statistics[:status_counts])
-    end
-    if @statistics[:mime_counts].nil?
-      @statistics[:mime_counts]
-    else
-      @statistics[:mime_counts] = JSON.parse(@statistics[:mime_counts])
-    end
-    @statistics
-  end
-
-  def update_status(status)
-    @redis.hset "statistics", "current_status", status
-  end
-
-  def get_status
-    @redis.hget "statistics", "current_status"
-  end
-
-  def set_totals
-    stats = get_statistics
-    stats[:crawled] = @redis.smembers "crawled"
-  end
-
 end
 
 
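The Stats accessors that now sit above the private marker (get_statistics, update_status, get_status) give a direct way to read a crawl's state outside the Sinatra dashboard. A hedged usage sketch, assuming the gem is installed, Redis is running, and a crawl_id from a current crawl:

    require 'cobweb'

    stats = Stats.new(:redis_options => {}, :crawl_id => "example-crawl-id")

    stats.get_status          # current status string, e.g. "Crawl Starting..." or "Crawl Stopped"
    summary = stats.get_statistics
    summary[:status_counts]   # parsed back from JSON as shown above (nil until first recorded)
    summary[:mime_counts]     # likewise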
data/lib/uri_helper.rb
CHANGED
data/spec/spec_helper.rb
CHANGED
metadata
CHANGED
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: cobweb
 version: !ruby/object:Gem::Version
-  version: 0.0.54
+  version: 0.0.55
 prerelease:
 platform: ruby
 authors:
@@ -13,7 +13,7 @@ date: 2012-05-08 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: resque
-  requirement: &
+  requirement: &70166180363640 !ruby/object:Gem::Requirement
     none: false
     requirements:
     - - ! '>='
@@ -21,10 +21,10 @@ dependencies:
         version: '0'
   type: :runtime
   prerelease: false
-  version_requirements: *
+  version_requirements: *70166180363640
 - !ruby/object:Gem::Dependency
   name: redis
-  requirement: &
+  requirement: &70166180362480 !ruby/object:Gem::Requirement
     none: false
     requirements:
     - - ! '>='
@@ -32,10 +32,10 @@ dependencies:
         version: '0'
   type: :runtime
   prerelease: false
-  version_requirements: *
+  version_requirements: *70166180362480
 - !ruby/object:Gem::Dependency
   name: nokogiri
-  requirement: &
+  requirement: &70166180361540 !ruby/object:Gem::Requirement
     none: false
     requirements:
     - - ! '>='
@@ -43,10 +43,10 @@ dependencies:
         version: '0'
   type: :runtime
   prerelease: false
-  version_requirements: *
+  version_requirements: *70166180361540
 - !ruby/object:Gem::Dependency
   name: addressable
-  requirement: &
+  requirement: &70166180359680 !ruby/object:Gem::Requirement
     none: false
     requirements:
     - - ! '>='
@@ -54,10 +54,10 @@ dependencies:
         version: '0'
   type: :runtime
   prerelease: false
-  version_requirements: *
+  version_requirements: *70166180359680
 - !ruby/object:Gem::Dependency
   name: rspec
-  requirement: &
+  requirement: &70166180357240 !ruby/object:Gem::Requirement
     none: false
     requirements:
     - - ! '>='
@@ -65,10 +65,10 @@ dependencies:
         version: '0'
   type: :runtime
   prerelease: false
-  version_requirements: *
+  version_requirements: *70166180357240
 - !ruby/object:Gem::Dependency
   name: awesome_print
-  requirement: &
+  requirement: &70166180380020 !ruby/object:Gem::Requirement
     none: false
     requirements:
     - - ! '>='
@@ -76,10 +76,10 @@ dependencies:
         version: '0'
   type: :runtime
   prerelease: false
-  version_requirements: *
+  version_requirements: *70166180380020
 - !ruby/object:Gem::Dependency
   name: sinatra
-  requirement: &
+  requirement: &70166180377140 !ruby/object:Gem::Requirement
     none: false
     requirements:
     - - ! '>='
@@ -87,10 +87,10 @@ dependencies:
         version: '0'
   type: :runtime
   prerelease: false
-  version_requirements: *
+  version_requirements: *70166180377140
 - !ruby/object:Gem::Dependency
   name: thin
-  requirement: &
+  requirement: &70166180389240 !ruby/object:Gem::Requirement
     none: false
     requirements:
     - - ! '>='
@@ -98,10 +98,10 @@ dependencies:
         version: '0'
   type: :runtime
   prerelease: false
-  version_requirements: *
+  version_requirements: *70166180389240
 - !ruby/object:Gem::Dependency
   name: haml
-  requirement: &
+  requirement: &70166180388200 !ruby/object:Gem::Requirement
     none: false
     requirements:
     - - ! '>='
@@ -109,10 +109,10 @@ dependencies:
         version: '0'
   type: :runtime
   prerelease: false
-  version_requirements: *
+  version_requirements: *70166180388200
 - !ruby/object:Gem::Dependency
   name: namespaced_redis
-  requirement: &
+  requirement: &70166180387040 !ruby/object:Gem::Requirement
     none: false
     requirements:
     - - ! '>='
@@ -120,7 +120,7 @@ dependencies:
         version: 1.0.2
   type: :runtime
   prerelease: false
-  version_requirements: *
+  version_requirements: *70166180387040
 description: Web Crawler that uses resque background job engine to allow you to cluster
   your crawl.
 email: stewart@rockwellcottage.com
@@ -146,6 +146,7 @@ files:
 - lib/content_link_parser.rb
 - lib/crawl_job.rb
 - lib/encoding_safe_process_job.rb
+- lib/hash_util.rb
 - lib/redirect_error.rb
 - lib/robots.rb
 - lib/server.rb