cobweb 0.0.77 → 1.0.0
Sign up to get free protection for your applications and to get access to all the features.
- data/README.textile +9 -1
- data/lib/cobweb.rb +5 -5
- data/lib/cobweb_crawler.rb +30 -22
- data/lib/cobweb_version.rb +1 -1
- data/lib/server.rb +4 -4
- data/lib/stats.rb +2 -2
- data/spec/cobweb/cobweb_job_spec.rb +1 -1
- data/views/home.haml +1 -1
- data/views/statistics.haml +11 -9
- metadata +24 -24
data/README.textile
CHANGED
@@ -1,5 +1,5 @@
|
|
1
1
|
|
2
|
-
h1. Cobweb
|
2
|
+
h1. Cobweb v1.0.0
|
3
3
|
|
4
4
|
"@cobweb_gem":https://twitter.com/cobweb_gem
|
5
5
|
|
@@ -100,6 +100,8 @@ Creates a new crawler object based on a base_url
|
|
100
100
|
** :user_agent - user agent string to match in robots.txt (not sent as user_agent of requests yet) (default: cobweb)
|
101
101
|
** :crawl_limit_by_page - sets the crawl counter to only use html page types when counting objects crawled
|
102
102
|
** :valid_mime_types - an array of mime types that takes wildcards (eg 'text/*') defaults to ['*/*']
|
103
|
+
** :direct_call_process_job - boolean that specifies whether objects should be passed directly to a processing method or should be put onto a queue
|
104
|
+
|
103
105
|
|
104
106
|
bc. crawler = Cobweb.new(:follow_redirects => false)
|
105
107
|
|
@@ -129,6 +131,12 @@ Simple get that obeys the options supplied in new.
|
|
129
131
|
|
130
132
|
bc. crawler.head("http://www.google.com/")
|
131
133
|
|
134
|
+
|
135
|
+
h4. Processing Queue
|
136
|
+
|
137
|
+
The :processing_queue option specifies the class that contains the resque perform method to pass the content onto. This class should be defined in your application to perform any tasks you wish on the content. There are, however, two options for running this. Firstly, the default settings will push the crawled content onto a resque queue for that class. This gives you the flexibility of running queues on separate machines, etc. The main drawback is that all your content is stored in redis within the queue. This can be memory intensive if you are crawling large sites, or have large content that is being crawled. To get around this, you can specify that the crawl_job calls the perform method on the processing queue class directly, thereby not using memory in redis for the content. This is done with the :direct_call_process_job option. If you set that option to 'true', then instead of the job being queued, it will be executed within the crawl_job queue.
|
138
|
+
|
139
|
+
|
132
140
|
h3. CobwebCrawler
|
133
141
|
|
134
142
|
CobwebCrawler is the standalone crawling class. If you don't want to use redis and just want to crawl the site within your ruby process, you can use this class.
|
data/lib/cobweb.rb
CHANGED
@@ -116,7 +116,7 @@ class Cobweb
|
|
116
116
|
end
|
117
117
|
|
118
118
|
content = {:base_url => url}
|
119
|
-
|
119
|
+
|
120
120
|
# check if it has already been cached
|
121
121
|
if redis.get(unique_id) and @options[:cache]
|
122
122
|
puts "Cache hit for #{url}" unless @options[:quiet]
|
@@ -162,7 +162,7 @@ class Cobweb
|
|
162
162
|
content[:url] = uri.to_s
|
163
163
|
content[:redirect_through] = [] if content[:redirect_through].nil?
|
164
164
|
content[:redirect_through].insert(0, url)
|
165
|
-
|
165
|
+
|
166
166
|
content[:response_time] = Time.now.to_f - request_time
|
167
167
|
else
|
168
168
|
content[:response_time] = Time.now.to_f - request_time
|
@@ -231,7 +231,7 @@ class Cobweb
|
|
231
231
|
content[:mime_type] = "error/dnslookup"
|
232
232
|
content[:headers] = {}
|
233
233
|
content[:links] = {}
|
234
|
-
|
234
|
+
|
235
235
|
rescue Timeout::Error => e
|
236
236
|
puts "ERROR Timeout::Error: #{e.message}"
|
237
237
|
|
@@ -247,8 +247,8 @@ class Cobweb
|
|
247
247
|
content[:headers] = {}
|
248
248
|
content[:links] = {}
|
249
249
|
end
|
250
|
+
content
|
250
251
|
end
|
251
|
-
content
|
252
252
|
end
|
253
253
|
|
254
254
|
# Performs a HTTP HEAD request to the specified url applying the options supplied
|
@@ -368,7 +368,7 @@ class Cobweb
|
|
368
368
|
content[:mime_type] = "error/dnslookup"
|
369
369
|
content[:headers] = {}
|
370
370
|
content[:links] = {}
|
371
|
-
|
371
|
+
|
372
372
|
rescue Timeout::Error => e
|
373
373
|
puts "ERROR Timeout::Error: #{e.message}"
|
374
374
|
|
data/lib/cobweb_crawler.rb
CHANGED
@@ -42,7 +42,7 @@ class CobwebCrawler
|
|
42
42
|
|
43
43
|
@crawl_options = crawl_options
|
44
44
|
|
45
|
-
@redis.sadd("queued", base_url) unless @redis.sismember("crawled", base_url) || @redis.sismember("queued", base_url)
|
45
|
+
@redis.sadd("queued", base_url) unless base_url.nil? || @redis.sismember("crawled", base_url) || @redis.sismember("queued", base_url)
|
46
46
|
crawl_counter = @redis.scard("crawled").to_i
|
47
47
|
queue_counter = @redis.scard("queued").to_i
|
48
48
|
|
@@ -58,34 +58,42 @@ class CobwebCrawler
|
|
58
58
|
begin
|
59
59
|
@stats.update_status("Requesting #{url}...")
|
60
60
|
content = @cobweb.get(url)
|
61
|
-
|
61
|
+
if content.nil?
|
62
|
+
queue_counter = queue_counter - 1 #@redis.scard("queued").to_i
|
63
|
+
else
|
64
|
+
@stats.update_status("Processing #{url}...")
|
62
65
|
|
63
|
-
|
64
|
-
|
66
|
+
@redis.sadd "crawled", url.to_s
|
67
|
+
@redis.incr "crawl-counter"
|
65
68
|
|
66
|
-
|
69
|
+
internal_links = ContentLinkParser.new(url, content[:body]).all_links(:valid_schemes => [:http, :https])
|
67
70
|
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
+
# select the link if it's internal (eliminate external links before expensive lookups in queued and crawled)
|
72
|
+
cobweb_links = CobwebLinks.new(@options)
|
73
|
+
internal_links = internal_links.select{|link| cobweb_links.internal?(link)}
|
71
74
|
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
+
# reject the link if we've crawled it or queued it
|
76
|
+
internal_links.reject!{|link| @redis.sismember("crawled", link)}
|
77
|
+
internal_links.reject!{|link| @redis.sismember("queued", link)}
|
78
|
+
internal_links.reject!{|link| link.nil? || link.empty?}
|
75
79
|
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
80
|
+
internal_links.each do |link|
|
81
|
+
puts "Added #{link.to_s} to queue" if @debug
|
82
|
+
@redis.sadd "queued", link unless link.nil?
|
83
|
+
children = @redis.hget("navigation", url)
|
84
|
+
children = [] if children.nil?
|
85
|
+
children << link
|
86
|
+
@redis.hset "navigation", url, children
|
87
|
+
queue_counter += 1
|
88
|
+
end
|
81
89
|
|
82
|
-
|
83
|
-
|
90
|
+
crawl_counter = crawl_counter + 1 #@redis.scard("crawled").to_i
|
91
|
+
queue_counter = queue_counter - 1 #@redis.scard("queued").to_i
|
84
92
|
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
93
|
+
@stats.update_statistics(content, crawl_counter, queue_counter)
|
94
|
+
@stats.update_status("Completed #{url}.")
|
95
|
+
yield content, @stats.get_statistics if block_given?
|
96
|
+
end
|
89
97
|
rescue => e
|
90
98
|
puts "!!!!!!!!!!!! ERROR !!!!!!!!!!!!!!!!"
|
91
99
|
ap e
|
data/lib/cobweb_version.rb
CHANGED
data/lib/server.rb
CHANGED
@@ -14,7 +14,7 @@ class Server < Sinatra::Base
|
|
14
14
|
@colors = ["#00366f", "#006ba0", "#3F0BDB", "#396CB3"]
|
15
15
|
|
16
16
|
@crawls = []
|
17
|
-
@full_redis.smembers("cobweb_crawls").each do |crawl_id|
|
17
|
+
@full_redis.smembers("cobweb_crawls").each do |crawl_id|
|
18
18
|
version = cobweb_version(crawl_id)
|
19
19
|
redis = Redis::Namespace.new("cobweb-#{version}-#{crawl_id}", :redis => Redis.new(redis_options))
|
20
20
|
stats = HashUtil.deep_symbolize_keys({
|
@@ -69,8 +69,9 @@ class Server < Sinatra::Base
|
|
69
69
|
|
70
70
|
def cobweb_version(crawl_id)
|
71
71
|
redis = Redis.new(redis_options)
|
72
|
-
key = redis.keys("cobweb-*-#{crawl_id}
|
73
|
-
|
72
|
+
key = redis.keys("cobweb-*-#{crawl_id}:queued").first
|
73
|
+
|
74
|
+
key =~ /cobweb-(.*?)-(.*?):queued/
|
74
75
|
cobweb_version = $1
|
75
76
|
end
|
76
77
|
|
@@ -82,7 +83,6 @@ class Server < Sinatra::Base
|
|
82
83
|
def self.start(options={})
|
83
84
|
@options = options
|
84
85
|
@options[:redis_options] = {} unless @options.has_key? :redis_options
|
85
|
-
ap @options
|
86
86
|
unless Server.running?
|
87
87
|
if @options[:run_as_server]
|
88
88
|
puts "Starting Sinatra for cobweb v#{Cobweb.version}"
|
data/lib/stats.rb
CHANGED
@@ -24,13 +24,13 @@ class Stats
|
|
24
24
|
|
25
25
|
# Removes the crawl from the running crawls and updates status
|
26
26
|
def end_crawl(options, cancelled=false)
|
27
|
-
|
27
|
+
#@full_redis.srem "cobweb_crawls", options[:crawl_id]
|
28
28
|
if cancelled
|
29
29
|
@redis.hset "statistics", "current_status", CobwebCrawlHelper::CANCELLED
|
30
30
|
else
|
31
31
|
@redis.hset "statistics", "current_status", CobwebCrawlHelper::FINISHED
|
32
32
|
end
|
33
|
-
|
33
|
+
#@redis.del "crawl_details"
|
34
34
|
end
|
35
35
|
|
36
36
|
def get_crawled
|
data/views/home.haml
CHANGED
data/views/statistics.haml
CHANGED
@@ -136,11 +136,12 @@
|
|
136
136
|
%th Count
|
137
137
|
|
138
138
|
%tbody
|
139
|
-
- @crawl[:statistics][:status_counts]
|
140
|
-
-
|
141
|
-
|
142
|
-
%
|
143
|
-
|
139
|
+
- if @crawl[:statistics] && @crawl[:statistics][:status_counts]
|
140
|
+
- @crawl[:statistics][:status_counts].keys.each do |status|
|
141
|
+
- unless status.nil? || status == ""
|
142
|
+
%tr
|
143
|
+
%td= status
|
144
|
+
%td= @crawl[:statistics][:status_counts][status]
|
144
145
|
|
145
146
|
.medium
|
146
147
|
.box
|
@@ -156,10 +157,11 @@
|
|
156
157
|
%th Count
|
157
158
|
|
158
159
|
%tbody
|
159
|
-
- @crawl[:statistics][:mime_counts]
|
160
|
-
|
161
|
-
%
|
162
|
-
|
160
|
+
- if @crawl[:statistics][:mime_counts]
|
161
|
+
- @crawl[:statistics][:mime_counts].keys.each do |mime_type|
|
162
|
+
%tr
|
163
|
+
%td= mime_type
|
164
|
+
%td= @crawl[:statistics][:mime_counts][mime_type]
|
163
165
|
|
164
166
|
|
165
167
|
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: cobweb
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0
|
4
|
+
version: 1.0.0
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,11 +9,11 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2012-
|
12
|
+
date: 2012-12-19 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: resque
|
16
|
-
requirement: &
|
16
|
+
requirement: &70111197840140 !ruby/object:Gem::Requirement
|
17
17
|
none: false
|
18
18
|
requirements:
|
19
19
|
- - ! '>='
|
@@ -21,10 +21,10 @@ dependencies:
|
|
21
21
|
version: '0'
|
22
22
|
type: :runtime
|
23
23
|
prerelease: false
|
24
|
-
version_requirements: *
|
24
|
+
version_requirements: *70111197840140
|
25
25
|
- !ruby/object:Gem::Dependency
|
26
26
|
name: redis
|
27
|
-
requirement: &
|
27
|
+
requirement: &70111197838420 !ruby/object:Gem::Requirement
|
28
28
|
none: false
|
29
29
|
requirements:
|
30
30
|
- - ! '>='
|
@@ -32,10 +32,10 @@ dependencies:
|
|
32
32
|
version: '0'
|
33
33
|
type: :runtime
|
34
34
|
prerelease: false
|
35
|
-
version_requirements: *
|
35
|
+
version_requirements: *70111197838420
|
36
36
|
- !ruby/object:Gem::Dependency
|
37
37
|
name: nokogiri
|
38
|
-
requirement: &
|
38
|
+
requirement: &70111197837080 !ruby/object:Gem::Requirement
|
39
39
|
none: false
|
40
40
|
requirements:
|
41
41
|
- - ! '>='
|
@@ -43,10 +43,10 @@ dependencies:
|
|
43
43
|
version: '0'
|
44
44
|
type: :runtime
|
45
45
|
prerelease: false
|
46
|
-
version_requirements: *
|
46
|
+
version_requirements: *70111197837080
|
47
47
|
- !ruby/object:Gem::Dependency
|
48
48
|
name: addressable
|
49
|
-
requirement: &
|
49
|
+
requirement: &70111197836140 !ruby/object:Gem::Requirement
|
50
50
|
none: false
|
51
51
|
requirements:
|
52
52
|
- - ! '>='
|
@@ -54,10 +54,10 @@ dependencies:
|
|
54
54
|
version: '0'
|
55
55
|
type: :runtime
|
56
56
|
prerelease: false
|
57
|
-
version_requirements: *
|
57
|
+
version_requirements: *70111197836140
|
58
58
|
- !ruby/object:Gem::Dependency
|
59
59
|
name: rspec
|
60
|
-
requirement: &
|
60
|
+
requirement: &70111197835560 !ruby/object:Gem::Requirement
|
61
61
|
none: false
|
62
62
|
requirements:
|
63
63
|
- - ! '>='
|
@@ -65,10 +65,10 @@ dependencies:
|
|
65
65
|
version: '0'
|
66
66
|
type: :runtime
|
67
67
|
prerelease: false
|
68
|
-
version_requirements: *
|
68
|
+
version_requirements: *70111197835560
|
69
69
|
- !ruby/object:Gem::Dependency
|
70
70
|
name: awesome_print
|
71
|
-
requirement: &
|
71
|
+
requirement: &70111197834960 !ruby/object:Gem::Requirement
|
72
72
|
none: false
|
73
73
|
requirements:
|
74
74
|
- - ! '>='
|
@@ -76,10 +76,10 @@ dependencies:
|
|
76
76
|
version: '0'
|
77
77
|
type: :runtime
|
78
78
|
prerelease: false
|
79
|
-
version_requirements: *
|
79
|
+
version_requirements: *70111197834960
|
80
80
|
- !ruby/object:Gem::Dependency
|
81
81
|
name: sinatra
|
82
|
-
requirement: &
|
82
|
+
requirement: &70111197834160 !ruby/object:Gem::Requirement
|
83
83
|
none: false
|
84
84
|
requirements:
|
85
85
|
- - ! '>='
|
@@ -87,10 +87,10 @@ dependencies:
|
|
87
87
|
version: '0'
|
88
88
|
type: :runtime
|
89
89
|
prerelease: false
|
90
|
-
version_requirements: *
|
90
|
+
version_requirements: *70111197834160
|
91
91
|
- !ruby/object:Gem::Dependency
|
92
92
|
name: thin
|
93
|
-
requirement: &
|
93
|
+
requirement: &70111197833580 !ruby/object:Gem::Requirement
|
94
94
|
none: false
|
95
95
|
requirements:
|
96
96
|
- - ! '>='
|
@@ -98,10 +98,10 @@ dependencies:
|
|
98
98
|
version: '0'
|
99
99
|
type: :runtime
|
100
100
|
prerelease: false
|
101
|
-
version_requirements: *
|
101
|
+
version_requirements: *70111197833580
|
102
102
|
- !ruby/object:Gem::Dependency
|
103
103
|
name: haml
|
104
|
-
requirement: &
|
104
|
+
requirement: &70111197833140 !ruby/object:Gem::Requirement
|
105
105
|
none: false
|
106
106
|
requirements:
|
107
107
|
- - ! '>='
|
@@ -109,10 +109,10 @@ dependencies:
|
|
109
109
|
version: '0'
|
110
110
|
type: :runtime
|
111
111
|
prerelease: false
|
112
|
-
version_requirements: *
|
112
|
+
version_requirements: *70111197833140
|
113
113
|
- !ruby/object:Gem::Dependency
|
114
114
|
name: namespaced_redis
|
115
|
-
requirement: &
|
115
|
+
requirement: &70111197832400 !ruby/object:Gem::Requirement
|
116
116
|
none: false
|
117
117
|
requirements:
|
118
118
|
- - ! '>='
|
@@ -120,10 +120,10 @@ dependencies:
|
|
120
120
|
version: 1.0.2
|
121
121
|
type: :runtime
|
122
122
|
prerelease: false
|
123
|
-
version_requirements: *
|
123
|
+
version_requirements: *70111197832400
|
124
124
|
- !ruby/object:Gem::Dependency
|
125
125
|
name: json
|
126
|
-
requirement: &
|
126
|
+
requirement: &70111197831720 !ruby/object:Gem::Requirement
|
127
127
|
none: false
|
128
128
|
requirements:
|
129
129
|
- - ! '>='
|
@@ -131,7 +131,7 @@ dependencies:
|
|
131
131
|
version: '0'
|
132
132
|
type: :runtime
|
133
133
|
prerelease: false
|
134
|
-
version_requirements: *
|
134
|
+
version_requirements: *70111197831720
|
135
135
|
description: Cobweb is a web crawler that can use resque to cluster crawls to quickly
|
136
136
|
crawl extremely large sites which is much more performant than multi-threaded crawlers. It
|
137
137
|
is also a standalone crawler that has a sophisticated statistics monitoring interface
|