cobweb 0.0.21 → 0.0.22
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/lib/cobweb.rb +18 -10
- data/lib/crawl_job.rb +4 -3
- data/lib/stats.rb +0 -1
- metadata +21 -21
data/lib/cobweb.rb
CHANGED
@@ -5,8 +5,8 @@ require "addressable/uri"
|
|
5
5
|
require 'digest/sha1'
|
6
6
|
require 'base64'
|
7
7
|
|
8
|
-
Dir[File.dirname(__FILE__) + '
|
9
|
-
require
|
8
|
+
Dir[File.dirname(__FILE__) + '/**/*.rb'].each do |file|
|
9
|
+
require file
|
10
10
|
end
|
11
11
|
|
12
12
|
class Cobweb
|
@@ -18,6 +18,10 @@ class Cobweb
|
|
18
18
|
# on end of crawl, return statistic hash (could call specified method ?) if single threaded or enqueue to a specified queue the stat hash
|
19
19
|
# investigate using event machine for single threaded crawling
|
20
20
|
|
21
|
+
def self.version
|
22
|
+
"0.0.22"
|
23
|
+
end
|
24
|
+
|
21
25
|
def initialize(options = {})
|
22
26
|
@options = options
|
23
27
|
@options[:follow_redirects] = true unless @options.has_key?(:follow_redirects)
|
@@ -34,12 +38,12 @@ class Cobweb
|
|
34
38
|
def start(base_url)
|
35
39
|
raise ":base_url is required" unless base_url
|
36
40
|
request = {
|
37
|
-
:crawl_id => Digest::SHA1.hexdigest(Time.now.
|
41
|
+
:crawl_id => Digest::SHA1.hexdigest("#{Time.now.to_i}.#{Time.now.usec}"),
|
38
42
|
:url => base_url
|
39
43
|
}
|
40
44
|
|
41
45
|
request.merge!(@options)
|
42
|
-
@redis = NamespacedRedis.new(Redis.new(request[:redis_options]), "cobweb-#{
|
46
|
+
@redis = NamespacedRedis.new(Redis.new(request[:redis_options]), "cobweb-#{Cobweb.version}-#{request[:crawl_id]}")
|
43
47
|
@redis.hset "statistics", "queued_at", DateTime.now
|
44
48
|
|
45
49
|
Resque.enqueue(CrawlJob, request)
|
@@ -53,13 +57,17 @@ class Cobweb
|
|
53
57
|
|
54
58
|
# get the unique id for this request
|
55
59
|
unique_id = Digest::SHA1.hexdigest(url.to_s)
|
56
|
-
redirect_limit
|
60
|
+
if options.has_key?(:redirect_limit) and !options[:redirect_limit].nil?
|
61
|
+
redirect_limit = options[:redirect_limit].to_i
|
62
|
+
else
|
63
|
+
redirect_limit = 10
|
64
|
+
end
|
57
65
|
|
58
66
|
# connect to redis
|
59
67
|
if options.has_key? :crawl_id
|
60
|
-
redis = NamespacedRedis.new(Redis.new(@options[:redis_options]), "cobweb-#{
|
68
|
+
redis = NamespacedRedis.new(Redis.new(@options[:redis_options]), "cobweb-#{Cobweb.version}-#{options[:crawl_id]}")
|
61
69
|
else
|
62
|
-
redis = NamespacedRedis.new(Redis.new(@options[:redis_options]), "cobweb-#{
|
70
|
+
redis = NamespacedRedis.new(Redis.new(@options[:redis_options]), "cobweb-#{Cobweb.version}")
|
63
71
|
end
|
64
72
|
|
65
73
|
content = {}
|
@@ -107,7 +115,7 @@ class Cobweb
|
|
107
115
|
raise RedirectError, "Redirect Limit reached" if redirect_limit == 0
|
108
116
|
|
109
117
|
# get the content from redirect location
|
110
|
-
content = get(url, redirect_limit)
|
118
|
+
content = get(url, options.merge(:redirect_limit => redirect_limit))
|
111
119
|
content[:url] = uri.to_s
|
112
120
|
content[:redirect_through] = [] if content[:redirect_through].nil?
|
113
121
|
content[:redirect_through].insert(0, url)
|
@@ -206,9 +214,9 @@ class Cobweb
|
|
206
214
|
|
207
215
|
# connect to redis
|
208
216
|
if options.has_key? :crawl_id
|
209
|
-
redis = NamespacedRedis.new(Redis.new(@options[:redis_options]), "cobweb-#{
|
217
|
+
redis = NamespacedRedis.new(Redis.new(@options[:redis_options]), "cobweb-#{Cobweb.version}-#{options[:crawl_id]}")
|
210
218
|
else
|
211
|
-
redis = NamespacedRedis.new(Redis.new(@options[:redis_options]), "cobweb-#{
|
219
|
+
redis = NamespacedRedis.new(Redis.new(@options[:redis_options]), "cobweb-#{Cobweb.version}")
|
212
220
|
end
|
213
221
|
|
214
222
|
content = {}
|
data/lib/crawl_job.rb
CHANGED
@@ -28,8 +28,8 @@ class CrawlJob
|
|
28
28
|
def self.perform(content_request)
|
29
29
|
# change all hash keys to symbols
|
30
30
|
content_request.deep_symbolize_keys
|
31
|
-
redis = NamespacedRedis.new(Redis.new(content_request[:redis_options]), "cobweb-#{
|
32
|
-
|
31
|
+
redis = NamespacedRedis.new(Redis.new(content_request[:redis_options]), "cobweb-#{Cobweb.version}-#{content_request[:crawl_id]}")
|
32
|
+
|
33
33
|
@absolutize = Absolutize.new(content_request[:url], :output_debug => false, :raise_exceptions => false, :force_escaping => false, :remove_anchors => true)
|
34
34
|
|
35
35
|
# check we haven't crawled this url before
|
@@ -95,6 +95,7 @@ class CrawlJob
|
|
95
95
|
redis.sadd "crawled", content_request[:url]
|
96
96
|
set_base_url redis, content, content_request[:base_url]
|
97
97
|
content[:links].keys.map{|key| content[:links][key]}.flatten.each do |link|
|
98
|
+
link = link.to_s
|
98
99
|
unless redis.sismember "crawled", link
|
99
100
|
puts "Checking if #{link} matches #{redis.get("base_url")} as internal?" if content_request[:debug]
|
100
101
|
if link.to_s.match(Regexp.new("^#{redis.get("base_url")}"))
|
@@ -129,7 +130,7 @@ class CrawlJob
|
|
129
130
|
|
130
131
|
# detect finished state
|
131
132
|
|
132
|
-
if queue_counter == crawl_counter or content_request[:crawl_limit].to_i <= crawl_counter
|
133
|
+
if queue_counter == crawl_counter or content_request[:crawl_limit].to_i <= crawl_counter
|
133
134
|
|
134
135
|
puts "queue_counter: #{queue_counter}"
|
135
136
|
puts "crawl_counter: #{crawl_counter}"
|
data/lib/stats.rb
CHANGED
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: cobweb
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.
|
4
|
+
version: 0.0.22
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -13,7 +13,7 @@ date: 2012-03-09 00:00:00.000000000 Z
|
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: resque
|
16
|
-
requirement: &
|
16
|
+
requirement: &70101145919340 !ruby/object:Gem::Requirement
|
17
17
|
none: false
|
18
18
|
requirements:
|
19
19
|
- - ! '>='
|
@@ -21,10 +21,10 @@ dependencies:
|
|
21
21
|
version: '0'
|
22
22
|
type: :runtime
|
23
23
|
prerelease: false
|
24
|
-
version_requirements: *
|
24
|
+
version_requirements: *70101145919340
|
25
25
|
- !ruby/object:Gem::Dependency
|
26
26
|
name: redis
|
27
|
-
requirement: &
|
27
|
+
requirement: &70101145918920 !ruby/object:Gem::Requirement
|
28
28
|
none: false
|
29
29
|
requirements:
|
30
30
|
- - ! '>='
|
@@ -32,10 +32,10 @@ dependencies:
|
|
32
32
|
version: '0'
|
33
33
|
type: :runtime
|
34
34
|
prerelease: false
|
35
|
-
version_requirements: *
|
35
|
+
version_requirements: *70101145918920
|
36
36
|
- !ruby/object:Gem::Dependency
|
37
37
|
name: absolutize
|
38
|
-
requirement: &
|
38
|
+
requirement: &70101145918500 !ruby/object:Gem::Requirement
|
39
39
|
none: false
|
40
40
|
requirements:
|
41
41
|
- - ! '>='
|
@@ -43,10 +43,10 @@ dependencies:
|
|
43
43
|
version: '0'
|
44
44
|
type: :runtime
|
45
45
|
prerelease: false
|
46
|
-
version_requirements: *
|
46
|
+
version_requirements: *70101145918500
|
47
47
|
- !ruby/object:Gem::Dependency
|
48
48
|
name: nokogiri
|
49
|
-
requirement: &
|
49
|
+
requirement: &70101145934440 !ruby/object:Gem::Requirement
|
50
50
|
none: false
|
51
51
|
requirements:
|
52
52
|
- - ! '>='
|
@@ -54,10 +54,10 @@ dependencies:
|
|
54
54
|
version: '0'
|
55
55
|
type: :runtime
|
56
56
|
prerelease: false
|
57
|
-
version_requirements: *
|
57
|
+
version_requirements: *70101145934440
|
58
58
|
- !ruby/object:Gem::Dependency
|
59
59
|
name: addressable
|
60
|
-
requirement: &
|
60
|
+
requirement: &70101145934020 !ruby/object:Gem::Requirement
|
61
61
|
none: false
|
62
62
|
requirements:
|
63
63
|
- - ! '>='
|
@@ -65,10 +65,10 @@ dependencies:
|
|
65
65
|
version: '0'
|
66
66
|
type: :runtime
|
67
67
|
prerelease: false
|
68
|
-
version_requirements: *
|
68
|
+
version_requirements: *70101145934020
|
69
69
|
- !ruby/object:Gem::Dependency
|
70
70
|
name: rspec
|
71
|
-
requirement: &
|
71
|
+
requirement: &70101145933580 !ruby/object:Gem::Requirement
|
72
72
|
none: false
|
73
73
|
requirements:
|
74
74
|
- - ! '>='
|
@@ -76,10 +76,10 @@ dependencies:
|
|
76
76
|
version: '0'
|
77
77
|
type: :runtime
|
78
78
|
prerelease: false
|
79
|
-
version_requirements: *
|
79
|
+
version_requirements: *70101145933580
|
80
80
|
- !ruby/object:Gem::Dependency
|
81
81
|
name: awesome_print
|
82
|
-
requirement: &
|
82
|
+
requirement: &70101145933160 !ruby/object:Gem::Requirement
|
83
83
|
none: false
|
84
84
|
requirements:
|
85
85
|
- - ! '>='
|
@@ -87,10 +87,10 @@ dependencies:
|
|
87
87
|
version: '0'
|
88
88
|
type: :runtime
|
89
89
|
prerelease: false
|
90
|
-
version_requirements: *
|
90
|
+
version_requirements: *70101145933160
|
91
91
|
- !ruby/object:Gem::Dependency
|
92
92
|
name: sinatra
|
93
|
-
requirement: &
|
93
|
+
requirement: &70101145932740 !ruby/object:Gem::Requirement
|
94
94
|
none: false
|
95
95
|
requirements:
|
96
96
|
- - ! '>='
|
@@ -98,10 +98,10 @@ dependencies:
|
|
98
98
|
version: '0'
|
99
99
|
type: :runtime
|
100
100
|
prerelease: false
|
101
|
-
version_requirements: *
|
101
|
+
version_requirements: *70101145932740
|
102
102
|
- !ruby/object:Gem::Dependency
|
103
103
|
name: thin
|
104
|
-
requirement: &
|
104
|
+
requirement: &70101145932320 !ruby/object:Gem::Requirement
|
105
105
|
none: false
|
106
106
|
requirements:
|
107
107
|
- - ! '>='
|
@@ -109,10 +109,10 @@ dependencies:
|
|
109
109
|
version: '0'
|
110
110
|
type: :runtime
|
111
111
|
prerelease: false
|
112
|
-
version_requirements: *
|
112
|
+
version_requirements: *70101145932320
|
113
113
|
- !ruby/object:Gem::Dependency
|
114
114
|
name: haml
|
115
|
-
requirement: &
|
115
|
+
requirement: &70101145931900 !ruby/object:Gem::Requirement
|
116
116
|
none: false
|
117
117
|
requirements:
|
118
118
|
- - ! '>='
|
@@ -120,7 +120,7 @@ dependencies:
|
|
120
120
|
version: '0'
|
121
121
|
type: :runtime
|
122
122
|
prerelease: false
|
123
|
-
version_requirements: *
|
123
|
+
version_requirements: *70101145931900
|
124
124
|
description:
|
125
125
|
email: stewart@rockwellcottage.com
|
126
126
|
executables: []
|