cobweb 0.0.2 → 0.0.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.textile +1 -1
- data/lib/cobweb.rb +189 -39
- data/lib/{content_process_job.rb → cobweb_process_job.rb} +2 -2
- data/lib/crawl_job.rb +9 -13
- data/spec/cobweb/cobweb_spec.rb +4 -1
- metadata +19 -5
data/README.textile
CHANGED
data/lib/cobweb.rb
CHANGED
@@ -1,6 +1,7 @@
|
|
1
|
-
require 'rubygems'
|
1
|
+
require 'rubygems'
|
2
2
|
require 'uri'
|
3
3
|
require 'resque'
|
4
|
+
require "addressable/uri"
|
4
5
|
require 'digest/sha1'
|
5
6
|
|
6
7
|
Dir[File.dirname(__FILE__) + '/*.rb'].each do |file|
|
@@ -13,9 +14,12 @@ class CobWeb
|
|
13
14
|
@options = options
|
14
15
|
@options[:follow_redirects] = true unless @options.has_key?(:follow_redirects)
|
15
16
|
@options[:redirect_limit] = 10 unless @options.has_key?(:redirect_limit)
|
16
|
-
@options[:processing_queue] =
|
17
|
+
@options[:processing_queue] = CobwebProcessJob unless @options.has_key?(:processing_queue)
|
18
|
+
@options[:quiet] = true unless @options.has_key?(:quiet)
|
17
19
|
@options[:debug] = false unless @options.has_key?(:debug)
|
18
20
|
@options[:cache] = 300 unless @options.has_key?(:cache)
|
21
|
+
@options[:timeout] = 10 unless @options.has_key?(:timeout)
|
22
|
+
@options[:redis_options] = {} unless @options.has_key?(:redis_options)
|
19
23
|
|
20
24
|
end
|
21
25
|
|
@@ -27,7 +31,7 @@ class CobWeb
|
|
27
31
|
}
|
28
32
|
|
29
33
|
request.merge!(@options)
|
30
|
-
|
34
|
+
|
31
35
|
Resque.enqueue(CrawlJob, request)
|
32
36
|
end
|
33
37
|
|
@@ -35,11 +39,13 @@ class CobWeb
|
|
35
39
|
|
36
40
|
raise "url cannot be nil" if url.nil?
|
37
41
|
|
42
|
+
absolutize = Absolutize.new(url, :output_debug => false, :raise_exceptions => false, :force_escaping => false, :remove_anchors => true)
|
43
|
+
|
38
44
|
# get the unique id for this request
|
39
45
|
unique_id = Digest::SHA1.hexdigest(url)
|
40
46
|
|
41
47
|
# connect to redis
|
42
|
-
redis = NamespacedRedis.new(Redis.new, "cobweb")
|
48
|
+
redis = NamespacedRedis.new(Redis.new(@options[:redis_options]), "cobweb")
|
43
49
|
|
44
50
|
content = {}
|
45
51
|
|
@@ -47,61 +53,205 @@ class CobWeb
|
|
47
53
|
if redis.get(unique_id) and @options[:cache]
|
48
54
|
puts "Cache hit for #{url}" unless @options[:quiet]
|
49
55
|
content = JSON.parse(redis.get(unique_id)).deep_symbolize_keys
|
50
|
-
|
56
|
+
content[:body] = Base64.decode64(content[:body]) unless content[:body].nil? or content[:mime_type].include?("text/html") or content[:mime_type].include?("application/xhtml+xml")
|
57
|
+
|
51
58
|
content
|
52
59
|
else
|
53
60
|
# this url is valid for processing so lets get on with it
|
54
61
|
print "Retrieving #{url }... " unless @options[:quiet]
|
55
|
-
uri = URI.parse(url)
|
62
|
+
uri = Addressable::URI.parse(url.strip)
|
56
63
|
|
57
64
|
# retrieve data
|
58
65
|
http = Net::HTTP.new(uri.host, uri.port)
|
59
66
|
if uri.scheme == "https"
|
60
67
|
http.use_ssl = true
|
61
68
|
http.verify_mode = OpenSSL::SSL::VERIFY_NONE
|
62
|
-
end
|
69
|
+
end
|
70
|
+
|
63
71
|
request_time = Time.now.to_f
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
+
http.read_timeout = @options[:timeout].to_i
|
73
|
+
http.open_timeout = @options[:timeout].to_i
|
74
|
+
begin
|
75
|
+
response = http.start() {|http|
|
76
|
+
response = http.get(uri.request_uri)
|
77
|
+
}
|
78
|
+
|
79
|
+
if @options[:follow_redirects] and response.code.to_i >= 300 and response.code.to_i < 400
|
80
|
+
puts "redirected... " unless @options[:quiet]
|
81
|
+
url = absolutize.url(response['location']).to_s
|
82
|
+
ap redirect_limit
|
83
|
+
redirect_limit = redirect_limit - 1
|
84
|
+
content = get(url, redirect_limit)
|
85
|
+
content[:url] = uri.to_s
|
86
|
+
content[:redirect_through] = [] if content[:redirect_through].nil?
|
87
|
+
content[:redirect_through].insert(0, url)
|
88
|
+
|
89
|
+
content[:response_time] = Time.now.to_f - request_time
|
90
|
+
else
|
91
|
+
content[:response_time] = Time.now.to_f - request_time
|
92
|
+
|
93
|
+
puts "Retrieved." unless @options[:quiet]
|
94
|
+
|
95
|
+
# create the content container
|
96
|
+
content[:url] = uri.to_s
|
97
|
+
content[:status_code] = response.code.to_i
|
98
|
+
content[:mime_type] = response.content_type.split(";")[0].strip
|
99
|
+
charset = response["Content-Type"][response["Content-Type"].index(";")+2..-1] if !response["Content-Type"].nil? and response["Content-Type"].include?(";")
|
100
|
+
charset = charset[charset.index("=")+1..-1] if charset and charset.include?("=")
|
101
|
+
content[:character_set] = charset
|
102
|
+
content[:length] = response.content_length
|
103
|
+
if content[:mime_type].include?("text/html") or content[:mime_type].include?("application/xhtml+xml")
|
104
|
+
content[:body] = response.body
|
105
|
+
else
|
106
|
+
content[:body] = Base64.encode64(response.body)
|
107
|
+
end
|
108
|
+
content[:location] = response["location"]
|
109
|
+
content[:headers] = response.to_hash.symbolize_keys
|
110
|
+
# parse data for links
|
111
|
+
link_parser = ContentLinkParser.new(content[:url], content[:body])
|
112
|
+
content[:links] = link_parser.link_data
|
113
|
+
|
114
|
+
# add content to cache if required
|
115
|
+
if @options[:cache]
|
116
|
+
content[:body] = Base64.encode64(content[:body]) unless content[:body].nil? or content[:mime_type].include?("text/html") or content[:mime_type].include?("application/xhtml+xml")
|
117
|
+
redis.set(unique_id, content.to_json)
|
118
|
+
redis.expire unique_id, @options[:cache].to_i
|
119
|
+
end
|
120
|
+
end
|
121
|
+
rescue SocketError => e
|
122
|
+
puts "ERROR: #{e.message}"
|
123
|
+
|
124
|
+
## generate a blank content
|
125
|
+
content = {}
|
72
126
|
content[:url] = uri.to_s
|
73
|
-
content[:
|
74
|
-
content[:
|
127
|
+
content[:response_time] = Time.now.to_f - request_time
|
128
|
+
content[:status_code] = 0
|
129
|
+
content[:length] = 0
|
130
|
+
content[:body] = ""
|
131
|
+
content[:error] = e.message
|
132
|
+
content[:mime_type] = "error/dnslookup"
|
133
|
+
content[:headers] = {}
|
134
|
+
content[:links] = {}
|
135
|
+
|
136
|
+
rescue Timeout::Error => e
|
137
|
+
puts "ERROR: #{e.message}"
|
75
138
|
|
139
|
+
## generate a blank content
|
140
|
+
content = {}
|
141
|
+
content[:url] = uri.to_s
|
76
142
|
content[:response_time] = Time.now.to_f - request_time
|
143
|
+
content[:status_code] = 0
|
144
|
+
content[:length] = 0
|
145
|
+
content[:body] = ""
|
146
|
+
content[:error] = e.message
|
147
|
+
content[:mime_type] = "error/serverdown"
|
148
|
+
content[:headers] = {}
|
149
|
+
content[:links] = {}
|
150
|
+
end
|
151
|
+
end
|
152
|
+
content
|
153
|
+
end
|
154
|
+
|
155
|
+
def head(url, redirect_limit = @options[:redirect_limit])
|
156
|
+
raise "url cannot be nil" if url.nil?
|
157
|
+
|
158
|
+
absolutize = Absolutize.new(url, :output_debug => false, :raise_exceptions => false, :force_escaping => false, :remove_anchors => true)
|
159
|
+
|
160
|
+
# get the unique id for this request
|
161
|
+
unique_id = Digest::SHA1.hexdigest(url)
|
162
|
+
|
163
|
+
# connect to redis
|
164
|
+
redis = NamespacedRedis.new(Redis.new(@options[:redis_options]), "cobweb")
|
165
|
+
|
166
|
+
content = {}
|
167
|
+
|
168
|
+
# check if it has already been cached
|
169
|
+
if (redis.get(unique_id) or redis.get("head-#{unique_id}")) and @options[:cache]
|
170
|
+
puts "Cache hit for #{url}" unless @options[:quiet]
|
171
|
+
if redis.get(unique_id)
|
172
|
+
content = JSON.parse(redis.get(unique_id)).deep_symbolize_keys
|
77
173
|
else
|
78
|
-
content
|
174
|
+
content = JSON.parse(redis.get("head-#{unique_id}")).deep_symbolize_keys
|
175
|
+
end
|
176
|
+
content
|
177
|
+
else
|
178
|
+
print "Retrieving #{url }... " unless @options[:quiet]
|
179
|
+
uri = Addressable::URI.parse(url.strip)
|
180
|
+
|
181
|
+
# retrieve data
|
182
|
+
http = Net::HTTP.new(uri.host, uri.port)
|
183
|
+
if uri.scheme == "https"
|
184
|
+
http.use_ssl = true
|
185
|
+
http.verify_mode = OpenSSL::SSL::VERIFY_NONE
|
186
|
+
end
|
187
|
+
|
188
|
+
request_time = Time.now.to_f
|
189
|
+
http.read_timeout = @options[:timeout].to_i
|
190
|
+
http.open_timeout = @options[:timeout].to_i
|
191
|
+
|
192
|
+
begin
|
193
|
+
response = http.head(uri.to_s)
|
79
194
|
|
80
|
-
|
195
|
+
if @options[:follow_redirects] and response.code.to_i >= 300 and response.code.to_i < 400
|
196
|
+
puts "redirected... " unless @options[:quiet]
|
197
|
+
url = absolutize.url(response['location']).to_s
|
198
|
+
redirect_limit = redirect_limit - 1
|
199
|
+
content = head(url, redirect_limit)
|
200
|
+
content[:url] = uri.to_s
|
201
|
+
content[:redirect_through] = [] if content[:redirect_through].nil?
|
202
|
+
content[:redirect_through].insert(0, url)
|
203
|
+
else
|
204
|
+
content[:url] = uri.to_s
|
205
|
+
content[:status_code] = response.code.to_i
|
206
|
+
unless response.content_type.nil?
|
207
|
+
content[:mime_type] = response.content_type.split(";")[0].strip
|
208
|
+
charset = response["Content-Type"][response["Content-Type"].index(";")+2..-1] if !response["Content-Type"].nil? and response["Content-Type"].include?(";")
|
209
|
+
charset = charset[charset.index("=")+1..-1] if charset and charset.include?("=")
|
210
|
+
content[:character_set] = charset
|
211
|
+
end
|
212
|
+
|
213
|
+
# add content to cache if required
|
214
|
+
if @options[:cache]
|
215
|
+
puts "Stored in cache [head-#{unique_id}]" if @options[:debug]
|
216
|
+
redis.set("head-#{unique_id}", content.to_json)
|
217
|
+
redis.expire "head-#{unique_id}", @options[:cache].to_i
|
218
|
+
else
|
219
|
+
puts "Not storing in cache as cache disabled" if @options[:debug]
|
220
|
+
end
|
221
|
+
end
|
222
|
+
rescue SocketError => e
|
223
|
+
puts "ERROR: #{e.message}"
|
81
224
|
|
82
|
-
|
225
|
+
## generate a blank content
|
226
|
+
content = {}
|
83
227
|
content[:url] = uri.to_s
|
84
|
-
content[:
|
85
|
-
content[:
|
86
|
-
|
87
|
-
|
88
|
-
content[:
|
89
|
-
content[:
|
90
|
-
content[:
|
91
|
-
content[:
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
content[:links] = link_parser.link_data
|
228
|
+
content[:response_time] = Time.now.to_f - request_time
|
229
|
+
content[:status_code] = 0
|
230
|
+
content[:length] = 0
|
231
|
+
content[:body] = ""
|
232
|
+
content[:error] = e.message
|
233
|
+
content[:mime_type] = "error/dnslookup"
|
234
|
+
content[:headers] = {}
|
235
|
+
content[:links] = {}
|
236
|
+
|
237
|
+
rescue Timeout::Error => e
|
238
|
+
puts "ERROR: #{e.message}"
|
96
239
|
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
240
|
+
## generate a blank content
|
241
|
+
content = {}
|
242
|
+
content[:url] = uri.to_s
|
243
|
+
content[:response_time] = Time.now.to_f - request_time
|
244
|
+
content[:status_code] = 0
|
245
|
+
content[:length] = 0
|
246
|
+
content[:body] = ""
|
247
|
+
content[:error] = e.message
|
248
|
+
content[:mime_type] = "error/serverdown"
|
249
|
+
content[:headers] = {}
|
250
|
+
content[:links] = {}
|
102
251
|
end
|
252
|
+
|
253
|
+
content
|
103
254
|
end
|
104
|
-
content
|
105
255
|
end
|
106
256
|
end
|
107
257
|
|
@@ -126,4 +276,4 @@ class Hash
|
|
126
276
|
end
|
127
277
|
self
|
128
278
|
end
|
129
|
-
end
|
279
|
+
end
|
data/lib/crawl_job.rb
CHANGED
@@ -9,7 +9,7 @@ class CrawlJob
|
|
9
9
|
def self.perform(content_request)
|
10
10
|
# change all hash keys to symbols
|
11
11
|
content_request.deep_symbolize_keys
|
12
|
-
redis = NamespacedRedis.new(Redis.new, "cobweb-#{content_request[:crawl_id]}")
|
12
|
+
redis = NamespacedRedis.new(Redis.new(content_request[:redis_options]), "cobweb-#{content_request[:crawl_id]}")
|
13
13
|
@absolutize = Absolutize.new(content_request[:url], :output_debug => false, :raise_exceptions => false, :force_escaping => false, :remove_anchors => true)
|
14
14
|
|
15
15
|
# check we haven't crawled this url before
|
@@ -19,18 +19,14 @@ class CrawlJob
|
|
19
19
|
redis.incr "crawl-counter"
|
20
20
|
crawl_counter = redis.get("crawl-counter").to_i
|
21
21
|
queue_counter = redis.get("queue-counter").to_i
|
22
|
-
if crawl_counter <= content_request[:crawl_limit]
|
23
|
-
content = CobWeb.get(content_request)
|
24
|
-
redis.sadd "crawled", content_request[:url]
|
22
|
+
if crawl_counter <= content_request[:crawl_limit].to_i
|
23
|
+
content = CobWeb.new(content_request).get(content_request[:url])
|
24
|
+
redis.sadd "crawled", content_request[:url]
|
25
25
|
set_base_url redis, content, content_request[:base_url]
|
26
|
-
if queue_counter <= content_request[:crawl_limit]
|
27
|
-
ap content[:links]
|
26
|
+
if queue_counter <= content_request[:crawl_limit].to_i
|
28
27
|
content[:links].keys.map{|key| content[:links][key]}.flatten.each do |link|
|
29
|
-
ap link
|
30
28
|
unless redis.sismember "crawled", link
|
31
|
-
|
32
|
-
puts "---------------------------------"
|
33
|
-
if link.match(Regexp.new("^#{redis.get("base_url")}"))
|
29
|
+
if link.to_s.match(Regexp.new("^#{redis.get("base_url")}"))
|
34
30
|
new_request = content_request.clone
|
35
31
|
new_request[:url] = link
|
36
32
|
new_request[:parent] = content_request[:url]
|
@@ -42,12 +38,12 @@ class CrawlJob
|
|
42
38
|
end
|
43
39
|
|
44
40
|
# enqueue to processing queue
|
45
|
-
Resque.enqueue(const_get(content_request[:processing_queue]), content)
|
41
|
+
Resque.enqueue(const_get(content_request[:processing_queue]), content.merge({:source_id => content_request[:source_id]}))
|
46
42
|
puts "#{content_request[:url]} has been sent for processing." if content_request[:debug]
|
47
43
|
puts "Crawled: #{crawl_counter} Limit: #{content_request[:crawl_limit]} Queued: #{queue_counter}" if content_request[:debug]
|
48
44
|
|
49
45
|
else
|
50
|
-
puts "Crawl Limit Exceeded by #{crawl_counter - content_request[:crawl_limit]} objects" if content_request[:debug]
|
46
|
+
puts "Crawl Limit Exceeded by #{crawl_counter - content_request[:crawl_limit].to_i} objects" if content_request[:debug]
|
51
47
|
end
|
52
48
|
else
|
53
49
|
puts "Already crawled #{content_request[:url]}" if content_request[:debug]
|
@@ -60,7 +56,7 @@ class CrawlJob
|
|
60
56
|
if content[:status_code] >= 300 and content[:status_code] < 400
|
61
57
|
#redirect received for first url
|
62
58
|
redis.set("base_url", @absolutize.url(content[:location]).to_s)
|
63
|
-
puts "
|
59
|
+
puts "WARNING: base_url given redirects to another location, setting base_url to #{@absolutize.url(content[:location]).to_s}"
|
64
60
|
else
|
65
61
|
redis.set("base_url", base_url)
|
66
62
|
end
|
data/spec/cobweb/cobweb_spec.rb
CHANGED
@@ -36,7 +36,10 @@ describe CobWeb do
|
|
36
36
|
|
37
37
|
@mock_http_client.stub!(:request).with(@mock_http_request).and_return(@mock_http_response)
|
38
38
|
@mock_http_client.stub!(:request).with(@mock_http_redirect_request).and_return(@mock_http_redirect_response)
|
39
|
-
@mock_http_client.stub!(:request).with(@mock_http_redirect_request2).and_return(@mock_http_redirect_response2)
|
39
|
+
@mock_http_client.stub!(:request).with(@mock_http_redirect_request2).and_return(@mock_http_redirect_response2)
|
40
|
+
@mock_http_client.stub!(:read_timeout=).and_return(nil)
|
41
|
+
@mock_http_client.stub!(:open_timeout=).and_return(nil)
|
42
|
+
@mock_http_client.stub!(:start).and_return(@mock_http_response)
|
40
43
|
|
41
44
|
@mock_http_response.stub!(:code).and_return(200)
|
42
45
|
@mock_http_response.stub!(:content_type).and_return("text/html")
|
metadata
CHANGED
@@ -1,13 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: cobweb
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
hash:
|
4
|
+
hash: 21
|
5
5
|
prerelease: false
|
6
6
|
segments:
|
7
7
|
- 0
|
8
8
|
- 0
|
9
|
-
-
|
10
|
-
version: 0.0.
|
9
|
+
- 5
|
10
|
+
version: 0.0.5
|
11
11
|
platform: ruby
|
12
12
|
authors:
|
13
13
|
- Stewart McKee
|
@@ -15,7 +15,7 @@ autorequire:
|
|
15
15
|
bindir: bin
|
16
16
|
cert_chain: []
|
17
17
|
|
18
|
-
date:
|
18
|
+
date: 2011-01-28 00:00:00 +00:00
|
19
19
|
default_executable:
|
20
20
|
dependencies:
|
21
21
|
- !ruby/object:Gem::Dependency
|
@@ -74,6 +74,20 @@ dependencies:
|
|
74
74
|
version: "0"
|
75
75
|
type: :runtime
|
76
76
|
version_requirements: *id004
|
77
|
+
- !ruby/object:Gem::Dependency
|
78
|
+
name: addressable
|
79
|
+
prerelease: false
|
80
|
+
requirement: &id005 !ruby/object:Gem::Requirement
|
81
|
+
none: false
|
82
|
+
requirements:
|
83
|
+
- - ">="
|
84
|
+
- !ruby/object:Gem::Version
|
85
|
+
hash: 3
|
86
|
+
segments:
|
87
|
+
- 0
|
88
|
+
version: "0"
|
89
|
+
type: :runtime
|
90
|
+
version_requirements: *id005
|
77
91
|
description:
|
78
92
|
email: stewart@rockwellcottage.com
|
79
93
|
executables: []
|
@@ -90,8 +104,8 @@ files:
|
|
90
104
|
- spec/cobweb/cobweb_spec.rb
|
91
105
|
- lib/namespaced_redis.rb
|
92
106
|
- lib/cobweb.rb
|
93
|
-
- lib/content_process_job.rb
|
94
107
|
- lib/content_link_parser.rb
|
108
|
+
- lib/cobweb_process_job.rb
|
95
109
|
- lib/crawl_job.rb
|
96
110
|
- README.textile
|
97
111
|
has_rdoc: false
|