cobweb 0.0.2 → 0.0.5

README.textile CHANGED
@@ -1,5 +1,5 @@
 
- h1. Cobweb v0.0.2
+ h1. Cobweb v0.0.3
 
  h2. Intro
 
lib/cobweb.rb CHANGED
@@ -1,6 +1,7 @@
- require 'rubygems'
+ require 'rubygems'
  require 'uri'
  require 'resque'
+ require "addressable/uri"
  require 'digest/sha1'
 
  Dir[File.dirname(__FILE__) + '/*.rb'].each do |file|
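The new addressable/uri require backs the switch from URI.parse to Addressable::URI.parse further down in this file. A minimal sketch of the difference (the URL is illustrative): Addressable is lenient about characters such as spaces that make the stdlib parser raise, while still exposing the host and request_uri accessors the crawler relies on.

    require "addressable/uri"

    url = "http://example.com/search?q=hello world"
    # URI.parse(url) would raise URI::InvalidURIError because of the space;
    # Addressable parses it and exposes the pieces Net::HTTP needs.
    uri = Addressable::URI.parse(url)
    uri.host          # => "example.com"
    uri.request_uri   # path plus query string, suitable for http.get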
@@ -13,9 +14,12 @@ class CobWeb
  @options = options
  @options[:follow_redirects] = true unless @options.has_key?(:follow_redirects)
  @options[:redirect_limit] = 10 unless @options.has_key?(:redirect_limit)
- @options[:processing_queue] = ContentProcessJob unless @options.has_key?(:processing_queue)
+ @options[:processing_queue] = CobwebProcessJob unless @options.has_key?(:processing_queue)
+ @options[:quiet] = true unless @options.has_key?(:quiet)
  @options[:debug] = false unless @options.has_key?(:debug)
  @options[:cache] = 300 unless @options.has_key?(:cache)
+ @options[:timeout] = 10 unless @options.has_key?(:timeout)
+ @options[:redis_options] = {} unless @options.has_key?(:redis_options)
 
  end
 
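The constructor now defaults three new options: :quiet (true), :timeout (10 seconds, applied to the Net::HTTP open and read timeouts below), and :redis_options (an empty hash handed straight to Redis.new). A hedged sketch of overriding them; the values are illustrative and any key you omit keeps the default shown above:

    crawler = CobWeb.new(
      :quiet         => false,                                 # log cache hits and retrievals
      :timeout       => 5,                                     # seconds for open_timeout/read_timeout
      :cache         => 600,                                   # cache TTL in seconds (default 300)
      :redis_options => { :host => "redis.internal", :port => 6379 }   # passed to Redis.new
    )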
@@ -27,7 +31,7 @@ class CobWeb
  }
 
  request.merge!(@options)
-
+
  Resque.enqueue(CrawlJob, request)
  end
 
@@ -35,11 +39,13 @@ class CobWeb
 
  raise "url cannot be nil" if url.nil?
 
+ absolutize = Absolutize.new(url, :output_debug => false, :raise_exceptions => false, :force_escaping => false, :remove_anchors => true)
+
  # get the unique id for this request
  unique_id = Digest::SHA1.hexdigest(url)
 
  # connect to redis
- redis = NamespacedRedis.new(Redis.new, "cobweb")
+ redis = NamespacedRedis.new(Redis.new(@options[:redis_options]), "cobweb")
 
  content = {}
 
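The Absolutize instance created here is used further down to turn relative Location headers into absolute URLs before following redirects. A small sketch with the same options as above; the URLs are illustrative, and #url returns an object whose to_s is the resolved absolute URL, which is how the redirect handling below uses it:

    absolutize = Absolutize.new("http://example.com/articles/index.html",
                                :output_debug => false, :raise_exceptions => false,
                                :force_escaping => false, :remove_anchors => true)
    absolutize.url("/archive/2010").to_s   # relative path resolved against example.com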
@@ -47,61 +53,205 @@ class CobWeb
  if redis.get(unique_id) and @options[:cache]
  puts "Cache hit for #{url}" unless @options[:quiet]
  content = JSON.parse(redis.get(unique_id)).deep_symbolize_keys
-
+ content[:body] = Base64.decode64(content[:body]) unless content[:body].nil? or content[:mime_type].include?("text/html") or content[:mime_type].include?("application/xhtml+xml")
+
  content
  else
  # this url is valid for processing so lets get on with it
  print "Retrieving #{url }... " unless @options[:quiet]
- uri = URI.parse(url)
+ uri = Addressable::URI.parse(url.strip)
 
  # retrieve data
  http = Net::HTTP.new(uri.host, uri.port)
  if uri.scheme == "https"
  http.use_ssl = true
  http.verify_mode = OpenSSL::SSL::VERIFY_NONE
- end
+ end
+
  request_time = Time.now.to_f
- request = Net::HTTP::Get.new(uri.request_uri)
- response = http.request(request)
-
- if @options[:follow_redirects] and response.code.to_i >= 300 and response.code.to_i < 400
- puts "redirected... " unless @options[:quiet]
- url = response['location']
- redirect_limit = redirect_limit - 1
- content = get(response['location'], redirect_limit)
+ http.read_timeout = @options[:timeout].to_i
+ http.open_timeout = @options[:timeout].to_i
+ begin
+ response = http.start() {|http|
+ response = http.get(uri.request_uri)
+ }
+
+ if @options[:follow_redirects] and response.code.to_i >= 300 and response.code.to_i < 400
+ puts "redirected... " unless @options[:quiet]
+ url = absolutize.url(response['location']).to_s
+ ap redirect_limit
+ redirect_limit = redirect_limit - 1
+ content = get(url, redirect_limit)
+ content[:url] = uri.to_s
+ content[:redirect_through] = [] if content[:redirect_through].nil?
+ content[:redirect_through].insert(0, url)
+
+ content[:response_time] = Time.now.to_f - request_time
+ else
+ content[:response_time] = Time.now.to_f - request_time
+
+ puts "Retrieved." unless @options[:quiet]
+
+ # create the content container
+ content[:url] = uri.to_s
+ content[:status_code] = response.code.to_i
+ content[:mime_type] = response.content_type.split(";")[0].strip
+ charset = response["Content-Type"][response["Content-Type"].index(";")+2..-1] if !response["Content-Type"].nil? and response["Content-Type"].include?(";")
+ charset = charset[charset.index("=")+1..-1] if charset and charset.include?("=")
+ content[:character_set] = charset
+ content[:length] = response.content_length
+ if content[:mime_type].include?("text/html") or content[:mime_type].include?("application/xhtml+xml")
+ content[:body] = response.body
+ else
+ content[:body] = Base64.encode64(response.body)
+ end
+ content[:location] = response["location"]
+ content[:headers] = response.to_hash.symbolize_keys
+ # parse data for links
+ link_parser = ContentLinkParser.new(content[:url], content[:body])
+ content[:links] = link_parser.link_data
+
+ # add content to cache if required
+ if @options[:cache]
+ content[:body] = Base64.encode64(content[:body]) unless content[:body].nil? or content[:mime_type].include?("text/html") or content[:mime_type].include?("application/xhtml+xml")
+ redis.set(unique_id, content.to_json)
+ redis.expire unique_id, @options[:cache].to_i
+ end
+ end
+ rescue SocketError => e
+ puts "ERROR: #{e.message}"
+
+ ## generate a blank content
+ content = {}
  content[:url] = uri.to_s
- content[:redirect_through] = [] if content[:redirect_through].nil?
- content[:redirect_through].insert(0, response['location'])
+ content[:response_time] = Time.now.to_f - request_time
+ content[:status_code] = 0
+ content[:length] = 0
+ content[:body] = ""
+ content[:error] = e.message
+ content[:mime_type] = "error/dnslookup"
+ content[:headers] = {}
+ content[:links] = {}
+
+ rescue Timeout::Error => e
+ puts "ERROR: #{e.message}"
 
+ ## generate a blank content
+ content = {}
+ content[:url] = uri.to_s
  content[:response_time] = Time.now.to_f - request_time
+ content[:status_code] = 0
+ content[:length] = 0
+ content[:body] = ""
+ content[:error] = e.message
+ content[:mime_type] = "error/serverdown"
+ content[:headers] = {}
+ content[:links] = {}
+ end
+ end
+ content
+ end
+
+ def head(url, redirect_limit = @options[:redirect_limit])
+ raise "url cannot be nil" if url.nil?
+
+ absolutize = Absolutize.new(url, :output_debug => false, :raise_exceptions => false, :force_escaping => false, :remove_anchors => true)
+
+ # get the unique id for this request
+ unique_id = Digest::SHA1.hexdigest(url)
+
+ # connect to redis
+ redis = NamespacedRedis.new(Redis.new(@options[:redis_options]), "cobweb")
+
+ content = {}
+
+ # check if it has already been cached
+ if (redis.get(unique_id) or redis.get("head-#{unique_id}")) and @options[:cache]
+ puts "Cache hit for #{url}" unless @options[:quiet]
+ if redis.get(unique_id)
+ content = JSON.parse(redis.get(unique_id)).deep_symbolize_keys
  else
- content[:response_time] = Time.now.to_f - request_time
+ content = JSON.parse(redis.get("head-#{unique_id}")).deep_symbolize_keys
+ end
+ content
+ else
+ print "Retrieving #{url }... " unless @options[:quiet]
+ uri = Addressable::URI.parse(url.strip)
+
+ # retrieve data
+ http = Net::HTTP.new(uri.host, uri.port)
+ if uri.scheme == "https"
+ http.use_ssl = true
+ http.verify_mode = OpenSSL::SSL::VERIFY_NONE
+ end
+
+ request_time = Time.now.to_f
+ http.read_timeout = @options[:timeout].to_i
+ http.open_timeout = @options[:timeout].to_i
+
+ begin
+ response = http.head(uri.to_s)
 
- puts "Retrieved." unless @options[:quiet]
+ if @options[:follow_redirects] and response.code.to_i >= 300 and response.code.to_i < 400
+ puts "redirected... " unless @options[:quiet]
+ url = absolutize.url(response['location']).to_s
+ redirect_limit = redirect_limit - 1
+ content = head(url, redirect_limit)
+ content[:url] = uri.to_s
+ content[:redirect_through] = [] if content[:redirect_through].nil?
+ content[:redirect_through].insert(0, url)
+ else
+ content[:url] = uri.to_s
+ content[:status_code] = response.code.to_i
+ unless response.content_type.nil?
+ content[:mime_type] = response.content_type.split(";")[0].strip
+ charset = response["Content-Type"][response["Content-Type"].index(";")+2..-1] if !response["Content-Type"].nil? and response["Content-Type"].include?(";")
+ charset = charset[charset.index("=")+1..-1] if charset and charset.include?("=")
+ content[:character_set] = charset
+ end
+
+ # add content to cache if required
+ if @options[:cache]
+ puts "Stored in cache [head-#{unique_id}]" if @options[:debug]
+ redis.set("head-#{unique_id}", content.to_json)
+ redis.expire "head-#{unique_id}", @options[:cache].to_i
+ else
+ puts "Not storing in cache as cache disabled" if @options[:debug]
+ end
+ end
+ rescue SocketError => e
+ puts "ERROR: #{e.message}"
 
- # create the content container
+ ## generate a blank content
+ content = {}
  content[:url] = uri.to_s
- content[:status_code] = response.code.to_i
- content[:content_type] = response.content_type
- charset = response["Content-Type"][response["Content-Type"].index(";")+2..-1 ] if !response["Content-Type"].nil? and response["Content-Type"].include?(";")
- charset = charset[charset.index("=")+1..-1] if charset and charset.include?("=")
- content[:character_set] = charset
- content[:content_length] = response.content_length
- content[:content_body] = response.body
- content[:location] = response["location"]
- content[:headers] = response.to_hash.symbolize_keys
- # parse data for links
- link_parser = ContentLinkParser.new(content[:url], content[:content_body])
- content[:links] = link_parser.link_data
+ content[:response_time] = Time.now.to_f - request_time
+ content[:status_code] = 0
+ content[:length] = 0
+ content[:body] = ""
+ content[:error] = e.message
+ content[:mime_type] = "error/dnslookup"
+ content[:headers] = {}
+ content[:links] = {}
+
+ rescue Timeout::Error => e
+ puts "ERROR: #{e.message}"
 
- # add content to cache if required
- if @options[:cache]
- redis.set(unique_id, content.to_json)
- redis.expire unique_id, @options[:cache].to_i
- end
+ ## generate a blank content
+ content = {}
+ content[:url] = uri.to_s
+ content[:response_time] = Time.now.to_f - request_time
+ content[:status_code] = 0
+ content[:length] = 0
+ content[:body] = ""
+ content[:error] = e.message
+ content[:mime_type] = "error/serverdown"
+ content[:headers] = {}
+ content[:links] = {}
  end
+
+ content
  end
- content
  end
  end
 
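With this change, get applies the configured :timeout to both open_timeout and read_timeout and rescues SocketError and Timeout::Error, so an unreachable host produces a stub content hash instead of an exception. A hedged usage sketch (the URL is illustrative):

    crawler = CobWeb.new(:quiet => true, :timeout => 5)
    content = crawler.get("http://unreachable.example/")
    if content[:status_code] == 0
      # DNS failures report "error/dnslookup", timeouts "error/serverdown"
      puts "fetch failed: #{content[:error]} (#{content[:mime_type]})"
    else
      puts "#{content[:status_code]} #{content[:mime_type]}, #{content[:length]} bytes"
    end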
@@ -126,4 +276,4 @@ class Hash
  end
  self
  end
- end
+ end
lib/content_process_job.rb → lib/cobweb_process_job.rb RENAMED
@@ -1,4 +1,4 @@
- class ContentProcessJob
+ class CobwebProcessJob
  require "ap"
 
  @queue = :cobweb_process_job
@@ -10,4 +10,4 @@ class ContentProcessJob
  #ap content.keys
 
  end
- end
+ end
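The default processing class is renamed from ContentProcessJob to CobwebProcessJob (the file moves with it; see the files list in the metadata below). Any Resque-style class with a @queue and a self.perform can stand in for it via :processing_queue. A hypothetical sketch; MyContentJob and its queue name are not part of the gem:

    class MyContentJob
      @queue = :my_content_job

      # content is the hash built by CobWeb#get; Resque round-trips it
      # through JSON, so the keys arrive here as strings.
      def self.perform(content)
        puts "#{content['url']} (#{content['mime_type']}), #{content['length']} bytes"
      end
    end

    # then construct the crawler with CobWeb.new(:processing_queue => MyContentJob)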
lib/crawl_job.rb CHANGED
@@ -9,7 +9,7 @@ class CrawlJob
  def self.perform(content_request)
  # change all hash keys to symbols
  content_request.deep_symbolize_keys
- redis = NamespacedRedis.new(Redis.new, "cobweb-#{content_request[:crawl_id]}")
+ redis = NamespacedRedis.new(Redis.new(content_request[:redis_options]), "cobweb-#{content_request[:crawl_id]}")
  @absolutize = Absolutize.new(content_request[:url], :output_debug => false, :raise_exceptions => false, :force_escaping => false, :remove_anchors => true)
 
  # check we haven't crawled this url before
@@ -19,18 +19,14 @@ class CrawlJob
  redis.incr "crawl-counter"
  crawl_counter = redis.get("crawl-counter").to_i
  queue_counter = redis.get("queue-counter").to_i
- if crawl_counter <= content_request[:crawl_limit]
- content = CobWeb.get(content_request)
- redis.sadd "crawled", content_request[:url]
+ if crawl_counter <= content_request[:crawl_limit].to_i
+ content = CobWeb.new(content_request).get(content_request[:url])
+ redis.sadd "crawled", content_request[:url]
  set_base_url redis, content, content_request[:base_url]
- if queue_counter <= content_request[:crawl_limit]
- ap content[:links]
+ if queue_counter <= content_request[:crawl_limit].to_i
  content[:links].keys.map{|key| content[:links][key]}.flatten.each do |link|
- ap link
  unless redis.sismember "crawled", link
- puts redis.get("base_url")
- puts "---------------------------------"
- if link.match(Regexp.new("^#{redis.get("base_url")}"))
+ if link.to_s.match(Regexp.new("^#{redis.get("base_url")}"))
  new_request = content_request.clone
  new_request[:url] = link
  new_request[:parent] = content_request[:url]
@@ -42,12 +38,12 @@ class CrawlJob
  end
 
  # enqueue to processing queue
- Resque.enqueue(const_get(content_request[:processing_queue]), content)
+ Resque.enqueue(const_get(content_request[:processing_queue]), content.merge({:source_id => content_request[:source_id]}))
  puts "#{content_request[:url]} has been sent for processing." if content_request[:debug]
  puts "Crawled: #{crawl_counter} Limit: #{content_request[:crawl_limit]} Queued: #{queue_counter}" if content_request[:debug]
 
  else
- puts "Crawl Limit Exceeded by #{crawl_counter - content_request[:crawl_limit]} objects" if content_request[:debug]
+ puts "Crawl Limit Exceeded by #{crawl_counter - content_request[:crawl_limit].to_i} objects" if content_request[:debug]
  end
  else
  puts "Already crawled #{content_request[:url]}" if content_request[:debug]
@@ -60,7 +56,7 @@ class CrawlJob
  if content[:status_code] >= 300 and content[:status_code] < 400
  #redirect received for first url
  redis.set("base_url", @absolutize.url(content[:location]).to_s)
- puts "Warning: base_url given redirects to another location, setting base_url to #{@absolutize.url(content[:location]).to_s}"
+ puts "WARNING: base_url given redirects to another location, setting base_url to #{@absolutize.url(content[:location]).to_s}"
  else
  redis.set("base_url", base_url)
  end
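CrawlJob now fetches through an instance (CobWeb.new(content_request).get(...)), coerces :crawl_limit with to_i since the value arrives as a string after Resque's JSON round-trip, drops the ap/puts debugging output, and tags the payload sent to the processing queue with :source_id. A hedged sketch of the request hash it consumes; the field names are taken from this file, the values are illustrative, and CobWeb normally builds and enqueues this hash itself:

    request = {
      :crawl_id         => "example-crawl",
      :url              => "http://example.com/",
      :base_url         => "http://example.com/",
      :crawl_limit      => 100,
      :processing_queue => "CobwebProcessJob",
      :source_id        => 42,
      :redis_options    => {},
      :debug            => true
    }
    Resque.enqueue(CrawlJob, request)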
spec/cobweb/cobweb_spec.rb CHANGED
@@ -36,7 +36,10 @@ describe CobWeb
 
  @mock_http_client.stub!(:request).with(@mock_http_request).and_return(@mock_http_response)
  @mock_http_client.stub!(:request).with(@mock_http_redirect_request).and_return(@mock_http_redirect_response)
- @mock_http_client.stub!(:request).with(@mock_http_redirect_request2).and_return(@mock_http_redirect_response2)
+ @mock_http_client.stub!(:request).with(@mock_http_redirect_request2).and_return(@mock_http_redirect_response2)
+ @mock_http_client.stub!(:read_timeout=).and_return(nil)
+ @mock_http_client.stub!(:open_timeout=).and_return(nil)
+ @mock_http_client.stub!(:start).and_return(@mock_http_response)
 
  @mock_http_response.stub!(:code).and_return(200)
  @mock_http_response.stub!(:content_type).and_return("text/html")
metadata CHANGED
@@ -1,13 +1,13 @@
  --- !ruby/object:Gem::Specification
  name: cobweb
  version: !ruby/object:Gem::Version
- hash: 27
+ hash: 21
  prerelease: false
  segments:
  - 0
  - 0
- - 2
- version: 0.0.2
+ - 5
+ version: 0.0.5
  platform: ruby
  authors:
  - Stewart McKee
@@ -15,7 +15,7 @@ autorequire:
  bindir: bin
  cert_chain: []
 
- date: 2010-11-10 00:00:00 +00:00
+ date: 2011-01-28 00:00:00 +00:00
  default_executable:
  dependencies:
  - !ruby/object:Gem::Dependency
@@ -74,6 +74,20 @@ dependencies:
  version: "0"
  type: :runtime
  version_requirements: *id004
+ - !ruby/object:Gem::Dependency
+ name: addressable
+ prerelease: false
+ requirement: &id005 !ruby/object:Gem::Requirement
+ none: false
+ requirements:
+ - - ">="
+ - !ruby/object:Gem::Version
+ hash: 3
+ segments:
+ - 0
+ version: "0"
+ type: :runtime
+ version_requirements: *id005
  description:
  email: stewart@rockwellcottage.com
  executables: []
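The generated metadata gains addressable as an unversioned runtime dependency, backing the new require "addressable/uri" in lib/cobweb.rb. The gemspec itself is not part of this diff, so the following is only a sketch of roughly how the declaration would look there:

    Gem::Specification.new do |s|
      s.name    = "cobweb"
      s.version = "0.0.5"
      s.add_dependency "addressable"   # any version, matching the ">= 0" requirement above
    end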
@@ -90,8 +104,8 @@ files:
  - spec/cobweb/cobweb_spec.rb
  - lib/namespaced_redis.rb
  - lib/cobweb.rb
- - lib/content_process_job.rb
  - lib/content_link_parser.rb
+ - lib/cobweb_process_job.rb
  - lib/crawl_job.rb
  - README.textile
  has_rdoc: false