cobweb 0.0.2 → 0.0.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,5 +1,5 @@
1
1
 
2
- h1. Cobweb v0.0.2
2
+ h1. Cobweb v0.0.3
3
3
 
4
4
  h2. Intro
5
5
 
@@ -1,6 +1,7 @@
1
- require 'rubygems'
1
+ require 'rubygems'
2
2
  require 'uri'
3
3
  require 'resque'
4
+ require "addressable/uri"
4
5
  require 'digest/sha1'
5
6
 
6
7
  Dir[File.dirname(__FILE__) + '/*.rb'].each do |file|
@@ -13,9 +14,12 @@ class CobWeb
13
14
  @options = options
14
15
  @options[:follow_redirects] = true unless @options.has_key?(:follow_redirects)
15
16
  @options[:redirect_limit] = 10 unless @options.has_key?(:redirect_limit)
16
- @options[:processing_queue] = ContentProcessJob unless @options.has_key?(:processing_queue)
17
+ @options[:processing_queue] = CobwebProcessJob unless @options.has_key?(:processing_queue)
18
+ @options[:quiet] = true unless @options.has_key?(:quiet)
17
19
  @options[:debug] = false unless @options.has_key?(:debug)
18
20
  @options[:cache] = 300 unless @options.has_key?(:cache)
21
+ @options[:timeout] = 10 unless @options.has_key?(:timeout)
22
+ @options[:redis_options] = {} unless @options.has_key?(:redis_options)
19
23
 
20
24
  end
21
25
 
@@ -27,7 +31,7 @@ class CobWeb
27
31
  }
28
32
 
29
33
  request.merge!(@options)
30
-
34
+
31
35
  Resque.enqueue(CrawlJob, request)
32
36
  end
33
37
 
@@ -35,11 +39,13 @@ class CobWeb
35
39
 
36
40
  raise "url cannot be nil" if url.nil?
37
41
 
42
+ absolutize = Absolutize.new(url, :output_debug => false, :raise_exceptions => false, :force_escaping => false, :remove_anchors => true)
43
+
38
44
  # get the unique id for this request
39
45
  unique_id = Digest::SHA1.hexdigest(url)
40
46
 
41
47
  # connect to redis
42
- redis = NamespacedRedis.new(Redis.new, "cobweb")
48
+ redis = NamespacedRedis.new(Redis.new(@options[:redis_options]), "cobweb")
43
49
 
44
50
  content = {}
45
51
 
@@ -47,61 +53,205 @@ class CobWeb
47
53
  if redis.get(unique_id) and @options[:cache]
48
54
  puts "Cache hit for #{url}" unless @options[:quiet]
49
55
  content = JSON.parse(redis.get(unique_id)).deep_symbolize_keys
50
-
56
+ content[:body] = Base64.decode64(content[:body]) unless content[:body].nil? or content[:mime_type].include?("text/html") or content[:mime_type].include?("application/xhtml+xml")
57
+
51
58
  content
52
59
  else
53
60
  # this url is valid for processing so lets get on with it
54
61
  print "Retrieving #{url }... " unless @options[:quiet]
55
- uri = URI.parse(url)
62
+ uri = Addressable::URI.parse(url.strip)
56
63
 
57
64
  # retrieve data
58
65
  http = Net::HTTP.new(uri.host, uri.port)
59
66
  if uri.scheme == "https"
60
67
  http.use_ssl = true
61
68
  http.verify_mode = OpenSSL::SSL::VERIFY_NONE
62
- end
69
+ end
70
+
63
71
  request_time = Time.now.to_f
64
- request = Net::HTTP::Get.new(uri.request_uri)
65
- response = http.request(request)
66
-
67
- if @options[:follow_redirects] and response.code.to_i >= 300 and response.code.to_i < 400
68
- puts "redirected... " unless @options[:quiet]
69
- url = response['location']
70
- redirect_limit = redirect_limit - 1
71
- content = get(response['location'], redirect_limit)
72
+ http.read_timeout = @options[:timeout].to_i
73
+ http.open_timeout = @options[:timeout].to_i
74
+ begin
75
+ response = http.start() {|http|
76
+ response = http.get(uri.request_uri)
77
+ }
78
+
79
+ if @options[:follow_redirects] and response.code.to_i >= 300 and response.code.to_i < 400
80
+ puts "redirected... " unless @options[:quiet]
81
+ url = absolutize.url(response['location']).to_s
82
+ ap redirect_limit
83
+ redirect_limit = redirect_limit - 1
84
+ content = get(url, redirect_limit)
85
+ content[:url] = uri.to_s
86
+ content[:redirect_through] = [] if content[:redirect_through].nil?
87
+ content[:redirect_through].insert(0, url)
88
+
89
+ content[:response_time] = Time.now.to_f - request_time
90
+ else
91
+ content[:response_time] = Time.now.to_f - request_time
92
+
93
+ puts "Retrieved." unless @options[:quiet]
94
+
95
+ # create the content container
96
+ content[:url] = uri.to_s
97
+ content[:status_code] = response.code.to_i
98
+ content[:mime_type] = response.content_type.split(";")[0].strip
99
+ charset = response["Content-Type"][response["Content-Type"].index(";")+2..-1] if !response["Content-Type"].nil? and response["Content-Type"].include?(";")
100
+ charset = charset[charset.index("=")+1..-1] if charset and charset.include?("=")
101
+ content[:character_set] = charset
102
+ content[:length] = response.content_length
103
+ if content[:mime_type].include?("text/html") or content[:mime_type].include?("application/xhtml+xml")
104
+ content[:body] = response.body
105
+ else
106
+ content[:body] = Base64.encode64(response.body)
107
+ end
108
+ content[:location] = response["location"]
109
+ content[:headers] = response.to_hash.symbolize_keys
110
+ # parse data for links
111
+ link_parser = ContentLinkParser.new(content[:url], content[:body])
112
+ content[:links] = link_parser.link_data
113
+
114
+ # add content to cache if required
115
+ if @options[:cache]
116
+ content[:body] = Base64.encode64(content[:body]) unless content[:body].nil? or content[:mime_type].include?("text/html") or content[:mime_type].include?("application/xhtml+xml")
117
+ redis.set(unique_id, content.to_json)
118
+ redis.expire unique_id, @options[:cache].to_i
119
+ end
120
+ end
121
+ rescue SocketError => e
122
+ puts "ERROR: #{e.message}"
123
+
124
+ ## generate a blank content
125
+ content = {}
72
126
  content[:url] = uri.to_s
73
- content[:redirect_through] = [] if content[:redirect_through].nil?
74
- content[:redirect_through].insert(0, response['location'])
127
+ content[:response_time] = Time.now.to_f - request_time
128
+ content[:status_code] = 0
129
+ content[:length] = 0
130
+ content[:body] = ""
131
+ content[:error] = e.message
132
+ content[:mime_type] = "error/dnslookup"
133
+ content[:headers] = {}
134
+ content[:links] = {}
135
+
136
+ rescue Timeout::Error => e
137
+ puts "ERROR: #{e.message}"
75
138
 
139
+ ## generate a blank content
140
+ content = {}
141
+ content[:url] = uri.to_s
76
142
  content[:response_time] = Time.now.to_f - request_time
143
+ content[:status_code] = 0
144
+ content[:length] = 0
145
+ content[:body] = ""
146
+ content[:error] = e.message
147
+ content[:mime_type] = "error/serverdown"
148
+ content[:headers] = {}
149
+ content[:links] = {}
150
+ end
151
+ end
152
+ content
153
+ end
154
+
155
+ def head(url, redirect_limit = @options[:redirect_limit])
156
+ raise "url cannot be nil" if url.nil?
157
+
158
+ absolutize = Absolutize.new(url, :output_debug => false, :raise_exceptions => false, :force_escaping => false, :remove_anchors => true)
159
+
160
+ # get the unique id for this request
161
+ unique_id = Digest::SHA1.hexdigest(url)
162
+
163
+ # connect to redis
164
+ redis = NamespacedRedis.new(Redis.new(@options[:redis_options]), "cobweb")
165
+
166
+ content = {}
167
+
168
+ # check if it has already been cached
169
+ if (redis.get(unique_id) or redis.get("head-#{unique_id}")) and @options[:cache]
170
+ puts "Cache hit for #{url}" unless @options[:quiet]
171
+ if redis.get(unique_id)
172
+ content = JSON.parse(redis.get(unique_id)).deep_symbolize_keys
77
173
  else
78
- content[:response_time] = Time.now.to_f - request_time
174
+ content = JSON.parse(redis.get("head-#{unique_id}")).deep_symbolize_keys
175
+ end
176
+ content
177
+ else
178
+ print "Retrieving #{url }... " unless @options[:quiet]
179
+ uri = Addressable::URI.parse(url.strip)
180
+
181
+ # retrieve data
182
+ http = Net::HTTP.new(uri.host, uri.port)
183
+ if uri.scheme == "https"
184
+ http.use_ssl = true
185
+ http.verify_mode = OpenSSL::SSL::VERIFY_NONE
186
+ end
187
+
188
+ request_time = Time.now.to_f
189
+ http.read_timeout = @options[:timeout].to_i
190
+ http.open_timeout = @options[:timeout].to_i
191
+
192
+ begin
193
+ response = http.head(uri.to_s)
79
194
 
80
- puts "Retrieved." unless @options[:quiet]
195
+ if @options[:follow_redirects] and response.code.to_i >= 300 and response.code.to_i < 400
196
+ puts "redirected... " unless @options[:quiet]
197
+ url = absolutize.url(response['location']).to_s
198
+ redirect_limit = redirect_limit - 1
199
+ content = head(url, redirect_limit)
200
+ content[:url] = uri.to_s
201
+ content[:redirect_through] = [] if content[:redirect_through].nil?
202
+ content[:redirect_through].insert(0, url)
203
+ else
204
+ content[:url] = uri.to_s
205
+ content[:status_code] = response.code.to_i
206
+ unless response.content_type.nil?
207
+ content[:mime_type] = response.content_type.split(";")[0].strip
208
+ charset = response["Content-Type"][response["Content-Type"].index(";")+2..-1] if !response["Content-Type"].nil? and response["Content-Type"].include?(";")
209
+ charset = charset[charset.index("=")+1..-1] if charset and charset.include?("=")
210
+ content[:character_set] = charset
211
+ end
212
+
213
+ # add content to cache if required
214
+ if @options[:cache]
215
+ puts "Stored in cache [head-#{unique_id}]" if @options[:debug]
216
+ redis.set("head-#{unique_id}", content.to_json)
217
+ redis.expire "head-#{unique_id}", @options[:cache].to_i
218
+ else
219
+ puts "Not storing in cache as cache disabled" if @options[:debug]
220
+ end
221
+ end
222
+ rescue SocketError => e
223
+ puts "ERROR: #{e.message}"
81
224
 
82
- # create the content container
225
+ ## generate a blank content
226
+ content = {}
83
227
  content[:url] = uri.to_s
84
- content[:status_code] = response.code.to_i
85
- content[:content_type] = response.content_type
86
- charset = response["Content-Type"][response["Content-Type"].index(";")+2..-1 ] if !response["Content-Type"].nil? and response["Content-Type"].include?(";")
87
- charset = charset[charset.index("=")+1..-1] if charset and charset.include?("=")
88
- content[:character_set] = charset
89
- content[:content_length] = response.content_length
90
- content[:content_body] = response.body
91
- content[:location] = response["location"]
92
- content[:headers] = response.to_hash.symbolize_keys
93
- # parse data for links
94
- link_parser = ContentLinkParser.new(content[:url], content[:content_body])
95
- content[:links] = link_parser.link_data
228
+ content[:response_time] = Time.now.to_f - request_time
229
+ content[:status_code] = 0
230
+ content[:length] = 0
231
+ content[:body] = ""
232
+ content[:error] = e.message
233
+ content[:mime_type] = "error/dnslookup"
234
+ content[:headers] = {}
235
+ content[:links] = {}
236
+
237
+ rescue Timeout::Error => e
238
+ puts "ERROR: #{e.message}"
96
239
 
97
- # add content to cache if required
98
- if @options[:cache]
99
- redis.set(unique_id, content.to_json)
100
- redis.expire unique_id, @options[:cache].to_i
101
- end
240
+ ## generate a blank content
241
+ content = {}
242
+ content[:url] = uri.to_s
243
+ content[:response_time] = Time.now.to_f - request_time
244
+ content[:status_code] = 0
245
+ content[:length] = 0
246
+ content[:body] = ""
247
+ content[:error] = e.message
248
+ content[:mime_type] = "error/serverdown"
249
+ content[:headers] = {}
250
+ content[:links] = {}
102
251
  end
252
+
253
+ content
103
254
  end
104
- content
105
255
  end
106
256
  end
107
257
 
@@ -126,4 +276,4 @@ class Hash
126
276
  end
127
277
  self
128
278
  end
129
- end
279
+ end
@@ -1,4 +1,4 @@
1
- class ContentProcessJob
1
+ class CobwebProcessJob
2
2
  require "ap"
3
3
 
4
4
  @queue = :cobweb_process_job
@@ -10,4 +10,4 @@ class ContentProcessJob
10
10
  #ap content.keys
11
11
 
12
12
  end
13
- end
13
+ end
@@ -9,7 +9,7 @@ class CrawlJob
9
9
  def self.perform(content_request)
10
10
  # change all hash keys to symbols
11
11
  content_request.deep_symbolize_keys
12
- redis = NamespacedRedis.new(Redis.new, "cobweb-#{content_request[:crawl_id]}")
12
+ redis = NamespacedRedis.new(Redis.new(content_request[:redis_options]), "cobweb-#{content_request[:crawl_id]}")
13
13
  @absolutize = Absolutize.new(content_request[:url], :output_debug => false, :raise_exceptions => false, :force_escaping => false, :remove_anchors => true)
14
14
 
15
15
  # check we haven't crawled this url before
@@ -19,18 +19,14 @@ class CrawlJob
19
19
  redis.incr "crawl-counter"
20
20
  crawl_counter = redis.get("crawl-counter").to_i
21
21
  queue_counter = redis.get("queue-counter").to_i
22
- if crawl_counter <= content_request[:crawl_limit]
23
- content = CobWeb.get(content_request)
24
- redis.sadd "crawled", content_request[:url]
22
+ if crawl_counter <= content_request[:crawl_limit].to_i
23
+ content = CobWeb.new(content_request).get(content_request[:url])
24
+ redis.sadd "crawled", content_request[:url]
25
25
  set_base_url redis, content, content_request[:base_url]
26
- if queue_counter <= content_request[:crawl_limit]
27
- ap content[:links]
26
+ if queue_counter <= content_request[:crawl_limit].to_i
28
27
  content[:links].keys.map{|key| content[:links][key]}.flatten.each do |link|
29
- ap link
30
28
  unless redis.sismember "crawled", link
31
- puts redis.get("base_url")
32
- puts "---------------------------------"
33
- if link.match(Regexp.new("^#{redis.get("base_url")}"))
29
+ if link.to_s.match(Regexp.new("^#{redis.get("base_url")}"))
34
30
  new_request = content_request.clone
35
31
  new_request[:url] = link
36
32
  new_request[:parent] = content_request[:url]
@@ -42,12 +38,12 @@ class CrawlJob
42
38
  end
43
39
 
44
40
  # enqueue to processing queue
45
- Resque.enqueue(const_get(content_request[:processing_queue]), content)
41
+ Resque.enqueue(const_get(content_request[:processing_queue]), content.merge({:source_id => content_request[:source_id]}))
46
42
  puts "#{content_request[:url]} has been sent for processing." if content_request[:debug]
47
43
  puts "Crawled: #{crawl_counter} Limit: #{content_request[:crawl_limit]} Queued: #{queue_counter}" if content_request[:debug]
48
44
 
49
45
  else
50
- puts "Crawl Limit Exceeded by #{crawl_counter - content_request[:crawl_limit]} objects" if content_request[:debug]
46
+ puts "Crawl Limit Exceeded by #{crawl_counter - content_request[:crawl_limit].to_i} objects" if content_request[:debug]
51
47
  end
52
48
  else
53
49
  puts "Already crawled #{content_request[:url]}" if content_request[:debug]
@@ -60,7 +56,7 @@ class CrawlJob
60
56
  if content[:status_code] >= 300 and content[:status_code] < 400
61
57
  #redirect received for first url
62
58
  redis.set("base_url", @absolutize.url(content[:location]).to_s)
63
- puts "Warning: base_url given redirects to another location, setting base_url to #{@absolutize.url(content[:location]).to_s}"
59
+ puts "WARNING: base_url given redirects to another location, setting base_url to #{@absolutize.url(content[:location]).to_s}"
64
60
  else
65
61
  redis.set("base_url", base_url)
66
62
  end
@@ -36,7 +36,10 @@ describe CobWeb do
36
36
 
37
37
  @mock_http_client.stub!(:request).with(@mock_http_request).and_return(@mock_http_response)
38
38
  @mock_http_client.stub!(:request).with(@mock_http_redirect_request).and_return(@mock_http_redirect_response)
39
- @mock_http_client.stub!(:request).with(@mock_http_redirect_request2).and_return(@mock_http_redirect_response2)
39
+ @mock_http_client.stub!(:request).with(@mock_http_redirect_request2).and_return(@mock_http_redirect_response2)
40
+ @mock_http_client.stub!(:read_timeout=).and_return(nil)
41
+ @mock_http_client.stub!(:open_timeout=).and_return(nil)
42
+ @mock_http_client.stub!(:start).and_return(@mock_http_response)
40
43
 
41
44
  @mock_http_response.stub!(:code).and_return(200)
42
45
  @mock_http_response.stub!(:content_type).and_return("text/html")
metadata CHANGED
@@ -1,13 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: cobweb
3
3
  version: !ruby/object:Gem::Version
4
- hash: 27
4
+ hash: 21
5
5
  prerelease: false
6
6
  segments:
7
7
  - 0
8
8
  - 0
9
- - 2
10
- version: 0.0.2
9
+ - 5
10
+ version: 0.0.5
11
11
  platform: ruby
12
12
  authors:
13
13
  - Stewart McKee
@@ -15,7 +15,7 @@ autorequire:
15
15
  bindir: bin
16
16
  cert_chain: []
17
17
 
18
- date: 2010-11-10 00:00:00 +00:00
18
+ date: 2011-01-28 00:00:00 +00:00
19
19
  default_executable:
20
20
  dependencies:
21
21
  - !ruby/object:Gem::Dependency
@@ -74,6 +74,20 @@ dependencies:
74
74
  version: "0"
75
75
  type: :runtime
76
76
  version_requirements: *id004
77
+ - !ruby/object:Gem::Dependency
78
+ name: addressable
79
+ prerelease: false
80
+ requirement: &id005 !ruby/object:Gem::Requirement
81
+ none: false
82
+ requirements:
83
+ - - ">="
84
+ - !ruby/object:Gem::Version
85
+ hash: 3
86
+ segments:
87
+ - 0
88
+ version: "0"
89
+ type: :runtime
90
+ version_requirements: *id005
77
91
  description:
78
92
  email: stewart@rockwellcottage.com
79
93
  executables: []
@@ -90,8 +104,8 @@ files:
90
104
  - spec/cobweb/cobweb_spec.rb
91
105
  - lib/namespaced_redis.rb
92
106
  - lib/cobweb.rb
93
- - lib/content_process_job.rb
94
107
  - lib/content_link_parser.rb
108
+ - lib/cobweb_process_job.rb
95
109
  - lib/crawl_job.rb
96
110
  - README.textile
97
111
  has_rdoc: false