cobweb 0.0.43 → 0.0.44

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,5 +1,5 @@
1
1
 
2
- h1. Cobweb v0.0.43
2
+ h1. Cobweb v0.0.44
3
3
 
4
4
  h2. Intro
5
5
 
@@ -20,7 +20,7 @@ class Cobweb
20
20
  # investigate using event machine for single threaded crawling
21
21
 
22
22
  def self.version
23
- "0.0.43"
23
+ "0.0.44"
24
24
  end
25
25
 
26
26
  def method_missing(method_sym, *arguments, &block)
@@ -76,12 +76,13 @@ class CobwebCrawler
76
76
  internal_links.each do |link|
77
77
  puts "Added #{link.to_s} to queue" if @debug
78
78
  @redis.sadd "queued", link
79
+ queue_counter += 1
79
80
  end
80
81
 
81
82
  crawl_counter = crawl_counter + 1 #@redis.scard("crawled").to_i
82
83
  queue_counter = queue_counter - 1 #@redis.scard("queued").to_i
83
84
 
84
- @stats.update_statistics(content)
85
+ @stats.update_statistics(content, crawl_counter, queue_counter)
85
86
  @stats.update_status("Completed #{url}.")
86
87
  puts "Crawled: #{crawl_counter.to_i} Limit: #{@options[:crawl_limit].to_i} Queued: #{queue_counter.to_i}" if @debug
87
88
 
@@ -125,7 +126,7 @@ class CobwebCrawler
125
126
 
126
127
  def all_links_from_content(content)
127
128
  links = content[:links].keys.map{|key| content[:links][key]}.flatten
128
- links.reject!{|link| link.starts_with?("javascript:")}
129
+ links.reject!{|link| link.cobweb_starts_with?("javascript:")}
129
130
  links = links.map{|link| UriHelper.join_no_fragment(content[:url], link) }
130
131
  links.select!{|link| link.scheme == "http" || link.scheme == "https"}
131
132
  links.uniq
@@ -134,7 +135,7 @@ class CobwebCrawler
134
135
  end
135
136
 
136
137
  class String
137
- def starts_with?(val)
138
+ def cobweb_starts_with?(val)
138
139
  if self.length >= val.length
139
140
  self[0..val.length-1] == val
140
141
  else
@@ -21,50 +21,48 @@ class Stats
21
21
  @redis.del "crawl_details"
22
22
  end
23
23
 
24
- def update_statistics(content)
24
+ def update_statistics(content, crawl_counter=@redis.scard("crawled").to_i, queue_counter=@redis.scard("queued").to_i)
25
25
 
26
- crawl_counter = @redis.scard("crawled").to_i
27
- queue_counter = @redis.scard("queued").to_i
26
+ @statistics = get_statistics
28
27
 
29
- if @redis.hexists "statistics", "average_response_time"
30
- @redis.hset("statistics", "average_response_time", (((@redis.hget("statistics", "average_response_time").to_f*crawl_counter) + content[:response_time].to_f) / (crawl_counter + 1)))
28
+ if @statistics.has_key? :average_response_time
29
+ @statistics[:average_response_time] = (((@redis.hget("statistics", "average_response_time").to_f*crawl_counter) + content[:response_time].to_f) / (crawl_counter + 1))
31
30
  else
32
- @redis.hset("statistics", "average_response_time", content[:response_time].to_f)
31
+ @statistics[:average_response_time] = content[:response_time].to_f
33
32
  end
34
- @redis.hset "statistics", "maximum_response_time", content[:response_time].to_f if @redis.hget("statistics", "maximum_response_time").nil? or content[:response_time].to_f > @redis.hget("statistics", "maximum_response_time").to_f
35
- @redis.hset "statistics", "minimum_response_time", content[:response_time].to_f if @redis.hget("statistics", "minimum_response_time").nil? or content[:response_time].to_f < @redis.hget("statistics", "minimum_response_time").to_f
36
- if @redis.hexists "statistics", "average_length"
37
- @redis.hset("statistics", "average_length", (((@redis.hget("statistics", "average_length").to_i*crawl_counter) + content[:length].to_i) / (crawl_counter + 1)))
33
+ @statistics[:maximum_response_time] = content[:response_time].to_f if @statistics[:maximum_response_time].nil? or content[:response_time].to_f > @statistics[:maximum_response_time].to_f
34
+ @statistics[:minimum_response_time] = content[:response_time].to_f if @statistics[:minimum_response_time].nil? or content[:response_time].to_f < @statistics[:minimum_response_time].to_f
35
+ if @statistics.has_key? :average_length
36
+ @statistics[:average_length] = (((@redis.hget("statistics", "average_length").to_i*crawl_counter) + content[:length].to_i) / (crawl_counter + 1))
38
37
  else
39
- @redis.hset("statistics", "average_length", content[:length].to_i)
38
+ @statistics[:average_length] = content[:length].to_i
40
39
  end
41
- @redis.hset "statistics", "maximum_length", content[:length].to_i if @redis.hget("statistics", "maximum_length").nil? or content[:length].to_i > @redis.hget("statistics", "maximum_length").to_i
42
- @redis.hset "statistics", "minimum_length", content[:length].to_i if @redis.hget("statistics", "minimum_length").nil? or content[:length].to_i < @redis.hget("statistics", "minimum_length").to_i
43
-
44
-
40
+ @statistics[:maximum_length] = content[:length].to_i if @redis.hget("statistics", "maximum_length").nil? or content[:length].to_i > @statistics[:maximum_length].to_i
41
+ @statistics[:minimum_length] = content[:length].to_i if @redis.hget("statistics", "minimum_length").nil? or content[:length].to_i < @statistics[:minimum_length].to_i
42
+
45
43
  if content[:mime_type].include?("text/html") or content[:mime_type].include?("application/xhtml+xml")
46
- @redis.hset "statistics", "page_count", @redis.hget("statistics", "page_count").to_i + 1
47
- @redis.hset "statistics", "page_size", @redis.hget("statistics", "page_size").to_i + content[:length].to_i
44
+ @statistics[:page_count] = @statistics[:page_count].to_i + 1
45
+ @statistics[:page_size] = @statistics[:page_size].to_i + content[:length].to_i
48
46
  increment_time_stat("pages_count")
49
47
  else
50
- @redis.hset "statistics", "asset_count", @redis.hget("statistics", "asset_count").to_i + 1
51
- @redis.hset "statistics", "asset_size", @redis.hget("statistics", "asset_size").to_i + content[:length].to_i
48
+ @statistics[:asset_count] = @statistics[:asset_count].to_i + 1
49
+ @statistics[:asset_size] = @statistics[:asset_size].to_i + content[:length].to_i
52
50
  increment_time_stat("assets_count")
53
51
  end
54
52
 
55
- total_redirects = @redis.hget("statistics", "total_redirects").to_i
56
- @redis.hset "statistics", "total_redirects", 0 if total_redirects.nil?
57
- @redis.hset("statistics", "total_redirects", total_redirects += content[:redirect_through].count) unless content[:redirect_through].nil?
53
+ total_redirects = @statistics[:total_redirects].to_i
54
+ @statistics[:total_redirects] = 0 if total_redirects.nil?
55
+ @statistics[:total_redirects] = total_redirects += content[:redirect_through].count unless content[:redirect_through].nil?
58
56
 
59
- @redis.hset "statistics", "crawl_counter", crawl_counter
60
- @redis.hset "statistics", "queue_counter", queue_counter
57
+ @statistics[:crawl_counter] = crawl_counter
58
+ @statistics[:queue_counter] = queue_counter
61
59
 
62
- total_length = @redis.hget("statistics", "total_length").to_i
63
- @redis.hset "statistics", "total_length", total_length + content[:length].to_i
60
+ total_length = @statistics[:total_length].to_i
61
+ @statistics[:total_length] = total_length + content[:length].to_i
64
62
 
65
63
  mime_counts = {}
66
- if @redis.hexists "statistics", "mime_counts"
67
- mime_counts = JSON.parse(@redis.hget("statistics", "mime_counts"))
64
+ if @statistics.has_key? :mime_counts
65
+ mime_counts = @statistics[:mime_counts]
68
66
  if mime_counts.has_key? content[:mime_type]
69
67
  mime_counts[content[:mime_type]] += 1
70
68
  else
@@ -73,30 +71,30 @@ class Stats
73
71
  else
74
72
  mime_counts = {content[:mime_type] => 1}
75
73
  end
76
- @redis.hset "statistics", "mime_counts", mime_counts.to_json
74
+ @statistics[:mime_counts] = mime_counts.to_json
77
75
 
78
76
  # record mime categories stats
79
- if content[:mime_type].starts_with? "text"
77
+ if content[:mime_type].cobweb_starts_with? "text"
80
78
  increment_time_stat("mime_text_count")
81
- elsif content[:mime_type].starts_with? "application"
79
+ elsif content[:mime_type].cobweb_starts_with? "application"
82
80
  increment_time_stat("mime_application_count")
83
- elsif content[:mime_type].starts_with? "audio"
81
+ elsif content[:mime_type].cobweb_starts_with? "audio"
84
82
  increment_time_stat("mime_audio_count")
85
- elsif content[:mime_type].starts_with? "image"
83
+ elsif content[:mime_type].cobweb_starts_with? "image"
86
84
  increment_time_stat("mime_image_count")
87
- elsif content[:mime_type].starts_with? "message"
85
+ elsif content[:mime_type].cobweb_starts_with? "message"
88
86
  increment_time_stat("mime_message_count")
89
- elsif content[:mime_type].starts_with? "model"
87
+ elsif content[:mime_type].cobweb_starts_with? "model"
90
88
  increment_time_stat("mime_model_count")
91
- elsif content[:mime_type].starts_with? "multipart"
89
+ elsif content[:mime_type].cobweb_starts_with? "multipart"
92
90
  increment_time_stat("mime_multipart_count")
93
- elsif content[:mime_type].starts_with? "video"
91
+ elsif content[:mime_type].cobweb_starts_with? "video"
94
92
  increment_time_stat("mime_video_count")
95
93
  end
96
94
 
97
95
  status_counts = {}
98
- if @redis.hexists "statistics", "status_counts"
99
- status_counts = HashUtil.deep_symbolize_keys(JSON.parse(@redis.hget("statistics", "status_counts")))
96
+ if @statistics.has_key? :status_counts
97
+ status_counts = @statistics[:status_counts]
100
98
  status_code = content[:status_code].to_i.to_s.to_sym
101
99
  if status_counts.has_key? status_code
102
100
  status_counts[status_code] += 1
@@ -116,13 +114,15 @@ class Stats
116
114
  increment_time_stat("status|_500_count")
117
115
  end
118
116
 
119
- @redis.hset "statistics", "status_counts", status_counts.to_json
120
-
117
+ @statistics[:status_counts] = status_counts.to_json
121
118
 
122
119
  ## time based statistics
123
120
  increment_time_stat("minute_totals", "minute", 60)
124
121
 
125
- get_statistics
122
+ redis_command = "@redis.hmset 'statistics', #{@statistics.keys.map{|key| "'#{key}', '#{@statistics[key]}'"}.join(", ")}"
123
+ instance_eval redis_command
124
+
125
+ @statistics
126
126
  end
127
127
 
128
128
  def record_time_stat(stat_name, value, type="minute", duration=60)
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: cobweb
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.43
4
+ version: 0.0.44
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -13,7 +13,7 @@ date: 2012-05-03 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: resque
16
- requirement: &70139604510540 !ruby/object:Gem::Requirement
16
+ requirement: &70139600146360 !ruby/object:Gem::Requirement
17
17
  none: false
18
18
  requirements:
19
19
  - - ! '>='
@@ -21,10 +21,10 @@ dependencies:
21
21
  version: '0'
22
22
  type: :runtime
23
23
  prerelease: false
24
- version_requirements: *70139604510540
24
+ version_requirements: *70139600146360
25
25
  - !ruby/object:Gem::Dependency
26
26
  name: redis
27
- requirement: &70139604510120 !ruby/object:Gem::Requirement
27
+ requirement: &70139600145940 !ruby/object:Gem::Requirement
28
28
  none: false
29
29
  requirements:
30
30
  - - ! '>='
@@ -32,10 +32,10 @@ dependencies:
32
32
  version: '0'
33
33
  type: :runtime
34
34
  prerelease: false
35
- version_requirements: *70139604510120
35
+ version_requirements: *70139600145940
36
36
  - !ruby/object:Gem::Dependency
37
37
  name: nokogiri
38
- requirement: &70139604509700 !ruby/object:Gem::Requirement
38
+ requirement: &70139600145520 !ruby/object:Gem::Requirement
39
39
  none: false
40
40
  requirements:
41
41
  - - ! '>='
@@ -43,10 +43,10 @@ dependencies:
43
43
  version: '0'
44
44
  type: :runtime
45
45
  prerelease: false
46
- version_requirements: *70139604509700
46
+ version_requirements: *70139600145520
47
47
  - !ruby/object:Gem::Dependency
48
48
  name: addressable
49
- requirement: &70139604509280 !ruby/object:Gem::Requirement
49
+ requirement: &70139600145100 !ruby/object:Gem::Requirement
50
50
  none: false
51
51
  requirements:
52
52
  - - ! '>='
@@ -54,10 +54,10 @@ dependencies:
54
54
  version: '0'
55
55
  type: :runtime
56
56
  prerelease: false
57
- version_requirements: *70139604509280
57
+ version_requirements: *70139600145100
58
58
  - !ruby/object:Gem::Dependency
59
59
  name: rspec
60
- requirement: &70139604508860 !ruby/object:Gem::Requirement
60
+ requirement: &70139600144680 !ruby/object:Gem::Requirement
61
61
  none: false
62
62
  requirements:
63
63
  - - ! '>='
@@ -65,10 +65,10 @@ dependencies:
65
65
  version: '0'
66
66
  type: :runtime
67
67
  prerelease: false
68
- version_requirements: *70139604508860
68
+ version_requirements: *70139600144680
69
69
  - !ruby/object:Gem::Dependency
70
70
  name: awesome_print
71
- requirement: &70139604508440 !ruby/object:Gem::Requirement
71
+ requirement: &70139600144260 !ruby/object:Gem::Requirement
72
72
  none: false
73
73
  requirements:
74
74
  - - ! '>='
@@ -76,10 +76,10 @@ dependencies:
76
76
  version: '0'
77
77
  type: :runtime
78
78
  prerelease: false
79
- version_requirements: *70139604508440
79
+ version_requirements: *70139600144260
80
80
  - !ruby/object:Gem::Dependency
81
81
  name: sinatra
82
- requirement: &70139604508020 !ruby/object:Gem::Requirement
82
+ requirement: &70139600143840 !ruby/object:Gem::Requirement
83
83
  none: false
84
84
  requirements:
85
85
  - - ! '>='
@@ -87,10 +87,10 @@ dependencies:
87
87
  version: '0'
88
88
  type: :runtime
89
89
  prerelease: false
90
- version_requirements: *70139604508020
90
+ version_requirements: *70139600143840
91
91
  - !ruby/object:Gem::Dependency
92
92
  name: thin
93
- requirement: &70139604523960 !ruby/object:Gem::Requirement
93
+ requirement: &70139600143420 !ruby/object:Gem::Requirement
94
94
  none: false
95
95
  requirements:
96
96
  - - ! '>='
@@ -98,10 +98,10 @@ dependencies:
98
98
  version: '0'
99
99
  type: :runtime
100
100
  prerelease: false
101
- version_requirements: *70139604523960
101
+ version_requirements: *70139600143420
102
102
  - !ruby/object:Gem::Dependency
103
103
  name: haml
104
- requirement: &70139604523540 !ruby/object:Gem::Requirement
104
+ requirement: &70139600143000 !ruby/object:Gem::Requirement
105
105
  none: false
106
106
  requirements:
107
107
  - - ! '>='
@@ -109,10 +109,10 @@ dependencies:
109
109
  version: '0'
110
110
  type: :runtime
111
111
  prerelease: false
112
- version_requirements: *70139604523540
112
+ version_requirements: *70139600143000
113
113
  - !ruby/object:Gem::Dependency
114
114
  name: namespaced_redis
115
- requirement: &70139604523120 !ruby/object:Gem::Requirement
115
+ requirement: &70139600142580 !ruby/object:Gem::Requirement
116
116
  none: false
117
117
  requirements:
118
118
  - - ! '>='
@@ -120,7 +120,7 @@ dependencies:
120
120
  version: '0'
121
121
  type: :runtime
122
122
  prerelease: false
123
- version_requirements: *70139604523120
123
+ version_requirements: *70139600142580
124
124
  description: Web Crawler that uses resque background job engine to allow you to cluster
125
125
  your crawl.
126
126
  email: stewart@rockwellcottage.com