cobweb 0.0.21 → 0.0.22

Sign up to get free protection for your applications and to get access to all the features.
Files changed (4)
  1. data/lib/cobweb.rb +18 -10
  2. data/lib/crawl_job.rb +4 -3
  3. data/lib/stats.rb +0 -1
  4. metadata +21 -21
data/lib/cobweb.rb CHANGED
@@ -5,8 +5,8 @@ require "addressable/uri"
5
5
  require 'digest/sha1'
6
6
  require 'base64'
7
7
 
8
- Dir[File.dirname(__FILE__) + '/*.rb'].each do |file|
9
- require [File.dirname(__FILE__), File.basename(file, File.extname(file))].join("/")
8
+ Dir[File.dirname(__FILE__) + '/**/*.rb'].each do |file|
9
+ require file
10
10
  end
11
11
 
12
12
  class Cobweb
@@ -18,6 +18,10 @@ class Cobweb
18
18
  # on end of crawl, return statistic hash (could call specified method ?) if single threaded or enqueue to a specified queue the stat hash
19
19
  # investigate using event machine for single threaded crawling
20
20
 
21
+ def self.version
22
+ "0.0.22"
23
+ end
24
+
21
25
  def initialize(options = {})
22
26
  @options = options
23
27
  @options[:follow_redirects] = true unless @options.has_key?(:follow_redirects)
@@ -34,12 +38,12 @@ class Cobweb
34
38
  def start(base_url)
35
39
  raise ":base_url is required" unless base_url
36
40
  request = {
37
- :crawl_id => Digest::SHA1.hexdigest(Time.now.to_s),
41
+ :crawl_id => Digest::SHA1.hexdigest("#{Time.now.to_i}.#{Time.now.usec}"),
38
42
  :url => base_url
39
43
  }
40
44
 
41
45
  request.merge!(@options)
42
- @redis = NamespacedRedis.new(Redis.new(request[:redis_options]), "cobweb-#{VERSION}-#{request[:crawl_id]}")
46
+ @redis = NamespacedRedis.new(Redis.new(request[:redis_options]), "cobweb-#{Cobweb.version}-#{request[:crawl_id]}")
43
47
  @redis.hset "statistics", "queued_at", DateTime.now
44
48
 
45
49
  Resque.enqueue(CrawlJob, request)
@@ -53,13 +57,17 @@ class Cobweb
53
57
 
54
58
  # get the unique id for this request
55
59
  unique_id = Digest::SHA1.hexdigest(url.to_s)
56
- redirect_limit = options[:redirect_limit]
60
+ if options.has_key?(:redirect_limit) and !options[:redirect_limit].nil?
61
+ redirect_limit = options[:redirect_limit].to_i
62
+ else
63
+ redirect_limit = 10
64
+ end
57
65
 
58
66
  # connect to redis
59
67
  if options.has_key? :crawl_id
60
- redis = NamespacedRedis.new(Redis.new(@options[:redis_options]), "cobweb-#{VERSION}-#{options[:crawl_id]}")
68
+ redis = NamespacedRedis.new(Redis.new(@options[:redis_options]), "cobweb-#{Cobweb.version}-#{options[:crawl_id]}")
61
69
  else
62
- redis = NamespacedRedis.new(Redis.new(@options[:redis_options]), "cobweb-#{VERSION}")
70
+ redis = NamespacedRedis.new(Redis.new(@options[:redis_options]), "cobweb-#{Cobweb.version}")
63
71
  end
64
72
 
65
73
  content = {}
@@ -107,7 +115,7 @@ class Cobweb
107
115
  raise RedirectError, "Redirect Limit reached" if redirect_limit == 0
108
116
 
109
117
  # get the content from redirect location
110
- content = get(url, redirect_limit)
118
+ content = get(url, options.merge(:redirect_limit => redirect_limit))
111
119
  content[:url] = uri.to_s
112
120
  content[:redirect_through] = [] if content[:redirect_through].nil?
113
121
  content[:redirect_through].insert(0, url)
@@ -206,9 +214,9 @@ class Cobweb
206
214
 
207
215
  # connect to redis
208
216
  if options.has_key? :crawl_id
209
- redis = NamespacedRedis.new(Redis.new(@options[:redis_options]), "cobweb-#{VERSION}-#{options[:crawl_id]}")
217
+ redis = NamespacedRedis.new(Redis.new(@options[:redis_options]), "cobweb-#{Cobweb.version}-#{options[:crawl_id]}")
210
218
  else
211
- redis = NamespacedRedis.new(Redis.new(@options[:redis_options]), "cobweb-#{VERSION}")
219
+ redis = NamespacedRedis.new(Redis.new(@options[:redis_options]), "cobweb-#{Cobweb.version}")
212
220
  end
213
221
 
214
222
  content = {}
data/lib/crawl_job.rb CHANGED
@@ -28,8 +28,8 @@ class CrawlJob
28
28
  def self.perform(content_request)
29
29
  # change all hash keys to symbols
30
30
  content_request.deep_symbolize_keys
31
- redis = NamespacedRedis.new(Redis.new(content_request[:redis_options]), "cobweb-#{VERSION}-#{content_request[:crawl_id]}")
32
- ap redis.namespace
31
+ redis = NamespacedRedis.new(Redis.new(content_request[:redis_options]), "cobweb-#{Cobweb.version}-#{content_request[:crawl_id]}")
32
+
33
33
  @absolutize = Absolutize.new(content_request[:url], :output_debug => false, :raise_exceptions => false, :force_escaping => false, :remove_anchors => true)
34
34
 
35
35
  # check we haven't crawled this url before
@@ -95,6 +95,7 @@ class CrawlJob
95
95
  redis.sadd "crawled", content_request[:url]
96
96
  set_base_url redis, content, content_request[:base_url]
97
97
  content[:links].keys.map{|key| content[:links][key]}.flatten.each do |link|
98
+ link = link.to_s
98
99
  unless redis.sismember "crawled", link
99
100
  puts "Checking if #{link} matches #{redis.get("base_url")} as internal?" if content_request[:debug]
100
101
  if link.to_s.match(Regexp.new("^#{redis.get("base_url")}"))
@@ -129,7 +130,7 @@ class CrawlJob
129
130
 
130
131
  # detect finished state
131
132
 
132
- if queue_counter == crawl_counter or content_request[:crawl_limit].to_i <= crawl_counter
133
+ if queue_counter == crawl_counter or content_request[:crawl_limit].to_i <= crawl_counter
133
134
 
134
135
  puts "queue_counter: #{queue_counter}"
135
136
  puts "crawl_counter: #{crawl_counter}"
data/lib/stats.rb CHANGED
@@ -11,7 +11,6 @@ class Stats < Sinatra::Base
11
11
  @@status = status
12
12
  end
13
13
 
14
- ap settings.root
15
14
  set :views, settings.root + '/../views'
16
15
 
17
16
  get '/' do
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: cobweb
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.21
4
+ version: 0.0.22
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -13,7 +13,7 @@ date: 2012-03-09 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: resque
16
- requirement: &70294598137300 !ruby/object:Gem::Requirement
16
+ requirement: &70101145919340 !ruby/object:Gem::Requirement
17
17
  none: false
18
18
  requirements:
19
19
  - - ! '>='
@@ -21,10 +21,10 @@ dependencies:
21
21
  version: '0'
22
22
  type: :runtime
23
23
  prerelease: false
24
- version_requirements: *70294598137300
24
+ version_requirements: *70101145919340
25
25
  - !ruby/object:Gem::Dependency
26
26
  name: redis
27
- requirement: &70294598136460 !ruby/object:Gem::Requirement
27
+ requirement: &70101145918920 !ruby/object:Gem::Requirement
28
28
  none: false
29
29
  requirements:
30
30
  - - ! '>='
@@ -32,10 +32,10 @@ dependencies:
32
32
  version: '0'
33
33
  type: :runtime
34
34
  prerelease: false
35
- version_requirements: *70294598136460
35
+ version_requirements: *70101145918920
36
36
  - !ruby/object:Gem::Dependency
37
37
  name: absolutize
38
- requirement: &70294598135800 !ruby/object:Gem::Requirement
38
+ requirement: &70101145918500 !ruby/object:Gem::Requirement
39
39
  none: false
40
40
  requirements:
41
41
  - - ! '>='
@@ -43,10 +43,10 @@ dependencies:
43
43
  version: '0'
44
44
  type: :runtime
45
45
  prerelease: false
46
- version_requirements: *70294598135800
46
+ version_requirements: *70101145918500
47
47
  - !ruby/object:Gem::Dependency
48
48
  name: nokogiri
49
- requirement: &70294598135140 !ruby/object:Gem::Requirement
49
+ requirement: &70101145934440 !ruby/object:Gem::Requirement
50
50
  none: false
51
51
  requirements:
52
52
  - - ! '>='
@@ -54,10 +54,10 @@ dependencies:
54
54
  version: '0'
55
55
  type: :runtime
56
56
  prerelease: false
57
- version_requirements: *70294598135140
57
+ version_requirements: *70101145934440
58
58
  - !ruby/object:Gem::Dependency
59
59
  name: addressable
60
- requirement: &70294598134440 !ruby/object:Gem::Requirement
60
+ requirement: &70101145934020 !ruby/object:Gem::Requirement
61
61
  none: false
62
62
  requirements:
63
63
  - - ! '>='
@@ -65,10 +65,10 @@ dependencies:
65
65
  version: '0'
66
66
  type: :runtime
67
67
  prerelease: false
68
- version_requirements: *70294598134440
68
+ version_requirements: *70101145934020
69
69
  - !ruby/object:Gem::Dependency
70
70
  name: rspec
71
- requirement: &70294598133760 !ruby/object:Gem::Requirement
71
+ requirement: &70101145933580 !ruby/object:Gem::Requirement
72
72
  none: false
73
73
  requirements:
74
74
  - - ! '>='
@@ -76,10 +76,10 @@ dependencies:
76
76
  version: '0'
77
77
  type: :runtime
78
78
  prerelease: false
79
- version_requirements: *70294598133760
79
+ version_requirements: *70101145933580
80
80
  - !ruby/object:Gem::Dependency
81
81
  name: awesome_print
82
- requirement: &70294598132820 !ruby/object:Gem::Requirement
82
+ requirement: &70101145933160 !ruby/object:Gem::Requirement
83
83
  none: false
84
84
  requirements:
85
85
  - - ! '>='
@@ -87,10 +87,10 @@ dependencies:
87
87
  version: '0'
88
88
  type: :runtime
89
89
  prerelease: false
90
- version_requirements: *70294598132820
90
+ version_requirements: *70101145933160
91
91
  - !ruby/object:Gem::Dependency
92
92
  name: sinatra
93
- requirement: &70294598131980 !ruby/object:Gem::Requirement
93
+ requirement: &70101145932740 !ruby/object:Gem::Requirement
94
94
  none: false
95
95
  requirements:
96
96
  - - ! '>='
@@ -98,10 +98,10 @@ dependencies:
98
98
  version: '0'
99
99
  type: :runtime
100
100
  prerelease: false
101
- version_requirements: *70294598131980
101
+ version_requirements: *70101145932740
102
102
  - !ruby/object:Gem::Dependency
103
103
  name: thin
104
- requirement: &70294598131160 !ruby/object:Gem::Requirement
104
+ requirement: &70101145932320 !ruby/object:Gem::Requirement
105
105
  none: false
106
106
  requirements:
107
107
  - - ! '>='
@@ -109,10 +109,10 @@ dependencies:
109
109
  version: '0'
110
110
  type: :runtime
111
111
  prerelease: false
112
- version_requirements: *70294598131160
112
+ version_requirements: *70101145932320
113
113
  - !ruby/object:Gem::Dependency
114
114
  name: haml
115
- requirement: &70294598130540 !ruby/object:Gem::Requirement
115
+ requirement: &70101145931900 !ruby/object:Gem::Requirement
116
116
  none: false
117
117
  requirements:
118
118
  - - ! '>='
@@ -120,7 +120,7 @@ dependencies:
120
120
  version: '0'
121
121
  type: :runtime
122
122
  prerelease: false
123
- version_requirements: *70294598130540
123
+ version_requirements: *70101145931900
124
124
  description:
125
125
  email: stewart@rockwellcottage.com
126
126
  executables: []