cobweb 0.0.21 → 0.0.22

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (4):
  1. data/lib/cobweb.rb +18 -10
  2. data/lib/crawl_job.rb +4 -3
  3. data/lib/stats.rb +0 -1
  4. metadata +21 -21
data/lib/cobweb.rb CHANGED
@@ -5,8 +5,8 @@ require "addressable/uri"
5
5
  require 'digest/sha1'
6
6
  require 'base64'
7
7
 
8
- Dir[File.dirname(__FILE__) + '/*.rb'].each do |file|
9
- require [File.dirname(__FILE__), File.basename(file, File.extname(file))].join("/")
8
+ Dir[File.dirname(__FILE__) + '/**/*.rb'].each do |file|
9
+ require file
10
10
  end
11
11
 
12
12
  class Cobweb
@@ -18,6 +18,10 @@ class Cobweb
18
18
  # on end of crawl, return statistic hash (could call specified method ?) if single threaded or enqueue to a specified queue the stat hash
19
19
  # investigate using event machine for single threaded crawling
20
20
 
21
+ def self.version
22
+ "0.0.22"
23
+ end
24
+
21
25
  def initialize(options = {})
22
26
  @options = options
23
27
  @options[:follow_redirects] = true unless @options.has_key?(:follow_redirects)
@@ -34,12 +38,12 @@ class Cobweb
34
38
  def start(base_url)
35
39
  raise ":base_url is required" unless base_url
36
40
  request = {
37
- :crawl_id => Digest::SHA1.hexdigest(Time.now.to_s),
41
+ :crawl_id => Digest::SHA1.hexdigest("#{Time.now.to_i}.#{Time.now.usec}"),
38
42
  :url => base_url
39
43
  }
40
44
 
41
45
  request.merge!(@options)
42
- @redis = NamespacedRedis.new(Redis.new(request[:redis_options]), "cobweb-#{VERSION}-#{request[:crawl_id]}")
46
+ @redis = NamespacedRedis.new(Redis.new(request[:redis_options]), "cobweb-#{Cobweb.version}-#{request[:crawl_id]}")
43
47
  @redis.hset "statistics", "queued_at", DateTime.now
44
48
 
45
49
  Resque.enqueue(CrawlJob, request)
@@ -53,13 +57,17 @@ class Cobweb
53
57
 
54
58
  # get the unique id for this request
55
59
  unique_id = Digest::SHA1.hexdigest(url.to_s)
56
- redirect_limit = options[:redirect_limit]
60
+ if options.has_key?(:redirect_limit) and !options[:redirect_limit].nil?
61
+ redirect_limit = options[:redirect_limit].to_i
62
+ else
63
+ redirect_limit = 10
64
+ end
57
65
 
58
66
  # connect to redis
59
67
  if options.has_key? :crawl_id
60
- redis = NamespacedRedis.new(Redis.new(@options[:redis_options]), "cobweb-#{VERSION}-#{options[:crawl_id]}")
68
+ redis = NamespacedRedis.new(Redis.new(@options[:redis_options]), "cobweb-#{Cobweb.version}-#{options[:crawl_id]}")
61
69
  else
62
- redis = NamespacedRedis.new(Redis.new(@options[:redis_options]), "cobweb-#{VERSION}")
70
+ redis = NamespacedRedis.new(Redis.new(@options[:redis_options]), "cobweb-#{Cobweb.version}")
63
71
  end
64
72
 
65
73
  content = {}
@@ -107,7 +115,7 @@ class Cobweb
107
115
  raise RedirectError, "Redirect Limit reached" if redirect_limit == 0
108
116
 
109
117
  # get the content from redirect location
110
- content = get(url, redirect_limit)
118
+ content = get(url, options.merge(:redirect_limit => redirect_limit))
111
119
  content[:url] = uri.to_s
112
120
  content[:redirect_through] = [] if content[:redirect_through].nil?
113
121
  content[:redirect_through].insert(0, url)
@@ -206,9 +214,9 @@ class Cobweb
206
214
 
207
215
  # connect to redis
208
216
  if options.has_key? :crawl_id
209
- redis = NamespacedRedis.new(Redis.new(@options[:redis_options]), "cobweb-#{VERSION}-#{options[:crawl_id]}")
217
+ redis = NamespacedRedis.new(Redis.new(@options[:redis_options]), "cobweb-#{Cobweb.version}-#{options[:crawl_id]}")
210
218
  else
211
- redis = NamespacedRedis.new(Redis.new(@options[:redis_options]), "cobweb-#{VERSION}")
219
+ redis = NamespacedRedis.new(Redis.new(@options[:redis_options]), "cobweb-#{Cobweb.version}")
212
220
  end
213
221
 
214
222
  content = {}
data/lib/crawl_job.rb CHANGED
@@ -28,8 +28,8 @@ class CrawlJob
28
28
  def self.perform(content_request)
29
29
  # change all hash keys to symbols
30
30
  content_request.deep_symbolize_keys
31
- redis = NamespacedRedis.new(Redis.new(content_request[:redis_options]), "cobweb-#{VERSION}-#{content_request[:crawl_id]}")
32
- ap redis.namespace
31
+ redis = NamespacedRedis.new(Redis.new(content_request[:redis_options]), "cobweb-#{Cobweb.version}-#{content_request[:crawl_id]}")
32
+
33
33
  @absolutize = Absolutize.new(content_request[:url], :output_debug => false, :raise_exceptions => false, :force_escaping => false, :remove_anchors => true)
34
34
 
35
35
  # check we haven't crawled this url before
@@ -95,6 +95,7 @@ class CrawlJob
95
95
  redis.sadd "crawled", content_request[:url]
96
96
  set_base_url redis, content, content_request[:base_url]
97
97
  content[:links].keys.map{|key| content[:links][key]}.flatten.each do |link|
98
+ link = link.to_s
98
99
  unless redis.sismember "crawled", link
99
100
  puts "Checking if #{link} matches #{redis.get("base_url")} as internal?" if content_request[:debug]
100
101
  if link.to_s.match(Regexp.new("^#{redis.get("base_url")}"))
@@ -129,7 +130,7 @@ class CrawlJob
129
130
 
130
131
  # detect finished state
131
132
 
132
- if queue_counter == crawl_counter or content_request[:crawl_limit].to_i <= crawl_counter
133
+ if queue_counter == crawl_counter or content_request[:crawl_limit].to_i <= crawl_counter
133
134
 
134
135
  puts "queue_counter: #{queue_counter}"
135
136
  puts "crawl_counter: #{crawl_counter}"
data/lib/stats.rb CHANGED
@@ -11,7 +11,6 @@ class Stats < Sinatra::Base
11
11
  @@status = status
12
12
  end
13
13
 
14
- ap settings.root
15
14
  set :views, settings.root + '/../views'
16
15
 
17
16
  get '/' do
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: cobweb
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.21
4
+ version: 0.0.22
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -13,7 +13,7 @@ date: 2012-03-09 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: resque
16
- requirement: &70294598137300 !ruby/object:Gem::Requirement
16
+ requirement: &70101145919340 !ruby/object:Gem::Requirement
17
17
  none: false
18
18
  requirements:
19
19
  - - ! '>='
@@ -21,10 +21,10 @@ dependencies:
21
21
  version: '0'
22
22
  type: :runtime
23
23
  prerelease: false
24
- version_requirements: *70294598137300
24
+ version_requirements: *70101145919340
25
25
  - !ruby/object:Gem::Dependency
26
26
  name: redis
27
- requirement: &70294598136460 !ruby/object:Gem::Requirement
27
+ requirement: &70101145918920 !ruby/object:Gem::Requirement
28
28
  none: false
29
29
  requirements:
30
30
  - - ! '>='
@@ -32,10 +32,10 @@ dependencies:
32
32
  version: '0'
33
33
  type: :runtime
34
34
  prerelease: false
35
- version_requirements: *70294598136460
35
+ version_requirements: *70101145918920
36
36
  - !ruby/object:Gem::Dependency
37
37
  name: absolutize
38
- requirement: &70294598135800 !ruby/object:Gem::Requirement
38
+ requirement: &70101145918500 !ruby/object:Gem::Requirement
39
39
  none: false
40
40
  requirements:
41
41
  - - ! '>='
@@ -43,10 +43,10 @@ dependencies:
43
43
  version: '0'
44
44
  type: :runtime
45
45
  prerelease: false
46
- version_requirements: *70294598135800
46
+ version_requirements: *70101145918500
47
47
  - !ruby/object:Gem::Dependency
48
48
  name: nokogiri
49
- requirement: &70294598135140 !ruby/object:Gem::Requirement
49
+ requirement: &70101145934440 !ruby/object:Gem::Requirement
50
50
  none: false
51
51
  requirements:
52
52
  - - ! '>='
@@ -54,10 +54,10 @@ dependencies:
54
54
  version: '0'
55
55
  type: :runtime
56
56
  prerelease: false
57
- version_requirements: *70294598135140
57
+ version_requirements: *70101145934440
58
58
  - !ruby/object:Gem::Dependency
59
59
  name: addressable
60
- requirement: &70294598134440 !ruby/object:Gem::Requirement
60
+ requirement: &70101145934020 !ruby/object:Gem::Requirement
61
61
  none: false
62
62
  requirements:
63
63
  - - ! '>='
@@ -65,10 +65,10 @@ dependencies:
65
65
  version: '0'
66
66
  type: :runtime
67
67
  prerelease: false
68
- version_requirements: *70294598134440
68
+ version_requirements: *70101145934020
69
69
  - !ruby/object:Gem::Dependency
70
70
  name: rspec
71
- requirement: &70294598133760 !ruby/object:Gem::Requirement
71
+ requirement: &70101145933580 !ruby/object:Gem::Requirement
72
72
  none: false
73
73
  requirements:
74
74
  - - ! '>='
@@ -76,10 +76,10 @@ dependencies:
76
76
  version: '0'
77
77
  type: :runtime
78
78
  prerelease: false
79
- version_requirements: *70294598133760
79
+ version_requirements: *70101145933580
80
80
  - !ruby/object:Gem::Dependency
81
81
  name: awesome_print
82
- requirement: &70294598132820 !ruby/object:Gem::Requirement
82
+ requirement: &70101145933160 !ruby/object:Gem::Requirement
83
83
  none: false
84
84
  requirements:
85
85
  - - ! '>='
@@ -87,10 +87,10 @@ dependencies:
87
87
  version: '0'
88
88
  type: :runtime
89
89
  prerelease: false
90
- version_requirements: *70294598132820
90
+ version_requirements: *70101145933160
91
91
  - !ruby/object:Gem::Dependency
92
92
  name: sinatra
93
- requirement: &70294598131980 !ruby/object:Gem::Requirement
93
+ requirement: &70101145932740 !ruby/object:Gem::Requirement
94
94
  none: false
95
95
  requirements:
96
96
  - - ! '>='
@@ -98,10 +98,10 @@ dependencies:
98
98
  version: '0'
99
99
  type: :runtime
100
100
  prerelease: false
101
- version_requirements: *70294598131980
101
+ version_requirements: *70101145932740
102
102
  - !ruby/object:Gem::Dependency
103
103
  name: thin
104
- requirement: &70294598131160 !ruby/object:Gem::Requirement
104
+ requirement: &70101145932320 !ruby/object:Gem::Requirement
105
105
  none: false
106
106
  requirements:
107
107
  - - ! '>='
@@ -109,10 +109,10 @@ dependencies:
109
109
  version: '0'
110
110
  type: :runtime
111
111
  prerelease: false
112
- version_requirements: *70294598131160
112
+ version_requirements: *70101145932320
113
113
  - !ruby/object:Gem::Dependency
114
114
  name: haml
115
- requirement: &70294598130540 !ruby/object:Gem::Requirement
115
+ requirement: &70101145931900 !ruby/object:Gem::Requirement
116
116
  none: false
117
117
  requirements:
118
118
  - - ! '>='
@@ -120,7 +120,7 @@ dependencies:
120
120
  version: '0'
121
121
  type: :runtime
122
122
  prerelease: false
123
- version_requirements: *70294598130540
123
+ version_requirements: *70101145931900
124
124
  description:
125
125
  email: stewart@rockwellcottage.com
126
126
  executables: []