cobweb 0.0.61 → 0.0.62

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/README.textile CHANGED
@@ -1,5 +1,5 @@
1
1
 
2
- h1. Cobweb v0.0.61
2
+ h1. Cobweb v0.0.62
3
3
  !https://secure.travis-ci.org/stewartmckee/cobweb.png?branch=master!
4
4
 
5
5
  h2. Intro
data/lib/cobweb.rb CHANGED
@@ -66,6 +66,7 @@ class Cobweb
66
66
 
67
67
  request.merge!(@options)
68
68
  @redis = NamespacedRedis.new(request[:redis_options], "cobweb-#{Cobweb.version}-#{request[:crawl_id]}")
69
+ @redis.set("original_base_url", base_url)
69
70
  @redis.hset "statistics", "queued_at", DateTime.now
70
71
  @redis.set("crawl-counter", 0)
71
72
  @redis.set("queue-counter", 1)
@@ -75,7 +76,6 @@ class Cobweb
75
76
 
76
77
  # add internal_urls into redis
77
78
  @options[:internal_urls].map{|url| @redis.sadd("internal_urls", url)}
78
-
79
79
  Resque.enqueue(CrawlJob, request)
80
80
  request
81
81
  end
@@ -114,7 +114,7 @@ class Cobweb
114
114
  else
115
115
  redis = NamespacedRedis.new(@options[:redis_options], "cobweb-#{Cobweb.version}")
116
116
  end
117
-
117
+
118
118
  content = {:base_url => url}
119
119
 
120
120
  # check if it has already been cached
@@ -149,7 +149,7 @@ class Cobweb
149
149
  puts "redirected... " unless @options[:quiet]
150
150
 
151
151
  # get location to redirect to
152
- url = UriHelper.join_no_fragment(uri, response['location'])
152
+ uri = UriHelper.join_no_fragment(uri, response['location'])
153
153
 
154
154
  # decrement redirect limit
155
155
  redirect_limit = redirect_limit - 1
@@ -158,7 +158,7 @@ class Cobweb
158
158
  cookies = get_cookies(response)
159
159
 
160
160
  # get the content from redirect location
161
- content = get(url, options.merge(:redirect_limit => redirect_limit, :cookies => cookies))
161
+ content = get(uri, options.merge(:redirect_limit => redirect_limit, :cookies => cookies))
162
162
  content[:url] = uri.to_s
163
163
  content[:redirect_through] = [] if content[:redirect_through].nil?
164
164
  content[:redirect_through].insert(0, url)
@@ -307,14 +307,14 @@ class Cobweb
307
307
  if @options[:follow_redirects] and response.code.to_i >= 300 and response.code.to_i < 400
308
308
  puts "redirected... " unless @options[:quiet]
309
309
 
310
- url = UriHelper.join_no_fragment(uri, response['location'])
310
+ uri = UriHelper.join_no_fragment(uri, response['location'])
311
311
 
312
312
  redirect_limit = redirect_limit - 1
313
313
 
314
314
  raise RedirectError, "Redirect Limit reached" if redirect_limit == 0
315
315
  cookies = get_cookies(response)
316
316
 
317
- content = head(url, options.merge(:redirect_limit => redirect_limit, :cookies => cookies))
317
+ content = head(uri, options.merge(:redirect_limit => redirect_limit, :cookies => cookies))
318
318
  content[:url] = uri.to_s
319
319
  content[:redirect_through] = [] if content[:redirect_through].nil?
320
320
  content[:redirect_through].insert(0, url)
@@ -3,7 +3,7 @@ class CobwebVersion
3
3
 
4
4
  # Returns a string of the current version
5
5
  def self.version
6
- "0.0.61"
6
+ "0.0.62"
7
7
  end
8
8
 
9
9
  end
data/lib/crawl_job.rb CHANGED
@@ -28,9 +28,11 @@ class CrawlJob
28
28
 
29
29
  # check we haven't crawled this url before
30
30
  unless @redis.sismember "crawled", content_request[:url]
31
-
32
31
  content = Cobweb.new(content_request).get(content_request[:url], content_request)
33
-
32
+ if content_request[:url] == @redis.get("original_base_url")
33
+ puts content
34
+ @redis.set("crawled_base_url", content[:base_url])
35
+ end
34
36
  if is_permitted_type(content)
35
37
  # if there is no limit or we're still under it lets get the url
36
38
  if within_crawl_limits?(content_request[:crawl_limit])
@@ -124,9 +126,9 @@ class CrawlJob
124
126
  def self.finished(content_request)
125
127
  # finished
126
128
  if @redis.hget("statistics", "current_status")!= "Crawl Stopped"
127
- ap "CRAWL FINISHED #{content_request[:url]}, #{counters}" if content_request[:debug]
129
+ ap "CRAWL FINISHED #{content_request[:url]}, #{counters}, #{@redis.get("original_base_url")}, #{@redis.get("crawled_base_url")}" if content_request[:debug]
128
130
  @stats.end_crawl(content_request)
129
- Resque.enqueue(const_get(content_request[:crawl_finished_queue]), @stats.get_statistics.merge({:redis_options => content_request[:redis_options], :crawl_id => content_request[:crawl_id], :source_id => content_request[:source_id]}))
131
+ Resque.enqueue(const_get(content_request[:crawl_finished_queue]), @stats.get_statistics.merge({:redis_options => content_request[:redis_options], :crawl_id => content_request[:crawl_id], :source_id => content_request[:source_id], :crawled_base_url => @redis.get("crawled_base_url")}))
130
132
  else
131
133
  ap "CRAWL REFINISHED #{content_request[:url]}, #{counters}" if content_request[:debug]
132
134
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: cobweb
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.61
4
+ version: 0.0.62
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,11 +9,11 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2012-07-17 00:00:00.000000000 Z
12
+ date: 2012-07-27 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: resque
16
- requirement: &70301996170680 !ruby/object:Gem::Requirement
16
+ requirement: &70305062213520 !ruby/object:Gem::Requirement
17
17
  none: false
18
18
  requirements:
19
19
  - - ! '>='
@@ -21,10 +21,10 @@ dependencies:
21
21
  version: '0'
22
22
  type: :runtime
23
23
  prerelease: false
24
- version_requirements: *70301996170680
24
+ version_requirements: *70305062213520
25
25
  - !ruby/object:Gem::Dependency
26
26
  name: redis
27
- requirement: &70301996168960 !ruby/object:Gem::Requirement
27
+ requirement: &70305062213040 !ruby/object:Gem::Requirement
28
28
  none: false
29
29
  requirements:
30
30
  - - ! '>='
@@ -32,10 +32,10 @@ dependencies:
32
32
  version: '0'
33
33
  type: :runtime
34
34
  prerelease: false
35
- version_requirements: *70301996168960
35
+ version_requirements: *70305062213040
36
36
  - !ruby/object:Gem::Dependency
37
37
  name: nokogiri
38
- requirement: &70301996168220 !ruby/object:Gem::Requirement
38
+ requirement: &70305062212160 !ruby/object:Gem::Requirement
39
39
  none: false
40
40
  requirements:
41
41
  - - ! '>='
@@ -43,10 +43,10 @@ dependencies:
43
43
  version: '0'
44
44
  type: :runtime
45
45
  prerelease: false
46
- version_requirements: *70301996168220
46
+ version_requirements: *70305062212160
47
47
  - !ruby/object:Gem::Dependency
48
48
  name: addressable
49
- requirement: &70301996167040 !ruby/object:Gem::Requirement
49
+ requirement: &70305062211220 !ruby/object:Gem::Requirement
50
50
  none: false
51
51
  requirements:
52
52
  - - ! '>='
@@ -54,10 +54,10 @@ dependencies:
54
54
  version: '0'
55
55
  type: :runtime
56
56
  prerelease: false
57
- version_requirements: *70301996167040
57
+ version_requirements: *70305062211220
58
58
  - !ruby/object:Gem::Dependency
59
59
  name: rspec
60
- requirement: &70301996165600 !ruby/object:Gem::Requirement
60
+ requirement: &70305062210040 !ruby/object:Gem::Requirement
61
61
  none: false
62
62
  requirements:
63
63
  - - ! '>='
@@ -65,10 +65,10 @@ dependencies:
65
65
  version: '0'
66
66
  type: :runtime
67
67
  prerelease: false
68
- version_requirements: *70301996165600
68
+ version_requirements: *70305062210040
69
69
  - !ruby/object:Gem::Dependency
70
70
  name: awesome_print
71
- requirement: &70301996180600 !ruby/object:Gem::Requirement
71
+ requirement: &70305062208860 !ruby/object:Gem::Requirement
72
72
  none: false
73
73
  requirements:
74
74
  - - ! '>='
@@ -76,10 +76,10 @@ dependencies:
76
76
  version: '0'
77
77
  type: :runtime
78
78
  prerelease: false
79
- version_requirements: *70301996180600
79
+ version_requirements: *70305062208860
80
80
  - !ruby/object:Gem::Dependency
81
81
  name: sinatra
82
- requirement: &70301996179980 !ruby/object:Gem::Requirement
82
+ requirement: &70305062224320 !ruby/object:Gem::Requirement
83
83
  none: false
84
84
  requirements:
85
85
  - - ! '>='
@@ -87,10 +87,10 @@ dependencies:
87
87
  version: '0'
88
88
  type: :runtime
89
89
  prerelease: false
90
- version_requirements: *70301996179980
90
+ version_requirements: *70305062224320
91
91
  - !ruby/object:Gem::Dependency
92
92
  name: thin
93
- requirement: &70301996179420 !ruby/object:Gem::Requirement
93
+ requirement: &70305062223780 !ruby/object:Gem::Requirement
94
94
  none: false
95
95
  requirements:
96
96
  - - ! '>='
@@ -98,10 +98,10 @@ dependencies:
98
98
  version: '0'
99
99
  type: :runtime
100
100
  prerelease: false
101
- version_requirements: *70301996179420
101
+ version_requirements: *70305062223780
102
102
  - !ruby/object:Gem::Dependency
103
103
  name: haml
104
- requirement: &70301996178900 !ruby/object:Gem::Requirement
104
+ requirement: &70305062223120 !ruby/object:Gem::Requirement
105
105
  none: false
106
106
  requirements:
107
107
  - - ! '>='
@@ -109,10 +109,10 @@ dependencies:
109
109
  version: '0'
110
110
  type: :runtime
111
111
  prerelease: false
112
- version_requirements: *70301996178900
112
+ version_requirements: *70305062223120
113
113
  - !ruby/object:Gem::Dependency
114
114
  name: namespaced_redis
115
- requirement: &70301996178200 !ruby/object:Gem::Requirement
115
+ requirement: &70305062222400 !ruby/object:Gem::Requirement
116
116
  none: false
117
117
  requirements:
118
118
  - - ! '>='
@@ -120,7 +120,7 @@ dependencies:
120
120
  version: 1.0.2
121
121
  type: :runtime
122
122
  prerelease: false
123
- version_requirements: *70301996178200
123
+ version_requirements: *70305062222400
124
124
  description: Cobweb is a web crawler that can use resque to cluster crawls to quickly
125
125
  crawl extremely large sites which is much more performant than multi-threaded crawlers. It
126
126
  is also a standalone crawler that has a sophisticated statistics monitoring interface