cobweb 0.0.62 → 0.0.63

Sign up to get free protection for your applications and to get access to all the features.
data/README.textile CHANGED
@@ -1,5 +1,5 @@
1
1
 
2
- h1. Cobweb v0.0.62
2
+ h1. Cobweb v0.0.63
3
3
  !https://secure.travis-ci.org/stewartmckee/cobweb.png?branch=master!
4
4
 
5
5
  h2. Intro
data/lib/cobweb_links.rb CHANGED
@@ -28,27 +28,11 @@ class CobwebLinks
28
28
 
29
29
  # Returns true if the link is matched to an internal_url and not matched to an external_url
30
30
  def internal?(link)
31
- if @options[:debug]
32
- puts "--------------------------------"
33
- puts "Link: #{link}"
34
- puts "Internal matches"
35
- ap @internal_patterns.select{|pattern| link.match(pattern)}
36
- puts "External matches"
37
- ap @external_patterns.select{|pattern| link.match(pattern)}
38
- end
39
31
  !@internal_patterns.select{|pattern| link.match(pattern)}.empty? && @external_patterns.select{|pattern| link.match(pattern)}.empty?
40
32
  end
41
33
 
42
34
  # Returns true if the link is matched to an external_url or not matched to an internal_url
43
35
  def external?(link)
44
- if @options[:debug]
45
- puts "--------------------------------"
46
- puts "Link: #{link}"
47
- puts "Internal matches"
48
- ap @internal_patterns.select{|pattern| link.match(pattern)}
49
- puts "External matches"
50
- ap @external_patterns.select{|pattern| link.match(pattern)}
51
- end
52
36
  @internal_patterns.select{|pattern| link.match(pattern)}.empty? || !@external_patterns.select{|pattern| link.match(pattern)}.empty?
53
37
  end
54
38
 
@@ -3,7 +3,7 @@ class CobwebVersion
3
3
 
4
4
  # Returns a string of the current version
5
5
  def self.version
6
- "0.0.62"
6
+ "0.0.63"
7
7
  end
8
8
 
9
9
  end
data/lib/crawl_job.rb CHANGED
@@ -28,14 +28,14 @@ class CrawlJob
28
28
 
29
29
  # check we haven't crawled this url before
30
30
  unless @redis.sismember "crawled", content_request[:url]
31
- content = Cobweb.new(content_request).get(content_request[:url], content_request)
32
- if content_request[:url] == @redis.get("original_base_url")
33
- puts content
34
- @redis.set("crawled_base_url", content[:base_url])
35
- end
36
- if is_permitted_type(content)
37
- # if there is no limit or we're still under it lets get the url
38
- if within_crawl_limits?(content_request[:crawl_limit])
31
+ # if there is no limit or we're still under it lets get the url
32
+ if within_crawl_limits?(content_request[:crawl_limit])
33
+ content = Cobweb.new(content_request).get(content_request[:url], content_request)
34
+ if content_request[:url] == @redis.get("original_base_url")
35
+ puts content
36
+ @redis.set("crawled_base_url", content[:base_url])
37
+ end
38
+ if is_permitted_type(content)
39
39
  begin
40
40
  # move the url from the queued list to the crawled list - for both the original url, and the content url (to handle redirects)
41
41
  @redis.srem "queued", content_request[:url]
@@ -99,10 +99,10 @@ class CrawlJob
99
99
  puts "Crawled: #{@crawl_counter} Limit: #{content_request[:crawl_limit]} Queued: #{@queue_counter} In Progress: #{@crawl_started_counter-@crawl_counter}" if @debug
100
100
  end
101
101
  else
102
- puts "ignoring #{content_request[:url]} as outside of crawl limits." if content_request[:debug]
102
+ puts "ignoring #{content_request[:url]} as mime_type is #{content[:mime_type]}" if content_request[:debug]
103
103
  end
104
104
  else
105
- puts "ignoring #{content_request[:url]} as mime_type is #{content[:mime_type]}" if content_request[:debug]
105
+ puts "ignoring #{content_request[:url]} as outside of crawl limits." if content_request[:debug]
106
106
  end
107
107
 
108
108
  else
@@ -122,15 +122,15 @@ class CrawlJob
122
122
 
123
123
  end
124
124
 
125
- # Sets the crawl status to 'Crawl Stopped' and enqueues the crawl finished job
125
+ # Sets the crawl status to 'Crawl Finished' and enqueues the crawl finished job
126
126
  def self.finished(content_request)
127
127
  # finished
128
- if @redis.hget("statistics", "current_status")!= "Crawl Stopped"
128
+ if @redis.hget("statistics", "current_status")!= "Crawl Finished"
129
129
  ap "CRAWL FINISHED #{content_request[:url]}, #{counters}, #{@redis.get("original_base_url")}, #{@redis.get("crawled_base_url")}" if content_request[:debug]
130
130
  @stats.end_crawl(content_request)
131
131
  Resque.enqueue(const_get(content_request[:crawl_finished_queue]), @stats.get_statistics.merge({:redis_options => content_request[:redis_options], :crawl_id => content_request[:crawl_id], :source_id => content_request[:source_id], :crawled_base_url => @redis.get("crawled_base_url")}))
132
132
  else
133
- ap "CRAWL REFINISHED #{content_request[:url]}, #{counters}" if content_request[:debug]
133
+ # nothing to report here, we're skipping the remaining urls as we're outside of the crawl limit
134
134
  end
135
135
  end
136
136
 
data/lib/stats.rb CHANGED
@@ -23,7 +23,7 @@ class Stats
23
23
  # Removes the crawl from the running crawls and updates status
24
24
  def end_crawl(options)
25
25
  @full_redis.srem "cobweb_crawls", options[:crawl_id]
26
- @redis.hset "statistics", "current_status", "Crawl Stopped"
26
+ @redis.hset "statistics", "current_status", "Crawl Finished"
27
27
  @redis.del "crawl_details"
28
28
  end
29
29
 
@@ -200,7 +200,7 @@ def wait_for_crawl_finished(crawl_id, timeout=20)
200
200
  end
201
201
 
202
202
  def running?(crawl_id)
203
- @stat.get_status != "Crawl Stopped"
203
+ @stat.get_status != "Crawl Finished"
204
204
  end
205
205
 
206
206
  def clear_queues
data/views/layout.haml CHANGED
@@ -51,17 +51,17 @@
51
51
  %ul
52
52
  %li.home
53
53
  %a{:href => "#"}
54
- %li.break »
54
+ %li.break >
55
55
  %li
56
56
  %a{:href => "#"} Menu item
57
- %li.break »
57
+ %li.break >
58
58
  %li
59
59
  %a{:href => "#"} Menu item
60
60
  = yield
61
61
 
62
62
  .footer
63
63
  .split
64
- © Copyright
64
+ Copyright
65
65
  %a{:href => "http://www.activeinformationdesign.com"} activeinformationdesign.com
66
66
  .split.right
67
67
  Powered by
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: cobweb
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.62
4
+ version: 0.0.63
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,11 +9,11 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2012-07-27 00:00:00.000000000 Z
12
+ date: 2012-08-07 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: resque
16
- requirement: &70305062213520 !ruby/object:Gem::Requirement
16
+ requirement: &70236896384860 !ruby/object:Gem::Requirement
17
17
  none: false
18
18
  requirements:
19
19
  - - ! '>='
@@ -21,10 +21,10 @@ dependencies:
21
21
  version: '0'
22
22
  type: :runtime
23
23
  prerelease: false
24
- version_requirements: *70305062213520
24
+ version_requirements: *70236896384860
25
25
  - !ruby/object:Gem::Dependency
26
26
  name: redis
27
- requirement: &70305062213040 !ruby/object:Gem::Requirement
27
+ requirement: &70236896384380 !ruby/object:Gem::Requirement
28
28
  none: false
29
29
  requirements:
30
30
  - - ! '>='
@@ -32,10 +32,10 @@ dependencies:
32
32
  version: '0'
33
33
  type: :runtime
34
34
  prerelease: false
35
- version_requirements: *70305062213040
35
+ version_requirements: *70236896384380
36
36
  - !ruby/object:Gem::Dependency
37
37
  name: nokogiri
38
- requirement: &70305062212160 !ruby/object:Gem::Requirement
38
+ requirement: &70236896383640 !ruby/object:Gem::Requirement
39
39
  none: false
40
40
  requirements:
41
41
  - - ! '>='
@@ -43,10 +43,10 @@ dependencies:
43
43
  version: '0'
44
44
  type: :runtime
45
45
  prerelease: false
46
- version_requirements: *70305062212160
46
+ version_requirements: *70236896383640
47
47
  - !ruby/object:Gem::Dependency
48
48
  name: addressable
49
- requirement: &70305062211220 !ruby/object:Gem::Requirement
49
+ requirement: &70236896382560 !ruby/object:Gem::Requirement
50
50
  none: false
51
51
  requirements:
52
52
  - - ! '>='
@@ -54,10 +54,10 @@ dependencies:
54
54
  version: '0'
55
55
  type: :runtime
56
56
  prerelease: false
57
- version_requirements: *70305062211220
57
+ version_requirements: *70236896382560
58
58
  - !ruby/object:Gem::Dependency
59
59
  name: rspec
60
- requirement: &70305062210040 !ruby/object:Gem::Requirement
60
+ requirement: &70236896381420 !ruby/object:Gem::Requirement
61
61
  none: false
62
62
  requirements:
63
63
  - - ! '>='
@@ -65,10 +65,10 @@ dependencies:
65
65
  version: '0'
66
66
  type: :runtime
67
67
  prerelease: false
68
- version_requirements: *70305062210040
68
+ version_requirements: *70236896381420
69
69
  - !ruby/object:Gem::Dependency
70
70
  name: awesome_print
71
- requirement: &70305062208860 !ruby/object:Gem::Requirement
71
+ requirement: &70236896380260 !ruby/object:Gem::Requirement
72
72
  none: false
73
73
  requirements:
74
74
  - - ! '>='
@@ -76,10 +76,10 @@ dependencies:
76
76
  version: '0'
77
77
  type: :runtime
78
78
  prerelease: false
79
- version_requirements: *70305062208860
79
+ version_requirements: *70236896380260
80
80
  - !ruby/object:Gem::Dependency
81
81
  name: sinatra
82
- requirement: &70305062224320 !ruby/object:Gem::Requirement
82
+ requirement: &70236896395760 !ruby/object:Gem::Requirement
83
83
  none: false
84
84
  requirements:
85
85
  - - ! '>='
@@ -87,10 +87,10 @@ dependencies:
87
87
  version: '0'
88
88
  type: :runtime
89
89
  prerelease: false
90
- version_requirements: *70305062224320
90
+ version_requirements: *70236896395760
91
91
  - !ruby/object:Gem::Dependency
92
92
  name: thin
93
- requirement: &70305062223780 !ruby/object:Gem::Requirement
93
+ requirement: &70236896395220 !ruby/object:Gem::Requirement
94
94
  none: false
95
95
  requirements:
96
96
  - - ! '>='
@@ -98,10 +98,10 @@ dependencies:
98
98
  version: '0'
99
99
  type: :runtime
100
100
  prerelease: false
101
- version_requirements: *70305062223780
101
+ version_requirements: *70236896395220
102
102
  - !ruby/object:Gem::Dependency
103
103
  name: haml
104
- requirement: &70305062223120 !ruby/object:Gem::Requirement
104
+ requirement: &70236896394540 !ruby/object:Gem::Requirement
105
105
  none: false
106
106
  requirements:
107
107
  - - ! '>='
@@ -109,10 +109,10 @@ dependencies:
109
109
  version: '0'
110
110
  type: :runtime
111
111
  prerelease: false
112
- version_requirements: *70305062223120
112
+ version_requirements: *70236896394540
113
113
  - !ruby/object:Gem::Dependency
114
114
  name: namespaced_redis
115
- requirement: &70305062222400 !ruby/object:Gem::Requirement
115
+ requirement: &70236896393840 !ruby/object:Gem::Requirement
116
116
  none: false
117
117
  requirements:
118
118
  - - ! '>='
@@ -120,7 +120,7 @@ dependencies:
120
120
  version: 1.0.2
121
121
  type: :runtime
122
122
  prerelease: false
123
- version_requirements: *70305062222400
123
+ version_requirements: *70236896393840
124
124
  description: Cobweb is a web crawler that can use resque to cluster crawls to quickly
125
125
  crawl extremely large sites which is much more performant than multi-threaded crawlers. It
126
126
  is also a standalone crawler that has a sophisticated statistics monitoring interface