cobweb 0.0.62 → 0.0.63

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/README.textile CHANGED
@@ -1,5 +1,5 @@
1
1
 
2
- h1. Cobweb v0.0.62
2
+ h1. Cobweb v0.0.63
3
3
  !https://secure.travis-ci.org/stewartmckee/cobweb.png?branch=master!
4
4
 
5
5
  h2. Intro
data/lib/cobweb_links.rb CHANGED
@@ -28,27 +28,11 @@ class CobwebLinks
28
28
 
29
29
  # Returns true if the link is matched to an internal_url and not matched to an external_url
30
30
  def internal?(link)
31
- if @options[:debug]
32
- puts "--------------------------------"
33
- puts "Link: #{link}"
34
- puts "Internal matches"
35
- ap @internal_patterns.select{|pattern| link.match(pattern)}
36
- puts "External matches"
37
- ap @external_patterns.select{|pattern| link.match(pattern)}
38
- end
39
31
  !@internal_patterns.select{|pattern| link.match(pattern)}.empty? && @external_patterns.select{|pattern| link.match(pattern)}.empty?
40
32
  end
41
33
 
42
34
  # Returns true if the link is matched to an external_url or not matched to an internal_url
43
35
  def external?(link)
44
- if @options[:debug]
45
- puts "--------------------------------"
46
- puts "Link: #{link}"
47
- puts "Internal matches"
48
- ap @internal_patterns.select{|pattern| link.match(pattern)}
49
- puts "External matches"
50
- ap @external_patterns.select{|pattern| link.match(pattern)}
51
- end
52
36
  @internal_patterns.select{|pattern| link.match(pattern)}.empty? || !@external_patterns.select{|pattern| link.match(pattern)}.empty?
53
37
  end
54
38
 
@@ -3,7 +3,7 @@ class CobwebVersion
3
3
 
4
4
  # Returns a string of the current version
5
5
  def self.version
6
- "0.0.62"
6
+ "0.0.63"
7
7
  end
8
8
 
9
9
  end
data/lib/crawl_job.rb CHANGED
@@ -28,14 +28,14 @@ class CrawlJob
28
28
 
29
29
  # check we haven't crawled this url before
30
30
  unless @redis.sismember "crawled", content_request[:url]
31
- content = Cobweb.new(content_request).get(content_request[:url], content_request)
32
- if content_request[:url] == @redis.get("original_base_url")
33
- puts content
34
- @redis.set("crawled_base_url", content[:base_url])
35
- end
36
- if is_permitted_type(content)
37
- # if there is no limit or we're still under it lets get the url
38
- if within_crawl_limits?(content_request[:crawl_limit])
31
+ # if there is no limit or we're still under it lets get the url
32
+ if within_crawl_limits?(content_request[:crawl_limit])
33
+ content = Cobweb.new(content_request).get(content_request[:url], content_request)
34
+ if content_request[:url] == @redis.get("original_base_url")
35
+ puts content
36
+ @redis.set("crawled_base_url", content[:base_url])
37
+ end
38
+ if is_permitted_type(content)
39
39
  begin
40
40
  # move the url from the queued list to the crawled list - for both the original url, and the content url (to handle redirects)
41
41
  @redis.srem "queued", content_request[:url]
@@ -99,10 +99,10 @@ class CrawlJob
99
99
  puts "Crawled: #{@crawl_counter} Limit: #{content_request[:crawl_limit]} Queued: #{@queue_counter} In Progress: #{@crawl_started_counter-@crawl_counter}" if @debug
100
100
  end
101
101
  else
102
- puts "ignoring #{content_request[:url]} as outside of crawl limits." if content_request[:debug]
102
+ puts "ignoring #{content_request[:url]} as mime_type is #{content[:mime_type]}" if content_request[:debug]
103
103
  end
104
104
  else
105
- puts "ignoring #{content_request[:url]} as mime_type is #{content[:mime_type]}" if content_request[:debug]
105
+ puts "ignoring #{content_request[:url]} as outside of crawl limits." if content_request[:debug]
106
106
  end
107
107
 
108
108
  else
@@ -122,15 +122,15 @@ class CrawlJob
122
122
 
123
123
  end
124
124
 
125
- # Sets the crawl status to 'Crawl Stopped' and enqueues the crawl finished job
125
+ # Sets the crawl status to 'Crawl Finished' and enqueues the crawl finished job
126
126
  def self.finished(content_request)
127
127
  # finished
128
- if @redis.hget("statistics", "current_status")!= "Crawl Stopped"
128
+ if @redis.hget("statistics", "current_status")!= "Crawl Finished"
129
129
  ap "CRAWL FINISHED #{content_request[:url]}, #{counters}, #{@redis.get("original_base_url")}, #{@redis.get("crawled_base_url")}" if content_request[:debug]
130
130
  @stats.end_crawl(content_request)
131
131
  Resque.enqueue(const_get(content_request[:crawl_finished_queue]), @stats.get_statistics.merge({:redis_options => content_request[:redis_options], :crawl_id => content_request[:crawl_id], :source_id => content_request[:source_id], :crawled_base_url => @redis.get("crawled_base_url")}))
132
132
  else
133
- ap "CRAWL REFINISHED #{content_request[:url]}, #{counters}" if content_request[:debug]
133
+ # nothing to report here, we're skipping the remaining urls as we're outside of the crawl limit
134
134
  end
135
135
  end
136
136
 
data/lib/stats.rb CHANGED
@@ -23,7 +23,7 @@ class Stats
23
23
  # Removes the crawl from the running crawls and updates status
24
24
  def end_crawl(options)
25
25
  @full_redis.srem "cobweb_crawls", options[:crawl_id]
26
- @redis.hset "statistics", "current_status", "Crawl Stopped"
26
+ @redis.hset "statistics", "current_status", "Crawl Finished"
27
27
  @redis.del "crawl_details"
28
28
  end
29
29
 
@@ -200,7 +200,7 @@ def wait_for_crawl_finished(crawl_id, timeout=20)
200
200
  end
201
201
 
202
202
  def running?(crawl_id)
203
- @stat.get_status != "Crawl Stopped"
203
+ @stat.get_status != "Crawl Finished"
204
204
  end
205
205
 
206
206
  def clear_queues
data/views/layout.haml CHANGED
@@ -51,17 +51,17 @@
51
51
  %ul
52
52
  %li.home
53
53
  %a{:href => "#"}
54
- %li.break »
54
+ %li.break >
55
55
  %li
56
56
  %a{:href => "#"} Menu item
57
- %li.break »
57
+ %li.break >
58
58
  %li
59
59
  %a{:href => "#"} Menu item
60
60
  = yield
61
61
 
62
62
  .footer
63
63
  .split
64
- © Copyright
64
+ Copyright
65
65
  %a{:href => "http://www.activeinformationdesign.com"} activeinformationdesign.com
66
66
  .split.right
67
67
  Powered by
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: cobweb
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.62
4
+ version: 0.0.63
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,11 +9,11 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2012-07-27 00:00:00.000000000 Z
12
+ date: 2012-08-07 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: resque
16
- requirement: &70305062213520 !ruby/object:Gem::Requirement
16
+ requirement: &70236896384860 !ruby/object:Gem::Requirement
17
17
  none: false
18
18
  requirements:
19
19
  - - ! '>='
@@ -21,10 +21,10 @@ dependencies:
21
21
  version: '0'
22
22
  type: :runtime
23
23
  prerelease: false
24
- version_requirements: *70305062213520
24
+ version_requirements: *70236896384860
25
25
  - !ruby/object:Gem::Dependency
26
26
  name: redis
27
- requirement: &70305062213040 !ruby/object:Gem::Requirement
27
+ requirement: &70236896384380 !ruby/object:Gem::Requirement
28
28
  none: false
29
29
  requirements:
30
30
  - - ! '>='
@@ -32,10 +32,10 @@ dependencies:
32
32
  version: '0'
33
33
  type: :runtime
34
34
  prerelease: false
35
- version_requirements: *70305062213040
35
+ version_requirements: *70236896384380
36
36
  - !ruby/object:Gem::Dependency
37
37
  name: nokogiri
38
- requirement: &70305062212160 !ruby/object:Gem::Requirement
38
+ requirement: &70236896383640 !ruby/object:Gem::Requirement
39
39
  none: false
40
40
  requirements:
41
41
  - - ! '>='
@@ -43,10 +43,10 @@ dependencies:
43
43
  version: '0'
44
44
  type: :runtime
45
45
  prerelease: false
46
- version_requirements: *70305062212160
46
+ version_requirements: *70236896383640
47
47
  - !ruby/object:Gem::Dependency
48
48
  name: addressable
49
- requirement: &70305062211220 !ruby/object:Gem::Requirement
49
+ requirement: &70236896382560 !ruby/object:Gem::Requirement
50
50
  none: false
51
51
  requirements:
52
52
  - - ! '>='
@@ -54,10 +54,10 @@ dependencies:
54
54
  version: '0'
55
55
  type: :runtime
56
56
  prerelease: false
57
- version_requirements: *70305062211220
57
+ version_requirements: *70236896382560
58
58
  - !ruby/object:Gem::Dependency
59
59
  name: rspec
60
- requirement: &70305062210040 !ruby/object:Gem::Requirement
60
+ requirement: &70236896381420 !ruby/object:Gem::Requirement
61
61
  none: false
62
62
  requirements:
63
63
  - - ! '>='
@@ -65,10 +65,10 @@ dependencies:
65
65
  version: '0'
66
66
  type: :runtime
67
67
  prerelease: false
68
- version_requirements: *70305062210040
68
+ version_requirements: *70236896381420
69
69
  - !ruby/object:Gem::Dependency
70
70
  name: awesome_print
71
- requirement: &70305062208860 !ruby/object:Gem::Requirement
71
+ requirement: &70236896380260 !ruby/object:Gem::Requirement
72
72
  none: false
73
73
  requirements:
74
74
  - - ! '>='
@@ -76,10 +76,10 @@ dependencies:
76
76
  version: '0'
77
77
  type: :runtime
78
78
  prerelease: false
79
- version_requirements: *70305062208860
79
+ version_requirements: *70236896380260
80
80
  - !ruby/object:Gem::Dependency
81
81
  name: sinatra
82
- requirement: &70305062224320 !ruby/object:Gem::Requirement
82
+ requirement: &70236896395760 !ruby/object:Gem::Requirement
83
83
  none: false
84
84
  requirements:
85
85
  - - ! '>='
@@ -87,10 +87,10 @@ dependencies:
87
87
  version: '0'
88
88
  type: :runtime
89
89
  prerelease: false
90
- version_requirements: *70305062224320
90
+ version_requirements: *70236896395760
91
91
  - !ruby/object:Gem::Dependency
92
92
  name: thin
93
- requirement: &70305062223780 !ruby/object:Gem::Requirement
93
+ requirement: &70236896395220 !ruby/object:Gem::Requirement
94
94
  none: false
95
95
  requirements:
96
96
  - - ! '>='
@@ -98,10 +98,10 @@ dependencies:
98
98
  version: '0'
99
99
  type: :runtime
100
100
  prerelease: false
101
- version_requirements: *70305062223780
101
+ version_requirements: *70236896395220
102
102
  - !ruby/object:Gem::Dependency
103
103
  name: haml
104
- requirement: &70305062223120 !ruby/object:Gem::Requirement
104
+ requirement: &70236896394540 !ruby/object:Gem::Requirement
105
105
  none: false
106
106
  requirements:
107
107
  - - ! '>='
@@ -109,10 +109,10 @@ dependencies:
109
109
  version: '0'
110
110
  type: :runtime
111
111
  prerelease: false
112
- version_requirements: *70305062223120
112
+ version_requirements: *70236896394540
113
113
  - !ruby/object:Gem::Dependency
114
114
  name: namespaced_redis
115
- requirement: &70305062222400 !ruby/object:Gem::Requirement
115
+ requirement: &70236896393840 !ruby/object:Gem::Requirement
116
116
  none: false
117
117
  requirements:
118
118
  - - ! '>='
@@ -120,7 +120,7 @@ dependencies:
120
120
  version: 1.0.2
121
121
  type: :runtime
122
122
  prerelease: false
123
- version_requirements: *70305062222400
123
+ version_requirements: *70236896393840
124
124
  description: Cobweb is a web crawler that can use resque to cluster crawls to quickly
125
125
  crawl extremely large sites which is much more performant than multi-threaded crawlers. It
126
126
  is also a standalone crawler that has a sophisticated statistics monitoring interface