cobweb 0.0.63 → 0.0.64

Sign up to get free protection for your applications and to get access to all the features.
data/README.textile CHANGED
@@ -1,11 +1,13 @@
1
1
 
2
- h1. Cobweb v0.0.63
2
+ h1. Cobweb v0.0.64
3
3
  !https://secure.travis-ci.org/stewartmckee/cobweb.png?branch=master!
4
4
 
5
5
  h2. Intro
6
6
 
7
7
  CobWeb has three methods of running. Firstly, it is an HTTP client that allows GET and HEAD requests, returning a hash of data relating to the requested resource. The second main function is to utilize this combined with the power of Resque to cluster the crawls, allowing you to crawl quickly. Lastly, you can run the crawler with a block that uses each of the pages found in the crawl.
8
8
 
9
+ I've created a sample app to help with setting up cobweb at http://github.com/stewartmckee/cobweb_sample
10
+
9
11
  h3. Resque
10
12
 
11
13
  When running on Resque, pass in a class and queue name and it will enqueue all resources to this queue for processing, passing in the hash it has generated. You then implement the perform method to process the resource for your own application.
data/lib/cobweb.rb CHANGED
@@ -34,8 +34,8 @@ class Cobweb
34
34
  default_use_encoding_safe_process_job_to false
35
35
  default_follow_redirects_to true
36
36
  default_redirect_limit_to 10
37
- default_processing_queue_to CobwebProcessJob
38
- default_crawl_finished_queue_to CobwebFinishedJob
37
+ default_processing_queue_to "CobwebProcessJob"
38
+ default_crawl_finished_queue_to "CobwebFinishedJob"
39
39
  default_quiet_to true
40
40
  default_debug_to false
41
41
  default_cache_to 300
@@ -64,7 +64,6 @@ class CobwebCrawler
64
64
  @redis.incr "crawl-counter"
65
65
 
66
66
  internal_links = ContentLinkParser.new(url, content[:body]).all_links(:valid_schemes => [:http, :https])
67
- ap internal_links
68
67
 
69
68
  # select the link if it's internal (eliminate external before expensive lookups in queued and crawled)
70
69
  cobweb_links = CobwebLinks.new(@options)
@@ -3,7 +3,7 @@ class CobwebVersion
3
3
 
4
4
  # Returns a string of the current version
5
5
  def self.version
6
- "0.0.63"
6
+ "0.0.64"
7
7
  end
8
8
 
9
9
  end
data/lib/crawl_job.rb CHANGED
@@ -32,7 +32,6 @@ class CrawlJob
32
32
  if within_crawl_limits?(content_request[:crawl_limit])
33
33
  content = Cobweb.new(content_request).get(content_request[:url], content_request)
34
34
  if content_request[:url] == @redis.get("original_base_url")
35
- puts content
36
35
  @redis.set("crawled_base_url", content[:base_url])
37
36
  end
38
37
  if is_permitted_type(content)
@@ -60,7 +59,7 @@ class CrawlJob
60
59
 
61
60
  @cobweb_links = CobwebLinks.new(content_request)
62
61
  if within_queue_limits?(content_request[:crawl_limit])
63
- internal_links = ContentLinkParser.new(content_request[:url], content[:body]).all_links(:valid_schemes => [:http, :https])
62
+ internal_links = ContentLinkParser.new(content_request[:url], content[:body], content_request).all_links(:valid_schemes => [:http, :https])
64
63
 
65
64
  # select the link if it's internal
66
65
  internal_links.select! { |link| @cobweb_links.internal?(link) }
@@ -128,7 +127,12 @@ class CrawlJob
128
127
  if @redis.hget("statistics", "current_status")!= "Crawl Finished"
129
128
  ap "CRAWL FINISHED #{content_request[:url]}, #{counters}, #{@redis.get("original_base_url")}, #{@redis.get("crawled_base_url")}" if content_request[:debug]
130
129
  @stats.end_crawl(content_request)
131
- Resque.enqueue(const_get(content_request[:crawl_finished_queue]), @stats.get_statistics.merge({:redis_options => content_request[:redis_options], :crawl_id => content_request[:crawl_id], :source_id => content_request[:source_id], :crawled_base_url => @redis.get("crawled_base_url")}))
130
+
131
+ additional_stats = {:crawl_id => content_request[:crawl_id], :crawled_base_url => @redis.get("crawled_base_url")}
132
+ additional_stats[:redis_options] = content_request[:redis_options] unless content_request[:redis_options] == {}
133
+ additional_stats[:source_id] = content_request[:source_id] unless content_request[:source_id].nil?
134
+
135
+ Resque.enqueue(const_get(content_request[:crawl_finished_queue]), @stats.get_statistics.merge(additional_stats))
132
136
  else
133
137
  # nothing to report here, we're skipping the remaining urls as we're outside of the crawl limit
134
138
  end
data/lib/stats.rb CHANGED
@@ -14,7 +14,7 @@ class Stats
14
14
  unless @full_redis.sismember "cobweb_crawls", options[:crawl_id]
15
15
  @full_redis.sadd "cobweb_crawls", options[:crawl_id]
16
16
  options.keys.each do |key|
17
- @redis.hset "crawl_details", key, options[key]
17
+ @redis.hset "crawl_details", key, options[key].to_s
18
18
  end
19
19
  end
20
20
  @redis.hset "statistics", "current_status", "Crawl Starting..."
@@ -3,7 +3,7 @@ require File.expand_path(File.dirname(__FILE__) + '/../spec_helper')
3
3
  describe Cobweb, :local_only => true do
4
4
 
5
5
  before(:all) do
6
- #store all existing resque process ids
6
+ #store all existing resque process ids so we don't kill them afterwards
7
7
  @existing_processes = `ps aux | grep resque | grep -v grep | grep -v resque-web | awk '{print $2}'`.split("\n")
8
8
 
9
9
  # START WORKERS ONLY FOR CRAWL QUEUE SO WE CAN COUNT ENQUEUED PROCESS AND FINISH QUEUES
@@ -18,8 +18,8 @@ describe Cobweb do
18
18
 
19
19
  options[:follow_redirects].should == true
20
20
  options[:redirect_limit].should == 10
21
- options[:processing_queue].should == CobwebProcessJob
22
- options[:crawl_finished_queue].should == CobwebFinishedJob
21
+ options[:processing_queue].should == "CobwebProcessJob"
22
+ options[:crawl_finished_queue].should == "CobwebFinishedJob"
23
23
  options[:quiet].should == true
24
24
  options[:debug].should == false
25
25
  options[:cache].should == 300
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: cobweb
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.63
4
+ version: 0.0.64
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,11 +9,11 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2012-08-07 00:00:00.000000000 Z
12
+ date: 2012-08-20 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: resque
16
- requirement: &70236896384860 !ruby/object:Gem::Requirement
16
+ requirement: &70160920762100 !ruby/object:Gem::Requirement
17
17
  none: false
18
18
  requirements:
19
19
  - - ! '>='
@@ -21,10 +21,10 @@ dependencies:
21
21
  version: '0'
22
22
  type: :runtime
23
23
  prerelease: false
24
- version_requirements: *70236896384860
24
+ version_requirements: *70160920762100
25
25
  - !ruby/object:Gem::Dependency
26
26
  name: redis
27
- requirement: &70236896384380 !ruby/object:Gem::Requirement
27
+ requirement: &70160920761600 !ruby/object:Gem::Requirement
28
28
  none: false
29
29
  requirements:
30
30
  - - ! '>='
@@ -32,10 +32,10 @@ dependencies:
32
32
  version: '0'
33
33
  type: :runtime
34
34
  prerelease: false
35
- version_requirements: *70236896384380
35
+ version_requirements: *70160920761600
36
36
  - !ruby/object:Gem::Dependency
37
37
  name: nokogiri
38
- requirement: &70236896383640 !ruby/object:Gem::Requirement
38
+ requirement: &70160920760740 !ruby/object:Gem::Requirement
39
39
  none: false
40
40
  requirements:
41
41
  - - ! '>='
@@ -43,10 +43,10 @@ dependencies:
43
43
  version: '0'
44
44
  type: :runtime
45
45
  prerelease: false
46
- version_requirements: *70236896383640
46
+ version_requirements: *70160920760740
47
47
  - !ruby/object:Gem::Dependency
48
48
  name: addressable
49
- requirement: &70236896382560 !ruby/object:Gem::Requirement
49
+ requirement: &70160920759800 !ruby/object:Gem::Requirement
50
50
  none: false
51
51
  requirements:
52
52
  - - ! '>='
@@ -54,10 +54,10 @@ dependencies:
54
54
  version: '0'
55
55
  type: :runtime
56
56
  prerelease: false
57
- version_requirements: *70236896382560
57
+ version_requirements: *70160920759800
58
58
  - !ruby/object:Gem::Dependency
59
59
  name: rspec
60
- requirement: &70236896381420 !ruby/object:Gem::Requirement
60
+ requirement: &70160920758660 !ruby/object:Gem::Requirement
61
61
  none: false
62
62
  requirements:
63
63
  - - ! '>='
@@ -65,10 +65,10 @@ dependencies:
65
65
  version: '0'
66
66
  type: :runtime
67
67
  prerelease: false
68
- version_requirements: *70236896381420
68
+ version_requirements: *70160920758660
69
69
  - !ruby/object:Gem::Dependency
70
70
  name: awesome_print
71
- requirement: &70236896380260 !ruby/object:Gem::Requirement
71
+ requirement: &70160920773700 !ruby/object:Gem::Requirement
72
72
  none: false
73
73
  requirements:
74
74
  - - ! '>='
@@ -76,10 +76,10 @@ dependencies:
76
76
  version: '0'
77
77
  type: :runtime
78
78
  prerelease: false
79
- version_requirements: *70236896380260
79
+ version_requirements: *70160920773700
80
80
  - !ruby/object:Gem::Dependency
81
81
  name: sinatra
82
- requirement: &70236896395760 !ruby/object:Gem::Requirement
82
+ requirement: &70160920772960 !ruby/object:Gem::Requirement
83
83
  none: false
84
84
  requirements:
85
85
  - - ! '>='
@@ -87,10 +87,10 @@ dependencies:
87
87
  version: '0'
88
88
  type: :runtime
89
89
  prerelease: false
90
- version_requirements: *70236896395760
90
+ version_requirements: *70160920772960
91
91
  - !ruby/object:Gem::Dependency
92
92
  name: thin
93
- requirement: &70236896395220 !ruby/object:Gem::Requirement
93
+ requirement: &70160920772440 !ruby/object:Gem::Requirement
94
94
  none: false
95
95
  requirements:
96
96
  - - ! '>='
@@ -98,10 +98,10 @@ dependencies:
98
98
  version: '0'
99
99
  type: :runtime
100
100
  prerelease: false
101
- version_requirements: *70236896395220
101
+ version_requirements: *70160920772440
102
102
  - !ruby/object:Gem::Dependency
103
103
  name: haml
104
- requirement: &70236896394540 !ruby/object:Gem::Requirement
104
+ requirement: &70160920771760 !ruby/object:Gem::Requirement
105
105
  none: false
106
106
  requirements:
107
107
  - - ! '>='
@@ -109,10 +109,10 @@ dependencies:
109
109
  version: '0'
110
110
  type: :runtime
111
111
  prerelease: false
112
- version_requirements: *70236896394540
112
+ version_requirements: *70160920771760
113
113
  - !ruby/object:Gem::Dependency
114
114
  name: namespaced_redis
115
- requirement: &70236896393840 !ruby/object:Gem::Requirement
115
+ requirement: &70160920771040 !ruby/object:Gem::Requirement
116
116
  none: false
117
117
  requirements:
118
118
  - - ! '>='
@@ -120,7 +120,7 @@ dependencies:
120
120
  version: 1.0.2
121
121
  type: :runtime
122
122
  prerelease: false
123
- version_requirements: *70236896393840
123
+ version_requirements: *70160920771040
124
124
  description: Cobweb is a web crawler that can use resque to cluster crawls to quickly
125
125
  crawl extremely large sites which is much more performant than multi-threaded crawlers. It
126
126
  is also a standalone crawler that has a sophisticated statistics monitoring interface