cobweb 0.0.63 → 0.0.64
Sign up to get free protection for your applications and to get access to all the features.
- data/README.textile +3 -1
- data/lib/cobweb.rb +2 -2
- data/lib/cobweb_crawler.rb +0 -1
- data/lib/cobweb_version.rb +1 -1
- data/lib/crawl_job.rb +7 -3
- data/lib/stats.rb +1 -1
- data/spec/cobweb/cobweb_job_spec.rb +1 -1
- data/spec/cobweb/cobweb_spec.rb +2 -2
- metadata +22 -22
data/README.textile
CHANGED
@@ -1,11 +1,13 @@
|
|
1
1
|
|
2
|
-
h1. Cobweb v0.0.63
|
2
|
+
h1. Cobweb v0.0.64
|
3
3
|
!https://secure.travis-ci.org/stewartmckee/cobweb.png?branch=master!
|
4
4
|
|
5
5
|
h2. Intro
|
6
6
|
|
7
7
|
CobWeb has three methods of running. Firstly it is a http client that allows get and head requests returning a hash of data relating to the requested resource. The second main function is to utilize this combined with the power of Resque to cluster the crawls allowing you to crawl quickly. Lastly you can run the crawler with a block that uses each of the pages found in the crawl.
|
8
8
|
|
9
|
+
I've created a sample app to help with setting up cobweb at http://github.com/stewartmckee/cobweb_sample
|
10
|
+
|
9
11
|
h3. Resque
|
10
12
|
|
11
13
|
When running on resque, passing in a Class and queue name it will enqueue all resources to this queue for processing, passing in the hash it has generated. You then implement the perform method to process the resource for your own application.
|
data/lib/cobweb.rb
CHANGED
@@ -34,8 +34,8 @@ class Cobweb
|
|
34
34
|
default_use_encoding_safe_process_job_to false
|
35
35
|
default_follow_redirects_to true
|
36
36
|
default_redirect_limit_to 10
|
37
|
-
default_processing_queue_to CobwebProcessJob
|
38
|
-
default_crawl_finished_queue_to CobwebFinishedJob
|
37
|
+
default_processing_queue_to "CobwebProcessJob"
|
38
|
+
default_crawl_finished_queue_to "CobwebFinishedJob"
|
39
39
|
default_quiet_to true
|
40
40
|
default_debug_to false
|
41
41
|
default_cache_to 300
|
data/lib/cobweb_crawler.rb
CHANGED
@@ -64,7 +64,6 @@ class CobwebCrawler
|
|
64
64
|
@redis.incr "crawl-counter"
|
65
65
|
|
66
66
|
internal_links = ContentLinkParser.new(url, content[:body]).all_links(:valid_schemes => [:http, :https])
|
67
|
-
ap internal_links
|
68
67
|
|
69
68
|
# select the link if its internal (eliminate external before expensive lookups in queued and crawled)
|
70
69
|
cobweb_links = CobwebLinks.new(@options)
|
data/lib/cobweb_version.rb
CHANGED
data/lib/crawl_job.rb
CHANGED
@@ -32,7 +32,6 @@ class CrawlJob
|
|
32
32
|
if within_crawl_limits?(content_request[:crawl_limit])
|
33
33
|
content = Cobweb.new(content_request).get(content_request[:url], content_request)
|
34
34
|
if content_request[:url] == @redis.get("original_base_url")
|
35
|
-
puts content
|
36
35
|
@redis.set("crawled_base_url", content[:base_url])
|
37
36
|
end
|
38
37
|
if is_permitted_type(content)
|
@@ -60,7 +59,7 @@ class CrawlJob
|
|
60
59
|
|
61
60
|
@cobweb_links = CobwebLinks.new(content_request)
|
62
61
|
if within_queue_limits?(content_request[:crawl_limit])
|
63
|
-
internal_links = ContentLinkParser.new(content_request[:url], content[:body]).all_links(:valid_schemes => [:http, :https])
|
62
|
+
internal_links = ContentLinkParser.new(content_request[:url], content[:body], content_request).all_links(:valid_schemes => [:http, :https])
|
64
63
|
|
65
64
|
# select the link if its internal
|
66
65
|
internal_links.select! { |link| @cobweb_links.internal?(link) }
|
@@ -128,7 +127,12 @@ class CrawlJob
|
|
128
127
|
if @redis.hget("statistics", "current_status")!= "Crawl Finished"
|
129
128
|
ap "CRAWL FINISHED #{content_request[:url]}, #{counters}, #{@redis.get("original_base_url")}, #{@redis.get("crawled_base_url")}" if content_request[:debug]
|
130
129
|
@stats.end_crawl(content_request)
|
131
|
-
|
130
|
+
|
131
|
+
additional_stats = {:crawl_id => content_request[:crawl_id], :crawled_base_url => @redis.get("crawled_base_url")}
|
132
|
+
additional_stats[:redis_options] = content_request[:redis_options] unless content_request[:redis_options] == {}
|
133
|
+
additional_stats[:source_id] = content_request[:source_id] unless content_request[:source_id].nil?
|
134
|
+
|
135
|
+
Resque.enqueue(const_get(content_request[:crawl_finished_queue]), @stats.get_statistics.merge(additional_stats))
|
132
136
|
else
|
133
137
|
# nothing to report here, we're skipping the remaining urls as we're outside of the crawl limit
|
134
138
|
end
|
data/lib/stats.rb
CHANGED
@@ -14,7 +14,7 @@ class Stats
|
|
14
14
|
unless @full_redis.sismember "cobweb_crawls", options[:crawl_id]
|
15
15
|
@full_redis.sadd "cobweb_crawls", options[:crawl_id]
|
16
16
|
options.keys.each do |key|
|
17
|
-
@redis.hset "crawl_details", key, options[key]
|
17
|
+
@redis.hset "crawl_details", key, options[key].to_s
|
18
18
|
end
|
19
19
|
end
|
20
20
|
@redis.hset "statistics", "current_status", "Crawl Starting..."
|
@@ -3,7 +3,7 @@ require File.expand_path(File.dirname(__FILE__) + '/../spec_helper')
|
|
3
3
|
describe Cobweb, :local_only => true do
|
4
4
|
|
5
5
|
before(:all) do
|
6
|
-
#store all existing resque process ids
|
6
|
+
#store all existing resque process ids so we don't kill them afterwards
|
7
7
|
@existing_processes = `ps aux | grep resque | grep -v grep | grep -v resque-web | awk '{print $2}'`.split("\n")
|
8
8
|
|
9
9
|
# START WORKERS ONLY FOR CRAWL QUEUE SO WE CAN COUNT ENQUEUED PROCESS AND FINISH QUEUES
|
data/spec/cobweb/cobweb_spec.rb
CHANGED
@@ -18,8 +18,8 @@ describe Cobweb do
|
|
18
18
|
|
19
19
|
options[:follow_redirects].should == true
|
20
20
|
options[:redirect_limit].should == 10
|
21
|
-
options[:processing_queue].should == CobwebProcessJob
|
22
|
-
options[:crawl_finished_queue].should == CobwebFinishedJob
|
21
|
+
options[:processing_queue].should == "CobwebProcessJob"
|
22
|
+
options[:crawl_finished_queue].should == "CobwebFinishedJob"
|
23
23
|
options[:quiet].should == true
|
24
24
|
options[:debug].should == false
|
25
25
|
options[:cache].should == 300
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: cobweb
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.63
|
4
|
+
version: 0.0.64
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,11 +9,11 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2012-08-
|
12
|
+
date: 2012-08-20 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: resque
|
16
|
-
requirement: &
|
16
|
+
requirement: &70160920762100 !ruby/object:Gem::Requirement
|
17
17
|
none: false
|
18
18
|
requirements:
|
19
19
|
- - ! '>='
|
@@ -21,10 +21,10 @@ dependencies:
|
|
21
21
|
version: '0'
|
22
22
|
type: :runtime
|
23
23
|
prerelease: false
|
24
|
-
version_requirements: *
|
24
|
+
version_requirements: *70160920762100
|
25
25
|
- !ruby/object:Gem::Dependency
|
26
26
|
name: redis
|
27
|
-
requirement: &
|
27
|
+
requirement: &70160920761600 !ruby/object:Gem::Requirement
|
28
28
|
none: false
|
29
29
|
requirements:
|
30
30
|
- - ! '>='
|
@@ -32,10 +32,10 @@ dependencies:
|
|
32
32
|
version: '0'
|
33
33
|
type: :runtime
|
34
34
|
prerelease: false
|
35
|
-
version_requirements: *
|
35
|
+
version_requirements: *70160920761600
|
36
36
|
- !ruby/object:Gem::Dependency
|
37
37
|
name: nokogiri
|
38
|
-
requirement: &
|
38
|
+
requirement: &70160920760740 !ruby/object:Gem::Requirement
|
39
39
|
none: false
|
40
40
|
requirements:
|
41
41
|
- - ! '>='
|
@@ -43,10 +43,10 @@ dependencies:
|
|
43
43
|
version: '0'
|
44
44
|
type: :runtime
|
45
45
|
prerelease: false
|
46
|
-
version_requirements: *
|
46
|
+
version_requirements: *70160920760740
|
47
47
|
- !ruby/object:Gem::Dependency
|
48
48
|
name: addressable
|
49
|
-
requirement: &
|
49
|
+
requirement: &70160920759800 !ruby/object:Gem::Requirement
|
50
50
|
none: false
|
51
51
|
requirements:
|
52
52
|
- - ! '>='
|
@@ -54,10 +54,10 @@ dependencies:
|
|
54
54
|
version: '0'
|
55
55
|
type: :runtime
|
56
56
|
prerelease: false
|
57
|
-
version_requirements: *
|
57
|
+
version_requirements: *70160920759800
|
58
58
|
- !ruby/object:Gem::Dependency
|
59
59
|
name: rspec
|
60
|
-
requirement: &
|
60
|
+
requirement: &70160920758660 !ruby/object:Gem::Requirement
|
61
61
|
none: false
|
62
62
|
requirements:
|
63
63
|
- - ! '>='
|
@@ -65,10 +65,10 @@ dependencies:
|
|
65
65
|
version: '0'
|
66
66
|
type: :runtime
|
67
67
|
prerelease: false
|
68
|
-
version_requirements: *
|
68
|
+
version_requirements: *70160920758660
|
69
69
|
- !ruby/object:Gem::Dependency
|
70
70
|
name: awesome_print
|
71
|
-
requirement: &
|
71
|
+
requirement: &70160920773700 !ruby/object:Gem::Requirement
|
72
72
|
none: false
|
73
73
|
requirements:
|
74
74
|
- - ! '>='
|
@@ -76,10 +76,10 @@ dependencies:
|
|
76
76
|
version: '0'
|
77
77
|
type: :runtime
|
78
78
|
prerelease: false
|
79
|
-
version_requirements: *
|
79
|
+
version_requirements: *70160920773700
|
80
80
|
- !ruby/object:Gem::Dependency
|
81
81
|
name: sinatra
|
82
|
-
requirement: &
|
82
|
+
requirement: &70160920772960 !ruby/object:Gem::Requirement
|
83
83
|
none: false
|
84
84
|
requirements:
|
85
85
|
- - ! '>='
|
@@ -87,10 +87,10 @@ dependencies:
|
|
87
87
|
version: '0'
|
88
88
|
type: :runtime
|
89
89
|
prerelease: false
|
90
|
-
version_requirements: *
|
90
|
+
version_requirements: *70160920772960
|
91
91
|
- !ruby/object:Gem::Dependency
|
92
92
|
name: thin
|
93
|
-
requirement: &
|
93
|
+
requirement: &70160920772440 !ruby/object:Gem::Requirement
|
94
94
|
none: false
|
95
95
|
requirements:
|
96
96
|
- - ! '>='
|
@@ -98,10 +98,10 @@ dependencies:
|
|
98
98
|
version: '0'
|
99
99
|
type: :runtime
|
100
100
|
prerelease: false
|
101
|
-
version_requirements: *
|
101
|
+
version_requirements: *70160920772440
|
102
102
|
- !ruby/object:Gem::Dependency
|
103
103
|
name: haml
|
104
|
-
requirement: &
|
104
|
+
requirement: &70160920771760 !ruby/object:Gem::Requirement
|
105
105
|
none: false
|
106
106
|
requirements:
|
107
107
|
- - ! '>='
|
@@ -109,10 +109,10 @@ dependencies:
|
|
109
109
|
version: '0'
|
110
110
|
type: :runtime
|
111
111
|
prerelease: false
|
112
|
-
version_requirements: *
|
112
|
+
version_requirements: *70160920771760
|
113
113
|
- !ruby/object:Gem::Dependency
|
114
114
|
name: namespaced_redis
|
115
|
-
requirement: &
|
115
|
+
requirement: &70160920771040 !ruby/object:Gem::Requirement
|
116
116
|
none: false
|
117
117
|
requirements:
|
118
118
|
- - ! '>='
|
@@ -120,7 +120,7 @@ dependencies:
|
|
120
120
|
version: 1.0.2
|
121
121
|
type: :runtime
|
122
122
|
prerelease: false
|
123
|
-
version_requirements: *
|
123
|
+
version_requirements: *70160920771040
|
124
124
|
description: Cobweb is a web crawler that can use resque to cluster crawls to quickly
|
125
125
|
crawl extremely large sites which is much more performant than multi-threaded crawlers. It
|
126
126
|
is also a standalone crawler that has a sophisticated statistics monitoring interface
|