cobweb 1.0.1 → 1.0.2
Sign up to get free protection for your applications and to get access to all the features.
- data/README.textile +2 -2
- data/lib/cobweb.rb +6 -4
- data/lib/cobweb_crawler.rb +16 -7
- data/lib/cobweb_version.rb +1 -1
- data/lib/robots.rb +2 -2
- data/spec/cobweb/cobweb_spec.rb +3 -3
- data/spec/cobweb/robots_spec.rb +1 -1
- data/spec/spec_helper.rb +3 -3
- metadata +24 -24
data/README.textile
CHANGED
data/lib/cobweb.rb
CHANGED
@@ -38,6 +38,7 @@ class Cobweb
|
|
38
38
|
default_quiet_to true
|
39
39
|
default_debug_to false
|
40
40
|
default_cache_to 300
|
41
|
+
default_cache_type_to :crawl_based # other option is :full
|
41
42
|
default_timeout_to 10
|
42
43
|
default_redis_options_to Hash.new
|
43
44
|
default_internal_urls_to []
|
@@ -114,11 +115,12 @@ class Cobweb
|
|
114
115
|
else
|
115
116
|
redis = Redis::Namespace.new("cobweb-#{Cobweb.version}", :redis => Redis.new(@options[:redis_options]))
|
116
117
|
end
|
118
|
+
full_redis = Redis::Namespace.new("cobweb-#{Cobweb.version}", :redis => Redis.new(@options[:redis_options]))
|
117
119
|
|
118
120
|
content = {:base_url => url}
|
119
121
|
|
120
122
|
# check if it has already been cached
|
121
|
-
if redis.get(unique_id)
|
123
|
+
if ((@options[:cache_type] == :crawl_based && redis.get(unique_id)) || (@options[:cache_type] == :full && full_redis.get(unique_id))) && @options[:cache]
|
122
124
|
puts "Cache hit for #{url}" unless @options[:quiet]
|
123
125
|
content = HashUtil.deep_symbolize_keys(Marshal.load(redis.get(unique_id)))
|
124
126
|
else
|
@@ -138,9 +140,9 @@ class Cobweb
|
|
138
140
|
begin
|
139
141
|
print "Retrieving #{url }... " unless @options[:quiet]
|
140
142
|
request_options={}
|
141
|
-
|
142
|
-
|
143
|
-
|
143
|
+
request_options['Cookie']= options[:cookies] if options.has_key?(:cookies)
|
144
|
+
request_options['User-Agent']= options[:user_agent] if options.has_key?(:user_agent)
|
145
|
+
|
144
146
|
request = Net::HTTP::Get.new uri.request_uri, request_options
|
145
147
|
|
146
148
|
response = @http.request request
|
data/lib/cobweb_crawler.rb
CHANGED
@@ -23,6 +23,9 @@ class CobwebCrawler
|
|
23
23
|
@redis = Redis::Namespace.new("cobweb-#{Cobweb.version}-#{@crawl_id}", :redis => Redis.new(@options[:redis_options]))
|
24
24
|
@options[:internal_urls] = [] if @options[:internal_urls].nil?
|
25
25
|
@options[:internal_urls].map{|url| @redis.sadd("internal_urls", url)}
|
26
|
+
|
27
|
+
@options[:crawl_linked_external] = false unless @options.has_key? :crawl_linked_external
|
28
|
+
|
26
29
|
@debug = @options[:debug]
|
27
30
|
|
28
31
|
@stats = Stats.new(@options.merge(:crawl_id => @crawl_id))
|
@@ -52,12 +55,13 @@ class CobwebCrawler
|
|
52
55
|
thread = Thread.new do
|
53
56
|
|
54
57
|
url = @redis.spop "queued"
|
55
|
-
|
58
|
+
queue_counter = 0 if url.nil?
|
59
|
+
|
56
60
|
@options[:url] = url
|
57
61
|
unless @redis.sismember("crawled", url.to_s)
|
58
62
|
begin
|
59
63
|
@stats.update_status("Requesting #{url}...")
|
60
|
-
content = @cobweb.get(url)
|
64
|
+
content = @cobweb.get(url) unless url.nil?
|
61
65
|
if content.nil?
|
62
66
|
queue_counter = queue_counter - 1 #@redis.scard("queued").to_i
|
63
67
|
else
|
@@ -70,8 +74,9 @@ class CobwebCrawler
|
|
70
74
|
|
71
75
|
# select the link if its internal (eliminate external before expensive lookups in queued and crawled)
|
72
76
|
cobweb_links = CobwebLinks.new(@options)
|
73
|
-
|
74
|
-
|
77
|
+
|
78
|
+
internal_links = internal_links.select{|link| cobweb_links.internal?(link) || (@options[:crawl_linked_external] && cobweb_links.internal?(url.to_s))}
|
79
|
+
|
75
80
|
# reject the link if we've crawled it or queued it
|
76
81
|
internal_links.reject!{|link| @redis.sismember("crawled", link)}
|
77
82
|
internal_links.reject!{|link| @redis.sismember("queued", link)}
|
@@ -95,9 +100,13 @@ class CobwebCrawler
|
|
95
100
|
yield content, @stats.get_statistics if block_given?
|
96
101
|
end
|
97
102
|
rescue => e
|
98
|
-
puts "
|
99
|
-
|
100
|
-
ap e
|
103
|
+
puts "Error loading #{url}: #{e}"
|
104
|
+
#puts "!!!!!!!!!!!! ERROR !!!!!!!!!!!!!!!!"
|
105
|
+
#ap e
|
106
|
+
#ap e.backtrace
|
107
|
+
ensure
|
108
|
+
crawl_counter = @redis.scard("crawled").to_i
|
109
|
+
queue_counter = @redis.scard("queued").to_i
|
101
110
|
end
|
102
111
|
else
|
103
112
|
puts "Already crawled #{@options[:url]}" if @debug
|
data/lib/cobweb_version.rb
CHANGED
data/lib/robots.rb
CHANGED
@@ -8,12 +8,12 @@ class Robots
|
|
8
8
|
raise ":url is required" unless @options.has_key? :url
|
9
9
|
@options[:file] = "robots.txt" unless @options.has_key? :file
|
10
10
|
@options[:user_agent] = "cobweb" unless @options.has_key? :user_agent
|
11
|
-
|
11
|
+
|
12
12
|
uri = URI.parse(@options[:url])
|
13
13
|
content = Cobweb.new(:cache => nil, :text_mime_types => ["text/html", "application/xhtml+xml", "text/plain"]).get([uri.scheme, "://", uri.host, ":", uri.port, "/", @options[:file]].join)
|
14
14
|
if content[:mime_type][0..4] == "text/"
|
15
15
|
@raw_data = parse_data(content[:body])
|
16
|
-
|
16
|
+
|
17
17
|
if @options.has_key?(:user_agent) && @raw_data.has_key?(@options[:user_agent].to_s.downcase.to_sym)
|
18
18
|
@params = @raw_data[@options[:user_agent].to_s.downcase.to_sym]
|
19
19
|
else
|
data/spec/cobweb/cobweb_spec.rb
CHANGED
@@ -186,7 +186,7 @@ describe Cobweb do
|
|
186
186
|
describe "location setting" do
|
187
187
|
it "Get should strip fragments" do
|
188
188
|
Net::HTTP.should_receive(:new).with("www.google.com", 80)
|
189
|
-
Net::HTTP::Get.should_receive(:new).with("/", {})
|
189
|
+
Net::HTTP::Get.should_receive(:new).with("/", {"User-Agent"=>"cobweb/#{CobwebVersion.version} (ruby/#{RUBY_VERSION} nokogiri/1.5.0)"})
|
190
190
|
@cobweb.get("http://www.google.com/#ignore")
|
191
191
|
end
|
192
192
|
it "head should strip fragments" do
|
@@ -196,12 +196,12 @@ describe Cobweb do
|
|
196
196
|
end
|
197
197
|
it "get should not strip path" do
|
198
198
|
Net::HTTP.should_receive(:new).with("www.google.com", 80)
|
199
|
-
Net::HTTP::Get.should_receive(:new).with("/path/to/stuff", {})
|
199
|
+
Net::HTTP::Get.should_receive(:new).with("/path/to/stuff", {"User-Agent"=>"cobweb/#{CobwebVersion.version} (ruby/#{RUBY_VERSION} nokogiri/1.5.0)"})
|
200
200
|
@cobweb.get("http://www.google.com/path/to/stuff#ignore")
|
201
201
|
end
|
202
202
|
it "get should not strip query string" do
|
203
203
|
Net::HTTP.should_receive(:new).with("www.google.com", 80)
|
204
|
-
Net::HTTP::Get.should_receive(:new).with("/path/to/stuff?query_string", {})
|
204
|
+
Net::HTTP::Get.should_receive(:new).with("/path/to/stuff?query_string", {"User-Agent"=>"cobweb/#{CobwebVersion.version} (ruby/#{RUBY_VERSION} nokogiri/1.5.0)"})
|
205
205
|
@cobweb.get("http://www.google.com/path/to/stuff?query_string#ignore")
|
206
206
|
end
|
207
207
|
end
|
data/spec/cobweb/robots_spec.rb
CHANGED
data/spec/spec_helper.rb
CHANGED
@@ -63,9 +63,9 @@ RSpec.configure do |config|
|
|
63
63
|
|
64
64
|
Net::HTTP.stub!(:new).and_return(@mock_http_client)
|
65
65
|
Net::HTTP::Get.stub!(:new).and_return(@mock_http_request)
|
66
|
-
Net::HTTP::Get.stub!(:new).with("/redirect.html", {}).and_return(@mock_http_redirect_request)
|
67
|
-
Net::HTTP::Get.stub!(:new).with("/robots.txt", {}).and_return(@mock_http_robot_request)
|
68
|
-
Net::HTTP::Get.stub!(:new).with("/redirect2.html", {}).and_return(@mock_http_redirect_request2)
|
66
|
+
Net::HTTP::Get.stub!(:new).with("/redirect.html", {"User-Agent"=>"cobweb/#{CobwebVersion.version} (ruby/#{RUBY_VERSION} nokogiri/1.5.0)"}).and_return(@mock_http_redirect_request)
|
67
|
+
Net::HTTP::Get.stub!(:new).with("/robots.txt", {"User-Agent"=>"cobweb/#{CobwebVersion.version} (ruby/#{RUBY_VERSION} nokogiri/1.5.0)"}).and_return(@mock_http_robot_request)
|
68
|
+
Net::HTTP::Get.stub!(:new).with("/redirect2.html", {"User-Agent"=>"cobweb/#{CobwebVersion.version} (ruby/#{RUBY_VERSION} nokogiri/1.5.0)"}).and_return(@mock_http_redirect_request2)
|
69
69
|
|
70
70
|
Net::HTTP::Head.stub!(:new).and_return(@mock_http_request)
|
71
71
|
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: cobweb
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.0.1
|
4
|
+
version: 1.0.2
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,11 +9,11 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2012-12-
|
12
|
+
date: 2012-12-28 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: resque
|
16
|
-
requirement: &
|
16
|
+
requirement: &70096521975680 !ruby/object:Gem::Requirement
|
17
17
|
none: false
|
18
18
|
requirements:
|
19
19
|
- - ! '>='
|
@@ -21,10 +21,10 @@ dependencies:
|
|
21
21
|
version: '0'
|
22
22
|
type: :runtime
|
23
23
|
prerelease: false
|
24
|
-
version_requirements: *
|
24
|
+
version_requirements: *70096521975680
|
25
25
|
- !ruby/object:Gem::Dependency
|
26
26
|
name: redis
|
27
|
-
requirement: &
|
27
|
+
requirement: &70096521974080 !ruby/object:Gem::Requirement
|
28
28
|
none: false
|
29
29
|
requirements:
|
30
30
|
- - ! '>='
|
@@ -32,10 +32,10 @@ dependencies:
|
|
32
32
|
version: '0'
|
33
33
|
type: :runtime
|
34
34
|
prerelease: false
|
35
|
-
version_requirements: *
|
35
|
+
version_requirements: *70096521974080
|
36
36
|
- !ruby/object:Gem::Dependency
|
37
37
|
name: nokogiri
|
38
|
-
requirement: &
|
38
|
+
requirement: &70096521972700 !ruby/object:Gem::Requirement
|
39
39
|
none: false
|
40
40
|
requirements:
|
41
41
|
- - ! '>='
|
@@ -43,10 +43,10 @@ dependencies:
|
|
43
43
|
version: '0'
|
44
44
|
type: :runtime
|
45
45
|
prerelease: false
|
46
|
-
version_requirements: *
|
46
|
+
version_requirements: *70096521972700
|
47
47
|
- !ruby/object:Gem::Dependency
|
48
48
|
name: addressable
|
49
|
-
requirement: &
|
49
|
+
requirement: &70096521971740 !ruby/object:Gem::Requirement
|
50
50
|
none: false
|
51
51
|
requirements:
|
52
52
|
- - ! '>='
|
@@ -54,10 +54,10 @@ dependencies:
|
|
54
54
|
version: '0'
|
55
55
|
type: :runtime
|
56
56
|
prerelease: false
|
57
|
-
version_requirements: *
|
57
|
+
version_requirements: *70096521971740
|
58
58
|
- !ruby/object:Gem::Dependency
|
59
59
|
name: rspec
|
60
|
-
requirement: &
|
60
|
+
requirement: &70096521971220 !ruby/object:Gem::Requirement
|
61
61
|
none: false
|
62
62
|
requirements:
|
63
63
|
- - ! '>='
|
@@ -65,10 +65,10 @@ dependencies:
|
|
65
65
|
version: '0'
|
66
66
|
type: :runtime
|
67
67
|
prerelease: false
|
68
|
-
version_requirements: *
|
68
|
+
version_requirements: *70096521971220
|
69
69
|
- !ruby/object:Gem::Dependency
|
70
70
|
name: awesome_print
|
71
|
-
requirement: &
|
71
|
+
requirement: &70096521970620 !ruby/object:Gem::Requirement
|
72
72
|
none: false
|
73
73
|
requirements:
|
74
74
|
- - ! '>='
|
@@ -76,10 +76,10 @@ dependencies:
|
|
76
76
|
version: '0'
|
77
77
|
type: :runtime
|
78
78
|
prerelease: false
|
79
|
-
version_requirements: *
|
79
|
+
version_requirements: *70096521970620
|
80
80
|
- !ruby/object:Gem::Dependency
|
81
81
|
name: sinatra
|
82
|
-
requirement: &
|
82
|
+
requirement: &70096521969800 !ruby/object:Gem::Requirement
|
83
83
|
none: false
|
84
84
|
requirements:
|
85
85
|
- - ! '>='
|
@@ -87,10 +87,10 @@ dependencies:
|
|
87
87
|
version: '0'
|
88
88
|
type: :runtime
|
89
89
|
prerelease: false
|
90
|
-
version_requirements: *
|
90
|
+
version_requirements: *70096521969800
|
91
91
|
- !ruby/object:Gem::Dependency
|
92
92
|
name: thin
|
93
|
-
requirement: &
|
93
|
+
requirement: &70096521969240 !ruby/object:Gem::Requirement
|
94
94
|
none: false
|
95
95
|
requirements:
|
96
96
|
- - ! '>='
|
@@ -98,10 +98,10 @@ dependencies:
|
|
98
98
|
version: '0'
|
99
99
|
type: :runtime
|
100
100
|
prerelease: false
|
101
|
-
version_requirements: *
|
101
|
+
version_requirements: *70096521969240
|
102
102
|
- !ruby/object:Gem::Dependency
|
103
103
|
name: haml
|
104
|
-
requirement: &
|
104
|
+
requirement: &70096521968800 !ruby/object:Gem::Requirement
|
105
105
|
none: false
|
106
106
|
requirements:
|
107
107
|
- - ! '>='
|
@@ -109,10 +109,10 @@ dependencies:
|
|
109
109
|
version: '0'
|
110
110
|
type: :runtime
|
111
111
|
prerelease: false
|
112
|
-
version_requirements: *
|
112
|
+
version_requirements: *70096521968800
|
113
113
|
- !ruby/object:Gem::Dependency
|
114
114
|
name: namespaced_redis
|
115
|
-
requirement: &
|
115
|
+
requirement: &70096521968060 !ruby/object:Gem::Requirement
|
116
116
|
none: false
|
117
117
|
requirements:
|
118
118
|
- - ! '>='
|
@@ -120,10 +120,10 @@ dependencies:
|
|
120
120
|
version: 1.0.2
|
121
121
|
type: :runtime
|
122
122
|
prerelease: false
|
123
|
-
version_requirements: *
|
123
|
+
version_requirements: *70096521968060
|
124
124
|
- !ruby/object:Gem::Dependency
|
125
125
|
name: json
|
126
|
-
requirement: &
|
126
|
+
requirement: &70096521967360 !ruby/object:Gem::Requirement
|
127
127
|
none: false
|
128
128
|
requirements:
|
129
129
|
- - ! '>='
|
@@ -131,7 +131,7 @@ dependencies:
|
|
131
131
|
version: '0'
|
132
132
|
type: :runtime
|
133
133
|
prerelease: false
|
134
|
-
version_requirements: *
|
134
|
+
version_requirements: *70096521967360
|
135
135
|
description: Cobweb is a web crawler that can use resque to cluster crawls to quickly
|
136
136
|
crawl extremely large sites which is much more performant than multi-threaded crawlers. It
|
137
137
|
is also a standalone crawler that has a sophisticated statistics monitoring interface
|