cobweb 1.0.1 → 1.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.textile +2 -2
- data/lib/cobweb.rb +6 -4
- data/lib/cobweb_crawler.rb +16 -7
- data/lib/cobweb_version.rb +1 -1
- data/lib/robots.rb +2 -2
- data/spec/cobweb/cobweb_spec.rb +3 -3
- data/spec/cobweb/robots_spec.rb +1 -1
- data/spec/spec_helper.rb +3 -3
- metadata +24 -24
data/README.textile
CHANGED
data/lib/cobweb.rb
CHANGED
@@ -38,6 +38,7 @@ class Cobweb
|
|
38
38
|
default_quiet_to true
|
39
39
|
default_debug_to false
|
40
40
|
default_cache_to 300
|
41
|
+
default_cache_type_to :crawl_based # other option is :full
|
41
42
|
default_timeout_to 10
|
42
43
|
default_redis_options_to Hash.new
|
43
44
|
default_internal_urls_to []
|
@@ -114,11 +115,12 @@ class Cobweb
|
|
114
115
|
else
|
115
116
|
redis = Redis::Namespace.new("cobweb-#{Cobweb.version}", :redis => Redis.new(@options[:redis_options]))
|
116
117
|
end
|
118
|
+
full_redis = Redis::Namespace.new("cobweb-#{Cobweb.version}", :redis => Redis.new(@options[:redis_options]))
|
117
119
|
|
118
120
|
content = {:base_url => url}
|
119
121
|
|
120
122
|
# check if it has already been cached
|
121
|
-
if redis.get(unique_id)
|
123
|
+
if ((@options[:cache_type] == :crawl_based && redis.get(unique_id)) || (@options[:cache_type] == :full && full_redis.get(unique_id))) && @options[:cache]
|
122
124
|
puts "Cache hit for #{url}" unless @options[:quiet]
|
123
125
|
content = HashUtil.deep_symbolize_keys(Marshal.load(redis.get(unique_id)))
|
124
126
|
else
|
@@ -138,9 +140,9 @@ class Cobweb
|
|
138
140
|
begin
|
139
141
|
print "Retrieving #{url }... " unless @options[:quiet]
|
140
142
|
request_options={}
|
141
|
-
|
142
|
-
|
143
|
-
|
143
|
+
request_options['Cookie']= options[:cookies] if options.has_key?(:cookies)
|
144
|
+
request_options['User-Agent']= options[:user_agent] if options.has_key?(:user_agent)
|
145
|
+
|
144
146
|
request = Net::HTTP::Get.new uri.request_uri, request_options
|
145
147
|
|
146
148
|
response = @http.request request
|
data/lib/cobweb_crawler.rb
CHANGED
@@ -23,6 +23,9 @@ class CobwebCrawler
|
|
23
23
|
@redis = Redis::Namespace.new("cobweb-#{Cobweb.version}-#{@crawl_id}", :redis => Redis.new(@options[:redis_options]))
|
24
24
|
@options[:internal_urls] = [] if @options[:internal_urls].nil?
|
25
25
|
@options[:internal_urls].map{|url| @redis.sadd("internal_urls", url)}
|
26
|
+
|
27
|
+
@options[:crawl_linked_external] = false unless @options.has_key? :crawl_linked_external
|
28
|
+
|
26
29
|
@debug = @options[:debug]
|
27
30
|
|
28
31
|
@stats = Stats.new(@options.merge(:crawl_id => @crawl_id))
|
@@ -52,12 +55,13 @@ class CobwebCrawler
|
|
52
55
|
thread = Thread.new do
|
53
56
|
|
54
57
|
url = @redis.spop "queued"
|
55
|
-
|
58
|
+
queue_counter = 0 if url.nil?
|
59
|
+
|
56
60
|
@options[:url] = url
|
57
61
|
unless @redis.sismember("crawled", url.to_s)
|
58
62
|
begin
|
59
63
|
@stats.update_status("Requesting #{url}...")
|
60
|
-
content = @cobweb.get(url)
|
64
|
+
content = @cobweb.get(url) unless url.nil?
|
61
65
|
if content.nil?
|
62
66
|
queue_counter = queue_counter - 1 #@redis.scard("queued").to_i
|
63
67
|
else
|
@@ -70,8 +74,9 @@ class CobwebCrawler
|
|
70
74
|
|
71
75
|
# select the link if its internal (eliminate external before expensive lookups in queued and crawled)
|
72
76
|
cobweb_links = CobwebLinks.new(@options)
|
73
|
-
|
74
|
-
|
77
|
+
|
78
|
+
internal_links = internal_links.select{|link| cobweb_links.internal?(link) || (@options[:crawl_linked_external] && cobweb_links.internal?(url.to_s))}
|
79
|
+
|
75
80
|
# reject the link if we've crawled it or queued it
|
76
81
|
internal_links.reject!{|link| @redis.sismember("crawled", link)}
|
77
82
|
internal_links.reject!{|link| @redis.sismember("queued", link)}
|
@@ -95,9 +100,13 @@ class CobwebCrawler
|
|
95
100
|
yield content, @stats.get_statistics if block_given?
|
96
101
|
end
|
97
102
|
rescue => e
|
98
|
-
puts "
|
99
|
-
|
100
|
-
ap e
|
103
|
+
puts "Error loading #{url}: #{e}"
|
104
|
+
#puts "!!!!!!!!!!!! ERROR !!!!!!!!!!!!!!!!"
|
105
|
+
#ap e
|
106
|
+
#ap e.backtrace
|
107
|
+
ensure
|
108
|
+
crawl_counter = @redis.scard("crawled").to_i
|
109
|
+
queue_counter = @redis.scard("queued").to_i
|
101
110
|
end
|
102
111
|
else
|
103
112
|
puts "Already crawled #{@options[:url]}" if @debug
|
data/lib/cobweb_version.rb
CHANGED
data/lib/robots.rb
CHANGED
@@ -8,12 +8,12 @@ class Robots
|
|
8
8
|
raise ":url is required" unless @options.has_key? :url
|
9
9
|
@options[:file] = "robots.txt" unless @options.has_key? :file
|
10
10
|
@options[:user_agent] = "cobweb" unless @options.has_key? :user_agent
|
11
|
-
|
11
|
+
|
12
12
|
uri = URI.parse(@options[:url])
|
13
13
|
content = Cobweb.new(:cache => nil, :text_mime_types => ["text/html", "application/xhtml+xml", "text/plain"]).get([uri.scheme, "://", uri.host, ":", uri.port, "/", @options[:file]].join)
|
14
14
|
if content[:mime_type][0..4] == "text/"
|
15
15
|
@raw_data = parse_data(content[:body])
|
16
|
-
|
16
|
+
|
17
17
|
if @options.has_key?(:user_agent) && @raw_data.has_key?(@options[:user_agent].to_s.downcase.to_sym)
|
18
18
|
@params = @raw_data[@options[:user_agent].to_s.downcase.to_sym]
|
19
19
|
else
|
data/spec/cobweb/cobweb_spec.rb
CHANGED
@@ -186,7 +186,7 @@ describe Cobweb do
|
|
186
186
|
describe "location setting" do
|
187
187
|
it "Get should strip fragments" do
|
188
188
|
Net::HTTP.should_receive(:new).with("www.google.com", 80)
|
189
|
-
Net::HTTP::Get.should_receive(:new).with("/", {})
|
189
|
+
Net::HTTP::Get.should_receive(:new).with("/", {"User-Agent"=>"cobweb/#{CobwebVersion.version} (ruby/#{RUBY_VERSION} nokogiri/1.5.0)"})
|
190
190
|
@cobweb.get("http://www.google.com/#ignore")
|
191
191
|
end
|
192
192
|
it "head should strip fragments" do
|
@@ -196,12 +196,12 @@ describe Cobweb do
|
|
196
196
|
end
|
197
197
|
it "get should not strip path" do
|
198
198
|
Net::HTTP.should_receive(:new).with("www.google.com", 80)
|
199
|
-
Net::HTTP::Get.should_receive(:new).with("/path/to/stuff", {})
|
199
|
+
Net::HTTP::Get.should_receive(:new).with("/path/to/stuff", {"User-Agent"=>"cobweb/#{CobwebVersion.version} (ruby/#{RUBY_VERSION} nokogiri/1.5.0)"})
|
200
200
|
@cobweb.get("http://www.google.com/path/to/stuff#ignore")
|
201
201
|
end
|
202
202
|
it "get should not strip query string" do
|
203
203
|
Net::HTTP.should_receive(:new).with("www.google.com", 80)
|
204
|
-
Net::HTTP::Get.should_receive(:new).with("/path/to/stuff?query_string", {})
|
204
|
+
Net::HTTP::Get.should_receive(:new).with("/path/to/stuff?query_string", {"User-Agent"=>"cobweb/#{CobwebVersion.version} (ruby/#{RUBY_VERSION} nokogiri/1.5.0)"})
|
205
205
|
@cobweb.get("http://www.google.com/path/to/stuff?query_string#ignore")
|
206
206
|
end
|
207
207
|
end
|
data/spec/cobweb/robots_spec.rb
CHANGED
data/spec/spec_helper.rb
CHANGED
@@ -63,9 +63,9 @@ RSpec.configure do |config|
|
|
63
63
|
|
64
64
|
Net::HTTP.stub!(:new).and_return(@mock_http_client)
|
65
65
|
Net::HTTP::Get.stub!(:new).and_return(@mock_http_request)
|
66
|
-
Net::HTTP::Get.stub!(:new).with("/redirect.html", {}).and_return(@mock_http_redirect_request)
|
67
|
-
Net::HTTP::Get.stub!(:new).with("/robots.txt", {}).and_return(@mock_http_robot_request)
|
68
|
-
Net::HTTP::Get.stub!(:new).with("/redirect2.html", {}).and_return(@mock_http_redirect_request2)
|
66
|
+
Net::HTTP::Get.stub!(:new).with("/redirect.html", {"User-Agent"=>"cobweb/#{CobwebVersion.version} (ruby/#{RUBY_VERSION} nokogiri/1.5.0)"}).and_return(@mock_http_redirect_request)
|
67
|
+
Net::HTTP::Get.stub!(:new).with("/robots.txt", {"User-Agent"=>"cobweb/#{CobwebVersion.version} (ruby/#{RUBY_VERSION} nokogiri/1.5.0)"}).and_return(@mock_http_robot_request)
|
68
|
+
Net::HTTP::Get.stub!(:new).with("/redirect2.html", {"User-Agent"=>"cobweb/#{CobwebVersion.version} (ruby/#{RUBY_VERSION} nokogiri/1.5.0)"}).and_return(@mock_http_redirect_request2)
|
69
69
|
|
70
70
|
Net::HTTP::Head.stub!(:new).and_return(@mock_http_request)
|
71
71
|
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: cobweb
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.0.1
|
4
|
+
version: 1.0.2
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,11 +9,11 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2012-12-
|
12
|
+
date: 2012-12-28 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: resque
|
16
|
-
requirement: &
|
16
|
+
requirement: &70096521975680 !ruby/object:Gem::Requirement
|
17
17
|
none: false
|
18
18
|
requirements:
|
19
19
|
- - ! '>='
|
@@ -21,10 +21,10 @@ dependencies:
|
|
21
21
|
version: '0'
|
22
22
|
type: :runtime
|
23
23
|
prerelease: false
|
24
|
-
version_requirements: *
|
24
|
+
version_requirements: *70096521975680
|
25
25
|
- !ruby/object:Gem::Dependency
|
26
26
|
name: redis
|
27
|
-
requirement: &
|
27
|
+
requirement: &70096521974080 !ruby/object:Gem::Requirement
|
28
28
|
none: false
|
29
29
|
requirements:
|
30
30
|
- - ! '>='
|
@@ -32,10 +32,10 @@ dependencies:
|
|
32
32
|
version: '0'
|
33
33
|
type: :runtime
|
34
34
|
prerelease: false
|
35
|
-
version_requirements: *
|
35
|
+
version_requirements: *70096521974080
|
36
36
|
- !ruby/object:Gem::Dependency
|
37
37
|
name: nokogiri
|
38
|
-
requirement: &
|
38
|
+
requirement: &70096521972700 !ruby/object:Gem::Requirement
|
39
39
|
none: false
|
40
40
|
requirements:
|
41
41
|
- - ! '>='
|
@@ -43,10 +43,10 @@ dependencies:
|
|
43
43
|
version: '0'
|
44
44
|
type: :runtime
|
45
45
|
prerelease: false
|
46
|
-
version_requirements: *
|
46
|
+
version_requirements: *70096521972700
|
47
47
|
- !ruby/object:Gem::Dependency
|
48
48
|
name: addressable
|
49
|
-
requirement: &
|
49
|
+
requirement: &70096521971740 !ruby/object:Gem::Requirement
|
50
50
|
none: false
|
51
51
|
requirements:
|
52
52
|
- - ! '>='
|
@@ -54,10 +54,10 @@ dependencies:
|
|
54
54
|
version: '0'
|
55
55
|
type: :runtime
|
56
56
|
prerelease: false
|
57
|
-
version_requirements: *
|
57
|
+
version_requirements: *70096521971740
|
58
58
|
- !ruby/object:Gem::Dependency
|
59
59
|
name: rspec
|
60
|
-
requirement: &
|
60
|
+
requirement: &70096521971220 !ruby/object:Gem::Requirement
|
61
61
|
none: false
|
62
62
|
requirements:
|
63
63
|
- - ! '>='
|
@@ -65,10 +65,10 @@ dependencies:
|
|
65
65
|
version: '0'
|
66
66
|
type: :runtime
|
67
67
|
prerelease: false
|
68
|
-
version_requirements: *
|
68
|
+
version_requirements: *70096521971220
|
69
69
|
- !ruby/object:Gem::Dependency
|
70
70
|
name: awesome_print
|
71
|
-
requirement: &
|
71
|
+
requirement: &70096521970620 !ruby/object:Gem::Requirement
|
72
72
|
none: false
|
73
73
|
requirements:
|
74
74
|
- - ! '>='
|
@@ -76,10 +76,10 @@ dependencies:
|
|
76
76
|
version: '0'
|
77
77
|
type: :runtime
|
78
78
|
prerelease: false
|
79
|
-
version_requirements: *
|
79
|
+
version_requirements: *70096521970620
|
80
80
|
- !ruby/object:Gem::Dependency
|
81
81
|
name: sinatra
|
82
|
-
requirement: &
|
82
|
+
requirement: &70096521969800 !ruby/object:Gem::Requirement
|
83
83
|
none: false
|
84
84
|
requirements:
|
85
85
|
- - ! '>='
|
@@ -87,10 +87,10 @@ dependencies:
|
|
87
87
|
version: '0'
|
88
88
|
type: :runtime
|
89
89
|
prerelease: false
|
90
|
-
version_requirements: *
|
90
|
+
version_requirements: *70096521969800
|
91
91
|
- !ruby/object:Gem::Dependency
|
92
92
|
name: thin
|
93
|
-
requirement: &
|
93
|
+
requirement: &70096521969240 !ruby/object:Gem::Requirement
|
94
94
|
none: false
|
95
95
|
requirements:
|
96
96
|
- - ! '>='
|
@@ -98,10 +98,10 @@ dependencies:
|
|
98
98
|
version: '0'
|
99
99
|
type: :runtime
|
100
100
|
prerelease: false
|
101
|
-
version_requirements: *
|
101
|
+
version_requirements: *70096521969240
|
102
102
|
- !ruby/object:Gem::Dependency
|
103
103
|
name: haml
|
104
|
-
requirement: &
|
104
|
+
requirement: &70096521968800 !ruby/object:Gem::Requirement
|
105
105
|
none: false
|
106
106
|
requirements:
|
107
107
|
- - ! '>='
|
@@ -109,10 +109,10 @@ dependencies:
|
|
109
109
|
version: '0'
|
110
110
|
type: :runtime
|
111
111
|
prerelease: false
|
112
|
-
version_requirements: *
|
112
|
+
version_requirements: *70096521968800
|
113
113
|
- !ruby/object:Gem::Dependency
|
114
114
|
name: namespaced_redis
|
115
|
-
requirement: &
|
115
|
+
requirement: &70096521968060 !ruby/object:Gem::Requirement
|
116
116
|
none: false
|
117
117
|
requirements:
|
118
118
|
- - ! '>='
|
@@ -120,10 +120,10 @@ dependencies:
|
|
120
120
|
version: 1.0.2
|
121
121
|
type: :runtime
|
122
122
|
prerelease: false
|
123
|
-
version_requirements: *
|
123
|
+
version_requirements: *70096521968060
|
124
124
|
- !ruby/object:Gem::Dependency
|
125
125
|
name: json
|
126
|
-
requirement: &
|
126
|
+
requirement: &70096521967360 !ruby/object:Gem::Requirement
|
127
127
|
none: false
|
128
128
|
requirements:
|
129
129
|
- - ! '>='
|
@@ -131,7 +131,7 @@ dependencies:
|
|
131
131
|
version: '0'
|
132
132
|
type: :runtime
|
133
133
|
prerelease: false
|
134
|
-
version_requirements: *
|
134
|
+
version_requirements: *70096521967360
|
135
135
|
description: Cobweb is a web crawler that can use resque to cluster crawls to quickly
|
136
136
|
crawl extremely large sites which is much more performant than multi-threaded crawlers. It
|
137
137
|
is also a standalone crawler that has a sophisticated statistics monitoring interface
|