cobweb 1.0.1 → 1.0.2

Sign up to get free protection for your applications and to get access to all the features.
data/README.textile CHANGED
@@ -1,10 +1,10 @@
1
1
 
2
- h1. Cobweb v1.0.1
2
+ h1. Cobweb v1.0.2
3
3
 
4
4
  "@cobweb_gem":https://twitter.com/cobweb_gem
5
5
 
6
6
  !https://secure.travis-ci.org/stewartmckee/cobweb.png?branch=master!
7
-
7
+ !https://gemnasium.com/stewartmckee/cobweb.png!
8
8
 
9
9
  h2. Intro
10
10
 
data/lib/cobweb.rb CHANGED
@@ -38,6 +38,7 @@ class Cobweb
38
38
  default_quiet_to true
39
39
  default_debug_to false
40
40
  default_cache_to 300
41
+ default_cache_type_to :crawl_based # other option is :full
41
42
  default_timeout_to 10
42
43
  default_redis_options_to Hash.new
43
44
  default_internal_urls_to []
@@ -114,11 +115,12 @@ class Cobweb
114
115
  else
115
116
  redis = Redis::Namespace.new("cobweb-#{Cobweb.version}", :redis => Redis.new(@options[:redis_options]))
116
117
  end
118
+ full_redis = Redis::Namespace.new("cobweb-#{Cobweb.version}", :redis => Redis.new(@options[:redis_options]))
117
119
 
118
120
  content = {:base_url => url}
119
121
 
120
122
  # check if it has already been cached
121
- if redis.get(unique_id) and @options[:cache]
123
+ if ((@options[:cache_type] == :crawl_based && redis.get(unique_id)) || (@options[:cache_type] == :full && full_redis.get(unique_id))) && @options[:cache]
122
124
  puts "Cache hit for #{url}" unless @options[:quiet]
123
125
  content = HashUtil.deep_symbolize_keys(Marshal.load(redis.get(unique_id)))
124
126
  else
@@ -138,9 +140,9 @@ class Cobweb
138
140
  begin
139
141
  print "Retrieving #{url }... " unless @options[:quiet]
140
142
  request_options={}
141
- if options[:cookies]
142
- request_options[ 'Cookie']= options[:cookies]
143
- end
143
+ request_options['Cookie']= options[:cookies] if options.has_key?(:cookies)
144
+ request_options['User-Agent']= options[:user_agent] if options.has_key?(:user_agent)
145
+
144
146
  request = Net::HTTP::Get.new uri.request_uri, request_options
145
147
 
146
148
  response = @http.request request
@@ -23,6 +23,9 @@ class CobwebCrawler
23
23
  @redis = Redis::Namespace.new("cobweb-#{Cobweb.version}-#{@crawl_id}", :redis => Redis.new(@options[:redis_options]))
24
24
  @options[:internal_urls] = [] if @options[:internal_urls].nil?
25
25
  @options[:internal_urls].map{|url| @redis.sadd("internal_urls", url)}
26
+
27
+ @options[:crawl_linked_external] = false unless @options.has_key? :crawl_linked_external
28
+
26
29
  @debug = @options[:debug]
27
30
 
28
31
  @stats = Stats.new(@options.merge(:crawl_id => @crawl_id))
@@ -52,12 +55,13 @@ class CobwebCrawler
52
55
  thread = Thread.new do
53
56
 
54
57
  url = @redis.spop "queued"
55
-
58
+ queue_counter = 0 if url.nil?
59
+
56
60
  @options[:url] = url
57
61
  unless @redis.sismember("crawled", url.to_s)
58
62
  begin
59
63
  @stats.update_status("Requesting #{url}...")
60
- content = @cobweb.get(url)
64
+ content = @cobweb.get(url) unless url.nil?
61
65
  if content.nil?
62
66
  queue_counter = queue_counter - 1 #@redis.scard("queued").to_i
63
67
  else
@@ -70,8 +74,9 @@ class CobwebCrawler
70
74
 
71
75
  # select the link if it's internal (eliminate external before expensive lookups in queued and crawled)
72
76
  cobweb_links = CobwebLinks.new(@options)
73
- internal_links = internal_links.select{|link| cobweb_links.internal?(link)}
74
-
77
+
78
+ internal_links = internal_links.select{|link| cobweb_links.internal?(link) || (@options[:crawl_linked_external] && cobweb_links.internal?(url.to_s))}
79
+
75
80
  # reject the link if we've crawled it or queued it
76
81
  internal_links.reject!{|link| @redis.sismember("crawled", link)}
77
82
  internal_links.reject!{|link| @redis.sismember("queued", link)}
@@ -95,9 +100,13 @@ class CobwebCrawler
95
100
  yield content, @stats.get_statistics if block_given?
96
101
  end
97
102
  rescue => e
98
- puts "!!!!!!!!!!!! ERROR !!!!!!!!!!!!!!!!"
99
- ap e
100
- ap e.backtrace
103
+ puts "Error loading #{url}: #{e}"
104
+ #puts "!!!!!!!!!!!! ERROR !!!!!!!!!!!!!!!!"
105
+ #ap e
106
+ #ap e.backtrace
107
+ ensure
108
+ crawl_counter = @redis.scard("crawled").to_i
109
+ queue_counter = @redis.scard("queued").to_i
101
110
  end
102
111
  else
103
112
  puts "Already crawled #{@options[:url]}" if @debug
@@ -3,7 +3,7 @@ class CobwebVersion
3
3
 
4
4
  # Returns a string of the current version
5
5
  def self.version
6
- "1.0.1"
6
+ "1.0.2"
7
7
  end
8
8
 
9
9
  end
data/lib/robots.rb CHANGED
@@ -8,12 +8,12 @@ class Robots
8
8
  raise ":url is required" unless @options.has_key? :url
9
9
  @options[:file] = "robots.txt" unless @options.has_key? :file
10
10
  @options[:user_agent] = "cobweb" unless @options.has_key? :user_agent
11
-
11
+
12
12
  uri = URI.parse(@options[:url])
13
13
  content = Cobweb.new(:cache => nil, :text_mime_types => ["text/html", "application/xhtml+xml", "text/plain"]).get([uri.scheme, "://", uri.host, ":", uri.port, "/", @options[:file]].join)
14
14
  if content[:mime_type][0..4] == "text/"
15
15
  @raw_data = parse_data(content[:body])
16
-
16
+
17
17
  if @options.has_key?(:user_agent) && @raw_data.has_key?(@options[:user_agent].to_s.downcase.to_sym)
18
18
  @params = @raw_data[@options[:user_agent].to_s.downcase.to_sym]
19
19
  else
@@ -186,7 +186,7 @@ describe Cobweb do
186
186
  describe "location setting" do
187
187
  it "Get should strip fragments" do
188
188
  Net::HTTP.should_receive(:new).with("www.google.com", 80)
189
- Net::HTTP::Get.should_receive(:new).with("/", {})
189
+ Net::HTTP::Get.should_receive(:new).with("/", {"User-Agent"=>"cobweb/#{CobwebVersion.version} (ruby/#{RUBY_VERSION} nokogiri/1.5.0)"})
190
190
  @cobweb.get("http://www.google.com/#ignore")
191
191
  end
192
192
  it "head should strip fragments" do
@@ -196,12 +196,12 @@ describe Cobweb do
196
196
  end
197
197
  it "get should not strip path" do
198
198
  Net::HTTP.should_receive(:new).with("www.google.com", 80)
199
- Net::HTTP::Get.should_receive(:new).with("/path/to/stuff", {})
199
+ Net::HTTP::Get.should_receive(:new).with("/path/to/stuff", {"User-Agent"=>"cobweb/#{CobwebVersion.version} (ruby/#{RUBY_VERSION} nokogiri/1.5.0)"})
200
200
  @cobweb.get("http://www.google.com/path/to/stuff#ignore")
201
201
  end
202
202
  it "get should not strip query string" do
203
203
  Net::HTTP.should_receive(:new).with("www.google.com", 80)
204
- Net::HTTP::Get.should_receive(:new).with("/path/to/stuff?query_string", {})
204
+ Net::HTTP::Get.should_receive(:new).with("/path/to/stuff?query_string", {"User-Agent"=>"cobweb/#{CobwebVersion.version} (ruby/#{RUBY_VERSION} nokogiri/1.5.0)"})
205
205
  @cobweb.get("http://www.google.com/path/to/stuff?query_string#ignore")
206
206
  end
207
207
  end
@@ -8,7 +8,7 @@ describe Robots do
8
8
 
9
9
  describe "default user-agent" do
10
10
  before(:each) do
11
- @options = {:url => "http://localhost/"}
11
+ @options = {:url => "http://localhost:3532/"}
12
12
  end
13
13
 
14
14
  it "should parse a valid robots.txt" do
data/spec/spec_helper.rb CHANGED
@@ -63,9 +63,9 @@ RSpec.configure do |config|
63
63
 
64
64
  Net::HTTP.stub!(:new).and_return(@mock_http_client)
65
65
  Net::HTTP::Get.stub!(:new).and_return(@mock_http_request)
66
- Net::HTTP::Get.stub!(:new).with("/redirect.html", {}).and_return(@mock_http_redirect_request)
67
- Net::HTTP::Get.stub!(:new).with("/robots.txt", {}).and_return(@mock_http_robot_request)
68
- Net::HTTP::Get.stub!(:new).with("/redirect2.html", {}).and_return(@mock_http_redirect_request2)
66
+ Net::HTTP::Get.stub!(:new).with("/redirect.html", {"User-Agent"=>"cobweb/#{CobwebVersion.version} (ruby/#{RUBY_VERSION} nokogiri/1.5.0)"}).and_return(@mock_http_redirect_request)
67
+ Net::HTTP::Get.stub!(:new).with("/robots.txt", {"User-Agent"=>"cobweb/#{CobwebVersion.version} (ruby/#{RUBY_VERSION} nokogiri/1.5.0)"}).and_return(@mock_http_robot_request)
68
+ Net::HTTP::Get.stub!(:new).with("/redirect2.html", {"User-Agent"=>"cobweb/#{CobwebVersion.version} (ruby/#{RUBY_VERSION} nokogiri/1.5.0)"}).and_return(@mock_http_redirect_request2)
69
69
 
70
70
  Net::HTTP::Head.stub!(:new).and_return(@mock_http_request)
71
71
 
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: cobweb
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.0.1
4
+ version: 1.0.2
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,11 +9,11 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2012-12-19 00:00:00.000000000 Z
12
+ date: 2012-12-28 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: resque
16
- requirement: &70296707613060 !ruby/object:Gem::Requirement
16
+ requirement: &70096521975680 !ruby/object:Gem::Requirement
17
17
  none: false
18
18
  requirements:
19
19
  - - ! '>='
@@ -21,10 +21,10 @@ dependencies:
21
21
  version: '0'
22
22
  type: :runtime
23
23
  prerelease: false
24
- version_requirements: *70296707613060
24
+ version_requirements: *70096521975680
25
25
  - !ruby/object:Gem::Dependency
26
26
  name: redis
27
- requirement: &70296707611440 !ruby/object:Gem::Requirement
27
+ requirement: &70096521974080 !ruby/object:Gem::Requirement
28
28
  none: false
29
29
  requirements:
30
30
  - - ! '>='
@@ -32,10 +32,10 @@ dependencies:
32
32
  version: '0'
33
33
  type: :runtime
34
34
  prerelease: false
35
- version_requirements: *70296707611440
35
+ version_requirements: *70096521974080
36
36
  - !ruby/object:Gem::Dependency
37
37
  name: nokogiri
38
- requirement: &70296707610120 !ruby/object:Gem::Requirement
38
+ requirement: &70096521972700 !ruby/object:Gem::Requirement
39
39
  none: false
40
40
  requirements:
41
41
  - - ! '>='
@@ -43,10 +43,10 @@ dependencies:
43
43
  version: '0'
44
44
  type: :runtime
45
45
  prerelease: false
46
- version_requirements: *70296707610120
46
+ version_requirements: *70096521972700
47
47
  - !ruby/object:Gem::Dependency
48
48
  name: addressable
49
- requirement: &70296707609180 !ruby/object:Gem::Requirement
49
+ requirement: &70096521971740 !ruby/object:Gem::Requirement
50
50
  none: false
51
51
  requirements:
52
52
  - - ! '>='
@@ -54,10 +54,10 @@ dependencies:
54
54
  version: '0'
55
55
  type: :runtime
56
56
  prerelease: false
57
- version_requirements: *70296707609180
57
+ version_requirements: *70096521971740
58
58
  - !ruby/object:Gem::Dependency
59
59
  name: rspec
60
- requirement: &70296707608660 !ruby/object:Gem::Requirement
60
+ requirement: &70096521971220 !ruby/object:Gem::Requirement
61
61
  none: false
62
62
  requirements:
63
63
  - - ! '>='
@@ -65,10 +65,10 @@ dependencies:
65
65
  version: '0'
66
66
  type: :runtime
67
67
  prerelease: false
68
- version_requirements: *70296707608660
68
+ version_requirements: *70096521971220
69
69
  - !ruby/object:Gem::Dependency
70
70
  name: awesome_print
71
- requirement: &70296707608040 !ruby/object:Gem::Requirement
71
+ requirement: &70096521970620 !ruby/object:Gem::Requirement
72
72
  none: false
73
73
  requirements:
74
74
  - - ! '>='
@@ -76,10 +76,10 @@ dependencies:
76
76
  version: '0'
77
77
  type: :runtime
78
78
  prerelease: false
79
- version_requirements: *70296707608040
79
+ version_requirements: *70096521970620
80
80
  - !ruby/object:Gem::Dependency
81
81
  name: sinatra
82
- requirement: &70296707607220 !ruby/object:Gem::Requirement
82
+ requirement: &70096521969800 !ruby/object:Gem::Requirement
83
83
  none: false
84
84
  requirements:
85
85
  - - ! '>='
@@ -87,10 +87,10 @@ dependencies:
87
87
  version: '0'
88
88
  type: :runtime
89
89
  prerelease: false
90
- version_requirements: *70296707607220
90
+ version_requirements: *70096521969800
91
91
  - !ruby/object:Gem::Dependency
92
92
  name: thin
93
- requirement: &70296707606660 !ruby/object:Gem::Requirement
93
+ requirement: &70096521969240 !ruby/object:Gem::Requirement
94
94
  none: false
95
95
  requirements:
96
96
  - - ! '>='
@@ -98,10 +98,10 @@ dependencies:
98
98
  version: '0'
99
99
  type: :runtime
100
100
  prerelease: false
101
- version_requirements: *70296707606660
101
+ version_requirements: *70096521969240
102
102
  - !ruby/object:Gem::Dependency
103
103
  name: haml
104
- requirement: &70296707606220 !ruby/object:Gem::Requirement
104
+ requirement: &70096521968800 !ruby/object:Gem::Requirement
105
105
  none: false
106
106
  requirements:
107
107
  - - ! '>='
@@ -109,10 +109,10 @@ dependencies:
109
109
  version: '0'
110
110
  type: :runtime
111
111
  prerelease: false
112
- version_requirements: *70296707606220
112
+ version_requirements: *70096521968800
113
113
  - !ruby/object:Gem::Dependency
114
114
  name: namespaced_redis
115
- requirement: &70296707605480 !ruby/object:Gem::Requirement
115
+ requirement: &70096521968060 !ruby/object:Gem::Requirement
116
116
  none: false
117
117
  requirements:
118
118
  - - ! '>='
@@ -120,10 +120,10 @@ dependencies:
120
120
  version: 1.0.2
121
121
  type: :runtime
122
122
  prerelease: false
123
- version_requirements: *70296707605480
123
+ version_requirements: *70096521968060
124
124
  - !ruby/object:Gem::Dependency
125
125
  name: json
126
- requirement: &70296707604780 !ruby/object:Gem::Requirement
126
+ requirement: &70096521967360 !ruby/object:Gem::Requirement
127
127
  none: false
128
128
  requirements:
129
129
  - - ! '>='
@@ -131,7 +131,7 @@ dependencies:
131
131
  version: '0'
132
132
  type: :runtime
133
133
  prerelease: false
134
- version_requirements: *70296707604780
134
+ version_requirements: *70096521967360
135
135
  description: Cobweb is a web crawler that can use resque to cluster crawls to quickly
136
136
  crawl extremely large sites which is much more performant than multi-threaded crawlers. It
137
137
  is also a standalone crawler that has a sophisticated statistics monitoring interface