cobweb 1.0.1 → 1.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/README.textile CHANGED
@@ -1,10 +1,10 @@
1
1
 
2
- h1. Cobweb v1.0.1
2
+ h1. Cobweb v1.0.2
3
3
 
4
4
  "@cobweb_gem":https://twitter.com/cobweb_gem
5
5
 
6
6
  !https://secure.travis-ci.org/stewartmckee/cobweb.png?branch=master!
7
-
7
+ !https://gemnasium.com/stewartmckee/cobweb.png!
8
8
 
9
9
  h2. Intro
10
10
 
data/lib/cobweb.rb CHANGED
@@ -38,6 +38,7 @@ class Cobweb
38
38
  default_quiet_to true
39
39
  default_debug_to false
40
40
  default_cache_to 300
41
+ default_cache_type_to :crawl_based # other option is :full
41
42
  default_timeout_to 10
42
43
  default_redis_options_to Hash.new
43
44
  default_internal_urls_to []
@@ -114,11 +115,12 @@ class Cobweb
114
115
  else
115
116
  redis = Redis::Namespace.new("cobweb-#{Cobweb.version}", :redis => Redis.new(@options[:redis_options]))
116
117
  end
118
+ full_redis = Redis::Namespace.new("cobweb-#{Cobweb.version}", :redis => Redis.new(@options[:redis_options]))
117
119
 
118
120
  content = {:base_url => url}
119
121
 
120
122
  # check if it has already been cached
121
- if redis.get(unique_id) and @options[:cache]
123
+ if ((@options[:cache_type] == :crawl_based && redis.get(unique_id)) || (@options[:cache_type] == :full && full_redis.get(unique_id))) && @options[:cache]
122
124
  puts "Cache hit for #{url}" unless @options[:quiet]
123
125
  content = HashUtil.deep_symbolize_keys(Marshal.load(redis.get(unique_id)))
124
126
  else
@@ -138,9 +140,9 @@ class Cobweb
138
140
  begin
139
141
  print "Retrieving #{url }... " unless @options[:quiet]
140
142
  request_options={}
141
- if options[:cookies]
142
- request_options[ 'Cookie']= options[:cookies]
143
- end
143
+ request_options['Cookie']= options[:cookies] if options.has_key?(:cookies)
144
+ request_options['User-Agent']= options[:user_agent] if options.has_key?(:user_agent)
145
+
144
146
  request = Net::HTTP::Get.new uri.request_uri, request_options
145
147
 
146
148
  response = @http.request request
@@ -23,6 +23,9 @@ class CobwebCrawler
23
23
  @redis = Redis::Namespace.new("cobweb-#{Cobweb.version}-#{@crawl_id}", :redis => Redis.new(@options[:redis_options]))
24
24
  @options[:internal_urls] = [] if @options[:internal_urls].nil?
25
25
  @options[:internal_urls].map{|url| @redis.sadd("internal_urls", url)}
26
+
27
+ @options[:crawl_linked_external] = false unless @options.has_key? :crawl_linked_external
28
+
26
29
  @debug = @options[:debug]
27
30
 
28
31
  @stats = Stats.new(@options.merge(:crawl_id => @crawl_id))
@@ -52,12 +55,13 @@ class CobwebCrawler
52
55
  thread = Thread.new do
53
56
 
54
57
  url = @redis.spop "queued"
55
-
58
+ queue_counter = 0 if url.nil?
59
+
56
60
  @options[:url] = url
57
61
  unless @redis.sismember("crawled", url.to_s)
58
62
  begin
59
63
  @stats.update_status("Requesting #{url}...")
60
- content = @cobweb.get(url)
64
+ content = @cobweb.get(url) unless url.nil?
61
65
  if content.nil?
62
66
  queue_counter = queue_counter - 1 #@redis.scard("queued").to_i
63
67
  else
@@ -70,8 +74,9 @@ class CobwebCrawler
70
74
 
71
75
  # select the link if it's internal (eliminate external before expensive lookups in queued and crawled)
72
76
  cobweb_links = CobwebLinks.new(@options)
73
- internal_links = internal_links.select{|link| cobweb_links.internal?(link)}
74
-
77
+
78
+ internal_links = internal_links.select{|link| cobweb_links.internal?(link) || (@options[:crawl_linked_external] && cobweb_links.internal?(url.to_s))}
79
+
75
80
  # reject the link if we've crawled it or queued it
76
81
  internal_links.reject!{|link| @redis.sismember("crawled", link)}
77
82
  internal_links.reject!{|link| @redis.sismember("queued", link)}
@@ -95,9 +100,13 @@ class CobwebCrawler
95
100
  yield content, @stats.get_statistics if block_given?
96
101
  end
97
102
  rescue => e
98
- puts "!!!!!!!!!!!! ERROR !!!!!!!!!!!!!!!!"
99
- ap e
100
- ap e.backtrace
103
+ puts "Error loading #{url}: #{e}"
104
+ #puts "!!!!!!!!!!!! ERROR !!!!!!!!!!!!!!!!"
105
+ #ap e
106
+ #ap e.backtrace
107
+ ensure
108
+ crawl_counter = @redis.scard("crawled").to_i
109
+ queue_counter = @redis.scard("queued").to_i
101
110
  end
102
111
  else
103
112
  puts "Already crawled #{@options[:url]}" if @debug
@@ -3,7 +3,7 @@ class CobwebVersion
3
3
 
4
4
  # Returns a string of the current version
5
5
  def self.version
6
- "1.0.1"
6
+ "1.0.2"
7
7
  end
8
8
 
9
9
  end
data/lib/robots.rb CHANGED
@@ -8,12 +8,12 @@ class Robots
8
8
  raise ":url is required" unless @options.has_key? :url
9
9
  @options[:file] = "robots.txt" unless @options.has_key? :file
10
10
  @options[:user_agent] = "cobweb" unless @options.has_key? :user_agent
11
-
11
+
12
12
  uri = URI.parse(@options[:url])
13
13
  content = Cobweb.new(:cache => nil, :text_mime_types => ["text/html", "application/xhtml+xml", "text/plain"]).get([uri.scheme, "://", uri.host, ":", uri.port, "/", @options[:file]].join)
14
14
  if content[:mime_type][0..4] == "text/"
15
15
  @raw_data = parse_data(content[:body])
16
-
16
+
17
17
  if @options.has_key?(:user_agent) && @raw_data.has_key?(@options[:user_agent].to_s.downcase.to_sym)
18
18
  @params = @raw_data[@options[:user_agent].to_s.downcase.to_sym]
19
19
  else
@@ -186,7 +186,7 @@ describe Cobweb do
186
186
  describe "location setting" do
187
187
  it "Get should strip fragments" do
188
188
  Net::HTTP.should_receive(:new).with("www.google.com", 80)
189
- Net::HTTP::Get.should_receive(:new).with("/", {})
189
+ Net::HTTP::Get.should_receive(:new).with("/", {"User-Agent"=>"cobweb/#{CobwebVersion.version} (ruby/#{RUBY_VERSION} nokogiri/1.5.0)"})
190
190
  @cobweb.get("http://www.google.com/#ignore")
191
191
  end
192
192
  it "head should strip fragments" do
@@ -196,12 +196,12 @@ describe Cobweb do
196
196
  end
197
197
  it "get should not strip path" do
198
198
  Net::HTTP.should_receive(:new).with("www.google.com", 80)
199
- Net::HTTP::Get.should_receive(:new).with("/path/to/stuff", {})
199
+ Net::HTTP::Get.should_receive(:new).with("/path/to/stuff", {"User-Agent"=>"cobweb/#{CobwebVersion.version} (ruby/#{RUBY_VERSION} nokogiri/1.5.0)"})
200
200
  @cobweb.get("http://www.google.com/path/to/stuff#ignore")
201
201
  end
202
202
  it "get should not strip query string" do
203
203
  Net::HTTP.should_receive(:new).with("www.google.com", 80)
204
- Net::HTTP::Get.should_receive(:new).with("/path/to/stuff?query_string", {})
204
+ Net::HTTP::Get.should_receive(:new).with("/path/to/stuff?query_string", {"User-Agent"=>"cobweb/#{CobwebVersion.version} (ruby/#{RUBY_VERSION} nokogiri/1.5.0)"})
205
205
  @cobweb.get("http://www.google.com/path/to/stuff?query_string#ignore")
206
206
  end
207
207
  end
@@ -8,7 +8,7 @@ describe Robots do
8
8
 
9
9
  describe "default user-agent" do
10
10
  before(:each) do
11
- @options = {:url => "http://localhost/"}
11
+ @options = {:url => "http://localhost:3532/"}
12
12
  end
13
13
 
14
14
  it "should parse a valid robots.txt" do
data/spec/spec_helper.rb CHANGED
@@ -63,9 +63,9 @@ RSpec.configure do |config|
63
63
 
64
64
  Net::HTTP.stub!(:new).and_return(@mock_http_client)
65
65
  Net::HTTP::Get.stub!(:new).and_return(@mock_http_request)
66
- Net::HTTP::Get.stub!(:new).with("/redirect.html", {}).and_return(@mock_http_redirect_request)
67
- Net::HTTP::Get.stub!(:new).with("/robots.txt", {}).and_return(@mock_http_robot_request)
68
- Net::HTTP::Get.stub!(:new).with("/redirect2.html", {}).and_return(@mock_http_redirect_request2)
66
+ Net::HTTP::Get.stub!(:new).with("/redirect.html", {"User-Agent"=>"cobweb/#{CobwebVersion.version} (ruby/#{RUBY_VERSION} nokogiri/1.5.0)"}).and_return(@mock_http_redirect_request)
67
+ Net::HTTP::Get.stub!(:new).with("/robots.txt", {"User-Agent"=>"cobweb/#{CobwebVersion.version} (ruby/#{RUBY_VERSION} nokogiri/1.5.0)"}).and_return(@mock_http_robot_request)
68
+ Net::HTTP::Get.stub!(:new).with("/redirect2.html", {"User-Agent"=>"cobweb/#{CobwebVersion.version} (ruby/#{RUBY_VERSION} nokogiri/1.5.0)"}).and_return(@mock_http_redirect_request2)
69
69
 
70
70
  Net::HTTP::Head.stub!(:new).and_return(@mock_http_request)
71
71
 
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: cobweb
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.0.1
4
+ version: 1.0.2
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,11 +9,11 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2012-12-19 00:00:00.000000000 Z
12
+ date: 2012-12-28 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: resque
16
- requirement: &70296707613060 !ruby/object:Gem::Requirement
16
+ requirement: &70096521975680 !ruby/object:Gem::Requirement
17
17
  none: false
18
18
  requirements:
19
19
  - - ! '>='
@@ -21,10 +21,10 @@ dependencies:
21
21
  version: '0'
22
22
  type: :runtime
23
23
  prerelease: false
24
- version_requirements: *70296707613060
24
+ version_requirements: *70096521975680
25
25
  - !ruby/object:Gem::Dependency
26
26
  name: redis
27
- requirement: &70296707611440 !ruby/object:Gem::Requirement
27
+ requirement: &70096521974080 !ruby/object:Gem::Requirement
28
28
  none: false
29
29
  requirements:
30
30
  - - ! '>='
@@ -32,10 +32,10 @@ dependencies:
32
32
  version: '0'
33
33
  type: :runtime
34
34
  prerelease: false
35
- version_requirements: *70296707611440
35
+ version_requirements: *70096521974080
36
36
  - !ruby/object:Gem::Dependency
37
37
  name: nokogiri
38
- requirement: &70296707610120 !ruby/object:Gem::Requirement
38
+ requirement: &70096521972700 !ruby/object:Gem::Requirement
39
39
  none: false
40
40
  requirements:
41
41
  - - ! '>='
@@ -43,10 +43,10 @@ dependencies:
43
43
  version: '0'
44
44
  type: :runtime
45
45
  prerelease: false
46
- version_requirements: *70296707610120
46
+ version_requirements: *70096521972700
47
47
  - !ruby/object:Gem::Dependency
48
48
  name: addressable
49
- requirement: &70296707609180 !ruby/object:Gem::Requirement
49
+ requirement: &70096521971740 !ruby/object:Gem::Requirement
50
50
  none: false
51
51
  requirements:
52
52
  - - ! '>='
@@ -54,10 +54,10 @@ dependencies:
54
54
  version: '0'
55
55
  type: :runtime
56
56
  prerelease: false
57
- version_requirements: *70296707609180
57
+ version_requirements: *70096521971740
58
58
  - !ruby/object:Gem::Dependency
59
59
  name: rspec
60
- requirement: &70296707608660 !ruby/object:Gem::Requirement
60
+ requirement: &70096521971220 !ruby/object:Gem::Requirement
61
61
  none: false
62
62
  requirements:
63
63
  - - ! '>='
@@ -65,10 +65,10 @@ dependencies:
65
65
  version: '0'
66
66
  type: :runtime
67
67
  prerelease: false
68
- version_requirements: *70296707608660
68
+ version_requirements: *70096521971220
69
69
  - !ruby/object:Gem::Dependency
70
70
  name: awesome_print
71
- requirement: &70296707608040 !ruby/object:Gem::Requirement
71
+ requirement: &70096521970620 !ruby/object:Gem::Requirement
72
72
  none: false
73
73
  requirements:
74
74
  - - ! '>='
@@ -76,10 +76,10 @@ dependencies:
76
76
  version: '0'
77
77
  type: :runtime
78
78
  prerelease: false
79
- version_requirements: *70296707608040
79
+ version_requirements: *70096521970620
80
80
  - !ruby/object:Gem::Dependency
81
81
  name: sinatra
82
- requirement: &70296707607220 !ruby/object:Gem::Requirement
82
+ requirement: &70096521969800 !ruby/object:Gem::Requirement
83
83
  none: false
84
84
  requirements:
85
85
  - - ! '>='
@@ -87,10 +87,10 @@ dependencies:
87
87
  version: '0'
88
88
  type: :runtime
89
89
  prerelease: false
90
- version_requirements: *70296707607220
90
+ version_requirements: *70096521969800
91
91
  - !ruby/object:Gem::Dependency
92
92
  name: thin
93
- requirement: &70296707606660 !ruby/object:Gem::Requirement
93
+ requirement: &70096521969240 !ruby/object:Gem::Requirement
94
94
  none: false
95
95
  requirements:
96
96
  - - ! '>='
@@ -98,10 +98,10 @@ dependencies:
98
98
  version: '0'
99
99
  type: :runtime
100
100
  prerelease: false
101
- version_requirements: *70296707606660
101
+ version_requirements: *70096521969240
102
102
  - !ruby/object:Gem::Dependency
103
103
  name: haml
104
- requirement: &70296707606220 !ruby/object:Gem::Requirement
104
+ requirement: &70096521968800 !ruby/object:Gem::Requirement
105
105
  none: false
106
106
  requirements:
107
107
  - - ! '>='
@@ -109,10 +109,10 @@ dependencies:
109
109
  version: '0'
110
110
  type: :runtime
111
111
  prerelease: false
112
- version_requirements: *70296707606220
112
+ version_requirements: *70096521968800
113
113
  - !ruby/object:Gem::Dependency
114
114
  name: namespaced_redis
115
- requirement: &70296707605480 !ruby/object:Gem::Requirement
115
+ requirement: &70096521968060 !ruby/object:Gem::Requirement
116
116
  none: false
117
117
  requirements:
118
118
  - - ! '>='
@@ -120,10 +120,10 @@ dependencies:
120
120
  version: 1.0.2
121
121
  type: :runtime
122
122
  prerelease: false
123
- version_requirements: *70296707605480
123
+ version_requirements: *70096521968060
124
124
  - !ruby/object:Gem::Dependency
125
125
  name: json
126
- requirement: &70296707604780 !ruby/object:Gem::Requirement
126
+ requirement: &70096521967360 !ruby/object:Gem::Requirement
127
127
  none: false
128
128
  requirements:
129
129
  - - ! '>='
@@ -131,7 +131,7 @@ dependencies:
131
131
  version: '0'
132
132
  type: :runtime
133
133
  prerelease: false
134
- version_requirements: *70296707604780
134
+ version_requirements: *70096521967360
135
135
  description: Cobweb is a web crawler that can use resque to cluster crawls to quickly
136
136
  crawl extremely large sites which is much more performant than multi-threaded crawlers. It
137
137
  is also a standalone crawler that has a sophisticated statistics monitoring interface