cobweb 0.0.12 → 0.0.13

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,12 +1,14 @@
1
1
 
2
- h1. Cobweb v0.0.6
2
+ h1. Cobweb v0.0.13
3
3
 
4
4
  h2. Intro
5
5
 
6
- CobWeb has two functions. Firstly it is a http client that allows get and head requests returning a hash of data relating to the requested resource. The second main function is to utilize this combined with the power of Resque to cluster the crawls allowing you crawl quickly.
6
+ CobWeb can be run in three ways. Firstly, it is an HTTP client that allows GET and HEAD requests, returning a hash of data relating to the requested resource. Secondly, it can combine this with the power of Resque to cluster the crawls, allowing you to crawl quickly. Lastly, you can run the crawler with a block that is called with each of the pages found in the crawl.
7
7
 
8
8
  When running on resque, passing in a Class and queue name it will enqueue all resources to this queue for processing, passing in the hash it has generated. You then implement the perform method to process the resource for your own application.
9
9
 
10
+ Documentation for running with a block is coming soon. Check out the CobwebCrawler spec for hints.
11
+
10
12
  The data available in the returned hash are:
11
13
 
12
14
  * :url - url of the resource requested
@@ -51,10 +51,10 @@ class CobWeb
51
51
 
52
52
  raise "url cannot be nil" if url.nil?
53
53
 
54
- absolutize = Absolutize.new(url, :output_debug => false, :raise_exceptions => false, :force_escaping => false, :remove_anchors => true)
55
-
54
+ absolutize = Absolutize.new(url, :output_debug => false, :raise_exceptions => true, :force_escaping => false, :remove_anchors => true)
55
+
56
56
  # get the unique id for this request
57
- unique_id = Digest::SHA1.hexdigest(url)
57
+ unique_id = Digest::SHA1.hexdigest(url.to_s)
58
58
 
59
59
  # connect to redis
60
60
  redis = NamespacedRedis.new(Redis.new(@options[:redis_options]), "cobweb")
@@ -70,23 +70,26 @@ class CobWeb
70
70
  content
71
71
  else
72
72
  # this url is valid for processing so lets get on with it
73
- print "Retrieving #{url }... " unless @options[:quiet]
74
73
  uri = Addressable::URI.parse(url.strip)
75
74
 
76
75
  # retrieve data
77
- http = Net::HTTP.new(uri.host, uri.inferred_port)
76
+ unless @http && @http.address == uri.host && @http.port == uri.inferred_port
77
+ puts "Creating connection to #{uri.host}..."
78
+ @http = Net::HTTP.new(uri.host, uri.inferred_port)
79
+ end
78
80
  if uri.scheme == "https"
79
- http.use_ssl = true
80
- http.verify_mode = OpenSSL::SSL::VERIFY_NONE
81
+ @http.use_ssl = true
82
+ @http.verify_mode = OpenSSL::SSL::VERIFY_NONE
81
83
  end
82
84
 
83
85
  request_time = Time.now.to_f
84
- http.read_timeout = @options[:timeout].to_i
85
- http.open_timeout = @options[:timeout].to_i
86
+ @http.read_timeout = @options[:timeout].to_i
87
+ @http.open_timeout = @options[:timeout].to_i
86
88
  begin
87
- response = http.start() {|http|
88
- response = http.get(uri.request_uri)
89
- }
89
+ print "Retrieving #{url }... " unless @options[:quiet]
90
+ request = Net::HTTP::Get.new uri.request_uri
91
+
92
+ response = @http.request request
90
93
 
91
94
  if @options[:follow_redirects] and response.code.to_i >= 300 and response.code.to_i < 400
92
95
  puts "redirected... " unless @options[:quiet]
@@ -107,9 +110,11 @@ class CobWeb
107
110
  content[:url] = uri.to_s
108
111
  content[:status_code] = response.code.to_i
109
112
  content[:mime_type] = response.content_type.split(";")[0].strip
110
- charset = response["Content-Type"][response["Content-Type"].index(";")+2..-1] if !response["Content-Type"].nil? and response["Content-Type"].include?(";")
111
- charset = charset[charset.index("=")+1..-1] if charset and charset.include?("=")
112
- content[:character_set] = charset
113
+ if response["Content-Type"].include? ";"
114
+ charset = response["Content-Type"][response["Content-Type"].index(";")+2..-1] if !response["Content-Type"].nil? and response["Content-Type"].include?(";")
115
+ charset = charset[charset.index("=")+1..-1] if charset and charset.include?("=")
116
+ content[:character_set] = charset
117
+ end
113
118
  content[:length] = response.content_length
114
119
  if content[:mime_type].include?("text/html") or content[:mime_type].include?("application/xhtml+xml")
115
120
  content[:body] = response.body
@@ -216,9 +221,11 @@ class CobWeb
216
221
  content[:status_code] = response.code.to_i
217
222
  unless response.content_type.nil?
218
223
  content[:mime_type] = response.content_type.split(";")[0].strip
219
- charset = response["Content-Type"][response["Content-Type"].index(";")+2..-1] if !response["Content-Type"].nil? and response["Content-Type"].include?(";")
220
- charset = charset[charset.index("=")+1..-1] if charset and charset.include?("=")
221
- content[:character_set] = charset
224
+ if response["Content-Type"].include? ";"
225
+ charset = response["Content-Type"][response["Content-Type"].index(";")+2..-1] if !response["Content-Type"].nil? and response["Content-Type"].include?(";")
226
+ charset = charset[charset.index("=")+1..-1] if charset and charset.include?("=")
227
+ content[:character_set] = charset
228
+ end
222
229
  end
223
230
 
224
231
  # add content to cache if required
@@ -0,0 +1,120 @@
1
# Crawls a site in-process, starting from a base url and following every link
# that stays under that base url. Aggregate statistics about the retrieved
# resources are collected along the way and returned by #crawl.
class CobwebCrawler

  # Matches urls whose path repeats the same segment three times in a row
  # (e.g. "/a/a/a"); such urls are skipped rather than retrieved.
  REPEATED_SEGMENT = /\/(.+?)\/\1\/\1/

  # options - Hash handed straight to CobWeb.new. The crawler itself reads
  #           :base_url, :crawl_limit and :debug from it, and writes
  #           :base_url and :url into it while crawling.
  def initialize(options={})
    @options = options

    @statistic = {}
    @queue = []
    @crawled = []

    @cobweb = CobWeb.new(@options)
  end

  # Crawls from base_url until the queue is empty or :crawl_limit resources
  # have been retrieved (a limit of 0 / nil means unlimited).
  #
  # base_url      - String url to start from; also the prefix a link must
  #                 have to be treated as internal, unless @options already
  #                 carries a :base_url.
  # crawl_options - Hash, stored on the instance but not otherwise read here.
  # block         - optional; yielded the content hash of every resource
  #                 retrieved successfully.
  #
  # Errors raised while retrieving or processing a url (including errors
  # raised by the block) are reported on stdout and the crawl carries on.
  #
  # Returns the statistics Hash.
  def crawl(base_url, crawl_options = {}, &block)
    @options[:base_url] = base_url unless @options.has_key? :base_url

    @crawl_options = crawl_options

    @absolutize = Absolutize.new(@options[:base_url], :output_debug => false, :raise_exceptions => false, :force_escaping => false, :remove_anchors => true)

    crawl_counter = @crawled.count

    @queue << base_url

    while !@queue.empty? && within_crawl_limit?(crawl_counter)
      url = @queue.first
      @options[:url] = url

      # Dequeue up front: every path through the loop body must consume the
      # head of the queue, otherwise a skipped url would be picked again on
      # the next iteration and the loop would never terminate.
      @queue.delete(url)

      if @crawled.include?(url) || url =~ REPEATED_SEGMENT
        puts "Already crawled #{@options[:url]}" if @options[:debug]
        next
      end

      begin
        content = @cobweb.get(@options[:url])

        record_statistics(content, crawl_counter)

        @crawled << url
        crawl_counter += 1

        enqueue_internal_links(content)

        puts "Crawled: #{crawl_counter} Limit: #{@options[:crawl_limit]} Queued: #{@queue.count}" if @options[:debug]

        yield content if block_given?
      rescue => e
        # Best effort: report the failure and move on to the next url.
        # Plain puts is used so that reporting does not depend on a
        # pretty-printer gem being loaded.
        puts "!!!!!!!!!!!! ERROR !!!!!!!!!!!!!!!!"
        puts "#{e.class}: #{e.message}"
      end
    end

    @statistic
  end

  private

  # True while the configured :crawl_limit (0 or nil meaning "no limit") has
  # not been reached by crawl_counter.
  def within_crawl_limit?(crawl_counter)
    limit = @options[:crawl_limit].to_i
    limit == 0 || limit > crawl_counter
  end

  # Folds one retrieved resource into the running statistics.
  # crawl_counter is the number of resources recorded before this one.
  def record_statistics(content, crawl_counter)
    response_time = content[:response_time]
    length = content[:length].to_i

    if @statistic[:average_response_time].nil?
      @statistic[:average_response_time] = response_time.to_f
    else
      @statistic[:average_response_time] = ((@statistic[:average_response_time] * crawl_counter) + response_time.to_f) / (crawl_counter + 1)
    end

    @statistic[:maximum_response_time] = response_time if @statistic[:maximum_response_time].nil? || @statistic[:maximum_response_time] < response_time
    @statistic[:minimum_response_time] = response_time if @statistic[:minimum_response_time].nil? || @statistic[:minimum_response_time] > response_time

    if @statistic[:average_length]
      @statistic[:average_length] = ((@statistic[:average_length].to_i * crawl_counter) + length) / (crawl_counter + 1)
    else
      @statistic[:average_length] = length
    end

    @statistic[:maximum_length] = length if @statistic[:maximum_length].nil? || length > @statistic[:maximum_length].to_i
    @statistic[:minimum_length] = length if @statistic[:minimum_length].nil? || length < @statistic[:minimum_length].to_i
    @statistic[:total_length] = @statistic[:total_length].to_i + length

    if content[:mime_type].include?("text/html") || content[:mime_type].include?("application/xhtml+xml")
      @statistic[:page_count] = @statistic[:page_count].to_i + 1
      # accumulate onto the running size, not onto the count
      @statistic[:page_size] = @statistic[:page_size].to_i + length
    else
      @statistic[:asset_count] = @statistic[:asset_count].to_i + 1
      @statistic[:asset_size] = @statistic[:asset_size].to_i + length
    end

    mime_counts = (@statistic[:mime_counts] ||= {})
    mime_counts[content[:mime_type]] = mime_counts.fetch(content[:mime_type], 0) + 1

    status_code = content[:status_code].to_i
    status_counts = (@statistic[:status_counts] ||= {})
    status_counts[status_code] = status_counts.fetch(status_code, 0) + 1
  end

  # Queues every link of content that lives under the base url and has been
  # neither crawled nor queued yet.
  def enqueue_internal_links(content)
    base_url = @options[:base_url].to_s

    # A response without a :links entry is treated as having no links.
    links = content[:links] || {}

    links.values.flatten.each do |link|
      target = link.to_s
      next if @crawled.include?(target)

      puts "Checking if #{link} matches #{@options[:base_url]} as internal?" if @options[:debug]
      # Literal prefix comparison: the base url is data, not a pattern, so
      # characters such as "." and "?" must not act as regexp metacharacters.
      next unless target.start_with?(base_url)

      puts "Matched as #{link} as internal" if @options[:debug]
      next if @queue.include?(target)

      puts "Added #{target} to queue" if @options[:debug]
      @queue << target
    end

    @queue.uniq!
  end

end
@@ -56,13 +56,19 @@ class ContentLinkParser
56
56
 
57
57
  def find_matches(array, selector, attribute)
58
58
  if attribute.kind_of? String or attribute.kind_of? Symbol
59
- @doc.css(selector).each do |tag|
60
- uri = @absolutize.url(tag[attribute])
61
- array << uri.to_s
59
+ @doc.css(selector).each do |tag|
60
+ begin
61
+ uri = @absolutize.url(tag[attribute])
62
+ array << uri.to_s
63
+ rescue
64
+ end
62
65
  end
63
66
  elsif attribute.instance_of? Regexp
64
67
  @doc.css(selector).each do |tag|
65
- tag.content.scan(attribute) {|match| array << @absolutize.url(match[0])}
68
+ begin
69
+ tag.content.scan(attribute) {|match| array << @absolutize.url(match[0])}
70
+ rescue
71
+ end
66
72
  end
67
73
  end
68
74
  end
@@ -0,0 +1,56 @@
1
+ require File.expand_path(File.dirname(__FILE__) + '/../spec_helper')
2
+
3
# Specs for CobwebCrawler. The CobWeb http client is replaced with a mock that
# returns a canned response, so these examples never touch the network.
describe CobwebCrawler do

  before(:each) do

    @base_url = "http://www.baseurl.com/"

    # canned content hash handed back for every url requested; it carries no
    # links, so a crawl started at @base_url retrieves exactly one resource
    @content = {:url => @base_url,
                :status_code => 200,
                :mime_type => "text/html",
                :response_time => 0.1,
                :length => 1024,
                :links => {}}

    @mock_cobweb = mock(CobWeb)
    @mock_cobweb.stub!(:get).and_return(@content)
    CobWeb.stub!(:new).and_return(@mock_cobweb)

  end

  describe "with mock" do

    it "should generate a cobweb_crawler object" do
      CobwebCrawler.new.should be_an_instance_of CobwebCrawler
    end

    describe "crawl" do
      it "should crawl a site" do

        crawler = CobwebCrawler.new({:cache => false, :quiet => true, :debug => false})

        statistics = crawler.crawl(@base_url)

        statistics[:page_count].should == 1
        statistics[:total_length].should == 1024
        statistics[:mime_counts].should == {"text/html" => 1}
        statistics[:status_counts].should == {200 => 1}

      end

      it "should take a block" do

        crawler = CobwebCrawler.new({:cache => false, :quiet => true, :debug => false})

        urls = []
        statistics = crawler.crawl(@base_url) do |content|
          urls << content[:url]
        end

        urls.should == [@base_url]
        statistics[:page_count].should == 1

      end
    end
  end

end
@@ -1,5 +1,4 @@
1
1
  require File.expand_path(File.dirname(__FILE__) + '/../spec_helper')
2
- require "ap"
3
2
 
4
3
  describe CobWeb do
5
4
 
@@ -28,6 +27,7 @@ describe CobWeb do
28
27
 
29
28
  @mock_http_response = mock(Net::HTTPResponse)
30
29
  @mock_http_redirect_response = mock(Net::HTTPRedirection)
30
+ @mock_http_redirect_response2 = mock(Net::HTTPRedirection)
31
31
  @mock_http_get = mock(Net::HTTP::Get)
32
32
 
33
33
  Net::HTTP.stub!(:new).and_return(@mock_http_client)
@@ -171,9 +171,6 @@ describe CobWeb do
171
171
  end
172
172
 
173
173
  describe "without mock" do
174
- it "should throw invalid url exception for an invalid url" do
175
- lambda {@cobweb.get("asdgas asv\"£%\"^%&*%")}.should raise_error URI::InvalidURIError
176
- end
177
174
 
178
175
  it "should throw exception when server is unavailable" #do
179
176
  # lambda {@cobweb.get({:url => "http://www.oasdjgoisadjgoisdiog.com"})}.should raise_error URI::InvalidURIError
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: cobweb
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.12
4
+ version: 0.0.13
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,11 +9,11 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2012-01-15 00:00:00.000000000 Z
12
+ date: 2012-02-10 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: resque
16
- requirement: &70227287202120 !ruby/object:Gem::Requirement
16
+ requirement: &70125097329480 !ruby/object:Gem::Requirement
17
17
  none: false
18
18
  requirements:
19
19
  - - ! '>='
@@ -21,10 +21,10 @@ dependencies:
21
21
  version: '0'
22
22
  type: :runtime
23
23
  prerelease: false
24
- version_requirements: *70227287202120
24
+ version_requirements: *70125097329480
25
25
  - !ruby/object:Gem::Dependency
26
26
  name: redis
27
- requirement: &70227287201020 !ruby/object:Gem::Requirement
27
+ requirement: &70125097328760 !ruby/object:Gem::Requirement
28
28
  none: false
29
29
  requirements:
30
30
  - - ! '>='
@@ -32,10 +32,10 @@ dependencies:
32
32
  version: '0'
33
33
  type: :runtime
34
34
  prerelease: false
35
- version_requirements: *70227287201020
35
+ version_requirements: *70125097328760
36
36
  - !ruby/object:Gem::Dependency
37
37
  name: absolutize
38
- requirement: &70227287200100 !ruby/object:Gem::Requirement
38
+ requirement: &70125097328280 !ruby/object:Gem::Requirement
39
39
  none: false
40
40
  requirements:
41
41
  - - ! '>='
@@ -43,10 +43,10 @@ dependencies:
43
43
  version: '0'
44
44
  type: :runtime
45
45
  prerelease: false
46
- version_requirements: *70227287200100
46
+ version_requirements: *70125097328280
47
47
  - !ruby/object:Gem::Dependency
48
48
  name: nokogiri
49
- requirement: &70227287199600 !ruby/object:Gem::Requirement
49
+ requirement: &70125097327660 !ruby/object:Gem::Requirement
50
50
  none: false
51
51
  requirements:
52
52
  - - ! '>='
@@ -54,10 +54,10 @@ dependencies:
54
54
  version: '0'
55
55
  type: :runtime
56
56
  prerelease: false
57
- version_requirements: *70227287199600
57
+ version_requirements: *70125097327660
58
58
  - !ruby/object:Gem::Dependency
59
59
  name: addressable
60
- requirement: &70227287198900 !ruby/object:Gem::Requirement
60
+ requirement: &70125097327060 !ruby/object:Gem::Requirement
61
61
  none: false
62
62
  requirements:
63
63
  - - ! '>='
@@ -65,7 +65,18 @@ dependencies:
65
65
  version: '0'
66
66
  type: :runtime
67
67
  prerelease: false
68
- version_requirements: *70227287198900
68
+ version_requirements: *70125097327060
69
+ - !ruby/object:Gem::Dependency
70
+ name: rspec
71
+ requirement: &70125097326400 !ruby/object:Gem::Requirement
72
+ none: false
73
+ requirements:
74
+ - - ! '>='
75
+ - !ruby/object:Gem::Version
76
+ version: '0'
77
+ type: :runtime
78
+ prerelease: false
79
+ version_requirements: *70125097326400
69
80
  description:
70
81
  email: stewart@rockwellcottage.com
71
82
  executables: []
@@ -73,12 +84,14 @@ extensions: []
73
84
  extra_rdoc_files:
74
85
  - README.textile
75
86
  files:
87
+ - spec/cobweb/cobweb_crawler_spec.rb
76
88
  - spec/cobweb/cobweb_spec.rb
77
89
  - spec/cobweb/content_link_parser_spec.rb
78
90
  - spec/samples/sample_html_links.html
79
91
  - spec/spec.opts
80
92
  - spec/spec_helper.rb
81
93
  - lib/cobweb.rb
94
+ - lib/cobweb_crawler.rb
82
95
  - lib/cobweb_finished_job.rb
83
96
  - lib/cobweb_process_job.rb
84
97
  - lib/content_link_parser.rb