cobweb 0.0.12 → 0.0.13
- data/README.textile +4 -2
- data/lib/cobweb.rb +25 -18
- data/lib/cobweb_crawler.rb +120 -0
- data/lib/content_link_parser.rb +10 -4
- data/spec/cobweb/cobweb_crawler_spec.rb +56 -0
- data/spec/cobweb/cobweb_spec.rb +1 -4
- metadata +25 -12
data/README.textile
CHANGED
@@ -1,12 +1,14 @@
 
-h1. Cobweb v0.0.
+h1. Cobweb v0.0.13
 
 h2. Intro
 
-CobWeb has
+CobWeb has three ways of running. First, it is an HTTP client that performs get and head requests and returns a hash of data about the requested resource. Second, it can combine this with the power of Resque to cluster crawls, letting you crawl quickly. Last, you can run the crawler with a block that is called for each page found in the crawl.
 
 When running on resque, passing in a Class and queue name it will enqueue all resources to this queue for processing, passing in the hash it has generated. You then implement the perform method to process the resource for your own application.
 
+Documentation for running with a block will come soon. Check out the CobwebCrawler spec for hints.
+
 The data available in the returned hash are:
 
 * :url - url of the resource requested
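The three ways of running described above map onto only a few lines of code. The sketch below is illustrative only: the option keys :quiet and :cache, the get call, and the crawl-with-a-block call all appear elsewhere in this diff, while the processor class name, its perform body, and the way the class and queue are handed to CobWeb are assumptions, not taken from this diff.

    # Illustrative sketch only -- see the note above for which names are assumed.
    require 'cobweb'
    require 'cobweb_crawler'   # lib/cobweb_crawler.rb; whether cobweb.rb loads it is not shown here

    # 1. Plain HTTP client: get returns a hash describing the resource.
    content = CobWeb.new(:quiet => true, :cache => false).get("http://www.rockwellcottage.com/")
    puts content[:status_code]
    puts content[:mime_type]

    # 2. Resque mode: CobWeb enqueues each resource hash to your class and queue;
    #    you implement perform (class name here is hypothetical, and how the class
    #    and queue are passed is configured via options not shown in this diff).
    class MyContentProcessor
      def self.perform(content_hash)
        # process the resource for your own application
      end
    end

    # 3. In-process crawl with a block, as exercised by the new CobwebCrawler spec.
    statistics = CobwebCrawler.new(:cache => false, :quiet => true).crawl("http://www.rockwellcottage.com/") do |content|
      puts content[:url]
    end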
data/lib/cobweb.rb
CHANGED
@@ -51,10 +51,10 @@ class CobWeb
 
     raise "url cannot be nil" if url.nil?
 
-    absolutize = Absolutize.new(url, :output_debug => false, :raise_exceptions =>
-
+    absolutize = Absolutize.new(url, :output_debug => false, :raise_exceptions => true, :force_escaping => false, :remove_anchors => true)
+
     # get the unique id for this request
-    unique_id = Digest::SHA1.hexdigest(url)
+    unique_id = Digest::SHA1.hexdigest(url.to_s)
 
     # connect to redis
     redis = NamespacedRedis.new(Redis.new(@options[:redis_options]), "cobweb")
@@ -70,23 +70,26 @@ class CobWeb
       content
     else
       # this url is valid for processing so lets get on with it
-      print "Retrieving #{url }... " unless @options[:quiet]
       uri = Addressable::URI.parse(url.strip)
 
       # retrieve data
-      http
+      unless @http && @http.address == uri.host && @http.port == uri.inferred_port
+        puts "Creating connection to #{uri.host}..."
+        @http = Net::HTTP.new(uri.host, uri.inferred_port)
+      end
       if uri.scheme == "https"
-        http.use_ssl = true
-        http.verify_mode = OpenSSL::SSL::VERIFY_NONE
+        @http.use_ssl = true
+        @http.verify_mode = OpenSSL::SSL::VERIFY_NONE
       end
 
       request_time = Time.now.to_f
-      http.read_timeout = @options[:timeout].to_i
-      http.open_timeout = @options[:timeout].to_i
+      @http.read_timeout = @options[:timeout].to_i
+      @http.open_timeout = @options[:timeout].to_i
       begin
-
-
-
+        print "Retrieving #{url }... " unless @options[:quiet]
+        request = Net::HTTP::Get.new uri.request_uri
+
+        response = @http.request request
 
         if @options[:follow_redirects] and response.code.to_i >= 300 and response.code.to_i < 400
           puts "redirected... " unless @options[:quiet]
@@ -107,9 +110,11 @@ class CobWeb
         content[:url] = uri.to_s
         content[:status_code] = response.code.to_i
         content[:mime_type] = response.content_type.split(";")[0].strip
-
-
-
+        if response["Content-Type"].include? ";"
+          charset = response["Content-Type"][response["Content-Type"].index(";")+2..-1] if !response["Content-Type"].nil? and response["Content-Type"].include?(";")
+          charset = charset[charset.index("=")+1..-1] if charset and charset.include?("=")
+          content[:character_set] = charset
+        end
         content[:length] = response.content_length
         if content[:mime_type].include?("text/html") or content[:mime_type].include?("application/xhtml+xml")
           content[:body] = response.body
@@ -216,9 +221,11 @@ class CobWeb
       content[:status_code] = response.code.to_i
       unless response.content_type.nil?
         content[:mime_type] = response.content_type.split(";")[0].strip
-
-
-
+        if response["Content-Type"].include? ";"
+          charset = response["Content-Type"][response["Content-Type"].index(";")+2..-1] if !response["Content-Type"].nil? and response["Content-Type"].include?(";")
+          charset = charset[charset.index("=")+1..-1] if charset and charset.include?("=")
+          content[:character_set] = charset
+        end
       end
 
       # add content to cache if required
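The two changes in this file are (1) caching a single Net::HTTP connection in @http and reusing it while the host and port stay the same, and (2) recording the character set from the Content-Type header. The charset handling reduces to two string slices; the standalone illustration below (not gem code) uses the same operations as the added lines, and, like them, assumes exactly one character between the ";" and "charset".

    # Standalone illustration of the charset parsing added above; not part of the gem.
    header = "text/html; charset=UTF-8"

    mime_type = header.split(";")[0].strip               # "text/html"

    if header.include? ";"
      charset = header[header.index(";") + 2..-1]        # "charset=UTF-8"
      charset = charset[charset.index("=") + 1..-1] if charset && charset.include?("=")
    end

    puts mime_type   # => text/html
    puts charset     # => UTF-8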
data/lib/cobweb_crawler.rb
ADDED
@@ -0,0 +1,120 @@
+class CobwebCrawler
+
+  def initialize(options={})
+    @options = options
+
+    @statistic = {}
+    @queue = []
+    @crawled = []
+
+    @cobweb = CobWeb.new(@options)
+  end
+
+  def crawl(base_url, crawl_options = {}, &block)
+    @options[:base_url] = base_url unless @options.has_key? :base_url
+
+    @crawl_options = crawl_options
+
+    @absolutize = Absolutize.new(@options[:base_url], :output_debug => false, :raise_exceptions => false, :force_escaping => false, :remove_anchors => true)
+
+    crawl_counter = @crawled.count
+
+    @queue << base_url
+
+    while !@queue.empty? && (@options[:crawl_limit].to_i == 0 || @options[:crawl_limit].to_i > crawl_counter)
+
+      url = @queue.first
+      @options[:url] = url
+      unless @crawled.include?(url) || url =~ /\/(.+?)\/\1\/\1/
+        begin
+          content = @cobweb.get(@options[:url])
+
+          if @statistic[:average_response_time].nil?
+            @statistic[:average_response_time] = content[:response_time].to_f
+          else
+            @statistic[:average_response_time] = (((@statistic[:average_response_time] * crawl_counter) + content[:response_time].to_f) / (crawl_counter + 1))
+          end
+
+          @statistic[:maximum_response_time] = content[:response_time] if @statistic[:maximum_response_time].nil? || @statistic[:maximum_response_time] < content[:response_time]
+          @statistic[:minimum_response_time] = content[:response_time] if @statistic[:minimum_response_time].nil? || @statistic[:minimum_response_time] > content[:response_time]
+
+          if @statistic[:average_length]
+            @statistic[:average_length] = (((@statistic[:average_length].to_i*crawl_counter) + content[:length].to_i) / (crawl_counter + 1))
+          else
+            @statistic[:average_length] = content[:length].to_i
+          end
+
+          @statistic[:maximum_length] = content[:length].to_i if @statistic[:maximum_length].nil? || content[:length].to_i > @statistic[:maximum_length].to_i
+          @statistic[:minimum_length] = content[:length].to_i if @statistic[:minimum_length].nil? || content[:length].to_i < @statistic[:minimum_length].to_i
+          @statistic[:total_length] = @statistic[:total_length].to_i + content[:length].to_i
+
+          if content[:mime_type].include?("text/html") or content[:mime_type].include?("application/xhtml+xml")
+            @statistic[:page_count] = @statistic[:page_count].to_i + 1
+            @statistic[:page_size] = @statistic[:page_count].to_i + content[:length].to_i
+          else
+            @statistic[:asset_count] = @statistic[:asset_count].to_i + 1
+            @statistic[:asset_size] = @statistic[:asset_count].to_i + content[:length].to_i
+          end
+
+          mime_counts = {}
+          if @statistic.has_key? :mime_counts
+            mime_counts = @statistic[:mime_counts]
+            if mime_counts.has_key? content[:mime_type]
+              mime_counts[content[:mime_type]] += 1
+            else
+              mime_counts[content[:mime_type]] = 1
+            end
+          else
+            mime_counts = {content[:mime_type] => 1}
+          end
+          @statistic[:mime_counts] = mime_counts
+
+          status_counts = {}
+
+          if @statistic.has_key? :status_counts
+            status_counts = @statistic[:status_counts]
+            if status_counts.has_key? content[:status_code].to_i
+              status_counts[content[:status_code].to_i] += 1
+            else
+              status_counts[content[:status_code].to_i] = 1
+            end
+          else
+            status_counts = {content[:status_code].to_i => 1}
+          end
+          @statistic[:status_counts] = status_counts
+
+          @crawled << url
+          crawl_counter += 1
+          @queue.delete(url)
+          content[:links].keys.map{|key| content[:links][key]}.flatten.each do |link|
+            unless @crawled.include? link
+              puts "Checking if #{link} matches #{@options[:base_url]} as internal?" if @options[:debug]
+              if link.to_s.match(Regexp.new("^#{@options[:base_url]}"))
+                puts "Matched as #{link} as internal" if @options[:debug]
+                unless @crawled.include? link.to_s or @queue.include? link.to_s
+                  puts "Added #{link.to_s} to queue" if @options[:debug]
+                  @queue << link.to_s
+                end
+              end
+            end
+          end
+          @queue.uniq!
+
+          puts "Crawled: #{crawl_counter} Limit: #{@options[:crawl_limit]} Queued: #{@queue.count}" if @options[:debug]
+
+          yield content if block_given?
+
+        rescue => e
+          puts "!!!!!!!!!!!! ERROR !!!!!!!!!!!!!!!!"
+          ap e
+          @queue.delete(url)
+
+        end
+      else
+        puts "Already crawled #{@options[:url]}" if @options[:debug]
+      end
+    end
+    @statistic
+  end
+
+end
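A short sketch of driving the new class and reading the statistics hash that crawl returns; every key shown is one the method above populates, while the URL, option values and crawl limit are examples only.

    # Sketch of using CobwebCrawler; URL and option values are examples only.
    crawler = CobwebCrawler.new(:cache => false, :quiet => true, :debug => false, :crawl_limit => 10)

    statistics = crawler.crawl("http://www.rockwellcottage.com/") do |content|
      # the block receives the content hash for every resource fetched
      puts "#{content[:status_code]} #{content[:url]}"
    end

    puts statistics[:page_count]              # HTML pages seen
    puts statistics[:asset_count]             # non-HTML resources seen
    puts statistics[:average_response_time]   # running average of content[:response_time]
    puts statistics[:mime_counts].inspect     # counts per mime type
    puts statistics[:status_counts].inspect   # counts per HTTP status code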
data/lib/content_link_parser.rb
CHANGED
@@ -56,13 +56,19 @@ class ContentLinkParser
 
   def find_matches(array, selector, attribute)
     if attribute.kind_of? String or attribute.kind_of? Symbol
-      @doc.css(selector).each do |tag|
-
-
+      @doc.css(selector).each do |tag|
+        begin
+          uri = @absolutize.url(tag[attribute])
+          array << uri.to_s
+        rescue
+        end
       end
     elsif attribute.instance_of? Regexp
       @doc.css(selector).each do |tag|
-
+        begin
+          tag.content.scan(attribute) {|match| array << @absolutize.url(match[0])}
+        rescue
+        end
       end
     end
   end
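The change above wraps each attribute lookup in begin/rescue so a single malformed value no longer aborts link parsing. Below is a self-contained sketch of the same pattern; Nokogiri is a declared dependency of the gem, but URI.join stands in for @absolutize.url here purely so the example runs without the Absolutize gem.

    # Sketch of the pattern find_matches now follows; URI.join stands in for @absolutize.url.
    require 'nokogiri'
    require 'uri'

    doc = Nokogiri::HTML('<a href="/about">About</a> <a href="http://other.example/">Other</a>')

    links = []
    doc.css("a").each do |tag|
      begin
        links << URI.join("http://www.example.com/", tag["href"]).to_s
      rescue
        # malformed values are skipped, as in the rescue added above
      end
    end

    puts links.inspect
    # => ["http://www.example.com/about", "http://other.example/"]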
data/spec/cobweb/cobweb_crawler_spec.rb
ADDED
@@ -0,0 +1,56 @@
+require File.expand_path(File.dirname(__FILE__) + '/../spec_helper')
+
+describe CobwebCrawler do
+
+  before(:each) do
+
+    @base_url = "http://www.baseurl.com/"
+
+    @default_headers = {"Cache-Control" => "private, max-age=0",
+                        "Date" => "Wed, 10 Nov 2010 09:06:17 GMT",
+                        "Expires" => "-1",
+                        "Content-Type" => "text/html; charset=UTF-8",
+                        "Content-Encoding" => "gzip",
+                        "Transfer-Encoding" => "chunked",
+                        "Server" => "gws",
+                        "X-XSS-Protection" => "1; mode=block"}
+
+  end
+
+  describe "with mock" do
+
+
+    it "should generate a cobweb_crawler object" do
+      CobwebCrawler.new.should be_an_instance_of CobwebCrawler
+    end
+
+    describe "crawl" do
+      it "should crawl a site" do
+
+        # temporary tests to run crawler - proper specs to follow.. honest
+
+        crawler = CobwebCrawler.new({:cache => false, :quiet => false, :debug => false})
+
+        statistics = crawler.crawl("http://www.rockwellcottage.com/")
+
+        ap statistics
+
+      end
+
+      it "should take a block" do
+
+        # temporary tests to run crawler - proper specs to follow.. honest
+
+        crawler = CobwebCrawler.new({:cache => false, :quiet => false, :debug => false})
+
+        statistics = crawler.crawl("http://www.rockwellcottage.com/") do |content|
+          ap content[:url]
+        end
+
+        ap statistics
+
+      end
+    end
+  end
+
+end
data/spec/cobweb/cobweb_spec.rb
CHANGED
@@ -1,5 +1,4 @@
 require File.expand_path(File.dirname(__FILE__) + '/../spec_helper')
-require "ap"
 
 describe CobWeb do
 
@@ -28,6 +27,7 @@ describe CobWeb do
 
     @mock_http_response = mock(Net::HTTPResponse)
     @mock_http_redirect_response = mock(Net::HTTPRedirection)
+    @mock_http_redirect_response2 = mock(Net::HTTPRedirection)
     @mock_http_get = mock(Net::HTTP::Get)
 
     Net::HTTP.stub!(:new).and_return(@mock_http_client)
@@ -171,9 +171,6 @@ describe CobWeb do
   end
 
   describe "without mock" do
-    it "should throw invalid url exception for an invalid url" do
-      lambda {@cobweb.get("asdgas asv\"£%\"^%&*%")}.should raise_error URI::InvalidURIError
-    end
 
     it "should throw exception when server is unavailable" #do
     # lambda {@cobweb.get({:url => "http://www.oasdjgoisadjgoisdiog.com"})}.should raise_error URI::InvalidURIError
metadata
CHANGED
@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: cobweb
 version: !ruby/object:Gem::Version
-  version: 0.0.
+  version: 0.0.13
   prerelease:
 platform: ruby
 authors:
@@ -9,11 +9,11 @@ authors:
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2012-
+date: 2012-02-10 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: resque
-  requirement: &
+  requirement: &70125097329480 !ruby/object:Gem::Requirement
     none: false
     requirements:
     - - ! '>='
@@ -21,10 +21,10 @@ dependencies:
         version: '0'
   type: :runtime
   prerelease: false
-  version_requirements: *
+  version_requirements: *70125097329480
 - !ruby/object:Gem::Dependency
   name: redis
-  requirement: &
+  requirement: &70125097328760 !ruby/object:Gem::Requirement
     none: false
     requirements:
     - - ! '>='
@@ -32,10 +32,10 @@ dependencies:
         version: '0'
   type: :runtime
   prerelease: false
-  version_requirements: *
+  version_requirements: *70125097328760
 - !ruby/object:Gem::Dependency
   name: absolutize
-  requirement: &
+  requirement: &70125097328280 !ruby/object:Gem::Requirement
     none: false
     requirements:
     - - ! '>='
@@ -43,10 +43,10 @@ dependencies:
         version: '0'
   type: :runtime
   prerelease: false
-  version_requirements: *
+  version_requirements: *70125097328280
 - !ruby/object:Gem::Dependency
   name: nokogiri
-  requirement: &
+  requirement: &70125097327660 !ruby/object:Gem::Requirement
     none: false
     requirements:
     - - ! '>='
@@ -54,10 +54,10 @@ dependencies:
         version: '0'
   type: :runtime
   prerelease: false
-  version_requirements: *
+  version_requirements: *70125097327660
 - !ruby/object:Gem::Dependency
   name: addressable
-  requirement: &
+  requirement: &70125097327060 !ruby/object:Gem::Requirement
     none: false
     requirements:
     - - ! '>='
@@ -65,7 +65,18 @@ dependencies:
         version: '0'
   type: :runtime
   prerelease: false
-  version_requirements: *
+  version_requirements: *70125097327060
+- !ruby/object:Gem::Dependency
+  name: rspec
+  requirement: &70125097326400 !ruby/object:Gem::Requirement
+    none: false
+    requirements:
+    - - ! '>='
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :runtime
+  prerelease: false
+  version_requirements: *70125097326400
 description:
 email: stewart@rockwellcottage.com
 executables: []
@@ -73,12 +84,14 @@ extensions: []
 extra_rdoc_files:
 - README.textile
 files:
+- spec/cobweb/cobweb_crawler_spec.rb
 - spec/cobweb/cobweb_spec.rb
 - spec/cobweb/content_link_parser_spec.rb
 - spec/samples/sample_html_links.html
 - spec/spec.opts
 - spec/spec_helper.rb
 - lib/cobweb.rb
+- lib/cobweb_crawler.rb
 - lib/cobweb_finished_job.rb
 - lib/cobweb_process_job.rb
 - lib/content_link_parser.rb