cobweb 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- data/README.textile +63 -0
- data/lib/cobweb.rb +130 -0
- data/lib/content_link_parser.rb +71 -0
- data/lib/content_process_job.rb +13 -0
- data/lib/crawl_job.rb +71 -0
- data/lib/namespaced_redis.rb +52 -0
- data/spec/cobweb/cobweb_spec.rb +189 -0
- data/spec/cobweb/content_link_parser_spec.rb +104 -0
- data/spec/cobweb/crawl_job_spec.rb +24 -0
- data/spec/samples/sample_html_links.html +34 -0
- data/spec/spec.opts +2 -0
- data/spec/spec_helper.rb +1 -0
- metadata +133 -0
data/README.textile
ADDED
@@ -0,0 +1,63 @@
|
|
1
|
+
|
2
|
+
h1. Cobweb v0.0.1
|
3
|
+
|
4
|
+
h2. Intro
|
5
|
+
|
6
|
+
Cobweb is a web crawler that uses Resque jobs to perform the crawl, allowing a crawl to be spread across a cluster of workers.
|
7
|
+
|
8
|
+
h2. Installation
|
9
|
+
|
10
|
+
Install crawler as a gem
|
11
|
+
|
12
|
+
bq. gem install cobweb
|
13
|
+
|
14
|
+
h2. Usage
|
15
|
+
|
16
|
+
h4. new(options)
|
17
|
+
|
18
|
+
Creates a new crawler object configured by the given options
|
19
|
+
|
20
|
+
* options - Options are passed in as a hash (defaults shown in brackets):
|
21
|
+
|
22
|
+
** :follow_redirects - transparently follows redirects and populates the :redirect_through key in the content hash (default: true)
|
23
|
+
** :redirect_limit - sets the maximum number of consecutive redirects to follow (default: 10)
|
24
|
+
** :processing_queue - specifies the processing queue for content to be sent to (default: ContentProcessJob)
|
25
|
+
** :debug - enables debug output (default: false)
|
26
|
+
** :quiet - hides default output (default: false)
|
27
|
+
** :cache - if set, enables the cache and uses the given value as the ttl in seconds, e.g. 300 (default: caching disabled)
|
28
|
+
|
29
|
+
bq. crawler = CobWeb.new(:follow_redirects => false)
|
30
|
+
|
31
|
+
|
32
|
+
h4. start(base_url)
|
33
|
+
|
34
|
+
* base_url - the url to start the crawl from
|
35
|
+
|
36
|
+
h4. get(url)
|
37
|
+
|
38
|
+
* url - url requested
|
39
|
+
|
40
|
+
|
41
|
+
h2. License
|
42
|
+
|
43
|
+
h3. The MIT License
|
44
|
+
|
45
|
+
Copyright (c) 2010 6Central Limited
|
46
|
+
|
47
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
48
|
+
of this software and associated documentation files (the "Software"), to deal
|
49
|
+
in the Software without restriction, including without limitation the rights
|
50
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
51
|
+
copies of the Software, and to permit persons to whom the Software is
|
52
|
+
furnished to do so, subject to the following conditions:
|
53
|
+
|
54
|
+
The above copyright notice and this permission notice shall be included in
|
55
|
+
all copies or substantial portions of the Software.
|
56
|
+
|
57
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
58
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
59
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
60
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
61
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
62
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
63
|
+
THE SOFTWARE.
|
data/lib/cobweb.rb
ADDED
@@ -0,0 +1,130 @@
|
|
1
|
+
require 'rubygems'
|
2
|
+
require 'uri'
|
3
|
+
require 'resque'
|
4
|
+
require 'digest/sha1'
|
5
|
+
|
6
|
+
# Load every other Ruby file that lives next to this one.
# The list is sorted so the load order does not depend on the filesystem, and
# this file itself is skipped so it is never required a second time.
lib_dir = File.expand_path(File.dirname(__FILE__))
Dir[File.join(lib_dir, '*.rb')].sort.each do |file|
  next if File.expand_path(file) == File.expand_path(__FILE__)
  require File.join(lib_dir, File.basename(file, File.extname(file)))
end
|
9
|
+
|
10
|
+
# Crawler front end: #start enqueues a crawl on Resque, #get retrieves a
# single url and returns a content hash describing the response.
class CobWeb

  # #get uses JSON and Net::HTTP directly, so load them here rather than
  # relying on another file having required them first.
  require 'json'
  require 'net/https'

  # options - hash of crawler options. Missing values are defaulted in place:
  #   :follow_redirects => true, :redirect_limit => 10,
  #   :processing_queue => ContentProcessJob, :debug => false
  def initialize(options = {})
    @options = options
    @options[:follow_redirects] = true if @options[:follow_redirects].nil?
    @options[:redirect_limit] = 10 if @options[:redirect_limit].nil?
    @options[:processing_queue] = ContentProcessJob if @options[:processing_queue].nil?
    @options[:debug] = false unless @options[:debug]
  end

  # Enqueues a CrawlJob for base_url, merging in the crawler options.
  # Raises a RuntimeError when base_url is nil/false.
  def start(base_url)
    raise ":base_url is required" unless base_url

    request = {
      # Sub-second time plus a random component, so two crawls started within
      # the same second do not end up sharing a crawl id.
      :crawl_id => Digest::SHA1.hexdigest("#{Time.now.to_f}-#{rand}-#{base_url}"),
      :url => base_url
    }
    request.merge!(@options)

    Resque.enqueue(CrawlJob, request)
  end

  # Retrieves url and returns the content hash (:url, :status_code,
  # :content_type, :character_set, :content_length, :content_body, :location,
  # :headers, :links, :response_time and, after redirects, :redirect_through).
  #
  # url            - absolute url to request; raises if nil
  # redirect_limit - number of redirects that may still be followed
  #
  # Raises a RuntimeError when the redirect limit is exhausted, and whatever
  # URI.parse / Net::HTTP raise for invalid urls or network failures.
  def get(url, redirect_limit = @options[:redirect_limit])
    raise "url cannot be nil" if url.nil?

    # get the unique id for this request
    unique_id = Digest::SHA1.hexdigest(url)

    # Only talk to redis when caching is switched on.
    redis = nil
    if @options[:cache]
      redis = NamespacedRedis.new(Redis.new, "cobweb")
      cached = redis.get(unique_id)
      if cached
        puts "Cache hit for #{url}" unless @options[:quiet]
        return JSON.parse(cached).deep_symbolize_keys
      end
    end

    print "Retrieving #{url}... " unless @options[:quiet]
    uri = URI.parse(url)

    http = Net::HTTP.new(uri.host, uri.port)
    if uri.scheme == "https"
      http.use_ssl = true
      # NOTE(review): certificate verification is disabled, so https
      # responses are not authenticated. Left unchanged to keep behaviour.
      http.verify_mode = OpenSSL::SSL::VERIFY_NONE
    end

    request_time = Time.now.to_f
    response = http.request(Net::HTTP::Get.new(uri.request_uri))
    status = response.code.to_i
    location = response['location']
    content = {}

    # A 3xx response without a Location header cannot be followed, so it is
    # reported like any other response.
    if @options[:follow_redirects] && status >= 300 && status < 400 && location
      raise "redirect limit reached while retrieving #{url}" if redirect_limit.to_i <= 0
      puts "redirected... " unless @options[:quiet]

      # Location may be relative; resolve it against the url just requested.
      target = URI.join(uri.to_s, location).to_s

      content = get(target, redirect_limit - 1)
      content[:url] = uri.to_s
      content[:redirect_through] = [] if content[:redirect_through].nil?
      content[:redirect_through].insert(0, target)

      content[:response_time] = Time.now.to_f - request_time
    else
      content[:response_time] = Time.now.to_f - request_time

      puts "Retrieved." unless @options[:quiet]

      # create the content container
      content[:url] = uri.to_s
      content[:status_code] = status
      content[:content_type] = response.content_type
      content[:character_set] = charset_from(response["Content-Type"])
      content[:content_length] = response.content_length
      content[:content_body] = response.body
      content[:location] = location
      content[:headers] = response.to_hash.symbolize_keys

      # parse data for links
      link_parser = ContentLinkParser.new(content[:url], content[:content_body])
      content[:links] = link_parser.link_data

      # add content to cache if required; the :cache value is the ttl
      if redis
        redis.set(unique_id, content.to_json)
        redis.expire unique_id, @options[:cache].to_i
      end
    end

    content
  end

  private

  # Extracts the charset parameter from a Content-Type header value,
  # e.g. "text/html; charset=UTF-8" => "UTF-8". Returns nil when the header
  # is missing or carries no charset parameter.
  def charset_from(content_type_header)
    return nil if content_type_header.nil?
    content_type_header[/charset\s*=\s*"?([^";\s]+)/i, 1]
  end
end
|
108
|
+
|
109
|
+
## add symbolize methods to hash
|
110
|
+
# Key-symbolizing helpers added to Hash. Both methods modify the receiver in
# place and return it; callers in this library rely on the in-place change.
# NOTE(review): ActiveSupport defines methods with the same names that return
# a new hash instead - confirm there is no clash where both are loaded.
class Hash
  # Replaces every String key with the equivalent Symbol key.
  # Keys of any other class are left untouched. Returns self.
  def symbolize_keys
    keys.each do |key|
      self[key.to_sym] = delete(key) if key.instance_of? String
    end
    self
  end

  # Symbolizes the keys of this hash and of every hash nested inside it,
  # including hashes held inside (possibly nested) arrays. Returns self.
  def deep_symbolize_keys
    symbolize_keys
    pending = values
    until pending.empty?
      item = pending.shift
      case item
      when Hash then item.deep_symbolize_keys
      when Array then pending.concat(item)
      end
    end
    self
  end
end
|
@@ -0,0 +1,71 @@
|
|
1
|
+
|
2
|
+
# Extracts links from an HTML document, grouped by type (:links, :images,
# :related, :scripts, :styles plus any caller-supplied groups). Each group is
# a list of [css_selector, attribute_name_or_regexp] pairs.
class ContentLinkParser

  require "nokogiri"
  require "absolutize"

  # url     - url the content was retrieved from; used as the base for
  #           relative links unless the document has a <base href> tag
  # content - HTML source to parse
  # options - :ignore_default_tags => true drops the built-in groups,
  #           :additional_tags => {group => [[selector, attribute], ...]}
  #           adds or replaces groups
  def initialize(url, content, options = {})
    # work on a copy so the caller's hash is not modified
    @options = options.dup
    @url = url
    @doc = Nokogiri::HTML(content)

    base_url = @url.to_s
    base_tag = @doc.at("base[href]")
    base_url = base_tag.attr("href").to_s if base_tag
    @absolutize = Absolutize.new(base_url, :output_debug => false, :raise_exceptions => false, :force_escaping => false, :remove_anchors => true)

    @options[:tags] = {}
    @options[:tags][:links] = [["a[href]", "href"], ["frame[src]", "src"], ["meta[@http-equiv=\"refresh\"]", "content"], ["link[href]:not([rel])", "href"], ["area[href]", "href"]]
    @options[:tags][:images] = [["img[src]", "src"]]
    @options[:tags][:related] = [["link[rel]", "href"]]
    @options[:tags][:scripts] = [["script[src]", "src"]]
    @options[:tags][:styles] = [["link[rel='stylesheet'][href]", "href"], ["style[@type^='text/css']", /url\("?(.*?)"?\)/]]

    #clear the default tags if required
    @options[:tags] = {} if @options[:ignore_default_tags]
    @options[:tags].merge!(@options[:additional_tags]) unless @options[:additional_tags].nil?
  end

  # Returns a hash of group name (as a symbol) => array of absolute urls.
  def link_data
    data = {}
    @options[:tags].keys.each do |key|
      # look the group up directly rather than evaluating its name as code
      data[key.to_sym] = links_for(key)
    end
    data
  end

  # Returns every url from every group as one flat, de-duplicated array.
  def all_links
    data = link_data
    data.keys.map{|key| data[key]}.flatten.uniq
  end

  # Makes each configured group available as a method, e.g. parser.images.
  # Any other unknown method prints a warning and returns an empty array.
  def method_missing(m, *args)
    if @options[:tags].key?(m)
      links_for(m)
    else
      puts "Warning: There was no configuration on how to find #{m} links"
      []
    end
  end

  # Reports the configured group names as methods this object responds to.
  def respond_to_missing?(m, include_private = false)
    @options[:tags].key?(m) || super
  end

  # Appends to array the absolute url (as a String) of every match.
  #
  # selector  - css selector choosing the tags to inspect
  # attribute - String/Symbol naming the attribute holding the url, or a
  #             Regexp scanned against the tag content whose first capture
  #             group is the url
  def find_matches(array, selector, attribute)
    if attribute.kind_of? String or attribute.kind_of? Symbol
      @doc.css(selector).each do |tag|
        uri = @absolutize.url(tag[attribute])
        array << uri.to_s
      end
    elsif attribute.instance_of? Regexp
      @doc.css(selector).each do |tag|
        # store strings here as well, matching the attribute branch
        tag.content.scan(attribute) {|match| array << @absolutize.url(match[0]).to_s}
      end
    end
  end

  private

  # Collects the unique urls for one configured group.
  def links_for(group)
    links = []
    @options[:tags][group].each do |selector, attribute|
      find_matches(links, selector, attribute)
    end
    links.uniq
  end

end
|
71
|
+
|
data/lib/crawl_job.rb
ADDED
@@ -0,0 +1,71 @@
|
|
1
|
+
# Resque job that retrieves one url, records it as crawled, queues further
# CrawlJobs for the links found under the crawl's base url, and hands the
# retrieved content to the configured processing queue.
class CrawlJob

  require "net/https"
  require "uri"
  require "redis"
  require "absolutize"

  @queue = :cobweb_crawl_job

  # content_request - hash of crawl options plus :crawl_id and :url.
  #   :crawl_limit is optional; when absent the crawl is not limited.
  #   :base_url is optional; the requested url is used when it is absent.
  def self.perform(content_request)
    # change all hash keys to symbols (modifies content_request in place)
    content_request.deep_symbolize_keys
    redis = NamespacedRedis.new(Redis.new, "cobweb-#{content_request[:crawl_id]}")
    @absolutize = Absolutize.new(content_request[:url], :output_debug => false, :raise_exceptions => false, :force_escaping => false, :remove_anchors => true)

    debug = content_request[:debug]
    crawl_limit = content_request[:crawl_limit]

    # check we haven't crawled this url before
    if redis.sismember "crawled", content_request[:url]
      puts "Already crawled #{content_request[:url]}" if debug
      return
    end

    # increment counter and check we haven't hit our crawl limit
    crawl_counter = redis.incr("crawl-counter").to_i
    queue_counter = redis.get("queue-counter").to_i
    if crawl_limit && crawl_counter > crawl_limit.to_i
      puts "Crawl Limit Exceeded by #{crawl_counter - crawl_limit.to_i} objects" if debug
      return
    end

    # get is an instance method, so build a crawler from a copy of the request
    content = CobWeb.new(content_request.dup).get(content_request[:url])
    redis.sadd "crawled", content_request[:url]
    set_base_url redis, content, (content_request[:base_url] || content_request[:url])

    if crawl_limit.nil? || queue_counter <= crawl_limit.to_i
      base_url = redis.get("base_url").to_s
      content[:links].keys.map{|key| content[:links][key]}.flatten.each do |link|
        link = link.to_s
        next if redis.sismember "crawled", link
        # plain prefix test: the base url is data, not a regular expression
        next unless link.start_with?(base_url)

        puts "Queueing #{link}" if debug
        new_request = content_request.clone
        new_request[:url] = link
        new_request[:parent] = content_request[:url]
        Resque.enqueue(CrawlJob, new_request)
        redis.incr "queue-counter"
      end
    end

    # enqueue to processing queue; the value may be a class or a class name
    Resque.enqueue(const_get(content_request[:processing_queue].to_s), content)
    puts "#{content_request[:url]} has been sent for processing." if debug
    puts "Crawled: #{crawl_counter} Limit: #{content_request[:crawl_limit]} Queued: #{queue_counter}" if debug
  end

  # Stores the crawl's base url in redis the first time it is called for a
  # crawl. When the first response is a redirect, the redirect target is
  # stored instead of base_url.
  def self.set_base_url(redis, content, base_url)
    return unless redis.get("base_url").nil?

    if content[:status_code] >= 300 && content[:status_code] < 400
      #redirect received for first url
      redirected_to = @absolutize.url(content[:location]).to_s
      redis.set("base_url", redirected_to)
      puts "Warning: base_url given redirects to another location, setting base_url to #{redirected_to}"
    else
      redis.set("base_url", base_url)
    end
  end
  # a bare `private` does not apply to `def self.` methods
  private_class_method :set_base_url

end
|
@@ -0,0 +1,52 @@
|
|
1
|
+
# Thin wrapper around a redis client that prefixes every key with
# "<namespace>-" before delegating to the client.
class NamespacedRedis
  # redis     - the underlying redis client; must not be nil
  # namespace - prefix applied to every key
  def initialize(redis, namespace="")
    raise "redis must be supplied" if redis.nil?
    @redis = redis
    @namespace = namespace
  end

  def sismember(key, member)
    @redis.sismember namespaced(key), member
  end

  def sadd(key, value)
    @redis.sadd namespaced(key), value
  end

  def get(key)
    @redis.get namespaced(key)
  end

  def incr(key)
    @redis.incr namespaced(key)
  end

  # Returns the client's answer to whether the namespaced key exists.
  # The redis command is EXISTS, so this delegates to the client's
  # `exists?` when it has one and to `exists` otherwise.
  def exist(key)
    full_key = namespaced(key)
    if @redis.respond_to?(:exists?)
      @redis.exists?(full_key)
    else
      @redis.exists(full_key)
    end
  end

  def set(key, value)
    @redis.set namespaced(key), value
  end

  def del(key)
    @redis.del namespaced(key)
  end

  # value is passed to the client as the expiry for the namespaced key.
  def expire(key, value)
    @redis.expire namespaced(key), value
  end

  # Returns key with the namespace prefix applied.
  def namespaced(key)
    "#{@namespace}-#{key}"
  end

  # Returns the wrapped redis client.
  def native
    @redis
  end

  def namespace
    @namespace
  end

end
|
@@ -0,0 +1,189 @@
|
|
1
|
+
require File.expand_path(File.dirname(__FILE__) + '/../spec_helper')
|
2
|
+
|
3
|
+
# Specs for CobWeb#get. The "with mock" group replaces Net::HTTP with mocks so
# no network access is needed; the "without mock" group exercises real urls.
describe CobWeb do

  before(:each) do

    @base_url = "http://www.baseurl.com/"

    @default_headers = {"Cache-Control" => "private, max-age=0",
                        "Date" => "Wed, 10 Nov 2010 09:06:17 GMT",
                        "Expires" => "-1",
                        "Content-Type" => "text/html; charset=UTF-8",
                        "Content-Encoding" => "gzip",
                        "Transfer-Encoding" => "chunked",
                        "Server" => "gws",
                        "X-XSS-Protection" => "1; mode=block"}

    @cobweb = CobWeb.new :quiet => true
  end

  describe "with mock" do
    before(:each) do
      @mock_http_client = mock(Net::HTTP)
      @mock_http_request = mock(Net::HTTPRequest)
      @mock_http_redirect_request = mock(Net::HTTPRequest)
      @mock_http_redirect_request2 = mock(Net::HTTPRequest)

      @mock_http_response = mock(Net::HTTPResponse)
      @mock_http_redirect_response = mock(Net::HTTPRedirection)
      # second hop of the redirect chain; must exist before it is stubbed
      @mock_http_redirect_response2 = mock(Net::HTTPRedirection)
      @mock_http_get = mock(Net::HTTP::Get)

      Net::HTTP.stub!(:new).and_return(@mock_http_client)
      Net::HTTP::Get.stub!(:new).and_return(@mock_http_request)
      Net::HTTP::Get.stub!(:new).with("/redirect.html").and_return(@mock_http_redirect_request)
      Net::HTTP::Get.stub!(:new).with("/redirect2.html").and_return(@mock_http_redirect_request2)

      @mock_http_client.stub!(:request).with(@mock_http_request).and_return(@mock_http_response)
      @mock_http_client.stub!(:request).with(@mock_http_redirect_request).and_return(@mock_http_redirect_response)
      @mock_http_client.stub!(:request).with(@mock_http_redirect_request2).and_return(@mock_http_redirect_response2)

      @mock_http_response.stub!(:code).and_return(200)
      @mock_http_response.stub!(:content_type).and_return("text/html")
      @mock_http_response.stub!(:[]).with("Content-Type").and_return(@default_headers["Content-Type"])
      @mock_http_response.stub!(:[]).with("location").and_return(@default_headers["location"])
      @mock_http_response.stub!(:content_length).and_return(1024)
      @mock_http_response.stub!(:body).and_return("asdf")
      @mock_http_response.stub!(:to_hash).and_return(@default_headers)

      @mock_http_redirect_response.stub!(:code).and_return(301)
      @mock_http_redirect_response.stub!(:content_type).and_return("text/xml")
      @mock_http_redirect_response.stub!(:[]).with("Content-Type").and_return(@default_headers["Content-Type"])
      @mock_http_redirect_response.stub!(:[]).with("location").and_return("http://redirected-to.com/redirect2.html")
      @mock_http_redirect_response.stub!(:content_length).and_return(2048)
      @mock_http_redirect_response.stub!(:body).and_return("redirected body")
      @mock_http_redirect_response.stub!(:to_hash).and_return(@default_headers)

      @mock_http_redirect_response2.stub!(:code).and_return(301)
      @mock_http_redirect_response2.stub!(:content_type).and_return("text/xml")
      @mock_http_redirect_response2.stub!(:[]).with("Content-Type").and_return(@default_headers["Content-Type"])
      @mock_http_redirect_response2.stub!(:[]).with("location").and_return("http://redirected-to.com/redirected.html")
      @mock_http_redirect_response2.stub!(:content_length).and_return(2048)
      @mock_http_redirect_response2.stub!(:body).and_return("redirected body")
      @mock_http_redirect_response2.stub!(:to_hash).and_return(@default_headers)

    end

    it "should generate a cobweb object" do
      CobWeb.new.should be_an_instance_of CobWeb
    end

    describe "get" do
      it "should return a hash with default values" do
        @cobweb.get(@base_url).should be_an_instance_of Hash
      end

      it "should return a hash with default values without quiet option" do
        # uses a crawler built without :quiet, as the example name states
        CobWeb.new.get(@base_url).should be_an_instance_of Hash
      end

      it "should raise exception if there is no url" do
        lambda {@cobweb.get(nil)}.should raise_error("url cannot be nil")
      end

      describe "content object" do

        it "should return the url" do
          @cobweb.get(@base_url)[:url].should == @base_url
        end

        it "should return correct content-types" do
          @mock_http_response.stub!(:content_type).and_return("image/jpeg")
          @cobweb.get(@base_url)[:content_type].should == "image/jpeg"
        end

        it "should return correct status-code" do
          @mock_http_response.stub!(:code).and_return(404)
          @cobweb.get(@base_url)[:status_code].should == 404
        end

        it "should return correct character_set" do
          @cobweb.get(@base_url)[:character_set].should == "UTF-8"
        end
        it "should return correct content_length" do
          @cobweb.get(@base_url)[:content_length].should == 1024
        end
        it "should return correct content_body" do
          @cobweb.get(@base_url)[:content_body].should == "asdf"
        end
        it "should return correct location" do
          @cobweb.get(@base_url)[:location].should == nil

          @mock_http_response.stub!(:[]).with("location").and_return("http://google.com/")
          @cobweb.get(@base_url)[:location].should == "http://google.com/"
        end
        it "should return correct headers" do
          @cobweb.get(@base_url)[:headers].should == @default_headers
        end
        it "should return correct a hash of links" do
          @cobweb.get(@base_url)[:links].should be_an_instance_of Hash
        end
        it "should return the response time for the url" do
          @cobweb.get(@base_url)[:response_time].should be_an_instance_of Float
        end

      end
      describe "with redirect" do

        before(:each) do
          @base_url = "http://redirect-me.com/redirect.html"
          @cobweb = CobWeb.new(:follow_redirects => true, :quiet => true)
        end

        it "should flow through redirect" do

          @mock_http_client.should_receive(:request).with(@mock_http_redirect_request).and_return(@mock_http_redirect_response)

          content = @cobweb.get(@base_url)
          content.should be_an_instance_of Hash

          content[:url].should == "http://redirect-me.com/redirect.html"
          content[:redirect_through].length.should == 2
          content[:content_type].should == "text/html"
          content[:content_body].should == "asdf"

        end
        it "should return the path followed" do
          @mock_http_client.should_receive(:request).with(@mock_http_redirect_request).and_return(@mock_http_redirect_response)

          content = @cobweb.get(@base_url)
          content[:redirect_through].should == ["http://redirected-to.com/redirect2.html", "http://redirected-to.com/redirected.html"]

        end
        it "should not follow with redirect disabled" do
          @cobweb = CobWeb.new(:follow_redirects => false)
          @mock_http_client.should_receive(:request).with(@mock_http_redirect_request).and_return(@mock_http_redirect_response)

          content = @cobweb.get(@base_url)
          content[:url].should == "http://redirect-me.com/redirect.html"
          content[:redirect_through].should be_nil
          content[:status_code].should == 301
          content[:content_type].should == "text/xml"
          content[:content_body].should == "redirected body"

        end
      end
    end
  end

  describe "without mock" do
    it "should throw invalid url exception for an invalid url" do
      lambda {@cobweb.get("asdgas asv\"£%\"^%&*%")}.should raise_error URI::InvalidURIError
    end

    # pending: no body yet, so rspec reports this example as not implemented
    it "should throw exception when server is unavailable"

    # NOTE(review): this example performs a real http request to test.com
    it "should return a valid content hash when url doesn't exist on a live server" do
      status_code = @cobweb.get("http://test.com/laskdjflsdajf")[:status_code]
      status_code.should == 404
    end

  end
end
|
@@ -0,0 +1,104 @@
|
|
1
|
+
require File.expand_path(File.dirname(__FILE__) + '/../spec_helper')
|
2
|
+
require File.expand_path(File.dirname(__FILE__) + '/../../lib/content_link_parser.rb')
|
3
|
+
|
4
|
+
# Specs for ContentLinkParser, run against spec/samples/sample_html_links.html.
describe ContentLinkParser do

  before(:each) do
    @base_url = "http://www.baseurl.com/"
    @content = File.read(File.dirname(__FILE__) + "/../samples/sample_html_links.html")
    @content_parser = ContentLinkParser.new("http://sample-links.com/", @content)
  end

  it "should load the sample document" do
    @content.should_not be_nil
    @content.should_not be_empty
  end

  it "should create a content link parser" do
    @content_parser.should_not be_nil
    @content_parser.should be_an_instance_of ContentLinkParser
  end

  describe "using default tags" do
    # [group label, wording used in the example name, parser method, expected count]
    [
      ["general", "",         :links,   4],
      ["image",   "image ",   :images,  1],
      ["related", "related ", :related, 2],
      ["script",  "script ",  :scripts, 1],
      ["style",   "style ",   :styles,  3]
    ].each do |label, wording, group, expected_count|
      describe "returning #{label} links" do
        it "should return some #{wording}links from the sample data" do
          found = @content_parser.send(group)
          found.should_not be_nil
          found.should_not be_empty
        end
        it "should return the correct links" do
          @content_parser.send(group).length.should == expected_count
        end
      end
    end

    describe "returning unknown link type" do
      it "should return an empty array" do
        found = @content_parser.asdfasdfsadf
        found.should_not be_nil
        found.should be_an_instance_of Array
      end
    end
  end

  describe "returning all link data" do
    it "should return a hash with all link data" do
      everything = @content_parser.link_data
      everything.should_not be_nil
      everything.should be_an_instance_of Hash

      everything.keys.length.should == 5
      everything[:links].length.should == 4
    end
  end

  describe "ignoring default tags" do
    it "should not return any links" do
      bare_parser = ContentLinkParser.new("http://sample-links.com", @content, :ignore_default_tags => true)
      bare_parser.links.should be_empty
    end
  end
end
|
@@ -0,0 +1,24 @@
|
|
1
|
+
require File.expand_path(File.dirname(__FILE__) + '/../spec_helper')
|
2
|
+
|
3
|
+
|
4
|
+
# Placeholder spec for CrawlJob; it currently only checks that a crawler
# object can be built.
describe CrawlJob do

  before(:each) do
    @base_url = "http://www.baseurl.com/"

    # CobWeb.new takes an options hash; no network access is needed here
    @cobweb = CobWeb.new(:quiet => true)
  end

  it "should be a cobweb type" do
    @cobweb.should be_an_instance_of CobWeb
  end

end
|
@@ -0,0 +1,34 @@
|
|
1
|
+
<html>
|
2
|
+
<head>
|
3
|
+
<title>Sample HTML Document With all types of links</title>
|
4
|
+
<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1">
|
5
|
+
<meta name="description" content="Information for people running web search indexing robots, and web site managers attempting to understand what's going on when a robot visits their site.">
|
6
|
+
<meta name="keywords" content="robots, crawlers, crawling, spiders, index, indexing, indexers, gatherers, search engines, searching, FAQ, checklist">
|
7
|
+
<meta name="DC.date.modified" content="2003-07-18">
|
8
|
+
<meta http-equiv="refresh" content="http://sampleurl-metarefresh.com/"/>
|
9
|
+
|
10
|
+
<link rel="stylesheet" type="text/css" href="http://sampleurl-linkcss/" />
|
11
|
+
<link rel="home" type="text/html" href="http://sampleurl-linkhome/" />
|
12
|
+
<script type="text/javascript" src="script.js"></script>
|
13
|
+
|
14
|
+
<STYLE TYPE="text/css" MEDIA="screen, projection">
|
15
|
+
<!--
|
16
|
+
@import url(http://www.htmlhelp.com/style.css);
|
17
|
+
@import url(/stylesheets/punk.css);
|
18
|
+
DT { background: yellow; color: black }
|
19
|
+
-->
|
20
|
+
</STYLE>
|
21
|
+
|
22
|
+
</head>
|
23
|
+
|
24
|
+
<body bgcolor="#FFFFFF"><!-- #BeginLibraryItem "/Library/navtop.lbi" --></p>
|
25
|
+
|
26
|
+
<a href="http://sampleurl-a.com/">Click Here for Sample URL 1</a>
|
27
|
+
<frameset><frame src="http://sampleurl-frame.com/"></frame></frameset>
|
28
|
+
|
29
|
+
<map id="testmap"><area href="http://sampleurl-area"></area>></map>
|
30
|
+
|
31
|
+
<img src="http://sampleurl-img/"/>
|
32
|
+
|
33
|
+
</body>
|
34
|
+
</html>
|
data/spec/spec.opts
ADDED
data/spec/spec_helper.rb
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
require File.expand_path(File.dirname(__FILE__) + '/../lib/cobweb')
|
metadata
ADDED
@@ -0,0 +1,133 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: cobweb
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
hash: 29
|
5
|
+
prerelease: false
|
6
|
+
segments:
|
7
|
+
- 0
|
8
|
+
- 0
|
9
|
+
- 1
|
10
|
+
version: 0.0.1
|
11
|
+
platform: ruby
|
12
|
+
authors:
|
13
|
+
- Stewart McKee
|
14
|
+
autorequire:
|
15
|
+
bindir: bin
|
16
|
+
cert_chain: []
|
17
|
+
|
18
|
+
date: 2010-11-10 00:00:00 +00:00
|
19
|
+
default_executable:
|
20
|
+
dependencies:
|
21
|
+
- !ruby/object:Gem::Dependency
|
22
|
+
name: resque
|
23
|
+
prerelease: false
|
24
|
+
requirement: &id001 !ruby/object:Gem::Requirement
|
25
|
+
none: false
|
26
|
+
requirements:
|
27
|
+
- - ">="
|
28
|
+
- !ruby/object:Gem::Version
|
29
|
+
hash: 3
|
30
|
+
segments:
|
31
|
+
- 0
|
32
|
+
version: "0"
|
33
|
+
type: :runtime
|
34
|
+
version_requirements: *id001
|
35
|
+
- !ruby/object:Gem::Dependency
|
36
|
+
name: redis
|
37
|
+
prerelease: false
|
38
|
+
requirement: &id002 !ruby/object:Gem::Requirement
|
39
|
+
none: false
|
40
|
+
requirements:
|
41
|
+
- - ">="
|
42
|
+
- !ruby/object:Gem::Version
|
43
|
+
hash: 3
|
44
|
+
segments:
|
45
|
+
- 0
|
46
|
+
version: "0"
|
47
|
+
type: :runtime
|
48
|
+
version_requirements: *id002
|
49
|
+
- !ruby/object:Gem::Dependency
|
50
|
+
name: absolutize
|
51
|
+
prerelease: false
|
52
|
+
requirement: &id003 !ruby/object:Gem::Requirement
|
53
|
+
none: false
|
54
|
+
requirements:
|
55
|
+
- - ">="
|
56
|
+
- !ruby/object:Gem::Version
|
57
|
+
hash: 3
|
58
|
+
segments:
|
59
|
+
- 0
|
60
|
+
version: "0"
|
61
|
+
type: :runtime
|
62
|
+
version_requirements: *id003
|
63
|
+
- !ruby/object:Gem::Dependency
|
64
|
+
name: nokogiri
|
65
|
+
prerelease: false
|
66
|
+
requirement: &id004 !ruby/object:Gem::Requirement
|
67
|
+
none: false
|
68
|
+
requirements:
|
69
|
+
- - ">="
|
70
|
+
- !ruby/object:Gem::Version
|
71
|
+
hash: 3
|
72
|
+
segments:
|
73
|
+
- 0
|
74
|
+
version: "0"
|
75
|
+
type: :runtime
|
76
|
+
version_requirements: *id004
|
77
|
+
description:
|
78
|
+
email: stewart@rockwellcottage.com
|
79
|
+
executables: []
|
80
|
+
|
81
|
+
extensions: []
|
82
|
+
|
83
|
+
extra_rdoc_files:
|
84
|
+
- README.textile
|
85
|
+
files:
|
86
|
+
- spec/samples/sample_html_links.html
|
87
|
+
- spec/spec.opts
|
88
|
+
- spec/spec_helper.rb
|
89
|
+
- spec/cobweb/content_link_parser_spec.rb
|
90
|
+
- spec/cobweb/cobweb_spec.rb
|
91
|
+
- spec/cobweb/crawl_job_spec.rb
|
92
|
+
- lib/namespaced_redis.rb
|
93
|
+
- lib/cobweb.rb
|
94
|
+
- lib/content_process_job.rb
|
95
|
+
- lib/content_link_parser.rb
|
96
|
+
- lib/crawl_job.rb
|
97
|
+
- README.textile
|
98
|
+
has_rdoc: false
|
99
|
+
homepage: http://github.com/stewartmckee/cobweb
|
100
|
+
licenses: []
|
101
|
+
|
102
|
+
post_install_message:
|
103
|
+
rdoc_options: []
|
104
|
+
|
105
|
+
require_paths:
|
106
|
+
- lib
|
107
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
108
|
+
none: false
|
109
|
+
requirements:
|
110
|
+
- - ">="
|
111
|
+
- !ruby/object:Gem::Version
|
112
|
+
hash: 3
|
113
|
+
segments:
|
114
|
+
- 0
|
115
|
+
version: "0"
|
116
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
117
|
+
none: false
|
118
|
+
requirements:
|
119
|
+
- - ">="
|
120
|
+
- !ruby/object:Gem::Version
|
121
|
+
hash: 3
|
122
|
+
segments:
|
123
|
+
- 0
|
124
|
+
version: "0"
|
125
|
+
requirements: []
|
126
|
+
|
127
|
+
rubyforge_project:
|
128
|
+
rubygems_version: 1.3.7
|
129
|
+
signing_key:
|
130
|
+
specification_version: 3
|
131
|
+
summary: Crawler utilizing resque
|
132
|
+
test_files: []
|
133
|
+
|