cobweb 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.textile +63 -0
- data/lib/cobweb.rb +130 -0
- data/lib/content_link_parser.rb +71 -0
- data/lib/content_process_job.rb +13 -0
- data/lib/crawl_job.rb +71 -0
- data/lib/namespaced_redis.rb +52 -0
- data/spec/cobweb/cobweb_spec.rb +189 -0
- data/spec/cobweb/content_link_parser_spec.rb +104 -0
- data/spec/cobweb/crawl_job_spec.rb +24 -0
- data/spec/samples/sample_html_links.html +34 -0
- data/spec/spec.opts +2 -0
- data/spec/spec_helper.rb +1 -0
- metadata +133 -0
data/README.textile
ADDED
@@ -0,0 +1,63 @@
|
|
1
|
+
|
2
|
+
h1. Cobweb v0.0.1
|
3
|
+
|
4
|
+
h2. Intro
|
5
|
+
|
6
|
+
Crawler that utilises resque jobs to perform the crawl allowing clustering of crawls.
|
7
|
+
|
8
|
+
h2. Installation
|
9
|
+
|
10
|
+
Install crawler as a gem
|
11
|
+
|
12
|
+
bq. gem install cobweb
|
13
|
+
|
14
|
+
h2. Usage
|
15
|
+
|
16
|
+
h4. new(options)
|
17
|
+
|
18
|
+
Creates a new crawler object configured with the given options
|
19
|
+
|
20
|
+
* options - Options are passed in as a hash:
|
21
|
+
|
22
|
+
** :follow_redirects - transparently follows redirects and populates the :redirect_through key in the content hash(true)
|
23
|
+
** :redirect_limit - sets the maximum number of consecutive redirects to follow (10)
|
24
|
+
** :processing_queue - specifies the processing queue for content to be sent to (ContentProcessJob)
|
25
|
+
** :debug - enables debug output (false)
|
26
|
+
** :quiet - hides default output (false)
|
27
|
+
** :cache - if set, enables the cache and sets the ttl (300)
|
28
|
+
|
29
|
+
bq. crawler = CobWeb.new(:follow_redirects => false)
|
30
|
+
|
31
|
+
|
32
|
+
h4. start(base_url)
|
33
|
+
|
34
|
+
* base_url - the url to start the crawl from
|
35
|
+
|
36
|
+
h4. get(url)
|
37
|
+
|
38
|
+
* url - url requested
|
39
|
+
|
40
|
+
|
41
|
+
h2. License
|
42
|
+
|
43
|
+
h3. The MIT License
|
44
|
+
|
45
|
+
Copyright (c) 2010 6Central Limited
|
46
|
+
|
47
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
48
|
+
of this software and associated documentation files (the "Software"), to deal
|
49
|
+
in the Software without restriction, including without limitation the rights
|
50
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
51
|
+
copies of the Software, and to permit persons to whom the Software is
|
52
|
+
furnished to do so, subject to the following conditions:
|
53
|
+
|
54
|
+
The above copyright notice and this permission notice shall be included in
|
55
|
+
all copies or substantial portions of the Software.
|
56
|
+
|
57
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
58
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
59
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
60
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
61
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
62
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
63
|
+
THE SOFTWARE.
|
data/lib/cobweb.rb
ADDED
@@ -0,0 +1,130 @@
|
|
1
|
+
require 'rubygems'
|
2
|
+
require 'uri'
|
3
|
+
require 'resque'
|
4
|
+
require 'digest/sha1'
|
5
|
+
|
6
|
+
Dir[File.dirname(__FILE__) + '/*.rb'].each do |file|
|
7
|
+
require [File.dirname(__FILE__), File.basename(file, File.extname(file))].join("/")
|
8
|
+
end
|
9
|
+
|
10
|
+
# Crawler front end: stores the crawl options, enqueues crawl jobs on Resque
# and retrieves single urls (optionally through a Redis-backed cache).
class CobWeb

  # Builds a crawler, filling in defaults for options that were left unset.
  #
  # options - Hash of settings (stored and updated in place). Keys read here:
  #   :follow_redirects - follow 3xx responses (default true)
  #   :redirect_limit   - maximum number of redirects followed per #get (default 10)
  #   :processing_queue - job that receives retrieved content (default ContentProcessJob)
  #   :debug            - debug flag (default false)
  #   :quiet            - when truthy, suppresses progress output
  #   :cache            - when truthy, caches content in Redis; its integer
  #                       value is used as the ttl in seconds
  def initialize(options = {})
    @options = options
    @options[:follow_redirects] = true if @options[:follow_redirects].nil?
    @options[:redirect_limit] = 10 if @options[:redirect_limit].nil?
    @options[:processing_queue] = ContentProcessJob if @options[:processing_queue].nil?
    @options[:debug] = false unless @options[:debug]
  end

  # Enqueues a CrawlJob for base_url on Resque.
  #
  # base_url - the url to start the crawl from.
  #
  # Returns whatever Resque.enqueue returns.
  # Raises RuntimeError when base_url is nil/false.
  def start(base_url)
    raise ":base_url is required" unless base_url
    request = {
      # Sub-second timestamp plus the url, so two crawls started within the
      # same second do not end up sharing one crawl id.
      :crawl_id => Digest::SHA1.hexdigest("#{Time.now.to_f}-#{base_url}"),
      :url => base_url,
      # CrawlJob reads :base_url to decide which links belong to the crawl;
      # an explicit :base_url in the options still wins through the merge.
      :base_url => base_url
    }

    request.merge!(@options)

    Resque.enqueue(CrawlJob, request)
  end

  # Retrieves url and returns a content hash describing the response.
  #
  # url            - String url to request.
  # redirect_limit - how many more redirects may be followed (defaults to the
  #                  :redirect_limit option). Once it reaches zero the
  #                  redirect response itself is returned.
  #
  # Returns a Hash with :url, :status_code, :content_type, :character_set,
  # :content_length, :content_body, :location, :headers, :links and
  # :response_time; :redirect_through lists the Location values followed.
  # Raises RuntimeError when url is nil; URI::InvalidURIError for a bad url.
  def get(url, redirect_limit = @options[:redirect_limit])
    raise "url cannot be nil" if url.nil?

    # get the unique id for this request
    unique_id = Digest::SHA1.hexdigest(url)

    # connect to redis
    redis = NamespacedRedis.new(Redis.new, "cobweb")

    # only consult redis when caching is actually enabled
    if @options[:cache]
      cached = redis.get(unique_id)
      if cached
        puts "Cache hit for #{url}" unless @options[:quiet]
        return JSON.parse(cached).deep_symbolize_keys
      end
    end

    print "Retrieving #{url}... " unless @options[:quiet]
    uri = URI.parse(url)

    # retrieve data
    http = Net::HTTP.new(uri.host, uri.port)
    if uri.scheme == "https"
      http.use_ssl = true
      # NOTE(review): certificate verification is disabled here, which allows
      # man-in-the-middle interception. Left unchanged so existing crawls keep
      # working; consider making this configurable.
      http.verify_mode = OpenSSL::SSL::VERIFY_NONE
    end

    request_time = Time.now.to_f
    response = http.request(Net::HTTP::Get.new(uri.request_uri))
    status_code = response.code.to_i
    location = response["location"]

    if follow_redirect?(status_code, location, redirect_limit)
      puts "redirected... " unless @options[:quiet]
      content = get(redirect_target(uri, location), redirect_limit.to_i - 1)
      # report the url that was originally asked for, plus the path taken
      content[:url] = uri.to_s
      content[:redirect_through] ||= []
      content[:redirect_through].insert(0, location)
      content[:response_time] = Time.now.to_f - request_time
      return content
    end

    content = {}
    content[:response_time] = Time.now.to_f - request_time

    puts "Retrieved." unless @options[:quiet]

    # create the content container
    content[:url] = uri.to_s
    content[:status_code] = status_code
    content[:content_type] = response.content_type
    content[:character_set] = charset_from(response["Content-Type"])
    content[:content_length] = response.content_length
    content[:content_body] = response.body
    content[:location] = location
    content[:headers] = response.to_hash.symbolize_keys

    # parse data for links
    content[:links] = ContentLinkParser.new(content[:url], content[:content_body]).link_data

    # add content to cache if required; the :cache option doubles as the ttl
    if @options[:cache]
      redis.set(unique_id, content.to_json)
      redis.expire unique_id, @options[:cache].to_i
    end

    content
  end

  private

  # True when the response is a 3xx with a Location header, redirects are
  # enabled and the redirect budget is not yet used up.
  def follow_redirect?(status_code, location, redirect_limit)
    return false unless @options[:follow_redirects]
    return false if location.nil?
    return false unless redirect_limit.to_i > 0
    status_code >= 300 && status_code < 400
  end

  # Absolute Location values are used untouched; relative ones are resolved
  # against the url that produced the redirect.
  def redirect_target(uri, location)
    URI.parse(location).relative? ? URI.join(uri.to_s, location).to_s : location
  end

  # Extracts the charset parameter from a Content-Type header value, or nil
  # when the header is missing or carries no charset.
  def charset_from(content_type_header)
    return nil if content_type_header.nil?
    content_type_header[/charset\s*=\s*"?([^";\s]+)/i, 1]
  end
end
|
108
|
+
|
109
|
+
## add symbolize methods to hash
|
110
|
+
class Hash
  # Replaces every String key of this hash with the equivalent Symbol,
  # modifying the hash in place. Non-String keys are left alone.
  #
  # Returns self.
  def symbolize_keys
    # `keys` is a snapshot array, so changing the hash inside the loop is safe
    keys.each do |key|
      next unless key.is_a?(String)
      self[key.to_sym] = delete(key)
    end
    self
  end

  # Like #symbolize_keys, but also applied to every hash nested inside this
  # one, including hashes held inside (possibly nested) arrays, which is the
  # shape parsed JSON commonly has.
  #
  # Returns self.
  def deep_symbolize_keys
    symbolize_keys
    pending = values
    until pending.empty?
      item = pending.pop
      case item
      when Hash then item.deep_symbolize_keys
      when Array then pending.concat(item) # look inside arrays for more hashes
      end
    end
    self
  end
end
|
@@ -0,0 +1,71 @@
|
|
1
|
+
|
2
|
+
# Extracts link urls from an HTML document, grouped by configurable "tag"
# categories (links, images, related, scripts, styles by default). Each
# category is also exposed as a method of the same name, e.g. parser.images.
class ContentLinkParser

  require "nokogiri"
  require "absolutize"

  # url     - url the content was retrieved from; used as the base for
  #           relative links unless the document has a <base href>.
  # content - HTML source to parse.
  # options - Hash; keys read here:
  #   :ignore_default_tags - when truthy, drops the built-in categories
  #   :additional_tags     - Hash of category => [[css_selector, attribute_or_regexp], ...]
  #                          merged over the (remaining) categories
  def initialize(url, content, options = {})
    # work on a copy so the caller's hash does not get a :tags key written into it
    @options = options.dup
    @url = url
    @doc = Nokogiri::HTML(content)

    base_url = @url.to_s
    base_tag = @doc.at("base[href]")
    base_url = base_tag.attr("href").to_s if base_tag
    @absolutize = Absolutize.new(base_url, :output_debug => false, :raise_exceptions => false, :force_escaping => false, :remove_anchors => true)

    @options[:tags] = {}
    @options[:tags][:links] = [["a[href]", "href"], ["frame[src]", "src"], ["meta[@http-equiv=\"refresh\"]", "content"], ["link[href]:not([rel])", "href"], ["area[href]", "href"]]
    @options[:tags][:images] = [["img[src]", "src"]]
    @options[:tags][:related] = [["link[rel]", "href"]]
    @options[:tags][:scripts] = [["script[src]", "src"]]
    @options[:tags][:styles] = [["link[rel='stylesheet'][href]", "href"], ["style[@type^='text/css']", /url\("?(.*?)"?\)/]]

    #clear the default tags if required
    @options[:tags] = {} if @options[:ignore_default_tags]
    @options[:tags].merge!(@options[:additional_tags]) unless @options[:additional_tags].nil?
  end

  # Returns a Hash mapping each configured category (as a Symbol) to the
  # array of unique urls found for it.
  def link_data
    data = {}
    @options[:tags].each_key do |key|
      # look the category up directly instead of evaluating its name as code
      data[key.to_sym] = links_for(key)
    end
    data
  end

  # Returns every url from every category as one flat, de-duplicated array.
  def all_links
    link_data.values.flatten.uniq
  end

  # Any argument-less call whose name is a configured category returns that
  # category's urls. Other argument-less calls keep the historical behaviour:
  # a warning is printed and an empty array returned.
  def method_missing(m, *args, &block)
    return super unless args.empty?

    if @options[:tags].key?(m)
      links_for(m)
    else
      puts "Warning: There was no configuration on how to find #{m} links"
      []
    end
  end

  # Reports the configured categories as supported methods.
  def respond_to_missing?(m, include_private = false)
    (@options && @options[:tags] && @options[:tags].key?(m)) || super
  end

  # Appends to array the absolute urls found under selector.
  #
  # attribute - String/Symbol: name of the attribute holding the url, or
  #             Regexp: applied to the tag's text, first capture is the url.
  def find_matches(array, selector, attribute)
    if attribute.kind_of?(String) || attribute.kind_of?(Symbol)
      @doc.css(selector).each do |tag|
        array << @absolutize.url(tag[attribute]).to_s
      end
    elsif attribute.instance_of? Regexp
      @doc.css(selector).each do |tag|
        # store strings here as well, so both branches yield the same type
        tag.content.scan(attribute) { |match| array << @absolutize.url(match[0]).to_s }
      end
    end
  end

  private

  # Unique urls for one configured category key.
  def links_for(key)
    links = []
    @options[:tags][key].each do |selector, attribute|
      find_matches(links, selector, attribute)
    end
    links.uniq
  end

end
|
71
|
+
|
data/lib/crawl_job.rb
ADDED
@@ -0,0 +1,71 @@
|
|
1
|
+
# Resque job that retrieves one url of a crawl, queues further CrawlJobs for
# the in-scope links it finds and hands the content to the processing queue.
# Crawl state (crawled set, counters, base_url) lives in Redis under a
# namespace derived from the crawl id.
class CrawlJob

  require "net/https"
  require "uri"
  require "redis"

  @queue = :cobweb_crawl_job

  # content_request - Hash (String or Symbol keys). Keys read here:
  #   :crawl_id, :url, :base_url, :crawl_limit, :processing_queue, :debug.
  #   A missing :crawl_limit is treated as "no limit".
  def self.perform(content_request)
    # change all hash keys to symbols
    content_request.deep_symbolize_keys
    debug = content_request[:debug]
    crawl_limit = content_request[:crawl_limit]

    redis = NamespacedRedis.new(Redis.new, "cobweb-#{content_request[:crawl_id]}")
    @absolutize = Absolutize.new(content_request[:url], :output_debug => false, :raise_exceptions => false, :force_escaping => false, :remove_anchors => true)

    # check we haven't crawled this url before
    if redis.sismember "crawled", content_request[:url]
      puts "Already crawled #{content_request[:url]}" if debug
      return
    end

    # increment counter and check we haven't hit our crawl limit; the value
    # returned by incr is used so concurrent workers each see their own count
    crawl_counter = redis.incr("crawl-counter").to_i
    queue_counter = redis.get("queue-counter").to_i

    unless within_limit?(crawl_counter, crawl_limit)
      puts "Crawl Limit Exceeded by #{crawl_counter - crawl_limit.to_i} objects" if debug
      return
    end

    # CobWeb#get is an instance method taking the url
    content = CobWeb.new(content_request).get(content_request[:url])
    redis.sadd "crawled", content_request[:url]
    set_base_url redis, content, content_request[:base_url]

    if within_limit?(queue_counter, crawl_limit)
      base_url = redis.get("base_url").to_s
      content[:links].values.flatten.each do |link|
        link = link.to_s
        next if redis.sismember "crawled", link
        # plain prefix test: the base url is data, not a regular expression
        next unless link.start_with?(base_url)

        new_request = content_request.clone
        new_request[:url] = link
        new_request[:parent] = content_request[:url]
        Resque.enqueue(CrawlJob, new_request)
        redis.incr "queue-counter"
      end
    end

    # enqueue to processing queue; the job arrives as a name after the Resque
    # round trip but may be a Class when perform is called directly
    processing_queue = content_request[:processing_queue] || "ContentProcessJob"
    processing_queue = const_get(processing_queue) unless processing_queue.is_a?(Class)
    Resque.enqueue(processing_queue, content)
    puts "#{content_request[:url]} has been sent for processing." if debug
    puts "Crawled: #{crawl_counter} Limit: #{content_request[:crawl_limit]} Queued: #{queue_counter}" if debug
  end

  # Records the crawl's base url in redis the first time it is called for a
  # crawl. When the first response is a redirect, the redirect target becomes
  # the base url. When no base_url was supplied, the root of the retrieved
  # url is used so the crawl stays on that site.
  def self.set_base_url(redis, content, base_url)
    return unless redis.get("base_url").nil?

    if content[:status_code] >= 300 && content[:status_code] < 400
      #redirect received for first url
      redirected_to = @absolutize.url(content[:location]).to_s
      redis.set("base_url", redirected_to)
      puts "Warning: base_url given redirects to another location, setting base_url to #{redirected_to}"
    else
      redis.set("base_url", base_url || URI.join(content[:url], "/").to_s)
    end
  end

  # True when no limit is configured or count has not passed it.
  def self.within_limit?(count, limit)
    limit.nil? || count <= limit.to_i
  end

  # a bare `private` does not apply to `def self.` methods
  private_class_method :set_base_url, :within_limit?

end
|
@@ -0,0 +1,52 @@
|
|
1
|
+
# Thin wrapper around a redis client that prefixes every key with
# "<namespace>-", so several crawls can share one redis database.
class NamespacedRedis
  # redis     - the redis client to delegate to (required).
  # namespace - String prefix for all keys (default "").
  #
  # Raises RuntimeError when redis is nil.
  def initialize(redis, namespace="")
    raise "redis must be supplied" if redis.nil?
    @redis = redis
    @namespace = namespace
  end

  # Is member part of the set stored at key?
  def sismember(key, member)
    @redis.sismember namespaced(key), member
  end

  # Adds value to the set stored at key.
  def sadd(key, value)
    @redis.sadd namespaced(key), value
  end

  # Returns the value stored at key.
  def get(key)
    @redis.get namespaced(key)
  end

  # Increments the counter stored at key and returns the client's result.
  def incr(key)
    @redis.incr namespaced(key)
  end

  # Does key exist? The client has no `exist` method; it exposes EXISTS as
  # `exists`, and newer clients add a boolean `exists?`, preferred when present.
  def exist(key)
    full_key = namespaced(key)
    if @redis.respond_to?(:exists?)
      @redis.exists?(full_key)
    else
      @redis.exists(full_key)
    end
  end

  # Stores value at key.
  def set(key, value)
    @redis.set namespaced(key), value
  end

  # Deletes key.
  def del(key)
    @redis.del namespaced(key)
  end

  # Sets the expiry of key; value is passed straight through to the client.
  def expire(key, value)
    @redis.expire namespaced(key), value
  end

  # Returns key with the namespace prefix applied.
  def namespaced(key)
    "#{@namespace}-#{key}"
  end

  # The wrapped redis client, for un-namespaced access.
  def native
    @redis
  end

  # The namespace given at construction.
  def namespace
    @namespace
  end

end
|
@@ -0,0 +1,189 @@
|
|
1
|
+
require File.expand_path(File.dirname(__FILE__) + '/../spec_helper')
|
2
|
+
|
3
|
+
describe CobWeb do
|
4
|
+
|
5
|
+
before(:each) do
|
6
|
+
|
7
|
+
@base_url = "http://www.baseurl.com/"
|
8
|
+
|
9
|
+
@default_headers = {"Cache-Control" => "private, max-age=0",
|
10
|
+
"Date" => "Wed, 10 Nov 2010 09:06:17 GMT",
|
11
|
+
"Expires" => "-1",
|
12
|
+
"Content-Type" => "text/html; charset=UTF-8",
|
13
|
+
"Content-Encoding" => "gzip",
|
14
|
+
"Transfer-Encoding" => "chunked",
|
15
|
+
"Server" => "gws",
|
16
|
+
"X-XSS-Protection" => "1; mode=block"}
|
17
|
+
|
18
|
+
@cobweb = CobWeb.new :quiet => true
|
19
|
+
end
|
20
|
+
|
21
|
+
describe "with mock" do
|
22
|
+
# Replaces Net::HTTP with mocks: the base url answers 200, "/redirect.html"
# redirects to "/redirect2.html", which in turn redirects to "/redirected.html"
# (served by the default 200 response).
before(:each) do
  @mock_http_client = mock(Net::HTTP)
  @mock_http_request = mock(Net::HTTPRequest)
  @mock_http_redirect_request = mock(Net::HTTPRequest)
  @mock_http_redirect_request2 = mock(Net::HTTPRequest)

  @mock_http_response = mock(Net::HTTPResponse)
  @mock_http_redirect_response = mock(Net::HTTPRedirection)
  # the second redirect response must exist before it is stubbed below;
  # without this line the stubs were being set on nil
  @mock_http_redirect_response2 = mock(Net::HTTPRedirection)
  @mock_http_get = mock(Net::HTTP::Get)

  Net::HTTP.stub!(:new).and_return(@mock_http_client)
  Net::HTTP::Get.stub!(:new).and_return(@mock_http_request)
  Net::HTTP::Get.stub!(:new).with("/redirect.html").and_return(@mock_http_redirect_request)
  Net::HTTP::Get.stub!(:new).with("/redirect2.html").and_return(@mock_http_redirect_request2)

  @mock_http_client.stub!(:request).with(@mock_http_request).and_return(@mock_http_response)
  @mock_http_client.stub!(:request).with(@mock_http_redirect_request).and_return(@mock_http_redirect_response)
  @mock_http_client.stub!(:request).with(@mock_http_redirect_request2).and_return(@mock_http_redirect_response2)

  @mock_http_response.stub!(:code).and_return(200)
  @mock_http_response.stub!(:content_type).and_return("text/html")
  @mock_http_response.stub!(:[]).with("Content-Type").and_return(@default_headers["Content-Type"])
  @mock_http_response.stub!(:[]).with("location").and_return(@default_headers["location"])
  @mock_http_response.stub!(:content_length).and_return(1024)
  @mock_http_response.stub!(:body).and_return("asdf")
  @mock_http_response.stub!(:to_hash).and_return(@default_headers)

  @mock_http_redirect_response.stub!(:code).and_return(301)
  @mock_http_redirect_response.stub!(:content_type).and_return("text/xml")
  @mock_http_redirect_response.stub!(:[]).with("Content-Type").and_return(@default_headers["Content-Type"])
  @mock_http_redirect_response.stub!(:[]).with("location").and_return("http://redirected-to.com/redirect2.html")
  @mock_http_redirect_response.stub!(:content_length).and_return(2048)
  @mock_http_redirect_response.stub!(:body).and_return("redirected body")
  @mock_http_redirect_response.stub!(:to_hash).and_return(@default_headers)

  @mock_http_redirect_response2.stub!(:code).and_return(301)
  @mock_http_redirect_response2.stub!(:content_type).and_return("text/xml")
  @mock_http_redirect_response2.stub!(:[]).with("Content-Type").and_return(@default_headers["Content-Type"])
  @mock_http_redirect_response2.stub!(:[]).with("location").and_return("http://redirected-to.com/redirected.html")
  @mock_http_redirect_response2.stub!(:content_length).and_return(2048)
  @mock_http_redirect_response2.stub!(:body).and_return("redirected body")
  @mock_http_redirect_response2.stub!(:to_hash).and_return(@default_headers)
end
|
66
|
+
|
67
|
+
it "should generate a cobweb object" do
|
68
|
+
CobWeb.new.should be_an_instance_of CobWeb
|
69
|
+
end
|
70
|
+
|
71
|
+
describe "get" do
|
72
|
+
it "should return a hash with default values" do
|
73
|
+
@cobweb.get(@base_url).should be_an_instance_of Hash
|
74
|
+
end
|
75
|
+
|
76
|
+
it "should return a hash with default values without quiet option" do
|
77
|
+
@cobweb.get(@base_url).should be_an_instance_of Hash
|
78
|
+
end
|
79
|
+
|
80
|
+
it "should raise exception if there is no url" do
|
81
|
+
lambda {@cobweb.get(nil)}.should raise_error("url cannot be nil")
|
82
|
+
end
|
83
|
+
|
84
|
+
describe "content object" do
|
85
|
+
|
86
|
+
it "should return the url" do
|
87
|
+
@cobweb.get(@base_url)[:url].should == @base_url
|
88
|
+
end
|
89
|
+
|
90
|
+
it "should return correct content-types" do
|
91
|
+
@mock_http_response.stub!(:content_type).and_return("image/jpeg")
|
92
|
+
@cobweb.get(@base_url)[:content_type].should == "image/jpeg"
|
93
|
+
end
|
94
|
+
|
95
|
+
it "should return correct status-code" do
|
96
|
+
@mock_http_response.stub!(:code).and_return(404)
|
97
|
+
@cobweb.get(@base_url)[:status_code].should == 404
|
98
|
+
end
|
99
|
+
|
100
|
+
it "should return correct status-code" do
|
101
|
+
@mock_http_response.stub!(:code).and_return(404)
|
102
|
+
@cobweb.get(@base_url)[:status_code].should == 404
|
103
|
+
end
|
104
|
+
|
105
|
+
it "should return correct character_set" do
|
106
|
+
@cobweb.get(@base_url)[:character_set].should == "UTF-8"
|
107
|
+
end
|
108
|
+
it "should return correct content_length" do
|
109
|
+
@cobweb.get(@base_url)[:content_length].should == 1024
|
110
|
+
end
|
111
|
+
it "should return correct content_body" do
|
112
|
+
@cobweb.get(@base_url)[:content_body].should == "asdf"
|
113
|
+
end
|
114
|
+
it "should return correct location" do
|
115
|
+
@cobweb.get(@base_url)[:location].should == nil
|
116
|
+
|
117
|
+
@mock_http_response.stub!(:[]).with("location").and_return("http://google.com/")
|
118
|
+
@cobweb.get(@base_url)[:location].should == "http://google.com/"
|
119
|
+
end
|
120
|
+
it "should return correct headers" do
|
121
|
+
@cobweb.get(@base_url)[:headers].should == @default_headers
|
122
|
+
end
|
123
|
+
it "should return correct a hash of links" do
|
124
|
+
@cobweb.get(@base_url)[:links].should be_an_instance_of Hash
|
125
|
+
end
|
126
|
+
it "should return the response time for the url" do
|
127
|
+
@cobweb.get(@base_url)[:response_time].should be_an_instance_of Float
|
128
|
+
end
|
129
|
+
|
130
|
+
end
|
131
|
+
describe "with redirect" do
|
132
|
+
|
133
|
+
before(:each) do
|
134
|
+
@base_url = "http://redirect-me.com/redirect.html"
|
135
|
+
@cobweb = CobWeb.new(:follow_redirects => true, :quiet => true)
|
136
|
+
end
|
137
|
+
|
138
|
+
it "should flow through redirect" do
|
139
|
+
|
140
|
+
@mock_http_client.should_receive(:request).with(@mock_http_redirect_request).and_return(@mock_http_redirect_response)
|
141
|
+
|
142
|
+
content = @cobweb.get(@base_url)
|
143
|
+
content.should be_an_instance_of Hash
|
144
|
+
|
145
|
+
content[:url].should == "http://redirect-me.com/redirect.html"
|
146
|
+
content[:redirect_through].length.should == 2
|
147
|
+
content[:content_type].should == "text/html"
|
148
|
+
content[:content_body].should == "asdf"
|
149
|
+
|
150
|
+
end
|
151
|
+
it "should return the path followed" do
|
152
|
+
@mock_http_client.should_receive(:request).with(@mock_http_redirect_request).and_return(@mock_http_redirect_response)
|
153
|
+
|
154
|
+
content = @cobweb.get(@base_url)
|
155
|
+
content[:redirect_through].should == ["http://redirected-to.com/redirect2.html", "http://redirected-to.com/redirected.html"]
|
156
|
+
|
157
|
+
end
|
158
|
+
it "should not follow with redirect disabled" do
|
159
|
+
@cobweb = CobWeb.new(:follow_redirects => false)
|
160
|
+
@mock_http_client.should_receive(:request).with(@mock_http_redirect_request).and_return(@mock_http_redirect_response)
|
161
|
+
|
162
|
+
content = @cobweb.get(@base_url)
|
163
|
+
content[:url].should == "http://redirect-me.com/redirect.html"
|
164
|
+
content[:redirect_through].should be_nil
|
165
|
+
content[:status_code].should == 301
|
166
|
+
content[:content_type].should == "text/xml"
|
167
|
+
content[:content_body].should == "redirected body"
|
168
|
+
|
169
|
+
end
|
170
|
+
end
|
171
|
+
end
|
172
|
+
end
|
173
|
+
|
174
|
+
describe "without mock" do
|
175
|
+
it "should throw invalid url exception for an invalid url" do
|
176
|
+
lambda {@cobweb.get("asdgas asv\"£%\"^%&*%")}.should raise_error URI::InvalidURIError
|
177
|
+
end
|
178
|
+
|
179
|
+
it "should throw exception when server is unavailable" #do
|
180
|
+
# lambda {@cobweb.get({:url => "http://www.oasdjgoisadjgoisdiog.com"})}.should raise_error URI::InvalidURIError
|
181
|
+
#end
|
182
|
+
|
183
|
+
it "should return a valid content hash when url doesn't exist on a live server" do
|
184
|
+
status_code = @cobweb.get("http://test.com/laskdjflsdajf")[:status_code]
|
185
|
+
status_code.should == 404
|
186
|
+
end
|
187
|
+
|
188
|
+
end
|
189
|
+
end
|
@@ -0,0 +1,104 @@
|
|
1
|
+
require File.expand_path(File.dirname(__FILE__) + '/../spec_helper')
|
2
|
+
require File.expand_path(File.dirname(__FILE__) + '/../../lib/content_link_parser.rb')
|
3
|
+
|
4
|
+
describe ContentLinkParser do
|
5
|
+
|
6
|
+
before(:each) do
|
7
|
+
@base_url = "http://www.baseurl.com/"
|
8
|
+
@content = File.read(File.dirname(__FILE__) + "/../samples/sample_html_links.html")
|
9
|
+
@content_parser = ContentLinkParser.new("http://sample-links.com/", @content)
|
10
|
+
end
|
11
|
+
|
12
|
+
it "should load the sample document" do
|
13
|
+
@content.should_not be_nil
|
14
|
+
@content.should_not be_empty
|
15
|
+
end
|
16
|
+
|
17
|
+
it "should create a content link parser" do
|
18
|
+
@content_parser.should_not be_nil
|
19
|
+
@content_parser.should be_an_instance_of ContentLinkParser
|
20
|
+
end
|
21
|
+
|
22
|
+
describe "using default tags" do
|
23
|
+
describe "returning general links" do
|
24
|
+
it "should return some links from the sample data" do
|
25
|
+
links = @content_parser.links
|
26
|
+
links.should_not be_nil
|
27
|
+
links.should_not be_empty
|
28
|
+
end
|
29
|
+
it "should return the correct links" do
|
30
|
+
links = @content_parser.links
|
31
|
+
links.length.should == 4
|
32
|
+
end
|
33
|
+
end
|
34
|
+
describe "returning image links" do
|
35
|
+
it "should return some image links from the sample data" do
|
36
|
+
links = @content_parser.images
|
37
|
+
links.should_not be_nil
|
38
|
+
links.should_not be_empty
|
39
|
+
end
|
40
|
+
it "should return the correct links" do
|
41
|
+
links = @content_parser.images
|
42
|
+
links.length.should == 1
|
43
|
+
end
|
44
|
+
end
|
45
|
+
describe "returning related links" do
|
46
|
+
it "should return some related links from the sample data" do
|
47
|
+
links = @content_parser.related
|
48
|
+
links.should_not be_nil
|
49
|
+
links.should_not be_empty
|
50
|
+
end
|
51
|
+
it "should return the correct links" do
|
52
|
+
links = @content_parser.related
|
53
|
+
links.length.should == 2
|
54
|
+
end
|
55
|
+
end
|
56
|
+
describe "returning script links" do
|
57
|
+
it "should return some script links from the sample data" do
|
58
|
+
links = @content_parser.scripts
|
59
|
+
links.should_not be_nil
|
60
|
+
links.should_not be_empty
|
61
|
+
end
|
62
|
+
it "should return the correct links" do
|
63
|
+
links = @content_parser.scripts
|
64
|
+
links.length.should == 1
|
65
|
+
end
|
66
|
+
end
|
67
|
+
describe "returning style links" do
|
68
|
+
it "should return some style links from the sample data" do
|
69
|
+
links = @content_parser.styles
|
70
|
+
links.should_not be_nil
|
71
|
+
links.should_not be_empty
|
72
|
+
end
|
73
|
+
it "should return the correct links" do
|
74
|
+
links = @content_parser.styles
|
75
|
+
links.length.should == 3
|
76
|
+
end
|
77
|
+
end
|
78
|
+
describe "returning unknown link type" do
|
79
|
+
it "should return an empty array" do
|
80
|
+
links = @content_parser.asdfasdfsadf
|
81
|
+
links.should_not be_nil
|
82
|
+
links.should be_an_instance_of Array
|
83
|
+
end
|
84
|
+
end
|
85
|
+
end
|
86
|
+
|
87
|
+
describe "returning all link data" do
|
88
|
+
it "should return a hash with all link data" do
|
89
|
+
link_data = @content_parser.link_data
|
90
|
+
link_data.should_not be_nil
|
91
|
+
link_data.should be_an_instance_of Hash
|
92
|
+
|
93
|
+
link_data.keys.length.should == 5
|
94
|
+
link_data[:links].length.should == 4
|
95
|
+
end
|
96
|
+
end
|
97
|
+
|
98
|
+
describe "ignoring default tags" do
|
99
|
+
it "should not return any links" do
|
100
|
+
parser = ContentLinkParser.new("http://sample-links.com", @content, :ignore_default_tags => true)
|
101
|
+
parser.links.should be_empty
|
102
|
+
end
|
103
|
+
end
|
104
|
+
end
|
@@ -0,0 +1,24 @@
|
|
1
|
+
require File.expand_path(File.dirname(__FILE__) + '/../spec_helper')
|
2
|
+
|
3
|
+
|
4
|
+
# Placeholder spec for CrawlJob: for now it only checks that a crawler can be
# built. The previous setup called a non-existent Net::HTTPClient, hit the
# network and passed a String where CobWeb.new expects an options hash.
describe CrawlJob do

  before(:each) do
    @base_url = "http://www.baseurl.com/"

    @cobweb = CobWeb.new(:quiet => true)
  end

  it "should be a cobweb type" do
    @cobweb.should be_an_instance_of CobWeb
  end

end
|
@@ -0,0 +1,34 @@
|
|
1
|
+
<html>
|
2
|
+
<head>
|
3
|
+
<title>Sample HTML Document With all types of links</title>
|
4
|
+
<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1">
|
5
|
+
<meta name="description" content="Information for people running web search indexing robots, and web site managers attempting to understand what's going on when a robot visits their site.">
|
6
|
+
<meta name="keywords" content="robots, crawlers, crawling, spiders, index, indexing, indexers, gatherers, search engines, searching, FAQ, checklist">
|
7
|
+
<meta name="DC.date.modified" content="2003-07-18">
|
8
|
+
<meta http-equiv="refresh" content="http://sampleurl-metarefresh.com/"/>
|
9
|
+
|
10
|
+
<link rel="stylesheet" type="text/css" href="http://sampleurl-linkcss/" />
|
11
|
+
<link rel="home" type="text/html" href="http://sampleurl-linkhome/" />
|
12
|
+
<script type="text/javascript" src="script.js"></script>
|
13
|
+
|
14
|
+
<STYLE TYPE="text/css" MEDIA="screen, projection">
|
15
|
+
<!--
|
16
|
+
@import url(http://www.htmlhelp.com/style.css);
|
17
|
+
@import url(/stylesheets/punk.css);
|
18
|
+
DT { background: yellow; color: black }
|
19
|
+
-->
|
20
|
+
</STYLE>
|
21
|
+
|
22
|
+
</head>
|
23
|
+
|
24
|
+
<body bgcolor="#FFFFFF"><!-- #BeginLibraryItem "/Library/navtop.lbi" --></p>
|
25
|
+
|
26
|
+
<a href="http://sampleurl-a.com/">Click Here for Sample URL 1</a>
|
27
|
+
<frameset><frame src="http://sampleurl-frame.com/"></frame></frameset>
|
28
|
+
|
29
|
+
<map id="testmap"><area href="http://sampleurl-area"></area>></map>
|
30
|
+
|
31
|
+
<img src="http://sampleurl-img/"/>
|
32
|
+
|
33
|
+
</body>
|
34
|
+
</html>
|
data/spec/spec.opts
ADDED
data/spec/spec_helper.rb
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
require File.expand_path(File.dirname(__FILE__) + '/../lib/cobweb')
|
metadata
ADDED
@@ -0,0 +1,133 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: cobweb
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
hash: 29
|
5
|
+
prerelease: false
|
6
|
+
segments:
|
7
|
+
- 0
|
8
|
+
- 0
|
9
|
+
- 1
|
10
|
+
version: 0.0.1
|
11
|
+
platform: ruby
|
12
|
+
authors:
|
13
|
+
- Stewart McKee
|
14
|
+
autorequire:
|
15
|
+
bindir: bin
|
16
|
+
cert_chain: []
|
17
|
+
|
18
|
+
date: 2010-11-10 00:00:00 +00:00
|
19
|
+
default_executable:
|
20
|
+
dependencies:
|
21
|
+
- !ruby/object:Gem::Dependency
|
22
|
+
name: resque
|
23
|
+
prerelease: false
|
24
|
+
requirement: &id001 !ruby/object:Gem::Requirement
|
25
|
+
none: false
|
26
|
+
requirements:
|
27
|
+
- - ">="
|
28
|
+
- !ruby/object:Gem::Version
|
29
|
+
hash: 3
|
30
|
+
segments:
|
31
|
+
- 0
|
32
|
+
version: "0"
|
33
|
+
type: :runtime
|
34
|
+
version_requirements: *id001
|
35
|
+
- !ruby/object:Gem::Dependency
|
36
|
+
name: redis
|
37
|
+
prerelease: false
|
38
|
+
requirement: &id002 !ruby/object:Gem::Requirement
|
39
|
+
none: false
|
40
|
+
requirements:
|
41
|
+
- - ">="
|
42
|
+
- !ruby/object:Gem::Version
|
43
|
+
hash: 3
|
44
|
+
segments:
|
45
|
+
- 0
|
46
|
+
version: "0"
|
47
|
+
type: :runtime
|
48
|
+
version_requirements: *id002
|
49
|
+
- !ruby/object:Gem::Dependency
|
50
|
+
name: absolutize
|
51
|
+
prerelease: false
|
52
|
+
requirement: &id003 !ruby/object:Gem::Requirement
|
53
|
+
none: false
|
54
|
+
requirements:
|
55
|
+
- - ">="
|
56
|
+
- !ruby/object:Gem::Version
|
57
|
+
hash: 3
|
58
|
+
segments:
|
59
|
+
- 0
|
60
|
+
version: "0"
|
61
|
+
type: :runtime
|
62
|
+
version_requirements: *id003
|
63
|
+
- !ruby/object:Gem::Dependency
|
64
|
+
name: nokogiri
|
65
|
+
prerelease: false
|
66
|
+
requirement: &id004 !ruby/object:Gem::Requirement
|
67
|
+
none: false
|
68
|
+
requirements:
|
69
|
+
- - ">="
|
70
|
+
- !ruby/object:Gem::Version
|
71
|
+
hash: 3
|
72
|
+
segments:
|
73
|
+
- 0
|
74
|
+
version: "0"
|
75
|
+
type: :runtime
|
76
|
+
version_requirements: *id004
|
77
|
+
description:
|
78
|
+
email: stewart@rockwellcottage.com
|
79
|
+
executables: []
|
80
|
+
|
81
|
+
extensions: []
|
82
|
+
|
83
|
+
extra_rdoc_files:
|
84
|
+
- README.textile
|
85
|
+
files:
|
86
|
+
- spec/samples/sample_html_links.html
|
87
|
+
- spec/spec.opts
|
88
|
+
- spec/spec_helper.rb
|
89
|
+
- spec/cobweb/content_link_parser_spec.rb
|
90
|
+
- spec/cobweb/cobweb_spec.rb
|
91
|
+
- spec/cobweb/crawl_job_spec.rb
|
92
|
+
- lib/namespaced_redis.rb
|
93
|
+
- lib/cobweb.rb
|
94
|
+
- lib/content_process_job.rb
|
95
|
+
- lib/content_link_parser.rb
|
96
|
+
- lib/crawl_job.rb
|
97
|
+
- README.textile
|
98
|
+
has_rdoc: false
|
99
|
+
homepage: http://github.com/stewartmckee/cobweb
|
100
|
+
licenses: []
|
101
|
+
|
102
|
+
post_install_message:
|
103
|
+
rdoc_options: []
|
104
|
+
|
105
|
+
require_paths:
|
106
|
+
- lib
|
107
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
108
|
+
none: false
|
109
|
+
requirements:
|
110
|
+
- - ">="
|
111
|
+
- !ruby/object:Gem::Version
|
112
|
+
hash: 3
|
113
|
+
segments:
|
114
|
+
- 0
|
115
|
+
version: "0"
|
116
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
117
|
+
none: false
|
118
|
+
requirements:
|
119
|
+
- - ">="
|
120
|
+
- !ruby/object:Gem::Version
|
121
|
+
hash: 3
|
122
|
+
segments:
|
123
|
+
- 0
|
124
|
+
version: "0"
|
125
|
+
requirements: []
|
126
|
+
|
127
|
+
rubyforge_project:
|
128
|
+
rubygems_version: 1.3.7
|
129
|
+
signing_key:
|
130
|
+
specification_version: 3
|
131
|
+
summary: Crawler utilizing resque
|
132
|
+
test_files: []
|
133
|
+
|