cobweb 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
data/README.textile ADDED
@@ -0,0 +1,63 @@
1
+
2
+ h1. Cobweb v0.0.1
3
+
4
+ h2. Intro
5
+
6
+ Crawler that utilises resque jobs to perform the crawl allowing clustering of crawls.
7
+
8
+ h2. Installation
9
+
10
+ Install crawler as a gem
11
+
12
+ bq. gem install cobweb
13
+
14
+ h2. Usage
15
+
16
+ h4. new(options)
17
+
18
+ Creates a new crawler object based on a base_url
19
+
20
+ * options - Options are passed in as a hash,
21
+
22
+ ** :follow_redirects - transparently follows redirects and populates the :redirect_through key in the content hash(true)
23
+ ** :redirect_limit - sets the maximum number of consecutive redirects to follow (10)
24
+ ** :processing_queue - specifies the processing queue for content to be sent to (ContentProcessJob)
25
+ ** :debug - enables debug output (false)
26
+ ** :quiet - hides default output (false)
27
+ ** :cache - if set, enables the cache and sets the ttl (300)
28
+
29
+ bq. crawler = CobWeb.new(:follow_redirects => false)
30
+
31
+
32
+ h4. start(base_url)
33
+
34
+ * base_url - the url to start the crawl from
35
+
36
+ h4. get(url)
37
+
38
+ * url - url requested
39
+
40
+
41
+ h2. License
42
+
43
+ h3. The MIT License
44
+
45
+ Copyright (c) 2010 6Central Limited
46
+
47
+ Permission is hereby granted, free of charge, to any person obtaining a copy
48
+ of this software and associated documentation files (the "Software"), to deal
49
+ in the Software without restriction, including without limitation the rights
50
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
51
+ copies of the Software, and to permit persons to whom the Software is
52
+ furnished to do so, subject to the following conditions:
53
+
54
+ The above copyright notice and this permission notice shall be included in
55
+ all copies or substantial portions of the Software.
56
+
57
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
58
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
59
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
60
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
61
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
62
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
63
+ THE SOFTWARE.
data/lib/cobweb.rb ADDED
@@ -0,0 +1,130 @@
1
+ require 'rubygems'
2
+ require 'uri'
3
+ require 'resque'
4
+ require 'digest/sha1'
5
+
6
+ Dir[File.dirname(__FILE__) + '/*.rb'].each do |file|
7
+ require [File.dirname(__FILE__), File.basename(file, File.extname(file))].join("/")
8
+ end
9
+
10
class CobWeb

  # Crawler front-end.  Enqueues crawl requests onto resque and retrieves
  # individual urls, with optional redis caching and redirect following.
  #
  # Options (all optional):
  #   :follow_redirects  - transparently follow 3xx responses (default: true)
  #   :redirect_limit    - maximum number of consecutive redirects (default: 10)
  #   :processing_queue  - resque job class content is sent to (default: ContentProcessJob)
  #   :debug             - enables debug output (default: false)
  #   :quiet             - hides default output (default: false)
  #   :cache             - when set, caches content in redis with this ttl in seconds
  def initialize(options = {})
    @options = options
    @options[:follow_redirects] = true if @options[:follow_redirects].nil?
    @options[:redirect_limit] = 10 if @options[:redirect_limit].nil?
    @options[:processing_queue] = ContentProcessJob if @options[:processing_queue].nil?
    @options[:debug] = false unless @options[:debug]
  end

  # Starts a crawl from base_url by queueing a CrawlJob carrying a freshly
  # generated :crawl_id plus this crawler's options.
  # Raises if base_url is not supplied.
  def start(base_url)
    raise ":base_url is required" unless base_url
    request = {
      :crawl_id => Digest::SHA1.hexdigest(Time.now.to_s),
      :url => base_url
    }
    request.merge!(@options)
    Resque.enqueue(CrawlJob, request)
  end

  # Retrieves url and returns a content hash (:url, :status_code,
  # :content_type, :character_set, :content_length, :content_body,
  # :location, :headers, :links, :response_time).
  #
  # Follows redirects when :follow_redirects is set, recording the path
  # taken in :redirect_through.  Serves from / populates the redis cache
  # when :cache is set.
  def get(url, redirect_limit = @options[:redirect_limit])
    raise "url cannot be nil" if url.nil?

    # the unique id for this request doubles as the cache key
    unique_id = Digest::SHA1.hexdigest(url)

    # connect to redis
    redis = NamespacedRedis.new(Redis.new, "cobweb")

    content = {}

    # check if it has already been cached
    if redis.get(unique_id) and @options[:cache]
      puts "Cache hit for #{url}" unless @options[:quiet]
      content = JSON.parse(redis.get(unique_id)).deep_symbolize_keys
    else
      # this url is valid for processing so lets get on with it
      print "Retrieving #{url}... " unless @options[:quiet]
      uri = URI.parse(url)

      # retrieve data
      http = Net::HTTP.new(uri.host, uri.port)
      if uri.scheme == "https"
        http.use_ssl = true
        http.verify_mode = OpenSSL::SSL::VERIFY_NONE
      end
      request_time = Time.now.to_f
      request = Net::HTTP::Get.new(uri.request_uri)
      response = http.request(request)

      # BUGFIX: redirect_limit was decremented but never tested, so a
      # redirect cycle recursed without bound; stop following once exhausted
      # and fall through to return the 3xx response itself.
      if @options[:follow_redirects] and redirect_limit > 0 and response.code.to_i >= 300 and response.code.to_i < 400
        puts "redirected... " unless @options[:quiet]
        content = get(response['location'], redirect_limit - 1)
        content[:url] = uri.to_s
        content[:redirect_through] = [] if content[:redirect_through].nil?
        content[:redirect_through].insert(0, response['location'])

        content[:response_time] = Time.now.to_f - request_time
      else
        content[:response_time] = Time.now.to_f - request_time

        puts "Retrieved." unless @options[:quiet]

        # create the content container
        content[:url] = uri.to_s
        content[:status_code] = response.code.to_i
        content[:content_type] = response.content_type
        # pull the charset out of a "type/subtype; charset=..." header
        charset = response["Content-Type"][response["Content-Type"].index(";")+2..-1] if !response["Content-Type"].nil? and response["Content-Type"].include?(";")
        charset = charset[charset.index("=")+1..-1] if charset and charset.include?("=")
        content[:character_set] = charset
        content[:content_length] = response.content_length
        content[:content_body] = response.body
        content[:location] = response["location"]
        content[:headers] = response.to_hash.symbolize_keys

        # parse data for links
        link_parser = ContentLinkParser.new(content[:url], content[:content_body])
        content[:links] = link_parser.link_data

        # add content to cache if required
        if @options[:cache]
          redis.set(unique_id, content.to_json)
          # BUGFIX: was content_request[:cache].to_i — content_request is
          # not defined in this method; the ttl comes from the :cache option
          redis.expire unique_id, @options[:cache].to_i
        end
      end
    end
    content
  end
end
108
+
109
## Hash extensions used by cobweb to normalise string keys to symbols
class Hash
  # Converts every String key in this hash to its Symbol equivalent,
  # in place.  Non-string keys are left untouched.  Returns self.
  def symbolize_keys
    string_keys = keys.select { |k| k.instance_of?(String) }
    string_keys.each { |k| self[k.to_sym] = delete(k) }
    self
  end

  # Symbolizes the keys of this hash and, recursively, of every Hash
  # value it contains.  Returns self.
  def deep_symbolize_keys
    symbolize_keys
    each_value { |v| v.deep_symbolize_keys if v.instance_of?(Hash) }
    self
  end
end
@@ -0,0 +1,71 @@
1
+
2
# Parses an HTML document and extracts absolutized links, grouped into
# categories (:links, :images, :related, :scripts, :styles) that are
# configurable via the :tags option.
class ContentLinkParser

  require "nokogiri"
  require "absolutize"

  # url     - the url the content was retrieved from (used to absolutize)
  # content - the raw HTML body
  # options - :ignore_default_tags to clear the built-in categories,
  #           :additional_tags to merge in extra category selectors
  def initialize(url, content, options = {})
    @options = options
    @url = url
    @doc = Nokogiri::HTML(content)

    # honour a <base href="..."> tag when resolving relative urls
    base_url = @url.to_s
    if @doc.at("base[href]")
      base_url = @doc.at("base[href]").attr("href").to_s
    end
    @absolutize = Absolutize.new(base_url, :output_debug => false, :raise_exceptions => false, :force_escaping => false, :remove_anchors => true)

    # each category maps to [css_selector, attribute-or-regexp] pairs
    @options[:tags] = {}
    @options[:tags][:links] = [["a[href]", "href"], ["frame[src]", "src"], ["meta[@http-equiv=\"refresh\"]", "content"], ["link[href]:not([rel])", "href"], ["area[href]", "href"]]
    @options[:tags][:images] = [["img[src]", "src"]]
    @options[:tags][:related] = [["link[rel]", "href"]]
    @options[:tags][:scripts] = [["script[src]", "src"]]
    @options[:tags][:styles] = [["link[rel='stylesheet'][href]", "href"], ["style[@type^='text/css']", /url\("?(.*?)"?\)/]]

    #clear the default tags if required
    @options[:tags] = {} if @options[:ignore_default_tags]
    @options[:tags].merge!(@options[:additional_tags]) unless @options[:additional_tags].nil?
  end

  # Returns a hash of category => array of unique links.
  def link_data
    data = {}
    @options[:tags].keys.each do |key|
      # BUGFIX: was instance_eval(key.to_s); send is the correct (and safer)
      # way to dispatch to the dynamically handled category readers
      data[key.to_sym] = send(key)
    end
    data
  end

  # Returns a flat, de-duplicated array of every link in every category.
  def all_links
    data = link_data
    data.keys.map{|key| data[key]}.flatten.uniq
  end

  # Categories configured in @options[:tags] behave as reader methods
  # (e.g. parser.links, parser.images).  Unknown categories warn and
  # return an empty array.
  def method_missing(m)
    if @options[:tags].keys.include?(m)
      links = []
      @options[:tags][m].each do |selector, attribute|
        find_matches(links, selector, attribute)
      end
      links.uniq
    else
      puts "Warning: There was no configuration on how to find #{m} links"
      []
    end
  end

  # Keep respond_to? consistent with the dynamic readers above.
  def respond_to_missing?(m, include_private = false)
    @options[:tags].keys.include?(m) || super
  end

  # Collects absolutized urls into array: by reading `attribute` from each
  # element matching `selector`, or by scanning element text when
  # `attribute` is a Regexp (e.g. url(...) references inside <style>).
  def find_matches(array, selector, attribute)
    if attribute.kind_of? String or attribute.kind_of? Symbol
      @doc.css(selector).each do |tag|
        uri = @absolutize.url(tag[attribute])
        array << uri.to_s
      end
    elsif attribute.instance_of? Regexp
      @doc.css(selector).each do |tag|
        tag.content.scan(attribute) {|match| array << @absolutize.url(match[0])}
      end
    end
  end

end
71
+
@@ -0,0 +1,13 @@
1
# Default resque processing job: a placeholder consumer that simply
# reports the url of the content it received.
class ContentProcessJob
  require "ap"

  # resque queue this job is picked up from
  @queue = :cobweb_process_job

  # content - the content hash produced by CobWeb#get (string keys after
  # resque's json round-trip, hence the symbolize)
  def self.perform(content)
    content.symbolize_keys
    puts "Dummy Processing for #{content[:url]}"

    #ap content.keys
  end
end
data/lib/crawl_job.rb ADDED
@@ -0,0 +1,71 @@
1
# Resque job that crawls a single url: retrieves it, enqueues newly
# discovered same-site links as further CrawlJobs, and hands the content
# off to the configured processing queue.
class CrawlJob

  require "net/https"
  require "uri"
  require "redis"

  @queue = :cobweb_crawl_job

  # content_request - hash built by CobWeb#start (plus :url/:parent for
  # child requests).  Keys arrive as strings from resque's json payload.
  # NOTE(review): assumes :crawl_limit is always supplied; a nil limit
  # would raise in the <= comparisons below — confirm against callers.
  def self.perform(content_request)
    # change all hash keys to symbols
    content_request.deep_symbolize_keys
    redis = NamespacedRedis.new(Redis.new, "cobweb-#{content_request[:crawl_id]}")
    @absolutize = Absolutize.new(content_request[:url], :output_debug => false, :raise_exceptions => false, :force_escaping => false, :remove_anchors => true)

    # check we haven't crawled this url before
    unless redis.sismember "crawled", content_request[:url]

      # increment counter and check we haven't hit our crawl limit
      redis.incr "crawl-counter"
      crawl_counter = redis.get("crawl-counter").to_i
      queue_counter = redis.get("queue-counter").to_i
      if crawl_counter <= content_request[:crawl_limit]
        # BUGFIX: CobWeb#get is an instance method taking a url; it was
        # previously invoked as CobWeb.get(content_request), which raises
        # NoMethodError
        content = CobWeb.new(content_request).get(content_request[:url])
        redis.sadd "crawled", content_request[:url]
        set_base_url redis, content, content_request[:base_url]
        if queue_counter <= content_request[:crawl_limit]
          content[:links].keys.map{|key| content[:links][key]}.flatten.each do |link|
            unless redis.sismember "crawled", link
              # only queue links that stay within the crawl's base url
              if link.match(Regexp.new("^#{redis.get("base_url")}"))
                new_request = content_request.clone
                new_request[:url] = link
                new_request[:parent] = content_request[:url]
                Resque.enqueue(CrawlJob, new_request)
                redis.incr "queue-counter"
              end
            end
          end
        end

        # enqueue to processing queue; the queue arrives as a string after
        # the json round-trip, but tolerate it already being a Class
        processing_queue = content_request[:processing_queue]
        processing_queue = const_get(processing_queue) unless processing_queue.is_a?(Class)
        Resque.enqueue(processing_queue, content)
        puts "#{content_request[:url]} has been sent for processing." if content_request[:debug]
        puts "Crawled: #{crawl_counter} Limit: #{content_request[:crawl_limit]} Queued: #{queue_counter}" if content_request[:debug]

      else
        puts "Crawl Limit Exceeded by #{crawl_counter - content_request[:crawl_limit]} objects" if content_request[:debug]
      end
    else
      puts "Already crawled #{content_request[:url]}" if content_request[:debug]
    end
  end

  # Records the crawl's base url once per crawl, following an initial
  # redirect if the very first response was a 3xx.
  def self.set_base_url(redis, content, base_url)
    if redis.get("base_url").nil?
      if content[:status_code] >= 300 and content[:status_code] < 400
        #redirect received for first url
        redis.set("base_url", @absolutize.url(content[:location]).to_s)
        puts "Warning: base_url given redirects to another location, setting base_url to #{@absolutize.url(content[:location]).to_s}"
      else
        redis.set("base_url", base_url)
      end
    end
  end
  # BUGFIX: a bare `private` has no effect on `def self.` methods;
  # private_class_method is required to actually hide the helper
  private_class_method :set_base_url

end
@@ -0,0 +1,52 @@
1
# Thin wrapper around a redis connection that prefixes every key with a
# namespace, so multiple crawls can share one redis without colliding.
# Keys are stored as "<namespace>-<key>".
class NamespacedRedis
  # redis     - a connected Redis client (required)
  # namespace - string prefix applied to every key (default: "")
  def initialize(redis, namespace="")
    raise "redis must be supplied" if redis.nil?
    @redis = redis
    @namespace = namespace
  end

  # Is member part of the set stored at key?
  def sismember(key, member)
    @redis.sismember namespaced(key), member
  end

  # Adds value to the set stored at key.
  def sadd(key, value)
    @redis.sadd namespaced(key), value
  end

  # Returns the string value stored at key, or nil.
  def get(key)
    @redis.get namespaced(key)
  end

  # Increments the integer stored at key, returning the new value.
  def incr(key)
    @redis.incr namespaced(key)
  end

  # Does key exist?
  def exist(key)
    # BUGFIX: the redis client method is `exists`, not `exist`; the old
    # delegation raised NoMethodError
    @redis.exists namespaced(key)
  end

  # Sets key to value.
  def set(key, value)
    @redis.set namespaced(key), value
  end

  # Deletes key.
  def del(key)
    @redis.del namespaced(key)
  end

  # Sets the ttl of key to value seconds.
  def expire(key, value)
    @redis.expire namespaced(key), value
  end

  # Returns the fully qualified redis key for key.
  def namespaced(key)
    "#{@namespace}-#{key}"
  end

  # The underlying, un-namespaced redis client.
  def native
    @redis
  end

  # The namespace prefix this wrapper was built with.
  def namespace
    @namespace
  end

end
@@ -0,0 +1,189 @@
1
require File.expand_path(File.dirname(__FILE__) + '/../spec_helper')

describe CobWeb do

  before(:each) do

    @base_url = "http://www.baseurl.com/"

    @default_headers = {"Cache-Control" => "private, max-age=0",
                        "Date" => "Wed, 10 Nov 2010 09:06:17 GMT",
                        "Expires" => "-1",
                        "Content-Type" => "text/html; charset=UTF-8",
                        "Content-Encoding" => "gzip",
                        "Transfer-Encoding" => "chunked",
                        "Server" => "gws",
                        "X-XSS-Protection" => "1; mode=block"}

    @cobweb = CobWeb.new :quiet => true
  end

  describe "with mock" do
    before(:each) do
      @mock_http_client = mock(Net::HTTP)
      @mock_http_request = mock(Net::HTTPRequest)
      @mock_http_redirect_request = mock(Net::HTTPRequest)
      @mock_http_redirect_request2 = mock(Net::HTTPRequest)

      @mock_http_response = mock(Net::HTTPResponse)
      @mock_http_redirect_response = mock(Net::HTTPRedirection)
      # BUGFIX: this mock was never created, so the stubs below were being
      # applied to nil and the redirect chain specs could not run
      @mock_http_redirect_response2 = mock(Net::HTTPRedirection)
      @mock_http_get = mock(Net::HTTP::Get)

      Net::HTTP.stub!(:new).and_return(@mock_http_client)
      Net::HTTP::Get.stub!(:new).and_return(@mock_http_request)
      Net::HTTP::Get.stub!(:new).with("/redirect.html").and_return(@mock_http_redirect_request)
      Net::HTTP::Get.stub!(:new).with("/redirect2.html").and_return(@mock_http_redirect_request2)

      @mock_http_client.stub!(:request).with(@mock_http_request).and_return(@mock_http_response)
      @mock_http_client.stub!(:request).with(@mock_http_redirect_request).and_return(@mock_http_redirect_response)
      @mock_http_client.stub!(:request).with(@mock_http_redirect_request2).and_return(@mock_http_redirect_response2)

      # plain 200 response
      @mock_http_response.stub!(:code).and_return(200)
      @mock_http_response.stub!(:content_type).and_return("text/html")
      @mock_http_response.stub!(:[]).with("Content-Type").and_return(@default_headers["Content-Type"])
      @mock_http_response.stub!(:[]).with("location").and_return(@default_headers["location"])
      @mock_http_response.stub!(:content_length).and_return(1024)
      @mock_http_response.stub!(:body).and_return("asdf")
      @mock_http_response.stub!(:to_hash).and_return(@default_headers)

      # first hop of the redirect chain
      @mock_http_redirect_response.stub!(:code).and_return(301)
      @mock_http_redirect_response.stub!(:content_type).and_return("text/xml")
      @mock_http_redirect_response.stub!(:[]).with("Content-Type").and_return(@default_headers["Content-Type"])
      @mock_http_redirect_response.stub!(:[]).with("location").and_return("http://redirected-to.com/redirect2.html")
      @mock_http_redirect_response.stub!(:content_length).and_return(2048)
      @mock_http_redirect_response.stub!(:body).and_return("redirected body")
      @mock_http_redirect_response.stub!(:to_hash).and_return(@default_headers)

      # second hop of the redirect chain
      @mock_http_redirect_response2.stub!(:code).and_return(301)
      @mock_http_redirect_response2.stub!(:content_type).and_return("text/xml")
      @mock_http_redirect_response2.stub!(:[]).with("Content-Type").and_return(@default_headers["Content-Type"])
      @mock_http_redirect_response2.stub!(:[]).with("location").and_return("http://redirected-to.com/redirected.html")
      @mock_http_redirect_response2.stub!(:content_length).and_return(2048)
      @mock_http_redirect_response2.stub!(:body).and_return("redirected body")
      @mock_http_redirect_response2.stub!(:to_hash).and_return(@default_headers)

    end

    it "should generate a cobweb object" do
      CobWeb.new.should be_an_instance_of CobWeb
    end

    describe "get" do
      it "should return a hash with default values" do
        @cobweb.get(@base_url).should be_an_instance_of Hash
      end

      it "should return a hash with default values without quiet option" do
        @cobweb.get(@base_url).should be_an_instance_of Hash
      end

      it "should raise exception if there is no url" do
        lambda {@cobweb.get(nil)}.should raise_error("url cannot be nil")
      end

      describe "content object" do

        it "should return the url" do
          @cobweb.get(@base_url)[:url].should == @base_url
        end

        it "should return correct content-types" do
          @mock_http_response.stub!(:content_type).and_return("image/jpeg")
          @cobweb.get(@base_url)[:content_type].should == "image/jpeg"
        end

        it "should return correct status-code" do
          @mock_http_response.stub!(:code).and_return(404)
          @cobweb.get(@base_url)[:status_code].should == 404
        end

        it "should return correct status-code" do
          @mock_http_response.stub!(:code).and_return(404)
          @cobweb.get(@base_url)[:status_code].should == 404
        end

        it "should return correct character_set" do
          @cobweb.get(@base_url)[:character_set].should == "UTF-8"
        end
        it "should return correct content_length" do
          @cobweb.get(@base_url)[:content_length].should == 1024
        end
        it "should return correct content_body" do
          @cobweb.get(@base_url)[:content_body].should == "asdf"
        end
        it "should return correct location" do
          @cobweb.get(@base_url)[:location].should == nil

          @mock_http_response.stub!(:[]).with("location").and_return("http://google.com/")
          @cobweb.get(@base_url)[:location].should == "http://google.com/"
        end
        it "should return correct headers" do
          @cobweb.get(@base_url)[:headers].should == @default_headers
        end
        it "should return correct a hash of links" do
          @cobweb.get(@base_url)[:links].should be_an_instance_of Hash
        end
        it "should return the response time for the url" do
          @cobweb.get(@base_url)[:response_time].should be_an_instance_of Float
        end

      end
      describe "with redirect" do

        before(:each) do
          @base_url = "http://redirect-me.com/redirect.html"
          @cobweb = CobWeb.new(:follow_redirects => true, :quiet => true)
        end

        it "should flow through redirect" do

          @mock_http_client.should_receive(:request).with(@mock_http_redirect_request).and_return(@mock_http_redirect_response)

          content = @cobweb.get(@base_url)
          content.should be_an_instance_of Hash

          content[:url].should == "http://redirect-me.com/redirect.html"
          content[:redirect_through].length.should == 2
          content[:content_type].should == "text/html"
          content[:content_body].should == "asdf"

        end
        it "should return the path followed" do
          @mock_http_client.should_receive(:request).with(@mock_http_redirect_request).and_return(@mock_http_redirect_response)

          content = @cobweb.get(@base_url)
          content[:redirect_through].should == ["http://redirected-to.com/redirect2.html", "http://redirected-to.com/redirected.html"]

        end
        it "should not follow with redirect disabled" do
          @cobweb = CobWeb.new(:follow_redirects => false)
          @mock_http_client.should_receive(:request).with(@mock_http_redirect_request).and_return(@mock_http_redirect_response)

          content = @cobweb.get(@base_url)
          content[:url].should == "http://redirect-me.com/redirect.html"
          content[:redirect_through].should be_nil
          content[:status_code].should == 301
          content[:content_type].should == "text/xml"
          content[:content_body].should == "redirected body"

        end
      end
    end
  end

  describe "without mock" do
    it "should throw invalid url exception for an invalid url" do
      lambda {@cobweb.get("asdgas asv\"£%\"^%&*%")}.should raise_error URI::InvalidURIError
    end

    it "should throw exception when server is unavailable" #do
    #  lambda {@cobweb.get({:url => "http://www.oasdjgoisadjgoisdiog.com"})}.should raise_error URI::InvalidURIError
    #end

    it "should return a valid content hash when url doesn't exist on a live server" do
      status_code = @cobweb.get("http://test.com/laskdjflsdajf")[:status_code]
      status_code.should == 404
    end

  end
end
@@ -0,0 +1,104 @@
1
require File.expand_path(File.dirname(__FILE__) + '/../spec_helper')
require File.expand_path(File.dirname(__FILE__) + '/../../lib/content_link_parser.rb')

describe ContentLinkParser do

  before(:each) do
    @base_url = "http://www.baseurl.com/"
    sample_path = File.dirname(__FILE__) + "/../samples/sample_html_links.html"
    @content = File.read(sample_path)
    @content_parser = ContentLinkParser.new("http://sample-links.com/", @content)
  end

  it "should load the sample document" do
    @content.should_not be_nil
    @content.should_not be_empty
  end

  it "should create a content link parser" do
    @content_parser.should_not be_nil
    @content_parser.should be_an_instance_of ContentLinkParser
  end

  # for each built-in category: one example for presence, one for count
  describe "using default tags" do
    describe "returning general links" do
      it "should return some links from the sample data" do
        found = @content_parser.links
        found.should_not be_nil
        found.should_not be_empty
      end
      it "should return the correct links" do
        @content_parser.links.length.should == 4
      end
    end
    describe "returning image links" do
      it "should return some image links from the sample data" do
        found = @content_parser.images
        found.should_not be_nil
        found.should_not be_empty
      end
      it "should return the correct links" do
        @content_parser.images.length.should == 1
      end
    end
    describe "returning related links" do
      it "should return some related links from the sample data" do
        found = @content_parser.related
        found.should_not be_nil
        found.should_not be_empty
      end
      it "should return the correct links" do
        @content_parser.related.length.should == 2
      end
    end
    describe "returning script links" do
      it "should return some script links from the sample data" do
        found = @content_parser.scripts
        found.should_not be_nil
        found.should_not be_empty
      end
      it "should return the correct links" do
        @content_parser.scripts.length.should == 1
      end
    end
    describe "returning style links" do
      it "should return some style links from the sample data" do
        found = @content_parser.styles
        found.should_not be_nil
        found.should_not be_empty
      end
      it "should return the correct links" do
        @content_parser.styles.length.should == 3
      end
    end
    describe "returning unknown link type" do
      it "should return an empty array" do
        found = @content_parser.asdfasdfsadf
        found.should_not be_nil
        found.should be_an_instance_of Array
      end
    end
  end

  describe "returning all link data" do
    it "should return a hash with all link data" do
      link_data = @content_parser.link_data
      link_data.should_not be_nil
      link_data.should be_an_instance_of Hash

      link_data.keys.length.should == 5
      link_data[:links].length.should == 4
    end
  end

  describe "ignoring default tags" do
    it "should not return any links" do
      parser = ContentLinkParser.new("http://sample-links.com", @content, :ignore_default_tags => true)
      parser.links.should be_empty
    end
  end
end
@@ -0,0 +1,24 @@
1
require File.expand_path(File.dirname(__FILE__) + '/../spec_helper')


describe CrawlJob do

  before(:each) do
    @base_url = "http://www.baseurl.com/"

    # BUGFIX: the setup previously instantiated the non-existent
    # Net::HTTPClient class, performed a live HTTP request against google,
    # and passed a url String to CobWeb.new, which expects an options hash
    # (CobWeb#initialize indexes the argument with symbol keys).
    @cobweb = CobWeb.new(:quiet => true)

  end

  it "should be a cobweb type" do
    @cobweb.should be_an_instance_of CobWeb
  end


end
@@ -0,0 +1,34 @@
1
+ <html>
2
+ <head>
3
+ <title>Sample HTML Document With all types of links</title>
4
+ <meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1">
5
+ <meta name="description" content="Information for people running web search indexing robots, and web site managers attempting to understand what's going on when a robot visits their site.">
6
+ <meta name="keywords" content="robots, crawlers, crawling, spiders, index, indexing, indexers, gatherers, search engines, searching, FAQ, checklist">
7
+ <meta name="DC.date.modified" content="2003-07-18">
8
+ <meta http-equiv="refresh" content="http://sampleurl-metarefresh.com/"/>
9
+
10
+ <link rel="stylesheet" type="text/css" href="http://sampleurl-linkcss/" />
11
+ <link rel="home" type="text/html" href="http://sampleurl-linkhome/" />
12
+ <script type="text/javascript" src="script.js"></script>
13
+
14
+ <STYLE TYPE="text/css" MEDIA="screen, projection">
15
+ <!--
16
+ @import url(http://www.htmlhelp.com/style.css);
17
+ @import url(/stylesheets/punk.css);
18
+ DT { background: yellow; color: black }
19
+ -->
20
+ </STYLE>
21
+
22
+ </head>
23
+
24
+ <body bgcolor="#FFFFFF"><!-- #BeginLibraryItem "/Library/navtop.lbi" --></p>
25
+
26
+ <a href="http://sampleurl-a.com/">Click Here for Sample URL 1</a>
27
+ <frameset><frame src="http://sampleurl-frame.com/"></frame></frameset>
28
+
29
+ <map id="testmap"><area href="http://sampleurl-area"></area>></map>
30
+
31
+ <img src="http://sampleurl-img/"/>
32
+
33
+ </body>
34
+ </html>
data/spec/spec.opts ADDED
@@ -0,0 +1,2 @@
1
+ --colour
2
+ --format specdoc
@@ -0,0 +1 @@
1
+ require File.expand_path(File.dirname(__FILE__) + '/../lib/cobweb')
metadata ADDED
@@ -0,0 +1,133 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: cobweb
3
+ version: !ruby/object:Gem::Version
4
+ hash: 29
5
+ prerelease: false
6
+ segments:
7
+ - 0
8
+ - 0
9
+ - 1
10
+ version: 0.0.1
11
+ platform: ruby
12
+ authors:
13
+ - Stewart McKee
14
+ autorequire:
15
+ bindir: bin
16
+ cert_chain: []
17
+
18
+ date: 2010-11-10 00:00:00 +00:00
19
+ default_executable:
20
+ dependencies:
21
+ - !ruby/object:Gem::Dependency
22
+ name: resque
23
+ prerelease: false
24
+ requirement: &id001 !ruby/object:Gem::Requirement
25
+ none: false
26
+ requirements:
27
+ - - ">="
28
+ - !ruby/object:Gem::Version
29
+ hash: 3
30
+ segments:
31
+ - 0
32
+ version: "0"
33
+ type: :runtime
34
+ version_requirements: *id001
35
+ - !ruby/object:Gem::Dependency
36
+ name: redis
37
+ prerelease: false
38
+ requirement: &id002 !ruby/object:Gem::Requirement
39
+ none: false
40
+ requirements:
41
+ - - ">="
42
+ - !ruby/object:Gem::Version
43
+ hash: 3
44
+ segments:
45
+ - 0
46
+ version: "0"
47
+ type: :runtime
48
+ version_requirements: *id002
49
+ - !ruby/object:Gem::Dependency
50
+ name: absolutize
51
+ prerelease: false
52
+ requirement: &id003 !ruby/object:Gem::Requirement
53
+ none: false
54
+ requirements:
55
+ - - ">="
56
+ - !ruby/object:Gem::Version
57
+ hash: 3
58
+ segments:
59
+ - 0
60
+ version: "0"
61
+ type: :runtime
62
+ version_requirements: *id003
63
+ - !ruby/object:Gem::Dependency
64
+ name: nokogiri
65
+ prerelease: false
66
+ requirement: &id004 !ruby/object:Gem::Requirement
67
+ none: false
68
+ requirements:
69
+ - - ">="
70
+ - !ruby/object:Gem::Version
71
+ hash: 3
72
+ segments:
73
+ - 0
74
+ version: "0"
75
+ type: :runtime
76
+ version_requirements: *id004
77
+ description:
78
+ email: stewart@rockwellcottage.com
79
+ executables: []
80
+
81
+ extensions: []
82
+
83
+ extra_rdoc_files:
84
+ - README.textile
85
+ files:
86
+ - spec/samples/sample_html_links.html
87
+ - spec/spec.opts
88
+ - spec/spec_helper.rb
89
+ - spec/cobweb/content_link_parser_spec.rb
90
+ - spec/cobweb/cobweb_spec.rb
91
+ - spec/cobweb/crawl_job_spec.rb
92
+ - lib/namespaced_redis.rb
93
+ - lib/cobweb.rb
94
+ - lib/content_process_job.rb
95
+ - lib/content_link_parser.rb
96
+ - lib/crawl_job.rb
97
+ - README.textile
98
+ has_rdoc: false
99
+ homepage: http://github.com/stewartmckee/cobweb
100
+ licenses: []
101
+
102
+ post_install_message:
103
+ rdoc_options: []
104
+
105
+ require_paths:
106
+ - lib
107
+ required_ruby_version: !ruby/object:Gem::Requirement
108
+ none: false
109
+ requirements:
110
+ - - ">="
111
+ - !ruby/object:Gem::Version
112
+ hash: 3
113
+ segments:
114
+ - 0
115
+ version: "0"
116
+ required_rubygems_version: !ruby/object:Gem::Requirement
117
+ none: false
118
+ requirements:
119
+ - - ">="
120
+ - !ruby/object:Gem::Version
121
+ hash: 3
122
+ segments:
123
+ - 0
124
+ version: "0"
125
+ requirements: []
126
+
127
+ rubyforge_project:
128
+ rubygems_version: 1.3.7
129
+ signing_key:
130
+ specification_version: 3
131
+ summary: Crawler utilizing resque
132
+ test_files: []
133
+