cobweb 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/README.textile ADDED
@@ -0,0 +1,63 @@
1
+
2
+ h1. Cobweb v0.0.1
3
+
4
+ h2. Intro
5
+
6
+ Crawler that utilises resque jobs to perform the crawl allowing clustering of crawls.
7
+
8
+ h2. Installation
9
+
10
+ Install crawler as a gem
11
+
12
+ bq. gem install cobweb
13
+
14
+ h2. Usage
15
+
16
+ h4. new(options)
17
+
18
+ Creates a new crawler object configured with the supplied options
19
+
20
+ * options - Options are passed in as a hash,
21
+
22
+ ** :follow_redirects - transparently follows redirects and populates the :redirect_through key in the content hash (true)
23
+ ** :redirect_limit - sets the maximum number of consecutive redirects to follow (10)
24
+ ** :processing_queue - specifies the processing queue for content to be sent to (ContentProcessJob)
25
+ ** :debug - enables debug output (false)
26
+ ** :quiet - hides default output (false)
27
+ ** :cache - if set, enables the cache and sets the ttl (300)
28
+
29
+ bq. crawler = CobWeb.new(:follow_redirects => false)
30
+
31
+
32
+ h4. start(base_url)
33
+
34
+ * base_url - the url to start the crawl from
35
+
36
+ h4. get(url)
37
+
38
+ * url - url requested
39
+
40
+
41
+ h2. License
42
+
43
+ h3. The MIT License
44
+
45
+ Copyright (c) 2010 6Central Limited
46
+
47
+ Permission is hereby granted, free of charge, to any person obtaining a copy
48
+ of this software and associated documentation files (the "Software"), to deal
49
+ in the Software without restriction, including without limitation the rights
50
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
51
+ copies of the Software, and to permit persons to whom the Software is
52
+ furnished to do so, subject to the following conditions:
53
+
54
+ The above copyright notice and this permission notice shall be included in
55
+ all copies or substantial portions of the Software.
56
+
57
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
58
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
59
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
60
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
61
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
62
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
63
+ THE SOFTWARE.
data/lib/cobweb.rb ADDED
@@ -0,0 +1,130 @@
1
+ require 'rubygems'
2
+ require 'uri'
3
+ require 'resque'
4
+ require 'digest/sha1'
5
+
6
+ Dir[File.dirname(__FILE__) + '/*.rb'].each do |file|
7
+ require [File.dirname(__FILE__), File.basename(file, File.extname(file))].join("/")
8
+ end
9
+
10
# Retrieves pages over HTTP(S) and queues crawls through Resque.
#
# Options (all optional, passed to +new+ as a hash):
#   :follow_redirects - follow 3xx responses transparently (default true)
#   :redirect_limit   - maximum number of redirects followed by one +get+ (default 10)
#   :processing_queue - job that receives retrieved content (default ContentProcessJob)
#   :debug            - enables debug output (default false)
#   :quiet            - suppresses progress output
#   :cache            - when set, content is cached in redis; the value is the ttl in seconds
class CobWeb

  # Required here so #get does not depend on another file having loaded them.
  require 'net/https'
  require 'json'

  # ttl used when :cache is set to something that is not a number (e.g. true).
  DEFAULT_CACHE_TTL = 300

  # Stores the options hash (the hash passed in is kept and filled with defaults).
  def initialize(options = {})
    @options = options
    @options[:follow_redirects] = true if @options[:follow_redirects].nil?
    @options[:redirect_limit] = 10 if @options[:redirect_limit].nil?
    @options[:processing_queue] = ContentProcessJob if @options[:processing_queue].nil?
    @options[:debug] = false unless @options[:debug]
  end

  # Enqueues a CrawlJob for base_url. The job payload is the crawler options
  # plus :crawl_id, :url and :base_url.
  #
  # Raises RuntimeError when base_url is nil or false.
  def start(base_url)
    raise ":base_url is required" unless base_url

    request = {
      # the url and a random component keep ids distinct for crawls started
      # within the same second
      :crawl_id => Digest::SHA1.hexdigest("#{Time.now.to_f}-#{base_url}-#{rand}"),
      :url => base_url,
      :base_url => base_url
    }
    request.merge!(@options)

    Resque.enqueue(CrawlJob, request)
  end

  # Retrieves url and returns a content hash with the keys :url, :status_code,
  # :content_type, :character_set, :content_length, :content_body, :location,
  # :headers, :links and :response_time. When redirects were followed the hash
  # describes the final response, :url is the url originally requested and
  # :redirect_through lists the Location values that were followed.
  #
  # redirect_limit is the number of further redirects that may be followed.
  #
  # Raises RuntimeError when url is nil or when the redirect limit is used up,
  # and URI::InvalidURIError when url (or a redirect target) cannot be parsed.
  def get(url, redirect_limit = @options[:redirect_limit])
    raise "url cannot be nil" if url.nil?

    # get the unique id for this request
    unique_id = Digest::SHA1.hexdigest(url)

    # redis is only needed (and only connected to) when caching is enabled
    redis = @options[:cache] ? NamespacedRedis.new(Redis.new, "cobweb") : nil

    if redis
      cached = redis.get(unique_id)
      if cached
        puts "Cache hit for #{url}" unless @options[:quiet]
        return JSON.parse(cached).deep_symbolize_keys
      end
    end

    print "Retrieving #{url }... " unless @options[:quiet]
    uri = URI.parse(url)

    request_time = Time.now.to_f
    response = fetch(uri)
    location = response['location']

    # A 3xx without a Location header (e.g. 304) has nothing to follow and is
    # returned like any other response.
    if @options[:follow_redirects] && redirect_status?(response) && location
      puts "redirected... " unless @options[:quiet]
      raise "redirect limit exceeded while retrieving #{url}" if redirect_limit.to_i <= 0

      # Location may be relative, so resolve it against the current url
      content = get(uri.merge(location).to_s, redirect_limit.to_i - 1)
      content[:url] = uri.to_s
      content[:redirect_through] = [] if content[:redirect_through].nil?
      content[:redirect_through].insert(0, location)
      content[:response_time] = Time.now.to_f - request_time
    else
      content = build_content(uri, response, request_time)

      # add content to cache if required
      if redis
        redis.set(unique_id, content.to_json)
        redis.expire unique_id, cache_ttl
      end
    end

    content
  end

  private

  # Performs a GET for uri and returns the response object.
  def fetch(uri)
    http = Net::HTTP.new(uri.host, uri.port)
    if uri.scheme == "https"
      http.use_ssl = true
      # NOTE(review): certificate verification is switched off here, so https
      # responses are not authenticated - confirm this is intended.
      http.verify_mode = OpenSSL::SSL::VERIFY_NONE
    end
    http.request(Net::HTTP::Get.new(uri.request_uri))
  end

  # True for any 3xx status code.
  def redirect_status?(response)
    code = response.code.to_i
    code >= 300 && code < 400
  end

  # Builds the content hash for a response that is not being redirected.
  def build_content(uri, response, request_time)
    content = {}
    content[:response_time] = Time.now.to_f - request_time

    puts "Retrieved." unless @options[:quiet]

    content[:url] = uri.to_s
    content[:status_code] = response.code.to_i
    content[:content_type] = response.content_type
    content[:character_set] = charset_from(response["Content-Type"])
    content[:content_length] = response.content_length
    content[:content_body] = response.body
    content[:location] = response["location"]
    content[:headers] = response.to_hash.symbolize_keys

    # parse data for links
    link_parser = ContentLinkParser.new(content[:url], content[:content_body])
    content[:links] = link_parser.link_data
    content
  end

  # Extracts the charset parameter from a Content-Type header value, or nil
  # when the header is missing or carries no charset.
  def charset_from(content_type_header)
    return nil if content_type_header.nil?
    content_type_header[/charset\s*=\s*"?([^\s;"]+)/i, 1]
  end

  # ttl in seconds for cached content, taken from the :cache option.
  def cache_ttl
    ttl = @options[:cache]
    ttl.respond_to?(:to_i) ? ttl.to_i : DEFAULT_CACHE_TTL
  end
end
108
+
109
## add symbolize methods to hash
class Hash
  # Replaces every String key with the equivalent Symbol, in place.
  # Keys that are not Strings are left alone. Returns self.
  def symbolize_keys
    keys.each do |key|
      next unless key.instance_of? String
      self[key.to_sym] = delete(key)
    end
    self
  end

  # Like symbolize_keys, but also descends into nested hashes, including
  # hashes held inside (nested) arrays such as those produced by JSON.parse.
  # Works in place and returns self.
  def deep_symbolize_keys
    symbolize_keys
    values.each { |value| cobweb_symbolize_nested(value) }
    self
  end

  private

  # Recurses into value when it is a Hash or an Array; other values are ignored.
  def cobweb_symbolize_nested(value)
    case value
    when Hash
      value.deep_symbolize_keys
    when Array
      value.each { |item| cobweb_symbolize_nested(item) }
    end
  end
end
@@ -0,0 +1,71 @@
1
+
2
# Extracts urls from an HTML document, grouped by link type.
#
# Each link type (:links, :images, :related, :scripts, :styles by default) maps
# to a list of [css_selector, attribute_or_regexp] pairs. A link type can be
# read by calling a method of the same name (parser.images) or all at once
# through link_data.
#
# Options:
#   :ignore_default_tags - when set, the default link types are not used
#   :additional_tags     - hash of extra link types, same shape as the defaults
class ContentLinkParser

  require "nokogiri"
  require "absolutize"

  # url     - url the content was retrieved from; used as the base for relative
  #           links unless the document has a <base href> tag
  # content - HTML source to parse
  # options - see the class comment; the hash passed in is copied, not modified
  def initialize(url, content, options = {})
    @options = options.dup
    @url = url
    @doc = Nokogiri::HTML(content)

    base_url = @url.to_s
    base_tag = @doc.at("base[href]")
    base_url = base_tag.attr("href").to_s if base_tag
    @absolutize = Absolutize.new(base_url, :output_debug => false, :raise_exceptions => false, :force_escaping => false, :remove_anchors => true)

    # clear the default tags if required
    @options[:tags] = @options[:ignore_default_tags] ? {} : default_tags

    # link types are looked up by symbol, so normalise the supplied names
    (@options[:additional_tags] || {}).each do |name, selectors|
      @options[:tags][name.to_sym] = selectors
    end
  end

  # Returns a hash of link type (Symbol) => array of unique urls.
  def link_data
    data = {}
    @options[:tags].keys.each do |key|
      data[key.to_sym] = links_for(key)
    end
    data
  end

  # Returns every url found for any link type, without duplicates.
  def all_links
    link_data.values.flatten.uniq
  end

  # Makes each configured link type readable as a method. An unconfigured
  # name prints a warning and returns an empty array.
  def method_missing(m, *args, &block)
    return super unless args.empty?

    if @options[:tags].key?(m)
      links_for(m)
    else
      puts "Warning: There was no configuration on how to find #{m} links"
      []
    end
  end

  # Reports the configured link types as methods this object responds to.
  def respond_to_missing?(m, include_private = false)
    @options[:tags].key?(m) || super
  end

  # Appends to array the urls found under selector.
  #
  # attribute - a String/Symbol naming the tag attribute holding the url, or a
  #             Regexp whose first capture group is the url within the tag text
  def find_matches(array, selector, attribute)
    if attribute.kind_of? String or attribute.kind_of? Symbol
      @doc.css(selector).each do |tag|
        uri = @absolutize.url(tag[attribute])
        array << uri.to_s
      end
    elsif attribute.instance_of? Regexp
      @doc.css(selector).each do |tag|
        # stringified so both branches append the same kind of value
        tag.content.scan(attribute) {|match| array << @absolutize.url(match[0]).to_s}
      end
    end
  end

  private

  # Selector configuration used unless :ignore_default_tags is set.
  def default_tags
    {
      :links => [["a[href]", "href"], ["frame[src]", "src"], ["meta[@http-equiv=\"refresh\"]", "content"], ["link[href]:not([rel])", "href"], ["area[href]", "href"]],
      :images => [["img[src]", "src"]],
      :related => [["link[rel]", "href"]],
      :scripts => [["script[src]", "src"]],
      :styles => [["link[rel='stylesheet'][href]", "href"], ["style[@type^='text/css']", /url\("?(.*?)"?\)/]]
    }
  end

  # Unique urls for one configured link type.
  def links_for(type)
    links = []
    @options[:tags][type].each do |selector, attribute|
      find_matches(links, selector, attribute)
    end
    links.uniq
  end

end
71
+
@@ -0,0 +1,13 @@
1
# Placeholder processing job: announces the url of the content it was given.
# Used by CobWeb as the :processing_queue when none is supplied.
class ContentProcessJob
  require "ap"

  @queue = :cobweb_process_job

  class << self
    # content - content hash; its String keys are converted to Symbols in place.
    def perform(content)
      content.symbolize_keys
      url = content[:url]
      puts "Dummy Processing for #{url}"

      #ap content.keys
    end
  end
end
data/lib/crawl_job.rb ADDED
@@ -0,0 +1,71 @@
1
# Resque job that retrieves one url of a crawl, queues the links found under
# the crawl's base url and hands the content to the processing queue.
#
# Crawl state is kept in redis under the namespace "cobweb-<crawl_id>":
#   crawled       - set of urls already retrieved
#   crawl-counter - number of retrieval attempts
#   queue-counter - number of links queued
#   base_url      - prefix a link must start with to be queued
class CrawlJob

  require "net/https"
  require "uri"
  require "redis"

  @queue = :cobweb_crawl_job

  # content_request - hash (String or Symbol keys) holding the crawler options
  # plus :crawl_id and :url; :base_url and :crawl_limit are optional. A missing
  # :crawl_limit means the crawl is not limited.
  def self.perform(content_request)
    # change all hash keys to symbols
    content_request.deep_symbolize_keys
    redis = NamespacedRedis.new(Redis.new, "cobweb-#{content_request[:crawl_id]}")
    @absolutize = Absolutize.new(content_request[:url], :output_debug => false, :raise_exceptions => false, :force_escaping => false, :remove_anchors => true)

    # check we haven't crawled this url before
    if redis.sismember "crawled", content_request[:url]
      puts "Already crawled #{content_request[:url]}" if content_request[:debug]
      return
    end

    # increment counter and check we haven't hit our crawl limit
    crawl_limit = content_request[:crawl_limit]
    crawl_counter = redis.incr("crawl-counter").to_i
    queue_counter = redis.get("queue-counter").to_i

    unless within_limit?(crawl_counter, crawl_limit)
      puts "Crawl Limit Exceeded by #{crawl_counter - crawl_limit.to_i} objects" if content_request[:debug]
      return
    end

    # get is an instance method; the request carries the crawler options
    content = CobWeb.new(content_request).get(content_request[:url])
    redis.sadd "crawled", content_request[:url]

    # the first url of the crawl is the base url when none was supplied
    set_base_url redis, content, content_request[:base_url] || content_request[:url]

    enqueue_links(redis, content, content_request) if within_limit?(queue_counter, crawl_limit)

    # enqueue to processing queue
    Resque.enqueue(processing_queue_for(content_request), content)
    puts "#{content_request[:url]} has been sent for processing." if content_request[:debug]
    puts "Crawled: #{crawl_counter} Limit: #{content_request[:crawl_limit]} Queued: #{queue_counter}" if content_request[:debug]
  end

  # Records the crawl's base url in redis unless one is already stored. When
  # the content is a redirect with a location, the redirect target becomes the
  # base url. (The bare `private` that used to precede this method had no
  # effect on a `def self.` method, so it remains callable as before.)
  def self.set_base_url(redis, content, base_url)
    return unless redis.get("base_url").nil?

    redirected = content[:status_code].to_i >= 300 && content[:status_code].to_i < 400
    if redirected && content[:location]
      #redirect received for first url
      redirected_to = @absolutize.url(content[:location]).to_s
      redis.set("base_url", redirected_to)
      puts "Warning: base_url given redirects to another location, setting base_url to #{redirected_to}"
    else
      redis.set("base_url", base_url)
    end
  end

  # True when limit is not set or count has not passed it.
  def self.within_limit?(count, limit)
    limit.nil? || count <= limit.to_i
  end

  # Queues a CrawlJob for every uncrawled link that starts with the base url.
  def self.enqueue_links(redis, content, content_request)
    base_url = redis.get("base_url").to_s

    content[:links].values.flatten.each do |found|
      link = found.to_s
      next if redis.sismember "crawled", link
      # plain prefix comparison; the base url is not treated as a pattern
      next unless link.start_with?(base_url)

      new_request = content_request.clone
      new_request[:url] = link
      new_request[:parent] = content_request[:url]
      Resque.enqueue(CrawlJob, new_request)
      redis.incr "queue-counter"
    end
  end

  # Resolves :processing_queue, which is either a job class or its name.
  def self.processing_queue_for(content_request)
    queue = content_request[:processing_queue]
    queue.is_a?(Class) ? queue : const_get(queue.to_s)
  end

  private_class_method :within_limit?, :enqueue_links, :processing_queue_for

end
@@ -0,0 +1,52 @@
1
# Thin wrapper around a redis client that prefixes every key with
# "<namespace>-" before passing the call on.
class NamespacedRedis
  # The namespace given to the constructor.
  attr_reader :namespace

  # redis     - client object the calls are forwarded to
  # namespace - prefix for all keys
  #
  # Raises RuntimeError when redis is nil.
  def initialize(redis, namespace="")
    raise "redis must be supplied" if redis.nil?
    @redis = redis
    @namespace = namespace
  end

  def sismember(key, member)
    @redis.sismember namespaced(key), member
  end

  def sadd(key, value)
    @redis.sadd namespaced(key), value
  end

  def get(key)
    @redis.get namespaced(key)
  end

  def incr(key)
    @redis.incr namespaced(key)
  end

  # Whether the namespaced key exists. Uses the client's boolean `exists?`
  # when it has one and `exists` otherwise; the client has no `exist` command.
  def exist(key)
    full_key = namespaced(key)
    if @redis.respond_to?(:exists?)
      @redis.exists?(full_key)
    else
      @redis.exists(full_key)
    end
  end

  def set(key, value)
    @redis.set namespaced(key), value
  end

  def del(key)
    @redis.del namespaced(key)
  end

  # value is passed through as the second argument of the client's expire.
  def expire(key, value)
    @redis.expire namespaced(key), value
  end

  # Returns key prefixed with the namespace.
  def namespaced(key)
    "#{@namespace}-#{key}"
  end

  # The wrapped redis client.
  def native
    @redis
  end

end
@@ -0,0 +1,189 @@
1
+ require File.expand_path(File.dirname(__FILE__) + '/../spec_helper')
2
+
3
# Specs for CobWeb#get. The "with mock" group replaces Net::HTTP with mocks:
#   any other path  -> 200 response
#   /redirect.html  -> 301 to http://redirected-to.com/redirect2.html
#   /redirect2.html -> 301 to http://redirected-to.com/redirected.html
describe CobWeb do

  before(:each) do

    @base_url = "http://www.baseurl.com/"

    @default_headers = {"Cache-Control" => "private, max-age=0",
                        "Date" => "Wed, 10 Nov 2010 09:06:17 GMT",
                        "Expires" => "-1",
                        "Content-Type" => "text/html; charset=UTF-8",
                        "Content-Encoding" => "gzip",
                        "Transfer-Encoding" => "chunked",
                        "Server" => "gws",
                        "X-XSS-Protection" => "1; mode=block"}

    @cobweb = CobWeb.new :quiet => true
  end

  describe "with mock" do
    before(:each) do
      @mock_http_client = mock(Net::HTTP)
      @mock_http_request = mock(Net::HTTPRequest)
      @mock_http_redirect_request = mock(Net::HTTPRequest)
      @mock_http_redirect_request2 = mock(Net::HTTPRequest)

      @mock_http_response = mock(Net::HTTPResponse)
      @mock_http_redirect_response = mock(Net::HTTPRedirection)
      # was never created, which left the stubs below being set on nil
      @mock_http_redirect_response2 = mock(Net::HTTPRedirection)
      @mock_http_get = mock(Net::HTTP::Get)

      Net::HTTP.stub!(:new).and_return(@mock_http_client)
      Net::HTTP::Get.stub!(:new).and_return(@mock_http_request)
      Net::HTTP::Get.stub!(:new).with("/redirect.html").and_return(@mock_http_redirect_request)
      Net::HTTP::Get.stub!(:new).with("/redirect2.html").and_return(@mock_http_redirect_request2)

      @mock_http_client.stub!(:request).with(@mock_http_request).and_return(@mock_http_response)
      @mock_http_client.stub!(:request).with(@mock_http_redirect_request).and_return(@mock_http_redirect_response)
      @mock_http_client.stub!(:request).with(@mock_http_redirect_request2).and_return(@mock_http_redirect_response2)

      @mock_http_response.stub!(:code).and_return(200)
      @mock_http_response.stub!(:content_type).and_return("text/html")
      @mock_http_response.stub!(:[]).with("Content-Type").and_return(@default_headers["Content-Type"])
      @mock_http_response.stub!(:[]).with("location").and_return(@default_headers["location"])
      @mock_http_response.stub!(:content_length).and_return(1024)
      @mock_http_response.stub!(:body).and_return("asdf")
      @mock_http_response.stub!(:to_hash).and_return(@default_headers)

      @mock_http_redirect_response.stub!(:code).and_return(301)
      @mock_http_redirect_response.stub!(:content_type).and_return("text/xml")
      @mock_http_redirect_response.stub!(:[]).with("Content-Type").and_return(@default_headers["Content-Type"])
      @mock_http_redirect_response.stub!(:[]).with("location").and_return("http://redirected-to.com/redirect2.html")
      @mock_http_redirect_response.stub!(:content_length).and_return(2048)
      @mock_http_redirect_response.stub!(:body).and_return("redirected body")
      @mock_http_redirect_response.stub!(:to_hash).and_return(@default_headers)

      @mock_http_redirect_response2.stub!(:code).and_return(301)
      @mock_http_redirect_response2.stub!(:content_type).and_return("text/xml")
      @mock_http_redirect_response2.stub!(:[]).with("Content-Type").and_return(@default_headers["Content-Type"])
      @mock_http_redirect_response2.stub!(:[]).with("location").and_return("http://redirected-to.com/redirected.html")
      @mock_http_redirect_response2.stub!(:content_length).and_return(2048)
      @mock_http_redirect_response2.stub!(:body).and_return("redirected body")
      @mock_http_redirect_response2.stub!(:to_hash).and_return(@default_headers)

    end

    it "should generate a cobweb object" do
      CobWeb.new.should be_an_instance_of CobWeb
    end

    describe "get" do
      it "should return a hash with default values" do
        @cobweb.get(@base_url).should be_an_instance_of Hash
      end

      it "should return a hash with default values without quiet option" do
        # uses a crawler built without :quiet, as the description says
        CobWeb.new.get(@base_url).should be_an_instance_of Hash
      end

      it "should raise exception if there is no url" do
        lambda {@cobweb.get(nil)}.should raise_error("url cannot be nil")
      end

      describe "content object" do

        it "should return the url" do
          @cobweb.get(@base_url)[:url].should == @base_url
        end

        it "should return correct content-types" do
          @mock_http_response.stub!(:content_type).and_return("image/jpeg")
          @cobweb.get(@base_url)[:content_type].should == "image/jpeg"
        end

        it "should return correct status-code" do
          @mock_http_response.stub!(:code).and_return(404)
          @cobweb.get(@base_url)[:status_code].should == 404
        end

        it "should return correct character_set" do
          @cobweb.get(@base_url)[:character_set].should == "UTF-8"
        end

        it "should return correct content_length" do
          @cobweb.get(@base_url)[:content_length].should == 1024
        end

        it "should return correct content_body" do
          @cobweb.get(@base_url)[:content_body].should == "asdf"
        end

        it "should return correct location" do
          @cobweb.get(@base_url)[:location].should == nil

          @mock_http_response.stub!(:[]).with("location").and_return("http://google.com/")
          @cobweb.get(@base_url)[:location].should == "http://google.com/"
        end

        it "should return correct headers" do
          @cobweb.get(@base_url)[:headers].should == @default_headers
        end

        it "should return correct a hash of links" do
          @cobweb.get(@base_url)[:links].should be_an_instance_of Hash
        end

        it "should return the response time for the url" do
          @cobweb.get(@base_url)[:response_time].should be_an_instance_of Float
        end

      end

      describe "with redirect" do

        before(:each) do
          @base_url = "http://redirect-me.com/redirect.html"
          @cobweb = CobWeb.new(:follow_redirects => true, :quiet => true)
        end

        it "should flow through redirect" do

          @mock_http_client.should_receive(:request).with(@mock_http_redirect_request).and_return(@mock_http_redirect_response)

          content = @cobweb.get(@base_url)
          content.should be_an_instance_of Hash

          content[:url].should == "http://redirect-me.com/redirect.html"
          content[:redirect_through].length.should == 2
          content[:content_type].should == "text/html"
          content[:content_body].should == "asdf"

        end

        it "should return the path followed" do
          @mock_http_client.should_receive(:request).with(@mock_http_redirect_request).and_return(@mock_http_redirect_response)

          content = @cobweb.get(@base_url)
          content[:redirect_through].should == ["http://redirected-to.com/redirect2.html", "http://redirected-to.com/redirected.html"]

        end

        it "should not follow with redirect disabled" do
          @cobweb = CobWeb.new(:follow_redirects => false)
          @mock_http_client.should_receive(:request).with(@mock_http_redirect_request).and_return(@mock_http_redirect_response)

          content = @cobweb.get(@base_url)
          content[:url].should == "http://redirect-me.com/redirect.html"
          content[:redirect_through].should be_nil
          content[:status_code].should == 301
          content[:content_type].should == "text/xml"
          content[:content_body].should == "redirected body"

        end
      end
    end
  end

  # NOTE(review): nothing is stubbed in this group, so the last example makes
  # a real request to test.com and needs network access.
  describe "without mock" do
    it "should throw invalid url exception for an invalid url" do
      lambda {@cobweb.get("asdgas asv\"£%\"^%&*%")}.should raise_error URI::InvalidURIError
    end

    it "should throw exception when server is unavailable" #do
    #  lambda {@cobweb.get({:url => "http://www.oasdjgoisadjgoisdiog.com"})}.should raise_error URI::InvalidURIError
    #end

    it "should return a valid content hash when url doesn't exist on a live server" do
      status_code = @cobweb.get("http://test.com/laskdjflsdajf")[:status_code]
      status_code.should == 404
    end

  end
end
@@ -0,0 +1,104 @@
1
+ require File.expand_path(File.dirname(__FILE__) + '/../spec_helper')
2
+ require File.expand_path(File.dirname(__FILE__) + '/../../lib/content_link_parser.rb')
3
+
4
# Specs for ContentLinkParser, run against samples/sample_html_links.html.
describe ContentLinkParser do

  before(:each) do
    @base_url = "http://www.baseurl.com/"
    @content = File.read(File.dirname(__FILE__) + "/../samples/sample_html_links.html")
    @content_parser = ContentLinkParser.new("http://sample-links.com/", @content)
  end

  it "should load the sample document" do
    @content.should_not be_nil
    @content.should_not be_empty
  end

  it "should create a content link parser" do
    @content_parser.should_not be_nil
    @content_parser.should be_an_instance_of ContentLinkParser
  end

  describe "using default tags" do
    # [reader method, group label, label used in the example name, links expected]
    [
      [:links,   "general", "links",         4],
      [:images,  "image",   "image links",   1],
      [:related, "related", "related links", 2],
      [:scripts, "script",  "script links",  1],
      [:styles,  "style",   "style links",   3]
    ].each do |reader, group, what, expected_count|
      describe "returning #{group} links" do
        it "should return some #{what} from the sample data" do
          found = @content_parser.send(reader)
          found.should_not be_nil
          found.should_not be_empty
        end

        it "should return the correct links" do
          found = @content_parser.send(reader)
          found.length.should == expected_count
        end
      end
    end

    describe "returning unknown link type" do
      it "should return an empty array" do
        found = @content_parser.asdfasdfsadf
        found.should_not be_nil
        found.should be_an_instance_of Array
      end
    end
  end

  describe "returning all link data" do
    it "should return a hash with all link data" do
      everything = @content_parser.link_data
      everything.should_not be_nil
      everything.should be_an_instance_of Hash

      everything.keys.length.should == 5
      everything[:links].length.should == 4
    end
  end

  describe "ignoring default tags" do
    it "should not return any links" do
      bare_parser = ContentLinkParser.new("http://sample-links.com", @content, :ignore_default_tags => true)
      bare_parser.links.should be_empty
    end
  end
end
@@ -0,0 +1,24 @@
1
+ require File.expand_path(File.dirname(__FILE__) + '/../spec_helper')
2
+
3
+
4
# Smoke spec: only checks that a crawler can be built; CrawlJob.perform itself
# is not exercised here.
describe CrawlJob do

  before(:each) do
    @base_url = "http://www.baseurl.com/"

    # CobWeb.new takes an options hash, not a url. The previous setup also
    # referenced Net::HTTPClient, which is not defined, and made a live request.
    @cobweb = CobWeb.new(:quiet => true)
  end

  it "should be a cobweb type" do
    @cobweb.should be_an_instance_of CobWeb
  end

end
@@ -0,0 +1,34 @@
1
+ <html>
2
+ <head>
3
+ <title>Sample HTML Document With all types of links</title>
4
+ <meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1">
5
+ <meta name="description" content="Information for people running web search indexing robots, and web site managers attempting to understand what's going on when a robot visits their site.">
6
+ <meta name="keywords" content="robots, crawlers, crawling, spiders, index, indexing, indexers, gatherers, search engines, searching, FAQ, checklist">
7
+ <meta name="DC.date.modified" content="2003-07-18">
8
+ <meta http-equiv="refresh" content="http://sampleurl-metarefresh.com/"/>
9
+
10
+ <link rel="stylesheet" type="text/css" href="http://sampleurl-linkcss/" />
11
+ <link rel="home" type="text/html" href="http://sampleurl-linkhome/" />
12
+ <script type="text/javascript" src="script.js"></script>
13
+
14
+ <STYLE TYPE="text/css" MEDIA="screen, projection">
15
+ <!--
16
+ @import url(http://www.htmlhelp.com/style.css);
17
+ @import url(/stylesheets/punk.css);
18
+ DT { background: yellow; color: black }
19
+ -->
20
+ </STYLE>
21
+
22
+ </head>
23
+
24
+ <body bgcolor="#FFFFFF"><!-- #BeginLibraryItem "/Library/navtop.lbi" --></p>
25
+
26
+ <a href="http://sampleurl-a.com/">Click Here for Sample URL 1</a>
27
+ <frameset><frame src="http://sampleurl-frame.com/"></frame></frameset>
28
+
29
+ <map id="testmap"><area href="http://sampleurl-area"></area>></map>
30
+
31
+ <img src="http://sampleurl-img/"/>
32
+
33
+ </body>
34
+ </html>
data/spec/spec.opts ADDED
@@ -0,0 +1,2 @@
1
+ --colour
2
+ --format specdoc
@@ -0,0 +1 @@
1
+ require File.expand_path(File.dirname(__FILE__) + '/../lib/cobweb')
metadata ADDED
@@ -0,0 +1,133 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: cobweb
3
+ version: !ruby/object:Gem::Version
4
+ hash: 29
5
+ prerelease: false
6
+ segments:
7
+ - 0
8
+ - 0
9
+ - 1
10
+ version: 0.0.1
11
+ platform: ruby
12
+ authors:
13
+ - Stewart McKee
14
+ autorequire:
15
+ bindir: bin
16
+ cert_chain: []
17
+
18
+ date: 2010-11-10 00:00:00 +00:00
19
+ default_executable:
20
+ dependencies:
21
+ - !ruby/object:Gem::Dependency
22
+ name: resque
23
+ prerelease: false
24
+ requirement: &id001 !ruby/object:Gem::Requirement
25
+ none: false
26
+ requirements:
27
+ - - ">="
28
+ - !ruby/object:Gem::Version
29
+ hash: 3
30
+ segments:
31
+ - 0
32
+ version: "0"
33
+ type: :runtime
34
+ version_requirements: *id001
35
+ - !ruby/object:Gem::Dependency
36
+ name: redis
37
+ prerelease: false
38
+ requirement: &id002 !ruby/object:Gem::Requirement
39
+ none: false
40
+ requirements:
41
+ - - ">="
42
+ - !ruby/object:Gem::Version
43
+ hash: 3
44
+ segments:
45
+ - 0
46
+ version: "0"
47
+ type: :runtime
48
+ version_requirements: *id002
49
+ - !ruby/object:Gem::Dependency
50
+ name: absolutize
51
+ prerelease: false
52
+ requirement: &id003 !ruby/object:Gem::Requirement
53
+ none: false
54
+ requirements:
55
+ - - ">="
56
+ - !ruby/object:Gem::Version
57
+ hash: 3
58
+ segments:
59
+ - 0
60
+ version: "0"
61
+ type: :runtime
62
+ version_requirements: *id003
63
+ - !ruby/object:Gem::Dependency
64
+ name: nokogiri
65
+ prerelease: false
66
+ requirement: &id004 !ruby/object:Gem::Requirement
67
+ none: false
68
+ requirements:
69
+ - - ">="
70
+ - !ruby/object:Gem::Version
71
+ hash: 3
72
+ segments:
73
+ - 0
74
+ version: "0"
75
+ type: :runtime
76
+ version_requirements: *id004
77
+ description:
78
+ email: stewart@rockwellcottage.com
79
+ executables: []
80
+
81
+ extensions: []
82
+
83
+ extra_rdoc_files:
84
+ - README.textile
85
+ files:
86
+ - spec/samples/sample_html_links.html
87
+ - spec/spec.opts
88
+ - spec/spec_helper.rb
89
+ - spec/cobweb/content_link_parser_spec.rb
90
+ - spec/cobweb/cobweb_spec.rb
91
+ - spec/cobweb/crawl_job_spec.rb
92
+ - lib/namespaced_redis.rb
93
+ - lib/cobweb.rb
94
+ - lib/content_process_job.rb
95
+ - lib/content_link_parser.rb
96
+ - lib/crawl_job.rb
97
+ - README.textile
98
+ has_rdoc: false
99
+ homepage: http://github.com/stewartmckee/cobweb
100
+ licenses: []
101
+
102
+ post_install_message:
103
+ rdoc_options: []
104
+
105
+ require_paths:
106
+ - lib
107
+ required_ruby_version: !ruby/object:Gem::Requirement
108
+ none: false
109
+ requirements:
110
+ - - ">="
111
+ - !ruby/object:Gem::Version
112
+ hash: 3
113
+ segments:
114
+ - 0
115
+ version: "0"
116
+ required_rubygems_version: !ruby/object:Gem::Requirement
117
+ none: false
118
+ requirements:
119
+ - - ">="
120
+ - !ruby/object:Gem::Version
121
+ hash: 3
122
+ segments:
123
+ - 0
124
+ version: "0"
125
+ requirements: []
126
+
127
+ rubyforge_project:
128
+ rubygems_version: 1.3.7
129
+ signing_key:
130
+ specification_version: 3
131
+ summary: Crawler utilizing resque
132
+ test_files: []
133
+