rider 0.2
Sign up to get free protection for your applications and to get access to all the features.
- data/README +0 -0
- data/Rakefile +1 -0
- data/bin/crawl +21 -0
- data/lib/rider.rb +36 -0
- data/lib/rider/crawler.rb +102 -0
- data/lib/rider/part_queue.rb +85 -0
- data/lib/rider/queue.rb +40 -0
- data/spec/crawler_spec.rb +94 -0
- data/spec/data/apples.html +23 -0
- data/spec/data/colors.html +24 -0
- data/spec/data/fruits.html +17 -0
- data/spec/data/notitle.html +14 -0
- data/spec/data/prices.html +34 -0
- data/spec/data/tiny.html +1 -0
- data/spec/part_queue_spec.rb +40 -0
- data/spec/queue_spec.rb +43 -0
- data/spec/spec_helper.rb +1 -0
- data/tasks/deployment.rake +25 -0
- data/tasks/environment.rake +7 -0
- data/tasks/rspec.rake +9 -0
- metadata +95 -0
data/README
ADDED
File without changes
|
data/Rakefile
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
Dir['tasks/**/*.rake'].each { |rake| load rake }
|
#!/usr/bin/env ruby
# Command-line entry point: crawls every URL stored in the named queue file
# and prints a short report for each document retrieved.
#
# Usage: crawl QUEUE_NAME

require 'lib/rider'

queue_name = ARGV[0]
queue = Rider::Queue.new(queue_name)
puts "Crawling URLs from #{queue.filename}"

# will crawl all URLs (the empty mask // matches everything)
crawler = Rider::Crawler.new(//, queue)

# each_document yields [uri, metadata, contents]; the array auto-splats into
# the three block parameters.
crawler.each_document do |uri, metadata, contents|
  puts "-"*60
  puts "URL: #{uri.to_s}"
  puts "Metadata: #{metadata.inspect}"
  puts "Contents excerpt: #{contents[0..250]}"
  puts
end

puts
puts "Crawl finished"
$:.unshift File.dirname(__FILE__)

require 'rubygems'
require 'logger'
require 'mechanize'
require 'timeout'
require 'yaml'

require 'rider/queue'
require 'rider/part_queue'
require 'rider/crawler'

# Ruby 1.8 global flag: treat strings as UTF-8 (it has no effect on >= 1.9).
$KCODE = 'u'

# Rider: a small Ruby Web crawler.
module Rider
  # NOTE(review): the gemspec declares version 0.2 but this constant says
  # 0.1 — confirm which is current.
  VERSION = '0.1'
  # Single shared logger for the whole library, written to standard output.
  LOGGER = Logger.new(STDOUT)
  LOGGER.level = Logger::DEBUG


  # Returns the shared Logger instance (callable as Rider.log).
  def log
    LOGGER
  end
  module_function :log

  # Resolves +link+ (an href value) into an absolute URI against +uri+ (the
  # page the link appeared on). Fragments (#...) are stripped first. Returns
  # nil for empty links, otherwise a URI whose path is never blank ('/' is
  # substituted for an empty path).
  def to_absolute(uri, link)
    # Drop any trailing #fragment, then percent-escape the remainder.
    # NOTE(review): URI.encode is deprecated and removed in Ruby 3 — fine for
    # the 1.8-era code this file targets, but needs replacing on upgrade.
    link = URI.encode(link.to_s.gsub(/#[a-zA-Z0-9_-]*$/,''))
    return nil if link.nil? or link.empty?

    relative = URI(link)
    absolute = uri.merge(relative)

    absolute.path = '/' if absolute.path.nil? or absolute.path.empty?
    return absolute
  end
end
require 'hpricot'

module Rider
  # Fetches documents for URLs pulled from a queue, restricted by a Regexp
  # mask, remembering already-visited URLs so each is fetched only once.
  class Crawler
    # Creates a new Crawler, with the specified +mask+ (a Regexp) and queue (a +Rider::Queue+ instance).
    def initialize(mask, queue)
      @mask = mask
      @queue = queue
      @seen_urls = []
      @www = WWW::Mechanize.new do |a|
        a.log = Logger.new("tmp/www.log")
        a.pluggable_parser.default = Hpricot
      end
    end

    # Returns true if +url+ passes the +mask+.
    def match_mask?(url)
      @mask.match(url) != nil
    end

    # Crawls documents and passes their URL, response headers, and data to the
    # supplied block. If the block returns an array of URLs, any followable
    # ones are pushed back onto the queue.
    def each_document
      while doc_data = next_document()
        follow_urls = yield(doc_data) || []
        add_follow_urls(follow_urls)
      end
    end

    # Queues every URL in +urls+ that passes follow_url?.
    def add_follow_urls(urls)
      urls.each { |url| @queue.push(url) if follow_url?(url) }
    end

    # A URL is followable when it matches the mask and has not been seen yet.
    def follow_url?(url)
      match_mask?(url) and !seen_url?(url)
    end

    # Transient network/parse failures: skip the current URL and move on
    # instead of aborting the whole crawl.
    SKIPPABLE_EXCEPTIONS = [Errno::ETIMEDOUT, WWW::Mechanize::ResponseCodeError, Errno::EHOSTUNREACH, SocketError,
                            Errno::ECONNREFUSED, Timeout::Error, Net::HTTPBadResponse, Hpricot::ParseError]

    # Returns the next retrievable document from the next valid URL in the
    # queue, or nil when the queue is exhausted.
    def next_document
      begin
        url = next_url()
        return nil if url.nil?
        doc_data = get(url)
        saw_url(url)
        return doc_data
      rescue *SKIPPABLE_EXCEPTIONS => ex
        # FIX: previously `rescue Exception` with a manual class check, which
        # intercepted signals/SystemExit before re-raising. Rescue only the
        # skippable errors; everything else now propagates untouched.
        Rider.log.debug("EXCEPTION: #{ex.inspect}, skipping...")
        retry # go on to the next document
      end
    end

    # Gets the document at the specified +url+ (a String).
    # Returns an Array [uri, metadata, contents].
    def get(url)
      uri = URI.parse(url)
      # Hard cap on any single retrieval so one slow host can't stall the crawl.
      Timeout::timeout(8, Timeout::Error) do
        case uri.scheme
        when 'http'
          get_http(uri)
        when 'file'
          get_file(uri)
        else
          raise(ArgumentError, "don't know how to get #{url}")
        end
      end
    end

    # Reads a file:// document from the local filesystem. +uri+ may be a URI
    # object (as passed by #get) or a String. Returns [uri, {}, contents].
    def get_file(uri)
      # BUG FIX: #get passes a URI object, and URI has no #gsub — stringify
      # first so both String and URI callers work.
      filename = uri.to_s.gsub(/^file:\/\//, '')
      [uri, {}, File.read(filename)]
    end

    # Fetches an http:// document via Mechanize.
    # Returns [uri, response-headers, page].
    def get_http(uri)
      page = @www.get(uri)
      meta = page.response
      [uri, meta, page]
    end

    # Retrieves the next URL in the queue that matches the +mask+ and hasn't
    # been seen; nil once the queue runs out.
    def next_url
      while url = @queue.shift
        return url if valid_url?(url)
      end
    end

    # A queued URL is worth fetching when it is unseen and matches the mask.
    def valid_url?(url)
      !seen_url?(url) && match_mask?(url)
    end

    def seen_url?(url)
      @seen_urls.include?(url)
    end

    # Records +url+ as visited.
    def saw_url(url)
      @seen_urls << url
    end
  end
end
module Rider
  # A FIFO queue of URLs that round-robins across hosts: consecutive shifts
  # return URLs from different hosts where possible, so no single host is hit
  # with back-to-back requests.
  class HostPartitionedQueue
    attr_reader :name

    # +name+ identifies the queue and determines its serialization file
    # (tmp/<name>.q).
    def initialize(name)
      @name = name
      clear
    end

    # Appends +url+ to the bucket for its host. Returns true.
    def push(url)
      host = get_host(url)
      @hosts << host unless @hosts.include?(host)
      @urls_by_host[host] ||= []
      @urls_by_host[host] << url
      return true
    end

    # Removes and returns the next URL in round-robin host order, or nil when
    # the queue is empty. A host is dropped once its bucket is exhausted.
    def shift
      if empty?
        Rider.log.debug("Q #{name} POP nil")
        return nil
      end
      host = @hosts[@current_host_index]
      url = @urls_by_host[host].shift

      if @urls_by_host[host].empty?
        @hosts.delete_at(@current_host_index)
        @urls_by_host.delete(host)
        # Deleting shifts the later hosts down one place, so the current index
        # already points at the next host; it only needs adjusting when the
        # deleted host was the last element of the array.
        # BUG FIX: the old code called increment_current_host_index here,
        # which computed (index + 1) % length and skipped earlier hosts when
        # more than one host remained; wrapping must always land on index 0.
        @current_host_index = 0 if @current_host_index >= @hosts.length
      else
        increment_current_host_index
      end
      return url
    end

    # Resets the queue to an empty state.
    def clear
      @urls_by_host = {}
      @hosts = []
      @current_host_index = 0
    end

    # True when no hosts have queued URLs.
    def empty?
      @hosts.empty?
    end

    # Two queues are equal when their URL buckets, host order, and cursor all
    # match (name is deliberately ignored).
    def ==(another_queue)
      another_queue.instance_variable_get("@urls_by_host") == @urls_by_host &&
      another_queue.instance_variable_get("@hosts") == @hosts &&
      another_queue.instance_variable_get("@current_host_index") == @current_host_index
    end

    # Writes the queue state to tmp/<name>.q as YAML.
    def serialize
      File.open(filename, 'w') do |file|
        file.write(self.to_yaml)
      end
    end

    # Reads a queue back from tmp/<name>.q; returns nil if the file does not
    # exist.
    def self.unserialize(name)
      filename = "tmp/#{name}.q"
      return nil unless File.exist?(filename)
      YAML.load_file(filename)
    end

    private
    # Host component of +url+ (e.g. "example.com").
    def get_host(url)
      URI.parse(url).host
    end

    # Advances the cursor by one, wrapping to the front of @hosts.
    def increment_current_host_index
      if @hosts.length == 0
        @current_host_index = 0
      else
        # increment by one but go back to 0 if it exceeds the length of the array
        @current_host_index = (@current_host_index + 1) % @hosts.length
      end
    end

    # Path of the serialization file for this queue.
    def filename
      "tmp/#{name}.q"
    end
  end
end
module Rider
  # A simple persistent FIFO queue backed by a plain-text file, one item per
  # line. Pushes append to the file; shifts rewrite it minus the first line.
  class Queue
    attr_reader :filename
    # +filename+ is the path of the backing file; it is created on first push.
    def initialize(filename)
      raise(ArgumentError, "queues must have a filename") if !filename or filename.empty?
      @filename = filename
    end

    # Appends +item+ to the end of the queue. Returns true.
    def push(item)
      # FIX: log interpolation was mangled in the published diff; reconstruct
      # it from the queue's identifying attribute.
      Rider.log.debug("Q #{filename} PUSH #{item}")
      File.open(filename, "a") do |file|
        file.puts(item)
      end
      return true
    end

    # Removes and returns the first item (stripped of its newline), or nil
    # when the queue is empty. Rewrites the whole file, so O(queue size).
    def shift
      if empty?
        Rider.log.debug("Q #{filename} SHIFT nil")
        return nil
      end
      lines = File.readlines(filename)
      item = lines.shift.strip
      File.open(filename, "w") do |file|
        file.write(lines.join)
      end
      Rider.log.debug("Q #{filename} SHIFT #{item}")
      return item
    end

    # Deletes the backing file, emptying the queue. Returns true.
    def clear
      File.unlink(filename) if File.exist?(filename)
      return true
    end

    # True when the backing file is missing or has no contents.
    def empty?
      # BUG FIX: File.open(filename).read leaked the file handle; File.read
      # reads and closes in one call.
      !File.exist?(filename) or File.read(filename) == ""
    end
  end
end
require 'spec/spec_helper'

# Specs for Rider::Crawler.
# NOTE(review): the http:// examples expect a web server at
# http://localhost/simplewikipedia/... to be running — they are integration
# tests, not isolated unit tests.
describe Rider::Crawler do
  before do
    @queue = Rider::Queue.new('web')
    # Mask limits the crawl to localhost URLs.
    @crawler = Rider::Crawler.new(/http:\/\/localhost/, @queue)
  end

  describe "when checking URLs against mask" do
    it "should return true for a URL that matches the mask" do
      @crawler.match_mask?("http://localhost/some/path").should == true
    end

    it "should return false for a URL that does not match the mask" do
      @crawler.match_mask?("http://example.com/some/path").should == false
    end
  end

  describe "when checking URL validity" do
    before do
      @urls = %w(http://example.com/invalid http://localhost/valid http://localhost/valid/unseen)
    end

    it "should return URLs matching the mask" do
      @urls.select { |url| @crawler.valid_url?(url) }.should == ["http://localhost/valid", "http://localhost/valid/unseen"]
    end

    it "should return only unseen URLs" do
      # Marking a URL as seen should exclude it from the valid set.
      @crawler.saw_url('http://localhost/valid')
      @urls.select { |url| @crawler.valid_url?(url) }.should == ['http://localhost/valid/unseen']
    end
  end

  describe "when determining URLs to follow" do
    it "should follow URLs that match the mask" do
      @crawler.follow_url?('http://localhost/abc').should == true
    end

    it "should not follow URLs that don't match the mask" do
      @crawler.follow_url?('http://invalid.com').should == false
    end

    # Pending examples (no body yet).
    it "should follow URLs that haven't been seen"
    it "should not follow URLs that have been seen already"
  end

  describe "when getting the next document" do

  end

  describe "when getting documents" do
    it "should raise an error for schemes other than http and file" do
      lambda { @crawler.get('ftp://example.com') }.should raise_error(ArgumentError)
    end

    describe "when getting file:// documents" do
      before do
        # Fixture file shipped alongside the specs.
        @filename = File.expand_path(File.join(File.dirname(__FILE__), 'data', 'apples.html'))
        @file_uri = 'file://' + @filename
      end

      it "should return an array whose first element is the uri" do
        @crawler.get_file(@file_uri)[0].should == @file_uri
      end

      it "should return an array whose second element is blank metadata" do
        @crawler.get_file(@file_uri)[1].should == {}
      end

      it "should return an array whose third element is the file contents" do
        @crawler.get_file(@file_uri)[2].should == File.read(@filename)
      end
    end

    describe "when getting http:// documents" do
      before do
        @doc_uri = 'http://localhost/simplewikipedia/articles/a/l/g/Algebra.html'
      end

      it "should return an array whose first element is the uri" do
        @crawler.get_http(@doc_uri)[0].should == @doc_uri
      end

      it "should return an array whose second element is blank metadata" do
        meta = @crawler.get_http(@doc_uri)[1]
        meta['Content-type'].should == 'text/html'
      end

      it "should return an array whose third element is the file contents" do
        @crawler.get_http(@doc_uri)[2].match(/Algebra is taught in school/).should_not == nil
      end
    end
  end
end
@@ -0,0 +1,23 @@
|
|
1
|
+
<?xml version="1.0" encoding="UTF-8"?>
|
2
|
+
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN"
|
3
|
+
"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">
|
4
|
+
|
5
|
+
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en">
|
6
|
+
<head>
|
7
|
+
<title>Apples</title>
|
8
|
+
|
9
|
+
</head>
|
10
|
+
|
11
|
+
<body>
|
12
|
+
|
13
|
+
<h1>Apples</h1>
|
14
|
+
|
15
|
+
<p>
|
16
|
+
Some apples are <a href="colors.html">red</a>.
|
17
|
+
Some are <a href="colors.html">green</a>.
|
18
|
+
They <a href="prices.html">do not cost much money</a>.
|
19
|
+
You can <a href="http://buyapples.com/">buy apples</a>.
|
20
|
+
</p>
|
21
|
+
|
22
|
+
</body>
|
23
|
+
</html>
|
@@ -0,0 +1,24 @@
|
|
1
|
+
<?xml version="1.0" encoding="UTF-8"?>
|
2
|
+
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN"
|
3
|
+
"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">
|
4
|
+
|
5
|
+
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en">
|
6
|
+
<head>
|
7
|
+
<title>Colors</title>
|
8
|
+
|
9
|
+
</head>
|
10
|
+
|
11
|
+
<body>
|
12
|
+
|
13
|
+
<h1>Colors</h1>
|
14
|
+
|
15
|
+
<dl>
|
16
|
+
<dt>Red</dt>
|
17
|
+
<dd><a href="apples.html">Apples</a></dd>
|
18
|
+
|
19
|
+
<dt>Green</dt>
|
20
|
+
<dd><a href="apples.html">Apples</a> or <a href="prices.html">dollars</a>.</dd>
|
21
|
+
</dl>
|
22
|
+
|
23
|
+
</body>
|
24
|
+
</html>
|
@@ -0,0 +1,17 @@
|
|
1
|
+
<?xml version="1.0" encoding="UTF-8"?>
|
2
|
+
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN"
|
3
|
+
"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">
|
4
|
+
|
5
|
+
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en">
|
6
|
+
<head>
|
7
|
+
<title>Fruits</title>
|
8
|
+
|
9
|
+
</head>
|
10
|
+
|
11
|
+
<body>
|
12
|
+
|
13
|
+
<p><a href="apples.html">Apples</a> are a fruit.</a></p>
|
14
|
+
|
15
|
+
|
16
|
+
</body>
|
17
|
+
</html>
|
@@ -0,0 +1,14 @@
|
|
1
|
+
<?xml version="1.0" encoding="UTF-8"?>
|
2
|
+
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN"
|
3
|
+
"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">
|
4
|
+
|
5
|
+
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en">
|
6
|
+
<head>
|
7
|
+
<title></title>
|
8
|
+
|
9
|
+
</head>
|
10
|
+
|
11
|
+
<body>
|
12
|
+
|
13
|
+
</body>
|
14
|
+
</html>
|
@@ -0,0 +1,34 @@
|
|
1
|
+
<?xml version="1.0" encoding="UTF-8"?>
|
2
|
+
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN"
|
3
|
+
"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">
|
4
|
+
|
5
|
+
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en">
|
6
|
+
<head>
|
7
|
+
<title>Prices</title>
|
8
|
+
|
9
|
+
</head>
|
10
|
+
|
11
|
+
<body>
|
12
|
+
|
13
|
+
<h1>Prices</h1>
|
14
|
+
|
15
|
+
<table>
|
16
|
+
<tr>
|
17
|
+
<th>Item</th>
|
18
|
+
<th>Price</th>
|
19
|
+
</tr>
|
20
|
+
|
21
|
+
<tr>
|
22
|
+
<td><a href="apples.html">Apples</a></td>
|
23
|
+
<td>$0.35</td>
|
24
|
+
</tr>
|
25
|
+
|
26
|
+
<tr>
|
27
|
+
<td><a href="colors.html">Colors</a></td>
|
28
|
+
<td>N/A</td>
|
29
|
+
</tr>
|
30
|
+
</table>
|
31
|
+
|
32
|
+
|
33
|
+
</body>
|
34
|
+
</html>
|
data/spec/data/tiny.html
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
<html><head><title>asdf</title></head><body>asdf</body></html>
|
require 'spec/spec_helper'
# Pulls in the "queue" shared example group defined in queue_spec.
require 'spec/queue_spec'

describe Rider::HostPartitionedQueue do
  it_should_behave_like "queue"

  before do
    @q = Rider::HostPartitionedQueue.new('test')
  end

  # Round-robin: consecutive shifts rotate through distinct hosts before
  # returning to a host's remaining URLs.
  it "should alternate among hosts when shifting" do
    %w(http://example.com/path1 http://example.com/path2 http://example.net/ http://localhost/path).each { |u| @q.push(u) }
    [@q.shift, @q.shift, @q.shift, @q.shift].should ==
      %w(http://example.com/path1 http://example.net/ http://localhost/path http://example.com/path2)
  end

  it "should return the same host if only one distinct host exists" do
    %w(http://example.com/path1 http://example.com/path2 http://example.com/path3).each { |u| @q.push(u) }
    [@q.shift, @q.shift, @q.shift].should == %w(http://example.com/path1 http://example.com/path2 http://example.com/path3)
  end

  # Equality compares internal state only, so the differing names don't matter.
  it "should be equal to another queue with the same objects and state" do
    @q2 = Rider::HostPartitionedQueue.new('test2')
    %w(http://example.com/path1 http://example.com/path2 http://example.net/ http://localhost/path).each { |u| @q.push(u) }
    %w(http://example.com/path1 http://example.com/path2 http://example.net/ http://localhost/path).each { |u| @q2.push(u) }
    @q.should == @q2
  end

  describe "when serializing" do
    # Round-trips the queue through its YAML file (tmp/test.q).
    it "should write and read itself back" do
      %w(http://example.com/path1 http://example.com/path2 http://example.net/ http://localhost/path).each { |u| @q.push(u) }
      @q.serialize
      Rider::HostPartitionedQueue.unserialize('test').should == @q
    end

    it "should return nil if asked to unserialize from a nonexistent file" do
      Rider::HostPartitionedQueue.unserialize('nonexistent').should == nil
    end
  end
end
require 'spec/spec_helper'

# Shared examples describing behaviour every Rider queue implementation must
# satisfy. Including groups are expected to set up a queue instance in @q.
shared_examples_for "queue" do
  it "must not have a blank or nil name" do
    lambda { Rider::Queue.new(nil) }.should raise_error(ArgumentError)
    lambda { Rider::Queue.new('') }.should raise_error(ArgumentError)
  end

  it "should be empty after clearing" do
    @q.clear
    @q.empty?.should == true
  end

  it "should push then shift one item" do
    @q.push('blue')
    @q.shift.should == 'blue'
  end

  describe "when empty" do
    it "should return nil if shifted" do
      @q.shift.should == nil
    end
  end

  # Pending example (no body yet).
  it "should not clobber the queue upon initialization"
end

describe Rider::Queue do
  before do
    @q = Rider::Queue.new('tmp/colors.q')
    @q.clear
  end

  after do
    @q.clear
  end

  it "should push then shift multiple items" do
    %w(red green orange).each { |color| @q.push(color) }
    # FIX: removed a stray debugging `puts "POP x 3"` that polluted the spec
    # run's output; it had no effect on the assertion.
    [@q.shift, @q.shift, @q.shift].should == %w(red green orange)
  end
end
data/spec/spec_helper.rb
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
require 'lib/rider'
|
# Rake tasks for generating and publishing the project's RDoc documentation.
namespace "doc" do
  desc "Generate RDoc docs"
  task :generate do
    # Using rake/rdoctask invoked old rdoc 1.x for some reason, but this invokes rdoc 2.x
    sh "rdoc --all --title 'Rider - Ruby Web crawler' --line-numbers --inline-source --force-update --all --charset utf-8 --main README README lib/"
  end

  desc "Upload docs to site"
  task :upload do
    # Tarball the generated doc/ directory, copy it to the server, unpack it
    # remotely, then remove the local tarball.
    sh "tar czfv rider-rdoc.tgz doc/"
    puts
    puts "Going to upload..."
    puts
    sh "scp rider-rdoc.tgz cardinal.stanford.edu:WWW/rider/"
    sh "ssh cardinal.stanford.edu 'cd WWW/rider;tar xzfv rider-rdoc.tgz'"
    sh "rm rider-rdoc.tgz"
    puts
    puts "Upload complete"
  end

  desc "Generate & upload"
  task :update=>[:generate, :upload]
end
data/tasks/rspec.rake
ADDED
metadata
ADDED
@@ -0,0 +1,95 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: rider
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: "0.2"
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Quinn Slack
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
|
12
|
+
date: 2009-10-07 00:00:00 -03:00
|
13
|
+
default_executable:
|
14
|
+
dependencies:
|
15
|
+
- !ruby/object:Gem::Dependency
|
16
|
+
name: hpricot
|
17
|
+
type: :runtime
|
18
|
+
version_requirement:
|
19
|
+
version_requirements: !ruby/object:Gem::Requirement
|
20
|
+
requirements:
|
21
|
+
- - ">="
|
22
|
+
- !ruby/object:Gem::Version
|
23
|
+
version: 0.8.1
|
24
|
+
version:
|
25
|
+
- !ruby/object:Gem::Dependency
|
26
|
+
name: mechanize
|
27
|
+
type: :runtime
|
28
|
+
version_requirement:
|
29
|
+
version_requirements: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - ">="
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: 0.9.3
|
34
|
+
version:
|
35
|
+
description: Ruby Web crawler
|
36
|
+
email: me@rafaelss.com
|
37
|
+
executables: []
|
38
|
+
|
39
|
+
extensions: []
|
40
|
+
|
41
|
+
extra_rdoc_files: []
|
42
|
+
|
43
|
+
files:
|
44
|
+
- bin/crawl
|
45
|
+
- lib/rider/crawler.rb
|
46
|
+
- lib/rider/part_queue.rb
|
47
|
+
- lib/rider/queue.rb
|
48
|
+
- lib/rider.rb
|
49
|
+
- Rakefile
|
50
|
+
- README
|
51
|
+
- spec/crawler_spec.rb
|
52
|
+
- spec/data/apples.html
|
53
|
+
- spec/data/colors.html
|
54
|
+
- spec/data/fruits.html
|
55
|
+
- spec/data/notitle.html
|
56
|
+
- spec/data/prices.html
|
57
|
+
- spec/data/tiny.html
|
58
|
+
- spec/part_queue_spec.rb
|
59
|
+
- spec/queue_spec.rb
|
60
|
+
- spec/spec_helper.rb
|
61
|
+
- tasks/deployment.rake
|
62
|
+
- tasks/environment.rake
|
63
|
+
- tasks/rspec.rake
|
64
|
+
has_rdoc: true
|
65
|
+
homepage: http://qslack.com/
|
66
|
+
licenses: []
|
67
|
+
|
68
|
+
post_install_message:
|
69
|
+
rdoc_options: []
|
70
|
+
|
71
|
+
require_paths:
|
72
|
+
- lib
|
73
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
74
|
+
requirements:
|
75
|
+
- - ">="
|
76
|
+
- !ruby/object:Gem::Version
|
77
|
+
version: "0"
|
78
|
+
version:
|
79
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
80
|
+
requirements:
|
81
|
+
- - ">="
|
82
|
+
- !ruby/object:Gem::Version
|
83
|
+
version: "0"
|
84
|
+
version:
|
85
|
+
requirements: []
|
86
|
+
|
87
|
+
rubyforge_project:
|
88
|
+
rubygems_version: 1.3.5
|
89
|
+
signing_key:
|
90
|
+
specification_version: 3
|
91
|
+
summary: Ruby Web crawler
|
92
|
+
test_files:
|
93
|
+
- spec/crawler_spec.rb
|
94
|
+
- spec/part_queue_spec.rb
|
95
|
+
- spec/queue_spec.rb
|