rider 0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README +0 -0
- data/Rakefile +1 -0
- data/bin/crawl +21 -0
- data/lib/rider.rb +36 -0
- data/lib/rider/crawler.rb +102 -0
- data/lib/rider/part_queue.rb +85 -0
- data/lib/rider/queue.rb +40 -0
- data/spec/crawler_spec.rb +94 -0
- data/spec/data/apples.html +23 -0
- data/spec/data/colors.html +24 -0
- data/spec/data/fruits.html +17 -0
- data/spec/data/notitle.html +14 -0
- data/spec/data/prices.html +34 -0
- data/spec/data/tiny.html +1 -0
- data/spec/part_queue_spec.rb +40 -0
- data/spec/queue_spec.rb +43 -0
- data/spec/spec_helper.rb +1 -0
- data/tasks/deployment.rake +25 -0
- data/tasks/environment.rake +7 -0
- data/tasks/rspec.rake +9 -0
- metadata +95 -0
data/README
ADDED
File without changes
data/Rakefile
ADDED
@@ -0,0 +1 @@
+Dir['tasks/**/*.rake'].each { |rake| load rake }
data/bin/crawl
ADDED
@@ -0,0 +1,21 @@
+#!/usr/bin/env ruby
+
+require 'lib/rider'
+
+queue_name = ARGV[0]
+queue = Rider::Queue.new(queue_name)
+puts "Crawling URLs from #{queue.filename}"
+
+# will crawl all URLs
+crawler = Rider::Crawler.new(//, queue)
+
+crawler.each_document do |uri, metadata, contents|
+  puts "-"*60
+  puts "URL: #{uri.to_s}"
+  puts "Metadata: #{metadata.inspect}"
+  puts "Contents excerpt: #{contents[0..250]}"
+  puts
+end
+
+puts
+puts "Crawl finished"
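The script takes the queue file name as its only argument, and Rider::Queue stores one URL per line, so a crawl can be seeded with a few lines of Ruby before invoking it. A minimal sketch, assuming Ruby 1.8 with the gem root as the working directory (the queue file name urls.q is hypothetical):

    # Seed a queue file, then run: ruby bin/crawl urls.q
    require 'lib/rider'

    queue = Rider::Queue.new('urls.q')   # hypothetical file name
    queue.push('http://example.com/')    # appends one URL per line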
data/lib/rider.rb
ADDED
@@ -0,0 +1,36 @@
+$:.unshift File.dirname(__FILE__)
+
+require 'rubygems'
+require 'logger'
+require 'mechanize'
+require 'timeout'
+require 'yaml'
+
+require 'rider/queue'
+require 'rider/part_queue'
+require 'rider/crawler'
+
+$KCODE = 'u'
+
+module Rider
+  VERSION = '0.1'
+  LOGGER = Logger.new(STDOUT)
+  LOGGER.level = Logger::DEBUG
+
+
+  def log
+    LOGGER
+  end
+  module_function :log
+
+  def to_absolute(uri, link)
+    link = URI.encode(link.to_s.gsub(/#[a-zA-Z0-9_-]*$/,''))
+    return nil if link.nil? or link.empty?
+
+    relative = URI(link)
+    absolute = uri.merge(relative)
+
+    absolute.path = '/' if absolute.path.nil? or absolute.path.empty?
+    return absolute
+  end
+end
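Note that module_function is applied only to log, so to_absolute as written is reachable only through an include. A sketch of what it computes under that assumption (the LinkResolver class and URLs are illustrative, not part of the gem):

    require 'lib/rider'

    # to_absolute strips the #fragment, then resolves the link against the base URI.
    class LinkResolver
      include Rider   # needed because to_absolute is not a module function
    end

    base = URI.parse('http://example.com/fruits/apples.html')
    LinkResolver.new.to_absolute(base, 'colors.html#red').to_s
    # => "http://example.com/fruits/colors.html"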
data/lib/rider/crawler.rb
ADDED
@@ -0,0 +1,102 @@
+require 'hpricot'
+
+module Rider
+  class Crawler
+    # Creates a new Crawler, with the specified +mask+ (a Regexp) and queue (a +Rider::Queue+ instance).
+    def initialize(mask, queue)
+      @mask = mask
+      @queue = queue
+      @seen_urls = []
+      @www = WWW::Mechanize.new do |a|
+        a.log = Logger.new("tmp/www.log")
+        a.pluggable_parser.default = Hpricot
+      end
+    end
+
+    # Returns true if +url+ passes the +mask+.
+    def match_mask?(url)
+      @mask.match(url) != nil
+    end
+
+    # Crawls documents and passes their URL, response headers, and data to the supplied block.
+    def each_document
+      while doc_data = next_document()
+        follow_urls = yield(doc_data) || []
+        add_follow_urls(follow_urls)
+      end
+    end
+
+    def add_follow_urls(urls)
+      urls.each { |url| @queue.push(url) if follow_url?(url) }
+    end
+
+    def follow_url?(url)
+      match_mask?(url) and !seen_url?(url)
+    end
+
+    SKIPPABLE_EXCEPTIONS = [Errno::ETIMEDOUT, WWW::Mechanize::ResponseCodeError, Errno::EHOSTUNREACH, SocketError,
+                            Errno::ECONNREFUSED, Timeout::Error, Net::HTTPBadResponse, Hpricot::ParseError]
+    # Returns the next retrievable document from the next valid URL in the queue.
+    def next_document
+      begin
+        url = next_url()
+        return nil if url.nil?
+        doc_data = get(url)
+        saw_url(url)
+        return doc_data
+      rescue Exception => ex
+        if SKIPPABLE_EXCEPTIONS.include?(ex.class)
+          Rider.log.debug("EXCEPTION: #{ex.inspect}, skipping...")
+          retry # go on to the next document
+        else
+          raise ex
+        end
+      end
+    end
+
+    # Gets the document at the specified +url+. Returns an Array [uri, metadata, contents]
+    def get(url)
+      uri = URI.parse(url)
+      Timeout::timeout(8, Timeout::Error) do
+        case uri.scheme
+        when 'http'
+          get_http(uri)
+        when 'file'
+          get_file(uri)
+        else
+          raise(ArgumentError, "don't know how to get #{url}")
+        end
+      end
+    end
+
+    def get_file(uri)
+      # uri may arrive as a URI object (from #get) or a String, so normalize first
+      filename = uri.to_s.gsub(/^file:\/\//, '')
+      [uri, {}, File.read(filename)]
+    end
+
+    def get_http(uri)
+      page = @www.get(uri)
+      meta = page.response
+      [uri, meta, page]
+    end
+
+    # Retrieves the next URL in the queue that matches the +mask+.
+    def next_url
+      while url = @queue.shift
+        return url if valid_url?(url)
+      end
+    end
+
+    def valid_url?(url)
+      !seen_url?(url) && match_mask?(url)
+    end
+
+    def seen_url?(url)
+      @seen_urls.include?(url)
+    end
+
+    def saw_url(url)
+      @seen_urls << url
+    end
+  end
+end
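Tying the pieces together: the mask gates both which queued URLs get fetched and which returned URLs are followed, and anything the block yields back is pushed onto the queue. A sketch of the intended loop, assuming a reachable local server, an existing tmp/ directory (Mechanize's log lives there), and a hypothetical queue file tmp/example.q:

    require 'lib/rider'

    queue = Rider::Queue.new('tmp/example.q')
    queue.push('http://localhost/index.html')

    crawler = Rider::Crawler.new(/^http:\/\/localhost\//, queue)
    crawler.each_document do |uri, metadata, contents|
      puts "#{uri}: #{metadata.inspect}"
      []  # return an array of URLs to enqueue for following; none here
    end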
data/lib/rider/part_queue.rb
ADDED
@@ -0,0 +1,85 @@
+module Rider
+  class HostPartitionedQueue
+    attr_reader :name
+
+    def initialize(name)
+      @name = name
+      clear
+    end
+
+    def push(url)
+      host = get_host(url)
+      @hosts << host unless @hosts.include?(host)
+      @urls_by_host[host] ||= []
+      @urls_by_host[host] << url
+      return true
+    end
+
+    def shift
+      if empty?
+        Rider.log.debug("Q #{name} POP nil")
+        return nil
+      end
+      host = @hosts[@current_host_index]
+      url = @urls_by_host[host].shift
+
+      if @urls_by_host[host].empty?
+        @hosts.delete_at(@current_host_index)
+        @urls_by_host.delete(host)
+        # no need to increment @current_host_index since we just effectively pushed every element down by one
+        # by deleting from @hosts, UNLESS it was the last item in the array, in which case that index doesn't
+        # exist anymore
+        increment_current_host_index if @current_host_index == @hosts.length
+      else
+        increment_current_host_index
+      end
+      return url
+    end
+
+    def clear
+      @urls_by_host = {}
+      @hosts = []
+      @current_host_index = 0
+    end
+
+    def empty?
+      @hosts.empty?
+    end
+
+    def ==(another_queue)
+      another_queue.instance_variable_get("@urls_by_host") == @urls_by_host &&
+        another_queue.instance_variable_get("@hosts") == @hosts &&
+        another_queue.instance_variable_get("@current_host_index") == @current_host_index
+    end
+
+    def serialize
+      File.open(filename, 'w') do |file|
+        file.write(self.to_yaml)
+      end
+    end
+
+    def self.unserialize(name)
+      filename = "tmp/#{name}.q"
+      return nil unless File.exist?(filename)
+      YAML.load_file(filename)
+    end
+
+    private
+    def get_host(url)
+      URI.parse(url).host
+    end
+
+    def increment_current_host_index
+      if @hosts.length == 0
+        @current_host_index = 0
+      else
+        # increment by one but go back to 0 if it exceeds the length of the array
+        @current_host_index = (@current_host_index + 1) % @hosts.length
+      end
+    end
+
+    def filename
+      "tmp/#{name}.q"
+    end
+  end
+end
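The host partitioning exists for politeness: consecutive shifts rotate across hosts instead of draining one host's URLs back to back. A small sketch of that rotation (host names are illustrative; requires the gem loaded for Rider.log):

    require 'lib/rider'

    q = Rider::HostPartitionedQueue.new('demo')
    q.push('http://a.example/1')
    q.push('http://a.example/2')
    q.push('http://b.example/1')

    q.shift  # => "http://a.example/1"
    q.shift  # => "http://b.example/1"  (rotates to the next host)
    q.shift  # => "http://a.example/2"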
data/lib/rider/queue.rb
ADDED
@@ -0,0 +1,40 @@
+module Rider
+  class Queue
+    attr_reader :filename
+    def initialize(filename)
+      raise(ArgumentError, "queues must have a filename") if !filename or filename.empty?
+      @filename = filename
+    end
+
+    def push(item)
+      Rider.log.debug("Q #{filename} PUSH #{item}")
+      File.open(filename, "a") do |file|
+        file.puts(item)
+      end
+      return true
+    end
+
+    def shift
+      if empty?
+        Rider.log.debug("Q #{filename} SHIFT nil")
+        return nil
+      end
+      lines = File.readlines(filename)
+      item = lines.shift.strip
+      File.open(filename, "w") do |file|
+        file.write(lines.join)
+      end
+      Rider.log.debug("Q #{filename} SHIFT #{item}")
+      return item
+    end
+
+    def clear
+      File.unlink(filename) if File.exist?(filename)
+      return true
+    end
+
+    def empty?
+      !File.exist?(filename) or File.read(filename) == ""
+    end
+  end
+end
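Because the queue is just a text file with one item per line, its contents survive process restarts. A round-trip sketch (tmp/demo.q is hypothetical, and the tmp/ directory must already exist; the class does not create it):

    require 'lib/rider'

    q = Rider::Queue.new('tmp/demo.q')
    q.push('http://example.com/a')
    q.push('http://example.com/b')

    q.shift   # => "http://example.com/a"  (FIFO: first line in is first out)
    q.clear   # deletes tmp/demo.q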
data/spec/crawler_spec.rb
ADDED
@@ -0,0 +1,94 @@
+require 'spec/spec_helper'
+
+describe Rider::Crawler do
+  before do
+    @queue = Rider::Queue.new('web')
+    @crawler = Rider::Crawler.new(/http:\/\/localhost/, @queue)
+  end
+
+  describe "when checking URLs against mask" do
+    it "should return true for a URL that matches the mask" do
+      @crawler.match_mask?("http://localhost/some/path").should == true
+    end
+
+    it "should return false for a URL that does not match the mask" do
+      @crawler.match_mask?("http://example.com/some/path").should == false
+    end
+  end
+
+  describe "when checking URL validity" do
+    before do
+      @urls = %w(http://example.com/invalid http://localhost/valid http://localhost/valid/unseen)
+    end
+
+    it "should return URLs matching the mask" do
+      @urls.select { |url| @crawler.valid_url?(url) }.should == ["http://localhost/valid", "http://localhost/valid/unseen"]
+    end
+
+    it "should return only unseen URLs" do
+      @crawler.saw_url('http://localhost/valid')
+      @urls.select { |url| @crawler.valid_url?(url) }.should == ['http://localhost/valid/unseen']
+    end
+  end
+
+  describe "when determining URLs to follow" do
+    it "should follow URLs that match the mask" do
+      @crawler.follow_url?('http://localhost/abc').should == true
+    end
+
+    it "should not follow URLs that don't match the mask" do
+      @crawler.follow_url?('http://invalid.com').should == false
+    end
+
+    it "should follow URLs that haven't been seen"
+    it "should not follow URLs that have been seen already"
+  end
+
+  describe "when getting the next document" do
+
+  end
+
+  describe "when getting documents" do
+    it "should raise an error for schemes other than http and file" do
+      lambda { @crawler.get('ftp://example.com') }.should raise_error(ArgumentError)
+    end
+
+    describe "when getting file:// documents" do
+      before do
+        @filename = File.expand_path(File.join(File.dirname(__FILE__), 'data', 'apples.html'))
+        @file_uri = 'file://' + @filename
+      end
+
+      it "should return an array whose first element is the uri" do
+        @crawler.get_file(@file_uri)[0].should == @file_uri
+      end
+
+      it "should return an array whose second element is blank metadata" do
+        @crawler.get_file(@file_uri)[1].should == {}
+      end
+
+      it "should return an array whose third element is the file contents" do
+        @crawler.get_file(@file_uri)[2].should == File.read(@filename)
+      end
+    end
+
+    describe "when getting http:// documents" do
+      before do
+        @doc_uri = 'http://localhost/simplewikipedia/articles/a/l/g/Algebra.html'
+      end
+
+      it "should return an array whose first element is the uri" do
+        @crawler.get_http(@doc_uri)[0].should == @doc_uri
+      end
+
+      it "should return an array whose second element is the response headers" do
+        meta = @crawler.get_http(@doc_uri)[1]
+        meta['Content-type'].should == 'text/html'
+      end
+
+      it "should return an array whose third element is the document contents" do
+        @crawler.get_http(@doc_uri)[2].match(/Algebra is taught in school/).should_not == nil
+      end
+    end
+  end
+end
data/spec/data/apples.html
ADDED
@@ -0,0 +1,23 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN"
+  "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">
+
+<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en">
+<head>
+  <title>Apples</title>
+
+</head>
+
+<body>
+
+<h1>Apples</h1>
+
+<p>
+  Some apples are <a href="colors.html">red</a>.
+  Some are <a href="colors.html">green</a>.
+  They <a href="prices.html">do not cost much money</a>.
+  You can <a href="http://buyapples.com/">buy apples</a>.
+</p>
+
+</body>
+</html>
data/spec/data/colors.html
ADDED
@@ -0,0 +1,24 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN"
+  "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">
+
+<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en">
+<head>
+  <title>Colors</title>
+
+</head>
+
+<body>
+
+<h1>Colors</h1>
+
+<dl>
+  <dt>Red</dt>
+  <dd><a href="apples.html">Apples</a></dd>
+
+  <dt>Green</dt>
+  <dd><a href="apples.html">Apples</a> or <a href="prices.html">dollars</a>.</dd>
+</dl>
+
+</body>
+</html>
data/spec/data/fruits.html
ADDED
@@ -0,0 +1,17 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN"
+  "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">
+
+<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en">
+<head>
+  <title>Fruits</title>
+
+</head>
+
+<body>
+
+<p><a href="apples.html">Apples</a> are a fruit.</a></p>
+
+
+</body>
+</html>
data/spec/data/notitle.html
ADDED
@@ -0,0 +1,14 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN"
+  "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">
+
+<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en">
+<head>
+  <title></title>
+
+</head>
+
+<body>
+
+</body>
+</html>
data/spec/data/prices.html
ADDED
@@ -0,0 +1,34 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN"
+  "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">
+
+<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en">
+<head>
+  <title>Prices</title>
+
+</head>
+
+<body>
+
+<h1>Prices</h1>
+
+<table>
+  <tr>
+    <th>Item</th>
+    <th>Price</th>
+  </tr>
+
+  <tr>
+    <td><a href="apples.html">Apples</a></td>
+    <td>$0.35</td>
+  </tr>
+
+  <tr>
+    <td><a href="colors.html">Colors</a></td>
+    <td>N/A</td>
+  </tr>
+</table>
+
+
+</body>
+</html>
data/spec/data/tiny.html
ADDED
@@ -0,0 +1 @@
+<html><head><title>asdf</title></head><body>asdf</body></html>
data/spec/part_queue_spec.rb
ADDED
@@ -0,0 +1,40 @@
+require 'spec/spec_helper'
+require 'spec/queue_spec'
+
+describe Rider::HostPartitionedQueue do
+  it_should_behave_like "queue"
+
+  before do
+    @q = Rider::HostPartitionedQueue.new('test')
+  end
+
+  it "should alternate among hosts when shifting" do
+    %w(http://example.com/path1 http://example.com/path2 http://example.net/ http://localhost/path).each { |u| @q.push(u) }
+    [@q.shift, @q.shift, @q.shift, @q.shift].should ==
+      %w(http://example.com/path1 http://example.net/ http://localhost/path http://example.com/path2)
+  end
+
+  it "should return the same host if only one distinct host exists" do
+    %w(http://example.com/path1 http://example.com/path2 http://example.com/path3).each { |u| @q.push(u) }
+    [@q.shift, @q.shift, @q.shift].should == %w(http://example.com/path1 http://example.com/path2 http://example.com/path3)
+  end
+
+  it "should be equal to another queue with the same objects and state" do
+    @q2 = Rider::HostPartitionedQueue.new('test2')
+    %w(http://example.com/path1 http://example.com/path2 http://example.net/ http://localhost/path).each { |u| @q.push(u) }
+    %w(http://example.com/path1 http://example.com/path2 http://example.net/ http://localhost/path).each { |u| @q2.push(u) }
+    @q.should == @q2
+  end
+
+  describe "when serializing" do
+    it "should write and read itself back" do
+      %w(http://example.com/path1 http://example.com/path2 http://example.net/ http://localhost/path).each { |u| @q.push(u) }
+      @q.serialize
+      Rider::HostPartitionedQueue.unserialize('test').should == @q
+    end
+
+    it "should return nil if asked to unserialize from a nonexistent file" do
+      Rider::HostPartitionedQueue.unserialize('nonexistent').should == nil
+    end
+  end
+end
data/spec/queue_spec.rb
ADDED
@@ -0,0 +1,43 @@
+require 'spec/spec_helper'
+
+shared_examples_for "queue" do
+  it "must not have a blank or nil name" do
+    lambda { Rider::Queue.new(nil) }.should raise_error(ArgumentError)
+    lambda { Rider::Queue.new('') }.should raise_error(ArgumentError)
+  end
+
+  it "should be empty after clearing" do
+    @q.clear
+    @q.empty?.should == true
+  end
+
+  it "should push then shift one item" do
+    @q.push('blue')
+    @q.shift.should == 'blue'
+  end
+
+  describe "when empty" do
+    it "should return nil if shifted" do
+      @q.shift.should == nil
+    end
+  end
+
+  it "should not clobber the queue upon initialization"
+end
+
+describe Rider::Queue do
+  before do
+    @q = Rider::Queue.new('tmp/colors.q')
+    @q.clear
+  end
+
+  after do
+    @q.clear
+  end
+
+  it "should push then shift multiple items" do
+    %w(red green orange).each { |color| @q.push(color) }
+    puts "POP x 3"
+    [@q.shift, @q.shift, @q.shift].should == %w(red green orange)
+  end
+end
data/spec/spec_helper.rb
ADDED
@@ -0,0 +1 @@
+require 'lib/rider'
data/tasks/deployment.rake
ADDED
@@ -0,0 +1,25 @@
+namespace "doc" do
+  desc "Generate RDoc docs"
+  task :generate do
+    # Using rake/rdoctask invoked old rdoc 1.x for some reason, but this invokes rdoc 2.x
+    sh "rdoc --all --title 'Rider - Ruby Web crawler' --line-numbers --inline-source --force-update --all --charset utf-8 --main README README lib/"
+  end
+
+  desc "Upload docs to site"
+  task :upload do
+    sh "tar czfv rider-rdoc.tgz doc/"
+    puts
+    puts "Going to upload..."
+    puts
+    sh "scp rider-rdoc.tgz cardinal.stanford.edu:WWW/rider/"
+    sh "ssh cardinal.stanford.edu 'cd WWW/rider;tar xzfv rider-rdoc.tgz'"
+    sh "rm rider-rdoc.tgz"
+    puts
+    puts "Upload complete"
+  end
+
+  desc "Generate & upload"
+  task :update=>[:generate, :upload]
+end
+
+
data/tasks/rspec.rake
ADDED
metadata
ADDED
@@ -0,0 +1,95 @@
+--- !ruby/object:Gem::Specification
+name: rider
+version: !ruby/object:Gem::Version
+  version: "0.2"
+platform: ruby
+authors:
+- Quinn Slack
+autorequire:
+bindir: bin
+cert_chain: []
+
+date: 2009-10-07 00:00:00 -03:00
+default_executable:
+dependencies:
+- !ruby/object:Gem::Dependency
+  name: hpricot
+  type: :runtime
+  version_requirement:
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: 0.8.1
+    version:
+- !ruby/object:Gem::Dependency
+  name: mechanize
+  type: :runtime
+  version_requirement:
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: 0.9.3
+    version:
+description: Ruby Web crawler
+email: me@rafaelss.com
+executables: []
+
+extensions: []
+
+extra_rdoc_files: []
+
+files:
+- bin/crawl
+- lib/rider/crawler.rb
+- lib/rider/part_queue.rb
+- lib/rider/queue.rb
+- lib/rider.rb
+- Rakefile
+- README
+- spec/crawler_spec.rb
+- spec/data/apples.html
+- spec/data/colors.html
+- spec/data/fruits.html
+- spec/data/notitle.html
+- spec/data/prices.html
+- spec/data/tiny.html
+- spec/part_queue_spec.rb
+- spec/queue_spec.rb
+- spec/spec_helper.rb
+- tasks/deployment.rake
+- tasks/environment.rake
+- tasks/rspec.rake
+has_rdoc: true
+homepage: http://qslack.com/
+licenses: []
+
+post_install_message:
+rdoc_options: []
+
+require_paths:
+- lib
+required_ruby_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      version: "0"
+  version:
+required_rubygems_version: !ruby/object:Gem::Requirement
+  requirements:
+  - - ">="
+    - !ruby/object:Gem::Version
+      version: "0"
+  version:
+requirements: []
+
+rubyforge_project:
+rubygems_version: 1.3.5
+signing_key:
+specification_version: 3
+summary: Ruby Web crawler
+test_files:
+- spec/crawler_spec.rb
+- spec/part_queue_spec.rb
+- spec/queue_spec.rb