fizx-rwget 0.5.0
- data/.document +5 -0
- data/.gitignore +5 -0
- data/README.markdown +57 -0
- data/Rakefile +59 -0
- data/VERSION +1 -0
- data/bin/rwget +15 -0
- data/lib/rwget.rb +5 -0
- data/lib/rwget/controller.rb +119 -0
- data/lib/rwget/dupes.rb +19 -0
- data/lib/rwget/fetch.rb +44 -0
- data/lib/rwget/links.rb +33 -0
- data/lib/rwget/queue.rb +32 -0
- data/lib/rwget/rwget_option_parser.rb +158 -0
- data/lib/rwget/sitemap_links.rb +27 -0
- data/lib/rwget/store.rb +17 -0
- data/test/controller_test.rb +37 -0
- data/test/dupes_test.rb +18 -0
- data/test/fetch_test.rb +19 -0
- data/test/fixtures/events00.xml.gz +0 -0
- data/test/fixtures/sitemap_index.xml +79 -0
- data/test/fixtures/yelp.html +2329 -0
- data/test/links_test.rb +22 -0
- data/test/queue_test.rb +14 -0
- data/test/server.rb +28 -0
- data/test/sitemap_links_test.rb +18 -0
- data/test/store_test.rb +28 -0
- metadata +137 -0
data/.document
ADDED
data/README.markdown
ADDED
@@ -0,0 +1,57 @@
# RWGet

RWget is a web crawler that tries to emulate a subset of the interface of GNU Wget, but with more flexibility for my needs.

## Features

1. Regular expression accept/reject lists
2. Pluggable interfaces for robots.txt handling, url-fetcher, url-queue, url-dupe-detector, and page-storage. The defaults store locally and fetch using libcurl, but you could easily switch to DB storage, a distributed queue, etc. (a sketch of a custom store follows this README).

## Help page

    Usage: /usr/bin/rwget [options] SEED_URL [SEED_URL2 ...]
        -w, --wait=SECONDS               wait SECONDS between retrievals.
        -P, --directory-prefix=PREFIX    save files to PREFIX/...
        -U, --user-agent=AGENT           identify as AGENT instead of RWget/VERSION.
        -A, --accept-pattern=RUBY_REGEX  URLs must match RUBY_REGEX to be saved to the queue.
            --time-limit=AMOUNT          Crawler will stop after this AMOUNT of time has passed.
        -R, --reject-pattern=RUBY_REGEX  URLs must NOT match RUBY_REGEX to be saved to the queue.
            --require=RUBY_SCRIPT        Will execute 'require RUBY_SCRIPT'
            --limit-rate=RATE            limit download rate to RATE.
            --http-proxy=URL             Proxies via URL
            --proxy-user=USER            Sets proxy user to USER
            --proxy-password=PASSWORD    Sets proxy password to PASSWORD
            --fetch-class=RUBY_CLASS     Must implement fetch(uri, user_agent_string) #=> [final_redirected_url, file_object]
            --store-class=RUBY_CLASS     Must implement put(key_string, temp_file)
            --dupes-class=RUBY_CLASS     Must implement dupe?(uri)
            --queue-class=RUBY_CLASS     Must implement put(key_string, depth_int) and get() #=> [key_string, depth_int]
            --links-class=RUBY_CLASS     Must implement urls(base_uri, temp_file) #=> [uri, ...]
        -S, --sitemap=URL                URL of a sitemap to crawl (will ignore inter-page links)
        -Q, --quota=NUMBER               set retrieval quota to NUMBER.
            --max-redirect=NUM           maximum redirections allowed per page.
        -H, --span-hosts                 go to foreign hosts when recursive
            --connect-timeout=SECS       set the connect timeout to SECS.
        -T, --timeout=SECS               set all timeout values to SECS.
        -l, --level=NUMBER               maximum recursion depth (inf or 0 for infinite).
            --[no-]timestampize          Prepend the timestamp of when the crawl started to the directory structure.
            --incremental-from=PREVIOUS  Build upon the indexing already saved in PREVIOUS.
            --protocol-directories       use protocol name in directories.
            --no-host-directories        don't create host directories.
        -v, --[no-]verbose               Run verbosely
        -h, --help                       Show this message

## Ruby API

    require "rubygems"
    require "rwget"

    # options is a Hash; its keys are the command-line long options,
    # converted into idiomatic ruby. See the RDoc for details, e.g.:
    #
    #   sh$  rwget -T 5 -A ".*foo.*" http://google.com
    #
    # becomes:
    #
    #   irb$ RWGet::Controller.new({:seeds => ["http://google.com"],
    #                               :timeout => 5,
    #                               :accept_patterns => [/.*foo.*/]}).start

    RWGet::Controller.new(options).start
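
The pluggable interfaces above are resolved by class name at startup, so replacing a component means writing one small class. As a minimal sketch of the put(key_string, temp_file) contract from the help page, here is a hypothetical gzip-compressing store (MyStore and my_store.rb are illustrative, not part of the gem):

    # my_store.rb -- hypothetical example of a pluggable page store.
    # Usage sketch: rwget --require my_store.rb --store-class MyStore SEED_URL
    require "zlib"
    require "fileutils"

    class MyStore
      def initialize(options = {})
        @prefix = options[:prefix] || "."
      end

      # The --store-class contract: put(key_string, temp_file).
      def put(key, tmpfile)
        path = File.join(@prefix, key + ".gz")
        FileUtils.mkdir_p(File.dirname(path))
        Zlib::GzipWriter.open(path) { |gz| gz.write(File.read(tmpfile.path)) }
      end
    end
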
data/Rakefile
ADDED
@@ -0,0 +1,59 @@
require 'rubygems'
require 'rake'

begin
  require 'jeweler'
  Jeweler::Tasks.new do |gem|
    gem.name = "rwget"
    gem.summary = %Q{Ruby port of wget, emphasis on recursive/crawler}
    gem.email = "kyle@kylemaxwell.com"
    gem.homepage = "http://github.com/fizx/rwget"
    gem.authors = ["Kyle Maxwell"]
    gem.add_dependency("curb", ["> 0.0.0"])
    gem.add_dependency("hpricot", ["> 0.0.0", "< 0.7"])
    gem.add_dependency("fizx-robots", [">= 0.3.1"])
    gem.add_dependency("bloomfilter", ["> 0.0.0"])
    gem.add_dependency("libxml-ruby", ["> 0.9"])
  end
rescue LoadError
  puts "Jeweler (or a dependency) not available. Install it with: sudo gem install jeweler"
end

require 'rake/testtask'
Rake::TestTask.new(:test) do |test|
  test.libs << 'lib' << 'test'
  test.pattern = 'test/**/*_test.rb'
  test.verbose = true
end

begin
  require 'rcov/rcovtask'
  Rcov::RcovTask.new do |test|
    test.libs << 'test'
    test.pattern = 'test/**/*_test.rb'
    test.verbose = true
  end
rescue LoadError
  task :rcov do
    abort "RCov is not available. In order to run rcov, you must: sudo gem install spicycode-rcov"
  end
end

task :default => :test

require 'rake/rdoctask'
Rake::RDocTask.new do |rdoc|
  if File.exist?('VERSION.yml')
    config = YAML.load(File.read('VERSION.yml'))
    version = "#{config[:major]}.#{config[:minor]}.#{config[:patch]}"
  else
    version = ""
  end

  rdoc.rdoc_dir = 'rdoc'
  rdoc.title = "rwget #{version}"
  rdoc.rdoc_files.include('README*')
  rdoc.rdoc_files.include('lib/**/*.rb')
end

data/VERSION
ADDED
@@ -0,0 +1 @@
0.5.0

data/bin/rwget
ADDED
@@ -0,0 +1,15 @@
#!/usr/bin/env ruby
require File.dirname(__FILE__) + "/../lib/rwget"

parser = RWGetOptionParser.new
parser.parse!

if parser.options[:seeds].empty?
  puts parser.usage
  puts " -h for options listing"
  exit(1)
end

controller = RWGet::Controller.new(parser.options)
controller.start
controller.close
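
For instance, a small, polite crawl using only options documented on the help page might look like the line below; note that the quota counts downloaded bytes (the controller sums File.size per fetched page), so 1m means roughly one megabyte:

    rwget --wait=1 --quota=1m --level=2 --directory-prefix=crawl http://example.com/
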
data/lib/rwget/controller.rb
ADDED
@@ -0,0 +1,119 @@
require "set"
class RWGet::Controller
  attr_reader :options

  def self.resolve_class(string)
    string.split("::").inject(Kernel) do |const, string|
      const.const_get(string)
    end
  end

  def initialize(options)
    @options = options
    @options[:user_agent] ||= "Ruby/Wget"

    @options[:accept_patterns] ||= []
    @options[:reject_patterns] ||= []

    %w[quota depth wait limit_rate time_limit].each do |key|
      key = key.to_sym
      @options[key] = @options[key].to_i
    end

    @queue = (options[:queue_class] ? self.class.resolve_class(options[:queue_class]) : RWGet::Queue).new(options)
    @fetch = (options[:fetch_class] ? self.class.resolve_class(options[:fetch_class]) : RWGet::Fetch).new(options)
    @store = (options[:store_class] ? self.class.resolve_class(options[:store_class]) : RWGet::Store).new(options)
    @links = (options[:links_class] ? self.class.resolve_class(options[:links_class]) : RWGet::Links).new(options)
    @dupes = (options[:dupes_class] ? self.class.resolve_class(options[:dupes_class]) : RWGet::Dupes).new(options)
  end

  def start
    @start_time = Time.now.to_i.to_s
    @start = Time.now
    @original_hosts = Set.new
    options[:seeds].each do |seed|
      @queue.put(seed, 0)
      @original_hosts << URI.parse(seed).host
    end

    downloaded = 0
    while (options[:quota] == 0 || downloaded < options[:quota]) &&
          (options[:time_limit] == 0 || Time.now - @start < options[:time_limit])

      url, depth = @queue.get

      unless url
        puts "no more urls"
        return
      end

      if options[:depth] > 0 && depth > options[:depth]
        next
      end

      uri = URI.parse(url)

      while options[:limit_rate] > 0 && downloaded / (Time.now - @start) > options[:limit_rate]
        puts "sleeping until under rate limit"
        sleep 1
      end
      puts "download rate: #{downloaded / (Time.now - @start)}bps"

      puts "downloading #{uri}"
      effective_url, tmpfile = @fetch.fetch(uri, options[:user_agent])

      if tmpfile
        downloaded += File.size(tmpfile.path)
        puts "parsing links"
        @links.urls(effective_url, tmpfile).each do |link|
          legal = legal?(link)
          dupe = @dupes.dupe?(link)
          puts "dupe: #{link}" if dupe
          if legal && !dupe
            puts "adding link: #{link}"
            @queue.put(link, depth + 1)
          end
        end
        key = key_for(uri)
        puts "storing at #{key}"
        @store.put(key, tmpfile)
        sleep options[:wait]
      else
        puts "unable to download"
      end
    end
    puts "hit time/quota"
  end

  def legal?(link)
    unless options[:span_hosts] || @original_hosts.include?(link.host)
      puts "can't span hosts: #{link}"
      return false
    end
    link = link.to_s
    legal = options[:accept_patterns].empty?
    puts "accepted by default: #{link}" if legal
    legal ||= options[:accept_patterns].any?{|p| link =~ p}
    puts "not in accept patterns: #{link}" if !legal
    rejected = options[:reject_patterns].any?{|p| link =~ p}
    puts "in reject patterns: #{link}" if rejected
    legal && !rejected
  end

  def key_for(uri)
    arr = []
    arr << options[:prefix] if options[:prefix]
    arr << @start_time if options[:timestampize]
    arr << uri.scheme if options[:protocol_directories]
    arr << uri.host unless options[:no_host_directories]
    paths = uri.path.split("/")
    paths.shift if paths.first.to_s.empty?
    File.join(arr + paths)
  end

  def close
    [@queue, @fetch, @store, @links, @dupes].each do |obj|
      obj.close if obj.respond_to?(:close)
    end
  end
end
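
Because bin/rwget is only a thin wrapper around this class, driving the controller from Ruby is the same three calls. A minimal sketch following the README's option mapping (the URL and values are illustrative):

    require "rubygems"
    require "rwget"

    # Roughly equivalent to: rwget --quota=1m --span-hosts --wait=1 http://example.com/
    controller = RWGet::Controller.new(:seeds      => ["http://example.com/"],
                                       :quota      => 1_000_000,
                                       :span_hosts => true,
                                       :wait       => 1)
    controller.start   # crawls until the quota is hit or the frontier is exhausted
    controller.close   # lets each pluggable component release its resources
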
data/lib/rwget/dupes.rb
ADDED
@@ -0,0 +1,19 @@
require "rubygems"
require "tempfile"
require "bloomfilter"

class RWGet::Dupes
  SIZE = 1_000_000

  def initialize(options = {})
    @tmp = Tempfile.new("bloom")
    @bloom = ExternalBloomFilter.create(@tmp.path, SIZE)
  end

  def dupe?(uri)
    key = uri.to_s
    return true if @bloom.include?(key)
    @bloom.add(key)
    return false
  end
end
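
A Bloom filter keeps the seen-set's memory bounded at the cost of occasional false positives: as it fills, a never-visited URL can be misreported as a dupe and silently skipped. Where exactness matters more than memory, a drop-in replacement for the dupe?(uri) contract could be as simple as this hypothetical class:

    require "set"

    # Exact but memory-unbounded duplicate detector.
    # Usage sketch: rwget --require set_dupes.rb --dupes-class SetDupes ...
    class SetDupes
      def initialize(options = {})
        @seen = Set.new
      end

      # Set#add? returns nil when the key was already present, so this
      # returns true for dupes and records new URLs as a side effect.
      def dupe?(uri)
        !@seen.add?(uri.to_s)
      end
    end
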
data/lib/rwget/fetch.rb
ADDED
@@ -0,0 +1,44 @@
require "open-uri"
require "tempfile"
require "rubygems"
require "robots"
require "curl"
class RWGet::Fetch
  DEFAULT_TIMEOUT = 30
  DEFAULT_REDIRECTS = 30

  def initialize(options = {})
    @robots = {}
    @curl = Curl::Easy.new
    @curl.connect_timeout = options[:connect_timeout] || DEFAULT_TIMEOUT
    @curl.timeout = options[:timeout] || DEFAULT_TIMEOUT
    @curl.max_redirects = options[:max_redirect] || DEFAULT_REDIRECTS
    @curl.follow_location = true
    if options[:http_proxy]
      @curl.proxy_url = options[:http_proxy]
      if options[:proxy_user]
        @curl.proxypwd = "#{options[:proxy_user]}:#{options[:proxy_password]}"
      end
    end
    puts "timeout: #{@curl.timeout}"
  end

  def fetch(uri, user_agent)
    @robots[user_agent] ||= Robots.new(user_agent)
    unless @robots[user_agent].allowed?(uri)
      puts "disallowed by robots.txt"
      return nil
    end

    @curl.headers["User-Agent"] = user_agent
    @curl.url = uri.to_s
    @curl.perform
    tmp = nil
    Tempfile.open("curl") {|file| file.print(@curl.body_str); tmp = file }
    tmp.open
    [@curl.last_effective_url, tmp]
  rescue Exception => e
    STDERR.puts "#{uri} not retrieved: #{e.message}"
    nil
  end
end
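
The fetcher is swappable too. Here is a hedged Net::HTTP sketch of the fetch(uri, user_agent_string) #=> [final_redirected_url, file_object] contract; unlike RWGet::Fetch it skips robots.txt, follows at most one redirect, and speaks plain HTTP only (SimpleFetch is hypothetical, not part of the gem):

    require "net/http"
    require "tempfile"
    require "uri"

    # Usage sketch: rwget --require simple_fetch.rb --fetch-class SimpleFetch ...
    class SimpleFetch
      def initialize(options = {})
      end

      def fetch(uri, user_agent)
        get = lambda do |u|
          Net::HTTP.start(u.host, u.port) do |http|
            http.get(u.request_uri, "User-Agent" => user_agent)
          end
        end
        response = get.call(uri)
        if response.is_a?(Net::HTTPRedirection)   # follow a single hop
          uri = URI.parse(response["location"])
          response = get.call(uri)
        end
        tmp = Tempfile.new("fetch")
        tmp.write(response.body)
        tmp.rewind
        [uri.to_s, tmp]                           # same shape as RWGet::Fetch
      rescue Exception => e
        STDERR.puts "#{uri} not retrieved: #{e.message}"
        nil
      end
    end
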
data/lib/rwget/links.rb
ADDED
@@ -0,0 +1,33 @@
require "rubygems"
require "hpricot"
class RWGet::Links
  def initialize(options = {})
  end

  def urls(base, tmpfile)
    @urls = []
    base = base.to_s
    string = File.read(tmpfile.path)
    xml = string =~ /<\?xml/
    doc = xml ? Hpricot.XML(string) : Hpricot(string)

    (doc / "//item/link").each do |l|
      add base, l.inner_text
    end
    (doc / "a").each do |a|
      add base, a.attributes["href"]
    end
    @urls
  rescue Exception => e
    STDERR.puts "Couldn't parse #{base} for links: #{e.message}"
    []
  end

  def add(base, href)
    begin
      @urls << URI.join(base, href.strip) if href
    rescue Exception => e
      STDERR.puts "url error parsing URI.join(#{base.inspect}, #{href.inspect}): #{e.message}"
    end
  end
end
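
In other words, urls handles both feed-style <item><link> elements and HTML anchors, resolving every href against the page's final URL. A quick hypothetical check (assuming the hpricot gem is installed):

    require "rubygems"
    require "rwget"
    require "tempfile"

    page = Tempfile.new("page")
    page.write('<html><body><a href="/about">About</a></body></html>')
    page.flush

    links = RWGet::Links.new.urls("http://example.com/", page)
    # links is [URI.parse("http://example.com/about")]
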
data/lib/rwget/queue.rb
ADDED
@@ -0,0 +1,32 @@
require "tempfile"
class RWGet::Queue
  def initialize(options = {})
    @writer = Tempfile.new("rwget-queue")
    @reader = File.open(@writer.path, "r")
    @dirty = false
  end

  def put(key, depth)
    @writer.puts "#{key}\t#{depth}"
    @dirty = true
  end

  def get(retrying = false)
    sleep 0.1 if retrying
    if @dirty
      @writer.flush
      @dirty = false
    end
    line = @reader.gets
    unless line
      return retrying ? nil : get(:retry)
    end
    key, depth = line.split("\t")
    return [key, depth.to_i]
  end

  def close
    @writer.close
    @reader.close
  end
end
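
The default queue is a disk-backed FIFO: put appends tab-separated lines to a Tempfile and get reads them back, so the crawl frontier never has to fit in memory. For small crawls, an in-memory class honoring the same put/get contract could be just (hypothetical, not part of the gem):

    # Usage sketch: rwget --require memory_queue.rb --queue-class MemoryQueue ...
    class MemoryQueue
      def initialize(options = {})
        @items = []
      end

      def put(key, depth)
        @items << [key, depth]
      end

      # Returns [key_string, depth_int], or nil when the frontier is empty
      # (the controller treats nil as "no more urls").
      def get
        @items.shift
      end
    end
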
data/lib/rwget/rwget_option_parser.rb
ADDED
@@ -0,0 +1,158 @@
require 'optparse'

class RWGetOptionParser < OptionParser
  attr_accessor :options

  def usage
    "Usage: #{$0} [options] SEED_URL [SEED_URL2 ...]"
  end

  def parse!
    super
    options[:seeds] = ARGV
  end

  def initialize
    self.options = {}
    super do |opts|

      yield opts if block_given?

      opts.banner = usage

      opts.on("-w", "--wait=SECONDS", "wait SECONDS between retrievals.") do |w|
        options[:wait] = w.to_i
      end

      opts.on("-P", "--directory-prefix=PREFIX", "save files to PREFIX/...") do |p|
        options[:prefix] = p
      end

      opts.on("-U", "--user-agent=AGENT", "identify as AGENT instead of RWget/VERSION.") do |u|
        options[:user_agent] = u
      end

      opts.on("-Ap", "--accept-pattern=RUBY_REGEX", "URLs must match RUBY_REGEX to be saved to the queue.") do |r|
        options[:accept_patterns] ||= []
        options[:accept_patterns] << Regexp.new(r)
      end

      opts.on("--time-limit=AMOUNT", "Crawler will stop after this AMOUNT of time has passed.") do |t|
        options[:time_limit] = t.to_i
        options[:time_limit] *= 60 if t =~ /m/i
        options[:time_limit] *= 60 * 60 if t =~ /h/i
        options[:time_limit] *= 60 * 60 * 24 if t =~ /d/i
        options[:time_limit] *= 60 * 60 * 24 * 7 if t =~ /w/i
      end

      opts.on("-Rp", "--reject-pattern=RUBY_REGEX", "URLs must NOT match RUBY_REGEX to be saved to the queue.") do |r|
        options[:reject_patterns] ||= []
        options[:reject_patterns] << Regexp.new(r)
      end

      opts.on("--require=RUBY_SCRIPT", "Will execute 'require RUBY_SCRIPT'") do |s|
        require s
      end

      opts.on("--limit-rate=RATE", "limit download rate to RATE.") do |r|
        rate = r.to_i
        rate *= 1000 if r =~ /k/i
        rate *= 1000000 if r =~ /m/i
        options[:limit_rate] = rate
        puts "rate is #{rate}"
      end

      opts.on("--http-proxy=URL", "Proxies via URL") do |u|
        options[:http_proxy] = u
      end

      opts.on("--proxy-user=USER", "Sets proxy user to USER") do |u|
        options[:proxy_user] = u
      end

      opts.on("--proxy-password=PASSWORD", "Sets proxy password to PASSWORD") do |p|
        options[:proxy_password] = p
      end

      opts.on("--fetch-class=RUBY_CLASS", "Must implement fetch(uri, user_agent_string) #=> [final_redirected_url, file_object]") do |c|
        options[:fetch_class] = c
      end

      opts.on("--store-class=RUBY_CLASS", "Must implement put(key_string, temp_file)") do |c|
        options[:store_class] = c
      end

      opts.on("--dupes-class=RUBY_CLASS", "Must implement dupe?(uri)") do |c|
        options[:dupes_class] = c
      end

      opts.on("--queue-class=RUBY_CLASS", "Must implement put(key_string, depth_int) and get() #=> [key_string, depth_int]") do |c|
        options[:queue_class] = c
      end

      opts.on("--links-class=RUBY_CLASS", "Must implement urls(base_uri, temp_file) #=> [uri, ...]") do |c|
        options[:links_class] = c
      end

      opts.on("-S", "--sitemap=URL", "URL of a sitemap to crawl (will ignore inter-page links)") do |url|
        options[:seeds] << url
        options[:links_class] = "RWGet::SitemapLinks"
      end

      opts.on("-Q", "--quota=NUMBER", "set retrieval quota to NUMBER.") do |q|
        options[:quota] = q.to_i
        options[:quota] *= 1000 if q =~ /k/i
        options[:quota] *= 1000000 if q =~ /m/i
      end

      opts.on("--max-redirect=NUM", "maximum redirections allowed per page.") do |m|
        options[:max_redirect] = m.to_i
      end

      opts.on("-H", "--span-hosts", "go to foreign hosts when recursive") do |s|
        options[:span_hosts] = s
      end

      opts.on("--connect-timeout=SECS", "set the connect timeout to SECS.") do |t|
        options[:connect_timeout] = t.to_i
      end

      opts.on("-T", "--timeout=SECS", "set all timeout values to SECS.") do |t|
        options[:timeout] = t.to_i
      end

      opts.on("-l", "--level=NUMBER", "maximum recursion depth (inf or 0 for infinite).") do |l|
        options[:depth] = l.to_i
      end

      opts.on("--[no-]timestampize", "Prepend the timestamp of when the crawl started to the directory structure.") do |t|
        options[:timestampize] = t
      end

      opts.on("--incremental-from=PREVIOUS", "Build upon the indexing already saved in PREVIOUS.") do |r|
        options[:incremental_from] = r
      end

      opts.on("--protocol-directories", "use protocol name in directories.") do |p|
        options[:protocol_directories] = p
      end

      opts.on("--no-host-directories", "don't create host directories.") do |h|
        options[:no_host_directories] = h
      end

      opts.on("-v", "--[no-]verbose", "Run verbosely") do |v|
        options[:verbose] = v
      end

      opts.on_tail("-h", "--help", "Show this message") do
        puts opts
        exit
      end
    end
  end
end
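
The unit suffixes work by plain arithmetic: String#to_i reads the leading digits, and each matching suffix regex scales the result. For example:

    "2h".to_i * 60 * 60    #=> 7200        (--time-limit=2h, in seconds)
    "500k".to_i * 1000     #=> 500_000     (--limit-rate=500k)
    "10m".to_i * 1000000   #=> 10_000_000  (--quota=10m, in bytes)
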