rwget 0.0.0 → 0.5.3
This diff shows the contents of publicly released package versions as they appear in their public registries, and is provided for informational purposes only.
- data/.document +5 -0
- data/.gitignore +5 -0
- data/README.markdown +57 -0
- data/Rakefile +60 -0
- data/VERSION +1 -0
- data/bin/rwget +15 -0
- data/lib/rwget.rb +5 -0
- data/lib/rwget/controller.rb +121 -0
- data/lib/rwget/dupes.rb +18 -0
- data/lib/rwget/fetch.rb +44 -0
- data/lib/rwget/links.rb +33 -0
- data/lib/rwget/queue.rb +32 -0
- data/lib/rwget/rwget_option_parser.rb +165 -0
- data/lib/rwget/sitemap_links.rb +27 -0
- data/lib/rwget/store.rb +25 -0
- data/rwget.gemspec +89 -0
- data/test/controller_test.rb +37 -0
- data/test/dupes_test.rb +18 -0
- data/test/fetch_test.rb +19 -0
- data/test/fixtures/events00.xml.gz +0 -0
- data/test/fixtures/sitemap_index.xml +79 -0
- data/test/fixtures/yelp.html +2329 -0
- data/test/links_test.rb +22 -0
- data/test/queue_test.rb +14 -0
- data/test/server.rb +28 -0
- data/test/sitemap_links_test.rb +18 -0
- data/test/store_test.rb +28 -0
- metadata +101 -19
data/.document
ADDED
data/README.markdown
ADDED
@@ -0,0 +1,57 @@
+# RWGet
+
+RWget is a web crawler that emulates a subset of the GNU Wget interface, with more flexibility where I needed it.
+
+## Features
+
+1. Regular expression accept/reject lists
+2. Pluggable interfaces for robots.txt handling, URL fetching, URL queueing, duplicate detection, and page storage. The defaults fetch using libcurl and store locally, but you could easily switch to database storage, a distributed queue, etc.
+
+## Help page
+
+    Usage: /usr/bin/rwget [options] SEED_URL [SEED_URL2 ...]
+        -w, --wait=SECONDS               wait SECONDS between retrievals.
+        -P, --directory-prefix=PREFIX    save files to PREFIX/...
+        -U, --user-agent=AGENT           identify as AGENT instead of RWget/VERSION.
+        -A, --accept-pattern=RUBY_REGEX  URLs must match RUBY_REGEX to be saved to the queue.
+            --time-limit=AMOUNT          Crawler will stop after this AMOUNT of time has passed.
+        -R, --reject-pattern=RUBY_REGEX  URLs must NOT match RUBY_REGEX to be saved to the queue.
+            --require=RUBY_SCRIPT        Will execute 'require RUBY_SCRIPT'
+            --limit-rate=RATE            limit download rate to RATE.
+            --http-proxy=URL             Proxies via URL
+            --proxy-user=USER            Sets proxy user to USER
+            --proxy-password=PASSWORD    Sets proxy password to PASSWORD
+            --fetch-class=RUBY_CLASS     Must implement fetch(uri, user_agent_string) #=> [final_redirected_url, file_object]
+            --store-class=RUBY_CLASS     Must implement put(key_string, temp_file)
+            --dupes-class=RUBY_CLASS     Must implement dupe?(uri)
+            --queue-class=RUBY_CLASS     Must implement put(key_string, depth_int) and get() #=> [key_string, depth_int]
+            --links-class=RUBY_CLASS     Must implement urls(base_uri, temp_file) #=> [uri, ...]
+        -S, --sitemap=URL                URL of a sitemap to crawl (will ignore inter-page links)
+
+        -Q, --quota=NUMBER               set retrieval quota to NUMBER.
+            --max-redirect=NUM           maximum redirections allowed per page.
+        -H, --span-hosts                 go to foreign hosts when recursive
+            --connect-timeout=SECS       set the connect timeout to SECS.
+        -T, --timeout=SECS               set all timeout values to SECONDS.
+        -l, --level=NUMBER               maximum recursion depth (inf or 0 for infinite).
+            --[no-]timestampize          Prepend the timestamp of when the crawl started to the directory structure.
+            --incremental-from=PREVIOUS  Build upon the indexing already saved in PREVIOUS.
+            --protocol-directories       use protocol name in directories.
+            --no-host-directories        don't create host directories.
+        -v, --[no-]verbose               Run verbosely
+        -h, --help                       Show this message
+
+## Ruby API
+
+    require "rubygems"
+    require "rwget"
+
+    # options is the same as the command-line long options, but converted into
+    # idiomatic ruby. See the RDoc for details.
+    # i.e.
+    # sh$ rwget -T 5 -A ".*foo.*" http://google.com
+    # becomes:
+    # irb$ RWGet::Controller.new({:seeds => ["http://google.com"],
+    #      :timeout => 5, :accept_patterns => [/.*foo.*/]}).start
+
+    RWGet::Controller.new(options).start
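
A note on the pluggable interfaces listed in the README above: each replacement class only has to honor the duck-typed contract shown in the help page. As a minimal sketch of a custom store (the class name GzipStore and its gzip-to-disk behavior are illustrative assumptions, not part of the gem), something like this could be loaded with --require and selected with --store-class=GzipStore:

    require "fileutils"
    require "zlib"

    class GzipStore
      # The controller passes its full options hash to every pluggable class.
      def initialize(options = {})
        @root = options[:prefix] || "crawl"
      end

      # Contract from the help page: put(key_string, temp_file).
      # Stores the fetched page under @root, gzip-compressed.
      def put(key, tmpfile)
        path = File.join(@root, "#{key}.gz")
        FileUtils.mkdir_p(File.dirname(path))
        Zlib::GzipWriter.open(path) { |gz| gz.write(File.read(tmpfile.path)) }
      end
    end
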
data/Rakefile
ADDED
@@ -0,0 +1,60 @@
+require 'rubygems'
+require 'rake'
+
+begin
+  require 'jeweler'
+  Jeweler::Tasks.new do |gem|
+    gem.name = "rwget"
+    gem.summary = %Q{Ruby port of wget, emphasis on recursive/crawler}
+    gem.email = "kyle@kylemaxwell.com"
+    gem.homepage = "http://github.com/fizx/rwget"
+    gem.authors = ["Kyle Maxwell"]
+    gem.add_dependency("curb", ["> 0.0.0"])
+    gem.add_dependency("hpricot", ["> 0.0.0"])
+    gem.add_dependency("fizx-robots", [">= 0.3.1"])
+    gem.add_dependency("igrigorik-bloomfilter", ["> 0.0.0"])
+    gem.add_dependency("libxml-ruby", ["> 0.9"])
+  end
+  Jeweler::GemcutterTasks.new
+rescue LoadError
+  puts "Jeweler (or a dependency) not available. Install it with: sudo gem install jeweler"
+end
+
+require 'rake/testtask'
+Rake::TestTask.new(:test) do |test|
+  test.libs << 'lib' << 'test'
+  test.pattern = 'test/**/*_test.rb'
+  test.verbose = true
+end
+
+begin
+  require 'rcov/rcovtask'
+  Rcov::RcovTask.new do |test|
+    test.libs << 'test'
+    test.pattern = 'test/**/*_test.rb'
+    test.verbose = true
+  end
+rescue LoadError
+  task :rcov do
+    abort "RCov is not available. In order to run rcov, you must: sudo gem install spicycode-rcov"
+  end
+end
+
+
+task :default => :test
+
+require 'rake/rdoctask'
+Rake::RDocTask.new do |rdoc|
+  if File.exist?('VERSION.yml')
+    config = YAML.load(File.read('VERSION.yml'))
+    version = "#{config[:major]}.#{config[:minor]}.#{config[:patch]}"
+  else
+    version = ""
+  end
+
+  rdoc.rdoc_dir = 'rdoc'
+  rdoc.title = "rwget #{version}"
+  rdoc.rdoc_files.include('README*')
+  rdoc.rdoc_files.include('lib/**/*.rb')
+end
+
data/VERSION
ADDED
@@ -0,0 +1 @@
+0.5.3
data/bin/rwget
ADDED
@@ -0,0 +1,15 @@
+#!/usr/bin/env ruby
+require File.dirname(__FILE__) + "/../lib/rwget"
+
+parser = RWGetOptionParser.new
+parser.parse!
+
+if parser.options[:seeds].empty?
+  puts parser.usage
+  puts " -h for options listing"
+  exit(1)
+end
+
+controller = RWGet::Controller.new(parser.options)
+controller.start
+controller.close
data/lib/rwget/controller.rb
ADDED
@@ -0,0 +1,121 @@
+require "set"
+require "uri"
+class RWGet::Controller
+  attr_reader :options
+
+  # Resolve a class name like "RWGet::Queue" to the class object.
+  def self.resolve_class(string)
+    string.split("::").inject(Kernel) do |const, name|
+      const.const_get(name)
+    end
+  end
+
+  def initialize(options)
+    @options = options
+    @options[:user_agent] ||= "Ruby/Wget"
+
+    @options[:accept_patterns] ||= []
+    @options[:reject_patterns] ||= []
+
+    %w[quota depth wait limit_rate time_limit].each do |key|
+      key = key.to_sym
+      @options[key] = @options[key].to_i
+    end
+
+    @queue = (options[:queue_class] ? self.class.resolve_class(options[:queue_class]) : RWGet::Queue).new(options)
+    @fetch = (options[:fetch_class] ? self.class.resolve_class(options[:fetch_class]) : RWGet::Fetch).new(options)
+    @store = (options[:store_class] ? self.class.resolve_class(options[:store_class]) : RWGet::Store).new(options)
+    @links = (options[:links_class] ? self.class.resolve_class(options[:links_class]) : RWGet::Links).new(options)
+    @dupes = (options[:dupes_class] ? self.class.resolve_class(options[:dupes_class]) : RWGet::Dupes).new(options)
+  end
+
+  def start
+    @start_time = Time.now.to_i.to_s
+    @start = Time.now
+    @original_hosts = Set.new
+    options[:seeds].each do |seed|
+      @queue.put(seed, 0)
+      @original_hosts << URI.parse(seed).host
+    end
+
+    downloaded = 0
+    while (options[:quota] == 0 || downloaded < options[:quota]) &&
+          (options[:time_limit] == 0 || Time.now - @start < options[:time_limit])
+
+      url, depth = @queue.get
+
+      unless url
+        puts "no more urls"
+        return
+      end
+
+      if options[:depth] > 0 && depth > options[:depth]
+        next
+      end
+
+      uri = URI.parse(url)
+
+      while options[:limit_rate] > 0 && downloaded / (Time.now - @start) > options[:limit_rate]
+        puts "sleeping until under rate limit"
+        sleep 1
+      end
+      puts "download rate: #{downloaded / (Time.now - @start)}bps"
+
+      puts "downloading #{uri}"
+      effective_url, tmpfile = @fetch.fetch(uri, options[:user_agent])
+
+      if tmpfile
+        downloaded += File.size(tmpfile.path)
+        puts "parsing links"
+        @links.urls(effective_url, tmpfile).each do |link|
+          legal = legal?(link)
+          dupe = @dupes.dupe?(link)
+          puts "dupe: #{link}" if dupe
+          if legal && !dupe
+            puts "adding link: #{link}"
+            @queue.put(link, depth + 1)
+          end
+        end
+        key = key_for(uri)
+        puts "storing at #{key}"
+        @store.put(key, tmpfile)
+        sleep options[:wait]
+        tmpfile.close rescue nil
+      else
+        puts "unable to download"
+      end
+    end
+    puts "hit time/quota"
+  end
+
+  # A link is legal if it stays on the seed hosts (unless --span-hosts)
+  # and passes the accept/reject pattern lists.
+  def legal?(link)
+    unless options[:span_hosts] || @original_hosts.include?(link.host)
+      puts "can't span hosts: #{link}"
+      return false
+    end
+    link = link.to_s
+    legal = options[:accept_patterns].empty?
+    puts "accepted by default: #{link}" if legal
+    legal ||= options[:accept_patterns].any? { |p| link =~ p }
+    puts "not in accept patterns: #{link}" if !legal
+    rejected = options[:reject_patterns].any? { |p| link =~ p }
+    puts "in reject patterns: #{link}" if rejected
+    legal && !rejected
+  end
+
+  # Map a URI to the relative path it will be stored under.
+  def key_for(uri)
+    arr = []
+    arr << options[:prefix] if options[:prefix]
+    arr << @start_time if options[:timestampize]
+    arr << uri.scheme if options[:protocol_directories]
+    arr << uri.host unless options[:no_host_directories]
+    paths = uri.path.split("/")
+    paths << paths.pop + "?" + uri.query if uri.query
+    paths.shift if paths.first.to_s.empty?
+    File.join(arr + paths)
+  end
+
+  def close
+    [@queue, @fetch, @store, @links, @dupes].each do |obj|
+      obj.close if obj.respond_to?(:close)
+    end
+  end
+end
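
For orientation, key_for above composes the storage key from the directory options and then splits the URL path. A quick sketch of the default mapping (example.com and the query string are placeholders; this assumes the gem and its dependencies are installed):

    require "rubygems"
    require "rwget"

    controller = RWGet::Controller.new(:seeds => ["http://example.com/"])
    controller.key_for(URI.parse("http://example.com/a/b?q=1"))
    #=> "example.com/a/b?q=1"
    # With --protocol-directories the key gains a leading "http/";
    # with --no-host-directories it drops "example.com/".
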
data/lib/rwget/dupes.rb
ADDED
@@ -0,0 +1,18 @@
+require "rubygems"
+require "tempfile"
+require "bloomfilter"
+
+class RWGet::Dupes
+  SIZE = 1_000_000
+
+  def initialize(options = {})
+    @bloom = BloomFilter.new(SIZE, 4, 1)
+  end
+
+  def dupe?(uri)
+    key = uri.to_s
+    return true if @bloom.include?(key)
+    @bloom.insert(key)
+    return false
+  end
+end
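
The default dupe detector is a fixed-size Bloom filter, so it can occasionally report a false positive (a never-seen URL skipped as a dupe) but never a false negative, in exchange for constant memory across millions of URLs. A minimal exact alternative, assuming it is loaded with --require and selected with --dupes-class (SetDupes is a hypothetical name), could be:

    require "set"

    class SetDupes
      def initialize(options = {})
        @seen = Set.new   # exact membership; memory grows with the crawl
      end

      # Contract: dupe?(uri) returns true if the URL was already seen,
      # and records it otherwise. Set#add? returns nil on a repeat.
      def dupe?(uri)
        !@seen.add?(uri.to_s)
      end
    end
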
data/lib/rwget/fetch.rb
ADDED
@@ -0,0 +1,44 @@
+require "open-uri"
+require "tempfile"
+require "rubygems"
+require "robots"
+require "curl"
+class RWGet::Fetch
+  DEFAULT_TIMEOUT = 30
+  DEFAULT_REDIRECTS = 30
+
+  def initialize(options = {})
+    @robots = {}
+    @curl = Curl::Easy.new
+    @curl.connect_timeout = options[:connect_timeout] || DEFAULT_TIMEOUT
+    @curl.timeout = options[:timeout] || DEFAULT_TIMEOUT
+    @curl.max_redirects = options[:max_redirect] || DEFAULT_REDIRECTS
+    @curl.follow_location = true
+    if options[:http_proxy]
+      @curl.proxy_url = options[:http_proxy]
+      if options[:proxy_user]
+        @curl.proxypwd = "#{options[:proxy_user]}:#{options[:proxy_password]}"
+      end
+    end
+    puts "timeout: #{@curl.timeout}"
+  end
+
+  # Returns [final_effective_url, tempfile], or nil if the URL is blocked
+  # by robots.txt or the download fails.
+  def fetch(uri, user_agent)
+    @robots[user_agent] ||= Robots.new(user_agent)
+    unless @robots[user_agent].allowed?(uri)
+      puts "disallowed by robots.txt"
+      return nil
+    end
+
+    @curl.headers["User-Agent"] = user_agent
+    @curl.url = uri.to_s
+    @curl.perform
+    tmp = nil
+    Tempfile.open("curl") { |file| file.print(@curl.body_str); tmp = file }
+    tmp.open
+    [@curl.last_effective_url, tmp]
+  rescue Exception => e
+    STDERR.puts "#{uri} not retrieved: #{e.message}"
+    nil
+  end
+end
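
A replacement fetcher selected with --fetch-class only needs fetch(uri, user_agent) returning [final_url, open_file] on success and nil on failure, which the controller reports as "unable to download". A rough open-uri sketch (no proxy support or robots.txt check, so an assumption-laden substitute for the curb default, not an equivalent):

    require "open-uri"
    require "tempfile"

    class OpenURIFetch
      def initialize(options = {})
      end

      def fetch(uri, user_agent)
        tmp = Tempfile.new("open-uri-fetch")
        final = nil
        open(uri.to_s, "User-Agent" => user_agent) do |io|
          final = io.base_uri.to_s   # URL after any redirects
          tmp.write(io.read)
        end
        tmp.rewind
        [final, tmp]
      rescue StandardError => e
        STDERR.puts "#{uri} not retrieved: #{e.message}"
        nil
      end
    end
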
data/lib/rwget/links.rb
ADDED
@@ -0,0 +1,33 @@
+require "rubygems"
+require "hpricot"
+class RWGet::Links
+  def initialize(options = {})
+  end
+
+  # Extract absolute URLs from a fetched page: RSS <item><link> elements
+  # for XML documents, <a href> anchors for HTML.
+  def urls(base, tmpfile)
+    @urls = []
+    base = base.to_s
+    string = File.read(tmpfile.path)
+    xml = string =~ /<\?xml/
+    doc = xml ? Hpricot.XML(string) : Hpricot(string)
+
+    (doc / "//item/link").each do |l|
+      add base, l.inner_text
+    end
+    (doc / "a").each do |a|
+      add base, a.attributes["href"]
+    end
+    @urls
+  rescue Exception => e
+    STDERR.puts "Couldn't parse #{base} for links: #{e.message}"
+    []
+  end
+
+  def add(base, href)
+    begin
+      @urls << URI.join(base, href.strip) if href
+    rescue Exception => e
+      STDERR.puts "url error parsing URI.join(#{base.inspect}, #{href.inspect}): #{e.message}"
+    end
+  end
+end
data/lib/rwget/queue.rb
ADDED
@@ -0,0 +1,32 @@
+require "tempfile"
+class RWGet::Queue
+  # File-backed FIFO: appends go through @writer, reads through a second
+  # handle on the same file, so the frontier never has to fit in memory.
+  def initialize(options = {})
+    @writer = Tempfile.new("rwget-queue")
+    @reader = File.open(@writer.path, "r")
+    @dirty = false
+  end
+
+  def put(key, depth)
+    @writer.puts "#{key}\t#{depth}"
+    @dirty = true
+  end
+
+  def get(retrying = false)
+    sleep 0.1 if retrying
+    if @dirty
+      @writer.flush
+      @dirty = false
+    end
+    line = @reader.gets
+    unless line
+      return retrying ? nil : get(:retry)
+    end
+    key, depth = line.split("\t")
+    return [key, depth.to_i]
+  end
+
+  def close
+    @writer.close
+    @reader.close
+  end
+end
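
The tempfile-backed queue above appends "url\tdepth" lines and reads them back in order; a nil line (after one flush-and-retry) signals an empty frontier. For small crawls an in-memory equivalent is trivial; a sketch, assuming selection via --queue-class (ArrayQueue is a hypothetical name):

    class ArrayQueue
      def initialize(options = {})
        @items = []
      end

      def put(key, depth)
        @items << [key, depth]
      end

      # Returns [key_string, depth_int], or nil when the frontier is empty,
      # which is the controller's signal to stop.
      def get
        @items.shift
      end
    end
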
data/lib/rwget/rwget_option_parser.rb
ADDED
@@ -0,0 +1,165 @@
+require 'optparse'
+
+class RWGetOptionParser < OptionParser
+  attr_accessor :options
+
+  def usage
+    "Usage: #{$0} [options] SEED_URL [SEED_URL2 ...]"
+  end
+
+  def parse!
+    super
+    options[:seeds] ||= []
+    options[:seeds] += ARGV
+  end
+
+  def initialize
+    self.options = {}
+    super do |opts|
+
+      yield opts if block_given?
+
+      opts.banner = usage
+
+      opts.on("-w", "--wait=SECONDS", "wait SECONDS between retrievals.") do |w|
+        options[:wait] = w.to_i
+      end
+
+      opts.on("-P", "--directory-prefix=PREFIX", "save files to PREFIX/...") do |p|
+        options[:prefix] = p
+      end
+
+      opts.on("-U", "--user-agent=AGENT", "identify as AGENT instead of RWget/VERSION.") do |u|
+        options[:user_agent] = u
+      end
+
+      opts.on("-A", "--accept-pattern=RUBY_REGEX", "URLs must match RUBY_REGEX to be saved to the queue.") do |r|
+        options[:accept_patterns] ||= []
+        options[:accept_patterns] << Regexp.new(r)
+      end
+
+      opts.on("--time-limit=AMOUNT", "Crawler will stop after this AMOUNT of time has passed.") do |t|
+        options[:time_limit] = t.to_i
+        options[:time_limit] *= 60 if t =~ /m/i
+        options[:time_limit] *= 60 * 60 if t =~ /h/i
+        options[:time_limit] *= 60 * 60 * 24 if t =~ /d/i
+        options[:time_limit] *= 60 * 60 * 24 * 7 if t =~ /w/i
+      end
+
+      opts.on("-R", "--reject-pattern=RUBY_REGEX", "URLs must NOT match RUBY_REGEX to be saved to the queue.") do |r|
+        options[:reject_patterns] ||= []
+        options[:reject_patterns] << Regexp.new(r)
+      end
+
+      opts.on("--limit-rate=RATE", "limit download rate to RATE.") do |r|
+        rate = r.to_i
+        rate *= 1000 if r =~ /k/i
+        rate *= 1000000 if r =~ /m/i
+        options[:limit_rate] = rate
+        puts "rate is #{rate}"
+      end
+
+      opts.on("--http-proxy=URL", "Proxies via URL") do |u|
+        options[:http_proxy] = u
+      end
+
+      opts.on("--proxy-user=USER", "Sets proxy user to USER") do |u|
+        options[:proxy_user] = u
+      end
+
+      opts.on("--proxy-password=PASSWORD", "Sets proxy password to PASSWORD") do |p|
+        options[:proxy_password] = p
+      end
+
+      opts.on("--require=RUBY_SCRIPT", "Will execute 'require RUBY_SCRIPT'") do |s|
+        require s
+      end
+
+      opts.on("--fetch-class=RUBY_CLASS", "Must implement fetch(uri, user_agent_string) #=> [final_redirected_url, file_object] (Load the class with --require)") do |c|
+        options[:fetch_class] = c
+      end
+
+      opts.on("--store-class=RUBY_CLASS", "Must implement put(key_string, temp_file) (Load the class with --require)") do |c|
+        options[:store_class] = c
+      end
+
+      opts.on("--dupes-class=RUBY_CLASS", "Must implement dupe?(uri) (Load the class with --require)") do |c|
+        options[:dupes_class] = c
+      end
+
+      opts.on("--queue-class=RUBY_CLASS", "Must implement put(key_string, depth_int) and get() #=> [key_string, depth_int] (Load the class with --require)") do |c|
+        options[:queue_class] = c
+      end
+
+      opts.on("--links-class=RUBY_CLASS", "Must implement urls(base_uri, temp_file) #=> [uri, ...]") do |c|
+        options[:links_class] = c
+      end
+
+      opts.on("-S", "--sitemap=URL", "URL of a sitemap to crawl (will ignore inter-page links)") do |url|
+        options[:seeds] ||= []
+        options[:seeds] << url
+        options[:links_class] = "RWGet::SitemapLinks"
+      end
+
+      opts.on("-V", "--version") do
+        puts File.read(File.dirname(__FILE__) + "/../../VERSION")
+        exit
+      end
+
+      opts.on("-Q", "--quota=NUMBER", "set retrieval quota to NUMBER.") do |q|
+        options[:quota] = q.to_i
+        options[:quota] *= 1000 if q =~ /k/i
+        options[:quota] *= 1000000 if q =~ /m/i
+      end
+
+      opts.on("--max-redirect=NUM", "maximum redirections allowed per page.") do |m|
+        options[:max_redirect] = m.to_i
+      end
+
+      opts.on("-H", "--span-hosts", "go to foreign hosts when recursive") do |s|
+        options[:span_hosts] = s
+      end
+
+      opts.on("--connect-timeout=SECS", "set the connect timeout to SECS.") do |t|
+        options[:connect_timeout] = t.to_i
+      end
+
+      opts.on("-T", "--timeout=SECS", "set all timeout values to SECONDS.") do |t|
+        options[:timeout] = t.to_i
+      end
+
+      opts.on("-l", "--level=NUMBER", "maximum recursion depth (inf or 0 for infinite).") do |l|
+        options[:depth] = l.to_i
+      end
+
+      opts.on("--[no-]timestampize", "Prepend the timestamp of when the crawl started to the directory structure.") do |t|
+        options[:timestampize] = t
+      end
+
+      opts.on("--incremental-from=PREVIOUS", "Build upon the indexing already saved in PREVIOUS.") do |r|
+        options[:incremental_from] = r
+      end
+
+      opts.on("--protocol-directories", "use protocol name in directories.") do |p|
+        options[:protocol_directories] = p
+      end
+
+      opts.on("--no-host-directories", "don't create host directories.") do |h|
+        options[:no_host_directories] = h
+      end
+
+      opts.on("-v", "--[no-]verbose", "Run verbosely") do |v|
+        options[:verbose] = v
+      end
+
+      opts.on_tail("-h", "--help", "Show this message") do
+        puts opts
+        exit
+      end
+    end
+  end
+end
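
Tying it together: bin/rwget earlier in this diff is exactly this parser followed by the controller, and the same flow works programmatically. A sketch (the URL and pattern are placeholders):

    require "rubygems"
    require "rwget"

    ARGV.replace(%w[-T 5 -A .*foo.* http://example.com/])
    parser = RWGetOptionParser.new
    parser.parse!                # leftover ARGV entries become :seeds

    controller = RWGet::Controller.new(parser.options)
    controller.start
    controller.close
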