rwget 0.0.0 → 0.5.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,5 @@
+ README.rdoc
+ lib/**/*.rb
+ bin/*
+ features/**/*.feature
+ LICENSE
@@ -0,0 +1,5 @@
+ *.sw?
+ .DS_Store
+ coverage
+ rdoc
+ pkg
@@ -0,0 +1,57 @@
+ # RWGet
+
+ RWGet is a web crawler that emulates a subset of the GNU Wget interface, but with more flexibility for my needs.
+
+ ## Features
+
+ 1. Regular-expression accept/reject lists
+ 2. Pluggable interfaces for robots.txt handling, URL fetching, URL queueing, duplicate detection, and page storage. The defaults store locally and fetch with libcurl, but you could easily swap in database storage, a distributed queue, etc. (see the sketch below).
+
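+ For example, a custom storage backend only needs a `put` method. A minimal sketch (`MyStore` is a hypothetical name; load it with `--require`):
+
+     require "fileutils"
+
+     class MyStore
+       def initialize(options)
+         @prefix = options[:prefix] || "."
+       end
+
+       # key_string is a relative path; temp_file is an open file handle
+       def put(key_string, temp_file)
+         path = File.join(@prefix, key_string)
+         FileUtils.mkdir_p(File.dirname(path))
+         FileUtils.cp(temp_file.path, path)
+       end
+     end
+
+     # sh$ rwget --require ./my_store.rb --store-class MyStore http://example.com/
+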
+ ## Help page
+
+     Usage: /usr/bin/rwget [options] SEED_URL [SEED_URL2 ...]
+     -w, --wait=SECONDS wait SECONDS between retrievals.
+     -P, --directory-prefix=PREFIX save files to PREFIX/...
+     -U, --user-agent=AGENT identify as AGENT instead of RWget/VERSION.
+     -A, --accept-pattern=RUBY_REGEX URLs must match RUBY_REGEX to be saved to the queue.
+     --time-limit=AMOUNT Crawler will stop after this AMOUNT of time has passed.
+     -R, --reject-pattern=RUBY_REGEX URLs must NOT match RUBY_REGEX to be saved to the queue.
+     --require=RUBY_SCRIPT Will execute 'require RUBY_SCRIPT'
+     --limit-rate=RATE limit download rate to RATE.
+     --http-proxy=URL Proxies via URL
+     --proxy-user=USER Sets proxy user to USER
+     --proxy-password=PASSWORD Sets proxy password to PASSWORD
+     --fetch-class=RUBY_CLASS Must implement fetch(uri, user_agent_string) #=> [final_redirected_url, file_object]
+     --store-class=RUBY_CLASS Must implement put(key_string, temp_file)
+     --dupes-class=RUBY_CLASS Must implement dupe?(uri)
+     --queue-class=RUBY_CLASS Must implement put(key_string, depth_int) and get() #=> [key_string, depth_int]
+     --links-class=RUBY_CLASS Must implement urls(base_uri, temp_file) #=> [uri, ...]
+     -S, --sitemap=URL URL of a sitemap to crawl (will ignore inter-page links)
+
+     -Q, --quota=NUMBER set retrieval quota to NUMBER.
+     --max-redirect=NUM maximum redirections allowed per page.
+     -H, --span-hosts go to foreign hosts when recursive
+     --connect-timeout=SECS set the connect timeout to SECS.
+     -T, --timeout=SECS set all timeout values to SECS.
+     -l, --level=NUMBER maximum recursion depth (inf or 0 for infinite).
+     --[no-]timestampize Prepend the timestamp of when the crawl started to the directory structure.
+     --incremental-from=PREVIOUS Build upon the indexing already saved in PREVIOUS.
+     --protocol-directories use protocol name in directories.
+     --no-host-directories don't create host directories.
+     -v, --[no-]verbose Run verbosely
+     -h, --help Show this message
+
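+ For example, a hypothetical crawl of example.com, two levels deep, waiting one second between requests and keeping only HTML links:
+
+     rwget -l 2 -w 1 -A "\.html?$" -P ./crawl http://example.com/
+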
+ ## Ruby API
+
+     require "rubygems"
+     require "rwget"
+
+     # options is a Hash keyed like the command-line long options, converted
+     # into idiomatic ruby. See the RDoc for details.
+     # e.g.
+     #   sh$ rwget -T 5 -A ".*foo.*" http://google.com
+     # becomes:
+     #   irb$ RWGet::Controller.new({:seeds => ["http://google.com"],
+     #          :timeout => 5, :accept_patterns => [/.*foo.*/]}).start
+
+     RWGet::Controller.new(options).start
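+
+ Plugin classes can be passed by name through the options hash as well (a sketch, reusing the hypothetical MyStore from above):
+
+     RWGet::Controller.new(:seeds => ["http://example.com/"],
+                           :store_class => "MyStore").start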
@@ -0,0 +1,60 @@
+ require 'rubygems'
+ require 'rake'
+
+ begin
+   require 'jeweler'
+   Jeweler::Tasks.new do |gem|
+     gem.name = "rwget"
+     gem.summary = %Q{Ruby port of wget, emphasis on recursive/crawler}
+     gem.email = "kyle@kylemaxwell.com"
+     gem.homepage = "http://github.com/fizx/rwget"
+     gem.authors = ["Kyle Maxwell"]
+     gem.add_dependency("curb", ["> 0.0.0"])
+     gem.add_dependency("hpricot", ["> 0.0.0"])
+     gem.add_dependency("fizx-robots", [">= 0.3.1"])
+     gem.add_dependency("igrigorik-bloomfilter", ["> 0.0.0"])
+     gem.add_dependency("libxml-ruby", ["> 0.9"])
+   end
+   Jeweler::GemcutterTasks.new
+ rescue LoadError
+   puts "Jeweler (or a dependency) not available. Install it with: sudo gem install jeweler"
+ end
+
+ require 'rake/testtask'
+ Rake::TestTask.new(:test) do |test|
+   test.libs << 'lib' << 'test'
+   test.pattern = 'test/**/*_test.rb'
+   test.verbose = true
+ end
+
+ begin
+   require 'rcov/rcovtask'
+   Rcov::RcovTask.new do |test|
+     test.libs << 'test'
+     test.pattern = 'test/**/*_test.rb'
+     test.verbose = true
+   end
+ rescue LoadError
+   task :rcov do
+     abort "RCov is not available. In order to run rcov, you must: sudo gem install spicycode-rcov"
+   end
+ end
+
+
+ task :default => :test
+
+ require 'rake/rdoctask'
+ Rake::RDocTask.new do |rdoc|
+   if File.exist?('VERSION.yml')
+     config = YAML.load(File.read('VERSION.yml'))
+     version = "#{config[:major]}.#{config[:minor]}.#{config[:patch]}"
+   else
+     version = ""
+   end
+
+   rdoc.rdoc_dir = 'rdoc'
+   rdoc.title = "rwget #{version}"
+   rdoc.rdoc_files.include('README*')
+   rdoc.rdoc_files.include('lib/**/*.rb')
+ end
+
data/VERSION ADDED
@@ -0,0 +1 @@
+ 0.5.3
@@ -0,0 +1,15 @@
+ #!/usr/bin/env ruby
+ require File.dirname(__FILE__) + "/../lib/rwget"
+
+ parser = RWGetOptionParser.new
+ parser.parse!
+
+ if parser.options[:seeds].empty?
+   puts parser.usage
+   puts " -h for options listing"
+   exit(1)
+ end
+
+ controller = RWGet::Controller.new(parser.options)
+ controller.start
+ controller.close
@@ -0,0 +1,5 @@
+ module RWGet
+ end
+ Dir[File.dirname(__FILE__) + "/rwget/*.rb"].each do |f|
+   require f.gsub(/\.rb$/, '')
+ end
@@ -0,0 +1,121 @@
+ require "set"
+ require "uri" # for URI.parse below
+ class RWGet::Controller
+   attr_reader :options
+
+   # Turns a class name like "RWGet::Queue" into the class object itself.
+   def self.resolve_class(string)
+     string.split("::").inject(Kernel) do |const, name|
+       const.const_get(name)
+     end
+   end
+
+   def initialize(options)
+     @options = options
+     @options[:user_agent] ||= "Ruby/Wget"
+
+     @options[:accept_patterns] ||= []
+     @options[:reject_patterns] ||= []
+
+     %w[quota depth wait limit_rate time_limit].each do |key|
+       key = key.to_sym
+       @options[key] = @options[key].to_i
+     end
+
+     @queue = (options[:queue_class] ? self.class.resolve_class(options[:queue_class]) : RWGet::Queue).new(options)
+     @fetch = (options[:fetch_class] ? self.class.resolve_class(options[:fetch_class]) : RWGet::Fetch).new(options)
+     @store = (options[:store_class] ? self.class.resolve_class(options[:store_class]) : RWGet::Store).new(options)
+     @links = (options[:links_class] ? self.class.resolve_class(options[:links_class]) : RWGet::Links).new(options)
+     @dupes = (options[:dupes_class] ? self.class.resolve_class(options[:dupes_class]) : RWGet::Dupes).new(options)
+   end
+
+   def start
+     @start_time = Time.now.to_i.to_s
+     @start = Time.now
+     @original_hosts = Set.new
+     options[:seeds].each do |seed|
+       @queue.put(seed, 0)
+       @original_hosts << URI.parse(seed).host
+     end
+
+     downloaded = 0
+     while (options[:quota] == 0 || downloaded < options[:quota]) &&
+           (options[:time_limit] == 0 || Time.now - @start < options[:time_limit])
+
+       url, depth = @queue.get
+
+       unless url
+         puts "no more urls"
+         return
+       end
+
+       if options[:depth] > 0 && depth > options[:depth]
+         next
+       end
+
+       uri = URI.parse(url)
+
+       while options[:limit_rate] > 0 && downloaded / (Time.now - @start) > options[:limit_rate]
+         puts "sleeping until under rate limit"
+         sleep 1
+       end
+       puts "download rate: #{downloaded / (Time.now - @start)}bps"
+
+       puts "downloading #{uri}"
+       effective_url, tmpfile = @fetch.fetch(uri, options[:user_agent])
+
+       if tmpfile
+         downloaded += File.size(tmpfile.path)
+         puts "parsing links"
+         @links.urls(effective_url, tmpfile).each do |link|
+           legal = legal?(link)
+           dupe = @dupes.dupe?(link)
+           puts "dupe: #{link}" if dupe
+           if legal && !dupe
+             puts "adding link: #{link}"
+             @queue.put(link, depth + 1)
+           end
+         end
+         key = key_for(uri)
+         puts "storing at #{key}"
+         @store.put(key, tmpfile)
+         sleep options[:wait]
+         tmpfile.close rescue nil
+       else
+         puts "unable to download"
+       end
+     end
+     puts "hit time/quota"
+   end
+
+   # A link is legal when it stays on an allowed host, matches the accept
+   # patterns (or none are given), and matches no reject pattern.
+   def legal?(link)
+     unless options[:span_hosts] || @original_hosts.include?(link.host)
+       puts "can't span hosts: #{link}"
+       return false
+     end
+     link = link.to_s
+     legal = options[:accept_patterns].empty?
+     puts "accepted by default: #{link}" if legal
+     legal ||= options[:accept_patterns].any?{|p| link =~ p}
+     puts "not in accept patterns: #{link}" if !legal
+     rejected = options[:reject_patterns].any?{|p| link =~ p}
+     puts "in reject patterns: #{link}" if rejected
+     legal && !rejected
+   end
+
+   # Builds the storage path for a URI from the configured directory options.
+   def key_for(uri)
+     arr = []
+     arr << options[:prefix] if options[:prefix]
+     arr << @start_time if options[:timestampize]
+     arr << uri.scheme if options[:protocol_directories]
+     arr << uri.host unless options[:no_host_directories]
+     paths = uri.path.split("/")
+     paths << paths.pop + "?" + uri.query if uri.query
+     paths.shift if paths.first.to_s.empty?
+     File.join(arr + paths)
+   end
+
+   def close
+     [@queue, @fetch, @store, @links, @dupes].each do |obj|
+       obj.close if obj.respond_to?(:close)
+     end
+   end
+ end
@@ -0,0 +1,18 @@
+ require "rubygems"
+ require "tempfile"
+ require "bloomfilter"
+
+ class RWGet::Dupes
+   SIZE = 1_000_000
+
+   def initialize(options = {})
+     @bloom = BloomFilter.new(SIZE, 4, 1)
+   end
+
+   # Bloom-filter membership: may rarely report a false positive (a URL
+   # wrongly treated as already seen), but never a false negative.
+   def dupe?(uri)
+     key = uri.to_s
+     return true if @bloom.include?(key)
+     @bloom.insert(key)
+     return false
+   end
+ end
@@ -0,0 +1,44 @@
+ require "open-uri"
+ require "tempfile"
+ require "rubygems"
+ require "robots"
+ require "curl"
+ class RWGet::Fetch
+   DEFAULT_TIMEOUT = 30
+   DEFAULT_REDIRECTS = 30
+
+   def initialize(options = {})
+     @robots = {}
+     @curl = Curl::Easy.new
+     @curl.connect_timeout = options[:connect_timeout] || DEFAULT_TIMEOUT
+     @curl.timeout = options[:timeout] || DEFAULT_TIMEOUT
+     @curl.max_redirects = options[:max_redirect] || DEFAULT_REDIRECTS
+     @curl.follow_location = true
+     if options[:http_proxy]
+       @curl.proxy_url = options[:http_proxy]
+       if options[:proxy_user]
+         @curl.proxypwd = "#{options[:proxy_user]}:#{options[:proxy_password]}"
+       end
+     end
+     puts "timeout: #{@curl.timeout}"
+   end
+
+   def fetch(uri, user_agent)
+     @robots[user_agent] ||= Robots.new(user_agent)
+     unless @robots[user_agent].allowed?(uri)
+       puts "disallowed by robots.txt"
+       return nil
+     end
+
+     @curl.headers["User-Agent"] = user_agent
+     @curl.url = uri.to_s
+     @curl.perform
+     tmp = nil
+     Tempfile.open("curl") {|file| file.print(@curl.body_str); tmp = file }
+     tmp.open # the block form closed the tempfile; reopen it for the caller
+     [@curl.last_effective_url, tmp]
+   rescue Exception => e
+     STDERR.puts "#{uri} not retrieved: #{e.message}"
+     nil
+   end
+ end
@@ -0,0 +1,33 @@
+ require "rubygems"
+ require "hpricot"
+ class RWGet::Links
+   def initialize(options = {})
+   end
+
+   # Extracts absolute URLs from a fetched page: RSS <item><link> elements
+   # when the document is XML, plus any <a href> anchors.
+   def urls(base, tmpfile)
+     @urls = []
+     base = base.to_s
+     string = File.read(tmpfile.path)
+     xml = string =~ /<\?xml/
+     doc = xml ? Hpricot.XML(string) : Hpricot(string)
+
+     (doc / "//item/link").each do |l|
+       add base, l.inner_text
+     end
+     (doc / "a").each do |a|
+       add base, a.attributes["href"]
+     end
+     @urls
+   rescue Exception => e
+     STDERR.puts "Couldn't parse #{base} for links: #{e.message}"
+     []
+   end
+
+   def add(base, href)
+     begin
+       @urls << URI.join(base, href.strip) if href
+     rescue Exception => e
+       STDERR.puts "url error parsing URI.join(#{base.inspect}, #{href.inspect}): #{e.message}"
+     end
+   end
+ end
@@ -0,0 +1,32 @@
+ require "tempfile"
+ class RWGet::Queue
+   def initialize(options = {})
+     @writer = Tempfile.new("rwget-queue")
+     @reader = File.open(@writer.path, "r")
+     @dirty = false
+   end
+
+   def put(key, depth)
+     @writer.puts "#{key}\t#{depth}"
+     @dirty = true
+   end
+
+   # Reads the next "key\tdepth" line, flushing pending writes first.
+   # Retries once after a short sleep before giving up and returning nil.
+   def get(retrying = false)
+     sleep 0.1 if retrying
+     if @dirty
+       @writer.flush
+       @dirty = false
+     end
+     line = @reader.gets
+     unless line
+       return retrying ? nil : get(:retry)
+     end
+     key, depth = line.split("\t")
+     return [key, depth.to_i]
+   end
+
+   def close
+     @writer.close
+     @reader.close
+   end
+ end
@@ -0,0 +1,165 @@
+ require 'optparse'
+
+ class RWGetOptionParser < OptionParser
+   attr_accessor :options
+
+   def usage
+     "Usage: #{$0} [options] SEED_URL [SEED_URL2 ...]"
+   end
+
+   def parse!
+     super
+     options[:seeds] ||= []
+     options[:seeds] += ARGV
+   end
+
+   def initialize
+     self.options = {}
+     super do |opts|
+
+       yield opts if block_given?
+
+       opts.banner = usage
+
+       opts.on("-w", "--wait=SECONDS", "wait SECONDS between retrievals.") do |w|
+         options[:wait] = w.to_i
+       end
+
+       opts.on("-P", "--directory-prefix=PREFIX", "save files to PREFIX/...") do |p|
+         options[:prefix] = p
+       end
+
+       opts.on("-U", "--user-agent=AGENT", "identify as AGENT instead of RWget/VERSION.") do |u|
+         options[:user_agent] = u
+       end
+
+       opts.on("-Ap", "--accept-pattern=RUBY_REGEX", "URLs must match RUBY_REGEX to be saved to the queue.") do |r|
+         options[:accept_patterns] ||= []
+         options[:accept_patterns] << Regexp.new(r)
+       end
+
+       # AMOUNT is in seconds unless suffixed with m/h/d/w.
+       opts.on("--time-limit=AMOUNT", "Crawler will stop after this AMOUNT of time has passed.") do |t|
+         options[:time_limit] = t.to_i
+         options[:time_limit] *= 60 if t =~ /m/i
+         options[:time_limit] *= 60 * 60 if t =~ /h/i
+         options[:time_limit] *= 60 * 60 * 24 if t =~ /d/i
+         options[:time_limit] *= 60 * 60 * 24 * 7 if t =~ /w/i
+       end
+
+       opts.on("-Rp", "--reject-pattern=RUBY_REGEX", "URLs must NOT match RUBY_REGEX to be saved to the queue.") do |r|
+         options[:reject_patterns] ||= []
+         options[:reject_patterns] << Regexp.new(r)
+       end
+
+       opts.on("--limit-rate=RATE", "limit download rate to RATE.") do |r|
+         rate = r.to_i
+         rate *= 1000 if r =~ /k/i
+         rate *= 1000000 if r =~ /m/i
+         options[:limit_rate] = rate
+         puts "rate is #{rate}"
+       end
+
+       opts.on("--http-proxy=URL", "Proxies via URL") do |u|
+         options[:http_proxy] = u
+       end
+
+       opts.on("--proxy-user=USER", "Sets proxy user to USER") do |u|
+         options[:proxy_user] = u
+       end
+
+       opts.on("--proxy-password=PASSWORD", "Sets proxy password to PASSWORD") do |p|
+         options[:proxy_password] = p
+       end
+
+       opts.on("--require=RUBY_SCRIPT", "Will execute 'require RUBY_SCRIPT'") do |s|
+         require s
+       end
+
+       opts.on("--fetch-class=RUBY_CLASS", "Must implement fetch(uri, user_agent_string) #=> [final_redirected_url, file_object] (Load the class with --require)") do |c|
+         options[:fetch_class] = c
+       end
+
+       opts.on("--store-class=RUBY_CLASS", "Must implement put(key_string, temp_file) (Load the class with --require)") do |c|
+         options[:store_class] = c
+       end
+
+       opts.on("--dupes-class=RUBY_CLASS", "Must implement dupe?(uri) (Load the class with --require)") do |c|
+         options[:dupes_class] = c
+       end
+
+       opts.on("--queue-class=RUBY_CLASS", "Must implement put(key_string, depth_int) and get() #=> [key_string, depth_int] (Load the class with --require)") do |c|
+         options[:queue_class] = c
+       end
+
+       opts.on("--links-class=RUBY_CLASS", "Must implement urls(base_uri, temp_file) #=> [uri, ...]") do |c|
+         options[:links_class] = c
+       end
+
+       opts.on("-S", "--sitemap=URL", "URL of a sitemap to crawl (will ignore inter-page links)") do |url|
+         options[:seeds] ||= []
+         options[:seeds] << url
+         options[:links_class] = "RWGet::SitemapLinks"
+       end
+
+       opts.on("-V", "--version") do
+         puts File.read(File.dirname(__FILE__) + "/../../VERSION")
+         exit
+       end
+
+       opts.on("-Q", "--quota=NUMBER", "set retrieval quota to NUMBER.") do |q|
+         options[:quota] = q.to_i
+         options[:quota] *= 1000 if q =~ /k/i
+         options[:quota] *= 1000000 if q =~ /m/i
+       end
+
+       opts.on("--max-redirect=NUM", "maximum redirections allowed per page.") do |m|
+         options[:max_redirect] = m.to_i
+       end
+
+       opts.on("-H", "--span-hosts", "go to foreign hosts when recursive") do |s|
+         options[:span_hosts] = s
+       end
+
+       opts.on("--connect-timeout=SECS", "set the connect timeout to SECS.") do |t|
+         options[:connect_timeout] = t.to_i
+       end
+
+       opts.on("-T", "--timeout=SECS", "set all timeout values to SECS.") do |t|
+         options[:timeout] = t.to_i
+       end
+
+       opts.on("-l", "--level=NUMBER", "maximum recursion depth (inf or 0 for infinite).") do |l|
+         options[:depth] = l.to_i
+       end
+
+       opts.on("--[no-]timestampize", "Prepend the timestamp of when the crawl started to the directory structure.") do |t|
+         options[:timestampize] = t
+       end
+
+       opts.on("--incremental-from=PREVIOUS", "Build upon the indexing already saved in PREVIOUS.") do |r|
+         options[:incremental_from] = r
+       end
+
+       opts.on("--protocol-directories", "use protocol name in directories.") do |p|
+         options[:protocol_directories] = p
+       end
+
+       opts.on("--no-host-directories", "don't create host directories.") do |h|
+         options[:no_host_directories] = h
+       end
+
+       opts.on("-v", "--[no-]verbose", "Run verbosely") do |v|
+         options[:verbose] = v
+       end
+
+       opts.on_tail("-h", "--help", "Show this message") do
+         puts opts
+         exit
+       end
+     end
+   end
+ end