rwget 0.0.0 → 0.5.3

data/.document ADDED
@@ -0,0 +1,5 @@
+ README.rdoc
+ lib/**/*.rb
+ bin/*
+ features/**/*.feature
+ LICENSE
data/.gitignore ADDED
@@ -0,0 +1,5 @@
+ *.sw?
+ .DS_Store
+ coverage
+ rdoc
+ pkg
data/README.md ADDED
@@ -0,0 +1,57 @@
+ # RWGet
+
+ RWget is a web crawler that emulates a subset of the GNU Wget command-line interface, with extra flexibility where I needed it.
+
+ ## Features
+
+ 1. Regular expression accept/reject lists
+ 2. Pluggable interfaces for robots.txt checking, URL fetching, the URL queue, URL dupe detection, and page storage. The defaults store to local disk and fetch using libcurl, but you could easily swap in DB storage, a distributed queue, etc. (a sketch follows at the end of this README).
+
+ ## Help page
+
+     Usage: /usr/bin/rwget [options] SEED_URL [SEED_URL2 ...]
+         -w, --wait=SECONDS               wait SECONDS between retrievals.
+         -P, --directory-prefix=PREFIX    save files to PREFIX/...
+         -U, --user-agent=AGENT           identify as AGENT instead of RWget/VERSION.
+         -A, --accept-pattern=RUBY_REGEX  URLs must match RUBY_REGEX to be saved to the queue.
+             --time-limit=AMOUNT          Crawler will stop after this AMOUNT of time has passed.
+         -R, --reject-pattern=RUBY_REGEX  URLs must NOT match RUBY_REGEX to be saved to the queue.
+             --require=RUBY_SCRIPT        Will execute 'require RUBY_SCRIPT'
+             --limit-rate=RATE            limit download rate to RATE.
+             --http-proxy=URL             Proxies via URL
+             --proxy-user=USER            Sets proxy user to USER
+             --proxy-password=PASSWORD    Sets proxy password to PASSWORD
+             --fetch-class=RUBY_CLASS     Must implement fetch(uri, user_agent_string) #=> [final_redirected_url, file_object]
+             --store-class=RUBY_CLASS     Must implement put(key_string, temp_file)
+             --dupes-class=RUBY_CLASS     Must implement dupe?(uri)
+             --queue-class=RUBY_CLASS     Must implement put(key_string, depth_int) and get() #=> [key_string, depth_int]
+             --links-class=RUBY_CLASS     Must implement urls(base_uri, temp_file) #=> [uri, ...]
+         -S, --sitemap=URL                URL of a sitemap to crawl (will ignore inter-page links)
+         -Q, --quota=NUMBER               set retrieval quota to NUMBER.
+             --max-redirect=NUM           maximum redirections allowed per page.
+         -H, --span-hosts                 go to foreign hosts when recursive
+             --connect-timeout=SECS       set the connect timeout to SECS.
+         -T, --timeout=SECS               set all timeout values to SECS.
+         -l, --level=NUMBER               maximum recursion depth (inf or 0 for infinite).
+             --[no-]timestampize          Prepend the timestamp of when the crawl started to the directory structure.
+             --incremental-from=PREVIOUS  Build upon the indexing already saved in PREVIOUS.
+             --protocol-directories       use protocol name in directories.
+             --no-host-directories        don't create host directories.
+         -v, --[no-]verbose               Run verbosely
+         -h, --help                       Show this message
+
+ ## Ruby API
+
+     require "rubygems"
+     require "rwget"
+
+     # options is a Hash keyed like the command-line long options, converted
+     # into idiomatic Ruby. See the RDoc for details, e.g.
+     #   sh$ rwget -T 5 -A ".*foo.*" http://google.com
+     # becomes:
+     #   irb$ RWGet::Controller.new({:seeds => ["http://google.com"],
+     #                               :timeout => 5, :accept_patterns => [/.*foo.*/]}).start
+
+     RWGet::Controller.new(options).start
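
For illustration, here is a minimal sketch of the store interface mentioned under Features. Everything named here is hypothetical (the class and file are invented); the only contract is the one stated by --store-class above, put(key_string, temp_file), plus an initializer that accepts the options hash:

    # my_store.rb -- hypothetical, not shipped with the gem
    require "fileutils"

    class DiskMirrorStore
      def initialize(options = {})
        @root = "mirror"
      end

      # Contract from --store-class: put(key_string, temp_file)
      def put(key, tmpfile)
        path = File.join(@root, key)
        FileUtils.mkdir_p(File.dirname(path))
        FileUtils.cp(tmpfile.path, path)
      end
    end

Loaded and selected on the command line (assuming my_store.rb is resolvable by require; on newer Rubies use --require ./my_store):

    rwget --require my_store --store-class DiskMirrorStore http://example.com/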
data/Rakefile ADDED
@@ -0,0 +1,60 @@
+ require 'rubygems'
+ require 'rake'
+
+ begin
+   require 'jeweler'
+   Jeweler::Tasks.new do |gem|
+     gem.name = "rwget"
+     gem.summary = %Q{Ruby port of wget, emphasis on recursive/crawler}
+     gem.email = "kyle@kylemaxwell.com"
+     gem.homepage = "http://github.com/fizx/rwget"
+     gem.authors = ["Kyle Maxwell"]
+     gem.add_dependency("curb", ["> 0.0.0"])
+     gem.add_dependency("hpricot", ["> 0.0.0"])
+     gem.add_dependency("fizx-robots", [">= 0.3.1"])
+     gem.add_dependency("igrigorik-bloomfilter", ["> 0.0.0"])
+     gem.add_dependency("libxml-ruby", ["> 0.9"])
+   end
+   Jeweler::GemcutterTasks.new
+ rescue LoadError
+   puts "Jeweler (or a dependency) not available. Install it with: sudo gem install jeweler"
+ end
+
+ require 'rake/testtask'
+ Rake::TestTask.new(:test) do |test|
+   test.libs << 'lib' << 'test'
+   test.pattern = 'test/**/*_test.rb'
+   test.verbose = true
+ end
+
+ begin
+   require 'rcov/rcovtask'
+   Rcov::RcovTask.new do |test|
+     test.libs << 'test'
+     test.pattern = 'test/**/*_test.rb'
+     test.verbose = true
+   end
+ rescue LoadError
+   task :rcov do
+     abort "RCov is not available. In order to run rcov, you must: sudo gem install spicycode-rcov"
+   end
+ end
+
+ task :default => :test
+
+ require 'rake/rdoctask'
+ Rake::RDocTask.new do |rdoc|
+   if File.exist?('VERSION.yml')
+     config = YAML.load(File.read('VERSION.yml'))
+     version = "#{config[:major]}.#{config[:minor]}.#{config[:patch]}"
+   else
+     version = ""
+   end
+
+   rdoc.rdoc_dir = 'rdoc'
+   rdoc.title = "rwget #{version}"
+   rdoc.rdoc_files.include('README*')
+   rdoc.rdoc_files.include('lib/**/*.rb')
+ end
data/VERSION ADDED
@@ -0,0 +1 @@
+ 0.5.3
data/bin/rwget ADDED
@@ -0,0 +1,15 @@
+ #!/usr/bin/env ruby
+ require File.dirname(__FILE__) + "/../lib/rwget"
+
+ parser = RWGetOptionParser.new
+ parser.parse!
+
+ if parser.options[:seeds].empty?
+   puts parser.usage
+   puts " -h for options listing"
+   exit(1)
+ end
+
+ controller = RWGet::Controller.new(parser.options)
+ controller.start
+ controller.close
data/lib/rwget.rb ADDED
@@ -0,0 +1,5 @@
+ module RWGet
+ end
+ Dir[File.dirname(__FILE__) + "/rwget/*.rb"].each do |f|
+   require f.gsub(/\.rb$/, '')
+ end
data/lib/rwget/controller.rb ADDED
@@ -0,0 +1,121 @@
+ require "set"
+ require "uri"
+
+ class RWGet::Controller
+   attr_reader :options
+
+   def self.resolve_class(string)
+     string.split("::").inject(Kernel) do |const, name|
+       const.const_get(name)
+     end
+   end
+
+   def initialize(options)
+     @options = options
+     @options[:user_agent] ||= "Ruby/Wget"
+
+     @options[:accept_patterns] ||= []
+     @options[:reject_patterns] ||= []
+
+     %w[quota depth wait limit_rate time_limit].each do |key|
+       key = key.to_sym
+       @options[key] = @options[key].to_i
+     end
+
+     @queue = (options[:queue_class] ? self.class.resolve_class(options[:queue_class]) : RWGet::Queue).new(options)
+     @fetch = (options[:fetch_class] ? self.class.resolve_class(options[:fetch_class]) : RWGet::Fetch).new(options)
+     @store = (options[:store_class] ? self.class.resolve_class(options[:store_class]) : RWGet::Store).new(options)
+     @links = (options[:links_class] ? self.class.resolve_class(options[:links_class]) : RWGet::Links).new(options)
+     @dupes = (options[:dupes_class] ? self.class.resolve_class(options[:dupes_class]) : RWGet::Dupes).new(options)
+   end
+
+   def start
+     @start_time = Time.now.to_i.to_s
+     @start = Time.now
+     @original_hosts = Set.new
+     options[:seeds].each do |seed|
+       @queue.put(seed, 0)
+       @original_hosts << URI.parse(seed).host
+     end
+
+     downloaded = 0
+     while (options[:quota] == 0 || downloaded < options[:quota]) &&
+           (options[:time_limit] == 0 || Time.now - @start < options[:time_limit])
+
+       url, depth = @queue.get
+
+       unless url
+         puts "no more urls"
+         return
+       end
+
+       if options[:depth] > 0 && depth > options[:depth]
+         next
+       end
+
+       uri = URI.parse(url)
+
+       while options[:limit_rate] > 0 && downloaded / (Time.now - @start) > options[:limit_rate]
+         puts "sleeping until under rate limit"
+         sleep 1
+       end
+       puts "download rate: #{downloaded / (Time.now - @start)}bps"
+
+       puts "downloading #{uri}"
+       effective_url, tmpfile = @fetch.fetch(uri, options[:user_agent])
+
+       if tmpfile
+         downloaded += File.size(tmpfile.path)
+         puts "parsing links"
+         @links.urls(effective_url, tmpfile).each do |link|
+           legal = legal?(link)
+           dupe = @dupes.dupe?(link)
+           puts "dupe: #{link}" if dupe
+           if legal && !dupe
+             puts "adding link: #{link}"
+             @queue.put(link, depth + 1)
+           end
+         end
+         key = key_for(uri)
+         puts "storing at #{key}"
+         @store.put(key, tmpfile)
+         sleep options[:wait]
+         tmpfile.close rescue nil
+       else
+         puts "unable to download"
+       end
+     end
+     puts "hit time/quota"
+   end
+
+   def legal?(link)
+     unless options[:span_hosts] || @original_hosts.include?(link.host)
+       puts "can't span hosts: #{link}"
+       return false
+     end
+     link = link.to_s
+     legal = options[:accept_patterns].empty?
+     puts "accepted by default: #{link}" if legal
+     legal ||= options[:accept_patterns].any? {|p| link =~ p }
+     puts "not in accept patterns: #{link}" if !legal
+     rejected = options[:reject_patterns].any? {|p| link =~ p }
+     puts "in reject patterns: #{link}" if rejected
+     legal && !rejected
+   end
+
+   def key_for(uri)
+     arr = []
+     arr << options[:prefix] if options[:prefix]
+     arr << @start_time if options[:timestampize]
+     arr << uri.scheme if options[:protocol_directories]
+     arr << uri.host unless options[:no_host_directories]
+     paths = uri.path.split("/")
+     paths << paths.pop + "?" + uri.query if uri.query
+     paths.shift if paths.first.to_s.empty?
+     File.join(arr + paths)
+   end
+
+   def close
+     [@queue, @fetch, @store, @links, @dupes].each do |obj|
+       obj.close if obj.respond_to?(:close)
+     end
+   end
+ end
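
For reference, resolve_class above is what turns the --*-class option strings into constants; it just walks the ::-separated name. With the gem loaded:

    RWGet::Controller.resolve_class("RWGet::Queue")  #=> RWGet::Queue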
data/lib/rwget/dupes.rb ADDED
@@ -0,0 +1,18 @@
+ require "rubygems"
+ require "bloomfilter"
+
+ class RWGet::Dupes
+   SIZE = 1_000_000
+
+   def initialize(options = {})
+     @bloom = BloomFilter.new(SIZE, 4, 1)
+   end
+
+   def dupe?(uri)
+     key = uri.to_s
+     return true if @bloom.include?(key)
+     @bloom.insert(key)
+     return false
+   end
+ end
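
The default dupe detector is a bloom filter, so answers are probabilistic: a small fraction of genuinely new URLs can be misreported as dupes, but a URL already seen is always reported as a dupe. The contract, as the code above implements it:

    dupes = RWGet::Dupes.new
    dupes.dupe?("http://example.com/")  #=> false (recorded on first sight)
    dupes.dupe?("http://example.com/")  #=> true

If exact answers matter more than memory, a --dupes-class replacement is a few lines. A hypothetical sketch (ExactDupes is an invented name):

    require "set"

    # Exact (non-probabilistic) dupe detector; Set#add? returns nil
    # when the key was already present.
    class ExactDupes
      def initialize(options = {})
        @seen = Set.new
      end

      def dupe?(uri)
        !@seen.add?(uri.to_s)
      end
    end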
data/lib/rwget/fetch.rb ADDED
@@ -0,0 +1,44 @@
+ require "open-uri"
+ require "tempfile"
+ require "rubygems"
+ require "robots"
+ require "curl"
+
+ class RWGet::Fetch
+   DEFAULT_TIMEOUT = 30
+   DEFAULT_REDIRECTS = 30
+
+   def initialize(options = {})
+     @robots = {}
+     @curl = Curl::Easy.new
+     @curl.connect_timeout = options[:connect_timeout] || DEFAULT_TIMEOUT
+     @curl.timeout = options[:timeout] || DEFAULT_TIMEOUT
+     @curl.max_redirects = options[:max_redirect] || DEFAULT_REDIRECTS
+     @curl.follow_location = true
+     if options[:http_proxy]
+       @curl.proxy_url = options[:http_proxy]
+       if options[:proxy_user]
+         @curl.proxypwd = "#{options[:proxy_user]}:#{options[:proxy_password]}"
+       end
+     end
+     puts "timeout: #{@curl.timeout}"
+   end
+
+   def fetch(uri, user_agent)
+     @robots[user_agent] ||= Robots.new(user_agent)
+     unless @robots[user_agent].allowed?(uri)
+       puts "disallowed by robots.txt"
+       return nil
+     end
+
+     @curl.headers["User-Agent"] = user_agent
+     @curl.url = uri.to_s
+     @curl.perform
+     tmp = nil
+     Tempfile.open("curl") {|file| file.print(@curl.body_str); tmp = file }
+     tmp.open
+     [@curl.last_effective_url, tmp]
+   rescue Exception => e
+     STDERR.puts "#{uri} not retrieved: #{e.message}"
+     nil
+   end
+ end
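
The fetch contract, fetch(uri, user_agent) #=> [final_redirected_url, file_object] with nil on failure, is small enough that the curb dependency is easy to swap out. A hypothetical open-uri version (an untested sketch; note it skips the robots.txt check the default performs):

    require "open-uri"
    require "tempfile"

    class OpenURIFetch
      def initialize(options = {})
      end

      # Same contract as RWGet::Fetch#fetch.
      def fetch(uri, user_agent)
        tmp = Tempfile.new("rwget")
        final = uri.to_s
        open(uri.to_s, "User-Agent" => user_agent) do |io|
          final = io.base_uri.to_s  # effective URL after redirects
          tmp.write(io.read)
        end
        tmp.flush
        tmp.rewind
        [final, tmp]
      rescue => e
        STDERR.puts "#{uri} not retrieved: #{e.message}"
        nil
      end
    end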
data/lib/rwget/links.rb ADDED
@@ -0,0 +1,33 @@
+ require "rubygems"
+ require "uri"
+ require "hpricot"
+
+ class RWGet::Links
+   def initialize(options = {})
+   end
+
+   def urls(base, tmpfile)
+     @urls = []
+     base = base.to_s
+     string = File.read(tmpfile.path)
+     xml = string =~ /<\?xml/
+     doc = xml ? Hpricot.XML(string) : Hpricot(string)
+
+     (doc / "//item/link").each do |l|
+       add base, l.inner_text
+     end
+     (doc / "a").each do |a|
+       add base, a.attributes["href"]
+     end
+     @urls
+   rescue Exception => e
+     STDERR.puts "Couldn't parse #{base} for links: #{e.message}"
+     []
+   end
+
+   def add(base, href)
+     @urls << URI.join(base, href.strip) if href
+   rescue Exception => e
+     STDERR.puts "url error parsing URI.join(#{base.inspect}, #{href.inspect}): #{e.message}"
+   end
+ end
data/lib/rwget/queue.rb ADDED
@@ -0,0 +1,32 @@
+ require "tempfile"
+
+ class RWGet::Queue
+   def initialize(options = {})
+     @writer = Tempfile.new("rwget-queue")
+     @reader = File.open(@writer.path, "r")
+     @dirty = false
+   end
+
+   def put(key, depth)
+     @writer.puts "#{key}\t#{depth}"
+     @dirty = true
+   end
+
+   def get(retrying = false)
+     sleep 0.1 if retrying
+     if @dirty
+       @writer.flush
+       @dirty = false
+     end
+     line = @reader.gets
+     unless line
+       return retrying ? nil : get(:retry)
+     end
+     key, depth = line.chomp.split("\t")
+     [key, depth.to_i]
+   end
+
+   def close
+     @writer.close
+     @reader.close
+   end
+ end
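
The default queue is a tempfile-backed FIFO: put appends tab-separated lines through one handle, get tails the same file through a second, flushing the writer only when new lines are pending. The observable behavior:

    q = RWGet::Queue.new({})
    q.put("http://example.com/", 0)
    q.put("http://example.com/a", 1)
    q.get    #=> ["http://example.com/", 0]
    q.get    #=> ["http://example.com/a", 1]
    q.get    #=> nil (after one short retry sleep)
    q.close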
data/lib/rwget/option_parser.rb ADDED
@@ -0,0 +1,165 @@
+ require 'optparse'
+
+ class RWGetOptionParser < OptionParser
+   attr_accessor :options
+
+   def usage
+     "Usage: #{$0} [options] SEED_URL [SEED_URL2 ...]"
+   end
+
+   def parse!
+     super
+     options[:seeds] ||= []
+     options[:seeds] += ARGV
+   end
+
+   def initialize
+     self.options = {}
+     super do |opts|
+
+       yield opts if block_given?
+
+       opts.banner = usage
+
+       opts.on("-w", "--wait=SECONDS", "wait SECONDS between retrievals.") do |w|
+         options[:wait] = w.to_i
+       end
+
+       opts.on("-P", "--directory-prefix=PREFIX", "save files to PREFIX/...") do |p|
+         options[:prefix] = p
+       end
+
+       opts.on("-U", "--user-agent=AGENT", "identify as AGENT instead of RWget/VERSION.") do |u|
+         options[:user_agent] = u
+       end
+
+       opts.on("-A", "--accept-pattern=RUBY_REGEX", "URLs must match RUBY_REGEX to be saved to the queue.") do |r|
+         options[:accept_patterns] ||= []
+         options[:accept_patterns] << Regexp.new(r)
+       end
+
+       opts.on("--time-limit=AMOUNT", "Crawler will stop after this AMOUNT of time has passed.") do |t|
+         options[:time_limit] = t.to_i
+         options[:time_limit] *= 60 if t =~ /m/i
+         options[:time_limit] *= 60 * 60 if t =~ /h/i
+         options[:time_limit] *= 60 * 60 * 24 if t =~ /d/i
+         options[:time_limit] *= 60 * 60 * 24 * 7 if t =~ /w/i
+       end
+
+       opts.on("-R", "--reject-pattern=RUBY_REGEX", "URLs must NOT match RUBY_REGEX to be saved to the queue.") do |r|
+         options[:reject_patterns] ||= []
+         options[:reject_patterns] << Regexp.new(r)
+       end
+
+       opts.on("--limit-rate=RATE", "limit download rate to RATE.") do |r|
+         rate = r.to_i
+         rate *= 1000 if r =~ /k/i
+         rate *= 1000000 if r =~ /m/i
+         options[:limit_rate] = rate
+         puts "rate is #{rate}"
+       end
+
+       opts.on("--http-proxy=URL", "Proxies via URL") do |u|
+         options[:http_proxy] = u
+       end
+
+       opts.on("--proxy-user=USER", "Sets proxy user to USER") do |u|
+         options[:proxy_user] = u
+       end
+
+       opts.on("--proxy-password=PASSWORD", "Sets proxy password to PASSWORD") do |p|
+         options[:proxy_password] = p
+       end
+
+       opts.on("--require=RUBY_SCRIPT", "Will execute 'require RUBY_SCRIPT'") do |s|
+         require s
+       end
+
+       opts.on("--fetch-class=RUBY_CLASS", "Must implement fetch(uri, user_agent_string) #=> [final_redirected_url, file_object] (Load the class with --require)") do |c|
+         options[:fetch_class] = c
+       end
+
+       opts.on("--store-class=RUBY_CLASS", "Must implement put(key_string, temp_file) (Load the class with --require)") do |c|
+         options[:store_class] = c
+       end
+
+       opts.on("--dupes-class=RUBY_CLASS", "Must implement dupe?(uri) (Load the class with --require)") do |c|
+         options[:dupes_class] = c
+       end
+
+       opts.on("--queue-class=RUBY_CLASS", "Must implement put(key_string, depth_int) and get() #=> [key_string, depth_int] (Load the class with --require)") do |c|
+         options[:queue_class] = c
+       end
+
+       opts.on("--links-class=RUBY_CLASS", "Must implement urls(base_uri, temp_file) #=> [uri, ...] (Load the class with --require)") do |c|
+         options[:links_class] = c
+       end
+
+       opts.on("-S", "--sitemap=URL", "URL of a sitemap to crawl (will ignore inter-page links)") do |url|
+         options[:seeds] ||= []
+         options[:seeds] << url
+         options[:links_class] = "RWGet::SitemapLinks"
+       end
+
+       opts.on("-V", "--version") do
+         puts File.read(File.dirname(__FILE__) + "/../../VERSION")
+         exit
+       end
+
+       opts.on("-Q", "--quota=NUMBER", "set retrieval quota to NUMBER.") do |q|
+         options[:quota] = q.to_i
+         options[:quota] *= 1000 if q =~ /k/i
+         options[:quota] *= 1000000 if q =~ /m/i
+       end
+
+       opts.on("--max-redirect=NUM", "maximum redirections allowed per page.") do |m|
+         options[:max_redirect] = m.to_i
+       end
+
+       opts.on("-H", "--span-hosts", "go to foreign hosts when recursive") do |s|
+         options[:span_hosts] = s
+       end
+
+       opts.on("--connect-timeout=SECS", "set the connect timeout to SECS.") do |t|
+         options[:connect_timeout] = t.to_i
+       end
+
+       opts.on("-T", "--timeout=SECS", "set all timeout values to SECS.") do |t|
+         options[:timeout] = t.to_i
+       end
+
+       opts.on("-l", "--level=NUMBER", "maximum recursion depth (inf or 0 for infinite).") do |l|
+         options[:depth] = l.to_i
+       end
+
+       opts.on("--[no-]timestampize", "Prepend the timestamp of when the crawl started to the directory structure.") do |t|
+         options[:timestampize] = t
+       end
+
+       opts.on("--incremental-from=PREVIOUS", "Build upon the indexing already saved in PREVIOUS.") do |r|
+         options[:incremental_from] = r
+       end
+
+       opts.on("--protocol-directories", "use protocol name in directories.") do |p|
+         options[:protocol_directories] = p
+       end
+
+       opts.on("--no-host-directories", "don't create host directories.") do |h|
+         options[:no_host_directories] = h
+       end
+
+       opts.on("-v", "--[no-]verbose", "Run verbosely") do |v|
+         options[:verbose] = v
+       end
+
+       opts.on_tail("-h", "--help", "Show this message") do
+         puts opts
+         exit
+       end
+     end
+   end
+ end
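
The AMOUNT, RATE, and NUMBER arguments accept unit suffixes, as the handlers above implement: m/h/d/w multiply a time limit, k/m multiply a rate or quota. A quick check of the conversions:

    parser = RWGetOptionParser.new
    ARGV.replace(%w[--time-limit=90m --limit-rate=2k -Q 5k http://example.com/])
    parser.parse!
    parser.options[:time_limit]  #=> 5400
    parser.options[:limit_rate]  #=> 2000
    parser.options[:quota]       #=> 5000
    parser.options[:seeds]       #=> ["http://example.com/"]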