fizx-rwget 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,5 @@
+ README.rdoc
+ lib/**/*.rb
+ bin/*
+ features/**/*.feature
+ LICENSE
@@ -0,0 +1,5 @@
+ *.sw?
+ .DS_Store
+ coverage
+ rdoc
+ pkg
@@ -0,0 +1,57 @@
+ # RWGet
+
+ RWGet is a web crawler that emulates a subset of the GNU Wget interface, but with more flexibility for my needs.
+
+ ## Features
+
+ 1. Regular expression accept/reject lists
+ 2. Pluggable interfaces for robots-txt, url-fetcher, url-queue, url-dupe-detector, and page-storage. The defaults store locally and fetch using libcurl, but you could easily swap in database storage, a distributed queue, etc.
+
+ ## Help page
+
+     Usage: /usr/bin/rwget [options] SEED_URL [SEED_URL2 ...]
+         -w, --wait=SECONDS               wait SECONDS between retrievals.
+         -P, --directory-prefix=PREFIX    save files to PREFIX/...
+         -U, --user-agent=AGENT           identify as AGENT instead of RWget/VERSION.
+         -A, --accept-pattern=RUBY_REGEX  URLs must match RUBY_REGEX to be saved to the queue.
+             --time-limit=AMOUNT          Crawler will stop after this AMOUNT of time has passed.
+         -R, --reject-pattern=RUBY_REGEX  URLs must NOT match RUBY_REGEX to be saved to the queue.
+             --require=RUBY_SCRIPT        Will execute 'require RUBY_SCRIPT'
+             --limit-rate=RATE            limit download rate to RATE.
+             --http-proxy=URL             Proxies via URL
+             --proxy-user=USER            Sets proxy user to USER
+             --proxy-password=PASSWORD    Sets proxy password to PASSWORD
+             --fetch-class=RUBY_CLASS     Must implement fetch(uri, user_agent_string) #=> [final_redirected_url, file_object]
+             --store-class=RUBY_CLASS     Must implement put(key_string, temp_file)
+             --dupes-class=RUBY_CLASS     Must implement dupe?(uri)
+             --queue-class=RUBY_CLASS     Must implement put(key_string, depth_int) and get() #=> [key_string, depth_int]
+             --links-class=RUBY_CLASS     Must implement urls(base_uri, temp_file) #=> [uri, ...]
+         -S, --sitemap=URL                URL of a sitemap to crawl (will ignore inter-page links)
+
+         -Q, --quota=NUMBER               set retrieval quota to NUMBER.
+             --max-redirect=NUM           maximum redirections allowed per page.
+         -H, --span-hosts                 go to foreign hosts when recursive
+             --connect-timeout=SECS       set the connect timeout to SECS.
+         -T, --timeout=SECS               set all timeout values to SECS.
+         -l, --level=NUMBER               maximum recursion depth (inf or 0 for infinite).
+             --[no-]timestampize          Prepend the timestamp of when the crawl started to the directory structure.
+             --incremental-from=PREVIOUS  Build upon the indexing already saved in PREVIOUS.
+             --protocol-directories       use protocol name in directories.
+             --no-host-directories        don't create host directories.
+         -v, --[no-]verbose               Run verbosely
+         -h, --help                       Show this message
+
+ ## Ruby API
+
+     require "rubygems"
+     require "rwget"
+
+     # options is the same as the command-line long options, but converted into
+     # idiomatic ruby. See the RDoc for details.
+     # e.g.
+     # sh$ rwget -T 5 -A ".*foo.*" http://google.com
+     # becomes:
+     # irb$ RWGet::Controller.new({:seeds => ["http://google.com"],
+     #                             :timeout => 5, :accept_patterns => [/.*foo.*/]}).start
+
+     RWGet::Controller.new(options).start
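The `--store-class` hook above takes any class that implements `put(key_string, temp_file)`; like the other pluggable pieces, it is constructed with the full options hash. A minimal sketch of a replacement store (the name `FlatFileStore` and its layout are hypothetical, not part of the gem):

    require "fileutils"

    # Hypothetical page store: copies each fetched Tempfile into a
    # directory tree rooted at options[:prefix] (default "crawl").
    class FlatFileStore
      def initialize(options = {})
        @root = options[:prefix] || "crawl"
      end

      # key_string is the path built by RWGet::Controller#key_for;
      # temp_file holds the downloaded body.
      def put(key_string, temp_file)
        path = File.join(@root, key_string)
        FileUtils.mkdir_p(File.dirname(path))
        FileUtils.cp(temp_file.path, path)
      end
    end

Loaded with `--require` and selected with `--store-class=FlatFileStore`, this replaces the default local store; defining a `close` method is optional, since the controller only calls it when present.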
@@ -0,0 +1,59 @@
+ require 'rubygems'
+ require 'rake'
+
+ begin
+   require 'jeweler'
+   Jeweler::Tasks.new do |gem|
+     gem.name = "rwget"
+     gem.summary = %Q{Ruby port of wget, emphasis on recursive/crawler}
+     gem.email = "kyle@kylemaxwell.com"
+     gem.homepage = "http://github.com/fizx/rwget"
+     gem.authors = ["Kyle Maxwell"]
+     gem.add_dependency("curb", ["> 0.0.0"])
+     gem.add_dependency("hpricot", ["> 0.0.0", "< 0.7"])
+     gem.add_dependency("fizx-robots", [">= 0.3.1"])
+     gem.add_dependency("bloomfilter", ["> 0.0.0"])
+     gem.add_dependency("libxml-ruby", ["> 0.9"])
+   end
+ rescue LoadError
+   puts "Jeweler (or a dependency) not available. Install it with: sudo gem install jeweler"
+ end
+
+ require 'rake/testtask'
+ Rake::TestTask.new(:test) do |test|
+   test.libs << 'lib' << 'test'
+   test.pattern = 'test/**/*_test.rb'
+   test.verbose = true
+ end
+
+ begin
+   require 'rcov/rcovtask'
+   Rcov::RcovTask.new do |test|
+     test.libs << 'test'
+     test.pattern = 'test/**/*_test.rb'
+     test.verbose = true
+   end
+ rescue LoadError
+   task :rcov do
+     abort "RCov is not available. In order to run rcov, you must: sudo gem install spicycode-rcov"
+   end
+ end
+
+
+ task :default => :test
+
+ require 'rake/rdoctask'
+ Rake::RDocTask.new do |rdoc|
+   if File.exist?('VERSION.yml')
+     config = YAML.load(File.read('VERSION.yml'))
+     version = "#{config[:major]}.#{config[:minor]}.#{config[:patch]}"
+   else
+     version = ""
+   end
+
+   rdoc.rdoc_dir = 'rdoc'
+   rdoc.title = "rwget #{version}"
+   rdoc.rdoc_files.include('README*')
+   rdoc.rdoc_files.include('lib/**/*.rb')
+ end
+
data/VERSION ADDED
@@ -0,0 +1 @@
+ 0.5.0
@@ -0,0 +1,15 @@
+ #!/usr/bin/env ruby
+ require File.dirname(__FILE__) + "/../lib/rwget"
+
+ parser = RWGetOptionParser.new
+ parser.parse!
+
+ if parser.options[:seeds].empty?
+   puts parser.usage
+   puts " -h for options listing"
+   exit(1)
+ end
+
+ controller = RWGet::Controller.new(parser.options)
+ controller.start
+ controller.close
@@ -0,0 +1,5 @@
+ module RWGet
+ end
+ Dir[File.dirname(__FILE__) + "/rwget/*.rb"].each do |f|
+   require f.gsub(/\.rb$/, '')
+ end
@@ -0,0 +1,119 @@
+ require "set"
+ class RWGet::Controller
+   attr_reader :options
+
+   def self.resolve_class(string)
+     string.split("::").inject(Kernel) do |const, string|
+       const.const_get(string)
+     end
+   end
+
+   def initialize(options)
+     @options = options
+     @options[:user_agent] ||= "Ruby/Wget"
+
+     @options[:accept_patterns] ||= []
+     @options[:reject_patterns] ||= []
+
+     %w[quota depth wait limit_rate time_limit].each do |key|
+       key = key.to_sym
+       @options[key] = @options[key].to_i
+     end
+
+     @queue = (options[:queue_class] ? self.class.resolve_class(options[:queue_class]) : RWGet::Queue).new(options)
+     @fetch = (options[:fetch_class] ? self.class.resolve_class(options[:fetch_class]) : RWGet::Fetch).new(options)
+     @store = (options[:store_class] ? self.class.resolve_class(options[:store_class]) : RWGet::Store).new(options)
+     @links = (options[:links_class] ? self.class.resolve_class(options[:links_class]) : RWGet::Links).new(options)
+     @dupes = (options[:dupes_class] ? self.class.resolve_class(options[:dupes_class]) : RWGet::Dupes).new(options)
+   end
+
+   def start
+     @start_time = Time.now.to_i.to_s
+     @start = Time.now
+     @original_hosts = Set.new
+     options[:seeds].each do |seed|
+       @queue.put(seed, 0)
+       @original_hosts << URI.parse(seed).host
+     end
+
+     downloaded = 0
+     while (options[:quota] == 0 || downloaded < options[:quota]) &&
+           (options[:time_limit] == 0 || Time.now - @start < options[:time_limit])
+
+       url, depth = @queue.get
+
+       unless url
+         puts "no more urls"
+         return
+       end
+
+       if options[:depth] > 0 && depth > options[:depth]
+         next
+       end
+
+       uri = URI.parse(url)
+
+       while options[:limit_rate] > 0 && downloaded / (Time.now - @start) > options[:limit_rate]
+         puts "sleeping until under rate limit"
+         sleep 1
+       end
+       puts "download rate: #{downloaded / (Time.now - @start)}bps"
+
+       puts "downloading #{uri}"
+       effective_url, tmpfile = @fetch.fetch(uri, options[:user_agent])
+
+       if tmpfile
+         downloaded += File.size(tmpfile.path)
+         puts "parsing links"
+         @links.urls(effective_url, tmpfile).each do |link|
+           legal = legal?(link)
+           dupe = @dupes.dupe?(link)
+           puts "dupe: #{link}" if dupe
+           if legal && !dupe
+             puts "adding link: #{link}"
+             @queue.put(link, depth + 1)
+           end
+         end
+         key = key_for(uri)
+         puts "storing at #{key}"
+         @store.put(key, tmpfile)
+         sleep options[:wait]
+       else
+         puts "unable to download"
+       end
+     end
+     puts "hit time/quota"
+   end
+
+   def legal?(link)
+     unless options[:span_hosts] || @original_hosts.include?(link.host)
+       puts "can't span hosts: #{link}"
+       return false
+     end
+     link = link.to_s
+     legal = options[:accept_patterns].empty?
+     puts "accepted by default: #{link}" if legal
+     legal ||= options[:accept_patterns].any?{|p| link =~ p}
+     puts "not in accept patterns: #{link}" if !legal
+     rejected = options[:reject_patterns].any?{|p| link =~ p}
+     puts "in reject patterns: #{link}" if rejected
+     legal && !rejected
+   end
+
+   def key_for(uri)
+     arr = []
+     arr << options[:prefix] if options[:prefix]
+     arr << @start_time if options[:timestampize]
+     arr << uri.scheme if options[:protocol_directories]
+     arr << uri.host unless options[:no_host_directories]
+     paths = uri.path.split("/")
+     paths.shift if paths.first.to_s.empty?
+     File.join(arr + paths)
+   end
+
+   def close
+     [@queue, @fetch, @store, @links, @dupes].each do |obj|
+       obj.close if obj.respond_to?(:close)
+     end
+   end
+ end
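`resolve_class` walks a `::`-separated constant name one segment at a time, which is how the `--*-class` flags can name namespaced classes; each resolved class is then instantiated with the full options hash. For example, assuming the gem is loaded:

    RWGet::Controller.resolve_class("RWGet::Dupes")  #=> RWGet::Dupes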
@@ -0,0 +1,19 @@
+ require "rubygems"
+ require "tempfile"
+ require "bloomfilter"
+
+ class RWGet::Dupes
+   SIZE = 1_000_000
+
+   def initialize(options = {})
+     @tmp = Tempfile.new("bloom")
+     @bloom = ExternalBloomFilter.create(@tmp.path, SIZE)
+   end
+
+   def dupe?(uri)
+     key = uri.to_s
+     return true if @bloom.include?(key)
+     @bloom.add(key)
+     return false
+   end
+ end
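`dupe?` both tests and records: the first call for a URL returns false and adds it to the filter, and every later call returns true. Because the backing store is a Bloom filter, answers can be false positives, so a small fraction of genuinely new URLs may be reported as dupes (and skipped) as the filter fills, though it never produces false negatives. A round trip, assuming the gem is loaded:

    dupes = RWGet::Dupes.new
    dupes.dupe?("http://example.com/")  #=> false (now recorded)
    dupes.dupe?("http://example.com/")  #=> true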
@@ -0,0 +1,44 @@
+ require "open-uri"
+ require "tempfile"
+ require "rubygems"
+ require "robots"
+ require "curl"
+ class RWGet::Fetch
+   DEFAULT_TIMEOUT = 30
+   DEFAULT_REDIRECTS = 30
+
+   def initialize(options = {})
+     @robots = {}
+     @curl = Curl::Easy.new
+     @curl.connect_timeout = options[:connect_timeout] || DEFAULT_TIMEOUT
+     @curl.timeout = options[:timeout] || DEFAULT_TIMEOUT
+     @curl.max_redirects = options[:max_redirect] || DEFAULT_REDIRECTS
+     @curl.follow_location = true
+     if options[:http_proxy]
+       @curl.proxy_url = options[:http_proxy]
+       if options[:proxy_user]
+         @curl.proxypwd = "#{options[:proxy_user]}:#{options[:proxy_password]}"
+       end
+     end
+     puts "timeout: #{@curl.timeout}"
+   end
+
+   def fetch(uri, user_agent)
+     @robots[user_agent] ||= Robots.new(user_agent)
+     unless @robots[user_agent].allowed?(uri)
+       puts "disallowed by robots.txt"
+       return nil
+     end
+
+     @curl.headers["User-Agent"] = user_agent
+     @curl.url = uri.to_s
+     @curl.perform
+     tmp = nil
+     Tempfile.open("curl") {|file| file.print(@curl.body_str); tmp = file }
+     tmp.open
+     [@curl.last_effective_url, tmp]
+   rescue Exception => e
+     STDERR.puts "#{uri} not retrieved: #{e.message}"
+     nil
+   end
+ end
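For comparison, here is a minimal sketch of an alternative fetcher honoring the documented `fetch(uri, user_agent_string) #=> [final_redirected_url, file_object]` contract, built on open-uri instead of libcurl. The class name `OpenURIFetch` is hypothetical, it assumes a Ruby where open-uri provides `URI#open` and `base_uri`, and unlike the default it does not consult robots.txt:

    require "open-uri"
    require "tempfile"

    class OpenURIFetch
      def initialize(options = {})
      end

      def fetch(uri, user_agent)
        io = URI.parse(uri.to_s).open("User-Agent" => user_agent)
        tmp = Tempfile.new("openuri")
        tmp.write(io.read)
        tmp.rewind
        # base_uri reflects any redirects open-uri followed
        [io.base_uri.to_s, tmp]
      rescue StandardError => e
        warn "#{uri} not retrieved: #{e.message}"
        nil
      end
    end

Selected via `--fetch-class=OpenURIFetch` after loading it with `--require`.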
@@ -0,0 +1,33 @@
+ require "rubygems"
+ require "hpricot"
+ class RWGet::Links
+   def initialize(options = {})
+   end
+
+   def urls(base, tmpfile)
+     @urls = []
+     base = base.to_s
+     string = File.read(tmpfile.path)
+     xml = string =~ /<\?xml/
+     doc = xml ? Hpricot.XML(string) : Hpricot(string)
+
+     (doc / "//item/link").each do |l|
+       add base, l.inner_text
+     end
+     (doc / "a").each do |a|
+       add base, a.attributes["href"]
+     end
+     @urls
+   rescue Exception => e
+     STDERR.puts "Couldn't parse #{base} for links: #{e.message}"
+     []
+   end
+
+   def add(base, href)
+     begin
+       @urls << URI.join(base, href.strip) if href
+     rescue Exception => e
+       STDERR.puts "url error parsing URI.join(#{base.inspect}, #{href.inspect}): #{e.message}"
+     end
+   end
+ end
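`urls` extracts both RSS `<item><link>` elements and HTML anchors, resolving each href against the page's effective URL with `URI.join`, so relative links come back absolute. For example, assuming the gem is loaded:

    require "tempfile"

    tmp = Tempfile.new("page")
    tmp.write('<a href="/about">About</a> <a href="news.html">News</a>')
    tmp.flush

    RWGet::Links.new.urls("http://example.com/blog/", tmp).map { |u| u.to_s }
    #=> ["http://example.com/about", "http://example.com/blog/news.html"]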
@@ -0,0 +1,32 @@
+ require "tempfile"
+ class RWGet::Queue
+   def initialize(options = {})
+     @writer = Tempfile.new("rwget-queue")
+     @reader = File.open(@writer.path, "r")
+     @dirty = false
+   end
+
+   def put(key, depth)
+     @writer.puts "#{key}\t#{depth}"
+     @dirty = true
+   end
+
+   def get(retrying = false)
+     sleep 0.1 if retrying
+     if @dirty
+       @writer.flush
+       @dirty = false
+     end
+     line = @reader.gets
+     unless line
+       return retrying ? nil : get(:retry)
+     end
+     key, depth = line.split("\t")
+     return [key, depth.to_i]
+   end
+
+   def close
+     @writer.close
+     @reader.close
+   end
+ end
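The default queue is a Tempfile used as an append-only log: `put` writes tab-separated key/depth lines through one handle, and `get` tails the same file through a second read handle, flushing pending writes and retrying once after 0.1 seconds before reporting the queue empty. A round trip, assuming the gem is loaded:

    q = RWGet::Queue.new
    q.put("http://example.com/", 0)
    q.get   #=> ["http://example.com/", 0]
    q.get   #=> nil (after one 0.1s retry with nothing new written)
    q.close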
@@ -0,0 +1,154 @@
+ require 'optparse'
+
+ class RWGetOptionParser < OptionParser
+   attr_accessor :options
+
+   def usage
+     "Usage: #{$0} [options] SEED_URL [SEED_URL2 ...]"
+   end
+
+   def parse!
+     super
+     (options[:seeds] ||= []).concat(ARGV)
+   end
+
+   def initialize
+     self.options = {}
+     super do |opts|
+
+       yield opts if block_given?
+
+       opts.banner = usage
+
+       opts.on("-w", "--wait=SECONDS", "wait SECONDS between retrievals.") do |w|
+         options[:wait] = w.to_i
+       end
+
+       opts.on("-P", "--directory-prefix=PREFIX", "save files to PREFIX/...") do |p|
+         options[:prefix] = p
+       end
+
+       opts.on("-U", "--user-agent=AGENT", "identify as AGENT instead of RWget/VERSION.") do |u|
+         options[:user_agent] = u
+       end
+
+       opts.on("-Ap", "--accept-pattern=RUBY_REGEX", "URLs must match RUBY_REGEX to be saved to the queue.") do |r|
+         options[:accept_patterns] ||= []
+         options[:accept_patterns] << Regexp.new(r)
+       end
+
+       opts.on("--time-limit=AMOUNT", "Crawler will stop after this AMOUNT of time has passed.") do |t|
+         options[:time_limit] = t.to_i
+         options[:time_limit] *= 60 if t =~ /m/i
+         options[:time_limit] *= 60 * 60 if t =~ /h/i
+         options[:time_limit] *= 60 * 60 * 24 if t =~ /d/i
+         options[:time_limit] *= 60 * 60 * 24 * 7 if t =~ /w/i
+       end
+
+       opts.on("-Rp", "--reject-pattern=RUBY_REGEX", "URLs must NOT match RUBY_REGEX to be saved to the queue.") do |r|
+         options[:reject_patterns] ||= []
+         options[:reject_patterns] << Regexp.new(r)
+       end
+
+       opts.on("--require=RUBY_SCRIPT", "Will execute 'require RUBY_SCRIPT'") do |s|
+         require s
+       end
+
+       opts.on("--limit-rate=RATE", "limit download rate to RATE.") do |r|
+         rate = r.to_i
+         rate *= 1000 if r =~ /k/i
+         rate *= 1000000 if r =~ /m/i
+         options[:limit_rate] = rate
+         puts "rate is #{rate}"
+       end
+
+       opts.on("--http-proxy=URL", "Proxies via URL") do |u|
+         options[:http_proxy] = u
+       end
+
+       opts.on("--proxy-user=USER", "Sets proxy user to USER") do |u|
+         options[:proxy_user] = u
+       end
+
+       opts.on("--proxy-password=PASSWORD", "Sets proxy password to PASSWORD") do |p|
+         options[:proxy_password] = p
+       end
+
+       opts.on("--fetch-class=RUBY_CLASS", "Must implement fetch(uri, user_agent_string) #=> [final_redirected_url, file_object]") do |c|
+         options[:fetch_class] = c
+       end
+
+       opts.on("--store-class=RUBY_CLASS", "Must implement put(key_string, temp_file)") do |c|
+         options[:store_class] = c
+       end
+
+       opts.on("--dupes-class=RUBY_CLASS", "Must implement dupe?(uri)") do |c|
+         options[:dupes_class] = c
+       end
+
+       opts.on("--queue-class=RUBY_CLASS", "Must implement put(key_string, depth_int) and get() #=> [key_string, depth_int]") do |c|
+         options[:queue_class] = c
+       end
+
+       opts.on("--links-class=RUBY_CLASS", "Must implement urls(base_uri, temp_file) #=> [uri, ...]") do |c|
+         options[:links_class] = c
+       end
+
+       opts.on("-S", "--sitemap=URL", "URL of a sitemap to crawl (will ignore inter-page links)") do |url|
+         (options[:seeds] ||= []) << url
+         options[:links_class] = "RWGet::SitemapLinks"
+       end
+
+       opts.on("-Q", "--quota=NUMBER", "set retrieval quota to NUMBER.") do |q|
+         options[:quota] = q.to_i
+         options[:quota] *= 1000 if q =~ /k/i
+         options[:quota] *= 1000000 if q =~ /m/i
+       end
+
+       opts.on("--max-redirect=NUM", "maximum redirections allowed per page.") do |m|
+         options[:max_redirect] = m.to_i
+       end
+
+       opts.on("-H", "--span-hosts", "go to foreign hosts when recursive") do |s|
+         options[:span_hosts] = s
+       end
+
+       opts.on("--connect-timeout=SECS", "set the connect timeout to SECS.") do |t|
+         options[:connect_timeout] = t.to_i
+       end
+
+       opts.on("-T", "--timeout=SECS", "set all timeout values to SECS.") do |t|
+         options[:timeout] = t.to_i
+       end
+
+       opts.on("-l", "--level=NUMBER", "maximum recursion depth (inf or 0 for infinite).") do |l|
+         options[:depth] = l.to_i
+       end
+
+       opts.on("--[no-]timestampize", "Prepend the timestamp of when the crawl started to the directory structure.") do |t|
+         options[:timestampize] = t
+       end
+
+       opts.on("--incremental-from=PREVIOUS", "Build upon the indexing already saved in PREVIOUS.") do |r|
+         options[:incremental_from] = r
+       end
+
+       opts.on("--protocol-directories", "use protocol name in directories.") do |p|
+         options[:protocol_directories] = p
+       end
+
+       opts.on("--no-host-directories", "don't create host directories.") do |h|
+         options[:no_host_directories] = h
+       end
+
+       opts.on("-v", "--[no-]verbose", "Run verbosely") do |v|
+         options[:verbose] = v
+       end
+
+       opts.on_tail("-h", "--help", "Show this message") do
+         puts opts
+         exit
+       end
+     end
+   end
+ end
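`--time-limit`, `--limit-rate`, and `--quota` accept unit suffixes by regex-matching the raw argument and multiplying the leading integer, so the arithmetic works out as below (a sketch using the parser directly; note that `parse!` consumes recognized flags from ARGV and treats the remainder as seeds):

    parser = RWGetOptionParser.new
    ARGV.replace(%w[--time-limit=2h --limit-rate=500k http://example.com/])
    parser.parse!
    parser.options[:time_limit]  #=> 7200 (2 * 60 * 60)
    parser.options[:limit_rate]  #=> 500000 (500 * 1000)
    parser.options[:seeds]       #=> ["http://example.com/"]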