fizx-rwget 0.5.0

@@ -0,0 +1,5 @@
+ README.rdoc
+ lib/**/*.rb
+ bin/*
+ features/**/*.feature
+ LICENSE
@@ -0,0 +1,5 @@
+ *.sw?
+ .DS_Store
+ coverage
+ rdoc
+ pkg
@@ -0,0 +1,57 @@
+ # RWGet
+
+ RWget is a web crawler that emulates a subset of the GNU Wget interface, but with more flexibility where I need it.
+
+ ## Features
+
+ 1. Regular expression accept/reject lists
+ 2. Pluggable interfaces for robots.txt handling, URL fetching, URL queueing, duplicate detection, and page storage. The defaults fetch with libcurl and store to the local filesystem, but you could easily swap in database storage, a distributed queue, etc.
+
+ ## Help page
+
+     Usage: /usr/bin/rwget [options] SEED_URL [SEED_URL2 ...]
+         -w, --wait=SECONDS wait SECONDS between retrievals.
+         -P, --directory-prefix=PREFIX save files to PREFIX/...
+         -U, --user-agent=AGENT identify as AGENT instead of RWget/VERSION.
+         -A, --accept-pattern=RUBY_REGEX URLs must match RUBY_REGEX to be saved to the queue.
+             --time-limit=AMOUNT Crawler will stop after this AMOUNT of time has passed.
+         -R, --reject-pattern=RUBY_REGEX URLs must NOT match RUBY_REGEX to be saved to the queue.
+             --require=RUBY_SCRIPT Will execute 'require RUBY_SCRIPT'
+             --limit-rate=RATE limit download rate to RATE.
+             --http-proxy=URL Proxies via URL
+             --proxy-user=USER Sets proxy user to USER
+             --proxy-password=PASSWORD Sets proxy password to PASSWORD
+             --fetch-class=RUBY_CLASS Must implement fetch(uri, user_agent_string) #=> [final_redirected_url, file_object]
+             --store-class=RUBY_CLASS Must implement put(key_string, temp_file)
+             --dupes-class=RUBY_CLASS Must implement dupe?(uri)
+             --queue-class=RUBY_CLASS Must implement put(key_string, depth_int) and get() #=> [key_string, depth_int]
+             --links-class=RUBY_CLASS Must implement urls(base_uri, temp_file) #=> [uri, ...]
+         -S, --sitemap=URL URL of a sitemap to crawl (will ignore inter-page links)
+
+         -Q, --quota=NUMBER set retrieval quota to NUMBER.
+             --max-redirect=NUM maximum redirections allowed per page.
+         -H, --span-hosts go to foreign hosts when recursive
+             --connect-timeout=SECS set the connect timeout to SECS.
+         -T, --timeout=SECS set all timeout values to SECONDS.
+         -l, --level=NUMBER maximum recursion depth (inf or 0 for infinite).
+             --[no-]timestampize Prepend the timestamp of when the crawl started to the directory structure.
+             --incremental-from=PREVIOUS Build upon the indexing already saved in PREVIOUS.
+             --protocol-directories use protocol name in directories.
+             --no-host-directories don't create host directories.
+         -v, --[no-]verbose Run verbosely
+         -h, --help Show this message
+
+ ## Ruby API
+
+     require "rubygems"
+     require "rwget"
+
+     # options is the same as the command-line long options, but converted into
+     # idiomatic ruby. See the RDoc for details.
+     # e.g.
+     # sh$ rwget -T 5 -A ".*foo.*" http://google.com
+     # becomes:
+     # irb$ RWGet::Controller.new({:seeds => ["http://google.com"],
+     #        :timeout => 5, :accept_patterns => [/.*foo.*/]}).start
+
+     RWGet::Controller.new(options).start
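The pluggable classes mentioned in the feature list only need to honor the small contracts listed in the help text above. As a rough sketch (not part of the gem; the MyFlatStore name and flat-file layout are made up here), a custom --store-class could look like this:

    # my_flat_store.rb -- hypothetical drop-in for --store-class
    require "fileutils"

    class MyFlatStore
      DIR = "flat"   # arbitrary output directory for this sketch

      # RWget instantiates the store class with the parsed options hash.
      def initialize(options = {})
      end

      # put(key_string, temp_file) is called once per stored page.
      def put(key, temp_file)
        path = File.join(DIR, key.gsub("/", "__"))   # flatten the key into one filename
        FileUtils.mkdir_p(DIR)
        FileUtils.cp(temp_file.path, path)
      end
    end

Assuming my_flat_store.rb is on the load path, it would be wired in with something like `rwget --require my_flat_store --store-class MyFlatStore http://example.com/`; the controller resolves the class name and calls new(options) on it.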
@@ -0,0 +1,59 @@
+ require 'rubygems'
+ require 'rake'
+
+ begin
+   require 'jeweler'
+   Jeweler::Tasks.new do |gem|
+     gem.name = "rwget"
+     gem.summary = %Q{Ruby port of wget, emphasis on recursive/crawler}
+     gem.email = "kyle@kylemaxwell.com"
+     gem.homepage = "http://github.com/fizx/rwget"
+     gem.authors = ["Kyle Maxwell"]
+     gem.add_dependency("curb", ["> 0.0.0"])
+     gem.add_dependency("hpricot", ["> 0.0.0", "< 0.7"])
+     gem.add_dependency("fizx-robots", [">= 0.3.1"])
+     gem.add_dependency("bloomfilter", ["> 0.0.0"])
+     gem.add_dependency("libxml-ruby", ["> 0.9"])
+   end
+ rescue LoadError
+   puts "Jeweler (or a dependency) not available. Install it with: sudo gem install jeweler"
+ end
+
+ require 'rake/testtask'
+ Rake::TestTask.new(:test) do |test|
+   test.libs << 'lib' << 'test'
+   test.pattern = 'test/**/*_test.rb'
+   test.verbose = true
+ end
+
+ begin
+   require 'rcov/rcovtask'
+   Rcov::RcovTask.new do |test|
+     test.libs << 'test'
+     test.pattern = 'test/**/*_test.rb'
+     test.verbose = true
+   end
+ rescue LoadError
+   task :rcov do
+     abort "RCov is not available. In order to run rcov, you must: sudo gem install spicycode-rcov"
+   end
+ end
+
+
+ task :default => :test
+
+ require 'rake/rdoctask'
+ Rake::RDocTask.new do |rdoc|
+   if File.exist?('VERSION.yml')
+     config = YAML.load(File.read('VERSION.yml'))
+     version = "#{config[:major]}.#{config[:minor]}.#{config[:patch]}"
+   else
+     version = ""
+   end
+
+   rdoc.rdoc_dir = 'rdoc'
+   rdoc.title = "rwget #{version}"
+   rdoc.rdoc_files.include('README*')
+   rdoc.rdoc_files.include('lib/**/*.rb')
+ end
+
data/VERSION ADDED
@@ -0,0 +1 @@
+ 0.5.0
@@ -0,0 +1,15 @@
+ #!/usr/bin/env ruby
+ require File.dirname(__FILE__) + "/../lib/rwget"
+
+ parser = RWGetOptionParser.new
+ parser.parse!
+
+ if parser.options[:seeds].empty?
+   puts parser.usage
+   puts " -h for options listing"
+   exit(1)
+ end
+
+ controller = RWGet::Controller.new(parser.options)
+ controller.start
+ controller.close
@@ -0,0 +1,5 @@
+ module RWGet
+ end
+ Dir[File.dirname(__FILE__) + "/rwget/*.rb"].each do |f|
+   require f.gsub(/\.rb$/, '')
+ end
@@ -0,0 +1,119 @@
+ require "set"
+ class RWGet::Controller
+   attr_reader :options
+
+   def self.resolve_class(string)
+     string.split("::").inject(Kernel) do |const, string|
+       const.const_get(string)
+     end
+   end
+
+   def initialize(options)
+     @options = options
+     @options[:user_agent] ||= "Ruby/Wget"
+
+     @options[:accept_patterns] ||= []
+     @options[:reject_patterns] ||= []
+
+     %w[quota depth wait limit_rate time_limit].each do |key|
+       key = key.to_sym
+       @options[key] = @options[key].to_i
+     end
+
+     @queue = (options[:queue_class] ? self.class.resolve_class(options[:queue_class]) : RWGet::Queue).new(options)
+     @fetch = (options[:fetch_class] ? self.class.resolve_class(options[:fetch_class]) : RWGet::Fetch).new(options)
+     @store = (options[:store_class] ? self.class.resolve_class(options[:store_class]) : RWGet::Store).new(options)
+     @links = (options[:links_class] ? self.class.resolve_class(options[:links_class]) : RWGet::Links).new(options)
+     @dupes = (options[:dupes_class] ? self.class.resolve_class(options[:dupes_class]) : RWGet::Dupes).new(options)
+   end
+
+   def start
+     @start_time = Time.now.to_i.to_s
+     @start = Time.now
+     @original_hosts = Set.new
+     options[:seeds].each do |seed|
+       @queue.put(seed, 0)
+       @original_hosts << URI.parse(seed).host
+     end
+
+     downloaded = 0
+     while (options[:quota] == 0 || downloaded < options[:quota]) &&
+           (options[:time_limit] == 0 || Time.now - @start < options[:time_limit])
+
+       url, depth = @queue.get
+
+       unless url
+         puts "no more urls"
+         return
+       end
+
+       if options[:depth] > 0 && depth > options[:depth]
+         next
+       end
+
+       uri = URI.parse(url)
+
+       while options[:limit_rate] > 0 && downloaded / (Time.now - @start) > options[:limit_rate]
+         puts "sleeping until under rate limit"
+         sleep 1
+       end
+       puts "download rate: #{downloaded / (Time.now - @start)}bps"
+
+       puts "downloading #{uri}"
+       effective_url, tmpfile = @fetch.fetch(uri, options[:user_agent])
+
+       if tmpfile
+         downloaded += File.size(tmpfile.path)
+         puts "parsing links"
+         @links.urls(effective_url, tmpfile).each do |link|
+           legal = legal?(link)
+           dupe = @dupes.dupe?(link)
+           puts "dupe: #{link}" if dupe
+           if legal && !dupe
+             puts "adding link: #{link}"
+             @queue.put(link, depth + 1)
+           end
+         end
+         key = key_for(uri)
+         puts "storing at #{key}"
+         @store.put(key, tmpfile)
+         sleep options[:wait]
+       else
+         puts "unable to download"
+       end
+     end
+     puts "hit time/quota"
+   end
+
+   def legal?(link)
+     unless options[:span_hosts] || @original_hosts.include?(link.host)
+       puts "can't span hosts: #{link}"
+       return false
+     end
+     link = link.to_s
+     legal = options[:accept_patterns].empty?
+     puts "accepted by default: #{link}" if legal
+     legal ||= options[:accept_patterns].any?{|p| link =~ p}
+     puts "not in accept patterns: #{link}" if !legal
+     rejected = options[:reject_patterns].any?{|p| link =~ p}
+     puts "in reject patterns: #{link}" if rejected
+     legal && !rejected
+   end
+
+   def key_for(uri)
+     arr = []
+     arr << options[:prefix] if options[:prefix]
+     arr << @start_time if options[:timestampize]
+     arr << uri.scheme if options[:protocol_directories]
+     arr << uri.host unless options[:no_host_directories]
+     paths = uri.path.split("/")
+     paths.shift if paths.first.to_s.empty?
+     File.join(arr + paths)
+   end
+
+   def close
+     [@queue, @fetch, @store, @links, @dupes].each do |obj|
+       obj.close if obj.respond_to?(:close)
+     end
+   end
+ end
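A quick trace of how key_for above maps a URL onto a storage key under the default options (host directories on, no --protocol-directories, no -P prefix, no --[no-]timestampize):

    require "uri"

    # Mirrors Controller#key_for with the default options.
    uri   = URI.parse("http://example.com/docs/index.html")
    arr   = [uri.host]                  # ["example.com"]
    paths = uri.path.split("/")         # ["", "docs", "index.html"]
    paths.shift if paths.first.to_s.empty?
    puts File.join(arr + paths)         # => example.com/docs/index.html

With -P crawl and --protocol-directories the same URL would be stored under crawl/http/example.com/docs/index.html, and --no-host-directories drops the example.com segment.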
@@ -0,0 +1,19 @@
+ require "rubygems"
+ require "tempfile"
+ require "bloomfilter"
+
+ class RWGet::Dupes
+   SIZE = 1_000_000
+
+   def initialize(options = {})
+     @tmp = Tempfile.new("bloom")
+     @bloom = ExternalBloomFilter.create(@tmp.path, SIZE)
+   end
+
+   def dupe?(uri)
+     key = uri.to_s
+     return true if @bloom.include?(key)
+     @bloom.add(key)
+     return false
+   end
+ end
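The default dupe detector is a fixed-size Bloom filter (SIZE slots backed by a temp file), so it can occasionally report a never-seen URL as a duplicate; that is the trade-off for constant memory. For small crawls an exact, Set-backed detector is an easy --dupes-class replacement; a minimal sketch (the MySetDupes name is made up):

    require "set"

    # Hypothetical exact duplicate detector: no false positives,
    # but memory grows with the number of URLs seen.
    class MySetDupes
      def initialize(options = {})
        @seen = Set.new
      end

      # dupe?(uri) must return true if the URL was already seen.
      def dupe?(uri)
        !@seen.add?(uri.to_s)   # Set#add? returns nil when the key was already present
      end
    end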
@@ -0,0 +1,44 @@
+ require "open-uri"
+ require "tempfile"
+ require "rubygems"
+ require "robots"
+ require "curl"
+ class RWGet::Fetch
+   DEFAULT_TIMEOUT = 30
+   DEFAULT_REDIRECTS = 30
+
+   def initialize(options = {})
+     @robots = {}
+     @curl = Curl::Easy.new
+     @curl.connect_timeout = options[:connect_timeout] || DEFAULT_TIMEOUT
+     @curl.timeout = options[:timeout] || DEFAULT_TIMEOUT
+     @curl.max_redirects = options[:max_redirect] || DEFAULT_REDIRECTS
+     @curl.follow_location = true
+     if options[:http_proxy]
+       @curl.proxy_url = options[:http_proxy]
+       if options[:proxy_user]
+         @curl.proxypwd = "#{options[:proxy_user]}:#{options[:proxy_password]}"
+       end
+     end
+     puts "timeout: #{@curl.timeout}"
+   end
+
+   def fetch(uri, user_agent)
+     @robots[user_agent] ||= Robots.new(user_agent)
+     unless @robots[user_agent].allowed?(uri)
+       puts "disallowed by robots.txt"
+       return nil
+     end
+
+     @curl.headers["User-Agent"] = user_agent
+     @curl.url = uri.to_s
+     @curl.perform
+     tmp = nil
+     Tempfile.open("curl") {|file| file.print(@curl.body_str); tmp = file }
+     tmp.open
+     [@curl.last_effective_url, tmp]
+   rescue Exception => e
+     STDERR.puts "#{uri} not retrieved: #{e.message}"
+     nil
+   end
+ end
@@ -0,0 +1,33 @@
+ require "rubygems"
+ require "hpricot"
+ class RWGet::Links
+   def initialize(options = {})
+   end
+
+   def urls(base, tmpfile)
+     @urls = []
+     base = base.to_s
+     string = File.read(tmpfile.path)
+     xml = string =~ /<\?xml/
+     doc = xml ? Hpricot.XML(string) : Hpricot(string)
+
+     (doc / "//item/link").each do |l|
+       add base, l.inner_text
+     end
+     (doc / "a").each do |a|
+       add base, a.attributes["href"]
+     end
+     @urls
+   rescue Exception => e
+     STDERR.puts "Couldn't parse #{base} for links: #{e.message}"
+     []
+   end
+
+   def add(base, href)
+     begin
+       @urls << URI.join(base, href.strip) if href
+     rescue Exception => e
+       STDERR.puts "url error parsing URI.join(#{base.inspect}, #{href.inspect}): #{e.message}"
+     end
+   end
+ end
@@ -0,0 +1,32 @@
+ require "tempfile"
+ class RWGet::Queue
+   def initialize(options = {})
+     @writer = Tempfile.new("rwget-queue")
+     @reader = File.open(@writer.path, "r")
+     @dirty = false
+   end
+
+   def put(key, depth)
+     @writer.puts "#{key}\t#{depth}"
+     @dirty = true
+   end
+
+   def get(retrying = false)
+     sleep 0.1 if retrying
+     if @dirty
+       @writer.flush
+       @dirty = false
+     end
+     line = @reader.gets
+     unless line
+       return retrying ? nil : get(:retry)
+     end
+     key, depth = line.split("\t")
+     return [key, depth.to_i]
+   end
+
+   def close
+     @writer.close
+     @reader.close
+   end
+ end
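The default queue appends each key/depth pair to a temp file and reads it back in order, so the crawl proceeds FIFO (roughly breadth-first) and keeps pending URLs on disk rather than in memory. Any object with the same put/get contract can be swapped in via --queue-class; a minimal in-memory, depth-first sketch (MyMemoryQueue is a made-up name):

    # Hypothetical LIFO queue: same contract as RWGet::Queue,
    # but depth-first, and the backlog is lost if the process dies.
    class MyMemoryQueue
      def initialize(options = {})
        @stack = []
      end

      def put(key, depth)
        @stack.push([key, depth])
      end

      # get() must return [key_string, depth_int], or nil when empty.
      def get(retrying = false)
        @stack.pop
      end
    end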
@@ -0,0 +1,158 @@
+ require 'optparse'
+
+ class RWGetOptionParser < OptionParser
+   attr_accessor :options
+
+   def usage
+     "Usage: #{$0} [options] SEED_URL [SEED_URL2 ...]"
+   end
+
+   def parse!
+     super
+     options[:seeds] = ARGV
+   end
+
+   def initialize
+     self.options = {}
+     super do |opts|
+
+       yield opts if block_given?
+
+       opts.banner = usage
+
+       opts.on("-w", "--wait=SECONDS", "wait SECONDS between retrievals.") do |w|
+         options[:wait] = w.to_i
+       end
+
+       opts.on("-P", "--directory-prefix=PREFIX", "save files to PREFIX/...") do |p|
+         options[:prefix] = p
+       end
+
+       opts.on("-U", "--user-agent=AGENT", "identify as AGENT instead of RWget/VERSION.") do |u|
+         options[:user_agent] = u
+       end
+
+       opts.on("-Ap", "--accept-pattern=RUBY_REGEX", "URLs must match RUBY_REGEX to be saved to the queue.") do |r|
+         options[:accept_patterns] ||= []
+         options[:accept_patterns] << Regexp.new(r)
+       end
+
+       opts.on("--time-limit=AMOUNT", "Crawler will stop after this AMOUNT of time has passed.") do |t|
+         options[:time_limit] = t.to_i
+         options[:time_limit] *= 60 if t =~ /m/i
+         options[:time_limit] *= 60 * 60 if t =~ /h/i
+         options[:time_limit] *= 60 * 60 * 24 if t =~ /d/i
+         options[:time_limit] *= 60 * 60 * 24 * 7 if t =~ /w/i
+       end
+
+       opts.on("-Rp", "--reject-pattern=RUBY_REGEX", "URLs must NOT match RUBY_REGEX to be saved to the queue.") do |r|
+         options[:reject_patterns] ||= []
+         options[:reject_patterns] << Regexp.new(r)
+       end
+
+       opts.on("--require=RUBY_SCRIPT", "Will execute 'require RUBY_SCRIPT'") do |s|
+         require s
+       end
+
+       opts.on("--limit-rate=RATE", "limit download rate to RATE.") do |r|
+         rate = r.to_i
+         rate *= 1000 if r =~ /k/i
+         rate *= 1000000 if r =~ /m/i
+         options[:limit_rate] = rate
+         puts "rate is #{rate}"
+       end
+
+       opts.on("--http-proxy=URL", "Proxies via URL") do |u|
+         options[:http_proxy] = u
+       end
+
+       opts.on("--proxy-user=USER", "Sets proxy user to USER") do |u|
+         options[:proxy_user] = u
+       end
+
+       opts.on("--proxy-password=PASSWORD", "Sets proxy password to PASSWORD") do |p|
+         options[:proxy_password] = p
+       end
+
+       opts.on("--fetch-class=RUBY_CLASS", "Must implement fetch(uri, user_agent_string) #=> [final_redirected_url, file_object]") do |c|
+         options[:fetch_class] = c
+       end
+
+       opts.on("--store-class=RUBY_CLASS", "Must implement put(key_string, temp_file)") do |c|
+         options[:store_class] = c
+       end
+
+       opts.on("--dupes-class=RUBY_CLASS", "Must implement dupe?(uri)") do |c|
+         options[:dupes_class] = c
+       end
+
+       opts.on("--queue-class=RUBY_CLASS", "Must implement put(key_string, depth_int) and get() #=> [key_string, depth_int]") do |c|
+         options[:queue_class] = c
+       end
+
+       opts.on("--queue-class=RUBY_CLASS", "Must implement put(key_string, depth_int) and get() #=> [key_string, depth_int]") do |c|
+         options[:queue_class] = c
+       end
+
+       opts.on("--links-class=RUBY_CLASS", "Must implement urls(base_uri, temp_file) #=> [uri, ...]") do |c|
+         options[:links_class] = c
+       end
+
+       opts.on("-S", "--sitemap=URL", "URL of a sitemap to crawl (will ignore inter-page links)") do |url|
+         options[:seeds] << url
+         options[:links_class] = "RWGet::SitemapLinks"
+       end
+
+       opts.on("-Q", "--quota=NUMBER", "set retrieval quota to NUMBER.") do |q|
+         options[:quota] = q.to_i
+         options[:quota] *= 1000 if q =~ /k/i
+         options[:quota] *= 1000000 if q =~ /m/i
+       end
+
+       opts.on("--max-redirect=NUM", "maximum redirections allowed per page.") do |m|
+         options[:max_redirect] = m.to_i
+       end
+
+       opts.on("-H", "--span-hosts", "go to foreign hosts when recursive") do |s|
+         options[:span_hosts] = s
+       end
+
+       opts.on("--connect-timeout=SECS", "set the connect timeout to SECS.") do |t|
+         options[:connect_timeout] = t.to_i
+       end
+
+       opts.on("-T", "--timeout=SECS", "set all timeout values to SECONDS.") do |t|
+         options[:timeout] = t.to_i
+       end
+
+       opts.on("-l", "--level=NUMBER", "maximum recursion depth (inf or 0 for infinite).") do |l|
+         options[:depth] = l.to_i
+       end
+
+       opts.on("--[no-]timestampize", "Prepend the timestamp of when the crawl started to the directory structure.") do |t|
+         options[:timestampize] = t
+       end
+
+       opts.on("--incremental-from=PREVIOUS", "Build upon the indexing already saved in PREVIOUS.") do |r|
+         options[:incremental_from] = r
+       end
+
+       opts.on("--protocol-directories", "use protocol name in directories.") do |p|
+         options[:protocol_directories] = p
+       end
+
+       opts.on("--no-host-directories", "don't create host directories.") do |h|
+         options[:no_host_directories] = h
+       end
+
+       opts.on("-v", "--[no-]verbose", "Run verbosely") do |v|
+         options[:verbose] = v
+       end
+
+       opts.on_tail("-h", "--help", "Show this message") do
+         puts opts
+         exit
+       end
+     end
+   end
+ end
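Note how the --time-limit, --limit-rate, and --quota handlers above parse their values: to_i grabs the leading number, then each matching suffix multiplies it. Worked examples (rate and quota use decimal multipliers; the rate is compared against bytes per second in the Controller):

    --time-limit 90     # => 90 seconds
    --time-limit 30m    # => 30 * 60 = 1800 seconds
    --time-limit 2h     # => 2 * 60 * 60 = 7200 seconds
    --limit-rate 500k   # => 500 * 1000 = 500_000
    --quota 2m          # => 2 * 1_000_000 = 2_000_000

Because the suffixes are matched by regex against the whole argument, a value like 1h30m would be multiplied by both the m and h factors, so single-suffix values are the safe form.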