fizx-rwget 0.5.1 → 0.5.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/Rakefile +1 -1
- data/VERSION +1 -1
- data/lib/rwget/controller.rb +7 -5
- data/lib/rwget/rwget_option_parser.rb +12 -10
- data/lib/rwget/store.rb +9 -1
- data/rwget.gemspec +9 -7
- metadata +5 -7
data/Rakefile
CHANGED
|
@@ -10,7 +10,7 @@ begin
|
|
|
10
10
|
gem.homepage = "http://github.com/fizx/rwget"
|
|
11
11
|
gem.authors = ["Kyle Maxwell"]
|
|
12
12
|
gem.add_dependency("curb", ["> 0.0.0"])
|
|
13
|
-
gem.add_dependency("hpricot", ["> 0.0.0", "< 0.7"])
|
|
13
|
+
gem.add_dependency("hpricot", ["> 0.0.0"])
|
|
14
14
|
gem.add_dependency("fizx-robots", [">= 0.3.1"])
|
|
15
15
|
gem.add_dependency("bloomfilter", ["> 0.0.0"])
|
|
16
16
|
gem.add_dependency("libxml-ruby", ["> 0.9"])
|
data/VERSION
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
0.5.1
|
|
1
|
+
0.5.2
|
data/lib/rwget/controller.rb
CHANGED
|
@@ -78,6 +78,7 @@ class RWGet::Controller
|
|
|
78
78
|
puts "storing at #{key}"
|
|
79
79
|
@store.put(key, tmpfile)
|
|
80
80
|
sleep options[:wait]
|
|
81
|
+
tmpfile.close rescue nil
|
|
81
82
|
else
|
|
82
83
|
puts "unable to download"
|
|
83
84
|
end
|
|
@@ -102,12 +103,13 @@ class RWGet::Controller
|
|
|
102
103
|
|
|
103
104
|
def key_for(uri)
|
|
104
105
|
arr = []
|
|
105
|
-
arr << options[:prefix]
|
|
106
|
-
arr << @start_time
|
|
107
|
-
arr << uri.scheme
|
|
108
|
-
arr << uri.host
|
|
106
|
+
arr << options[:prefix] if options[:prefix]
|
|
107
|
+
arr << @start_time if options[:timestampize]
|
|
108
|
+
arr << uri.scheme if options[:protocol_directories]
|
|
109
|
+
arr << uri.host unless options[:no_host_directories]
|
|
109
110
|
paths = uri.path.split("/")
|
|
110
|
-
paths
|
|
111
|
+
paths << paths.pop + "?" + uri.query if uri.query
|
|
112
|
+
paths.shift if paths.first.to_s.empty?
|
|
111
113
|
File.join(arr + paths)
|
|
112
114
|
end
|
|
113
115
|
|
data/lib/rwget/rwget_option_parser.rb
CHANGED
|
@@ -9,7 +9,8 @@ class RWGetOptionParser < OptionParser
|
|
|
9
9
|
|
|
10
10
|
def parse!
|
|
11
11
|
super
|
|
12
|
-
options[:seeds]
|
|
12
|
+
options[:seeds] ||= []
|
|
13
|
+
options[:seeds] += ARGV
|
|
13
14
|
end
|
|
14
15
|
|
|
15
16
|
def initialize
|
|
@@ -49,10 +50,6 @@ class RWGetOptionParser < OptionParser
|
|
|
49
50
|
options[:reject_patterns] ||= []
|
|
50
51
|
options[:reject_patterns] << Regexp.new(r)
|
|
51
52
|
end
|
|
52
|
-
|
|
53
|
-
opts.on("--require=RUBY_SCRIPT", "Will execute 'require RUBY_SCRIPT'") do |s|
|
|
54
|
-
require s
|
|
55
|
-
end
|
|
56
53
|
|
|
57
54
|
opts.on("--limit-rate=RATE", "limit download rate to RATE.") do |r|
|
|
58
55
|
rate = r.to_i
|
|
@@ -74,23 +71,27 @@ class RWGetOptionParser < OptionParser
|
|
|
74
71
|
options[:proxy_password] = p
|
|
75
72
|
end
|
|
76
73
|
|
|
77
|
-
opts.on("--
|
|
74
|
+
opts.on("--require=RUBY_SCRIPT", "Will execute 'require RUBY_SCRIPT'") do |s|
|
|
75
|
+
require s
|
|
76
|
+
end
|
|
77
|
+
|
|
78
|
+
opts.on("--fetch-class=RUBY_CLASS", "Must implement fetch(uri, user_agent_string) #=> [final_redirected_url, file_object] (Load the class with --require)") do |c|
|
|
78
79
|
options[:fetch_class] = c
|
|
79
80
|
end
|
|
80
81
|
|
|
81
|
-
opts.on("--store-class=RUBY_CLASS", "Must implement put(key_string, temp_file)") do |c|
|
|
82
|
+
opts.on("--store-class=RUBY_CLASS", "Must implement put(key_string, temp_file) (Load the class with --require)") do |c|
|
|
82
83
|
options[:store_class] = c
|
|
83
84
|
end
|
|
84
85
|
|
|
85
|
-
opts.on("--dupes-class=RUBY_CLASS", "Must implement dupe?(uri)") do |c|
|
|
86
|
+
opts.on("--dupes-class=RUBY_CLASS", "Must implement dupe?(uri) (Load the class with --require)") do |c|
|
|
86
87
|
options[:dupes_class] = c
|
|
87
88
|
end
|
|
88
89
|
|
|
89
|
-
opts.on("--queue-class=RUBY_CLASS", "Must implement put(key_string, depth_int) and get() #=> [key_string, depth_int]") do |c|
|
|
90
|
+
opts.on("--queue-class=RUBY_CLASS", "Must implement put(key_string, depth_int) and get() #=> [key_string, depth_int] (Load the class with --require)") do |c|
|
|
90
91
|
options[:queue_class] = c
|
|
91
92
|
end
|
|
92
93
|
|
|
93
|
-
opts.on("--queue-class=RUBY_CLASS", "Must implement put(key_string, depth_int) and get() #=> [key_string, depth_int]") do |c|
|
|
94
|
+
opts.on("--queue-class=RUBY_CLASS", "Must implement put(key_string, depth_int) and get() #=> [key_string, depth_int] (Load the class with --require)") do |c|
|
|
94
95
|
options[:queue_class] = c
|
|
95
96
|
end
|
|
96
97
|
|
|
@@ -99,6 +100,7 @@ class RWGetOptionParser < OptionParser
|
|
|
99
100
|
end
|
|
100
101
|
|
|
101
102
|
opts.on("-S", "--sitemap=URL", "URL of a sitemap to crawl (will ignore inter-page links)") do |url|
|
|
103
|
+
options[:seeds] ||= []
|
|
102
104
|
options[:seeds] << url
|
|
103
105
|
options[:links_class] = "RWGet::SitemapLinks"
|
|
104
106
|
end
|
data/lib/rwget/store.rb
CHANGED
|
@@ -11,7 +11,15 @@ class RWGet::Store
|
|
|
11
11
|
def put(key, tmpfile)
|
|
12
12
|
path = File.join(@root, key)
|
|
13
13
|
path = File.join(path, "index.html") unless path.split("/").last =~ /\.|\?/
|
|
14
|
-
|
|
14
|
+
dir = File.dirname(path)
|
|
15
|
+
if(File.file?(dir))
|
|
16
|
+
tmp = "#{dir}.index.html.#{Time.now.to_f}"
|
|
17
|
+
mv dir, tmp
|
|
18
|
+
mkdir_p(dir)
|
|
19
|
+
mv tmp, File.join(dir, "index.html")
|
|
20
|
+
else
|
|
21
|
+
mkdir_p(dir)
|
|
22
|
+
end
|
|
15
23
|
mv tmpfile.path, path
|
|
16
24
|
end
|
|
17
25
|
end
|
data/rwget.gemspec
CHANGED
|
@@ -1,12 +1,15 @@
|
|
|
1
|
+
# Generated by jeweler
|
|
2
|
+
# DO NOT EDIT THIS FILE
|
|
3
|
+
# Instead, edit Jeweler::Tasks in Rakefile, and run `rake gemspec`
|
|
1
4
|
# -*- encoding: utf-8 -*-
|
|
2
5
|
|
|
3
6
|
Gem::Specification.new do |s|
|
|
4
7
|
s.name = %q{rwget}
|
|
5
|
-
s.version = "0.5.1"
|
|
8
|
+
s.version = "0.5.2"
|
|
6
9
|
|
|
7
10
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
|
8
11
|
s.authors = ["Kyle Maxwell"]
|
|
9
|
-
s.date = %q{2009-
|
|
12
|
+
s.date = %q{2009-09-10}
|
|
10
13
|
s.default_executable = %q{rwget}
|
|
11
14
|
s.email = %q{kyle@kylemaxwell.com}
|
|
12
15
|
s.executables = ["rwget"]
|
|
@@ -42,11 +45,10 @@ Gem::Specification.new do |s|
|
|
|
42
45
|
"test/sitemap_links_test.rb",
|
|
43
46
|
"test/store_test.rb"
|
|
44
47
|
]
|
|
45
|
-
s.has_rdoc = true
|
|
46
48
|
s.homepage = %q{http://github.com/fizx/rwget}
|
|
47
49
|
s.rdoc_options = ["--charset=UTF-8"]
|
|
48
50
|
s.require_paths = ["lib"]
|
|
49
|
-
s.rubygems_version = %q{1.3.
|
|
51
|
+
s.rubygems_version = %q{1.3.5}
|
|
50
52
|
s.summary = %q{Ruby port of wget, emphasis on recursive/crawler}
|
|
51
53
|
s.test_files = [
|
|
52
54
|
"test/controller_test.rb",
|
|
@@ -65,20 +67,20 @@ Gem::Specification.new do |s|
|
|
|
65
67
|
|
|
66
68
|
if Gem::Version.new(Gem::RubyGemsVersion) >= Gem::Version.new('1.2.0') then
|
|
67
69
|
s.add_runtime_dependency(%q<curb>, ["> 0.0.0"])
|
|
68
|
-
s.add_runtime_dependency(%q<hpricot>, ["> 0.0.0", "< 0.7"])
|
|
70
|
+
s.add_runtime_dependency(%q<hpricot>, ["> 0.0.0"])
|
|
69
71
|
s.add_runtime_dependency(%q<fizx-robots>, [">= 0.3.1"])
|
|
70
72
|
s.add_runtime_dependency(%q<bloomfilter>, ["> 0.0.0"])
|
|
71
73
|
s.add_runtime_dependency(%q<libxml-ruby>, ["> 0.9"])
|
|
72
74
|
else
|
|
73
75
|
s.add_dependency(%q<curb>, ["> 0.0.0"])
|
|
74
|
-
s.add_dependency(%q<hpricot>, ["> 0.0.0", "< 0.7"])
|
|
76
|
+
s.add_dependency(%q<hpricot>, ["> 0.0.0"])
|
|
75
77
|
s.add_dependency(%q<fizx-robots>, [">= 0.3.1"])
|
|
76
78
|
s.add_dependency(%q<bloomfilter>, ["> 0.0.0"])
|
|
77
79
|
s.add_dependency(%q<libxml-ruby>, ["> 0.9"])
|
|
78
80
|
end
|
|
79
81
|
else
|
|
80
82
|
s.add_dependency(%q<curb>, ["> 0.0.0"])
|
|
81
|
-
s.add_dependency(%q<hpricot>, ["> 0.0.0", "< 0.7"])
|
|
83
|
+
s.add_dependency(%q<hpricot>, ["> 0.0.0"])
|
|
82
84
|
s.add_dependency(%q<fizx-robots>, [">= 0.3.1"])
|
|
83
85
|
s.add_dependency(%q<bloomfilter>, ["> 0.0.0"])
|
|
84
86
|
s.add_dependency(%q<libxml-ruby>, ["> 0.9"])
|
metadata
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: fizx-rwget
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.5.1
|
|
4
|
+
version: 0.5.2
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Kyle Maxwell
|
|
@@ -9,7 +9,7 @@ autorequire:
|
|
|
9
9
|
bindir: bin
|
|
10
10
|
cert_chain: []
|
|
11
11
|
|
|
12
|
-
date: 2009-
|
|
12
|
+
date: 2009-09-10 00:00:00 -07:00
|
|
13
13
|
default_executable: rwget
|
|
14
14
|
dependencies:
|
|
15
15
|
- !ruby/object:Gem::Dependency
|
|
@@ -31,9 +31,6 @@ dependencies:
|
|
|
31
31
|
- - ">"
|
|
32
32
|
- !ruby/object:Gem::Version
|
|
33
33
|
version: 0.0.0
|
|
34
|
-
- - <
|
|
35
|
-
- !ruby/object:Gem::Version
|
|
36
|
-
version: "0.7"
|
|
37
34
|
version:
|
|
38
35
|
- !ruby/object:Gem::Dependency
|
|
39
36
|
name: fizx-robots
|
|
@@ -101,8 +98,9 @@ files:
|
|
|
101
98
|
- test/server.rb
|
|
102
99
|
- test/sitemap_links_test.rb
|
|
103
100
|
- test/store_test.rb
|
|
104
|
-
has_rdoc:
|
|
101
|
+
has_rdoc: false
|
|
105
102
|
homepage: http://github.com/fizx/rwget
|
|
103
|
+
licenses:
|
|
106
104
|
post_install_message:
|
|
107
105
|
rdoc_options:
|
|
108
106
|
- --charset=UTF-8
|
|
@@ -123,7 +121,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
|
123
121
|
requirements: []
|
|
124
122
|
|
|
125
123
|
rubyforge_project:
|
|
126
|
-
rubygems_version: 1.
|
|
124
|
+
rubygems_version: 1.3.5
|
|
127
125
|
signing_key:
|
|
128
126
|
specification_version: 3
|
|
129
127
|
summary: Ruby port of wget, emphasis on recursive/crawler
|