fizx-rwget 0.5.1 → 0.5.2
Sign up to get free protection for your applications and to get access to all the features.
- data/Rakefile +1 -1
- data/VERSION +1 -1
- data/lib/rwget/controller.rb +7 -5
- data/lib/rwget/rwget_option_parser.rb +12 -10
- data/lib/rwget/store.rb +9 -1
- data/rwget.gemspec +9 -7
- metadata +5 -7
data/Rakefile
CHANGED
@@ -10,7 +10,7 @@ begin
|
|
10
10
|
gem.homepage = "http://github.com/fizx/rwget"
|
11
11
|
gem.authors = ["Kyle Maxwell"]
|
12
12
|
gem.add_dependency("curb", ["> 0.0.0"])
|
13
|
-
gem.add_dependency("hpricot", ["> 0.0.0", "< 0.7"])
|
13
|
+
gem.add_dependency("hpricot", ["> 0.0.0"])
|
14
14
|
gem.add_dependency("fizx-robots", [">= 0.3.1"])
|
15
15
|
gem.add_dependency("bloomfilter", ["> 0.0.0"])
|
16
16
|
gem.add_dependency("libxml-ruby", ["> 0.9"])
|
data/VERSION
CHANGED
@@ -1 +1 @@
|
|
1
|
-
0.5.1
|
1
|
+
0.5.2
|
data/lib/rwget/controller.rb
CHANGED
@@ -78,6 +78,7 @@ class RWGet::Controller
|
|
78
78
|
puts "storing at #{key}"
|
79
79
|
@store.put(key, tmpfile)
|
80
80
|
sleep options[:wait]
|
81
|
+
tmpfile.close rescue nil
|
81
82
|
else
|
82
83
|
puts "unable to download"
|
83
84
|
end
|
@@ -102,12 +103,13 @@ class RWGet::Controller
|
|
102
103
|
|
103
104
|
def key_for(uri)
|
104
105
|
arr = []
|
105
|
-
arr << options[:prefix]
|
106
|
-
arr << @start_time
|
107
|
-
arr << uri.scheme
|
108
|
-
arr << uri.host
|
106
|
+
arr << options[:prefix] if options[:prefix]
|
107
|
+
arr << @start_time if options[:timestampize]
|
108
|
+
arr << uri.scheme if options[:protocol_directories]
|
109
|
+
arr << uri.host unless options[:no_host_directories]
|
109
110
|
paths = uri.path.split("/")
|
110
|
-
paths
|
111
|
+
paths << paths.pop + "?" + uri.query if uri.query
|
112
|
+
paths.shift if paths.first.to_s.empty?
|
111
113
|
File.join(arr + paths)
|
112
114
|
end
|
113
115
|
|
@@ -9,7 +9,8 @@ class RWGetOptionParser < OptionParser
|
|
9
9
|
|
10
10
|
def parse!
|
11
11
|
super
|
12
|
-
options[:seeds]
|
12
|
+
options[:seeds] ||= []
|
13
|
+
options[:seeds] += ARGV
|
13
14
|
end
|
14
15
|
|
15
16
|
def initialize
|
@@ -49,10 +50,6 @@ class RWGetOptionParser < OptionParser
|
|
49
50
|
options[:reject_patterns] ||= []
|
50
51
|
options[:reject_patterns] << Regexp.new(r)
|
51
52
|
end
|
52
|
-
|
53
|
-
opts.on("--require=RUBY_SCRIPT", "Will execute 'require RUBY_SCRIPT'") do |s|
|
54
|
-
require s
|
55
|
-
end
|
56
53
|
|
57
54
|
opts.on("--limit-rate=RATE", "limit download rate to RATE.") do |r|
|
58
55
|
rate = r.to_i
|
@@ -74,23 +71,27 @@ class RWGetOptionParser < OptionParser
|
|
74
71
|
options[:proxy_password] = p
|
75
72
|
end
|
76
73
|
|
77
|
-
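opts.on("--fetch-class=RUBY_CLASS", "Must implement fetch(uri, user_agent_string) #=> [final_redirected_url, file_object]") do |c|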
|
74
|
+
opts.on("--require=RUBY_SCRIPT", "Will execute 'require RUBY_SCRIPT'") do |s|
|
75
|
+
require s
|
76
|
+
end
|
77
|
+
|
78
|
+
opts.on("--fetch-class=RUBY_CLASS", "Must implement fetch(uri, user_agent_string) #=> [final_redirected_url, file_object] (Load the class with --require)") do |c|
|
78
79
|
options[:fetch_class] = c
|
79
80
|
end
|
80
81
|
|
81
|
-
opts.on("--store-class=RUBY_CLASS", "Must implement put(key_string, temp_file)") do |c|
|
82
|
+
opts.on("--store-class=RUBY_CLASS", "Must implement put(key_string, temp_file) (Load the class with --require)") do |c|
|
82
83
|
options[:store_class] = c
|
83
84
|
end
|
84
85
|
|
85
|
-
opts.on("--dupes-class=RUBY_CLASS", "Must implement dupe?(uri)") do |c|
|
86
|
+
opts.on("--dupes-class=RUBY_CLASS", "Must implement dupe?(uri) (Load the class with --require)") do |c|
|
86
87
|
options[:dupes_class] = c
|
87
88
|
end
|
88
89
|
|
89
|
-
opts.on("--queue-class=RUBY_CLASS", "Must implement put(key_string, depth_int) and get() #=> [key_string, depth_int]") do |c|
|
90
|
+
opts.on("--queue-class=RUBY_CLASS", "Must implement put(key_string, depth_int) and get() #=> [key_string, depth_int] (Load the class with --require)") do |c|
|
90
91
|
options[:queue_class] = c
|
91
92
|
end
|
92
93
|
|
93
|
-
opts.on("--queue-class=RUBY_CLASS", "Must implement put(key_string, depth_int) and get() #=> [key_string, depth_int]") do |c|
|
94
|
+
opts.on("--queue-class=RUBY_CLASS", "Must implement put(key_string, depth_int) and get() #=> [key_string, depth_int] (Load the class with --require)") do |c|
|
94
95
|
options[:queue_class] = c
|
95
96
|
end
|
96
97
|
|
@@ -99,6 +100,7 @@ class RWGetOptionParser < OptionParser
|
|
99
100
|
end
|
100
101
|
|
101
102
|
opts.on("-S", "--sitemap=URL", "URL of a sitemap to crawl (will ignore inter-page links)") do |url|
|
103
|
+
options[:seeds] ||= []
|
102
104
|
options[:seeds] << url
|
103
105
|
options[:links_class] = "RWGet::SitemapLinks"
|
104
106
|
end
|
data/lib/rwget/store.rb
CHANGED
@@ -11,7 +11,15 @@ class RWGet::Store
|
|
11
11
|
def put(key, tmpfile)
|
12
12
|
path = File.join(@root, key)
|
13
13
|
path = File.join(path, "index.html") unless path.split("/").last =~ /\.|\?/
|
14
|
-
|
14
|
+
dir = File.dirname(path)
|
15
|
+
if(File.file?(dir))
|
16
|
+
tmp = "#{dir}.index.html.#{Time.now.to_f}"
|
17
|
+
mv dir, tmp
|
18
|
+
mkdir_p(dir)
|
19
|
+
mv tmp, File.join(dir, "index.html")
|
20
|
+
else
|
21
|
+
mkdir_p(dir)
|
22
|
+
end
|
15
23
|
mv tmpfile.path, path
|
16
24
|
end
|
17
25
|
end
|
data/rwget.gemspec
CHANGED
@@ -1,12 +1,15 @@
|
|
1
|
+
# Generated by jeweler
|
2
|
+
# DO NOT EDIT THIS FILE
|
3
|
+
# Instead, edit Jeweler::Tasks in Rakefile, and run `rake gemspec`
|
1
4
|
# -*- encoding: utf-8 -*-
|
2
5
|
|
3
6
|
Gem::Specification.new do |s|
|
4
7
|
s.name = %q{rwget}
|
5
|
-
s.version = "0.5.1"
|
8
|
+
s.version = "0.5.2"
|
6
9
|
|
7
10
|
s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
|
8
11
|
s.authors = ["Kyle Maxwell"]
|
9
|
-
s.date = %q{2009-
|
12
|
+
s.date = %q{2009-09-10}
|
10
13
|
s.default_executable = %q{rwget}
|
11
14
|
s.email = %q{kyle@kylemaxwell.com}
|
12
15
|
s.executables = ["rwget"]
|
@@ -42,11 +45,10 @@ Gem::Specification.new do |s|
|
|
42
45
|
"test/sitemap_links_test.rb",
|
43
46
|
"test/store_test.rb"
|
44
47
|
]
|
45
|
-
s.has_rdoc = true
|
46
48
|
s.homepage = %q{http://github.com/fizx/rwget}
|
47
49
|
s.rdoc_options = ["--charset=UTF-8"]
|
48
50
|
s.require_paths = ["lib"]
|
49
|
-
s.rubygems_version = %q{1.3.
|
51
|
+
s.rubygems_version = %q{1.3.5}
|
50
52
|
s.summary = %q{Ruby port of wget, emphasis on recursive/crawler}
|
51
53
|
s.test_files = [
|
52
54
|
"test/controller_test.rb",
|
@@ -65,20 +67,20 @@ Gem::Specification.new do |s|
|
|
65
67
|
|
66
68
|
if Gem::Version.new(Gem::RubyGemsVersion) >= Gem::Version.new('1.2.0') then
|
67
69
|
s.add_runtime_dependency(%q<curb>, ["> 0.0.0"])
|
68
|
-
s.add_runtime_dependency(%q<hpricot>, ["> 0.0.0", "< 0.7"])
|
70
|
+
s.add_runtime_dependency(%q<hpricot>, ["> 0.0.0"])
|
69
71
|
s.add_runtime_dependency(%q<fizx-robots>, [">= 0.3.1"])
|
70
72
|
s.add_runtime_dependency(%q<bloomfilter>, ["> 0.0.0"])
|
71
73
|
s.add_runtime_dependency(%q<libxml-ruby>, ["> 0.9"])
|
72
74
|
else
|
73
75
|
s.add_dependency(%q<curb>, ["> 0.0.0"])
|
74
|
-
s.add_dependency(%q<hpricot>, ["> 0.0.0", "< 0.7"])
|
76
|
+
s.add_dependency(%q<hpricot>, ["> 0.0.0"])
|
75
77
|
s.add_dependency(%q<fizx-robots>, [">= 0.3.1"])
|
76
78
|
s.add_dependency(%q<bloomfilter>, ["> 0.0.0"])
|
77
79
|
s.add_dependency(%q<libxml-ruby>, ["> 0.9"])
|
78
80
|
end
|
79
81
|
else
|
80
82
|
s.add_dependency(%q<curb>, ["> 0.0.0"])
|
81
|
-
s.add_dependency(%q<hpricot>, ["> 0.0.0", "< 0.7"])
|
83
|
+
s.add_dependency(%q<hpricot>, ["> 0.0.0"])
|
82
84
|
s.add_dependency(%q<fizx-robots>, [">= 0.3.1"])
|
83
85
|
s.add_dependency(%q<bloomfilter>, ["> 0.0.0"])
|
84
86
|
s.add_dependency(%q<libxml-ruby>, ["> 0.9"])
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: fizx-rwget
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.5.1
|
4
|
+
version: 0.5.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Kyle Maxwell
|
@@ -9,7 +9,7 @@ autorequire:
|
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
11
|
|
12
|
-
date: 2009-
|
12
|
+
date: 2009-09-10 00:00:00 -07:00
|
13
13
|
default_executable: rwget
|
14
14
|
dependencies:
|
15
15
|
- !ruby/object:Gem::Dependency
|
@@ -31,9 +31,6 @@ dependencies:
|
|
31
31
|
- - ">"
|
32
32
|
- !ruby/object:Gem::Version
|
33
33
|
version: 0.0.0
|
34
|
-
- - <
|
35
|
-
- !ruby/object:Gem::Version
|
36
|
-
version: "0.7"
|
37
34
|
version:
|
38
35
|
- !ruby/object:Gem::Dependency
|
39
36
|
name: fizx-robots
|
@@ -101,8 +98,9 @@ files:
|
|
101
98
|
- test/server.rb
|
102
99
|
- test/sitemap_links_test.rb
|
103
100
|
- test/store_test.rb
|
104
|
-
has_rdoc: true
|
101
|
+
has_rdoc: false
|
105
102
|
homepage: http://github.com/fizx/rwget
|
103
|
+
licenses:
|
106
104
|
post_install_message:
|
107
105
|
rdoc_options:
|
108
106
|
- --charset=UTF-8
|
@@ -123,7 +121,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
123
121
|
requirements: []
|
124
122
|
|
125
123
|
rubyforge_project:
|
126
|
-
rubygems_version: 1.
|
124
|
+
rubygems_version: 1.3.5
|
127
125
|
signing_key:
|
128
126
|
specification_version: 3
|
129
127
|
summary: Ruby port of wget, emphasis on recursive/crawler
|