wayback_machine_downloader 2.1.1 → 2.2.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/bin/wayback_machine_downloader +4 -0
- data/lib/wayback_machine_downloader.rb +45 -7
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 77c3cb9b38d076dbaa7c30790c828ed63dacf2a6
|
4
|
+
data.tar.gz: 1e47ae74b6d2c0bbbe2ae99bc4622115c1a6496c
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 0326d70c1cf269099418b37302b01b22433931e217cf6d37b1bfd9fd6bcd1ced45e80ff2d0aad72d7c695ac4c25fa59d5e512a4b4674f182d4602647602ccb23
|
7
|
+
data.tar.gz: 9a0bf4eed69041ce156075a7f206237f9c1f5974b63fcb23370c19139a996d5fd143c1ff1fbe4640ce9cf5d36e36ed2e20916db504cbff9d0949f72f4086626d
|
@@ -18,6 +18,10 @@ option_parser = OptionParser.new do |opts|
|
|
18
18
|
options[:directory] = t
|
19
19
|
end
|
20
20
|
|
21
|
+
opts.on("-s", "--all-timestamps", "Download all snapshots/timestamps for a given website") do |t|
|
22
|
+
options[:all_timestamps] = true
|
23
|
+
end
|
24
|
+
|
21
25
|
opts.on("-f", "--from TIMESTAMP", Integer, "Only files on or after timestamp supplied (ie. 20060716231334)") do |t|
|
22
26
|
options[:from_timestamp] = t
|
23
27
|
end
|
@@ -14,9 +14,9 @@ class WaybackMachineDownloader
|
|
14
14
|
|
15
15
|
include ArchiveAPI
|
16
16
|
|
17
|
-
VERSION = "2.1.1"
|
17
|
+
VERSION = "2.2.0"
|
18
18
|
|
19
|
-
attr_accessor :base_url, :exact_url, :directory,
|
19
|
+
attr_accessor :base_url, :exact_url, :directory, :all_timestamps,
|
20
20
|
:from_timestamp, :to_timestamp, :only_filter, :exclude_filter,
|
21
21
|
:all, :maximum_pages, :threads_count
|
22
22
|
|
@@ -24,6 +24,7 @@ class WaybackMachineDownloader
|
|
24
24
|
@base_url = params[:base_url]
|
25
25
|
@exact_url = params[:exact_url]
|
26
26
|
@directory = params[:directory]
|
27
|
+
@all_timestamps = params[:all_timestamps]
|
27
28
|
@from_timestamp = params[:from_timestamp].to_i
|
28
29
|
@to_timestamp = params[:to_timestamp].to_i
|
29
30
|
@only_filter = params[:only_filter]
|
@@ -127,12 +128,49 @@ class WaybackMachineDownloader
|
|
127
128
|
file_list_curated
|
128
129
|
end
|
129
130
|
|
131
|
+
# Builds the download list used when every snapshot of a file is wanted,
# keeping one entry per distinct timestamp/file combination.
#
# Reads the text returned by get_all_snapshots_to_consider line by line.
# For each line containing a '/', the first 14 characters are parsed as the
# integer timestamp and the text from column 15 onward is taken as the file
# URL. URLs rejected by match_exclude_filter / match_only_filter are skipped
# with a message.
#
# @return [Hash{String => Hash}] keyed by the URL-unescaped
#   "<timestamp>/<file id>" string; each value is
#   { file_url: String, timestamp: Integer }
def get_file_list_all_timestamps
  file_list_curated = {}
  get_all_snapshots_to_consider.each_line do |line|
    next unless line.include?('/')

    file_timestamp = line[0..13].to_i
    # chomp (rather than blindly dropping the last character) so that a final
    # line without a trailing newline keeps its whole URL; to_s covers lines
    # shorter than 15 characters, for which the slice is nil.
    file_url = line.chomp[15..-1].to_s
    # Array#[] with a range starting past the end returns nil, so this has to
    # be checked *before* joining; otherwise a short URL raises NoMethodError
    # and the "malformed" branch can never run.
    path_segments = file_url.split('/')[3..-1]
    if path_segments.nil?
      puts "Malformed file url, ignoring: #{file_url}"
      next
    end

    file_id = path_segments.join('/')
    # The key always contains the timestamp and a '/', so it is never empty.
    file_id_and_timestamp = CGI::unescape([file_timestamp, file_id].join('/')).tidy_bytes

    if match_exclude_filter(file_url)
      puts "File url matches exclude filter, ignoring: #{file_url}"
    elsif !match_only_filter(file_url)
      puts "File url doesn't match only filter, ignoring: #{file_url}"
    elsif file_list_curated[file_id_and_timestamp]
      puts "Duplicate file and timestamp combo, ignoring: #{file_id}" if @verbose
    else
      file_list_curated[file_id_and_timestamp] = {file_url: file_url, timestamp: file_timestamp}
    end
  end
  puts "file_list_curated: #{file_list_curated.count}"
  file_list_curated
end
|
158
|
+
|
159
|
+
|
130
160
|
# Returns the files to download as an Array of info hashes, each with its
# lookup key copied into :file_id.
#
# When @all_timestamps is truthy the entries come from
# get_file_list_all_timestamps in that hash's own order; otherwise they come
# from get_file_list_curated ordered by :timestamp, highest first.
#
# @return [Array<Hash>] the info hashes (mutated in place to add :file_id)
def get_file_list_by_timestamp
  file_list_curated =
    if @all_timestamps
      get_file_list_all_timestamps
    else
      get_file_list_curated.sort_by { |_file_id, file_remote_info| file_remote_info[:timestamp] }.reverse
    end

  # Both sources yield [file_id, info] pairs, so a single mapping step serves
  # either branch instead of repeating it in each one.
  file_list_curated.map do |file_id, file_remote_info|
    file_remote_info[:file_id] = file_id
    file_remote_info
  end
end
|
138
176
|
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: wayback_machine_downloader
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 2.1.1
|
4
|
+
version: 2.2.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- hartator
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2017-
|
11
|
+
date: 2017-10-27 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: rake
|