wayback_machine_downloader 2.1.1 → 2.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/bin/wayback_machine_downloader +4 -0
- data/lib/wayback_machine_downloader.rb +45 -7
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 77c3cb9b38d076dbaa7c30790c828ed63dacf2a6
|
4
|
+
data.tar.gz: 1e47ae74b6d2c0bbbe2ae99bc4622115c1a6496c
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 0326d70c1cf269099418b37302b01b22433931e217cf6d37b1bfd9fd6bcd1ced45e80ff2d0aad72d7c695ac4c25fa59d5e512a4b4674f182d4602647602ccb23
|
7
|
+
data.tar.gz: 9a0bf4eed69041ce156075a7f206237f9c1f5974b63fcb23370c19139a996d5fd143c1ff1fbe4640ce9cf5d36e36ed2e20916db504cbff9d0949f72f4086626d
|
data/bin/wayback_machine_downloader
CHANGED
@@ -18,6 +18,10 @@ option_parser = OptionParser.new do |opts|
|
|
18
18
|
options[:directory] = t
|
19
19
|
end
|
20
20
|
|
21
|
+
opts.on("-s", "--all-timestamps", "Download all snapshots/timestamps for a given website") do |t|
|
22
|
+
options[:all_timestamps] = true
|
23
|
+
end
|
24
|
+
|
21
25
|
opts.on("-f", "--from TIMESTAMP", Integer, "Only files on or after timestamp supplied (ie. 20060716231334)") do |t|
|
22
26
|
options[:from_timestamp] = t
|
23
27
|
end
|
data/lib/wayback_machine_downloader.rb
CHANGED
@@ -14,9 +14,9 @@ class WaybackMachineDownloader
|
|
14
14
|
|
15
15
|
include ArchiveAPI
|
16
16
|
|
17
|
-
VERSION = "2.1.1"
|
17
|
+
VERSION = "2.2.0"
|
18
18
|
|
19
|
-
attr_accessor :base_url, :exact_url, :directory,
|
19
|
+
attr_accessor :base_url, :exact_url, :directory, :all_timestamps,
|
20
20
|
:from_timestamp, :to_timestamp, :only_filter, :exclude_filter,
|
21
21
|
:all, :maximum_pages, :threads_count
|
22
22
|
|
@@ -24,6 +24,7 @@ class WaybackMachineDownloader
|
|
24
24
|
@base_url = params[:base_url]
|
25
25
|
@exact_url = params[:exact_url]
|
26
26
|
@directory = params[:directory]
|
27
|
+
@all_timestamps = params[:all_timestamps]
|
27
28
|
@from_timestamp = params[:from_timestamp].to_i
|
28
29
|
@to_timestamp = params[:to_timestamp].to_i
|
29
30
|
@only_filter = params[:only_filter]
|
@@ -127,12 +128,49 @@ class WaybackMachineDownloader
|
|
127
128
|
file_list_curated
|
128
129
|
end
|
129
130
|
|
131
|
+
def get_file_list_all_timestamps
|
132
|
+
file_list_curated = Hash.new
|
133
|
+
get_all_snapshots_to_consider.each_line do |line|
|
134
|
+
next unless line.include?('/')
|
135
|
+
file_timestamp = line[0..13].to_i
|
136
|
+
file_url = line[15..-2]
|
137
|
+
file_id = file_url.split('/')[3..-1].join('/')
|
138
|
+
file_id_and_timestamp = [file_timestamp, file_id].join('/')
|
139
|
+
file_id_and_timestamp = CGI::unescape file_id_and_timestamp
|
140
|
+
file_id_and_timestamp = file_id_and_timestamp.tidy_bytes unless file_id_and_timestamp == ""
|
141
|
+
if file_id.nil?
|
142
|
+
puts "Malformed file url, ignoring: #{file_url}"
|
143
|
+
else
|
144
|
+
if match_exclude_filter(file_url)
|
145
|
+
puts "File url matches exclude filter, ignoring: #{file_url}"
|
146
|
+
elsif not match_only_filter(file_url)
|
147
|
+
puts "File url doesn't match only filter, ignoring: #{file_url}"
|
148
|
+
elsif file_list_curated[file_id_and_timestamp]
|
149
|
+
puts "Duplicate file and timestamp combo, ignoring: #{file_id}" if @verbose
|
150
|
+
else
|
151
|
+
file_list_curated[file_id_and_timestamp] = {file_url: file_url, timestamp: file_timestamp}
|
152
|
+
end
|
153
|
+
end
|
154
|
+
end
|
155
|
+
puts "file_list_curated: " + file_list_curated.count.to_s
|
156
|
+
file_list_curated
|
157
|
+
end
|
158
|
+
|
159
|
+
|
130
160
|
def get_file_list_by_timestamp
|
131
|
-
|
132
|
-
|
133
|
-
|
134
|
-
|
135
|
-
|
161
|
+
if @all_timestamps
|
162
|
+
file_list_curated = get_file_list_all_timestamps
|
163
|
+
file_list_curated.map do |file_remote_info|
|
164
|
+
file_remote_info[1][:file_id] = file_remote_info[0]
|
165
|
+
file_remote_info[1]
|
166
|
+
end
|
167
|
+
else
|
168
|
+
file_list_curated = get_file_list_curated
|
169
|
+
file_list_curated = file_list_curated.sort_by { |k,v| v[:timestamp] }.reverse
|
170
|
+
file_list_curated.map do |file_remote_info|
|
171
|
+
file_remote_info[1][:file_id] = file_remote_info[0]
|
172
|
+
file_remote_info[1]
|
173
|
+
end
|
136
174
|
end
|
137
175
|
end
|
138
176
|
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: wayback_machine_downloader
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 2.1.1
|
4
|
+
version: 2.2.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- hartator
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2017-
|
11
|
+
date: 2017-10-27 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: rake
|