wayback_machine_downloader 2.1.1 → 2.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 48f524cedc0e9f66c7b0acca132a71557a327ea2
4
- data.tar.gz: 1d70bb2a76cd07c82c08674fdc96b543caec48c0
3
+ metadata.gz: 77c3cb9b38d076dbaa7c30790c828ed63dacf2a6
4
+ data.tar.gz: 1e47ae74b6d2c0bbbe2ae99bc4622115c1a6496c
5
5
  SHA512:
6
- metadata.gz: 26eb05cbeebd911502bd01513535c7cc2d4ad0fe3850adc0205ca4f649351e56855af66915d86c501fb8be64963fe1d409d013d8afcd24064cc15673b2cc0854
7
- data.tar.gz: 0dbbd54b4b4ab231adcae908bbf6cd3865768590263e767fe5e45fb3a9d70676c337f79aba576378272ddc14647ecd06fc26820fc1dec8cb52704aa6740582b7
6
+ metadata.gz: 0326d70c1cf269099418b37302b01b22433931e217cf6d37b1bfd9fd6bcd1ced45e80ff2d0aad72d7c695ac4c25fa59d5e512a4b4674f182d4602647602ccb23
7
+ data.tar.gz: 9a0bf4eed69041ce156075a7f206237f9c1f5974b63fcb23370c19139a996d5fd143c1ff1fbe4640ce9cf5d36e36ed2e20916db504cbff9d0949f72f4086626d
@@ -18,6 +18,10 @@ option_parser = OptionParser.new do |opts|
18
18
  options[:directory] = t
19
19
  end
20
20
 
21
+ opts.on("-s", "--all-timestamps", "Download all snapshots/timestamps for a given website") do |t|
22
+ options[:all_timestamps] = true
23
+ end
24
+
21
25
  opts.on("-f", "--from TIMESTAMP", Integer, "Only files on or after timestamp supplied (ie. 20060716231334)") do |t|
22
26
  options[:from_timestamp] = t
23
27
  end
@@ -14,9 +14,9 @@ class WaybackMachineDownloader
14
14
 
15
15
  include ArchiveAPI
16
16
 
17
- VERSION = "2.1.1"
17
+ VERSION = "2.2.0"
18
18
 
19
- attr_accessor :base_url, :exact_url, :directory,
19
+ attr_accessor :base_url, :exact_url, :directory, :all_timestamps,
20
20
  :from_timestamp, :to_timestamp, :only_filter, :exclude_filter,
21
21
  :all, :maximum_pages, :threads_count
22
22
 
@@ -24,6 +24,7 @@ class WaybackMachineDownloader
24
24
  @base_url = params[:base_url]
25
25
  @exact_url = params[:exact_url]
26
26
  @directory = params[:directory]
27
+ @all_timestamps = params[:all_timestamps]
27
28
  @from_timestamp = params[:from_timestamp].to_i
28
29
  @to_timestamp = params[:to_timestamp].to_i
29
30
  @only_filter = params[:only_filter]
@@ -127,12 +128,49 @@ class WaybackMachineDownloader
127
128
  file_list_curated
128
129
  end
129
130
 
131
# Builds the download list for "all timestamps" mode: one entry per distinct
# (timestamp, file path) pair found in the snapshot index.
#
# Each line produced by get_all_snapshots_to_consider is read as a 14-character
# timestamp, one separator character, then the archived URL. That layout is
# taken from the slicing below -- TODO confirm against the index producer.
#
# Lines without a '/' are skipped silently. URLs rejected by the exclude/only
# filters, and malformed URLs, are reported on stdout and skipped.
#
# @return [Hash{String => Hash}] keyed by "<timestamp>/<unescaped path>", each
#   value being { file_url: String, timestamp: Integer }
def get_file_list_all_timestamps
  file_list_curated = {}
  get_all_snapshots_to_consider.each_line do |line|
    next unless line.include?('/')

    # chomp (rather than a fixed [15..-2] slice) so a final line that has no
    # trailing newline keeps its last character.
    record = line.chomp
    file_timestamp = record[0..13].to_i
    file_url = record[15..-1]

    # Segments after "scheme://host". Array#[] returns nil when the URL is
    # missing or has fewer than three '/'-separated parts, so check that
    # BEFORE calling join; otherwise a malformed line raises NoMethodError
    # instead of being reported.
    path_segments = file_url.to_s.split('/')[3..-1]
    if path_segments.nil?
      puts "Malformed file url, ignoring: #{file_url}"
      next
    end

    file_id = path_segments.join('/')
    file_id_and_timestamp = CGI.unescape([file_timestamp, file_id].join('/'))
    # tidy_bytes is not defined in this block; presumably a project-level
    # String extension that repairs invalid byte sequences -- verify.
    file_id_and_timestamp = file_id_and_timestamp.tidy_bytes unless file_id_and_timestamp.empty?

    if match_exclude_filter(file_url)
      puts "File url matches exclude filter, ignoring: #{file_url}"
    elsif !match_only_filter(file_url)
      puts "File url doesn't match only filter, ignoring: #{file_url}"
    elsif file_list_curated[file_id_and_timestamp]
      puts "Duplicate file and timestamp combo, ignoring: #{file_id}" if @verbose
    else
      file_list_curated[file_id_and_timestamp] = { file_url: file_url, timestamp: file_timestamp }
    end
  end
  puts "file_list_curated: #{file_list_curated.count}"
  file_list_curated
end
158
+
159
+
130
160
  def get_file_list_by_timestamp
131
- file_list_curated = get_file_list_curated
132
- file_list_curated = file_list_curated.sort_by { |k,v| v[:timestamp] }.reverse
133
- file_list_curated.map do |file_remote_info|
134
- file_remote_info[1][:file_id] = file_remote_info[0]
135
- file_remote_info[1]
161
+ if @all_timestamps
162
+ file_list_curated = get_file_list_all_timestamps
163
+ file_list_curated.map do |file_remote_info|
164
+ file_remote_info[1][:file_id] = file_remote_info[0]
165
+ file_remote_info[1]
166
+ end
167
+ else
168
+ file_list_curated = get_file_list_curated
169
+ file_list_curated = file_list_curated.sort_by { |k,v| v[:timestamp] }.reverse
170
+ file_list_curated.map do |file_remote_info|
171
+ file_remote_info[1][:file_id] = file_remote_info[0]
172
+ file_remote_info[1]
173
+ end
136
174
  end
137
175
  end
138
176
 
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: wayback_machine_downloader
3
3
  version: !ruby/object:Gem::Version
4
- version: 2.1.1
4
+ version: 2.2.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - hartator
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2017-06-12 00:00:00.000000000 Z
11
+ date: 2017-10-27 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rake