wayback_machine_downloader 2.1.1 → 2.2.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 48f524cedc0e9f66c7b0acca132a71557a327ea2
4
- data.tar.gz: 1d70bb2a76cd07c82c08674fdc96b543caec48c0
3
+ metadata.gz: 77c3cb9b38d076dbaa7c30790c828ed63dacf2a6
4
+ data.tar.gz: 1e47ae74b6d2c0bbbe2ae99bc4622115c1a6496c
5
5
  SHA512:
6
- metadata.gz: 26eb05cbeebd911502bd01513535c7cc2d4ad0fe3850adc0205ca4f649351e56855af66915d86c501fb8be64963fe1d409d013d8afcd24064cc15673b2cc0854
7
- data.tar.gz: 0dbbd54b4b4ab231adcae908bbf6cd3865768590263e767fe5e45fb3a9d70676c337f79aba576378272ddc14647ecd06fc26820fc1dec8cb52704aa6740582b7
6
+ metadata.gz: 0326d70c1cf269099418b37302b01b22433931e217cf6d37b1bfd9fd6bcd1ced45e80ff2d0aad72d7c695ac4c25fa59d5e512a4b4674f182d4602647602ccb23
7
+ data.tar.gz: 9a0bf4eed69041ce156075a7f206237f9c1f5974b63fcb23370c19139a996d5fd143c1ff1fbe4640ce9cf5d36e36ed2e20916db504cbff9d0949f72f4086626d
@@ -18,6 +18,10 @@ option_parser = OptionParser.new do |opts|
18
18
  options[:directory] = t
19
19
  end
20
20
 
21
+ opts.on("-s", "--all-timestamps", "Download all snapshots/timestamps for a given website") do |t|
22
+ options[:all_timestamps] = true
23
+ end
24
+
21
25
  opts.on("-f", "--from TIMESTAMP", Integer, "Only files on or after timestamp supplied (ie. 20060716231334)") do |t|
22
26
  options[:from_timestamp] = t
23
27
  end
@@ -14,9 +14,9 @@ class WaybackMachineDownloader
14
14
 
15
15
  include ArchiveAPI
16
16
 
17
- VERSION = "2.1.1"
17
+ VERSION = "2.2.0"
18
18
 
19
- attr_accessor :base_url, :exact_url, :directory,
19
+ attr_accessor :base_url, :exact_url, :directory, :all_timestamps,
20
20
  :from_timestamp, :to_timestamp, :only_filter, :exclude_filter,
21
21
  :all, :maximum_pages, :threads_count
22
22
 
@@ -24,6 +24,7 @@ class WaybackMachineDownloader
24
24
  @base_url = params[:base_url]
25
25
  @exact_url = params[:exact_url]
26
26
  @directory = params[:directory]
27
+ @all_timestamps = params[:all_timestamps]
27
28
  @from_timestamp = params[:from_timestamp].to_i
28
29
  @to_timestamp = params[:to_timestamp].to_i
29
30
  @only_filter = params[:only_filter]
@@ -127,12 +128,49 @@ class WaybackMachineDownloader
127
128
  file_list_curated
128
129
  end
129
130
 
131
+ def get_file_list_all_timestamps
132
+ file_list_curated = Hash.new
133
+ get_all_snapshots_to_consider.each_line do |line|
134
+ next unless line.include?('/')
135
+ file_timestamp = line[0..13].to_i
136
+ file_url = line[15..-2]
137
+ file_id = file_url.split('/')[3..-1].join('/')
138
+ file_id_and_timestamp = [file_timestamp, file_id].join('/')
139
+ file_id_and_timestamp = CGI::unescape file_id_and_timestamp
140
+ file_id_and_timestamp = file_id_and_timestamp.tidy_bytes unless file_id_and_timestamp == ""
141
+ if file_id.nil?
142
+ puts "Malformed file url, ignoring: #{file_url}"
143
+ else
144
+ if match_exclude_filter(file_url)
145
+ puts "File url matches exclude filter, ignoring: #{file_url}"
146
+ elsif not match_only_filter(file_url)
147
+ puts "File url doesn't match only filter, ignoring: #{file_url}"
148
+ elsif file_list_curated[file_id_and_timestamp]
149
+ puts "Duplicate file and timestamp combo, ignoring: #{file_id}" if @verbose
150
+ else
151
+ file_list_curated[file_id_and_timestamp] = {file_url: file_url, timestamp: file_timestamp}
152
+ end
153
+ end
154
+ end
155
+ puts "file_list_curated: " + file_list_curated.count.to_s
156
+ file_list_curated
157
+ end
158
+
159
+
130
160
  def get_file_list_by_timestamp
131
- file_list_curated = get_file_list_curated
132
- file_list_curated = file_list_curated.sort_by { |k,v| v[:timestamp] }.reverse
133
- file_list_curated.map do |file_remote_info|
134
- file_remote_info[1][:file_id] = file_remote_info[0]
135
- file_remote_info[1]
161
+ if @all_timestamps
162
+ file_list_curated = get_file_list_all_timestamps
163
+ file_list_curated.map do |file_remote_info|
164
+ file_remote_info[1][:file_id] = file_remote_info[0]
165
+ file_remote_info[1]
166
+ end
167
+ else
168
+ file_list_curated = get_file_list_curated
169
+ file_list_curated = file_list_curated.sort_by { |k,v| v[:timestamp] }.reverse
170
+ file_list_curated.map do |file_remote_info|
171
+ file_remote_info[1][:file_id] = file_remote_info[0]
172
+ file_remote_info[1]
173
+ end
136
174
  end
137
175
  end
138
176
 
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: wayback_machine_downloader
3
3
  version: !ruby/object:Gem::Version
4
- version: 2.1.1
4
+ version: 2.2.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - hartator
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2017-06-12 00:00:00.000000000 Z
11
+ date: 2017-10-27 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rake