wayback_machine_downloader 0.1.8 → 0.1.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 785e30db377ae0b9d7a51a0de48bc1ac6f84de2e
4
- data.tar.gz: 3f88ba144987f62fed74c0271ef6e2f55985f280
3
+ metadata.gz: 20560ddf17786139b6ad1bd4e01d4b5e4e41dafe
4
+ data.tar.gz: 3d8357785e11e81b20123a97e55f084a0442f03b
5
5
  SHA512:
6
- metadata.gz: 7acc4a0d9c18ff2625cbada39ded4b2f5b919afb10d4bda9f7f8dc010fd650b374f8288e8d482c06af9bceca418d65739ab00312f49cb45f72556a5faa70b3b9
7
- data.tar.gz: e945379f4517828d60e3d5ccb515f5a47a54d2961b417c8adec3d3193f6c009d09f7e7d945596aab0f5a9de32c367465bc1fc40542c2a6bf87f8c10135ddc0bc
6
+ metadata.gz: 333c6bcfcce0ab972d4dc927af130eb9992ebff1c12a45caee68a109aa459c76a0f6a4ace5058612742559b731876cd3af64f3aa2402b790ec756e2dc54a179a
7
+ data.tar.gz: 3dddac44a54eaf44a396f94c5428fa68da97a2e92bd2ec1c25f55eb569a5b2f64cd3082e429393be3893be29779faa4387a74952cd99fef09161d1a8dc070148
@@ -1,6 +1,6 @@
1
1
  #!/usr/bin/env ruby
2
2
 
3
- require 'wayback_machine_downloader'
3
+ require_relative '../lib/wayback_machine_downloader'
4
4
  require 'optparse'
5
5
 
6
6
  options = {}
@@ -3,7 +3,7 @@ require 'fileutils'
3
3
 
4
4
  class WaybackMachineDownloader
5
5
 
6
- VERSION = "0.1.8"
6
+ VERSION = "0.1.9"
7
7
 
8
8
  attr_accessor :base_url, :timestamp
9
9
 
@@ -21,35 +21,48 @@ class WaybackMachineDownloader
21
21
  end
22
22
 
23
23
  def get_file_list_curated
24
- file_list_raw = open "http://web.archive.org/cdx/search/xd?url=#{@base_url}/*"
24
+ index_file_list_raw = open "http://web.archive.org/cdx/search/xd?url=#{@base_url}"
25
+ all_file_list_raw = open "http://web.archive.org/cdx/search/xd?url=#{@base_url}/*"
25
26
  file_list_curated = Hash.new
26
- file_list_raw.each_line do |line|
27
- line = line.split(' ')
28
- file_timestamp = line[1].to_i
29
- file_url = line[2]
30
- file_id = file_url.split('/')[3..-1].join('/')
31
- file_id = URI.unescape file_id
32
- if @timestamp == 0 or file_timestamp <= @timestamp
33
- if file_list_curated[file_id]
34
- unless file_list_curated[file_id][:timestamp] > file_timestamp
27
+ [index_file_list_raw, all_file_list_raw].each do |file|
28
+ file.each_line do |line|
29
+ line = line.split(' ')
30
+ file_timestamp = line[1].to_i
31
+ file_url = line[2]
32
+ file_id = file_url.split('/')[3..-1].join('/')
33
+ file_id = URI.unescape file_id
34
+ if @timestamp == 0 or file_timestamp <= @timestamp
35
+ if file_list_curated[file_id]
36
+ unless file_list_curated[file_id][:timestamp] > file_timestamp
37
+ file_list_curated[file_id] = {file_url: file_url, timestamp: file_timestamp}
38
+ end
39
+ else
35
40
  file_list_curated[file_id] = {file_url: file_url, timestamp: file_timestamp}
36
41
  end
37
- else
38
- file_list_curated[file_id] = {file_url: file_url, timestamp: file_timestamp}
39
42
  end
40
43
  end
41
44
  end
42
45
  file_list_curated
43
46
  end
44
47
 
48
+ def file_list_by_timestamp
49
+ file_list_curated = get_file_list_curated
50
+ file_list_curated = file_list_curated.sort_by { |k,v| v[:timestamp] }.reverse
51
+ file_list_curated.map do |file_remote_info|
52
+ file_remote_info[1][:file_id] = file_remote_info[0]
53
+ file_remote_info[1]
54
+ end
55
+ end
56
+
45
57
  def download_files
46
58
  puts "Downlading #{@base_url} from Wayback Machine..."
47
59
  puts
48
60
  file_list_curated = get_file_list_curated
49
61
  count = 0
50
- file_list_curated.each do |file_id, file_remote_info|
62
+ file_list_by_timestamp.each do |file_remote_info|
51
63
  count += 1
52
64
  file_url = file_remote_info[:file_url]
65
+ file_id = file_remote_info[:file_id]
53
66
  file_path_elements = file_id.split('/')
54
67
  if file_id == ""
55
68
  dir_path = backup_path
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: wayback_machine_downloader
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.8
4
+ version: 0.1.9
5
5
  platform: ruby
6
6
  authors:
7
7
  - hartator