wayback_machine_downloader 0.1.8 → 0.1.9

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 785e30db377ae0b9d7a51a0de48bc1ac6f84de2e
4
- data.tar.gz: 3f88ba144987f62fed74c0271ef6e2f55985f280
3
+ metadata.gz: 20560ddf17786139b6ad1bd4e01d4b5e4e41dafe
4
+ data.tar.gz: 3d8357785e11e81b20123a97e55f084a0442f03b
5
5
  SHA512:
6
- metadata.gz: 7acc4a0d9c18ff2625cbada39ded4b2f5b919afb10d4bda9f7f8dc010fd650b374f8288e8d482c06af9bceca418d65739ab00312f49cb45f72556a5faa70b3b9
7
- data.tar.gz: e945379f4517828d60e3d5ccb515f5a47a54d2961b417c8adec3d3193f6c009d09f7e7d945596aab0f5a9de32c367465bc1fc40542c2a6bf87f8c10135ddc0bc
6
+ metadata.gz: 333c6bcfcce0ab972d4dc927af130eb9992ebff1c12a45caee68a109aa459c76a0f6a4ace5058612742559b731876cd3af64f3aa2402b790ec756e2dc54a179a
7
+ data.tar.gz: 3dddac44a54eaf44a396f94c5428fa68da97a2e92bd2ec1c25f55eb569a5b2f64cd3082e429393be3893be29779faa4387a74952cd99fef09161d1a8dc070148
@@ -1,6 +1,6 @@
1
1
  #!/usr/bin/env ruby
2
2
 
3
- require 'wayback_machine_downloader'
3
+ require_relative '../lib/wayback_machine_downloader'
4
4
  require 'optparse'
5
5
 
6
6
  options = {}
@@ -3,7 +3,7 @@ require 'fileutils'
3
3
 
4
4
  class WaybackMachineDownloader
5
5
 
6
- VERSION = "0.1.8"
6
+ VERSION = "0.1.9"
7
7
 
8
8
  attr_accessor :base_url, :timestamp
9
9
 
@@ -21,35 +21,48 @@ class WaybackMachineDownloader
21
21
  end
22
22
 
23
23
  def get_file_list_curated
24
- file_list_raw = open "http://web.archive.org/cdx/search/xd?url=#{@base_url}/*"
24
+ index_file_list_raw = open "http://web.archive.org/cdx/search/xd?url=#{@base_url}"
25
+ all_file_list_raw = open "http://web.archive.org/cdx/search/xd?url=#{@base_url}/*"
25
26
  file_list_curated = Hash.new
26
- file_list_raw.each_line do |line|
27
- line = line.split(' ')
28
- file_timestamp = line[1].to_i
29
- file_url = line[2]
30
- file_id = file_url.split('/')[3..-1].join('/')
31
- file_id = URI.unescape file_id
32
- if @timestamp == 0 or file_timestamp <= @timestamp
33
- if file_list_curated[file_id]
34
- unless file_list_curated[file_id][:timestamp] > file_timestamp
27
+ [index_file_list_raw, all_file_list_raw].each do |file|
28
+ file.each_line do |line|
29
+ line = line.split(' ')
30
+ file_timestamp = line[1].to_i
31
+ file_url = line[2]
32
+ file_id = file_url.split('/')[3..-1].join('/')
33
+ file_id = URI.unescape file_id
34
+ if @timestamp == 0 or file_timestamp <= @timestamp
35
+ if file_list_curated[file_id]
36
+ unless file_list_curated[file_id][:timestamp] > file_timestamp
37
+ file_list_curated[file_id] = {file_url: file_url, timestamp: file_timestamp}
38
+ end
39
+ else
35
40
  file_list_curated[file_id] = {file_url: file_url, timestamp: file_timestamp}
36
41
  end
37
- else
38
- file_list_curated[file_id] = {file_url: file_url, timestamp: file_timestamp}
39
42
  end
40
43
  end
41
44
  end
42
45
  file_list_curated
43
46
  end
44
47
 
48
+ def file_list_by_timestamp
49
+ file_list_curated = get_file_list_curated
50
+ file_list_curated = file_list_curated.sort_by { |k,v| v[:timestamp] }.reverse
51
+ file_list_curated.map do |file_remote_info|
52
+ file_remote_info[1][:file_id] = file_remote_info[0]
53
+ file_remote_info[1]
54
+ end
55
+ end
56
+
45
57
  def download_files
46
58
  puts "Downlading #{@base_url} from Wayback Machine..."
47
59
  puts
48
60
  file_list_curated = get_file_list_curated
49
61
  count = 0
50
- file_list_curated.each do |file_id, file_remote_info|
62
+ file_list_by_timestamp.each do |file_remote_info|
51
63
  count += 1
52
64
  file_url = file_remote_info[:file_url]
65
+ file_id = file_remote_info[:file_id]
53
66
  file_path_elements = file_id.split('/')
54
67
  if file_id == ""
55
68
  dir_path = backup_path
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: wayback_machine_downloader
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.8
4
+ version: 0.1.9
5
5
  platform: ruby
6
6
  authors:
7
7
  - hartator