wayback_machine_downloader 1.0.0 → 1.1.0

This diff shows the changes between two publicly released versions of this package, as they appear in their public registry. It is provided for informational purposes only.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: a09bb38fc8d8a248fe84bd05eda03a92b49cc16e
-  data.tar.gz: 13d4bbb5e2041c3414a568afdbe719e475e57039
+  metadata.gz: 88515e4c80ca24904b31eb4261287894176a35a4
+  data.tar.gz: b7d82944989a68cb64c84fc78bc7bbbcce15d58b
 SHA512:
-  metadata.gz: 7070199a5c9935b6617d7c5a390756ca517acbbb28a193619e192a8c31b6b3cdaa83bbbbae3fbebfb4ce379af4bd2a0e12dd7969739b6cbcf00f357a997a9ded
-  data.tar.gz: b17716daad7329c37ff9a2e876bf443aed97878c69e41ac0143c9040955f67269070f5193f26ac7093050e9edc9def8084223c02805fdf3ebf38ba53b18bf554
+  metadata.gz: 3567625f16ff6b38ce3da0b0e720f1a20a16ee2aef34ab5085eaa320dba72f7fad1e42e644704af6ff70506c24fc778fd690a2268386eb226e43010bf7724029
+  data.tar.gz: cd81b5a1b75a1e077806966c9d253d171167d81c09b49cfcdde8e315193142907a49a6d99e062ee4925aeaf9885621d2195d5d358c9e9e0f1bf2ae4ef55df26f
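These published checksums can be re-derived locally. A minimal sketch in Ruby, assuming the .gem archive has already been fetched and unpacked (a .gem file is a plain tar containing metadata.gz and data.tar.gz, so e.g. tar -xf wayback_machine_downloader-1.1.0.gem first; the local paths are hypothetical):

    require 'digest'

    # Hash the two members extracted from the .gem archive and compare the
    # output against the SHA1/SHA512 entries in checksums.yaml above.
    puts Digest::SHA1.file('metadata.gz').hexdigest
    puts Digest::SHA512.file('data.tar.gz').hexdigest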
bin/wayback_machine_downloader CHANGED
@@ -14,7 +14,7 @@ option_parser = OptionParser.new do |opts|
   opts.separator ""
   opts.separator "Optional options:"
 
-  opts.on("-d", "--directory PATH", String, "Directory to save the downloaded files to. Default is ./websites/ plus the domain name.") do |t|
+  opts.on("-d", "--directory PATH", String, "Directory to save the downloaded files into\n\t\t\t\t Default is ./websites/ plus the domain name") do |t|
     options[:directory] = t
   end
 
@@ -26,11 +26,11 @@ option_parser = OptionParser.new do |opts|
     options[:to_timestamp] = t
   end
 
-  opts.on("-o", "--only ONLY_FILTER", String, "Restrict downloading to urls that match this filter (use // notation for the filter to be treated as a regex)") do |t|
+  opts.on("-o", "--only ONLY_FILTER", String, "Restrict downloading to urls that match this filter\n\t\t\t\t (use // notation for the filter to be treated as a regex)") do |t|
     options[:only_filter] = t
   end
 
-  opts.on("-x", "--exclude EXCLUDE_FILTER", String, "Skip downloading of urls that match this filter (use // notation for the filter to be treated as a regex)") do |t|
+  opts.on("-x", "--exclude EXCLUDE_FILTER", String, "Skip downloading of urls that match this filter\n\t\t\t\t (use // notation for the filter to be treated as a regex)") do |t|
     options[:exclude_filter] = t
   end
 
@@ -38,11 +38,15 @@ option_parser = OptionParser.new do |opts|
     options[:all] = true
   end
 
-  opts.on("-c", "--concurrency NUMBER", Integer, "Number of multiple files to dowload at a time. Default is one file at a time. (ie. 20)") do |t|
+  opts.on("-c", "--concurrency NUMBER", Integer, "Number of multiple files to dowload at a time\n\t\t\t\t Default is one file at a time (ie. 20)") do |t|
    options[:threads_count] = t
   end
 
-  opts.on("-l", "--list", "Only list file urls in a JSON format with the archived timestamps. Won't download anything.") do |t|
+  opts.on("-p", "--maximum-snapshot NUMBER", Integer, "Maximum snapshot pages to consider (Default is 100)\n\t\t\t\t Count an average of 150,000 snapshots per page ") do |t|
+    options[:maximum_pages] = t
+  end
+
+  opts.on("-l", "--list", "Only list file urls in a JSON format with the archived timestamps, won't download anything.") do |t|
     options[:list] = true
   end
 
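The new -p/--maximum-snapshot flag populates options[:maximum_pages], which WaybackMachineDownloader#initialize picks up further down in this diff. A minimal sketch of the equivalent programmatic call, using only parameters visible in this diff (the base_url value is a placeholder):

    require 'wayback_machine_downloader'

    # maximum_pages caps how many CDX result pages are fetched;
    # it falls back to 100 when omitted (see initialize below).
    downloader = WaybackMachineDownloader.new(
      base_url: 'http://example.com',
      maximum_pages: 5,
      threads_count: 1
    )
    downloader.download_files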
lib/wayback_machine_downloader/archive_api.rb CHANGED
@@ -1,20 +1,20 @@
 module ArchiveAPI
 
-  def get_raw_list_from_api url
+  def get_raw_list_from_api url, page_index
     request_url = "http://web.archive.org/cdx/search/cdx?url="
     request_url += url
-    request_url += parameters_for_api
-    request_uri = URI.parse request_url
-    response = Net::HTTP.get_response request_uri
-    response.body
+    request_url += parameters_for_api page_index
+    request_uri = URI.parse request_url
+    response = Net::HTTP.get_response request_uri
+    response.body
   end
 
-  def parameters_for_api
-    parameters = "&fl=timestamp,original&gzip=false"
+  def parameters_for_api page_index
+    parameters = "&fl=timestamp,original&collapse=digest&gzip=false"
     if @all
-      parameters += "&collapse=digest"
+      parameters += ""
     else
-      parameters += "&filter=statuscode:200&collapse=original"
+      parameters += "&filter=statuscode:200"
     end
     if @from_timestamp and @from_timestamp != 0
       parameters += "&from=" + @from_timestamp.to_s
@@ -22,6 +22,9 @@ module ArchiveAPI
     if @to_timestamp and @to_timestamp != 0
       parameters += "&to=" + @to_timestamp.to_s
     end
+    if page_index
+      parameters += "&page=#{page_index}"
+    end
     parameters
   end
 
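Taken together, get_raw_list_from_api now issues one CDX request per page. For page 0 of a default (non --all) run, the URL assembled above comes out as follows, with example.com standing in for the real domain:

    http://web.archive.org/cdx/search/cdx?url=example.com/*&fl=timestamp,original&collapse=digest&gzip=false&filter=statuscode:200&page=0

Note the guard is plain "if page_index": in Ruby only nil and false are falsy, so page 0 still appends &page=0; only the initial call that passes page_index = nil omits the parameter.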
lib/wayback_machine_downloader.rb CHANGED
@@ -13,9 +13,9 @@ class WaybackMachineDownloader
 
   include ArchiveAPI
 
-  VERSION = "1.0.0"
+  VERSION = "1.1.0"
 
-  attr_accessor :base_url, :directory, :from_timestamp, :to_timestamp, :only_filter, :exclude_filter, :all, :list, :threads_count
+  attr_accessor :base_url, :directory, :from_timestamp, :to_timestamp, :only_filter, :exclude_filter, :all, :list, :maximum_pages, :threads_count
 
   def initialize params
     @base_url = params[:base_url]
@@ -26,6 +26,7 @@ class WaybackMachineDownloader
     @exclude_filter = params[:exclude_filter]
     @all = params[:all]
     @list = params[:list]
+    @maximum_pages = params[:maximum_pages] ? params[:maximum_pages].to_i : 100
     @threads_count = params[:threads_count].to_i
   end
 
@@ -75,32 +76,44 @@ class WaybackMachineDownloader
     end
   end
 
+  def get_all_snapshots_to_consider
+    print "Getting snapshot pages"
+    snapshot_list_to_consider = ""
+    snapshot_list_to_consider += get_raw_list_from_api(@base_url, nil)
+    print "."
+    @maximum_pages.times do |page_index|
+      snapshot_list = get_raw_list_from_api(@base_url + '/*', page_index)
+      break if snapshot_list.empty?
+      snapshot_list_to_consider += snapshot_list
+      print "."
+    end
+    puts " found #{snapshot_list_to_consider.lines.count} snaphots to consider."
+    puts
+    snapshot_list_to_consider
+  end
+
   def get_file_list_curated
-    index_file_list_raw = get_raw_list_from_api(@base_url)
-    all_file_list_raw = get_raw_list_from_api(@base_url + '/*')
     file_list_curated = Hash.new
-    [index_file_list_raw, all_file_list_raw].each do |file|
-      file.each_line do |line|
-        next unless line.include?('/')
-        file_timestamp = line[0..13].to_i
-        file_url = line[15..-2]
-        file_id = file_url.split('/')[3..-1].join('/')
-        file_id = CGI::unescape file_id
-        file_id = file_id.tidy_bytes unless file_id == ""
-        if file_id.nil?
-          puts "Malformed file url, ignoring: #{file_url}"
-        else
-          if match_exclude_filter(file_url)
-            puts "File url matches exclude filter, ignoring: #{file_url}"
-          elsif not match_only_filter(file_url)
-            puts "File url doesn't match only filter, ignoring: #{file_url}"
-          elsif file_list_curated[file_id]
-            unless file_list_curated[file_id][:timestamp] > file_timestamp
-              file_list_curated[file_id] = {file_url: file_url, timestamp: file_timestamp}
-            end
-          else
+    get_all_snapshots_to_consider.each_line do |line|
+      next unless line.include?('/')
+      file_timestamp = line[0..13].to_i
+      file_url = line[15..-2]
+      file_id = file_url.split('/')[3..-1].join('/')
+      file_id = CGI::unescape file_id
+      file_id = file_id.tidy_bytes unless file_id == ""
+      if file_id.nil?
+        puts "Malformed file url, ignoring: #{file_url}"
+      else
+        if match_exclude_filter(file_url)
+          puts "File url matches exclude filter, ignoring: #{file_url}"
+        elsif not match_only_filter(file_url)
+          puts "File url doesn't match only filter, ignoring: #{file_url}"
+        elsif file_list_curated[file_id]
+          unless file_list_curated[file_id][:timestamp] > file_timestamp
            file_list_curated[file_id] = {file_url: file_url, timestamp: file_timestamp}
          end
+        else
+          file_list_curated[file_id] = {file_url: file_url, timestamp: file_timestamp}
        end
      end
    end
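The per-line parsing above depends on the fixed layout of a CDX row requested via &fl=timestamp,original: a 14-digit timestamp, one space, then the original URL. A short sketch with a made-up row:

    # Hypothetical CDX row (timestamp, space, original URL, trailing newline)
    line = "20160918012345 http://example.com/assets/app.js\n"

    line[0..13].to_i    # => 20160918012345, the first 14 characters
    line[15..-2]        # => "http://example.com/assets/app.js", newline dropped
    "http://example.com/assets/app.js".split('/')[3..-1].join('/')
                        # => "assets/app.js", scheme and host removed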
@@ -126,7 +139,7 @@ class WaybackMachineDownloader
 
   def download_files
     start_time = Time.now
-    puts "Downloading #{@base_url} to #{backup_path} from Wayback Machine..."
+    puts "Downloading #{@base_url} to #{backup_path} from Wayback Machine archives."
    puts
 
    if file_list_by_timestamp.count == 0
@@ -139,6 +152,8 @@ class WaybackMachineDownloader
      puts "\t* Exclude filter too wide (#{exclude_filter.to_s})" if @exclude_filter
      return
    end
+
+    puts "#{file_list_by_timestamp.count} files to download:"
 
    threads = []
    @processed_file_count = 0
metadata CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: wayback_machine_downloader
 version: !ruby/object:Gem::Version
-  version: 1.0.0
+  version: 1.1.0
 platform: ruby
 authors:
 - hartator
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2016-09-18 00:00:00.000000000 Z
+date: 2016-09-24 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: rake