wayback_machine_downloader 1.0.0 → 1.1.0

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: a09bb38fc8d8a248fe84bd05eda03a92b49cc16e
-  data.tar.gz: 13d4bbb5e2041c3414a568afdbe719e475e57039
+  metadata.gz: 88515e4c80ca24904b31eb4261287894176a35a4
+  data.tar.gz: b7d82944989a68cb64c84fc78bc7bbbcce15d58b
 SHA512:
-  metadata.gz: 7070199a5c9935b6617d7c5a390756ca517acbbb28a193619e192a8c31b6b3cdaa83bbbbae3fbebfb4ce379af4bd2a0e12dd7969739b6cbcf00f357a997a9ded
-  data.tar.gz: b17716daad7329c37ff9a2e876bf443aed97878c69e41ac0143c9040955f67269070f5193f26ac7093050e9edc9def8084223c02805fdf3ebf38ba53b18bf554
+  metadata.gz: 3567625f16ff6b38ce3da0b0e720f1a20a16ee2aef34ab5085eaa320dba72f7fad1e42e644704af6ff70506c24fc778fd690a2268386eb226e43010bf7724029
+  data.tar.gz: cd81b5a1b75a1e077806966c9d253d171167d81c09b49cfcdde8e315193142907a49a6d99e062ee4925aeaf9885621d2195d5d358c9e9e0f1bf2ae4ef55df26f
bin/wayback_machine_downloader CHANGED
@@ -14,7 +14,7 @@ option_parser = OptionParser.new do |opts|
   opts.separator ""
   opts.separator "Optional options:"
 
-  opts.on("-d", "--directory PATH", String, "Directory to save the downloaded files to. Default is ./websites/ plus the domain name.") do |t|
+  opts.on("-d", "--directory PATH", String, "Directory to save the downloaded files into\n\t\t\t\t Default is ./websites/ plus the domain name") do |t|
     options[:directory] = t
   end
 
@@ -26,11 +26,11 @@ option_parser = OptionParser.new do |opts|
     options[:to_timestamp] = t
   end
 
-  opts.on("-o", "--only ONLY_FILTER", String, "Restrict downloading to urls that match this filter (use // notation for the filter to be treated as a regex)") do |t|
+  opts.on("-o", "--only ONLY_FILTER", String, "Restrict downloading to urls that match this filter\n\t\t\t\t (use // notation for the filter to be treated as a regex)") do |t|
     options[:only_filter] = t
   end
 
-  opts.on("-x", "--exclude EXCLUDE_FILTER", String, "Skip downloading of urls that match this filter (use // notation for the filter to be treated as a regex)") do |t|
+  opts.on("-x", "--exclude EXCLUDE_FILTER", String, "Skip downloading of urls that match this filter\n\t\t\t\t (use // notation for the filter to be treated as a regex)") do |t|
     options[:exclude_filter] = t
   end
 
@@ -38,11 +38,15 @@ option_parser = OptionParser.new do |opts|
     options[:all] = true
   end
 
-  opts.on("-c", "--concurrency NUMBER", Integer, "Number of multiple files to download at a time. Default is one file at a time. (ie. 20)") do |t|
+  opts.on("-c", "--concurrency NUMBER", Integer, "Number of multiple files to download at a time\n\t\t\t\t Default is one file at a time (ie. 20)") do |t|
     options[:threads_count] = t
   end
 
-  opts.on("-l", "--list", "Only list file urls in a JSON format with the archived timestamps. Won't download anything.") do |t|
+  opts.on("-p", "--maximum-snapshot NUMBER", Integer, "Maximum snapshot pages to consider (Default is 100)\n\t\t\t\t Count an average of 150,000 snapshots per page") do |t|
+    options[:maximum_pages] = t
+  end
+
+  opts.on("-l", "--list", "Only list file urls in a JSON format with the archived timestamps, won't download anything.") do |t|
     options[:list] = true
   end
 
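For illustration, a hedged example of the new flag in use (the domain and the numbers are placeholder values, not from this diff): capping the snapshot crawl at 10 CDX pages while downloading 20 files at a time.

  wayback_machine_downloader http://example.com --maximum-snapshot 10 --concurrency 20
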
lib/wayback_machine_downloader/archive_api.rb CHANGED
@@ -1,20 +1,20 @@
 module ArchiveAPI
 
-  def get_raw_list_from_api url
+  def get_raw_list_from_api url, page_index
     request_url = "http://web.archive.org/cdx/search/cdx?url="
     request_url += url
-    request_url += parameters_for_api
-    request_uri = URI.parse request_url
-    response = Net::HTTP.get_response request_uri
-    response.body
+    request_url += parameters_for_api page_index
+    request_uri = URI.parse request_url
+    response = Net::HTTP.get_response request_uri
+    response.body
   end
 
-  def parameters_for_api
-    parameters = "&fl=timestamp,original&gzip=false"
+  def parameters_for_api page_index
+    parameters = "&fl=timestamp,original&collapse=digest&gzip=false"
     if @all
-      parameters += "&collapse=digest"
+      parameters += ""
     else
-      parameters += "&filter=statuscode:200&collapse=original"
+      parameters += "&filter=statuscode:200"
     end
     if @from_timestamp and @from_timestamp != 0
       parameters += "&from=" + @from_timestamp.to_s
@@ -22,6 +22,9 @@ module ArchiveAPI
     if @to_timestamp and @to_timestamp != 0
       parameters += "&to=" + @to_timestamp.to_s
     end
+    if page_index
+      parameters += "&page=#{page_index}"
+    end
     parameters
   end
 
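For illustration, a minimal Ruby sketch (assumed values; example.com and the page index 2 are placeholders) of the CDX request these two methods assemble after this change:

  # Sketch only: extend a bare object with ArchiveAPI (assumed already loaded)
  # and set the instance variables the module reads.
  client = Object.new
  client.extend(ArchiveAPI)
  client.instance_variable_set(:@all, false)            # keep the statuscode:200 filter
  client.instance_variable_set(:@from_timestamp, nil)
  client.instance_variable_set(:@to_timestamp, nil)
  puts "http://web.archive.org/cdx/search/cdx?url=example.com/*" +
       client.parameters_for_api(2)
  # => http://web.archive.org/cdx/search/cdx?url=example.com/*&fl=timestamp,original&collapse=digest&gzip=false&filter=statuscode:200&page=2
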
lib/wayback_machine_downloader.rb CHANGED
@@ -13,9 +13,9 @@ class WaybackMachineDownloader
 
   include ArchiveAPI
 
-  VERSION = "1.0.0"
+  VERSION = "1.1.0"
 
-  attr_accessor :base_url, :directory, :from_timestamp, :to_timestamp, :only_filter, :exclude_filter, :all, :list, :threads_count
+  attr_accessor :base_url, :directory, :from_timestamp, :to_timestamp, :only_filter, :exclude_filter, :all, :list, :maximum_pages, :threads_count
 
   def initialize params
     @base_url = params[:base_url]
@@ -26,6 +26,7 @@ class WaybackMachineDownloader
     @exclude_filter = params[:exclude_filter]
     @all = params[:all]
     @list = params[:list]
+    @maximum_pages = params[:maximum_pages] ? params[:maximum_pages].to_i : 100
     @threads_count = params[:threads_count].to_i
   end
 
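For illustration, a minimal sketch (assumed usage; the domain and the numbers are placeholders) of constructing the downloader with the new :maximum_pages option, which falls back to 100 when omitted:

  require 'wayback_machine_downloader'

  downloader = WaybackMachineDownloader.new(
    base_url: 'http://example.com',
    maximum_pages: 10,   # consider at most 10 CDX snapshot pages instead of the default 100
    threads_count: 1
  )
  downloader.download_files
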
@@ -75,32 +76,44 @@ class WaybackMachineDownloader
     end
   end
 
+  def get_all_snapshots_to_consider
+    print "Getting snapshot pages"
+    snapshot_list_to_consider = ""
+    snapshot_list_to_consider += get_raw_list_from_api(@base_url, nil)
+    print "."
+    @maximum_pages.times do |page_index|
+      snapshot_list = get_raw_list_from_api(@base_url + '/*', page_index)
+      break if snapshot_list.empty?
+      snapshot_list_to_consider += snapshot_list
+      print "."
+    end
+    puts " found #{snapshot_list_to_consider.lines.count} snapshots to consider."
+    puts
+    snapshot_list_to_consider
+  end
+
   def get_file_list_curated
-    index_file_list_raw = get_raw_list_from_api(@base_url)
-    all_file_list_raw = get_raw_list_from_api(@base_url + '/*')
     file_list_curated = Hash.new
-    [index_file_list_raw, all_file_list_raw].each do |file|
-      file.each_line do |line|
-        next unless line.include?('/')
-        file_timestamp = line[0..13].to_i
-        file_url = line[15..-2]
-        file_id = file_url.split('/')[3..-1].join('/')
-        file_id = CGI::unescape file_id
-        file_id = file_id.tidy_bytes unless file_id == ""
-        if file_id.nil?
-          puts "Malformed file url, ignoring: #{file_url}"
-        else
-          if match_exclude_filter(file_url)
-            puts "File url matches exclude filter, ignoring: #{file_url}"
-          elsif not match_only_filter(file_url)
-            puts "File url doesn't match only filter, ignoring: #{file_url}"
-          elsif file_list_curated[file_id]
-            unless file_list_curated[file_id][:timestamp] > file_timestamp
-              file_list_curated[file_id] = {file_url: file_url, timestamp: file_timestamp}
-            end
-          else
+    get_all_snapshots_to_consider.each_line do |line|
+      next unless line.include?('/')
+      file_timestamp = line[0..13].to_i
+      file_url = line[15..-2]
+      file_id = file_url.split('/')[3..-1].join('/')
+      file_id = CGI::unescape file_id
+      file_id = file_id.tidy_bytes unless file_id == ""
+      if file_id.nil?
+        puts "Malformed file url, ignoring: #{file_url}"
+      else
+        if match_exclude_filter(file_url)
+          puts "File url matches exclude filter, ignoring: #{file_url}"
+        elsif not match_only_filter(file_url)
+          puts "File url doesn't match only filter, ignoring: #{file_url}"
+        elsif file_list_curated[file_id]
+          unless file_list_curated[file_id][:timestamp] > file_timestamp
             file_list_curated[file_id] = {file_url: file_url, timestamp: file_timestamp}
           end
+        else
+          file_list_curated[file_id] = {file_url: file_url, timestamp: file_timestamp}
         end
       end
     end
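For illustration, a minimal sketch (the snapshot line is an invented example) of how the parsing above slices each CDX result line, given fl=timestamp,original:

  line = "20160918123456 http://example.com/assets/app.js\n"

  file_timestamp = line[0..13].to_i   # => 20160918123456, the 14-digit archive timestamp
  file_url = line[15..-2]             # => "http://example.com/assets/app.js" (drops the trailing newline)
  file_id = file_url.split('/')[3..-1].join('/')   # => "assets/app.js", the path after the host
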
@@ -126,7 +139,7 @@ class WaybackMachineDownloader
 
   def download_files
     start_time = Time.now
-    puts "Downloading #{@base_url} to #{backup_path} from Wayback Machine..."
+    puts "Downloading #{@base_url} to #{backup_path} from Wayback Machine archives."
     puts
 
     if file_list_by_timestamp.count == 0
@@ -139,6 +152,8 @@ class WaybackMachineDownloader
       puts "\t* Exclude filter too wide (#{exclude_filter.to_s})" if @exclude_filter
       return
     end
+
+    puts "#{file_list_by_timestamp.count} files to download:"
 
     threads = []
     @processed_file_count = 0
metadata CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: wayback_machine_downloader
 version: !ruby/object:Gem::Version
-  version: 1.0.0
+  version: 1.1.0
 platform: ruby
 authors:
 - hartator
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2016-09-18 00:00:00.000000000 Z
+date: 2016-09-24 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: rake