wayback_machine_downloader 2.0.0 → 2.1.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: e2132b28dea0a03978384a3b337b1107562e644a
4
- data.tar.gz: e8b6421b78d02505a8498c79cd1761ebb28a3290
3
+ metadata.gz: 520f637efbb03d1e3ac87aadf1a937cc132f6c32
4
+ data.tar.gz: 90ec9079f5420153e1b7c149ce5aedb45bd4ba2c
5
5
  SHA512:
6
- metadata.gz: d1d0944e9593aadc02db950aa9826491d727f93c6185f23aac20b24b48da086ed67f2be76d91184d2709610b84160c2665ca4b30bcddfcd4981b6840c988e1d0
7
- data.tar.gz: aa40fb4da67241e972c86631b9390703ec77643b17b7f62ae2cfbffe49f276ff6f77d901aac87df66aca94d6da7ec3d230ceb001aa9b47f697b5ef1c98b4194f
6
+ metadata.gz: 6009139fbad22b7e269582d905956147df5cec565e376cbf1bdbb1125bb906fdea31c6501b3dfd9fa1b35d730444c976bcf187b8c869e8ed7db146ea155ba8fb
7
+ data.tar.gz: b4cff49e64c3ec528b544184e1426b76d1284c7f97ee9c787f370cc305b652f328c53ac48d3fc9bd9c9405b26edceb7a888e7f4a8f061ee704608e95a0642108
@@ -26,6 +26,10 @@ option_parser = OptionParser.new do |opts|
26
26
  options[:to_timestamp] = t
27
27
  end
28
28
 
29
+ opts.on("-e", "--exact_url", String, "Download only the url provied and not the full site") do |t|
30
+ options[:only_filter] = t
31
+ end
32
+
29
33
  opts.on("-o", "--only ONLY_FILTER", String, "Restrict downloading to urls that match this filter", "(use // notation for the filter to be treated as a regex)") do |t|
30
34
  options[:only_filter] = t
31
35
  end
@@ -42,11 +46,11 @@ option_parser = OptionParser.new do |opts|
42
46
  options[:threads_count] = t
43
47
  end
44
48
 
45
- opts.on("-p", "--maximum-snapshot NUMBER", Integer, "Maximum snapshot pages to consider (Default is 100)", "Count an average of 150,000 snapshots per page ") do |t|
49
+ opts.on("-p", "--maximum-snapshot NUMBER", Integer, "Maximum snapshot pages to consider (Default is 100)", "Count an average of 150,000 snapshots per page") do |t|
46
50
  options[:maximum_pages] = t
47
51
  end
48
52
 
49
- opts.on("-l", "--list", "Only list file urls in a JSON format with the archived timestamps, won't download anything.") do |t|
53
+ opts.on("-l", "--list", "Only list file urls in a JSON format with the archived timestamps, won't download anything") do |t|
50
54
  options[:list] = true
51
55
  end
52
56
 
@@ -58,7 +62,7 @@ end.parse!
58
62
  if (base_url = ARGV[-1])
59
63
  options[:base_url] = base_url
60
64
  wayback_machine_downloader = WaybackMachineDownloader.new options
61
- if wayback_machine_downloader.list
65
+ if options[:list]
62
66
  wayback_machine_downloader.list_files
63
67
  else
64
68
  wayback_machine_downloader.download_files
@@ -14,19 +14,21 @@ class WaybackMachineDownloader
14
14
 
15
15
  include ArchiveAPI
16
16
 
17
- VERSION = "2.0.0"
17
+ VERSION = "2.1.0"
18
18
 
19
- attr_accessor :base_url, :directory, :from_timestamp, :to_timestamp, :only_filter, :exclude_filter, :all, :list, :maximum_pages, :threads_count
19
+ attr_accessor :base_url, :exact_url, :directory,
20
+ :from_timestamp, :to_timestamp, :only_filter, :exclude_filter,
21
+ :all, :maximum_pages, :threads_count
20
22
 
21
23
  def initialize params
22
24
  @base_url = params[:base_url]
25
+ @exact_url = params[:exact_url]
23
26
  @directory = params[:directory]
24
27
  @from_timestamp = params[:from_timestamp].to_i
25
28
  @to_timestamp = params[:to_timestamp].to_i
26
29
  @only_filter = params[:only_filter]
27
30
  @exclude_filter = params[:exclude_filter]
28
31
  @all = params[:all]
29
- @list = params[:list]
30
32
  @maximum_pages = params[:maximum_pages] ? params[:maximum_pages].to_i : 100
31
33
  @threads_count = params[:threads_count].to_i
32
34
  end
@@ -78,18 +80,19 @@ class WaybackMachineDownloader
78
80
  end
79
81
 
80
82
  def get_all_snapshots_to_consider
81
- # Note: Passing a page index parameter allow us to get more snapshots, but from a less fresh index
83
+ # Note: Passing a page index parameter allow us to get more snapshots,
84
+ # but from a less fresh index
82
85
  print "Getting snapshot pages"
83
86
  snapshot_list_to_consider = ""
84
87
  snapshot_list_to_consider += get_raw_list_from_api(@base_url, nil)
85
88
  print "."
86
- snapshot_list_to_consider += get_raw_list_from_api(@base_url + '/*', nil)
87
- print "."
88
- @maximum_pages.times do |page_index|
89
- snapshot_list = get_raw_list_from_api(@base_url + '/*', page_index)
90
- break if snapshot_list.empty?
91
- snapshot_list_to_consider += snapshot_list
92
- print "."
89
+ unless @exact_url
90
+ @maximum_pages.times do |page_index|
91
+ snapshot_list = get_raw_list_from_api(@base_url + '/*', page_index)
92
+ break if snapshot_list.empty?
93
+ snapshot_list_to_consider += snapshot_list
94
+ print "."
95
+ end
93
96
  end
94
97
  puts " found #{snapshot_list_to_consider.lines.count} snaphots to consider."
95
98
  puts
@@ -134,8 +137,10 @@ class WaybackMachineDownloader
134
137
  end
135
138
 
136
139
  def list_files
140
+ # retrieval produces its own output
141
+ files = get_file_list_by_timestamp
137
142
  puts "["
138
- get_file_list_by_timestamp.each do |file|
143
+ files.each do |file|
139
144
  puts file.to_json + ","
140
145
  end
141
146
  puts "]"
@@ -179,7 +184,7 @@ class WaybackMachineDownloader
179
184
 
180
185
  def structure_dir_path dir_path
181
186
  begin
182
- FileUtils::mkdir_p dir_path unless File.exists? dir_path
187
+ FileUtils::mkdir_p dir_path unless File.exist? dir_path
183
188
  rescue Errno::EEXIST => e
184
189
  error_to_string = e.to_s
185
190
  puts "# #{error_to_string}"
@@ -219,7 +224,7 @@ class WaybackMachineDownloader
219
224
  if Gem.win_platform?
220
225
  file_path = file_path.gsub(/[:*?&=<>\\|]/) {|s| '%' + s.ord.to_s(16) }
221
226
  end
222
- unless File.exists? file_path
227
+ unless File.exist? file_path
223
228
  begin
224
229
  structure_dir_path dir_path
225
230
  open(file_path, "wb") do |file|
@@ -240,7 +245,7 @@ class WaybackMachineDownloader
240
245
  rescue StandardError => e
241
246
  puts "#{file_url} # #{e}"
242
247
  ensure
243
- if not @all and File.exists?(file_path) and File.size(file_path) == 0
248
+ if not @all and File.exist?(file_path) and File.size(file_path) == 0
244
249
  File.delete(file_path)
245
250
  puts "#{file_path} was empty and was removed."
246
251
  end
@@ -1,15 +1,15 @@
1
1
  module ArchiveAPI
2
2
 
3
- def get_raw_list_from_api url, page_index
4
- request_url = "http://web.archive.org/cdx/search/xd?url="
5
- request_url += url
6
- request_url += parameters_for_api page_index
3
+ def get_raw_list_from_api url, page_index
4
+ request_url = "http://web.archive.org/cdx/search/xd?url="
5
+ request_url += url
6
+ request_url += parameters_for_api page_index
7
7
 
8
8
  open(request_url).read
9
- end
9
+ end
10
10
 
11
- def parameters_for_api page_index
12
- parameters = "&fl=timestamp,original&collapse=digest&gzip=false"
11
+ def parameters_for_api page_index
12
+ parameters = "&fl=timestamp,original&collapse=digest&gzip=false"
13
13
  if @all
14
14
  parameters += ""
15
15
  else
@@ -60,7 +60,7 @@ module TibyBytes
60
60
  bytes.each_index do |i|
61
61
 
62
62
  byte = bytes[i]
63
- is_ascii = byte < 128
63
+ _is_ascii = byte < 128
64
64
  is_cont = byte > 127 && byte < 192
65
65
  is_lead = byte > 191 && byte < 245
66
66
  is_unused = byte > 240
@@ -78,7 +78,7 @@ module TibyBytes
78
78
  # the leading byte.
79
79
  begin
80
80
  (1..(i - last_lead)).each {|j| bytes[i - j] = tidy_byte(bytes[i - j])}
81
- rescue NoMethodError => e
81
+ rescue NoMethodError
82
82
  next
83
83
  end
84
84
  conts_expected = 0
@@ -98,7 +98,7 @@ module TibyBytes
98
98
  end
99
99
  begin
100
100
  bytes.empty? ? nil : bytes.flatten.compact.pack("C*").unpack("U*").pack("U*")
101
- rescue ArgumentError => e
101
+ rescue ArgumentError
102
102
  nil
103
103
  end
104
104
  end
@@ -25,7 +25,7 @@ module ToRegex
25
25
  # @option options [true,false] :lang /foo/[nesu]
26
26
  def to_regex(options = {})
27
27
  if args = as_regexp(options)
28
- ::Regexp.new *args
28
+ ::Regexp.new(*args)
29
29
  end
30
30
  end
31
31
 
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: wayback_machine_downloader
3
3
  version: !ruby/object:Gem::Version
4
- version: 2.0.0
4
+ version: 2.1.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - hartator
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2017-06-10 00:00:00.000000000 Z
11
+ date: 2017-06-12 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rake