wayback_machine_downloader 2.0.0 → 2.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: e2132b28dea0a03978384a3b337b1107562e644a
4
- data.tar.gz: e8b6421b78d02505a8498c79cd1761ebb28a3290
3
+ metadata.gz: 520f637efbb03d1e3ac87aadf1a937cc132f6c32
4
+ data.tar.gz: 90ec9079f5420153e1b7c149ce5aedb45bd4ba2c
5
5
  SHA512:
6
- metadata.gz: d1d0944e9593aadc02db950aa9826491d727f93c6185f23aac20b24b48da086ed67f2be76d91184d2709610b84160c2665ca4b30bcddfcd4981b6840c988e1d0
7
- data.tar.gz: aa40fb4da67241e972c86631b9390703ec77643b17b7f62ae2cfbffe49f276ff6f77d901aac87df66aca94d6da7ec3d230ceb001aa9b47f697b5ef1c98b4194f
6
+ metadata.gz: 6009139fbad22b7e269582d905956147df5cec565e376cbf1bdbb1125bb906fdea31c6501b3dfd9fa1b35d730444c976bcf187b8c869e8ed7db146ea155ba8fb
7
+ data.tar.gz: b4cff49e64c3ec528b544184e1426b76d1284c7f97ee9c787f370cc305b652f328c53ac48d3fc9bd9c9405b26edceb7a888e7f4a8f061ee704608e95a0642108
@@ -26,6 +26,10 @@ option_parser = OptionParser.new do |opts|
26
26
  options[:to_timestamp] = t
27
27
  end
28
28
 
29
+ opts.on("-e", "--exact_url", String, "Download only the url provied and not the full site") do |t|
30
+ options[:only_filter] = t
31
+ end
32
+
29
33
  opts.on("-o", "--only ONLY_FILTER", String, "Restrict downloading to urls that match this filter", "(use // notation for the filter to be treated as a regex)") do |t|
30
34
  options[:only_filter] = t
31
35
  end
@@ -42,11 +46,11 @@ option_parser = OptionParser.new do |opts|
42
46
  options[:threads_count] = t
43
47
  end
44
48
 
45
- opts.on("-p", "--maximum-snapshot NUMBER", Integer, "Maximum snapshot pages to consider (Default is 100)", "Count an average of 150,000 snapshots per page ") do |t|
49
+ opts.on("-p", "--maximum-snapshot NUMBER", Integer, "Maximum snapshot pages to consider (Default is 100)", "Count an average of 150,000 snapshots per page") do |t|
46
50
  options[:maximum_pages] = t
47
51
  end
48
52
 
49
- opts.on("-l", "--list", "Only list file urls in a JSON format with the archived timestamps, won't download anything.") do |t|
53
+ opts.on("-l", "--list", "Only list file urls in a JSON format with the archived timestamps, won't download anything") do |t|
50
54
  options[:list] = true
51
55
  end
52
56
 
@@ -58,7 +62,7 @@ end.parse!
58
62
  if (base_url = ARGV[-1])
59
63
  options[:base_url] = base_url
60
64
  wayback_machine_downloader = WaybackMachineDownloader.new options
61
- if wayback_machine_downloader.list
65
+ if options[:list]
62
66
  wayback_machine_downloader.list_files
63
67
  else
64
68
  wayback_machine_downloader.download_files
@@ -14,19 +14,21 @@ class WaybackMachineDownloader
14
14
 
15
15
  include ArchiveAPI
16
16
 
17
- VERSION = "2.0.0"
17
+ VERSION = "2.1.0"
18
18
 
19
- attr_accessor :base_url, :directory, :from_timestamp, :to_timestamp, :only_filter, :exclude_filter, :all, :list, :maximum_pages, :threads_count
19
+ attr_accessor :base_url, :exact_url, :directory,
20
+ :from_timestamp, :to_timestamp, :only_filter, :exclude_filter,
21
+ :all, :maximum_pages, :threads_count
20
22
 
21
23
  def initialize params
22
24
  @base_url = params[:base_url]
25
+ @exact_url = params[:exact_url]
23
26
  @directory = params[:directory]
24
27
  @from_timestamp = params[:from_timestamp].to_i
25
28
  @to_timestamp = params[:to_timestamp].to_i
26
29
  @only_filter = params[:only_filter]
27
30
  @exclude_filter = params[:exclude_filter]
28
31
  @all = params[:all]
29
- @list = params[:list]
30
32
  @maximum_pages = params[:maximum_pages] ? params[:maximum_pages].to_i : 100
31
33
  @threads_count = params[:threads_count].to_i
32
34
  end
@@ -78,18 +80,19 @@ class WaybackMachineDownloader
78
80
  end
79
81
 
80
82
  def get_all_snapshots_to_consider
81
- # Note: Passing a page index parameter allow us to get more snapshots, but from a less fresh index
83
+ # Note: Passing a page index parameter allow us to get more snapshots,
84
+ # but from a less fresh index
82
85
  print "Getting snapshot pages"
83
86
  snapshot_list_to_consider = ""
84
87
  snapshot_list_to_consider += get_raw_list_from_api(@base_url, nil)
85
88
  print "."
86
- snapshot_list_to_consider += get_raw_list_from_api(@base_url + '/*', nil)
87
- print "."
88
- @maximum_pages.times do |page_index|
89
- snapshot_list = get_raw_list_from_api(@base_url + '/*', page_index)
90
- break if snapshot_list.empty?
91
- snapshot_list_to_consider += snapshot_list
92
- print "."
89
+ unless @exact_url
90
+ @maximum_pages.times do |page_index|
91
+ snapshot_list = get_raw_list_from_api(@base_url + '/*', page_index)
92
+ break if snapshot_list.empty?
93
+ snapshot_list_to_consider += snapshot_list
94
+ print "."
95
+ end
93
96
  end
94
97
  puts " found #{snapshot_list_to_consider.lines.count} snaphots to consider."
95
98
  puts
@@ -134,8 +137,10 @@ class WaybackMachineDownloader
134
137
  end
135
138
 
136
139
  def list_files
140
+ # retrieval produces its own output
141
+ files = get_file_list_by_timestamp
137
142
  puts "["
138
- get_file_list_by_timestamp.each do |file|
143
+ files.each do |file|
139
144
  puts file.to_json + ","
140
145
  end
141
146
  puts "]"
@@ -179,7 +184,7 @@ class WaybackMachineDownloader
179
184
 
180
185
  def structure_dir_path dir_path
181
186
  begin
182
- FileUtils::mkdir_p dir_path unless File.exists? dir_path
187
+ FileUtils::mkdir_p dir_path unless File.exist? dir_path
183
188
  rescue Errno::EEXIST => e
184
189
  error_to_string = e.to_s
185
190
  puts "# #{error_to_string}"
@@ -219,7 +224,7 @@ class WaybackMachineDownloader
219
224
  if Gem.win_platform?
220
225
  file_path = file_path.gsub(/[:*?&=<>\\|]/) {|s| '%' + s.ord.to_s(16) }
221
226
  end
222
- unless File.exists? file_path
227
+ unless File.exist? file_path
223
228
  begin
224
229
  structure_dir_path dir_path
225
230
  open(file_path, "wb") do |file|
@@ -240,7 +245,7 @@ class WaybackMachineDownloader
240
245
  rescue StandardError => e
241
246
  puts "#{file_url} # #{e}"
242
247
  ensure
243
- if not @all and File.exists?(file_path) and File.size(file_path) == 0
248
+ if not @all and File.exist?(file_path) and File.size(file_path) == 0
244
249
  File.delete(file_path)
245
250
  puts "#{file_path} was empty and was removed."
246
251
  end
@@ -1,15 +1,15 @@
1
1
  module ArchiveAPI
2
2
 
3
- def get_raw_list_from_api url, page_index
4
- request_url = "http://web.archive.org/cdx/search/xd?url="
5
- request_url += url
6
- request_url += parameters_for_api page_index
3
+ def get_raw_list_from_api url, page_index
4
+ request_url = "http://web.archive.org/cdx/search/xd?url="
5
+ request_url += url
6
+ request_url += parameters_for_api page_index
7
7
 
8
8
  open(request_url).read
9
- end
9
+ end
10
10
 
11
- def parameters_for_api page_index
12
- parameters = "&fl=timestamp,original&collapse=digest&gzip=false"
11
+ def parameters_for_api page_index
12
+ parameters = "&fl=timestamp,original&collapse=digest&gzip=false"
13
13
  if @all
14
14
  parameters += ""
15
15
  else
@@ -60,7 +60,7 @@ module TibyBytes
60
60
  bytes.each_index do |i|
61
61
 
62
62
  byte = bytes[i]
63
- is_ascii = byte < 128
63
+ _is_ascii = byte < 128
64
64
  is_cont = byte > 127 && byte < 192
65
65
  is_lead = byte > 191 && byte < 245
66
66
  is_unused = byte > 240
@@ -78,7 +78,7 @@ module TibyBytes
78
78
  # the leading byte.
79
79
  begin
80
80
  (1..(i - last_lead)).each {|j| bytes[i - j] = tidy_byte(bytes[i - j])}
81
- rescue NoMethodError => e
81
+ rescue NoMethodError
82
82
  next
83
83
  end
84
84
  conts_expected = 0
@@ -98,7 +98,7 @@ module TibyBytes
98
98
  end
99
99
  begin
100
100
  bytes.empty? ? nil : bytes.flatten.compact.pack("C*").unpack("U*").pack("U*")
101
- rescue ArgumentError => e
101
+ rescue ArgumentError
102
102
  nil
103
103
  end
104
104
  end
@@ -25,7 +25,7 @@ module ToRegex
25
25
  # @option options [true,false] :lang /foo/[nesu]
26
26
  def to_regex(options = {})
27
27
  if args = as_regexp(options)
28
- ::Regexp.new *args
28
+ ::Regexp.new(*args)
29
29
  end
30
30
  end
31
31
 
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: wayback_machine_downloader
3
3
  version: !ruby/object:Gem::Version
4
- version: 2.0.0
4
+ version: 2.1.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - hartator
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2017-06-10 00:00:00.000000000 Z
11
+ date: 2017-06-12 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rake