wayback_machine_downloader 0.2.4 → 0.3.0

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: e8b8be7be5e839e0e7350fce3c54003484a53ada
4
- data.tar.gz: 7f2f9eb8d64a45ad83c24937498dbbf0d9ee42a8
3
+ metadata.gz: d29c33bd6a4ffc9cdceb326a266b3ba987e89ec4
4
+ data.tar.gz: f739bd763030c3e32026812e4cbe6bca51c1ae8c
5
5
  SHA512:
6
- metadata.gz: f7ebc824e1b0fbe59dc84ad4c5b93f32ce4dfd3e9f32cbc977f647cba1ccb0052035599efbf039a4eaa7f192405016e57ff319ec7d4208c0489fd2c7797e1581
7
- data.tar.gz: af467686c99f3a3ce30b8dadecaa4a2340788e994c2b9888941a808a29b290aa3bfb21a21426f46a1f738a81f50062d6465024d89c8d0475b26adec6f524e49a
6
+ metadata.gz: 6ae55228a24711f5d1fc2cc40ee70326a2033fa031d1273a4f4b9432b2fe2c017699cd577a2e36e86ddd3c1fb1b299455796ebf6af97f109d7f3bbedb1f7963a
7
+ data.tar.gz: f358009785bb2bb52d0017e762566eb25a919e938d5552c997df76cc64de0857b80a11d3c7284e22e393b96ba6cf3133908af0a7e24098fe3a756023ad77c706
@@ -18,17 +18,21 @@ option_parser = OptionParser.new do |opts|
18
18
  options[:timestamp] = t
19
19
  end
20
20
 
21
- opts.on("-o", "--only ONLY_FILTER", String, "Restrict downloading to file urls matching the only filter supplied (use // notation for the only filter to be treated as a regex)") do |t|
21
+ opts.on("-o", "--only ONLY_FILTER", String, "Restrict downloading to urls that match this filter (use // notation for the filter to be treated as a regex)") do |t|
22
22
  options[:only_filter] = t
23
23
  end
24
24
 
25
+ opts.on("-x", "--exclude EXCLUDE_FILTER", String, "Skip urls that match this filter (use // notation for the filter to be treated as a regex)") do |t|
26
+ options[:exclude_filter] = t
27
+ end
28
+
25
29
  opts.on("-v", "--version", "Display version") do |t|
26
30
  options[:version] = t
27
31
  end
28
32
  end.parse!
29
33
 
30
34
  if (base_url = ARGV[-1])
31
- wayback_machine_downloader = WaybackMachineDownloader.new base_url: base_url, timestamp: options[:timestamp], only_filter: options[:only_filter]
35
+ wayback_machine_downloader = WaybackMachineDownloader.new base_url: base_url, timestamp: options[:timestamp], only_filter: options[:only_filter], exclude_filter: options[:exclude_filter]
32
36
  wayback_machine_downloader.download_files
33
37
  elsif options[:version]
34
38
  puts WaybackMachineDownloader::VERSION
@@ -2,19 +2,21 @@
2
2
 
3
3
  require 'open-uri'
4
4
  require 'fileutils'
5
+ require 'cgi'
5
6
  require_relative 'wayback_machine_downloader/tidy_bytes'
6
7
  require_relative 'wayback_machine_downloader/to_regex'
7
8
 
8
9
  class WaybackMachineDownloader
9
10
 
10
- VERSION = "0.2.4"
11
+ VERSION = "0.3.0"
11
12
 
12
- attr_accessor :base_url, :timestamp, :only_filter
13
+ attr_accessor :base_url, :timestamp, :only_filter, :exclude_filter
13
14
 
14
15
  def initialize params
15
16
  @base_url = params[:base_url]
16
17
  @timestamp = params[:timestamp].to_i
17
18
  @only_filter = params[:only_filter]
19
+ @exclude_filter = params[:exclude_filter]
18
20
  end
19
21
 
20
22
  def backup_name
@@ -38,6 +40,19 @@ class WaybackMachineDownloader
38
40
  end
39
41
  end
40
42
 
43
+ def match_exclude_filter file_url
44
+ if @exclude_filter
45
+ exclude_filter_regex = @exclude_filter.to_regex
46
+ if exclude_filter_regex
47
+ exclude_filter_regex =~ file_url
48
+ else
49
+ file_url.downcase.include? @exclude_filter.downcase
50
+ end
51
+ else
52
+ false
53
+ end
54
+ end
55
+
41
56
  def get_file_list_curated
42
57
  index_file_list_raw = open "http://web.archive.org/cdx/search/xd?url=#{@base_url}"
43
58
  all_file_list_raw = open "http://web.archive.org/cdx/search/xd?url=#{@base_url}/*"
@@ -48,13 +63,15 @@ class WaybackMachineDownloader
48
63
  file_timestamp = line[1].to_i
49
64
  file_url = line[2]
50
65
  file_id = file_url.split('/')[3..-1].join('/')
51
- file_id = URI.unescape file_id
66
+ file_id = CGI::unescape file_id
52
67
  file_id = file_id.tidy_bytes unless file_id == ""
53
68
  if file_id.nil?
54
69
  puts "Malformed file url, ignoring: #{file_url}"
55
70
  elsif @timestamp == 0 or file_timestamp <= @timestamp
56
- if not match_only_filter(file_url)
57
- puts "File url not in supplied only filter, ignoring: #{file_url}"
71
+ if match_exclude_filter(file_url)
72
+ puts "File url matches exclude filter, ignoring: #{file_url}"
73
+ elsif not match_only_filter(file_url)
74
+ puts "File url doesn't match only filter, ignoring: #{file_url}"
58
75
  elsif file_list_curated[file_id]
59
76
  unless file_list_curated[file_id][:timestamp] > file_timestamp
60
77
  file_list_curated[file_id] = {file_url: file_url, timestamp: file_timestamp}
@@ -82,7 +99,11 @@ class WaybackMachineDownloader
82
99
  puts
83
100
  file_list_by_timestamp = get_file_list_by_timestamp
84
101
  if file_list_by_timestamp.count == 0
85
- puts "No files to download. Possible reasons:\n\t* Accept regex didn't let any files through (Accept Regex: \"#{@accept_regex.to_s}\")\n\t* Site is not in wayback machine."
102
+ puts "No files to download."
103
+ puts "Possible reaosons:"
104
+ puts "\t* Site is not in Wayback Machine Archive."
105
+ puts "\t* Only filter too restrictive (#{only_filter.to_s})" if @only_filter
106
+ puts "\t* Exclude filter too wide (#{exclude_filter.to_s})" if @exclude_filter
86
107
  return
87
108
  end
88
109
  count = 0
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: wayback_machine_downloader
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.4
4
+ version: 0.3.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - hartator
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2016-03-29 00:00:00.000000000 Z
11
+ date: 2016-07-30 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rake