wayback_machine_downloader 0.2.4 → 0.3.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: e8b8be7be5e839e0e7350fce3c54003484a53ada
4
- data.tar.gz: 7f2f9eb8d64a45ad83c24937498dbbf0d9ee42a8
3
+ metadata.gz: d29c33bd6a4ffc9cdceb326a266b3ba987e89ec4
4
+ data.tar.gz: f739bd763030c3e32026812e4cbe6bca51c1ae8c
5
5
  SHA512:
6
- metadata.gz: f7ebc824e1b0fbe59dc84ad4c5b93f32ce4dfd3e9f32cbc977f647cba1ccb0052035599efbf039a4eaa7f192405016e57ff319ec7d4208c0489fd2c7797e1581
7
- data.tar.gz: af467686c99f3a3ce30b8dadecaa4a2340788e994c2b9888941a808a29b290aa3bfb21a21426f46a1f738a81f50062d6465024d89c8d0475b26adec6f524e49a
6
+ metadata.gz: 6ae55228a24711f5d1fc2cc40ee70326a2033fa031d1273a4f4b9432b2fe2c017699cd577a2e36e86ddd3c1fb1b299455796ebf6af97f109d7f3bbedb1f7963a
7
+ data.tar.gz: f358009785bb2bb52d0017e762566eb25a919e938d5552c997df76cc64de0857b80a11d3c7284e22e393b96ba6cf3133908af0a7e24098fe3a756023ad77c706
@@ -18,17 +18,21 @@ option_parser = OptionParser.new do |opts|
18
18
  options[:timestamp] = t
19
19
  end
20
20
 
21
- opts.on("-o", "--only ONLY_FILTER", String, "Restrict downloading to file urls matching the only filter supplied (use // notation for the only filter to be treated as a regex)") do |t|
21
+ opts.on("-o", "--only ONLY_FILTER", String, "Restrict downloading to urls that match this filter (use // notation for the filter to be treated as a regex)") do |t|
22
22
  options[:only_filter] = t
23
23
  end
24
24
 
25
+ opts.on("-x", "--exclude EXCLUDE_FILTER", String, "Skip urls that match this filter (use // notation for the filter to be treated as a regex)") do |t|
26
+ options[:exclude_filter] = t
27
+ end
28
+
25
29
  opts.on("-v", "--version", "Display version") do |t|
26
30
  options[:version] = t
27
31
  end
28
32
  end.parse!
29
33
 
30
34
  if (base_url = ARGV[-1])
31
- wayback_machine_downloader = WaybackMachineDownloader.new base_url: base_url, timestamp: options[:timestamp], only_filter: options[:only_filter]
35
+ wayback_machine_downloader = WaybackMachineDownloader.new base_url: base_url, timestamp: options[:timestamp], only_filter: options[:only_filter], exclude_filter: options[:exclude_filter]
32
36
  wayback_machine_downloader.download_files
33
37
  elsif options[:version]
34
38
  puts WaybackMachineDownloader::VERSION
@@ -2,19 +2,21 @@
2
2
 
3
3
  require 'open-uri'
4
4
  require 'fileutils'
5
+ require 'cgi'
5
6
  require_relative 'wayback_machine_downloader/tidy_bytes'
6
7
  require_relative 'wayback_machine_downloader/to_regex'
7
8
 
8
9
  class WaybackMachineDownloader
9
10
 
10
- VERSION = "0.2.4"
11
+ VERSION = "0.3.0"
11
12
 
12
- attr_accessor :base_url, :timestamp, :only_filter
13
+ attr_accessor :base_url, :timestamp, :only_filter, :exclude_filter
13
14
 
14
15
  def initialize params
15
16
  @base_url = params[:base_url]
16
17
  @timestamp = params[:timestamp].to_i
17
18
  @only_filter = params[:only_filter]
19
+ @exclude_filter = params[:exclude_filter]
18
20
  end
19
21
 
20
22
  def backup_name
@@ -38,6 +40,19 @@ class WaybackMachineDownloader
38
40
  end
39
41
  end
40
42
 
43
+ def match_exclude_filter file_url
44
+ if @exclude_filter
45
+ exclude_filter_regex = @exclude_filter.to_regex
46
+ if exclude_filter_regex
47
+ exclude_filter_regex =~ file_url
48
+ else
49
+ file_url.downcase.include? @exclude_filter.downcase
50
+ end
51
+ else
52
+ false
53
+ end
54
+ end
55
+
41
56
  def get_file_list_curated
42
57
  index_file_list_raw = open "http://web.archive.org/cdx/search/xd?url=#{@base_url}"
43
58
  all_file_list_raw = open "http://web.archive.org/cdx/search/xd?url=#{@base_url}/*"
@@ -48,13 +63,15 @@ class WaybackMachineDownloader
48
63
  file_timestamp = line[1].to_i
49
64
  file_url = line[2]
50
65
  file_id = file_url.split('/')[3..-1].join('/')
51
- file_id = URI.unescape file_id
66
+ file_id = CGI::unescape file_id
52
67
  file_id = file_id.tidy_bytes unless file_id == ""
53
68
  if file_id.nil?
54
69
  puts "Malformed file url, ignoring: #{file_url}"
55
70
  elsif @timestamp == 0 or file_timestamp <= @timestamp
56
- if not match_only_filter(file_url)
57
- puts "File url not in supplied only filter, ignoring: #{file_url}"
71
+ if match_exclude_filter(file_url)
72
+ puts "File url matches exclude filter, ignoring: #{file_url}"
73
+ elsif not match_only_filter(file_url)
74
+ puts "File url doesn't match only filter, ignoring: #{file_url}"
58
75
  elsif file_list_curated[file_id]
59
76
  unless file_list_curated[file_id][:timestamp] > file_timestamp
60
77
  file_list_curated[file_id] = {file_url: file_url, timestamp: file_timestamp}
@@ -82,7 +99,11 @@ class WaybackMachineDownloader
82
99
  puts
83
100
  file_list_by_timestamp = get_file_list_by_timestamp
84
101
  if file_list_by_timestamp.count == 0
85
- puts "No files to download. Possible reasons:\n\t* Accept regex didn't let any files through (Accept Regex: \"#{@accept_regex.to_s}\")\n\t* Site is not in wayback machine."
102
+ puts "No files to download."
103
+ puts "Possible reasons:"
104
+ puts "\t* Site is not in Wayback Machine Archive."
105
+ puts "\t* Only filter too restrictive (#{only_filter.to_s})" if @only_filter
106
+ puts "\t* Exclude filter too wide (#{exclude_filter.to_s})" if @exclude_filter
86
107
  return
87
108
  end
88
109
  count = 0
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: wayback_machine_downloader
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.4
4
+ version: 0.3.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - hartator
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2016-03-29 00:00:00.000000000 Z
11
+ date: 2016-07-30 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rake