wayback_machine_downloader 0.2.4 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/bin/wayback_machine_downloader +6 -2
- data/lib/wayback_machine_downloader.rb +27 -6
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: d29c33bd6a4ffc9cdceb326a266b3ba987e89ec4
|
4
|
+
data.tar.gz: f739bd763030c3e32026812e4cbe6bca51c1ae8c
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 6ae55228a24711f5d1fc2cc40ee70326a2033fa031d1273a4f4b9432b2fe2c017699cd577a2e36e86ddd3c1fb1b299455796ebf6af97f109d7f3bbedb1f7963a
|
7
|
+
data.tar.gz: f358009785bb2bb52d0017e762566eb25a919e938d5552c997df76cc64de0857b80a11d3c7284e22e393b96ba6cf3133908af0a7e24098fe3a756023ad77c706
|
@@ -18,17 +18,21 @@ option_parser = OptionParser.new do |opts|
|
|
18
18
|
options[:timestamp] = t
|
19
19
|
end
|
20
20
|
|
21
|
-
opts.on("-o", "--only ONLY_FILTER", String, "Restrict downloading to
|
21
|
+
opts.on("-o", "--only ONLY_FILTER", String, "Restrict downloading to urls that match this filter (use // notation for the filter to be treated as a regex)") do |t|
|
22
22
|
options[:only_filter] = t
|
23
23
|
end
|
24
24
|
|
25
|
+
opts.on("-x", "--exclude EXCLUDE_FILTER", String, "Skip urls that match this filter (use // notation for the filter to be treated as a regex)") do |t|
|
26
|
+
options[:exclude_filter] = t
|
27
|
+
end
|
28
|
+
|
25
29
|
opts.on("-v", "--version", "Display version") do |t|
|
26
30
|
options[:version] = t
|
27
31
|
end
|
28
32
|
end.parse!
|
29
33
|
|
30
34
|
if (base_url = ARGV[-1])
|
31
|
-
wayback_machine_downloader = WaybackMachineDownloader.new base_url: base_url, timestamp: options[:timestamp], only_filter: options[:only_filter]
|
35
|
+
wayback_machine_downloader = WaybackMachineDownloader.new base_url: base_url, timestamp: options[:timestamp], only_filter: options[:only_filter], exclude_filter: options[:exclude_filter]
|
32
36
|
wayback_machine_downloader.download_files
|
33
37
|
elsif options[:version]
|
34
38
|
puts WaybackMachineDownloader::VERSION
|
@@ -2,19 +2,21 @@
|
|
2
2
|
|
3
3
|
require 'open-uri'
|
4
4
|
require 'fileutils'
|
5
|
+
require 'cgi'
|
5
6
|
require_relative 'wayback_machine_downloader/tidy_bytes'
|
6
7
|
require_relative 'wayback_machine_downloader/to_regex'
|
7
8
|
|
8
9
|
class WaybackMachineDownloader
|
9
10
|
|
10
|
-
VERSION = "0.2.4"
|
11
|
+
VERSION = "0.3.0"
|
11
12
|
|
12
|
-
attr_accessor :base_url, :timestamp, :only_filter
|
13
|
+
attr_accessor :base_url, :timestamp, :only_filter, :exclude_filter
|
13
14
|
|
14
15
|
def initialize params
|
15
16
|
@base_url = params[:base_url]
|
16
17
|
@timestamp = params[:timestamp].to_i
|
17
18
|
@only_filter = params[:only_filter]
|
19
|
+
@exclude_filter = params[:exclude_filter]
|
18
20
|
end
|
19
21
|
|
20
22
|
def backup_name
|
@@ -38,6 +40,19 @@ class WaybackMachineDownloader
|
|
38
40
|
end
|
39
41
|
end
|
40
42
|
|
43
|
+
def match_exclude_filter file_url
|
44
|
+
if @exclude_filter
|
45
|
+
exclude_filter_regex = @exclude_filter.to_regex
|
46
|
+
if exclude_filter_regex
|
47
|
+
exclude_filter_regex =~ file_url
|
48
|
+
else
|
49
|
+
file_url.downcase.include? @exclude_filter.downcase
|
50
|
+
end
|
51
|
+
else
|
52
|
+
false
|
53
|
+
end
|
54
|
+
end
|
55
|
+
|
41
56
|
def get_file_list_curated
|
42
57
|
index_file_list_raw = open "http://web.archive.org/cdx/search/xd?url=#{@base_url}"
|
43
58
|
all_file_list_raw = open "http://web.archive.org/cdx/search/xd?url=#{@base_url}/*"
|
@@ -48,13 +63,15 @@ class WaybackMachineDownloader
|
|
48
63
|
file_timestamp = line[1].to_i
|
49
64
|
file_url = line[2]
|
50
65
|
file_id = file_url.split('/')[3..-1].join('/')
|
51
|
-
file_id =
|
66
|
+
file_id = CGI::unescape file_id
|
52
67
|
file_id = file_id.tidy_bytes unless file_id == ""
|
53
68
|
if file_id.nil?
|
54
69
|
puts "Malformed file url, ignoring: #{file_url}"
|
55
70
|
elsif @timestamp == 0 or file_timestamp <= @timestamp
|
56
|
-
if
|
57
|
-
puts "File url
|
71
|
+
if match_exclude_filter(file_url)
|
72
|
+
puts "File url matches exclude filter, ignoring: #{file_url}"
|
73
|
+
elsif not match_only_filter(file_url)
|
74
|
+
puts "File url doesn't match only filter, ignoring: #{file_url}"
|
58
75
|
elsif file_list_curated[file_id]
|
59
76
|
unless file_list_curated[file_id][:timestamp] > file_timestamp
|
60
77
|
file_list_curated[file_id] = {file_url: file_url, timestamp: file_timestamp}
|
@@ -82,7 +99,11 @@ class WaybackMachineDownloader
|
|
82
99
|
puts
|
83
100
|
file_list_by_timestamp = get_file_list_by_timestamp
|
84
101
|
if file_list_by_timestamp.count == 0
|
85
|
-
puts "No files to download.
|
102
|
+
puts "No files to download."
|
103
|
+
puts "Possible reaosons:"
|
104
|
+
puts "\t* Site is not in Wayback Machine Archive."
|
105
|
+
puts "\t* Only filter too restrictive (#{only_filter.to_s})" if @only_filter
|
106
|
+
puts "\t* Exclude filter too wide (#{exclude_filter.to_s})" if @exclude_filter
|
86
107
|
return
|
87
108
|
end
|
88
109
|
count = 0
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: wayback_machine_downloader
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.4
|
4
|
+
version: 0.3.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- hartator
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2016-
|
11
|
+
date: 2016-07-30 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: rake
|