wayback_machine_downloader 0.2.4 → 0.3.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/bin/wayback_machine_downloader +6 -2
- data/lib/wayback_machine_downloader.rb +27 -6
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: d29c33bd6a4ffc9cdceb326a266b3ba987e89ec4
|
4
|
+
data.tar.gz: f739bd763030c3e32026812e4cbe6bca51c1ae8c
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 6ae55228a24711f5d1fc2cc40ee70326a2033fa031d1273a4f4b9432b2fe2c017699cd577a2e36e86ddd3c1fb1b299455796ebf6af97f109d7f3bbedb1f7963a
|
7
|
+
data.tar.gz: f358009785bb2bb52d0017e762566eb25a919e938d5552c997df76cc64de0857b80a11d3c7284e22e393b96ba6cf3133908af0a7e24098fe3a756023ad77c706
|
@@ -18,17 +18,21 @@ option_parser = OptionParser.new do |opts|
|
|
18
18
|
options[:timestamp] = t
|
19
19
|
end
|
20
20
|
|
21
|
-
opts.on("-o", "--only ONLY_FILTER", String, "Restrict downloading to
|
21
|
+
opts.on("-o", "--only ONLY_FILTER", String, "Restrict downloading to urls that match this filter (use // notation for the filter to be treated as a regex)") do |t|
|
22
22
|
options[:only_filter] = t
|
23
23
|
end
|
24
24
|
|
25
|
+
opts.on("-x", "--exclude EXCLUDE_FILTER", String, "Skip urls that match this filter (use // notation for the filter to be treated as a regex)") do |t|
|
26
|
+
options[:exclude_filter] = t
|
27
|
+
end
|
28
|
+
|
25
29
|
opts.on("-v", "--version", "Display version") do |t|
|
26
30
|
options[:version] = t
|
27
31
|
end
|
28
32
|
end.parse!
|
29
33
|
|
30
34
|
if (base_url = ARGV[-1])
|
31
|
-
wayback_machine_downloader = WaybackMachineDownloader.new base_url: base_url, timestamp: options[:timestamp], only_filter: options[:only_filter]
|
35
|
+
wayback_machine_downloader = WaybackMachineDownloader.new base_url: base_url, timestamp: options[:timestamp], only_filter: options[:only_filter], exclude_filter: options[:exclude_filter]
|
32
36
|
wayback_machine_downloader.download_files
|
33
37
|
elsif options[:version]
|
34
38
|
puts WaybackMachineDownloader::VERSION
|
@@ -2,19 +2,21 @@
|
|
2
2
|
|
3
3
|
require 'open-uri'
|
4
4
|
require 'fileutils'
|
5
|
+
require 'cgi'
|
5
6
|
require_relative 'wayback_machine_downloader/tidy_bytes'
|
6
7
|
require_relative 'wayback_machine_downloader/to_regex'
|
7
8
|
|
8
9
|
class WaybackMachineDownloader
|
9
10
|
|
10
|
-
VERSION = "0.2.4"
|
11
|
+
VERSION = "0.3.0"
|
11
12
|
|
12
|
-
attr_accessor :base_url, :timestamp, :only_filter
|
13
|
+
attr_accessor :base_url, :timestamp, :only_filter, :exclude_filter
|
13
14
|
|
14
15
|
def initialize params
|
15
16
|
@base_url = params[:base_url]
|
16
17
|
@timestamp = params[:timestamp].to_i
|
17
18
|
@only_filter = params[:only_filter]
|
19
|
+
@exclude_filter = params[:exclude_filter]
|
18
20
|
end
|
19
21
|
|
20
22
|
def backup_name
|
@@ -38,6 +40,19 @@ class WaybackMachineDownloader
|
|
38
40
|
end
|
39
41
|
end
|
40
42
|
|
43
|
+
def match_exclude_filter file_url
|
44
|
+
if @exclude_filter
|
45
|
+
exclude_filter_regex = @exclude_filter.to_regex
|
46
|
+
if exclude_filter_regex
|
47
|
+
exclude_filter_regex =~ file_url
|
48
|
+
else
|
49
|
+
file_url.downcase.include? @exclude_filter.downcase
|
50
|
+
end
|
51
|
+
else
|
52
|
+
false
|
53
|
+
end
|
54
|
+
end
|
55
|
+
|
41
56
|
def get_file_list_curated
|
42
57
|
index_file_list_raw = open "http://web.archive.org/cdx/search/xd?url=#{@base_url}"
|
43
58
|
all_file_list_raw = open "http://web.archive.org/cdx/search/xd?url=#{@base_url}/*"
|
@@ -48,13 +63,15 @@ class WaybackMachineDownloader
|
|
48
63
|
file_timestamp = line[1].to_i
|
49
64
|
file_url = line[2]
|
50
65
|
file_id = file_url.split('/')[3..-1].join('/')
|
51
|
-
file_id =
|
66
|
+
file_id = CGI::unescape file_id
|
52
67
|
file_id = file_id.tidy_bytes unless file_id == ""
|
53
68
|
if file_id.nil?
|
54
69
|
puts "Malformed file url, ignoring: #{file_url}"
|
55
70
|
elsif @timestamp == 0 or file_timestamp <= @timestamp
|
56
|
-
if
|
57
|
-
puts "File url
|
71
|
+
if match_exclude_filter(file_url)
|
72
|
+
puts "File url matches exclude filter, ignoring: #{file_url}"
|
73
|
+
elsif not match_only_filter(file_url)
|
74
|
+
puts "File url doesn't match only filter, ignoring: #{file_url}"
|
58
75
|
elsif file_list_curated[file_id]
|
59
76
|
unless file_list_curated[file_id][:timestamp] > file_timestamp
|
60
77
|
file_list_curated[file_id] = {file_url: file_url, timestamp: file_timestamp}
|
@@ -82,7 +99,11 @@ class WaybackMachineDownloader
|
|
82
99
|
puts
|
83
100
|
file_list_by_timestamp = get_file_list_by_timestamp
|
84
101
|
if file_list_by_timestamp.count == 0
|
85
|
-
puts "No files to download.
|
102
|
+
puts "No files to download."
|
103
|
+
puts "Possible reaosons:"
|
104
|
+
puts "\t* Site is not in Wayback Machine Archive."
|
105
|
+
puts "\t* Only filter too restrictive (#{only_filter.to_s})" if @only_filter
|
106
|
+
puts "\t* Exclude filter too wide (#{exclude_filter.to_s})" if @exclude_filter
|
86
107
|
return
|
87
108
|
end
|
88
109
|
count = 0
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: wayback_machine_downloader
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.4
|
4
|
+
version: 0.3.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- hartator
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2016-
|
11
|
+
date: 2016-07-30 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: rake
|