wayback_machine_downloader 0.1.18 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/bin/wayback_machine_downloader +8 -3
- data/lib/wayback_machine_downloader.rb +27 -4
- data/lib/wayback_machine_downloader/to_regex.rb +81 -0
- metadata +3 -30
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: eccabaecb58ce40f79f3bfcbac3a50eea4c55142
|
4
|
+
data.tar.gz: fd5673c7ee5404f8f73b73de9bf94c5b1e2d08ea
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 8db03676ee13a095e18d24ba98512852d00926d6142b4cba12b8053519384c9e96ca49ab542d45b310390248772db973b252a131937890a5ed3646c908a4d7a0
|
7
|
+
data.tar.gz: 8646cea23b376bfb80019ce670badeb5746d4cd2c8010e074f7157314e9112cece9845903a8c7ee00bb5f1bafaefbb5a05f700101245872f1070018f4ac65fd2
|
@@ -2,6 +2,7 @@
|
|
2
2
|
|
3
3
|
require_relative '../lib/wayback_machine_downloader'
|
4
4
|
require 'optparse'
|
5
|
+
require 'pp'
|
5
6
|
|
6
7
|
options = {}
|
7
8
|
option_parser = OptionParser.new do |opts|
|
@@ -11,19 +12,23 @@ option_parser = OptionParser.new do |opts|
|
|
11
12
|
opts.separator "Download any website from the Wayback Machine."
|
12
13
|
|
13
14
|
opts.separator ""
|
14
|
-
opts.separator "Optional
|
15
|
+
opts.separator "Optional options:"
|
15
16
|
|
16
17
|
opts.on("-t", "--timestamp TIMESTAMP", Integer, "Only files on or before timestamp supplied (ie. 20150806225358)") do |t|
|
17
18
|
options[:timestamp] = t
|
18
19
|
end
|
19
20
|
|
21
|
+
opts.on("-o", "--only ONLY_FILTER", String, "Restrict downloading to file urls matching the only filter supplied (use // notation for the only filter to be treated as a regex)") do |t|
|
22
|
+
options[:only_filter] = t
|
23
|
+
end
|
24
|
+
|
20
25
|
opts.on("-v", "--version", "Display version") do |t|
|
21
26
|
options[:version] = t
|
22
27
|
end
|
23
28
|
end.parse!
|
24
29
|
|
25
|
-
if base_url = ARGV[0]
|
26
|
-
wayback_machine_downloader = WaybackMachineDownloader.new base_url: base_url, timestamp: options[:timestamp]
|
30
|
+
if base_url = ARGV[-1]
|
31
|
+
wayback_machine_downloader = WaybackMachineDownloader.new base_url: base_url, timestamp: options[:timestamp], only_filter: options[:only_filter]
|
27
32
|
wayback_machine_downloader.download_files
|
28
33
|
elsif options[:version]
|
29
34
|
puts WaybackMachineDownloader::VERSION
|
@@ -1,16 +1,20 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
|
1
3
|
require 'open-uri'
|
2
4
|
require 'fileutils'
|
3
5
|
require_relative 'wayback_machine_downloader/tidy_bytes'
|
6
|
+
require_relative 'wayback_machine_downloader/to_regex'
|
4
7
|
|
5
8
|
class WaybackMachineDownloader
|
6
9
|
|
7
|
-
VERSION = "0.1.18"
|
10
|
+
VERSION = "0.2.0"
|
8
11
|
|
9
|
-
attr_accessor :base_url, :timestamp
|
12
|
+
attr_accessor :base_url, :timestamp, :only_filter
|
10
13
|
|
11
14
|
def initialize params
|
12
15
|
@base_url = params[:base_url]
|
13
16
|
@timestamp = params[:timestamp].to_i
|
17
|
+
@only_filter = params[:only_filter]
|
14
18
|
end
|
15
19
|
|
16
20
|
def backup_name
|
@@ -21,8 +25,21 @@ class WaybackMachineDownloader
|
|
21
25
|
'websites/' + backup_name + '/'
|
22
26
|
end
|
23
27
|
|
28
|
+
def match_only_filter file_url
|
29
|
+
if @only_filter
|
30
|
+
only_filter_regex = @only_filter.to_regex
|
31
|
+
if only_filter_regex
|
32
|
+
only_filter_regex =~ file_url
|
33
|
+
else
|
34
|
+
file_url.downcase.include? @only_filter.downcase
|
35
|
+
end
|
36
|
+
else
|
37
|
+
true
|
38
|
+
end
|
39
|
+
end
|
40
|
+
|
24
41
|
def get_file_list_curated
|
25
|
-
index_file_list_raw =
|
42
|
+
index_file_list_raw = open "http://web.archive.org/cdx/search/xd?url=#{@base_url}"
|
26
43
|
all_file_list_raw = open "http://web.archive.org/cdx/search/xd?url=#{@base_url}/*"
|
27
44
|
file_list_curated = Hash.new
|
28
45
|
[index_file_list_raw, all_file_list_raw].each do |file|
|
@@ -36,7 +53,9 @@ class WaybackMachineDownloader
|
|
36
53
|
if file_id.nil?
|
37
54
|
puts "Malformed file url, ignoring: #{file_url}"
|
38
55
|
elsif @timestamp == 0 or file_timestamp <= @timestamp
|
39
|
-
if file_list_curated[file_id]
|
56
|
+
if not match_only_filter(file_url)
|
57
|
+
puts "File url not in supplied only filter, ignoring: #{file_url}"
|
58
|
+
elsif file_list_curated[file_id]
|
40
59
|
unless file_list_curated[file_id][:timestamp] > file_timestamp
|
41
60
|
file_list_curated[file_id] = {file_url: file_url, timestamp: file_timestamp}
|
42
61
|
end
|
@@ -62,6 +81,10 @@ class WaybackMachineDownloader
|
|
62
81
|
puts "Downloading #{@base_url} to #{backup_path} from Wayback Machine..."
|
63
82
|
puts
|
64
83
|
file_list_by_timestamp = get_file_list_by_timestamp
|
84
|
+
if file_list_by_timestamp.count == 0
|
85
|
+
puts "No files to download. Possible reasons:\n\t* Accept regex didn't let any files through (Accept Regex: \"#{@accept_regex.to_s}\")\n\t* Site is not in wayback machine."
|
86
|
+
return
|
87
|
+
end
|
65
88
|
count = 0
|
66
89
|
file_list_by_timestamp.each do |file_remote_info|
|
67
90
|
count += 1
|
@@ -0,0 +1,81 @@
|
|
1
|
+
module ToRegex
|
2
|
+
module StringMixin
|
3
|
+
class << self
|
4
|
+
def literal?(str)
|
5
|
+
REGEXP_DELIMITERS.none? { |s, e| str.start_with?(s) and str =~ /#{e}#{INLINE_OPTIONS}\z/ }
|
6
|
+
end
|
7
|
+
end
|
8
|
+
|
9
|
+
INLINE_OPTIONS = /[imxnesu]*/
|
10
|
+
REGEXP_DELIMITERS = {
|
11
|
+
'%r{' => '}',
|
12
|
+
'/' => '/',
|
13
|
+
}
|
14
|
+
|
15
|
+
# Get a regex back
|
16
|
+
#
|
17
|
+
# Without :literal or :detect, `"foo".to_regex` will return nil.
|
18
|
+
#
|
19
|
+
# @param [optional, Hash] options
|
20
|
+
# @option options [true,false] :literal Treat meta characters and other regexp codes as just text; always return a regexp
|
21
|
+
# @option options [true,false] :detect If string starts and ends with valid regexp delimiters, treat it as a regexp; otherwise, interpret it literally
|
22
|
+
# @option options [true,false] :ignore_case /foo/i
|
23
|
+
# @option options [true,false] :multiline /foo/m
|
24
|
+
# @option options [true,false] :extended /foo/x
|
25
|
+
# @option options [true,false] :lang /foo/[nesu]
|
26
|
+
def to_regex(options = {})
|
27
|
+
if args = as_regexp(options)
|
28
|
+
::Regexp.new *args
|
29
|
+
end
|
30
|
+
end
|
31
|
+
|
32
|
+
# Return arguments that can be passed to `Regexp.new`
|
33
|
+
# @see to_regexp
|
34
|
+
def as_regexp(options = {})
|
35
|
+
unless options.is_a?(::Hash)
|
36
|
+
raise ::ArgumentError, "[to_regexp] Options must be a Hash"
|
37
|
+
end
|
38
|
+
str = self
|
39
|
+
|
40
|
+
return if options[:detect] and str == ''
|
41
|
+
|
42
|
+
if options[:literal] or (options[:detect] and ToRegexp::String.literal?(str))
|
43
|
+
content = ::Regexp.escape str
|
44
|
+
elsif delim_set = REGEXP_DELIMITERS.detect { |k, _| str.start_with?(k) }
|
45
|
+
delim_start, delim_end = delim_set
|
46
|
+
/\A#{delim_start}(.*)#{delim_end}(#{INLINE_OPTIONS})\z/u =~ str
|
47
|
+
content = $1
|
48
|
+
inline_options = $2
|
49
|
+
return unless content.is_a?(::String)
|
50
|
+
content.gsub! '\\/', '/'
|
51
|
+
if inline_options
|
52
|
+
options[:ignore_case] = true if inline_options.include?('i')
|
53
|
+
options[:multiline] = true if inline_options.include?('m')
|
54
|
+
options[:extended] = true if inline_options.include?('x')
|
55
|
+
# 'n', 'N' = none, 'e', 'E' = EUC, 's', 'S' = SJIS, 'u', 'U' = UTF-8
|
56
|
+
options[:lang] = inline_options.scan(/[nesu]/i).join.downcase
|
57
|
+
end
|
58
|
+
else
|
59
|
+
return
|
60
|
+
end
|
61
|
+
|
62
|
+
ignore_case = options[:ignore_case] ? ::Regexp::IGNORECASE : 0
|
63
|
+
multiline = options[:multiline] ? ::Regexp::MULTILINE : 0
|
64
|
+
extended = options[:extended] ? ::Regexp::EXTENDED : 0
|
65
|
+
lang = options[:lang] || ''
|
66
|
+
if ::RUBY_VERSION > '1.9' and lang.include?('u')
|
67
|
+
lang = lang.delete 'u'
|
68
|
+
end
|
69
|
+
|
70
|
+
if lang.empty?
|
71
|
+
[ content, (ignore_case|multiline|extended) ]
|
72
|
+
else
|
73
|
+
[ content, (ignore_case|multiline|extended), lang ]
|
74
|
+
end
|
75
|
+
end
|
76
|
+
end
|
77
|
+
end
|
78
|
+
|
79
|
+
class String
|
80
|
+
include ToRegex::StringMixin
|
81
|
+
end
|
metadata
CHANGED
@@ -1,43 +1,15 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: wayback_machine_downloader
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.18
|
4
|
+
version: 0.2.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- hartator
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2015-11-
|
11
|
+
date: 2015-11-19 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
|
-
- !ruby/object:Gem::Dependency
|
14
|
-
name: pry-rescue
|
15
|
-
requirement: !ruby/object:Gem::Requirement
|
16
|
-
requirements:
|
17
|
-
- - "~>"
|
18
|
-
- !ruby/object:Gem::Version
|
19
|
-
version: '1.4'
|
20
|
-
type: :development
|
21
|
-
prerelease: false
|
22
|
-
version_requirements: !ruby/object:Gem::Requirement
|
23
|
-
requirements:
|
24
|
-
- - "~>"
|
25
|
-
- !ruby/object:Gem::Version
|
26
|
-
version: '1.4'
|
27
|
-
- !ruby/object:Gem::Dependency
|
28
|
-
name: pry-stack_explorer
|
29
|
-
requirement: !ruby/object:Gem::Requirement
|
30
|
-
requirements:
|
31
|
-
- - "~>"
|
32
|
-
- !ruby/object:Gem::Version
|
33
|
-
version: '0.4'
|
34
|
-
type: :development
|
35
|
-
prerelease: false
|
36
|
-
version_requirements: !ruby/object:Gem::Requirement
|
37
|
-
requirements:
|
38
|
-
- - "~>"
|
39
|
-
- !ruby/object:Gem::Version
|
40
|
-
version: '0.4'
|
41
13
|
- !ruby/object:Gem::Dependency
|
42
14
|
name: rake
|
43
15
|
requirement: !ruby/object:Gem::Requirement
|
@@ -78,6 +50,7 @@ files:
|
|
78
50
|
- bin/wayback_machine_downloader
|
79
51
|
- lib/wayback_machine_downloader.rb
|
80
52
|
- lib/wayback_machine_downloader/tidy_bytes.rb
|
53
|
+
- lib/wayback_machine_downloader/to_regex.rb
|
81
54
|
homepage: https://github.com/hartator/wayback-machine-downloader
|
82
55
|
licenses:
|
83
56
|
- MIT
|