wayback_machine_downloader 0.1.18 → 0.2.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/bin/wayback_machine_downloader +8 -3
- data/lib/wayback_machine_downloader.rb +27 -4
- data/lib/wayback_machine_downloader/to_regex.rb +81 -0
- metadata +3 -30
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: eccabaecb58ce40f79f3bfcbac3a50eea4c55142
|
4
|
+
data.tar.gz: fd5673c7ee5404f8f73b73de9bf94c5b1e2d08ea
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 8db03676ee13a095e18d24ba98512852d00926d6142b4cba12b8053519384c9e96ca49ab542d45b310390248772db973b252a131937890a5ed3646c908a4d7a0
|
7
|
+
data.tar.gz: 8646cea23b376bfb80019ce670badeb5746d4cd2c8010e074f7157314e9112cece9845903a8c7ee00bb5f1bafaefbb5a05f700101245872f1070018f4ac65fd2
|
@@ -2,6 +2,7 @@
|
|
2
2
|
|
3
3
|
require_relative '../lib/wayback_machine_downloader'
|
4
4
|
require 'optparse'
|
5
|
+
require 'pp'
|
5
6
|
|
6
7
|
options = {}
|
7
8
|
option_parser = OptionParser.new do |opts|
|
@@ -11,19 +12,23 @@ option_parser = OptionParser.new do |opts|
|
|
11
12
|
opts.separator "Download any website from the Wayback Machine."
|
12
13
|
|
13
14
|
opts.separator ""
|
14
|
-
opts.separator "Optional
|
15
|
+
opts.separator "Optional options:"
|
15
16
|
|
16
17
|
opts.on("-t", "--timestamp TIMESTAMP", Integer, "Only files on or before timestamp supplied (ie. 20150806225358)") do |t|
|
17
18
|
options[:timestamp] = t
|
18
19
|
end
|
19
20
|
|
21
|
+
opts.on("-o", "--only ONLY_FILTER", String, "Restrict downloading to file urls matching the only filter supplied (use // notation for the only filter to be treated as a regex)") do |t|
|
22
|
+
options[:only_filter] = t
|
23
|
+
end
|
24
|
+
|
20
25
|
opts.on("-v", "--version", "Display version") do |t|
|
21
26
|
options[:version] = t
|
22
27
|
end
|
23
28
|
end.parse!
|
24
29
|
|
25
|
-
if base_url = ARGV[
|
26
|
-
wayback_machine_downloader = WaybackMachineDownloader.new base_url: base_url, timestamp: options[:timestamp]
|
30
|
+
if base_url = ARGV[-1]
|
31
|
+
wayback_machine_downloader = WaybackMachineDownloader.new base_url: base_url, timestamp: options[:timestamp], only_filter: options[:only_filter]
|
27
32
|
wayback_machine_downloader.download_files
|
28
33
|
elsif options[:version]
|
29
34
|
puts WaybackMachineDownloader::VERSION
|
@@ -1,16 +1,20 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
|
1
3
|
require 'open-uri'
|
2
4
|
require 'fileutils'
|
3
5
|
require_relative 'wayback_machine_downloader/tidy_bytes'
|
6
|
+
require_relative 'wayback_machine_downloader/to_regex'
|
4
7
|
|
5
8
|
class WaybackMachineDownloader
|
6
9
|
|
7
|
-
VERSION = "0.1.18"
|
10
|
+
VERSION = "0.2.0"
|
8
11
|
|
9
|
-
attr_accessor :base_url, :timestamp
|
12
|
+
attr_accessor :base_url, :timestamp, :only_filter
|
10
13
|
|
11
14
|
def initialize params
|
12
15
|
@base_url = params[:base_url]
|
13
16
|
@timestamp = params[:timestamp].to_i
|
17
|
+
@only_filter = params[:only_filter]
|
14
18
|
end
|
15
19
|
|
16
20
|
def backup_name
|
@@ -21,8 +25,21 @@ class WaybackMachineDownloader
|
|
21
25
|
'websites/' + backup_name + '/'
|
22
26
|
end
|
23
27
|
|
28
|
+
def match_only_filter file_url
|
29
|
+
if @only_filter
|
30
|
+
only_filter_regex = @only_filter.to_regex
|
31
|
+
if only_filter_regex
|
32
|
+
only_filter_regex =~ file_url
|
33
|
+
else
|
34
|
+
file_url.downcase.include? @only_filter.downcase
|
35
|
+
end
|
36
|
+
else
|
37
|
+
true
|
38
|
+
end
|
39
|
+
end
|
40
|
+
|
24
41
|
def get_file_list_curated
|
25
|
-
index_file_list_raw =
|
42
|
+
index_file_list_raw = open "http://web.archive.org/cdx/search/xd?url=#{@base_url}"
|
26
43
|
all_file_list_raw = open "http://web.archive.org/cdx/search/xd?url=#{@base_url}/*"
|
27
44
|
file_list_curated = Hash.new
|
28
45
|
[index_file_list_raw, all_file_list_raw].each do |file|
|
@@ -36,7 +53,9 @@ class WaybackMachineDownloader
|
|
36
53
|
if file_id.nil?
|
37
54
|
puts "Malformed file url, ignoring: #{file_url}"
|
38
55
|
elsif @timestamp == 0 or file_timestamp <= @timestamp
|
39
|
-
if
|
56
|
+
if not match_only_filter(file_url)
|
57
|
+
puts "File url not in supplied only filter, ignoring: #{file_url}"
|
58
|
+
elsif file_list_curated[file_id]
|
40
59
|
unless file_list_curated[file_id][:timestamp] > file_timestamp
|
41
60
|
file_list_curated[file_id] = {file_url: file_url, timestamp: file_timestamp}
|
42
61
|
end
|
@@ -62,6 +81,10 @@ class WaybackMachineDownloader
|
|
62
81
|
puts "Downloading #{@base_url} to #{backup_path} from Wayback Machine..."
|
63
82
|
puts
|
64
83
|
file_list_by_timestamp = get_file_list_by_timestamp
|
84
|
+
if file_list_by_timestamp.count == 0
|
85
|
+
puts "No files to download. Possible reasons:\n\t* Accept regex didn't let any files through (Accept Regex: \"#{@accept_regex.to_s}\")\n\t* Site is not in wayback machine."
|
86
|
+
return
|
87
|
+
end
|
65
88
|
count = 0
|
66
89
|
file_list_by_timestamp.each do |file_remote_info|
|
67
90
|
count += 1
|
@@ -0,0 +1,81 @@
|
|
1
|
+
module ToRegex
|
2
|
+
module StringMixin
|
3
|
+
class << self
|
4
|
+
def literal?(str)
|
5
|
+
REGEXP_DELIMITERS.none? { |s, e| str.start_with?(s) and str =~ /#{e}#{INLINE_OPTIONS}\z/ }
|
6
|
+
end
|
7
|
+
end
|
8
|
+
|
9
|
+
INLINE_OPTIONS = /[imxnesu]*/
|
10
|
+
REGEXP_DELIMITERS = {
|
11
|
+
'%r{' => '}',
|
12
|
+
'/' => '/',
|
13
|
+
}
|
14
|
+
|
15
|
+
# Get a regex back
|
16
|
+
#
|
17
|
+
# Without :literal or :detect, `"foo".to_regex` will return nil.
|
18
|
+
#
|
19
|
+
# @param [optional, Hash] options
|
20
|
+
# @option options [true,false] :literal Treat meta characters and other regexp codes as just text; always return a regexp
|
21
|
+
# @option options [true,false] :detect If string starts and ends with valid regexp delimiters, treat it as a regexp; otherwise, interpret it literally
|
22
|
+
# @option options [true,false] :ignore_case /foo/i
|
23
|
+
# @option options [true,false] :multiline /foo/m
|
24
|
+
# @option options [true,false] :extended /foo/x
|
25
|
+
# @option options [true,false] :lang /foo/[nesu]
|
26
|
+
def to_regex(options = {})
|
27
|
+
if args = as_regexp(options)
|
28
|
+
::Regexp.new *args
|
29
|
+
end
|
30
|
+
end
|
31
|
+
|
32
|
+
# Return arguments that can be passed to `Regexp.new`
|
33
|
+
# @see to_regexp
|
34
|
+
def as_regexp(options = {})
|
35
|
+
unless options.is_a?(::Hash)
|
36
|
+
raise ::ArgumentError, "[to_regexp] Options must be a Hash"
|
37
|
+
end
|
38
|
+
str = self
|
39
|
+
|
40
|
+
return if options[:detect] and str == ''
|
41
|
+
|
42
|
+
if options[:literal] or (options[:detect] and ToRegexp::String.literal?(str))
|
43
|
+
content = ::Regexp.escape str
|
44
|
+
elsif delim_set = REGEXP_DELIMITERS.detect { |k, _| str.start_with?(k) }
|
45
|
+
delim_start, delim_end = delim_set
|
46
|
+
/\A#{delim_start}(.*)#{delim_end}(#{INLINE_OPTIONS})\z/u =~ str
|
47
|
+
content = $1
|
48
|
+
inline_options = $2
|
49
|
+
return unless content.is_a?(::String)
|
50
|
+
content.gsub! '\\/', '/'
|
51
|
+
if inline_options
|
52
|
+
options[:ignore_case] = true if inline_options.include?('i')
|
53
|
+
options[:multiline] = true if inline_options.include?('m')
|
54
|
+
options[:extended] = true if inline_options.include?('x')
|
55
|
+
# 'n', 'N' = none, 'e', 'E' = EUC, 's', 'S' = SJIS, 'u', 'U' = UTF-8
|
56
|
+
options[:lang] = inline_options.scan(/[nesu]/i).join.downcase
|
57
|
+
end
|
58
|
+
else
|
59
|
+
return
|
60
|
+
end
|
61
|
+
|
62
|
+
ignore_case = options[:ignore_case] ? ::Regexp::IGNORECASE : 0
|
63
|
+
multiline = options[:multiline] ? ::Regexp::MULTILINE : 0
|
64
|
+
extended = options[:extended] ? ::Regexp::EXTENDED : 0
|
65
|
+
lang = options[:lang] || ''
|
66
|
+
if ::RUBY_VERSION > '1.9' and lang.include?('u')
|
67
|
+
lang = lang.delete 'u'
|
68
|
+
end
|
69
|
+
|
70
|
+
if lang.empty?
|
71
|
+
[ content, (ignore_case|multiline|extended) ]
|
72
|
+
else
|
73
|
+
[ content, (ignore_case|multiline|extended), lang ]
|
74
|
+
end
|
75
|
+
end
|
76
|
+
end
|
77
|
+
end
|
78
|
+
|
79
|
+
class String
|
80
|
+
include ToRegex::StringMixin
|
81
|
+
end
|
metadata
CHANGED
@@ -1,43 +1,15 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: wayback_machine_downloader
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.18
|
4
|
+
version: 0.2.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- hartator
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2015-11-
|
11
|
+
date: 2015-11-19 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
|
-
- !ruby/object:Gem::Dependency
|
14
|
-
name: pry-rescue
|
15
|
-
requirement: !ruby/object:Gem::Requirement
|
16
|
-
requirements:
|
17
|
-
- - "~>"
|
18
|
-
- !ruby/object:Gem::Version
|
19
|
-
version: '1.4'
|
20
|
-
type: :development
|
21
|
-
prerelease: false
|
22
|
-
version_requirements: !ruby/object:Gem::Requirement
|
23
|
-
requirements:
|
24
|
-
- - "~>"
|
25
|
-
- !ruby/object:Gem::Version
|
26
|
-
version: '1.4'
|
27
|
-
- !ruby/object:Gem::Dependency
|
28
|
-
name: pry-stack_explorer
|
29
|
-
requirement: !ruby/object:Gem::Requirement
|
30
|
-
requirements:
|
31
|
-
- - "~>"
|
32
|
-
- !ruby/object:Gem::Version
|
33
|
-
version: '0.4'
|
34
|
-
type: :development
|
35
|
-
prerelease: false
|
36
|
-
version_requirements: !ruby/object:Gem::Requirement
|
37
|
-
requirements:
|
38
|
-
- - "~>"
|
39
|
-
- !ruby/object:Gem::Version
|
40
|
-
version: '0.4'
|
41
13
|
- !ruby/object:Gem::Dependency
|
42
14
|
name: rake
|
43
15
|
requirement: !ruby/object:Gem::Requirement
|
@@ -78,6 +50,7 @@ files:
|
|
78
50
|
- bin/wayback_machine_downloader
|
79
51
|
- lib/wayback_machine_downloader.rb
|
80
52
|
- lib/wayback_machine_downloader/tidy_bytes.rb
|
53
|
+
- lib/wayback_machine_downloader/to_regex.rb
|
81
54
|
homepage: https://github.com/hartator/wayback-machine-downloader
|
82
55
|
licenses:
|
83
56
|
- MIT
|