wayback_machine_downloader 0.1.18 → 0.2.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 1a37e6ffa46aae434f606fd8d887c2017a0a06a1
4
- data.tar.gz: c05f60c12fec76ec3709371a31fff7f5de17782f
3
+ metadata.gz: eccabaecb58ce40f79f3bfcbac3a50eea4c55142
4
+ data.tar.gz: fd5673c7ee5404f8f73b73de9bf94c5b1e2d08ea
5
5
  SHA512:
6
- metadata.gz: 24a3096abfdc4506a873bb784380d169d1a0b45c39f5c1f91538f7c1f39d67d86b0a7d8f74103406b35d4718ab732a2ca442ee8bcbc27caf7bb3db2406530130
7
- data.tar.gz: c19c30ccf76b8993e054aff2dabadb000e2b4fe90eff5ae61e93a170ef2aae79fc543b699e78b1df3df23c7b0ef1bc625b55da556943de9554aee3e732aa58a1
6
+ metadata.gz: 8db03676ee13a095e18d24ba98512852d00926d6142b4cba12b8053519384c9e96ca49ab542d45b310390248772db973b252a131937890a5ed3646c908a4d7a0
7
+ data.tar.gz: 8646cea23b376bfb80019ce670badeb5746d4cd2c8010e074f7157314e9112cece9845903a8c7ee00bb5f1bafaefbb5a05f700101245872f1070018f4ac65fd2
@@ -2,6 +2,7 @@
2
2
 
3
3
  require_relative '../lib/wayback_machine_downloader'
4
4
  require 'optparse'
5
+ require 'pp'
5
6
 
6
7
  options = {}
7
8
  option_parser = OptionParser.new do |opts|
@@ -11,19 +12,23 @@ option_parser = OptionParser.new do |opts|
11
12
  opts.separator "Download any website from the Wayback Machine."
12
13
 
13
14
  opts.separator ""
14
- opts.separator "Optional option:"
15
+ opts.separator "Optional options:"
15
16
 
16
17
  opts.on("-t", "--timestamp TIMESTAMP", Integer, "Only files on or before timestamp supplied (ie. 20150806225358)") do |t|
17
18
  options[:timestamp] = t
18
19
  end
19
20
 
21
+ opts.on("-o", "--only ONLY_FILTER", String, "Restrict downloading to file urls matching the only filter supplied (use // notation for the only filter to be treated as a regex)") do |t|
22
+ options[:only_filter] = t
23
+ end
24
+
20
25
  opts.on("-v", "--version", "Display version") do |t|
21
26
  options[:version] = t
22
27
  end
23
28
  end.parse!
24
29
 
25
- if base_url = ARGV[0]
26
- wayback_machine_downloader = WaybackMachineDownloader.new base_url: base_url, timestamp: options[:timestamp]
30
+ if base_url = ARGV[-1]
31
+ wayback_machine_downloader = WaybackMachineDownloader.new base_url: base_url, timestamp: options[:timestamp], only_filter: options[:only_filter]
27
32
  wayback_machine_downloader.download_files
28
33
  elsif options[:version]
29
34
  puts WaybackMachineDownloader::VERSION
@@ -1,16 +1,20 @@
1
+ # encoding: UTF-8
2
+
1
3
  require 'open-uri'
2
4
  require 'fileutils'
3
5
  require_relative 'wayback_machine_downloader/tidy_bytes'
6
+ require_relative 'wayback_machine_downloader/to_regex'
4
7
 
5
8
  class WaybackMachineDownloader
6
9
 
7
- VERSION = "0.1.18"
10
+ VERSION = "0.2.0"
8
11
 
9
- attr_accessor :base_url, :timestamp
12
+ attr_accessor :base_url, :timestamp, :only_filter
10
13
 
11
14
  def initialize params
12
15
  @base_url = params[:base_url]
13
16
  @timestamp = params[:timestamp].to_i
17
+ @only_filter = params[:only_filter]
14
18
  end
15
19
 
16
20
  def backup_name
@@ -21,8 +25,21 @@ class WaybackMachineDownloader
21
25
  'websites/' + backup_name + '/'
22
26
  end
23
27
 
28
# Decide whether a file url passes the user supplied "only" filter.
#
# When no filter was given every url passes. When the filter string converts
# to a regex (via String#to_regex) the url is matched against that regex;
# otherwise the filter is treated as a case-insensitive substring.
#
# @param file_url [String] url of the archived file being considered
# @return [true, false, Integer, nil] truthy when the url should be kept
#   (the regex branch returns the match position or nil)
def match_only_filter file_url
  return true unless @only_filter

  pattern = @only_filter.to_regex
  return pattern =~ file_url if pattern

  # Not regex notation: fall back to a plain, case-insensitive containment test.
  file_url.downcase.include? @only_filter.downcase
end
40
+
24
41
  def get_file_list_curated
25
- index_file_list_raw = open "http://web.archive.org/cdx/search/xd?url=#{@base_url}"
42
+ index_file_list_raw = open "http://web.archive.org/cdx/search/xd?url=#{@base_url}"
26
43
  all_file_list_raw = open "http://web.archive.org/cdx/search/xd?url=#{@base_url}/*"
27
44
  file_list_curated = Hash.new
28
45
  [index_file_list_raw, all_file_list_raw].each do |file|
@@ -36,7 +53,9 @@ class WaybackMachineDownloader
36
53
  if file_id.nil?
37
54
  puts "Malformed file url, ignoring: #{file_url}"
38
55
  elsif @timestamp == 0 or file_timestamp <= @timestamp
39
- if file_list_curated[file_id]
56
+ if not match_only_filter(file_url)
57
+ puts "File url not in supplied only filter, ignoring: #{file_url}"
58
+ elsif file_list_curated[file_id]
40
59
  unless file_list_curated[file_id][:timestamp] > file_timestamp
41
60
  file_list_curated[file_id] = {file_url: file_url, timestamp: file_timestamp}
42
61
  end
@@ -62,6 +81,10 @@ class WaybackMachineDownloader
62
81
  puts "Downloading #{@base_url} to #{backup_path} from Wayback Machine..."
63
82
  puts
64
83
  file_list_by_timestamp = get_file_list_by_timestamp
84
+ if file_list_by_timestamp.count == 0
85
+ puts "No files to download. Possible reasons:\n\t* Accept regex didn't let any files through (Accept Regex: \"#{@accept_regex.to_s}\")\n\t* Site is not in wayback machine."
86
+ return
87
+ end
65
88
  count = 0
66
89
  file_list_by_timestamp.each do |file_remote_info|
67
90
  count += 1
@@ -0,0 +1,81 @@
1
# Adds String#to_regex, which converts strings written in regexp notation
# (for example "/foo/i" or "%r{foo}") into Regexp objects.
module ToRegex
  module StringMixin
    # Flags allowed after the closing delimiter, e.g. the "i" in "/foo/i".
    INLINE_OPTIONS = /[imxnesu]*/
    # Opening delimiter => closing delimiter for each supported notation.
    REGEXP_DELIMITERS = {
      '%r{' => '}',
      '/' => '/',
    }

    class << self
      # Tell whether a string should be read as plain text rather than as a
      # delimited regexp.
      #
      # @param str [String]
      # @return [Boolean] true when str does not both start with a known
      #   opening delimiter and end with its closing delimiter plus flags
      def literal?(str)
        REGEXP_DELIMITERS.none? do |opener, closer|
          str.start_with?(opener) && str =~ /#{::Regexp.escape(closer)}#{INLINE_OPTIONS}\z/
        end
      end
    end

    # Get a regex back
    #
    # Without :literal or :detect, `"foo".to_regex` will return nil.
    #
    # @param [optional, Hash] options
    # @option options [true,false] :literal Treat meta characters and other regexp codes as just text; always return a regexp
    # @option options [true,false] :detect If string starts and ends with valid regexp delimiters, treat it as a regexp; otherwise, interpret it literally
    # @option options [true,false] :ignore_case /foo/i
    # @option options [true,false] :multiline /foo/m
    # @option options [true,false] :extended /foo/x
    # @option options [true,false] :lang /foo/[nesu]
    # @return [Regexp, nil] nil when the string is not in regexp notation
    #   and neither :literal nor :detect was requested
    # @raise [ArgumentError] when options is not a Hash
    def to_regex(options = {})
      args = as_regexp(options)
      ::Regexp.new(*args) if args
    end

    # Return arguments that can be passed to `Regexp.new`
    #
    # @param [optional, Hash] options same options as #to_regex
    # @return [Array, nil] [source, flags] or [source, flags, lang]; nil when
    #   the string cannot be interpreted as a regexp
    # @raise [ArgumentError] when options is not a Hash
    # @see #to_regex
    def as_regexp(options = {})
      unless options.is_a?(::Hash)
        raise ::ArgumentError, "[to_regexp] Options must be a Hash"
      end
      # Work on a copy so flags parsed from the string never leak into the
      # caller's hash.
      options = options.dup
      str = self

      return if options[:detect] && str == ''

      if options[:literal] || (options[:detect] && ToRegex::StringMixin.literal?(str))
        content = ::Regexp.escape(str)
      elsif (delim_set = REGEXP_DELIMITERS.detect { |opener, _| str.start_with?(opener) })
        delim_start, delim_end = delim_set
        match = /\A#{::Regexp.escape(delim_start)}(.*)#{::Regexp.escape(delim_end)}(#{INLINE_OPTIONS})\z/u.match(str)
        # Opening delimiter present but no matching close: not a regexp.
        return unless match
        # "\/" only needed escaping inside the slash notation; unescape it.
        content = match[1].gsub('\\/', '/')
        inline_options = match[2]
        options[:ignore_case] = true if inline_options.include?('i')
        options[:multiline] = true if inline_options.include?('m')
        options[:extended] = true if inline_options.include?('x')
        # 'n', 'N' = none, 'e', 'E' = EUC, 's', 'S' = SJIS, 'u', 'U' = UTF-8
        options[:lang] = inline_options.scan(/[nesu]/i).join.downcase
      else
        return
      end

      ignore_case = options[:ignore_case] ? ::Regexp::IGNORECASE : 0
      multiline = options[:multiline] ? ::Regexp::MULTILINE : 0
      extended = options[:extended] ? ::Regexp::EXTENDED : 0
      lang = options[:lang] || ''
      # The 'u' language flag is dropped on Ruby newer than 1.9.
      if ::RUBY_VERSION > '1.9' && lang.include?('u')
        lang = lang.delete 'u'
      end

      if lang.empty?
        [ content, (ignore_case|multiline|extended) ]
      else
        # NOTE(review): recent Ruby versions appear to deprecate/remove the
        # third (language) argument of Regexp.new — confirm before relying
        # on the n/e/s flags.
        [ content, (ignore_case|multiline|extended), lang ]
      end
    end
  end
end

class String
  include ToRegex::StringMixin
end
metadata CHANGED
@@ -1,43 +1,15 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: wayback_machine_downloader
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.18
4
+ version: 0.2.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - hartator
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2015-11-05 00:00:00.000000000 Z
11
+ date: 2015-11-19 00:00:00.000000000 Z
12
12
  dependencies:
13
- - !ruby/object:Gem::Dependency
14
- name: pry-rescue
15
- requirement: !ruby/object:Gem::Requirement
16
- requirements:
17
- - - "~>"
18
- - !ruby/object:Gem::Version
19
- version: '1.4'
20
- type: :development
21
- prerelease: false
22
- version_requirements: !ruby/object:Gem::Requirement
23
- requirements:
24
- - - "~>"
25
- - !ruby/object:Gem::Version
26
- version: '1.4'
27
- - !ruby/object:Gem::Dependency
28
- name: pry-stack_explorer
29
- requirement: !ruby/object:Gem::Requirement
30
- requirements:
31
- - - "~>"
32
- - !ruby/object:Gem::Version
33
- version: '0.4'
34
- type: :development
35
- prerelease: false
36
- version_requirements: !ruby/object:Gem::Requirement
37
- requirements:
38
- - - "~>"
39
- - !ruby/object:Gem::Version
40
- version: '0.4'
41
13
  - !ruby/object:Gem::Dependency
42
14
  name: rake
43
15
  requirement: !ruby/object:Gem::Requirement
@@ -78,6 +50,7 @@ files:
78
50
  - bin/wayback_machine_downloader
79
51
  - lib/wayback_machine_downloader.rb
80
52
  - lib/wayback_machine_downloader/tidy_bytes.rb
53
+ - lib/wayback_machine_downloader/to_regex.rb
81
54
  homepage: https://github.com/hartator/wayback-machine-downloader
82
55
  licenses:
83
56
  - MIT