wayback_machine_downloader 0.1.18 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 1a37e6ffa46aae434f606fd8d887c2017a0a06a1
4
- data.tar.gz: c05f60c12fec76ec3709371a31fff7f5de17782f
3
+ metadata.gz: eccabaecb58ce40f79f3bfcbac3a50eea4c55142
4
+ data.tar.gz: fd5673c7ee5404f8f73b73de9bf94c5b1e2d08ea
5
5
  SHA512:
6
- metadata.gz: 24a3096abfdc4506a873bb784380d169d1a0b45c39f5c1f91538f7c1f39d67d86b0a7d8f74103406b35d4718ab732a2ca442ee8bcbc27caf7bb3db2406530130
7
- data.tar.gz: c19c30ccf76b8993e054aff2dabadb000e2b4fe90eff5ae61e93a170ef2aae79fc543b699e78b1df3df23c7b0ef1bc625b55da556943de9554aee3e732aa58a1
6
+ metadata.gz: 8db03676ee13a095e18d24ba98512852d00926d6142b4cba12b8053519384c9e96ca49ab542d45b310390248772db973b252a131937890a5ed3646c908a4d7a0
7
+ data.tar.gz: 8646cea23b376bfb80019ce670badeb5746d4cd2c8010e074f7157314e9112cece9845903a8c7ee00bb5f1bafaefbb5a05f700101245872f1070018f4ac65fd2
@@ -2,6 +2,7 @@
2
2
 
3
3
  require_relative '../lib/wayback_machine_downloader'
4
4
  require 'optparse'
5
+ require 'pp'
5
6
 
6
7
  options = {}
7
8
  option_parser = OptionParser.new do |opts|
@@ -11,19 +12,23 @@ option_parser = OptionParser.new do |opts|
11
12
  opts.separator "Download any website from the Wayback Machine."
12
13
 
13
14
  opts.separator ""
14
- opts.separator "Optional option:"
15
+ opts.separator "Optional options:"
15
16
 
16
17
  opts.on("-t", "--timestamp TIMESTAMP", Integer, "Only files on or before timestamp supplied (ie. 20150806225358)") do |t|
17
18
  options[:timestamp] = t
18
19
  end
19
20
 
21
+ opts.on("-o", "--only ONLY_FILTER", String, "Restrict downloading to file urls matching the only filter supplied (use // notation for the only filter to be treated as a regex)") do |t|
22
+ options[:only_filter] = t
23
+ end
24
+
20
25
  opts.on("-v", "--version", "Display version") do |t|
21
26
  options[:version] = t
22
27
  end
23
28
  end.parse!
24
29
 
25
- if base_url = ARGV[0]
26
- wayback_machine_downloader = WaybackMachineDownloader.new base_url: base_url, timestamp: options[:timestamp]
30
+ if base_url = ARGV[-1]
31
+ wayback_machine_downloader = WaybackMachineDownloader.new base_url: base_url, timestamp: options[:timestamp], only_filter: options[:only_filter]
27
32
  wayback_machine_downloader.download_files
28
33
  elsif options[:version]
29
34
  puts WaybackMachineDownloader::VERSION
@@ -1,16 +1,20 @@
1
+ # encoding: UTF-8
2
+
1
3
  require 'open-uri'
2
4
  require 'fileutils'
3
5
  require_relative 'wayback_machine_downloader/tidy_bytes'
6
+ require_relative 'wayback_machine_downloader/to_regex'
4
7
 
5
8
  class WaybackMachineDownloader
6
9
 
7
- VERSION = "0.1.18"
10
+ VERSION = "0.2.0"
8
11
 
9
- attr_accessor :base_url, :timestamp
12
+ attr_accessor :base_url, :timestamp, :only_filter
10
13
 
11
14
  def initialize params
12
15
  @base_url = params[:base_url]
13
16
  @timestamp = params[:timestamp].to_i
17
+ @only_filter = params[:only_filter]
14
18
  end
15
19
 
16
20
  def backup_name
@@ -21,8 +25,21 @@ class WaybackMachineDownloader
21
25
  'websites/' + backup_name + '/'
22
26
  end
23
27
 
28
# Decide whether +file_url+ passes the user-supplied "only" filter.
#
# * No filter set: every URL passes.
# * Filter for which String#to_regex yields a Regexp (e.g. "/\.jpg$/i"):
#   the URL must match that regex.
# * Any other filter: case-insensitive substring test.
#
# @param file_url [String] archived file URL to test
# @return [Boolean] true when the file should be kept
def match_only_filter file_url
  return true unless @only_filter

  only_filter_regex = @only_filter.to_regex
  if only_filter_regex
    # =~ returns a match index or nil; normalise so every branch of this
    # predicate returns a real boolean.
    !(only_filter_regex =~ file_url).nil?
  else
    file_url.downcase.include? @only_filter.downcase
  end
end
40
+
24
41
  def get_file_list_curated
25
- index_file_list_raw = open "http://web.archive.org/cdx/search/xd?url=#{@base_url}"
42
+ index_file_list_raw = open "http://web.archive.org/cdx/search/xd?url=#{@base_url}"
26
43
  all_file_list_raw = open "http://web.archive.org/cdx/search/xd?url=#{@base_url}/*"
27
44
  file_list_curated = Hash.new
28
45
  [index_file_list_raw, all_file_list_raw].each do |file|
@@ -36,7 +53,9 @@ class WaybackMachineDownloader
36
53
  if file_id.nil?
37
54
  puts "Malformed file url, ignoring: #{file_url}"
38
55
  elsif @timestamp == 0 or file_timestamp <= @timestamp
39
- if file_list_curated[file_id]
56
+ if not match_only_filter(file_url)
57
+ puts "File url not in supplied only filter, ignoring: #{file_url}"
58
+ elsif file_list_curated[file_id]
40
59
  unless file_list_curated[file_id][:timestamp] > file_timestamp
41
60
  file_list_curated[file_id] = {file_url: file_url, timestamp: file_timestamp}
42
61
  end
@@ -62,6 +81,10 @@ class WaybackMachineDownloader
62
81
  puts "Downloading #{@base_url} to #{backup_path} from Wayback Machine..."
63
82
  puts
64
83
  file_list_by_timestamp = get_file_list_by_timestamp
84
# Nothing survived filtering (or the site is not archived): explain why and
# stop before the download loop. The message reports @only_filter, the
# instance variable that actually stores the --only filter; @accept_regex is
# never assigned, so interpolating it always printed an empty string.
if file_list_by_timestamp.count == 0
  puts "No files to download. Possible reasons:\n\t* Only filter didn't let any files through (Only filter: \"#{@only_filter.to_s}\")\n\t* Site is not in wayback machine."
  return
end
65
88
  count = 0
66
89
  file_list_by_timestamp.each do |file_remote_info|
67
90
  count += 1
@@ -0,0 +1,81 @@
1
# Turns strings written in regexp-literal notation ("/foo/i", "%r{foo}")
# into Regexp objects. Mixed into String at the bottom of this file.
module ToRegex
  module StringMixin
    # Option letters that may trail the closing delimiter.
    INLINE_OPTIONS = /[imxnesu]*/
    # Opening delimiter => closing delimiter.
    REGEXP_DELIMITERS = {
      '%r{' => '}',
      '/' => '/',
    }

    # Tell whether +str+ should be treated as plain text.
    #
    # @param str [String]
    # @return [Boolean] true when +str+ is not wrapped in any known pair of
    #   regexp delimiters (optionally followed by inline option letters)
    def self.literal?(str)
      REGEXP_DELIMITERS.none? do |opening, closing|
        str.start_with?(opening) && str =~ /#{::Regexp.escape(closing)}#{INLINE_OPTIONS}\z/
      end
    end

    # Get a regex back
    #
    # Without :literal or :detect, `"foo".to_regex` will return nil.
    #
    # @param [optional, Hash] options
    # @option options [true,false] :literal Treat meta characters and other regexp codes as just text; always return a regexp
    # @option options [true,false] :detect If string starts and ends with valid regexp delimiters, treat it as a regexp; otherwise, interpret it literally
    # @option options [true,false] :ignore_case /foo/i
    # @option options [true,false] :multiline /foo/m
    # @option options [true,false] :extended /foo/x
    # @option options [true,false] :lang /foo/[nesu]
    # @return [Regexp, nil] nil when the string cannot be interpreted as a regexp
    def to_regex(options = {})
      args = as_regexp(options)
      ::Regexp.new(*args) if args
    end

    # Return arguments that can be passed to `Regexp.new`
    #
    # @param [optional, Hash] options same keys as #to_regex
    # @return [Array, nil] [source, flags] or [source, flags, lang]; nil when
    #   the string is not in regexp notation and neither :literal nor :detect
    #   applies
    # @raise [ArgumentError] when +options+ is not a Hash
    # @see to_regex
    def as_regexp(options = {})
      unless options.is_a?(::Hash)
        raise ::ArgumentError, "[to_regexp] Options must be a Hash"
      end
      str = self

      return if options[:detect] && str == ''

      # Work on a copy: inline flags found in the string are merged into the
      # options below and must not leak into the caller's hash.
      options = options.dup

      if options[:literal] || (options[:detect] && ToRegex::StringMixin.literal?(str))
        content = ::Regexp.escape(str)
      elsif (delim_set = REGEXP_DELIMITERS.detect { |opening, _| str.start_with?(opening) })
        delim_start, delim_end = delim_set
        # Delimiters are escaped so they are always matched literally.
        match = /\A#{::Regexp.escape(delim_start)}(.*)#{::Regexp.escape(delim_end)}(#{INLINE_OPTIONS})\z/u.match(str)
        # Opening delimiter present but no proper closing delimiter.
        return unless match
        # "\/" is only needed inside a /.../ literal; unescape it.
        content = match[1].gsub('\\/', '/')
        inline_options = match[2]
        options[:ignore_case] = true if inline_options.include?('i')
        options[:multiline] = true if inline_options.include?('m')
        options[:extended] = true if inline_options.include?('x')
        # 'n', 'N' = none, 'e', 'E' = EUC, 's', 'S' = SJIS, 'u', 'U' = UTF-8
        options[:lang] = inline_options.scan(/[nesu]/i).join.downcase
      else
        return
      end

      flags = 0
      flags |= ::Regexp::IGNORECASE if options[:ignore_case]
      flags |= ::Regexp::MULTILINE if options[:multiline]
      flags |= ::Regexp::EXTENDED if options[:extended]

      lang = options[:lang] || ''
      # The 'u' language code is dropped on Ruby newer than 1.9.
      lang = lang.delete('u') if ::RUBY_VERSION > '1.9' && lang.include?('u')

      lang.empty? ? [content, flags] : [content, flags, lang]
    end
  end
end

class String
  include ToRegex::StringMixin
end
metadata CHANGED
@@ -1,43 +1,15 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: wayback_machine_downloader
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.18
4
+ version: 0.2.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - hartator
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2015-11-05 00:00:00.000000000 Z
11
+ date: 2015-11-19 00:00:00.000000000 Z
12
12
  dependencies:
13
- - !ruby/object:Gem::Dependency
14
- name: pry-rescue
15
- requirement: !ruby/object:Gem::Requirement
16
- requirements:
17
- - - "~>"
18
- - !ruby/object:Gem::Version
19
- version: '1.4'
20
- type: :development
21
- prerelease: false
22
- version_requirements: !ruby/object:Gem::Requirement
23
- requirements:
24
- - - "~>"
25
- - !ruby/object:Gem::Version
26
- version: '1.4'
27
- - !ruby/object:Gem::Dependency
28
- name: pry-stack_explorer
29
- requirement: !ruby/object:Gem::Requirement
30
- requirements:
31
- - - "~>"
32
- - !ruby/object:Gem::Version
33
- version: '0.4'
34
- type: :development
35
- prerelease: false
36
- version_requirements: !ruby/object:Gem::Requirement
37
- requirements:
38
- - - "~>"
39
- - !ruby/object:Gem::Version
40
- version: '0.4'
41
13
  - !ruby/object:Gem::Dependency
42
14
  name: rake
43
15
  requirement: !ruby/object:Gem::Requirement
@@ -78,6 +50,7 @@ files:
78
50
  - bin/wayback_machine_downloader
79
51
  - lib/wayback_machine_downloader.rb
80
52
  - lib/wayback_machine_downloader/tidy_bytes.rb
53
+ - lib/wayback_machine_downloader/to_regex.rb
81
54
  homepage: https://github.com/hartator/wayback-machine-downloader
82
55
  licenses:
83
56
  - MIT