wayback_machine_downloader 0.5.3 → 0.5.4

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: f13c4b2fd341ad88d3962eb775435ab41ddabb43
4
- data.tar.gz: 25b0d71d96dbeafd1e055a5d235544a1a23fd452
3
+ metadata.gz: f4dcc90f3f3f974aa5c894a6fda9e8158a7d8a2b
4
+ data.tar.gz: 560460143036a84357328e9b4dfba09a8b78e9d8
5
5
  SHA512:
6
- metadata.gz: 3b97a0422f514ca4026fd1235bb60103316f943906187ce50578c033c4e350820d0eab24c19db09198d40c16e44a6713431178c28ff962d2627b30c0e8d317e8
7
- data.tar.gz: 858cb6cbb51f32736d0e1f4496b555ae1d8fa15d97ce7d7f4cb3616115b6ee93cb0c2751f4b6d0cd1428c50109b461d2bbf503be4455bff99cef769f23d7b358
6
+ metadata.gz: 3808d953b979cc18a5efd0a0875b96e988649141b3591bfa3cf7eca96d6a06ed218c6d6418e8f3c20d12aff0de89fc10773d7381eb1a07f53aeea79816abe0aa
7
+ data.tar.gz: 1fd31782e408ae274546204033f7960accd8daeb79c5d09d62d53974298de717baf2c5c7e49f962e7a0101267c15a666c82941d7a9bd619bc7cd9d97e23e4343
@@ -14,6 +14,10 @@ option_parser = OptionParser.new do |opts|
14
14
  opts.separator ""
15
15
  opts.separator "Optional options:"
16
16
 
17
+ opts.on("-d", "--directory PATH", String, "Directory to save the downloaded files to. Default is ./websites/ plus the domain name.") do |t|
18
+ options[:directory] = t
19
+ end
20
+
17
21
  opts.on("-f", "--from TIMESTAMP", Integer, "Only files on or after timestamp supplied (ie. 20060716231334)") do |t|
18
22
  options[:from_timestamp] = t
19
23
  end
@@ -34,14 +38,14 @@ option_parser = OptionParser.new do |opts|
34
38
  options[:all] = true
35
39
  end
36
40
 
37
- opts.on("-l", "--list", "Only list file urls in a JSON format with the archived timestamps. Won't download anything.") do |t|
38
- options[:list] = true
39
- end
40
-
41
41
  opts.on("-c", "--concurrency NUMBER", Integer, "Number of multiple files to dowload at a time. Default is one file at a time. (ie. 20)") do |t|
42
42
  options[:threads_count] = t
43
43
  end
44
44
 
45
+ opts.on("-l", "--list", "Only list file urls in a JSON format with the archived timestamps. Won't download anything.") do |t|
46
+ options[:list] = true
47
+ end
48
+
45
49
  opts.on("-v", "--version", "Display version") do |t|
46
50
  options[:version] = t
47
51
  end
@@ -1,20 +1,25 @@
1
1
  # encoding: UTF-8
2
2
 
3
+ require 'net/http'
3
4
  require 'open-uri'
4
5
  require 'fileutils'
5
6
  require 'cgi'
6
7
  require 'json'
7
8
  require_relative 'wayback_machine_downloader/tidy_bytes'
8
9
  require_relative 'wayback_machine_downloader/to_regex'
10
+ require_relative 'wayback_machine_downloader/archive_api'
9
11
 
10
12
  class WaybackMachineDownloader
11
13
 
12
- VERSION = "0.5.3"
14
+ include ArchiveAPI
13
15
 
14
- attr_accessor :base_url, :from_timestamp, :to_timestamp, :only_filter, :exclude_filter, :all, :list, :threads_count
16
+ VERSION = "0.5.4"
17
+
18
+ attr_accessor :base_url, :directory, :from_timestamp, :to_timestamp, :only_filter, :exclude_filter, :all, :list, :threads_count
15
19
 
16
20
  def initialize params
17
21
  @base_url = params[:base_url]
22
+ @directory = params[:directory]
18
23
  @from_timestamp = params[:from_timestamp].to_i
19
24
  @to_timestamp = params[:to_timestamp].to_i
20
25
  @only_filter = params[:only_filter]
@@ -33,7 +38,15 @@ class WaybackMachineDownloader
33
38
  end
34
39
 
35
40
  def backup_path
36
- 'websites/' + backup_name + '/'
41
+ if @directory
42
+ if @directory[-1] == '/'
43
+ @directory
44
+ else
45
+ @directory + '/'
46
+ end
47
+ else
48
+ 'websites/' + backup_name + '/'
49
+ end
37
50
  end
38
51
 
39
52
  def match_only_filter file_url
@@ -63,18 +76,8 @@ class WaybackMachineDownloader
63
76
  end
64
77
 
65
78
  def get_file_list_curated
66
- parameters_for_wayback_machine_api = "&fl=timestamp,original&collapse=original"
67
- unless @all
68
- parameters_for_wayback_machine_api += "&filter=statuscode:200"
69
- end
70
- if @from_timestamp and @from_timestamp != 0
71
- parameters_for_wayback_machine_api += "&from=" + @from_timestamp.to_s
72
- end
73
- if @to_timestamp and @to_timestamp != 0
74
- parameters_for_wayback_machine_api += "&to=" + @to_timestamp.to_s
75
- end
76
- index_file_list_raw = open("http://web.archive.org/cdx/search/xd?url=#{@base_url}" + parameters_for_wayback_machine_api)
77
- all_file_list_raw = open("http://web.archive.org/cdx/search/xd?url=#{@base_url}/*" + parameters_for_wayback_machine_api)
79
+ index_file_list_raw = get_raw_list_from_api(@base_url)
80
+ all_file_list_raw = get_raw_list_from_api(@base_url + '/*')
78
81
  file_list_curated = Hash.new
79
82
  [index_file_list_raw, all_file_list_raw].each do |file|
80
83
  file.each_line do |line|
@@ -0,0 +1,26 @@
1
+ module ArchiveAPI
2
+
3
+ def get_raw_list_from_api url
4
+ request_url = "http://web.archive.org/cdx/search/xd?url="
5
+ request_url += url
6
+ request_url += parameters_for_api
7
+ request_uri = URI.parse request_url
8
+ response = Net::HTTP.get_response request_uri
9
+ response.body
10
+ end
11
+
12
+ def parameters_for_api
13
+ parameters = "&fl=timestamp,original&collapse=original&gzip=false"
14
+ unless @all
15
+ parameters += "&filter=statuscode:200"
16
+ end
17
+ if @from_timestamp and @from_timestamp != 0
18
+ parameters += "&from=" + @from_timestamp.to_s
19
+ end
20
+ if @to_timestamp and @to_timestamp != 0
21
+ parameters += "&to=" + @to_timestamp.to_s
22
+ end
23
+ parameters
24
+ end
25
+
26
+ end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: wayback_machine_downloader
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.5.3
4
+ version: 0.5.4
5
5
  platform: ruby
6
6
  authors:
7
7
  - hartator
@@ -50,6 +50,7 @@ extra_rdoc_files: []
50
50
  files:
51
51
  - bin/wayback_machine_downloader
52
52
  - lib/wayback_machine_downloader.rb
53
+ - lib/wayback_machine_downloader/archive_api.rb
53
54
  - lib/wayback_machine_downloader/tidy_bytes.rb
54
55
  - lib/wayback_machine_downloader/to_regex.rb
55
56
  homepage: https://github.com/hartator/wayback-machine-downloader