wayback_machine_downloader 0.5.3 → 0.5.4

This diff shows the changes between two publicly available versions of this package, each of which has been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: f13c4b2fd341ad88d3962eb775435ab41ddabb43
4
- data.tar.gz: 25b0d71d96dbeafd1e055a5d235544a1a23fd452
3
+ metadata.gz: f4dcc90f3f3f974aa5c894a6fda9e8158a7d8a2b
4
+ data.tar.gz: 560460143036a84357328e9b4dfba09a8b78e9d8
5
5
  SHA512:
6
- metadata.gz: 3b97a0422f514ca4026fd1235bb60103316f943906187ce50578c033c4e350820d0eab24c19db09198d40c16e44a6713431178c28ff962d2627b30c0e8d317e8
7
- data.tar.gz: 858cb6cbb51f32736d0e1f4496b555ae1d8fa15d97ce7d7f4cb3616115b6ee93cb0c2751f4b6d0cd1428c50109b461d2bbf503be4455bff99cef769f23d7b358
6
+ metadata.gz: 3808d953b979cc18a5efd0a0875b96e988649141b3591bfa3cf7eca96d6a06ed218c6d6418e8f3c20d12aff0de89fc10773d7381eb1a07f53aeea79816abe0aa
7
+ data.tar.gz: 1fd31782e408ae274546204033f7960accd8daeb79c5d09d62d53974298de717baf2c5c7e49f962e7a0101267c15a666c82941d7a9bd619bc7cd9d97e23e4343
@@ -14,6 +14,10 @@ option_parser = OptionParser.new do |opts|
14
14
  opts.separator ""
15
15
  opts.separator "Optional options:"
16
16
 
17
+ opts.on("-d", "--directory PATH", String, "Directory to save the downloaded files to. Default is ./websites/ plus the domain name.") do |t|
18
+ options[:directory] = t
19
+ end
20
+
17
21
  opts.on("-f", "--from TIMESTAMP", Integer, "Only files on or after timestamp supplied (ie. 20060716231334)") do |t|
18
22
  options[:from_timestamp] = t
19
23
  end
@@ -34,14 +38,14 @@ option_parser = OptionParser.new do |opts|
34
38
  options[:all] = true
35
39
  end
36
40
 
37
- opts.on("-l", "--list", "Only list file urls in a JSON format with the archived timestamps. Won't download anything.") do |t|
38
- options[:list] = true
39
- end
40
-
41
41
  opts.on("-c", "--concurrency NUMBER", Integer, "Number of multiple files to dowload at a time. Default is one file at a time. (ie. 20)") do |t|
42
42
  options[:threads_count] = t
43
43
  end
44
44
 
45
+ opts.on("-l", "--list", "Only list file urls in a JSON format with the archived timestamps. Won't download anything.") do |t|
46
+ options[:list] = true
47
+ end
48
+
45
49
  opts.on("-v", "--version", "Display version") do |t|
46
50
  options[:version] = t
47
51
  end
@@ -1,20 +1,25 @@
1
1
  # encoding: UTF-8
2
2
 
3
+ require 'net/http'
3
4
  require 'open-uri'
4
5
  require 'fileutils'
5
6
  require 'cgi'
6
7
  require 'json'
7
8
  require_relative 'wayback_machine_downloader/tidy_bytes'
8
9
  require_relative 'wayback_machine_downloader/to_regex'
10
+ require_relative 'wayback_machine_downloader/archive_api'
9
11
 
10
12
  class WaybackMachineDownloader
11
13
 
12
- VERSION = "0.5.3"
14
+ include ArchiveAPI
13
15
 
14
- attr_accessor :base_url, :from_timestamp, :to_timestamp, :only_filter, :exclude_filter, :all, :list, :threads_count
16
+ VERSION = "0.5.4"
17
+
18
+ attr_accessor :base_url, :directory, :from_timestamp, :to_timestamp, :only_filter, :exclude_filter, :all, :list, :threads_count
15
19
 
16
20
  def initialize params
17
21
  @base_url = params[:base_url]
22
+ @directory = params[:directory]
18
23
  @from_timestamp = params[:from_timestamp].to_i
19
24
  @to_timestamp = params[:to_timestamp].to_i
20
25
  @only_filter = params[:only_filter]
@@ -33,7 +38,15 @@ class WaybackMachineDownloader
33
38
  end
34
39
 
35
40
  def backup_path
36
- 'websites/' + backup_name + '/'
41
+ if @directory
42
+ if @directory[-1] == '/'
43
+ @directory
44
+ else
45
+ @directory + '/'
46
+ end
47
+ else
48
+ 'websites/' + backup_name + '/'
49
+ end
37
50
  end
38
51
 
39
52
  def match_only_filter file_url
@@ -63,18 +76,8 @@ class WaybackMachineDownloader
63
76
  end
64
77
 
65
78
  def get_file_list_curated
66
- parameters_for_wayback_machine_api = "&fl=timestamp,original&collapse=original"
67
- unless @all
68
- parameters_for_wayback_machine_api += "&filter=statuscode:200"
69
- end
70
- if @from_timestamp and @from_timestamp != 0
71
- parameters_for_wayback_machine_api += "&from=" + @from_timestamp.to_s
72
- end
73
- if @to_timestamp and @to_timestamp != 0
74
- parameters_for_wayback_machine_api += "&to=" + @to_timestamp.to_s
75
- end
76
- index_file_list_raw = open("http://web.archive.org/cdx/search/xd?url=#{@base_url}" + parameters_for_wayback_machine_api)
77
- all_file_list_raw = open("http://web.archive.org/cdx/search/xd?url=#{@base_url}/*" + parameters_for_wayback_machine_api)
79
+ index_file_list_raw = get_raw_list_from_api(@base_url)
80
+ all_file_list_raw = get_raw_list_from_api(@base_url + '/*')
78
81
  file_list_curated = Hash.new
79
82
  [index_file_list_raw, all_file_list_raw].each do |file|
80
83
  file.each_line do |line|
@@ -0,0 +1,26 @@
1
+ module ArchiveAPI
2
+
3
+ def get_raw_list_from_api url
4
+ request_url = "http://web.archive.org/cdx/search/xd?url="
5
+ request_url += url
6
+ request_url += parameters_for_api
7
+ request_uri = URI.parse request_url
8
+ response = Net::HTTP.get_response request_uri
9
+ response.body
10
+ end
11
+
12
+ def parameters_for_api
13
+ parameters = "&fl=timestamp,original&collapse=original&gzip=false"
14
+ unless @all
15
+ parameters += "&filter=statuscode:200"
16
+ end
17
+ if @from_timestamp and @from_timestamp != 0
18
+ parameters += "&from=" + @from_timestamp.to_s
19
+ end
20
+ if @to_timestamp and @to_timestamp != 0
21
+ parameters += "&to=" + @to_timestamp.to_s
22
+ end
23
+ parameters
24
+ end
25
+
26
+ end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: wayback_machine_downloader
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.5.3
4
+ version: 0.5.4
5
5
  platform: ruby
6
6
  authors:
7
7
  - hartator
@@ -50,6 +50,7 @@ extra_rdoc_files: []
50
50
  files:
51
51
  - bin/wayback_machine_downloader
52
52
  - lib/wayback_machine_downloader.rb
53
+ - lib/wayback_machine_downloader/archive_api.rb
53
54
  - lib/wayback_machine_downloader/tidy_bytes.rb
54
55
  - lib/wayback_machine_downloader/to_regex.rb
55
56
  homepage: https://github.com/hartator/wayback-machine-downloader