wayback_machine_downloader 0.5.3 → 0.5.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/bin/wayback_machine_downloader +8 -4
- data/lib/wayback_machine_downloader.rb +18 -15
- data/lib/wayback_machine_downloader/archive_api.rb +26 -0
- metadata +2 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: f4dcc90f3f3f974aa5c894a6fda9e8158a7d8a2b
|
4
|
+
data.tar.gz: 560460143036a84357328e9b4dfba09a8b78e9d8
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 3808d953b979cc18a5efd0a0875b96e988649141b3591bfa3cf7eca96d6a06ed218c6d6418e8f3c20d12aff0de89fc10773d7381eb1a07f53aeea79816abe0aa
|
7
|
+
data.tar.gz: 1fd31782e408ae274546204033f7960accd8daeb79c5d09d62d53974298de717baf2c5c7e49f962e7a0101267c15a666c82941d7a9bd619bc7cd9d97e23e4343
|
data/bin/wayback_machine_downloader
CHANGED
@@ -14,6 +14,10 @@ option_parser = OptionParser.new do |opts|
|
|
14
14
|
opts.separator ""
|
15
15
|
opts.separator "Optional options:"
|
16
16
|
|
17
|
+
opts.on("-d", "--directory PATH", String, "Directory to save the downloaded files to. Default is ./websites/ plus the domain name.") do |t|
|
18
|
+
options[:directory] = t
|
19
|
+
end
|
20
|
+
|
17
21
|
opts.on("-f", "--from TIMESTAMP", Integer, "Only files on or after timestamp supplied (ie. 20060716231334)") do |t|
|
18
22
|
options[:from_timestamp] = t
|
19
23
|
end
|
@@ -34,14 +38,14 @@ option_parser = OptionParser.new do |opts|
|
|
34
38
|
options[:all] = true
|
35
39
|
end
|
36
40
|
|
37
|
-
opts.on("-l", "--list", "Only list file urls in a JSON format with the archived timestamps. Won't download anything.") do |t|
|
38
|
-
options[:list] = true
|
39
|
-
end
|
40
|
-
|
41
41
|
opts.on("-c", "--concurrency NUMBER", Integer, "Number of multiple files to dowload at a time. Default is one file at a time. (ie. 20)") do |t|
|
42
42
|
options[:threads_count] = t
|
43
43
|
end
|
44
44
|
|
45
|
+
opts.on("-l", "--list", "Only list file urls in a JSON format with the archived timestamps. Won't download anything.") do |t|
|
46
|
+
options[:list] = true
|
47
|
+
end
|
48
|
+
|
45
49
|
opts.on("-v", "--version", "Display version") do |t|
|
46
50
|
options[:version] = t
|
47
51
|
end
|
data/lib/wayback_machine_downloader.rb
CHANGED
@@ -1,20 +1,25 @@
|
|
1
1
|
# encoding: UTF-8
|
2
2
|
|
3
|
+
require 'net/http'
|
3
4
|
require 'open-uri'
|
4
5
|
require 'fileutils'
|
5
6
|
require 'cgi'
|
6
7
|
require 'json'
|
7
8
|
require_relative 'wayback_machine_downloader/tidy_bytes'
|
8
9
|
require_relative 'wayback_machine_downloader/to_regex'
|
10
|
+
require_relative 'wayback_machine_downloader/archive_api'
|
9
11
|
|
10
12
|
class WaybackMachineDownloader
|
11
13
|
|
12
|
-
|
14
|
+
include ArchiveAPI
|
13
15
|
|
14
|
-
|
16
|
+
VERSION = "0.5.4"
|
17
|
+
|
18
|
+
attr_accessor :base_url, :directory, :from_timestamp, :to_timestamp, :only_filter, :exclude_filter, :all, :list, :threads_count
|
15
19
|
|
16
20
|
def initialize params
|
17
21
|
@base_url = params[:base_url]
|
22
|
+
@directory = params[:directory]
|
18
23
|
@from_timestamp = params[:from_timestamp].to_i
|
19
24
|
@to_timestamp = params[:to_timestamp].to_i
|
20
25
|
@only_filter = params[:only_filter]
|
@@ -33,7 +38,15 @@ class WaybackMachineDownloader
|
|
33
38
|
end
|
34
39
|
|
35
40
|
def backup_path
|
36
|
-
|
41
|
+
if @directory
|
42
|
+
if @directory[-1] == '/'
|
43
|
+
@directory
|
44
|
+
else
|
45
|
+
@directory + '/'
|
46
|
+
end
|
47
|
+
else
|
48
|
+
'websites/' + backup_name + '/'
|
49
|
+
end
|
37
50
|
end
|
38
51
|
|
39
52
|
def match_only_filter file_url
|
@@ -63,18 +76,8 @@ class WaybackMachineDownloader
|
|
63
76
|
end
|
64
77
|
|
65
78
|
def get_file_list_curated
|
66
|
-
|
67
|
-
|
68
|
-
parameters_for_wayback_machine_api += "&filter=statuscode:200"
|
69
|
-
end
|
70
|
-
if @from_timestamp and @from_timestamp != 0
|
71
|
-
parameters_for_wayback_machine_api += "&from=" + @from_timestamp.to_s
|
72
|
-
end
|
73
|
-
if @to_timestamp and @to_timestamp != 0
|
74
|
-
parameters_for_wayback_machine_api += "&to=" + @to_timestamp.to_s
|
75
|
-
end
|
76
|
-
index_file_list_raw = open("http://web.archive.org/cdx/search/xd?url=#{@base_url}" + parameters_for_wayback_machine_api)
|
77
|
-
all_file_list_raw = open("http://web.archive.org/cdx/search/xd?url=#{@base_url}/*" + parameters_for_wayback_machine_api)
|
79
|
+
index_file_list_raw = get_raw_list_from_api(@base_url)
|
80
|
+
all_file_list_raw = get_raw_list_from_api(@base_url + '/*')
|
78
81
|
file_list_curated = Hash.new
|
79
82
|
[index_file_list_raw, all_file_list_raw].each do |file|
|
80
83
|
file.each_line do |line|
|
data/lib/wayback_machine_downloader/archive_api.rb
ADDED
@@ -0,0 +1,26 @@
|
|
1
|
+
module ArchiveAPI
|
2
|
+
|
3
|
+
def get_raw_list_from_api url
|
4
|
+
request_url = "http://web.archive.org/cdx/search/xd?url="
|
5
|
+
request_url += url
|
6
|
+
request_url += parameters_for_api
|
7
|
+
request_uri = URI.parse request_url
|
8
|
+
response = Net::HTTP.get_response request_uri
|
9
|
+
response.body
|
10
|
+
end
|
11
|
+
|
12
|
+
def parameters_for_api
|
13
|
+
parameters = "&fl=timestamp,original&collapse=original&gzip=false"
|
14
|
+
unless @all
|
15
|
+
parameters += "&filter=statuscode:200"
|
16
|
+
end
|
17
|
+
if @from_timestamp and @from_timestamp != 0
|
18
|
+
parameters += "&from=" + @from_timestamp.to_s
|
19
|
+
end
|
20
|
+
if @to_timestamp and @to_timestamp != 0
|
21
|
+
parameters += "&to=" + @to_timestamp.to_s
|
22
|
+
end
|
23
|
+
parameters
|
24
|
+
end
|
25
|
+
|
26
|
+
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: wayback_machine_downloader
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.5.3
|
4
|
+
version: 0.5.4
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- hartator
|
@@ -50,6 +50,7 @@ extra_rdoc_files: []
|
|
50
50
|
files:
|
51
51
|
- bin/wayback_machine_downloader
|
52
52
|
- lib/wayback_machine_downloader.rb
|
53
|
+
- lib/wayback_machine_downloader/archive_api.rb
|
53
54
|
- lib/wayback_machine_downloader/tidy_bytes.rb
|
54
55
|
- lib/wayback_machine_downloader/to_regex.rb
|
55
56
|
homepage: https://github.com/hartator/wayback-machine-downloader
|