wayback_machine_downloader 0.5.3 → 0.5.4
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/bin/wayback_machine_downloader +8 -4
- data/lib/wayback_machine_downloader.rb +18 -15
- data/lib/wayback_machine_downloader/archive_api.rb +26 -0
- metadata +2 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: f4dcc90f3f3f974aa5c894a6fda9e8158a7d8a2b
|
4
|
+
data.tar.gz: 560460143036a84357328e9b4dfba09a8b78e9d8
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 3808d953b979cc18a5efd0a0875b96e988649141b3591bfa3cf7eca96d6a06ed218c6d6418e8f3c20d12aff0de89fc10773d7381eb1a07f53aeea79816abe0aa
|
7
|
+
data.tar.gz: 1fd31782e408ae274546204033f7960accd8daeb79c5d09d62d53974298de717baf2c5c7e49f962e7a0101267c15a666c82941d7a9bd619bc7cd9d97e23e4343
|
@@ -14,6 +14,10 @@ option_parser = OptionParser.new do |opts|
|
|
14
14
|
opts.separator ""
|
15
15
|
opts.separator "Optional options:"
|
16
16
|
|
17
|
+
opts.on("-d", "--directory PATH", String, "Directory to save the downloaded files to. Default is ./websites/ plus the domain name.") do |t|
|
18
|
+
options[:directory] = t
|
19
|
+
end
|
20
|
+
|
17
21
|
opts.on("-f", "--from TIMESTAMP", Integer, "Only files on or after timestamp supplied (ie. 20060716231334)") do |t|
|
18
22
|
options[:from_timestamp] = t
|
19
23
|
end
|
@@ -34,14 +38,14 @@ option_parser = OptionParser.new do |opts|
|
|
34
38
|
options[:all] = true
|
35
39
|
end
|
36
40
|
|
37
|
-
opts.on("-l", "--list", "Only list file urls in a JSON format with the archived timestamps. Won't download anything.") do |t|
|
38
|
-
options[:list] = true
|
39
|
-
end
|
40
|
-
|
41
41
|
opts.on("-c", "--concurrency NUMBER", Integer, "Number of multiple files to dowload at a time. Default is one file at a time. (ie. 20)") do |t|
|
42
42
|
options[:threads_count] = t
|
43
43
|
end
|
44
44
|
|
45
|
+
opts.on("-l", "--list", "Only list file urls in a JSON format with the archived timestamps. Won't download anything.") do |t|
|
46
|
+
options[:list] = true
|
47
|
+
end
|
48
|
+
|
45
49
|
opts.on("-v", "--version", "Display version") do |t|
|
46
50
|
options[:version] = t
|
47
51
|
end
|
@@ -1,20 +1,25 @@
|
|
1
1
|
# encoding: UTF-8
|
2
2
|
|
3
|
+
require 'net/http'
|
3
4
|
require 'open-uri'
|
4
5
|
require 'fileutils'
|
5
6
|
require 'cgi'
|
6
7
|
require 'json'
|
7
8
|
require_relative 'wayback_machine_downloader/tidy_bytes'
|
8
9
|
require_relative 'wayback_machine_downloader/to_regex'
|
10
|
+
require_relative 'wayback_machine_downloader/archive_api'
|
9
11
|
|
10
12
|
class WaybackMachineDownloader
|
11
13
|
|
12
|
-
|
14
|
+
include ArchiveAPI
|
13
15
|
|
14
|
-
|
16
|
+
VERSION = "0.5.4"
|
17
|
+
|
18
|
+
attr_accessor :base_url, :directory, :from_timestamp, :to_timestamp, :only_filter, :exclude_filter, :all, :list, :threads_count
|
15
19
|
|
16
20
|
def initialize params
|
17
21
|
@base_url = params[:base_url]
|
22
|
+
@directory = params[:directory]
|
18
23
|
@from_timestamp = params[:from_timestamp].to_i
|
19
24
|
@to_timestamp = params[:to_timestamp].to_i
|
20
25
|
@only_filter = params[:only_filter]
|
@@ -33,7 +38,15 @@ class WaybackMachineDownloader
|
|
33
38
|
end
|
34
39
|
|
35
40
|
def backup_path
|
36
|
-
|
41
|
+
if @directory
|
42
|
+
if @directory[-1] == '/'
|
43
|
+
@directory
|
44
|
+
else
|
45
|
+
@directory + '/'
|
46
|
+
end
|
47
|
+
else
|
48
|
+
'websites/' + backup_name + '/'
|
49
|
+
end
|
37
50
|
end
|
38
51
|
|
39
52
|
def match_only_filter file_url
|
@@ -63,18 +76,8 @@ class WaybackMachineDownloader
|
|
63
76
|
end
|
64
77
|
|
65
78
|
def get_file_list_curated
|
66
|
-
|
67
|
-
|
68
|
-
parameters_for_wayback_machine_api += "&filter=statuscode:200"
|
69
|
-
end
|
70
|
-
if @from_timestamp and @from_timestamp != 0
|
71
|
-
parameters_for_wayback_machine_api += "&from=" + @from_timestamp.to_s
|
72
|
-
end
|
73
|
-
if @to_timestamp and @to_timestamp != 0
|
74
|
-
parameters_for_wayback_machine_api += "&to=" + @to_timestamp.to_s
|
75
|
-
end
|
76
|
-
index_file_list_raw = open("http://web.archive.org/cdx/search/xd?url=#{@base_url}" + parameters_for_wayback_machine_api)
|
77
|
-
all_file_list_raw = open("http://web.archive.org/cdx/search/xd?url=#{@base_url}/*" + parameters_for_wayback_machine_api)
|
79
|
+
index_file_list_raw = get_raw_list_from_api(@base_url)
|
80
|
+
all_file_list_raw = get_raw_list_from_api(@base_url + '/*')
|
78
81
|
file_list_curated = Hash.new
|
79
82
|
[index_file_list_raw, all_file_list_raw].each do |file|
|
80
83
|
file.each_line do |line|
|
@@ -0,0 +1,26 @@
|
|
1
|
+
module ArchiveAPI
|
2
|
+
|
3
|
+
def get_raw_list_from_api url
|
4
|
+
request_url = "http://web.archive.org/cdx/search/xd?url="
|
5
|
+
request_url += url
|
6
|
+
request_url += parameters_for_api
|
7
|
+
request_uri = URI.parse request_url
|
8
|
+
response = Net::HTTP.get_response request_uri
|
9
|
+
response.body
|
10
|
+
end
|
11
|
+
|
12
|
+
def parameters_for_api
|
13
|
+
parameters = "&fl=timestamp,original&collapse=original&gzip=false"
|
14
|
+
unless @all
|
15
|
+
parameters += "&filter=statuscode:200"
|
16
|
+
end
|
17
|
+
if @from_timestamp and @from_timestamp != 0
|
18
|
+
parameters += "&from=" + @from_timestamp.to_s
|
19
|
+
end
|
20
|
+
if @to_timestamp and @to_timestamp != 0
|
21
|
+
parameters += "&to=" + @to_timestamp.to_s
|
22
|
+
end
|
23
|
+
parameters
|
24
|
+
end
|
25
|
+
|
26
|
+
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: wayback_machine_downloader
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.5.3
|
4
|
+
version: 0.5.4
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- hartator
|
@@ -50,6 +50,7 @@ extra_rdoc_files: []
|
|
50
50
|
files:
|
51
51
|
- bin/wayback_machine_downloader
|
52
52
|
- lib/wayback_machine_downloader.rb
|
53
|
+
- lib/wayback_machine_downloader/archive_api.rb
|
53
54
|
- lib/wayback_machine_downloader/tidy_bytes.rb
|
54
55
|
- lib/wayback_machine_downloader/to_regex.rb
|
55
56
|
homepage: https://github.com/hartator/wayback-machine-downloader
|