wayback_machine_downloader 0.3.0 → 0.4.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/wayback_machine_downloader.rb +16 -8
- metadata +1 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: f581f0aa09dbfecb7080ae0788073c38bef547b1
|
4
|
+
data.tar.gz: 8013f6f7b57cf41b8674e37eb0beb9dada4d1b2b
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 02ce907c84817031c068a10c4b78eeaf08ff4b2c4805159d01bb1a0fb40eb4fa0d212a0a101f4c61cd1309dc6aafc0f95969759e0930dacddcc566f500051cfc
|
7
|
+
data.tar.gz: 7131a398e753f271a8db19c7ad1677de9241b3cddf951efa9bf561e97d10b8a81e8f41ae0a676d7eccd8ed0d76770c3b7fb582989f8cab2659af94fa07681d0c
|
@@ -8,13 +8,14 @@ require_relative 'wayback_machine_downloader/to_regex'
|
|
8
8
|
|
9
9
|
class WaybackMachineDownloader
|
10
10
|
|
11
|
-
VERSION = "0.3.0"
|
11
|
+
VERSION = "0.4.0"
|
12
12
|
|
13
|
-
attr_accessor :base_url, :
|
13
|
+
attr_accessor :base_url, :from_timestamp, :to_timestamp, :only_filter, :exclude_filter
|
14
14
|
|
15
15
|
def initialize params
|
16
16
|
@base_url = params[:base_url]
|
17
|
-
@
|
17
|
+
@from_timestamp = params[:from_timestamp].to_i
|
18
|
+
@to_timestamp = params[:to_timestamp].to_i
|
18
19
|
@only_filter = params[:only_filter]
|
19
20
|
@exclude_filter = params[:exclude_filter]
|
20
21
|
end
|
@@ -54,20 +55,27 @@ class WaybackMachineDownloader
|
|
54
55
|
end
|
55
56
|
|
56
57
|
def get_file_list_curated
|
57
|
-
|
58
|
-
|
58
|
+
parameters_for_wayback_machine_api = "&fl=timestamp,original&fastLatest=true&filter=statuscode:200&collapse=original"
|
59
|
+
if @from_timestamp and @from_timestamp != 0
|
60
|
+
parameters_for_wayback_machine_api += "&from=" + @from_timestamp.to_s
|
61
|
+
end
|
62
|
+
if @to_timestamp and @to_timestamp != 0
|
63
|
+
parameters_for_wayback_machine_api += "&to=" + @to_timestamp.to_s
|
64
|
+
end
|
65
|
+
index_file_list_raw = open ("http://web.archive.org/cdx/search/xd?url=#{@base_url}" + parameters_for_wayback_machine_api)
|
66
|
+
all_file_list_raw = open ("http://web.archive.org/cdx/search/xd?url=#{@base_url}/*" + parameters_for_wayback_machine_api)
|
59
67
|
file_list_curated = Hash.new
|
60
68
|
[index_file_list_raw, all_file_list_raw].each do |file|
|
61
69
|
file.each_line do |line|
|
62
70
|
line = line.split(' ')
|
63
|
-
file_timestamp = line[
|
64
|
-
file_url = line[
|
71
|
+
file_timestamp = line[0].to_i
|
72
|
+
file_url = line[1]
|
65
73
|
file_id = file_url.split('/')[3..-1].join('/')
|
66
74
|
file_id = CGI::unescape file_id
|
67
75
|
file_id = file_id.tidy_bytes unless file_id == ""
|
68
76
|
if file_id.nil?
|
69
77
|
puts "Malformed file url, ignoring: #{file_url}"
|
70
|
-
|
78
|
+
else
|
71
79
|
if match_exclude_filter(file_url)
|
72
80
|
puts "File url matches exclude filter, ignoring: #{file_url}"
|
73
81
|
elsif not match_only_filter(file_url)
|