wayback_machine_downloader 0.3.0 → 0.4.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: d29c33bd6a4ffc9cdceb326a266b3ba987e89ec4
4
- data.tar.gz: f739bd763030c3e32026812e4cbe6bca51c1ae8c
3
+ metadata.gz: f581f0aa09dbfecb7080ae0788073c38bef547b1
4
+ data.tar.gz: 8013f6f7b57cf41b8674e37eb0beb9dada4d1b2b
5
5
  SHA512:
6
- metadata.gz: 6ae55228a24711f5d1fc2cc40ee70326a2033fa031d1273a4f4b9432b2fe2c017699cd577a2e36e86ddd3c1fb1b299455796ebf6af97f109d7f3bbedb1f7963a
7
- data.tar.gz: f358009785bb2bb52d0017e762566eb25a919e938d5552c997df76cc64de0857b80a11d3c7284e22e393b96ba6cf3133908af0a7e24098fe3a756023ad77c706
6
+ metadata.gz: 02ce907c84817031c068a10c4b78eeaf08ff4b2c4805159d01bb1a0fb40eb4fa0d212a0a101f4c61cd1309dc6aafc0f95969759e0930dacddcc566f500051cfc
7
+ data.tar.gz: 7131a398e753f271a8db19c7ad1677de9241b3cddf951efa9bf561e97d10b8a81e8f41ae0a676d7eccd8ed0d76770c3b7fb582989f8cab2659af94fa07681d0c
@@ -8,13 +8,14 @@ require_relative 'wayback_machine_downloader/to_regex'
8
8
 
9
9
  class WaybackMachineDownloader
10
10
 
11
- VERSION = "0.3.0"
11
+ VERSION = "0.4.0"
12
12
 
13
- attr_accessor :base_url, :timestamp, :only_filter, :exclude_filter
13
+ attr_accessor :base_url, :from_timestamp, :to_timestamp, :only_filter, :exclude_filter
14
14
 
15
15
  def initialize params
16
16
  @base_url = params[:base_url]
17
- @timestamp = params[:timestamp].to_i
17
+ @from_timestamp = params[:from_timestamp].to_i
18
+ @to_timestamp = params[:to_timestamp].to_i
18
19
  @only_filter = params[:only_filter]
19
20
  @exclude_filter = params[:exclude_filter]
20
21
  end
@@ -54,20 +55,27 @@ class WaybackMachineDownloader
54
55
  end
55
56
 
56
57
  def get_file_list_curated
57
- index_file_list_raw = open "http://web.archive.org/cdx/search/xd?url=#{@base_url}"
58
- all_file_list_raw = open "http://web.archive.org/cdx/search/xd?url=#{@base_url}/*"
58
+ parameters_for_wayback_machine_api = "&fl=timestamp,original&fastLatest=true&filter=statuscode:200&collapse=original"
59
+ if @from_timestamp and @from_timestamp != 0
60
+ parameters_for_wayback_machine_api += "&from=" + @from_timestamp.to_s
61
+ end
62
+ if @to_timestamp and @to_timestamp != 0
63
+ parameters_for_wayback_machine_api += "&to=" + @to_timestamp.to_s
64
+ end
65
+ index_file_list_raw = open ("http://web.archive.org/cdx/search/xd?url=#{@base_url}" + parameters_for_wayback_machine_api)
66
+ all_file_list_raw = open ("http://web.archive.org/cdx/search/xd?url=#{@base_url}/*" + parameters_for_wayback_machine_api)
59
67
  file_list_curated = Hash.new
60
68
  [index_file_list_raw, all_file_list_raw].each do |file|
61
69
  file.each_line do |line|
62
70
  line = line.split(' ')
63
- file_timestamp = line[1].to_i
64
- file_url = line[2]
71
+ file_timestamp = line[0].to_i
72
+ file_url = line[1]
65
73
  file_id = file_url.split('/')[3..-1].join('/')
66
74
  file_id = CGI::unescape file_id
67
75
  file_id = file_id.tidy_bytes unless file_id == ""
68
76
  if file_id.nil?
69
77
  puts "Malformed file url, ignoring: #{file_url}"
70
- elsif @timestamp == 0 or file_timestamp <= @timestamp
78
+ else
71
79
  if match_exclude_filter(file_url)
72
80
  puts "File url matches exclude filter, ignoring: #{file_url}"
73
81
  elsif not match_only_filter(file_url)
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: wayback_machine_downloader
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.0
4
+ version: 0.4.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - hartator