wayback_machine_downloader 0.3.0 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: d29c33bd6a4ffc9cdceb326a266b3ba987e89ec4
4
- data.tar.gz: f739bd763030c3e32026812e4cbe6bca51c1ae8c
3
+ metadata.gz: f581f0aa09dbfecb7080ae0788073c38bef547b1
4
+ data.tar.gz: 8013f6f7b57cf41b8674e37eb0beb9dada4d1b2b
5
5
  SHA512:
6
- metadata.gz: 6ae55228a24711f5d1fc2cc40ee70326a2033fa031d1273a4f4b9432b2fe2c017699cd577a2e36e86ddd3c1fb1b299455796ebf6af97f109d7f3bbedb1f7963a
7
- data.tar.gz: f358009785bb2bb52d0017e762566eb25a919e938d5552c997df76cc64de0857b80a11d3c7284e22e393b96ba6cf3133908af0a7e24098fe3a756023ad77c706
6
+ metadata.gz: 02ce907c84817031c068a10c4b78eeaf08ff4b2c4805159d01bb1a0fb40eb4fa0d212a0a101f4c61cd1309dc6aafc0f95969759e0930dacddcc566f500051cfc
7
+ data.tar.gz: 7131a398e753f271a8db19c7ad1677de9241b3cddf951efa9bf561e97d10b8a81e8f41ae0a676d7eccd8ed0d76770c3b7fb582989f8cab2659af94fa07681d0c
@@ -8,13 +8,14 @@ require_relative 'wayback_machine_downloader/to_regex'
8
8
 
9
9
  class WaybackMachineDownloader
10
10
 
11
- VERSION = "0.3.0"
11
+ VERSION = "0.4.0"
12
12
 
13
- attr_accessor :base_url, :timestamp, :only_filter, :exclude_filter
13
+ attr_accessor :base_url, :from_timestamp, :to_timestamp, :only_filter, :exclude_filter
14
14
 
15
15
  def initialize params
16
16
  @base_url = params[:base_url]
17
- @timestamp = params[:timestamp].to_i
17
+ @from_timestamp = params[:from_timestamp].to_i
18
+ @to_timestamp = params[:to_timestamp].to_i
18
19
  @only_filter = params[:only_filter]
19
20
  @exclude_filter = params[:exclude_filter]
20
21
  end
@@ -54,20 +55,27 @@ class WaybackMachineDownloader
54
55
  end
55
56
 
56
57
  def get_file_list_curated
57
- index_file_list_raw = open "http://web.archive.org/cdx/search/xd?url=#{@base_url}"
58
- all_file_list_raw = open "http://web.archive.org/cdx/search/xd?url=#{@base_url}/*"
58
+ parameters_for_wayback_machine_api = "&fl=timestamp,original&fastLatest=true&filter=statuscode:200&collapse=original"
59
+ if @from_timestamp and @from_timestamp != 0
60
+ parameters_for_wayback_machine_api += "&from=" + @from_timestamp.to_s
61
+ end
62
+ if @to_timestamp and @to_timestamp != 0
63
+ parameters_for_wayback_machine_api += "&to=" + @to_timestamp.to_s
64
+ end
65
+ index_file_list_raw = open ("http://web.archive.org/cdx/search/xd?url=#{@base_url}" + parameters_for_wayback_machine_api)
66
+ all_file_list_raw = open ("http://web.archive.org/cdx/search/xd?url=#{@base_url}/*" + parameters_for_wayback_machine_api)
59
67
  file_list_curated = Hash.new
60
68
  [index_file_list_raw, all_file_list_raw].each do |file|
61
69
  file.each_line do |line|
62
70
  line = line.split(' ')
63
- file_timestamp = line[1].to_i
64
- file_url = line[2]
71
+ file_timestamp = line[0].to_i
72
+ file_url = line[1]
65
73
  file_id = file_url.split('/')[3..-1].join('/')
66
74
  file_id = CGI::unescape file_id
67
75
  file_id = file_id.tidy_bytes unless file_id == ""
68
76
  if file_id.nil?
69
77
  puts "Malformed file url, ignoring: #{file_url}"
70
- elsif @timestamp == 0 or file_timestamp <= @timestamp
78
+ else
71
79
  if match_exclude_filter(file_url)
72
80
  puts "File url matches exclude filter, ignoring: #{file_url}"
73
81
  elsif not match_only_filter(file_url)
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: wayback_machine_downloader
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.0
4
+ version: 0.4.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - hartator