archaeo 0.2.4 → 0.2.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/lib/archaeo/cli.rb CHANGED
@@ -9,6 +9,10 @@ module Archaeo
9
9
  class Cli < Thor
10
10
  map %w[--version -v] => :version
11
11
 
12
+ def self.exit_on_failure?
13
+ true
14
+ end
15
+
12
16
  desc "version", "Show archaeo version"
13
17
  def version
14
18
  puts "archaeo #{VERSION}"
@@ -26,51 +30,64 @@ module Archaeo
26
30
  option :format, desc: "Output format (table, json, csv)",
27
31
  default: "table"
28
32
  def snapshots(url)
29
- cdx = CdxApi.new
30
- opts = build_cdx_options(options)
31
- snaps = cdx.snapshots(url, **opts).to_a
32
- case options[:format]
33
- when "json" then output_json(snaps)
34
- when "csv" then output_csv(snaps)
35
- else output_table(snaps)
33
+ fmt = validate_output_format
34
+ handle_errors do
35
+ snaps = fetch_snapshots(url)
36
+ output_formatted(snaps, fmt)
36
37
  end
37
38
  end
38
39
 
39
40
  desc "near URL TIMESTAMP",
40
41
  "Find the snapshot closest to a timestamp"
42
+ option :format, desc: "Output format (url, json)", default: "url"
41
43
  def near(url, timestamp)
42
- snap = CdxApi.new.near(url, timestamp: timestamp)
43
- puts snap.archive_url
44
+ handle_errors do
45
+ snap = CdxApi.new.near(url, timestamp: timestamp)
46
+ output_snapshot(snap)
47
+ end
44
48
  end
45
49
 
46
50
  desc "oldest URL", "Find the oldest snapshot of a URL"
51
+ option :format, desc: "Output format (url, json)", default: "url"
47
52
  def oldest(url)
48
- snap = CdxApi.new.oldest(url)
49
- puts snap.archive_url
53
+ handle_errors do
54
+ snap = CdxApi.new.oldest(url)
55
+ output_snapshot(snap)
56
+ end
50
57
  end
51
58
 
52
59
  desc "newest URL", "Find the newest snapshot of a URL"
60
+ option :format, desc: "Output format (url, json)", default: "url"
53
61
  def newest(url)
54
- snap = CdxApi.new.newest(url)
55
- puts snap.archive_url
62
+ handle_errors do
63
+ snap = CdxApi.new.newest(url)
64
+ output_snapshot(snap)
65
+ end
56
66
  end
57
67
 
58
68
  desc "available URL", "Check if a URL is archived"
69
+ option :timestamp, desc: "Check near this timestamp (YYYYMMDDHHmmss)"
59
70
  def available(url)
60
- result = AvailabilityApi.new.near(url)
61
- if result.available?
62
- puts "Available: #{result.archive_url}"
63
- else
64
- puts "Not available"
65
- exit 1
71
+ handle_errors do
72
+ result = AvailabilityApi.new.near(
73
+ url, timestamp: options[:timestamp]
74
+ )
75
+ if result.available?
76
+ puts "Available: #{result.archive_url}"
77
+ else
78
+ puts "Not available"
79
+ exit 1
80
+ end
66
81
  end
67
82
  end
68
83
 
69
84
  desc "save URL", "Save a URL to the Wayback Machine"
70
85
  def save(url)
71
- result = SaveApi.new.save(url)
72
- label = result.cached? ? "Cached" : "Saved"
73
- puts "#{label}: #{result.archive_url}"
86
+ handle_errors do
87
+ result = SaveApi.new.save(url)
88
+ label = result.cached? ? "Cached" : "Saved"
89
+ puts "#{label}: #{result.archive_url}"
90
+ end
74
91
  end
75
92
 
76
93
  desc "fetch URL TIMESTAMP",
@@ -79,15 +96,12 @@ module Archaeo
79
96
  desc: "Fetch raw (identity) content"
80
97
  option :output, desc: "Write content to file"
81
98
  def fetch(url, timestamp)
82
- page = Fetcher.new.fetch(
83
- url, timestamp: timestamp,
84
- identity: options[:identity]
85
- )
86
-
87
- if options[:output]
88
- write_output(options[:output], page.content)
89
- else
90
- $stdout.write(page.content)
99
+ handle_errors do
100
+ page = Fetcher.new.fetch(
101
+ url, timestamp: timestamp,
102
+ identity: options[:identity]
103
+ )
104
+ output_page(page)
91
105
  end
92
106
  end
93
107
 
@@ -97,38 +111,34 @@ module Archaeo
97
111
  option :to, desc: "End timestamp (YYYYMMDDHHmmss)"
98
112
  option :resume, type: :boolean, default: false,
99
113
  desc: "Resume interrupted download"
114
+ option :concurrency, type: :numeric, default: 1,
115
+ desc: "Number of parallel downloads"
100
116
  def download(url)
101
- downloader = BulkDownloader.new(output_dir: options[:output])
102
-
103
- downloader.download(
104
- url,
105
- from: options[:from],
106
- to: options[:to],
107
- resume: options[:resume],
108
- ) do |current, total, snap|
109
- warn "[#{current}/#{total}] " \
110
- "#{snap.timestamp} #{snap.original_url}"
117
+ handle_errors do
118
+ downloader = BulkDownloader.new(
119
+ output_dir: options[:output],
120
+ concurrency: options[:concurrency],
121
+ )
122
+ download_with_progress(downloader, url)
111
123
  end
112
124
  end
113
125
 
114
126
  desc "known_urls DOMAIN",
115
127
  "List all known URLs for a domain"
116
128
  def known_urls(domain)
117
- CdxApi.new.known_urls(domain).each do |u|
118
- puts u
129
+ handle_errors do
130
+ CdxApi.new.known_urls(domain).each do |u|
131
+ puts u
132
+ end
119
133
  end
120
134
  end
121
135
 
122
136
  desc "num_pages URL",
123
137
  "Show number of CDX result pages for a URL"
124
138
  def num_pages(url)
125
- puts CdxApi.new.num_pages(url)
126
- rescue RateLimitError => e
127
- warn "Error: #{e.message}"
128
- exit 1
129
- rescue Error => e
130
- warn "Error: #{e.message}"
131
- exit 1
139
+ handle_errors do
140
+ puts CdxApi.new.num_pages(url)
141
+ end
132
142
  end
133
143
 
134
144
  CDX_OPTION_MAP = {
@@ -143,6 +153,79 @@ module Archaeo
143
153
 
144
154
  private
145
155
 
156
# Runs the given block and turns any Archaeo error it raises into a
# one-line message on stderr followed by exit status 1.
#
# The message prefix depends on the error class: RateLimitError gives
# "Rate limited", NoSnapshotFound gives "Not found", BlockedSiteError gives
# "Blocked", and any other Error gives "Error".
#
# @return whatever the block returns when no error is raised
def handle_errors
  yield
rescue RateLimitError, NoSnapshotFound, BlockedSiteError, Error => e
  prefix =
    case e
    when RateLimitError then "Rate limited"
    when NoSnapshotFound then "Not found"
    when BlockedSiteError then "Blocked"
    else "Error"
    end
  warn "#{prefix}: #{e.message}"
  exit 1
end
171
+
172
# Resolves the --format option for the snapshots listing.
#
# A missing or empty value means "table". Any value other than table, json
# or csv prints a message to stderr and exits with status 1.
#
# @return [String] one of "table", "json", "csv"
def validate_output_format
  requested = options[:format].to_s
  chosen = requested.empty? ? "table" : requested
  return chosen if %w[table json csv].include?(chosen)

  warn "Unknown format '#{chosen}'. Use: table, json, csv"
  exit 1
end
181
+
182
+ def fetch_snapshots(url)
183
+ cdx = CdxApi.new
184
+ opts = build_cdx_options(options)
185
+ cdx.snapshots(url, **opts).to_a
186
+ end
187
+
188
+ def output_formatted(snaps, fmt)
189
+ case fmt
190
+ when "json" then output_json(snaps)
191
+ when "csv" then output_csv(snaps)
192
+ else output_table(snaps)
193
+ end
194
+ end
195
+
196
# Prints a single snapshot in the format selected by --format.
#
# "json" emits one JSON object: the fields of the snapshot's #as_json plus
# an :archive_url key, so JSON consumers receive the same archive URL that
# the default format prints. Any other value prints only the archive URL.
#
# @param snap [#as_json, #archive_url] snapshot to print
def output_snapshot(snap)
  unless options[:format] == "json"
    puts snap.archive_url
    return
  end

  record = snap.as_json.merge(archive_url: snap.archive_url)
  puts JSON.generate(record)
end
204
+
205
# Delivers fetched page content to the destination the user asked for.
#
# With --output, the content goes to that file via write_output. Otherwise
# it is written to stdout, except that content which is neither text nor
# JSON is refused when stdout is an interactive terminal; in that case a
# hint is printed to stderr and the process exits with status 1.
# When stdout is redirected or piped the content is always written, so
# shell redirection keeps working for binary payloads.
#
# @param page [#content, #content_type, #text?, #json?] fetched page
def output_page(page)
  return write_output(options[:output], page.content) if options[:output]

  printable = page.text? || page.json?
  if printable || !$stdout.tty?
    $stdout.write(page.content)
  else
    warn "Binary content (#{page.content_type}). " \
         "Use --output FILE to save."
    exit 1
  end
end
216
+
217
+ def download_with_progress(downloader, url)
218
+ downloader.download(
219
+ url,
220
+ from: options[:from],
221
+ to: options[:to],
222
+ resume: options[:resume],
223
+ ) do |current, total, snap|
224
+ warn "[#{current}/#{total}] " \
225
+ "#{snap.timestamp} #{snap.original_url}"
226
+ end
227
+ end
228
+
146
229
  def build_cdx_options(opts)
147
230
  CDX_OPTION_MAP.each_with_object({}) do |(cli_key, api_key), result|
148
231
  value = opts[cli_key]
@@ -158,14 +241,7 @@ module Archaeo
158
241
  end
159
242
 
160
243
  def output_json(snaps)
161
- data = snaps.map do |snap|
162
- {
163
- timestamp: snap.timestamp.to_s,
164
- status_code: snap.status_code,
165
- url: snap.original_url,
166
- archive_url: snap.archive_url,
167
- }
168
- end
244
+ data = snaps.map(&:as_json)
169
245
  puts JSON.generate(data)
170
246
  end
171
247
 
@@ -1,5 +1,7 @@
1
1
  # frozen_string_literal: true
2
2
 
3
+ require "set"
4
+
3
5
  module Archaeo
4
6
  # Tracks download progress for resume support.
5
7
  #
@@ -17,16 +19,21 @@ module Archaeo
17
19
  end
18
20
 
19
21
  def completed?(timestamp)
20
- timestamps.include?(timestamp.to_s)
22
+ timestamps_set.include?(timestamp.to_s)
21
23
  end
22
24
 
23
25
  def mark_completed(timestamp)
24
- timestamps << timestamp.to_s
26
+ ts = timestamp.to_s
27
+ return if timestamps_set.include?(ts)
28
+
29
+ timestamps << ts
30
+ @timestamps_set = nil
25
31
  save
26
32
  end
27
33
 
28
34
  def clear
29
35
  @timestamps = []
36
+ @timestamps_set = nil
30
37
  FileUtils.rm_f(@path)
31
38
  end
32
39
 
@@ -36,6 +43,10 @@ module Archaeo
36
43
  @timestamps ||= load_timestamps
37
44
  end
38
45
 
46
+ def timestamps_set
47
+ @timestamps_set ||= timestamps.to_set
48
+ end
49
+
39
50
  def load_timestamps
40
51
  return [] unless File.exist?(@path)
41
52
 
@@ -43,7 +54,10 @@ module Archaeo
43
54
  end
44
55
 
45
56
  def save
46
- File.write(@path, "#{timestamps.uniq.sort.join("\n")}\n")
57
+ content = "#{timestamps.sort.join("\n")}\n"
58
+ tmp_path = "#{@path}.tmp"
59
+ File.write(tmp_path, content)
60
+ File.rename(tmp_path, @path)
47
61
  end
48
62
  end
49
63
  end
@@ -15,6 +15,10 @@ module Archaeo
15
15
  DEFAULT_TIMEOUT = 30
16
16
  DEFAULT_MAX_RETRIES = 3
17
17
  DEFAULT_RETRY_DELAY = 2
18
+ MAX_POOL_SIZE = 8
19
+ MAX_IDLE_TIME = 60
20
+
21
+ RETRIABLE_STATUSES = [429, 502, 503, 504].freeze
18
22
 
19
23
  TRANSIENT_ERRORS = [
20
24
  Net::ReadTimeout,
@@ -64,7 +68,16 @@ module Archaeo
64
68
  @retry_delay = retry_delay
65
69
  @user_agent = user_agent
66
70
  @connections = {}
71
+ @last_used = {}
67
72
  @mutex = Mutex.new
73
+ @shutdown = false
74
+ end
75
+
76
+ def self.open(**opts)
77
+ client = new(**opts)
78
+ yield client
79
+ ensure
80
+ client&.shutdown
68
81
  end
69
82
 
70
83
  def get(url, headers: {})
@@ -81,6 +94,9 @@ module Archaeo
81
94
 
82
95
  def shutdown
83
96
  @mutex.synchronize do
97
+ return if @shutdown
98
+
99
+ @shutdown = true
84
100
  @connections.each_value do |http|
85
101
  http.finish
86
102
  rescue StandardError
@@ -103,16 +119,26 @@ module Archaeo
103
119
  def connection_for(uri)
104
120
  key = connection_key(uri)
105
121
  @mutex.synchronize do
122
+ evict_if_pool_full(key)
106
123
  http = @connections[key]
107
124
  if http && !http.active?
108
- @connections.delete(key)
125
+ close_connection(key)
109
126
  http = nil
110
127
  end
111
128
  @connections[key] = build_connection(uri) unless http
129
+ @last_used[key] = Time.now
112
130
  @connections[key]
113
131
  end
114
132
  end
115
133
 
134
+ def evict_if_pool_full(key)
135
+ evict_stale_connections
136
+ return unless @connections.size >= MAX_POOL_SIZE &&
137
+ !@connections.key?(key)
138
+
139
+ evict_lru
140
+ end
141
+
116
142
  def build_connection(uri)
117
143
  http = Net::HTTP.new(uri.host, uri.port)
118
144
  http.use_ssl = uri.scheme == "https"
@@ -124,26 +150,82 @@ module Archaeo
124
150
 
125
151
  def invalidate_connection(uri)
126
152
  key = connection_key(uri)
127
- @mutex.synchronize do
128
- http = @connections.delete(key)
129
- begin
130
- http&.finish
131
- rescue StandardError
132
- nil
133
- end
153
+ @mutex.synchronize { close_connection(key) }
154
+ end
155
+
156
+ def close_connection(key)
157
+ http = @connections.delete(key)
158
+ @last_used.delete(key)
159
+ begin
160
+ http&.finish
161
+ rescue StandardError
162
+ nil
163
+ end
164
+ end
165
+
166
# Closes every pooled connection that has been idle for longer than
# MAX_IDLE_TIME seconds. A connection with no recorded last-use time is
# treated as just used and kept.
def evict_stale_connections
  now = Time.now
  # Collect the stale keys first: close_connection deletes from
  # @connections, so closing inside the loop would modify the hash while it
  # is being iterated.
  stale = @connections.keys.select do |key|
    last = @last_used[key]
    last && now - last > MAX_IDLE_TIME
  end
  stale.each { |key| close_connection(key) }
end
173
+
174
+ def evict_lru
175
+ lru_key = @last_used.min_by { |_, t| t }&.first
176
+ close_connection(lru_key) if lru_key
177
+ end
178
+
179
+ # Internal error class for HTTP status retry signaling
180
+ class RetriableStatusError < StandardError
181
+ attr_reader :response
182
+
183
+ def initialize(response)
184
+ @response = response
185
+ super("Retriable HTTP status: #{response.status}")
134
186
  end
135
187
  end
136
188
 
137
189
  def attempt_with_retries(uri, headers, request_class)
138
190
  retries = 0
139
191
  begin
140
- execute_with_connection(uri, headers, request_class)
192
+ execute_and_check(uri, headers, request_class)
193
+ rescue RetriableStatusError => e
194
+ retry_status(e, retries += 1) && retry
141
195
  rescue *TRANSIENT_ERRORS => e
142
- retries += 1
143
- raise_if_exhausted(retries, e)
144
- invalidate_connection(uri)
145
- sleep(@retry_delay * retries)
146
- retry
196
+ retry_transient(e, uri, retries += 1) && retry
197
+ end
198
+ end
199
+
200
+ def retry_status(error, retries)
201
+ raise_if_exhausted(retries,
202
+ RateLimitError.new("HTTP #{error.response.status}"))
203
+ sleep(extract_retry_after(error.response) || (@retry_delay * retries))
204
+ end
205
+
206
+ def retry_transient(error, uri, retries)
207
+ raise_if_exhausted(retries, error)
208
+ invalidate_connection(uri)
209
+ sleep(@retry_delay * retries)
210
+ end
211
+
212
+ def execute_and_check(uri, headers, request_class)
213
+ response = execute_with_connection(uri, headers, request_class)
214
+ if RETRIABLE_STATUSES.include?(response.status)
215
+ raise RetriableStatusError, response
216
+ end
217
+
218
+ response
219
+ end
220
+
221
# Reads the delay requested by a response's Retry-After header.
#
# Only the delay-seconds form (a run of decimal digits) is understood.
# Anything else -- a missing header, an HTTP-date, a negative or
# non-numeric value -- yields nil. The value is read strictly as base 10:
# Kernel#Integer would treat "010" as octal and would accept "-5", and a
# negative delay makes Kernel#sleep raise.
#
# @param response [#headers] response whose headers are looked up with the
#   lowercase key "retry-after"
# @return [Integer, nil] seconds to wait, or nil when the header is unusable
def extract_retry_after(response)
  value = response.headers["retry-after"].to_s.strip
  return nil unless value.match?(/\A\d+\z/)

  value.to_i
end
149
231
 
data/lib/archaeo/page.rb CHANGED
@@ -34,6 +34,35 @@ module Archaeo
34
34
  @encoding ||= detect_encoding
35
35
  end
36
36
 
37
# Content-type predicates. Each returns true or false (never nil), treats
# a missing content type as "no match", and compares case-insensitively
# because media type names are case-insensitive.

# True when the content type mentions text/html.
def html?
  @content_type.to_s.downcase.include?("text/html")
end

# True when the content type mentions application/json.
def json?
  @content_type.to_s.downcase.include?("application/json")
end

# True when the content type starts with image/.
def image?
  @content_type.to_s.downcase.start_with?("image/")
end

# True when the content type starts with text/.
def text?
  @content_type.to_s.downcase.start_with?("text/")
end

# True when the content is none of text, JSON or HTML; this includes the
# case where no content type is known.
def binary?
  !(text? || json? || html?)
end
56
+
57
# Text of the document's <title> element with surrounding whitespace
# removed, or nil when there is no title or parsing raises.
#
# A separate flag records that parsing has happened, because `||=` would
# not cache a nil result and would re-parse the content on every call for
# pages without a title.
#
# @return [String, nil]
def title
  return @title if @title_parsed

  @title_parsed = true
  @title =
    begin
      Nokogiri::HTML(@raw_content).at_css("title")&.text&.strip
    rescue StandardError
      nil
    end
end
65
+
37
66
  private
38
67
 
39
68
  def detect_encoding
@@ -6,11 +6,25 @@ module Archaeo
6
6
  # Bundles a Page with the AssetList discovered from its HTML,
7
7
  # providing a single object for complete page archival.
8
8
  class PageBundle
9
+ include Enumerable
10
+
9
11
  attr_reader :page, :assets
10
12
 
11
13
  def initialize(page:, assets:)
12
14
  @page = page
13
15
  @assets = assets
14
16
  end
17
+
18
+ def each(&block)
19
+ assets.each(&block)
20
+ end
21
+
22
+ def size
23
+ assets.size + 1
24
+ end
25
+
26
+ def asset_count
27
+ assets.size
28
+ end
15
29
  end
16
30
  end
@@ -32,7 +32,7 @@ module Archaeo
32
32
  response = @client.get(save_url)
33
33
  check_response_errors!(response, url)
34
34
 
35
- result = process_save_response(response, start_time)
35
+ result = process_save_response(response, start_time, url)
36
36
  return result if result
37
37
  end
38
38
 
@@ -40,13 +40,13 @@ module Archaeo
40
40
  "Failed to save #{url} after #{@max_tries} attempts"
41
41
  end
42
42
 
43
- def process_save_response(response, start_time)
43
+ def process_save_response(response, start_time, url)
44
44
  archive_url = extract_archive_url(response)
45
45
  return nil unless archive_url
46
46
 
47
47
  ts = Timestamp.parse(extract_timestamp(archive_url))
48
48
  cached = ts.to_time < start_time - 2700
49
- SaveResult.new(archive_url: archive_url,
49
+ SaveResult.new(url: url, archive_url: archive_url,
50
50
  timestamp: ts, cached: cached)
51
51
  end
52
52
 
@@ -6,9 +6,10 @@ module Archaeo
6
6
  # Contains the resulting archive URL, timestamp, and whether
7
7
  # the page was already cached in the archive.
8
8
  class SaveResult
9
- attr_reader :archive_url, :timestamp
9
+ attr_reader :url, :archive_url, :timestamp
10
10
 
11
- def initialize(archive_url:, timestamp:, cached:)
11
+ def initialize(url:, archive_url:, timestamp:, cached:)
12
+ @url = url
12
13
  @archive_url = archive_url
13
14
  @timestamp = Timestamp.coerce(timestamp)
14
15
  @cached = cached
@@ -37,6 +37,34 @@ module Archaeo
37
37
  @status_code == 200
38
38
  end
39
39
 
40
# Status-class predicates. Each uses Range#cover?, which returns false
# instead of raising when @status_code is nil or cannot be compared with an
# Integer.

# True for status codes 300-399.
def redirect?
  (300..399).cover?(@status_code)
end

# True for status codes 400-499.
def client_error?
  (400..499).cover?(@status_code)
end

# True for status codes 500-599.
def server_error?
  (500..599).cover?(@status_code)
end

# True for any 4xx or 5xx status code.
def error?
  client_error? || server_error?
end
55
+
56
+ def fetch(client: HttpClient.new, identity: false)
57
+ Fetcher.new(client: client).fetch(
58
+ original_url, timestamp: @timestamp, identity: identity
59
+ )
60
+ end
61
+
62
+ def fetch_with_assets(client: HttpClient.new)
63
+ Fetcher.new(client: client).fetch_page_with_assets(
64
+ original_url, timestamp: @timestamp
65
+ )
66
+ end
67
+
40
68
  def to_a
41
69
  [@urlkey, @timestamp, @original_url, @mimetype,
42
70
  @status_code, @digest, @length]
@@ -54,6 +82,18 @@ module Archaeo
54
82
  }
55
83
  end
56
84
 
85
+ def as_json(*)
86
+ {
87
+ urlkey: @urlkey,
88
+ timestamp: @timestamp.to_s,
89
+ original_url: @original_url,
90
+ mimetype: @mimetype,
91
+ status_code: @status_code,
92
+ digest: @digest,
93
+ length: @length,
94
+ }
95
+ end
96
+
57
97
  def ==(other)
58
98
  other.is_a?(self.class) && to_a == other.to_a
59
99
  end
@@ -1,5 +1,7 @@
1
1
  # frozen_string_literal: true
2
2
 
3
+ require "date"
4
+
3
5
  module Archaeo
4
6
  # Value object representing a Wayback Machine timestamp (YYYYMMDDHHmmss).
5
7
  #
@@ -70,6 +72,26 @@ module Archaeo
70
72
  @to_time.to_i
71
73
  end
72
74
 
75
+ def to_iso8601
76
+ @to_time.iso8601
77
+ end
78
+
79
+ def to_rfc3339
80
+ @to_time.rfc3339
81
+ end
82
+
83
+ def +(seconds)
84
+ self.class.from_time(@to_time + seconds)
85
+ end
86
+
87
+ def -(other)
88
+ if other.is_a?(self.class)
89
+ @to_time - other.to_time
90
+ else
91
+ self.class.from_time(@to_time - other)
92
+ end
93
+ end
94
+
73
95
  def <=>(other)
74
96
  return nil unless other.is_a?(self.class)
75
97
 
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Archaeo
4
- VERSION = "0.2.4"
4
+ VERSION = "0.2.5"
5
5
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: archaeo
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.4
4
+ version: 0.2.5
5
5
  platform: ruby
6
6
  authors:
7
7
  - Ribose Inc.