wasapi_client 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.rspec +3 -0
- data/.rubocop.yml +1 -0
- data/.rubocop_todo.yml +27 -0
- data/Gemfile +6 -0
- data/Gemfile.lock +171 -0
- data/LICENSE +15 -0
- data/README.md +76 -0
- data/Rakefile +10 -0
- data/lib/wasapi_client/version.rb +5 -0
- data/lib/wasapi_client.rb +180 -0
- data/wasapi_client.gemspec +50 -0
- metadata +264 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: 62152f5af7e6336feaf5798316ca24ff79b482d9d232cb56c2b06da3758180fd
|
4
|
+
data.tar.gz: e4b18ec16684c28a59bba87d31d547e333374fb4f54b31e388a475b5633987c9
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 83e3974f8e4ab44afd47a0d8d1e0fc70b1852ec49e3a51f530decfce7035cc80341f923305bc806e87804cfa25c200393ab0ea118cc6ba59997a8d5d9b50da18
|
7
|
+
data.tar.gz: 68068f5f03f933662a9c28f03febe9559d2985576843c8e1cd7687bf7d73d2fba93844053b6ee89bd8f66fec2bdee2cf15ff908f187f019177b19bc01f12feff
|
data/.rspec
ADDED
data/.rubocop.yml
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
inherit_from: .rubocop_todo.yml
|
data/.rubocop_todo.yml
ADDED
@@ -0,0 +1,27 @@
|
|
1
|
+
# This configuration was generated by
|
2
|
+
# `rubocop --auto-gen-config --auto-gen-only-exclude`
|
3
|
+
# on 2025-07-07 16:45:12 UTC using RuboCop version 1.77.0.
|
4
|
+
# The point is for the user to remove these configuration records
|
5
|
+
# one by one as the offenses are removed from the code base.
|
6
|
+
# Note that changes in the inspected code, or installation of new
|
7
|
+
# versions of RuboCop, may require this file to be generated again.
|
8
|
+
|
9
|
+
# Offense count: 2
|
10
|
+
# Configuration parameters: AllowedMethods, AllowedPatterns, CountRepeatedAttributes, Max.
|
11
|
+
Metrics/AbcSize:
|
12
|
+
Exclude:
|
13
|
+
- 'lib/wasapi_client.rb'
|
14
|
+
|
15
|
+
# Offense count: 1
|
16
|
+
# Configuration parameters: CountComments, Max, CountAsOne, AllowedMethods, AllowedPatterns.
|
17
|
+
# AllowedMethods: refine
|
18
|
+
Metrics/BlockLength:
|
19
|
+
Exclude:
|
20
|
+
- '**/*.gemspec'
|
21
|
+
- 'spec/wasapi_client_spec.rb'
|
22
|
+
|
23
|
+
# Offense count: 2
|
24
|
+
# Configuration parameters: CountComments, Max, CountAsOne, AllowedMethods, AllowedPatterns.
|
25
|
+
Metrics/MethodLength:
|
26
|
+
Exclude:
|
27
|
+
- 'lib/wasapi_client.rb'
|
data/Gemfile
ADDED
data/Gemfile.lock
ADDED
@@ -0,0 +1,171 @@
|
|
1
|
+
PATH
|
2
|
+
remote: .
|
3
|
+
specs:
|
4
|
+
wasapi_client (0.1.0)
|
5
|
+
activesupport
|
6
|
+
digest
|
7
|
+
faraday
|
8
|
+
faraday-follow_redirects
|
9
|
+
faraday-retry
|
10
|
+
zeitwerk
|
11
|
+
|
12
|
+
GEM
|
13
|
+
remote: https://rubygems.org/
|
14
|
+
specs:
|
15
|
+
activesupport (8.0.2)
|
16
|
+
base64
|
17
|
+
benchmark (>= 0.3)
|
18
|
+
bigdecimal
|
19
|
+
concurrent-ruby (~> 1.0, >= 1.3.1)
|
20
|
+
connection_pool (>= 2.2.5)
|
21
|
+
drb
|
22
|
+
i18n (>= 1.6, < 2)
|
23
|
+
logger (>= 1.4.2)
|
24
|
+
minitest (>= 5.1)
|
25
|
+
securerandom (>= 0.3)
|
26
|
+
tzinfo (~> 2.0, >= 2.0.5)
|
27
|
+
uri (>= 0.13.1)
|
28
|
+
addressable (2.8.7)
|
29
|
+
public_suffix (>= 2.0.2, < 7.0)
|
30
|
+
ast (2.4.3)
|
31
|
+
base64 (0.3.0)
|
32
|
+
benchmark (0.4.1)
|
33
|
+
bigdecimal (3.2.2)
|
34
|
+
concurrent-ruby (1.3.5)
|
35
|
+
connection_pool (2.5.3)
|
36
|
+
crack (1.0.0)
|
37
|
+
bigdecimal
|
38
|
+
rexml
|
39
|
+
date (3.4.1)
|
40
|
+
debug (1.11.0)
|
41
|
+
irb (~> 1.10)
|
42
|
+
reline (>= 0.3.8)
|
43
|
+
diff-lcs (1.6.2)
|
44
|
+
digest (3.2.0)
|
45
|
+
docile (1.4.1)
|
46
|
+
drb (2.2.3)
|
47
|
+
erb (5.0.1)
|
48
|
+
faraday (2.13.2)
|
49
|
+
faraday-net_http (>= 2.0, < 3.5)
|
50
|
+
json
|
51
|
+
logger
|
52
|
+
faraday-follow_redirects (0.3.0)
|
53
|
+
faraday (>= 1, < 3)
|
54
|
+
faraday-net_http (3.4.1)
|
55
|
+
net-http (>= 0.5.0)
|
56
|
+
faraday-retry (2.3.2)
|
57
|
+
faraday (~> 2.0)
|
58
|
+
hashdiff (1.2.0)
|
59
|
+
i18n (1.14.7)
|
60
|
+
concurrent-ruby (~> 1.0)
|
61
|
+
io-console (0.8.0)
|
62
|
+
irb (1.15.2)
|
63
|
+
pp (>= 0.6.0)
|
64
|
+
rdoc (>= 4.0.0)
|
65
|
+
reline (>= 0.4.2)
|
66
|
+
json (2.12.2)
|
67
|
+
language_server-protocol (3.17.0.5)
|
68
|
+
lint_roller (1.1.0)
|
69
|
+
logger (1.7.0)
|
70
|
+
minitest (5.25.5)
|
71
|
+
net-http (0.6.0)
|
72
|
+
uri
|
73
|
+
parallel (1.27.0)
|
74
|
+
parser (3.3.8.0)
|
75
|
+
ast (~> 2.4.1)
|
76
|
+
racc
|
77
|
+
pp (0.6.2)
|
78
|
+
prettyprint
|
79
|
+
prettyprint (0.2.0)
|
80
|
+
prism (1.4.0)
|
81
|
+
psych (5.2.6)
|
82
|
+
date
|
83
|
+
stringio
|
84
|
+
public_suffix (6.0.2)
|
85
|
+
racc (1.8.1)
|
86
|
+
rainbow (3.1.1)
|
87
|
+
rake (13.3.0)
|
88
|
+
rdoc (6.14.2)
|
89
|
+
erb
|
90
|
+
psych (>= 4.0.0)
|
91
|
+
regexp_parser (2.10.0)
|
92
|
+
reline (0.6.1)
|
93
|
+
io-console (~> 0.5)
|
94
|
+
rexml (3.4.1)
|
95
|
+
rspec (3.13.1)
|
96
|
+
rspec-core (~> 3.13.0)
|
97
|
+
rspec-expectations (~> 3.13.0)
|
98
|
+
rspec-mocks (~> 3.13.0)
|
99
|
+
rspec-core (3.13.5)
|
100
|
+
rspec-support (~> 3.13.0)
|
101
|
+
rspec-expectations (3.13.5)
|
102
|
+
diff-lcs (>= 1.2.0, < 2.0)
|
103
|
+
rspec-support (~> 3.13.0)
|
104
|
+
rspec-mocks (3.13.5)
|
105
|
+
diff-lcs (>= 1.2.0, < 2.0)
|
106
|
+
rspec-support (~> 3.13.0)
|
107
|
+
rspec-support (3.13.4)
|
108
|
+
rubocop (1.78.0)
|
109
|
+
json (~> 2.3)
|
110
|
+
language_server-protocol (~> 3.17.0.2)
|
111
|
+
lint_roller (~> 1.1.0)
|
112
|
+
parallel (~> 1.10)
|
113
|
+
parser (>= 3.3.0.2)
|
114
|
+
rainbow (>= 2.2.2, < 4.0)
|
115
|
+
regexp_parser (>= 2.9.3, < 3.0)
|
116
|
+
rubocop-ast (>= 1.45.1, < 2.0)
|
117
|
+
ruby-progressbar (~> 1.7)
|
118
|
+
unicode-display_width (>= 2.4.0, < 4.0)
|
119
|
+
rubocop-ast (1.45.1)
|
120
|
+
parser (>= 3.3.7.2)
|
121
|
+
prism (~> 1.4)
|
122
|
+
rubocop-performance (1.25.0)
|
123
|
+
lint_roller (~> 1.1)
|
124
|
+
rubocop (>= 1.75.0, < 2.0)
|
125
|
+
rubocop-ast (>= 1.38.0, < 2.0)
|
126
|
+
rubocop-rspec (3.6.0)
|
127
|
+
lint_roller (~> 1.1)
|
128
|
+
rubocop (~> 1.72, >= 1.72.1)
|
129
|
+
rubocop-rspec_rails (2.31.0)
|
130
|
+
lint_roller (~> 1.1)
|
131
|
+
rubocop (~> 1.72, >= 1.72.1)
|
132
|
+
rubocop-rspec (~> 3.5)
|
133
|
+
ruby-progressbar (1.13.0)
|
134
|
+
securerandom (0.4.1)
|
135
|
+
simplecov (0.22.0)
|
136
|
+
docile (~> 1.1)
|
137
|
+
simplecov-html (~> 0.11)
|
138
|
+
simplecov_json_formatter (~> 0.1)
|
139
|
+
simplecov-html (0.13.1)
|
140
|
+
simplecov_json_formatter (0.1.4)
|
141
|
+
stringio (3.1.7)
|
142
|
+
tzinfo (2.0.6)
|
143
|
+
concurrent-ruby (~> 1.0)
|
144
|
+
unicode-display_width (3.1.4)
|
145
|
+
unicode-emoji (~> 4.0, >= 4.0.4)
|
146
|
+
unicode-emoji (4.0.4)
|
147
|
+
uri (1.0.3)
|
148
|
+
webmock (3.25.1)
|
149
|
+
addressable (>= 2.8.0)
|
150
|
+
crack (>= 0.3.2)
|
151
|
+
hashdiff (>= 0.4.0, < 2.0.0)
|
152
|
+
zeitwerk (2.7.3)
|
153
|
+
|
154
|
+
PLATFORMS
|
155
|
+
arm64-darwin-23
|
156
|
+
ruby
|
157
|
+
|
158
|
+
DEPENDENCIES
|
159
|
+
debug
|
160
|
+
rake
|
161
|
+
rspec
|
162
|
+
rubocop
|
163
|
+
rubocop-performance
|
164
|
+
rubocop-rspec
|
165
|
+
rubocop-rspec_rails
|
166
|
+
simplecov
|
167
|
+
wasapi_client!
|
168
|
+
webmock
|
169
|
+
|
170
|
+
BUNDLED WITH
|
171
|
+
2.6.9
|
data/LICENSE
ADDED
@@ -0,0 +1,15 @@
|
|
1
|
+
|
2
|
+
Copyright (c) 2025 by The Board of Trustees of the Leland Stanford
|
3
|
+
Junior University. All rights reserved.
|
4
|
+
|
5
|
+
Licensed under the Apache License, Version 2.0 (the "License"); you
|
6
|
+
may not use this file except in compliance with the License. You
|
7
|
+
may obtain a copy of the License at
|
8
|
+
|
9
|
+
http://www.apache.org/licenses/LICENSE-2.0
|
10
|
+
|
11
|
+
Unless required by applicable law or agreed to in writing, software
|
12
|
+
distributed under the License is distributed on an "AS IS" BASIS,
|
13
|
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
14
|
+
implied. See the License for the specific language governing
|
15
|
+
permissions and limitations under the License.
|
data/README.md
ADDED
@@ -0,0 +1,76 @@
|
|
1
|
+
[CircleCI build status](https://dl.circleci.com/status-badge/redirect/gh/sul-dlss/wasapi_client/tree/main)
|
2
|
+
[Codecov coverage report](https://codecov.io/gh/sul-dlss/wasapi_client)
|
3
|
+
|
4
|
+
# WasapiClient
|
5
|
+
|
6
|
+
WasapiClient is a Ruby gem that acts as a client to Internet Archive's WASAPI APIs. It gets information about WARCs and downloads them. It is a successor to wasapi-downloader but is not provider-generic and is intended for use with Archive-It collections.
|
7
|
+
|
8
|
+
## Installation
|
9
|
+
|
10
|
+
Install the gem and add it to the application's Gemfile by executing:
|
11
|
+
|
12
|
+
```
|
13
|
+
bundle add wasapi_client
|
14
|
+
```
|
15
|
+
|
16
|
+
If bundler is not being used to manage dependencies, install the gem by executing:
|
17
|
+
|
18
|
+
```
|
19
|
+
gem install wasapi_client
|
20
|
+
```
|
21
|
+
|
22
|
+
## Usage
|
23
|
+
|
24
|
+
Each Archive-It account has its own username and password for downloading WARCs. An account includes many collections, which each have a numeric id. Since we have many accounts, when making requests we need to provide the username and password for the account to which the Archive-It collection belongs.
|
25
|
+
|
26
|
+
```ruby
|
27
|
+
require 'wasapi_client'
|
28
|
+
|
29
|
+
# NOTE: The settings below live in the consumer, not in the gem.
|
30
|
+
client = WasapiClient.new(username: 'username', password: 'password')
|
31
|
+
client.fetch_warcs(
|
32
|
+
output_dir: 'path/to/save/warcs',
|
33
|
+
collection: '12345',
|
34
|
+
crawl_start_after: '2023-01-01',
|
35
|
+
crawl_start_before: '2023-01-31'
|
36
|
+
)
|
37
|
+
|
38
|
+
# Get filenames for a collection (used when auditing)
|
39
|
+
client.filenames(
|
40
|
+
collection: '12345',
|
41
|
+
crawl_start_after: '2025-01-01',
|
42
|
+
crawl_start_before: '2025-06-30'
|
43
|
+
)
|
44
|
+
|
45
|
+
# Fetch a single WARC by URL
|
46
|
+
client.fetch_file(
|
47
|
+
file: 'https://warcs.archive-it.org/webdatafile/ARCHIVEIT-123-example.warc.gz',
|
48
|
+
output_dir: 'path/to/save/warcs'
|
49
|
+
)
|
50
|
+
|
51
|
+
# Fetch a single WARC by filename (used when auditing/remediating)
|
52
|
+
client.fetch_file(
|
53
|
+
file: 'ARCHIVEIT-123-example.warc.gz',
|
54
|
+
output_dir: 'path/to/save/warcs',
|
55
|
+
base_url: 'https://other-archive-it-location.org'
|
56
|
+
)
|
57
|
+
|
58
|
+
# Get the URLs for WARCs meeting collection and crawl time criteria
|
59
|
+
client.get_locations(
|
60
|
+
collection: '12345',
|
61
|
+
crawl_start_after: '2025-01-01',
|
62
|
+
crawl_start_before: '2025-06-30'
|
63
|
+
)
|
64
|
+
```
|
65
|
+
|
66
|
+
## TODO
|
67
|
+
* Add `store-time-after` / `store-time-before` params to support usage with backfill downloads
|
68
|
+
|
69
|
+
|
70
|
+
## Development
|
71
|
+
|
72
|
+
After checking out the repo, run `bin/setup` to install dependencies. Then, run `rake spec` to run the tests. You can also run `bin/console` for an interactive prompt that will allow you to experiment.
|
73
|
+
|
74
|
+
To install this gem onto your local machine, run `bundle exec rake install`.
|
75
|
+
|
76
|
+
To release a new version, update the version number in `version.rb`, and then run `bundle exec rake release`, which will create a git tag for the version, push git commits and the created tag, and push the `.gem` file to [rubygems.org](https://rubygems.org).
|
data/Rakefile
ADDED
data/lib/wasapi_client/version.rb
ADDED
data/lib/wasapi_client.rb
ADDED
@@ -0,0 +1,180 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'active_support'
|
4
|
+
require 'active_support/core_ext/hash/indifferent_access'
|
5
|
+
require 'faraday'
|
6
|
+
require 'faraday/follow_redirects'
|
7
|
+
require 'faraday/retry'
|
8
|
+
require 'zeitwerk'
|
9
|
+
|
10
|
+
# Load the gem's internal dependencies: use Zeitwerk instead of needing to manually require classes
|
11
|
+
Zeitwerk::Loader.for_gem.setup
|
12
|
+
|
13
|
+
# Client for interacting with the Archive-It WASAPI APIs
|
14
|
+
class WasapiClient
  # Maximum number of download attempts per WARC before fetch_warcs gives up
  NUM_RETRIES = 5

  attr_accessor :username, :password, :base_url

  # @param username [String] an Archive-It account username
  # @param password [String] an Archive-It account password
  # @param base_url [String, nil] the base URL for the WASAPI API; default_url is used when nil
  def initialize(username:, password:, base_url: nil)
    @username = username
    @password = password
    @base_url = base_url
  end

  # @return [String] the WASAPI base URL queried when no base_url was configured
  def default_url
    'https://partner.archive-it.org'
  end

  # @return [String] the base URL that bare filenames are resolved against in fetch_file
  def default_storage_url
    'https://warcs.archive-it.org/webdatafile/'
  end

  # Set up an authenticated connection for the account: HTTP basic auth with the
  # account credentials, up to 3 retries with exponential backoff, redirects followed.
  # @param url [String] the URL the connection is bound to
  # @return [Faraday::Connection]
  def connection(url)
    Faraday.new(url:) do |conn|
      conn.request :authorization, :basic, username, password
      conn.request :retry, max: 3, interval: 0.05, backoff_factor: 2
      conn.response :follow_redirects
    end
  end

  # Send a GET request for the URLs for WARCs and download files. Response will be paginated.
  # Files already present in output_dir with a matching md5 are not downloaded again.
  # @param collection [String] the collection ID to fetch WARC files for
  # @param output_dir [String] the directory to save the WARC files to (created if missing)
  # @param crawl_start_after [String] the start date for the crawl in RFC3339 format
  # @param crawl_start_before [String] the end date for the crawl in RFC3339 format
  # @return [Array<Hash>, nil] the locations that were processed, or nil if nothing matched
  # @raise [RuntimeError] if a location has no md5, or a file is still invalid after NUM_RETRIES downloads
  def fetch_warcs(collection:, output_dir:, crawl_start_after: nil, crawl_start_before: nil)
    locations = get_locations(collection:, crawl_start_after:, crawl_start_before:)
    return nil if locations.empty?

    FileUtils.mkdir_p(output_dir)
    locations.each { |location| fetch_until_valid(location:, output_dir:) }
  end

  # Send a GET request for the URLs for WARCs. Response will be paginated.
  # @param collection [String] the Archive-It collection ID to fetch WARC files for
  # @param crawl_start_after [String] the start date for the crawl in RFC3339 format
  # @param crawl_start_before [String] the end date for the crawl in RFC3339 format
  # @return [Array<Hash>] hashes containing WARC file location (URL) and md5 checksums from the parsed JSON response
  def get_locations(collection:, crawl_start_after: nil, crawl_start_before: nil)
    # Drop filters that were not supplied so they are not sent as valueless query params
    params = {
      collection:,
      'crawl-start-after': crawl_start_after,
      'crawl-start-before': crawl_start_before
    }.compact

    response = query(params:)
    extract_files(response:, params:)
  end

  # Fetch a specific file from the WASAPI storage location.
  # @param file [String] the URL or filename for the file
  # @param output_dir [String] the directory to save the file to (created if missing)
  # @param base_url [String] the storage base URL a bare filename is resolved against
  # @return [String, nil] the path to the downloaded file, or nil if the request was not successful
  def fetch_file(file:, output_dir:, base_url: default_storage_url)
    # Only an explicit http(s) scheme marks a URL; a filename that merely begins with "http" is still a filename
    file = URI.join(base_url, file).to_s unless file.match?(%r{\Ahttps?://}i)

    download(url: file, output_dir:)
  end

  # Send a GET request for WARCs filenames.
  # @param collection [String] the Archive-It collection ID
  # @param crawl_start_after [String] the start date for the crawl in RFC3339 format
  # @param crawl_start_before [String] the end date for the crawl in RFC3339 format
  # @return [Array<String>] WARC filenames
  def filenames(collection:, crawl_start_after: nil, crawl_start_before: nil)
    locations = get_locations(collection:, crawl_start_after:, crawl_start_before:)
    locations.map { |location| filename_for(location[:url]) }
  end

  private

  # Download one WARC until its md5 matches, giving up after NUM_RETRIES downloads.
  # @param location [Hash] a hash with :url and :md5 as produced by extract_files
  # @param output_dir [String] the directory the file is saved to
  # @raise [RuntimeError] if the file never validates
  def fetch_until_valid(location:, output_dir:)
    filepath = File.join(output_dir, filename_for(location[:url]))
    attempts = 0
    # The checksum is evaluated first, so a file that is already valid is never downloaded
    # and the result of the final attempt is always verified before giving up.
    until (valid = checksum_valid?(filepath:, expected_md5: location[:md5])) || attempts >= NUM_RETRIES
      fetch_file(file: location[:url], output_dir:)
      attempts += 1
    end

    raise "Failed to fetch a valid file for #{location[:url]} after #{NUM_RETRIES} retries" unless valid
  end

  # The local filename for a WARC URL: the basename of the URL path, so any query
  # string is ignored. Shared by every method that maps a URL to a file on disk.
  # @param url [String] the URL of a WARC file
  # @return [String]
  def filename_for(url)
    File.basename(URI.parse(url).path)
  end

  # Extract the WARC file locations and checksums from the response while paginating through results
  # @param response [Hash, nil] the parsed JSON response from the WASAPI API
  # @param params [Hash] the parameters used for the request, to support pagination
  # @return [Array<Hash>] hashes containing WARC file location (URL) and md5 checksum
  def extract_files(response:, params:)
    locations = []
    page = response
    while page
      # A page without files contributes nothing but must not discard earlier pages
      locations.concat(Array(page['files']).map { |file| location_for(file) })
      next_page = page['next']
      page = next_page && query(params:, next_page:)
    end

    locations
  end

  # Reduce one entry of the response's "files" list to its download URL and md5.
  # @param file [Hash] a file entry from the parsed JSON response
  # @return [Hash] a hash with :url and :md5
  def location_for(file)
    # use the first (primary) location for each file. The second is a backup which may not be complete when accessed.
    { url: Array(file['locations']).first, md5: file.dig('checksums', 'md5') }
  end

  # Send a GET request for WARC files matching the query params
  # @param params [Hash] the parameters for the request, including:
  #   - collection: the collection ID to fetch WARC files for
  #   - crawl-start-after: the start date for the crawl in RFC3339 format
  #   - crawl-start-before: the end date for the crawl in RFC3339 format
  # @param base_url [String] the base URL for the WASAPI API; the constructor's base_url, else default_url
  # @param next_page [String, nil] the URL for the next page of results, if available
  # @return [Hash] parsed JSON response (empty when the response has no body)
  # @raise [RuntimeError] if the response status is not successful
  def query(params:, base_url: @base_url || default_url, next_page: nil)
    # If a next page is provided, use it to fetch the next set of results
    response = if next_page
                 connection(next_page).get
               else
                 connection(base_url).get('/wasapi/v1/webdata', params)
               end

    raise "Failed to get list of WARCS: #{response.status}: #{response.body}" unless response.success?

    return {} if response.body.to_s.empty?

    JSON.parse(response.body).with_indifferent_access
  end

  # Download a file and save it to the specified output directory
  # @param url [String] the URL of the file to download
  # @param output_dir [String] the directory to save the downloaded file to (created if missing)
  # @return [String, nil] the path of the saved file, or nil if the request was not successful
  def download(url:, output_dir:)
    FileUtils.mkdir_p(output_dir)
    filepath = File.join(output_dir, filename_for(url))
    response = File.open(filepath, 'wb') do |file|
      # Use streaming to write the file in chunks. WARCs can be large.
      connection(url).get do |req|
        req.options.on_data = proc { |chunk, _received_bytes| file.write(chunk) }
      end
    end
    return filepath if response.success?

    # Whatever was streamed for an unsuccessful response is an error body, not a WARC
    FileUtils.rm_f(filepath)
    nil
  end

  # Calculate the MD5 checksum of the downloaded file and verify it against the expected checksum
  # @param filepath [String] the path of the file to verify
  # @param expected_md5 [String] the hex md5 digest the file should have (compared case-insensitively)
  # @return [Boolean] false if the file does not exist or its digest differs
  # @raise [RuntimeError] if no expected checksum is given
  def checksum_valid?(filepath:, expected_md5:)
    raise "No md5 checksum provided for #{File.basename(filepath)}" unless expected_md5
    return false unless File.exist?(filepath)

    Digest::MD5.file(filepath).hexdigest.casecmp?(expected_md5.to_s)
  end
end
|
data/wasapi_client.gemspec
ADDED
@@ -0,0 +1,50 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
lib = File.expand_path('lib', __dir__)
|
4
|
+
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
|
5
|
+
require 'wasapi_client/version'
|
6
|
+
|
7
|
+
# Gem specification for wasapi_client: identity, packaged files and dependencies.
Gem::Specification.new do |spec|
  spec.name = 'wasapi_client'
  spec.version = WasapiClient::VERSION
  spec.authors = ['Laura Wrubel']
  spec.email = ['lwrubel@stanford.edu']

  spec.summary = 'Interface for interacting with the Archive-It WASAPI API.'
  spec.description = 'This provides API interaction with the Archive-It WASAPI API'
  spec.homepage = 'https://github.com/sul-dlss/wasapi_client'
  # SPDX identifier matching the Apache License 2.0 notice shipped in LICENSE;
  # without it the built gem reports an empty license list.
  spec.license = 'Apache-2.0'
  spec.required_ruby_version = '>= 3.4.0'

  spec.metadata['homepage_uri'] = spec.homepage
  spec.metadata['source_code_uri'] = 'https://github.com/sul-dlss/wasapi_client'
  spec.metadata['changelog_uri'] = 'https://github.com/sul-dlss/wasapi_client/releases'
  spec.metadata['rubygems_mfa_required'] = 'true'

  # Specify which files should be added to the gem when it is released.
  # The `git ls-files -z` loads the files in the RubyGem that have been added into git.
  spec.files = Dir.chdir(__dir__) do
    `git ls-files -z`.split("\x0").reject do |f|
      # Leave out executables' sources, tests and CI configuration
      (f == __FILE__) || f.match(%r{\A(?:(?:bin|test|spec|features)/|\.(?:git|travis|circleci)|appveyor)})
    end
  end
  spec.bindir = 'exe'
  spec.executables = spec.files.grep(%r{\Aexe/}) { |f| File.basename(f) }
  spec.require_paths = ['lib']

  # Runtime dependencies
  spec.add_dependency 'activesupport'
  spec.add_dependency 'digest'
  spec.add_dependency 'faraday'
  spec.add_dependency 'faraday-follow_redirects'
  spec.add_dependency 'faraday-retry'
  spec.add_dependency 'zeitwerk'

  # Development and test tooling
  spec.add_development_dependency 'debug'
  spec.add_development_dependency 'rake'
  spec.add_development_dependency 'rspec'
  spec.add_development_dependency 'rubocop'
  spec.add_development_dependency 'rubocop-performance'
  spec.add_development_dependency 'rubocop-rspec'
  spec.add_development_dependency 'rubocop-rspec_rails'
  spec.add_development_dependency 'simplecov'
  spec.add_development_dependency 'webmock'
end
|
metadata
ADDED
@@ -0,0 +1,264 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: wasapi_client
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.1.0
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Laura Wrubel
|
8
|
+
bindir: exe
|
9
|
+
cert_chain: []
|
10
|
+
date: 2025-07-17 00:00:00.000000000 Z
|
11
|
+
dependencies:
|
12
|
+
- !ruby/object:Gem::Dependency
|
13
|
+
name: activesupport
|
14
|
+
requirement: !ruby/object:Gem::Requirement
|
15
|
+
requirements:
|
16
|
+
- - ">="
|
17
|
+
- !ruby/object:Gem::Version
|
18
|
+
version: '0'
|
19
|
+
type: :runtime
|
20
|
+
prerelease: false
|
21
|
+
version_requirements: !ruby/object:Gem::Requirement
|
22
|
+
requirements:
|
23
|
+
- - ">="
|
24
|
+
- !ruby/object:Gem::Version
|
25
|
+
version: '0'
|
26
|
+
- !ruby/object:Gem::Dependency
|
27
|
+
name: digest
|
28
|
+
requirement: !ruby/object:Gem::Requirement
|
29
|
+
requirements:
|
30
|
+
- - ">="
|
31
|
+
- !ruby/object:Gem::Version
|
32
|
+
version: '0'
|
33
|
+
type: :runtime
|
34
|
+
prerelease: false
|
35
|
+
version_requirements: !ruby/object:Gem::Requirement
|
36
|
+
requirements:
|
37
|
+
- - ">="
|
38
|
+
- !ruby/object:Gem::Version
|
39
|
+
version: '0'
|
40
|
+
- !ruby/object:Gem::Dependency
|
41
|
+
name: faraday
|
42
|
+
requirement: !ruby/object:Gem::Requirement
|
43
|
+
requirements:
|
44
|
+
- - ">="
|
45
|
+
- !ruby/object:Gem::Version
|
46
|
+
version: '0'
|
47
|
+
type: :runtime
|
48
|
+
prerelease: false
|
49
|
+
version_requirements: !ruby/object:Gem::Requirement
|
50
|
+
requirements:
|
51
|
+
- - ">="
|
52
|
+
- !ruby/object:Gem::Version
|
53
|
+
version: '0'
|
54
|
+
- !ruby/object:Gem::Dependency
|
55
|
+
name: faraday-follow_redirects
|
56
|
+
requirement: !ruby/object:Gem::Requirement
|
57
|
+
requirements:
|
58
|
+
- - ">="
|
59
|
+
- !ruby/object:Gem::Version
|
60
|
+
version: '0'
|
61
|
+
type: :runtime
|
62
|
+
prerelease: false
|
63
|
+
version_requirements: !ruby/object:Gem::Requirement
|
64
|
+
requirements:
|
65
|
+
- - ">="
|
66
|
+
- !ruby/object:Gem::Version
|
67
|
+
version: '0'
|
68
|
+
- !ruby/object:Gem::Dependency
|
69
|
+
name: faraday-retry
|
70
|
+
requirement: !ruby/object:Gem::Requirement
|
71
|
+
requirements:
|
72
|
+
- - ">="
|
73
|
+
- !ruby/object:Gem::Version
|
74
|
+
version: '0'
|
75
|
+
type: :runtime
|
76
|
+
prerelease: false
|
77
|
+
version_requirements: !ruby/object:Gem::Requirement
|
78
|
+
requirements:
|
79
|
+
- - ">="
|
80
|
+
- !ruby/object:Gem::Version
|
81
|
+
version: '0'
|
82
|
+
- !ruby/object:Gem::Dependency
|
83
|
+
name: zeitwerk
|
84
|
+
requirement: !ruby/object:Gem::Requirement
|
85
|
+
requirements:
|
86
|
+
- - ">="
|
87
|
+
- !ruby/object:Gem::Version
|
88
|
+
version: '0'
|
89
|
+
type: :runtime
|
90
|
+
prerelease: false
|
91
|
+
version_requirements: !ruby/object:Gem::Requirement
|
92
|
+
requirements:
|
93
|
+
- - ">="
|
94
|
+
- !ruby/object:Gem::Version
|
95
|
+
version: '0'
|
96
|
+
- !ruby/object:Gem::Dependency
|
97
|
+
name: debug
|
98
|
+
requirement: !ruby/object:Gem::Requirement
|
99
|
+
requirements:
|
100
|
+
- - ">="
|
101
|
+
- !ruby/object:Gem::Version
|
102
|
+
version: '0'
|
103
|
+
type: :development
|
104
|
+
prerelease: false
|
105
|
+
version_requirements: !ruby/object:Gem::Requirement
|
106
|
+
requirements:
|
107
|
+
- - ">="
|
108
|
+
- !ruby/object:Gem::Version
|
109
|
+
version: '0'
|
110
|
+
- !ruby/object:Gem::Dependency
|
111
|
+
name: rake
|
112
|
+
requirement: !ruby/object:Gem::Requirement
|
113
|
+
requirements:
|
114
|
+
- - ">="
|
115
|
+
- !ruby/object:Gem::Version
|
116
|
+
version: '0'
|
117
|
+
type: :development
|
118
|
+
prerelease: false
|
119
|
+
version_requirements: !ruby/object:Gem::Requirement
|
120
|
+
requirements:
|
121
|
+
- - ">="
|
122
|
+
- !ruby/object:Gem::Version
|
123
|
+
version: '0'
|
124
|
+
- !ruby/object:Gem::Dependency
|
125
|
+
name: rspec
|
126
|
+
requirement: !ruby/object:Gem::Requirement
|
127
|
+
requirements:
|
128
|
+
- - ">="
|
129
|
+
- !ruby/object:Gem::Version
|
130
|
+
version: '0'
|
131
|
+
type: :development
|
132
|
+
prerelease: false
|
133
|
+
version_requirements: !ruby/object:Gem::Requirement
|
134
|
+
requirements:
|
135
|
+
- - ">="
|
136
|
+
- !ruby/object:Gem::Version
|
137
|
+
version: '0'
|
138
|
+
- !ruby/object:Gem::Dependency
|
139
|
+
name: rubocop
|
140
|
+
requirement: !ruby/object:Gem::Requirement
|
141
|
+
requirements:
|
142
|
+
- - ">="
|
143
|
+
- !ruby/object:Gem::Version
|
144
|
+
version: '0'
|
145
|
+
type: :development
|
146
|
+
prerelease: false
|
147
|
+
version_requirements: !ruby/object:Gem::Requirement
|
148
|
+
requirements:
|
149
|
+
- - ">="
|
150
|
+
- !ruby/object:Gem::Version
|
151
|
+
version: '0'
|
152
|
+
- !ruby/object:Gem::Dependency
|
153
|
+
name: rubocop-performance
|
154
|
+
requirement: !ruby/object:Gem::Requirement
|
155
|
+
requirements:
|
156
|
+
- - ">="
|
157
|
+
- !ruby/object:Gem::Version
|
158
|
+
version: '0'
|
159
|
+
type: :development
|
160
|
+
prerelease: false
|
161
|
+
version_requirements: !ruby/object:Gem::Requirement
|
162
|
+
requirements:
|
163
|
+
- - ">="
|
164
|
+
- !ruby/object:Gem::Version
|
165
|
+
version: '0'
|
166
|
+
- !ruby/object:Gem::Dependency
|
167
|
+
name: rubocop-rspec
|
168
|
+
requirement: !ruby/object:Gem::Requirement
|
169
|
+
requirements:
|
170
|
+
- - ">="
|
171
|
+
- !ruby/object:Gem::Version
|
172
|
+
version: '0'
|
173
|
+
type: :development
|
174
|
+
prerelease: false
|
175
|
+
version_requirements: !ruby/object:Gem::Requirement
|
176
|
+
requirements:
|
177
|
+
- - ">="
|
178
|
+
- !ruby/object:Gem::Version
|
179
|
+
version: '0'
|
180
|
+
- !ruby/object:Gem::Dependency
|
181
|
+
name: rubocop-rspec_rails
|
182
|
+
requirement: !ruby/object:Gem::Requirement
|
183
|
+
requirements:
|
184
|
+
- - ">="
|
185
|
+
- !ruby/object:Gem::Version
|
186
|
+
version: '0'
|
187
|
+
type: :development
|
188
|
+
prerelease: false
|
189
|
+
version_requirements: !ruby/object:Gem::Requirement
|
190
|
+
requirements:
|
191
|
+
- - ">="
|
192
|
+
- !ruby/object:Gem::Version
|
193
|
+
version: '0'
|
194
|
+
- !ruby/object:Gem::Dependency
|
195
|
+
name: simplecov
|
196
|
+
requirement: !ruby/object:Gem::Requirement
|
197
|
+
requirements:
|
198
|
+
- - ">="
|
199
|
+
- !ruby/object:Gem::Version
|
200
|
+
version: '0'
|
201
|
+
type: :development
|
202
|
+
prerelease: false
|
203
|
+
version_requirements: !ruby/object:Gem::Requirement
|
204
|
+
requirements:
|
205
|
+
- - ">="
|
206
|
+
- !ruby/object:Gem::Version
|
207
|
+
version: '0'
|
208
|
+
- !ruby/object:Gem::Dependency
|
209
|
+
name: webmock
|
210
|
+
requirement: !ruby/object:Gem::Requirement
|
211
|
+
requirements:
|
212
|
+
- - ">="
|
213
|
+
- !ruby/object:Gem::Version
|
214
|
+
version: '0'
|
215
|
+
type: :development
|
216
|
+
prerelease: false
|
217
|
+
version_requirements: !ruby/object:Gem::Requirement
|
218
|
+
requirements:
|
219
|
+
- - ">="
|
220
|
+
- !ruby/object:Gem::Version
|
221
|
+
version: '0'
|
222
|
+
description: This provides API interaction with the Archive-It WASAPI API
|
223
|
+
email:
|
224
|
+
- lwrubel@stanford.edu
|
225
|
+
executables: []
|
226
|
+
extensions: []
|
227
|
+
extra_rdoc_files: []
|
228
|
+
files:
|
229
|
+
- ".rspec"
|
230
|
+
- ".rubocop.yml"
|
231
|
+
- ".rubocop_todo.yml"
|
232
|
+
- Gemfile
|
233
|
+
- Gemfile.lock
|
234
|
+
- LICENSE
|
235
|
+
- README.md
|
236
|
+
- Rakefile
|
237
|
+
- lib/wasapi_client.rb
|
238
|
+
- lib/wasapi_client/version.rb
|
239
|
+
- wasapi_client.gemspec
|
240
|
+
homepage: https://github.com/sul-dlss/wasapi_client
|
241
|
+
licenses: []
|
242
|
+
metadata:
|
243
|
+
homepage_uri: https://github.com/sul-dlss/wasapi_client
|
244
|
+
source_code_uri: https://github.com/sul-dlss/wasapi_client
|
245
|
+
changelog_uri: https://github.com/sul-dlss/wasapi_client/releases
|
246
|
+
rubygems_mfa_required: 'true'
|
247
|
+
rdoc_options: []
|
248
|
+
require_paths:
|
249
|
+
- lib
|
250
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
251
|
+
requirements:
|
252
|
+
- - ">="
|
253
|
+
- !ruby/object:Gem::Version
|
254
|
+
version: 3.4.0
|
255
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
256
|
+
requirements:
|
257
|
+
- - ">="
|
258
|
+
- !ruby/object:Gem::Version
|
259
|
+
version: '0'
|
260
|
+
requirements: []
|
261
|
+
rubygems_version: 3.6.2
|
262
|
+
specification_version: 4
|
263
|
+
summary: Interface for interacting with the Archive-It WASAPI API.
|
264
|
+
test_files: []
|