s3-querier 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +203 -0
- package/docs/s3-querier.md +196 -0
- package/package.json +71 -0
- package/src/duck-db/index.js +57 -0
- package/src/plugins/avro/avro-plugin.js +64 -0
- package/src/plugins/query-finalizer/query-finalizer.js +41 -0
- package/src/plugins/query-parser/query-parser.js +33 -0
- package/src/s3/auth/ibm-iam-client.js +21 -0
- package/src/s3/auth/ibm-iam-token-manager.js +40 -0
- package/src/s3/s3.js +427 -0
- package/src/s3-querier.js +107 -0
- package/src/utils/bigint-replacer.js +13 -0
- package/src/utils/date-regex/date-regex.js +52 -0
- package/src/utils/file-path-builder/file-path-builder.js +55 -0
- package/src/utils/file-settings/file-settings.js +65 -0
- package/src/utils/logger.js +3 -0
- package/src/utils/path-parser/path-parser-grammar.js +39 -0
- package/src/utils/path-parser/path-parser.js +58 -0
- package/src/utils/sql-parser/sql-parser.js +31 -0
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
// A cached token is treated as stale this many milliseconds before its reported expiry.
const TOKEN_REFRESH_BUFFER_MS = 5 * 60 * 1000;
// Default token endpoint used when the caller does not pass `tokenUrl`.
const IBM_IAM_TOKEN_URL = 'https://iam.cloud.ibm.com/identity/token';

/**
 * Exchanges an IBM Cloud API key for an IAM access token and caches the token
 * until shortly before it expires.
 */
export class IbmIamTokenManager {
  /**
   * @param {string} apiKey API key sent as the `apikey` form field.
   * @param {object} [options]
   * @param {string} [options.tokenUrl] Token endpoint; defaults to IBM_IAM_TOKEN_URL.
   * @param {Function} [options.fetchFn] fetch-compatible function; defaults to `globalThis.fetch`.
   */
  constructor(apiKey, { tokenUrl = IBM_IAM_TOKEN_URL, fetchFn = globalThis.fetch } = {}) {
    this.apiKey = apiKey;
    this.tokenUrl = tokenUrl;
    this.fetchFn = fetchFn;
    this.token = null;
    this.expiry = 0;
    // Promise of the token request currently in flight, if any.
    this.pendingRefresh = null;
  }

  /**
   * Returns the cached token while it is still outside the refresh buffer,
   * otherwise requests a new one.
   *
   * @returns {Promise<string>} A Promise that resolves to the access token
   */
  getToken() {
    if (this.token && Date.now() < this.expiry - TOKEN_REFRESH_BUFFER_MS) {
      return Promise.resolve(this.token);
    }
    return this.refresh();
  }

  /**
   * Requests a new token and stores it together with its expiry time.
   * Calls made while a request is already in flight share that request, so a
   * burst of callers results in a single round trip.
   *
   * @returns {Promise<string>} A Promise that resolves to the new access token
   * @throws {Error} When the endpoint answers with a non-OK status or without an access token
   */
  refresh() {
    if (!this.pendingRefresh) {
      this.pendingRefresh = this.requestToken().finally(() => {
        this.pendingRefresh = null;
      });
    }
    return this.pendingRefresh;
  }

  /**
   * Performs the token request and updates `token` / `expiry` on success.
   * The previously cached token is left untouched when the request fails.
   *
   * @returns {Promise<string>} A Promise that resolves to the new access token
   */
  async requestToken() {
    const body = new URLSearchParams({
      grant_type: 'urn:ibm:params:oauth:grant-type:apikey',
      apikey: this.apiKey,
    });
    const response = await this.fetchFn(this.tokenUrl, {
      method: 'POST',
      headers: { 'Content-Type': 'application/x-www-form-urlencoded', Accept: 'application/json' },
      body: body.toString(),
    });

    if (!response.ok) {
      throw new Error(`IBM IAM token fetch failed: ${response.status} ${response.statusText}`);
    }

    const { access_token: accessToken, expires_in: expiresIn } = await response.json();
    if (typeof accessToken !== 'string' || accessToken === '') {
      throw new Error('IBM IAM token fetch failed: response did not contain an access_token');
    }

    // Without a usable lifetime the token is still returned, but never served from cache.
    const lifetimeSeconds = Number(expiresIn);
    this.token = accessToken;
    this.expiry = Number.isFinite(lifetimeSeconds) ? Date.now() + lifetimeSeconds * 1000 : 0;
    return this.token;
  }
}
|
package/src/s3/s3.js
ADDED
|
@@ -0,0 +1,427 @@
|
|
|
1
|
+
import fsPromise from 'node:fs/promises';
|
|
2
|
+
import { dirname } from 'node:path';
|
|
3
|
+
import { S3Client, ListObjectsV2Command, GetObjectCommand } from '@aws-sdk/client-s3';
|
|
4
|
+
|
|
5
|
+
import { logger } from '../utils/logger.js';
|
|
6
|
+
import { datesInRange, hoursInRange, buildPath } from '../utils/file-path-builder/file-path-builder.js';
|
|
7
|
+
import { regexFromPattern } from '../utils/date-regex/date-regex.js';
|
|
8
|
+
import { buildIbmIamClient } from './auth/ibm-iam-client.js';
|
|
9
|
+
|
|
10
|
+
/**
 * Lists and downloads objects from one S3-compatible bucket into a local
 * mount directory, reusing files that are already present on disk.
 */
export default class S3 {
  /**
   * @param {object} options
   * @param {string} [options.apiKey] Passed to `buildS3Client`
   * @param {string} [options.accessKeyId] Passed to `buildS3Client`
   * @param {string} [options.secretAccessKey] Passed to `buildS3Client`
   * @param {string} [options.endpoint] Passed to `buildS3Client`
   * @param {string} [options.region] Passed to `buildS3Client`; defaults to 'us-east-1'
   * @param {string} options.bucket Bucket that is listed and downloaded from
   * @param {string} [options.mount] Local directory files are written under; defaults to '.'
   * @param {object} [options.listingCache] Map-like cache (has/get/set/delete) for listings; a new Map when omitted
   * @param {object[]} [options.plugins] Plugins whose optional `processFile` runs after each download
   */
  constructor({
    apiKey,
    accessKeyId,
    secretAccessKey,
    endpoint,
    region = 'us-east-1',
    bucket,
    mount = '.',
    listingCache,
    plugins = [],
  }) {
    this.bucket = bucket;
    this.s3 = buildS3Client({ apiKey, accessKeyId, secretAccessKey, endpoint, region });
    this.mount = mount;
    this.downloadFile = this.downloadFile.bind(this);
    this.resetEnqueued = this.resetEnqueued.bind(this);
    this.listFiles = this.listFiles.bind(this);
    // S3 key -> Promise of the download currently in flight for that key.
    this.enqueuedFiles = new Map();
    this.listingCache = listingCache || new Map();
    // Always an array so processFile can iterate even when no plugins were given.
    this.plugins = plugins ?? [];
  }

  /**
   * 1. List files in date range matching file patterns
   * 2. Begin downloads of the listed files plus the static files
   *
   * @typedef {object} DownloadSettings
   * @property {Date|number} from Start time
   * @property {Date|number} to End time
   * @property {object[]} filePatterns Pattern objects of the form `{ file, cache }`
   * @property {object[]} staticFiles File objects (`{ file, ... }`) downloaded without listing
   *
   * @param {DownloadSettings} downloadSettings Settings for downloading
   * @returns {Promise<string[]>} Local paths of the files that were downloaded or found on disk
   * @throws Rejects with the listing error when a pattern cannot be listed, or when the preflight check fails
   */
  async downloadFiles({ from, to, filePatterns = [], staticFiles = [] }) {
    const startListing = new Date();
    const listings = await Promise.allSettled(
      filePatterns.map((pattern) => this.getFilePathsFromPrefixes(from, to, pattern)),
    );

    // Surface the real listing error instead of letting an `undefined` entry
    // fail later with an unrelated TypeError.
    const failedListing = listings.find((listing) => listing.status === 'rejected');
    if (failedListing) throw failedListing.reason;

    const filePaths = listings.flatMap((listing) => listing.value);

    logger.info(`Total listing time: ${(new Date() - startListing) / 1000}s`);
    return this.downloadFileList([...filePaths, ...staticFiles]);
  }

  /**
   * Downloads an array of files from S3
   *
   * @param {object[]} filePaths File objects (`{ file, size, cache }`) to download
   * @returns {Promise<string[]>} A Promise that resolves to the local paths of the successful downloads
   * @throws {Error} Synchronously, when the preflight size check fails
   */
  downloadFileList(filePaths = []) {
    logger.info(`Starting downloads for ${filePaths.length} files`);
    this.preFlightCheck(filePaths);

    const stats = {
      start: new Date(),
      cacheHits: 0,
      cacheMisses: 0,
      enqueuedHits: 0,
      bytesDownloaded: 0,
    };
    const filesPromises = this.startDownloads(stats, filePaths);

    return Promise.allSettled(filesPromises)
      .then(this.logStatistics(stats))
      .then((settled) => {
        // Clean up with the settled results (status + value/reason); the bare
        // values alone carry no status and would leave every key enqueued.
        this.resetEnqueued(settled);
        return settled.filter((result) => result.value).map((result) => result.value);
      });
  }

  /**
   * Returns the file objects found in the S3 listings for a file pattern
   *
   * @param {Date|number|string} from From date
   * @param {Date|number|string} to To date
   * @param {object} filePattern Pattern object of the form `{ file, cache }`
   * @returns {Promise<object[]>} `{ file, cache, size }` for every listed key matching the pattern
   */
  getFilePathsFromPrefixes(from, to, filePattern) {
    const { file, cache } = filePattern;
    const prefixes = this.createPrefixes(from, to, file);
    const listPromises = prefixes.map((prefix) => this.listFiles(prefix));

    return Promise.allSettled(listPromises).then((results) => {
      const regex = regexFromPattern(file);

      // Listings for the current day must not stay cached: drop the day-level
      // key and every prefix just listed that falls under it (hourly prefixes).
      const todayPrefix = this.getTodayPrefix(file);
      this.listingCache.delete(`${this.bucket}/${todayPrefix}`);
      prefixes
        .filter((prefix) => prefix.startsWith(todayPrefix))
        .forEach((prefix) => this.listingCache.delete(`${this.bucket}/${prefix}`));

      results.forEach((result, index) => {
        if (result.status === 'rejected') {
          logger.error(`Error listing ${this.bucket}/${prefixes[index]} %s`, result.reason);
        }
      });

      return results
        .filter((result) => result.status === 'fulfilled')
        .flatMap((result) => result.value)
        .filter((fileObject) => regex.test(fileObject.file))
        .map((fileObject) => ({ file: fileObject.file, cache, size: fileObject.size }));
    });
  }

  /**
   * Returns a list of prefixes to use for S3 listing for files within a given date range
   *
   * @param {Date} from From date
   * @param {Date} to To date
   * @param {string} filePattern The file pattern to use
   * @returns {string[]} The list of prefixes for filtering
   */
  createPrefixes(from, to, filePattern) {
    const prefixStrategy = this.prefixStrategy(from, to, filePattern);
    // The strategy is invoked as a plain function, so strategies must not rely on `this`.
    return prefixStrategy(from, to, filePattern);
  }

  /**
   * Determines which strategy to use to create S3 prefix queries
   *
   * @param {Date} from From date
   * @param {Date} to To date
   * @param {string} filePattern The file pattern to use
   * @returns {(from:Date, to:Date, filePattern:string) => string[]} A function that creates a list of prefixes
   */
  prefixStrategy(from, to, filePattern) {
    const hasDateToken = filePattern.match(/\{(yyyy|MM|dd|hh|mm)\}/g);
    const hasGlob = filePattern.match(/\*/g);
    const hourDiff = (new Date(to) - new Date(from)) / 1000 / 60 / 60;

    if (hasDateToken && hourDiff < 24) return this.prefixHours;
    if (hasDateToken) return this.prefixDays;
    if (hasGlob) return this.prefixGlob;
    return (_from, _to, pattern) => [pattern];
  }

  /**
   * Returns a list of prefixes based on a range of hours.
   * A pattern containing `{hh}` is cut right after it; a pattern without
   * `{hh}` is cut before the first finer token or glob instead.
   *
   * @param {Date} from From date
   * @param {Date} to To date
   * @param {string} filePattern The file pattern to use
   * @returns {string[]} The de-duplicated list of prefixes for filtering
   */
  prefixHours(from, to, filePattern) {
    const template = trimPatternAt(filePattern, '{hh}', ['{mm}', '{ss}']);
    const prefixes = hoursInRange(new Date(from), new Date(to)).map((date) => buildPath(template, date));
    return [...new Set(prefixes)];
  }

  /**
   * Returns a list of prefixes based on a range of days.
   * A pattern containing `{dd}` is cut right after it; a pattern without
   * `{dd}` is cut before the first finer token or glob instead.
   *
   * @param {Date} from From date
   * @param {Date} to To date
   * @param {string} filePattern The file pattern to use
   * @returns {string[]} The de-duplicated list of prefixes for filtering
   */
  prefixDays(from, to, filePattern) {
    const template = trimPatternAt(filePattern, '{dd}', ['{hh}', '{mm}', '{ss}']);
    const prefixes = datesInRange(new Date(from), new Date(to)).map((date) => buildPath(template, date));
    return [...new Set(prefixes)];
  }

  /**
   * Returns a single entry array with a file pattern trimmed to the first glob
   *
   * @param {Date} _from From date (unused)
   * @param {Date} _to To date (unused)
   * @param {string} filePattern The file pattern to use
   * @returns {string[]} The single entry array with a file pattern trimmed to the first glob
   */
  prefixGlob(_from, _to, filePattern) {
    const [trimmed] = filePattern.split('*');
    return [trimmed];
  }

  /**
   * Returns the day-level prefix of the file pattern for the current time.
   * We need this so we can remove today's listing from the cache if it exists.
   *
   * @param {string} filePattern File pattern
   * @returns {string} The pattern cut after `{dd}` with date tokens filled in for now
   */
  getTodayPrefix(filePattern) {
    const [trimmed] = filePattern.split('{dd}');
    return buildPath(`${trimmed}{dd}`, new Date());
  }

  /**
   * Returns a list of files from S3 under the given prefix, following
   * continuation tokens until the listing is complete. Results are stored in
   * the listing cache under `<bucket>/<prefix>`.
   *
   * @param {string} prefix The prefix to use when querying S3
   * @returns {Promise<object[]>} A Promise that resolves to `{ file, size }` for every key under `prefix`
   */
  async listFiles(prefix) {
    const cacheKey = `${this.bucket}/${prefix}`;
    if (this.listingCache.has(cacheKey)) {
      return this.listingCache.get(cacheKey);
    }

    const files = [];
    let continuationToken;
    do {
      const response = await this.s3.send(
        new ListObjectsV2Command({ Bucket: this.bucket, Prefix: prefix, ContinuationToken: continuationToken }),
      );
      response.Contents?.forEach((content) => {
        files.push({ file: content.Key, size: content.Size });
      });
      continuationToken = response.NextContinuationToken;
    } while (continuationToken);

    this.listingCache.set(cacheKey, files);
    return files;
  }

  /**
   * Removes finished downloads from the enqueued map.
   * Rejected downloads carry the S3 key as their reason; fulfilled downloads
   * carry the local path, which is mapped back to the key by stripping the mount.
   *
   * @param {PromiseSettledResult[]} fileDLPromises Settled results of all the files downloaded
   * @returns {PromiseSettledResult[]} The same settled results
   */
  resetEnqueued(fileDLPromises) {
    const mountPrefix = `${this.mount}/`;
    fileDLPromises.forEach((fileDownloadPromise) => {
      if (fileDownloadPromise.status === 'rejected') {
        this.enqueuedFiles.delete(fileDownloadPromise.reason);
      }
      if (fileDownloadPromise.status === 'fulfilled') {
        const localPath = fileDownloadPromise.value;
        const key =
          typeof localPath === 'string' && localPath.startsWith(mountPrefix)
            ? localPath.slice(mountPrefix.length)
            : localPath;
        this.enqueuedFiles.delete(key);
      }
    });
    return fileDLPromises;
  }

  /**
   * Logs download statistics
   *
   * @param {object} stats A statistics object
   * @returns {(results: any) => any} A pass-through function that logs and returns its argument
   */
  logStatistics(stats) {
    return (results) => {
      const mbDownloaded = stats.bytesDownloaded / (1024 * 1024);
      const seconds = (new Date() - stats.start) / 1000;
      // Guard against a zero elapsed time so the rate is never NaN or Infinity.
      const mbPerSecond = seconds > 0 ? mbDownloaded / seconds : 0;

      logger.info(`Enqueued keys: ${this.enqueuedFiles.size}`);
      logger.info(
        `Download completed in: ${seconds} seconds. Cache hits: ${stats.cacheHits}. Cache misses: ${stats.cacheMisses}. Enqueued hits: ${stats.enqueuedHits}. MB downloaded: ${mbDownloaded}. MB/s ${mbPerSecond}`,
      );
      return results;
    };
  }

  /**
   * Starts the download for all files, reusing the in-flight Promise when a
   * key is already being downloaded.
   *
   * @param {object} stats Stats to write to
   * @param {object[]} filePaths An array of file objects
   * @returns {Promise[]} An array of promises resolving when a download is complete and written
   */
  startDownloads(stats, filePaths) {
    return filePaths.map((fileObject) => {
      const { file } = fileObject;
      if (this.enqueuedFiles.has(file)) {
        stats.enqueuedHits += 1;
        return this.enqueuedFiles.get(file);
      }
      const filePromise = this.downloadFile(stats, fileObject);
      this.enqueuedFiles.set(file, filePromise);
      return filePromise;
    });
  }

  /**
   * Wrapper decides which download strategy to use. One of:
   * - Check cache before downloading
   * - Don't check cache and always download (when `fileObject.cache === false`)
   *
   * @param {object} stats The stats object to write file cache stats to
   * @param {object} fileObject The file object
   * @returns {Promise<string>} A Promise that resolves to the local file path
   */
  downloadFile(stats, fileObject) {
    if (fileObject.cache === false) return this.downloadFileForced(stats, fileObject);
    return this.downloadFileCache(stats, fileObject);
  }

  /**
   * A download strategy that checks the local mount before downloading.
   * Default strategy.
   *
   * @param {object} stats The stats object to write file stats to
   * @param {object} fileObject The file object
   * @returns {Promise<string>} A Promise that resolves to the local file path; rejects with the S3 key on failure
   */
  downloadFileCache(stats, fileObject) {
    const localPath = `${this.mount}/${fileObject.file}`;
    return fsPromise.stat(localPath).then(
      () => {
        stats.cacheHits += 1;
        return localPath;
      },
      // stat failed: the file is not on disk yet, so download it.
      () => this.downloadFileForced(stats, fileObject),
    );
  }

  /**
   * A download strategy that does not check cache before downloading.
   *
   * @param {object} stats The stats object to write file stats to
   * @param {object} fileObject The file object
   * @returns {Promise<string>} A Promise that resolves to the local file path; rejects with the S3 key on failure
   */
  downloadFileForced(stats, fileObject) {
    const { file, size } = fileObject;
    const dir = dirname(`${this.mount}/${file}`);
    return fsPromise
      .mkdir(dir, { recursive: true })
      .then(() => {
        stats.cacheMisses += 1;
        // File objects without a numeric size must not turn the counter into NaN.
        stats.bytesDownloaded += Number.isFinite(size) ? size : 0;
        return this.objectToFile(file);
      })
      .catch(() => {
        // The key (not the error) is the rejection reason; resetEnqueued relies on it.
        return Promise.reject(file);
      });
  }

  /**
   * Downloads an S3 object and writes it to the local filesystem, then hands
   * the written file to the plugins.
   *
   * @param {string} key The S3 object key
   * @returns {Promise<string>} The local file path the object was written to
   * @throws Re-throws the download or write error after logging it
   */
  async objectToFile(key) {
    const file = `${this.mount}/${key}`;
    try {
      const response = await this.s3.send(new GetObjectCommand({ Bucket: this.bucket, Key: key }));
      const chunks = [];
      for await (const chunk of response.Body) {
        chunks.push(chunk);
      }
      await fsPromise.writeFile(file, Buffer.concat(chunks));
      await this.processFile(file);
      return file;
    } catch (error) {
      logger.error(`${error.$metadata?.httpStatusCode ?? error.statusCode} - ${file}`);
      throw error;
    }
  }

  /**
   * Passes a downloaded file path to possibly be further processed by a plugin.
   * For example, the Avro plugin processes .avro files to json.
   * A failing plugin is logged and does not fail the download.
   *
   * @param {string} file Local path of the downloaded file
   * @returns {Promise<string>} A Promise that resolves to `file` once every plugin has settled
   */
  processFile(file) {
    const fileProcessPromises = this.plugins.map((plugin) => {
      return plugin.processFile ? plugin.processFile(file) : Promise.resolve(file);
    });
    return Promise.allSettled(fileProcessPromises).then((outcomes) => {
      outcomes.forEach((outcome) => {
        if (outcome.status === 'rejected') {
          logger.error(`Error processing file ${file} %s`, outcome.reason);
        }
      });
      return file;
    });
  }

  /**
   * Checks the accumulated value of bytes preflight.
   * Throws if it exceeds `process.env.MAX_MB_DOWNLOAD` or the default of 1000.
   * Entries without a numeric size count as 0 bytes, and a non-numeric
   * MAX_MB_DOWNLOAD falls back to the default, so neither can disable the check.
   *
   * @param {object[]} filePaths An array of file path objects
   * @returns {boolean} true when the total is within the limit
   * @throws {Error} When the total size exceeds the limit
   */
  preFlightCheck(filePaths) {
    const totalBytes = filePaths.reduce((total, fileObject) => {
      return total + (Number.isFinite(fileObject.size) ? fileObject.size : 0);
    }, 0);
    const totalMB = totalBytes / 1e6;
    const configuredMax = Number(process.env.MAX_MB_DOWNLOAD);
    const maxMB = process.env.MAX_MB_DOWNLOAD && Number.isFinite(configuredMax) ? configuredMax : 1000;

    if (totalMB > maxMB) {
      throw new Error(`The total file size required for this query (${totalMB} MBs) exceeds ${maxMB} MBs`);
    }

    return true;
  }
}

/**
 * Cuts a file pattern down to the part usable as a listing prefix template.
 * When `token` is present the pattern is cut right after its first occurrence.
 * Otherwise it is cut before the first finer-grained token or glob, so that no
 * token or `*` the strategy cannot resolve ends up in the prefix.
 *
 * @param {string} filePattern The file pattern
 * @param {string} token The date token the strategy iterates over, e.g. '{dd}'
 * @param {string[]} finerTokens Tokens of finer granularity than `token`
 * @returns {string} The trimmed pattern
 */
function trimPatternAt(filePattern, token, finerTokens) {
  if (filePattern.includes(token)) {
    const [head] = filePattern.split(token);
    return `${head}${token}`;
  }
  const cutPoints = [...finerTokens, '*']
    .map((marker) => filePattern.indexOf(marker))
    .filter((index) => index !== -1);
  return cutPoints.length > 0 ? filePattern.slice(0, Math.min(...cutPoints)) : filePattern;
}
|
|
421
|
+
|
|
422
|
+
/**
 * Creates the client used to talk to the object store.
 * With an `apiKey` the client comes from `buildIbmIamClient`; otherwise an
 * `S3Client` is created with the given access key pair as credentials.
 *
 * @param {object} options
 * @param {string} [options.apiKey] API key; selects the IBM IAM client when truthy
 * @param {string} [options.accessKeyId] Access key id used when no API key is given
 * @param {string} [options.secretAccessKey] Secret access key used when no API key is given
 * @param {string} [options.endpoint] Endpoint; only added to the config when truthy
 * @param {string} options.region Region placed in the client config
 * @returns {object} The client instance
 */
function buildS3Client({ apiKey, accessKeyId, secretAccessKey, endpoint, region }) {
  const sharedConfig = endpoint
    ? { endpoint, region, forcePathStyle: true }
    : { region, forcePathStyle: true };

  if (!apiKey) {
    const credentials = { accessKeyId, secretAccessKey };
    return new S3Client({ ...sharedConfig, credentials });
  }

  return buildIbmIamClient(sharedConfig, apiKey);
}
|
|
@@ -0,0 +1,107 @@
|
|
|
1
|
+
import { LRUCache } from 'lru-cache';
|
|
2
|
+
|
|
3
|
+
import S3 from './s3/s3.js';
|
|
4
|
+
export { bigintReplacer } from './utils/bigint-replacer.js';
|
|
5
|
+
import { mergeSettings } from './utils/file-settings/file-settings.js';
|
|
6
|
+
import { query as execQuery } from './duck-db/index.js';
|
|
7
|
+
import QueryParserPlugin from './plugins/query-parser/query-parser.js';
|
|
8
|
+
import QueryFinalizerPlugin from './plugins/query-finalizer/query-finalizer.js';
|
|
9
|
+
|
|
10
|
+
// Module-level LRU cache (at most 1000 entries) that startDownloads passes to every S3 instance as its `listingCache`.
const listingCache = new LRUCache({ max: 1000 });
|
|
11
|
+
|
|
12
|
+
/**
 * Runs a query end to end: the query is passed through the plugin chain, the
 * files it refers to are downloaded, and the processed query is then executed.
 *
 * @param {object} options
 * @param {string} [options.apiKey] - IBM Cloud API key. When provided, IBM IAM token auth is used instead of HMAC.
 * @param {string} [options.accessKeyId] - HMAC access key ID. Required when not using `apiKey`.
 * @param {string} [options.secretAccessKey] - HMAC secret access key. Required when not using `apiKey`.
 * @param {string} options.defaultEndpoint - Endpoint handed to the plugins as `endpoint`.
 * @param {string} options.defaultBucket - Bucket handed to the plugins as `defaultBucket`.
 * @param {string} options.bucketsDir - Local directory under which each bucket's files are stored.
 * @param {string} options.query - The SQL query to process and execute.
 * @param {number} [options.from] - Start of date range as a Unix timestamp in milliseconds.
 * @param {number} [options.to] - End of date range as a Unix timestamp in milliseconds.
 * @param {string} [options.format] - Output format forwarded to the query executor.
 * @param {object[]} [options.plugins] - Additional plugins, run between the built-in parser and finalizer plugins.
 * @returns {Promise<Array>} Query results; rejects with the first download failure, if any.
 */
export default function s3Querier({
  to,
  from,
  bucketsDir,
  defaultEndpoint,
  defaultBucket,
  query,
  plugins = [],
  apiKey,
  accessKeyId,
  secretAccessKey,
  format,
}) {
  // User plugins are sandwiched between the built-in parser and finalizer.
  const pluginChain = [new QueryParserPlugin(), ...plugins, new QueryFinalizerPlugin()];
  const queryContext = { query, endpoint: defaultEndpoint, defaultBucket, bucketsDir };
  const { query: processedQuery, settings: downloadSettings } = processQuery(pluginChain, queryContext);

  const pendingDownloads = startDownloads({
    apiKey,
    accessKeyId,
    secretAccessKey,
    bucketsDir,
    to,
    from,
    downloadSettings,
    plugins: pluginChain,
  });

  // Wait for every bucket to finish before deciding whether to run the query.
  return Promise.allSettled(pendingDownloads).then((outcomes) => {
    const firstFailure = outcomes.find((outcome) => outcome.status === 'rejected');
    if (firstFailure) throw firstFailure.reason;
    return execQuery(processedQuery, { format });
  });
}
|
|
64
|
+
|
|
65
|
+
/**
 * Threads a query context through every plugin in order, then merges the
 * download settings the plugins collected.
 *
 * @param {Array} plugins Plugins exposing `processQuery(context)`; each receives the previous plugin's result
 * @param {object} context
 * @param {string} [context.query] The query text; defaults to ''
 * @param {string} context.endpoint Endpoint made available to the plugins
 * @param {string} context.defaultBucket Bucket made available to the plugins
 * @param {string} context.bucketsDir Local buckets directory made available to the plugins
 * @returns {object} The last plugin's result with `settings` replaced by the merged settings
 */
function processQuery(plugins = [], { query = '', endpoint, defaultBucket, bucketsDir }) {
  let context = { endpoint, defaultBucket, bucketsDir, query, settings: [] };
  for (const plugin of plugins) {
    context = plugin.processQuery(context);
  }
  context.settings = mergeSettings(context.settings);
  return context;
}
|
|
84
|
+
|
|
85
|
+
/**
 * Kicks off the downloads for every merged download setting, one S3 instance
 * per setting.
 *
 * @param {object} params
 * @param {number|string} params.to End of the date range; converted with `Number`
 * @param {number|string} params.from Start of the date range; converted with `Number`
 * @param {object[]} params.downloadSettings Entries of the form `{ endpoint, bucket, filePatterns, staticFiles }`
 * @param {string} params.bucketsDir Local directory; each bucket is mounted at `<bucketsDir>/<bucket>`
 * @param {string} [params.apiKey] Forwarded to S3
 * @param {string} [params.accessKeyId] Forwarded to S3
 * @param {string} [params.secretAccessKey] Forwarded to S3
 * @param {object[]} params.plugins Forwarded to S3
 * @returns {Promise[]} One promise per setting, as returned by `S3#downloadFiles`
 */
function startDownloads({ to, from, downloadSettings, bucketsDir, apiKey, accessKeyId, secretAccessKey, plugins }) {
  const dateRange = { to: Number(to), from: Number(from) };

  return downloadSettings.map(({ endpoint, bucket, filePatterns, staticFiles }) => {
    const mount = `${bucketsDir}/${bucket}`;
    const client = new S3({
      apiKey,
      accessKeyId,
      secretAccessKey,
      endpoint,
      bucket,
      mount,
      listingCache,
      plugins,
    });
    return client.downloadFiles({ ...dateRange, filePatterns, staticFiles });
  });
}
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
/**
 * Replacer for `JSON.stringify` that turns BigInt values into Numbers and
 * leaves every other value untouched.
 * Useful when serializing results that may contain BigInt columns
 * (e.g. COUNT(*), SUM of integer columns). Note: values above
 * Number.MAX_SAFE_INTEGER will lose precision in the conversion.
 *
 * @param {string} _ - The key (unused)
 * @param {*} val - The value to serialize
 * @returns {*} `Number(val)` for a BigInt, otherwise `val` itself
 */
export function bigintReplacer(_, val) {
  if (typeof val !== 'bigint') {
    return val;
  }
  return Number(val);
}
|
|
@@ -0,0 +1,52 @@
|
|
|
1
|
+
// Regex source fragment for exactly four digits; regexFromPattern substitutes it for `{yyyy}`.
const DIGITS_4 = '\\d{4}';
|
|
2
|
+
// Regex source fragment for exactly two digits; regexFromPattern substitutes it for the two-digit date tokens.
const DIGITS_2 = '\\d{2}';
|
|
3
|
+
|
|
4
|
+
/**
 * Fills every `{yyyy}` token with the date's full year (local time).
 *
 * @param {string} str Text containing `{yyyy}` tokens
 * @param {Date} date Date supplying the year
 * @returns {string} The text with the tokens replaced
 */
export function yyyy(str, date) {
  const year = `${date.getFullYear()}`;
  return str.split('{yyyy}').join(year);
}
|
|
7
|
+
|
|
8
|
+
/**
 * Fills every `{MM}` token with the date's month number (1-12, local time),
 * left-padded with a zero to two characters.
 *
 * @param {string} str Text containing `{MM}` tokens
 * @param {Date} date Date supplying the month
 * @returns {string} The text with the tokens replaced
 */
export function MM(str, date) {
  const monthNumber = date.getMonth() + 1; // getMonth() is zero-based
  const month = `${monthNumber}`.padStart(2, '0');
  return str.split('{MM}').join(month);
}
|
|
11
|
+
|
|
12
|
+
/**
 * Fills every `{dd}` token with the date's day of the month (local time),
 * left-padded with a zero to two characters.
 *
 * @param {string} str Text containing `{dd}` tokens
 * @param {Date} date Date supplying the day
 * @returns {string} The text with the tokens replaced
 */
export function dd(str, date) {
  const day = `${date.getDate()}`.padStart(2, '0');
  return str.split('{dd}').join(day);
}
|
|
15
|
+
|
|
16
|
+
/**
 * Fills every `{hh}` token with the date's hour (0-23, local time),
 * left-padded with a zero to two characters.
 *
 * @param {string} str Text containing `{hh}` tokens
 * @param {Date} date Date supplying the hour
 * @returns {string} The text with the tokens replaced
 */
export function hh(str, date) {
  const hour = `${date.getHours()}`.padStart(2, '0');
  return str.split('{hh}').join(hour);
}
|
|
19
|
+
|
|
20
|
+
/**
 * Fills every `{mm}` token with the date's minutes (local time),
 * left-padded with a zero to two characters.
 *
 * @param {string} str Text containing `{mm}` tokens
 * @param {Date} date Date supplying the minutes
 * @returns {string} The text with the tokens replaced
 */
export function mm(str, date) {
  const minutes = `${date.getMinutes()}`.padStart(2, '0');
  return str.split('{mm}').join(minutes);
}
|
|
23
|
+
|
|
24
|
+
/**
 * Fills every `{ss}` token with the date's seconds (local time),
 * left-padded with a zero to two characters.
 *
 * @param {string} str Text containing `{ss}` tokens
 * @param {Date} date Date supplying the seconds
 * @returns {string} The text with the tokens replaced
 */
export function ss(str, date) {
  const seconds = `${date.getSeconds()}`.padStart(2, '0');
  return str.split('{ss}').join(seconds);
}
|
|
27
|
+
|
|
28
|
+
/**
 * Builds an (unanchored) regular expression from a file pattern.
 * Date tokens become fixed-width digit groups, `*` becomes a lazy wildcard,
 * and every other regex metacharacter is escaped so it only matches itself.
 * Escaping all metacharacters (not just `/`, `.` and `+`) keeps keys such as
 * `report(1).json` or `data[raw/*` from producing a wrong or invalid regex.
 *
 * @param {string} [pattern] File pattern with optional date tokens and globs
 * @returns {RegExp} A regular expression matching keys that fit the pattern
 */
export function regexFromPattern(pattern = '') {
  const tokenSources = new Map([
    ['{yyyy}', DIGITS_4],
    ['{MM}', DIGITS_2],
    ['{dd}', DIGITS_2],
    ['{hh}', DIGITS_2],
    ['{mm}', DIGITS_2],
    ['{ss}', DIGITS_2],
    ['*', '.*?'],
  ]);

  // One pass over the pattern: whole date tokens are matched first, so their
  // braces are never escaped individually.
  const source = pattern.replace(/\{(?:yyyy|MM|dd|hh|mm|ss)\}|[\\^$.*+?()[\]{}|\/]/g, (match) => {
    return tokenSources.get(match) ?? `\\${match}`;
  });

  return new RegExp(source);
}
|
|
43
|
+
|
|
44
|
+
/**
 * Replaces every date token (`{yyyy}`, `{MM}`, `{dd}`, `{hh}`, `{mm}`, `{ss}`)
 * in the given text with a `*` glob.
 *
 * @param {string} [query] Text that may contain date tokens
 * @returns {string} The text with each date token swapped for `*`
 */
export function removeFileDatePatterns(query = '') {
  const dateToken = /\{(?:yyyy|MM|mm|dd|hh|ss)\}/g;
  return query.replace(dateToken, '*');
}
|
|
@@ -0,0 +1,55 @@
|
|
|
1
|
+
import { eachDayOfInterval, eachHourOfInterval } from 'date-fns';
|
|
2
|
+
import { yyyy, MM, dd, hh, mm, ss } from '../date-regex/date-regex.js';
|
|
3
|
+
/**
 * Resolves the date tokens of a file pattern against a concrete date by
 * applying the yyyy, MM, dd, hh, mm and ss replacers in that order.
 *
 * @param {string} filePattern A string with date tokens (year down to seconds)
 * @param {Date} date JS Date object supplying the values
 * @returns {string} The pattern with its date tokens replaced
 */
export function buildPath(filePattern, date) {
  let path = filePattern;
  for (const applyToken of [yyyy, MM, dd, hh, mm, ss]) {
    path = applyToken(path, date);
  }
  return path;
}
|
|
16
|
+
|
|
17
|
+
/**
 * Lists the days covered by a date range.
 * Both bounds are copied and have their minutes and seconds zeroed before
 * being handed to date-fns `eachDayOfInterval`.
 *
 * @param {Date} from The from Date object
 * @param {Date} to The to Date object
 * @returns {Date[]} The dates produced by `eachDayOfInterval` for the range
 */
export function datesInRange(from, to) {
  const start = zeroDateMins(new Date(from));
  const end = zeroDateMins(new Date(to));
  return eachDayOfInterval({ start, end });
}
|
|
30
|
+
|
|
31
|
+
/**
 * Lists the hours covered by a date range by handing copies of both bounds to
 * date-fns `eachHourOfInterval`.
 *
 * @param {Date} from The from Date object
 * @param {Date} to The to Date object
 * @returns {Date[]} The dates produced by `eachHourOfInterval` for the range
 */
export function hoursInRange(from, to) {
  const interval = { start: new Date(from), end: new Date(to) };
  return eachHourOfInterval(interval);
}
|
|
44
|
+
|
|
45
|
+
/**
 * Returns a copy of a date with its minutes and seconds set to 0.
 * The input date is not modified; milliseconds are left as they are.
 *
 * @param {Date} date Date object
 * @returns {Date} A new Date object with minutes and seconds set to 0
 */
export function zeroDateMins(date) {
  // setMinutes returns the updated timestamp, which seeds the result directly.
  return new Date(new Date(date).setMinutes(0, 0));
}
|