s3-querier 1.2.2 → 1.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +2 -2
- package/src/duck-db/index.js +1 -7
- package/src/mcp/tools/query/query.js +6 -1
- package/src/plugins/fs-purge/fs-purge-plugin.js +21 -0
- package/src/plugins/lifecycle.js +124 -0
- package/src/plugins/stats/stats-plugin.js +49 -0
- package/src/s3/s3.js +44 -59
- package/src/s3-querier.js +8 -42
- package/src/utils/date-regex/date-regex.js +6 -6
- package/src/utils/file-path-builder/file-path-builder.js +14 -23
- package/src/utils/fs-purge/fs-purge.js +57 -0
- package/src/utils/logger.js +1 -1
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "s3-querier",
|
|
3
|
-
"version": "1.
|
|
3
|
+
"version": "1.3.0",
|
|
4
4
|
"description": "Query S3-compatible storage with DuckDB and SQL",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"main": "src/s3-querier.js",
|
|
@@ -51,7 +51,7 @@
|
|
|
51
51
|
"@duckdb/node-api": "^1.5.3-r.3",
|
|
52
52
|
"@modelcontextprotocol/sdk": "^1.29.0",
|
|
53
53
|
"avsc": "^5.7.7",
|
|
54
|
-
"
|
|
54
|
+
"glob": "^13.0.6",
|
|
55
55
|
"hyparquet": "^1.26.0",
|
|
56
56
|
"lru-cache": "^11.0.0",
|
|
57
57
|
"peggy": "^5.1.0",
|
package/src/duck-db/index.js
CHANGED
|
@@ -20,19 +20,13 @@ const formatStrategies = {
|
|
|
20
20
|
*/
|
|
21
21
|
export async function query(sql, options = {}) {
|
|
22
22
|
const { format } = options;
|
|
23
|
-
const queryStart = new Date();
|
|
24
|
-
|
|
25
23
|
try {
|
|
26
24
|
const connection = await db.connect();
|
|
27
25
|
const reader = await connection.runAndReadAll(sql);
|
|
28
26
|
const columnsResult = reader.getColumnsObjectJS();
|
|
29
27
|
|
|
30
28
|
const formatter = formatStrategies[format] ?? formatStrategies.default;
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
const queryTime = new Date() - queryStart;
|
|
34
|
-
logger.info(`Query completed in : ${queryTime / 1000} seconds`);
|
|
35
|
-
return result ?? [];
|
|
29
|
+
return formatter(columnsResult) ?? [];
|
|
36
30
|
} catch (error) {
|
|
37
31
|
logger.error(error);
|
|
38
32
|
throw error;
|
|
@@ -2,7 +2,8 @@ import { readFileSync } from 'node:fs';
|
|
|
2
2
|
import { z } from 'zod';
|
|
3
3
|
|
|
4
4
|
import BaseTool from '../base-tool.js';
|
|
5
|
-
import s3Querier, { bigintReplacer } from '../../../s3-querier.js';
|
|
5
|
+
import s3Querier, { bigintReplacer, FSPurgePlugin, StatsPlugin } from '../../../s3-querier.js';
|
|
6
|
+
import { logger } from '../../../utils/logger.js';
|
|
6
7
|
|
|
7
8
|
const {
|
|
8
9
|
S3_ACCESS_KEY_ID,
|
|
@@ -13,6 +14,9 @@ const {
|
|
|
13
14
|
S3_BUCKETS_DIR = '/tmp/s3-querier',
|
|
14
15
|
} = process.env;
|
|
15
16
|
|
|
17
|
+
const purgePlugin = new FSPurgePlugin({ bucketsDir: S3_BUCKETS_DIR });
|
|
18
|
+
const statsPlugin = new StatsPlugin((event) => logger.info(event));
|
|
19
|
+
|
|
16
20
|
const sqlDescription = readFileSync(new URL('../../descriptions/sql-param.md', import.meta.url), 'utf8');
|
|
17
21
|
const toolDescription = readFileSync(new URL('../../descriptions/tool.md', import.meta.url), 'utf8');
|
|
18
22
|
|
|
@@ -59,6 +63,7 @@ export default class QueryTool extends BaseTool {
|
|
|
59
63
|
accessKeyId: S3_ACCESS_KEY_ID,
|
|
60
64
|
secretAccessKey: S3_SECRET_ACCESS_KEY,
|
|
61
65
|
format: 'jsonRecords',
|
|
66
|
+
plugins: [purgePlugin, statsPlugin],
|
|
62
67
|
});
|
|
63
68
|
|
|
64
69
|
return {
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
import FSPurge from '../../utils/fs-purge/fs-purge.js';
|
|
2
|
+
|
|
3
|
+
export default class FSPurgePlugin {
  /**
   * Query plugin that asks an FSPurge instance to sweep the local cache
   * directory after every query. The purger decides on its own whether a
   * sweep is actually due.
   *
   * @param {object} options
   * @param {string} options.bucketsDir Local directory where S3 files are cached
   * @param {number} [options.lastAccessTTLMinutes=60] Minutes since last access before a file is eligible for purge
   * @param {number} [options.refreshIntervalMin=60] Minimum minutes between sweeps
   */
  constructor({ bucketsDir, lastAccessTTLMinutes = 60, refreshIntervalMin = 60 }) {
    // Every file below bucketsDir is a purge candidate.
    this.purger = new FSPurge({ pattern: `${bucketsDir}/**`, lastAccessTTLMinutes, refreshIntervalMin });
  }

  /**
   * No-op query hook: this plugin never rewrites the query context.
   *
   * @param {object} context Query context passed along the plugin chain
   * @returns {object} The same context object, untouched
   */
  processQuery(context) {
    return context;
  }

  /**
   * Requests a cache sweep once a query has finished.
   *
   * The sweep promise is returned rather than dropped so that whoever invokes
   * this hook can attach a rejection handler; a dropped promise would turn a
   * failed sweep into an unhandled rejection.
   *
   * @returns {Promise<void>} Settles when the sweep has finished or was skipped
   */
  postQuery() {
    return this.purger.sweep();
  }
}
|
|
@@ -0,0 +1,124 @@
|
|
|
1
|
+
import { logger } from '../utils/logger.js';
|
|
2
|
+
import { mergeSettings } from '../utils/file-settings/file-settings.js';
|
|
3
|
+
|
|
4
|
+
/**
 * Passes the query context through each plugin's `processQuery` hook, in
 * plugin order, then merges the file download settings the plugins produced.
 *
 * Plugins without a `processQuery` method are skipped, matching how the other
 * lifecycle hooks treat unimplemented methods.
 *
 * @param {object[]} [plugins=[]] - Plugin instances.
 * @param {object} [context={}] - `{ query, endpoint, defaultBucket, bucketsDir }`.
 * @returns {object} The final plugin context, with `settings` replaced by the
 *   output of `mergeSettings` and the pre-merge list exposed as `fileSettings`.
 */
export function processQuery(plugins = [], { query = '', endpoint, defaultBucket, bucketsDir } = {}) {
  const initialContext = {
    endpoint,
    defaultBucket,
    bucketsDir,
    query,
    settings: [],
  };
  const processedQuery = plugins.reduce(
    (context, plugin) => (plugin.processQuery ? plugin.processQuery(context) : context),
    initialContext,
  );
  // Keep the per-file list before it is overwritten by the merged form.
  const fileSettings = processedQuery.settings;
  processedQuery.settings = mergeSettings(processedQuery.settings);
  return { ...processedQuery, fileSettings };
}
|
|
24
|
+
|
|
25
|
+
/**
 * Threads the SQL text through every plugin that implements `finalizeQuery`,
 * in plugin order. Each finalizer receives the SQL produced by the previous
 * one; plugins without the hook leave the SQL unchanged.
 *
 * @param {object} params
 * @param {object[]} params.plugins - Plugin instances.
 * @param {string} params.rawQuery - SQL with original file references and date/location tokens.
 * @param {object[]} params.fileSettings - Pre-merge per-file settings from `processQuery`.
 * @param {string[]} params.downloadedPaths - Absolute local paths of all downloaded files.
 * @param {string} params.bucketsDir - Root directory where files are cached locally.
 * @returns {string} Finalized SQL ready for DuckDB execution.
 */
export function runFinalizers({ plugins, rawQuery, fileSettings, downloadedPaths, bucketsDir }) {
  let sql = rawQuery;
  plugins.forEach((plugin) => {
    if (plugin.finalizeQuery) {
      sql = plugin.finalizeQuery(sql, fileSettings, downloadedPaths, bucketsDir);
    }
  });
  return sql;
}
|
|
43
|
+
|
|
44
|
+
/**
 * Invokes `preListFiles` on every plugin that has it and gathers whatever each
 * call returns. A plugin without the hook, or whose hook returns nothing,
 * yields `null`, so the result always lines up index-for-index with `plugins`.
 *
 * @param {object[]} plugins - Plugin instances.
 * @param {object} context - `{ prefix, bucket }`.
 * @returns {Array<Function|null>} One entry per plugin — a callback or `null`.
 */
export function runPreListFiles(plugins, context) {
  return plugins.map((plugin) => {
    if (plugin.preListFiles == null) return null;
    const callback = plugin.preListFiles(context);
    return callback ?? null;
  });
}
|
|
56
|
+
|
|
57
|
+
/**
 * Invokes each callback returned by `runPreListFiles`. Errors are logged and
 * swallowed so a failing plugin never rejects the caller's query result. This
 * covers both a callback that throws synchronously and one whose returned
 * promise rejects.
 *
 * @param {Array<Function|null>} callbacks - Collected from `runPreListFiles`.
 * @param {object} context - `{ prefix, bucket, files, durationMs, cacheHit }`.
 */
export function runPostListFiles(callbacks, context) {
  callbacks.forEach((cb) => {
    if (!cb) return;
    try {
      // `.catch` only sees asynchronous rejections; `cb(context)` is evaluated
      // before `Promise.resolve` runs, so a synchronous throw needs the try/catch.
      Promise.resolve(cb(context)).catch((error) => logger.error(error));
    } catch (error) {
      logger.error(error);
    }
  });
}
|
|
69
|
+
|
|
70
|
+
/**
 * Invokes `preDownloadFiles` on every plugin that has it and gathers whatever
 * each call returns. A plugin without the hook, or whose hook returns nothing,
 * yields `null`, so the result always lines up index-for-index with `plugins`.
 *
 * @param {object[]} plugins - Plugin instances.
 * @param {object} context - `{ bucket, from, to }`.
 * @returns {Array<Function|null>} One entry per plugin — a callback or `null`.
 */
export function runPreDownloadFiles(plugins, context) {
  return plugins.map((plugin) => {
    if (plugin.preDownloadFiles == null) return null;
    const callback = plugin.preDownloadFiles(context);
    return callback ?? null;
  });
}
|
|
82
|
+
|
|
83
|
+
/**
 * Invokes each callback returned by `runPreDownloadFiles`. Errors are logged
 * and swallowed so a failing plugin never rejects the caller's query result.
 * This covers both a callback that throws synchronously and one whose returned
 * promise rejects.
 *
 * @param {Array<Function|null>} callbacks - Collected from `runPreDownloadFiles`.
 * @param {object} context - `{ cacheHits, cacheMisses, enqueuedHits, bytesDownloaded, durationMs, bucket }`.
 */
export function runPostDownloadFiles(callbacks, context) {
  callbacks.forEach((cb) => {
    if (!cb) return;
    try {
      // `.catch` only sees asynchronous rejections; `cb(context)` is evaluated
      // before `Promise.resolve` runs, so a synchronous throw needs the try/catch.
      Promise.resolve(cb(context)).catch((error) => logger.error(error));
    } catch (error) {
      logger.error(error);
    }
  });
}
|
|
95
|
+
|
|
96
|
+
/**
 * Invokes `preQuery` on every plugin that has it and gathers whatever each
 * call returns. A plugin without the hook, or whose hook returns nothing,
 * yields `null`, so the result always lines up index-for-index with `plugins`.
 *
 * @param {object[]} plugins - Plugin instances.
 * @param {object} context - `{ sql, downloadedPaths, bucketsDir }`.
 * @returns {Array<Function|null>} One entry per plugin — a callback or `null`.
 */
export function runPreQuery(plugins, context) {
  return plugins.map((plugin) => {
    if (plugin.preQuery == null) return null;
    const callback = plugin.preQuery(context);
    return callback ?? null;
  });
}
|
|
108
|
+
|
|
109
|
+
/**
 * Invokes each callback returned by `runPreQuery` and each plugin's `postQuery`
 * hook. Errors are logged and swallowed so a failing plugin never rejects the
 * caller's query result. Every hook is isolated: a synchronous throw or an
 * asynchronous rejection in one does not prevent the remaining hooks from
 * running.
 *
 * @param {object[]} plugins - Plugin instances.
 * @param {object} context - `{ result, downloadedPaths, bucketsDir }`.
 * @param {Array<Function|null>} [callbacks=[]] - Collected from `runPreQuery`.
 */
export function runPostQuery(plugins, context, callbacks = []) {
  // `.catch` only sees asynchronous rejections; the hook itself is evaluated
  // before `Promise.resolve` runs, so a synchronous throw needs the try/catch.
  const invokeSafely = (hook) => {
    try {
      Promise.resolve(hook()).catch((error) => logger.error(error));
    } catch (error) {
      logger.error(error);
    }
  };

  plugins.forEach((plugin, index) => {
    const cb = callbacks[index];
    if (cb) invokeSafely(() => cb(context));
    if (plugin.postQuery) invokeSafely(() => plugin.postQuery(context));
  });
}
|
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
/**
 * Collects listing, download, and query timing stats and delivers them via a
 * single callback. Each invocation fires independently — the caller is
 * responsible for aggregation across events.
 *
 * @example
 * const stats = new StatsPlugin((event) => console.log(event));
 * // { type: 'listing', prefix, bucket, fileCount, durationMs, cacheHit }
 * // { type: 'download', bucket, from, to, cacheHits, cacheMisses, enqueuedHits, bytesDownloaded, durationMs }
 * // { type: 'query', sql, durationMs, rowCount }
 */
export default class StatsPlugin {
  /**
   * @param {(event: object) => void} [onStats] Receives one event object per
   *   listing, download, and query. Defaults to a no-op so a plugin built
   *   without a callback does not throw when an event fires.
   */
  constructor(onStats = () => {}) {
    this.onStats = onStats;
  }

  /**
   * No-op query hook: this plugin never rewrites the query context.
   *
   * @param {object} context Query context passed along the plugin chain
   * @returns {object} The same context object, untouched
   */
  processQuery(context) {
    return context;
  }

  /**
   * Captures the listing target and returns the callback that reports it.
   *
   * @param {object} context - `{ prefix, bucket }`.
   * @returns {Function} Callback taking `{ files, durationMs, cacheHit }` and
   *   emitting a `listing` event with `fileCount` set to `files.length`.
   */
  preListFiles({ prefix, bucket }) {
    return ({ files, durationMs, cacheHit }) => {
      this.onStats({ type: 'listing', prefix, bucket, fileCount: files.length, durationMs, cacheHit });
    };
  }

  /**
   * Captures the download target and returns the callback that reports it.
   *
   * @param {object} context - `{ bucket, from, to }`.
   * @returns {Function} Callback taking
   *   `{ cacheHits, cacheMisses, enqueuedHits, bytesDownloaded, durationMs }`
   *   and emitting a `download` event.
   */
  preDownloadFiles({ bucket, from, to }) {
    return ({ cacheHits, cacheMisses, enqueuedHits, bytesDownloaded, durationMs }) => {
      this.onStats({
        type: 'download',
        bucket,
        from,
        to,
        cacheHits,
        cacheMisses,
        enqueuedHits,
        bytesDownloaded,
        durationMs,
      });
    };
  }

  /**
   * Starts the query timer and returns the callback that reports the query.
   *
   * @param {object} context - `{ sql }`.
   * @returns {Function} Callback taking `{ result }` and emitting a `query`
   *   event whose `durationMs` is measured from this `preQuery` call.
   */
  preQuery({ sql }) {
    const start = Date.now();
    return ({ result }) => {
      // NOTE(review): rowCount assumes `result` is an array of rows — confirm for every output format.
      this.onStats({ type: 'query', sql, durationMs: Date.now() - start, rowCount: result.length });
    };
  }
}
|
package/src/s3/s3.js
CHANGED
|
@@ -1,8 +1,9 @@
|
|
|
1
1
|
import fsPromise from 'node:fs/promises';
|
|
2
2
|
import { dirname } from 'node:path';
|
|
3
|
-
import { S3Client,
|
|
3
|
+
import { S3Client, paginateListObjectsV2, GetObjectCommand } from '@aws-sdk/client-s3';
|
|
4
4
|
|
|
5
5
|
import { logger } from '../utils/logger.js';
|
|
6
|
+
import { runPreListFiles, runPostListFiles, runPreDownloadFiles, runPostDownloadFiles } from '../plugins/lifecycle.js';
|
|
6
7
|
import { datesInRange, hoursInRange, monthsInRange, buildPath } from '../utils/file-path-builder/file-path-builder.js';
|
|
7
8
|
import { regexFromPattern } from '../utils/date-regex/date-regex.js';
|
|
8
9
|
import { buildIbmIamClient } from './auth/ibm-iam-client.js';
|
|
@@ -44,16 +45,25 @@ export default class S3 {
|
|
|
44
45
|
* @returns {PromiseSettledResult<string[]>} Promise result for each file downloaded
|
|
45
46
|
*/
|
|
46
47
|
async downloadFiles({ from, to, filePatterns = [], staticFiles = [] }) {
|
|
47
|
-
const
|
|
48
|
-
const
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
const
|
|
52
|
-
|
|
48
|
+
const listPromises = filePatterns.map((pattern) => this.getFilePathsFromPrefixes(from, to, pattern));
|
|
49
|
+
const filePaths = await Promise.allSettled(listPromises).then((fileList) =>
|
|
50
|
+
fileList.map((list) => list.value).flat(),
|
|
51
|
+
);
|
|
52
|
+
const stats = { start: new Date(), cacheHits: 0, cacheMisses: 0, enqueuedHits: 0, bytesDownloaded: 0 };
|
|
53
|
+
const downloadCallbacks = runPreDownloadFiles(this.plugins, { bucket: this.bucket, from, to });
|
|
54
|
+
const downloadedPaths = await this.downloadFileList([...filePaths, ...staticFiles], stats);
|
|
55
|
+
const durationMs = new Date() - stats.start;
|
|
56
|
+
|
|
57
|
+
runPostDownloadFiles(downloadCallbacks, {
|
|
58
|
+
cacheHits: stats.cacheHits,
|
|
59
|
+
cacheMisses: stats.cacheMisses,
|
|
60
|
+
enqueuedHits: stats.enqueuedHits,
|
|
61
|
+
bytesDownloaded: stats.bytesDownloaded,
|
|
62
|
+
durationMs,
|
|
63
|
+
bucket: this.bucket,
|
|
53
64
|
});
|
|
54
65
|
|
|
55
|
-
|
|
56
|
-
return this.downloadFileList([...filePaths, ...staticFiles]);
|
|
66
|
+
return downloadedPaths;
|
|
57
67
|
}
|
|
58
68
|
|
|
59
69
|
/**
|
|
@@ -62,30 +72,14 @@ export default class S3 {
|
|
|
62
72
|
* @param {string[]} filePaths A list of files to download
|
|
63
73
|
* @returns {PromiseSettledResult} A Promise that resolves to an array of file paths
|
|
64
74
|
*/
|
|
65
|
-
downloadFileList(filePaths = []) {
|
|
66
|
-
logger.info(`Starting downloads for ${filePaths.length} files`);
|
|
75
|
+
downloadFileList(filePaths = [], stats = { cacheHits: 0, cacheMisses: 0, enqueuedHits: 0, bytesDownloaded: 0 }) {
|
|
67
76
|
this.preFlightCheck(filePaths);
|
|
68
77
|
|
|
69
|
-
const stats = {
|
|
70
|
-
start: new Date(),
|
|
71
|
-
cacheHits: 0,
|
|
72
|
-
cacheMisses: 0,
|
|
73
|
-
enqueuedHits: 0,
|
|
74
|
-
bytesDownloaded: 0,
|
|
75
|
-
};
|
|
76
78
|
const filesPromises = this.startDownloads(stats, filePaths);
|
|
77
79
|
|
|
78
80
|
return Promise.allSettled(filesPromises)
|
|
79
|
-
.then((results) =>
|
|
80
|
-
|
|
81
|
-
.filter((result) => {
|
|
82
|
-
return result.value;
|
|
83
|
-
})
|
|
84
|
-
.map((result) => result.value);
|
|
85
|
-
})
|
|
86
|
-
.then(this.logStatistics(stats))
|
|
87
|
-
.then(this.resetEnqueued)
|
|
88
|
-
.then((results) => results);
|
|
81
|
+
.then((results) => results.filter((result) => result.value).map((result) => result.value))
|
|
82
|
+
.then(this.resetEnqueued);
|
|
89
83
|
}
|
|
90
84
|
|
|
91
85
|
/**
|
|
@@ -245,23 +239,34 @@ export default class S3 {
|
|
|
245
239
|
*/
|
|
246
240
|
async listFiles(prefix) {
|
|
247
241
|
const cacheKey = `${this.bucket}/${prefix}`;
|
|
242
|
+
const start = new Date();
|
|
243
|
+
const listCallbacks = runPreListFiles(this.plugins, { prefix, bucket: this.bucket });
|
|
244
|
+
|
|
248
245
|
if (this.listingCache.has(cacheKey)) {
|
|
249
|
-
|
|
246
|
+
const files = this.listingCache.get(cacheKey);
|
|
247
|
+
runPostListFiles(listCallbacks, {
|
|
248
|
+
prefix,
|
|
249
|
+
bucket: this.bucket,
|
|
250
|
+
files,
|
|
251
|
+
durationMs: new Date() - start,
|
|
252
|
+
cacheHit: true,
|
|
253
|
+
});
|
|
254
|
+
return files;
|
|
250
255
|
}
|
|
251
256
|
|
|
252
257
|
const files = [];
|
|
253
|
-
|
|
254
|
-
|
|
255
|
-
|
|
256
|
-
new ListObjectsV2Command({ Bucket: this.bucket, Prefix: prefix, ContinuationToken: continuationToken }),
|
|
257
|
-
);
|
|
258
|
-
response.Contents?.forEach((content) => {
|
|
259
|
-
files.push({ file: content.Key, size: content.Size });
|
|
260
|
-
});
|
|
261
|
-
continuationToken = response.NextContinuationToken;
|
|
262
|
-
} while (continuationToken);
|
|
258
|
+
for await (const page of paginateListObjectsV2({ client: this.s3 }, { Bucket: this.bucket, Prefix: prefix })) {
|
|
259
|
+
page.Contents?.forEach((content) => files.push({ file: content.Key, size: content.Size }));
|
|
260
|
+
}
|
|
263
261
|
|
|
264
262
|
this.listingCache.set(cacheKey, files);
|
|
263
|
+
runPostListFiles(listCallbacks, {
|
|
264
|
+
prefix,
|
|
265
|
+
bucket: this.bucket,
|
|
266
|
+
files,
|
|
267
|
+
durationMs: new Date() - start,
|
|
268
|
+
cacheHit: false,
|
|
269
|
+
});
|
|
265
270
|
return files;
|
|
266
271
|
}
|
|
267
272
|
|
|
@@ -283,26 +288,6 @@ export default class S3 {
|
|
|
283
288
|
return fileDLPromises;
|
|
284
289
|
}
|
|
285
290
|
|
|
286
|
-
/**
|
|
287
|
-
* Logs download statistics
|
|
288
|
-
*
|
|
289
|
-
* @param {object} stats A statistics object
|
|
290
|
-
* @returns {(PromiseSettledResult) => PromiseSettledResult}
|
|
291
|
-
*/
|
|
292
|
-
logStatistics(stats) {
|
|
293
|
-
return (results) => {
|
|
294
|
-
const mbDownloaded = stats.bytesDownloaded !== 0 ? stats.bytesDownloaded / (1024 * 1024) : 0;
|
|
295
|
-
const seconds = (new Date() - stats.start) / 1000;
|
|
296
|
-
const mbPerSecond = mbDownloaded / seconds;
|
|
297
|
-
|
|
298
|
-
logger.info(`Enqueued keys: ${this.enqueuedFiles.size}`);
|
|
299
|
-
logger.info(
|
|
300
|
-
`Download completed in: ${seconds} seconds. Cache hits: ${stats.cacheHits}. Cache misses: ${stats.cacheMisses}. Enqueued hits: ${stats.enqueuedHits}. MB downloaded: ${mbDownloaded}. MB/s ${mbPerSecond}`,
|
|
301
|
-
);
|
|
302
|
-
return results;
|
|
303
|
-
};
|
|
304
|
-
}
|
|
305
|
-
|
|
306
291
|
/**
|
|
307
292
|
* Starts the download for all files
|
|
308
293
|
*
|
package/src/s3-querier.js
CHANGED
|
@@ -2,10 +2,12 @@ import { LRUCache } from 'lru-cache';
|
|
|
2
2
|
|
|
3
3
|
import S3 from './s3/s3.js';
|
|
4
4
|
export { bigintReplacer } from './utils/bigint-replacer.js';
|
|
5
|
-
import { mergeSettings } from './utils/file-settings/file-settings.js';
|
|
6
5
|
import { query as execQuery } from './duck-db/index.js';
|
|
7
6
|
import QueryParserPlugin from './plugins/query-parser/query-parser.js';
|
|
8
7
|
import QueryFinalizerPlugin from './plugins/query-finalizer/query-finalizer.js';
|
|
8
|
+
export { default as FSPurgePlugin } from './plugins/fs-purge/fs-purge-plugin.js';
|
|
9
|
+
export { default as StatsPlugin } from './plugins/stats/stats-plugin.js';
|
|
10
|
+
import { processQuery, runFinalizers, runPreQuery, runPostQuery } from './plugins/lifecycle.js';
|
|
9
11
|
|
|
10
12
|
const listingCache = new LRUCache({ max: 1000 });
|
|
11
13
|
|
|
@@ -68,49 +70,13 @@ export default function s3Querier({
|
|
|
68
70
|
});
|
|
69
71
|
const downloadedPaths = results.flatMap((result) => result.value);
|
|
70
72
|
const finalQuery = runFinalizers({ plugins: systemPlugins, rawQuery, fileSettings, downloadedPaths, bucketsDir });
|
|
71
|
-
|
|
72
|
-
});
|
|
73
|
-
}
|
|
73
|
+
const postQueryCallbacks = runPreQuery(systemPlugins, { sql: finalQuery, downloadedPaths, bucketsDir });
|
|
74
74
|
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
*
|
|
80
|
-
* @param {Array} plugins
|
|
81
|
-
* @param {object} context
|
|
82
|
-
* @returns
|
|
83
|
-
*/
|
|
84
|
-
function processQuery(plugins = [], { query = '', endpoint, defaultBucket, bucketsDir }) {
|
|
85
|
-
const processedQuery = plugins.reduce((result, plugin) => plugin.processQuery(result), {
|
|
86
|
-
endpoint,
|
|
87
|
-
defaultBucket,
|
|
88
|
-
bucketsDir,
|
|
89
|
-
query,
|
|
90
|
-
settings: [],
|
|
75
|
+
return execQuery(finalQuery, { format }).then((result) => {
|
|
76
|
+
runPostQuery(systemPlugins, { result, downloadedPaths, bucketsDir }, postQueryCallbacks);
|
|
77
|
+
return result;
|
|
78
|
+
});
|
|
91
79
|
});
|
|
92
|
-
const fileSettings = processedQuery.settings;
|
|
93
|
-
processedQuery.settings = mergeSettings(processedQuery.settings);
|
|
94
|
-
return { ...processedQuery, fileSettings };
|
|
95
|
-
}
|
|
96
|
-
|
|
97
|
-
/**
|
|
98
|
-
* Passes the raw query through each plugin's `finalizeQuery` lifecycle method,
|
|
99
|
-
* substituting exact downloaded paths in place of glob patterns.
|
|
100
|
-
*
|
|
101
|
-
* @param {object} params
|
|
102
|
-
* @param {object[]} params.plugins - Plugin instances to run finalizers on.
|
|
103
|
-
* @param {string} params.rawQuery - SQL with original file references and date/location tokens.
|
|
104
|
-
* @param {object[]} params.fileSettings - Pre-merge per-file settings from processQuery.
|
|
105
|
-
* @param {string[]} params.downloadedPaths - Absolute local paths of all downloaded files.
|
|
106
|
-
* @param {string} params.bucketsDir - Root directory where files are cached locally.
|
|
107
|
-
* @returns {string} Finalized SQL ready for DuckDB execution.
|
|
108
|
-
*/
|
|
109
|
-
function runFinalizers({ plugins, rawQuery, fileSettings, downloadedPaths, bucketsDir }) {
|
|
110
|
-
return plugins.reduce((query, plugin) => {
|
|
111
|
-
if (!plugin.finalizeQuery) return query;
|
|
112
|
-
return plugin.finalizeQuery(query, fileSettings, downloadedPaths, bucketsDir);
|
|
113
|
-
}, rawQuery);
|
|
114
80
|
}
|
|
115
81
|
|
|
116
82
|
/**
|
|
@@ -2,27 +2,27 @@ const DIGITS_4 = '\\d{4}';
|
|
|
2
2
|
const DIGITS_2 = '\\d{2}';
|
|
3
3
|
|
|
4
4
|
export function yyyy(str, date) {
|
|
5
|
-
return str.replaceAll('{yyyy}', String(date.
|
|
5
|
+
return str.replaceAll('{yyyy}', String(date.getUTCFullYear()));
|
|
6
6
|
}
|
|
7
7
|
|
|
8
8
|
export function MM(str, date) {
|
|
9
|
-
return str.replaceAll('{MM}', String(date.
|
|
9
|
+
return str.replaceAll('{MM}', String(date.getUTCMonth() + 1).padStart(2, '0'));
|
|
10
10
|
}
|
|
11
11
|
|
|
12
12
|
export function dd(str, date) {
|
|
13
|
-
return str.replaceAll('{dd}', String(date.
|
|
13
|
+
return str.replaceAll('{dd}', String(date.getUTCDate()).padStart(2, '0'));
|
|
14
14
|
}
|
|
15
15
|
|
|
16
16
|
export function hh(str, date) {
|
|
17
|
-
return str.replaceAll('{hh}', String(date.
|
|
17
|
+
return str.replaceAll('{hh}', String(date.getUTCHours()).padStart(2, '0'));
|
|
18
18
|
}
|
|
19
19
|
|
|
20
20
|
export function mm(str, date) {
|
|
21
|
-
return str.replaceAll('{mm}', String(date.
|
|
21
|
+
return str.replaceAll('{mm}', String(date.getUTCMinutes()).padStart(2, '0'));
|
|
22
22
|
}
|
|
23
23
|
|
|
24
24
|
export function ss(str, date) {
|
|
25
|
-
return str.replaceAll('{ss}', String(date.
|
|
25
|
+
return str.replaceAll('{ss}', String(date.getUTCSeconds()).padStart(2, '0'));
|
|
26
26
|
}
|
|
27
27
|
|
|
28
28
|
export function regexFromPattern(pattern = '') {
|
|
@@ -1,5 +1,8 @@
|
|
|
1
|
-
import { eachDayOfInterval, eachHourOfInterval } from 'date-fns';
|
|
2
1
|
import { yyyy, MM, dd, hh, mm, ss } from '../date-regex/date-regex.js';
|
|
2
|
+
|
|
3
|
+
const MS_PER_HOUR = 60 * 60 * 1000;
|
|
4
|
+
const MS_PER_DAY = 24 * MS_PER_HOUR;
|
|
5
|
+
|
|
3
6
|
/**
|
|
4
7
|
* Give a string with date patterns replaces patterns with actual data values
|
|
5
8
|
*
|
|
@@ -15,17 +18,17 @@ export function buildPath(filePattern, date) {
|
|
|
15
18
|
}
|
|
16
19
|
|
|
17
20
|
/**
|
|
18
|
-
* Given a date range returns an array of date objects
|
|
21
|
+
* Given a date range returns an array of date objects, one per UTC calendar day.
|
|
19
22
|
*
|
|
20
23
|
* @param {Date} from The from Date object
|
|
21
24
|
* @param {Date} to The to Date object
|
|
22
25
|
* @returns {Date[]} An array of dates within the to from from time range
|
|
23
26
|
*/
|
|
24
27
|
export function datesInRange(from, to) {
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
});
|
|
28
|
+
const startMs = Date.UTC(from.getUTCFullYear(), from.getUTCMonth(), from.getUTCDate());
|
|
29
|
+
const endMs = Date.UTC(to.getUTCFullYear(), to.getUTCMonth(), to.getUTCDate());
|
|
30
|
+
const count = Math.round((endMs - startMs) / MS_PER_DAY) + 1;
|
|
31
|
+
return Array.from({ length: count }, (_, index) => new Date(startMs + index * MS_PER_DAY));
|
|
29
32
|
}
|
|
30
33
|
|
|
31
34
|
/**
|
|
@@ -45,29 +48,17 @@ export function monthsInRange(from, to) {
|
|
|
45
48
|
}
|
|
46
49
|
|
|
47
50
|
/**
|
|
48
|
-
* Given a date range returns an array of date objects
|
|
51
|
+
* Given a date range returns an array of date objects, one per UTC hour.
|
|
49
52
|
*
|
|
50
53
|
* @param {Date} from The from Date object
|
|
51
54
|
* @param {Date} to The to Date object
|
|
52
55
|
* @returns {Date[]} An array of dates by hours within the to from from time range
|
|
53
56
|
*/
|
|
54
57
|
export function hoursInRange(from, to) {
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
});
|
|
59
|
-
}
|
|
60
|
-
|
|
61
|
-
/**
|
|
62
|
-
* Sets a Date object minutes and seconds to 0
|
|
63
|
-
*
|
|
64
|
-
* @param {Date} date Date object
|
|
65
|
-
* @returns {Date} A Date object with minutes and seconds set to 0
|
|
66
|
-
*/
|
|
67
|
-
export function zeroDateMins(date) {
|
|
68
|
-
const zeroedDate = new Date(date);
|
|
69
|
-
zeroedDate.setMinutes(0, 0);
|
|
70
|
-
return zeroedDate;
|
|
58
|
+
const startMs = Date.UTC(from.getUTCFullYear(), from.getUTCMonth(), from.getUTCDate(), from.getUTCHours());
|
|
59
|
+
const endMs = Date.UTC(to.getUTCFullYear(), to.getUTCMonth(), to.getUTCDate(), to.getUTCHours());
|
|
60
|
+
const count = Math.round((endMs - startMs) / MS_PER_HOUR) + 1;
|
|
61
|
+
return Array.from({ length: count }, (_, index) => new Date(startMs + index * MS_PER_HOUR));
|
|
71
62
|
}
|
|
72
63
|
|
|
73
64
|
function noonUtcForMonthOffset(startYear, startMonth, offset) {
|
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
import { glob } from 'glob';
|
|
2
|
+
import { unlink } from 'node:fs/promises';
|
|
3
|
+
import { logger } from '../logger.js';
|
|
4
|
+
|
|
5
|
+
export default class FSPurge {
  /**
   * Deletes files matching a glob pattern once they have not been accessed
   * for a configurable number of minutes. Sweeps can be requested on demand
   * via `sweep()` or scheduled with `start()` / `stop()`.
   *
   * @param {object} options
   * @param {string} options.pattern Glob pattern for files to consider for purging
   * @param {number} [options.lastAccessTTLMinutes=60] Minutes since last access before a file is eligible for purge
   * @param {number} [options.refreshIntervalMin=60] How often to run a sweep, in minutes
   */
  constructor({ pattern, lastAccessTTLMinutes = 60, refreshIntervalMin = 60 }) {
    this.lastAccessTTLMinutes = lastAccessTTLMinutes;
    this.pattern = pattern;
    this.refreshIntervalMin = refreshIntervalMin;
    this.lastSweep = null;
  }

  /**
   * Schedules a sweep every `refreshIntervalMin` minutes. Calling `start()`
   * again replaces the previous schedule instead of leaking its timer.
   */
  start() {
    this.stop();
    this.refreshRef = setInterval(
      () => {
        // Timer ticks bypass the wall-clock throttle in sweep(): the interval
        // already equals the throttle window, so timer jitter or a clock
        // adjustment could otherwise make a tick look "too early" and skip it.
        // Nothing awaits the tick, so its rejection must be handled here.
        this.sweep({ force: true }).catch((error) => logger.error(error));
      },
      this.refreshIntervalMin * (1000 * 60),
    );
  }

  /**
   * Cancels the schedule created by `start()`. Safe to call when not started.
   */
  stop() {
    clearInterval(this.refreshRef);
    this.refreshRef = undefined;
  }

  /**
   * Removes every expired file. Unless forced, the call is a no-op when the
   * previous sweep started less than `refreshIntervalMin` minutes ago.
   * Individual delete failures are logged and do not reject the sweep.
   *
   * @param {object} [options]
   * @param {boolean} [options.force=false] Run even if the previous sweep was recent
   * @returns {Promise<void>} Settles once all deletions have been attempted
   */
  async sweep({ force = false } = {}) {
    const now = new Date();
    if (!force && this.lastSweep && now - this.lastSweep < this.refreshIntervalMin * 60_000) return;
    // Recorded before the first await so overlapping calls are throttled too.
    this.lastSweep = now;

    const startSweep = now;
    const filesToBeRemoved = await this.getExpiredFiles();
    logger.info(`Removing ${filesToBeRemoved.length} files in sweep. Read time: ${(new Date() - startSweep) / 1000}s`);

    await Promise.allSettled(
      filesToBeRemoved.map((file) => {
        return unlink(file)
          .then(() => logger.debug(`Deleted file ${file}.`))
          .catch((error) => logger.error(`Error deleting file ${file}. ${error.toString()}`));
      }),
    );
  }

  /**
   * Lists files matching the pattern whose access time is older than the TTL.
   *
   * @returns {Promise<string[]>} Full paths of the files eligible for removal
   */
  async getExpiredFiles() {
    // Epoch arithmetic keeps the cutoff exact; stepping local-time minutes
    // with setMinutes() can be off by an hour across a DST transition.
    const expiresAt = new Date(Date.now() - this.lastAccessTTLMinutes * 60_000);

    // NOTE(review): relies on glob returning stat-populated entries exposing
    // `atime` and `fullpath()` when `stat` and `withFileTypes` are set — confirm
    // against the installed glob version.
    const files = await glob(this.pattern, {
      stat: true,
      withFileTypes: true,
      nodir: true,
    });

    return files.filter((file) => file.atime < expiresAt).map((file) => file.fullpath());
  }
}
|
package/src/utils/logger.js
CHANGED