s3-querier 1.2.3 → 1.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -141,16 +141,82 @@ Downloaded files are cached to `bucketsDir` on disk. Subsequent queries that ref
141
141
 
142
142
  ## Plugins
143
143
 
144
- The `plugins` option accepts an array of plugin objects that can extend query parsing and file processing. A plugin may implement:
144
+ The `plugins` option accepts an array of plugin objects. Plugins can hook into every phase of query execution from S3 listing through download to SQL execution.
145
145
 
146
- - `processQuery(context)` — transform the query context before execution
147
- - `processFile(filePath)` — process each downloaded file (e.g. convert Avro to JSON)
146
+ ### Plugin interface
148
147
 
149
- The built-in Avro plugin is an example:
148
+ | Method | Phase | Description |
149
+ | --- | --- | --- |
150
+ | `processQuery(context)` | pre-download | Transform the query context. Return the (possibly mutated) context. |
151
+ | `finalizeQuery(query, fileSettings, downloadedPaths, bucketsDir)` | post-download | Rewrite the SQL string after downloads complete. Return the final SQL. |
152
+ | `preListFiles({ prefix, bucket })` | S3 listing | Called before listing. Return a callback or nothing. |
153
+ | `preDownloadFiles({ bucket, from, to })` | S3 download | Called before downloading. Return a callback or nothing. |
154
+ | `preQuery({ sql, downloadedPaths, bucketsDir })` | DuckDB execution | Called before the query runs. Return a callback or nothing. |
155
+ | `postQuery({ result, downloadedPaths, bucketsDir })` | DuckDB execution | Called after the query completes. |
156
+
157
+ The `pre*` methods use a closure pattern: return a callback to receive the after-state for that phase. This lets you capture a start timestamp and receive the result in one place without shared mutable variables:
150
158
 
151
159
  ```js
152
- import s3Querier from 's3-querier';
153
- import AvroPlugin from 's3-querier/src/plugins/avro/avro-plugin.js';
160
+ preQuery({ sql }) {
161
+ const start = Date.now();
162
+ return ({ result }) => {
163
+ console.log(`Query took ${Date.now() - start}ms — ${result.length} rows`);
164
+ };
165
+ }
166
+ ```
167
+
168
+ Post-phase callbacks are fire-and-forget — errors are logged and swallowed, so a failing plugin never rejects the caller's query.
169
+
170
+ ### FSPurgePlugin
171
+
172
+ `FSPurgePlugin` sweeps the local file cache after each query, evicting files that haven't been accessed recently. Import it alongside the default export:
173
+
174
+ ```js
175
+ import s3Querier, { FSPurgePlugin } from 's3-querier';
176
+
177
+ const purgePlugin = new FSPurgePlugin({
178
+ bucketsDir: '/tmp/s3-cache',
179
+ lastAccessTTLMinutes: 60, // evict files not accessed in the last hour (default: 60)
180
+ refreshIntervalMin: 60, // minimum minutes between sweeps (default: 60)
181
+ });
182
+
183
+ const results = await s3Querier({
184
+ // ...
185
+ plugins: [purgePlugin],
186
+ });
187
+ ```
188
+
189
+ ### StatsPlugin
190
+
191
+ `StatsPlugin` fires a single `onStats` callback for listing, download, and query events. Use it for logging, metrics, or custom dashboards:
192
+
193
+ ```js
194
+ import s3Querier, { StatsPlugin } from 's3-querier';
195
+
196
+ const statsPlugin = new StatsPlugin((event) => console.log(event));
197
+
198
+ await s3Querier({ /* ... */ plugins: [statsPlugin] });
199
+ // { type: 'listing', prefix, bucket, fileCount, durationMs, cacheHit }
200
+ // { type: 'download', bucket, from, to, cacheHits, cacheMisses, enqueuedHits, bytesDownloaded, durationMs }
201
+ // { type: 'query', sql, durationMs, rowCount }
202
+ ```
203
+
204
+ Each call fires a single event with a discriminated `type`:
205
+
206
+ | `type` | Fields |
207
+ | --- | --- |
208
+ | `'listing'` | `prefix`, `bucket`, `fileCount`, `durationMs`, `cacheHit` |
209
+ | `'download'` | `bucket`, `from`, `to`, `cacheHits`, `cacheMisses`, `enqueuedHits`, `bytesDownloaded`, `durationMs` |
210
+ | `'query'` | `sql`, `durationMs`, `rowCount` |
211
+
212
+ Events fire independently — one listing event per S3 prefix, one download event per bucket per query, one query event per execution. Aggregation is left to the caller.
213
+
214
+ ### AvroPlugin
215
+
216
+ The built-in Avro plugin converts Avro files to JSON before querying:
217
+
218
+ ```js
219
+ import s3Querier, { AvroPlugin } from 's3-querier';
154
220
 
155
221
  const results = await s3Querier({
156
222
  // ...
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "s3-querier",
3
- "version": "1.2.3",
3
+ "version": "1.3.1",
4
4
  "description": "Query S3-compatible storage with DuckDB and SQL",
5
5
  "type": "module",
6
6
  "main": "src/s3-querier.js",
@@ -51,7 +51,7 @@
51
51
  "@duckdb/node-api": "^1.5.3-r.3",
52
52
  "@modelcontextprotocol/sdk": "^1.29.0",
53
53
  "avsc": "^5.7.7",
54
- "date-fns": "^4.0.0",
54
+ "glob": "^13.0.6",
55
55
  "hyparquet": "^1.26.0",
56
56
  "lru-cache": "^11.0.0",
57
57
  "peggy": "^5.1.0",
@@ -20,19 +20,13 @@ const formatStrategies = {
20
20
  */
21
21
  export async function query(sql, options = {}) {
22
22
  const { format } = options;
23
- const queryStart = new Date();
24
-
25
23
  try {
26
24
  const connection = await db.connect();
27
25
  const reader = await connection.runAndReadAll(sql);
28
26
  const columnsResult = reader.getColumnsObjectJS();
29
27
 
30
28
  const formatter = formatStrategies[format] ?? formatStrategies.default;
31
- const result = formatter(columnsResult);
32
-
33
- const queryTime = new Date() - queryStart;
34
- logger.info(`Query completed in : ${queryTime / 1000} seconds`);
35
- return result ?? [];
29
+ return formatter(columnsResult) ?? [];
36
30
  } catch (error) {
37
31
  logger.error(error);
38
32
  throw error;
package/src/mcp/server.js CHANGED
@@ -1,5 +1,17 @@
1
1
  #!/usr/bin/env node
2
-
3
2
  import { S3QuerierMCP } from './s3querier-mcp.js';
3
+ import { FSPurgePlugin, StatsPlugin } from '../../s3-querier.js';
4
+ import { logger } from '../utils/logger.js';
5
+
6
+ const { S3_BUCKETS_DIR = '/tmp/s3-querier', S3_PURGE_CACHE = 'true', S3_PURGE_TTL_MINUTES = '60' } = process.env;
7
+
8
+ await new S3QuerierMCP({ plugins: buildDefaultPlugins() }).start();
4
9
 
5
- await new S3QuerierMCP().start();
10
+ function buildDefaultPlugins() {
11
+ const plugins = [];
12
+ if (S3_PURGE_CACHE !== 'false') {
13
+ plugins.push(new FSPurgePlugin({ bucketsDir: S3_BUCKETS_DIR, lastAccessTTLMinutes: Number(S3_PURGE_TTL_MINUTES) }));
14
+ }
15
+ plugins.push(new StatsPlugin((event) => logger.info(event)));
16
+ return plugins;
17
+ }
@@ -59,6 +59,7 @@ export default class QueryTool extends BaseTool {
59
59
  accessKeyId: S3_ACCESS_KEY_ID,
60
60
  secretAccessKey: S3_SECRET_ACCESS_KEY,
61
61
  format: 'jsonRecords',
62
+ plugins: this.config.plugins ?? [],
62
63
  });
63
64
 
64
65
  return {
@@ -0,0 +1,21 @@
1
+ import FSPurge from '../../utils/fs-purge/fs-purge.js';
2
+
3
+ export default class FSPurgePlugin {
4
+ /**
5
+ * @param {object} options
6
+ * @param {string} options.bucketsDir Local directory where S3 files are cached
7
+ * @param {number} [options.lastAccessTTLMinutes=60] Minutes since last access before a file is eligible for purge
8
+ * @param {number} [options.refreshIntervalMin=60] Minimum minutes between sweeps
9
+ */
10
+ constructor({ bucketsDir, lastAccessTTLMinutes = 60, refreshIntervalMin = 60 }) {
11
+ this.purger = new FSPurge({ pattern: `${bucketsDir}/**`, lastAccessTTLMinutes, refreshIntervalMin });
12
+ }
13
+
14
+ processQuery(context) {
15
+ return context;
16
+ }
17
+
18
+ postQuery() {
19
+ this.purger.sweep();
20
+ }
21
+ }
@@ -0,0 +1,124 @@
1
+ import { logger } from '../utils/logger.js';
2
+ import { mergeSettings } from '../utils/file-settings/file-settings.js';
3
+
4
+ /**
5
+ * Passes the query through each plugin's `processQuery` hook and merges the
6
+ * resulting file download settings.
7
+ *
8
+ * @param {object[]} plugins - Plugin instances.
9
+ * @param {object} context - `{ query, endpoint, defaultBucket, bucketsDir }`.
10
+ * @returns {{ query: string, fileSettings: object[], settings: object[], ... }}
11
+ */
12
+ export function processQuery(plugins = [], { query = '', endpoint, defaultBucket, bucketsDir }) {
13
+ const processedQuery = plugins.reduce((result, plugin) => plugin.processQuery(result), {
14
+ endpoint,
15
+ defaultBucket,
16
+ bucketsDir,
17
+ query,
18
+ settings: [],
19
+ });
20
+ const fileSettings = processedQuery.settings;
21
+ processedQuery.settings = mergeSettings(processedQuery.settings);
22
+ return { ...processedQuery, fileSettings };
23
+ }
24
+
25
+ /**
26
+ * Passes the raw query through each plugin's `finalizeQuery` hook, substituting
27
+ * exact downloaded paths in place of glob patterns.
28
+ *
29
+ * @param {object} params
30
+ * @param {object[]} params.plugins - Plugin instances.
31
+ * @param {string} params.rawQuery - SQL with original file references and date/location tokens.
32
+ * @param {object[]} params.fileSettings - Pre-merge per-file settings from `processQuery`.
33
+ * @param {string[]} params.downloadedPaths - Absolute local paths of all downloaded files.
34
+ * @param {string} params.bucketsDir - Root directory where files are cached locally.
35
+ * @returns {string} Finalized SQL ready for DuckDB execution.
36
+ */
37
+ export function runFinalizers({ plugins, rawQuery, fileSettings, downloadedPaths, bucketsDir }) {
38
+ return plugins.reduce((query, plugin) => {
39
+ if (!plugin.finalizeQuery) return query;
40
+ return plugin.finalizeQuery(query, fileSettings, downloadedPaths, bucketsDir);
41
+ }, rawQuery);
42
+ }
43
+
44
+ /**
45
+ * Calls each plugin's `preListFiles` hook and collects the returned callbacks.
46
+ * Plugins that do not implement `preListFiles` contribute a `null` placeholder
47
+ * so that callback indices stay aligned with plugin indices.
48
+ *
49
+ * @param {object[]} plugins - Plugin instances.
50
+ * @param {object} context - `{ prefix, bucket }`.
51
+ * @returns {Array<Function|null>} One entry per plugin — a callback or `null`.
52
+ */
53
+ export function runPreListFiles(plugins, context) {
54
+ return plugins.map((plugin) => plugin.preListFiles?.(context) ?? null);
55
+ }
56
+
57
+ /**
58
+ * Invokes each callback returned by `runPreListFiles`. Errors are logged and
59
+ * swallowed so a failing plugin never rejects the caller's query result.
60
+ *
61
+ * @param {Array<Function|null>} callbacks - Collected from `runPreListFiles`.
62
+ * @param {object} context - `{ prefix, bucket, files, durationMs, cacheHit }`.
63
+ */
64
+ export function runPostListFiles(callbacks, context) {
65
+ callbacks.forEach((cb) => {
66
+ if (cb) Promise.resolve(cb(context)).catch((error) => logger.error(error));
67
+ });
68
+ }
69
+
70
+ /**
71
+ * Calls each plugin's `preDownloadFiles` hook and collects the returned callbacks.
72
+ * Plugins that do not implement `preDownloadFiles` contribute a `null` placeholder
73
+ * so that callback indices stay aligned with plugin indices.
74
+ *
75
+ * @param {object[]} plugins - Plugin instances.
76
+ * @param {object} context - `{ bucket, from, to }`.
77
+ * @returns {Array<Function|null>} One entry per plugin — a callback or `null`.
78
+ */
79
+ export function runPreDownloadFiles(plugins, context) {
80
+ return plugins.map((plugin) => plugin.preDownloadFiles?.(context) ?? null);
81
+ }
82
+
83
+ /**
84
+ * Invokes each callback returned by `runPreDownloadFiles`. Errors are logged and
85
+ * swallowed so a failing plugin never rejects the caller's query result.
86
+ *
87
+ * @param {Array<Function|null>} callbacks - Collected from `runPreDownloadFiles`.
88
+ * @param {object} context - `{ cacheHits, cacheMisses, enqueuedHits, bytesDownloaded, durationMs, bucket }`.
89
+ */
90
+ export function runPostDownloadFiles(callbacks, context) {
91
+ callbacks.forEach((cb) => {
92
+ if (cb) Promise.resolve(cb(context)).catch((error) => logger.error(error));
93
+ });
94
+ }
95
+
96
+ /**
97
+ * Calls each plugin's `preQuery` hook and collects the returned callbacks.
98
+ * Plugins that do not implement `preQuery` contribute a `null` placeholder
99
+ * so that callback indices stay aligned with plugin indices.
100
+ *
101
+ * @param {object[]} plugins - Plugin instances.
102
+ * @param {object} context - `{ sql, downloadedPaths, bucketsDir }`.
103
+ * @returns {Array<Function|null>} One entry per plugin — a callback or `null`.
104
+ */
105
+ export function runPreQuery(plugins, context) {
106
+ return plugins.map((plugin) => plugin.preQuery?.(context) ?? null);
107
+ }
108
+
109
+ /**
110
+ * Invokes each callback returned by `runPreQuery` and each plugin's `postQuery`
111
+ * hook. Errors are logged and swallowed so a failing plugin never rejects the
112
+ * caller's query result.
113
+ *
114
+ * @param {object[]} plugins - Plugin instances.
115
+ * @param {object} context - `{ result, downloadedPaths, bucketsDir }`.
116
+ * @param {Array<Function|null>} callbacks - Collected from `runPreQuery`.
117
+ */
118
+ export function runPostQuery(plugins, context, callbacks = []) {
119
+ plugins.forEach((plugin, index) => {
120
+ const cb = callbacks[index];
121
+ if (cb) Promise.resolve(cb(context)).catch((error) => logger.error(error));
122
+ if (plugin.postQuery) Promise.resolve(plugin.postQuery(context)).catch((error) => logger.error(error));
123
+ });
124
+ }
@@ -0,0 +1,49 @@
1
+ /**
2
+ * Collects listing, download, and query timing stats and delivers them via a
3
+ * single callback. Each invocation fires independently — the caller is
4
+ * responsible for aggregation across events.
5
+ *
6
+ * @example
7
+ * const stats = new StatsPlugin((event) => console.log(event));
8
+ * // { type: 'listing', prefix, bucket, fileCount, durationMs, cacheHit }
9
+ * // { type: 'download', bucket, from, to, cacheHits, cacheMisses, enqueuedHits, bytesDownloaded, durationMs }
10
+ * // { type: 'query', sql, durationMs, rowCount }
11
+ */
12
+ export default class StatsPlugin {
13
+ constructor(onStats) {
14
+ this.onStats = onStats;
15
+ }
16
+
17
+ processQuery(context) {
18
+ return context;
19
+ }
20
+
21
+ preListFiles({ prefix, bucket }) {
22
+ return ({ files, durationMs, cacheHit }) => {
23
+ this.onStats({ type: 'listing', prefix, bucket, fileCount: files.length, durationMs, cacheHit });
24
+ };
25
+ }
26
+
27
+ preDownloadFiles({ bucket, from, to }) {
28
+ return ({ cacheHits, cacheMisses, enqueuedHits, bytesDownloaded, durationMs }) => {
29
+ this.onStats({
30
+ type: 'download',
31
+ bucket,
32
+ from,
33
+ to,
34
+ cacheHits,
35
+ cacheMisses,
36
+ enqueuedHits,
37
+ bytesDownloaded,
38
+ durationMs,
39
+ });
40
+ };
41
+ }
42
+
43
+ preQuery({ sql }) {
44
+ const start = Date.now();
45
+ return ({ result }) => {
46
+ this.onStats({ type: 'query', sql, durationMs: Date.now() - start, rowCount: result.length });
47
+ };
48
+ }
49
+ }
package/src/s3/s3.js CHANGED
@@ -3,6 +3,7 @@ import { dirname } from 'node:path';
3
3
  import { S3Client, paginateListObjectsV2, GetObjectCommand } from '@aws-sdk/client-s3';
4
4
 
5
5
  import { logger } from '../utils/logger.js';
6
+ import { runPreListFiles, runPostListFiles, runPreDownloadFiles, runPostDownloadFiles } from '../plugins/lifecycle.js';
6
7
  import { datesInRange, hoursInRange, monthsInRange, buildPath } from '../utils/file-path-builder/file-path-builder.js';
7
8
  import { regexFromPattern } from '../utils/date-regex/date-regex.js';
8
9
  import { buildIbmIamClient } from './auth/ibm-iam-client.js';
@@ -44,16 +45,25 @@ export default class S3 {
44
45
  * @returns {PromiseSettledResult<string[]>} Promise result for each file downloaded
45
46
  */
46
47
  async downloadFiles({ from, to, filePatterns = [], staticFiles = [] }) {
47
- const startListing = new Date();
48
- const listPromises = filePatterns.map((pattern) => {
49
- return this.getFilePathsFromPrefixes(from, to, pattern);
50
- });
51
- const filePaths = await Promise.allSettled(listPromises).then((fileList) => {
52
- return fileList.map((list) => list.value).flat();
48
+ const listPromises = filePatterns.map((pattern) => this.getFilePathsFromPrefixes(from, to, pattern));
49
+ const filePaths = await Promise.allSettled(listPromises).then((fileList) =>
50
+ fileList.map((list) => list.value).flat(),
51
+ );
52
+ const stats = { start: new Date(), cacheHits: 0, cacheMisses: 0, enqueuedHits: 0, bytesDownloaded: 0 };
53
+ const downloadCallbacks = runPreDownloadFiles(this.plugins, { bucket: this.bucket, from, to });
54
+ const downloadedPaths = await this.downloadFileList([...filePaths, ...staticFiles], stats);
55
+ const durationMs = new Date() - stats.start;
56
+
57
+ runPostDownloadFiles(downloadCallbacks, {
58
+ cacheHits: stats.cacheHits,
59
+ cacheMisses: stats.cacheMisses,
60
+ enqueuedHits: stats.enqueuedHits,
61
+ bytesDownloaded: stats.bytesDownloaded,
62
+ durationMs,
63
+ bucket: this.bucket,
53
64
  });
54
65
 
55
- logger.info(`Total listing time: ${(new Date() - startListing) / 1000}s`);
56
- return this.downloadFileList([...filePaths, ...staticFiles]);
66
+ return downloadedPaths;
57
67
  }
58
68
 
59
69
  /**
@@ -62,30 +72,14 @@ export default class S3 {
62
72
  * @param {string[]} filePaths A list of files to download
63
73
  * @returns {PromiseSettledResult} A Promise that resolves to an array of file paths
64
74
  */
65
- downloadFileList(filePaths = []) {
66
- logger.info(`Starting downloads for ${filePaths.length} files`);
75
+ downloadFileList(filePaths = [], stats = { cacheHits: 0, cacheMisses: 0, enqueuedHits: 0, bytesDownloaded: 0 }) {
67
76
  this.preFlightCheck(filePaths);
68
77
 
69
- const stats = {
70
- start: new Date(),
71
- cacheHits: 0,
72
- cacheMisses: 0,
73
- enqueuedHits: 0,
74
- bytesDownloaded: 0,
75
- };
76
78
  const filesPromises = this.startDownloads(stats, filePaths);
77
79
 
78
80
  return Promise.allSettled(filesPromises)
79
- .then((results) => {
80
- return results
81
- .filter((result) => {
82
- return result.value;
83
- })
84
- .map((result) => result.value);
85
- })
86
- .then(this.logStatistics(stats))
87
- .then(this.resetEnqueued)
88
- .then((results) => results);
81
+ .then((results) => results.filter((result) => result.value).map((result) => result.value))
82
+ .then(this.resetEnqueued);
89
83
  }
90
84
 
91
85
  /**
@@ -245,8 +239,19 @@ export default class S3 {
245
239
  */
246
240
  async listFiles(prefix) {
247
241
  const cacheKey = `${this.bucket}/${prefix}`;
242
+ const start = new Date();
243
+ const listCallbacks = runPreListFiles(this.plugins, { prefix, bucket: this.bucket });
244
+
248
245
  if (this.listingCache.has(cacheKey)) {
249
- return this.listingCache.get(cacheKey);
246
+ const files = this.listingCache.get(cacheKey);
247
+ runPostListFiles(listCallbacks, {
248
+ prefix,
249
+ bucket: this.bucket,
250
+ files,
251
+ durationMs: new Date() - start,
252
+ cacheHit: true,
253
+ });
254
+ return files;
250
255
  }
251
256
 
252
257
  const files = [];
@@ -255,6 +260,13 @@ export default class S3 {
255
260
  }
256
261
 
257
262
  this.listingCache.set(cacheKey, files);
263
+ runPostListFiles(listCallbacks, {
264
+ prefix,
265
+ bucket: this.bucket,
266
+ files,
267
+ durationMs: new Date() - start,
268
+ cacheHit: false,
269
+ });
258
270
  return files;
259
271
  }
260
272
 
@@ -276,26 +288,6 @@ export default class S3 {
276
288
  return fileDLPromises;
277
289
  }
278
290
 
279
- /**
280
- * Logs download statistics
281
- *
282
- * @param {object} stats A statistics object
283
- * @returns {(PromiseSettledResult) => PromiseSettledResult}
284
- */
285
- logStatistics(stats) {
286
- return (results) => {
287
- const mbDownloaded = stats.bytesDownloaded !== 0 ? stats.bytesDownloaded / (1024 * 1024) : 0;
288
- const seconds = (new Date() - stats.start) / 1000;
289
- const mbPerSecond = mbDownloaded / seconds;
290
-
291
- logger.info(`Enqueued keys: ${this.enqueuedFiles.size}`);
292
- logger.info(
293
- `Download completed in: ${seconds} seconds. Cache hits: ${stats.cacheHits}. Cache misses: ${stats.cacheMisses}. Enqueued hits: ${stats.enqueuedHits}. MB downloaded: ${mbDownloaded}. MB/s ${mbPerSecond}`,
294
- );
295
- return results;
296
- };
297
- }
298
-
299
291
  /**
300
292
  * Starts the download for all files
301
293
  *
package/src/s3-querier.js CHANGED
@@ -2,10 +2,13 @@ import { LRUCache } from 'lru-cache';
2
2
 
3
3
  import S3 from './s3/s3.js';
4
4
  export { bigintReplacer } from './utils/bigint-replacer.js';
5
- import { mergeSettings } from './utils/file-settings/file-settings.js';
6
5
  import { query as execQuery } from './duck-db/index.js';
7
6
  import QueryParserPlugin from './plugins/query-parser/query-parser.js';
8
7
  import QueryFinalizerPlugin from './plugins/query-finalizer/query-finalizer.js';
8
+ export { default as AvroPlugin } from './plugins/avro/avro-plugin.js';
9
+ export { default as FSPurgePlugin } from './plugins/fs-purge/fs-purge-plugin.js';
10
+ export { default as StatsPlugin } from './plugins/stats/stats-plugin.js';
11
+ import { processQuery, runFinalizers, runPreQuery, runPostQuery } from './plugins/lifecycle.js';
9
12
 
10
13
  const listingCache = new LRUCache({ max: 1000 });
11
14
 
@@ -68,49 +71,13 @@ export default function s3Querier({
68
71
  });
69
72
  const downloadedPaths = results.flatMap((result) => result.value);
70
73
  const finalQuery = runFinalizers({ plugins: systemPlugins, rawQuery, fileSettings, downloadedPaths, bucketsDir });
71
- return execQuery(finalQuery, { format });
72
- });
73
- }
74
+ const postQueryCallbacks = runPreQuery(systemPlugins, { sql: finalQuery, downloadedPaths, bucketsDir });
74
75
 
75
- /**
76
- * Orchestrates:
77
- * - Passing the query through to each plugin
78
- * - Merging file download settings
79
- *
80
- * @param {Array} plugins
81
- * @param {object} context
82
- * @returns
83
- */
84
- function processQuery(plugins = [], { query = '', endpoint, defaultBucket, bucketsDir }) {
85
- const processedQuery = plugins.reduce((result, plugin) => plugin.processQuery(result), {
86
- endpoint,
87
- defaultBucket,
88
- bucketsDir,
89
- query,
90
- settings: [],
76
+ return execQuery(finalQuery, { format }).then((result) => {
77
+ runPostQuery(systemPlugins, { result, downloadedPaths, bucketsDir }, postQueryCallbacks);
78
+ return result;
79
+ });
91
80
  });
92
- const fileSettings = processedQuery.settings;
93
- processedQuery.settings = mergeSettings(processedQuery.settings);
94
- return { ...processedQuery, fileSettings };
95
- }
96
-
97
- /**
98
- * Passes the raw query through each plugin's `finalizeQuery` lifecycle method,
99
- * substituting exact downloaded paths in place of glob patterns.
100
- *
101
- * @param {object} params
102
- * @param {object[]} params.plugins - Plugin instances to run finalizers on.
103
- * @param {string} params.rawQuery - SQL with original file references and date/location tokens.
104
- * @param {object[]} params.fileSettings - Pre-merge per-file settings from processQuery.
105
- * @param {string[]} params.downloadedPaths - Absolute local paths of all downloaded files.
106
- * @param {string} params.bucketsDir - Root directory where files are cached locally.
107
- * @returns {string} Finalized SQL ready for DuckDB execution.
108
- */
109
- function runFinalizers({ plugins, rawQuery, fileSettings, downloadedPaths, bucketsDir }) {
110
- return plugins.reduce((query, plugin) => {
111
- if (!plugin.finalizeQuery) return query;
112
- return plugin.finalizeQuery(query, fileSettings, downloadedPaths, bucketsDir);
113
- }, rawQuery);
114
81
  }
115
82
 
116
83
  /**
@@ -2,27 +2,27 @@ const DIGITS_4 = '\\d{4}';
2
2
  const DIGITS_2 = '\\d{2}';
3
3
 
4
4
  export function yyyy(str, date) {
5
- return str.replaceAll('{yyyy}', String(date.getFullYear()));
5
+ return str.replaceAll('{yyyy}', String(date.getUTCFullYear()));
6
6
  }
7
7
 
8
8
  export function MM(str, date) {
9
- return str.replaceAll('{MM}', String(date.getMonth() + 1).padStart(2, '0'));
9
+ return str.replaceAll('{MM}', String(date.getUTCMonth() + 1).padStart(2, '0'));
10
10
  }
11
11
 
12
12
  export function dd(str, date) {
13
- return str.replaceAll('{dd}', String(date.getDate()).padStart(2, '0'));
13
+ return str.replaceAll('{dd}', String(date.getUTCDate()).padStart(2, '0'));
14
14
  }
15
15
 
16
16
  export function hh(str, date) {
17
- return str.replaceAll('{hh}', String(date.getHours()).padStart(2, '0'));
17
+ return str.replaceAll('{hh}', String(date.getUTCHours()).padStart(2, '0'));
18
18
  }
19
19
 
20
20
  export function mm(str, date) {
21
- return str.replaceAll('{mm}', String(date.getMinutes()).padStart(2, '0'));
21
+ return str.replaceAll('{mm}', String(date.getUTCMinutes()).padStart(2, '0'));
22
22
  }
23
23
 
24
24
  export function ss(str, date) {
25
- return str.replaceAll('{ss}', String(date.getSeconds()).padStart(2, '0'));
25
+ return str.replaceAll('{ss}', String(date.getUTCSeconds()).padStart(2, '0'));
26
26
  }
27
27
 
28
28
  export function regexFromPattern(pattern = '') {
@@ -1,5 +1,8 @@
1
- import { eachDayOfInterval, eachHourOfInterval } from 'date-fns';
2
1
  import { yyyy, MM, dd, hh, mm, ss } from '../date-regex/date-regex.js';
2
+
3
+ const MS_PER_HOUR = 60 * 60 * 1000;
4
+ const MS_PER_DAY = 24 * MS_PER_HOUR;
5
+
3
6
  /**
4
7
  * Given a string with date patterns, replaces those patterns with actual date values
5
8
  *
@@ -15,17 +18,17 @@ export function buildPath(filePattern, date) {
15
18
  }
16
19
 
17
20
  /**
18
- * Given a date range returns an array of date objects
21
+ * Given a date range returns an array of date objects, one per UTC calendar day.
19
22
  *
20
23
  * @param {Date} from The from Date object
21
24
  * @param {Date} to The to Date object
22
25
  * @returns {Date[]} An array of dates within the to from from time range
23
26
  */
24
27
  export function datesInRange(from, to) {
25
- return eachDayOfInterval({
26
- start: zeroDateMins(new Date(from)),
27
- end: zeroDateMins(new Date(to)),
28
- });
28
+ const startMs = Date.UTC(from.getUTCFullYear(), from.getUTCMonth(), from.getUTCDate());
29
+ const endMs = Date.UTC(to.getUTCFullYear(), to.getUTCMonth(), to.getUTCDate());
30
+ const count = Math.round((endMs - startMs) / MS_PER_DAY) + 1;
31
+ return Array.from({ length: count }, (_, index) => new Date(startMs + index * MS_PER_DAY));
29
32
  }
30
33
 
31
34
  /**
@@ -45,29 +48,17 @@ export function monthsInRange(from, to) {
45
48
  }
46
49
 
47
50
  /**
48
- * Given a date range returns an array of date objects
51
+ * Given a date range returns an array of date objects, one per UTC hour.
49
52
  *
50
53
  * @param {Date} from The from Date object
51
54
  * @param {Date} to The to Date object
52
55
  * @returns {Date[]} An array of dates, one per hour, within the from-to time range
53
56
  */
54
57
  export function hoursInRange(from, to) {
55
- return eachHourOfInterval({
56
- start: new Date(from),
57
- end: new Date(to),
58
- });
59
- }
60
-
61
- /**
62
- * Sets a Date object minutes and seconds to 0
63
- *
64
- * @param {Date} date Date object
65
- * @returns {Date} A Date object with minutes and seconds set to 0
66
- */
67
- export function zeroDateMins(date) {
68
- const zeroedDate = new Date(date);
69
- zeroedDate.setMinutes(0, 0);
70
- return zeroedDate;
58
+ const startMs = Date.UTC(from.getUTCFullYear(), from.getUTCMonth(), from.getUTCDate(), from.getUTCHours());
59
+ const endMs = Date.UTC(to.getUTCFullYear(), to.getUTCMonth(), to.getUTCDate(), to.getUTCHours());
60
+ const count = Math.round((endMs - startMs) / MS_PER_HOUR) + 1;
61
+ return Array.from({ length: count }, (_, index) => new Date(startMs + index * MS_PER_HOUR));
71
62
  }
72
63
 
73
64
  function noonUtcForMonthOffset(startYear, startMonth, offset) {
@@ -0,0 +1,57 @@
1
+ import { glob } from 'glob';
2
+ import { unlink } from 'node:fs/promises';
3
+ import { logger } from '../logger.js';
4
+
5
+ export default class FSPurge {
6
+ /**
7
+ * @param {object} options
8
+ * @param {string} options.pattern Glob pattern for files to consider for purging
9
+ * @param {number} [options.lastAccessTTLMinutes=60] Minutes since last access before a file is eligible for purge
10
+ * @param {number} [options.refreshIntervalMin=60] How often to run a sweep, in minutes
11
+ */
12
+ constructor({ pattern, lastAccessTTLMinutes = 60, refreshIntervalMin = 60 }) {
13
+ this.lastAccessTTLMinutes = lastAccessTTLMinutes;
14
+ this.pattern = pattern;
15
+ this.refreshIntervalMin = refreshIntervalMin;
16
+ this.lastSweep = null;
17
+ }
18
+
19
+ start() {
20
+ this.refreshRef = setInterval(this.sweep.bind(this), this.refreshIntervalMin * (1000 * 60));
21
+ }
22
+
23
+ stop() {
24
+ clearInterval(this.refreshRef);
25
+ }
26
+
27
+ async sweep() {
28
+ const now = new Date();
29
+ if (this.lastSweep && now - this.lastSweep < this.refreshIntervalMin * 60_000) return;
30
+ this.lastSweep = now;
31
+
32
+ const startSweep = now;
33
+ const filesToBeRemoved = await this.getExpiredFiles();
34
+ logger.info(`Removing ${filesToBeRemoved.length} files in sweep. Read time: ${(new Date() - startSweep) / 1000}s`);
35
+
36
+ await Promise.allSettled(
37
+ filesToBeRemoved.map((file) => {
38
+ return unlink(file)
39
+ .then(() => logger.debug(`Deleted file ${file}.`))
40
+ .catch((error) => logger.error(`Error deleting file ${file}. ${error.toString()}`));
41
+ }),
42
+ );
43
+ }
44
+
45
+ async getExpiredFiles() {
46
+ const expiresAt = new Date();
47
+ expiresAt.setMinutes(expiresAt.getMinutes() - this.lastAccessTTLMinutes);
48
+
49
+ const files = await glob(this.pattern, {
50
+ stat: true,
51
+ withFileTypes: true,
52
+ nodir: true,
53
+ });
54
+
55
+ return files.filter((file) => file.atime < expiresAt).map((file) => file.fullpath());
56
+ }
57
+ }