s3-querier 1.0.1 → 1.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import {
|
|
1
|
+
import { regexFromPattern } from '../../utils/date-regex/date-regex.js';
|
|
2
2
|
import {
|
|
3
3
|
removeFileSettingTokens,
|
|
4
4
|
removeDoubleFwdSlash,
|
|
@@ -9,33 +9,60 @@ export default class QueryFinalizerPlugin {
|
|
|
9
9
|
name = 'CorePlugin';
|
|
10
10
|
|
|
11
11
|
processQuery(context) {
|
|
12
|
-
|
|
13
|
-
const processedQuery = QueryFinalizerPlugin.prepareQuery(settings, bucketsDir, query);
|
|
14
|
-
return { ...context, query: processedQuery };
|
|
12
|
+
return context;
|
|
15
13
|
}
|
|
16
14
|
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
15
|
+
/**
|
|
16
|
+
* Replaces each SQL file reference with the exact local paths downloaded from S3.
|
|
17
|
+
* Called after all downloads complete so that DuckDB receives precise file paths
|
|
18
|
+
* rather than glob patterns that would scan the entire local cache.
|
|
19
|
+
*
|
|
20
|
+
* @param {string} rawQuery - SQL with original file references and date/location tokens
|
|
21
|
+
* @param {object[]} fileSettings - Pre-merge per-file settings from processQuery
|
|
22
|
+
* @param {string[]} downloadedPaths - Absolute local paths of all downloaded files
|
|
23
|
+
* @param {string} bucketsDir - Root directory where files are cached locally
|
|
24
|
+
* @returns {string} Finalized SQL ready for DuckDB execution
|
|
25
|
+
*/
|
|
26
|
+
finalizeQuery(rawQuery, fileSettings, downloadedPaths, bucketsDir) {
|
|
27
|
+
let prepared = fileSettings.reduce(
|
|
28
|
+
(query, setting) => applyFileSetting(query, setting, downloadedPaths, bucketsDir),
|
|
29
|
+
rawQuery,
|
|
30
|
+
);
|
|
25
31
|
prepared = removeFileSettingTokens(prepared);
|
|
26
|
-
prepared = removeFileDatePatterns(prepared);
|
|
27
32
|
prepared = removeCacheSettings(prepared);
|
|
28
33
|
prepared = removeDoubleFwdSlash(prepared);
|
|
29
|
-
|
|
30
34
|
return prepared;
|
|
31
35
|
}
|
|
36
|
+
}
|
|
32
37
|
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
38
|
+
/** Helpers */
|
|
39
|
+
|
|
40
|
+
/**
 * Rewrites one SQL file reference in `query` to the local path(s) downloaded for it.
 *
 * Downloaded paths are kept when they sit under `${bucketsDir}/${bucket}/` and the
 * remainder of the path satisfies the regex built from `file`. A single match replaces
 * the reference text in place; several matches are delegated to `replaceWithArray`.
 *
 * NOTE(review): `replaceWithArray` only matches when quotes directly wrap the search
 * text — confirm that references still carrying a `?cache=` suffix in the query are
 * replaced as intended in the multi-path case.
 *
 * @param {string} query - SQL text to rewrite
 * @param {object} setting - Per-file setting
 * @param {string} setting.sqlFileReference - File reference as written in the SQL
 * @param {string} setting.file - File pattern handed to `regexFromPattern`
 * @param {string} setting.bucket - Bucket name, used as the sub-directory of `bucketsDir`
 * @param {string[]} downloadedPaths - Local paths of all downloaded files
 * @param {string} bucketsDir - Root directory where files are cached locally
 * @returns {string} The query with this reference replaced
 * @throws {Error} When no downloaded path matches `file`
 */
function applyFileSetting(query, { sqlFileReference, file, bucket }, downloadedPaths, bucketsDir) {
  const localDir = `${bucketsDir}/${bucket}/`;
  const filePattern = regexFromPattern(file);
  const matchingPaths = downloadedPaths.filter((localPath) => matchesPattern(localPath, localDir, filePattern));
  // Search for the reference without its cache flag.
  const searchStr = sqlFileReference.replace(/\?cache=(true|false)/i, '');

  if (matchingPaths.length === 0) throw new Error(`No files found for: ${file}`);
  if (matchingPaths.length > 1) return replaceWithArray(query, searchStr, matchingPaths);

  const [onlyPath] = matchingPaths;
  // A replacer function inserts the path verbatim; a plain replacement string would
  // expand `$&`, `$1`, `$$` … sequences that may legitimately occur in a file name.
  return query.replace(new RegExp(escapeForRegex(searchStr), 'gi'), () => onlyPath);
}
|
|
51
|
+
|
|
52
|
+
/**
 * Tells whether a local path lies under `localDir` and whether the part of the path
 * after that directory prefix satisfies `filePattern`.
 *
 * @param {string} localPath - Path of a downloaded file
 * @param {string} localDir - Directory prefix the path must start with
 * @param {RegExp} filePattern - Pattern tested against the path with the prefix removed
 * @returns {boolean} True when both conditions hold
 */
function matchesPattern(localPath, localDir, filePattern) {
  if (!localPath.startsWith(localDir)) return false;

  const relativePath = localPath.slice(localDir.length);
  return filePattern.test(relativePath);
}
|
|
55
|
+
|
|
56
|
+
/**
 * Replaces every quoted occurrence of a file reference with a SQL list literal
 * containing all of the given paths, e.g. `'ref'` becomes `['/a', '/b']`.
 *
 * Matching is case-insensitive and accepts single or double quotes around the reference.
 *
 * @param {string} query - SQL text to rewrite
 * @param {string} searchStr - File reference to look for (without surrounding quotes)
 * @param {string[]} paths - Local paths to list in the replacement
 * @returns {string} The query with each quoted reference replaced by the list literal
 */
function replaceWithArray(query, searchStr, paths) {
  // Double embedded single quotes so a path cannot end its SQL string literal early.
  const quotedPaths = paths.map((path) => `'${String(path).replace(/'/g, "''")}'`);
  const arrayLiteral = `[${quotedPaths.join(', ')}]`;
  const quotedReference = new RegExp(`['"]${escapeForRegex(searchStr)}['"]`, 'gi');

  // A replacer function inserts the literal verbatim; a plain replacement string would
  // expand `$&`, `$1`, `$$` … sequences that may legitimately occur in a file name.
  return query.replace(quotedReference, () => arrayLiteral);
}
|
|
60
|
+
|
|
61
|
+
/**
 * Escapes a string so it can be embedded in a `RegExp` source and matched literally.
 *
 * All regex metacharacters are escaped (`. * + ? ^ $ { } ( ) | [ ] \`), so file
 * references containing query strings, parentheses or brackets cannot change the
 * meaning of the generated pattern.
 *
 * @param {string} str - Literal text to escape
 * @returns {string} The text with every regex metacharacter backslash-escaped
 */
function escapeForRegex(str) {
  return str.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
}
|
package/src/s3/s3.js
CHANGED
|
@@ -3,7 +3,7 @@ import { dirname } from 'node:path';
|
|
|
3
3
|
import { S3Client, ListObjectsV2Command, GetObjectCommand } from '@aws-sdk/client-s3';
|
|
4
4
|
|
|
5
5
|
import { logger } from '../utils/logger.js';
|
|
6
|
-
import { datesInRange, hoursInRange, buildPath } from '../utils/file-path-builder/file-path-builder.js';
|
|
6
|
+
import { datesInRange, hoursInRange, monthsInRange, buildPath } from '../utils/file-path-builder/file-path-builder.js';
|
|
7
7
|
import { regexFromPattern } from '../utils/date-regex/date-regex.js';
|
|
8
8
|
import { buildIbmIamClient } from './auth/ibm-iam-client.js';
|
|
9
9
|
|
|
@@ -139,12 +139,14 @@ export default class S3 {
|
|
|
139
139
|
* @returns {(from:Date, to:Date, filePattern:string) => string[]} A function that creates a list of prefixes
|
|
140
140
|
*/
|
|
141
141
|
prefixStrategy(from, to, filePattern) {
|
|
142
|
-
const
|
|
143
|
-
const
|
|
142
|
+
const hasDayToken = /\{(dd|hh|mm)\}/.test(filePattern);
|
|
143
|
+
const hasMonthToken = /\{(yyyy|MM)\}/.test(filePattern);
|
|
144
|
+
const hasGlob = filePattern.includes('*');
|
|
144
145
|
const hourDiff = (new Date(to) - new Date(from)) / 1000 / 60 / 60;
|
|
145
146
|
|
|
146
|
-
if (
|
|
147
|
-
if (
|
|
147
|
+
if (hasDayToken && hourDiff < 24) return this.prefixHours;
|
|
148
|
+
if (hasDayToken) return this.prefixDays;
|
|
149
|
+
if (hasMonthToken) return this.prefixMonths;
|
|
148
150
|
if (hasGlob) return this.prefixGlob;
|
|
149
151
|
return (_from, _to, pattern) => [pattern];
|
|
150
152
|
}
|
|
@@ -182,6 +184,22 @@ export default class S3 {
|
|
|
182
184
|
});
|
|
183
185
|
}
|
|
184
186
|
|
|
187
|
+
/**
|
|
188
|
+
* Returns a list of prefixes based on a range of months, one per calendar month.
|
|
189
|
+
* Used for Hive-style paths with {yyyy}/{MM} tokens but no {dd}.
|
|
190
|
+
*
|
|
191
|
+
* @param {Date} from From date
|
|
192
|
+
* @param {Date} to To date
|
|
193
|
+
* @param {string} filePattern The file pattern to use
|
|
194
|
+
* @returns {string[]} The list of prefixes for filtering
|
|
195
|
+
*/
|
|
196
|
+
prefixMonths(from, to, filePattern) {
|
|
197
|
+
const monthRange = monthsInRange(new Date(from), new Date(to));
|
|
198
|
+
const splitToken = filePattern.includes('{MM}') ? '{MM}' : '{yyyy}';
|
|
199
|
+
const [trimmed] = filePattern.split(splitToken);
|
|
200
|
+
return monthRange.map((date) => buildPath(`${trimmed}${splitToken}`, date));
|
|
201
|
+
}
|
|
202
|
+
|
|
185
203
|
/**
|
|
186
204
|
* Returns a single entry array with a file pattern trimmed to the first glob
|
|
187
205
|
*
|
|
@@ -368,7 +386,7 @@ export default class S3 {
|
|
|
368
386
|
*/
|
|
369
387
|
async objectToFile(key) {
|
|
370
388
|
const file = `${this.mount}/${key}`;
|
|
371
|
-
const tmp = `${file}.${process.pid}.${Date.now()}.tmp`;
|
|
389
|
+
const tmp = `${file}.${process.pid}.${Date.now()}.${Math.random().toString(36).slice(2)}.tmp`;
|
|
372
390
|
try {
|
|
373
391
|
const response = await this.s3.send(new GetObjectCommand({ Bucket: this.bucket, Key: key }));
|
|
374
392
|
const chunks = [];
|
|
@@ -422,7 +440,7 @@ export default class S3 {
|
|
|
422
440
|
}
|
|
423
441
|
}
|
|
424
442
|
|
|
425
|
-
function buildS3Client({ apiKey, accessKeyId, secretAccessKey, endpoint, region }) {
|
|
443
|
+
export function buildS3Client({ apiKey, accessKeyId, secretAccessKey, endpoint, region = 'us-east-1' }) {
|
|
426
444
|
const config = { ...(endpoint && { endpoint }), region, forcePathStyle: true };
|
|
427
445
|
|
|
428
446
|
if (apiKey) return buildIbmIamClient(config, apiKey);
|
package/src/s3-querier.js
CHANGED
|
@@ -40,8 +40,16 @@ export default function s3Querier({
|
|
|
40
40
|
format,
|
|
41
41
|
}) {
|
|
42
42
|
const systemPlugins = [new QueryParserPlugin(), ...plugins, new QueryFinalizerPlugin()];
|
|
43
|
-
const
|
|
44
|
-
|
|
43
|
+
const {
|
|
44
|
+
query: rawQuery,
|
|
45
|
+
fileSettings,
|
|
46
|
+
settings: downloadSettings,
|
|
47
|
+
} = processQuery(systemPlugins, {
|
|
48
|
+
query,
|
|
49
|
+
endpoint: defaultEndpoint,
|
|
50
|
+
defaultBucket,
|
|
51
|
+
bucketsDir,
|
|
52
|
+
});
|
|
45
53
|
|
|
46
54
|
const downloadPromises = startDownloads({
|
|
47
55
|
apiKey,
|
|
@@ -58,7 +66,9 @@ export default function s3Querier({
|
|
|
58
66
|
results.forEach((result) => {
|
|
59
67
|
if (result.status === 'rejected') throw result.reason;
|
|
60
68
|
});
|
|
61
|
-
|
|
69
|
+
const downloadedPaths = results.flatMap((result) => result.value);
|
|
70
|
+
const finalQuery = runFinalizers({ plugins: systemPlugins, rawQuery, fileSettings, downloadedPaths, bucketsDir });
|
|
71
|
+
return execQuery(finalQuery, { format });
|
|
62
72
|
});
|
|
63
73
|
}
|
|
64
74
|
|
|
@@ -72,14 +82,35 @@ export default function s3Querier({
|
|
|
72
82
|
* @returns
|
|
73
83
|
*/
|
|
74
84
|
function processQuery(plugins = [], { query = '', endpoint, defaultBucket, bucketsDir }) {
|
|
75
|
-
const processedQuery = plugins.reduce(
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
85
|
+
const processedQuery = plugins.reduce((result, plugin) => plugin.processQuery(result), {
|
|
86
|
+
endpoint,
|
|
87
|
+
defaultBucket,
|
|
88
|
+
bucketsDir,
|
|
89
|
+
query,
|
|
90
|
+
settings: [],
|
|
91
|
+
});
|
|
92
|
+
const fileSettings = processedQuery.settings;
|
|
81
93
|
processedQuery.settings = mergeSettings(processedQuery.settings);
|
|
82
|
-
return processedQuery;
|
|
94
|
+
return { ...processedQuery, fileSettings };
|
|
95
|
+
}
|
|
96
|
+
|
|
97
|
+
/**
 * Threads the query through every plugin that implements `finalizeQuery`, in order.
 * Each finalizer receives the query produced by the previous one; plugins without a
 * `finalizeQuery` member are passed over and leave the query untouched.
 *
 * @param {object} params
 * @param {object[]} params.plugins - Plugin instances to run finalizers on.
 * @param {string} params.rawQuery - SQL with original file references and date/location tokens.
 * @param {object[]} params.fileSettings - Pre-merge per-file settings from processQuery.
 * @param {string[]} params.downloadedPaths - Absolute local paths of all downloaded files.
 * @param {string} params.bucketsDir - Root directory where files are cached locally.
 * @returns {string} Finalized SQL ready for DuckDB execution.
 */
function runFinalizers({ plugins, rawQuery, fileSettings, downloadedPaths, bucketsDir }) {
  let finalized = rawQuery;

  plugins.forEach((plugin) => {
    if (plugin.finalizeQuery) {
      finalized = plugin.finalizeQuery(finalized, fileSettings, downloadedPaths, bucketsDir);
    }
  });

  return finalized;
}
|
|
84
115
|
|
|
85
116
|
/**
|
|
@@ -28,6 +28,22 @@ export function datesInRange(from, to) {
|
|
|
28
28
|
});
|
|
29
29
|
}
|
|
30
30
|
|
|
31
|
+
/**
 * Expands a date range into one Date per calendar month it touches, using the UTC
 * year and month of both endpoints (inclusive). Each entry is produced by
 * `noonUtcForMonthOffset`, i.e. noon UTC on the first day of the month, so that
 * local-time getters still land in the intended month.
 *
 * @param {Date} from The from Date object
 * @param {Date} to The to Date object
 * @returns {Date[]} An array of dates, one per month within the range
 */
export function monthsInRange(from, to) {
  const firstYear = from.getUTCFullYear();
  const firstMonth = from.getUTCMonth();
  const yearSpan = to.getUTCFullYear() - firstYear;
  const monthSpan = to.getUTCMonth() - firstMonth;
  const total = yearSpan * 12 + monthSpan + 1;

  return Array.from({ length: total }, (_unused, offset) => noonUtcForMonthOffset(firstYear, firstMonth, offset));
}
|
|
46
|
+
|
|
31
47
|
/**
|
|
32
48
|
* Given a date range returns an array of date objects
|
|
33
49
|
*
|
|
@@ -53,3 +69,9 @@ export function zeroDateMins(date) {
|
|
|
53
69
|
zeroedDate.setMinutes(0, 0);
|
|
54
70
|
return zeroedDate;
|
|
55
71
|
}
|
|
72
|
+
|
|
73
|
+
/**
 * Builds the Date for 12:00:00 UTC on day 1 of the month that lies `offset` months
 * after the given start month, rolling over into later years as needed.
 *
 * @param {number} startYear - Full UTC year of the first month
 * @param {number} startMonth - Zero-based UTC month index of the first month
 * @param {number} offset - Number of months to move forward from the start
 * @returns {Date} Noon UTC on the first day of the target month
 */
function noonUtcForMonthOffset(startYear, startMonth, offset) {
  const monthIndex = startMonth + offset;
  const yearsAhead = Math.floor(monthIndex / 12);

  return new Date(Date.UTC(startYear + yearsAhead, monthIndex % 12, 1, 12, 0, 0));
}
|