s3-querier 1.0.1 → 1.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "s3-querier",
3
- "version": "1.0.1",
3
+ "version": "1.0.3",
4
4
  "description": "Query S3-compatible storage with DuckDB and SQL",
5
5
  "type": "module",
6
6
  "main": "src/s3-querier.js",
@@ -1,4 +1,4 @@
1
- import { removeFileDatePatterns } from '../../utils/date-regex/date-regex.js';
1
+ import { regexFromPattern } from '../../utils/date-regex/date-regex.js';
2
2
  import {
3
3
  removeFileSettingTokens,
4
4
  removeDoubleFwdSlash,
@@ -9,33 +9,60 @@ export default class QueryFinalizerPlugin {
9
9
  name = 'CorePlugin';
10
10
 
11
11
  processQuery(context) {
12
- const { settings, bucketsDir, query } = context;
13
- const processedQuery = QueryFinalizerPlugin.prepareQuery(settings, bucketsDir, query);
14
- return { ...context, query: processedQuery };
12
+ return context;
15
13
  }
16
14
 
17
- static prepareQuery(settings, bucketsDir, query) {
18
- let prepared = query;
19
-
20
- settings.forEach((setting) => {
21
- const searchPattern = setting.sqlFileReference.replace(/\?cache=(true|false)/i, '');
22
- const fileRegexStr = QueryFinalizerPlugin.prepareFileRegexStr(searchPattern);
23
- prepared = prepared.replace(new RegExp(fileRegexStr, 'gi'), `${bucketsDir}/${setting.bucket}/${setting.file}`);
24
- });
15
+ /**
16
+ * Replaces each SQL file reference with the exact local paths downloaded from S3.
17
+ * Called after all downloads complete so that DuckDB receives precise file paths
18
+ * rather than glob patterns that would scan the entire local cache.
19
+ *
20
+ * @param {string} rawQuery - SQL with original file references and date/location tokens
21
+ * @param {object[]} fileSettings - Pre-merge per-file settings from processQuery
22
+ * @param {string[]} downloadedPaths - Absolute local paths of all downloaded files
23
+ * @param {string} bucketsDir - Root directory where files are cached locally
24
+ * @returns {string} Finalized SQL ready for DuckDB execution
25
+ */
26
+ finalizeQuery(rawQuery, fileSettings, downloadedPaths, bucketsDir) {
27
+ let prepared = fileSettings.reduce(
28
+ (query, setting) => applyFileSetting(query, setting, downloadedPaths, bucketsDir),
29
+ rawQuery,
30
+ );
25
31
  prepared = removeFileSettingTokens(prepared);
26
- prepared = removeFileDatePatterns(prepared);
27
32
  prepared = removeCacheSettings(prepared);
28
33
  prepared = removeDoubleFwdSlash(prepared);
29
-
30
34
  return prepared;
31
35
  }
36
+ }
32
37
 
33
- static prepareFileRegexStr(fileStr) {
34
- return fileStr
35
- .replace(/\*/g, '\\*')
36
- .replace(/\./g, '\\.')
37
- .replace(/\{/g, '\\{')
38
- .replace(/\}/g, '\\}')
39
- .replace(/\+/g, '\\+');
40
- }
38
+ /** Helpers */
39
+
40
/**
 * Rewrites every occurrence of one SQL file reference so that it points at the
 * local files that were downloaded for it.
 *
 * A single matching file replaces the reference text in place (the quotes already
 * present in the query are kept); several matching files replace the quoted
 * reference with an array literal of quoted paths.
 *
 * @param {string} query - SQL text to rewrite
 * @param {object} setting - Per-file setting
 * @param {string} setting.sqlFileReference - Reference as written in the SQL, optionally ending in `?cache=true|false`
 * @param {string} setting.file - File pattern, converted to a regex with regexFromPattern
 * @param {string} setting.bucket - Bucket name; files are looked up under `${bucketsDir}/${bucket}/`
 * @param {string[]} downloadedPaths - Local paths of all downloaded files
 * @param {string} bucketsDir - Root directory of the local cache
 * @returns {string} The query with the reference replaced
 * @throws {Error} When no downloaded path matches the file pattern
 */
function applyFileSetting(query, { sqlFileReference, file, bucket }, downloadedPaths, bucketsDir) {
  const localDir = `${bucketsDir}/${bucket}/`;
  const filePattern = regexFromPattern(file);
  const matchingPaths = downloadedPaths.filter((localPath) => matchesPattern(localPath, localDir, filePattern));
  // Search for the bare reference: the `?cache=...` flag is stripped before matching.
  const searchStr = sqlFileReference.replace(/\?cache=(true|false)/i, '');

  if (matchingPaths.length === 0) throw new Error(`No files found for: ${file}`);
  if (matchingPaths.length > 1) return replaceWithArray(query, searchStr, matchingPaths);

  return replaceReference(query, escapeForRegex(searchStr), matchingPaths[0]);
}

/**
 * Tells whether a downloaded path lives under `localDir` and its remainder
 * (the part after `localDir`) satisfies `filePattern`.
 *
 * @param {string} localPath - Candidate local path
 * @param {string} localDir - Directory prefix, including the trailing slash
 * @param {RegExp} filePattern - Pattern tested against the path relative to `localDir`
 * @returns {boolean} True when the path belongs to this file setting
 */
function matchesPattern(localPath, localDir, filePattern) {
  // Non-string entries (e.g. a download that resolved without a path) never match.
  if (typeof localPath !== 'string' || !localPath.startsWith(localDir)) return false;
  // A global or sticky regex keeps state between test() calls; reset it so every
  // path is tested from its first character. No-op for other regexes.
  filePattern.lastIndex = 0;
  return filePattern.test(localPath.slice(localDir.length));
}

/**
 * Replaces a quoted file reference with an array literal of quoted paths,
 * e.g. `'ref'` becomes `['/a/1.csv', '/a/2.csv']`.
 *
 * @param {string} query - SQL text to rewrite
 * @param {string} searchStr - Literal reference text to look for (without quotes)
 * @param {string[]} paths - Local paths to list
 * @returns {string} The query with each quoted reference replaced
 */
function replaceWithArray(query, searchStr, paths) {
  // A single quote inside a path would end the SQL string early; double it.
  const items = paths.map((path) => `'${path.replace(/'/g, "''")}'`);
  const arrayLiteral = `[${items.join(', ')}]`;
  return replaceReference(query, `['"]${escapeForRegex(searchStr)}['"]`, arrayLiteral);
}

/**
 * Replaces all case-insensitive matches of a regex source with a literal string.
 *
 * @param {string} query - Text to rewrite
 * @param {string} patternSource - Regex source (already escaped where needed)
 * @param {string} replacement - Text inserted verbatim for each match
 * @returns {string} The rewritten text
 */
function replaceReference(query, patternSource, replacement) {
  // A replacer function is used so `$&`, `$1`, `$$` in a path are not expanded.
  return query.replace(new RegExp(patternSource, 'gi'), () => replacement);
}

/**
 * Escapes every regex metacharacter so the string matches itself literally.
 *
 * @param {string} str - Literal text
 * @returns {string} Regex source matching `str` exactly
 */
function escapeForRegex(str) {
  return str.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
}
package/src/s3/s3.js CHANGED
@@ -3,7 +3,7 @@ import { dirname } from 'node:path';
3
3
  import { S3Client, ListObjectsV2Command, GetObjectCommand } from '@aws-sdk/client-s3';
4
4
 
5
5
  import { logger } from '../utils/logger.js';
6
- import { datesInRange, hoursInRange, buildPath } from '../utils/file-path-builder/file-path-builder.js';
6
+ import { datesInRange, hoursInRange, monthsInRange, buildPath } from '../utils/file-path-builder/file-path-builder.js';
7
7
  import { regexFromPattern } from '../utils/date-regex/date-regex.js';
8
8
  import { buildIbmIamClient } from './auth/ibm-iam-client.js';
9
9
 
@@ -139,12 +139,14 @@ export default class S3 {
139
139
  * @returns {(from:Date, to:Date, filePattern:string) => string[]} A function that creates a list of prefixes
140
140
  */
141
141
  prefixStrategy(from, to, filePattern) {
142
- const hasDateToken = filePattern.match(/\{(yyyy|MM|dd|hh|mm)\}/g);
143
- const hasGlob = filePattern.match(/\*/g);
142
+ const hasDayToken = /\{(dd|hh|mm)\}/.test(filePattern);
143
+ const hasMonthToken = /\{(yyyy|MM)\}/.test(filePattern);
144
+ const hasGlob = filePattern.includes('*');
144
145
  const hourDiff = (new Date(to) - new Date(from)) / 1000 / 60 / 60;
145
146
 
146
- if (hasDateToken && hourDiff < 24) return this.prefixHours;
147
- if (hasDateToken) return this.prefixDays;
147
+ if (hasDayToken && hourDiff < 24) return this.prefixHours;
148
+ if (hasDayToken) return this.prefixDays;
149
+ if (hasMonthToken) return this.prefixMonths;
148
150
  if (hasGlob) return this.prefixGlob;
149
151
  return (_from, _to, pattern) => [pattern];
150
152
  }
@@ -182,6 +184,22 @@ export default class S3 {
182
184
  });
183
185
  }
184
186
 
187
+ /**
188
+ * Returns a list of prefixes based on a range of months, one per calendar month.
189
+ * Used for Hive-style paths with {yyyy}/{MM} tokens but no {dd}.
190
+ *
191
+ * @param {Date} from From date
192
+ * @param {Date} to To date
193
+ * @param {string} filePattern The file pattern to use
194
+ * @returns {string[]} The list of prefixes for filtering
195
+ */
196
+ prefixMonths(from, to, filePattern) {
197
+ const monthRange = monthsInRange(new Date(from), new Date(to));
198
+ const splitToken = filePattern.includes('{MM}') ? '{MM}' : '{yyyy}';
199
+ const [trimmed] = filePattern.split(splitToken);
200
+ return monthRange.map((date) => buildPath(`${trimmed}${splitToken}`, date));
201
+ }
202
+
185
203
  /**
186
204
  * Returns a single entry array with a file pattern trimmed to the first glob
187
205
  *
@@ -368,7 +386,7 @@ export default class S3 {
368
386
  */
369
387
  async objectToFile(key) {
370
388
  const file = `${this.mount}/${key}`;
371
- const tmp = `${file}.${process.pid}.${Date.now()}.tmp`;
389
+ const tmp = `${file}.${process.pid}.${Date.now()}.${Math.random().toString(36).slice(2)}.tmp`;
372
390
  try {
373
391
  const response = await this.s3.send(new GetObjectCommand({ Bucket: this.bucket, Key: key }));
374
392
  const chunks = [];
@@ -422,7 +440,7 @@ export default class S3 {
422
440
  }
423
441
  }
424
442
 
425
- function buildS3Client({ apiKey, accessKeyId, secretAccessKey, endpoint, region }) {
443
+ export function buildS3Client({ apiKey, accessKeyId, secretAccessKey, endpoint, region = 'us-east-1' }) {
426
444
  const config = { ...(endpoint && { endpoint }), region, forcePathStyle: true };
427
445
 
428
446
  if (apiKey) return buildIbmIamClient(config, apiKey);
package/src/s3-querier.js CHANGED
@@ -40,8 +40,16 @@ export default function s3Querier({
40
40
  format,
41
41
  }) {
42
42
  const systemPlugins = [new QueryParserPlugin(), ...plugins, new QueryFinalizerPlugin()];
43
- const processed = processQuery(systemPlugins, { query, endpoint: defaultEndpoint, defaultBucket, bucketsDir });
44
- const { query: processedQuery, settings: downloadSettings } = processed;
43
+ const {
44
+ query: rawQuery,
45
+ fileSettings,
46
+ settings: downloadSettings,
47
+ } = processQuery(systemPlugins, {
48
+ query,
49
+ endpoint: defaultEndpoint,
50
+ defaultBucket,
51
+ bucketsDir,
52
+ });
45
53
 
46
54
  const downloadPromises = startDownloads({
47
55
  apiKey,
@@ -58,7 +66,9 @@ export default function s3Querier({
58
66
  results.forEach((result) => {
59
67
  if (result.status === 'rejected') throw result.reason;
60
68
  });
61
- return execQuery(processedQuery, { format });
69
+ const downloadedPaths = results.flatMap((result) => result.value);
70
+ const finalQuery = runFinalizers({ plugins: systemPlugins, rawQuery, fileSettings, downloadedPaths, bucketsDir });
71
+ return execQuery(finalQuery, { format });
62
72
  });
63
73
  }
64
74
 
@@ -72,14 +82,35 @@ export default function s3Querier({
72
82
  * @returns
73
83
  */
74
84
  function processQuery(plugins = [], { query = '', endpoint, defaultBucket, bucketsDir }) {
75
- const processedQuery = plugins.reduce(
76
- (result, plugin) => {
77
- return plugin.processQuery(result);
78
- },
79
- { endpoint, defaultBucket, bucketsDir, query, settings: [] },
80
- );
85
+ const processedQuery = plugins.reduce((result, plugin) => plugin.processQuery(result), {
86
+ endpoint,
87
+ defaultBucket,
88
+ bucketsDir,
89
+ query,
90
+ settings: [],
91
+ });
92
+ const fileSettings = processedQuery.settings;
81
93
  processedQuery.settings = mergeSettings(processedQuery.settings);
82
- return processedQuery;
94
+ return { ...processedQuery, fileSettings };
95
+ }
96
+
97
/**
 * Threads the query through the `finalizeQuery` hook of each plugin, in order.
 * The output of one plugin is the input of the next; plugins that do not provide
 * a callable `finalizeQuery` are skipped and leave the query untouched.
 *
 * @param {object} params
 * @param {object[]} [params.plugins=[]] - Plugin instances to run finalizers on.
 * @param {string} params.rawQuery - SQL with original file references and date/location tokens.
 * @param {object[]} params.fileSettings - Pre-merge per-file settings from processQuery.
 * @param {string[]} params.downloadedPaths - Local paths of all downloaded files.
 * @param {string} params.bucketsDir - Root directory where files are cached locally.
 * @returns {string} The query after every finalizer has run.
 */
function runFinalizers({ plugins = [], rawQuery, fileSettings, downloadedPaths, bucketsDir }) {
  return plugins.reduce((query, plugin) => {
    // Only call the hook when it is actually a function; a stray non-function
    // property of that name must not crash the run.
    if (typeof plugin.finalizeQuery !== 'function') return query;
    return plugin.finalizeQuery(query, fileSettings, downloadedPaths, bucketsDir);
  }, rawQuery);
}
84
115
 
85
116
  /**
package/src/utils/file-path-builder/file-path-builder.js CHANGED
@@ -28,6 +28,22 @@ export function datesInRange(from, to) {
28
28
  });
29
29
  }
30
30
 
31
/**
 * Lists the calendar months covered by a date range, one Date per month.
 * The first and last month are taken from the UTC year and month of `from` and
 * `to`, and both are included. Each entry is produced by noonUtcForMonthOffset,
 * i.e. noon UTC on the first day of the month, so that local getMonth()/getFullYear()
 * still report the intended month.
 *
 * @param {Date} from The from Date object
 * @param {Date} to The to Date object
 * @returns {Date[]} An array of dates, one per month within the range
 */
export function monthsInRange(from, to) {
  const firstYear = from.getUTCFullYear();
  const firstMonth = from.getUTCMonth();
  const yearSpan = to.getUTCFullYear() - firstYear;
  const monthSpan = to.getUTCMonth() - firstMonth;
  const total = yearSpan * 12 + monthSpan + 1;

  const months = [];
  for (let offset = 0; offset < total; offset += 1) {
    months.push(noonUtcForMonthOffset(firstYear, firstMonth, offset));
  }
  return months;
}
46
+
31
47
  /**
32
48
  * Given a date range returns an array of date objects
33
49
  *
@@ -53,3 +69,9 @@ export function zeroDateMins(date) {
53
69
  zeroedDate.setMinutes(0, 0);
54
70
  return zeroedDate;
55
71
  }
72
+
73
/**
 * Returns noon UTC on the first day of the month that lies `offset` months
 * away from the given start month.
 *
 * @param {number} startYear - Full year of the starting month
 * @param {number} startMonth - Zero-based starting month (0 = January)
 * @param {number} offset - Number of months to move; may be negative or exceed 11
 * @returns {Date} The first of the target month at 12:00:00 UTC
 */
function noonUtcForMonthOffset(startYear, startMonth, offset) {
  // Date.UTC carries out-of-range months into the year in both directions,
  // which a manual `%` does not do for negative values.
  return new Date(Date.UTC(startYear, startMonth + offset, 1, 12, 0, 0));
}