s3-querier 1.0.2 → 1.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +165 -0
- package/package.json +8 -2
- package/src/mcp/handlers/list-files.js +63 -0
- package/src/mcp/handlers/query.js +34 -0
- package/src/mcp/s3querier-mcp.js +119 -0
- package/src/mcp/server.js +5 -0
- package/src/plugins/query-finalizer/query-finalizer.js +49 -22
- package/src/s3/s3.js +1 -1
- package/src/s3-querier.js +41 -10
- package/src/utils/parquet-schema-reader.js +33 -0
package/README.md
CHANGED
|
@@ -159,6 +159,171 @@ const results = await s3Querier({
|
|
|
159
159
|
});
|
|
160
160
|
```
|
|
161
161
|
|
|
162
|
+
## MCP Server
|
|
163
|
+
|
|
164
|
+
s3-querier ships a [Model Context Protocol](https://modelcontextprotocol.io/) server that exposes two tools to any MCP-compatible client (Claude Desktop, Claude Code, etc.):
|
|
165
|
+
|
|
166
|
+
- **`query`** — runs a DuckDB SQL query against your S3 data
|
|
167
|
+
- **`list_files`** — lists objects under a prefix so an LLM can discover available data
|
|
168
|
+
|
|
169
|
+
### Environment variables
|
|
170
|
+
|
|
171
|
+
| Variable | Required | Description |
|
|
172
|
+
| --- | --- | --- |
|
|
173
|
+
| `S3_ENDPOINT` | ✓ | S3 endpoint URL |
|
|
174
|
+
| `S3_BUCKET` | ✓ | Default bucket |
|
|
175
|
+
| `S3_ACCESS_KEY_ID` | ✓ * | HMAC access key |
|
|
176
|
+
| `S3_SECRET_ACCESS_KEY` | ✓ * | HMAC secret key |
|
|
177
|
+
| `S3_API_KEY` | ✓ * | IBM IAM API key (alternative to HMAC) |
|
|
178
|
+
| `S3_BUCKETS_DIR` | | Local cache directory (default `/tmp/s3-querier`) |
|
|
179
|
+
|
|
180
|
+
\* Either HMAC pair **or** `S3_API_KEY` is required.
|
|
181
|
+
|
|
182
|
+
### Basic server
|
|
183
|
+
|
|
184
|
+
The built-in server entry point requires no configuration beyond environment variables.
|
|
185
|
+
|
|
186
|
+
**Claude Code / Claude Desktop**
|
|
187
|
+
|
|
188
|
+
```bash
|
|
189
|
+
claude mcp add s3-querier \
|
|
190
|
+
-e S3_ENDPOINT=https://s3.amazonaws.com \
|
|
191
|
+
-e S3_BUCKET=my-bucket \
|
|
192
|
+
-e S3_ACCESS_KEY_ID=key \
|
|
193
|
+
-e S3_SECRET_ACCESS_KEY=secret \
|
|
194
|
+
-- node node_modules/s3-querier/src/mcp/server.js
|
|
195
|
+
```
|
|
196
|
+
|
|
197
|
+
**IBM Bob**
|
|
198
|
+
|
|
199
|
+
Add to `mcp_settings.json` (global, applies across all workspaces) or `.Bob/mcp.json` (project-level, committed with your repo):
|
|
200
|
+
|
|
201
|
+
```json
|
|
202
|
+
{
|
|
203
|
+
"mcpServers": {
|
|
204
|
+
"s3-querier": {
|
|
205
|
+
"command": "node",
|
|
206
|
+
"args": ["/absolute/path/to/node_modules/s3-querier/src/mcp/server.js"],
|
|
207
|
+
"env": {
|
|
208
|
+
"S3_ENDPOINT": "https://s3.amazonaws.com",
|
|
209
|
+
"S3_BUCKET": "my-bucket",
|
|
210
|
+
"S3_ACCESS_KEY_ID": "key",
|
|
211
|
+
"S3_SECRET_ACCESS_KEY": "secret"
|
|
212
|
+
},
|
|
213
|
+
"disabled": false,
|
|
214
|
+
"alwaysAllow": [],
|
|
215
|
+
"disabledTools": []
|
|
216
|
+
}
|
|
217
|
+
}
|
|
218
|
+
}
|
|
219
|
+
```
|
|
220
|
+
|
|
221
|
+
### Extending with `S3QuerierMCP`
|
|
222
|
+
|
|
223
|
+
For richer LLM context, create a custom server using `S3QuerierMCP` and pass a `datasets` array. Each entry describes a dataset so the model knows what data is available and how it's structured — without having to explore the bucket first.
|
|
224
|
+
|
|
225
|
+
```js
|
|
226
|
+
// my-server.js
|
|
227
|
+
import { S3QuerierMCP } from 's3-querier/mcp';
|
|
228
|
+
|
|
229
|
+
new S3QuerierMCP({
|
|
230
|
+
datasets: [
|
|
231
|
+
{
|
|
232
|
+
name: 'sales',
|
|
233
|
+
description: 'Monthly sales transactions partitioned by year and month.',
|
|
234
|
+
prefix: 'sales/',
|
|
235
|
+
partitioning: 'year/month',
|
|
236
|
+
files: {
|
|
237
|
+
data: {
|
|
238
|
+
description: 'Sales records — id (int), date, product, amount (float), region',
|
|
239
|
+
},
|
|
240
|
+
},
|
|
241
|
+
},
|
|
242
|
+
{
|
|
243
|
+
name: 'products',
|
|
244
|
+
description: 'Product catalog — static reference data, no partitioning.',
|
|
245
|
+
prefix: 'products/',
|
|
246
|
+
files: {
|
|
247
|
+
catalog: {
|
|
248
|
+
description: 'Products — name, category, price (float)',
|
|
249
|
+
},
|
|
250
|
+
},
|
|
251
|
+
},
|
|
252
|
+
],
|
|
253
|
+
}).start();
|
|
254
|
+
```
|
|
255
|
+
|
|
256
|
+
**Claude Code / Claude Desktop**
|
|
257
|
+
|
|
258
|
+
```bash
|
|
259
|
+
claude mcp add my-datalake \
|
|
260
|
+
-e S3_ENDPOINT=https://s3.amazonaws.com \
|
|
261
|
+
-e S3_BUCKET=my-bucket \
|
|
262
|
+
-e S3_ACCESS_KEY_ID=key \
|
|
263
|
+
-e S3_SECRET_ACCESS_KEY=secret \
|
|
264
|
+
-- node my-server.js
|
|
265
|
+
```
|
|
266
|
+
|
|
267
|
+
**IBM Bob**
|
|
268
|
+
|
|
269
|
+
```json
|
|
270
|
+
{
|
|
271
|
+
"mcpServers": {
|
|
272
|
+
"my-datalake": {
|
|
273
|
+
"command": "node",
|
|
274
|
+
"args": ["/absolute/path/to/my-server.js"],
|
|
275
|
+
"env": {
|
|
276
|
+
"S3_ENDPOINT": "https://s3.amazonaws.com",
|
|
277
|
+
"S3_BUCKET": "my-bucket",
|
|
278
|
+
"S3_ACCESS_KEY_ID": "key",
|
|
279
|
+
"S3_SECRET_ACCESS_KEY": "secret"
|
|
280
|
+
},
|
|
281
|
+
"disabled": false,
|
|
282
|
+
"alwaysAllow": [],
|
|
283
|
+
"disabledTools": []
|
|
284
|
+
}
|
|
285
|
+
}
|
|
286
|
+
}
|
|
287
|
+
```
|
|
288
|
+
|
|
289
|
+
#### Dataset options
|
|
290
|
+
|
|
291
|
+
| Field | Description |
|
|
292
|
+
| --- | --- |
|
|
293
|
+
| `name` | Dataset identifier |
|
|
294
|
+
| `description` | Narrative description injected into the tool prompt |
|
|
295
|
+
| `prefix` | S3 path prefix (e.g. `"sales/"`) |
|
|
296
|
+
| `partitioning` | Partitioning scheme hint (e.g. `"year/month"`) |
|
|
297
|
+
| `bucket` | Overrides `S3_BUCKET` for this dataset |
|
|
298
|
+
| `endpoint` | Overrides `S3_ENDPOINT` for this dataset |
|
|
299
|
+
| `files` | Map of logical file names to `{ description }` |
|
|
300
|
+
|
|
301
|
+
### Adding custom tools
|
|
302
|
+
|
|
303
|
+
Pass a `tools` array to register additional MCP tools alongside the built-in ones:
|
|
304
|
+
|
|
305
|
+
```js
|
|
306
|
+
import { z } from 'zod';
|
|
307
|
+
import { S3QuerierMCP } from 's3-querier/mcp';
|
|
308
|
+
|
|
309
|
+
new S3QuerierMCP({
|
|
310
|
+
datasets: [ /* ... */ ],
|
|
311
|
+
tools: [
|
|
312
|
+
{
|
|
313
|
+
name: 'get_report',
|
|
314
|
+
description: 'Returns the latest weekly summary report.',
|
|
315
|
+
inputSchema: {
|
|
316
|
+
week: z.string().describe('ISO week string, e.g. "2025-W03"'),
|
|
317
|
+
},
|
|
318
|
+
handler: async ({ week }) => {
|
|
319
|
+
// your logic here
|
|
320
|
+
return { content: [{ type: 'text', text: `Report for ${week}` }] };
|
|
321
|
+
},
|
|
322
|
+
},
|
|
323
|
+
],
|
|
324
|
+
}).start();
|
|
325
|
+
```
|
|
326
|
+
|
|
162
327
|
## Examples
|
|
163
328
|
|
|
164
329
|
The `examples/` directory contains a local interactive demo and standalone scripts. All examples target a local MinIO instance — you'll need [Docker](https://docs.docker.com/get-docker/) and [Docker Compose](https://docs.docker.com/compose/install/) installed. Both are bundled with Docker Desktop on Mac and Windows; on Linux, install the Compose plugin separately.
|
package/package.json
CHANGED
|
@@ -1,11 +1,15 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "s3-querier",
|
|
3
|
-
"version": "1.0.2",
|
|
3
|
+
"version": "1.1.0",
|
|
4
4
|
"description": "Query S3-compatible storage with DuckDB and SQL",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"main": "src/s3-querier.js",
|
|
7
|
+
"bin": {
|
|
8
|
+
"s3-querier-mcp": "src/mcp/server.js"
|
|
9
|
+
},
|
|
7
10
|
"exports": {
|
|
8
|
-
".": "./src/s3-querier.js"
|
|
11
|
+
".": "./src/s3-querier.js",
|
|
12
|
+
"./mcp": "./src/mcp/s3querier-mcp.js"
|
|
9
13
|
},
|
|
10
14
|
"files": [
|
|
11
15
|
"src/**/*.js",
|
|
@@ -44,8 +48,10 @@
|
|
|
44
48
|
"@aws-sdk/client-s3": "^3.0.0",
|
|
45
49
|
"@derekstride/tree-sitter-sql": "^0.3.11",
|
|
46
50
|
"@duckdb/node-api": "^1.5.3-r.3",
|
|
51
|
+
"@modelcontextprotocol/sdk": "^1.29.0",
|
|
47
52
|
"avsc": "^5.7.7",
|
|
48
53
|
"date-fns": "^4.0.0",
|
|
54
|
+
"hyparquet": "^1.26.0",
|
|
49
55
|
"lru-cache": "^11.0.0",
|
|
50
56
|
"peggy": "^5.1.0",
|
|
51
57
|
"pino": "^10.3.1",
|
|
@@ -0,0 +1,63 @@
|
|
|
1
|
+
import { ListObjectsV2Command } from '@aws-sdk/client-s3';
|
|
2
|
+
import { bigintReplacer } from '../../s3-querier.js';
|
|
3
|
+
import { buildS3Client } from '../../s3/s3.js';
|
|
4
|
+
import { readParquetColumns } from '../../utils/parquet-schema-reader.js';
|
|
5
|
+
|
|
6
|
+
const { S3_ACCESS_KEY_ID, S3_SECRET_ACCESS_KEY, S3_API_KEY, S3_ENDPOINT, S3_BUCKET } = process.env;
|
|
7
|
+
|
|
8
|
+
export async function handleListFiles({ prefix = '', maxResults = 100, endpoint, bucket }) {
|
|
9
|
+
const resolvedEndpoint = endpoint || S3_ENDPOINT;
|
|
10
|
+
const resolvedBucket = bucket || S3_BUCKET;
|
|
11
|
+
const clientConfig = {
|
|
12
|
+
apiKey: S3_API_KEY,
|
|
13
|
+
accessKeyId: S3_ACCESS_KEY_ID,
|
|
14
|
+
secretAccessKey: S3_SECRET_ACCESS_KEY,
|
|
15
|
+
endpoint: resolvedEndpoint,
|
|
16
|
+
};
|
|
17
|
+
const s3Client = buildS3Client(clientConfig);
|
|
18
|
+
const listCommand = new ListObjectsV2Command({
|
|
19
|
+
Bucket: resolvedBucket,
|
|
20
|
+
Prefix: prefix,
|
|
21
|
+
MaxKeys: maxResults,
|
|
22
|
+
Delimiter: '/',
|
|
23
|
+
});
|
|
24
|
+
const response = await s3Client.send(listCommand);
|
|
25
|
+
const directories = (response.CommonPrefixes ?? []).map(({ Prefix }) => Prefix);
|
|
26
|
+
const files = (response.Contents ?? []).map(({ Key, Size }) => ({ file: Key, size: Size }));
|
|
27
|
+
const truncated = response.IsTruncated ?? false;
|
|
28
|
+
const representatives = getRepresentativeFiles(files);
|
|
29
|
+
const filesWithSchema = await Promise.all(
|
|
30
|
+
files.map((fileObj) => maybeAddSchema(s3Client, resolvedBucket, representatives, fileObj)),
|
|
31
|
+
);
|
|
32
|
+
|
|
33
|
+
return {
|
|
34
|
+
content: [
|
|
35
|
+
{ type: 'text', text: JSON.stringify({ directories, files: filesWithSchema, truncated }, bigintReplacer) },
|
|
36
|
+
],
|
|
37
|
+
};
|
|
38
|
+
}
|
|
39
|
+
|
|
40
|
+
/** Helpers */
|
|
41
|
+
|
|
42
|
+
async function addSchema(s3Client, bucket, { file, size }) {
|
|
43
|
+
if (!file.endsWith('.parquet')) return { file, size };
|
|
44
|
+
const columns = await readParquetColumns(s3Client, bucket, file).catch(() => null);
|
|
45
|
+
return { file, size, columns };
|
|
46
|
+
}
|
|
47
|
+
|
|
48
|
+
function maybeAddSchema(s3Client, bucket, representatives, fileObj) {
|
|
49
|
+
if (representatives.has(fileObj.file)) return addSchema(s3Client, bucket, fileObj);
|
|
50
|
+
return Promise.resolve(fileObj);
|
|
51
|
+
}
|
|
52
|
+
|
|
53
|
+
function getRepresentativeFiles(files) {
|
|
54
|
+
const parquetFiles = files.filter(({ file }) => file.endsWith('.parquet'));
|
|
55
|
+
const dirMap = parquetFiles.reduce(addFirstFilePerDir, new Map());
|
|
56
|
+
return new Set(dirMap.values());
|
|
57
|
+
}
|
|
58
|
+
|
|
59
|
+
function addFirstFilePerDir(acc, { file }) {
|
|
60
|
+
const dir = file.substring(0, file.lastIndexOf('/'));
|
|
61
|
+
if (!acc.has(dir)) acc.set(dir, file);
|
|
62
|
+
return acc;
|
|
63
|
+
}
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
import s3Querier, { bigintReplacer } from '../../s3-querier.js';
|
|
2
|
+
|
|
3
|
+
const {
|
|
4
|
+
S3_ACCESS_KEY_ID,
|
|
5
|
+
S3_SECRET_ACCESS_KEY,
|
|
6
|
+
S3_API_KEY,
|
|
7
|
+
S3_ENDPOINT,
|
|
8
|
+
S3_BUCKET,
|
|
9
|
+
S3_BUCKETS_DIR = '/tmp/s3-querier',
|
|
10
|
+
} = process.env;
|
|
11
|
+
|
|
12
|
+
/**
 * MCP `query` tool handler: runs a SQL query through `s3Querier` and returns
 * the records as a single JSON text content item.
 *
 * Credentials, the default endpoint/bucket and the local cache directory come
 * from the `S3_*` environment variables read at module load.
 *
 * @param {object} args
 * @param {string} args.sql - SQL to execute.
 * @param {string} [args.from] - Start of the date range (parsed with `new Date`).
 * @param {string} [args.to] - End of the date range (parsed with `new Date`).
 * @param {string} [args.endpoint] - Overrides `S3_ENDPOINT` for this call.
 * @param {string} [args.bucket] - Overrides `S3_BUCKET` for this call.
 * @returns {Promise<{content: {type: string, text: string}[]}>} MCP tool result.
 * @throws {Error} When `from` or `to` is provided but cannot be parsed as a date.
 */
export async function handleQuery({ sql, from, to, endpoint, bucket }) {
  // Validate up front: an unparseable date would otherwise reach s3Querier as NaN.
  const fromMs = toEpochMs(from, 'from');
  const toMs = toEpochMs(to, 'to');
  const resolvedEndpoint = endpoint || S3_ENDPOINT;
  const resolvedBucket = bucket || S3_BUCKET;

  const results = await s3Querier({
    query: sql,
    from: fromMs,
    to: toMs,
    defaultEndpoint: resolvedEndpoint,
    defaultBucket: resolvedBucket,
    bucketsDir: S3_BUCKETS_DIR,
    apiKey: S3_API_KEY,
    accessKeyId: S3_ACCESS_KEY_ID,
    secretAccessKey: S3_SECRET_ACCESS_KEY,
    format: 'jsonRecords',
  });

  return {
    content: [{ type: 'text', text: JSON.stringify(results, bigintReplacer) }],
  };
}

/** Converts an optional date string to epoch milliseconds, rejecting unparseable input. */
function toEpochMs(value, label) {
  if (!value) return undefined;
  const ms = new Date(value).getTime();
  if (Number.isNaN(ms)) throw new Error(`Invalid "${label}" date: ${value}`);
  return ms;
}
|
|
@@ -0,0 +1,119 @@
|
|
|
1
|
+
import { readFileSync } from 'node:fs';
|
|
2
|
+
import { McpServer } from '@modelcontextprotocol/sdk/server/mcp.js';
|
|
3
|
+
import { StdioServerTransport } from '@modelcontextprotocol/sdk/server/stdio.js';
|
|
4
|
+
import { z } from 'zod';
|
|
5
|
+
import { handleListFiles } from './handlers/list-files.js';
|
|
6
|
+
import { handleQuery } from './handlers/query.js';
|
|
7
|
+
|
|
8
|
+
const pkg = JSON.parse(readFileSync(new URL('../../package.json', import.meta.url), 'utf8'));
|
|
9
|
+
|
|
10
|
+
const toolDescription = readFileSync(new URL('./descriptions/tool.md', import.meta.url), 'utf8');
|
|
11
|
+
const sqlDescription = readFileSync(new URL('./descriptions/sql-param.md', import.meta.url), 'utf8');
|
|
12
|
+
const listFilesTemplate = readFileSync(new URL('./descriptions/list-files.md', import.meta.url), 'utf8');
|
|
13
|
+
const docsContent = readFileSync(new URL('../../docs/s3-querier.md', import.meta.url), 'utf8');
|
|
14
|
+
|
|
15
|
+
const QUERY_TOOL_SCHEMA = {
|
|
16
|
+
sql: z.string().describe(sqlDescription),
|
|
17
|
+
from: z
|
|
18
|
+
.string()
|
|
19
|
+
.optional()
|
|
20
|
+
.describe('Start of date range as ISO 8601 (e.g. "2025-01-01"). Required when the query uses date tokens.'),
|
|
21
|
+
to: z
|
|
22
|
+
.string()
|
|
23
|
+
.optional()
|
|
24
|
+
.describe('End of date range as ISO 8601 (e.g. "2025-01-31"). Required when the query uses date tokens.'),
|
|
25
|
+
endpoint: z.string().optional().describe('S3 endpoint URL. Overrides S3_ENDPOINT for this query.'),
|
|
26
|
+
bucket: z.string().optional().describe('S3 bucket name. Overrides S3_BUCKET for this query.'),
|
|
27
|
+
};
|
|
28
|
+
|
|
29
|
+
const LIST_FILES_TOOL_SCHEMA = {
|
|
30
|
+
prefix: z
|
|
31
|
+
.string()
|
|
32
|
+
.optional()
|
|
33
|
+
.describe('Path prefix to list under (e.g. "sales/" or ""). Defaults to empty string to list all files.'),
|
|
34
|
+
maxResults: z
|
|
35
|
+
.number()
|
|
36
|
+
.int()
|
|
37
|
+
.min(1)
|
|
38
|
+
.max(1000)
|
|
39
|
+
.optional()
|
|
40
|
+
.describe('Maximum number of files to return (default 100). Increase if the response is truncated.'),
|
|
41
|
+
endpoint: z.string().optional().describe('S3 endpoint URL. Overrides S3_ENDPOINT for this call.'),
|
|
42
|
+
bucket: z.string().optional().describe('S3 bucket name. Overrides S3_BUCKET for this call.'),
|
|
43
|
+
};
|
|
44
|
+
|
|
45
|
+
const DOCS_RESOURCE = {
|
|
46
|
+
title: 'S3 Querier Documentation',
|
|
47
|
+
description: 'Full documentation: query planning, file tokens, location tokens, and examples.',
|
|
48
|
+
mimeType: 'text/markdown',
|
|
49
|
+
};
|
|
50
|
+
|
|
51
|
+
export class S3QuerierMCP {
|
|
52
|
+
constructor(config = {}) {
|
|
53
|
+
this.config = config;
|
|
54
|
+
}
|
|
55
|
+
|
|
56
|
+
async start() {
|
|
57
|
+
const server = new McpServer({ name: 's3-querier', version: pkg.version });
|
|
58
|
+
const transport = new StdioServerTransport();
|
|
59
|
+
const listFilesDescription = buildListFilesDescription(this.config);
|
|
60
|
+
const enrichedToolDescription = buildToolDescription(this.config);
|
|
61
|
+
|
|
62
|
+
server.registerResource('s3-querier-docs', 's3-querier://docs', DOCS_RESOURCE, serveDocsHandler);
|
|
63
|
+
server.registerTool(
|
|
64
|
+
'list_files',
|
|
65
|
+
{ description: listFilesDescription, inputSchema: LIST_FILES_TOOL_SCHEMA },
|
|
66
|
+
handleListFiles,
|
|
67
|
+
);
|
|
68
|
+
server.registerTool('query', { description: enrichedToolDescription, inputSchema: QUERY_TOOL_SCHEMA }, handleQuery);
|
|
69
|
+
(this.config.tools ?? []).forEach(({ name, description, inputSchema, handler }) => {
|
|
70
|
+
server.registerTool(name, { description, inputSchema }, handler);
|
|
71
|
+
});
|
|
72
|
+
|
|
73
|
+
await server.connect(transport);
|
|
74
|
+
}
|
|
75
|
+
}
|
|
76
|
+
|
|
77
|
+
/** Helpers */
|
|
78
|
+
|
|
79
|
+
function serveDocsHandler(uri) {
|
|
80
|
+
return { contents: [{ uri: uri.href, text: docsContent, mimeType: 'text/markdown' }] };
|
|
81
|
+
}
|
|
82
|
+
|
|
83
|
+
function buildListFilesDescription(config) {
|
|
84
|
+
const today = new Date().toISOString().slice(0, 10);
|
|
85
|
+
const withDate = listFilesTemplate.replace('{{TODAY}}', today);
|
|
86
|
+
const datasetContext = buildDatasetContext(config.datasets);
|
|
87
|
+
return datasetContext ? `${withDate}\n${datasetContext}` : withDate;
|
|
88
|
+
}
|
|
89
|
+
|
|
90
|
+
function buildToolDescription(config) {
|
|
91
|
+
const datasetContext = buildDatasetContext(config.datasets);
|
|
92
|
+
return datasetContext ? `${toolDescription}\n\n${datasetContext}` : toolDescription;
|
|
93
|
+
}
|
|
94
|
+
|
|
95
|
+
function buildDatasetContext(datasets) {
|
|
96
|
+
if (!datasets?.length) return '';
|
|
97
|
+
const datasetLines = datasets.flatMap(formatDataset);
|
|
98
|
+
return ['CONFIGURED DATASETS', '', ...datasetLines].join('\n');
|
|
99
|
+
}
|
|
100
|
+
|
|
101
|
+
function formatDataset({ name, description, bucket, endpoint, prefix, partitioning, files }) {
|
|
102
|
+
const header = description ? `${name} — ${description}` : name;
|
|
103
|
+
const lines = [header];
|
|
104
|
+
if (bucket) lines.push(` Bucket: ${bucket}`);
|
|
105
|
+
if (endpoint) lines.push(` Endpoint: ${endpoint}`);
|
|
106
|
+
if (prefix) lines.push(` Prefix: ${prefix}`);
|
|
107
|
+
if (partitioning) lines.push(` Partitioning: ${partitioning}`);
|
|
108
|
+
if (files) {
|
|
109
|
+
const fileLines = Object.entries(files).map(formatFileLine);
|
|
110
|
+
lines.push(' Files:', ...fileLines);
|
|
111
|
+
}
|
|
112
|
+
lines.push('');
|
|
113
|
+
return lines;
|
|
114
|
+
}
|
|
115
|
+
|
|
116
|
+
/**
 * Formats one `[fileName, fileInfo]` entry of a dataset's `files` map as a
 * line of the dataset context text.
 *
 * A missing descriptor (`undefined` or `null`) is treated like one without a
 * description instead of throwing during destructuring.
 *
 * @param {[string, ({description?: string}|null|undefined)]} entry - Pair from `Object.entries(files)`.
 * @returns {string} The file name, followed by " — description" when one is present.
 */
function formatFileLine([fileName, fileInfo]) {
  const fileDesc = fileInfo?.description;
  const label = fileDesc ? `${fileName} — ${fileDesc}` : fileName;
  return ` ${label}`;
}
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import {
|
|
1
|
+
import { regexFromPattern } from '../../utils/date-regex/date-regex.js';
|
|
2
2
|
import {
|
|
3
3
|
removeFileSettingTokens,
|
|
4
4
|
removeDoubleFwdSlash,
|
|
@@ -9,33 +9,60 @@ export default class QueryFinalizerPlugin {
|
|
|
9
9
|
name = 'CorePlugin';
|
|
10
10
|
|
|
11
11
|
processQuery(context) {
|
|
12
|
-
|
|
13
|
-
const processedQuery = QueryFinalizerPlugin.prepareQuery(settings, bucketsDir, query);
|
|
14
|
-
return { ...context, query: processedQuery };
|
|
12
|
+
return context;
|
|
15
13
|
}
|
|
16
14
|
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
15
|
+
/**
|
|
16
|
+
* Replaces each SQL file reference with the exact local paths downloaded from S3.
|
|
17
|
+
* Called after all downloads complete so that DuckDB receives precise file paths
|
|
18
|
+
* rather than glob patterns that would scan the entire local cache.
|
|
19
|
+
*
|
|
20
|
+
* @param {string} rawQuery - SQL with original file references and date/location tokens
|
|
21
|
+
* @param {object[]} fileSettings - Pre-merge per-file settings from processQuery
|
|
22
|
+
* @param {string[]} downloadedPaths - Absolute local paths of all downloaded files
|
|
23
|
+
* @param {string} bucketsDir - Root directory where files are cached locally
|
|
24
|
+
* @returns {string} Finalized SQL ready for DuckDB execution
|
|
25
|
+
*/
|
|
26
|
+
finalizeQuery(rawQuery, fileSettings, downloadedPaths, bucketsDir) {
|
|
27
|
+
let prepared = fileSettings.reduce(
|
|
28
|
+
(query, setting) => applyFileSetting(query, setting, downloadedPaths, bucketsDir),
|
|
29
|
+
rawQuery,
|
|
30
|
+
);
|
|
25
31
|
prepared = removeFileSettingTokens(prepared);
|
|
26
|
-
prepared = removeFileDatePatterns(prepared);
|
|
27
32
|
prepared = removeCacheSettings(prepared);
|
|
28
33
|
prepared = removeDoubleFwdSlash(prepared);
|
|
29
|
-
|
|
30
34
|
return prepared;
|
|
31
35
|
}
|
|
36
|
+
}
|
|
32
37
|
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
38
|
+
/** Helpers */
|
|
39
|
+
|
|
40
|
+
/**
 * Rewrites every occurrence of one SQL file reference with the local path(s)
 * that were downloaded for it.
 *
 * - one matching path: the reference text is replaced by that path;
 * - several matching paths: the quoted reference is replaced by an array literal
 *   (see `replaceWithArray`).
 *
 * @param {string} query - SQL being finalized.
 * @param {object} setting - Per-file setting produced while processing the query.
 * @param {string} setting.sqlFileReference - Reference exactly as written in the SQL.
 * @param {string} setting.file - File pattern, turned into a RegExp by `regexFromPattern`.
 * @param {string} setting.bucket - Bucket name; files are looked up under `${bucketsDir}/${bucket}/`.
 * @param {string[]} downloadedPaths - Local paths of all downloaded files.
 * @param {string} bucketsDir - Root directory of the local cache.
 * @returns {string} The query with this reference substituted.
 * @throws {Error} When no downloaded path matches the file pattern.
 */
function applyFileSetting(query, { sqlFileReference, file, bucket }, downloadedPaths, bucketsDir) {
  const localDir = `${bucketsDir}/${bucket}/`;
  const filePattern = regexFromPattern(file);
  const matchingPaths = downloadedPaths.filter((localPath) => matchesPattern(localPath, localDir, filePattern));
  // The cache flag is not part of the path being searched for in the SQL.
  const searchStr = sqlFileReference.replace(/\?cache=(true|false)/i, '');

  if (matchingPaths.length === 0) throw new Error(`No files found for: ${file}`);
  if (matchingPaths.length > 1) return replaceWithArray(query, searchStr, matchingPaths);

  const [localPath] = matchingPaths;
  // Replacer function: a path containing "$&", "$'" or "$1" must be inserted
  // verbatim, not interpreted as a replacement pattern.
  return query.replace(new RegExp(escapeForRegex(searchStr), 'gi'), () => localPath);
}
|
|
51
|
+
|
|
52
|
+
/**
 * Tells whether a downloaded file lives under `localDir` and its path relative
 * to that directory matches `filePattern`.
 *
 * @param {string} localPath - Local path of a downloaded file.
 * @param {string} localDir - Directory prefix (with trailing slash) the file must be under.
 * @param {RegExp} filePattern - Pattern tested against the path relative to `localDir`.
 * @returns {boolean} True when both conditions hold.
 */
function matchesPattern(localPath, localDir, filePattern) {
  if (!localPath.startsWith(localDir)) return false;
  // `test` is stateful on global/sticky patterns; the same pattern is reused for
  // every downloaded path, so always start matching from the beginning.
  filePattern.lastIndex = 0;
  return filePattern.test(localPath.slice(localDir.length));
}
|
|
55
|
+
|
|
56
|
+
/**
 * Replaces every quoted occurrence of `searchStr` in the query with a SQL
 * array literal listing `paths`, e.g. `['/a.parquet', '/b.parquet']`.
 *
 * @param {string} query - SQL being finalized.
 * @param {string} searchStr - File reference as written in the SQL, without quotes.
 * @param {string[]} paths - Local paths to list in the array literal.
 * @returns {string} The query with the quoted reference substituted.
 */
function replaceWithArray(query, searchStr, paths) {
  // Double embedded single quotes so a path such as "o'brien/x.parquet" stays
  // inside its SQL string literal.
  const quotedPaths = paths.map((path) => `'${path.replace(/'/g, "''")}'`);
  const arrayLiteral = `[${quotedPaths.join(', ')}]`;
  const quotedReference = new RegExp(`['"]${escapeForRegex(searchStr)}['"]`, 'gi');
  // Replacer function: "$&", "$'" etc. inside a path must be inserted verbatim.
  return query.replace(quotedReference, () => arrayLiteral);
}
|
|
60
|
+
|
|
61
|
+
/**
 * Escapes every regular-expression metacharacter in `str` so the result can be
 * embedded in a `RegExp` source and match the input literally.
 *
 * Covers `. * + ? ^ $ { } ( ) | [ ] \`, so file references containing
 * parentheses, brackets, question marks or backslashes are matched as-is.
 *
 * @param {string} str - Text to match literally.
 * @returns {string} `str` with each metacharacter prefixed by a backslash.
 */
function escapeForRegex(str) {
  return str.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
}
|
package/src/s3/s3.js
CHANGED
|
@@ -386,7 +386,7 @@ export default class S3 {
|
|
|
386
386
|
*/
|
|
387
387
|
async objectToFile(key) {
|
|
388
388
|
const file = `${this.mount}/${key}`;
|
|
389
|
-
const tmp = `${file}.${process.pid}.${Date.now()}.tmp`;
|
|
389
|
+
const tmp = `${file}.${process.pid}.${Date.now()}.${Math.random().toString(36).slice(2)}.tmp`;
|
|
390
390
|
try {
|
|
391
391
|
const response = await this.s3.send(new GetObjectCommand({ Bucket: this.bucket, Key: key }));
|
|
392
392
|
const chunks = [];
|
package/src/s3-querier.js
CHANGED
|
@@ -40,8 +40,16 @@ export default function s3Querier({
|
|
|
40
40
|
format,
|
|
41
41
|
}) {
|
|
42
42
|
const systemPlugins = [new QueryParserPlugin(), ...plugins, new QueryFinalizerPlugin()];
|
|
43
|
-
const
|
|
44
|
-
|
|
43
|
+
const {
|
|
44
|
+
query: rawQuery,
|
|
45
|
+
fileSettings,
|
|
46
|
+
settings: downloadSettings,
|
|
47
|
+
} = processQuery(systemPlugins, {
|
|
48
|
+
query,
|
|
49
|
+
endpoint: defaultEndpoint,
|
|
50
|
+
defaultBucket,
|
|
51
|
+
bucketsDir,
|
|
52
|
+
});
|
|
45
53
|
|
|
46
54
|
const downloadPromises = startDownloads({
|
|
47
55
|
apiKey,
|
|
@@ -58,7 +66,9 @@ export default function s3Querier({
|
|
|
58
66
|
results.forEach((result) => {
|
|
59
67
|
if (result.status === 'rejected') throw result.reason;
|
|
60
68
|
});
|
|
61
|
-
|
|
69
|
+
const downloadedPaths = results.flatMap((result) => result.value);
|
|
70
|
+
const finalQuery = runFinalizers({ plugins: systemPlugins, rawQuery, fileSettings, downloadedPaths, bucketsDir });
|
|
71
|
+
return execQuery(finalQuery, { format });
|
|
62
72
|
});
|
|
63
73
|
}
|
|
64
74
|
|
|
@@ -72,14 +82,35 @@ export default function s3Querier({
|
|
|
72
82
|
* @returns
|
|
73
83
|
*/
|
|
74
84
|
function processQuery(plugins = [], { query = '', endpoint, defaultBucket, bucketsDir }) {
|
|
75
|
-
const processedQuery = plugins.reduce(
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
85
|
+
const processedQuery = plugins.reduce((result, plugin) => plugin.processQuery(result), {
|
|
86
|
+
endpoint,
|
|
87
|
+
defaultBucket,
|
|
88
|
+
bucketsDir,
|
|
89
|
+
query,
|
|
90
|
+
settings: [],
|
|
91
|
+
});
|
|
92
|
+
const fileSettings = processedQuery.settings;
|
|
81
93
|
processedQuery.settings = mergeSettings(processedQuery.settings);
|
|
82
|
-
return processedQuery;
|
|
94
|
+
return { ...processedQuery, fileSettings };
|
|
95
|
+
}
|
|
96
|
+
|
|
97
|
+
/**
|
|
98
|
+
* Passes the raw query through each plugin's `finalizeQuery` lifecycle method,
|
|
99
|
+
* substituting exact downloaded paths in place of glob patterns.
|
|
100
|
+
*
|
|
101
|
+
* @param {object} params
|
|
102
|
+
* @param {object[]} params.plugins - Plugin instances to run finalizers on.
|
|
103
|
+
* @param {string} params.rawQuery - SQL with original file references and date/location tokens.
|
|
104
|
+
* @param {object[]} params.fileSettings - Pre-merge per-file settings from processQuery.
|
|
105
|
+
* @param {string[]} params.downloadedPaths - Absolute local paths of all downloaded files.
|
|
106
|
+
* @param {string} params.bucketsDir - Root directory where files are cached locally.
|
|
107
|
+
* @returns {string} Finalized SQL ready for DuckDB execution.
|
|
108
|
+
*/
|
|
109
|
+
function runFinalizers({ plugins, rawQuery, fileSettings, downloadedPaths, bucketsDir }) {
|
|
110
|
+
return plugins.reduce((query, plugin) => {
|
|
111
|
+
if (!plugin.finalizeQuery) return query;
|
|
112
|
+
return plugin.finalizeQuery(query, fileSettings, downloadedPaths, bucketsDir);
|
|
113
|
+
}, rawQuery);
|
|
83
114
|
}
|
|
84
115
|
|
|
85
116
|
/**
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
import { GetObjectCommand } from '@aws-sdk/client-s3';
|
|
2
|
+
import { parquetMetadata, parquetSchema } from 'hyparquet';
|
|
3
|
+
|
|
4
|
+
/**
|
|
5
|
+
* Reads the column names from a parquet file using two ranged S3 GET requests
|
|
6
|
+
* (footer only — no full file download regardless of file size).
|
|
7
|
+
*
|
|
8
|
+
* @param {import('@aws-sdk/client-s3').S3Client} s3Client
|
|
9
|
+
* @param {string} bucket
|
|
10
|
+
* @param {string} key
|
|
11
|
+
* @returns {Promise<string[]>} Column names
|
|
12
|
+
*/
|
|
13
|
+
export async function readParquetColumns(s3Client, bucket, key) {
|
|
14
|
+
const tail = await fetchRange(s3Client, bucket, key, 'bytes=-8');
|
|
15
|
+
const footerLength = tail.readUInt32LE(0);
|
|
16
|
+
const footerBuffer = await fetchRange(s3Client, bucket, key, `bytes=-${footerLength + 8}`);
|
|
17
|
+
const { buffer, byteOffset, byteLength } = footerBuffer;
|
|
18
|
+
const arrayBuffer = buffer.slice(byteOffset, byteOffset + byteLength);
|
|
19
|
+
const metadata = parquetMetadata(arrayBuffer);
|
|
20
|
+
const schema = parquetSchema(metadata);
|
|
21
|
+
return schema.children.map((col) => col.element.name);
|
|
22
|
+
}
|
|
23
|
+
|
|
24
|
+
/** Helpers */
|
|
25
|
+
|
|
26
|
+
async function fetchRange(s3Client, bucket, key, range) {
|
|
27
|
+
const response = await s3Client.send(new GetObjectCommand({ Bucket: bucket, Key: key, Range: range }));
|
|
28
|
+
const chunks = [];
|
|
29
|
+
for await (const chunk of response.Body) {
|
|
30
|
+
chunks.push(chunk);
|
|
31
|
+
}
|
|
32
|
+
return Buffer.concat(chunks);
|
|
33
|
+
}
|