s3-querier 1.0.3 → 1.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -159,6 +159,171 @@ const results = await s3Querier({
159
159
  });
160
160
  ```
161
161
 
162
+ ## MCP Server
163
+
164
+ s3-querier ships a [Model Context Protocol](https://modelcontextprotocol.io/) server that exposes two tools to any MCP-compatible client (Claude Desktop, Claude Code, etc.):
165
+
166
+ - **`query`** — runs a DuckDB SQL query against your S3 data
167
+ - **`list_files`** — lists objects under a prefix so an LLM can discover available data
168
+
169
+ ### Environment variables
170
+
171
+ | Variable | Required | Description |
172
+ | --- | --- | --- |
173
+ | `S3_ENDPOINT` | ✓ | S3 endpoint URL |
174
+ | `S3_BUCKET` | ✓ | Default bucket |
175
+ | `S3_ACCESS_KEY_ID` | ✓ * | HMAC access key |
176
+ | `S3_SECRET_ACCESS_KEY` | ✓ * | HMAC secret key |
177
+ | `S3_API_KEY` | ✓ * | IBM IAM API key (alternative to HMAC) |
178
+ | `S3_BUCKETS_DIR` | | Local cache directory (default `/tmp/s3-querier`) |
179
+
180
+ \* Either the HMAC key pair (`S3_ACCESS_KEY_ID` and `S3_SECRET_ACCESS_KEY`) **or** `S3_API_KEY` is required.
181
+
182
+ ### Basic server
183
+
184
+ The built-in server entry point requires no configuration beyond environment variables.
185
+
186
+ **Claude Code / Claude Desktop**
187
+
188
+ ```bash
189
+ claude mcp add s3-querier \
190
+ -e S3_ENDPOINT=https://s3.amazonaws.com \
191
+ -e S3_BUCKET=my-bucket \
192
+ -e S3_ACCESS_KEY_ID=key \
193
+ -e S3_SECRET_ACCESS_KEY=secret \
194
+ -- node node_modules/s3-querier/src/mcp/server.js
195
+ ```
196
+
197
+ **IBM Bob**
198
+
199
+ Add to `mcp_settings.json` (global, applies across all workspaces) or `.Bob/mcp.json` (project-level, committed with your repo):
200
+
201
+ ```json
202
+ {
203
+ "mcpServers": {
204
+ "s3-querier": {
205
+ "command": "node",
206
+ "args": ["/absolute/path/to/node_modules/s3-querier/src/mcp/server.js"],
207
+ "env": {
208
+ "S3_ENDPOINT": "https://s3.amazonaws.com",
209
+ "S3_BUCKET": "my-bucket",
210
+ "S3_ACCESS_KEY_ID": "key",
211
+ "S3_SECRET_ACCESS_KEY": "secret"
212
+ },
213
+ "disabled": false,
214
+ "alwaysAllow": [],
215
+ "disabledTools": []
216
+ }
217
+ }
218
+ }
219
+ ```
220
+
221
+ ### Extending with `S3QuerierMCP`
222
+
223
+ For richer LLM context, create a custom server using `S3QuerierMCP` and pass a `datasets` array. Each entry describes a dataset so the model knows what data is available and how it's structured — without having to explore the bucket first.
224
+
225
+ ```js
226
+ // my-server.js
227
+ import { S3QuerierMCP } from 's3-querier/mcp';
228
+
229
+ new S3QuerierMCP({
230
+ datasets: [
231
+ {
232
+ name: 'sales',
233
+ description: 'Monthly sales transactions partitioned by year and month.',
234
+ prefix: 'sales/',
235
+ partitioning: 'year/month',
236
+ files: {
237
+ data: {
238
+ description: 'Sales records — id (int), date, product, amount (float), region',
239
+ },
240
+ },
241
+ },
242
+ {
243
+ name: 'products',
244
+ description: 'Product catalog — static reference data, no partitioning.',
245
+ prefix: 'products/',
246
+ files: {
247
+ catalog: {
248
+ description: 'Products — name, category, price (float)',
249
+ },
250
+ },
251
+ },
252
+ ],
253
+ }).start();
254
+ ```
255
+
256
+ **Claude Code / Claude Desktop**
257
+
258
+ ```bash
259
+ claude mcp add my-datalake \
260
+ -e S3_ENDPOINT=https://s3.amazonaws.com \
261
+ -e S3_BUCKET=my-bucket \
262
+ -e S3_ACCESS_KEY_ID=key \
263
+ -e S3_SECRET_ACCESS_KEY=secret \
264
+ -- node my-server.js
265
+ ```
266
+
267
+ **IBM Bob**
268
+
269
+ ```json
270
+ {
271
+ "mcpServers": {
272
+ "my-datalake": {
273
+ "command": "node",
274
+ "args": ["/absolute/path/to/my-server.js"],
275
+ "env": {
276
+ "S3_ENDPOINT": "https://s3.amazonaws.com",
277
+ "S3_BUCKET": "my-bucket",
278
+ "S3_ACCESS_KEY_ID": "key",
279
+ "S3_SECRET_ACCESS_KEY": "secret"
280
+ },
281
+ "disabled": false,
282
+ "alwaysAllow": [],
283
+ "disabledTools": []
284
+ }
285
+ }
286
+ }
287
+ ```
288
+
289
+ #### Dataset options
290
+
291
+ | Field | Description |
292
+ | --- | --- |
293
+ | `name` | Dataset identifier |
294
+ | `description` | Narrative description injected into the tool prompt |
295
+ | `prefix` | S3 path prefix (e.g. `"sales/"`) |
296
+ | `partitioning` | Partitioning scheme hint (e.g. `"year/month"`) |
297
+ | `bucket` | Overrides `S3_BUCKET` for this dataset |
298
+ | `endpoint` | Overrides `S3_ENDPOINT` for this dataset |
299
+ | `files` | Map of logical file names to `{ description }` |
300
+
301
+ ### Adding custom tools
302
+
303
+ Pass a `tools` array to register additional MCP tools alongside the built-in ones:
304
+
305
+ ```js
306
+ import { z } from 'zod';
307
+ import { S3QuerierMCP } from 's3-querier/mcp';
308
+
309
+ new S3QuerierMCP({
310
+ datasets: [ /* ... */ ],
311
+ tools: [
312
+ {
313
+ name: 'get_report',
314
+ description: 'Returns the latest weekly summary report.',
315
+ inputSchema: {
316
+ week: z.string().describe('ISO week string, e.g. "2025-W03"'),
317
+ },
318
+ handler: async ({ week }) => {
319
+ // your logic here
320
+ return { content: [{ type: 'text', text: `Report for ${week}` }] };
321
+ },
322
+ },
323
+ ],
324
+ }).start();
325
+ ```
326
+
162
327
  ## Examples
163
328
 
164
329
  The `examples/` directory contains a local interactive demo and standalone scripts. All examples target a local MinIO instance — you'll need [Docker](https://docs.docker.com/get-docker/) and [Docker Compose](https://docs.docker.com/compose/install/) installed. Both are bundled with Docker Desktop on Mac and Windows; on Linux, install the Compose plugin separately.
package/package.json CHANGED
@@ -1,15 +1,20 @@
1
1
  {
2
2
  "name": "s3-querier",
3
- "version": "1.0.3",
3
+ "version": "1.1.1",
4
4
  "description": "Query S3-compatible storage with DuckDB and SQL",
5
5
  "type": "module",
6
6
  "main": "src/s3-querier.js",
7
+ "bin": {
8
+ "s3-querier-mcp": "src/mcp/server.js"
9
+ },
7
10
  "exports": {
8
- ".": "./src/s3-querier.js"
11
+ ".": "./src/s3-querier.js",
12
+ "./mcp": "./src/mcp/s3querier-mcp.js"
9
13
  },
10
14
  "files": [
11
15
  "src/**/*.js",
12
16
  "!src/**/*.test.js",
17
+ "src/mcp/descriptions/",
13
18
  "docs/",
14
19
  "README.md"
15
20
  ],
@@ -21,7 +26,7 @@
21
26
  "scripts": {
22
27
  "test": "node --test \"./src/**/*.test.js\"",
23
28
  "test:e2e": "docker compose -f e2e/docker-compose.yml up -d --wait && node e2e/setup/seed.js && node --test e2e/*.e2e.js; docker compose -f e2e/docker-compose.yml down",
24
- "test:coverage:html": "c8 -x coverage -x **/*.test.js --all -r html node --test \"./src/**/*.test.js\"",
29
+ "test:coverage:html": "c8 -x coverage -x **/*.test.js -x src/s3-querier.js -x e2e -x examples -x '*.config.js' --all -r html node --test \"./src/**/*.test.js\"",
25
30
  "prettify": "prettier \"./src/**/*.js\" --write",
26
31
  "lint": "eslint \"./src/**/*.js\"",
27
32
  "lint:fix": "eslint --fix \"./src/**/*.js\"",
@@ -44,8 +49,10 @@
44
49
  "@aws-sdk/client-s3": "^3.0.0",
45
50
  "@derekstride/tree-sitter-sql": "^0.3.11",
46
51
  "@duckdb/node-api": "^1.5.3-r.3",
52
+ "@modelcontextprotocol/sdk": "^1.29.0",
47
53
  "avsc": "^5.7.7",
48
54
  "date-fns": "^4.0.0",
55
+ "hyparquet": "^1.26.0",
49
56
  "lru-cache": "^11.0.0",
50
57
  "peggy": "^5.1.0",
51
58
  "pino": "^10.3.1",
@@ -0,0 +1,34 @@
1
+ List files and sub-directories in an S3 bucket at a given prefix level.
2
+
3
+ Returns:
4
+ - directories: sub-prefixes to drill into (e.g. "env=production/", "year=2026/")
5
+ - files: objects at this level with path, size, and column names for parquet files
6
+ - truncated: true if there are more results — narrow the prefix or increase maxResults
7
+
8
+ NAVIGATION STRATEGY
9
+
10
+ Start with an empty prefix to see the top-level structure. Drill into directories
11
+ one level at a time. Each call only shows what is directly under the given prefix,
12
+ not the full recursive tree.
13
+
14
+ HIVE-PARTITIONED DATA
15
+
16
+ Today's date is {{TODAY}}.
17
+
18
+ Many datasets use Hive-style partitioning with partition keys in the path:
19
+ year=YYYY/month=MM/day=DD/hour=HH/minute=MM/
20
+
21
+ To find the most recent data:
22
+ 1. List down to the partition root (e.g. env=production/) — one call per structural level
23
+ 2. Then STOP listing. Do NOT list year, month, or day directories individually.
24
+ 3. Construct the date segment directly from today: year=YYYY/month=MM/day=DD/
25
+ 4. Append it to the partition root and list from the day directory to find the latest hour.
26
+
27
+ Example: if you reach env=production/ and today is 2026-06-14, your next call is
28
+ prefix: "…/env=production/year=2026/month=06/day=14/"
29
+ Then list that to find the available hours, pick the highest, and continue.
30
+
31
+ Only fall back to listing year/month/day if the constructed path returns no directories and no files.
32
+
33
+ Always check for a "latest/" directory first — it often contains the most recent
34
+ snapshot and avoids navigating the full partition tree.
@@ -0,0 +1,53 @@
1
+ A DuckDB SQL query. File paths inside read_parquet() or read_csv() are resolved against S3
2
+ and downloaded before the query runs.
3
+
4
+ REQUIRED: always call read_parquet() or read_csv() — plain table names are not supported.
5
+ Prefer union_by_name=1 when reading multiple files.
6
+
7
+ SCHEMA DISCOVERY: Never guess column names. Before writing any query that joins files or
8
+ references specific columns, run SELECT * FROM read_parquet('path') LIMIT 1 on each file
9
+ to inspect the actual schema. Only then write the real query.
10
+
11
+ FILE PATH TOKENS
12
+
13
+ Date tokens — expanded using the `from` and `to` parameters:
14
+ {yyyy} 4-digit year e.g. 2025
15
+ {MM} 2-digit month e.g. 08
16
+ {dd} 2-digit day e.g. 03
17
+ {hh} 2-digit hour e.g. 14
18
+ {mm} 2-digit minute e.g. 30
19
+ {ss} 2-digit second e.g. 00
20
+
21
+ Location tokens — override endpoint or bucket per path:
22
+ {endpoint:https://s3.example.com}
23
+ {bucket:my-bucket}
24
+
25
+ Glob syntax — wildcard matching for non-time path segments:
26
+ jobs/window=202308032130/*.parquet
27
+
28
+ HIVE-PARTITIONED DATA
29
+
30
+ For paths partitioned only by year and month (no day segment), use {yyyy} and {MM} together:
31
+ sales/year={yyyy}/month={MM}/data.parquet
32
+
33
+ s3-querier generates one prefix per calendar month in the from/to range, so a Q1 query
34
+ (from=2024-01-01, to=2024-03-31) fetches exactly months 01, 02, 03 — not all of 2024.
35
+
36
+ Do NOT use DuckDB character-class globs like month=0[1-3] — DuckDB does not support them.
37
+ Use {yyyy}/{MM} tokens instead, which s3-querier expands correctly.
38
+
39
+ EXAMPLES
40
+
41
+ Single file:
42
+ SELECT * FROM read_parquet('reports/summary.parquet') LIMIT 10
43
+
44
+ Day-partitioned files (requires from/to):
45
+ SELECT id FROM read_parquet('events/year={yyyy}/month={MM}/day={dd}/data.parquet', union_by_name=1)
46
+
47
+ Month-partitioned files (no day segment — use {yyyy}/{MM} only):
48
+ SELECT * FROM read_parquet('sales/year={yyyy}/month={MM}/data.parquet', union_by_name=1)
49
+
50
+ Cross-endpoint join:
51
+ WITH east AS (SELECT id FROM read_parquet('{endpoint:https://s3.us-east.example.com}/{bucket:logs}/data/{yyyy}{MM}{dd}.parquet'))
52
+ SELECT * FROM read_parquet('{endpoint:https://s3.eu-west.example.com}/{bucket:logs}/data/{yyyy}{MM}{dd}.parquet') AS west
53
+ JOIN east ON west.id = east.id
@@ -0,0 +1,8 @@
1
+ Download files from S3-compatible storage and execute a DuckDB SQL query against them.
2
+
3
+ Queries must use DuckDB table functions such as read_parquet() or read_csv() with file paths
4
+ that reference objects in S3. S3 Querier resolves those paths, downloads the matching files,
5
+ and runs the query locally with DuckDB.
6
+
7
+ Use the `s3-querier://docs` resource for full documentation including token syntax, examples,
8
+ and query planning tips.
@@ -0,0 +1,63 @@
1
+ import { ListObjectsV2Command } from '@aws-sdk/client-s3';
2
+ import { bigintReplacer } from '../../s3-querier.js';
3
+ import { buildS3Client } from '../../s3/s3.js';
4
+ import { readParquetColumns } from '../../utils/parquet-schema-reader.js';
5
+
6
+ const { S3_ACCESS_KEY_ID, S3_SECRET_ACCESS_KEY, S3_API_KEY, S3_ENDPOINT, S3_BUCKET } = process.env;
7
+
8
+ export async function handleListFiles({ prefix = '', maxResults = 100, endpoint, bucket }) {
9
+ const resolvedEndpoint = endpoint || S3_ENDPOINT;
10
+ const resolvedBucket = bucket || S3_BUCKET;
11
+ const clientConfig = {
12
+ apiKey: S3_API_KEY,
13
+ accessKeyId: S3_ACCESS_KEY_ID,
14
+ secretAccessKey: S3_SECRET_ACCESS_KEY,
15
+ endpoint: resolvedEndpoint,
16
+ };
17
+ const s3Client = buildS3Client(clientConfig);
18
+ const listCommand = new ListObjectsV2Command({
19
+ Bucket: resolvedBucket,
20
+ Prefix: prefix,
21
+ MaxKeys: maxResults,
22
+ Delimiter: '/',
23
+ });
24
+ const response = await s3Client.send(listCommand);
25
+ const directories = (response.CommonPrefixes ?? []).map(({ Prefix }) => Prefix);
26
+ const files = (response.Contents ?? []).map(({ Key, Size }) => ({ file: Key, size: Size }));
27
+ const truncated = response.IsTruncated ?? false;
28
+ const representatives = getRepresentativeFiles(files);
29
+ const filesWithSchema = await Promise.all(
30
+ files.map((fileObj) => maybeAddSchema(s3Client, resolvedBucket, representatives, fileObj)),
31
+ );
32
+
33
+ return {
34
+ content: [
35
+ { type: 'text', text: JSON.stringify({ directories, files: filesWithSchema, truncated }, bigintReplacer) },
36
+ ],
37
+ };
38
+ }
39
+
40
+ /** Helpers */
41
+
42
+ async function addSchema(s3Client, bucket, { file, size }) {
43
+ if (!file.endsWith('.parquet')) return { file, size };
44
+ const columns = await readParquetColumns(s3Client, bucket, file).catch(() => null);
45
+ return { file, size, columns };
46
+ }
47
+
48
+ function maybeAddSchema(s3Client, bucket, representatives, fileObj) {
49
+ if (representatives.has(fileObj.file)) return addSchema(s3Client, bucket, fileObj);
50
+ return Promise.resolve(fileObj);
51
+ }
52
+
53
+ function getRepresentativeFiles(files) {
54
+ const parquetFiles = files.filter(({ file }) => file.endsWith('.parquet'));
55
+ const dirMap = parquetFiles.reduce(addFirstFilePerDir, new Map());
56
+ return new Set(dirMap.values());
57
+ }
58
+
59
+ function addFirstFilePerDir(acc, { file }) {
60
+ const dir = file.substring(0, file.lastIndexOf('/'));
61
+ if (!acc.has(dir)) acc.set(dir, file);
62
+ return acc;
63
+ }
@@ -0,0 +1,34 @@
1
+ import s3Querier, { bigintReplacer } from '../../s3-querier.js';
2
+
3
+ const {
4
+ S3_ACCESS_KEY_ID,
5
+ S3_SECRET_ACCESS_KEY,
6
+ S3_API_KEY,
7
+ S3_ENDPOINT,
8
+ S3_BUCKET,
9
+ S3_BUCKETS_DIR = '/tmp/s3-querier',
10
+ } = process.env;
11
+
12
+ export async function handleQuery({ sql, from, to, endpoint, bucket }) {
13
+ const fromMs = from ? new Date(from).getTime() : undefined;
14
+ const toMs = to ? new Date(to).getTime() : undefined;
15
+ const resolvedEndpoint = endpoint || S3_ENDPOINT;
16
+ const resolvedBucket = bucket || S3_BUCKET;
17
+
18
+ const results = await s3Querier({
19
+ query: sql,
20
+ from: fromMs,
21
+ to: toMs,
22
+ defaultEndpoint: resolvedEndpoint,
23
+ defaultBucket: resolvedBucket,
24
+ bucketsDir: S3_BUCKETS_DIR,
25
+ apiKey: S3_API_KEY,
26
+ accessKeyId: S3_ACCESS_KEY_ID,
27
+ secretAccessKey: S3_SECRET_ACCESS_KEY,
28
+ format: 'jsonRecords',
29
+ });
30
+
31
+ return {
32
+ content: [{ type: 'text', text: JSON.stringify(results, bigintReplacer) }],
33
+ };
34
+ }
@@ -0,0 +1,119 @@
1
+ import { readFileSync } from 'node:fs';
2
+ import { McpServer } from '@modelcontextprotocol/sdk/server/mcp.js';
3
+ import { StdioServerTransport } from '@modelcontextprotocol/sdk/server/stdio.js';
4
+ import { z } from 'zod';
5
+ import { handleListFiles } from './handlers/list-files.js';
6
+ import { handleQuery } from './handlers/query.js';
7
+
8
+ const pkg = JSON.parse(readFileSync(new URL('../../package.json', import.meta.url), 'utf8'));
9
+
10
+ const toolDescription = readFileSync(new URL('./descriptions/tool.md', import.meta.url), 'utf8');
11
+ const sqlDescription = readFileSync(new URL('./descriptions/sql-param.md', import.meta.url), 'utf8');
12
+ const listFilesTemplate = readFileSync(new URL('./descriptions/list-files.md', import.meta.url), 'utf8');
13
+ const docsContent = readFileSync(new URL('../../docs/s3-querier.md', import.meta.url), 'utf8');
14
+
15
+ const QUERY_TOOL_SCHEMA = {
16
+ sql: z.string().describe(sqlDescription),
17
+ from: z
18
+ .string()
19
+ .optional()
20
+ .describe('Start of date range as ISO 8601 (e.g. "2025-01-01"). Required when the query uses date tokens.'),
21
+ to: z
22
+ .string()
23
+ .optional()
24
+ .describe('End of date range as ISO 8601 (e.g. "2025-01-31"). Required when the query uses date tokens.'),
25
+ endpoint: z.string().optional().describe('S3 endpoint URL. Overrides S3_ENDPOINT for this query.'),
26
+ bucket: z.string().optional().describe('S3 bucket name. Overrides S3_BUCKET for this query.'),
27
+ };
28
+
29
+ const LIST_FILES_TOOL_SCHEMA = {
30
+ prefix: z
31
+ .string()
32
+ .optional()
33
+ .describe('Path prefix to list under (e.g. "sales/" or ""). Defaults to empty string to list all files.'),
34
+ maxResults: z
35
+ .number()
36
+ .int()
37
+ .min(1)
38
+ .max(1000)
39
+ .optional()
40
+ .describe('Maximum number of files to return (default 100). Increase if the response is truncated.'),
41
+ endpoint: z.string().optional().describe('S3 endpoint URL. Overrides S3_ENDPOINT for this call.'),
42
+ bucket: z.string().optional().describe('S3 bucket name. Overrides S3_BUCKET for this call.'),
43
+ };
44
+
45
+ const DOCS_RESOURCE = {
46
+ title: 'S3 Querier Documentation',
47
+ description: 'Full documentation: query planning, file tokens, location tokens, and examples.',
48
+ mimeType: 'text/markdown',
49
+ };
50
+
51
+ export class S3QuerierMCP {
52
+ constructor(config = {}) {
53
+ this.config = config;
54
+ }
55
+
56
+ async start() {
57
+ const server = new McpServer({ name: 's3-querier', version: pkg.version });
58
+ const transport = new StdioServerTransport();
59
+ const listFilesDescription = buildListFilesDescription(this.config);
60
+ const enrichedToolDescription = buildToolDescription(this.config);
61
+
62
+ server.registerResource('s3-querier-docs', 's3-querier://docs', DOCS_RESOURCE, serveDocsHandler);
63
+ server.registerTool(
64
+ 'list_files',
65
+ { description: listFilesDescription, inputSchema: LIST_FILES_TOOL_SCHEMA },
66
+ handleListFiles,
67
+ );
68
+ server.registerTool('query', { description: enrichedToolDescription, inputSchema: QUERY_TOOL_SCHEMA }, handleQuery);
69
+ (this.config.tools ?? []).forEach(({ name, description, inputSchema, handler }) => {
70
+ server.registerTool(name, { description, inputSchema }, handler);
71
+ });
72
+
73
+ await server.connect(transport);
74
+ }
75
+ }
76
+
77
+ /** Helpers */
78
+
79
+ function serveDocsHandler(uri) {
80
+ return { contents: [{ uri: uri.href, text: docsContent, mimeType: 'text/markdown' }] };
81
+ }
82
+
83
+ function buildListFilesDescription(config) {
84
+ const today = new Date().toISOString().slice(0, 10);
85
+ const withDate = listFilesTemplate.replace('{{TODAY}}', today);
86
+ const datasetContext = buildDatasetContext(config.datasets);
87
+ return datasetContext ? `${withDate}\n${datasetContext}` : withDate;
88
+ }
89
+
90
+ function buildToolDescription(config) {
91
+ const datasetContext = buildDatasetContext(config.datasets);
92
+ return datasetContext ? `${toolDescription}\n\n${datasetContext}` : toolDescription;
93
+ }
94
+
95
+ function buildDatasetContext(datasets) {
96
+ if (!datasets?.length) return '';
97
+ const datasetLines = datasets.flatMap(formatDataset);
98
+ return ['CONFIGURED DATASETS', '', ...datasetLines].join('\n');
99
+ }
100
+
101
+ function formatDataset({ name, description, bucket, endpoint, prefix, partitioning, files }) {
102
+ const header = description ? `${name} — ${description}` : name;
103
+ const lines = [header];
104
+ if (bucket) lines.push(` Bucket: ${bucket}`);
105
+ if (endpoint) lines.push(` Endpoint: ${endpoint}`);
106
+ if (prefix) lines.push(` Prefix: ${prefix}`);
107
+ if (partitioning) lines.push(` Partitioning: ${partitioning}`);
108
+ if (files) {
109
+ const fileLines = Object.entries(files).map(formatFileLine);
110
+ lines.push(' Files:', ...fileLines);
111
+ }
112
+ lines.push('');
113
+ return lines;
114
+ }
115
+
116
+ function formatFileLine([fileName, { description: fileDesc }]) {
117
+ const label = fileDesc ? `${fileName} — ${fileDesc}` : fileName;
118
+ return ` ${label}`;
119
+ }
@@ -0,0 +1,5 @@
1
+ #!/usr/bin/env node
2
+
3
+ import { S3QuerierMCP } from './s3querier-mcp.js';
4
+
5
+ await new S3QuerierMCP().start();
@@ -0,0 +1,33 @@
1
+ import { GetObjectCommand } from '@aws-sdk/client-s3';
2
+ import { parquetMetadata, parquetSchema } from 'hyparquet';
3
+
4
+ /**
5
+ * Reads the column names from a parquet file using two ranged S3 GET requests
6
+ * (footer only — no full file download regardless of file size).
7
+ *
8
+ * @param {import('@aws-sdk/client-s3').S3Client} s3Client
9
+ * @param {string} bucket
10
+ * @param {string} key
11
+ * @returns {Promise<string[]>} Column names
12
+ */
13
+ export async function readParquetColumns(s3Client, bucket, key) {
14
+ const tail = await fetchRange(s3Client, bucket, key, 'bytes=-8');
15
+ const footerLength = tail.readUInt32LE(0);
16
+ const footerBuffer = await fetchRange(s3Client, bucket, key, `bytes=-${footerLength + 8}`);
17
+ const { buffer, byteOffset, byteLength } = footerBuffer;
18
+ const arrayBuffer = buffer.slice(byteOffset, byteOffset + byteLength);
19
+ const metadata = parquetMetadata(arrayBuffer);
20
+ const schema = parquetSchema(metadata);
21
+ return schema.children.map((col) => col.element.name);
22
+ }
23
+
24
+ /** Helpers */
25
+
26
+ async function fetchRange(s3Client, bucket, key, range) {
27
+ const response = await s3Client.send(new GetObjectCommand({ Bucket: bucket, Key: key, Range: range }));
28
+ const chunks = [];
29
+ for await (const chunk of response.Body) {
30
+ chunks.push(chunk);
31
+ }
32
+ return Buffer.concat(chunks);
33
+ }