s3-querier 1.1.1 → 1.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -161,7 +161,7 @@ const results = await s3Querier({
161
161
 
162
162
  ## MCP Server
163
163
 
164
- s3-querier ships a [Model Context Protocol](https://modelcontextprotocol.io/) server that exposes two tools to any MCP-compatible client (Claude Desktop, Claude Code, etc.):
164
+ s3-querier ships a [Model Context Protocol](https://modelcontextprotocol.io/) server that exposes two tools to any MCP-compatible client (Claude Desktop, Claude Code, IBM Bob, etc.):
165
165
 
166
166
  - **`query`** — runs a DuckDB SQL query against your S3 data
167
167
  - **`list_files`** — lists objects under a prefix so an LLM can discover available data
@@ -191,7 +191,7 @@ claude mcp add s3-querier \
191
191
  -e S3_BUCKET=my-bucket \
192
192
  -e S3_ACCESS_KEY_ID=key \
193
193
  -e S3_SECRET_ACCESS_KEY=secret \
194
- -- node node_modules/s3-querier/src/mcp/server.js
194
+ -- npx -y s3-querier
195
195
  ```
196
196
 
197
197
  **IBM Bob**
@@ -202,8 +202,8 @@ Add to `mcp_settings.json` (global, applies across all workspaces) or `.Bob/mcp.
202
202
  {
203
203
  "mcpServers": {
204
204
  "s3-querier": {
205
- "command": "node",
206
- "args": ["/absolute/path/to/node_modules/s3-querier/src/mcp/server.js"],
205
+ "command": "npx",
206
+ "args": ["-y", "s3-querier"],
207
207
  "env": {
208
208
  "S3_ENDPOINT": "https://s3.amazonaws.com",
209
209
  "S3_BUCKET": "my-bucket",
@@ -298,6 +298,22 @@ claude mcp add my-datalake \
298
298
  | `endpoint` | Overrides `S3_ENDPOINT` for this dataset |
299
299
  | `files` | Map of logical file names to `{ description }` |
300
300
 
301
+ #### Server instructions
302
+
303
+ By default, `S3QuerierMCP` sends step-by-step workflow instructions to the LLM at connection time. When datasets are configured, the default guides the LLM to read the datasets resource, inspect schemas, and use date tokens correctly. Without datasets, it guides discovery via `list_files`.
304
+
305
+ | Option | Description |
306
+ | --- | --- |
307
+ | `additionalInstructions` | Appended to the active instructions — the defaults, or your custom `instructions` if that option is set. Use this to add project-specific guidance, e.g. a preferred lookback window for "latest" queries. |
308
+ | `instructions` | Replaces the default instructions entirely. |
309
+
310
+ ```js
311
+ new S3QuerierMCP({
312
+ datasets: [ /* ... */ ],
313
+ additionalInstructions: 'Data is updated hourly. For recent data, set `from` to 2 hours before the current time and `to` to the current time.',
314
+ }).start();
315
+ ```
316
+
301
317
  ### Adding custom tools
302
318
 
303
319
  Pass a `tools` array to register additional MCP tools alongside the built-in ones:
@@ -26,7 +26,7 @@ When querying data from a data lake, be mindful of how your queries are construc
26
26
  [Install the DuckDB CLI](https://duckdb.org/docs/installation/?version=stable&environment=cli&platform=macos&download_method=direct) and experiment with your queries on local parquet files before running them against S3. This gives a fast feedback loop for understanding data structure and refining queries.
27
27
 
28
28
  - **Be Mindful Of Time Ranges In Date Tokens**
29
- Long time ranges require fetching more files and slow execution. Use narrow time ranges whenever possible.
29
+ Long time ranges require fetching more files, which slows execution. Use narrow time ranges whenever possible. If a query fails due to the 1GB limit, narrow the `from`/`to` range and run multiple queries — for example, query 4–6 hours at a time instead of a full day.
30
30
 
31
31
  - **Create Secondary Representations Of Your Data**
32
32
  For larger datasets, break files into smaller chunks to avoid hitting the file size limit.
@@ -151,7 +151,22 @@ FROM read_parquet('jobs_failed/window=202308032130/*.parquet', union_by_name=1);
151
151
  ```
152
152
 
153
153
  > [!WARNING]
154
- > **Use globs with caution.** They can match more files than expected, causing unnecessary downloads and degraded performance. Always verify the file list your glob will match before running a broad query.
154
+ > **Never use globs on time-partitioned folder segments** (e.g. `year=*/`, `month=*/`, `hour=*/`). A folder-level glob matches every partition and will over-fetch, hitting the 1GB query limit.
155
+ >
156
+ > Use [time formatting tokens](#time-formatting-tokens) with `from`/`to` instead — they expand to exactly the partitions needed:
157
+ >
158
+ > ```sql
159
+ > -- ❌ WRONG — hour=*/ fetches every hour
160
+ > SELECT * FROM read_parquet('data/year=2026/month=06/day=15/hour=*/file.parquet');
161
+ >
162
+ > -- ✅ CORRECT — {hh} with from/to fetches only the requested hours
163
+ > SELECT * FROM read_parquet('data/year={yyyy}/month={MM}/day={dd}/hour={hh}/file.parquet', union_by_name=1);
164
+ > ```
165
+ >
166
+ > Globs are appropriate only for **file name patterns within a known folder**, or for non-time path segments. They can be combined with time tokens — tokens on the folder segments, glob on the filename:
167
+ > ```sql
168
+ > SELECT * FROM read_parquet('data/year={yyyy}/month={MM}/day={dd}/hour={hh}/records_*.parquet', union_by_name=1);
169
+ > ```
155
170
 
156
171
  ---
157
172
 
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "s3-querier",
3
- "version": "1.1.1",
3
+ "version": "1.2.0",
4
4
  "description": "Query S3-compatible storage with DuckDB and SQL",
5
5
  "type": "module",
6
6
  "main": "src/s3-querier.js",
@@ -1,5 +1,7 @@
1
1
  A DuckDB SQL query. File paths inside read_parquet() or read_csv() are resolved against S3
2
- and downloaded before the query runs.
2
+ and downloaded before the query runs. Paths are relative to the bucket root — do not use
3
+ s3://, s3a://, or any protocol prefix. To reference a non-default bucket, use the
4
+ {bucket:name} token instead.
3
5
 
4
6
  REQUIRED: always call read_parquet() or read_csv() — plain table names are not supported.
5
7
  Prefer union_by_name=1 when reading multiple files.
@@ -22,8 +24,33 @@ Location tokens — override endpoint or bucket per path:
22
24
  {endpoint:https://s3.example.com}
23
25
  {bucket:my-bucket}
24
26
 
25
- Glob syntax — wildcard matching for non-time path segments:
26
- jobs/window=202308032130/*.parquet
27
+ QUERYING TIME-PARTITIONED DATA
28
+
29
+ Always use date tokens for time-partitioned paths — even for a single snapshot. Tokens keep
30
+ the partitioning structure explicit and make the query work correctly if the time range
31
+ changes. Only hardcode a date when the file is genuinely static (not part of any
32
+ time-partition scheme).
33
+
34
+ Use date tokens in the SQL with `from`/`to` as separate parameters. ONE query with tokens
35
+ downloads all matching files across the range — do not make multiple tool calls with
36
+ hardcoded dates.
37
+
38
+ ✗ WRONG — hardcoded date in path (even for a single hour):
39
+ sql: SELECT * FROM read_parquet('events/year=2026/month=06/day=15/hour=14/data.parquet')
40
+
41
+ ✗ WRONG — multiple tool calls with hardcoded hours:
42
+ sql: SELECT * FROM read_parquet('events/year=2026/month=06/day=15/hour=12/data.parquet')
43
+ sql: SELECT * FROM read_parquet('events/year=2026/month=06/day=15/hour=13/data.parquet')
44
+ sql: SELECT * FROM read_parquet('events/year=2026/month=06/day=15/hour=14/data.parquet')
45
+
46
+ ✓ CORRECT — one tool call, tokens expand across all hours in the range:
47
+ sql: SELECT * FROM read_parquet('events/year={yyyy}/month={MM}/day={dd}/hour={hh}/data.parquet', union_by_name=1)
48
+ from: 2026-06-15T12:00:00Z
49
+ to: 2026-06-15T14:59:59Z
50
+
51
+ Tokens also expand inside the filename, not just in path directory segments:
52
+ data/year={yyyy}/month={MM}/day={dd}/hour={hh}/file_{yyyy}{MM}{dd}{hh}00.parquet
53
+ → s3-querier downloads one file per hour in the from/to range.
27
54
 
28
55
  HIVE-PARTITIONED DATA
29
56
 
@@ -41,6 +68,11 @@ EXAMPLES
41
68
  Single file:
42
69
  SELECT * FROM read_parquet('reports/summary.parquet') LIMIT 10
43
70
 
71
+ Hour-partitioned files — tokens in path and filename (requires from/to):
72
+ SELECT * FROM read_parquet(
73
+ 'events/year={yyyy}/month={MM}/day={dd}/hour={hh}/file_{yyyy}{MM}{dd}{hh}00.parquet',
74
+ union_by_name=1)
75
+
44
76
  Day-partitioned files (requires from/to):
45
77
  SELECT id FROM read_parquet('events/year={yyyy}/month={MM}/day={dd}/data.parquet', union_by_name=1)
46
78
 
@@ -51,3 +83,17 @@ Cross-endpoint join:
51
83
  WITH east AS (SELECT id FROM read_parquet('{endpoint:https://s3.us-east.example.com}/{bucket:logs}/data/{yyyy}{MM}{dd}.parquet'))
52
84
  SELECT * FROM read_parquet('{endpoint:https://s3.eu-west.example.com}/{bucket:logs}/data/{yyyy}{MM}{dd}.parquet') AS west
53
85
  JOIN east ON west.id = east.id
86
+
87
+ GLOB SYNTAX (last resort — filename patterns only)
88
+
89
+ Globs match non-time file name segments within a known folder:
90
+ jobs/window=202308032130/*.parquet
91
+
92
+ Do NOT use globs on time-partitioned folder segments (year=, month=, day=, hour=, etc.).
93
+ A folder-level glob like hour=*/ matches every hour and causes massive over-fetching.
94
+ Use date tokens with from/to instead — they expand to exactly the hours/days needed:
95
+ ✗ data/year=2026/month=06/day=15/hour=*/file.parquet
96
+ ✓ data/year={yyyy}/month={MM}/day={dd}/hour={hh}/file.parquet (with from/to)
97
+
98
+ Tokens and globs can be combined — tokens on folder segments, glob on the filename:
99
+ data/year={yyyy}/month={MM}/day={dd}/hour={hh}/records_*.parquet
@@ -4,5 +4,18 @@ Queries must use DuckDB table functions such as read_parquet() or read_csv() wit
4
4
  that reference objects in S3. S3 Querier resolves those paths, downloads the matching files,
5
5
  and runs the query locally with DuckDB.
6
6
 
7
- Use the `s3-querier://docs` resource for full documentation including token syntax, examples,
8
- and query planning tips.
7
+ TIME-PARTITIONED DATA: use date tokens ({yyyy}, {MM}, {dd}, {hh}, {mm}) in file paths
8
+ together with the `from` and `to` parameters to query a time range. ONE query with tokens
9
+ fetches all matching files across the range — never make multiple calls with hardcoded dates.
10
+
11
+ sql: SELECT * FROM read_parquet('data/year={yyyy}/month={MM}/day={dd}/hour={hh}/file.parquet', union_by_name=1)
12
+ from: 2026-06-15T12:00:00Z
13
+ to: 2026-06-15T19:59:59Z
14
+
15
+ CURRENT TIME: If the query involves "now", "recent", "latest", or a relative time range,
16
+ call `get_current_time` first to get the accurate current UTC time. Do not rely on training
17
+ knowledge to guess the current date or time. This does not apply to static file queries or
18
+ queries for a specific known date range.
19
+
20
+ Read the `s3-querier://docs` resource or the `sql` parameter description for full token
21
+ syntax, examples, and query planning tips before writing your first query.
@@ -0,0 +1,17 @@
1
+ export default class BaseResource {
2
+ constructor(config) {
3
+ this.config = config;
4
+ }
5
+
6
+ isEnabled() {
7
+ return true;
8
+ }
9
+
10
+ getMeta() {
11
+ throw new Error('Resources must implement getMeta()');
12
+ }
13
+
14
+ handler() {
15
+ throw new Error('Resources must implement handler()');
16
+ }
17
+ }
@@ -0,0 +1,24 @@
1
+ import BaseResource from '../base-resource.js';
2
+ import { buildDatasetContext } from '../../utils/utils.js';
3
+
4
+ export default class S3QuerierDatasetsResource extends BaseResource {
5
+ name = 's3-querier-datasets';
6
+ uri = 's3-querier://datasets';
7
+
8
+ isEnabled() {
9
+ return !!this.config.datasets?.length;
10
+ }
11
+
12
+ getMeta() {
13
+ return {
14
+ title: 'Configured Datasets',
15
+ description: 'Available datasets: bucket, prefix, file path template, partitioning, and resource types.',
16
+ mimeType: 'text/plain',
17
+ };
18
+ }
19
+
20
+ handler(uri) {
21
+ const text = buildDatasetContext(this.config.datasets);
22
+ return { contents: [{ uri: uri.href, text, mimeType: 'text/plain' }] };
23
+ }
24
+ }
@@ -0,0 +1,22 @@
1
+ import { readFileSync } from 'node:fs';
2
+
3
+ import BaseResource from '../base-resource.js';
4
+
5
+ const docsContent = readFileSync(new URL('../../../../docs/s3-querier.md', import.meta.url), 'utf8');
6
+
7
+ export default class S3QuerierDocsResource extends BaseResource {
8
+ name = 's3-querier-docs';
9
+ uri = 's3-querier://docs';
10
+
11
+ getMeta() {
12
+ return {
13
+ title: 'S3 Querier Documentation',
14
+ description: 'Full documentation: query planning, file tokens, location tokens, and examples.',
15
+ mimeType: 'text/markdown',
16
+ };
17
+ }
18
+
19
+ handler(uri) {
20
+ return { contents: [{ uri: uri.href, text: docsContent, mimeType: 'text/markdown' }] };
21
+ }
22
+ }
@@ -1,71 +1,56 @@
1
1
  import { readFileSync } from 'node:fs';
2
2
  import { McpServer } from '@modelcontextprotocol/sdk/server/mcp.js';
3
3
  import { StdioServerTransport } from '@modelcontextprotocol/sdk/server/stdio.js';
4
- import { z } from 'zod';
5
- import { handleListFiles } from './handlers/list-files.js';
6
- import { handleQuery } from './handlers/query.js';
7
4
 
8
- const pkg = JSON.parse(readFileSync(new URL('../../package.json', import.meta.url), 'utf8'));
5
+ import QueryTool from './tools/query/query.js';
6
+ import ListFilesTool from './tools/list-files/list-files.js';
7
+ import CurentTimeTool from './tools/current-time/current-time.js';
9
8
 
10
- const toolDescription = readFileSync(new URL('./descriptions/tool.md', import.meta.url), 'utf8');
11
- const sqlDescription = readFileSync(new URL('./descriptions/sql-param.md', import.meta.url), 'utf8');
12
- const listFilesTemplate = readFileSync(new URL('./descriptions/list-files.md', import.meta.url), 'utf8');
13
- const docsContent = readFileSync(new URL('../../docs/s3-querier.md', import.meta.url), 'utf8');
9
+ import S3QuerierDocsResource from './resources/s3-querier-docs/s3-querier-docs.js';
10
+ import S3QuerierDatasetsResource from './resources/s3-querier-datasets/s3-querier-datasets.js';
14
11
 
15
- const QUERY_TOOL_SCHEMA = {
16
- sql: z.string().describe(sqlDescription),
17
- from: z
18
- .string()
19
- .optional()
20
- .describe('Start of date range as ISO 8601 (e.g. "2025-01-01"). Required when the query uses date tokens.'),
21
- to: z
22
- .string()
23
- .optional()
24
- .describe('End of date range as ISO 8601 (e.g. "2025-01-31"). Required when the query uses date tokens.'),
25
- endpoint: z.string().optional().describe('S3 endpoint URL. Overrides S3_ENDPOINT for this query.'),
26
- bucket: z.string().optional().describe('S3 bucket name. Overrides S3_BUCKET for this query.'),
27
- };
12
+ const pkg = JSON.parse(readFileSync(new URL('../../package.json', import.meta.url), 'utf8'));
28
13
 
29
- const LIST_FILES_TOOL_SCHEMA = {
30
- prefix: z
31
- .string()
32
- .optional()
33
- .describe('Path prefix to list under (e.g. "sales/" or ""). Defaults to empty string to list all files.'),
34
- maxResults: z
35
- .number()
36
- .int()
37
- .min(1)
38
- .max(1000)
39
- .optional()
40
- .describe('Maximum number of files to return (default 100). Increase if the response is truncated.'),
41
- endpoint: z.string().optional().describe('S3 endpoint URL. Overrides S3_ENDPOINT for this call.'),
42
- bucket: z.string().optional().describe('S3 bucket name. Overrides S3_BUCKET for this call.'),
43
- };
14
+ const DEFAULT_INSTRUCTIONS = `
15
+ Step 1: Use list_files to discover what data is available under a prefix.
16
+ Step 2: Check the columns field in the list_files response — if present, use those column names. Otherwise run SELECT * FROM read_parquet('path') LIMIT 1 to inspect the schema.
17
+ Step 3: For time-partitioned data, call get_current_time to get the current UTC time, then query with the appropriate from/to range.
18
+ Step 4: Query using the correct file paths discovered in Step 1.
19
+ `.trim();
44
20
 
45
- const DOCS_RESOURCE = {
46
- title: 'S3 Querier Documentation',
47
- description: 'Full documentation: query planning, file tokens, location tokens, and examples.',
48
- mimeType: 'text/markdown',
49
- };
21
+ const DATASETS_INSTRUCTIONS = `
22
+ Step 1: Read the s3-querier://datasets resource to see available datasets and their S3 paths.
23
+ Step 2: Review the datasets to identify which are relevant to the request.
24
+ Step 3: Never guess column names. Run SELECT * FROM read_parquet('full_path') LIMIT 1 on each relevant file to inspect the schema — for time-partitioned paths, call get_current_time first to construct a valid path.
25
+ Step 4: Query the relevant datasets directly — do not use list_files to explore the bucket.
26
+ `.trim();
50
27
 
51
28
  export class S3QuerierMCP {
52
29
  constructor(config = {}) {
53
30
  this.config = config;
31
+ this.toolClasses = [QueryTool, ListFilesTool, CurentTimeTool];
32
+ this.resourceClasses = [S3QuerierDocsResource, S3QuerierDatasetsResource];
54
33
  }
55
34
 
56
35
  async start() {
57
- const server = new McpServer({ name: 's3-querier', version: pkg.version });
36
+ const server = new McpServer({
37
+ name: 's3-querier',
38
+ version: pkg.version,
39
+ instructions: buildInstructions(this.config),
40
+ });
58
41
  const transport = new StdioServerTransport();
59
- const listFilesDescription = buildListFilesDescription(this.config);
60
- const enrichedToolDescription = buildToolDescription(this.config);
61
42
 
62
- server.registerResource('s3-querier-docs', 's3-querier://docs', DOCS_RESOURCE, serveDocsHandler);
63
- server.registerTool(
64
- 'list_files',
65
- { description: listFilesDescription, inputSchema: LIST_FILES_TOOL_SCHEMA },
66
- handleListFiles,
67
- );
68
- server.registerTool('query', { description: enrichedToolDescription, inputSchema: QUERY_TOOL_SCHEMA }, handleQuery);
43
+ this.resourceClasses.forEach((ResourceClass) => {
44
+ const resource = new ResourceClass(this.config);
45
+ if (!resource.isEnabled()) return;
46
+ server.registerResource(resource.name, resource.uri, resource.getMeta(), resource.handler.bind(resource));
47
+ });
48
+
49
+ this.toolClasses.forEach((ToolClass) => {
50
+ const tool = new ToolClass(this.config);
51
+ server.registerTool(tool.name, tool.getConfig(), tool.handler.bind(tool));
52
+ });
53
+
69
54
  (this.config.tools ?? []).forEach(({ name, description, inputSchema, handler }) => {
70
55
  server.registerTool(name, { description, inputSchema }, handler);
71
56
  });
@@ -74,46 +59,8 @@ export class S3QuerierMCP {
74
59
  }
75
60
  }
76
61
 
77
- /** Helpers */
78
-
79
- function serveDocsHandler(uri) {
80
- return { contents: [{ uri: uri.href, text: docsContent, mimeType: 'text/markdown' }] };
81
- }
82
-
83
- function buildListFilesDescription(config) {
84
- const today = new Date().toISOString().slice(0, 10);
85
- const withDate = listFilesTemplate.replace('{{TODAY}}', today);
86
- const datasetContext = buildDatasetContext(config.datasets);
87
- return datasetContext ? `${withDate}\n${datasetContext}` : withDate;
88
- }
89
-
90
- function buildToolDescription(config) {
91
- const datasetContext = buildDatasetContext(config.datasets);
92
- return datasetContext ? `${toolDescription}\n\n${datasetContext}` : toolDescription;
93
- }
94
-
95
- function buildDatasetContext(datasets) {
96
- if (!datasets?.length) return '';
97
- const datasetLines = datasets.flatMap(formatDataset);
98
- return ['CONFIGURED DATASETS', '', ...datasetLines].join('\n');
99
- }
100
-
101
- function formatDataset({ name, description, bucket, endpoint, prefix, partitioning, files }) {
102
- const header = description ? `${name} — ${description}` : name;
103
- const lines = [header];
104
- if (bucket) lines.push(` Bucket: ${bucket}`);
105
- if (endpoint) lines.push(` Endpoint: ${endpoint}`);
106
- if (prefix) lines.push(` Prefix: ${prefix}`);
107
- if (partitioning) lines.push(` Partitioning: ${partitioning}`);
108
- if (files) {
109
- const fileLines = Object.entries(files).map(formatFileLine);
110
- lines.push(' Files:', ...fileLines);
111
- }
112
- lines.push('');
113
- return lines;
114
- }
115
-
116
- function formatFileLine([fileName, { description: fileDesc }]) {
117
- const label = fileDesc ? `${fileName} — ${fileDesc}` : fileName;
118
- return ` ${label}`;
62
+ function buildInstructions(config) {
63
+ const base = config.instructions ?? (config.datasets?.length ? DATASETS_INSTRUCTIONS : DEFAULT_INSTRUCTIONS);
64
+ if (config.additionalInstructions) return `${base}\n\n${config.additionalInstructions}`;
65
+ return base;
119
66
  }
@@ -0,0 +1,13 @@
1
+ export default class BaseTool {
2
+ constructor(config) {
3
+ this.config = config;
4
+ }
5
+
6
+ getConfig() {
7
+ throw new Error('Tools must implement getConfig()');
8
+ }
9
+
10
+ handler() {
11
+ throw new Error('Tools must implement handler()');
12
+ }
13
+ }
@@ -0,0 +1,20 @@
1
+ import BaseTool from '../base-tool.js';
2
+
3
+ export default class CurentTimeTool extends BaseTool {
4
+ name = 'get_current_time';
5
+
6
+ getConfig() {
7
+ return {
8
+ description:
9
+ 'Returns the current UTC time as an ISO 8601 timestamp. ' +
10
+ 'Call this before constructing time-partitioned queries that involve "now", "recent", or a relative time range. ' +
11
+ 'Use the returned time as `to` and set `from` based on how frequently the data is updated — e.g. 1 hour back for hourly data, 1 day back for daily data. ' +
12
+ 'Not needed for static file queries or queries for a specific known date range.',
13
+ inputSchema: {},
14
+ };
15
+ }
16
+
17
+ handler() {
18
+ return { content: [{ type: 'text', text: new Date().toISOString() }] };
19
+ }
20
+ }
@@ -0,0 +1,94 @@
1
+ import { readFileSync } from 'node:fs';
2
+ import { z } from 'zod';
3
+
4
+ import BaseTool from '../base-tool.js';
5
+ import { ListObjectsV2Command } from '@aws-sdk/client-s3';
6
+ import { bigintReplacer } from '../../../utils/bigint-replacer.js';
7
+ import { buildS3Client } from '../../../s3/s3.js';
8
+ import { readParquetColumns } from '../../../utils/parquet-schema-reader.js';
9
+ import { buildDatasetContext } from '../../utils/utils.js';
10
+
11
+ const { S3_ACCESS_KEY_ID, S3_SECRET_ACCESS_KEY, S3_API_KEY, S3_ENDPOINT, S3_BUCKET } = process.env;
12
+ const listFilesTemplate = readFileSync(new URL('../../descriptions/list-files.md', import.meta.url), 'utf8');
13
+
14
+ export default class ListFilesTool extends BaseTool {
15
+ name = 'list_files';
16
+
17
+ getConfig() {
18
+ const today = new Date().toISOString().slice(0, 10);
19
+ const withDate = listFilesTemplate.replace('{{TODAY}}', today);
20
+ const datasetContext = buildDatasetContext(this.config.datasets);
21
+ const description = datasetContext ? `${withDate}\n${datasetContext}` : withDate;
22
+
23
+ return {
24
+ description,
25
+ inputSchema: {
26
+ prefix: z
27
+ .string()
28
+ .optional()
29
+ .describe('Path prefix to list under (e.g. "sales/" or ""). Defaults to empty string to list all files.'),
30
+ maxResults: z
31
+ .number()
32
+ .int()
33
+ .min(1)
34
+ .max(1000)
35
+ .optional()
36
+ .describe('Maximum number of files to return (default 100). Increase if the response is truncated.'),
37
+ endpoint: z.string().optional().describe('S3 endpoint URL. Overrides S3_ENDPOINT for this call.'),
38
+ bucket: z.string().optional().describe('S3 bucket name. Overrides S3_BUCKET for this call.'),
39
+ },
40
+ };
41
+ }
42
+
43
+ async handler({ prefix = '', maxResults = 100, endpoint, bucket }) {
44
+ const resolvedEndpoint = endpoint || S3_ENDPOINT;
45
+ const resolvedBucket = bucket || S3_BUCKET;
46
+ const s3Client = buildS3Client({
47
+ apiKey: S3_API_KEY,
48
+ accessKeyId: S3_ACCESS_KEY_ID,
49
+ secretAccessKey: S3_SECRET_ACCESS_KEY,
50
+ endpoint: resolvedEndpoint,
51
+ });
52
+ const response = await s3Client.send(
53
+ new ListObjectsV2Command({ Bucket: resolvedBucket, Prefix: prefix, MaxKeys: maxResults, Delimiter: '/' }),
54
+ );
55
+ const directories = (response.CommonPrefixes ?? []).map(({ Prefix }) => Prefix);
56
+ const files = (response.Contents ?? []).map(({ Key, Size }) => ({ file: Key, size: Size }));
57
+ const truncated = response.IsTruncated ?? false;
58
+ const representatives = getRepresentativeFiles(files);
59
+ const filesWithSchema = await Promise.all(
60
+ files.map((fileObj) => maybeAddSchema(s3Client, resolvedBucket, representatives, fileObj)),
61
+ );
62
+
63
+ return {
64
+ content: [
65
+ { type: 'text', text: JSON.stringify({ directories, files: filesWithSchema, truncated }, bigintReplacer) },
66
+ ],
67
+ };
68
+ }
69
+ }
70
+
71
+ /** Helpers */
72
+
73
+ function getRepresentativeFiles(files) {
74
+ const parquetFiles = files.filter(({ file }) => file.endsWith('.parquet'));
75
+ const dirMap = parquetFiles.reduce(addFirstFilePerDir, new Map());
76
+ return new Set(dirMap.values());
77
+ }
78
+
79
+ function addFirstFilePerDir(acc, { file }) {
80
+ const dir = file.substring(0, file.lastIndexOf('/'));
81
+ if (!acc.has(dir)) acc.set(dir, file);
82
+ return acc;
83
+ }
84
+
85
+ function maybeAddSchema(s3Client, bucket, representatives, fileObj) {
86
+ if (representatives.has(fileObj.file)) return addSchema(s3Client, bucket, fileObj);
87
+ return Promise.resolve(fileObj);
88
+ }
89
+
90
+ async function addSchema(s3Client, bucket, { file, size }) {
91
+ if (!file.endsWith('.parquet')) return { file, size };
92
+ const columns = await readParquetColumns(s3Client, bucket, file).catch(() => null);
93
+ return { file, size, columns };
94
+ }
@@ -0,0 +1,68 @@
1
+ import { readFileSync } from 'node:fs';
2
+ import { z } from 'zod';
3
+
4
+ import BaseTool from '../base-tool.js';
5
+ import s3Querier, { bigintReplacer } from '../../../s3-querier.js';
6
+
7
+ const {
8
+ S3_ACCESS_KEY_ID,
9
+ S3_SECRET_ACCESS_KEY,
10
+ S3_API_KEY,
11
+ S3_ENDPOINT,
12
+ S3_BUCKET,
13
+ S3_BUCKETS_DIR = '/tmp/s3-querier',
14
+ } = process.env;
15
+
16
+ const sqlDescription = readFileSync(new URL('../../descriptions/sql-param.md', import.meta.url), 'utf8');
17
+ const toolDescription = readFileSync(new URL('../../descriptions/tool.md', import.meta.url), 'utf8');
18
+
19
+ export default class QueryTool extends BaseTool {
20
+ name = 'query';
21
+
22
+ getConfig() {
23
+ const description = this.config.datasets?.length
24
+ ? `${toolDescription}\n\nCONFIGURED DATASETS: read the \`s3-querier://datasets\` resource for available datasets, prefixes, file path templates, and resource types.`
25
+ : toolDescription;
26
+
27
+ return {
28
+ description,
29
+ inputSchema: {
30
+ sql: z.string().describe(sqlDescription),
31
+ from: z
32
+ .string()
33
+ .optional()
34
+ .describe('Start of date range as ISO 8601 (e.g. "2025-01-01"). Required when the query uses date tokens.'),
35
+ to: z
36
+ .string()
37
+ .optional()
38
+ .describe('End of date range as ISO 8601 (e.g. "2025-01-31"). Required when the query uses date tokens.'),
39
+ endpoint: z.string().optional().describe('S3 endpoint URL. Overrides S3_ENDPOINT for this query.'),
40
+ bucket: z.string().optional().describe('S3 bucket name. Overrides S3_BUCKET for this query.'),
41
+ },
42
+ };
43
+ }
44
+
45
+ async handler({ sql, from, to, endpoint, bucket }) {
46
+ const fromMs = from ? new Date(from).getTime() : undefined;
47
+ const toMs = to ? new Date(to).getTime() : undefined;
48
+ const resolvedEndpoint = endpoint || S3_ENDPOINT;
49
+ const resolvedBucket = bucket || S3_BUCKET;
50
+
51
+ const results = await s3Querier({
52
+ query: sql,
53
+ from: fromMs,
54
+ to: toMs,
55
+ defaultEndpoint: resolvedEndpoint,
56
+ defaultBucket: resolvedBucket,
57
+ bucketsDir: S3_BUCKETS_DIR,
58
+ apiKey: S3_API_KEY,
59
+ accessKeyId: S3_ACCESS_KEY_ID,
60
+ secretAccessKey: S3_SECRET_ACCESS_KEY,
61
+ format: 'jsonRecords',
62
+ });
63
+
64
+ return {
65
+ content: [{ type: 'text', text: JSON.stringify(results, bigintReplacer) }],
66
+ };
67
+ }
68
+ }
@@ -0,0 +1,26 @@
1
+ export function buildDatasetContext(datasets) {
2
+ if (!datasets?.length) return '';
3
+ const datasetLines = datasets.flatMap(formatDataset);
4
+ return ['CONFIGURED DATASETS', '', ...datasetLines].join('\n');
5
+ }
6
+
7
+ function formatDataset({ name, description, bucket, endpoint, prefix, filePathTemplate, partitioning, files }) {
8
+ const header = description ? `${name} — ${description}` : name;
9
+ const lines = [header];
10
+ if (bucket) lines.push(` Bucket: ${bucket}`);
11
+ if (endpoint) lines.push(` Endpoint: ${endpoint}`);
12
+ if (prefix && !filePathTemplate) lines.push(` Prefix: ${prefix}`);
13
+ if (prefix && filePathTemplate)
14
+ lines.push(` Full path: ${prefix}${filePathTemplate} ({file} = resource name from Files list)`);
15
+ if (partitioning) lines.push(` Partitioning: ${partitioning}`);
16
+ if (files) {
17
+ const fileLines = Object.entries(files).flatMap(formatFileLine);
18
+ lines.push(' Files:', ...fileLines);
19
+ }
20
+ lines.push('');
21
+ return lines;
22
+ }
23
+
24
+ function formatFileLine([fileName, { description: fileDesc }]) {
25
+ return [` ${fileDesc ? `${fileName} — ${fileDesc}` : fileName}`];
26
+ }
@@ -1,63 +0,0 @@
1
- import { ListObjectsV2Command } from '@aws-sdk/client-s3';
2
- import { bigintReplacer } from '../../s3-querier.js';
3
- import { buildS3Client } from '../../s3/s3.js';
4
- import { readParquetColumns } from '../../utils/parquet-schema-reader.js';
5
-
6
- const { S3_ACCESS_KEY_ID, S3_SECRET_ACCESS_KEY, S3_API_KEY, S3_ENDPOINT, S3_BUCKET } = process.env;
7
-
8
- export async function handleListFiles({ prefix = '', maxResults = 100, endpoint, bucket }) {
9
- const resolvedEndpoint = endpoint || S3_ENDPOINT;
10
- const resolvedBucket = bucket || S3_BUCKET;
11
- const clientConfig = {
12
- apiKey: S3_API_KEY,
13
- accessKeyId: S3_ACCESS_KEY_ID,
14
- secretAccessKey: S3_SECRET_ACCESS_KEY,
15
- endpoint: resolvedEndpoint,
16
- };
17
- const s3Client = buildS3Client(clientConfig);
18
- const listCommand = new ListObjectsV2Command({
19
- Bucket: resolvedBucket,
20
- Prefix: prefix,
21
- MaxKeys: maxResults,
22
- Delimiter: '/',
23
- });
24
- const response = await s3Client.send(listCommand);
25
- const directories = (response.CommonPrefixes ?? []).map(({ Prefix }) => Prefix);
26
- const files = (response.Contents ?? []).map(({ Key, Size }) => ({ file: Key, size: Size }));
27
- const truncated = response.IsTruncated ?? false;
28
- const representatives = getRepresentativeFiles(files);
29
- const filesWithSchema = await Promise.all(
30
- files.map((fileObj) => maybeAddSchema(s3Client, resolvedBucket, representatives, fileObj)),
31
- );
32
-
33
- return {
34
- content: [
35
- { type: 'text', text: JSON.stringify({ directories, files: filesWithSchema, truncated }, bigintReplacer) },
36
- ],
37
- };
38
- }
39
-
40
- /** Helpers */
41
-
42
- async function addSchema(s3Client, bucket, { file, size }) {
43
- if (!file.endsWith('.parquet')) return { file, size };
44
- const columns = await readParquetColumns(s3Client, bucket, file).catch(() => null);
45
- return { file, size, columns };
46
- }
47
-
48
- function maybeAddSchema(s3Client, bucket, representatives, fileObj) {
49
- if (representatives.has(fileObj.file)) return addSchema(s3Client, bucket, fileObj);
50
- return Promise.resolve(fileObj);
51
- }
52
-
53
- function getRepresentativeFiles(files) {
54
- const parquetFiles = files.filter(({ file }) => file.endsWith('.parquet'));
55
- const dirMap = parquetFiles.reduce(addFirstFilePerDir, new Map());
56
- return new Set(dirMap.values());
57
- }
58
-
59
- function addFirstFilePerDir(acc, { file }) {
60
- const dir = file.substring(0, file.lastIndexOf('/'));
61
- if (!acc.has(dir)) acc.set(dir, file);
62
- return acc;
63
- }
@@ -1,34 +0,0 @@
1
- import s3Querier, { bigintReplacer } from '../../s3-querier.js';
2
-
3
- const {
4
- S3_ACCESS_KEY_ID,
5
- S3_SECRET_ACCESS_KEY,
6
- S3_API_KEY,
7
- S3_ENDPOINT,
8
- S3_BUCKET,
9
- S3_BUCKETS_DIR = '/tmp/s3-querier',
10
- } = process.env;
11
-
12
- export async function handleQuery({ sql, from, to, endpoint, bucket }) {
13
- const fromMs = from ? new Date(from).getTime() : undefined;
14
- const toMs = to ? new Date(to).getTime() : undefined;
15
- const resolvedEndpoint = endpoint || S3_ENDPOINT;
16
- const resolvedBucket = bucket || S3_BUCKET;
17
-
18
- const results = await s3Querier({
19
- query: sql,
20
- from: fromMs,
21
- to: toMs,
22
- defaultEndpoint: resolvedEndpoint,
23
- defaultBucket: resolvedBucket,
24
- bucketsDir: S3_BUCKETS_DIR,
25
- apiKey: S3_API_KEY,
26
- accessKeyId: S3_ACCESS_KEY_ID,
27
- secretAccessKey: S3_SECRET_ACCESS_KEY,
28
- format: 'jsonRecords',
29
- });
30
-
31
- return {
32
- content: [{ type: 'text', text: JSON.stringify(results, bigintReplacer) }],
33
- };
34
- }