s3-querier 1.1.1 → 1.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +20 -4
- package/docs/s3-querier.md +17 -2
- package/package.json +1 -1
- package/src/mcp/descriptions/sql-param.md +49 -3
- package/src/mcp/descriptions/tool.md +15 -2
- package/src/mcp/resources/base-resource.js +17 -0
- package/src/mcp/resources/s3-querier-datasets/s3-querier-datasets.js +24 -0
- package/src/mcp/resources/s3-querier-docs/s3-querier-docs.js +22 -0
- package/src/mcp/s3querier-mcp.js +40 -93
- package/src/mcp/tools/base-tool.js +13 -0
- package/src/mcp/tools/current-time/current-time.js +20 -0
- package/src/mcp/tools/list-files/list-files.js +94 -0
- package/src/mcp/tools/query/query.js +68 -0
- package/src/mcp/utils/utils.js +26 -0
- package/src/mcp/handlers/list-files.js +0 -63
- package/src/mcp/handlers/query.js +0 -34
package/README.md
CHANGED
|
@@ -161,7 +161,7 @@ const results = await s3Querier({
|
|
|
161
161
|
|
|
162
162
|
## MCP Server
|
|
163
163
|
|
|
164
|
-
s3-querier ships a [Model Context Protocol](https://modelcontextprotocol.io/) server that exposes two tools to any MCP-compatible client (Claude Desktop, Claude Code, etc.):
|
|
164
|
+
s3-querier ships a [Model Context Protocol](https://modelcontextprotocol.io/) server that exposes two tools to any MCP-compatible client (Claude Desktop, Claude Code, IBM Bob, etc.):
|
|
165
165
|
|
|
166
166
|
- **`query`** — runs a DuckDB SQL query against your S3 data
|
|
167
167
|
- **`list_files`** — lists objects under a prefix so an LLM can discover available data
|
|
@@ -191,7 +191,7 @@ claude mcp add s3-querier \
|
|
|
191
191
|
-e S3_BUCKET=my-bucket \
|
|
192
192
|
-e S3_ACCESS_KEY_ID=key \
|
|
193
193
|
-e S3_SECRET_ACCESS_KEY=secret \
|
|
194
|
-
--
|
|
194
|
+
-- npx -y s3-querier
|
|
195
195
|
```
|
|
196
196
|
|
|
197
197
|
**IBM Bob**
|
|
@@ -202,8 +202,8 @@ Add to `mcp_settings.json` (global, applies across all workspaces) or `.Bob/mcp.
|
|
|
202
202
|
{
|
|
203
203
|
"mcpServers": {
|
|
204
204
|
"s3-querier": {
|
|
205
|
-
"command": "
|
|
206
|
-
"args": ["
|
|
205
|
+
"command": "npx",
|
|
206
|
+
"args": ["-y", "s3-querier"],
|
|
207
207
|
"env": {
|
|
208
208
|
"S3_ENDPOINT": "https://s3.amazonaws.com",
|
|
209
209
|
"S3_BUCKET": "my-bucket",
|
|
@@ -298,6 +298,22 @@ claude mcp add my-datalake \
|
|
|
298
298
|
| `endpoint` | Overrides `S3_ENDPOINT` for this dataset |
|
|
299
299
|
| `files` | Map of logical file names to `{ description }` |
|
|
300
300
|
|
|
301
|
+
#### Server instructions
|
|
302
|
+
|
|
303
|
+
By default, `S3QuerierMCP` sends step-by-step workflow instructions to the LLM at connection time. When datasets are configured, the default guides the LLM to read the datasets resource, inspect schemas, and use date tokens correctly. Without datasets, it guides discovery via `list_files`.
|
|
304
|
+
|
|
305
|
+
| Option | Description |
|
|
306
|
+
| --- | --- |
|
|
307
|
+
| `additionalInstructions` | Appended to the default instructions. Use this to add project-specific guidance, e.g. a preferred lookback window for "latest" queries. |
|
|
308
|
+
| `instructions` | Replaces the default instructions entirely. |
|
|
309
|
+
|
|
310
|
+
```js
|
|
311
|
+
new S3QuerierMCP({
|
|
312
|
+
datasets: [ /* ... */ ],
|
|
313
|
+
additionalInstructions: 'Data is updated hourly. For recent data, set `from` to 2 hours before the current time and `to` to the current time.',
|
|
314
|
+
}).start();
|
|
315
|
+
```
|
|
316
|
+
|
|
301
317
|
### Adding custom tools
|
|
302
318
|
|
|
303
319
|
Pass a `tools` array to register additional MCP tools alongside the built-in ones:
|
package/docs/s3-querier.md
CHANGED
|
@@ -26,7 +26,7 @@ When querying data from a data lake, be mindful of how your queries are construc
|
|
|
26
26
|
[Install the DuckDB CLI](https://duckdb.org/docs/installation/?version=stable&environment=cli&platform=macos&download_method=direct) and experiment with your queries on local parquet files before running them against S3. This gives a fast feedback loop for understanding data structure and refining queries.
|
|
27
27
|
|
|
28
28
|
- **Be Mindful Of Time Ranges In Date Tokens**
|
|
29
|
-
Long time ranges require fetching more files and slow execution. Use narrow time ranges whenever possible.
|
|
29
|
+
Long time ranges require fetching more files and slow execution. Use narrow time ranges whenever possible. If a query fails due to the 1GB limit, narrow the `from`/`to` range and run multiple queries — for example, query 4–6 hours at a time instead of a full day.
|
|
30
30
|
|
|
31
31
|
- **Create Secondary Representations Of Your Data**
|
|
32
32
|
For larger datasets, break files into smaller chunks to avoid hitting the file size limit.
|
|
@@ -151,7 +151,22 @@ FROM read_parquet('jobs_failed/window=202308032130/*.parquet', union_by_name=1);
|
|
|
151
151
|
```
|
|
152
152
|
|
|
153
153
|
> [!WARNING]
|
|
154
|
-
> **
|
|
154
|
+
> **Never use globs on time-partitioned folder segments** (e.g. `year=*/`, `month=*/`, `hour=*/`). A folder-level glob matches every partition and will over-fetch, hitting the 1GB query limit.
|
|
155
|
+
>
|
|
156
|
+
> Use [time formatting tokens](#time-formatting-tokens) with `from`/`to` instead — they expand to exactly the partitions needed:
|
|
157
|
+
>
|
|
158
|
+
> ```sql
|
|
159
|
+
> -- ❌ WRONG — hour=*/ fetches every hour
|
|
160
|
+
> SELECT * FROM read_parquet('data/year=2026/month=06/day=15/hour=*/file.parquet');
|
|
161
|
+
>
|
|
162
|
+
> -- ✅ CORRECT — {hh} with from/to fetches only the requested hours
|
|
163
|
+
> SELECT * FROM read_parquet('data/year={yyyy}/month={MM}/day={dd}/hour={hh}/file.parquet', union_by_name=1);
|
|
164
|
+
> ```
|
|
165
|
+
>
|
|
166
|
+
> Globs are appropriate only for **file name patterns within a known folder**, or for non-time path segments. They can be combined with time tokens — tokens on the folder segments, glob on the filename:
|
|
167
|
+
> ```sql
|
|
168
|
+
> SELECT * FROM read_parquet('data/year={yyyy}/month={MM}/day={dd}/hour={hh}/records_*.parquet', union_by_name=1);
|
|
169
|
+
> ```
|
|
155
170
|
|
|
156
171
|
---
|
|
157
172
|
|
package/package.json
CHANGED
|
@@ -1,5 +1,7 @@
|
|
|
1
1
|
A DuckDB SQL query. File paths inside read_parquet() or read_csv() are resolved against S3
|
|
2
|
-
and downloaded before the query runs.
|
|
2
|
+
and downloaded before the query runs. Paths are relative to the bucket root — do not use
|
|
3
|
+
s3://, s3a://, or any protocol prefix. To reference a non-default bucket, use the
|
|
4
|
+
{bucket:name} token instead.
|
|
3
5
|
|
|
4
6
|
REQUIRED: always call read_parquet() or read_csv() — plain table names are not supported.
|
|
5
7
|
Prefer union_by_name=1 when reading multiple files.
|
|
@@ -22,8 +24,33 @@ Location tokens — override endpoint or bucket per path:
|
|
|
22
24
|
{endpoint:https://s3.example.com}
|
|
23
25
|
{bucket:my-bucket}
|
|
24
26
|
|
|
25
|
-
|
|
26
|
-
|
|
27
|
+
QUERYING TIME-PARTITIONED DATA
|
|
28
|
+
|
|
29
|
+
Always use date tokens for time-partitioned paths — even for a single snapshot. Tokens keep
|
|
30
|
+
the partitioning structure explicit and make the query work correctly if the time range
|
|
31
|
+
changes. Only hardcode a date when the file is genuinely static (not part of any
|
|
32
|
+
time-partition scheme).
|
|
33
|
+
|
|
34
|
+
Use date tokens in the SQL with `from`/`to` as separate parameters. ONE query with tokens
|
|
35
|
+
downloads all matching files across the range — do not make multiple tool calls with
|
|
36
|
+
hardcoded dates.
|
|
37
|
+
|
|
38
|
+
✗ WRONG — hardcoded date in path (even for a single hour):
|
|
39
|
+
sql: SELECT * FROM read_parquet('events/year=2026/month=06/day=15/hour=14/data.parquet')
|
|
40
|
+
|
|
41
|
+
✗ WRONG — multiple tool calls with hardcoded hours:
|
|
42
|
+
sql: SELECT * FROM read_parquet('events/year=2026/month=06/day=15/hour=12/data.parquet')
|
|
43
|
+
sql: SELECT * FROM read_parquet('events/year=2026/month=06/day=15/hour=13/data.parquet')
|
|
44
|
+
sql: SELECT * FROM read_parquet('events/year=2026/month=06/day=15/hour=14/data.parquet')
|
|
45
|
+
|
|
46
|
+
✓ CORRECT — one tool call, tokens expand across all hours in the range:
|
|
47
|
+
sql: SELECT * FROM read_parquet('events/year={yyyy}/month={MM}/day={dd}/hour={hh}/data.parquet', union_by_name=1)
|
|
48
|
+
from: 2026-06-15T12:00:00Z
|
|
49
|
+
to: 2026-06-15T14:59:59Z
|
|
50
|
+
|
|
51
|
+
Tokens also expand inside the filename, not just in path directory segments:
|
|
52
|
+
data/year={yyyy}/month={MM}/day={dd}/hour={hh}/file_{yyyy}{MM}{dd}{hh}00.parquet
|
|
53
|
+
→ s3-querier downloads one file per hour in the from/to range.
|
|
27
54
|
|
|
28
55
|
HIVE-PARTITIONED DATA
|
|
29
56
|
|
|
@@ -41,6 +68,11 @@ EXAMPLES
|
|
|
41
68
|
Single file:
|
|
42
69
|
SELECT * FROM read_parquet('reports/summary.parquet') LIMIT 10
|
|
43
70
|
|
|
71
|
+
Hour-partitioned files — tokens in path and filename (requires from/to):
|
|
72
|
+
SELECT * FROM read_parquet(
|
|
73
|
+
'events/year={yyyy}/month={MM}/day={dd}/hour={hh}/file_{yyyy}{MM}{dd}{hh}00.parquet',
|
|
74
|
+
union_by_name=1)
|
|
75
|
+
|
|
44
76
|
Day-partitioned files (requires from/to):
|
|
45
77
|
SELECT id FROM read_parquet('events/year={yyyy}/month={MM}/day={dd}/data.parquet', union_by_name=1)
|
|
46
78
|
|
|
@@ -51,3 +83,17 @@ Cross-endpoint join:
|
|
|
51
83
|
WITH east AS (SELECT id FROM read_parquet('{endpoint:https://s3.us-east.example.com}/{bucket:logs}/data/{yyyy}{MM}{dd}.parquet'))
|
|
52
84
|
SELECT * FROM read_parquet('{endpoint:https://s3.eu-west.example.com}/{bucket:logs}/data/{yyyy}{MM}{dd}.parquet') AS west
|
|
53
85
|
JOIN east ON west.id = east.id
|
|
86
|
+
|
|
87
|
+
GLOB SYNTAX (last resort — filename patterns only)
|
|
88
|
+
|
|
89
|
+
Globs match non-time file name segments within a known folder:
|
|
90
|
+
jobs/window=202308032130/*.parquet
|
|
91
|
+
|
|
92
|
+
Do NOT use globs on time-partitioned folder segments (year=, month=, day=, hour=, etc.).
|
|
93
|
+
A folder-level glob like hour=*/ matches every hour and causes massive over-fetching.
|
|
94
|
+
Use date tokens with from/to instead — they expand to exactly the hours/days needed:
|
|
95
|
+
✗ data/year=2026/month=06/day=15/hour=*/file.parquet
|
|
96
|
+
✓ data/year={yyyy}/month={MM}/day={dd}/hour={hh}/file.parquet (with from/to)
|
|
97
|
+
|
|
98
|
+
Tokens and globs can be combined — tokens on folder segments, glob on the filename:
|
|
99
|
+
data/year={yyyy}/month={MM}/day={dd}/hour={hh}/records_*.parquet
|
|
@@ -4,5 +4,18 @@ Queries must use DuckDB table functions such as read_parquet() or read_csv() wit
|
|
|
4
4
|
that reference objects in S3. S3 Querier resolves those paths, downloads the matching files,
|
|
5
5
|
and runs the query locally with DuckDB.
|
|
6
6
|
|
|
7
|
-
|
|
8
|
-
and query
|
|
7
|
+
TIME-PARTITIONED DATA: use date tokens ({yyyy}, {MM}, {dd}, {hh}, {mm}) in file paths
|
|
8
|
+
together with the `from` and `to` parameters to query a time range. ONE query with tokens
|
|
9
|
+
fetches all matching files across the range — never make multiple calls with hardcoded dates.
|
|
10
|
+
|
|
11
|
+
sql: SELECT * FROM read_parquet('data/year={yyyy}/month={MM}/day={dd}/hour={hh}/file.parquet', union_by_name=1)
|
|
12
|
+
from: 2026-06-15T12:00:00Z
|
|
13
|
+
to: 2026-06-15T19:59:59Z
|
|
14
|
+
|
|
15
|
+
CURRENT TIME: If the query involves "now", "recent", "latest", or a relative time range,
|
|
16
|
+
call `get_current_time` first to get the accurate current UTC time. Do not rely on training
|
|
17
|
+
knowledge to guess the current date or time. This does not apply to static file queries or
|
|
18
|
+
queries for a specific known date range.
|
|
19
|
+
|
|
20
|
+
Read the `s3-querier://docs` resource or the `sql` parameter description for full token
|
|
21
|
+
syntax, examples, and query planning tips before writing your first query.
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
export default class BaseResource {
|
|
2
|
+
constructor(config) {
|
|
3
|
+
this.config = config;
|
|
4
|
+
}
|
|
5
|
+
|
|
6
|
+
isEnabled() {
|
|
7
|
+
return true;
|
|
8
|
+
}
|
|
9
|
+
|
|
10
|
+
getMeta() {
|
|
11
|
+
throw new Error('Resources must implement getMeta()');
|
|
12
|
+
}
|
|
13
|
+
|
|
14
|
+
handler() {
|
|
15
|
+
throw new Error('Resources must implement handler()');
|
|
16
|
+
}
|
|
17
|
+
}
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
import BaseResource from '../base-resource.js';
|
|
2
|
+
import { buildDatasetContext } from '../../utils/utils.js';
|
|
3
|
+
|
|
4
|
+
export default class S3QuerierDatasetsResource extends BaseResource {
|
|
5
|
+
name = 's3-querier-datasets';
|
|
6
|
+
uri = 's3-querier://datasets';
|
|
7
|
+
|
|
8
|
+
isEnabled() {
|
|
9
|
+
return !!this.config.datasets?.length;
|
|
10
|
+
}
|
|
11
|
+
|
|
12
|
+
getMeta() {
|
|
13
|
+
return {
|
|
14
|
+
title: 'Configured Datasets',
|
|
15
|
+
description: 'Available datasets: bucket, prefix, file path template, partitioning, and resource types.',
|
|
16
|
+
mimeType: 'text/plain',
|
|
17
|
+
};
|
|
18
|
+
}
|
|
19
|
+
|
|
20
|
+
handler(uri) {
|
|
21
|
+
const text = buildDatasetContext(this.config.datasets);
|
|
22
|
+
return { contents: [{ uri: uri.href, text, mimeType: 'text/plain' }] };
|
|
23
|
+
}
|
|
24
|
+
}
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
import { readFileSync } from 'node:fs';
|
|
2
|
+
|
|
3
|
+
import BaseResource from '../base-resource.js';
|
|
4
|
+
|
|
5
|
+
const docsContent = readFileSync(new URL('../../../../docs/s3-querier.md', import.meta.url), 'utf8');
|
|
6
|
+
|
|
7
|
+
export default class S3QuerierDocsResource extends BaseResource {
|
|
8
|
+
name = 's3-querier-docs';
|
|
9
|
+
uri = 's3-querier://docs';
|
|
10
|
+
|
|
11
|
+
getMeta() {
|
|
12
|
+
return {
|
|
13
|
+
title: 'S3 Querier Documentation',
|
|
14
|
+
description: 'Full documentation: query planning, file tokens, location tokens, and examples.',
|
|
15
|
+
mimeType: 'text/markdown',
|
|
16
|
+
};
|
|
17
|
+
}
|
|
18
|
+
|
|
19
|
+
handler(uri) {
|
|
20
|
+
return { contents: [{ uri: uri.href, text: docsContent, mimeType: 'text/markdown' }] };
|
|
21
|
+
}
|
|
22
|
+
}
|
package/src/mcp/s3querier-mcp.js
CHANGED
|
@@ -1,71 +1,56 @@
|
|
|
1
1
|
import { readFileSync } from 'node:fs';
|
|
2
2
|
import { McpServer } from '@modelcontextprotocol/sdk/server/mcp.js';
|
|
3
3
|
import { StdioServerTransport } from '@modelcontextprotocol/sdk/server/stdio.js';
|
|
4
|
-
import { z } from 'zod';
|
|
5
|
-
import { handleListFiles } from './handlers/list-files.js';
|
|
6
|
-
import { handleQuery } from './handlers/query.js';
|
|
7
4
|
|
|
8
|
-
|
|
5
|
+
import QueryTool from './tools/query/query.js';
|
|
6
|
+
import ListFilesTool from './tools/list-files/list-files.js';
|
|
7
|
+
import CurentTimeTool from './tools/current-time/current-time.js';
|
|
9
8
|
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
const listFilesTemplate = readFileSync(new URL('./descriptions/list-files.md', import.meta.url), 'utf8');
|
|
13
|
-
const docsContent = readFileSync(new URL('../../docs/s3-querier.md', import.meta.url), 'utf8');
|
|
9
|
+
import S3QuerierDocsResource from './resources/s3-querier-docs/s3-querier-docs.js';
|
|
10
|
+
import S3QuerierDatasetsResource from './resources/s3-querier-datasets/s3-querier-datasets.js';
|
|
14
11
|
|
|
15
|
-
const
|
|
16
|
-
sql: z.string().describe(sqlDescription),
|
|
17
|
-
from: z
|
|
18
|
-
.string()
|
|
19
|
-
.optional()
|
|
20
|
-
.describe('Start of date range as ISO 8601 (e.g. "2025-01-01"). Required when the query uses date tokens.'),
|
|
21
|
-
to: z
|
|
22
|
-
.string()
|
|
23
|
-
.optional()
|
|
24
|
-
.describe('End of date range as ISO 8601 (e.g. "2025-01-31"). Required when the query uses date tokens.'),
|
|
25
|
-
endpoint: z.string().optional().describe('S3 endpoint URL. Overrides S3_ENDPOINT for this query.'),
|
|
26
|
-
bucket: z.string().optional().describe('S3 bucket name. Overrides S3_BUCKET for this query.'),
|
|
27
|
-
};
|
|
12
|
+
const pkg = JSON.parse(readFileSync(new URL('../../package.json', import.meta.url), 'utf8'));
|
|
28
13
|
|
|
29
|
-
const
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
.number()
|
|
36
|
-
.int()
|
|
37
|
-
.min(1)
|
|
38
|
-
.max(1000)
|
|
39
|
-
.optional()
|
|
40
|
-
.describe('Maximum number of files to return (default 100). Increase if the response is truncated.'),
|
|
41
|
-
endpoint: z.string().optional().describe('S3 endpoint URL. Overrides S3_ENDPOINT for this call.'),
|
|
42
|
-
bucket: z.string().optional().describe('S3 bucket name. Overrides S3_BUCKET for this call.'),
|
|
43
|
-
};
|
|
14
|
+
const DEFAULT_INSTRUCTIONS = `
|
|
15
|
+
Step 1: Use list_files to discover what data is available under a prefix.
|
|
16
|
+
Step 2: Check the columns field in the list_files response — if present, use those column names. Otherwise run SELECT * FROM read_parquet('path') LIMIT 1 to inspect the schema.
|
|
17
|
+
Step 3: For time-partitioned data, call get_current_time to get the current UTC time, then query with the appropriate from/to range.
|
|
18
|
+
Step 4: Query using the correct file paths discovered in Step 1.
|
|
19
|
+
`.trim();
|
|
44
20
|
|
|
45
|
-
const
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
21
|
+
const DATASETS_INSTRUCTIONS = `
|
|
22
|
+
Step 1: Read the s3-querier://datasets resource to see available datasets and their S3 paths.
|
|
23
|
+
Step 2: Review the datasets to identify which are relevant to the request.
|
|
24
|
+
Step 3: Never guess column names. Run SELECT * FROM read_parquet('full_path') LIMIT 1 on each relevant file to inspect the schema — for time-partitioned paths, call get_current_time first to construct a valid path.
|
|
25
|
+
Step 4: Query the relevant datasets directly — do not use list_files to explore the bucket.
|
|
26
|
+
`.trim();
|
|
50
27
|
|
|
51
28
|
export class S3QuerierMCP {
|
|
52
29
|
constructor(config = {}) {
|
|
53
30
|
this.config = config;
|
|
31
|
+
this.toolClasses = [QueryTool, ListFilesTool, CurentTimeTool];
|
|
32
|
+
this.resourceClasses = [S3QuerierDocsResource, S3QuerierDatasetsResource];
|
|
54
33
|
}
|
|
55
34
|
|
|
56
35
|
async start() {
|
|
57
|
-
const server = new McpServer({
|
|
36
|
+
const server = new McpServer({
|
|
37
|
+
name: 's3-querier',
|
|
38
|
+
version: pkg.version,
|
|
39
|
+
instructions: buildInstructions(this.config),
|
|
40
|
+
});
|
|
58
41
|
const transport = new StdioServerTransport();
|
|
59
|
-
const listFilesDescription = buildListFilesDescription(this.config);
|
|
60
|
-
const enrichedToolDescription = buildToolDescription(this.config);
|
|
61
42
|
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
43
|
+
this.resourceClasses.forEach((ResourceClass) => {
|
|
44
|
+
const resource = new ResourceClass(this.config);
|
|
45
|
+
if (!resource.isEnabled()) return;
|
|
46
|
+
server.registerResource(resource.name, resource.uri, resource.getMeta(), resource.handler.bind(resource));
|
|
47
|
+
});
|
|
48
|
+
|
|
49
|
+
this.toolClasses.forEach((ToolClass) => {
|
|
50
|
+
const tool = new ToolClass(this.config);
|
|
51
|
+
server.registerTool(tool.name, tool.getConfig(), tool.handler.bind(tool));
|
|
52
|
+
});
|
|
53
|
+
|
|
69
54
|
(this.config.tools ?? []).forEach(({ name, description, inputSchema, handler }) => {
|
|
70
55
|
server.registerTool(name, { description, inputSchema }, handler);
|
|
71
56
|
});
|
|
@@ -74,46 +59,8 @@ export class S3QuerierMCP {
|
|
|
74
59
|
}
|
|
75
60
|
}
|
|
76
61
|
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
return
|
|
81
|
-
}
|
|
82
|
-
|
|
83
|
-
function buildListFilesDescription(config) {
|
|
84
|
-
const today = new Date().toISOString().slice(0, 10);
|
|
85
|
-
const withDate = listFilesTemplate.replace('{{TODAY}}', today);
|
|
86
|
-
const datasetContext = buildDatasetContext(config.datasets);
|
|
87
|
-
return datasetContext ? `${withDate}\n${datasetContext}` : withDate;
|
|
88
|
-
}
|
|
89
|
-
|
|
90
|
-
function buildToolDescription(config) {
|
|
91
|
-
const datasetContext = buildDatasetContext(config.datasets);
|
|
92
|
-
return datasetContext ? `${toolDescription}\n\n${datasetContext}` : toolDescription;
|
|
93
|
-
}
|
|
94
|
-
|
|
95
|
-
function buildDatasetContext(datasets) {
|
|
96
|
-
if (!datasets?.length) return '';
|
|
97
|
-
const datasetLines = datasets.flatMap(formatDataset);
|
|
98
|
-
return ['CONFIGURED DATASETS', '', ...datasetLines].join('\n');
|
|
99
|
-
}
|
|
100
|
-
|
|
101
|
-
function formatDataset({ name, description, bucket, endpoint, prefix, partitioning, files }) {
|
|
102
|
-
const header = description ? `${name} — ${description}` : name;
|
|
103
|
-
const lines = [header];
|
|
104
|
-
if (bucket) lines.push(` Bucket: ${bucket}`);
|
|
105
|
-
if (endpoint) lines.push(` Endpoint: ${endpoint}`);
|
|
106
|
-
if (prefix) lines.push(` Prefix: ${prefix}`);
|
|
107
|
-
if (partitioning) lines.push(` Partitioning: ${partitioning}`);
|
|
108
|
-
if (files) {
|
|
109
|
-
const fileLines = Object.entries(files).map(formatFileLine);
|
|
110
|
-
lines.push(' Files:', ...fileLines);
|
|
111
|
-
}
|
|
112
|
-
lines.push('');
|
|
113
|
-
return lines;
|
|
114
|
-
}
|
|
115
|
-
|
|
116
|
-
function formatFileLine([fileName, { description: fileDesc }]) {
|
|
117
|
-
const label = fileDesc ? `${fileName} — ${fileDesc}` : fileName;
|
|
118
|
-
return ` ${label}`;
|
|
62
|
+
function buildInstructions(config) {
|
|
63
|
+
const base = config.instructions ?? (config.datasets?.length ? DATASETS_INSTRUCTIONS : DEFAULT_INSTRUCTIONS);
|
|
64
|
+
if (config.additionalInstructions) return `${base}\n\n${config.additionalInstructions}`;
|
|
65
|
+
return base;
|
|
119
66
|
}
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
import BaseTool from '../base-tool.js';
|
|
2
|
+
|
|
3
|
+
export default class CurentTimeTool extends BaseTool {
|
|
4
|
+
name = 'get_current_time';
|
|
5
|
+
|
|
6
|
+
getConfig() {
|
|
7
|
+
return {
|
|
8
|
+
description:
|
|
9
|
+
'Returns the current UTC time as an ISO 8601 timestamp. ' +
|
|
10
|
+
'Call this before constructing time-partitioned queries that involve "now", "recent", or a relative time range. ' +
|
|
11
|
+
'Use the returned time as `to` and set `from` based on how frequently the data is updated — e.g. 1 hour back for hourly data, 1 day back for daily data. ' +
|
|
12
|
+
'Not needed for static file queries or queries for a specific known date range.',
|
|
13
|
+
inputSchema: {},
|
|
14
|
+
};
|
|
15
|
+
}
|
|
16
|
+
|
|
17
|
+
handler() {
|
|
18
|
+
return { content: [{ type: 'text', text: new Date().toISOString() }] };
|
|
19
|
+
}
|
|
20
|
+
}
|
|
@@ -0,0 +1,94 @@
|
|
|
1
|
+
import { readFileSync } from 'node:fs';
|
|
2
|
+
import { z } from 'zod';
|
|
3
|
+
|
|
4
|
+
import BaseTool from '../base-tool.js';
|
|
5
|
+
import { ListObjectsV2Command } from '@aws-sdk/client-s3';
|
|
6
|
+
import { bigintReplacer } from '../../../utils/bigint-replacer.js';
|
|
7
|
+
import { buildS3Client } from '../../../s3/s3.js';
|
|
8
|
+
import { readParquetColumns } from '../../../utils/parquet-schema-reader.js';
|
|
9
|
+
import { buildDatasetContext } from '../../utils/utils.js';
|
|
10
|
+
|
|
11
|
+
const { S3_ACCESS_KEY_ID, S3_SECRET_ACCESS_KEY, S3_API_KEY, S3_ENDPOINT, S3_BUCKET } = process.env;
|
|
12
|
+
const listFilesTemplate = readFileSync(new URL('../../descriptions/list-files.md', import.meta.url), 'utf8');
|
|
13
|
+
|
|
14
|
+
export default class ListFilesTool extends BaseTool {
|
|
15
|
+
name = 'list_files';
|
|
16
|
+
|
|
17
|
+
getConfig() {
|
|
18
|
+
const today = new Date().toISOString().slice(0, 10);
|
|
19
|
+
const withDate = listFilesTemplate.replace('{{TODAY}}', today);
|
|
20
|
+
const datasetContext = buildDatasetContext(this.config.datasets);
|
|
21
|
+
const description = datasetContext ? `${withDate}\n${datasetContext}` : withDate;
|
|
22
|
+
|
|
23
|
+
return {
|
|
24
|
+
description,
|
|
25
|
+
inputSchema: {
|
|
26
|
+
prefix: z
|
|
27
|
+
.string()
|
|
28
|
+
.optional()
|
|
29
|
+
.describe('Path prefix to list under (e.g. "sales/" or ""). Defaults to empty string to list all files.'),
|
|
30
|
+
maxResults: z
|
|
31
|
+
.number()
|
|
32
|
+
.int()
|
|
33
|
+
.min(1)
|
|
34
|
+
.max(1000)
|
|
35
|
+
.optional()
|
|
36
|
+
.describe('Maximum number of files to return (default 100). Increase if the response is truncated.'),
|
|
37
|
+
endpoint: z.string().optional().describe('S3 endpoint URL. Overrides S3_ENDPOINT for this call.'),
|
|
38
|
+
bucket: z.string().optional().describe('S3 bucket name. Overrides S3_BUCKET for this call.'),
|
|
39
|
+
},
|
|
40
|
+
};
|
|
41
|
+
}
|
|
42
|
+
|
|
43
|
+
async handler({ prefix = '', maxResults = 100, endpoint, bucket }) {
|
|
44
|
+
const resolvedEndpoint = endpoint || S3_ENDPOINT;
|
|
45
|
+
const resolvedBucket = bucket || S3_BUCKET;
|
|
46
|
+
const s3Client = buildS3Client({
|
|
47
|
+
apiKey: S3_API_KEY,
|
|
48
|
+
accessKeyId: S3_ACCESS_KEY_ID,
|
|
49
|
+
secretAccessKey: S3_SECRET_ACCESS_KEY,
|
|
50
|
+
endpoint: resolvedEndpoint,
|
|
51
|
+
});
|
|
52
|
+
const response = await s3Client.send(
|
|
53
|
+
new ListObjectsV2Command({ Bucket: resolvedBucket, Prefix: prefix, MaxKeys: maxResults, Delimiter: '/' }),
|
|
54
|
+
);
|
|
55
|
+
const directories = (response.CommonPrefixes ?? []).map(({ Prefix }) => Prefix);
|
|
56
|
+
const files = (response.Contents ?? []).map(({ Key, Size }) => ({ file: Key, size: Size }));
|
|
57
|
+
const truncated = response.IsTruncated ?? false;
|
|
58
|
+
const representatives = getRepresentativeFiles(files);
|
|
59
|
+
const filesWithSchema = await Promise.all(
|
|
60
|
+
files.map((fileObj) => maybeAddSchema(s3Client, resolvedBucket, representatives, fileObj)),
|
|
61
|
+
);
|
|
62
|
+
|
|
63
|
+
return {
|
|
64
|
+
content: [
|
|
65
|
+
{ type: 'text', text: JSON.stringify({ directories, files: filesWithSchema, truncated }, bigintReplacer) },
|
|
66
|
+
],
|
|
67
|
+
};
|
|
68
|
+
}
|
|
69
|
+
}
|
|
70
|
+
|
|
71
|
+
/** Helpers */
|
|
72
|
+
|
|
73
|
+
function getRepresentativeFiles(files) {
|
|
74
|
+
const parquetFiles = files.filter(({ file }) => file.endsWith('.parquet'));
|
|
75
|
+
const dirMap = parquetFiles.reduce(addFirstFilePerDir, new Map());
|
|
76
|
+
return new Set(dirMap.values());
|
|
77
|
+
}
|
|
78
|
+
|
|
79
|
+
function addFirstFilePerDir(acc, { file }) {
|
|
80
|
+
const dir = file.substring(0, file.lastIndexOf('/'));
|
|
81
|
+
if (!acc.has(dir)) acc.set(dir, file);
|
|
82
|
+
return acc;
|
|
83
|
+
}
|
|
84
|
+
|
|
85
|
+
function maybeAddSchema(s3Client, bucket, representatives, fileObj) {
|
|
86
|
+
if (representatives.has(fileObj.file)) return addSchema(s3Client, bucket, fileObj);
|
|
87
|
+
return Promise.resolve(fileObj);
|
|
88
|
+
}
|
|
89
|
+
|
|
90
|
+
async function addSchema(s3Client, bucket, { file, size }) {
|
|
91
|
+
if (!file.endsWith('.parquet')) return { file, size };
|
|
92
|
+
const columns = await readParquetColumns(s3Client, bucket, file).catch(() => null);
|
|
93
|
+
return { file, size, columns };
|
|
94
|
+
}
|
|
@@ -0,0 +1,68 @@
|
|
|
1
|
+
import { readFileSync } from 'node:fs';
|
|
2
|
+
import { z } from 'zod';
|
|
3
|
+
|
|
4
|
+
import BaseTool from '../base-tool.js';
|
|
5
|
+
import s3Querier, { bigintReplacer } from '../../../s3-querier.js';
|
|
6
|
+
|
|
7
|
+
const {
|
|
8
|
+
S3_ACCESS_KEY_ID,
|
|
9
|
+
S3_SECRET_ACCESS_KEY,
|
|
10
|
+
S3_API_KEY,
|
|
11
|
+
S3_ENDPOINT,
|
|
12
|
+
S3_BUCKET,
|
|
13
|
+
S3_BUCKETS_DIR = '/tmp/s3-querier',
|
|
14
|
+
} = process.env;
|
|
15
|
+
|
|
16
|
+
const sqlDescription = readFileSync(new URL('../../descriptions/sql-param.md', import.meta.url), 'utf8');
|
|
17
|
+
const toolDescription = readFileSync(new URL('../../descriptions/tool.md', import.meta.url), 'utf8');
|
|
18
|
+
|
|
19
|
+
export default class QueryTool extends BaseTool {
|
|
20
|
+
name = 'query';
|
|
21
|
+
|
|
22
|
+
getConfig() {
|
|
23
|
+
const description = this.config.datasets?.length
|
|
24
|
+
? `${toolDescription}\n\nCONFIGURED DATASETS: read the \`s3-querier://datasets\` resource for available datasets, prefixes, file path templates, and resource types.`
|
|
25
|
+
: toolDescription;
|
|
26
|
+
|
|
27
|
+
return {
|
|
28
|
+
description,
|
|
29
|
+
inputSchema: {
|
|
30
|
+
sql: z.string().describe(sqlDescription),
|
|
31
|
+
from: z
|
|
32
|
+
.string()
|
|
33
|
+
.optional()
|
|
34
|
+
.describe('Start of date range as ISO 8601 (e.g. "2025-01-01"). Required when the query uses date tokens.'),
|
|
35
|
+
to: z
|
|
36
|
+
.string()
|
|
37
|
+
.optional()
|
|
38
|
+
.describe('End of date range as ISO 8601 (e.g. "2025-01-31"). Required when the query uses date tokens.'),
|
|
39
|
+
endpoint: z.string().optional().describe('S3 endpoint URL. Overrides S3_ENDPOINT for this query.'),
|
|
40
|
+
bucket: z.string().optional().describe('S3 bucket name. Overrides S3_BUCKET for this query.'),
|
|
41
|
+
},
|
|
42
|
+
};
|
|
43
|
+
}
|
|
44
|
+
|
|
45
|
+
async handler({ sql, from, to, endpoint, bucket }) {
|
|
46
|
+
const fromMs = from ? new Date(from).getTime() : undefined;
|
|
47
|
+
const toMs = to ? new Date(to).getTime() : undefined;
|
|
48
|
+
const resolvedEndpoint = endpoint || S3_ENDPOINT;
|
|
49
|
+
const resolvedBucket = bucket || S3_BUCKET;
|
|
50
|
+
|
|
51
|
+
const results = await s3Querier({
|
|
52
|
+
query: sql,
|
|
53
|
+
from: fromMs,
|
|
54
|
+
to: toMs,
|
|
55
|
+
defaultEndpoint: resolvedEndpoint,
|
|
56
|
+
defaultBucket: resolvedBucket,
|
|
57
|
+
bucketsDir: S3_BUCKETS_DIR,
|
|
58
|
+
apiKey: S3_API_KEY,
|
|
59
|
+
accessKeyId: S3_ACCESS_KEY_ID,
|
|
60
|
+
secretAccessKey: S3_SECRET_ACCESS_KEY,
|
|
61
|
+
format: 'jsonRecords',
|
|
62
|
+
});
|
|
63
|
+
|
|
64
|
+
return {
|
|
65
|
+
content: [{ type: 'text', text: JSON.stringify(results, bigintReplacer) }],
|
|
66
|
+
};
|
|
67
|
+
}
|
|
68
|
+
}
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
export function buildDatasetContext(datasets) {
|
|
2
|
+
if (!datasets?.length) return '';
|
|
3
|
+
const datasetLines = datasets.flatMap(formatDataset);
|
|
4
|
+
return ['CONFIGURED DATASETS', '', ...datasetLines].join('\n');
|
|
5
|
+
}
|
|
6
|
+
|
|
7
|
+
function formatDataset({ name, description, bucket, endpoint, prefix, filePathTemplate, partitioning, files }) {
|
|
8
|
+
const header = description ? `${name} — ${description}` : name;
|
|
9
|
+
const lines = [header];
|
|
10
|
+
if (bucket) lines.push(` Bucket: ${bucket}`);
|
|
11
|
+
if (endpoint) lines.push(` Endpoint: ${endpoint}`);
|
|
12
|
+
if (prefix && !filePathTemplate) lines.push(` Prefix: ${prefix}`);
|
|
13
|
+
if (prefix && filePathTemplate)
|
|
14
|
+
lines.push(` Full path: ${prefix}${filePathTemplate} ({file} = resource name from Files list)`);
|
|
15
|
+
if (partitioning) lines.push(` Partitioning: ${partitioning}`);
|
|
16
|
+
if (files) {
|
|
17
|
+
const fileLines = Object.entries(files).flatMap(formatFileLine);
|
|
18
|
+
lines.push(' Files:', ...fileLines);
|
|
19
|
+
}
|
|
20
|
+
lines.push('');
|
|
21
|
+
return lines;
|
|
22
|
+
}
|
|
23
|
+
|
|
24
|
+
function formatFileLine([fileName, { description: fileDesc }]) {
|
|
25
|
+
return [` ${fileDesc ? `${fileName} — ${fileDesc}` : fileName}`];
|
|
26
|
+
}
|
|
@@ -1,63 +0,0 @@
|
|
|
1
|
-
import { ListObjectsV2Command } from '@aws-sdk/client-s3';
|
|
2
|
-
import { bigintReplacer } from '../../s3-querier.js';
|
|
3
|
-
import { buildS3Client } from '../../s3/s3.js';
|
|
4
|
-
import { readParquetColumns } from '../../utils/parquet-schema-reader.js';
|
|
5
|
-
|
|
6
|
-
const { S3_ACCESS_KEY_ID, S3_SECRET_ACCESS_KEY, S3_API_KEY, S3_ENDPOINT, S3_BUCKET } = process.env;
|
|
7
|
-
|
|
8
|
-
export async function handleListFiles({ prefix = '', maxResults = 100, endpoint, bucket }) {
|
|
9
|
-
const resolvedEndpoint = endpoint || S3_ENDPOINT;
|
|
10
|
-
const resolvedBucket = bucket || S3_BUCKET;
|
|
11
|
-
const clientConfig = {
|
|
12
|
-
apiKey: S3_API_KEY,
|
|
13
|
-
accessKeyId: S3_ACCESS_KEY_ID,
|
|
14
|
-
secretAccessKey: S3_SECRET_ACCESS_KEY,
|
|
15
|
-
endpoint: resolvedEndpoint,
|
|
16
|
-
};
|
|
17
|
-
const s3Client = buildS3Client(clientConfig);
|
|
18
|
-
const listCommand = new ListObjectsV2Command({
|
|
19
|
-
Bucket: resolvedBucket,
|
|
20
|
-
Prefix: prefix,
|
|
21
|
-
MaxKeys: maxResults,
|
|
22
|
-
Delimiter: '/',
|
|
23
|
-
});
|
|
24
|
-
const response = await s3Client.send(listCommand);
|
|
25
|
-
const directories = (response.CommonPrefixes ?? []).map(({ Prefix }) => Prefix);
|
|
26
|
-
const files = (response.Contents ?? []).map(({ Key, Size }) => ({ file: Key, size: Size }));
|
|
27
|
-
const truncated = response.IsTruncated ?? false;
|
|
28
|
-
const representatives = getRepresentativeFiles(files);
|
|
29
|
-
const filesWithSchema = await Promise.all(
|
|
30
|
-
files.map((fileObj) => maybeAddSchema(s3Client, resolvedBucket, representatives, fileObj)),
|
|
31
|
-
);
|
|
32
|
-
|
|
33
|
-
return {
|
|
34
|
-
content: [
|
|
35
|
-
{ type: 'text', text: JSON.stringify({ directories, files: filesWithSchema, truncated }, bigintReplacer) },
|
|
36
|
-
],
|
|
37
|
-
};
|
|
38
|
-
}
|
|
39
|
-
|
|
40
|
-
/** Helpers */
|
|
41
|
-
|
|
42
|
-
async function addSchema(s3Client, bucket, { file, size }) {
|
|
43
|
-
if (!file.endsWith('.parquet')) return { file, size };
|
|
44
|
-
const columns = await readParquetColumns(s3Client, bucket, file).catch(() => null);
|
|
45
|
-
return { file, size, columns };
|
|
46
|
-
}
|
|
47
|
-
|
|
48
|
-
function maybeAddSchema(s3Client, bucket, representatives, fileObj) {
|
|
49
|
-
if (representatives.has(fileObj.file)) return addSchema(s3Client, bucket, fileObj);
|
|
50
|
-
return Promise.resolve(fileObj);
|
|
51
|
-
}
|
|
52
|
-
|
|
53
|
-
function getRepresentativeFiles(files) {
|
|
54
|
-
const parquetFiles = files.filter(({ file }) => file.endsWith('.parquet'));
|
|
55
|
-
const dirMap = parquetFiles.reduce(addFirstFilePerDir, new Map());
|
|
56
|
-
return new Set(dirMap.values());
|
|
57
|
-
}
|
|
58
|
-
|
|
59
|
-
function addFirstFilePerDir(acc, { file }) {
|
|
60
|
-
const dir = file.substring(0, file.lastIndexOf('/'));
|
|
61
|
-
if (!acc.has(dir)) acc.set(dir, file);
|
|
62
|
-
return acc;
|
|
63
|
-
}
|
|
@@ -1,34 +0,0 @@
|
|
|
1
|
-
import s3Querier, { bigintReplacer } from '../../s3-querier.js';
|
|
2
|
-
|
|
3
|
-
const {
|
|
4
|
-
S3_ACCESS_KEY_ID,
|
|
5
|
-
S3_SECRET_ACCESS_KEY,
|
|
6
|
-
S3_API_KEY,
|
|
7
|
-
S3_ENDPOINT,
|
|
8
|
-
S3_BUCKET,
|
|
9
|
-
S3_BUCKETS_DIR = '/tmp/s3-querier',
|
|
10
|
-
} = process.env;
|
|
11
|
-
|
|
12
|
-
export async function handleQuery({ sql, from, to, endpoint, bucket }) {
|
|
13
|
-
const fromMs = from ? new Date(from).getTime() : undefined;
|
|
14
|
-
const toMs = to ? new Date(to).getTime() : undefined;
|
|
15
|
-
const resolvedEndpoint = endpoint || S3_ENDPOINT;
|
|
16
|
-
const resolvedBucket = bucket || S3_BUCKET;
|
|
17
|
-
|
|
18
|
-
const results = await s3Querier({
|
|
19
|
-
query: sql,
|
|
20
|
-
from: fromMs,
|
|
21
|
-
to: toMs,
|
|
22
|
-
defaultEndpoint: resolvedEndpoint,
|
|
23
|
-
defaultBucket: resolvedBucket,
|
|
24
|
-
bucketsDir: S3_BUCKETS_DIR,
|
|
25
|
-
apiKey: S3_API_KEY,
|
|
26
|
-
accessKeyId: S3_ACCESS_KEY_ID,
|
|
27
|
-
secretAccessKey: S3_SECRET_ACCESS_KEY,
|
|
28
|
-
format: 'jsonRecords',
|
|
29
|
-
});
|
|
30
|
-
|
|
31
|
-
return {
|
|
32
|
-
content: [{ type: 'text', text: JSON.stringify(results, bigintReplacer) }],
|
|
33
|
-
};
|
|
34
|
-
}
|