npm - s3-querier - Versions diffs - 1.1.1 → 1.2.0 - Mend

s3-querier 1.1.1 → 1.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (16) hide show

package/README.md +20 -4
package/docs/s3-querier.md +17 -2
package/package.json +1 -1
package/src/mcp/descriptions/sql-param.md +49 -3
package/src/mcp/descriptions/tool.md +15 -2
package/src/mcp/resources/base-resource.js +17 -0
package/src/mcp/resources/s3-querier-datasets/s3-querier-datasets.js +24 -0
package/src/mcp/resources/s3-querier-docs/s3-querier-docs.js +22 -0
package/src/mcp/s3querier-mcp.js +40 -93
package/src/mcp/tools/base-tool.js +13 -0
package/src/mcp/tools/current-time/current-time.js +20 -0
package/src/mcp/tools/list-files/list-files.js +94 -0
package/src/mcp/tools/query/query.js +68 -0
package/src/mcp/utils/utils.js +26 -0
package/src/mcp/handlers/list-files.js +0 -63
package/src/mcp/handlers/query.js +0 -34

package/README.md CHANGED Viewed

@@ -161,7 +161,7 @@ const results = await s3Querier({
 ## MCP Server
-s3-querier ships a [Model Context Protocol](https://modelcontextprotocol.io/) server that exposes two tools to any MCP-compatible client (Claude Desktop, Claude Code, etc.):
+s3-querier ships a [Model Context Protocol](https://modelcontextprotocol.io/) server that exposes two tools to any MCP-compatible client (Claude Desktop, Claude Code, IBM Bob, etc.):
 - **`query`** — runs a DuckDB SQL query against your S3 data
 - **`list_files`** — lists objects under a prefix so an LLM can discover available data
@@ -191,7 +191,7 @@ claude mcp add s3-querier \
   -e S3_BUCKET=my-bucket \
   -e S3_ACCESS_KEY_ID=key \
   -e S3_SECRET_ACCESS_KEY=secret \
-  -- node node_modules/s3-querier/src/mcp/server.js
+  -- npx -y s3-querier
 ```
 **IBM Bob**
@@ -202,8 +202,8 @@ Add to `mcp_settings.json` (global, applies across all workspaces) or `.Bob/mcp.
 {
   "mcpServers": {
     "s3-querier": {
-      "command": "node",
-      "args": ["/absolute/path/to/node_modules/s3-querier/src/mcp/server.js"],
+      "command": "npx",
+      "args": ["-y", "s3-querier"],
       "env": {
         "S3_ENDPOINT": "https://s3.amazonaws.com",
         "S3_BUCKET": "my-bucket",
@@ -298,6 +298,22 @@ claude mcp add my-datalake \
 | `endpoint` | Overrides `S3_ENDPOINT` for this dataset |
 | `files` | Map of logical file names to `{ description }` |
+#### Server instructions
+By default, `S3QuerierMCP` sends step-by-step workflow instructions to the LLM at connection time. When datasets are configured, the default guides the LLM to read the datasets resource, inspect schemas, and use date tokens correctly. Without datasets, it guides discovery via `list_files`.
+| Option | Description |
+| --- | --- |
+| `additionalInstructions` | Appended to the default instructions. Use this to add project-specific guidance, e.g. a preferred lookback window for "latest" queries. |
+| `instructions` | Replaces the default instructions entirely. |
+```js
+new S3QuerierMCP({
+  datasets: [ /* ... */ ],
+  additionalInstructions: 'Data is updated hourly. For recent data, set `from` to 2 hours before the current time and `to` to the current time.',
+}).start();
+```
 ### Adding custom tools
 Pass a `tools` array to register additional MCP tools alongside the built-in ones:

package/docs/s3-querier.md CHANGED Viewed

@@ -26,7 +26,7 @@ When querying data from a data lake, be mindful of how your queries are construc
   [Install the DuckDB CLI](https://duckdb.org/docs/installation/?version=stable&environment=cli&platform=macos&download_method=direct) and experiment with your queries on local parquet files before running them against S3. This gives a fast feedback loop for understanding data structure and refining queries.
 - **Be Mindful Of Time Ranges In Date Tokens**
-  Long time ranges require fetching more files and slow execution. Use narrow time ranges whenever possible.
+  Long time ranges require fetching more files and slow execution. Use narrow time ranges whenever possible. If a query fails due to the 1GB limit, narrow the `from`/`to` range and run multiple queries — for example, query 4–6 hours at a time instead of a full day.
 - **Create Secondary Representations Of Your Data**
   For larger datasets, break files into smaller chunks to avoid hitting the file size limit.
@@ -151,7 +151,22 @@ FROM read_parquet('jobs_failed/window=202308032130/*.parquet', union_by_name=1);
 ```
 > [!WARNING]
-> **Use globs with caution.** They can match more files than expected, causing unnecessary downloads and degraded performance. Always verify the file list your glob will match before running a broad query.
+> **Never use globs on time-partitioned folder segments** (e.g. `year=*/`, `month=*/`, `hour=*/`). A folder-level glob matches every partition and will over-fetch, hitting the 1GB query limit.
+>
+> Use [time formatting tokens](#time-formatting-tokens) with `from`/`to` instead — they expand to exactly the partitions needed:
+>
+> ```sql
+> -- ❌ WRONG — hour=*/ fetches every hour
+> SELECT * FROM read_parquet('data/year=2026/month=06/day=15/hour=*/file.parquet');
+>
+> -- ✅ CORRECT — {hh} with from/to fetches only the requested hours
+> SELECT * FROM read_parquet('data/year={yyyy}/month={MM}/day={dd}/hour={hh}/file.parquet', union_by_name=1);
+> ```
+>
+> Globs are appropriate only for **file name patterns within a known folder**, or for non-time path segments. They can be combined with time tokens — tokens on the folder segments, glob on the filename:
+> ```sql
+> SELECT * FROM read_parquet('data/year={yyyy}/month={MM}/day={dd}/hour={hh}/records_*.parquet', union_by_name=1);
+> ```
 ---

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "s3-querier",
-  "version": "1.1.1",
+  "version": "1.2.0",
   "description": "Query S3-compatible storage with DuckDB and SQL",
   "type": "module",
   "main": "src/s3-querier.js",

package/src/mcp/descriptions/sql-param.md CHANGED Viewed

@@ -1,5 +1,7 @@
 A DuckDB SQL query. File paths inside read_parquet() or read_csv() are resolved against S3
-and downloaded before the query runs.
+and downloaded before the query runs. Paths are relative to the bucket root — do not use
+s3://, s3a://, or any protocol prefix. To reference a non-default bucket, use the
+{bucket:name} token instead.
 REQUIRED: always call read_parquet() or read_csv() — plain table names are not supported.
 Prefer union_by_name=1 when reading multiple files.
@@ -22,8 +24,33 @@ Location tokens — override endpoint or bucket per path:
   {endpoint:https://s3.example.com}
   {bucket:my-bucket}
-Glob syntax — wildcard matching for non-time path segments:
-  jobs/window=202308032130/*.parquet
+QUERYING TIME-PARTITIONED DATA
+Always use date tokens for time-partitioned paths — even for a single snapshot. Tokens keep
+the partitioning structure explicit and make the query work correctly if the time range
+changes. Only hardcode a date when the file is genuinely static (not part of any
+time-partition scheme).
+Use date tokens in the SQL with `from`/`to` as separate parameters. ONE query with tokens
+downloads all matching files across the range — do not make multiple tool calls with
+hardcoded dates.
+  ✗ WRONG — hardcoded date in path (even for a single hour):
+      sql: SELECT * FROM read_parquet('events/year=2026/month=06/day=15/hour=14/data.parquet')
+  ✗ WRONG — multiple tool calls with hardcoded hours:
+      sql: SELECT * FROM read_parquet('events/year=2026/month=06/day=15/hour=12/data.parquet')
+      sql: SELECT * FROM read_parquet('events/year=2026/month=06/day=15/hour=13/data.parquet')
+      sql: SELECT * FROM read_parquet('events/year=2026/month=06/day=15/hour=14/data.parquet')
+  ✓ CORRECT — one tool call, tokens expand across all hours in the range:
+      sql:  SELECT * FROM read_parquet('events/year={yyyy}/month={MM}/day={dd}/hour={hh}/data.parquet', union_by_name=1)
+      from: 2026-06-15T12:00:00Z
+      to:   2026-06-15T14:59:59Z
+Tokens also expand inside the filename, not just in path directory segments:
+  data/year={yyyy}/month={MM}/day={dd}/hour={hh}/file_{yyyy}{MM}{dd}{hh}00.parquet
+  → s3-querier downloads one file per hour in the from/to range.
 HIVE-PARTITIONED DATA
@@ -41,6 +68,11 @@ EXAMPLES
 Single file:
   SELECT * FROM read_parquet('reports/summary.parquet') LIMIT 10
+Hour-partitioned files — tokens in path and filename (requires from/to):
+  SELECT * FROM read_parquet(
+    'events/year={yyyy}/month={MM}/day={dd}/hour={hh}/file_{yyyy}{MM}{dd}{hh}00.parquet',
+    union_by_name=1)
 Day-partitioned files (requires from/to):
   SELECT id FROM read_parquet('events/year={yyyy}/month={MM}/day={dd}/data.parquet', union_by_name=1)
@@ -51,3 +83,17 @@ Cross-endpoint join:
   WITH east AS (SELECT id FROM read_parquet('{endpoint:https://s3.us-east.example.com}/{bucket:logs}/data/{yyyy}{MM}{dd}.parquet'))
   SELECT * FROM read_parquet('{endpoint:https://s3.eu-west.example.com}/{bucket:logs}/data/{yyyy}{MM}{dd}.parquet') AS west
   JOIN east ON west.id = east.id
+GLOB SYNTAX (last resort — filename patterns only)
+Globs match non-time file name segments within a known folder:
+  jobs/window=202308032130/*.parquet
+  Do NOT use globs on time-partitioned folder segments (year=, month=, day=, hour=, etc.).
+  A folder-level glob like hour=*/ matches every hour and causes massive over-fetching.
+  Use date tokens with from/to instead — they expand to exactly the hours/days needed:
+    ✗  data/year=2026/month=06/day=15/hour=*/file.parquet
+    ✓  data/year={yyyy}/month={MM}/day={dd}/hour={hh}/file.parquet  (with from/to)
+  Tokens and globs can be combined — tokens on folder segments, glob on the filename:
+    data/year={yyyy}/month={MM}/day={dd}/hour={hh}/records_*.parquet

package/src/mcp/descriptions/tool.md CHANGED Viewed

@@ -4,5 +4,18 @@ Queries must use DuckDB table functions such as read_parquet() or read_csv() wit
 that reference objects in S3. S3 Querier resolves those paths, downloads the matching files,
 and runs the query locally with DuckDB.
-Use the `s3-querier://docs` resource for full documentation including token syntax, examples,
-and query planning tips.
+TIME-PARTITIONED DATA: use date tokens ({yyyy}, {MM}, {dd}, {hh}, {mm}) in file paths
+together with the `from` and `to` parameters to query a time range. ONE query with tokens
+fetches all matching files across the range — never make multiple calls with hardcoded dates.
+  sql:  SELECT * FROM read_parquet('data/year={yyyy}/month={MM}/day={dd}/hour={hh}/file.parquet', union_by_name=1)
+  from: 2026-06-15T12:00:00Z
+  to:   2026-06-15T19:59:59Z
+CURRENT TIME: If the query involves "now", "recent", "latest", or a relative time range,
+call `get_current_time` first to get the accurate current UTC time. Do not rely on training
+knowledge to guess the current date or time. This does not apply to static file queries or
+queries for a specific known date range.
+Read the `s3-querier://docs` resource or the `sql` parameter description for full token
+syntax, examples, and query planning tips before writing your first query.

package/src/mcp/resources/base-resource.js ADDED Viewed

@@ -0,0 +1,17 @@
+export default class BaseResource {
+  constructor(config) {
+    this.config = config;
+  }
+  isEnabled() {
+    return true;
+  }
+  getMeta() {
+    throw new Error('Resources must implement getMeta()');
+  }
+  handler() {
+    throw new Error('Resources must implement handler()');
+  }
+}

package/src/mcp/resources/s3-querier-datasets/s3-querier-datasets.js ADDED Viewed

@@ -0,0 +1,24 @@
+import BaseResource from '../base-resource.js';
+import { buildDatasetContext } from '../../utils/utils.js';
+export default class S3QuerierDatasetsResource extends BaseResource {
+  name = 's3-querier-datasets';
+  uri = 's3-querier://datasets';
+  isEnabled() {
+    return !!this.config.datasets?.length;
+  }
+  getMeta() {
+    return {
+      title: 'Configured Datasets',
+      description: 'Available datasets: bucket, prefix, file path template, partitioning, and resource types.',
+      mimeType: 'text/plain',
+    };
+  }
+  handler(uri) {
+    const text = buildDatasetContext(this.config.datasets);
+    return { contents: [{ uri: uri.href, text, mimeType: 'text/plain' }] };
+  }
+}

package/src/mcp/resources/s3-querier-docs/s3-querier-docs.js ADDED Viewed

@@ -0,0 +1,22 @@
+import { readFileSync } from 'node:fs';
+import BaseResource from '../base-resource.js';
+const docsContent = readFileSync(new URL('../../../../docs/s3-querier.md', import.meta.url), 'utf8');
+export default class S3QuerierDocsResource extends BaseResource {
+  name = 's3-querier-docs';
+  uri = 's3-querier://docs';
+  getMeta() {
+    return {
+      title: 'S3 Querier Documentation',
+      description: 'Full documentation: query planning, file tokens, location tokens, and examples.',
+      mimeType: 'text/markdown',
+    };
+  }
+  handler(uri) {
+    return { contents: [{ uri: uri.href, text: docsContent, mimeType: 'text/markdown' }] };
+  }
+}

package/src/mcp/s3querier-mcp.js CHANGED Viewed

@@ -1,71 +1,56 @@
 import { readFileSync } from 'node:fs';
 import { McpServer } from '@modelcontextprotocol/sdk/server/mcp.js';
 import { StdioServerTransport } from '@modelcontextprotocol/sdk/server/stdio.js';
-import { z } from 'zod';
-import { handleListFiles } from './handlers/list-files.js';
-import { handleQuery } from './handlers/query.js';
-const pkg = JSON.parse(readFileSync(new URL('../../package.json', import.meta.url), 'utf8'));
+import QueryTool from './tools/query/query.js';
+import ListFilesTool from './tools/list-files/list-files.js';
+import CurentTimeTool from './tools/current-time/current-time.js';
-const toolDescription = readFileSync(new URL('./descriptions/tool.md', import.meta.url), 'utf8');
-const sqlDescription = readFileSync(new URL('./descriptions/sql-param.md', import.meta.url), 'utf8');
-const listFilesTemplate = readFileSync(new URL('./descriptions/list-files.md', import.meta.url), 'utf8');
-const docsContent = readFileSync(new URL('../../docs/s3-querier.md', import.meta.url), 'utf8');
+import S3QuerierDocsResource from './resources/s3-querier-docs/s3-querier-docs.js';
+import S3QuerierDatasetsResource from './resources/s3-querier-datasets/s3-querier-datasets.js';
-const QUERY_TOOL_SCHEMA = {
-  sql: z.string().describe(sqlDescription),
-  from: z
-    .string()
-    .optional()
-    .describe('Start of date range as ISO 8601 (e.g. "2025-01-01"). Required when the query uses date tokens.'),
-  to: z
-    .string()
-    .optional()
-    .describe('End of date range as ISO 8601 (e.g. "2025-01-31"). Required when the query uses date tokens.'),
-  endpoint: z.string().optional().describe('S3 endpoint URL. Overrides S3_ENDPOINT for this query.'),
-  bucket: z.string().optional().describe('S3 bucket name. Overrides S3_BUCKET for this query.'),
-};
+const pkg = JSON.parse(readFileSync(new URL('../../package.json', import.meta.url), 'utf8'));
-const LIST_FILES_TOOL_SCHEMA = {
-  prefix: z
-    .string()
-    .optional()
-    .describe('Path prefix to list under (e.g. "sales/" or ""). Defaults to empty string to list all files.'),
-  maxResults: z
-    .number()
-    .int()
-    .min(1)
-    .max(1000)
-    .optional()
-    .describe('Maximum number of files to return (default 100). Increase if the response is truncated.'),
-  endpoint: z.string().optional().describe('S3 endpoint URL. Overrides S3_ENDPOINT for this call.'),
-  bucket: z.string().optional().describe('S3 bucket name. Overrides S3_BUCKET for this call.'),
-};
+const DEFAULT_INSTRUCTIONS = `
+Step 1: Use list_files to discover what data is available under a prefix.
+Step 2: Check the columns field in the list_files response — if present, use those column names. Otherwise run SELECT * FROM read_parquet('path') LIMIT 1 to inspect the schema.
+Step 3: For time-partitioned data, call get_current_time to get the current UTC time, then query with the appropriate from/to range.
+Step 4: Query using the correct file paths discovered in Step 1.
+`.trim();
-const DOCS_RESOURCE = {
-  title: 'S3 Querier Documentation',
-  description: 'Full documentation: query planning, file tokens, location tokens, and examples.',
-  mimeType: 'text/markdown',
-};
+const DATASETS_INSTRUCTIONS = `
+Step 1: Read the s3-querier://datasets resource to see available datasets and their S3 paths.
+Step 2: Review the datasets to identify which are relevant to the request.
+Step 3: Never guess column names. Run SELECT * FROM read_parquet('full_path') LIMIT 1 on each relevant file to inspect the schema — for time-partitioned paths, call get_current_time first to construct a valid path.
+Step 4: Query the relevant datasets directly — do not use list_files to explore the bucket.
+`.trim();
 export class S3QuerierMCP {
   constructor(config = {}) {
     this.config = config;
+    this.toolClasses = [QueryTool, ListFilesTool, CurentTimeTool];
+    this.resourceClasses = [S3QuerierDocsResource, S3QuerierDatasetsResource];
   }
   async start() {
-    const server = new McpServer({ name: 's3-querier', version: pkg.version });
+    const server = new McpServer({
+      name: 's3-querier',
+      version: pkg.version,
+      instructions: buildInstructions(this.config),
+    });
     const transport = new StdioServerTransport();
-    const listFilesDescription = buildListFilesDescription(this.config);
-    const enrichedToolDescription = buildToolDescription(this.config);
-    server.registerResource('s3-querier-docs', 's3-querier://docs', DOCS_RESOURCE, serveDocsHandler);
-    server.registerTool(
-      'list_files',
-      { description: listFilesDescription, inputSchema: LIST_FILES_TOOL_SCHEMA },
-      handleListFiles,
-    );
-    server.registerTool('query', { description: enrichedToolDescription, inputSchema: QUERY_TOOL_SCHEMA }, handleQuery);
+    this.resourceClasses.forEach((ResourceClass) => {
+      const resource = new ResourceClass(this.config);
+      if (!resource.isEnabled()) return;
+      server.registerResource(resource.name, resource.uri, resource.getMeta(), resource.handler.bind(resource));
+    });
+    this.toolClasses.forEach((ToolClass) => {
+      const tool = new ToolClass(this.config);
+      server.registerTool(tool.name, tool.getConfig(), tool.handler.bind(tool));
+    });
     (this.config.tools ?? []).forEach(({ name, description, inputSchema, handler }) => {
       server.registerTool(name, { description, inputSchema }, handler);
     });
@@ -74,46 +59,8 @@ export class S3QuerierMCP {
   }
 }
-/** Helpers */
-function serveDocsHandler(uri) {
-  return { contents: [{ uri: uri.href, text: docsContent, mimeType: 'text/markdown' }] };
-}
-function buildListFilesDescription(config) {
-  const today = new Date().toISOString().slice(0, 10);
-  const withDate = listFilesTemplate.replace('{{TODAY}}', today);
-  const datasetContext = buildDatasetContext(config.datasets);
-  return datasetContext ? `${withDate}\n${datasetContext}` : withDate;
-}
-function buildToolDescription(config) {
-  const datasetContext = buildDatasetContext(config.datasets);
-  return datasetContext ? `${toolDescription}\n\n${datasetContext}` : toolDescription;
-}
-function buildDatasetContext(datasets) {
-  if (!datasets?.length) return '';
-  const datasetLines = datasets.flatMap(formatDataset);
-  return ['CONFIGURED DATASETS', '', ...datasetLines].join('\n');
-}
-function formatDataset({ name, description, bucket, endpoint, prefix, partitioning, files }) {
-  const header = description ? `${name} — ${description}` : name;
-  const lines = [header];
-  if (bucket) lines.push(`  Bucket: ${bucket}`);
-  if (endpoint) lines.push(`  Endpoint: ${endpoint}`);
-  if (prefix) lines.push(`  Prefix: ${prefix}`);
-  if (partitioning) lines.push(`  Partitioning: ${partitioning}`);
-  if (files) {
-    const fileLines = Object.entries(files).map(formatFileLine);
-    lines.push('  Files:', ...fileLines);
-  }
-  lines.push('');
-  return lines;
-}
-function formatFileLine([fileName, { description: fileDesc }]) {
-  const label = fileDesc ? `${fileName} — ${fileDesc}` : fileName;
-  return `    ${label}`;
+function buildInstructions(config) {
+  const base = config.instructions ?? (config.datasets?.length ? DATASETS_INSTRUCTIONS : DEFAULT_INSTRUCTIONS);
+  if (config.additionalInstructions) return `${base}\n\n${config.additionalInstructions}`;
+  return base;
 }

package/src/mcp/tools/base-tool.js ADDED Viewed

@@ -0,0 +1,13 @@
+export default class BaseTool {
+  constructor(config) {
+    this.config = config;
+  }
+  getConfig() {
+    throw new Error('Tools must implement getConfig()');
+  }
+  handler() {
+    throw new Error('Tools must implement handler()');
+  }
+}

package/src/mcp/tools/current-time/current-time.js ADDED Viewed

@@ -0,0 +1,20 @@
+import BaseTool from '../base-tool.js';
+export default class CurentTimeTool extends BaseTool {
+  name = 'get_current_time';
+  getConfig() {
+    return {
+      description:
+        'Returns the current UTC time as an ISO 8601 timestamp. ' +
+        'Call this before constructing time-partitioned queries that involve "now", "recent", or a relative time range. ' +
+        'Use the returned time as `to` and set `from` based on how frequently the data is updated — e.g. 1 hour back for hourly data, 1 day back for daily data. ' +
+        'Not needed for static file queries or queries for a specific known date range.',
+      inputSchema: {},
+    };
+  }
+  handler() {
+    return { content: [{ type: 'text', text: new Date().toISOString() }] };
+  }
+}

package/src/mcp/tools/list-files/list-files.js ADDED Viewed

@@ -0,0 +1,94 @@
+import { readFileSync } from 'node:fs';
+import { z } from 'zod';
+import BaseTool from '../base-tool.js';
+import { ListObjectsV2Command } from '@aws-sdk/client-s3';
+import { bigintReplacer } from '../../../utils/bigint-replacer.js';
+import { buildS3Client } from '../../../s3/s3.js';
+import { readParquetColumns } from '../../../utils/parquet-schema-reader.js';
+import { buildDatasetContext } from '../../utils/utils.js';
+const { S3_ACCESS_KEY_ID, S3_SECRET_ACCESS_KEY, S3_API_KEY, S3_ENDPOINT, S3_BUCKET } = process.env;
+const listFilesTemplate = readFileSync(new URL('../../descriptions/list-files.md', import.meta.url), 'utf8');
+export default class ListFilesTool extends BaseTool {
+  name = 'list_files';
+  getConfig() {
+    const today = new Date().toISOString().slice(0, 10);
+    const withDate = listFilesTemplate.replace('{{TODAY}}', today);
+    const datasetContext = buildDatasetContext(this.config.datasets);
+    const description = datasetContext ? `${withDate}\n${datasetContext}` : withDate;
+    return {
+      description,
+      inputSchema: {
+        prefix: z
+          .string()
+          .optional()
+          .describe('Path prefix to list under (e.g. "sales/" or ""). Defaults to empty string to list all files.'),
+        maxResults: z
+          .number()
+          .int()
+          .min(1)
+          .max(1000)
+          .optional()
+          .describe('Maximum number of files to return (default 100). Increase if the response is truncated.'),
+        endpoint: z.string().optional().describe('S3 endpoint URL. Overrides S3_ENDPOINT for this call.'),
+        bucket: z.string().optional().describe('S3 bucket name. Overrides S3_BUCKET for this call.'),
+      },
+    };
+  }
+  async handler({ prefix = '', maxResults = 100, endpoint, bucket }) {
+    const resolvedEndpoint = endpoint || S3_ENDPOINT;
+    const resolvedBucket = bucket || S3_BUCKET;
+    const s3Client = buildS3Client({
+      apiKey: S3_API_KEY,
+      accessKeyId: S3_ACCESS_KEY_ID,
+      secretAccessKey: S3_SECRET_ACCESS_KEY,
+      endpoint: resolvedEndpoint,
+    });
+    const response = await s3Client.send(
+      new ListObjectsV2Command({ Bucket: resolvedBucket, Prefix: prefix, MaxKeys: maxResults, Delimiter: '/' }),
+    );
+    const directories = (response.CommonPrefixes ?? []).map(({ Prefix }) => Prefix);
+    const files = (response.Contents ?? []).map(({ Key, Size }) => ({ file: Key, size: Size }));
+    const truncated = response.IsTruncated ?? false;
+    const representatives = getRepresentativeFiles(files);
+    const filesWithSchema = await Promise.all(
+      files.map((fileObj) => maybeAddSchema(s3Client, resolvedBucket, representatives, fileObj)),
+    );
+    return {
+      content: [
+        { type: 'text', text: JSON.stringify({ directories, files: filesWithSchema, truncated }, bigintReplacer) },
+      ],
+    };
+  }
+}
+/** Helpers */
+function getRepresentativeFiles(files) {
+  const parquetFiles = files.filter(({ file }) => file.endsWith('.parquet'));
+  const dirMap = parquetFiles.reduce(addFirstFilePerDir, new Map());
+  return new Set(dirMap.values());
+}
+function addFirstFilePerDir(acc, { file }) {
+  const dir = file.substring(0, file.lastIndexOf('/'));
+  if (!acc.has(dir)) acc.set(dir, file);
+  return acc;
+}
+function maybeAddSchema(s3Client, bucket, representatives, fileObj) {
+  if (representatives.has(fileObj.file)) return addSchema(s3Client, bucket, fileObj);
+  return Promise.resolve(fileObj);
+}
+async function addSchema(s3Client, bucket, { file, size }) {
+  if (!file.endsWith('.parquet')) return { file, size };
+  const columns = await readParquetColumns(s3Client, bucket, file).catch(() => null);
+  return { file, size, columns };
+}

package/src/mcp/tools/query/query.js ADDED Viewed

@@ -0,0 +1,68 @@
+import { readFileSync } from 'node:fs';
+import { z } from 'zod';
+import BaseTool from '../base-tool.js';
+import s3Querier, { bigintReplacer } from '../../../s3-querier.js';
+const {
+  S3_ACCESS_KEY_ID,
+  S3_SECRET_ACCESS_KEY,
+  S3_API_KEY,
+  S3_ENDPOINT,
+  S3_BUCKET,
+  S3_BUCKETS_DIR = '/tmp/s3-querier',
+} = process.env;
+const sqlDescription = readFileSync(new URL('../../descriptions/sql-param.md', import.meta.url), 'utf8');
+const toolDescription = readFileSync(new URL('../../descriptions/tool.md', import.meta.url), 'utf8');
+export default class QueryTool extends BaseTool {
+  name = 'query';
+  getConfig() {
+    const description = this.config.datasets?.length
+      ? `${toolDescription}\n\nCONFIGURED DATASETS: read the \`s3-querier://datasets\` resource for available datasets, prefixes, file path templates, and resource types.`
+      : toolDescription;
+    return {
+      description,
+      inputSchema: {
+        sql: z.string().describe(sqlDescription),
+        from: z
+          .string()
+          .optional()
+          .describe('Start of date range as ISO 8601 (e.g. "2025-01-01"). Required when the query uses date tokens.'),
+        to: z
+          .string()
+          .optional()
+          .describe('End of date range as ISO 8601 (e.g. "2025-01-31"). Required when the query uses date tokens.'),
+        endpoint: z.string().optional().describe('S3 endpoint URL. Overrides S3_ENDPOINT for this query.'),
+        bucket: z.string().optional().describe('S3 bucket name. Overrides S3_BUCKET for this query.'),
+      },
+    };
+  }
+  async handler({ sql, from, to, endpoint, bucket }) {
+    const fromMs = from ? new Date(from).getTime() : undefined;
+    const toMs = to ? new Date(to).getTime() : undefined;
+    const resolvedEndpoint = endpoint || S3_ENDPOINT;
+    const resolvedBucket = bucket || S3_BUCKET;
+    const results = await s3Querier({
+      query: sql,
+      from: fromMs,
+      to: toMs,
+      defaultEndpoint: resolvedEndpoint,
+      defaultBucket: resolvedBucket,
+      bucketsDir: S3_BUCKETS_DIR,
+      apiKey: S3_API_KEY,
+      accessKeyId: S3_ACCESS_KEY_ID,
+      secretAccessKey: S3_SECRET_ACCESS_KEY,
+      format: 'jsonRecords',
+    });
+    return {
+      content: [{ type: 'text', text: JSON.stringify(results, bigintReplacer) }],
+    };
+  }
+}

package/src/mcp/utils/utils.js ADDED Viewed

@@ -0,0 +1,26 @@
+export function buildDatasetContext(datasets) {
+  if (!datasets?.length) return '';
+  const datasetLines = datasets.flatMap(formatDataset);
+  return ['CONFIGURED DATASETS', '', ...datasetLines].join('\n');
+}
+function formatDataset({ name, description, bucket, endpoint, prefix, filePathTemplate, partitioning, files }) {
+  const header = description ? `${name} — ${description}` : name;
+  const lines = [header];
+  if (bucket) lines.push(`  Bucket: ${bucket}`);
+  if (endpoint) lines.push(`  Endpoint: ${endpoint}`);
+  if (prefix && !filePathTemplate) lines.push(`  Prefix: ${prefix}`);
+  if (prefix && filePathTemplate)
+    lines.push(`  Full path: ${prefix}${filePathTemplate}  ({file} = resource name from Files list)`);
+  if (partitioning) lines.push(`  Partitioning: ${partitioning}`);
+  if (files) {
+    const fileLines = Object.entries(files).flatMap(formatFileLine);
+    lines.push('  Files:', ...fileLines);
+  }
+  lines.push('');
+  return lines;
+}
+function formatFileLine([fileName, { description: fileDesc }]) {
+  return [`    ${fileDesc ? `${fileName} — ${fileDesc}` : fileName}`];
+}

package/src/mcp/handlers/list-files.js DELETED Viewed

@@ -1,63 +0,0 @@
-import { ListObjectsV2Command } from '@aws-sdk/client-s3';
-import { bigintReplacer } from '../../s3-querier.js';
-import { buildS3Client } from '../../s3/s3.js';
-import { readParquetColumns } from '../../utils/parquet-schema-reader.js';
-const { S3_ACCESS_KEY_ID, S3_SECRET_ACCESS_KEY, S3_API_KEY, S3_ENDPOINT, S3_BUCKET } = process.env;
-export async function handleListFiles({ prefix = '', maxResults = 100, endpoint, bucket }) {
-  const resolvedEndpoint = endpoint || S3_ENDPOINT;
-  const resolvedBucket = bucket || S3_BUCKET;
-  const clientConfig = {
-    apiKey: S3_API_KEY,
-    accessKeyId: S3_ACCESS_KEY_ID,
-    secretAccessKey: S3_SECRET_ACCESS_KEY,
-    endpoint: resolvedEndpoint,
-  };
-  const s3Client = buildS3Client(clientConfig);
-  const listCommand = new ListObjectsV2Command({
-    Bucket: resolvedBucket,
-    Prefix: prefix,
-    MaxKeys: maxResults,
-    Delimiter: '/',
-  });
-  const response = await s3Client.send(listCommand);
-  const directories = (response.CommonPrefixes ?? []).map(({ Prefix }) => Prefix);
-  const files = (response.Contents ?? []).map(({ Key, Size }) => ({ file: Key, size: Size }));
-  const truncated = response.IsTruncated ?? false;
-  const representatives = getRepresentativeFiles(files);
-  const filesWithSchema = await Promise.all(
-    files.map((fileObj) => maybeAddSchema(s3Client, resolvedBucket, representatives, fileObj)),
-  );
-  return {
-    content: [
-      { type: 'text', text: JSON.stringify({ directories, files: filesWithSchema, truncated }, bigintReplacer) },
-    ],
-  };
-}
-/** Helpers */
-async function addSchema(s3Client, bucket, { file, size }) {
-  if (!file.endsWith('.parquet')) return { file, size };
-  const columns = await readParquetColumns(s3Client, bucket, file).catch(() => null);
-  return { file, size, columns };
-}
-function maybeAddSchema(s3Client, bucket, representatives, fileObj) {
-  if (representatives.has(fileObj.file)) return addSchema(s3Client, bucket, fileObj);
-  return Promise.resolve(fileObj);
-}
-function getRepresentativeFiles(files) {
-  const parquetFiles = files.filter(({ file }) => file.endsWith('.parquet'));
-  const dirMap = parquetFiles.reduce(addFirstFilePerDir, new Map());
-  return new Set(dirMap.values());
-}
-function addFirstFilePerDir(acc, { file }) {
-  const dir = file.substring(0, file.lastIndexOf('/'));
-  if (!acc.has(dir)) acc.set(dir, file);
-  return acc;
-}

package/src/mcp/handlers/query.js DELETED Viewed

@@ -1,34 +0,0 @@
-import s3Querier, { bigintReplacer } from '../../s3-querier.js';
-const {
-  S3_ACCESS_KEY_ID,
-  S3_SECRET_ACCESS_KEY,
-  S3_API_KEY,
-  S3_ENDPOINT,
-  S3_BUCKET,
-  S3_BUCKETS_DIR = '/tmp/s3-querier',
-} = process.env;
-export async function handleQuery({ sql, from, to, endpoint, bucket }) {
-  const fromMs = from ? new Date(from).getTime() : undefined;
-  const toMs = to ? new Date(to).getTime() : undefined;
-  const resolvedEndpoint = endpoint || S3_ENDPOINT;
-  const resolvedBucket = bucket || S3_BUCKET;
-  const results = await s3Querier({
-    query: sql,
-    from: fromMs,
-    to: toMs,
-    defaultEndpoint: resolvedEndpoint,
-    defaultBucket: resolvedBucket,
-    bucketsDir: S3_BUCKETS_DIR,
-    apiKey: S3_API_KEY,
-    accessKeyId: S3_ACCESS_KEY_ID,
-    secretAccessKey: S3_SECRET_ACCESS_KEY,
-    format: 'jsonRecords',
-  });
-  return {
-    content: [{ type: 'text', text: JSON.stringify(results, bigintReplacer) }],
-  };
-}