s3-querier 1.1.0 → 1.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "s3-querier",
|
|
3
|
-
"version": "1.1.0",
|
|
3
|
+
"version": "1.1.2",
|
|
4
4
|
"description": "Query S3-compatible storage with DuckDB and SQL",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"main": "src/s3-querier.js",
|
|
@@ -9,11 +9,12 @@
|
|
|
9
9
|
},
|
|
10
10
|
"exports": {
|
|
11
11
|
".": "./src/s3-querier.js",
|
|
12
|
-
"./mcp": "./src/mcp/
|
|
12
|
+
"./mcp": "./src/mcp/s3querier-mcp.js"
|
|
13
13
|
},
|
|
14
14
|
"files": [
|
|
15
15
|
"src/**/*.js",
|
|
16
16
|
"!src/**/*.test.js",
|
|
17
|
+
"src/mcp/descriptions/",
|
|
17
18
|
"docs/",
|
|
18
19
|
"README.md"
|
|
19
20
|
],
|
|
@@ -25,7 +26,7 @@
|
|
|
25
26
|
"scripts": {
|
|
26
27
|
"test": "node --test \"./src/**/*.test.js\"",
|
|
27
28
|
"test:e2e": "docker compose -f e2e/docker-compose.yml up -d --wait && node e2e/setup/seed.js && node --test e2e/*.e2e.js; docker compose -f e2e/docker-compose.yml down",
|
|
28
|
-
"test:coverage:html": "c8 -x coverage -x **/*.test.js --all -r html node --test \"./src/**/*.test.js\"",
|
|
29
|
+
"test:coverage:html": "c8 -x coverage -x **/*.test.js -x src/s3-querier.js -x e2e -x examples -x '*.config.js' --all -r html node --test \"./src/**/*.test.js\"",
|
|
29
30
|
"prettify": "prettier \"./src/**/*.js\" --write",
|
|
30
31
|
"lint": "eslint \"./src/**/*.js\"",
|
|
31
32
|
"lint:fix": "eslint --fix \"./src/**/*.js\"",
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
List files and sub-directories in an S3 bucket at a given prefix level.
|
|
2
|
+
|
|
3
|
+
Returns:
|
|
4
|
+
- directories: sub-prefixes to drill into (e.g. "env=production/", "year=2026/")
|
|
5
|
+
- files: objects at this level, each with its path and size, plus column names for Parquet files
|
|
6
|
+
- truncated: true if there are more results — narrow the prefix or increase maxResults
|
|
7
|
+
|
|
8
|
+
NAVIGATION STRATEGY
|
|
9
|
+
|
|
10
|
+
Start with an empty prefix to see the top-level structure. Drill into directories
|
|
11
|
+
one level at a time. Each call only shows what is directly under the given prefix,
|
|
12
|
+
not the full recursive tree.
|
|
13
|
+
|
|
14
|
+
HIVE-PARTITIONED DATA
|
|
15
|
+
|
|
16
|
+
Today's date is {{TODAY}}.
|
|
17
|
+
|
|
18
|
+
Many datasets use Hive-style partitioning with partition keys in the path:
|
|
19
|
+
year=YYYY/month=MM/day=DD/hour=HH/minute=mm/
|
|
20
|
+
|
|
21
|
+
To find the most recent data:
|
|
22
|
+
1. List down to the partition root (e.g. env=production/) — one call per structural level
|
|
23
|
+
2. Then STOP listing. Do NOT list year, month, or day directories individually.
|
|
24
|
+
3. Construct the date segment directly from today: year=YYYY/month=MM/day=DD/
|
|
25
|
+
4. Append it to the partition root and list from the day directory to find the latest hour.
|
|
26
|
+
|
|
27
|
+
Example: if you reach env=production/ and today is 2026-06-14, your next call is
|
|
28
|
+
prefix: "…/env=production/year=2026/month=06/day=14/"
|
|
29
|
+
Then list that to find the available hours, pick the highest, and continue.
|
|
30
|
+
|
|
31
|
+
Only fall back to listing year/month/day if the constructed path returns no directories and no files.
|
|
32
|
+
|
|
33
|
+
Always check for a "latest/" directory first — it often contains the most recent
|
|
34
|
+
snapshot and avoids navigating the full partition tree.
|
|
@@ -0,0 +1,81 @@
|
|
|
1
|
+
A DuckDB SQL query. File paths inside read_parquet() or read_csv() are resolved against S3
|
|
2
|
+
and downloaded before the query runs.
|
|
3
|
+
|
|
4
|
+
REQUIRED: always call read_parquet() or read_csv() — plain table names are not supported.
|
|
5
|
+
Prefer union_by_name=1 when reading multiple files.
|
|
6
|
+
|
|
7
|
+
SCHEMA DISCOVERY: Never guess column names. Before writing any query that joins files or
|
|
8
|
+
references specific columns, run SELECT * FROM read_parquet('path') LIMIT 1 on each file
|
|
9
|
+
to inspect the actual schema. Only then write the real query.
|
|
10
|
+
|
|
11
|
+
FILE PATH TOKENS
|
|
12
|
+
|
|
13
|
+
Date tokens — expanded using the `from` and `to` parameters:
|
|
14
|
+
{yyyy} 4-digit year e.g. 2025
|
|
15
|
+
{MM} 2-digit month e.g. 08
|
|
16
|
+
{dd} 2-digit day e.g. 03
|
|
17
|
+
{hh} 2-digit hour e.g. 14
|
|
18
|
+
{mm} 2-digit minute e.g. 30
|
|
19
|
+
{ss} 2-digit second e.g. 00
|
|
20
|
+
|
|
21
|
+
Location tokens — override endpoint or bucket per path:
|
|
22
|
+
{endpoint:https://s3.example.com}
|
|
23
|
+
{bucket:my-bucket}
|
|
24
|
+
|
|
25
|
+
Glob syntax — wildcard matching for non-time path segments:
|
|
26
|
+
jobs/window=202308032130/*.parquet
|
|
27
|
+
|
|
28
|
+
QUERYING TIME-PARTITIONED DATA OVER A RANGE
|
|
29
|
+
|
|
30
|
+
When data is partitioned by time and you need multiple hours, days, or months, use date
|
|
31
|
+
tokens in the SQL with `from`/`to` as separate parameters. ONE query with tokens downloads
|
|
32
|
+
all matching files across the range — do not make multiple tool calls with hardcoded dates.
|
|
33
|
+
|
|
34
|
+
✗ WRONG — three separate tool calls with hardcoded hours:
|
|
35
|
+
sql: SELECT * FROM read_parquet('events/year=2026/month=06/day=15/hour=12/data.parquet')
|
|
36
|
+
sql: SELECT * FROM read_parquet('events/year=2026/month=06/day=15/hour=13/data.parquet')
|
|
37
|
+
sql: SELECT * FROM read_parquet('events/year=2026/month=06/day=15/hour=14/data.parquet')
|
|
38
|
+
|
|
39
|
+
✓ CORRECT — one tool call, tokens expand across all hours in the range:
|
|
40
|
+
sql: SELECT * FROM read_parquet('events/year={yyyy}/month={MM}/day={dd}/hour={hh}/data.parquet', union_by_name=1)
|
|
41
|
+
from: 2026-06-15T12:00:00Z
|
|
42
|
+
to: 2026-06-15T14:59:59Z
|
|
43
|
+
|
|
44
|
+
Tokens also expand inside the filename, not just in path directory segments:
|
|
45
|
+
data/year={yyyy}/month={MM}/day={dd}/hour={hh}/file_{yyyy}{MM}{dd}{hh}00.parquet
|
|
46
|
+
→ s3-querier downloads one file per hour in the from/to range.
|
|
47
|
+
|
|
48
|
+
Hardcoding a date is fine for a single known file or a fixed point-in-time lookup where
|
|
49
|
+
you are not spanning multiple time segments.
|
|
50
|
+
|
|
51
|
+
HIVE-PARTITIONED DATA
|
|
52
|
+
|
|
53
|
+
For paths partitioned only by year and month (no day segment), use {yyyy} and {MM} together:
|
|
54
|
+
sales/year={yyyy}/month={MM}/data.parquet
|
|
55
|
+
|
|
56
|
+
s3-querier generates one prefix per calendar month in the from/to range, so a Q1 query
|
|
57
|
+
(from=2024-01-01, to=2024-03-31) fetches exactly months 01, 02, 03 — not all of 2024.
|
|
58
|
+
|
|
59
|
+
Do NOT use character-class globs like month=0[1-3] in file paths — s3-querier does not expand them.
|
|
60
|
+
Use {yyyy}/{MM} tokens instead, which s3-querier expands correctly.
|
|
61
|
+
|
|
62
|
+
EXAMPLES
|
|
63
|
+
|
|
64
|
+
Single file:
|
|
65
|
+
SELECT * FROM read_parquet('reports/summary.parquet') LIMIT 10
|
|
66
|
+
|
|
67
|
+
Hour-partitioned files — tokens in path and filename (requires from/to):
|
|
68
|
+
SELECT * FROM read_parquet(
|
|
69
|
+
'events/year={yyyy}/month={MM}/day={dd}/hour={hh}/file_{yyyy}{MM}{dd}{hh}00.parquet',
|
|
70
|
+
union_by_name=1)
|
|
71
|
+
|
|
72
|
+
Day-partitioned files (requires from/to):
|
|
73
|
+
SELECT id FROM read_parquet('events/year={yyyy}/month={MM}/day={dd}/data.parquet', union_by_name=1)
|
|
74
|
+
|
|
75
|
+
Month-partitioned files (no day segment — use {yyyy}/{MM} only):
|
|
76
|
+
SELECT * FROM read_parquet('sales/year={yyyy}/month={MM}/data.parquet', union_by_name=1)
|
|
77
|
+
|
|
78
|
+
Cross-endpoint join:
|
|
79
|
+
WITH east AS (SELECT id FROM read_parquet('{endpoint:https://s3.us-east.example.com}/{bucket:logs}/data/{yyyy}{MM}{dd}.parquet'))
|
|
80
|
+
SELECT * FROM read_parquet('{endpoint:https://s3.eu-west.example.com}/{bucket:logs}/data/{yyyy}{MM}{dd}.parquet') AS west
|
|
81
|
+
JOIN east ON west.id = east.id
|
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
Download files from S3-compatible storage and execute a DuckDB SQL query against them.
|
|
2
|
+
|
|
3
|
+
Queries must use DuckDB table functions such as read_parquet() or read_csv() with file paths
|
|
4
|
+
that reference objects in S3. s3-querier resolves those paths, downloads the matching files,
|
|
5
|
+
and runs the query locally with DuckDB.
|
|
6
|
+
|
|
7
|
+
Use the `s3-querier://docs` resource for full documentation including token syntax, examples,
|
|
8
|
+
and query planning tips.
|