s3-querier 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +203 -0
- package/docs/s3-querier.md +196 -0
- package/package.json +71 -0
- package/src/duck-db/index.js +57 -0
- package/src/plugins/avro/avro-plugin.js +64 -0
- package/src/plugins/query-finalizer/query-finalizer.js +41 -0
- package/src/plugins/query-parser/query-parser.js +33 -0
- package/src/s3/auth/ibm-iam-client.js +21 -0
- package/src/s3/auth/ibm-iam-token-manager.js +40 -0
- package/src/s3/s3.js +427 -0
- package/src/s3-querier.js +107 -0
- package/src/utils/bigint-replacer.js +13 -0
- package/src/utils/date-regex/date-regex.js +52 -0
- package/src/utils/file-path-builder/file-path-builder.js +55 -0
- package/src/utils/file-settings/file-settings.js +65 -0
- package/src/utils/logger.js +3 -0
- package/src/utils/path-parser/path-parser-grammar.js +39 -0
- package/src/utils/path-parser/path-parser.js +58 -0
- package/src/utils/sql-parser/sql-parser.js +31 -0
package/README.md
ADDED
|
@@ -0,0 +1,203 @@
|
|
|
1
|
+
# s3-querier
|
|
2
|
+
|
|
3
|
+
Query S3-compatible storage directly with DuckDB SQL. S3 Querier handles listing files, downloading them locally, and executing your query — turning a data lake into a queryable resource with a single function call.
|
|
4
|
+
|
|
5
|
+
## Requirements
|
|
6
|
+
|
|
7
|
+
- Node.js >= 22
|
|
8
|
+
- S3-compatible storage (AWS S3, MinIO, IBM COS, etc.) with HMAC or IBM IAM credentials
|
|
9
|
+
|
|
10
|
+
## Installation
|
|
11
|
+
|
|
12
|
+
```bash
|
|
13
|
+
npm install s3-querier
|
|
14
|
+
```
|
|
15
|
+
|
|
16
|
+
## Usage
|
|
17
|
+
|
|
18
|
+
```js
|
|
19
|
+
import s3Querier from 's3-querier';
|
|
20
|
+
|
|
21
|
+
const results = await s3Querier({
|
|
22
|
+
accessKeyId: 'your-access-key',
|
|
23
|
+
secretAccessKey: 'your-secret-key',
|
|
24
|
+
defaultEndpoint: 'https://s3.amazonaws.com',
|
|
25
|
+
defaultBucket: 'my-bucket',
|
|
26
|
+
bucketsDir: '/tmp/s3-cache',
|
|
27
|
+
from: new Date('2025-01-01').getTime(),
|
|
28
|
+
to: new Date('2025-01-31').getTime(),
|
|
29
|
+
query: `SELECT * FROM read_parquet('events/year={yyyy}/month={MM}/day={dd}/data.parquet')`,
|
|
30
|
+
format: 'jsonRecords',
|
|
31
|
+
});
|
|
32
|
+
```
|
|
33
|
+
|
|
34
|
+
## API
|
|
35
|
+
|
|
36
|
+
### `s3Querier(options)`
|
|
37
|
+
|
|
38
|
+
Returns a `Promise` that resolves to the query results.
|
|
39
|
+
|
|
40
|
+
| Option | Type | Required | Description |
|
|
41
|
+
| --- | --- | --- | --- |
|
|
42
|
+
| `accessKeyId` | `string` | ✓ | HMAC access key ID |
|
|
43
|
+
| `secretAccessKey` | `string` | ✓ | HMAC secret access key |
|
|
44
|
+
| `defaultEndpoint` | `string` | ✓ | S3 endpoint URL |
|
|
45
|
+
| `defaultBucket` | `string` | ✓ | Default bucket name |
|
|
46
|
+
| `bucketsDir` | `string` | ✓ | Local directory for caching downloaded files |
|
|
47
|
+
| `query` | `string` | ✓ | DuckDB SQL query |
|
|
48
|
+
| `from` | `number` | | Start of date range as a Unix timestamp (ms). Required when using date tokens. |
|
|
49
|
+
| `to` | `number` | | End of date range as a Unix timestamp (ms). Required when using date tokens. |
|
|
50
|
+
| `format` | `string` | | Output format. `'jsonRecords'` returns `[{ col: val }]`. Default is columnar `[{ name, fields: [val, ...] }]`. |
|
|
51
|
+
| `plugins` | `array` | | Additional plugins to extend query processing. |
|
|
52
|
+
|
|
53
|
+
### Environment Variables
|
|
54
|
+
|
|
55
|
+
| Variable | Default | Description |
|
|
56
|
+
| --- | --- | --- |
|
|
57
|
+
| `MAX_MB_DOWNLOAD` | `1000` | Maximum total download size in MB per query. Queries exceeding this limit throw an error. |
|
|
58
|
+
|
|
59
|
+
## Query Syntax
|
|
60
|
+
|
|
61
|
+
### Static Files
|
|
62
|
+
|
|
63
|
+
```sql
|
|
64
|
+
SELECT * FROM read_parquet('reports/summary.parquet') LIMIT 10;
|
|
65
|
+
```
|
|
66
|
+
|
|
67
|
+
### Date Tokens
|
|
68
|
+
|
|
69
|
+
When `from` and `to` are provided, date tokens are expanded into a list of matching file paths.
|
|
70
|
+
|
|
71
|
+
```sql
|
|
72
|
+
SELECT *
|
|
73
|
+
FROM read_parquet('events/year={yyyy}/month={MM}/day={dd}/data.parquet', union_by_name=1);
|
|
74
|
+
```
|
|
75
|
+
|
|
76
|
+
| Token | Description | Example |
|
|
77
|
+
| --- | --- | --- |
|
|
78
|
+
| `{yyyy}` | 4-digit year | `2025` |
|
|
79
|
+
| `{MM}` | 2-digit month | `01`–`12` |
|
|
80
|
+
| `{dd}` | 2-digit day | `01`–`31` |
|
|
81
|
+
| `{hh}` | 2-digit hour | `00`–`23` |
|
|
82
|
+
| `{mm}` | 2-digit minute | `00`–`59` |
|
|
83
|
+
| `{ss}` | 2-digit second | `00`–`59` |
|
|
84
|
+
|
|
85
|
+
### Glob Patterns
|
|
86
|
+
|
|
87
|
+
```sql
|
|
88
|
+
SELECT * FROM read_parquet('reports/2025/*.parquet', union_by_name=1);
|
|
89
|
+
```
|
|
90
|
+
|
|
91
|
+
### Location Tokens
|
|
92
|
+
|
|
93
|
+
Override the default endpoint and bucket per file reference within a query.
|
|
94
|
+
|
|
95
|
+
```sql
|
|
96
|
+
SELECT *
|
|
97
|
+
FROM read_parquet('{endpoint:https://s3.us-east.example.com}/{bucket:my-bucket}/data.parquet');
|
|
98
|
+
```
|
|
99
|
+
|
|
100
|
+
### Cross-Bucket Joins
|
|
101
|
+
|
|
102
|
+
```sql
|
|
103
|
+
SELECT s.id, s.event_type, r.description
|
|
104
|
+
FROM read_parquet('{bucket:events-bucket}/reports/summary.parquet') s
|
|
105
|
+
JOIN read_parquet('{bucket:reference-bucket}/lookup.parquet') r ON s.id = r.id;
|
|
106
|
+
```
|
|
107
|
+
|
|
108
|
+
### Cache Control
|
|
109
|
+
|
|
110
|
+
Append `?cache=false` to force a fresh download, bypassing the local cache.
|
|
111
|
+
|
|
112
|
+
```sql
|
|
113
|
+
SELECT * FROM read_parquet('reports/summary.parquet?cache=false');
|
|
114
|
+
```
|
|
115
|
+
|
|
116
|
+
## BigInt
|
|
117
|
+
|
|
118
|
+
> [!WARNING]
|
|
119
|
+
> DuckDB returns `BigInt` for `COUNT(*)`, `SUM`, and other integer aggregations. `BigInt` is not JSON-serializable — `JSON.stringify` will throw.
|
|
120
|
+
|
|
121
|
+
The safest fix is to cast in SQL:
|
|
122
|
+
|
|
123
|
+
```sql
|
|
124
|
+
SELECT CAST(COUNT(*) AS INTEGER) AS total FROM read_parquet('data.parquet')
|
|
125
|
+
```
|
|
126
|
+
|
|
127
|
+
If you can't control the query, use the exported `bigintReplacer` with `JSON.stringify`:
|
|
128
|
+
|
|
129
|
+
```js
|
|
130
|
+
import s3Querier, { bigintReplacer } from 's3-querier';
|
|
131
|
+
|
|
132
|
+
const results = await s3Querier({ ..., format: 'jsonRecords' });
|
|
133
|
+
const json = JSON.stringify(results, bigintReplacer);
|
|
134
|
+
```
|
|
135
|
+
|
|
136
|
+
Note: `bigintReplacer` converts `BigInt` to `Number`, which loses precision for values above `Number.MAX_SAFE_INTEGER` (~9 quadrillion). For large integer IDs or counters, prefer the SQL cast.
|
|
137
|
+
|
|
138
|
+
## Caching
|
|
139
|
+
|
|
140
|
+
Downloaded files are cached to `bucketsDir` on disk. Subsequent queries that reference the same files skip the download entirely. The listing cache (S3 object listings) is held in memory per process using an LRU cache, with today's prefix always re-fetched to pick up new files.
|
|
141
|
+
|
|
142
|
+
## Plugins
|
|
143
|
+
|
|
144
|
+
The `plugins` option accepts an array of plugin objects that can extend query parsing and file processing. A plugin may implement:
|
|
145
|
+
|
|
146
|
+
- `processQuery(context)` — transform the query context before execution
|
|
147
|
+
- `processFile(filePath)` — process each downloaded file (e.g. convert Avro to JSON)
|
|
148
|
+
|
|
149
|
+
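The built-in Avro plugin is an example. Note that `package.json` currently exports only the package root (`"."`), so the deep import below resolves only once a matching subpath is added to the `exports` map (otherwise Node.js throws `ERR_PACKAGE_PATH_NOT_EXPORTED`):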
|
|
150
|
+
|
|
151
|
+
```js
|
|
152
|
+
import s3Querier from 's3-querier';
|
|
153
|
+
import AvroPlugin from 's3-querier/src/plugins/avro/avro-plugin.js';
|
|
154
|
+
|
|
155
|
+
const results = await s3Querier({
|
|
156
|
+
// ...
|
|
157
|
+
plugins: [new AvroPlugin()],
|
|
158
|
+
query: `SELECT * FROM read_json('data.avro+json')`,
|
|
159
|
+
});
|
|
160
|
+
```
|
|
161
|
+
|
|
162
|
+
## Examples
|
|
163
|
+
|
|
164
|
+
The `examples/` directory in the source repository (it is not part of the published npm package, so clone the repository to use it) contains a local interactive demo and standalone scripts. All examples target a local MinIO instance — you'll need [Docker](https://docs.docker.com/get-docker/) and [Docker Compose](https://docs.docker.com/compose/install/) installed. Both are bundled with Docker Desktop on Mac and Windows; on Linux, install the Compose plugin separately.
|
|
165
|
+
|
|
166
|
+
### Interactive demo
|
|
167
|
+
|
|
168
|
+
Starts MinIO, seeds it with sample parquet data, and launches an Express server with a Monaco SQL editor in the browser.
|
|
169
|
+
|
|
170
|
+
```bash
|
|
171
|
+
npm run demo:up # start MinIO and seed data (runs once)
|
|
172
|
+
npm run demo:start # start the Express server
|
|
173
|
+
```
|
|
174
|
+
|
|
175
|
+
Then open [http://localhost:3000](http://localhost:3000). The editor has five pre-loaded example queries you can run or modify. When you're done:
|
|
176
|
+
|
|
177
|
+
```bash
|
|
178
|
+
npm run demo:down # stop MinIO
|
|
179
|
+
```
|
|
180
|
+
|
|
181
|
+
### Standalone scripts
|
|
182
|
+
|
|
183
|
+
Run any script directly after MinIO is up:
|
|
184
|
+
|
|
185
|
+
```bash
|
|
186
|
+
npm run demo:up # if not already running
|
|
187
|
+
node examples/scripts/basic-query.js # fetch the first 10 sales rows
|
|
188
|
+
node examples/scripts/glob-pattern.js # filter to Jan–Feb with a brace glob
|
|
189
|
+
node examples/scripts/date-range.js # use the from/to options with date tokens
|
|
190
|
+
node examples/scripts/ibm-cos.js # IBM Cloud Object Storage (requires env vars)
|
|
191
|
+
```
|
|
192
|
+
|
|
193
|
+
For the IBM COS script, set these environment variables first:
|
|
194
|
+
|
|
195
|
+
```bash
|
|
196
|
+
export IBM_COS_API_KEY=your-api-key
|
|
197
|
+
export IBM_COS_ENDPOINT=https://s3.us-south.cloud-object-storage.appdomain.cloud
|
|
198
|
+
export IBM_COS_BUCKET=your-bucket
|
|
199
|
+
```
|
|
200
|
+
|
|
201
|
+
## License
|
|
202
|
+
|
|
203
|
+
MIT
|
|
@@ -0,0 +1,196 @@
|
|
|
1
|
+
# S3 Querier
|
|
2
|
+
|
|
3
|
+
S3 Querier allows you to query data lake content directly using [DuckDB](https://duckdb.org/) queries. By parsing these queries, determining the necessary files, and dynamically downloading and processing them, S3 Querier transforms what is otherwise an opaque storage system into a user-friendly, queryable resource.
|
|
4
|
+
|
|
5
|
+
## Planning Your Queries
|
|
6
|
+
|
|
7
|
+
When querying data from a data lake, be mindful of how your queries are constructed. S3 Querier downloads files from S3-compatible storage before they can be queried using DuckDB, so query speed is directly influenced by the size and number of files involved.
|
|
8
|
+
|
|
9
|
+
### Key Considerations
|
|
10
|
+
|
|
11
|
+
1. **File Size And Query Efficiency**
|
|
12
|
+
Large files increase query time because they take longer to download. To optimize performance:
|
|
13
|
+
|
|
14
|
+
- Query only the columns you need. Avoid `SELECT *` without a `LIMIT`.
|
|
15
|
+
- Avoid overly broad queries that download unnecessary files, such as `read_parquet('my-bucket/*.parquet')`.
|
|
16
|
+
|
|
17
|
+
2. **1GB File Size Limit**
|
|
18
|
+
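By default, this service enforces a 1GB total download limit per query (configurable through the `MAX_MB_DOWNLOAD` environment variable, default `1000`). Queries whose files add up to more than this limit will fail.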
|
|
19
|
+
|
|
20
|
+
3. **Partitioning And Filtering**
|
|
21
|
+
Partition your data in the lake where possible. This lets you filter queries to target only relevant partitions, reducing unnecessary downloads.
|
|
22
|
+
|
|
23
|
+
### Tips For Better Query Planning
|
|
24
|
+
|
|
25
|
+
- **Test Locally First**
|
|
26
|
+
[Install the DuckDB CLI](https://duckdb.org/docs/installation/?version=stable&environment=cli&platform=macos&download_method=direct) and experiment with your queries on local parquet files before running them against S3. This gives a fast feedback loop for understanding data structure and refining queries.
|
|
27
|
+
|
|
28
|
+
- **Be Mindful Of Time Ranges In Date Tokens**
|
|
29
|
+
Long time ranges require fetching more files and slow execution. Use narrow time ranges whenever possible.
|
|
30
|
+
|
|
31
|
+
- **Create Secondary Representations Of Your Data**
|
|
32
|
+
For larger datasets, break files into smaller chunks to avoid hitting the file size limit.
|
|
33
|
+
|
|
34
|
+
- **Monitor Query Times**
|
|
35
|
+
If a query is slow, revisit the query logic and the files it accesses.
|
|
36
|
+
|
|
37
|
+
## Example Queries
|
|
38
|
+
|
|
39
|
+
Below are some examples of common use cases.
|
|
40
|
+
|
|
41
|
+
**Key Concepts:**
|
|
42
|
+
|
|
43
|
+
- Static files versus dynamic files. A static file is a file for which you know the exact location in S3. A dynamic file uses one or more file tokens to match a range of files. [See the file tokens section](#file-tokens-overview) for details.
|
|
44
|
+
- In most cases you'll want to use `union_by_name=1` when using `read_parquet` or `read_csv`. Read more about [why this is important](https://duckdb.org/2025/01/10/union-by-name.html).
|
|
45
|
+
|
|
46
|
+
### Querying A Single, Static File
|
|
47
|
+
|
|
48
|
+
```sql
|
|
49
|
+
SELECT * FROM
|
|
50
|
+
read_parquet('file1.parquet', union_by_name=1)
|
|
51
|
+
LIMIT 10;
|
|
52
|
+
```
|
|
53
|
+
|
|
54
|
+
### Getting Multiple, Static Files
|
|
55
|
+
|
|
56
|
+
When querying multiple files they should share the same or a similar schema. Use `union_by_name=1` to handle minor schema differences.
|
|
57
|
+
|
|
58
|
+
```sql
|
|
59
|
+
SELECT * FROM
|
|
60
|
+
read_parquet(['file1.parquet', 'file2.parquet'], union_by_name=1)
|
|
61
|
+
LIMIT 10;
|
|
62
|
+
```
|
|
63
|
+
|
|
64
|
+
### Querying Time-Related Files
|
|
65
|
+
|
|
66
|
+
Use date tokens to query files spanning a specific date range. When S3 Querier receives a `from` and `to` parameter, it automatically expands the file list to match the date tokens in your query.
|
|
67
|
+
|
|
68
|
+
#### Example
|
|
69
|
+
|
|
70
|
+
Given `from=2025-08-03` and `to=2025-08-06`, the following query:
|
|
71
|
+
|
|
72
|
+
```sql
|
|
73
|
+
SELECT id
|
|
74
|
+
FROM read_parquet('jobs_failed/year={yyyy}/month={MM}/day={dd}/servers.parquet', union_by_name=1);
|
|
75
|
+
```
|
|
76
|
+
|
|
77
|
+
Will resolve and download:
|
|
78
|
+
|
|
79
|
+
```
|
|
80
|
+
jobs_failed/year=2025/month=08/day=03/servers.parquet
|
|
81
|
+
jobs_failed/year=2025/month=08/day=04/servers.parquet
|
|
82
|
+
jobs_failed/year=2025/month=08/day=05/servers.parquet
|
|
83
|
+
jobs_failed/year=2025/month=08/day=06/servers.parquet
|
|
84
|
+
```
|
|
85
|
+
|
|
86
|
+
For more details, see [tips about date ranges](#tips-for-better-query-planning).
|
|
87
|
+
|
|
88
|
+
### Querying Files From Multiple Locations
|
|
89
|
+
|
|
90
|
+
Use location tokens to query files across different endpoints or buckets in a single query.
|
|
91
|
+
|
|
92
|
+
```sql
|
|
93
|
+
WITH us_south_data AS (
|
|
94
|
+
SELECT id, timestamp
|
|
95
|
+
FROM read_parquet('{endpoint:https://s3.us-south.example.com}/{bucket:my-bucket}/my_time_series/{yyyy}{MM}{dd}{hh}{mm}{ss}.parquet')
|
|
96
|
+
)
|
|
97
|
+
SELECT id, timestamp
|
|
98
|
+
FROM read_parquet('{endpoint:https://s3.us-east.example.com}/{bucket:my-bucket}/my_time_series_2/{yyyy}{MM}{dd}{hh}{mm}{ss}.parquet') AS us_east_data
|
|
99
|
+
JOIN us_south_data ON us_east_data.id = us_south_data.id;
|
|
100
|
+
```
|
|
101
|
+
|
|
102
|
+
### Tips And Utilities
|
|
103
|
+
|
|
104
|
+
See the DuckDB docs for [`read_parquet` parameters](https://duckdb.org/docs/stable/data/parquet/overview.html#parameters) and [`read_csv` parameters](https://duckdb.org/docs/stable/data/csv/overview#parameters).
|
|
105
|
+
|
|
106
|
+
#### Getting The File Name Of The File(s) Being Queried
|
|
107
|
+
|
|
108
|
+
Pass `filename=1` to `read_parquet` or `read_csv` to include the source file path as a column in results.
|
|
109
|
+
|
|
110
|
+
```sql
|
|
111
|
+
SELECT id, filename
|
|
112
|
+
FROM read_parquet('year={yyyy}/month={MM}/my-file.parquet', filename=1);
|
|
113
|
+
```
|
|
114
|
+
|
|
115
|
+
| id | filename |
|
|
116
|
+
| --- | ----------------------------------- |
|
|
117
|
+
| 1 | year=2025/month=01/my-file.parquet |
|
|
118
|
+
|
|
119
|
+
#### Extracting Partition Values From Hive-Style Paths
|
|
120
|
+
|
|
121
|
+
If your data uses Hive-style partitioning (e.g., `year=2025/month=04/day=20`), use `hive_partitioning=1` to extract partition keys as columns.
|
|
122
|
+
|
|
123
|
+
```sql
|
|
124
|
+
SELECT year, month, day, id
|
|
125
|
+
FROM read_parquet('jobs_failed/year=2025/month=01/day=19/my-file.parquet', hive_partitioning=1);
|
|
126
|
+
```
|
|
127
|
+
|
|
128
|
+
| id | year | month | day |
|
|
129
|
+
| -- | ---- | ----- | --- |
|
|
130
|
+
| 1 | 2025 | 01 | 19 |
|
|
131
|
+
|
|
132
|
+
---
|
|
133
|
+
|
|
134
|
+
## File Tokens Overview
|
|
135
|
+
|
|
136
|
+
File tokens allow you to create dynamic queries with patterns that vary based on time, non-time components, or storage location. There are three types: **Glob Syntax**, **Time Formatting Tokens**, and **Location Tokens**.
|
|
137
|
+
|
|
138
|
+
### Glob Syntax
|
|
139
|
+
|
|
140
|
+
Glob syntax handles file name segments that vary but are not time-related.
|
|
141
|
+
|
|
142
|
+
```
|
|
143
|
+
jobs_failed/window=202308032130/0.parquet
|
|
144
|
+
jobs_failed/window=202308032230/3.parquet
|
|
145
|
+
jobs_failed/window=202308032330/6.parquet
|
|
146
|
+
```
|
|
147
|
+
|
|
148
|
+
```sql
|
|
149
|
+
SELECT id
|
|
150
|
+
FROM read_parquet('jobs_failed/window=202308032130/*.parquet', union_by_name=1);
|
|
151
|
+
```
|
|
152
|
+
|
|
153
|
+
> [!WARNING]
|
|
154
|
+
> **Use globs with caution.** They can match more files than expected, causing unnecessary downloads and degraded performance. Always verify the file list your glob will match before running a broad query.
|
|
155
|
+
|
|
156
|
+
---
|
|
157
|
+
|
|
158
|
+
### Time Formatting Tokens
|
|
159
|
+
|
|
160
|
+
Time tokens dynamically match files based on time-related patterns in their names, based on [Unicode Technical Standard #35](https://unicode.org/reports/tr35/).
|
|
161
|
+
|
|
162
|
+
| **Token** | **Usage** | **Example Output** |
|
|
163
|
+
| ----------------- | -------------- | ------------------ |
|
|
164
|
+
| **Year** `{yyyy}` | 4-digit year | 1970, ..., 2030 |
|
|
165
|
+
| **Month** `{MM}` | 2-digit month | 01...12 |
|
|
166
|
+
| **Day** `{dd}` | 2-digit day | 01...31 |
|
|
167
|
+
| **Hour** `{hh}` | 2-digit hour | 00...23 |
|
|
168
|
+
| **Minute** `{mm}` | 2-digit minute | 00...59 |
|
|
169
|
+
| **Second** `{ss}` | 2-digit second | 00...59 |
|
|
170
|
+
|
|
171
|
+
```sql
|
|
172
|
+
SELECT id
|
|
173
|
+
FROM read_parquet('jobs_failed/window={yyyy}{MM}{dd}{hh}{mm}/*.parquet', union_by_name=1);
|
|
174
|
+
```
|
|
175
|
+
|
|
176
|
+
---
|
|
177
|
+
|
|
178
|
+
### Location Tokens
|
|
179
|
+
|
|
180
|
+
Location tokens let you vary the storage endpoint and bucket within a query.
|
|
181
|
+
|
|
182
|
+
| **Token** | **Usage** | **Example** |
|
|
183
|
+
| -------------------------- | -------------------------------- | ------------------------------------------------------------ |
|
|
184
|
+
| **Endpoint** `{endpoint:}` | Specifies a storage endpoint URL | `{endpoint:http://s3.example.com}/my-bucket/file.parquet` |
|
|
185
|
+
| **Bucket** `{bucket:}` | Specifies a storage bucket | `{bucket:my-bucket}/file.parquet` |
|
|
186
|
+
|
|
187
|
+
```sql
|
|
188
|
+
SELECT id
|
|
189
|
+
FROM read_parquet('{endpoint:http://s3.example.com}/{bucket:my-bucket}/jobs_failed/window={yyyy}{MM}{dd}{hh}{mm}/*.parquet');
|
|
190
|
+
```
|
|
191
|
+
|
|
192
|
+
**Benefits:**
|
|
193
|
+
|
|
194
|
+
1. **Cross-Endpoint Queries** — Query data stored on different S3-compatible endpoints in a single query.
|
|
195
|
+
2. **Cross-Bucket Queries** — Access data from multiple buckets without separate queries.
|
|
196
|
+
3. **Dynamic Query Construction** — Combine location tokens with glob syntax and time tokens for fully dynamic, cross-location queries.
|
package/package.json
ADDED
|
@@ -0,0 +1,71 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "s3-querier",
|
|
3
|
+
"version": "1.0.0",
|
|
4
|
+
"description": "Query S3-compatible storage with DuckDB and SQL",
|
|
5
|
+
"type": "module",
|
|
6
|
+
"main": "src/s3-querier.js",
|
|
7
|
+
"exports": {
|
|
8
|
+
".": "./src/s3-querier.js"
|
|
9
|
+
},
|
|
10
|
+
"files": [
|
|
11
|
+
"src/**/*.js",
|
|
12
|
+
"!src/**/*.test.js",
|
|
13
|
+
"docs/",
|
|
14
|
+
"README.md"
|
|
15
|
+
],
|
|
16
|
+
"repository": {
|
|
17
|
+
"type": "git",
|
|
18
|
+
"url": "git+ssh://git@github.com/grommett/s3-querier.git"
|
|
19
|
+
},
|
|
20
|
+
"author": "david@pinkiering.com",
|
|
21
|
+
"scripts": {
|
|
22
|
+
"test": "node --test \"./src/**/*.test.js\"",
|
|
23
|
+
"test:e2e": "docker compose -f e2e/docker-compose.yml up -d --wait && node e2e/setup/seed.js && node --test e2e/*.e2e.js; docker compose -f e2e/docker-compose.yml down",
|
|
24
|
+
"test:coverage:html": "c8 -x coverage -x **/*.test.js --all -r html node --test \"./src/**/*.test.js\"",
|
|
25
|
+
"prettify": "prettier \"./src/**/*.js\" --write",
|
|
26
|
+
"lint": "eslint \"./src/**/*.js\"",
|
|
27
|
+
"lint:fix": "eslint --fix \"./src/**/*.js\"",
|
|
28
|
+
"prepare": "husky",
|
|
29
|
+
"demo:up": "docker compose -f examples/demo/docker-compose.yml up -d --wait && node examples/demo/seed.js",
|
|
30
|
+
"demo:down": "docker compose -f examples/demo/docker-compose.yml down",
|
|
31
|
+
"demo:start": "node examples/demo/server.js"
|
|
32
|
+
},
|
|
33
|
+
"engines": {
|
|
34
|
+
"node": ">=22"
|
|
35
|
+
},
|
|
36
|
+
"keywords": [
|
|
37
|
+
"s3",
|
|
38
|
+
"duckdb",
|
|
39
|
+
"parquet",
|
|
40
|
+
"query"
|
|
41
|
+
],
|
|
42
|
+
"license": "MIT",
|
|
43
|
+
"dependencies": {
|
|
44
|
+
"@aws-sdk/client-s3": "^3.0.0",
|
|
45
|
+
"@derekstride/tree-sitter-sql": "^0.3.11",
|
|
46
|
+
"@duckdb/node-api": "^1.5.3-r.3",
|
|
47
|
+
"avsc": "^5.7.7",
|
|
48
|
+
"date-fns": "^4.0.0",
|
|
49
|
+
"lru-cache": "^11.0.0",
|
|
50
|
+
"peggy": "^5.1.0",
|
|
51
|
+
"pino": "^10.3.1",
|
|
52
|
+
"tree-sitter": "^0.21.1"
|
|
53
|
+
},
|
|
54
|
+
"lint-staged": {
|
|
55
|
+
"src/**/*.js": [
|
|
56
|
+
"prettier --write",
|
|
57
|
+
"eslint"
|
|
58
|
+
]
|
|
59
|
+
},
|
|
60
|
+
"devDependencies": {
|
|
61
|
+
"@eslint/js": "^10.0.1",
|
|
62
|
+
"c8": "^11.0.0",
|
|
63
|
+
"eslint": "^10.5.0",
|
|
64
|
+
"esmock": "^2.7.5",
|
|
65
|
+
"express": "^5.2.1",
|
|
66
|
+
"globals": "^17.6.0",
|
|
67
|
+
"husky": "^9.1.7",
|
|
68
|
+
"lint-staged": "^17.0.7",
|
|
69
|
+
"prettier": "^3.0.3"
|
|
70
|
+
}
|
|
71
|
+
}
|
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
import { DuckDBInstance } from '@duckdb/node-api';
|
|
2
|
+
import { logger } from '../utils/logger.js';
|
|
3
|
+
|
|
4
|
+
// Single in-memory DuckDB instance, created once when this module is first loaded
// and shared by every call to `query`.
const DUCKDB_OPTIONS = { threads: 4 };
const db = await DuckDBInstance.create(':memory:', DUCKDB_OPTIONS);

// Lookup table from the requested `format` option to the function that shapes the
// result. `default` is the fallback used when no (or an unknown) format is requested.
const formatStrategies = {
  default: formatColumnar,
  jsonRecords: formatJsonRecords,
};
|
|
12
|
+
|
|
13
|
+
/**
 * Execute a SQL query and return results in the specified format.
 *
 * A fresh connection is opened on the shared DuckDB instance for every call and is
 * always closed again, whether the query succeeds or throws.
 *
 * @param {string} sql - The SQL query to execute
 * @param {object} options - Query options
 * @param {string} options.format - Output format: 'jsonRecords' for row objects, otherwise columnar (default)
 * @returns {Promise<Array>} Query results in the requested format
 * @throws Re-throws (after logging) any error raised while connecting, running or formatting
 */
export async function query(sql, options = {}) {
  const { format } = options;
  const queryStart = Date.now();
  let connection;

  try {
    connection = await db.connect();
    const reader = await connection.runAndReadAll(sql);
    const columnsResult = reader.getColumnsObjectJS();

    // Only own keys select a strategy; otherwise a format such as 'constructor' or
    // 'toString' would resolve to a function inherited from Object.prototype.
    const formatter = Object.hasOwn(formatStrategies, format) ? formatStrategies[format] : formatStrategies.default;
    const result = formatter(columnsResult);

    const queryTime = Date.now() - queryStart;
    logger.info(`Query completed in : ${queryTime / 1000} seconds`);
    return result ?? [];
  } catch (error) {
    logger.error(error);
    throw error;
  } finally {
    // Release the connection explicitly instead of waiting for garbage collection;
    // `connection` is still undefined when `db.connect()` itself failed.
    connection?.closeSync();
  }
}
|
|
41
|
+
|
|
42
|
+
/**
 * Shape a column-keyed result as a list of `{ name, fields }` entries, one per column.
 *
 * @param {Object<string, Array>} columnsResult - Column name -> array of column values
 * @returns {Array<{ name: string, fields: Array }>} One entry per column, in key order
 */
function formatColumnar(columnsResult) {
  return Object.entries(columnsResult).map(([name, fields]) => ({ name, fields }));
}

/**
 * Pivot a column-keyed result into row objects (`[{ col: val }, ...]`).
 *
 * The row count is taken from the first column. Rows are built with
 * `Object.fromEntries`, which defines own properties, so a column that happens to be
 * named `__proto__` is kept as data instead of being swallowed by the prototype setter.
 *
 * @param {Object<string, Array>} columnsResult - Column name -> array of column values
 * @returns {Array<Object>} One object per row; empty when there are no columns
 */
function formatJsonRecords(columnsResult) {
  const columns = Object.entries(columnsResult);
  if (columns.length === 0) return [];

  const rowCount = columns[0][1].length;
  return Array.from({ length: rowCount }, (_, rowIndex) =>
    Object.fromEntries(columns.map(([name, values]) => [name, values[rowIndex]])),
  );
}
|
|
@@ -0,0 +1,64 @@
|
|
|
1
|
+
import { createWriteStream } from 'node:fs';
import fsPromise from 'node:fs/promises';
import { pipeline } from 'node:stream/promises';

import avro from 'avsc';

import QueryParserPlugin from '../query-parser/query-parser.js';
|
|
6
|
+
|
|
7
|
+
// Matches file references whose name ends in `.avro`, optionally followed by a query
// string such as `?cache=false`.
const AVRO_EXTENSION = /\.avro(\?|$)/i;

/**
 * Plugin that lets queries reference Avro container files: the query is rewritten to
 * point at `.json` files and each downloaded `.avro` file is converted to JSON.
 */
class AvroPlugin extends QueryParserPlugin {
  name = 'AvroPlugin';

  /**
   * Adds the Avro file settings found in the query to the context and rewrites the
   * query so every `.avro` occurrence reads `.json`.
   *
   * @param {object} context - Query context (`query`, `settings`, `endpoint`, `defaultBucket`, ...)
   * @returns {object} A new context with the extended `settings` and the rewritten `query`
   */
  processQuery(context) {
    const { query, settings, endpoint, defaultBucket } = context;
    const avroSettings = this.getFiles({ endpoint, defaultBucket, query });
    const processedQuery = replaceAvroExtension(query);
    return { ...context, settings: [...settings, ...avroSettings], query: processedQuery };
  }

  /**
   * Returns only the file settings that reference Avro files, with `sqlFileReference`
   * set to the file name as it appears after the `.avro` -> `.json` rewrite.
   *
   * @param {object} params
   * @param {string} params.endpoint
   * @param {string} params.defaultBucket
   * @param {string} params.query
   * @returns {Array<object>} File settings for the Avro files referenced by the query
   */
  getFiles({ endpoint, defaultBucket, query }) {
    return super
      .getFiles({ endpoint, defaultBucket, query })
      .filter((setting) => AVRO_EXTENSION.test(setting.file))
      .map((setting) => ({ ...setting, sqlFileReference: replaceAvroExtension(setting.file) }));
  }

  /**
   * Converts an avro file to a newline-delimited json file.
   *
   * Files without an `.avro` segment are returned untouched, and an already converted
   * file is reused. On failure the partially written output is removed so that a later
   * call does not mistake it for a finished conversion.
   *
   * @param {string} file
   * @returns {Promise<string>} A promise that resolves to the processed file's name
   * @throws {Error} When decoding or writing fails; the underlying error is attached as `cause`
   */
  async processFile(file) {
    // Same case-insensitive rewrite that is applied to the query, so the converted
    // file name agrees with what the rewritten query refers to.
    const jsonFile = replaceAvroExtension(file);
    if (jsonFile === file) return file;
    if (await fileExists(jsonFile)) return jsonFile;

    try {
      // avsc's file decoder is an object-mode stream of decoded records, so every
      // record is serialized to one JSON line before it reaches the byte-oriented file
      // stream. `pipeline` forwards errors from every stage and destroys the streams.
      await pipeline(
        avro.createFileDecoder(file),
        async function* toJsonLines(records) {
          for await (const record of records) {
            yield `${JSON.stringify(record)}\n`;
          }
        },
        createWriteStream(jsonFile),
      );
      return jsonFile;
    } catch (cause) {
      // Best-effort cleanup; a failure to delete must not mask the conversion error.
      await fsPromise.rm(jsonFile, { force: true }).catch(() => {});
      throw new Error(`Error converting avro to json for ${file}`, { cause });
    }
  }
}

/**
 * Replaces every `.avro` occurrence (case-insensitive) in the given text with `.json`.
 *
 * @param {string} query - A SQL query or a file path
 * @returns {string} The text with all `.avro` occurrences rewritten
 */
function replaceAvroExtension(query) {
  return query.replace(/\.avro/gi, '.json');
}

/**
 * Reports whether a path can be stat'ed.
 *
 * @param {string} file
 * @returns {Promise<boolean>} `true` when `stat` succeeds, `false` on any error
 */
async function fileExists(file) {
  try {
    await fsPromise.stat(file);
    return true;
  } catch {
    return false;
  }
}

export default AvroPlugin;
|
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
import { removeFileDatePatterns } from '../../utils/date-regex/date-regex.js';
|
|
2
|
+
import {
|
|
3
|
+
removeFileSettingTokens,
|
|
4
|
+
removeDoubleFwdSlash,
|
|
5
|
+
removeCacheSettings,
|
|
6
|
+
} from '../../utils/file-settings/file-settings.js';
|
|
7
|
+
|
|
8
|
+
/**
 * Final query-processing step: swaps every file reference in the SQL for the local
 * path of the corresponding file and strips the remaining custom tokens.
 */
export default class QueryFinalizerPlugin {
  name = 'CorePlugin';

  /**
   * @param {object} context - Query context (`settings`, `bucketsDir`, `query`, ...)
   * @returns {object} A new context whose `query` has been prepared for execution
   */
  processQuery(context) {
    const { settings, bucketsDir, query } = context;
    const processedQuery = QueryFinalizerPlugin.prepareQuery(settings, bucketsDir, query);
    return { ...context, query: processedQuery };
  }

  /**
   * Replaces each setting's SQL file reference with `${bucketsDir}/${bucket}/${file}`
   * and then removes file-setting tokens, date patterns, cache settings and doubled
   * forward slashes from the query.
   *
   * @param {Array<object>} settings - File settings (`sqlFileReference`, `bucket`, `file`)
   * @param {string} bucketsDir - Local directory the buckets are stored under
   * @param {string} query - The SQL query to rewrite
   * @returns {string} The rewritten query
   */
  static prepareQuery(settings, bucketsDir, query) {
    let prepared = query;

    settings.forEach((setting) => {
      const searchPattern = setting.sqlFileReference.replace(/\?cache=(true|false)/i, '');
      const fileRegexStr = QueryFinalizerPlugin.prepareFileRegexStr(searchPattern);
      const localPath = `${bucketsDir}/${setting.bucket}/${setting.file}`;
      // A replacer function inserts the path verbatim; a plain replacement string would
      // expand `$&`, `$1`, `$$`, ... if they occurred in a directory or file name.
      prepared = prepared.replace(new RegExp(fileRegexStr, 'gi'), () => localPath);
    });
    prepared = removeFileSettingTokens(prepared);
    prepared = removeFileDatePatterns(prepared);
    prepared = removeCacheSettings(prepared);
    prepared = removeDoubleFwdSlash(prepared);

    return prepared;
  }

  /**
   * Escapes a file reference so it can be used as a literal inside a `RegExp`.
   *
   * Every regular-expression metacharacter is escaped, so references containing glob
   * or path characters such as `?`, `[0-9]`, `(1)` or `|` match themselves literally.
   *
   * @param {string} fileStr - The file reference as written in the query
   * @returns {string} Regular-expression source matching `fileStr` literally
   */
  static prepareFileRegexStr(fileStr) {
    return fileStr.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
  }
}
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
import { extractFileReferences } from '../../utils/sql-parser/sql-parser.js';
|
|
2
|
+
import { parseFilePath } from '../../utils/path-parser/path-parser.js';
|
|
3
|
+
|
|
4
|
+
/**
 * Base plugin: extracts the file references of a query and turns them into file
 * settings. Subclasses can override `getFiles` and `processFile`.
 */
class QueryParserPlugin {
  name = 'BasePlugin';

  /**
   * Appends the file settings found in the query to the context's settings.
   *
   * @param {object} context - Query context (`settings`, `endpoint`, `defaultBucket`, `query`, ...)
   * @returns {object} A new context with the extended `settings`
   */
  processQuery(context) {
    const { settings, endpoint, defaultBucket, query } = context;
    const fileSettings = this.getFiles({ endpoint, defaultBucket, query });
    return { ...context, settings: [...settings, ...fileSettings] };
  }

  /**
   * @param {object} params
   * @param {string} params.endpoint - Endpoint used when a reference does not name one
   * @param {string} params.defaultBucket - Bucket used when a reference does not name one
   * @param {string} params.query - The SQL query to scan
   * @returns {Array<object>} One file setting per file reference in the query
   */
  getFiles({ endpoint, defaultBucket, query }) {
    return extractFileReferences(query).map((ref) => toFileSetting(ref, endpoint, defaultBucket));
  }

  /**
   * Default file hook: leaves the file untouched.
   *
   * @param {string} file
   * @returns {Promise<string>} A promise that resolves to the same file name
   */
  static processFile(file) {
    return Promise.resolve(file);
  }

  /**
   * Instance-level counterpart of the static hook, so plugin instances expose
   * `processFile` the same way subclasses that override it do.
   *
   * @param {string} file
   * @returns {Promise<string>} A promise that resolves to the same file name
   */
  processFile(file) {
    return QueryParserPlugin.processFile(file);
  }
}

/**
 * Builds a file setting from a raw SQL file reference, falling back to the default
 * endpoint and bucket when the parsed reference leaves them null or undefined.
 *
 * @param {{ raw: string }} ref - File reference; `raw` is the text as written in the query
 * @param {string} defaultEndpoint
 * @param {string} defaultBucket
 * @returns {{ endpoint: string, bucket: string, file: string, cache: *, sqlFileReference: string }}
 */
function toFileSetting({ raw }, defaultEndpoint, defaultBucket) {
  const { endpoint, bucket, file, cache } = parseFilePath(raw);
  return {
    endpoint: endpoint ?? defaultEndpoint,
    bucket: bucket ?? defaultBucket,
    file,
    cache,
    sqlFileReference: raw,
  };
}

export default QueryParserPlugin;
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
import { S3Client } from '@aws-sdk/client-s3';
|
|
2
|
+
import { IbmIamTokenManager } from './ibm-iam-token-manager.js';
|
|
3
|
+
|
|
4
|
+
/**
 * Builds an S3 client that authenticates with an IBM IAM bearer token.
 *
 * The client is created with placeholder HMAC credentials; a middleware added at the
 * `finalizeRequest` step sets the `Authorization` header from the token manager on
 * every request.
 *
 * @param {object} config - S3 client configuration, spread into the `S3Client` options
 * @param {string} apiKey - API key handed to the IBM IAM token manager
 * @returns {S3Client} The configured client
 */
export function buildIbmIamClient(config, apiKey) {
  const tokenManager = new IbmIamTokenManager(apiKey);
  const client = new S3Client({ ...config, credentials: { accessKeyId: 'ibm-iam', secretAccessKey: 'ibm-iam' } });
  client.middlewareStack.add(ibmIamMiddleware(tokenManager), {
    step: 'finalizeRequest',
    priority: 'low',
    name: 'ibmIamAuth',
  });
  return client;
}

/**
 * Creates the middleware that authorizes each request with a bearer token.
 *
 * @param {{ getToken: function(): Promise<string> }} tokenManager - Source of the current token
 * @returns {function} Middleware that sets `Authorization: Bearer <token>` and calls `next`
 */
function ibmIamMiddleware(tokenManager) {
  return (next) => async (args) => {
    const token = await tokenManager.getToken();
    const { headers } = args.request;

    // Header names are case-insensitive on the wire but not as object keys. Drop any
    // existing variant (e.g. a lowercase `authorization`) so the request carries
    // exactly one Authorization header.
    for (const headerName of Object.keys(headers)) {
      if (headerName.toLowerCase() === 'authorization') delete headers[headerName];
    }
    headers['Authorization'] = `Bearer ${token}`;
    return next(args);
  };
}
|