s3-querier 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md ADDED
@@ -0,0 +1,203 @@
1
+ # s3-querier
2
+
3
+ Query S3-compatible storage directly with DuckDB SQL. S3 Querier handles listing files, downloading them locally, and executing your query — turning a data lake into a queryable resource with a single function call.
4
+
5
+ ## Requirements
6
+
7
+ - Node.js >= 22
8
+ - S3-compatible storage (AWS S3, MinIO, IBM COS, etc.) with HMAC or IBM IAM credentials
9
+
10
+ ## Installation
11
+
12
+ ```bash
13
+ npm install s3-querier
14
+ ```
15
+
16
+ ## Usage
17
+
18
+ ```js
19
+ import s3Querier from 's3-querier';
20
+
21
+ const results = await s3Querier({
22
+ accessKeyId: 'your-access-key',
23
+ secretAccessKey: 'your-secret-key',
24
+ defaultEndpoint: 'https://s3.amazonaws.com',
25
+ defaultBucket: 'my-bucket',
26
+ bucketsDir: '/tmp/s3-cache',
27
+ from: new Date('2025-01-01').getTime(),
28
+ to: new Date('2025-01-31').getTime(),
29
+ query: `SELECT * FROM read_parquet('events/year={yyyy}/month={MM}/day={dd}/data.parquet')`,
30
+ format: 'jsonRecords',
31
+ });
32
+ ```
33
+
34
+ ## API
35
+
36
+ ### `s3Querier(options)`
37
+
38
+ Returns a `Promise` that resolves to the query results.
39
+
40
+ | Option | Type | Required | Description |
41
+ | --- | --- | --- | --- |
42
+ | `accessKeyId` | `string` | ✓ | HMAC access key ID |
43
+ | `secretAccessKey` | `string` | ✓ | HMAC secret access key |
44
+ | `defaultEndpoint` | `string` | ✓ | S3 endpoint URL |
45
+ | `defaultBucket` | `string` | ✓ | Default bucket name |
46
+ | `bucketsDir` | `string` | ✓ | Local directory for caching downloaded files |
47
+ | `query` | `string` | ✓ | DuckDB SQL query |
48
+ | `from` | `number` | | Start of date range as a Unix timestamp (ms). Required when using date tokens. |
49
+ | `to` | `number` | | End of date range as a Unix timestamp (ms). Required when using date tokens. |
50
+ | `format` | `string` | | Output format. `'jsonRecords'` returns `[{ col: val }]`. Default is columnar `[{ name, fields: [val, ...] }]`. |
51
+ | `plugins` | `array` | | Additional plugins to extend query processing. |
52
+
53
+ ### Environment Variables
54
+
55
+ | Variable | Default | Description |
56
+ | --- | --- | --- |
57
+ | `MAX_MB_DOWNLOAD` | `1000` | Maximum total download size in MB per query. Queries exceeding this limit throw an error. |
58
+
59
+ ## Query Syntax
60
+
61
+ ### Static Files
62
+
63
+ ```sql
64
+ SELECT * FROM read_parquet('reports/summary.parquet') LIMIT 10;
65
+ ```
66
+
67
+ ### Date Tokens
68
+
69
+ When `from` and `to` are provided, date tokens are expanded into a list of matching file paths.
70
+
71
+ ```sql
72
+ SELECT *
73
+ FROM read_parquet('events/year={yyyy}/month={MM}/day={dd}/data.parquet', union_by_name=1);
74
+ ```
75
+
76
+ | Token | Description | Example |
77
+ | --- | --- | --- |
78
+ | `{yyyy}` | 4-digit year | `2025` |
79
+ | `{MM}` | 2-digit month | `01`–`12` |
80
+ | `{dd}` | 2-digit day | `01`–`31` |
81
+ | `{hh}` | 2-digit hour | `00`–`23` |
82
+ | `{mm}` | 2-digit minute | `00`–`59` |
83
+ | `{ss}` | 2-digit second | `00`–`59` |
84
+
85
+ ### Glob Patterns
86
+
87
+ ```sql
88
+ SELECT * FROM read_parquet('reports/2025/*.parquet', union_by_name=1);
89
+ ```
90
+
91
+ ### Location Tokens
92
+
93
+ Override the default endpoint and bucket per file reference within a query.
94
+
95
+ ```sql
96
+ SELECT *
97
+ FROM read_parquet('{endpoint:https://s3.us-east.example.com}/{bucket:my-bucket}/data.parquet');
98
+ ```
99
+
100
+ ### Cross-Bucket Joins
101
+
102
+ ```sql
103
+ SELECT s.id, s.event_type, r.description
104
+ FROM read_parquet('{bucket:events-bucket}/reports/summary.parquet') s
105
+ JOIN read_parquet('{bucket:reference-bucket}/lookup.parquet') r ON s.id = r.id;
106
+ ```
107
+
108
+ ### Cache Control
109
+
110
+ Append `?cache=false` to force a fresh download, bypassing the local cache.
111
+
112
+ ```sql
113
+ SELECT * FROM read_parquet('reports/summary.parquet?cache=false');
114
+ ```
115
+
116
+ ## BigInt
117
+
118
+ > [!WARNING]
119
+ > DuckDB returns `BigInt` for `COUNT(*)`, `SUM`, and other integer aggregations. `BigInt` is not JSON-serializable — `JSON.stringify` will throw.
120
+
121
+ The safest fix is to cast in SQL:
122
+
123
+ ```sql
124
+ SELECT CAST(COUNT(*) AS INTEGER) AS total FROM read_parquet('data.parquet')
125
+ ```
126
+
127
+ If you can't control the query, use the exported `bigintReplacer` with `JSON.stringify`:
128
+
129
+ ```js
130
+ import s3Querier, { bigintReplacer } from 's3-querier';
131
+
132
+ const results = await s3Querier({ ..., format: 'jsonRecords' });
133
+ const json = JSON.stringify(results, bigintReplacer);
134
+ ```
135
+
136
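+ Note: `bigintReplacer` converts `BigInt` to `Number`, which loses precision for values above `Number.MAX_SAFE_INTEGER` (~9 quadrillion). The `INTEGER` cast shown above only fits 32-bit values, so for large integer IDs or counters cast the column to `VARCHAR` in SQL and handle it as a string instead.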
137
+
138
+ ## Caching
139
+
140
+ Downloaded files are cached to `bucketsDir` on disk. Subsequent queries that reference the same files skip the download entirely. The listing cache (S3 object listings) is held in memory per process using an LRU cache, with today's prefix always re-fetched to pick up new files.
141
+
142
+ ## Plugins
143
+
144
+ The `plugins` option accepts an array of plugin objects that can extend query parsing and file processing. A plugin may implement:
145
+
146
+ - `processQuery(context)` — transform the query context before execution
147
+ - `processFile(filePath)` — process each downloaded file (e.g. convert Avro to JSON)
148
+
149
+ The built-in Avro plugin is an example:
150
+
151
+ ```js
152
+ import s3Querier from 's3-querier';
153
+ import AvroPlugin from 's3-querier/src/plugins/avro/avro-plugin.js';
154
+
155
+ const results = await s3Querier({
156
+ // ...
157
+ plugins: [new AvroPlugin()],
158
+ query: `SELECT * FROM read_json('data.avro+json')`,
159
+ });
160
+ ```
161
+
162
+ ## Examples
163
+
164
+ The `examples/` directory contains a local interactive demo and standalone scripts. All examples target a local MinIO instance — you'll need [Docker](https://docs.docker.com/get-docker/) and [Docker Compose](https://docs.docker.com/compose/install/) installed. Both are bundled with Docker Desktop on Mac and Windows; on Linux, install the Compose plugin separately.
165
+
166
+ ### Interactive demo
167
+
168
+ Starts MinIO, seeds it with sample parquet data, and launches an Express server with a Monaco SQL editor in the browser.
169
+
170
+ ```bash
171
+ npm run demo:up # start MinIO and seed data (runs once)
172
+ npm run demo:start # start the Express server
173
+ ```
174
+
175
+ Then open [http://localhost:3000](http://localhost:3000). The editor has five pre-loaded example queries you can run or modify. When you're done:
176
+
177
+ ```bash
178
+ npm run demo:down # stop MinIO
179
+ ```
180
+
181
+ ### Standalone scripts
182
+
183
+ Run any script directly after MinIO is up:
184
+
185
+ ```bash
186
+ npm run demo:up # if not already running
187
+ node examples/scripts/basic-query.js # fetch the first 10 sales rows
188
+ node examples/scripts/glob-pattern.js # filter to Jan–Feb with a brace glob
189
+ node examples/scripts/date-range.js # use `from`/`to` with date tokens such as {yyyy}/{MM}/{dd}
190
+ node examples/scripts/ibm-cos.js # IBM Cloud Object Storage (requires env vars)
191
+ ```
192
+
193
+ For the IBM COS script, set these environment variables first:
194
+
195
+ ```bash
196
+ export IBM_COS_API_KEY=your-api-key
197
+ export IBM_COS_ENDPOINT=https://s3.us-south.cloud-object-storage.appdomain.cloud
198
+ export IBM_COS_BUCKET=your-bucket
199
+ ```
200
+
201
+ ## License
202
+
203
+ MIT
@@ -0,0 +1,196 @@
1
+ # S3 Querier
2
+
3
+ S3 Querier allows you to query data lake content directly using [DuckDB](https://duckdb.org/) queries. By parsing these queries, determining the necessary files, and dynamically downloading and processing them, S3 Querier transforms what is otherwise an opaque storage system into a user-friendly, queryable resource.
4
+
5
+ ## Planning Your Queries
6
+
7
+ When querying data from a data lake, be mindful of how your queries are constructed. S3 Querier downloads files from S3-compatible storage before they can be queried using DuckDB, so query speed is directly influenced by the size and number of files involved.
8
+
9
+ ### Key Considerations
10
+
11
+ 1. **File Size And Query Efficiency**
12
+ Large files increase query time because they take longer to download. To optimize performance:
13
+
14
+ - Query only the columns you need. Avoid `SELECT *` without a `LIMIT`.
15
+ - Avoid overly broad queries that download unnecessary files, such as `read_parquet('my-bucket/*.parquet')`.
16
+
17
+ 2. **1GB File Size Limit**
18
+ By default, S3 Querier enforces a 1 GB total download limit per query (configurable through the `MAX_MB_DOWNLOAD` environment variable). Queries whose files add up to more than this limit will fail.
19
+
20
+ 3. **Partitioning And Filtering**
21
+ Partition your data in the lake where possible. This lets you filter queries to target only relevant partitions, reducing unnecessary downloads.
22
+
23
+ ### Tips For Better Query Planning
24
+
25
+ - **Test Locally First**
26
+ [Install the DuckDB CLI](https://duckdb.org/docs/installation/?version=stable&environment=cli&platform=macos&download_method=direct) and experiment with your queries on local parquet files before running them against S3. This gives a fast feedback loop for understanding data structure and refining queries.
27
+
28
+ - **Be Mindful Of Time Ranges In Date Tokens**
29
+ Long time ranges require fetching more files and slow execution. Use narrow time ranges whenever possible.
30
+
31
+ - **Create Secondary Representations Of Your Data**
32
+ For larger datasets, break files into smaller chunks to avoid hitting the file size limit.
33
+
34
+ - **Monitor Query Times**
35
+ If a query is slow, revisit the query logic and the files it accesses.
36
+
37
+ ## Example Queries
38
+
39
+ Below are some examples of common use cases.
40
+
41
+ **Key Concepts:**
42
+
43
+ - Static files versus dynamic files. A static file is a file for which you know the exact location in S3. A dynamic file uses one or more file tokens to match a range of files. [See the file tokens section](#file-tokens-overview) for details.
44
+ - In most cases you'll want to use `union_by_name=1` when using `read_parquet` or `read_csv`. Read more about [why this is important](https://duckdb.org/2025/01/10/union-by-name.html).
45
+
46
+ ### Querying A Single, Static File
47
+
48
+ ```sql
49
+ SELECT * FROM
50
+ read_parquet('file1.parquet', union_by_name=1)
51
+ LIMIT 10;
52
+ ```
53
+
54
+ ### Getting Multiple, Static Files
55
+
56
+ When querying multiple files they should share the same or a similar schema. Use `union_by_name=1` to handle minor schema differences.
57
+
58
+ ```sql
59
+ SELECT * FROM
60
+ read_parquet(['file1.parquet', 'file2.parquet'], union_by_name=1)
61
+ LIMIT 10;
62
+ ```
63
+
64
+ ### Querying Time-Related Files
65
+
66
+ Use date tokens to query files spanning a specific date range. When S3 Querier receives a `from` and `to` parameter, it automatically expands the file list to match the date tokens in your query.
67
+
68
+ #### Example
69
+
70
+ Given `from=2025-08-03` and `to=2025-08-06`, the following query:
71
+
72
+ ```sql
73
+ SELECT id
74
+ FROM read_parquet('jobs_failed/year={yyyy}/month={MM}/day={dd}/servers.parquet', union_by_name=1);
75
+ ```
76
+
77
+ Will resolve and download:
78
+
79
+ ```
80
+ jobs_failed/year=2025/month=08/day=03/servers.parquet
81
+ jobs_failed/year=2025/month=08/day=04/servers.parquet
82
+ jobs_failed/year=2025/month=08/day=05/servers.parquet
83
+ jobs_failed/year=2025/month=08/day=06/servers.parquet
84
+ ```
85
+
86
+ For more details, see [tips about date ranges](#tips-for-better-query-planning).
87
+
88
+ ### Querying Files From Multiple Locations
89
+
90
+ Use location tokens to query files across different endpoints or buckets in a single query.
91
+
92
+ ```sql
93
+ WITH us_south_data AS (
94
+ SELECT id, timestamp
95
+ FROM read_parquet('{endpoint:https://s3.us-south.example.com}/{bucket:my-bucket}/my_time_series/{yyyy}{MM}{dd}{hh}{mm}{ss}.parquet')
96
+ )
97
+ SELECT id, timestamp
98
+ FROM read_parquet('{endpoint:https://s3.us-east.example.com}/{bucket:my-bucket}/my_time_series_2/{yyyy}{MM}{dd}{hh}{mm}{ss}.parquet') AS us_east_data
99
+ JOIN us_south_data ON us_east_data.id = us_south_data.id;
100
+ ```
101
+
102
+ ### Tips And Utilities
103
+
104
+ See the DuckDB docs for [`read_parquet` parameters](https://duckdb.org/docs/stable/data/parquet/overview.html#parameters) and [`read_csv` parameters](https://duckdb.org/docs/stable/data/csv/overview#parameters).
105
+
106
+ #### Getting The File Name Of The File(s) Being Queried
107
+
108
+ Pass `filename=1` to `read_parquet` or `read_csv` to include the source file path as a column in results.
109
+
110
+ ```sql
111
+ SELECT id, filename
112
+ FROM read_parquet('year={yyyy}/month={MM}/my-file.parquet', filename=1);
113
+ ```
114
+
115
+ | id | filename |
116
+ | --- | ----------------------------------- |
117
+ | 1 | year=2025/month=01/my-file.parquet |
118
+
119
+ #### Extracting Partition Values From Hive-Style Paths
120
+
121
+ If your data uses Hive-style partitioning (e.g., `year=2025/month=04/day=20`), use `hive_partitioning=1` to extract partition keys as columns.
122
+
123
+ ```sql
124
+ SELECT year, month, day, id
125
+ FROM read_parquet('jobs_failed/year=2025/month=01/day=19/my-file.parquet', hive_partitioning=1);
126
+ ```
127
+
128
+ | id | year | month | day |
129
+ | -- | ---- | ----- | --- |
130
+ | 1 | 2025 | 01 | 19 |
131
+
132
+ ---
133
+
134
+ ## File Tokens Overview
135
+
136
+ File tokens allow you to create dynamic queries with patterns that vary based on time, non-time components, or storage location. There are three types: **Glob Syntax**, **Time Formatting Tokens**, and **Location Tokens**.
137
+
138
+ ### Glob Syntax
139
+
140
+ Glob syntax handles file name segments that vary but are not time-related.
141
+
142
+ ```
143
+ jobs_failed/window=202308032130/0.parquet
144
+ jobs_failed/window=202308032230/3.parquet
145
+ jobs_failed/window=202308032330/6.parquet
146
+ ```
147
+
148
+ ```sql
149
+ SELECT id
150
+ FROM read_parquet('jobs_failed/window=202308032130/*.parquet', union_by_name=1);
151
+ ```
152
+
153
+ > [!WARNING]
154
+ > **Use globs with caution.** They can match more files than expected, causing unnecessary downloads and degraded performance. Always verify the file list your glob will match before running a broad query.
155
+
156
+ ---
157
+
158
+ ### Time Formatting Tokens
159
+
160
+ Time tokens dynamically match files by time-related patterns in their names. The token syntax follows [Unicode Technical Standard #35](https://unicode.org/reports/tr35/).
161
+
162
+ | **Token** | **Usage** | **Example Output** |
163
+ | ----------------- | -------------- | ------------------ |
164
+ | **Year** `{yyyy}` | 4-digit year | 1970, ..., 2030 |
165
+ | **Month** `{MM}` | 2-digit month | 01...12 |
166
+ | **Day** `{dd}` | 2-digit day | 01...31 |
167
+ | **Hour** `{hh}` | 2-digit hour | 00...23 |
168
+ | **Minute** `{mm}` | 2-digit minute | 00...59 |
169
+ | **Second** `{ss}` | 2-digit second | 00...59 |
170
+
171
+ ```sql
172
+ SELECT id
173
+ FROM read_parquet('jobs_failed/window={yyyy}{MM}{dd}{hh}{mm}/*.parquet', union_by_name=1);
174
+ ```
175
+
176
+ ---
177
+
178
+ ### Location Tokens
179
+
180
+ Location tokens let you vary the storage endpoint and bucket within a query.
181
+
182
+ | **Token** | **Usage** | **Example** |
183
+ | -------------------------- | -------------------------------- | ------------------------------------------------------------ |
184
+ | **Endpoint** `{endpoint:}` | Specifies a storage endpoint URL | `{endpoint:http://s3.example.com}/my-bucket/file.parquet` |
185
+ | **Bucket** `{bucket:}` | Specifies a storage bucket | `{bucket:my-bucket}/file.parquet` |
186
+
187
+ ```sql
188
+ SELECT id
189
+ FROM read_parquet('{endpoint:http://s3.example.com}/{bucket:my-bucket}/jobs_failed/window={yyyy}{MM}{dd}{hh}{mm}/*.parquet');
190
+ ```
191
+
192
+ **Benefits:**
193
+
194
+ 1. **Cross-Endpoint Queries** — Query data stored on different S3-compatible endpoints in a single query.
195
+ 2. **Cross-Bucket Queries** — Access data from multiple buckets without separate queries.
196
+ 3. **Dynamic Query Construction** — Combine location tokens with glob syntax and time tokens for fully dynamic, cross-location queries.
package/package.json ADDED
@@ -0,0 +1,71 @@
1
+ {
2
+ "name": "s3-querier",
3
+ "version": "1.0.0",
4
+ "description": "Query S3-compatible storage with DuckDB and SQL",
5
+ "type": "module",
6
+ "main": "src/s3-querier.js",
7
+ "exports": {
8
+ ".": "./src/s3-querier.js"
9
+ },
10
+ "files": [
11
+ "src/**/*.js",
12
+ "!src/**/*.test.js",
13
+ "docs/",
14
+ "README.md"
15
+ ],
16
+ "repository": {
17
+ "type": "git",
18
+ "url": "git+ssh://git@github.com/grommett/s3-querier.git"
19
+ },
20
+ "author": "david@pinkiering.com",
21
+ "scripts": {
22
+ "test": "node --test \"./src/**/*.test.js\"",
23
+ "test:e2e": "docker compose -f e2e/docker-compose.yml up -d --wait && node e2e/setup/seed.js && node --test e2e/*.e2e.js; docker compose -f e2e/docker-compose.yml down",
24
+ "test:coverage:html": "c8 -x coverage -x **/*.test.js --all -r html node --test \"./src/**/*.test.js\"",
25
+ "prettify": "prettier \"./src/**/*.js\" --write",
26
+ "lint": "eslint \"./src/**/*.js\"",
27
+ "lint:fix": "eslint --fix \"./src/**/*.js\"",
28
+ "prepare": "husky",
29
+ "demo:up": "docker compose -f examples/demo/docker-compose.yml up -d --wait && node examples/demo/seed.js",
30
+ "demo:down": "docker compose -f examples/demo/docker-compose.yml down",
31
+ "demo:start": "node examples/demo/server.js"
32
+ },
33
+ "engines": {
34
+ "node": ">=22"
35
+ },
36
+ "keywords": [
37
+ "s3",
38
+ "duckdb",
39
+ "parquet",
40
+ "query"
41
+ ],
42
+ "license": "MIT",
43
+ "dependencies": {
44
+ "@aws-sdk/client-s3": "^3.0.0",
45
+ "@derekstride/tree-sitter-sql": "^0.3.11",
46
+ "@duckdb/node-api": "^1.5.3-r.3",
47
+ "avsc": "^5.7.7",
48
+ "date-fns": "^4.0.0",
49
+ "lru-cache": "^11.0.0",
50
+ "peggy": "^5.1.0",
51
+ "pino": "^10.3.1",
52
+ "tree-sitter": "^0.21.1"
53
+ },
54
+ "lint-staged": {
55
+ "src/**/*.js": [
56
+ "prettier --write",
57
+ "eslint"
58
+ ]
59
+ },
60
+ "devDependencies": {
61
+ "@eslint/js": "^10.0.1",
62
+ "c8": "^11.0.0",
63
+ "eslint": "^10.5.0",
64
+ "esmock": "^2.7.5",
65
+ "express": "^5.2.1",
66
+ "globals": "^17.6.0",
67
+ "husky": "^9.1.7",
68
+ "lint-staged": "^17.0.7",
69
+ "prettier": "^3.0.3"
70
+ }
71
+ }
@@ -0,0 +1,57 @@
1
+ import { DuckDBInstance } from '@duckdb/node-api';
2
+ import { logger } from '../utils/logger.js';
3
+
4
// One in-memory DuckDB instance, created when this module is first loaded and
// shared by every query() call.
const db = await DuckDBInstance.create(':memory:', { threads: 4 });

// Formatter lookup keyed by the `format` option; `default` is the columnar fallback.
const formatStrategies = {
  default: formatColumnar,
  jsonRecords: formatJsonRecords,
};
12
+
13
/**
 * Execute a SQL query and return results in the specified format.
 *
 * A new connection is opened on the shared DuckDB instance for each call and is
 * closed again before the function settles, whether the query succeeded or failed.
 *
 * @param {string} sql - The SQL query to execute
 * @param {object} [options] - Query options
 * @param {string} [options.format] - Output format: 'jsonRecords' for row objects, otherwise columnar (default)
 * @returns {Promise<Array>} Query results in the requested format
 * @throws Re-throws any error raised while connecting, running or formatting, after logging it
 */
export async function query(sql, options = {}) {
  const { format } = options;
  const queryStart = new Date();
  let connection;

  try {
    connection = await db.connect();
    const reader = await connection.runAndReadAll(sql);
    const columnsResult = reader.getColumnsObjectJS();

    // Own-property lookup only: a format such as 'toString' or 'constructor' must fall
    // back to the default formatter instead of resolving to an Object.prototype member.
    const formatter = Object.hasOwn(formatStrategies, format) ? formatStrategies[format] : formatStrategies.default;
    const result = formatter(columnsResult);

    const queryTime = new Date() - queryStart;
    logger.info(`Query completed in : ${queryTime / 1000} seconds`);
    return result ?? [];
  } catch (error) {
    logger.error(error);
    throw error;
  } finally {
    // Release the per-call connection explicitly instead of leaving it to garbage collection.
    // `connection` is still undefined when db.connect() itself failed.
    connection?.closeSync();
  }
}
41
+
42
/**
 * Convert a column-keyed result object into the columnar output shape.
 *
 * @param {Object<string, Array>} columnsResult - Column name mapped to that column's values
 * @returns {Array<{name: string, fields: Array}>} One `{ name, fields }` entry per column, in key order
 */
function formatColumnar(columnsResult) {
  return Object.entries(columnsResult).map(([name, fields]) => ({ name, fields }));
}
45
+
46
/**
 * Convert a column-keyed result object into an array of row objects.
 *
 * The row count is taken from the first column; every column is then read at
 * each row index.
 *
 * @param {Object<string, Array>} columnsResult - Column name mapped to that column's values
 * @returns {Array<object>} One `{ column: value }` object per row; empty when there are no columns
 */
function formatJsonRecords(columnsResult) {
  const columnNames = Object.keys(columnsResult);
  if (columnNames.length === 0) return [];

  const [firstColumn] = columnNames;
  const rowCount = columnsResult[firstColumn].length;
  const records = [];

  for (let rowIndex = 0; rowIndex < rowCount; rowIndex += 1) {
    const record = {};
    for (const columnName of columnNames) {
      record[columnName] = columnsResult[columnName][rowIndex];
    }
    records.push(record);
  }

  return records;
}
@@ -0,0 +1,64 @@
1
+ import { createWriteStream } from 'node:fs';
2
+ import fsPromise from 'node:fs/promises';
3
+ import avro from 'avsc';
4
+
5
+ import QueryParserPlugin from '../query-parser/query-parser.js';
6
+
7
// Matches a `.avro` extension at the end of a path or directly before a `?` (e.g. `?cache=false`).
const AVRO_EXTENSION = /\.avro(\?|$)/i;

/**
 * Query plugin for avro files: registers the avro files a query references,
 * rewrites `.avro` to `.json` in the query text, and converts downloaded avro
 * files into `.json` files next to the original.
 */
class AvroPlugin extends QueryParserPlugin {
  name = 'AvroPlugin';

  /**
   * Adds a file setting for each avro file referenced by the query and rewrites
   * the query so it points at the converted `.json` files.
   *
   * @param {object} context - Query context; `query`, `settings`, `endpoint` and `defaultBucket` are read
   * @returns {object} Copy of the context with the avro settings appended and the rewritten query
   */
  processQuery(context) {
    const { query, settings, endpoint, defaultBucket } = context;
    const avroSettings = this.getFiles({ endpoint, defaultBucket, query });
    const processedQuery = replaceAvroExtension(query);
    return { ...context, settings: [...settings, ...avroSettings], query: processedQuery };
  }

  /**
   * Returns the base plugin's file settings restricted to avro files, with
   * `sqlFileReference` set to the file path after `.avro` is replaced by `.json`.
   *
   * @param {object} params
   * @param {string} params.endpoint - Default endpoint
   * @param {string} params.defaultBucket - Default bucket
   * @param {string} params.query - SQL query text
   * @returns {Array<object>} File settings for the avro files only
   */
  getFiles({ endpoint, defaultBucket, query }) {
    return super
      .getFiles({ endpoint, defaultBucket, query })
      .filter((setting) => AVRO_EXTENSION.test(setting.file))
      .map((setting) => ({ ...setting, sqlFileReference: setting.file.replace(/\.avro/gi, '.json') }));
  }

  /**
   * Converts an avro file to json file
   *
   * Files whose name does not contain `.avro` are returned untouched, and an
   * already existing `.json` file is reused without converting again. If the
   * conversion fails, the partially written `.json` file is removed so that a
   * later call does not mistake it for a finished conversion.
   *
   * @param {string} file
   * @returns {Promise<string>} A promise that resolves to the processed file's name
   */
  processFile(file) {
    if (!file.includes('.avro')) return Promise.resolve(file);
    const errorMsg = `Error converting avro to json for ${file}`;
    const jsonFile = file.replace('.avro', '.json');

    return fileExists(jsonFile).then((exists) => {
      if (exists) return jsonFile;

      return new Promise((resolve, reject) => {
        let failed = false;
        let decoder;
        let fileStream;

        const fail = (cause) => {
          if (failed) return;
          failed = true;
          decoder?.destroy();
          fileStream?.destroy();
          // Best-effort cleanup: the conversion error is what the caller needs to see,
          // so a failure to delete the partial file is deliberately ignored.
          fsPromise
            .rm(jsonFile, { force: true })
            .catch(() => {})
            .then(() => reject(new Error(errorMsg, { cause })));
        };

        try {
          fileStream = createWriteStream(jsonFile);
          // NOTE(review): avsc decoders appear to emit decoded records rather than bytes by
          // default; confirm the write stream receives data it can write and that the
          // resulting file is JSON DuckDB can read.
          decoder = avro.createFileDecoder(file);
          // destroy() also emits 'close', so only resolve when no failure was recorded.
          fileStream.on('close', () => {
            if (!failed) resolve(jsonFile);
          });
          fileStream.on('error', fail);
          // Without this listener a decode error is an unhandled stream error and the
          // promise would never settle.
          decoder.on('error', fail);
          decoder.pipe(fileStream);
        } catch (cause) {
          fail(cause);
        }
      });
    });
  }
}
50
+
51
+ function replaceAvroExtension(query) {
52
+ return query.replace(/\.avro/gi, '.json');
53
+ }
54
+
55
+ async function fileExists(file) {
56
+ try {
57
+ await fsPromise.stat(file);
58
+ return true;
59
+ } catch {
60
+ return false;
61
+ }
62
+ }
63
+
64
+ export default AvroPlugin;
@@ -0,0 +1,41 @@
1
+ import { removeFileDatePatterns } from '../../utils/date-regex/date-regex.js';
2
+ import {
3
+ removeFileSettingTokens,
4
+ removeDoubleFwdSlash,
5
+ removeCacheSettings,
6
+ } from '../../utils/file-settings/file-settings.js';
7
+
8
/**
 * Final query-processing step: replaces each file reference in the SQL with the
 * path of its local copy under `bucketsDir`, then strips the remaining file-setting
 * tokens, date patterns, cache settings and doubled forward slashes.
 */
export default class QueryFinalizerPlugin {
  name = 'CorePlugin';

  /**
   * @param {object} context - Query context; `settings`, `bucketsDir` and `query` are read
   * @returns {object} Copy of the context whose `query` is the prepared query
   */
  processQuery(context) {
    const { settings, bucketsDir, query } = context;
    const processedQuery = QueryFinalizerPlugin.prepareQuery(settings, bucketsDir, query);
    return { ...context, query: processedQuery };
  }

  /**
   * Rewrites `query` so every `sqlFileReference` in `settings` points at
   * `${bucketsDir}/${bucket}/${file}`, then runs the cleanup helpers.
   *
   * @param {Array<{sqlFileReference: string, bucket: string, file: string}>} settings - File settings to substitute
   * @param {string} bucketsDir - Local directory that holds the downloaded buckets
   * @param {string} query - SQL query text
   * @returns {string} The prepared query
   */
  static prepareQuery(settings, bucketsDir, query) {
    let prepared = query;

    settings.forEach((setting) => {
      // The cache flag is dropped from the search pattern here; what is left of it in
      // the query is removed by removeCacheSettings below.
      const searchPattern = setting.sqlFileReference.replace(/\?cache=(true|false)/i, '');
      const fileRegex = new RegExp(QueryFinalizerPlugin.prepareFileRegexStr(searchPattern), 'gi');
      const localPath = `${bucketsDir}/${setting.bucket}/${setting.file}`;
      // Function replacer: the path is inserted verbatim, so `$&`, `$1` or `$$` inside a
      // bucket or file name are not expanded as replacement patterns.
      prepared = prepared.replace(fileRegex, () => localPath);
    });
    prepared = removeFileSettingTokens(prepared);
    prepared = removeFileDatePatterns(prepared);
    prepared = removeCacheSettings(prepared);
    prepared = removeDoubleFwdSlash(prepared);

    return prepared;
  }

  /**
   * Escapes a file reference so it can be used as a literal inside a RegExp.
   *
   * Every regex metacharacter is escaped, so glob characters such as `?` and `[...]`
   * and characters such as `(`, `)`, `|`, `^`, `$` and `\` match themselves.
   *
   * @param {string} fileStr - File reference as written in the query
   * @returns {string} RegExp source that matches `fileStr` literally
   */
  static prepareFileRegexStr(fileStr) {
    return fileStr.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
  }
}
@@ -0,0 +1,33 @@
1
+ import { extractFileReferences } from '../../utils/sql-parser/sql-parser.js';
2
+ import { parseFilePath } from '../../utils/path-parser/path-parser.js';
3
+
4
/**
 * Base query plugin: finds the file references in a query and records a file
 * setting for each one on the query context.
 */
class QueryParserPlugin {
  name = 'BasePlugin';

  /**
   * Appends the file settings found in the query to the context's settings.
   *
   * @param {object} context - Query context; `settings`, `endpoint`, `defaultBucket` and `query` are read
   * @returns {object} Copy of the context with the extended `settings` array
   */
  processQuery(context) {
    const { settings, endpoint, defaultBucket, query } = context;
    const discovered = this.getFiles({ endpoint, defaultBucket, query });
    const combined = [...settings, ...discovered];
    return { ...context, settings: combined };
  }

  /**
   * Builds one file setting per file reference extracted from the query.
   *
   * @param {object} params
   * @param {string} params.endpoint - Endpoint used when a reference does not name one
   * @param {string} params.defaultBucket - Bucket used when a reference does not name one
   * @param {string} params.query - SQL query text
   * @returns {Array<object>} File settings, one per extracted reference
   */
  getFiles({ endpoint, defaultBucket, query }) {
    const references = extractFileReferences(query);
    return references.map((reference) => toFileSetting(reference, endpoint, defaultBucket));
  }

  /**
   * No-op file processor: resolves to the file it was given.
   *
   * @param {string} file
   * @returns {Promise<string>} Promise resolving to `file`
   */
  static processFile(file) {
    return Promise.resolve(file);
  }
}
21
+
22
+ function toFileSetting({ raw }, defaultEndpoint, defaultBucket) {
23
+ const parsed = parseFilePath(raw);
24
+ return {
25
+ endpoint: parsed.endpoint ?? defaultEndpoint,
26
+ bucket: parsed.bucket ?? defaultBucket,
27
+ file: parsed.file,
28
+ cache: parsed.cache,
29
+ sqlFileReference: raw,
30
+ };
31
+ }
32
+
33
+ export default QueryParserPlugin;
@@ -0,0 +1,21 @@
1
+ import { S3Client } from '@aws-sdk/client-s3';
2
+ import { IbmIamTokenManager } from './ibm-iam-token-manager.js';
3
+
4
/**
 * Builds an S3 client that authenticates with an IBM IAM bearer token.
 *
 * The client is created with fixed placeholder HMAC credentials, and a middleware
 * registered at the `finalizeRequest` step sets the `Authorization` header to a
 * bearer token obtained from the token manager.
 *
 * @param {object} config - S3 client configuration; its `credentials` are replaced
 * @param {string} apiKey - API key handed to the IAM token manager
 * @returns {S3Client} The configured client
 */
export function buildIbmIamClient(config, apiKey) {
  const tokenManager = new IbmIamTokenManager(apiKey);
  const placeholderCredentials = { accessKeyId: 'ibm-iam', secretAccessKey: 'ibm-iam' };
  const client = new S3Client({ ...config, credentials: placeholderCredentials });

  const middlewareOptions = { step: 'finalizeRequest', priority: 'low', name: 'ibmIamAuth' };
  client.middlewareStack.add(ibmIamMiddleware(tokenManager), middlewareOptions);

  return client;
}
14
+
15
/**
 * Creates a middleware that sets the request's `Authorization` header to
 * `Bearer <token>` before handing the same arguments to the next handler.
 *
 * @param {{getToken: function(): Promise<string>}} tokenManager - Source of the bearer token
 * @returns {function(function): function(object): Promise<*>} Middleware for the client's middleware stack
 */
function ibmIamMiddleware(tokenManager) {
  return (next) => {
    const handler = async (args) => {
      const bearer = await tokenManager.getToken();
      // The header is set on the request object in place; `args` itself is passed on unchanged.
      args.request.headers.Authorization = `Bearer ${bearer}`;
      return next(args);
    };
    return handler;
  };
}