dbt-js 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +325 -0
- package/bin/dbt-js.js +4 -0
- package/package.json +53 -0
- package/src/api.js +257 -0
- package/src/batches.js +120 -0
- package/src/cli.js +175 -0
- package/src/config.js +68 -0
- package/src/dag.js +67 -0
- package/src/db.js +182 -0
- package/src/materialize.js +197 -0
- package/src/project.js +107 -0
- package/src/render.js +62 -0
- package/src/seed.js +68 -0
- package/src/tests.js +49 -0
package/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Hamza Shahzad
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
package/README.md
ADDED
|
@@ -0,0 +1,325 @@
|
|
|
1
|
+
# dbt-js
|
|
2
|
+
|
|
3
|
+
A minimalist dbt-like SQL transformation tool for Postgres, MySQL, SQLite, and DuckDB. Models are plain SQL `SELECT` files; dbt-js compiles them (resolving `ref()` / `source()` / `var()`), builds a dependency DAG, and executes everything inside the database in dependency order. Like dbt, it is transformation-only — it never extracts or moves data; raw data must already be in your database (or, with DuckDB, in files it can read in place).
|
|
4
|
+
|
|
5
|
+
Five dependencies: `pg`, `mysql2`, `better-sqlite3`, `@duckdb/node-api`, and `csv-parse` — the database drivers are loaded lazily, so each backend only pays for its own. Plain ESM JavaScript, no build step.
|
|
6
|
+
|
|
7
|
+
## Install
|
|
8
|
+
|
|
9
|
+
```sh
|
|
10
|
+
npm install -g dbt-js # global CLI: dbt-js <command>
|
|
11
|
+
npx dbt-js debug # or run without installing
|
|
12
|
+
npm install dbt-js # as a library, for embedding (see below)
|
|
13
|
+
```
|
|
14
|
+
|
|
15
|
+
Requires Node.js >= 20.
|
|
16
|
+
|
|
17
|
+
## Quick start
|
|
18
|
+
|
|
19
|
+
The published package ships just the CLI and library; the runnable examples live in the
|
|
20
|
+
[repository](https://github.com/shahzadhamza/dbt-js). Clone it to try the fully self-contained
|
|
21
|
+
DuckDB example (no database server needed):
|
|
22
|
+
|
|
23
|
+
```sh
|
|
24
|
+
git clone https://github.com/shahzadhamza/dbt-js && cd dbt-js
|
|
25
|
+
npm install
|
|
26
|
+
cd example-duckdb
|
|
27
|
+
node ../bin/dbt-js.js debug # check config + connectivity
|
|
28
|
+
node ../bin/dbt-js.js seed # load seeds/*.csv
|
|
29
|
+
node ../bin/dbt-js.js run # build all models in DAG order
|
|
30
|
+
node ../bin/dbt-js.js test # run data tests
|
|
31
|
+
```
|
|
32
|
+
|
|
33
|
+
(With `dbt-js` installed globally, the commands are just `dbt-js debug`, `dbt-js run`, etc.)
|
|
34
|
+
`example/` is the same project targeting a Postgres server instead, `example-mysql/` targets MySQL (a one-line Docker server is in its README), and `example-sqlite/` targets SQLite (also serverless).
|
|
35
|
+
|
|
36
|
+
## Project layout
|
|
37
|
+
|
|
38
|
+
A dbt-js project is a directory containing:
|
|
39
|
+
|
|
40
|
+
```
|
|
41
|
+
dbtjs.config.json # connection, target schema, sources, vars
|
|
42
|
+
models/*.sql # one SELECT per file; filename = model name
|
|
43
|
+
seeds/*.csv # one table per file; filename = table name
|
|
44
|
+
```
|
|
45
|
+
|
|
46
|
+
### dbtjs.config.json
|
|
47
|
+
|
|
48
|
+
```json
|
|
49
|
+
{
|
|
50
|
+
"connection": {
|
|
51
|
+
"host": "localhost",
|
|
52
|
+
"port": 5432,
|
|
53
|
+
"user": "me",
|
|
54
|
+
"password": "${DBTJS_PASSWORD}",
|
|
55
|
+
"database": "mydb"
|
|
56
|
+
},
|
|
57
|
+
"schema": "analytics",
|
|
58
|
+
"sources": { "raw": { "schema": "public" } },
|
|
59
|
+
"vars": { "start": null },
|
|
60
|
+
"seeds": { "columnTypes": { "my_seed": { "joined_on": "date" } } }
|
|
61
|
+
}
|
|
62
|
+
```
|
|
63
|
+
|
|
64
|
+
For MySQL, the same shape with `"type": "mysql"` (`port` defaults to 3306):
|
|
65
|
+
|
|
66
|
+
```json
|
|
67
|
+
{
|
|
68
|
+
"connection": {
|
|
69
|
+
"type": "mysql",
|
|
70
|
+
"host": "localhost",
|
|
71
|
+
"user": "me",
|
|
72
|
+
"password": "${DBTJS_PASSWORD}",
|
|
73
|
+
"database": "mydb"
|
|
74
|
+
},
|
|
75
|
+
"schema": "analytics"
|
|
76
|
+
}
|
|
77
|
+
```
|
|
78
|
+
|
|
79
|
+
For DuckDB and SQLite, the connection is just a file path (the warehouse is an embedded local file):
|
|
80
|
+
|
|
81
|
+
```json
|
|
82
|
+
{
|
|
83
|
+
"connection": { "type": "duckdb", "path": "./warehouse.duckdb" },
|
|
84
|
+
"schema": "analytics"
|
|
85
|
+
}
|
|
86
|
+
```
|
|
87
|
+
|
|
88
|
+
```json
|
|
89
|
+
{
|
|
90
|
+
"connection": { "type": "sqlite", "path": "./warehouse.db" },
|
|
91
|
+
"schema": "analytics"
|
|
92
|
+
}
|
|
93
|
+
```
|
|
94
|
+
|
|
95
|
+
- `connection.type` is `"postgres"` (default), `"mysql"`, `"sqlite"`, or `"duckdb"`.
|
|
96
|
+
- `${NAME}` in connection values is replaced from the environment (error if unset). Omit `password` entirely to let `pg` use `PGPASSWORD`.
|
|
97
|
+
- `schema` is where all models and seeds are created (`CREATE SCHEMA IF NOT EXISTS` runs automatically).
|
|
98
|
+
- `sources` maps a source name to a schema, used by `{{ source('name', 'table') }}`.
|
|
99
|
+
- `vars` are defaults, overridable per-invocation with `--vars '{"start": "2026-06-01"}'`.
|
|
100
|
+
- `seeds.columnTypes` overrides inferred CSV column types (the escape hatch for dates/timestamps).
|
|
101
|
+
|
|
102
|
+
## Models
|
|
103
|
+
|
|
104
|
+
A model is a single `SELECT`. Configuration lives in one leading block comment with a JSON body:
|
|
105
|
+
|
|
106
|
+
```sql
|
|
107
|
+
/* config: {
|
|
108
|
+
"materialized": "incremental",
|
|
109
|
+
"strategy": "delete+insert",
|
|
110
|
+
"unique_key": "day",
|
|
111
|
+
"tests": { "day": ["not_null", "unique"] }
|
|
112
|
+
} */
|
|
113
|
+
select ...
|
|
114
|
+
```
|
|
115
|
+
|
|
116
|
+
No config comment means `{ "materialized": "view" }`.
|
|
117
|
+
|
|
118
|
+
### Templating
|
|
119
|
+
|
|
120
|
+
| Expression | Becomes |
|
|
121
|
+
|---|---|
|
|
122
|
+
| `{{ ref('other_model') }}` | `"schema"."other_model"` — and declares a DAG dependency |
|
|
123
|
+
| `{{ this }}` | the current model's own table (for incremental high-water marks) |
|
|
124
|
+
| `{{ source('raw', 'orders') }}` | `"public"."orders"` (schema from `sources` config) |
|
|
125
|
+
| `{{ var('start') }}` / `{{ var('x', 0) }}` | the var's value, or the default; error if neither. Inserted verbatim — quote it yourself in SQL |
|
|
126
|
+
| `{{ batch_start }}` / `{{ batch_end }}` | the current batch window as `YYYY-MM-DD HH:MM:SS` (microbatch models only). Inserted verbatim — quote it yourself |
|
|
127
|
+
| `{% if is_incremental() %} ... {% endif %}` | body included only on incremental runs (table exists, not `--full-refresh`) |
|
|
128
|
+
|
|
129
|
+
That's the whole template language. Anything else inside `{{ }}` / `{% %}` is a compile error.
|
|
130
|
+
|
|
131
|
+
### Materializations
|
|
132
|
+
|
|
133
|
+
- **view** (default): `CREATE OR REPLACE VIEW`
|
|
134
|
+
- **table**: transactional `DROP TABLE ... CASCADE; CREATE TABLE ... AS SELECT` (atomic to readers; CASCADE-dropped downstream views are rebuilt later in the same run — for partial runs use `--select model+`)
|
|
135
|
+
- **incremental**: first run (or `--full-refresh`) builds like a table; after that only the rows your SELECT returns are applied, via a strategy:
|
|
136
|
+
- `append` — plain `INSERT INTO ... SELECT` (immutable event data)
|
|
137
|
+
- `delete+insert` — requires `unique_key` (string or array); deletes matching keys then inserts, in one transaction (idempotent re-runs)
|
|
138
|
+
- `microbatch` — splits the event-time range into aligned windows and replaces each window in its own transaction (see below)
|
|
139
|
+
|
|
140
|
+
### Hooks
|
|
141
|
+
|
|
142
|
+
`pre_hook` / `post_hook` run extra SQL around a model's build — grants, indexes, `ANALYZE`, audit rows. Each is a string or array of strings, rendered with the same template language as the model body (everything except `batch_start` / `batch_end`):
|
|
143
|
+
|
|
144
|
+
```sql
|
|
145
|
+
/* config: {
|
|
146
|
+
"materialized": "table",
|
|
147
|
+
"post_hook": [
|
|
148
|
+
"create index if not exists idx_daily_revenue_day on {{ this }} (day)",
|
|
149
|
+
"grant select on {{ this }} to reporting"
|
|
150
|
+
]
|
|
151
|
+
} */
|
|
152
|
+
select ...
|
|
153
|
+
```
|
|
154
|
+
|
|
155
|
+
- Order: all pre-hooks → materialization → all post-hooks, each hook as its own statement.
|
|
156
|
+
- One deliberate divergence from dbt: hooks run **outside** the materialization transaction, so they can use statements Postgres forbids inside one (`VACUUM`, `CREATE INDEX CONCURRENTLY`). A failing pre-hook aborts the model before any build; a failing post-hook marks the model FAIL but the built relation remains — fix the hook and re-run.
|
|
157
|
+
- Microbatch models run hooks once per model (pre-hooks before the first batch, post-hooks after the last), not per batch; post-hooks are skipped when any batch failed.
|
|
158
|
+
- `{{ ref('x') }}` inside a hook declares a DAG dependency, same as in the body.
|
|
159
|
+
|
|
160
|
+
### Incremental pattern + backfill
|
|
161
|
+
|
|
162
|
+
```sql
|
|
163
|
+
select date_trunc('day', created_at)::date as day, count(*) as orders
|
|
164
|
+
from {{ ref('orders_enriched') }}
|
|
165
|
+
{% if is_incremental() %}
|
|
166
|
+
where created_at >= coalesce(
|
|
167
|
+
nullif('{{ var("start", "") }}', '')::timestamptz,
|
|
168
|
+
(select max(day) from {{ this }})::timestamptz)
|
|
169
|
+
{% endif %}
|
|
170
|
+
group by 1
|
|
171
|
+
```
|
|
172
|
+
|
|
173
|
+
- Normal run: processes from the table's own high-water mark (`max(day)`).
|
|
174
|
+
- Backfill: `dbt-js run --select daily_revenue --vars '{"start": "2026-01-01"}'` re-derives from that date; `delete+insert` makes it idempotent.
|
|
175
|
+
- Full rebuild: `dbt-js run --select daily_revenue --full-refresh`.
|
|
176
|
+
|
|
177
|
+
### Microbatch (dbt 1.9-style)
|
|
178
|
+
|
|
179
|
+
For batched, retryable backfills, use `strategy: "microbatch"`. dbt-js splits the time range into `batch_size` windows and runs each as its own transaction: `DELETE` the target rows whose `event_time` falls in the window, then `INSERT` the batch's rows. A failed batch is reported and the rest keep running.
|
|
180
|
+
|
|
181
|
+
```sql
|
|
182
|
+
/* config: {
|
|
183
|
+
"materialized": "incremental",
|
|
184
|
+
"strategy": "microbatch",
|
|
185
|
+
"event_time": "day",
|
|
186
|
+
"begin": "2026-01-01",
|
|
187
|
+
"batch_size": "day",
|
|
188
|
+
"lookback": 1
|
|
189
|
+
} */
|
|
190
|
+
select date_trunc('day', created_at)::date as day, count(*) as orders
|
|
191
|
+
from {{ ref('orders_enriched') }}
|
|
192
|
+
where created_at >= '{{ batch_start }}'::timestamptz
|
|
193
|
+
and created_at < '{{ batch_end }}'::timestamptz
|
|
194
|
+
group by 1
|
|
195
|
+
```
|
|
196
|
+
|
|
197
|
+
- `event_time` — column **of this model's output** bounding each batch (used by the engine's per-window DELETE).
|
|
198
|
+
- `begin` — start of history; first run and `--full-refresh` build every batch from here.
|
|
199
|
+
- `batch_size` — `hour` | `day` | `month` | `year`. Boundaries align to the model's `timezone` (default UTC).
|
|
200
|
+
- `lookback` (default 1) — a normal run reprocesses the current batch plus this many previous ones (no high-water mark, same as dbt).
|
|
201
|
+
- Backfill: `dbt-js run --select my_model --event-time-start 2026-06-02 --event-time-end 2026-06-04` rewrites exactly those windows (whole batches; end is exclusive). Idempotent by construction.
|
|
202
|
+
- No `is_incremental()` needed — the `batch_start`/`batch_end` filter applies on every run, including the first.
|
|
203
|
+
- If batches fail, the model exits FAIL listing the failed windows and the exact `--event-time-start/--event-time-end` retry command; other batches' work is kept.
|
|
204
|
+
|
|
205
|
+
One deliberate divergence from dbt: dbt auto-filters upstream `ref()`s by their declared `event_time`; dbt-js does no hidden query rewriting — you filter your input yourself with `{{ batch_start }}` / `{{ batch_end }}`.
|
|
206
|
+
|
|
207
|
+
### Timezone
|
|
208
|
+
|
|
209
|
+
Any model may set `"timezone"` in its config (a string IANA zone, default `"UTC"`):
|
|
210
|
+
|
|
211
|
+
- For microbatch models it aligns each window to that zone's wall-clock. `{{ batch_start }}` / `{{ batch_end }}` are emitted as naive `YYYY-MM-DD HH:MM:SS` **wall-clock strings in that zone**, so they compare directly against a locally-stored `event_time` column. A `"day"` batch in `"America/New_York"` therefore spans local midnight-to-midnight, not UTC.
|
|
212
|
+
- `{{ timezone }}` is available in **any** model's SQL (raw substitution — quote it yourself, e.g. `created_at at time zone '{{ timezone }}'`).
|
|
213
|
+
- `begin`, `--event-time-start`, and `--event-time-end` given as naive strings are interpreted as wall-clock in the model's `timezone`; strings with an explicit `Z`/offset stay absolute.
|
|
214
|
+
- DST caveat: with `batch_size: "hour"` in a DST zone the spring-forward/fall-back hour is irregular — prefer UTC for hour-grain, or day+ grain for zoned models.
|
|
215
|
+
|
|
216
|
+
## Tests
|
|
217
|
+
|
|
218
|
+
Declared per column in the model's config. Each compiles to a query returning violating rows; any row fails the test (exit code 1, with up to 10 sample rows printed).
|
|
219
|
+
|
|
220
|
+
- `"not_null"` — rows where the column is NULL
|
|
221
|
+
- `"unique"` — non-NULL values appearing more than once
|
|
222
|
+
- `{ "accepted_values": ["a", "b"] }` — non-NULL values outside the list
|
|
223
|
+
|
|
224
|
+
## Seeds
|
|
225
|
+
|
|
226
|
+
`dbt-js seed` loads each `seeds/*.csv` as a table (drop + create + insert, transactional). Column types are inferred (`integer`/`bigint`/`numeric`/`boolean`, else `text`; empty string → NULL); override per column via `seeds.columnTypes`. Models can `{{ ref('seed_name') }}` seeds.
|
|
227
|
+
|
|
228
|
+
## CLI
|
|
229
|
+
|
|
230
|
+
```
|
|
231
|
+
dbt-js run [--select SPEC] [--full-refresh] [--vars JSON]
|
|
232
|
+
[--event-time-start TS] [--event-time-end TS] # microbatch backfill window
|
|
233
|
+
dbt-js test [--select SPEC] [--vars JSON]
|
|
234
|
+
dbt-js seed [--select SPEC]
|
|
235
|
+
dbt-js compile [--select SPEC] [--vars JSON] # print compiled SQL, no DB needed
|
|
236
|
+
dbt-js ls # nodes in execution order
|
|
237
|
+
dbt-js debug # config + connectivity check
|
|
238
|
+
```
|
|
239
|
+
|
|
240
|
+
`--select` accepts comma-separated names; `+name` adds everything upstream, `name+` everything downstream (e.g. `--select orders_enriched+` rebuilds it and its dependents).
|
|
241
|
+
|
|
242
|
+
On failure, downstream models are skipped and reported; exit code is 1 if anything failed.
|
|
243
|
+
|
|
244
|
+
## Embedding in a Node.js app
|
|
245
|
+
|
|
246
|
+
The CLI is a thin wrapper over a programmatic API — `example-embed/` is a runnable ~70-line server using it. Install dbt-js as a dependency:
|
|
247
|
+
|
|
248
|
+
```sh
|
|
249
|
+
npm install dbt-js
|
|
250
|
+
```
|
|
251
|
+
|
|
252
|
+
```js
|
|
253
|
+
import { run, test, seed, compile, ls, debug } from 'dbt-js';
|
|
254
|
+
|
|
255
|
+
const result = await run({
|
|
256
|
+
projectDir: './analytics', // dir containing dbtjs.config.json — always pass this
|
|
257
|
+
select: 'daily_revenue+', // optional, same syntax as --select
|
|
258
|
+
vars: { start: '2026-06-01' }, // optional, plain object (not a JSON string)
|
|
259
|
+
fullRefresh: false,
|
|
260
|
+
onEvent: (e) => logger.info(e), // optional progress stream; omit for silence
|
|
261
|
+
});
|
|
262
|
+
// result = { ok, models: [{ name, status: 'ok'|'fail'|'skip', action, rowCount,
|
|
263
|
+
// batchCount, failedBatches, durationMs, error }] }
|
|
264
|
+
```
|
|
265
|
+
|
|
266
|
+
The project can also be supplied inline instead of from files — handy when connection settings live in your app's config system or model SQL is generated:
|
|
267
|
+
|
|
268
|
+
```js
|
|
269
|
+
await run({
|
|
270
|
+
config: { // contents of dbtjs.config.json (file not read)
|
|
271
|
+
connection: { host: 'db', port: 5432, user: 'analytics', password: process.env.PW, database: 'warehouse' },
|
|
272
|
+
schema: 'analytics',
|
|
273
|
+
sources: { raw: { schema: 'public' } },
|
|
274
|
+
},
|
|
275
|
+
models: { // replaces models/*.sql — same format, config comment included
|
|
276
|
+
stg_orders: "select * from {{ source('raw', 'orders') }} where deleted = false",
|
|
277
|
+
order_counts: "/* config: { \"materialized\": \"table\" } */ select count(*) as n from {{ ref('stg_orders') }}",
|
|
278
|
+
},
|
|
279
|
+
});
|
|
280
|
+
```
|
|
281
|
+
|
|
282
|
+
With both given, `projectDir` is optional — it then only anchors relative DuckDB paths and locates `seeds/` (file seeds remain `ref()`-able from inline models). Inline `config` goes through the same validation and `${ENV}` interpolation as the file; your object is not mutated.
|
|
283
|
+
|
|
284
|
+
- `run` also takes `eventTimeStart` / `eventTimeEnd` for microbatch backfills. `test` → `{ ok, tests: [{ id, model, pass, violations, sample }] }`; `seed` → `{ ok, seeds: [...] }`; `compile` → `[{ name, materialized, sql, preHookSql, postHookSql }]` (no DB needed); `ls` → `[{ name, kind, deps }]`; `debug` → connectivity info.
|
|
285
|
+
- Config or project errors **throw**; model/test failures come back as `ok: false` (mirrors the CLI's exit code 1).
|
|
286
|
+
- Every call opens its own connection and closes it before returning — nothing to pool.
|
|
287
|
+
- **Serialize runs yourself** (a one-promise queue is enough — see `example-embed/server.js`): DuckDB allows a single writer per file, so a scheduled refresh and an HTTP-triggered run must not overlap.
|
|
288
|
+
- Relative paths are anchored to `projectDir`, not your app's cwd: the DuckDB `connection.path` is resolved against it, and `read_csv('data/...')`-style paths in model SQL resolve via DuckDB's `file_search_path`.
|
|
289
|
+
|
|
290
|
+
## DuckDB notes
|
|
291
|
+
|
|
292
|
+
- `sources` resolve to schemas inside the same `.duckdb` file, exactly like Postgres schemas.
|
|
293
|
+
- Models can call DuckDB-native readers directly — `from read_csv('data/orders.csv')` or `read_parquet('...')` — no template syntax needed; raw data files never pass through dbt-js.
|
|
294
|
+
- DuckDB doesn't report row counts for full table builds (CTAS), so those log lines omit the count. Incremental and seed counts are reported normally.
|
|
295
|
+
- `:memory:` is a valid path but pointless for a CLI — each invocation is a separate process, so nothing would persist between `seed` and `run`.
|
|
296
|
+
- Attaching external databases (DuckDB `ATTACH`) is not supported in v1.
|
|
297
|
+
- One Postgres-specific change: pre-existing **materialized views** squatting on a model's name are no longer auto-dropped (relation detection now uses `information_schema`, which can't see them); you'd get a clear Postgres error at build time instead. dbt-js itself never creates materialized views.
|
|
298
|
+
|
|
299
|
+
## MySQL notes
|
|
300
|
+
|
|
301
|
+
Requires MySQL 8.0+ (`CREATE TABLE ... AS SELECT` under GTID consistency additionally needs 8.0.21+, and temp-table-in-transaction is disallowed when it's enforced).
|
|
302
|
+
|
|
303
|
+
- dbt-js enables `ANSI_QUOTES` for its session, so double quotes are **identifier** quotes exactly as on Postgres/DuckDB — write string literals with single quotes in model SQL (the habit you already have from Postgres).
|
|
304
|
+
- `schema` maps to a MySQL **database**: `CREATE SCHEMA IF NOT EXISTS` is `CREATE DATABASE`, so the connecting user needs the server-wide CREATE privilege (or pre-create the schema and grant on it — see `example-mysql/README.md`).
|
|
305
|
+
- MySQL DDL implicitly commits, so `table` and `--full-refresh` rebuilds (DROP + CREATE TABLE AS) are **not** atomic to readers the way they are on Postgres/DuckDB. `delete+insert` and microbatch window replacement remain fully transactional.
|
|
306
|
+
- No `CREATE INDEX IF NOT EXISTS` — use an idempotent post-hook like `analyze table {{ this }}`, or guard index creation yourself.
|
|
307
|
+
- Seed type inference maps `numeric` to `decimal(38,10)` (bare `NUMERIC` is `DECIMAL(10,0)` on MySQL and would round); `boolean` becomes `TINYINT(1)` with `true/false` loaded as `1/0`. Override per column via `seeds.columnTypes` as usual.
|
|
308
|
+
- Microbatch boundaries are computed in UTC and compared as `DATETIME` literals — prefer a `DATETIME` event-time column, or set the session time zone to UTC via mysql2's `timezone` connection option.
|
|
309
|
+
- Rows come back with `dateStrings: true` (dates as strings, JSON-safe, matching the DuckDB adapter); set `dateStrings: false` in the connection object to get JS `Date`s from the `query` API.
|
|
310
|
+
|
|
311
|
+
## SQLite notes
|
|
312
|
+
|
|
313
|
+
Driver: `better-sqlite3` (synchronous — a long-running statement blocks the embedding app's event loop; irrelevant for CLI use).
|
|
314
|
+
|
|
315
|
+
- `schema` maps to a **separate database file** `<schema>.db` next to `connection.path`, ATTACHed for the session (created automatically when writable). `"schema": "main"` keeps everything in the single main file — see `example-sqlite/README.md`.
|
|
316
|
+
- SQLite DDL is transactional, so **all** rebuilds — including `table` and `--full-refresh` — are atomic, like Postgres/DuckDB. One caveat: switching `journal_mode` to WAL in a hook removes crash atomicity for transactions spanning the main and attached files.
|
|
317
|
+
- There is no `DROP ... CASCADE`: dropping a table leaves dependent views dangling (they error when next queried) instead of dropping them.
|
|
318
|
+
- Type affinity gotchas: never `CAST(x AS DATETIME)` — `DATETIME` gets NUMERIC affinity, truncating `'2026-06-03'` to `2026`. Store timestamps as `'YYYY-MM-DD HH:MM:SS'` text; lexicographic comparison is chronological, and microbatch window boundaries are normalized with `datetime()` so day-granularity event-time columns work too.
|
|
319
|
+
- Seed `boolean` columns load as `1/0` (the text `'true'` would be falsy in `CASE WHEN`); `numeric` needs no special mapping (affinity stores decimals losslessly).
|
|
320
|
+
- The read-only `query` API opens the files with SQLite's readonly flag — writes fail with `SQLITE_READONLY`, and the database files must already exist.
|
|
321
|
+
- INTEGER values beyond 2^53 come back as imprecise JS numbers from the `query` API.
|
|
322
|
+
|
|
323
|
+
## License
|
|
324
|
+
|
|
325
|
+
MIT
|
package/bin/dbt-js.js
ADDED
package/package.json
ADDED
|
@@ -0,0 +1,53 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "dbt-js",
|
|
3
|
+
"version": "0.1.1",
|
|
4
|
+
"description": "A minimalist, dbt-like SQL transformation tool for Node.js — compile SQL models, build a dependency DAG, and materialize them on any supported SQL database.",
|
|
5
|
+
"type": "module",
|
|
6
|
+
"main": "./src/api.js",
|
|
7
|
+
"exports": {
|
|
8
|
+
".": "./src/api.js",
|
|
9
|
+
"./package.json": "./package.json"
|
|
10
|
+
},
|
|
11
|
+
"bin": {
|
|
12
|
+
"dbt-js": "bin/dbt-js.js"
|
|
13
|
+
},
|
|
14
|
+
"license": "MIT",
|
|
15
|
+
"author": "Shahzad Hamza",
|
|
16
|
+
"repository": {
|
|
17
|
+
"type": "git",
|
|
18
|
+
"url": "git+https://github.com/shahzadhamza/dbt-js.git"
|
|
19
|
+
},
|
|
20
|
+
"homepage": "https://github.com/shahzadhamza/dbt-js#readme",
|
|
21
|
+
"bugs": {
|
|
22
|
+
"url": "https://github.com/shahzadhamza/dbt-js/issues"
|
|
23
|
+
},
|
|
24
|
+
"keywords": [
|
|
25
|
+
"dbt",
|
|
26
|
+
"sql",
|
|
27
|
+
"postgres",
|
|
28
|
+
"mysql",
|
|
29
|
+
"sqlite",
|
|
30
|
+
"duckdb",
|
|
31
|
+
"etl",
|
|
32
|
+
"elt",
|
|
33
|
+
"data-transformation",
|
|
34
|
+
"dag",
|
|
35
|
+
"analytics"
|
|
36
|
+
],
|
|
37
|
+
"files": [
|
|
38
|
+
"bin",
|
|
39
|
+
"src",
|
|
40
|
+
"README.md",
|
|
41
|
+
"LICENSE"
|
|
42
|
+
],
|
|
43
|
+
"engines": {
|
|
44
|
+
"node": ">=20"
|
|
45
|
+
},
|
|
46
|
+
"dependencies": {
|
|
47
|
+
"@duckdb/node-api": "1.5.3-r.3",
|
|
48
|
+
"better-sqlite3": "~12.9.0",
|
|
49
|
+
"csv-parse": "^5.6.0",
|
|
50
|
+
"mysql2": "^3.11.0",
|
|
51
|
+
"pg": "^8.13.0"
|
|
52
|
+
}
|
|
53
|
+
}
|
package/src/api.js
ADDED
|
@@ -0,0 +1,257 @@
|
|
|
1
|
+
// Programmatic API — what `import 'dbt-js'` gives you. Every function takes a
|
|
2
|
+
// projectDir (default process.cwd()), opens its own connection, and closes it
|
|
3
|
+
// before returning. Loading/config errors throw; model and test failures are
|
|
4
|
+
// returned as ok: false. Nothing here writes to the console or exits the
|
|
5
|
+
// process — pass onEvent to observe progress.
|
|
6
|
+
//
|
|
7
|
+
// Instead of project files you can pass the project inline:
|
|
8
|
+
// config — the contents of dbtjs.config.json as an object (file not read)
|
|
9
|
+
// models — a { name: rawSql } map replacing models/*.sql (same format,
|
|
10
|
+
// /* config: {...} */ comment included)
|
|
11
|
+
// projectDir then only anchors relative duckdb paths and locates seeds/.
|
|
12
|
+
|
|
13
|
+
import { loadConfig, validateConfig } from './config.js';
|
|
14
|
+
import { loadProject } from './project.js';
|
|
15
|
+
import { buildDag, expandSelection } from './dag.js';
|
|
16
|
+
import { connect, ensureSchema } from './db.js';
|
|
17
|
+
import { runModel } from './materialize.js';
|
|
18
|
+
import { buildTests, runTest } from './tests.js';
|
|
19
|
+
import { loadSeed } from './seed.js';
|
|
20
|
+
import { render } from './render.js';
|
|
21
|
+
import { computeBatches } from './batches.js';
|
|
22
|
+
|
|
23
|
+
/**
 * Resolve everything a command needs before touching the database: the
 * effective config, the project's models and seeds, and the dependency DAG.
 *
 * @param {object} [options]
 * @param {string} [options.projectDir] project directory; defaults to process.cwd()
 * @param {object} [options.vars] per-invocation vars, layered over the config's vars
 * @param {object} [options.config] inline config object, used instead of loading from projectDir
 * @param {object} [options.models] inline models, forwarded to loadProject
 * @returns {{ cfg, models, seeds, nodes, order, projectDir }}
 */
function loadAll(options = {}) {
  const { projectDir = process.cwd(), vars, config, models: inlineModels } = options;

  let cfg;
  if (config) {
    // Validate a deep copy: validation mutates its input (defaults, env
    // interpolation, path resolution) and the caller's object must stay intact.
    const ownCopy = structuredClone(config);
    cfg = validateConfig(ownCopy, projectDir);
  } else {
    cfg = loadConfig(projectDir);
  }

  if (vars) {
    // Invocation vars win over the config's own vars.
    cfg.vars = { ...cfg.vars, ...vars };
  }

  const project = loadProject(projectDir, { models: inlineModels });
  const { models, seeds } = project;
  const dag = buildDag(models, seeds);
  const { nodes, order } = dag;

  return { cfg, models, seeds, nodes, order, projectDir };
}
|
|
32
|
+
|
|
33
|
+
/**
 * Open a connection, hand it to `fn`, and always close it afterwards —
 * whether `fn` resolves or throws.
 *
 * @param {object} cfg config; `cfg.connection` and `cfg.schema` are passed to connect
 * @param {string} projectDir forwarded to connect alongside the schema
 * @param {(client: object) => Promise<*>} fn work to perform with the open client
 * @returns {Promise<*>} whatever `fn` resolves to
 */
async function withClient(cfg, projectDir, fn) {
  const client = await connect(cfg.connection, { projectDir, schema: cfg.schema });
  try {
    const outcome = await fn(client);
    return outcome;
  } finally {
    // Runs on success and on failure alike, so the connection never leaks.
    await client.end();
  }
}
|
|
41
|
+
|
|
42
|
+
// → { ok, models: [{ name, status: 'ok'|'fail'|'skip', materialized, action?,
|
|
43
|
+
// rowCount?, batchCount?, failedBatches?, durationMs?, error? }] }
|
|
44
|
+
/**
 * Build the selected models in DAG order over a single connection.
 *
 * A model whose dependency failed or was skipped is itself skipped. Each
 * finished model record is appended to the result and passed to `onEvent`;
 * per-batch progress is passed to `onEvent` as `{ type: 'batch', model, ... }`.
 *
 * @param {object} [opts] loadAll options plus `select`, `fullRefresh`,
 *   `eventTimeStart`, `eventTimeEnd` and `onEvent`
 * @returns {Promise<{ ok: boolean, models: object[] }>} `ok` is true only when
 *   every model record has status 'ok'
 * @throws {Error} when eventTimeEnd is given without eventTimeStart, or when the
 *   selection contains no models
 */
export async function run(opts = {}) {
  const { select, fullRefresh = false, eventTimeStart, eventTimeEnd, onEvent } = opts;
  if (eventTimeEnd && !eventTimeStart) {
    throw new Error('eventTimeEnd requires eventTimeStart');
  }

  const { cfg, nodes, order, projectDir } = loadAll(opts);
  const isModel = (nodeName) => nodes.get(nodeName).type === 'model';
  const selected = expandSelection(select, nodes, order).filter(isModel);
  if (selected.length === 0) {
    throw new Error('Nothing to run for this selection');
  }

  return withClient(cfg, projectDir, async (client) => {
    await ensureSchema(client, cfg.schema);

    const records = [];
    // Names that failed or were skipped — either one blocks downstream models.
    const blocked = new Set();
    const total = selected.length;

    const report = (rec) => {
      records.push(rec);
      onEvent?.(rec);
    };

    let position = 0;
    for (const name of selected) {
      position += 1;
      const node = nodes.get(name);
      const base = {
        type: 'model',
        name,
        materialized: node.config.materialized,
        index: position,
        total,
      };

      const upstreamBroken = node.deps.some((dep) => blocked.has(dep));
      if (upstreamBroken) {
        blocked.add(name);
        report({ ...base, status: 'skip' });
        continue;
      }

      const startedAt = Date.now();
      let rec;
      try {
        const result = await runModel(client, node, cfg, {
          fullRefresh,
          vars: cfg.vars,
          eventTimeStart,
          eventTimeEnd,
          onBatch: (b) => onEvent?.({ type: 'batch', model: name, ...b }),
        });
        const failedBatches = result.failedBatches ?? [];
        const anyBatchFailed = Boolean(failedBatches.length);
        rec = {
          ...base,
          status: anyBatchFailed ? 'fail' : 'ok',
          action: result.action,
          rowCount: result.rowCount,
          batchCount: result.batchCount,
          failedBatches,
          durationMs: Date.now() - startedAt,
        };
        if (anyBatchFailed) {
          rec.error = `${failedBatches.length} of ${result.batchCount} batches failed`;
          blocked.add(name);
        }
      } catch (e) {
        // A thrown build error fails this model only; the loop keeps going.
        rec = { ...base, status: 'fail', error: e.message, durationMs: Date.now() - startedAt };
        blocked.add(name);
      }
      report(rec);
    }

    const ok = records.every((m) => m.status === 'ok');
    return { ok, models: records };
  });
}
|
|
107
|
+
|
|
108
|
+
// → { ok, tests: [{ id, model, pass, violations, sample }] }
|
|
109
|
+
/**
 * Run the data tests declared on the selected models.
 *
 * Tests are built before any connection is opened; when there are none the
 * function returns immediately without connecting.
 *
 * @param {object} [opts] loadAll options plus `select` and `onEvent`
 * @returns {Promise<{ ok: boolean, tests: object[] }>} one record per test
 *   (`type`, `id`, `model`, `pass`, `violations`, `sample`); `ok` is true when all pass
 */
export async function test(opts = {}) {
  const { select, onEvent } = opts;
  const { cfg, nodes, order, projectDir } = loadAll(opts);

  // Walk `order` (not the selection) so models keep their execution order.
  const chosen = new Set(expandSelection(select, nodes, order));
  const models = [];
  for (const nodeName of order) {
    if (!chosen.has(nodeName)) continue;
    const node = nodes.get(nodeName);
    if (node.type === 'model') models.push(node);
  }

  const tests = buildTests(models, cfg.schema);
  if (!tests.length) {
    return { ok: true, tests: [] };
  }

  return withClient(cfg, projectDir, async (client) => {
    const results = [];
    for (const t of tests) {
      const outcome = await runTest(client, t);
      const rec = {
        type: 'test',
        id: t.id,
        model: t.model,
        pass: outcome.pass,
        violations: outcome.violations ?? 0,
        sample: outcome.sample ?? [],
      };
      results.push(rec);
      onEvent?.(rec);
    }
    const ok = results.every((r) => r.pass);
    return { ok, tests: results };
  });
}
|
|
137
|
+
|
|
138
|
+
// → { ok: true, seeds: [{ name, rowCount, durationMs }] } — a failing seed throws
|
|
139
|
+
/**
 * Load the selected seeds one after another over a single connection.
 *
 * `select` is a comma-separated list of seed names (each trimmed); when it is
 * not given, every seed is loaded. Errors from loadSeed are not caught here,
 * so a failing seed rejects the call.
 *
 * @param {object} [opts] loadAll options plus `select` and `onEvent`
 * @returns {Promise<{ ok: true, seeds: object[] }>} one record per seed
 *   (`type`, `name`, `index`, `total`, `rowCount`, `durationMs`)
 * @throws {Error} when no seed matches the selection
 */
export async function seed(opts = {}) {
  const { select, onEvent } = opts;
  const { cfg, seeds, projectDir } = loadAll(opts);

  let selected = seeds;
  if (select) {
    const wanted = new Set(
      String(select)
        .split(',')
        .map((part) => part.trim())
    );
    selected = seeds.filter((s) => wanted.has(s.name));
  }
  if (!selected.length) {
    throw new Error('No seeds match this selection');
  }

  return withClient(cfg, projectDir, async (client) => {
    await ensureSchema(client, cfg.schema);

    const results = [];
    for (const [i, s] of selected.entries()) {
      const startedAt = Date.now();
      const loaded = await loadSeed(client, s, cfg);
      const rec = {
        type: 'seed',
        name: s.name,
        index: i + 1,
        total: selected.length,
        rowCount: loaded.rowCount,
        durationMs: Date.now() - startedAt,
      };
      results.push(rec);
      onEvent?.(rec);
    }
    return { ok: true, seeds: results };
  });
}
|
|
166
|
+
|
|
167
|
+
// → [{ name, materialized, sql, preHookSql, postHookSql }] — no DB connection needed
// Renders the raw SQL and the pre/post hooks of every selected model.
export async function compile(opts = {}) {
  const { select } = opts;
  const { cfg, nodes, order } = loadAll(opts);

  const modelNames = expandSelection(select, nodes, order).filter(
    (candidate) => nodes.get(candidate).type === 'model'
  );

  const compiled = [];
  for (const name of modelNames) {
    const node = nodes.get(name);
    const { config } = node;

    // Microbatch bodies need a batch window: render the current normal-run
    // window as one span so the output is runnable SQL.
    let windowCtx = {};
    if (config.strategy === 'microbatch') {
      const batches = computeBatches({
        begin: config.begin,
        batchSize: config.batch_size,
        lookback: config.lookback,
        firstBuild: false,
        timezone: config.timezone,
      });
      // NOTE(review): assumes computeBatches returns at least one batch — confirm
      const firstBatch = batches[0];
      const lastBatch = batches[batches.length - 1];
      windowCtx = { batchStart: firstBatch.start, batchEnd: lastBatch.end };
    }

    const ctx = {
      name,
      schema: cfg.schema,
      vars: cfg.vars,
      isIncremental: false, // compile is offline; run decides this against the live DB
      sources: cfg.sources,
      timezone: config.timezone,
    };

    const body = render(node.rawSql, { ...ctx, ...windowCtx });
    // hooks render without batch context — batch_start/batch_end are body-only
    const preHookSql = config.pre_hook.map((hook) => render(hook, ctx).sql);
    const postHookSql = config.post_hook.map((hook) => render(hook, ctx).sql);

    compiled.push({
      name,
      materialized: config.materialized,
      sql: body.sql,
      preHookSql,
      postHookSql,
    });
  }
  return compiled;
}
|
|
203
|
+
|
|
204
|
+
// → [{ name, kind, deps }] in execution order — no DB connection needed
// `kind` is 'seed' for seed nodes and the configured materialization otherwise.
export async function ls(opts = {}) {
  const { nodes, order } = loadAll(opts);

  const listing = [];
  for (const name of order) {
    const node = nodes.get(name);
    let kind;
    if (node.type === 'seed') {
      kind = 'seed';
    } else {
      kind = node.config.materialized;
    }
    listing.push({ name, kind, deps: node.deps });
  }
  return listing;
}
|
|
216
|
+
|
|
217
|
+
// → { rows, rowCount } — one arbitrary statement against the project's warehouse.
// Bypasses loadAll so it works on projects with zero models. readOnly (default)
// opens DuckDB with access_mode READ_ONLY / sets the Postgres session read-only.
//
// opts.sql        required, non-blank string
// opts.params     forwarded unchanged to client.query
// opts.readOnly   defaults to true
// opts.projectDir defaults to process.cwd()
// opts.config     optional inline config; it is cloned before validation so the
//                 caller's object is never handed to validateConfig directly
//
// Throws 'sql is required' for a missing/blank statement. The client is always
// closed. If both the statement and the close fail, the statement's error is
// the one that propagates; a close failure after a successful statement still
// rejects the call.
export async function query(opts = {}) {
  const { sql, params, readOnly = true, projectDir = process.cwd(), config } = opts;
  if (typeof sql !== 'string' || !sql.trim()) throw new Error('sql is required');
  const cfg = config
    ? validateConfig(structuredClone(config), projectDir)
    : loadConfig(projectDir);
  const client = await connect(cfg.connection, { projectDir, readOnly, schema: cfg.schema });
  let queryFailed = false;
  try {
    return await client.query(sql, params);
  } catch (err) {
    queryFailed = true;
    throw err;
  } finally {
    try {
      await client.end();
    } catch (closeErr) {
      // A throw from `finally` would replace the pending query error and hide
      // the real cause, so only surface the close error when the query succeeded.
      if (!queryFailed) throw closeErr;
    }
  }
}
|
|
233
|
+
|
|
234
|
+
// → { schema, modelCount, seedCount, target, database, version } — connectivity check
// `target` is a printable description of the configured connection; `database`
// and `version` come from a single probe query against the warehouse.
export async function debug(opts = {}) {
  const { cfg, models, seeds, projectDir } = loadAll(opts);

  // duckdb/sqlite are described by their path, everything else by host/port/database/user
  let target;
  if (['duckdb', 'sqlite'].includes(cfg.connection.type)) {
    target = `${cfg.connection.type} ${cfg.connection.path}`;
  } else {
    target = `${cfg.connection.host}:${cfg.connection.port}/${cfg.connection.database} as ${cfg.connection.user}`;
  }

  return withClient(cfg, projectDir, async (client) => {
    let probeSql;
    switch (cfg.connection.type) {
      case 'mysql':
        probeSql = 'SELECT DATABASE() AS db, VERSION() AS version';
        break;
      case 'sqlite':
        // this probe returns no `db` column; the path fallback below covers it
        probeSql = 'SELECT sqlite_version() AS version';
        break;
      default:
        probeSql = 'SELECT current_database() AS db, version() AS version';
    }

    const { rows } = await client.query(probeSql);
    const probe = rows[0];
    return {
      schema: cfg.schema,
      modelCount: models.length,
      seedCount: seeds.length,
      target,
      database: probe.db ?? cfg.connection.path,
      version: probe.version,
    };
  });
}
|