dbt-js 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Hamza Shahzad
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
package/README.md ADDED
@@ -0,0 +1,325 @@
1
+ # dbt-js
2
+
3
+ A minimalist dbt-like SQL transformation tool for Postgres, MySQL, SQLite, and DuckDB. Models are plain SQL `SELECT` files; dbt-js compiles them (resolving `ref()` / `source()` / `var()`), builds a dependency DAG, and executes everything inside the database in dependency order. Like dbt, it is transformation-only — it never extracts or moves data; raw data must already be in your database (or, with DuckDB, in files it can read in place).
4
+
5
+ Five dependencies: `pg`, `mysql2`, `better-sqlite3`, `@duckdb/node-api`, and `csv-parse` — the database drivers are loaded lazily, so each backend only pays for its own. Plain ESM JavaScript, no build step.
6
+
7
+ ## Install
8
+
9
+ ```sh
10
+ npm install -g dbt-js # global CLI: dbt-js <command>
11
+ npx dbt-js debug # or run without installing
12
+ npm install dbt-js # as a library, for embedding (see below)
13
+ ```
14
+
15
+ Requires Node.js >= 20.
16
+
17
+ ## Quick start
18
+
19
+ The published package ships just the CLI and library; the runnable examples live in the
20
+ [repository](https://github.com/shahzadhamza/dbt-js). Clone it to try the fully self-contained
21
+ DuckDB example (no database server needed):
22
+
23
+ ```sh
24
+ git clone https://github.com/shahzadhamza/dbt-js && cd dbt-js
25
+ npm install
26
+ cd example-duckdb
27
+ node ../bin/dbt-js.js debug # check config + connectivity
28
+ node ../bin/dbt-js.js seed # load seeds/*.csv
29
+ node ../bin/dbt-js.js run # build all models in DAG order
30
+ node ../bin/dbt-js.js test # run data tests
31
+ ```
32
+
33
+ (With `dbt-js` installed globally, the commands are just `dbt-js debug`, `dbt-js run`, etc.)
34
+ `example/` is the same project targeting a Postgres server instead, `example-mysql/` targets MySQL (a one-line Docker server is in its README), and `example-sqlite/` targets SQLite (also serverless).
35
+
36
+ ## Project layout
37
+
38
+ A dbt-js project is a directory containing:
39
+
40
+ ```
41
+ dbtjs.config.json # connection, target schema, sources, vars
42
+ models/*.sql # one SELECT per file; filename = model name
43
+ seeds/*.csv # one table per file; filename = table name
44
+ ```
45
+
46
+ ### dbtjs.config.json
47
+
48
+ ```json
49
+ {
50
+ "connection": {
51
+ "host": "localhost",
52
+ "port": 5432,
53
+ "user": "me",
54
+ "password": "${DBTJS_PASSWORD}",
55
+ "database": "mydb"
56
+ },
57
+ "schema": "analytics",
58
+ "sources": { "raw": { "schema": "public" } },
59
+ "vars": { "start": null },
60
+ "seeds": { "columnTypes": { "my_seed": { "joined_on": "date" } } }
61
+ }
62
+ ```
63
+
64
+ For MySQL, the same shape with `"type": "mysql"` (`port` defaults to 3306):
65
+
66
+ ```json
67
+ {
68
+ "connection": {
69
+ "type": "mysql",
70
+ "host": "localhost",
71
+ "user": "me",
72
+ "password": "${DBTJS_PASSWORD}",
73
+ "database": "mydb"
74
+ },
75
+ "schema": "analytics"
76
+ }
77
+ ```
78
+
79
+ For DuckDB and SQLite, the connection is just a file path (the warehouse is an embedded local file):
80
+
81
+ ```json
82
+ {
83
+ "connection": { "type": "duckdb", "path": "./warehouse.duckdb" },
84
+ "schema": "analytics"
85
+ }
86
+ ```
87
+
88
+ ```json
89
+ {
90
+ "connection": { "type": "sqlite", "path": "./warehouse.db" },
91
+ "schema": "analytics"
92
+ }
93
+ ```
94
+
95
+ - `connection.type` is `"postgres"` (default), `"mysql"`, `"sqlite"`, or `"duckdb"`.
96
+ - `${NAME}` in connection values is replaced from the environment (error if unset). Omit `password` entirely to let `pg` use `PGPASSWORD`.
97
+ - `schema` is where all models and seeds are created (`CREATE SCHEMA IF NOT EXISTS` runs automatically).
98
+ - `sources` maps a source name to a schema, used by `{{ source('name', 'table') }}`.
99
+ - `vars` are defaults, overridable per-invocation with `--vars '{"start": "2026-06-01"}'`.
100
+ - `seeds.columnTypes` overrides inferred CSV column types (the escape hatch for dates/timestamps).
101
+
102
+ ## Models
103
+
104
+ A model is a single `SELECT`. Configuration lives in one leading block comment with a JSON body:
105
+
106
+ ```sql
107
+ /* config: {
108
+ "materialized": "incremental",
109
+ "strategy": "delete+insert",
110
+ "unique_key": "day",
111
+ "tests": { "day": ["not_null", "unique"] }
112
+ } */
113
+ select ...
114
+ ```
115
+
116
+ No config comment means `{ "materialized": "view" }`.
117
+
118
+ ### Templating
119
+
120
+ | Expression | Becomes |
121
+ |---|---|
122
+ | `{{ ref('other_model') }}` | `"schema"."other_model"` — and declares a DAG dependency |
123
+ | `{{ this }}` | the current model's own table (for incremental high-water marks) |
124
+ | `{{ source('raw', 'orders') }}` | `"public"."orders"` (schema from `sources` config) |
125
+ | `{{ var('start') }}` / `{{ var('x', 0) }}` | the var's value, or the default; error if neither. Inserted verbatim — quote it yourself in SQL |
126
+ | `{{ batch_start }}` / `{{ batch_end }}` | the current batch window as `YYYY-MM-DD HH:MM:SS` (microbatch models only). Inserted verbatim — quote it yourself |
127
+ | `{% if is_incremental() %} ... {% endif %}` | body included only on incremental runs (table exists, not `--full-refresh`) |
128
+
129
+ That's the whole template language. Anything else inside `{{ }}` / `{% %}` is a compile error.
130
+
131
+ ### Materializations
132
+
133
+ - **view** (default): `CREATE OR REPLACE VIEW`
134
+ - **table**: transactional `DROP TABLE ... CASCADE; CREATE TABLE ... AS SELECT` (atomic to readers; CASCADE-dropped downstream views are rebuilt later in the same run — for partial runs use `--select model+`)
135
+ - **incremental**: first run (or `--full-refresh`) builds like a table; after that only the rows your SELECT returns are applied, via a strategy:
136
+ - `append` — plain `INSERT INTO ... SELECT` (immutable event data)
137
+ - `delete+insert` — requires `unique_key` (string or array); deletes matching keys then inserts, in one transaction (idempotent re-runs)
138
+ - `microbatch` — splits the event-time range into aligned windows and replaces each window in its own transaction (see below)
139
+
140
+ ### Hooks
141
+
142
+ `pre_hook` / `post_hook` run extra SQL around a model's build — grants, indexes, `ANALYZE`, audit rows. Each is a string or array of strings, rendered with the same template language as the model body (everything except `batch_start` / `batch_end`):
143
+
144
+ ```sql
145
+ /* config: {
146
+ "materialized": "table",
147
+ "post_hook": [
148
+ "create index if not exists idx_daily_revenue_day on {{ this }} (day)",
149
+ "grant select on {{ this }} to reporting"
150
+ ]
151
+ } */
152
+ select ...
153
+ ```
154
+
155
+ - Order: all pre-hooks → materialization → all post-hooks, each hook as its own statement.
156
+ - One deliberate divergence from dbt: hooks run **outside** the materialization transaction, so they can use statements Postgres forbids inside one (`VACUUM`, `CREATE INDEX CONCURRENTLY`). A failing pre-hook aborts the model before any build; a failing post-hook marks the model FAIL but the built relation remains — fix the hook and re-run.
157
+ - Microbatch models run hooks once per model (pre-hooks before the first batch, post-hooks after the last), not per batch; post-hooks are skipped when any batch failed.
158
+ - `{{ ref('x') }}` inside a hook declares a DAG dependency, same as in the body.
159
+
160
+ ### Incremental pattern + backfill
161
+
162
+ ```sql
163
+ select date_trunc('day', created_at)::date as day, count(*) as orders
164
+ from {{ ref('orders_enriched') }}
165
+ {% if is_incremental() %}
166
+ where created_at >= coalesce(
167
+ nullif('{{ var("start", "") }}', '')::timestamptz,
168
+ (select max(day) from {{ this }})::timestamptz)
169
+ {% endif %}
170
+ group by 1
171
+ ```
172
+
173
+ - Normal run: processes from the table's own high-water mark (`max(day)`).
174
+ - Backfill: `dbt-js run --select daily_revenue --vars '{"start": "2026-01-01"}'` re-derives from that date; `delete+insert` makes it idempotent.
175
+ - Full rebuild: `dbt-js run --select daily_revenue --full-refresh`.
176
+
177
+ ### Microbatch (dbt 1.9-style)
178
+
179
+ For batched, retryable backfills, use `strategy: "microbatch"`. dbt-js splits the time range into `batch_size` windows and runs each as its own transaction: `DELETE` the target rows whose `event_time` falls in the window, then `INSERT` the batch's rows. A failed batch is reported and the rest keep running.
180
+
181
+ ```sql
182
+ /* config: {
183
+ "materialized": "incremental",
184
+ "strategy": "microbatch",
185
+ "event_time": "day",
186
+ "begin": "2026-01-01",
187
+ "batch_size": "day",
188
+ "lookback": 1
189
+ } */
190
+ select date_trunc('day', created_at)::date as day, count(*) as orders
191
+ from {{ ref('orders_enriched') }}
192
+ where created_at >= '{{ batch_start }}'::timestamptz
193
+ and created_at < '{{ batch_end }}'::timestamptz
194
+ group by 1
195
+ ```
196
+
197
+ - `event_time` — column **of this model's output** bounding each batch (used by the engine's per-window DELETE).
198
+ - `begin` — start of history; first run and `--full-refresh` build every batch from here.
199
+ - `batch_size` — `hour` | `day` | `month` | `year`. Boundaries align to the model's `timezone` (default UTC).
200
+ - `lookback` (default 1) — a normal run reprocesses the current batch plus this many previous ones (no high-water mark, same as dbt).
201
+ - Backfill: `dbt-js run --select my_model --event-time-start 2026-06-02 --event-time-end 2026-06-04` rewrites exactly those windows (whole batches; end is exclusive). Idempotent by construction.
202
+ - No `is_incremental()` needed — the `batch_start`/`batch_end` filter applies on every run, including the first.
203
+ - If batches fail, the model exits FAIL listing the failed windows and the exact `--event-time-start/--event-time-end` retry command; other batches' work is kept.
204
+
205
+ One deliberate divergence from dbt: dbt auto-filters upstream `ref()`s by their declared `event_time`; dbt-js does no hidden query rewriting — you filter your input yourself with `{{ batch_start }}` / `{{ batch_end }}`.
206
+
207
+ ### Timezone
208
+
209
+ Any model may set `"timezone"` in its config (a string IANA zone, default `"UTC"`):
210
+
211
+ - For microbatch models it aligns each window to that zone's wall-clock. `{{ batch_start }}` / `{{ batch_end }}` are emitted as naive `YYYY-MM-DD HH:MM:SS` **wall-clock strings in that zone**, so they compare directly against a locally-stored `event_time` column. A `"day"` batch in `"America/New_York"` therefore spans local midnight-to-midnight, not UTC.
212
+ - `{{ timezone }}` is available in **any** model's SQL (raw substitution — quote it yourself, e.g. `created_at at time zone '{{ timezone }}'`).
213
+ - `begin`, `--event-time-start`, and `--event-time-end` given as naive strings are interpreted as wall-clock in the model's `timezone`; strings with an explicit `Z`/offset stay absolute.
214
+ - DST caveat: with `batch_size: "hour"` in a DST zone the spring-forward/fall-back hour is irregular — prefer UTC for hour-grain, or day+ grain for zoned models.
215
+
216
+ ## Tests
217
+
218
+ Declared per column in the model's config. Each compiles to a query returning violating rows; any row fails the test (exit code 1, with up to 10 sample rows printed).
219
+
220
+ - `"not_null"` — rows where the column is NULL
221
+ - `"unique"` — non-NULL values appearing more than once
222
+ - `{ "accepted_values": ["a", "b"] }` — non-NULL values outside the list
223
+
224
+ ## Seeds
225
+
226
+ `dbt-js seed` loads each `seeds/*.csv` as a table (drop + create + insert, transactional). Column types are inferred (`integer`/`bigint`/`numeric`/`boolean`, else `text`; empty string → NULL); override per column via `seeds.columnTypes`. Models can `{{ ref('seed_name') }}` seeds.
227
+
228
+ ## CLI
229
+
230
+ ```
231
+ dbt-js run [--select SPEC] [--full-refresh] [--vars JSON]
232
+ [--event-time-start TS] [--event-time-end TS] # microbatch backfill window
233
+ dbt-js test [--select SPEC] [--vars JSON]
234
+ dbt-js seed [--select SPEC]
235
+ dbt-js compile [--select SPEC] [--vars JSON] # print compiled SQL, no DB needed
236
+ dbt-js ls # nodes in execution order
237
+ dbt-js debug # config + connectivity check
238
+ ```
239
+
240
+ `--select` accepts comma-separated names; `+name` adds everything upstream, `name+` everything downstream (e.g. `--select orders_enriched+` rebuilds it and its dependents).
241
+
242
+ On failure, downstream models are skipped and reported; exit code is 1 if anything failed.
243
+
244
+ ## Embedding in a Node.js app
245
+
246
+ The CLI is a thin wrapper over a programmatic API — `example-embed/` is a runnable ~70-line server using it. Install dbt-js as a dependency:
247
+
248
+ ```sh
249
+ npm install dbt-js
250
+ ```
251
+
252
+ ```js
253
+ import { run, test, seed, compile, ls, debug } from 'dbt-js';
254
+
255
+ const result = await run({
256
+ projectDir: './analytics', // dir containing dbtjs.config.json — always pass this
257
+ select: 'daily_revenue+', // optional, same syntax as --select
258
+ vars: { start: '2026-06-01' }, // optional, plain object (not a JSON string)
259
+ fullRefresh: false,
260
+ onEvent: (e) => logger.info(e), // optional progress stream; omit for silence
261
+ });
262
+ // result = { ok, models: [{ name, status: 'ok'|'fail'|'skip', materialized, action, rowCount,
263
+ // batchCount, failedBatches, durationMs, error }] }
264
+ ```
265
+
266
+ The project can also be supplied inline instead of from files — handy when connection settings live in your app's config system or model SQL is generated:
267
+
268
+ ```js
269
+ await run({
270
+ config: { // contents of dbtjs.config.json (file not read)
271
+ connection: { host: 'db', port: 5432, user: 'analytics', password: process.env.PW, database: 'warehouse' },
272
+ schema: 'analytics',
273
+ sources: { raw: { schema: 'public' } },
274
+ },
275
+ models: { // replaces models/*.sql — same format, config comment included
276
+ stg_orders: "select * from {{ source('raw', 'orders') }} where deleted = false",
277
+ order_counts: "/* config: { \"materialized\": \"table\" } */ select count(*) as n from {{ ref('stg_orders') }}",
278
+ },
279
+ });
280
+ ```
281
+
282
+ With both given, `projectDir` is optional — it then only anchors relative DuckDB paths and locates `seeds/` (file seeds remain `ref()`-able from inline models). Inline `config` goes through the same validation and `${ENV}` interpolation as the file; your object is not mutated.
283
+
284
+ - `run` also takes `eventTimeStart` / `eventTimeEnd` for microbatch backfills. `test` → `{ ok, tests: [{ id, pass, violations, sample }] }`; `seed` → `{ ok, seeds: [...] }`; `compile` → `[{ name, materialized, sql, preHookSql, postHookSql }]` (no DB needed); `ls` → `[{ name, kind, deps }]`; `debug` → connectivity info.
285
+ - Config or project errors **throw**; model/test failures come back as `ok: false` (mirrors the CLI's exit code 1).
286
+ - Every call opens its own connection and closes it before returning — nothing to pool.
287
+ - **Serialize runs yourself** (a one-promise queue is enough — see `example-embed/server.js`): DuckDB allows a single writer per file, so a scheduled refresh and an HTTP-triggered run must not overlap.
288
+ - Relative paths are anchored to `projectDir`, not your app's cwd: the DuckDB `connection.path` is resolved against it, and `read_csv('data/...')`-style paths in model SQL resolve via DuckDB's `file_search_path`.
289
+
290
+ ## DuckDB notes
291
+
292
+ - `sources` resolve to schemas inside the same `.duckdb` file, exactly like Postgres schemas.
293
+ - Models can call DuckDB-native readers directly — `from read_csv('data/orders.csv')` or `read_parquet('...')` — no template syntax needed; raw data files never pass through dbt-js.
294
+ - DuckDB doesn't report row counts for full table builds (CTAS), so those log lines omit the count. Incremental and seed counts are reported normally.
295
+ - `:memory:` is a valid path but pointless for a CLI — each invocation is a separate process, so nothing would persist between `seed` and `run`.
296
+ - Attaching external databases (DuckDB `ATTACH`) is not supported in v1.
297
+ - One Postgres-specific change: pre-existing **materialized views** squatting on a model's name are no longer auto-dropped (relation detection now uses `information_schema`, which can't see them); you'd get a clear Postgres error at build time instead. dbt-js itself never creates materialized views.
298
+
299
+ ## MySQL notes
300
+
301
+ Requires MySQL 8.0+ (`CREATE TABLE ... AS SELECT` under GTID consistency additionally needs 8.0.21+, and temp-table-in-transaction is disallowed when it's enforced).
302
+
303
+ - dbt-js enables `ANSI_QUOTES` for its session, so double quotes are **identifier** quotes exactly as on Postgres/DuckDB — write string literals with single quotes in model SQL (the habit you already have from Postgres).
304
+ - `schema` maps to a MySQL **database**: `CREATE SCHEMA IF NOT EXISTS` is `CREATE DATABASE`, so the connecting user needs the server-wide CREATE privilege (or pre-create the schema and grant on it — see `example-mysql/README.md`).
305
+ - MySQL DDL implicitly commits, so `table` and `--full-refresh` rebuilds (DROP + CREATE TABLE AS) are **not** atomic to readers the way they are on Postgres/DuckDB. `delete+insert` and microbatch window replacement remain fully transactional.
306
+ - No `CREATE INDEX IF NOT EXISTS` — use an idempotent post-hook like `analyze table {{ this }}`, or guard index creation yourself.
307
+ - Seed type inference maps `numeric` to `decimal(38,10)` (bare `NUMERIC` is `DECIMAL(10,0)` on MySQL and would round); `boolean` becomes `TINYINT(1)` with `true/false` loaded as `1/0`. Override per column via `seeds.columnTypes` as usual.
308
+ - Microbatch boundaries are computed in UTC and compared as `DATETIME` literals — prefer a `DATETIME` event-time column, or set the session time zone to UTC via mysql2's `timezone` connection option.
309
+ - Rows come back with `dateStrings: true` (dates as strings, JSON-safe, matching the DuckDB adapter); set `dateStrings: false` in the connection object to get JS `Date`s from the `query` API.
310
+
311
+ ## SQLite notes
312
+
313
+ Driver: `better-sqlite3` (synchronous — a long-running statement blocks the embedding app's event loop; irrelevant for CLI use).
314
+
315
+ - `schema` maps to a **separate database file** `<schema>.db` next to `connection.path`, ATTACHed for the session (created automatically when writable). `"schema": "main"` keeps everything in the single main file — see `example-sqlite/README.md`.
316
+ - SQLite DDL is transactional, so **all** rebuilds — including `table` and `--full-refresh` — are atomic, like Postgres/DuckDB. One caveat: switching `journal_mode` to WAL in a hook removes crash atomicity for transactions spanning the main and attached files.
317
+ - There is no `DROP ... CASCADE`: dropping a table leaves dependent views dangling (they error when next queried) instead of dropping them.
318
+ - Type affinity gotchas: never `CAST(x AS DATETIME)` — `DATETIME` gets NUMERIC affinity, truncating `'2026-06-03'` to `2026`. Store timestamps as `'YYYY-MM-DD HH:MM:SS'` text; lexicographic comparison is chronological, and microbatch window boundaries are normalized with `datetime()` so day-granularity event-time columns work too.
319
+ - Seed `boolean` columns load as `1/0` (the text `'true'` would be falsy in `CASE WHEN`); `numeric` needs no special mapping (affinity stores decimals losslessly).
320
+ - The read-only `query` API opens the files with SQLite's readonly flag — writes fail with `SQLITE_READONLY`, and the database files must already exist.
321
+ - INTEGER values beyond 2^53 come back as imprecise JS numbers from the `query` API.
322
+
323
+ ## License
324
+
325
+ MIT
package/bin/dbt-js.js ADDED
@@ -0,0 +1,4 @@
1
+ #!/usr/bin/env node
2
+ import { main } from '../src/cli.js';
3
+
4
+ main();
package/package.json ADDED
@@ -0,0 +1,53 @@
1
+ {
2
+ "name": "dbt-js",
3
+ "version": "0.1.1",
4
+ "description": "A minimalist, dbt-like SQL transformation tool for Node.js — compile SQL models, build a dependency DAG, and materialize them on any supported SQL database.",
5
+ "type": "module",
6
+ "main": "./src/api.js",
7
+ "exports": {
8
+ ".": "./src/api.js",
9
+ "./package.json": "./package.json"
10
+ },
11
+ "bin": {
12
+ "dbt-js": "bin/dbt-js.js"
13
+ },
14
+ "license": "MIT",
15
+ "author": "Shahzad Hamza",
16
+ "repository": {
17
+ "type": "git",
18
+ "url": "git+https://github.com/shahzadhamza/dbt-js.git"
19
+ },
20
+ "homepage": "https://github.com/shahzadhamza/dbt-js#readme",
21
+ "bugs": {
22
+ "url": "https://github.com/shahzadhamza/dbt-js/issues"
23
+ },
24
+ "keywords": [
25
+ "dbt",
26
+ "sql",
27
+ "postgres",
28
+ "mysql",
29
+ "sqlite",
30
+ "duckdb",
31
+ "etl",
32
+ "elt",
33
+ "data-transformation",
34
+ "dag",
35
+ "analytics"
36
+ ],
37
+ "files": [
38
+ "bin",
39
+ "src",
40
+ "README.md",
41
+ "LICENSE"
42
+ ],
43
+ "engines": {
44
+ "node": ">=20"
45
+ },
46
+ "dependencies": {
47
+ "@duckdb/node-api": "1.5.3-r.3",
48
+ "better-sqlite3": "~12.9.0",
49
+ "csv-parse": "^5.6.0",
50
+ "mysql2": "^3.11.0",
51
+ "pg": "^8.13.0"
52
+ }
53
+ }
package/src/api.js ADDED
@@ -0,0 +1,257 @@
1
+ // Programmatic API — what `import 'dbt-js'` gives you. Every function takes a
2
+ // projectDir (default process.cwd()), opens its own connection, and closes it
3
+ // before returning. Loading/config errors throw; model and test failures are
4
+ // returned as ok: false. Nothing here writes to the console or exits the
5
+ // process — pass onEvent to observe progress.
6
+ //
7
+ // Instead of project files you can pass the project inline:
8
+ // config — the contents of dbtjs.config.json as an object (file not read)
9
+ // models — a { name: rawSql } map replacing models/*.sql (same format,
10
+ // /* config: {...} */ comment included)
11
+ // projectDir then only anchors relative duckdb paths and locates seeds/.
12
+
13
+ import { loadConfig, validateConfig } from './config.js';
14
+ import { loadProject } from './project.js';
15
+ import { buildDag, expandSelection } from './dag.js';
16
+ import { connect, ensureSchema } from './db.js';
17
+ import { runModel } from './materialize.js';
18
+ import { buildTests, runTest } from './tests.js';
19
+ import { loadSeed } from './seed.js';
20
+ import { render } from './render.js';
21
+ import { computeBatches } from './batches.js';
22
+
23
/**
 * Resolve the config and the project graph that the API entry points work on.
 *
 * @param {object} [options]
 * @param {string} [options.projectDir=process.cwd()] - project directory.
 * @param {object} [options.vars] - per-call vars, merged over the config's vars.
 * @param {object} [options.config] - inline config; when given, loadConfig is not called.
 * @param {object} [options.models] - inline models, forwarded to loadProject.
 * @returns {{ cfg, models, seeds, nodes, order, projectDir }}
 */
function loadAll(options = {}) {
  const { projectDir = process.cwd(), vars, config, models: inlineModels } = options;

  let cfg;
  if (config) {
    // Validation mutates its input (defaults, env interpolation, path
    // resolution), so it is handed a deep copy rather than the caller's object.
    cfg = validateConfig(structuredClone(config), projectDir);
  } else {
    cfg = loadConfig(projectDir);
  }

  if (vars) {
    // later keys win: per-call vars override the configured defaults
    cfg.vars = { ...cfg.vars, ...vars };
  }

  const project = loadProject(projectDir, { models: inlineModels });
  const dag = buildDag(project.models, project.seeds);

  return {
    cfg,
    models: project.models,
    seeds: project.seeds,
    nodes: dag.nodes,
    order: dag.order,
    projectDir,
  };
}
32
+
33
/**
 * Open a connection, run `fn` with it, and close the connection afterwards
 * whether `fn` succeeded or failed.
 *
 * @param {object} cfg - config; `cfg.connection` and `cfg.schema` are passed to connect().
 * @param {string} projectDir - forwarded to connect() as `projectDir`.
 * @param {(client: object) => any} fn - the work to perform with the open client.
 * @returns {Promise<*>} whatever `fn` resolves to.
 * @throws whatever `fn` throws; if `fn` succeeded, a failure of `client.end()`.
 */
async function withClient(cfg, projectDir, fn) {
  const client = await connect(cfg.connection, { projectDir, schema: cfg.schema });
  let result;
  try {
    result = await fn(client);
  } catch (err) {
    // A throw from inside `finally` would replace this error, so a secondary
    // close failure would hide the real cause. Closing is best-effort here.
    try {
      await client.end();
    } catch {
      // intentionally ignored: `err` is the failure the caller needs to see
    }
    throw err;
  }
  // Success path: a close failure is the only error, so let it propagate.
  await client.end();
  return result;
}
41
+
42
/**
 * Build the selected models, in DAG order, over a single connection.
 *
 * A model whose build throws (or reports failed batches) is recorded as
 * 'fail'; every model depending on a failed or skipped one is recorded as
 * 'skip' without being built. Failures are returned, not thrown.
 *
 * @param {object} [opts] - also forwarded to loadAll (projectDir, vars, config, models).
 * @param {string} [opts.select] - selection spec passed to expandSelection.
 * @param {boolean} [opts.fullRefresh=false]
 * @param {string} [opts.eventTimeStart] - microbatch backfill window start.
 * @param {string} [opts.eventTimeEnd] - microbatch backfill window end; requires eventTimeStart.
 * @param {(event: object) => void} [opts.onEvent] - receives batch events and one record per model.
 * @returns {Promise<{ ok: boolean, models: object[] }>} models:
 *   [{ type, name, materialized, index, total, status: 'ok'|'fail'|'skip', action?,
 *      rowCount?, batchCount?, failedBatches?, durationMs?, error? }]
 * @throws {Error} on eventTimeEnd without eventTimeStart, or an empty selection.
 */
export async function run(opts = {}) {
  const { select, fullRefresh = false, eventTimeStart, eventTimeEnd, onEvent } = opts;
  if (eventTimeEnd && !eventTimeStart) throw new Error('eventTimeEnd requires eventTimeStart');
  const { cfg, nodes, order, projectDir } = loadAll(opts);
  const selected = expandSelection(select, nodes, order).filter(
    (n) => nodes.get(n).type === 'model'
  );
  if (!selected.length) throw new Error('Nothing to run for this selection');

  return withClient(cfg, projectDir, async (client) => {
    await ensureSchema(client, cfg.schema);
    const models = [];
    const bad = new Set(); // failed or skipped — either blocks downstream
    for (const [i, name] of selected.entries()) {
      const node = nodes.get(name);
      const base = {
        type: 'model',
        name,
        materialized: node.config.materialized,
        index: i + 1,
        total: selected.length,
      };
      if (node.deps.some((d) => bad.has(d))) {
        const rec = { ...base, status: 'skip' };
        bad.add(name);
        models.push(rec);
        onEvent?.(rec);
        continue;
      }
      const start = Date.now();
      let rec;
      try {
        const result = await runModel(client, node, cfg, {
          fullRefresh,
          vars: cfg.vars,
          eventTimeStart,
          eventTimeEnd,
          onBatch: (b) => onEvent?.({ type: 'batch', model: name, ...b }),
        });
        const failedBatches = result.failedBatches ?? [];
        rec = {
          ...base,
          status: failedBatches.length ? 'fail' : 'ok',
          action: result.action,
          rowCount: result.rowCount,
          batchCount: result.batchCount,
          failedBatches,
          durationMs: Date.now() - start,
        };
        if (rec.status === 'fail') {
          rec.error = `${failedBatches.length} of ${result.batchCount} batches failed`;
          bad.add(name);
        }
      } catch (e) {
        // Not everything thrown is an Error: fall back to the stringified value
        // so the failure reason is never recorded as `undefined`.
        const error = e?.message ?? String(e);
        rec = { ...base, status: 'fail', error, durationMs: Date.now() - start };
        bad.add(name);
      }
      models.push(rec);
      onEvent?.(rec);
    }
    return { ok: models.every((m) => m.status === 'ok'), models };
  });
}
107
+
108
/**
 * Run the tests that buildTests derives from the selected models.
 *
 * No connection is opened when the selection yields no tests.
 *
 * @param {object} [opts] - also forwarded to loadAll.
 * @param {string} [opts.select] - selection spec passed to expandSelection.
 * @param {(event: object) => void} [opts.onEvent] - receives one record per test.
 * @returns {Promise<{ ok: boolean, tests: object[] }>} tests:
 *   [{ type: 'test', id, model, pass, violations, sample }]
 */
export async function test(opts = {}) {
  const { cfg, nodes, order, projectDir } = loadAll(opts);
  const wanted = new Set(expandSelection(opts.select, nodes, order));

  // walk `order` (not the selection) so tests follow execution order
  const models = [];
  for (const name of order) {
    if (!wanted.has(name)) continue;
    const node = nodes.get(name);
    if (node.type === 'model') models.push(node);
  }

  const tests = buildTests(models, cfg.schema);
  if (!tests.length) return { ok: true, tests: [] };

  const notify = opts.onEvent;
  return withClient(cfg, projectDir, async (client) => {
    const results = [];
    for (const spec of tests) {
      const outcome = await runTest(client, spec);
      const rec = {
        type: 'test',
        id: spec.id,
        model: spec.model,
        pass: outcome.pass,
        violations: outcome.violations ?? 0,
        sample: outcome.sample ?? [],
      };
      results.push(rec);
      notify?.(rec);
    }
    const ok = results.every((rec) => rec.pass);
    return { ok, tests: results };
  });
}
137
+
138
/**
 * Load the project's seeds (all of them, or the comma-separated names in
 * `select`) over a single connection.
 *
 * @param {object} [opts] - also forwarded to loadAll.
 * @param {string} [opts.select] - comma-separated seed names; whitespace around names is trimmed.
 * @param {(event: object) => void} [opts.onEvent] - receives one record per loaded seed.
 * @returns {Promise<{ ok: true, seeds: object[] }>} seeds:
 *   [{ type: 'seed', name, index, total, rowCount, durationMs }]
 * @throws {Error} when no seed matches; a failing loadSeed call also propagates.
 */
export async function seed(opts = {}) {
  const { cfg, seeds, projectDir } = loadAll(opts);
  const { select, onEvent } = opts;

  let selected = seeds;
  if (select) {
    const wanted = new Set();
    for (const part of String(select).split(',')) {
      wanted.add(part.trim());
    }
    selected = seeds.filter(({ name }) => wanted.has(name));
  }
  if (selected.length === 0) throw new Error('No seeds match this selection');

  return withClient(cfg, projectDir, async (client) => {
    await ensureSchema(client, cfg.schema);
    const total = selected.length;
    const results = [];
    let index = 0;
    for (const item of selected) {
      index += 1;
      const startedAt = Date.now();
      const loaded = await loadSeed(client, item, cfg);
      const rec = {
        type: 'seed',
        name: item.name,
        index,
        total,
        rowCount: loaded.rowCount,
        durationMs: Date.now() - startedAt,
      };
      results.push(rec);
      onEvent?.(rec);
    }
    return { ok: true, seeds: results };
  });
}
166
+
167
/**
 * Render the selected models (and their hooks) to SQL without touching the
 * database.
 *
 * @param {object} [opts] - also forwarded to loadAll.
 * @param {string} [opts.select] - selection spec passed to expandSelection.
 * @returns {Promise<object[]>} [{ name, materialized, sql, preHookSql, postHookSql }]
 */
export async function compile(opts = {}) {
  const { cfg, nodes, order } = loadAll(opts);
  const modelNames = expandSelection(opts.select, nodes, order).filter(
    (n) => nodes.get(n).type === 'model'
  );

  const compiled = [];
  for (const name of modelNames) {
    const node = nodes.get(name);
    const { config } = node;

    // context shared by the body and the hooks
    const ctx = {
      name,
      schema: cfg.schema,
      vars: cfg.vars,
      isIncremental: false, // compile is offline; run decides this against the live DB
      sources: cfg.sources,
      timezone: config.timezone,
    };

    const bodyCtx = { ...ctx };
    if (config.strategy === 'microbatch') {
      // Collapse the current normal-run windows into one span (first start to
      // last end) so the compiled body is runnable SQL.
      const windows = computeBatches({
        begin: config.begin,
        batchSize: config.batch_size,
        lookback: config.lookback,
        firstBuild: false,
        timezone: config.timezone,
      });
      bodyCtx.batchStart = windows[0].start;
      bodyCtx.batchEnd = windows[windows.length - 1].end;
    }

    // hooks render without batch context — batch_start/batch_end are body-only
    const renderHook = (hook) => render(hook, ctx).sql;

    const body = render(node.rawSql, bodyCtx);
    compiled.push({
      name,
      materialized: config.materialized,
      sql: body.sql,
      preHookSql: config.pre_hook.map(renderHook),
      postHookSql: config.post_hook.map(renderHook),
    });
  }
  return compiled;
}
203
+
204
/**
 * List every node of the project in execution order, without connecting to
 * the database.
 *
 * @param {object} [opts] - forwarded to loadAll.
 * @returns {Promise<object[]>} [{ name, kind, deps }] — kind is 'seed' for
 *   seeds, otherwise the node's configured materialization.
 */
export async function ls(opts = {}) {
  const { nodes, order } = loadAll(opts);
  const listing = [];
  for (const name of order) {
    const { type, config, deps } = nodes.get(name);
    // config is only read for non-seed nodes
    const kind = type === 'seed' ? 'seed' : config.materialized;
    listing.push({ name, kind, deps });
  }
  return listing;
}
216
+
217
/**
 * Run one arbitrary statement against the project's warehouse.
 *
 * Bypasses loadAll so it works on projects with zero models. `readOnly`
 * (default true) is forwarded to connect(), which opens DuckDB with
 * access_mode READ_ONLY / sets the Postgres session read-only.
 *
 * @param {object} [opts]
 * @param {string} opts.sql - the statement; must be a non-blank string.
 * @param {Array} [opts.params] - passed through to client.query.
 * @param {boolean} [opts.readOnly=true]
 * @param {string} [opts.projectDir=process.cwd()]
 * @param {object} [opts.config] - inline config; when given, loadConfig is not called.
 * @returns {Promise<*>} whatever client.query resolves to ({ rows, rowCount }).
 * @throws {Error} 'sql is required' for a missing/blank statement; the query's
 *   own error when it fails; if the query succeeded, a failure of client.end().
 */
export async function query(opts = {}) {
  const { sql, params, readOnly = true, projectDir = process.cwd(), config } = opts;
  if (typeof sql !== 'string' || !sql.trim()) throw new Error('sql is required');
  const cfg = config
    ? validateConfig(structuredClone(config), projectDir) // clone: the caller's object is left untouched
    : loadConfig(projectDir);
  const client = await connect(cfg.connection, { projectDir, readOnly, schema: cfg.schema });
  let result;
  try {
    result = await client.query(sql, params);
  } catch (err) {
    // A throw from inside `finally` would replace the query error, so a
    // secondary close failure would hide it. Closing is best-effort here.
    try {
      await client.end();
    } catch {
      // intentionally ignored: `err` is the failure the caller needs to see
    }
    throw err;
  }
  // Success path: a close failure is the only error, so let it propagate.
  await client.end();
  return result;
}
233
+
234
/**
 * Connectivity check: load the project, connect, and ask the server for its
 * version (and current database, where the backend's query provides one).
 *
 * @param {object} [opts] - forwarded to loadAll.
 * @returns {Promise<object>} { schema, modelCount, seedCount, target, database, version }
 */
export async function debug(opts = {}) {
  const { cfg, models, seeds, projectDir } = loadAll(opts);

  // human-readable description of what is being connected to
  let target;
  const conn = cfg.connection;
  if (conn.type === 'duckdb' || conn.type === 'sqlite') {
    target = `${conn.type} ${conn.path}`;
  } else {
    target = `${conn.host}:${conn.port}/${conn.database} as ${conn.user}`;
  }

  return withClient(cfg, projectDir, async (client) => {
    let infoSql;
    switch (cfg.connection.type) {
      case 'mysql':
        infoSql = 'SELECT DATABASE() AS db, VERSION() AS version';
        break;
      case 'sqlite':
        infoSql = 'SELECT sqlite_version() AS version';
        break;
      default:
        infoSql = 'SELECT current_database() AS db, version() AS version';
    }
    const { rows } = await client.query(infoSql);
    const [info] = rows;
    return {
      schema: cfg.schema,
      modelCount: models.length,
      seedCount: seeds.length,
      target,
      // the sqlite query returns no `db` column, so fall back to the file path
      database: info.db ?? cfg.connection.path,
      version: info.version,
    };
  });
}