npm - dbt-js - Versions diffs - 0.1.1 - Mend

dbt-js 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (15) hide show

package/src/materialize.js ADDED Viewed

@@ -0,0 +1,197 @@
+import { quoteIdent, rel, relationKind, withTransaction } from './db.js';
+import { render } from './render.js';
+import { computeBatches } from './batches.js';
+export async function runModel(client, node, projectCfg, opts = {}) {
+  const { fullRefresh = false, vars } = opts;
+  const { name, config, rawSql } = node;
+  const schema = projectCfg.schema;
+  const target = rel(schema, name);
+  const kind = await relationKind(client, schema, name);
+  const isIncremental = config.materialized === 'incremental' && !fullRefresh && kind === 'r';
+  const ctx = {
+    name,
+    schema,
+    vars: vars ?? projectCfg.vars,
+    isIncremental,
+    sources: projectCfg.sources,
+    timezone: config.timezone,
+  };
+  // Hooks run outside the materialization transaction, one statement each, so
+  // they can use statements Postgres forbids inside a txn (VACUUM, CREATE
+  // INDEX CONCURRENTLY). Microbatch runs them once per model, not per batch.
+  await runHooks(client, config.pre_hook, 'pre_hook', ctx);
+  if (config.materialized === 'incremental' && config.strategy === 'microbatch') {
+    return runMicrobatch(client, node, projectCfg, { ...opts, kind, hookCtx: ctx });
+  }
+  const { sql } = render(rawSql, ctx);
+  const result = await materialize(client, { name, config, sql, target, kind, isIncremental });
+  await runHooks(client, config.post_hook, 'post_hook', ctx);
+  return result;
+}
+/**
+ * Execute the materialization selected by `config.materialized` (and, for an
+ * incremental run, `config.strategy`) for already-rendered SQL.
+ *
+ * Paths, in order:
+ *   - 'view': (re)create the view, dropping a non-view relation first;
+ *   - not incremental (table, first incremental build, full refresh):
+ *     drop + CREATE TABLE AS inside one transaction;
+ *   - strategy 'append': plain INSERT of the rendered SELECT;
+ *   - otherwise delete+insert keyed on `config.unique_key`, via a temp table.
+ *
+ * @param client database client; `client.dialect` selects SQLite/MySQL variants
+ * @param args   { name, config, sql, target, kind, isIncremental }; `target` is
+ *               interpolated as-is, so it must already be a quoted relation name.
+ *               `kind` is the existing relation's kind — 'v' is handled as a view,
+ *               falsy presumably means "no relation yet" (TODO confirm against db.js)
+ * @returns { action } for views, { action, rowCount } otherwise
+ */
+async function materialize(client, { name, config, sql, target, kind, isIncremental }) {
+  const sqlite = client.dialect === 'sqlite';
+  const cascade = sqlite ? '' : ' CASCADE'; // CASCADE is a SQLite syntax error
+  if (config.materialized === 'view') {
+    if (sqlite) {
+      // no CREATE OR REPLACE VIEW; SQLite DDL is transactional, so the wrap
+      // closes the window where the view would be absent
+      return withTransaction(client, async () => {
+        if (kind && kind !== 'v') await client.query(`DROP TABLE IF EXISTS ${target}`);
+        await client.query(`DROP VIEW IF EXISTS ${target}`);
+        await client.query(`CREATE VIEW ${target} AS\n${sql}`);
+        return { action: 'view' };
+      });
+    }
+    // non-SQLite: a relation that is not a view is dropped first; note these two
+    // statements are not wrapped in a transaction
+    if (kind && kind !== 'v') await client.query(`DROP TABLE IF EXISTS ${target} CASCADE`);
+    await client.query(`CREATE OR REPLACE VIEW ${target} AS\n${sql}`);
+    return { action: 'view' };
+  }
+  if (!isIncremental) {
+    // table, or incremental first run / --full-refresh: transactional rebuild
+    return withTransaction(client, async () => {
+      // the DROP statement has to match the kind of relation being replaced
+      if (kind === 'v') await client.query(`DROP VIEW IF EXISTS ${target}${cascade}`);
+      else await client.query(`DROP TABLE IF EXISTS ${target}${cascade}`);
+      const res = await client.query(`CREATE TABLE ${target} AS\n${sql}`);
+      const action = config.materialized === 'table' ? 'table' : 'incremental (full build)';
+      return { action, rowCount: res.rowCount };
+    });
+  }
+  if (config.strategy === 'append') {
+    // single statement, so no explicit transaction wrapper
+    const res = await client.query(`INSERT INTO ${target}\n${sql}`);
+    return { action: 'incremental append', rowCount: res.rowCount };
+  }
+  // delete+insert: compute the SELECT once into a temp table, swap within one txn
+  // unique_key may be a single column name or an array of them
+  const keys = Array.isArray(config.unique_key) ? config.unique_key : [config.unique_key];
+  const temp = quoteIdent(`${name}__dbtjs_incr`);
+  const mysql = client.dialect === 'mysql';
+  return withTransaction(client, async () => {
+    // explicit DROP rather than ON COMMIT DROP — DuckDB silently ignores the latter
+    await client.query(`CREATE TEMPORARY TABLE ${temp} AS\n${sql}`);
+    // join predicate: t = target row, i = freshly computed row, equal on every key column
+    const match = keys.map((k) => `t.${quoteIdent(k)} = i.${quoteIdent(k)}`).join(' AND ');
+    // MySQL has no Postgres-style DELETE ... USING ... WHERE; its multi-table
+    // form references the temp table once per statement, satisfying MySQL's
+    // single-reference rule for TEMPORARY tables. SQLite has neither form —
+    // correlated EXISTS against the aliased target instead.
+    await client.query(
+      sqlite
+        ? `DELETE FROM ${target} AS t WHERE EXISTS (SELECT 1 FROM ${temp} i WHERE ${match})`
+        : mysql
+          ? `DELETE t FROM ${target} t JOIN ${temp} i ON ${match}`
+          : `DELETE FROM ${target} t USING ${temp} i WHERE ${match}`
+    );
+    // rowCount reported to the caller is the number of rows inserted, not deleted
+    const res = await client.query(`INSERT INTO ${target} SELECT * FROM ${temp}`);
+    // TEMPORARY keyword on MySQL: plain DROP TABLE implicitly commits,
+    // which would break this transaction's atomicity
+    await client.query(`DROP ${mysql ? 'TEMPORARY ' : ''}TABLE ${temp}`);
+    return { action: 'incremental delete+insert', rowCount: res.rowCount };
+  });
+}
+async function runHooks(client, hooks, which, ctx) {
+  for (const [i, hook] of hooks.entries()) {
+    const { sql } = render(hook, ctx);
+    try {
+      await client.query(sql);
+    } catch (e) {
+      throw new Error(`${which}[${i}]: ${e.message}`);
+    }
+  }
+}
+// Microbatch: split the event-time range into aligned windows; each batch is its
+// own transaction that replaces the target rows inside its window. A failed
+// batch is recorded and the rest keep running (retry via --event-time-start/-end).
+/**
+ * @param client     database client; `client.dialect === 'sqlite'` switches SQL variants
+ * @param node       model node ({ name, config, rawSql })
+ * @param projectCfg project configuration ({ schema, vars, sources })
+ * @param opts       { fullRefresh, vars, eventTimeStart, eventTimeEnd, onBatch, kind, hookCtx };
+ *                   `onBatch` (optional) is called once per batch with the batch
+ *                   fields plus { ok, rowCount } or { ok, message };
+ *                   `hookCtx` is the render context used for the post-hooks
+ * @returns { action, rowCount, batchCount, failedBatches }; rowCount is
+ *          undefined when any batch reported no row count
+ * @throws Error when a batch fails before the target table has been created
+ */
+async function runMicrobatch(client, node, projectCfg, opts) {
+  const { fullRefresh = false, vars, eventTimeStart, eventTimeEnd, onBatch, kind, hookCtx } = opts;
+  const { name, config, rawSql } = node;
+  const schema = projectCfg.schema;
+  const target = rel(schema, name);
+  // anything other than an existing relation of kind 'r' is rebuilt from scratch
+  const firstBuild = fullRefresh || kind !== 'r';
+  const batches = computeBatches({
+    begin: config.begin,
+    batchSize: config.batch_size,
+    lookback: config.lookback,
+    start: eventTimeStart,
+    end: eventTimeEnd,
+    firstBuild,
+    timezone: config.timezone,
+  });
+  const et = quoteIdent(config.event_time);
+  const sqlite = client.dialect === 'sqlite';
+  const cascade = sqlite ? '' : ' CASCADE';
+  const failed = [];
+  let total = 0;
+  // set once any batch returns a null/undefined rowCount; the total is then not reported
+  let countUnknown = false;
+  // whether the target table exists yet; on a first build the first successful batch creates it
+  let created = !firstBuild;
+  for (const b of batches) {
+    // rendered per batch so batch_start / batch_end reflect this window;
+    // isIncremental stays !firstBuild for every batch of the run
+    const { sql } = render(rawSql, {
+      name,
+      schema,
+      vars: vars ?? projectCfg.vars,
+      isIncremental: !firstBuild,
+      sources: projectCfg.sources,
+      batchStart: b.start,
+      batchEnd: b.end,
+      timezone: config.timezone,
+    });
+    try {
+      let rowCount;
+      if (!created) {
+        // first build: drop whatever relation is there and create the table from this batch
+        rowCount = await withTransaction(client, async () => {
+          if (kind === 'v') await client.query(`DROP VIEW IF EXISTS ${target}${cascade}`);
+          else await client.query(`DROP TABLE IF EXISTS ${target}${cascade}`);
+          const res = await client.query(`CREATE TABLE ${target} AS\n${sql}`);
+          return res.rowCount;
+        });
+        created = true;
+      } else {
+        // table exists: replace the rows whose event time falls in [b.start, b.end)
+        rowCount = await withTransaction(client, async () => {
+          // SQLite compares timestamps as text, and a day-granularity event_time
+          // ('YYYY-MM-DD') sorts BELOW the batch boundary ('YYYY-MM-DD HH:MM:SS'
+          // from computeBatches) — datetime() normalizes both shapes
+          await client.query(
+            sqlite
+              ? `DELETE FROM ${target} WHERE datetime(${et}) >= datetime('${b.start}') AND datetime(${et}) < datetime('${b.end}')`
+              : `DELETE FROM ${target} WHERE ${et} >= '${b.start}' AND ${et} < '${b.end}'`
+          );
+          const res = await client.query(`INSERT INTO ${target}\n${sql}`);
+          return res.rowCount;
+        });
+      }
+      if (rowCount == null) countUnknown = true;
+      else total += rowCount;
+      onBatch?.({ ...b, ok: true, rowCount });
+    } catch (e) {
+      onBatch?.({ ...b, ok: false, message: e.message });
+      if (!created) {
+        // the target doesn't exist yet, so no later batch can insert into it
+        throw new Error(`first batch (${b.start}) failed: ${e.message}`);
+      }
+      // later failures are collected; remaining batches still run
+      failed.push({ ...b, message: e.message });
+    }
+  }
+  // skipped on partial failure: the model is already 'fail', don't stamp a
+  // success hook (grant, index, audit row) onto an incomplete build
+  if (!failed.length) await runHooks(client, config.post_hook, 'post_hook', hookCtx);
+  return {
+    action: 'incremental microbatch',
+    rowCount: countUnknown ? undefined : total,
+    batchCount: batches.length,
+    failedBatches: failed,
+  };
+}

package/src/project.js ADDED Viewed

@@ -0,0 +1,107 @@
+import { existsSync, readFileSync, readdirSync } from 'node:fs';
+import { basename, join } from 'node:path';
+// Matches the first `/* config: ... */` block comment of a model file; capture
+// group 1 is the text between `config:` and the closing `*/`, which
+// parseModelConfig parses as JSON.
+const CONFIG_RE = /\/\*\s*config:\s*([\s\S]*?)\*\//;
+// Accepted values of the "materialized" config key.
+const MATERIALIZATIONS = new Set(['view', 'table', 'incremental']);
+// Accepted values of the "strategy" key (checked for incremental models only).
+const STRATEGIES = new Set(['append', 'delete+insert', 'microbatch']);
+// Accepted values of the "batch_size" key (checked for microbatch models only).
+const BATCH_SIZES = new Set(['hour', 'day', 'month', 'year']);
+// inlineModels: optional { name: rawSql } map (same format as a model file,
+// config comment included) — when given, models/ is not scanned.
+export function loadProject(cwd = process.cwd(), { models: inlineModels } = {}) {
+  const models = [];
+  if (inlineModels) {
+    for (const [name, rawSql] of Object.entries(inlineModels)) {
+      models.push({ name, rawSql, config: parseModelConfig(name, rawSql) });
+    }
+  } else {
+    const modelsDir = join(cwd, 'models');
+    if (existsSync(modelsDir)) {
+      for (const file of readdirSync(modelsDir).filter((f) => f.endsWith('.sql')).sort()) {
+        const path = join(modelsDir, file);
+        const rawSql = readFileSync(path, 'utf8');
+        const name = basename(file, '.sql');
+        models.push({ name, path, rawSql, config: parseModelConfig(name, rawSql) });
+      }
+    }
+  }
+  const seeds = [];
+  const seedsDir = join(cwd, 'seeds');
+  if (existsSync(seedsDir)) {
+    for (const file of readdirSync(seedsDir).filter((f) => f.endsWith('.csv')).sort()) {
+      seeds.push({ name: basename(file, '.csv'), path: join(seedsDir, file) });
+    }
+  }
+  const seen = new Set();
+  for (const { name } of [...models, ...seeds]) {
+    if (seen.has(name)) throw new Error(`Duplicate node name '${name}' across models/ and seeds/`);
+    seen.add(name);
+  }
+  if (!models.length && !seeds.length) {
+    throw new Error(`No models/*.sql or seeds/*.csv found in ${cwd} (and no inline models given)`);
+  }
+  return { models, seeds };
+}
+function parseModelConfig(name, rawSql) {
+  const match = rawSql.match(CONFIG_RE);
+  let config = {};
+  if (match) {
+    try {
+      config = JSON.parse(match[1]);
+    } catch (e) {
+      throw new Error(`Invalid JSON in config comment of model '${name}': ${e.message}`);
+    }
+  }
+  config.materialized ??= 'view';
+  if (!MATERIALIZATIONS.has(config.materialized)) {
+    throw new Error(`Model '${name}': unknown materialized '${config.materialized}' (use view|table|incremental)`);
+  }
+  config.timezone ??= 'UTC';
+  if (typeof config.timezone !== 'string') {
+    throw new Error(`Model '${name}': "timezone" must be a string (e.g. "UTC", "America/New_York")`);
+  }
+  try {
+    // RangeError on an unknown IANA zone; 'UTC' is always valid
+    new Intl.DateTimeFormat('en-US', { timeZone: config.timezone });
+  } catch {
+    throw new Error(`Model '${name}': unknown timezone '${config.timezone}' (use an IANA name like "America/New_York" or "UTC")`);
+  }
+  for (const key of ['pre_hook', 'post_hook']) {
+    if (typeof config[key] === 'string') config[key] = [config[key]];
+    config[key] ??= [];
+    if (!Array.isArray(config[key]) || config[key].some((h) => typeof h !== 'string' || !h.trim())) {
+      throw new Error(`Model '${name}': "${key}" must be a SQL string or array of SQL strings`);
+    }
+  }
+  if (config.materialized === 'incremental') {
+    config.strategy ??= 'append';
+    if (!STRATEGIES.has(config.strategy)) {
+      throw new Error(`Model '${name}': unknown strategy '${config.strategy}' (use append|delete+insert|microbatch)`);
+    }
+    if (config.strategy === 'delete+insert' && !config.unique_key) {
+      throw new Error(`Model '${name}': strategy delete+insert requires "unique_key"`);
+    }
+    if (config.strategy === 'microbatch') {
+      if (typeof config.event_time !== 'string' || !config.event_time) {
+        throw new Error(`Model '${name}': microbatch requires "event_time" (a column of this model)`);
+      }
+      if (!config.begin || Number.isNaN(Date.parse(String(config.begin).replace(' ', 'T')))) {
+        throw new Error(`Model '${name}': microbatch requires "begin" (start of history, e.g. "2026-01-01")`);
+      }
+      if (!BATCH_SIZES.has(config.batch_size)) {
+        throw new Error(`Model '${name}': microbatch requires "batch_size" (hour|day|month|year)`);
+      }
+      config.lookback ??= 1;
+      if (!Number.isInteger(config.lookback) || config.lookback < 0) {
+        throw new Error(`Model '${name}': "lookback" must be a non-negative integer`);
+      }
+      if (config.unique_key) {
+        throw new Error(`Model '${name}': "unique_key" is not used by microbatch (batches replace by event_time window)`);
+      }
+    }
+  }
+  return config;
+}

package/src/render.js ADDED Viewed

@@ -0,0 +1,62 @@
+// Minimal template renderer. Supported constructs:
+//   {{ ref('model') }}  {{ this }}  {{ source('src', 'table') }}
+//   {{ var('name') }}  {{ var('name', default) }}
+//   {{ batch_start }}  {{ batch_end }}          (microbatch models only)
+//   {{ timezone }}                              (the model's config timezone)
+//   {% if is_incremental() %} ... {% endif %}   (no nesting)
+// The `/* config: ... */` block comment; render() strips the first occurrence.
+const CONFIG_RE = /\/\*\s*config:\s*[\s\S]*?\*\//;
+// Group 1 is the block body; the lazy match ends at the first {% endif %}.
+const IF_INCREMENTAL_RE = /\{%\s*if\s+is_incremental\(\)\s*%\}([\s\S]*?)\{%\s*endif\s*%\}/g;
+// Group 1 is the referenced model name (\w+ only, single or double quoted).
+const REF_RE = /\{\{\s*ref\(\s*['"](\w+)['"]\s*\)\s*\}\}/g;
+const THIS_RE = /\{\{\s*this\s*\}\}/g;
+// Group 1 is the source name, group 2 the table name.
+const SOURCE_RE = /\{\{\s*source\(\s*['"](\w+)['"]\s*,\s*['"](\w+)['"]\s*\)\s*\}\}/g;
+// Group 1 is the var name; optional group 2 is the default — a quoted string
+// (quotes included) or a bare token without whitespace or ')'.
+const VAR_RE = /\{\{\s*var\(\s*['"](\w+)['"]\s*(?:,\s*('[^']*'|"[^"]*"|[^)\s]+))?\s*\)\s*\}\}/g;
+// Group 1 tells which of the two batch tokens matched.
+const BATCH_RE = /\{\{\s*(batch_start|batch_end)\s*\}\}/g;
+const TIMEZONE_RE = /\{\{\s*timezone\s*\}\}/g;
+// Any template syntax still present after all substitutions, including an
+// unterminated `{{` or `{%`; not global, so only the first hit is reported.
+const LEFTOVER_RE = /\{\{[\s\S]*?\}\}|\{%[\s\S]*?%\}|\{\{|\{%/;
+// Double-quote an identifier, doubling any embedded double quote.
+const quoteIdent = (s) => `"${s.replace(/"/g, '""')}"`;
+// Drop one pair of matching surrounding quotes ('...' or "..."), if present.
+const stripQuotes = (s) => (/^(['"]).*\1$/s.test(s) ? s.slice(1, -1) : s);
+// Cheap dependency extraction for DAG building — scans ref() calls without
+// rendering, so missing vars or incremental branches can't hide a dependency.
+export function extractRefs(rawSql) {
+  return [...rawSql.matchAll(REF_RE)].map((m) => m[1]);
+}
+// ctx: { name, schema, vars, isIncremental, sources, batchStart?, batchEnd?, timezone? }
+export function render(rawSql, ctx) {
+  const refs = [];
+  let sql = rawSql.replace(CONFIG_RE, '');
+  sql = sql.replace(IF_INCREMENTAL_RE, (_, body) => (ctx.isIncremental ? body : ''));
+  sql = sql.replace(REF_RE, (_, name) => {
+    refs.push(name);
+    return `${quoteIdent(ctx.schema)}.${quoteIdent(name)}`;
+  });
+  sql = sql.replace(THIS_RE, () => `${quoteIdent(ctx.schema)}.${quoteIdent(ctx.name)}`);
+  sql = sql.replace(SOURCE_RE, (_, src, table) => {
+    const decl = ctx.sources?.[src];
+    if (!decl?.schema) {
+      throw new Error(
+        `'${ctx.name}' uses undeclared source '${src}' — add it under "sources" in dbtjs.config.json`
+      );
+    }
+    return `${quoteIdent(decl.schema)}.${quoteIdent(table)}`;
+  });
+  if (ctx.batchStart != null) {
+    // only microbatch runs supply these; elsewhere the token falls through to the leftover guard
+    sql = sql.replace(BATCH_RE, (_, which) => (which === 'batch_start' ? ctx.batchStart : ctx.batchEnd));
+  }
+  // raw substitution (like batch_start) — author quotes it in SQL if needed
+  sql = sql.replace(TIMEZONE_RE, ctx.timezone ?? 'UTC');
+  sql = sql.replace(VAR_RE, (_, name, def) => {
+    const value = ctx.vars?.[name];
+    if (value !== undefined && value !== null) return String(value);
+    if (def !== undefined) return stripQuotes(def);
+    throw new Error(`Missing var '${name}' in '${ctx.name}' (no default given) — pass --vars '{"${name}": ...}'`);
+  });
+  const leftover = sql.match(LEFTOVER_RE);
+  if (leftover) {
+    throw new Error(`Unrecognized template expression in '${ctx.name}': ${leftover[0].slice(0, 80)}`);
+  }
+  return { sql: sql.trim(), refs };
+}

package/src/seed.js ADDED Viewed

@@ -0,0 +1,68 @@
+import { readFileSync } from 'node:fs';
+import { parse } from 'csv-parse/sync';
+import { quoteIdent, rel, withTransaction } from './db.js';
+const BATCH_SIZE = 500;
+export async function loadSeed(client, seed, projectCfg) {
+  const rows = parse(readFileSync(seed.path, 'utf8'), {
+    columns: true,
+    skip_empty_lines: true,
+    trim: true,
+  });
+  if (!rows.length) throw new Error(`Seed '${seed.name}' has no data rows`);
+  const columns = Object.keys(rows[0]);
+  const overrides = projectCfg.seeds?.columnTypes?.[seed.name] ?? {};
+  const mysql = client.dialect === 'mysql';
+  const sqlite = client.dialect === 'sqlite';
+  const types = columns.map((c) => {
+    const t = overrides[c] ?? inferType(rows.map((r) => r[c]));
+    // bare NUMERIC is DECIMAL(10,0) on MySQL — would silently round decimals
+    return mysql && t === 'numeric' ? 'decimal(38,10)' : t;
+  });
+  const target = rel(projectCfg.schema, seed.name);
+  // stay under SQLite's 32766-bind-variable cap (and Postgres's 65535) for wide CSVs
+  const batchSize = Math.max(1, Math.min(BATCH_SIZE, Math.floor(32000 / columns.length)));
+  await withTransaction(client, async () => {
+    await client.query(`DROP TABLE IF EXISTS ${target}${sqlite ? '' : ' CASCADE'}`);
+    const defs = columns.map((c, i) => `${quoteIdent(c)} ${types[i]}`).join(', ');
+    await client.query(`CREATE TABLE ${target} (${defs})`);
+    for (let i = 0; i < rows.length; i += batchSize) {
+      const batch = rows.slice(i, i + batchSize);
+      const params = [];
+      const tuples = batch.map(
+        (row) =>
+          `(${columns
+            .map((c, j) => {
+              let v = row[c] === '' ? null : row[c];
+              // MySQL booleans are TINYINT(1); the string 'true' errors under
+              // strict mode. SQLite would store the TEXT 'true', which is falsy
+              // in CASE WHEN (and better-sqlite3 can't bind true/false anyway).
+              if ((mysql || sqlite) && v !== null && types[j] === 'boolean')
+                v = /^(true|t)$/i.test(v) ? 1 : 0;
+              params.push(v);
+              return `$${params.length}`;
+            })
+            .join(', ')})`
+      );
+      await client.query(`INSERT INTO ${target} VALUES ${tuples.join(', ')}`, params);
+    }
+  });
+  return { rowCount: rows.length };
+}
+// Minimal inference: integer/bigint, numeric, boolean, else text.
+// Empty strings load as NULL and are excluded from inference.
+// Anything fancier (dates, etc.) → seeds.columnTypes override in dbtjs.config.json.
+export function inferType(values) {
+  const present = values.filter((v) => v !== '');
+  if (!present.length) return 'text';
+  if (present.every((v) => /^-?\d+$/.test(v))) {
+    return present.some((v) => Math.abs(Number(v)) > 2147483647) ? 'bigint' : 'integer';
+  }
+  if (present.every((v) => /^-?\d*\.?\d+$/.test(v))) return 'numeric';
+  if (present.every((v) => /^(true|false|t|f)$/i.test(v))) return 'boolean';
+  return 'text';
+}

package/src/tests.js ADDED Viewed

@@ -0,0 +1,49 @@
+import { quoteIdent, rel } from './db.js';
+// Each test compiles to a SELECT returning violating rows; any row = FAIL.
+// NULLs only violate not_null (dbt semantics).
+export function buildTests(models, schema) {
+  const tests = [];
+  for (const model of models) {
+    for (const [column, specs] of Object.entries(model.config.tests ?? {})) {
+      const target = rel(schema, model.name);
+      const col = quoteIdent(column);
+      for (const spec of specs) {
+        if (spec === 'not_null') {
+          tests.push({
+            id: `${model.name}.${column}.not_null`,
+            model: model.name,
+            sql: `SELECT * FROM ${target} WHERE ${col} IS NULL`,
+            params: [],
+          });
+        } else if (spec === 'unique') {
+          tests.push({
+            id: `${model.name}.${column}.unique`,
+            model: model.name,
+            sql: `SELECT ${col}, count(*) AS n FROM ${target} WHERE ${col} IS NOT NULL GROUP BY ${col} HAVING count(*) > 1`,
+            params: [],
+          });
+        } else if (spec?.accepted_values?.length) {
+          const placeholders = spec.accepted_values.map((_, i) => `$${i + 1}`).join(', ');
+          tests.push({
+            id: `${model.name}.${column}.accepted_values`,
+            model: model.name,
+            sql: `SELECT ${col}, count(*) AS n FROM ${target} WHERE ${col} IS NOT NULL AND ${col} NOT IN (${placeholders}) GROUP BY ${col}`,
+            params: spec.accepted_values,
+          });
+        } else {
+          throw new Error(`Unknown test ${JSON.stringify(spec)} on ${model.name}.${column}`);
+        }
+      }
+    }
+  }
+  return tests;
+}
+export async function runTest(client, test) {
+  const count = await client.query(`SELECT count(*) AS n FROM (${test.sql}) q`, test.params);
+  const violations = Number(count.rows[0].n);
+  if (violations === 0) return { pass: true };
+  const sample = await client.query(`${test.sql} LIMIT 10`, test.params);
+  return { pass: false, violations, sample: sample.rows };
+}