goldenmatch 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (162) hide show
  1. package/README.md +140 -0
  2. package/dist/cli.cjs +6079 -0
  3. package/dist/cli.cjs.map +1 -0
  4. package/dist/cli.d.cts +1 -0
  5. package/dist/cli.d.ts +1 -0
  6. package/dist/cli.js +6076 -0
  7. package/dist/cli.js.map +1 -0
  8. package/dist/core/index.cjs +8449 -0
  9. package/dist/core/index.cjs.map +1 -0
  10. package/dist/core/index.d.cts +1972 -0
  11. package/dist/core/index.d.ts +1972 -0
  12. package/dist/core/index.js +8318 -0
  13. package/dist/core/index.js.map +1 -0
  14. package/dist/index.cjs +8449 -0
  15. package/dist/index.cjs.map +1 -0
  16. package/dist/index.d.cts +2 -0
  17. package/dist/index.d.ts +2 -0
  18. package/dist/index.js +8318 -0
  19. package/dist/index.js.map +1 -0
  20. package/dist/node/backends/score-worker.cjs +934 -0
  21. package/dist/node/backends/score-worker.cjs.map +1 -0
  22. package/dist/node/backends/score-worker.d.cts +14 -0
  23. package/dist/node/backends/score-worker.d.ts +14 -0
  24. package/dist/node/backends/score-worker.js +932 -0
  25. package/dist/node/backends/score-worker.js.map +1 -0
  26. package/dist/node/index.cjs +11430 -0
  27. package/dist/node/index.cjs.map +1 -0
  28. package/dist/node/index.d.cts +554 -0
  29. package/dist/node/index.d.ts +554 -0
  30. package/dist/node/index.js +11277 -0
  31. package/dist/node/index.js.map +1 -0
  32. package/dist/types-DhUdX5Rc.d.cts +304 -0
  33. package/dist/types-DhUdX5Rc.d.ts +304 -0
  34. package/examples/01-basic-dedupe.ts +60 -0
  35. package/examples/02-match-two-datasets.ts +48 -0
  36. package/examples/03-csv-file-pipeline.ts +62 -0
  37. package/examples/04-string-scoring.ts +63 -0
  38. package/examples/05-custom-config.ts +94 -0
  39. package/examples/06-probabilistic-fs.ts +72 -0
  40. package/examples/07-pprl-privacy.ts +76 -0
  41. package/examples/08-streaming.ts +79 -0
  42. package/examples/09-llm-scorer.ts +79 -0
  43. package/examples/10-explain.ts +60 -0
  44. package/examples/11-evaluate.ts +61 -0
  45. package/examples/README.md +53 -0
  46. package/package.json +66 -0
  47. package/src/cli.ts +372 -0
  48. package/src/core/ann-blocker.ts +593 -0
  49. package/src/core/api.ts +220 -0
  50. package/src/core/autoconfig.ts +363 -0
  51. package/src/core/autofix.ts +102 -0
  52. package/src/core/blocker.ts +655 -0
  53. package/src/core/cluster.ts +699 -0
  54. package/src/core/compare-clusters.ts +176 -0
  55. package/src/core/config/loader.ts +869 -0
  56. package/src/core/cross-encoder.ts +614 -0
  57. package/src/core/data.ts +430 -0
  58. package/src/core/domain.ts +277 -0
  59. package/src/core/embedder.ts +562 -0
  60. package/src/core/evaluate.ts +156 -0
  61. package/src/core/explain.ts +352 -0
  62. package/src/core/golden.ts +524 -0
  63. package/src/core/graph-er.ts +371 -0
  64. package/src/core/index.ts +314 -0
  65. package/src/core/ingest.ts +112 -0
  66. package/src/core/learned-blocking.ts +305 -0
  67. package/src/core/lineage.ts +221 -0
  68. package/src/core/llm/budget.ts +258 -0
  69. package/src/core/llm/cluster.ts +542 -0
  70. package/src/core/llm/scorer.ts +396 -0
  71. package/src/core/match-one.ts +95 -0
  72. package/src/core/matchkey.ts +97 -0
  73. package/src/core/memory/corrections.ts +179 -0
  74. package/src/core/memory/learner.ts +218 -0
  75. package/src/core/memory/store.ts +114 -0
  76. package/src/core/pipeline.ts +366 -0
  77. package/src/core/pprl/protocol.ts +216 -0
  78. package/src/core/probabilistic.ts +511 -0
  79. package/src/core/profiler.ts +212 -0
  80. package/src/core/quality.ts +197 -0
  81. package/src/core/review-queue.ts +177 -0
  82. package/src/core/scorer.ts +855 -0
  83. package/src/core/sensitivity.ts +196 -0
  84. package/src/core/standardize.ts +279 -0
  85. package/src/core/streaming.ts +128 -0
  86. package/src/core/transforms.ts +599 -0
  87. package/src/core/types.ts +570 -0
  88. package/src/core/validate.ts +243 -0
  89. package/src/index.ts +8 -0
  90. package/src/node/a2a/server.ts +470 -0
  91. package/src/node/api/server.ts +412 -0
  92. package/src/node/backends/duckdb.ts +130 -0
  93. package/src/node/backends/score-worker.ts +41 -0
  94. package/src/node/backends/workers.ts +212 -0
  95. package/src/node/config-file.ts +66 -0
  96. package/src/node/connectors/base.ts +57 -0
  97. package/src/node/connectors/bigquery.ts +61 -0
  98. package/src/node/connectors/databricks.ts +69 -0
  99. package/src/node/connectors/file.ts +350 -0
  100. package/src/node/connectors/hubspot.ts +62 -0
  101. package/src/node/connectors/index.ts +43 -0
  102. package/src/node/connectors/salesforce.ts +93 -0
  103. package/src/node/connectors/snowflake.ts +73 -0
  104. package/src/node/db/postgres.ts +173 -0
  105. package/src/node/db/sync.ts +103 -0
  106. package/src/node/dedupe-file.ts +156 -0
  107. package/src/node/index.ts +89 -0
  108. package/src/node/mcp/server.ts +940 -0
  109. package/src/node/tui/app.ts +756 -0
  110. package/src/node/tui/index.ts +6 -0
  111. package/src/node/tui/widgets.ts +128 -0
  112. package/tests/parity/scorer-ground-truth.test.ts +118 -0
  113. package/tests/smoke.test.ts +46 -0
  114. package/tests/unit/a2a-server.test.ts +175 -0
  115. package/tests/unit/ann-blocker.test.ts +117 -0
  116. package/tests/unit/api-server.test.ts +239 -0
  117. package/tests/unit/api.test.ts +77 -0
  118. package/tests/unit/autoconfig.test.ts +103 -0
  119. package/tests/unit/autofix.test.ts +71 -0
  120. package/tests/unit/blocker.test.ts +164 -0
  121. package/tests/unit/buildBlocksAsync.test.ts +63 -0
  122. package/tests/unit/cluster.test.ts +213 -0
  123. package/tests/unit/compare-clusters.test.ts +42 -0
  124. package/tests/unit/config-loader.test.ts +301 -0
  125. package/tests/unit/connectors-base.test.ts +48 -0
  126. package/tests/unit/cross-encoder-model.test.ts +198 -0
  127. package/tests/unit/cross-encoder.test.ts +173 -0
  128. package/tests/unit/db-connectors.test.ts +37 -0
  129. package/tests/unit/domain.test.ts +80 -0
  130. package/tests/unit/embedder.test.ts +151 -0
  131. package/tests/unit/evaluate.test.ts +85 -0
  132. package/tests/unit/explain.test.ts +73 -0
  133. package/tests/unit/golden.test.ts +97 -0
  134. package/tests/unit/graph-er.test.ts +173 -0
  135. package/tests/unit/hnsw-ann.test.ts +283 -0
  136. package/tests/unit/hubspot-connector.test.ts +118 -0
  137. package/tests/unit/ingest.test.ts +97 -0
  138. package/tests/unit/learned-blocking.test.ts +134 -0
  139. package/tests/unit/lineage.test.ts +135 -0
  140. package/tests/unit/match-one.test.ts +129 -0
  141. package/tests/unit/matchkey.test.ts +97 -0
  142. package/tests/unit/mcp-server.test.ts +183 -0
  143. package/tests/unit/memory.test.ts +119 -0
  144. package/tests/unit/pipeline.test.ts +118 -0
  145. package/tests/unit/pprl-protocol.test.ts +381 -0
  146. package/tests/unit/probabilistic.test.ts +494 -0
  147. package/tests/unit/profiler.test.ts +68 -0
  148. package/tests/unit/review-queue.test.ts +68 -0
  149. package/tests/unit/salesforce-connector.test.ts +148 -0
  150. package/tests/unit/scorer.test.ts +301 -0
  151. package/tests/unit/sensitivity.test.ts +154 -0
  152. package/tests/unit/standardize.test.ts +84 -0
  153. package/tests/unit/streaming.test.ts +82 -0
  154. package/tests/unit/transforms.test.ts +208 -0
  155. package/tests/unit/tui-widgets.test.ts +42 -0
  156. package/tests/unit/tui.test.ts +24 -0
  157. package/tests/unit/validate.test.ts +145 -0
  158. package/tests/unit/workers-parallel.test.ts +99 -0
  159. package/tests/unit/workers.test.ts +74 -0
  160. package/tsconfig.json +25 -0
  161. package/tsup.config.ts +37 -0
  162. package/vitest.config.ts +11 -0
@@ -0,0 +1,212 @@
1
+ /**
2
+ * workers.ts -- Concurrent and parallel block scoring for Node.
3
+ *
4
+ * Python uses ThreadPoolExecutor (rapidfuzz releases the GIL). JS is
5
+ * single-threaded by default; true parallelism requires `worker_threads`
6
+ * with serialization overhead that's only worth it for large blocks.
7
+ *
8
+ * This module ships two schedulers:
9
+ * - `scoreBlocksConcurrent` -- Promise.all batching on the main thread.
10
+ * No real parallelism, but zero setup cost and good for small/medium
11
+ * block counts.
12
+ * - `scoreBlocksParallel` -- piscina-backed worker pool for true CPU
13
+ * parallelism. Optional peer dep; falls back to `scoreBlocksConcurrent`
14
+ * when piscina isn't installed.
15
+ *
16
+ * Mirrors the shape of `goldenmatch.backends.ray_backend.score_blocks_ray`
17
+ * from the Python source, but stays inside one Node process.
18
+ */
19
+
20
import { existsSync } from "node:fs";
import { fileURLToPath } from "node:url";
import { dirname, join } from "node:path";

import type {
  BlockResult,
  MatchkeyConfig,
  PairKey,
  ScoredPair,
} from "../../core/types.js";
import { pairKey } from "../../core/cluster.js";
30
+
31
/** Tuning knobs for {@link scoreBlocksConcurrent}. */
export interface WorkerPoolOptions {
  /** Max blocks scored concurrently per batch. Defaults to 4. */
  readonly batchSize?: number;
}
35
+
36
/** Tuning knobs for the piscina pool in {@link scoreBlocksParallel}. */
export interface ParallelWorkerOptions {
  /** Max worker threads. Defaults to min(8, max(2, blocks.length)). */
  readonly maxThreads?: number;
  /** Min worker threads kept warm. Defaults to 1. */
  readonly minThreads?: number;
  /** Idle timeout in ms before workers exit. Defaults to 1000. */
  readonly idleTimeout?: number;
}
44
+
45
+ /**
46
+ * Score blocks with cooperative concurrency.
47
+ *
48
+ * - For 0 blocks: returns empty.
49
+ * - For <= 2 blocks: skips batching overhead and runs sequentially via
50
+ * `scoreBlocksSequential` (mirrors Python's small-block fast path).
51
+ * - Otherwise: schedules blocks in batches of `batchSize`, awaiting each
52
+ * batch with `Promise.all` so the event loop can interleave I/O.
53
+ *
54
+ * Note: `matchedPairs` is mutated as new pairs are discovered (consistent
55
+ * with `scoreBlocksSequential`). A frozen snapshot is used per block so
56
+ * concurrent batches see a stable exclusion set, matching Python's
57
+ * `score_blocks_parallel` contract.
58
+ */
59
+ export async function scoreBlocksConcurrent(
60
+ blocks: readonly BlockResult[],
61
+ mk: MatchkeyConfig,
62
+ matchedPairs: Set<PairKey>,
63
+ options: WorkerPoolOptions = {},
64
+ ): Promise<readonly ScoredPair[]> {
65
+ if (blocks.length === 0) return [];
66
+
67
+ // Small-block fast path -- sequential is cheaper than batching.
68
+ if (blocks.length <= 2) {
69
+ const { scoreBlocksSequential } = await import("../../core/scorer.js");
70
+ return scoreBlocksSequential(blocks, mk, matchedPairs);
71
+ }
72
+
73
+ const { findFuzzyMatches } = await import("../../core/scorer.js");
74
+ const batchSize = options.batchSize ?? 4;
75
+ const results: ScoredPair[] = [];
76
+
77
+ for (let i = 0; i < blocks.length; i += batchSize) {
78
+ const batch = blocks.slice(i, i + batchSize);
79
+
80
+ // Snapshot exclude set per batch so concurrent block scoring is stable.
81
+ const excludeSnapshot: ReadonlySet<PairKey> = new Set(matchedPairs);
82
+
83
+ const batchResults = await Promise.all(
84
+ batch.map((block) =>
85
+ Promise.resolve().then(() =>
86
+ findFuzzyMatches(
87
+ block.rows,
88
+ mk,
89
+ excludeSnapshot,
90
+ block.preScoredPairs,
91
+ ),
92
+ ),
93
+ ),
94
+ );
95
+
96
+ for (const pairs of batchResults) {
97
+ for (const p of pairs) {
98
+ const key = pairKey(p.idA, p.idB);
99
+ if (matchedPairs.has(key)) continue;
100
+ matchedPairs.add(key);
101
+ results.push(p);
102
+ }
103
+ }
104
+ }
105
+
106
+ return results;
107
+ }
108
+
109
+ /**
110
+ * Score blocks in true parallel via piscina worker_threads.
111
+ *
112
+ * - For 0 blocks: returns empty.
113
+ * - For <= 2 blocks: runs sequentially (spinning up workers isn't worth it).
114
+ * - Otherwise: dispatches each block to a piscina worker that runs
115
+ * `findFuzzyMatches` in its own V8 isolate, giving true CPU parallelism.
116
+ *
117
+ * Falls back to `scoreBlocksConcurrent` with a console warning if piscina
118
+ * isn't installed (it's an optional peer dep).
119
+ *
120
+ * `matchedPairs` is mutated in place with newly discovered pairs, matching
121
+ * the contract of `scoreBlocksSequential` / `scoreBlocksConcurrent`.
122
+ */
123
+ export async function scoreBlocksParallel(
124
+ blocks: readonly BlockResult[],
125
+ mk: MatchkeyConfig,
126
+ matchedPairs: Set<PairKey>,
127
+ options: ParallelWorkerOptions = {},
128
+ ): Promise<readonly ScoredPair[]> {
129
+ if (blocks.length === 0) return [];
130
+
131
+ // Small-block fast path -- worker startup isn't worth it.
132
+ if (blocks.length <= 2) {
133
+ const { scoreBlocksSequential } = await import("../../core/scorer.js");
134
+ return scoreBlocksSequential(blocks, mk, matchedPairs);
135
+ }
136
+
137
+ // Dynamically load piscina so it stays an optional peer dep.
138
+ // eslint-disable-next-line @typescript-eslint/no-explicit-any
139
+ let PiscinaCtor: any;
140
+ try {
141
+ const mod = await import("piscina" as string);
142
+ // eslint-disable-next-line @typescript-eslint/no-explicit-any
143
+ const m = mod as any;
144
+ PiscinaCtor = m.Piscina ?? m.default ?? m;
145
+ } catch {
146
+ console.warn(
147
+ "piscina not installed; falling back to Promise.all concurrency. " +
148
+ "Install `piscina` as a peer dep for true worker-thread parallelism.",
149
+ );
150
+ return scoreBlocksConcurrent(blocks, mk, matchedPairs);
151
+ }
152
+
153
+ const workerScript = resolveWorkerScript();
154
+
155
+ const pool = new PiscinaCtor({
156
+ filename: workerScript,
157
+ maxThreads: options.maxThreads ?? Math.max(2, Math.min(8, blocks.length)),
158
+ minThreads: options.minThreads ?? 1,
159
+ idleTimeout: options.idleTimeout ?? 1000,
160
+ });
161
+
162
+ try {
163
+ // Snapshot the exclude set as an array so it serializes across threads.
164
+ const snapshot = Array.from(matchedPairs);
165
+
166
+ const results = (await Promise.all(
167
+ blocks.map(
168
+ (block) =>
169
+ pool.run({ block, mk, matchedPairs: snapshot }) as Promise<{
170
+ pairs: readonly ScoredPair[];
171
+ }>,
172
+ ),
173
+ )) as { pairs: readonly ScoredPair[] }[];
174
+
175
+ const all: ScoredPair[] = [];
176
+ for (const r of results) {
177
+ for (const p of r.pairs) {
178
+ const key = pairKey(p.idA, p.idB);
179
+ if (matchedPairs.has(key)) continue;
180
+ matchedPairs.add(key);
181
+ all.push(p);
182
+ }
183
+ }
184
+ return all;
185
+ } finally {
186
+ await pool.destroy();
187
+ }
188
+ }
189
+
190
+ /**
191
+ * Resolve the on-disk path of the compiled worker script.
192
+ *
193
+ * tsup is configured to emit `score-worker.js` / `.cjs` alongside this
194
+ * module in the `dist/node/backends/` directory. In dev (pre-build) the
195
+ * caller can set `GOLDENMATCH_WORKER_SCRIPT` to point at a custom path
196
+ * (e.g. a ts-node loader).
197
+ *
198
+ * Picks `.js` (ESM) first, falling back to `.cjs`. piscina resolves the
199
+ * file itself -- we just hand it a path string.
200
+ */
201
+ function resolveWorkerScript(): string {
202
+ const override = process.env["GOLDENMATCH_WORKER_SCRIPT"];
203
+ if (override !== undefined && override.length > 0) return override;
204
+
205
+ const here = fileURLToPath(import.meta.url);
206
+ const dir = dirname(here);
207
+
208
+ // Preferred: sibling in the same dist/node/backends/ directory.
209
+ // tsup emits both .js (ESM) and .cjs depending on the package format;
210
+ // piscina accepts either.
211
+ return join(dir, "score-worker.js");
212
+ }
@@ -0,0 +1,66 @@
1
+ /**
2
+ * config-file.ts -- YAML config loading/saving from disk.
3
+ *
4
+ * Node-only. Uses `createRequire` so the optional `yaml` peer dependency
5
+ * is resolved lazily without breaking edge-safe ESM builds.
6
+ */
7
+
8
+ import { readFileSync, writeFileSync, mkdirSync, existsSync } from "node:fs";
9
+ import { resolve, dirname } from "node:path";
10
+ import { createRequire } from "node:module";
11
+ import { parseConfigYaml, configToYaml } from "../core/config/loader.js";
12
+ import type { GoldenMatchConfig } from "../core/types.js";
13
+
14
// Node-only CJS require hook: lets this ESM module resolve the optional
// `yaml` peer dependency lazily, without a static import that would break
// edge-safe builds.
const require = createRequire(import.meta.url);

/** Minimal surface of the `yaml` package that this module relies on. */
interface YamlModule {
  parse: (s: string) => unknown;
  stringify: (v: unknown) => string;
}
20
+
21
+ function loadYamlModule(): YamlModule {
22
+ try {
23
+ // eslint-disable-next-line @typescript-eslint/no-var-requires
24
+ const mod = require("yaml") as YamlModule;
25
+ if (typeof mod.parse !== "function" || typeof mod.stringify !== "function") {
26
+ throw new Error("'yaml' module missing parse/stringify exports");
27
+ }
28
+ return mod;
29
+ } catch (err) {
30
+ const detail = err instanceof Error ? err.message : String(err);
31
+ throw new Error(
32
+ `'yaml' package is required for config file I/O. Install: npm install yaml (${detail})`,
33
+ );
34
+ }
35
+ }
36
+
37
+ /**
38
+ * Load and parse a YAML config file into a typed GoldenMatchConfig.
39
+ *
40
+ * @throws if the file cannot be read, `yaml` is not installed, or the
41
+ * document does not describe a valid config.
42
+ */
43
+ export function loadConfigFile(path: string): GoldenMatchConfig {
44
+ const resolved = resolve(path);
45
+ if (!existsSync(resolved)) {
46
+ throw new Error(`Config file not found: ${resolved}`);
47
+ }
48
+ const content = readFileSync(resolved, "utf8");
49
+ const yamlMod = loadYamlModule();
50
+ return parseConfigYaml(content, yamlMod.parse);
51
+ }
52
+
53
+ /**
54
+ * Serialize a GoldenMatchConfig to YAML and write it to disk.
55
+ * Creates parent directories as needed.
56
+ */
57
+ export function writeConfigFile(path: string, config: GoldenMatchConfig): void {
58
+ const resolved = resolve(path);
59
+ const dir = dirname(resolved);
60
+ if (dir && dir !== "." && !existsSync(dir)) {
61
+ mkdirSync(dir, { recursive: true });
62
+ }
63
+ const yamlMod = loadYamlModule();
64
+ const yamlStr = configToYaml(config, yamlMod.stringify);
65
+ writeFileSync(resolved, yamlStr, "utf8");
66
+ }
@@ -0,0 +1,57 @@
1
+ /**
2
+ * base.ts -- Base connector interface and registry.
3
+ *
4
+ * Mirrors goldenmatch.connectors.base from the Python package: a small
5
+ * abstraction over external data sources (Snowflake, BigQuery, etc.) that
6
+ * exposes connect/read/close lifecycle and a name-based registry.
7
+ */
8
+
9
+ import type { Row } from "../../core/types.js";
10
+
11
+ export interface ConnectorConfig {
12
+ readonly [key: string]: unknown;
13
+ }
14
+
15
+ export interface ConnectorQuery {
16
+ readonly table: string;
17
+ readonly columns?: readonly string[];
18
+ readonly limit?: number;
19
+ }
20
+
21
+ export interface BaseConnector {
22
+ readonly name: string;
23
+ connect(): Promise<void>;
24
+ read(query: string | ConnectorQuery): Promise<Row[]>;
25
+ close(): Promise<void>;
26
+ }
27
+
28
+ export interface ConnectorFactory<C extends ConnectorConfig = ConnectorConfig> {
29
+ (config: C): BaseConnector;
30
+ }
31
+
32
+ // ---------------------------------------------------------------------------
33
+ // Registry
34
+ // ---------------------------------------------------------------------------
35
+
36
+ const registry = new Map<string, ConnectorFactory>();
37
+
38
+ export function registerConnector(name: string, factory: ConnectorFactory): void {
39
+ registry.set(name, factory);
40
+ }
41
+
42
+ export function loadConnector<C extends ConnectorConfig = ConnectorConfig>(
43
+ name: string,
44
+ config: C,
45
+ ): BaseConnector {
46
+ const factory = registry.get(name);
47
+ if (!factory) {
48
+ throw new Error(
49
+ `Unknown connector: ${name}. Registered: ${[...registry.keys()].join(", ")}`,
50
+ );
51
+ }
52
+ return factory(config);
53
+ }
54
+
55
+ export function listConnectors(): readonly string[] {
56
+ return [...registry.keys()];
57
+ }
@@ -0,0 +1,61 @@
1
+ /**
2
+ * bigquery.ts -- Google BigQuery connector via the optional `@google-cloud/bigquery` peer dep.
3
+ */
4
+
5
+ import { createRequire } from "node:module";
6
+ import type { Row } from "../../core/types.js";
7
+ import type { BaseConnector, ConnectorQuery } from "./base.js";
8
+
9
/** Construction options for the BigQuery connector. */
export interface BigQueryConfig {
  /** GCP project passed to the BigQuery client. */
  readonly projectId: string;
  /** Path to a service-account key file (alternative to `credentials`). */
  readonly keyFilename?: string;
  /** Inline credentials object (alternative to `keyFilename`). */
  readonly credentials?: Readonly<Record<string, unknown>>;
  /** Default dataset used to qualify bare table names in built queries. */
  readonly dataset?: string;
  /** Location/region forwarded to both the client and each query. */
  readonly location?: string;
}
16
+
17
+ export function createBigQueryConnector(config: BigQueryConfig): BaseConnector {
18
+ const require = createRequire(import.meta.url);
19
+ // eslint-disable-next-line @typescript-eslint/no-explicit-any
20
+ let bq: any;
21
+ try {
22
+ bq = require("@google-cloud/bigquery");
23
+ } catch {
24
+ throw new Error(
25
+ "'@google-cloud/bigquery' is required for the BigQuery connector. Install: npm install @google-cloud/bigquery",
26
+ );
27
+ }
28
+
29
+ const client = new bq.BigQuery({
30
+ projectId: config.projectId,
31
+ keyFilename: config.keyFilename,
32
+ credentials: config.credentials,
33
+ location: config.location,
34
+ });
35
+
36
+ return {
37
+ name: "bigquery",
38
+ async connect() {
39
+ // No-op: BigQuery client uses REST per-query.
40
+ },
41
+ async read(query) {
42
+ const sql =
43
+ typeof query === "string"
44
+ ? query
45
+ : buildSelect(query, config.dataset);
46
+ const [rows] = await client.query({ query: sql, location: config.location });
47
+ return rows as Row[];
48
+ },
49
+ async close() {
50
+ // No persistent connection to tear down.
51
+ },
52
+ };
53
+ }
54
+
55
+ function buildSelect(q: ConnectorQuery, dataset?: string): string {
56
+ const cols = q.columns?.length ? q.columns.join(",") : "*";
57
+ const tableRef = `${dataset ? `${dataset}.` : ""}${q.table}`;
58
+ let sql = `SELECT ${cols} FROM \`${tableRef}\``;
59
+ if (q.limit) sql += ` LIMIT ${q.limit}`;
60
+ return sql;
61
+ }
@@ -0,0 +1,69 @@
1
+ /**
2
+ * databricks.ts -- Databricks SQL warehouse connector via the optional `@databricks/sql` peer dep.
3
+ */
4
+
5
+ import { createRequire } from "node:module";
6
+ import type { Row } from "../../core/types.js";
7
+ import type { BaseConnector, ConnectorQuery } from "./base.js";
8
+
9
/** Construction options for the Databricks SQL warehouse connector. */
export interface DatabricksConfig {
  /** Workspace hostname, passed to DBSQLClient.connect as `host`. */
  readonly serverHostname: string;
  /** HTTP path of the SQL warehouse endpoint. */
  readonly httpPath: string;
  /** Access token passed to DBSQLClient.connect. */
  readonly token: string;
  /** Initial catalog selected when the session is opened. */
  readonly catalog?: string;
  /** Initial schema selected when the session is opened. */
  readonly schema?: string;
}
16
+
17
+ export function createDatabricksConnector(config: DatabricksConfig): BaseConnector {
18
+ const require = createRequire(import.meta.url);
19
+ // eslint-disable-next-line @typescript-eslint/no-explicit-any
20
+ let db: any;
21
+ try {
22
+ db = require("@databricks/sql");
23
+ } catch {
24
+ throw new Error(
25
+ "'@databricks/sql' is required for the Databricks connector. Install: npm install @databricks/sql",
26
+ );
27
+ }
28
+
29
+ const client = new db.DBSQLClient();
30
+ // eslint-disable-next-line @typescript-eslint/no-explicit-any
31
+ let session: any = null;
32
+
33
+ return {
34
+ name: "databricks",
35
+ async connect() {
36
+ await client.connect({
37
+ host: config.serverHostname,
38
+ path: config.httpPath,
39
+ token: config.token,
40
+ });
41
+ session = await client.openSession({
42
+ initialCatalog: config.catalog,
43
+ initialSchema: config.schema,
44
+ });
45
+ },
46
+ async read(query) {
47
+ if (!session) throw new Error("Databricks connector not connected. Call connect() first.");
48
+ const sql = typeof query === "string" ? query : buildSelect(query);
49
+ const operation = await session.executeStatement(sql);
50
+ const rows = await operation.fetchAll();
51
+ await operation.close();
52
+ return rows as Row[];
53
+ },
54
+ async close() {
55
+ if (session) {
56
+ await session.close();
57
+ session = null;
58
+ }
59
+ await client.close();
60
+ },
61
+ };
62
+ }
63
+
64
+ function buildSelect(q: ConnectorQuery): string {
65
+ const cols = q.columns?.length ? q.columns.map((c) => `\`${c}\``).join(",") : "*";
66
+ let sql = `SELECT ${cols} FROM ${q.table}`;
67
+ if (q.limit) sql += ` LIMIT ${q.limit}`;
68
+ return sql;
69
+ }