goldenmatch 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +140 -0
- package/dist/cli.cjs +6079 -0
- package/dist/cli.cjs.map +1 -0
- package/dist/cli.d.cts +1 -0
- package/dist/cli.d.ts +1 -0
- package/dist/cli.js +6076 -0
- package/dist/cli.js.map +1 -0
- package/dist/core/index.cjs +8449 -0
- package/dist/core/index.cjs.map +1 -0
- package/dist/core/index.d.cts +1972 -0
- package/dist/core/index.d.ts +1972 -0
- package/dist/core/index.js +8318 -0
- package/dist/core/index.js.map +1 -0
- package/dist/index.cjs +8449 -0
- package/dist/index.cjs.map +1 -0
- package/dist/index.d.cts +2 -0
- package/dist/index.d.ts +2 -0
- package/dist/index.js +8318 -0
- package/dist/index.js.map +1 -0
- package/dist/node/backends/score-worker.cjs +934 -0
- package/dist/node/backends/score-worker.cjs.map +1 -0
- package/dist/node/backends/score-worker.d.cts +14 -0
- package/dist/node/backends/score-worker.d.ts +14 -0
- package/dist/node/backends/score-worker.js +932 -0
- package/dist/node/backends/score-worker.js.map +1 -0
- package/dist/node/index.cjs +11430 -0
- package/dist/node/index.cjs.map +1 -0
- package/dist/node/index.d.cts +554 -0
- package/dist/node/index.d.ts +554 -0
- package/dist/node/index.js +11277 -0
- package/dist/node/index.js.map +1 -0
- package/dist/types-DhUdX5Rc.d.cts +304 -0
- package/dist/types-DhUdX5Rc.d.ts +304 -0
- package/examples/01-basic-dedupe.ts +60 -0
- package/examples/02-match-two-datasets.ts +48 -0
- package/examples/03-csv-file-pipeline.ts +62 -0
- package/examples/04-string-scoring.ts +63 -0
- package/examples/05-custom-config.ts +94 -0
- package/examples/06-probabilistic-fs.ts +72 -0
- package/examples/07-pprl-privacy.ts +76 -0
- package/examples/08-streaming.ts +79 -0
- package/examples/09-llm-scorer.ts +79 -0
- package/examples/10-explain.ts +60 -0
- package/examples/11-evaluate.ts +61 -0
- package/examples/README.md +53 -0
- package/package.json +66 -0
- package/src/cli.ts +372 -0
- package/src/core/ann-blocker.ts +593 -0
- package/src/core/api.ts +220 -0
- package/src/core/autoconfig.ts +363 -0
- package/src/core/autofix.ts +102 -0
- package/src/core/blocker.ts +655 -0
- package/src/core/cluster.ts +699 -0
- package/src/core/compare-clusters.ts +176 -0
- package/src/core/config/loader.ts +869 -0
- package/src/core/cross-encoder.ts +614 -0
- package/src/core/data.ts +430 -0
- package/src/core/domain.ts +277 -0
- package/src/core/embedder.ts +562 -0
- package/src/core/evaluate.ts +156 -0
- package/src/core/explain.ts +352 -0
- package/src/core/golden.ts +524 -0
- package/src/core/graph-er.ts +371 -0
- package/src/core/index.ts +314 -0
- package/src/core/ingest.ts +112 -0
- package/src/core/learned-blocking.ts +305 -0
- package/src/core/lineage.ts +221 -0
- package/src/core/llm/budget.ts +258 -0
- package/src/core/llm/cluster.ts +542 -0
- package/src/core/llm/scorer.ts +396 -0
- package/src/core/match-one.ts +95 -0
- package/src/core/matchkey.ts +97 -0
- package/src/core/memory/corrections.ts +179 -0
- package/src/core/memory/learner.ts +218 -0
- package/src/core/memory/store.ts +114 -0
- package/src/core/pipeline.ts +366 -0
- package/src/core/pprl/protocol.ts +216 -0
- package/src/core/probabilistic.ts +511 -0
- package/src/core/profiler.ts +212 -0
- package/src/core/quality.ts +197 -0
- package/src/core/review-queue.ts +177 -0
- package/src/core/scorer.ts +855 -0
- package/src/core/sensitivity.ts +196 -0
- package/src/core/standardize.ts +279 -0
- package/src/core/streaming.ts +128 -0
- package/src/core/transforms.ts +599 -0
- package/src/core/types.ts +570 -0
- package/src/core/validate.ts +243 -0
- package/src/index.ts +8 -0
- package/src/node/a2a/server.ts +470 -0
- package/src/node/api/server.ts +412 -0
- package/src/node/backends/duckdb.ts +130 -0
- package/src/node/backends/score-worker.ts +41 -0
- package/src/node/backends/workers.ts +212 -0
- package/src/node/config-file.ts +66 -0
- package/src/node/connectors/base.ts +57 -0
- package/src/node/connectors/bigquery.ts +61 -0
- package/src/node/connectors/databricks.ts +69 -0
- package/src/node/connectors/file.ts +350 -0
- package/src/node/connectors/hubspot.ts +62 -0
- package/src/node/connectors/index.ts +43 -0
- package/src/node/connectors/salesforce.ts +93 -0
- package/src/node/connectors/snowflake.ts +73 -0
- package/src/node/db/postgres.ts +173 -0
- package/src/node/db/sync.ts +103 -0
- package/src/node/dedupe-file.ts +156 -0
- package/src/node/index.ts +89 -0
- package/src/node/mcp/server.ts +940 -0
- package/src/node/tui/app.ts +756 -0
- package/src/node/tui/index.ts +6 -0
- package/src/node/tui/widgets.ts +128 -0
- package/tests/parity/scorer-ground-truth.test.ts +118 -0
- package/tests/smoke.test.ts +46 -0
- package/tests/unit/a2a-server.test.ts +175 -0
- package/tests/unit/ann-blocker.test.ts +117 -0
- package/tests/unit/api-server.test.ts +239 -0
- package/tests/unit/api.test.ts +77 -0
- package/tests/unit/autoconfig.test.ts +103 -0
- package/tests/unit/autofix.test.ts +71 -0
- package/tests/unit/blocker.test.ts +164 -0
- package/tests/unit/buildBlocksAsync.test.ts +63 -0
- package/tests/unit/cluster.test.ts +213 -0
- package/tests/unit/compare-clusters.test.ts +42 -0
- package/tests/unit/config-loader.test.ts +301 -0
- package/tests/unit/connectors-base.test.ts +48 -0
- package/tests/unit/cross-encoder-model.test.ts +198 -0
- package/tests/unit/cross-encoder.test.ts +173 -0
- package/tests/unit/db-connectors.test.ts +37 -0
- package/tests/unit/domain.test.ts +80 -0
- package/tests/unit/embedder.test.ts +151 -0
- package/tests/unit/evaluate.test.ts +85 -0
- package/tests/unit/explain.test.ts +73 -0
- package/tests/unit/golden.test.ts +97 -0
- package/tests/unit/graph-er.test.ts +173 -0
- package/tests/unit/hnsw-ann.test.ts +283 -0
- package/tests/unit/hubspot-connector.test.ts +118 -0
- package/tests/unit/ingest.test.ts +97 -0
- package/tests/unit/learned-blocking.test.ts +134 -0
- package/tests/unit/lineage.test.ts +135 -0
- package/tests/unit/match-one.test.ts +129 -0
- package/tests/unit/matchkey.test.ts +97 -0
- package/tests/unit/mcp-server.test.ts +183 -0
- package/tests/unit/memory.test.ts +119 -0
- package/tests/unit/pipeline.test.ts +118 -0
- package/tests/unit/pprl-protocol.test.ts +381 -0
- package/tests/unit/probabilistic.test.ts +494 -0
- package/tests/unit/profiler.test.ts +68 -0
- package/tests/unit/review-queue.test.ts +68 -0
- package/tests/unit/salesforce-connector.test.ts +148 -0
- package/tests/unit/scorer.test.ts +301 -0
- package/tests/unit/sensitivity.test.ts +154 -0
- package/tests/unit/standardize.test.ts +84 -0
- package/tests/unit/streaming.test.ts +82 -0
- package/tests/unit/transforms.test.ts +208 -0
- package/tests/unit/tui-widgets.test.ts +42 -0
- package/tests/unit/tui.test.ts +24 -0
- package/tests/unit/validate.test.ts +145 -0
- package/tests/unit/workers-parallel.test.ts +99 -0
- package/tests/unit/workers.test.ts +74 -0
- package/tsconfig.json +25 -0
- package/tsup.config.ts +37 -0
- package/vitest.config.ts +11 -0
|
@@ -0,0 +1,212 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* workers.ts -- Concurrent and parallel block scoring for Node.
|
|
3
|
+
*
|
|
4
|
+
* Python uses ThreadPoolExecutor (rapidfuzz releases the GIL). JS is
|
|
5
|
+
* single-threaded by default; true parallelism requires `worker_threads`
|
|
6
|
+
* with serialization overhead that's only worth it for large blocks.
|
|
7
|
+
*
|
|
8
|
+
* This module ships two schedulers:
|
|
9
|
+
* - `scoreBlocksConcurrent` -- Promise.all batching on the main thread.
|
|
10
|
+
* No real parallelism, but zero setup cost and good for small/medium
|
|
11
|
+
* block counts.
|
|
12
|
+
* - `scoreBlocksParallel` -- piscina-backed worker pool for true CPU
|
|
13
|
+
* parallelism. Optional peer dep; falls back to `scoreBlocksConcurrent`
|
|
14
|
+
* when piscina isn't installed.
|
|
15
|
+
*
|
|
16
|
+
* Mirrors the shape of `goldenmatch.backends.ray_backend.score_blocks_ray`
|
|
17
|
+
* from the Python source, but stays inside one Node process.
|
|
18
|
+
*/
|
|
19
|
+
|
|
20
|
+
import { fileURLToPath } from "node:url";
|
|
21
|
+
import { dirname, join } from "node:path";
|
|
22
|
+
|
|
23
|
+
import type {
|
|
24
|
+
BlockResult,
|
|
25
|
+
MatchkeyConfig,
|
|
26
|
+
PairKey,
|
|
27
|
+
ScoredPair,
|
|
28
|
+
} from "../../core/types.js";
|
|
29
|
+
import { pairKey } from "../../core/cluster.js";
|
|
30
|
+
|
|
31
|
+
/** Tuning knobs for `scoreBlocksConcurrent` (main-thread batching). */
export interface WorkerPoolOptions {
  /** Max blocks scored concurrently per batch. Defaults to 4. */
  readonly batchSize?: number;
}

/** Tuning knobs for the piscina-backed `scoreBlocksParallel` scheduler. */
export interface ParallelWorkerOptions {
  /** Max worker threads. Defaults to min(8, max(2, blocks.length)). */
  readonly maxThreads?: number;
  /** Min worker threads kept warm between tasks. Defaults to 1. */
  readonly minThreads?: number;
  /** Idle timeout in ms before surplus workers exit. Defaults to 1000. */
  readonly idleTimeout?: number;
}
|
|
44
|
+
|
|
45
|
+
/**
|
|
46
|
+
* Score blocks with cooperative concurrency.
|
|
47
|
+
*
|
|
48
|
+
* - For 0 blocks: returns empty.
|
|
49
|
+
* - For <= 2 blocks: skips batching overhead and runs sequentially via
|
|
50
|
+
* `scoreBlocksSequential` (mirrors Python's small-block fast path).
|
|
51
|
+
* - Otherwise: schedules blocks in batches of `batchSize`, awaiting each
|
|
52
|
+
* batch with `Promise.all` so the event loop can interleave I/O.
|
|
53
|
+
*
|
|
54
|
+
* Note: `matchedPairs` is mutated as new pairs are discovered (consistent
|
|
55
|
+
* with `scoreBlocksSequential`). A frozen snapshot is used per block so
|
|
56
|
+
* concurrent batches see a stable exclusion set, matching Python's
|
|
57
|
+
* `score_blocks_parallel` contract.
|
|
58
|
+
*/
|
|
59
|
+
export async function scoreBlocksConcurrent(
|
|
60
|
+
blocks: readonly BlockResult[],
|
|
61
|
+
mk: MatchkeyConfig,
|
|
62
|
+
matchedPairs: Set<PairKey>,
|
|
63
|
+
options: WorkerPoolOptions = {},
|
|
64
|
+
): Promise<readonly ScoredPair[]> {
|
|
65
|
+
if (blocks.length === 0) return [];
|
|
66
|
+
|
|
67
|
+
// Small-block fast path -- sequential is cheaper than batching.
|
|
68
|
+
if (blocks.length <= 2) {
|
|
69
|
+
const { scoreBlocksSequential } = await import("../../core/scorer.js");
|
|
70
|
+
return scoreBlocksSequential(blocks, mk, matchedPairs);
|
|
71
|
+
}
|
|
72
|
+
|
|
73
|
+
const { findFuzzyMatches } = await import("../../core/scorer.js");
|
|
74
|
+
const batchSize = options.batchSize ?? 4;
|
|
75
|
+
const results: ScoredPair[] = [];
|
|
76
|
+
|
|
77
|
+
for (let i = 0; i < blocks.length; i += batchSize) {
|
|
78
|
+
const batch = blocks.slice(i, i + batchSize);
|
|
79
|
+
|
|
80
|
+
// Snapshot exclude set per batch so concurrent block scoring is stable.
|
|
81
|
+
const excludeSnapshot: ReadonlySet<PairKey> = new Set(matchedPairs);
|
|
82
|
+
|
|
83
|
+
const batchResults = await Promise.all(
|
|
84
|
+
batch.map((block) =>
|
|
85
|
+
Promise.resolve().then(() =>
|
|
86
|
+
findFuzzyMatches(
|
|
87
|
+
block.rows,
|
|
88
|
+
mk,
|
|
89
|
+
excludeSnapshot,
|
|
90
|
+
block.preScoredPairs,
|
|
91
|
+
),
|
|
92
|
+
),
|
|
93
|
+
),
|
|
94
|
+
);
|
|
95
|
+
|
|
96
|
+
for (const pairs of batchResults) {
|
|
97
|
+
for (const p of pairs) {
|
|
98
|
+
const key = pairKey(p.idA, p.idB);
|
|
99
|
+
if (matchedPairs.has(key)) continue;
|
|
100
|
+
matchedPairs.add(key);
|
|
101
|
+
results.push(p);
|
|
102
|
+
}
|
|
103
|
+
}
|
|
104
|
+
}
|
|
105
|
+
|
|
106
|
+
return results;
|
|
107
|
+
}
|
|
108
|
+
|
|
109
|
+
/**
|
|
110
|
+
* Score blocks in true parallel via piscina worker_threads.
|
|
111
|
+
*
|
|
112
|
+
* - For 0 blocks: returns empty.
|
|
113
|
+
* - For <= 2 blocks: runs sequentially (spinning up workers isn't worth it).
|
|
114
|
+
* - Otherwise: dispatches each block to a piscina worker that runs
|
|
115
|
+
* `findFuzzyMatches` in its own V8 isolate, giving true CPU parallelism.
|
|
116
|
+
*
|
|
117
|
+
* Falls back to `scoreBlocksConcurrent` with a console warning if piscina
|
|
118
|
+
* isn't installed (it's an optional peer dep).
|
|
119
|
+
*
|
|
120
|
+
* `matchedPairs` is mutated in place with newly discovered pairs, matching
|
|
121
|
+
* the contract of `scoreBlocksSequential` / `scoreBlocksConcurrent`.
|
|
122
|
+
*/
|
|
123
|
+
export async function scoreBlocksParallel(
|
|
124
|
+
blocks: readonly BlockResult[],
|
|
125
|
+
mk: MatchkeyConfig,
|
|
126
|
+
matchedPairs: Set<PairKey>,
|
|
127
|
+
options: ParallelWorkerOptions = {},
|
|
128
|
+
): Promise<readonly ScoredPair[]> {
|
|
129
|
+
if (blocks.length === 0) return [];
|
|
130
|
+
|
|
131
|
+
// Small-block fast path -- worker startup isn't worth it.
|
|
132
|
+
if (blocks.length <= 2) {
|
|
133
|
+
const { scoreBlocksSequential } = await import("../../core/scorer.js");
|
|
134
|
+
return scoreBlocksSequential(blocks, mk, matchedPairs);
|
|
135
|
+
}
|
|
136
|
+
|
|
137
|
+
// Dynamically load piscina so it stays an optional peer dep.
|
|
138
|
+
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
|
139
|
+
let PiscinaCtor: any;
|
|
140
|
+
try {
|
|
141
|
+
const mod = await import("piscina" as string);
|
|
142
|
+
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
|
143
|
+
const m = mod as any;
|
|
144
|
+
PiscinaCtor = m.Piscina ?? m.default ?? m;
|
|
145
|
+
} catch {
|
|
146
|
+
console.warn(
|
|
147
|
+
"piscina not installed; falling back to Promise.all concurrency. " +
|
|
148
|
+
"Install `piscina` as a peer dep for true worker-thread parallelism.",
|
|
149
|
+
);
|
|
150
|
+
return scoreBlocksConcurrent(blocks, mk, matchedPairs);
|
|
151
|
+
}
|
|
152
|
+
|
|
153
|
+
const workerScript = resolveWorkerScript();
|
|
154
|
+
|
|
155
|
+
const pool = new PiscinaCtor({
|
|
156
|
+
filename: workerScript,
|
|
157
|
+
maxThreads: options.maxThreads ?? Math.max(2, Math.min(8, blocks.length)),
|
|
158
|
+
minThreads: options.minThreads ?? 1,
|
|
159
|
+
idleTimeout: options.idleTimeout ?? 1000,
|
|
160
|
+
});
|
|
161
|
+
|
|
162
|
+
try {
|
|
163
|
+
// Snapshot the exclude set as an array so it serializes across threads.
|
|
164
|
+
const snapshot = Array.from(matchedPairs);
|
|
165
|
+
|
|
166
|
+
const results = (await Promise.all(
|
|
167
|
+
blocks.map(
|
|
168
|
+
(block) =>
|
|
169
|
+
pool.run({ block, mk, matchedPairs: snapshot }) as Promise<{
|
|
170
|
+
pairs: readonly ScoredPair[];
|
|
171
|
+
}>,
|
|
172
|
+
),
|
|
173
|
+
)) as { pairs: readonly ScoredPair[] }[];
|
|
174
|
+
|
|
175
|
+
const all: ScoredPair[] = [];
|
|
176
|
+
for (const r of results) {
|
|
177
|
+
for (const p of r.pairs) {
|
|
178
|
+
const key = pairKey(p.idA, p.idB);
|
|
179
|
+
if (matchedPairs.has(key)) continue;
|
|
180
|
+
matchedPairs.add(key);
|
|
181
|
+
all.push(p);
|
|
182
|
+
}
|
|
183
|
+
}
|
|
184
|
+
return all;
|
|
185
|
+
} finally {
|
|
186
|
+
await pool.destroy();
|
|
187
|
+
}
|
|
188
|
+
}
|
|
189
|
+
|
|
190
|
+
/**
|
|
191
|
+
* Resolve the on-disk path of the compiled worker script.
|
|
192
|
+
*
|
|
193
|
+
* tsup is configured to emit `score-worker.js` / `.cjs` alongside this
|
|
194
|
+
* module in the `dist/node/backends/` directory. In dev (pre-build) the
|
|
195
|
+
* caller can set `GOLDENMATCH_WORKER_SCRIPT` to point at a custom path
|
|
196
|
+
* (e.g. a ts-node loader).
|
|
197
|
+
*
|
|
198
|
+
* Picks `.js` (ESM) first, falling back to `.cjs`. piscina resolves the
|
|
199
|
+
* file itself -- we just hand it a path string.
|
|
200
|
+
*/
|
|
201
|
+
function resolveWorkerScript(): string {
|
|
202
|
+
const override = process.env["GOLDENMATCH_WORKER_SCRIPT"];
|
|
203
|
+
if (override !== undefined && override.length > 0) return override;
|
|
204
|
+
|
|
205
|
+
const here = fileURLToPath(import.meta.url);
|
|
206
|
+
const dir = dirname(here);
|
|
207
|
+
|
|
208
|
+
// Preferred: sibling in the same dist/node/backends/ directory.
|
|
209
|
+
// tsup emits both .js (ESM) and .cjs depending on the package format;
|
|
210
|
+
// piscina accepts either.
|
|
211
|
+
return join(dir, "score-worker.js");
|
|
212
|
+
}
|
|
@@ -0,0 +1,66 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* config-file.ts -- YAML config loading/saving from disk.
|
|
3
|
+
*
|
|
4
|
+
* Node-only. Uses `createRequire` so the optional `yaml` peer dependency
|
|
5
|
+
* is resolved lazily without breaking edge-safe ESM builds.
|
|
6
|
+
*/
|
|
7
|
+
|
|
8
|
+
import { readFileSync, writeFileSync, mkdirSync, existsSync } from "node:fs";
|
|
9
|
+
import { resolve, dirname } from "node:path";
|
|
10
|
+
import { createRequire } from "node:module";
|
|
11
|
+
import { parseConfigYaml, configToYaml } from "../core/config/loader.js";
|
|
12
|
+
import type { GoldenMatchConfig } from "../core/types.js";
|
|
13
|
+
|
|
14
|
+
const require = createRequire(import.meta.url);
|
|
15
|
+
|
|
16
|
+
/** Minimal surface of the optional `yaml` package that this module relies on. */
interface YamlModule {
  parse: (s: string) => unknown;
  stringify: (v: unknown) => string;
}
|
|
20
|
+
|
|
21
|
+
function loadYamlModule(): YamlModule {
|
|
22
|
+
try {
|
|
23
|
+
// eslint-disable-next-line @typescript-eslint/no-var-requires
|
|
24
|
+
const mod = require("yaml") as YamlModule;
|
|
25
|
+
if (typeof mod.parse !== "function" || typeof mod.stringify !== "function") {
|
|
26
|
+
throw new Error("'yaml' module missing parse/stringify exports");
|
|
27
|
+
}
|
|
28
|
+
return mod;
|
|
29
|
+
} catch (err) {
|
|
30
|
+
const detail = err instanceof Error ? err.message : String(err);
|
|
31
|
+
throw new Error(
|
|
32
|
+
`'yaml' package is required for config file I/O. Install: npm install yaml (${detail})`,
|
|
33
|
+
);
|
|
34
|
+
}
|
|
35
|
+
}
|
|
36
|
+
|
|
37
|
+
/**
|
|
38
|
+
* Load and parse a YAML config file into a typed GoldenMatchConfig.
|
|
39
|
+
*
|
|
40
|
+
* @throws if the file cannot be read, `yaml` is not installed, or the
|
|
41
|
+
* document does not describe a valid config.
|
|
42
|
+
*/
|
|
43
|
+
export function loadConfigFile(path: string): GoldenMatchConfig {
|
|
44
|
+
const resolved = resolve(path);
|
|
45
|
+
if (!existsSync(resolved)) {
|
|
46
|
+
throw new Error(`Config file not found: ${resolved}`);
|
|
47
|
+
}
|
|
48
|
+
const content = readFileSync(resolved, "utf8");
|
|
49
|
+
const yamlMod = loadYamlModule();
|
|
50
|
+
return parseConfigYaml(content, yamlMod.parse);
|
|
51
|
+
}
|
|
52
|
+
|
|
53
|
+
/**
|
|
54
|
+
* Serialize a GoldenMatchConfig to YAML and write it to disk.
|
|
55
|
+
* Creates parent directories as needed.
|
|
56
|
+
*/
|
|
57
|
+
export function writeConfigFile(path: string, config: GoldenMatchConfig): void {
|
|
58
|
+
const resolved = resolve(path);
|
|
59
|
+
const dir = dirname(resolved);
|
|
60
|
+
if (dir && dir !== "." && !existsSync(dir)) {
|
|
61
|
+
mkdirSync(dir, { recursive: true });
|
|
62
|
+
}
|
|
63
|
+
const yamlMod = loadYamlModule();
|
|
64
|
+
const yamlStr = configToYaml(config, yamlMod.stringify);
|
|
65
|
+
writeFileSync(resolved, yamlStr, "utf8");
|
|
66
|
+
}
|
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* base.ts -- Base connector interface and registry.
|
|
3
|
+
*
|
|
4
|
+
* Mirrors goldenmatch.connectors.base from the Python package: a small
|
|
5
|
+
* abstraction over external data sources (Snowflake, BigQuery, etc.) that
|
|
6
|
+
* exposes connect/read/close lifecycle and a name-based registry.
|
|
7
|
+
*/
|
|
8
|
+
|
|
9
|
+
import type { Row } from "../../core/types.js";
|
|
10
|
+
|
|
11
|
+
/** Free-form, connector-specific configuration bag. */
export interface ConnectorConfig {
  readonly [key: string]: unknown;
}

/** Structured read request: a table plus optional column projection and row cap. */
export interface ConnectorQuery {
  readonly table: string;
  readonly columns?: readonly string[];
  readonly limit?: number;
}

/**
 * Lifecycle contract every connector implements.
 *
 * `read` accepts either a raw SQL string or a structured {@link ConnectorQuery}
 * that the connector turns into SQL itself.
 */
export interface BaseConnector {
  readonly name: string;
  connect(): Promise<void>;
  read(query: string | ConnectorQuery): Promise<Row[]>;
  close(): Promise<void>;
}

/** Factory that builds a connector instance from its typed config. */
export interface ConnectorFactory<C extends ConnectorConfig = ConnectorConfig> {
  (config: C): BaseConnector;
}
|
|
31
|
+
|
|
32
|
+
// ---------------------------------------------------------------------------
|
|
33
|
+
// Registry
|
|
34
|
+
// ---------------------------------------------------------------------------
|
|
35
|
+
|
|
36
|
+
const registry = new Map<string, ConnectorFactory>();
|
|
37
|
+
|
|
38
|
+
export function registerConnector(name: string, factory: ConnectorFactory): void {
|
|
39
|
+
registry.set(name, factory);
|
|
40
|
+
}
|
|
41
|
+
|
|
42
|
+
export function loadConnector<C extends ConnectorConfig = ConnectorConfig>(
|
|
43
|
+
name: string,
|
|
44
|
+
config: C,
|
|
45
|
+
): BaseConnector {
|
|
46
|
+
const factory = registry.get(name);
|
|
47
|
+
if (!factory) {
|
|
48
|
+
throw new Error(
|
|
49
|
+
`Unknown connector: ${name}. Registered: ${[...registry.keys()].join(", ")}`,
|
|
50
|
+
);
|
|
51
|
+
}
|
|
52
|
+
return factory(config);
|
|
53
|
+
}
|
|
54
|
+
|
|
55
|
+
export function listConnectors(): readonly string[] {
|
|
56
|
+
return [...registry.keys()];
|
|
57
|
+
}
|
|
@@ -0,0 +1,61 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* bigquery.ts -- Google BigQuery connector via the optional `@google-cloud/bigquery` peer dep.
|
|
3
|
+
*/
|
|
4
|
+
|
|
5
|
+
import { createRequire } from "node:module";
|
|
6
|
+
import type { Row } from "../../core/types.js";
|
|
7
|
+
import type { BaseConnector, ConnectorQuery } from "./base.js";
|
|
8
|
+
|
|
9
|
+
/** Connection settings for the BigQuery connector. */
export interface BigQueryConfig {
  /** GCP project id passed to the BigQuery client. */
  readonly projectId: string;
  /** Optional path to a service-account key file (forwarded to the client). */
  readonly keyFilename?: string;
  /** Optional inline credentials object (forwarded to the client). */
  readonly credentials?: Readonly<Record<string, unknown>>;
  /** Optional dataset used to qualify table names in structured queries. */
  readonly dataset?: string;
  /** Optional location/region forwarded to the client and to each query. */
  readonly location?: string;
}
|
|
16
|
+
|
|
17
|
+
export function createBigQueryConnector(config: BigQueryConfig): BaseConnector {
|
|
18
|
+
const require = createRequire(import.meta.url);
|
|
19
|
+
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
|
20
|
+
let bq: any;
|
|
21
|
+
try {
|
|
22
|
+
bq = require("@google-cloud/bigquery");
|
|
23
|
+
} catch {
|
|
24
|
+
throw new Error(
|
|
25
|
+
"'@google-cloud/bigquery' is required for the BigQuery connector. Install: npm install @google-cloud/bigquery",
|
|
26
|
+
);
|
|
27
|
+
}
|
|
28
|
+
|
|
29
|
+
const client = new bq.BigQuery({
|
|
30
|
+
projectId: config.projectId,
|
|
31
|
+
keyFilename: config.keyFilename,
|
|
32
|
+
credentials: config.credentials,
|
|
33
|
+
location: config.location,
|
|
34
|
+
});
|
|
35
|
+
|
|
36
|
+
return {
|
|
37
|
+
name: "bigquery",
|
|
38
|
+
async connect() {
|
|
39
|
+
// No-op: BigQuery client uses REST per-query.
|
|
40
|
+
},
|
|
41
|
+
async read(query) {
|
|
42
|
+
const sql =
|
|
43
|
+
typeof query === "string"
|
|
44
|
+
? query
|
|
45
|
+
: buildSelect(query, config.dataset);
|
|
46
|
+
const [rows] = await client.query({ query: sql, location: config.location });
|
|
47
|
+
return rows as Row[];
|
|
48
|
+
},
|
|
49
|
+
async close() {
|
|
50
|
+
// No persistent connection to tear down.
|
|
51
|
+
},
|
|
52
|
+
};
|
|
53
|
+
}
|
|
54
|
+
|
|
55
|
+
function buildSelect(q: ConnectorQuery, dataset?: string): string {
|
|
56
|
+
const cols = q.columns?.length ? q.columns.join(",") : "*";
|
|
57
|
+
const tableRef = `${dataset ? `${dataset}.` : ""}${q.table}`;
|
|
58
|
+
let sql = `SELECT ${cols} FROM \`${tableRef}\``;
|
|
59
|
+
if (q.limit) sql += ` LIMIT ${q.limit}`;
|
|
60
|
+
return sql;
|
|
61
|
+
}
|
|
@@ -0,0 +1,69 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* databricks.ts -- Databricks SQL warehouse connector via the optional `@databricks/sql` peer dep.
|
|
3
|
+
*/
|
|
4
|
+
|
|
5
|
+
import { createRequire } from "node:module";
|
|
6
|
+
import type { Row } from "../../core/types.js";
|
|
7
|
+
import type { BaseConnector, ConnectorQuery } from "./base.js";
|
|
8
|
+
|
|
9
|
+
/** Connection settings for the Databricks SQL warehouse connector. */
export interface DatabricksConfig {
  /** SQL warehouse server hostname (passed as `host` on connect). */
  readonly serverHostname: string;
  /** HTTP path of the warehouse endpoint. */
  readonly httpPath: string;
  /** Access token used to authenticate the connection. */
  readonly token: string;
  /** Optional initial catalog for the opened session. */
  readonly catalog?: string;
  /** Optional initial schema for the opened session. */
  readonly schema?: string;
}
|
|
16
|
+
|
|
17
|
+
export function createDatabricksConnector(config: DatabricksConfig): BaseConnector {
|
|
18
|
+
const require = createRequire(import.meta.url);
|
|
19
|
+
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
|
20
|
+
let db: any;
|
|
21
|
+
try {
|
|
22
|
+
db = require("@databricks/sql");
|
|
23
|
+
} catch {
|
|
24
|
+
throw new Error(
|
|
25
|
+
"'@databricks/sql' is required for the Databricks connector. Install: npm install @databricks/sql",
|
|
26
|
+
);
|
|
27
|
+
}
|
|
28
|
+
|
|
29
|
+
const client = new db.DBSQLClient();
|
|
30
|
+
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
|
31
|
+
let session: any = null;
|
|
32
|
+
|
|
33
|
+
return {
|
|
34
|
+
name: "databricks",
|
|
35
|
+
async connect() {
|
|
36
|
+
await client.connect({
|
|
37
|
+
host: config.serverHostname,
|
|
38
|
+
path: config.httpPath,
|
|
39
|
+
token: config.token,
|
|
40
|
+
});
|
|
41
|
+
session = await client.openSession({
|
|
42
|
+
initialCatalog: config.catalog,
|
|
43
|
+
initialSchema: config.schema,
|
|
44
|
+
});
|
|
45
|
+
},
|
|
46
|
+
async read(query) {
|
|
47
|
+
if (!session) throw new Error("Databricks connector not connected. Call connect() first.");
|
|
48
|
+
const sql = typeof query === "string" ? query : buildSelect(query);
|
|
49
|
+
const operation = await session.executeStatement(sql);
|
|
50
|
+
const rows = await operation.fetchAll();
|
|
51
|
+
await operation.close();
|
|
52
|
+
return rows as Row[];
|
|
53
|
+
},
|
|
54
|
+
async close() {
|
|
55
|
+
if (session) {
|
|
56
|
+
await session.close();
|
|
57
|
+
session = null;
|
|
58
|
+
}
|
|
59
|
+
await client.close();
|
|
60
|
+
},
|
|
61
|
+
};
|
|
62
|
+
}
|
|
63
|
+
|
|
64
|
+
function buildSelect(q: ConnectorQuery): string {
|
|
65
|
+
const cols = q.columns?.length ? q.columns.map((c) => `\`${c}\``).join(",") : "*";
|
|
66
|
+
let sql = `SELECT ${cols} FROM ${q.table}`;
|
|
67
|
+
if (q.limit) sql += ` LIMIT ${q.limit}`;
|
|
68
|
+
return sql;
|
|
69
|
+
}
|