goldenmatch 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +140 -0
- package/dist/cli.cjs +6079 -0
- package/dist/cli.cjs.map +1 -0
- package/dist/cli.d.cts +1 -0
- package/dist/cli.d.ts +1 -0
- package/dist/cli.js +6076 -0
- package/dist/cli.js.map +1 -0
- package/dist/core/index.cjs +8449 -0
- package/dist/core/index.cjs.map +1 -0
- package/dist/core/index.d.cts +1972 -0
- package/dist/core/index.d.ts +1972 -0
- package/dist/core/index.js +8318 -0
- package/dist/core/index.js.map +1 -0
- package/dist/index.cjs +8449 -0
- package/dist/index.cjs.map +1 -0
- package/dist/index.d.cts +2 -0
- package/dist/index.d.ts +2 -0
- package/dist/index.js +8318 -0
- package/dist/index.js.map +1 -0
- package/dist/node/backends/score-worker.cjs +934 -0
- package/dist/node/backends/score-worker.cjs.map +1 -0
- package/dist/node/backends/score-worker.d.cts +14 -0
- package/dist/node/backends/score-worker.d.ts +14 -0
- package/dist/node/backends/score-worker.js +932 -0
- package/dist/node/backends/score-worker.js.map +1 -0
- package/dist/node/index.cjs +11430 -0
- package/dist/node/index.cjs.map +1 -0
- package/dist/node/index.d.cts +554 -0
- package/dist/node/index.d.ts +554 -0
- package/dist/node/index.js +11277 -0
- package/dist/node/index.js.map +1 -0
- package/dist/types-DhUdX5Rc.d.cts +304 -0
- package/dist/types-DhUdX5Rc.d.ts +304 -0
- package/examples/01-basic-dedupe.ts +60 -0
- package/examples/02-match-two-datasets.ts +48 -0
- package/examples/03-csv-file-pipeline.ts +62 -0
- package/examples/04-string-scoring.ts +63 -0
- package/examples/05-custom-config.ts +94 -0
- package/examples/06-probabilistic-fs.ts +72 -0
- package/examples/07-pprl-privacy.ts +76 -0
- package/examples/08-streaming.ts +79 -0
- package/examples/09-llm-scorer.ts +79 -0
- package/examples/10-explain.ts +60 -0
- package/examples/11-evaluate.ts +61 -0
- package/examples/README.md +53 -0
- package/package.json +66 -0
- package/src/cli.ts +372 -0
- package/src/core/ann-blocker.ts +593 -0
- package/src/core/api.ts +220 -0
- package/src/core/autoconfig.ts +363 -0
- package/src/core/autofix.ts +102 -0
- package/src/core/blocker.ts +655 -0
- package/src/core/cluster.ts +699 -0
- package/src/core/compare-clusters.ts +176 -0
- package/src/core/config/loader.ts +869 -0
- package/src/core/cross-encoder.ts +614 -0
- package/src/core/data.ts +430 -0
- package/src/core/domain.ts +277 -0
- package/src/core/embedder.ts +562 -0
- package/src/core/evaluate.ts +156 -0
- package/src/core/explain.ts +352 -0
- package/src/core/golden.ts +524 -0
- package/src/core/graph-er.ts +371 -0
- package/src/core/index.ts +314 -0
- package/src/core/ingest.ts +112 -0
- package/src/core/learned-blocking.ts +305 -0
- package/src/core/lineage.ts +221 -0
- package/src/core/llm/budget.ts +258 -0
- package/src/core/llm/cluster.ts +542 -0
- package/src/core/llm/scorer.ts +396 -0
- package/src/core/match-one.ts +95 -0
- package/src/core/matchkey.ts +97 -0
- package/src/core/memory/corrections.ts +179 -0
- package/src/core/memory/learner.ts +218 -0
- package/src/core/memory/store.ts +114 -0
- package/src/core/pipeline.ts +366 -0
- package/src/core/pprl/protocol.ts +216 -0
- package/src/core/probabilistic.ts +511 -0
- package/src/core/profiler.ts +212 -0
- package/src/core/quality.ts +197 -0
- package/src/core/review-queue.ts +177 -0
- package/src/core/scorer.ts +855 -0
- package/src/core/sensitivity.ts +196 -0
- package/src/core/standardize.ts +279 -0
- package/src/core/streaming.ts +128 -0
- package/src/core/transforms.ts +599 -0
- package/src/core/types.ts +570 -0
- package/src/core/validate.ts +243 -0
- package/src/index.ts +8 -0
- package/src/node/a2a/server.ts +470 -0
- package/src/node/api/server.ts +412 -0
- package/src/node/backends/duckdb.ts +130 -0
- package/src/node/backends/score-worker.ts +41 -0
- package/src/node/backends/workers.ts +212 -0
- package/src/node/config-file.ts +66 -0
- package/src/node/connectors/base.ts +57 -0
- package/src/node/connectors/bigquery.ts +61 -0
- package/src/node/connectors/databricks.ts +69 -0
- package/src/node/connectors/file.ts +350 -0
- package/src/node/connectors/hubspot.ts +62 -0
- package/src/node/connectors/index.ts +43 -0
- package/src/node/connectors/salesforce.ts +93 -0
- package/src/node/connectors/snowflake.ts +73 -0
- package/src/node/db/postgres.ts +173 -0
- package/src/node/db/sync.ts +103 -0
- package/src/node/dedupe-file.ts +156 -0
- package/src/node/index.ts +89 -0
- package/src/node/mcp/server.ts +940 -0
- package/src/node/tui/app.ts +756 -0
- package/src/node/tui/index.ts +6 -0
- package/src/node/tui/widgets.ts +128 -0
- package/tests/parity/scorer-ground-truth.test.ts +118 -0
- package/tests/smoke.test.ts +46 -0
- package/tests/unit/a2a-server.test.ts +175 -0
- package/tests/unit/ann-blocker.test.ts +117 -0
- package/tests/unit/api-server.test.ts +239 -0
- package/tests/unit/api.test.ts +77 -0
- package/tests/unit/autoconfig.test.ts +103 -0
- package/tests/unit/autofix.test.ts +71 -0
- package/tests/unit/blocker.test.ts +164 -0
- package/tests/unit/buildBlocksAsync.test.ts +63 -0
- package/tests/unit/cluster.test.ts +213 -0
- package/tests/unit/compare-clusters.test.ts +42 -0
- package/tests/unit/config-loader.test.ts +301 -0
- package/tests/unit/connectors-base.test.ts +48 -0
- package/tests/unit/cross-encoder-model.test.ts +198 -0
- package/tests/unit/cross-encoder.test.ts +173 -0
- package/tests/unit/db-connectors.test.ts +37 -0
- package/tests/unit/domain.test.ts +80 -0
- package/tests/unit/embedder.test.ts +151 -0
- package/tests/unit/evaluate.test.ts +85 -0
- package/tests/unit/explain.test.ts +73 -0
- package/tests/unit/golden.test.ts +97 -0
- package/tests/unit/graph-er.test.ts +173 -0
- package/tests/unit/hnsw-ann.test.ts +283 -0
- package/tests/unit/hubspot-connector.test.ts +118 -0
- package/tests/unit/ingest.test.ts +97 -0
- package/tests/unit/learned-blocking.test.ts +134 -0
- package/tests/unit/lineage.test.ts +135 -0
- package/tests/unit/match-one.test.ts +129 -0
- package/tests/unit/matchkey.test.ts +97 -0
- package/tests/unit/mcp-server.test.ts +183 -0
- package/tests/unit/memory.test.ts +119 -0
- package/tests/unit/pipeline.test.ts +118 -0
- package/tests/unit/pprl-protocol.test.ts +381 -0
- package/tests/unit/probabilistic.test.ts +494 -0
- package/tests/unit/profiler.test.ts +68 -0
- package/tests/unit/review-queue.test.ts +68 -0
- package/tests/unit/salesforce-connector.test.ts +148 -0
- package/tests/unit/scorer.test.ts +301 -0
- package/tests/unit/sensitivity.test.ts +154 -0
- package/tests/unit/standardize.test.ts +84 -0
- package/tests/unit/streaming.test.ts +82 -0
- package/tests/unit/transforms.test.ts +208 -0
- package/tests/unit/tui-widgets.test.ts +42 -0
- package/tests/unit/tui.test.ts +24 -0
- package/tests/unit/validate.test.ts +145 -0
- package/tests/unit/workers-parallel.test.ts +99 -0
- package/tests/unit/workers.test.ts +74 -0
- package/tsconfig.json +25 -0
- package/tsup.config.ts +37 -0
- package/vitest.config.ts +11 -0
|
@@ -0,0 +1,173 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* postgres.ts -- Optional Postgres connector for Node.
|
|
3
|
+
*
|
|
4
|
+
* Mirrors `goldenmatch.db.connector.PostgresConnector` from Python.
|
|
5
|
+
*
|
|
6
|
+
* Peer dependency (NOT in package.json -- install on demand):
|
|
7
|
+
* npm install pg
|
|
8
|
+
*
|
|
9
|
+
* The dep is loaded via `createRequire` so the package stays importable
|
|
10
|
+
* on edge runtimes and in environments without Postgres.
|
|
11
|
+
*/
|
|
12
|
+
|
|
13
|
+
import { createRequire } from "node:module";
|
|
14
|
+
import type { Row } from "../../core/types.js";
|
|
15
|
+
|
|
16
|
+
/**
 * Connection settings accepted by `createPostgresConnector`.
 *
 * Every field is optional; only the fields that are not `undefined` are
 * copied onto the config object handed to `pg.Client`.
 */
export interface PostgresConfig {
  /** Full connection string, forwarded as `connectionString`. */
  readonly connectionString?: string;
  /** Server host name, forwarded as `host`. */
  readonly host?: string;
  /** Server port, forwarded as `port`. */
  readonly port?: number;
  /** Database name, forwarded as `database`. */
  readonly database?: string;
  /** Login user, forwarded as `user`. */
  readonly user?: string;
  /** Login password, forwarded as `password`. */
  readonly password?: string;
  /** SSL flag, forwarded as `ssl`. */
  readonly ssl?: boolean;
}
|
|
25
|
+
|
|
26
|
+
/** Options controlling how `writeTable` inserts rows. */
export interface PostgresWriteOptions {
  /**
   * When `true` AND `primaryKey` is set, the INSERT gets an
   * `ON CONFLICT (primaryKey)` clause. Without `primaryKey` the write is a
   * plain INSERT.
   */
  readonly upsert?: boolean;
  /** Column used as the `ON CONFLICT` target for upserts. */
  readonly primaryKey?: string;
}
|
|
30
|
+
|
|
31
|
+
/** Connector object returned by `createPostgresConnector`. */
export interface PostgresConnector {
  /** Open the underlying client connection. */
  connect(): Promise<void>;
  /** Run `sql` with optional positional `params` and return the result rows. */
  query<T = Row>(sql: string, params?: readonly unknown[]): Promise<T[]>;
  /** Return every row of `table` (`SELECT *`). */
  readTable(table: string): Promise<Row[]>;
  /**
   * Insert `rows` into `table` in batches; the column list is taken from
   * the keys of the first row. A no-op for an empty `rows` array.
   */
  writeTable(
    table: string,
    rows: readonly Row[],
    options?: PostgresWriteOptions,
  ): Promise<void>;
  /** List table names in `schema` (the implementation defaults to "public"). */
  listTables(schema?: string): Promise<string[]>;
  /** End the underlying client connection. */
  close(): Promise<void>;
}
|
|
43
|
+
|
|
44
|
+
/**
 * Minimal shape of `pg.Client` we use.
 *
 * Declared locally so this module type-checks without the optional `pg`
 * dependency being installed.
 */
interface PgClient {
  /** Establish the connection. */
  connect(): Promise<void>;
  /** Execute a statement; only the `rows` of the result are consumed here. */
  query(sql: string, params?: readonly unknown[]): Promise<{ rows: unknown[] }>;
  /** Terminate the connection. */
  end(): Promise<void>;
}
|
|
50
|
+
|
|
51
|
+
/** The single export of the `pg` module this file relies on. */
interface PgModule {
  /** Constructor for a client; receives the assembled connection config. */
  Client: new (config: unknown) => PgClient;
}
|
|
54
|
+
|
|
55
|
+
/**
 * Create a Postgres connector. Throws if `pg` isn't installed.
 *
 * The returned connector requires `connect()` before any query. Inserts
 * are batched in chunks of at most 1000 rows; the chunk is shrunk for wide
 * tables so a single statement never exceeds PostgreSQL's limit of 65,535
 * bind parameters. When `options.upsert` is set together with
 * `options.primaryKey`, the write uses
 * `INSERT ... ON CONFLICT (primaryKey) DO UPDATE` (or `DO NOTHING` when the
 * primary key is the only column).
 *
 * Batches are sent as independent statements (no transaction is opened
 * here), so a failure part-way through leaves earlier batches written.
 *
 * @param config Connection settings; only defined fields are forwarded to
 *   `pg.Client`.
 * @returns A connector bound to one `pg.Client` instance.
 * @throws Error if the optional `pg` module cannot be loaded.
 */
export function createPostgresConnector(
  config: PostgresConfig,
): PostgresConnector {
  // `pg` is an optional peer dependency, so it is resolved lazily at call
  // time instead of through a static import.
  const require = createRequire(import.meta.url);
  let pg: PgModule;
  try {
    pg = require("pg") as PgModule;
  } catch {
    throw new Error(
      "'pg' is required for Postgres support. Install: npm install pg",
    );
  }

  // Forward only the fields the caller actually set.
  const clientConfig: Record<string, unknown> = {};
  if (config.connectionString !== undefined) {
    clientConfig["connectionString"] = config.connectionString;
  }
  if (config.host !== undefined) clientConfig["host"] = config.host;
  if (config.port !== undefined) clientConfig["port"] = config.port;
  if (config.database !== undefined) clientConfig["database"] = config.database;
  if (config.user !== undefined) clientConfig["user"] = config.user;
  if (config.password !== undefined) clientConfig["password"] = config.password;
  if (config.ssl !== undefined) clientConfig["ssl"] = config.ssl;

  const client = new pg.Client(clientConfig);

  // Quote an identifier, doubling any embedded double quotes.
  const escapeIdent = (s: string): string => `"${s.replace(/"/g, '""')}"`;

  // Upper bound on rows per INSERT statement.
  const MAX_BATCH_ROWS = 1000;
  // PostgreSQL accepts at most 65,535 bind parameters in one statement.
  const MAX_BIND_PARAMS = 65_535;

  const writeTable = async (
    table: string,
    rows: readonly Row[],
    options: PostgresWriteOptions = {},
  ): Promise<void> => {
    if (rows.length === 0) return;

    // The column list is taken from the first row; later rows are read
    // through the same keys.
    const first = rows[0]!;
    const cols = Object.keys(first);
    if (cols.length === 0) {
      // An empty column list would produce `INSERT ... () VALUES ()`,
      // which is not valid SQL; fail with a clear message instead.
      throw new Error("writeTable: rows have no columns to insert");
    }
    const tableIdent = escapeIdent(table);
    const colNames = cols.map(escapeIdent).join(", ");

    // Keep rows * columns within the bind-parameter limit. For tables of
    // up to 65 columns this is still 1000 rows per batch.
    const batchSize = Math.max(
      1,
      Math.min(MAX_BATCH_ROWS, Math.floor(MAX_BIND_PARAMS / cols.length)),
    );

    // The conflict clause does not depend on the batch, so build it once.
    let conflictClause = "";
    if (options.upsert === true && options.primaryKey !== undefined) {
      const pk = options.primaryKey;
      const updates = cols
        .filter((c) => c !== pk)
        .map((c) => `${escapeIdent(c)} = EXCLUDED.${escapeIdent(c)}`)
        .join(", ");
      conflictClause =
        updates.length > 0
          ? ` ON CONFLICT (${escapeIdent(pk)}) DO UPDATE SET ${updates}`
          : ` ON CONFLICT (${escapeIdent(pk)}) DO NOTHING`;
    }

    for (let i = 0; i < rows.length; i += batchSize) {
      const batch = rows.slice(i, i + batchSize);

      // One "($1, $2, ...)" group per row, numbered consecutively.
      const valueClauses = batch
        .map((_row, idx) => {
          const offset = idx * cols.length;
          return `(${cols.map((_, j) => `$${offset + j + 1}`).join(", ")})`;
        })
        .join(", ");

      // Flatten values row by row in the same column order as above.
      const params: unknown[] = [];
      for (const row of batch) {
        for (const c of cols) {
          params.push((row as Record<string, unknown>)[c]);
        }
      }

      const sql =
        `INSERT INTO ${tableIdent} (${colNames}) VALUES ${valueClauses}` +
        conflictClause;

      await client.query(sql, params);
    }
  };

  return {
    async connect(): Promise<void> {
      await client.connect();
    },

    async query<T = Row>(
      sql: string,
      params: readonly unknown[] = [],
    ): Promise<T[]> {
      const result = await client.query(sql, params);
      return result.rows as T[];
    },

    async readTable(table: string): Promise<Row[]> {
      const result = await client.query(`SELECT * FROM ${escapeIdent(table)}`);
      return result.rows as Row[];
    },

    writeTable,

    async listTables(schema: string = "public"): Promise<string[]> {
      const result = await client.query(
        "SELECT table_name FROM information_schema.tables WHERE table_schema = $1",
        [schema],
      );
      return (result.rows as Array<Record<string, unknown>>).map((r) =>
        String(r["table_name"] ?? ""),
      );
    },

    async close(): Promise<void> {
      await client.end();
    },
  };
}
|
|
@@ -0,0 +1,103 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* sync.ts -- Postgres-backed dedupe sync + watch helpers.
|
|
3
|
+
*
|
|
4
|
+
* Mirrors `goldenmatch.db.sync` from Python: read source table, run
|
|
5
|
+
* dedupe, write golden + cluster tables back to Postgres.
|
|
6
|
+
*/
|
|
7
|
+
|
|
8
|
+
import { dedupe } from "../../core/api.js";
|
|
9
|
+
import type { DedupeResult, GoldenMatchConfig } from "../../core/types.js";
|
|
10
|
+
import {
|
|
11
|
+
createPostgresConnector,
|
|
12
|
+
type PostgresConfig,
|
|
13
|
+
} from "./postgres.js";
|
|
14
|
+
|
|
15
|
+
/** Inputs for one `syncDedupe` pass. */
export interface SyncOptions {
  /** Connection settings handed to `createPostgresConnector`. */
  readonly pg: PostgresConfig;
  /** Table whose rows are read and deduplicated. */
  readonly sourceTable: string;
  /** Table that receives the golden records. */
  readonly goldenTable: string;
  /** Optional table to write cluster summaries (cluster_id, members, size, ...). */
  readonly clustersTable?: string;
  /** Matching configuration passed to the core `dedupe` call. */
  readonly config: GoldenMatchConfig;
}
|
|
23
|
+
|
|
24
|
+
/**
 * Run a single dedupe pass against Postgres.
 *
 * 1. Read all rows from `sourceTable`.
 * 2. Dedupe via the core pipeline.
 * 3. Write golden records to `goldenTable`.
 * 4. Optionally write cluster summaries to `clustersTable`.
 *
 * Always closes the connection on the way out. If the pass itself failed,
 * that error is the one propagated; a secondary failure while closing is
 * only logged so it cannot hide the root cause.
 *
 * @param options Connection settings, table names and matching config.
 * @returns The result of the core `dedupe` call.
 * @throws Whatever the connector or `dedupe` throws; a close failure is
 *   thrown only when everything before it succeeded.
 */
export async function syncDedupe(options: SyncOptions): Promise<DedupeResult> {
  const conn = createPostgresConnector(options.pg);
  // Tracks whether the main body threw, so the cleanup below knows whether
  // an error is already in flight.
  let primaryFailed = false;
  try {
    await conn.connect();
    const rows = await conn.readTable(options.sourceTable);
    const result = dedupe(rows, { config: options.config });

    await conn.writeTable(options.goldenTable, result.goldenRecords);

    if (options.clustersTable !== undefined && result.clusters.size > 0) {
      // One summary row per cluster; `members` is stored as JSON text.
      const clusterRows = Array.from(result.clusters.entries()).map(
        ([id, c]) => ({
          cluster_id: id,
          members: JSON.stringify(c.members),
          size: c.size,
          confidence: c.confidence,
          quality: c.clusterQuality,
        }),
      );
      await conn.writeTable(options.clustersTable, clusterRows);
    }

    return result;
  } catch (err) {
    primaryFailed = true;
    throw err;
  } finally {
    try {
      await conn.close();
    } catch (closeErr) {
      // Throwing here would replace the in-flight error from the main body.
      if (!primaryFailed) throw closeErr;
      const msg =
        closeErr instanceof Error ? closeErr.message : String(closeErr);
      console.warn("syncDedupe: closing connection failed:", msg);
    }
  }
}
|
|
61
|
+
|
|
62
|
+
/** `SyncOptions` plus the polling cadence used by `watchSync`. */
export interface WatchSyncOptions extends SyncOptions {
  /** Polling interval in ms. Defaults to 60_000 (1 minute). */
  readonly intervalMs?: number;
}
|
|
66
|
+
|
|
67
|
+
/**
 * Run `syncDedupe` on a recurring interval.
 *
 * The first pass starts immediately; each later pass starts `intervalMs`
 * after the previous one finished. Returns a `stop` function. Errors in
 * any iteration are logged via `console.warn` so the loop keeps running;
 * callers should monitor `onResult` to confirm forward progress.
 *
 * Calling `stop` cancels a pending wait right away, so no timer is left
 * holding the process open. A pass that is already in flight is allowed to
 * finish (and still reports through `onResult`), after which the loop ends.
 *
 * @param options Sync options plus optional `intervalMs` (default 60_000).
 * @param onResult Optional callback invoked with each successful result.
 * @returns A promise resolving to the `stop` function.
 */
export async function watchSync(
  options: WatchSyncOptions,
  onResult?: (result: DedupeResult) => void,
): Promise<() => void> {
  const intervalMs = options.intervalMs ?? 60_000;
  let stopped = false;
  // Set while the loop is sleeping; calling it ends the sleep early.
  let cancelSleep: (() => void) | undefined;

  // Wait `intervalMs`, or until `cancelSleep` is invoked by `stop`.
  const sleep = (): Promise<void> =>
    new Promise<void>((resolve) => {
      const timer = setTimeout(() => {
        cancelSleep = undefined;
        resolve();
      }, intervalMs);
      cancelSleep = (): void => {
        clearTimeout(timer);
        cancelSleep = undefined;
        resolve();
      };
    });

  const loop = async (): Promise<void> => {
    while (!stopped) {
      try {
        const result = await syncDedupe(options);
        onResult?.(result);
      } catch (err) {
        const msg = err instanceof Error ? err.message : String(err);
        console.warn("syncDedupe failed:", msg);
      }
      // Skip the wait entirely if stop() arrived during the pass.
      if (stopped) break;
      await sleep();
    }
  };

  // Deliberately not awaited: the loop runs in the background.
  loop().catch((err: unknown) => {
    const msg = err instanceof Error ? err.message : String(err);
    console.warn("watchSync loop error:", msg);
  });

  return (): void => {
    stopped = true;
    cancelSleep?.();
  };
}
|
|
@@ -0,0 +1,156 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* dedupe-file.ts -- File-based convenience wrappers around dedupe() / match().
|
|
3
|
+
*
|
|
4
|
+
* Node-only: reads from disk, tags rows with __source__, delegates to the
|
|
5
|
+
* edge-safe core API.
|
|
6
|
+
*/
|
|
7
|
+
|
|
8
|
+
import { readFile, writeCsv, writeJson } from "./connectors/file.js";
|
|
9
|
+
import { dedupe, match } from "../core/api.js";
|
|
10
|
+
import type {
|
|
11
|
+
Row,
|
|
12
|
+
DedupeResult,
|
|
13
|
+
MatchResult,
|
|
14
|
+
GoldenMatchConfig,
|
|
15
|
+
} from "../core/types.js";
|
|
16
|
+
import { extname, basename } from "node:path";
|
|
17
|
+
|
|
18
|
+
// ---------------------------------------------------------------------------
|
|
19
|
+
// Options
|
|
20
|
+
// ---------------------------------------------------------------------------
|
|
21
|
+
|
|
22
|
+
/**
 * How an input file is named by callers:
 *
 * - a bare path string: the source name is derived from the file's
 *   basename without its extension;
 * - a `[path, sourceName]` tuple: the source name is given explicitly.
 */
export type FileSpec = string | readonly [string, string];
|
|
27
|
+
|
|
28
|
+
/** Options accepted by `dedupeFile` and `matchFiles`. */
export interface FileDedupeOptions {
  /** Input files. Required when calling `dedupeFile(opts)`. */
  readonly files?: readonly FileSpec[];
  /** Full config -- takes precedence over shorthand fields below. */
  readonly config?: GoldenMatchConfig;
  /** Shorthand forwarded unchanged to the core API as `exact`. */
  readonly exact?: readonly string[];
  /** Shorthand forwarded unchanged to the core API as `fuzzy`. */
  readonly fuzzy?: Readonly<Record<string, number>>;
  /** Shorthand forwarded unchanged to the core API as `blocking`. */
  readonly blocking?: readonly string[];
  /** Shorthand forwarded unchanged to the core API as `threshold`. */
  readonly threshold?: number;
  /** Enable LLM scorer for borderline pairs (not yet implemented in JS). */
  readonly llmScorer?: boolean;
  /** Write golden records to this path (.csv or .json). */
  readonly outputPath?: string;
}
|
|
42
|
+
|
|
43
|
+
// ---------------------------------------------------------------------------
|
|
44
|
+
// Helpers
|
|
45
|
+
// ---------------------------------------------------------------------------
|
|
46
|
+
|
|
47
|
+
/**
 * Derive a source label from a file path: the basename with its extension
 * removed, or `file_<index>` when that comes out empty.
 */
function defaultSourceName(path: string, index: number): string {
  const stem = basename(path, extname(path));
  if (stem.length === 0) {
    return `file_${index}`;
  }
  return stem;
}
|
|
51
|
+
|
|
52
|
+
/**
 * Normalise a `FileSpec` into an explicit `{ path, sourceName }` pair.
 * Tuples are used as given; bare paths get a derived source name.
 */
function resolveFileSpec(
  spec: FileSpec,
  index: number,
): { path: string; sourceName: string } {
  if (typeof spec !== "string") {
    return { path: spec[0], sourceName: spec[1] };
  }
  return { path: spec, sourceName: defaultSourceName(spec, index) };
}
|
|
62
|
+
|
|
63
|
+
/**
 * Read every input file in order and return all rows in one array, each
 * row copied with a `__source__` field holding its file's source name.
 */
function loadRowsWithSource(files: readonly FileSpec[]): Row[] {
  const tagged: Row[] = [];
  for (const [position, spec] of files.entries()) {
    const resolved = resolveFileSpec(spec, position);
    for (const record of readFile(resolved.path)) {
      tagged.push({ ...record, __source__: resolved.sourceName });
    }
  }
  return tagged;
}
|
|
74
|
+
|
|
75
|
+
/**
 * Pick the fields the core API understands out of the file-level options.
 * A field is copied only when it is not `undefined`, so the result never
 * carries explicit `undefined` values; `files` and `outputPath` are left out.
 */
function buildCoreOptions(opts: FileDedupeOptions) {
  // Copied in this order, which is also the key order of the result.
  const forwarded = [
    "config",
    "exact",
    "fuzzy",
    "blocking",
    "threshold",
    "llmScorer",
  ] as const;

  const core: {
    config?: GoldenMatchConfig;
    exact?: readonly string[];
    fuzzy?: Readonly<Record<string, number>>;
    blocking?: readonly string[];
    threshold?: number;
    llmScorer?: boolean;
  } = {};

  for (const key of forwarded) {
    const value = opts[key];
    if (value !== undefined) {
      (core as Record<string, unknown>)[key] = value;
    }
  }
  return core;
}
|
|
92
|
+
|
|
93
|
+
/**
 * Write `rows` to `outputPath`, choosing the writer from the (lower-cased)
 * file extension: `.json` / `.jsonl` / `.ndjson` go through `writeJson`,
 * everything else through `writeCsv` (tab-delimited for `.tsv`, comma
 * otherwise).
 */
function writeGoldenRecords(
  outputPath: string,
  rows: readonly Row[],
): void {
  const suffix = extname(outputPath).toLowerCase();
  const jsonSuffixes = [".json", ".jsonl", ".ndjson"];

  if (jsonSuffixes.includes(suffix)) {
    writeJson(outputPath, rows);
    return;
  }

  // Default to CSV (includes `.csv`, `.tsv`, or unknown extensions).
  writeCsv(outputPath, rows, { delimiter: suffix === ".tsv" ? "\t" : "," });
}
|
|
106
|
+
|
|
107
|
+
// ---------------------------------------------------------------------------
|
|
108
|
+
// Public API
|
|
109
|
+
// ---------------------------------------------------------------------------
|
|
110
|
+
|
|
111
|
+
/**
 * Deduplicate records across one or more files.
 *
 * Rows from every file are tagged with `__source__ = <sourceName>`,
 * concatenated, and handed to `dedupe()`. When `opts.outputPath` is set,
 * the golden records are also written to that path.
 *
 * @param opts Input files plus matching options.
 * @returns The result of the core `dedupe()` call.
 * @throws if no files are provided or any file cannot be read.
 */
export function dedupeFile(opts: FileDedupeOptions): DedupeResult {
  const { files, outputPath } = opts;
  if (!files || files.length === 0) {
    throw new Error("dedupeFile: at least one input file is required");
  }

  const result = dedupe(loadRowsWithSource(files), buildCoreOptions(opts));

  if (outputPath) {
    writeGoldenRecords(outputPath, result.goldenRecords);
  }
  return result;
}
|
|
130
|
+
|
|
131
|
+
/**
 * Match target records against a reference file.
 *
 * Both files are read, their rows tagged with `__source__` ("target" and
 * "reference" respectively), and the pair is delegated to `match()`.
 * `opts.files` (if provided) is ignored in favor of the explicit paths.
 * When `opts.outputPath` is set, `result.matched` is written to it.
 *
 * @param targetPath Path of the file holding the target records.
 * @param referencePath Path of the reference file.
 * @param opts Optional matching options and output path.
 * @returns The result of the core `match()` call.
 */
export function matchFiles(
  targetPath: string,
  referencePath: string,
  opts?: FileDedupeOptions,
): MatchResult {
  // Copy each record and stamp it with the given source label.
  const tagRows = <T extends object>(records: readonly T[], label: string) =>
    records.map((record) => ({ ...record, __source__: label }));

  const targetRows = tagRows(readFile(targetPath), "target");
  const referenceRows = tagRows(readFile(referencePath), "reference");

  const settings = opts ?? {};
  const result = match(targetRows, referenceRows, buildCoreOptions(settings));

  const destination = settings.outputPath;
  if (destination) {
    writeGoldenRecords(destination, result.matched);
  }
  return result;
}
|
|
@@ -0,0 +1,89 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* index.ts -- Node.js entry point for GoldenMatch.
|
|
3
|
+
*
|
|
4
|
+
* Re-exports the edge-safe core API plus Node-only helpers that use
|
|
5
|
+
* `node:fs`, `node:path`, and the optional `yaml` peer dependency.
|
|
6
|
+
*/
|
|
7
|
+
|
|
8
|
+
// Re-export everything from the edge-safe core
|
|
9
|
+
export * from "../core/index.js";
|
|
10
|
+
|
|
11
|
+
// Node-only file I/O
|
|
12
|
+
export {
|
|
13
|
+
readCsv,
|
|
14
|
+
readJson,
|
|
15
|
+
readFile,
|
|
16
|
+
writeCsv,
|
|
17
|
+
writeJson,
|
|
18
|
+
} from "./connectors/file.js";
|
|
19
|
+
export type {
|
|
20
|
+
ReadCsvOptions,
|
|
21
|
+
WriteCsvOptions,
|
|
22
|
+
} from "./connectors/file.js";
|
|
23
|
+
|
|
24
|
+
// File-based dedupe / match
|
|
25
|
+
export { dedupeFile, matchFiles } from "./dedupe-file.js";
|
|
26
|
+
export type { FileDedupeOptions, FileSpec } from "./dedupe-file.js";
|
|
27
|
+
|
|
28
|
+
// YAML config file I/O
|
|
29
|
+
export { loadConfigFile, writeConfigFile } from "./config-file.js";
|
|
30
|
+
|
|
31
|
+
// Cloud connectors (registers built-in connectors as a side-effect)
|
|
32
|
+
export {
|
|
33
|
+
createSnowflakeConnector,
|
|
34
|
+
createBigQueryConnector,
|
|
35
|
+
createDatabricksConnector,
|
|
36
|
+
createSalesforceConnector,
|
|
37
|
+
createHubSpotConnector,
|
|
38
|
+
registerConnector,
|
|
39
|
+
loadConnector,
|
|
40
|
+
listConnectors,
|
|
41
|
+
} from "./connectors/index.js";
|
|
42
|
+
export type {
|
|
43
|
+
BaseConnector,
|
|
44
|
+
ConnectorConfig,
|
|
45
|
+
ConnectorFactory,
|
|
46
|
+
ConnectorQuery,
|
|
47
|
+
SnowflakeConfig,
|
|
48
|
+
BigQueryConfig,
|
|
49
|
+
DatabricksConfig,
|
|
50
|
+
SalesforceConfig,
|
|
51
|
+
HubSpotConfig,
|
|
52
|
+
} from "./connectors/index.js";
|
|
53
|
+
|
|
54
|
+
// Servers (MCP / REST / A2A)
|
|
55
|
+
export { startMcpServer, handleTool, TOOLS } from "./mcp/server.js";
|
|
56
|
+
export { startApiServer, ReviewQueue } from "./api/server.js";
|
|
57
|
+
export type { StartApiOptions } from "./api/server.js";
|
|
58
|
+
export { startA2aServer, AGENT_CARD } from "./a2a/server.js";
|
|
59
|
+
export type { StartA2aOptions, AgentSkill } from "./a2a/server.js";
|
|
60
|
+
|
|
61
|
+
// Concurrent / parallel block scoring (Node-only; uses dynamic import of core scorer).
|
|
62
|
+
// `scoreBlocksParallel` uses piscina for true worker-thread parallelism when the
|
|
63
|
+
// optional `piscina` peer dep is installed; falls back to `scoreBlocksConcurrent`.
|
|
64
|
+
export {
|
|
65
|
+
scoreBlocksConcurrent,
|
|
66
|
+
scoreBlocksParallel,
|
|
67
|
+
} from "./backends/workers.js";
|
|
68
|
+
export type {
|
|
69
|
+
WorkerPoolOptions,
|
|
70
|
+
ParallelWorkerOptions,
|
|
71
|
+
} from "./backends/workers.js";
|
|
72
|
+
|
|
73
|
+
// Optional DuckDB connector (peer dep: @duckdb/node-api -- install on demand)
|
|
74
|
+
export { createDuckDBConnector } from "./backends/duckdb.js";
|
|
75
|
+
export type { DuckDBConfig, DuckDBConnector } from "./backends/duckdb.js";
|
|
76
|
+
|
|
77
|
+
// Optional Postgres connector + sync helpers (peer dep: pg -- install on demand)
|
|
78
|
+
export { createPostgresConnector } from "./db/postgres.js";
|
|
79
|
+
export type {
|
|
80
|
+
PostgresConfig,
|
|
81
|
+
PostgresConnector,
|
|
82
|
+
PostgresWriteOptions,
|
|
83
|
+
} from "./db/postgres.js";
|
|
84
|
+
export { syncDedupe, watchSync } from "./db/sync.js";
|
|
85
|
+
export type { SyncOptions, WatchSyncOptions } from "./db/sync.js";
|
|
86
|
+
|
|
87
|
+
// Interactive TUI (optional peer deps: ink + react)
|
|
88
|
+
export { startTui } from "./tui/app.js";
|
|
89
|
+
export type { TuiOptions } from "./tui/app.js";
|