goldenmatch 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +140 -0
- package/dist/cli.cjs +6079 -0
- package/dist/cli.cjs.map +1 -0
- package/dist/cli.d.cts +1 -0
- package/dist/cli.d.ts +1 -0
- package/dist/cli.js +6076 -0
- package/dist/cli.js.map +1 -0
- package/dist/core/index.cjs +8449 -0
- package/dist/core/index.cjs.map +1 -0
- package/dist/core/index.d.cts +1972 -0
- package/dist/core/index.d.ts +1972 -0
- package/dist/core/index.js +8318 -0
- package/dist/core/index.js.map +1 -0
- package/dist/index.cjs +8449 -0
- package/dist/index.cjs.map +1 -0
- package/dist/index.d.cts +2 -0
- package/dist/index.d.ts +2 -0
- package/dist/index.js +8318 -0
- package/dist/index.js.map +1 -0
- package/dist/node/backends/score-worker.cjs +934 -0
- package/dist/node/backends/score-worker.cjs.map +1 -0
- package/dist/node/backends/score-worker.d.cts +14 -0
- package/dist/node/backends/score-worker.d.ts +14 -0
- package/dist/node/backends/score-worker.js +932 -0
- package/dist/node/backends/score-worker.js.map +1 -0
- package/dist/node/index.cjs +11430 -0
- package/dist/node/index.cjs.map +1 -0
- package/dist/node/index.d.cts +554 -0
- package/dist/node/index.d.ts +554 -0
- package/dist/node/index.js +11277 -0
- package/dist/node/index.js.map +1 -0
- package/dist/types-DhUdX5Rc.d.cts +304 -0
- package/dist/types-DhUdX5Rc.d.ts +304 -0
- package/examples/01-basic-dedupe.ts +60 -0
- package/examples/02-match-two-datasets.ts +48 -0
- package/examples/03-csv-file-pipeline.ts +62 -0
- package/examples/04-string-scoring.ts +63 -0
- package/examples/05-custom-config.ts +94 -0
- package/examples/06-probabilistic-fs.ts +72 -0
- package/examples/07-pprl-privacy.ts +76 -0
- package/examples/08-streaming.ts +79 -0
- package/examples/09-llm-scorer.ts +79 -0
- package/examples/10-explain.ts +60 -0
- package/examples/11-evaluate.ts +61 -0
- package/examples/README.md +53 -0
- package/package.json +66 -0
- package/src/cli.ts +372 -0
- package/src/core/ann-blocker.ts +593 -0
- package/src/core/api.ts +220 -0
- package/src/core/autoconfig.ts +363 -0
- package/src/core/autofix.ts +102 -0
- package/src/core/blocker.ts +655 -0
- package/src/core/cluster.ts +699 -0
- package/src/core/compare-clusters.ts +176 -0
- package/src/core/config/loader.ts +869 -0
- package/src/core/cross-encoder.ts +614 -0
- package/src/core/data.ts +430 -0
- package/src/core/domain.ts +277 -0
- package/src/core/embedder.ts +562 -0
- package/src/core/evaluate.ts +156 -0
- package/src/core/explain.ts +352 -0
- package/src/core/golden.ts +524 -0
- package/src/core/graph-er.ts +371 -0
- package/src/core/index.ts +314 -0
- package/src/core/ingest.ts +112 -0
- package/src/core/learned-blocking.ts +305 -0
- package/src/core/lineage.ts +221 -0
- package/src/core/llm/budget.ts +258 -0
- package/src/core/llm/cluster.ts +542 -0
- package/src/core/llm/scorer.ts +396 -0
- package/src/core/match-one.ts +95 -0
- package/src/core/matchkey.ts +97 -0
- package/src/core/memory/corrections.ts +179 -0
- package/src/core/memory/learner.ts +218 -0
- package/src/core/memory/store.ts +114 -0
- package/src/core/pipeline.ts +366 -0
- package/src/core/pprl/protocol.ts +216 -0
- package/src/core/probabilistic.ts +511 -0
- package/src/core/profiler.ts +212 -0
- package/src/core/quality.ts +197 -0
- package/src/core/review-queue.ts +177 -0
- package/src/core/scorer.ts +855 -0
- package/src/core/sensitivity.ts +196 -0
- package/src/core/standardize.ts +279 -0
- package/src/core/streaming.ts +128 -0
- package/src/core/transforms.ts +599 -0
- package/src/core/types.ts +570 -0
- package/src/core/validate.ts +243 -0
- package/src/index.ts +8 -0
- package/src/node/a2a/server.ts +470 -0
- package/src/node/api/server.ts +412 -0
- package/src/node/backends/duckdb.ts +130 -0
- package/src/node/backends/score-worker.ts +41 -0
- package/src/node/backends/workers.ts +212 -0
- package/src/node/config-file.ts +66 -0
- package/src/node/connectors/base.ts +57 -0
- package/src/node/connectors/bigquery.ts +61 -0
- package/src/node/connectors/databricks.ts +69 -0
- package/src/node/connectors/file.ts +350 -0
- package/src/node/connectors/hubspot.ts +62 -0
- package/src/node/connectors/index.ts +43 -0
- package/src/node/connectors/salesforce.ts +93 -0
- package/src/node/connectors/snowflake.ts +73 -0
- package/src/node/db/postgres.ts +173 -0
- package/src/node/db/sync.ts +103 -0
- package/src/node/dedupe-file.ts +156 -0
- package/src/node/index.ts +89 -0
- package/src/node/mcp/server.ts +940 -0
- package/src/node/tui/app.ts +756 -0
- package/src/node/tui/index.ts +6 -0
- package/src/node/tui/widgets.ts +128 -0
- package/tests/parity/scorer-ground-truth.test.ts +118 -0
- package/tests/smoke.test.ts +46 -0
- package/tests/unit/a2a-server.test.ts +175 -0
- package/tests/unit/ann-blocker.test.ts +117 -0
- package/tests/unit/api-server.test.ts +239 -0
- package/tests/unit/api.test.ts +77 -0
- package/tests/unit/autoconfig.test.ts +103 -0
- package/tests/unit/autofix.test.ts +71 -0
- package/tests/unit/blocker.test.ts +164 -0
- package/tests/unit/buildBlocksAsync.test.ts +63 -0
- package/tests/unit/cluster.test.ts +213 -0
- package/tests/unit/compare-clusters.test.ts +42 -0
- package/tests/unit/config-loader.test.ts +301 -0
- package/tests/unit/connectors-base.test.ts +48 -0
- package/tests/unit/cross-encoder-model.test.ts +198 -0
- package/tests/unit/cross-encoder.test.ts +173 -0
- package/tests/unit/db-connectors.test.ts +37 -0
- package/tests/unit/domain.test.ts +80 -0
- package/tests/unit/embedder.test.ts +151 -0
- package/tests/unit/evaluate.test.ts +85 -0
- package/tests/unit/explain.test.ts +73 -0
- package/tests/unit/golden.test.ts +97 -0
- package/tests/unit/graph-er.test.ts +173 -0
- package/tests/unit/hnsw-ann.test.ts +283 -0
- package/tests/unit/hubspot-connector.test.ts +118 -0
- package/tests/unit/ingest.test.ts +97 -0
- package/tests/unit/learned-blocking.test.ts +134 -0
- package/tests/unit/lineage.test.ts +135 -0
- package/tests/unit/match-one.test.ts +129 -0
- package/tests/unit/matchkey.test.ts +97 -0
- package/tests/unit/mcp-server.test.ts +183 -0
- package/tests/unit/memory.test.ts +119 -0
- package/tests/unit/pipeline.test.ts +118 -0
- package/tests/unit/pprl-protocol.test.ts +381 -0
- package/tests/unit/probabilistic.test.ts +494 -0
- package/tests/unit/profiler.test.ts +68 -0
- package/tests/unit/review-queue.test.ts +68 -0
- package/tests/unit/salesforce-connector.test.ts +148 -0
- package/tests/unit/scorer.test.ts +301 -0
- package/tests/unit/sensitivity.test.ts +154 -0
- package/tests/unit/standardize.test.ts +84 -0
- package/tests/unit/streaming.test.ts +82 -0
- package/tests/unit/transforms.test.ts +208 -0
- package/tests/unit/tui-widgets.test.ts +42 -0
- package/tests/unit/tui.test.ts +24 -0
- package/tests/unit/validate.test.ts +145 -0
- package/tests/unit/workers-parallel.test.ts +99 -0
- package/tests/unit/workers.test.ts +74 -0
- package/tsconfig.json +25 -0
- package/tsup.config.ts +37 -0
- package/vitest.config.ts +11 -0
|
@@ -0,0 +1,396 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* scorer.ts — LLM scorer for borderline record pairs.
|
|
3
|
+
* Ports `goldenmatch/core/llm_scorer.py`.
|
|
4
|
+
*
|
|
5
|
+
* Three-tier decision:
|
|
6
|
+
* score >= autoThreshold -> auto-accept (promote to 1.0)
|
|
7
|
+
* candidateLo <= score < autoThreshold -> send to LLM
|
|
8
|
+
* score < candidateLo -> keep original score (never demoted)
|
|
9
|
+
*
|
|
10
|
+
* Edge-safe: uses `fetch()` (global on Node 20+/edge runtimes).
|
|
11
|
+
* No `node:` imports.
|
|
12
|
+
*/
|
|
13
|
+
|
|
14
|
+
import type { Row, ScoredPair, LLMScorerConfig } from "../types.js";
|
|
15
|
+
import { makeScoredPair } from "../types.js";
|
|
16
|
+
import { BudgetTracker, countTokensApprox } from "./budget.js";
|
|
17
|
+
import type { BudgetSnapshot } from "./budget.js";
|
|
18
|
+
|
|
19
|
+
// ---------------------------------------------------------------------------
|
|
20
|
+
// Public result types
|
|
21
|
+
// ---------------------------------------------------------------------------
|
|
22
|
+
|
|
23
|
+
/** Outcome of an `llmScorePairs` run. */
export interface LLMScoreResult {
  /** Scored pairs after the LLM pass. */
  readonly pairs: ReadonlyArray<ScoredPair>;
  /** Budget usage snapshot, or `null` when none is reported. */
  readonly budget: null | BudgetSnapshot;
}
|
|
27
|
+
|
|
28
|
+
/** Result of scoring one batch of candidate pairs. */
export interface LLMCallResult {
  /** YES/NO verdict per pair, keyed by the pair's numeric index. */
  readonly decisions: ReadonlyMap<number, boolean>;
  /** Input token count reported for the call. */
  readonly inputTokens: number;
  /** Output token count reported for the call. */
  readonly outputTokens: number;
}
|
|
33
|
+
|
|
34
|
+
/** LLM backends this module knows how to call. */
type Provider = "openai" | "anthropic";
|
|
35
|
+
|
|
36
|
+
// ---------------------------------------------------------------------------
|
|
37
|
+
// Provider detection
|
|
38
|
+
// ---------------------------------------------------------------------------
|
|
39
|
+
|
|
40
|
+
/**
 * Resolve which LLM provider to call.
 *
 * An explicit `configProvider` of "openai" or "anthropic" always wins.
 * Otherwise the key prefix decides: `sk-ant-` selects Anthropic, and
 * anything else (including a missing key) falls back to OpenAI.
 */
function detectProvider(apiKey?: string, configProvider?: string): Provider {
  switch (configProvider) {
    case "openai":
    case "anthropic":
      return configProvider;
    default:
      break;
  }
  const hasAnthropicPrefix = apiKey?.startsWith("sk-ant-") === true;
  return hasAnthropicPrefix ? "anthropic" : "openai";
}
|
|
51
|
+
|
|
52
|
+
/** Model identifier used when the config does not name one. */
function defaultModel(provider: Provider): string {
  if (provider === "openai") {
    return "gpt-4o-mini";
  }
  return "claude-haiku-4-5-20251001";
}
|
|
55
|
+
|
|
56
|
+
// ---------------------------------------------------------------------------
|
|
57
|
+
// Prompt construction
|
|
58
|
+
// ---------------------------------------------------------------------------
|
|
59
|
+
|
|
60
|
+
/**
 * Render the listed columns of a row as a compact `name: value` string.
 *
 * Columns whose value is null, undefined or the empty string are skipped.
 * Fragments are joined with " | " and the result is cut to 200 characters.
 */
function summariseRow(row: Row, cols: readonly string[]): string {
  const fragments = cols.reduce<string[]>((acc, name) => {
    const value = row[name];
    const isBlank = value === null || value === undefined || value === "";
    if (!isBlank) {
      acc.push(`${name}: ${String(value)}`);
    }
    return acc;
  }, []);
  const joined = fragments.join(" | ");
  return joined.slice(0, 200);
}
|
|
70
|
+
|
|
71
|
+
/**
 * Build the prompt for one batch of candidate pairs.
 *
 * The prompt starts with the instruction line and a blank line, followed by
 * two lines per pair: a 1-based number with record A, then record B. Ids
 * missing from `rowById` are rendered as empty records.
 */
function buildBatchPrompt(
  batch: readonly ScoredPair[],
  rowById: ReadonlyMap<number, Row>,
  cols: readonly string[],
): string {
  const instruction =
    "For each numbered pair, answer YES if they are the same entity/product, " +
    "NO if they are different. Respond with just the number and YES/NO, one per line.";
  const out: string[] = [instruction, ""];

  let ordinal = 0;
  for (const { idA, idB } of batch) {
    ordinal += 1;
    const left = summariseRow(rowById.get(idA) ?? {}, cols);
    const right = summariseRow(rowById.get(idB) ?? {}, cols);
    out.push(`${ordinal}. A: ${left}`);
    out.push(` B: ${right}`);
  }

  return out.join("\n");
}
|
|
92
|
+
|
|
93
|
+
/**
 * Parse a batch YES/NO response into a decision list aligned to the batch.
 *
 * The prompt asks for "<number> YES|NO" on each line. A line that starts
 * with a pair number is stored in that pair's slot, so a skipped, repeated
 * or reordered answer cannot shift every later verdict. Lines without a
 * leading number fall back to positional order. YES/NO are matched as whole
 * words, so text such as "NOTE" or "UNKNOWN" is not read as "NO". When a
 * line contains both words, YES wins.
 *
 * @param answer    Raw model output.
 * @param batchSize Number of pairs that were sent.
 * @returns Exactly `batchSize` booleans; pairs without an answer are `false`.
 */
function parseBatchResponse(answer: string, batchSize: number): boolean[] {
  const size = Number.isFinite(batchSize) ? Math.max(0, Math.floor(batchSize)) : 0;
  const decisions: boolean[] = new Array<boolean>(size).fill(false);
  if (size === 0) return decisions;

  const yesWord = /\bYES\b/;
  const noWord = /\bNO\b/;
  // Leading pair number followed by whitespace or a list delimiter.
  const leadingNumber = /^(\d+)(?=[\s.):\-])/;

  const answered = new Set<number>();
  let cursor = 0; // next slot for lines that carry no pair number

  for (const raw of answer.split(/\r?\n/)) {
    const line = raw.trim().toUpperCase();
    if (!line) continue;

    let verdict: boolean;
    if (yesWord.test(line)) verdict = true;
    else if (noWord.test(line)) verdict = false;
    else continue; // preamble or commentary: not an answer line

    const numbered = leadingNumber.exec(line);
    const slot = numbered ? Number(numbered[1]) - 1 : cursor;

    // Ignore out-of-range numbers and repeated answers (first answer wins).
    if (slot < 0 || slot >= size || answered.has(slot)) continue;

    decisions[slot] = verdict;
    answered.add(slot);
    cursor = slot + 1;
    if (answered.size >= size) break;
  }

  return decisions;
}
|
|
108
|
+
|
|
109
|
+
// ---------------------------------------------------------------------------
|
|
110
|
+
// Provider calls (fetch-based, edge-safe)
|
|
111
|
+
// ---------------------------------------------------------------------------
|
|
112
|
+
|
|
113
|
+
/**
 * Send a single-message chat completion request to OpenAI.
 *
 * @param prompt    User message content.
 * @param apiKey    Bearer token for the Authorization header.
 * @param model     Model identifier.
 * @param maxTokens Value sent as `max_tokens`.
 * @returns The trimmed content of the first choice (empty string when
 *   absent) and the reported token usage (0 when absent).
 * @throws LLMHttpError when the response status is not OK.
 */
async function callOpenAI(
  prompt: string,
  apiKey: string,
  model: string,
  maxTokens: number,
): Promise<{ text: string; inputTokens: number; outputTokens: number }> {
  const payload = {
    model,
    messages: [{ role: "user", content: prompt }],
    temperature: 0,
    max_tokens: maxTokens,
  };

  const response = await fetch("https://api.openai.com/v1/chat/completions", {
    method: "POST",
    headers: {
      Authorization: `Bearer ${apiKey}`,
      "Content-Type": "application/json",
    },
    body: JSON.stringify(payload),
  });

  if (!response.ok) {
    // Reading the error body is best effort; fall back to an empty detail.
    const detail = await response.text().catch(() => "");
    throw new LLMHttpError(
      response.status,
      `OpenAI ${response.status}: ${detail.slice(0, 200)}`,
    );
  }

  const parsed = (await response.json()) as {
    choices?: Array<{ message?: { content?: string } }>;
    usage?: { prompt_tokens?: number; completion_tokens?: number };
  };
  const usage = parsed.usage;

  return {
    text: parsed.choices?.[0]?.message?.content?.trim() ?? "",
    inputTokens: usage?.prompt_tokens ?? 0,
    outputTokens: usage?.completion_tokens ?? 0,
  };
}
|
|
147
|
+
|
|
148
|
+
/**
 * Send a single-message request to the Anthropic Messages API.
 *
 * The response `content` is a list of blocks. The text of every block that
 * carries a string `text` field is concatenated, so the answer is not lost
 * when the first block is not a text block or the reply is split across
 * several text blocks.
 *
 * @param prompt    User message content.
 * @param apiKey    Value sent in the `x-api-key` header.
 * @param model     Model identifier.
 * @param maxTokens Value sent as `max_tokens`.
 * @returns The trimmed response text (empty string when absent) and the
 *   reported token usage (0 when absent).
 * @throws LLMHttpError when the response status is not OK.
 */
async function callAnthropic(
  prompt: string,
  apiKey: string,
  model: string,
  maxTokens: number,
): Promise<{ text: string; inputTokens: number; outputTokens: number }> {
  const resp = await fetch("https://api.anthropic.com/v1/messages", {
    method: "POST",
    headers: {
      "x-api-key": apiKey,
      "content-type": "application/json",
      "anthropic-version": "2023-06-01",
    },
    body: JSON.stringify({
      model,
      max_tokens: maxTokens,
      messages: [{ role: "user", content: prompt }],
    }),
  });

  if (!resp.ok) {
    // Reading the error body is best effort; fall back to an empty detail.
    const body = await resp.text().catch(() => "");
    throw new LLMHttpError(resp.status, `Anthropic ${resp.status}: ${body.slice(0, 200)}`);
  }

  const data = (await resp.json()) as {
    content?: Array<{ type?: string; text?: string }>;
    usage?: { input_tokens?: number; output_tokens?: number };
  };

  const text = (data.content ?? [])
    .map((block) => block?.text)
    .filter((piece): piece is string => typeof piece === "string")
    .join("")
    .trim();

  return {
    text,
    inputTokens: data.usage?.input_tokens ?? 0,
    outputTokens: data.usage?.output_tokens ?? 0,
  };
}
|
|
182
|
+
|
|
183
|
+
/**
 * Raised by the provider helpers when an HTTP call returns a non-OK status.
 * `status` holds the HTTP status code of the failed response.
 */
export class LLMHttpError extends Error {
  public readonly status: number;

  constructor(status: number, message: string) {
    super(message);
    this.status = status;
    this.name = "LLMHttpError";
  }
}
|
|
190
|
+
|
|
191
|
+
// ---------------------------------------------------------------------------
|
|
192
|
+
// Batch orchestration
|
|
193
|
+
// ---------------------------------------------------------------------------
|
|
194
|
+
|
|
195
|
+
/**
 * Lazily yield consecutive slices of `items`, each at most `size` long.
 * A `size` below 1 is treated as 1.
 */
function* batchify<T>(items: readonly T[], size: number): Generator<T[]> {
  const stride = Math.max(1, size);
  let start = 0;
  while (start < items.length) {
    const end = start + stride;
    yield items.slice(start, end);
    start = end;
  }
}
|
|
201
|
+
|
|
202
|
+
/**
 * Score one batch of candidate pairs with the selected provider.
 *
 * Returns an empty decision map, leaving the caller to keep the original
 * fuzzy scores, when the budget refuses the estimated input size or when
 * the provider call fails. Failures are logged with `console.warn` so they
 * are not silently indistinguishable from "nothing to score".
 *
 * @param batch    Pairs to score in a single prompt.
 * @param rowById  Lookup from row id to row, used to render the prompt.
 * @param cols     Columns to include in the prompt.
 * @param provider Provider to call.
 * @param model    Model identifier passed to the provider and the budget.
 * @param apiKey   Provider API key.
 * @param budget   Tracker consulted before the call and updated after it.
 * @returns Decisions keyed by `pairIndex(pair)` plus reported token counts.
 */
async function scoreBatch(
  batch: readonly ScoredPair[],
  rowById: ReadonlyMap<number, Row>,
  cols: readonly string[],
  provider: Provider,
  model: string,
  apiKey: string,
  budget: BudgetTracker,
): Promise<LLMCallResult> {
  const noDecisions = (): LLMCallResult => ({
    decisions: new Map<number, boolean>(),
    inputTokens: 0,
    outputTokens: 0,
  });

  const prompt = buildBatchPrompt(batch, rowById, cols);
  const estIn = countTokensApprox(prompt);
  // Output allowance per pair; also used as the request's max token limit.
  const estOut = batch.length * 10;

  if (!budget.canSend(estIn)) {
    return noDecisions();
  }

  try {
    const { text, inputTokens, outputTokens } =
      provider === "openai"
        ? await callOpenAI(prompt, apiKey, model, estOut)
        : await callAnthropic(prompt, apiKey, model, estOut);

    // Fall back to the estimates when the provider reports no usage.
    budget.record(inputTokens || estIn, outputTokens || estOut, model);

    const verdicts = parseBatchResponse(text, batch.length);
    const decisions = new Map<number, boolean>();
    batch.forEach((pair, k) => {
      decisions.set(pairIndex(pair), verdicts[k] ?? false);
    });
    return { decisions, inputTokens, outputTokens };
  } catch (err) {
    // Graceful degradation for HTTP and unexpected errors alike: the caller
    // keeps the original fuzzy scores for this batch.
    const message = err instanceof Error ? err.message : String(err);
    // eslint-disable-next-line no-console
    console.warn("llmScorePairs batch failed:", message);
    return noDecisions();
  }
}
|
|
242
|
+
|
|
243
|
+
/**
 * Numeric Map key for a pair, independent of the order of its two ids.
 *
 * Applies the Cantor pairing function to the ids sorted as (smaller, larger).
 */
function pairIndex(pair: ScoredPair): number {
  const { idA, idB } = pair;
  const lo = Math.min(idA, idB);
  const hi = Math.max(idA, idB);
  const diagonal = lo + hi;
  return (diagonal * (diagonal + 1)) / 2 + hi;
}
|
|
250
|
+
|
|
251
|
+
// ---------------------------------------------------------------------------
|
|
252
|
+
// Public: llmScorePairs
|
|
253
|
+
// ---------------------------------------------------------------------------
|
|
254
|
+
|
|
255
|
+
/**
 * Score borderline pairs with an LLM.
 *
 * Pairs are split into three tiers by their fuzzy score:
 *   - `score >= config.autoThreshold`: promoted to 1.0 without an LLM call;
 *   - `config.candidateLo <= score < config.autoThreshold`: sent to the LLM;
 *   - `score < config.candidateLo`: returned unchanged.
 *
 * The LLM never demotes. A candidate answered YES is promoted to 1.0; a
 * candidate answered NO, or never scored because the budget ran out or the
 * call failed, keeps its original score. Without an `apiKey` every candidate
 * passes through unchanged.
 *
 * The returned list is grouped as auto-accepted, then below-threshold, then
 * candidates; it does not preserve the input order.
 *
 * @param pairs  Fuzzy-scored pairs to refine.
 * @param rows   Source rows; those with a numeric `__row_id__` are used to
 *   render prompts, showing every column not prefixed with `__`.
 * @param config Thresholds, batch size, budget and optional provider/model.
 * @param apiKey Provider API key; omit to skip the LLM entirely.
 * @returns The refined pairs and a budget snapshot.
 */
export async function llmScorePairs(
  pairs: readonly ScoredPair[],
  rows: readonly Row[],
  config: LLMScorerConfig,
  apiKey?: string,
): Promise<LLMScoreResult> {
  // Resolve provider and model first so the budget tracker is created for
  // the model that is actually called, not a hard-coded OpenAI default.
  const provider = detectProvider(apiKey, config.provider);
  const model = config.model ?? defaultModel(provider);
  const budget = new BudgetTracker(config.budget ?? {}, model);

  if (pairs.length === 0) {
    return { pairs: [], budget: budget.snapshot() };
  }

  // Three-tier partition.
  const autoAccept: ScoredPair[] = [];
  const candidates: ScoredPair[] = [];
  const below: ScoredPair[] = [];
  for (const p of pairs) {
    if (p.score >= config.autoThreshold) autoAccept.push(p);
    else if (p.score >= config.candidateLo) candidates.push(p);
    else below.push(p);
  }

  // Result scaffold: auto-accept promoted to 1.0, below untouched.
  const resultPairs: ScoredPair[] = [];
  for (const p of autoAccept) {
    resultPairs.push(makeScoredPair(p.idA, p.idB, 1.0));
  }
  for (const p of below) {
    resultPairs.push(p);
  }

  // Nothing to ask the LLM: pass candidates through unchanged. A loop is used
  // instead of push(...candidates), which can overflow the argument limit
  // for very large candidate lists.
  if (!apiKey || candidates.length === 0) {
    for (const p of candidates) {
      resultPairs.push(p);
    }
    return { pairs: resultPairs, budget: budget.snapshot() };
  }

  // Single pass over rows: collect display columns (everything not prefixed
  // with `__`) and index rows by their numeric `__row_id__`.
  const cols = new Set<string>();
  const rowById = new Map<number, Row>();
  for (const r of rows) {
    for (const k of Object.keys(r)) {
      if (!k.startsWith("__")) cols.add(k);
    }
    const id = r["__row_id__"];
    if (typeof id === "number") rowById.set(id, r);
  }
  const displayCols = [...cols];

  // Batch LLM scoring for candidates; stop early once the budget is spent.
  const batchSize = Math.max(1, config.batchSize || 20);
  const llmDecisions = new Map<number, boolean>();
  for (const batch of batchify(candidates, batchSize)) {
    if (!budget.canProceed()) break;
    const res = await scoreBatch(
      batch,
      rowById,
      displayCols,
      provider,
      model,
      apiKey,
      budget,
    );
    res.decisions.forEach((v, k) => llmDecisions.set(k, v));
  }

  // Merge candidates: promote YES to 1.0, keep NO/unscored at original score.
  for (const p of candidates) {
    if (llmDecisions.get(pairIndex(p)) === true) {
      resultPairs.push(makeScoredPair(p.idA, p.idB, 1.0));
    } else {
      resultPairs.push(p);
    }
  }

  return { pairs: resultPairs, budget: budget.snapshot() };
}
|
|
348
|
+
|
|
349
|
+
// ---------------------------------------------------------------------------
|
|
350
|
+
// Public: scoreStringsWithLlm (single-pair helper)
|
|
351
|
+
// ---------------------------------------------------------------------------
|
|
352
|
+
|
|
353
|
+
/**
 * Ask the LLM a single yes/no question about two strings.
 *
 * @param a      First value.
 * @param b      Second value.
 * @param config Optional provider, model and budget settings.
 * @param apiKey Provider API key; without one no call is made.
 * @returns `score` is 1.0 when the answer contains YES and 0.0 otherwise,
 *   including when no key is given or the call fails. On failure `error`
 *   carries the message so callers can tell a failed call from a real "no".
 */
export async function scoreStringsWithLlm(
  a: string,
  b: string,
  config: LLMScorerConfig,
  apiKey?: string,
): Promise<{ score: number; budget: BudgetSnapshot; error?: string }> {
  // Resolve provider and model first so the budget tracker is created for
  // the model that is actually called, not a hard-coded OpenAI default.
  const provider = detectProvider(apiKey, config.provider);
  const model = config.model ?? defaultModel(provider);
  const budget = new BudgetTracker(config.budget ?? {}, model);

  if (!apiKey) return { score: 0, budget: budget.snapshot() };

  const prompt =
    "Are these two values referring to the same entity? Answer YES or NO.\n" +
    `A: ${a}\nB: ${b}`;

  try {
    const { text, inputTokens, outputTokens } =
      provider === "openai"
        ? await callOpenAI(prompt, apiKey, model, 10)
        : await callAnthropic(prompt, apiKey, model, 10);
    budget.record(inputTokens, outputTokens, model);
    const upper = text.trim().toUpperCase();
    const score = upper.includes("YES") ? 1.0 : 0.0;
    return { score, budget: budget.snapshot() };
  } catch (err) {
    const message = err instanceof Error ? err.message : String(err);
    // eslint-disable-next-line no-console
    console.warn("scoreStringsWithLlm failed:", message);
    // Return score=0 (treated as "not matched") but surface the error so
    // operators can distinguish HTTP failures from genuine LLM "no" answers.
    return { score: 0, budget: budget.snapshot(), error: message };
  }
}
|
|
394
|
+
|
|
395
|
+
// Re-export budget types for convenience.
|
|
396
|
+
export type { BudgetSnapshot } from "./budget.js";
|
|
@@ -0,0 +1,95 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* match-one.ts — Single-record matching primitive.
|
|
3
|
+
* Edge-safe: no Node.js imports, pure TypeScript only.
|
|
4
|
+
*
|
|
5
|
+
* Ports goldenmatch/core/match_one.py.
|
|
6
|
+
*/
|
|
7
|
+
|
|
8
|
+
import type { Row, MatchkeyConfig } from "./types.js";
|
|
9
|
+
import { scorePair, asString } from "./scorer.js";
|
|
10
|
+
import { applyTransforms } from "./transforms.js";
|
|
11
|
+
|
|
12
|
+
// ---------------------------------------------------------------------------
|
|
13
|
+
// Types
|
|
14
|
+
// ---------------------------------------------------------------------------
|
|
15
|
+
|
|
16
|
+
/** One dataset row matched against the probe record. */
export interface MatchOneHit {
  /** `__row_id__` of the matched dataset row. */
  readonly rowId: number;
  /** Score the row received against the probe record. */
  readonly score: number;
}
|
|
20
|
+
|
|
21
|
+
// ---------------------------------------------------------------------------
|
|
22
|
+
// matchOne
|
|
23
|
+
// ---------------------------------------------------------------------------
|
|
24
|
+
|
|
25
|
+
/**
 * Score a single probe record against every row of a dataset.
 *
 * For an "exact" matchkey only rows scoring 1.0 are kept; otherwise rows
 * scoring at least `mk.threshold` (0 when unset, i.e. everything) are kept.
 * For exact matchkeys see also {@link findExactMatchesOne}.
 *
 * Rows are expected to carry `__row_id__`.
 *
 * @returns Hits ordered from highest to lowest score.
 */
export function matchOne(
  record: Row,
  rows: readonly Row[],
  mk: MatchkeyConfig,
): readonly MatchOneHit[] {
  // Exact matchkeys require a perfect score.
  const cutoff = mk.type === "exact" ? 1.0 : (mk.threshold ?? 0);

  return rows
    .map((candidate): MatchOneHit => ({
      rowId: candidate["__row_id__"] as number,
      score: scorePair(record, candidate, mk.fields),
    }))
    .filter((hit) => hit.score >= cutoff)
    .sort((left, right) => right.score - left.score);
}
|
|
51
|
+
|
|
52
|
+
// ---------------------------------------------------------------------------
|
|
53
|
+
// findExactMatchesOne
|
|
54
|
+
// ---------------------------------------------------------------------------
|
|
55
|
+
|
|
56
|
+
/**
 * Find the dataset rows whose composite matchkey equals the probe record's.
 *
 * The composite key is every matchkey field, transformed, joined with a NUL
 * separator. A field that transforms to null disqualifies that record: a
 * null in the probe yields no hits, a null in a row skips that row.
 *
 * @returns One hit with score 1.0 per matching row, in dataset order.
 */
export function findExactMatchesOne(
  record: Row,
  rows: readonly Row[],
  mk: MatchkeyConfig,
): readonly MatchOneHit[] {
  // Composite key for one record, or null when any field transforms to null.
  const compositeKey = (source: Row): string | null => {
    const pieces: string[] = [];
    for (const spec of mk.fields) {
      const value = applyTransforms(asString(source[spec.field]), spec.transforms);
      if (value === null) return null;
      pieces.push(value);
    }
    return pieces.join("\x00");
  };

  const probeKey = compositeKey(record);
  if (probeKey === null) return [];

  return rows
    .filter((candidate) => compositeKey(candidate) === probeKey)
    .map((candidate) => ({ rowId: candidate["__row_id__"] as number, score: 1.0 }));
}
|
|
@@ -0,0 +1,97 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* matchkey.ts — Matchkey builder for GoldenMatch-JS.
|
|
3
|
+
* Edge-safe: no `node:` imports, pure TypeScript only.
|
|
4
|
+
*
|
|
5
|
+
* Ports matchkey building from goldenmatch/core/matchkey.py.
|
|
6
|
+
* In Python this uses Polars expressions; here we work with Row arrays.
|
|
7
|
+
*/
|
|
8
|
+
|
|
9
|
+
import type { Row, MatchkeyConfig } from "./types.js";
|
|
10
|
+
import { applyTransforms } from "./transforms.js";
|
|
11
|
+
|
|
12
|
+
// ---------------------------------------------------------------------------
|
|
13
|
+
// computeMatchkeyValue — build a matchkey value for a single row
|
|
14
|
+
// ---------------------------------------------------------------------------
|
|
15
|
+
|
|
16
|
+
/**
 * Compute the composite matchkey string for one row.
 *
 * Each configured field is read from the row, stringified, passed through
 * that field's transform chain, and the results are joined with "||".
 *
 * @returns The joined key, or `null` as soon as a field is null/undefined
 *   in the row or its transforms produce null.
 */
export function computeMatchkeyValue(
  row: Row,
  mk: MatchkeyConfig,
): string | null {
  const segments: string[] = [];
  for (const spec of mk.fields) {
    const cell = row[spec.field];
    if (cell === undefined || cell === null) {
      return null;
    }
    const transformed = applyTransforms(String(cell), spec.transforms);
    if (transformed === null) {
      return null;
    }
    segments.push(transformed);
  }
  return segments.join("||");
}
|
|
40
|
+
|
|
41
|
+
// ---------------------------------------------------------------------------
|
|
42
|
+
// computeMatchkeys — add matchkey columns to all rows
|
|
43
|
+
// ---------------------------------------------------------------------------
|
|
44
|
+
|
|
45
|
+
/**
 * Attach one `__mk_{mk.name}__` column per matchkey to every row, holding
 * the value computed by `computeMatchkeyValue` for the original row.
 *
 * Input rows are left untouched; new row objects are returned.
 */
export function computeMatchkeys(
  rows: readonly Row[],
  matchkeys: readonly MatchkeyConfig[],
): Row[] {
  return rows.map((row) => {
    const enriched: Record<string, unknown> = { ...row };
    for (const mk of matchkeys) {
      // Always computed from the original row, not the partially enriched copy.
      enriched[`__mk_${mk.name}__`] = computeMatchkeyValue(row, mk);
    }
    return enriched;
  });
}
|
|
63
|
+
|
|
64
|
+
// ---------------------------------------------------------------------------
|
|
65
|
+
// addRowIds — add sequential __row_id__ column
|
|
66
|
+
// ---------------------------------------------------------------------------
|
|
67
|
+
|
|
68
|
+
/**
 * Number the rows: each returned row carries `__row_id__` equal to `offset`
 * plus its position in the input.
 *
 * Input rows are left untouched; new row objects are returned.
 */
export function addRowIds(rows: readonly Row[], offset: number = 0): Row[] {
  return Array.from(rows, (row, position) => {
    const rowId = offset + position;
    return { ...row, __row_id__: rowId };
  });
}
|
|
79
|
+
|
|
80
|
+
// ---------------------------------------------------------------------------
|
|
81
|
+
// addSourceColumn — add __source__ column
|
|
82
|
+
// ---------------------------------------------------------------------------
|
|
83
|
+
|
|
84
|
+
/**
 * Tag every row with a `__source__` column set to `sourceName`.
 *
 * Input rows are left untouched; new row objects are returned.
 */
export function addSourceColumn(
  rows: readonly Row[],
  sourceName: string,
): Row[] {
  const tagged: Row[] = [];
  for (const row of rows) {
    tagged.push({ ...row, __source__: sourceName });
  }
  return tagged;
}
|