goldenmatch 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (162) hide show
  1. package/README.md +140 -0
  2. package/dist/cli.cjs +6079 -0
  3. package/dist/cli.cjs.map +1 -0
  4. package/dist/cli.d.cts +1 -0
  5. package/dist/cli.d.ts +1 -0
  6. package/dist/cli.js +6076 -0
  7. package/dist/cli.js.map +1 -0
  8. package/dist/core/index.cjs +8449 -0
  9. package/dist/core/index.cjs.map +1 -0
  10. package/dist/core/index.d.cts +1972 -0
  11. package/dist/core/index.d.ts +1972 -0
  12. package/dist/core/index.js +8318 -0
  13. package/dist/core/index.js.map +1 -0
  14. package/dist/index.cjs +8449 -0
  15. package/dist/index.cjs.map +1 -0
  16. package/dist/index.d.cts +2 -0
  17. package/dist/index.d.ts +2 -0
  18. package/dist/index.js +8318 -0
  19. package/dist/index.js.map +1 -0
  20. package/dist/node/backends/score-worker.cjs +934 -0
  21. package/dist/node/backends/score-worker.cjs.map +1 -0
  22. package/dist/node/backends/score-worker.d.cts +14 -0
  23. package/dist/node/backends/score-worker.d.ts +14 -0
  24. package/dist/node/backends/score-worker.js +932 -0
  25. package/dist/node/backends/score-worker.js.map +1 -0
  26. package/dist/node/index.cjs +11430 -0
  27. package/dist/node/index.cjs.map +1 -0
  28. package/dist/node/index.d.cts +554 -0
  29. package/dist/node/index.d.ts +554 -0
  30. package/dist/node/index.js +11277 -0
  31. package/dist/node/index.js.map +1 -0
  32. package/dist/types-DhUdX5Rc.d.cts +304 -0
  33. package/dist/types-DhUdX5Rc.d.ts +304 -0
  34. package/examples/01-basic-dedupe.ts +60 -0
  35. package/examples/02-match-two-datasets.ts +48 -0
  36. package/examples/03-csv-file-pipeline.ts +62 -0
  37. package/examples/04-string-scoring.ts +63 -0
  38. package/examples/05-custom-config.ts +94 -0
  39. package/examples/06-probabilistic-fs.ts +72 -0
  40. package/examples/07-pprl-privacy.ts +76 -0
  41. package/examples/08-streaming.ts +79 -0
  42. package/examples/09-llm-scorer.ts +79 -0
  43. package/examples/10-explain.ts +60 -0
  44. package/examples/11-evaluate.ts +61 -0
  45. package/examples/README.md +53 -0
  46. package/package.json +66 -0
  47. package/src/cli.ts +372 -0
  48. package/src/core/ann-blocker.ts +593 -0
  49. package/src/core/api.ts +220 -0
  50. package/src/core/autoconfig.ts +363 -0
  51. package/src/core/autofix.ts +102 -0
  52. package/src/core/blocker.ts +655 -0
  53. package/src/core/cluster.ts +699 -0
  54. package/src/core/compare-clusters.ts +176 -0
  55. package/src/core/config/loader.ts +869 -0
  56. package/src/core/cross-encoder.ts +614 -0
  57. package/src/core/data.ts +430 -0
  58. package/src/core/domain.ts +277 -0
  59. package/src/core/embedder.ts +562 -0
  60. package/src/core/evaluate.ts +156 -0
  61. package/src/core/explain.ts +352 -0
  62. package/src/core/golden.ts +524 -0
  63. package/src/core/graph-er.ts +371 -0
  64. package/src/core/index.ts +314 -0
  65. package/src/core/ingest.ts +112 -0
  66. package/src/core/learned-blocking.ts +305 -0
  67. package/src/core/lineage.ts +221 -0
  68. package/src/core/llm/budget.ts +258 -0
  69. package/src/core/llm/cluster.ts +542 -0
  70. package/src/core/llm/scorer.ts +396 -0
  71. package/src/core/match-one.ts +95 -0
  72. package/src/core/matchkey.ts +97 -0
  73. package/src/core/memory/corrections.ts +179 -0
  74. package/src/core/memory/learner.ts +218 -0
  75. package/src/core/memory/store.ts +114 -0
  76. package/src/core/pipeline.ts +366 -0
  77. package/src/core/pprl/protocol.ts +216 -0
  78. package/src/core/probabilistic.ts +511 -0
  79. package/src/core/profiler.ts +212 -0
  80. package/src/core/quality.ts +197 -0
  81. package/src/core/review-queue.ts +177 -0
  82. package/src/core/scorer.ts +855 -0
  83. package/src/core/sensitivity.ts +196 -0
  84. package/src/core/standardize.ts +279 -0
  85. package/src/core/streaming.ts +128 -0
  86. package/src/core/transforms.ts +599 -0
  87. package/src/core/types.ts +570 -0
  88. package/src/core/validate.ts +243 -0
  89. package/src/index.ts +8 -0
  90. package/src/node/a2a/server.ts +470 -0
  91. package/src/node/api/server.ts +412 -0
  92. package/src/node/backends/duckdb.ts +130 -0
  93. package/src/node/backends/score-worker.ts +41 -0
  94. package/src/node/backends/workers.ts +212 -0
  95. package/src/node/config-file.ts +66 -0
  96. package/src/node/connectors/base.ts +57 -0
  97. package/src/node/connectors/bigquery.ts +61 -0
  98. package/src/node/connectors/databricks.ts +69 -0
  99. package/src/node/connectors/file.ts +350 -0
  100. package/src/node/connectors/hubspot.ts +62 -0
  101. package/src/node/connectors/index.ts +43 -0
  102. package/src/node/connectors/salesforce.ts +93 -0
  103. package/src/node/connectors/snowflake.ts +73 -0
  104. package/src/node/db/postgres.ts +173 -0
  105. package/src/node/db/sync.ts +103 -0
  106. package/src/node/dedupe-file.ts +156 -0
  107. package/src/node/index.ts +89 -0
  108. package/src/node/mcp/server.ts +940 -0
  109. package/src/node/tui/app.ts +756 -0
  110. package/src/node/tui/index.ts +6 -0
  111. package/src/node/tui/widgets.ts +128 -0
  112. package/tests/parity/scorer-ground-truth.test.ts +118 -0
  113. package/tests/smoke.test.ts +46 -0
  114. package/tests/unit/a2a-server.test.ts +175 -0
  115. package/tests/unit/ann-blocker.test.ts +117 -0
  116. package/tests/unit/api-server.test.ts +239 -0
  117. package/tests/unit/api.test.ts +77 -0
  118. package/tests/unit/autoconfig.test.ts +103 -0
  119. package/tests/unit/autofix.test.ts +71 -0
  120. package/tests/unit/blocker.test.ts +164 -0
  121. package/tests/unit/buildBlocksAsync.test.ts +63 -0
  122. package/tests/unit/cluster.test.ts +213 -0
  123. package/tests/unit/compare-clusters.test.ts +42 -0
  124. package/tests/unit/config-loader.test.ts +301 -0
  125. package/tests/unit/connectors-base.test.ts +48 -0
  126. package/tests/unit/cross-encoder-model.test.ts +198 -0
  127. package/tests/unit/cross-encoder.test.ts +173 -0
  128. package/tests/unit/db-connectors.test.ts +37 -0
  129. package/tests/unit/domain.test.ts +80 -0
  130. package/tests/unit/embedder.test.ts +151 -0
  131. package/tests/unit/evaluate.test.ts +85 -0
  132. package/tests/unit/explain.test.ts +73 -0
  133. package/tests/unit/golden.test.ts +97 -0
  134. package/tests/unit/graph-er.test.ts +173 -0
  135. package/tests/unit/hnsw-ann.test.ts +283 -0
  136. package/tests/unit/hubspot-connector.test.ts +118 -0
  137. package/tests/unit/ingest.test.ts +97 -0
  138. package/tests/unit/learned-blocking.test.ts +134 -0
  139. package/tests/unit/lineage.test.ts +135 -0
  140. package/tests/unit/match-one.test.ts +129 -0
  141. package/tests/unit/matchkey.test.ts +97 -0
  142. package/tests/unit/mcp-server.test.ts +183 -0
  143. package/tests/unit/memory.test.ts +119 -0
  144. package/tests/unit/pipeline.test.ts +118 -0
  145. package/tests/unit/pprl-protocol.test.ts +381 -0
  146. package/tests/unit/probabilistic.test.ts +494 -0
  147. package/tests/unit/profiler.test.ts +68 -0
  148. package/tests/unit/review-queue.test.ts +68 -0
  149. package/tests/unit/salesforce-connector.test.ts +148 -0
  150. package/tests/unit/scorer.test.ts +301 -0
  151. package/tests/unit/sensitivity.test.ts +154 -0
  152. package/tests/unit/standardize.test.ts +84 -0
  153. package/tests/unit/streaming.test.ts +82 -0
  154. package/tests/unit/transforms.test.ts +208 -0
  155. package/tests/unit/tui-widgets.test.ts +42 -0
  156. package/tests/unit/tui.test.ts +24 -0
  157. package/tests/unit/validate.test.ts +145 -0
  158. package/tests/unit/workers-parallel.test.ts +99 -0
  159. package/tests/unit/workers.test.ts +74 -0
  160. package/tsconfig.json +25 -0
  161. package/tsup.config.ts +37 -0
  162. package/vitest.config.ts +11 -0
@@ -0,0 +1,396 @@
1
+ /**
2
+ * scorer.ts — LLM scorer for borderline record pairs.
3
+ * Ports `goldenmatch/core/llm_scorer.py`.
4
+ *
5
+ * Three-tier decision:
6
+ * score >= autoThreshold -> auto-accept (promote to 1.0)
7
+ * candidateLo <= score < autoThreshold -> send to LLM
8
+ * score < candidateLo -> keep original score (never demoted)
9
+ *
10
+ * Edge-safe: uses `fetch()` (global on Node 20+/edge runtimes).
11
+ * No `node:` imports.
12
+ */
13
+
14
+ import type { Row, ScoredPair, LLMScorerConfig } from "../types.js";
15
+ import { makeScoredPair } from "../types.js";
16
+ import { BudgetTracker, countTokensApprox } from "./budget.js";
17
+ import type { BudgetSnapshot } from "./budget.js";
18
+
19
+ // ---------------------------------------------------------------------------
20
+ // Public result types
21
+ // ---------------------------------------------------------------------------
22
+
23
/** Result of {@link llmScorePairs}. */
export interface LLMScoreResult {
  /** The rescored pairs (auto-accepted, below-threshold and candidate pairs). */
  readonly pairs: readonly ScoredPair[];
  /** Snapshot of the budget tracker taken when scoring finished. */
  readonly budget: BudgetSnapshot | null;
}

/** Result of scoring one batch of candidate pairs with the LLM. */
export interface LLMCallResult {
  /**
   * LLM verdict per pair, keyed by the numeric pair key (`pairIndex`).
   * Empty when the batch was not sent or the call failed.
   */
  readonly decisions: ReadonlyMap<number, boolean>;
  /** Input tokens reported by the provider call (0 when nothing was decided). */
  readonly inputTokens: number;
  /** Output tokens reported by the provider call (0 when nothing was decided). */
  readonly outputTokens: number;
}

/** LLM back-ends this module knows how to call. */
type Provider = "openai" | "anthropic";
35
+
36
+ // ---------------------------------------------------------------------------
37
+ // Provider detection
38
+ // ---------------------------------------------------------------------------
39
+
40
/**
 * Resolve which LLM provider to talk to.
 *
 * An explicit `configProvider` of `"openai"` or `"anthropic"` always wins.
 * Otherwise the key prefix decides: a key starting with `sk-ant-` selects
 * Anthropic, and anything else (including a missing key) selects OpenAI.
 */
function detectProvider(apiKey?: string, configProvider?: string): Provider {
  switch (configProvider) {
    case "openai":
    case "anthropic":
      return configProvider;
    default:
      break;
  }
  const looksAnthropic = apiKey?.startsWith("sk-ant-") === true;
  return looksAnthropic ? "anthropic" : "openai";
}
51
+
52
/** Model used for a provider when the config does not name one. */
function defaultModel(provider: Provider): string {
  if (provider === "openai") {
    return "gpt-4o-mini";
  }
  return "claude-haiku-4-5-20251001";
}
55
+
56
+ // ---------------------------------------------------------------------------
57
+ // Prompt construction
58
+ // ---------------------------------------------------------------------------
59
+
60
/**
 * Render the given columns of a row as `col: value` fragments joined by
 * ` | `. Columns whose value is `null`, `undefined` or the empty string are
 * omitted, and the final string is capped at 200 characters.
 */
function summariseRow(row: Row, cols: readonly string[]): string {
  const fragments = cols
    .filter((name) => {
      const value = row[name];
      return !(value === null || value === undefined || value === "");
    })
    .map((name) => `${name}: ${String(row[name])}`);
  return fragments.join(" | ").slice(0, 200);
}
70
+
71
/**
 * Assemble the YES/NO prompt for a batch: an instruction line, a blank line,
 * then two lines per pair (`<n>. A: …` and ` B: …`), numbered from 1.
 * Rows that cannot be found in `rowById` are rendered as empty records.
 */
function buildBatchPrompt(
  batch: readonly ScoredPair[],
  rowById: ReadonlyMap<number, Row>,
  cols: readonly string[],
): string {
  const instruction =
    "For each numbered pair, answer YES if they are the same entity/product, " +
    "NO if they are different. Respond with just the number and YES/NO, one per line.";
  const pairLines = batch.flatMap((pair, k) => {
    const left = summariseRow(rowById.get(pair.idA) ?? {}, cols);
    const right = summariseRow(rowById.get(pair.idB) ?? {}, cols);
    return [`${k + 1}. A: ${left}`, ` B: ${right}`];
  });
  return [instruction, "", ...pairLines].join("\n");
}
92
+
93
/**
 * Parse a batch YES/NO response into a decision list aligned to the batch.
 *
 * Each non-empty line is upper-cased and checked for the whole word `YES`
 * (true) or, failing that, the whole word `NO` (false). Lines containing
 * neither are ignored, so preambles such as "Note: …" or hedges such as
 * "NOT SURE" no longer count as an answer.
 *
 * Alignment: when a line starts with a pair number (`3. YES`, `3) NO`,
 * `**3** YES`), the verdict is stored at that pair's slot. The prompt asks
 * for numbered answers, so a skipped or reordered line cannot shift the
 * verdicts of the remaining pairs. Lines without a leading number fill the
 * slot after the most recently written one. Numbers outside `1..batchSize`
 * are ignored.
 *
 * @param answer    Raw text returned by the LLM.
 * @param batchSize Number of pairs in the batch.
 * @returns An array of exactly `batchSize` booleans; unanswered pairs are
 *          `false` so callers can align by index.
 */
function parseBatchResponse(answer: string, batchSize: number): boolean[] {
  const size = Number.isFinite(batchSize) && batchSize > 0 ? Math.ceil(batchSize) : 0;
  const decisions = new Array<boolean>(size).fill(false);

  const yesWord = /\bYES\b/;
  const noWord = /\bNO\b/;
  const leadingNumber = /^\W*(\d+)/;

  // Next slot to use for an answer that carries no pair number.
  let cursor = 0;

  for (const raw of answer.split(/\r?\n/)) {
    const line = raw.trim().toUpperCase();
    if (!line) continue;

    let verdict: boolean;
    if (yesWord.test(line)) verdict = true;
    else if (noWord.test(line)) verdict = false;
    else continue;

    const numbered = leadingNumber.exec(line);
    const slot = numbered ? Number(numbered[1] ?? NaN) - 1 : cursor;
    if (!Number.isInteger(slot) || slot < 0 || slot >= size) continue;

    decisions[slot] = verdict;
    cursor = slot + 1;
  }
  return decisions;
}
108
+
109
+ // ---------------------------------------------------------------------------
110
+ // Provider calls (fetch-based, edge-safe)
111
+ // ---------------------------------------------------------------------------
112
+
113
/**
 * Send a single-message chat completion request to OpenAI via `fetch`.
 *
 * @returns The trimmed content of the first choice (empty string when
 *          absent) plus the prompt/completion token counts from `usage`
 *          (0 when absent).
 * @throws LLMHttpError when the HTTP response is not OK; the message carries
 *         the status and the first 200 characters of the response body.
 */
async function callOpenAI(
  prompt: string,
  apiKey: string,
  model: string,
  maxTokens: number,
): Promise<{ text: string; inputTokens: number; outputTokens: number }> {
  const payload = {
    model,
    messages: [{ role: "user", content: prompt }],
    temperature: 0,
    max_tokens: maxTokens,
  };
  const resp = await fetch("https://api.openai.com/v1/chat/completions", {
    method: "POST",
    headers: {
      Authorization: `Bearer ${apiKey}`,
      "Content-Type": "application/json",
    },
    body: JSON.stringify(payload),
  });

  if (!resp.ok) {
    // Reading the error body is best-effort; fall back to an empty string.
    const detail = await resp.text().catch(() => "");
    throw new LLMHttpError(resp.status, `OpenAI ${resp.status}: ${detail.slice(0, 200)}`);
  }

  const data = (await resp.json()) as {
    choices?: Array<{ message?: { content?: string } }>;
    usage?: { prompt_tokens?: number; completion_tokens?: number };
  };
  const usage = data.usage;
  return {
    text: data.choices?.[0]?.message?.content?.trim() ?? "",
    inputTokens: usage?.prompt_tokens ?? 0,
    outputTokens: usage?.completion_tokens ?? 0,
  };
}
147
+
148
/**
 * Send a single-message request to the Anthropic Messages endpoint via
 * `fetch`.
 *
 * @returns The trimmed text of the first content block (empty string when
 *          absent) plus the input/output token counts from `usage`
 *          (0 when absent).
 * @throws LLMHttpError when the HTTP response is not OK; the message carries
 *         the status and the first 200 characters of the response body.
 */
async function callAnthropic(
  prompt: string,
  apiKey: string,
  model: string,
  maxTokens: number,
): Promise<{ text: string; inputTokens: number; outputTokens: number }> {
  const payload = {
    model,
    max_tokens: maxTokens,
    messages: [{ role: "user", content: prompt }],
  };
  const resp = await fetch("https://api.anthropic.com/v1/messages", {
    method: "POST",
    headers: {
      "x-api-key": apiKey,
      "content-type": "application/json",
      "anthropic-version": "2023-06-01",
    },
    body: JSON.stringify(payload),
  });

  if (!resp.ok) {
    // Reading the error body is best-effort; fall back to an empty string.
    const detail = await resp.text().catch(() => "");
    throw new LLMHttpError(resp.status, `Anthropic ${resp.status}: ${detail.slice(0, 200)}`);
  }

  const data = (await resp.json()) as {
    content?: Array<{ text?: string }>;
    usage?: { input_tokens?: number; output_tokens?: number };
  };
  const usage = data.usage;
  return {
    text: data.content?.[0]?.text?.trim() ?? "",
    inputTokens: usage?.input_tokens ?? 0,
    outputTokens: usage?.output_tokens ?? 0,
  };
}
182
+
183
/**
 * Raised by the provider helpers when an HTTP call returns a non-OK status.
 * `status` holds the HTTP status code; `name` is `"LLMHttpError"`.
 */
export class LLMHttpError extends Error {
  public readonly status: number;

  constructor(status: number, message: string) {
    super(message);
    this.status = status;
    this.name = "LLMHttpError";
  }
}
190
+
191
+ // ---------------------------------------------------------------------------
192
+ // Batch orchestration
193
+ // ---------------------------------------------------------------------------
194
+
195
/**
 * Yield consecutive slices of `items`, each holding at most `size` elements.
 *
 * The batch size is floored to a whole number and clamped to at least 1.
 * A `NaN` size is treated as 1: previously it turned the loop step into
 * `NaN`, so only the first batch was yielded and the rest were dropped.
 *
 * @param items Items to split; not mutated.
 * @param size  Requested batch size.
 */
function* batchify<T>(items: readonly T[], size: number): Generator<T[]> {
  const step = Number.isNaN(size) ? 1 : Math.max(1, Math.floor(size));
  for (let i = 0; i < items.length; i += step) {
    yield items.slice(i, i + step);
  }
}
201
+
202
/**
 * Score one batch of candidate pairs with the LLM.
 *
 * Builds the batch prompt, asks the budget tracker whether the estimated
 * input may be sent, calls the selected provider, records token usage
 * (falling back to the estimates when the provider reports 0), and maps each
 * pair's key to its parsed YES/NO verdict.
 *
 * Degrades gracefully: if the budget refuses the request or the call throws
 * for any reason, an empty decision map is returned so the caller keeps the
 * original fuzzy scores.
 */
async function scoreBatch(
  batch: readonly ScoredPair[],
  rowById: ReadonlyMap<number, Row>,
  cols: readonly string[],
  provider: Provider,
  model: string,
  apiKey: string,
  budget: BudgetTracker,
): Promise<LLMCallResult> {
  const nothingDecided = (): LLMCallResult => ({
    decisions: new Map(),
    inputTokens: 0,
    outputTokens: 0,
  });

  const prompt = buildBatchPrompt(batch, rowById, cols);
  const estIn = countTokensApprox(prompt);
  // Rough allowance of 10 output tokens per pair; also used as max tokens.
  const estOut = batch.length * 10;

  if (!budget.canSend(estIn)) {
    return nothingDecided();
  }

  try {
    const send = provider === "openai" ? callOpenAI : callAnthropic;
    const reply = await send(prompt, apiKey, model, estOut);

    budget.record(reply.inputTokens || estIn, reply.outputTokens || estOut, model);

    const verdicts = parseBatchResponse(reply.text, batch.length);
    const out = new Map<number, boolean>();
    for (const [k, pair] of batch.entries()) {
      out.set(pairIndex(pair), verdicts[k] ?? false);
    }
    return {
      decisions: out,
      inputTokens: reply.inputTokens,
      outputTokens: reply.outputTokens,
    };
  } catch {
    // HTTP failures and unknown errors alike: caller keeps original scores.
    return nothingDecided();
  }
}
242
+
243
/**
 * Numeric Map key for a pair, independent of the order of `idA` and `idB`.
 *
 * Applies the Cantor pairing function to the ids sorted as (smaller, larger).
 * Note: the arithmetic uses plain JS numbers, so the key is only exact while
 * the intermediate product stays within `Number.MAX_SAFE_INTEGER`.
 */
function pairIndex(pair: ScoredPair): number {
  const [lo, hi] =
    pair.idA <= pair.idB ? [pair.idA, pair.idB] : [pair.idB, pair.idA];
  const sum = lo + hi;
  return (sum * (sum + 1)) / 2 + hi;
}
250
+
251
+ // ---------------------------------------------------------------------------
252
+ // Public: llmScorePairs
253
+ // ---------------------------------------------------------------------------
254
+
255
/**
 * Score borderline pairs with an LLM.
 *
 * Pairs are partitioned into three tiers by their fuzzy score:
 *  - `score >= config.autoThreshold`: auto-accepted and promoted to 1.0;
 *  - `config.candidateLo <= score < config.autoThreshold`: sent to the LLM in
 *    batches of `config.batchSize` (default 20);
 *  - `score < config.candidateLo`: returned untouched.
 *
 * Never demotes: candidates the LLM rejects, or that were never scored
 * (budget exhausted, failed call), keep their original score. Candidates the
 * LLM confirms are promoted to 1.0. The returned list is ordered as
 * auto-accepted, then below-threshold, then candidates.
 *
 * When no `apiKey` is available, candidates are passed through unchanged.
 *
 * @param pairs  Scored pairs to review.
 * @param rows   Source rows; looked up through their numeric `__row_id__`.
 *               Columns prefixed with `__` are not shown to the LLM.
 * @param config LLM scorer configuration (thresholds, model, provider, budget).
 * @param apiKey Provider API key; also used to guess the provider.
 * @returns The rescored pairs and a budget snapshot.
 */
export async function llmScorePairs(
  pairs: readonly ScoredPair[],
  rows: readonly Row[],
  config: LLMScorerConfig,
  apiKey?: string,
): Promise<LLMScoreResult> {
  // Resolve provider/model first so the budget tracker is created with the
  // model that will actually be called rather than a fixed OpenAI default.
  const provider = detectProvider(apiKey, config.provider);
  const model = config.model ?? defaultModel(provider);
  const budget = new BudgetTracker(config.budget ?? {}, model);

  if (pairs.length === 0) {
    return { pairs: [], budget: budget.snapshot() };
  }

  // Display columns: every key seen on any row that is not prefixed with `__`.
  const cols = new Set<string>();
  for (const r of rows) {
    for (const k of Object.keys(r)) {
      if (!k.startsWith("__")) cols.add(k);
    }
  }
  const displayCols = [...cols];

  // Index rows by their numeric row id; rows without one are not indexed.
  const rowById = new Map<number, Row>();
  for (const r of rows) {
    const id = r["__row_id__"];
    if (typeof id === "number") rowById.set(id, r);
  }

  // Three-tier partition.
  const autoAccept: ScoredPair[] = [];
  const candidates: ScoredPair[] = [];
  const below: ScoredPair[] = [];
  for (const p of pairs) {
    if (p.score >= config.autoThreshold) autoAccept.push(p);
    else if (p.score >= config.candidateLo) candidates.push(p);
    else below.push(p);
  }

  // Build result scaffold: auto-accept promoted to 1.0, below untouched.
  const resultPairs: ScoredPair[] = [];
  for (const p of autoAccept) {
    resultPairs.push(makeScoredPair(p.idA, p.idB, 1.0));
  }
  for (const p of below) {
    resultPairs.push(p);
  }

  // If no API key, pass candidates through unchanged. Pushed one by one:
  // spreading a very large array into push() can exceed the engine's
  // argument limit and throw a RangeError.
  if (!apiKey) {
    for (const p of candidates) {
      resultPairs.push(p);
    }
    return { pairs: resultPairs, budget: budget.snapshot() };
  }

  // Batch LLM scoring for candidates; stop as soon as the budget says so.
  const batchSize = Math.max(1, config.batchSize || 20);
  const llmDecisions = new Map<number, boolean>();
  for (const batch of batchify(candidates, batchSize)) {
    if (!budget.canProceed()) break;
    const res = await scoreBatch(
      batch,
      rowById,
      displayCols,
      provider,
      model,
      apiKey,
      budget,
    );
    res.decisions.forEach((v, k) => llmDecisions.set(k, v));
  }

  // Merge candidates: promote YES to 1.0, keep NO/unscored at original score.
  for (const p of candidates) {
    const decision = llmDecisions.get(pairIndex(p));
    if (decision === true) {
      resultPairs.push(makeScoredPair(p.idA, p.idB, 1.0));
    } else {
      resultPairs.push(p);
    }
  }

  return { pairs: resultPairs, budget: budget.snapshot() };
}
348
+
349
+ // ---------------------------------------------------------------------------
350
+ // Public: scoreStringsWithLlm (single-pair helper)
351
+ // ---------------------------------------------------------------------------
352
+
353
/**
 * Ask the LLM a single yes/no question about two strings.
 *
 * @param a      First value.
 * @param b      Second value.
 * @param config LLM scorer configuration (model, provider, budget).
 * @param apiKey Provider API key; without one no call is made.
 * @returns `score` 1.0 when the answer contains "YES", otherwise 0.0. On any
 *          error the score is 0.0 and `error` carries the message so
 *          operators can tell failures from genuine "no" answers. `budget`
 *          is a snapshot of the tracker after the call.
 */
export async function scoreStringsWithLlm(
  a: string,
  b: string,
  config: LLMScorerConfig,
  apiKey?: string,
): Promise<{ score: number; budget: BudgetSnapshot; error?: string }> {
  // Resolve provider/model first so the budget tracker is created with the
  // model that will actually be called rather than a fixed OpenAI default.
  const provider = detectProvider(apiKey, config.provider);
  const model = config.model ?? defaultModel(provider);
  const budget = new BudgetTracker(config.budget ?? {}, model);
  if (!apiKey) return { score: 0, budget: budget.snapshot() };

  const prompt =
    "Are these two values referring to the same entity? Answer YES or NO.\n" +
    `A: ${a}\nB: ${b}`;
  // A one-word answer is expected, so 10 tokens is the output allowance.
  const maxOut = 10;

  try {
    const { text, inputTokens, outputTokens } =
      provider === "openai"
        ? await callOpenAI(prompt, apiKey, model, maxOut)
        : await callAnthropic(prompt, apiKey, model, maxOut);
    // Fall back to estimates when the provider reports no usage, matching
    // the batch path, so the spend is not recorded as zero.
    budget.record(
      inputTokens || countTokensApprox(prompt),
      outputTokens || maxOut,
      model,
    );
    const upper = text.trim().toUpperCase();
    const score = upper.includes("YES") ? 1.0 : 0.0;
    return { score, budget: budget.snapshot() };
  } catch (err) {
    const message = err instanceof Error ? err.message : String(err);
    // eslint-disable-next-line no-console
    console.warn("scoreStringsWithLlm failed:", message);
    // Return score=0 (treats as "not matched") but surface the error so
    // operators can distinguish HTTP failures from genuine LLM "no" answers.
    return { score: 0, budget: budget.snapshot(), error: message };
  }
}
394
+
395
+ // Re-export budget types for convenience.
396
+ export type { BudgetSnapshot } from "./budget.js";
@@ -0,0 +1,95 @@
1
+ /**
2
+ * match-one.ts — Single-record matching primitive.
3
+ * Edge-safe: no Node.js imports, pure TypeScript only.
4
+ *
5
+ * Ports goldenmatch/core/match_one.py.
6
+ */
7
+
8
+ import type { Row, MatchkeyConfig } from "./types.js";
9
+ import { scorePair, asString } from "./scorer.js";
10
+ import { applyTransforms } from "./transforms.js";
11
+
12
+ // ---------------------------------------------------------------------------
13
+ // Types
14
+ // ---------------------------------------------------------------------------
15
+
16
/** One dataset row that matched the probe record. */
export interface MatchOneHit {
  /** The matched row's `__row_id__` value. */
  readonly rowId: number;
  /** Match score for the row (1.0 for exact composite-key matches). */
  readonly score: number;
}
20
+
21
+ // ---------------------------------------------------------------------------
22
+ // matchOne
23
+ // ---------------------------------------------------------------------------
24
+
25
/**
 * Score one record against every row of a dataset with a weighted matchkey.
 *
 * The cut-off is 1.0 for matchkeys of type `"exact"`; otherwise it is
 * `mk.threshold`, defaulting to 0 (every row is returned). For exact
 * matchkeys prefer {@link findExactMatchesOne}.
 *
 * @returns Hits whose score reaches the cut-off, sorted by descending score.
 *          Rows are expected to carry `__row_id__`.
 */
export function matchOne(
  record: Row,
  rows: readonly Row[],
  mk: MatchkeyConfig,
): readonly MatchOneHit[] {
  let cutoff: number;
  if (mk.type === "exact") {
    // Exact matchkeys require a perfect score.
    cutoff = 1.0;
  } else {
    cutoff = mk.threshold ?? 0;
  }

  const hits = rows
    .map(
      (row): MatchOneHit => ({
        rowId: row["__row_id__"] as number,
        score: scorePair(record, row, mk.fields),
      }),
    )
    .filter((hit) => hit.score >= cutoff);

  hits.sort((x, y) => y.score - x.score);
  return hits;
}
51
+
52
+ // ---------------------------------------------------------------------------
53
+ // findExactMatchesOne
54
+ // ---------------------------------------------------------------------------
55
+
56
/**
 * Find the rows whose composite matchkey equals that of a single record.
 *
 * The composite key is each matchkey field's transformed value joined with a
 * NUL character. If any transformed field of the probe is null, nothing can
 * match and an empty list is returned; rows with a null transformed field
 * are skipped. Every hit has score 1.0.
 */
export function findExactMatchesOne(
  record: Row,
  rows: readonly Row[],
  mk: MatchkeyConfig,
): readonly MatchOneHit[] {
  // Composite key of a row, or null when any transformed field is null.
  const compositeKey = (source: Row): string | null => {
    const pieces: string[] = [];
    for (const f of mk.fields) {
      const transformed = applyTransforms(asString(source[f.field]), f.transforms);
      if (transformed === null) return null;
      pieces.push(transformed);
    }
    return pieces.join("\x00");
  };

  const probeKey = compositeKey(record);
  if (probeKey === null) return [];

  const hits: MatchOneHit[] = [];
  for (const row of rows) {
    // A null key never equals the (non-null) probe key, so such rows drop out.
    if (compositeKey(row) === probeKey) {
      hits.push({ rowId: row["__row_id__"] as number, score: 1.0 });
    }
  }
  return hits;
}
@@ -0,0 +1,97 @@
1
+ /**
2
+ * matchkey.ts — Matchkey builder for GoldenMatch-JS.
3
+ * Edge-safe: no `node:` imports, pure TypeScript only.
4
+ *
5
+ * Ports matchkey building from goldenmatch/core/matchkey.py.
6
+ * In Python this uses Polars expressions; here we work with Row arrays.
7
+ */
8
+
9
+ import type { Row, MatchkeyConfig } from "./types.js";
10
+ import { applyTransforms } from "./transforms.js";
11
+
12
+ // ---------------------------------------------------------------------------
13
+ // computeMatchkeyValue — build a matchkey value for a single row
14
+ // ---------------------------------------------------------------------------
15
+
16
/**
 * Compute the composite matchkey value of one row.
 *
 * Each matchkey field is read from the row, stringified, passed through the
 * field's transform chain, and the results are joined with `"||"`.
 *
 * @returns The composite key, or `null` as soon as a field is
 *          null/undefined or its transforms yield null.
 */
export function computeMatchkeyValue(
  row: Row,
  mk: MatchkeyConfig,
): string | null {
  const pieces: string[] = [];
  for (const { field, transforms } of mk.fields) {
    const raw = row[field];
    if (raw == null) return null; // matches both null and undefined
    const transformed = applyTransforms(String(raw), transforms);
    if (transformed === null) return null;
    pieces.push(transformed);
  }
  return pieces.join("||");
}
40
+
41
+ // ---------------------------------------------------------------------------
42
+ // computeMatchkeys — add matchkey columns to all rows
43
+ // ---------------------------------------------------------------------------
44
+
45
/**
 * Attach one `__mk_{name}__` column per matchkey to every row, holding that
 * row's computed matchkey value (possibly `null`).
 *
 * Returns new row objects; the input rows are not mutated.
 */
export function computeMatchkeys(
  rows: readonly Row[],
  matchkeys: readonly MatchkeyConfig[],
): Row[] {
  return rows.map((row) => {
    const keyColumns = Object.fromEntries(
      matchkeys.map((mk) => [`__mk_${mk.name}__`, computeMatchkeyValue(row, mk)]),
    );
    return { ...row, ...keyColumns };
  });
}
63
+
64
+ // ---------------------------------------------------------------------------
65
+ // addRowIds — add sequential __row_id__ column
66
+ // ---------------------------------------------------------------------------
67
+
68
/**
 * Give every row a `__row_id__` equal to `offset` plus its position in
 * `rows` (so ids run `offset`, `offset + 1`, …).
 *
 * Returns new row objects; the input rows are not mutated.
 */
export function addRowIds(rows: readonly Row[], offset: number = 0): Row[] {
  return rows.map((row, position) => {
    const id = offset + position;
    return { ...row, __row_id__: id };
  });
}
79
+
80
+ // ---------------------------------------------------------------------------
81
+ // addSourceColumn — add __source__ column
82
+ // ---------------------------------------------------------------------------
83
+
84
/**
 * Tag every row with a `__source__` column set to `sourceName`.
 *
 * Returns new row objects; the input rows are not mutated.
 */
export function addSourceColumn(
  rows: readonly Row[],
  sourceName: string,
): Row[] {
  const tag = { __source__: sourceName };
  return rows.map((row) => ({ ...row, ...tag }));
}