goldenmatch 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (162) hide show
  1. package/README.md +140 -0
  2. package/dist/cli.cjs +6079 -0
  3. package/dist/cli.cjs.map +1 -0
  4. package/dist/cli.d.cts +1 -0
  5. package/dist/cli.d.ts +1 -0
  6. package/dist/cli.js +6076 -0
  7. package/dist/cli.js.map +1 -0
  8. package/dist/core/index.cjs +8449 -0
  9. package/dist/core/index.cjs.map +1 -0
  10. package/dist/core/index.d.cts +1972 -0
  11. package/dist/core/index.d.ts +1972 -0
  12. package/dist/core/index.js +8318 -0
  13. package/dist/core/index.js.map +1 -0
  14. package/dist/index.cjs +8449 -0
  15. package/dist/index.cjs.map +1 -0
  16. package/dist/index.d.cts +2 -0
  17. package/dist/index.d.ts +2 -0
  18. package/dist/index.js +8318 -0
  19. package/dist/index.js.map +1 -0
  20. package/dist/node/backends/score-worker.cjs +934 -0
  21. package/dist/node/backends/score-worker.cjs.map +1 -0
  22. package/dist/node/backends/score-worker.d.cts +14 -0
  23. package/dist/node/backends/score-worker.d.ts +14 -0
  24. package/dist/node/backends/score-worker.js +932 -0
  25. package/dist/node/backends/score-worker.js.map +1 -0
  26. package/dist/node/index.cjs +11430 -0
  27. package/dist/node/index.cjs.map +1 -0
  28. package/dist/node/index.d.cts +554 -0
  29. package/dist/node/index.d.ts +554 -0
  30. package/dist/node/index.js +11277 -0
  31. package/dist/node/index.js.map +1 -0
  32. package/dist/types-DhUdX5Rc.d.cts +304 -0
  33. package/dist/types-DhUdX5Rc.d.ts +304 -0
  34. package/examples/01-basic-dedupe.ts +60 -0
  35. package/examples/02-match-two-datasets.ts +48 -0
  36. package/examples/03-csv-file-pipeline.ts +62 -0
  37. package/examples/04-string-scoring.ts +63 -0
  38. package/examples/05-custom-config.ts +94 -0
  39. package/examples/06-probabilistic-fs.ts +72 -0
  40. package/examples/07-pprl-privacy.ts +76 -0
  41. package/examples/08-streaming.ts +79 -0
  42. package/examples/09-llm-scorer.ts +79 -0
  43. package/examples/10-explain.ts +60 -0
  44. package/examples/11-evaluate.ts +61 -0
  45. package/examples/README.md +53 -0
  46. package/package.json +66 -0
  47. package/src/cli.ts +372 -0
  48. package/src/core/ann-blocker.ts +593 -0
  49. package/src/core/api.ts +220 -0
  50. package/src/core/autoconfig.ts +363 -0
  51. package/src/core/autofix.ts +102 -0
  52. package/src/core/blocker.ts +655 -0
  53. package/src/core/cluster.ts +699 -0
  54. package/src/core/compare-clusters.ts +176 -0
  55. package/src/core/config/loader.ts +869 -0
  56. package/src/core/cross-encoder.ts +614 -0
  57. package/src/core/data.ts +430 -0
  58. package/src/core/domain.ts +277 -0
  59. package/src/core/embedder.ts +562 -0
  60. package/src/core/evaluate.ts +156 -0
  61. package/src/core/explain.ts +352 -0
  62. package/src/core/golden.ts +524 -0
  63. package/src/core/graph-er.ts +371 -0
  64. package/src/core/index.ts +314 -0
  65. package/src/core/ingest.ts +112 -0
  66. package/src/core/learned-blocking.ts +305 -0
  67. package/src/core/lineage.ts +221 -0
  68. package/src/core/llm/budget.ts +258 -0
  69. package/src/core/llm/cluster.ts +542 -0
  70. package/src/core/llm/scorer.ts +396 -0
  71. package/src/core/match-one.ts +95 -0
  72. package/src/core/matchkey.ts +97 -0
  73. package/src/core/memory/corrections.ts +179 -0
  74. package/src/core/memory/learner.ts +218 -0
  75. package/src/core/memory/store.ts +114 -0
  76. package/src/core/pipeline.ts +366 -0
  77. package/src/core/pprl/protocol.ts +216 -0
  78. package/src/core/probabilistic.ts +511 -0
  79. package/src/core/profiler.ts +212 -0
  80. package/src/core/quality.ts +197 -0
  81. package/src/core/review-queue.ts +177 -0
  82. package/src/core/scorer.ts +855 -0
  83. package/src/core/sensitivity.ts +196 -0
  84. package/src/core/standardize.ts +279 -0
  85. package/src/core/streaming.ts +128 -0
  86. package/src/core/transforms.ts +599 -0
  87. package/src/core/types.ts +570 -0
  88. package/src/core/validate.ts +243 -0
  89. package/src/index.ts +8 -0
  90. package/src/node/a2a/server.ts +470 -0
  91. package/src/node/api/server.ts +412 -0
  92. package/src/node/backends/duckdb.ts +130 -0
  93. package/src/node/backends/score-worker.ts +41 -0
  94. package/src/node/backends/workers.ts +212 -0
  95. package/src/node/config-file.ts +66 -0
  96. package/src/node/connectors/base.ts +57 -0
  97. package/src/node/connectors/bigquery.ts +61 -0
  98. package/src/node/connectors/databricks.ts +69 -0
  99. package/src/node/connectors/file.ts +350 -0
  100. package/src/node/connectors/hubspot.ts +62 -0
  101. package/src/node/connectors/index.ts +43 -0
  102. package/src/node/connectors/salesforce.ts +93 -0
  103. package/src/node/connectors/snowflake.ts +73 -0
  104. package/src/node/db/postgres.ts +173 -0
  105. package/src/node/db/sync.ts +103 -0
  106. package/src/node/dedupe-file.ts +156 -0
  107. package/src/node/index.ts +89 -0
  108. package/src/node/mcp/server.ts +940 -0
  109. package/src/node/tui/app.ts +756 -0
  110. package/src/node/tui/index.ts +6 -0
  111. package/src/node/tui/widgets.ts +128 -0
  112. package/tests/parity/scorer-ground-truth.test.ts +118 -0
  113. package/tests/smoke.test.ts +46 -0
  114. package/tests/unit/a2a-server.test.ts +175 -0
  115. package/tests/unit/ann-blocker.test.ts +117 -0
  116. package/tests/unit/api-server.test.ts +239 -0
  117. package/tests/unit/api.test.ts +77 -0
  118. package/tests/unit/autoconfig.test.ts +103 -0
  119. package/tests/unit/autofix.test.ts +71 -0
  120. package/tests/unit/blocker.test.ts +164 -0
  121. package/tests/unit/buildBlocksAsync.test.ts +63 -0
  122. package/tests/unit/cluster.test.ts +213 -0
  123. package/tests/unit/compare-clusters.test.ts +42 -0
  124. package/tests/unit/config-loader.test.ts +301 -0
  125. package/tests/unit/connectors-base.test.ts +48 -0
  126. package/tests/unit/cross-encoder-model.test.ts +198 -0
  127. package/tests/unit/cross-encoder.test.ts +173 -0
  128. package/tests/unit/db-connectors.test.ts +37 -0
  129. package/tests/unit/domain.test.ts +80 -0
  130. package/tests/unit/embedder.test.ts +151 -0
  131. package/tests/unit/evaluate.test.ts +85 -0
  132. package/tests/unit/explain.test.ts +73 -0
  133. package/tests/unit/golden.test.ts +97 -0
  134. package/tests/unit/graph-er.test.ts +173 -0
  135. package/tests/unit/hnsw-ann.test.ts +283 -0
  136. package/tests/unit/hubspot-connector.test.ts +118 -0
  137. package/tests/unit/ingest.test.ts +97 -0
  138. package/tests/unit/learned-blocking.test.ts +134 -0
  139. package/tests/unit/lineage.test.ts +135 -0
  140. package/tests/unit/match-one.test.ts +129 -0
  141. package/tests/unit/matchkey.test.ts +97 -0
  142. package/tests/unit/mcp-server.test.ts +183 -0
  143. package/tests/unit/memory.test.ts +119 -0
  144. package/tests/unit/pipeline.test.ts +118 -0
  145. package/tests/unit/pprl-protocol.test.ts +381 -0
  146. package/tests/unit/probabilistic.test.ts +494 -0
  147. package/tests/unit/profiler.test.ts +68 -0
  148. package/tests/unit/review-queue.test.ts +68 -0
  149. package/tests/unit/salesforce-connector.test.ts +148 -0
  150. package/tests/unit/scorer.test.ts +301 -0
  151. package/tests/unit/sensitivity.test.ts +154 -0
  152. package/tests/unit/standardize.test.ts +84 -0
  153. package/tests/unit/streaming.test.ts +82 -0
  154. package/tests/unit/transforms.test.ts +208 -0
  155. package/tests/unit/tui-widgets.test.ts +42 -0
  156. package/tests/unit/tui.test.ts +24 -0
  157. package/tests/unit/validate.test.ts +145 -0
  158. package/tests/unit/workers-parallel.test.ts +99 -0
  159. package/tests/unit/workers.test.ts +74 -0
  160. package/tsconfig.json +25 -0
  161. package/tsup.config.ts +37 -0
  162. package/vitest.config.ts +11 -0
@@ -0,0 +1,542 @@
1
+ /**
2
+ * cluster.ts — In-context LLM clustering: send blocks of borderline records
3
+ * to an LLM for direct cluster assignment. Ports `goldenmatch/core/llm_cluster.py`.
4
+ *
5
+ * Flow:
6
+ * 1. Pairs with candidateLo <= score < candidateHi form the borderline band (scores >= autoThreshold are auto-accepted; everything else passes through unchanged).
7
+ * 2. Build connected components over the borderline graph.
8
+ * 3. Oversized components split by dropping weakest edges first.
9
+ * 4. Each component (block) sent to LLM with a JSON cluster schema.
10
+ * 5. Pair scores synthesized from cluster membership + confidence.
11
+ *
12
+ * Degrades: cluster call fails -> pairwise fallback -> return input pairs.
13
+ * Edge-safe: fetch-only, no `node:` imports.
14
+ */
15
+
16
+ import type { Row, ScoredPair, LLMScorerConfig } from "../types.js";
17
+ import { makeScoredPair } from "../types.js";
18
+ import { BudgetTracker, countTokensApprox } from "./budget.js";
19
+ import type { BudgetSnapshot } from "./budget.js";
20
+ import { llmScorePairs, LLMHttpError } from "./scorer.js";
21
+ import type { LLMScoreResult } from "./scorer.js";
22
+
23
+ // ---------------------------------------------------------------------------
24
+ // Types
25
+ // ---------------------------------------------------------------------------
26
+
27
/**
 * One connected component (or split sub-component) of the borderline graph:
 * the record ids it contains plus the candidate pairs fully inside it.
 */
interface ClusterBlock {
  // Record ids belonging to this block (sorted ascending by the builders).
  readonly records: readonly number[];
  // Borderline pairs whose BOTH endpoints are in `records`.
  readonly pairs: readonly ScoredPair[];
}
31
+
32
/**
 * Parsed shape of the LLM's clustering answer (after validation in
 * `parseClusterResponse`).
 */
interface LLMClusterResponse {
  // Proposed duplicate groups; after parsing, each id appears in at most one cluster.
  readonly clusters: ReadonlyArray<{
    readonly members: readonly number[];
    // Model certainty (clamped to [0, 1]) that all members are the same entity.
    readonly confidence: number;
  }>;
  // Record ids not assigned to any cluster.
  readonly singletons: readonly number[];
}
39
+
40
+ // ---------------------------------------------------------------------------
41
+ // Public: llmClusterPairs
42
+ // ---------------------------------------------------------------------------
43
+
44
/**
 * Cluster-mode LLM scoring: resolve borderline pairs by asking the LLM to
 * cluster whole blocks of records at once, rather than judging each pair.
 *
 * Degradation ladder: no API key -> candidates pass through at original
 * scores; a failed cluster call -> pairwise `llmScorePairs` fallback; budget
 * exhausted -> remaining candidates keep their original scores.
 *
 * @param pairs  Scored candidate pairs from the cheap scorer.
 * @param rows   Source rows; each is looked up by its numeric `__row_id__`.
 * @param config Thresholds, provider/model selection, budget, block sizing.
 * @param apiKey Provider API key; when absent we degrade (no LLM calls).
 * @returns Re-scored pairs plus a budget usage snapshot.
 */
export async function llmClusterPairs(
  pairs: readonly ScoredPair[],
  rows: readonly Row[],
  config: LLMScorerConfig,
  apiKey?: string,
): Promise<LLMScoreResult> {
  const budget = new BudgetTracker(
    config.budget ?? {},
    config.model ?? "gpt-4o-mini",
  );

  if (pairs.length === 0) {
    return { pairs: [], budget: budget.snapshot() };
  }

  // Tier partition.
  // NOTE(review): pairs with candidateHi <= score < autoThreshold fall into
  // `below` and pass through unchanged — confirm candidateHi is intended as
  // the band's upper bound (header comment mentions autoThreshold).
  const autoAccept: ScoredPair[] = [];
  const candidates: ScoredPair[] = [];
  const below: ScoredPair[] = [];
  for (const p of pairs) {
    if (p.score >= config.autoThreshold) autoAccept.push(p);
    else if (p.score >= config.candidateLo && p.score < config.candidateHi) {
      candidates.push(p);
    } else below.push(p);
  }

  // Result scaffold: auto-accepted pairs are promoted to score 1.0,
  // below-band pairs are passed through untouched.
  const result: ScoredPair[] = [];
  for (const p of autoAccept) result.push(makeScoredPair(p.idA, p.idB, 1.0));
  for (const p of below) result.push(p);

  if (candidates.length === 0) {
    return { pairs: result, budget: budget.snapshot() };
  }

  // Build row lookup keyed by the internal numeric row id.
  const rowById = new Map<number, Row>();
  for (const r of rows) {
    const id = r["__row_id__"];
    if (typeof id === "number") rowById.set(id, r);
  }

  // Display columns: first 6 non-internal columns (insertion order of the
  // Set, i.e. first-seen order across rows).
  const cols = new Set<string>();
  for (const r of rows) {
    for (const k of Object.keys(r)) {
      if (!k.startsWith("__")) cols.add(k);
    }
  }
  const displayCols = [...cols].slice(0, 6);

  // Build connected components over the borderline graph.
  const clusterMax = config.clusterMaxSize ?? 20;
  const clusterMin = config.clusterMinSize ?? 2;
  const components = buildComponents(candidates);

  // Any unknown provider value falls back to "openai".
  const provider = (config.provider === "anthropic" ? "anthropic" : "openai") as
    | "openai"
    | "anthropic";
  const model =
    config.model ??
    (provider === "openai" ? "gpt-4o-mini" : "claude-haiku-4-5-20251001");

  // Pairs we still need to emit: start with all candidates, remove as we resolve.
  const unresolved = new Set<ScoredPair>(candidates);

  // No API key -> degrade: keep candidates at original scores.
  if (!apiKey) {
    for (const p of candidates) result.push(p);
    return { pairs: result, budget: budget.snapshot() };
  }

  for (const component of components) {
    if (budget.exhausted) break;

    // Tiny components: fall back to pairwise scoring.
    if (component.records.length < clusterMin) {
      const fallback = await llmScorePairs(component.pairs, rows, config, apiKey);
      for (const p of fallback.pairs) result.push(p);
      for (const p of component.pairs) unresolved.delete(p);
      continue;
    }

    // Split oversized components by trimming weakest edges first.
    // Pairs dropped during splitting stay in `unresolved` and are emitted
    // with their original scores at the end.
    const blocks = splitComponent(component, clusterMax);

    for (const block of blocks) {
      if (budget.exhausted) break;

      let clusterResult: LLMClusterResponse | null = null;
      try {
        clusterResult = await callLlmCluster(
          block.records,
          rowById,
          displayCols,
          provider,
          model,
          apiKey,
          budget,
        );
      } catch (err) {
        // Cluster call failed (HTTP error, budget check, bad payload):
        // log and fall through to the pairwise path below.
        // eslint-disable-next-line no-console
        console.warn(
          "llm_cluster call failed, falling back to pairwise:",
          err instanceof Error ? err.message : String(err),
        );
        clusterResult = null;
      }

      if (clusterResult === null) {
        // Pairwise fallback.
        const fallback = await llmScorePairs(block.pairs, rows, config, apiKey);
        for (const p of fallback.pairs) result.push(p);
      } else {
        // Synthesize pair scores from cluster membership + confidence.
        const synthesized = applyClusterResults(clusterResult, block.pairs);
        for (const p of synthesized) result.push(p);
      }
      for (const p of block.pairs) unresolved.delete(p);
    }
  }

  // Anything still unresolved (e.g. budget exhausted early): keep original.
  for (const p of unresolved) result.push(p);

  return { pairs: result, budget: budget.snapshot() };
}
170
+
171
+ // ---------------------------------------------------------------------------
172
+ // Component graph construction
173
+ // ---------------------------------------------------------------------------
174
+
175
+ function buildComponents(pairs: readonly ScoredPair[]): ClusterBlock[] {
176
+ const adj = new Map<number, Set<number>>();
177
+ const recordPairs = new Map<number, ScoredPair[]>();
178
+
179
+ for (const p of pairs) {
180
+ if (!adj.has(p.idA)) adj.set(p.idA, new Set());
181
+ if (!adj.has(p.idB)) adj.set(p.idB, new Set());
182
+ adj.get(p.idA)!.add(p.idB);
183
+ adj.get(p.idB)!.add(p.idA);
184
+ if (!recordPairs.has(p.idA)) recordPairs.set(p.idA, []);
185
+ if (!recordPairs.has(p.idB)) recordPairs.set(p.idB, []);
186
+ recordPairs.get(p.idA)!.push(p);
187
+ recordPairs.get(p.idB)!.push(p);
188
+ }
189
+
190
+ const visited = new Set<number>();
191
+ const components: ClusterBlock[] = [];
192
+
193
+ for (const start of adj.keys()) {
194
+ if (visited.has(start)) continue;
195
+ const members: number[] = [];
196
+ const stack = [start];
197
+ while (stack.length > 0) {
198
+ const node = stack.pop()!;
199
+ if (visited.has(node)) continue;
200
+ visited.add(node);
201
+ members.push(node);
202
+ const neighbors = adj.get(node);
203
+ if (neighbors) {
204
+ for (const nb of neighbors) {
205
+ if (!visited.has(nb)) stack.push(nb);
206
+ }
207
+ }
208
+ }
209
+
210
+ // Collect pairs that live entirely within this component.
211
+ const memberSet = new Set(members);
212
+ const seen = new Set<string>();
213
+ const compPairs: ScoredPair[] = [];
214
+ for (const rec of members) {
215
+ const ps = recordPairs.get(rec);
216
+ if (!ps) continue;
217
+ for (const p of ps) {
218
+ if (!memberSet.has(p.idA) || !memberSet.has(p.idB)) continue;
219
+ const key = `${p.idA}:${p.idB}`;
220
+ if (seen.has(key)) continue;
221
+ seen.add(key);
222
+ compPairs.push(p);
223
+ }
224
+ }
225
+
226
+ components.push({ records: members.sort((a, b) => a - b), pairs: compPairs });
227
+ }
228
+
229
+ return components;
230
+ }
231
+
232
+ // ---------------------------------------------------------------------------
233
+ // Component splitting (oversized blocks)
234
+ // ---------------------------------------------------------------------------
235
+
236
+ function splitComponent(
237
+ component: ClusterBlock,
238
+ maxSize: number,
239
+ ): ClusterBlock[] {
240
+ if (component.records.length <= maxSize) return [component];
241
+
242
+ // Work with a mutable adjacency map.
243
+ const adj = new Map<number, Set<number>>();
244
+ for (const rec of component.records) adj.set(rec, new Set());
245
+ for (const p of component.pairs) {
246
+ adj.get(p.idA)!.add(p.idB);
247
+ adj.get(p.idB)!.add(p.idA);
248
+ }
249
+
250
+ // Edges sorted by score ascending (weakest first).
251
+ const edges = [...component.pairs].sort((a, b) => a.score - b.score);
252
+ const removed = new Set<ScoredPair>();
253
+
254
+ for (const e of edges) {
255
+ adj.get(e.idA)?.delete(e.idB);
256
+ adj.get(e.idB)?.delete(e.idA);
257
+ removed.add(e);
258
+ const max = largestComponentSize(adj, component.records);
259
+ if (max <= maxSize) break;
260
+ }
261
+
262
+ // Rebuild components with the surviving edges.
263
+ const remainingPairs = component.pairs.filter((p) => !removed.has(p));
264
+ const remainingAdj = new Map<number, Set<number>>();
265
+ for (const rec of component.records) remainingAdj.set(rec, new Set());
266
+ for (const p of remainingPairs) {
267
+ remainingAdj.get(p.idA)!.add(p.idB);
268
+ remainingAdj.get(p.idB)!.add(p.idA);
269
+ }
270
+
271
+ const visited = new Set<number>();
272
+ const blocks: ClusterBlock[] = [];
273
+ for (const start of component.records) {
274
+ if (visited.has(start)) continue;
275
+ const comp: number[] = [];
276
+ const stack = [start];
277
+ while (stack.length > 0) {
278
+ const node = stack.pop()!;
279
+ if (visited.has(node)) continue;
280
+ visited.add(node);
281
+ comp.push(node);
282
+ const nbs = remainingAdj.get(node);
283
+ if (nbs) {
284
+ for (const nb of nbs) if (!visited.has(nb)) stack.push(nb);
285
+ }
286
+ }
287
+ const memberSet = new Set(comp);
288
+ const compPairs = remainingPairs.filter(
289
+ (p) => memberSet.has(p.idA) && memberSet.has(p.idB),
290
+ );
291
+ blocks.push({ records: comp.sort((a, b) => a - b), pairs: compPairs });
292
+ }
293
+
294
+ return blocks;
295
+ }
296
+
297
+ function largestComponentSize(
298
+ adj: ReadonlyMap<number, ReadonlySet<number>>,
299
+ records: readonly number[],
300
+ ): number {
301
+ const visited = new Set<number>();
302
+ let max = 0;
303
+ for (const start of records) {
304
+ if (visited.has(start)) continue;
305
+ let size = 0;
306
+ const stack = [start];
307
+ while (stack.length > 0) {
308
+ const node = stack.pop()!;
309
+ if (visited.has(node)) continue;
310
+ visited.add(node);
311
+ size++;
312
+ const nbs = adj.get(node);
313
+ if (nbs) {
314
+ for (const nb of nbs) if (!visited.has(nb)) stack.push(nb);
315
+ }
316
+ }
317
+ if (size > max) max = size;
318
+ }
319
+ return max;
320
+ }
321
+
322
+ // ---------------------------------------------------------------------------
323
+ // LLM cluster call
324
+ // ---------------------------------------------------------------------------
325
+
326
/**
 * Send one block of records to the LLM and parse the returned clustering.
 *
 * Builds a plain-text prompt listing each record's display columns, checks
 * the estimated token cost against the budget, dispatches to the provider,
 * records actual (or estimated) usage, and validates the response.
 *
 * @throws Error when the budget cannot cover the estimated prompt tokens.
 * @throws LLMHttpError (via the provider helpers) on non-2xx responses.
 */
async function callLlmCluster(
  recordIds: readonly number[],
  rowById: ReadonlyMap<number, Row>,
  displayCols: readonly string[],
  provider: "openai" | "anthropic",
  model: string,
  apiKey: string,
  budget: BudgetTracker,
): Promise<LLMClusterResponse> {
  const lines: string[] = [
    "Group these records into clusters of duplicates. Return JSON only.",
    "",
    "Records:",
  ];
  // One line per record: its id plus display-column values joined by " | ".
  // Missing rows/values render as empty strings rather than failing.
  for (const rid of recordIds) {
    const row = rowById.get(rid) ?? {};
    const parts = displayCols.map((c) => String(row[c] ?? ""));
    lines.push(` [${rid}] ${parts.join(" | ")}`);
  }
  lines.push("");
  lines.push(
    'Return JSON: {"clusters": [{"members": [id1, id2, ...], "confidence": 0.0-1.0}, ...], "singletons": [id1, ...]}',
  );
  lines.push("Rules:");
  lines.push("- Each record appears in exactly one cluster or as a singleton");
  lines.push("- confidence = how certain you are that all members are the same entity");
  lines.push("- Only group records that are clearly the same real-world entity");

  const prompt = lines.join("\n");
  // Budget gate on the estimated prompt size before spending network/API cost.
  const estTokens = countTokensApprox(prompt);
  if (!budget.canSend(estTokens)) {
    throw new Error("Budget insufficient for this block");
  }

  // Response budget scales with block size, clamped to [200, 2000] tokens.
  const maxTokens = Math.min(2000, Math.max(200, recordIds.length * 30));
  const { text, inputTokens, outputTokens } =
    provider === "openai"
      ? await openaiJson(prompt, apiKey, model, maxTokens)
      : await anthropicJson(prompt, apiKey, model, maxTokens);

  // `||` (not `??`) is deliberate: a reported 0 falls back to the estimate.
  budget.record(inputTokens || estTokens, outputTokens || maxTokens, model);
  return parseClusterResponse(text, recordIds);
}
369
+
370
+ async function openaiJson(
371
+ prompt: string,
372
+ apiKey: string,
373
+ model: string,
374
+ maxTokens: number,
375
+ ): Promise<{ text: string; inputTokens: number; outputTokens: number }> {
376
+ const resp = await fetch("https://api.openai.com/v1/chat/completions", {
377
+ method: "POST",
378
+ headers: {
379
+ Authorization: `Bearer ${apiKey}`,
380
+ "Content-Type": "application/json",
381
+ },
382
+ body: JSON.stringify({
383
+ model,
384
+ messages: [{ role: "user", content: prompt }],
385
+ temperature: 0,
386
+ max_tokens: maxTokens,
387
+ }),
388
+ });
389
+ if (!resp.ok) {
390
+ const body = await resp.text().catch(() => "");
391
+ throw new LLMHttpError(resp.status, `OpenAI ${resp.status}: ${body.slice(0, 200)}`);
392
+ }
393
+ const data = (await resp.json()) as {
394
+ choices?: Array<{ message?: { content?: string } }>;
395
+ usage?: { prompt_tokens?: number; completion_tokens?: number };
396
+ };
397
+ return {
398
+ text: data.choices?.[0]?.message?.content ?? "",
399
+ inputTokens: data.usage?.prompt_tokens ?? 0,
400
+ outputTokens: data.usage?.completion_tokens ?? 0,
401
+ };
402
+ }
403
+
404
+ async function anthropicJson(
405
+ prompt: string,
406
+ apiKey: string,
407
+ model: string,
408
+ maxTokens: number,
409
+ ): Promise<{ text: string; inputTokens: number; outputTokens: number }> {
410
+ const resp = await fetch("https://api.anthropic.com/v1/messages", {
411
+ method: "POST",
412
+ headers: {
413
+ "x-api-key": apiKey,
414
+ "content-type": "application/json",
415
+ "anthropic-version": "2023-06-01",
416
+ },
417
+ body: JSON.stringify({
418
+ model,
419
+ max_tokens: maxTokens,
420
+ messages: [{ role: "user", content: prompt }],
421
+ }),
422
+ });
423
+ if (!resp.ok) {
424
+ const body = await resp.text().catch(() => "");
425
+ throw new LLMHttpError(resp.status, `Anthropic ${resp.status}: ${body.slice(0, 200)}`);
426
+ }
427
+ const data = (await resp.json()) as {
428
+ content?: Array<{ text?: string }>;
429
+ usage?: { input_tokens?: number; output_tokens?: number };
430
+ };
431
+ return {
432
+ text: data.content?.[0]?.text ?? "",
433
+ inputTokens: data.usage?.input_tokens ?? 0,
434
+ outputTokens: data.usage?.output_tokens ?? 0,
435
+ };
436
+ }
437
+
438
+ // ---------------------------------------------------------------------------
439
+ // Response parsing
440
+ // ---------------------------------------------------------------------------
441
+
442
+ function parseClusterResponse(
443
+ response: string,
444
+ validIds: readonly number[],
445
+ ): LLMClusterResponse {
446
+ const validSet = new Set(validIds);
447
+ const fallback: LLMClusterResponse = {
448
+ clusters: [],
449
+ singletons: [...validIds],
450
+ };
451
+
452
+ let text = response.trim();
453
+
454
+ // Strip markdown code fences if present.
455
+ if (text.includes("```")) {
456
+ const parts = text.split("```");
457
+ for (const raw of parts) {
458
+ let p = raw.trim();
459
+ if (p.startsWith("json")) p = p.slice(4).trim();
460
+ if (p.startsWith("{")) {
461
+ text = p;
462
+ break;
463
+ }
464
+ }
465
+ }
466
+
467
+ // Extract first balanced JSON object.
468
+ let parsed: unknown;
469
+ try {
470
+ parsed = JSON.parse(text);
471
+ } catch {
472
+ const start = text.indexOf("{");
473
+ const end = text.lastIndexOf("}");
474
+ if (start >= 0 && end > start) {
475
+ try {
476
+ parsed = JSON.parse(text.slice(start, end + 1));
477
+ } catch {
478
+ return fallback;
479
+ }
480
+ } else {
481
+ return fallback;
482
+ }
483
+ }
484
+
485
+ if (!parsed || typeof parsed !== "object") return fallback;
486
+ const obj = parsed as {
487
+ clusters?: Array<{ members?: unknown; confidence?: unknown }>;
488
+ };
489
+
490
+ const clusters: Array<{ members: number[]; confidence: number }> = [];
491
+ const assigned = new Set<number>();
492
+
493
+ for (const c of obj.clusters ?? []) {
494
+ const membersRaw = Array.isArray(c.members) ? c.members : [];
495
+ const conf = typeof c.confidence === "number" ? c.confidence : 0.5;
496
+ const clamped = Math.max(0, Math.min(1, conf));
497
+ const validMembers = membersRaw
498
+ .filter((m): m is number => typeof m === "number")
499
+ .filter((m) => validSet.has(m) && !assigned.has(m));
500
+ if (validMembers.length >= 2) {
501
+ clusters.push({ members: validMembers, confidence: clamped });
502
+ for (const m of validMembers) assigned.add(m);
503
+ }
504
+ }
505
+
506
+ const singletons = validIds.filter((rid) => !assigned.has(rid));
507
+ return { clusters, singletons };
508
+ }
509
+
510
+ // ---------------------------------------------------------------------------
511
+ // Synthesize pair_scores from cluster membership
512
+ // ---------------------------------------------------------------------------
513
+
514
+ function applyClusterResults(
515
+ result: LLMClusterResponse,
516
+ pairs: readonly ScoredPair[],
517
+ ): ScoredPair[] {
518
+ // record_id -> (cluster_index, confidence)
519
+ const recordCluster = new Map<number, { idx: number; conf: number }>();
520
+ result.clusters.forEach((c, idx) => {
521
+ for (const m of c.members) {
522
+ recordCluster.set(m, { idx, conf: c.confidence });
523
+ }
524
+ });
525
+
526
+ const out: ScoredPair[] = [];
527
+ for (const p of pairs) {
528
+ const ca = recordCluster.get(p.idA);
529
+ const cb = recordCluster.get(p.idB);
530
+ if (ca !== undefined && cb !== undefined && ca.idx === cb.idx) {
531
+ // Same cluster: use cluster confidence.
532
+ out.push(makeScoredPair(p.idA, p.idB, ca.conf));
533
+ } else {
534
+ // Different cluster or singleton: rejected.
535
+ out.push(makeScoredPair(p.idA, p.idB, 0));
536
+ }
537
+ }
538
+ return out;
539
+ }
540
+
541
+ // Re-export for convenience.
542
+ export type { BudgetSnapshot, LLMScoreResult };