goldenmatch 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +140 -0
- package/dist/cli.cjs +6079 -0
- package/dist/cli.cjs.map +1 -0
- package/dist/cli.d.cts +1 -0
- package/dist/cli.d.ts +1 -0
- package/dist/cli.js +6076 -0
- package/dist/cli.js.map +1 -0
- package/dist/core/index.cjs +8449 -0
- package/dist/core/index.cjs.map +1 -0
- package/dist/core/index.d.cts +1972 -0
- package/dist/core/index.d.ts +1972 -0
- package/dist/core/index.js +8318 -0
- package/dist/core/index.js.map +1 -0
- package/dist/index.cjs +8449 -0
- package/dist/index.cjs.map +1 -0
- package/dist/index.d.cts +2 -0
- package/dist/index.d.ts +2 -0
- package/dist/index.js +8318 -0
- package/dist/index.js.map +1 -0
- package/dist/node/backends/score-worker.cjs +934 -0
- package/dist/node/backends/score-worker.cjs.map +1 -0
- package/dist/node/backends/score-worker.d.cts +14 -0
- package/dist/node/backends/score-worker.d.ts +14 -0
- package/dist/node/backends/score-worker.js +932 -0
- package/dist/node/backends/score-worker.js.map +1 -0
- package/dist/node/index.cjs +11430 -0
- package/dist/node/index.cjs.map +1 -0
- package/dist/node/index.d.cts +554 -0
- package/dist/node/index.d.ts +554 -0
- package/dist/node/index.js +11277 -0
- package/dist/node/index.js.map +1 -0
- package/dist/types-DhUdX5Rc.d.cts +304 -0
- package/dist/types-DhUdX5Rc.d.ts +304 -0
- package/examples/01-basic-dedupe.ts +60 -0
- package/examples/02-match-two-datasets.ts +48 -0
- package/examples/03-csv-file-pipeline.ts +62 -0
- package/examples/04-string-scoring.ts +63 -0
- package/examples/05-custom-config.ts +94 -0
- package/examples/06-probabilistic-fs.ts +72 -0
- package/examples/07-pprl-privacy.ts +76 -0
- package/examples/08-streaming.ts +79 -0
- package/examples/09-llm-scorer.ts +79 -0
- package/examples/10-explain.ts +60 -0
- package/examples/11-evaluate.ts +61 -0
- package/examples/README.md +53 -0
- package/package.json +66 -0
- package/src/cli.ts +372 -0
- package/src/core/ann-blocker.ts +593 -0
- package/src/core/api.ts +220 -0
- package/src/core/autoconfig.ts +363 -0
- package/src/core/autofix.ts +102 -0
- package/src/core/blocker.ts +655 -0
- package/src/core/cluster.ts +699 -0
- package/src/core/compare-clusters.ts +176 -0
- package/src/core/config/loader.ts +869 -0
- package/src/core/cross-encoder.ts +614 -0
- package/src/core/data.ts +430 -0
- package/src/core/domain.ts +277 -0
- package/src/core/embedder.ts +562 -0
- package/src/core/evaluate.ts +156 -0
- package/src/core/explain.ts +352 -0
- package/src/core/golden.ts +524 -0
- package/src/core/graph-er.ts +371 -0
- package/src/core/index.ts +314 -0
- package/src/core/ingest.ts +112 -0
- package/src/core/learned-blocking.ts +305 -0
- package/src/core/lineage.ts +221 -0
- package/src/core/llm/budget.ts +258 -0
- package/src/core/llm/cluster.ts +542 -0
- package/src/core/llm/scorer.ts +396 -0
- package/src/core/match-one.ts +95 -0
- package/src/core/matchkey.ts +97 -0
- package/src/core/memory/corrections.ts +179 -0
- package/src/core/memory/learner.ts +218 -0
- package/src/core/memory/store.ts +114 -0
- package/src/core/pipeline.ts +366 -0
- package/src/core/pprl/protocol.ts +216 -0
- package/src/core/probabilistic.ts +511 -0
- package/src/core/profiler.ts +212 -0
- package/src/core/quality.ts +197 -0
- package/src/core/review-queue.ts +177 -0
- package/src/core/scorer.ts +855 -0
- package/src/core/sensitivity.ts +196 -0
- package/src/core/standardize.ts +279 -0
- package/src/core/streaming.ts +128 -0
- package/src/core/transforms.ts +599 -0
- package/src/core/types.ts +570 -0
- package/src/core/validate.ts +243 -0
- package/src/index.ts +8 -0
- package/src/node/a2a/server.ts +470 -0
- package/src/node/api/server.ts +412 -0
- package/src/node/backends/duckdb.ts +130 -0
- package/src/node/backends/score-worker.ts +41 -0
- package/src/node/backends/workers.ts +212 -0
- package/src/node/config-file.ts +66 -0
- package/src/node/connectors/base.ts +57 -0
- package/src/node/connectors/bigquery.ts +61 -0
- package/src/node/connectors/databricks.ts +69 -0
- package/src/node/connectors/file.ts +350 -0
- package/src/node/connectors/hubspot.ts +62 -0
- package/src/node/connectors/index.ts +43 -0
- package/src/node/connectors/salesforce.ts +93 -0
- package/src/node/connectors/snowflake.ts +73 -0
- package/src/node/db/postgres.ts +173 -0
- package/src/node/db/sync.ts +103 -0
- package/src/node/dedupe-file.ts +156 -0
- package/src/node/index.ts +89 -0
- package/src/node/mcp/server.ts +940 -0
- package/src/node/tui/app.ts +756 -0
- package/src/node/tui/index.ts +6 -0
- package/src/node/tui/widgets.ts +128 -0
- package/tests/parity/scorer-ground-truth.test.ts +118 -0
- package/tests/smoke.test.ts +46 -0
- package/tests/unit/a2a-server.test.ts +175 -0
- package/tests/unit/ann-blocker.test.ts +117 -0
- package/tests/unit/api-server.test.ts +239 -0
- package/tests/unit/api.test.ts +77 -0
- package/tests/unit/autoconfig.test.ts +103 -0
- package/tests/unit/autofix.test.ts +71 -0
- package/tests/unit/blocker.test.ts +164 -0
- package/tests/unit/buildBlocksAsync.test.ts +63 -0
- package/tests/unit/cluster.test.ts +213 -0
- package/tests/unit/compare-clusters.test.ts +42 -0
- package/tests/unit/config-loader.test.ts +301 -0
- package/tests/unit/connectors-base.test.ts +48 -0
- package/tests/unit/cross-encoder-model.test.ts +198 -0
- package/tests/unit/cross-encoder.test.ts +173 -0
- package/tests/unit/db-connectors.test.ts +37 -0
- package/tests/unit/domain.test.ts +80 -0
- package/tests/unit/embedder.test.ts +151 -0
- package/tests/unit/evaluate.test.ts +85 -0
- package/tests/unit/explain.test.ts +73 -0
- package/tests/unit/golden.test.ts +97 -0
- package/tests/unit/graph-er.test.ts +173 -0
- package/tests/unit/hnsw-ann.test.ts +283 -0
- package/tests/unit/hubspot-connector.test.ts +118 -0
- package/tests/unit/ingest.test.ts +97 -0
- package/tests/unit/learned-blocking.test.ts +134 -0
- package/tests/unit/lineage.test.ts +135 -0
- package/tests/unit/match-one.test.ts +129 -0
- package/tests/unit/matchkey.test.ts +97 -0
- package/tests/unit/mcp-server.test.ts +183 -0
- package/tests/unit/memory.test.ts +119 -0
- package/tests/unit/pipeline.test.ts +118 -0
- package/tests/unit/pprl-protocol.test.ts +381 -0
- package/tests/unit/probabilistic.test.ts +494 -0
- package/tests/unit/profiler.test.ts +68 -0
- package/tests/unit/review-queue.test.ts +68 -0
- package/tests/unit/salesforce-connector.test.ts +148 -0
- package/tests/unit/scorer.test.ts +301 -0
- package/tests/unit/sensitivity.test.ts +154 -0
- package/tests/unit/standardize.test.ts +84 -0
- package/tests/unit/streaming.test.ts +82 -0
- package/tests/unit/transforms.test.ts +208 -0
- package/tests/unit/tui-widgets.test.ts +42 -0
- package/tests/unit/tui.test.ts +24 -0
- package/tests/unit/validate.test.ts +145 -0
- package/tests/unit/workers-parallel.test.ts +99 -0
- package/tests/unit/workers.test.ts +74 -0
- package/tsconfig.json +25 -0
- package/tsup.config.ts +37 -0
- package/vitest.config.ts +11 -0
|
@@ -0,0 +1,614 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* cross-encoder.ts — LLM-based pair reranking ("cross-encoder lite").
|
|
3
|
+
*
|
|
4
|
+
* The Python port uses ONNX/sentence-transformers cross-encoders, which need
|
|
5
|
+
* native deps. This edge-safe variant performs zero-shot reranking by asking
|
|
6
|
+
* an LLM (OpenAI / Anthropic) for a 0..1 match score on borderline pairs.
|
|
7
|
+
*
|
|
8
|
+
* - Borderline pairs are identified by `band` around the matchkey threshold
|
|
9
|
+
* and/or top-N highest fuzzy scores below the auto-accept cutoff.
|
|
10
|
+
* - The combined score is `0.5 * original + 0.5 * rerank` by default.
|
|
11
|
+
* - Budget tracking uses BudgetTracker; on any HTTP failure we degrade to
|
|
12
|
+
* the original score for that pair.
|
|
13
|
+
*
|
|
14
|
+
* Edge-safe: uses global `fetch()` only.
|
|
15
|
+
*/
|
|
16
|
+
|
|
17
|
+
import type { BudgetConfig, MatchkeyConfig, Row, ScoredPair } from "./types.js";
|
|
18
|
+
import { makeScoredPair } from "./types.js";
|
|
19
|
+
import { BudgetTracker, countTokensApprox } from "./llm/budget.js";
|
|
20
|
+
|
|
21
|
+
// ---------------------------------------------------------------------------
|
|
22
|
+
// Public types
|
|
23
|
+
// ---------------------------------------------------------------------------
|
|
24
|
+
|
|
25
|
+
/** Hosted LLM providers supported for zero-shot pair scoring. */
export type CrossEncoderProvider = "openai" | "anthropic";

/** Reranker backend: hosted LLM ("llm") vs local ONNX model ("cross-encoder"). */
export type CrossEncoderReranker = "llm" | "cross-encoder";

/** Options for the optional local ONNX cross-encoder (CrossEncoderModel). */
export interface CrossEncoderModelOptions {
  /** HuggingFace model id. Default "Xenova/ms-marco-MiniLM-L-6-v2". */
  readonly model?: string;
  /** Execution device. Default "cpu". */
  readonly device?: "cpu" | "webgpu";
  /** Use quantized weights (q8). Default true. */
  readonly quantized?: boolean;
}

/** Options accepted by rerankPair / rerankTopPairs. */
export interface CrossEncoderOptions {
  /**
   * Reranker backend. `"llm"` (default) uses OpenAI/Anthropic.
   * `"cross-encoder"` loads `@huggingface/transformers` and runs a real
   * ONNX cross-encoder model locally; falls back to LLM on load/inference
   * failure.
   */
  readonly reranker?: CrossEncoderReranker;
  /** LLM provider; inferred from the API key prefix when omitted. */
  readonly provider?: CrossEncoderProvider;
  /** Model id override (LLM model, or HF model id for the cross-encoder). */
  readonly model?: string;
  /** API key; falls back to OPENAI_API_KEY / ANTHROPIC_API_KEY env vars. */
  readonly apiKey?: string;
  /** Device for cross-encoder model (when reranker="cross-encoder"). */
  readonly device?: "cpu" | "webgpu";
  /** Use quantized cross-encoder weights (q8). Default true. */
  readonly quantized?: boolean;
  /** Re-rank pairs scoring within `band` of `mk.threshold` (default 0.1). */
  readonly band?: number;
  /** Maximum number of pairs to rerank, ranked highest to lowest. */
  readonly topN?: number;
  /** Weight given to the LLM rerank vs the original score. Default 0.5. */
  readonly rerankWeight?: number;
  /** Budget cap shared with regular LLM scoring. */
  readonly budget?: BudgetConfig;
  /** Optional override for retry attempts. Default 2. */
  readonly maxRetries?: number;
}
|
|
64
|
+
|
|
65
|
+
// ---------------------------------------------------------------------------
|
|
66
|
+
// Helpers
|
|
67
|
+
// ---------------------------------------------------------------------------
|
|
68
|
+
|
|
69
|
+
function readEnv(key: string): string | undefined {
|
|
70
|
+
const proc = (globalThis as { process?: { env?: Record<string, string | undefined> } }).process;
|
|
71
|
+
return proc?.env?.[key];
|
|
72
|
+
}
|
|
73
|
+
|
|
74
|
+
function detectProvider(
|
|
75
|
+
apiKey: string | undefined,
|
|
76
|
+
configProvider: CrossEncoderProvider | undefined,
|
|
77
|
+
): CrossEncoderProvider {
|
|
78
|
+
if (configProvider) return configProvider;
|
|
79
|
+
if (apiKey?.startsWith("sk-ant-")) return "anthropic";
|
|
80
|
+
return "openai";
|
|
81
|
+
}
|
|
82
|
+
|
|
83
|
+
function defaultModel(provider: CrossEncoderProvider): string {
|
|
84
|
+
return provider === "openai" ? "gpt-4o-mini" : "claude-haiku-4-5-20251001";
|
|
85
|
+
}
|
|
86
|
+
|
|
87
|
+
async function sleep(ms: number): Promise<void> {
|
|
88
|
+
await new Promise<void>((resolve) => setTimeout(resolve, ms));
|
|
89
|
+
}
|
|
90
|
+
|
|
91
|
+
function summariseRow(row: Row, fields: readonly string[]): string {
|
|
92
|
+
const parts: string[] = [];
|
|
93
|
+
for (const f of fields) {
|
|
94
|
+
const v = row[f];
|
|
95
|
+
if (v === null || v === undefined || v === "") continue;
|
|
96
|
+
parts.push(`${f}: ${String(v)}`);
|
|
97
|
+
}
|
|
98
|
+
return parts.join(" | ").slice(0, 300);
|
|
99
|
+
}
|
|
100
|
+
|
|
101
|
+
function buildPrompt(
|
|
102
|
+
rowA: Row,
|
|
103
|
+
rowB: Row,
|
|
104
|
+
fields: readonly string[],
|
|
105
|
+
): string {
|
|
106
|
+
return [
|
|
107
|
+
"Are these two records the same real-world entity?",
|
|
108
|
+
'Answer with strict JSON: {"score": <number 0..1>}.',
|
|
109
|
+
"1.0 = certainly the same. 0.0 = certainly different.",
|
|
110
|
+
"",
|
|
111
|
+
`A: ${summariseRow(rowA, fields)}`,
|
|
112
|
+
`B: ${summariseRow(rowB, fields)}`,
|
|
113
|
+
].join("\n");
|
|
114
|
+
}
|
|
115
|
+
|
|
116
|
+
/** Extract a 0..1 score from an LLM response. Tolerates loose formatting. */
|
|
117
|
+
function parseScore(text: string): number | null {
|
|
118
|
+
const trimmed = text.trim();
|
|
119
|
+
// Try strict JSON first.
|
|
120
|
+
try {
|
|
121
|
+
const obj = JSON.parse(trimmed) as { score?: unknown };
|
|
122
|
+
if (typeof obj.score === "number" && Number.isFinite(obj.score)) {
|
|
123
|
+
return Math.min(1, Math.max(0, obj.score));
|
|
124
|
+
}
|
|
125
|
+
} catch {
|
|
126
|
+
// fall through
|
|
127
|
+
}
|
|
128
|
+
// Try to find a JSON object inside.
|
|
129
|
+
const match = trimmed.match(/\{[^}]*"score"\s*:\s*([0-9.]+)[^}]*\}/);
|
|
130
|
+
if (match) {
|
|
131
|
+
const v = parseFloat(match[1]!);
|
|
132
|
+
if (Number.isFinite(v)) return Math.min(1, Math.max(0, v));
|
|
133
|
+
}
|
|
134
|
+
// Last resort: a bare number.
|
|
135
|
+
const num = parseFloat(trimmed);
|
|
136
|
+
if (Number.isFinite(num)) return Math.min(1, Math.max(0, num));
|
|
137
|
+
return null;
|
|
138
|
+
}
|
|
139
|
+
|
|
140
|
+
// ---------------------------------------------------------------------------
|
|
141
|
+
// Provider calls
|
|
142
|
+
// ---------------------------------------------------------------------------
|
|
143
|
+
|
|
144
|
+
/** Normalised result of one provider HTTP call. */
interface CallResult {
  /** Raw text of the model's reply (empty string when absent). */
  readonly text: string;
  /** Prompt tokens reported by the provider (0 when not reported). */
  readonly inputTokens: number;
  /** Completion tokens reported by the provider (0 when not reported). */
  readonly outputTokens: number;
}
|
|
149
|
+
|
|
150
|
+
async function callOpenAI(
|
|
151
|
+
prompt: string,
|
|
152
|
+
apiKey: string,
|
|
153
|
+
model: string,
|
|
154
|
+
maxRetries: number,
|
|
155
|
+
): Promise<CallResult> {
|
|
156
|
+
return await callWithRetry(
|
|
157
|
+
"https://api.openai.com/v1/chat/completions",
|
|
158
|
+
{
|
|
159
|
+
method: "POST",
|
|
160
|
+
headers: {
|
|
161
|
+
Authorization: `Bearer ${apiKey}`,
|
|
162
|
+
"Content-Type": "application/json",
|
|
163
|
+
},
|
|
164
|
+
body: JSON.stringify({
|
|
165
|
+
model,
|
|
166
|
+
messages: [{ role: "user", content: prompt }],
|
|
167
|
+
temperature: 0,
|
|
168
|
+
max_tokens: 32,
|
|
169
|
+
response_format: { type: "json_object" },
|
|
170
|
+
}),
|
|
171
|
+
},
|
|
172
|
+
maxRetries,
|
|
173
|
+
(data) => {
|
|
174
|
+
const d = data as {
|
|
175
|
+
choices?: Array<{ message?: { content?: string } }>;
|
|
176
|
+
usage?: { prompt_tokens?: number; completion_tokens?: number };
|
|
177
|
+
};
|
|
178
|
+
return {
|
|
179
|
+
text: d.choices?.[0]?.message?.content?.trim() ?? "",
|
|
180
|
+
inputTokens: d.usage?.prompt_tokens ?? 0,
|
|
181
|
+
outputTokens: d.usage?.completion_tokens ?? 0,
|
|
182
|
+
};
|
|
183
|
+
},
|
|
184
|
+
);
|
|
185
|
+
}
|
|
186
|
+
|
|
187
|
+
async function callAnthropic(
|
|
188
|
+
prompt: string,
|
|
189
|
+
apiKey: string,
|
|
190
|
+
model: string,
|
|
191
|
+
maxRetries: number,
|
|
192
|
+
): Promise<CallResult> {
|
|
193
|
+
return await callWithRetry(
|
|
194
|
+
"https://api.anthropic.com/v1/messages",
|
|
195
|
+
{
|
|
196
|
+
method: "POST",
|
|
197
|
+
headers: {
|
|
198
|
+
"x-api-key": apiKey,
|
|
199
|
+
"content-type": "application/json",
|
|
200
|
+
"anthropic-version": "2023-06-01",
|
|
201
|
+
},
|
|
202
|
+
body: JSON.stringify({
|
|
203
|
+
model,
|
|
204
|
+
max_tokens: 32,
|
|
205
|
+
messages: [{ role: "user", content: prompt }],
|
|
206
|
+
}),
|
|
207
|
+
},
|
|
208
|
+
maxRetries,
|
|
209
|
+
(data) => {
|
|
210
|
+
const d = data as {
|
|
211
|
+
content?: Array<{ text?: string }>;
|
|
212
|
+
usage?: { input_tokens?: number; output_tokens?: number };
|
|
213
|
+
};
|
|
214
|
+
return {
|
|
215
|
+
text: d.content?.[0]?.text?.trim() ?? "",
|
|
216
|
+
inputTokens: d.usage?.input_tokens ?? 0,
|
|
217
|
+
outputTokens: d.usage?.output_tokens ?? 0,
|
|
218
|
+
};
|
|
219
|
+
},
|
|
220
|
+
);
|
|
221
|
+
}
|
|
222
|
+
|
|
223
|
+
/**
 * POST to `url`, retrying transient failures with exponential backoff
 * (500ms * 2^attempt, up to `maxRetries` extra attempts).
 *
 * Retryable: HTTP 429, HTTP 5xx, and thrown network/JSON-parse errors.
 * Other non-OK statuses throw CrossEncoderHttpError immediately. When a
 * 429/5xx survives the final attempt, it is surfaced the same way via the
 * `!resp.ok` path. If only network errors remain after all attempts, a
 * CrossEncoderHttpError with status 0 wraps the last one.
 */
async function callWithRetry(
  url: string,
  init: RequestInit,
  maxRetries: number,
  parse: (data: unknown) => CallResult,
): Promise<CallResult> {
  let lastErr: unknown = null;
  for (let attempt = 0; attempt <= maxRetries; attempt++) {
    try {
      const resp = await fetch(url, init);
      // Rate limits and server errors are retried with exponential backoff.
      if (resp.status === 429 || (resp.status >= 500 && resp.status < 600)) {
        if (attempt < maxRetries) {
          await sleep(500 * Math.pow(2, attempt));
          continue;
        }
        // Final attempt: fall through to the !resp.ok throw below.
      }
      if (!resp.ok) {
        // Non-retryable (or retries exhausted): surface status + body excerpt.
        const body = await resp.text().catch(() => "");
        throw new CrossEncoderHttpError(
          resp.status,
          `${resp.status}: ${body.slice(0, 200)}`,
        );
      }
      const data = await resp.json();
      return parse(data);
    } catch (err) {
      lastErr = err;
      // HTTP errors thrown above are deliberate and final — rethrow as-is.
      if (err instanceof CrossEncoderHttpError) throw err;
      // Network / JSON errors get the same backoff-and-retry treatment.
      if (attempt < maxRetries) {
        await sleep(500 * Math.pow(2, attempt));
        continue;
      }
    }
  }
  throw new CrossEncoderHttpError(0, `Network error: ${String(lastErr)}`);
}
|
|
259
|
+
|
|
260
|
+
export class CrossEncoderHttpError extends Error {
|
|
261
|
+
constructor(public readonly status: number, message: string) {
|
|
262
|
+
super(message);
|
|
263
|
+
this.name = "CrossEncoderHttpError";
|
|
264
|
+
}
|
|
265
|
+
}
|
|
266
|
+
|
|
267
|
+
// ---------------------------------------------------------------------------
|
|
268
|
+
// CrossEncoderModel — local ONNX cross-encoder via @huggingface/transformers
|
|
269
|
+
// ---------------------------------------------------------------------------
|
|
270
|
+
|
|
271
|
+
/**
|
|
272
|
+
* Optional local cross-encoder backed by @huggingface/transformers (ONNX).
|
|
273
|
+
*
|
|
274
|
+
* Kept optional so goldenmatch-js stays edge-safe / zero-deps by default.
|
|
275
|
+
* The peer dependency must be installed explicitly:
|
|
276
|
+
* npm install @huggingface/transformers
|
|
277
|
+
*
|
|
278
|
+
* Typical usage is indirect: pass `reranker: "cross-encoder"` to
|
|
279
|
+
* `rerankPair` / `rerankTopPairs` and a shared model instance is cached.
|
|
280
|
+
*/
|
|
281
|
+
export class CrossEncoderModel {
  // Using `any` to avoid a compile-time dep on the optional package.
  // eslint-disable-next-line @typescript-eslint/no-explicit-any
  private pipelineFn: any = null;
  // In-flight load promise, shared so concurrent callers await one load.
  private loading: Promise<void> | null = null;

  constructor(private readonly options: CrossEncoderModelOptions = {}) {}

  /**
   * Lazily import @huggingface/transformers and build the pipeline once.
   * On failure the in-flight promise is cleared (so a later call can retry)
   * and an error with an install hint is thrown.
   */
  private async ensureLoaded(): Promise<void> {
    if (this.pipelineFn) return;
    if (this.loading) return this.loading;
    this.loading = (async () => {
      try {
        // Dynamic import so tsup/bundlers treat this as optional.
        const modName = "@huggingface/transformers";
        // eslint-disable-next-line @typescript-eslint/no-explicit-any
        const transformers: any = await import(/* @vite-ignore */ modName);
        // Some bundles expose `pipeline` on the default export instead.
        const pipeline = transformers.pipeline ?? transformers.default?.pipeline;
        if (typeof pipeline !== "function") {
          throw new Error("pipeline() export not found on @huggingface/transformers");
        }
        this.pipelineFn = await pipeline(
          "text-classification",
          this.options.model ?? "Xenova/ms-marco-MiniLM-L-6-v2",
          {
            device: this.options.device ?? "cpu",
            // quantized defaults to true, selecting the q8 weights.
            dtype: this.options.quantized !== false ? "q8" : "fp32",
          },
        );
      } catch (err) {
        // Clear the cached promise rather than caching the rejection.
        this.loading = null;
        throw new Error(
          "'@huggingface/transformers' is required for the cross-encoder reranker. " +
            "Install: npm install @huggingface/transformers. " +
            "Original error: " +
            (err instanceof Error ? err.message : String(err)),
        );
      }
    })();
    return this.loading;
  }

  /** Score a single text pair. Returns a [0,1] relevance probability. */
  async score(textA: string, textB: string): Promise<number> {
    await this.ensureLoaded();
    const result = await this.pipelineFn({ text: textA, text_pair: textB });
    // Result may be an array (batch of 1) or a single object. Defensive.
    // eslint-disable-next-line @typescript-eslint/no-explicit-any
    const r: any = Array.isArray(result) ? result[0] : result;
    // eslint-disable-next-line @typescript-eslint/no-explicit-any
    const logits: any = r?.logits;
    // Probe known result shapes in order: `score` field, raw logits array,
    // tensor-like `{ data }`, then `value`; default to 0 if none match.
    const raw: unknown =
      r?.score ??
      (Array.isArray(logits) ? logits[0] : undefined) ??
      (logits && typeof logits === "object" && "data" in logits
        ? // eslint-disable-next-line @typescript-eslint/no-explicit-any
          (logits as any).data?.[0]
        : undefined) ??
      r?.value ??
      0;
    const raw_n = typeof raw === "number" && Number.isFinite(raw) ? raw : 0;
    // text-classification pipelines already return a probability in [0,1].
    if (raw_n >= 0 && raw_n <= 1) return raw_n;
    // Sigmoid fallback for raw logits.
    return 1 / (1 + Math.exp(-raw_n));
  }

  /**
   * Score a batch of text pairs. Currently calls `score` serially —
   * transformers.js v3 batching APIs vary across versions, so we stay
   * conservative. Still avoids any LLM HTTP round-trips.
   */
  async scoreBatch(
    pairs: ReadonlyArray<readonly [string, string]>,
  ): Promise<number[]> {
    await this.ensureLoaded();
    const scores: number[] = [];
    for (const [a, b] of pairs) {
      scores.push(await this.score(a, b));
    }
    return scores;
  }
}
|
|
364
|
+
|
|
365
|
+
// Shared model cache keyed by model/device/quantized tuple. Most callers
// reuse defaults, so this effectively reuses a single instance per process.
// A mismatched key replaces (does not grow) the cache — only one instance
// is ever kept alive.
let _modelCache: CrossEncoderModel | null = null;
let _modelCacheKey: string | null = null;
|
|
369
|
+
|
|
370
|
+
function getCrossEncoderModel(options: CrossEncoderOptions): CrossEncoderModel {
|
|
371
|
+
const key = [
|
|
372
|
+
options.model ?? "",
|
|
373
|
+
options.device ?? "",
|
|
374
|
+
options.quantized === undefined ? "" : String(options.quantized),
|
|
375
|
+
].join("|");
|
|
376
|
+
if (_modelCache && _modelCacheKey === key) return _modelCache;
|
|
377
|
+
const modelOpts: {
|
|
378
|
+
model?: string;
|
|
379
|
+
device?: "cpu" | "webgpu";
|
|
380
|
+
quantized?: boolean;
|
|
381
|
+
} = {};
|
|
382
|
+
if (options.model !== undefined) modelOpts.model = options.model;
|
|
383
|
+
if (options.device !== undefined) modelOpts.device = options.device;
|
|
384
|
+
if (options.quantized !== undefined) modelOpts.quantized = options.quantized;
|
|
385
|
+
_modelCache = new CrossEncoderModel(modelOpts);
|
|
386
|
+
_modelCacheKey = key;
|
|
387
|
+
return _modelCache;
|
|
388
|
+
}
|
|
389
|
+
|
|
390
|
+
/** Test hook: reset the cached model instance. */
|
|
391
|
+
export function _resetCrossEncoderModelCache(): void {
|
|
392
|
+
_modelCache = null;
|
|
393
|
+
_modelCacheKey = null;
|
|
394
|
+
}
|
|
395
|
+
|
|
396
|
+
function rowToText(row: Row, fields: readonly string[]): string {
|
|
397
|
+
const parts: string[] = [];
|
|
398
|
+
for (const f of fields) {
|
|
399
|
+
const v = row[f];
|
|
400
|
+
if (v === null || v === undefined || v === "") continue;
|
|
401
|
+
parts.push(`${f}: ${String(v)}`);
|
|
402
|
+
}
|
|
403
|
+
return parts.join(" | ");
|
|
404
|
+
}
|
|
405
|
+
|
|
406
|
+
// ---------------------------------------------------------------------------
|
|
407
|
+
// Single-pair rerank
|
|
408
|
+
// ---------------------------------------------------------------------------
|
|
409
|
+
|
|
410
|
+
/**
|
|
411
|
+
* Ask the LLM for a single 0..1 match score for two rows.
|
|
412
|
+
*
|
|
413
|
+
* Returns `NaN` when the call fails or the response is unparseable so
|
|
414
|
+
* callers can fall back to the original score.
|
|
415
|
+
*/
|
|
416
|
+
export async function rerankPair(
|
|
417
|
+
rowA: Row,
|
|
418
|
+
rowB: Row,
|
|
419
|
+
fields: readonly string[],
|
|
420
|
+
options: CrossEncoderOptions = {},
|
|
421
|
+
): Promise<number> {
|
|
422
|
+
// Cross-encoder fast path: real ONNX model via @huggingface/transformers.
|
|
423
|
+
// On any failure (package missing, model load failure, inference error)
|
|
424
|
+
// we fall back to the LLM path so callers keep getting a score.
|
|
425
|
+
if (options.reranker === "cross-encoder") {
|
|
426
|
+
try {
|
|
427
|
+
const model = getCrossEncoderModel(options);
|
|
428
|
+
const textA = rowToText(rowA, fields);
|
|
429
|
+
const textB = rowToText(rowB, fields);
|
|
430
|
+
return await model.score(textA, textB);
|
|
431
|
+
} catch (err) {
|
|
432
|
+
// eslint-disable-next-line no-console
|
|
433
|
+
console.warn(
|
|
434
|
+
"cross-encoder reranker failed, falling back to LLM:",
|
|
435
|
+
err instanceof Error ? err.message : String(err),
|
|
436
|
+
);
|
|
437
|
+
// fall through to LLM path
|
|
438
|
+
}
|
|
439
|
+
}
|
|
440
|
+
|
|
441
|
+
const apiKey = options.apiKey ?? readEnv("OPENAI_API_KEY") ?? readEnv("ANTHROPIC_API_KEY");
|
|
442
|
+
if (!apiKey) return NaN;
|
|
443
|
+
|
|
444
|
+
const provider = detectProvider(apiKey, options.provider);
|
|
445
|
+
const model = options.model ?? defaultModel(provider);
|
|
446
|
+
const maxRetries = options.maxRetries ?? 2;
|
|
447
|
+
|
|
448
|
+
const prompt = buildPrompt(rowA, rowB, fields);
|
|
449
|
+
|
|
450
|
+
try {
|
|
451
|
+
const res =
|
|
452
|
+
provider === "openai"
|
|
453
|
+
? await callOpenAI(prompt, apiKey, model, maxRetries)
|
|
454
|
+
: await callAnthropic(prompt, apiKey, model, maxRetries);
|
|
455
|
+
const score = parseScore(res.text);
|
|
456
|
+
return score ?? NaN;
|
|
457
|
+
} catch (err) {
|
|
458
|
+
// eslint-disable-next-line no-console
|
|
459
|
+
console.warn(
|
|
460
|
+
"cross-encoder LLM score failed:",
|
|
461
|
+
err instanceof Error ? err.message : String(err),
|
|
462
|
+
);
|
|
463
|
+
return NaN;
|
|
464
|
+
}
|
|
465
|
+
}
|
|
466
|
+
|
|
467
|
+
// ---------------------------------------------------------------------------
|
|
468
|
+
// Batch rerank top pairs
|
|
469
|
+
// ---------------------------------------------------------------------------
|
|
470
|
+
|
|
471
|
+
/**
|
|
472
|
+
* Rerank borderline pairs via LLM. Pairs outside the borderline band are
|
|
473
|
+
* returned unchanged. Pairs the LLM can't score (HTTP error, parse fail,
|
|
474
|
+
* budget exhausted) keep their original score.
|
|
475
|
+
*
|
|
476
|
+
* Combine rule: `final = (1 - w) * original + w * rerank`, with `w = rerankWeight`.
|
|
477
|
+
*
|
|
478
|
+
* Pairs whose final score falls below `mk.threshold` are dropped from the
|
|
479
|
+
* result, matching the Python "rerank then re-filter" behaviour.
|
|
480
|
+
*/
|
|
481
|
+
export async function rerankTopPairs(
  pairs: readonly ScoredPair[],
  rows: readonly Row[],
  mk: MatchkeyConfig,
  options: CrossEncoderOptions = {},
): Promise<readonly ScoredPair[]> {
  if (pairs.length === 0) return [];

  const useCrossEncoder = options.reranker === "cross-encoder";
  const apiKey = options.apiKey ?? readEnv("OPENAI_API_KEY") ?? readEnv("ANTHROPIC_API_KEY");
  // When neither backend is available, pass pairs through unchanged.
  if (!useCrossEncoder && !apiKey) return pairs;

  // Provider only matters on the LLM branch; "openai" is a harmless
  // placeholder when no key is present (that branch is skipped per-pair).
  const provider = apiKey ? detectProvider(apiKey, options.provider) : "openai";
  const model = options.model ?? defaultModel(provider);
  const maxRetries = options.maxRetries ?? 2;
  const band = options.band ?? 0.1;
  const weight = options.rerankWeight ?? 0.5;
  // Reranking only meaningful for weighted/probabilistic; exact is binary.
  const threshold = mk.type === "exact" ? 1.0 : (mk.threshold ?? 0.85);

  // Build row lookup keyed by the synthetic __row_id__ column.
  const rowById = new Map<number, Row>();
  for (const r of rows) {
    const id = r["__row_id__"];
    if (typeof id === "number") rowById.set(id, r);
  }
  const fieldNames = mk.fields.map((f) => f.field);

  // Identify borderline candidates: within `band` below threshold.
  // We rerank pairs whose original score sits in [threshold - band, 1.0].
  const lo = threshold - band;
  const candidatesIdx: number[] = [];
  for (let i = 0; i < pairs.length; i++) {
    if (pairs[i]!.score >= lo) candidatesIdx.push(i);
  }
  // Sort candidates by score descending so topN keeps the best pairs.
  candidatesIdx.sort((a, b) => pairs[b]!.score - pairs[a]!.score);

  // Cap to topN if configured.
  const limit = options.topN ?? candidatesIdx.length;
  const targets = candidatesIdx.slice(0, Math.max(0, limit));

  const budget = new BudgetTracker(options.budget ?? {}, model);

  // Cross-encoder fast path. Try loading the model once up-front; on any
  // failure we fall back to LLM scoring (if apiKey present) or give up.
  let ceModel: CrossEncoderModel | null = null;
  let ceFailed = false;
  if (useCrossEncoder) {
    try {
      ceModel = getCrossEncoderModel(options);
    } catch (err) {
      // eslint-disable-next-line no-console
      console.warn(
        "cross-encoder load failed, falling back to LLM:",
        err instanceof Error ? err.message : String(err),
      );
      ceFailed = true;
    }
  }

  // Combined scores keyed by index into `pairs`; absent = keep original.
  const newScores = new Map<number, number>();
  let loggedLlmError = false;
  for (const idx of targets) {
    const pair = pairs[idx]!;
    const rowA = rowById.get(pair.idA);
    const rowB = rowById.get(pair.idB);
    if (!rowA || !rowB) continue;

    // Prefer cross-encoder if requested and available. `ceFailed` latches:
    // one inference failure switches all remaining pairs to the LLM branch.
    if (ceModel && !ceFailed) {
      try {
        const score = await ceModel.score(
          rowToText(rowA, fieldNames),
          rowToText(rowB, fieldNames),
        );
        const combined = (1 - weight) * pair.score + weight * score;
        newScores.set(idx, Math.min(1, Math.max(0, combined)));
        continue;
      } catch (err) {
        // eslint-disable-next-line no-console
        console.warn(
          "cross-encoder inference failed, falling back to LLM for remaining pairs:",
          err instanceof Error ? err.message : String(err),
        );
        ceFailed = true;
        // fall through to LLM branch
      }
    }

    // LLM branch. `continue` skips this pair only; `break` on budget
    // exhaustion stops reranking entirely (remaining pairs keep originals).
    if (!apiKey) continue;
    if (!budget.canProceed()) break;

    const prompt = buildPrompt(rowA, rowB, fieldNames);
    const estIn = countTokensApprox(prompt);
    if (!budget.canSend(estIn)) break;

    try {
      const res =
        provider === "openai"
          ? await callOpenAI(prompt, apiKey, model, maxRetries)
          : await callAnthropic(prompt, apiKey, model, maxRetries);
      // Fall back to estimates when the provider omits usage numbers.
      budget.record(res.inputTokens || estIn, res.outputTokens || 16, model);
      const llmScore = parseScore(res.text);
      if (llmScore === null) continue;
      const combined = (1 - weight) * pair.score + weight * llmScore;
      newScores.set(idx, Math.min(1, Math.max(0, combined)));
    } catch (err) {
      // Degrade gracefully: keep original score for this pair.
      if (!loggedLlmError) {
        // eslint-disable-next-line no-console
        console.warn(
          "rerank LLM call failed for pair; keeping original score. First error:",
          err instanceof Error ? err.message : String(err),
        );
        loggedLlmError = true;
      }
      continue;
    }
  }

  // Rebuild output, dropping pairs whose new score falls under threshold.
  const out: ScoredPair[] = [];
  for (let i = 0; i < pairs.length; i++) {
    const original = pairs[i]!;
    const reranked = newScores.get(i);
    const finalScore = reranked ?? original.score;
    if (finalScore < threshold) continue;
    out.push(makeScoredPair(original.idA, original.idB, finalScore));
  }
  return out;
}
|