goldenmatch 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +140 -0
- package/dist/cli.cjs +6079 -0
- package/dist/cli.cjs.map +1 -0
- package/dist/cli.d.cts +1 -0
- package/dist/cli.d.ts +1 -0
- package/dist/cli.js +6076 -0
- package/dist/cli.js.map +1 -0
- package/dist/core/index.cjs +8449 -0
- package/dist/core/index.cjs.map +1 -0
- package/dist/core/index.d.cts +1972 -0
- package/dist/core/index.d.ts +1972 -0
- package/dist/core/index.js +8318 -0
- package/dist/core/index.js.map +1 -0
- package/dist/index.cjs +8449 -0
- package/dist/index.cjs.map +1 -0
- package/dist/index.d.cts +2 -0
- package/dist/index.d.ts +2 -0
- package/dist/index.js +8318 -0
- package/dist/index.js.map +1 -0
- package/dist/node/backends/score-worker.cjs +934 -0
- package/dist/node/backends/score-worker.cjs.map +1 -0
- package/dist/node/backends/score-worker.d.cts +14 -0
- package/dist/node/backends/score-worker.d.ts +14 -0
- package/dist/node/backends/score-worker.js +932 -0
- package/dist/node/backends/score-worker.js.map +1 -0
- package/dist/node/index.cjs +11430 -0
- package/dist/node/index.cjs.map +1 -0
- package/dist/node/index.d.cts +554 -0
- package/dist/node/index.d.ts +554 -0
- package/dist/node/index.js +11277 -0
- package/dist/node/index.js.map +1 -0
- package/dist/types-DhUdX5Rc.d.cts +304 -0
- package/dist/types-DhUdX5Rc.d.ts +304 -0
- package/examples/01-basic-dedupe.ts +60 -0
- package/examples/02-match-two-datasets.ts +48 -0
- package/examples/03-csv-file-pipeline.ts +62 -0
- package/examples/04-string-scoring.ts +63 -0
- package/examples/05-custom-config.ts +94 -0
- package/examples/06-probabilistic-fs.ts +72 -0
- package/examples/07-pprl-privacy.ts +76 -0
- package/examples/08-streaming.ts +79 -0
- package/examples/09-llm-scorer.ts +79 -0
- package/examples/10-explain.ts +60 -0
- package/examples/11-evaluate.ts +61 -0
- package/examples/README.md +53 -0
- package/package.json +66 -0
- package/src/cli.ts +372 -0
- package/src/core/ann-blocker.ts +593 -0
- package/src/core/api.ts +220 -0
- package/src/core/autoconfig.ts +363 -0
- package/src/core/autofix.ts +102 -0
- package/src/core/blocker.ts +655 -0
- package/src/core/cluster.ts +699 -0
- package/src/core/compare-clusters.ts +176 -0
- package/src/core/config/loader.ts +869 -0
- package/src/core/cross-encoder.ts +614 -0
- package/src/core/data.ts +430 -0
- package/src/core/domain.ts +277 -0
- package/src/core/embedder.ts +562 -0
- package/src/core/evaluate.ts +156 -0
- package/src/core/explain.ts +352 -0
- package/src/core/golden.ts +524 -0
- package/src/core/graph-er.ts +371 -0
- package/src/core/index.ts +314 -0
- package/src/core/ingest.ts +112 -0
- package/src/core/learned-blocking.ts +305 -0
- package/src/core/lineage.ts +221 -0
- package/src/core/llm/budget.ts +258 -0
- package/src/core/llm/cluster.ts +542 -0
- package/src/core/llm/scorer.ts +396 -0
- package/src/core/match-one.ts +95 -0
- package/src/core/matchkey.ts +97 -0
- package/src/core/memory/corrections.ts +179 -0
- package/src/core/memory/learner.ts +218 -0
- package/src/core/memory/store.ts +114 -0
- package/src/core/pipeline.ts +366 -0
- package/src/core/pprl/protocol.ts +216 -0
- package/src/core/probabilistic.ts +511 -0
- package/src/core/profiler.ts +212 -0
- package/src/core/quality.ts +197 -0
- package/src/core/review-queue.ts +177 -0
- package/src/core/scorer.ts +855 -0
- package/src/core/sensitivity.ts +196 -0
- package/src/core/standardize.ts +279 -0
- package/src/core/streaming.ts +128 -0
- package/src/core/transforms.ts +599 -0
- package/src/core/types.ts +570 -0
- package/src/core/validate.ts +243 -0
- package/src/index.ts +8 -0
- package/src/node/a2a/server.ts +470 -0
- package/src/node/api/server.ts +412 -0
- package/src/node/backends/duckdb.ts +130 -0
- package/src/node/backends/score-worker.ts +41 -0
- package/src/node/backends/workers.ts +212 -0
- package/src/node/config-file.ts +66 -0
- package/src/node/connectors/base.ts +57 -0
- package/src/node/connectors/bigquery.ts +61 -0
- package/src/node/connectors/databricks.ts +69 -0
- package/src/node/connectors/file.ts +350 -0
- package/src/node/connectors/hubspot.ts +62 -0
- package/src/node/connectors/index.ts +43 -0
- package/src/node/connectors/salesforce.ts +93 -0
- package/src/node/connectors/snowflake.ts +73 -0
- package/src/node/db/postgres.ts +173 -0
- package/src/node/db/sync.ts +103 -0
- package/src/node/dedupe-file.ts +156 -0
- package/src/node/index.ts +89 -0
- package/src/node/mcp/server.ts +940 -0
- package/src/node/tui/app.ts +756 -0
- package/src/node/tui/index.ts +6 -0
- package/src/node/tui/widgets.ts +128 -0
- package/tests/parity/scorer-ground-truth.test.ts +118 -0
- package/tests/smoke.test.ts +46 -0
- package/tests/unit/a2a-server.test.ts +175 -0
- package/tests/unit/ann-blocker.test.ts +117 -0
- package/tests/unit/api-server.test.ts +239 -0
- package/tests/unit/api.test.ts +77 -0
- package/tests/unit/autoconfig.test.ts +103 -0
- package/tests/unit/autofix.test.ts +71 -0
- package/tests/unit/blocker.test.ts +164 -0
- package/tests/unit/buildBlocksAsync.test.ts +63 -0
- package/tests/unit/cluster.test.ts +213 -0
- package/tests/unit/compare-clusters.test.ts +42 -0
- package/tests/unit/config-loader.test.ts +301 -0
- package/tests/unit/connectors-base.test.ts +48 -0
- package/tests/unit/cross-encoder-model.test.ts +198 -0
- package/tests/unit/cross-encoder.test.ts +173 -0
- package/tests/unit/db-connectors.test.ts +37 -0
- package/tests/unit/domain.test.ts +80 -0
- package/tests/unit/embedder.test.ts +151 -0
- package/tests/unit/evaluate.test.ts +85 -0
- package/tests/unit/explain.test.ts +73 -0
- package/tests/unit/golden.test.ts +97 -0
- package/tests/unit/graph-er.test.ts +173 -0
- package/tests/unit/hnsw-ann.test.ts +283 -0
- package/tests/unit/hubspot-connector.test.ts +118 -0
- package/tests/unit/ingest.test.ts +97 -0
- package/tests/unit/learned-blocking.test.ts +134 -0
- package/tests/unit/lineage.test.ts +135 -0
- package/tests/unit/match-one.test.ts +129 -0
- package/tests/unit/matchkey.test.ts +97 -0
- package/tests/unit/mcp-server.test.ts +183 -0
- package/tests/unit/memory.test.ts +119 -0
- package/tests/unit/pipeline.test.ts +118 -0
- package/tests/unit/pprl-protocol.test.ts +381 -0
- package/tests/unit/probabilistic.test.ts +494 -0
- package/tests/unit/profiler.test.ts +68 -0
- package/tests/unit/review-queue.test.ts +68 -0
- package/tests/unit/salesforce-connector.test.ts +148 -0
- package/tests/unit/scorer.test.ts +301 -0
- package/tests/unit/sensitivity.test.ts +154 -0
- package/tests/unit/standardize.test.ts +84 -0
- package/tests/unit/streaming.test.ts +82 -0
- package/tests/unit/transforms.test.ts +208 -0
- package/tests/unit/tui-widgets.test.ts +42 -0
- package/tests/unit/tui.test.ts +24 -0
- package/tests/unit/validate.test.ts +145 -0
- package/tests/unit/workers-parallel.test.ts +99 -0
- package/tests/unit/workers.test.ts +74 -0
- package/tsconfig.json +25 -0
- package/tsup.config.ts +37 -0
- package/vitest.config.ts +11 -0
|
@@ -0,0 +1,562 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* embedder.ts — Embedding API client (OpenAI / Vertex AI / Voyage).
|
|
3
|
+
*
|
|
4
|
+
* Edge-safe: uses global `fetch()` only. No `node:` imports.
|
|
5
|
+
*
|
|
6
|
+
* Ports `goldenmatch/core/embedder.py` and `goldenmatch/core/vertex_embedder.py`,
|
|
7
|
+
* but replaces sentence-transformers / google-cloud-aiplatform with HTTP calls
|
|
8
|
+
* so the module runs in Edge / Workers / browser-like runtimes.
|
|
9
|
+
*/
|
|
10
|
+
|
|
11
|
+
// ---------------------------------------------------------------------------
|
|
12
|
+
// Public types
|
|
13
|
+
// ---------------------------------------------------------------------------
|
|
14
|
+
|
|
15
|
+
/** Supported embedding backends. */
export type EmbedderProvider = "openai" | "vertex" | "voyage";

/** Construction options for the Embedder client; every field is optional. */
export interface EmbedderOptions {
  /** Backend to call. Default: "openai". */
  readonly provider?: EmbedderProvider;
  /** Model name; defaults per provider when omitted. */
  readonly model?: string;
  /** API key; when omitted the provider's environment variable is read. */
  readonly apiKey?: string;
  /** Override the default endpoint URL. */
  readonly endpoint?: string;
  /** Batch size for API calls (default 64 for OpenAI, 50 for Vertex). */
  readonly batchSize?: number;
  /** Cache embeddings by text hash within an Embedder instance. */
  readonly cache?: boolean;
  /**
   * For OpenAI text-embedding-3+ this requests a smaller embedding
   * dimension (e.g. 512 instead of 1536).
   */
  readonly dimensions?: number;
  /** GCP project ID (required for Vertex). */
  readonly project?: string;
  /** GCP region (Vertex). Default: us-central1. */
  readonly location?: string;
  /** Pre-fetched OAuth bearer token for Vertex. */
  readonly bearerToken?: string;
  /** Maximum HTTP retries for transient failures (default 3). */
  readonly maxRetries?: number;
}

/** Result of one embedding request batch. */
export interface EmbeddingResult {
  /** One L2-normalized vector per input text, in input order. */
  readonly embeddings: readonly Float32Array[];
  /** Model that produced the vectors. */
  readonly model: string;
  /** Token count reported by the provider (0 when not reported). */
  readonly tokensUsed: number;
}
|
|
47
|
+
|
|
48
|
+
// ---------------------------------------------------------------------------
|
|
49
|
+
// Errors
|
|
50
|
+
// ---------------------------------------------------------------------------
|
|
51
|
+
|
|
52
|
+
export class EmbedderError extends Error {
|
|
53
|
+
constructor(
|
|
54
|
+
message: string,
|
|
55
|
+
public readonly status?: number,
|
|
56
|
+
public readonly body?: string,
|
|
57
|
+
) {
|
|
58
|
+
super(message);
|
|
59
|
+
this.name = "EmbedderError";
|
|
60
|
+
}
|
|
61
|
+
}
|
|
62
|
+
|
|
63
|
+
// ---------------------------------------------------------------------------
|
|
64
|
+
// Helpers
|
|
65
|
+
// ---------------------------------------------------------------------------
|
|
66
|
+
|
|
67
|
+
function readEnv(key: string): string | undefined {
|
|
68
|
+
// Optional, soft-read env without hard `process` reference.
|
|
69
|
+
const proc = (globalThis as { process?: { env?: Record<string, string | undefined> } }).process;
|
|
70
|
+
return proc?.env?.[key];
|
|
71
|
+
}
|
|
72
|
+
|
|
73
|
+
async function sleep(ms: number): Promise<void> {
|
|
74
|
+
await new Promise<void>((resolve) => {
|
|
75
|
+
setTimeout(resolve, ms);
|
|
76
|
+
});
|
|
77
|
+
}
|
|
78
|
+
|
|
79
|
+
/** Cheap stable hash for cache keys. */
|
|
80
|
+
function hashText(text: string): string {
|
|
81
|
+
// FNV-1a 32-bit. Stable, no collisions matter much for cache use.
|
|
82
|
+
let h = 0x811c9dc5;
|
|
83
|
+
for (let i = 0; i < text.length; i++) {
|
|
84
|
+
h ^= text.charCodeAt(i);
|
|
85
|
+
h = Math.imul(h, 0x01000193);
|
|
86
|
+
}
|
|
87
|
+
return (h >>> 0).toString(36);
|
|
88
|
+
}
|
|
89
|
+
|
|
90
|
+
/** L2-normalize an embedding in place. */
|
|
91
|
+
function l2Normalize(vec: Float32Array): Float32Array {
|
|
92
|
+
let s = 0;
|
|
93
|
+
for (let i = 0; i < vec.length; i++) s += vec[i]! * vec[i]!;
|
|
94
|
+
const norm = Math.sqrt(s);
|
|
95
|
+
if (norm === 0) return vec;
|
|
96
|
+
for (let i = 0; i < vec.length; i++) vec[i] = vec[i]! / norm;
|
|
97
|
+
return vec;
|
|
98
|
+
}
|
|
99
|
+
|
|
100
|
+
function toFloat32(arr: readonly number[]): Float32Array {
|
|
101
|
+
const out = new Float32Array(arr.length);
|
|
102
|
+
for (let i = 0; i < arr.length; i++) out[i] = arr[i] ?? 0;
|
|
103
|
+
return out;
|
|
104
|
+
}
|
|
105
|
+
|
|
106
|
+
// ---------------------------------------------------------------------------
|
|
107
|
+
// Provider defaults
|
|
108
|
+
// ---------------------------------------------------------------------------
|
|
109
|
+
|
|
110
|
+
function defaultModelFor(provider: EmbedderProvider): string {
|
|
111
|
+
switch (provider) {
|
|
112
|
+
case "openai":
|
|
113
|
+
return "text-embedding-3-small";
|
|
114
|
+
case "vertex":
|
|
115
|
+
return "text-embedding-004";
|
|
116
|
+
case "voyage":
|
|
117
|
+
return "voyage-3";
|
|
118
|
+
}
|
|
119
|
+
}
|
|
120
|
+
|
|
121
|
+
function defaultBatchSizeFor(provider: EmbedderProvider): number {
|
|
122
|
+
switch (provider) {
|
|
123
|
+
case "openai":
|
|
124
|
+
return 64;
|
|
125
|
+
case "vertex":
|
|
126
|
+
return 50;
|
|
127
|
+
case "voyage":
|
|
128
|
+
return 64;
|
|
129
|
+
}
|
|
130
|
+
}
|
|
131
|
+
|
|
132
|
+
// ---------------------------------------------------------------------------
|
|
133
|
+
// Retry wrapper
|
|
134
|
+
// ---------------------------------------------------------------------------
|
|
135
|
+
|
|
136
|
+
async function fetchWithRetry(
|
|
137
|
+
url: string,
|
|
138
|
+
init: RequestInit,
|
|
139
|
+
maxRetries: number,
|
|
140
|
+
): Promise<Response> {
|
|
141
|
+
let lastErr: unknown = null;
|
|
142
|
+
for (let attempt = 0; attempt <= maxRetries; attempt++) {
|
|
143
|
+
try {
|
|
144
|
+
const resp = await fetch(url, init);
|
|
145
|
+
if (resp.status === 429 || (resp.status >= 500 && resp.status < 600)) {
|
|
146
|
+
if (attempt < maxRetries) {
|
|
147
|
+
// Exponential backoff: 0.5s, 1s, 2s ...
|
|
148
|
+
await sleep(500 * Math.pow(2, attempt));
|
|
149
|
+
continue;
|
|
150
|
+
}
|
|
151
|
+
}
|
|
152
|
+
return resp;
|
|
153
|
+
} catch (err) {
|
|
154
|
+
lastErr = err;
|
|
155
|
+
if (attempt < maxRetries) {
|
|
156
|
+
await sleep(500 * Math.pow(2, attempt));
|
|
157
|
+
continue;
|
|
158
|
+
}
|
|
159
|
+
}
|
|
160
|
+
}
|
|
161
|
+
throw new EmbedderError(
|
|
162
|
+
`Network error after ${maxRetries + 1} attempts: ${String(lastErr)}`,
|
|
163
|
+
);
|
|
164
|
+
}
|
|
165
|
+
|
|
166
|
+
// ---------------------------------------------------------------------------
|
|
167
|
+
// Embedder
|
|
168
|
+
// ---------------------------------------------------------------------------
|
|
169
|
+
|
|
170
|
+
/**
 * HTTP embedding client for OpenAI, Vertex AI, and Voyage AI.
 *
 * Uses only the global `fetch()` so it runs in Edge/Worker runtimes.
 * Provider responses are L2-normalized (via `l2Normalize`) and cached
 * per instance, keyed by an FNV-1a hash of the input text.
 */
export class Embedder {
  // text-hash -> embedding. NOTE(review): cached entries are the same
  // Float32Array instances returned to callers, so a caller that mutates
  // a returned vector also mutates the cache — confirm callers treat
  // results as read-only.
  private readonly cacheMap = new Map<string, Float32Array>();
  // Resolved configuration (options merged with provider defaults).
  private readonly provider: EmbedderProvider;
  private readonly model: string;
  private readonly batchSize: number;
  private readonly maxRetries: number;
  private readonly cacheEnabled: boolean;

  /**
   * @param options - see EmbedderOptions; defaults: provider "openai",
   *   provider-specific model/batch size, 3 retries, caching enabled.
   */
  constructor(private readonly options: EmbedderOptions = {}) {
    this.provider = options.provider ?? "openai";
    this.model = options.model ?? defaultModelFor(this.provider);
    this.batchSize = options.batchSize ?? defaultBatchSizeFor(this.provider);
    this.maxRetries = options.maxRetries ?? 3;
    this.cacheEnabled = options.cache ?? true;
  }

  // ──────────────────────────────────────────────────────────
  // Public API
  // ──────────────────────────────────────────────────────────

  /**
   * Embed a batch of texts in one or more API calls.
   *
   * Duplicate texts are embedded once; cached texts are not re-fetched.
   * Results come back in the same order as `texts`. `tokensUsed` counts
   * only tokens from API calls actually made (cache hits cost 0).
   */
  async embedBatch(texts: readonly string[]): Promise<EmbeddingResult> {
    if (texts.length === 0) {
      return { embeddings: [], model: this.model, tokensUsed: 0 };
    }

    // Deduplicate while preserving original order mapping.
    // indexFor[i] is the position of texts[i] within uniqueOrder.
    const uniqueOrder: string[] = [];
    const uniqueIndex = new Map<string, number>();
    const indexFor: number[] = new Array(texts.length).fill(0);
    for (let i = 0; i < texts.length; i++) {
      const t = texts[i] ?? "";
      let idx = uniqueIndex.get(t);
      if (idx === undefined) {
        idx = uniqueOrder.length;
        uniqueOrder.push(t);
        uniqueIndex.set(t, idx);
      }
      indexFor[i] = idx;
    }

    // Resolve from cache where possible; collect the misses to fetch.
    const uniqueEmbeddings: (Float32Array | null)[] = new Array(uniqueOrder.length).fill(null);
    const toFetchIdx: number[] = [];
    const toFetchText: string[] = [];
    for (let i = 0; i < uniqueOrder.length; i++) {
      const text = uniqueOrder[i]!;
      if (this.cacheEnabled) {
        const hit = this.cacheMap.get(hashText(text));
        if (hit) {
          uniqueEmbeddings[i] = hit;
          continue;
        }
      }
      toFetchIdx.push(i);
      toFetchText.push(text);
    }

    let tokensUsed = 0;

    if (toFetchText.length > 0) {
      // Batch the API calls. Batches are sequential (awaited one at a
      // time), respecting the provider's per-request input limit.
      for (let start = 0; start < toFetchText.length; start += this.batchSize) {
        const end = Math.min(start + this.batchSize, toFetchText.length);
        const slice = toFetchText.slice(start, end);
        const result = await this.callProvider(slice);
        tokensUsed += result.tokensUsed;
        for (let j = 0; j < result.embeddings.length; j++) {
          const targetIdx = toFetchIdx[start + j]!;
          const emb = result.embeddings[j]!;
          uniqueEmbeddings[targetIdx] = emb;
          if (this.cacheEnabled) {
            this.cacheMap.set(hashText(uniqueOrder[targetIdx]!), emb);
          }
        }
      }
    }

    // Re-expand back to original order.
    const embeddings: Float32Array[] = new Array(texts.length);
    for (let i = 0; i < texts.length; i++) {
      const u = uniqueEmbeddings[indexFor[i]!];
      if (!u) {
        // Should not happen; fall back to zero vector of last-known dim.
        // (Can occur if the provider returns fewer embeddings than inputs.)
        const dim = this.firstDim(uniqueEmbeddings) ?? 0;
        embeddings[i] = new Float32Array(dim);
      } else {
        embeddings[i] = u;
      }
    }

    return { embeddings, model: this.model, tokensUsed };
  }

  /** Embed a single text. Returns an empty vector if the provider returned nothing. */
  async embedOne(text: string): Promise<Float32Array> {
    const r = await this.embedBatch([text]);
    return r.embeddings[0] ?? new Float32Array(0);
  }

  /**
   * Embed a column of (possibly null) values. Null/empty get a zero vector.
   * Identical text values are de-duplicated automatically.
   *
   * @param values - raw cell values; non-strings are stringified and trimmed.
   * @param _cacheKey - currently unused (caching is by text hash).
   */
  async embedColumn(
    values: readonly (string | null | undefined)[],
    _cacheKey?: string,
  ): Promise<readonly Float32Array[]> {
    if (values.length === 0) return [];

    // Substitute null/empty with a sentinel; we replace with zero vectors after.
    // The sentinel contains NUL bytes so it cannot collide with real data.
    const ZERO_SENTINEL = "\u0000__GM_NULL__\u0000";
    const inputs: string[] = values.map((v) => {
      if (v === null || v === undefined) return ZERO_SENTINEL;
      const s = String(v).trim();
      return s === "" ? ZERO_SENTINEL : s;
    });

    // Embed only the non-null subset; positions[] remembers where each goes.
    const nonNullTexts: string[] = [];
    const positions: number[] = [];
    for (let i = 0; i < inputs.length; i++) {
      if (inputs[i] !== ZERO_SENTINEL) {
        nonNullTexts.push(inputs[i]!);
        positions.push(i);
      }
    }

    let dim = 0;
    let realEmbeddings: readonly Float32Array[] = [];
    if (nonNullTexts.length > 0) {
      const r = await this.embedBatch(nonNullTexts);
      realEmbeddings = r.embeddings;
      dim = realEmbeddings[0]?.length ?? 0;
    }

    // Start with zero vectors everywhere, then overwrite the real ones.
    const out: Float32Array[] = new Array(values.length);
    for (let i = 0; i < values.length; i++) out[i] = new Float32Array(dim);
    for (let k = 0; k < positions.length; k++) {
      out[positions[k]!] = realEmbeddings[k] ?? new Float32Array(dim);
    }
    return out;
  }

  // ──────────────────────────────────────────────────────────
  // Similarity helpers
  // ──────────────────────────────────────────────────────────

  /**
   * Cosine similarity over the shared prefix of the two vectors
   * (lengths may differ; extra tail entries are ignored).
   * Returns 0 when either vector has zero magnitude.
   */
  cosineSimilarity(a: Float32Array, b: Float32Array): number {
    let dot = 0;
    let na = 0;
    let nb = 0;
    const n = Math.min(a.length, b.length);
    for (let i = 0; i < n; i++) {
      const av = a[i]!;
      const bv = b[i]!;
      dot += av * bv;
      na += av * av;
      nb += bv * bv;
    }
    const denom = Math.sqrt(na) * Math.sqrt(nb);
    return denom === 0 ? 0 : dot / denom;
  }

  /**
   * Full symmetric pairwise cosine-similarity matrix.
   * Diagonal is 1 by definition; each off-diagonal pair is computed once.
   */
  cosineSimilarityMatrix(embeddings: readonly Float32Array[]): number[][] {
    const n = embeddings.length;
    const out: number[][] = new Array(n);
    for (let i = 0; i < n; i++) out[i] = new Array(n).fill(0);
    for (let i = 0; i < n; i++) {
      out[i]![i] = 1;
      for (let j = i + 1; j < n; j++) {
        const s = this.cosineSimilarity(embeddings[i]!, embeddings[j]!);
        out[i]![j] = s;
        out[j]![i] = s;
      }
    }
    return out;
  }

  // ──────────────────────────────────────────────────────────
  // Internals
  // ──────────────────────────────────────────────────────────

  /** Length of the first non-null vector, or null when all entries are null. */
  private firstDim(arr: readonly (Float32Array | null)[]): number | null {
    for (const v of arr) if (v) return v.length;
    return null;
  }

  /** Dispatch one batch to the configured provider. */
  private async callProvider(
    texts: readonly string[],
  ): Promise<EmbeddingResult> {
    switch (this.provider) {
      case "openai":
        return this.callOpenAI(texts);
      case "vertex":
        return this.callVertex(texts);
      case "voyage":
        return this.callVoyage(texts);
    }
  }

  // ── OpenAI ────────────────────────────────────────────────

  /**
   * POST to the OpenAI embeddings endpoint.
   * @throws EmbedderError when no API key is available or the response is not OK.
   */
  private async callOpenAI(texts: readonly string[]): Promise<EmbeddingResult> {
    const apiKey = this.options.apiKey ?? readEnv("OPENAI_API_KEY");
    if (!apiKey) {
      throw new EmbedderError(
        "OpenAI API key required. Pass options.apiKey or set OPENAI_API_KEY.",
      );
    }
    const url = this.options.endpoint ?? "https://api.openai.com/v1/embeddings";
    const body: Record<string, unknown> = {
      model: this.model,
      input: texts,
    };
    // `dimensions` is only sent when requested (text-embedding-3+ feature).
    if (this.options.dimensions !== undefined) {
      body.dimensions = this.options.dimensions;
    }
    const resp = await fetchWithRetry(
      url,
      {
        method: "POST",
        headers: {
          Authorization: `Bearer ${apiKey}`,
          "Content-Type": "application/json",
        },
        body: JSON.stringify(body),
      },
      this.maxRetries,
    );
    if (!resp.ok) {
      const text = await resp.text().catch(() => "");
      throw new EmbedderError(
        `OpenAI embeddings ${resp.status}`,
        resp.status,
        text.slice(0, 500),
      );
    }
    const data = (await resp.json()) as {
      data?: Array<{ embedding?: number[] }>;
      usage?: { total_tokens?: number };
    };
    const arr = data.data ?? [];
    const embeddings: Float32Array[] = arr.map((d) => {
      const v = d.embedding ?? [];
      return l2Normalize(toFloat32(v));
    });
    return {
      embeddings,
      model: this.model,
      tokensUsed: data.usage?.total_tokens ?? 0,
    };
  }

  // ── Vertex AI ─────────────────────────────────────────────

  /**
   * POST to the Vertex AI `:predict` endpoint for the configured model.
   * Requires a GCP project and a pre-fetched OAuth bearer token (edge
   * runtimes cannot sign service-account JWTs).
   * @throws EmbedderError when project/token are missing or the response is not OK.
   */
  private async callVertex(texts: readonly string[]): Promise<EmbeddingResult> {
    const project = this.options.project ?? readEnv("GOOGLE_CLOUD_PROJECT");
    if (!project) {
      throw new EmbedderError(
        "Vertex requires options.project or GOOGLE_CLOUD_PROJECT.",
      );
    }
    const location =
      this.options.location ??
      readEnv("GOOGLE_CLOUD_LOCATION") ??
      "us-central1";
    // apiKey is accepted as an alias for the bearer token here.
    const token =
      this.options.bearerToken ??
      this.options.apiKey ??
      readEnv("GOOGLE_OAUTH_TOKEN");
    if (!token) {
      throw new EmbedderError(
        "Vertex requires options.bearerToken (OAuth access token). " +
          "Service-account JWT signing is unavailable in edge runtime — " +
          "fetch a token out-of-band and pass it in.",
      );
    }
    const url =
      this.options.endpoint ??
      `https://${location}-aiplatform.googleapis.com/v1/projects/${project}/locations/${location}/publishers/google/models/${this.model}:predict`;

    const body = JSON.stringify({
      instances: texts.map((t) => ({ content: t })),
    });
    const resp = await fetchWithRetry(
      url,
      {
        method: "POST",
        headers: {
          Authorization: `Bearer ${token}`,
          "Content-Type": "application/json",
        },
        body,
      },
      this.maxRetries,
    );
    if (!resp.ok) {
      const text = await resp.text().catch(() => "");
      throw new EmbedderError(
        `Vertex embeddings ${resp.status}`,
        resp.status,
        text.slice(0, 500),
      );
    }
    const data = (await resp.json()) as {
      predictions?: Array<{
        embeddings?: { values?: number[]; statistics?: { token_count?: number } };
      }>;
    };
    // Vertex reports tokens per prediction; sum them while mapping.
    let tokens = 0;
    const embeddings: Float32Array[] = (data.predictions ?? []).map((p) => {
      const v = p.embeddings?.values ?? [];
      tokens += p.embeddings?.statistics?.token_count ?? 0;
      return l2Normalize(toFloat32(v));
    });
    return { embeddings, model: this.model, tokensUsed: tokens };
  }

  // ── Voyage AI ─────────────────────────────────────────────

  /**
   * POST to the Voyage AI embeddings endpoint.
   * @throws EmbedderError when no API key is available or the response is not OK.
   */
  private async callVoyage(texts: readonly string[]): Promise<EmbeddingResult> {
    const apiKey = this.options.apiKey ?? readEnv("VOYAGE_API_KEY");
    if (!apiKey) {
      throw new EmbedderError(
        "Voyage API key required. Pass options.apiKey or set VOYAGE_API_KEY.",
      );
    }
    const url = this.options.endpoint ?? "https://api.voyageai.com/v1/embeddings";
    const resp = await fetchWithRetry(
      url,
      {
        method: "POST",
        headers: {
          Authorization: `Bearer ${apiKey}`,
          "Content-Type": "application/json",
        },
        body: JSON.stringify({ model: this.model, input: texts }),
      },
      this.maxRetries,
    );
    if (!resp.ok) {
      const text = await resp.text().catch(() => "");
      throw new EmbedderError(
        `Voyage embeddings ${resp.status}`,
        resp.status,
        text.slice(0, 500),
      );
    }
    const data = (await resp.json()) as {
      data?: Array<{ embedding?: number[] }>;
      usage?: { total_tokens?: number };
    };
    const arr = data.data ?? [];
    const embeddings: Float32Array[] = arr.map((d) => {
      const v = d.embedding ?? [];
      return l2Normalize(toFloat32(v));
    });
    return {
      embeddings,
      model: this.model,
      tokensUsed: data.usage?.total_tokens ?? 0,
    };
  }
}
|
|
532
|
+
|
|
533
|
+
// ---------------------------------------------------------------------------
|
|
534
|
+
// Singleton factory
|
|
535
|
+
// ---------------------------------------------------------------------------
|
|
536
|
+
|
|
537
|
+
const embedderCache = new Map<string, Embedder>();
|
|
538
|
+
|
|
539
|
+
/**
|
|
540
|
+
* Return a cached Embedder instance keyed by provider+model.
|
|
541
|
+
* Pass a string to use a model name with default provider, or full options.
|
|
542
|
+
*/
|
|
543
|
+
export function getEmbedder(modelOrOptions?: string | EmbedderOptions): Embedder {
|
|
544
|
+
const opts: EmbedderOptions =
|
|
545
|
+
typeof modelOrOptions === "string"
|
|
546
|
+
? { model: modelOrOptions }
|
|
547
|
+
: (modelOrOptions ?? {});
|
|
548
|
+
const provider = opts.provider ?? "openai";
|
|
549
|
+
const model = opts.model ?? defaultModelFor(provider);
|
|
550
|
+
const key = `${provider}::${model}`;
|
|
551
|
+
let e = embedderCache.get(key);
|
|
552
|
+
if (!e) {
|
|
553
|
+
e = new Embedder(opts);
|
|
554
|
+
embedderCache.set(key, e);
|
|
555
|
+
}
|
|
556
|
+
return e;
|
|
557
|
+
}
|
|
558
|
+
|
|
559
|
+
/** Test-only: clear the embedder cache. */
|
|
560
|
+
export function _clearEmbedderCache(): void {
|
|
561
|
+
embedderCache.clear();
|
|
562
|
+
}
|
|
@@ -0,0 +1,156 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* evaluate.ts — Precision/recall/F1 evaluation against ground truth.
|
|
3
|
+
* Edge-safe: no Node.js imports, pure TypeScript only.
|
|
4
|
+
*
|
|
5
|
+
* Ports goldenmatch/core/evaluate.py.
|
|
6
|
+
*/
|
|
7
|
+
|
|
8
|
+
import type { Row, ScoredPair, ClusterInfo } from "./types.js";
|
|
9
|
+
|
|
10
|
+
// ---------------------------------------------------------------------------
|
|
11
|
+
// Types
|
|
12
|
+
// ---------------------------------------------------------------------------
|
|
13
|
+
|
|
14
|
+
/** Pairwise precision/recall/F1 metrics against a ground-truth pair set. */
export interface EvalResult {
  /** tp / (tp + fp); 0 when nothing was predicted. */
  readonly precision: number;
  /** tp / (tp + fn); 0 when the ground truth is empty. */
  readonly recall: number;
  /** Harmonic mean of precision and recall; 0 when both are 0. */
  readonly f1: number;
  /** Predicted pairs present in the ground truth. */
  readonly truePositives: number;
  /** Predicted pairs absent from the ground truth. */
  readonly falsePositives: number;
  /** Ground-truth pairs that were not predicted. */
  readonly falseNegatives: number;
}
|
|
22
|
+
|
|
23
|
+
// ---------------------------------------------------------------------------
|
|
24
|
+
// Helpers
|
|
25
|
+
// ---------------------------------------------------------------------------
|
|
26
|
+
|
|
27
|
+
function canonicalPair(a: number, b: number): string {
|
|
28
|
+
return a < b ? `${a}:${b}` : `${b}:${a}`;
|
|
29
|
+
}
|
|
30
|
+
|
|
31
|
+
function toPairSet(
|
|
32
|
+
pairs: readonly (readonly [number, number])[],
|
|
33
|
+
): Set<string> {
|
|
34
|
+
const out = new Set<string>();
|
|
35
|
+
for (const [a, b] of pairs) {
|
|
36
|
+
if (a === b) continue;
|
|
37
|
+
out.add(canonicalPair(a, b));
|
|
38
|
+
}
|
|
39
|
+
return out;
|
|
40
|
+
}
|
|
41
|
+
|
|
42
|
+
function computeMetrics(
|
|
43
|
+
tp: number,
|
|
44
|
+
fp: number,
|
|
45
|
+
fn: number,
|
|
46
|
+
): EvalResult {
|
|
47
|
+
const precision = tp + fp > 0 ? tp / (tp + fp) : 0;
|
|
48
|
+
const recall = tp + fn > 0 ? tp / (tp + fn) : 0;
|
|
49
|
+
const f1 =
|
|
50
|
+
precision + recall > 0 ? (2 * precision * recall) / (precision + recall) : 0;
|
|
51
|
+
return {
|
|
52
|
+
precision,
|
|
53
|
+
recall,
|
|
54
|
+
f1,
|
|
55
|
+
truePositives: tp,
|
|
56
|
+
falsePositives: fp,
|
|
57
|
+
falseNegatives: fn,
|
|
58
|
+
};
|
|
59
|
+
}
|
|
60
|
+
|
|
61
|
+
// ---------------------------------------------------------------------------
|
|
62
|
+
// Public API
|
|
63
|
+
// ---------------------------------------------------------------------------
|
|
64
|
+
|
|
65
|
+
/**
|
|
66
|
+
* Evaluate predicted pairs against ground-truth pairs.
|
|
67
|
+
*
|
|
68
|
+
* Pairs are treated as unordered (canonicalized to min:max).
|
|
69
|
+
*/
|
|
70
|
+
export function evaluatePairs(
|
|
71
|
+
predictedPairs: readonly ScoredPair[],
|
|
72
|
+
groundTruthPairs: readonly (readonly [number, number])[],
|
|
73
|
+
): EvalResult {
|
|
74
|
+
const truth = toPairSet(groundTruthPairs);
|
|
75
|
+
const predicted = new Set<string>();
|
|
76
|
+
for (const p of predictedPairs) {
|
|
77
|
+
if (p.idA === p.idB) continue;
|
|
78
|
+
predicted.add(canonicalPair(p.idA, p.idB));
|
|
79
|
+
}
|
|
80
|
+
|
|
81
|
+
let tp = 0;
|
|
82
|
+
let fp = 0;
|
|
83
|
+
for (const key of predicted) {
|
|
84
|
+
if (truth.has(key)) tp++;
|
|
85
|
+
else fp++;
|
|
86
|
+
}
|
|
87
|
+
let fn = 0;
|
|
88
|
+
for (const key of truth) {
|
|
89
|
+
if (!predicted.has(key)) fn++;
|
|
90
|
+
}
|
|
91
|
+
|
|
92
|
+
return computeMetrics(tp, fp, fn);
|
|
93
|
+
}
|
|
94
|
+
|
|
95
|
+
/**
|
|
96
|
+
* Evaluate clusters against ground-truth pairs.
|
|
97
|
+
*
|
|
98
|
+
* Expands each cluster's members into the full set of intra-cluster pairs and
|
|
99
|
+
* compares that set to the ground truth.
|
|
100
|
+
*/
|
|
101
|
+
export function evaluateClusters(
|
|
102
|
+
clusters: ReadonlyMap<number, ClusterInfo>,
|
|
103
|
+
groundTruthPairs: readonly (readonly [number, number])[],
|
|
104
|
+
_allIds: readonly number[],
|
|
105
|
+
): EvalResult {
|
|
106
|
+
const predicted = new Set<string>();
|
|
107
|
+
for (const info of clusters.values()) {
|
|
108
|
+
const members = info.members;
|
|
109
|
+
if (members.length < 2) continue;
|
|
110
|
+
for (let i = 0; i < members.length; i++) {
|
|
111
|
+
for (let j = i + 1; j < members.length; j++) {
|
|
112
|
+
predicted.add(canonicalPair(members[i]!, members[j]!));
|
|
113
|
+
}
|
|
114
|
+
}
|
|
115
|
+
}
|
|
116
|
+
|
|
117
|
+
const truth = toPairSet(groundTruthPairs);
|
|
118
|
+
|
|
119
|
+
let tp = 0;
|
|
120
|
+
let fp = 0;
|
|
121
|
+
for (const key of predicted) {
|
|
122
|
+
if (truth.has(key)) tp++;
|
|
123
|
+
else fp++;
|
|
124
|
+
}
|
|
125
|
+
let fn = 0;
|
|
126
|
+
for (const key of truth) {
|
|
127
|
+
if (!predicted.has(key)) fn++;
|
|
128
|
+
}
|
|
129
|
+
|
|
130
|
+
return computeMetrics(tp, fp, fn);
|
|
131
|
+
}
|
|
132
|
+
|
|
133
|
+
/**
|
|
134
|
+
* Extract ground truth pairs from a list of rows containing two id columns.
|
|
135
|
+
*
|
|
136
|
+
* Numeric strings are parsed to integers. Rows with missing/unparseable ids
|
|
137
|
+
* are skipped.
|
|
138
|
+
*/
|
|
139
|
+
export function loadGroundTruthPairs(
|
|
140
|
+
rows: readonly Row[],
|
|
141
|
+
idColA: string,
|
|
142
|
+
idColB: string,
|
|
143
|
+
): (readonly [number, number])[] {
|
|
144
|
+
const out: [number, number][] = [];
|
|
145
|
+
for (const row of rows) {
|
|
146
|
+
const rawA = row[idColA];
|
|
147
|
+
const rawB = row[idColB];
|
|
148
|
+
if (rawA === null || rawA === undefined) continue;
|
|
149
|
+
if (rawB === null || rawB === undefined) continue;
|
|
150
|
+
const a = typeof rawA === "number" ? rawA : Number(rawA);
|
|
151
|
+
const b = typeof rawB === "number" ? rawB : Number(rawB);
|
|
152
|
+
if (!Number.isFinite(a) || !Number.isFinite(b)) continue;
|
|
153
|
+
out.push([a, b]);
|
|
154
|
+
}
|
|
155
|
+
return out;
|
|
156
|
+
}
|