goldenmatch 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +140 -0
- package/dist/cli.cjs +6079 -0
- package/dist/cli.cjs.map +1 -0
- package/dist/cli.d.cts +1 -0
- package/dist/cli.d.ts +1 -0
- package/dist/cli.js +6076 -0
- package/dist/cli.js.map +1 -0
- package/dist/core/index.cjs +8449 -0
- package/dist/core/index.cjs.map +1 -0
- package/dist/core/index.d.cts +1972 -0
- package/dist/core/index.d.ts +1972 -0
- package/dist/core/index.js +8318 -0
- package/dist/core/index.js.map +1 -0
- package/dist/index.cjs +8449 -0
- package/dist/index.cjs.map +1 -0
- package/dist/index.d.cts +2 -0
- package/dist/index.d.ts +2 -0
- package/dist/index.js +8318 -0
- package/dist/index.js.map +1 -0
- package/dist/node/backends/score-worker.cjs +934 -0
- package/dist/node/backends/score-worker.cjs.map +1 -0
- package/dist/node/backends/score-worker.d.cts +14 -0
- package/dist/node/backends/score-worker.d.ts +14 -0
- package/dist/node/backends/score-worker.js +932 -0
- package/dist/node/backends/score-worker.js.map +1 -0
- package/dist/node/index.cjs +11430 -0
- package/dist/node/index.cjs.map +1 -0
- package/dist/node/index.d.cts +554 -0
- package/dist/node/index.d.ts +554 -0
- package/dist/node/index.js +11277 -0
- package/dist/node/index.js.map +1 -0
- package/dist/types-DhUdX5Rc.d.cts +304 -0
- package/dist/types-DhUdX5Rc.d.ts +304 -0
- package/examples/01-basic-dedupe.ts +60 -0
- package/examples/02-match-two-datasets.ts +48 -0
- package/examples/03-csv-file-pipeline.ts +62 -0
- package/examples/04-string-scoring.ts +63 -0
- package/examples/05-custom-config.ts +94 -0
- package/examples/06-probabilistic-fs.ts +72 -0
- package/examples/07-pprl-privacy.ts +76 -0
- package/examples/08-streaming.ts +79 -0
- package/examples/09-llm-scorer.ts +79 -0
- package/examples/10-explain.ts +60 -0
- package/examples/11-evaluate.ts +61 -0
- package/examples/README.md +53 -0
- package/package.json +66 -0
- package/src/cli.ts +372 -0
- package/src/core/ann-blocker.ts +593 -0
- package/src/core/api.ts +220 -0
- package/src/core/autoconfig.ts +363 -0
- package/src/core/autofix.ts +102 -0
- package/src/core/blocker.ts +655 -0
- package/src/core/cluster.ts +699 -0
- package/src/core/compare-clusters.ts +176 -0
- package/src/core/config/loader.ts +869 -0
- package/src/core/cross-encoder.ts +614 -0
- package/src/core/data.ts +430 -0
- package/src/core/domain.ts +277 -0
- package/src/core/embedder.ts +562 -0
- package/src/core/evaluate.ts +156 -0
- package/src/core/explain.ts +352 -0
- package/src/core/golden.ts +524 -0
- package/src/core/graph-er.ts +371 -0
- package/src/core/index.ts +314 -0
- package/src/core/ingest.ts +112 -0
- package/src/core/learned-blocking.ts +305 -0
- package/src/core/lineage.ts +221 -0
- package/src/core/llm/budget.ts +258 -0
- package/src/core/llm/cluster.ts +542 -0
- package/src/core/llm/scorer.ts +396 -0
- package/src/core/match-one.ts +95 -0
- package/src/core/matchkey.ts +97 -0
- package/src/core/memory/corrections.ts +179 -0
- package/src/core/memory/learner.ts +218 -0
- package/src/core/memory/store.ts +114 -0
- package/src/core/pipeline.ts +366 -0
- package/src/core/pprl/protocol.ts +216 -0
- package/src/core/probabilistic.ts +511 -0
- package/src/core/profiler.ts +212 -0
- package/src/core/quality.ts +197 -0
- package/src/core/review-queue.ts +177 -0
- package/src/core/scorer.ts +855 -0
- package/src/core/sensitivity.ts +196 -0
- package/src/core/standardize.ts +279 -0
- package/src/core/streaming.ts +128 -0
- package/src/core/transforms.ts +599 -0
- package/src/core/types.ts +570 -0
- package/src/core/validate.ts +243 -0
- package/src/index.ts +8 -0
- package/src/node/a2a/server.ts +470 -0
- package/src/node/api/server.ts +412 -0
- package/src/node/backends/duckdb.ts +130 -0
- package/src/node/backends/score-worker.ts +41 -0
- package/src/node/backends/workers.ts +212 -0
- package/src/node/config-file.ts +66 -0
- package/src/node/connectors/base.ts +57 -0
- package/src/node/connectors/bigquery.ts +61 -0
- package/src/node/connectors/databricks.ts +69 -0
- package/src/node/connectors/file.ts +350 -0
- package/src/node/connectors/hubspot.ts +62 -0
- package/src/node/connectors/index.ts +43 -0
- package/src/node/connectors/salesforce.ts +93 -0
- package/src/node/connectors/snowflake.ts +73 -0
- package/src/node/db/postgres.ts +173 -0
- package/src/node/db/sync.ts +103 -0
- package/src/node/dedupe-file.ts +156 -0
- package/src/node/index.ts +89 -0
- package/src/node/mcp/server.ts +940 -0
- package/src/node/tui/app.ts +756 -0
- package/src/node/tui/index.ts +6 -0
- package/src/node/tui/widgets.ts +128 -0
- package/tests/parity/scorer-ground-truth.test.ts +118 -0
- package/tests/smoke.test.ts +46 -0
- package/tests/unit/a2a-server.test.ts +175 -0
- package/tests/unit/ann-blocker.test.ts +117 -0
- package/tests/unit/api-server.test.ts +239 -0
- package/tests/unit/api.test.ts +77 -0
- package/tests/unit/autoconfig.test.ts +103 -0
- package/tests/unit/autofix.test.ts +71 -0
- package/tests/unit/blocker.test.ts +164 -0
- package/tests/unit/buildBlocksAsync.test.ts +63 -0
- package/tests/unit/cluster.test.ts +213 -0
- package/tests/unit/compare-clusters.test.ts +42 -0
- package/tests/unit/config-loader.test.ts +301 -0
- package/tests/unit/connectors-base.test.ts +48 -0
- package/tests/unit/cross-encoder-model.test.ts +198 -0
- package/tests/unit/cross-encoder.test.ts +173 -0
- package/tests/unit/db-connectors.test.ts +37 -0
- package/tests/unit/domain.test.ts +80 -0
- package/tests/unit/embedder.test.ts +151 -0
- package/tests/unit/evaluate.test.ts +85 -0
- package/tests/unit/explain.test.ts +73 -0
- package/tests/unit/golden.test.ts +97 -0
- package/tests/unit/graph-er.test.ts +173 -0
- package/tests/unit/hnsw-ann.test.ts +283 -0
- package/tests/unit/hubspot-connector.test.ts +118 -0
- package/tests/unit/ingest.test.ts +97 -0
- package/tests/unit/learned-blocking.test.ts +134 -0
- package/tests/unit/lineage.test.ts +135 -0
- package/tests/unit/match-one.test.ts +129 -0
- package/tests/unit/matchkey.test.ts +97 -0
- package/tests/unit/mcp-server.test.ts +183 -0
- package/tests/unit/memory.test.ts +119 -0
- package/tests/unit/pipeline.test.ts +118 -0
- package/tests/unit/pprl-protocol.test.ts +381 -0
- package/tests/unit/probabilistic.test.ts +494 -0
- package/tests/unit/profiler.test.ts +68 -0
- package/tests/unit/review-queue.test.ts +68 -0
- package/tests/unit/salesforce-connector.test.ts +148 -0
- package/tests/unit/scorer.test.ts +301 -0
- package/tests/unit/sensitivity.test.ts +154 -0
- package/tests/unit/standardize.test.ts +84 -0
- package/tests/unit/streaming.test.ts +82 -0
- package/tests/unit/transforms.test.ts +208 -0
- package/tests/unit/tui-widgets.test.ts +42 -0
- package/tests/unit/tui.test.ts +24 -0
- package/tests/unit/validate.test.ts +145 -0
- package/tests/unit/workers-parallel.test.ts +99 -0
- package/tests/unit/workers.test.ts +74 -0
- package/tsconfig.json +25 -0
- package/tsup.config.ts +37 -0
- package/vitest.config.ts +11 -0
|
@@ -0,0 +1,1972 @@
|
|
|
1
|
+
import { R as Row, g as ColumnValue, e as ClusterInfo, P as PairKey, M as MatchkeyConfig, S as ScoredPair, B as BlockResult, o as MatchkeyField, b as BlockingConfig, c as BlockingKeyConfig, d as BudgetConfig, k as GoldenRulesConfig, f as ClusterProvenance, j as GoldenFieldRule, G as GoldenMatchConfig, D as DedupeResult, a as MatchResult, L as LLMScorerConfig, Q as QualityConfig, m as LearningConfig } from '../types-DhUdX5Rc.cjs';
|
|
2
|
+
export { C as CanopyConfig, h as DedupeStats, i as DomainConfig, E as ExactMatchkey, F as FieldProvenance, I as InputConfig, l as InputFileConfig, n as MakeMatchkeyConfigInput, p as MemoryConfig, O as OutputConfig, q as ProbabilisticMatchkey, r as SortKeyField, s as StandardizationConfig, T as TransformConfig, V as VALID_SCORERS, t as VALID_STANDARDIZERS, u as VALID_STRATEGIES, v as VALID_TRANSFORMS, w as ValidationConfig, x as ValidationRuleConfig, W as WeightedMatchkey, y as getMatchkeys, z as makeBlockingConfig, A as makeConfig, H as makeGoldenRulesConfig, J as makeMatchkeyConfig, K as makeMatchkeyField, N as makeScoredPair } from '../types-DhUdX5Rc.cjs';
|
|
3
|
+
|
|
4
|
+
/**
|
|
5
|
+
* data.ts — TabularData, edge-safe Polars replacement.
|
|
6
|
+
* Wraps readonly Row[] with column operations, joins, groupBy, sampling.
|
|
7
|
+
* No Node.js imports, no `process`.
|
|
8
|
+
*/
|
|
9
|
+
|
|
10
|
+
/** Returns true for null, undefined, NaN, and null-ish string sentinels. */
|
|
11
|
+
declare function isNullish(v: unknown): v is null | undefined; // NOTE(review): the doc comment says NaN and null-ish sentinel strings also return true, yet this predicate narrows to `null | undefined` only -- confirm the guard is meant to be looser at runtime than its type.
|
|
12
|
+
/** Normalize an unknown value to ColumnValue (string | number | boolean | null). */
|
|
13
|
+
declare function toColumnValue(v: unknown): ColumnValue;
|
|
14
|
+
declare class TabularData {
|
|
15
|
+
private readonly _rows;
|
|
16
|
+
private _columnCache;
|
|
17
|
+
constructor(rows: readonly Row[]);
|
|
18
|
+
get rows(): readonly Row[];
|
|
19
|
+
get columns(): readonly string[];
|
|
20
|
+
get rowCount(): number;
|
|
21
|
+
/** Get column values with null coercion (N/A, NaN, etc. become null). */
|
|
22
|
+
column(name: string): readonly ColumnValue[];
|
|
23
|
+
/** Raw column access -- preserves original values without null coercion.
|
|
24
|
+
* Use for profiling where "N/A" should remain a string, not become null. */
|
|
25
|
+
rawColumn(name: string): readonly ColumnValue[];
|
|
26
|
+
nullCount(col: string): number;
|
|
27
|
+
dropNulls(col: string): ColumnValue[];
|
|
28
|
+
nUnique(col: string): number;
|
|
29
|
+
valueCounts(col: string): Map<ColumnValue, number>;
|
|
30
|
+
/** MUST use loop -- Math.min(...array) crashes on >65K elements. */
|
|
31
|
+
min(col: string): number | null;
|
|
32
|
+
/** MUST use loop -- Math.max(...array) crashes on >65K elements. */
|
|
33
|
+
max(col: string): number | null;
|
|
34
|
+
mean(col: string): number | null;
|
|
35
|
+
std(col: string): number | null;
|
|
36
|
+
filter(predicate: (row: Row) => boolean): TabularData;
|
|
37
|
+
map(fn: (row: Row, index: number) => Row): TabularData;
|
|
38
|
+
slice(start: number, end?: number): TabularData;
|
|
39
|
+
/** Keep only the named columns. */
|
|
40
|
+
select(cols: readonly string[]): TabularData;
|
|
41
|
+
/** Drop the named columns. */
|
|
42
|
+
drop(cols: readonly string[]): TabularData;
|
|
43
|
+
/** Return a new TabularData with an added (or replaced) column. */
|
|
44
|
+
addColumn(name: string, values: readonly ColumnValue[]): TabularData;
|
|
45
|
+
/** Add a sequential row index column (like Polars with_row_index). */
|
|
46
|
+
withRowIndex(name?: string, offset?: number): TabularData;
|
|
47
|
+
/** Group rows by a column, returning Map<stringKey, TabularData>. */
|
|
48
|
+
groupBy(key: string): Map<string, TabularData>;
|
|
49
|
+
/**
|
|
50
|
+
* Inner join with another TabularData on a shared column.
|
|
51
|
+
* Columns from `other` get a suffix to avoid collisions.
|
|
52
|
+
*/
|
|
53
|
+
join(other: TabularData, on: string, suffix?: string): TabularData;
|
|
54
|
+
/** Fisher-Yates partial shuffle with seedable PRNG. */
|
|
55
|
+
sample(n: number, seed?: number): TabularData;
|
|
56
|
+
/** Sort by a column (ascending). Nulls sort last. */
|
|
57
|
+
sortBy(col: string): TabularData;
|
|
58
|
+
/** Return rows with unique values in the given column (keeps first occurrence). */
|
|
59
|
+
unique(col: string): TabularData;
|
|
60
|
+
/** Return rows as plain dicts. */
|
|
61
|
+
toDicts(): Row[];
|
|
62
|
+
numericValues(col: string): number[];
|
|
63
|
+
stringValues(col: string): string[];
|
|
64
|
+
/** Create from an array of row dicts. */
|
|
65
|
+
static fromDicts(rows: readonly Row[]): TabularData;
|
|
66
|
+
/** Create from column-oriented data: {col: values[]}. */
|
|
67
|
+
static fromColumns(cols: Readonly<Record<string, readonly ColumnValue[]>>): TabularData;
|
|
68
|
+
}
|
|
69
|
+
|
|
70
|
+
/**
|
|
71
|
+
* transforms.ts — Pure field transform utilities.
|
|
72
|
+
* Edge-safe: no Node.js imports, no `process`.
|
|
73
|
+
*
|
|
74
|
+
* Ports goldenmatch/utils/transforms.py.
|
|
75
|
+
*/
|
|
76
|
+
/** Apply a single named transform to a value. Returns null if input is null. */
|
|
77
|
+
declare function applyTransform(value: string | null, transform: string): string | null;
|
|
78
|
+
/** Apply a chain of transforms in order. */
|
|
79
|
+
declare function applyTransforms(value: string | null, transforms: readonly string[]): string | null;
|
|
80
|
+
/**
|
|
81
|
+
* American Soundex (Robert Russell, 1918).
|
|
82
|
+
* 1. Keep first letter
|
|
83
|
+
* 2. Map consonants to digits (B/F/P/V->1, C/G/J/K/Q/S/X/Z->2, D/T->3, L->4, M/N->5, R->6)
|
|
84
|
+
* 3. Remove adjacent duplicates, vowels/H/W
|
|
85
|
+
* 4. Pad/truncate to 4 chars
|
|
86
|
+
*
|
|
87
|
+
* H and W are transparent — they do NOT reset the duplicate suppression.
|
|
88
|
+
* Vowels (A/E/I/O/U/Y) DO reset, so "Pfister" and "Jackson" work correctly.
|
|
89
|
+
*/
|
|
90
|
+
declare function soundex(value: string): string;
|
|
91
|
+
/**
|
|
92
|
+
* Simplified Metaphone.
|
|
93
|
+
* Returns a phonetic code of up to 4 characters.
|
|
94
|
+
*/
|
|
95
|
+
declare function metaphone(value: string): string;
|
|
96
|
+
|
|
97
|
+
/**
|
|
98
|
+
* cluster.ts — Union-Find clustering with MST splitting.
|
|
99
|
+
* Edge-safe: no Node.js imports, pure TypeScript only.
|
|
100
|
+
*/
|
|
101
|
+
|
|
102
|
+
/** Canonicalize a pair key: always min:max. Sole producer of branded PairKey. */
|
|
103
|
+
declare function pairKey(a: number, b: number): PairKey;
|
|
104
|
+
/** Parse a pair key back into [idA, idB]. */
|
|
105
|
+
declare function parsePairKey(key: PairKey): readonly [number, number];
|
|
106
|
+
declare class UnionFind {
|
|
107
|
+
private parent;
|
|
108
|
+
private rank;
|
|
109
|
+
/** Add element as its own root. */
|
|
110
|
+
add(x: number): void;
|
|
111
|
+
/** Batch add multiple elements. */
|
|
112
|
+
addMany(ids: readonly number[]): void;
|
|
113
|
+
/** Find root with iterative path compression. */
|
|
114
|
+
find(x: number): number;
|
|
115
|
+
/** Union by rank. */
|
|
116
|
+
union(a: number, b: number): void;
|
|
117
|
+
/** Return all clusters as an array of sets, one set per cluster. */
|
|
118
|
+
getClusters(): Set<number>[];
|
|
119
|
+
}
|
|
120
|
+
/**
|
|
121
|
+
* Build a max-weight spanning tree using Kruskal's algorithm.
|
|
122
|
+
* Returns edges as [idA, idB, score] sorted by descending weight.
|
|
123
|
+
*/
|
|
124
|
+
declare function buildMst(members: readonly number[], pairScores: ReadonlyMap<PairKey, number>): [number, number, number][];
|
|
125
|
+
/** Confidence metrics for one cluster; this is the return type of `computeClusterConfidence`. */ interface ClusterConfidence {
|
|
126
|
+
/** Minimum-edge term of the confidence formula (weight 0.4). Nullable -- presumably null when the cluster has no scored pairs; TODO confirm. */ readonly minEdge: number | null;
|
|
127
|
+
/** Average-edge term of the confidence formula (weight 0.3). Nullable -- presumably null when the cluster has no scored pairs; TODO confirm. */ readonly avgEdge: number | null;
|
|
128
|
+
/** Connectivity term of the confidence formula (weight 0.3). */ readonly connectivity: number;
|
|
129
|
+
/** NOTE(review): presumably the two record ids joined by the weakest edge -- confirm against the implementation. */ readonly bottleneckPair: readonly [number, number] | null;
|
|
130
|
+
/** Combined score; per the `computeClusterConfidence` doc comment: 0.4 * minEdge + 0.3 * avgEdge + 0.3 * connectivity. */ readonly confidence: number;
|
|
131
|
+
}
|
|
132
|
+
/**
|
|
133
|
+
* Compute confidence metrics for a cluster.
|
|
134
|
+
* confidence = 0.4 * minEdge + 0.3 * avgEdge + 0.3 * connectivity
|
|
135
|
+
*/
|
|
136
|
+
declare function computeClusterConfidence(pairScores: ReadonlyMap<PairKey, number>, size: number): ClusterConfidence;
|
|
137
|
+
/** Internal mutable cluster info used during building. */
|
|
138
|
+
interface MutableClusterInfo {
|
|
139
|
+
members: number[];
|
|
140
|
+
size: number;
|
|
141
|
+
oversized: boolean;
|
|
142
|
+
pairScores: Map<PairKey, number>;
|
|
143
|
+
confidence: number;
|
|
144
|
+
bottleneckPair: readonly [number, number] | null;
|
|
145
|
+
clusterQuality: "strong" | "weak" | "split";
|
|
146
|
+
_wasSplit?: boolean;
|
|
147
|
+
}
|
|
148
|
+
/**
|
|
149
|
+
* Split a cluster by removing the weakest MST edge.
|
|
150
|
+
* Returns sub-cluster infos.
|
|
151
|
+
*/
|
|
152
|
+
declare function splitOversizedCluster(members: readonly number[], pairScores: ReadonlyMap<PairKey, number>): MutableClusterInfo[];
|
|
153
|
+
/** Optional settings accepted as the third argument of `buildClusters`. Defaults are not visible in this declaration file. */ interface BuildClustersOptions {
|
|
154
|
+
/** NOTE(review): presumably the member count above which a cluster is treated as oversized -- confirm against the implementation. */ readonly maxClusterSize?: number;
|
|
155
|
+
/** NOTE(review): presumably the threshold used by the "weak" test described in the `buildClusters` doc comment (avg minus min) -- confirm. */ readonly weakClusterThreshold?: number;
|
|
156
|
+
/** NOTE(review): presumably toggles the MST-based auto-split of oversized clusters described in the `buildClusters` doc comment -- confirm. */ readonly autoSplit?: boolean;
|
|
157
|
+
}
|
|
158
|
+
/**
|
|
159
|
+
* Build clusters from scored pairs using Union-Find.
|
|
160
|
+
*
|
|
161
|
+
* Auto-splits oversized clusters via MST (iterative, not recursive).
|
|
162
|
+
* Assigns clusterQuality: "strong", "weak" (avg - min > weakClusterThreshold), or "split".
|
|
163
|
+
* Downgrades confidence by 0.7 for weak clusters.
|
|
164
|
+
*/
|
|
165
|
+
declare function buildClusters(pairs: readonly (readonly [number, number, number])[], allIds: readonly number[], options?: BuildClustersOptions): Map<number, ClusterInfo>;
|
|
166
|
+
/**
|
|
167
|
+
* Add a new record to existing clusters based on matches.
|
|
168
|
+
*
|
|
169
|
+
* - No matches: new singleton cluster
|
|
170
|
+
* - Single cluster match: join that cluster
|
|
171
|
+
* - Multiple cluster match: merge all matched clusters
|
|
172
|
+
*
|
|
173
|
+
* Flags oversized but does NOT auto-split. Caller should call
|
|
174
|
+
* splitOversizedCluster() if desired.
|
|
175
|
+
*/
|
|
176
|
+
declare function addToCluster(recordId: number, matches: readonly (readonly [number, number])[], clusters: Map<number, ClusterInfo>, maxClusterSize?: number): Map<number, ClusterInfo>;
|
|
177
|
+
/**
|
|
178
|
+
* Remove a record from its cluster and re-cluster remaining members.
|
|
179
|
+
* The removed record becomes a singleton.
|
|
180
|
+
*/
|
|
181
|
+
declare function unmergeRecord(recordId: number, clusters: Map<number, ClusterInfo>, threshold?: number): Map<number, ClusterInfo>;
|
|
182
|
+
/**
|
|
183
|
+
* Shatter a cluster into individual singletons.
|
|
184
|
+
* All members become their own cluster. Pair scores are discarded.
|
|
185
|
+
*/
|
|
186
|
+
declare function unmergeCluster(clusterId: number, clusters: Map<number, ClusterInfo>): Map<number, ClusterInfo>;
|
|
187
|
+
/**
|
|
188
|
+
* Get pair scores for a specific set of cluster members from all pairs.
|
|
189
|
+
* Call on-demand, not in hot path.
|
|
190
|
+
*/
|
|
191
|
+
declare function getClusterPairScores(members: readonly number[], allPairs: readonly (readonly [number, number, number])[]): Map<PairKey, number>;
|
|
192
|
+
|
|
193
|
+
/**
|
|
194
|
+
* scorer.ts — Fuzzy scoring module for GoldenMatch.
|
|
195
|
+
* Edge-safe: no Node.js imports, pure TypeScript only.
|
|
196
|
+
*
|
|
197
|
+
* Ports goldenmatch/core/scorer.py. The Python version uses `rapidfuzz`
|
|
198
|
+
* for vectorized NxN scoring. Here we implement all algorithms in pure TS.
|
|
199
|
+
*/
|
|
200
|
+
|
|
201
|
+
/** Convert unknown value to string or null. */
|
|
202
|
+
declare function asString(v: unknown): string | null;
|
|
203
|
+
/**
|
|
204
|
+
* Jaro similarity between two strings.
|
|
205
|
+
*
|
|
206
|
+
* matchWindow = floor(max(lenA, lenB) / 2) - 1
|
|
207
|
+
* Count matches (chars within window) and transpositions.
|
|
208
|
+
* jaro = (m/lenA + m/lenB + (m - t/2) / m) / 3
|
|
209
|
+
*/
|
|
210
|
+
declare function jaro(a: string, b: string): number;
|
|
211
|
+
/**
|
|
212
|
+
* Jaro-Winkler similarity.
|
|
213
|
+
* Adds a bonus for a common prefix of up to 4 characters, scaling factor 0.1.
|
|
214
|
+
*/
|
|
215
|
+
declare function jaroWinkler(a: string, b: string): number;
|
|
216
|
+
/**
|
|
217
|
+
* Levenshtein edit distance (classic DP, 2-row optimization).
|
|
218
|
+
*/
|
|
219
|
+
declare function levenshteinDistance(a: string, b: string): number;
|
|
220
|
+
/**
|
|
221
|
+
* Normalized Levenshtein similarity: 1 - distance / max(lenA, lenB).
|
|
222
|
+
*/
|
|
223
|
+
declare function levenshteinSimilarity(a: string, b: string): number;
|
|
224
|
+
/**
|
|
225
|
+
* Indel (insertion+deletion) edit distance.
|
|
226
|
+
*
|
|
227
|
+
* Like Levenshtein but without substitutions — a substitution costs 2
|
|
228
|
+
* (one delete + one insert) instead of 1. This matches the distance
|
|
229
|
+
* metric used by rapidfuzz's Indel ratio, which underlies
|
|
230
|
+
* `rapidfuzz.fuzz.token_sort_ratio` in Python.
|
|
231
|
+
*/
|
|
232
|
+
declare function indelDistance(a: string, b: string): number;
|
|
233
|
+
/**
|
|
234
|
+
* Indel normalized similarity: `1 - d_indel / (len_a + len_b)`.
|
|
235
|
+
* Matches rapidfuzz's `Indel.normalized_similarity`.
|
|
236
|
+
*/
|
|
237
|
+
declare function indelSimilarity(a: string, b: string): number;
|
|
238
|
+
/**
|
|
239
|
+
* Token sort ratio, rapidfuzz-compatible.
|
|
240
|
+
*
|
|
241
|
+
* Matches `rapidfuzz.fuzz.token_sort_ratio`:
|
|
242
|
+
* 1. Lowercase both strings.
|
|
243
|
+
* 2. Strip non-alphanumeric characters (replace with whitespace).
|
|
244
|
+
* 3. Split on whitespace, drop empties, sort tokens, rejoin with single space.
|
|
245
|
+
* 4. Compare via Indel normalized similarity (NOT Levenshtein).
|
|
246
|
+
*
|
|
247
|
+
* Python reference: for ("John Smith", "Smith Johnson") returns ~0.8696 (sorted strings "john smith" / "johnson smith": 1 - 3/23).
|
|
248
|
+
*/
|
|
249
|
+
declare function tokenSortRatio(a: string, b: string): number;
|
|
250
|
+
/**
|
|
251
|
+
* Soundex match: 1.0 if soundex codes equal, else 0.0.
|
|
252
|
+
*/
|
|
253
|
+
declare function soundexMatch(a: string, b: string): number;
|
|
254
|
+
/**
|
|
255
|
+
* Dice coefficient on two hex-encoded bloom filters.
|
|
256
|
+
* 2 * intersection / (popcount_a + popcount_b)
|
|
257
|
+
*/
|
|
258
|
+
declare function diceCoefficient(a: string, b: string): number;
|
|
259
|
+
/**
|
|
260
|
+
* Jaccard similarity on two hex-encoded bloom filters.
|
|
261
|
+
* intersection / union of bits
|
|
262
|
+
*/
|
|
263
|
+
declare function jaccardSimilarity(a: string, b: string): number;
|
|
264
|
+
/**
|
|
265
|
+
* Ensemble scorer: combines jaro_winkler, token_sort, and soundex_match * 0.8.
|
|
266
|
+
* Returns the maximum of the three component scores.
|
|
267
|
+
*/
|
|
268
|
+
declare function ensembleScore(a: string, b: string): number;
|
|
269
|
+
/**
|
|
270
|
+
* Score two field values using the specified scorer.
|
|
271
|
+
* Returns null if either value is null.
|
|
272
|
+
*/
|
|
273
|
+
declare function scoreField(valA: string | null, valB: string | null, scorer: string): number | null;
|
|
274
|
+
/**
|
|
275
|
+
* Score a pair of rows across all fields using weighted aggregation.
|
|
276
|
+
* Fields that produce null scores are excluded. If all null -> 0.0.
|
|
277
|
+
*/
|
|
278
|
+
declare function scorePair(rowA: Row, rowB: Row, fields: readonly MatchkeyField[]): number;
|
|
279
|
+
/**
|
|
280
|
+
* Build an NxN score matrix for a list of values using a scorer.
|
|
281
|
+
* Symmetric: matrix[i][j] === matrix[j][i]. Diagonal is 0.
|
|
282
|
+
*/
|
|
283
|
+
declare function scoreMatrix(values: (string | null)[], scorerName: string): number[][];
|
|
284
|
+
/**
|
|
285
|
+
* Find exact matches by grouping rows on matchkey columns.
|
|
286
|
+
* Builds a composite key from all matchkey fields (with transforms applied),
|
|
287
|
+
* groups rows sharing the same key, and returns all pairs with score 1.0.
|
|
288
|
+
*
|
|
289
|
+
* Rows must have a `__row_id__` field.
|
|
290
|
+
*/
|
|
291
|
+
declare function findExactMatches(rows: readonly Row[], mk: MatchkeyConfig): ScoredPair[];
|
|
292
|
+
/**
|
|
293
|
+
* Find fuzzy matches within a block of rows (NxN scoring).
|
|
294
|
+
*
|
|
295
|
+
* Implements early termination:
|
|
296
|
+
* - Score cheap fields (exact/soundex) first
|
|
297
|
+
* - Check if max possible score can reach threshold
|
|
298
|
+
* - Score expensive fuzzy fields only for promising pairs
|
|
299
|
+
*
|
|
300
|
+
* Rows must have a `__row_id__` field.
|
|
301
|
+
*/
|
|
302
|
+
declare function findFuzzyMatches(rows: readonly Row[], mk: MatchkeyConfig, excludePairs?: ReadonlySet<PairKey>, preScoredPairs?: readonly ScoredPair[]): ScoredPair[];
|
|
303
|
+
interface ScoreBlocksOptions {
|
|
304
|
+
/** Filter to cross-source pairs only. */
|
|
305
|
+
readonly acrossFilesOnly?: boolean;
|
|
306
|
+
/** Row ID -> source name mapping (for acrossFilesOnly). */
|
|
307
|
+
readonly sourceLookup?: ReadonlyMap<number, string>;
|
|
308
|
+
/** Target IDs for match mode — filter to target/ref cross pairs. */
|
|
309
|
+
readonly targetIds?: ReadonlySet<number>;
|
|
310
|
+
}
|
|
311
|
+
/**
|
|
312
|
+
* Score all blocks sequentially.
|
|
313
|
+
*
|
|
314
|
+
* JavaScript executes this code on a single thread, so sequential scoring is the default.
|
|
315
|
+
* For web workers or similar concurrency, the caller can partition blocks.
|
|
316
|
+
*/
|
|
317
|
+
declare function scoreBlocksSequential(blocks: readonly BlockResult[], mk: MatchkeyConfig, matchedPairs: Set<PairKey>, options?: ScoreBlocksOptions): ScoredPair[];
|
|
318
|
+
|
|
319
|
+
/**
|
|
320
|
+
* matchkey.ts — Matchkey builder for GoldenMatch-JS.
|
|
321
|
+
* Edge-safe: no `node:` imports, pure TypeScript only.
|
|
322
|
+
*
|
|
323
|
+
* Ports matchkey building from goldenmatch/core/matchkey.py.
|
|
324
|
+
* In Python this uses Polars expressions; here we work with Row arrays.
|
|
325
|
+
*/
|
|
326
|
+
|
|
327
|
+
/**
|
|
328
|
+
* Build a composite matchkey value for a single row.
|
|
329
|
+
*
|
|
330
|
+
* For each field in the matchkey config:
|
|
331
|
+
* 1. Read the raw value from the row
|
|
332
|
+
* 2. Apply the field's transform chain
|
|
333
|
+
* 3. Concatenate all parts with "||" separator
|
|
334
|
+
*
|
|
335
|
+
* Returns `null` if any field value is null/undefined or transforms to null.
|
|
336
|
+
*/
|
|
337
|
+
declare function computeMatchkeyValue(row: Row, mk: MatchkeyConfig): string | null;
|
|
338
|
+
/**
|
|
339
|
+
* Add matchkey columns to rows. For each matchkey `mk`, adds a column
|
|
340
|
+
* `__mk_{mk.name}__` with the computed matchkey value.
|
|
341
|
+
*
|
|
342
|
+
* Returns new row objects (does not mutate originals).
|
|
343
|
+
*/
|
|
344
|
+
declare function computeMatchkeys(rows: readonly Row[], matchkeys: readonly MatchkeyConfig[]): Row[];
|
|
345
|
+
/**
|
|
346
|
+
* Add `__row_id__` column as sequential integers starting from `offset`.
|
|
347
|
+
*
|
|
348
|
+
* Returns new row objects (does not mutate originals).
|
|
349
|
+
*/
|
|
350
|
+
declare function addRowIds(rows: readonly Row[], offset?: number): Row[];
|
|
351
|
+
/**
|
|
352
|
+
* Add `__source__` column with the given source name to every row.
|
|
353
|
+
*
|
|
354
|
+
* Returns new row objects (does not mutate originals).
|
|
355
|
+
*/
|
|
356
|
+
declare function addSourceColumn(rows: readonly Row[], sourceName: string): Row[];
|
|
357
|
+
|
|
358
|
+
/**
|
|
359
|
+
* standardize.ts — Data standardization for GoldenMatch-JS.
|
|
360
|
+
* Edge-safe: no `node:` imports, pure TypeScript only.
|
|
361
|
+
*
|
|
362
|
+
* Ports standardization from goldenmatch/core/standardize.py.
|
|
363
|
+
* These are data cleaning transforms applied to columns before matching.
|
|
364
|
+
*/
|
|
365
|
+
|
|
366
|
+
/**
|
|
367
|
+
* Apply a named standardizer to a string value.
|
|
368
|
+
*
|
|
369
|
+
* @throws Error if the standardizer name is not recognized.
|
|
370
|
+
*/
|
|
371
|
+
declare function applyStandardizer(value: string, name: string): string;
|
|
372
|
+
/**
|
|
373
|
+
* Apply standardization rules to rows.
|
|
374
|
+
*
|
|
375
|
+
* `rules` maps column names to arrays of standardizer names that are
|
|
376
|
+
* applied in sequence. For example:
|
|
377
|
+
*
|
|
378
|
+
* ```ts
|
|
379
|
+
* applyStandardization(rows, {
|
|
380
|
+
* email: ["email"],
|
|
381
|
+
* first_name: ["strip", "name_proper"],
|
|
382
|
+
* phone: ["phone"],
|
|
383
|
+
* });
|
|
384
|
+
* ```
|
|
385
|
+
*
|
|
386
|
+
* Returns new row objects (does not mutate originals).
|
|
387
|
+
* Null/undefined column values are skipped (left as-is).
|
|
388
|
+
*/
|
|
389
|
+
declare function applyStandardization(rows: readonly Row[], rules: Readonly<Record<string, readonly string[]>>): Row[];
|
|
390
|
+
|
|
391
|
+
/**
|
|
392
|
+
* blocker.ts — Groups records into blocks for pairwise comparison.
|
|
393
|
+
*
|
|
394
|
+
* Edge-safe: no Node.js imports. Pure TypeScript only.
|
|
395
|
+
*
|
|
396
|
+
* Ports `goldenmatch/core/blocker.py`.
|
|
397
|
+
*/
|
|
398
|
+
|
|
399
|
+
/**
|
|
400
|
+
* Group rows by blocking key. Skip blocks with fewer than 2 rows.
|
|
401
|
+
* Handle oversized blocks per `config.skipOversized`.
|
|
402
|
+
*/
|
|
403
|
+
declare function buildStaticBlocks(rows: readonly Row[], config: BlockingConfig): BlockResult[];
|
|
404
|
+
/**
|
|
405
|
+
* Run multiple blocking passes using `config.passes`.
|
|
406
|
+
*
|
|
407
|
+
* Each pass uses a different `BlockingKeyConfig`. Blocks are deduplicated
|
|
408
|
+
* by block key so each unique key appears only once.
|
|
409
|
+
*/
|
|
410
|
+
declare function buildMultiPassBlocks(rows: readonly Row[], config: BlockingConfig): BlockResult[];
|
|
411
|
+
/**
|
|
412
|
+
* Build static blocks first, then auto-split any oversized blocks
|
|
413
|
+
* using the highest-cardinality column.
|
|
414
|
+
*
|
|
415
|
+
* If `config.subBlockKeys` is configured, uses recursive sub-blocking
|
|
416
|
+
* instead of auto-split.
|
|
417
|
+
*/
|
|
418
|
+
declare function buildAdaptiveBlocks(rows: readonly Row[], config: BlockingConfig): BlockResult[];
|
|
419
|
+
/**
|
|
420
|
+
* Evaluate candidate blocking keys and select the one with the smallest
|
|
421
|
+
* max group size while maintaining >= 50% coverage.
|
|
422
|
+
*
|
|
423
|
+
* Coverage = fraction of rows that produce a non-null block key.
|
|
424
|
+
* If only one key is provided, returns it directly.
|
|
425
|
+
*/
|
|
426
|
+
declare function selectBestBlockingKey(rows: readonly Row[], keys: readonly BlockingKeyConfig[], maxBlockSize?: number): BlockingKeyConfig;
|
|
427
|
+
/**
|
|
428
|
+
* Build blocks from rows based on blocking configuration.
|
|
429
|
+
*
|
|
430
|
+
* Routes by `config.strategy`:
|
|
431
|
+
* - `"static"` -- hash-based grouping on blocking keys
|
|
432
|
+
* - `"multi_pass"` -- multiple passes with deduplication
|
|
433
|
+
* - `"sorted_neighborhood"` -- sliding window over sorted data
|
|
434
|
+
* - `"adaptive"` -- static + auto-split for oversized blocks
|
|
435
|
+
* - `"ann"`, `"ann_pairs"` -- async only, use `buildBlocksAsync`; `"canopy"`, `"learned"` -- not yet implemented
|
|
436
|
+
*
|
|
437
|
+
* If `config.autoSelect` is true and multiple keys are configured,
|
|
438
|
+
* automatically selects the best key before blocking.
|
|
439
|
+
*/
|
|
440
|
+
declare function buildBlocks(rows: readonly Row[], config: BlockingConfig): BlockResult[];
|
|
441
|
+
/**
|
|
442
|
+
* Async variant of `buildBlocks`. Required for `"ann"` and `"ann_pairs"`
|
|
443
|
+
* strategies which need to fetch embeddings via HTTP. All other strategies
|
|
444
|
+
* delegate to the synchronous `buildBlocks` path.
|
|
445
|
+
*/
|
|
446
|
+
declare function buildBlocksAsync(rows: readonly Row[], config: BlockingConfig): Promise<BlockResult[]>;
|
|
447
|
+
|
|
448
|
+
/**
|
|
449
|
+
* embedder.ts — Embedding API client (OpenAI / Vertex AI / Voyage).
|
|
450
|
+
*
|
|
451
|
+
* Edge-safe: uses global `fetch()` only. No `node:` imports.
|
|
452
|
+
*
|
|
453
|
+
* Ports `goldenmatch/core/embedder.py` and `goldenmatch/core/vertex_embedder.py`,
|
|
454
|
+
* but replaces sentence-transformers / google-cloud-aiplatform with HTTP calls
|
|
455
|
+
* so the module runs in Edge / Workers / browser-like runtimes.
|
|
456
|
+
*/
|
|
457
|
+
type EmbedderProvider = "openai" | "vertex" | "voyage";
|
|
458
|
+
interface EmbedderOptions {
|
|
459
|
+
readonly provider?: EmbedderProvider;
|
|
460
|
+
readonly model?: string;
|
|
461
|
+
readonly apiKey?: string;
|
|
462
|
+
/** Override the default endpoint URL. */
|
|
463
|
+
readonly endpoint?: string;
|
|
464
|
+
/** Batch size for API calls (default 64 for OpenAI, 50 for Vertex). */
|
|
465
|
+
readonly batchSize?: number;
|
|
466
|
+
/** Cache embeddings by text hash within an Embedder instance. */
|
|
467
|
+
readonly cache?: boolean;
|
|
468
|
+
/**
|
|
469
|
+
* For OpenAI text-embedding-3+ this requests a smaller embedding
|
|
470
|
+
* dimension (e.g. 512 instead of 1536).
|
|
471
|
+
*/
|
|
472
|
+
readonly dimensions?: number;
|
|
473
|
+
/** GCP project ID (required for Vertex). */
|
|
474
|
+
readonly project?: string;
|
|
475
|
+
/** GCP region (Vertex). Default: us-central1. */
|
|
476
|
+
readonly location?: string;
|
|
477
|
+
/** Pre-fetched OAuth bearer token for Vertex. */
|
|
478
|
+
readonly bearerToken?: string;
|
|
479
|
+
/** Maximum HTTP retries for transient failures (default 3). */
|
|
480
|
+
readonly maxRetries?: number;
|
|
481
|
+
}
|
|
482
|
+
/** Value resolved by `Embedder.embedBatch`. */ interface EmbeddingResult {
|
|
483
|
+
/** NOTE(review): presumably one vector per input text, in input order — confirm. */ readonly embeddings: readonly Float32Array[];
|
|
484
|
+
/** NOTE(review): presumably the embedding model that produced the vectors — confirm. */ readonly model: string;
|
|
485
|
+
/** NOTE(review): presumably the total tokens consumed by the underlying API calls — confirm. */ readonly tokensUsed: number;
|
|
486
|
+
}
|
|
487
|
+
/** Error subclass carrying an optional numeric `status` and string `body` next to the message. NOTE(review): presumably the HTTP status and response body of a failed embedding request — confirm. */ declare class EmbedderError extends Error {
|
|
488
|
+
readonly status?: number | undefined;
|
|
489
|
+
readonly body?: string | undefined;
|
|
490
|
+
/** Only `message` is required; `status` and `body` may be omitted. */ constructor(message: string, status?: number | undefined, body?: string | undefined);
|
|
491
|
+
}
|
|
492
|
+
declare class Embedder {
|
|
493
|
+
private readonly options;
|
|
494
|
+
private readonly cacheMap;
|
|
495
|
+
private readonly provider;
|
|
496
|
+
private readonly model;
|
|
497
|
+
private readonly batchSize;
|
|
498
|
+
private readonly maxRetries;
|
|
499
|
+
private readonly cacheEnabled;
|
|
500
|
+
constructor(options?: EmbedderOptions);
|
|
501
|
+
/** Embed a batch of texts in one or more API calls. */
|
|
502
|
+
embedBatch(texts: readonly string[]): Promise<EmbeddingResult>;
|
|
503
|
+
/** Embed a single text. */
|
|
504
|
+
embedOne(text: string): Promise<Float32Array>;
|
|
505
|
+
/**
|
|
506
|
+
* Embed a column of (possibly null) values. Null/empty get a zero vector.
|
|
507
|
+
* Identical text values are de-duplicated automatically.
|
|
508
|
+
*/
|
|
509
|
+
embedColumn(values: readonly (string | null | undefined)[], _cacheKey?: string): Promise<readonly Float32Array[]>;
|
|
510
|
+
cosineSimilarity(a: Float32Array, b: Float32Array): number;
|
|
511
|
+
cosineSimilarityMatrix(embeddings: readonly Float32Array[]): number[][];
|
|
512
|
+
private firstDim;
|
|
513
|
+
private callProvider;
|
|
514
|
+
private callOpenAI;
|
|
515
|
+
private callVertex;
|
|
516
|
+
private callVoyage;
|
|
517
|
+
}
|
|
518
|
+
/**
|
|
519
|
+
* Return a cached Embedder instance keyed by provider+model.
|
|
520
|
+
* Pass a string to use a model name with default provider, or full options.
|
|
521
|
+
*/
|
|
522
|
+
declare function getEmbedder(modelOrOptions?: string | EmbedderOptions): Embedder;
|
|
523
|
+
|
|
524
|
+
/**
|
|
525
|
+
* ann-blocker.ts — Approximate nearest neighbour blocking.
|
|
526
|
+
*
|
|
527
|
+
* Edge-safe: no `node:` imports, no FAISS. Implements a brute-force kNN
|
|
528
|
+
* (O(n^2)) which is appropriate for <= ~10K records. Embeddings are
|
|
529
|
+
* fetched via `getEmbedder()` which uses HTTP `fetch()`.
|
|
530
|
+
*
|
|
531
|
+
* Ports `goldenmatch/core/ann_blocker.py`.
|
|
532
|
+
*/
|
|
533
|
+
|
|
534
|
+
/** Options accepted by the `ANNBlocker` constructor; extended by `CreateANNBlockerOptions`. */ interface ANNBlockerOptions {
|
|
535
|
+
/** Upper bound on the neighbours returned per query embedding (see `ANNBlocker.query`). NOTE(review): default not visible here. */ readonly topK?: number;
|
|
536
|
+
/** NOTE(review): presumably the metric used to rank neighbours; default not visible here — confirm. */ readonly metric?: "cosine" | "euclidean";
|
|
537
|
+
}
|
|
538
|
+
interface BuildANNOptions {
|
|
539
|
+
readonly topK?: number;
|
|
540
|
+
readonly model?: string;
|
|
541
|
+
readonly apiKey?: string;
|
|
542
|
+
readonly provider?: EmbedderOptions["provider"];
|
|
543
|
+
/** Row identifier column (default `__row_id__`). */
|
|
544
|
+
readonly idColumn?: string;
|
|
545
|
+
/** Maximum block size produced by Union-Find grouping. */
|
|
546
|
+
readonly maxBlockSize?: number;
|
|
547
|
+
/** Use hnswlib-node fast-path when available (falls back to brute-force). */
|
|
548
|
+
readonly useHNSW?: boolean;
|
|
549
|
+
}
|
|
550
|
+
/**
|
|
551
|
+
* Minimal shape of the `hnswlib-node` module that we rely on. The caller
|
|
552
|
+
* passes in the loaded module; we deliberately keep the surface tiny so we
|
|
553
|
+
* don't hard-depend on its types.
|
|
554
|
+
*/
|
|
555
|
+
interface HNSWModule {
|
|
556
|
+
readonly HierarchicalNSW: new (metric: string, dim: number) => HNSWIndexLike;
|
|
557
|
+
}
|
|
558
|
+
interface HNSWIndexLike {
|
|
559
|
+
initIndex(maxElements: number, M?: number, efConstruction?: number, randomSeed?: number): void;
|
|
560
|
+
setEf(ef: number): void;
|
|
561
|
+
addPoint(vector: number[] | Float32Array, labelId: number): void;
|
|
562
|
+
searchKnn(query: number[] | Float32Array, k: number): {
|
|
563
|
+
distances: number[];
|
|
564
|
+
neighbors: number[];
|
|
565
|
+
};
|
|
566
|
+
}
|
|
567
|
+
interface HNSWOptions {
|
|
568
|
+
readonly hnswModule: HNSWModule;
|
|
569
|
+
readonly topK?: number;
|
|
570
|
+
readonly metric?: "cosine" | "euclidean";
|
|
571
|
+
readonly maxElements?: number;
|
|
572
|
+
readonly M?: number;
|
|
573
|
+
readonly efConstruction?: number;
|
|
574
|
+
readonly efSearch?: number;
|
|
575
|
+
}
|
|
576
|
+
/** Shared interface so `ANNBlocker` and `HNSWANNBlocker` are interchangeable. */
|
|
577
|
+
interface ANNBlockerBase {
|
|
578
|
+
buildIndex(embeddings: readonly Float32Array[]): void;
|
|
579
|
+
addToIndex(embedding: Float32Array): number;
|
|
580
|
+
query(queryEmbeddings: readonly Float32Array[]): Array<[number, number]>;
|
|
581
|
+
queryWithScores(queryEmbeddings: readonly Float32Array[]): Array<[number, number, number]>;
|
|
582
|
+
queryOne(queryEmbedding: Float32Array): Array<[number, number]>;
|
|
583
|
+
readonly indexSize: number;
|
|
584
|
+
}
|
|
585
|
+
/** NOTE(review): presumably the cosine similarity of `a` and `b`; the body is not visible in this declaration — confirm zero-vector and length-mismatch handling. */ declare function cosineSim(a: Float32Array, b: Float32Array): number;
|
|
586
|
+
/** NOTE(review): presumably the Euclidean distance between `a` and `b`; the body is not visible in this declaration — confirm length-mismatch handling. */ declare function euclideanDist(a: Float32Array, b: Float32Array): number;
|
|
587
|
+
declare class ANNBlocker implements ANNBlockerBase {
|
|
588
|
+
private embeddings;
|
|
589
|
+
private readonly topK;
|
|
590
|
+
private readonly metric;
|
|
591
|
+
constructor(options?: ANNBlockerOptions);
|
|
592
|
+
/** Replace the index with a fresh set of embeddings. */
|
|
593
|
+
buildIndex(embeddings: readonly Float32Array[]): void;
|
|
594
|
+
/** Number of vectors currently in the index. */
|
|
595
|
+
get indexSize(): number;
|
|
596
|
+
/** Append a single embedding; returns its position. */
|
|
597
|
+
addToIndex(embedding: Float32Array): number;
|
|
598
|
+
/**
|
|
599
|
+
* For each query embedding, return up to topK (queryIdx, indexIdx) pairs.
|
|
600
|
+
* Self-matches (same index when queries == embeddings) are excluded only
|
|
601
|
+
* when the underlying object identity matches; otherwise the caller is
|
|
602
|
+
* responsible for filtering self-pairs.
|
|
603
|
+
*
|
|
604
|
+
* Pairs are canonicalised so the lower index is always first when querying
|
|
605
|
+
* against the same index population (queryIdx === indexIdx case removed).
|
|
606
|
+
*/
|
|
607
|
+
query(queryEmbeddings: readonly Float32Array[]): Array<[number, number]>;
|
|
608
|
+
/** Same as `query` but also returns the similarity score for each pair. */
|
|
609
|
+
queryWithScores(queryEmbeddings: readonly Float32Array[]): Array<[number, number, number]>;
|
|
610
|
+
/** Top-K matches for a single query. Returns (neighborIdx, score). */
|
|
611
|
+
queryOne(queryEmbedding: Float32Array): Array<[number, number]>;
|
|
612
|
+
private topKFor;
|
|
613
|
+
}
|
|
614
|
+
declare class HNSWANNBlocker implements ANNBlockerBase {
|
|
615
|
+
private index;
|
|
616
|
+
private count;
|
|
617
|
+
private readonly opts;
|
|
618
|
+
private readonly topK;
|
|
619
|
+
private readonly metric;
|
|
620
|
+
constructor(opts: HNSWOptions);
|
|
621
|
+
get indexSize(): number;
|
|
622
|
+
buildIndex(embeddings: readonly Float32Array[]): void;
|
|
623
|
+
addToIndex(embedding: Float32Array): number;
|
|
624
|
+
query(queryEmbeddings: readonly Float32Array[]): Array<[number, number]>;
|
|
625
|
+
queryWithScores(queryEmbeddings: readonly Float32Array[]): Array<[number, number, number]>;
|
|
626
|
+
queryOne(queryEmbedding: Float32Array): Array<[number, number]>;
|
|
627
|
+
}
|
|
628
|
+
interface CreateANNBlockerOptions extends ANNBlockerOptions {
|
|
629
|
+
/** Attempt to use the hnswlib-node fast-path. */
|
|
630
|
+
readonly useHNSW?: boolean;
|
|
631
|
+
/** Pre-loaded hnswlib-node module (skips dynamic import). */
|
|
632
|
+
readonly hnswModule?: HNSWModule;
|
|
633
|
+
/** Additional HNSW tuning knobs. Ignored when falling back to brute-force. */
|
|
634
|
+
readonly maxElements?: number;
|
|
635
|
+
readonly M?: number;
|
|
636
|
+
readonly efConstruction?: number;
|
|
637
|
+
readonly efSearch?: number;
|
|
638
|
+
/**
|
|
639
|
+
* Override the warning sink when the fast-path is unavailable. Defaults to
|
|
640
|
+
* `console.warn`. Tests pass a spy here.
|
|
641
|
+
*/
|
|
642
|
+
readonly onFallbackWarning?: (message: string) => void;
|
|
643
|
+
}
|
|
644
|
+
/**
|
|
645
|
+
* Build an ANN blocker, preferring the `hnswlib-node` fast-path when
|
|
646
|
+
* `useHNSW` is `true` and the module can be loaded. Falls back to the
|
|
647
|
+
* brute-force `ANNBlocker` when the module is missing (e.g. edge runtime,
|
|
648
|
+
* peer dep not installed) and emits a single warning.
|
|
649
|
+
*/
|
|
650
|
+
declare function createANNBlocker(options?: CreateANNBlockerOptions): Promise<ANNBlockerBase>;
|
|
651
|
+
/**
|
|
652
|
+
* Embed one column, query top-K neighbours, and group connected pairs
|
|
653
|
+
* into micro-blocks via Union-Find.
|
|
654
|
+
*/
|
|
655
|
+
declare function buildANNBlocks(rows: readonly Row[], annColumn: string, options?: BuildANNOptions): Promise<BlockResult[]>;
|
|
656
|
+
/**
|
|
657
|
+
* Variant that returns one BlockResult containing every row plus
|
|
658
|
+
* pre-scored pairs derived from ANN cosine similarity. Useful when the
|
|
659
|
+
* scorer should reuse the embedding-based scores instead of recomputing.
|
|
660
|
+
*/
|
|
661
|
+
declare function buildANNPairBlocks(rows: readonly Row[], annColumn: string, options?: BuildANNOptions): Promise<BlockResult[]>;
|
|
662
|
+
|
|
663
|
+
/**
|
|
664
|
+
* cross-encoder.ts — LLM-based pair reranking ("cross-encoder lite").
|
|
665
|
+
*
|
|
666
|
+
* The Python original uses ONNX/sentence-transformers cross-encoders, which need
|
|
667
|
+
* native deps. This edge-safe variant performs zero-shot reranking by asking
|
|
668
|
+
* an LLM (OpenAI / Anthropic) for a 0..1 match score on borderline pairs.
|
|
669
|
+
*
|
|
670
|
+
* - Borderline pairs are identified by `band` around the matchkey threshold
|
|
671
|
+
* and/or top-N highest fuzzy scores below the auto-accept cutoff.
|
|
672
|
+
* - The combined score is `0.5 * original + 0.5 * rerank` by default.
|
|
673
|
+
* - Budget tracking uses BudgetTracker; on any HTTP failure we degrade to
|
|
674
|
+
* the original score for that pair.
|
|
675
|
+
*
|
|
676
|
+
* Edge-safe: uses global `fetch()` only.
|
|
677
|
+
*/
|
|
678
|
+
|
|
679
|
+
/** LLM vendors accepted by `CrossEncoderOptions.provider`. */ type CrossEncoderProvider = "openai" | "anthropic";
|
|
680
|
+
/** Reranker backends accepted by `CrossEncoderOptions.reranker`, whose doc names `"llm"` as the default. */ type CrossEncoderReranker = "llm" | "cross-encoder";
|
|
681
|
+
interface CrossEncoderModelOptions {
|
|
682
|
+
/** HuggingFace model id. Default "Xenova/ms-marco-MiniLM-L-6-v2". */
|
|
683
|
+
readonly model?: string;
|
|
684
|
+
/** Execution device. Default "cpu". */
|
|
685
|
+
readonly device?: "cpu" | "webgpu";
|
|
686
|
+
/** Use quantized weights (q8). Default true. */
|
|
687
|
+
readonly quantized?: boolean;
|
|
688
|
+
}
|
|
689
|
+
interface CrossEncoderOptions {
|
|
690
|
+
/**
|
|
691
|
+
* Reranker backend. `"llm"` (default) uses OpenAI/Anthropic.
|
|
692
|
+
* `"cross-encoder"` loads `@huggingface/transformers` and runs a real
|
|
693
|
+
* ONNX cross-encoder model locally; falls back to LLM on load/inference
|
|
694
|
+
* failure.
|
|
695
|
+
*/
|
|
696
|
+
readonly reranker?: CrossEncoderReranker;
|
|
697
|
+
readonly provider?: CrossEncoderProvider;
|
|
698
|
+
readonly model?: string;
|
|
699
|
+
readonly apiKey?: string;
|
|
700
|
+
/** Device for cross-encoder model (when reranker="cross-encoder"). */
|
|
701
|
+
readonly device?: "cpu" | "webgpu";
|
|
702
|
+
/** Use quantized cross-encoder weights (q8). Default true. */
|
|
703
|
+
readonly quantized?: boolean;
|
|
704
|
+
/** Re-rank pairs scoring within `band` of `mk.threshold` (default 0.1). */
|
|
705
|
+
readonly band?: number;
|
|
706
|
+
/** Maximum number of pairs to rerank, ranked highest to lowest. */
|
|
707
|
+
readonly topN?: number;
|
|
708
|
+
/** Weight given to the LLM rerank vs the original score. Default 0.5. */
|
|
709
|
+
readonly rerankWeight?: number;
|
|
710
|
+
/** Budget cap shared with regular LLM scoring. */
|
|
711
|
+
readonly budget?: BudgetConfig;
|
|
712
|
+
/** Optional override for retry attempts. Default 2. */
|
|
713
|
+
readonly maxRetries?: number;
|
|
714
|
+
}
|
|
715
|
+
/** Error subclass carrying a numeric `status`. NOTE(review): presumably the HTTP status of a failed rerank request — confirm. */ declare class CrossEncoderHttpError extends Error {
|
|
716
|
+
readonly status: number;
|
|
717
|
+
/** Argument order is `status` first, then `message` — the reverse of `EmbedderError`. */ constructor(status: number, message: string);
|
|
718
|
+
}
|
|
719
|
+
/**
|
|
720
|
+
* Optional local cross-encoder backed by @huggingface/transformers (ONNX).
|
|
721
|
+
*
|
|
722
|
+
* Kept optional so goldenmatch-js stays edge-safe / zero-deps by default.
|
|
723
|
+
* The peer dependency must be installed explicitly:
|
|
724
|
+
* npm install @huggingface/transformers
|
|
725
|
+
*
|
|
726
|
+
* Typical usage is indirect: pass `reranker: "cross-encoder"` to
|
|
727
|
+
* `rerankPair` / `rerankTopPairs` and a shared model instance is cached.
|
|
728
|
+
*/
|
|
729
|
+
declare class CrossEncoderModel {
|
|
730
|
+
private readonly options;
|
|
731
|
+
private pipelineFn;
|
|
732
|
+
private loading;
|
|
733
|
+
constructor(options?: CrossEncoderModelOptions);
|
|
734
|
+
private ensureLoaded;
|
|
735
|
+
/** Score a single text pair. Returns a [0,1] relevance probability. */
|
|
736
|
+
score(textA: string, textB: string): Promise<number>;
|
|
737
|
+
/**
|
|
738
|
+
* Score a batch of text pairs. Currently calls `score` serially —
|
|
739
|
+
* transformers.js v3 batching APIs vary across versions, so we stay
|
|
740
|
+
* conservative. Still avoids any LLM HTTP round-trips.
|
|
741
|
+
*/
|
|
742
|
+
scoreBatch(pairs: ReadonlyArray<readonly [string, string]>): Promise<number[]>;
|
|
743
|
+
}
|
|
744
|
+
/** Test hook: reset the cached model instance. */
|
|
745
|
+
declare function _resetCrossEncoderModelCache(): void;
|
|
746
|
+
/**
|
|
747
|
+
* Ask the LLM for a single 0..1 match score for two rows.
|
|
748
|
+
*
|
|
749
|
+
* Returns `NaN` when the call fails or the response is unparseable so
|
|
750
|
+
* callers can fall back to the original score.
|
|
751
|
+
*/
|
|
752
|
+
declare function rerankPair(rowA: Row, rowB: Row, fields: readonly string[], options?: CrossEncoderOptions): Promise<number>;
|
|
753
|
+
/**
|
|
754
|
+
* Rerank borderline pairs via LLM. Pairs outside the borderline band are
|
|
755
|
+
* returned unchanged. Pairs the LLM can't score (HTTP error, parse fail,
|
|
756
|
+
* budget exhausted) keep their original score.
|
|
757
|
+
*
|
|
758
|
+
* Combine rule: `final = (1 - w) * original + w * rerank`, with `w = rerankWeight`.
|
|
759
|
+
*
|
|
760
|
+
* Pairs whose final score falls below `mk.threshold` are dropped from the
|
|
761
|
+
* result, matching the Python "rerank then re-filter" behaviour.
|
|
762
|
+
*/
|
|
763
|
+
declare function rerankTopPairs(pairs: readonly ScoredPair[], rows: readonly Row[], mk: MatchkeyConfig, options?: CrossEncoderOptions): Promise<readonly ScoredPair[]>;
|
|
764
|
+
|
|
765
|
+
/**
|
|
766
|
+
* golden.ts — Golden record builder with per-field merge strategies.
|
|
767
|
+
* Edge-safe: no Node.js imports, pure TypeScript only.
|
|
768
|
+
*/
|
|
769
|
+
|
|
770
|
+
/** Return shape of `mergeField`. */ interface MergeFieldResult {
|
|
771
|
+
/** The merged value selected by the strategy. */ readonly value: unknown;
|
|
772
|
+
/** NOTE(review): presumably a 0..1 confidence in the chosen value — confirm range. */ readonly confidence: number;
|
|
773
|
+
/** NOTE(review): presumably the index into `values` of the winning entry, `null` when no single entry was chosen — confirm. */ readonly sourceIndex: number | null;
|
|
774
|
+
}
|
|
775
|
+
interface MergeFieldOptions {
|
|
776
|
+
readonly sources?: readonly string[];
|
|
777
|
+
readonly dates?: readonly unknown[];
|
|
778
|
+
readonly qualityWeights?: readonly number[];
|
|
779
|
+
}
|
|
780
|
+
/**
|
|
781
|
+
* Merge a list of values using the given strategy.
|
|
782
|
+
*
|
|
783
|
+
* Strategies:
|
|
784
|
+
* - most_complete: pick longest string value; tie-break by quality weight
|
|
785
|
+
* - majority_vote: pick most frequent value; weighted by quality if available
|
|
786
|
+
* - source_priority: pick first non-null from priority list
|
|
787
|
+
* - most_recent: pick value with most recent date
|
|
788
|
+
* - first_non_null: pick first non-null; prefer highest quality weight
|
|
789
|
+
*/
|
|
790
|
+
declare function mergeField(values: readonly unknown[], rule: GoldenFieldRule, options?: MergeFieldOptions): MergeFieldResult;
|
|
791
|
+
interface GoldenRecord {
|
|
792
|
+
readonly fields: Readonly<Record<string, {
|
|
793
|
+
value: unknown;
|
|
794
|
+
confidence: number;
|
|
795
|
+
}>>;
|
|
796
|
+
readonly goldenConfidence: number;
|
|
797
|
+
}
|
|
798
|
+
/**
|
|
799
|
+
* Build a golden record from cluster rows.
|
|
800
|
+
*
|
|
801
|
+
* @param clusterRows - Array of row objects belonging to one cluster.
|
|
802
|
+
* @param rules - Golden rules config with default strategy and field rules.
|
|
803
|
+
* @param qualityScores - Optional map of `"rowId:column"` -> quality score.
|
|
804
|
+
*/
|
|
805
|
+
declare function buildGoldenRecord(clusterRows: readonly Row[], rules: GoldenRulesConfig, qualityScores?: ReadonlyMap<string, number>): GoldenRecord;
|
|
806
|
+
interface GoldenRecordWithProvenanceResult {
|
|
807
|
+
readonly goldenRecords: readonly (Row & {
|
|
808
|
+
__cluster_id__: number;
|
|
809
|
+
__golden_confidence__: number;
|
|
810
|
+
})[];
|
|
811
|
+
readonly provenance: readonly ClusterProvenance[];
|
|
812
|
+
}
|
|
813
|
+
/**
|
|
814
|
+
* Build golden records with full field-level provenance tracking.
|
|
815
|
+
*
|
|
816
|
+
* @param allRows - All rows with `__cluster_id__` and `__row_id__` columns.
|
|
817
|
+
* @param rules - Golden rules config.
|
|
818
|
+
* @param clusters - Cluster map from buildClusters.
|
|
819
|
+
* @param qualityScores - Optional `"rowId:column"` -> quality score map.
|
|
820
|
+
*/
|
|
821
|
+
declare function buildGoldenRecordWithProvenance(allRows: readonly Row[], rules: GoldenRulesConfig, clusters: ReadonlyMap<number, ClusterInfo>, qualityScores?: ReadonlyMap<string, number>): GoldenRecordWithProvenanceResult;
|
|
822
|
+
|
|
823
|
+
/**
|
|
824
|
+
* pipeline.ts — Core pipeline orchestrator for GoldenMatch-JS.
|
|
825
|
+
* Edge-safe: no `node:` imports, pure TypeScript only.
|
|
826
|
+
*
|
|
827
|
+
* Ports goldenmatch/core/pipeline.py.
|
|
828
|
+
* Chains: standardize -> matchkeys -> block -> score -> cluster -> golden.
|
|
829
|
+
*/
|
|
830
|
+
|
|
831
|
+
interface DedupeOptions$1 {
|
|
832
|
+
readonly outputGolden?: boolean;
|
|
833
|
+
readonly outputReport?: boolean;
|
|
834
|
+
readonly acrossFilesOnly?: boolean;
|
|
835
|
+
}
|
|
836
|
+
/**
|
|
837
|
+
* Run the full deduplication pipeline.
|
|
838
|
+
*
|
|
839
|
+
* Steps:
|
|
840
|
+
* 1. Add __row_id__ and __source__ if not present
|
|
841
|
+
* 2. Apply standardization
|
|
842
|
+
* 3. Compute matchkeys
|
|
843
|
+
* 4. Phase 1: Exact matchkeys (hash-based grouping)
|
|
844
|
+
* 5. Phase 2: Fuzzy matchkeys (block + score)
|
|
845
|
+
* 6. Phase 3: Cluster (Union-Find with MST splitting)
|
|
846
|
+
* 7. Phase 4: Build golden records for multi-member clusters
|
|
847
|
+
* 8. Classify dupes vs unique
|
|
848
|
+
* 9. Compute stats
|
|
849
|
+
* 10. Return DedupeResult
|
|
850
|
+
*/
|
|
851
|
+
declare function runDedupePipeline(rows: readonly Row[], config: GoldenMatchConfig, options?: DedupeOptions$1): DedupeResult;
|
|
852
|
+
/**
|
|
853
|
+
* Run the match pipeline: match target rows against reference rows.
|
|
854
|
+
*
|
|
855
|
+
* - Assigns __row_id__ with offset for reference rows
|
|
856
|
+
* - Assigns __source__ ("target" / "reference")
|
|
857
|
+
* - Runs same pipeline but filters to cross-source pairs only
|
|
858
|
+
*/
|
|
859
|
+
declare function runMatchPipeline(targetRows: readonly Row[], referenceRows: readonly Row[], config: GoldenMatchConfig): MatchResult;
|
|
860
|
+
|
|
861
|
+
/**
|
|
862
|
+
* api.ts — High-level API functions wrapping the pipeline.
|
|
863
|
+
* Edge-safe: no `node:` imports, pure TypeScript only.
|
|
864
|
+
*
|
|
865
|
+
* Ports goldenmatch/_api.py convenience functions.
|
|
866
|
+
*/
|
|
867
|
+
|
|
868
|
+
interface DedupeOptions {
|
|
869
|
+
/** Full config object -- takes precedence over shorthand options. */
|
|
870
|
+
readonly config?: GoldenMatchConfig;
|
|
871
|
+
/** Columns for exact matching (creates one exact matchkey per column). */
|
|
872
|
+
readonly exact?: readonly string[];
|
|
873
|
+
/** Columns for fuzzy matching with per-field thresholds. */
|
|
874
|
+
readonly fuzzy?: Readonly<Record<string, number>>;
|
|
875
|
+
/** Blocking key columns (lowercase transform applied). */
|
|
876
|
+
readonly blocking?: readonly string[];
|
|
877
|
+
/** Overall fuzzy threshold (default 0.85). */
|
|
878
|
+
readonly threshold?: number;
|
|
879
|
+
/** Enable LLM scorer for borderline pairs. Requires OPENAI_API_KEY or ANTHROPIC_API_KEY in env. */
|
|
880
|
+
readonly llmScorer?: boolean;
|
|
881
|
+
}
|
|
882
|
+
/**
|
|
883
|
+
* Deduplicate an array of row objects.
|
|
884
|
+
*
|
|
885
|
+
* Shorthand usage:
|
|
886
|
+
* ```ts
|
|
887
|
+
* const result = dedupe(rows, {
|
|
888
|
+
* exact: ["email"],
|
|
889
|
+
* fuzzy: { name: 0.85, address: 0.7 },
|
|
890
|
+
* blocking: ["zip"],
|
|
891
|
+
* threshold: 0.85,
|
|
892
|
+
* });
|
|
893
|
+
* ```
|
|
894
|
+
*
|
|
895
|
+
* Or provide a full config:
|
|
896
|
+
* ```ts
|
|
897
|
+
* const result = dedupe(rows, { config: myConfig });
|
|
898
|
+
* ```
|
|
899
|
+
*/
|
|
900
|
+
declare function dedupe(rows: readonly Row[], options?: DedupeOptions): DedupeResult;
|
|
901
|
+
/**
|
|
902
|
+
* Match target rows against reference rows.
|
|
903
|
+
*
|
|
904
|
+
* Same options as `dedupe()`. Returns matched/unmatched target rows.
|
|
905
|
+
*/
|
|
906
|
+
declare function match(target: readonly Row[], reference: readonly Row[], options?: DedupeOptions): MatchResult;
|
|
907
|
+
/**
|
|
908
|
+
* Score two strings using the specified scorer algorithm.
|
|
909
|
+
*
|
|
910
|
+
* @param a - First string.
|
|
911
|
+
* @param b - Second string.
|
|
912
|
+
* @param scorer - Scorer name (default: "jaro_winkler").
|
|
913
|
+
* Valid scorers: exact, jaro_winkler, levenshtein, token_sort,
|
|
914
|
+
* soundex_match, dice, jaccard, ensemble.
|
|
915
|
+
* @returns Similarity score between 0.0 and 1.0.
|
|
916
|
+
*/
|
|
917
|
+
declare function scoreStrings(a: string, b: string, scorer?: string): number;
|
|
918
|
+
/**
|
|
919
|
+
* Score a pair of row objects across specified fields using weighted
|
|
920
|
+
* aggregation.
|
|
921
|
+
*
|
|
922
|
+
* @param rowA - First row.
|
|
923
|
+
* @param rowB - Second row.
|
|
924
|
+
* @param fields - Field configs specifying which fields to compare,
|
|
925
|
+
* transforms to apply, scorer to use, and weight.
|
|
926
|
+
* @returns Weighted similarity score between 0.0 and 1.0.
|
|
927
|
+
*/
|
|
928
|
+
declare function scorePairRecord(rowA: Row, rowB: Row, fields: readonly MatchkeyField[]): number;
|
|
929
|
+
|
|
930
|
+
/**
|
|
931
|
+
* config/loader.ts — Config loader that parses raw objects (from YAML/JSON)
|
|
932
|
+
* into typed GoldenMatchConfig.
|
|
933
|
+
*
|
|
934
|
+
* Edge-safe: no `node:` imports, no `require()`.
|
|
935
|
+
*/
|
|
936
|
+
|
|
937
|
+
/**
|
|
938
|
+
* Parse a raw JS object (already deserialized from YAML or JSON) into a
|
|
939
|
+
* validated GoldenMatchConfig.
|
|
940
|
+
*
|
|
941
|
+
* Handles:
|
|
942
|
+
* - Snake_case to camelCase key conversion
|
|
943
|
+
* - Normalization of `matchkeys` / `match_settings`
|
|
944
|
+
* - Parsing of all nested config objects
|
|
945
|
+
* - `default` -> `defaultStrategy` normalization in golden_rules
|
|
946
|
+
*/
|
|
947
|
+
declare function parseConfig(raw: unknown): GoldenMatchConfig;
|
|
948
|
+
/**
|
|
949
|
+
* Parse a YAML string into a GoldenMatchConfig.
|
|
950
|
+
*
|
|
951
|
+
* Requires the caller to provide a YAML parse function (e.g. from the `yaml`
|
|
952
|
+
* npm package) to keep this module edge-safe with no dynamic imports.
|
|
953
|
+
*
|
|
954
|
+
* @param yamlStr - The YAML configuration string.
|
|
955
|
+
* @param yamlParseFn - A function that parses a YAML string into a JS object.
|
|
956
|
+
*/
|
|
957
|
+
declare function parseConfigYaml(yamlStr: string, yamlParseFn: (s: string) => unknown): GoldenMatchConfig;
|
|
958
|
+
/**
|
|
959
|
+
* Convert a GoldenMatchConfig back to a plain JS object suitable for
|
|
960
|
+
* YAML or JSON serialization (snake_case keys) and return the string produced by `yamlStringifyFn`.
|
|
961
|
+
*
|
|
962
|
+
* @param config - The typed config object.
|
|
963
|
+
* @param yamlStringifyFn - A function that serializes a JS object to YAML.
|
|
964
|
+
*/
|
|
965
|
+
declare function configToYaml(config: GoldenMatchConfig, yamlStringifyFn: (obj: unknown) => string): string;
|
|
966
|
+
|
|
967
|
+
/**
|
|
968
|
+
* budget.ts — LLM budget tracking: cost accounting, model tiering,
|
|
969
|
+
* and graceful degradation. Ports `goldenmatch/core/llm_budget.py`.
|
|
970
|
+
*
|
|
971
|
+
* Edge-safe: no `node:` imports, no `process`.
|
|
972
|
+
*/
|
|
973
|
+
|
|
974
|
+
interface BudgetSnapshot {
|
|
975
|
+
readonly calls: number;
|
|
976
|
+
readonly inputTokens: number;
|
|
977
|
+
readonly outputTokens: number;
|
|
978
|
+
readonly costUsd: number;
|
|
979
|
+
readonly model: string;
|
|
980
|
+
readonly modelsUsed: Readonly<Record<string, number>>;
|
|
981
|
+
readonly remainingCalls: number | null;
|
|
982
|
+
readonly remainingUsd: number | null;
|
|
983
|
+
readonly pctUsed: number;
|
|
984
|
+
readonly exhausted: boolean;
|
|
985
|
+
}
|
|
986
|
+
/**
|
|
987
|
+
* Tracks LLM token usage, cost, and enforces budget limits.
|
|
988
|
+
*
|
|
989
|
+
* Mirrors `goldenmatch.core.llm_budget.BudgetTracker`. No thread lock
|
|
990
|
+
* is needed here — the edge runtime is single-threaded per request.
|
|
991
|
+
*/
|
|
992
|
+
declare class BudgetTracker {
|
|
993
|
+
private readonly config;
|
|
994
|
+
readonly model: string;
|
|
995
|
+
private _calls;
|
|
996
|
+
private _inputTokens;
|
|
997
|
+
private _outputTokens;
|
|
998
|
+
private _costUsd;
|
|
999
|
+
private _escalationCost;
|
|
1000
|
+
private readonly _modelsUsed;
|
|
1001
|
+
constructor(config?: BudgetConfig, model?: string);
|
|
1002
|
+
/** Estimate the cost of a hypothetical call (USD). */
|
|
1003
|
+
estimateCost(inputTokens: number, outputTokens: number, model?: string): number;
|
|
1004
|
+
/** Record usage from a completed API call. */
|
|
1005
|
+
record(inputTokens: number, outputTokens: number, model?: string): void;
|
|
1006
|
+
/**
|
|
1007
|
+
* Return true if another call can proceed without exceeding the budget.
|
|
1008
|
+
* If `estimatedCost` is provided, checks whether the projected total stays
|
|
1009
|
+
* under `maxCostUsd`.
|
|
1010
|
+
*/
|
|
1011
|
+
canProceed(estimatedCost?: number): boolean;
|
|
1012
|
+
/**
|
|
1013
|
+
* Estimate whether a batch of a given token size can be sent.
|
|
1014
|
+
* Mirrors Python's `can_send(estimated_tokens)`.
|
|
1015
|
+
*/
|
|
1016
|
+
canSend(estimatedTokens: number): boolean;
|
|
1017
|
+
/**
|
|
1018
|
+
* Pick a model based on a pair score and escalation config.
|
|
1019
|
+
* Returns `escalationModel` when the score is in the escalation band
|
|
1020
|
+
* and the escalation sub-budget hasn't been exhausted.
|
|
1021
|
+
*/
|
|
1022
|
+
selectModel(pairScore: number, defaultModel: string): string;
|
|
1023
|
+
get costUsd(): number;
|
|
1024
|
+
get calls(): number;
|
|
1025
|
+
get inputTokens(): number;
|
|
1026
|
+
get outputTokens(): number;
|
|
1027
|
+
get exhausted(): boolean;
|
|
1028
|
+
/** Return a snapshot of the current budget state. */
|
|
1029
|
+
snapshot(): BudgetSnapshot;
|
|
1030
|
+
}
|
|
1031
|
+
/**
|
|
1032
|
+
* Rough token count approximation.
|
|
1033
|
+
* Rule of thumb: ~4 chars per token for English text.
|
|
1034
|
+
*/
|
|
1035
|
+
declare function countTokensApprox(text: string): number;
|
|
1036
|
+
|
|
1037
|
+
/**
|
|
1038
|
+
* scorer.ts — LLM scorer for borderline record pairs.
|
|
1039
|
+
* Ports `goldenmatch/core/llm_scorer.py`.
|
|
1040
|
+
*
|
|
1041
|
+
* Three-tier decision:
|
|
1042
|
+
* score >= autoThreshold -> auto-accept (promote to 1.0)
|
|
1043
|
+
* candidateLo <= score < hi -> send to LLM
|
|
1044
|
+
* score < candidateLo -> keep original score (never demoted)
|
|
1045
|
+
*
|
|
1046
|
+
* Edge-safe: uses `fetch()` (global on Node 20+/edge runtimes).
|
|
1047
|
+
* No `node:` imports.
|
|
1048
|
+
*/
|
|
1049
|
+
|
|
1050
|
+
/** Value resolved by `llmScorePairs` and `llmClusterPairs`. */ interface LLMScoreResult {
|
|
1051
|
+
/** Scored pairs after LLM processing. */ readonly pairs: readonly ScoredPair[];
|
|
1052
|
+
/** Budget snapshot, or `null`. NOTE(review): presumably `null` when no budget tracker was used — confirm. */ readonly budget: BudgetSnapshot | null;
|
|
1053
|
+
}
|
|
1054
|
+
/**
|
|
1055
|
+
* Score borderline pairs with an LLM. Never demotes: pairs the LLM rejects
|
|
1056
|
+
* keep their original fuzzy score. Pairs the LLM confirms are promoted to 1.0.
|
|
1057
|
+
*
|
|
1058
|
+
* When no `apiKey` is available, degrades gracefully and returns the input.
|
|
1059
|
+
*/
|
|
1060
|
+
declare function llmScorePairs(pairs: readonly ScoredPair[], rows: readonly Row[], config: LLMScorerConfig, apiKey?: string): Promise<LLMScoreResult>;
|
|
1061
|
+
/**
|
|
1062
|
+
* Ask the LLM a single yes/no question about two strings. Returns 1.0
|
|
1063
|
+
* for yes, 0.0 for no, and 0.0 on any error (graceful), as the `score` field of the resolved object.
|
|
1064
|
+
*/
|
|
1065
|
+
declare function scoreStringsWithLlm(a: string, b: string, config: LLMScorerConfig, apiKey?: string): Promise<{
|
|
1066
|
+
score: number;
|
|
1067
|
+
budget: BudgetSnapshot;
|
|
1068
|
+
error?: string;
|
|
1069
|
+
}>;
|
|
1070
|
+
|
|
1071
|
+
/**
|
|
1072
|
+
* cluster.ts — In-context LLM clustering: send blocks of borderline records
|
|
1073
|
+
* to an LLM for direct cluster assignment. Ports `goldenmatch/core/llm_cluster.py`.
|
|
1074
|
+
*
|
|
1075
|
+
* Flow:
|
|
1076
|
+
* 1. Pairs with candidateLo <= score < autoThreshold form the borderline band.
|
|
1077
|
+
* 2. Build connected components over the borderline graph.
|
|
1078
|
+
* 3. Oversized components split by dropping weakest edges first.
|
|
1079
|
+
* 4. Each component (block) sent to LLM with a JSON cluster schema.
|
|
1080
|
+
* 5. Pair scores synthesized from cluster membership + confidence.
|
|
1081
|
+
*
|
|
1082
|
+
* Degrades: cluster call fails -> pairwise fallback -> return input pairs.
|
|
1083
|
+
* Edge-safe: fetch-only, no `node:` imports.
|
|
1084
|
+
*/
|
|
1085
|
+
|
|
1086
|
+
declare function llmClusterPairs(pairs: readonly ScoredPair[], rows: readonly Row[], config: LLMScorerConfig, apiKey?: string): Promise<LLMScoreResult>;
|
|
1087
|
+
|
|
1088
|
+
/**
|
|
1089
|
+
* explain.ts — Natural language explanations for match decisions.
|
|
1090
|
+
* Ports `goldenmatch/core/explain.py` (+ parts of `explainer.py`).
|
|
1091
|
+
*
|
|
1092
|
+
* Template-based, zero LLM cost. Produces human-readable summaries of why
|
|
1093
|
+
* two records matched, plus cluster-level summaries.
|
|
1094
|
+
*
|
|
1095
|
+
* Edge-safe: no `node:` imports.
|
|
1096
|
+
*/
|
|
1097
|
+
|
|
1098
|
+
interface FieldScoreDetail {
|
|
1099
|
+
readonly field: string;
|
|
1100
|
+
readonly scorer: string;
|
|
1101
|
+
readonly valueA: string | null;
|
|
1102
|
+
readonly valueB: string | null;
|
|
1103
|
+
readonly score: number | null;
|
|
1104
|
+
readonly weight: number;
|
|
1105
|
+
readonly diffType: "identical" | "similar" | "different" | "missing" | "unknown";
|
|
1106
|
+
}
|
|
1107
|
+
interface PairExplanation {
|
|
1108
|
+
/** Numeric score for the pair. NOTE(review): presumably the overall weighted score — confirm. */
readonly score: number;
|
|
1109
|
+
/** Per-key score map; an entry may be null. NOTE(review): presumably keyed by field name — confirm. */
readonly fieldScores: Readonly<Record<string, number | null>>;
|
|
1110
|
+
/** Explanation text for the pair. */
readonly explanation: string;
|
|
1111
|
+
/** Confidence bucket; one of "high", "medium", "low". */
readonly confidence: "high" | "medium" | "low";
|
|
1112
|
+
/** List of reasoning strings. NOTE(review): ordering/meaning not visible here. */
readonly reasoning: readonly string[];
|
|
1113
|
+
/** Detailed field-level entries (see FieldScoreDetail). */
readonly details: readonly FieldScoreDetail[];
|
|
1114
|
+
}
|
|
1115
|
+
interface ClusterExplanation {
|
|
1116
|
+
/** Numeric id of the explained cluster. */
readonly clusterId: number;
|
|
1117
|
+
/** Cluster size. NOTE(review): presumably the member count — confirm. */
readonly size: number;
|
|
1118
|
+
/** Numeric cluster confidence (note: PairExplanation.confidence is a string bucket instead). */
readonly confidence: number;
|
|
1119
|
+
/** Quality label as a free-form string. NOTE(review): allowed values are not visible here. */
readonly quality: string;
|
|
1120
|
+
/** Summary text for the cluster. */
readonly summary: string;
|
|
1121
|
+
/** Name of the strongest field, or null. NOTE(review): selection criterion not visible here. */
readonly strongestField: string | null;
|
|
1122
|
+
/** Weakest link as a pair of numbers, or null. NOTE(review): presumably the two row ids of the weakest edge — confirm. */
readonly weakestLink: readonly [number, number] | null;
|
|
1123
|
+
}
|
|
1124
|
+
/**
|
|
1125
|
+
* Produce an NL explanation for why two rows match (or don't), using the
|
|
1126
|
+
* scorers and weights defined by the matchkey config.
|
|
1127
|
+
*/
|
|
1128
|
+
declare function explainPair(rowA: Row, rowB: Row, mk: MatchkeyConfig): PairExplanation;
|
|
1129
|
+
/**
|
|
1130
|
+
* Produce a template summary for a cluster: size, confidence, weakest link.
|
|
1131
|
+
* Mirrors `explain_cluster_nl` in Python.
|
|
1132
|
+
*/
|
|
1133
|
+
declare function explainCluster(clusterId: number, cluster: ClusterInfo, rows: readonly Row[], mk: MatchkeyConfig): ClusterExplanation;
|
|
1134
|
+
|
|
1135
|
+
/**
|
|
1136
|
+
* probabilistic.ts — Fellegi-Sunter probabilistic matching with EM-trained
|
|
1137
|
+
* parameters. Ports `goldenmatch/core/probabilistic.py` (discrete path).
|
|
1138
|
+
*
|
|
1139
|
+
* Implements:
|
|
1140
|
+
* - Comparison vectors (2/3/N-level field agreements)
|
|
1141
|
+
* - Splink-style EM: u estimated from random pairs (fixed), m trained via EM
|
|
1142
|
+
* - Blocking fields get fixed neutral priors
|
|
1143
|
+
* - Match weights as log2(m/u) log-likelihood ratios, normalized to [0,1]
|
|
1144
|
+
*
|
|
1145
|
+
* Edge-safe: no `node:` imports, no numpy. Uses typed arrays where helpful.
|
|
1146
|
+
*/
|
|
1147
|
+
|
|
1148
|
+
interface EMOptions {
|
|
1149
|
+
/** Optional cap on EM iterations. NOTE(review): default value is not visible here. */
readonly maxIterations?: number;
|
|
1150
|
+
/** Optional convergence setting. NOTE(review): presumably a tolerance on parameter change — confirm. */
readonly convergence?: number;
|
|
1151
|
+
/** Optional list of blocking field names. NOTE(review): presumably the fields that bypass EM — confirm. */
readonly blockingFields?: readonly string[];
|
|
1152
|
+
/** Optional numeric seed. NOTE(review): presumably seeds the random pair sampling — confirm. */
readonly seed?: number;
|
|
1153
|
+
/** Optional sample size. NOTE(review): presumably the number of random pairs used to estimate u — confirm. */
readonly nSamplePairs?: number;
|
|
1154
|
+
}
|
|
1155
|
+
interface EMResult {
|
|
1156
|
+
/** P(level | match) per field. */
|
|
1157
|
+
readonly m: Readonly<Record<string, readonly number[]>>;
|
|
1158
|
+
/** P(level | non-match) per field. */
|
|
1159
|
+
readonly u: Readonly<Record<string, readonly number[]>>;
|
|
1160
|
+
/** log2(m / u) per level per field. Score weights. */
|
|
1161
|
+
readonly matchWeights: Readonly<Record<string, readonly number[]>>;
|
|
1162
|
+
/** Estimated p(match) in the sampled population. */
|
|
1163
|
+
readonly proportionMatched: number;
|
|
1164
|
+
readonly iterations: number;
|
|
1165
|
+
readonly converged: boolean;
|
|
1166
|
+
}
|
|
1167
|
+
/**
|
|
1168
|
+
* Build a comparison vector: one integer level per field.
|
|
1169
|
+
* levels=2: 0=disagree, 1=agree
|
|
1170
|
+
* levels=3: 0=disagree, 1=partial, 2=agree (>= 0.95)
|
|
1171
|
+
* levels=N: evenly spaced thresholds k/N for k in 1..N-1
|
|
1172
|
+
*/
|
|
1173
|
+
declare function buildComparisonVector(rowA: Row, rowB: Row, fields: readonly MatchkeyField[]): readonly number[];
|
|
1174
|
+
/**
|
|
1175
|
+
* Splink-style EM training:
|
|
1176
|
+
* 1. Estimate u from random pairs (fixed throughout).
|
|
1177
|
+
* 2. Train m via EM starting from exponential priors.
|
|
1178
|
+
* 3. Blocking fields bypass EM and receive fixed neutral u + linear weights.
|
|
1179
|
+
*/
|
|
1180
|
+
declare function trainEM(rows: readonly Row[], mk: MatchkeyConfig, options?: EMOptions): EMResult;
|
|
1181
|
+
interface ProbScoreOptions {
|
|
1182
|
+
/** Optional set of string keys for pairs to exclude. NOTE(review): key format is not visible here — confirm. */
readonly excludePairs?: ReadonlySet<string>;
|
|
1183
|
+
/** Optional score threshold. NOTE(review): presumably the cut-off below which pairs are filtered out; default not visible. */
readonly threshold?: number;
|
|
1184
|
+
}
|
|
1185
|
+
/**
|
|
1186
|
+
* Score all pairs in a block using F-S match weights.
|
|
1187
|
+
* Returns normalized scores in [0,1] (weight sum mapped to 0-1 via min/max).
|
|
1188
|
+
* Pairs below threshold are filtered out.
|
|
1189
|
+
*/
|
|
1190
|
+
declare function scoreProbabilistic(rows: readonly Row[], mk: MatchkeyConfig, em: EMResult, options?: ProbScoreOptions): ScoredPair[];
|
|
1191
|
+
|
|
1192
|
+
/**
|
|
1193
|
+
* evaluate.ts — Precision/recall/F1 evaluation against ground truth.
|
|
1194
|
+
* Edge-safe: no Node.js imports, pure TypeScript only.
|
|
1195
|
+
*
|
|
1196
|
+
* Ports goldenmatch/core/evaluate.py.
|
|
1197
|
+
*/
|
|
1198
|
+
|
|
1199
|
+
interface EvalResult {
|
|
1200
|
+
/** Precision of the predictions against ground truth. */
readonly precision: number;
|
|
1201
|
+
/** Recall of the predictions against ground truth. */
readonly recall: number;
|
|
1202
|
+
/** F1 score. */
readonly f1: number;
|
|
1203
|
+
/** Count of true positives. */
readonly truePositives: number;
|
|
1204
|
+
/** Count of false positives. */
readonly falsePositives: number;
|
|
1205
|
+
/** Count of false negatives. */
readonly falseNegatives: number;
|
|
1206
|
+
}
|
|
1207
|
+
/**
|
|
1208
|
+
* Evaluate predicted pairs against ground-truth pairs.
|
|
1209
|
+
*
|
|
1210
|
+
* Pairs are treated as unordered (canonicalized to min:max).
|
|
1211
|
+
*/
|
|
1212
|
+
declare function evaluatePairs(predictedPairs: readonly ScoredPair[], groundTruthPairs: readonly (readonly [number, number])[]): EvalResult;
|
|
1213
|
+
/**
|
|
1214
|
+
* Evaluate clusters against ground-truth pairs.
|
|
1215
|
+
*
|
|
1216
|
+
* Expands each cluster's members into the full set of intra-cluster pairs and
|
|
1217
|
+
* compares that set to the ground truth.
|
|
1218
|
+
*/
|
|
1219
|
+
// NOTE(review): `_allIds` is underscore-prefixed, which suggests it is unused by the implementation — confirm.
declare function evaluateClusters(clusters: ReadonlyMap<number, ClusterInfo>, groundTruthPairs: readonly (readonly [number, number])[], _allIds: readonly number[]): EvalResult;
|
|
1220
|
+
/**
|
|
1221
|
+
* Extract ground truth pairs from a list of rows containing two id columns.
|
|
1222
|
+
*
|
|
1223
|
+
* Numeric strings are parsed to integers. Rows with missing/unparseable ids
|
|
1224
|
+
* are skipped.
|
|
1225
|
+
*/
|
|
1226
|
+
declare function loadGroundTruthPairs(rows: readonly Row[], idColA: string, idColB: string): (readonly [number, number])[];
|
|
1227
|
+
|
|
1228
|
+
/**
|
|
1229
|
+
* streaming.ts — Incremental single-record match + cluster updates.
|
|
1230
|
+
* Edge-safe: no Node.js imports, pure TypeScript only.
|
|
1231
|
+
*
|
|
1232
|
+
* Ports goldenmatch/core/streaming.py.
|
|
1233
|
+
*/
|
|
1234
|
+
|
|
1235
|
+
interface StreamAddResult {
|
|
1236
|
+
/** Numeric row id. NOTE(review): presumably the id assigned to the newly added row — confirm. */
readonly rowId: number;
|
|
1237
|
+
/** Numeric ids of matched rows. NOTE(review): presumably previously ingested rows that matched — confirm. */
readonly matchedIds: readonly number[];
|
|
1238
|
+
/** Numeric cluster id. NOTE(review): presumably the cluster the new row was placed in — confirm. */
readonly clusterId: number;
|
|
1239
|
+
}
|
|
1240
|
+
interface StreamProcessorConfig {
|
|
1241
|
+
/** Matchkey configuration (MatchkeyConfig is declared elsewhere in this file). */
readonly matchkey: MatchkeyConfig;
|
|
1242
|
+
/** Required numeric threshold. NOTE(review): presumably the minimum score for a match — confirm. */
readonly threshold: number;
|
|
1243
|
+
/** Optional cluster-size limit. NOTE(review): behavior when exceeded is not visible here. */
readonly maxClusterSize?: number;
|
|
1244
|
+
}
|
|
1245
|
+
interface StreamSnapshot {
|
|
1246
|
+
readonly clusters: ReadonlyMap<number, ClusterInfo>;
|
|
1247
|
+
readonly rows: readonly Row[];
|
|
1248
|
+
}
|
|
1249
|
+
/**
|
|
1250
|
+
* Incremental record processor.
|
|
1251
|
+
*
|
|
1252
|
+
* On each `add()` the new row is matched against all previously seen rows
|
|
1253
|
+
* using `matchOne`, then folded into the cluster map via `addToCluster`.
|
|
1254
|
+
*/
|
|
1255
|
+
declare class StreamProcessor {
|
|
1256
|
+
private readonly config;
|
|
1257
|
+
private readonly clusters;
|
|
1258
|
+
private readonly rowsById;
|
|
1259
|
+
private readonly order;
|
|
1260
|
+
private nextId;
|
|
1261
|
+
constructor(config: StreamProcessorConfig);
|
|
1262
|
+
/** Add a new record and return match + cluster info. */
|
|
1263
|
+
add(row: Row): StreamAddResult;
|
|
1264
|
+
/** Number of records ingested. */
|
|
1265
|
+
get size(): number;
|
|
1266
|
+
/** Snapshot of current cluster state + rows. */
|
|
1267
|
+
snapshot(): StreamSnapshot;
|
|
1268
|
+
}
|
|
1269
|
+
|
|
1270
|
+
/**
|
|
1271
|
+
* match-one.ts — Single-record matching primitive.
|
|
1272
|
+
* Edge-safe: no Node.js imports, pure TypeScript only.
|
|
1273
|
+
*
|
|
1274
|
+
* Ports goldenmatch/core/match_one.py.
|
|
1275
|
+
*/
|
|
1276
|
+
|
|
1277
|
+
interface MatchOneHit {
|
|
1278
|
+
/** Numeric id of the matched row. NOTE(review): presumably its `__row_id__` — confirm. */
readonly rowId: number;
|
|
1279
|
+
/** Match score for this hit. */
readonly score: number;
|
|
1280
|
+
}
|
|
1281
|
+
/**
|
|
1282
|
+
* Match a single record against a dataset using a weighted matchkey.
|
|
1283
|
+
*
|
|
1284
|
+
* Threshold defaults to 0 (return everything). For exact matchkeys use
|
|
1285
|
+
* {@link findExactMatchesOne}.
|
|
1286
|
+
*
|
|
1287
|
+
* Returns hits sorted by descending score. Rows are expected to carry
|
|
1288
|
+
* `__row_id__`.
|
|
1289
|
+
*/
|
|
1290
|
+
// NOTE(review): the JSDoc for this function mentions a threshold default, but the signature has no
// threshold parameter — presumably it is read from `mk`; confirm.
declare function matchOne(record: Row, rows: readonly Row[], mk: MatchkeyConfig): readonly MatchOneHit[];
|
|
1291
|
+
/**
|
|
1292
|
+
* Find exact matches for a single record against a dataset.
|
|
1293
|
+
*
|
|
1294
|
+
* Builds the composite matchkey for the probe record, then scans the rows
|
|
1295
|
+
* and returns any that share the same composite key (score 1.0). Null
|
|
1296
|
+
* transformed fields disqualify the comparison.
|
|
1297
|
+
*/
|
|
1298
|
+
declare function findExactMatchesOne(record: Row, rows: readonly Row[], mk: MatchkeyConfig): readonly MatchOneHit[];
|
|
1299
|
+
|
|
1300
|
+
/**
|
|
1301
|
+
* compare-clusters.ts — CCMS (Case Count Metric System) cluster comparison.
|
|
1302
|
+
* Edge-safe: no Node.js imports, pure TypeScript only.
|
|
1303
|
+
*
|
|
1304
|
+
* Ports goldenmatch/core/compare_clusters.py.
|
|
1305
|
+
* Reference: Talburt et al., Case Count Metric System, arXiv:2601.02824v1.
|
|
1306
|
+
*/
|
|
1307
|
+
|
|
1308
|
+
/** Classification label for a cluster: one of the four literals below. */
type ClusterCase = "unchanged" | "merged" | "partitioned" | "overlapping";
|
|
1309
|
+
interface CCMSResult {
|
|
1310
|
+
/** NOTE(review): presumably the number of clusters classified "unchanged" — confirm. */
readonly unchanged: number;
|
|
1311
|
+
/** NOTE(review): presumably the number of clusters classified "merged" — confirm. */
readonly merged: number;
|
|
1312
|
+
/** NOTE(review): presumably the number of clusters classified "partitioned" — confirm. */
readonly partitioned: number;
|
|
1313
|
+
/** NOTE(review): presumably the number of clusters classified "overlapping" — confirm. */
readonly overlapping: number;
|
|
1314
|
+
/** NOTE(review): presumably the Talburt-Wang Index — confirm. */
readonly twi: number;
|
|
1315
|
+
/** ClusterCase per numeric key. NOTE(review): presumably keyed by cluster id — confirm. */
readonly clusterClassifications: Readonly<Record<number, ClusterCase>>;
|
|
1316
|
+
/** NOTE(review): presumably the cluster count of the first clustering — confirm. */
readonly cc1: number;
|
|
1317
|
+
/** NOTE(review): presumably the cluster count of the second clustering — confirm. */
readonly cc2: number;
|
|
1318
|
+
/** NOTE(review): meaning of `rc` is not documented in the visible source — confirm against the implementation. */
readonly rc: number;
|
|
1319
|
+
}
|
|
1320
|
+
/**
|
|
1321
|
+
* Compare two clustering outcomes via the CCMS framework.
|
|
1322
|
+
*
|
|
1323
|
+
* Classifies each cluster in A as unchanged, merged, partitioned, or
|
|
1324
|
+
* overlapping relative to B, and computes the Talburt-Wang Index:
|
|
1325
|
+
* TWI = sqrt(CC1 * CC2) / V
|
|
1326
|
+
* where CC1/CC2 are cluster counts and V is the number of non-empty
|
|
1327
|
+
* A-to-B cluster intersections.
|
|
1328
|
+
*
|
|
1329
|
+
* Throws if the two cluster dicts do not cover the same row IDs.
|
|
1330
|
+
*/
|
|
1331
|
+
declare function compareClusters(clustersA: ReadonlyMap<number, ClusterInfo>, clustersB: ReadonlyMap<number, ClusterInfo>): CCMSResult;
|
|
1332
|
+
|
|
1333
|
+
/**
|
|
1334
|
+
* sensitivity.ts — Parameter sweep engine for GoldenMatch.
|
|
1335
|
+
* Edge-safe: no Node.js imports, pure TypeScript only.
|
|
1336
|
+
*
|
|
1337
|
+
* Ports goldenmatch/core/sensitivity.py.
|
|
1338
|
+
*/
|
|
1339
|
+
|
|
1340
|
+
interface SweepParam {
|
|
1341
|
+
/** Dot-path into the config, e.g. "threshold", "blocking.maxBlockSize". */
|
|
1342
|
+
readonly path: string;
|
|
1343
|
+
readonly values: readonly unknown[];
|
|
1344
|
+
}
|
|
1345
|
+
interface SweepPoint {
|
|
1346
|
+
/** Parameter values for this point. NOTE(review): presumably keyed by SweepParam.path — confirm. */
readonly params: Readonly<Record<string, unknown>>;
|
|
1347
|
+
/** Numeric statistics for this point. NOTE(review): the set of stat names is not visible here. */
readonly stats: Readonly<Record<string, number>>;
|
|
1348
|
+
/** Optional TWI value. NOTE(review): presumably from comparing this point's clusters with the baseline — confirm. */
readonly twi?: number;
|
|
1349
|
+
/** Optional error message. NOTE(review): presumably set when this point's run failed — confirm. */
readonly error?: string;
|
|
1350
|
+
}
|
|
1351
|
+
interface SensitivityResult {
|
|
1352
|
+
/** The baseline point. NOTE(review): presumably the run with the unmodified baseline config — confirm. */
readonly baseline: SweepPoint;
|
|
1353
|
+
/** The swept points. */
readonly points: readonly SweepPoint[];
|
|
1354
|
+
/** Stability flag. NOTE(review): the rule that sets it is not visible in this declaration — confirm. */
readonly stable: boolean;
|
|
1355
|
+
}
|
|
1356
|
+
/**
|
|
1357
|
+
* Run a parameter sweep.
|
|
1358
|
+
*
|
|
1359
|
+
* Each point in the Cartesian product of `params` is applied to
|
|
1360
|
+
* `baselineConfig`, the dedupe pipeline runs, and the resulting clusters are
|
|
1361
|
+
* compared against the baseline via CCMS. A `stable` flag is set when every
|
|
1362
|
+
* point's TWI is within 0.05 of 1.0.
|
|
1363
|
+
*
|
|
1364
|
+
* Per-point errors are caught and stored on the point so that partial
|
|
1365
|
+
* results are preserved.
|
|
1366
|
+
*/
|
|
1367
|
+
declare function runSensitivity(rows: readonly Row[], baselineConfig: GoldenMatchConfig, params: readonly SweepParam[]): SensitivityResult;
|
|
1368
|
+
/** Render a human-readable stability report for a sensitivity result. */
|
|
1369
|
+
declare function stabilityReport(result: SensitivityResult): string;
|
|
1370
|
+
|
|
1371
|
+
/**
|
|
1372
|
+
* quality.ts — Lightweight quality scan stub.
|
|
1373
|
+
* Edge-safe: no Node.js imports, pure TypeScript only.
|
|
1374
|
+
*
|
|
1375
|
+
* Ports a subset of goldenmatch/core/quality.py. The Python version
|
|
1376
|
+
* integrates with GoldenCheck; this port only provides the interface and a
|
|
1377
|
+
* handful of basic heuristics that are safe to run client-side.
|
|
1378
|
+
*/
|
|
1379
|
+
|
|
1380
|
+
type QualitySeverity = "info" | "warn" | "error";
|
|
1381
|
+
interface QualityFinding {
|
|
1382
|
+
/** Name of the column the finding refers to. */
readonly column: string;
|
|
1383
|
+
/** Issue description as a string. NOTE(review): whether this is a code or free text is not visible here. */
readonly issue: string;
|
|
1384
|
+
/** Severity of the finding: "info", "warn" or "error" (see QualitySeverity). */
readonly severity: QualitySeverity;
|
|
1385
|
+
/** Number of affected rows. */
readonly affectedRows: number;
|
|
1386
|
+
/** Sample values. NOTE(review): presumably examples of offending values; sample size not visible. */
readonly sampleValues: readonly unknown[];
|
|
1387
|
+
}
|
|
1388
|
+
interface QualityRunResult {
|
|
1389
|
+
/** Rows returned by the run. NOTE(review): presumably the input rows, unchanged — confirm. */
readonly rows: readonly Row[];
|
|
1390
|
+
/** Findings produced by the scan. */
readonly findings: readonly QualityFinding[];
|
|
1391
|
+
}
|
|
1392
|
+
/**
|
|
1393
|
+
* Run a few cheap heuristics across the dataset: high null rate, low
|
|
1394
|
+
* cardinality, inconsistent date format, obviously malformed emails.
|
|
1395
|
+
*/
|
|
1396
|
+
// NOTE(review): `_config` is underscore-prefixed, which suggests the implementation currently ignores it — confirm.
declare function scanQuality(rows: readonly Row[], _config?: QualityConfig): QualityFinding[];
|
|
1397
|
+
/**
|
|
1398
|
+
* Pass-through runner: produce findings, echo rows unchanged.
|
|
1399
|
+
*
|
|
1400
|
+
* Mirrors `_scan_only` / `run_quality_check` from the Python module: no
|
|
1401
|
+
* GoldenCheck, no row rewrites, just reportable findings.
|
|
1402
|
+
*/
|
|
1403
|
+
declare function runQualityCheck(rows: readonly Row[], config?: QualityConfig): QualityRunResult;
|
|
1404
|
+
|
|
1405
|
+
/**
|
|
1406
|
+
* autofix.ts — Lightweight row auto-fix utilities.
|
|
1407
|
+
* Edge-safe: no Node.js imports, pure TypeScript only.
|
|
1408
|
+
*
|
|
1409
|
+
* Ports goldenmatch/core/autofix.py. Trims whitespace, nulls empty strings,
|
|
1410
|
+
* and converts common "no value" tokens to null.
|
|
1411
|
+
*/
|
|
1412
|
+
|
|
1413
|
+
interface AutoFixLog {
|
|
1414
|
+
/** Kind of fix applied, as a string. NOTE(review): the set of fix-type names is not visible here. */
readonly fixType: string;
|
|
1415
|
+
/** Column the fix was applied to. */
readonly column: string;
|
|
1416
|
+
/** Number of rows affected by this fix. */
readonly affectedRows: number;
|
|
1417
|
+
}
|
|
1418
|
+
interface AutoFixResult {
|
|
1419
|
+
readonly rows: Row[];
|
|
1420
|
+
readonly log: AutoFixLog[];
|
|
1421
|
+
}
|
|
1422
|
+
/**
|
|
1423
|
+
* Apply conservative fixes row-by-row:
|
|
1424
|
+
* - trim string values
|
|
1425
|
+
* - convert empty strings and common "no value" tokens to null
|
|
1426
|
+
*
|
|
1427
|
+
* Internal columns (prefix `__`) are preserved unchanged.
|
|
1428
|
+
*/
|
|
1429
|
+
declare function autoFixRows(rows: readonly Row[]): AutoFixResult;
|
|
1430
|
+
|
|
1431
|
+
/**
|
|
1432
|
+
* validate.ts — Column validation rules with quarantine/flag actions.
|
|
1433
|
+
* Edge-safe: no Node.js imports, pure TypeScript only.
|
|
1434
|
+
*
|
|
1435
|
+
* Ports goldenmatch/core/validate.py.
|
|
1436
|
+
*/
|
|
1437
|
+
|
|
1438
|
+
/** Action applied to a failing row/value: one of "null", "quarantine" or "flag". */
type ValidationAction = "null" | "quarantine" | "flag";
|
|
1439
|
+
type ValidationRuleType = "regex" | "min_length" | "max_length" | "not_null" | "in_set" | "format";
|
|
1440
|
+
interface ValidationRule {
|
|
1441
|
+
/** Column the rule applies to. */
readonly column: string;
|
|
1442
|
+
/** Kind of check to run (see ValidationRuleType). */
readonly ruleType: ValidationRuleType;
|
|
1443
|
+
/** Rule parameters. NOTE(review): expected keys per rule type are not visible here — confirm. */
readonly params: Readonly<Record<string, unknown>>;
|
|
1444
|
+
/** Action for this rule (see ValidationAction). NOTE(review): presumably applied when the rule fails — confirm. */
readonly action: ValidationAction;
|
|
1445
|
+
}
|
|
1446
|
+
interface ValidationReport {
|
|
1447
|
+
/** Total number of rows. NOTE(review): presumably the input row count — confirm. */
readonly totalRows: number;
|
|
1448
|
+
/** Number of quarantined rows. */
readonly quarantined: number;
|
|
1449
|
+
/** Number of flagged rows. */
readonly flagged: number;
|
|
1450
|
+
/** Violation counts per string key. NOTE(review): key format (e.g. column/rule) is not visible here — confirm. */
readonly ruleViolations: Readonly<Record<string, number>>;
|
|
1451
|
+
}
|
|
1452
|
+
interface ValidationResult {
|
|
1453
|
+
/** Rows that remain valid. */
readonly valid: Row[];
|
|
1454
|
+
/** Rows placed in quarantine. */
readonly quarantine: Row[];
|
|
1455
|
+
/** Summary counts for the run (see ValidationReport). */
readonly report: ValidationReport;
|
|
1456
|
+
}
|
|
1457
|
+
/**
|
|
1458
|
+
* Validate rows against a list of rules.
|
|
1459
|
+
*
|
|
1460
|
+
* Actions:
|
|
1461
|
+
* - "null": replace the failing value with null, row stays valid
|
|
1462
|
+
* - "quarantine": move row to quarantine bucket
|
|
1463
|
+
* - "flag": add __flags__ entry, row stays valid
|
|
1464
|
+
*/
|
|
1465
|
+
declare function validateRows(rows: readonly Row[], rules: readonly ValidationRule[]): ValidationResult;
|
|
1466
|
+
|
|
1467
|
+
/**
|
|
1468
|
+
* profiler.ts — Lightweight per-column data profiler.
|
|
1469
|
+
* Edge-safe: no `node:` imports.
|
|
1470
|
+
*
|
|
1471
|
+
* Ports parts of goldenmatch/core/profiler.py that autoconfig relies on.
|
|
1472
|
+
*/
|
|
1473
|
+
|
|
1474
|
+
type ColumnType = "email" | "phone" | "zip" | "date" | "name" | "geo" | "id" | "numeric" | "text";
|
|
1475
|
+
interface ColumnProfile {
|
|
1476
|
+
/** Column name. */
readonly name: string;
|
|
1477
|
+
/** Null rate for the column. NOTE(review): presumably nullCount / totalCount — confirm. */
readonly nullRate: number;
|
|
1478
|
+
/** Number of null values. */
readonly nullCount: number;
|
|
1479
|
+
/** Total number of values considered. */
readonly totalCount: number;
|
|
1480
|
+
/** Number of distinct values. */
readonly distinctCount: number;
|
|
1481
|
+
/** Cardinality ratio. NOTE(review): exact denominator (total vs. non-null) is not visible here — confirm. */
readonly cardinalityRatio: number;
|
|
1482
|
+
/** Inferred column type (see ColumnType). */
readonly inferredType: ColumnType;
|
|
1483
|
+
/** Average value length. NOTE(review): presumably in characters of the string form — confirm. */
readonly avgLength: number;
|
|
1484
|
+
/** Maximum value length. NOTE(review): presumably in characters of the string form — confirm. */
readonly maxLength: number;
|
|
1485
|
+
/** Sample values as strings. NOTE(review): sample size is not visible here. */
readonly sampleValues: readonly string[];
|
|
1486
|
+
}
|
|
1487
|
+
interface DatasetProfile {
|
|
1488
|
+
/** Number of rows. */
readonly rowCount: number;
|
|
1489
|
+
/** Per-column profiles as a list. */
readonly columns: readonly ColumnProfile[];
|
|
1490
|
+
/** Column profiles indexed by string key. NOTE(review): presumably the column name — confirm. */
readonly byName: Readonly<Record<string, ColumnProfile>>;
|
|
1491
|
+
}
|
|
1492
|
+
/** Profile all columns of a row array. */
|
|
1493
|
+
declare function profileRows(rows: readonly Row[]): DatasetProfile;
|
|
1494
|
+
|
|
1495
|
+
/**
|
|
1496
|
+
* ingest.ts — Edge-safe data-shape transforms for row arrays.
|
|
1497
|
+
* Edge-safe: no `node:` imports, no file I/O.
|
|
1498
|
+
*
|
|
1499
|
+
* Ports goldenmatch/core/ingest.py minus file loading — callers are
|
|
1500
|
+
* expected to bring already-parsed rows (JSON, fetched CSV, etc.).
|
|
1501
|
+
*/
|
|
1502
|
+
|
|
1503
|
+
/**
|
|
1504
|
+
* Rename columns according to a {oldName: newName} map.
|
|
1505
|
+
*
|
|
1506
|
+
* Keys missing from the map are passed through untouched. If a rename
|
|
1507
|
+
* would collide with an existing column, the mapped column wins
|
|
1508
|
+
* (mirroring Polars behavior).
|
|
1509
|
+
*/
|
|
1510
|
+
declare function applyColumnMap(rows: readonly Row[], columnMap: Readonly<Record<string, string>>): Row[];
|
|
1511
|
+
/**
|
|
1512
|
+
* Ensure every required column exists on the first row of `rows`.
|
|
1513
|
+
* No-ops on empty input.
|
|
1514
|
+
*/
|
|
1515
|
+
// NOTE(review): returns void; presumably throws when a required column is missing — confirm the error type.
declare function validateColumns(rows: readonly Row[], required: readonly string[]): void;
|
|
1516
|
+
/**
|
|
1517
|
+
* Concatenate several row arrays. Unioned schema: any column present in
|
|
1518
|
+
* any input appears in the output; missing values become null.
|
|
1519
|
+
*/
|
|
1520
|
+
declare function concatRows(rowsArrays: readonly (readonly Row[])[]): Row[];
|
|
1521
|
+
|
|
1522
|
+
/**
|
|
1523
|
+
* review-queue.ts — Human-in-the-loop pair gating.
|
|
1524
|
+
* Edge-safe: no Node.js imports, pure TypeScript only.
|
|
1525
|
+
*
|
|
1526
|
+
* Ports goldenmatch/core/review_queue.py. Default gates: >=0.95 auto-approve,
|
|
1527
|
+
* <0.75 auto-reject, everything in between needs review.
|
|
1528
|
+
*/
|
|
1529
|
+
|
|
1530
|
+
type ReviewStatus = "pending" | "approved" | "rejected";
|
|
1531
|
+
interface ReviewItem {
|
|
1532
|
+
/** Pair id as a string. NOTE(review): presumably the canonical "minId:maxId" form — confirm. */
readonly pairId: string;
|
|
1533
|
+
/** Id of the first record in the pair. */
readonly idA: number;
|
|
1534
|
+
/** Id of the second record in the pair. */
readonly idB: number;
|
|
1535
|
+
/** Score of the pair. */
readonly score: number;
|
|
1536
|
+
/** Review state: "pending", "approved" or "rejected" (see ReviewStatus). */
readonly status: ReviewStatus;
|
|
1537
|
+
/** Creation time as a number. NOTE(review): presumably epoch milliseconds — confirm. */
readonly createdAt: number;
|
|
1538
|
+
}
|
|
1539
|
+
interface GatedResult {
|
|
1540
|
+
/** Pairs that were auto-approved. */
readonly autoApproved: readonly ScoredPair[];
|
|
1541
|
+
/** Pairs that need human review, as ReviewItem entries. */
readonly needsReview: readonly ReviewItem[];
|
|
1542
|
+
/** Pairs that were rejected. */
readonly rejected: readonly ScoredPair[];
|
|
1543
|
+
}
|
|
1544
|
+
interface GateOptions {
|
|
1545
|
+
/** Optional auto-approve threshold. NOTE(review): default and inclusivity are not visible in this declaration — confirm. */
readonly approveAbove?: number;
|
|
1546
|
+
/** Optional auto-reject threshold. NOTE(review): default and inclusivity are not visible in this declaration — confirm. */
readonly rejectBelow?: number;
|
|
1547
|
+
}
|
|
1548
|
+
/**
|
|
1549
|
+
* Split pairs into auto-approved, needs-review, and rejected buckets.
|
|
1550
|
+
*
|
|
1551
|
+
* Defaults: approveAbove=0.95, rejectBelow=0.75.
|
|
1552
|
+
*/
|
|
1553
|
+
declare function gatePairs(pairs: readonly ScoredPair[], options?: GateOptions): GatedResult;
|
|
1554
|
+
/**
|
|
1555
|
+
* In-memory review queue for human adjudication of borderline pairs.
|
|
1556
|
+
*/
|
|
1557
|
+
declare class ReviewQueue {
|
|
1558
|
+
private readonly items;
|
|
1559
|
+
/** Add a pair as a pending review item (idempotent by canonical pair id). */
|
|
1560
|
+
add(pair: ScoredPair): void;
|
|
1561
|
+
/** Get an item by canonical pair id ("minId:maxId"). */
|
|
1562
|
+
get(pairId: string): ReviewItem | undefined;
|
|
1563
|
+
/** Mark a pair approved. No-op if unknown. */
|
|
1564
|
+
approve(pairId: string): void;
|
|
1565
|
+
/** Mark a pair rejected. No-op if unknown. */
|
|
1566
|
+
reject(pairId: string): void;
|
|
1567
|
+
/** All pending items. */
|
|
1568
|
+
pending(): ReviewItem[];
|
|
1569
|
+
/** All approved items. */
|
|
1570
|
+
approved(): ReviewItem[];
|
|
1571
|
+
/** All rejected items. */
|
|
1572
|
+
rejected(): ReviewItem[];
|
|
1573
|
+
/** Current queue size. */
|
|
1574
|
+
size(): number;
|
|
1575
|
+
/** Canonical pair id helper ("minId:maxId"). */
|
|
1576
|
+
static pairIdFor(a: number, b: number): string;
|
|
1577
|
+
}
|
|
1578
|
+
|
|
1579
|
+
/**
|
|
1580
|
+
* autoconfig.ts — Auto-generate a GoldenMatch config from sample data.
|
|
1581
|
+
* Edge-safe: no `node:` imports.
|
|
1582
|
+
*
|
|
1583
|
+
* Ports goldenmatch/core/autoconfig.py. Profiles the rows, classifies
|
|
1584
|
+
* columns, and builds exact/weighted matchkeys + blocking config.
|
|
1585
|
+
*/
|
|
1586
|
+
|
|
1587
|
+
interface AutoconfigOptions {
|
|
1588
|
+
readonly llmProvider?: string;
|
|
1589
|
+
readonly llmAuto?: boolean;
|
|
1590
|
+
}
|
|
1591
|
+
/**
|
|
1592
|
+
* Build a GoldenMatchConfig by profiling the provided rows.
|
|
1593
|
+
*
|
|
1594
|
+
* Mirrors goldenmatch.core.autoconfig.auto_configure_df. Does not apply
|
|
1595
|
+
* standardization rules directly — callers can merge them onto the result.
|
|
1596
|
+
*/
|
|
1597
|
+
declare function autoConfigureRows(rows: readonly Row[], options?: AutoconfigOptions): GoldenMatchConfig;
|
|
1598
|
+
|
|
1599
|
+
/**
|
|
1600
|
+
* domain.ts — Domain detection & lightweight feature extraction.
|
|
1601
|
+
* Edge-safe: no `node:` imports.
|
|
1602
|
+
*
|
|
1603
|
+
* Ports goldenmatch/core/domain.py. Detects the subject area (product,
|
|
1604
|
+
* person, bibliographic, company, generic) from column names and extracts
|
|
1605
|
+
* per-row features (brand, model, version, etc.) as extra columns.
|
|
1606
|
+
*/
|
|
1607
|
+
|
|
1608
|
+
interface DomainProfile {
|
|
1609
|
+
/** Domain name as a string. NOTE(review): allowed values are not visible in this declaration. */
readonly name: string;
|
|
1610
|
+
/** Numeric detection confidence. NOTE(review): value range is not visible here. */
readonly confidence: number;
|
|
1611
|
+
/** Names of text columns. NOTE(review): presumably the columns features are extracted from — confirm. */
readonly textColumns: readonly string[];
|
|
1612
|
+
/** Names of feature columns. NOTE(review): presumably the extra columns extraction adds — confirm. */
readonly featureColumns: readonly string[];
|
|
1613
|
+
}
|
|
1614
|
+
/**
|
|
1615
|
+
* Detect the domain of a dataset based on its column names.
|
|
1616
|
+
*/
|
|
1617
|
+
declare function detectDomain(columns: readonly string[]): DomainProfile;
|
|
1618
|
+
/**
|
|
1619
|
+
* Annotate rows with domain-specific extracted columns.
|
|
1620
|
+
* Returns enriched rows plus indices with low extraction confidence.
|
|
1621
|
+
*/
|
|
1622
|
+
declare function extractFeatures(rows: readonly Row[], profile: DomainProfile, confidenceThreshold?: number): {
|
|
1623
|
+
/** Rows returned by the extraction. */
rows: Row[];
|
|
1624
|
+
/** Numeric ids/indices of low-confidence rows. NOTE(review): whether these are array indices or row ids is not visible — confirm. */
lowConfidenceIds: readonly number[];
|
|
1625
|
+
};
|
|
1626
|
+
|
|
1627
|
+
/**
|
|
1628
|
+
* lineage.ts — Provenance tracking for golden records.
|
|
1629
|
+
* Edge-safe: no `node:` imports.
|
|
1630
|
+
*
|
|
1631
|
+
* Ports goldenmatch/core/lineage.py. Records which source rows contributed
|
|
1632
|
+
* each golden-record field, with the survivorship strategy and confidence.
|
|
1633
|
+
*/
|
|
1634
|
+
|
|
1635
|
+
interface FieldProvenanceEntry {
|
|
1636
|
+
/** Id of the source row for this entry. */
readonly sourceRowId: number;
|
|
1637
|
+
/** Strategy name as a string. NOTE(review): presumably the survivorship strategy — confirm. */
readonly strategy: string;
|
|
1638
|
+
/** Numeric confidence for this entry. NOTE(review): value range is not visible here. */
readonly confidence: number;
|
|
1639
|
+
}
|
|
1640
|
+
interface LineageEdge {
|
|
1641
|
+
readonly clusterId: number;
|
|
1642
|
+
readonly sourceRowIds: readonly number[];
|
|
1643
|
+
readonly goldenRowId: number;
|
|
1644
|
+
readonly fieldProvenance: Readonly<Record<string, FieldProvenanceEntry>>;
|
|
1645
|
+
readonly timestamp: string;
|
|
1646
|
+
/**
|
|
1647
|
+
* Human-readable summary of this lineage edge. Only populated when
|
|
1648
|
+
* `buildLineage` is called with `{ naturalLanguage: true }`.
|
|
1649
|
+
*/
|
|
1650
|
+
readonly naturalLanguage?: string;
|
|
1651
|
+
}
|
|
1652
|
+
interface LineageBundle {
|
|
1653
|
+
/** Lineage edges in this bundle. */
readonly edges: readonly LineageEdge[];
|
|
1654
|
+
/** Bundle timestamp as a string. NOTE(review): format (presumably ISO 8601) is not visible here — confirm. */
readonly timestamp: string;
|
|
1655
|
+
/** Record count. NOTE(review): whether this counts golden or source records is not visible here — confirm. */
readonly recordCount: number;
|
|
1656
|
+
}
|
|
1657
|
+
interface BuildLineageOptions {
|
|
1658
|
+
/**
|
|
1659
|
+
* When `true`, every emitted `LineageEdge` gets a human-readable
|
|
1660
|
+
* `naturalLanguage` summary describing which rows were merged, how many
|
|
1661
|
+
* fields carry provenance, and the strongest contributing field. Zero LLM
|
|
1662
|
+
* cost — purely template-based.
|
|
1663
|
+
*/
|
|
1664
|
+
readonly naturalLanguage?: boolean;
|
|
1665
|
+
readonly defaultStrategy?: string;
|
|
1666
|
+
}
|
|
1667
|
+
/**
|
|
1668
|
+
* Build a lineage bundle from a DedupeResult.
|
|
1669
|
+
*
|
|
1670
|
+
* The resulting bundle has one edge per golden record, with field-level
|
|
1671
|
+
* provenance keyed by column name. Source row IDs include every member of
|
|
1672
|
+
* the cluster the golden record came from.
|
|
1673
|
+
*
|
|
1674
|
+
* Pass `{ naturalLanguage: true }` to populate a human-readable summary on
|
|
1675
|
+
* each edge (see {@link LineageEdge.naturalLanguage}).
|
|
1676
|
+
*/
|
|
1677
|
+
declare function buildLineage(result: DedupeResult, options?: BuildLineageOptions): LineageBundle;
|
|
1678
|
+
/** Serialize a lineage bundle to stable, human-readable JSON. */
|
|
1679
|
+
declare function lineageToJson(bundle: LineageBundle): string;
|
|
1680
|
+
/** Parse a lineage bundle from JSON. Does not validate schema. */
|
|
1681
|
+
declare function lineageFromJson(json: string): LineageBundle;
|
|
1682
|
+
|
|
1683
|
+
/**
|
|
1684
|
+
* learned-blocking.ts — Data-driven predicate selection for blocking.
|
|
1685
|
+
* Edge-safe: no `node:` imports.
|
|
1686
|
+
*
|
|
1687
|
+
* Ports goldenmatch/core/learned_blocking.py. Given a labelled set of
|
|
1688
|
+
* matching pairs, greedy-select predicates (equal, soundex, prefix,
|
|
1689
|
+
* qgram) that maximize recall while minimizing candidate pair count.
|
|
1690
|
+
*/
|
|
1691
|
+
|
|
1692
|
+
interface LearnedPredicate {
|
|
1693
|
+
/** Predicate kind; one of the four literals below. */
readonly type: "equal" | "soundex" | "first_n_chars" | "qgram";
|
|
1694
|
+
/** Column/field the predicate is applied to. */
readonly field: string;
|
|
1695
|
+
/** Optional numeric parameter. NOTE(review): presumably the prefix length for "first_n_chars" or q for "qgram" — confirm. */
readonly n?: number;
|
|
1696
|
+
/** Recall figure for this predicate. NOTE(review): presumably measured on the known matching pairs — confirm. */
readonly recall: number;
|
|
1697
|
+
/** Reduction figure for this predicate. NOTE(review): presumably the candidate-pair reduction ratio — confirm. */
readonly reduction: number;
|
|
1698
|
+
}
|
|
1699
|
+
interface LearnedRules {
|
|
1700
|
+
/** The predicates that make up these rules. */
readonly predicates: readonly LearnedPredicate[];
|
|
1701
|
+
/** Minimum-recall setting recorded with the rules. */
readonly minRecall: number;
|
|
1702
|
+
/** Minimum-reduction setting recorded with the rules. */
readonly minReduction: number;
|
|
1703
|
+
/** When the rules were learned, as a string. NOTE(review): format (presumably ISO 8601) is not visible — confirm. */
readonly learnedAt: string;
|
|
1704
|
+
}
|
|
1705
|
+
interface LearnBlockingOptions {
|
|
1706
|
+
/** Optional target recall. NOTE(review): default value is not visible here. */
readonly minRecall?: number;
|
|
1707
|
+
/** Optional minimum reduction. NOTE(review): how it filters predicates and its default are not visible here. */
readonly minReduction?: number;
|
|
1708
|
+
/** Optional predicate-depth limit. NOTE(review): default value is not visible here. */
readonly predicateDepth?: number;
|
|
1709
|
+
}
|
|
1710
|
+
/**
|
|
1711
|
+
* Learn blocking rules from known-matching pairs.
|
|
1712
|
+
*
|
|
1713
|
+
* Evaluates candidate predicates for each column, then greedily selects
|
|
1714
|
+
* predicates that add the most new recall per candidate pair introduced,
|
|
1715
|
+
* stopping when minRecall is reached (or when options.predicateDepth is
|
|
1716
|
+
* exceeded).
|
|
1717
|
+
*/
|
|
1718
|
+
declare function learnBlockingRules(rows: readonly Row[], knownPairs: readonly ScoredPair[], columns: readonly string[], options?: LearnBlockingOptions): LearnedRules;
|
|
1719
|
+
/**
|
|
1720
|
+
* Apply learned rules to rows, returning one BlockResult per non-empty,
|
|
1721
|
+
* non-oversized block. Each predicate is its own pass (OR'd together).
|
|
1722
|
+
*/
|
|
1723
|
+
declare function applyLearnedBlocks(rows: readonly Row[], rules: LearnedRules, maxBlockSize: number): BlockResult[];
|
|
1724
|
+
|
|
1725
|
+
/**
|
|
1726
|
+
* graph-er.ts — Multi-table entity resolution with evidence propagation.
|
|
1727
|
+
* Edge-safe: no `node:` imports.
|
|
1728
|
+
*
|
|
1729
|
+
* Ports goldenmatch/core/graph_er.py. Each table is deduped independently
|
|
1730
|
+
* first, then cluster assignments propagate across foreign-key edges:
|
|
1731
|
+
* if row A.fk points into B's cluster, rows of A whose FK shares a cluster
|
|
1732
|
+
* get a similarity boost before re-clustering.
|
|
1733
|
+
*/
|
|
1734
|
+
|
|
1735
|
+
interface TableSchema {
|
|
1736
|
+
/** Table name. NOTE(review): presumably the name Relationship.tableA/tableB refer to — confirm. */
readonly name: string;
|
|
1737
|
+
/** Rows of the table. */
readonly rows: readonly Row[];
|
|
1738
|
+
/** Name of the table's id column. */
readonly idColumn: string;
|
|
1739
|
+
}
|
|
1740
|
+
interface Relationship {
|
|
1741
|
+
/** Name of the first table. NOTE(review): presumably the table holding the foreign key — confirm. */
readonly tableA: string;
|
|
1742
|
+
/** Name of the second table. NOTE(review): presumably the table the foreign key points into — confirm. */
readonly tableB: string;
|
|
1743
|
+
/** Name of the foreign-key column. NOTE(review): presumably a column of tableA — confirm. */
readonly fkColumn: string;
|
|
1744
|
+
}
|
|
1745
|
+
interface GraphERResult {
|
|
1746
|
+
/** Outer map keyed by string, inner map keyed by number. NOTE(review): presumably table name, then cluster id — confirm. */
readonly clustersByTable: ReadonlyMap<string, ReadonlyMap<number, ClusterInfo>>;
|
|
1747
|
+
/** Convergence flag. NOTE(review): presumably true when clusters stabilized before the iteration cap — confirm. */
readonly converged: boolean;
|
|
1748
|
+
/** Number of iterations. */
readonly iterations: number;
|
|
1749
|
+
}
|
|
1750
|
+
/**
|
|
1751
|
+
* A scorer for graph ER.
|
|
1752
|
+
*
|
|
1753
|
+
* **Contract:** Must return `ScoredPair.idA` / `.idB` as **0-based row indices
|
|
1754
|
+
* into the input `rows` array**, NOT the `__row_id__` values (or any other
|
|
1755
|
+
* external/stable row identifier) those rows may carry.
|
|
1756
|
+
*
|
|
1757
|
+
* Why: `runGraphER` seeds its Union-Find with `0..rows.length` and indexes
|
|
1758
|
+
* foreign-key cluster lookups by row position. Returning external row IDs
|
|
1759
|
+
* instead of 0-based indices causes the evidence-propagation boost to never
|
|
1760
|
+
* apply (ids won't line up with the UF roots or the fk-index map) and can
|
|
1761
|
+
* silently produce wrong clusters.
|
|
1762
|
+
*
|
|
1763
|
+
* If you have stable external row IDs, re-number your rows to 0-based
|
|
1764
|
+
* positional indices before scoring, then map back afterward.
|
|
1765
|
+
*/
|
|
1766
|
+
interface GraphERScorer {
|
|
1767
|
+
(rows: readonly Row[]): readonly ScoredPair[];
|
|
1768
|
+
}
|
|
1769
|
+
/** Options accepted by `runGraphER`. */
interface RunGraphEROptions {
|
|
1770
|
+
/** Upper bound on propagation rounds: iteration repeats until clusters stabilize or this limit is reached. Default is not visible in this declaration. */
readonly maxIterations?: number;
|
|
1771
|
+
/** NOTE(review): presumably the tolerance used to decide that clusters have stabilized; exact semantics and default are not visible here — confirm. */
readonly convergenceThreshold?: number;
|
|
1772
|
+
/** Boost applied to pair scores in table A when both rows' foreign keys resolve to the same cluster in table B. Default is not visible in this declaration. */
readonly similarityBoost?: number;
|
|
1773
|
+
/** Per-table scorer: takes rows, returns scored pairs. Required. Keys are presumably table names — TODO confirm. */
|
|
1774
|
+
readonly scorerByTable: ReadonlyMap<string, GraphERScorer>;
|
|
1775
|
+
/** Match threshold for building clusters. Default 0.85. */
|
|
1776
|
+
readonly threshold?: number;
|
|
1777
|
+
}
|
|
1778
|
+
/**
|
|
1779
|
+
* Run multi-table entity resolution with iterative evidence propagation.
|
|
1780
|
+
*
|
|
1781
|
+
* For each table, the caller provides a scorer that produces pair scores
|
|
1782
|
+
* from a row array. The algorithm:
|
|
1783
|
+
* 1. Score & cluster each table independently.
|
|
1784
|
+
* 2. For every relationship A->B: find pairs in A whose fk resolves to
|
|
1785
|
+
* the same cluster in B. Boost those pair scores by `similarityBoost`.
|
|
1786
|
+
* 3. Re-cluster every table. Repeat until clusters stabilize or
|
|
1787
|
+
* `maxIterations` is reached.
|
|
1788
|
+
*
|
|
1789
|
+
* **Scorer contract (important):** scorers in `options.scorerByTable` must
|
|
1790
|
+
* return `ScoredPair.idA` / `.idB` as **0-based row indices** into the
|
|
1791
|
+
* `rows` array they were handed (NOT the stable `__row_id__` values those
|
|
1792
|
+
* rows may carry). The evidence-propagation step keys foreign-key cluster
|
|
1793
|
+
* lookups by row position; using external row IDs will silently make the
|
|
1794
|
+
* boost no-op and can produce wrong clusters. See {@link GraphERScorer}.
|
|
1795
|
+
*
* @param tables - The tables to resolve.
* @param relationships - Foreign-key relationships (A->B) along which evidence is propagated.
* @param options - Per-table scorers plus iteration, boost and threshold settings.
* @returns Per-table clusters together with the `converged` and `iterations` fields.
*/
|
|
1796
|
+
declare function runGraphER(tables: readonly TableSchema[], relationships: readonly Relationship[], options: RunGraphEROptions): GraphERResult;
|
|
1797
|
+
|
|
1798
|
+
/**
|
|
1799
|
+
* memory/store.ts — Learning Memory store (in-memory backend).
|
|
1800
|
+
* Edge-safe: no `node:` imports.
|
|
1801
|
+
*
|
|
1802
|
+
* Ports goldenmatch/core/memory/store.py. SQLite / Postgres backends are
|
|
1803
|
+
* deferred (they require host-specific drivers); the in-memory backend
|
|
1804
|
+
* keeps all corrections in a plain array with trust-based upsert.
|
|
1805
|
+
*/
|
|
1806
|
+
/** A user correction for one row pair, as kept by `MemoryStore`. */
interface Correction {
|
|
1807
|
+
/** First row of the corrected pair. NOTE(review): whether this is a `__row_id__` value or a positional index is not visible here — confirm. */
readonly rowIdA: number;
|
|
1808
|
+
/** Second row of the corrected pair (same identifier space as `rowIdA`). */
readonly rowIdB: number;
|
|
1809
|
+
/** The verdict for the pair: "match" or "no_match". */
readonly verdict: "match" | "no_match";
|
|
1810
|
+
/** Together with the pair, identifies the correction for `MemoryStore.upsert`. NOTE(review): the exact meaning of "feature" is not visible here — confirm. */
readonly feature: string;
|
|
1811
|
+
/** Score stored with the correction; `MemoryLearner.learn` compares it against `verdict` when tuning the threshold. */
readonly score: number;
|
|
1812
|
+
/** NOTE(review): presumably when the correction was recorded (units not visible here); `upsert` breaks trust ties toward the more recent correction. */
readonly timestamp: number;
|
|
1813
|
+
/** Trust level; `MemoryStore.upsert` keeps whichever correction for the same (pair, feature) has higher trust. */
readonly trust: number;
|
|
1814
|
+
/** NOTE(review): presumably a label for where the correction came from — confirm. */
readonly source: string;
|
|
1815
|
+
}
|
|
1816
|
+
/** Configuration for `MemoryStore`. */
interface MemoryStoreConfig {
|
|
1817
|
+
/** Storage backend. Per the module header, only the in-memory backend is implemented in this port; SQLite / Postgres are deferred. */
readonly backend: "memory" | "sqlite" | "postgres";
|
|
1818
|
+
/** NOTE(review): presumably a storage location for the non-memory backends — confirm. */
readonly path?: string;
|
|
1819
|
+
/** NOTE(review): presumably the trust value assumed when none is supplied; how it is applied is not visible here — confirm. */
readonly trustDefault?: number;
|
|
1820
|
+
}
|
|
1821
|
+
/** Learning Memory store (in-memory backend) holding `Correction` records. */
declare class MemoryStore {
|
|
1822
|
+
private readonly config;
|
|
1823
|
+
private corrections;
|
|
1824
|
+
/** @param config - Optional store configuration; the defaults used when omitted are not visible in this declaration. */
constructor(config?: MemoryStoreConfig);
|
|
1825
|
+
/** Append a correction unconditionally. */
|
|
1826
|
+
add(correction: Correction): void;
|
|
1827
|
+
/** Append many corrections unconditionally. */
|
|
1828
|
+
addBatch(corrections: readonly Correction[]): void;
|
|
1829
|
+
/** All corrections, in insertion order. */
|
|
1830
|
+
list(): readonly Correction[];
|
|
1831
|
+
/** Corrections whose verdict is "match". */
|
|
1832
|
+
listMatches(): readonly Correction[];
|
|
1833
|
+
/** Corrections whose verdict is "no_match". */
|
|
1834
|
+
listNonMatches(): readonly Correction[];
|
|
1835
|
+
/** NOTE(review): presumably the number of stored corrections — confirm. */
count(): number;
|
|
1836
|
+
/** NOTE(review): presumably removes every stored correction — confirm. */
clear(): void;
|
|
1837
|
+
/**
|
|
1838
|
+
* Trust-based upsert: if a correction for the same (pair, feature) already
|
|
1839
|
+
* exists, keep whichever has higher trust. Ties break toward the more recent
|
|
1840
|
+
* correction.
|
|
1841
|
+
*/
|
|
1842
|
+
upsert(correction: Correction): void;
|
|
1843
|
+
/** Return the effective config (for debugging). */
|
|
1844
|
+
getConfig(): MemoryStoreConfig;
|
|
1845
|
+
}
|
|
1846
|
+
|
|
1847
|
+
/**
|
|
1848
|
+
* memory/corrections.ts — Apply stored corrections to scored pairs.
|
|
1849
|
+
* Edge-safe: no `node:` imports.
|
|
1850
|
+
*
|
|
1851
|
+
* Ports goldenmatch/core/memory/corrections.py. A correction is only
|
|
1852
|
+
* applied if both rows still hash to the values seen when the correction
|
|
1853
|
+
* was recorded (dual-hash staleness detection).
|
|
1854
|
+
*/
|
|
1855
|
+
|
|
1856
|
+
/**
* Hash of a row across its non-internal fields (sorted, stringified).
*
* NOTE(review): which fields count as "internal" and the hash algorithm are
* not visible in this declaration. `applyCorrections` documents comparing
* stored hashes against a fresh hash of the current row — presumably
* computed with this function; confirm.
*
* @param row - The row to hash.
* @returns The hash as a string.
*/
|
|
1857
|
+
declare function hashRow(row: Row): string;
|
|
1858
|
+
/** Row hashes captured for one correction's pair at collection time; `applyCorrections` uses them for staleness detection. */
interface StoredRowHashes {
|
|
1859
|
+
/** Hash recorded for the row referred to by the correction's `rowIdA` (presumably a `hashRow` value — confirm). */
readonly rowIdAHash: string;
|
|
1860
|
+
/** Hash recorded for the row referred to by the correction's `rowIdB` (presumably a `hashRow` value — confirm). */
readonly rowIdBHash: string;
|
|
1861
|
+
}
|
|
1862
|
+
/**
|
|
1863
|
+
* A caller can either provide a per-correction hash map (populated at
|
|
1864
|
+
* collection time) or omit it and let applyCorrections compute current hashes alone
|
|
1865
|
+
* — in which case staleness detection is a no-op (hashes always match).
|
|
1866
|
+
*/
|
|
1867
|
+
interface ApplyCorrectionsOptions {
|
|
1868
|
+
/** Hashes recorded when each correction was collected. NOTE(review): the string key is presumably a pair key for the correction's pair — confirm the format. */
readonly originalHashes?: ReadonlyMap<string, StoredRowHashes>;
|
|
1869
|
+
/** Score a pair is clamped to when a "match" correction applies. Default 1.0. */
|
|
1870
|
+
readonly matchScore?: number;
|
|
1871
|
+
/** Score a pair is clamped to when a "no_match" correction applies. Default 0.0. */
readonly noMatchScore?: number;
|
|
1872
|
+
}
|
|
1873
|
+
/**
|
|
1874
|
+
* Apply user corrections stored in `store` to a list of scored pairs.
|
|
1875
|
+
*
|
|
1876
|
+
* For each correction:
|
|
1877
|
+
* - Find the pair (idA,idB) in the `pairs` list.
|
|
1878
|
+
* - If caller supplied original hashes, compare them against a fresh
|
|
1879
|
+
* hash of each current row. Mismatch => stale, skip.
|
|
1880
|
+
* - Otherwise apply the verdict:
|
|
1881
|
+
* "match" -> score clamped to matchScore (default 1.0)
|
|
1882
|
+
* "no_match" -> score clamped to noMatchScore (default 0.0)
|
|
1883
|
+
*
|
|
1884
|
+
* Returns the modified pairs plus counts of applied / stale corrections.
|
|
1885
|
+
*
* @param pairs - Scored pairs to adjust.
* @param rows - Current rows. NOTE(review): presumably the source of the fresh hashes used in the staleness check — confirm.
* @param store - Store holding the corrections to apply.
* @param options - Optional original hashes and clamp-score overrides.
* @returns `pairs` (the modified pairs), `applied` (corrections applied) and `stale` (corrections skipped as stale).
*/
|
|
1886
|
+
declare function applyCorrections(pairs: readonly ScoredPair[], rows: readonly Row[], store: MemoryStore, options?: ApplyCorrectionsOptions): {
|
|
1887
|
+
pairs: readonly ScoredPair[];
|
|
1888
|
+
applied: number;
|
|
1889
|
+
stale: number;
|
|
1890
|
+
};
|
|
1891
|
+
|
|
1892
|
+
/**
|
|
1893
|
+
* memory/learner.ts — Threshold tuning & weight learning from corrections.
|
|
1894
|
+
* Edge-safe: no `node:` imports.
|
|
1895
|
+
*
|
|
1896
|
+
* Ports goldenmatch/core/memory/learner.py. Given ≥10 corrections, sweep
|
|
1897
|
+
* thresholds and pick the one maximizing F1 on the correction set. Given
|
|
1898
|
+
* ≥50 corrections with per-field subscores, fit a simple logistic-
|
|
1899
|
+
* regression-like weight update.
|
|
1900
|
+
*/
|
|
1901
|
+
|
|
1902
|
+
/** Parameters produced by `MemoryLearner.learn`. */
interface LearnedParams {
|
|
1903
|
+
/** Tuned match threshold (the best-F1 threshold from the sweep). NOTE(review): presumably absent when there are too few corrections to tune — confirm. */
readonly threshold?: number;
|
|
1904
|
+
/** Learned field weights; only learnable when subscores are supplied. Keys are presumably matchkey field names — confirm. */
readonly fieldWeights?: Readonly<Record<string, number>>;
|
|
1905
|
+
/** NOTE(review): presumably the number of corrections the learner was given — confirm. */
readonly correctionCount: number;
|
|
1906
|
+
}
|
|
1907
|
+
/**
|
|
1908
|
+
* Per-correction subscores. When present, keys correspond to matchkey field
|
|
1909
|
+
* names and values are in [0,1], representing each field's contribution.
|
|
1910
|
+
* The learner uses these only when ≥ weightsMinCorrections samples include
|
|
1911
|
+
* them.
|
|
1912
|
+
*/
|
|
1913
|
+
interface CorrectionSubscores {
|
|
1914
|
+
/** NOTE(review): presumably the key of the corrected pair, linking these subscores to a `Correction` — confirm the format. */
readonly pairKey: string;
|
|
1915
|
+
/** Matchkey field name -> that field's subscore in [0,1]. */
readonly subscores: Readonly<Record<string, number>>;
|
|
1916
|
+
}
|
|
1917
|
+
/** Learns a match threshold and, optionally, field weights from corrections. */
declare class MemoryLearner {
|
|
1918
|
+
private readonly config;
|
|
1919
|
+
/** @param config - Optional learning configuration; the defaults used when omitted are not visible in this declaration. */
constructor(config?: LearningConfig);
|
|
1920
|
+
/**
|
|
1921
|
+
* Tune threshold and (optionally) field weights from corrections.
|
|
1922
|
+
*
|
|
1923
|
+
* Threshold tuning: sweep 0.5..0.95 in 0.05 steps, compute F1 using each
|
|
1924
|
+
* correction's stored `score` vs its verdict. Returns the threshold with
|
|
1925
|
+
* the best F1 (ties break toward higher threshold for precision).
|
|
1926
|
+
*
|
|
1927
|
+
* Field weights: requires subscores. Fits a tiny gradient update that
|
|
1928
|
+
* nudges weights toward better discrimination of match / no_match.
|
|
1929
|
+
*
* @param corrections - Corrections to learn from.
* @param baseline - Matchkey configuration. NOTE(review): presumably the starting point for the learned values — confirm.
* @param subscores - Optional per-correction subscores; needed for field-weight learning.
* @returns The learned threshold and/or field weights, plus `correctionCount`.
*/
|
|
1930
|
+
learn(corrections: readonly Correction[], baseline: MatchkeyConfig, subscores?: readonly CorrectionSubscores[]): LearnedParams;
|
|
1931
|
+
}
|
|
1932
|
+
|
|
1933
|
+
/**
|
|
1934
|
+
* pprl/protocol.ts — Privacy-preserving record linkage.
|
|
1935
|
+
* Edge-safe: no `node:` imports.
|
|
1936
|
+
*
|
|
1937
|
+
* Ports goldenmatch/pprl/protocol.py. Encodes both datasets as bloom
|
|
1938
|
+
* filters (CLKs) over the selected fields, then scores pairs via Dice or
|
|
1939
|
+
* Jaccard similarity. Two protocol stubs are surfaced: trusted third
|
|
1940
|
+
* party (no crypto beyond the bloom filter itself) and a simple SMC
|
|
1941
|
+
* sketch that adds a salt per party before encoding.
|
|
1942
|
+
*/
|
|
1943
|
+
|
|
1944
|
+
/** Configuration for `runPPRL`. */
interface PPRLConfig {
|
|
1945
|
+
/** The selected fields that are encoded into each record's bloom filter. */
readonly fields: readonly string[];
|
|
1946
|
+
/** NOTE(review): how each level changes the encoding is not visible in this declaration — confirm. */
readonly securityLevel: "standard" | "high" | "paranoid";
|
|
1947
|
+
/** Protocol variant: trusted third party, or the SMC sketch that (per the module header) adds a salt per party before encoding. */
readonly protocol: "trusted_third_party" | "smc";
|
|
1948
|
+
/** Similarity threshold; `runPPRL` emits pair matches above it. */
readonly threshold: number;
|
|
1949
|
+
/** Optional salt used with "high"/"paranoid" levels. */
|
|
1950
|
+
readonly salt?: string;
|
|
1951
|
+
}
|
|
1952
|
+
/** One matched pair in a `PPRLResult`. */
interface PPRLMatch {
|
|
1953
|
+
/** NOTE(review): presumably identifies the record from `rowsA`; whether it is a positional index or a row id is not visible here — confirm. */
readonly idA: number;
|
|
1954
|
+
/** NOTE(review): presumably identifies the record from `rowsB`; same caveat as `idA` — confirm. */
readonly idB: number;
|
|
1955
|
+
/** Similarity score of the pair (Dice or Jaccard over the bloom filters, per the module header). */
readonly score: number;
|
|
1956
|
+
}
|
|
1957
|
+
/** Value returned by `runPPRL`. */
interface PPRLResult {
|
|
1958
|
+
/** Pair matches that scored above the configured threshold. */
readonly matches: readonly PPRLMatch[];
|
|
1959
|
+
/** NOTE(review): free-form run statistics; the keys are not visible in this declaration — confirm before relying on any. */
readonly stats: Readonly<Record<string, unknown>>;
|
|
1960
|
+
}
|
|
1961
|
+
/**
|
|
1962
|
+
* Encode both row sets as bloom filters and emit pair matches above the
|
|
1963
|
+
* configured threshold.
|
|
1964
|
+
*
* @param rowsA - First row set.
* @param rowsB - Second row set.
* @param config - Fields to encode, security level, protocol, threshold and optional salt.
* @returns The matches plus a `stats` record.
*/
|
|
1965
|
+
declare function runPPRL(rowsA: readonly Row[], rowsB: readonly Row[], config: PPRLConfig): PPRLResult;
|
|
1966
|
+
/**
|
|
1967
|
+
* Auto-pick PPRL parameters for the given dataset pair. Penalizes
|
|
1968
|
+
* near-unique fields (IDs), over-long fields, and high-null fields.
|
|
1969
|
+
*
* @param rowsA - First row set.
* @param rowsB - Second row set.
* @returns A `PPRLConfig`, the type `runPPRL` accepts as its `config` argument.
*/
|
|
1970
|
+
declare function autoConfigurePPRL(rowsA: readonly Row[], rowsB: readonly Row[]): PPRLConfig;
|
|
1971
|
+
|
|
1972
|
+
export { ANNBlocker, type ANNBlockerBase, type ANNBlockerOptions, type AutoFixLog, type AutoconfigOptions, BlockResult, BlockingConfig, BlockingKeyConfig, BudgetConfig, type BudgetSnapshot, BudgetTracker, type BuildANNOptions, type CCMSResult, type ClusterExplanation, ClusterInfo, ClusterProvenance, type ColumnProfile, ColumnValue, type Correction, type CreateANNBlockerOptions, CrossEncoderHttpError, CrossEncoderModel, type CrossEncoderModelOptions, type CrossEncoderOptions, type CrossEncoderProvider, type CrossEncoderReranker, type DatasetProfile, DedupeResult, type DomainProfile, type EMResult, Embedder, EmbedderError, type EmbedderOptions, type EmbedderProvider, type EmbeddingResult, type EvalResult, type GatedResult, GoldenFieldRule, GoldenMatchConfig, GoldenRulesConfig, type GraphERResult, HNSWANNBlocker, type HNSWIndexLike, type HNSWModule, type HNSWOptions, type LLMScoreResult, LLMScorerConfig, type LearnedParams, type LearnedPredicate, type LearnedRules, LearningConfig, type LineageBundle, type LineageEdge, MatchResult, MatchkeyConfig, MatchkeyField, MemoryLearner, MemoryStore, type MemoryStoreConfig, type PPRLConfig, type PPRLResult, type PairExplanation, PairKey, QualityConfig, type QualityFinding, type Relationship, type ReviewItem, ReviewQueue, Row, ScoredPair, type SensitivityResult, StreamProcessor, type SweepParam, type SweepPoint, type TableSchema, TabularData, UnionFind, type ValidationReport, type ValidationRule, _resetCrossEncoderModelCache, addRowIds, addSourceColumn, addToCluster, applyColumnMap, applyCorrections, applyLearnedBlocks, applyStandardization, applyStandardizer, applyTransform, applyTransforms, asString, autoConfigurePPRL, autoConfigureRows, autoFixRows, buildANNBlocks, buildANNPairBlocks, buildAdaptiveBlocks, buildBlocks, buildBlocksAsync, buildClusters, buildComparisonVector, buildGoldenRecord, buildGoldenRecordWithProvenance, buildLineage, buildMst, buildMultiPassBlocks, buildStaticBlocks, compareClusters, 
computeClusterConfidence, computeMatchkeyValue, computeMatchkeys, concatRows, configToYaml, cosineSim, countTokensApprox, createANNBlocker, dedupe, detectDomain, diceCoefficient, ensembleScore, euclideanDist, evaluateClusters, evaluatePairs, explainCluster, explainPair, extractFeatures, findExactMatches, findExactMatchesOne, findFuzzyMatches, gatePairs, getClusterPairScores, getEmbedder, hashRow, indelDistance, indelSimilarity, isNullish, jaccardSimilarity, jaro, jaroWinkler, learnBlockingRules, levenshteinDistance, levenshteinSimilarity, lineageFromJson, lineageToJson, llmClusterPairs, llmScorePairs, loadGroundTruthPairs, match, matchOne, mergeField, metaphone, pairKey, parseConfig, parseConfigYaml, parsePairKey, profileRows, rerankPair, rerankTopPairs, runDedupePipeline, runGraphER, runMatchPipeline, runPPRL, runQualityCheck, runSensitivity, scanQuality, scoreBlocksSequential, scoreField, scoreMatrix, scorePair, scorePairRecord, scoreProbabilistic, scoreStrings, scoreStringsWithLlm, selectBestBlockingKey, soundex, soundexMatch, splitOversizedCluster, stabilityReport, toColumnValue, tokenSortRatio, trainEM, unmergeCluster, unmergeRecord, validateColumns, validateRows };
|