goldenmatch 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +140 -0
- package/dist/cli.cjs +6079 -0
- package/dist/cli.cjs.map +1 -0
- package/dist/cli.d.cts +1 -0
- package/dist/cli.d.ts +1 -0
- package/dist/cli.js +6076 -0
- package/dist/cli.js.map +1 -0
- package/dist/core/index.cjs +8449 -0
- package/dist/core/index.cjs.map +1 -0
- package/dist/core/index.d.cts +1972 -0
- package/dist/core/index.d.ts +1972 -0
- package/dist/core/index.js +8318 -0
- package/dist/core/index.js.map +1 -0
- package/dist/index.cjs +8449 -0
- package/dist/index.cjs.map +1 -0
- package/dist/index.d.cts +2 -0
- package/dist/index.d.ts +2 -0
- package/dist/index.js +8318 -0
- package/dist/index.js.map +1 -0
- package/dist/node/backends/score-worker.cjs +934 -0
- package/dist/node/backends/score-worker.cjs.map +1 -0
- package/dist/node/backends/score-worker.d.cts +14 -0
- package/dist/node/backends/score-worker.d.ts +14 -0
- package/dist/node/backends/score-worker.js +932 -0
- package/dist/node/backends/score-worker.js.map +1 -0
- package/dist/node/index.cjs +11430 -0
- package/dist/node/index.cjs.map +1 -0
- package/dist/node/index.d.cts +554 -0
- package/dist/node/index.d.ts +554 -0
- package/dist/node/index.js +11277 -0
- package/dist/node/index.js.map +1 -0
- package/dist/types-DhUdX5Rc.d.cts +304 -0
- package/dist/types-DhUdX5Rc.d.ts +304 -0
- package/examples/01-basic-dedupe.ts +60 -0
- package/examples/02-match-two-datasets.ts +48 -0
- package/examples/03-csv-file-pipeline.ts +62 -0
- package/examples/04-string-scoring.ts +63 -0
- package/examples/05-custom-config.ts +94 -0
- package/examples/06-probabilistic-fs.ts +72 -0
- package/examples/07-pprl-privacy.ts +76 -0
- package/examples/08-streaming.ts +79 -0
- package/examples/09-llm-scorer.ts +79 -0
- package/examples/10-explain.ts +60 -0
- package/examples/11-evaluate.ts +61 -0
- package/examples/README.md +53 -0
- package/package.json +66 -0
- package/src/cli.ts +372 -0
- package/src/core/ann-blocker.ts +593 -0
- package/src/core/api.ts +220 -0
- package/src/core/autoconfig.ts +363 -0
- package/src/core/autofix.ts +102 -0
- package/src/core/blocker.ts +655 -0
- package/src/core/cluster.ts +699 -0
- package/src/core/compare-clusters.ts +176 -0
- package/src/core/config/loader.ts +869 -0
- package/src/core/cross-encoder.ts +614 -0
- package/src/core/data.ts +430 -0
- package/src/core/domain.ts +277 -0
- package/src/core/embedder.ts +562 -0
- package/src/core/evaluate.ts +156 -0
- package/src/core/explain.ts +352 -0
- package/src/core/golden.ts +524 -0
- package/src/core/graph-er.ts +371 -0
- package/src/core/index.ts +314 -0
- package/src/core/ingest.ts +112 -0
- package/src/core/learned-blocking.ts +305 -0
- package/src/core/lineage.ts +221 -0
- package/src/core/llm/budget.ts +258 -0
- package/src/core/llm/cluster.ts +542 -0
- package/src/core/llm/scorer.ts +396 -0
- package/src/core/match-one.ts +95 -0
- package/src/core/matchkey.ts +97 -0
- package/src/core/memory/corrections.ts +179 -0
- package/src/core/memory/learner.ts +218 -0
- package/src/core/memory/store.ts +114 -0
- package/src/core/pipeline.ts +366 -0
- package/src/core/pprl/protocol.ts +216 -0
- package/src/core/probabilistic.ts +511 -0
- package/src/core/profiler.ts +212 -0
- package/src/core/quality.ts +197 -0
- package/src/core/review-queue.ts +177 -0
- package/src/core/scorer.ts +855 -0
- package/src/core/sensitivity.ts +196 -0
- package/src/core/standardize.ts +279 -0
- package/src/core/streaming.ts +128 -0
- package/src/core/transforms.ts +599 -0
- package/src/core/types.ts +570 -0
- package/src/core/validate.ts +243 -0
- package/src/index.ts +8 -0
- package/src/node/a2a/server.ts +470 -0
- package/src/node/api/server.ts +412 -0
- package/src/node/backends/duckdb.ts +130 -0
- package/src/node/backends/score-worker.ts +41 -0
- package/src/node/backends/workers.ts +212 -0
- package/src/node/config-file.ts +66 -0
- package/src/node/connectors/base.ts +57 -0
- package/src/node/connectors/bigquery.ts +61 -0
- package/src/node/connectors/databricks.ts +69 -0
- package/src/node/connectors/file.ts +350 -0
- package/src/node/connectors/hubspot.ts +62 -0
- package/src/node/connectors/index.ts +43 -0
- package/src/node/connectors/salesforce.ts +93 -0
- package/src/node/connectors/snowflake.ts +73 -0
- package/src/node/db/postgres.ts +173 -0
- package/src/node/db/sync.ts +103 -0
- package/src/node/dedupe-file.ts +156 -0
- package/src/node/index.ts +89 -0
- package/src/node/mcp/server.ts +940 -0
- package/src/node/tui/app.ts +756 -0
- package/src/node/tui/index.ts +6 -0
- package/src/node/tui/widgets.ts +128 -0
- package/tests/parity/scorer-ground-truth.test.ts +118 -0
- package/tests/smoke.test.ts +46 -0
- package/tests/unit/a2a-server.test.ts +175 -0
- package/tests/unit/ann-blocker.test.ts +117 -0
- package/tests/unit/api-server.test.ts +239 -0
- package/tests/unit/api.test.ts +77 -0
- package/tests/unit/autoconfig.test.ts +103 -0
- package/tests/unit/autofix.test.ts +71 -0
- package/tests/unit/blocker.test.ts +164 -0
- package/tests/unit/buildBlocksAsync.test.ts +63 -0
- package/tests/unit/cluster.test.ts +213 -0
- package/tests/unit/compare-clusters.test.ts +42 -0
- package/tests/unit/config-loader.test.ts +301 -0
- package/tests/unit/connectors-base.test.ts +48 -0
- package/tests/unit/cross-encoder-model.test.ts +198 -0
- package/tests/unit/cross-encoder.test.ts +173 -0
- package/tests/unit/db-connectors.test.ts +37 -0
- package/tests/unit/domain.test.ts +80 -0
- package/tests/unit/embedder.test.ts +151 -0
- package/tests/unit/evaluate.test.ts +85 -0
- package/tests/unit/explain.test.ts +73 -0
- package/tests/unit/golden.test.ts +97 -0
- package/tests/unit/graph-er.test.ts +173 -0
- package/tests/unit/hnsw-ann.test.ts +283 -0
- package/tests/unit/hubspot-connector.test.ts +118 -0
- package/tests/unit/ingest.test.ts +97 -0
- package/tests/unit/learned-blocking.test.ts +134 -0
- package/tests/unit/lineage.test.ts +135 -0
- package/tests/unit/match-one.test.ts +129 -0
- package/tests/unit/matchkey.test.ts +97 -0
- package/tests/unit/mcp-server.test.ts +183 -0
- package/tests/unit/memory.test.ts +119 -0
- package/tests/unit/pipeline.test.ts +118 -0
- package/tests/unit/pprl-protocol.test.ts +381 -0
- package/tests/unit/probabilistic.test.ts +494 -0
- package/tests/unit/profiler.test.ts +68 -0
- package/tests/unit/review-queue.test.ts +68 -0
- package/tests/unit/salesforce-connector.test.ts +148 -0
- package/tests/unit/scorer.test.ts +301 -0
- package/tests/unit/sensitivity.test.ts +154 -0
- package/tests/unit/standardize.test.ts +84 -0
- package/tests/unit/streaming.test.ts +82 -0
- package/tests/unit/transforms.test.ts +208 -0
- package/tests/unit/tui-widgets.test.ts +42 -0
- package/tests/unit/tui.test.ts +24 -0
- package/tests/unit/validate.test.ts +145 -0
- package/tests/unit/workers-parallel.test.ts +99 -0
- package/tests/unit/workers.test.ts +74 -0
- package/tsconfig.json +25 -0
- package/tsup.config.ts +37 -0
- package/vitest.config.ts +11 -0
|
@@ -0,0 +1,179 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* memory/corrections.ts — Apply stored corrections to scored pairs.
|
|
3
|
+
* Edge-safe: no `node:` imports.
|
|
4
|
+
*
|
|
5
|
+
* Ports goldenmatch/core/memory/corrections.py. A correction is only
|
|
6
|
+
* applied if both rows still hash to the values seen when the correction
|
|
7
|
+
* was recorded (dual-hash staleness detection).
|
|
8
|
+
*/
|
|
9
|
+
|
|
10
|
+
import type { Row, ScoredPair } from "../types.js";
|
|
11
|
+
import { makeScoredPair } from "../types.js";
|
|
12
|
+
import type { Correction, MemoryStore } from "./store.js";
|
|
13
|
+
|
|
14
|
+
// ---------------------------------------------------------------------------
|
|
15
|
+
// Row hashing
|
|
16
|
+
// ---------------------------------------------------------------------------
|
|
17
|
+
|
|
18
|
+
/**
 * Compute the 32-bit FNV-1a hash of a string, consuming one UTF-16 code
 * unit per round. The function is pure, so equal inputs always produce
 * equal digests.
 *
 * @param s - Text to hash.
 * @returns The unsigned 32-bit hash as lowercase hexadecimal (no zero
 *   padding).
 */
function hashString(s: string): string {
  const FNV_OFFSET_BASIS = 0x811c9dc5;
  const FNV_PRIME = 0x01000193;

  let acc = FNV_OFFSET_BASIS;
  let pos = 0;
  while (pos < s.length) {
    // XOR before multiply: that ordering is what makes this the "1a" variant.
    acc = Math.imul(acc ^ s.charCodeAt(pos), FNV_PRIME);
    pos += 1;
  }

  // `>>> 0` reinterprets the signed 32-bit accumulator as unsigned.
  return (acc >>> 0).toString(16);
}
|
|
30
|
+
|
|
31
|
+
/**
 * Fingerprint a row from its non-internal fields.
 *
 * Keys starting with "__" are ignored. The remaining keys are sorted, each
 * is rendered as `key=value`, the entries are joined with "|", and the
 * resulting text is passed to `hashString`. `null` and `undefined` values
 * both render as the sentinel "\u0000null"; every other value is rendered
 * with `String()`.
 *
 * @param row - Row to fingerprint.
 * @returns Hex digest produced by `hashString`.
 */
export function hashRow(row: Row): string {
  const rendered = Object.keys(row)
    .filter((name) => !name.startsWith("__"))
    .sort()
    .map((name) => {
      const value = row[name];
      const text =
        value === null || value === undefined ? "\u0000null" : String(value);
      return `${name}=${text}`;
    });
  return hashString(rendered.join("|"));
}
|
|
44
|
+
|
|
45
|
+
// ---------------------------------------------------------------------------
|
|
46
|
+
// Helpers
|
|
47
|
+
// ---------------------------------------------------------------------------
|
|
48
|
+
|
|
49
|
+
/**
 * Order-independent key for a pair of row ids, formatted as
 * "smaller|larger".
 */
function pairKey(a: number, b: number): string {
  if (a < b) {
    return `${a}|${b}`;
  }
  return `${b}|${a}`;
}
|
|
52
|
+
|
|
53
|
+
/**
 * Read the numeric row id stored under `__row_id__`.
 *
 * Accepts a finite number, or a string that parses to a finite number.
 * Blank or whitespace-only strings are rejected explicitly: `Number("")`
 * evaluates to 0, which would otherwise make an empty id alias row 0.
 * Non-finite numbers (NaN, ±Infinity) are rejected too, matching how the
 * string branch treats unparseable text.
 *
 * @param row - Row to inspect.
 * @returns The row id, or `null` when it is absent or not a usable number.
 */
function getRowId(row: Row): number | null {
  const raw = row["__row_id__"];
  if (typeof raw === "number") {
    return Number.isFinite(raw) ? raw : null;
  }
  if (typeof raw === "string") {
    // Guard against Number("") === 0 and Number("  ") === 0.
    if (raw.trim() === "") return null;
    const parsed = Number(raw);
    return Number.isFinite(parsed) ? parsed : null;
  }
  return null;
}
|
|
62
|
+
|
|
63
|
+
// ---------------------------------------------------------------------------
|
|
64
|
+
// Stored-correction metadata
|
|
65
|
+
// ---------------------------------------------------------------------------
|
|
66
|
+
|
|
67
|
+
/**
 * Row hashes recorded for one corrected pair. `applyCorrections` compares
 * them with fresh `hashRow` digests of the current rows and accepts either
 * ordering of the two rows.
 */
export interface StoredRowHashes {
  /** Hash recorded for the row referenced as `rowIdA`. */
  readonly rowIdAHash: string;
  /** Hash recorded for the row referenced as `rowIdB`. */
  readonly rowIdBHash: string;
}
|
|
71
|
+
|
|
72
|
+
/**
 * Tuning knobs for `applyCorrections`.
 *
 * Staleness detection only runs for pairs that have an entry in
 * `originalHashes`. When the map is omitted, or has no entry for a pair,
 * the correction for that pair is applied without any hash comparison.
 */
export interface ApplyCorrectionsOptions {
  /**
   * Hashes captured when each correction was recorded, keyed by the
   * canonical pair key ("smallerId|largerId").
   */
  readonly originalHashes?: ReadonlyMap<string, StoredRowHashes>;
  /** Score given to a pair whose correction verdict is "match". Default 1.0. */
  readonly matchScore?: number;
  /** Score given to a pair whose correction verdict is "no_match". Default 0.0. */
  readonly noMatchScore?: number;
}
|
|
83
|
+
|
|
84
|
+
// ---------------------------------------------------------------------------
|
|
85
|
+
// Apply corrections
|
|
86
|
+
// ---------------------------------------------------------------------------
|
|
87
|
+
|
|
88
|
+
/**
 * Override pair scores with the verdicts recorded in `store`.
 *
 * Corrections are first reduced to one per unordered pair: the highest
 * `trust` wins, and on equal trust the larger `timestamp` wins. Each scored
 * pair is then handled as follows:
 *   - no correction for the pair: passed through unchanged;
 *   - `options.originalHashes` holds an entry for the pair: both current
 *     rows are re-hashed with `hashRow`. If either row is missing from
 *     `rows`, or the hashes match the stored ones in neither ordering, the
 *     correction is counted as stale and the pair passes through unchanged;
 *   - otherwise the pair is rebuilt with `makeScoredPair` using
 *     `matchScore` for a "match" verdict and `noMatchScore` for any other.
 *
 * @param pairs - Scored pairs to post-process; the input is not mutated.
 * @param rows - Current rows; only rows with a usable `__row_id__` are
 *   indexed for the staleness check.
 * @param store - Source of corrections (read through `store.list()`).
 * @param options - Optional stored hashes and override scores.
 * @returns The resulting pairs in input order, plus how many corrections
 *   were applied and how many were skipped as stale.
 */
export function applyCorrections(
  pairs: readonly ScoredPair[],
  rows: readonly Row[],
  store: MemoryStore,
  options?: ApplyCorrectionsOptions,
): { pairs: readonly ScoredPair[]; applied: number; stale: number } {
  const scoreForMatch = options?.matchScore ?? 1.0;
  const scoreForNoMatch = options?.noMatchScore ?? 0.0;
  const recordedHashes = options?.originalHashes;

  // Current rows by id, used to hash the present state of each pair member.
  const currentRows = new Map<number, Row>();
  for (const row of rows) {
    const rowId = getRowId(row);
    if (rowId !== null) currentRows.set(rowId, row);
  }

  // One winning correction per canonical pair key.
  const winners = new Map<string, Correction>();
  for (const candidate of store.list()) {
    const key = pairKey(candidate.rowIdA, candidate.rowIdB);
    const incumbent = winners.get(key);
    if (incumbent !== undefined) {
      const outranks =
        candidate.trust > incumbent.trust ||
        (candidate.trust === incumbent.trust &&
          candidate.timestamp > incumbent.timestamp);
      if (!outranks) continue;
    }
    winners.set(key, candidate);
  }

  // True when stored hashes exist for the pair and no longer describe the rows.
  const isStale = (pair: ScoredPair, key: string): boolean => {
    const recorded = recordedHashes ? recordedHashes.get(key) : undefined;
    if (!recorded) return false;

    const first = currentRows.get(pair.idA);
    const second = currentRows.get(pair.idB);
    if (!first || !second) return true;

    const hashFirst = hashRow(first);
    const hashSecond = hashRow(second);
    const sameOrder =
      hashFirst === recorded.rowIdAHash && hashSecond === recorded.rowIdBHash;
    const swapped =
      hashFirst === recorded.rowIdBHash && hashSecond === recorded.rowIdAHash;
    return !(sameOrder || swapped);
  };

  let applied = 0;
  let stale = 0;
  const result: ScoredPair[] = [];

  for (const pair of pairs) {
    const key = pairKey(pair.idA, pair.idB);
    const correction = winners.get(key);

    if (!correction) {
      result.push(pair);
    } else if (isStale(pair, key)) {
      stale += 1;
      result.push(pair);
    } else {
      applied += 1;
      const overrideScore =
        correction.verdict === "match" ? scoreForMatch : scoreForNoMatch;
      result.push(makeScoredPair(pair.idA, pair.idB, overrideScore));
    }
  }

  return { pairs: result, applied, stale };
}
|
|
@@ -0,0 +1,218 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* memory/learner.ts — Threshold tuning & weight learning from corrections.
|
|
3
|
+
* Edge-safe: no `node:` imports.
|
|
4
|
+
*
|
|
5
|
+
* Ports goldenmatch/core/memory/learner.py. Given ≥10 corrections, sweep
|
|
6
|
+
* thresholds and pick the one maximizing F1 on the correction set. Given
|
|
7
|
+
* ≥50 corrections with per-field subscores, fit a simple logistic-
|
|
8
|
+
* regression-like weight update.
|
|
9
|
+
*/
|
|
10
|
+
|
|
11
|
+
import type { LearningConfig, MatchkeyConfig } from "../types.js";
|
|
12
|
+
import type { Correction } from "./store.js";
|
|
13
|
+
|
|
14
|
+
// ---------------------------------------------------------------------------
|
|
15
|
+
// Types
|
|
16
|
+
// ---------------------------------------------------------------------------
|
|
17
|
+
|
|
18
|
+
/** Result of `MemoryLearner.learn`. */
export interface LearnedParams {
  /** Tuned match threshold; absent when threshold tuning did not run or yielded nothing. */
  readonly threshold?: number;
  /** Tuned weight per field name; absent when weight tuning did not run or yielded nothing. */
  readonly fieldWeights?: Readonly<Record<string, number>>;
  /** Number of corrections passed to the learner. */
  readonly correctionCount: number;
}
|
|
23
|
+
|
|
24
|
+
/**
 * Per-field subscores for one corrected pair, used as the feature vector
 * during weight tuning. Weight tuning only runs when at least
 * `weightsMinCorrections` of these entries are supplied.
 */
export interface CorrectionSubscores {
  /** Canonical pair key, "minId|maxId". */
  readonly pairKey: string;
  /**
   * Subscore per matchkey field name. Values are expected in [0, 1] (not
   * validated here); a field with no entry is treated as 0 by the learner.
   */
  readonly subscores: Readonly<Record<string, number>>;
}
|
|
34
|
+
|
|
35
|
+
// ---------------------------------------------------------------------------
|
|
36
|
+
// Learner
|
|
37
|
+
// ---------------------------------------------------------------------------
|
|
38
|
+
|
|
39
|
+
/**
 * Config used by `MemoryLearner` when none is passed to its constructor:
 * threshold tuning needs at least 10 corrections, weight tuning at least 50.
 */
const DEFAULT_LEARNING_CONFIG: LearningConfig = {
  thresholdMinCorrections: 10,
  weightsMinCorrections: 50,
};
|
|
43
|
+
|
|
44
|
+
/** Derives tuned matching parameters from a set of corrections. */
export class MemoryLearner {
  /**
   * @param config - Minimum correction counts that gate each tuning stage.
   *   Defaults to `DEFAULT_LEARNING_CONFIG`.
   */
  constructor(
    private readonly config: LearningConfig = DEFAULT_LEARNING_CONFIG,
  ) {}

  /**
   * Tune the threshold and, when enough data is available, field weights.
   *
   * Threshold tuning runs once `corrections.length` reaches
   * `thresholdMinCorrections`. Weight tuning additionally requires
   * `subscores` to be supplied and both `corrections.length` and
   * `subscores.length` to reach `weightsMinCorrections`. A stage that does
   * not run, or that returns nothing, leaves its field out of the result.
   *
   * @param corrections - Corrections to learn from.
   * @param baseline - Matchkey whose fields and weights seed weight tuning.
   * @param subscores - Optional per-pair field subscores.
   * @returns Learned parameters; `correctionCount` is always present.
   */
  learn(
    corrections: readonly Correction[],
    baseline: MatchkeyConfig,
    subscores?: readonly CorrectionSubscores[],
  ): LearnedParams {
    const total = corrections.length;
    const { thresholdMinCorrections, weightsMinCorrections } = this.config;

    const learned: {
      threshold?: number;
      fieldWeights?: Record<string, number>;
      correctionCount: number;
    } = { correctionCount: total };

    const threshold =
      total >= thresholdMinCorrections ? tuneThreshold(corrections) : null;
    if (threshold !== null) {
      learned.threshold = threshold;
    }

    // Weight tuning needs subscores and enough of both inputs.
    if (!subscores) return learned;
    if (!(total >= weightsMinCorrections)) return learned;
    if (!(subscores.length >= weightsMinCorrections)) return learned;

    const fieldWeights = tuneWeights(corrections, subscores, baseline);
    if (fieldWeights) {
      learned.fieldWeights = fieldWeights;
    }
    return learned;
  }
}
|
|
87
|
+
|
|
88
|
+
// ---------------------------------------------------------------------------
|
|
89
|
+
// Threshold tuning
|
|
90
|
+
// ---------------------------------------------------------------------------
|
|
91
|
+
|
|
92
|
+
/**
 * Sweep thresholds 0.50, 0.55, …, 0.95 and return the one with the best F1
 * on the correction set, where a correction is predicted "match" when
 * `score >= threshold`. Ties go to the higher threshold (favoring
 * precision).
 *
 * The grid is generated from integer hundredths, not by repeatedly adding
 * 0.05. Repeated addition drifts (0.6000000000000001, 0.8000000000000003,
 * …), so a score sitting exactly on a grid point would be compared against
 * a slightly larger value and misclassified.
 *
 * @param corrections - Corrections carrying a `score` and a `verdict`.
 * @returns The best threshold, or `null` when the set lacks either a
 *   "match" or a "no_match" verdict (F1 would be meaningless).
 */
function tuneThreshold(corrections: readonly Correction[]): number | null {
  const hasMatch = corrections.some((c) => c.verdict === "match");
  const hasNoMatch = corrections.some((c) => c.verdict === "no_match");
  if (!hasMatch || !hasNoMatch) return null;

  let bestThreshold = 0.85;
  let bestF1 = -1;

  for (let hundredths = 50; hundredths <= 95; hundredths += 5) {
    // A single correctly-rounded division yields the exact grid value.
    const t = hundredths / 100;

    let tp = 0;
    let fp = 0;
    let fn = 0;
    for (const c of corrections) {
      const predicted = c.score >= t;
      if (c.verdict === "match") {
        if (predicted) tp++;
        else fn++;
      } else if (predicted) {
        fp++;
      }
    }

    const precision = tp + fp === 0 ? 0 : tp / (tp + fp);
    const recall = tp + fn === 0 ? 0 : tp / (tp + fn);
    const f1 =
      precision + recall === 0
        ? 0
        : (2 * precision * recall) / (precision + recall);

    // Thresholds ascend, so `>=` makes ties resolve to the higher one.
    if (f1 >= bestF1) {
      bestF1 = f1;
      bestThreshold = t;
    }
  }

  return bestThreshold;
}
|
|
130
|
+
|
|
131
|
+
// ---------------------------------------------------------------------------
|
|
132
|
+
// Weight tuning (simple gradient pass)
|
|
133
|
+
// ---------------------------------------------------------------------------
|
|
134
|
+
|
|
135
|
+
/**
 * Logistic function 1 / (1 + e^-x), evaluated so that the exponential is
 * only ever taken of a non-positive argument and therefore cannot overflow.
 */
function sigmoid(x: number): number {
  // For x >= 0 this is exp(-x); for x < 0 it is exp(x).
  const decay = Math.exp(x >= 0 ? -x : x);
  return x >= 0 ? 1 / (1 + decay) : decay / (1 + decay);
}
|
|
143
|
+
|
|
144
|
+
/**
 * Nudge matchkey field weights with a few rounds of batch gradient descent
 * on a logistic model (no bias term) whose features are the per-field
 * subscores and whose label is 1 for "match" and 0 otherwise.
 *
 * Only corrections that have a subscore entry under their canonical pair
 * key are used. Afterwards negative weights are clamped to 0 and all
 * weights are rescaled so their total equals the baseline total (before
 * the final 4-decimal rounding).
 *
 * @param corrections - Labeled corrections.
 * @param subscores - Per-pair field subscores; a later entry for the same
 *   pair key replaces an earlier one.
 * @param baseline - Matchkey supplying the field list and starting weights.
 * @returns Tuned weight per field, or `null` when the baseline has no
 *   fields, fewer than 10 corrections have subscores, or every tuned
 *   weight clamps to 0.
 */
function tuneWeights(
  corrections: readonly Correction[],
  subscores: readonly CorrectionSubscores[],
  baseline: MatchkeyConfig,
): Record<string, number> | null {
  const LEARNING_RATE = 0.1;
  const EPOCHS = 50;
  const MIN_SAMPLES = 10;

  const subscoresByPair = new Map<string, Record<string, number>>();
  subscores.forEach((entry) => {
    subscoresByPair.set(entry.pairKey, { ...entry.subscores });
  });

  const fieldNames = baseline.fields.map((spec) => spec.field);
  if (fieldNames.length === 0) return null;

  // Start from the baseline weights.
  const weightByField = new Map<string, number>();
  baseline.fields.forEach((spec) => {
    weightByField.set(spec.field, spec.weight);
  });

  // Pair each correction with its subscores; drop those without any.
  const training: Array<{ label: number; features: Record<string, number> }> = [];
  for (const correction of corrections) {
    const aFirst = correction.rowIdA < correction.rowIdB;
    const lo = aFirst ? correction.rowIdA : correction.rowIdB;
    const hi = aFirst ? correction.rowIdB : correction.rowIdA;
    const features = subscoresByPair.get(`${lo}|${hi}`);
    if (features) {
      training.push({ label: correction.verdict === "match" ? 1 : 0, features });
    }
  }
  if (training.length < MIN_SAMPLES) return null;

  for (let epoch = 0; epoch < EPOCHS; epoch += 1) {
    const gradient = new Map<string, number>();
    fieldNames.forEach((name) => {
      gradient.set(name, 0);
    });

    training.forEach(({ label, features }) => {
      const logit = fieldNames.reduce(
        (sum, name) => sum + (weightByField.get(name) ?? 0) * (features[name] ?? 0),
        0,
      );
      const residual = sigmoid(logit) - label;
      fieldNames.forEach((name) => {
        gradient.set(name, (gradient.get(name) ?? 0) + residual * (features[name] ?? 0));
      });
    });

    // Apply the mean gradient once per epoch.
    fieldNames.forEach((name) => {
      const meanGradient = (gradient.get(name) ?? 0) / training.length;
      weightByField.set(name, (weightByField.get(name) ?? 0) - LEARNING_RATE * meanGradient);
    });
  }

  // Clamp negatives to 0, then rescale so the total matches the baseline's.
  const clamped = (name: string): number => Math.max(0, weightByField.get(name) ?? 0);
  const baselineTotal = baseline.fields.reduce((sum, spec) => sum + spec.weight, 0);
  const learnedTotal = fieldNames.reduce((sum, name) => sum + clamped(name), 0);
  if (learnedTotal <= 0) return null;
  const rescale = baselineTotal / learnedTotal;

  const result: Record<string, number> = {};
  for (const name of fieldNames) {
    result[name] = Number((clamped(name) * rescale).toFixed(4));
  }
  return result;
}
|
|
@@ -0,0 +1,114 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* memory/store.ts — Learning Memory store (in-memory backend).
|
|
3
|
+
* Edge-safe: no `node:` imports.
|
|
4
|
+
*
|
|
5
|
+
* Ports goldenmatch/core/memory/store.py. SQLite / Postgres backends are
|
|
6
|
+
* deferred (they require host-specific drivers); the in-memory backend
|
|
7
|
+
* keeps all corrections in a plain array with trust-based upsert.
|
|
8
|
+
*/
|
|
9
|
+
|
|
10
|
+
// ---------------------------------------------------------------------------
|
|
11
|
+
// Types
|
|
12
|
+
// ---------------------------------------------------------------------------
|
|
13
|
+
|
|
14
|
+
/** A recorded verdict about whether two rows refer to the same entity. */
export interface Correction {
  /** Id of one row in the pair; the order of A and B carries no meaning for keying. */
  readonly rowIdA: number;
  /** Id of the other row in the pair. */
  readonly rowIdB: number;
  /** The recorded decision for the pair. */
  readonly verdict: "match" | "no_match";
  /** Feature label; together with the pair it forms the upsert key. */
  readonly feature: string;
  /** Score associated with the pair when the correction was recorded. */
  readonly score: number;
  /** Recording time; the larger value wins when trust is equal. */
  readonly timestamp: number;
  /** Priority of this correction; higher trust replaces lower trust. */
  readonly trust: number;
  /** Origin label for the correction (not read by the store itself). */
  readonly source: string;
}
|
|
24
|
+
|
|
25
|
+
/** Construction options for `MemoryStore`. */
export interface MemoryStoreConfig {
  /**
   * Requested backend. Only "memory" is implemented by `MemoryStore`; the
   * other values are accepted without error but still store in memory.
   */
  readonly backend: "memory" | "sqlite" | "postgres";
  /** Storage location for persistent backends (not read by `MemoryStore`). */
  readonly path?: string;
  /** Default trust value (not read by `MemoryStore`). */
  readonly trustDefault?: number;
}
|
|
30
|
+
|
|
31
|
+
// ---------------------------------------------------------------------------
|
|
32
|
+
// Helpers
|
|
33
|
+
// ---------------------------------------------------------------------------
|
|
34
|
+
|
|
35
|
+
/**
 * Key identifying a correction by its unordered row pair and feature,
 * formatted as "smallerId|largerId|feature".
 */
function pairFeatureKey(c: Correction): string {
  const { rowIdA, rowIdB, feature } = c;
  if (rowIdA < rowIdB) {
    return `${rowIdA}|${rowIdB}|${feature}`;
  }
  return `${rowIdB}|${rowIdA}|${feature}`;
}
|
|
39
|
+
|
|
40
|
+
// ---------------------------------------------------------------------------
|
|
41
|
+
// MemoryStore
|
|
42
|
+
// ---------------------------------------------------------------------------
|
|
43
|
+
|
|
44
|
+
/**
 * In-memory collection of corrections, kept in a plain array in insertion
 * order.
 */
export class MemoryStore {
  private corrections: Correction[] = [];

  /**
   * @param config - Store options; defaults to the in-memory backend.
   */
  constructor(private readonly config: MemoryStoreConfig = { backend: "memory" }) {
    if (config.backend !== "memory") {
      // Deliberately a no-op: SQLite/Postgres are not supported in edge-safe
      // code. Callers needing persistence should wrap this class with a
      // host-specific store. Not throwing keeps the class usable in tests.
    }
  }

  /** Append one correction without checking for an existing entry. */
  add(correction: Correction): void {
    this.corrections.push(correction);
  }

  /** Append every correction in order, without checking for existing entries. */
  addBatch(corrections: readonly Correction[]): void {
    for (let i = 0; i < corrections.length; i += 1) {
      this.corrections.push(corrections[i]!);
    }
  }

  /** All corrections in insertion order (the store's own array, not a copy). */
  list(): readonly Correction[] {
    return this.corrections;
  }

  /** New array holding only the corrections with verdict "match". */
  listMatches(): readonly Correction[] {
    return this.corrections.filter((entry) => entry.verdict === "match");
  }

  /** New array holding only the corrections with verdict "no_match". */
  listNonMatches(): readonly Correction[] {
    return this.corrections.filter((entry) => entry.verdict === "no_match");
  }

  /** Number of stored corrections. */
  count(): number {
    return this.corrections.length;
  }

  /** Drop every stored correction. */
  clear(): void {
    this.corrections = [];
  }

  /**
   * Insert a correction, or replace the first stored one that shares its
   * (pair, feature) key. The incoming correction replaces the stored one
   * when it has higher trust, or equal trust and a timestamp that is not
   * older; otherwise the stored correction is kept.
   */
  upsert(correction: Correction): void {
    const key = pairFeatureKey(correction);
    const slot = this.corrections.findIndex((held) => pairFeatureKey(held) === key);
    if (slot === -1) {
      this.corrections.push(correction);
      return;
    }

    const held = this.corrections[slot]!;
    const incomingWins =
      correction.trust > held.trust ||
      (correction.trust === held.trust && correction.timestamp >= held.timestamp);
    if (incomingWins) {
      this.corrections[slot] = correction;
    }
  }

  /** The config object this store was constructed with. */
  getConfig(): MemoryStoreConfig {
    return this.config;
  }
}
|