goldenmatch 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +140 -0
- package/dist/cli.cjs +6079 -0
- package/dist/cli.cjs.map +1 -0
- package/dist/cli.d.cts +1 -0
- package/dist/cli.d.ts +1 -0
- package/dist/cli.js +6076 -0
- package/dist/cli.js.map +1 -0
- package/dist/core/index.cjs +8449 -0
- package/dist/core/index.cjs.map +1 -0
- package/dist/core/index.d.cts +1972 -0
- package/dist/core/index.d.ts +1972 -0
- package/dist/core/index.js +8318 -0
- package/dist/core/index.js.map +1 -0
- package/dist/index.cjs +8449 -0
- package/dist/index.cjs.map +1 -0
- package/dist/index.d.cts +2 -0
- package/dist/index.d.ts +2 -0
- package/dist/index.js +8318 -0
- package/dist/index.js.map +1 -0
- package/dist/node/backends/score-worker.cjs +934 -0
- package/dist/node/backends/score-worker.cjs.map +1 -0
- package/dist/node/backends/score-worker.d.cts +14 -0
- package/dist/node/backends/score-worker.d.ts +14 -0
- package/dist/node/backends/score-worker.js +932 -0
- package/dist/node/backends/score-worker.js.map +1 -0
- package/dist/node/index.cjs +11430 -0
- package/dist/node/index.cjs.map +1 -0
- package/dist/node/index.d.cts +554 -0
- package/dist/node/index.d.ts +554 -0
- package/dist/node/index.js +11277 -0
- package/dist/node/index.js.map +1 -0
- package/dist/types-DhUdX5Rc.d.cts +304 -0
- package/dist/types-DhUdX5Rc.d.ts +304 -0
- package/examples/01-basic-dedupe.ts +60 -0
- package/examples/02-match-two-datasets.ts +48 -0
- package/examples/03-csv-file-pipeline.ts +62 -0
- package/examples/04-string-scoring.ts +63 -0
- package/examples/05-custom-config.ts +94 -0
- package/examples/06-probabilistic-fs.ts +72 -0
- package/examples/07-pprl-privacy.ts +76 -0
- package/examples/08-streaming.ts +79 -0
- package/examples/09-llm-scorer.ts +79 -0
- package/examples/10-explain.ts +60 -0
- package/examples/11-evaluate.ts +61 -0
- package/examples/README.md +53 -0
- package/package.json +66 -0
- package/src/cli.ts +372 -0
- package/src/core/ann-blocker.ts +593 -0
- package/src/core/api.ts +220 -0
- package/src/core/autoconfig.ts +363 -0
- package/src/core/autofix.ts +102 -0
- package/src/core/blocker.ts +655 -0
- package/src/core/cluster.ts +699 -0
- package/src/core/compare-clusters.ts +176 -0
- package/src/core/config/loader.ts +869 -0
- package/src/core/cross-encoder.ts +614 -0
- package/src/core/data.ts +430 -0
- package/src/core/domain.ts +277 -0
- package/src/core/embedder.ts +562 -0
- package/src/core/evaluate.ts +156 -0
- package/src/core/explain.ts +352 -0
- package/src/core/golden.ts +524 -0
- package/src/core/graph-er.ts +371 -0
- package/src/core/index.ts +314 -0
- package/src/core/ingest.ts +112 -0
- package/src/core/learned-blocking.ts +305 -0
- package/src/core/lineage.ts +221 -0
- package/src/core/llm/budget.ts +258 -0
- package/src/core/llm/cluster.ts +542 -0
- package/src/core/llm/scorer.ts +396 -0
- package/src/core/match-one.ts +95 -0
- package/src/core/matchkey.ts +97 -0
- package/src/core/memory/corrections.ts +179 -0
- package/src/core/memory/learner.ts +218 -0
- package/src/core/memory/store.ts +114 -0
- package/src/core/pipeline.ts +366 -0
- package/src/core/pprl/protocol.ts +216 -0
- package/src/core/probabilistic.ts +511 -0
- package/src/core/profiler.ts +212 -0
- package/src/core/quality.ts +197 -0
- package/src/core/review-queue.ts +177 -0
- package/src/core/scorer.ts +855 -0
- package/src/core/sensitivity.ts +196 -0
- package/src/core/standardize.ts +279 -0
- package/src/core/streaming.ts +128 -0
- package/src/core/transforms.ts +599 -0
- package/src/core/types.ts +570 -0
- package/src/core/validate.ts +243 -0
- package/src/index.ts +8 -0
- package/src/node/a2a/server.ts +470 -0
- package/src/node/api/server.ts +412 -0
- package/src/node/backends/duckdb.ts +130 -0
- package/src/node/backends/score-worker.ts +41 -0
- package/src/node/backends/workers.ts +212 -0
- package/src/node/config-file.ts +66 -0
- package/src/node/connectors/base.ts +57 -0
- package/src/node/connectors/bigquery.ts +61 -0
- package/src/node/connectors/databricks.ts +69 -0
- package/src/node/connectors/file.ts +350 -0
- package/src/node/connectors/hubspot.ts +62 -0
- package/src/node/connectors/index.ts +43 -0
- package/src/node/connectors/salesforce.ts +93 -0
- package/src/node/connectors/snowflake.ts +73 -0
- package/src/node/db/postgres.ts +173 -0
- package/src/node/db/sync.ts +103 -0
- package/src/node/dedupe-file.ts +156 -0
- package/src/node/index.ts +89 -0
- package/src/node/mcp/server.ts +940 -0
- package/src/node/tui/app.ts +756 -0
- package/src/node/tui/index.ts +6 -0
- package/src/node/tui/widgets.ts +128 -0
- package/tests/parity/scorer-ground-truth.test.ts +118 -0
- package/tests/smoke.test.ts +46 -0
- package/tests/unit/a2a-server.test.ts +175 -0
- package/tests/unit/ann-blocker.test.ts +117 -0
- package/tests/unit/api-server.test.ts +239 -0
- package/tests/unit/api.test.ts +77 -0
- package/tests/unit/autoconfig.test.ts +103 -0
- package/tests/unit/autofix.test.ts +71 -0
- package/tests/unit/blocker.test.ts +164 -0
- package/tests/unit/buildBlocksAsync.test.ts +63 -0
- package/tests/unit/cluster.test.ts +213 -0
- package/tests/unit/compare-clusters.test.ts +42 -0
- package/tests/unit/config-loader.test.ts +301 -0
- package/tests/unit/connectors-base.test.ts +48 -0
- package/tests/unit/cross-encoder-model.test.ts +198 -0
- package/tests/unit/cross-encoder.test.ts +173 -0
- package/tests/unit/db-connectors.test.ts +37 -0
- package/tests/unit/domain.test.ts +80 -0
- package/tests/unit/embedder.test.ts +151 -0
- package/tests/unit/evaluate.test.ts +85 -0
- package/tests/unit/explain.test.ts +73 -0
- package/tests/unit/golden.test.ts +97 -0
- package/tests/unit/graph-er.test.ts +173 -0
- package/tests/unit/hnsw-ann.test.ts +283 -0
- package/tests/unit/hubspot-connector.test.ts +118 -0
- package/tests/unit/ingest.test.ts +97 -0
- package/tests/unit/learned-blocking.test.ts +134 -0
- package/tests/unit/lineage.test.ts +135 -0
- package/tests/unit/match-one.test.ts +129 -0
- package/tests/unit/matchkey.test.ts +97 -0
- package/tests/unit/mcp-server.test.ts +183 -0
- package/tests/unit/memory.test.ts +119 -0
- package/tests/unit/pipeline.test.ts +118 -0
- package/tests/unit/pprl-protocol.test.ts +381 -0
- package/tests/unit/probabilistic.test.ts +494 -0
- package/tests/unit/profiler.test.ts +68 -0
- package/tests/unit/review-queue.test.ts +68 -0
- package/tests/unit/salesforce-connector.test.ts +148 -0
- package/tests/unit/scorer.test.ts +301 -0
- package/tests/unit/sensitivity.test.ts +154 -0
- package/tests/unit/standardize.test.ts +84 -0
- package/tests/unit/streaming.test.ts +82 -0
- package/tests/unit/transforms.test.ts +208 -0
- package/tests/unit/tui-widgets.test.ts +42 -0
- package/tests/unit/tui.test.ts +24 -0
- package/tests/unit/validate.test.ts +145 -0
- package/tests/unit/workers-parallel.test.ts +99 -0
- package/tests/unit/workers.test.ts +74 -0
- package/tsconfig.json +25 -0
- package/tsup.config.ts +37 -0
- package/vitest.config.ts +11 -0
|
@@ -0,0 +1,176 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* compare-clusters.ts — CCMS (Case Count Metric System) cluster comparison.
|
|
3
|
+
* Edge-safe: no Node.js imports, pure TypeScript only.
|
|
4
|
+
*
|
|
5
|
+
* Ports goldenmatch/core/compare_clusters.py.
|
|
6
|
+
* Reference: Talburt et al., Case Count Metric System, arXiv:2601.02824v1.
|
|
7
|
+
*/
|
|
8
|
+
|
|
9
|
+
import type { ClusterInfo } from "./types.js";
|
|
10
|
+
|
|
11
|
+
// ---------------------------------------------------------------------------
|
|
12
|
+
// Types
|
|
13
|
+
// ---------------------------------------------------------------------------
|
|
14
|
+
|
|
15
|
+
/**
 * How a single cluster of clustering A relates to clustering B, as assigned
 * by `compareClusters`:
 * - "unchanged":   all of the cluster's rows land in exactly one B cluster
 *                  whose member set is identical.
 * - "merged":      all of the cluster's rows land in exactly one B cluster
 *                  whose member set is different.
 * - "partitioned": the rows do not land in exactly one B cluster, and every
 *                  B cluster they land in is a subset of the A cluster.
 * - "overlapping": the rows do not land in exactly one B cluster, and at
 *                  least one of those B clusters is not a subset of the A
 *                  cluster.
 */
export type ClusterCase = "unchanged" | "merged" | "partitioned" | "overlapping";
|
|
16
|
+
|
|
17
|
+
/**
 * Result of comparing clustering A against clustering B with
 * `compareClusters`.
 */
export interface CCMSResult {
  /** Number of clusters in clustering A. */
  readonly cc1: number;
  /** Number of clusters in clustering B. */
  readonly cc2: number;
  /** Number of distinct row IDs covered by clustering A. */
  readonly rc: number;

  /** Count of A clusters classified as "unchanged". */
  readonly unchanged: number;
  /** Count of A clusters classified as "merged". */
  readonly merged: number;
  /** Count of A clusters classified as "partitioned". */
  readonly partitioned: number;
  /** Count of A clusters classified as "overlapping". */
  readonly overlapping: number;

  /** Talburt-Wang Index: sqrt(cc1 * cc2) / (non-empty A-to-B intersections). */
  readonly twi: number;
  /** Classification of every A cluster, keyed by its cluster id. */
  readonly clusterClassifications: Readonly<Record<number, ClusterCase>>;
}
|
|
28
|
+
|
|
29
|
+
// ---------------------------------------------------------------------------
|
|
30
|
+
// Helpers
|
|
31
|
+
// ---------------------------------------------------------------------------
|
|
32
|
+
|
|
33
|
+
/**
 * Turns every cluster's `members` into a Set and gathers the union of all
 * member row IDs.
 *
 * @param clusters - Clustering keyed by cluster id.
 * @returns `sets`: cluster id -> Set of that cluster's members (a fresh Set
 *   per cluster); `ids`: every row ID that appears in any cluster.
 */
function buildMemberSets(
  clusters: ReadonlyMap<number, ClusterInfo>,
): { sets: Map<number, Set<number>>; ids: Set<number> } {
  const membersByCluster = new Map<number, Set<number>>();
  const allRowIds = new Set<number>();

  clusters.forEach((info, clusterId) => {
    // The Set collapses duplicate row IDs listed within one cluster.
    const rows = new Set<number>(info.members);
    membersByCluster.set(clusterId, rows);
    rows.forEach((rowId) => {
      allRowIds.add(rowId);
    });
  });

  return { sets: membersByCluster, ids: allRowIds };
}
|
|
45
|
+
|
|
46
|
+
/**
 * Reports whether two sets hold exactly the same values.
 *
 * @param a - First set.
 * @param b - Second set.
 * @returns `true` when both sets have the same size and every value of `a`
 *   is present in `b`.
 */
function setsEqual(a: ReadonlySet<number>, b: ReadonlySet<number>): boolean {
  // With equal sizes, containment of `a` in `b` already implies equality.
  return a.size === b.size && [...a].every((value) => b.has(value));
}
|
|
51
|
+
|
|
52
|
+
/**
 * Reports whether every value of `sub` is also contained in `sup`.
 *
 * @param sub - Candidate subset.
 * @param sup - Candidate superset.
 * @returns `true` when `sub` is a (not necessarily proper) subset of `sup`.
 */
function isSubsetOf(
  sub: ReadonlySet<number>,
  sup: ReadonlySet<number>,
): boolean {
  // A set with more elements than `sup` cannot be contained in it.
  if (sup.size < sub.size) return false;

  const cursor = sub.values();
  for (let step = cursor.next(); !step.done; step = cursor.next()) {
    if (!sup.has(step.value)) return false;
  }
  return true;
}
|
|
60
|
+
|
|
61
|
+
// ---------------------------------------------------------------------------
|
|
62
|
+
// compareClusters
|
|
63
|
+
// ---------------------------------------------------------------------------
|
|
64
|
+
|
|
65
|
+
/**
 * Compare two clustering outcomes via the CCMS framework.
 *
 * Every cluster of A is classified by looking at the B clusters its rows
 * fall into:
 *   - "unchanged":   exactly one B cluster, with the identical member set
 *   - "merged":      exactly one B cluster, with a different member set
 *   - "partitioned": any other number of B clusters, each of which is a
 *                    subset of the A cluster
 *   - "overlapping": any other number of B clusters, at least one of which
 *                    is not a subset of the A cluster
 *
 * An A cluster without members touches zero B clusters and is therefore
 * counted as "partitioned".
 *
 * Also computes the Talburt-Wang Index:
 *   TWI = sqrt(CC1 * CC2) / V
 * where CC1/CC2 are the cluster counts of A/B and V is the number of
 * non-empty A-to-B cluster intersections. When V is 0, the index is 1.0 if
 * both clusterings have no clusters and 0.0 otherwise.
 *
 * @param clustersA - Clustering A, keyed by cluster id.
 * @param clustersB - Clustering B, keyed by cluster id.
 * @returns Per-case counts, the TWI, the classification of each A cluster,
 *   both cluster counts, and the number of distinct row IDs.
 * @throws Error if the two clusterings do not cover the same row IDs.
 */
export function compareClusters(
  clustersA: ReadonlyMap<number, ClusterInfo>,
  clustersB: ReadonlyMap<number, ClusterInfo>,
): CCMSResult {
  const { sets: setsA, ids: idsA } = buildMemberSets(clustersA);
  const { sets: setsB, ids: idsB } = buildMemberSets(clustersB);

  // Equal sizes plus "every A id is also in B" means the id sets are equal.
  if (idsA.size !== idsB.size) {
    throw new Error(
      `Cluster dicts cover different row IDs: ${idsA.size} vs ${idsB.size}`,
    );
  }
  for (const id of idsA) {
    if (!idsB.has(id)) {
      throw new Error(
        `Cluster dicts cover different row IDs (id ${id} only in A).`,
      );
    }
  }

  // Reverse lookup: row id -> member set of the B cluster holding that row.
  // Storing the member set itself (instead of the B cluster id) means the
  // classification below needs no second lookup and no non-null assertion.
  // If a row is listed in several B clusters, the last one iterated wins.
  const rowToBMembers = new Map<number, ReadonlySet<number>>();
  for (const membersB of setsB.values()) {
    for (const row of membersB) rowToBMembers.set(row, membersB);
  }

  const classifications: Record<number, ClusterCase> = {};
  const caseCounts: Record<ClusterCase, number> = {
    unchanged: 0,
    merged: 0,
    partitioned: 0,
    overlapping: 0,
  };
  let nonEmptyIntersections = 0;

  for (const [cidA, membersA] of setsA) {
    // Distinct B clusters this A cluster intersects. buildMemberSets creates
    // a separate Set object per cluster, so object identity tells B clusters
    // apart. Only which clusters are hit matters, not which rows hit them.
    const touched = new Set<ReadonlySet<number>>();
    for (const row of membersA) {
      const membersB = rowToBMembers.get(row);
      if (membersB !== undefined) touched.add(membersB);
    }

    nonEmptyIntersections += touched.size;

    const touchedB = [...touched];
    let caseKind: ClusterCase;
    if (touchedB.length === 1) {
      // Single B cluster: identical member sets mean nothing changed.
      caseKind = touchedB.every((b) => setsEqual(b, membersA))
        ? "unchanged"
        : "merged";
    } else {
      // Zero or several B clusters: partitioned only if none of them reaches
      // outside this A cluster (vacuously true when no B cluster is touched).
      caseKind = touchedB.every((b) => isSubsetOf(b, membersA))
        ? "partitioned"
        : "overlapping";
    }

    caseCounts[caseKind]++;
    classifications[cidA] = caseKind;
  }

  const cc1 = setsA.size;
  const cc2 = setsB.size;
  const rc = idsA.size;

  let twi: number;
  if (nonEmptyIntersections > 0) {
    twi = Math.sqrt(cc1 * cc2) / nonEmptyIntersections;
  } else {
    // No intersections at all: two empty clusterings agree perfectly.
    twi = cc1 === 0 && cc2 === 0 ? 1.0 : 0.0;
  }

  return {
    unchanged: caseCounts.unchanged,
    merged: caseCounts.merged,
    partitioned: caseCounts.partitioned,
    overlapping: caseCounts.overlapping,
    twi,
    clusterClassifications: classifications,
    cc1,
    cc2,
    rc,
  };
}
|