goldenmatch 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (162) hide show
  1. package/README.md +140 -0
  2. package/dist/cli.cjs +6079 -0
  3. package/dist/cli.cjs.map +1 -0
  4. package/dist/cli.d.cts +1 -0
  5. package/dist/cli.d.ts +1 -0
  6. package/dist/cli.js +6076 -0
  7. package/dist/cli.js.map +1 -0
  8. package/dist/core/index.cjs +8449 -0
  9. package/dist/core/index.cjs.map +1 -0
  10. package/dist/core/index.d.cts +1972 -0
  11. package/dist/core/index.d.ts +1972 -0
  12. package/dist/core/index.js +8318 -0
  13. package/dist/core/index.js.map +1 -0
  14. package/dist/index.cjs +8449 -0
  15. package/dist/index.cjs.map +1 -0
  16. package/dist/index.d.cts +2 -0
  17. package/dist/index.d.ts +2 -0
  18. package/dist/index.js +8318 -0
  19. package/dist/index.js.map +1 -0
  20. package/dist/node/backends/score-worker.cjs +934 -0
  21. package/dist/node/backends/score-worker.cjs.map +1 -0
  22. package/dist/node/backends/score-worker.d.cts +14 -0
  23. package/dist/node/backends/score-worker.d.ts +14 -0
  24. package/dist/node/backends/score-worker.js +932 -0
  25. package/dist/node/backends/score-worker.js.map +1 -0
  26. package/dist/node/index.cjs +11430 -0
  27. package/dist/node/index.cjs.map +1 -0
  28. package/dist/node/index.d.cts +554 -0
  29. package/dist/node/index.d.ts +554 -0
  30. package/dist/node/index.js +11277 -0
  31. package/dist/node/index.js.map +1 -0
  32. package/dist/types-DhUdX5Rc.d.cts +304 -0
  33. package/dist/types-DhUdX5Rc.d.ts +304 -0
  34. package/examples/01-basic-dedupe.ts +60 -0
  35. package/examples/02-match-two-datasets.ts +48 -0
  36. package/examples/03-csv-file-pipeline.ts +62 -0
  37. package/examples/04-string-scoring.ts +63 -0
  38. package/examples/05-custom-config.ts +94 -0
  39. package/examples/06-probabilistic-fs.ts +72 -0
  40. package/examples/07-pprl-privacy.ts +76 -0
  41. package/examples/08-streaming.ts +79 -0
  42. package/examples/09-llm-scorer.ts +79 -0
  43. package/examples/10-explain.ts +60 -0
  44. package/examples/11-evaluate.ts +61 -0
  45. package/examples/README.md +53 -0
  46. package/package.json +66 -0
  47. package/src/cli.ts +372 -0
  48. package/src/core/ann-blocker.ts +593 -0
  49. package/src/core/api.ts +220 -0
  50. package/src/core/autoconfig.ts +363 -0
  51. package/src/core/autofix.ts +102 -0
  52. package/src/core/blocker.ts +655 -0
  53. package/src/core/cluster.ts +699 -0
  54. package/src/core/compare-clusters.ts +176 -0
  55. package/src/core/config/loader.ts +869 -0
  56. package/src/core/cross-encoder.ts +614 -0
  57. package/src/core/data.ts +430 -0
  58. package/src/core/domain.ts +277 -0
  59. package/src/core/embedder.ts +562 -0
  60. package/src/core/evaluate.ts +156 -0
  61. package/src/core/explain.ts +352 -0
  62. package/src/core/golden.ts +524 -0
  63. package/src/core/graph-er.ts +371 -0
  64. package/src/core/index.ts +314 -0
  65. package/src/core/ingest.ts +112 -0
  66. package/src/core/learned-blocking.ts +305 -0
  67. package/src/core/lineage.ts +221 -0
  68. package/src/core/llm/budget.ts +258 -0
  69. package/src/core/llm/cluster.ts +542 -0
  70. package/src/core/llm/scorer.ts +396 -0
  71. package/src/core/match-one.ts +95 -0
  72. package/src/core/matchkey.ts +97 -0
  73. package/src/core/memory/corrections.ts +179 -0
  74. package/src/core/memory/learner.ts +218 -0
  75. package/src/core/memory/store.ts +114 -0
  76. package/src/core/pipeline.ts +366 -0
  77. package/src/core/pprl/protocol.ts +216 -0
  78. package/src/core/probabilistic.ts +511 -0
  79. package/src/core/profiler.ts +212 -0
  80. package/src/core/quality.ts +197 -0
  81. package/src/core/review-queue.ts +177 -0
  82. package/src/core/scorer.ts +855 -0
  83. package/src/core/sensitivity.ts +196 -0
  84. package/src/core/standardize.ts +279 -0
  85. package/src/core/streaming.ts +128 -0
  86. package/src/core/transforms.ts +599 -0
  87. package/src/core/types.ts +570 -0
  88. package/src/core/validate.ts +243 -0
  89. package/src/index.ts +8 -0
  90. package/src/node/a2a/server.ts +470 -0
  91. package/src/node/api/server.ts +412 -0
  92. package/src/node/backends/duckdb.ts +130 -0
  93. package/src/node/backends/score-worker.ts +41 -0
  94. package/src/node/backends/workers.ts +212 -0
  95. package/src/node/config-file.ts +66 -0
  96. package/src/node/connectors/base.ts +57 -0
  97. package/src/node/connectors/bigquery.ts +61 -0
  98. package/src/node/connectors/databricks.ts +69 -0
  99. package/src/node/connectors/file.ts +350 -0
  100. package/src/node/connectors/hubspot.ts +62 -0
  101. package/src/node/connectors/index.ts +43 -0
  102. package/src/node/connectors/salesforce.ts +93 -0
  103. package/src/node/connectors/snowflake.ts +73 -0
  104. package/src/node/db/postgres.ts +173 -0
  105. package/src/node/db/sync.ts +103 -0
  106. package/src/node/dedupe-file.ts +156 -0
  107. package/src/node/index.ts +89 -0
  108. package/src/node/mcp/server.ts +940 -0
  109. package/src/node/tui/app.ts +756 -0
  110. package/src/node/tui/index.ts +6 -0
  111. package/src/node/tui/widgets.ts +128 -0
  112. package/tests/parity/scorer-ground-truth.test.ts +118 -0
  113. package/tests/smoke.test.ts +46 -0
  114. package/tests/unit/a2a-server.test.ts +175 -0
  115. package/tests/unit/ann-blocker.test.ts +117 -0
  116. package/tests/unit/api-server.test.ts +239 -0
  117. package/tests/unit/api.test.ts +77 -0
  118. package/tests/unit/autoconfig.test.ts +103 -0
  119. package/tests/unit/autofix.test.ts +71 -0
  120. package/tests/unit/blocker.test.ts +164 -0
  121. package/tests/unit/buildBlocksAsync.test.ts +63 -0
  122. package/tests/unit/cluster.test.ts +213 -0
  123. package/tests/unit/compare-clusters.test.ts +42 -0
  124. package/tests/unit/config-loader.test.ts +301 -0
  125. package/tests/unit/connectors-base.test.ts +48 -0
  126. package/tests/unit/cross-encoder-model.test.ts +198 -0
  127. package/tests/unit/cross-encoder.test.ts +173 -0
  128. package/tests/unit/db-connectors.test.ts +37 -0
  129. package/tests/unit/domain.test.ts +80 -0
  130. package/tests/unit/embedder.test.ts +151 -0
  131. package/tests/unit/evaluate.test.ts +85 -0
  132. package/tests/unit/explain.test.ts +73 -0
  133. package/tests/unit/golden.test.ts +97 -0
  134. package/tests/unit/graph-er.test.ts +173 -0
  135. package/tests/unit/hnsw-ann.test.ts +283 -0
  136. package/tests/unit/hubspot-connector.test.ts +118 -0
  137. package/tests/unit/ingest.test.ts +97 -0
  138. package/tests/unit/learned-blocking.test.ts +134 -0
  139. package/tests/unit/lineage.test.ts +135 -0
  140. package/tests/unit/match-one.test.ts +129 -0
  141. package/tests/unit/matchkey.test.ts +97 -0
  142. package/tests/unit/mcp-server.test.ts +183 -0
  143. package/tests/unit/memory.test.ts +119 -0
  144. package/tests/unit/pipeline.test.ts +118 -0
  145. package/tests/unit/pprl-protocol.test.ts +381 -0
  146. package/tests/unit/probabilistic.test.ts +494 -0
  147. package/tests/unit/profiler.test.ts +68 -0
  148. package/tests/unit/review-queue.test.ts +68 -0
  149. package/tests/unit/salesforce-connector.test.ts +148 -0
  150. package/tests/unit/scorer.test.ts +301 -0
  151. package/tests/unit/sensitivity.test.ts +154 -0
  152. package/tests/unit/standardize.test.ts +84 -0
  153. package/tests/unit/streaming.test.ts +82 -0
  154. package/tests/unit/transforms.test.ts +208 -0
  155. package/tests/unit/tui-widgets.test.ts +42 -0
  156. package/tests/unit/tui.test.ts +24 -0
  157. package/tests/unit/validate.test.ts +145 -0
  158. package/tests/unit/workers-parallel.test.ts +99 -0
  159. package/tests/unit/workers.test.ts +74 -0
  160. package/tsconfig.json +25 -0
  161. package/tsup.config.ts +37 -0
  162. package/vitest.config.ts +11 -0
@@ -0,0 +1,176 @@
1
+ /**
2
+ * compare-clusters.ts — CCMS (Case Count Metric System) cluster comparison.
3
+ * Edge-safe: no Node.js imports, pure TypeScript only.
4
+ *
5
+ * Ports goldenmatch/core/compare_clusters.py.
6
+ * Reference: Talburt et al., Case Count Metric System, arXiv:2601.02824v1.
7
+ */
8
+
9
+ import type { ClusterInfo } from "./types.js";
10
+
11
+ // ---------------------------------------------------------------------------
12
+ // Types
13
+ // ---------------------------------------------------------------------------
14
+
15
+ export type ClusterCase = "unchanged" | "merged" | "partitioned" | "overlapping"; // how one cluster of clustering A relates to clustering B
16
+
17
+ export interface CCMSResult { // summary returned by compareClusters
18
+ readonly unchanged: number; // number of A clusters classified "unchanged"
19
+ readonly merged: number; // number of A clusters classified "merged"
20
+ readonly partitioned: number; // number of A clusters classified "partitioned"
21
+ readonly overlapping: number; // number of A clusters classified "overlapping"
22
+ readonly twi: number; // Talburt-Wang Index: sqrt(cc1 * cc2) / count of non-empty A-to-B cluster intersections
23
+ readonly clusterClassifications: Readonly<Record<number, ClusterCase>>; // A cluster ID -> its classification
24
+ readonly cc1: number; // number of clusters in A
25
+ readonly cc2: number; // number of clusters in B
26
+ readonly rc: number; // number of distinct row IDs covered by A
27
+ }
28
+
29
+ // ---------------------------------------------------------------------------
30
+ // Helpers
31
+ // ---------------------------------------------------------------------------
32
+
/**
 * Index a clustering by membership.
 *
 * @param clusters Cluster ID -> cluster info; only `members` is read.
 * @returns `sets`: cluster ID -> de-duplicated set of its member row IDs
 *          (same iteration order as `clusters`); `ids`: the union of all
 *          member row IDs across every cluster.
 */
function buildMemberSets(
  clusters: ReadonlyMap<number, ClusterInfo>,
): { sets: Map<number, Set<number>>; ids: Set<number> } {
  const memberSetsById = new Map<number, Set<number>>();
  const allRowIds = new Set<number>();

  clusters.forEach((cluster, clusterId) => {
    // Going through a Set collapses any repeated row IDs within a cluster.
    const rows = new Set<number>(cluster.members);
    memberSetsById.set(clusterId, rows);
    rows.forEach((rowId) => {
      allRowIds.add(rowId);
    });
  });

  return { sets: memberSetsById, ids: allRowIds };
}
45
+
/**
 * Report whether two sets contain exactly the same numbers.
 *
 * @returns `true` when the sizes match and every element of `a` is in `b`.
 */
function setsEqual(a: ReadonlySet<number>, b: ReadonlySet<number>): boolean {
  let same = a.size === b.size;
  if (same) {
    // Equal sizes, so checking one direction of containment is enough.
    for (const value of a) {
      if (!b.has(value)) {
        same = false;
        break;
      }
    }
  }
  return same;
}
51
+
/**
 * Report whether every element of `sub` is also an element of `sup`.
 *
 * @returns `true` for a (not necessarily proper) subset, including the
 *          empty set; `false` otherwise.
 */
function isSubsetOf(
  sub: ReadonlySet<number>,
  sup: ReadonlySet<number>,
): boolean {
  // A set with more elements than `sup` can never fit inside it.
  if (sub.size <= sup.size) {
    return Array.from(sub).every((element) => sup.has(element));
  }
  return false;
}
60
+
61
+ // ---------------------------------------------------------------------------
62
+ // compareClusters
63
+ // ---------------------------------------------------------------------------
64
+
/**
 * Compare two clustering outcomes via the CCMS framework.
 *
 * Each cluster in `clustersA` is classified relative to `clustersB`:
 * - "unchanged":   all of its rows land in a single B cluster whose member
 *                  set is identical.
 * - "merged":      all of its rows land in a single B cluster whose member
 *                  set differs (the B cluster also holds other rows).
 * - "partitioned": its rows do not land in exactly one B cluster, and every
 *                  B cluster they land in is a subset of the A cluster.
 * - "overlapping": its rows land in several B clusters, at least one of
 *                  which also holds rows from outside the A cluster.
 *
 * Also computes the Talburt-Wang Index
 *   TWI = sqrt(CC1 * CC2) / V
 * where CC1/CC2 are the cluster counts of A and B and V is the number of
 * non-empty A-to-B cluster intersections. When V is 0, TWI is 1 if both
 * inputs contain no clusters and 0 otherwise.
 *
 * NOTE(review): a cluster with no members intersects zero B clusters and is
 * therefore reported as "partitioned" — confirm this matches the reference
 * implementation's intent.
 * NOTE(review): no check is made that a row ID belongs to only one cluster
 * per input; if a row appears in several B clusters, the last one iterated
 * wins in the reverse lookup — presumably callers always pass partitions.
 *
 * @param clustersA Baseline clustering (cluster ID -> cluster info).
 * @param clustersB Clustering to compare against (cluster ID -> cluster info).
 * @returns Per-case counts, the TWI, the per-A-cluster classification, the
 *          two cluster counts, and the number of distinct row IDs.
 * @throws Error if the two inputs do not cover the same set of row IDs.
 */
export function compareClusters(
  clustersA: ReadonlyMap<number, ClusterInfo>,
  clustersB: ReadonlyMap<number, ClusterInfo>,
): CCMSResult {
  const { sets: setsA, ids: idsA } = buildMemberSets(clustersA);
  const { sets: setsB, ids: idsB } = buildMemberSets(clustersB);

  // Equal sizes plus "every A id is in B" together imply identical ID sets.
  if (idsA.size !== idsB.size) {
    throw new Error(
      `Cluster dicts cover different row IDs: ${idsA.size} vs ${idsB.size}`,
    );
  }
  for (const id of idsA) {
    if (!idsB.has(id)) {
      throw new Error(
        `Cluster dicts cover different row IDs (id ${id} only in A).`,
      );
    }
  }

  // Reverse lookup: row ID -> member set of the B cluster holding that row.
  // Each B cluster owns a distinct Set object, so object identity stands in
  // for the B cluster ID and no second lookup into `setsB` is needed.
  const rowToB = new Map<number, ReadonlySet<number>>();
  for (const membersB of setsB.values()) {
    for (const row of membersB) rowToB.set(row, membersB);
  }

  const classifications: Record<number, ClusterCase> = {};
  const counts: Record<ClusterCase, number> = {
    unchanged: 0,
    merged: 0,
    partitioned: 0,
    overlapping: 0,
  };
  let nonEmptyIntersections = 0;

  for (const [cidA, membersA] of setsA) {
    // Distinct B clusters that receive at least one row of this A cluster.
    const hitB = new Set<ReadonlySet<number>>();
    for (const row of membersA) {
      const membersB = rowToB.get(row);
      // Cannot be undefined after the coverage check above; the guard only
      // narrows the type.
      if (membersB !== undefined) hitB.add(membersB);
    }

    nonEmptyIntersections += hitB.size;

    let caseKind: ClusterCase;
    if (hitB.size === 1) {
      // Every row landed in one B cluster: identical membership means
      // nothing changed, otherwise A was absorbed into a different cluster.
      const soleB: ReadonlySet<number> | undefined = hitB.values().next().value;
      caseKind =
        soleB !== undefined && setsEqual(soleB, membersA)
          ? "unchanged"
          : "merged";
    } else {
      // Zero or several B clusters: a clean split requires each of them to
      // lie entirely inside the A cluster.
      let allSubsets = true;
      for (const membersB of hitB) {
        if (!isSubsetOf(membersB, membersA)) {
          allSubsets = false;
          break;
        }
      }
      caseKind = allSubsets ? "partitioned" : "overlapping";
    }

    counts[caseKind]++;
    classifications[cidA] = caseKind;
  }

  const cc1 = setsA.size;
  const cc2 = setsB.size;
  const rc = idsA.size;

  let twi: number;
  if (nonEmptyIntersections > 0) {
    twi = Math.sqrt(cc1 * cc2) / nonEmptyIntersections;
  } else {
    // No intersections at all: two empty clusterings agree perfectly.
    twi = cc1 === 0 && cc2 === 0 ? 1.0 : 0.0;
  }

  return {
    unchanged: counts.unchanged,
    merged: counts.merged,
    partitioned: counts.partitioned,
    overlapping: counts.overlapping,
    twi,
    clusterClassifications: classifications,
    cc1,
    cc2,
    rc,
  };
}