goldenmatch 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (162)
  1. package/README.md +140 -0
  2. package/dist/cli.cjs +6079 -0
  3. package/dist/cli.cjs.map +1 -0
  4. package/dist/cli.d.cts +1 -0
  5. package/dist/cli.d.ts +1 -0
  6. package/dist/cli.js +6076 -0
  7. package/dist/cli.js.map +1 -0
  8. package/dist/core/index.cjs +8449 -0
  9. package/dist/core/index.cjs.map +1 -0
  10. package/dist/core/index.d.cts +1972 -0
  11. package/dist/core/index.d.ts +1972 -0
  12. package/dist/core/index.js +8318 -0
  13. package/dist/core/index.js.map +1 -0
  14. package/dist/index.cjs +8449 -0
  15. package/dist/index.cjs.map +1 -0
  16. package/dist/index.d.cts +2 -0
  17. package/dist/index.d.ts +2 -0
  18. package/dist/index.js +8318 -0
  19. package/dist/index.js.map +1 -0
  20. package/dist/node/backends/score-worker.cjs +934 -0
  21. package/dist/node/backends/score-worker.cjs.map +1 -0
  22. package/dist/node/backends/score-worker.d.cts +14 -0
  23. package/dist/node/backends/score-worker.d.ts +14 -0
  24. package/dist/node/backends/score-worker.js +932 -0
  25. package/dist/node/backends/score-worker.js.map +1 -0
  26. package/dist/node/index.cjs +11430 -0
  27. package/dist/node/index.cjs.map +1 -0
  28. package/dist/node/index.d.cts +554 -0
  29. package/dist/node/index.d.ts +554 -0
  30. package/dist/node/index.js +11277 -0
  31. package/dist/node/index.js.map +1 -0
  32. package/dist/types-DhUdX5Rc.d.cts +304 -0
  33. package/dist/types-DhUdX5Rc.d.ts +304 -0
  34. package/examples/01-basic-dedupe.ts +60 -0
  35. package/examples/02-match-two-datasets.ts +48 -0
  36. package/examples/03-csv-file-pipeline.ts +62 -0
  37. package/examples/04-string-scoring.ts +63 -0
  38. package/examples/05-custom-config.ts +94 -0
  39. package/examples/06-probabilistic-fs.ts +72 -0
  40. package/examples/07-pprl-privacy.ts +76 -0
  41. package/examples/08-streaming.ts +79 -0
  42. package/examples/09-llm-scorer.ts +79 -0
  43. package/examples/10-explain.ts +60 -0
  44. package/examples/11-evaluate.ts +61 -0
  45. package/examples/README.md +53 -0
  46. package/package.json +66 -0
  47. package/src/cli.ts +372 -0
  48. package/src/core/ann-blocker.ts +593 -0
  49. package/src/core/api.ts +220 -0
  50. package/src/core/autoconfig.ts +363 -0
  51. package/src/core/autofix.ts +102 -0
  52. package/src/core/blocker.ts +655 -0
  53. package/src/core/cluster.ts +699 -0
  54. package/src/core/compare-clusters.ts +176 -0
  55. package/src/core/config/loader.ts +869 -0
  56. package/src/core/cross-encoder.ts +614 -0
  57. package/src/core/data.ts +430 -0
  58. package/src/core/domain.ts +277 -0
  59. package/src/core/embedder.ts +562 -0
  60. package/src/core/evaluate.ts +156 -0
  61. package/src/core/explain.ts +352 -0
  62. package/src/core/golden.ts +524 -0
  63. package/src/core/graph-er.ts +371 -0
  64. package/src/core/index.ts +314 -0
  65. package/src/core/ingest.ts +112 -0
  66. package/src/core/learned-blocking.ts +305 -0
  67. package/src/core/lineage.ts +221 -0
  68. package/src/core/llm/budget.ts +258 -0
  69. package/src/core/llm/cluster.ts +542 -0
  70. package/src/core/llm/scorer.ts +396 -0
  71. package/src/core/match-one.ts +95 -0
  72. package/src/core/matchkey.ts +97 -0
  73. package/src/core/memory/corrections.ts +179 -0
  74. package/src/core/memory/learner.ts +218 -0
  75. package/src/core/memory/store.ts +114 -0
  76. package/src/core/pipeline.ts +366 -0
  77. package/src/core/pprl/protocol.ts +216 -0
  78. package/src/core/probabilistic.ts +511 -0
  79. package/src/core/profiler.ts +212 -0
  80. package/src/core/quality.ts +197 -0
  81. package/src/core/review-queue.ts +177 -0
  82. package/src/core/scorer.ts +855 -0
  83. package/src/core/sensitivity.ts +196 -0
  84. package/src/core/standardize.ts +279 -0
  85. package/src/core/streaming.ts +128 -0
  86. package/src/core/transforms.ts +599 -0
  87. package/src/core/types.ts +570 -0
  88. package/src/core/validate.ts +243 -0
  89. package/src/index.ts +8 -0
  90. package/src/node/a2a/server.ts +470 -0
  91. package/src/node/api/server.ts +412 -0
  92. package/src/node/backends/duckdb.ts +130 -0
  93. package/src/node/backends/score-worker.ts +41 -0
  94. package/src/node/backends/workers.ts +212 -0
  95. package/src/node/config-file.ts +66 -0
  96. package/src/node/connectors/base.ts +57 -0
  97. package/src/node/connectors/bigquery.ts +61 -0
  98. package/src/node/connectors/databricks.ts +69 -0
  99. package/src/node/connectors/file.ts +350 -0
  100. package/src/node/connectors/hubspot.ts +62 -0
  101. package/src/node/connectors/index.ts +43 -0
  102. package/src/node/connectors/salesforce.ts +93 -0
  103. package/src/node/connectors/snowflake.ts +73 -0
  104. package/src/node/db/postgres.ts +173 -0
  105. package/src/node/db/sync.ts +103 -0
  106. package/src/node/dedupe-file.ts +156 -0
  107. package/src/node/index.ts +89 -0
  108. package/src/node/mcp/server.ts +940 -0
  109. package/src/node/tui/app.ts +756 -0
  110. package/src/node/tui/index.ts +6 -0
  111. package/src/node/tui/widgets.ts +128 -0
  112. package/tests/parity/scorer-ground-truth.test.ts +118 -0
  113. package/tests/smoke.test.ts +46 -0
  114. package/tests/unit/a2a-server.test.ts +175 -0
  115. package/tests/unit/ann-blocker.test.ts +117 -0
  116. package/tests/unit/api-server.test.ts +239 -0
  117. package/tests/unit/api.test.ts +77 -0
  118. package/tests/unit/autoconfig.test.ts +103 -0
  119. package/tests/unit/autofix.test.ts +71 -0
  120. package/tests/unit/blocker.test.ts +164 -0
  121. package/tests/unit/buildBlocksAsync.test.ts +63 -0
  122. package/tests/unit/cluster.test.ts +213 -0
  123. package/tests/unit/compare-clusters.test.ts +42 -0
  124. package/tests/unit/config-loader.test.ts +301 -0
  125. package/tests/unit/connectors-base.test.ts +48 -0
  126. package/tests/unit/cross-encoder-model.test.ts +198 -0
  127. package/tests/unit/cross-encoder.test.ts +173 -0
  128. package/tests/unit/db-connectors.test.ts +37 -0
  129. package/tests/unit/domain.test.ts +80 -0
  130. package/tests/unit/embedder.test.ts +151 -0
  131. package/tests/unit/evaluate.test.ts +85 -0
  132. package/tests/unit/explain.test.ts +73 -0
  133. package/tests/unit/golden.test.ts +97 -0
  134. package/tests/unit/graph-er.test.ts +173 -0
  135. package/tests/unit/hnsw-ann.test.ts +283 -0
  136. package/tests/unit/hubspot-connector.test.ts +118 -0
  137. package/tests/unit/ingest.test.ts +97 -0
  138. package/tests/unit/learned-blocking.test.ts +134 -0
  139. package/tests/unit/lineage.test.ts +135 -0
  140. package/tests/unit/match-one.test.ts +129 -0
  141. package/tests/unit/matchkey.test.ts +97 -0
  142. package/tests/unit/mcp-server.test.ts +183 -0
  143. package/tests/unit/memory.test.ts +119 -0
  144. package/tests/unit/pipeline.test.ts +118 -0
  145. package/tests/unit/pprl-protocol.test.ts +381 -0
  146. package/tests/unit/probabilistic.test.ts +494 -0
  147. package/tests/unit/profiler.test.ts +68 -0
  148. package/tests/unit/review-queue.test.ts +68 -0
  149. package/tests/unit/salesforce-connector.test.ts +148 -0
  150. package/tests/unit/scorer.test.ts +301 -0
  151. package/tests/unit/sensitivity.test.ts +154 -0
  152. package/tests/unit/standardize.test.ts +84 -0
  153. package/tests/unit/streaming.test.ts +82 -0
  154. package/tests/unit/transforms.test.ts +208 -0
  155. package/tests/unit/tui-widgets.test.ts +42 -0
  156. package/tests/unit/tui.test.ts +24 -0
  157. package/tests/unit/validate.test.ts +145 -0
  158. package/tests/unit/workers-parallel.test.ts +99 -0
  159. package/tests/unit/workers.test.ts +74 -0
  160. package/tsconfig.json +25 -0
  161. package/tsup.config.ts +37 -0
  162. package/vitest.config.ts +11 -0
@@ -0,0 +1,304 @@
1
+ /**
2
+ * types.ts — GoldenMatch config interfaces and result types.
3
+ * Edge-safe: no Node.js imports, no `process`.
4
+ */
5
+ type ColumnValue = string | number | boolean | null;
6
+ type Row = Readonly<Record<string, unknown>>;
7
+ /** A canonical pair key in the form "minId:maxId". Only produced by pairKey(). */
8
+ type PairKey = string & {
9
+ readonly __brand: "PairKey";
10
+ };
11
+ interface MatchkeyField {
12
+ readonly field: string;
13
+ readonly transforms: readonly string[];
14
+ readonly scorer: string;
15
+ readonly weight: number;
16
+ readonly model?: string;
17
+ readonly columns?: readonly string[];
18
+ readonly columnWeights?: Readonly<Record<string, number>>;
19
+ readonly levels?: number;
20
+ readonly partialThreshold?: number;
21
+ }
22
+ interface ExactMatchkey {
23
+ readonly name: string;
24
+ readonly type: "exact";
25
+ readonly fields: readonly MatchkeyField[];
26
+ }
27
+ interface WeightedMatchkey {
28
+ readonly name: string;
29
+ readonly type: "weighted";
30
+ readonly fields: readonly MatchkeyField[];
31
+ readonly threshold: number;
32
+ readonly autoThreshold?: boolean;
33
+ readonly rerank?: boolean;
34
+ readonly rerankModel?: string;
35
+ readonly rerankBand?: number;
36
+ }
37
+ interface ProbabilisticMatchkey {
38
+ readonly name: string;
39
+ readonly type: "probabilistic";
40
+ readonly fields: readonly MatchkeyField[];
41
+ readonly threshold?: number;
42
+ readonly emIterations?: number;
43
+ readonly convergenceThreshold?: number;
44
+ readonly linkThreshold?: number;
45
+ readonly reviewThreshold?: number;
46
+ }
47
+ type MatchkeyConfig = ExactMatchkey | WeightedMatchkey | ProbabilisticMatchkey;
48
+ interface BlockingKeyConfig {
49
+ readonly fields: readonly string[];
50
+ readonly transforms: readonly string[];
51
+ }
52
+ interface SortKeyField {
53
+ readonly column: string;
54
+ readonly transforms: readonly string[];
55
+ }
56
+ interface CanopyConfig {
57
+ readonly fields: readonly string[];
58
+ readonly looseThreshold: number;
59
+ readonly tightThreshold: number;
60
+ readonly maxCanopySize: number;
61
+ }
62
+ interface BlockingConfig {
63
+ readonly strategy: "static" | "adaptive" | "sorted_neighborhood" | "multi_pass" | "ann" | "canopy" | "ann_pairs" | "learned";
64
+ readonly keys: readonly BlockingKeyConfig[];
65
+ readonly maxBlockSize: number;
66
+ readonly skipOversized: boolean;
67
+ readonly autoSuggest?: boolean;
68
+ readonly autoSelect?: boolean;
69
+ readonly subBlockKeys?: readonly BlockingKeyConfig[];
70
+ readonly windowSize?: number;
71
+ readonly sortKey?: readonly SortKeyField[];
72
+ readonly passes?: readonly BlockingKeyConfig[];
73
+ readonly unionMode?: boolean;
74
+ readonly maxTotalComparisons?: number;
75
+ readonly annColumn?: string;
76
+ readonly annModel?: string;
77
+ readonly annTopK?: number;
78
+ readonly canopy?: CanopyConfig;
79
+ readonly learnedSampleSize?: number;
80
+ readonly learnedMinRecall?: number;
81
+ readonly learnedMinReduction?: number;
82
+ readonly learnedPredicateDepth?: number;
83
+ readonly learnedCachePath?: string;
84
+ }
85
+ interface GoldenFieldRule {
86
+ readonly strategy: "most_complete" | "majority_vote" | "source_priority" | "most_recent" | "first_non_null";
87
+ readonly dateColumn?: string;
88
+ readonly sourcePriority?: readonly string[];
89
+ }
90
+ interface GoldenRulesConfig {
91
+ readonly defaultStrategy: string;
92
+ readonly fieldRules: Readonly<Record<string, GoldenFieldRule>>;
93
+ readonly maxClusterSize: number;
94
+ readonly autoSplit: boolean;
95
+ readonly qualityWeighting: boolean;
96
+ readonly weakClusterThreshold: number;
97
+ }
98
+ interface StandardizationConfig {
99
+ readonly rules: Readonly<Record<string, readonly string[]>>;
100
+ }
101
+ interface ValidationRuleConfig {
102
+ readonly column: string;
103
+ readonly ruleType: "regex" | "min_length" | "max_length" | "not_null" | "in_set" | "format";
104
+ readonly params: Readonly<Record<string, unknown>>;
105
+ readonly action: "null" | "quarantine" | "flag";
106
+ }
107
+ interface ValidationConfig {
108
+ readonly rules: readonly ValidationRuleConfig[];
109
+ readonly autoFix: boolean;
110
+ }
111
+ interface QualityConfig {
112
+ readonly enabled: boolean;
113
+ readonly mode: "silent" | "announced" | "disabled";
114
+ readonly fixMode: "safe" | "moderate" | "none";
115
+ readonly domain?: string;
116
+ }
117
+ interface TransformConfig {
118
+ readonly enabled: boolean;
119
+ readonly mode: "silent" | "announced" | "disabled";
120
+ }
121
+ interface BudgetConfig {
122
+ readonly maxCostUsd?: number;
123
+ readonly maxCalls?: number;
124
+ readonly escalationModel?: string;
125
+ readonly escalationBand?: readonly number[];
126
+ readonly escalationBudgetPct?: number;
127
+ readonly warnAtPct?: number;
128
+ }
129
+ interface LLMScorerConfig {
130
+ readonly enabled: boolean;
131
+ readonly provider?: string;
132
+ readonly model?: string;
133
+ readonly autoThreshold: number;
134
+ readonly candidateLo: number;
135
+ readonly candidateHi: number;
136
+ readonly batchSize: number;
137
+ readonly maxWorkers: number;
138
+ readonly budget?: BudgetConfig;
139
+ readonly mode: "pairwise" | "cluster";
140
+ readonly clusterMaxSize?: number;
141
+ readonly clusterMinSize?: number;
142
+ }
143
+ interface DomainConfig {
144
+ readonly enabled: boolean;
145
+ readonly mode?: string;
146
+ readonly confidenceThreshold: number;
147
+ readonly llmValidation: boolean;
148
+ readonly budget?: BudgetConfig;
149
+ }
150
+ interface LearningConfig {
151
+ readonly thresholdMinCorrections: number;
152
+ readonly weightsMinCorrections: number;
153
+ }
154
+ interface MemoryConfig {
155
+ readonly enabled: boolean;
156
+ readonly backend: "sqlite" | "postgres";
157
+ readonly path?: string;
158
+ readonly trust: number;
159
+ readonly learning: LearningConfig;
160
+ }
161
+ interface InputFileConfig {
162
+ readonly path: string;
163
+ readonly idColumn?: string;
164
+ readonly sourceLabel?: string;
165
+ readonly sourceName?: string;
166
+ readonly columnMap?: Readonly<Record<string, string>>;
167
+ readonly delimiter?: string;
168
+ readonly encoding?: string;
169
+ readonly sheet?: string;
170
+ readonly parseMode?: string;
171
+ readonly headerRow?: number;
172
+ readonly hasHeader?: boolean;
173
+ readonly skipRows?: readonly number[];
174
+ }
175
+ interface InputConfig {
176
+ readonly files: readonly InputFileConfig[];
177
+ readonly fileA?: InputFileConfig;
178
+ readonly fileB?: InputFileConfig;
179
+ }
180
+ interface OutputConfig {
181
+ readonly path?: string;
182
+ readonly format?: string;
183
+ readonly directory?: string;
184
+ readonly runName?: string;
185
+ }
186
+ interface GoldenMatchConfig { // Top-level configuration object; every property is optional. makeConfig() builds one with defaults.
187
+ readonly matchkeys?: readonly MatchkeyConfig[]; // Matchkey list; read it through getMatchkeys(), which checks both this and matchSettings.
188
+ readonly matchSettings?: readonly MatchkeyConfig[]; // Alternate location for the matchkey list (same element type). NOTE(review): precedence when both are set is not visible here — confirm.
189
+ readonly blocking?: BlockingConfig;
190
+ readonly threshold?: number; // NOTE(review): presumably a global score cutoff; its relation to per-matchkey thresholds is not shown here — confirm.
191
+ readonly goldenRules?: GoldenRulesConfig;
192
+ readonly standardization?: StandardizationConfig;
193
+ readonly validation?: ValidationConfig;
194
+ readonly quality?: QualityConfig;
195
+ readonly transform?: TransformConfig;
196
+ readonly llmScorer?: LLMScorerConfig;
197
+ readonly domain?: DomainConfig;
198
+ readonly memory?: MemoryConfig;
199
+ readonly input?: InputConfig;
200
+ readonly output?: OutputConfig;
201
+ readonly backend?: string; // Free-form string; the accepted values are not declared in this file.
202
+ readonly llmAuto?: boolean;
203
+ readonly llmBoost?: boolean;
204
+ }
205
+ interface ScoredPair { // A pair of ids together with its match score. Build via makeScoredPair() rather than an object literal.
206
+ readonly idA: number; // makeScoredPair() guarantees idA <= idB (canonical order).
207
+ readonly idB: number;
208
+ readonly score: number; // NOTE(review): presumably a similarity in [0, 1]; the range is not stated in this file — confirm.
209
+ }
210
+ interface ClusterInfo {
211
+ readonly members: readonly number[];
212
+ readonly size: number;
213
+ readonly oversized: boolean;
214
+ readonly pairScores: ReadonlyMap<PairKey, number>;
215
+ readonly confidence: number;
216
+ readonly bottleneckPair: readonly [number, number] | null;
217
+ readonly clusterQuality: "strong" | "weak" | "split";
218
+ }
219
+ interface DedupeStats {
220
+ readonly totalRecords: number;
221
+ readonly totalClusters: number;
222
+ readonly matchRate: number;
223
+ readonly matchedRecords: number;
224
+ readonly uniqueRecords: number;
225
+ }
226
+ interface DedupeResult {
227
+ readonly goldenRecords: readonly Row[];
228
+ readonly clusters: ReadonlyMap<number, ClusterInfo>;
229
+ readonly dupes: readonly Row[];
230
+ readonly unique: readonly Row[];
231
+ readonly stats: DedupeStats;
232
+ readonly scoredPairs: readonly ScoredPair[];
233
+ readonly config: GoldenMatchConfig;
234
+ }
235
+ interface MatchResult {
236
+ readonly matched: readonly Row[];
237
+ readonly unmatched: readonly Row[];
238
+ readonly stats: Readonly<Record<string, unknown>>;
239
+ }
240
+ interface FieldProvenance {
241
+ readonly value: unknown;
242
+ readonly sourceRowId: number;
243
+ readonly strategy: string;
244
+ readonly confidence: number;
245
+ readonly candidates: readonly Readonly<Record<string, unknown>>[];
246
+ }
247
+ interface ClusterProvenance {
248
+ readonly clusterId: number;
249
+ readonly clusterQuality: string;
250
+ readonly clusterConfidence: number;
251
+ readonly fields: Readonly<Record<string, FieldProvenance>>;
252
+ }
253
+ interface BlockResult {
254
+ readonly blockKey: string;
255
+ readonly rows: readonly Row[];
256
+ readonly strategy: string;
257
+ readonly depth: number;
258
+ readonly parentKey?: string;
259
+ readonly preScoredPairs?: readonly ScoredPair[];
260
+ }
261
+ declare const VALID_SCORERS: Set<"exact" | "jaro_winkler" | "levenshtein" | "token_sort" | "soundex_match" | "embedding" | "record_embedding" | "ensemble" | "dice" | "jaccard">;
262
+ declare const VALID_TRANSFORMS: Set<"token_sort" | "lowercase" | "uppercase" | "strip" | "strip_all" | "soundex" | "metaphone" | "digits_only" | "alpha_only" | "normalize_whitespace" | "first_token" | "last_token">;
263
+ declare const VALID_STRATEGIES: Set<"most_complete" | "majority_vote" | "source_priority" | "most_recent" | "first_non_null">;
264
+ declare const VALID_STANDARDIZERS: Set<"strip" | "email" | "name_proper" | "name_upper" | "name_lower" | "phone" | "zip5" | "address" | "state" | "trim_whitespace">;
265
+ /**
266
+ * Create a ScoredPair guaranteeing idA <= idB (canonical order).
267
+ * Always use this instead of constructing `{ idA, idB, score }` directly.
268
+ */
269
+ declare function makeScoredPair(a: number, b: number, score: number): ScoredPair;
270
+ /** Create a MatchkeyField with sensible defaults. */
271
+ declare function makeMatchkeyField(partial: Partial<MatchkeyField> & Pick<MatchkeyField, "field">): MatchkeyField;
272
+ /**
273
+ * Shape accepted by `makeMatchkeyConfig`. All variant-specific fields are
274
+ * optional; the factory picks the right variant based on `type`.
275
+ */
276
+ interface MakeMatchkeyConfigInput {
277
+ readonly name: string;
278
+ readonly type?: "exact" | "weighted" | "probabilistic";
279
+ readonly fields?: readonly MatchkeyField[];
280
+ readonly threshold?: number;
281
+ readonly autoThreshold?: boolean;
282
+ readonly rerank?: boolean;
283
+ readonly rerankModel?: string;
284
+ readonly rerankBand?: number;
285
+ readonly emIterations?: number;
286
+ readonly convergenceThreshold?: number;
287
+ readonly linkThreshold?: number;
288
+ readonly reviewThreshold?: number;
289
+ }
290
+ /** Create a MatchkeyConfig with sensible defaults. Produces the correct variant. */
291
+ declare function makeMatchkeyConfig(partial: MakeMatchkeyConfigInput): MatchkeyConfig;
292
+ /** Create a BlockingConfig with sensible defaults. */
293
+ declare function makeBlockingConfig(partial?: Partial<BlockingConfig>): BlockingConfig;
294
+ /** Create a GoldenRulesConfig with sensible defaults. */
295
+ declare function makeGoldenRulesConfig(partial?: Partial<GoldenRulesConfig>): GoldenRulesConfig;
296
+ /** Create a full GoldenMatchConfig with sensible defaults. */
297
+ declare function makeConfig(partial?: Partial<GoldenMatchConfig>): GoldenMatchConfig;
298
+ /**
299
+ * Return matchkeys from config, checking both `matchkeys` and `matchSettings`.
300
+ * Mirrors Python's `GoldenMatchConfig.get_matchkeys()`.
301
+ */
302
+ declare function getMatchkeys(config: GoldenMatchConfig): readonly MatchkeyConfig[];
303
+
304
+ export { makeConfig as A, type BlockResult as B, type CanopyConfig as C, type DedupeResult as D, type ExactMatchkey as E, type FieldProvenance as F, type GoldenMatchConfig as G, makeGoldenRulesConfig as H, type InputConfig as I, makeMatchkeyConfig as J, makeMatchkeyField as K, type LLMScorerConfig as L, type MatchkeyConfig as M, makeScoredPair as N, type OutputConfig as O, type PairKey as P, type QualityConfig as Q, type Row as R, type ScoredPair as S, type TransformConfig as T, VALID_SCORERS as V, type WeightedMatchkey as W, type MatchResult as a, type BlockingConfig as b, type BlockingKeyConfig as c, type BudgetConfig as d, type ClusterInfo as e, type ClusterProvenance as f, type ColumnValue as g, type DedupeStats as h, type DomainConfig as i, type GoldenFieldRule as j, type GoldenRulesConfig as k, type InputFileConfig as l, type LearningConfig as m, type MakeMatchkeyConfigInput as n, type MatchkeyField as o, type MemoryConfig as p, type ProbabilisticMatchkey as q, type SortKeyField as r, type StandardizationConfig as s, VALID_STANDARDIZERS as t, VALID_STRATEGIES as u, VALID_TRANSFORMS as v, type ValidationConfig as w, type ValidationRuleConfig as x, getMatchkeys as y, makeBlockingConfig as z };
@@ -0,0 +1,304 @@
1
+ /**
2
+ * types.ts — GoldenMatch config interfaces and result types.
3
+ * Edge-safe: no Node.js imports, no `process`.
4
+ */
5
+ type ColumnValue = string | number | boolean | null;
6
+ type Row = Readonly<Record<string, unknown>>;
7
+ /** A canonical pair key in the form "minId:maxId". Only produced by pairKey(). */
8
+ type PairKey = string & {
9
+ readonly __brand: "PairKey";
10
+ };
11
+ interface MatchkeyField {
12
+ readonly field: string;
13
+ readonly transforms: readonly string[];
14
+ readonly scorer: string;
15
+ readonly weight: number;
16
+ readonly model?: string;
17
+ readonly columns?: readonly string[];
18
+ readonly columnWeights?: Readonly<Record<string, number>>;
19
+ readonly levels?: number;
20
+ readonly partialThreshold?: number;
21
+ }
22
+ interface ExactMatchkey {
23
+ readonly name: string;
24
+ readonly type: "exact";
25
+ readonly fields: readonly MatchkeyField[];
26
+ }
27
+ interface WeightedMatchkey {
28
+ readonly name: string;
29
+ readonly type: "weighted";
30
+ readonly fields: readonly MatchkeyField[];
31
+ readonly threshold: number;
32
+ readonly autoThreshold?: boolean;
33
+ readonly rerank?: boolean;
34
+ readonly rerankModel?: string;
35
+ readonly rerankBand?: number;
36
+ }
37
+ interface ProbabilisticMatchkey {
38
+ readonly name: string;
39
+ readonly type: "probabilistic";
40
+ readonly fields: readonly MatchkeyField[];
41
+ readonly threshold?: number;
42
+ readonly emIterations?: number;
43
+ readonly convergenceThreshold?: number;
44
+ readonly linkThreshold?: number;
45
+ readonly reviewThreshold?: number;
46
+ }
47
+ type MatchkeyConfig = ExactMatchkey | WeightedMatchkey | ProbabilisticMatchkey;
48
+ interface BlockingKeyConfig {
49
+ readonly fields: readonly string[];
50
+ readonly transforms: readonly string[];
51
+ }
52
+ interface SortKeyField {
53
+ readonly column: string;
54
+ readonly transforms: readonly string[];
55
+ }
56
+ interface CanopyConfig {
57
+ readonly fields: readonly string[];
58
+ readonly looseThreshold: number;
59
+ readonly tightThreshold: number;
60
+ readonly maxCanopySize: number;
61
+ }
62
+ interface BlockingConfig {
63
+ readonly strategy: "static" | "adaptive" | "sorted_neighborhood" | "multi_pass" | "ann" | "canopy" | "ann_pairs" | "learned";
64
+ readonly keys: readonly BlockingKeyConfig[];
65
+ readonly maxBlockSize: number;
66
+ readonly skipOversized: boolean;
67
+ readonly autoSuggest?: boolean;
68
+ readonly autoSelect?: boolean;
69
+ readonly subBlockKeys?: readonly BlockingKeyConfig[];
70
+ readonly windowSize?: number;
71
+ readonly sortKey?: readonly SortKeyField[];
72
+ readonly passes?: readonly BlockingKeyConfig[];
73
+ readonly unionMode?: boolean;
74
+ readonly maxTotalComparisons?: number;
75
+ readonly annColumn?: string;
76
+ readonly annModel?: string;
77
+ readonly annTopK?: number;
78
+ readonly canopy?: CanopyConfig;
79
+ readonly learnedSampleSize?: number;
80
+ readonly learnedMinRecall?: number;
81
+ readonly learnedMinReduction?: number;
82
+ readonly learnedPredicateDepth?: number;
83
+ readonly learnedCachePath?: string;
84
+ }
85
+ interface GoldenFieldRule {
86
+ readonly strategy: "most_complete" | "majority_vote" | "source_priority" | "most_recent" | "first_non_null";
87
+ readonly dateColumn?: string;
88
+ readonly sourcePriority?: readonly string[];
89
+ }
90
+ interface GoldenRulesConfig {
91
+ readonly defaultStrategy: string;
92
+ readonly fieldRules: Readonly<Record<string, GoldenFieldRule>>;
93
+ readonly maxClusterSize: number;
94
+ readonly autoSplit: boolean;
95
+ readonly qualityWeighting: boolean;
96
+ readonly weakClusterThreshold: number;
97
+ }
98
+ interface StandardizationConfig {
99
+ readonly rules: Readonly<Record<string, readonly string[]>>;
100
+ }
101
+ interface ValidationRuleConfig {
102
+ readonly column: string;
103
+ readonly ruleType: "regex" | "min_length" | "max_length" | "not_null" | "in_set" | "format";
104
+ readonly params: Readonly<Record<string, unknown>>;
105
+ readonly action: "null" | "quarantine" | "flag";
106
+ }
107
+ interface ValidationConfig {
108
+ readonly rules: readonly ValidationRuleConfig[];
109
+ readonly autoFix: boolean;
110
+ }
111
+ interface QualityConfig {
112
+ readonly enabled: boolean;
113
+ readonly mode: "silent" | "announced" | "disabled";
114
+ readonly fixMode: "safe" | "moderate" | "none";
115
+ readonly domain?: string;
116
+ }
117
+ interface TransformConfig {
118
+ readonly enabled: boolean;
119
+ readonly mode: "silent" | "announced" | "disabled";
120
+ }
121
+ interface BudgetConfig {
122
+ readonly maxCostUsd?: number;
123
+ readonly maxCalls?: number;
124
+ readonly escalationModel?: string;
125
+ readonly escalationBand?: readonly number[];
126
+ readonly escalationBudgetPct?: number;
127
+ readonly warnAtPct?: number;
128
+ }
129
+ interface LLMScorerConfig {
130
+ readonly enabled: boolean;
131
+ readonly provider?: string;
132
+ readonly model?: string;
133
+ readonly autoThreshold: number;
134
+ readonly candidateLo: number;
135
+ readonly candidateHi: number;
136
+ readonly batchSize: number;
137
+ readonly maxWorkers: number;
138
+ readonly budget?: BudgetConfig;
139
+ readonly mode: "pairwise" | "cluster";
140
+ readonly clusterMaxSize?: number;
141
+ readonly clusterMinSize?: number;
142
+ }
143
+ interface DomainConfig {
144
+ readonly enabled: boolean;
145
+ readonly mode?: string;
146
+ readonly confidenceThreshold: number;
147
+ readonly llmValidation: boolean;
148
+ readonly budget?: BudgetConfig;
149
+ }
150
+ interface LearningConfig {
151
+ readonly thresholdMinCorrections: number;
152
+ readonly weightsMinCorrections: number;
153
+ }
154
+ interface MemoryConfig {
155
+ readonly enabled: boolean;
156
+ readonly backend: "sqlite" | "postgres";
157
+ readonly path?: string;
158
+ readonly trust: number;
159
+ readonly learning: LearningConfig;
160
+ }
161
+ interface InputFileConfig {
162
+ readonly path: string;
163
+ readonly idColumn?: string;
164
+ readonly sourceLabel?: string;
165
+ readonly sourceName?: string;
166
+ readonly columnMap?: Readonly<Record<string, string>>;
167
+ readonly delimiter?: string;
168
+ readonly encoding?: string;
169
+ readonly sheet?: string;
170
+ readonly parseMode?: string;
171
+ readonly headerRow?: number;
172
+ readonly hasHeader?: boolean;
173
+ readonly skipRows?: readonly number[];
174
+ }
175
+ interface InputConfig {
176
+ readonly files: readonly InputFileConfig[];
177
+ readonly fileA?: InputFileConfig;
178
+ readonly fileB?: InputFileConfig;
179
+ }
180
+ interface OutputConfig {
181
+ readonly path?: string;
182
+ readonly format?: string;
183
+ readonly directory?: string;
184
+ readonly runName?: string;
185
+ }
186
+ interface GoldenMatchConfig { // Top-level configuration object; every property is optional. makeConfig() builds one with defaults.
187
+ readonly matchkeys?: readonly MatchkeyConfig[]; // Matchkey list; read it through getMatchkeys(), which checks both this and matchSettings.
188
+ readonly matchSettings?: readonly MatchkeyConfig[]; // Alternate location for the matchkey list (same element type). NOTE(review): precedence when both are set is not visible here — confirm.
189
+ readonly blocking?: BlockingConfig;
190
+ readonly threshold?: number; // NOTE(review): presumably a global score cutoff; its relation to per-matchkey thresholds is not shown here — confirm.
191
+ readonly goldenRules?: GoldenRulesConfig;
192
+ readonly standardization?: StandardizationConfig;
193
+ readonly validation?: ValidationConfig;
194
+ readonly quality?: QualityConfig;
195
+ readonly transform?: TransformConfig;
196
+ readonly llmScorer?: LLMScorerConfig;
197
+ readonly domain?: DomainConfig;
198
+ readonly memory?: MemoryConfig;
199
+ readonly input?: InputConfig;
200
+ readonly output?: OutputConfig;
201
+ readonly backend?: string; // Free-form string; the accepted values are not declared in this file.
202
+ readonly llmAuto?: boolean;
203
+ readonly llmBoost?: boolean;
204
+ }
205
+ interface ScoredPair { // A pair of ids together with its match score. Build via makeScoredPair() rather than an object literal.
206
+ readonly idA: number; // makeScoredPair() guarantees idA <= idB (canonical order).
207
+ readonly idB: number;
208
+ readonly score: number; // NOTE(review): presumably a similarity in [0, 1]; the range is not stated in this file — confirm.
209
+ }
210
+ interface ClusterInfo {
211
+ readonly members: readonly number[];
212
+ readonly size: number;
213
+ readonly oversized: boolean;
214
+ readonly pairScores: ReadonlyMap<PairKey, number>;
215
+ readonly confidence: number;
216
+ readonly bottleneckPair: readonly [number, number] | null;
217
+ readonly clusterQuality: "strong" | "weak" | "split";
218
+ }
219
+ interface DedupeStats {
220
+ readonly totalRecords: number;
221
+ readonly totalClusters: number;
222
+ readonly matchRate: number;
223
+ readonly matchedRecords: number;
224
+ readonly uniqueRecords: number;
225
+ }
226
+ interface DedupeResult {
227
+ readonly goldenRecords: readonly Row[];
228
+ readonly clusters: ReadonlyMap<number, ClusterInfo>;
229
+ readonly dupes: readonly Row[];
230
+ readonly unique: readonly Row[];
231
+ readonly stats: DedupeStats;
232
+ readonly scoredPairs: readonly ScoredPair[];
233
+ readonly config: GoldenMatchConfig;
234
+ }
235
+ interface MatchResult {
236
+ readonly matched: readonly Row[];
237
+ readonly unmatched: readonly Row[];
238
+ readonly stats: Readonly<Record<string, unknown>>;
239
+ }
240
+ interface FieldProvenance {
241
+ readonly value: unknown;
242
+ readonly sourceRowId: number;
243
+ readonly strategy: string;
244
+ readonly confidence: number;
245
+ readonly candidates: readonly Readonly<Record<string, unknown>>[];
246
+ }
247
+ interface ClusterProvenance {
248
+ readonly clusterId: number;
249
+ readonly clusterQuality: string;
250
+ readonly clusterConfidence: number;
251
+ readonly fields: Readonly<Record<string, FieldProvenance>>;
252
+ }
253
+ interface BlockResult {
254
+ readonly blockKey: string;
255
+ readonly rows: readonly Row[];
256
+ readonly strategy: string;
257
+ readonly depth: number;
258
+ readonly parentKey?: string;
259
+ readonly preScoredPairs?: readonly ScoredPair[];
260
+ }
261
+ declare const VALID_SCORERS: Set<"exact" | "jaro_winkler" | "levenshtein" | "token_sort" | "soundex_match" | "embedding" | "record_embedding" | "ensemble" | "dice" | "jaccard">;
262
+ declare const VALID_TRANSFORMS: Set<"token_sort" | "lowercase" | "uppercase" | "strip" | "strip_all" | "soundex" | "metaphone" | "digits_only" | "alpha_only" | "normalize_whitespace" | "first_token" | "last_token">;
263
+ declare const VALID_STRATEGIES: Set<"most_complete" | "majority_vote" | "source_priority" | "most_recent" | "first_non_null">;
264
+ declare const VALID_STANDARDIZERS: Set<"strip" | "email" | "name_proper" | "name_upper" | "name_lower" | "phone" | "zip5" | "address" | "state" | "trim_whitespace">;
265
+ /**
266
+ * Create a ScoredPair guaranteeing idA <= idB (canonical order).
267
+ * Always use this instead of constructing `{ idA, idB, score }` directly.
268
+ */
269
+ declare function makeScoredPair(a: number, b: number, score: number): ScoredPair;
270
+ /** Create a MatchkeyField with sensible defaults. */
271
+ declare function makeMatchkeyField(partial: Partial<MatchkeyField> & Pick<MatchkeyField, "field">): MatchkeyField;
272
+ /**
273
+ * Shape accepted by `makeMatchkeyConfig`. All variant-specific fields are
274
+ * optional; the factory picks the right variant based on `type`.
275
+ */
276
+ interface MakeMatchkeyConfigInput {
277
+ readonly name: string;
278
+ readonly type?: "exact" | "weighted" | "probabilistic";
279
+ readonly fields?: readonly MatchkeyField[];
280
+ readonly threshold?: number;
281
+ readonly autoThreshold?: boolean;
282
+ readonly rerank?: boolean;
283
+ readonly rerankModel?: string;
284
+ readonly rerankBand?: number;
285
+ readonly emIterations?: number;
286
+ readonly convergenceThreshold?: number;
287
+ readonly linkThreshold?: number;
288
+ readonly reviewThreshold?: number;
289
+ }
290
+ /** Create a MatchkeyConfig with sensible defaults. Produces the correct variant. */
291
+ declare function makeMatchkeyConfig(partial: MakeMatchkeyConfigInput): MatchkeyConfig;
292
+ /** Create a BlockingConfig with sensible defaults. */
293
+ declare function makeBlockingConfig(partial?: Partial<BlockingConfig>): BlockingConfig;
294
+ /** Create a GoldenRulesConfig with sensible defaults. */
295
+ declare function makeGoldenRulesConfig(partial?: Partial<GoldenRulesConfig>): GoldenRulesConfig;
296
+ /** Create a full GoldenMatchConfig with sensible defaults. */
297
+ declare function makeConfig(partial?: Partial<GoldenMatchConfig>): GoldenMatchConfig;
298
+ /**
299
+ * Return matchkeys from config, checking both `matchkeys` and `matchSettings`.
300
+ * Mirrors Python's `GoldenMatchConfig.get_matchkeys()`.
301
+ */
302
+ declare function getMatchkeys(config: GoldenMatchConfig): readonly MatchkeyConfig[];
303
+
304
+ export { makeConfig as A, type BlockResult as B, type CanopyConfig as C, type DedupeResult as D, type ExactMatchkey as E, type FieldProvenance as F, type GoldenMatchConfig as G, makeGoldenRulesConfig as H, type InputConfig as I, makeMatchkeyConfig as J, makeMatchkeyField as K, type LLMScorerConfig as L, type MatchkeyConfig as M, makeScoredPair as N, type OutputConfig as O, type PairKey as P, type QualityConfig as Q, type Row as R, type ScoredPair as S, type TransformConfig as T, VALID_SCORERS as V, type WeightedMatchkey as W, type MatchResult as a, type BlockingConfig as b, type BlockingKeyConfig as c, type BudgetConfig as d, type ClusterInfo as e, type ClusterProvenance as f, type ColumnValue as g, type DedupeStats as h, type DomainConfig as i, type GoldenFieldRule as j, type GoldenRulesConfig as k, type InputFileConfig as l, type LearningConfig as m, type MakeMatchkeyConfigInput as n, type MatchkeyField as o, type MemoryConfig as p, type ProbabilisticMatchkey as q, type SortKeyField as r, type StandardizationConfig as s, VALID_STANDARDIZERS as t, VALID_STRATEGIES as u, VALID_TRANSFORMS as v, type ValidationConfig as w, type ValidationRuleConfig as x, getMatchkeys as y, makeBlockingConfig as z };
@@ -0,0 +1,60 @@
1
+ /**
2
+ * Basic deduplication: find duplicate people in a small array.
3
+ * Run: npx tsx examples/01-basic-dedupe.ts
4
+ */
5
+ import { dedupe } from "goldenmatch";
6
+
7
+ const people = [ // sample input: ids 1 and 2 share an email; ids 1-3 share zip 12345, ids 4-5 share zip 54321
8
+ { id: 1, first_name: "John", last_name: "Smith", email: "john@example.com", zip: "12345" },
9
+ { id: 2, first_name: "Jon", last_name: "Smith", email: "john@example.com", zip: "12345" },
10
+ { id: 3, first_name: "Johnny", last_name: "Smith", email: "j.smith@example.com", zip: "12345" },
11
+ { id: 4, first_name: "Jane", last_name: "Doe", email: "jane@example.com", zip: "54321" },
12
+ { id: 5, first_name: "Janet", last_name: "Doe", email: "janet@example.com", zip: "54321" },
13
+ ];
14
+
15
+ const result = dedupe(people, { // NOTE(review): option semantics below are assumptions; confirm against the goldenmatch docs
16
+ exact: ["email"], // presumably: rows with an identical email are treated as exact matches
17
+ fuzzy: { first_name: 0.8, last_name: 0.85 }, // presumably: per-field similarity thresholds
18
+ blocking: ["zip"], // presumably: only rows sharing a zip are compared with each other
19
+ threshold: 0.85, // presumably: overall score cutoff for accepting a pair as a match
20
+ });
21
+
22
+ console.log(`Records: ${result.stats.totalRecords}`);
23
+ console.log(`Clusters: ${result.stats.totalClusters}`);
24
+ console.log(`Match rate: ${(result.stats.matchRate * 100).toFixed(1)}%\n`); // matchRate is scaled by 100 and shown with one decimal as a percentage
25
+
26
+ console.log("Golden records:");
27
+ for (const rec of result.goldenRecords) { // print every golden record object as-is
28
+ console.log(" ", rec);
29
+ }
30
+
31
+ console.log("\nDuplicate groups:");
32
+ for (const [cid, cluster] of result.clusters) { // clusters is iterated as [clusterId, cluster] entries
33
+ if (cluster.size < 2) continue; // skip clusters with fewer than two members: nothing duplicated to report
34
+ console.log(
35
+ ` Cluster ${cid} (${cluster.size} members, confidence ${cluster.confidence.toFixed(2)}):`, // header line; confidence shown with two decimals
36
+ );
37
+ for (const mid of cluster.members) { // one output line per member of the cluster
38
+ console.log(` row ${mid}`);
39
+ }
40
+ }
41
+
42
+ /**
43
+ * Expected output (approximate):
44
+ *
45
+ * Records: 5
46
+ * Clusters: 3
47
+ * Match rate: 40.0%
48
+ *
49
+ * Golden records:
50
+ * { __cluster_id__: 0, id: 1, first_name: 'John', ... }
51
+ * ...
52
+ *
53
+ * Duplicate groups:
54
+ * Cluster 0 (2 members, confidence 0.95): followed by one "row N" line per member (row 0, row 1)
55
+ * Cluster 1 (...): ...
56
+ *
57
+ * Python -> TS differences:
58
+ * - Python returns DataFrames; TS returns plain arrays + ReadonlyMap of clusters.
59
+ * - Python `result.stats["total_records"]`; TS `result.stats.totalRecords`.
60
+ */