goldenmatch 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (162) hide show
  1. package/README.md +140 -0
  2. package/dist/cli.cjs +6079 -0
  3. package/dist/cli.cjs.map +1 -0
  4. package/dist/cli.d.cts +1 -0
  5. package/dist/cli.d.ts +1 -0
  6. package/dist/cli.js +6076 -0
  7. package/dist/cli.js.map +1 -0
  8. package/dist/core/index.cjs +8449 -0
  9. package/dist/core/index.cjs.map +1 -0
  10. package/dist/core/index.d.cts +1972 -0
  11. package/dist/core/index.d.ts +1972 -0
  12. package/dist/core/index.js +8318 -0
  13. package/dist/core/index.js.map +1 -0
  14. package/dist/index.cjs +8449 -0
  15. package/dist/index.cjs.map +1 -0
  16. package/dist/index.d.cts +2 -0
  17. package/dist/index.d.ts +2 -0
  18. package/dist/index.js +8318 -0
  19. package/dist/index.js.map +1 -0
  20. package/dist/node/backends/score-worker.cjs +934 -0
  21. package/dist/node/backends/score-worker.cjs.map +1 -0
  22. package/dist/node/backends/score-worker.d.cts +14 -0
  23. package/dist/node/backends/score-worker.d.ts +14 -0
  24. package/dist/node/backends/score-worker.js +932 -0
  25. package/dist/node/backends/score-worker.js.map +1 -0
  26. package/dist/node/index.cjs +11430 -0
  27. package/dist/node/index.cjs.map +1 -0
  28. package/dist/node/index.d.cts +554 -0
  29. package/dist/node/index.d.ts +554 -0
  30. package/dist/node/index.js +11277 -0
  31. package/dist/node/index.js.map +1 -0
  32. package/dist/types-DhUdX5Rc.d.cts +304 -0
  33. package/dist/types-DhUdX5Rc.d.ts +304 -0
  34. package/examples/01-basic-dedupe.ts +60 -0
  35. package/examples/02-match-two-datasets.ts +48 -0
  36. package/examples/03-csv-file-pipeline.ts +62 -0
  37. package/examples/04-string-scoring.ts +63 -0
  38. package/examples/05-custom-config.ts +94 -0
  39. package/examples/06-probabilistic-fs.ts +72 -0
  40. package/examples/07-pprl-privacy.ts +76 -0
  41. package/examples/08-streaming.ts +79 -0
  42. package/examples/09-llm-scorer.ts +79 -0
  43. package/examples/10-explain.ts +60 -0
  44. package/examples/11-evaluate.ts +61 -0
  45. package/examples/README.md +53 -0
  46. package/package.json +66 -0
  47. package/src/cli.ts +372 -0
  48. package/src/core/ann-blocker.ts +593 -0
  49. package/src/core/api.ts +220 -0
  50. package/src/core/autoconfig.ts +363 -0
  51. package/src/core/autofix.ts +102 -0
  52. package/src/core/blocker.ts +655 -0
  53. package/src/core/cluster.ts +699 -0
  54. package/src/core/compare-clusters.ts +176 -0
  55. package/src/core/config/loader.ts +869 -0
  56. package/src/core/cross-encoder.ts +614 -0
  57. package/src/core/data.ts +430 -0
  58. package/src/core/domain.ts +277 -0
  59. package/src/core/embedder.ts +562 -0
  60. package/src/core/evaluate.ts +156 -0
  61. package/src/core/explain.ts +352 -0
  62. package/src/core/golden.ts +524 -0
  63. package/src/core/graph-er.ts +371 -0
  64. package/src/core/index.ts +314 -0
  65. package/src/core/ingest.ts +112 -0
  66. package/src/core/learned-blocking.ts +305 -0
  67. package/src/core/lineage.ts +221 -0
  68. package/src/core/llm/budget.ts +258 -0
  69. package/src/core/llm/cluster.ts +542 -0
  70. package/src/core/llm/scorer.ts +396 -0
  71. package/src/core/match-one.ts +95 -0
  72. package/src/core/matchkey.ts +97 -0
  73. package/src/core/memory/corrections.ts +179 -0
  74. package/src/core/memory/learner.ts +218 -0
  75. package/src/core/memory/store.ts +114 -0
  76. package/src/core/pipeline.ts +366 -0
  77. package/src/core/pprl/protocol.ts +216 -0
  78. package/src/core/probabilistic.ts +511 -0
  79. package/src/core/profiler.ts +212 -0
  80. package/src/core/quality.ts +197 -0
  81. package/src/core/review-queue.ts +177 -0
  82. package/src/core/scorer.ts +855 -0
  83. package/src/core/sensitivity.ts +196 -0
  84. package/src/core/standardize.ts +279 -0
  85. package/src/core/streaming.ts +128 -0
  86. package/src/core/transforms.ts +599 -0
  87. package/src/core/types.ts +570 -0
  88. package/src/core/validate.ts +243 -0
  89. package/src/index.ts +8 -0
  90. package/src/node/a2a/server.ts +470 -0
  91. package/src/node/api/server.ts +412 -0
  92. package/src/node/backends/duckdb.ts +130 -0
  93. package/src/node/backends/score-worker.ts +41 -0
  94. package/src/node/backends/workers.ts +212 -0
  95. package/src/node/config-file.ts +66 -0
  96. package/src/node/connectors/base.ts +57 -0
  97. package/src/node/connectors/bigquery.ts +61 -0
  98. package/src/node/connectors/databricks.ts +69 -0
  99. package/src/node/connectors/file.ts +350 -0
  100. package/src/node/connectors/hubspot.ts +62 -0
  101. package/src/node/connectors/index.ts +43 -0
  102. package/src/node/connectors/salesforce.ts +93 -0
  103. package/src/node/connectors/snowflake.ts +73 -0
  104. package/src/node/db/postgres.ts +173 -0
  105. package/src/node/db/sync.ts +103 -0
  106. package/src/node/dedupe-file.ts +156 -0
  107. package/src/node/index.ts +89 -0
  108. package/src/node/mcp/server.ts +940 -0
  109. package/src/node/tui/app.ts +756 -0
  110. package/src/node/tui/index.ts +6 -0
  111. package/src/node/tui/widgets.ts +128 -0
  112. package/tests/parity/scorer-ground-truth.test.ts +118 -0
  113. package/tests/smoke.test.ts +46 -0
  114. package/tests/unit/a2a-server.test.ts +175 -0
  115. package/tests/unit/ann-blocker.test.ts +117 -0
  116. package/tests/unit/api-server.test.ts +239 -0
  117. package/tests/unit/api.test.ts +77 -0
  118. package/tests/unit/autoconfig.test.ts +103 -0
  119. package/tests/unit/autofix.test.ts +71 -0
  120. package/tests/unit/blocker.test.ts +164 -0
  121. package/tests/unit/buildBlocksAsync.test.ts +63 -0
  122. package/tests/unit/cluster.test.ts +213 -0
  123. package/tests/unit/compare-clusters.test.ts +42 -0
  124. package/tests/unit/config-loader.test.ts +301 -0
  125. package/tests/unit/connectors-base.test.ts +48 -0
  126. package/tests/unit/cross-encoder-model.test.ts +198 -0
  127. package/tests/unit/cross-encoder.test.ts +173 -0
  128. package/tests/unit/db-connectors.test.ts +37 -0
  129. package/tests/unit/domain.test.ts +80 -0
  130. package/tests/unit/embedder.test.ts +151 -0
  131. package/tests/unit/evaluate.test.ts +85 -0
  132. package/tests/unit/explain.test.ts +73 -0
  133. package/tests/unit/golden.test.ts +97 -0
  134. package/tests/unit/graph-er.test.ts +173 -0
  135. package/tests/unit/hnsw-ann.test.ts +283 -0
  136. package/tests/unit/hubspot-connector.test.ts +118 -0
  137. package/tests/unit/ingest.test.ts +97 -0
  138. package/tests/unit/learned-blocking.test.ts +134 -0
  139. package/tests/unit/lineage.test.ts +135 -0
  140. package/tests/unit/match-one.test.ts +129 -0
  141. package/tests/unit/matchkey.test.ts +97 -0
  142. package/tests/unit/mcp-server.test.ts +183 -0
  143. package/tests/unit/memory.test.ts +119 -0
  144. package/tests/unit/pipeline.test.ts +118 -0
  145. package/tests/unit/pprl-protocol.test.ts +381 -0
  146. package/tests/unit/probabilistic.test.ts +494 -0
  147. package/tests/unit/profiler.test.ts +68 -0
  148. package/tests/unit/review-queue.test.ts +68 -0
  149. package/tests/unit/salesforce-connector.test.ts +148 -0
  150. package/tests/unit/scorer.test.ts +301 -0
  151. package/tests/unit/sensitivity.test.ts +154 -0
  152. package/tests/unit/standardize.test.ts +84 -0
  153. package/tests/unit/streaming.test.ts +82 -0
  154. package/tests/unit/transforms.test.ts +208 -0
  155. package/tests/unit/tui-widgets.test.ts +42 -0
  156. package/tests/unit/tui.test.ts +24 -0
  157. package/tests/unit/validate.test.ts +145 -0
  158. package/tests/unit/workers-parallel.test.ts +99 -0
  159. package/tests/unit/workers.test.ts +74 -0
  160. package/tsconfig.json +25 -0
  161. package/tsup.config.ts +37 -0
  162. package/vitest.config.ts +11 -0
@@ -0,0 +1,243 @@
1
+ /**
2
+ * validate.ts — Column validation rules with quarantine/flag actions.
3
+ * Edge-safe: no Node.js imports, pure TypeScript only.
4
+ *
5
+ * Ports goldenmatch/core/validate.py.
6
+ */
7
+
8
+ import type { Row } from "./types.js";
9
+
10
+ // ---------------------------------------------------------------------------
11
+ // Types
12
+ // ---------------------------------------------------------------------------
13
+
14
/**
 * What happens to a row when one of its values fails a rule:
 * - `"null"`: the failing value is replaced with `null`
 * - `"quarantine"`: the row is moved to the quarantine bucket
 * - `"flag"`: the row is kept and tagged in its `__flags__` list
 */
export type ValidationAction = "null" | "quarantine" | "flag";

/** The kinds of check a rule can apply to a column value. */
export type ValidationRuleType =
  | "regex"
  | "min_length"
  | "max_length"
  | "not_null"
  | "in_set"
  | "format";

/** A single check bound to one column, plus the action taken on failure. */
export interface ValidationRule {
  /** Name of the row property the rule inspects. */
  readonly column: string;
  /** Which check to run. */
  readonly ruleType: ValidationRuleType;
  /**
   * Rule-specific settings. The checker reads `pattern` (regex), `value`
   * (min_length / max_length), `values` (in_set) and `name` (format).
   */
  readonly params: Readonly<Record<string, unknown>>;
  /** What to do with the row when the check fails. */
  readonly action: ValidationAction;
}

/** Aggregate counters produced by a validation run. */
export interface ValidationReport {
  /** Number of input rows. */
  readonly totalRows: number;
  /** Number of rows moved to quarantine. */
  readonly quarantined: number;
  /** Number of valid rows that received at least one flag. */
  readonly flagged: number;
  /** Violation count per rule, keyed by `column:ruleType`. */
  readonly ruleViolations: Readonly<Record<string, number>>;
}

/** Output of a validation run: surviving rows, quarantined rows, and the report. */
export interface ValidationResult {
  readonly valid: Row[];
  readonly quarantine: Row[];
  readonly report: ValidationReport;
}
43
+
44
+ // ---------------------------------------------------------------------------
45
+ // Built-in format matchers
46
+ // ---------------------------------------------------------------------------
47
+
48
/**
 * Built-in patterns selectable by a "format" rule through `params.name`.
 *
 * The table is created on a null prototype so that a lookup with an
 * arbitrary, user-supplied name (e.g. "constructor", "toString",
 * "__proto__") yields `undefined` instead of an inherited
 * `Object.prototype` member, which would not be a RegExp and would crash
 * the caller on `.test()`. It is frozen because it is shared module state.
 */
const FORMAT_MATCHERS: Readonly<Record<string, RegExp>> = Object.freeze(
  Object.assign(Object.create(null) as Record<string, RegExp>, {
    email: /^[^@\s]+@[^@\s]+\.[^@\s]+$/,
    phone: /^\+?[\d\s().-]{7,}$/,
    zip: /^\d{5}(-\d{4})?$/,
    date: /^\d{4}-\d{2}-\d{2}$/,
  }),
);
54
+
55
+ // ---------------------------------------------------------------------------
56
+ // Rule checker
57
+ // ---------------------------------------------------------------------------
58
+
59
/**
 * Convert a scalar cell value to its string form.
 *
 * Strings are returned unchanged; numbers and booleans go through
 * `String()`. Everything else (null, undefined, objects, arrays, symbols,
 * bigints, functions) yields `null`.
 */
function valueToStr(v: unknown): string | null {
  switch (typeof v) {
    case "string":
      return v;
    case "number":
    case "boolean":
      return String(v);
    default:
      // typeof null is "object", so null lands here together with undefined.
      return null;
  }
}
65
+
66
/**
 * Compile a rule into a checker function. Expensive work (regex compilation,
 * building the allowed-value set) happens here, not on every row.
 *
 * Contract of the returned checker: `true` means the value passes.
 * - Every rule type except "not_null" passes `null` / `undefined` values.
 * - "not_null" fails `null`, `undefined` and the empty string.
 * - "regex" with a missing/non-string or invalid `params.pattern` fails
 *   every non-null value; an invalid pattern is logged once, here.
 * - "min_length" / "max_length" read a numeric `params.value` (defaulting to
 *   0 and Infinity) and measure the string form of the value; values with no
 *   string form count as length 0.
 * - "in_set" compares with SameValueZero against `params.values`; if that is
 *   not an array the rule passes everything.
 * - "format" looks `params.name` up in FORMAT_MATCHERS; an unknown or
 *   non-string name passes everything.
 * - An unrecognised rule type passes everything.
 *
 * @param rule - The rule to compile.
 * @returns A predicate returning true when a value satisfies the rule.
 */
function compileRule(rule: ValidationRule): (value: unknown) => boolean {
  // Doubles as the checker for unusable regex rules: only absent values pass.
  const isAbsent = (v: unknown): boolean => v === null || v === undefined;
  const passAll = (): boolean => true;

  switch (rule.ruleType) {
    case "not_null":
      return (v) => !isAbsent(v) && v !== "";

    case "regex": {
      const pat = rule.params["pattern"];
      if (typeof pat !== "string") return isAbsent;

      let re: RegExp;
      try {
        re = new RegExp(pat);
      } catch (err) {
        // eslint-disable-next-line no-console
        console.warn(
          `Invalid regex pattern for rule on column '${rule.column}': ${pat}. ` +
            `Error: ${err instanceof Error ? err.message : String(err)}. ` +
            `Rule will match no rows.`,
        );
        return isAbsent;
      }
      return (v) => {
        if (isAbsent(v)) return true;
        const str = valueToStr(v);
        // Values with no string form (objects, arrays, ...) never match.
        return str !== null && re.test(str);
      };
    }

    case "min_length": {
      const raw = rule.params["value"];
      const min = typeof raw === "number" ? raw : 0;
      return (v) => isAbsent(v) || (valueToStr(v) ?? "").length >= min;
    }

    case "max_length": {
      const raw = rule.params["value"];
      const max = typeof raw === "number" ? raw : Infinity;
      return (v) => isAbsent(v) || (valueToStr(v) ?? "").length <= max;
    }

    case "in_set": {
      const allowed = rule.params["values"];
      if (!Array.isArray(allowed)) return passAll;
      // Set.has uses SameValueZero, the same comparison as Array.includes,
      // but avoids a linear scan of the allowed list for every row.
      const allowedSet = new Set<unknown>(allowed);
      return (v) => isAbsent(v) || allowedSet.has(v);
    }

    case "format": {
      const name = rule.params["name"];
      if (typeof name !== "string") return passAll;
      // Own-property lookup only: names such as "constructor" or "toString"
      // must not resolve to inherited Object.prototype members, which are
      // not RegExps and would throw on `.test()`.
      if (!Object.prototype.hasOwnProperty.call(FORMAT_MATCHERS, name)) {
        return passAll;
      }
      const matcher = FORMAT_MATCHERS[name];
      if (!(matcher instanceof RegExp)) return passAll;
      return (v) => {
        if (isAbsent(v)) return true;
        const str = valueToStr(v);
        return str !== null && matcher.test(str);
      };
    }

    default:
      // Rule types outside the declared union (e.g. from untyped config)
      // are treated as always passing.
      return passAll;
  }
}
150
+
151
/**
 * Evaluate one value against one rule.
 *
 * The rule is compiled on every call, so an invalid regex pattern is
 * reported each time this is invoked.
 *
 * @param value - The cell value to check.
 * @param rule - The rule to apply.
 * @returns true if the rule passes for this value, false otherwise.
 */
export function checkRule(value: unknown, rule: ValidationRule): boolean {
  const passes = compileRule(rule);
  return passes(value);
}
155
+
156
/**
 * Build the identifier used for a rule in violation counts and row flags:
 * the column name and the rule type joined by a colon.
 */
function ruleKey(rule: ValidationRule): string {
  const { column, ruleType } = rule;
  return "".concat(column, ":", ruleType);
}
159
+
160
+ // ---------------------------------------------------------------------------
161
+ // validateRows
162
+ // ---------------------------------------------------------------------------
163
+
164
/**
 * Validate rows against a list of rules.
 *
 * Each row is shallow-copied and the rules are applied to the copy in the
 * order given; the input rows are never mutated.
 *
 * Actions:
 *   - "null": replace the failing value with null, row stays valid
 *   - "quarantine": move row to quarantine bucket; remaining rules are not
 *     evaluated for that row
 *   - "flag": add a `column:ruleType` entry to __flags__, row stays valid
 *
 * A quarantined row keeps any "null" replacements made by earlier rules,
 * but flags collected before the quarantine are not written to it.
 *
 * @param rows - Rows to validate.
 * @param rules - Rules to apply, in order.
 * @returns The valid rows, the quarantined rows, and a report with the
 *   per-rule violation counts (keyed by `column:ruleType`).
 */
export function validateRows(
  rows: readonly Row[],
  rules: readonly ValidationRule[],
): ValidationResult {
  const valid: Row[] = [];
  const quarantine: Row[] = [];
  const violations = new Map<string, number>();
  let flagged = 0;

  // Pre-compile all rule checkers (and their report keys) once. Any regex
  // error is logged exactly once here rather than once per row.
  const compiled = rules.map((rule) => ({
    rule,
    key: ruleKey(rule),
    check: compileRule(rule),
  }));

  for (const row of rows) {
    const current: Record<string, unknown> = { ...row };
    let shouldQuarantine = false;
    let wasFlagged = false;

    // Carry over flags already on the row, keeping only string entries.
    const priorFlags = current["__flags__"];
    const flags: string[] = Array.isArray(priorFlags)
      ? priorFlags.filter((f): f is string => typeof f === "string")
      : [];

    for (const { rule, key, check } of compiled) {
      // Read own properties only: a column named e.g. "constructor" or
      // "toString" that is absent from the row must be seen as missing,
      // not as the inherited Object.prototype member.
      const value = Object.prototype.hasOwnProperty.call(current, rule.column)
        ? current[rule.column]
        : undefined;
      if (check(value)) continue;

      violations.set(key, (violations.get(key) ?? 0) + 1);

      switch (rule.action) {
        case "null":
          current[rule.column] = null;
          break;
        case "quarantine":
          shouldQuarantine = true;
          break;
        case "flag":
          flags.push(key);
          wasFlagged = true;
          break;
      }
      // A quarantined row is out of the valid set; skip its remaining rules.
      if (shouldQuarantine) break;
    }

    if (shouldQuarantine) {
      quarantine.push(current as Row);
      continue;
    }

    if (wasFlagged) {
      current["__flags__"] = flags;
      flagged++;
    }
    valid.push(current as Row);
  }

  return {
    valid,
    quarantine,
    report: {
      totalRows: rows.length,
      quarantined: quarantine.length,
      flagged,
      ruleViolations: Object.fromEntries(violations),
    },
  };
}
package/src/index.ts ADDED
@@ -0,0 +1,8 @@
1
+ /**
2
+ * index.ts -- Main package entry point.
3
+ *
4
+ * Re-exports the edge-safe core API. For Node-only helpers (file I/O,
5
+ * config loading, CLI), import from `goldenmatch/node`.
6
+ */
7
+
8
+ export * from "./core/index.js";