goldenmatch 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (162) hide show
  1. package/README.md +140 -0
  2. package/dist/cli.cjs +6079 -0
  3. package/dist/cli.cjs.map +1 -0
  4. package/dist/cli.d.cts +1 -0
  5. package/dist/cli.d.ts +1 -0
  6. package/dist/cli.js +6076 -0
  7. package/dist/cli.js.map +1 -0
  8. package/dist/core/index.cjs +8449 -0
  9. package/dist/core/index.cjs.map +1 -0
  10. package/dist/core/index.d.cts +1972 -0
  11. package/dist/core/index.d.ts +1972 -0
  12. package/dist/core/index.js +8318 -0
  13. package/dist/core/index.js.map +1 -0
  14. package/dist/index.cjs +8449 -0
  15. package/dist/index.cjs.map +1 -0
  16. package/dist/index.d.cts +2 -0
  17. package/dist/index.d.ts +2 -0
  18. package/dist/index.js +8318 -0
  19. package/dist/index.js.map +1 -0
  20. package/dist/node/backends/score-worker.cjs +934 -0
  21. package/dist/node/backends/score-worker.cjs.map +1 -0
  22. package/dist/node/backends/score-worker.d.cts +14 -0
  23. package/dist/node/backends/score-worker.d.ts +14 -0
  24. package/dist/node/backends/score-worker.js +932 -0
  25. package/dist/node/backends/score-worker.js.map +1 -0
  26. package/dist/node/index.cjs +11430 -0
  27. package/dist/node/index.cjs.map +1 -0
  28. package/dist/node/index.d.cts +554 -0
  29. package/dist/node/index.d.ts +554 -0
  30. package/dist/node/index.js +11277 -0
  31. package/dist/node/index.js.map +1 -0
  32. package/dist/types-DhUdX5Rc.d.cts +304 -0
  33. package/dist/types-DhUdX5Rc.d.ts +304 -0
  34. package/examples/01-basic-dedupe.ts +60 -0
  35. package/examples/02-match-two-datasets.ts +48 -0
  36. package/examples/03-csv-file-pipeline.ts +62 -0
  37. package/examples/04-string-scoring.ts +63 -0
  38. package/examples/05-custom-config.ts +94 -0
  39. package/examples/06-probabilistic-fs.ts +72 -0
  40. package/examples/07-pprl-privacy.ts +76 -0
  41. package/examples/08-streaming.ts +79 -0
  42. package/examples/09-llm-scorer.ts +79 -0
  43. package/examples/10-explain.ts +60 -0
  44. package/examples/11-evaluate.ts +61 -0
  45. package/examples/README.md +53 -0
  46. package/package.json +66 -0
  47. package/src/cli.ts +372 -0
  48. package/src/core/ann-blocker.ts +593 -0
  49. package/src/core/api.ts +220 -0
  50. package/src/core/autoconfig.ts +363 -0
  51. package/src/core/autofix.ts +102 -0
  52. package/src/core/blocker.ts +655 -0
  53. package/src/core/cluster.ts +699 -0
  54. package/src/core/compare-clusters.ts +176 -0
  55. package/src/core/config/loader.ts +869 -0
  56. package/src/core/cross-encoder.ts +614 -0
  57. package/src/core/data.ts +430 -0
  58. package/src/core/domain.ts +277 -0
  59. package/src/core/embedder.ts +562 -0
  60. package/src/core/evaluate.ts +156 -0
  61. package/src/core/explain.ts +352 -0
  62. package/src/core/golden.ts +524 -0
  63. package/src/core/graph-er.ts +371 -0
  64. package/src/core/index.ts +314 -0
  65. package/src/core/ingest.ts +112 -0
  66. package/src/core/learned-blocking.ts +305 -0
  67. package/src/core/lineage.ts +221 -0
  68. package/src/core/llm/budget.ts +258 -0
  69. package/src/core/llm/cluster.ts +542 -0
  70. package/src/core/llm/scorer.ts +396 -0
  71. package/src/core/match-one.ts +95 -0
  72. package/src/core/matchkey.ts +97 -0
  73. package/src/core/memory/corrections.ts +179 -0
  74. package/src/core/memory/learner.ts +218 -0
  75. package/src/core/memory/store.ts +114 -0
  76. package/src/core/pipeline.ts +366 -0
  77. package/src/core/pprl/protocol.ts +216 -0
  78. package/src/core/probabilistic.ts +511 -0
  79. package/src/core/profiler.ts +212 -0
  80. package/src/core/quality.ts +197 -0
  81. package/src/core/review-queue.ts +177 -0
  82. package/src/core/scorer.ts +855 -0
  83. package/src/core/sensitivity.ts +196 -0
  84. package/src/core/standardize.ts +279 -0
  85. package/src/core/streaming.ts +128 -0
  86. package/src/core/transforms.ts +599 -0
  87. package/src/core/types.ts +570 -0
  88. package/src/core/validate.ts +243 -0
  89. package/src/index.ts +8 -0
  90. package/src/node/a2a/server.ts +470 -0
  91. package/src/node/api/server.ts +412 -0
  92. package/src/node/backends/duckdb.ts +130 -0
  93. package/src/node/backends/score-worker.ts +41 -0
  94. package/src/node/backends/workers.ts +212 -0
  95. package/src/node/config-file.ts +66 -0
  96. package/src/node/connectors/base.ts +57 -0
  97. package/src/node/connectors/bigquery.ts +61 -0
  98. package/src/node/connectors/databricks.ts +69 -0
  99. package/src/node/connectors/file.ts +350 -0
  100. package/src/node/connectors/hubspot.ts +62 -0
  101. package/src/node/connectors/index.ts +43 -0
  102. package/src/node/connectors/salesforce.ts +93 -0
  103. package/src/node/connectors/snowflake.ts +73 -0
  104. package/src/node/db/postgres.ts +173 -0
  105. package/src/node/db/sync.ts +103 -0
  106. package/src/node/dedupe-file.ts +156 -0
  107. package/src/node/index.ts +89 -0
  108. package/src/node/mcp/server.ts +940 -0
  109. package/src/node/tui/app.ts +756 -0
  110. package/src/node/tui/index.ts +6 -0
  111. package/src/node/tui/widgets.ts +128 -0
  112. package/tests/parity/scorer-ground-truth.test.ts +118 -0
  113. package/tests/smoke.test.ts +46 -0
  114. package/tests/unit/a2a-server.test.ts +175 -0
  115. package/tests/unit/ann-blocker.test.ts +117 -0
  116. package/tests/unit/api-server.test.ts +239 -0
  117. package/tests/unit/api.test.ts +77 -0
  118. package/tests/unit/autoconfig.test.ts +103 -0
  119. package/tests/unit/autofix.test.ts +71 -0
  120. package/tests/unit/blocker.test.ts +164 -0
  121. package/tests/unit/buildBlocksAsync.test.ts +63 -0
  122. package/tests/unit/cluster.test.ts +213 -0
  123. package/tests/unit/compare-clusters.test.ts +42 -0
  124. package/tests/unit/config-loader.test.ts +301 -0
  125. package/tests/unit/connectors-base.test.ts +48 -0
  126. package/tests/unit/cross-encoder-model.test.ts +198 -0
  127. package/tests/unit/cross-encoder.test.ts +173 -0
  128. package/tests/unit/db-connectors.test.ts +37 -0
  129. package/tests/unit/domain.test.ts +80 -0
  130. package/tests/unit/embedder.test.ts +151 -0
  131. package/tests/unit/evaluate.test.ts +85 -0
  132. package/tests/unit/explain.test.ts +73 -0
  133. package/tests/unit/golden.test.ts +97 -0
  134. package/tests/unit/graph-er.test.ts +173 -0
  135. package/tests/unit/hnsw-ann.test.ts +283 -0
  136. package/tests/unit/hubspot-connector.test.ts +118 -0
  137. package/tests/unit/ingest.test.ts +97 -0
  138. package/tests/unit/learned-blocking.test.ts +134 -0
  139. package/tests/unit/lineage.test.ts +135 -0
  140. package/tests/unit/match-one.test.ts +129 -0
  141. package/tests/unit/matchkey.test.ts +97 -0
  142. package/tests/unit/mcp-server.test.ts +183 -0
  143. package/tests/unit/memory.test.ts +119 -0
  144. package/tests/unit/pipeline.test.ts +118 -0
  145. package/tests/unit/pprl-protocol.test.ts +381 -0
  146. package/tests/unit/probabilistic.test.ts +494 -0
  147. package/tests/unit/profiler.test.ts +68 -0
  148. package/tests/unit/review-queue.test.ts +68 -0
  149. package/tests/unit/salesforce-connector.test.ts +148 -0
  150. package/tests/unit/scorer.test.ts +301 -0
  151. package/tests/unit/sensitivity.test.ts +154 -0
  152. package/tests/unit/standardize.test.ts +84 -0
  153. package/tests/unit/streaming.test.ts +82 -0
  154. package/tests/unit/transforms.test.ts +208 -0
  155. package/tests/unit/tui-widgets.test.ts +42 -0
  156. package/tests/unit/tui.test.ts +24 -0
  157. package/tests/unit/validate.test.ts +145 -0
  158. package/tests/unit/workers-parallel.test.ts +99 -0
  159. package/tests/unit/workers.test.ts +74 -0
  160. package/tsconfig.json +25 -0
  161. package/tsup.config.ts +37 -0
  162. package/vitest.config.ts +11 -0
@@ -0,0 +1,212 @@
1
+ /**
2
+ * profiler.ts — Lightweight per-column data profiler.
3
+ * Edge-safe: no `node:` imports.
4
+ *
5
+ * Ports parts of goldenmatch/core/profiler.py that autoconfig relies on.
6
+ */
7
+
8
+ import type { Row } from "./types.js";
9
+
10
+ // ---------------------------------------------------------------------------
11
+ // Types
12
+ // ---------------------------------------------------------------------------
13
+
14
/** Coarse semantic category that the profiler can assign to a column. */
export type ColumnType =
  | "email"
  | "phone"
  | "zip"
  | "date"
  | "name"
  | "geo"
  | "id"
  | "numeric"
  | "text";

/** Summary statistics for one column of a dataset. */
export interface ColumnProfile {
  /** Column name (the row key it was read from). */
  readonly name: string;
  /** `nullCount / totalCount`, or 0 when there are no values. */
  readonly nullRate: number;
  /** Number of values treated as missing (null, undefined, blank string). */
  readonly nullCount: number;
  /** Number of values examined, missing ones included. */
  readonly totalCount: number;
  /** Number of distinct non-missing values (compared as strings). */
  readonly distinctCount: number;
  /** `distinctCount / totalCount`, or 0 when there are no values. */
  readonly cardinalityRatio: number;
  /** Semantic type guessed from the values and the column name. */
  readonly inferredType: ColumnType;
  /** Mean string length of the non-missing values (0 when there are none). */
  readonly avgLength: number;
  /** Longest string length among the non-missing values. */
  readonly maxLength: number;
  /** Up to five distinct values, in first-seen order. */
  readonly sampleValues: readonly string[];
}

/** Profile of a whole dataset: row count plus one profile per column. */
export interface DatasetProfile {
  /** Number of rows that were profiled. */
  readonly rowCount: number;
  /** Column profiles in column-discovery order. */
  readonly columns: readonly ColumnProfile[];
  /** The same profiles, keyed by column name. */
  readonly byName: Readonly<Record<string, ColumnProfile>>;
}
43
+
44
+ // ---------------------------------------------------------------------------
45
+ // Regex heuristics
46
+ // ---------------------------------------------------------------------------
47
+
48
/** Whole-value email shape: one `@`, no whitespace, a dot in the domain part. */
const EMAIL_VALUE_RE = /^[^\s@]+@[^\s@]+\.[^\s@]+$/;

/** Punctuation and whitespace removed before counting a phone number's digits. */
const PHONE_STRIP_RE = /[()\-+.\s]/g;

/**
 * Accepted date shapes, in order: day/month first with `/` or `-`,
 * year first with `/` or `-`, and "D Month YYYY" with single spaces.
 */
const DATE_VALUE_RES: readonly RegExp[] = [
  /^\d{1,2}[/\-]\d{1,2}[/\-]\d{2,4}$/,
  /^\d{4}[/\-]\d{1,2}[/\-]\d{1,2}$/,
  /^\d{1,2}\s[A-Za-z]+\s\d{2,4}$/,
];

/** Five digits, optionally followed by four more (with or without a dash). */
const ZIP_VALUE_RE = /^\d{5}(-?\d{4})?$/;

/**
 * Alphabetic name shape: 2-30 characters starting and ending with a letter,
 * with letters, spaces, hyphens and apostrophes allowed in between.
 */
const NAME_VALUE_RE = /^[A-Za-z][A-Za-z \-']{0,28}[A-Za-z]$|^[A-Za-z]{2,3}$/;
57
+
58
+ // ---------------------------------------------------------------------------
59
+ // Per-column profiling
60
+ // ---------------------------------------------------------------------------
61
+
62
/**
 * Normalize a raw cell value for profiling.
 *
 * @param value - Any cell value.
 * @returns `null` for null, undefined, or a string that is empty after
 *   trimming; the trimmed text for other strings; `String(value)` for
 *   every other value.
 */
function toStringOrNull(value: unknown): string | null {
  // `== null` deliberately matches both null and undefined.
  if (value == null) return null;
  if (typeof value !== "string") return String(value);

  const trimmed = value.trim();
  return trimmed === "" ? null : trimmed;
}
70
+
71
/**
 * Infer a coarse semantic type for a column from a sample of its non-null
 * string values and from its name.
 *
 * Checks run in a fixed priority order and the first one that passes wins:
 * email, date, phone, zip, geo (by column name), id (by column name), name,
 * numeric; anything else is "text". Value-based checks need more than 60% of
 * the values to match (more than 80% for numeric).
 *
 * The date check runs before the phone check on purpose: the phone test
 * strips dashes before counting digits, so dash-separated dates such as
 * "2024-01-15" reduce to 8 digits and would otherwise be labelled "phone".
 *
 * NOTE(review): a column made up entirely of ZIP+4 values still satisfies the
 * phone test first (9 digits) — confirm whether that is intended.
 *
 * @param values - Non-null, already-trimmed string values to inspect.
 * @param columnName - Column name, used by the geo and id heuristics.
 * @returns The inferred type; "text" when `values` is empty.
 */
function guessType(values: readonly string[], columnName: string): ColumnType {
  const n = values.length;
  if (n === 0) return "text";
  const lname = columnName.toLowerCase();

  // Fraction of the sampled values accepted by `matches`.
  const share = (matches: (v: string) => boolean): number => {
    let hits = 0;
    for (const v of values) {
      if (matches(v)) hits++;
    }
    return hits / n;
  };

  // Email: >60% look like addresses
  if (share((v) => EMAIL_VALUE_RE.test(v)) > 0.6) return "email";

  // Date: must precede the phone check (see the doc comment above).
  if (share((v) => DATE_VALUE_RES.some((re) => re.test(v))) > 0.6) return "date";

  // Phone: 7-15 digits once common punctuation and whitespace are removed.
  const digitsOnly = /^\d+$/;
  const isPhoneLike = (v: string): boolean => {
    const stripped = v.replace(PHONE_STRIP_RE, "");
    return digitsOnly.test(stripped) && stripped.length >= 7 && stripped.length <= 15;
  };
  if (share(isPhoneLike) > 0.6) return "phone";

  // Zip: 5 or 9 digits (with optional dash)
  if (share((v) => ZIP_VALUE_RE.test(v)) > 0.6) return "zip";

  // Geographic columns, decided by column name alone.
  if (/^(city|state|county|country|region|province)/i.test(lname)) return "geo";
  if (/city_desc|state_cd|country_code|state_code/i.test(lname)) return "geo";

  // Identifier columns, decided by column name alone.
  if (/^id$|_id$|uuid|guid/i.test(lname)) return "id";

  // Name: >60% match the alphabetic name pattern
  if (share((v) => NAME_VALUE_RE.test(v)) > 0.6) return "name";

  // Numeric: >80% are plain (optionally negative, optionally decimal) numbers
  const plainNumber = /^-?\d+(\.\d+)?$/;
  if (share((v) => plainNumber.test(v)) > 0.8) return "numeric";

  return "text";
}
130
+
131
/**
 * Build the profile for a single column.
 *
 * Values are normalized with `toStringOrNull`; anything that normalizes to
 * `null` counts as missing. Distinct counts, lengths and samples are computed
 * over the remaining strings, and the type is guessed from at most the first
 * 500 of them.
 *
 * @param name - Column name.
 * @param rawValues - One raw value per row for this column.
 * @returns The column's statistics; ratios are 0 when `rawValues` is empty.
 */
function profileColumn(name: string, rawValues: readonly unknown[]): ColumnProfile {
  const present: string[] = [];
  for (const raw of rawValues) {
    const text = toStringOrNull(raw);
    if (text !== null) present.push(text);
  }

  const totalCount = rawValues.length;
  const nullCount = totalCount - present.length;
  const hasValues = totalCount > 0;

  const uniques = new Set(present);

  let lengthSum = 0;
  let maxLength = 0;
  for (const text of present) {
    lengthSum += text.length;
    maxLength = Math.max(maxLength, text.length);
  }

  // First five distinct values, in first-seen order.
  const sampleValues: string[] = [];
  for (const text of uniques) {
    if (sampleValues.length === 5) break;
    sampleValues.push(text);
  }

  return {
    name,
    nullRate: hasValues ? nullCount / totalCount : 0,
    nullCount,
    totalCount,
    distinctCount: uniques.size,
    cardinalityRatio: hasValues ? uniques.size / totalCount : 0,
    // Only the first 500 values feed the type guess, to bound its cost.
    inferredType: guessType(present.slice(0, 500), name),
    avgLength: present.length > 0 ? lengthSum / present.length : 0,
    maxLength,
    sampleValues,
  };
}
178
+
179
+ // ---------------------------------------------------------------------------
180
+ // Public API
181
+ // ---------------------------------------------------------------------------
182
+
183
/**
 * Profile every column of a row array.
 *
 * Column names are gathered from all rows (not only the first), in first-seen
 * order; keys starting with `__` are skipped. A row that lacks a column
 * contributes `undefined` for it.
 *
 * @param rows - Rows to profile.
 * @returns The row count plus per-column profiles, as a list and by name.
 */
export function profileRows(rows: readonly Row[]): DatasetProfile {
  const rowCount = rows.length;
  if (rowCount === 0) {
    return { rowCount: 0, columns: [], byName: {} };
  }

  const columnNames = new Set<string>();
  rows.forEach((row) => {
    Object.keys(row).forEach((key) => {
      if (!key.startsWith("__")) columnNames.add(key);
    });
  });

  const columns: ColumnProfile[] = [];
  const byName: Record<string, ColumnProfile> = {};
  columnNames.forEach((column) => {
    const profile = profileColumn(
      column,
      rows.map((row) => row[column]),
    );
    columns.push(profile);
    byName[column] = profile;
  });

  return { rowCount, columns, byName };
}
@@ -0,0 +1,197 @@
1
+ /**
2
+ * quality.ts — Lightweight quality scan stub.
3
+ * Edge-safe: no Node.js imports, pure TypeScript only.
4
+ *
5
+ * Ports a subset of goldenmatch/core/quality.py. The Python version
6
+ * integrates with GoldenCheck; this port only provides the interface and a
7
+ * handful of basic heuristics that are safe to run client-side.
8
+ */
9
+
10
+ import type { Row, QualityConfig } from "./types.js";
11
+
12
+ // ---------------------------------------------------------------------------
13
+ // Types
14
+ // ---------------------------------------------------------------------------
15
+
16
/** How serious a quality finding is. */
export type QualitySeverity = "info" | "warn" | "error";

/** One issue reported for one column. */
export interface QualityFinding {
  /** Column the finding applies to. */
  readonly column: string;
  /** Human-readable description of the issue. */
  readonly issue: string;
  /** Severity assigned by the heuristic that produced the finding. */
  readonly severity: QualitySeverity;
  /** Number of rows counted as affected by the issue. */
  readonly affectedRows: number;
  /** A few raw values from the column, for context (may be empty). */
  readonly sampleValues: readonly unknown[];
}

/** Result of a quality run: the rows together with the findings. */
export interface QualityRunResult {
  readonly rows: readonly Row[];
  readonly findings: readonly QualityFinding[];
}
30
+
31
+ // ---------------------------------------------------------------------------
32
+ // Pattern detectors
33
+ // ---------------------------------------------------------------------------
34
+
35
/** Well-formed email shape: one `@`, no whitespace, a dot in the domain part. */
const EMAIL_RE = /^[^@\s]+@[^@\s]+\.[^@\s]+$/;

/** A value made up of decimal digits only. */
const DIGITS_RE = /^\d+$/;

/**
 * Recognized date layouts. The index of the first pattern a value matches
 * identifies its format when looking for mixed formats within a column.
 */
const DATE_PATTERNS: readonly RegExp[] = [
  /^\d{4}-\d{2}-\d{2}$/, // ISO
  /^\d{1,2}\/\d{1,2}\/\d{2,4}$/, // US
  /^\d{1,2}-\d{1,2}-\d{2,4}$/, // dash-separated, day/month first
  /^\d{8}$/, // yyyymmdd
];
43
+
44
/**
 * List the column names present anywhere in `rows`.
 *
 * @param rows - Rows to inspect.
 * @returns Distinct keys in first-seen order, skipping keys that start with `__`.
 */
function collectColumns(rows: readonly Row[]): string[] {
  const seen = new Set<string>();
  rows.forEach((row) => {
    Object.keys(row)
      .filter((key) => !key.startsWith("__"))
      .forEach((key) => seen.add(key));
  });
  return Array.from(seen);
}
53
+
54
/**
 * Convert a primitive cell value to a string for pattern checks.
 *
 * Strings are returned unchanged (not trimmed). Numbers, booleans and bigints
 * are converted with `String()`; bigint is included because, like number, it
 * has a plain decimal text form and would otherwise be left out of the
 * distinct-value statistics.
 *
 * @param v - Any cell value.
 * @returns The string form, or `null` for null, undefined, and every
 *   non-primitive value (objects, arrays, functions, symbols).
 */
function asStr(v: unknown): string | null {
  switch (typeof v) {
    case "string":
      return v;
    case "number":
    case "boolean":
    case "bigint":
      return String(v);
    default:
      // Covers null (typeof "object"), undefined, objects, functions, symbols.
      return null;
  }
}
60
+
61
+ // ---------------------------------------------------------------------------
62
+ // scanQuality
63
+ // ---------------------------------------------------------------------------
64
+
65
/**
 * Run a few cheap heuristics over every column of the dataset.
 *
 * Findings produced per column:
 * - High null rate: more than 50% of rows are null, undefined or `""`
 *   ("error" above 90%, otherwise "warn").
 * - Constant column: exactly one distinct value, and that value covers more
 *   than 1000 non-null rows (cardinality ratio below 0.001).
 * - Malformed email values: values containing `@` that fail the email pattern.
 * - Inconsistent date formats: date-like values matching more than one pattern.
 * - Numeric strings: every non-null value is a string of digits and there is
 *   more than one distinct value. Values that are already numbers are not
 *   reported, since there is nothing left to type.
 *
 * @param rows - Rows to scan; column names are collected from all rows.
 * @param _config - Accepted for interface compatibility; not read here.
 * @returns Findings in column order; empty when `rows` is empty.
 */
export function scanQuality(
  rows: readonly Row[],
  _config?: QualityConfig,
): QualityFinding[] {
  const findings: QualityFinding[] = [];
  if (rows.length === 0) return findings;

  const total = rows.length;
  const columns = collectColumns(rows);

  for (const col of columns) {
    let nullCount = 0;
    let emailLike = 0;
    let malformedEmail = 0;
    let dateLike = 0;
    // Non-null values whose runtime type is string (for the numeric-strings check).
    let stringValued = 0;
    const dateFormatsSeen = new Set<number>();
    const nonNullSamples: unknown[] = [];
    const distinct = new Set<string>();

    for (const row of rows) {
      const raw = row[col];
      if (raw === null || raw === undefined || raw === "") {
        nullCount++;
        continue;
      }
      if (nonNullSamples.length < 5) nonNullSamples.push(raw);
      if (typeof raw === "string") stringValued++;

      const s = asStr(raw);
      if (s === null) continue; // non-primitive value: counted as non-null only
      distinct.add(s);

      // Email heuristics: anything containing "@" is treated as an attempted email.
      if (s.includes("@")) {
        emailLike++;
        if (!EMAIL_RE.test(s)) malformedEmail++;
      }

      // Date format tracking: record the first pattern the value matches.
      const formatIndex = DATE_PATTERNS.findIndex((re) => re.test(s));
      if (formatIndex !== -1) {
        dateFormatsSeen.add(formatIndex);
        dateLike++;
      }
    }

    const nullRate = nullCount / total;
    if (nullRate > 0.5) {
      findings.push({
        column: col,
        issue: `High null rate: ${(nullRate * 100).toFixed(1)}%`,
        severity: nullRate > 0.9 ? "error" : "warn",
        affectedRows: nullCount,
        sampleValues: [],
      });
    }

    const nonNull = total - nullCount;
    if (nonNull > 0) {
      const cardinalityRatio = distinct.size / nonNull;
      // Require exactly one distinct value: an empty set means the column
      // holds only non-stringifiable values, which is not "constant".
      if (cardinalityRatio < 0.001 && distinct.size === 1) {
        findings.push({
          column: col,
          issue: "Constant column (single distinct non-null value)",
          severity: "info",
          affectedRows: nonNull,
          sampleValues: nonNullSamples,
        });
      }
    }

    if (emailLike > 0 && malformedEmail > 0) {
      findings.push({
        column: col,
        issue: `Malformed email values (${malformedEmail} of ${emailLike})`,
        severity: "warn",
        affectedRows: malformedEmail,
        sampleValues: nonNullSamples,
      });
    }

    if (dateLike > 0 && dateFormatsSeen.size > 1) {
      findings.push({
        column: col,
        issue: `Inconsistent date formats (${dateFormatsSeen.size} distinct patterns)`,
        severity: "warn",
        affectedRows: dateLike,
        sampleValues: nonNullSamples,
      });
    }

    // Numeric-looking string column. Only meaningful when every non-null value
    // really is a string; numbers are stringified for the distinct set and
    // would otherwise be flagged even though they are already typed.
    if (nonNull > 0 && stringValued === nonNull && distinct.size > 1) {
      let allDigits = true;
      for (const v of distinct) {
        if (!DIGITS_RE.test(v)) {
          allDigits = false;
          break;
        }
      }
      if (allDigits) {
        findings.push({
          column: col,
          issue: "Column contains only numeric strings (consider typing)",
          severity: "info",
          affectedRows: nonNull,
          sampleValues: nonNullSamples,
        });
      }
    }
  }

  return findings;
}
180
+
181
+ // ---------------------------------------------------------------------------
182
+ // runQualityCheck
183
+ // ---------------------------------------------------------------------------
184
+
185
/**
 * Scan-only quality run: compute findings and hand the rows back untouched.
 *
 * Mirrors `_scan_only` / `run_quality_check` from the Python module: no
 * GoldenCheck, no row rewrites, just reportable findings.
 *
 * @param rows - Rows to scan; returned as the same array reference.
 * @param config - Forwarded to `scanQuality`.
 * @returns The input rows plus the findings from `scanQuality`.
 */
export function runQualityCheck(
  rows: readonly Row[],
  config?: QualityConfig,
): QualityRunResult {
  return { rows, findings: scanQuality(rows, config) };
}
@@ -0,0 +1,177 @@
1
+ /**
2
+ * review-queue.ts — Human-in-the-loop pair gating.
3
+ * Edge-safe: no Node.js imports, pure TypeScript only.
4
+ *
5
+ * Ports goldenmatch/core/review_queue.py. Default gates: >=0.95 auto-approve,
6
+ * <0.75 auto-reject, everything in between needs review.
7
+ */
8
+
9
+ import type { ScoredPair } from "./types.js";
10
+
11
+ // ---------------------------------------------------------------------------
12
+ // Types
13
+ // ---------------------------------------------------------------------------
14
+
15
/** Lifecycle state of a pair awaiting (or past) human review. */
export type ReviewStatus = "pending" | "approved" | "rejected";

/** A candidate pair held for review. */
export interface ReviewItem {
  /** Canonical pair id, formatted as "minId:maxId". */
  readonly pairId: string;
  /** The smaller of the two record ids. */
  readonly idA: number;
  /** The larger of the two record ids. */
  readonly idB: number;
  /** Match score copied from the scored pair. */
  readonly score: number;
  /** Current review state. */
  readonly status: ReviewStatus;
  /** Creation time in milliseconds since the Unix epoch (`Date.now()`). */
  readonly createdAt: number;
}

/** The three buckets produced by gating scored pairs. */
export interface GatedResult {
  /** Pairs scoring at or above the approval threshold. */
  readonly autoApproved: readonly ScoredPair[];
  /** Pairs between the two thresholds, wrapped as pending review items. */
  readonly needsReview: readonly ReviewItem[];
  /** Pairs scoring below the rejection threshold. */
  readonly rejected: readonly ScoredPair[];
}

/** Optional threshold overrides for gating. */
export interface GateOptions {
  /** Scores `>=` this value are auto-approved (default 0.95). */
  readonly approveAbove?: number;
  /** Scores `<` this value are auto-rejected (default 0.75). */
  readonly rejectBelow?: number;
}
36
+
37
+ // ---------------------------------------------------------------------------
38
+ // Helpers
39
+ // ---------------------------------------------------------------------------
40
+
41
/**
 * Order two record ids so that the smaller one comes first.
 *
 * @returns `[a, b]` when `a < b`, otherwise `[b, a]`.
 */
function canonicalIds(a: number, b: number): [number, number] {
  if (a < b) {
    return [a, b];
  }
  return [b, a];
}
44
+
45
/**
 * Build the canonical id for a pair of record ids.
 *
 * @returns "minId:maxId", independent of argument order.
 */
function pairIdFor(a: number, b: number): string {
  return canonicalIds(a, b).join(":");
}
49
+
50
/**
 * Current time in milliseconds since the Unix epoch.
 *
 * Uses `Date.now`, which needs no Node.js imports and so keeps this module
 * edge-safe.
 */
function now(): number {
  const timestamp = Date.now();
  return timestamp;
}
54
+
55
+ // ---------------------------------------------------------------------------
56
+ // gatePairs
57
+ // ---------------------------------------------------------------------------
58
+
59
/**
 * Sort scored pairs into auto-approved, needs-review and rejected buckets.
 *
 * A pair is auto-approved when `score >= approveAbove` and rejected when
 * `score < rejectBelow`; every other pair becomes a pending review item with
 * canonically ordered ids. All review items from one call share the same
 * `createdAt` timestamp.
 *
 * Defaults: approveAbove=0.95, rejectBelow=0.75.
 *
 * @param pairs - Scored pairs to gate.
 * @param options - Optional threshold overrides.
 * @returns The three buckets, each in input order.
 */
export function gatePairs(
  pairs: readonly ScoredPair[],
  options?: GateOptions,
): GatedResult {
  const approveAbove = options?.approveAbove ?? 0.95;
  const rejectBelow = options?.rejectBelow ?? 0.75;
  const stampedAt = now();

  const autoApproved: ScoredPair[] = [];
  const needsReview: ReviewItem[] = [];
  const rejected: ScoredPair[] = [];

  pairs.forEach((pair) => {
    const { score } = pair;
    if (score >= approveAbove) {
      autoApproved.push(pair);
      return;
    }
    if (score < rejectBelow) {
      rejected.push(pair);
      return;
    }
    // Borderline: queue for a human, keyed by the ordered id pair.
    const [idA, idB] = canonicalIds(pair.idA, pair.idB);
    needsReview.push({
      pairId: `${idA}:${idB}`,
      idA,
      idB,
      score,
      status: "pending",
      createdAt: stampedAt,
    });
  });

  return { autoApproved, needsReview, rejected };
}
96
+
97
+ // ---------------------------------------------------------------------------
98
+ // ReviewQueue
99
+ // ---------------------------------------------------------------------------
100
+
101
/**
 * In-memory queue of borderline pairs awaiting a human decision.
 *
 * Items are keyed by canonical pair id ("minId:maxId") and are replaced, not
 * mutated, when their status changes.
 */
export class ReviewQueue {
  private readonly items = new Map<string, ReviewItem>();

  /**
   * Queue a pair as pending. Does nothing if the same pair (in either id
   * order) is already present, whatever its current status.
   */
  add(pair: ScoredPair): void {
    const [idA, idB] = canonicalIds(pair.idA, pair.idB);
    const pairId = `${idA}:${idB}`;
    if (!this.items.has(pairId)) {
      this.items.set(pairId, {
        pairId,
        idA,
        idB,
        score: pair.score,
        status: "pending",
        createdAt: now(),
      });
    }
  }

  /** Look up an item by canonical pair id ("minId:maxId"). */
  get(pairId: string): ReviewItem | undefined {
    return this.items.get(pairId);
  }

  /** Set a pair's status to approved; unknown ids are ignored. */
  approve(pairId: string): void {
    const current = this.items.get(pairId);
    if (current !== undefined) {
      this.items.set(pairId, { ...current, status: "approved" });
    }
  }

  /** Set a pair's status to rejected; unknown ids are ignored. */
  reject(pairId: string): void {
    const current = this.items.get(pairId);
    if (current !== undefined) {
      this.items.set(pairId, { ...current, status: "rejected" });
    }
  }

  /** Items still awaiting a decision, in insertion order. */
  pending(): ReviewItem[] {
    return Array.from(this.items.values()).filter(
      (item) => item.status === "pending",
    );
  }

  /** Items that have been approved, in insertion order. */
  approved(): ReviewItem[] {
    return Array.from(this.items.values()).filter(
      (item) => item.status === "approved",
    );
  }

  /** Items that have been rejected, in insertion order. */
  rejected(): ReviewItem[] {
    return Array.from(this.items.values()).filter(
      (item) => item.status === "rejected",
    );
  }

  /** Number of items in the queue, across all statuses. */
  size(): number {
    return this.items.size;
  }

  /** Canonical pair id for two record ids ("minId:maxId"). */
  static pairIdFor(a: number, b: number): string {
    return pairIdFor(a, b);
  }
}