goldenmatch 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (162) hide show
  1. package/README.md +140 -0
  2. package/dist/cli.cjs +6079 -0
  3. package/dist/cli.cjs.map +1 -0
  4. package/dist/cli.d.cts +1 -0
  5. package/dist/cli.d.ts +1 -0
  6. package/dist/cli.js +6076 -0
  7. package/dist/cli.js.map +1 -0
  8. package/dist/core/index.cjs +8449 -0
  9. package/dist/core/index.cjs.map +1 -0
  10. package/dist/core/index.d.cts +1972 -0
  11. package/dist/core/index.d.ts +1972 -0
  12. package/dist/core/index.js +8318 -0
  13. package/dist/core/index.js.map +1 -0
  14. package/dist/index.cjs +8449 -0
  15. package/dist/index.cjs.map +1 -0
  16. package/dist/index.d.cts +2 -0
  17. package/dist/index.d.ts +2 -0
  18. package/dist/index.js +8318 -0
  19. package/dist/index.js.map +1 -0
  20. package/dist/node/backends/score-worker.cjs +934 -0
  21. package/dist/node/backends/score-worker.cjs.map +1 -0
  22. package/dist/node/backends/score-worker.d.cts +14 -0
  23. package/dist/node/backends/score-worker.d.ts +14 -0
  24. package/dist/node/backends/score-worker.js +932 -0
  25. package/dist/node/backends/score-worker.js.map +1 -0
  26. package/dist/node/index.cjs +11430 -0
  27. package/dist/node/index.cjs.map +1 -0
  28. package/dist/node/index.d.cts +554 -0
  29. package/dist/node/index.d.ts +554 -0
  30. package/dist/node/index.js +11277 -0
  31. package/dist/node/index.js.map +1 -0
  32. package/dist/types-DhUdX5Rc.d.cts +304 -0
  33. package/dist/types-DhUdX5Rc.d.ts +304 -0
  34. package/examples/01-basic-dedupe.ts +60 -0
  35. package/examples/02-match-two-datasets.ts +48 -0
  36. package/examples/03-csv-file-pipeline.ts +62 -0
  37. package/examples/04-string-scoring.ts +63 -0
  38. package/examples/05-custom-config.ts +94 -0
  39. package/examples/06-probabilistic-fs.ts +72 -0
  40. package/examples/07-pprl-privacy.ts +76 -0
  41. package/examples/08-streaming.ts +79 -0
  42. package/examples/09-llm-scorer.ts +79 -0
  43. package/examples/10-explain.ts +60 -0
  44. package/examples/11-evaluate.ts +61 -0
  45. package/examples/README.md +53 -0
  46. package/package.json +66 -0
  47. package/src/cli.ts +372 -0
  48. package/src/core/ann-blocker.ts +593 -0
  49. package/src/core/api.ts +220 -0
  50. package/src/core/autoconfig.ts +363 -0
  51. package/src/core/autofix.ts +102 -0
  52. package/src/core/blocker.ts +655 -0
  53. package/src/core/cluster.ts +699 -0
  54. package/src/core/compare-clusters.ts +176 -0
  55. package/src/core/config/loader.ts +869 -0
  56. package/src/core/cross-encoder.ts +614 -0
  57. package/src/core/data.ts +430 -0
  58. package/src/core/domain.ts +277 -0
  59. package/src/core/embedder.ts +562 -0
  60. package/src/core/evaluate.ts +156 -0
  61. package/src/core/explain.ts +352 -0
  62. package/src/core/golden.ts +524 -0
  63. package/src/core/graph-er.ts +371 -0
  64. package/src/core/index.ts +314 -0
  65. package/src/core/ingest.ts +112 -0
  66. package/src/core/learned-blocking.ts +305 -0
  67. package/src/core/lineage.ts +221 -0
  68. package/src/core/llm/budget.ts +258 -0
  69. package/src/core/llm/cluster.ts +542 -0
  70. package/src/core/llm/scorer.ts +396 -0
  71. package/src/core/match-one.ts +95 -0
  72. package/src/core/matchkey.ts +97 -0
  73. package/src/core/memory/corrections.ts +179 -0
  74. package/src/core/memory/learner.ts +218 -0
  75. package/src/core/memory/store.ts +114 -0
  76. package/src/core/pipeline.ts +366 -0
  77. package/src/core/pprl/protocol.ts +216 -0
  78. package/src/core/probabilistic.ts +511 -0
  79. package/src/core/profiler.ts +212 -0
  80. package/src/core/quality.ts +197 -0
  81. package/src/core/review-queue.ts +177 -0
  82. package/src/core/scorer.ts +855 -0
  83. package/src/core/sensitivity.ts +196 -0
  84. package/src/core/standardize.ts +279 -0
  85. package/src/core/streaming.ts +128 -0
  86. package/src/core/transforms.ts +599 -0
  87. package/src/core/types.ts +570 -0
  88. package/src/core/validate.ts +243 -0
  89. package/src/index.ts +8 -0
  90. package/src/node/a2a/server.ts +470 -0
  91. package/src/node/api/server.ts +412 -0
  92. package/src/node/backends/duckdb.ts +130 -0
  93. package/src/node/backends/score-worker.ts +41 -0
  94. package/src/node/backends/workers.ts +212 -0
  95. package/src/node/config-file.ts +66 -0
  96. package/src/node/connectors/base.ts +57 -0
  97. package/src/node/connectors/bigquery.ts +61 -0
  98. package/src/node/connectors/databricks.ts +69 -0
  99. package/src/node/connectors/file.ts +350 -0
  100. package/src/node/connectors/hubspot.ts +62 -0
  101. package/src/node/connectors/index.ts +43 -0
  102. package/src/node/connectors/salesforce.ts +93 -0
  103. package/src/node/connectors/snowflake.ts +73 -0
  104. package/src/node/db/postgres.ts +173 -0
  105. package/src/node/db/sync.ts +103 -0
  106. package/src/node/dedupe-file.ts +156 -0
  107. package/src/node/index.ts +89 -0
  108. package/src/node/mcp/server.ts +940 -0
  109. package/src/node/tui/app.ts +756 -0
  110. package/src/node/tui/index.ts +6 -0
  111. package/src/node/tui/widgets.ts +128 -0
  112. package/tests/parity/scorer-ground-truth.test.ts +118 -0
  113. package/tests/smoke.test.ts +46 -0
  114. package/tests/unit/a2a-server.test.ts +175 -0
  115. package/tests/unit/ann-blocker.test.ts +117 -0
  116. package/tests/unit/api-server.test.ts +239 -0
  117. package/tests/unit/api.test.ts +77 -0
  118. package/tests/unit/autoconfig.test.ts +103 -0
  119. package/tests/unit/autofix.test.ts +71 -0
  120. package/tests/unit/blocker.test.ts +164 -0
  121. package/tests/unit/buildBlocksAsync.test.ts +63 -0
  122. package/tests/unit/cluster.test.ts +213 -0
  123. package/tests/unit/compare-clusters.test.ts +42 -0
  124. package/tests/unit/config-loader.test.ts +301 -0
  125. package/tests/unit/connectors-base.test.ts +48 -0
  126. package/tests/unit/cross-encoder-model.test.ts +198 -0
  127. package/tests/unit/cross-encoder.test.ts +173 -0
  128. package/tests/unit/db-connectors.test.ts +37 -0
  129. package/tests/unit/domain.test.ts +80 -0
  130. package/tests/unit/embedder.test.ts +151 -0
  131. package/tests/unit/evaluate.test.ts +85 -0
  132. package/tests/unit/explain.test.ts +73 -0
  133. package/tests/unit/golden.test.ts +97 -0
  134. package/tests/unit/graph-er.test.ts +173 -0
  135. package/tests/unit/hnsw-ann.test.ts +283 -0
  136. package/tests/unit/hubspot-connector.test.ts +118 -0
  137. package/tests/unit/ingest.test.ts +97 -0
  138. package/tests/unit/learned-blocking.test.ts +134 -0
  139. package/tests/unit/lineage.test.ts +135 -0
  140. package/tests/unit/match-one.test.ts +129 -0
  141. package/tests/unit/matchkey.test.ts +97 -0
  142. package/tests/unit/mcp-server.test.ts +183 -0
  143. package/tests/unit/memory.test.ts +119 -0
  144. package/tests/unit/pipeline.test.ts +118 -0
  145. package/tests/unit/pprl-protocol.test.ts +381 -0
  146. package/tests/unit/probabilistic.test.ts +494 -0
  147. package/tests/unit/profiler.test.ts +68 -0
  148. package/tests/unit/review-queue.test.ts +68 -0
  149. package/tests/unit/salesforce-connector.test.ts +148 -0
  150. package/tests/unit/scorer.test.ts +301 -0
  151. package/tests/unit/sensitivity.test.ts +154 -0
  152. package/tests/unit/standardize.test.ts +84 -0
  153. package/tests/unit/streaming.test.ts +82 -0
  154. package/tests/unit/transforms.test.ts +208 -0
  155. package/tests/unit/tui-widgets.test.ts +42 -0
  156. package/tests/unit/tui.test.ts +24 -0
  157. package/tests/unit/validate.test.ts +145 -0
  158. package/tests/unit/workers-parallel.test.ts +99 -0
  159. package/tests/unit/workers.test.ts +74 -0
  160. package/tsconfig.json +25 -0
  161. package/tsup.config.ts +37 -0
  162. package/vitest.config.ts +11 -0
@@ -0,0 +1,61 @@
1
+ /**
2
+ * Evaluate predicted pairs against ground truth: precision, recall, F1.
3
+ *
4
+ * Two evaluation modes:
5
+ * - evaluatePairs: compare the predicted ScoredPair[] to ground truth
6
+ * - evaluateClusters: expand clusters into pairs, then compare
7
+ *
8
+ * Run: npx tsx examples/11-evaluate.ts
9
+ */
10
+ import { dedupe, evaluatePairs, evaluateClusters } from "goldenmatch";
11
+
12
+ // Dataset with known ground truth
13
// Six records: ids 0-2 describe one person, ids 3-4 another, id 5 is unique.
const rows = [
  { id: 0, name: "John Smith", zip: "12345" },
  { id: 1, name: "Jon Smith", zip: "12345" }, // same person as 0
  { id: 2, name: "Johnny Smth", zip: "12345" }, // same person as 0
  { id: 3, name: "Jane Doe", zip: "54321" },
  { id: 4, name: "Janet Doe", zip: "54321" }, // same person as 3
  { id: 5, name: "Bob Jones", zip: "99999" },
];

// Expected matches, each written as [smaller id, larger id].
const truth: [number, number][] = [[0, 1], [0, 2], [1, 2], [3, 4]];

// Fuzzy-match on name within zip blocks.
const result = dedupe(rows, {
  fuzzy: { name: 0.85 },
  blocking: ["zip"],
  threshold: 0.85,
});

/** Print the three headline metrics shared by both evaluation modes. */
const printScores = (metrics: { precision: number; recall: number; f1: number }): void => {
  console.log(` Precision: ${metrics.precision.toFixed(3)}`);
  console.log(` Recall: ${metrics.recall.toFixed(3)}`);
  console.log(` F1: ${metrics.f1.toFixed(3)}`);
};

// --- Mode 1: score the predicted pairs directly ---
const pairEval = evaluatePairs(result.scoredPairs, truth);
console.log("=== Pair-based evaluation ===");
printScores(pairEval);
console.log(` TP=${pairEval.truePositives} FP=${pairEval.falsePositives} FN=${pairEval.falseNegatives}`);

// --- Mode 2: expand clusters into pairs, then score ---
const allIds = rows.map((row) => row.id);
const clusterEval = evaluateClusters(result.clusters, truth, allIds);
console.log("\n=== Cluster-based evaluation ===");
printScores(clusterEval);
52
+
53
+ /**
54
+ * Cluster-based eval is typically more favorable than pair-based because
55
+ * transitive closures pick up pairs the direct scorer missed (if A~B and
56
+ * B~C cluster together, A~C counts as a TP even if A~C scored below
57
+ * threshold directly).
58
+ *
59
+ * Python parity: `evaluate_pairs()` / `evaluate_clusters()` in
60
+ * `goldenmatch.core.evaluate`. CLI: `goldenmatch evaluate --ground-truth gt.csv`.
61
+ */
@@ -0,0 +1,53 @@
1
+ # GoldenMatch TypeScript Examples
2
+
3
+ Each example is a standalone `.ts` file runnable with:
4
+
5
+ ```bash
6
+ npx tsx examples/<name>.ts
7
+ ```
8
+
9
+ | # | Example | What it shows |
10
+ |---|---------|---------------|
11
+ | 01 | `01-basic-dedupe.ts` | Dedupe an in-memory array with exact + fuzzy matchkeys |
12
+ | 02 | `02-match-two-datasets.ts` | Match target records against a reference dataset |
13
+ | 03 | `03-csv-file-pipeline.ts` | Read CSV -> dedupe -> write golden records |
14
+ | 04 | `04-string-scoring.ts` | Compare every scorer on the same string pairs |
15
+ | 05 | `05-custom-config.ts` | Build/save/load a full GoldenMatchConfig (YAML) |
16
+ | 06 | `06-probabilistic-fs.ts` | Fellegi-Sunter with EM training |
17
+ | 07 | `07-pprl-privacy.ts` | Privacy-preserving record linkage (3 security levels) |
18
+ | 08 | `08-streaming.ts` | Incremental streaming -- add records one at a time |
19
+ | 09 | `09-llm-scorer.ts` | LLM scorer for borderline pairs (needs OPENAI_API_KEY or ANTHROPIC_API_KEY) |
20
+ | 10 | `10-explain.ts` | Template NL explanation of a pair match |
21
+ | 11 | `11-evaluate.ts` | Evaluate against ground truth (precision/recall/F1) |
22
+
23
+ ## Running
24
+
25
+ From the repo root:
26
+
27
+ ```bash
28
+ cd packages/goldenmatch-js
29
+ npm install
30
+ npx tsx examples/01-basic-dedupe.ts
31
+ ```
32
+
33
+ Or install `tsx` globally:
34
+
35
+ ```bash
36
+ npm install -g tsx
37
+ tsx examples/01-basic-dedupe.ts
38
+ ```
39
+
40
+ ## Optional peer deps by example
41
+
42
+ Most examples use only the core package. A few need optional peer deps:
43
+
44
+ | Example | Peer dep required |
45
+ |---------|-------------------|
46
+ | `05-custom-config.ts` (YAML save/load path only) | `yaml` |
47
+ | `09-llm-scorer.ts` | none (uses `fetch`); needs `OPENAI_API_KEY` or `ANTHROPIC_API_KEY` in env |
48
+
49
+ Install them on demand:
50
+
51
+ ```bash
52
+ npm install yaml
53
+ ```
package/package.json ADDED
@@ -0,0 +1,66 @@
1
+ {
2
+ "name": "goldenmatch",
3
+ "version": "0.1.0",
4
+ "description": "Entity resolution toolkit — deduplicate, match, and create golden records",
5
+ "type": "module",
6
+ "exports": {
7
+ ".": { "types": "./dist/index.d.ts", "import": "./dist/index.js", "require": "./dist/index.cjs" },
8
+ "./core": { "types": "./dist/core/index.d.ts", "import": "./dist/core/index.js", "require": "./dist/core/index.cjs" },
9
+ "./node": { "types": "./dist/node/index.d.ts", "import": "./dist/node/index.js", "require": "./dist/node/index.cjs" }
10
+ },
11
+ "bin": { "goldenmatch-js": "./dist/cli.cjs" },
12
+ "engines": { "node": ">=20" },
13
+ "scripts": {
14
+ "build": "tsup",
15
+ "typecheck": "tsc --noEmit",
16
+ "test": "vitest run",
17
+ "test:watch": "vitest",
18
+ "lint": "tsc --noEmit"
19
+ },
20
+ "dependencies": {
21
+ "commander": "^13.0.0"
22
+ },
23
+ "peerDependencies": {
24
+ "yaml": "*",
25
+ "ink": "^5.0.0",
26
+ "react": "^18.0.0",
27
+ "hnswlib-node": "^3.0.0",
28
+ "@huggingface/transformers": "^3.0.0",
29
+ "piscina": "^5.0.0",
30
+ "ink-table": "^4.0.0",
31
+ "ink-select-input": "^6.0.0",
32
+ "ink-text-input": "^6.0.0",
33
+ "ink-spinner": "^5.0.0",
34
+ "ink-gradient": "^3.0.0"
35
+ },
36
+ "peerDependenciesMeta": {
37
+ "yaml": { "optional": true },
38
+ "ink": { "optional": true },
39
+ "react": { "optional": true },
40
+ "hnswlib-node": { "optional": true },
41
+ "@huggingface/transformers": { "optional": true },
42
+ "piscina": { "optional": true },
43
+ "ink-table": { "optional": true },
44
+ "ink-select-input": { "optional": true },
45
+ "ink-text-input": { "optional": true },
46
+ "ink-spinner": { "optional": true },
47
+ "ink-gradient": { "optional": true }
48
+ },
49
+ "devDependencies": {
50
+ "@types/node": "^20.0.0",
51
+ "rimraf": "^5.0.0",
52
+ "tsup": "^8.5.1",
53
+ "typescript": "^5.4.0",
54
+ "vitest": "^4.1.0",
55
+ "yaml": "^2.7.0"
56
+ },
57
+ "keywords": [
58
+ "entity-resolution", "deduplication", "record-linkage", "fuzzy-matching",
59
+ "golden-record", "data-quality", "jaro-winkler", "levenshtein"
60
+ ],
61
+ "license": "MIT",
62
+ "repository": {
63
+ "type": "git",
64
+ "url": "https://github.com/benzsevern/goldenmatch"
65
+ }
66
+ }
package/src/cli.ts ADDED
@@ -0,0 +1,372 @@
1
+ #!/usr/bin/env node
2
+ /**
3
+ * cli.ts -- GoldenMatch command-line interface.
4
+ *
5
+ * Built on commander. Exposes `dedupe`, `match`, `score`, `info`, `profile`, `demo`,
6
+ * `mcp-serve`, `serve`, `agent-serve`, and `tui` subcommands.
7
+ */
8
+
9
+ import { Command } from "commander";
10
+ import { extname, basename } from "node:path";
11
+ import {
12
+ readFile,
13
+ writeCsv,
14
+ writeJson,
15
+ } from "./node/connectors/file.js";
16
+ import { dedupe, match, scoreStrings } from "./core/api.js";
17
+ import { loadConfigFile } from "./node/config-file.js";
18
+ import type { Row } from "./core/types.js";
19
+
20
+ // ---------------------------------------------------------------------------
21
+ // Helpers
22
+ // ---------------------------------------------------------------------------
23
+
24
/**
 * Parse a `--fuzzy` flag value such as `"name:0.85,email:0.9"` into a
 * field -> threshold map.
 *
 * Entries are comma-separated. Each entry is either `field` (takes the
 * default threshold) or `field:threshold`. Blank entries and entries whose
 * field name is empty are skipped. A threshold that does not parse to a
 * finite number falls back to the default instead of failing.
 *
 * @param raw - Raw flag value.
 * @param defaultThreshold - Threshold used when an entry has no usable
 *   explicit value. Defaults to 0.85.
 * @returns Map from field name to threshold; a repeated field keeps the last
 *   value seen.
 */
function parseFuzzyArg(
  raw: string,
  defaultThreshold = 0.85,
): Record<string, number> {
  const thresholds: Record<string, number> = {};
  for (const entry of raw.split(",")) {
    const trimmed = entry.trim();
    if (trimmed === "") continue;

    // Split on the first colon only; anything after it is the threshold text.
    const colon = trimmed.indexOf(":");
    const field = colon === -1 ? trimmed : trimmed.slice(0, colon).trim();
    if (field === "") continue;

    // parseFloat keeps the lenient behavior of accepting a numeric prefix;
    // NaN (no colon, empty or non-numeric text) selects the default.
    const parsed =
      colon === -1 ? Number.NaN : parseFloat(trimmed.slice(colon + 1).trim());
    thresholds[field] = Number.isFinite(parsed) ? parsed : defaultThreshold;
  }
  return thresholds;
}
44
+
45
/**
 * Split a comma-separated flag value into trimmed, non-empty items.
 *
 * @param raw - Raw flag value, e.g. `"email, phone"`.
 * @returns The items in their original order with blanks dropped.
 */
function parseCsvList(raw: string): string[] {
  const items: string[] = [];
  for (const piece of raw.split(",")) {
    const cleaned = piece.trim();
    if (cleaned.length > 0) items.push(cleaned);
  }
  return items;
}
51
+
52
/**
 * Read every input file and concatenate the rows, tagging each row with a
 * `__source__` label derived from its file.
 *
 * The label is the file name without directory or extension; when that is
 * empty, `file_<index>` is used instead.
 *
 * @param paths - Input file paths, read in the order given.
 * @returns All rows from all files, in file order.
 */
function loadFilesWithSource(paths: readonly string[]): Row[] {
  return paths.flatMap((filePath, index) => {
    const stem = basename(filePath, extname(filePath));
    const source = stem || `file_${index}`;
    return readFile(filePath).map((record) => ({ ...record, __source__: source }));
  });
}
64
+
65
/**
 * Raw option values shared by the `dedupe` and `match` subcommands, as
 * delivered by commander. All flags are optional.
 */
interface SharedMatchOpts {
  /** `--config`: path to a config file. */
  config?: string;
  /** `--exact`: comma-separated field names. */
  exact?: string;
  /** `--fuzzy`: comma-separated `field` or `field:threshold` entries. */
  fuzzy?: string;
  /** `--blocking`: comma-separated blocking keys. */
  blocking?: string;
  /** `--threshold`: already converted to a number by the option parser. */
  threshold?: number;
  /** `--output`: destination path for result rows. */
  output?: string;
  /** `--format`: output format name. */
  format?: string;
}
74
+
75
/**
 * Translate raw CLI flag values into the options object passed to
 * `dedupe` / `match`.
 *
 * Only flags that were actually supplied produce a key in the result, so
 * absent flags never override values coming from elsewhere.
 *
 * @param opts - Raw flag values from commander.
 * @returns Options containing any of `config`, `exact`, `fuzzy`, `blocking`,
 *   `threshold`.
 * @throws Error if `--threshold` was supplied but is not a finite number.
 */
function buildOptionsFromFlags(opts: SharedMatchOpts) {
  const out: {
    config?: ReturnType<typeof loadConfigFile>;
    exact?: string[];
    fuzzy?: Record<string, number>;
    blocking?: string[];
    threshold?: number;
  } = {};
  if (opts.config) out.config = loadConfigFile(opts.config);
  if (opts.exact) out.exact = parseCsvList(opts.exact);
  if (opts.fuzzy) out.fuzzy = parseFuzzyArg(opts.fuzzy);
  if (opts.blocking) out.blocking = parseCsvList(opts.blocking);
  if (opts.threshold !== undefined) {
    // The --threshold option is parsed with parseFloat, so a non-numeric
    // value arrives here as NaN. Reject it instead of forwarding a threshold
    // that no score can be meaningfully compared against.
    if (!Number.isFinite(opts.threshold)) {
      throw new Error("Invalid --threshold value: expected a number, e.g. 0.85");
    }
    out.threshold = opts.threshold;
  }
  return out;
}
90
+
91
/**
 * Write result rows to `path` as JSON or as delimited text.
 *
 * JSON is chosen when `format` is `"json"` or the path ends in `.json`,
 * `.jsonl` or `.ndjson` (case-insensitive). Otherwise the rows are written
 * as delimited text, tab-separated for `.tsv` and comma-separated for
 * everything else.
 *
 * @param path - Destination file path.
 * @param rows - Rows to write.
 * @param format - Requested format name.
 */
function writeOutputRows(
  path: string,
  rows: readonly Row[],
  format: string,
): void {
  const suffix = extname(path).toLowerCase();
  const jsonSuffixes = [".json", ".jsonl", ".ndjson"];
  if (format === "json" || jsonSuffixes.includes(suffix)) {
    writeJson(path, rows);
    return;
  }
  writeCsv(path, rows, { delimiter: suffix === ".tsv" ? "\t" : "," });
}
109
+
110
+ // ---------------------------------------------------------------------------
111
+ // CLI definition
112
+ // ---------------------------------------------------------------------------
113
+
114
// Root command; every subcommand below is registered on it.
const program = new Command()
  .name("goldenmatch-js")
  .description("Entity resolution toolkit -- dedupe, match, build golden records")
  .version("0.1.0");
120
+
121
+ // ---------- dedupe ----------
122
// `dedupe <files...>`: load all inputs, run dedupe, print a summary and
// optionally write the golden records.
program
  .command("dedupe")
  .description("Deduplicate records in one or more files")
  .argument("<files...>", "input file paths (.csv, .tsv, .json, .jsonl)")
  .option("-c, --config <path>", "path to YAML config file")
  .option("-e, --exact <fields>", "comma-separated exact match fields")
  .option("-f, --fuzzy <fields>", "fuzzy match fields, e.g. 'name:0.85,email:0.9'")
  .option("-b, --blocking <fields>", "comma-separated blocking keys")
  .option("-t, --threshold <value>", "overall fuzzy threshold", parseFloat)
  .option("-o, --output <path>", "output path for golden records")
  .option("--format <format>", "output format: csv or json", "csv")
  .action(async (files: string[], opts: SharedMatchOpts) => {
    // Inputs are read before flags are interpreted.
    const { stats, goldenRecords } = dedupe(
      loadFilesWithSource(files),
      buildOptionsFromFlags(opts),
    );
    const pct = (stats.matchRate * 100).toFixed(1);
    process.stdout.write(
      `Dedupe complete: ${stats.totalRecords} records -> ${stats.totalClusters} clusters (${pct}% match rate)\n`,
    );

    // Without --output the summary line is the only result.
    if (!opts.output) return;
    writeOutputRows(opts.output, goldenRecords, opts.format ?? "csv");
    process.stdout.write(
      `Wrote ${goldenRecords.length} golden records to ${opts.output}\n`,
    );
  });
155
+
156
+ // ---------- match ----------
157
// `match <target> <reference>`: tag each side with its role, run match,
// print a summary and optionally write the matched rows.
program
  .command("match")
  .description("Match target records against a reference dataset")
  .argument("<target>", "target file path")
  .argument("<reference>", "reference file path")
  .option("-c, --config <path>", "path to YAML config file")
  .option("-e, --exact <fields>", "comma-separated exact match fields")
  .option("-f, --fuzzy <fields>", "fuzzy match fields, e.g. 'name:0.85,email:0.9'")
  .option("-b, --blocking <fields>", "comma-separated blocking keys")
  .option("-t, --threshold <value>", "overall fuzzy threshold", parseFloat)
  .option("-o, --output <path>", "output path for matched records")
  .option("--format <format>", "output format: csv or json", "csv")
  .action(
    async (targetPath: string, referencePath: string, opts: SharedMatchOpts) => {
      // Read a file and stamp every row with the side it belongs to.
      const readTagged = (filePath: string, label: string) =>
        readFile(filePath).map((record) => ({ ...record, __source__: label }));

      const targets = readTagged(targetPath, "target");
      const references = readTagged(referencePath, "reference");
      const { matched, unmatched } = match(
        targets,
        references,
        buildOptionsFromFlags(opts),
      );
      process.stdout.write(
        `Match complete: ${matched.length} matched, ${unmatched.length} unmatched\n`,
      );

      // Without --output the summary line is the only result.
      if (!opts.output) return;
      writeOutputRows(opts.output, matched, opts.format ?? "csv");
      process.stdout.write(
        `Wrote ${matched.length} matched records to ${opts.output}\n`,
      );
    },
  );
199
+
200
+ // ---------- score ----------
201
// `score <a> <b>`: print the similarity of two strings under one scorer,
// formatted to four decimal places.
program
  .command("score")
  .description("Score similarity between two strings")
  .argument("<a>", "first string")
  .argument("<b>", "second string")
  .option(
    "-s, --scorer <name>",
    "scorer: exact, jaro_winkler, levenshtein, token_sort, soundex_match, dice, jaccard, ensemble",
    "jaro_winkler",
  )
  .action((a: string, b: string, opts: { scorer: string }) => {
    const { scorer } = opts;
    const similarity = scoreStrings(a, b, scorer).toFixed(4);
    process.stdout.write(`${scorer}: ${similarity}\n`);
  });
215
+
216
+ // ---------- info ----------
217
// `info`: print the version banner and the names of the built-in scorers,
// strategies, blocking modes and transforms.
program
  .command("info")
  .description("Show information about the package")
  .action(() => {
    const lines = [
      "GoldenMatch JS v0.1.0\n",
      "Scorers: exact, jaro_winkler, levenshtein, token_sort, soundex_match, dice, jaccard, ensemble\n",
      "Strategies: most_complete, majority_vote, source_priority, most_recent, first_non_null\n",
      "Blocking: static, multi_pass, sorted_neighborhood, adaptive\n",
      "Transforms: lowercase, uppercase, strip, soundex, metaphone, digits_only, alpha_only, token_sort\n",
    ];
    // One write per line, in order.
    for (const line of lines) {
      process.stdout.write(line);
    }
  });
235
+
236
+ // ---------- profile ----------
237
// `profile <file>`: print row/column counts and, for each column, the number
// and percentage of empty values, the distinct non-empty value count, and
// the first non-empty value as a sample.
program
  .command("profile")
  .description("Profile a dataset (column stats, nulls, cardinality)")
  .argument("<file>", "input file")
  .action((file: string) => {
    const records = readFile(file);
    const rowCount = records.length;
    process.stdout.write(`File: ${file}\n`);
    process.stdout.write(`Rows: ${rowCount}\n`);
    if (rowCount === 0) return;

    // Union of keys across all rows, in first-seen order.
    const seenColumns = new Set<string>();
    for (const record of records) {
      for (const key of Object.keys(record)) seenColumns.add(key);
    }
    process.stdout.write(`Columns: ${seenColumns.size}\n`);
    process.stdout.write("\n");

    const names = Array.from(seenColumns);
    const nameWidth = Math.max(6, ...names.map((name) => name.length));
    // Right-pad with spaces; text already at or past the width is unchanged.
    const pad = (text: string, width: number) => text.padEnd(width);

    process.stdout.write(
      `${pad("column", nameWidth)} ${pad("nulls", 8)} ${pad("null%", 7)} ${pad("distinct", 9)} sample\n`,
    );
    process.stdout.write(
      `${"-".repeat(nameWidth)} ${"-".repeat(8)} ${"-".repeat(7)} ${"-".repeat(9)} ------\n`,
    );

    for (const col of names) {
      // Stringified non-empty values of this column, in row order.
      const present: string[] = [];
      for (const record of records) {
        const cell = record[col];
        if (cell !== null && cell !== undefined && cell !== "") {
          present.push(String(cell));
        }
      }
      const nulls = rowCount - present.length;
      const distinctCount = new Set(present).size;
      const sample: string | null = present[0] ?? null;

      const nullPct = ((nulls / rowCount) * 100).toFixed(1);
      let sampleStr: string;
      if (sample === null) {
        sampleStr = "-";
      } else if (sample.length > 30) {
        // Truncate long samples to 30 characters including the ellipsis.
        sampleStr = sample.slice(0, 27) + "...";
      } else {
        sampleStr = sample;
      }
      process.stdout.write(
        `${pad(col, nameWidth)} ${pad(String(nulls), 8)} ${pad(nullPct + "%", 7)} ${pad(String(distinctCount), 9)} ${sampleStr}\n`,
      );
    }
  });
281
+
282
+ // ---------- demo ----------
283
// `demo`: dedupe five built-in records and print the summary plus each
// golden record as JSON.
program
  .command("demo")
  .description("Run a quick demo on synthetic data")
  .action(() => {
    const sampleRows: Row[] = [
      { id: 1, name: "John Smith", email: "john@example.com", zip: "01234" },
      { id: 2, name: "Jon Smith", email: "john@example.com", zip: "01234" },
      { id: 3, name: "Jane Doe", email: "jane@example.com", zip: "02139" },
      { id: 4, name: "J. Doe", email: "jane@example.com", zip: "02139" },
      { id: 5, name: "Bob Jones", email: "bob@example.com", zip: "10001" },
    ];
    const print = (text: string): void => {
      process.stdout.write(text);
    };

    print(`Input: ${sampleRows.length} synthetic records\n`);
    const { stats, goldenRecords } = dedupe(sampleRows, {
      exact: ["email"],
      fuzzy: { name: 0.8 },
      blocking: ["zip"],
      threshold: 0.8,
    });
    print(`Dedupe: ${stats.totalRecords} records -> ${stats.totalClusters} clusters\n`);
    print(`Match rate: ${(stats.matchRate * 100).toFixed(1)}%\n`);
    print(`Golden records: ${goldenRecords.length}\n`);
    goldenRecords.forEach((golden) => {
      print(` ${JSON.stringify(golden)}\n`);
    });
  });
312
+
313
+ // ---------- mcp-serve ----------
314
// `mcp-serve`: the server module is imported only when this command runs.
program
  .command("mcp-serve")
  .description("Start MCP server over stdio (JSON-RPC 2.0)")
  .action(async () => {
    const mcp = await import("./node/mcp/server.js");
    mcp.startMcpServer();
  });
321
+
322
+ // ---------- serve (REST API) ----------
323
// `serve`: start the REST API server. The server module is imported only
// when this command runs.
program
  .command("serve")
  .description("Start the REST API server")
  .option("-p, --port <port>", "port", "8000")
  .option("-h, --host <host>", "host", "127.0.0.1")
  .action(async (opts: { port: string; host: string }) => {
    // Validate the port up front: parseInt alone would turn "abc" into NaN
    // and silently accept trailing junk such as "80x".
    const rawPort = opts.port.trim();
    const port = /^\d+$/.test(rawPort) ? parseInt(rawPort, 10) : Number.NaN;
    if (!Number.isInteger(port) || port > 65535) {
      throw new Error(
        `Invalid --port value "${opts.port}": expected an integer between 0 and 65535`,
      );
    }
    const { startApiServer } = await import("./node/api/server.js");
    startApiServer({ port, host: opts.host });
  });
332
+
333
+ // ---------- agent-serve (A2A) ----------
334
// `agent-serve`: start the A2A server. The server module is imported only
// when this command runs.
program
  .command("agent-serve")
  .description("Start the A2A agent-to-agent server")
  .option("-p, --port <port>", "port", "8200")
  .option("-h, --host <host>", "host", "127.0.0.1")
  .action(async (opts: { port: string; host: string }) => {
    // Validate the port up front: parseInt alone would turn "abc" into NaN
    // and silently accept trailing junk such as "80x".
    const rawPort = opts.port.trim();
    const port = /^\d+$/.test(rawPort) ? parseInt(rawPort, 10) : Number.NaN;
    if (!Number.isInteger(port) || port > 65535) {
      throw new Error(
        `Invalid --port value "${opts.port}": expected an integer between 0 and 65535`,
      );
    }
    const { startA2aServer } = await import("./node/a2a/server.js");
    startA2aServer({ port, host: opts.host });
  });
343
+
344
+ // ---------- tui ----------
345
// `tui [files...]`: launch the interactive UI. Any failure while importing
// or running it is reported on stderr and the process exits with status 1.
program
  .command("tui")
  .description("Launch interactive TUI (requires optional peer deps: ink + react)")
  .argument("[files...]", "input files to load on startup")
  .option("-c, --config <path>", "path to YAML config file")
  .action(async (files: string[], opts: { config?: string }) => {
    try {
      const tui = await import("./node/tui/app.js");
      // Only set the keys that were actually provided.
      const startOptions: {
        files?: string[];
        config?: ReturnType<typeof loadConfigFile>;
      } = {};
      if (files && files.length > 0) {
        startOptions.files = files;
      }
      if (opts.config) {
        startOptions.config = loadConfigFile(opts.config);
      }
      await tui.startTui(startOptions);
    } catch (err: unknown) {
      const reason = err instanceof Error ? err.message : String(err);
      process.stderr.write(`TUI error: ${reason}\n`);
      process.exit(1);
    }
  });
363
+
364
+ // ---------------------------------------------------------------------------
365
+ // Entry point
366
+ // ---------------------------------------------------------------------------
367
+
368
/** Report a failed command on stderr and exit with status 1. */
const reportFatalError = (err: unknown): never => {
  const reason = err instanceof Error ? err.message : String(err);
  process.stderr.write(`Error: ${reason}\n`);
  process.exit(1);
};

// parseAsync is used so that rejections from async action handlers reach
// the handler above.
program.parseAsync(process.argv).catch(reportFatalError);