goldenmatch 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +140 -0
- package/dist/cli.cjs +6079 -0
- package/dist/cli.cjs.map +1 -0
- package/dist/cli.d.cts +1 -0
- package/dist/cli.d.ts +1 -0
- package/dist/cli.js +6076 -0
- package/dist/cli.js.map +1 -0
- package/dist/core/index.cjs +8449 -0
- package/dist/core/index.cjs.map +1 -0
- package/dist/core/index.d.cts +1972 -0
- package/dist/core/index.d.ts +1972 -0
- package/dist/core/index.js +8318 -0
- package/dist/core/index.js.map +1 -0
- package/dist/index.cjs +8449 -0
- package/dist/index.cjs.map +1 -0
- package/dist/index.d.cts +2 -0
- package/dist/index.d.ts +2 -0
- package/dist/index.js +8318 -0
- package/dist/index.js.map +1 -0
- package/dist/node/backends/score-worker.cjs +934 -0
- package/dist/node/backends/score-worker.cjs.map +1 -0
- package/dist/node/backends/score-worker.d.cts +14 -0
- package/dist/node/backends/score-worker.d.ts +14 -0
- package/dist/node/backends/score-worker.js +932 -0
- package/dist/node/backends/score-worker.js.map +1 -0
- package/dist/node/index.cjs +11430 -0
- package/dist/node/index.cjs.map +1 -0
- package/dist/node/index.d.cts +554 -0
- package/dist/node/index.d.ts +554 -0
- package/dist/node/index.js +11277 -0
- package/dist/node/index.js.map +1 -0
- package/dist/types-DhUdX5Rc.d.cts +304 -0
- package/dist/types-DhUdX5Rc.d.ts +304 -0
- package/examples/01-basic-dedupe.ts +60 -0
- package/examples/02-match-two-datasets.ts +48 -0
- package/examples/03-csv-file-pipeline.ts +62 -0
- package/examples/04-string-scoring.ts +63 -0
- package/examples/05-custom-config.ts +94 -0
- package/examples/06-probabilistic-fs.ts +72 -0
- package/examples/07-pprl-privacy.ts +76 -0
- package/examples/08-streaming.ts +79 -0
- package/examples/09-llm-scorer.ts +79 -0
- package/examples/10-explain.ts +60 -0
- package/examples/11-evaluate.ts +61 -0
- package/examples/README.md +53 -0
- package/package.json +66 -0
- package/src/cli.ts +372 -0
- package/src/core/ann-blocker.ts +593 -0
- package/src/core/api.ts +220 -0
- package/src/core/autoconfig.ts +363 -0
- package/src/core/autofix.ts +102 -0
- package/src/core/blocker.ts +655 -0
- package/src/core/cluster.ts +699 -0
- package/src/core/compare-clusters.ts +176 -0
- package/src/core/config/loader.ts +869 -0
- package/src/core/cross-encoder.ts +614 -0
- package/src/core/data.ts +430 -0
- package/src/core/domain.ts +277 -0
- package/src/core/embedder.ts +562 -0
- package/src/core/evaluate.ts +156 -0
- package/src/core/explain.ts +352 -0
- package/src/core/golden.ts +524 -0
- package/src/core/graph-er.ts +371 -0
- package/src/core/index.ts +314 -0
- package/src/core/ingest.ts +112 -0
- package/src/core/learned-blocking.ts +305 -0
- package/src/core/lineage.ts +221 -0
- package/src/core/llm/budget.ts +258 -0
- package/src/core/llm/cluster.ts +542 -0
- package/src/core/llm/scorer.ts +396 -0
- package/src/core/match-one.ts +95 -0
- package/src/core/matchkey.ts +97 -0
- package/src/core/memory/corrections.ts +179 -0
- package/src/core/memory/learner.ts +218 -0
- package/src/core/memory/store.ts +114 -0
- package/src/core/pipeline.ts +366 -0
- package/src/core/pprl/protocol.ts +216 -0
- package/src/core/probabilistic.ts +511 -0
- package/src/core/profiler.ts +212 -0
- package/src/core/quality.ts +197 -0
- package/src/core/review-queue.ts +177 -0
- package/src/core/scorer.ts +855 -0
- package/src/core/sensitivity.ts +196 -0
- package/src/core/standardize.ts +279 -0
- package/src/core/streaming.ts +128 -0
- package/src/core/transforms.ts +599 -0
- package/src/core/types.ts +570 -0
- package/src/core/validate.ts +243 -0
- package/src/index.ts +8 -0
- package/src/node/a2a/server.ts +470 -0
- package/src/node/api/server.ts +412 -0
- package/src/node/backends/duckdb.ts +130 -0
- package/src/node/backends/score-worker.ts +41 -0
- package/src/node/backends/workers.ts +212 -0
- package/src/node/config-file.ts +66 -0
- package/src/node/connectors/base.ts +57 -0
- package/src/node/connectors/bigquery.ts +61 -0
- package/src/node/connectors/databricks.ts +69 -0
- package/src/node/connectors/file.ts +350 -0
- package/src/node/connectors/hubspot.ts +62 -0
- package/src/node/connectors/index.ts +43 -0
- package/src/node/connectors/salesforce.ts +93 -0
- package/src/node/connectors/snowflake.ts +73 -0
- package/src/node/db/postgres.ts +173 -0
- package/src/node/db/sync.ts +103 -0
- package/src/node/dedupe-file.ts +156 -0
- package/src/node/index.ts +89 -0
- package/src/node/mcp/server.ts +940 -0
- package/src/node/tui/app.ts +756 -0
- package/src/node/tui/index.ts +6 -0
- package/src/node/tui/widgets.ts +128 -0
- package/tests/parity/scorer-ground-truth.test.ts +118 -0
- package/tests/smoke.test.ts +46 -0
- package/tests/unit/a2a-server.test.ts +175 -0
- package/tests/unit/ann-blocker.test.ts +117 -0
- package/tests/unit/api-server.test.ts +239 -0
- package/tests/unit/api.test.ts +77 -0
- package/tests/unit/autoconfig.test.ts +103 -0
- package/tests/unit/autofix.test.ts +71 -0
- package/tests/unit/blocker.test.ts +164 -0
- package/tests/unit/buildBlocksAsync.test.ts +63 -0
- package/tests/unit/cluster.test.ts +213 -0
- package/tests/unit/compare-clusters.test.ts +42 -0
- package/tests/unit/config-loader.test.ts +301 -0
- package/tests/unit/connectors-base.test.ts +48 -0
- package/tests/unit/cross-encoder-model.test.ts +198 -0
- package/tests/unit/cross-encoder.test.ts +173 -0
- package/tests/unit/db-connectors.test.ts +37 -0
- package/tests/unit/domain.test.ts +80 -0
- package/tests/unit/embedder.test.ts +151 -0
- package/tests/unit/evaluate.test.ts +85 -0
- package/tests/unit/explain.test.ts +73 -0
- package/tests/unit/golden.test.ts +97 -0
- package/tests/unit/graph-er.test.ts +173 -0
- package/tests/unit/hnsw-ann.test.ts +283 -0
- package/tests/unit/hubspot-connector.test.ts +118 -0
- package/tests/unit/ingest.test.ts +97 -0
- package/tests/unit/learned-blocking.test.ts +134 -0
- package/tests/unit/lineage.test.ts +135 -0
- package/tests/unit/match-one.test.ts +129 -0
- package/tests/unit/matchkey.test.ts +97 -0
- package/tests/unit/mcp-server.test.ts +183 -0
- package/tests/unit/memory.test.ts +119 -0
- package/tests/unit/pipeline.test.ts +118 -0
- package/tests/unit/pprl-protocol.test.ts +381 -0
- package/tests/unit/probabilistic.test.ts +494 -0
- package/tests/unit/profiler.test.ts +68 -0
- package/tests/unit/review-queue.test.ts +68 -0
- package/tests/unit/salesforce-connector.test.ts +148 -0
- package/tests/unit/scorer.test.ts +301 -0
- package/tests/unit/sensitivity.test.ts +154 -0
- package/tests/unit/standardize.test.ts +84 -0
- package/tests/unit/streaming.test.ts +82 -0
- package/tests/unit/transforms.test.ts +208 -0
- package/tests/unit/tui-widgets.test.ts +42 -0
- package/tests/unit/tui.test.ts +24 -0
- package/tests/unit/validate.test.ts +145 -0
- package/tests/unit/workers-parallel.test.ts +99 -0
- package/tests/unit/workers.test.ts +74 -0
- package/tsconfig.json +25 -0
- package/tsup.config.ts +37 -0
- package/vitest.config.ts +11 -0
|
@@ -0,0 +1,61 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Evaluate predicted pairs against ground truth: precision, recall, F1.
|
|
3
|
+
*
|
|
4
|
+
* Two evaluation modes:
|
|
5
|
+
* - evaluatePairs: compare the predicted ScoredPair[] to ground truth
|
|
6
|
+
* - evaluateClusters: expand clusters into pairs, then compare
|
|
7
|
+
*
|
|
8
|
+
* Run: npx tsx examples/11-evaluate.ts
|
|
9
|
+
*/
|
|
10
|
+
import { dedupe, evaluatePairs, evaluateClusters } from "goldenmatch";
|
|
11
|
+
|
|
12
|
+
// Dataset with known ground truth
|
|
13
|
+
const rows = [
  { id: 0, name: "John Smith", zip: "12345" },
  { id: 1, name: "Jon Smith", zip: "12345" }, // dupe of 0
  { id: 2, name: "Johnny Smth", zip: "12345" }, // dupe of 0
  { id: 3, name: "Jane Doe", zip: "54321" },
  { id: 4, name: "Janet Doe", zip: "54321" }, // dupe of 3
  { id: 5, name: "Bob Jones", zip: "99999" },
];

// Ground truth: pairs that SHOULD match (canonicalized min:max)
const truth: [number, number][] = [
  [0, 1],
  [0, 2],
  [1, 2],
  [3, 4],
];

/** Print the precision / recall / F1 lines that both reports share. */
function printScores(metrics: { precision: number; recall: number; f1: number }): void {
  console.log(` Precision: ${metrics.precision.toFixed(3)}`);
  console.log(` Recall: ${metrics.recall.toFixed(3)}`);
  console.log(` F1: ${metrics.f1.toFixed(3)}`);
}

// Run dedupe
const result = dedupe(rows, {
  fuzzy: { name: 0.85 },
  blocking: ["zip"],
  threshold: 0.85,
});

// --- Evaluate via pair set ---
const pairEval = evaluatePairs(result.scoredPairs, truth);
console.log("=== Pair-based evaluation ===");
printScores(pairEval);
console.log(` TP=${pairEval.truePositives} FP=${pairEval.falsePositives} FN=${pairEval.falseNegatives}`);

// --- Evaluate via cluster expansion ---
const allIds = rows.map((row) => row.id);
const clusterEval = evaluateClusters(result.clusters, truth, allIds);
console.log("\n=== Cluster-based evaluation ===");
printScores(clusterEval);
|
|
52
|
+
|
|
53
|
+
/**
|
|
54
|
+
* Cluster-based recall is typically higher than pair-based recall because
|
|
55
|
+
* transitive closures pick up pairs the direct scorer missed (if A~B and
|
|
56
|
+
* B~C cluster together, A~C counts as a TP even if A~C scored below
|
|
57
|
+
* threshold directly).
|
|
58
|
+
*
|
|
59
|
+
* Python parity: `evaluate_pairs()` / `evaluate_clusters()` in
|
|
60
|
+
* `goldenmatch.core.evaluate`. CLI: `goldenmatch evaluate --ground-truth gt.csv`.
|
|
61
|
+
*/
|
|
@@ -0,0 +1,53 @@
|
|
|
1
|
+
# GoldenMatch TypeScript Examples
|
|
2
|
+
|
|
3
|
+
Each example is a standalone `.ts` file runnable with:
|
|
4
|
+
|
|
5
|
+
```bash
|
|
6
|
+
npx tsx examples/<name>.ts
|
|
7
|
+
```
|
|
8
|
+
|
|
9
|
+
| # | Example | What it shows |
|
|
10
|
+
|---|---------|---------------|
|
|
11
|
+
| 01 | `01-basic-dedupe.ts` | Dedupe an in-memory array with exact + fuzzy matchkeys |
|
|
12
|
+
| 02 | `02-match-two-datasets.ts` | Match target records against a reference dataset |
|
|
13
|
+
| 03 | `03-csv-file-pipeline.ts` | Read CSV -> dedupe -> write golden records |
|
|
14
|
+
| 04 | `04-string-scoring.ts` | Compare every scorer on the same string pairs |
|
|
15
|
+
| 05 | `05-custom-config.ts` | Build/save/load a full GoldenMatchConfig (YAML) |
|
|
16
|
+
| 06 | `06-probabilistic-fs.ts` | Fellegi-Sunter with EM training |
|
|
17
|
+
| 07 | `07-pprl-privacy.ts` | Privacy-preserving record linkage (3 security levels) |
|
|
18
|
+
| 08 | `08-streaming.ts` | Incremental streaming -- add records one at a time |
|
|
19
|
+
| 09 | `09-llm-scorer.ts` | LLM scorer for borderline pairs (needs OPENAI_API_KEY or ANTHROPIC_API_KEY) |
|
|
20
|
+
| 10 | `10-explain.ts` | Template NL explanation of a pair match |
|
|
21
|
+
| 11 | `11-evaluate.ts` | Evaluate against ground truth (precision/recall/F1) |
|
|
22
|
+
|
|
23
|
+
## Running
|
|
24
|
+
|
|
25
|
+
From the repo root:
|
|
26
|
+
|
|
27
|
+
```bash
|
|
28
|
+
cd packages/goldenmatch-js
|
|
29
|
+
npm install
|
|
30
|
+
npx tsx examples/01-basic-dedupe.ts
|
|
31
|
+
```
|
|
32
|
+
|
|
33
|
+
Or install `tsx` globally:
|
|
34
|
+
|
|
35
|
+
```bash
|
|
36
|
+
npm install -g tsx
|
|
37
|
+
tsx examples/01-basic-dedupe.ts
|
|
38
|
+
```
|
|
39
|
+
|
|
40
|
+
## Optional peer deps by example
|
|
41
|
+
|
|
42
|
+
Most examples use only the core package. A few need optional peer deps:
|
|
43
|
+
|
|
44
|
+
| Example | Peer dep required |
|
|
45
|
+
|---------|-------------------|
|
|
46
|
+
| `05-custom-config.ts` (YAML save/load path only) | `yaml` |
|
|
47
|
+
| `09-llm-scorer.ts` | none (uses `fetch`); needs `OPENAI_API_KEY` or `ANTHROPIC_API_KEY` in env |
|
|
48
|
+
|
|
49
|
+
Install them on demand:
|
|
50
|
+
|
|
51
|
+
```bash
|
|
52
|
+
npm install yaml
|
|
53
|
+
```
|
package/package.json
ADDED
|
@@ -0,0 +1,66 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "goldenmatch",
|
|
3
|
+
"version": "0.1.0",
|
|
4
|
+
"description": "Entity resolution toolkit — deduplicate, match, and create golden records",
|
|
5
|
+
"type": "module",
|
|
6
|
+
"exports": {
|
|
7
|
+
".": { "types": "./dist/index.d.ts", "import": "./dist/index.js", "require": "./dist/index.cjs" },
|
|
8
|
+
"./core": { "types": "./dist/core/index.d.ts", "import": "./dist/core/index.js", "require": "./dist/core/index.cjs" },
|
|
9
|
+
"./node": { "types": "./dist/node/index.d.ts", "import": "./dist/node/index.js", "require": "./dist/node/index.cjs" }
|
|
10
|
+
},
|
|
11
|
+
"bin": { "goldenmatch-js": "./dist/cli.cjs" },
|
|
12
|
+
"engines": { "node": ">=20" },
|
|
13
|
+
"scripts": {
|
|
14
|
+
"build": "tsup",
|
|
15
|
+
"typecheck": "tsc --noEmit",
|
|
16
|
+
"test": "vitest run",
|
|
17
|
+
"test:watch": "vitest",
|
|
18
|
+
"lint": "tsc --noEmit"
|
|
19
|
+
},
|
|
20
|
+
"dependencies": {
|
|
21
|
+
"commander": "^13.0.0"
|
|
22
|
+
},
|
|
23
|
+
"peerDependencies": {
|
|
24
|
+
"yaml": "*",
|
|
25
|
+
"ink": "^5.0.0",
|
|
26
|
+
"react": "^18.0.0",
|
|
27
|
+
"hnswlib-node": "^3.0.0",
|
|
28
|
+
"@huggingface/transformers": "^3.0.0",
|
|
29
|
+
"piscina": "^5.0.0",
|
|
30
|
+
"ink-table": "^4.0.0",
|
|
31
|
+
"ink-select-input": "^6.0.0",
|
|
32
|
+
"ink-text-input": "^6.0.0",
|
|
33
|
+
"ink-spinner": "^5.0.0",
|
|
34
|
+
"ink-gradient": "^3.0.0"
|
|
35
|
+
},
|
|
36
|
+
"peerDependenciesMeta": {
|
|
37
|
+
"yaml": { "optional": true },
|
|
38
|
+
"ink": { "optional": true },
|
|
39
|
+
"react": { "optional": true },
|
|
40
|
+
"hnswlib-node": { "optional": true },
|
|
41
|
+
"@huggingface/transformers": { "optional": true },
|
|
42
|
+
"piscina": { "optional": true },
|
|
43
|
+
"ink-table": { "optional": true },
|
|
44
|
+
"ink-select-input": { "optional": true },
|
|
45
|
+
"ink-text-input": { "optional": true },
|
|
46
|
+
"ink-spinner": { "optional": true },
|
|
47
|
+
"ink-gradient": { "optional": true }
|
|
48
|
+
},
|
|
49
|
+
"devDependencies": {
|
|
50
|
+
"@types/node": "^20.0.0",
|
|
51
|
+
"rimraf": "^5.0.0",
|
|
52
|
+
"tsup": "^8.5.1",
|
|
53
|
+
"typescript": "^5.4.0",
|
|
54
|
+
"vitest": "^4.1.0",
|
|
55
|
+
"yaml": "^2.7.0"
|
|
56
|
+
},
|
|
57
|
+
"keywords": [
|
|
58
|
+
"entity-resolution", "deduplication", "record-linkage", "fuzzy-matching",
|
|
59
|
+
"golden-record", "data-quality", "jaro-winkler", "levenshtein"
|
|
60
|
+
],
|
|
61
|
+
"license": "MIT",
|
|
62
|
+
"repository": {
|
|
63
|
+
"type": "git",
|
|
64
|
+
"url": "https://github.com/benzsevern/goldenmatch"
|
|
65
|
+
}
|
|
66
|
+
}
|
package/src/cli.ts
ADDED
|
@@ -0,0 +1,372 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
/**
|
|
3
|
+
* cli.ts -- GoldenMatch command-line interface.
|
|
4
|
+
*
|
|
5
|
+
* Built on commander. Exposes `dedupe`, `match`, `score`, `profile`,
|
|
6
|
+
* `info`, `demo`, `mcp-serve`, `serve`, `agent-serve`, and `tui` subcommands.
|
|
7
|
+
*/
|
|
8
|
+
|
|
9
|
+
import { Command } from "commander";
|
|
10
|
+
import { extname, basename } from "node:path";
|
|
11
|
+
import {
|
|
12
|
+
readFile,
|
|
13
|
+
writeCsv,
|
|
14
|
+
writeJson,
|
|
15
|
+
} from "./node/connectors/file.js";
|
|
16
|
+
import { dedupe, match, scoreStrings } from "./core/api.js";
|
|
17
|
+
import { loadConfigFile } from "./node/config-file.js";
|
|
18
|
+
import type { Row } from "./core/types.js";
|
|
19
|
+
|
|
20
|
+
// ---------------------------------------------------------------------------
|
|
21
|
+
// Helpers
|
|
22
|
+
// ---------------------------------------------------------------------------
|
|
23
|
+
|
|
24
|
+
/**
 * Parse a `--fuzzy` flag value such as `name:0.85,email:0.9` into a
 * field -> threshold map.
 *
 * Entries are comma-separated. An entry without a `:` or with a threshold
 * that does not parse to a finite number gets the default of 0.85. Blank
 * entries and entries with an empty field name are skipped.
 *
 * @param raw - The raw flag text.
 * @returns Thresholds keyed by field name.
 */
function parseFuzzyArg(raw: string): Record<string, number> {
  const DEFAULT_THRESHOLD = 0.85;
  const thresholds: Record<string, number> = {};

  raw.split(",").forEach((chunk) => {
    const entry = chunk.trim();
    if (entry.length === 0) return;

    // Only the first colon separates the field from its threshold.
    const colonAt = entry.indexOf(":");
    if (colonAt < 0) {
      thresholds[entry] = DEFAULT_THRESHOLD;
      return;
    }

    const fieldName = entry.slice(0, colonAt).trim();
    if (fieldName.length === 0) return;

    const candidate = parseFloat(entry.slice(colonAt + 1).trim());
    thresholds[fieldName] = Number.isFinite(candidate) ? candidate : DEFAULT_THRESHOLD;
  });

  return thresholds;
}
|
|
44
|
+
|
|
45
|
+
/**
 * Split a comma-separated flag value into trimmed, non-empty items.
 *
 * @param raw - The raw flag text, e.g. `"email, phone"`.
 * @returns The items in their original order; blank entries are dropped.
 */
function parseCsvList(raw: string): string[] {
  const items: string[] = [];
  for (const piece of raw.split(",")) {
    const value = piece.trim();
    if (value !== "") items.push(value);
  }
  return items;
}
|
|
51
|
+
|
|
52
|
+
/**
 * Read every input file and tag each row with the file it came from.
 *
 * The tag is stored under `__source__` and is the file name without its
 * extension, or `file_<index>` when that would be empty.
 *
 * @param paths - Input file paths, read in order.
 * @returns All rows from all files, concatenated in input order.
 */
function loadFilesWithSource(paths: readonly string[]): Row[] {
  return paths.flatMap((filePath, index) => {
    const stem = basename(filePath, extname(filePath));
    const label = stem !== "" ? stem : `file_${index}`;
    return readFile(filePath).map((record): Row => ({ ...record, __source__: label }));
  });
}
|
|
64
|
+
|
|
65
|
+
/** Raw option values shared by the `dedupe` and `match` subcommands. */
interface SharedMatchOpts {
  /** Path to a YAML config file (`-c, --config`). */
  config?: string;
  /** Comma-separated exact-match field names (`-e, --exact`). */
  exact?: string;
  /** Fuzzy match fields, e.g. `name:0.85,email:0.9` (`-f, --fuzzy`). */
  fuzzy?: string;
  /** Comma-separated blocking keys (`-b, --blocking`). */
  blocking?: string;
  /** Overall fuzzy threshold; the flag text is converted with `parseFloat` (`-t, --threshold`). */
  threshold?: number;
  /** Output file path (`-o, --output`). */
  output?: string;
  /** Output format, `csv` or `json`; the option defaults to `csv` (`--format`). */
  format?: string;
}
|
|
74
|
+
|
|
75
|
+
/**
 * Translate raw CLI flag values into the options object passed to
 * `dedupe` / `match`. Only flags that were actually supplied appear in the
 * result.
 *
 * @param opts - Flag values as parsed by commander.
 * @returns Options with the config loaded and list/threshold flags parsed.
 * @throws Error when `--threshold` did not parse to a finite number.
 */
function buildOptionsFromFlags(opts: SharedMatchOpts) {
  const out: {
    config?: ReturnType<typeof loadConfigFile>;
    exact?: string[];
    fuzzy?: Record<string, number>;
    blocking?: string[];
    threshold?: number;
  } = {};
  if (opts.config) out.config = loadConfigFile(opts.config);
  if (opts.exact) out.exact = parseCsvList(opts.exact);
  if (opts.fuzzy) out.fuzzy = parseFuzzyArg(opts.fuzzy);
  if (opts.blocking) out.blocking = parseCsvList(opts.blocking);
  if (opts.threshold !== undefined) {
    // The flag is converted with parseFloat, so non-numeric text arrives
    // here as NaN. Reject it instead of forwarding an unusable threshold.
    if (!Number.isFinite(opts.threshold)) {
      throw new Error("Invalid --threshold value: expected a finite number");
    }
    out.threshold = opts.threshold;
  }
  return out;
}
|
|
90
|
+
|
|
91
|
+
/**
 * Write rows to `path` as JSON or as delimited text.
 *
 * JSON is used when `format` is `"json"` or the path ends in `.json`,
 * `.jsonl` or `.ndjson`. Otherwise the rows are written as CSV, using a tab
 * delimiter for `.tsv` paths and a comma for everything else.
 *
 * @param path - Destination file path.
 * @param rows - Rows to write.
 * @param format - Requested output format.
 */
function writeOutputRows(
  path: string,
  rows: readonly Row[],
  format: string,
): void {
  const extension = extname(path).toLowerCase();
  const jsonExtensions = [".json", ".jsonl", ".ndjson"];

  if (format === "json" || jsonExtensions.includes(extension)) {
    writeJson(path, rows);
    return;
  }

  writeCsv(path, rows, { delimiter: extension === ".tsv" ? "\t" : "," });
}
|
|
109
|
+
|
|
110
|
+
// ---------------------------------------------------------------------------
|
|
111
|
+
// CLI definition
|
|
112
|
+
// ---------------------------------------------------------------------------
|
|
113
|
+
|
|
114
|
+
// Root command; every subcommand below is registered on it.
const program = new Command()
  .name("goldenmatch-js")
  .description("Entity resolution toolkit -- dedupe, match, build golden records")
  .version("0.1.0");
|
|
120
|
+
|
|
121
|
+
// ---------- dedupe ----------
|
|
122
|
+
program
  .command("dedupe")
  .description("Deduplicate records in one or more files")
  .argument("<files...>", "input file paths (.csv, .tsv, .json, .jsonl)")
  .option("-c, --config <path>", "path to YAML config file")
  .option("-e, --exact <fields>", "comma-separated exact match fields")
  .option("-f, --fuzzy <fields>", "fuzzy match fields, e.g. 'name:0.85,email:0.9'")
  .option("-b, --blocking <fields>", "comma-separated blocking keys")
  .option("-t, --threshold <value>", "overall fuzzy threshold", parseFloat)
  .option("-o, --output <path>", "output path for golden records")
  .option("--format <format>", "output format: csv or json", "csv")
  .action(async (files: string[], opts: SharedMatchOpts) => {
    // Read every input file, then turn the flags into dedupe options.
    const inputRows = loadFilesWithSource(files);
    const { stats, goldenRecords } = dedupe(inputRows, buildOptionsFromFlags(opts));

    const matchPercent = (stats.matchRate * 100).toFixed(1);
    process.stdout.write(
      `Dedupe complete: ${stats.totalRecords} records -> ${stats.totalClusters} clusters (${matchPercent}% match rate)\n`,
    );

    // Golden records are only written when an output path was given.
    const destination = opts.output;
    if (!destination) return;
    writeOutputRows(destination, goldenRecords, opts.format ?? "csv");
    process.stdout.write(
      `Wrote ${goldenRecords.length} golden records to ${destination}\n`,
    );
  });
|
|
155
|
+
|
|
156
|
+
// ---------- match ----------
|
|
157
|
+
program
  .command("match")
  .description("Match target records against a reference dataset")
  .argument("<target>", "target file path")
  .argument("<reference>", "reference file path")
  .option("-c, --config <path>", "path to YAML config file")
  .option("-e, --exact <fields>", "comma-separated exact match fields")
  .option("-f, --fuzzy <fields>", "fuzzy match fields, e.g. 'name:0.85,email:0.9'")
  .option("-b, --blocking <fields>", "comma-separated blocking keys")
  .option("-t, --threshold <value>", "overall fuzzy threshold", parseFloat)
  .option("-o, --output <path>", "output path for matched records")
  .option("--format <format>", "output format: csv or json", "csv")
  .action(
    async (targetPath: string, referencePath: string, opts: SharedMatchOpts) => {
      // Read a file and stamp each row with which side of the match it is on.
      const readTagged = (filePath: string, label: string) =>
        readFile(filePath).map((row) => ({ ...row, __source__: label }));

      const targetRows = readTagged(targetPath, "target");
      const referenceRows = readTagged(referencePath, "reference");
      const { matched, unmatched } = match(
        targetRows,
        referenceRows,
        buildOptionsFromFlags(opts),
      );

      process.stdout.write(
        `Match complete: ${matched.length} matched, ${unmatched.length} unmatched\n`,
      );

      // Only the matched rows are written, and only when an output path was given.
      const destination = opts.output;
      if (!destination) return;
      writeOutputRows(destination, matched, opts.format ?? "csv");
      process.stdout.write(
        `Wrote ${matched.length} matched records to ${destination}\n`,
      );
    },
  );
|
|
199
|
+
|
|
200
|
+
// ---------- score ----------
|
|
201
|
+
program
  .command("score")
  .description("Score similarity between two strings")
  .argument("<a>", "first string")
  .argument("<b>", "second string")
  .option(
    "-s, --scorer <name>",
    "scorer: exact, jaro_winkler, levenshtein, token_sort, soundex_match, dice, jaccard, ensemble",
    "jaro_winkler",
  )
  .action((a: string, b: string, opts: { scorer: string }) => {
    // Print "<scorer>: <score>" with the score rounded to four decimals.
    const { scorer } = opts;
    const similarity = scoreStrings(a, b, scorer);
    process.stdout.write(`${scorer}: ${similarity.toFixed(4)}\n`);
  });
|
|
215
|
+
|
|
216
|
+
// ---------- info ----------
|
|
217
|
+
program
  .command("info")
  .description("Show information about the package")
  .action(() => {
    // Static summary lines, written in order.
    const summary = [
      "GoldenMatch JS v0.1.0\n",
      "Scorers: exact, jaro_winkler, levenshtein, token_sort, soundex_match, dice, jaccard, ensemble\n",
      "Strategies: most_complete, majority_vote, source_priority, most_recent, first_non_null\n",
      "Blocking: static, multi_pass, sorted_neighborhood, adaptive\n",
      "Transforms: lowercase, uppercase, strip, soundex, metaphone, digits_only, alpha_only, token_sort\n",
    ];
    for (const line of summary) {
      process.stdout.write(line);
    }
  });
|
|
235
|
+
|
|
236
|
+
// ---------- profile ----------
|
|
237
|
+
program
  .command("profile")
  .description("Profile a dataset (column stats, nulls, cardinality)")
  .argument("<file>", "input file")
  .action((file: string) => {
    const records = readFile(file);
    const rowCount = records.length;
    process.stdout.write(`File: ${file}\n`);
    process.stdout.write(`Rows: ${rowCount}\n`);
    if (rowCount === 0) return;

    // Union of keys across all rows, in first-seen order.
    const seenColumns = new Set<string>();
    records.forEach((record) => {
      Object.keys(record).forEach((key) => seenColumns.add(key));
    });
    process.stdout.write(`Columns: ${seenColumns.size}\n`);
    process.stdout.write("\n");

    const columnNames = Array.from(seenColumns);
    // The name column is at least as wide as the "column" header (6 chars).
    const nameWidth = columnNames.reduce(
      (widest, name) => Math.max(widest, name.length),
      6,
    );
    const padRight = (text: string, width: number) => text.padEnd(width, " ");

    process.stdout.write(
      `${padRight("column", nameWidth)} ${padRight("nulls", 8)} ${padRight("null%", 7)} ${padRight("distinct", 9)} sample\n`,
    );
    process.stdout.write(
      `${"-".repeat(nameWidth)} ${"-".repeat(8)} ${"-".repeat(7)} ${"-".repeat(9)} ------\n`,
    );

    for (const column of columnNames) {
      // null, undefined and "" all count as missing; everything else is
      // compared and sampled by its string form.
      const present = records
        .map((record) => record[column])
        .filter((value) => !(value === null || value === undefined || value === ""))
        .map((value) => String(value));

      const missing = rowCount - present.length;
      const distinctCount = new Set(present).size;
      const missingPct = ((missing / rowCount) * 100).toFixed(1);

      // Show the first present value, shortened to 30 characters at most.
      const [first] = present;
      let sampleText: string;
      if (first === undefined) {
        sampleText = "-";
      } else if (first.length > 30) {
        sampleText = first.slice(0, 27) + "...";
      } else {
        sampleText = first;
      }

      process.stdout.write(
        `${padRight(column, nameWidth)} ${padRight(String(missing), 8)} ${padRight(missingPct + "%", 7)} ${padRight(String(distinctCount), 9)} ${sampleText}\n`,
      );
    }
  });
|
|
281
|
+
|
|
282
|
+
// ---------- demo ----------
|
|
283
|
+
program
  .command("demo")
  .description("Run a quick demo on synthetic data")
  .action(() => {
    // Five hand-written records; two pairs share an email and zip.
    const sampleRows: Row[] = [
      { id: 1, name: "John Smith", email: "john@example.com", zip: "01234" },
      { id: 2, name: "Jon Smith", email: "john@example.com", zip: "01234" },
      { id: 3, name: "Jane Doe", email: "jane@example.com", zip: "02139" },
      { id: 4, name: "J. Doe", email: "jane@example.com", zip: "02139" },
      { id: 5, name: "Bob Jones", email: "bob@example.com", zip: "10001" },
    ];
    process.stdout.write(`Input: ${sampleRows.length} synthetic records\n`);

    const { stats, goldenRecords } = dedupe(sampleRows, {
      exact: ["email"],
      fuzzy: { name: 0.8 },
      blocking: ["zip"],
      threshold: 0.8,
    });

    process.stdout.write(
      `Dedupe: ${stats.totalRecords} records -> ${stats.totalClusters} clusters\n`,
    );
    process.stdout.write(
      `Match rate: ${(stats.matchRate * 100).toFixed(1)}%\n`,
    );
    process.stdout.write(`Golden records: ${goldenRecords.length}\n`);
    goldenRecords.forEach((golden) => {
      process.stdout.write(` ${JSON.stringify(golden)}\n`);
    });
  });
|
|
312
|
+
|
|
313
|
+
// ---------- mcp-serve ----------
|
|
314
|
+
program
  .command("mcp-serve")
  .description("Start MCP server over stdio (JSON-RPC 2.0)")
  .action(async () => {
    // The server module is imported only when this subcommand runs.
    const mcpModule = await import("./node/mcp/server.js");
    mcpModule.startMcpServer();
  });
|
|
321
|
+
|
|
322
|
+
// ---------- serve (REST API) ----------
|
|
323
|
+
program
  .command("serve")
  .description("Start the REST API server")
  .option("-p, --port <port>", "port", "8000")
  .option("-h, --host <host>", "host", "127.0.0.1")
  .action(async (opts: { port: string; host: string }) => {
    // Validate strictly: parseInt would read "80abc" as 80 and turn "abc"
    // into NaN, which would then be handed to the server unchecked.
    const portText = opts.port.trim();
    const port = /^\d+$/.test(portText) ? Number(portText) : Number.NaN;
    if (!Number.isInteger(port) || port > 65535) {
      throw new Error(
        `Invalid --port value "${opts.port}": expected an integer between 0 and 65535`,
      );
    }
    // The server module is imported only when this subcommand runs.
    const { startApiServer } = await import("./node/api/server.js");
    startApiServer({ port, host: opts.host });
  });
|
|
332
|
+
|
|
333
|
+
// ---------- agent-serve (A2A) ----------
|
|
334
|
+
program
  .command("agent-serve")
  .description("Start the A2A agent-to-agent server")
  .option("-p, --port <port>", "port", "8200")
  .option("-h, --host <host>", "host", "127.0.0.1")
  .action(async (opts: { port: string; host: string }) => {
    // Validate strictly: parseInt would read "80abc" as 80 and turn "abc"
    // into NaN, which would then be handed to the server unchecked.
    const portText = opts.port.trim();
    const port = /^\d+$/.test(portText) ? Number(portText) : Number.NaN;
    if (!Number.isInteger(port) || port > 65535) {
      throw new Error(
        `Invalid --port value "${opts.port}": expected an integer between 0 and 65535`,
      );
    }
    // The server module is imported only when this subcommand runs.
    const { startA2aServer } = await import("./node/a2a/server.js");
    startA2aServer({ port, host: opts.host });
  });
|
|
343
|
+
|
|
344
|
+
// ---------- tui ----------
|
|
345
|
+
program
  .command("tui")
  .description("Launch interactive TUI (requires optional peer deps: ink + react)")
  .argument("[files...]", "input files to load on startup")
  .option("-c, --config <path>", "path to YAML config file")
  .action(async (files: string[], opts: { config?: string }) => {
    try {
      // The TUI module is imported only when this subcommand runs.
      const tuiModule = await import("./node/tui/app.js");

      // Pass along only the settings the user actually supplied.
      const launchOptions: {
        files?: string[];
        config?: ReturnType<typeof loadConfigFile>;
      } = {};
      if (files?.length) {
        launchOptions.files = files;
      }
      if (opts.config) {
        launchOptions.config = loadConfigFile(opts.config);
      }

      await tuiModule.startTui(launchOptions);
    } catch (err: unknown) {
      // Any failure above is reported on stderr and exits with status 1.
      const reason = err instanceof Error ? err.message : String(err);
      process.stderr.write(`TUI error: ${reason}\n`);
      process.exit(1);
    }
  });
|
|
363
|
+
|
|
364
|
+
// ---------------------------------------------------------------------------
|
|
365
|
+
// Entry point
|
|
366
|
+
// ---------------------------------------------------------------------------
|
|
367
|
+
|
|
368
|
+
/** Report a failed command on stderr and exit with status 1. */
function reportFatalError(err: unknown): never {
  const detail = err instanceof Error ? err.message : String(err);
  process.stderr.write(`Error: ${detail}\n`);
  process.exit(1);
}

program.parseAsync(process.argv).catch(reportFatalError);
|