goldenmatch 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +140 -0
- package/dist/cli.cjs +6079 -0
- package/dist/cli.cjs.map +1 -0
- package/dist/cli.d.cts +1 -0
- package/dist/cli.d.ts +1 -0
- package/dist/cli.js +6076 -0
- package/dist/cli.js.map +1 -0
- package/dist/core/index.cjs +8449 -0
- package/dist/core/index.cjs.map +1 -0
- package/dist/core/index.d.cts +1972 -0
- package/dist/core/index.d.ts +1972 -0
- package/dist/core/index.js +8318 -0
- package/dist/core/index.js.map +1 -0
- package/dist/index.cjs +8449 -0
- package/dist/index.cjs.map +1 -0
- package/dist/index.d.cts +2 -0
- package/dist/index.d.ts +2 -0
- package/dist/index.js +8318 -0
- package/dist/index.js.map +1 -0
- package/dist/node/backends/score-worker.cjs +934 -0
- package/dist/node/backends/score-worker.cjs.map +1 -0
- package/dist/node/backends/score-worker.d.cts +14 -0
- package/dist/node/backends/score-worker.d.ts +14 -0
- package/dist/node/backends/score-worker.js +932 -0
- package/dist/node/backends/score-worker.js.map +1 -0
- package/dist/node/index.cjs +11430 -0
- package/dist/node/index.cjs.map +1 -0
- package/dist/node/index.d.cts +554 -0
- package/dist/node/index.d.ts +554 -0
- package/dist/node/index.js +11277 -0
- package/dist/node/index.js.map +1 -0
- package/dist/types-DhUdX5Rc.d.cts +304 -0
- package/dist/types-DhUdX5Rc.d.ts +304 -0
- package/examples/01-basic-dedupe.ts +60 -0
- package/examples/02-match-two-datasets.ts +48 -0
- package/examples/03-csv-file-pipeline.ts +62 -0
- package/examples/04-string-scoring.ts +63 -0
- package/examples/05-custom-config.ts +94 -0
- package/examples/06-probabilistic-fs.ts +72 -0
- package/examples/07-pprl-privacy.ts +76 -0
- package/examples/08-streaming.ts +79 -0
- package/examples/09-llm-scorer.ts +79 -0
- package/examples/10-explain.ts +60 -0
- package/examples/11-evaluate.ts +61 -0
- package/examples/README.md +53 -0
- package/package.json +66 -0
- package/src/cli.ts +372 -0
- package/src/core/ann-blocker.ts +593 -0
- package/src/core/api.ts +220 -0
- package/src/core/autoconfig.ts +363 -0
- package/src/core/autofix.ts +102 -0
- package/src/core/blocker.ts +655 -0
- package/src/core/cluster.ts +699 -0
- package/src/core/compare-clusters.ts +176 -0
- package/src/core/config/loader.ts +869 -0
- package/src/core/cross-encoder.ts +614 -0
- package/src/core/data.ts +430 -0
- package/src/core/domain.ts +277 -0
- package/src/core/embedder.ts +562 -0
- package/src/core/evaluate.ts +156 -0
- package/src/core/explain.ts +352 -0
- package/src/core/golden.ts +524 -0
- package/src/core/graph-er.ts +371 -0
- package/src/core/index.ts +314 -0
- package/src/core/ingest.ts +112 -0
- package/src/core/learned-blocking.ts +305 -0
- package/src/core/lineage.ts +221 -0
- package/src/core/llm/budget.ts +258 -0
- package/src/core/llm/cluster.ts +542 -0
- package/src/core/llm/scorer.ts +396 -0
- package/src/core/match-one.ts +95 -0
- package/src/core/matchkey.ts +97 -0
- package/src/core/memory/corrections.ts +179 -0
- package/src/core/memory/learner.ts +218 -0
- package/src/core/memory/store.ts +114 -0
- package/src/core/pipeline.ts +366 -0
- package/src/core/pprl/protocol.ts +216 -0
- package/src/core/probabilistic.ts +511 -0
- package/src/core/profiler.ts +212 -0
- package/src/core/quality.ts +197 -0
- package/src/core/review-queue.ts +177 -0
- package/src/core/scorer.ts +855 -0
- package/src/core/sensitivity.ts +196 -0
- package/src/core/standardize.ts +279 -0
- package/src/core/streaming.ts +128 -0
- package/src/core/transforms.ts +599 -0
- package/src/core/types.ts +570 -0
- package/src/core/validate.ts +243 -0
- package/src/index.ts +8 -0
- package/src/node/a2a/server.ts +470 -0
- package/src/node/api/server.ts +412 -0
- package/src/node/backends/duckdb.ts +130 -0
- package/src/node/backends/score-worker.ts +41 -0
- package/src/node/backends/workers.ts +212 -0
- package/src/node/config-file.ts +66 -0
- package/src/node/connectors/base.ts +57 -0
- package/src/node/connectors/bigquery.ts +61 -0
- package/src/node/connectors/databricks.ts +69 -0
- package/src/node/connectors/file.ts +350 -0
- package/src/node/connectors/hubspot.ts +62 -0
- package/src/node/connectors/index.ts +43 -0
- package/src/node/connectors/salesforce.ts +93 -0
- package/src/node/connectors/snowflake.ts +73 -0
- package/src/node/db/postgres.ts +173 -0
- package/src/node/db/sync.ts +103 -0
- package/src/node/dedupe-file.ts +156 -0
- package/src/node/index.ts +89 -0
- package/src/node/mcp/server.ts +940 -0
- package/src/node/tui/app.ts +756 -0
- package/src/node/tui/index.ts +6 -0
- package/src/node/tui/widgets.ts +128 -0
- package/tests/parity/scorer-ground-truth.test.ts +118 -0
- package/tests/smoke.test.ts +46 -0
- package/tests/unit/a2a-server.test.ts +175 -0
- package/tests/unit/ann-blocker.test.ts +117 -0
- package/tests/unit/api-server.test.ts +239 -0
- package/tests/unit/api.test.ts +77 -0
- package/tests/unit/autoconfig.test.ts +103 -0
- package/tests/unit/autofix.test.ts +71 -0
- package/tests/unit/blocker.test.ts +164 -0
- package/tests/unit/buildBlocksAsync.test.ts +63 -0
- package/tests/unit/cluster.test.ts +213 -0
- package/tests/unit/compare-clusters.test.ts +42 -0
- package/tests/unit/config-loader.test.ts +301 -0
- package/tests/unit/connectors-base.test.ts +48 -0
- package/tests/unit/cross-encoder-model.test.ts +198 -0
- package/tests/unit/cross-encoder.test.ts +173 -0
- package/tests/unit/db-connectors.test.ts +37 -0
- package/tests/unit/domain.test.ts +80 -0
- package/tests/unit/embedder.test.ts +151 -0
- package/tests/unit/evaluate.test.ts +85 -0
- package/tests/unit/explain.test.ts +73 -0
- package/tests/unit/golden.test.ts +97 -0
- package/tests/unit/graph-er.test.ts +173 -0
- package/tests/unit/hnsw-ann.test.ts +283 -0
- package/tests/unit/hubspot-connector.test.ts +118 -0
- package/tests/unit/ingest.test.ts +97 -0
- package/tests/unit/learned-blocking.test.ts +134 -0
- package/tests/unit/lineage.test.ts +135 -0
- package/tests/unit/match-one.test.ts +129 -0
- package/tests/unit/matchkey.test.ts +97 -0
- package/tests/unit/mcp-server.test.ts +183 -0
- package/tests/unit/memory.test.ts +119 -0
- package/tests/unit/pipeline.test.ts +118 -0
- package/tests/unit/pprl-protocol.test.ts +381 -0
- package/tests/unit/probabilistic.test.ts +494 -0
- package/tests/unit/profiler.test.ts +68 -0
- package/tests/unit/review-queue.test.ts +68 -0
- package/tests/unit/salesforce-connector.test.ts +148 -0
- package/tests/unit/scorer.test.ts +301 -0
- package/tests/unit/sensitivity.test.ts +154 -0
- package/tests/unit/standardize.test.ts +84 -0
- package/tests/unit/streaming.test.ts +82 -0
- package/tests/unit/transforms.test.ts +208 -0
- package/tests/unit/tui-widgets.test.ts +42 -0
- package/tests/unit/tui.test.ts +24 -0
- package/tests/unit/validate.test.ts +145 -0
- package/tests/unit/workers-parallel.test.ts +99 -0
- package/tests/unit/workers.test.ts +74 -0
- package/tsconfig.json +25 -0
- package/tsup.config.ts +37 -0
- package/vitest.config.ts +11 -0
package/README.md
ADDED
|
@@ -0,0 +1,140 @@
|
|
|
1
|
+
# GoldenMatch (TypeScript)
|
|
2
|
+
|
|
3
|
+
**Entity resolution toolkit for Node.js and edge runtimes. Deduplicate, match, and create golden records — in TypeScript.**
|
|
4
|
+
|
|
5
|
+
```bash
|
|
6
|
+
npm install goldenmatch
|
|
7
|
+
```
|
|
8
|
+
|
|
9
|
+
[npm package](https://www.npmjs.com/package/goldenmatch)
|
|
10
|
+
[Node.js](https://nodejs.org/)
|
|
11
|
+
[License](https://github.com/benzsevern/goldenmatch/blob/main/LICENSE)
|
|
12
|
+
[Tests](https://github.com/benzsevern/goldenmatch/tree/main/packages/goldenmatch-js/tests)
|
|
13
|
+
|
|
14
|
+
---
|
|
15
|
+
|
|
16
|
+
## Why this port?
|
|
17
|
+
|
|
18
|
+
- **Edge-safe core** — the matching engine runs in browsers, Workers, Vercel Edge Runtime, and Deno
|
|
19
|
+
- **Pure TypeScript** — no native dependencies required; peer deps unlock performance (hnswlib, ONNX, piscina)
|
|
20
|
+
- **Feature parity with Python goldenmatch** — same scorers, same clustering, same YAML configs
|
|
21
|
+
- **478 tests, strict TypeScript** — `noUncheckedIndexedAccess`, `exactOptionalPropertyTypes`
|
|
22
|
+
|
|
23
|
+
## Quick Start
|
|
24
|
+
|
|
25
|
+
```typescript
|
|
26
|
+
import { dedupe } from "goldenmatch";
|
|
27
|
+
|
|
28
|
+
const rows = [
|
|
29
|
+
{ id: 1, name: "John Smith", email: "john@example.com", zip: "12345" },
|
|
30
|
+
{ id: 2, name: "Jon Smith", email: "john@example.com", zip: "12345" },
|
|
31
|
+
{ id: 3, name: "Jane Doe", email: "jane@example.com", zip: "54321" },
|
|
32
|
+
];
|
|
33
|
+
|
|
34
|
+
const result = dedupe(rows, {
|
|
35
|
+
fuzzy: { name: 0.85 },
|
|
36
|
+
blocking: ["zip"],
|
|
37
|
+
threshold: 0.85,
|
|
38
|
+
});
|
|
39
|
+
|
|
40
|
+
console.log(result.stats);
|
|
41
|
+
// { totalRecords: 3, totalClusters: 2, matchRate: 0.67, ... }
|
|
42
|
+
|
|
43
|
+
for (const record of result.goldenRecords) {
|
|
44
|
+
console.log(record);
|
|
45
|
+
}
|
|
46
|
+
```
|
|
47
|
+
|
|
48
|
+
## Three entrypoints
|
|
49
|
+
|
|
50
|
+
```typescript
|
|
51
|
+
import { dedupe, match, scoreStrings } from "goldenmatch"; // edge-safe core
|
|
52
|
+
import { readFile, writeCsv } from "goldenmatch/node"; // Node-only file I/O
|
|
53
|
+
// CLI: `npx goldenmatch-js dedupe data.csv --output golden.csv`
|
|
54
|
+
```
|
|
55
|
+
|
|
56
|
+
## Feature matrix
|
|
57
|
+
|
|
58
|
+
### Scoring algorithms
|
|
59
|
+
- Exact, Jaro-Winkler, Levenshtein, Token-Sort, Soundex, Dice, Jaccard, Ensemble
|
|
60
|
+
- Probabilistic (Fellegi-Sunter with Splink-style EM)
|
|
61
|
+
- LLM scorer (OpenAI/Anthropic via fetch — edge-safe)
|
|
62
|
+
- Cross-encoder reranking (via @huggingface/transformers)
|
|
63
|
+
|
|
64
|
+
### Blocking strategies
|
|
65
|
+
- Static, multi-pass, sorted-neighborhood, adaptive
|
|
66
|
+
- ANN (approximate nearest neighbor via hnswlib-node peer dep or brute-force)
|
|
67
|
+
- Canopy (TF-IDF)
|
|
68
|
+
- Learned (data-driven predicate selection)
|
|
69
|
+
|
|
70
|
+
### Golden record strategies
|
|
71
|
+
- most_complete, majority_vote, source_priority, most_recent, first_non_null
|
|
72
|
+
- Full provenance tracking
|
|
73
|
+
|
|
74
|
+
### Pipeline features
|
|
75
|
+
- PPRL (privacy-preserving record linkage, 3 security levels with HMAC-SHA256)
|
|
76
|
+
- Graph ER (multi-table entity resolution with evidence propagation)
|
|
77
|
+
- Sensitivity analysis (parameter sweep with CCMS/TWI)
|
|
78
|
+
- Streaming (incremental single-record matching)
|
|
79
|
+
- Memory (persistent corrections + threshold learning)
|
|
80
|
+
- Review queue (human-in-the-loop)
|
|
81
|
+
|
|
82
|
+
## Optional peer deps
|
|
83
|
+
|
|
84
|
+
The package installs and works with zero dependencies. The optional peer dependencies below unlock advanced features:
|
|
85
|
+
|
|
86
|
+
| Peer dep | What it enables |
|
|
87
|
+
|---|---|
|
|
88
|
+
| `yaml` | YAML config file loading |
|
|
89
|
+
| `hnswlib-node` | True sub-linear ANN blocking (vs brute-force) |
|
|
90
|
+
| `@huggingface/transformers` | ONNX cross-encoder reranking (MiniLM) |
|
|
91
|
+
| `piscina` | Worker-thread parallel block scoring |
|
|
92
|
+
| `ink` + `react` | Interactive terminal UI |
|
|
93
|
+
| `ink-table`, `ink-select-input`, `ink-text-input`, `ink-spinner`, `ink-gradient` | Richer TUI widgets |
|
|
94
|
+
| `pg` | Postgres connector + sync |
|
|
95
|
+
| `@duckdb/node-api` | DuckDB connector |
|
|
96
|
+
| `snowflake-sdk`, `@google-cloud/bigquery`, `@databricks/sql` | Cloud warehouse connectors |
|
|
97
|
+
|
|
98
|
+
## Servers
|
|
99
|
+
|
|
100
|
+
```bash
|
|
101
|
+
# MCP server (for Claude Desktop / Code)
|
|
102
|
+
npx goldenmatch-js mcp-serve
|
|
103
|
+
|
|
104
|
+
# REST API
|
|
105
|
+
npx goldenmatch-js serve --port 8000
|
|
106
|
+
|
|
107
|
+
# A2A agent server
|
|
108
|
+
npx goldenmatch-js agent-serve --port 8200
|
|
109
|
+
|
|
110
|
+
# Interactive TUI
|
|
111
|
+
npx goldenmatch-js tui data.csv
|
|
112
|
+
```
|
|
113
|
+
|
|
114
|
+
## CLI commands
|
|
115
|
+
|
|
116
|
+
```
|
|
117
|
+
goldenmatch-js dedupe <files...> Deduplicate records
|
|
118
|
+
goldenmatch-js match <target> <ref> Match target against reference
|
|
119
|
+
goldenmatch-js score <a> <b> Score similarity between two strings
|
|
120
|
+
goldenmatch-js info Show scorers, strategies, transforms
|
|
121
|
+
goldenmatch-js profile <file> Profile a dataset
|
|
122
|
+
goldenmatch-js demo Run a quick demo on synthetic data
|
|
123
|
+
goldenmatch-js mcp-serve Start MCP server (stdio)
|
|
124
|
+
goldenmatch-js serve Start REST API
|
|
125
|
+
goldenmatch-js agent-serve Start A2A agent
|
|
126
|
+
goldenmatch-js tui Interactive terminal UI
|
|
127
|
+
```
|
|
128
|
+
|
|
129
|
+
## Examples
|
|
130
|
+
|
|
131
|
+
See [`examples/`](./examples) for 10+ full examples covering basic dedupe, CSV pipelines,
|
|
132
|
+
probabilistic matching (Fellegi-Sunter), PPRL, streaming, LLM scoring, explanations, and evaluation.
|
|
133
|
+
|
|
134
|
+
## Documentation
|
|
135
|
+
|
|
136
|
+
Full docs: https://benzsevern.github.io/goldenmatch/typescript
|
|
137
|
+
|
|
138
|
+
## License
|
|
139
|
+
|
|
140
|
+
MIT. See [LICENSE](https://github.com/benzsevern/goldenmatch/blob/main/LICENSE).
|