goldenmatch 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (162) hide show
  1. package/README.md +140 -0
  2. package/dist/cli.cjs +6079 -0
  3. package/dist/cli.cjs.map +1 -0
  4. package/dist/cli.d.cts +1 -0
  5. package/dist/cli.d.ts +1 -0
  6. package/dist/cli.js +6076 -0
  7. package/dist/cli.js.map +1 -0
  8. package/dist/core/index.cjs +8449 -0
  9. package/dist/core/index.cjs.map +1 -0
  10. package/dist/core/index.d.cts +1972 -0
  11. package/dist/core/index.d.ts +1972 -0
  12. package/dist/core/index.js +8318 -0
  13. package/dist/core/index.js.map +1 -0
  14. package/dist/index.cjs +8449 -0
  15. package/dist/index.cjs.map +1 -0
  16. package/dist/index.d.cts +2 -0
  17. package/dist/index.d.ts +2 -0
  18. package/dist/index.js +8318 -0
  19. package/dist/index.js.map +1 -0
  20. package/dist/node/backends/score-worker.cjs +934 -0
  21. package/dist/node/backends/score-worker.cjs.map +1 -0
  22. package/dist/node/backends/score-worker.d.cts +14 -0
  23. package/dist/node/backends/score-worker.d.ts +14 -0
  24. package/dist/node/backends/score-worker.js +932 -0
  25. package/dist/node/backends/score-worker.js.map +1 -0
  26. package/dist/node/index.cjs +11430 -0
  27. package/dist/node/index.cjs.map +1 -0
  28. package/dist/node/index.d.cts +554 -0
  29. package/dist/node/index.d.ts +554 -0
  30. package/dist/node/index.js +11277 -0
  31. package/dist/node/index.js.map +1 -0
  32. package/dist/types-DhUdX5Rc.d.cts +304 -0
  33. package/dist/types-DhUdX5Rc.d.ts +304 -0
  34. package/examples/01-basic-dedupe.ts +60 -0
  35. package/examples/02-match-two-datasets.ts +48 -0
  36. package/examples/03-csv-file-pipeline.ts +62 -0
  37. package/examples/04-string-scoring.ts +63 -0
  38. package/examples/05-custom-config.ts +94 -0
  39. package/examples/06-probabilistic-fs.ts +72 -0
  40. package/examples/07-pprl-privacy.ts +76 -0
  41. package/examples/08-streaming.ts +79 -0
  42. package/examples/09-llm-scorer.ts +79 -0
  43. package/examples/10-explain.ts +60 -0
  44. package/examples/11-evaluate.ts +61 -0
  45. package/examples/README.md +53 -0
  46. package/package.json +66 -0
  47. package/src/cli.ts +372 -0
  48. package/src/core/ann-blocker.ts +593 -0
  49. package/src/core/api.ts +220 -0
  50. package/src/core/autoconfig.ts +363 -0
  51. package/src/core/autofix.ts +102 -0
  52. package/src/core/blocker.ts +655 -0
  53. package/src/core/cluster.ts +699 -0
  54. package/src/core/compare-clusters.ts +176 -0
  55. package/src/core/config/loader.ts +869 -0
  56. package/src/core/cross-encoder.ts +614 -0
  57. package/src/core/data.ts +430 -0
  58. package/src/core/domain.ts +277 -0
  59. package/src/core/embedder.ts +562 -0
  60. package/src/core/evaluate.ts +156 -0
  61. package/src/core/explain.ts +352 -0
  62. package/src/core/golden.ts +524 -0
  63. package/src/core/graph-er.ts +371 -0
  64. package/src/core/index.ts +314 -0
  65. package/src/core/ingest.ts +112 -0
  66. package/src/core/learned-blocking.ts +305 -0
  67. package/src/core/lineage.ts +221 -0
  68. package/src/core/llm/budget.ts +258 -0
  69. package/src/core/llm/cluster.ts +542 -0
  70. package/src/core/llm/scorer.ts +396 -0
  71. package/src/core/match-one.ts +95 -0
  72. package/src/core/matchkey.ts +97 -0
  73. package/src/core/memory/corrections.ts +179 -0
  74. package/src/core/memory/learner.ts +218 -0
  75. package/src/core/memory/store.ts +114 -0
  76. package/src/core/pipeline.ts +366 -0
  77. package/src/core/pprl/protocol.ts +216 -0
  78. package/src/core/probabilistic.ts +511 -0
  79. package/src/core/profiler.ts +212 -0
  80. package/src/core/quality.ts +197 -0
  81. package/src/core/review-queue.ts +177 -0
  82. package/src/core/scorer.ts +855 -0
  83. package/src/core/sensitivity.ts +196 -0
  84. package/src/core/standardize.ts +279 -0
  85. package/src/core/streaming.ts +128 -0
  86. package/src/core/transforms.ts +599 -0
  87. package/src/core/types.ts +570 -0
  88. package/src/core/validate.ts +243 -0
  89. package/src/index.ts +8 -0
  90. package/src/node/a2a/server.ts +470 -0
  91. package/src/node/api/server.ts +412 -0
  92. package/src/node/backends/duckdb.ts +130 -0
  93. package/src/node/backends/score-worker.ts +41 -0
  94. package/src/node/backends/workers.ts +212 -0
  95. package/src/node/config-file.ts +66 -0
  96. package/src/node/connectors/base.ts +57 -0
  97. package/src/node/connectors/bigquery.ts +61 -0
  98. package/src/node/connectors/databricks.ts +69 -0
  99. package/src/node/connectors/file.ts +350 -0
  100. package/src/node/connectors/hubspot.ts +62 -0
  101. package/src/node/connectors/index.ts +43 -0
  102. package/src/node/connectors/salesforce.ts +93 -0
  103. package/src/node/connectors/snowflake.ts +73 -0
  104. package/src/node/db/postgres.ts +173 -0
  105. package/src/node/db/sync.ts +103 -0
  106. package/src/node/dedupe-file.ts +156 -0
  107. package/src/node/index.ts +89 -0
  108. package/src/node/mcp/server.ts +940 -0
  109. package/src/node/tui/app.ts +756 -0
  110. package/src/node/tui/index.ts +6 -0
  111. package/src/node/tui/widgets.ts +128 -0
  112. package/tests/parity/scorer-ground-truth.test.ts +118 -0
  113. package/tests/smoke.test.ts +46 -0
  114. package/tests/unit/a2a-server.test.ts +175 -0
  115. package/tests/unit/ann-blocker.test.ts +117 -0
  116. package/tests/unit/api-server.test.ts +239 -0
  117. package/tests/unit/api.test.ts +77 -0
  118. package/tests/unit/autoconfig.test.ts +103 -0
  119. package/tests/unit/autofix.test.ts +71 -0
  120. package/tests/unit/blocker.test.ts +164 -0
  121. package/tests/unit/buildBlocksAsync.test.ts +63 -0
  122. package/tests/unit/cluster.test.ts +213 -0
  123. package/tests/unit/compare-clusters.test.ts +42 -0
  124. package/tests/unit/config-loader.test.ts +301 -0
  125. package/tests/unit/connectors-base.test.ts +48 -0
  126. package/tests/unit/cross-encoder-model.test.ts +198 -0
  127. package/tests/unit/cross-encoder.test.ts +173 -0
  128. package/tests/unit/db-connectors.test.ts +37 -0
  129. package/tests/unit/domain.test.ts +80 -0
  130. package/tests/unit/embedder.test.ts +151 -0
  131. package/tests/unit/evaluate.test.ts +85 -0
  132. package/tests/unit/explain.test.ts +73 -0
  133. package/tests/unit/golden.test.ts +97 -0
  134. package/tests/unit/graph-er.test.ts +173 -0
  135. package/tests/unit/hnsw-ann.test.ts +283 -0
  136. package/tests/unit/hubspot-connector.test.ts +118 -0
  137. package/tests/unit/ingest.test.ts +97 -0
  138. package/tests/unit/learned-blocking.test.ts +134 -0
  139. package/tests/unit/lineage.test.ts +135 -0
  140. package/tests/unit/match-one.test.ts +129 -0
  141. package/tests/unit/matchkey.test.ts +97 -0
  142. package/tests/unit/mcp-server.test.ts +183 -0
  143. package/tests/unit/memory.test.ts +119 -0
  144. package/tests/unit/pipeline.test.ts +118 -0
  145. package/tests/unit/pprl-protocol.test.ts +381 -0
  146. package/tests/unit/probabilistic.test.ts +494 -0
  147. package/tests/unit/profiler.test.ts +68 -0
  148. package/tests/unit/review-queue.test.ts +68 -0
  149. package/tests/unit/salesforce-connector.test.ts +148 -0
  150. package/tests/unit/scorer.test.ts +301 -0
  151. package/tests/unit/sensitivity.test.ts +154 -0
  152. package/tests/unit/standardize.test.ts +84 -0
  153. package/tests/unit/streaming.test.ts +82 -0
  154. package/tests/unit/transforms.test.ts +208 -0
  155. package/tests/unit/tui-widgets.test.ts +42 -0
  156. package/tests/unit/tui.test.ts +24 -0
  157. package/tests/unit/validate.test.ts +145 -0
  158. package/tests/unit/workers-parallel.test.ts +99 -0
  159. package/tests/unit/workers.test.ts +74 -0
  160. package/tsconfig.json +25 -0
  161. package/tsup.config.ts +37 -0
  162. package/vitest.config.ts +11 -0
package/README.md ADDED
@@ -0,0 +1,140 @@
1
+ # GoldenMatch (TypeScript)
2
+
3
+ **Entity resolution toolkit for Node.js and edge runtimes. Deduplicate, match, and create golden records — in TypeScript.**
4
+
5
+ ```bash
6
+ npm install goldenmatch
7
+ ```
8
+
9
+ [![npm](https://img.shields.io/npm/v/goldenmatch?color=d4a017)](https://www.npmjs.com/package/goldenmatch)
10
+ [![Node](https://img.shields.io/node/v/goldenmatch?color=339933)](https://nodejs.org/)
11
+ [![License: MIT](https://img.shields.io/badge/license-MIT-green)](https://github.com/benzsevern/goldenmatch/blob/main/LICENSE)
12
+ [![Tests](https://img.shields.io/badge/tests-478%20passing-brightgreen)](https://github.com/benzsevern/goldenmatch/tree/main/packages/goldenmatch-js/tests)
13
+
14
+ ---
15
+
16
+ ## Why this port?
17
+
18
+ - **Edge-safe core** — the matching engine runs in browsers, Workers, Vercel Edge Runtime, and Deno
19
+ - **Pure TypeScript** — no native dependencies required; optional peer deps unlock extra performance (`hnswlib-node`, ONNX via `@huggingface/transformers`, `piscina`)
20
+ - **Feature parity with Python goldenmatch** — same scorers, same clustering, same YAML configs
21
+ - **478 tests, strict TypeScript** — `noUncheckedIndexedAccess`, `exactOptionalPropertyTypes`
22
+
23
+ ## Quick Start
24
+
25
+ ```typescript
26
+ import { dedupe } from "goldenmatch";
27
+
28
+ const rows = [
29
+ { id: 1, name: "John Smith", email: "john@example.com", zip: "12345" },
30
+ { id: 2, name: "Jon Smith", email: "john@example.com", zip: "12345" },
31
+ { id: 3, name: "Jane Doe", email: "jane@example.com", zip: "54321" },
32
+ ];
33
+
34
+ const result = dedupe(rows, {
35
+ fuzzy: { name: 0.85 },
36
+ blocking: ["zip"],
37
+ threshold: 0.85,
38
+ });
39
+
40
+ console.log(result.stats);
41
+ // { totalRecords: 3, totalClusters: 2, matchRate: 0.67, ... }
42
+
43
+ for (const record of result.goldenRecords) {
44
+ console.log(record);
45
+ }
46
+ ```
47
+
48
+ ## Three entrypoints
49
+
50
+ ```typescript
51
+ import { dedupe, match, scoreStrings } from "goldenmatch"; // edge-safe core
52
+ import { readFile, writeCsv } from "goldenmatch/node"; // Node-only file I/O
53
+ // CLI: `npx goldenmatch-js dedupe data.csv --output golden.csv`
54
+ ```
55
+
56
+ ## Feature matrix
57
+
58
+ ### Scoring algorithms
59
+ - Exact, Jaro-Winkler, Levenshtein, Token-Sort, Soundex, Dice, Jaccard, Ensemble
60
+ - Probabilistic (Fellegi-Sunter with Splink-style EM)
61
+ - LLM scorer (OpenAI/Anthropic via fetch — edge-safe)
62
+ - Cross-encoder reranking (via @huggingface/transformers)
63
+
64
+ ### Blocking strategies
65
+ - Static, multi-pass, sorted-neighborhood, adaptive
66
+ - ANN (approximate nearest neighbor; uses the optional `hnswlib-node` peer dep, or brute-force search without it)
67
+ - Canopy (TF-IDF)
68
+ - Learned (data-driven predicate selection)
69
+
70
+ ### Golden record strategies
71
+ - most_complete, majority_vote, source_priority, most_recent, first_non_null
72
+ - Full provenance tracking
73
+
74
+ ### Pipeline features
75
+ - PPRL (privacy-preserving record linkage, 3 security levels with HMAC-SHA256)
76
+ - Graph ER (multi-table entity resolution with evidence propagation)
77
+ - Sensitivity analysis (parameter sweep with CCMS/TWI)
78
+ - Streaming (incremental single-record matching)
79
+ - Memory (persistent corrections + threshold learning)
80
+ - Review queue (human-in-the-loop)
81
+
82
+ ## Optional peer deps
83
+
84
+ GoldenMatch installs and runs with zero dependencies. The optional peer dependencies below unlock advanced features:
85
+
86
+ | Peer dep | What it enables |
87
+ |---|---|
88
+ | `yaml` | YAML config file loading |
89
+ | `hnswlib-node` | True sub-linear ANN blocking (vs brute-force) |
90
+ | `@huggingface/transformers` | ONNX cross-encoder reranking (MiniLM) |
91
+ | `piscina` | Worker-thread parallel block scoring |
92
+ | `ink` + `react` | Interactive terminal UI |
93
+ | `ink-table`, `ink-select-input`, `ink-text-input`, `ink-spinner`, `ink-gradient` | Richer TUI widgets |
94
+ | `pg` | Postgres connector + sync |
95
+ | `@duckdb/node-api` | DuckDB connector |
96
+ | `snowflake-sdk`, `@google-cloud/bigquery`, `@databricks/sql` | Cloud warehouse connectors |
97
+
98
+ ## Servers
99
+
100
+ ```bash
101
+ # MCP server (for Claude Desktop / Claude Code)
102
+ npx goldenmatch-js mcp-serve
103
+
104
+ # REST API
105
+ npx goldenmatch-js serve --port 8000
106
+
107
+ # A2A agent server
108
+ npx goldenmatch-js agent-serve --port 8200
109
+
110
+ # Interactive TUI
111
+ npx goldenmatch-js tui data.csv
112
+ ```
113
+
114
+ ## CLI commands
115
+
116
+ ```
117
+ goldenmatch-js dedupe <files...> Deduplicate records
118
+ goldenmatch-js match <target> <ref> Match target against reference
119
+ goldenmatch-js score <a> <b> Score similarity between two strings
120
+ goldenmatch-js info Show scorers, strategies, transforms
121
+ goldenmatch-js profile <file> Profile a dataset
122
+ goldenmatch-js demo Run a quick demo on synthetic data
123
+ goldenmatch-js mcp-serve Start MCP server (stdio)
124
+ goldenmatch-js serve Start REST API
125
+ goldenmatch-js agent-serve Start A2A agent
126
+ goldenmatch-js tui Interactive terminal UI
127
+ ```
128
+
129
+ ## Examples
130
+
131
+ See [`examples/`](./examples) for 10+ full examples covering basic dedupe, CSV pipelines,
132
+ probabilistic matching (Fellegi-Sunter), PPRL, streaming, LLM scoring, explanations, and evaluation.
133
+
134
+ ## Documentation
135
+
136
+ Full docs: https://benzsevern.github.io/goldenmatch/typescript
137
+
138
+ ## License
139
+
140
+ MIT. See [LICENSE](https://github.com/benzsevern/goldenmatch/blob/main/LICENSE).