goldenmatch 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +140 -0
- package/dist/cli.cjs +6079 -0
- package/dist/cli.cjs.map +1 -0
- package/dist/cli.d.cts +1 -0
- package/dist/cli.d.ts +1 -0
- package/dist/cli.js +6076 -0
- package/dist/cli.js.map +1 -0
- package/dist/core/index.cjs +8449 -0
- package/dist/core/index.cjs.map +1 -0
- package/dist/core/index.d.cts +1972 -0
- package/dist/core/index.d.ts +1972 -0
- package/dist/core/index.js +8318 -0
- package/dist/core/index.js.map +1 -0
- package/dist/index.cjs +8449 -0
- package/dist/index.cjs.map +1 -0
- package/dist/index.d.cts +2 -0
- package/dist/index.d.ts +2 -0
- package/dist/index.js +8318 -0
- package/dist/index.js.map +1 -0
- package/dist/node/backends/score-worker.cjs +934 -0
- package/dist/node/backends/score-worker.cjs.map +1 -0
- package/dist/node/backends/score-worker.d.cts +14 -0
- package/dist/node/backends/score-worker.d.ts +14 -0
- package/dist/node/backends/score-worker.js +932 -0
- package/dist/node/backends/score-worker.js.map +1 -0
- package/dist/node/index.cjs +11430 -0
- package/dist/node/index.cjs.map +1 -0
- package/dist/node/index.d.cts +554 -0
- package/dist/node/index.d.ts +554 -0
- package/dist/node/index.js +11277 -0
- package/dist/node/index.js.map +1 -0
- package/dist/types-DhUdX5Rc.d.cts +304 -0
- package/dist/types-DhUdX5Rc.d.ts +304 -0
- package/examples/01-basic-dedupe.ts +60 -0
- package/examples/02-match-two-datasets.ts +48 -0
- package/examples/03-csv-file-pipeline.ts +62 -0
- package/examples/04-string-scoring.ts +63 -0
- package/examples/05-custom-config.ts +94 -0
- package/examples/06-probabilistic-fs.ts +72 -0
- package/examples/07-pprl-privacy.ts +76 -0
- package/examples/08-streaming.ts +79 -0
- package/examples/09-llm-scorer.ts +79 -0
- package/examples/10-explain.ts +60 -0
- package/examples/11-evaluate.ts +61 -0
- package/examples/README.md +53 -0
- package/package.json +66 -0
- package/src/cli.ts +372 -0
- package/src/core/ann-blocker.ts +593 -0
- package/src/core/api.ts +220 -0
- package/src/core/autoconfig.ts +363 -0
- package/src/core/autofix.ts +102 -0
- package/src/core/blocker.ts +655 -0
- package/src/core/cluster.ts +699 -0
- package/src/core/compare-clusters.ts +176 -0
- package/src/core/config/loader.ts +869 -0
- package/src/core/cross-encoder.ts +614 -0
- package/src/core/data.ts +430 -0
- package/src/core/domain.ts +277 -0
- package/src/core/embedder.ts +562 -0
- package/src/core/evaluate.ts +156 -0
- package/src/core/explain.ts +352 -0
- package/src/core/golden.ts +524 -0
- package/src/core/graph-er.ts +371 -0
- package/src/core/index.ts +314 -0
- package/src/core/ingest.ts +112 -0
- package/src/core/learned-blocking.ts +305 -0
- package/src/core/lineage.ts +221 -0
- package/src/core/llm/budget.ts +258 -0
- package/src/core/llm/cluster.ts +542 -0
- package/src/core/llm/scorer.ts +396 -0
- package/src/core/match-one.ts +95 -0
- package/src/core/matchkey.ts +97 -0
- package/src/core/memory/corrections.ts +179 -0
- package/src/core/memory/learner.ts +218 -0
- package/src/core/memory/store.ts +114 -0
- package/src/core/pipeline.ts +366 -0
- package/src/core/pprl/protocol.ts +216 -0
- package/src/core/probabilistic.ts +511 -0
- package/src/core/profiler.ts +212 -0
- package/src/core/quality.ts +197 -0
- package/src/core/review-queue.ts +177 -0
- package/src/core/scorer.ts +855 -0
- package/src/core/sensitivity.ts +196 -0
- package/src/core/standardize.ts +279 -0
- package/src/core/streaming.ts +128 -0
- package/src/core/transforms.ts +599 -0
- package/src/core/types.ts +570 -0
- package/src/core/validate.ts +243 -0
- package/src/index.ts +8 -0
- package/src/node/a2a/server.ts +470 -0
- package/src/node/api/server.ts +412 -0
- package/src/node/backends/duckdb.ts +130 -0
- package/src/node/backends/score-worker.ts +41 -0
- package/src/node/backends/workers.ts +212 -0
- package/src/node/config-file.ts +66 -0
- package/src/node/connectors/base.ts +57 -0
- package/src/node/connectors/bigquery.ts +61 -0
- package/src/node/connectors/databricks.ts +69 -0
- package/src/node/connectors/file.ts +350 -0
- package/src/node/connectors/hubspot.ts +62 -0
- package/src/node/connectors/index.ts +43 -0
- package/src/node/connectors/salesforce.ts +93 -0
- package/src/node/connectors/snowflake.ts +73 -0
- package/src/node/db/postgres.ts +173 -0
- package/src/node/db/sync.ts +103 -0
- package/src/node/dedupe-file.ts +156 -0
- package/src/node/index.ts +89 -0
- package/src/node/mcp/server.ts +940 -0
- package/src/node/tui/app.ts +756 -0
- package/src/node/tui/index.ts +6 -0
- package/src/node/tui/widgets.ts +128 -0
- package/tests/parity/scorer-ground-truth.test.ts +118 -0
- package/tests/smoke.test.ts +46 -0
- package/tests/unit/a2a-server.test.ts +175 -0
- package/tests/unit/ann-blocker.test.ts +117 -0
- package/tests/unit/api-server.test.ts +239 -0
- package/tests/unit/api.test.ts +77 -0
- package/tests/unit/autoconfig.test.ts +103 -0
- package/tests/unit/autofix.test.ts +71 -0
- package/tests/unit/blocker.test.ts +164 -0
- package/tests/unit/buildBlocksAsync.test.ts +63 -0
- package/tests/unit/cluster.test.ts +213 -0
- package/tests/unit/compare-clusters.test.ts +42 -0
- package/tests/unit/config-loader.test.ts +301 -0
- package/tests/unit/connectors-base.test.ts +48 -0
- package/tests/unit/cross-encoder-model.test.ts +198 -0
- package/tests/unit/cross-encoder.test.ts +173 -0
- package/tests/unit/db-connectors.test.ts +37 -0
- package/tests/unit/domain.test.ts +80 -0
- package/tests/unit/embedder.test.ts +151 -0
- package/tests/unit/evaluate.test.ts +85 -0
- package/tests/unit/explain.test.ts +73 -0
- package/tests/unit/golden.test.ts +97 -0
- package/tests/unit/graph-er.test.ts +173 -0
- package/tests/unit/hnsw-ann.test.ts +283 -0
- package/tests/unit/hubspot-connector.test.ts +118 -0
- package/tests/unit/ingest.test.ts +97 -0
- package/tests/unit/learned-blocking.test.ts +134 -0
- package/tests/unit/lineage.test.ts +135 -0
- package/tests/unit/match-one.test.ts +129 -0
- package/tests/unit/matchkey.test.ts +97 -0
- package/tests/unit/mcp-server.test.ts +183 -0
- package/tests/unit/memory.test.ts +119 -0
- package/tests/unit/pipeline.test.ts +118 -0
- package/tests/unit/pprl-protocol.test.ts +381 -0
- package/tests/unit/probabilistic.test.ts +494 -0
- package/tests/unit/profiler.test.ts +68 -0
- package/tests/unit/review-queue.test.ts +68 -0
- package/tests/unit/salesforce-connector.test.ts +148 -0
- package/tests/unit/scorer.test.ts +301 -0
- package/tests/unit/sensitivity.test.ts +154 -0
- package/tests/unit/standardize.test.ts +84 -0
- package/tests/unit/streaming.test.ts +82 -0
- package/tests/unit/transforms.test.ts +208 -0
- package/tests/unit/tui-widgets.test.ts +42 -0
- package/tests/unit/tui.test.ts +24 -0
- package/tests/unit/validate.test.ts +145 -0
- package/tests/unit/workers-parallel.test.ts +99 -0
- package/tests/unit/workers.test.ts +74 -0
- package/tsconfig.json +25 -0
- package/tsup.config.ts +37 -0
- package/vitest.config.ts +11 -0
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Match records in a "target" dataset against a "reference" dataset.
|
|
3
|
+
* Useful for: incoming leads vs. CRM, transactions vs. customers, etc.
|
|
4
|
+
*
|
|
5
|
+
* Run: npx tsx examples/02-match-two-datasets.ts
|
|
6
|
+
*/
|
|
7
|
+
import { match } from "goldenmatch";
|
|
8
|
+
|
|
9
|
+
// Reference dataset: known customers
|
|
10
|
+
const customers = [
|
|
11
|
+
{ id: "C001", name: "Acme Corp", city: "Seattle", phone: "555-1000" },
|
|
12
|
+
{ id: "C002", name: "Globex Inc", city: "Portland", phone: "555-2000" },
|
|
13
|
+
{ id: "C003", name: "Initech LLC", city: "Austin", phone: "555-3000" },
|
|
14
|
+
{ id: "C004", name: "Umbrella Co", city: "Boston", phone: "555-4000" },
|
|
15
|
+
];
|
|
16
|
+
|
|
17
|
+
// Target dataset: incoming leads (possibly dupes of customers, possibly new)
|
|
18
|
+
const leads = [
|
|
19
|
+
{ id: "L1", name: "ACME Corporation", city: "Seattle", phone: "555-1000" },
|
|
20
|
+
{ id: "L2", name: "Globex, Inc.", city: "Portland", phone: "555-2000" },
|
|
21
|
+
{ id: "L3", name: "Stark Industries", city: "New York", phone: "555-9000" },
|
|
22
|
+
{ id: "L4", name: "Initech", city: "Austin", phone: "555-3000" },
|
|
23
|
+
];
|
|
24
|
+
|
|
25
|
+
const result = match(leads, customers, {
|
|
26
|
+
fuzzy: { name: 0.75 },
|
|
27
|
+
blocking: ["city"],
|
|
28
|
+
threshold: 0.75,
|
|
29
|
+
});
|
|
30
|
+
|
|
31
|
+
console.log(`Matched leads: ${result.matched.length}`);
|
|
32
|
+
console.log(`Unmatched leads: ${result.unmatched.length}\n`);
|
|
33
|
+
|
|
34
|
+
console.log("Matched (likely existing customers):");
|
|
35
|
+
for (const row of result.matched) {
|
|
36
|
+
console.log(" ", row);
|
|
37
|
+
}
|
|
38
|
+
|
|
39
|
+
console.log("\nUnmatched (likely new leads):");
|
|
40
|
+
for (const row of result.unmatched) {
|
|
41
|
+
console.log(" ", row);
|
|
42
|
+
}
|
|
43
|
+
|
|
44
|
+
/**
|
|
45
|
+
* Python -> TS differences:
|
|
46
|
+
* - Python `match()` returns a MatchResult with DataFrames; TS returns arrays.
|
|
47
|
+
* - `result.stats` is a generic record; shape differs slightly across modes.
|
|
48
|
+
*/
|
|
@@ -0,0 +1,62 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* CSV file pipeline: read CSV -> dedupe -> write golden records back to CSV.
|
|
3
|
+
*
|
|
4
|
+
* This example uses `goldenmatch/node` (the Node-only subpackage) for
|
|
5
|
+
* file I/O. The core `goldenmatch` package is edge-safe and doesn't
|
|
6
|
+
* import `node:fs`.
|
|
7
|
+
*
|
|
8
|
+
* Run: npx tsx examples/03-csv-file-pipeline.ts
|
|
9
|
+
*
|
|
10
|
+
* Prereq: a CSV file in the working directory. We create `customers_demo.csv`
|
|
11
|
+
* inline for demo purposes.
|
|
12
|
+
*/
|
|
13
|
+
import { writeFileSync, unlinkSync } from "node:fs";
|
|
14
|
+
import { dedupe } from "goldenmatch";
|
|
15
|
+
import { readFile, writeCsv } from "goldenmatch/node";
|
|
16
|
+
|
|
17
|
+
// --- Step 0: create a demo CSV (you'd skip this step in real life) ---
|
|
18
|
+
const DEMO_PATH = "customers_demo.csv";
|
|
19
|
+
const GOLDEN_PATH = "golden_demo.csv";
|
|
20
|
+
writeFileSync(
|
|
21
|
+
DEMO_PATH,
|
|
22
|
+
[
|
|
23
|
+
"id,name,email,zip",
|
|
24
|
+
"1,John Smith,john@example.com,12345",
|
|
25
|
+
"2,Jon Smith,john@example.com,12345",
|
|
26
|
+
"3,Jane Doe,jane@example.com,54321",
|
|
27
|
+
"4,J. Smith,john@example.com,12345",
|
|
28
|
+
"5,Janet Doe,janet@example.com,54321",
|
|
29
|
+
].join("\n"),
|
|
30
|
+
);
|
|
31
|
+
|
|
32
|
+
// --- Step 1: read the CSV (auto-coerces numbers, handles BOM) ---
|
|
33
|
+
const rows = readFile(DEMO_PATH);
|
|
34
|
+
console.log(`Read ${rows.length} rows from ${DEMO_PATH}`);
|
|
35
|
+
|
|
36
|
+
// --- Step 2: dedupe ---
|
|
37
|
+
const result = dedupe(rows, {
|
|
38
|
+
exact: ["email"],
|
|
39
|
+
fuzzy: { name: 0.8 },
|
|
40
|
+
blocking: ["zip"],
|
|
41
|
+
threshold: 0.8,
|
|
42
|
+
});
|
|
43
|
+
|
|
44
|
+
console.log(
|
|
45
|
+
`Found ${result.stats.totalClusters} clusters from ${result.stats.totalRecords} records`,
|
|
46
|
+
);
|
|
47
|
+
|
|
48
|
+
// --- Step 3: write golden records back to CSV ---
|
|
49
|
+
writeCsv(GOLDEN_PATH, result.goldenRecords);
|
|
50
|
+
console.log(`Wrote ${result.goldenRecords.length} golden records to ${GOLDEN_PATH}`);
|
|
51
|
+
|
|
52
|
+
// Cleanup demo files
|
|
53
|
+
unlinkSync(DEMO_PATH);
|
|
54
|
+
unlinkSync(GOLDEN_PATH);
|
|
55
|
+
|
|
56
|
+
/**
|
|
57
|
+
* `readFile()` auto-detects CSV vs JSON by extension. Use `readCsv()` /
|
|
58
|
+
* `readJson()` directly if you want to pass options (delimiter, encoding).
|
|
59
|
+
*
|
|
60
|
+
* `writeCsv()` serializes an array of objects; columns are inferred from
|
|
61
|
+
* the union of keys across all rows.
|
|
62
|
+
*/
|
|
@@ -0,0 +1,63 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* String scoring: compare every scorer on the same string pairs.
|
|
3
|
+
*
|
|
4
|
+
* This is pedagogical — each scorer has different strengths:
|
|
5
|
+
* - jaro_winkler: short strings, common-prefix bonus (great for names)
|
|
6
|
+
* - levenshtein: edit distance (typos)
|
|
7
|
+
* - token_sort: order-independent (word reordering)
|
|
8
|
+
* - soundex_match: phonetic (Smith/Smyth)
|
|
9
|
+
* - dice / jaccard: bigram/set overlap
|
|
10
|
+
* - ensemble: weighted combination (default for names)
|
|
11
|
+
*
|
|
12
|
+
* Run: npx tsx examples/04-string-scoring.ts
|
|
13
|
+
*/
|
|
14
|
+
import { scoreStrings } from "goldenmatch";
|
|
15
|
+
|
|
16
|
+
const pairs: [string, string, string][] = [
|
|
17
|
+
["John Smith", "Jon Smith", "typo / common name variant"],
|
|
18
|
+
["John Smith", "Smith, John", "word reorder"],
|
|
19
|
+
["Smith", "Smyth", "phonetic equivalent"],
|
|
20
|
+
["Robert", "Bob", "nickname (no scorer handles well)"],
|
|
21
|
+
["123 Main St", "123 Main Street", "abbreviation"],
|
|
22
|
+
["apple inc", "Apple, Inc.", "punctuation/case noise"],
|
|
23
|
+
["totally", "different", "no similarity"],
|
|
24
|
+
];
|
|
25
|
+
|
|
26
|
+
const scorers = [
|
|
27
|
+
"exact",
|
|
28
|
+
"jaro_winkler",
|
|
29
|
+
"levenshtein",
|
|
30
|
+
"token_sort",
|
|
31
|
+
"soundex_match",
|
|
32
|
+
"dice",
|
|
33
|
+
"jaccard",
|
|
34
|
+
"ensemble",
|
|
35
|
+
];
|
|
36
|
+
|
|
37
|
+
// Header
|
|
38
|
+
const pad = (s: string, n: number) => s.padEnd(n); // right-pads `s` with spaces up to width `n`; longer strings are returned unchanged
|
|
39
|
+
const colWidths = [28, 28, ...scorers.map(() => 14)];
|
|
40
|
+
|
|
41
|
+
process.stdout.write(pad("A", colWidths[0]!));
|
|
42
|
+
process.stdout.write(pad("B", colWidths[1]!));
|
|
43
|
+
for (let i = 0; i < scorers.length; i++) {
|
|
44
|
+
process.stdout.write(pad(scorers[i]!, colWidths[i + 2]!));
|
|
45
|
+
}
|
|
46
|
+
process.stdout.write("\n");
|
|
47
|
+
process.stdout.write("-".repeat(colWidths.reduce((a, b) => a + b, 0)) + "\n");
|
|
48
|
+
|
|
49
|
+
for (const [a, b, _label] of pairs) {
|
|
50
|
+
process.stdout.write(pad(a, colWidths[0]!));
|
|
51
|
+
process.stdout.write(pad(b, colWidths[1]!));
|
|
52
|
+
for (let i = 0; i < scorers.length; i++) {
|
|
53
|
+
const score = scoreStrings(a, b, scorers[i]!);
|
|
54
|
+
process.stdout.write(pad(score.toFixed(2), colWidths[i + 2]!));
|
|
55
|
+
}
|
|
56
|
+
process.stdout.write("\n");
|
|
57
|
+
}
|
|
58
|
+
|
|
59
|
+
/**
|
|
60
|
+
* Expected: jaro_winkler typically wins on short names, token_sort crushes
|
|
61
|
+
* word-reorder cases, soundex_match catches Smith/Smyth, and ensemble is
|
|
62
|
+
* the most balanced default.
|
|
63
|
+
*/
|
|
@@ -0,0 +1,94 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Build a full GoldenMatchConfig manually (matchkeys, standardization,
|
|
3
|
+
* blocking, golden rules). Save to YAML, reload, use it.
|
|
4
|
+
*
|
|
5
|
+
* Run: npx tsx examples/05-custom-config.ts
|
|
6
|
+
* Optional: `npm install yaml` (peer dep for YAML I/O; the save/reload step is skipped with a warning if it is missing).
|
|
7
|
+
*/
|
|
8
|
+
import { writeFileSync, unlinkSync } from "node:fs";
|
|
9
|
+
import {
|
|
10
|
+
dedupe,
|
|
11
|
+
makeConfig,
|
|
12
|
+
makeMatchkeyConfig,
|
|
13
|
+
makeMatchkeyField,
|
|
14
|
+
makeBlockingConfig,
|
|
15
|
+
makeGoldenRulesConfig,
|
|
16
|
+
} from "goldenmatch";
|
|
17
|
+
import { loadConfigFile, writeConfigFile } from "goldenmatch/node";
|
|
18
|
+
|
|
19
|
+
// Build config programmatically
|
|
20
|
+
const config = makeConfig({
|
|
21
|
+
matchkeys: [
|
|
22
|
+
makeMatchkeyConfig({
|
|
23
|
+
name: "email_exact",
|
|
24
|
+
type: "exact",
|
|
25
|
+
fields: [makeMatchkeyField({ field: "email", transforms: ["lowercase", "strip"], scorer: "exact" })],
|
|
26
|
+
}),
|
|
27
|
+
makeMatchkeyConfig({
|
|
28
|
+
name: "identity",
|
|
29
|
+
type: "weighted",
|
|
30
|
+
threshold: 0.85,
|
|
31
|
+
fields: [
|
|
32
|
+
makeMatchkeyField({ field: "first_name", transforms: ["lowercase"], scorer: "jaro_winkler", weight: 0.3 }),
|
|
33
|
+
makeMatchkeyField({ field: "last_name", transforms: ["lowercase"], scorer: "jaro_winkler", weight: 0.4 }),
|
|
34
|
+
makeMatchkeyField({ field: "phone", transforms: ["digits_only"], scorer: "exact", weight: 0.3 }),
|
|
35
|
+
],
|
|
36
|
+
}),
|
|
37
|
+
],
|
|
38
|
+
blocking: makeBlockingConfig({
|
|
39
|
+
strategy: "multi_pass",
|
|
40
|
+
keys: [{ fields: ["zip"], transforms: ["lowercase", "strip"] }],
|
|
41
|
+
passes: [
|
|
42
|
+
{ fields: ["zip"], transforms: ["lowercase", "strip"] },
|
|
43
|
+
{ fields: ["last_name"], transforms: ["soundex"] },
|
|
44
|
+
],
|
|
45
|
+
}),
|
|
46
|
+
goldenRules: makeGoldenRulesConfig({
|
|
47
|
+
defaultStrategy: "most_complete",
|
|
48
|
+
fieldRules: {
|
|
49
|
+
email: { strategy: "first_non_null" },
|
|
50
|
+
phone: { strategy: "most_complete" },
|
|
51
|
+
},
|
|
52
|
+
}),
|
|
53
|
+
standardization: {
|
|
54
|
+
rules: {
|
|
55
|
+
email: ["email"],
|
|
56
|
+
phone: ["phone"],
|
|
57
|
+
first_name: ["strip", "name_proper"],
|
|
58
|
+
last_name: ["strip", "name_proper"],
|
|
59
|
+
},
|
|
60
|
+
},
|
|
61
|
+
threshold: 0.85,
|
|
62
|
+
});
|
|
63
|
+
|
|
64
|
+
// Save to YAML (requires `yaml` peer dep)
|
|
65
|
+
const YAML_PATH = "custom_config_demo.yml";
|
|
66
|
+
try {
|
|
67
|
+
writeConfigFile(YAML_PATH, config);
|
|
68
|
+
console.log(`Wrote config to ${YAML_PATH}`);
|
|
69
|
+
|
|
70
|
+
const reloaded = loadConfigFile(YAML_PATH);
|
|
71
|
+
console.log(`Reloaded config has ${reloaded.matchkeys?.length ?? 0} matchkeys`);
|
|
72
|
+
|
|
73
|
+
unlinkSync(YAML_PATH);
|
|
74
|
+
} catch (err) {
|
|
75
|
+
console.warn(`YAML save/load skipped: ${(err as Error).message}`);
|
|
76
|
+
console.warn("(install the `yaml` peer dep to enable: npm install yaml)");
|
|
77
|
+
}
|
|
78
|
+
|
|
79
|
+
// Use the config
|
|
80
|
+
const rows = [
|
|
81
|
+
{ id: 1, first_name: "john", last_name: "smith", email: "j@x.com", phone: "555-123-4567", zip: "12345" },
|
|
82
|
+
{ id: 2, first_name: "John", last_name: "Smyth", email: "J@X.COM", phone: "(555) 123-4567", zip: "12345" },
|
|
83
|
+
{ id: 3, first_name: "Jane", last_name: "Doe", email: "jd@x.com", phone: "555-000-0000", zip: "54321" },
|
|
84
|
+
];
|
|
85
|
+
|
|
86
|
+
const result = dedupe(rows, { config });
|
|
87
|
+
console.log(`\nDeduped: ${result.stats.totalRecords} -> ${result.stats.totalClusters} clusters`);
|
|
88
|
+
|
|
89
|
+
/**
|
|
90
|
+
* Python -> TS differences:
|
|
91
|
+
* - Python: `StandardizationConfig(rules={...})`; TS: `{ rules: {...} }` plain object.
|
|
92
|
+
* - Python: pydantic validation; TS: `makeConfig` normalizes defaults, `parseConfig`
|
|
93
|
+
* (used internally by loadConfigFile) validates YAML input.
|
|
94
|
+
*/
|
|
@@ -0,0 +1,72 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Fellegi-Sunter probabilistic matching with Splink-style EM training.
|
|
3
|
+
*
|
|
4
|
+
* The F-S model learns per-field "agreement" probabilities under match
|
|
5
|
+
* vs. non-match hypotheses, producing match weights (log-likelihood
|
|
6
|
+
* ratios). Train on unlabeled data via EM -- no ground truth required.
|
|
7
|
+
*
|
|
8
|
+
* Run: npx tsx examples/06-probabilistic-fs.ts
|
|
9
|
+
*/
|
|
10
|
+
import {
|
|
11
|
+
trainEM,
|
|
12
|
+
scoreProbabilistic,
|
|
13
|
+
makeMatchkeyConfig,
|
|
14
|
+
makeMatchkeyField,
|
|
15
|
+
} from "goldenmatch";
|
|
16
|
+
|
|
17
|
+
// Synthetic unlabeled data (each row must carry its id in `__row_id__`)
|
|
18
|
+
const rows = [
|
|
19
|
+
{ __row_id__: 0, first_name: "John", last_name: "Smith", zip: "12345" },
|
|
20
|
+
{ __row_id__: 1, first_name: "Jon", last_name: "Smith", zip: "12345" },
|
|
21
|
+
{ __row_id__: 2, first_name: "John", last_name: "Smyth", zip: "12345" },
|
|
22
|
+
{ __row_id__: 3, first_name: "Jane", last_name: "Doe", zip: "54321" },
|
|
23
|
+
{ __row_id__: 4, first_name: "Janet", last_name: "Doe", zip: "54321" },
|
|
24
|
+
{ __row_id__: 5, first_name: "Bob", last_name: "Jones", zip: "99999" },
|
|
25
|
+
{ __row_id__: 6, first_name: "Alice", last_name: "Miller", zip: "11111" },
|
|
26
|
+
{ __row_id__: 7, first_name: "Alice", last_name: "Miller", zip: "11111" },
|
|
27
|
+
];
|
|
28
|
+
|
|
29
|
+
// Build a probabilistic matchkey
|
|
30
|
+
const mk = makeMatchkeyConfig({
|
|
31
|
+
name: "fs_identity",
|
|
32
|
+
type: "probabilistic",
|
|
33
|
+
threshold: 0.5,
|
|
34
|
+
linkThreshold: 0.5,
|
|
35
|
+
fields: [
|
|
36
|
+
makeMatchkeyField({ field: "first_name", transforms: ["lowercase"], scorer: "jaro_winkler", levels: 3 }),
|
|
37
|
+
makeMatchkeyField({ field: "last_name", transforms: ["lowercase"], scorer: "jaro_winkler", levels: 3 }),
|
|
38
|
+
makeMatchkeyField({ field: "zip", transforms: [], scorer: "exact", levels: 2 }),
|
|
39
|
+
],
|
|
40
|
+
});
|
|
41
|
+
|
|
42
|
+
// Train the EM model
|
|
43
|
+
const em = trainEM(rows, mk, {
|
|
44
|
+
maxIterations: 25,
|
|
45
|
+
convergence: 1e-4,
|
|
46
|
+
blockingFields: ["zip"], // zip is used for blocking; fix neutral priors
|
|
47
|
+
seed: 42,
|
|
48
|
+
nSamplePairs: 200,
|
|
49
|
+
});
|
|
50
|
+
|
|
51
|
+
console.log(`EM converged: ${em.converged} (${em.iterations} iterations)`);
|
|
52
|
+
console.log(`Estimated p(match): ${em.proportionMatched.toFixed(3)}\n`);
|
|
53
|
+
|
|
54
|
+
console.log("Match weights (log2 m/u) per field per level:");
|
|
55
|
+
for (const [field, weights] of Object.entries(em.matchWeights)) {
|
|
56
|
+
console.log(` ${field.padEnd(12)} [${weights.map((w) => w.toFixed(2)).join(", ")}]`);
|
|
57
|
+
}
|
|
58
|
+
|
|
59
|
+
// Score all pairs in the block
|
|
60
|
+
const matches = scoreProbabilistic(rows, mk, em, { threshold: 0.5 });
|
|
61
|
+
|
|
62
|
+
console.log(`\nFound ${matches.length} probabilistic matches:`);
|
|
63
|
+
for (const m of matches) {
|
|
64
|
+
console.log(` (${m.idA}, ${m.idB}) -> ${m.score.toFixed(3)}`);
|
|
65
|
+
}
|
|
66
|
+
|
|
67
|
+
/**
|
|
68
|
+
* Python -> TS differences:
|
|
69
|
+
* - Python `train_em()` vs. TS `trainEM()` (camelCase).
|
|
70
|
+
* - Python returns numpy arrays; TS returns `readonly number[]`.
|
|
71
|
+
* - TS requires `__row_id__` to be attached; Python adds it in its DF pipeline.
|
|
72
|
+
*/
|
|
@@ -0,0 +1,76 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Privacy-preserving record linkage (PPRL) via bloom filter CLKs.
|
|
3
|
+
*
|
|
4
|
+
* Two parties want to find common records without revealing plaintext.
|
|
5
|
+
* Each side encodes rows as bloom filters (CLKs) over agreed fields,
|
|
6
|
+
* exchanges only the encoded bit-vectors, and scores via Dice similarity.
|
|
7
|
+
*
|
|
8
|
+
* Three security levels:
|
|
9
|
+
* - standard: bloom filter only (fast, demo/low-risk data)
|
|
10
|
+
* - high: HMAC-SHA256 with a salt shared by both parties (must be agreed out of band)
|
|
11
|
+
* - paranoid: balanced padding + HMAC (resists frequency analysis)
|
|
12
|
+
*
|
|
13
|
+
* Run: npx tsx examples/07-pprl-privacy.ts
|
|
14
|
+
*/
|
|
15
|
+
import { runPPRL, autoConfigurePPRL } from "goldenmatch";
|
|
16
|
+
|
|
17
|
+
// Party A (e.g., hospital)
|
|
18
|
+
const partyA = [
|
|
19
|
+
{ name: "John Smith", dob: "1980-01-15", city: "Seattle" },
|
|
20
|
+
{ name: "Jane Doe", dob: "1975-06-22", city: "Portland" },
|
|
21
|
+
{ name: "Bob Johnson", dob: "1990-11-03", city: "Austin" },
|
|
22
|
+
{ name: "Alice Miller", dob: "1985-03-18", city: "Boston" },
|
|
23
|
+
];
|
|
24
|
+
|
|
25
|
+
// Party B (e.g., insurance) -- some overlap with A
|
|
26
|
+
const partyB = [
|
|
27
|
+
{ name: "Jon Smith", dob: "1980-01-15", city: "Seattle" }, // same as A[0]
|
|
28
|
+
{ name: "Jane Doe", dob: "1975-06-22", city: "Portland" }, // same as A[1]
|
|
29
|
+
{ name: "Carol Young", dob: "1992-08-09", city: "Denver" }, // new
|
|
30
|
+
];
|
|
31
|
+
|
|
32
|
+
// --- Standard security: trusted-third-party protocol ---
|
|
33
|
+
console.log("=== Standard security (trusted third party) ===");
|
|
34
|
+
const stdResult = runPPRL(partyA, partyB, {
|
|
35
|
+
fields: ["name", "dob", "city"],
|
|
36
|
+
securityLevel: "standard",
|
|
37
|
+
protocol: "trusted_third_party",
|
|
38
|
+
threshold: 0.85,
|
|
39
|
+
});
|
|
40
|
+
for (const m of stdResult.matches) {
|
|
41
|
+
console.log(` A[${m.idA}] <-> B[${m.idB}] score=${m.score.toFixed(3)}`);
|
|
42
|
+
}
|
|
43
|
+
|
|
44
|
+
// --- High security: salted HMAC ---
|
|
45
|
+
console.log("\n=== High security (HMAC salt) ===");
|
|
46
|
+
const highResult = runPPRL(partyA, partyB, {
|
|
47
|
+
fields: ["name", "dob", "city"],
|
|
48
|
+
securityLevel: "high",
|
|
49
|
+
protocol: "trusted_third_party",
|
|
50
|
+
threshold: 0.85,
|
|
51
|
+
salt: "shared-secret-agreed-upon-out-of-band",
|
|
52
|
+
});
|
|
53
|
+
console.log(` ${highResult.matches.length} matches found`);
|
|
54
|
+
|
|
55
|
+
// --- Paranoid + SMC stub: salted, balanced padding, SMC protocol ---
|
|
56
|
+
console.log("\n=== Paranoid + SMC protocol ===");
|
|
57
|
+
const smcResult = runPPRL(partyA, partyB, {
|
|
58
|
+
fields: ["name", "dob", "city"],
|
|
59
|
+
securityLevel: "paranoid",
|
|
60
|
+
protocol: "smc",
|
|
61
|
+
threshold: 0.85,
|
|
62
|
+
salt: "shared-secret-agreed-upon-out-of-band",
|
|
63
|
+
});
|
|
64
|
+
console.log(` ${smcResult.matches.length} matches found`);
|
|
65
|
+
|
|
66
|
+
// --- Auto-configure: let GoldenMatch pick the fields / threshold ---
|
|
67
|
+
console.log("\n=== Auto-configured ===");
|
|
68
|
+
const autoConfig = autoConfigurePPRL(partyA, partyB);
|
|
69
|
+
console.log(` Auto-picked fields: ${autoConfig.fields.join(", ")}`);
|
|
70
|
+
console.log(` Threshold: ${autoConfig.threshold}`);
|
|
71
|
+
|
|
72
|
+
/**
|
|
73
|
+
* NB: standard level leaks frequency info -- names appearing many times
|
|
74
|
+
* produce identifiable bit patterns. Use "high" or "paranoid" for
|
|
75
|
+
* anything beyond demos.
|
|
76
|
+
*/
|
|
@@ -0,0 +1,79 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Streaming / incremental record matching.
|
|
3
|
+
*
|
|
4
|
+
* Add records one at a time to a running cluster state. Useful for:
|
|
5
|
+
* - Kafka / Kinesis consumers
|
|
6
|
+
* - Continuous CDC ingest
|
|
7
|
+
* - Web forms (match new signup vs. existing customers on submit)
|
|
8
|
+
*
|
|
9
|
+
* Each `add()` does one `matchOne()` against the running set, then
|
|
10
|
+
* updates the cluster map in-place.
|
|
11
|
+
*
|
|
12
|
+
* Run: npx tsx examples/08-streaming.ts
|
|
13
|
+
*/
|
|
14
|
+
import {
|
|
15
|
+
StreamProcessor,
|
|
16
|
+
makeMatchkeyConfig,
|
|
17
|
+
makeMatchkeyField,
|
|
18
|
+
} from "goldenmatch";
|
|
19
|
+
|
|
20
|
+
const mk = makeMatchkeyConfig({
|
|
21
|
+
name: "identity",
|
|
22
|
+
type: "weighted",
|
|
23
|
+
threshold: 0.85,
|
|
24
|
+
fields: [
|
|
25
|
+
makeMatchkeyField({ field: "name", transforms: ["lowercase", "strip"], scorer: "jaro_winkler", weight: 0.6 }),
|
|
26
|
+
makeMatchkeyField({ field: "email", transforms: ["lowercase", "strip"], scorer: "exact", weight: 0.4 }),
|
|
27
|
+
],
|
|
28
|
+
});
|
|
29
|
+
|
|
30
|
+
const stream = new StreamProcessor({
|
|
31
|
+
matchkey: mk,
|
|
32
|
+
threshold: 0.85,
|
|
33
|
+
maxClusterSize: 50,
|
|
34
|
+
});
|
|
35
|
+
|
|
36
|
+
// Simulated record stream
|
|
37
|
+
const records = [
|
|
38
|
+
{ name: "John Smith", email: "john@example.com" },
|
|
39
|
+
{ name: "Jane Doe", email: "jane@example.com" },
|
|
40
|
+
{ name: "Jon Smith", email: "john@example.com" }, // dupe of #0
|
|
41
|
+
{ name: "Bob Jones", email: "bob@example.com" },
|
|
42
|
+
{ name: "Janet Doe", email: "janet@example.com" },
|
|
43
|
+
{ name: "J. Smith", email: "john@example.com" }, // dupe of #0
|
|
44
|
+
{ name: "Alice Chen", email: "alice@example.com" },
|
|
45
|
+
// ... imagine 43 more records
|
|
46
|
+
];
|
|
47
|
+
|
|
48
|
+
console.log(`Streaming ${records.length} records...\n`);
|
|
49
|
+
|
|
50
|
+
for (let i = 0; i < records.length; i++) {
|
|
51
|
+
const rec = records[i]!;
|
|
52
|
+
const result = stream.add(rec);
|
|
53
|
+
if (result.matchedIds.length > 0) {
|
|
54
|
+
console.log(
|
|
55
|
+
` #${i} "${rec.name}" -> cluster ${result.clusterId}, matched ${result.matchedIds.length} existing (ids: ${result.matchedIds.join(", ")})`,
|
|
56
|
+
);
|
|
57
|
+
} else {
|
|
58
|
+
console.log(` #${i} "${rec.name}" -> new cluster ${result.clusterId}`);
|
|
59
|
+
}
|
|
60
|
+
}
|
|
61
|
+
|
|
62
|
+
// Final snapshot
|
|
63
|
+
const snap = stream.snapshot();
|
|
64
|
+
console.log(`\nFinal state: ${stream.size} records in ${snap.clusters.size} clusters`);
|
|
65
|
+
|
|
66
|
+
for (const [cid, info] of snap.clusters) {
|
|
67
|
+
if (info.size < 2) continue;
|
|
68
|
+
console.log(
|
|
69
|
+
` Cluster ${cid}: ${info.size} members ${JSON.stringify(info.members)} (confidence ${info.confidence.toFixed(2)})`,
|
|
70
|
+
);
|
|
71
|
+
}
|
|
72
|
+
|
|
73
|
+
/**
|
|
74
|
+
* Python -> TS differences:
|
|
75
|
+
* - Python `StreamProcessor.add(df_row)`; TS `stream.add(rowObject)`.
|
|
76
|
+
* - `matchedIds` is the list of pre-existing rows the new record joined.
|
|
77
|
+
* - `clusterId` is the cluster the record ultimately landed in (may be
|
|
78
|
+
* a brand-new cluster or an existing one that got merged into).
|
|
79
|
+
*/
|
|
@@ -0,0 +1,79 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* LLM scorer: send borderline pairs to OpenAI/Anthropic for a YES/NO
|
|
3
|
+
* verdict, with a hard budget cap.
|
|
4
|
+
*
|
|
5
|
+
* Flow:
|
|
6
|
+
* - score >= autoThreshold: auto-accept (promoted to 1.0, no LLM call)
|
|
7
|
+
* - candidateLo <= score < autoThreshold: ask the LLM
|
|
8
|
+
* - score < candidateLo: left as-is
|
|
9
|
+
*
|
|
10
|
+
* Run: OPENAI_API_KEY=sk-... npx tsx examples/09-llm-scorer.ts
|
|
11
|
+
*/
|
|
12
|
+
import {
|
|
13
|
+
llmScorePairs,
|
|
14
|
+
makeScoredPair,
|
|
15
|
+
type LLMScorerConfig,
|
|
16
|
+
type Row,
|
|
17
|
+
type ScoredPair,
|
|
18
|
+
} from "goldenmatch";
|
|
19
|
+
|
|
20
|
+
// Candidate pairs from a prior dedupe() run (scores 0.72 - 0.93; the last is above autoThreshold)
|
|
21
|
+
const rows: Row[] = [
|
|
22
|
+
{ __row_id__: 0, name: "Apple Inc", description: "Consumer electronics manufacturer" },
|
|
23
|
+
{ __row_id__: 1, name: "Apple Incorporated", description: "Maker of iPhones and Macs" },
|
|
24
|
+
{ __row_id__: 2, name: "Apple Orchard Co", description: "Fruit grower in Washington State" },
|
|
25
|
+
{ __row_id__: 3, name: "Microsoft Corp", description: "Software company, maker of Windows" },
|
|
26
|
+
{ __row_id__: 4, name: "Microsoft", description: "Cloud + software + Xbox" },
|
|
27
|
+
];
|
|
28
|
+
|
|
29
|
+
const candidatePairs: ScoredPair[] = [
|
|
30
|
+
makeScoredPair(0, 1, 0.87), // borderline -- ask LLM
|
|
31
|
+
makeScoredPair(0, 2, 0.72), // borderline -- ask LLM (but almost certainly not same)
|
|
32
|
+
makeScoredPair(3, 4, 0.93), // auto-accept (>= 0.90)
|
|
33
|
+
];
|
|
34
|
+
|
|
35
|
+
const config: LLMScorerConfig = {
|
|
36
|
+
enabled: true,
|
|
37
|
+
provider: "openai", // or "anthropic"
|
|
38
|
+
model: "gpt-4o-mini",
|
|
39
|
+
autoThreshold: 0.90,
|
|
40
|
+
candidateLo: 0.60,
|
|
41
|
+
candidateHi: 0.90,
|
|
42
|
+
batchSize: 10,
|
|
43
|
+
maxWorkers: 4,
|
|
44
|
+
mode: "pairwise",
|
|
45
|
+
budget: {
|
|
46
|
+
maxCostUsd: 0.05,
|
|
47
|
+
maxCalls: 50,
|
|
48
|
+
warnAtPct: 0.8,
|
|
49
|
+
},
|
|
50
|
+
};
|
|
51
|
+
|
|
52
|
+
const apiKey = process.env["OPENAI_API_KEY"];
|
|
53
|
+
if (!apiKey) {
|
|
54
|
+
console.warn("OPENAI_API_KEY not set. Running in no-op mode (candidates pass through).\n");
|
|
55
|
+
}
|
|
56
|
+
|
|
57
|
+
const result = await llmScorePairs(candidatePairs, rows, config, apiKey);
|
|
58
|
+
|
|
59
|
+
console.log("After LLM scoring:");
|
|
60
|
+
for (const p of result.pairs) {
|
|
61
|
+
console.log(` (${p.idA}, ${p.idB}) score=${p.score.toFixed(2)}`);
|
|
62
|
+
}
|
|
63
|
+
|
|
64
|
+
if (result.budget) {
|
|
65
|
+
console.log("\nBudget usage:");
|
|
66
|
+
console.log(` Calls: ${result.budget.calls}`);
|
|
67
|
+
console.log(` Cost USD: ${result.budget.costUsd.toFixed(4)}`);
|
|
68
|
+
console.log(` Tokens: ${result.budget.inputTokens} in / ${result.budget.outputTokens} out`);
|
|
69
|
+
} else {
|
|
70
|
+
console.log("\nNo budget info (no LLM calls made).");
|
|
71
|
+
}
|
|
72
|
+
|
|
73
|
+
/**
|
|
74
|
+
* Python -> TS differences:
|
|
75
|
+
* - Python `llm_score_pairs()` is sync-looking via asyncio; TS `llmScorePairs()` is
|
|
76
|
+
* an async function returning a Promise.
|
|
77
|
+
* - API key is passed explicitly (no env var magic); pass `process.env.OPENAI_API_KEY`
|
|
78
|
+
* at the call site. Works on edge runtimes that expose `fetch`.
|
|
79
|
+
*/
|
|
@@ -0,0 +1,60 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Explain why two records matched (or didn't).
|
|
3
|
+
*
|
|
4
|
+
* `explainPair()` re-runs per-field scoring with the matchkey config and
|
|
5
|
+
* produces an NL explanation plus per-field scores. Zero LLM cost.
|
|
6
|
+
*
|
|
7
|
+
* Run: npx tsx examples/10-explain.ts
|
|
8
|
+
*/
|
|
9
|
+
import {
|
|
10
|
+
explainPair,
|
|
11
|
+
makeMatchkeyConfig,
|
|
12
|
+
makeMatchkeyField,
|
|
13
|
+
} from "goldenmatch";
|
|
14
|
+
|
|
15
|
+
const mk = makeMatchkeyConfig({
|
|
16
|
+
name: "identity",
|
|
17
|
+
type: "weighted",
|
|
18
|
+
threshold: 0.85,
|
|
19
|
+
fields: [
|
|
20
|
+
makeMatchkeyField({ field: "first_name", transforms: ["lowercase"], scorer: "jaro_winkler", weight: 0.3 }),
|
|
21
|
+
makeMatchkeyField({ field: "last_name", transforms: ["lowercase"], scorer: "jaro_winkler", weight: 0.4 }),
|
|
22
|
+
makeMatchkeyField({ field: "email", transforms: ["lowercase"], scorer: "exact", weight: 0.3 }),
|
|
23
|
+
],
|
|
24
|
+
});
|
|
25
|
+
|
|
26
|
+
// --- Case 1: strong match ---
|
|
27
|
+
const a1 = { first_name: "John", last_name: "Smith", email: "john@example.com" };
|
|
28
|
+
const b1 = { first_name: "Jon", last_name: "Smith", email: "john@example.com" };
|
|
29
|
+
|
|
30
|
+
const exp1 = explainPair(a1, b1, mk);
|
|
31
|
+
console.log("=== Case 1: strong match ===");
|
|
32
|
+
console.log(`Overall score: ${exp1.score.toFixed(3)} (confidence: ${exp1.confidence})`);
|
|
33
|
+
console.log(`Explanation: ${exp1.explanation}`);
|
|
34
|
+
console.log("Per-field scores:");
|
|
35
|
+
for (const [field, score] of Object.entries(exp1.fieldScores)) {
|
|
36
|
+
console.log(` ${field.padEnd(12)} ${score === null ? "missing" : score.toFixed(3)}`);
|
|
37
|
+
}
|
|
38
|
+
|
|
39
|
+
// --- Case 2: weak match ---
|
|
40
|
+
const a2 = { first_name: "John", last_name: "Smith", email: "john@example.com" };
|
|
41
|
+
const b2 = { first_name: "Johan", last_name: "Smyth", email: "j.smith@other.com" };
|
|
42
|
+
|
|
43
|
+
const exp2 = explainPair(a2, b2, mk);
|
|
44
|
+
console.log("\n=== Case 2: weak match ===");
|
|
45
|
+
console.log(`Overall score: ${exp2.score.toFixed(3)} (confidence: ${exp2.confidence})`);
|
|
46
|
+
console.log(`Explanation: ${exp2.explanation}`);
|
|
47
|
+
console.log("Reasoning steps:");
|
|
48
|
+
for (const step of exp2.reasoning) {
|
|
49
|
+
console.log(` - ${step}`);
|
|
50
|
+
}
|
|
51
|
+
|
|
52
|
+
/**
|
|
53
|
+
* `details` also contains the full `FieldScoreDetail` array (normalized values
|
|
54
|
+
* after transforms, diff classification). Useful for building review-queue UIs.
|
|
55
|
+
*/
|
|
56
|
+
for (const d of exp2.details) {
|
|
57
|
+
console.log(
|
|
58
|
+
` [details] ${d.field}: "${d.valueA}" vs "${d.valueB}" (${d.diffType})`,
|
|
59
|
+
);
|
|
60
|
+
}
|