@mailwoman/match 4.8.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,68 @@
1
+ /**
2
+ * @copyright Sister Software
3
+ * @license AGPL-3.0
4
+ * @author Teffen Ellis, et al.
5
+ *
6
+ * Geographic distance as a scoring feature — the other half of geocode-first matching.
7
+ *
8
+ * Blocking uses geography to _propose_ candidates; this scores them on it. The research is explicit
9
+ * that an address must be matched as a SPATIAL attribute, not by string similarity (a
10
+ * one-character edit can be 650 m apart), and that distance measurably helps as a comparison
11
+ * feature. So we bucket the great-circle distance between two records' coordinates into ordered
12
+ * Fellegi-Sunter agreement levels (Splink's `DistanceInKMAtThresholds`): "same building" / "same
13
+ * block" / "same area" / far, each with its own m/u and weight.
14
+ *
15
+ * Calibrate the bucket boundaries to the geocoder's OWN error, which is heavy-tailed and density-
16
+ * dependent (≈38 m urban, ≈200 m rural). A weakening of this evidence by geocode quality (a
17
+ * shared interpolated centroid is softer than a shared rooftop point) is the documented
18
+ * refinement.
19
+ */
20
/** Mean radius of the Earth in kilometres (the IUGG value). */
const EARTH_RADIUS_KM = 6371.0088;

/**
 * Great-circle distance between two coordinates, computed with the haversine formula.
 *
 * @param a - First point; its `latitude` and `longitude` are converted from degrees to radians.
 * @param b - Second point, same shape as `a`.
 * @returns The distance in kilometres on a sphere of radius `EARTH_RADIUS_KM`.
 */
export function haversineKm(a, b) {
	const radians = (degrees) => (degrees * Math.PI) / 180;
	const phiA = radians(a.latitude);
	const phiB = radians(b.latitude);
	const halfLatSine = Math.sin(radians(b.latitude - a.latitude) / 2);
	const halfLonSine = Math.sin(radians(b.longitude - a.longitude) / 2);
	const haversine = halfLatSine ** 2 + Math.cos(phiA) * Math.cos(phiB) * halfLonSine ** 2;
	// Cap the root at 1 so floating-point overshoot cannot push asin outside its domain.
	const halfCentralAngle = Math.asin(Math.min(1, Math.sqrt(haversine)));
	return 2 * EARTH_RADIUS_KM * halfCentralAngle;
}
32
+ /**
33
+ * A geo-distance comparison: bucket the great-circle distance between two records' coordinates into
34
+ * ordered agreement levels. Levels must be ordered NEAREST first by `maxKm`, the last acting as the
35
+ * `far` catch-all (`maxKm` omitted → unbounded). A missing/invalid coordinate on either side yields
36
+ * no evidence.
37
+ */
38
+ export function distanceComparison(config) {
39
+ const valid = (c) => !!c && Number.isFinite(c.latitude) && Number.isFinite(c.longitude);
40
+ return {
41
+ name: config.name,
42
+ levels: config.levels,
43
+ assess(a, b) {
44
+ const ca = config.extract(a);
45
+ const cb = config.extract(b);
46
+ if (!valid(ca) || !valid(cb))
47
+ return -1;
48
+ const km = haversineKm(ca, cb);
49
+ for (let i = 0; i < config.levels.length; i++) {
50
+ if (km <= (config.levels[i].maxKm ?? Infinity))
51
+ return i;
52
+ }
53
+ return config.levels.length - 1;
54
+ },
55
+ };
56
+ }
57
/**
 * Stock distance levels, ordered nearest to farthest, with cut-offs at roughly rooftop (50 m),
 * block (500 m) and locality (5 km) scale; the final level has no `maxKm` and catches everything
 * farther. The `m` / `u` values are illustrative seeds only (EM re-estimates them); the cut-offs
 * reflect typical geocoder error.
 */
export const DEFAULT_DISTANCE_LEVELS = [
	{
		label: "same-building",
		maxKm: 0.05,
		m: 0.7,
		u: 0.001,
	},
	{
		label: "same-block",
		maxKm: 0.5,
		m: 0.2,
		u: 0.02,
	},
	{
		label: "same-area",
		maxKm: 5,
		m: 0.08,
		u: 0.2,
	},
	{
		label: "far",
		m: 0.02,
		u: 0.779,
	},
];
68
+ //# sourceMappingURL=distance.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"distance.js","sourceRoot":"","sources":["../distance.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;GAkBG;AAKH,sCAAsC;AACtC,MAAM,eAAe,GAAG,SAAS,CAAA;AAEjC,uEAAuE;AACvE,MAAM,UAAU,WAAW,CAAC,CAAS,EAAE,CAAS;IAC/C,MAAM,KAAK,GAAG,CAAC,OAAe,EAAU,EAAE,CAAC,CAAC,OAAO,GAAG,IAAI,CAAC,EAAE,CAAC,GAAG,GAAG,CAAA;IACpE,MAAM,IAAI,GAAG,KAAK,CAAC,CAAC,CAAC,QAAQ,GAAG,CAAC,CAAC,QAAQ,CAAC,CAAA;IAC3C,MAAM,IAAI,GAAG,KAAK,CAAC,CAAC,CAAC,SAAS,GAAG,CAAC,CAAC,SAAS,CAAC,CAAA;IAC7C,MAAM,IAAI,GAAG,KAAK,CAAC,CAAC,CAAC,QAAQ,CAAC,CAAA;IAC9B,MAAM,IAAI,GAAG,KAAK,CAAC,CAAC,CAAC,QAAQ,CAAC,CAAA;IAE9B,MAAM,CAAC,GAAG,IAAI,CAAC,GAAG,CAAC,IAAI,GAAG,CAAC,CAAC,IAAI,CAAC,GAAG,IAAI,CAAC,GAAG,CAAC,IAAI,CAAC,GAAG,IAAI,CAAC,GAAG,CAAC,IAAI,CAAC,GAAG,IAAI,CAAC,GAAG,CAAC,IAAI,GAAG,CAAC,CAAC,IAAI,CAAC,CAAA;IAC7F,OAAO,CAAC,GAAG,eAAe,GAAG,IAAI,CAAC,IAAI,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC,EAAE,IAAI,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,CAAC,CAAA;AAClE,CAAC;AAED;;;;;GAKG;AACH,MAAM,UAAU,kBAAkB,CAAI,MAIrC;IACA,MAAM,KAAK,GAAG,CAAC,CAA4B,EAAe,EAAE,CAC3D,CAAC,CAAC,CAAC,IAAI,MAAM,CAAC,QAAQ,CAAC,CAAC,CAAC,QAAQ,CAAC,IAAI,MAAM,CAAC,QAAQ,CAAC,CAAC,CAAC,SAAS,CAAC,CAAA;IAEnE,OAAO;QACN,IAAI,EAAE,MAAM,CAAC,IAAI;QACjB,MAAM,EAAE,MAAM,CAAC,MAAM;QACrB,MAAM,CAAC,CAAC,EAAE,CAAC;YACV,MAAM,EAAE,GAAG,MAAM,CAAC,OAAO,CAAC,CAAC,CAAC,CAAA;YAC5B,MAAM,EAAE,GAAG,MAAM,CAAC,OAAO,CAAC,CAAC,CAAC,CAAA;YAC5B,IAAI,CAAC,KAAK,CAAC,EAAE,CAAC,IAAI,CAAC,KAAK,CAAC,EAAE,CAAC;gBAAE,OAAO,CAAC,CAAC,CAAA;YAEvC,MAAM,EAAE,GAAG,WAAW,CAAC,EAAE,EAAE,EAAE,CAAC,CAAA;YAC9B,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,MAAM,CAAC,MAAM,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;gBAC/C,IAAI,EAAE,IAAI,CAAC,MAAM,CAAC,MAAM,CAAC,CAAC,CAAE,CAAC,KAAK,IAAI,QAAQ,CAAC;oBAAE,OAAO,CAAC,CAAA;YAC1D,CAAC;YACD,OAAO,MAAM,CAAC,MAAM,CAAC,MAAM,GAAG,CAAC,CAAA;QAChC,CAAC;KACD,CAAA;AACF,CAAC;AAED;;;;GAIG;AACH,MAAM,CAAC,MAAM,uBAAuB,GAAsB;IACzD,EAAE,KAAK,EAAE,eAAe,EAAE,KAAK,EAAE,IAAI,EAAE,CAAC,EAAE,GAAG,EAAE,CAAC,EAAE,KAAK,EAAE;IACzD,EAAE,KAAK,EAAE,YAAY,EAAE,KAAK,EAAE,GAAG,EAAE,CAAC,EAAE,GAAG,EAAE,CAAC,EAA
E,IAAI,EAAE;IACpD,EAAE,KAAK,EAAE,WAAW,EAAE,KAAK,EAAE,CAAC,EAAE,CAAC,EAAE,IAAI,EAAE,CAAC,EAAE,GAAG,EAAE;IACjD,EAAE,KAAK,EAAE,KAAK,EAAE,CAAC,EAAE,IAAI,EAAE,CAAC,EAAE,KAAK,EAAE;CACnC,CAAA"}
package/out/em.d.ts ADDED
@@ -0,0 +1,51 @@
1
+ /**
2
+ * @copyright Sister Software
3
+ * @license AGPL-3.0
4
+ * @author Teffen Ellis, et al.
5
+ *
6
+ * Unsupervised parameter estimation for the Fellegi-Sunter model — the part that makes the matcher
7
+ * work with no labeled data.
8
+ *
9
+ * The paradox: to estimate `m`/`u` you need to know which pairs match, but finding matches is the
10
+ * whole problem. EM (Winkler 1988) breaks it by treating the match/non-match status as a hidden
11
+ * variable and iterating:
12
+ *
13
+ * - **E-step** — under the current parameters, compute each pair's posterior responsibility `g =
14
+ * P(match | its agreement pattern)`.
15
+ * - **M-step** — re-estimate `λ`, and each level's `m`/`u`, as `g`-weighted (resp. `(1-g)`-weighted)
16
+ * fractions of the pairs landing in that level.
17
+ *
18
+ * It converges because true matches agree on most fields and non-matches don't, so the two classes
19
+ * pull apart. Assumes conditional independence of the comparisons given match status (the
20
+ * standard F-S assumption). Caveat from the literature: EM can land in a local optimum when the
21
+ * true match rate is very low — seed from sensible `m`/`u` (the model's existing levels do this)
22
+ * and sanity-check that the recovered `m` exceeds `u` on the top agreement level.
23
+ */
24
+ import type { Comparison, FellegiSunterModel } from "./fellegi-sunter.js";
25
/**
 * Collapse a record pair into its agreement pattern.
 *
 * @param comparisons - The comparisons to run, in order.
 * @param a - First record of the pair.
 * @param b - Second record of the pair.
 * @returns One level index per comparison, in the same order; `-1` marks a missing value.
 */
export declare function agreementPattern<R>(comparisons: Array<Comparison<R>>, a: R, b: R): number[];
27
/** Tuning knobs accepted by {@link estimateParameters}; every field is optional. */
export interface EmOptions {
	/** Upper bound on the number of EM passes. Defaults to 100. */
	maxIterations?: number;
	/**
	 * Convergence threshold: iteration stops once the largest change in any parameter between
	 * two passes drops below this value. Defaults to 1e-6.
	 */
	tolerance?: number;
	/** Prior match rate to start from. Falls back to the model's own `lambda`. */
	initialLambda?: number;
}
36
/** What {@link estimateParameters} returns: the fitted model together with convergence diagnostics. */
export interface EmResult<R> {
	/** The input model with each level's `m` / `u` and the prior `lambda` re-estimated. */
	model: FellegiSunterModel<R>;
	/** The estimated prior match rate. */
	lambda: number;
	/** Number of EM passes that were executed. */
	iterations: number;
	/** Whether the parameter change fell below the tolerance before the iteration cap was hit. */
	converged: boolean;
}
45
/**
 * Fit `m` / `u` and the prior `λ` to unlabeled agreement patterns with EM.
 *
 * @param model - Supplies the comparisons; its current level `m` / `u` values seed the iteration.
 * @param patterns - One pattern per record pair: per-comparison level indices as produced by
 *   {@link agreementPattern}. A `-1` (missing) entry contributes no evidence to either class.
 * @param opts - Optional iteration cap, tolerance and starting `λ`.
 * @returns The re-estimated model plus the final `λ`, the pass count and a convergence flag.
 */
export declare function estimateParameters<R>(
	model: FellegiSunterModel<R>,
	patterns: number[][],
	opts?: EmOptions,
): EmResult<R>;
51
+ //# sourceMappingURL=em.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"em.d.ts","sourceRoot":"","sources":["../em.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;GAsBG;AAEH,OAAO,KAAK,EAAE,UAAU,EAAE,kBAAkB,EAAE,MAAM,qBAAqB,CAAA;AAQzE,uGAAuG;AACvG,wBAAgB,gBAAgB,CAAC,CAAC,EAAE,WAAW,EAAE,UAAU,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,EAAE,CAAC,EAAE,CAAC,EAAE,CAAC,GAAG,MAAM,EAAE,CAEtF;AAED,8CAA8C;AAC9C,MAAM,WAAW,SAAS;IACzB,uCAAuC;IACvC,aAAa,CAAC,EAAE,MAAM,CAAA;IACtB,8FAA8F;IAC9F,SAAS,CAAC,EAAE,MAAM,CAAA;IAClB,mEAAmE;IACnE,aAAa,CAAC,EAAE,MAAM,CAAA;CACtB;AAED,qDAAqD;AACrD,MAAM,WAAW,QAAQ,CAAC,CAAC;IAC1B,sFAAsF;IACtF,KAAK,EAAE,kBAAkB,CAAC,CAAC,CAAC,CAAA;IAC5B,sCAAsC;IACtC,MAAM,EAAE,MAAM,CAAA;IACd,UAAU,EAAE,MAAM,CAAA;IAClB,SAAS,EAAE,OAAO,CAAA;CAClB;AAED;;;;GAIG;AACH,wBAAgB,kBAAkB,CAAC,CAAC,EACnC,KAAK,EAAE,kBAAkB,CAAC,CAAC,CAAC,EAC5B,QAAQ,EAAE,MAAM,EAAE,EAAE,EACpB,IAAI,GAAE,SAAc,GAClB,QAAQ,CAAC,CAAC,CAAC,CAgFb"}
package/out/em.js ADDED
@@ -0,0 +1,108 @@
1
+ /**
2
+ * @copyright Sister Software
3
+ * @license AGPL-3.0
4
+ * @author Teffen Ellis, et al.
5
+ *
6
+ * Unsupervised parameter estimation for the Fellegi-Sunter model — the part that makes the matcher
7
+ * work with no labeled data.
8
+ *
9
+ * The paradox: to estimate `m`/`u` you need to know which pairs match, but finding matches is the
10
+ * whole problem. EM (Winkler 1988) breaks it by treating the match/non-match status as a hidden
11
+ * variable and iterating:
12
+ *
13
+ * - **E-step** — under the current parameters, compute each pair's posterior responsibility `g =
14
+ * P(match | its agreement pattern)`.
15
+ * - **M-step** — re-estimate `λ`, and each level's `m`/`u`, as `g`-weighted (resp. `(1-g)`-weighted)
16
+ * fractions of the pairs landing in that level.
17
+ *
18
+ * It converges because true matches agree on most fields and non-matches don't, so the two classes
19
+ * pull apart. Assumes conditional independence of the comparisons given match status (the
20
+ * standard F-S assumption). Caveat from the literature: EM can land in a local optimum when the
21
+ * true match rate is very low — seed from sensible `m`/`u` (the model's existing levels do this)
22
+ * and sanity-check that the recovered `m` exceeds `u` on the top agreement level.
23
+ */
24
/**
 * Smoothing floor for the M-step. It is added to every level's numerator (and, scaled by the
 * level count, to the denominator) so a level no pair landed in never gets an estimate of
 * exactly zero, which would turn into an infinite weight.
 */
const EPSILON = 1e-9;
29
+ /** Reduce a record pair to its agreement pattern — the per-comparison level index (`-1` = missing). */
30
+ export function agreementPattern(comparisons, a, b) {
31
+ return comparisons.map((comparison) => comparison.assess(a, b));
32
+ }
33
+ /**
34
+ * Estimate `m`/`u` and the prior `λ` from unlabeled agreement patterns via EM. The patterns are
35
+ * per-comparison level indices (as produced by {@link agreementPattern}); a `-1` (missing) field
36
+ * contributes no evidence to either class. The model's existing level `m`/`u` seed the iteration.
37
+ */
38
+ export function estimateParameters(model, patterns, opts = {}) {
39
+ const maxIterations = opts.maxIterations ?? 100;
40
+ const tolerance = opts.tolerance ?? 1e-6;
41
+ const comparisons = model.comparisons;
42
+ const levelCounts = comparisons.map((c) => c.levels.length);
43
+ // Per-comparison, per-level m/u, seeded from the model's current levels.
44
+ const m = comparisons.map((c) => c.levels.map((l) => l.m));
45
+ const u = comparisons.map((c) => c.levels.map((l) => l.u));
46
+ let lambda = opts.initialLambda ?? model.lambda;
47
+ let iterations = 0;
48
+ let converged = false;
49
+ if (patterns.length === 0) {
50
+ return { model, lambda, iterations, converged };
51
+ }
52
+ for (; iterations < maxIterations; iterations++) {
53
+ const mNumerator = comparisons.map((_, i) => new Array(levelCounts[i]).fill(0));
54
+ const uNumerator = comparisons.map((_, i) => new Array(levelCounts[i]).fill(0));
55
+ const mDenominator = comparisons.map(() => 0);
56
+ const uDenominator = comparisons.map(() => 0);
57
+ let responsibilitySum = 0;
58
+ // E-step: posterior P(match | pattern) for each pair.
59
+ for (const pattern of patterns) {
60
+ let matchLikelihood = lambda;
61
+ let nonMatchLikelihood = 1 - lambda;
62
+ for (let i = 0; i < comparisons.length; i++) {
63
+ const level = pattern[i];
64
+ if (level < 0)
65
+ continue;
66
+ matchLikelihood *= m[i][level];
67
+ nonMatchLikelihood *= u[i][level];
68
+ }
69
+ const total = matchLikelihood + nonMatchLikelihood;
70
+ const g = total > 0 ? matchLikelihood / total : 0;
71
+ responsibilitySum += g;
72
+ for (let i = 0; i < comparisons.length; i++) {
73
+ const level = pattern[i];
74
+ if (level < 0)
75
+ continue;
76
+ mNumerator[i][level] += g;
77
+ uNumerator[i][level] += 1 - g;
78
+ mDenominator[i] += g;
79
+ uDenominator[i] += 1 - g;
80
+ }
81
+ }
82
+ // M-step: re-estimate λ and each level's m/u as (1-)g-weighted fractions.
83
+ const newLambda = responsibilitySum / patterns.length;
84
+ let maxDelta = Math.abs(newLambda - lambda);
85
+ lambda = newLambda;
86
+ for (let i = 0; i < comparisons.length; i++) {
87
+ const levels = levelCounts[i];
88
+ for (let l = 0; l < levels; l++) {
89
+ const newM = mDenominator[i] > 0 ? (mNumerator[i][l] + EPSILON) / (mDenominator[i] + EPSILON * levels) : m[i][l];
90
+ const newU = uDenominator[i] > 0 ? (uNumerator[i][l] + EPSILON) / (uDenominator[i] + EPSILON * levels) : u[i][l];
91
+ maxDelta = Math.max(maxDelta, Math.abs(newM - m[i][l]), Math.abs(newU - u[i][l]));
92
+ m[i][l] = newM;
93
+ u[i][l] = newU;
94
+ }
95
+ }
96
+ if (maxDelta < tolerance) {
97
+ converged = true;
98
+ iterations++;
99
+ break;
100
+ }
101
+ }
102
+ const fittedComparisons = comparisons.map((c, i) => ({
103
+ ...c,
104
+ levels: c.levels.map((level, j) => ({ ...level, m: m[i][j], u: u[i][j] })),
105
+ }));
106
+ return { model: { comparisons: fittedComparisons, lambda }, lambda, iterations, converged };
107
+ }
108
+ //# sourceMappingURL=em.js.map
package/out/em.js.map ADDED
@@ -0,0 +1 @@
1
+ {"version":3,"file":"em.js","sourceRoot":"","sources":["../em.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;GAsBG;AAIH;;;GAGG;AACH,MAAM,OAAO,GAAG,IAAI,CAAA;AAEpB,uGAAuG;AACvG,MAAM,UAAU,gBAAgB,CAAI,WAA4B,EAAE,CAAI,EAAE,CAAI;IAC3E,OAAO,WAAW,CAAC,GAAG,CAAC,CAAC,UAAU,EAAE,EAAE,CAAC,UAAU,CAAC,MAAM,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,CAAA;AAChE,CAAC;AAsBD;;;;GAIG;AACH,MAAM,UAAU,kBAAkB,CACjC,KAA4B,EAC5B,QAAoB,EACpB,OAAkB,EAAE;IAEpB,MAAM,aAAa,GAAG,IAAI,CAAC,aAAa,IAAI,GAAG,CAAA;IAC/C,MAAM,SAAS,GAAG,IAAI,CAAC,SAAS,IAAI,IAAI,CAAA;IACxC,MAAM,WAAW,GAAG,KAAK,CAAC,WAAW,CAAA;IACrC,MAAM,WAAW,GAAG,WAAW,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,MAAM,CAAC,MAAM,CAAC,CAAA;IAE3D,yEAAyE;IACzE,MAAM,CAAC,GAAG,WAAW,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,MAAM,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAA;IAC1D,MAAM,CAAC,GAAG,WAAW,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,MAAM,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAA;IAC1D,IAAI,MAAM,GAAG,IAAI,CAAC,aAAa,IAAI,KAAK,CAAC,MAAM,CAAA;IAE/C,IAAI,UAAU,GAAG,CAAC,CAAA;IAClB,IAAI,SAAS,GAAG,KAAK,CAAA;IAErB,IAAI,QAAQ,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;QAC3B,OAAO,EAAE,KAAK,EAAE,MAAM,EAAE,UAAU,EAAE,SAAS,EAAE,CAAA;IAChD,CAAC;IAED,OAAO,UAAU,GAAG,aAAa,EAAE,UAAU,EAAE,EAAE,CAAC;QACjD,MAAM,UAAU,GAAG,WAAW,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,IAAI,KAAK,CAAS,WAAW,CAAC,CAAC,CAAE,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,CAAA;QACxF,MAAM,UAAU,GAAG,WAAW,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,IAAI,KAAK,CAAS,WAAW,CAAC,CAAC,CAAE,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,CAAA;QACxF,MAAM,YAAY,GAAG,WAAW,CAAC,GAAG,CAAC,GAAG,EAAE,CAAC,CAAC,CAAC,CAAA;QAC7C,MAAM,YAAY,GAAG,WAAW,CAAC,GAAG,CAAC,GAAG,EAAE,CAAC,CAAC,CAAC,CAAA;QAC7C,IAAI,iBAAiB,GAAG,CAAC,CAAA;QAEzB,sDAAsD;QACtD,KAAK,MAAM,OAAO,IAAI,QAAQ,EAAE,CAAC;YAChC,IAAI,eAAe,GAAG,MAAM,CAAA;YAC5B,IAAI,kBAAkB,GAAG,CAAC,GAAG,MAAM,CAAA;YACnC,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,WAAW,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;gBAC7C,MAAM,KAAK,GAAG,OAAO,CAAC,
CAAC,CAAE,CAAA;gBACzB,IAAI,KAAK,GAAG,CAAC;oBAAE,SAAQ;gBACvB,eAAe,IAAI,CAAC,CAAC,CAAC,CAAE,CAAC,KAAK,CAAE,CAAA;gBAChC,kBAAkB,IAAI,CAAC,CAAC,CAAC,CAAE,CAAC,KAAK,CAAE,CAAA;YACpC,CAAC;YACD,MAAM,KAAK,GAAG,eAAe,GAAG,kBAAkB,CAAA;YAClD,MAAM,CAAC,GAAG,KAAK,GAAG,CAAC,CAAC,CAAC,CAAC,eAAe,GAAG,KAAK,CAAC,CAAC,CAAC,CAAC,CAAA;YACjD,iBAAiB,IAAI,CAAC,CAAA;YAEtB,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,WAAW,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;gBAC7C,MAAM,KAAK,GAAG,OAAO,CAAC,CAAC,CAAE,CAAA;gBACzB,IAAI,KAAK,GAAG,CAAC;oBAAE,SAAQ;gBACvB,UAAU,CAAC,CAAC,CAAE,CAAC,KAAK,CAAE,IAAI,CAAC,CAAA;gBAC3B,UAAU,CAAC,CAAC,CAAE,CAAC,KAAK,CAAE,IAAI,CAAC,GAAG,CAAC,CAAA;gBAC/B,YAAY,CAAC,CAAC,CAAE,IAAI,CAAC,CAAA;gBACrB,YAAY,CAAC,CAAC,CAAE,IAAI,CAAC,GAAG,CAAC,CAAA;YAC1B,CAAC;QACF,CAAC;QAED,0EAA0E;QAC1E,MAAM,SAAS,GAAG,iBAAiB,GAAG,QAAQ,CAAC,MAAM,CAAA;QACrD,IAAI,QAAQ,GAAG,IAAI,CAAC,GAAG,CAAC,SAAS,GAAG,MAAM,CAAC,CAAA;QAC3C,MAAM,GAAG,SAAS,CAAA;QAElB,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,WAAW,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;YAC7C,MAAM,MAAM,GAAG,WAAW,CAAC,CAAC,CAAE,CAAA;YAC9B,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;gBACjC,MAAM,IAAI,GACT,YAAY,CAAC,CAAC,CAAE,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC,UAAU,CAAC,CAAC,CAAE,CAAC,CAAC,CAAE,GAAG,OAAO,CAAC,GAAG,CAAC,YAAY,CAAC,CAAC,CAAE,GAAG,OAAO,GAAG,MAAM,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAE,CAAC,CAAC,CAAE,CAAA;gBAC1G,MAAM,IAAI,GACT,YAAY,CAAC,CAAC,CAAE,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC,UAAU,CAAC,CAAC,CAAE,CAAC,CAAC,CAAE,GAAG,OAAO,CAAC,GAAG,CAAC,YAAY,CAAC,CAAC,CAAE,GAAG,OAAO,GAAG,MAAM,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAE,CAAC,CAAC,CAAE,CAAA;gBAC1G,QAAQ,GAAG,IAAI,CAAC,GAAG,CAAC,QAAQ,EAAE,IAAI,CAAC,GAAG,CAAC,IAAI,GAAG,CAAC,CAAC,CAAC,CAAE,CAAC,CAAC,CAAE,CAAC,EAAE,IAAI,CAAC,GAAG,CAAC,IAAI,GAAG,CAAC,CAAC,CAAC,CAAE,CAAC,CAAC,CAAE,CAAC,CAAC,CAAA;gBACrF,CAAC,CAAC,CAAC,CAAE,CAAC,CAAC,CAAC,GAAG,IAAI,CAAA;gBACf,CAAC,CAAC,CAAC,CAAE,CAAC,CAAC,CAAC,GAAG,IAAI,CAAA;YAChB,CAAC;QACF,CAAC;QAED,IAAI,QAAQ,GAAG,SAAS,EAAE,CAAC;YAC1B,SAAS,GAAG,IAAI,CAAA;YAChB,UAAU,EAAE,CAAA;YA
CZ,MAAK;QACN,CAAC;IACF,CAAC;IAED,MAAM,iBAAiB,GAAG,WAAW,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC;QACpD,GAAG,CAAC;QACJ,MAAM,EAAE,CAAC,CAAC,MAAM,CAAC,GAAG,CAAC,CAAC,KAAK,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,EAAE,GAAG,KAAK,EAAE,CAAC,EAAE,CAAC,CAAC,CAAC,CAAE,CAAC,CAAC,CAAE,EAAE,CAAC,EAAE,CAAC,CAAC,CAAC,CAAE,CAAC,CAAC,CAAE,EAAE,CAAC,CAAC;KAC9E,CAAC,CAAC,CAAA;IAEH,OAAO,EAAE,KAAK,EAAE,EAAE,WAAW,EAAE,iBAAiB,EAAE,MAAM,EAAE,EAAE,MAAM,EAAE,UAAU,EAAE,SAAS,EAAE,CAAA;AAC5F,CAAC"}
@@ -0,0 +1,124 @@
1
+ /**
2
+ * @copyright Sister Software
3
+ * @license AGPL-3.0
4
+ * @author Teffen Ellis, et al.
5
+ *
6
+ * The Fellegi-Sunter scorer — the matcher's decision layer.
7
+ *
8
+ * Each field comparison lands a record pair in a discrete _agreement level_ (exact / high / low /
9
+ * different / missing). Each level carries two probabilities: `m` = P(this level | the pair
10
+ * really matches) and `u` = P(this level | it doesn't). Their ratio is a Bayes factor, and its
11
+ * log is the level's contribution to the total match weight in bits:
12
+ *
13
+ * ```
14
+ * M = log2(λ / (1 - λ)) + Σ_fields log2(m_level / u_level)
15
+ * ```
16
+ *
17
+ * — a prior (how likely any two random records match) plus an additive, per-field-attributable
18
+ * stack of evidence. Convert `M` to a probability and threshold it: above the upper bound is a
19
+ * link, below the lower bound a non-link, and the band between is _clerical review_ — the
20
+ * calibrated abstain zone the whole design leans on.
21
+ *
22
+ * The `m`/`u` numbers here are NOT universal constants. They are estimated from the data — by EM,
23
+ * unsupervised (the next increment) — and the term-frequency adjustment that makes a rare-name
24
+ * agreement count more than a common one layers on top. This module is the deterministic core
25
+ * those build on: given the levels, it produces the weights, the probability, and the decision.
26
+ */
27
/**
 * A single agreement level of a comparison, carrying the two conditional probabilities its
 * match weight is derived from.
 */
export interface ComparisonLevel {
	/** Human-readable name for debugging, e.g. `exact`, `high`, `different`. */
	label: string;
	/** P(a pair falls in this level | it is a true match) — reflects data quality. */
	m: number;
	/** P(a pair falls in this level | it is NOT a match) — reflects coincidence / cardinality. */
	u: number;
	/** Similarity-driven comparisons only: the smallest similarity (inclusive) that qualifies. */
	minSimilarity?: number;
	/** Distance-driven comparisons only: the largest distance in km (inclusive) that qualifies. */
	maxKm?: number;
}
40
/** One field's comparison: reads a value from each record and assigns the pair an agreement level. */
export interface Comparison<R> {
	/** Name of the field, used to attribute each contribution to the score. */
	name: string;
	/** The agreement levels, ordered from highest agreement (`exact`) to lowest (`different`). */
	levels: Array<ComparisonLevel>;
	/**
	 * Assess a record pair.
	 *
	 * @returns An index into {@link levels}, or `-1` when either value is missing (no evidence,
	 *   weight 0).
	 */
	assess(a: R, b: R): number;
	/**
	 * Optional term-frequency adjustment. On the levels it lists, the level's average `u` is
	 * replaced by the agreeing value's actual frequency, so agreeing on a rare value (`Vijayan`)
	 * counts for more than agreeing on a common one (`Smith`). See `withTermFrequency`.
	 */
	termFrequency?: TermFrequencyAdjustment<R>;
}
55
/**
 * Per-value term-frequency adjustment attached to a comparison (the Splink/Winkler mechanism).
 *
 * `m` is left alone. On an agreement level the effective `u` becomes the agreeing value's own
 * frequency, which adds `log2(u_level / frequency)` to the weight: strongly positive for rare
 * values, negative for common ones. The looked-up frequency is floored at
 * {@link TermFrequencyAdjustment.minimumFrequency} so an ultra-rare value cannot yield an
 * unbounded boost.
 */
export interface TermFrequencyAdjustment<R> {
	/** Relative frequency of `value` in the data, in (0, 1]. Typically computed on the fly. */
	frequency(value: string): number;
	/** Indices of the levels the adjustment applies to (typically only the exact level). */
	levels: ReadonlySet<number>;
	/** The agreeing (normalized) value to look up for a pair, or a nullish value to skip. */
	value(a: R, b: R): string | null | undefined;
	/** Scale factor for the adjustment, in [0, 1]. Defaults to 1. */
	weight?: number;
	/** Lower bound applied to the looked-up frequency. Defaults to 1e-4. */
	minimumFrequency?: number;
}
74
/** A Fellegi-Sunter model: the set of field comparisons together with the prior match rate `λ`. */
export interface FellegiSunterModel<R> {
	/** The per-field comparisons whose level weights are summed into the score. */
	comparisons: Array<Comparison<R>>;
	/** Prior probability that two records picked at random are a match. */
	lambda: number;
}
80
/** The result of scoring one record pair. */
export interface PairScore {
	/** Total match weight in bits (`log2` odds). */
	weight: number;
	/** The match probability, in [0, 1]. */
	probability: number;
	/**
	 * What drove the score, one entry per comparison: the field `name`, the label of the level
	 * the pair landed in (`null` when the field was missing) and that field's `weight` in bits.
	 */
	contributions: {
		name: string;
		level: string | null;
		weight: number;
	}[];
}
93
/**
 * Final verdict for a pair once its match weight has been compared against the upper and lower
 * thresholds.
 */
export type MatchDecision =
	| "match"
	| "review"
	| "non-match";
95
/**
 * Bayes-factor weight of one agreement level.
 *
 * @param level - The level whose `m` and `u` are compared.
 * @returns `log2(m / u)`, in bits.
 */
export declare function levelWeight(level: ComparisonLevel): number;
97
/**
 * Match weight contributed by the prior alone.
 *
 * @param lambda - Prior probability that a random pair is a match.
 * @returns `log2(λ / (1 - λ))`, in bits.
 */
export declare function priorWeight(lambda: number): number;
99
/**
 * Turn a total match weight into a probability; numerically stable for extreme weights.
 *
 * @param weight - Total match weight in bits.
 * @returns The corresponding match probability.
 */
export declare function probabilityFromWeight(weight: number): number;
101
/**
 * Build a comparison from a similarity function and a ladder of `minSimilarity` thresholds (the
 * StatCan/Splink recipe).
 *
 * Levels must be ordered from highest to lowest similarity; the last one acts as the `different`
 * catch-all (`minSimilarity` 0). A missing value on either side yields no evidence.
 *
 * @param config - The field `name`, an `extract` function that reads the field from a record,
 *   an optional `similarity` function and the `levels` to bucket into.
 * @returns The assembled comparison.
 */
export declare function similarityComparison<R>(config: {
	name: string;
	extract: (record: R) => string | null | undefined;
	/** Similarity function to use; defaults to {@link nameSimilarity}. */
	similarity?: (a: string, b: string) => number;
	levels: Array<ComparisonLevel>;
}): Comparison<R>;
114
/**
 * Score one record pair under a model.
 *
 * @param model - The comparisons and prior to score with.
 * @param a - First record of the pair.
 * @param b - Second record of the pair.
 * @returns The total match weight, the match probability and the per-field contributions.
 */
export declare function scorePair<R>(model: FellegiSunterModel<R>, a: R, b: R): PairScore;
116
/**
 * Classify a score against upper / lower match-weight thresholds (both in bits).
 *
 * @param score - The scored pair.
 * @param thresholds - `upper`: at or above it the pair is a link; `lower`: at or below it the
 *   pair is a non-link.
 * @returns The decision; anything strictly between the two thresholds is clerical review
 *   (abstain).
 */
export declare function decide(
	score: PairScore,
	thresholds: {
		upper: number;
		lower: number;
	},
): MatchDecision;
124
+ //# sourceMappingURL=fellegi-sunter.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"fellegi-sunter.d.ts","sourceRoot":"","sources":["../fellegi-sunter.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;;;;GAyBG;AAIH,qFAAqF;AACrF,MAAM,WAAW,eAAe;IAC/B,yEAAyE;IACzE,KAAK,EAAE,MAAM,CAAA;IACb,qFAAqF;IACrF,CAAC,EAAE,MAAM,CAAA;IACT,iGAAiG;IACjG,CAAC,EAAE,MAAM,CAAA;IACT,wFAAwF;IACxF,aAAa,CAAC,EAAE,MAAM,CAAA;IACtB,0FAA0F;IAC1F,KAAK,CAAC,EAAE,MAAM,CAAA;CACd;AAED,2FAA2F;AAC3F,MAAM,WAAW,UAAU,CAAC,CAAC;IAC5B,mCAAmC;IACnC,IAAI,EAAE,MAAM,CAAA;IACZ,mFAAmF;IACnF,MAAM,EAAE,eAAe,EAAE,CAAA;IACzB,gGAAgG;IAChG,MAAM,CAAC,CAAC,EAAE,CAAC,EAAE,CAAC,EAAE,CAAC,GAAG,MAAM,CAAA;IAC1B;;;;OAIG;IACH,aAAa,CAAC,EAAE,uBAAuB,CAAC,CAAC,CAAC,CAAA;CAC1C;AAED;;;;;;GAMG;AACH,MAAM,WAAW,uBAAuB,CAAC,CAAC;IACzC,2FAA2F;IAC3F,SAAS,CAAC,KAAK,EAAE,MAAM,GAAG,MAAM,CAAA;IAChC,oFAAoF;IACpF,MAAM,EAAE,WAAW,CAAC,MAAM,CAAC,CAAA;IAC3B,4FAA4F;IAC5F,KAAK,CAAC,CAAC,EAAE,CAAC,EAAE,CAAC,EAAE,CAAC,GAAG,MAAM,GAAG,IAAI,GAAG,SAAS,CAAA;IAC5C,iDAAiD;IACjD,MAAM,CAAC,EAAE,MAAM,CAAA;IACf,gGAAgG;IAChG,gBAAgB,CAAC,EAAE,MAAM,CAAA;CACzB;AAED,mFAAmF;AACnF,MAAM,WAAW,kBAAkB,CAAC,CAAC;IACpC,WAAW,EAAE,UAAU,CAAC,CAAC,CAAC,EAAE,CAAA;IAC5B,sEAAsE;IACtE,MAAM,EAAE,MAAM,CAAA;CACd;AAED,8CAA8C;AAC9C,MAAM,WAAW,SAAS;IACzB,gDAAgD;IAChD,MAAM,EAAE,MAAM,CAAA;IACd,mCAAmC;IACnC,WAAW,EAAE,MAAM,CAAA;IACnB,kDAAkD;IAClD,aAAa,EAAE,KAAK,CAAC;QAAE,IAAI,EAAE,MAAM,CAAC;QAAC,KAAK,EAAE,MAAM,GAAG,IAAI,CAAC;QAAC,MAAM,EAAE,MAAM,CAAA;KAAE,CAAC,CAAA;CAC5E;AAED,oFAAoF;AACpF,MAAM,MAAM,aAAa,GAAG,OAAO,GAAG,QAAQ,GAAG,WAAW,CAAA;AAE5D,yEAAyE;AACzE,wBAAgB,WAAW,CAAC,KAAK,EAAE,eAAe,GAAG,MAAM,CAG1D;AAED,2DAA2D;AAC3D,wBAAgB,WAAW,CAAC,MAAM,EAAE,MAAM,GAAG,MAAM,CAIlD;AAED,oGAAoG;AACpG,wBAAgB,qBAAqB,CAAC,MAAM,EAAE,MAAM,GAAG,MAAM,CAE5D;AAED;;;;;GAKG;AACH,wBAAgB,oBAAoB,CAAC,CAAC,EAAE,MAAM,EAAE;IAC/C,IAAI,EAAE,MAAM,CAAA;IACZ,OAAO,EAAE,CAAC,MAAM,EAAE,CAAC,KAAK,MAAM,GAAG,IAAI,GAAG,SAAS,CAAA;IACjD,0CAA0C;IAC1C,UAAU,CAAC,EAAE,CAAC,CAAC,EAAE,MAAM,EAAE,CAAC,EAAE,MAAM,KAAK,MAAM,CAAA;IAC7C,MAAM,EAAE,eAAe,EAAE,CAAA;CACzB,GAAG,UAAU,CAAC,CAAC,CAAC,CAkBhB;AAED,6FAA6F
;AAC7F,wBAAgB,SAAS,CAAC,CAAC,EAAE,KAAK,EAAE,kBAAkB,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,CAAC,EAAE,CAAC,EAAE,CAAC,GAAG,SAAS,CA4BhF;AAED;;;GAGG;AACH,wBAAgB,MAAM,CAAC,KAAK,EAAE,SAAS,EAAE,UAAU,EAAE;IAAE,KAAK,EAAE,MAAM,CAAC;IAAC,KAAK,EAAE,MAAM,CAAA;CAAE,GAAG,aAAa,CAIpG"}
@@ -0,0 +1,109 @@
1
+ /**
2
+ * @copyright Sister Software
3
+ * @license AGPL-3.0
4
+ * @author Teffen Ellis, et al.
5
+ *
6
+ * The Fellegi-Sunter scorer — the matcher's decision layer.
7
+ *
8
+ * Each field comparison lands a record pair in a discrete _agreement level_ (exact / high / low /
9
+ * different / missing). Each level carries two probabilities: `m` = P(this level | the pair
10
+ * really matches) and `u` = P(this level | it doesn't). Their ratio is a Bayes factor, and its
11
+ * log is the level's contribution to the total match weight in bits:
12
+ *
13
+ * ```
14
+ * M = log2(λ / (1 - λ)) + Σ_fields log2(m_level / u_level)
15
+ * ```
16
+ *
17
+ * — a prior (how likely any two random records match) plus an additive, per-field-attributable
18
+ * stack of evidence. Convert `M` to a probability and threshold it: above the upper bound is a
19
+ * link, below the lower bound a non-link, and the band between is _clerical review_ — the
20
+ * calibrated abstain zone the whole design leans on.
21
+ *
22
+ * The `m`/`u` numbers here are NOT universal constants. They are estimated from the data — by EM,
23
+ * unsupervised (the next increment) — and the term-frequency adjustment that makes a rare-name
24
+ * agreement count more than a common one layers on top. This module is the deterministic core
25
+ * those build on: given the levels, it produces the weights, the probability, and the decision.
26
+ */
27
+ import { nameSimilarity } from "./comparators.js";
28
/**
 * Bayes-factor weight of one agreement level, in bits: `log2(m / u)`.
 *
 * @param level - The level whose `m` and `u` are compared.
 * @returns The weight. When `u` is zero or negative the ratio is undefined, so the result is
 *   `Infinity` if `m` is positive and `0` otherwise.
 */
export function levelWeight(level) {
	const { m, u } = level;
	if (u <= 0) {
		if (m > 0) return Infinity;
		return 0;
	}
	return Math.log2(m / u);
}
34
/**
 * Match weight contributed by the prior alone, in bits: `log2(λ / (1 - λ))`.
 *
 * @param lambda - Prior probability that a random pair is a match.
 * @returns The prior weight; `-Infinity` when `lambda` is 0 or below, `Infinity` when it is 1 or
 *   above.
 */
export function priorWeight(lambda) {
	return lambda <= 0 ? -Infinity : lambda >= 1 ? Infinity : Math.log2(lambda / (1 - lambda));
}
42
/**
 * Turn a total match weight (bits) into a probability.
 *
 * @param weight - Total match weight, i.e. the `log2` odds of a match.
 * @returns `1 / (1 + 2^-weight)`, which saturates at 0 and 1 for extreme weights rather than
 *   overflowing.
 */
export function probabilityFromWeight(weight) {
	const inverseOdds = 2 ** -weight;
	return 1 / (1 + inverseOdds);
}
46
+ /**
47
+ * A comparison driven by a similarity function and a tier of `minSimilarity` thresholds (the
48
+ * StatCan/Splink recipe). Levels must be ordered highest → lowest similarity, the last acting as
49
+ * the `different` catch-all (`minSimilarity` 0). A missing value on either side yields no
50
+ * evidence.
51
+ */
52
+ export function similarityComparison(config) {
53
+ const similarity = config.similarity ?? nameSimilarity;
54
+ return {
55
+ name: config.name,
56
+ levels: config.levels,
57
+ assess(a, b) {
58
+ const va = config.extract(a);
59
+ const vb = config.extract(b);
60
+ if (!va || !vb || !va.trim() || !vb.trim())
61
+ return -1;
62
+ const sim = similarity(va, vb);
63
+ for (let i = 0; i < config.levels.length; i++) {
64
+ if (sim >= (config.levels[i].minSimilarity ?? 0))
65
+ return i;
66
+ }
67
+ return config.levels.length - 1;
68
+ },
69
+ };
70
+ }
71
+ /** Score a record pair: total match weight, probability, and the per-field contributions. */
72
+ export function scorePair(model, a, b) {
73
+ let weight = priorWeight(model.lambda);
74
+ const contributions = [];
75
+ for (const comparison of model.comparisons) {
76
+ const index = comparison.assess(a, b);
77
+ if (index < 0) {
78
+ contributions.push({ name: comparison.name, level: null, weight: 0 });
79
+ continue;
80
+ }
81
+ const level = comparison.levels[index];
82
+ let w = levelWeight(level);
83
+ // Term-frequency adjustment: swap the level's average u for the agreeing value's own frequency.
84
+ const tf = comparison.termFrequency;
85
+ if (tf && tf.levels.has(index) && level.u > 0) {
86
+ const value = tf.value(a, b);
87
+ if (value) {
88
+ const frequency = Math.max(tf.frequency(value), tf.minimumFrequency ?? 1e-4);
89
+ if (frequency > 0)
90
+ w += Math.log2(level.u / frequency) * (tf.weight ?? 1);
91
+ }
92
+ }
93
+ weight += w;
94
+ contributions.push({ name: comparison.name, level: level.label, weight: w });
95
+ }
96
+ return { weight, probability: probabilityFromWeight(weight), contributions };
97
+ }
98
+ /**
99
+ * Classify a score against upper / lower match-weight thresholds (in bits): at or above `upper` is
100
+ * a link, at or below `lower` a non-link, and the band between is clerical review (abstain).
101
+ */
102
+ export function decide(score, thresholds) {
103
+ if (score.weight >= thresholds.upper)
104
+ return "match";
105
+ if (score.weight <= thresholds.lower)
106
+ return "non-match";
107
+ return "review";
108
+ }
109
+ //# sourceMappingURL=fellegi-sunter.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"fellegi-sunter.js","sourceRoot":"","sources":["../fellegi-sunter.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;;;;GAyBG;AAEH,OAAO,EAAE,cAAc,EAAE,MAAM,kBAAkB,CAAA;AAwEjD,yEAAyE;AACzE,MAAM,UAAU,WAAW,CAAC,KAAsB;IACjD,IAAI,KAAK,CAAC,CAAC,IAAI,CAAC;QAAE,OAAO,KAAK,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,QAAQ,CAAC,CAAC,CAAC,CAAC,CAAA;IACnD,OAAO,IAAI,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC,GAAG,KAAK,CAAC,CAAC,CAAC,CAAA;AACpC,CAAC;AAED,2DAA2D;AAC3D,MAAM,UAAU,WAAW,CAAC,MAAc;IACzC,IAAI,MAAM,IAAI,CAAC;QAAE,OAAO,CAAC,QAAQ,CAAA;IACjC,IAAI,MAAM,IAAI,CAAC;QAAE,OAAO,QAAQ,CAAA;IAChC,OAAO,IAAI,CAAC,IAAI,CAAC,MAAM,GAAG,CAAC,CAAC,GAAG,MAAM,CAAC,CAAC,CAAA;AACxC,CAAC;AAED,oGAAoG;AACpG,MAAM,UAAU,qBAAqB,CAAC,MAAc;IACnD,OAAO,CAAC,GAAG,CAAC,CAAC,GAAG,CAAC,IAAI,CAAC,MAAM,CAAC,CAAA;AAC9B,CAAC;AAED;;;;;GAKG;AACH,MAAM,UAAU,oBAAoB,CAAI,MAMvC;IACA,MAAM,UAAU,GAAG,MAAM,CAAC,UAAU,IAAI,cAAc,CAAA;IAEtD,OAAO;QACN,IAAI,EAAE,MAAM,CAAC,IAAI;QACjB,MAAM,EAAE,MAAM,CAAC,MAAM;QACrB,MAAM,CAAC,CAAC,EAAE,CAAC;YACV,MAAM,EAAE,GAAG,MAAM,CAAC,OAAO,CAAC,CAAC,CAAC,CAAA;YAC5B,MAAM,EAAE,GAAG,MAAM,CAAC,OAAO,CAAC,CAAC,CAAC,CAAA;YAC5B,IAAI,CAAC,EAAE,IAAI,CAAC,EAAE,IAAI,CAAC,EAAE,CAAC,IAAI,EAAE,IAAI,CAAC,EAAE,CAAC,IAAI,EAAE;gBAAE,OAAO,CAAC,CAAC,CAAA;YAErD,MAAM,GAAG,GAAG,UAAU,CAAC,EAAE,EAAE,EAAE,CAAC,CAAA;YAC9B,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,MAAM,CAAC,MAAM,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;gBAC/C,IAAI,GAAG,IAAI,CAAC,MAAM,CAAC,MAAM,CAAC,CAAC,CAAE,CAAC,aAAa,IAAI,CAAC,CAAC;oBAAE,OAAO,CAAC,CAAA;YAC5D,CAAC;YACD,OAAO,MAAM,CAAC,MAAM,CAAC,MAAM,GAAG,CAAC,CAAA;QAChC,CAAC;KACD,CAAA;AACF,CAAC;AAED,6FAA6F;AAC7F,MAAM,UAAU,SAAS,CAAI,KAA4B,EAAE,CAAI,EAAE,CAAI;IACpE,IAAI,MAAM,GAAG,WAAW,CAAC,KAAK,CAAC,MAAM,CAAC,CAAA;IACtC,MAAM,aAAa,GAA+B,EAAE,CAAA;IAEpD,KAAK,MAAM,UAAU,IAAI,KAAK,CAAC,WAAW,EAAE,CAAC;QAC5C,MAAM,KAAK,GAAG,UAAU,CAAC,MAAM,CAAC,CAAC,EAAE,CAAC,CAAC,CAAA;QACrC,IAAI,KAAK,GAAG,CAAC,EAAE,CAAC;YACf,aAAa,CAAC,IAAI,CAAC,EAAE,IAAI,EAAE,UAAU,CAAC,IAAI,EAAE,KAAK,EAAE,IAAI,EAAE,MAAM,EAAE,CAAC,EAAE,CAAC,CAAA;YACrE,SAAQ;QAC
T,CAAC;QACD,MAAM,KAAK,GAAG,UAAU,CAAC,MAAM,CAAC,KAAK,CAAE,CAAA;QACvC,IAAI,CAAC,GAAG,WAAW,CAAC,KAAK,CAAC,CAAA;QAE1B,gGAAgG;QAChG,MAAM,EAAE,GAAG,UAAU,CAAC,aAAa,CAAA;QACnC,IAAI,EAAE,IAAI,EAAE,CAAC,MAAM,CAAC,GAAG,CAAC,KAAK,CAAC,IAAI,KAAK,CAAC,CAAC,GAAG,CAAC,EAAE,CAAC;YAC/C,MAAM,KAAK,GAAG,EAAE,CAAC,KAAK,CAAC,CAAC,EAAE,CAAC,CAAC,CAAA;YAC5B,IAAI,KAAK,EAAE,CAAC;gBACX,MAAM,SAAS,GAAG,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,SAAS,CAAC,KAAK,CAAC,EAAE,EAAE,CAAC,gBAAgB,IAAI,IAAI,CAAC,CAAA;gBAC5E,IAAI,SAAS,GAAG,CAAC;oBAAE,CAAC,IAAI,IAAI,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC,GAAG,SAAS,CAAC,GAAG,CAAC,EAAE,CAAC,MAAM,IAAI,CAAC,CAAC,CAAA;YAC1E,CAAC;QACF,CAAC;QAED,MAAM,IAAI,CAAC,CAAA;QACX,aAAa,CAAC,IAAI,CAAC,EAAE,IAAI,EAAE,UAAU,CAAC,IAAI,EAAE,KAAK,EAAE,KAAK,CAAC,KAAK,EAAE,MAAM,EAAE,CAAC,EAAE,CAAC,CAAA;IAC7E,CAAC;IAED,OAAO,EAAE,MAAM,EAAE,WAAW,EAAE,qBAAqB,CAAC,MAAM,CAAC,EAAE,aAAa,EAAE,CAAA;AAC7E,CAAC;AAED;;;GAGG;AACH,MAAM,UAAU,MAAM,CAAC,KAAgB,EAAE,UAA4C;IACpF,IAAI,KAAK,CAAC,MAAM,IAAI,UAAU,CAAC,KAAK;QAAE,OAAO,OAAO,CAAA;IACpD,IAAI,KAAK,CAAC,MAAM,IAAI,UAAU,CAAC,KAAK;QAAE,OAAO,WAAW,CAAA;IACxD,OAAO,QAAQ,CAAA;AAChB,CAAC"}
package/out/index.d.ts ADDED
@@ -0,0 +1,25 @@
1
+ /**
2
+ * @copyright Sister Software
3
+ * @license AGPL-3.0
4
+ * @author Teffen Ellis, et al.
5
+ *
6
+ * `@mailwoman/match` — the geocode-first record matcher: block → score → cluster.
7
+ *
8
+ * The full three-stage pipeline:
9
+ *
10
+ * 1. {@link block Block} — geo-first candidate generation (a spatial-cell union of cheap, high-recall
11
+ * keys), so two records at the same place meet regardless of address spelling.
12
+ * 2. **Score** — string {@link jaroWinkler comparators} → the {@link scorePair Fellegi-Sunter} weight
13
+ * model (agreement levels → `log2(m/u)` weights → probability → link / review / non-link),
14
+ * with `m`/`u` learned label-free by {@link estimateParameters EM} and rare-value agreement
15
+ * up-weighted by {@link withTermFrequency term frequency}.
16
+ * 3. {@link cluster Cluster} — resolve the non-transitive pairwise link graph into canonical entities.
17
+ */
18
+ export * from "./blocking.js";
19
+ export * from "./clustering.js";
20
+ export * from "./comparators.js";
21
+ export * from "./distance.js";
22
+ export * from "./em.js";
23
+ export * from "./fellegi-sunter.js";
24
+ export * from "./tf.js";
25
+ //# sourceMappingURL=index.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../index.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;GAgBG;AAEH,cAAc,eAAe,CAAA;AAC7B,cAAc,iBAAiB,CAAA;AAC/B,cAAc,kBAAkB,CAAA;AAChC,cAAc,eAAe,CAAA;AAC7B,cAAc,SAAS,CAAA;AACvB,cAAc,qBAAqB,CAAA;AACnC,cAAc,SAAS,CAAA"}
package/out/index.js ADDED
@@ -0,0 +1,25 @@
1
+ /**
2
+ * @copyright Sister Software
3
+ * @license AGPL-3.0
4
+ * @author Teffen Ellis, et al.
5
+ *
6
+ * `@mailwoman/match` — the geocode-first record matcher: block → score → cluster.
7
+ *
8
+ * The full three-stage pipeline:
9
+ *
10
+ * 1. {@link block Block} — geo-first candidate generation (a spatial-cell union of cheap, high-recall
11
+ * keys), so two records at the same place meet regardless of address spelling.
12
+ * 2. **Score** — string {@link jaroWinkler comparators} → the {@link scorePair Fellegi-Sunter} weight
13
+ * model (agreement levels → `log2(m/u)` weights → probability → link / review / non-link),
14
+ * with `m`/`u` learned label-free by {@link estimateParameters EM} and rare-value agreement
15
+ * up-weighted by {@link withTermFrequency term frequency}.
16
+ * 3. {@link cluster Cluster} — resolve the non-transitive pairwise link graph into canonical entities.
17
+ */
18
+ export * from "./blocking.js";
19
+ export * from "./clustering.js";
20
+ export * from "./comparators.js";
21
+ export * from "./distance.js";
22
+ export * from "./em.js";
23
+ export * from "./fellegi-sunter.js";
24
+ export * from "./tf.js";
25
+ //# sourceMappingURL=index.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"index.js","sourceRoot":"","sources":["../index.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;GAgBG;AAEH,cAAc,eAAe,CAAA;AAC7B,cAAc,iBAAiB,CAAA;AAC/B,cAAc,kBAAkB,CAAA;AAChC,cAAc,eAAe,CAAA;AAC7B,cAAc,SAAS,CAAA;AACvB,cAAc,qBAAqB,CAAA;AACnC,cAAc,SAAS,CAAA"}
package/out/tf.d.ts ADDED
@@ -0,0 +1,55 @@
1
+ /**
2
+ * @copyright Sister Software
3
+ * @license AGPL-3.0
4
+ * @author Teffen Ellis, et al.
5
+ *
6
+ * Term-frequency adjustment — making a rare-value agreement count more than a common one.
7
+ *
8
+ * Two people both named "Vijayan" is far stronger evidence of a match than two both named "Smith",
9
+ * because "Smith" agreements happen by chance all the time and "Vijayan" agreements don't. The
10
+ * Fellegi-Sunter `m` (how often a true match agrees) is roughly the same either way; what differs
11
+ * is `u` — the chance a _non_-match agrees — which for an exact agreement on value `v` is just
12
+ * how common `v` is. So we leave `m`, and replace the level's average `u` with `frequency(v)`,
13
+ * adding `log2(u_level / frequency(v))` to the weight: a big positive bump for rare values, a
14
+ * penalty for common ones.
15
+ *
16
+ * Crucially for a label-free matcher: the frequencies are computed ON-THE-FLY from the input column
17
+ * (the Splink approach) — no external Census table required. Build a {@link TermFrequencyTable}
18
+ * from the values you're matching, then attach it to a comparison with {@link withTermFrequency}.
19
+ */
20
+ import type { Comparison } from "./fellegi-sunter.js";
21
+ /** A lookup of how common each value is, built from a column of observed values: a relative frequency in (0, 1] for values that were seen, and 0 for values that were not. */
22
+ export interface TermFrequencyTable {
23
+ /** Relative frequency of a value (its normalized form), or 0 if never seen. */
24
+ frequency(value: string): number;
25
+ /** Total observations the table was built from. */
26
+ readonly total: number;
27
+ /** Number of distinct normalized values. */
28
+ readonly distinct: number;
29
+ }
30
+ /**
31
+ * Build a {@link TermFrequencyTable} from an iterable of values (e.g. every `given` name in the
32
+ * dataset). Values are normalized (default: trim + lowercase + collapse whitespace) before
33
+ * counting, and `frequency()` normalizes its argument the same way, so callers pass raw field
34
+ * values. NOTE(review): the signature accepts `null` / `undefined` entries; presumably they are skipped when counting — confirm against the implementation.
35
+ */
36
+ export declare function buildTermFrequencyTable(values: Iterable<string | null | undefined>, opts?: {
37
+ normalize?: (value: string) => string; // optional replacement for the default normalizer described above
38
+ }): TermFrequencyTable;
39
+ /**
40
+ * Attach a term-frequency adjustment to a comparison. By default it applies to the exact level
41
+ * (index 0) and looks up the value via `value(a, b)` — usually the agreeing field extracted from
42
+ * one side. Returns a new comparison; the underlying `assess` and levels are untouched, so this
43
+ * composes with EM (which re-estimates the base `m`/`u` the adjustment sits on top of).
44
+ */
45
+ export declare function withTermFrequency<R>(comparison: Comparison<R>, config: {
46
+ table: TermFrequencyTable; // frequency lookup to attach to the comparison
47
+ value: (a: R, b: R) => string | null | undefined; // extracts the value to look up for a pair (usually the agreeing field from one side)
48
+ /** Level indices to adjust. Default `[0]` (the exact level). */
49
+ levels?: Iterable<number>;
50
+ /** Scale in [0, 1]. Default 1. */
51
+ weight?: number;
52
+ /** Frequency floor bounding the boost on ultra-rare values. Default 1e-4. */
53
+ minimumFrequency?: number;
54
+ }): Comparison<R>;
55
+ //# sourceMappingURL=tf.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"tf.d.ts","sourceRoot":"","sources":["../tf.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;GAkBG;AAEH,OAAO,KAAK,EAAE,UAAU,EAA2B,MAAM,qBAAqB,CAAA;AAE9E,+FAA+F;AAC/F,MAAM,WAAW,kBAAkB;IAClC,+EAA+E;IAC/E,SAAS,CAAC,KAAK,EAAE,MAAM,GAAG,MAAM,CAAA;IAChC,mDAAmD;IACnD,QAAQ,CAAC,KAAK,EAAE,MAAM,CAAA;IACtB,4CAA4C;IAC5C,QAAQ,CAAC,QAAQ,EAAE,MAAM,CAAA;CACzB;AAID;;;;;GAKG;AACH,wBAAgB,uBAAuB,CACtC,MAAM,EAAE,QAAQ,CAAC,MAAM,GAAG,IAAI,GAAG,SAAS,CAAC,EAC3C,IAAI,GAAE;IAAE,SAAS,CAAC,EAAE,CAAC,KAAK,EAAE,MAAM,KAAK,MAAM,CAAA;CAAO,GAClD,kBAAkB,CAqBpB;AAED;;;;;GAKG;AACH,wBAAgB,iBAAiB,CAAC,CAAC,EAClC,UAAU,EAAE,UAAU,CAAC,CAAC,CAAC,EACzB,MAAM,EAAE;IACP,KAAK,EAAE,kBAAkB,CAAA;IACzB,KAAK,EAAE,CAAC,CAAC,EAAE,CAAC,EAAE,CAAC,EAAE,CAAC,KAAK,MAAM,GAAG,IAAI,GAAG,SAAS,CAAA;IAChD,gEAAgE;IAChE,MAAM,CAAC,EAAE,QAAQ,CAAC,MAAM,CAAC,CAAA;IACzB,kCAAkC;IAClC,MAAM,CAAC,EAAE,MAAM,CAAA;IACf,6EAA6E;IAC7E,gBAAgB,CAAC,EAAE,MAAM,CAAA;CACzB,GACC,UAAU,CAAC,CAAC,CAAC,CAUf"}