npm - @mailwoman/match - Versions diffs - 4.8.1 - Mend

@mailwoman/match 4.8.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (33) hide show

package/out/blocking.d.ts +77 -0
package/out/blocking.d.ts.map +1 -0
package/out/blocking.js +136 -0
package/out/blocking.js.map +1 -0
package/out/clustering.d.ts +50 -0
package/out/clustering.d.ts.map +1 -0
package/out/clustering.js +101 -0
package/out/clustering.js.map +1 -0
package/out/comparators.d.ts +49 -0
package/out/comparators.d.ts.map +1 -0
package/out/comparators.js +119 -0
package/out/comparators.js.map +1 -0
package/out/distance.d.ts +41 -0
package/out/distance.d.ts.map +1 -0
package/out/distance.js +68 -0
package/out/distance.js.map +1 -0
package/out/em.d.ts +51 -0
package/out/em.d.ts.map +1 -0
package/out/em.js +108 -0
package/out/em.js.map +1 -0
package/out/fellegi-sunter.d.ts +124 -0
package/out/fellegi-sunter.d.ts.map +1 -0
package/out/fellegi-sunter.js +109 -0
package/out/fellegi-sunter.js.map +1 -0
package/out/index.d.ts +25 -0
package/out/index.d.ts.map +1 -0
package/out/index.js +25 -0
package/out/index.js.map +1 -0
package/out/tf.d.ts +55 -0
package/out/tf.d.ts.map +1 -0
package/out/tf.js +66 -0
package/out/tf.js.map +1 -0
package/package.json +38 -0

package/out/distance.js ADDED Viewed

@@ -0,0 +1,68 @@
+/**
+ * @copyright Sister Software
+ * @license AGPL-3.0
+ * @author Teffen Ellis, et al.
+ *
+ *   Geographic distance as a scoring feature — the other half of geocode-first matching.
+ *
+ *   Blocking uses geography to _propose_ candidates; this scores them on it. The research is explicit
+ *   that an address must be matched as a SPATIAL attribute, not by string similarity (a
+ *   one-character edit can be 650 m apart), and that distance measurably helps as a comparison
+ *   feature. So we bucket the great-circle distance between two records' coordinates into ordered
+ *   Fellegi-Sunter agreement levels (Splink's `DistanceInKMAtThresholds`): "same building" / "same
+ *   block" / "same area" / far, each with its own m/u and weight.
+ *
+ *   Calibrate the bucket boundaries to the geocoder's OWN error, which is heavy-tailed and density-
+ *   dependent (≈38 m urban, ≈200 m rural). A weakening of this evidence by geocode quality (a
+ *   shared interpolated centroid is softer than a shared rooftop point) is the documented
+ *   refinement.
+ */
+/** Mean Earth radius in km (IUGG). */
+const EARTH_RADIUS_KM = 6371.0088;
+/**
+ * Great-circle (haversine) distance in km between two coordinates.
+ *
+ * Both arguments are read through their `latitude` / `longitude` properties, which are converted
+ * from degrees to radians before use. The result is an arc length on a sphere of radius
+ * `EARTH_RADIUS_KM`. Nothing is validated here: a non-finite coordinate propagates to a `NaN`
+ * result, so callers are expected to filter bad coordinates first.
+ *
+ * @param a First coordinate (`latitude`, `longitude` in degrees).
+ * @param b Second coordinate (`latitude`, `longitude` in degrees).
+ * @returns The distance between `a` and `b` in kilometres.
+ */
+export function haversineKm(a, b) {
+    const toRad = (degrees) => (degrees * Math.PI) / 180;
+    const dLat = toRad(b.latitude - a.latitude);
+    const dLon = toRad(b.longitude - a.longitude);
+    const lat1 = toRad(a.latitude);
+    const lat2 = toRad(b.latitude);
+    const h = Math.sin(dLat / 2) ** 2 + Math.cos(lat1) * Math.cos(lat2) * Math.sin(dLon / 2) ** 2; // haversine of the central angle
+    return 2 * EARTH_RADIUS_KM * Math.asin(Math.min(1, Math.sqrt(h))); // clamp to 1 so rounding error cannot push asin out of its domain
+}
+/**
+ * A geo-distance comparison: bucket the great-circle distance between two records' coordinates into
+ * ordered agreement levels. Levels must be ordered NEAREST first by `maxKm`, the last acting as the
+ * `far` catch-all (`maxKm` omitted → unbounded). A missing/invalid coordinate on either side yields
+ * no evidence.
+ *
+ * `config` supplies three things: `name` (copied onto the result), `levels` (exposed on the result
+ * as the same array, not a copy) and `extract`, which is called once per record to obtain its
+ * coordinate. A coordinate counts as valid only when it is truthy and both `latitude` and
+ * `longitude` pass `Number.isFinite`.
+ *
+ * `assess(a, b)` on the returned object yields:
+ *
+ * - `-1` when either extracted coordinate is invalid;
+ * - otherwise the index of the FIRST level whose `maxKm` is at least the distance (inclusive), an
+ *   omitted `maxKm` being treated as `Infinity`;
+ * - otherwise the last level's index, so the final level still catches everything even if it was
+ *   given a finite `maxKm`. With an empty `levels` array this fallback evaluates to `-1`.
+ *
+ * @param config The comparison name, the ordered levels, and the coordinate extractor.
+ * @returns An object with `name`, `levels` and an `assess(a, b)` method.
+ */
+export function distanceComparison(config) {
+    const valid = (c) => !!c && Number.isFinite(c.latitude) && Number.isFinite(c.longitude);
+    return {
+        name: config.name,
+        levels: config.levels,
+        assess(a, b) {
+            const ca = config.extract(a);
+            const cb = config.extract(b);
+            if (!valid(ca) || !valid(cb))
+                return -1; // no usable coordinate on at least one side: no evidence
+            const km = haversineKm(ca, cb);
+            for (let i = 0; i < config.levels.length; i++) {
+                if (km <= (config.levels[i].maxKm ?? Infinity))
+                    return i; // first (nearest) bucket that contains the distance
+            }
+            return config.levels.length - 1; // nothing matched: fall back to the last level
+        },
+    };
+}
+/**
+ * Default distance levels, nearest → far, with boundaries at rooftop / block / locality scale. The
+ * m/u are illustrative seeds (EM re-estimates them); the boundaries reflect typical geocoder
+ * error.
+ *
+ * The inclusive upper bounds are 0.05 km (50 m), 0.5 km (500 m) and 5 km; the final `far` level
+ * omits `maxKm` and therefore has no upper bound. Across the four levels the `m` values sum to 1
+ * and the `u` values sum to 1.
+ */
+export const DEFAULT_DISTANCE_LEVELS = [
+    { label: "same-building", maxKm: 0.05, m: 0.7, u: 0.001 },
+    { label: "same-block", maxKm: 0.5, m: 0.2, u: 0.02 },
+    { label: "same-area", maxKm: 5, m: 0.08, u: 0.2 },
+    { label: "far", m: 0.02, u: 0.779 },
+];
+//# sourceMappingURL=distance.js.map

package/out/distance.js.map ADDED Viewed

@@ -0,0 +1 @@

+ {"version":3,"file":"distance.js","sourceRoot":"","sources":["../distance.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;GAkBG;AAKH,sCAAsC;AACtC,MAAM,eAAe,GAAG,SAAS,CAAA;AAEjC,uEAAuE;AACvE,MAAM,UAAU,WAAW,CAAC,CAAS,EAAE,CAAS;IAC/C,MAAM,KAAK,GAAG,CAAC,OAAe,EAAU,EAAE,CAAC,CAAC,OAAO,GAAG,IAAI,CAAC,EAAE,CAAC,GAAG,GAAG,CAAA;IACpE,MAAM,IAAI,GAAG,KAAK,CAAC,CAAC,CAAC,QAAQ,GAAG,CAAC,CAAC,QAAQ,CAAC,CAAA;IAC3C,MAAM,IAAI,GAAG,KAAK,CAAC,CAAC,CAAC,SAAS,GAAG,CAAC,CAAC,SAAS,CAAC,CAAA;IAC7C,MAAM,IAAI,GAAG,KAAK,CAAC,CAAC,CAAC,QAAQ,CAAC,CAAA;IAC9B,MAAM,IAAI,GAAG,KAAK,CAAC,CAAC,CAAC,QAAQ,CAAC,CAAA;IAE9B,MAAM,CAAC,GAAG,IAAI,CAAC,GAAG,CAAC,IAAI,GAAG,CAAC,CAAC,IAAI,CAAC,GAAG,IAAI,CAAC,GAAG,CAAC,IAAI,CAAC,GAAG,IAAI,CAAC,GAAG,CAAC,IAAI,CAAC,GAAG,IAAI,CAAC,GAAG,CAAC,IAAI,GAAG,CAAC,CAAC,IAAI,CAAC,CAAA;IAC7F,OAAO,CAAC,GAAG,eAAe,GAAG,IAAI,CAAC,IAAI,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC,EAAE,IAAI,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,CAAC,CAAA;AAClE,CAAC;AAED;;;;;GAKG;AACH,MAAM,UAAU,kBAAkB,CAAI,MAIrC;IACA,MAAM,KAAK,GAAG,CAAC,CAA4B,EAAe,EAAE,CAC3D,CAAC,CAAC,CAAC,IAAI,MAAM,CAAC,QAAQ,CAAC,CAAC,CAAC,QAAQ,CAAC,IAAI,MAAM,CAAC,QAAQ,CAAC,CAAC,CAAC,SAAS,CAAC,CAAA;IAEnE,OAAO;QACN,IAAI,EAAE,MAAM,CAAC,IAAI;QACjB,MAAM,EAAE,MAAM,CAAC,MAAM;QACrB,MAAM,CAAC,CAAC,EAAE,CAAC;YACV,MAAM,EAAE,GAAG,MAAM,CAAC,OAAO,CAAC,CAAC,CAAC,CAAA;YAC5B,MAAM,EAAE,GAAG,MAAM,CAAC,OAAO,CAAC,CAAC,CAAC,CAAA;YAC5B,IAAI,CAAC,KAAK,CAAC,EAAE,CAAC,IAAI,CAAC,KAAK,CAAC,EAAE,CAAC;gBAAE,OAAO,CAAC,CAAC,CAAA;YAEvC,MAAM,EAAE,GAAG,WAAW,CAAC,EAAE,EAAE,EAAE,CAAC,CAAA;YAC9B,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,MAAM,CAAC,MAAM,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;gBAC/C,IAAI,EAAE,IAAI,CAAC,MAAM,CAAC,MAAM,CAAC,CAAC,CAAE,CAAC,KAAK,IAAI,QAAQ,CAAC;oBAAE,OAAO,CAAC,CAAA;YAC1D,CAAC;YACD,OAAO,MAAM,CAAC,MAAM,CAAC,MAAM,GAAG,CAAC,CAAA;QAChC,CAAC;KACD,CAAA;AACF,CAAC;AAED;;;;GAIG;AACH,MAAM,CAAC,MAAM,uBAAuB,GAAsB;IACzD,EAAE,KAAK,EAAE,eAAe,EAAE,KAAK,EAAE,IAAI,EAAE,CAAC,EAAE,GAAG,EAAE,CAAC,EAAE,KAAK,EAAE;IACzD,EAAE,KAAK,EAAE,YAAY,EAAE,KAAK,EAAE,GAAG,EAAE,CAAC,EAAE,GAAG,EAAE,CAAC,EAA
E,IAAI,EAAE;IACpD,EAAE,KAAK,EAAE,WAAW,EAAE,KAAK,EAAE,CAAC,EAAE,CAAC,EAAE,IAAI,EAAE,CAAC,EAAE,GAAG,EAAE;IACjD,EAAE,KAAK,EAAE,KAAK,EAAE,CAAC,EAAE,IAAI,EAAE,CAAC,EAAE,KAAK,EAAE;CACnC,CAAA"}

package/out/em.d.ts ADDED Viewed

@@ -0,0 +1,51 @@
+/**
+ * @copyright Sister Software
+ * @license AGPL-3.0
+ * @author Teffen Ellis, et al.
+ *
+ *   Unsupervised parameter estimation for the Fellegi-Sunter model — the part that makes the matcher
+ *   work with no labeled data.
+ *
+ *   The paradox: to estimate `m`/`u` you need to know which pairs match, but finding matches is the
+ *   whole problem. EM (Winkler 1988) breaks it by treating the match/non-match status as a hidden
+ *   variable and iterating:
+ *
+ *   - **E-step** — under the current parameters, compute each pair's posterior responsibility `g =
+ *       P(match | its agreement pattern)`.
+ *   - **M-step** — re-estimate `λ`, and each level's `m`/`u`, as `g`-weighted (resp. `(1-g)`-weighted)
+ *       fractions of the pairs landing in that level.
+ *
+ *   It converges because true matches agree on most fields and non-matches don't, so the two classes
+ *   pull apart. Assumes conditional independence of the comparisons given match status (the
+ *   standard F-S assumption). Caveat from the literature: EM can land in a local optimum when the
+ *   true match rate is very low — seed from sensible `m`/`u` (the model's existing levels do this)
+ *   and sanity-check that the recovered `m` exceeds `u` on the top agreement level.
+ */
+import type { Comparison, FellegiSunterModel } from "./fellegi-sunter.js";
+/** Reduce a record pair to its agreement pattern — the per-comparison level index (`-1` = missing). */
+export declare function agreementPattern<R>(comparisons: Comparison<R>[], a: R, b: R): number[];
+/** Options for {@link estimateParameters}. */
+export interface EmOptions {
+    /** Hard iteration cap. Default 100. */
+    maxIterations?: number;
+    /** Convergence tolerance on the largest parameter change between iterations. Default 1e-6. */
+    tolerance?: number;
+    /** Starting prior match rate. Defaults to the model's `lambda`. */
+    initialLambda?: number;
+}
+/**
+ * The fitted model plus convergence diagnostics.
+ *
+ * `iterations` is the number of EM passes that ran, counting the pass on which convergence was
+ * detected; it is `0` when no patterns were supplied. `converged` is `true` only when a pass
+ * changed `lambda` and every `m`/`u` by less than the tolerance before the iteration cap was
+ * reached.
+ */
+export interface EmResult<R> {
+    /** The input model with every level's `m`/`u` and the prior `lambda` re-estimated. */
+    model: FellegiSunterModel<R>;
+    /** The estimated prior match rate. */
+    lambda: number;
+    iterations: number;
+    converged: boolean;
+}
+/**
+ * Estimate `m`/`u` and the prior `λ` from unlabeled agreement patterns via EM. The patterns are
+ * per-comparison level indices (as produced by {@link agreementPattern}); a `-1` (missing) field
+ * contributes no evidence to either class. The model's existing level `m`/`u` seed the iteration.
+ */
+export declare function estimateParameters<R>(model: FellegiSunterModel<R>, patterns: number[][], opts?: EmOptions): EmResult<R>;
+//# sourceMappingURL=em.d.ts.map

package/out/em.d.ts.map ADDED Viewed

@@ -0,0 +1 @@

+ {"version":3,"file":"em.d.ts","sourceRoot":"","sources":["../em.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;GAsBG;AAEH,OAAO,KAAK,EAAE,UAAU,EAAE,kBAAkB,EAAE,MAAM,qBAAqB,CAAA;AAQzE,uGAAuG;AACvG,wBAAgB,gBAAgB,CAAC,CAAC,EAAE,WAAW,EAAE,UAAU,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,EAAE,CAAC,EAAE,CAAC,EAAE,CAAC,GAAG,MAAM,EAAE,CAEtF;AAED,8CAA8C;AAC9C,MAAM,WAAW,SAAS;IACzB,uCAAuC;IACvC,aAAa,CAAC,EAAE,MAAM,CAAA;IACtB,8FAA8F;IAC9F,SAAS,CAAC,EAAE,MAAM,CAAA;IAClB,mEAAmE;IACnE,aAAa,CAAC,EAAE,MAAM,CAAA;CACtB;AAED,qDAAqD;AACrD,MAAM,WAAW,QAAQ,CAAC,CAAC;IAC1B,sFAAsF;IACtF,KAAK,EAAE,kBAAkB,CAAC,CAAC,CAAC,CAAA;IAC5B,sCAAsC;IACtC,MAAM,EAAE,MAAM,CAAA;IACd,UAAU,EAAE,MAAM,CAAA;IAClB,SAAS,EAAE,OAAO,CAAA;CAClB;AAED;;;;GAIG;AACH,wBAAgB,kBAAkB,CAAC,CAAC,EACnC,KAAK,EAAE,kBAAkB,CAAC,CAAC,CAAC,EAC5B,QAAQ,EAAE,MAAM,EAAE,EAAE,EACpB,IAAI,GAAE,SAAc,GAClB,QAAQ,CAAC,CAAC,CAAC,CAgFb"}

package/out/em.js ADDED Viewed

@@ -0,0 +1,108 @@
+/**
+ * @copyright Sister Software
+ * @license AGPL-3.0
+ * @author Teffen Ellis, et al.
+ *
+ *   Unsupervised parameter estimation for the Fellegi-Sunter model — the part that makes the matcher
+ *   work with no labeled data.
+ *
+ *   The paradox: to estimate `m`/`u` you need to know which pairs match, but finding matches is the
+ *   whole problem. EM (Winkler 1988) breaks it by treating the match/non-match status as a hidden
+ *   variable and iterating:
+ *
+ *   - **E-step** — under the current parameters, compute each pair's posterior responsibility `g =
+ *       P(match | its agreement pattern)`.
+ *   - **M-step** — re-estimate `λ`, and each level's `m`/`u`, as `g`-weighted (resp. `(1-g)`-weighted)
+ *       fractions of the pairs landing in that level.
+ *
+ *   It converges because true matches agree on most fields and non-matches don't, so the two classes
+ *   pull apart. Assumes conditional independence of the comparisons given match status (the
+ *   standard F-S assumption). Caveat from the literature: EM can land in a local optimum when the
+ *   true match rate is very low — seed from sensible `m`/`u` (the model's existing levels do this)
+ *   and sanity-check that the recovered `m` exceeds `u` on the top agreement level.
+ */
+/**
+ * Tiny floor mixed into the M-step so an unobserved level never produces a zero (→ infinite
+ * weight).
+ *
+ * `estimateParameters` adds it to each level's numerator and adds `EPSILON × (number of levels)`
+ * to the matching denominator.
+ */
+const EPSILON = 1e-9;
+/**
+ * Reduce a record pair to its agreement pattern — the per-comparison level index (`-1` = missing).
+ *
+ * Calls `assess(a, b)` on every comparison and collects the results, so the returned array has one
+ * entry per comparison, in the same order as `comparisons`.
+ *
+ * @param comparisons The comparisons to evaluate.
+ * @param a First record of the pair.
+ * @param b Second record of the pair.
+ * @returns One level index per comparison.
+ */
+export function agreementPattern(comparisons, a, b) {
+    return comparisons.map((comparison) => comparison.assess(a, b));
+}
+/**
+ * Estimate `m`/`u` and the prior `λ` from unlabeled agreement patterns via EM. The patterns are
+ * per-comparison level indices (as produced by {@link agreementPattern}); a `-1` (missing) field
+ * contributes no evidence to either class. The model's existing level `m`/`u` seed the iteration.
+ *
+ * The input model is not mutated: the working `m`/`u` tables are fresh arrays and the fitted
+ * comparisons are new objects. The returned model is built from `comparisons` and `lambda` only, so
+ * any other property on the input model is not carried over. When `patterns` is empty the input
+ * model is returned as-is with `iterations` 0 and `converged` false.
+ *
+ * Each pass accumulates, per comparison and level, the responsibility-weighted counts for the
+ * match class (`g`) and the non-match class (`1 - g`), then re-estimates every parameter. A
+ * comparison whose class denominator is still 0 keeps its previous values. The loop stops when the
+ * largest absolute change in `lambda` or any `m`/`u` drops below `tolerance`, or after
+ * `maxIterations` passes.
+ *
+ * NOTE(review): a pattern entry that is not a valid level index (too large, or absent because the
+ * pattern is shorter than the comparison list) is not rejected. The pair's likelihood becomes
+ * `NaN`, its responsibility falls back to 0, and it is still counted in the non-match denominator
+ * of every comparison it touches — silently deflating the `u` estimates. Validate patterns
+ * upstream, or confirm this is acceptable.
+ *
+ * @param model The model whose comparisons and level `m`/`u` seed the estimation.
+ * @param patterns One agreement pattern per record pair.
+ * @param opts Optional `maxIterations` (default 100), `tolerance` (default 1e-6) and
+ *   `initialLambda` (default: the model's `lambda`).
+ * @returns The fitted model, the estimated `lambda`, the number of passes run, and whether the
+ *   tolerance was met.
+ */
+export function estimateParameters(model, patterns, opts = {}) {
+    const maxIterations = opts.maxIterations ?? 100;
+    const tolerance = opts.tolerance ?? 1e-6;
+    const comparisons = model.comparisons;
+    const levelCounts = comparisons.map((c) => c.levels.length);
+    // Per-comparison, per-level m/u, seeded from the model's current levels (copied, so the input is never written to).
+    const m = comparisons.map((c) => c.levels.map((l) => l.m));
+    const u = comparisons.map((c) => c.levels.map((l) => l.u));
+    let lambda = opts.initialLambda ?? model.lambda;
+    let iterations = 0;
+    let converged = false;
+    if (patterns.length === 0) {
+        return { model, lambda, iterations, converged }; // nothing to learn from: hand back the input unchanged
+    }
+    for (; iterations < maxIterations; iterations++) {
+        const mNumerator = comparisons.map((_, i) => new Array(levelCounts[i]).fill(0));
+        const uNumerator = comparisons.map((_, i) => new Array(levelCounts[i]).fill(0));
+        const mDenominator = comparisons.map(() => 0);
+        const uDenominator = comparisons.map(() => 0);
+        let responsibilitySum = 0;
+        // E-step: posterior P(match | pattern) for each pair.
+        for (const pattern of patterns) {
+            let matchLikelihood = lambda;
+            let nonMatchLikelihood = 1 - lambda;
+            for (let i = 0; i < comparisons.length; i++) {
+                const level = pattern[i];
+                if (level < 0)
+                    continue; // missing field: contributes to neither likelihood
+                matchLikelihood *= m[i][level];
+                nonMatchLikelihood *= u[i][level];
+            }
+            const total = matchLikelihood + nonMatchLikelihood;
+            const g = total > 0 ? matchLikelihood / total : 0; // zero (or NaN) total: treat the pair as a non-match
+            responsibilitySum += g;
+            for (let i = 0; i < comparisons.length; i++) {
+                const level = pattern[i];
+                if (level < 0)
+                    continue; // missing field: excluded from this comparison's counts as well
+                mNumerator[i][level] += g;
+                uNumerator[i][level] += 1 - g;
+                mDenominator[i] += g;
+                uDenominator[i] += 1 - g;
+            }
+        }
+        // M-step: re-estimate λ and each level's m/u as (1-)g-weighted fractions, smoothed by EPSILON; an empty class keeps its old value.
+        const newLambda = responsibilitySum / patterns.length;
+        let maxDelta = Math.abs(newLambda - lambda);
+        lambda = newLambda;
+        for (let i = 0; i < comparisons.length; i++) {
+            const levels = levelCounts[i];
+            for (let l = 0; l < levels; l++) {
+                const newM = mDenominator[i] > 0 ? (mNumerator[i][l] + EPSILON) / (mDenominator[i] + EPSILON * levels) : m[i][l];
+                const newU = uDenominator[i] > 0 ? (uNumerator[i][l] + EPSILON) / (uDenominator[i] + EPSILON * levels) : u[i][l];
+                maxDelta = Math.max(maxDelta, Math.abs(newM - m[i][l]), Math.abs(newU - u[i][l]));
+                m[i][l] = newM;
+                u[i][l] = newU;
+            }
+        }
+        if (maxDelta < tolerance) {
+            converged = true;
+            iterations++; // count the pass that converged, since `break` skips the loop's own increment
+            break;
+        }
+    }
+    const fittedComparisons = comparisons.map((c, i) => ({
+        ...c,
+        levels: c.levels.map((level, j) => ({ ...level, m: m[i][j], u: u[i][j] })),
+    }));
+    return { model: { comparisons: fittedComparisons, lambda }, lambda, iterations, converged };
+}
+//# sourceMappingURL=em.js.map

package/out/em.js.map ADDED Viewed

@@ -0,0 +1 @@

+ {"version":3,"file":"em.js","sourceRoot":"","sources":["../em.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;GAsBG;AAIH;;;GAGG;AACH,MAAM,OAAO,GAAG,IAAI,CAAA;AAEpB,uGAAuG;AACvG,MAAM,UAAU,gBAAgB,CAAI,WAA4B,EAAE,CAAI,EAAE,CAAI;IAC3E,OAAO,WAAW,CAAC,GAAG,CAAC,CAAC,UAAU,EAAE,EAAE,CAAC,UAAU,CAAC,MAAM,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,CAAA;AAChE,CAAC;AAsBD;;;;GAIG;AACH,MAAM,UAAU,kBAAkB,CACjC,KAA4B,EAC5B,QAAoB,EACpB,OAAkB,EAAE;IAEpB,MAAM,aAAa,GAAG,IAAI,CAAC,aAAa,IAAI,GAAG,CAAA;IAC/C,MAAM,SAAS,GAAG,IAAI,CAAC,SAAS,IAAI,IAAI,CAAA;IACxC,MAAM,WAAW,GAAG,KAAK,CAAC,WAAW,CAAA;IACrC,MAAM,WAAW,GAAG,WAAW,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,MAAM,CAAC,MAAM,CAAC,CAAA;IAE3D,yEAAyE;IACzE,MAAM,CAAC,GAAG,WAAW,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,MAAM,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAA;IAC1D,MAAM,CAAC,GAAG,WAAW,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,MAAM,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAA;IAC1D,IAAI,MAAM,GAAG,IAAI,CAAC,aAAa,IAAI,KAAK,CAAC,MAAM,CAAA;IAE/C,IAAI,UAAU,GAAG,CAAC,CAAA;IAClB,IAAI,SAAS,GAAG,KAAK,CAAA;IAErB,IAAI,QAAQ,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;QAC3B,OAAO,EAAE,KAAK,EAAE,MAAM,EAAE,UAAU,EAAE,SAAS,EAAE,CAAA;IAChD,CAAC;IAED,OAAO,UAAU,GAAG,aAAa,EAAE,UAAU,EAAE,EAAE,CAAC;QACjD,MAAM,UAAU,GAAG,WAAW,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,IAAI,KAAK,CAAS,WAAW,CAAC,CAAC,CAAE,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,CAAA;QACxF,MAAM,UAAU,GAAG,WAAW,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,IAAI,KAAK,CAAS,WAAW,CAAC,CAAC,CAAE,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,CAAA;QACxF,MAAM,YAAY,GAAG,WAAW,CAAC,GAAG,CAAC,GAAG,EAAE,CAAC,CAAC,CAAC,CAAA;QAC7C,MAAM,YAAY,GAAG,WAAW,CAAC,GAAG,CAAC,GAAG,EAAE,CAAC,CAAC,CAAC,CAAA;QAC7C,IAAI,iBAAiB,GAAG,CAAC,CAAA;QAEzB,sDAAsD;QACtD,KAAK,MAAM,OAAO,IAAI,QAAQ,EAAE,CAAC;YAChC,IAAI,eAAe,GAAG,MAAM,CAAA;YAC5B,IAAI,kBAAkB,GAAG,CAAC,GAAG,MAAM,CAAA;YACnC,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,WAAW,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;gBAC7C,MAAM,KAAK,GAAG,OAAO,CAAC,
CAAC,CAAE,CAAA;gBACzB,IAAI,KAAK,GAAG,CAAC;oBAAE,SAAQ;gBACvB,eAAe,IAAI,CAAC,CAAC,CAAC,CAAE,CAAC,KAAK,CAAE,CAAA;gBAChC,kBAAkB,IAAI,CAAC,CAAC,CAAC,CAAE,CAAC,KAAK,CAAE,CAAA;YACpC,CAAC;YACD,MAAM,KAAK,GAAG,eAAe,GAAG,kBAAkB,CAAA;YAClD,MAAM,CAAC,GAAG,KAAK,GAAG,CAAC,CAAC,CAAC,CAAC,eAAe,GAAG,KAAK,CAAC,CAAC,CAAC,CAAC,CAAA;YACjD,iBAAiB,IAAI,CAAC,CAAA;YAEtB,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,WAAW,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;gBAC7C,MAAM,KAAK,GAAG,OAAO,CAAC,CAAC,CAAE,CAAA;gBACzB,IAAI,KAAK,GAAG,CAAC;oBAAE,SAAQ;gBACvB,UAAU,CAAC,CAAC,CAAE,CAAC,KAAK,CAAE,IAAI,CAAC,CAAA;gBAC3B,UAAU,CAAC,CAAC,CAAE,CAAC,KAAK,CAAE,IAAI,CAAC,GAAG,CAAC,CAAA;gBAC/B,YAAY,CAAC,CAAC,CAAE,IAAI,CAAC,CAAA;gBACrB,YAAY,CAAC,CAAC,CAAE,IAAI,CAAC,GAAG,CAAC,CAAA;YAC1B,CAAC;QACF,CAAC;QAED,0EAA0E;QAC1E,MAAM,SAAS,GAAG,iBAAiB,GAAG,QAAQ,CAAC,MAAM,CAAA;QACrD,IAAI,QAAQ,GAAG,IAAI,CAAC,GAAG,CAAC,SAAS,GAAG,MAAM,CAAC,CAAA;QAC3C,MAAM,GAAG,SAAS,CAAA;QAElB,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,WAAW,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;YAC7C,MAAM,MAAM,GAAG,WAAW,CAAC,CAAC,CAAE,CAAA;YAC9B,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;gBACjC,MAAM,IAAI,GACT,YAAY,CAAC,CAAC,CAAE,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC,UAAU,CAAC,CAAC,CAAE,CAAC,CAAC,CAAE,GAAG,OAAO,CAAC,GAAG,CAAC,YAAY,CAAC,CAAC,CAAE,GAAG,OAAO,GAAG,MAAM,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAE,CAAC,CAAC,CAAE,CAAA;gBAC1G,MAAM,IAAI,GACT,YAAY,CAAC,CAAC,CAAE,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC,UAAU,CAAC,CAAC,CAAE,CAAC,CAAC,CAAE,GAAG,OAAO,CAAC,GAAG,CAAC,YAAY,CAAC,CAAC,CAAE,GAAG,OAAO,GAAG,MAAM,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAE,CAAC,CAAC,CAAE,CAAA;gBAC1G,QAAQ,GAAG,IAAI,CAAC,GAAG,CAAC,QAAQ,EAAE,IAAI,CAAC,GAAG,CAAC,IAAI,GAAG,CAAC,CAAC,CAAC,CAAE,CAAC,CAAC,CAAE,CAAC,EAAE,IAAI,CAAC,GAAG,CAAC,IAAI,GAAG,CAAC,CAAC,CAAC,CAAE,CAAC,CAAC,CAAE,CAAC,CAAC,CAAA;gBACrF,CAAC,CAAC,CAAC,CAAE,CAAC,CAAC,CAAC,GAAG,IAAI,CAAA;gBACf,CAAC,CAAC,CAAC,CAAE,CAAC,CAAC,CAAC,GAAG,IAAI,CAAA;YAChB,CAAC;QACF,CAAC;QAED,IAAI,QAAQ,GAAG,SAAS,EAAE,CAAC;YAC1B,SAAS,GAAG,IAAI,CAAA;YAChB,UAAU,EAAE,CAAA;YA
CZ,MAAK;QACN,CAAC;IACF,CAAC;IAED,MAAM,iBAAiB,GAAG,WAAW,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC;QACpD,GAAG,CAAC;QACJ,MAAM,EAAE,CAAC,CAAC,MAAM,CAAC,GAAG,CAAC,CAAC,KAAK,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,EAAE,GAAG,KAAK,EAAE,CAAC,EAAE,CAAC,CAAC,CAAC,CAAE,CAAC,CAAC,CAAE,EAAE,CAAC,EAAE,CAAC,CAAC,CAAC,CAAE,CAAC,CAAC,CAAE,EAAE,CAAC,CAAC;KAC9E,CAAC,CAAC,CAAA;IAEH,OAAO,EAAE,KAAK,EAAE,EAAE,WAAW,EAAE,iBAAiB,EAAE,MAAM,EAAE,EAAE,MAAM,EAAE,UAAU,EAAE,SAAS,EAAE,CAAA;AAC5F,CAAC"}

package/out/fellegi-sunter.d.ts ADDED Viewed

@@ -0,0 +1,124 @@
+/**
+ * @copyright Sister Software
+ * @license AGPL-3.0
+ * @author Teffen Ellis, et al.
+ *
+ *   The Fellegi-Sunter scorer — the matcher's decision layer.
+ *
+ *   Each field comparison lands a record pair in a discrete _agreement level_ (exact / high / low /
+ *   different / missing). Each level carries two probabilities: `m` = P(this level | the pair
+ *   really matches) and `u` = P(this level | it doesn't). Their ratio is a Bayes factor, and its
+ *   log is the level's contribution to the total match weight in bits:
+ *
+ *   ```
+ *   M = log2(λ / (1 - λ))  +  Σ_fields  log2(m_level / u_level)
+ * ```
+ *
+ *   — a prior (how likely any two random records match) plus an additive, per-field-attributable
+ *   stack of evidence. Convert `M` to a probability and threshold it: above the upper bound is a
+ *   link, below the lower bound a non-link, and the band between is _clerical review_ — the
+ *   calibrated abstain zone the whole design leans on.
+ *
+ *   The `m`/`u` numbers here are NOT universal constants. They are estimated from the data — by EM,
+ *   unsupervised (the next increment) — and the term-frequency adjustment that makes a rare-name
+ *   agreement count more than a common one layers on top. This module is the deterministic core
+ *   those build on: given the levels, it produces the weights, the probability, and the decision.
+ */
+/** One agreement level of a comparison, with its match / non-match probabilities. */
+export interface ComparisonLevel {
+    /** Human-readable label for debugging (`exact`, `high`, `different`). */
+    label: string;
+    /** P(a pair lands in this level | it is a true match). A measure of data quality. */
+    m: number;
+    /** P(a pair lands in this level | it is NOT a match). A measure of coincidence / cardinality. */
+    u: number;
+    /** For similarity-driven comparisons: the minimum similarity (inclusive) to qualify. */
+    minSimilarity?: number;
+    /** For distance-driven comparisons: the maximum distance in km (inclusive) to qualify. */
+    maxKm?: number;
+}
+/** A per-field comparison: pull a value from each record and assign an agreement level. */
+export interface Comparison<R> {
+    /** Field name, for attribution. */
+    name: string;
+    /** Levels ordered highest agreement → lowest (`exact` first, `different` last). */
+    levels: ComparisonLevel[];
+    /** Index into {@link levels}, or `-1` when either value is missing (no evidence → weight 0). */
+    assess(a: R, b: R): number;
+    /**
+     * Optional term-frequency adjustment: on the levels it names, replace the level's average `u`
+     * with the agreeing value's actual frequency, so agreement on a rare value (`Vijayan`) outweighs
+     * agreement on a common one (`Smith`). See `withTermFrequency`.
+     */
+    termFrequency?: TermFrequencyAdjustment<R>;
+}
+/**
+ * Per-value term-frequency adjustment for a comparison (the Splink/Winkler mechanism). `m` is
+ * unchanged; on an agreement level the effective `u` becomes the value's own frequency, adding
+ * `log2(u_level / frequency)` to the weight — large and positive for rare values, negative for
+ * common ones. Floored at {@link TermFrequencyAdjustment.minimumFrequency} so an ultra-rare value
+ * can't produce an unbounded boost.
+ */
+export interface TermFrequencyAdjustment<R> {
+    /** Relative frequency of a value in the data, in (0, 1]. Typically computed on-the-fly. */
+    frequency(value: string): number;
+    /** The level indices the adjustment applies to (typically just the exact level). */
+    levels: ReadonlySet<number>;
+    /** The agreeing value to look up for a pair (a normalized field value), or null to skip. */
+    value(a: R, b: R): string | null | undefined;
+    /** Scale the adjustment in [0, 1]. Default 1. */
+    weight?: number;
+    /** Floor for the looked-up frequency, bounding the boost on ultra-rare values. Default 1e-4. */
+    minimumFrequency?: number;
+}
+/** A Fellegi-Sunter model: the field comparisons plus the prior match rate `λ`. */
+export interface FellegiSunterModel<R> {
+    comparisons: Comparison<R>[];
+    /** Prior probability that two records drawn at random are a match. */
+    lambda: number;
+}
+/** The scored outcome for one record pair. */
+export interface PairScore {
+    /** Total match weight in bits (`log2` odds). */
+    weight: number;
+    /** Match probability in [0, 1]. */
+    probability: number;
+    /** Per-field breakdown — what drove the score. */
+    contributions: Array<{
+        name: string;
+        level: string | null;
+        weight: number;
+    }>;
+}
+/** The terminal decision for a pair under upper / lower match-weight thresholds. */
+export type MatchDecision = "match" | "review" | "non-match";
+/** The Bayes-factor weight of a single level, in bits: `log2(m / u)`. */
+export declare function levelWeight(level: ComparisonLevel): number;
+/** The prior match weight in bits: `log2(λ / (1 - λ))`. */
+export declare function priorWeight(lambda: number): number;
+/** Convert a total match weight (bits) to a probability, numerically stable for extreme weights. */
+export declare function probabilityFromWeight(weight: number): number;
+/**
+ * A comparison driven by a similarity function and a tier of `minSimilarity` thresholds (the
+ * StatCan/Splink recipe). Levels must be ordered highest → lowest similarity, the last acting as
+ * the `different` catch-all (`minSimilarity` 0). A missing value on either side yields no
+ * evidence.
+ */
+export declare function similarityComparison<R>(config: {
+    name: string;
+    extract: (record: R) => string | null | undefined;
+    /** Defaults to {@link nameSimilarity}. */
+    similarity?: (a: string, b: string) => number;
+    levels: ComparisonLevel[];
+}): Comparison<R>;
+/** Score a record pair: total match weight, probability, and the per-field contributions. */
+export declare function scorePair<R>(model: FellegiSunterModel<R>, a: R, b: R): PairScore;
+/**
+ * Classify a score against upper / lower match-weight thresholds (in bits): at or above `upper` is
+ * a link, at or below `lower` a non-link, and the band between is clerical review (abstain).
+ */
+export declare function decide(score: PairScore, thresholds: {
+    upper: number;
+    lower: number;
+}): MatchDecision;
+//# sourceMappingURL=fellegi-sunter.d.ts.map

package/out/fellegi-sunter.d.ts.map ADDED Viewed

@@ -0,0 +1 @@

+ {"version":3,"file":"fellegi-sunter.d.ts","sourceRoot":"","sources":["../fellegi-sunter.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;;;;GAyBG;AAIH,qFAAqF;AACrF,MAAM,WAAW,eAAe;IAC/B,yEAAyE;IACzE,KAAK,EAAE,MAAM,CAAA;IACb,qFAAqF;IACrF,CAAC,EAAE,MAAM,CAAA;IACT,iGAAiG;IACjG,CAAC,EAAE,MAAM,CAAA;IACT,wFAAwF;IACxF,aAAa,CAAC,EAAE,MAAM,CAAA;IACtB,0FAA0F;IAC1F,KAAK,CAAC,EAAE,MAAM,CAAA;CACd;AAED,2FAA2F;AAC3F,MAAM,WAAW,UAAU,CAAC,CAAC;IAC5B,mCAAmC;IACnC,IAAI,EAAE,MAAM,CAAA;IACZ,mFAAmF;IACnF,MAAM,EAAE,eAAe,EAAE,CAAA;IACzB,gGAAgG;IAChG,MAAM,CAAC,CAAC,EAAE,CAAC,EAAE,CAAC,EAAE,CAAC,GAAG,MAAM,CAAA;IAC1B;;;;OAIG;IACH,aAAa,CAAC,EAAE,uBAAuB,CAAC,CAAC,CAAC,CAAA;CAC1C;AAED;;;;;;GAMG;AACH,MAAM,WAAW,uBAAuB,CAAC,CAAC;IACzC,2FAA2F;IAC3F,SAAS,CAAC,KAAK,EAAE,MAAM,GAAG,MAAM,CAAA;IAChC,oFAAoF;IACpF,MAAM,EAAE,WAAW,CAAC,MAAM,CAAC,CAAA;IAC3B,4FAA4F;IAC5F,KAAK,CAAC,CAAC,EAAE,CAAC,EAAE,CAAC,EAAE,CAAC,GAAG,MAAM,GAAG,IAAI,GAAG,SAAS,CAAA;IAC5C,iDAAiD;IACjD,MAAM,CAAC,EAAE,MAAM,CAAA;IACf,gGAAgG;IAChG,gBAAgB,CAAC,EAAE,MAAM,CAAA;CACzB;AAED,mFAAmF;AACnF,MAAM,WAAW,kBAAkB,CAAC,CAAC;IACpC,WAAW,EAAE,UAAU,CAAC,CAAC,CAAC,EAAE,CAAA;IAC5B,sEAAsE;IACtE,MAAM,EAAE,MAAM,CAAA;CACd;AAED,8CAA8C;AAC9C,MAAM,WAAW,SAAS;IACzB,gDAAgD;IAChD,MAAM,EAAE,MAAM,CAAA;IACd,mCAAmC;IACnC,WAAW,EAAE,MAAM,CAAA;IACnB,kDAAkD;IAClD,aAAa,EAAE,KAAK,CAAC;QAAE,IAAI,EAAE,MAAM,CAAC;QAAC,KAAK,EAAE,MAAM,GAAG,IAAI,CAAC;QAAC,MAAM,EAAE,MAAM,CAAA;KAAE,CAAC,CAAA;CAC5E;AAED,oFAAoF;AACpF,MAAM,MAAM,aAAa,GAAG,OAAO,GAAG,QAAQ,GAAG,WAAW,CAAA;AAE5D,yEAAyE;AACzE,wBAAgB,WAAW,CAAC,KAAK,EAAE,eAAe,GAAG,MAAM,CAG1D;AAED,2DAA2D;AAC3D,wBAAgB,WAAW,CAAC,MAAM,EAAE,MAAM,GAAG,MAAM,CAIlD;AAED,oGAAoG;AACpG,wBAAgB,qBAAqB,CAAC,MAAM,EAAE,MAAM,GAAG,MAAM,CAE5D;AAED;;;;;GAKG;AACH,wBAAgB,oBAAoB,CAAC,CAAC,EAAE,MAAM,EAAE;IAC/C,IAAI,EAAE,MAAM,CAAA;IACZ,OAAO,EAAE,CAAC,MAAM,EAAE,CAAC,KAAK,MAAM,GAAG,IAAI,GAAG,SAAS,CAAA;IACjD,0CAA0C;IAC1C,UAAU,CAAC,EAAE,CAAC,CAAC,EAAE,MAAM,EAAE,CAAC,EAAE,MAAM,KAAK,MAAM,CAAA;IAC7C,MAAM,EAAE,eAAe,EAAE,CAAA;CACzB,GAAG,UAAU,CAAC,CAAC,CAAC,CAkBhB;AAED,6FAA6F
;AAC7F,wBAAgB,SAAS,CAAC,CAAC,EAAE,KAAK,EAAE,kBAAkB,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,CAAC,EAAE,CAAC,EAAE,CAAC,GAAG,SAAS,CA4BhF;AAED;;;GAGG;AACH,wBAAgB,MAAM,CAAC,KAAK,EAAE,SAAS,EAAE,UAAU,EAAE;IAAE,KAAK,EAAE,MAAM,CAAC;IAAC,KAAK,EAAE,MAAM,CAAA;CAAE,GAAG,aAAa,CAIpG"}

package/out/fellegi-sunter.js ADDED Viewed

@@ -0,0 +1,109 @@
+/**
+ * @copyright Sister Software
+ * @license AGPL-3.0
+ * @author Teffen Ellis, et al.
+ *
+ *   The Fellegi-Sunter scorer — the matcher's decision layer.
+ *
+ *   Each field comparison lands a record pair in a discrete _agreement level_ (exact / high / low /
+ *   different / missing). Each level carries two probabilities: `m` = P(this level | the pair
+ *   really matches) and `u` = P(this level | it doesn't). Their ratio is a Bayes factor, and its
+ *   log is the level's contribution to the total match weight in bits:
+ *
+ *   ```
+ *   M = log2(λ / (1 - λ))  +  Σ_fields  log2(m_level / u_level)
+ * ```
+ *
+ *   — a prior (how likely any two random records match) plus an additive, per-field-attributable
+ *   stack of evidence. Convert `M` to a probability and threshold it: above the upper bound is a
+ *   link, below the lower bound a non-link, and the band between is _clerical review_ — the
+ *   calibrated abstain zone the whole design leans on.
+ *
+ *   The `m`/`u` numbers here are NOT universal constants. They are estimated from the data — by EM,
+ *   unsupervised (the next increment) — and the term-frequency adjustment that makes a rare-name
+ *   agreement count more than a common one layers on top. This module is the deterministic core
+ *   those build on: given the levels, it produces the weights, the probability, and the decision.
+ */
+import { nameSimilarity } from "./comparators.js";
+/**
+ * The Bayes-factor weight of a single agreement level, in bits: `log2(m / u)`.
+ *
+ * @param level - Object read for its numeric `m` and `u` properties.
+ * @returns `log2(level.m / level.u)` when `u` is positive. When `u <= 0` the ratio is never
+ *   formed: the result is `Infinity` if `m > 0`, otherwise `0`.
+ */
+export function levelWeight(level) {
+    if (level.u <= 0) // non-positive `u`: skip the division entirely
+        return level.m > 0 ? Infinity : 0;
+    return Math.log2(level.m / level.u);
+}
+/**
+ * The prior match weight in bits: `log2(λ / (1 - λ))`.
+ *
+ * @param lambda - The prior λ (per the file header: how likely any two random records match).
+ * @returns The base-2 log-odds of `lambda`. Values outside the open interval (0, 1) are clamped
+ *   rather than passed to the logarithm: `-Infinity` for `lambda <= 0`, `Infinity` for
+ *   `lambda >= 1`.
+ */
+export function priorWeight(lambda) {
+    if (lambda <= 0)
+        return -Infinity;
+    if (lambda >= 1)
+        return Infinity;
+    return Math.log2(lambda / (1 - lambda)); // only reached for 0 < lambda < 1 (or NaN)
+}
+/**
+ * Convert a total match weight (bits) to a probability, numerically stable for extreme weights.
+ *
+ * @param weight - Total match weight in bits, i.e. a base-2 log-odds.
+ * @returns `1 / (1 + 2^(-weight))`. A weight of `Infinity` gives 1 and `-Infinity` gives 0.
+ */
+export function probabilityFromWeight(weight) {
+    return 1 / (1 + 2 ** -weight);
+}
+/**
+ * A comparison driven by a similarity function and a tier of `minSimilarity` thresholds (the
+ * StatCan/Splink recipe). Levels must be ordered highest → lowest similarity, the last acting as
+ * the `different` catch-all (`minSimilarity` 0). A missing value on either side yields no
+ * evidence.
+ *
+ * @param config - Read for `name`, `levels`, `extract` and the optional `similarity`.
+ *   `config.extract(record)` supplies the string to compare for each side;
+ *   `config.similarity(a, b)` scores the two strings and defaults to `nameSimilarity` when it is
+ *   `null` or `undefined`.
+ * @returns An object `{ name, levels, assess }`. `name` and `levels` are passed through from
+ *   `config`. `assess(a, b)` returns `-1` when either extracted value is falsy or blank after
+ *   `trim()`, otherwise the index of the first level whose `minSimilarity` (treated as 0 when
+ *   absent) the similarity reaches, falling back to the last level's index.
+ */
+export function similarityComparison(config) {
+    const similarity = config.similarity ?? nameSimilarity;
+    return {
+        name: config.name,
+        levels: config.levels,
+        assess(a, b) {
+            const va = config.extract(a);
+            const vb = config.extract(b);
+            if (!va || !vb || !va.trim() || !vb.trim()) // null, undefined, empty or whitespace-only
+                return -1;
+            const sim = similarity(va, vb);
+            for (let i = 0; i < config.levels.length; i++) {
+                if (sim >= (config.levels[i].minSimilarity ?? 0)) // first threshold met wins
+                    return i;
+            }
+            return config.levels.length - 1; // no threshold met: use the final level
+        },
+    };
+}
+/**
+ * Score a record pair: total match weight, probability, and the per-field contributions.
+ *
+ * The running weight starts at `priorWeight(model.lambda)` and each comparison in
+ * `model.comparisons` adds the weight of the level it assessed. A comparison whose `assess`
+ * returns a negative index is recorded with `level: null` and weight 0 and adds nothing.
+ *
+ * When a comparison carries `termFrequency`, the adjustment is applied only if the assessed level
+ * index is in `termFrequency.levels`, the level's `u` is positive, and `termFrequency.value(a, b)`
+ * is truthy. It adds `log2(u / frequency) * (termFrequency.weight ?? 1)`, where `frequency` is
+ * the looked-up frequency floored at `termFrequency.minimumFrequency ?? 1e-4`.
+ *
+ * @param model - Read for `lambda` and the iterable `comparisons`.
+ * @param a - First record; handed unchanged to `assess` and to `termFrequency.value`.
+ * @param b - Second record; handed over the same way.
+ * @returns `{ weight, probability, contributions }`, with `probability` equal to
+ *   `probabilityFromWeight(weight)` and one `{ name, level, weight }` entry per comparison, in
+ *   iteration order.
+ */
+export function scorePair(model, a, b) {
+    let weight = priorWeight(model.lambda);
+    const contributions = [];
+    for (const comparison of model.comparisons) {
+        const index = comparison.assess(a, b);
+        if (index < 0) { // negative index: no evidence from this comparison
+            contributions.push({ name: comparison.name, level: null, weight: 0 });
+            continue;
+        }
+        const level = comparison.levels[index];
+        let w = levelWeight(level);
+        // Term-frequency adjustment: swap the level's average u for the agreeing value's own frequency.
+        const tf = comparison.termFrequency;
+        if (tf && tf.levels.has(index) && level.u > 0) {
+            const value = tf.value(a, b);
+            if (value) {
+                const frequency = Math.max(tf.frequency(value), tf.minimumFrequency ?? 1e-4); // floor the frequency
+                if (frequency > 0) // still zero only if both the lookup and the floor are non-positive
+                    w += Math.log2(level.u / frequency) * (tf.weight ?? 1);
+            }
+        }
+        weight += w;
+        contributions.push({ name: comparison.name, level: level.label, weight: w });
+    }
+    return { weight, probability: probabilityFromWeight(weight), contributions };
+}
+/**
+ * Classify a score against upper / lower match-weight thresholds (in bits): at or above `upper` is
+ * a link, at or below `lower` a non-link, and the band between is clerical review (abstain).
+ *
+ * @param score - Read for its `weight` property only.
+ * @param thresholds - Read for `upper` and `lower`.
+ * @returns `"match"`, `"non-match"` or `"review"`. The `upper` test runs first, so a weight that
+ *   satisfies both bounds is reported as `"match"`.
+ */
+export function decide(score, thresholds) {
+    if (score.weight >= thresholds.upper)
+        return "match";
+    if (score.weight <= thresholds.lower)
+        return "non-match";
+    return "review"; // strictly between the bounds, or a weight that fails both comparisons
+}
+//# sourceMappingURL=fellegi-sunter.js.map

package/out/fellegi-sunter.js.map ADDED Viewed

@@ -0,0 +1 @@

+ {"version":3,"file":"fellegi-sunter.js","sourceRoot":"","sources":["../fellegi-sunter.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;;;;GAyBG;AAEH,OAAO,EAAE,cAAc,EAAE,MAAM,kBAAkB,CAAA;AAwEjD,yEAAyE;AACzE,MAAM,UAAU,WAAW,CAAC,KAAsB;IACjD,IAAI,KAAK,CAAC,CAAC,IAAI,CAAC;QAAE,OAAO,KAAK,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,QAAQ,CAAC,CAAC,CAAC,CAAC,CAAA;IACnD,OAAO,IAAI,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC,GAAG,KAAK,CAAC,CAAC,CAAC,CAAA;AACpC,CAAC;AAED,2DAA2D;AAC3D,MAAM,UAAU,WAAW,CAAC,MAAc;IACzC,IAAI,MAAM,IAAI,CAAC;QAAE,OAAO,CAAC,QAAQ,CAAA;IACjC,IAAI,MAAM,IAAI,CAAC;QAAE,OAAO,QAAQ,CAAA;IAChC,OAAO,IAAI,CAAC,IAAI,CAAC,MAAM,GAAG,CAAC,CAAC,GAAG,MAAM,CAAC,CAAC,CAAA;AACxC,CAAC;AAED,oGAAoG;AACpG,MAAM,UAAU,qBAAqB,CAAC,MAAc;IACnD,OAAO,CAAC,GAAG,CAAC,CAAC,GAAG,CAAC,IAAI,CAAC,MAAM,CAAC,CAAA;AAC9B,CAAC;AAED;;;;;GAKG;AACH,MAAM,UAAU,oBAAoB,CAAI,MAMvC;IACA,MAAM,UAAU,GAAG,MAAM,CAAC,UAAU,IAAI,cAAc,CAAA;IAEtD,OAAO;QACN,IAAI,EAAE,MAAM,CAAC,IAAI;QACjB,MAAM,EAAE,MAAM,CAAC,MAAM;QACrB,MAAM,CAAC,CAAC,EAAE,CAAC;YACV,MAAM,EAAE,GAAG,MAAM,CAAC,OAAO,CAAC,CAAC,CAAC,CAAA;YAC5B,MAAM,EAAE,GAAG,MAAM,CAAC,OAAO,CAAC,CAAC,CAAC,CAAA;YAC5B,IAAI,CAAC,EAAE,IAAI,CAAC,EAAE,IAAI,CAAC,EAAE,CAAC,IAAI,EAAE,IAAI,CAAC,EAAE,CAAC,IAAI,EAAE;gBAAE,OAAO,CAAC,CAAC,CAAA;YAErD,MAAM,GAAG,GAAG,UAAU,CAAC,EAAE,EAAE,EAAE,CAAC,CAAA;YAC9B,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,MAAM,CAAC,MAAM,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;gBAC/C,IAAI,GAAG,IAAI,CAAC,MAAM,CAAC,MAAM,CAAC,CAAC,CAAE,CAAC,aAAa,IAAI,CAAC,CAAC;oBAAE,OAAO,CAAC,CAAA;YAC5D,CAAC;YACD,OAAO,MAAM,CAAC,MAAM,CAAC,MAAM,GAAG,CAAC,CAAA;QAChC,CAAC;KACD,CAAA;AACF,CAAC;AAED,6FAA6F;AAC7F,MAAM,UAAU,SAAS,CAAI,KAA4B,EAAE,CAAI,EAAE,CAAI;IACpE,IAAI,MAAM,GAAG,WAAW,CAAC,KAAK,CAAC,MAAM,CAAC,CAAA;IACtC,MAAM,aAAa,GAA+B,EAAE,CAAA;IAEpD,KAAK,MAAM,UAAU,IAAI,KAAK,CAAC,WAAW,EAAE,CAAC;QAC5C,MAAM,KAAK,GAAG,UAAU,CAAC,MAAM,CAAC,CAAC,EAAE,CAAC,CAAC,CAAA;QACrC,IAAI,KAAK,GAAG,CAAC,EAAE,CAAC;YACf,aAAa,CAAC,IAAI,CAAC,EAAE,IAAI,EAAE,UAAU,CAAC,IAAI,EAAE,KAAK,EAAE,IAAI,EAAE,MAAM,EAAE,CAAC,EAAE,CAAC,CAAA;YACrE,SAAQ;QAC
T,CAAC;QACD,MAAM,KAAK,GAAG,UAAU,CAAC,MAAM,CAAC,KAAK,CAAE,CAAA;QACvC,IAAI,CAAC,GAAG,WAAW,CAAC,KAAK,CAAC,CAAA;QAE1B,gGAAgG;QAChG,MAAM,EAAE,GAAG,UAAU,CAAC,aAAa,CAAA;QACnC,IAAI,EAAE,IAAI,EAAE,CAAC,MAAM,CAAC,GAAG,CAAC,KAAK,CAAC,IAAI,KAAK,CAAC,CAAC,GAAG,CAAC,EAAE,CAAC;YAC/C,MAAM,KAAK,GAAG,EAAE,CAAC,KAAK,CAAC,CAAC,EAAE,CAAC,CAAC,CAAA;YAC5B,IAAI,KAAK,EAAE,CAAC;gBACX,MAAM,SAAS,GAAG,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,SAAS,CAAC,KAAK,CAAC,EAAE,EAAE,CAAC,gBAAgB,IAAI,IAAI,CAAC,CAAA;gBAC5E,IAAI,SAAS,GAAG,CAAC;oBAAE,CAAC,IAAI,IAAI,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC,GAAG,SAAS,CAAC,GAAG,CAAC,EAAE,CAAC,MAAM,IAAI,CAAC,CAAC,CAAA;YAC1E,CAAC;QACF,CAAC;QAED,MAAM,IAAI,CAAC,CAAA;QACX,aAAa,CAAC,IAAI,CAAC,EAAE,IAAI,EAAE,UAAU,CAAC,IAAI,EAAE,KAAK,EAAE,KAAK,CAAC,KAAK,EAAE,MAAM,EAAE,CAAC,EAAE,CAAC,CAAA;IAC7E,CAAC;IAED,OAAO,EAAE,MAAM,EAAE,WAAW,EAAE,qBAAqB,CAAC,MAAM,CAAC,EAAE,aAAa,EAAE,CAAA;AAC7E,CAAC;AAED;;;GAGG;AACH,MAAM,UAAU,MAAM,CAAC,KAAgB,EAAE,UAA4C;IACpF,IAAI,KAAK,CAAC,MAAM,IAAI,UAAU,CAAC,KAAK;QAAE,OAAO,OAAO,CAAA;IACpD,IAAI,KAAK,CAAC,MAAM,IAAI,UAAU,CAAC,KAAK;QAAE,OAAO,WAAW,CAAA;IACxD,OAAO,QAAQ,CAAA;AAChB,CAAC"}

package/out/index.d.ts ADDED Viewed

@@ -0,0 +1,25 @@
+/**
+ * @copyright Sister Software
+ * @license AGPL-3.0
+ * @author Teffen Ellis, et al.
+ *
+ *   `@mailwoman/match` — the geocode-first record matcher: block → score → cluster.
+ *
+ *   The full three-stage pipeline:
+ *
+ *   1. {@link block Block} — geo-first candidate generation (a spatial-cell union of cheap, high-recall
+ *        keys), so two records at the same place meet regardless of address spelling.
+ *   2. **Score** — string {@link jaroWinkler comparators} → the {@link scorePair Fellegi-Sunter} weight
+ *        model (agreement levels → `log2(m/u)` weights → probability → link / review / non-link),
+ *        with `m`/`u` learned label-free by {@link estimateParameters EM} and rare-value agreement
+ *        up-weighted by {@link withTermFrequency term frequency}.
+ *   3. {@link cluster Cluster} — resolve the non-transitive pairwise link graph into canonical entities.
+ */
+export * from "./blocking.js";
+export * from "./clustering.js";
+export * from "./comparators.js";
+export * from "./distance.js";
+export * from "./em.js";
+export * from "./fellegi-sunter.js";
+export * from "./tf.js";
+//# sourceMappingURL=index.d.ts.map

package/out/index.d.ts.map ADDED Viewed

@@ -0,0 +1 @@
+ {"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../index.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;GAgBG;AAEH,cAAc,eAAe,CAAA;AAC7B,cAAc,iBAAiB,CAAA;AAC/B,cAAc,kBAAkB,CAAA;AAChC,cAAc,eAAe,CAAA;AAC7B,cAAc,SAAS,CAAA;AACvB,cAAc,qBAAqB,CAAA;AACnC,cAAc,SAAS,CAAA"}

package/out/index.js ADDED Viewed

@@ -0,0 +1,25 @@
+/**
+ * @copyright Sister Software
+ * @license AGPL-3.0
+ * @author Teffen Ellis, et al.
+ *
+ *   `@mailwoman/match` — the geocode-first record matcher: block → score → cluster.
+ *
+ *   The full three-stage pipeline:
+ *
+ *   1. {@link block Block} — geo-first candidate generation (a spatial-cell union of cheap, high-recall
+ *        keys), so two records at the same place meet regardless of address spelling.
+ *   2. **Score** — string {@link jaroWinkler comparators} → the {@link scorePair Fellegi-Sunter} weight
+ *        model (agreement levels → `log2(m/u)` weights → probability → link / review / non-link),
+ *        with `m`/`u` learned label-free by {@link estimateParameters EM} and rare-value agreement
+ *        up-weighted by {@link withTermFrequency term frequency}.
+ *   3. {@link cluster Cluster} — resolve the non-transitive pairwise link graph into canonical entities.
+ */
+export * from "./blocking.js";
+export * from "./clustering.js";
+export * from "./comparators.js";
+export * from "./distance.js";
+export * from "./em.js";
+export * from "./fellegi-sunter.js";
+export * from "./tf.js";
+//# sourceMappingURL=index.js.map

package/out/index.js.map ADDED Viewed

@@ -0,0 +1 @@
+ {"version":3,"file":"index.js","sourceRoot":"","sources":["../index.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;GAgBG;AAEH,cAAc,eAAe,CAAA;AAC7B,cAAc,iBAAiB,CAAA;AAC/B,cAAc,kBAAkB,CAAA;AAChC,cAAc,eAAe,CAAA;AAC7B,cAAc,SAAS,CAAA;AACvB,cAAc,qBAAqB,CAAA;AACnC,cAAc,SAAS,CAAA"}

package/out/tf.d.ts ADDED Viewed

@@ -0,0 +1,55 @@
+/**
+ * @copyright Sister Software
+ * @license AGPL-3.0
+ * @author Teffen Ellis, et al.
+ *
+ *   Term-frequency adjustment — making a rare-value agreement count more than a common one.
+ *
+ *   Two people both named "Vijayan" is far stronger evidence of a match than two both named "Smith",
+ *   because "Smith" agreements happen by chance all the time and "Vijayan" agreements don't. The
+ *   Fellegi-Sunter `m` (how often a true match agrees) is roughly the same either way; what differs
+ *   is `u` — the chance a _non_-match agrees — which for an exact agreement on value `v` is just
+ *   how common `v` is. So we leave `m`, and replace the level's average `u` with `frequency(v)`,
+ *   adding `log2(u_level / frequency(v))` to the weight: a big positive bump for rare values, a
+ *   penalty for common ones.
+ *
+ *   Crucially for a label-free matcher: the frequencies are computed ON-THE-FLY from the input column
+ *   (the Splink approach) — no external Census table required. Build a {@link TermFrequencyTable}
+ *   from the values you're matching, then attach it to a comparison with {@link withTermFrequency}.
+ */
+import type { Comparison } from "./fellegi-sunter.js";
+/**
+ * A lookup of how common each value is, built from a column of observed values. Observed values
+ * report a relative frequency in (0, 1]; a value that was never seen reports 0.
+ */
+export interface TermFrequencyTable {
+    /** Relative frequency of a value (its normalized form), or 0 if never seen. */
+    frequency(value: string): number;
+    /** Total observations the table was built from. */
+    readonly total: number;
+    /** Number of distinct normalized values. */
+    readonly distinct: number;
+}
+/**
+ * Build a {@link TermFrequencyTable} from an iterable of values (e.g. every `given` name in the
+ * dataset). Values are normalized (default: trim + lowercase + collapse whitespace) before
+ * counting, and `frequency()` normalizes its argument the same way, so callers pass raw field
+ * values.
+ *
+ * @param values - The observed column. `null` and `undefined` entries are accepted by the type;
+ *   NOTE(review): presumably they are skipped rather than counted — the implementation is not
+ *   visible from this declaration, confirm in `tf.js`.
+ * @param opts - Optional settings. `opts.normalize` replaces the default normalization.
+ * @returns The populated {@link TermFrequencyTable}.
+ */
+export declare function buildTermFrequencyTable(values: Iterable<string | null | undefined>, opts?: {
+    normalize?: (value: string) => string;
+}): TermFrequencyTable;
+/**
+ * Attach a term-frequency adjustment to a comparison. By default it applies to the exact level
+ * (index 0) and looks up the value via `value(a, b)` — usually the agreeing field extracted from
+ * one side. Returns a new comparison; the underlying `assess` and levels are untouched, so this
+ * composes with EM (which re-estimates the base `m`/`u` the adjustment sits on top of).
+ *
+ * @param comparison - The comparison to decorate.
+ * @param config - The frequency `table`, the `value` extractor, and the optional `levels`,
+ *   `weight` and `minimumFrequency` settings documented on each member below.
+ * @returns A new {@link Comparison} carrying the term-frequency adjustment.
+ */
+export declare function withTermFrequency<R>(comparison: Comparison<R>, config: {
+    table: TermFrequencyTable;
+    value: (a: R, b: R) => string | null | undefined;
+    /** Level indices to adjust. Default `[0]` (the exact level). */
+    levels?: Iterable<number>;
+    /** Scale in [0, 1]. Default 1. */
+    weight?: number;
+    /** Frequency floor bounding the boost on ultra-rare values. Default 1e-4. */
+    minimumFrequency?: number;
+}): Comparison<R>;
+//# sourceMappingURL=tf.d.ts.map

package/out/tf.d.ts.map ADDED Viewed

@@ -0,0 +1 @@

+ {"version":3,"file":"tf.d.ts","sourceRoot":"","sources":["../tf.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;GAkBG;AAEH,OAAO,KAAK,EAAE,UAAU,EAA2B,MAAM,qBAAqB,CAAA;AAE9E,+FAA+F;AAC/F,MAAM,WAAW,kBAAkB;IAClC,+EAA+E;IAC/E,SAAS,CAAC,KAAK,EAAE,MAAM,GAAG,MAAM,CAAA;IAChC,mDAAmD;IACnD,QAAQ,CAAC,KAAK,EAAE,MAAM,CAAA;IACtB,4CAA4C;IAC5C,QAAQ,CAAC,QAAQ,EAAE,MAAM,CAAA;CACzB;AAID;;;;;GAKG;AACH,wBAAgB,uBAAuB,CACtC,MAAM,EAAE,QAAQ,CAAC,MAAM,GAAG,IAAI,GAAG,SAAS,CAAC,EAC3C,IAAI,GAAE;IAAE,SAAS,CAAC,EAAE,CAAC,KAAK,EAAE,MAAM,KAAK,MAAM,CAAA;CAAO,GAClD,kBAAkB,CAqBpB;AAED;;;;;GAKG;AACH,wBAAgB,iBAAiB,CAAC,CAAC,EAClC,UAAU,EAAE,UAAU,CAAC,CAAC,CAAC,EACzB,MAAM,EAAE;IACP,KAAK,EAAE,kBAAkB,CAAA;IACzB,KAAK,EAAE,CAAC,CAAC,EAAE,CAAC,EAAE,CAAC,EAAE,CAAC,KAAK,MAAM,GAAG,IAAI,GAAG,SAAS,CAAA;IAChD,gEAAgE;IAChE,MAAM,CAAC,EAAE,QAAQ,CAAC,MAAM,CAAC,CAAA;IACzB,kCAAkC;IAClC,MAAM,CAAC,EAAE,MAAM,CAAA;IACf,6EAA6E;IAC7E,gBAAgB,CAAC,EAAE,MAAM,CAAA;CACzB,GACC,UAAU,CAAC,CAAC,CAAC,CAUf"}