@mailwoman/match 4.8.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,77 @@
1
+ /**
2
+ * @copyright Sister Software
3
+ * @license AGPL-3.0
4
+ * @author Teffen Ellis, et al.
5
+ *
6
+ * Blocking — candidate generation. Comparing every pair is O(n²) (a million records is a trillion
7
+ * comparisons), so we only score pairs that share a cheap key. This is where the geocode-first
8
+ * bet pays off: two records resolving to the same place land in the same spatial cell regardless
9
+ * of how their address strings are spelled, so geography is the primary block.
10
+ *
11
+ * A {@link BlockingKey} maps a record to zero or more string keys; records sharing any key become
12
+ * candidates. Keys compose as a _union_ (the standard multi-pass approach — high recall from
13
+ * cheap rules): block on the spatial cell OR the canonical key OR the postcode, and a pair that
14
+ * any rule catches is scored. {@link conjunction} builds the AND-style key Geo-ER uses (`name-cell
15
+ * AND geo-cell`) when a single rule is too loose.
16
+ *
17
+ * Recall is the priority — a pair the blocker never proposes can never match, the most dangerous
18
+ * silent failure in record linkage. So the spatial grid is generous and neighbour-expanded by
19
+ * default, and any block too large to scan is _reported_, never quietly dropped.
20
+ */
21
+ /** Maps a record to zero or more block keys. Two records sharing any key become a candidate pair. */
22
+ export type BlockingKey<R> = (record: R) => string[];
23
+ /** A geographic coordinate (WGS84 decimal degrees). */
24
+ export interface LatLon {
25
+ latitude: number;
26
+ longitude: number;
27
+ }
28
+ /**
29
+ * A spatial-cell block key: a configurable lat/lon grid. `precisionDegrees` sets the cell size
30
+ * (default 0.05° ≈ 5.5 km of latitude — deliberately generous, per the literature, so same-place
31
+ * records reliably co-block). With `neighbors` (default `true`) a record also keys its 8 adjacent
32
+ * cells, so a pair straddling a cell boundary still meets.
33
+ *
34
+ * Note: this is an equal-_degree_ grid (longitude cells shrink toward the poles), and neighbour expansion
35
+ * inflates block sizes ~9×; an equal-area H3/geohash index with a single-cell + neighbour-query is
36
+ * the refinement. Behaviour — proximity co-blocking — is the same.
37
+ */
38
+ export declare function geoCellKey<R>(extract: (record: R) => LatLon | null | undefined, opts?: {
39
+ precisionDegrees?: number;
40
+ neighbors?: boolean;
41
+ }): BlockingKey<R>;
42
+ /**
43
+ * An exact-value block key (the canonical address key, a postcode, an email domain…), normalized
44
+ * and optionally truncated to a leading `prefix` of characters (a cheaper, higher-recall rule). A
45
+ * missing or empty value produces no key.
46
+ */
47
+ export declare function exactKey<R>(extract: (record: R) => string | null | undefined, opts?: {
48
+ prefix?: number;
49
+ normalize?: (value: string) => string;
50
+ }): BlockingKey<R>;
51
+ /**
52
+ * A conjunctive block key — the cross-product of its sub-keys, joined (Geo-ER's "name AND
53
+ * distance"). A record is keyed by every combination of one sub-key from each input, so two records
54
+ * co-block only when they agree on _all_ inputs. Tighter blocks, lower recall — use when a single
55
+ * rule is too loose.
56
+ */
57
+ export declare function conjunction<R>(...keys: BlockingKey<R>[]): BlockingKey<R>;
58
+ /** The outcome of a blocking pass. */
59
+ export interface BlockResult<R> {
60
+ /** Deduplicated candidate pairs (no self-pairs; a pair caught by multiple keys appears once). */
61
+ pairs: Array<[R, R]>;
62
+ /** Blocks that exceeded `maxBlockSize` and were skipped — surfaced so coverage limits are visible. */
63
+ droppedBlocks: Array<{
64
+ key: string;
65
+ size: number;
66
+ }>;
67
+ }
68
+ /**
69
+ * Generate candidate pairs from `records` via one or more blocking keys (their union). Builds an
70
+ * inverted index (key → records) and emits the unique within-block pairs. A block larger than
71
+ * `maxBlockSize` is skipped and reported in `droppedBlocks` rather than blowing up into a quadratic
72
+ * scan — an explicit, visible coverage limit, not a silent drop.
73
+ */
74
+ export declare function block<R>(records: readonly R[], blockingKeys: BlockingKey<R> | BlockingKey<R>[], opts?: {
75
+ maxBlockSize?: number;
76
+ }): BlockResult<R>;
77
+ //# sourceMappingURL=blocking.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"blocking.d.ts","sourceRoot":"","sources":["../blocking.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;GAmBG;AAEH,qGAAqG;AACrG,MAAM,MAAM,WAAW,CAAC,CAAC,IAAI,CAAC,MAAM,EAAE,CAAC,KAAK,MAAM,EAAE,CAAA;AAEpD,uDAAuD;AACvD,MAAM,WAAW,MAAM;IACtB,QAAQ,EAAE,MAAM,CAAA;IAChB,SAAS,EAAE,MAAM,CAAA;CACjB;AAED;;;;;;;;;GASG;AACH,wBAAgB,UAAU,CAAC,CAAC,EAC3B,OAAO,EAAE,CAAC,MAAM,EAAE,CAAC,KAAK,MAAM,GAAG,IAAI,GAAG,SAAS,EACjD,IAAI,GAAE;IAAE,gBAAgB,CAAC,EAAE,MAAM,CAAC;IAAC,SAAS,CAAC,EAAE,OAAO,CAAA;CAAO,GAC3D,WAAW,CAAC,CAAC,CAAC,CAoBhB;AAED;;;;GAIG;AACH,wBAAgB,QAAQ,CAAC,CAAC,EACzB,OAAO,EAAE,CAAC,MAAM,EAAE,CAAC,KAAK,MAAM,GAAG,IAAI,GAAG,SAAS,EACjD,IAAI,GAAE;IAAE,MAAM,CAAC,EAAE,MAAM,CAAC;IAAC,SAAS,CAAC,EAAE,CAAC,KAAK,EAAE,MAAM,KAAK,MAAM,CAAA;CAAO,GACnE,WAAW,CAAC,CAAC,CAAC,CAUhB;AAED;;;;;GAKG;AACH,wBAAgB,WAAW,CAAC,CAAC,EAAE,GAAG,IAAI,EAAE,WAAW,CAAC,CAAC,CAAC,EAAE,GAAG,WAAW,CAAC,CAAC,CAAC,CAUxE;AAED,sCAAsC;AACtC,MAAM,WAAW,WAAW,CAAC,CAAC;IAC7B,iGAAiG;IACjG,KAAK,EAAE,KAAK,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,CAAA;IACpB,sGAAsG;IACtG,aAAa,EAAE,KAAK,CAAC;QAAE,GAAG,EAAE,MAAM,CAAC;QAAC,IAAI,EAAE,MAAM,CAAA;KAAE,CAAC,CAAA;CACnD;AAED;;;;;GAKG;AACH,wBAAgB,KAAK,CAAC,CAAC,EACtB,OAAO,EAAE,SAAS,CAAC,EAAE,EACrB,YAAY,EAAE,WAAW,CAAC,CAAC,CAAC,GAAG,WAAW,CAAC,CAAC,CAAC,EAAE,EAC/C,IAAI,GAAE;IAAE,YAAY,CAAC,EAAE,MAAM,CAAA;CAAO,GAClC,WAAW,CAAC,CAAC,CAAC,CA0ChB"}
@@ -0,0 +1,136 @@
1
+ /**
2
+ * @copyright Sister Software
3
+ * @license AGPL-3.0
4
+ * @author Teffen Ellis, et al.
5
+ *
6
+ * Blocking — candidate generation. Comparing every pair is O(n²) (a million records is a trillion
7
+ * comparisons), so we only score pairs that share a cheap key. This is where the geocode-first
8
+ * bet pays off: two records resolving to the same place land in the same spatial cell regardless
9
+ * of how their address strings are spelled, so geography is the primary block.
10
+ *
11
+ * A {@link BlockingKey} maps a record to zero or more string keys; records sharing any key become
12
+ * candidates. Keys compose as a _union_ (the standard multi-pass approach — high recall from
13
+ * cheap rules): block on the spatial cell OR the canonical key OR the postcode, and a pair that
14
+ * any rule catches is scored. {@link conjunction} builds the AND-style key Geo-ER uses (`name-cell
15
+ * AND geo-cell`) when a single rule is too loose.
16
+ *
17
+ * Recall is the priority — a pair the blocker never proposes can never match, the most dangerous
18
+ * silent failure in record linkage. So the spatial grid is generous and neighbour-expanded by
19
+ * default, and any block too large to scan is _reported_, never quietly dropped.
20
+ */
21
/**
 * A spatial-cell block key: a configurable lat/lon grid. `precisionDegrees` sets the cell size
 * (default 0.05° ≈ 5.5 km of latitude — deliberately generous, per the literature, so same-place
 * records reliably co-block). With `neighbors` (default `true`) a record also keys its 8 adjacent
 * cells, so a pair straddling a cell boundary still meets.
 *
 * Note: this is an equal-_degree_ grid (longitude cells shrink toward the poles), and neighbour
 * expansion inflates block sizes ~9×; an equal-area H3/geohash index with a single-cell +
 * neighbour-query is the refinement. Behaviour — proximity co-blocking — is the same.
 *
 * Limitation: longitude cells are not wrapped at ±180°, so two records on opposite sides of the
 * antimeridian never share a cell, even with `neighbors` enabled.
 *
 * @param extract Returns the record's coordinate, or `null` / `undefined` when it has none. A
 *   missing coordinate, or one with a non-finite latitude or longitude, produces no key.
 * @param opts `precisionDegrees` — cell size in degrees (default 0.05); `neighbors` — also emit
 *   the 8 surrounding cells (default `true`).
 * @returns A blocking key emitting `"latCell:lonCell"` strings: one key without neighbour
 *   expansion, nine with it (row by row, south-west to north-east for a positive cell size).
 * @throws {RangeError} When `precisionDegrees` is `0`, `NaN` or infinite. Such a grid maps every
 *   record onto degenerate cells (`Infinity:Infinity`, `NaN:NaN`, `0:0`), which would silently
 *   fuse the whole dataset into a handful of blocks.
 */
export function geoCellKey(extract, opts = {}) {
    const step = opts.precisionDegrees ?? 0.05;
    const expand = opts.neighbors ?? true;
    // Fail at construction time: a degenerate grid is a configuration error, not a per-record one.
    if (!Number.isFinite(step) || step === 0) {
        throw new RangeError(`geoCellKey: precisionDegrees must be a finite, non-zero number (got ${step})`);
    }
    return (record) => {
        const coordinate = extract(record);
        if (!coordinate || !Number.isFinite(coordinate.latitude) || !Number.isFinite(coordinate.longitude)) {
            return [];
        }
        // `Math.floor` (not truncation) keeps negative coordinates on the same regular grid.
        const latCell = Math.floor(coordinate.latitude / step);
        const lonCell = Math.floor(coordinate.longitude / step);
        if (!expand) {
            return [`${latCell}:${lonCell}`];
        }
        // The record's own cell plus its 8 neighbours, so boundary-straddling pairs still meet.
        const keys = [];
        for (let dLat = -1; dLat <= 1; dLat++) {
            for (let dLon = -1; dLon <= 1; dLon++) {
                keys.push(`${latCell + dLat}:${lonCell + dLon}`);
            }
        }
        return keys;
    };
}
51
/**
 * Block on an exact value — a canonical address key, a postcode, an email domain, and so on.
 *
 * The extracted value is normalized (by default: trimmed, lower-cased, runs of whitespace
 * collapsed to a single space) and, when `prefix` is a non-zero number, cut down to its leading
 * `prefix` characters — a cheaper, higher-recall rule. `prefix` is read from `opts` each time
 * the key is evaluated.
 *
 * @param extract Returns the record's raw value, or `null` / `undefined` / `""` when absent.
 * @param opts `prefix` — number of leading characters to keep; `normalize` — replacement for the
 *   default normalizer.
 * @returns A blocking key emitting a single string, or nothing when the value is missing or
 *   normalizes to an empty result.
 */
export function exactKey(extract, opts = {}) {
    const collapse = (text) => text.trim().toLowerCase().replace(/\s+/g, " ");
    const normalize = opts.normalize ?? collapse;
    return (record) => {
        const raw = extract(record);
        // A missing value short-circuits, so the normalizer never sees null / undefined / "".
        const cleaned = raw ? normalize(raw) : "";
        if (!cleaned) {
            return [];
        }
        const key = opts.prefix ? cleaned.slice(0, opts.prefix) : cleaned;
        return [key];
    };
}
68
/**
 * A conjunctive block key — the cross-product of its sub-keys, joined with `&` (Geo-ER's "name
 * AND distance"). A record is keyed by every combination of one sub-key from each input, so two
 * records co-block only when they agree on _all_ inputs. Tighter blocks, lower recall — use when
 * a single rule is too loose.
 *
 * Empty sub-keys are discarded before combining. If any input yields no usable sub-key, or no
 * inputs are given at all, the record gets no key, so the result is never an empty string.
 *
 * Sub-keys are joined verbatim: a sub-key that itself contains `&` is not escaped, so distinct
 * combinations can produce the same joined key.
 *
 * @param keys The blocking keys to combine; combinations are emitted in input order.
 * @returns A blocking key emitting one `a&b&…` string per combination.
 */
export function conjunction(...keys) {
    return (record) => {
        // No inputs means nothing to agree on: emit no key instead of the empty string.
        if (keys.length === 0) {
            return [];
        }
        // `null` marks "no input consumed yet"; the first input seeds the combinations directly.
        let combos = null;
        for (const key of keys) {
            const parts = key(record).filter((part) => Boolean(part));
            if (parts.length === 0) {
                return [];
            }
            combos =
                combos === null
                    ? parts
                    : combos.flatMap((prefix) => parts.map((part) => `${prefix}&${part}`));
        }
        return combos;
    };
}
86
/**
 * Generate candidate pairs from `records` via one or more blocking keys (their union). Builds an
 * inverted index (key → record positions) and emits the unique within-block pairs. A block
 * larger than `maxBlockSize` is skipped and reported in `droppedBlocks` rather than blowing up
 * into a quadratic scan — an explicit, visible coverage limit, not a silent drop.
 *
 * Pairs are emitted in index order (blocks in first-seen key order, then by record position),
 * each as `[earlier, later]` by position in `records`. Records are identified by position, so
 * two equal-looking records at different positions are distinct.
 *
 * @param records The records to block.
 * @param blockingKeys A single blocking key or an array of them; empty keys are ignored.
 * @param opts `maxBlockSize` — largest block that is still scanned (default: unlimited).
 * @returns `pairs` — the deduplicated candidate pairs; `droppedBlocks` — the `key` and `size` of
 *   every block that exceeded `maxBlockSize`.
 */
export function block(records, blockingKeys, opts = {}) {
    const keyFns = Array.isArray(blockingKeys) ? blockingKeys : [blockingKeys];
    const maxBlockSize = opts.maxBlockSize ?? Infinity;
    // Inverted index: block key → positions of the records carrying it, in ascending order.
    const index = new Map();
    records.forEach((record, i) => {
        // A record may produce the same key through several rules; index it once per key.
        const seen = new Set();
        for (const keyFn of keyFns) {
            for (const key of keyFn(record)) {
                if (!key || seen.has(key)) {
                    continue;
                }
                seen.add(key);
                const bucket = index.get(key);
                if (bucket) {
                    bucket.push(i);
                }
                else {
                    index.set(key, [i]);
                }
            }
        }
    });
    // Dedup state: for each lower position, the higher positions already paired with it. Keeping
    // one small set per record avoids encoding a pair as `lo * n + hi`, which stops being an exact
    // integer once n² passes 2^53, and avoids holding every pair in one collection.
    // NOTE(review): engines are understood to cap the entries of a single Set — confirm for the
    // target runtime.
    const partnersByLow = [];
    const pairs = [];
    const droppedBlocks = [];
    for (const [key, bucket] of index) {
        if (bucket.length < 2) {
            continue;
        }
        if (bucket.length > maxBlockSize) {
            droppedBlocks.push({ key, size: bucket.length });
            continue;
        }
        for (let a = 0; a < bucket.length; a++) {
            // Buckets are filled in ascending position order, so bucket[a] < bucket[b] for a < b.
            const lo = bucket[a];
            let partners = partnersByLow[lo];
            for (let b = a + 1; b < bucket.length; b++) {
                const hi = bucket[b];
                if (!partners) {
                    partners = new Set();
                    partnersByLow[lo] = partners;
                }
                if (partners.has(hi)) {
                    continue;
                }
                partners.add(hi);
                pairs.push([records[lo], records[hi]]);
            }
        }
    }
    return { pairs, droppedBlocks };
}
136
+ //# sourceMappingURL=blocking.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"blocking.js","sourceRoot":"","sources":["../blocking.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;GAmBG;AAWH;;;;;;;;;GASG;AACH,MAAM,UAAU,UAAU,CACzB,OAAiD,EACjD,OAA2D,EAAE;IAE7D,MAAM,IAAI,GAAG,IAAI,CAAC,gBAAgB,IAAI,IAAI,CAAA;IAC1C,MAAM,MAAM,GAAG,IAAI,CAAC,SAAS,IAAI,IAAI,CAAA;IAErC,OAAO,CAAC,MAAM,EAAE,EAAE;QACjB,MAAM,UAAU,GAAG,OAAO,CAAC,MAAM,CAAC,CAAA;QAClC,IAAI,CAAC,UAAU,IAAI,CAAC,MAAM,CAAC,QAAQ,CAAC,UAAU,CAAC,QAAQ,CAAC,IAAI,CAAC,MAAM,CAAC,QAAQ,CAAC,UAAU,CAAC,SAAS,CAAC;YAAE,OAAO,EAAE,CAAA;QAE7G,MAAM,OAAO,GAAG,IAAI,CAAC,KAAK,CAAC,UAAU,CAAC,QAAQ,GAAG,IAAI,CAAC,CAAA;QACtD,MAAM,OAAO,GAAG,IAAI,CAAC,KAAK,CAAC,UAAU,CAAC,SAAS,GAAG,IAAI,CAAC,CAAA;QACvD,IAAI,CAAC,MAAM;YAAE,OAAO,CAAC,GAAG,OAAO,IAAI,OAAO,EAAE,CAAC,CAAA;QAE7C,MAAM,IAAI,GAAa,EAAE,CAAA;QACzB,KAAK,IAAI,IAAI,GAAG,CAAC,CAAC,EAAE,IAAI,IAAI,CAAC,EAAE,IAAI,EAAE,EAAE,CAAC;YACvC,KAAK,IAAI,IAAI,GAAG,CAAC,CAAC,EAAE,IAAI,IAAI,CAAC,EAAE,IAAI,EAAE,EAAE,CAAC;gBACvC,IAAI,CAAC,IAAI,CAAC,GAAG,OAAO,GAAG,IAAI,IAAI,OAAO,GAAG,IAAI,EAAE,CAAC,CAAA;YACjD,CAAC;QACF,CAAC;QACD,OAAO,IAAI,CAAA;IACZ,CAAC,CAAA;AACF,CAAC;AAED;;;;GAIG;AACH,MAAM,UAAU,QAAQ,CACvB,OAAiD,EACjD,OAAmE,EAAE;IAErE,MAAM,SAAS,GAAG,IAAI,CAAC,SAAS,IAAI,CAAC,CAAC,CAAS,EAAE,EAAE,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC,WAAW,EAAE,CAAC,OAAO,CAAC,MAAM,EAAE,GAAG,CAAC,CAAC,CAAA;IAEhG,OAAO,CAAC,MAAM,EAAE,EAAE;QACjB,MAAM,KAAK,GAAG,OAAO,CAAC,MAAM,CAAC,CAAA;QAC7B,IAAI,CAAC,KAAK;YAAE,OAAO,EAAE,CAAA;QACrB,MAAM,UAAU,GAAG,SAAS,CAAC,KAAK,CAAC,CAAA;QACnC,IAAI,CAAC,UAAU;YAAE,OAAO,EAAE,CAAA;QAC1B,OAAO,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC,CAAC,UAAU,CAAC,KAAK,CAAC,CAAC,EAAE,IAAI,CAAC,MAAM,CAAC,CAAC,CAAC,CAAC,UAAU,CAAC,CAAA;IACrE,CAAC,CAAA;AACF,CAAC;AAED;;;;;GAKG;AACH,MAAM,UAAU,WAAW,CAAI,GAAG,IAAsB;IACvD,OAAO,CAAC,MAAM,EAAE,EAAE;QACjB,IAAI,MAAM,GAAG,CAAC,EAAE,CAAC,CAAA;QACjB,KAAK,MAAM,GAAG,IAAI,IAAI,EAAE,CAAC;YACxB,MAAM,KAAK,GAAG,GAAG,CAAC,MAAM,CAAC,CAAA;YACzB,IAAI,KAAK,CAAC,MAAM,KAAK,CAAC;gBAAE,OAAO,EAAE,CAAA;YACjC,MAAM,GAAG,MAAM,CAAC,OAAO,CAAC,CAAC,MAAM,EAAE,EAAE,CAAC,KAAK,CAAC,GAAG,CA
AC,CAAC,IAAI,EAAE,EAAE,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,GAAG,MAAM,IAAI,IAAI,EAAE,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAA;QAChG,CAAC;QACD,OAAO,MAAM,CAAA;IACd,CAAC,CAAA;AACF,CAAC;AAUD;;;;;GAKG;AACH,MAAM,UAAU,KAAK,CACpB,OAAqB,EACrB,YAA+C,EAC/C,OAAkC,EAAE;IAEpC,MAAM,IAAI,GAAG,KAAK,CAAC,OAAO,CAAC,YAAY,CAAC,CAAC,CAAC,CAAC,YAAY,CAAC,CAAC,CAAC,CAAC,YAAY,CAAC,CAAA;IACxE,MAAM,YAAY,GAAG,IAAI,CAAC,YAAY,IAAI,QAAQ,CAAA;IAClD,MAAM,KAAK,GAAG,IAAI,GAAG,EAAoB,CAAA;IAEzC,OAAO,CAAC,OAAO,CAAC,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE;QAC7B,MAAM,IAAI,GAAG,IAAI,GAAG,EAAU,CAAA;QAC9B,KAAK,MAAM,KAAK,IAAI,IAAI,EAAE,CAAC;YAC1B,KAAK,MAAM,GAAG,IAAI,KAAK,CAAC,MAAM,CAAC,EAAE,CAAC;gBACjC,IAAI,CAAC,GAAG,IAAI,IAAI,CAAC,GAAG,CAAC,GAAG,CAAC;oBAAE,SAAQ;gBACnC,IAAI,CAAC,GAAG,CAAC,GAAG,CAAC,CAAA;gBACb,MAAM,MAAM,GAAG,KAAK,CAAC,GAAG,CAAC,GAAG,CAAC,CAAA;gBAC7B,IAAI,MAAM;oBAAE,MAAM,CAAC,IAAI,CAAC,CAAC,CAAC,CAAA;;oBACrB,KAAK,CAAC,GAAG,CAAC,GAAG,EAAE,CAAC,CAAC,CAAC,CAAC,CAAA;YACzB,CAAC;QACF,CAAC;IACF,CAAC,CAAC,CAAA;IAEF,MAAM,CAAC,GAAG,OAAO,CAAC,MAAM,CAAA;IACxB,MAAM,OAAO,GAAG,IAAI,GAAG,EAAU,CAAA;IACjC,MAAM,KAAK,GAAkB,EAAE,CAAA;IAC/B,MAAM,aAAa,GAAoC,EAAE,CAAA;IAEzD,KAAK,MAAM,CAAC,GAAG,EAAE,MAAM,CAAC,IAAI,KAAK,EAAE,CAAC;QACnC,IAAI,MAAM,CAAC,MAAM,GAAG,CAAC;YAAE,SAAQ;QAC/B,IAAI,MAAM,CAAC,MAAM,GAAG,YAAY,EAAE,CAAC;YAClC,aAAa,CAAC,IAAI,CAAC,EAAE,GAAG,EAAE,IAAI,EAAE,MAAM,CAAC,MAAM,EAAE,CAAC,CAAA;YAChD,SAAQ;QACT,CAAC;QACD,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,MAAM,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;YACxC,KAAK,IAAI,CAAC,GAAG,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,MAAM,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;gBAC5C,MAAM,EAAE,GAAG,IAAI,CAAC,GAAG,CAAC,MAAM,CAAC,CAAC,CAAE,EAAE,MAAM,CAAC,CAAC,CAAE,CAAC,CAAA;gBAC3C,MAAM,EAAE,GAAG,IAAI,CAAC,GAAG,CAAC,MAAM,CAAC,CAAC,CAAE,EAAE,MAAM,CAAC,CAAC,CAAE,CAAC,CAAA;gBAC3C,MAAM,EAAE,GAAG,EAAE,GAAG,CAAC,GAAG,EAAE,CAAA;gBACtB,IAAI,OAAO,CAAC,GAAG,CAAC,EAAE,CAAC;oBAAE,SAAQ;gBAC7B,OAAO,CAAC,GAAG,CAAC,EAAE,CAAC,CAAA;gBACf,KAAK,CAAC,IAAI,CAAC,CAAC,OAAO,CAAC,EAAE,CAAE,EAAE,OAAO,CAAC,EAAE,CAAE,CAAC,CAAC,CAAA;YACzC,CAAC;QACF,C
AAC;IACF,CAAC;IAED,OAAO,EAAE,KAAK,EAAE,aAAa,EAAE,CAAA;AAChC,CAAC"}
@@ -0,0 +1,50 @@
1
+ /**
2
+ * @copyright Sister Software
3
+ * @license AGPL-3.0
4
+ * @author Teffen Ellis, et al.
5
+ *
6
+ * Clustering — the third and final matcher stage: resolve scored pairs into canonical entities.
7
+ *
8
+ * The pairwise scorer treats each pair independently, and its scores are NOT transitive: A~B at a
9
+ * high weight and B~C at a high weight does not guarantee A~C is a match. So a distinct stage is
10
+ * required to turn the graph of above-threshold links into coherent groups — skip it and your
11
+ * "entities" quietly fracture or fuse.
12
+ *
13
+ * This ships the standard baseline: connected components of the link graph (union-find), with the
14
+ * link threshold as the precision/recall knob — raise it for tighter, purer clusters, lower it
15
+ * for more recall. Its known weakness is over-merging via transitive chains (a string of weak
16
+ * links can pull unrelated records into one component); the principled fix is
17
+ * centroid-/average-linkage hierarchical clustering (Dedupe), which uses the full within-cluster
18
+ * score matrix — a documented refinement, not this first cut. For a geocode-first matcher the
19
+ * over-merge risk is already damped: blocking keeps candidate sets local, so chains can't run
20
+ * across the whole dataset.
21
+ */
22
+ /** A scored candidate pair: two records and the match weight (bits) the scorer assigned them. */
23
+ export interface ScoredLink<R> {
24
+ a: R;
25
+ b: R;
26
+ weight: number;
27
+ }
28
+ /** Options for {@link cluster}. */
29
+ export interface ClusterOptions {
30
+ /**
31
+ * Link two records only when their match weight is at or above this (bits) — the precision/recall
32
+ * knob.
33
+ */
34
+ threshold: number;
35
+ }
36
+ /**
37
+ * Cluster records into canonical entities by connected components of the above-threshold link
38
+ * graph. Every input record lands in exactly one cluster — a record with no qualifying link is a
39
+ * singleton. Links referencing a record not in `records` are ignored. Reference identity is used,
40
+ * so pass the same record objects to both arguments.
41
+ */
42
+ export declare function cluster<R>(records: readonly R[], links: Iterable<ScoredLink<R>>, opts: ClusterOptions): R[][];
43
+ /**
44
+ * Pick a cluster's most complete record as its canonical representative — the one with the most
45
+ * non-empty fields (anything other than `null` / `undefined` / `""`). Ties keep the earliest. A basic, generic
46
+ * canonicalizer; field-level merging across the cluster is the application's job (it knows which
47
+ * source to trust).
48
+ */
49
+ export declare function representative<R extends object>(group: readonly R[]): R | undefined;
50
+ //# sourceMappingURL=clustering.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"clustering.d.ts","sourceRoot":"","sources":["../clustering.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;GAoBG;AAEH,iGAAiG;AACjG,MAAM,WAAW,UAAU,CAAC,CAAC;IAC5B,CAAC,EAAE,CAAC,CAAA;IACJ,CAAC,EAAE,CAAC,CAAA;IACJ,MAAM,EAAE,MAAM,CAAA;CACd;AAED,mCAAmC;AACnC,MAAM,WAAW,cAAc;IAC9B;;;OAGG;IACH,SAAS,EAAE,MAAM,CAAA;CACjB;AAED;;;;;GAKG;AACH,wBAAgB,OAAO,CAAC,CAAC,EAAE,OAAO,EAAE,SAAS,CAAC,EAAE,EAAE,KAAK,EAAE,QAAQ,CAAC,UAAU,CAAC,CAAC,CAAC,CAAC,EAAE,IAAI,EAAE,cAAc,GAAG,CAAC,EAAE,EAAE,CAgD7G;AAED;;;;;GAKG;AACH,wBAAgB,cAAc,CAAC,CAAC,SAAS,MAAM,EAAE,KAAK,EAAE,SAAS,CAAC,EAAE,GAAG,CAAC,GAAG,SAAS,CAgBnF"}
@@ -0,0 +1,101 @@
1
+ /**
2
+ * @copyright Sister Software
3
+ * @license AGPL-3.0
4
+ * @author Teffen Ellis, et al.
5
+ *
6
+ * Clustering — the third and final matcher stage: resolve scored pairs into canonical entities.
7
+ *
8
+ * The pairwise scorer treats each pair independently, and its scores are NOT transitive: A~B at a
9
+ * high weight and B~C at a high weight does not guarantee A~C is a match. So a distinct stage is
10
+ * required to turn the graph of above-threshold links into coherent groups — skip it and your
11
+ * "entities" quietly fracture or fuse.
12
+ *
13
+ * This ships the standard baseline: connected components of the link graph (union-find), with the
14
+ * link threshold as the precision/recall knob — raise it for tighter, purer clusters, lower it
15
+ * for more recall. Its known weakness is over-merging via transitive chains (a string of weak
16
+ * links can pull unrelated records into one component); the principled fix is
17
+ * centroid-/average-linkage hierarchical clustering (Dedupe), which uses the full within-cluster
18
+ * score matrix — a documented refinement, not this first cut. For a geocode-first matcher the
19
+ * over-merge risk is already damped: blocking keeps candidate sets local, so chains can't run
20
+ * across the whole dataset.
21
+ */
22
/**
 * Cluster records into canonical entities by connected components of the qualifying link graph
 * (union-find with union by rank and path compression). Every position in `records` lands in
 * exactly one cluster — a record with no qualifying link is a singleton. Links referencing a
 * record not in `records` are ignored. Reference identity is used, so pass the same record
 * objects to both arguments; if one reference occurs several times in `records`, links attach
 * to its last occurrence.
 *
 * A link qualifies only when `weight >= opts.threshold`. A `NaN` weight (or a `NaN` threshold)
 * never qualifies, so a scorer that produced `NaN` cannot merge records.
 *
 * @param records The records to cluster.
 * @param links Scored candidate pairs (`a`, `b`, `weight`).
 * @param opts `threshold` — minimum weight for a link to join two records.
 * @returns The clusters, ordered by the position of each cluster's first record; records keep
 *   their input order within a cluster.
 */
export function cluster(records, links, opts) {
    // Record reference → position in `records`.
    const index = new Map();
    records.forEach((record, i) => index.set(record, i));
    // Union-find forest over positions: every position starts as its own root.
    const parent = records.map((_, i) => i);
    const rank = new Array(records.length).fill(0);
    const find = (x) => {
        let root = x;
        while (parent[root] !== root) {
            root = parent[root];
        }
        // Path compression: point every node on the walked path straight at the root.
        while (parent[x] !== root) {
            const next = parent[x];
            parent[x] = root;
            x = next;
        }
        return root;
    };
    const union = (x, y) => {
        const rx = find(x);
        const ry = find(y);
        if (rx === ry) {
            return;
        }
        // Union by rank: hang the shallower tree under the deeper one.
        if (rank[rx] < rank[ry]) {
            parent[rx] = ry;
        }
        else if (rank[rx] > rank[ry]) {
            parent[ry] = rx;
        }
        else {
            parent[ry] = rx;
            rank[rx]++;
        }
    };
    for (const link of links) {
        // Written as a positive test so NaN fails it; `weight < threshold` lets NaN through.
        if (!(link.weight >= opts.threshold)) {
            continue;
        }
        const ia = index.get(link.a);
        const ib = index.get(link.b);
        if (ia === undefined || ib === undefined) {
            continue;
        }
        union(ia, ib);
    }
    // Gather positions by root; Map insertion order yields clusters in first-record order.
    const groups = new Map();
    records.forEach((record, i) => {
        const root = find(i);
        const group = groups.get(root);
        if (group) {
            group.push(record);
        }
        else {
            groups.set(root, [record]);
        }
    });
    return [...groups.values()];
}
79
/**
 * Choose a cluster's canonical representative: the record with the most non-empty fields, where
 * a field is empty when its value is `null`, `undefined` or `""`. On a tie the earliest record
 * wins. This is a basic, generic canonicalizer; field-level merging across the cluster is left
 * to the application, which knows which source to trust.
 *
 * @param group The records of one cluster.
 * @returns The chosen record, or `undefined` for an empty group.
 */
export function representative(group) {
    const isFilled = (value) => !(value === null || value === undefined || value === "");
    let winner;
    // Starts below zero so that a record with no filled fields can still be chosen.
    let winnerScore = -1;
    for (const candidate of group) {
        const score = Object.values(candidate).filter(isFilled).length;
        // Strictly greater: a later record with an equal score never displaces an earlier one.
        if (score > winnerScore) {
            winner = candidate;
            winnerScore = score;
        }
    }
    return winner;
}
101
+ //# sourceMappingURL=clustering.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"clustering.js","sourceRoot":"","sources":["../clustering.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;GAoBG;AAkBH;;;;;GAKG;AACH,MAAM,UAAU,OAAO,CAAI,OAAqB,EAAE,KAA8B,EAAE,IAAoB;IACrG,MAAM,KAAK,GAAG,IAAI,GAAG,EAAa,CAAA;IAClC,OAAO,CAAC,OAAO,CAAC,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC,KAAK,CAAC,GAAG,CAAC,MAAM,EAAE,CAAC,CAAC,CAAC,CAAA;IAEpD,MAAM,MAAM,GAAG,OAAO,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,CAAA;IACvC,MAAM,IAAI,GAAG,IAAI,KAAK,CAAS,OAAO,CAAC,MAAM,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAA;IAEtD,MAAM,IAAI,GAAG,CAAC,CAAS,EAAU,EAAE;QAClC,IAAI,IAAI,GAAG,CAAC,CAAA;QACZ,OAAO,MAAM,CAAC,IAAI,CAAC,KAAK,IAAI;YAAE,IAAI,GAAG,MAAM,CAAC,IAAI,CAAE,CAAA;QAClD,oBAAoB;QACpB,OAAO,MAAM,CAAC,CAAC,CAAC,KAAK,IAAI,EAAE,CAAC;YAC3B,MAAM,IAAI,GAAG,MAAM,CAAC,CAAC,CAAE,CAAA;YACvB,MAAM,CAAC,CAAC,CAAC,GAAG,IAAI,CAAA;YAChB,CAAC,GAAG,IAAI,CAAA;QACT,CAAC;QACD,OAAO,IAAI,CAAA;IACZ,CAAC,CAAA;IAED,MAAM,KAAK,GAAG,CAAC,CAAS,EAAE,CAAS,EAAQ,EAAE;QAC5C,MAAM,EAAE,GAAG,IAAI,CAAC,CAAC,CAAC,CAAA;QAClB,MAAM,EAAE,GAAG,IAAI,CAAC,CAAC,CAAC,CAAA;QAClB,IAAI,EAAE,KAAK,EAAE;YAAE,OAAM;QACrB,IAAI,IAAI,CAAC,EAAE,CAAE,GAAG,IAAI,CAAC,EAAE,CAAE;YAAE,MAAM,CAAC,EAAE,CAAC,GAAG,EAAE,CAAA;aACrC,IAAI,IAAI,CAAC,EAAE,CAAE,GAAG,IAAI,CAAC,EAAE,CAAE;YAAE,MAAM,CAAC,EAAE,CAAC,GAAG,EAAE,CAAA;aAC1C,CAAC;YACL,MAAM,CAAC,EAAE,CAAC,GAAG,EAAE,CAAA;YACf,IAAI,CAAC,EAAE,CAAE,EAAE,CAAA;QACZ,CAAC;IACF,CAAC,CAAA;IAED,KAAK,MAAM,IAAI,IAAI,KAAK,EAAE,CAAC;QAC1B,IAAI,IAAI,CAAC,MAAM,GAAG,IAAI,CAAC,SAAS;YAAE,SAAQ;QAC1C,MAAM,EAAE,GAAG,KAAK,CAAC,GAAG,CAAC,IAAI,CAAC,CAAC,CAAC,CAAA;QAC5B,MAAM,EAAE,GAAG,KAAK,CAAC,GAAG,CAAC,IAAI,CAAC,CAAC,CAAC,CAAA;QAC5B,IAAI,EAAE,KAAK,SAAS,IAAI,EAAE,KAAK,SAAS;YAAE,SAAQ;QAClD,KAAK,CAAC,EAAE,EAAE,EAAE,CAAC,CAAA;IACd,CAAC;IAED,MAAM,MAAM,GAAG,IAAI,GAAG,EAAe,CAAA;IACrC,OAAO,CAAC,OAAO,CAAC,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE;QAC7B,MAAM,IAAI,GAAG,IAAI,CAAC,CAAC,CAAC,CAAA;QACpB,MAAM,KAAK,GAAG,MAAM,CAAC,GAAG,CAAC,IAAI,CAAC,CAAA;QAC9B,IAAI,KAAK;YAAE,KAAK,CAAC,IAAI,CAAC,MAAM,CAAC,CAAA;;YACxB,MAAM,CAAC,GAAG
,CAAC,IAAI,EAAE,CAAC,MAAM,CAAC,CAAC,CAAA;IAChC,CAAC,CAAC,CAAA;IAEF,OAAO,CAAC,GAAG,MAAM,CAAC,MAAM,EAAE,CAAC,CAAA;AAC5B,CAAC;AAED;;;;;GAKG;AACH,MAAM,UAAU,cAAc,CAAmB,KAAmB;IACnE,IAAI,IAAmB,CAAA;IACvB,IAAI,UAAU,GAAG,CAAC,CAAC,CAAA;IAEnB,KAAK,MAAM,MAAM,IAAI,KAAK,EAAE,CAAC;QAC5B,IAAI,MAAM,GAAG,CAAC,CAAA;QACd,KAAK,MAAM,KAAK,IAAI,MAAM,CAAC,MAAM,CAAC,MAAM,CAAC,EAAE,CAAC;YAC3C,IAAI,KAAK,KAAK,IAAI,IAAI,KAAK,KAAK,SAAS,IAAI,KAAK,KAAK,EAAE;gBAAE,MAAM,EAAE,CAAA;QACpE,CAAC;QACD,IAAI,MAAM,GAAG,UAAU,EAAE,CAAC;YACzB,UAAU,GAAG,MAAM,CAAA;YACnB,IAAI,GAAG,MAAM,CAAA;QACd,CAAC;IACF,CAAC;IAED,OAAO,IAAI,CAAA;AACZ,CAAC"}
@@ -0,0 +1,49 @@
1
+ /**
2
+ * @copyright Sister Software
3
+ * @license AGPL-3.0
4
+ * @author Teffen Ellis, et al.
5
+ *
6
+ * String comparators for the matcher's scoring stage.
7
+ *
8
+ * The record-linkage literature (Winkler/Census; Belin 1993) settles on the prefix-weighted Jaro
9
+ * comparator (Jaro-Winkler) as the default for names: it tolerates the typographical error real
10
+ * data is full of better than raw character-edit distance. But J-W has a documented blind spot on
11
+ * compound / double surnames (e.g. Hispanic `Garcia Lopez`): the second half of the compound
12
+ * falls outside J-W's match window, so `Lopez` vs `Garcia Lopez` scores ~0. The fix the
13
+ * literature prescribes is an edit-distance / token fallback for single-vs-compound pairs —
14
+ * implemented in {@link nameSimilarity}.
15
+ *
16
+ * These are pure similarity primitives in [0, 1]. The mapping of a similarity onto discrete
17
+ * Fellegi-Sunter agreement levels (and the m/u weights) is the scorer's job, not theirs.
18
+ */
19
+ /**
20
+ * Jaro similarity in [0, 1]. Two empty strings are identical (1); one empty is 0. Counts matching
21
+ * characters within a sliding window of `floor(max(len)/2) - 1`, discounting half-transpositions.
22
+ */
23
+ export declare function jaro(a: string, b: string): number;
24
+ /**
25
+ * Jaro-Winkler similarity in [0, 1]: Jaro with a bonus for a shared prefix — `jw = jaro + prefix *
26
+ * weight * (1 - jaro)`, prefix capped at `maxPrefix` (Winkler's standard 4), `weight` the scaling
27
+ * factor (standard 0.1). Only boosts when `jaro` already clears `boostThreshold` (0.7), per
28
+ * Winkler.
29
+ */
30
+ export declare function jaroWinkler(a: string, b: string, opts?: {
31
+ weight?: number;
32
+ maxPrefix?: number;
33
+ boostThreshold?: number;
34
+ }): number;
35
+ /** Normalized Levenshtein similarity in [0, 1]: `1 - editDistance / max(len)`. */
36
+ export declare function levenshteinSimilarity(a: string, b: string): number;
37
+ /**
38
+ * Name-aware similarity in [0, 1]. Jaro-Winkler by default, with the compound-surname fallback the
39
+ * literature prescribes:
40
+ *
41
+ * - If one name's tokens are a strict subset of the other's (`Lopez` ⊂ `Garcia Lopez`), that is
42
+ * strong partial agreement J-W misses — floor the score at 0.9.
43
+ * - Otherwise return the better of Jaro-Winkler and normalized edit similarity, so a single token
44
+ * that is a substring of a longer compound (`Garcia` vs `Garcialopez`) still scores sensibly.
45
+ *
46
+ * Case- and whitespace-insensitive. Empty input scores 0.
47
+ */
48
+ export declare function nameSimilarity(a: string, b: string): number;
49
+ //# sourceMappingURL=comparators.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"comparators.d.ts","sourceRoot":"","sources":["../comparators.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;GAiBG;AAIH;;;GAGG;AACH,wBAAgB,IAAI,CAAC,CAAC,EAAE,MAAM,EAAE,CAAC,EAAE,MAAM,GAAG,MAAM,CAqCjD;AAED;;;;;GAKG;AACH,wBAAgB,WAAW,CAC1B,CAAC,EAAE,MAAM,EACT,CAAC,EAAE,MAAM,EACT,IAAI,GAAE;IAAE,MAAM,CAAC,EAAE,MAAM,CAAC;IAAC,SAAS,CAAC,EAAE,MAAM,CAAC;IAAC,cAAc,CAAC,EAAE,MAAM,CAAA;CAAO,GACzE,MAAM,CAaR;AAED,kFAAkF;AAClF,wBAAgB,qBAAqB,CAAC,CAAC,EAAE,MAAM,EAAE,CAAC,EAAE,MAAM,GAAG,MAAM,CAKlE;AAED;;;;;;;;;;GAUG;AACH,wBAAgB,cAAc,CAAC,CAAC,EAAE,MAAM,EAAE,CAAC,EAAE,MAAM,GAAG,MAAM,CAe3D"}
@@ -0,0 +1,119 @@
1
+ /**
2
+ * @copyright Sister Software
3
+ * @license AGPL-3.0
4
+ * @author Teffen Ellis, et al.
5
+ *
6
+ * String comparators for the matcher's scoring stage.
7
+ *
8
+ * The record-linkage literature (Winkler/Census; Belin 1993) settles on the prefix-weighted Jaro
9
+ * comparator (Jaro-Winkler) as the default for names: it tolerates the typographical error real
10
+ * data is full of better than raw character-edit distance. But J-W has a documented blind spot on
11
+ * compound / double surnames (e.g. Hispanic `Garcia Lopez`): the second half of the compound
12
+ * falls outside J-W's match window, so `Lopez` vs `Garcia Lopez` scores ~0. The fix the
13
+ * literature prescribes is an edit-distance / token fallback for single-vs-compound pairs —
14
+ * implemented in {@link nameSimilarity}.
15
+ *
16
+ * These are pure similarity primitives in [0, 1]. The mapping of a similarity onto discrete
17
+ * Fellegi-Sunter agreement levels (and the m/u weights) is the scorer's job, not theirs.
18
+ */
19
+ import { distance as levenshteinDistance } from "fastest-levenshtein";
20
/**
 * Jaro similarity in [0, 1]. Identical strings score 1 (including two empty strings); if
 * exactly one string is empty the score is 0. Characters match when they are equal and lie
 * within `floor(max(len) / 2) - 1` positions of each other; each character of `b` can be matched
 * once. Matched characters that appear in a different order count as half-transpositions.
 *
 * @param a First string.
 * @param b Second string.
 * @returns `(m / |a| + m / |b| + (m - t) / m) / 3` for `m` matches and `t` transpositions, or 0
 *   when nothing matches.
 */
export function jaro(a, b) {
    if (a === b) {
        return 1;
    }
    const lenA = a.length;
    const lenB = b.length;
    if (lenA === 0 || lenB === 0) {
        return 0;
    }
    const reach = Math.max(0, Math.floor(Math.max(lenA, lenB) / 2) - 1);
    // 1 marks a position that already takes part in a match.
    const usedA = new Uint8Array(lenA);
    const usedB = new Uint8Array(lenB);
    let common = 0;
    for (let i = 0; i < lenA; i++) {
        const stop = Math.min(i + reach + 1, lenB);
        // Advance to the first free, equal character of `b` inside the window.
        let j = Math.max(0, i - reach);
        while (j < stop && (usedB[j] === 1 || a[i] !== b[j])) {
            j++;
        }
        if (j < stop) {
            usedA[i] = 1;
            usedB[j] = 1;
            common++;
        }
    }
    if (common === 0) {
        return 0;
    }
    // Line up the matched characters of each string in their own order and count disagreements.
    const matchedA = [];
    for (let i = 0; i < lenA; i++) {
        if (usedA[i] === 1) {
            matchedA.push(a[i]);
        }
    }
    const matchedB = [];
    for (let j = 0; j < lenB; j++) {
        if (usedB[j] === 1) {
            matchedB.push(b[j]);
        }
    }
    let outOfOrder = 0;
    for (let m = 0; m < matchedA.length; m++) {
        if (matchedA[m] !== matchedB[m]) {
            outOfOrder++;
        }
    }
    const transpositions = outOfOrder / 2;
    return (common / lenA + common / lenB + (common - transpositions) / common) / 3;
}
64
/**
 * Jaro-Winkler similarity in [0, 1]: Jaro with a bonus for a shared prefix — `jw = jaro + prefix
 * * weight * (1 - jaro)`, prefix capped at `maxPrefix` (Winkler's standard 4), `weight` the
 * scaling factor (standard 0.1). Only boosts when `jaro` already clears `boostThreshold` (0.7),
 * per Winkler.
 *
 * The boosted score is clamped to [0, 1]. With the defaults the clamp never triggers; it only
 * matters for custom options where `maxPrefix * weight` exceeds 1 or `weight` is negative,
 * which would otherwise push the result outside the documented range.
 *
 * @param a First string.
 * @param b Second string.
 * @param opts `weight` — prefix scaling factor; `maxPrefix` — longest prefix rewarded;
 *   `boostThreshold` — minimum Jaro score for the prefix bonus to apply.
 * @returns The Jaro score when it is below `boostThreshold`, otherwise the boosted, clamped score.
 */
export function jaroWinkler(a, b, opts = {}) {
    const weight = opts.weight ?? 0.1;
    const maxPrefix = opts.maxPrefix ?? 4;
    const boostThreshold = opts.boostThreshold ?? 0.7;
    const base = jaro(a, b);
    if (base < boostThreshold) {
        return base;
    }
    // Length of the common prefix, capped by `maxPrefix` and by the shorter string.
    const limit = Math.min(maxPrefix, a.length, b.length);
    let prefix = 0;
    while (prefix < limit && a[prefix] === b[prefix]) {
        prefix++;
    }
    const boosted = base + prefix * weight * (1 - base);
    return Math.min(1, Math.max(0, boosted));
}
83
/**
 * Normalized Levenshtein similarity in [0, 1]: `1 - editDistance / max(len)`. Identical inputs
 * score 1 without computing a distance.
 *
 * @param a First string.
 * @param b Second string.
 * @returns 1 for identical strings, otherwise one minus the edit distance divided by the
 *   longer string's length.
 */
export function levenshteinSimilarity(a, b) {
    if (a === b) {
        return 1;
    }
    const span = Math.max(a.length, b.length);
    // Guard the division; two zero-length inputs are treated as identical.
    return span === 0 ? 1 : 1 - levenshteinDistance(a, b) / span;
}
92
/**
 * Name-aware similarity in [0, 1]. Both names are trimmed, lower-cased and have whitespace runs
 * collapsed before comparison; an empty name scores 0 and identical names score 1. Otherwise:
 *
 * - When the distinct tokens of one name are a strict subset of the other's (`Lopez` ⊂ `Garcia
 *   Lopez`), the score is the Jaro-Winkler score floored at 0.9 — strong partial agreement that
 *   Jaro-Winkler alone misses.
 * - In every other case the score is the better of Jaro-Winkler and normalized edit similarity,
 *   so a single token embedded in a longer compound (`Garcia` vs `Garcialopez`) still scores
 *   sensibly.
 *
 * @param a First name.
 * @param b Second name.
 * @returns The similarity score.
 */
export function nameSimilarity(a, b) {
    const canon = (name) => name.trim().toLowerCase().replace(/\s+/g, " ");
    const left = canon(a);
    const right = canon(b);
    if (left === "" || right === "") {
        return 0;
    }
    if (left === right) {
        return 1;
    }
    const jw = jaroWinkler(left, right);
    const leftTokens = new Set(left.split(" "));
    const rightTokens = new Set(right.split(" "));
    // On equal sizes the left name counts as the smaller one; the strict-size test below then
    // rules the pair out as a subset match.
    const leftIsSmaller = leftTokens.size <= rightTokens.size;
    const fewer = leftIsSmaller ? leftTokens : rightTokens;
    const more = leftIsSmaller ? rightTokens : leftTokens;
    let contained = fewer.size < more.size;
    if (contained) {
        for (const token of fewer) {
            if (!more.has(token)) {
                contained = false;
                break;
            }
        }
    }
    // The subset floor replaces the edit-distance fallback; it does not add to it.
    const fallback = contained ? 0.9 : levenshteinSimilarity(left, right);
    return Math.max(jw, fallback);
}
119
+ //# sourceMappingURL=comparators.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"comparators.js","sourceRoot":"","sources":["../comparators.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;GAiBG;AAEH,OAAO,EAAE,QAAQ,IAAI,mBAAmB,EAAE,MAAM,qBAAqB,CAAA;AAErE;;;GAGG;AACH,MAAM,UAAU,IAAI,CAAC,CAAS,EAAE,CAAS;IACxC,IAAI,CAAC,KAAK,CAAC;QAAE,OAAO,CAAC,CAAA;IACrB,MAAM,EAAE,GAAG,CAAC,CAAC,MAAM,CAAA;IACnB,MAAM,EAAE,GAAG,CAAC,CAAC,MAAM,CAAA;IACnB,IAAI,EAAE,KAAK,CAAC,IAAI,EAAE,KAAK,CAAC;QAAE,OAAO,CAAC,CAAA;IAElC,MAAM,MAAM,GAAG,IAAI,CAAC,GAAG,CAAC,CAAC,EAAE,IAAI,CAAC,KAAK,CAAC,IAAI,CAAC,GAAG,CAAC,EAAE,EAAE,EAAE,CAAC,GAAG,CAAC,CAAC,GAAG,CAAC,CAAC,CAAA;IAChE,MAAM,QAAQ,GAAG,IAAI,KAAK,CAAU,EAAE,CAAC,CAAC,IAAI,CAAC,KAAK,CAAC,CAAA;IACnD,MAAM,QAAQ,GAAG,IAAI,KAAK,CAAU,EAAE,CAAC,CAAC,IAAI,CAAC,KAAK,CAAC,CAAA;IAEnD,IAAI,OAAO,GAAG,CAAC,CAAA;IACf,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,EAAE,EAAE,CAAC,EAAE,EAAE,CAAC;QAC7B,MAAM,KAAK,GAAG,IAAI,CAAC,GAAG,CAAC,CAAC,EAAE,CAAC,GAAG,MAAM,CAAC,CAAA;QACrC,MAAM,GAAG,GAAG,IAAI,CAAC,GAAG,CAAC,CAAC,GAAG,MAAM,GAAG,CAAC,EAAE,EAAE,CAAC,CAAA;QACxC,KAAK,IAAI,CAAC,GAAG,KAAK,EAAE,CAAC,GAAG,GAAG,EAAE,CAAC,EAAE,EAAE,CAAC;YAClC,IAAI,QAAQ,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC;gBAAE,SAAQ;YAC1C,QAAQ,CAAC,CAAC,CAAC,GAAG,IAAI,CAAA;YAClB,QAAQ,CAAC,CAAC,CAAC,GAAG,IAAI,CAAA;YAClB,OAAO,EAAE,CAAA;YACT,MAAK;QACN,CAAC;IACF,CAAC;IAED,IAAI,OAAO,KAAK,CAAC;QAAE,OAAO,CAAC,CAAA;IAE3B,wFAAwF;IACxF,IAAI,cAAc,GAAG,CAAC,CAAA;IACtB,IAAI,CAAC,GAAG,CAAC,CAAA;IACT,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,EAAE,EAAE,CAAC,EAAE,EAAE,CAAC;QAC7B,IAAI,CAAC,QAAQ,CAAC,CAAC,CAAC;YAAE,SAAQ;QAC1B,OAAO,CAAC,QAAQ,CAAC,CAAC,CAAC;YAAE,CAAC,EAAE,CAAA;QACxB,IAAI,CAAC,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC;YAAE,cAAc,EAAE,CAAA;QACnC,CAAC,EAAE,CAAA;IACJ,CAAC;IACD,cAAc,IAAI,CAAC,CAAA;IAEnB,OAAO,CAAC,OAAO,GAAG,EAAE,GAAG,OAAO,GAAG,EAAE,GAAG,CAAC,OAAO,GAAG,cAAc,CAAC,GAAG,OAAO,CAAC,GAAG,CAAC,CAAA;AAChF,CAAC;AAED;;;;;GAKG;AACH,MAAM,UAAU,WAAW,CAC1B,CAAS,EACT,CAAS,EACT,OAAyE,EAAE;IAE3E,MAAM,MAAM,GAAG,IAAI,CAAC,MAAM,IAAI,GAAG,CAAA;IACjC,MAAM,SAAS,GAAG,I
AAI,CAAC,SAAS,IAAI,CAAC,CAAA;IACrC,MAAM,cAAc,GAAG,IAAI,CAAC,cAAc,IAAI,GAAG,CAAA;IAEjD,MAAM,IAAI,GAAG,IAAI,CAAC,CAAC,EAAE,CAAC,CAAC,CAAA;IACvB,IAAI,IAAI,GAAG,cAAc;QAAE,OAAO,IAAI,CAAA;IAEtC,IAAI,MAAM,GAAG,CAAC,CAAA;IACd,MAAM,KAAK,GAAG,IAAI,CAAC,GAAG,CAAC,SAAS,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,CAAC,MAAM,CAAC,CAAA;IACrD,OAAO,MAAM,GAAG,KAAK,IAAI,CAAC,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC,MAAM,CAAC;QAAE,MAAM,EAAE,CAAA;IAE1D,OAAO,IAAI,GAAG,MAAM,GAAG,MAAM,GAAG,CAAC,CAAC,GAAG,IAAI,CAAC,CAAA;AAC3C,CAAC;AAED,kFAAkF;AAClF,MAAM,UAAU,qBAAqB,CAAC,CAAS,EAAE,CAAS;IACzD,IAAI,CAAC,KAAK,CAAC;QAAE,OAAO,CAAC,CAAA;IACrB,MAAM,OAAO,GAAG,IAAI,CAAC,GAAG,CAAC,CAAC,CAAC,MAAM,EAAE,CAAC,CAAC,MAAM,CAAC,CAAA;IAC5C,IAAI,OAAO,KAAK,CAAC;QAAE,OAAO,CAAC,CAAA;IAC3B,OAAO,CAAC,GAAG,mBAAmB,CAAC,CAAC,EAAE,CAAC,CAAC,GAAG,OAAO,CAAA;AAC/C,CAAC;AAED;;;;;;;;;;GAUG;AACH,MAAM,UAAU,cAAc,CAAC,CAAS,EAAE,CAAS;IAClD,MAAM,CAAC,GAAG,CAAC,CAAC,IAAI,EAAE,CAAC,WAAW,EAAE,CAAC,OAAO,CAAC,MAAM,EAAE,GAAG,CAAC,CAAA;IACrD,MAAM,CAAC,GAAG,CAAC,CAAC,IAAI,EAAE,CAAC,WAAW,EAAE,CAAC,OAAO,CAAC,MAAM,EAAE,GAAG,CAAC,CAAA;IACrD,IAAI,CAAC,CAAC,IAAI,CAAC,CAAC;QAAE,OAAO,CAAC,CAAA;IACtB,IAAI,CAAC,KAAK,CAAC;QAAE,OAAO,CAAC,CAAA;IAErB,MAAM,EAAE,GAAG,WAAW,CAAC,CAAC,EAAE,CAAC,CAAC,CAAA;IAE5B,MAAM,OAAO,GAAG,IAAI,GAAG,CAAC,CAAC,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC,CAAA;IACrC,MAAM,OAAO,GAAG,IAAI,GAAG,CAAC,CAAC,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC,CAAA;IACrC,MAAM,CAAC,KAAK,EAAE,GAAG,CAAC,GAAG,OAAO,CAAC,IAAI,IAAI,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,OAAO,EAAE,OAAO,CAAC,CAAC,CAAC,CAAC,CAAC,OAAO,EAAE,OAAO,CAAC,CAAA;IAC3F,MAAM,MAAM,GAAG,KAAK,CAAC,IAAI,GAAG,GAAG,CAAC,IAAI,IAAI,CAAC,GAAG,KAAK,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,GAAG,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,CAAA;IAC3E,IAAI,MAAM;QAAE,OAAO,IAAI,CAAC,GAAG,CAAC,EAAE,EAAE,GAAG,CAAC,CAAA;IAEpC,OAAO,IAAI,CAAC,GAAG,CAAC,EAAE,EAAE,qBAAqB,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,CAAA;AACjD,CAAC"}
@@ -0,0 +1,41 @@
1
+ /**
2
+ * @copyright Sister Software
3
+ * @license AGPL-3.0
4
+ * @author Teffen Ellis, et al.
5
+ *
6
+ * Geographic distance as a scoring feature — the other half of geocode-first matching.
7
+ *
8
+ * Blocking uses geography to _propose_ candidates; this scores them on it. The research is explicit
9
+ * that an address must be matched as a SPATIAL attribute, not by string similarity (a
10
+ * one-character edit can be 650 m apart), and that distance measurably helps as a comparison
11
+ * feature. So we bucket the great-circle distance between two records' coordinates into ordered
12
+ * Fellegi-Sunter agreement levels (Splink's `DistanceInKMAtThresholds`): "same building" / "same
13
+ * block" / "same area" / far, each with its own m/u and weight.
14
+ *
15
+ * Calibrate the bucket boundaries to the geocoder's OWN error, which is heavy-tailed and density-
16
+ * dependent (≈38 m urban, ≈200 m rural). A weakening of this evidence by geocode quality (a
17
+ * shared interpolated centroid is softer than a shared rooftop point) is the documented
18
+ * refinement.
19
+ */
20
+ import type { LatLon } from "./blocking.js";
21
+ import type { Comparison, ComparisonLevel } from "./fellegi-sunter.js";
22
+ /** Great-circle (haversine) distance in kilometres between two coordinates (`LatLon`: WGS84 decimal degrees). */
23
+ export declare function haversineKm(a: LatLon, b: LatLon): number;
24
+ /**
25
+ * A geo-distance comparison: bucket the great-circle distance between two records' coordinates into
26
+ * ordered agreement levels. Levels must be ordered NEAREST first by `maxKm`, the last acting as the
27
+ * `far` catch-all (`maxKm` omitted → unbounded). A missing/invalid coordinate on either side yields
28
+ * no evidence. Returns a `Comparison` over records of type `R`.
29
+ */
30
+ export declare function distanceComparison<R>(config: {
31
+ name: string; // NOTE(review): presumably the label this comparison is identified by — confirm against fellegi-sunter
32
+ extract: (record: R) => LatLon | null | undefined; // pulls the coordinate out of a record; null/undefined when the record has none
33
+ levels: ComparisonLevel[]; // agreement levels, nearest first; the last one is the catch-all
34
+ }): Comparison<R>;
35
+ /**
35
+ * Default distance levels, nearest → far, with boundaries at rooftop / block / locality scale. The
36
+ * m/u are illustrative seeds (EM re-estimates them); the boundaries reflect typical geocoder
37
+ * error. Typed as `ComparisonLevel[]`, the same type as the `levels` option of {@link distanceComparison}.
38
+ */
39
+ export declare const DEFAULT_DISTANCE_LEVELS: ComparisonLevel[];
41
+ //# sourceMappingURL=distance.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"distance.d.ts","sourceRoot":"","sources":["../distance.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;GAkBG;AAEH,OAAO,KAAK,EAAE,MAAM,EAAE,MAAM,eAAe,CAAA;AAC3C,OAAO,KAAK,EAAE,UAAU,EAAE,eAAe,EAAE,MAAM,qBAAqB,CAAA;AAKtE,uEAAuE;AACvE,wBAAgB,WAAW,CAAC,CAAC,EAAE,MAAM,EAAE,CAAC,EAAE,MAAM,GAAG,MAAM,CASxD;AAED;;;;;GAKG;AACH,wBAAgB,kBAAkB,CAAC,CAAC,EAAE,MAAM,EAAE;IAC7C,IAAI,EAAE,MAAM,CAAA;IACZ,OAAO,EAAE,CAAC,MAAM,EAAE,CAAC,KAAK,MAAM,GAAG,IAAI,GAAG,SAAS,CAAA;IACjD,MAAM,EAAE,eAAe,EAAE,CAAA;CACzB,GAAG,UAAU,CAAC,CAAC,CAAC,CAmBhB;AAED;;;;GAIG;AACH,eAAO,MAAM,uBAAuB,EAAE,eAAe,EAKpD,CAAA"}