@mailwoman/match 4.8.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/out/blocking.d.ts +77 -0
- package/out/blocking.d.ts.map +1 -0
- package/out/blocking.js +136 -0
- package/out/blocking.js.map +1 -0
- package/out/clustering.d.ts +50 -0
- package/out/clustering.d.ts.map +1 -0
- package/out/clustering.js +101 -0
- package/out/clustering.js.map +1 -0
- package/out/comparators.d.ts +49 -0
- package/out/comparators.d.ts.map +1 -0
- package/out/comparators.js +119 -0
- package/out/comparators.js.map +1 -0
- package/out/distance.d.ts +41 -0
- package/out/distance.d.ts.map +1 -0
- package/out/distance.js +68 -0
- package/out/distance.js.map +1 -0
- package/out/em.d.ts +51 -0
- package/out/em.d.ts.map +1 -0
- package/out/em.js +108 -0
- package/out/em.js.map +1 -0
- package/out/fellegi-sunter.d.ts +124 -0
- package/out/fellegi-sunter.d.ts.map +1 -0
- package/out/fellegi-sunter.js +109 -0
- package/out/fellegi-sunter.js.map +1 -0
- package/out/index.d.ts +25 -0
- package/out/index.d.ts.map +1 -0
- package/out/index.js +25 -0
- package/out/index.js.map +1 -0
- package/out/tf.d.ts +55 -0
- package/out/tf.d.ts.map +1 -0
- package/out/tf.js +66 -0
- package/out/tf.js.map +1 -0
- package/package.json +38 -0
package/out/tf.js
ADDED
|
@@ -0,0 +1,66 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* @copyright Sister Software
|
|
3
|
+
* @license AGPL-3.0
|
|
4
|
+
* @author Teffen Ellis, et al.
|
|
5
|
+
*
|
|
6
|
+
* Term-frequency adjustment — making a rare-value agreement count more than a common one.
|
|
7
|
+
*
|
|
8
|
+
* Two people both named "Vijayan" is far stronger evidence of a match than two both named "Smith",
|
|
9
|
+
* because "Smith" agreements happen by chance all the time and "Vijayan" agreements don't. The
|
|
10
|
+
* Fellegi-Sunter `m` (how often a true match agrees) is roughly the same either way; what differs
|
|
11
|
+
* is `u` — the chance a _non_-match agrees — which for an exact agreement on value `v` is just
|
|
12
|
+
* how common `v` is. So we leave `m`, and replace the level's average `u` with `frequency(v)`,
|
|
13
|
+
* adding `log2(u_level / frequency(v))` to the weight: a big positive bump for rare values, a
|
|
14
|
+
* penalty for common ones.
|
|
15
|
+
*
|
|
16
|
+
* Crucially for a label-free matcher: the frequencies are computed ON-THE-FLY from the input column
|
|
17
|
+
* (the Splink approach) — no external Census table required. Build a {@link TermFrequencyTable}
|
|
18
|
+
* from the values you're matching, then attach it to a comparison with {@link withTermFrequency}.
|
|
19
|
+
*/
|
|
20
|
+
// Default key normalizer: strips surrounding whitespace, lowercases, then
// squeezes every internal run of whitespace down to a single space so that
// cosmetic differences between values do not split their counts.
const defaultNormalize = (value) => {
    const trimmed = value.trim();
    const lowered = trimmed.toLowerCase();
    return lowered.replace(/\s+/g, " ");
};
|
|
21
|
+
/**
 * Build a {@link TermFrequencyTable} from an iterable of values (e.g. every `given` name in the
 * dataset). Values are normalized (default: trim + lowercase + collapse whitespace) before
 * counting, and `frequency()` normalizes its argument the same way, so callers pass raw field
 * values.
 *
 * `null`/`undefined` entries, and entries whose normalized key is empty, are skipped: they count
 * toward neither `total` nor `distinct`.
 *
 * @param values Iterable of raw field values; nullish entries are ignored.
 * @param opts Optional settings. `opts.normalize` replaces the default normalizer and is applied
 *   both while counting and on every `frequency()` lookup.
 * @returns A table exposing `total` (number of counted values), `distinct` (number of unique
 *   normalized keys) and `frequency(value)` (the share of counted values that normalize to the
 *   same key — `0` for unseen or nullish values and for an empty table).
 */
export function buildTermFrequencyTable(values, opts = {}) {
    const normalize = opts.normalize ?? defaultNormalize;
    const counts = new Map();
    let total = 0;
    for (const value of values) {
        if (value == null) {
            continue;
        }
        const key = normalize(value);
        // An empty key carries no information (blank field), so it is not counted.
        if (!key) {
            continue;
        }
        counts.set(key, (counts.get(key) ?? 0) + 1);
        total++;
    }
    return {
        total,
        distinct: counts.size,
        frequency(value) {
            // Mirror the builder's tolerance for missing fields: a nullish lookup is treated
            // like an unseen value instead of being handed to the normalizer, which would throw.
            if (total === 0 || value == null) {
                return 0;
            }
            return (counts.get(normalize(value)) ?? 0) / total;
        },
    };
}
|
|
50
|
+
/**
 * Decorate a comparison with a term-frequency adjustment.
 *
 * Unless `config.levels` says otherwise, the adjustment targets only level index 0 (the exact
 * level). The term to look up is produced by `config.value`, and its frequency is read from
 * `config.table` at lookup time. The input comparison is not modified: a shallow copy carrying
 * an extra `termFrequency` property is returned, so the original `assess` and levels stay as
 * they were and the result still composes with EM re-estimation of the base `m`/`u`.
 *
 * @param comparison The comparison to copy and extend.
 * @param config Adjustment settings: `table` (frequency source), `value` (term extractor), and
 *   the optional `levels`, `weight` and `minimumFrequency`, which are passed through unchanged.
 * @returns A new comparison object with `termFrequency` attached.
 */
export function withTermFrequency(comparison, config) {
    const levels = new Set(config.levels ?? [0]);
    const { value, weight, minimumFrequency } = config;
    return {
        ...comparison,
        termFrequency: {
            // Resolved through `config.table` on every call rather than captured up front.
            frequency: (term) => config.table.frequency(term),
            levels,
            value,
            weight,
            minimumFrequency,
        },
    };
}
|
|
66
|
+
//# sourceMappingURL=tf.js.map
|
package/out/tf.js.map
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"tf.js","sourceRoot":"","sources":["../tf.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;GAkBG;AAcH,MAAM,gBAAgB,GAAG,CAAC,KAAa,EAAU,EAAE,CAAC,KAAK,CAAC,IAAI,EAAE,CAAC,WAAW,EAAE,CAAC,OAAO,CAAC,MAAM,EAAE,GAAG,CAAC,CAAA;AAEnG;;;;;GAKG;AACH,MAAM,UAAU,uBAAuB,CACtC,MAA2C,EAC3C,OAAkD,EAAE;IAEpD,MAAM,SAAS,GAAG,IAAI,CAAC,SAAS,IAAI,gBAAgB,CAAA;IACpD,MAAM,MAAM,GAAG,IAAI,GAAG,EAAkB,CAAA;IACxC,IAAI,KAAK,GAAG,CAAC,CAAA;IAEb,KAAK,MAAM,KAAK,IAAI,MAAM,EAAE,CAAC;QAC5B,IAAI,KAAK,IAAI,IAAI;YAAE,SAAQ;QAC3B,MAAM,GAAG,GAAG,SAAS,CAAC,KAAK,CAAC,CAAA;QAC5B,IAAI,CAAC,GAAG;YAAE,SAAQ;QAClB,MAAM,CAAC,GAAG,CAAC,GAAG,EAAE,CAAC,MAAM,CAAC,GAAG,CAAC,GAAG,CAAC,IAAI,CAAC,CAAC,GAAG,CAAC,CAAC,CAAA;QAC3C,KAAK,EAAE,CAAA;IACR,CAAC;IAED,OAAO;QACN,KAAK;QACL,QAAQ,EAAE,MAAM,CAAC,IAAI;QACrB,SAAS,CAAC,KAAK;YACd,IAAI,KAAK,KAAK,CAAC;gBAAE,OAAO,CAAC,CAAA;YACzB,OAAO,CAAC,MAAM,CAAC,GAAG,CAAC,SAAS,CAAC,KAAK,CAAC,CAAC,IAAI,CAAC,CAAC,GAAG,KAAK,CAAA;QACnD,CAAC;KACD,CAAA;AACF,CAAC;AAED;;;;;GAKG;AACH,MAAM,UAAU,iBAAiB,CAChC,UAAyB,EACzB,MASC;IAED,MAAM,UAAU,GAA+B;QAC9C,SAAS,EAAE,CAAC,KAAK,EAAE,EAAE,CAAC,MAAM,CAAC,KAAK,CAAC,SAAS,CAAC,KAAK,CAAC;QACnD,MAAM,EAAE,IAAI,GAAG,CAAC,MAAM,CAAC,MAAM,IAAI,CAAC,CAAC,CAAC,CAAC;QACrC,KAAK,EAAE,MAAM,CAAC,KAAK;QACnB,MAAM,EAAE,MAAM,CAAC,MAAM;QACrB,gBAAgB,EAAE,MAAM,CAAC,gBAAgB;KACzC,CAAA;IAED,OAAO,EAAE,GAAG,UAAU,EAAE,aAAa,EAAE,UAAU,EAAE,CAAA;AACpD,CAAC"}
|
package/package.json
ADDED
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "@mailwoman/match",
|
|
3
|
+
"version": "4.8.1",
|
|
4
|
+
"description": "The geocode-first record matcher: block → score → cluster. This first cut ships the string comparators (Jaro / Jaro-Winkler + an edit-distance fallback for compound surnames) that the Fellegi-Sunter scorer is built on.",
|
|
5
|
+
"license": "AGPL-3.0-only",
|
|
6
|
+
"repository": {
|
|
7
|
+
"type": "git",
|
|
8
|
+
"url": "https://github.com/sister-software/mailwoman.git",
|
|
9
|
+
"directory": "match"
|
|
10
|
+
},
|
|
11
|
+
"type": "module",
|
|
12
|
+
"exports": {
|
|
13
|
+
"./package.json": "./package.json",
|
|
14
|
+
".": "./out/index.js",
|
|
15
|
+
"./blocking": "./out/blocking.js",
|
|
16
|
+
"./clustering": "./out/clustering.js",
|
|
17
|
+
"./comparators": "./out/comparators.js",
|
|
18
|
+
"./distance": "./out/distance.js",
|
|
19
|
+
"./fellegi-sunter": "./out/fellegi-sunter.js",
|
|
20
|
+
"./em": "./out/em.js",
|
|
21
|
+
"./tf": "./out/tf.js"
|
|
22
|
+
},
|
|
23
|
+
"dependencies": {
|
|
24
|
+
"fastest-levenshtein": "^1.0.16"
|
|
25
|
+
},
|
|
26
|
+
"devDependencies": {
|
|
27
|
+
"@types/node": "^25.9.2"
|
|
28
|
+
},
|
|
29
|
+
"files": [
|
|
30
|
+
"out/**/*.js",
|
|
31
|
+
"out/**/*.js.map",
|
|
32
|
+
"out/**/*.d.ts",
|
|
33
|
+
"out/**/*.d.ts.map"
|
|
34
|
+
],
|
|
35
|
+
"publishConfig": {
|
|
36
|
+
"access": "public"
|
|
37
|
+
}
|
|
38
|
+
}
|