@mailwoman/match 4.8.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/out/tf.js ADDED
@@ -0,0 +1,66 @@
1
+ /**
2
+ * @copyright Sister Software
3
+ * @license AGPL-3.0
4
+ * @author Teffen Ellis, et al.
5
+ *
6
+ * Term-frequency adjustment — making a rare-value agreement count more than a common one.
7
+ *
8
+ * Two people both named "Vijayan" is far stronger evidence of a match than two both named "Smith",
9
+ * because "Smith" agreements happen by chance all the time and "Vijayan" agreements don't. The
10
+ * Fellegi-Sunter `m` (how often a true match agrees) is roughly the same either way; what differs
11
+ * is `u` — the chance a _non_-match agrees — which for an exact agreement on value `v` is just
12
+ * how common `v` is. So we leave `m`, and replace the level's average `u` with `frequency(v)`,
13
+ * adding `log2(u_level / frequency(v))` to the weight: a big positive bump for rare values, a
14
+ * penalty for common ones.
15
+ *
16
+ * Crucially for a label-free matcher: the frequencies are computed ON-THE-FLY from the input column
17
+ * (the Splink approach) — no external Census table required. Build a {@link TermFrequencyTable}
18
+ * from the values you're matching, then attach it to a comparison with {@link withTermFrequency}.
19
+ */
20
/**
 * Default key normalizer: strips leading/trailing whitespace, lowercases, then collapses every
 * internal run of whitespace into a single space, so "  John   SMITH " and "john smith" share a
 * key.
 */
const defaultNormalize = (value) => {
  const trimmed = value.trim();
  const lowered = trimmed.toLowerCase();
  return lowered.replace(/\s+/g, " ");
};
21
/**
 * Build a {@link TermFrequencyTable} from an iterable of values (e.g. every `given` name in the
 * dataset). Values are normalized (default: trim + lowercase + collapse whitespace) before
 * counting, and `frequency()` normalizes its argument the same way, so callers pass raw field
 * values.
 *
 * Entries that are `null`/`undefined`, or whose normalized key is empty, are not counted and do
 * not contribute to `total`. `frequency()` treats such values the same way: it returns 0 for
 * them instead of handing them to the normalizer.
 *
 * @param values Iterable of raw field values; `null`/`undefined` entries are skipped.
 * @param opts Optional settings. `opts.normalize` replaces the default normalizer and is used
 *   both while counting and when looking a value up.
 * @returns An object with `total` (number of counted values), `distinct` (number of distinct
 *   normalized keys) and `frequency(value)` (the share of counted values whose key equals the
 *   normalized `value`; 0 for unseen, missing or blank values and for an empty table).
 */
export function buildTermFrequencyTable(values, opts = {}) {
  const normalize = opts.normalize ?? defaultNormalize;
  const counts = new Map();
  let total = 0;
  for (const value of values) {
    if (value == null) {
      continue;
    }
    const key = normalize(value);
    // Blank-after-normalization values carry no signal; keep them out of the denominator.
    if (!key) {
      continue;
    }
    counts.set(key, (counts.get(key) ?? 0) + 1);
    total++;
  }
  return {
    total,
    distinct: counts.size,
    frequency(value) {
      // Mirror the builder's null handling: a missing value was never counted, so its
      // frequency is 0 rather than a TypeError from the normalizer.
      if (total === 0 || value == null) {
        return 0;
      }
      const key = normalize(value);
      if (!key) {
        return 0;
      }
      return (counts.get(key) ?? 0) / total;
    },
  };
}
50
/**
 * Attach a term-frequency adjustment to a comparison.
 *
 * The adjustment applies to the levels listed in `config.levels`, defaulting to the exact level
 * (index 0), and carries `config.value`, `config.weight` and `config.minimumFrequency` through
 * unchanged. Frequencies are looked up through `config.table.frequency` at call time.
 *
 * Returns a shallow copy of `comparison` with a `termFrequency` property added; the input
 * comparison object is not modified, so its `assess` and levels are untouched and this composes
 * with EM (which re-estimates the base `m`/`u` the adjustment sits on top of).
 */
export function withTermFrequency(comparison, config) {
  const { levels, value, weight, minimumFrequency } = config;
  // Resolve `config.table` on every lookup rather than capturing it once.
  const lookup = (term) => config.table.frequency(term);
  const termFrequency = {
    frequency: lookup,
    levels: new Set(levels ?? [0]),
    value,
    weight,
    minimumFrequency,
  };
  return { ...comparison, termFrequency };
}
66
+ //# sourceMappingURL=tf.js.map
package/out/tf.js.map ADDED
@@ -0,0 +1 @@
1
+ {"version":3,"file":"tf.js","sourceRoot":"","sources":["../tf.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;GAkBG;AAcH,MAAM,gBAAgB,GAAG,CAAC,KAAa,EAAU,EAAE,CAAC,KAAK,CAAC,IAAI,EAAE,CAAC,WAAW,EAAE,CAAC,OAAO,CAAC,MAAM,EAAE,GAAG,CAAC,CAAA;AAEnG;;;;;GAKG;AACH,MAAM,UAAU,uBAAuB,CACtC,MAA2C,EAC3C,OAAkD,EAAE;IAEpD,MAAM,SAAS,GAAG,IAAI,CAAC,SAAS,IAAI,gBAAgB,CAAA;IACpD,MAAM,MAAM,GAAG,IAAI,GAAG,EAAkB,CAAA;IACxC,IAAI,KAAK,GAAG,CAAC,CAAA;IAEb,KAAK,MAAM,KAAK,IAAI,MAAM,EAAE,CAAC;QAC5B,IAAI,KAAK,IAAI,IAAI;YAAE,SAAQ;QAC3B,MAAM,GAAG,GAAG,SAAS,CAAC,KAAK,CAAC,CAAA;QAC5B,IAAI,CAAC,GAAG;YAAE,SAAQ;QAClB,MAAM,CAAC,GAAG,CAAC,GAAG,EAAE,CAAC,MAAM,CAAC,GAAG,CAAC,GAAG,CAAC,IAAI,CAAC,CAAC,GAAG,CAAC,CAAC,CAAA;QAC3C,KAAK,EAAE,CAAA;IACR,CAAC;IAED,OAAO;QACN,KAAK;QACL,QAAQ,EAAE,MAAM,CAAC,IAAI;QACrB,SAAS,CAAC,KAAK;YACd,IAAI,KAAK,KAAK,CAAC;gBAAE,OAAO,CAAC,CAAA;YACzB,OAAO,CAAC,MAAM,CAAC,GAAG,CAAC,SAAS,CAAC,KAAK,CAAC,CAAC,IAAI,CAAC,CAAC,GAAG,KAAK,CAAA;QACnD,CAAC;KACD,CAAA;AACF,CAAC;AAED;;;;;GAKG;AACH,MAAM,UAAU,iBAAiB,CAChC,UAAyB,EACzB,MASC;IAED,MAAM,UAAU,GAA+B;QAC9C,SAAS,EAAE,CAAC,KAAK,EAAE,EAAE,CAAC,MAAM,CAAC,KAAK,CAAC,SAAS,CAAC,KAAK,CAAC;QACnD,MAAM,EAAE,IAAI,GAAG,CAAC,MAAM,CAAC,MAAM,IAAI,CAAC,CAAC,CAAC,CAAC;QACrC,KAAK,EAAE,MAAM,CAAC,KAAK;QACnB,MAAM,EAAE,MAAM,CAAC,MAAM;QACrB,gBAAgB,EAAE,MAAM,CAAC,gBAAgB;KACzC,CAAA;IAED,OAAO,EAAE,GAAG,UAAU,EAAE,aAAa,EAAE,UAAU,EAAE,CAAA;AACpD,CAAC"}
package/package.json ADDED
@@ -0,0 +1,38 @@
1
+ {
2
+ "name": "@mailwoman/match",
3
+ "version": "4.8.1",
4
+ "description": "The geocode-first record matcher: block → score → cluster. This first cut ships the string comparators (Jaro / Jaro-Winkler + an edit-distance fallback for compound surnames) that the Fellegi-Sunter scorer is built on.",
5
+ "license": "AGPL-3.0-only",
6
+ "repository": {
7
+ "type": "git",
8
+ "url": "https://github.com/sister-software/mailwoman.git",
9
+ "directory": "match"
10
+ },
11
+ "type": "module",
12
+ "exports": {
13
+ "./package.json": "./package.json",
14
+ ".": "./out/index.js",
15
+ "./blocking": "./out/blocking.js",
16
+ "./clustering": "./out/clustering.js",
17
+ "./comparators": "./out/comparators.js",
18
+ "./distance": "./out/distance.js",
19
+ "./fellegi-sunter": "./out/fellegi-sunter.js",
20
+ "./em": "./out/em.js",
21
+ "./tf": "./out/tf.js"
22
+ },
23
+ "dependencies": {
24
+ "fastest-levenshtein": "^1.0.16"
25
+ },
26
+ "devDependencies": {
27
+ "@types/node": "^25.9.2"
28
+ },
29
+ "files": [
30
+ "out/**/*.js",
31
+ "out/**/*.js.map",
32
+ "out/**/*.d.ts",
33
+ "out/**/*.d.ts.map"
34
+ ],
35
+ "publishConfig": {
36
+ "access": "public"
37
+ }
38
+ }