@blamejs/core 0.7.103 → 0.7.105

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,297 @@
1
+ "use strict";
2
+ /**
3
+ * Fuzzy name-matching primitives for sanctions screening.
4
+ *
5
+ * Operators screening names against the OFAC SDN list / EU CSL /
6
+ * UK HMT consolidated list need to handle:
7
+ * - Transliteration variations (Mohamed / Mohammed / Muhammad)
8
+ * - Order-of-name variations (Smith John vs John Smith)
9
+ * - Initials vs full names (J. Smith vs John Smith)
10
+ * - Diacritical noise (Müller vs Muller)
11
+ * - Substring containment (the SDN entry "Acme Corp" matches a
12
+ * local record "Acme Corp Limited")
13
+ *
14
+ * This module exports the algorithmic core; b.compliance.sanctions
15
+ * orchestrates parser/index/match against it.
16
+ *
17
+ * Functions:
18
+ * normalize(name) → canonical lowercase form, diacritics
19
+ * stripped, multi-space collapsed
20
+ * tokenize(name) → array of normalized tokens
21
+ * levenshtein(a, b, capDist) → edit distance with O(min(a,b)) memory
22
+ * + early-exit when distance > capDist
23
+ * jaroWinkler(a, b, prefix) → 0..1 similarity score per Jaro-Winkler
24
+ * (1996); operators typically threshold
25
+ * at >= 0.85 for "probable match"
26
+ * tokenSetSimilarity(a, b) → bag-of-tokens overlap with token-pair
27
+ * Jaro-Winkler scoring; resilient to
28
+ * word order and missing/extra terms
29
+ *
30
+ * Performance: worst-case O(n*m) for Levenshtein (n,m = string lengths),
31
+ * O(n*m) for Jaro-Winkler. Operators screening against a list of N
32
+ * entries should pre-filter on token-set overlap before computing
33
+ * Jaro-Winkler on every candidate.
34
+ */
35
+
36
+ var validateOpts = require("./validate-opts");
37
+ var { defineClass } = require("./framework-error");
38
+
39
+ var FuzzyError = defineClass("FuzzyError", { alwaysPermanent: true });
40
+
41
+ // ---- normalize ----
42
+
43
// Diacritic-stripping table — covers the most common Latin letters.
// The framework intentionally ships a focused table (not a full
// Unicode normalizer) so the LoC is bounded; operators with
// non-Latin lists install ICU normalizer in their pre-processing.
//
// Keys are lower-case only: normalize() lowercases BEFORE consulting
// the table, so every upper-case variant (Ł, Ć, Ş, Ğ, Ż, ...) is
// covered by its lower-case entry instead of needing its own row.
var _DIACRITIC_MAP = {
  "à":"a","á":"a","â":"a","ã":"a","ä":"a","å":"a","ą":"a","ă":"a",
  "ç":"c","ć":"c","č":"c","ĉ":"c",
  "ď":"d","đ":"d",
  "è":"e","é":"e","ê":"e","ë":"e","ę":"e","ě":"e","ĕ":"e",
  "ğ":"g","ĝ":"g","ġ":"g",
  "ĥ":"h",
  "ì":"i","í":"i","î":"i","ï":"i","ı":"i","į":"i",
  "ĵ":"j",
  "ķ":"k",
  "ĺ":"l","ľ":"l","ł":"l","ļ":"l",
  "ñ":"n","ń":"n","ň":"n","ņ":"n",
  "ò":"o","ó":"o","ô":"o","õ":"o","ö":"o","ø":"o","ő":"o",
  "ŕ":"r","ř":"r",
  "ś":"s","š":"s","ş":"s","ș":"s","ŝ":"s",
  "ť":"t","ţ":"t","ț":"t",
  "ù":"u","ú":"u","û":"u","ü":"u","ū":"u","ů":"u","ű":"u","ŭ":"u",
  "ŵ":"w",
  "ý":"y","ÿ":"y","ŷ":"y",
  "ź":"z","ż":"z","ž":"z",
  "ß":"ss","æ":"ae","œ":"oe",
};

/**
 * Reduce a name to its canonical comparison form.
 *
 * Steps: NFC-compose, lowercase, strip diacritics via _DIACRITIC_MAP,
 * replace punctuation (other than apostrophe and hyphen) with a space,
 * collapse whitespace runs and trim.
 *
 * @param {*} name  Value to normalize; anything that is not a string
 *                  yields "".
 * @returns {string} Canonical lower-case form (possibly empty).
 */
function normalize(name) {
  if (typeof name !== "string") return "";
  // 1. Compose, then lowercase. Composing first turns a decomposed
  //    sequence such as "u" + U+0308 into "ü" so the table can see it
  //    (otherwise the bare combining mark is treated as punctuation
  //    and splits the token). Lowercasing before the lookup makes
  //    upper- and lower-case spellings of the same letter normalize
  //    identically.
  var lower = name.normalize("NFC").toLowerCase();
  // 2. Strip diacritics
  var stripped = "";
  for (var i = 0; i < lower.length; i++) {
    var ch = lower.charAt(i);
    stripped += _DIACRITIC_MAP[ch] || ch;
  }
  // 3. Strip punctuation other than hyphen + apostrophe (preserved
  //    inside names like O'Brien / Al-Faisal)
  var punctStripped = stripped.replace(/[^\p{Letter}\p{Number}'\- ]+/gu, " "); // allow:regex-no-length-cap — caller bounds total input via tokenize() length cap
  // 4. Collapse whitespace
  return punctStripped.replace(/\s+/g, " ").trim();
}
95
+
96
var MAX_INPUT_LEN = 512; // allow:raw-byte-literal — name length sanity cap (operators can override fuzzy.create)

/**
 * Split a name into normalized tokens.
 *
 * @param {*} name  Raw name; a non-string yields an empty array.
 * @returns {string[]} Non-empty tokens of normalize(name), in order.
 * @throws {FuzzyError} "fuzzy/input-too-long" when the raw input is
 *                      longer than MAX_INPUT_LEN characters.
 */
function tokenize(name) {
  if (typeof name !== "string") return [];
  // The cap is checked on the raw input, before any normalization work.
  if (name.length > MAX_INPUT_LEN) {
    throw new FuzzyError("fuzzy/input-too-long",
      "tokenize: input exceeds " + MAX_INPUT_LEN + " char cap");
  }
  var canonical = normalize(name);
  var tokens = [];
  if (canonical.length === 0) return tokens;
  var pieces = canonical.split(" ");
  for (var p = 0; p < pieces.length; p++) {
    if (pieces[p].length > 0) tokens.push(pieces[p]);
  }
  return tokens;
}
108
+
109
+ // ---- Levenshtein with cap + early-exit ----
110
+
111
/**
 * Levenshtein edit distance with an optional cap.
 *
 * Comparison is per UTF-16 code unit (charAt).
 *
 * @param {string} a
 * @param {string} b
 * @param {number} [capDist]  Optional non-negative cap. When given, the
 *        result is the exact distance if it is <= cap, otherwise
 *        cap + 1 (cap is floored to an integer). Anything that is not
 *        a non-negative number (undefined, NaN, negative) means "no
 *        cap".
 * @returns {number} The edit distance, or cap + 1 when it exceeds the cap.
 * @throws {FuzzyError} "fuzzy/bad-input" when a or b is not a string.
 */
function levenshtein(a, b, capDist) {
  if (typeof a !== "string" || typeof b !== "string") {
    throw new FuzzyError("fuzzy/bad-input",
      "levenshtein: a + b must be strings");
  }
  if (a === b) return 0;

  // Decide ONCE whether a usable cap was supplied. A negative cap must
  // not reach the early-exit test below: rowMin > cap would always be
  // true and the function would report cap + 1 (0 or a negative number)
  // for strings that differ.
  var hasCap = typeof capDist === "number" && capDist >= 0;
  var cap = hasCap ? Math.floor(capDist) : 0;

  // The length difference is a lower bound on the distance, so it can
  // rule a pair out before any DP work (this also bounds the
  // empty-string cases below).
  if (hasCap && Math.abs(a.length - b.length) > cap) return cap + 1;
  if (a.length === 0) return b.length;
  if (b.length === 0) return a.length;

  // Two-row DP over the shorter string: O(min(a.length, b.length)) memory.
  var aIsShorter = a.length <= b.length;
  var s = aIsShorter ? a : b;
  var t = aIsShorter ? b : a;
  var prev = new Array(s.length + 1);
  var curr = new Array(s.length + 1);
  for (var i = 0; i <= s.length; i++) prev[i] = i;
  for (var j = 1; j <= t.length; j++) {
    curr[0] = j;
    var rowMin = j;
    for (var k = 1; k <= s.length; k++) {
      var cost = s.charAt(k - 1) === t.charAt(j - 1) ? 0 : 1;
      curr[k] = Math.min(
        prev[k] + 1,        // deletion
        curr[k - 1] + 1,    // insertion
        prev[k - 1] + cost  // substitution
      );
      if (curr[k] < rowMin) rowMin = curr[k];
    }
    // Row minima never decrease, so once a whole row is over the cap
    // the final distance must be too.
    if (hasCap && rowMin > cap) return cap + 1;
    var swap = prev; prev = curr; curr = swap;
  }
  var distance = prev[s.length];
  // The last cell can exceed the cap even when the row minimum did
  // not; clamp so "over cap" is always reported as cap + 1.
  return hasCap && distance > cap ? cap + 1 : distance;
}
151
+
152
+ // ---- Jaro and Jaro-Winkler ----
153
+
154
/**
 * Jaro similarity of two strings.
 *
 * @param {*} a
 * @param {*} b
 * @returns {number} Score in [0, 1]. Returns 0 when either argument is
 *          not a string or is empty (two empty strings also score 0),
 *          and 1 for identical non-empty strings.
 */
function jaro(a, b) {
  if (typeof a !== "string" || typeof b !== "string") return 0;
  var lenA = a.length;
  var lenB = b.length;
  if (lenA === 0 || lenB === 0) return 0;
  if (a === b) return 1;

  // Characters only count as matching when their positions are at most
  // this far apart.
  var matchWindow = Math.max(0, Math.floor(Math.max(lenA, lenB) / 2) - 1); // allow:raw-byte-literal — Jaro match-window formula
  var usedA = new Array(lenA).fill(false);
  var usedB = new Array(lenB).fill(false);
  var matches = 0;
  for (var i = 0; i < lenA; i++) {
    var from = Math.max(0, i - matchWindow);
    var to = Math.min(lenB - 1, i + matchWindow);
    var wanted = a.charAt(i);
    for (var j = from; j <= to; j++) {
      // Take the first still-unclaimed equal character inside the window.
      if (!usedB[j] && b.charAt(j) === wanted) {
        usedA[i] = true;
        usedB[j] = true;
        matches += 1;
        break;
      }
    }
  }
  if (matches === 0) return 0;

  // Walk both matched sequences in order; each position where they
  // disagree is half a transposition.
  var outOfOrder = 0;
  var cursor = 0;
  for (var p = 0; p < lenA; p++) {
    if (usedA[p]) {
      while (!usedB[cursor]) cursor += 1;
      if (a.charAt(p) !== b.charAt(cursor)) outOfOrder += 1;
      cursor += 1;
    }
  }
  var transpositions = outOfOrder / 2;
  return (matches / lenA + matches / lenB +
          (matches - transpositions) / matches) / 3; // allow:raw-byte-literal — Jaro 3-term formula
}
188
+
189
/**
 * Jaro-Winkler similarity: the Jaro score boosted for a shared prefix.
 *
 * @param {*} a
 * @param {*} b
 * @param {number} [prefixWeight]  Weight per shared prefix character.
 *        Anything that is not a finite number falls back to 0.1;
 *        operators can lower it to reduce prefix bias.
 * @returns {number} Score in [0, 1]; 0 whenever jaro(a, b) is 0.
 * @throws {FuzzyError} "fuzzy/bad-prefix-weight" when the weight is
 *        outside [0, 0.25].
 */
function jaroWinkler(a, b, prefixWeight) {
  var w = 0.1;
  if (typeof prefixWeight === "number" && isFinite(prefixWeight)) {
    w = prefixWeight;
  }
  // With at most 4 prefix characters, a weight above 0.25 could push
  // the result past 1.
  if (!(w >= 0 && w <= 0.25)) {
    throw new FuzzyError("fuzzy/bad-prefix-weight",
      "jaroWinkler: prefixWeight must be in [0, 0.25]");
  }
  var base = jaro(a, b);
  if (base === 0) return 0;
  // Common prefix up to 4 chars (Winkler's cap)
  var maxPrefix = 4; // allow:raw-byte-literal — Jaro-Winkler prefix cap (Winkler 1990)
  var limit = Math.min(a.length, b.length, maxPrefix);
  var prefixLen = 0;
  while (prefixLen < limit && a.charAt(prefixLen) === b.charAt(prefixLen)) {
    prefixLen += 1;
  }
  return base + prefixLen * w * (1 - base);
}
210
+
211
+ // ---- Token-set similarity ----
212
+
213
/**
 * Word-order-insensitive similarity between two names.
 *
 * Each token of `a` is greedily paired with its best-scoring,
 * not-yet-used token of `b` (Jaro-Winkler); a pair only counts when
 * its score reaches the threshold.
 *
 * @param {*} a
 * @param {*} b
 * @param {Object} [opts]
 * @param {number} [opts.prefixWeight]  Forwarded to jaroWinkler.
 * @param {number} [opts.threshold]     Minimum pair score; defaults to
 *        0.85 when absent or not a finite number.
 * @returns {number} Mean score of the accepted pairs multiplied by the
 *          fraction of the shorter token list that was paired; 0 when
 *          either side has no tokens or nothing paired.
 */
function tokenSetSimilarity(a, b, opts) {
  var options = opts || {};
  var prefixWeight = options.prefixWeight;
  var threshold = 0.85;
  if (typeof options.threshold === "number" && isFinite(options.threshold)) {
    threshold = options.threshold;
  }
  var left = tokenize(a);
  var right = tokenize(b);
  if (left.length === 0 || right.length === 0) return 0;

  // Greedy bipartite matching. This is O(n*m) but the typical name has
  // only a handful of tokens, so it stays small.
  var taken = new Array(right.length).fill(false);
  var scoreSum = 0;
  var pairCount = 0;
  left.forEach(function (token) {
    var bestScore = 0;
    var bestIdx = -1;
    right.forEach(function (candidate, idx) {
      if (taken[idx]) return;
      var score = jaroWinkler(token, candidate, prefixWeight);
      if (score > bestScore) {
        bestScore = score;
        bestIdx = idx;
      }
    });
    if (bestIdx !== -1 && bestScore >= threshold) {
      taken[bestIdx] = true;
      scoreSum += bestScore;
      pairCount += 1;
    }
  });
  if (pairCount === 0) return 0;

  // Average of the accepted pairs, weighted by coverage of the side
  // with fewer tokens.
  var average = scoreSum / pairCount;
  var coverage = pairCount / Math.min(left.length, right.length);
  return average * coverage;
}
247
+
248
+ // ---- Container helpers ----
249
+
250
// substringContains — true when the normalized form of `needle` is a
// whitespace-bounded substring of the normalized form of `haystack`.
// Useful for catching SDN entries like "Acme Corp" inside a fuller
// local record like "Acme Corp Limited Liability Company".
//
// A needle that normalizes to the empty string (non-string input, "",
// or punctuation only) never matches: without that guard it would be
// reported as contained in any haystack that also normalizes to empty.
function substringContains(haystack, needle) {
  var normalizedNeedle = normalize(needle);
  if (normalizedNeedle.length === 0) return false;
  // Pad both sides with a space so a hit must start and end on a token
  // boundary ("acme corp" is not found inside "acme corporation").
  var paddedHaystack = " " + normalize(haystack) + " ";
  return paddedHaystack.indexOf(" " + normalizedNeedle + " ") !== -1;
}
259
+
260
// initialsMatch — true when `a` and `b` have the same number of tokens
// and every token pair is either identical or has one side that is a
// single character equal to the first character of the other side
// ("J Smith" / "J. Smith" vs "John Smith"). Catches the common
// "screen-typo" pattern where the user typed an initial instead of a
// full first name. Because token counts must be equal, a compact form
// such as "JS" does not match "John Smith".
function initialsMatch(a, b) {
  var left = tokenize(a);
  var right = tokenize(b);
  if (left.length === 0 || right.length === 0) return false;
  if (left.length !== right.length) return false;
  return left.every(function (x, idx) {
    var y = right[idx];
    if (x === y) return true;
    // Either side may be the abbreviated one.
    var xIsInitialOfY = x.length === 1 && y.startsWith(x);
    var yIsInitialOfX = y.length === 1 && x.startsWith(y);
    return xIsInitialOfY || yIsInitialOfX;
  });
}
281
+
282
+ module.exports = {
283
+ normalize: normalize,
284
+ tokenize: tokenize,
285
+ levenshtein: levenshtein,
286
+ jaro: jaro,
287
+ jaroWinkler: jaroWinkler,
288
+ tokenSetSimilarity: tokenSetSimilarity,
289
+ substringContains: substringContains,
290
+ initialsMatch: initialsMatch,
291
+ FuzzyError: FuzzyError,
292
+ MAX_INPUT_LEN: MAX_INPUT_LEN,
293
+ };
294
+ // note: validateOpts intentionally not used in this file (pure
295
+ // algorithmic helpers); imported only to keep the require shape
296
+ // consistent with sister modules.
297
+ void validateOpts;