@blamejs/core 0.7.103 → 0.7.105
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +4 -0
- package/index.js +2 -0
- package/lib/audit.js +1 -0
- package/lib/compliance-sanctions-aliases.js +167 -0
- package/lib/compliance-sanctions-fetcher.js +206 -0
- package/lib/compliance-sanctions-fuzzy.js +297 -0
- package/lib/compliance-sanctions.js +569 -0
- package/lib/compliance.js +2 -0
- package/lib/dsr.js +953 -0
- package/package.json +1 -1
- package/sbom.cyclonedx.json +6 -6
|
@@ -0,0 +1,297 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
/**
|
|
3
|
+
* Fuzzy name-matching primitives for sanctions screening.
|
|
4
|
+
*
|
|
5
|
+
* Operators screening names against the OFAC SDN list / EU CSL /
|
|
6
|
+
* UK HMT consolidated list need to handle:
|
|
7
|
+
* - Transliteration variations (Mohamed / Mohammed / Muhammad)
|
|
8
|
+
* - Order-of-name variations (Smith John vs John Smith)
|
|
9
|
+
* - Initials vs full names (J. Smith vs John Smith)
|
|
10
|
+
* - Diacritical noise (Müller vs Muller)
|
|
11
|
+
* - Substring containment (the SDN entry "Acme Corp" matches a
|
|
12
|
+
* local record "Acme Corp Limited")
|
|
13
|
+
*
|
|
14
|
+
* This module exports the algorithmic core; b.compliance.sanctions
|
|
15
|
+
* orchestrates parser/index/match against it.
|
|
16
|
+
*
|
|
17
|
+
* Functions:
|
|
18
|
+
* normalize(name) → canonical lowercase form, diacritics
|
|
19
|
+
* stripped, multi-space collapsed
|
|
20
|
+
* tokenize(name) → array of normalized tokens
|
|
21
|
+
* levenshtein(a, b, capDist) → edit distance with O(min(a,b)) memory
|
|
22
|
+
* + early-exit when distance > capDist
|
|
23
|
+
* jaroWinkler(a, b, prefix) → 0..1 similarity score per Jaro-Winkler
|
|
24
|
+
* (Winkler 1990); operators typically threshold
|
|
25
|
+
* at >= 0.85 for "probable match"
|
|
26
|
+
* tokenSetSimilarity(a, b) → bag-of-tokens overlap with token-pair
|
|
27
|
+
* Jaro-Winkler scoring; resilient to
|
|
28
|
+
* word order and missing/extra terms
|
|
29
|
+
*
|
|
30
|
+
* Performance: worst-case O(n*m) for Levenshtein (n,m = string lengths),
|
|
31
|
+
* O(n*m) for Jaro-Winkler. Operators screening against a list of N
|
|
32
|
+
* entries should pre-filter on token-set overlap before computing
|
|
33
|
+
* Jaro-Winkler on every candidate.
|
|
34
|
+
*/
|
|
35
|
+
|
|
36
|
+
var validateOpts = require("./validate-opts");
|
|
37
|
+
var { defineClass } = require("./framework-error");
|
|
38
|
+
|
|
39
|
+
// Error class thrown by the helpers in this module. Codes raised in this
// file: "fuzzy/input-too-long", "fuzzy/bad-input", "fuzzy/bad-prefix-weight".
// NOTE(review): `alwaysPermanent: true` presumably marks these errors as
// non-retryable — confirm against ./framework-error.
var FuzzyError = defineClass("FuzzyError", { alwaysPermanent: true });
|
|
40
|
+
|
|
41
|
+
// ---- normalize ----
|
|
42
|
+
|
|
43
|
+
// Diacritic-folding table — covers the most common Latin letters. The
// framework intentionally ships a focused table (not a full Unicode
// normalizer) so the LoC is bounded; operators with non-Latin lists
// install an ICU normalizer in their pre-processing.
//
// Keys are lowercase only: normalize() lowercases BEFORE the lookup, so an
// uppercase letter is always folded through its lowercase entry. (A separate
// uppercase table is easy to leave incomplete, which makes "Łukasz" and
// "łukasz" normalize to different strings.)
var _DIACRITIC_MAP = {
  "à":"a","á":"a","â":"a","ã":"a","ä":"a","å":"a","ą":"a","ă":"a",
  "ç":"c","ć":"c","č":"c","ĉ":"c",
  "ď":"d","đ":"d",
  "è":"e","é":"e","ê":"e","ë":"e","ę":"e","ě":"e","ĕ":"e",
  "ğ":"g","ĝ":"g","ġ":"g",
  "ĥ":"h",
  "ì":"i","í":"i","î":"i","ï":"i","ı":"i","į":"i",
  "ĵ":"j",
  "ķ":"k",
  "ĺ":"l","ľ":"l","ł":"l","ļ":"l",
  "ñ":"n","ń":"n","ň":"n","ņ":"n",
  "ò":"o","ó":"o","ô":"o","õ":"o","ö":"o","ø":"o","ő":"o",
  "ŕ":"r","ř":"r",
  "ś":"s","š":"s","ş":"s","ș":"s","ŝ":"s",
  "ť":"t","ţ":"t","ț":"t",
  "ù":"u","ú":"u","û":"u","ü":"u","ū":"u","ů":"u","ű":"u","ŭ":"u",
  "ŵ":"w",
  "ý":"y","ÿ":"y","ŷ":"y",
  "ź":"z","ż":"z","ž":"z",
  "ß":"ss","æ":"ae","œ":"oe",
};

/**
 * Reduce a name to its canonical comparison form.
 *
 * Steps, in order:
 *   1. NFC-compose, so a base letter followed by a combining mark becomes
 *      the precomposed letter the table knows about.
 *   2. Lowercase.
 *   3. Fold diacritics through _DIACRITIC_MAP (characters not in the table
 *      pass through unchanged).
 *   4. Replace every run of characters other than letters, numbers,
 *      apostrophe, hyphen and space with a single space.
 *   5. Collapse whitespace runs and trim.
 *
 * This function applies no length cap of its own; tokenize() is the entry
 * point that enforces MAX_INPUT_LEN.
 *
 * @param {*} name  Value to normalize.
 * @returns {string} Canonical form; "" when `name` is not a string.
 */
function normalize(name) {
  if (typeof name !== "string") return "";
  // 1 + 2. Compose, then lowercase, so the fold below sees one lowercase
  //        code unit per accented Latin letter regardless of input case
  //        or composition form.
  var lower = name.normalize("NFC").toLowerCase();
  // 3. Fold diacritics
  var folded = "";
  for (var i = 0; i < lower.length; i++) {
    var ch = lower.charAt(i);
    folded += _DIACRITIC_MAP[ch] || ch;
  }
  // 4. Strip punctuation other than hyphen + apostrophe (preserved
  //    inside names like O'Brien / Al-Faisal)
  var punctStripped = folded.replace(/[^\p{Letter}\p{Number}'\- ]+/gu, " "); // allow:regex-no-length-cap — caller bounds total input via tokenize() length cap
  // 5. Collapse whitespace
  return punctStripped.replace(/\s+/g, " ").trim();
}
|
|
95
|
+
|
|
96
|
+
/**
 * Split a name into its normalized, non-empty tokens.
 *
 * The length cap is checked against the raw input (String.length) before
 * any normalization work is done.
 *
 * @param {*} name  Value to tokenize.
 * @returns {string[]} Tokens of normalize(name) split on single spaces;
 *   [] when `name` is not a string or normalizes to "".
 * @throws {FuzzyError} "fuzzy/input-too-long" when name.length exceeds
 *   MAX_INPUT_LEN.
 */
function tokenize(name) {
  var tokens = [];
  if (typeof name !== "string") {
    return tokens;
  }
  if (name.length > MAX_INPUT_LEN) {
    var message = "tokenize: input exceeds " + MAX_INPUT_LEN + " char cap";
    throw new FuzzyError("fuzzy/input-too-long", message);
  }
  var canonical = normalize(name);
  if (canonical.length === 0) {
    return tokens;
  }
  var pieces = canonical.split(" ");
  for (var p = 0; p < pieces.length; p++) {
    if (pieces[p].length > 0) tokens.push(pieces[p]);
  }
  return tokens;
}
|
|
106
|
+
|
|
107
|
+
// Upper bound on the raw input length (String.length, i.e. UTF-16 code
// units) that tokenize() accepts; longer inputs make tokenize() throw
// FuzzyError "fuzzy/input-too-long". Declared after tokenize() in source
// order, which works because `var` is hoisted and this assignment runs at
// module load, before module.exports is populated.
var MAX_INPUT_LEN = 512; // allow:raw-byte-literal — name length sanity cap (operators can override fuzzy.create)
|
|
108
|
+
|
|
109
|
+
// ---- Levenshtein with cap + early-exit ----
|
|
110
|
+
|
|
111
|
+
/**
 * Levenshtein edit distance between two strings, with an optional cap.
 *
 * Uses a two-row dynamic program, so memory is proportional to the shorter
 * string. Comparison is per UTF-16 code unit (charAt).
 *
 * @param {string} a
 * @param {string} b
 * @param {number} [capDist]  Optional non-negative cap. When supplied, the
 *   function returns capDist + 1 as soon as it can prove the distance
 *   exceeds the cap. Anything that is not a number >= 0 (undefined, NaN,
 *   negative) means "no cap".
 * @returns {number} The edit distance, or capDist + 1 on early exit.
 * @throws {FuzzyError} "fuzzy/bad-input" when a or b is not a string.
 */
function levenshtein(a, b, capDist) {
  if (typeof a !== "string" || typeof b !== "string") {
    throw new FuzzyError("fuzzy/bad-input",
      "levenshtein: a + b must be strings");
  }
  // Trivial cases
  if (a === b) return 0;
  if (a.length === 0) return b.length;
  if (b.length === 0) return a.length;

  // Decide ONCE whether a usable cap was supplied. Both early exits below
  // must agree on this: if the per-row exit honoured a negative cap it
  // would fire on the first row and report capDist + 1 (<= 0) for strings
  // that are actually different.
  var hasCap = typeof capDist === "number" && capDist >= 0;

  // The length difference is a lower bound on the distance; if it already
  // exceeds the cap we can skip the full DP.
  if (hasCap && Math.abs(a.length - b.length) > capDist) return capDist + 1;

  // Two-row DP: O(min(a.length, b.length)) memory.
  var s = a.length <= b.length ? a : b;
  var t = a.length <= b.length ? b : a;
  var prev = new Array(s.length + 1);
  var curr = new Array(s.length + 1);
  for (var i = 0; i <= s.length; i++) prev[i] = i;
  for (var j = 1; j <= t.length; j++) {
    curr[0] = j;
    var rowMin = j;
    for (var k = 1; k <= s.length; k++) {
      var cost = s.charAt(k - 1) === t.charAt(j - 1) ? 0 : 1;
      curr[k] = Math.min(
        prev[k] + 1,        // deletion
        curr[k - 1] + 1,    // insertion
        prev[k - 1] + cost  // substitution
      );
      if (curr[k] < rowMin) rowMin = curr[k];
    }
    // Row minima never decrease from one row to the next, so once a whole
    // row is above the cap the final distance must be too.
    if (hasCap && rowMin > capDist) return capDist + 1;
    var swap = prev; prev = curr; curr = swap;
  }
  return prev[s.length];
}
|
|
151
|
+
|
|
152
|
+
// ---- Jaro and Jaro-Winkler ----
|
|
153
|
+
|
|
154
|
+
/**
 * Jaro similarity of two strings, in the range 0..1.
 *
 * Two characters "match" when they are equal and their positions differ by
 * at most floor(max(len) / 2) - 1. Matched characters that appear in a
 * different order on the two sides count as half a transposition each.
 *
 * @param {*} a
 * @param {*} b
 * @returns {number} 0 when either argument is not a string, either is
 *   empty, or nothing matches; 1 for identical non-empty strings.
 */
function jaro(a, b) {
  if (typeof a !== "string" || typeof b !== "string") return 0;
  if (a.length === 0 || b.length === 0) return 0;
  if (a === b) return 1;

  var halfSpan = Math.floor(Math.max(a.length, b.length) / 2) - 1; // allow:raw-byte-literal — Jaro match-window formula
  var reach = Math.max(0, halfSpan);
  var flagsA = new Array(a.length).fill(false);
  var flagsB = new Array(b.length).fill(false);
  var common = 0;

  // Pair each character of `a` with the first still-unclaimed equal
  // character of `b` inside the window.
  for (var p = 0; p < a.length; p++) {
    var q = Math.max(0, p - reach);
    var last = Math.min(b.length - 1, p + reach);
    while (q <= last) {
      if (!flagsB[q] && a.charAt(p) === b.charAt(q)) {
        flagsA[p] = true;
        flagsB[q] = true;
        common += 1;
        break;
      }
      q += 1;
    }
  }
  if (common === 0) return 0;

  // Read the matched characters off each side in order; every position
  // where the two sequences disagree is half a transposition.
  var seqA = [];
  for (var x = 0; x < a.length; x++) {
    if (flagsA[x]) seqA.push(a.charAt(x));
  }
  var seqB = [];
  for (var y = 0; y < b.length; y++) {
    if (flagsB[y]) seqB.push(b.charAt(y));
  }
  var outOfOrder = 0;
  for (var z = 0; z < seqA.length; z++) {
    if (seqA[z] !== seqB[z]) outOfOrder += 1;
  }
  var transpositions = outOfOrder / 2;

  return (common / a.length + common / b.length +
          (common - transpositions) / common) / 3; // allow:raw-byte-literal — Jaro 3-term formula
}
|
|
188
|
+
|
|
189
|
+
/**
 * Jaro-Winkler similarity: the Jaro score boosted for a shared prefix.
 *
 * @param {*} a
 * @param {*} b
 * @param {number} [prefixWeight]  Scaling factor for the prefix bonus.
 *   Falls back to 0.1 when omitted or not a finite number.
 * @returns {number} jaro + prefixLen * weight * (1 - jaro), where prefixLen
 *   is the shared-prefix length capped at 4; 0 when the Jaro score is 0.
 * @throws {FuzzyError} "fuzzy/bad-prefix-weight" when the weight is outside
 *   [0, 0.25].
 */
function jaroWinkler(a, b, prefixWeight) {
  // 0.1 is the default per the original Winkler paper; a lower value
  // reduces the prefix bias.
  var scale = 0.1;
  if (typeof prefixWeight === "number" && isFinite(prefixWeight)) {
    scale = prefixWeight;
  }
  if (scale < 0 || scale > 0.25) {
    throw new FuzzyError("fuzzy/bad-prefix-weight",
      "jaroWinkler: prefixWeight must be in [0, 0.25]");
  }

  var base = jaro(a, b);
  if (base === 0) return 0;

  // Length of the common prefix, up to 4 chars (Winkler's cap)
  var maxPrefix = 4; // allow:raw-byte-literal — Jaro-Winkler prefix cap (Winkler 1990)
  var limit = Math.min(a.length, b.length, maxPrefix);
  var shared = 0;
  while (shared < limit && a.charAt(shared) === b.charAt(shared)) {
    shared += 1;
  }
  return base + shared * scale * (1 - base);
}
|
|
210
|
+
|
|
211
|
+
// ---- Token-set similarity ----
|
|
212
|
+
|
|
213
|
+
/**
 * Order-insensitive similarity between two names, scored token by token.
 *
 * Each token of `a`, in order, claims the best-scoring still-unclaimed
 * token of `b` (greedy, by jaroWinkler). A pair only counts when its score
 * reaches the threshold. The result is the mean score of the counted pairs
 * multiplied by coverage, where coverage is
 * counted pairs / min(token count of a, token count of b).
 *
 * @param {*} a
 * @param {*} b
 * @param {object} [opts]
 * @param {number} [opts.prefixWeight]  Passed through to jaroWinkler.
 * @param {number} [opts.threshold]  Minimum pair score to count; falls back
 *   to 0.85 when omitted or not a finite number.
 * @returns {number} 0 when either side has no tokens or no pair reaches the
 *   threshold; otherwise mean pair score times coverage.
 */
function tokenSetSimilarity(a, b, opts) {
  var settings = opts || {};
  var weight = settings.prefixWeight;
  var minPairScore = 0.85;
  if (typeof settings.threshold === "number" && isFinite(settings.threshold)) {
    minPairScore = settings.threshold;
  }

  var left = tokenize(a);
  var right = tokenize(b);
  if (left.length === 0 || right.length === 0) return 0;

  // Greedy bipartite matching. This is O(n*m) in token counts, but the
  // typical name has ≤ 5 tokens so it's bounded.
  var claimed = right.map(function () { return false; });
  var total = 0;
  var pairs = 0;
  left.forEach(function (token) {
    var winner = -1;
    var winnerScore = 0;
    right.forEach(function (candidate, idx) {
      if (claimed[idx]) return;
      var score = jaroWinkler(token, candidate, weight);
      if (score > winnerScore) {
        winnerScore = score;
        winner = idx;
      }
    });
    if (winner === -1 || winnerScore < minPairScore) return;
    claimed[winner] = true;
    total += winnerScore;
    pairs += 1;
  });
  if (pairs === 0) return 0;

  // Mean of the counted pair scores, weighted by coverage of the side with
  // fewer tokens.
  var mean = total / pairs;
  var coverage = pairs / Math.min(left.length, right.length);
  return mean * coverage;
}
|
|
247
|
+
|
|
248
|
+
// ---- Container helpers ----
|
|
249
|
+
|
|
250
|
+
// substringContains — true when the normalized form of `needle` is a
// whitespace-bounded substring of the normalized form of `haystack`.
// Useful for catching SDN entries like "Acme Corp" inside a fuller
// local record like "Acme Corp Limited Liability Company".
//
// Returns false when either side normalizes to "" (non-string, empty, or
// punctuation-only input). Without that guard an empty needle is padded to
// two spaces, which matches an equally empty haystack and reports a
// containment hit between two blank names.
function substringContains(haystack, needle) {
  var normHaystack = normalize(haystack);
  var normNeedle = normalize(needle);
  if (normHaystack.length === 0 || normNeedle.length === 0) return false;
  // Pad both sides with a space so the match can only start and end on a
  // token boundary ("acme" must not match inside "acmefoo").
  var paddedHaystack = " " + normHaystack + " ";
  var paddedNeedle = " " + normNeedle + " ";
  return paddedHaystack.indexOf(paddedNeedle) !== -1;
}
|
|
259
|
+
|
|
260
|
+
// initialsMatch — true when `a` and `b` tokenize to the same number of
// tokens and, position by position, the tokens are either identical or
// one of them is a single character equal to the first character of the
// other ("J Smith" / "J. Smith" vs "John Smith"). Catches the common
// "screen-typo" pattern where the user typed an initial instead of a
// full first name.
//
// Because the token counts must be equal, a compact form such as "JS" is
// a single token and does NOT match "John Smith". Two names whose tokens
// are all identical also return true, even though no initial is involved.
function initialsMatch(a, b) {
  var left = tokenize(a);
  var right = tokenize(b);
  if (left.length === 0 || right.length === 0) return false;
  if (left.length !== right.length) return false;
  return left.every(function (tokA, pos) {
    var tokB = right[pos];
    if (tokA === tokB) return true;
    // Either side may be the initial; tokens are never empty, so charAt(0)
    // is always a real character.
    if (tokA.length === 1 && tokB.charAt(0) === tokA) return true;
    return tokB.length === 1 && tokA.charAt(0) === tokB;
  });
}
|
|
281
|
+
|
|
282
|
+
module.exports = {
|
|
283
|
+
normalize: normalize,
|
|
284
|
+
tokenize: tokenize,
|
|
285
|
+
levenshtein: levenshtein,
|
|
286
|
+
jaro: jaro,
|
|
287
|
+
jaroWinkler: jaroWinkler,
|
|
288
|
+
tokenSetSimilarity: tokenSetSimilarity,
|
|
289
|
+
substringContains: substringContains,
|
|
290
|
+
initialsMatch: initialsMatch,
|
|
291
|
+
FuzzyError: FuzzyError,
|
|
292
|
+
MAX_INPUT_LEN: MAX_INPUT_LEN,
|
|
293
|
+
};
|
|
294
|
+
// note: validateOpts intentionally not used in this file (pure
|
|
295
|
+
// algorithmic helpers); imported only to keep the require shape
|
|
296
|
+
// consistent with sister modules.
|
|
297
|
+
void validateOpts;
|