goldenmatch 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +140 -0
- package/dist/cli.cjs +6079 -0
- package/dist/cli.cjs.map +1 -0
- package/dist/cli.d.cts +1 -0
- package/dist/cli.d.ts +1 -0
- package/dist/cli.js +6076 -0
- package/dist/cli.js.map +1 -0
- package/dist/core/index.cjs +8449 -0
- package/dist/core/index.cjs.map +1 -0
- package/dist/core/index.d.cts +1972 -0
- package/dist/core/index.d.ts +1972 -0
- package/dist/core/index.js +8318 -0
- package/dist/core/index.js.map +1 -0
- package/dist/index.cjs +8449 -0
- package/dist/index.cjs.map +1 -0
- package/dist/index.d.cts +2 -0
- package/dist/index.d.ts +2 -0
- package/dist/index.js +8318 -0
- package/dist/index.js.map +1 -0
- package/dist/node/backends/score-worker.cjs +934 -0
- package/dist/node/backends/score-worker.cjs.map +1 -0
- package/dist/node/backends/score-worker.d.cts +14 -0
- package/dist/node/backends/score-worker.d.ts +14 -0
- package/dist/node/backends/score-worker.js +932 -0
- package/dist/node/backends/score-worker.js.map +1 -0
- package/dist/node/index.cjs +11430 -0
- package/dist/node/index.cjs.map +1 -0
- package/dist/node/index.d.cts +554 -0
- package/dist/node/index.d.ts +554 -0
- package/dist/node/index.js +11277 -0
- package/dist/node/index.js.map +1 -0
- package/dist/types-DhUdX5Rc.d.cts +304 -0
- package/dist/types-DhUdX5Rc.d.ts +304 -0
- package/examples/01-basic-dedupe.ts +60 -0
- package/examples/02-match-two-datasets.ts +48 -0
- package/examples/03-csv-file-pipeline.ts +62 -0
- package/examples/04-string-scoring.ts +63 -0
- package/examples/05-custom-config.ts +94 -0
- package/examples/06-probabilistic-fs.ts +72 -0
- package/examples/07-pprl-privacy.ts +76 -0
- package/examples/08-streaming.ts +79 -0
- package/examples/09-llm-scorer.ts +79 -0
- package/examples/10-explain.ts +60 -0
- package/examples/11-evaluate.ts +61 -0
- package/examples/README.md +53 -0
- package/package.json +66 -0
- package/src/cli.ts +372 -0
- package/src/core/ann-blocker.ts +593 -0
- package/src/core/api.ts +220 -0
- package/src/core/autoconfig.ts +363 -0
- package/src/core/autofix.ts +102 -0
- package/src/core/blocker.ts +655 -0
- package/src/core/cluster.ts +699 -0
- package/src/core/compare-clusters.ts +176 -0
- package/src/core/config/loader.ts +869 -0
- package/src/core/cross-encoder.ts +614 -0
- package/src/core/data.ts +430 -0
- package/src/core/domain.ts +277 -0
- package/src/core/embedder.ts +562 -0
- package/src/core/evaluate.ts +156 -0
- package/src/core/explain.ts +352 -0
- package/src/core/golden.ts +524 -0
- package/src/core/graph-er.ts +371 -0
- package/src/core/index.ts +314 -0
- package/src/core/ingest.ts +112 -0
- package/src/core/learned-blocking.ts +305 -0
- package/src/core/lineage.ts +221 -0
- package/src/core/llm/budget.ts +258 -0
- package/src/core/llm/cluster.ts +542 -0
- package/src/core/llm/scorer.ts +396 -0
- package/src/core/match-one.ts +95 -0
- package/src/core/matchkey.ts +97 -0
- package/src/core/memory/corrections.ts +179 -0
- package/src/core/memory/learner.ts +218 -0
- package/src/core/memory/store.ts +114 -0
- package/src/core/pipeline.ts +366 -0
- package/src/core/pprl/protocol.ts +216 -0
- package/src/core/probabilistic.ts +511 -0
- package/src/core/profiler.ts +212 -0
- package/src/core/quality.ts +197 -0
- package/src/core/review-queue.ts +177 -0
- package/src/core/scorer.ts +855 -0
- package/src/core/sensitivity.ts +196 -0
- package/src/core/standardize.ts +279 -0
- package/src/core/streaming.ts +128 -0
- package/src/core/transforms.ts +599 -0
- package/src/core/types.ts +570 -0
- package/src/core/validate.ts +243 -0
- package/src/index.ts +8 -0
- package/src/node/a2a/server.ts +470 -0
- package/src/node/api/server.ts +412 -0
- package/src/node/backends/duckdb.ts +130 -0
- package/src/node/backends/score-worker.ts +41 -0
- package/src/node/backends/workers.ts +212 -0
- package/src/node/config-file.ts +66 -0
- package/src/node/connectors/base.ts +57 -0
- package/src/node/connectors/bigquery.ts +61 -0
- package/src/node/connectors/databricks.ts +69 -0
- package/src/node/connectors/file.ts +350 -0
- package/src/node/connectors/hubspot.ts +62 -0
- package/src/node/connectors/index.ts +43 -0
- package/src/node/connectors/salesforce.ts +93 -0
- package/src/node/connectors/snowflake.ts +73 -0
- package/src/node/db/postgres.ts +173 -0
- package/src/node/db/sync.ts +103 -0
- package/src/node/dedupe-file.ts +156 -0
- package/src/node/index.ts +89 -0
- package/src/node/mcp/server.ts +940 -0
- package/src/node/tui/app.ts +756 -0
- package/src/node/tui/index.ts +6 -0
- package/src/node/tui/widgets.ts +128 -0
- package/tests/parity/scorer-ground-truth.test.ts +118 -0
- package/tests/smoke.test.ts +46 -0
- package/tests/unit/a2a-server.test.ts +175 -0
- package/tests/unit/ann-blocker.test.ts +117 -0
- package/tests/unit/api-server.test.ts +239 -0
- package/tests/unit/api.test.ts +77 -0
- package/tests/unit/autoconfig.test.ts +103 -0
- package/tests/unit/autofix.test.ts +71 -0
- package/tests/unit/blocker.test.ts +164 -0
- package/tests/unit/buildBlocksAsync.test.ts +63 -0
- package/tests/unit/cluster.test.ts +213 -0
- package/tests/unit/compare-clusters.test.ts +42 -0
- package/tests/unit/config-loader.test.ts +301 -0
- package/tests/unit/connectors-base.test.ts +48 -0
- package/tests/unit/cross-encoder-model.test.ts +198 -0
- package/tests/unit/cross-encoder.test.ts +173 -0
- package/tests/unit/db-connectors.test.ts +37 -0
- package/tests/unit/domain.test.ts +80 -0
- package/tests/unit/embedder.test.ts +151 -0
- package/tests/unit/evaluate.test.ts +85 -0
- package/tests/unit/explain.test.ts +73 -0
- package/tests/unit/golden.test.ts +97 -0
- package/tests/unit/graph-er.test.ts +173 -0
- package/tests/unit/hnsw-ann.test.ts +283 -0
- package/tests/unit/hubspot-connector.test.ts +118 -0
- package/tests/unit/ingest.test.ts +97 -0
- package/tests/unit/learned-blocking.test.ts +134 -0
- package/tests/unit/lineage.test.ts +135 -0
- package/tests/unit/match-one.test.ts +129 -0
- package/tests/unit/matchkey.test.ts +97 -0
- package/tests/unit/mcp-server.test.ts +183 -0
- package/tests/unit/memory.test.ts +119 -0
- package/tests/unit/pipeline.test.ts +118 -0
- package/tests/unit/pprl-protocol.test.ts +381 -0
- package/tests/unit/probabilistic.test.ts +494 -0
- package/tests/unit/profiler.test.ts +68 -0
- package/tests/unit/review-queue.test.ts +68 -0
- package/tests/unit/salesforce-connector.test.ts +148 -0
- package/tests/unit/scorer.test.ts +301 -0
- package/tests/unit/sensitivity.test.ts +154 -0
- package/tests/unit/standardize.test.ts +84 -0
- package/tests/unit/streaming.test.ts +82 -0
- package/tests/unit/transforms.test.ts +208 -0
- package/tests/unit/tui-widgets.test.ts +42 -0
- package/tests/unit/tui.test.ts +24 -0
- package/tests/unit/validate.test.ts +145 -0
- package/tests/unit/workers-parallel.test.ts +99 -0
- package/tests/unit/workers.test.ts +74 -0
- package/tsconfig.json +25 -0
- package/tsup.config.ts +37 -0
- package/vitest.config.ts +11 -0
|
@@ -0,0 +1,599 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* transforms.ts — Pure field transform utilities.
|
|
3
|
+
* Edge-safe: no Node.js imports, no `process`.
|
|
4
|
+
*
|
|
5
|
+
* Ports goldenmatch/utils/transforms.py.
|
|
6
|
+
*/
|
|
7
|
+
|
|
8
|
+
// ---------------------------------------------------------------------------
|
|
9
|
+
// Public API
|
|
10
|
+
// ---------------------------------------------------------------------------
|
|
11
|
+
|
|
12
|
+
/**
 * Non-parameterized transforms, keyed by their exact transform name.
 * Names that are absent here (and are not parameterized) are pass-through.
 */
const SIMPLE_TRANSFORMS: ReadonlyMap<string, (text: string) => string> = new Map<
  string,
  (text: string) => string
>([
  ["lowercase", (text) => text.toLowerCase()],
  ["uppercase", (text) => text.toUpperCase()],
  ["strip", (text) => text.trim()],
  ["strip_all", (text) => text.replace(/\s+/g, "")],
  ["digits_only", (text) => text.replace(/\D/g, "")],
  ["alpha_only", (text) => text.replace(/[^a-zA-Z]/g, "")],
  ["normalize_whitespace", (text) => text.trim().replace(/\s+/g, " ")],
  ["token_sort", (text) => text.trim().split(/\s+/).sort().join(" ")],
  ["first_token", (text) => text.trim().split(/\s+/)[0] ?? ""],
  [
    "last_token",
    (text) => {
      const pieces = text.trim().split(/\s+/);
      return pieces[pieces.length - 1] ?? "";
    },
  ],
  ["soundex", (text) => soundex(text)],
  ["metaphone", (text) => metaphone(text)],
]);

/**
 * Apply one named transform to a field value.
 *
 * @param value - The field value; `null` is propagated untouched.
 * @param transform - Transform name. Parameterized forms are recognised by
 *   prefix (`substring:start:end`, `qgram:n`, `bloom_filter...`); everything
 *   else is looked up by exact name.
 * @returns The transformed string, `null` for a `null` input, or the input
 *   unchanged when the transform name is not recognised.
 */
export function applyTransform(
  value: string | null,
  transform: string,
): string | null {
  if (value === null) return null;

  // Parameterized transforms are matched by prefix before the exact-name table.
  if (transform.startsWith("substring:")) return applySubstring(value, transform);
  if (transform.startsWith("qgram:")) return applyQgram(value, transform);
  if (transform.startsWith("bloom_filter")) return applyBloomFilter(value, transform);

  const simple = SIMPLE_TRANSFORMS.get(transform);
  return simple === undefined ? value : simple(value);
}
|
|
59
|
+
|
|
60
|
+
/**
 * Run a value through a sequence of transforms, left to right.
 *
 * @param value - Starting value; `null` yields `null`.
 * @param transforms - Transform names, applied in array order.
 * @returns The final value, or `null` as soon as any step produces `null`.
 */
export function applyTransforms(
  value: string | null,
  transforms: readonly string[],
): string | null {
  let current = value;
  // Stop early once the value is null: every later step would return null too.
  for (let step = 0; step < transforms.length && current !== null; step++) {
    current = applyTransform(current, transforms[step]!);
  }
  return current;
}
|
|
72
|
+
|
|
73
|
+
// ---------------------------------------------------------------------------
|
|
74
|
+
// Parameterized transforms
|
|
75
|
+
// ---------------------------------------------------------------------------
|
|
76
|
+
|
|
77
|
+
/**
 * `substring:start:end` — slice the value by Unicode code point.
 *
 * Indices follow `slice` semantics: negative values count from the end, a
 * missing `end` means "to the end of the string", and a non-numeric index is
 * treated as 0. Slicing is done on code points rather than UTF-16 code units
 * so that characters outside the BMP (e.g. emoji) are never cut in half,
 * which also matches how Python indexes `str`.
 *
 * @param value - Text to slice.
 * @param transform - The full transform spec, e.g. `"substring:0:3"`.
 * @returns The selected substring.
 */
function applySubstring(value: string, transform: string): string {
  const [, startSpec, endSpec] = transform.split(":");
  const start = parseInt(startSpec ?? "0", 10);
  const end = endSpec !== undefined ? parseInt(endSpec, 10) : undefined;
  // Array.from iterates by code point, keeping surrogate pairs together.
  return Array.from(value).slice(start, end).join("");
}
|
|
84
|
+
|
|
85
|
+
/**
 * `qgram:n` — split the value into overlapping character n-grams, sort them,
 * and join them with single spaces.
 *
 * The value is returned unchanged when `n` is missing, non-numeric, not
 * positive, or larger than the value's length. (Previously a non-numeric `n`
 * produced NaN, slipped past the guard and returned an empty string.)
 *
 * @param value - Text to split.
 * @param transform - The full transform spec, e.g. `"qgram:3"`.
 * @returns Sorted, space-separated n-grams, or `value` when no grams apply.
 */
function applyQgram(value: string, transform: string): string {
  const n = parseInt(transform.split(":")[1] ?? "2", 10);
  if (Number.isNaN(n) || n <= 0 || value.length < n) return value;

  const grams: string[] = [];
  for (let start = 0; start + n <= value.length; start++) {
    grams.push(value.slice(start, start + n));
  }
  return grams.sort().join(" ");
}
|
|
96
|
+
|
|
97
|
+
// ---------------------------------------------------------------------------
|
|
98
|
+
// Soundex — Robert Russell's algorithm
|
|
99
|
+
// ---------------------------------------------------------------------------
|
|
100
|
+
|
|
101
|
+
/**
 * Soundex digit for every coded consonant. Vowels, H, W and Y have no entry.
 */
const SOUNDEX_MAP: Record<string, string> = {};
for (const [digit, letters] of [
  ["1", "BFPV"],
  ["2", "CGJKQSXZ"],
  ["3", "DT"],
  ["4", "L"],
  ["5", "MN"],
  ["6", "R"],
] as const) {
  for (const letter of letters) {
    SOUNDEX_MAP[letter] = digit;
  }
}

/**
 * American Soundex code (letter + three digits) for a word.
 *
 * - Non A–Z characters are discarded and the rest is upper-cased.
 * - The first letter is kept verbatim; later consonants become digits.
 * - A consonant is dropped when it has the same digit as the previously
 *   coded letter. H and W are transparent to that comparison, while vowels
 *   and Y reset it, so the same digit may repeat across a vowel.
 * - The result is zero-padded to four characters.
 *
 * @param value - Input text.
 * @returns A four-character code, or `"0000"` when the input has no letters.
 */
export function soundex(value: string): string {
  const letters = value.toUpperCase().replace(/[^A-Z]/g, "");
  if (letters === "") return "0000";

  const head = letters.charAt(0);
  const digits: string[] = [];
  // Digit of the most recently coded letter; "0" means "nothing to collide with".
  let previous = SOUNDEX_MAP[head] ?? "0";

  for (const letter of letters.slice(1)) {
    if (digits.length === 3) break;

    const mapped = SOUNDEX_MAP[letter];
    if (mapped === undefined) {
      // Uncoded letter: only vowels and Y break a run of equal digits.
      if (letter !== "H" && letter !== "W") previous = "0";
      continue;
    }
    if (mapped !== previous) {
      digits.push(mapped);
      previous = mapped;
    }
  }

  return (head + digits.join("")).padEnd(4, "0");
}
|
|
144
|
+
|
|
145
|
+
// ---------------------------------------------------------------------------
|
|
146
|
+
// Simplified Metaphone (Lawrence Philips, 1990)
|
|
147
|
+
// ---------------------------------------------------------------------------
|
|
148
|
+
|
|
149
|
+
/**
 * True only for a single A/E/I/O/U. The empty string, used below as the
 * "no neighbour" sentinel at the start and end of the word, is NOT a vowel.
 * A bare `"AEIOU".includes("")` would return true and misclassify it.
 */
function isMetaphoneVowel(ch: string): boolean {
  return ch !== "" && "AEIOU".includes(ch);
}

/**
 * Simplified Metaphone (after Lawrence Philips, 1990).
 *
 * The input is upper-cased and stripped to A–Z, then scanned left to right,
 * emitting a phonetic code that is truncated to at most 4 characters.
 * `"0"` in the output stands for the "th" sound.
 *
 * Rules implemented here:
 * - Initial AE/GN/KN/PN/WR drop their first letter; a trailing MB drops the B.
 * - Repeated adjacent letters are collapsed, except C.
 * - Vowels are kept only as the very first letter.
 * - W and Y are kept only when a vowel follows; H only when a vowel follows
 *   and no vowel precedes it.
 *
 * @param value - Input text.
 * @returns The phonetic code (0–4 characters); `""` when there are no letters.
 */
export function metaphone(value: string): string {
  let word = value.toUpperCase().replace(/[^A-Z]/g, "");
  if (word.length === 0) return "";

  // Drop the first letter of an initial silent pair.
  if (["AE", "GN", "KN", "PN", "WR"].some((prefix) => word.startsWith(prefix))) {
    word = word.slice(1);
  }

  // Drop trailing MB (silent B).
  if (word.endsWith("MB")) {
    word = word.slice(0, -1);
  }

  let code = "";
  let i = 0;

  while (i < word.length && code.length < 4) {
    const ch = word[i]!;
    // "" marks a missing neighbour (start or end of word).
    const next = word[i + 1] ?? "";
    const afterNext = word[i + 2] ?? "";
    const prev = i > 0 ? word[i - 1]! : "";

    // Skip duplicate adjacent letters (except C).
    if (ch === prev && ch !== "C") {
      i++;
      continue;
    }

    switch (ch) {
      case "A":
      case "E":
      case "I":
      case "O":
      case "U":
        // Vowels are only kept at the beginning.
        if (i === 0) code += ch;
        break;

      case "B":
        code += "B";
        break;

      case "C":
        code += next === "I" || next === "E" || next === "Y" ? "S" : "K";
        break;

      case "D":
        // DGE / DGI / DGY sound like J; the letter after G must really exist.
        if (next === "G" && afterNext !== "" && "IEY".includes(afterNext)) {
          code += "J";
        } else {
          code += "T";
        }
        break;

      case "F":
        code += "F";
        break;

      case "G":
        if (next === "H" && i + 2 < word.length && !isMetaphoneVowel(afterNext)) {
          // GH before a non-vowel is silent: skip both letters.
          i += 2;
          continue;
        }
        if (i > 0 && next === "N") {
          // Non-initial G directly before N is silent.
          // NOTE(review): this applies to every non-initial GN, not only a
          // word-final GN/GNED — confirm against the Python reference.
        } else if (next === "I" || next === "E" || next === "Y") {
          code += "J";
        } else {
          code += "K";
        }
        break;

      case "H":
        // Sounded only between a non-vowel (or word start) and a vowel.
        if (isMetaphoneVowel(next) && !isMetaphoneVowel(prev)) {
          code += "H";
        }
        break;

      case "J":
        code += "J";
        break;

      case "K":
        // CK collapses to the K already emitted for C.
        if (prev !== "C") code += "K";
        break;

      case "L":
        code += "L";
        break;

      case "M":
        code += "M";
        break;

      case "N":
        code += "N";
        break;

      case "P":
        if (next === "H") {
          code += "F";
          i++; // consume the H
        } else {
          code += "P";
        }
        break;

      case "Q":
        code += "K";
        break;

      case "R":
        code += "R";
        break;

      case "S":
        if (next === "H" || (next === "I" && (afterNext === "O" || afterNext === "A"))) {
          code += "X";
          i++; // consume the H / I
        } else if (next === "C" && afterNext === "H") {
          code += "SK";
          i += 2; // consume CH
        } else {
          code += "S";
        }
        break;

      case "T":
        if (next === "H") {
          code += "0"; // theta
          i++; // consume the H
        } else if (next === "I" && (afterNext === "O" || afterNext === "A")) {
          code += "X";
        } else {
          code += "T";
        }
        break;

      case "V":
        code += "F";
        break;

      case "W":
      case "Y":
        // Kept only when a real vowel follows; silent at the end of the word.
        if (isMetaphoneVowel(next)) {
          code += ch;
        }
        break;

      case "X":
        code += "KS";
        break;

      case "Z":
        code += "S";
        break;

      default:
        break;
    }
    i++;
  }

  // Two-character emissions (KS, SK) can overshoot the limit by one.
  return code.slice(0, 4);
}
|
|
328
|
+
|
|
329
|
+
// ---------------------------------------------------------------------------
|
|
330
|
+
// Bloom filter transform (pure TS, for PPRL) — SHA-256 parity with Python
|
|
331
|
+
// ---------------------------------------------------------------------------
|
|
332
|
+
|
|
333
|
+
/**
 * Parameters that define one bloom-filter security level.
 *
 * See goldenmatch/utils/transforms.py::_bloom_filter_transform for the
 * reference algorithm these values are meant to match.
 */
interface BloomPreset {
  /** Filter length in bits. */
  readonly size: number;
  /** Number of hash functions applied to each n-gram. */
  readonly k: number;
  /** Character n-gram length. */
  readonly ngram: number;
  /** Whether n-grams are hashed with keyed HMAC-SHA256 instead of plain SHA-256. */
  readonly hmac: boolean;
  /** Whether short values get the deterministic "balanced" salt appended. */
  readonly balanced: boolean;
}

/**
 * Named security levels:
 *
 *   standard: 512-bit,  20 hash functions, 2-grams
 *   high:     1024-bit, 30 hash functions, 2-grams, HMAC-SHA256 salting
 *   paranoid: 2048-bit, 40 hash functions, 3-grams, HMAC-SHA256 + balanced padding
 */
const BLOOM_PRESETS: Record<string, BloomPreset> = {
  standard: {
    size: 512,
    k: 20,
    ngram: 2,
    hmac: false,
    balanced: false,
  },
  high: {
    size: 1024,
    k: 30,
    ngram: 2,
    hmac: true,
    balanced: false,
  },
  paranoid: {
    size: 2048,
    k: 40,
    ngram: 3,
    hmac: true,
    balanced: true,
  },
};

/** Parameters used for the bare "bloom_filter" form (no level, no numbers). */
const BLOOM_DEFAULTS = {
  size: 1024,
  k: 20,
  ngram: 2,
};

/** HMAC key used by the high/paranoid presets when no key override is given. */
const BLOOM_DEFAULT_HMAC_KEY = "default_field_key";
|
|
362
|
+
|
|
363
|
+
/**
 * Build a CLK (Cryptographic Longterm Key) bloom filter and return it as a
 * lowercase hex string.
 *
 * Forms accepted:
 *   - "bloom_filter"                                 -> defaults (1024/20/2, no hmac)
 *   - "bloom_filter:standard"                        -> preset
 *   - "bloom_filter:high[:customKey]"                -> preset, optional HMAC key override
 *   - "bloom_filter:paranoid[:customKey]"            -> preset, optional HMAC key override
 *   - "bloom_filter:<ngram>:<k>:<size>[:hmac_key]"   -> fully parametric
 *
 * @param value - Field value to encode. It is lower-cased and trimmed first.
 * @param transform - The full transform spec (see forms above).
 * @returns Hex encoding of the filter bytes (`floor(size / 8)` bytes).
 * @throws Error when the resolved n-gram length, hash count or filter size is
 *   not a positive integer — e.g. a mistyped preset name. Such specs used to
 *   yield an empty or all-zero filter for every record without any warning.
 */
function applyBloomFilter(value: string, transform: string): string {
  let ngramSize = BLOOM_DEFAULTS.ngram;
  let numHashes = BLOOM_DEFAULTS.k;
  let filterSize = BLOOM_DEFAULTS.size;
  let hmacKey: string | null = null;
  let balanced = false;

  if (transform !== "bloom_filter") {
    const parts = transform.split(":");
    const maybeLevel = parts[1];
    // Own-property check: a plain `in` would also accept inherited names such
    // as "constructor" or "toString" and treat them as presets.
    const preset =
      maybeLevel !== undefined &&
      Object.prototype.hasOwnProperty.call(BLOOM_PRESETS, maybeLevel)
        ? BLOOM_PRESETS[maybeLevel]
        : undefined;

    if (preset !== undefined) {
      ngramSize = preset.ngram;
      numHashes = preset.k;
      filterSize = preset.size;
      balanced = preset.balanced;
      if (preset.hmac) {
        // Allow a per-field HMAC key override via bloom_filter:<level>:<key>.
        hmacKey = parts[2] && parts[2].length > 0 ? parts[2] : BLOOM_DEFAULT_HMAC_KEY;
      }
    } else {
      // Parametric form: bloom_filter:<ngram>:<k>:<size>[:hmac_key]
      ngramSize = parseInt(parts[1] ?? String(BLOOM_DEFAULTS.ngram), 10);
      numHashes = parseInt(parts[2] ?? String(BLOOM_DEFAULTS.k), 10);
      filterSize = parseInt(parts[3] ?? String(BLOOM_DEFAULTS.size), 10);
      if (parts.length > 4 && parts[4]!.length > 0) {
        hmacKey = parts[4]!;
      }
    }
  }

  // Fail loudly on unusable parameters. The message deliberately does not
  // echo the transform string, because it may contain an HMAC key.
  for (const [label, amount] of [
    ["ngram", ngramSize],
    ["k", numHashes],
    ["size", filterSize],
  ] as const) {
    if (!Number.isInteger(amount) || amount <= 0) {
      throw new Error(`bloom_filter transform: "${label}" must be a positive integer`);
    }
  }

  // NOTE(review): a size that is not a multiple of 8 is rounded down to whole
  // bytes, so bit positions past the last byte are dropped — confirm this
  // matches the Python reference.
  const filterBytes = Math.floor(filterSize / 8);
  const bits = new Uint8Array(filterBytes);

  // Normalise (lower-case, trim), then right-pad with '_' up to ngramSize so
  // that even very short values produce at least one n-gram.
  let padded = value.toLowerCase().trim();
  if (padded.length < ngramSize) {
    padded = padded.padEnd(ngramSize, "_");
  }

  // Balanced padding: append a deterministic salt to values shorter than 8
  // characters so that they set more bits.
  if (balanced && padded.length < 8) {
    const salt = sha256Hex(padded).slice(0, 8);
    padded = padded + salt;
  }

  // Generate character n-grams.
  const ngrams: string[] = [];
  for (let i = 0; i <= padded.length - ngramSize; i++) {
    ngrams.push(padded.slice(i, i + ngramSize));
  }

  // Hash each n-gram k times and set the corresponding bit.
  for (const ngram of ngrams) {
    for (let k = 0; k < numHashes; k++) {
      const hex =
        hmacKey !== null
          ? hmacSha256Hex(`${hmacKey}:${k}`, ngram)
          : sha256Hex(`${k}:${ngram}`);
      // bit_pos = int(h, 16) % filter_size
      const bitPos = modHexBigInt(hex, filterSize);
      bits[bitPos >> 3]! |= 1 << (bitPos & 7);
    }
  }

  return hexEncode(bits);
}
|
|
441
|
+
|
|
442
|
+
/**
 * Reduce an arbitrarily long hexadecimal number modulo a regular Number.
 *
 * BigInt is used only for the division itself; the remainder is smaller than
 * `modulus`, so converting it back to a Number is exact.
 *
 * @param hex - Hex digits without a "0x" prefix.
 * @param modulus - The divisor; converted with `BigInt(modulus)`.
 * @returns `hex mod modulus` as a non-negative Number.
 */
function modHexBigInt(hex: string, modulus: number): number {
  return Number(BigInt(`0x${hex}`) % BigInt(modulus));
}
|
|
450
|
+
|
|
451
|
+
/**
 * Render bytes as lowercase hexadecimal, two digits per byte.
 *
 * @param bytes - Bytes to encode.
 * @returns A string of length `2 * bytes.length`; `""` for an empty array.
 */
function hexEncode(bytes: Uint8Array): string {
  return Array.from(bytes, (byte) => byte.toString(16).padStart(2, "0")).join("");
}
|
|
459
|
+
|
|
460
|
+
// ---------------------------------------------------------------------------
|
|
461
|
+
// SHA-256 — pure TS, edge-safe (no node: imports)
|
|
462
|
+
// ---------------------------------------------------------------------------
|
|
463
|
+
|
|
464
|
+
// SHA-256 round constants K[0..63] as listed in FIPS 180-4, four per row.
const K256 = Uint32Array.of(
  0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
  0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
  0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
  0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
  0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
  0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
  0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
  0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
  0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
  0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
  0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
  0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
  0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
  0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
  0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
  0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2,
);
|
|
475
|
+
|
|
476
|
+
/**
 * Rotate a 32-bit word right by `n` bits.
 *
 * @param x - The word to rotate (treated as 32-bit).
 * @param n - Rotation distance in bits.
 * @returns The rotated word as an unsigned 32-bit Number.
 */
function rotr32(x: number, n: number): number {
  const shiftedDown = x >>> n;
  const wrappedAround = x << (32 - n);
  // `>>> 0` reinterprets the signed bitwise result as unsigned.
  return (wrappedAround | shiftedDown) >>> 0;
}
|
|
479
|
+
|
|
480
|
+
/** Lazily created encoder shared by every utf8Encode call. */
let sharedUtf8Encoder: TextEncoder | undefined;

/**
 * UTF-8 encode a string to bytes (edge-safe — uses the global TextEncoder).
 *
 * The encoder is created on first use and then reused: this function is
 * called once per hash, so building a new TextEncoder every time was
 * avoidable allocation in the hottest loop of the bloom-filter transform.
 *
 * @param input - Text to encode.
 * @returns A freshly allocated byte array holding the UTF-8 encoding.
 */
function utf8Encode(input: string): Uint8Array {
  if (sharedUtf8Encoder === undefined) {
    sharedUtf8Encoder = new TextEncoder();
  }
  return sharedUtf8Encoder.encode(input);
}
|
|
484
|
+
|
|
485
|
+
/**
 * SHA-256 core (FIPS 180-4): hash a byte array.
 *
 * @param msg - Message bytes; not modified.
 * @returns A new 32-byte array holding the digest, big-endian per word.
 */
function sha256Bytes(msg: Uint8Array): Uint8Array {
  // Initial hash state H0..H7.
  const state = new Uint32Array([
    0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a,
    0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19,
  ]);

  // Padding: message, a single 0x80 byte, zeros, then the 64-bit big-endian
  // bit length, with the total rounded up to a multiple of 64 bytes.
  const byteCount = msg.length;
  const bitCount = byteCount * 8;
  const paddedLength = ((byteCount + 9 + 63) >> 6) << 6;
  const block = new Uint8Array(paddedLength);
  block.set(msg);
  block[byteCount] = 0x80;

  const view = new DataView(block.buffer);
  // The bit length is split into two 32-bit halves, high word first.
  view.setUint32(paddedLength - 8, Math.floor(bitCount / 0x100000000), false);
  view.setUint32(paddedLength - 4, bitCount >>> 0, false);

  const schedule = new Uint32Array(64);

  for (let base = 0; base < paddedLength; base += 64) {
    // Message schedule: 16 words read from the block, 48 derived from them.
    for (let t = 0; t < 64; t++) {
      if (t < 16) {
        schedule[t] = view.getUint32(base + t * 4, false);
        continue;
      }
      const older = schedule[t - 15]!;
      const newer = schedule[t - 2]!;
      const smallSigma0 = rotr32(older, 7) ^ rotr32(older, 18) ^ (older >>> 3);
      const smallSigma1 = rotr32(newer, 17) ^ rotr32(newer, 19) ^ (newer >>> 10);
      schedule[t] = (schedule[t - 16]! + smallSigma0 + schedule[t - 7]! + smallSigma1) >>> 0;
    }

    // Working variables start from the current state.
    let a = state[0]!;
    let b = state[1]!;
    let c = state[2]!;
    let d = state[3]!;
    let e = state[4]!;
    let f = state[5]!;
    let g = state[6]!;
    let h = state[7]!;

    // 64 compression rounds.
    for (let t = 0; t < 64; t++) {
      const bigSigma1 = rotr32(e, 6) ^ rotr32(e, 11) ^ rotr32(e, 25);
      const choose = (e & f) ^ (~e & g);
      const temp1 = (h + bigSigma1 + choose + K256[t]! + schedule[t]!) >>> 0;
      const bigSigma0 = rotr32(a, 2) ^ rotr32(a, 13) ^ rotr32(a, 22);
      const majority = (a & b) ^ (a & c) ^ (b & c);
      const temp2 = (bigSigma0 + majority) >>> 0;

      h = g;
      g = f;
      f = e;
      e = (d + temp1) >>> 0;
      d = c;
      c = b;
      b = a;
      a = (temp1 + temp2) >>> 0;
    }

    // Fold the round output back into the state (mod 2^32).
    const working = [a, b, c, d, e, f, g, h];
    for (let idx = 0; idx < 8; idx++) {
      state[idx] = (state[idx]! + working[idx]!) >>> 0;
    }
  }

  // Serialise the eight state words big-endian.
  const digest = new Uint8Array(32);
  const digestView = new DataView(digest.buffer);
  state.forEach((word, idx) => digestView.setUint32(idx * 4, word, false));
  return digest;
}
|
|
558
|
+
|
|
559
|
+
/**
 * Hash a string with SHA-256 and return the digest as 64 lowercase hex
 * characters. The string is UTF-8 encoded first.
 *
 * Intended to equal Python's `hashlib.sha256(s.encode()).hexdigest()`.
 *
 * @param input - Text to hash.
 * @returns The hex-encoded digest.
 */
export function sha256Hex(input: string): string {
  const digest = sha256Bytes(utf8Encode(input));
  return hexEncode(digest);
}
|
|
567
|
+
|
|
568
|
+
/**
 * HMAC-SHA256 of `msg` under `key`, as 64 lowercase hex characters.
 * Both arguments are UTF-8 encoded before hashing.
 *
 * Intended to equal Python's
 * `hmac.new(key.encode(), msg.encode(), hashlib.sha256).hexdigest()`.
 *
 * @param key - Secret key; keys longer than the 64-byte block are hashed first.
 * @param msg - Message to authenticate.
 * @returns The hex-encoded MAC.
 */
export function hmacSha256Hex(key: string, msg: string): string {
  const BLOCK_BYTES = 64;

  // Keys longer than one block are replaced by their SHA-256 digest.
  const rawKey = utf8Encode(key);
  const blockKey = rawKey.length > BLOCK_BYTES ? sha256Bytes(rawKey) : rawKey;

  // Start from the pad constants (the key is implicitly zero-padded, and
  // 0 ^ pad === pad), then mix in the bytes the key actually has.
  const innerPad = new Uint8Array(BLOCK_BYTES).fill(0x36);
  const outerPad = new Uint8Array(BLOCK_BYTES).fill(0x5c);
  blockKey.forEach((keyByte, idx) => {
    innerPad[idx] = keyByte ^ 0x36;
    outerPad[idx] = keyByte ^ 0x5c;
  });

  // inner = H(innerPad || msg)
  const message = utf8Encode(msg);
  const innerInput = new Uint8Array(BLOCK_BYTES + message.length);
  innerInput.set(innerPad, 0);
  innerInput.set(message, BLOCK_BYTES);
  const innerDigest = sha256Bytes(innerInput);

  // result = H(outerPad || inner)
  const outerInput = new Uint8Array(BLOCK_BYTES + innerDigest.length);
  outerInput.set(outerPad, 0);
  outerInput.set(innerDigest, BLOCK_BYTES);
  return hexEncode(sha256Bytes(outerInput));
}
|