min-mphash 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.en.md +321 -0
- package/README.md +335 -0
- package/README.zh.md +323 -0
- package/dist/MinMPHash.d.ts +140 -0
- package/dist/MinMPLookup.d.ts +115 -0
- package/dist/index.d.ts +4 -0
- package/dist/index.js +937 -0
- package/dist/util.d.ts +26 -0
- package/package.json +37 -0
package/dist/index.js
ADDED
|
@@ -0,0 +1,937 @@
|
|
|
1
|
+
/**
 * Stores `value` under `key` on `obj` and hands `obj` back.
 *
 * If `key` is already reachable on the object (own or inherited), the
 * property is (re)defined as an own, enumerable, configurable, writable data
 * property. Otherwise a plain assignment is performed.
 *
 * @param {object} obj - Target object.
 * @param {string|symbol} key - Property name.
 * @param {*} value - Value to store.
 * @returns {object} The same `obj`.
 */
function _define_property(obj, key, value) {
    if (!(key in obj)) {
        obj[key] = value;
        return obj;
    }
    const descriptor = {
        value,
        enumerable: true,
        configurable: true,
        writable: true
    };
    Object.defineProperty(obj, key, descriptor);
    return obj;
}
|
|
11
|
+
/**
 * Encodes `val` as a little-endian base-128 varint (7 payload bits per byte,
 * high bit set on every byte except the last).
 *
 * Output modes:
 *  - `buffer` is a plain Array: bytes are appended with `push`.
 *  - otherwise, if `offset` is provided: bytes are stored at
 *    `buffer[offset + i]`.
 *  - otherwise: nothing is written and only the encoded size is computed.
 *
 * @param {number} val - Value to encode.
 * @param {Array<number>|Uint8Array|undefined} buffer - Destination.
 * @param {number} [offset] - Start index for indexed (non-Array) writes.
 * @returns {number} Number of bytes the encoding occupies.
 */
function writeVarInt(val, buffer, offset) {
    const appendMode = Array.isArray(buffer);
    const indexedMode = !appendMode && void 0 !== offset;
    const emit = (byte, position) => {
        if (appendMode) buffer.push(byte);
        else if (indexedMode) buffer[offset + position] = byte;
    };

    let written = 0;
    let rest = val;
    for (; rest >= 0x80; rest >>>= 7) {
        emit((rest & 0x7f) | 0x80, written);
        written += 1;
    }
    emit(rest, written);
    return written + 1;
}
|
|
24
|
+
/**
 * Decodes a little-endian base-128 varint starting at `buffer[offset]`.
 *
 * The value is accumulated with 32-bit bitwise operators, so the result is
 * normalised with `>>> 0` before returning. Without that, any value with
 * bit 31 set (which `writeVarInt` encodes happily) would come back negative.
 *
 * Reading past the end of `buffer` yields `undefined`, which the bitwise
 * operators treat as a zero byte, so a truncated stream terminates the loop
 * instead of spinning forever.
 *
 * @param {ArrayLike<number>} buffer - Bytes to read from.
 * @param {number} offset - Index of the first varint byte.
 * @returns {{value: number, bytes: number}} Decoded unsigned value and the
 *   number of bytes consumed.
 */
function readVarInt(buffer, offset) {
    let val = 0;
    let shift = 0;
    let bytes = 0;
    while (true) {
        const b = buffer[offset + bytes];
        val |= (0x7f & b) << shift;
        bytes++;
        // A clear continuation bit marks the final byte.
        if ((0x80 & b) === 0) break;
        shift += 7;
    }
    return {
        value: val >>> 0,
        bytes
    };
}
|
|
40
|
+
/**
 * Minimal CBOR codec covering only what this file needs: unsigned integers
 * (major 0x00), byte strings (major 0x40), arrays (major 0x80) and `null`
 * (0xf6). Encoders append bytes to a plain number array; `decode` reads from
 * a DataView and advances `offsetRef.current`.
 */
const CBOR = {
    /**
     * Appends a CBOR head: the major-type bits combined with either the
     * value itself (< 24) or a 1/2/4-byte big-endian argument.
     *
     * @param {number} major - Major-type bits (0x00, 0x40 or 0x80).
     * @param {number} val - Unsigned argument (value or length), up to 32 bits.
     * @param {number[]} buffer - Output byte array.
     */
    encodeHead (major, val, buffer) {
        if (val < 24) buffer.push(major | val);
        else if (val <= 0xff) buffer.push(24 | major, val);
        else if (val <= 0xffff) buffer.push(25 | major, val >> 8, 0xff & val);
        else buffer.push(26 | major, (val >>> 24) & 0xff, (val >>> 16) & 0xff, (val >>> 8) & 0xff, 0xff & val);
    },
    /** Appends an unsigned integer (up to 32 bits). */
    encodeInt (val, buffer) {
        CBOR.encodeHead(0x00, val, buffer);
    },
    /** Appends a byte string: length head followed by the raw bytes. */
    encodeBytes (bytes, buffer) {
        const len = bytes.byteLength;
        CBOR.encodeHead(0x40, len, buffer);
        for (let i = 0; i < len; i++) buffer.push(bytes[i]);
    },
    /** Appends the CBOR `null` simple value. */
    encodeNull (buffer) {
        buffer.push(0xf6);
    },
    /**
     * Appends an array head announcing `len` items. Lengths of 24 and above
     * use the extended argument encodings that `decode` already understands
     * (previously such lengths silently produced no output at all).
     */
    encodeArrayHead (len, buffer) {
        CBOR.encodeHead(0x80, len, buffer);
    },
    /**
     * Decodes one item at `offsetRef.current`, advancing the offset past it.
     *
     * @param {DataView} view - Source bytes.
     * @param {{current: number}} offsetRef - Mutable read cursor.
     * @returns {number|Uint8Array|Array|null} Decoded item; byte strings are
     *   returned as a copy backed by a fresh ArrayBuffer.
     * @throws {Error} On unsupported argument sizes, unknown types, or a byte
     *   string whose declared length runs past the end of the view.
     */
    decode (view, offsetRef) {
        const byte = view.getUint8(offsetRef.current++);
        const major = 0xe0 & byte;
        const additional = 0x1f & byte;
        let val = 0;
        if (additional < 24) val = additional;
        else if (24 === additional) {
            val = view.getUint8(offsetRef.current);
            offsetRef.current += 1;
        } else if (25 === additional) {
            val = view.getUint16(offsetRef.current, false);
            offsetRef.current += 2;
        } else if (26 === additional) {
            val = view.getUint32(offsetRef.current, false);
            offsetRef.current += 4;
        } else throw new Error("Unsupported CBOR size");
        if (0x00 === major) return val;
        if (0x40 === major) {
            const len = val;
            // ArrayBuffer#slice clamps silently; reject truncated input instead.
            if (offsetRef.current + len > view.byteLength) throw new Error("Truncated CBOR byte string");
            const start = view.byteOffset + offsetRef.current;
            const buf = new Uint8Array(view.buffer.slice(start, start + len));
            offsetRef.current += len;
            return buf;
        }
        if (0x80 === major) {
            const len = val;
            const arr = [];
            for (let i = 0; i < len; i++) arr.push(CBOR.decode(view, offsetRef));
            return arr;
        }
        // 0xf6 has major bits 0xe0 and additional 22, so it reaches this point.
        if (0xf6 === byte) return null;
        throw new Error(`Unknown CBOR type: ${byte.toString(16)}`);
    }
};
|
|
97
|
+
// Integer code stored in the serialized dictionary for each validation mode.
// INT_TO_MODE is the inverse table, indexed by that code.
const MODE_TO_INT = {
    "none": 0,
    "4": 1,
    "8": 2,
    "16": 3,
    "32": 4,
    "2": 5
};
const INT_TO_MODE = ["none", "4", "8", "16", "32", "2"];
|
|
113
|
+
/**
 * Serializes a hash dictionary into a 9-element CBOR array, in this order:
 * n, m, seed0, bucketSizes, seedStream, validation-mode code, fingerprints
 * (or null), seedZeroBitmap (or null), hashSeed (0 when falsy).
 *
 * Fingerprints are written as raw bytes. Typed arrays are viewed in place;
 * any other array-like is first copied into the typed array matching the
 * validation mode (Uint8 for "2"/"4"/"8", Uint16 for "16", Uint32 otherwise).
 *
 * @param {object} dict - Dictionary object to serialize.
 * @returns {Uint8Array} CBOR bytes.
 */
function dictToCBOR(dict) {
    const asByteView = (typed) => new Uint8Array(typed.buffer, typed.byteOffset, typed.byteLength);
    const fingerprintBytes = () => {
        const fp = dict.fingerprints;
        if (fp instanceof Uint8Array) return fp;
        if (fp instanceof Uint16Array || fp instanceof Uint32Array) return asByteView(fp);
        switch (dict.validationMode) {
            case "2":
            case "4":
            case "8":
                return asByteView(new Uint8Array(fp));
            case "16":
                return asByteView(new Uint16Array(fp));
            default:
                return asByteView(new Uint32Array(fp));
        }
    };

    const out = [];
    CBOR.encodeArrayHead(9, out);
    CBOR.encodeInt(dict.n, out);
    CBOR.encodeInt(dict.m, out);
    CBOR.encodeInt(dict.seed0, out);
    CBOR.encodeBytes(dict.bucketSizes, out);
    CBOR.encodeBytes(dict.seedStream, out);
    CBOR.encodeInt(MODE_TO_INT[dict.validationMode], out);

    const hasFingerprints = Boolean(dict.fingerprints) && dict.validationMode !== "none";
    if (hasFingerprints) {
        CBOR.encodeBytes(fingerprintBytes(), out);
    } else {
        CBOR.encodeNull(out);
    }

    if (dict.seedZeroBitmap) {
        CBOR.encodeBytes(dict.seedZeroBitmap, out);
    } else {
        CBOR.encodeNull(out);
    }

    CBOR.encodeInt(dict.hashSeed || 0, out);
    return new Uint8Array(out);
}
|
|
139
|
+
/**
 * Parses the CBOR array produced by `dictToCBOR` back into a dictionary
 * object.
 *
 * At least 7 elements are required; the trailing seedZeroBitmap and hashSeed
 * entries are optional and default to `undefined` and `0` respectively.
 * Fingerprint bytes are re-viewed as Uint16/Uint32 arrays for the "16"/"32"
 * modes and passed through unchanged for "2"/"4"/"8".
 *
 * @param {Uint8Array} bin - Serialized dictionary.
 * @returns {object} Dictionary object.
 * @throws {Error} "Invalid CBOR format" when the payload is not an array of
 *   at least 7 items.
 */
function dictFromCBOR(bin) {
    const cursor = { current: 0 };
    const decoded = CBOR.decode(new DataView(bin.buffer, bin.byteOffset, bin.byteLength), cursor);
    if (!(Array.isArray(decoded) && decoded.length >= 7)) {
        throw new Error("Invalid CBOR format");
    }

    const [n, m, seed0, bucketSizes, seedStream, modeInt, fpRaw, seedZeroBitmap, hashSeed] = decoded;
    const validationMode = INT_TO_MODE[modeInt] || "none";

    let fingerprints;
    if (fpRaw) {
        switch (validationMode) {
            case "2":
            case "4":
            case "8":
                fingerprints = fpRaw;
                break;
            case "16":
                fingerprints = new Uint16Array(fpRaw.buffer, fpRaw.byteOffset, fpRaw.byteLength / 2);
                break;
            case "32":
                fingerprints = new Uint32Array(fpRaw.buffer, fpRaw.byteOffset, fpRaw.byteLength / 4);
                break;
            default:
                // "none": fingerprints stay undefined.
                break;
        }
    }

    return {
        n,
        m,
        seed0,
        hashSeed: hashSeed || 0,
        bucketSizes,
        seedStream,
        validationMode,
        fingerprints,
        seedZeroBitmap: seedZeroBitmap || void 0
    };
}
|
|
166
|
+
/**
 * Gzip-compresses `data` by piping it through a `CompressionStream`.
 *
 * @param {Uint8Array} data - Bytes to compress.
 * @returns {Promise<Uint8Array>} Gzip-compressed bytes.
 */
async function compressIBinary(data) {
    const source = new Blob([data]).stream();
    const gzipped = source.pipeThrough(new CompressionStream("gzip"));
    const payload = await new Response(gzipped).arrayBuffer();
    return new Uint8Array(payload);
}
|
|
172
|
+
/**
 * Inflates gzip-compressed `data` by piping it through a
 * `DecompressionStream`.
 *
 * @param {Uint8Array} data - Gzip-compressed bytes.
 * @returns {Promise<Uint8Array>} Decompressed bytes.
 */
async function decompressIBinary(data) {
    const source = new Blob([data]).stream();
    const inflated = source.pipeThrough(new DecompressionStream("gzip"));
    const payload = await new Response(inflated).arrayBuffer();
    return new Uint8Array(payload);
}
|
|
178
|
+
/**
 * Accumulates values bit by bit, least-significant bit first, into a growing
 * list of bytes.
 */
class BitWriter {
    constructor() {
        this.buffer = [];
        this.currentByte = 0;
        this.bitCount = 0;
    }

    /**
     * Appends the low `bits` bits of `value`, LSB first.
     *
     * @param {number} value - Source value.
     * @param {number} bits - How many bits to append.
     */
    write(value, bits) {
        let position = 0;
        while (position < bits) {
            const bit = (value >> position) & 1;
            this.currentByte |= bit << this.bitCount;
            this.bitCount += 1;
            position += 1;
            // A full byte is emitted right away.
            if (this.bitCount === 8) {
                this.buffer.push(this.currentByte);
                this.currentByte = 0;
                this.bitCount = 0;
            }
        }
    }

    /** Emits the partially filled byte, if any; unused high bits stay zero. */
    flush() {
        if (this.bitCount <= 0) return;
        this.buffer.push(this.currentByte);
        this.currentByte = 0;
        this.bitCount = 0;
    }

    /**
     * Flushes pending bits and returns everything written so far.
     *
     * @returns {Uint8Array} Copy of the written bytes.
     */
    getData() {
        this.flush();
        return new Uint8Array(this.buffer);
    }
}
|
|
208
|
+
/**
 * Sequential reader for bit streams packed least-significant bit first.
 */
class BitReader {
    /**
     * @param {ArrayLike<number>} buffer - Packed bytes to read from.
     */
    constructor(buffer) {
        this.buffer = buffer;
        this.byteOffset = 0;
        this.bitOffset = 0;
    }

    /**
     * Reads the next `bits` bits as an unsigned integer.
     *
     * The result is normalised with `>>> 0`, so a 32-bit read whose top bit
     * is set is returned as a positive number rather than a negative int32.
     *
     * If the buffer runs out before `bits` bits have been read, 0 is returned
     * and the bits consumed so far are discarded.
     *
     * @param {number} bits - Number of bits to read (at most 32).
     * @returns {number} Unsigned value, or 0 when the stream is exhausted.
     */
    read(bits) {
        let value = 0;
        for (let i = 0; i < bits; i++) {
            if (this.byteOffset >= this.buffer.length) return 0;
            const bit = (this.buffer[this.byteOffset] >> this.bitOffset) & 1;
            value |= bit << i;
            this.bitOffset++;
            if (8 === this.bitOffset) {
                this.byteOffset++;
                this.bitOffset = 0;
            }
        }
        return value >>> 0;
    }
}
|
|
230
|
+
/**
 * Random-access read of `bitLength` bits starting at absolute bit position
 * `bitOffset` in an LSB-first packed buffer.
 *
 * The result is normalised with `>>> 0`, so a 32-bit read whose top bit is
 * set comes back as a positive number rather than a negative int32.
 *
 * @param {ArrayLike<number>} buffer - Packed bytes.
 * @param {number} bitOffset - Index of the first bit to read.
 * @param {number} bitLength - Number of bits to read (at most 32).
 * @returns {number} Unsigned value, or 0 if the read would run past the end
 *   of `buffer`.
 */
function readBitsAt(buffer, bitOffset, bitLength) {
    let value = 0;
    let currentBit = bitOffset;
    for (let i = 0; i < bitLength; i++) {
        const byteIdx = currentBit >>> 3;
        const bitIdx = 7 & currentBit;
        if (byteIdx >= buffer.length) return 0;
        const bit = (buffer[byteIdx] >> bitIdx) & 1;
        value |= bit << i;
        currentBit++;
    }
    return value >>> 0;
}
|
|
243
|
+
/**
 * Writes `value` to `obj[key]` and returns `obj`.
 *
 * A key that already exists on the object or its prototype chain is
 * redefined as an own enumerable/configurable/writable data property; a new
 * key is created by ordinary assignment.
 *
 * @param {object} obj - Target object.
 * @param {string|symbol} key - Property name.
 * @param {*} value - Value to store.
 * @returns {object} The same `obj`.
 */
function MinMPHash_define_property(obj, key, value) {
    const alreadyPresent = key in obj;
    if (alreadyPresent) {
        Object.defineProperty(obj, key, {
            value,
            enumerable: true,
            configurable: true,
            writable: true
        });
    } else {
        obj[key] = value;
    }
    return obj;
}
|
|
253
|
+
/**
 * Builds a minimal-perfect-hash dictionary for `dataSet`.
 *
 * Steps, as implemented below:
 *  1. Hash every key twice (seed `hashSeed` and `~hashSeed`); bump
 *     `hashSeed` until no two keys share both hashes (at most 100 retries).
 *  2. Try random `seed0` values to spread keys over `m` buckets until the
 *     largest bucket holds fewer than 16 keys (sizes are stored in 4 bits).
 *  3. For every bucket with more than one key, search for the smallest seed
 *     that maps its keys to distinct slots. Seed 0 and buckets of size <= 1
 *     are recorded in `seedZeroBitmap`; other seeds go to a varint stream.
 *  4. Optionally compute per-slot fingerprints (2/4/8/16/32 bits) used to
 *     reject keys that were not part of the set.
 *
 * @param {string[]} dataSet - Keys to index.
 * @param {object} [options]
 * @param {number} [options.level=5] - Target average keys per bucket.
 * @param {boolean|string} [options.onlySet] - `true` selects 8-bit
 *   fingerprints; a string is used directly as the validation mode.
 * @param {boolean} [options.outputBinary] - Return CBOR bytes, not an object.
 * @param {boolean} [options.enableCompression] - With `outputBinary`, return
 *   a Promise of gzip-compressed bytes. This now also applies to an empty
 *   data set; previously that case returned uncompressed bytes synchronously.
 * @returns {object|Uint8Array|Promise<Uint8Array>} Dictionary in the
 *   requested form.
 * @throws {Error} If no collision-free hash seed, acceptable bucket
 *   distribution, or per-bucket seed can be found.
 */
function createMinMPHashDict(dataSet, options) {
    // Single output step so the empty and non-empty paths cannot diverge.
    const finish = (result) => {
        if (!options?.outputBinary) return result;
        const binary = dictToCBOR(result);
        if (options.enableCompression) return compressIBinary(binary);
        return binary;
    };

    const n = dataSet.length;
    if (0 === n) {
        return finish({
            n: 0,
            m: 0,
            seed0: 0,
            seedStream: new Uint8Array(0),
            bucketSizes: new Uint8Array(0),
            validationMode: "none"
        });
    }

    const targetRate = options?.level ?? 5.0;
    let validationMode = "none";
    if (options?.onlySet === true) validationMode = "8";
    else if ("string" == typeof options?.onlySet) validationMode = options.onlySet;
    const adjustedRate = n > 500000 ? Math.max(1, 0.90 * targetRate) : targetRate;
    const m = Math.max(1, Math.ceil(n / adjustedRate));

    // Step 1: find a hashSeed under which every key has a unique (h1, h2) pair.
    const hashesL = new Uint32Array(n);
    const hashesH = new Uint32Array(n);
    let hashSeed = 0;
    while (true) {
        // `seen` holds h1 -> h2 for h1 values met once; once a second distinct
        // h2 appears for the same h1, the entry moves to `complexSeen` (h1 -> Set).
        const seen = new Map();
        const complexSeen = new Map();
        let collision = false;
        for (let i = 0; i < n; i++) {
            const h1 = murmurHash3_32(dataSet[i], hashSeed);
            const h2 = murmurHash3_32(dataSet[i], ~hashSeed);
            let isDuplicate = false;
            if (complexSeen.has(h1)) {
                const set = complexSeen.get(h1);
                if (set.has(h2)) isDuplicate = true;
                else set.add(h2);
            } else if (seen.has(h1)) {
                const existingH2 = seen.get(h1);
                if (existingH2 === h2) isDuplicate = true;
                else {
                    const set = new Set();
                    set.add(existingH2);
                    set.add(h2);
                    complexSeen.set(h1, set);
                    seen.delete(h1);
                }
            } else seen.set(h1, h2);
            if (isDuplicate) {
                collision = true;
                break;
            }
            hashesL[i] = h1;
            hashesH[i] = h2;
        }
        if (!collision) break;
        hashSeed++;
        if (hashSeed > 100) throw new Error(`Could not find a collision-free hash seed after ${hashSeed} attempts.`);
    }

    // Step 2: pick seed0. Buckets are linked lists: head[b] is the first key
    // index in bucket b and next[i] the following key, with -1 as terminator.
    let bestHead = new Int32Array(m).fill(-1);
    let bestNext = new Int32Array(n).fill(-1);
    let bestSeed0 = 0;
    let minMaxLen = 1 / 0;
    const currentHead = new Int32Array(m);
    const currentNext = new Int32Array(n);
    const bucketCounts = new Int32Array(m);
    const maxAttempts = 2000;
    for (let attempt = 0; attempt < maxAttempts; attempt++) {
        const currentSeed = Math.floor(0xffffffff * Math.random());
        currentHead.fill(-1);
        bucketCounts.fill(0);
        let currentMaxLen = 0;
        for (let i = 0; i < n; i++) {
            const h = (scramble(hashesL[i], currentSeed) ^ hashesH[i]) >>> 0;
            const bIdx = Math.floor(h / 4294967296 * m);
            currentNext[i] = currentHead[bIdx];
            currentHead[bIdx] = i;
            bucketCounts[bIdx]++;
            if (bucketCounts[bIdx] > currentMaxLen) currentMaxLen = bucketCounts[bIdx];
        }
        // Good enough: accept immediately.
        if (currentMaxLen < 13) {
            bestSeed0 = currentSeed;
            bestHead.set(currentHead);
            bestNext.set(currentNext);
            minMaxLen = currentMaxLen;
            break;
        }
        if (currentMaxLen < minMaxLen) {
            minMaxLen = currentMaxLen;
            bestSeed0 = currentSeed;
            bestHead.set(currentHead);
            bestNext.set(currentNext);
        }
        // Acceptable (fits in 4 bits) and enough attempts made: stop searching.
        if (minMaxLen < 16 && attempt > 50) break;
    }
    if (minMaxLen >= 16) throw new Error(`MPHF Build Failed: Could not find a bucket distribution with max size < 16 (best: ${minMaxLen}). Try reducing the optimization level (current: ${options?.level ?? 5}).`);

    // Bucket sizes, two 4-bit counts per byte (even bucket in the low nibble).
    const bucketSizes = new Uint8Array(Math.ceil(m / 2));
    for (let i = 0; i < m; i++) {
        let count = 0;
        let ptr = bestHead[i];
        while (-1 !== ptr) {
            count++;
            ptr = bestNext[ptr];
        }
        const byteIdx = i >>> 1;
        if ((1 & i) === 0) bucketSizes[byteIdx] |= count;
        else bucketSizes[byteIdx] |= count << 4;
    }

    // Step 3: per-bucket seed search.
    const seedWriter = new VarIntBuffer();
    const seedZeroBitmap = new Uint8Array(Math.ceil(m / 8));
    for (let i = 0; i < m; i++) {
        let k = 0;
        let p = bestHead[i];
        while (-1 !== p) {
            k++;
            p = bestNext[p];
        }
        // Empty and single-key buckets need no seed.
        if (k <= 1) {
            seedZeroBitmap[i >>> 3] |= 1 << (7 & i);
            continue;
        }
        let s = 0;
        let found = false;
        const MAX_TRIALS = k > 14 ? 50000000 : 5000000;
        while (!found) {
            // Bitmask of slots already taken under the candidate seed.
            let visited = 0;
            let collision = false;
            let ptr = bestHead[i];
            while (-1 !== ptr) {
                const h = (scramble(hashesL[ptr], s) ^ hashesH[ptr]) >>> 0;
                const pos = h % k;
                if ((visited & 1 << pos) !== 0) {
                    collision = true;
                    break;
                }
                visited |= 1 << pos;
                ptr = bestNext[ptr];
            }
            if (collision) {
                s++;
                if (s > MAX_TRIALS) throw new Error(`MPHF Failed: Bucket ${i} (size ${k}) is too hard.`);
            } else {
                if (0 === s) seedZeroBitmap[i >>> 3] |= 1 << (7 & i);
                else seedWriter.write(s);
                found = true;
            }
        }
    }

    const dict = {
        n,
        m,
        seed0: bestSeed0,
        hashSeed,
        seedStream: seedWriter.toUint8Array(),
        bucketSizes,
        seedZeroBitmap,
        validationMode
    };

    // Step 4: fingerprints, stored at each key's final slot index.
    if ("none" !== validationMode) {
        let fingerprints;
        fingerprints = "2" === validationMode ? new Uint8Array(Math.ceil(n / 4)) : "4" === validationMode ? new Uint8Array(Math.ceil(n / 2)) : "8" === validationMode ? new Uint8Array(n) : "16" === validationMode ? new Uint16Array(n) : new Uint32Array(n);
        // Validation is disabled on the temporary hasher so it reports raw slots.
        const tempHasher = new MinMPHash({
            ...dict,
            validationMode: "none"
        });
        const FP_SEED = 0x1234abcd;
        for (let i = 0; i < n; i++) {
            const key = dataSet[i];
            const idx = tempHasher.hash(key);
            if (idx >= 0 && idx < n) {
                const fullHash = murmurHash3_32(key, FP_SEED);
                if ("2" === validationMode) {
                    const fp2 = 0x03 & fullHash;
                    const byteIdx = idx >>> 2;
                    const shift = (3 & idx) << 1;
                    fingerprints[byteIdx] |= fp2 << shift;
                } else if ("4" === validationMode) {
                    const fp4 = 0x0f & fullHash;
                    const byteIdx = idx >>> 1;
                    if ((1 & idx) === 0) fingerprints[byteIdx] |= fp4;
                    else fingerprints[byteIdx] |= fp4 << 4;
                } else if ("8" === validationMode) fingerprints[idx] = 0xff & fullHash;
                else if ("16" === validationMode) fingerprints[idx] = 0xffff & fullHash;
                else fingerprints[idx] = fullHash >>> 0;
            }
        }
        dict.fingerprints = fingerprints;
    }

    return finish(dict);
}
|
|
445
|
+
/**
 * Query-side minimal perfect hash built from a dictionary object or its
 * CBOR serialization.
 */
class MinMPHash {
    /**
     * @param {object|Uint8Array} dict - Dictionary object, or CBOR bytes that
     *   are decoded with `dictFromCBOR`.
     */
    constructor(dict) {
        // Declare every field up front so the property order stays fixed.
        this.n = void 0;
        this.m = void 0;
        this.seed0 = void 0;
        this.hashSeed = void 0;
        this.offsets = void 0;
        this.seeds = void 0;
        this.validationMode = void 0;
        this.fingerprints = null;

        const source = dict instanceof Uint8Array ? dictFromCBOR(dict) : dict;
        this.n = source.n;
        this.m = source.m;
        this.seed0 = source.seed0;
        this.hashSeed = source.hashSeed || 0;
        this.validationMode = source.validationMode || "none";

        if (this.n === 0) {
            this.offsets = new Uint32Array(0);
            this.seeds = new Int32Array(0);
            return;
        }

        // Prefix sums of the 4-bit bucket sizes: offsets[b] is the first slot
        // of bucket b and offsets[m] the total number of slots.
        const offsets = new Uint32Array(this.m + 1);
        let running = 0;
        for (let b = 0; b < this.m; b++) {
            offsets[b] = running;
            const packed = source.bucketSizes[b >>> 1];
            running += (b & 1) ? packed >>> 4 : packed & 0x0f;
        }
        offsets[this.m] = running;
        this.offsets = offsets;

        // Per-bucket seeds: buckets flagged in the bitmap have seed 0, all
        // others consume one varint from the seed stream.
        const seeds = new Int32Array(this.m);
        const stream = source.seedStream;
        const zeroBits = source.seedZeroBitmap;
        let cursor = 0;
        for (let b = 0; b < this.m; b++) {
            if (zeroBits && (zeroBits[b >>> 3] & (1 << (b & 7))) !== 0) {
                seeds[b] = 0;
                continue;
            }
            let seed = 0;
            let shift = 0;
            let byte;
            do {
                byte = stream[cursor++];
                seed |= (byte & 0x7f) << shift;
                shift += 7;
            } while ((byte & 0x80) !== 0);
            seeds[b] = seed;
        }
        this.seeds = seeds;

        if (this.validationMode !== "none" && source.fingerprints) {
            const raw = source.fingerprints;
            const mode = this.validationMode;
            let Ctor = Uint32Array;
            if (mode === "2" || mode === "4" || mode === "8") Ctor = Uint8Array;
            else if (mode === "16") Ctor = Uint16Array;
            this.fingerprints = raw instanceof Ctor ? raw : new Ctor(raw);
        }
    }

    /**
     * Builds an instance from gzip-compressed CBOR bytes.
     *
     * @param {Uint8Array} data - Compressed dictionary.
     * @returns {Promise<MinMPHash>}
     */
    static async fromCompressed(data) {
        return new MinMPHash(await decompressIBinary(data));
    }

    /**
     * Maps `input` to its slot index.
     *
     * @param {string} input - Key to look up.
     * @returns {number} Slot index, or -1 when the dictionary is empty, the
     *   key's bucket is empty, or fingerprint validation rejects the key.
     */
    hash(input) {
        if (this.n === 0) return -1;

        const h1 = murmurHash3_32(input, this.hashSeed);
        const h2 = murmurHash3_32(input, ~this.hashSeed);
        const bucket = Math.floor(((scramble(h1, this.seed0) ^ h2) >>> 0) / 4294967296 * this.m);
        const start = this.offsets[bucket];
        const size = this.offsets[bucket + 1] - start;
        if (size === 0) return -1;

        const slot = size === 1
            ? start
            : start + ((scramble(h1, this.seeds[bucket]) ^ h2) >>> 0) % size;

        if (this.validationMode === "none" || !this.fingerprints) return slot;

        const fp = murmurHash3_32(input, MinMPHash.FP_SEED);
        const table = this.fingerprints;
        let matches;
        switch (this.validationMode) {
            case "2":
                // Four 2-bit fingerprints per byte.
                matches = ((table[slot >>> 2] >>> ((slot & 3) << 1)) & 0x03) === (fp & 0x03);
                break;
            case "4": {
                // Two 4-bit fingerprints per byte, even slot in the low nibble.
                const packed = table[slot >>> 1];
                const stored = (slot & 1) === 0 ? packed & 0x0f : (packed >>> 4) & 0x0f;
                matches = stored === (fp & 0x0f);
                break;
            }
            case "8":
                matches = table[slot] === (fp & 0xff);
                break;
            case "16":
                matches = table[slot] === (fp & 0xffff);
                break;
            default:
                matches = table[slot] === fp >>> 0;
                break;
        }
        return matches ? slot : -1;
    }
}
// Seed for the fingerprint hash.
MinMPHash.FP_SEED = 0x1234abcd;
|
|
548
|
+
/**
 * Mixes `k` with `seed` into an unsigned 32-bit value: XOR with the seed,
 * then two multiply / xor-shift rounds using the constants 0x85ebca6b and
 * 0xc2b2ae35.
 *
 * @param {number} k - Value to scramble.
 * @param {number} seed - Seed folded into the value.
 * @returns {number} Unsigned 32-bit result.
 */
function scramble(k, seed) {
    let mixed = Math.imul(k ^ seed, 0x85ebca6b);
    mixed = Math.imul(mixed ^ (mixed >>> 13), 0xc2b2ae35);
    return (mixed ^ (mixed >>> 16)) >>> 0;
}
|
|
556
|
+
/**
 * MurmurHash3-style 32-bit hash of a string.
 *
 * Every UTF-16 code unit of `key` is mixed in as its own block, then the
 * length is folded in and a final avalanche step is applied.
 *
 * @param {string} key - String to hash.
 * @param {number} seed - Initial hash state.
 * @returns {number} Unsigned 32-bit hash.
 */
function murmurHash3_32(key, seed) {
    const rotl = (x, r) => (x << r) | (x >>> (32 - r));
    const len = key.length;

    let acc = seed;
    let idx = 0;
    while (idx < len) {
        const block = Math.imul(rotl(Math.imul(key.charCodeAt(idx), 0xcc9e2d51), 15), 0x1b873593);
        acc = Math.imul(rotl(acc ^ block, 13), 5) + 0xe6546b64;
        idx += 1;
    }

    // Finalization: fold in the length, then avalanche.
    acc ^= len;
    acc = Math.imul(acc ^ (acc >>> 16), 0x85ebca6b);
    acc = Math.imul(acc ^ (acc >>> 13), 0xc2b2ae35);
    return (acc ^ (acc >>> 16)) >>> 0;
}
|
|
577
|
+
/**
 * Growable buffer collecting base-128 varints (7 payload bits per byte, high
 * bit set on every byte except the last).
 */
class VarIntBuffer {
    constructor() {
        this.buffer = [];
    }

    /**
     * Appends `value` as a varint.
     *
     * @param {number} value - Value to encode.
     */
    write(value) {
        let rest = value;
        for (; rest >= 0x80; rest >>>= 7) {
            this.buffer.push((rest & 0x7f) | 0x80);
        }
        this.buffer.push(rest);
    }

    /**
     * @returns {Uint8Array} Copy of all bytes written so far.
     */
    toUint8Array() {
        return Uint8Array.from(this.buffer);
    }
}
|
|
592
|
+
/**
 * Assigns `value` to `obj[key]` and returns `obj`.
 *
 * Uses `Object.defineProperty` (enumerable, configurable, writable) when the
 * key is already visible on the object or its prototypes, and a plain
 * assignment when it is not.
 *
 * @param {object} obj - Target object.
 * @param {string|symbol} key - Property name.
 * @param {*} value - Value to store.
 * @returns {object} The same `obj`.
 */
function MinMPLookup_define_property(obj, key, value) {
    switch (key in obj) {
        case true:
            Object.defineProperty(obj, key, { value, enumerable: true, configurable: true, writable: true });
            break;
        default:
            obj[key] = value;
            break;
    }
    return obj;
}
|
|
602
|
+
/**
 * Builds a reverse-lookup dictionary (value -> owning key(s)) from a map of
 * `key -> values[]`.
 *
 * All distinct values are indexed with a minimal perfect hash. Then one of
 * two layouts is chosen:
 *  - "mostly one-to-one" (fewer than 10% of values belong to more than one
 *    key): a bit-packed table with one key index per hash slot. Values shared
 *    by several keys store the sentinel `keys.length` and list their owners
 *    in `collisionMap`.
 *  - otherwise: for every key, the sorted list of hash slots of its values.
 *
 * @param {Object<string, string[]>} lookupMap - Keys and their values.
 * @param {object} [options]
 * @param {number} [options.level] - Forwarded to `createMinMPHashDict`.
 * @param {boolean|string} [options.onlySet="8"] - Forwarded validation mode.
 * @param {boolean} [options.outputBinary] - Return serialized bytes.
 * @param {boolean} [options.enableCompression] - Return a Promise (of
 *   compressed bytes with `outputBinary`, of the plain dictionary without).
 * @returns {object|Uint8Array|Promise<object|Uint8Array>}
 */
function createMinMPLookupDict(lookupMap, options) {
    const keys = Object.keys(lookupMap);

    // Distinct values, in first-seen order.
    const distinct = new Set();
    keys.forEach((key) => {
        for (const v of lookupMap[key]) distinct.add(v);
    });
    const allValues = [...distinct];

    const mphBin = createMinMPHashDict(allValues, {
        level: options?.level,
        outputBinary: true,
        onlySet: options?.onlySet ?? "8"
    });
    const mph = new MinMPHash(mphBin);

    // value -> indexes of the keys that list it.
    const valueToKeys = new Map();
    keys.forEach((key, keyIdx) => {
        for (const v of lookupMap[key]) {
            const owners = valueToKeys.get(v);
            if (owners) owners.push(keyIdx);
            else valueToKeys.set(v, [keyIdx]);
        }
    });

    let sharedCount = 0;
    valueToKeys.forEach((owners) => {
        if (owners.length > 1) sharedCount += 1;
    });
    const isMostlyOneToOne = sharedCount < 0.1 * allValues.length;

    let dict;
    if (!isMostlyOneToOne) {
        const keyToHashes = keys.map((key) => {
            const hashes = [];
            for (const v of lookupMap[key]) {
                const h = mph.hash(v);
                if (h >= 0) hashes.push(h);
            }
            hashes.sort((a, b) => a - b);
            return new Uint32Array(hashes);
        });
        dict = {
            mmpHashDictBin: mphBin,
            keys,
            keyToHashes
        };
    } else {
        // One extra code point is needed for the "shared value" sentinel.
        const bitsPerKey = Math.ceil(Math.log2(keys.length + 1));
        const collisionMap = new Map();
        const slotOwner = new Int32Array(mph.n).fill(-1);
        for (const [v, owners] of valueToKeys) {
            const h = mph.hash(v);
            if (h < 0) continue;
            if (owners.length === 1) {
                slotOwner[h] = owners[0];
            } else {
                slotOwner[h] = keys.length;
                collisionMap.set(h, owners);
            }
        }
        const bw = new BitWriter();
        for (const owner of slotOwner) {
            bw.write(owner >= 0 ? owner : 0, bitsPerKey);
        }
        dict = {
            mmpHashDictBin: mphBin,
            keys,
            valueToKeyIndexes: bw.getData(),
            bitsPerKey,
            collisionMap: collisionMap.size > 0 ? collisionMap : void 0
        };
    }

    if (options?.outputBinary) {
        const serialized = serializeMinMPLookupDict(dict);
        return options?.enableCompression ? compressIBinary(serialized) : serialized;
    }
    return options?.enableCompression ? Promise.resolve(dict) : dict;
}
|
|
679
|
+
/**
 * Serializes a lookup dictionary into one byte array.
 *
 * Layout (all u32 values are big-endian):
 *   u32 mphLen, mph bytes,
 *   u32 keyCount, then per key: u32 byteLen + UTF-8 bytes,
 *   then one of:
 *     - one-to-one layout: u32 0xFFFFFFFF marker, u32 bitsPerKey,
 *       u32 dataLen + packed key indexes, u32 collisionLen + collision bytes
 *       (collisionLen is 0 when there are no shared values);
 *     - key-to-hashes layout: u32 sectionLen + per-key records of
 *       varint count, one byte of bit width, bit-packed deltas.
 *
 * Collision bytes are varints: entry count, then per entry the hash delta,
 * the owner count, and the delta-encoded sorted owner indexes.
 *
 * The input dictionary is not modified: owner lists are sorted on a copy
 * (sorting the caller's `collisionMap` arrays in place was a side effect of
 * the previous implementation).
 *
 * @param {object} dict - Lookup dictionary to serialize.
 * @returns {Uint8Array} Serialized bytes.
 */
function serializeMinMPLookupDict(dict) {
    const parts = [];
    const encoder = new TextEncoder();
    const writeU32 = (val) => {
        const b = new Uint8Array(4);
        new DataView(b.buffer).setUint32(0, val, false);
        return b;
    };
    const ascending = (a, b) => a - b;

    parts.push(writeU32(dict.mmpHashDictBin.length));
    parts.push(dict.mmpHashDictBin);
    parts.push(writeU32(dict.keys.length));
    for (const key of dict.keys) {
        const keyBytes = encoder.encode(key);
        parts.push(writeU32(keyBytes.length));
        parts.push(keyBytes);
    }

    if (dict.valueToKeyIndexes && void 0 !== dict.bitsPerKey) {
        // 0xFFFFFFFF in place of a section length marks the one-to-one layout.
        parts.push(writeU32(0xFFFFFFFF));
        parts.push(writeU32(dict.bitsPerKey));
        parts.push(writeU32(dict.valueToKeyIndexes.length));
        parts.push(dict.valueToKeyIndexes);
        if (dict.collisionMap && dict.collisionMap.size > 0) {
            const colBuffer = [];
            writeVarInt(dict.collisionMap.size, colBuffer);
            const sortedHashes = Array.from(dict.collisionMap.keys()).sort(ascending);
            let prevHash = 0;
            for (const h of sortedHashes) {
                writeVarInt(h - prevHash, colBuffer);
                prevHash = h;
                // Sort a copy so the caller's arrays keep their order.
                const kIndices = [...dict.collisionMap.get(h)].sort(ascending);
                writeVarInt(kIndices.length, colBuffer);
                let prevKey = 0;
                for (const k of kIndices) {
                    writeVarInt(k - prevKey, colBuffer);
                    prevKey = k;
                }
            }
            const colBytes = new Uint8Array(colBuffer);
            parts.push(writeU32(colBytes.length));
            parts.push(colBytes);
        } else parts.push(writeU32(0));
    } else if (dict.keyToHashes) {
        const hashBuffer = [];
        for (const hashes of dict.keyToHashes) {
            writeVarInt(hashes.length, hashBuffer);
            if (0 === hashes.length) continue;
            // Delta-encode the hash list, then pack every delta with the
            // smallest bit width that fits the largest one.
            let maxDelta = 0;
            let prev = 0;
            const deltas = [];
            for (let i = 0; i < hashes.length; i++) {
                const h = hashes[i];
                const delta = h - prev;
                deltas.push(delta);
                if (delta > maxDelta) maxDelta = delta;
                prev = h;
            }
            let bits = 0;
            if (maxDelta > 0) bits = Math.ceil(Math.log2(maxDelta + 1));
            hashBuffer.push(bits);
            const bw = new BitWriter();
            for (const d of deltas) bw.write(d, bits);
            const packed = bw.getData();
            for (let i = 0; i < packed.length; i++) hashBuffer.push(packed[i]);
        }
        const hashBytes = new Uint8Array(hashBuffer);
        parts.push(writeU32(hashBytes.length));
        parts.push(hashBytes);
    }

    const totalLen = parts.reduce((sum, b) => sum + b.length, 0);
    const res = new Uint8Array(totalLen);
    let offset = 0;
    for (const b of parts) {
        res.set(b, offset);
        offset += b.length;
    }
    return res;
}
|
|
757
|
+
/**
 * Decode a serialized lookup dictionary from its binary form.
 *
 * Layout, as read by this function (all u32 values are big-endian):
 *   u32 mphLen, mphLen bytes            -> mmpHashDictBin
 *   u32 keysCount, then per key:
 *     u32 byteLength, UTF-8 bytes       -> keys[i]
 *   u32 sectionLen
 *     - 0xFFFFFFFF selects the compact layout:
 *         u32 bitsPerKey
 *         u32 dataLen, dataLen bytes    -> valueToKeyIndexes
 *         u32 colMapLen, colMapLen bytes of varint-encoded collision entries
 *     - any other value is the byte length of the per-key hash section:
 *         per key: varint count, then (only when count > 0) one byte `bits`
 *         followed by ceil(bits * count / 8) bytes of bit-packed deltas.
 *
 * Byte ranges are returned as `subarray` views over `data`, not copies.
 *
 * @param {Uint8Array} data Serialized dictionary bytes.
 * @returns {object} Either
 *   `{ mmpHashDictBin, keys, valueToKeyIndexes, bitsPerKey, collisionMap }`
 *   (compact layout; `collisionMap` is `undefined` when colMapLen is 0) or
 *   `{ mmpHashDictBin, keys, keyToHashes }` (hash-list layout).
 */
function deserializeLookupDict(data) {
    const view = new DataView(data.buffer, data.byteOffset, data.byteLength);
    const textDecoder = new TextDecoder();
    let cursor = 0;

    // Read one big-endian u32 and advance the cursor.
    const takeU32 = () => {
        const word = view.getUint32(cursor, false);
        cursor += 4;
        return word;
    };
    // Return a view over the next `len` bytes and advance the cursor.
    const takeBytes = (len) => {
        const slice = data.subarray(cursor, cursor + len);
        cursor += len;
        return slice;
    };
    // Decode the varint stream: entry count, then per entry a delta-coded
    // hash, a key count and that many delta-coded key indices.
    const decodeCollisionMap = (bytes) => {
        const map = new Map();
        let pos = 0;
        const takeVarInt = () => {
            const decoded = readVarInt(bytes, pos);
            pos += decoded.bytes;
            return decoded.value;
        };
        const entryCount = takeVarInt();
        let hash = 0;
        for (let e = 0; e < entryCount; e++) {
            hash += takeVarInt();
            const keyCount = takeVarInt();
            const keyIndices = [];
            let keyIdx = 0;
            for (let k = 0; k < keyCount; k++) {
                keyIdx += takeVarInt();
                keyIndices.push(keyIdx);
            }
            map.set(hash, keyIndices);
        }
        return map;
    };

    const mmpHashDictBin = takeBytes(takeU32());

    const keysCount = takeU32();
    const keys = [];
    for (let k = 0; k < keysCount; k++) {
        keys.push(textDecoder.decode(takeBytes(takeU32())));
    }

    const sectionLen = takeU32();

    if (sectionLen !== 0xFFFFFFFF) {
        // Hash-list layout: sectionLen is the size of the packed hash section.
        const hashBytes = takeBytes(sectionLen);
        const keyToHashes = [];
        let pos = 0;
        for (let k = 0; k < keysCount; k++) {
            const header = readVarInt(hashBytes, pos);
            pos += header.bytes;
            const hashCount = header.value;
            if (hashCount === 0) {
                keyToHashes.push(new Uint32Array(0));
                continue;
            }
            const bits = hashBytes[pos];
            pos += 1;
            const packedLen = Math.ceil((bits * hashCount) / 8);
            const reader = new BitReader(hashBytes.subarray(pos, pos + packedLen));
            pos += packedLen;
            // Stored values are deltas; a running sum restores the hashes.
            const hashes = new Uint32Array(hashCount);
            let running = 0;
            for (let j = 0; j < hashCount; j++) {
                running += reader.read(bits);
                hashes[j] = running;
            }
            keyToHashes.push(hashes);
        }
        return {
            mmpHashDictBin,
            keys,
            keyToHashes
        };
    }

    // Compact layout, selected by the 0xFFFFFFFF sentinel.
    const bitsPerKey = takeU32();
    const valueToKeyIndexes = takeBytes(takeU32());
    const colMapLen = takeU32();
    let collisionMap;
    if (colMapLen > 0) {
        collisionMap = decodeCollisionMap(takeBytes(colMapLen));
    }
    return {
        mmpHashDictBin,
        keys,
        valueToKeyIndexes,
        bitsPerKey,
        collisionMap
    };
}
|
|
856
|
+
/**
 * Looks up which key(s) a value belongs to, using a deserialized lookup
 * dictionary and a MinMPHash instance built from `dict.mmpHashDictBin`.
 *
 * Two dictionary layouts are handled:
 *  - compact: `dict.valueToKeyIndexes` / `dict.bitsPerKey` (and optionally
 *    `dict.collisionMap`) map a hash slot to a key index;
 *  - hash lists: `dict.keyToHashes` holds, per key, the hash slots of its
 *    values; an inverted index (slot -> key indices) is built on construction.
 */
class MinMPLookup {
    /**
     * @param {object} dict Dictionary as produced by `deserializeLookupDict`.
     */
    constructor(dict) {
        MinMPLookup_define_property(this, "dict", void 0);
        MinMPLookup_define_property(this, "mph", void 0);
        MinMPLookup_define_property(this, "_invertedIndex", void 0);
        this.dict = dict;
        this._invertedIndex = null;
        this.mph = new MinMPHash(dict.mmpHashDictBin);
        if (dict.keyToHashes) {
            this.buildInvertedIndex();
        }
    }

    /**
     * Build a lookup from compressed dictionary bytes.
     * @param data Input handed to `decompressIBinary`.
     * @returns {Promise<MinMPLookup>}
     */
    static async fromCompressed(data) {
        const raw = await decompressIBinary(data);
        return new MinMPLookup(deserializeLookupDict(raw));
    }

    /**
     * Build a lookup from uncompressed dictionary bytes.
     * @param data Input handed to `deserializeLookupDict`.
     * @returns {MinMPLookup}
     */
    static fromBinary(data) {
        return new MinMPLookup(deserializeLookupDict(data));
    }

    /**
     * Populate `_invertedIndex` (one bucket of key indices per hash slot)
     * from `dict.keyToHashes`. Does nothing when the dictionary has no
     * `keyToHashes`.
     */
    buildInvertedIndex() {
        const { keyToHashes, keys } = this.dict;
        if (!keyToHashes) return;
        const slotCount = this.mph.n;
        const buckets = Array.from({ length: slotCount }, () => []);
        this._invertedIndex = buckets;
        for (let keyIdx = 0; keyIdx < keys.length; keyIdx++) {
            const slots = keyToHashes[keyIdx];
            for (let s = 0; s < slots.length; s++) {
                const slot = slots[s];
                // Slots outside the hash range are ignored.
                if (slot < slotCount) buckets[slot].push(keyIdx);
            }
        }
    }

    /**
     * Return the first key associated with `value`, or `null` when no key
     * is found.
     * @param value Value passed to `this.mph.hash`.
     * @returns {string|null}
     */
    query(value) {
        const { dict } = this;
        if (!(dict.valueToKeyIndexes && dict.bitsPerKey)) {
            // Hash-list layout: take the first match from queryAll.
            const matches = this.queryAll(value);
            return matches && matches.length > 0 ? matches[0] : null;
        }
        const slot = this.mph.hash(value);
        if (slot < 0 || slot >= this.mph.n) return null;
        const keyIdx = readBitsAt(dict.valueToKeyIndexes, slot * dict.bitsPerKey, dict.bitsPerKey);
        const keyCount = dict.keys.length;
        if (keyIdx !== keyCount) {
            return keyIdx >= keyCount ? null : dict.keys[keyIdx];
        }
        // keyIdx === keys.length: the key list for this slot is in collisionMap.
        const { collisionMap } = dict;
        if (!collisionMap || !collisionMap.has(slot)) return null;
        const candidates = collisionMap.get(slot);
        return candidates.length > 0 ? dict.keys[candidates[0]] : null;
    }

    /**
     * Return every key associated with `value`, or `null` when no key is
     * found.
     * @param value Value passed to `this.mph.hash`.
     * @returns {string[]|null}
     */
    queryAll(value) {
        const { dict } = this;
        if (dict.valueToKeyIndexes && dict.bitsPerKey) {
            const slot = this.mph.hash(value);
            if (slot < 0 || slot >= this.mph.n) return null;
            const keyIdx = readBitsAt(dict.valueToKeyIndexes, slot * dict.bitsPerKey, dict.bitsPerKey);
            const keyCount = dict.keys.length;
            if (keyIdx !== keyCount) {
                return keyIdx >= keyCount ? null : [dict.keys[keyIdx]];
            }
            // keyIdx === keys.length: the key list for this slot is in collisionMap.
            const { collisionMap } = dict;
            if (!collisionMap || !collisionMap.has(slot)) return null;
            return collisionMap.get(slot).map((k) => dict.keys[k]);
        }
        const slot = this.mph.hash(value);
        const buckets = this._invertedIndex;
        if (slot < 0 || !buckets) return null;
        if (slot >= buckets.length) return null;
        const bucket = buckets[slot];
        if (bucket.length === 0) return null;
        return bucket.map((k) => dict.keys[k]);
    }

    /**
     * @returns {string[]} The dictionary's key list (the same array instance
     * held by `dict.keys`).
     */
    keys() {
        return this.dict.keys;
    }
}
|
|
937
|
+
export { MinMPHash, MinMPLookup, createMinMPHashDict, createMinMPLookupDict };
|