gs-search 0.1.7 → 0.1.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/lib/core.cjs +242 -14
- package/lib/core.d.ts +88 -2
- package/lib/core.js +248 -15
- package/lib/type.d.ts +73 -1
- package/package.json +1 -1
package/lib/core.cjs
CHANGED
|
@@ -181,21 +181,129 @@ function murmur3_32(str, h = 305419896) {
|
|
|
181
181
|
const remainder = len & 3;
|
|
182
182
|
return remainder > 0 && (remainder >= 3 && (k1 ^= (str.charCodeAt(i + 2) & 255) << 16), remainder >= 2 && (k1 ^= (str.charCodeAt(i + 1) & 255) << 8), remainder >= 1 && (k1 ^= str.charCodeAt(i) & 255), k1 = (k1 & 65535) * 3432918353 + (((k1 >>> 16) * 3432918353 & 65535) << 16) & 4294967295, k1 = k1 << 15 | k1 >>> 17, k1 = (k1 & 65535) * 461845907 + (((k1 >>> 16) * 461845907 & 65535) << 16) & 4294967295, h ^= k1), h ^= len, h ^= h >>> 16, h = (h & 65535) * 2246822507 + (((h >>> 16) * 2246822507 & 65535) << 16) & 4294967295, h ^= h >>> 13, h = (h & 65535) * 3266489909 + (((h >>> 16) * 3266489909 & 65535) << 16) & 4294967295, h ^= h >>> 16, h >>> 0;
|
|
183
183
|
}
|
|
184
|
+
/**
 * 64-bit MurmurHash3 of a string.
 *
 * Runs MurmurHash3_x64_128 over the UTF-8 encoding of `str` and returns the
 * first 64-bit word of the result.
 *
 * Differences from the previous body (hash values therefore change, so any
 * index written with the earlier 64-bit output has to be rebuilt):
 *  - every 16-byte block is consumed; the old loop compared a character index
 *    against a block count and stopped after the first block, so most of a
 *    long input never reached the hash;
 *  - the input is hashed as UTF-8 bytes instead of the low byte of each UTF-16
 *    code unit, so characters that share a low byte no longer collide;
 *  - every multiplication in the finalizer is reduced to 64 bits, and the
 *    second lane takes part in the returned word.
 *
 * @param str  The string to hash.
 * @param seed Seed value, coerced to an unsigned 32-bit integer; defaults to 0x12345678.
 * @returns Unsigned 64-bit hash as a bigint in the range [0, 2^64).
 */
function murmur3_64(str, seed = 305419896) {
  const MASK64 = 0xffffffffffffffffn;
  const C1 = 0x87c37b91114253d5n;
  const C2 = 0x4cf5ad432745937fn;

  // 64-bit left rotation; `x` must already be reduced to 64 bits.
  const rotl = (x, r) => ((x << r) | (x >> (64n - r))) & MASK64;
  // Per-lane scrambling applied to each 8-byte word before it is folded in.
  const mixK1 = (k) => (rotl((k * C1) & MASK64, 31n) * C2) & MASK64;
  const mixK2 = (k) => (rotl((k * C2) & MASK64, 33n) * C1) & MASK64;
  // Final avalanche step.
  const fmix = (k) => {
    k ^= k >> 33n;
    k = (k * 0xff51afd7ed558ccdn) & MASK64;
    k ^= k >> 33n;
    k = (k * 0xc4ceb9fe1a85ec53n) & MASK64;
    return k ^ (k >> 33n);
  };

  const bytes = new TextEncoder().encode(str);
  const len = bytes.length;

  // Little-endian read of `count` (1..8) bytes starting at `start`.
  const readLE = (start, count) => {
    let v = 0n;
    for (let j = count - 1; j >= 0; j--) {
      v = (v << 8n) | BigInt(bytes[start + j]);
    }
    return v;
  };

  let h1 = BigInt(seed >>> 0);
  let h2 = h1;

  // Body: whole 16-byte blocks, two 8-byte lanes each.
  const blockEnd = len - (len % 16);
  for (let off = 0; off < blockEnd; off += 16) {
    h1 ^= mixK1(readLE(off, 8));
    h1 = (rotl(h1, 27n) + h2) & MASK64;
    h1 = (h1 * 5n + 0x52dce729n) & MASK64;

    h2 ^= mixK2(readLE(off + 8, 8));
    h2 = (rotl(h2, 31n) + h1) & MASK64;
    h2 = (h2 * 5n + 0x38495ab5n) & MASK64;
  }

  // Tail: the remaining 0..15 bytes; bytes 8..14 feed lane 2, bytes 0..7 lane 1.
  const tail = len - blockEnd;
  if (tail > 8) h2 ^= mixK2(readLE(blockEnd + 8, tail - 8));
  if (tail > 0) h1 ^= mixK1(readLE(blockEnd, Math.min(tail, 8)));

  // Finalization.
  const n = BigInt(len);
  h1 ^= n;
  h2 ^= n;
  h1 = (h1 + h2) & MASK64;
  h2 = (h2 + h1) & MASK64;
  h1 = fmix(h1);
  h2 = fmix(h2);
  return (h1 + h2) & MASK64;
}
|
|
199
|
+
/**
 * Seeded wrapper around `murmur3_32`, exposing it through a `hash(str)` method.
 */
class Murmur3_32 {
  /** Seed passed to `murmur3_32` on every `hash` call. */
  seed;

  /**
   * @param seed Seed value; defaults to 0x12345678.
   */
  constructor(seed = 0x12345678) {
    this.seed = seed;
  }

  /**
   * Hash a string with the stored seed.
   * @param str Input string.
   * @returns Unsigned 32-bit hash as a number.
   */
  hash(str) {
    const { seed } = this;
    return murmur3_32(str, seed);
  }
}
|
|
217
|
+
/**
 * Seeded wrapper around `murmur3_64`, exposing it through a `hash(str)` method.
 */
class Murmur3_64 {
  /** Seed passed to `murmur3_64` on every `hash` call. */
  seed;

  /**
   * @param seed Seed value; defaults to 0x12345678.
   */
  constructor(seed = 0x12345678) {
    this.seed = seed;
  }

  /**
   * Hash a string with the stored seed.
   * @param str Input string.
   * @returns 64-bit hash as a bigint.
   */
  hash(str) {
    const { seed } = this;
    return murmur3_64(str, seed);
  }
}
|
|
235
|
+
/**
 * 128-bit hash assembled from two `murmur3_64` calls.
 *
 * This is not the MurmurHash3 x64_128 digest: the low 64 bits are the hash of
 * `str` under the stored seed, the high 64 bits are the hash of `str + str`
 * under a derived seed.
 */
class Murmur3_128 {
  /** Seed used for the low half; the high half uses `seed ^ 0x5a5a5a5a`. */
  seed;

  /**
   * @param seed Seed value; defaults to 0x12345678.
   */
  constructor(seed = 0x12345678) {
    this.seed = seed;
  }

  /**
   * Hash a string into a 128-bit value.
   * @param str Input string.
   * @returns bigint whose low 64 bits come from `str` and whose high 64 bits
   *          come from the doubled string under the derived seed.
   */
  hash(str) {
    const lowHalf = murmur3_64(str, this.seed);
    const derivedSeed = this.seed ^ 0x5a5a5a5a;
    const highHalf = murmur3_64(str + str, derivedSeed);
    return (highHalf << 64n) | lowHalf;
  }
}
|
|
254
|
+
/**
 * Factory producing the Murmur3-based hash algorithm objects, each constructed
 * with its default seed.
 */
class Murmur3HashFactory {
  /**
   * @returns A new 32-bit hash algorithm instance.
   */
  create32() {
    const algorithm = new Murmur3_32();
    return algorithm;
  }

  /**
   * @returns A new 64-bit hash algorithm instance.
   */
  create64() {
    const algorithm = new Murmur3_64();
    return algorithm;
  }

  /**
   * @returns A new 128-bit hash algorithm instance.
   */
  create128() {
    const algorithm = new Murmur3_128();
    return algorithm;
  }
}

/** Shared factory instance created at module load. */
const defaultHashFactory = new Murmur3HashFactory();
|
|
184
278
|
class IndexSegment {
|
|
185
279
|
#filename;
|
|
186
280
|
#storage;
|
|
187
281
|
#buffer = null;
|
|
188
282
|
#view = null;
|
|
283
|
+
#hashAlgorithm;
|
|
189
284
|
/**
|
|
190
|
-
*
|
|
285
|
+
* 构造函数
|
|
286
|
+
* @param filename 文件名
|
|
287
|
+
* @param storage 存储接口
|
|
288
|
+
* @param hashAlgorithm 哈希算法实例,默认为Murmur3_32
|
|
289
|
+
*/
|
|
290
|
+
constructor(filename, storage, hashAlgorithm = new Murmur3_32()) {
|
|
291
|
+
this.#filename = filename, this.#storage = storage, this.#hashAlgorithm = hashAlgorithm;
|
|
292
|
+
}
|
|
293
|
+
/**
|
|
294
|
+
* 使用当前哈希算法计算字符串哈希值
|
|
191
295
|
* @param str 要哈希的字符串
|
|
192
296
|
* @returns 32位无符号哈希值
|
|
193
297
|
*/
|
|
194
|
-
|
|
195
|
-
return
|
|
298
|
+
hash(str) {
|
|
299
|
+
return this.#hashAlgorithm.hash(str);
|
|
196
300
|
}
|
|
197
|
-
|
|
198
|
-
|
|
301
|
+
/**
|
|
302
|
+
* 设置哈希算法
|
|
303
|
+
* @param hashAlgorithm 新的哈希算法实例
|
|
304
|
+
*/
|
|
305
|
+
setHashAlgorithm(hashAlgorithm) {
|
|
306
|
+
this.#hashAlgorithm = hashAlgorithm;
|
|
199
307
|
}
|
|
200
308
|
async loadIndex() {
|
|
201
309
|
return this.#buffer ? !0 : (this.#buffer = await this.#storage.read(this.#filename), this.#buffer ? (this.#view = new DataView(this.#buffer), !0) : !1);
|
|
@@ -206,7 +314,7 @@ class IndexSegment {
|
|
|
206
314
|
const uniqueTokens = /* @__PURE__ */ new Map();
|
|
207
315
|
for (const token of doc.tokens)
|
|
208
316
|
uniqueTokens.has(token) || (uniqueTokens.set(token, !0), tokenMap.has(token) || tokenMap.set(token, {
|
|
209
|
-
hash:
|
|
317
|
+
hash: this.hash(token),
|
|
210
318
|
postings: []
|
|
211
319
|
}), tokenMap.get(token).postings.push(doc.id));
|
|
212
320
|
}
|
|
@@ -222,8 +330,8 @@ class IndexSegment {
|
|
|
222
330
|
const headerSize = 12, dictSize = entries.length * 20, postingsSize = totalPostings * 4, tokensOffset = headerSize + dictSize + postingsSize, totalSize = tokensOffset + totalTokensSize, buffer = new ArrayBuffer(totalSize), view = new DataView(buffer);
|
|
223
331
|
view.setUint32(0, 1229866072), view.setUint32(4, entries.length, !0), view.setUint32(8, tokensOffset, !0);
|
|
224
332
|
let currentDictOffset = headerSize, currentPostingsOffset = headerSize + dictSize, currentTokenOffset = tokensOffset;
|
|
225
|
-
for (const [token, { hash
|
|
226
|
-
view.setUint32(currentDictOffset,
|
|
333
|
+
for (const [token, { hash, postings }] of entries) {
|
|
334
|
+
view.setUint32(currentDictOffset, hash, !0);
|
|
227
335
|
const tokenBytes = encoder.encode(token);
|
|
228
336
|
view.setUint32(currentDictOffset + 4, tokenBytes.length, !0), view.setUint32(currentDictOffset + 8, currentTokenOffset, !0), view.setUint32(currentDictOffset + 12, currentPostingsOffset, !0), view.setUint32(currentDictOffset + 16, postings.length, !0), currentDictOffset += 20;
|
|
229
337
|
for (let i = 0; i < postings.length; i++)
|
|
@@ -236,7 +344,7 @@ class IndexSegment {
|
|
|
236
344
|
}
|
|
237
345
|
search(term) {
|
|
238
346
|
if (!this.#view || !this.#buffer) return [];
|
|
239
|
-
const h =
|
|
347
|
+
const h = this.hash(term), count = this.#view.getUint32(4, !0);
|
|
240
348
|
let left = 0, right = count - 1;
|
|
241
349
|
const headerSize = 12, entrySize = 20, decoder = new TextDecoder();
|
|
242
350
|
for (; left <= right; ) {
|
|
@@ -277,6 +385,116 @@ class IndexSegment {
|
|
|
277
385
|
return [];
|
|
278
386
|
}
|
|
279
387
|
}
|
|
388
|
+
/**
 * Inverted-index segment whose dictionary is keyed by 64-bit token hashes.
 *
 * Binary layout (unchanged from the previous body):
 *   header, 16 bytes:
 *     [0]  uint32, big-endian     magic 0x494e4458
 *     [4]  uint32, little-endian  number of dictionary entries
 *     [8]  uint32, little-endian  byte offset of the token region
 *     [12] uint32, little-endian  hash width marker (64)
 *   dictionary, 28 bytes per entry, sorted by hash and then by token:
 *     [0]  uint64 LE hash   [8]  uint32 LE token byte length
 *     [12] uint32 LE token offset   [16] uint32 LE postings offset
 *     [20] uint32 LE postings count  [24..27] unused
 *   postings region: uint32 LE document ids
 *   token region: UTF-8 token bytes, each followed by a 0 byte
 *
 * Behavioural fixes compared with the previous body:
 *  - `hash` reduces the algorithm's output to an unsigned 64-bit value, so an
 *    algorithm returning a negative or wider bigint is stored and looked up
 *    consistently;
 *  - `search` always compares the stored token with the query, so a hash
 *    collision with a term that is not indexed no longer returns another
 *    token's postings;
 *  - `loadIndex` rejects buffers whose header does not match this layout.
 */
class IndexSegment64 {
  #filename;
  #storage;
  #buffer = null;
  #view = null;
  #hashAlgorithm;

  /**
   * @param filename      Name passed to `storage.read` / `storage.write`.
   * @param storage       Object providing async `read(filename)` and `write(filename, buffer)`.
   * @param hashAlgorithm Object with `hash(str)` returning a bigint; defaults to `new Murmur3_64()`.
   */
  constructor(filename, storage, hashAlgorithm = new Murmur3_64()) {
    this.#filename = filename;
    this.#storage = storage;
    this.#hashAlgorithm = hashAlgorithm;
  }

  /**
   * Hash a string with the current algorithm.
   * @param str String to hash.
   * @returns The algorithm's result reduced to an unsigned 64-bit bigint, i.e.
   *          the value `setBigUint64` stores and `getBigUint64` reads back.
   */
  hash(str) {
    return BigInt.asUintN(64, this.#hashAlgorithm.hash(str));
  }

  /**
   * Replace the hash algorithm used by subsequent `hash` calls.
   * @param hashAlgorithm New hash algorithm instance.
   */
  setHashAlgorithm(hashAlgorithm) {
    this.#hashAlgorithm = hashAlgorithm;
  }

  /**
   * Load the segment from storage unless a buffer is already held.
   * @returns `true` when a buffer with a valid 64-bit header is available,
   *          `false` when storage returned nothing or the header does not match.
   */
  async loadIndex() {
    if (this.#buffer) return true;
    const data = await this.#storage.read(this.#filename);
    if (!data) {
      this.#buffer = null;
      return false;
    }
    const view = new DataView(data);
    const headerOk =
      data.byteLength >= 16 &&
      view.getUint32(0) === 1229866072 &&
      view.getUint32(12, true) === 64;
    if (!headerOk) {
      // Not written by buildAndSave below (for example a segment using the
      // 12-byte-header / 20-byte-entry layout); reading it with 28-byte
      // strides would return garbage or run past the buffer.
      this.#buffer = null;
      this.#view = null;
      return false;
    }
    this.#buffer = data;
    this.#view = view;
    return true;
  }

  /**
   * Build the segment from documents, write it to storage and keep it in memory.
   * @param docs Iterable of `{ id, tokens }`; each distinct token of a document
   *             adds the document id (stored as uint32) to that token's postings.
   */
  async buildAndSave(docs) {
    const HEADER_SIZE = 16;
    const ENTRY_SIZE = 28;

    // token -> { hash, postings }, with each token counted once per document.
    const tokenMap = new Map();
    for (const doc of docs) {
      for (const token of new Set(doc.tokens)) {
        let entry = tokenMap.get(token);
        if (!entry) {
          entry = { hash: this.hash(token), postings: [] };
          tokenMap.set(token, entry);
        }
        entry.postings.push(doc.id);
      }
    }

    // Encode every token once; the bytes are needed for sizing and for writing.
    const encoder = new TextEncoder();
    const records = Array.from(tokenMap, ([token, { hash, postings }]) => ({
      token,
      hash,
      postings,
      bytes: encoder.encode(token),
    }));
    records.sort((a, b) =>
      a.hash !== b.hash ? (a.hash > b.hash ? 1 : -1) : a.token.localeCompare(b.token)
    );

    let totalPostings = 0;
    let totalTokenBytes = 0;
    for (const record of records) {
      totalPostings += record.postings.length;
      totalTokenBytes += record.bytes.length + 1; // +1 for the 0 terminator
    }

    const dictSize = records.length * ENTRY_SIZE;
    const tokensOffset = HEADER_SIZE + dictSize + totalPostings * 4;
    const buffer = new ArrayBuffer(tokensOffset + totalTokenBytes);
    const view = new DataView(buffer);
    const raw = new Uint8Array(buffer);

    view.setUint32(0, 1229866072); // magic, big-endian
    view.setUint32(4, records.length, true);
    view.setUint32(8, tokensOffset, true);
    view.setUint32(12, 64, true);

    let dictCursor = HEADER_SIZE;
    let postingsCursor = HEADER_SIZE + dictSize;
    let tokenCursor = tokensOffset;
    for (const { hash, postings, bytes } of records) {
      view.setBigUint64(dictCursor, hash, true);
      view.setUint32(dictCursor + 8, bytes.length, true);
      view.setUint32(dictCursor + 12, tokenCursor, true);
      view.setUint32(dictCursor + 16, postingsCursor, true);
      view.setUint32(dictCursor + 20, postings.length, true);
      dictCursor += ENTRY_SIZE;

      for (const id of postings) {
        view.setUint32(postingsCursor, id, true);
        postingsCursor += 4;
      }

      raw.set(bytes, tokenCursor);
      tokenCursor += bytes.length;
      view.setUint8(tokenCursor++, 0);
    }

    await this.#storage.write(this.#filename, buffer);
    this.#buffer = buffer;
    this.#view = view;
  }

  /**
   * Look up the postings of a term.
   * @param term Token to search for.
   * @returns Document ids recorded for exactly this token, or `[]` when the
   *          segment is not loaded or the token is not present.
   */
  search(term) {
    const view = this.#view;
    const buffer = this.#buffer;
    if (!view || !buffer) return [];

    const HEADER_SIZE = 16;
    const ENTRY_SIZE = 28;
    const target = this.hash(term);
    const count = view.getUint32(4, true);
    const hashAt = (index) => view.getBigUint64(HEADER_SIZE + index * ENTRY_SIZE, true);

    // Lower bound: first entry whose hash is >= target.
    let lo = 0;
    let hi = count;
    while (lo < hi) {
      const mid = (lo + hi) >>> 1;
      if (hashAt(mid) < target) lo = mid + 1;
      else hi = mid;
    }

    // Walk the run of equal hashes and confirm the token itself.
    const decoder = new TextDecoder();
    for (let i = lo; i < count && hashAt(i) === target; i++) {
      const entryPos = HEADER_SIZE + i * ENTRY_SIZE;
      const tokenLen = view.getUint32(entryPos + 8, true);
      const tokenOffset = view.getUint32(entryPos + 12, true);
      const stored = decoder.decode(new Uint8Array(buffer, tokenOffset, tokenLen));
      if (stored === term) return this.#readPostings(entryPos);
    }
    return [];
  }

  /**
   * Read the postings list referenced by the dictionary entry at `entryPos`.
   * Only called from `search`, after the view has been checked.
   */
  #readPostings(entryPos) {
    const view = this.#view;
    const postingsOffset = view.getUint32(entryPos + 16, true);
    const postingsLen = view.getUint32(entryPos + 20, true);
    const result = [];
    for (let j = 0; j < postingsLen; j++) {
      result.push(view.getUint32(postingsOffset + j * 4, true));
    }
    return result;
  }
}
|
|
280
498
|
const defaultTokenize = ({ text }) => {
|
|
281
499
|
try {
|
|
282
500
|
if (typeof Intl < "u" && typeof Intl.Segmenter == "function" && typeof Array.from == "function") {
|
|
@@ -295,6 +513,9 @@ class SearchEngine {
|
|
|
295
513
|
#segments;
|
|
296
514
|
#initialized = !1;
|
|
297
515
|
#config;
|
|
516
|
+
#isHash64Bit = !0;
|
|
517
|
+
#hashAlgorithm32;
|
|
518
|
+
#hashAlgorithm64;
|
|
298
519
|
// 批处理状态
|
|
299
520
|
#inBatch = !1;
|
|
300
521
|
#pendingTokenCounts = { word: 0, char: 0 };
|
|
@@ -306,12 +527,19 @@ class SearchEngine {
|
|
|
306
527
|
minCharTokenSave: 0,
|
|
307
528
|
indexingTokenizer: config.indexingTokenizer || defaultTokenize,
|
|
308
529
|
...config
|
|
309
|
-
}, (this.#config.minWordTokenSave || 0) >= (this.#config.wordSegmentTokenThreshold || 1e5))
|
|
530
|
+
}, this.#processHashAlgorithmConfig(), (this.#config.minWordTokenSave || 0) >= (this.#config.wordSegmentTokenThreshold || 1e5))
|
|
310
531
|
throw new Error("minWordTokenSave must be less than wordSegmentTokenThreshold");
|
|
311
532
|
if ((this.#config.minCharTokenSave || 0) >= (this.#config.charSegmentTokenThreshold || 5e5))
|
|
312
533
|
throw new Error("minCharTokenSave must be less than charSegmentTokenThreshold");
|
|
313
534
|
this.#storage = config.storage, this.#meta = new MetaManager(this.#storage), this.#cache = new IntermediateCache(this.#storage), this.#segments = /* @__PURE__ */ new Map();
|
|
314
535
|
}
|
|
536
|
+
/**
|
|
537
|
+
* 处理哈希算法配置
|
|
538
|
+
*/
|
|
539
|
+
#processHashAlgorithmConfig() {
|
|
540
|
+
const hashConfig = this.#config.hashAlgorithm;
|
|
541
|
+
hashConfig === 64 ? (this.#isHash64Bit = !0, this.#hashAlgorithm64 = new Murmur3_64()) : hashConfig === 32 ? (this.#isHash64Bit = !1, this.#hashAlgorithm32 = new Murmur3_32()) : hashConfig && typeof hashConfig.hash == "function" ? typeof hashConfig.hash("test") == "bigint" ? (this.#isHash64Bit = !0, this.#hashAlgorithm64 = hashConfig) : (this.#isHash64Bit = !1, this.#hashAlgorithm32 = hashConfig) : (this.#isHash64Bit = !0, this.#hashAlgorithm64 = new Murmur3_64());
|
|
542
|
+
}
|
|
315
543
|
/**
|
|
316
544
|
* 开启批处理
|
|
317
545
|
* 批处理期间 addDocuments 只写入缓存,不触发索引段构建
|
|
@@ -397,7 +625,7 @@ class SearchEngine {
|
|
|
397
625
|
const segmentsMeta = this.#meta.getSegments(type);
|
|
398
626
|
for (const meta of segmentsMeta) {
|
|
399
627
|
const filename = meta.filename;
|
|
400
|
-
!this.#segments.has(filename) && !segmentsToLoad.has(filename) && segmentsToLoad.set(filename, new IndexSegment(filename, this.#storage));
|
|
628
|
+
!this.#segments.has(filename) && !segmentsToLoad.has(filename) && (this.#isHash64Bit ? segmentsToLoad.set(filename, new IndexSegment64(filename, this.#storage, this.#hashAlgorithm64)) : segmentsToLoad.set(filename, new IndexSegment(filename, this.#storage, this.#hashAlgorithm32)));
|
|
401
629
|
}
|
|
402
630
|
};
|
|
403
631
|
collectSegments("word"), collectSegments("char"), await Promise.all(
|
|
@@ -466,7 +694,7 @@ class SearchEngine {
|
|
|
466
694
|
...this.#meta.getSegments("char")
|
|
467
695
|
];
|
|
468
696
|
for (const seg of allSegments)
|
|
469
|
-
this.#segments.has(seg.filename) || this.#segments.set(seg.filename, new IndexSegment(seg.filename, this.#storage)), await this.#segments.get(seg.filename).loadIndex();
|
|
697
|
+
this.#segments.has(seg.filename) || (this.#isHash64Bit ? this.#segments.set(seg.filename, new IndexSegment64(seg.filename, this.#storage, this.#hashAlgorithm64)) : this.#segments.set(seg.filename, new IndexSegment(seg.filename, this.#storage, this.#hashAlgorithm32))), await this.#segments.get(seg.filename).loadIndex();
|
|
470
698
|
this.#initialized = !0;
|
|
471
699
|
}
|
|
472
700
|
#getIndexingTokens(doc) {
|
|
@@ -498,7 +726,7 @@ class SearchEngine {
|
|
|
498
726
|
}
|
|
499
727
|
const docsToBuild = await this.#cache.readRange(cacheFilename, startOffset, currentCacheSize);
|
|
500
728
|
let segment = this.#segments.get(targetSegmentName);
|
|
501
|
-
segment || (segment = new IndexSegment(targetSegmentName, this.#storage), this.#segments.set(targetSegmentName, segment)), await segment.buildAndSave(docsToBuild), this.#meta.updateSegment(type, targetSegmentName, startOffset, currentCacheSize, newTokenCountTotal, isNew);
|
|
729
|
+
segment || (this.#isHash64Bit ? segment = new IndexSegment64(targetSegmentName, this.#storage, this.#hashAlgorithm64) : segment = new IndexSegment(targetSegmentName, this.#storage, this.#hashAlgorithm32), this.#segments.set(targetSegmentName, segment)), await segment.buildAndSave(docsToBuild), this.#meta.updateSegment(type, targetSegmentName, startOffset, currentCacheSize, newTokenCountTotal, isNew);
|
|
502
730
|
}
|
|
503
731
|
}
|
|
504
|
-
exports.SearchEngine = SearchEngine, exports.
|
|
732
|
+
exports.Murmur3HashFactory = Murmur3HashFactory, exports.Murmur3_128 = Murmur3_128, exports.Murmur3_32 = Murmur3_32, exports.Murmur3_64 = Murmur3_64, exports.SearchEngine = SearchEngine, exports.defaultHashFactory = defaultHashFactory, exports.murmur3_32 = murmur3_32, exports.murmur3_64 = murmur3_64;
|
package/lib/core.d.ts
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import { ISearchEngine, ISearchEngineOption, IDocument, IDocumentBase, IResult, ISearchEngineStatus } from './type';
|
|
1
|
+
import { ISearchEngine, ISearchEngineOption, IDocument, IDocumentBase, IResult, ISearchEngineStatus, IHashAlgorithm32, IHashAlgorithm64, IHashAlgorithm128, IHashAlgorithmFactory } from './type';
|
|
2
2
|
|
|
3
3
|
/**
|
|
4
4
|
* 核心搜索引擎类 (多实例支持)
|
|
@@ -52,4 +52,90 @@ declare class SearchEngine implements ISearchEngine {
|
|
|
52
52
|
*/
|
|
53
53
|
declare function murmur3_32(str: string, h?: number): number;
|
|
54
54
|
|
|
55
|
-
|
|
55
|
+
/**
|
|
56
|
+
* MurmurHash3, 64-bit variant.
|
|
57
|
+
* Fast non-cryptographic hash function, suited to hash tables and similar data structures.
|
|
58
|
+
*/
|
|
59
|
+
/**
|
|
60
|
+
* Computes the 64-bit MurmurHash3 hash of a string.
|
|
61
|
+
* @param str The string to hash.
|
|
62
|
+
* @param seed Seed value; defaults to 0x12345678.
|
|
63
|
+
* @returns The unsigned 64-bit hash, as a bigint.
|
|
64
|
+
*/
|
|
65
|
+
declare function murmur3_64(str: string, seed?: number): bigint;
|
|
66
|
+
|
|
67
|
+
/**
|
|
68
|
+
* Murmur3 32-bit hash algorithm implementation class.
|
|
69
|
+
*/
|
|
70
|
+
declare class Murmur3_32 implements IHashAlgorithm32 {
|
|
71
|
+
private seed;
|
|
72
|
+
/**
|
|
73
|
+
* Constructor.
|
|
74
|
+
* @param seed Seed value; defaults to 0x12345678.
|
|
75
|
+
*/
|
|
76
|
+
constructor(seed?: number);
|
|
77
|
+
/**
|
|
78
|
+
* Computes the 32-bit hash of a string.
|
|
79
|
+
* @param str Input string.
|
|
80
|
+
* @returns The 32-bit hash, as a number.
|
|
81
|
+
*/
|
|
82
|
+
hash(str: string): number;
|
|
83
|
+
}
|
|
84
|
+
/**
|
|
85
|
+
* Murmur3 64-bit hash algorithm implementation class.
|
|
86
|
+
*/
|
|
87
|
+
declare class Murmur3_64 implements IHashAlgorithm64 {
|
|
88
|
+
private seed;
|
|
89
|
+
/**
|
|
90
|
+
* Constructor.
|
|
91
|
+
* @param seed Seed value; defaults to 0x12345678.
|
|
92
|
+
*/
|
|
93
|
+
constructor(seed?: number);
|
|
94
|
+
/**
|
|
95
|
+
* Computes the 64-bit hash of a string.
|
|
96
|
+
* @param str Input string.
|
|
97
|
+
* @returns The 64-bit hash, as a bigint.
|
|
98
|
+
*/
|
|
99
|
+
hash(str: string): bigint;
|
|
100
|
+
}
|
|
101
|
+
/**
|
|
102
|
+
* Murmur3 128-bit hash algorithm implementation class.
|
|
103
|
+
* Note: the project currently has no real 128-bit implementation; this is only a placeholder.
|
|
104
|
+
*/
|
|
105
|
+
declare class Murmur3_128 implements IHashAlgorithm128 {
|
|
106
|
+
private seed;
|
|
107
|
+
/**
|
|
108
|
+
* Constructor.
|
|
109
|
+
* @param seed Seed value; defaults to 0x12345678.
|
|
110
|
+
*/
|
|
111
|
+
constructor(seed?: number);
|
|
112
|
+
/**
|
|
113
|
+
* Computes the 128-bit hash of a string.
|
|
114
|
+
* @param str Input string.
|
|
115
|
+
* @returns The 128-bit hash, as a bigint.
|
|
116
|
+
*/
|
|
117
|
+
hash(str: string): bigint;
|
|
118
|
+
}
|
|
119
|
+
/**
|
|
120
|
+
* Factory class for the Murmur3 hash algorithms.
|
|
121
|
+
*/
|
|
122
|
+
declare class Murmur3HashFactory implements IHashAlgorithmFactory {
|
|
123
|
+
/**
|
|
124
|
+
* Creates a 32-bit hash algorithm instance.
|
|
125
|
+
* @returns A 32-bit hash algorithm instance.
|
|
126
|
+
*/
|
|
127
|
+
create32(): IHashAlgorithm32;
|
|
128
|
+
/**
|
|
129
|
+
* Creates a 64-bit hash algorithm instance.
|
|
130
|
+
* @returns A 64-bit hash algorithm instance.
|
|
131
|
+
*/
|
|
132
|
+
create64(): IHashAlgorithm64;
|
|
133
|
+
/**
|
|
134
|
+
* Creates a 128-bit hash algorithm instance.
|
|
135
|
+
* @returns A 128-bit hash algorithm instance.
|
|
136
|
+
*/
|
|
137
|
+
create128(): IHashAlgorithm128;
|
|
138
|
+
}
|
|
139
|
+
declare const defaultHashFactory: Murmur3HashFactory;
|
|
140
|
+
|
|
141
|
+
export { Murmur3HashFactory, Murmur3_128, Murmur3_32, Murmur3_64, SearchEngine, defaultHashFactory, murmur3_32, murmur3_64 };
|
package/lib/core.js
CHANGED
|
@@ -180,21 +180,129 @@ function murmur3_32(str, h = 305419896) {
|
|
|
180
180
|
const remainder = len & 3;
|
|
181
181
|
return remainder > 0 && (remainder >= 3 && (k1 ^= (str.charCodeAt(i + 2) & 255) << 16), remainder >= 2 && (k1 ^= (str.charCodeAt(i + 1) & 255) << 8), remainder >= 1 && (k1 ^= str.charCodeAt(i) & 255), k1 = (k1 & 65535) * 3432918353 + (((k1 >>> 16) * 3432918353 & 65535) << 16) & 4294967295, k1 = k1 << 15 | k1 >>> 17, k1 = (k1 & 65535) * 461845907 + (((k1 >>> 16) * 461845907 & 65535) << 16) & 4294967295, h ^= k1), h ^= len, h ^= h >>> 16, h = (h & 65535) * 2246822507 + (((h >>> 16) * 2246822507 & 65535) << 16) & 4294967295, h ^= h >>> 13, h = (h & 65535) * 3266489909 + (((h >>> 16) * 3266489909 & 65535) << 16) & 4294967295, h ^= h >>> 16, h >>> 0;
|
|
182
182
|
}
|
|
183
|
+
/**
 * 64-bit MurmurHash3 of a string.
 *
 * Runs MurmurHash3_x64_128 over the UTF-8 encoding of `str` and returns the
 * first 64-bit word of the result.
 *
 * Differences from the previous body (hash values therefore change, so any
 * index written with the earlier 64-bit output has to be rebuilt):
 *  - every 16-byte block is consumed; the old loop compared a character index
 *    against a block count and stopped after the first block, so most of a
 *    long input never reached the hash;
 *  - the input is hashed as UTF-8 bytes instead of the low byte of each UTF-16
 *    code unit, so characters that share a low byte no longer collide;
 *  - every multiplication in the finalizer is reduced to 64 bits, and the
 *    second lane takes part in the returned word.
 *
 * @param str  The string to hash.
 * @param seed Seed value, coerced to an unsigned 32-bit integer; defaults to 0x12345678.
 * @returns Unsigned 64-bit hash as a bigint in the range [0, 2^64).
 */
function murmur3_64(str, seed = 305419896) {
  const MASK64 = 0xffffffffffffffffn;
  const C1 = 0x87c37b91114253d5n;
  const C2 = 0x4cf5ad432745937fn;

  // 64-bit left rotation; `x` must already be reduced to 64 bits.
  const rotl = (x, r) => ((x << r) | (x >> (64n - r))) & MASK64;
  // Per-lane scrambling applied to each 8-byte word before it is folded in.
  const mixK1 = (k) => (rotl((k * C1) & MASK64, 31n) * C2) & MASK64;
  const mixK2 = (k) => (rotl((k * C2) & MASK64, 33n) * C1) & MASK64;
  // Final avalanche step.
  const fmix = (k) => {
    k ^= k >> 33n;
    k = (k * 0xff51afd7ed558ccdn) & MASK64;
    k ^= k >> 33n;
    k = (k * 0xc4ceb9fe1a85ec53n) & MASK64;
    return k ^ (k >> 33n);
  };

  const bytes = new TextEncoder().encode(str);
  const len = bytes.length;

  // Little-endian read of `count` (1..8) bytes starting at `start`.
  const readLE = (start, count) => {
    let v = 0n;
    for (let j = count - 1; j >= 0; j--) {
      v = (v << 8n) | BigInt(bytes[start + j]);
    }
    return v;
  };

  let h1 = BigInt(seed >>> 0);
  let h2 = h1;

  // Body: whole 16-byte blocks, two 8-byte lanes each.
  const blockEnd = len - (len % 16);
  for (let off = 0; off < blockEnd; off += 16) {
    h1 ^= mixK1(readLE(off, 8));
    h1 = (rotl(h1, 27n) + h2) & MASK64;
    h1 = (h1 * 5n + 0x52dce729n) & MASK64;

    h2 ^= mixK2(readLE(off + 8, 8));
    h2 = (rotl(h2, 31n) + h1) & MASK64;
    h2 = (h2 * 5n + 0x38495ab5n) & MASK64;
  }

  // Tail: the remaining 0..15 bytes; bytes 8..14 feed lane 2, bytes 0..7 lane 1.
  const tail = len - blockEnd;
  if (tail > 8) h2 ^= mixK2(readLE(blockEnd + 8, tail - 8));
  if (tail > 0) h1 ^= mixK1(readLE(blockEnd, Math.min(tail, 8)));

  // Finalization.
  const n = BigInt(len);
  h1 ^= n;
  h2 ^= n;
  h1 = (h1 + h2) & MASK64;
  h2 = (h2 + h1) & MASK64;
  h1 = fmix(h1);
  h2 = fmix(h2);
  return (h1 + h2) & MASK64;
}
|
|
198
|
+
/**
 * Seeded wrapper around `murmur3_32`, exposing it through a `hash(str)` method.
 */
class Murmur3_32 {
  /** Seed passed to `murmur3_32` on every `hash` call. */
  seed;

  /**
   * @param seed Seed value; defaults to 0x12345678.
   */
  constructor(seed = 0x12345678) {
    this.seed = seed;
  }

  /**
   * Hash a string with the stored seed.
   * @param str Input string.
   * @returns Unsigned 32-bit hash as a number.
   */
  hash(str) {
    const { seed } = this;
    return murmur3_32(str, seed);
  }
}
|
|
216
|
+
/**
 * Seeded wrapper around `murmur3_64`, exposing it through a `hash(str)` method.
 */
class Murmur3_64 {
  /** Seed passed to `murmur3_64` on every `hash` call. */
  seed;

  /**
   * @param seed Seed value; defaults to 0x12345678.
   */
  constructor(seed = 0x12345678) {
    this.seed = seed;
  }

  /**
   * Hash a string with the stored seed.
   * @param str Input string.
   * @returns 64-bit hash as a bigint.
   */
  hash(str) {
    const { seed } = this;
    return murmur3_64(str, seed);
  }
}
|
|
234
|
+
/**
 * 128-bit hash assembled from two `murmur3_64` calls.
 *
 * This is not the MurmurHash3 x64_128 digest: the low 64 bits are the hash of
 * `str` under the stored seed, the high 64 bits are the hash of `str + str`
 * under a derived seed.
 */
class Murmur3_128 {
  /** Seed used for the low half; the high half uses `seed ^ 0x5a5a5a5a`. */
  seed;

  /**
   * @param seed Seed value; defaults to 0x12345678.
   */
  constructor(seed = 0x12345678) {
    this.seed = seed;
  }

  /**
   * Hash a string into a 128-bit value.
   * @param str Input string.
   * @returns bigint whose low 64 bits come from `str` and whose high 64 bits
   *          come from the doubled string under the derived seed.
   */
  hash(str) {
    const lowHalf = murmur3_64(str, this.seed);
    const derivedSeed = this.seed ^ 0x5a5a5a5a;
    const highHalf = murmur3_64(str + str, derivedSeed);
    return (highHalf << 64n) | lowHalf;
  }
}
|
|
253
|
+
/**
 * Factory producing the Murmur3-based hash algorithm objects, each constructed
 * with its default seed.
 */
class Murmur3HashFactory {
  /**
   * @returns A new 32-bit hash algorithm instance.
   */
  create32() {
    const algorithm = new Murmur3_32();
    return algorithm;
  }

  /**
   * @returns A new 64-bit hash algorithm instance.
   */
  create64() {
    const algorithm = new Murmur3_64();
    return algorithm;
  }

  /**
   * @returns A new 128-bit hash algorithm instance.
   */
  create128() {
    const algorithm = new Murmur3_128();
    return algorithm;
  }
}

/** Shared factory instance created at module load. */
const defaultHashFactory = new Murmur3HashFactory();
|
|
183
277
|
class IndexSegment {
|
|
184
278
|
#filename;
|
|
185
279
|
#storage;
|
|
186
280
|
#buffer = null;
|
|
187
281
|
#view = null;
|
|
282
|
+
#hashAlgorithm;
|
|
188
283
|
/**
|
|
189
|
-
*
|
|
284
|
+
* 构造函数
|
|
285
|
+
* @param filename 文件名
|
|
286
|
+
* @param storage 存储接口
|
|
287
|
+
* @param hashAlgorithm 哈希算法实例,默认为Murmur3_32
|
|
288
|
+
*/
|
|
289
|
+
constructor(filename, storage, hashAlgorithm = new Murmur3_32()) {
|
|
290
|
+
this.#filename = filename, this.#storage = storage, this.#hashAlgorithm = hashAlgorithm;
|
|
291
|
+
}
|
|
292
|
+
/**
|
|
293
|
+
* 使用当前哈希算法计算字符串哈希值
|
|
190
294
|
* @param str 要哈希的字符串
|
|
191
295
|
* @returns 32位无符号哈希值
|
|
192
296
|
*/
|
|
193
|
-
|
|
194
|
-
return
|
|
297
|
+
hash(str) {
|
|
298
|
+
return this.#hashAlgorithm.hash(str);
|
|
195
299
|
}
|
|
196
|
-
|
|
197
|
-
|
|
300
|
+
/**
|
|
301
|
+
* 设置哈希算法
|
|
302
|
+
* @param hashAlgorithm 新的哈希算法实例
|
|
303
|
+
*/
|
|
304
|
+
setHashAlgorithm(hashAlgorithm) {
|
|
305
|
+
this.#hashAlgorithm = hashAlgorithm;
|
|
198
306
|
}
|
|
199
307
|
async loadIndex() {
|
|
200
308
|
return this.#buffer ? !0 : (this.#buffer = await this.#storage.read(this.#filename), this.#buffer ? (this.#view = new DataView(this.#buffer), !0) : !1);
|
|
@@ -205,7 +313,7 @@ class IndexSegment {
|
|
|
205
313
|
const uniqueTokens = /* @__PURE__ */ new Map();
|
|
206
314
|
for (const token of doc.tokens)
|
|
207
315
|
uniqueTokens.has(token) || (uniqueTokens.set(token, !0), tokenMap.has(token) || tokenMap.set(token, {
|
|
208
|
-
hash:
|
|
316
|
+
hash: this.hash(token),
|
|
209
317
|
postings: []
|
|
210
318
|
}), tokenMap.get(token).postings.push(doc.id));
|
|
211
319
|
}
|
|
@@ -221,8 +329,8 @@ class IndexSegment {
|
|
|
221
329
|
const headerSize = 12, dictSize = entries.length * 20, postingsSize = totalPostings * 4, tokensOffset = headerSize + dictSize + postingsSize, totalSize = tokensOffset + totalTokensSize, buffer = new ArrayBuffer(totalSize), view = new DataView(buffer);
|
|
222
330
|
view.setUint32(0, 1229866072), view.setUint32(4, entries.length, !0), view.setUint32(8, tokensOffset, !0);
|
|
223
331
|
let currentDictOffset = headerSize, currentPostingsOffset = headerSize + dictSize, currentTokenOffset = tokensOffset;
|
|
224
|
-
for (const [token, { hash
|
|
225
|
-
view.setUint32(currentDictOffset,
|
|
332
|
+
for (const [token, { hash, postings }] of entries) {
|
|
333
|
+
view.setUint32(currentDictOffset, hash, !0);
|
|
226
334
|
const tokenBytes = encoder.encode(token);
|
|
227
335
|
view.setUint32(currentDictOffset + 4, tokenBytes.length, !0), view.setUint32(currentDictOffset + 8, currentTokenOffset, !0), view.setUint32(currentDictOffset + 12, currentPostingsOffset, !0), view.setUint32(currentDictOffset + 16, postings.length, !0), currentDictOffset += 20;
|
|
228
336
|
for (let i = 0; i < postings.length; i++)
|
|
@@ -235,7 +343,7 @@ class IndexSegment {
|
|
|
235
343
|
}
|
|
236
344
|
search(term) {
|
|
237
345
|
if (!this.#view || !this.#buffer) return [];
|
|
238
|
-
const h =
|
|
346
|
+
const h = this.hash(term), count = this.#view.getUint32(4, !0);
|
|
239
347
|
let left = 0, right = count - 1;
|
|
240
348
|
const headerSize = 12, entrySize = 20, decoder = new TextDecoder();
|
|
241
349
|
for (; left <= right; ) {
|
|
@@ -276,6 +384,116 @@ class IndexSegment {
|
|
|
276
384
|
return [];
|
|
277
385
|
}
|
|
278
386
|
}
|
|
387
|
+
/**
 * Inverted-index segment whose dictionary is keyed by 64-bit token hashes.
 *
 * Binary layout produced by `buildAndSave` (all integers little-endian
 * unless noted):
 *
 *   header (16 bytes)
 *     +0   u32  magic 0x494E4458 ("INDX"), written big-endian
 *     +4   u32  number of dictionary entries
 *     +8   u32  byte offset of the token text area
 *     +12  u32  hash width in bits (always 64 for this class)
 *   dictionary (28 bytes per entry, sorted by hash)
 *     +0   u64  token hash
 *     +8   u32  token length in UTF-8 bytes
 *     +12  u32  byte offset of the token text
 *     +16  u32  byte offset of the postings list
 *     +20  u32  number of postings
 *     +24  ...  4 bytes left unused (zero)
 *   postings   u32 document ids, one list per entry
 *   tokens     UTF-8 bytes of each token followed by a NUL byte
 */
class IndexSegment64 {
  #filename;
  #storage;
  #buffer = null;
  #view = null;
  #hashAlgorithm;

  /**
   * @param filename Name of the segment file inside `storage`.
   * @param storage Storage backend exposing async `read(filename)` and `write(filename, buffer)`.
   * @param hashAlgorithm Hash algorithm instance; defaults to `Murmur3_64`.
   */
  constructor(filename, storage, hashAlgorithm = new Murmur3_64()) {
    this.#filename = filename;
    this.#storage = storage;
    this.#hashAlgorithm = hashAlgorithm;
  }

  /**
   * Hashes a string with the currently configured algorithm.
   *
   * @param str String to hash.
   * @returns The raw value returned by the algorithm (expected to be a bigint).
   */
  hash(str) {
    return this.#hashAlgorithm.hash(str);
  }

  /**
   * Replaces the hash algorithm used for subsequent builds and searches.
   *
   * @param hashAlgorithm New hash algorithm instance.
   */
  setHashAlgorithm(hashAlgorithm) {
    this.#hashAlgorithm = hashAlgorithm;
  }

  /**
   * Returns the hash reduced to an unsigned 64-bit bigint.
   *
   * `DataView.setBigUint64` silently wraps values modulo 2^64, so the value
   * used for sorting and for lookups must be wrapped the same way; otherwise
   * the stored order and the compared value disagree for wider hashes.
   */
  #hash64(str) {
    return BigInt.asUintN(64, BigInt(this.hash(str)));
  }

  /**
   * Loads the segment from storage unless a buffer is already held.
   *
   * The header is validated against what `buildAndSave` writes (size, magic,
   * and the 64-bit width marker). A file in another layout - for example a
   * segment written with the 32-bit dictionary format - is rejected instead
   * of being misread.
   *
   * @returns `true` when an index is available, `false` when the file is
   *          missing or is not a 64-bit segment.
   */
  async loadIndex() {
    if (this.#buffer) return true;
    const loaded = await this.#storage.read(this.#filename);
    if (!loaded) {
      this.#buffer = null;
      return false;
    }
    const view = new DataView(loaded);
    const headerSize = 16;
    const isSegment64 =
      loaded.byteLength >= headerSize &&
      view.getUint32(0) === 1229866072 &&
      view.getUint32(12, true) === 64;
    if (!isSegment64) return false;
    this.#buffer = loaded;
    this.#view = view;
    return true;
  }

  /**
   * Builds the segment from tokenized documents, writes it to storage and
   * keeps the resulting buffer for searching.
   *
   * @param docs Iterable of `{ id, tokens }`; a token repeated inside one
   *             document contributes a single posting.
   */
  async buildAndSave(docs) {
    const tokenMap = new Map();
    for (const doc of docs) {
      const seen = new Set();
      for (const token of doc.tokens) {
        if (seen.has(token)) continue;
        seen.add(token);
        let entry = tokenMap.get(token);
        if (!entry) {
          entry = { hash: this.#hash64(token), postings: [] };
          tokenMap.set(token, entry);
        }
        entry.postings.push(doc.id);
      }
    }

    // Sort by hash so search can binary-search; ties are ordered by token text.
    const entries = Array.from(tokenMap.entries());
    entries.sort(([a, ea], [b, eb]) => {
      if (ea.hash !== eb.hash) return ea.hash > eb.hash ? 1 : -1;
      return a.localeCompare(b);
    });

    // Encode every token once and size the postings and token areas.
    const encoder = new TextEncoder();
    let totalPostings = 0;
    let totalTokensSize = 0;
    const encodedTokens = entries.map(([token, { postings }]) => {
      const tokenBytes = encoder.encode(token);
      totalPostings += postings.length;
      totalTokensSize += tokenBytes.length + 1;
      return tokenBytes;
    });

    const headerSize = 16;
    const entrySize = 28;
    const dictSize = entries.length * entrySize;
    const tokensOffset = headerSize + dictSize + totalPostings * 4;
    const buffer = new ArrayBuffer(tokensOffset + totalTokensSize);
    const view = new DataView(buffer);
    const bytes = new Uint8Array(buffer);

    view.setUint32(0, 1229866072);
    view.setUint32(4, entries.length, true);
    view.setUint32(8, tokensOffset, true);
    view.setUint32(12, 64, true);

    let dictPos = headerSize;
    let postingsPos = headerSize + dictSize;
    let tokenPos = tokensOffset;
    entries.forEach(([, { hash, postings }], index) => {
      const tokenBytes = encodedTokens[index];
      view.setBigUint64(dictPos, hash, true);
      view.setUint32(dictPos + 8, tokenBytes.length, true);
      view.setUint32(dictPos + 12, tokenPos, true);
      view.setUint32(dictPos + 16, postingsPos, true);
      view.setUint32(dictPos + 20, postings.length, true);
      dictPos += entrySize;
      for (const docId of postings) {
        view.setUint32(postingsPos, docId, true);
        postingsPos += 4;
      }
      bytes.set(tokenBytes, tokenPos);
      // The buffer is zero-filled, so skipping one byte leaves the NUL terminator.
      tokenPos += tokenBytes.length + 1;
    });

    await this.#storage.write(this.#filename, buffer);
    this.#buffer = buffer;
    this.#view = view;
  }

  /**
   * Looks up the documents containing `term`.
   *
   * A hash match alone is never trusted: the stored token bytes must equal
   * the UTF-8 encoding of `term`, so a term that merely collides with an
   * indexed token yields no results.
   *
   * @param term Token to look up.
   * @returns Document ids for the term, or an empty array when the index is
   *          not loaded or the term is absent.
   */
  search(term) {
    const view = this.#view;
    const buffer = this.#buffer;
    if (!view || !buffer) return [];

    const headerSize = 16;
    const entrySize = 28;
    const target = this.#hash64(term);
    const count = view.getUint32(4, true);

    // Lower bound: first dictionary entry whose hash is >= target.
    let low = 0;
    let high = count;
    while (low < high) {
      const mid = (low + high) >>> 1;
      if (view.getBigUint64(headerSize + mid * entrySize, true) < target) low = mid + 1;
      else high = mid;
    }

    // Walk the run of equal hashes and confirm the token bytes.
    const termBytes = new TextEncoder().encode(term);
    for (let i = low; i < count; i++) {
      const pos = headerSize + i * entrySize;
      if (view.getBigUint64(pos, true) !== target) break;
      const tokenLen = view.getUint32(pos + 8, true);
      if (tokenLen !== termBytes.length) continue;
      const stored = new Uint8Array(buffer, view.getUint32(pos + 12, true), tokenLen);
      if (!stored.every((byte, k) => byte === termBytes[k])) continue;

      const postingsOffset = view.getUint32(pos + 16, true);
      const postingsLen = view.getUint32(pos + 20, true);
      const result = [];
      for (let j = 0; j < postingsLen; j++)
        result.push(view.getUint32(postingsOffset + j * 4, true));
      return result;
    }
    return [];
  }
}
|
|
279
497
|
const defaultTokenize = ({ text }) => {
|
|
280
498
|
try {
|
|
281
499
|
if (typeof Intl < "u" && typeof Intl.Segmenter == "function" && typeof Array.from == "function") {
|
|
@@ -294,6 +512,9 @@ class SearchEngine {
|
|
|
294
512
|
#segments;
|
|
295
513
|
#initialized = !1;
|
|
296
514
|
#config;
|
|
515
|
+
#isHash64Bit = !0;
|
|
516
|
+
#hashAlgorithm32;
|
|
517
|
+
#hashAlgorithm64;
|
|
297
518
|
// 批处理状态
|
|
298
519
|
#inBatch = !1;
|
|
299
520
|
#pendingTokenCounts = { word: 0, char: 0 };
|
|
@@ -305,12 +526,19 @@ class SearchEngine {
|
|
|
305
526
|
minCharTokenSave: 0,
|
|
306
527
|
indexingTokenizer: config.indexingTokenizer || defaultTokenize,
|
|
307
528
|
...config
|
|
308
|
-
}, (this.#config.minWordTokenSave || 0) >= (this.#config.wordSegmentTokenThreshold || 1e5))
|
|
529
|
+
}, this.#processHashAlgorithmConfig(), (this.#config.minWordTokenSave || 0) >= (this.#config.wordSegmentTokenThreshold || 1e5))
|
|
309
530
|
throw new Error("minWordTokenSave must be less than wordSegmentTokenThreshold");
|
|
310
531
|
if ((this.#config.minCharTokenSave || 0) >= (this.#config.charSegmentTokenThreshold || 5e5))
|
|
311
532
|
throw new Error("minCharTokenSave must be less than charSegmentTokenThreshold");
|
|
312
533
|
this.#storage = config.storage, this.#meta = new MetaManager(this.#storage), this.#cache = new IntermediateCache(this.#storage), this.#segments = /* @__PURE__ */ new Map();
|
|
313
534
|
}
|
|
535
|
+
/**
|
|
536
|
+
* 处理哈希算法配置
|
|
537
|
+
*/
|
|
538
|
+
#processHashAlgorithmConfig() {
|
|
539
|
+
const hashConfig = this.#config.hashAlgorithm;
|
|
540
|
+
hashConfig === 64 ? (this.#isHash64Bit = !0, this.#hashAlgorithm64 = new Murmur3_64()) : hashConfig === 32 ? (this.#isHash64Bit = !1, this.#hashAlgorithm32 = new Murmur3_32()) : hashConfig && typeof hashConfig.hash == "function" ? typeof hashConfig.hash("test") == "bigint" ? (this.#isHash64Bit = !0, this.#hashAlgorithm64 = hashConfig) : (this.#isHash64Bit = !1, this.#hashAlgorithm32 = hashConfig) : (this.#isHash64Bit = !0, this.#hashAlgorithm64 = new Murmur3_64());
|
|
541
|
+
}
|
|
314
542
|
/**
|
|
315
543
|
* 开启批处理
|
|
316
544
|
* 批处理期间 addDocuments 只写入缓存,不触发索引段构建
|
|
@@ -396,7 +624,7 @@ class SearchEngine {
|
|
|
396
624
|
const segmentsMeta = this.#meta.getSegments(type);
|
|
397
625
|
for (const meta of segmentsMeta) {
|
|
398
626
|
const filename = meta.filename;
|
|
399
|
-
!this.#segments.has(filename) && !segmentsToLoad.has(filename) && segmentsToLoad.set(filename, new IndexSegment(filename, this.#storage));
|
|
627
|
+
!this.#segments.has(filename) && !segmentsToLoad.has(filename) && (this.#isHash64Bit ? segmentsToLoad.set(filename, new IndexSegment64(filename, this.#storage, this.#hashAlgorithm64)) : segmentsToLoad.set(filename, new IndexSegment(filename, this.#storage, this.#hashAlgorithm32)));
|
|
400
628
|
}
|
|
401
629
|
};
|
|
402
630
|
collectSegments("word"), collectSegments("char"), await Promise.all(
|
|
@@ -465,7 +693,7 @@ class SearchEngine {
|
|
|
465
693
|
...this.#meta.getSegments("char")
|
|
466
694
|
];
|
|
467
695
|
for (const seg of allSegments)
|
|
468
|
-
this.#segments.has(seg.filename) || this.#segments.set(seg.filename, new IndexSegment(seg.filename, this.#storage)), await this.#segments.get(seg.filename).loadIndex();
|
|
696
|
+
this.#segments.has(seg.filename) || (this.#isHash64Bit ? this.#segments.set(seg.filename, new IndexSegment64(seg.filename, this.#storage, this.#hashAlgorithm64)) : this.#segments.set(seg.filename, new IndexSegment(seg.filename, this.#storage, this.#hashAlgorithm32))), await this.#segments.get(seg.filename).loadIndex();
|
|
469
697
|
this.#initialized = !0;
|
|
470
698
|
}
|
|
471
699
|
#getIndexingTokens(doc) {
|
|
@@ -497,11 +725,16 @@ class SearchEngine {
|
|
|
497
725
|
}
|
|
498
726
|
const docsToBuild = await this.#cache.readRange(cacheFilename, startOffset, currentCacheSize);
|
|
499
727
|
let segment = this.#segments.get(targetSegmentName);
|
|
500
|
-
segment || (segment = new IndexSegment(targetSegmentName, this.#storage), this.#segments.set(targetSegmentName, segment)), await segment.buildAndSave(docsToBuild), this.#meta.updateSegment(type, targetSegmentName, startOffset, currentCacheSize, newTokenCountTotal, isNew);
|
|
728
|
+
segment || (this.#isHash64Bit ? segment = new IndexSegment64(targetSegmentName, this.#storage, this.#hashAlgorithm64) : segment = new IndexSegment(targetSegmentName, this.#storage, this.#hashAlgorithm32), this.#segments.set(targetSegmentName, segment)), await segment.buildAndSave(docsToBuild), this.#meta.updateSegment(type, targetSegmentName, startOffset, currentCacheSize, newTokenCountTotal, isNew);
|
|
501
729
|
}
|
|
502
730
|
}
|
|
503
731
|
export {
|
|
732
|
+
Murmur3HashFactory,
|
|
733
|
+
Murmur3_128,
|
|
734
|
+
Murmur3_32,
|
|
735
|
+
Murmur3_64,
|
|
504
736
|
SearchEngine,
|
|
505
|
-
|
|
506
|
-
murmur3_32
|
|
737
|
+
defaultHashFactory,
|
|
738
|
+
murmur3_32,
|
|
739
|
+
murmur3_64
|
|
507
740
|
};
|
package/lib/type.d.ts
CHANGED
|
@@ -61,6 +61,35 @@ interface IStorage {
|
|
|
61
61
|
getFileSize(filename: string): Promise<number>;
|
|
62
62
|
}
|
|
63
63
|
|
|
64
|
+
/**
 * Contract for a string hash function whose result has type `T`.
 */
interface IHashAlgorithm<T> {
    /**
     * Computes the hash of a string.
     * @param str Input string.
     * @returns The hash value.
     */
    hash(str: string): T;
}
|
|
72
|
+
/**
 * 32-bit hash algorithm: `hash` returns a `number`.
 */
interface IHashAlgorithm32 extends IHashAlgorithm<number> {
}
|
|
74
|
+
/**
 * 64-bit hash algorithm: `hash` returns a `bigint`.
 */
interface IHashAlgorithm64 extends IHashAlgorithm<bigint> {
}
|
|
76
|
+
/**
 * 128-bit hash algorithm: `hash` returns a `bigint`.
 *
 * Structurally identical to `IHashAlgorithm64` (both are
 * `IHashAlgorithm<bigint>` with no extra members), so the type checker
 * cannot tell the two apart.
 */
interface IHashAlgorithm128 extends IHashAlgorithm<bigint> {
}
|
|
78
|
+
/**
 * Factory producing hash algorithm instances of each supported width.
 */
interface IHashAlgorithmFactory {
    /**
     * Creates a 32-bit hash algorithm instance.
     */
    create32(): IHashAlgorithm32;
    /**
     * Creates a 64-bit hash algorithm instance.
     */
    create64(): IHashAlgorithm64;
    /**
     * Creates a 128-bit hash algorithm instance.
     */
    create128(): IHashAlgorithm128;
}
|
|
92
|
+
|
|
64
93
|
/**
|
|
65
94
|
* 索引类型
|
|
66
95
|
*/
|
|
@@ -93,6 +122,15 @@ interface ISearchEngineOption {
|
|
|
93
122
|
* - 影响: 直接决定搜索匹配的范围和结果的相关性
|
|
94
123
|
*/
|
|
95
124
|
searchTokenizer?: SearchTokenizer;
|
|
125
|
+
/**
|
|
126
|
+
* 哈希算法配置 (可选)
|
|
127
|
+
* - 32: 使用默认32位哈希算法
|
|
128
|
+
* - 64: 使用默认64位哈希算法
|
|
129
|
+
* - IHashAlgorithm32: 使用自定义32位哈希算法
|
|
130
|
+
* - IHashAlgorithm64: 使用自定义64位哈希算法
|
|
131
|
+
* - undefined: 默认使用64位哈希算法
|
|
132
|
+
*/
|
|
133
|
+
hashAlgorithm?: 32 | 64 | IHashAlgorithm32 | IHashAlgorithm64;
|
|
96
134
|
/**
|
|
97
135
|
* 词索引分段阈值 (Token数) - 分段算法配置
|
|
98
136
|
* - 作用: 控制词索引文件的大小,超过阈值时创建新的索引段
|
|
@@ -173,4 +211,38 @@ interface ISearchEngine {
|
|
|
173
211
|
hasDocument(id: number): Promise<boolean>;
|
|
174
212
|
}
|
|
175
213
|
|
|
176
|
-
|
|
214
|
+
/**
 * Index segment contract: the methods shared by `IndexSegment` and
 * `IndexSegment64`.
 */
interface IIndexSegment {
    /**
     * Hashes a string with the segment's current hash algorithm.
     * @param str String to hash.
     * @returns The hash value (`number` or `bigint`).
     */
    hash(str: string): number | bigint;
    /**
     * Sets the hash algorithm.
     * @param hashAlgorithm New hash algorithm instance.
     */
    setHashAlgorithm(hashAlgorithm: IHashAlgorithm32 | IHashAlgorithm64): void;
    /**
     * Loads the index.
     * @returns Whether the index was loaded successfully.
     */
    loadIndex(): Promise<boolean>;
    /**
     * Builds the index and saves it.
     * @param docs Documents to index.
     */
    buildAndSave(docs: ITokenizedDoc[]): Promise<void>;
    /**
     * Searches the index.
     * @param term Search term.
     * @returns Array of matching document IDs.
     */
    search(term: string): number[];
}
|
|
247
|
+
|
|
248
|
+
export type { IDocument, IDocumentBase, IHashAlgorithm, IHashAlgorithm128, IHashAlgorithm32, IHashAlgorithm64, IHashAlgorithmFactory, IIndexMeta, IIndexSegment, IResult, ISearchEngine, ISearchEngineOption, ISearchEngineStatus, ISegmentMeta, IStorage, ITokenizedDoc, IndexType, IndexingTokenizer, SearchTokenizer };
|