gs-search 0.1.7 → 0.1.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/lib/core.cjs CHANGED
@@ -181,21 +181,129 @@ function murmur3_32(str, h = 305419896) {
181
181
  const remainder = len & 3;
182
182
  return remainder > 0 && (remainder >= 3 && (k1 ^= (str.charCodeAt(i + 2) & 255) << 16), remainder >= 2 && (k1 ^= (str.charCodeAt(i + 1) & 255) << 8), remainder >= 1 && (k1 ^= str.charCodeAt(i) & 255), k1 = (k1 & 65535) * 3432918353 + (((k1 >>> 16) * 3432918353 & 65535) << 16) & 4294967295, k1 = k1 << 15 | k1 >>> 17, k1 = (k1 & 65535) * 461845907 + (((k1 >>> 16) * 461845907 & 65535) << 16) & 4294967295, h ^= k1), h ^= len, h ^= h >>> 16, h = (h & 65535) * 2246822507 + (((h >>> 16) * 2246822507 & 65535) << 16) & 4294967295, h ^= h >>> 13, h = (h & 65535) * 3266489909 + (((h >>> 16) * 3266489909 & 65535) << 16) & 4294967295, h ^= h >>> 16, h >>> 0;
183
183
  }
184
/**
 * Computes a 64-bit MurmurHash3-style hash of a string.
 *
 * The input is consumed in 16-character blocks (two 8-byte little-endian
 * lanes); a final partial block is zero-padded. Only the low 8 bits of each
 * UTF-16 code unit are hashed (`charCodeAt(i) & 255`).
 *
 * Fix: the block loop used to be bounded by `len >> 3` (a count of 8-char
 * blocks) while advancing 16 characters per pass, so for inputs of 24 or more
 * characters most characters after the first 16 never reached the hash.
 * The loop now runs while at least 8 characters remain, which covers the
 * whole input and leaves the result for every input of length <= 23
 * bit-for-bit unchanged. Hashes of longer inputs change, so persisted data
 * keyed by such hashes must be rebuilt.
 *
 * The finalisation is intentionally kept as it was (unmasked multiplications,
 * result = low 64 bits of the h1 lane) so existing short-token hashes stay
 * stable. The h2 lane only influences the result through `h1 + h2`.
 *
 * @param str  String to hash.
 * @param seed Integer seed, default 0x12345678. It is passed to BigInt(), so
 *             a non-integer number throws a RangeError.
 * @returns Hash value as a bigint in the range [0, 2^64).
 */
function murmur3_64(str, seed = 305419896) {
  const MOD = 2n ** 64n;
  const c1 = 0x87c37b91114253d5n, c2 = 0x4cf5ad432745937fn;
  const len = str.length;
  let h1 = BigInt(seed), h2 = BigInt(seed);

  // Little-endian 8-byte word starting at `pos`; positions past the end of
  // the string read as 0 because `NaN & 255 === 0`.
  const readWord = (pos) => {
    let word = 0n;
    for (let b = 0; b < 8; b++)
      word |= BigInt(str.charCodeAt(pos + b) & 255) << BigInt(8 * b);
    return word;
  };
  // Per-lane scrambling: multiply, rotate left (31 or 33), multiply.
  const mixK1 = (k) => {
    k = k * c1 % MOD;
    k = (k << 31n | k >> 33n) % MOD;
    return k * c2 % MOD;
  };
  const mixK2 = (k) => {
    k = k * c2 % MOD;
    k = (k << 33n | k >> 31n) % MOD;
    return k * c1 % MOD;
  };

  let i = 0;
  // One pass per 16 characters; a pass is taken whenever >= 8 characters remain.
  for (; len - i >= 8; i += 16) {
    h1 ^= mixK1(readWord(i));
    h1 = (h1 << 27n | h1 >> 37n) % MOD;
    h1 = (h1 + h2) % MOD;
    h1 = (h1 * 5n + 0x52dce729n) % MOD;

    h2 ^= mixK2(readWord(i + 8));
    h2 = (h2 << 31n | h2 >> 33n) % MOD;
    h2 = (h2 + h1) % MOD;
    h2 = (h2 * 5n + 0x38495ab5n) % MOD;
  }

  // Tail: up to 7 characters starting at `i`. Bytes 0..3 go to k1, bytes 4..6
  // to k2, each at bit offset 8 * index (kept exactly as before). When the
  // last block pass overshot the string, these reads are all 0 and the tail
  // is a no-op.
  const remainder = len & 7;
  if (remainder > 0) {
    let k1 = 0n, k2 = 0n;
    for (let b = 0; b < remainder; b++) {
      const bits = BigInt(str.charCodeAt(i + b) & 255) << BigInt(8 * b);
      if (b < 4) k1 ^= bits;
      else k2 ^= bits;
    }
    h1 ^= mixK1(k1);
    h2 ^= mixK2(k2);
  }

  // Finalisation (only h1 reaches the output).
  h1 ^= BigInt(len);
  h2 ^= BigInt(len);
  h1 = (h1 + h2) % MOD;
  h1 = (h1 ^ h1 >> 33n) * 0xff51afd7ed558ccdn;
  h1 = (h1 ^ h1 >> 33n) * 0xc4ceb9fe1a85ec53n;
  h1 = h1 ^ h1 >> 33n;
  return h1 & 0xffffffffffffffffn;
}
199
/**
 * 32-bit hasher object: binds a seed to the murmur3_32 function.
 */
class Murmur3_32 {
  /** Seed handed to murmur3_32 on every hash() call. */
  seed;

  /**
   * @param seed Seed value; defaults to 0x12345678 (305419896).
   */
  constructor(seed = 305419896) {
    this.seed = seed;
  }

  /**
   * Hashes a string with this instance's seed.
   * @param str Input string.
   * @returns The value murmur3_32 produces for (str, seed).
   */
  hash(str) {
    const { seed } = this;
    return murmur3_32(str, seed);
  }
}
217
/**
 * 64-bit hasher object: binds a seed to the murmur3_64 function.
 */
class Murmur3_64 {
  /** Seed handed to murmur3_64 on every hash() call. */
  seed;

  /**
   * @param seed Seed value; defaults to 0x12345678 (305419896).
   */
  constructor(seed = 305419896) {
    this.seed = seed;
  }

  /**
   * Hashes a string with this instance's seed.
   * @param str Input string.
   * @returns The value murmur3_64 produces for (str, seed).
   */
  hash(str) {
    const { seed } = this;
    return murmur3_64(str, seed);
  }
}
235
/**
 * 128-bit hasher assembled from two murmur3_64 calls.
 */
class Murmur3_128 {
  /** Seed used for the low half; the high half uses `seed ^ 1515870810`. */
  seed;

  /**
   * @param seed Seed value; defaults to 0x12345678 (305419896).
   */
  constructor(seed = 305419896) {
    this.seed = seed;
  }

  /**
   * Builds a 128-bit value from two 64-bit hashes.
   *
   * Low 64 bits: murmur3_64(str, seed).
   * High 64 bits: murmur3_64(str + str, seed ^ 1515870810).
   *
   * @param str Input string.
   * @returns The combined value as a bigint.
   */
  hash(str) {
    const lowHalf = murmur3_64(str, this.seed);
    const highHalf = murmur3_64(str + str, this.seed ^ 1515870810);
    return highHalf << 64n | lowHalf;
  }
}
254
/**
 * Factory for the built-in Murmur3 hashers. Every call returns a new
 * instance constructed with that class's default seed.
 */
class Murmur3HashFactory {
  /**
   * @returns A new Murmur3_32 instance.
   */
  create32() {
    const hasher = new Murmur3_32();
    return hasher;
  }

  /**
   * @returns A new Murmur3_64 instance.
   */
  create64() {
    const hasher = new Murmur3_64();
    return hasher;
  }

  /**
   * @returns A new Murmur3_128 instance.
   */
  create128() {
    const hasher = new Murmur3_128();
    return hasher;
  }
}

/** Module-level factory instance. */
const defaultHashFactory = new Murmur3HashFactory();
184
278
  class IndexSegment {
185
279
  #filename;
186
280
  #storage;
187
281
  #buffer = null;
188
282
  #view = null;
283
+ #hashAlgorithm;
189
284
  /**
190
- * 使用MurmurHash3计算字符串哈希值
285
+ * 构造函数
286
+ * @param filename 文件名
287
+ * @param storage 存储接口
288
+ * @param hashAlgorithm 哈希算法实例,默认为Murmur3_32
289
+ */
290
+ constructor(filename, storage, hashAlgorithm = new Murmur3_32()) {
291
+ this.#filename = filename, this.#storage = storage, this.#hashAlgorithm = hashAlgorithm;
292
+ }
293
+ /**
294
+ * 使用当前哈希算法计算字符串哈希值
191
295
  * @param str 要哈希的字符串
192
296
  * @returns 32位无符号哈希值
193
297
  */
194
- static hash(str) {
195
- return murmur3_32(str);
298
+ hash(str) {
299
+ return this.#hashAlgorithm.hash(str);
196
300
  }
197
- constructor(filename, storage) {
198
- this.#filename = filename, this.#storage = storage;
301
+ /**
302
+ * 设置哈希算法
303
+ * @param hashAlgorithm 新的哈希算法实例
304
+ */
305
+ setHashAlgorithm(hashAlgorithm) {
306
+ this.#hashAlgorithm = hashAlgorithm;
199
307
  }
200
308
  async loadIndex() {
201
309
  return this.#buffer ? !0 : (this.#buffer = await this.#storage.read(this.#filename), this.#buffer ? (this.#view = new DataView(this.#buffer), !0) : !1);
@@ -206,7 +314,7 @@ class IndexSegment {
206
314
  const uniqueTokens = /* @__PURE__ */ new Map();
207
315
  for (const token of doc.tokens)
208
316
  uniqueTokens.has(token) || (uniqueTokens.set(token, !0), tokenMap.has(token) || tokenMap.set(token, {
209
- hash: IndexSegment.hash(token),
317
+ hash: this.hash(token),
210
318
  postings: []
211
319
  }), tokenMap.get(token).postings.push(doc.id));
212
320
  }
@@ -222,8 +330,8 @@ class IndexSegment {
222
330
  const headerSize = 12, dictSize = entries.length * 20, postingsSize = totalPostings * 4, tokensOffset = headerSize + dictSize + postingsSize, totalSize = tokensOffset + totalTokensSize, buffer = new ArrayBuffer(totalSize), view = new DataView(buffer);
223
331
  view.setUint32(0, 1229866072), view.setUint32(4, entries.length, !0), view.setUint32(8, tokensOffset, !0);
224
332
  let currentDictOffset = headerSize, currentPostingsOffset = headerSize + dictSize, currentTokenOffset = tokensOffset;
225
- for (const [token, { hash: hash2, postings }] of entries) {
226
- view.setUint32(currentDictOffset, hash2, !0);
333
+ for (const [token, { hash, postings }] of entries) {
334
+ view.setUint32(currentDictOffset, hash, !0);
227
335
  const tokenBytes = encoder.encode(token);
228
336
  view.setUint32(currentDictOffset + 4, tokenBytes.length, !0), view.setUint32(currentDictOffset + 8, currentTokenOffset, !0), view.setUint32(currentDictOffset + 12, currentPostingsOffset, !0), view.setUint32(currentDictOffset + 16, postings.length, !0), currentDictOffset += 20;
229
337
  for (let i = 0; i < postings.length; i++)
@@ -236,7 +344,7 @@ class IndexSegment {
236
344
  }
237
345
  search(term) {
238
346
  if (!this.#view || !this.#buffer) return [];
239
- const h = IndexSegment.hash(term), count = this.#view.getUint32(4, !0);
347
+ const h = this.hash(term), count = this.#view.getUint32(4, !0);
240
348
  let left = 0, right = count - 1;
241
349
  const headerSize = 12, entrySize = 20, decoder = new TextDecoder();
242
350
  for (; left <= right; ) {
@@ -277,6 +385,116 @@ class IndexSegment {
277
385
  return [];
278
386
  }
279
387
  }
388
/**
 * Inverted-index segment whose dictionary is keyed by 64-bit token hashes.
 *
 * Binary layout produced by buildAndSave (little-endian unless noted):
 *   header, 16 bytes:
 *     u32 magic 1229866072 (big-endian, ASCII "INDX") | u32 entry count |
 *     u32 offset of the token area | u32 hash width (64)
 *   dictionary, 28 bytes per token, ordered by hash then token:
 *     u64 hash | u32 token byte length | u32 token offset |
 *     u32 postings offset | u32 postings count | 4 bytes left as zero
 *   postings area: u32 document ids
 *   token area: UTF-8 token bytes, each followed by one 0 byte
 *
 * Fix: hashes are now reduced to 64 bits in hash(). Previously a hasher that
 * returned a wider (or negative) bigint was sorted and searched by its full
 * value, while setBigUint64 stored only the low 64 bits, so search() compared
 * incompatible values and never found anything. Hashers that already return
 * values in [0, 2^64) behave exactly as before and the file format is
 * unchanged.
 */
class IndexSegment64 {
  #filename;
  #storage;
  #buffer = null;
  #view = null;
  #hashAlgorithm;

  /**
   * @param filename      Name passed to the storage backend for read/write.
   * @param storage       Object providing async `read(filename)` and
   *                      `write(filename, buffer)`.
   * @param hashAlgorithm Object with `hash(str)`; defaults to a new Murmur3_64.
   */
  constructor(filename, storage, hashAlgorithm = new Murmur3_64()) {
    this.#filename = filename;
    this.#storage = storage;
    this.#hashAlgorithm = hashAlgorithm;
  }

  /**
   * Hashes a string with the configured algorithm.
   * @param str String to hash.
   * @returns The hash reduced to an unsigned 64-bit bigint. A non-bigint
   *          result is returned untouched (buildAndSave will then fail in
   *          setBigUint64, as it did before).
   */
  hash(str) {
    const raw = this.#hashAlgorithm.hash(str);
    return typeof raw === "bigint" ? BigInt.asUintN(64, raw) : raw;
  }

  /**
   * Replaces the hash algorithm used by subsequent hash() calls.
   * Data already built or loaded is not re-hashed.
   * @param hashAlgorithm New object with `hash(str)`.
   */
  setHashAlgorithm(hashAlgorithm) {
    this.#hashAlgorithm = hashAlgorithm;
  }

  /**
   * Loads the segment from storage unless a buffer is already held.
   * @returns true when a buffer is available afterwards, false otherwise.
   */
  async loadIndex() {
    if (this.#buffer) return true;
    this.#buffer = await this.#storage.read(this.#filename);
    if (!this.#buffer) return false;
    this.#view = new DataView(this.#buffer);
    return true;
  }

  /**
   * Builds the segment from documents, writes it to storage and keeps it in
   * memory for searching.
   * @param docs Iterable of objects with `id` (stored as u32) and `tokens`
   *             (iterable of strings); repeated tokens within one document
   *             are counted once.
   */
  async buildAndSave(docs) {
    // token -> { hash, postings }
    const tokenMap = new Map();
    for (const doc of docs) {
      const seenInDoc = new Set();
      for (const token of doc.tokens) {
        if (seenInDoc.has(token)) continue;
        seenInDoc.add(token);
        let slot = tokenMap.get(token);
        if (!slot) {
          slot = { hash: this.hash(token), postings: [] };
          tokenMap.set(token, slot);
        }
        slot.postings.push(doc.id);
      }
    }

    // Encode every token once; the bytes are needed for sizing and writing.
    const encoder = new TextEncoder();
    const entries = Array.from(tokenMap, ([token, { hash, postings }]) => ({
      token,
      hash,
      postings,
      bytes: encoder.encode(token)
    }));
    entries.sort((a, b) => a.hash !== b.hash ? (a.hash > b.hash ? 1 : -1) : a.token.localeCompare(b.token));

    let totalPostings = 0, totalTokensSize = 0;
    for (const entry of entries) {
      totalPostings += entry.postings.length;
      totalTokensSize += entry.bytes.length + 1; // +1 for the 0 terminator
    }

    const headerSize = 16, entrySize = 28;
    const dictSize = entries.length * entrySize;
    const tokensOffset = headerSize + dictSize + totalPostings * 4;
    const buffer = new ArrayBuffer(tokensOffset + totalTokensSize);
    const view = new DataView(buffer);
    const bytesOut = new Uint8Array(buffer);

    view.setUint32(0, 1229866072); // magic, written big-endian
    view.setUint32(4, entries.length, true);
    view.setUint32(8, tokensOffset, true);
    view.setUint32(12, 64, true);

    let dictPos = headerSize, postingsPos = headerSize + dictSize, tokenPos = tokensOffset;
    for (const { hash, postings, bytes } of entries) {
      view.setBigUint64(dictPos, hash, true);
      view.setUint32(dictPos + 8, bytes.length, true);
      view.setUint32(dictPos + 12, tokenPos, true);
      view.setUint32(dictPos + 16, postingsPos, true);
      view.setUint32(dictPos + 20, postings.length, true);
      dictPos += entrySize;

      for (const docId of postings) {
        view.setUint32(postingsPos, docId, true);
        postingsPos += 4;
      }

      bytesOut.set(bytes, tokenPos);
      tokenPos += bytes.length;
      view.setUint8(tokenPos++, 0);
    }

    await this.#storage.write(this.#filename, buffer);
    this.#buffer = buffer;
    this.#view = view;
  }

  /**
   * Looks up a term by binary search over the hash-ordered dictionary.
   *
   * When exactly one entry carries the term's hash, its postings are returned
   * without comparing the stored token text. Only when neighbouring entries
   * share the hash are the token bytes decoded and compared.
   *
   * NOTE(review): the header's magic and hash-width fields are not checked
   * here; a buffer in a different layout would be read as if it were this one.
   *
   * @param term Token to look up.
   * @returns Array of document ids; empty when nothing is loaded or no entry
   *          matches.
   */
  search(term) {
    const view = this.#view;
    if (!view || !this.#buffer) return [];

    const headerSize = 16, entrySize = 28;
    const h = this.hash(term);
    const count = view.getUint32(4, true);
    const entryPosOf = (index) => headerSize + index * entrySize;
    const hashAt = (index) => view.getBigUint64(entryPosOf(index), true);

    let left = 0, right = count - 1;
    while (left <= right) {
      const mid = left + right >>> 1;
      const entryHash = hashAt(mid);
      if (entryHash < h) {
        left = mid + 1;
        continue;
      }
      if (entryHash > h) {
        right = mid - 1;
        continue;
      }

      const sharedHash = mid > 0 && hashAt(mid - 1) === h || mid < count - 1 && hashAt(mid + 1) === h;
      if (!sharedHash) return this.#readPostings(entryPosOf(mid));

      // Several tokens share this hash: rewind to the first, then compare text.
      let firstMatch = mid;
      while (firstMatch > 0 && hashAt(firstMatch - 1) === h) firstMatch--;

      const decoder = new TextDecoder();
      for (let i = firstMatch; i < count && hashAt(i) === h; i++) {
        const entryPos = entryPosOf(i);
        const tokenLen = view.getUint32(entryPos + 8, true);
        const tokenOffset = view.getUint32(entryPos + 12, true);
        const stored = decoder.decode(new Uint8Array(this.#buffer, tokenOffset, tokenLen));
        if (stored === term) return this.#readPostings(entryPos);
      }
      return [];
    }
    return [];
  }

  /** Reads the postings list referenced by the dictionary entry at `entryPos`. */
  #readPostings(entryPos) {
    const view = this.#view;
    const postingsOffset = view.getUint32(entryPos + 16, true);
    const postingsLen = view.getUint32(entryPos + 20, true);
    const result = [];
    for (let j = 0; j < postingsLen; j++)
      result.push(view.getUint32(postingsOffset + j * 4, true));
    return result;
  }
}
280
498
  const defaultTokenize = ({ text }) => {
281
499
  try {
282
500
  if (typeof Intl < "u" && typeof Intl.Segmenter == "function" && typeof Array.from == "function") {
@@ -295,6 +513,9 @@ class SearchEngine {
295
513
  #segments;
296
514
  #initialized = !1;
297
515
  #config;
516
+ #isHash64Bit = !0;
517
+ #hashAlgorithm32;
518
+ #hashAlgorithm64;
298
519
  // 批处理状态
299
520
  #inBatch = !1;
300
521
  #pendingTokenCounts = { word: 0, char: 0 };
@@ -306,12 +527,19 @@ class SearchEngine {
306
527
  minCharTokenSave: 0,
307
528
  indexingTokenizer: config.indexingTokenizer || defaultTokenize,
308
529
  ...config
309
- }, (this.#config.minWordTokenSave || 0) >= (this.#config.wordSegmentTokenThreshold || 1e5))
530
+ }, this.#processHashAlgorithmConfig(), (this.#config.minWordTokenSave || 0) >= (this.#config.wordSegmentTokenThreshold || 1e5))
310
531
  throw new Error("minWordTokenSave must be less than wordSegmentTokenThreshold");
311
532
  if ((this.#config.minCharTokenSave || 0) >= (this.#config.charSegmentTokenThreshold || 5e5))
312
533
  throw new Error("minCharTokenSave must be less than charSegmentTokenThreshold");
313
534
  this.#storage = config.storage, this.#meta = new MetaManager(this.#storage), this.#cache = new IntermediateCache(this.#storage), this.#segments = /* @__PURE__ */ new Map();
314
535
  }
536
+ /**
537
+ * 处理哈希算法配置
538
+ */
539
+ #processHashAlgorithmConfig() {
540
+ const hashConfig = this.#config.hashAlgorithm;
541
+ hashConfig === 64 ? (this.#isHash64Bit = !0, this.#hashAlgorithm64 = new Murmur3_64()) : hashConfig === 32 ? (this.#isHash64Bit = !1, this.#hashAlgorithm32 = new Murmur3_32()) : hashConfig && typeof hashConfig.hash == "function" ? typeof hashConfig.hash("test") == "bigint" ? (this.#isHash64Bit = !0, this.#hashAlgorithm64 = hashConfig) : (this.#isHash64Bit = !1, this.#hashAlgorithm32 = hashConfig) : (this.#isHash64Bit = !0, this.#hashAlgorithm64 = new Murmur3_64());
542
+ }
315
543
  /**
316
544
  * 开启批处理
317
545
  * 批处理期间 addDocuments 只写入缓存,不触发索引段构建
@@ -397,7 +625,7 @@ class SearchEngine {
397
625
  const segmentsMeta = this.#meta.getSegments(type);
398
626
  for (const meta of segmentsMeta) {
399
627
  const filename = meta.filename;
400
- !this.#segments.has(filename) && !segmentsToLoad.has(filename) && segmentsToLoad.set(filename, new IndexSegment(filename, this.#storage));
628
+ !this.#segments.has(filename) && !segmentsToLoad.has(filename) && (this.#isHash64Bit ? segmentsToLoad.set(filename, new IndexSegment64(filename, this.#storage, this.#hashAlgorithm64)) : segmentsToLoad.set(filename, new IndexSegment(filename, this.#storage, this.#hashAlgorithm32)));
401
629
  }
402
630
  };
403
631
  collectSegments("word"), collectSegments("char"), await Promise.all(
@@ -466,7 +694,7 @@ class SearchEngine {
466
694
  ...this.#meta.getSegments("char")
467
695
  ];
468
696
  for (const seg of allSegments)
469
- this.#segments.has(seg.filename) || this.#segments.set(seg.filename, new IndexSegment(seg.filename, this.#storage)), await this.#segments.get(seg.filename).loadIndex();
697
+ this.#segments.has(seg.filename) || (this.#isHash64Bit ? this.#segments.set(seg.filename, new IndexSegment64(seg.filename, this.#storage, this.#hashAlgorithm64)) : this.#segments.set(seg.filename, new IndexSegment(seg.filename, this.#storage, this.#hashAlgorithm32))), await this.#segments.get(seg.filename).loadIndex();
470
698
  this.#initialized = !0;
471
699
  }
472
700
  #getIndexingTokens(doc) {
@@ -498,7 +726,7 @@ class SearchEngine {
498
726
  }
499
727
  const docsToBuild = await this.#cache.readRange(cacheFilename, startOffset, currentCacheSize);
500
728
  let segment = this.#segments.get(targetSegmentName);
501
- segment || (segment = new IndexSegment(targetSegmentName, this.#storage), this.#segments.set(targetSegmentName, segment)), await segment.buildAndSave(docsToBuild), this.#meta.updateSegment(type, targetSegmentName, startOffset, currentCacheSize, newTokenCountTotal, isNew);
729
+ segment || (this.#isHash64Bit ? segment = new IndexSegment64(targetSegmentName, this.#storage, this.#hashAlgorithm64) : segment = new IndexSegment(targetSegmentName, this.#storage, this.#hashAlgorithm32), this.#segments.set(targetSegmentName, segment)), await segment.buildAndSave(docsToBuild), this.#meta.updateSegment(type, targetSegmentName, startOffset, currentCacheSize, newTokenCountTotal, isNew);
502
730
  }
503
731
  }
504
- exports.SearchEngine = SearchEngine, exports.hash = murmur3_32, exports.murmur3_32 = murmur3_32;
732
// CommonJS export table, one plain `exports.<name> = <binding>` assignment
// per statement.
// NOTE(review): the previous release also exported `hash` (an alias of
// murmur3_32); it is not exported here. Confirm the removal is intended.
exports.Murmur3HashFactory = Murmur3HashFactory;
exports.Murmur3_128 = Murmur3_128;
exports.Murmur3_32 = Murmur3_32;
exports.Murmur3_64 = Murmur3_64;
exports.SearchEngine = SearchEngine;
exports.defaultHashFactory = defaultHashFactory;
exports.murmur3_32 = murmur3_32;
exports.murmur3_64 = murmur3_64;
package/lib/core.d.ts CHANGED
@@ -1,4 +1,4 @@
1
- import { ISearchEngine, ISearchEngineOption, IDocument, IDocumentBase, IResult, ISearchEngineStatus } from './type';
1
+ import { ISearchEngine, ISearchEngineOption, IDocument, IDocumentBase, IResult, ISearchEngineStatus, IHashAlgorithm32, IHashAlgorithm64, IHashAlgorithm128, IHashAlgorithmFactory } from './type';
2
2
 
3
3
  /**
4
4
  * 核心搜索引擎类 (多实例支持)
@@ -52,4 +52,90 @@ declare class SearchEngine implements ISearchEngine {
52
52
  */
53
53
  declare function murmur3_32(str: string, h?: number): number;
54
54
 
55
- export { SearchEngine, murmur3_32 as hash, murmur3_32 };
55
/**
 * 64-bit MurmurHash3 implementation: an efficient, non-cryptographic hash
 * function suited to hash tables and similar data structures.
 *
 * @param str String to hash.
 * @param seed Seed value; defaults to 0x12345678.
 * @returns Unsigned 64-bit hash value as a bigint.
 */
declare function murmur3_64(str: string, seed?: number): bigint;
66
+
67
/**
 * Murmur3 32-bit hash algorithm class.
 */
declare class Murmur3_32 implements IHashAlgorithm32 {
    private seed;
    /**
     * Creates a hasher.
     * @param seed Seed value; defaults to 0x12345678.
     */
    constructor(seed?: number);
    /**
     * Computes the 32-bit hash of a string.
     * @param str Input string.
     * @returns 32-bit hash value as a number.
     */
    hash(str: string): number;
}
84
/**
 * Murmur3 64-bit hash algorithm class.
 */
declare class Murmur3_64 implements IHashAlgorithm64 {
    private seed;
    /**
     * Creates a hasher.
     * @param seed Seed value; defaults to 0x12345678.
     */
    constructor(seed?: number);
    /**
     * Computes the 64-bit hash of a string.
     * @param str Input string.
     * @returns 64-bit hash value as a bigint.
     */
    hash(str: string): bigint;
}
101
/**
 * Murmur3 128-bit hash algorithm class.
 * Note: the project currently has no true 128-bit implementation; this class
 * is only a placeholder.
 */
declare class Murmur3_128 implements IHashAlgorithm128 {
    private seed;
    /**
     * Creates a hasher.
     * @param seed Seed value; defaults to 0x12345678.
     */
    constructor(seed?: number);
    /**
     * Computes the 128-bit hash of a string.
     * @param str Input string.
     * @returns 128-bit hash value as a bigint.
     */
    hash(str: string): bigint;
}
119
/**
 * Factory for the Murmur3 hash algorithm classes.
 */
declare class Murmur3HashFactory implements IHashAlgorithmFactory {
    /**
     * Creates a 32-bit hash algorithm instance.
     * @returns The 32-bit hash algorithm instance.
     */
    create32(): IHashAlgorithm32;
    /**
     * Creates a 64-bit hash algorithm instance.
     * @returns The 64-bit hash algorithm instance.
     */
    create64(): IHashAlgorithm64;
    /**
     * Creates a 128-bit hash algorithm instance.
     * @returns The 128-bit hash algorithm instance.
     */
    create128(): IHashAlgorithm128;
}
/** Module-level factory instance. */
declare const defaultHashFactory: Murmur3HashFactory;
140
+
141
+ export { Murmur3HashFactory, Murmur3_128, Murmur3_32, Murmur3_64, SearchEngine, defaultHashFactory, murmur3_32, murmur3_64 };
package/lib/core.js CHANGED
@@ -180,21 +180,129 @@ function murmur3_32(str, h = 305419896) {
180
180
  const remainder = len & 3;
181
181
  return remainder > 0 && (remainder >= 3 && (k1 ^= (str.charCodeAt(i + 2) & 255) << 16), remainder >= 2 && (k1 ^= (str.charCodeAt(i + 1) & 255) << 8), remainder >= 1 && (k1 ^= str.charCodeAt(i) & 255), k1 = (k1 & 65535) * 3432918353 + (((k1 >>> 16) * 3432918353 & 65535) << 16) & 4294967295, k1 = k1 << 15 | k1 >>> 17, k1 = (k1 & 65535) * 461845907 + (((k1 >>> 16) * 461845907 & 65535) << 16) & 4294967295, h ^= k1), h ^= len, h ^= h >>> 16, h = (h & 65535) * 2246822507 + (((h >>> 16) * 2246822507 & 65535) << 16) & 4294967295, h ^= h >>> 13, h = (h & 65535) * 3266489909 + (((h >>> 16) * 3266489909 & 65535) << 16) & 4294967295, h ^= h >>> 16, h >>> 0;
182
182
  }
183
/**
 * Computes a 64-bit MurmurHash3-style hash of a string.
 *
 * The input is consumed in 16-character blocks (two 8-byte little-endian
 * lanes); a final partial block is zero-padded. Only the low 8 bits of each
 * UTF-16 code unit are hashed (`charCodeAt(i) & 255`).
 *
 * Fix: the block loop used to be bounded by `len >> 3` (a count of 8-char
 * blocks) while advancing 16 characters per pass, so for inputs of 24 or more
 * characters most characters after the first 16 never reached the hash.
 * The loop now runs while at least 8 characters remain, which covers the
 * whole input and leaves the result for every input of length <= 23
 * bit-for-bit unchanged. Hashes of longer inputs change, so persisted data
 * keyed by such hashes must be rebuilt.
 *
 * The finalisation is intentionally kept as it was (unmasked multiplications,
 * result = low 64 bits of the h1 lane) so existing short-token hashes stay
 * stable. The h2 lane only influences the result through `h1 + h2`.
 *
 * @param str  String to hash.
 * @param seed Integer seed, default 0x12345678. It is passed to BigInt(), so
 *             a non-integer number throws a RangeError.
 * @returns Hash value as a bigint in the range [0, 2^64).
 */
function murmur3_64(str, seed = 305419896) {
  const MOD = 2n ** 64n;
  const c1 = 0x87c37b91114253d5n, c2 = 0x4cf5ad432745937fn;
  const len = str.length;
  let h1 = BigInt(seed), h2 = BigInt(seed);

  // Little-endian 8-byte word starting at `pos`; positions past the end of
  // the string read as 0 because `NaN & 255 === 0`.
  const readWord = (pos) => {
    let word = 0n;
    for (let b = 0; b < 8; b++)
      word |= BigInt(str.charCodeAt(pos + b) & 255) << BigInt(8 * b);
    return word;
  };
  // Per-lane scrambling: multiply, rotate left (31 or 33), multiply.
  const mixK1 = (k) => {
    k = k * c1 % MOD;
    k = (k << 31n | k >> 33n) % MOD;
    return k * c2 % MOD;
  };
  const mixK2 = (k) => {
    k = k * c2 % MOD;
    k = (k << 33n | k >> 31n) % MOD;
    return k * c1 % MOD;
  };

  let i = 0;
  // One pass per 16 characters; a pass is taken whenever >= 8 characters remain.
  for (; len - i >= 8; i += 16) {
    h1 ^= mixK1(readWord(i));
    h1 = (h1 << 27n | h1 >> 37n) % MOD;
    h1 = (h1 + h2) % MOD;
    h1 = (h1 * 5n + 0x52dce729n) % MOD;

    h2 ^= mixK2(readWord(i + 8));
    h2 = (h2 << 31n | h2 >> 33n) % MOD;
    h2 = (h2 + h1) % MOD;
    h2 = (h2 * 5n + 0x38495ab5n) % MOD;
  }

  // Tail: up to 7 characters starting at `i`. Bytes 0..3 go to k1, bytes 4..6
  // to k2, each at bit offset 8 * index (kept exactly as before). When the
  // last block pass overshot the string, these reads are all 0 and the tail
  // is a no-op.
  const remainder = len & 7;
  if (remainder > 0) {
    let k1 = 0n, k2 = 0n;
    for (let b = 0; b < remainder; b++) {
      const bits = BigInt(str.charCodeAt(i + b) & 255) << BigInt(8 * b);
      if (b < 4) k1 ^= bits;
      else k2 ^= bits;
    }
    h1 ^= mixK1(k1);
    h2 ^= mixK2(k2);
  }

  // Finalisation (only h1 reaches the output).
  h1 ^= BigInt(len);
  h2 ^= BigInt(len);
  h1 = (h1 + h2) % MOD;
  h1 = (h1 ^ h1 >> 33n) * 0xff51afd7ed558ccdn;
  h1 = (h1 ^ h1 >> 33n) * 0xc4ceb9fe1a85ec53n;
  h1 = h1 ^ h1 >> 33n;
  return h1 & 0xffffffffffffffffn;
}
198
/**
 * 32-bit hasher object: binds a seed to the murmur3_32 function.
 */
class Murmur3_32 {
  /** Seed handed to murmur3_32 on every hash() call. */
  seed;

  /**
   * @param seed Seed value; defaults to 0x12345678 (305419896).
   */
  constructor(seed = 305419896) {
    this.seed = seed;
  }

  /**
   * Hashes a string with this instance's seed.
   * @param str Input string.
   * @returns The value murmur3_32 produces for (str, seed).
   */
  hash(str) {
    const { seed } = this;
    return murmur3_32(str, seed);
  }
}
216
/**
 * 64-bit hasher object: binds a seed to the murmur3_64 function.
 */
class Murmur3_64 {
  /** Seed handed to murmur3_64 on every hash() call. */
  seed;

  /**
   * @param seed Seed value; defaults to 0x12345678 (305419896).
   */
  constructor(seed = 305419896) {
    this.seed = seed;
  }

  /**
   * Hashes a string with this instance's seed.
   * @param str Input string.
   * @returns The value murmur3_64 produces for (str, seed).
   */
  hash(str) {
    const { seed } = this;
    return murmur3_64(str, seed);
  }
}
234
/**
 * 128-bit hasher assembled from two murmur3_64 calls.
 */
class Murmur3_128 {
  /** Seed used for the low half; the high half uses `seed ^ 1515870810`. */
  seed;

  /**
   * @param seed Seed value; defaults to 0x12345678 (305419896).
   */
  constructor(seed = 305419896) {
    this.seed = seed;
  }

  /**
   * Builds a 128-bit value from two 64-bit hashes.
   *
   * Low 64 bits: murmur3_64(str, seed).
   * High 64 bits: murmur3_64(str + str, seed ^ 1515870810).
   *
   * @param str Input string.
   * @returns The combined value as a bigint.
   */
  hash(str) {
    const lowHalf = murmur3_64(str, this.seed);
    const highHalf = murmur3_64(str + str, this.seed ^ 1515870810);
    return highHalf << 64n | lowHalf;
  }
}
253
/**
 * Factory for the built-in Murmur3 hashers. Every call returns a new
 * instance constructed with that class's default seed.
 */
class Murmur3HashFactory {
  /**
   * @returns A new Murmur3_32 instance.
   */
  create32() {
    const hasher = new Murmur3_32();
    return hasher;
  }

  /**
   * @returns A new Murmur3_64 instance.
   */
  create64() {
    const hasher = new Murmur3_64();
    return hasher;
  }

  /**
   * @returns A new Murmur3_128 instance.
   */
  create128() {
    const hasher = new Murmur3_128();
    return hasher;
  }
}

/** Module-level factory instance. */
const defaultHashFactory = new Murmur3HashFactory();
183
277
  class IndexSegment {
184
278
  #filename;
185
279
  #storage;
186
280
  #buffer = null;
187
281
  #view = null;
282
+ #hashAlgorithm;
188
283
  /**
189
- * 使用MurmurHash3计算字符串哈希值
284
+ * 构造函数
285
+ * @param filename 文件名
286
+ * @param storage 存储接口
287
+ * @param hashAlgorithm 哈希算法实例,默认为Murmur3_32
288
+ */
289
+ constructor(filename, storage, hashAlgorithm = new Murmur3_32()) {
290
+ this.#filename = filename, this.#storage = storage, this.#hashAlgorithm = hashAlgorithm;
291
+ }
292
+ /**
293
+ * 使用当前哈希算法计算字符串哈希值
190
294
  * @param str 要哈希的字符串
191
295
  * @returns 32位无符号哈希值
192
296
  */
193
- static hash(str) {
194
- return murmur3_32(str);
297
+ hash(str) {
298
+ return this.#hashAlgorithm.hash(str);
195
299
  }
196
- constructor(filename, storage) {
197
- this.#filename = filename, this.#storage = storage;
300
+ /**
301
+ * 设置哈希算法
302
+ * @param hashAlgorithm 新的哈希算法实例
303
+ */
304
+ setHashAlgorithm(hashAlgorithm) {
305
+ this.#hashAlgorithm = hashAlgorithm;
198
306
  }
199
307
  async loadIndex() {
200
308
  return this.#buffer ? !0 : (this.#buffer = await this.#storage.read(this.#filename), this.#buffer ? (this.#view = new DataView(this.#buffer), !0) : !1);
@@ -205,7 +313,7 @@ class IndexSegment {
205
313
  const uniqueTokens = /* @__PURE__ */ new Map();
206
314
  for (const token of doc.tokens)
207
315
  uniqueTokens.has(token) || (uniqueTokens.set(token, !0), tokenMap.has(token) || tokenMap.set(token, {
208
- hash: IndexSegment.hash(token),
316
+ hash: this.hash(token),
209
317
  postings: []
210
318
  }), tokenMap.get(token).postings.push(doc.id));
211
319
  }
@@ -221,8 +329,8 @@ class IndexSegment {
221
329
  const headerSize = 12, dictSize = entries.length * 20, postingsSize = totalPostings * 4, tokensOffset = headerSize + dictSize + postingsSize, totalSize = tokensOffset + totalTokensSize, buffer = new ArrayBuffer(totalSize), view = new DataView(buffer);
222
330
  view.setUint32(0, 1229866072), view.setUint32(4, entries.length, !0), view.setUint32(8, tokensOffset, !0);
223
331
  let currentDictOffset = headerSize, currentPostingsOffset = headerSize + dictSize, currentTokenOffset = tokensOffset;
224
- for (const [token, { hash: hash2, postings }] of entries) {
225
- view.setUint32(currentDictOffset, hash2, !0);
332
+ for (const [token, { hash, postings }] of entries) {
333
+ view.setUint32(currentDictOffset, hash, !0);
226
334
  const tokenBytes = encoder.encode(token);
227
335
  view.setUint32(currentDictOffset + 4, tokenBytes.length, !0), view.setUint32(currentDictOffset + 8, currentTokenOffset, !0), view.setUint32(currentDictOffset + 12, currentPostingsOffset, !0), view.setUint32(currentDictOffset + 16, postings.length, !0), currentDictOffset += 20;
228
336
  for (let i = 0; i < postings.length; i++)
@@ -235,7 +343,7 @@ class IndexSegment {
235
343
  }
236
344
  search(term) {
237
345
  if (!this.#view || !this.#buffer) return [];
238
- const h = IndexSegment.hash(term), count = this.#view.getUint32(4, !0);
346
+ const h = this.hash(term), count = this.#view.getUint32(4, !0);
239
347
  let left = 0, right = count - 1;
240
348
  const headerSize = 12, entrySize = 20, decoder = new TextDecoder();
241
349
  for (; left <= right; ) {
@@ -276,6 +384,116 @@ class IndexSegment {
276
384
  return [];
277
385
  }
278
386
  }
387
// Binary layout constants shared by the writer (buildAndSave) and the readers
// (loadIndex / search) of a 64-bit index segment.
const INDEX64_MAGIC = 1229866072; // 0x494E4458: the bytes "INDX" when written big-endian
const INDEX64_HASH_BITS = 64; // marker stored at header offset 12
const INDEX64_HEADER_SIZE = 16; // magic, entry count, tokens offset, hash width (4 bytes each)
const INDEX64_ENTRY_SIZE = 28; // 8-byte hash + four uint32 fields; the last 4 bytes stay unwritten

/**
 * Inverted-index segment whose dictionary is keyed by 64-bit token hashes.
 *
 * File layout (all multi-byte fields little-endian except the magic):
 *   header     : magic (u32, big-endian) | entry count (u32) | tokens offset (u32) | hash width = 64 (u32)
 *   dictionary : one 28-byte entry per distinct token, sorted by hash:
 *                hash (u64) | token byte length (u32) | token offset (u32) |
 *                postings offset (u32) | postings count (u32) | 4 unused bytes
 *   postings   : document ids (u32 each)
 *   tokens     : UTF-8 token bytes, each followed by a NUL byte
 */
class IndexSegment64 {
  #filename;
  #storage;
  #buffer = null;
  #view = null;
  #hashAlgorithm;

  /**
   * @param filename name of the segment file inside the storage
   * @param storage storage object; its `read(filename)` and `write(filename, buffer)` are used
   * @param hashAlgorithm hash algorithm instance, defaults to a new Murmur3_64
   */
  constructor(filename, storage, hashAlgorithm = new Murmur3_64()) {
    this.#filename = filename;
    this.#storage = storage;
    this.#hashAlgorithm = hashAlgorithm;
  }

  /**
   * Hashes a string with the currently configured hash algorithm.
   * @param str string to hash
   * @returns whatever the configured algorithm returns (expected to be a bigint)
   */
  hash(str) {
    return this.#hashAlgorithm.hash(str);
  }

  /**
   * Replaces the hash algorithm used by subsequent builds and searches.
   * @param hashAlgorithm new hash algorithm instance
   */
  setHashAlgorithm(hashAlgorithm) {
    this.#hashAlgorithm = hashAlgorithm;
  }

  /**
   * Reduces a hash to the unsigned 64-bit value that is actually stored on disk.
   * setBigUint64 wraps out-of-range values, so sorting and searching must use
   * the same wrapped value or the binary search would miss entries.
   */
  #hash64(str) {
    return BigInt.asUintN(64, this.hash(str));
  }

  /**
   * Loads the segment from storage unless it is already in memory.
   * @returns true when a valid 64-bit segment is available, false when the file
   *          is missing or its header does not match what buildAndSave writes
   */
  async loadIndex() {
    if (this.#buffer) return true;
    const buffer = await this.#storage.read(this.#filename);
    if (!buffer) return false;
    const view = new DataView(buffer);
    // Reject truncated files and files in another format (for example a segment
    // with the 12-byte header of the 32-bit layout) instead of misreading them.
    const headerOk =
      buffer.byteLength >= INDEX64_HEADER_SIZE &&
      view.getUint32(0) === INDEX64_MAGIC &&
      view.getUint32(12, true) === INDEX64_HASH_BITS;
    if (!headerOk) return false;
    this.#buffer = buffer;
    this.#view = view;
    return true;
  }

  /**
   * Builds the segment from tokenized documents, writes it to storage and keeps
   * it in memory for searching.
   * @param docs iterable of documents with a numeric `id` and a `tokens` list;
   *             each document contributes at most one posting per distinct token
   */
  async buildAndSave(docs) {
    const encoder = new TextEncoder();
    const tokenMap = new Map();
    for (const doc of docs) {
      const seen = new Set();
      for (const token of doc.tokens) {
        if (seen.has(token)) continue;
        seen.add(token);
        let entry = tokenMap.get(token);
        if (!entry) {
          // Hash and encode each distinct token exactly once.
          entry = { hash: this.#hash64(token), bytes: encoder.encode(token), postings: [] };
          tokenMap.set(token, entry);
        }
        entry.postings.push(doc.id);
      }
    }

    // Sort by hash so search can binary-search; ties are ordered by token text.
    const entries = Array.from(tokenMap.entries());
    entries.sort(([a, ea], [b, eb]) =>
      ea.hash !== eb.hash ? (ea.hash > eb.hash ? 1 : -1) : a.localeCompare(b)
    );

    let totalPostings = 0;
    let totalTokensSize = 0;
    for (const [, entry] of entries) {
      totalPostings += entry.postings.length;
      totalTokensSize += entry.bytes.length + 1; // +1 for the NUL terminator
    }

    const dictSize = entries.length * INDEX64_ENTRY_SIZE;
    const postingsSize = totalPostings * 4;
    const tokensOffset = INDEX64_HEADER_SIZE + dictSize + postingsSize;
    const buffer = new ArrayBuffer(tokensOffset + totalTokensSize);
    const view = new DataView(buffer);
    const bytes = new Uint8Array(buffer);

    view.setUint32(0, INDEX64_MAGIC);
    view.setUint32(4, entries.length, true);
    view.setUint32(8, tokensOffset, true);
    view.setUint32(12, INDEX64_HASH_BITS, true);

    let dictOffset = INDEX64_HEADER_SIZE;
    let postingsOffset = INDEX64_HEADER_SIZE + dictSize;
    let tokenOffset = tokensOffset;
    for (const [, { hash, bytes: tokenBytes, postings }] of entries) {
      view.setBigUint64(dictOffset, hash, true);
      view.setUint32(dictOffset + 8, tokenBytes.length, true);
      view.setUint32(dictOffset + 12, tokenOffset, true);
      view.setUint32(dictOffset + 16, postingsOffset, true);
      view.setUint32(dictOffset + 20, postings.length, true);
      dictOffset += INDEX64_ENTRY_SIZE;

      for (const docId of postings) {
        view.setUint32(postingsOffset, docId, true);
        postingsOffset += 4;
      }

      bytes.set(tokenBytes, tokenOffset);
      // The NUL terminator is already present: a new ArrayBuffer is zero-filled.
      tokenOffset += tokenBytes.length + 1;
    }

    await this.#storage.write(this.#filename, buffer);
    this.#buffer = buffer;
    this.#view = view;
  }

  /**
   * Looks up the documents that contain `term`.
   * @param term token to search for
   * @returns document ids of the matching token, or an empty array when the
   *          segment is not loaded or the token is not indexed
   */
  search(term) {
    const view = this.#view;
    const buffer = this.#buffer;
    if (!view || !buffer) return [];

    const target = this.#hash64(term);
    const count = view.getUint32(4, true);

    // Lower-bound binary search: first dictionary entry whose hash is >= target.
    let low = 0;
    let high = count;
    while (low < high) {
      const mid = (low + high) >>> 1;
      if (view.getBigUint64(INDEX64_HEADER_SIZE + mid * INDEX64_ENTRY_SIZE, true) < target) {
        low = mid + 1;
      } else {
        high = mid;
      }
    }

    // Walk every entry sharing the hash and compare the stored token bytes, so a
    // term that merely collides with an indexed token never yields its postings.
    const termBytes = new TextEncoder().encode(term);
    for (let i = low; i < count; i++) {
      const entryPos = INDEX64_HEADER_SIZE + i * INDEX64_ENTRY_SIZE;
      if (view.getBigUint64(entryPos, true) !== target) break;

      const tokenLen = view.getUint32(entryPos + 8, true);
      if (tokenLen !== termBytes.length) continue;
      const stored = new Uint8Array(buffer, view.getUint32(entryPos + 12, true), tokenLen);
      let same = true;
      for (let b = 0; b < tokenLen; b++) {
        if (stored[b] !== termBytes[b]) {
          same = false;
          break;
        }
      }
      if (!same) continue;

      const postingsOffset = view.getUint32(entryPos + 16, true);
      const postingsLen = view.getUint32(entryPos + 20, true);
      const result = [];
      for (let j = 0; j < postingsLen; j++) {
        result.push(view.getUint32(postingsOffset + j * 4, true));
      }
      return result;
    }
    return [];
  }
}
279
497
  const defaultTokenize = ({ text }) => {
280
498
  try {
281
499
  if (typeof Intl < "u" && typeof Intl.Segmenter == "function" && typeof Array.from == "function") {
@@ -294,6 +512,9 @@ class SearchEngine {
294
512
  #segments;
295
513
  #initialized = !1;
296
514
  #config;
515
+ #isHash64Bit = !0;
516
+ #hashAlgorithm32;
517
+ #hashAlgorithm64;
297
518
  // 批处理状态
298
519
  #inBatch = !1;
299
520
  #pendingTokenCounts = { word: 0, char: 0 };
@@ -305,12 +526,19 @@ class SearchEngine {
305
526
  minCharTokenSave: 0,
306
527
  indexingTokenizer: config.indexingTokenizer || defaultTokenize,
307
528
  ...config
308
- }, (this.#config.minWordTokenSave || 0) >= (this.#config.wordSegmentTokenThreshold || 1e5))
529
+ }, this.#processHashAlgorithmConfig(), (this.#config.minWordTokenSave || 0) >= (this.#config.wordSegmentTokenThreshold || 1e5))
309
530
  throw new Error("minWordTokenSave must be less than wordSegmentTokenThreshold");
310
531
  if ((this.#config.minCharTokenSave || 0) >= (this.#config.charSegmentTokenThreshold || 5e5))
311
532
  throw new Error("minCharTokenSave must be less than charSegmentTokenThreshold");
312
533
  this.#storage = config.storage, this.#meta = new MetaManager(this.#storage), this.#cache = new IntermediateCache(this.#storage), this.#segments = /* @__PURE__ */ new Map();
313
534
  }
535
+ /**
536
+ * 处理哈希算法配置
537
+ */
538
+ #processHashAlgorithmConfig() {
539
+ const hashConfig = this.#config.hashAlgorithm;
540
+ hashConfig === 64 ? (this.#isHash64Bit = !0, this.#hashAlgorithm64 = new Murmur3_64()) : hashConfig === 32 ? (this.#isHash64Bit = !1, this.#hashAlgorithm32 = new Murmur3_32()) : hashConfig && typeof hashConfig.hash == "function" ? typeof hashConfig.hash("test") == "bigint" ? (this.#isHash64Bit = !0, this.#hashAlgorithm64 = hashConfig) : (this.#isHash64Bit = !1, this.#hashAlgorithm32 = hashConfig) : (this.#isHash64Bit = !0, this.#hashAlgorithm64 = new Murmur3_64());
541
+ }
314
542
  /**
315
543
  * 开启批处理
316
544
  * 批处理期间 addDocuments 只写入缓存,不触发索引段构建
@@ -396,7 +624,7 @@ class SearchEngine {
396
624
  const segmentsMeta = this.#meta.getSegments(type);
397
625
  for (const meta of segmentsMeta) {
398
626
  const filename = meta.filename;
399
- !this.#segments.has(filename) && !segmentsToLoad.has(filename) && segmentsToLoad.set(filename, new IndexSegment(filename, this.#storage));
627
+ !this.#segments.has(filename) && !segmentsToLoad.has(filename) && (this.#isHash64Bit ? segmentsToLoad.set(filename, new IndexSegment64(filename, this.#storage, this.#hashAlgorithm64)) : segmentsToLoad.set(filename, new IndexSegment(filename, this.#storage, this.#hashAlgorithm32)));
400
628
  }
401
629
  };
402
630
  collectSegments("word"), collectSegments("char"), await Promise.all(
@@ -465,7 +693,7 @@ class SearchEngine {
465
693
  ...this.#meta.getSegments("char")
466
694
  ];
467
695
  for (const seg of allSegments)
468
- this.#segments.has(seg.filename) || this.#segments.set(seg.filename, new IndexSegment(seg.filename, this.#storage)), await this.#segments.get(seg.filename).loadIndex();
696
+ this.#segments.has(seg.filename) || (this.#isHash64Bit ? this.#segments.set(seg.filename, new IndexSegment64(seg.filename, this.#storage, this.#hashAlgorithm64)) : this.#segments.set(seg.filename, new IndexSegment(seg.filename, this.#storage, this.#hashAlgorithm32))), await this.#segments.get(seg.filename).loadIndex();
469
697
  this.#initialized = !0;
470
698
  }
471
699
  #getIndexingTokens(doc) {
@@ -497,11 +725,16 @@ class SearchEngine {
497
725
  }
498
726
  const docsToBuild = await this.#cache.readRange(cacheFilename, startOffset, currentCacheSize);
499
727
  let segment = this.#segments.get(targetSegmentName);
500
- segment || (segment = new IndexSegment(targetSegmentName, this.#storage), this.#segments.set(targetSegmentName, segment)), await segment.buildAndSave(docsToBuild), this.#meta.updateSegment(type, targetSegmentName, startOffset, currentCacheSize, newTokenCountTotal, isNew);
728
+ segment || (this.#isHash64Bit ? segment = new IndexSegment64(targetSegmentName, this.#storage, this.#hashAlgorithm64) : segment = new IndexSegment(targetSegmentName, this.#storage, this.#hashAlgorithm32), this.#segments.set(targetSegmentName, segment)), await segment.buildAndSave(docsToBuild), this.#meta.updateSegment(type, targetSegmentName, startOffset, currentCacheSize, newTokenCountTotal, isNew);
501
729
  }
502
730
  }
503
731
  export {
732
+ Murmur3HashFactory,
733
+ Murmur3_128,
734
+ Murmur3_32,
735
+ Murmur3_64,
504
736
  SearchEngine,
505
- murmur3_32 as hash,
506
- murmur3_32
737
+ defaultHashFactory,
738
+ murmur3_32,
739
+ murmur3_64
507
740
  };
package/lib/type.d.ts CHANGED
@@ -61,6 +61,35 @@ interface IStorage {
61
61
  getFileSize(filename: string): Promise<number>;
62
62
  }
63
63
 
64
+ interface IHashAlgorithm<T> {
65
+ /**
66
+ * Computes the hash value of a string.
67
+ * @param str input string
68
+ * @returns the hash value, of type T
69
+ */
70
+ hash(str: string): T;
71
+ }
72
+ interface IHashAlgorithm32 extends IHashAlgorithm<number> { // hash() returns a number
73
+ }
74
+ interface IHashAlgorithm64 extends IHashAlgorithm<bigint> { // hash() returns a bigint
75
+ }
76
+ interface IHashAlgorithm128 extends IHashAlgorithm<bigint> { // hash() returns a bigint
77
+ }
78
+ interface IHashAlgorithmFactory {
79
+ /**
80
+ * Creates a 32-bit hash algorithm instance.
81
+ */
82
+ create32(): IHashAlgorithm32;
83
+ /**
84
+ * Creates a 64-bit hash algorithm instance.
85
+ */
86
+ create64(): IHashAlgorithm64;
87
+ /**
88
+ * Creates a 128-bit hash algorithm instance.
89
+ */
90
+ create128(): IHashAlgorithm128;
91
+ }
92
+
64
93
  /**
65
94
  * 索引类型
66
95
  */
@@ -93,6 +122,15 @@ interface ISearchEngineOption {
93
122
  * - 影响: 直接决定搜索匹配的范围和结果的相关性
94
123
  */
95
124
  searchTokenizer?: SearchTokenizer;
125
+ /**
126
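+ * Hash algorithm configuration (optional)
127
+ * - 32: use the built-in 32-bit hash algorithm
128
+ * - 64: use the built-in 64-bit hash algorithm
129
+ * - IHashAlgorithm32: use a custom 32-bit hash algorithm
130
+ * - IHashAlgorithm64: use a custom 64-bit hash algorithm (its hash() must return a bigint)
131
+ * - undefined (or any unrecognized value): defaults to the built-in 64-bit hash algorithm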
132
+ */
133
+ hashAlgorithm?: 32 | 64 | IHashAlgorithm32 | IHashAlgorithm64;
96
134
  /**
97
135
  * 词索引分段阈值 (Token数) - 分段算法配置
98
136
  * - 作用: 控制词索引文件的大小,超过阈值时创建新的索引段
@@ -173,4 +211,38 @@ interface ISearchEngine {
173
211
  hasDocument(id: number): Promise<boolean>;
174
212
  }
175
213
 
176
- export type { IDocument, IDocumentBase, IIndexMeta, IResult, ISearchEngine, ISearchEngineOption, ISearchEngineStatus, ISegmentMeta, IStorage, ITokenizedDoc, IndexType, IndexingTokenizer, SearchTokenizer };
214
+ /**
215
+ * 索引段接口,定义了IndexSegment和IndexSegment64的共同方法
216
+ */
217
+
218
+ interface IIndexSegment {
219
+ /**
220
+ * Computes the hash of a string with the current hash algorithm.
221
+ * @param str string to hash
222
+ * @returns the hash value (number | bigint)
223
+ */
224
+ hash(str: string): number | bigint;
225
+ /**
226
+ * Sets the hash algorithm.
227
+ * @param hashAlgorithm new hash algorithm instance
228
+ */
229
+ setHashAlgorithm(hashAlgorithm: IHashAlgorithm32 | IHashAlgorithm64): void;
230
+ /**
231
+ * Loads the index.
232
+ * @returns whether the index was loaded successfully
233
+ */
234
+ loadIndex(): Promise<boolean>;
235
+ /**
236
+ * Builds and saves the index.
237
+ * @param docs documents to index
238
+ */
239
+ buildAndSave(docs: ITokenizedDoc[]): Promise<void>;
240
+ /**
241
+ * Searches the index.
242
+ * @param term search term
243
+ * @returns array of matching document IDs
244
+ */
245
+ search(term: string): number[];
246
+ }
247
+
248
+ export type { IDocument, IDocumentBase, IHashAlgorithm, IHashAlgorithm128, IHashAlgorithm32, IHashAlgorithm64, IHashAlgorithmFactory, IIndexMeta, IIndexSegment, IResult, ISearchEngine, ISearchEngineOption, ISearchEngineStatus, ISegmentMeta, IStorage, ITokenizedDoc, IndexType, IndexingTokenizer, SearchTokenizer };
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "gs-search",
3
- "version": "0.1.7",
3
+ "version": "0.1.8",
4
4
  "type": "module",
5
5
  "main": "lib/index.cjs",
6
6
  "module": "lib/index.js",