gs-search 0.1.5 → 0.1.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/lib/browser.cjs +68 -1
- package/lib/browser.js +69 -1
- package/lib/core.cjs +504 -1
- package/lib/core.d.ts +5 -5
- package/lib/core.js +507 -1
- package/lib/index.cjs +38 -1
- package/lib/index.js +5 -1
- package/lib/node.cjs +99 -1
- package/lib/node.js +81 -1
- package/lib/simple.cjs +64 -1
- package/lib/simple.d.ts +5 -5
- package/lib/simple.js +67 -1
- package/lib/type.d.ts +5 -7
- package/lib/type.js +1 -0
- package/package.json +1 -1
package/lib/browser.cjs
CHANGED
|
@@ -1 +1,68 @@
|
|
|
1
|
-
"use strict";
|
|
1
|
+
"use strict";
|
|
2
|
+
class BrowserStorage {
  #baseDir;

  /**
   * OPFS-backed storage scoped to a single directory under the origin's
   * private file system root.
   * @param {string} baseDir - Name of the directory owned by this instance.
   */
  constructor(baseDir) {
    this.#baseDir = baseDir;
  }

  // Resolve (creating on demand) this instance's private directory handle.
  async #getDirHandle() {
    const root = await navigator.storage.getDirectory();
    return root.getDirectoryHandle(this.#baseDir, { create: true });
  }

  /** Overwrite `filename` with `data`, creating the file if needed. */
  async write(filename, data) {
    const dir = await this.#getDirHandle();
    const handle = await dir.getFileHandle(filename, { create: true });
    const stream = await handle.createWritable();
    await stream.write(data);
    await stream.close();
  }

  /** Append `data` to the end of `filename`, creating the file if needed. */
  async append(filename, data) {
    const dir = await this.#getDirHandle();
    let handle;
    try {
      handle = await dir.getFileHandle(filename, { create: true });
    } catch {
      // Single best-effort retry for transient OPFS failures.
      handle = await dir.getFileHandle(filename, { create: true });
    }
    const current = await handle.getFile();
    const stream = await handle.createWritable({ keepExistingData: true });
    await stream.seek(current.size);
    await stream.write(data);
    await stream.close();
  }

  /** Read the whole file as an ArrayBuffer, or null when it does not exist. */
  async read(filename) {
    const dir = await this.#getDirHandle();
    try {
      const handle = await dir.getFileHandle(filename);
      const file = await handle.getFile();
      return await file.arrayBuffer();
    } catch {
      return null;
    }
  }

  /** Read bytes [start, end) of the file, or null when it does not exist. */
  async readRange(filename, start, end) {
    const dir = await this.#getDirHandle();
    try {
      const handle = await dir.getFileHandle(filename);
      const file = await handle.getFile();
      return await file.slice(start, end).arrayBuffer();
    } catch {
      return null;
    }
  }

  /** Delete the file; a missing entry is silently ignored. */
  async remove(filename) {
    const dir = await this.#getDirHandle();
    try {
      await dir.removeEntry(filename);
    } catch {
      // Best-effort delete: nothing to do when the entry is absent.
    }
  }

  /** List the names of all entries in this instance's directory. */
  async listFiles() {
    const dir = await this.#getDirHandle();
    const names = [];
    for await (const name of dir.keys()) {
      names.push(name);
    }
    return names;
  }

  /** Recursively remove every entry in this instance's directory. */
  async clearAll() {
    const dir = await this.#getDirHandle();
    for await (const name of dir.keys()) {
      await dir.removeEntry(name, { recursive: true });
    }
  }

  /** Size of the file in bytes, or 0 when it does not exist. */
  async getFileSize(filename) {
    const dir = await this.#getDirHandle();
    try {
      const handle = await dir.getFileHandle(filename);
      return (await handle.getFile()).size;
    } catch {
      return 0;
    }
  }
}
|
|
68
|
+
exports.BrowserStorage = BrowserStorage;
|
package/lib/browser.js
CHANGED
|
@@ -1 +1,69 @@
|
|
|
1
|
-
class
|
|
1
|
+
class BrowserStorage {
  #baseDir;

  /**
   * OPFS-backed storage scoped to a single directory under the origin's
   * private file system root.
   * @param {string} baseDir - Name of the directory owned by this instance.
   */
  constructor(baseDir) {
    this.#baseDir = baseDir;
  }

  // Resolve (creating on demand) this instance's private directory handle.
  async #getDirHandle() {
    const root = await navigator.storage.getDirectory();
    return root.getDirectoryHandle(this.#baseDir, { create: true });
  }

  /** Overwrite `filename` with `data`, creating the file if needed. */
  async write(filename, data) {
    const dir = await this.#getDirHandle();
    const handle = await dir.getFileHandle(filename, { create: true });
    const stream = await handle.createWritable();
    await stream.write(data);
    await stream.close();
  }

  /** Append `data` to the end of `filename`, creating the file if needed. */
  async append(filename, data) {
    const dir = await this.#getDirHandle();
    let handle;
    try {
      handle = await dir.getFileHandle(filename, { create: true });
    } catch {
      // Single best-effort retry for transient OPFS failures.
      handle = await dir.getFileHandle(filename, { create: true });
    }
    const current = await handle.getFile();
    const stream = await handle.createWritable({ keepExistingData: true });
    await stream.seek(current.size);
    await stream.write(data);
    await stream.close();
  }

  /** Read the whole file as an ArrayBuffer, or null when it does not exist. */
  async read(filename) {
    const dir = await this.#getDirHandle();
    try {
      const handle = await dir.getFileHandle(filename);
      const file = await handle.getFile();
      return await file.arrayBuffer();
    } catch {
      return null;
    }
  }

  /** Read bytes [start, end) of the file, or null when it does not exist. */
  async readRange(filename, start, end) {
    const dir = await this.#getDirHandle();
    try {
      const handle = await dir.getFileHandle(filename);
      const file = await handle.getFile();
      return await file.slice(start, end).arrayBuffer();
    } catch {
      return null;
    }
  }

  /** Delete the file; a missing entry is silently ignored. */
  async remove(filename) {
    const dir = await this.#getDirHandle();
    try {
      await dir.removeEntry(filename);
    } catch {
      // Best-effort delete: nothing to do when the entry is absent.
    }
  }

  /** List the names of all entries in this instance's directory. */
  async listFiles() {
    const dir = await this.#getDirHandle();
    const names = [];
    for await (const name of dir.keys()) {
      names.push(name);
    }
    return names;
  }

  /** Recursively remove every entry in this instance's directory. */
  async clearAll() {
    const dir = await this.#getDirHandle();
    for await (const name of dir.keys()) {
      await dir.removeEntry(name, { recursive: true });
    }
  }

  /** Size of the file in bytes, or 0 when it does not exist. */
  async getFileSize(filename) {
    const dir = await this.#getDirHandle();
    try {
      const handle = await dir.getFileHandle(filename);
      return (await handle.getFile()).size;
    } catch {
      return 0;
    }
  }
}
export {
  BrowserStorage
};
|
package/lib/core.cjs
CHANGED
|
@@ -1 +1,504 @@
|
|
|
1
|
-
"use strict";const e="search_meta.json",t="deleted_ids.bin",s="added_ids.bin";class n{#e;#t={wordSegments:[],charSegments:[]};#s=new Set;#n=new Set;constructor(e){this.#e=e}async load(){const n=await this.#e.read(e);if(n){const e=(new TextDecoder).decode(n);this.#t=JSON.parse(e)}else this.#t={wordSegments:[],charSegments:[]};const i=await this.#e.read(t);if(i){const e=new DataView(i);let t=0;const s=i.byteLength;for(;t<s&&!(t+4>s);){const n=e.getUint32(t,!0);this.#s.add(n),t+=4,t<s&&30===e.getUint8(t)&&(t+=1)}}const a=await this.#e.read(s);if(a){const e=new DataView(a);let t=0;const s=a.byteLength;for(;t<s&&!(t+4>s);){const n=e.getUint32(t,!0);this.#n.add(n),t+=4,t<s&&30===e.getUint8(t)&&(t+=1)}}}async save(){const n=JSON.stringify(this.#t);if(await this.#e.write(e,(new TextEncoder).encode(n).buffer),0===this.#s.size)await this.#e.remove(t);else{const e=4*this.#s.size+this.#s.size,s=new ArrayBuffer(e),n=new DataView(s);let i=0;for(const e of this.#s)n.setUint32(i,e,!0),i+=4,n.setUint8(i,30),i+=1;await this.#e.write(t,s)}if(0===this.#n.size)await this.#e.remove(s);else{const e=4*this.#n.size+this.#n.size,t=new ArrayBuffer(e),n=new DataView(t);let i=0;for(const e of this.#n)n.setUint32(i,e,!0),i+=4,n.setUint8(i,30),i+=1;await this.#e.write(s,t)}}getSegments(e){return"word"===e?this.#t.wordSegments:this.#t.charSegments}getDeletedIds(){return this.#s}addDeletedId(e){this.#s.add(e)}isDeleted(e){return this.#s.has(e)}addAddedId(e){this.#n.add(e)}removeAddedId(e){this.#n.delete(e)}isAdded(e){return this.#n.has(e)}getAddedIds(){return this.#n}hasDocument(e){return this.#n.has(e)||this.#s.has(e)}getLastSegmentInfo(e){const t=this.getSegments(e);return 0===t.length?null:t[t.length-1]}updateSegment(e,t,s,n,i,a){const o="word"===e?this.#t.wordSegments:this.#t.charSegments;if(a)o.push({filename:t,start:s,end:n,tokenCount:i});else{const 
e=o[o.length-1];e&&e.filename===t&&(e.end=n,e.tokenCount=i)}}reset(){this.#t={wordSegments:[],charSegments:[]},this.#s.clear(),this.#n.clear()}}class i{static SEPARATOR=30;#e;constructor(e){this.#e=e}async appendBatch(e,t){if(0===t.length)return await this.#e.getFileSize(e);const s=new TextEncoder;let n=0;for(const e of t){n+=8;for(const t of e.tokens){n+=2+Math.min(s.encode(t).byteLength,65535)}n+=1}const a=new Uint8Array(n);let o=0;for(const e of t){const t=[];for(const n of e.tokens){const e=s.encode(n),i=e.byteLength>65535?e.slice(0,65535):e;t.push(i)}const n=new DataView(a.buffer,o);n.setUint32(0,e.id,!0),n.setUint32(4,t.length,!0),o+=8;for(const e of t)new DataView(a.buffer,o).setUint16(0,e.byteLength,!0),o+=2,a.set(e,o),o+=e.byteLength;a[o++]=i.SEPARATOR}return await this.#e.append(e,a.buffer),await this.#e.getFileSize(e)}async readRange(e,t,s){const n=await this.#e.readRange(e,t,s);if(!n||0===n.byteLength)return[];const a=new DataView(n),o=new Uint8Array(n),r=new TextDecoder,h=[];let d=0;const c=n.byteLength;for(;d<c&&!(d+8>c);){const e=a.getUint32(d,!0);d+=4;const t=a.getUint32(d,!0);d+=4;const s=[];for(let e=0;e<t&&!(d+2>c);e++){const e=a.getUint16(d,!0);if(d+=2,d+e>c)break;const t=new Uint8Array(n,d,e);s.push(r.decode(t)),d+=e}d<c&&o[d]===i.SEPARATOR&&(d+=1),h.push({id:e,tokens:s})}return h}async getCurrentSize(e){return await this.#e.getFileSize(e)}}function a(e,t=305419896){const s=e.length,n=s>>2;let i=0;for(;i<n;){let s=255&e.charCodeAt(i)|(255&e.charCodeAt(++i))<<8|(255&e.charCodeAt(++i))<<16|(255&e.charCodeAt(++i))<<24;++i,s=3432918353*(65535&s)+((3432918353*(s>>>16)&65535)<<16)&4294967295,s=s<<15|s>>>17,s=461845907*(65535&s)+((461845907*(s>>>16)&65535)<<16)&4294967295,t=27492+(65535&(t=5*(65535&(t=(t^=s)<<13|t>>>19))+((5*(t>>>16)&65535)<<16)&4294967295))+(((t>>>16)+58964&65535)<<16)}let a=0;const o=3&s;return 
o>0&&(o>=3&&(a^=(255&e.charCodeAt(i+2))<<16),o>=2&&(a^=(255&e.charCodeAt(i+1))<<8),o>=1&&(a^=255&e.charCodeAt(i)),a=3432918353*(65535&a)+((3432918353*(a>>>16)&65535)<<16)&4294967295,a=a<<15|a>>>17,a=461845907*(65535&a)+((461845907*(a>>>16)&65535)<<16)&4294967295,t^=a),t^=s,t=2246822507*(65535&(t^=t>>>16))+((2246822507*(t>>>16)&65535)<<16)&4294967295,t=3266489909*(65535&(t^=t>>>13))+((3266489909*(t>>>16)&65535)<<16)&4294967295,(t^=t>>>16)>>>0}class o{#i;#e;#a=null;#o=null;static hash(e){return a(e)}constructor(e,t){this.#i=e,this.#e=t}async loadIndex(){return!!this.#a||(this.#a=await this.#e.read(this.#i),!!this.#a&&(this.#o=new DataView(this.#a),!0))}async buildAndSave(e){const t=new Map;for(const s of e){const e=new Map;for(const n of s.tokens)e.has(n)||(e.set(n,!0),t.has(n)||t.set(n,{hash:o.hash(n),postings:[]}),t.get(n).postings.push(s.id))}const s=Array.from(t.entries());s.sort(([e,{hash:t}],[s,{hash:n}])=>t!==n?t-n:e.localeCompare(s));let n=0,i=0;for(const[e,{postings:t}]of s)n+=t.length,i+=e.length+1;const a=20*s.length,r=12+a+4*n,h=new ArrayBuffer(r+i),d=new DataView(h);d.setUint32(0,1229866072),d.setUint32(4,s.length),d.setUint32(8,r);let c=12,g=12+a,f=r;for(const[e,{hash:t,postings:n}]of s){d.setUint32(c,t),d.setUint32(c+4,e.length),d.setUint32(c+8,f),d.setUint32(c+12,g),d.setUint32(c+16,n.length),c+=20;for(let e=0;e<n.length;e++)d.setUint32(g,n[e],!0),g+=4;const s=(new TextEncoder).encode(e);for(let e=0;e<s.length;e++)d.setUint8(f++,s[e]);d.setUint8(f++,0)}await this.#e.write(this.#i,h),this.#a=h,this.#o=d}search(e){if(!this.#o||!this.#a)return[];const t=o.hash(e),s=this.#o.getUint32(4);let n=0,i=s-1;const a=12,r=20,h=new TextDecoder;for(;n<=i;){const o=n+i>>>1,d=a+o*r,c=this.#o.getUint32(d);if(c<t)n=o+1;else{if(!(c>t)){if(!(o>0&&this.#o.getUint32(a+(o-1)*r)===t||o<s-1&&this.#o.getUint32(a+(o+1)*r)===t)){const e=this.#o.getUint32(a+o*r+12),t=this.#o.getUint32(a+o*r+16),s=[];for(let n=0;n<t;n++)s.push(this.#o.getUint32(e+4*n,!0));return s}let 
n=o;for(;n>0;){const e=a+(n-1)*r;if(this.#o.getUint32(e)!==t)break;n--}for(let i=n;i<s;i++){const s=a+i*r;if(this.#o.getUint32(s)!==t)break;const n=this.#o.getUint32(s+4),o=this.#o.getUint32(s+8),d=new Uint8Array(this.#a,o,n);if(h.decode(d)===e){const e=this.#o.getUint32(s+12),t=this.#o.getUint32(s+16),n=[];for(let s=0;s<t;s++)n.push(this.#o.getUint32(e+4*s,!0));return n}}return[]}i=o-1}}return[]}}const r=({text:e})=>{try{if(typeof Intl<"u"&&"function"==typeof Intl.Segmenter&&"function"==typeof Array.from){const t=new Intl.Segmenter([],{granularity:"word"}).segment(e);if("object"==typeof t&&null!==t)return Array.from(t).filter(e=>e?.isWordLike).map(e=>e?.segment?.toLowerCase()||"")}}catch{}return e.toLowerCase().split(/[^a-z0-9\u4e00-\u9fa5]+/g).filter(e=>e.length>0)},h="word_cache.bin",d="char_cache.bin";exports.SearchEngine=class{#e;#t;#r;#h;#d=!1;#c;#g=!1;#f={word:0,char:0};constructor(e){if(this.#c={wordSegmentTokenThreshold:1e5,charSegmentTokenThreshold:5e5,minWordTokenSave:0,minCharTokenSave:0,indexingTokenizer:e.indexingTokenizer||r,...e},(this.#c.minWordTokenSave||0)>=(this.#c.wordSegmentTokenThreshold||1e5))throw new Error("minWordTokenSave must be less than wordSegmentTokenThreshold");if((this.#c.minCharTokenSave||0)>=(this.#c.charSegmentTokenThreshold||5e5))throw new Error("minCharTokenSave must be less than charSegmentTokenThreshold");this.#e=e.storage,this.#t=new n(this.#e),this.#r=new i(this.#e),this.#h=new Map}startBatch(){this.#g=!0,this.#f={word:0,char:0}}async endBatch(){this.#g=!1,this.#f.word>0&&await this.#l("word",this.#f.word),this.#f.char>0&&await this.#l("char",this.#f.char),this.#f={word:0,char:0},await this.#t.save()}async addDocument(e){return this.addDocuments([e])}async addDocumentIfMissing(e){return this.addDocumentsIfMissing([e])}async addDocumentsIfMissing(e){if(this.#d||await this.#w(),0===e.length)return;const t=this.#t.getDeletedIds(),s=[],n=[],i=[];for(const a of e){if(t.has(a.id)||this.#t.isAdded(a.id))continue;const 
e=this.#m(a),o=[],r=[];for(const t of e)t.length>1?o.push(t):1===t.length&&r.push(t);o.length>0&&s.push({id:a.id,tokens:o}),r.length>0&&n.push({id:a.id,tokens:r}),i.push(a)}if(0===i.length)return;let a=0,o=0;if(s.length>0){await this.#r.appendBatch(h,s);for(const e of s)a+=e.tokens.length}if(n.length>0){await this.#r.appendBatch(d,n);for(const e of n)o+=e.tokens.length}for(const e of i)this.#t.addAddedId(e.id);this.#g?(this.#f.word+=a,this.#f.char+=o):(a>0&&await this.#l("word",a),o>0&&await this.#l("char",o),await this.#t.save())}async addDocuments(e){if(this.#d||await this.#w(),0===e.length)return;const t=this.#t.getDeletedIds(),s=[],n=[];for(const i of e){if(t.has(i.id))throw new Error(`Document ID ${i.id} has been deleted and cannot be re-added.`);if(this.#t.isAdded(i.id))throw new Error(`Document ID ${i.id} already exists.`);const e=this.#m(i),a=[],o=[];for(const t of e)t.length>1?a.push(t):1===t.length&&o.push(t);a.length>0&&s.push({id:i.id,tokens:a}),o.length>0&&n.push({id:i.id,tokens:o})}let i=0,a=0;if(s.length>0){await this.#r.appendBatch(h,s);for(const e of s)i+=e.tokens.length}if(n.length>0){await this.#r.appendBatch(d,n);for(const e of n)a+=e.tokens.length}for(const t of e)this.#t.addAddedId(t.id);this.#g?(this.#f.word+=i,this.#f.char+=a):(i>0&&await this.#l("word",i),a>0&&await this.#l("char",a),await this.#t.save())}async search(e,t){this.#d||await this.#w();const s="string"==typeof e?{text:e}:e,n=this.#u(s),i=n.filter(e=>e.length>1),a=n.filter(e=>1===e.length),r=this.#t.getDeletedIds(),h=new Map,d=new Map,c=e=>{const t=this.#t.getSegments(e);for(const e of t){const t=e.filename;!this.#h.has(t)&&!d.has(t)&&d.set(t,new o(t,this.#e))}};c("word"),c("char"),await Promise.all(Array.from(d.entries()).map(([e,t])=>t.loadIndex().then(s=>{s&&this.#h.set(e,t)})));const g=async(e,t)=>{if(0===t.length)return;const s=this.#t.getSegments(e);for(const e of s){const s=e.filename,n=this.#h.get(s);if(n)for(const e of t){const t=n.search(e),s=1+.1*e.length;for(const n 
of t)if(!r.has(n))if(h.has(n)){const t=h.get(n);t.score+=s,t.tokens.add(e)}else h.set(n,{score:0,tokens:new Set([e])})}}};await g("word",i),await g("char",a);const f=[];return h.forEach((e,t)=>{f.push({id:t,score:e.score,tokens:Array.from(e.tokens)})}),f.sort((e,t)=>t.score-e.score),"number"==typeof t&&t>0?f.slice(0,t):f}async removeDocument(e){this.#d||await this.#w(),this.#t.addDeletedId(e),this.#t.removeAddedId(e),await this.#t.save()}async clearAll(){await this.#e.clearAll(),this.#h.clear(),this.#t.reset(),this.#d=!1,this.#g=!1,this.#f={word:0,char:0}}async getStatus(){return this.#d||await this.#w(),{wordSegments:this.#t.getSegments("word").length,charSegments:this.#t.getSegments("char").length,deleted:this.#t.getDeletedIds().size,wordCacheSize:await this.#r.getCurrentSize(h),charCacheSize:await this.#r.getCurrentSize(d),inBatch:this.#g}}async hasDocument(e){return this.#d||await this.#w(),this.#t.hasDocument(e)}async#w(){if(this.#d)return;await this.#t.load();const e=[...this.#t.getSegments("word"),...this.#t.getSegments("char")];for(const t of e)this.#h.has(t.filename)||this.#h.set(t.filename,new o(t.filename,this.#e)),await this.#h.get(t.filename).loadIndex();this.#d=!0}#m(e){return this.#c.indexingTokenizer(e)}#u(e){return this.#c.searchTokenizer?this.#c.searchTokenizer(e):this.#m(e)}async#l(e,t){const s="word"===e?h:d,n=await this.#r.getCurrentSize(s),i="word"===e?this.#c.wordSegmentTokenThreshold||1e5:this.#c.charSegmentTokenThreshold||5e5,a="word"===e?this.#c.minWordTokenSave||0:this.#c.minCharTokenSave||0,r=this.#t.getLastSegmentInfo(e);let c,g,f,l;const w=()=>{const t=this.#t.getSegments(e).length+1;return`${e}_seg_${t}.bin`};if(r){const e=r.tokenCount;e>=i||e+t>=i?(c=w(),f=!0,g=r.end,l=t):(c=r.filename,f=!1,g=r.start,l=e+t)}else c=w(),f=!0,g=0,l=t;if(l<a)return void this.#t.updateSegment(e,c,g,n,l,f);const m=await this.#r.readRange(s,g,n);let u=this.#h.get(c);u||(u=new o(c,this.#e),this.#h.set(c,u)),await 
u.buildAndSave(m),this.#t.updateSegment(e,c,g,n,l,f)}},exports.hash=a,exports.murmur3_32=a;
|
|
1
|
+
"use strict";
|
|
2
|
+
const META_FILE = "search_meta.json";
const DELETED_IDS_FILE = "deleted_ids.bin";
const ADDED_IDS_FILE = "added_ids.bin";
const SEPARATOR = 30;
/**
 * Owns the engine's persistent bookkeeping: segment metadata (JSON) plus
 * binary lists of added and deleted document ids.
 */
class MetaManager {
  #storage;
  // Segment metadata, persisted as JSON in META_FILE.
  #meta = { wordSegments: [], charSegments: [] };
  // IDs of documents that have been removed from the index.
  #deletedIds = new Set();
  // IDs of documents currently present in the index.
  #addedIds = new Set();

  constructor(storage) {
    this.#storage = storage;
  }

  // Decode a flat list of uint32 (LE) ids, each optionally followed by a
  // single SEPARATOR byte, into `target`.
  #decodeIdList(buffer, target) {
    const view = new DataView(buffer);
    const max = buffer.byteLength;
    let offset = 0;
    while (offset < max && !(offset + 4 > max)) {
      target.add(view.getUint32(offset, true));
      offset += 4;
      if (offset < max && view.getUint8(offset) === SEPARATOR) offset += 1;
    }
  }

  // Encode a Set of ids as [uint32 LE][SEPARATOR] records.
  #encodeIdList(ids) {
    const buffer = new ArrayBuffer(ids.size * 4 + ids.size);
    const view = new DataView(buffer);
    let offset = 0;
    for (const id of ids) {
      view.setUint32(offset, id, true);
      offset += 4;
      view.setUint8(offset, SEPARATOR);
      offset += 1;
    }
    return buffer;
  }

  /**
   * Load metadata and id lists from storage, REPLACING in-memory state.
   * Fix: the previous implementation merged into the existing Sets without
   * clearing them, so ids whose backing file had been removed from storage
   * incorrectly survived a reload.
   */
  async load() {
    const buffer = await this.#storage.read(META_FILE);
    this.#meta = buffer
      ? JSON.parse(new TextDecoder().decode(buffer))
      : { wordSegments: [], charSegments: [] };
    this.#deletedIds.clear();
    const deletedBuffer = await this.#storage.read(DELETED_IDS_FILE);
    if (deletedBuffer) this.#decodeIdList(deletedBuffer, this.#deletedIds);
    this.#addedIds.clear();
    const addedBuffer = await this.#storage.read(ADDED_IDS_FILE);
    if (addedBuffer) this.#decodeIdList(addedBuffer, this.#addedIds);
  }

  /** Persist metadata and id lists; an empty id list deletes its file. */
  async save() {
    const json = JSON.stringify(this.#meta);
    await this.#storage.write(META_FILE, new TextEncoder().encode(json).buffer);
    if (this.#deletedIds.size === 0)
      await this.#storage.remove(DELETED_IDS_FILE);
    else
      await this.#storage.write(DELETED_IDS_FILE, this.#encodeIdList(this.#deletedIds));
    if (this.#addedIds.size === 0)
      await this.#storage.remove(ADDED_IDS_FILE);
    else
      await this.#storage.write(ADDED_IDS_FILE, this.#encodeIdList(this.#addedIds));
  }

  /** Segment list for the given index type ("word" or "char"). */
  getSegments(type) {
    return type === "word" ? this.#meta.wordSegments : this.#meta.charSegments;
  }

  /** Returns the internal Set of deleted ids (by reference). */
  getDeletedIds() {
    return this.#deletedIds;
  }

  addDeletedId(id) {
    this.#deletedIds.add(id);
  }

  isDeleted(id) {
    return this.#deletedIds.has(id);
  }

  addAddedId(id) {
    this.#addedIds.add(id);
  }

  removeAddedId(id) {
    this.#addedIds.delete(id);
  }

  isAdded(id) {
    return this.#addedIds.has(id);
  }

  /** Returns the internal Set of added ids (by reference). */
  getAddedIds() {
    return this.#addedIds;
  }

  /**
   * Whether a document id was ever added (including ids since deleted).
   * @param {number} id - Document id.
   * @returns {boolean}
   */
  hasDocument(id) {
    return this.#addedIds.has(id) || this.#deletedIds.has(id);
  }

  /** Last segment entry for the given type, or null when none exist. */
  getLastSegmentInfo(type) {
    const segments = this.getSegments(type);
    return segments.length === 0 ? null : segments[segments.length - 1];
  }

  /**
   * Record segment progress. When `isNew`, append a fresh segment entry;
   * otherwise extend the last entry in place (only when it matches
   * `filename`, so a mismatched update cannot corrupt another segment).
   */
  updateSegment(type, filename, start, end, tokenCount, isNew) {
    const segments = type === "word" ? this.#meta.wordSegments : this.#meta.charSegments;
    if (isNew) {
      segments.push({ filename, start, end, tokenCount });
      return;
    }
    const last = segments[segments.length - 1];
    if (last && last.filename === filename) {
      last.end = end;
      last.tokenCount = tokenCount;
    }
  }

  /** Drop all in-memory state (does not touch storage). */
  reset() {
    this.#meta = { wordSegments: [], charSegments: [] };
    this.#deletedIds.clear();
    this.#addedIds.clear();
  }
}
|
|
112
|
+
/**
 * Append-only cache of tokenized documents, serialized as:
 * [id u32 LE][tokenCount u32 LE]([len u16 LE][utf8 bytes])* [SEPARATOR]
 * per document. Tokens longer than 65535 UTF-8 bytes are truncated.
 */
class IntermediateCache {
  // Record terminator byte written after each document.
  static SEPARATOR = 30;

  #storage;

  constructor(storage) {
    this.#storage = storage;
  }

  /**
   * Serialize `docs` and append them to `filename`.
   * @returns {Promise<number>} File size in bytes after the append.
   */
  async appendBatch(filename, docs) {
    if (docs.length === 0) return await this.#storage.getFileSize(filename);
    const encoder = new TextEncoder();
    // First pass: compute the exact serialized size.
    let byteCount = 0;
    for (const doc of docs) {
      byteCount += 8; // id + token count
      for (const token of doc.tokens) {
        byteCount += 2 + Math.min(encoder.encode(token).byteLength, 65535);
      }
      byteCount += 1; // record separator
    }
    // Second pass: serialize into one contiguous buffer.
    const out = new Uint8Array(byteCount);
    let cursor = 0;
    for (const doc of docs) {
      const encodedTokens = doc.tokens.map((token) => {
        const bytes = encoder.encode(token);
        return bytes.byteLength > 65535 ? bytes.slice(0, 65535) : bytes;
      });
      const header = new DataView(out.buffer, cursor);
      header.setUint32(0, doc.id, true);
      header.setUint32(4, encodedTokens.length, true);
      cursor += 8;
      for (const bytes of encodedTokens) {
        new DataView(out.buffer, cursor).setUint16(0, bytes.byteLength, true);
        cursor += 2;
        out.set(bytes, cursor);
        cursor += bytes.byteLength;
      }
      out[cursor++] = IntermediateCache.SEPARATOR;
    }
    await this.#storage.append(filename, out.buffer);
    return await this.#storage.getFileSize(filename);
  }

  /**
   * Decode the documents stored in bytes [start, end) of `filename`.
   * Truncated trailing records are dropped rather than throwing.
   */
  async readRange(filename, start, end) {
    const buffer = await this.#storage.readRange(filename, start, end);
    if (!buffer || buffer.byteLength === 0) return [];
    const view = new DataView(buffer);
    const bytes = new Uint8Array(buffer);
    const decoder = new TextDecoder();
    const docs = [];
    const max = buffer.byteLength;
    let cursor = 0;
    while (cursor < max && !(cursor + 8 > max)) {
      const id = view.getUint32(cursor, true);
      cursor += 4;
      const tokenCount = view.getUint32(cursor, true);
      cursor += 4;
      const tokens = [];
      for (let i = 0; i < tokenCount && !(cursor + 2 > max); i++) {
        const len = view.getUint16(cursor, true);
        cursor += 2;
        if (cursor + len > max) break;
        tokens.push(decoder.decode(new Uint8Array(buffer, cursor, len)));
        cursor += len;
      }
      if (cursor < max && bytes[cursor] === IntermediateCache.SEPARATOR) cursor += 1;
      docs.push({ id, tokens });
    }
    return docs;
  }

  /** Current size of `filename` in bytes. */
  async getCurrentSize(filename) {
    return await this.#storage.getFileSize(filename);
  }
}
|
|
173
|
+
/**
 * MurmurHash3 (x86, 32-bit) over the UTF-16 code units of `str`
 * (each code unit is masked with & 255, so it is NOT UTF-8 based).
 * @param {string} str - Input string.
 * @param {number} [h=0x12345678] - Seed.
 * @returns {number} 32-bit unsigned hash.
 *
 * BUG FIX: the body loop previously ran while `i < len >> 2` — i.e. it
 * compared the character cursor (which advances 4 per block) against the
 * block COUNT, so roughly everything past the first quarter of the string
 * was silently ignored, producing massive collisions for long tokens.
 * The canonical bound is `len - (len & 3)`. NOTE: this changes hash
 * values, so index segment files written by earlier versions must be
 * rebuilt before they can be searched with this code.
 */
function murmur3_32(str, h = 305419896) {
  // 32-bit modular multiply without losing precision in doubles.
  const mul32 = (x, c) => ((x & 0xffff) * c + (((x >>> 16) * c & 0xffff) << 16)) & 4294967295;
  const rotl32 = (x, r) => (x << r) | (x >>> (32 - r));
  const len = str.length;
  const tailLen = len & 3;
  const blockBytes = len - tailLen; // chars consumed by full 4-char blocks
  let i = 0;
  // Body: mix each aligned 4-char block.
  while (i < blockBytes) {
    let k = (str.charCodeAt(i) & 255)
      | ((str.charCodeAt(++i) & 255) << 8)
      | ((str.charCodeAt(++i) & 255) << 16)
      | ((str.charCodeAt(++i) & 255) << 24);
    ++i;
    k = mul32(k, 3432918353);
    k = rotl32(k, 15);
    k = mul32(k, 461845907);
    h ^= k;
    h = rotl32(h, 13);
    h = mul32(h, 5);
    // h = h * 5 + 0xe6546b64, split into 16-bit halves (0x6b64 = 27492, 0xe654 = 58964).
    h = (h & 65535) + 27492 + ((((h >>> 16) + 58964) & 65535) << 16);
  }
  // Tail: mix the remaining 1-3 chars.
  if (tailLen > 0) {
    let k = 0;
    if (tailLen >= 3) k ^= (str.charCodeAt(i + 2) & 255) << 16;
    if (tailLen >= 2) k ^= (str.charCodeAt(i + 1) & 255) << 8;
    k ^= str.charCodeAt(i) & 255;
    k = mul32(k, 3432918353);
    k = rotl32(k, 15);
    k = mul32(k, 461845907);
    h ^= k;
  }
  // Finalization: avalanche.
  h ^= len;
  h ^= h >>> 16;
  h = mul32(h, 2246822507);
  h ^= h >>> 13;
  h = mul32(h, 3266489909);
  h ^= h >>> 16;
  return h >>> 0;
}
/**
 * Immutable on-disk inverted-index segment.
 *
 * Layout: [magic "INDX" u32 BE][entryCount u32 LE][tokensOffset u32 LE],
 * then `entryCount` 20-byte dictionary entries (hash, tokenByteLen,
 * tokenOffset, postingsOffset, postingsCount — all u32 LE) sorted by
 * (hash, token), then the postings arrays (u32 LE ids), then the
 * NUL-terminated UTF-8 token strings.
 */
class IndexSegment {
  #filename;
  #storage;
  #buffer = null; // raw segment bytes once loaded or built
  #view = null;   // DataView over #buffer

  /**
   * Hash a token for dictionary lookup.
   * @param {string} str - Token to hash.
   * @returns {number} 32-bit unsigned hash.
   */
  static hash(str) {
    return murmur3_32(str);
  }

  constructor(filename, storage) {
    this.#filename = filename;
    this.#storage = storage;
  }

  /** Load the segment from storage; true when an index is available. */
  async loadIndex() {
    if (this.#buffer) return true;
    this.#buffer = await this.#storage.read(this.#filename);
    if (!this.#buffer) return false;
    this.#view = new DataView(this.#buffer);
    return true;
  }

  /** Build the segment from tokenized documents and persist it. */
  async buildAndSave(docs) {
    // token -> { hash, postings[] }; postings deduplicated per document.
    const tokenMap = new Map();
    for (const doc of docs) {
      const seen = new Map();
      for (const token of doc.tokens) {
        if (seen.has(token)) continue;
        seen.set(token, true);
        if (!tokenMap.has(token)) {
          tokenMap.set(token, { hash: IndexSegment.hash(token), postings: [] });
        }
        tokenMap.get(token).postings.push(doc.id);
      }
    }
    // Sort by hash (binary-search key), breaking ties by token text.
    const entries = Array.from(tokenMap.entries());
    entries.sort(([tokenA, a], [tokenB, b]) => (a.hash !== b.hash ? a.hash - b.hash : tokenA.localeCompare(tokenB)));
    const encoder = new TextEncoder();
    let postingsTotal = 0;
    let tokenBytesTotal = 0;
    for (const [token, { postings }] of entries) {
      postingsTotal += postings.length;
      tokenBytesTotal += encoder.encode(token).length + 1; // +1 NUL terminator
    }
    const headerSize = 12;
    const dictSize = entries.length * 20;
    const tokensOffset = headerSize + dictSize + postingsTotal * 4;
    const buffer = new ArrayBuffer(tokensOffset + tokenBytesTotal);
    const view = new DataView(buffer);
    view.setUint32(0, 1229866072);           // magic "INDX", big-endian
    view.setUint32(4, entries.length, true); // dictionary entry count
    view.setUint32(8, tokensOffset, true);   // where token strings start
    let dictPos = headerSize;
    let postingsPos = headerSize + dictSize;
    let tokenPos = tokensOffset;
    for (const [token, { hash, postings }] of entries) {
      const tokenBytes = encoder.encode(token);
      view.setUint32(dictPos, hash, true);
      view.setUint32(dictPos + 4, tokenBytes.length, true);
      view.setUint32(dictPos + 8, tokenPos, true);
      view.setUint32(dictPos + 12, postingsPos, true);
      view.setUint32(dictPos + 16, postings.length, true);
      dictPos += 20;
      for (const id of postings) {
        view.setUint32(postingsPos, id, true);
        postingsPos += 4;
      }
      for (const b of tokenBytes) view.setUint8(tokenPos++, b);
      view.setUint8(tokenPos++, 0);
    }
    await this.#storage.write(this.#filename, buffer);
    this.#buffer = buffer;
    this.#view = view;
  }

  /**
   * Exact-match lookup: binary-search the hash-sorted dictionary, then
   * disambiguate hash collisions by comparing the stored token bytes.
   * When the matched hash has no colliding neighbors the token text is
   * NOT re-verified (a cross-token hash collision could then return a
   * foreign postings list — inherent to this format's fast path).
   * @returns {number[]} Document ids; empty when absent or not loaded.
   */
  search(term) {
    if (!this.#view || !this.#buffer) return [];
    const view = this.#view;
    const target = IndexSegment.hash(term);
    const count = view.getUint32(4, true);
    const headerSize = 12;
    const entrySize = 20;
    const decoder = new TextDecoder();
    // Read the postings array referenced by the dict entry at `pos`.
    const postingsAt = (pos) => {
      const offset = view.getUint32(pos + 12, true);
      const length = view.getUint32(pos + 16, true);
      const ids = [];
      for (let j = 0; j < length; j++) ids.push(view.getUint32(offset + j * 4, true));
      return ids;
    };
    let lo = 0;
    let hi = count - 1;
    while (lo <= hi) {
      const mid = (lo + hi) >>> 1;
      const midPos = headerSize + mid * entrySize;
      const midHash = view.getUint32(midPos, true);
      if (midHash < target) {
        lo = mid + 1;
      } else if (midHash > target) {
        hi = mid - 1;
      } else {
        const prevCollides = mid > 0 && view.getUint32(headerSize + (mid - 1) * entrySize, true) === target;
        const nextCollides = mid < count - 1 && view.getUint32(headerSize + (mid + 1) * entrySize, true) === target;
        if (!prevCollides && !nextCollides) {
          return postingsAt(midPos); // unique hash: fast path
        }
        // Collision run: rewind to its first entry, then scan for the exact token.
        let first = mid;
        while (first > 0 && view.getUint32(headerSize + (first - 1) * entrySize, true) === target) first--;
        for (let i = first; i < count; i++) {
          const pos = headerSize + i * entrySize;
          if (view.getUint32(pos, true) !== target) break;
          const tokenLen = view.getUint32(pos + 4, true);
          const tokenOffset = view.getUint32(pos + 8, true);
          const tokenBytes = new Uint8Array(this.#buffer, tokenOffset, tokenLen);
          if (decoder.decode(tokenBytes) === term) return postingsAt(pos);
        }
        return [];
      }
    }
    return [];
  }
}
|
|
280
|
+
// Default tokenizer used when the caller provides none.
// Preferred path: `Intl.Segmenter` word segmentation — keeps only word-like
// segments, lowercased. Fallback (Segmenter missing or throwing): lowercase
// the text and split on runs of characters that are neither ASCII
// alphanumerics nor CJK ideographs, dropping empty pieces.
const defaultTokenize = ({ text }) => {
  try {
    const segmenterUsable =
      typeof Intl !== "undefined" &&
      typeof Intl.Segmenter === "function" &&
      typeof Array.from === "function";
    if (segmenterUsable) {
      const segments = new Intl.Segmenter([], { granularity: "word" }).segment(text);
      if (typeof segments === "object" && segments !== null) {
        const words = [];
        for (const seg of Array.from(segments)) {
          if (seg?.isWordLike) words.push(seg?.segment?.toLowerCase() || "");
        }
        return words;
      }
    }
  } catch {
    // Segmenter unavailable or failed — fall through to the regex splitter.
  }
  const tokens = [];
  for (const piece of text.toLowerCase().split(/[^a-z0-9\u4e00-\u9fa5]+/g)) {
    if (piece.length > 0) tokens.push(piece);
  }
  return tokens;
};
// Intermediate cache file for multi-character ("word") tokens.
const WORD_CACHE_FILE = "word_cache.bin";
// Intermediate cache file for single-character tokens.
const CHAR_CACHE_FILE = "char_cache.bin";
|
|
291
|
+
// Orchestrates a segmented full-text index. Tokenized documents are appended
// to on-disk intermediate caches (WORD_CACHE_FILE for multi-char tokens,
// CHAR_CACHE_FILE for single-char tokens); once enough tokens accumulate,
// an IndexSegment is (re)built from a byte range of the cache. Segment list
// and added/deleted doc-id sets are persisted through MetaManager; `storage`
// is the pluggable backend passed in via config.
class SearchEngine {
  #storage;          // storage backend (config.storage)
  #meta;             // MetaManager: segment metadata + added/deleted id sets
  #cache;            // IntermediateCache: append-only token caches
  #segments;         // Map<filename, IndexSegment> of loaded segments
  #initialized = !1; // lazy #init() guard
  #config;
  // Batch-processing state
  #inBatch = !1;
  #pendingTokenCounts = { word: 0, char: 0 };
  // Merges caller config over defaults, then validates that each minSave is
  // strictly below its segment threshold (otherwise a segment could never be
  // persisted before rolling over).
  constructor(config) {
    if (this.#config = {
      wordSegmentTokenThreshold: 1e5,
      charSegmentTokenThreshold: 5e5,
      minWordTokenSave: 0,
      minCharTokenSave: 0,
      // NOTE(review): `...config` is spread AFTER this default, so a caller
      // passing `indexingTokenizer: undefined` explicitly would clobber the
      // fallback and crash #getIndexingTokens — confirm intended.
      indexingTokenizer: config.indexingTokenizer || defaultTokenize,
      ...config
    }, (this.#config.minWordTokenSave || 0) >= (this.#config.wordSegmentTokenThreshold || 1e5))
      throw new Error("minWordTokenSave must be less than wordSegmentTokenThreshold");
    if ((this.#config.minCarTokenSave || 0, this.#config.minCharTokenSave || 0) >= (this.#config.charSegmentTokenThreshold || 5e5))
      throw new Error("minCharTokenSave must be less than charSegmentTokenThreshold");
    this.#storage = config.storage,
    this.#meta = new MetaManager(this.#storage),
    this.#cache = new IntermediateCache(this.#storage),
    this.#segments = /* @__PURE__ */ new Map();
  }
  /**
   * Start a batch.
   * While a batch is open, addDocuments only writes to the cache and does not
   * trigger index-segment builds.
   */
  startBatch() {
    this.#inBatch = !0, this.#pendingTokenCounts = { word: 0, char: 0 };
  }
  /**
   * End a batch.
   * Runs the segment-build check for any pending tokens and saves metadata.
   */
  async endBatch() {
    this.#inBatch = !1,
    this.#pendingTokenCounts.word > 0 && await this.#processSegmentLogic("word", this.#pendingTokenCounts.word),
    this.#pendingTokenCounts.char > 0 && await this.#processSegmentLogic("char", this.#pendingTokenCounts.char),
    this.#pendingTokenCounts = { word: 0, char: 0 },
    await this.#meta.save();
  }
  // Convenience wrapper: add one document (throws on duplicate/deleted id).
  async addDocument(doc) {
    return this.addDocuments([doc]);
  }
  /**
   * Add a single document, skipping it if the document id already exists.
   * Intended for recovery after a bulk add failed partway through, but also
   * usable directly for single-document adds.
   */
  async addDocumentIfMissing(doc) {
    return this.addDocumentsIfMissing([doc]);
  }
  /**
   * Add multiple documents, skipping document ids that already exist.
   * Intended for recovery after a bulk add failed partway through, but also
   * usable directly for bulk adds.
   */
  async addDocumentsIfMissing(docs) {
    if (this.#initialized || await this.#init(), docs.length === 0) return;
    const deletedIds = this.#meta.getDeletedIds(), batchWordDocs = [], batchCharDocs = [], newDocs = [];
    for (const doc of docs) {
      // Silently skip deleted or already-added ids (contrast addDocuments,
      // which throws for both cases).
      if (deletedIds.has(doc.id) || this.#meta.isAdded(doc.id))
        continue;
      // Split tokens by length: >1 char goes to the "word" cache, exactly
      // 1 char to the "char" cache; empty tokens are dropped.
      const rawTokens = this.#getIndexingTokens(doc), wordTokens = [], charTokens = [];
      for (const t of rawTokens)
        t.length > 1 ? wordTokens.push(t) : t.length === 1 && charTokens.push(t);
      wordTokens.length > 0 && batchWordDocs.push({ id: doc.id, tokens: wordTokens }),
      charTokens.length > 0 && batchCharDocs.push({ id: doc.id, tokens: charTokens }),
      newDocs.push(doc);
    }
    if (newDocs.length === 0) return;
    let addedWordTokens = 0, addedCharTokens = 0;
    if (batchWordDocs.length > 0) {
      await this.#cache.appendBatch(WORD_CACHE_FILE, batchWordDocs);
      for (const d of batchWordDocs) addedWordTokens += d.tokens.length;
    }
    if (batchCharDocs.length > 0) {
      await this.#cache.appendBatch(CHAR_CACHE_FILE, batchCharDocs);
      for (const d of batchCharDocs) addedCharTokens += d.tokens.length;
    }
    for (const doc of newDocs)
      this.#meta.addAddedId(doc.id);
    // In batch mode just accumulate counts; otherwise build segments now and
    // persist metadata.
    this.#inBatch
      ? (this.#pendingTokenCounts.word += addedWordTokens, this.#pendingTokenCounts.char += addedCharTokens)
      : (addedWordTokens > 0 && await this.#processSegmentLogic("word", addedWordTokens), addedCharTokens > 0 && await this.#processSegmentLogic("char", addedCharTokens), await this.#meta.save());
  }
  // Strict bulk add: throws if any id was deleted or already added.
  async addDocuments(docs) {
    if (this.#initialized || await this.#init(), docs.length === 0) return;
    const deletedIds = this.#meta.getDeletedIds(), batchWordDocs = [], batchCharDocs = [];
    for (const doc of docs) {
      if (deletedIds.has(doc.id))
        throw new Error(`Document ID ${doc.id} has been deleted and cannot be re-added.`);
      if (this.#meta.isAdded(doc.id))
        throw new Error(`Document ID ${doc.id} already exists.`);
      // Same length-based word/char split as addDocumentsIfMissing.
      const rawTokens = this.#getIndexingTokens(doc), wordTokens = [], charTokens = [];
      for (const t of rawTokens)
        t.length > 1 ? wordTokens.push(t) : t.length === 1 && charTokens.push(t);
      wordTokens.length > 0 && batchWordDocs.push({ id: doc.id, tokens: wordTokens }),
      charTokens.length > 0 && batchCharDocs.push({ id: doc.id, tokens: charTokens });
    }
    let addedWordTokens = 0, addedCharTokens = 0;
    if (batchWordDocs.length > 0) {
      await this.#cache.appendBatch(WORD_CACHE_FILE, batchWordDocs);
      for (const d of batchWordDocs) addedWordTokens += d.tokens.length;
    }
    if (batchCharDocs.length > 0) {
      await this.#cache.appendBatch(CHAR_CACHE_FILE, batchCharDocs);
      for (const d of batchCharDocs) addedCharTokens += d.tokens.length;
    }
    for (const doc of docs)
      this.#meta.addAddedId(doc.id);
    this.#inBatch
      ? (this.#pendingTokenCounts.word += addedWordTokens, this.#pendingTokenCounts.char += addedCharTokens)
      : (addedWordTokens > 0 && await this.#processSegmentLogic("word", addedWordTokens), addedCharTokens > 0 && await this.#processSegmentLogic("char", addedCharTokens), await this.#meta.save());
  }
  // Search all loaded segments for the query's tokens. Accepts a raw string
  // (wrapped as { text }) or a document-shaped object. Returns results sorted
  // by descending score, truncated to `limit` when it is a positive number.
  async search(query, limit) {
    this.#initialized || await this.#init();
    const queryDoc = typeof query == "string" ? { text: query } : query,
      rawTokens = this.#getSearchTokens(queryDoc),
      wordTerms = rawTokens.filter((t) => t.length > 1),
      charTerms = rawTokens.filter((t) => t.length === 1),
      deletedIds = this.#meta.getDeletedIds(),
      docMatches = /* @__PURE__ */ new Map(),
      segmentsToLoad = /* @__PURE__ */ new Map(),
      // Queue any segment listed in metadata that is not yet loaded.
      collectSegments = (type) => {
        const segmentsMeta = this.#meta.getSegments(type);
        for (const meta of segmentsMeta) {
          const filename = meta.filename;
          !this.#segments.has(filename) && !segmentsToLoad.has(filename) && segmentsToLoad.set(filename, new IndexSegment(filename, this.#storage));
        }
      };
    // Load all missing segments in parallel; only successfully loaded ones
    // are registered.
    collectSegments("word"), collectSegments("char"), await Promise.all(
      Array.from(segmentsToLoad.entries()).map(([filename, segment]) => segment.loadIndex().then((loaded) => {
        loaded && this.#segments.set(filename, segment);
      }))
    );
    const processTerms = async (type, terms) => {
      if (terms.length === 0) return;
      const segmentsMeta = this.#meta.getSegments(type);
      for (const meta of segmentsMeta) {
        const filename = meta.filename, segment = this.#segments.get(filename);
        if (segment)
          for (const term of terms) {
            // Longer terms weigh more: 1 + 0.1 per character.
            const hits = segment.search(term), termScore = 1 + term.length * 0.1;
            for (const id of hits)
              if (!deletedIds.has(id))
                // NOTE(review): a doc's FIRST hit initializes score to 0
                // rather than termScore, so docs matching exactly one term
                // once all score 0 — confirm this is intentional.
                if (!docMatches.has(id))
                  docMatches.set(id, { score: 0, tokens: /* @__PURE__ */ new Set([term]) });
                else {
                  const match = docMatches.get(id);
                  match.score += termScore, match.tokens.add(term);
                }
          }
      }
    };
    await processTerms("word", wordTerms), await processTerms("char", charTerms);
    const results = [];
    return docMatches.forEach((data, id) => {
      results.push({
        id,
        score: data.score,
        tokens: Array.from(data.tokens)
      });
    }), results.sort((a, b) => b.score - a.score), typeof limit == "number" && limit > 0 ? results.slice(0, limit) : results;
  }
  // Tombstone-style delete: records the id as deleted (segments are not
  // rewritten; search filters deleted ids out).
  async removeDocument(id) {
    this.#initialized || await this.#init(),
    this.#meta.addDeletedId(id),
    this.#meta.removeAddedId(id),
    await this.#meta.save();
  }
  // Wipe backing storage and reset all in-memory state, including batch state.
  async clearAll() {
    await this.#storage.clearAll(),
    this.#segments.clear(),
    this.#meta.reset(),
    this.#initialized = !1,
    this.#inBatch = !1,
    this.#pendingTokenCounts = { word: 0, char: 0 };
  }
  // Snapshot of engine state: segment counts, deleted-id count, current cache
  // sizes, and whether a batch is open.
  async getStatus() {
    return this.#initialized || await this.#init(), {
      wordSegments: this.#meta.getSegments("word").length,
      charSegments: this.#meta.getSegments("char").length,
      deleted: this.#meta.getDeletedIds().size,
      wordCacheSize: await this.#cache.getCurrentSize(WORD_CACHE_FILE),
      charCacheSize: await this.#cache.getCurrentSize(CHAR_CACHE_FILE),
      inBatch: this.#inBatch
    };
  }
  /**
   * Check whether a document id has ever been added (including deleted ones).
   * @param id document id
   * @returns boolean — whether the document was ever added
   */
  async hasDocument(id) {
    return this.#initialized || await this.#init(), this.#meta.hasDocument(id);
  }
  // Lazy init: load metadata, then instantiate and load every segment listed
  // for both token types.
  async #init() {
    if (this.#initialized) return;
    await this.#meta.load();
    const allSegments = [
      ...this.#meta.getSegments("word"),
      ...this.#meta.getSegments("char")
    ];
    for (const seg of allSegments)
      this.#segments.has(seg.filename) || this.#segments.set(seg.filename, new IndexSegment(seg.filename, this.#storage)),
      await this.#segments.get(seg.filename).loadIndex();
    this.#initialized = !0;
  }
  // Tokenizer used at indexing time (configurable, defaults to defaultTokenize).
  #getIndexingTokens(doc) {
    return this.#config.indexingTokenizer(doc);
  }
  // Tokenizer used at search time; falls back to the indexing tokenizer.
  #getSearchTokens(doc) {
    return this.#config.searchTokenizer ? this.#config.searchTokenizer(doc) : this.#getIndexingTokens(doc);
  }
  /**
   * Core segment-handling logic.
   * Decides whether to extend the most recent segment or roll over to a new
   * one, then performs the build from the relevant cache byte range.
   */
  async #processSegmentLogic(type, addedTokenCount) {
    const cacheFilename = type === "word" ? WORD_CACHE_FILE : CHAR_CACHE_FILE,
      currentCacheSize = await this.#cache.getCurrentSize(cacheFilename),
      segThreshold = type === "word" ? this.#config.wordSegmentTokenThreshold || 1e5 : this.#config.charSegmentTokenThreshold || 5e5,
      minSave = type === "word" ? this.#config.minWordTokenSave || 0 : this.#config.minCharTokenSave || 0,
      lastSegInfo = this.#meta.getLastSegmentInfo(type);
    let targetSegmentName, startOffset, isNew, newTokenCountTotal;
    // Segment files are named sequentially: `${type}_seg_<n>.bin`.
    const generateSegmentName = () => {
      const nextNumber = this.#meta.getSegments(type).length + 1;
      return `${type}_seg_${nextNumber}.bin`;
    };
    if (!lastSegInfo)
      // No segment yet: start a fresh one covering the cache from offset 0.
      targetSegmentName = generateSegmentName(), isNew = !0, startOffset = 0, newTokenCountTotal = addedTokenCount;
    else {
      const existingTokenCount = lastSegInfo.tokenCount;
      // Roll over to a new segment once the last one is (or would become)
      // full; otherwise rebuild the last segment over its extended range.
      existingTokenCount >= segThreshold || existingTokenCount + addedTokenCount >= segThreshold
        ? (targetSegmentName = generateSegmentName(), isNew = !0, startOffset = lastSegInfo.end, newTokenCountTotal = addedTokenCount)
        : (targetSegmentName = lastSegInfo.filename, isNew = !1, startOffset = lastSegInfo.start, newTokenCountTotal = existingTokenCount + addedTokenCount);
    }
    if (newTokenCountTotal < minSave) {
      // Below the persistence threshold: record the range in metadata only;
      // the actual binary build is deferred until enough tokens accumulate.
      this.#meta.updateSegment(type, targetSegmentName, startOffset, currentCacheSize, newTokenCountTotal, isNew);
      return;
    }
    const docsToBuild = await this.#cache.readRange(cacheFilename, startOffset, currentCacheSize);
    let segment = this.#segments.get(targetSegmentName);
    segment || (segment = new IndexSegment(targetSegmentName, this.#storage), this.#segments.set(targetSegmentName, segment)),
    await segment.buildAndSave(docsToBuild),
    this.#meta.updateSegment(type, targetSegmentName, startOffset, currentCacheSize, newTokenCountTotal, isNew);
  }
}
|
|
504
|
+
// CommonJS exports; both `hash` and `murmur3_32` point at the same function.
exports.SearchEngine = SearchEngine;
exports.hash = murmur3_32;
exports.murmur3_32 = murmur3_32;