gs-search 0.1.4 → 0.1.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.ja.md +1 -1
- package/README.ko.md +1 -1
- package/README.md +3 -3
- package/README.zh-CN.md +2 -2
- package/lib/core.cjs +1 -1
- package/lib/core.d.ts +5 -13
- package/lib/core.js +1 -1
- package/lib/simple.cjs +1 -1
- package/lib/simple.d.ts +5 -11
- package/lib/simple.js +1 -1
- package/lib/type.d.ts +53 -10
- package/package.json +1 -1
package/README.ja.md
CHANGED
|
@@ -134,7 +134,7 @@ const engine = new SearchEngine({
|
|
|
134
134
|
|
|
135
135
|
### SearchEngine
|
|
136
136
|
|
|
137
|
-
- `constructor(options:
|
|
137
|
+
- `constructor(options: ISearchEngineOption)`: 新しいコアエンジンインスタンスを作成
|
|
138
138
|
- `init(): Promise<void>`: エンジンを初期化
|
|
139
139
|
- `addDocument(doc: IDocument): Promise<void>`: 単一ドキュメントを追加
|
|
140
140
|
- `addDocuments(docs: IDocument[]): Promise<void>`: 複数ドキュメントを追加
|
package/README.ko.md
CHANGED
|
@@ -134,7 +134,7 @@ const engine = new SearchEngine({
|
|
|
134
134
|
|
|
135
135
|
### SearchEngine
|
|
136
136
|
|
|
137
|
-
- `constructor(options:
|
|
137
|
+
- `constructor(options: ISearchEngineOption)`: 새로운 코어 엔진 인스턴스 생성
|
|
138
138
|
- `init(): Promise<void>`: 엔진 초기화
|
|
139
139
|
- `addDocument(doc: IDocument): Promise<void>`: 단일 문서 추가
|
|
140
140
|
- `addDocuments(docs: IDocument[]): Promise<void>`: 다중 문서 추가
|
package/README.md
CHANGED
|
@@ -116,7 +116,7 @@ const customTokenizer = (text: string): string[] => {
|
|
|
116
116
|
|
|
117
117
|
// Create engine with custom tokenizers
|
|
118
118
|
const engine = new SearchEngine({
|
|
119
|
-
|
|
119
|
+
storage: new BrowserStorage('search-data'),
|
|
120
120
|
indexingTokenizer: customTokenizer,
|
|
121
121
|
searchTokenizer: customTokenizer
|
|
122
122
|
});
|
|
@@ -127,7 +127,7 @@ const engine = new SearchEngine({
|
|
|
127
127
|
### SimpleSearch
|
|
128
128
|
|
|
129
129
|
**Static Methods (No instance creation required):**
|
|
130
|
-
- `configure(config: Partial<
|
|
130
|
+
- `configure(config: Partial<ISearchEngineOption>): void`: Configure the search engine
|
|
131
131
|
- `addDocument(doc: IDocument): Promise<void>`: Add a single document
|
|
132
132
|
- `addDocuments(docs: IDocument[]): Promise<void>`: Add multiple documents
|
|
133
133
|
- `addDocumentIfMissing(doc: IDocument): Promise<void>`: Add a single document if it doesn't exist
|
|
@@ -141,7 +141,7 @@ const engine = new SearchEngine({
|
|
|
141
141
|
|
|
142
142
|
### SearchEngine
|
|
143
143
|
|
|
144
|
-
- `constructor(options:
|
|
144
|
+
- `constructor(options: ISearchEngineOption)`: Create a new core engine instance
|
|
145
145
|
- `init(): Promise<void>`: Initialize the engine
|
|
146
146
|
- `addDocument(doc: IDocument): Promise<void>`: Add a single document
|
|
147
147
|
- `addDocuments(docs: IDocument[]): Promise<void>`: Add multiple documents
|
package/README.zh-CN.md
CHANGED
|
@@ -225,7 +225,7 @@ SimpleSearch.configure({
|
|
|
225
225
|
### SimpleSearch
|
|
226
226
|
|
|
227
227
|
**静态方法(无需实例创建):**
|
|
228
|
-
- `configure(config: Partial<
|
|
228
|
+
- `configure(config: Partial<ISearchEngineOption>): void`: 配置搜索引擎
|
|
229
229
|
- `addDocument(doc: IDocument): Promise<void>`: 添加单个文档
|
|
230
230
|
- `addDocuments(docs: IDocument[]): Promise<void>`: 添加多个文档
|
|
231
231
|
- `addDocumentIfMissing(doc: IDocument): Promise<void>`: 如果文档不存在则添加单个文档
|
|
@@ -239,7 +239,7 @@ SimpleSearch.configure({
|
|
|
239
239
|
|
|
240
240
|
### SearchEngine
|
|
241
241
|
|
|
242
|
-
- `constructor(options:
|
|
242
|
+
- `constructor(options: ISearchEngineOption)`: 创建一个新的核心引擎实例
|
|
243
243
|
- `init(): Promise<void>`: 初始化引擎
|
|
244
244
|
- `addDocument(doc: IDocument): Promise<void>`: 添加单个文档
|
|
245
245
|
- `addDocuments(docs: IDocument[]): Promise<void>`: 添加多个文档
|
package/lib/core.cjs
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
"use strict";var e=require('./browser'),t=require('./node');const s="search_meta.json",n="deleted_ids.bin",i="added_ids.bin";class o{#e;#t={wordSegments:[],charSegments:[]};#s=new Set;#n=new Set;constructor(e){this.#e=e}async load(){const e=await this.#e.read(s);if(e){const t=(new TextDecoder).decode(e);this.#t=JSON.parse(t)}else this.#t={wordSegments:[],charSegments:[]};const t=await this.#e.read(n);if(t){const e=new DataView(t);let s=0;const n=t.byteLength;for(;s<n&&!(s+4>n);){const t=e.getUint32(s,!0);this.#s.add(t),s+=4,s<n&&30===e.getUint8(s)&&(s+=1)}}const o=await this.#e.read(i);if(o){const e=new DataView(o);let t=0;const s=o.byteLength;for(;t<s&&!(t+4>s);){const n=e.getUint32(t,!0);this.#n.add(n),t+=4,t<s&&30===e.getUint8(t)&&(t+=1)}}}async save(){const e=JSON.stringify(this.#t);if(await this.#e.write(s,(new TextEncoder).encode(e).buffer),0===this.#s.size)await this.#e.remove(n);else{const e=4*this.#s.size+this.#s.size,t=new ArrayBuffer(e),s=new DataView(t);let i=0;for(const e of this.#s)s.setUint32(i,e,!0),i+=4,s.setUint8(i,30),i+=1;await this.#e.write(n,t)}if(0===this.#n.size)await this.#e.remove(i);else{const e=4*this.#n.size+this.#n.size,t=new ArrayBuffer(e),s=new DataView(t);let n=0;for(const e of this.#n)s.setUint32(n,e,!0),n+=4,s.setUint8(n,30),n+=1;await this.#e.write(i,t)}}getSegments(e){return"word"===e?this.#t.wordSegments:this.#t.charSegments}getDeletedIds(){return this.#s}addDeletedId(e){this.#s.add(e)}isDeleted(e){return this.#s.has(e)}addAddedId(e){this.#n.add(e)}removeAddedId(e){this.#n.delete(e)}isAdded(e){return this.#n.has(e)}getAddedIds(){return this.#n}hasDocument(e){return this.#n.has(e)||this.#s.has(e)}getLastSegmentInfo(e){const t=this.getSegments(e);return 0===t.length?null:t[t.length-1]}updateSegment(e,t,s,n,i,o){const a="word"===e?this.#t.wordSegments:this.#t.charSegments;if(o)a.push({filename:t,start:s,end:n,tokenCount:i});else{const 
e=a[a.length-1];e&&e.filename===t&&(e.end=n,e.tokenCount=i)}}reset(){this.#t={wordSegments:[],charSegments:[]},this.#s.clear(),this.#n.clear()}}class a{static SEPARATOR=30;#e;constructor(e){this.#e=e}async appendBatch(e,t){if(0===t.length)return await this.#e.getFileSize(e);const s=new TextEncoder;let n=0;for(const e of t){n+=8;for(const t of e.tokens){n+=2+Math.min(s.encode(t).byteLength,65535)}n+=1}const i=new Uint8Array(n);let o=0;for(const e of t){const t=[];for(const n of e.tokens){const e=s.encode(n),i=e.byteLength>65535?e.slice(0,65535):e;t.push(i)}const n=new DataView(i.buffer,o);n.setUint32(0,e.id,!0),n.setUint32(4,t.length,!0),o+=8;for(const e of t)new DataView(i.buffer,o).setUint16(0,e.byteLength,!0),o+=2,i.set(e,o),o+=e.byteLength;i[o++]=a.SEPARATOR}return await this.#e.append(e,i.buffer),await this.#e.getFileSize(e)}async readRange(e,t,s){const n=await this.#e.readRange(e,t,s);if(!n||0===n.byteLength)return[];const i=new DataView(n),o=new Uint8Array(n),r=new TextDecoder,h=[];let d=0;const c=n.byteLength;for(;d<c&&!(d+8>c);){const e=i.getUint32(d,!0);d+=4;const t=i.getUint32(d,!0);d+=4;const s=[];for(let e=0;e<t&&!(d+2>c);e++){const e=i.getUint16(d,!0);if(d+=2,d+e>c)break;const t=new Uint8Array(n,d,e);s.push(r.decode(t)),d+=e}d<c&&o[d]===a.SEPARATOR&&(d+=1),h.push({id:e,tokens:s})}return h}async getCurrentSize(e){return await this.#e.getFileSize(e)}}function r(e,t=305419896){const s=e.length,n=s>>2;let i=0;for(;i<n;){let s=255&e.charCodeAt(i)|(255&e.charCodeAt(++i))<<8|(255&e.charCodeAt(++i))<<16|(255&e.charCodeAt(++i))<<24;++i,s=3432918353*(65535&s)+((3432918353*(s>>>16)&65535)<<16)&4294967295,s=s<<15|s>>>17,s=461845907*(65535&s)+((461845907*(s>>>16)&65535)<<16)&4294967295,t=27492+(65535&(t=5*(65535&(t=(t^=s)<<13|t>>>19))+((5*(t>>>16)&65535)<<16)&4294967295))+(((t>>>16)+58964&65535)<<16)}let o=0;const a=3&s;return 
a>0&&(a>=3&&(o^=(255&e.charCodeAt(i+2))<<16),a>=2&&(o^=(255&e.charCodeAt(i+1))<<8),a>=1&&(o^=255&e.charCodeAt(i)),o=3432918353*(65535&o)+((3432918353*(o>>>16)&65535)<<16)&4294967295,o=o<<15|o>>>17,o=461845907*(65535&o)+((461845907*(o>>>16)&65535)<<16)&4294967295,t^=o),t^=s,t=2246822507*(65535&(t^=t>>>16))+((2246822507*(t>>>16)&65535)<<16)&4294967295,t=3266489909*(65535&(t^=t>>>13))+((3266489909*(t>>>16)&65535)<<16)&4294967295,(t^=t>>>16)>>>0}class h{#i;#e;#o=null;#a=null;static hash(e){return r(e)}constructor(e,t){this.#i=e,this.#e=t}async loadIndex(){return!!this.#o||(this.#o=await this.#e.read(this.#i),!!this.#o&&(this.#a=new DataView(this.#o),!0))}async buildAndSave(e){const t=new Map;for(const s of e){const e=new Map;for(const n of s.tokens)e.has(n)||(e.set(n,!0),t.has(n)||t.set(n,{hash:h.hash(n),postings:[]}),t.get(n).postings.push(s.id))}const s=Array.from(t.entries());s.sort(([e,{hash:t}],[s,{hash:n}])=>t!==n?t-n:e.localeCompare(s));let n=0,i=0;for(const[e,{postings:t}]of s)n+=t.length,i+=e.length+1;const o=20*s.length,a=12+o+4*n,r=new ArrayBuffer(a+i),d=new DataView(r);d.setUint32(0,1229866072),d.setUint32(4,s.length),d.setUint32(8,a);let c=12,g=12+o,f=a;for(const[e,{hash:t,postings:n}]of s){d.setUint32(c,t),d.setUint32(c+4,e.length),d.setUint32(c+8,f),d.setUint32(c+12,g),d.setUint32(c+16,n.length),c+=20;for(let e=0;e<n.length;e++)d.setUint32(g,n[e],!0),g+=4;const s=(new TextEncoder).encode(e);for(let e=0;e<s.length;e++)d.setUint8(f++,s[e]);d.setUint8(f++,0)}await this.#e.write(this.#i,r),this.#o=r,this.#a=d}search(e){if(!this.#a||!this.#o)return[];const t=h.hash(e),s=this.#a.getUint32(4);let n=0,i=s-1;const o=12,a=20,r=new TextDecoder;for(;n<=i;){const h=n+i>>>1,d=o+h*a,c=this.#a.getUint32(d);if(c<t)n=h+1;else{if(!(c>t)){if(!(h>0&&this.#a.getUint32(o+(h-1)*a)===t||h<s-1&&this.#a.getUint32(o+(h+1)*a)===t)){const e=this.#a.getUint32(o+h*a+12),t=this.#a.getUint32(o+h*a+16),s=[];for(let n=0;n<t;n++)s.push(this.#a.getUint32(e+4*n,!0));return s}let 
n=h;for(;n>0;){const e=o+(n-1)*a;if(this.#a.getUint32(e)!==t)break;n--}for(let i=n;i<s;i++){const s=o+i*a;if(this.#a.getUint32(s)!==t)break;const n=this.#a.getUint32(s+4),h=this.#a.getUint32(s+8),d=new Uint8Array(this.#o,h,n);if(r.decode(d)===e){const e=this.#a.getUint32(s+12),t=this.#a.getUint32(s+16),n=[];for(let s=0;s<t;s++)n.push(this.#a.getUint32(e+4*s,!0));return n}}return[]}i=h-1}}return[]}}const d="word_cache.bin",c="char_cache.bin";exports.SearchEngine=class{#e;#t;#r;#h;#d=!1;#c;#g=!1;#f={word:0,char:0};constructor(s){if(!s.baseDir)throw new Error("SearchEngine requires 'baseDir' in config.");if(this.#c={wordSegmentTokenThreshold:1e5,charSegmentTokenThreshold:5e5,minWordTokenSave:0,minCharTokenSave:0,...s},(this.#c.minWordTokenSave||0)>=(this.#c.wordSegmentTokenThreshold||1e5))throw new Error("minWordTokenSave must be less than wordSegmentTokenThreshold");if((this.#c.minCharTokenSave||0)>=(this.#c.charSegmentTokenThreshold||5e5))throw new Error("minCharTokenSave must be less than charSegmentTokenThreshold");let n=null;if(this.#c.storage&&("object"==typeof this.#c.storage?n=this.#c.storage:"browser"===this.#c.storage?n=new e.BrowserStorage(this.#c.baseDir):"node"===this.#c.storage&&(n=new t.NodeStorage(this.#c.baseDir))),!n){const s=typeof navigator<"u"&&navigator?.storage?.getDirectory instanceof Function,i=typeof process<"u"&&null!=process.versions&&null!=process.versions.node;s?n=new e.BrowserStorage(this.#c.baseDir):i&&(n=new t.NodeStorage(this.#c.baseDir))}if(!n)throw new Error('Storage initialization failed. 
Please configure "storage" explicitly or ensure you are in a supported environment (Browser/Node).');this.#e=n,this.#t=new o(this.#e),this.#r=new a(this.#e),this.#h=new Map}async init(){if(this.#d)return;await this.#t.load();const e=[...this.#t.getSegments("word"),...this.#t.getSegments("char")];for(const t of e)this.#h.has(t.filename)||this.#h.set(t.filename,new h(t.filename,this.#e)),await this.#h.get(t.filename).loadIndex();this.#d=!0}startBatch(){this.#g=!0,this.#f={word:0,char:0}}async endBatch(){this.#g=!1,this.#f.word>0&&await this.#l("word",this.#f.word),this.#f.char>0&&await this.#l("char",this.#f.char),this.#f={word:0,char:0},await this.#t.save()}#w(e){try{if(typeof Intl<"u"&&"function"==typeof Intl.Segmenter&&"function"==typeof Array.from){const t=new Intl.Segmenter([],{granularity:"word"}).segment(e);if("object"==typeof t&&null!==t)return Array.from(t).filter(e=>e?.isWordLike).map(e=>e?.segment?.toLowerCase()||"")}}catch{}return e.toLowerCase().split(/[^a-z0-9\u4e00-\u9fa5]+/g).filter(e=>e.length>0)}#m(e){return this.#c.indexingTokenizer?this.#c.indexingTokenizer(e):this.#w(e.text)}#u(e){return this.#c.searchTokenizer?this.#c.searchTokenizer(e):this.#m(e)}async addDocument(e){return this.addDocuments([e])}async addDocumentIfMissing(e){return this.addDocumentsIfMissing([e])}async addDocumentsIfMissing(e){if(this.#d||await this.init(),0===e.length)return;const t=this.#t.getDeletedIds(),s=[],n=[],i=[];for(const o of e){if(t.has(o.id)||this.#t.isAdded(o.id))continue;const e=this.#m(o),a=[],r=[];for(const t of e)t.length>1?a.push(t):1===t.length&&r.push(t);a.length>0&&s.push({id:o.id,tokens:a}),r.length>0&&n.push({id:o.id,tokens:r}),i.push(o)}if(0===i.length)return;let o=0,a=0;if(s.length>0){await this.#r.appendBatch(d,s);for(const e of s)o+=e.tokens.length}if(n.length>0){await this.#r.appendBatch(c,n);for(const e of n)a+=e.tokens.length}for(const e of i)this.#t.addAddedId(e.id);this.#g?(this.#f.word+=o,this.#f.char+=a):(o>0&&await 
this.#l("word",o),a>0&&await this.#l("char",a),await this.#t.save())}async addDocuments(e){if(this.#d||await this.init(),0===e.length)return;const t=this.#t.getDeletedIds(),s=[],n=[];for(const i of e){if(t.has(i.id))throw new Error(`Document ID ${i.id} has been deleted and cannot be re-added.`);if(this.#t.isAdded(i.id))throw new Error(`Document ID ${i.id} already exists.`);const e=this.#m(i),o=[],a=[];for(const t of e)t.length>1?o.push(t):1===t.length&&a.push(t);o.length>0&&s.push({id:i.id,tokens:o}),a.length>0&&n.push({id:i.id,tokens:a})}let i=0,o=0;if(s.length>0){await this.#r.appendBatch(d,s);for(const e of s)i+=e.tokens.length}if(n.length>0){await this.#r.appendBatch(c,n);for(const e of n)o+=e.tokens.length}for(const t of e)this.#t.addAddedId(t.id);this.#g?(this.#f.word+=i,this.#f.char+=o):(i>0&&await this.#l("word",i),o>0&&await this.#l("char",o),await this.#t.save())}async#l(e,t){const s="word"===e?d:c,n=await this.#r.getCurrentSize(s),i="word"===e?this.#c.wordSegmentTokenThreshold||1e5:this.#c.charSegmentTokenThreshold||5e5,o="word"===e?this.#c.minWordTokenSave||0:this.#c.minCharTokenSave||0,a=this.#t.getLastSegmentInfo(e);let r,g,f,l;const w=()=>{const t=this.#t.getSegments(e).length+1;return`${e}_seg_${t}.bin`};if(a){const e=a.tokenCount;e>=i||e+t>=i?(r=w(),f=!0,g=a.end,l=t):(r=a.filename,f=!1,g=a.start,l=e+t)}else r=w(),f=!0,g=0,l=t;if(l<o)return void this.#t.updateSegment(e,r,g,n,l,f);const m=await this.#r.readRange(s,g,n);let u=this.#h.get(r);u||(u=new h(r,this.#e),this.#h.set(r,u)),await u.buildAndSave(m),this.#t.updateSegment(e,r,g,n,l,f)}async search(e,t){this.#d||await this.init();const s="string"==typeof e?{text:e}:e,n=this.#u(s),i=n.filter(e=>e.length>1),o=n.filter(e=>1===e.length),a=this.#t.getDeletedIds(),r=new Map,d=new Map,c=e=>{const t=this.#t.getSegments(e);for(const e of t){const t=e.filename;!this.#h.has(t)&&!d.has(t)&&d.set(t,new h(t,this.#e))}};c("word"),c("char"),await 
Promise.all(Array.from(d.entries()).map(([e,t])=>t.loadIndex().then(s=>{s&&this.#h.set(e,t)})));const g=async(e,t)=>{if(0===t.length)return;const s=this.#t.getSegments(e);for(const e of s){const s=e.filename,n=this.#h.get(s);if(n)for(const e of t){const t=n.search(e),s=1+.1*e.length;for(const n of t)if(!a.has(n))if(r.has(n)){const t=r.get(n);t.score+=s,t.tokens.add(e)}else r.set(n,{score:0,tokens:new Set([e])})}}};await g("word",i),await g("char",o);const f=[];return r.forEach((e,t)=>{f.push({id:t,score:e.score,tokens:Array.from(e.tokens)})}),f.sort((e,t)=>t.score-e.score),"number"==typeof t&&t>0?f.slice(0,t):f}async removeDocument(e){this.#d||await this.init(),this.#t.addDeletedId(e),this.#t.removeAddedId(e),await this.#t.save()}async clearAll(){await this.#e.clearAll(),this.#h.clear(),this.#t.reset(),this.#d=!1,this.#g=!1,this.#f={word:0,char:0}}async getStatus(){return this.#d||await this.init(),{wordSegments:this.#t.getSegments("word").length,charSegments:this.#t.getSegments("char").length,deleted:this.#t.getDeletedIds().size,wordCacheSize:await this.#r.getCurrentSize(d),charCacheSize:await this.#r.getCurrentSize(c),inBatch:this.#g}}async hasDocument(e){return this.#d||await this.init(),this.#t.hasDocument(e)}},exports.hash=r,exports.murmur3_32=r;
|
|
1
|
+
"use strict";const e="search_meta.json",t="deleted_ids.bin",s="added_ids.bin";class n{#e;#t={wordSegments:[],charSegments:[]};#s=new Set;#n=new Set;constructor(e){this.#e=e}async load(){const n=await this.#e.read(e);if(n){const e=(new TextDecoder).decode(n);this.#t=JSON.parse(e)}else this.#t={wordSegments:[],charSegments:[]};const i=await this.#e.read(t);if(i){const e=new DataView(i);let t=0;const s=i.byteLength;for(;t<s&&!(t+4>s);){const n=e.getUint32(t,!0);this.#s.add(n),t+=4,t<s&&30===e.getUint8(t)&&(t+=1)}}const a=await this.#e.read(s);if(a){const e=new DataView(a);let t=0;const s=a.byteLength;for(;t<s&&!(t+4>s);){const n=e.getUint32(t,!0);this.#n.add(n),t+=4,t<s&&30===e.getUint8(t)&&(t+=1)}}}async save(){const n=JSON.stringify(this.#t);if(await this.#e.write(e,(new TextEncoder).encode(n).buffer),0===this.#s.size)await this.#e.remove(t);else{const e=4*this.#s.size+this.#s.size,s=new ArrayBuffer(e),n=new DataView(s);let i=0;for(const e of this.#s)n.setUint32(i,e,!0),i+=4,n.setUint8(i,30),i+=1;await this.#e.write(t,s)}if(0===this.#n.size)await this.#e.remove(s);else{const e=4*this.#n.size+this.#n.size,t=new ArrayBuffer(e),n=new DataView(t);let i=0;for(const e of this.#n)n.setUint32(i,e,!0),i+=4,n.setUint8(i,30),i+=1;await this.#e.write(s,t)}}getSegments(e){return"word"===e?this.#t.wordSegments:this.#t.charSegments}getDeletedIds(){return this.#s}addDeletedId(e){this.#s.add(e)}isDeleted(e){return this.#s.has(e)}addAddedId(e){this.#n.add(e)}removeAddedId(e){this.#n.delete(e)}isAdded(e){return this.#n.has(e)}getAddedIds(){return this.#n}hasDocument(e){return this.#n.has(e)||this.#s.has(e)}getLastSegmentInfo(e){const t=this.getSegments(e);return 0===t.length?null:t[t.length-1]}updateSegment(e,t,s,n,i,a){const o="word"===e?this.#t.wordSegments:this.#t.charSegments;if(a)o.push({filename:t,start:s,end:n,tokenCount:i});else{const 
e=o[o.length-1];e&&e.filename===t&&(e.end=n,e.tokenCount=i)}}reset(){this.#t={wordSegments:[],charSegments:[]},this.#s.clear(),this.#n.clear()}}class i{static SEPARATOR=30;#e;constructor(e){this.#e=e}async appendBatch(e,t){if(0===t.length)return await this.#e.getFileSize(e);const s=new TextEncoder;let n=0;for(const e of t){n+=8;for(const t of e.tokens){n+=2+Math.min(s.encode(t).byteLength,65535)}n+=1}const a=new Uint8Array(n);let o=0;for(const e of t){const t=[];for(const n of e.tokens){const e=s.encode(n),i=e.byteLength>65535?e.slice(0,65535):e;t.push(i)}const n=new DataView(a.buffer,o);n.setUint32(0,e.id,!0),n.setUint32(4,t.length,!0),o+=8;for(const e of t)new DataView(a.buffer,o).setUint16(0,e.byteLength,!0),o+=2,a.set(e,o),o+=e.byteLength;a[o++]=i.SEPARATOR}return await this.#e.append(e,a.buffer),await this.#e.getFileSize(e)}async readRange(e,t,s){const n=await this.#e.readRange(e,t,s);if(!n||0===n.byteLength)return[];const a=new DataView(n),o=new Uint8Array(n),r=new TextDecoder,h=[];let d=0;const c=n.byteLength;for(;d<c&&!(d+8>c);){const e=a.getUint32(d,!0);d+=4;const t=a.getUint32(d,!0);d+=4;const s=[];for(let e=0;e<t&&!(d+2>c);e++){const e=a.getUint16(d,!0);if(d+=2,d+e>c)break;const t=new Uint8Array(n,d,e);s.push(r.decode(t)),d+=e}d<c&&o[d]===i.SEPARATOR&&(d+=1),h.push({id:e,tokens:s})}return h}async getCurrentSize(e){return await this.#e.getFileSize(e)}}function a(e,t=305419896){const s=e.length,n=s>>2;let i=0;for(;i<n;){let s=255&e.charCodeAt(i)|(255&e.charCodeAt(++i))<<8|(255&e.charCodeAt(++i))<<16|(255&e.charCodeAt(++i))<<24;++i,s=3432918353*(65535&s)+((3432918353*(s>>>16)&65535)<<16)&4294967295,s=s<<15|s>>>17,s=461845907*(65535&s)+((461845907*(s>>>16)&65535)<<16)&4294967295,t=27492+(65535&(t=5*(65535&(t=(t^=s)<<13|t>>>19))+((5*(t>>>16)&65535)<<16)&4294967295))+(((t>>>16)+58964&65535)<<16)}let a=0;const o=3&s;return 
o>0&&(o>=3&&(a^=(255&e.charCodeAt(i+2))<<16),o>=2&&(a^=(255&e.charCodeAt(i+1))<<8),o>=1&&(a^=255&e.charCodeAt(i)),a=3432918353*(65535&a)+((3432918353*(a>>>16)&65535)<<16)&4294967295,a=a<<15|a>>>17,a=461845907*(65535&a)+((461845907*(a>>>16)&65535)<<16)&4294967295,t^=a),t^=s,t=2246822507*(65535&(t^=t>>>16))+((2246822507*(t>>>16)&65535)<<16)&4294967295,t=3266489909*(65535&(t^=t>>>13))+((3266489909*(t>>>16)&65535)<<16)&4294967295,(t^=t>>>16)>>>0}class o{#i;#e;#a=null;#o=null;static hash(e){return a(e)}constructor(e,t){this.#i=e,this.#e=t}async loadIndex(){return!!this.#a||(this.#a=await this.#e.read(this.#i),!!this.#a&&(this.#o=new DataView(this.#a),!0))}async buildAndSave(e){const t=new Map;for(const s of e){const e=new Map;for(const n of s.tokens)e.has(n)||(e.set(n,!0),t.has(n)||t.set(n,{hash:o.hash(n),postings:[]}),t.get(n).postings.push(s.id))}const s=Array.from(t.entries());s.sort(([e,{hash:t}],[s,{hash:n}])=>t!==n?t-n:e.localeCompare(s));let n=0,i=0;for(const[e,{postings:t}]of s)n+=t.length,i+=e.length+1;const a=20*s.length,r=12+a+4*n,h=new ArrayBuffer(r+i),d=new DataView(h);d.setUint32(0,1229866072),d.setUint32(4,s.length),d.setUint32(8,r);let c=12,g=12+a,f=r;for(const[e,{hash:t,postings:n}]of s){d.setUint32(c,t),d.setUint32(c+4,e.length),d.setUint32(c+8,f),d.setUint32(c+12,g),d.setUint32(c+16,n.length),c+=20;for(let e=0;e<n.length;e++)d.setUint32(g,n[e],!0),g+=4;const s=(new TextEncoder).encode(e);for(let e=0;e<s.length;e++)d.setUint8(f++,s[e]);d.setUint8(f++,0)}await this.#e.write(this.#i,h),this.#a=h,this.#o=d}search(e){if(!this.#o||!this.#a)return[];const t=o.hash(e),s=this.#o.getUint32(4);let n=0,i=s-1;const a=12,r=20,h=new TextDecoder;for(;n<=i;){const o=n+i>>>1,d=a+o*r,c=this.#o.getUint32(d);if(c<t)n=o+1;else{if(!(c>t)){if(!(o>0&&this.#o.getUint32(a+(o-1)*r)===t||o<s-1&&this.#o.getUint32(a+(o+1)*r)===t)){const e=this.#o.getUint32(a+o*r+12),t=this.#o.getUint32(a+o*r+16),s=[];for(let n=0;n<t;n++)s.push(this.#o.getUint32(e+4*n,!0));return s}let 
n=o;for(;n>0;){const e=a+(n-1)*r;if(this.#o.getUint32(e)!==t)break;n--}for(let i=n;i<s;i++){const s=a+i*r;if(this.#o.getUint32(s)!==t)break;const n=this.#o.getUint32(s+4),o=this.#o.getUint32(s+8),d=new Uint8Array(this.#a,o,n);if(h.decode(d)===e){const e=this.#o.getUint32(s+12),t=this.#o.getUint32(s+16),n=[];for(let s=0;s<t;s++)n.push(this.#o.getUint32(e+4*s,!0));return n}}return[]}i=o-1}}return[]}}const r=({text:e})=>{try{if(typeof Intl<"u"&&"function"==typeof Intl.Segmenter&&"function"==typeof Array.from){const t=new Intl.Segmenter([],{granularity:"word"}).segment(e);if("object"==typeof t&&null!==t)return Array.from(t).filter(e=>e?.isWordLike).map(e=>e?.segment?.toLowerCase()||"")}}catch{}return e.toLowerCase().split(/[^a-z0-9\u4e00-\u9fa5]+/g).filter(e=>e.length>0)},h="word_cache.bin",d="char_cache.bin";exports.SearchEngine=class{#e;#t;#r;#h;#d=!1;#c;#g=!1;#f={word:0,char:0};constructor(e){if(this.#c={wordSegmentTokenThreshold:1e5,charSegmentTokenThreshold:5e5,minWordTokenSave:0,minCharTokenSave:0,indexingTokenizer:e.indexingTokenizer||r,...e},(this.#c.minWordTokenSave||0)>=(this.#c.wordSegmentTokenThreshold||1e5))throw new Error("minWordTokenSave must be less than wordSegmentTokenThreshold");if((this.#c.minCharTokenSave||0)>=(this.#c.charSegmentTokenThreshold||5e5))throw new Error("minCharTokenSave must be less than charSegmentTokenThreshold");this.#e=e.storage,this.#t=new n(this.#e),this.#r=new i(this.#e),this.#h=new Map}startBatch(){this.#g=!0,this.#f={word:0,char:0}}async endBatch(){this.#g=!1,this.#f.word>0&&await this.#l("word",this.#f.word),this.#f.char>0&&await this.#l("char",this.#f.char),this.#f={word:0,char:0},await this.#t.save()}async addDocument(e){return this.addDocuments([e])}async addDocumentIfMissing(e){return this.addDocumentsIfMissing([e])}async addDocumentsIfMissing(e){if(this.#d||await this.#w(),0===e.length)return;const t=this.#t.getDeletedIds(),s=[],n=[],i=[];for(const a of e){if(t.has(a.id)||this.#t.isAdded(a.id))continue;const 
e=this.#m(a),o=[],r=[];for(const t of e)t.length>1?o.push(t):1===t.length&&r.push(t);o.length>0&&s.push({id:a.id,tokens:o}),r.length>0&&n.push({id:a.id,tokens:r}),i.push(a)}if(0===i.length)return;let a=0,o=0;if(s.length>0){await this.#r.appendBatch(h,s);for(const e of s)a+=e.tokens.length}if(n.length>0){await this.#r.appendBatch(d,n);for(const e of n)o+=e.tokens.length}for(const e of i)this.#t.addAddedId(e.id);this.#g?(this.#f.word+=a,this.#f.char+=o):(a>0&&await this.#l("word",a),o>0&&await this.#l("char",o),await this.#t.save())}async addDocuments(e){if(this.#d||await this.#w(),0===e.length)return;const t=this.#t.getDeletedIds(),s=[],n=[];for(const i of e){if(t.has(i.id))throw new Error(`Document ID ${i.id} has been deleted and cannot be re-added.`);if(this.#t.isAdded(i.id))throw new Error(`Document ID ${i.id} already exists.`);const e=this.#m(i),a=[],o=[];for(const t of e)t.length>1?a.push(t):1===t.length&&o.push(t);a.length>0&&s.push({id:i.id,tokens:a}),o.length>0&&n.push({id:i.id,tokens:o})}let i=0,a=0;if(s.length>0){await this.#r.appendBatch(h,s);for(const e of s)i+=e.tokens.length}if(n.length>0){await this.#r.appendBatch(d,n);for(const e of n)a+=e.tokens.length}for(const t of e)this.#t.addAddedId(t.id);this.#g?(this.#f.word+=i,this.#f.char+=a):(i>0&&await this.#l("word",i),a>0&&await this.#l("char",a),await this.#t.save())}async search(e,t){this.#d||await this.#w();const s="string"==typeof e?{text:e}:e,n=this.#u(s),i=n.filter(e=>e.length>1),a=n.filter(e=>1===e.length),r=this.#t.getDeletedIds(),h=new Map,d=new Map,c=e=>{const t=this.#t.getSegments(e);for(const e of t){const t=e.filename;!this.#h.has(t)&&!d.has(t)&&d.set(t,new o(t,this.#e))}};c("word"),c("char"),await Promise.all(Array.from(d.entries()).map(([e,t])=>t.loadIndex().then(s=>{s&&this.#h.set(e,t)})));const g=async(e,t)=>{if(0===t.length)return;const s=this.#t.getSegments(e);for(const e of s){const s=e.filename,n=this.#h.get(s);if(n)for(const e of t){const t=n.search(e),s=1+.1*e.length;for(const n 
of t)if(!r.has(n))if(h.has(n)){const t=h.get(n);t.score+=s,t.tokens.add(e)}else h.set(n,{score:0,tokens:new Set([e])})}}};await g("word",i),await g("char",a);const f=[];return h.forEach((e,t)=>{f.push({id:t,score:e.score,tokens:Array.from(e.tokens)})}),f.sort((e,t)=>t.score-e.score),"number"==typeof t&&t>0?f.slice(0,t):f}async removeDocument(e){this.#d||await this.#w(),this.#t.addDeletedId(e),this.#t.removeAddedId(e),await this.#t.save()}async clearAll(){await this.#e.clearAll(),this.#h.clear(),this.#t.reset(),this.#d=!1,this.#g=!1,this.#f={word:0,char:0}}async getStatus(){return this.#d||await this.#w(),{wordSegments:this.#t.getSegments("word").length,charSegments:this.#t.getSegments("char").length,deleted:this.#t.getDeletedIds().size,wordCacheSize:await this.#r.getCurrentSize(h),charCacheSize:await this.#r.getCurrentSize(d),inBatch:this.#g}}async hasDocument(e){return this.#d||await this.#w(),this.#t.hasDocument(e)}async#w(){if(this.#d)return;await this.#t.load();const e=[...this.#t.getSegments("word"),...this.#t.getSegments("char")];for(const t of e)this.#h.has(t.filename)||this.#h.set(t.filename,new o(t.filename,this.#e)),await this.#h.get(t.filename).loadIndex();this.#d=!0}#m(e){return this.#c.indexingTokenizer(e)}#u(e){return this.#c.searchTokenizer?this.#c.searchTokenizer(e):this.#m(e)}async#l(e,t){const s="word"===e?h:d,n=await this.#r.getCurrentSize(s),i="word"===e?this.#c.wordSegmentTokenThreshold||1e5:this.#c.charSegmentTokenThreshold||5e5,a="word"===e?this.#c.minWordTokenSave||0:this.#c.minCharTokenSave||0,r=this.#t.getLastSegmentInfo(e);let c,g,f,l;const w=()=>{const t=this.#t.getSegments(e).length+1;return`${e}_seg_${t}.bin`};if(r){const e=r.tokenCount;e>=i||e+t>=i?(c=w(),f=!0,g=r.end,l=t):(c=r.filename,f=!1,g=r.start,l=e+t)}else c=w(),f=!0,g=0,l=t;if(l<a)return void this.#t.updateSegment(e,c,g,n,l,f);const m=await this.#r.readRange(s,g,n);let u=this.#h.get(c);u||(u=new o(c,this.#e),this.#h.set(c,u)),await 
u.buildAndSave(m),this.#t.updateSegment(e,c,g,n,l,f)}},exports.hash=a,exports.murmur3_32=a;
|
package/lib/core.d.ts
CHANGED
|
@@ -1,12 +1,11 @@
|
|
|
1
|
-
import {
|
|
1
|
+
import { ISearchEngine, ISearchEngineOption, IDocument, IDocumentBase, IResult, ISearchEngineStatus } from './type';
|
|
2
2
|
|
|
3
3
|
/**
|
|
4
4
|
* 核心搜索引擎类 (多实例支持)
|
|
5
5
|
*/
|
|
6
|
-
declare class SearchEngine {
|
|
6
|
+
declare class SearchEngine implements ISearchEngine {
|
|
7
7
|
#private;
|
|
8
|
-
constructor(config:
|
|
9
|
-
init(): Promise<void>;
|
|
8
|
+
constructor(config: ISearchEngineOption);
|
|
10
9
|
/**
|
|
11
10
|
* 开启批处理
|
|
12
11
|
* 批处理期间 addDocuments 只写入缓存,不触发索引段构建
|
|
@@ -29,17 +28,10 @@ declare class SearchEngine {
|
|
|
29
28
|
*/
|
|
30
29
|
addDocumentsIfMissing<T extends IDocument = IDocument>(docs: T[]): Promise<void>;
|
|
31
30
|
addDocuments<T extends IDocument = IDocument>(docs: T[]): Promise<void>;
|
|
32
|
-
search<T extends IDocumentBase =
|
|
31
|
+
search<T extends IDocumentBase | string = any>(query: T, limit?: number): Promise<IResult[]>;
|
|
33
32
|
removeDocument(id: number): Promise<void>;
|
|
34
33
|
clearAll(): Promise<void>;
|
|
35
|
-
getStatus(): Promise<
|
|
36
|
-
wordSegments: number;
|
|
37
|
-
charSegments: number;
|
|
38
|
-
deleted: number;
|
|
39
|
-
wordCacheSize: number;
|
|
40
|
-
charCacheSize: number;
|
|
41
|
-
inBatch: boolean;
|
|
42
|
-
}>;
|
|
34
|
+
getStatus(): Promise<ISearchEngineStatus>;
|
|
43
35
|
/**
|
|
44
36
|
* 检查文档ID是否曾经添加过(包括已删除的)
|
|
45
37
|
* @param id 文档ID
|
package/lib/core.js
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
import{BrowserStorage as e}from'./browser';import{NodeStorage as t}from'./node';const s="search_meta.json",n="deleted_ids.bin",i="added_ids.bin";class o{#e;#t={wordSegments:[],charSegments:[]};#s=new Set;#n=new Set;constructor(e){this.#e=e}async load(){const e=await this.#e.read(s);if(e){const t=(new TextDecoder).decode(e);this.#t=JSON.parse(t)}else this.#t={wordSegments:[],charSegments:[]};const t=await this.#e.read(n);if(t){const e=new DataView(t);let s=0;const n=t.byteLength;for(;s<n&&!(s+4>n);){const t=e.getUint32(s,!0);this.#s.add(t),s+=4,s<n&&30===e.getUint8(s)&&(s+=1)}}const o=await this.#e.read(i);if(o){const e=new DataView(o);let t=0;const s=o.byteLength;for(;t<s&&!(t+4>s);){const n=e.getUint32(t,!0);this.#n.add(n),t+=4,t<s&&30===e.getUint8(t)&&(t+=1)}}}async save(){const e=JSON.stringify(this.#t);if(await this.#e.write(s,(new TextEncoder).encode(e).buffer),0===this.#s.size)await this.#e.remove(n);else{const e=4*this.#s.size+this.#s.size,t=new ArrayBuffer(e),s=new DataView(t);let i=0;for(const e of this.#s)s.setUint32(i,e,!0),i+=4,s.setUint8(i,30),i+=1;await this.#e.write(n,t)}if(0===this.#n.size)await this.#e.remove(i);else{const e=4*this.#n.size+this.#n.size,t=new ArrayBuffer(e),s=new DataView(t);let n=0;for(const e of this.#n)s.setUint32(n,e,!0),n+=4,s.setUint8(n,30),n+=1;await this.#e.write(i,t)}}getSegments(e){return"word"===e?this.#t.wordSegments:this.#t.charSegments}getDeletedIds(){return this.#s}addDeletedId(e){this.#s.add(e)}isDeleted(e){return this.#s.has(e)}addAddedId(e){this.#n.add(e)}removeAddedId(e){this.#n.delete(e)}isAdded(e){return this.#n.has(e)}getAddedIds(){return this.#n}hasDocument(e){return this.#n.has(e)||this.#s.has(e)}getLastSegmentInfo(e){const t=this.getSegments(e);return 0===t.length?null:t[t.length-1]}updateSegment(e,t,s,n,i,o){const a="word"===e?this.#t.wordSegments:this.#t.charSegments;if(o)a.push({filename:t,start:s,end:n,tokenCount:i});else{const 
e=a[a.length-1];e&&e.filename===t&&(e.end=n,e.tokenCount=i)}}reset(){this.#t={wordSegments:[],charSegments:[]},this.#s.clear(),this.#n.clear()}}class a{static SEPARATOR=30;#e;constructor(e){this.#e=e}async appendBatch(e,t){if(0===t.length)return await this.#e.getFileSize(e);const s=new TextEncoder;let n=0;for(const e of t){n+=8;for(const t of e.tokens){n+=2+Math.min(s.encode(t).byteLength,65535)}n+=1}const i=new Uint8Array(n);let o=0;for(const e of t){const t=[];for(const n of e.tokens){const e=s.encode(n),i=e.byteLength>65535?e.slice(0,65535):e;t.push(i)}const n=new DataView(i.buffer,o);n.setUint32(0,e.id,!0),n.setUint32(4,t.length,!0),o+=8;for(const e of t)new DataView(i.buffer,o).setUint16(0,e.byteLength,!0),o+=2,i.set(e,o),o+=e.byteLength;i[o++]=a.SEPARATOR}return await this.#e.append(e,i.buffer),await this.#e.getFileSize(e)}async readRange(e,t,s){const n=await this.#e.readRange(e,t,s);if(!n||0===n.byteLength)return[];const i=new DataView(n),o=new Uint8Array(n),r=new TextDecoder,h=[];let d=0;const c=n.byteLength;for(;d<c&&!(d+8>c);){const e=i.getUint32(d,!0);d+=4;const t=i.getUint32(d,!0);d+=4;const s=[];for(let e=0;e<t&&!(d+2>c);e++){const e=i.getUint16(d,!0);if(d+=2,d+e>c)break;const t=new Uint8Array(n,d,e);s.push(r.decode(t)),d+=e}d<c&&o[d]===a.SEPARATOR&&(d+=1),h.push({id:e,tokens:s})}return h}async getCurrentSize(e){return await this.#e.getFileSize(e)}}function r(e,t=305419896){const s=e.length,n=s>>2;let i=0;for(;i<n;){let s=255&e.charCodeAt(i)|(255&e.charCodeAt(++i))<<8|(255&e.charCodeAt(++i))<<16|(255&e.charCodeAt(++i))<<24;++i,s=3432918353*(65535&s)+((3432918353*(s>>>16)&65535)<<16)&4294967295,s=s<<15|s>>>17,s=461845907*(65535&s)+((461845907*(s>>>16)&65535)<<16)&4294967295,t=27492+(65535&(t=5*(65535&(t=(t^=s)<<13|t>>>19))+((5*(t>>>16)&65535)<<16)&4294967295))+(((t>>>16)+58964&65535)<<16)}let o=0;const a=3&s;return 
a>0&&(a>=3&&(o^=(255&e.charCodeAt(i+2))<<16),a>=2&&(o^=(255&e.charCodeAt(i+1))<<8),a>=1&&(o^=255&e.charCodeAt(i)),o=3432918353*(65535&o)+((3432918353*(o>>>16)&65535)<<16)&4294967295,o=o<<15|o>>>17,o=461845907*(65535&o)+((461845907*(o>>>16)&65535)<<16)&4294967295,t^=o),t^=s,t=2246822507*(65535&(t^=t>>>16))+((2246822507*(t>>>16)&65535)<<16)&4294967295,t=3266489909*(65535&(t^=t>>>13))+((3266489909*(t>>>16)&65535)<<16)&4294967295,(t^=t>>>16)>>>0}class h{#i;#e;#o=null;#a=null;static hash(e){return r(e)}constructor(e,t){this.#i=e,this.#e=t}async loadIndex(){return!!this.#o||(this.#o=await this.#e.read(this.#i),!!this.#o&&(this.#a=new DataView(this.#o),!0))}async buildAndSave(e){const t=new Map;for(const s of e){const e=new Map;for(const n of s.tokens)e.has(n)||(e.set(n,!0),t.has(n)||t.set(n,{hash:h.hash(n),postings:[]}),t.get(n).postings.push(s.id))}const s=Array.from(t.entries());s.sort(([e,{hash:t}],[s,{hash:n}])=>t!==n?t-n:e.localeCompare(s));let n=0,i=0;for(const[e,{postings:t}]of s)n+=t.length,i+=e.length+1;const o=20*s.length,a=12+o+4*n,r=new ArrayBuffer(a+i),d=new DataView(r);d.setUint32(0,1229866072),d.setUint32(4,s.length),d.setUint32(8,a);let c=12,g=12+o,f=a;for(const[e,{hash:t,postings:n}]of s){d.setUint32(c,t),d.setUint32(c+4,e.length),d.setUint32(c+8,f),d.setUint32(c+12,g),d.setUint32(c+16,n.length),c+=20;for(let e=0;e<n.length;e++)d.setUint32(g,n[e],!0),g+=4;const s=(new TextEncoder).encode(e);for(let e=0;e<s.length;e++)d.setUint8(f++,s[e]);d.setUint8(f++,0)}await this.#e.write(this.#i,r),this.#o=r,this.#a=d}search(e){if(!this.#a||!this.#o)return[];const t=h.hash(e),s=this.#a.getUint32(4);let n=0,i=s-1;const o=12,a=20,r=new TextDecoder;for(;n<=i;){const h=n+i>>>1,d=o+h*a,c=this.#a.getUint32(d);if(c<t)n=h+1;else{if(!(c>t)){if(!(h>0&&this.#a.getUint32(o+(h-1)*a)===t||h<s-1&&this.#a.getUint32(o+(h+1)*a)===t)){const e=this.#a.getUint32(o+h*a+12),t=this.#a.getUint32(o+h*a+16),s=[];for(let n=0;n<t;n++)s.push(this.#a.getUint32(e+4*n,!0));return s}let 
n=h;for(;n>0;){const e=o+(n-1)*a;if(this.#a.getUint32(e)!==t)break;n--}for(let i=n;i<s;i++){const s=o+i*a;if(this.#a.getUint32(s)!==t)break;const n=this.#a.getUint32(s+4),h=this.#a.getUint32(s+8),d=new Uint8Array(this.#o,h,n);if(r.decode(d)===e){const e=this.#a.getUint32(s+12),t=this.#a.getUint32(s+16),n=[];for(let s=0;s<t;s++)n.push(this.#a.getUint32(e+4*s,!0));return n}}return[]}i=h-1}}return[]}}const d="word_cache.bin",c="char_cache.bin";class g{#e;#t;#r;#h;#d=!1;#c;#g=!1;#f={word:0,char:0};constructor(s){if(!s.baseDir)throw new Error("SearchEngine requires 'baseDir' in config.");if(this.#c={wordSegmentTokenThreshold:1e5,charSegmentTokenThreshold:5e5,minWordTokenSave:0,minCharTokenSave:0,...s},(this.#c.minWordTokenSave||0)>=(this.#c.wordSegmentTokenThreshold||1e5))throw new Error("minWordTokenSave must be less than wordSegmentTokenThreshold");if((this.#c.minCharTokenSave||0)>=(this.#c.charSegmentTokenThreshold||5e5))throw new Error("minCharTokenSave must be less than charSegmentTokenThreshold");let n=null;if(this.#c.storage&&("object"==typeof this.#c.storage?n=this.#c.storage:"browser"===this.#c.storage?n=new e(this.#c.baseDir):"node"===this.#c.storage&&(n=new t(this.#c.baseDir))),!n){const s=typeof navigator<"u"&&navigator?.storage?.getDirectory instanceof Function,i=typeof process<"u"&&null!=process.versions&&null!=process.versions.node;s?n=new e(this.#c.baseDir):i&&(n=new t(this.#c.baseDir))}if(!n)throw new Error('Storage initialization failed. 
Please configure "storage" explicitly or ensure you are in a supported environment (Browser/Node).');this.#e=n,this.#t=new o(this.#e),this.#r=new a(this.#e),this.#h=new Map}async init(){if(this.#d)return;await this.#t.load();const e=[...this.#t.getSegments("word"),...this.#t.getSegments("char")];for(const t of e)this.#h.has(t.filename)||this.#h.set(t.filename,new h(t.filename,this.#e)),await this.#h.get(t.filename).loadIndex();this.#d=!0}startBatch(){this.#g=!0,this.#f={word:0,char:0}}async endBatch(){this.#g=!1,this.#f.word>0&&await this.#l("word",this.#f.word),this.#f.char>0&&await this.#l("char",this.#f.char),this.#f={word:0,char:0},await this.#t.save()}#w(e){try{if(typeof Intl<"u"&&"function"==typeof Intl.Segmenter&&"function"==typeof Array.from){const t=new Intl.Segmenter([],{granularity:"word"}).segment(e);if("object"==typeof t&&null!==t)return Array.from(t).filter(e=>e?.isWordLike).map(e=>e?.segment?.toLowerCase()||"")}}catch{}return e.toLowerCase().split(/[^a-z0-9\u4e00-\u9fa5]+/g).filter(e=>e.length>0)}#m(e){return this.#c.indexingTokenizer?this.#c.indexingTokenizer(e):this.#w(e.text)}#u(e){return this.#c.searchTokenizer?this.#c.searchTokenizer(e):this.#m(e)}async addDocument(e){return this.addDocuments([e])}async addDocumentIfMissing(e){return this.addDocumentsIfMissing([e])}async addDocumentsIfMissing(e){if(this.#d||await this.init(),0===e.length)return;const t=this.#t.getDeletedIds(),s=[],n=[],i=[];for(const o of e){if(t.has(o.id)||this.#t.isAdded(o.id))continue;const e=this.#m(o),a=[],r=[];for(const t of e)t.length>1?a.push(t):1===t.length&&r.push(t);a.length>0&&s.push({id:o.id,tokens:a}),r.length>0&&n.push({id:o.id,tokens:r}),i.push(o)}if(0===i.length)return;let o=0,a=0;if(s.length>0){await this.#r.appendBatch(d,s);for(const e of s)o+=e.tokens.length}if(n.length>0){await this.#r.appendBatch(c,n);for(const e of n)a+=e.tokens.length}for(const e of i)this.#t.addAddedId(e.id);this.#g?(this.#f.word+=o,this.#f.char+=a):(o>0&&await 
this.#l("word",o),a>0&&await this.#l("char",a),await this.#t.save())}async addDocuments(e){if(this.#d||await this.init(),0===e.length)return;const t=this.#t.getDeletedIds(),s=[],n=[];for(const i of e){if(t.has(i.id))throw new Error(`Document ID ${i.id} has been deleted and cannot be re-added.`);if(this.#t.isAdded(i.id))throw new Error(`Document ID ${i.id} already exists.`);const e=this.#m(i),o=[],a=[];for(const t of e)t.length>1?o.push(t):1===t.length&&a.push(t);o.length>0&&s.push({id:i.id,tokens:o}),a.length>0&&n.push({id:i.id,tokens:a})}let i=0,o=0;if(s.length>0){await this.#r.appendBatch(d,s);for(const e of s)i+=e.tokens.length}if(n.length>0){await this.#r.appendBatch(c,n);for(const e of n)o+=e.tokens.length}for(const t of e)this.#t.addAddedId(t.id);this.#g?(this.#f.word+=i,this.#f.char+=o):(i>0&&await this.#l("word",i),o>0&&await this.#l("char",o),await this.#t.save())}async#l(e,t){const s="word"===e?d:c,n=await this.#r.getCurrentSize(s),i="word"===e?this.#c.wordSegmentTokenThreshold||1e5:this.#c.charSegmentTokenThreshold||5e5,o="word"===e?this.#c.minWordTokenSave||0:this.#c.minCharTokenSave||0,a=this.#t.getLastSegmentInfo(e);let r,g,f,l;const w=()=>{const t=this.#t.getSegments(e).length+1;return`${e}_seg_${t}.bin`};if(a){const e=a.tokenCount;e>=i||e+t>=i?(r=w(),f=!0,g=a.end,l=t):(r=a.filename,f=!1,g=a.start,l=e+t)}else r=w(),f=!0,g=0,l=t;if(l<o)return void this.#t.updateSegment(e,r,g,n,l,f);const m=await this.#r.readRange(s,g,n);let u=this.#h.get(r);u||(u=new h(r,this.#e),this.#h.set(r,u)),await u.buildAndSave(m),this.#t.updateSegment(e,r,g,n,l,f)}async search(e,t){this.#d||await this.init();const s="string"==typeof e?{text:e}:e,n=this.#u(s),i=n.filter(e=>e.length>1),o=n.filter(e=>1===e.length),a=this.#t.getDeletedIds(),r=new Map,d=new Map,c=e=>{const t=this.#t.getSegments(e);for(const e of t){const t=e.filename;!this.#h.has(t)&&!d.has(t)&&d.set(t,new h(t,this.#e))}};c("word"),c("char"),await 
Promise.all(Array.from(d.entries()).map(([e,t])=>t.loadIndex().then(s=>{s&&this.#h.set(e,t)})));const g=async(e,t)=>{if(0===t.length)return;const s=this.#t.getSegments(e);for(const e of s){const s=e.filename,n=this.#h.get(s);if(n)for(const e of t){const t=n.search(e),s=1+.1*e.length;for(const n of t)if(!a.has(n))if(r.has(n)){const t=r.get(n);t.score+=s,t.tokens.add(e)}else r.set(n,{score:0,tokens:new Set([e])})}}};await g("word",i),await g("char",o);const f=[];return r.forEach((e,t)=>{f.push({id:t,score:e.score,tokens:Array.from(e.tokens)})}),f.sort((e,t)=>t.score-e.score),"number"==typeof t&&t>0?f.slice(0,t):f}async removeDocument(e){this.#d||await this.init(),this.#t.addDeletedId(e),this.#t.removeAddedId(e),await this.#t.save()}async clearAll(){await this.#e.clearAll(),this.#h.clear(),this.#t.reset(),this.#d=!1,this.#g=!1,this.#f={word:0,char:0}}async getStatus(){return this.#d||await this.init(),{wordSegments:this.#t.getSegments("word").length,charSegments:this.#t.getSegments("char").length,deleted:this.#t.getDeletedIds().size,wordCacheSize:await this.#r.getCurrentSize(d),charCacheSize:await this.#r.getCurrentSize(c),inBatch:this.#g}}async hasDocument(e){return this.#d||await this.init(),this.#t.hasDocument(e)}}export{g as SearchEngine,r as hash,r as murmur3_32};
|
|
1
|
+
/**
 * gs-search core module.
 *
 * Documents are tokenized, appended to an append-only token cache (one cache
 * file for multi-character "word" tokens and one for single-character "char"
 * tokens) and periodically compiled into immutable, hash-sorted index segment
 * files. All persistence goes through the storage object supplied by the
 * caller (read / write / append / readRange / getFileSize / remove / clearAll).
 */

const META_FILE = "search_meta.json";
const DELETED_IDS_FILE = "deleted_ids.bin";
const ADDED_IDS_FILE = "added_ids.bin";
const WORD_CACHE_FILE = "word_cache.bin";
const CHAR_CACHE_FILE = "char_cache.bin";

/** Byte written after every binary record (ASCII "record separator"). */
const RECORD_SEPARATOR = 30;

/** Index segment layout constants. */
const INDEX_MAGIC = 1229866072;
const INDEX_HEADER_SIZE = 12;
const INDEX_ENTRY_SIZE = 20;

/** Longest token (in UTF-8 bytes) a cache record can hold: its length field is a uint16. */
const MAX_CACHED_TOKEN_BYTES = 65535;

/**
 * Decodes an ID file (little-endian uint32 values, each optionally followed by
 * a separator byte) and adds every ID to `target`.
 */
function readIdSet(buffer, target) {
  const view = new DataView(buffer);
  const size = buffer.byteLength;
  let offset = 0;
  while (offset + 4 <= size) {
    target.add(view.getUint32(offset, true));
    offset += 4;
    if (offset < size && view.getUint8(offset) === RECORD_SEPARATOR) {
      offset += 1;
    }
  }
}

/** Encodes a set of IDs as little-endian uint32 values, each followed by a separator byte. */
function writeIdSet(ids) {
  const buffer = new ArrayBuffer(5 * ids.size);
  const view = new DataView(buffer);
  let offset = 0;
  for (const id of ids) {
    view.setUint32(offset, id, true);
    view.setUint8(offset + 4, RECORD_SEPARATOR);
    offset += 5;
  }
  return buffer;
}

/**
 * Keeps the engine metadata: the ordered list of word/char segments and the
 * sets of added and deleted document IDs.
 */
class MetaStore {
  #storage;
  #meta = { wordSegments: [], charSegments: [] };
  #deletedIds = new Set();
  #addedIds = new Set();

  constructor(storage) {
    this.#storage = storage;
  }

  /** Reads the segment list and both ID sets from storage; missing files are treated as empty. */
  async load() {
    const rawMeta = await this.#storage.read(META_FILE);
    this.#meta = rawMeta
      ? JSON.parse(new TextDecoder().decode(rawMeta))
      : { wordSegments: [], charSegments: [] };

    const rawDeleted = await this.#storage.read(DELETED_IDS_FILE);
    if (rawDeleted) readIdSet(rawDeleted, this.#deletedIds);

    const rawAdded = await this.#storage.read(ADDED_IDS_FILE);
    if (rawAdded) readIdSet(rawAdded, this.#addedIds);
  }

  /** Writes the segment list and both ID sets; an empty ID set removes its file instead. */
  async save() {
    const json = JSON.stringify(this.#meta);
    await this.#storage.write(META_FILE, new TextEncoder().encode(json).buffer);
    await this.#saveIdSet(DELETED_IDS_FILE, this.#deletedIds);
    await this.#saveIdSet(ADDED_IDS_FILE, this.#addedIds);
  }

  async #saveIdSet(filename, ids) {
    if (ids.size === 0) {
      await this.#storage.remove(filename);
    } else {
      await this.#storage.write(filename, writeIdSet(ids));
    }
  }

  getSegments(type) {
    return type === "word" ? this.#meta.wordSegments : this.#meta.charSegments;
  }

  getDeletedIds() {
    return this.#deletedIds;
  }

  addDeletedId(id) {
    this.#deletedIds.add(id);
  }

  isDeleted(id) {
    return this.#deletedIds.has(id);
  }

  addAddedId(id) {
    this.#addedIds.add(id);
  }

  removeAddedId(id) {
    this.#addedIds.delete(id);
  }

  isAdded(id) {
    return this.#addedIds.has(id);
  }

  getAddedIds() {
    return this.#addedIds;
  }

  /** True when the ID is currently added or has been deleted. */
  hasDocument(id) {
    return this.#addedIds.has(id) || this.#deletedIds.has(id);
  }

  /** Returns the most recent segment of the given type, or null when there is none. */
  getLastSegmentInfo(type) {
    const segments = this.getSegments(type);
    return segments.length === 0 ? null : segments[segments.length - 1];
  }

  /**
   * Appends a new segment entry (`isNew`) or extends the last one when its
   * filename matches; a non-matching filename is ignored.
   */
  updateSegment(type, filename, start, end, tokenCount, isNew) {
    const segments = this.getSegments(type);
    if (isNew) {
      segments.push({ filename, start, end, tokenCount });
      return;
    }
    const last = segments[segments.length - 1];
    if (last && last.filename === filename) {
      last.end = end;
      last.tokenCount = tokenCount;
    }
  }

  reset() {
    this.#meta = { wordSegments: [], charSegments: [] };
    this.#deletedIds.clear();
    this.#addedIds.clear();
  }
}

/**
 * Append-only cache of tokenized documents.
 *
 * Record layout (little-endian): uint32 id, uint32 tokenCount, then for each
 * token a uint16 byte length followed by the UTF-8 bytes, then one separator byte.
 */
class TokenCache {
  static SEPARATOR = RECORD_SEPARATOR;
  #storage;

  constructor(storage) {
    this.#storage = storage;
  }

  /**
   * Appends one record per document and returns the resulting cache file size.
   * @param filename Cache file to append to.
   * @param docs Array of `{ id, tokens }`.
   */
  async appendBatch(filename, docs) {
    if (docs.length === 0) {
      return await this.#storage.getFileSize(filename);
    }

    // Encode every token exactly once; tokens longer than the uint16 length
    // field allows are truncated at the byte level.
    const encoder = new TextEncoder();
    const encoded = docs.map((doc) => ({
      id: doc.id,
      tokens: doc.tokens.map((token) => {
        const bytes = encoder.encode(token);
        return bytes.byteLength > MAX_CACHED_TOKEN_BYTES
          ? bytes.slice(0, MAX_CACHED_TOKEN_BYTES)
          : bytes;
      }),
    }));

    let total = 0;
    for (const doc of encoded) {
      total += 8 + 1;
      for (const bytes of doc.tokens) total += 2 + bytes.byteLength;
    }

    const out = new Uint8Array(total);
    const view = new DataView(out.buffer);
    let offset = 0;
    for (const doc of encoded) {
      view.setUint32(offset, doc.id, true);
      view.setUint32(offset + 4, doc.tokens.length, true);
      offset += 8;
      for (const bytes of doc.tokens) {
        view.setUint16(offset, bytes.byteLength, true);
        offset += 2;
        out.set(bytes, offset);
        offset += bytes.byteLength;
      }
      out[offset++] = TokenCache.SEPARATOR;
    }

    await this.#storage.append(filename, out.buffer);
    return await this.#storage.getFileSize(filename);
  }

  /**
   * Reads and decodes the records stored between byte offsets `start` and `end`.
   * Decoding stops quietly at the first truncated record.
   * @returns Array of `{ id, tokens }`.
   */
  async readRange(filename, start, end) {
    const buffer = await this.#storage.readRange(filename, start, end);
    if (!buffer || buffer.byteLength === 0) return [];

    const view = new DataView(buffer);
    const bytes = new Uint8Array(buffer);
    const decoder = new TextDecoder();
    const size = buffer.byteLength;
    const docs = [];
    let offset = 0;

    while (offset + 8 <= size) {
      const id = view.getUint32(offset, true);
      const tokenCount = view.getUint32(offset + 4, true);
      offset += 8;

      const tokens = [];
      for (let i = 0; i < tokenCount && offset + 2 <= size; i++) {
        const length = view.getUint16(offset, true);
        offset += 2;
        if (offset + length > size) break;
        tokens.push(decoder.decode(new Uint8Array(buffer, offset, length)));
        offset += length;
      }

      if (offset < size && bytes[offset] === TokenCache.SEPARATOR) offset += 1;
      docs.push({ id, tokens });
    }
    return docs;
  }

  async getCurrentSize(filename) {
    return await this.#storage.getFileSize(filename);
  }
}

/**
 * 32-bit MurmurHash3-style string hash (exported as `hash` and `murmur3_32`).
 *
 * NOTE: the block loop advances the character index by four per iteration
 * while bounding it by the number of four-character blocks, so only the
 * leading part of long keys is mixed in (e.g. two 8-character keys sharing
 * their first four characters collide). The arithmetic is intentionally kept
 * exactly as shipped because persisted index files are sorted by these values;
 * lookups therefore always confirm the token itself (see IndexSegment.search).
 *
 * @param key String to hash (only the low byte of each UTF-16 unit is used).
 * @param seed Optional seed.
 * @returns Unsigned 32-bit hash.
 */
function murmur3_32(key, seed = 305419896) {
  const length = key.length;
  const blocks = length >> 2;
  let h = seed;
  let i = 0;

  while (i < blocks) {
    let k =
      255 & key.charCodeAt(i) |
      (255 & key.charCodeAt(++i)) << 8 |
      (255 & key.charCodeAt(++i)) << 16 |
      (255 & key.charCodeAt(++i)) << 24;
    ++i;
    k = 3432918353 * (65535 & k) + ((3432918353 * (k >>> 16) & 65535) << 16) & 4294967295;
    k = k << 15 | k >>> 17;
    k = 461845907 * (65535 & k) + ((461845907 * (k >>> 16) & 65535) << 16) & 4294967295;
    h ^= k;
    h = h << 13 | h >>> 19;
    h = 5 * (65535 & h) + ((5 * (h >>> 16) & 65535) << 16) & 4294967295;
    h = 27492 + (65535 & h) + (((h >>> 16) + 58964 & 65535) << 16);
  }

  let tail = 0;
  const remainder = 3 & length;
  if (remainder > 0) {
    if (remainder >= 3) tail ^= (255 & key.charCodeAt(i + 2)) << 16;
    if (remainder >= 2) tail ^= (255 & key.charCodeAt(i + 1)) << 8;
    tail ^= 255 & key.charCodeAt(i);
    tail = 3432918353 * (65535 & tail) + ((3432918353 * (tail >>> 16) & 65535) << 16) & 4294967295;
    tail = tail << 15 | tail >>> 17;
    tail = 461845907 * (65535 & tail) + ((461845907 * (tail >>> 16) & 65535) << 16) & 4294967295;
    h ^= tail;
  }

  h ^= length;
  h ^= h >>> 16;
  h = 2246822507 * (65535 & h) + ((2246822507 * (h >>> 16) & 65535) << 16) & 4294967295;
  h ^= h >>> 13;
  h = 3266489909 * (65535 & h) + ((3266489909 * (h >>> 16) & 65535) << 16) & 4294967295;
  h ^= h >>> 16;
  return h >>> 0;
}

/**
 * One immutable index segment file.
 *
 * Layout: 12-byte header (magic, entry count, string-table offset; big-endian),
 * then one 20-byte entry per token sorted by hash (hash, token byte length,
 * token offset, postings offset, postings count; big-endian), then the postings
 * (little-endian uint32 document IDs), then the NUL-terminated UTF-8 tokens.
 */
class IndexSegment {
  #filename;
  #storage;
  #buffer = null;
  #view = null;

  static hash(token) {
    return murmur3_32(token);
  }

  constructor(filename, storage) {
    this.#filename = filename;
    this.#storage = storage;
  }

  /** Loads the segment file once; resolves to false when the file does not exist. */
  async loadIndex() {
    if (this.#buffer) return true;
    this.#buffer = await this.#storage.read(this.#filename);
    if (!this.#buffer) return false;
    this.#view = new DataView(this.#buffer);
    return true;
  }

  /**
   * Builds the segment from tokenized documents and writes it to storage.
   * @param docs Array of `{ id, tokens }`; repeated tokens within one record count once.
   */
  async buildAndSave(docs) {
    const postingsByToken = new Map();
    for (const doc of docs) {
      const seen = new Set();
      for (const token of doc.tokens) {
        if (seen.has(token)) continue;
        seen.add(token);
        let entry = postingsByToken.get(token);
        if (!entry) {
          entry = { hash: IndexSegment.hash(token), postings: [] };
          postingsByToken.set(token, entry);
        }
        entry.postings.push(doc.id);
      }
    }

    // Encode each token once. The UTF-8 byte length (not the UTF-16 string
    // length) sizes the string table and is stored in the entry, otherwise
    // any non-ASCII token overruns the buffer.
    const encoder = new TextEncoder();
    const entries = Array.from(postingsByToken.entries()).map(([token, info]) => ({
      token,
      hash: info.hash,
      postings: info.postings,
      bytes: encoder.encode(token),
    }));
    entries.sort((a, b) => (a.hash !== b.hash ? a.hash - b.hash : a.token.localeCompare(b.token)));

    let postingsCount = 0;
    let stringBytes = 0;
    for (const entry of entries) {
      postingsCount += entry.postings.length;
      stringBytes += entry.bytes.byteLength + 1;
    }

    const tableSize = INDEX_ENTRY_SIZE * entries.length;
    const stringsStart = INDEX_HEADER_SIZE + tableSize + 4 * postingsCount;
    const buffer = new ArrayBuffer(stringsStart + stringBytes);
    const view = new DataView(buffer);
    const bytes = new Uint8Array(buffer);

    view.setUint32(0, INDEX_MAGIC);
    view.setUint32(4, entries.length);
    view.setUint32(8, stringsStart);

    let entryOffset = INDEX_HEADER_SIZE;
    let postingsOffset = INDEX_HEADER_SIZE + tableSize;
    let stringOffset = stringsStart;
    for (const entry of entries) {
      view.setUint32(entryOffset, entry.hash);
      view.setUint32(entryOffset + 4, entry.bytes.byteLength);
      view.setUint32(entryOffset + 8, stringOffset);
      view.setUint32(entryOffset + 12, postingsOffset);
      view.setUint32(entryOffset + 16, entry.postings.length);
      entryOffset += INDEX_ENTRY_SIZE;

      for (const id of entry.postings) {
        view.setUint32(postingsOffset, id, true);
        postingsOffset += 4;
      }

      bytes.set(entry.bytes, stringOffset);
      stringOffset += entry.bytes.byteLength;
      bytes[stringOffset++] = 0;
    }

    await this.#storage.write(this.#filename, buffer);
    this.#buffer = buffer;
    this.#view = view;
  }

  /**
   * Returns the document IDs indexed under `token`, or an empty array.
   * The stored token bytes are always compared, so a query token that merely
   * shares a hash with an indexed token never matches.
   */
  search(token) {
    const view = this.#view;
    const buffer = this.#buffer;
    if (!view || !buffer) return [];

    const target = IndexSegment.hash(token);
    const count = view.getUint32(4);

    // Binary search for any entry carrying the target hash.
    let low = 0;
    let high = count - 1;
    let found = -1;
    while (low <= high) {
      const mid = (low + high) >>> 1;
      const current = view.getUint32(INDEX_HEADER_SIZE + mid * INDEX_ENTRY_SIZE);
      if (current < target) {
        low = mid + 1;
      } else if (current > target) {
        high = mid - 1;
      } else {
        found = mid;
        break;
      }
    }
    if (found < 0) return [];

    // Rewind to the first entry of the run of equal hashes.
    let first = found;
    while (first > 0 && view.getUint32(INDEX_HEADER_SIZE + (first - 1) * INDEX_ENTRY_SIZE) === target) {
      first--;
    }

    const wanted = new TextEncoder().encode(token);
    for (let i = first; i < count; i++) {
      const entryOffset = INDEX_HEADER_SIZE + i * INDEX_ENTRY_SIZE;
      if (view.getUint32(entryOffset) !== target) break;

      const length = view.getUint32(entryOffset + 4);
      if (length !== wanted.byteLength) continue;
      const stored = new Uint8Array(buffer, view.getUint32(entryOffset + 8), length);
      if (!wanted.every((byte, index) => byte === stored[index])) continue;

      const postingsOffset = view.getUint32(entryOffset + 12);
      const postingsCount = view.getUint32(entryOffset + 16);
      const ids = [];
      for (let p = 0; p < postingsCount; p++) {
        ids.push(view.getUint32(postingsOffset + 4 * p, true));
      }
      return ids;
    }
    return [];
  }
}

/**
 * Default tokenizer: word segmentation through Intl.Segmenter when available,
 * otherwise a lowercase split on anything outside a-z, 0-9 and U+4E00-U+9FA5.
 */
const defaultTokenizer = ({ text }) => {
  try {
    if (typeof Intl !== "undefined" && "function" == typeof Intl.Segmenter && "function" == typeof Array.from) {
      const segments = new Intl.Segmenter([], { granularity: "word" }).segment(text);
      if ("object" == typeof segments && null !== segments) {
        return Array.from(segments)
          .filter((part) => part?.isWordLike)
          .map((part) => part?.segment?.toLowerCase() || "");
      }
    }
  } catch {
    // Best effort: any Intl failure falls through to the regex tokenizer.
  }
  return text.toLowerCase().split(/[^a-z0-9\u4e00-\u9fa5]+/g).filter((token) => token.length > 0);
};

/** Splits tokens into multi-character ("word") and single-character ("char") lists; empty tokens are dropped. */
function partitionTokens(tokens) {
  const words = [];
  const chars = [];
  for (const token of tokens) {
    if (token.length > 1) words.push(token);
    else if (token.length === 1) chars.push(token);
  }
  return { words, chars };
}

/**
 * Core search engine (multiple independent instances are supported).
 */
class SearchEngine {
  #storage;
  #meta;
  #cache;
  #segments;
  #loaded = false;
  #config;
  #inBatch = false;
  #pending = { word: 0, char: 0 };

  /**
   * @param config Engine options; `storage` is required. `indexingTokenizer`
   *   falls back to the default tokenizer when omitted or undefined.
   * @throws Error when `storage` is missing or a min-save value is not below
   *   its segment threshold.
   */
  constructor(config) {
    if (!config || !config.storage) {
      throw new Error("SearchEngine requires 'storage' in config.");
    }
    this.#config = {
      wordSegmentTokenThreshold: 1e5,
      charSegmentTokenThreshold: 5e5,
      minWordTokenSave: 0,
      minCharTokenSave: 0,
      ...config,
      // Resolved after the spread so an explicit `undefined` cannot erase the default.
      indexingTokenizer: config.indexingTokenizer || defaultTokenizer,
    };
    if ((this.#config.minWordTokenSave || 0) >= (this.#config.wordSegmentTokenThreshold || 1e5)) {
      throw new Error("minWordTokenSave must be less than wordSegmentTokenThreshold");
    }
    if ((this.#config.minCharTokenSave || 0) >= (this.#config.charSegmentTokenThreshold || 5e5)) {
      throw new Error("minCharTokenSave must be less than charSegmentTokenThreshold");
    }
    this.#storage = config.storage;
    this.#meta = new MetaStore(this.#storage);
    this.#cache = new TokenCache(this.#storage);
    this.#segments = new Map();
  }

  /** Starts a batch: documents added until endBatch() are only written to the token cache. */
  startBatch() {
    this.#inBatch = true;
    this.#pending = { word: 0, char: 0 };
  }

  /** Ends the batch, builds segments for the tokens accumulated meanwhile and saves the metadata. */
  async endBatch() {
    this.#inBatch = false;
    if (this.#pending.word > 0) await this.#flushSegment("word", this.#pending.word);
    if (this.#pending.char > 0) await this.#flushSegment("char", this.#pending.char);
    this.#pending = { word: 0, char: 0 };
    await this.#meta.save();
  }

  async addDocument(doc) {
    return this.addDocuments([doc]);
  }

  async addDocumentIfMissing(doc) {
    return this.addDocumentsIfMissing([doc]);
  }

  /**
   * Adds documents, silently skipping IDs that were already added or deleted,
   * as well as IDs repeated inside `docs`.
   */
  async addDocumentsIfMissing(docs) {
    if (!this.#loaded) await this.#ensureLoaded();
    if (docs.length === 0) return;

    const deleted = this.#meta.getDeletedIds();
    const batchIds = new Set();
    const wordDocs = [];
    const charDocs = [];
    const accepted = [];

    for (const doc of docs) {
      if (deleted.has(doc.id) || this.#meta.isAdded(doc.id) || batchIds.has(doc.id)) continue;
      batchIds.add(doc.id);
      const { words, chars } = partitionTokens(this.#tokenizeForIndex(doc));
      if (words.length > 0) wordDocs.push({ id: doc.id, tokens: words });
      if (chars.length > 0) charDocs.push({ id: doc.id, tokens: chars });
      accepted.push(doc);
    }

    if (accepted.length === 0) return;
    await this.#ingest(wordDocs, charDocs, accepted);
  }

  /**
   * Adds documents.
   * @throws Error when any ID was deleted before or already exists; nothing is written in that case.
   */
  async addDocuments(docs) {
    if (!this.#loaded) await this.#ensureLoaded();
    if (docs.length === 0) return;

    const deleted = this.#meta.getDeletedIds();
    const wordDocs = [];
    const charDocs = [];

    for (const doc of docs) {
      if (deleted.has(doc.id)) {
        throw new Error(`Document ID ${doc.id} has been deleted and cannot be re-added.`);
      }
      if (this.#meta.isAdded(doc.id)) {
        throw new Error(`Document ID ${doc.id} already exists.`);
      }
      const { words, chars } = partitionTokens(this.#tokenizeForIndex(doc));
      if (words.length > 0) wordDocs.push({ id: doc.id, tokens: words });
      if (chars.length > 0) charDocs.push({ id: doc.id, tokens: chars });
    }

    await this.#ingest(wordDocs, charDocs, docs);
  }

  /**
   * Searches both indexes.
   * @param query Query string, or an object handed to the search tokenizer as-is.
   * @param limit Optional positive number of results to keep.
   * @returns Results `{ id, score, tokens }` sorted by descending score. Every
   *   matched query token contributes `1 + 0.1 * token.length` to the score.
   */
  async search(query, limit) {
    if (!this.#loaded) await this.#ensureLoaded();

    const normalized = "string" == typeof query ? { text: query } : query;
    const tokens = this.#tokenizeForSearch(normalized);
    const wordTokens = tokens.filter((token) => token.length > 1);
    const charTokens = tokens.filter((token) => token.length === 1);
    const deleted = this.#meta.getDeletedIds();

    // Lazily load segments listed in the metadata that are not in memory yet.
    const missing = new Map();
    for (const type of ["word", "char"]) {
      for (const segment of this.#meta.getSegments(type)) {
        const name = segment.filename;
        if (!this.#segments.has(name) && !missing.has(name)) {
          missing.set(name, new IndexSegment(name, this.#storage));
        }
      }
    }
    await Promise.all(
      Array.from(missing.entries()).map(async ([name, index]) => {
        if (await index.loadIndex()) this.#segments.set(name, index);
      })
    );

    const hits = new Map();
    const collect = (type, queryTokens) => {
      if (queryTokens.length === 0) return;
      for (const segment of this.#meta.getSegments(type)) {
        const index = this.#segments.get(segment.filename);
        if (!index) continue;
        for (const token of queryTokens) {
          const weight = 1 + 0.1 * token.length;
          for (const id of index.search(token)) {
            if (deleted.has(id)) continue;
            const hit = hits.get(id);
            if (hit) {
              hit.score += weight;
              hit.tokens.add(token);
            } else {
              // The first matching token counts too (it used to be stored with score 0).
              hits.set(id, { score: weight, tokens: new Set([token]) });
            }
          }
        }
      }
    };
    collect("word", wordTokens);
    collect("char", charTokens);

    const results = [];
    hits.forEach((hit, id) => {
      results.push({ id, score: hit.score, tokens: Array.from(hit.tokens) });
    });
    results.sort((a, b) => b.score - a.score);
    return "number" == typeof limit && limit > 0 ? results.slice(0, limit) : results;
  }

  /** Marks the ID as deleted (its postings stay on disk but are filtered out of results). */
  async removeDocument(id) {
    if (!this.#loaded) await this.#ensureLoaded();
    this.#meta.addDeletedId(id);
    this.#meta.removeAddedId(id);
    await this.#meta.save();
  }

  /** Clears the storage and resets all in-memory state. */
  async clearAll() {
    await this.#storage.clearAll();
    this.#segments.clear();
    this.#meta.reset();
    this.#loaded = false;
    this.#inBatch = false;
    this.#pending = { word: 0, char: 0 };
  }

  async getStatus() {
    if (!this.#loaded) await this.#ensureLoaded();
    return {
      wordSegments: this.#meta.getSegments("word").length,
      charSegments: this.#meta.getSegments("char").length,
      deleted: this.#meta.getDeletedIds().size,
      wordCacheSize: await this.#cache.getCurrentSize(WORD_CACHE_FILE),
      charCacheSize: await this.#cache.getCurrentSize(CHAR_CACHE_FILE),
      inBatch: this.#inBatch,
    };
  }

  /** True when the ID has ever been added, including IDs deleted since. */
  async hasDocument(id) {
    if (!this.#loaded) await this.#ensureLoaded();
    return this.#meta.hasDocument(id);
  }

  /** Loads the metadata and every listed segment once. */
  async #ensureLoaded() {
    if (this.#loaded) return;
    await this.#meta.load();
    const segments = [...this.#meta.getSegments("word"), ...this.#meta.getSegments("char")];
    for (const segment of segments) {
      if (!this.#segments.has(segment.filename)) {
        this.#segments.set(segment.filename, new IndexSegment(segment.filename, this.#storage));
      }
      await this.#segments.get(segment.filename).loadIndex();
    }
    this.#loaded = true;
  }

  #tokenizeForIndex(doc) {
    return this.#config.indexingTokenizer(doc);
  }

  #tokenizeForSearch(query) {
    return this.#config.searchTokenizer
      ? this.#config.searchTokenizer(query)
      : this.#tokenizeForIndex(query);
  }

  /** Appends tokenized documents to the caches, records their IDs, then builds segments unless batching. */
  async #ingest(wordDocs, charDocs, acceptedDocs) {
    let wordTokens = 0;
    let charTokens = 0;
    if (wordDocs.length > 0) {
      await this.#cache.appendBatch(WORD_CACHE_FILE, wordDocs);
      for (const doc of wordDocs) wordTokens += doc.tokens.length;
    }
    if (charDocs.length > 0) {
      await this.#cache.appendBatch(CHAR_CACHE_FILE, charDocs);
      for (const doc of charDocs) charTokens += doc.tokens.length;
    }
    for (const doc of acceptedDocs) this.#meta.addAddedId(doc.id);

    if (this.#inBatch) {
      this.#pending.word += wordTokens;
      this.#pending.char += charTokens;
      return;
    }
    if (wordTokens > 0) await this.#flushSegment("word", wordTokens);
    if (charTokens > 0) await this.#flushSegment("char", charTokens);
    await this.#meta.save();
  }

  /**
   * Folds `newTokens` freshly cached tokens into the last segment of `type`,
   * or opens a new segment when the last one is full or would reach the
   * threshold. Below the configured min-save count only the metadata is updated.
   */
  async #flushSegment(type, newTokens) {
    const cacheFile = type === "word" ? WORD_CACHE_FILE : CHAR_CACHE_FILE;
    const cacheEnd = await this.#cache.getCurrentSize(cacheFile);
    const threshold = type === "word"
      ? this.#config.wordSegmentTokenThreshold || 1e5
      : this.#config.charSegmentTokenThreshold || 5e5;
    const minSave = type === "word"
      ? this.#config.minWordTokenSave || 0
      : this.#config.minCharTokenSave || 0;
    const last = this.#meta.getLastSegmentInfo(type);
    const nextFilename = () => `${type}_seg_${this.#meta.getSegments(type).length + 1}.bin`;

    let filename;
    let start;
    let isNew;
    let tokenCount;
    if (last && !(last.tokenCount >= threshold || last.tokenCount + newTokens >= threshold)) {
      // Extend the current segment: it is rebuilt from its own start offset.
      filename = last.filename;
      isNew = false;
      start = last.start;
      tokenCount = last.tokenCount + newTokens;
    } else {
      filename = nextFilename();
      isNew = true;
      start = last ? last.end : 0;
      tokenCount = newTokens;
    }

    if (tokenCount < minSave) {
      this.#meta.updateSegment(type, filename, start, cacheEnd, tokenCount, isNew);
      return;
    }

    const docs = await this.#cache.readRange(cacheFile, start, cacheEnd);
    let index = this.#segments.get(filename);
    if (!index) {
      index = new IndexSegment(filename, this.#storage);
      this.#segments.set(filename, index);
    }
    await index.buildAndSave(docs);
    this.#meta.updateSegment(type, filename, start, cacheEnd, tokenCount, isNew);
  }
}

export { SearchEngine, murmur3_32 as hash, murmur3_32 };
|
package/lib/simple.cjs
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
"use strict";var t=require('./core');exports.SimpleSearch=class{static#t=null;static#e
|
|
1
|
+
"use strict";var t=require('./core'),e=require('./browser'),n=require('./node');const s=Object.freeze({wordSegmentTokenThreshold:1e5,minWordTokenSave:0}),c="simple-search";exports.SimpleSearch=class{static#t=null;static#e;static get config(){if(this.#e)return this.#e;const t={...s};return typeof navigator<"u"&&navigator?.storage?.getDirectory instanceof Function?t.storage=new e.BrowserStorage(c):t.storage=new n.NodeStorage(c),this.#e=t}static configure(e){this.#e={...this.config,...e},this.#t&&(this.#t=new t.SearchEngine(this.config))}static async startBatch(){this.#n().startBatch()}static async endBatch(){return this.#n().endBatch()}static async addDocument(t){return this.#n().addDocument(t)}static async addDocumentIfMissing(t){return this.#n().addDocumentIfMissing(t)}static async addDocuments(t){return this.#n().addDocuments(t)}static async addDocumentsIfMissing(t){return this.#n().addDocumentsIfMissing(t)}static async search(t,e){return this.#n().search(t,e)}static async removeDocument(t){return this.#n().removeDocument(t)}static async clearAll(){return this.#n().clearAll()}static async getStatus(){return this.#n().getStatus()}static async hasDocument(t){return this.#n().hasDocument(t)}static#n(){return this.#t||(this.#t=new t.SearchEngine(this.config)),this.#t}};
|
package/lib/simple.d.ts
CHANGED
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
import * as ___type from './type';
|
|
2
|
-
import {
|
|
2
|
+
import { ISearchEngineOption, IDocument, IDocumentBase } from './type';
|
|
3
3
|
|
|
4
4
|
/**
|
|
5
5
|
* 快速使用封装
|
|
@@ -7,27 +7,21 @@ import { ISearchEngineConfig, IDocument, IDocumentBase } from './type';
|
|
|
7
7
|
*/
|
|
8
8
|
declare class SimpleSearch {
|
|
9
9
|
#private;
|
|
10
|
+
static get config(): ISearchEngineOption;
|
|
10
11
|
/**
|
|
11
12
|
* 配置并初始化单例
|
|
12
13
|
*/
|
|
13
|
-
static configure(config: Partial<
|
|
14
|
+
static configure(config: Partial<ISearchEngineOption>): void;
|
|
14
15
|
static startBatch(): Promise<void>;
|
|
15
16
|
static endBatch(): Promise<void>;
|
|
16
17
|
static addDocument<T extends IDocument = IDocument>(doc: T): Promise<void>;
|
|
17
18
|
static addDocumentIfMissing<T extends IDocument = IDocument>(doc: T): Promise<void>;
|
|
18
19
|
static addDocuments<T extends IDocument = IDocument>(docs: T[]): Promise<void>;
|
|
19
20
|
static addDocumentsIfMissing<T extends IDocument = IDocument>(docs: T[]): Promise<void>;
|
|
20
|
-
static search<T extends IDocumentBase =
|
|
21
|
+
static search<T extends IDocumentBase = any>(query: T | string, limit?: number): Promise<___type.IResult[]>;
|
|
21
22
|
static removeDocument(id: number): Promise<void>;
|
|
22
23
|
static clearAll(): Promise<void>;
|
|
23
|
-
static getStatus(): Promise<
|
|
24
|
-
wordSegments: number;
|
|
25
|
-
charSegments: number;
|
|
26
|
-
deleted: number;
|
|
27
|
-
wordCacheSize: number;
|
|
28
|
-
charCacheSize: number;
|
|
29
|
-
inBatch: boolean;
|
|
30
|
-
}>;
|
|
24
|
+
static getStatus(): Promise<___type.ISearchEngineStatus>;
|
|
31
25
|
/**
|
|
32
26
|
* 检查文档ID是否曾经添加过(包括已删除的)
|
|
33
27
|
* @param id 文档ID
|
package/lib/simple.js
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
import{SearchEngine as t}from'./core';
|
|
1
|
+
import { SearchEngine } from './core';
import { BrowserStorage } from './browser';
import { NodeStorage } from './node';

// Options every SimpleSearch configuration starts from.
const DEFAULT_OPTIONS = Object.freeze({
  wordSegmentTokenThreshold: 1e5,
  minWordTokenSave: 0,
});

// Name handed to the automatically chosen storage implementation.
const STORAGE_NAME = "simple-search";

/**
 * Static convenience wrapper around one lazily created SearchEngine.
 */
class SimpleSearch {
  static #engine = null;
  static #options;

  /**
   * Current configuration. On first access it is built from the defaults plus
   * a storage chosen for the environment: BrowserStorage when
   * `navigator.storage.getDirectory` is a function, NodeStorage otherwise.
   */
  static get config() {
    if (this.#options) {
      return this.#options;
    }
    const options = { ...DEFAULT_OPTIONS };
    const hasDirectoryApi =
      typeof navigator !== "undefined" &&
      navigator?.storage?.getDirectory instanceof Function;
    if (hasDirectoryApi) {
      options.storage = new BrowserStorage(STORAGE_NAME);
    } else {
      options.storage = new NodeStorage(STORAGE_NAME);
    }
    this.#options = options;
    return options;
  }

  /**
   * Merges `overrides` into the configuration. An engine that already exists
   * is replaced by a new one built from the merged configuration.
   */
  static configure(overrides) {
    this.#options = { ...this.config, ...overrides };
    if (this.#engine) {
      this.#engine = new SearchEngine(this.config);
    }
  }

  static async startBatch() {
    this.#getEngine().startBatch();
  }

  static async endBatch() {
    return this.#getEngine().endBatch();
  }

  static async addDocument(doc) {
    return this.#getEngine().addDocument(doc);
  }

  static async addDocumentIfMissing(doc) {
    return this.#getEngine().addDocumentIfMissing(doc);
  }

  static async addDocuments(docs) {
    return this.#getEngine().addDocuments(docs);
  }

  static async addDocumentsIfMissing(docs) {
    return this.#getEngine().addDocumentsIfMissing(docs);
  }

  static async search(query, limit) {
    return this.#getEngine().search(query, limit);
  }

  static async removeDocument(id) {
    return this.#getEngine().removeDocument(id);
  }

  static async clearAll() {
    return this.#getEngine().clearAll();
  }

  static async getStatus() {
    return this.#getEngine().getStatus();
  }

  static async hasDocument(id) {
    return this.#getEngine().hasDocument(id);
  }

  // Creates the shared engine on first use.
  static #getEngine() {
    if (!this.#engine) {
      this.#engine = new SearchEngine(this.config);
    }
    return this.#engine;
  }
}

export { SimpleSearch };
|
package/lib/type.d.ts
CHANGED
|
@@ -66,12 +66,9 @@ interface IStorage {
|
|
|
66
66
|
*/
|
|
67
67
|
type IndexType = 'word' | 'char';
|
|
68
68
|
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
* 用于区分不同的搜索引擎实例
|
|
73
|
-
*/
|
|
74
|
-
baseDir: string;
|
|
69
|
+
type IndexingTokenizer = <T extends IDocument = IDocument>(doc: T) => string[];
|
|
70
|
+
type SearchTokenizer = <T extends IDocumentBase = IDocumentBase>(doc: T) => string[];
|
|
71
|
+
interface ISearchEngineOption {
|
|
75
72
|
/**
|
|
76
73
|
* Storage implementation (required): the option is typed `storage: IStorage`, so an instance must be passed
|
|
77
74
|
* - BrowserStorage: OPFS-backed implementation for browsers (pass an instance; the string 'browser' is not accepted by the `IStorage` type)
|
|
@@ -79,7 +76,7 @@ interface ISearchEngineConfig {
|
|
|
79
76
|
* - IStorage: 传入自定义的存储实例
|
|
80
77
|
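* - undefined is not accepted: SearchEngine does not auto-detect the environment (SimpleSearch picks a storage by itself)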
|
|
81
78
|
*/
|
|
82
|
-
storage
|
|
79
|
+
storage: IStorage;
|
|
83
80
|
/**
|
|
84
81
|
* 索引时使用的分词器 (算法核心配置)
|
|
85
82
|
* - 作用: 将文档文本转换为索引用的token序列
|
|
@@ -87,7 +84,7 @@ interface ISearchEngineConfig {
|
|
|
87
84
|
* - 建议: 针对不同语言(中文/英文/日文等)使用专门的分词实现
|
|
88
85
|
* - 影响: 直接决定索引的粒度和搜索的准确性
|
|
89
86
|
*/
|
|
90
|
-
indexingTokenizer?:
|
|
87
|
+
indexingTokenizer?: IndexingTokenizer;
|
|
91
88
|
/**
|
|
92
89
|
* 搜索时使用的分词器 (算法核心配置)
|
|
93
90
|
* - 作用: 将查询文本转换为搜索用的token序列
|
|
@@ -95,7 +92,7 @@ interface ISearchEngineConfig {
|
|
|
95
92
|
* - 建议: 与indexingTokenizer保持一致的分词策略以确保搜索准确性
|
|
96
93
|
* - 影响: 直接决定搜索匹配的范围和结果的相关性
|
|
97
94
|
*/
|
|
98
|
-
searchTokenizer?:
|
|
95
|
+
searchTokenizer?: SearchTokenizer;
|
|
99
96
|
/**
|
|
100
97
|
* 词索引分段阈值 (Token数) - 分段算法配置
|
|
101
98
|
* - 作用: 控制词索引文件的大小,超过阈值时创建新的索引段
|
|
@@ -130,6 +127,52 @@ interface ISearchEngineConfig {
|
|
|
130
127
|
minCharTokenSave?: number;
|
|
131
128
|
}
|
|
132
129
|
|
|
130
|
+
interface ISearchEngineStatus {
|
|
131
|
+
wordSegments: number;
|
|
132
|
+
charSegments: number;
|
|
133
|
+
deleted: number;
|
|
134
|
+
wordCacheSize: number;
|
|
135
|
+
charCacheSize: number;
|
|
136
|
+
inBatch: boolean;
|
|
137
|
+
}
|
|
138
|
+
/**
|
|
139
|
+
* 核心搜索引擎
|
|
140
|
+
*/
|
|
141
|
+
interface ISearchEngine {
|
|
142
|
+
/**
|
|
143
|
+
* 开启批处理
|
|
144
|
+
* 批处理期间 addDocuments 只写入缓存,不触发索引段构建
|
|
145
|
+
*/
|
|
146
|
+
startBatch(): void;
|
|
147
|
+
/**
|
|
148
|
+
* 结束批处理
|
|
149
|
+
* 触发索引构建检查并保存元数据
|
|
150
|
+
*/
|
|
151
|
+
endBatch(): Promise<void>;
|
|
152
|
+
addDocument<T extends IDocument = IDocument>(doc: T): Promise<void>;
|
|
153
|
+
/**
|
|
154
|
+
* 添加单个文档,如果文档ID已存在则跳过
|
|
155
|
+
* 用于在批量添加中途出错后的恢复添加行为,也可直接用于单个文档添加
|
|
156
|
+
*/
|
|
157
|
+
addDocumentIfMissing<T extends IDocument = IDocument>(doc: T): Promise<void>;
|
|
158
|
+
/**
|
|
159
|
+
* 添加多个文档,跳过已存在的文档ID
|
|
160
|
+
* 用于在批量添加中途出错后的恢复添加行为,也可直接用于批量添加
|
|
161
|
+
*/
|
|
162
|
+
addDocumentsIfMissing<T extends IDocument = IDocument>(docs: T[]): Promise<void>;
|
|
163
|
+
addDocuments<T extends IDocument = IDocument>(docs: T[]): Promise<void>;
|
|
164
|
+
/**
 * Searches the index and resolves with the matching results.
 * `query` is typed as `T` (a string or a document-shaped query object), in line
 * with the `SearchEngine.search` declaration that implements this interface.
 * @param query Query string or document-like query object.
 * @param limit Optional maximum number of results.
 */
search<T extends IDocumentBase | string = any>(query: T, limit?: number): Promise<IResult[]>;
|
|
165
|
+
removeDocument(id: number): Promise<void>;
|
|
166
|
+
clearAll(): Promise<void>;
|
|
167
|
+
getStatus(): Promise<ISearchEngineStatus>;
|
|
168
|
+
/**
|
|
169
|
+
* 检查文档ID是否曾经添加过(包括已删除的)
|
|
170
|
+
* @param id 文档ID
|
|
171
|
+
* @returns 文档是否曾经添加过的布尔值
|
|
172
|
+
*/
|
|
173
|
+
hasDocument(id: number): Promise<boolean>;
|
|
174
|
+
}
|
|
175
|
+
|
|
133
176
|
|
|
134
177
|
|
|
135
|
-
export type { IDocument, IDocumentBase, IIndexMeta, IResult,
|
|
178
|
+
export type { IDocument, IDocumentBase, IIndexMeta, IResult, ISearchEngine, ISearchEngineOption, ISearchEngineStatus, ISegmentMeta, IStorage, ITokenizedDoc, IndexType, IndexingTokenizer, SearchTokenizer };
|