gs-search 0.1.0 → 0.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.ja.md +55 -17
- package/README.ko.md +53 -15
- package/README.md +48 -6
- package/README.zh-CN.md +76 -19
- package/lib/index.cjs +1 -1
- package/lib/index.d.ts +37 -11
- package/lib/index.js +1 -1
- package/package.json +4 -3
package/README.ja.md
CHANGED

@@ -1,20 +1,19 @@
  # gs-search

+ JavaScript/TypeScriptアプリケーション向けの軽量、高速、メモリ効率の良い全文検索エンジンです。
+
  ## 他の言語

  - [中文 README](README.zh-CN.md)
  - [English README](README.md)
  - [한국어 README](README.ko.md)

- JavaScript/TypeScriptアプリケーション向けの軽量、高速、メモリ効率の良い全文検索エンジンです。
-
  ## 特徴

  - 🔍 **全文検索**(トークン化サポート付き)
  - 📦 **軽量**(外部依存関係なし)
  - ⚡ **高速**な検索パフォーマンス
  - 📱 **ブラウザ & Node.js** サポート
- - 🌐 **多言語**トークン化
  - 🗄️ **カスタムストレージ**サポート
  - 📊 **バッチ操作**(効率的なインデックス作成)

@@ -73,18 +72,53 @@ const engine = new SearchEngine({
  // エンジンを初期化
  await engine.init();

- //
- await engine.
+ // バッチ操作でドキュメントを追加
+ await engine.startBatch();
  try {
    await engine.addDocuments([
      // ... ドキュメント
    ]);
-   await engine.commit();
  } catch (error) {
-
+   // エラー処理
+ } finally {
+   // エラーが発生しても必ずバッチを終了し、インデックスが正しく再構築されるようにする
+   await engine.endBatch();
  }
  ```

+ ### カスタムトークナイザ
+
+ 特定の言語やトークン化の要件をサポートするために、カスタムトークナイザを設定できます。トークナイザは完全なドキュメントオブジェクトにアクセスできます:
+
+ ```typescript
+ import { SearchEngine } from 'gs-search';
+
+ // カスタムインデックストークナイザ:ドキュメントの複数フィールドを使用
+ const indexingTokenizer = (doc: { id: string; text: string; category: string; author: string }): string[] => {
+   // ドキュメントの全てのプロパティにアクセスできます
+   const fullText = `${doc.text} ${doc.category} ${doc.author}`;
+   return fullText.toLowerCase().split(/\s+/);
+ };
+
+ // カスタム検索トークナイザ:検索コンテキストをサポート
+ const searchTokenizer = (query: { text: string; language?: string; context?: string }): string[] => {
+   // クエリの言語やコンテキストに応じてトークン化を調整できます
+   const tokens = query.text.toLowerCase().split(/\s+/);
+   // コンテキストに応じて追加の検索語を追加
+   if (query.context === 'technical') {
+     tokens.push('technical');
+   }
+   return tokens;
+ };
+
+ // カスタムトークナイザを設定してエンジンを作成
+ const engine = new SearchEngine({
+   baseDir: 'search-data',
+   indexingTokenizer,
+   searchTokenizer
+ });
+ ```
+
  ## APIリファレンス

  ### SimpleSearch

@@ -92,22 +126,26 @@ try {
  - `constructor()`: 新しい検索エンジンインスタンスを作成
  - `addDocument(doc: IDocument): Promise<void>`: 単一のドキュメントを追加
  - `addDocuments(docs: IDocument[]): Promise<void>`: 複数のドキュメントを追加
- - `
+ - `addDocumentIfMissing(doc: IDocument): Promise<void>`: ドキュメントが存在しない場合は単一のドキュメントを追加
+ - `addDocumentsIfMissing(docs: IDocument[]): Promise<void>`: 複数のドキュメントを追加し、既存のドキュメントはスキップ
+ - `removeDocument(id: number): Promise<void>`: ドキュメントを削除
  - `search(query: string, limit?: number): Promise<IResult[]>`: ドキュメントを検索
  - `getStatus(): Promise<IStatus>`: 検索エンジンのステータスを取得

- ###
+ ### SearchEngine

- - `constructor(options:
+ - `constructor(options: ISearchEngineConfig)`: 新しいコアエンジンインスタンスを作成
  - `init(): Promise<void>`: エンジンを初期化
- - `addDocument(doc: IDocument): Promise<void>`:
- - `addDocuments(docs: IDocument[]): Promise<void>`:
- - `
+ - `addDocument(doc: IDocument): Promise<void>`: 単一ドキュメントを追加
+ - `addDocuments(docs: IDocument[]): Promise<void>`: 複数ドキュメントを追加
+ - `addDocumentIfMissing(doc: IDocument): Promise<void>`: ドキュメントが存在しない場合は単一のドキュメントを追加
+ - `addDocumentsIfMissing(docs: IDocument[]): Promise<void>`: 複数のドキュメントを追加し、既存のドキュメントはスキップ
+ - `removeDocument(id: number): Promise<void>`: ドキュメントを削除
  - `search(query: string, limit?: number): Promise<IResult[]>`: ドキュメントを検索
- - `getStatus(): Promise<IStatus>`:
- - `
- - `
- - `
+ - `getStatus(): Promise<IStatus>`: 検索エンジンの状態を取得する
+ - `hasDocument(id: number): Promise<boolean>`: ドキュメントIDが追加されたことがあるかを確認(削除されたものも含む)
+ - `startBatch(): void`: バッチ操作を開始する
+ - `endBatch(): Promise<void>`: バッチ操作を終了

  ## ストレージ

package/README.ko.md
CHANGED

@@ -1,20 +1,19 @@
  # gs-search

+ JavaScript/TypeScript 애플리케이션을 위한 가볍고 빠르며 메모리 효율적인 전문 검색 엔진입니다.
+
  ## 다른 언어

  - [中文 README](README.zh-CN.md)
  - [English README](README.md)
  - [日本語 README](README.ja.md)

- JavaScript/TypeScript 애플리케이션을 위한 가볍고 빠르며 메모리 효율적인 전문 검색 엔진입니다.
-
  ## 기능

  - 🔍 **전문 검색** 토큰화 지원
  - 📦 **가볍고** 외부 의존성 없음
  - ⚡ **고속** 검색 성능
  - 📱 **브라우저 & Node.js** 지원
- - 🌐 **다국어** 토큰화
  - 🗄️ **커스텀 스토리지** 지원
  - 📊 **일괄 작업** 효율적인 인덱싱

@@ -73,18 +72,53 @@ const engine = new SearchEngine({
  // 엔진 초기화
  await engine.init();

- //
- await engine.
+ // 일괄 작업으로 문서 추가
+ await engine.startBatch();
  try {
    await engine.addDocuments([
      // ... 문서
    ]);
-   await engine.commit();
  } catch (error) {
-
+   // 오류 처리
+ } finally {
+   // 오류가 발생하더라도 항상 일괄 작업을 종료하여 인덱스가 올바르게 재구축되도록 합니다
+   await engine.endBatch();
  }
  ```

+ ### 커스텀 토크나이저
+
+ 특정 언어나 토크나이징 요구사항을 지원하기 위해 커스텀 토크나이저를 설정할 수 있습니다. 토크나이저는 전체 문서 객체에 액세스할 수 있습니다:
+
+ ```typescript
+ import { SearchEngine } from 'gs-search';
+
+ // 커스텀 인덱스 토크나이저: 문서의 text와 category 필드를 사용
+ const indexingTokenizer = (doc: { id: string; text: string; category: string; author: string }): string[] => {
+   // 문서의 모든 속성에 액세스할 수 있습니다
+   const fullText = `${doc.text} ${doc.category} ${doc.author}`;
+   return fullText.toLowerCase().split(/\s+/);
+ };
+
+ // 커스텀 검색 토크나이저: 검색 컨텍스트 지원
+ const searchTokenizer = (query: { text: string; language?: string; context?: string }): string[] => {
+   // 쿼리의 언어나 컨텍스트에 따라 토크나이징을 조정할 수 있습니다
+   const tokens = query.text.toLowerCase().split(/\s+/);
+   // 컨텍스트에 따라 추가 검색어를 추가합니다
+   if (query.context === 'technical') {
+     tokens.push('technical');
+   }
+   return tokens;
+ };
+
+ // 커스텀 토크나이저를 설정하여 엔진 생성
+ const engine = new SearchEngine({
+   baseDir: 'search-data',
+   indexingTokenizer,
+   searchTokenizer
+ });
+ ```
+
  ## API 참조

  ### SimpleSearch

@@ -92,22 +126,26 @@ try {
  - `constructor()`: 새로운 검색 엔진 인스턴스 생성
  - `addDocument(doc: IDocument): Promise<void>`: 단일 문서 추가
  - `addDocuments(docs: IDocument[]): Promise<void>`: 여러 문서 추가
+ - `addDocumentIfMissing(doc: IDocument): Promise<void>`: 문서가 존재하지 않는 경우 단일 문서 추가
+ - `addDocumentsIfMissing(docs: IDocument[]): Promise<void>`: 여러 문서를 추가하고 기존 문서는 건너뜀
  - `deleteDocument(id: number): Promise<void>`: 문서 삭제
  - `search(query: string, limit?: number): Promise<IResult[]>`: 문서 검색
  - `getStatus(): Promise<IStatus>`: 검색 엔진 상태 가져오기

- ###
+ ### SearchEngine

- - `constructor(options:
+ - `constructor(options: ISearchEngineConfig)`: 새로운 코어 엔진 인스턴스 생성
  - `init(): Promise<void>`: 엔진 초기화
  - `addDocument(doc: IDocument): Promise<void>`: 단일 문서 추가
- - `addDocuments(docs: IDocument[]): Promise<void>`:
- - `
+ - `addDocuments(docs: IDocument[]): Promise<void>`: 다중 문서 추가
+ - `addDocumentIfMissing(doc: IDocument): Promise<void>`: 문서가 존재하지 않는 경우 단일 문서 추가
+ - `addDocumentsIfMissing(docs: IDocument[]): Promise<void>`: 여러 문서를 추가하고 기존 문서는 건너뜀
+ - `removeDocument(id: number): Promise<void>`: 문서 삭제
  - `search(query: string, limit?: number): Promise<IResult[]>`: 문서 검색
- - `getStatus(): Promise<IStatus>`: 검색 엔진 상태
- - `
- - `
- - `
+ - `getStatus(): Promise<IStatus>`: 검색 엔진 상태 조회
+ - `hasDocument(id: number): Promise<boolean>`: 문서 ID가 추가된 적이 있는지 확인합니다 (삭제된 문서도 포함)
+ - `startBatch(): void`: 배치 작업 시작
+ - `endBatch(): Promise<void>`: 배치 작업 종료

  ## 스토리지

package/README.md
CHANGED

@@ -1,20 +1,19 @@
  # gs-search

+ A lightweight, fast, and memory-efficient full-text search engine for JavaScript/TypeScript applications.
+
  ## Other Languages

  - [中文 README](README.zh-CN.md)
  - [日本語 README](README.ja.md)
  - [한국어 README](README.ko.md)

- A lightweight, fast, and memory-efficient full-text search engine for JavaScript/TypeScript applications.
-
  ## Features

  - 🔍 **Full-text search** with tokenization support
  - 📦 **Lightweight** with zero external dependencies
  - ⚡ **Fast** search performance
  - 📱 **Browser & Node.js** support
- - 🌐 **Multi-language** tokenization
  - 🗄️ **Custom storage** support
  - 📊 **Batch operations** for efficient indexing

@@ -79,12 +78,50 @@ try {
    await engine.addDocuments([
      // ... documents
    ]);
-   await engine.endBatch();
  } catch (error) {
    // Handle error
+ } finally {
+   // Always end batch to ensure index rebuilds properly
+   await engine.endBatch();
  }
  ```

+ ### Custom Tokenizers
+
+ You can configure custom tokenizers to support specific languages or tokenization requirements:
+
+ ```typescript
+ import { SearchEngine, BrowserStorage } from 'gs-search';
+
+ // Custom tokenizer that splits by spaces and limits token length
+ const customTokenizer = (text: string): string[] => {
+   // Split by whitespace
+   const tokens: string[] = [];
+   const words = text.toLowerCase().split(/\s+/);
+
+   // Process each word, limiting token length to 5 characters
+   for (const word of words) {
+     if (word.length <= 5) {
+       tokens.push(word);
+     } else {
+       // Split long words character by character
+       for (let i = 0; i < word.length; i++) {
+         tokens.push(word[i]);
+       }
+     }
+   }
+
+   return tokens;
+ };
+
+ // Create engine with custom tokenizers
+ const engine = new SearchEngine({
+   baseDir: 'search-data',
+   indexingTokenizer: customTokenizer,
+   searchTokenizer: customTokenizer
+ });
+ ```
+
  ## API Reference

  ### SimpleSearch

@@ -93,18 +130,23 @@ try {
  - `configure(config: Partial<ISearchEngineConfig>): void`: Configure the search engine
  - `addDocument(doc: IDocument): Promise<void>`: Add a single document
  - `addDocuments(docs: IDocument[]): Promise<void>`: Add multiple documents
+ - `addDocumentIfMissing(doc: IDocument): Promise<void>`: Add a single document if it doesn't exist
+ - `addDocumentsIfMissing(docs: IDocument[]): Promise<void>`: Add multiple documents, skipping existing ones
  - `removeDocument(id: number): Promise<void>`: Delete a document
  - `search(query: string, limit?: number): Promise<IResult[]>`: Search for documents
  - `getStatus(): Promise<IStatus>`: Get search engine status
+ - `hasDocument(id: number): Promise<boolean>`: Checks if a document ID has been added (including deleted ones)
  - `startBatch(): void`: Start batch operations
  - `endBatch(): Promise<void>`: End batch operations

- ###
+ ### SearchEngine

- - `constructor(options:
+ - `constructor(options: ISearchEngineConfig)`: Create a new core engine instance
  - `init(): Promise<void>`: Initialize the engine
  - `addDocument(doc: IDocument): Promise<void>`: Add a single document
  - `addDocuments(docs: IDocument[]): Promise<void>`: Add multiple documents
+ - `addDocumentIfMissing(doc: IDocument): Promise<void>`: Add a single document if it doesn't exist
+ - `addDocumentsIfMissing(docs: IDocument[]): Promise<void>`: Add multiple documents, skipping existing ones
  - `removeDocument(id: number): Promise<void>`: Delete a document
  - `search(query: string, limit?: number): Promise<IResult[]>`: Search for documents
  - `getStatus(): Promise<IStatus>`: Get search engine status
package/README.zh-CN.md
CHANGED

@@ -1,20 +1,19 @@
  # gs-search

+ 可以在现代浏览器运行,且会自动存储索引的极小纯前端搜索库,可以配合其它强大的分词库使用效果更好
+
  ## 其他语言

  - [English README](README.md)
  - [日本語 README](README.ja.md)
  - [한국어 README](README.ko.md)

- 可以在现代浏览器运行,且会自动存储索引的极小纯前端搜索库,可以配合其它强大的分词库使用效果更好
-
  ## 特性

  - 🔍 **全文搜索** 支持多语言分词
  - 📦 **轻量级** 无任何第三方依赖,体积小
  - ⚡ **高性能** 快速搜索与索引构建
  - 📱 **浏览器兼容** 支持现代浏览器
- - 🌐 **多语言支持** 适配不同语言分词需求
  - 🗄️ **自定义存储** 支持灵活的存储实现
  - 📊 **批处理操作** 批量添加文档更高效

@@ -79,9 +78,11 @@ try {
    await engine.addDocuments([
      // ... 文档
    ]);
-   await engine.endBatch();
  } catch (error) {
    // 处理错误
+ } finally {
+   // 始终结束批处理以确保索引正确重建
+   await engine.endBatch();
  }
  ```

@@ -112,30 +113,81 @@ const storage = new CustomStorage();
  const engine = new SearchEngine({ storage });
  ```

- ###
+ ### 批处理操作

-
+ 使用批处理操作进行高效的文档索引:

  ```typescript
-
+ // 开始批处理操作
+ await engine.startBatch();

  try {
    // 批量添加文档
    for (let i = 0; i < 1000; i++) {
      await engine.addDocuments([{ id: i, text: `文档 ${i}` }]);
    }
-
-   // 提交事务
-   await engine.commitTransaction();
  } catch (error) {
-   //
-
+   // 处理错误
+   console.error('批处理操作失败:', error);
+ } finally {
+   // 无论是否发生错误,都必须结束批处理以确保索引正常重建
+   await engine.endBatch();
  }
  ```

  ## 自定义分词器

-
+ ### 支持完整文档对象的分词器
+
+ 您可以通过配置自定义分词器来支持特定的语言或分词需求。分词器可以访问完整的文档对象,让您能够基于文档的多个属性进行分词:
+
+ ```typescript
+ import { SearchEngine } from 'gs-search';
+
+ // 自定义索引分词器:使用文档的text和category字段进行分词
+ const indexingTokenizer = (doc: { id: string; text: string; category: string; author: string }): string[] => {
+   // 可以访问文档的所有属性
+   const fullText = `${doc.text} ${doc.category} ${doc.author}`;
+   return fullText.toLowerCase().split(/\s+/);
+ };
+
+ // 自定义搜索分词器:支持搜索上下文
+ const searchTokenizer = (query: { text: string; language?: string; context?: string }): string[] => {
+   // 可以根据查询的语言或上下文调整分词
+   const tokens = query.text.toLowerCase().split(/\s+/);
+   // 根据上下文添加额外的搜索词
+   if (query.context === 'technical') {
+     tokens.push('technical');
+   }
+   return tokens;
+ };
+
+ // 创建引擎并配置自定义分词器
+ const engine = new SearchEngine({
+   baseDir: 'search-data',
+   indexingTokenizer,
+   searchTokenizer
+ });
+
+ // 索引包含额外属性的文档
+ await engine.addDocument({
+   id: '1',
+   text: '这是一个技术文档',
+   category: '技术',
+   author: '张三'
+ });
+
+ // 使用包含上下文的查询进行搜索
+ const results = await engine.search({
+   text: '技术',
+   language: 'zh',
+   context: 'technical'
+ });
+ ```
+
+ ### 简单的字符/空格分词器
+
+ 以下是一个简单的正则分词器示例,按空格和字符分词,且最长token不超过5字符:

  ```typescript
  import { SimpleSearch } from 'gs-search';

@@ -172,22 +224,27 @@ SimpleSearch.configure({

  ### SimpleSearch

-
+ **静态方法(无需实例创建):**
  - `configure(config: Partial<ISearchEngineConfig>): void`: 配置搜索引擎
  - `addDocument(doc: IDocument): Promise<void>`: 添加单个文档
- - `addDocuments(docs: IDocument[]): Promise<void>`:
+ - `addDocuments(docs: IDocument[]): Promise<void>`: 添加多个文档
+ - `addDocumentIfMissing(doc: IDocument): Promise<void>`: 如果文档不存在则添加单个文档
+ - `addDocumentsIfMissing(docs: IDocument[]): Promise<void>`: 添加多个文档,跳过已存在的文档
  - `removeDocument(id: number): Promise<void>`: 删除文档
  - `search(query: string, limit?: number): Promise<IResult[]>`: 搜索文档
  - `getStatus(): Promise<IStatus>`: 获取搜索引擎状态
- - `
+ - `hasDocument(id: number): Promise<boolean>`: 检查文档ID是否曾经添加过(包括已删除的)
+ - `startBatch(): void`: 开始批量操作
  - `endBatch(): Promise<void>`: 结束批处理操作

- ###
+ ### SearchEngine

- - `constructor(options:
+ - `constructor(options: ISearchEngineConfig)`: 创建一个新的核心引擎实例
  - `init(): Promise<void>`: 初始化引擎
  - `addDocument(doc: IDocument): Promise<void>`: 添加单个文档
- - `addDocuments(docs: IDocument[]): Promise<void>`:
+ - `addDocuments(docs: IDocument[]): Promise<void>`: 添加多个文档
+ - `addDocumentIfMissing(doc: IDocument): Promise<void>`: 如果文档不存在则添加单个文档
+ - `addDocumentsIfMissing(docs: IDocument[]): Promise<void>`: 添加多个文档,跳过已存在的文档
  - `removeDocument(id: number): Promise<void>`: 删除文档
  - `search(query: string, limit?: number): Promise<IResult[]>`: 搜索文档
  - `getStatus(): Promise<IStatus>`: 获取搜索引擎状态
package/lib/index.cjs
CHANGED

@@ -1 +1 @@
-
"use strict";Object.create,Object.defineProperty,Object.getOwnPropertyDescriptor,Object.getOwnPropertyNames,Object.getPrototypeOf,Object.prototype.hasOwnProperty;class t{#t;constructor(t){this.#t=t}async#e(){return await(await navigator.storage.getDirectory()).getDirectoryHandle(this.#t,{create:!0})}async write(t,e){const s=await(await(await this.#e()).getFileHandle(t,{create:!0})).createWritable();await s.write(e),await s.close()}async append(t,e){const s=await this.#e();let i;try{i=await s.getFileHandle(t,{create:!0})}catch{i=await s.getFileHandle(t,{create:!0})}const n=await i.getFile(),a=await i.createWritable({keepExistingData:!0});await a.seek(n.size),await a.write(e),await a.close()}async read(t){const e=await this.#e();try{return await(await(await e.getFileHandle(t)).getFile()).arrayBuffer()}catch{return null}}async readRange(t,e,s){const i=await this.#e();try{return await(await(await i.getFileHandle(t)).getFile()).slice(e,s).arrayBuffer()}catch{return null}}async remove(t){const e=await this.#e();try{await e.removeEntry(t)}catch{}}async listFiles(){const t=await this.#e(),e=[];for await(const s of t.keys())e.push(s);return e}async clearAll(){const t=await this.#e();for await(const e of t.keys())await t.removeEntry(e,{recursive:!0})}async getFileSize(t){const e=await this.#e();try{return(await(await e.getFileHandle(t)).getFile()).size}catch{return 0}}}class e{#s=null;#i=null;#t;#n="";constructor(t){this.#t=t}async#a(){if(this.#s)return;const t=await import("node:fs"),e=await import("node:path");this.#s=t.promises,this.#i=e.default||e,this.#n=this.#i.join(process.cwd(),this.#t);try{await this.#s.access(this.#n)}catch{await this.#s.mkdir(this.#n,{recursive:!0})}}#r(t){return this.#i.join(this.#n,t)}async write(t,e){await this.#a(),await this.#s.writeFile(this.#r(t),Buffer.from(e))}async append(t,e){await this.#a(),await this.#s.appendFile(this.#r(t),Buffer.from(e))}async read(t){await this.#a();try{const e=await this.#s.readFile(this.#r(t));return e.buffer.slice(e.byteOffset,e.byteOffset+e.byteLength)}catch{return null}}async readRange(t,e,s){await this.#a();try{const i=await this.#s.open(this.#r(t),"r"),n=s-e,a=Buffer.alloc(n);return await i.read(a,0,n,e),await i.close(),a.buffer.slice(a.byteOffset,a.byteOffset+a.byteLength)}catch{return null}}async remove(t){await this.#a();try{await this.#s.unlink(this.#r(t))}catch{}}async listFiles(){await this.#a();try{return await this.#s.readdir(this.#n)}catch{return[]}}async clearAll(){await this.#a();try{const t=await this.#s.readdir(this.#n);for(const e of t)await this.#s.unlink(this.#i.join(this.#n,e))}catch{}}async getFileSize(t){await this.#a();try{return(await this.#s.stat(this.#r(t))).size}catch{return 0}}}const s="search_meta.json",i="deleted_ids.bin",n="added_ids.bin";class a{#o;#h={wordSegments:[],charSegments:[]};#c=new Set;#d=new Set;constructor(t){this.#o=t}async load(){const t=await this.#o.read(s);if(t){const e=(new TextDecoder).decode(t);this.#h=JSON.parse(e)}else this.#h={wordSegments:[],charSegments:[]};const e=await this.#o.read(i);if(e){const t=new DataView(e);let s=0;const i=e.byteLength;for(;s<i&&!(s+4>i);){const e=t.getUint32(s,!0);this.#c.add(e),s+=4,s<i&&30===t.getUint8(s)&&(s+=1)}}const a=await this.#o.read(n);if(a){const t=new DataView(a);let e=0;const s=a.byteLength;for(;e<s&&!(e+4>s);){const i=t.getUint32(e,!0);this.#d.add(i),e+=4,e<s&&30===t.getUint8(e)&&(e+=1)}}}async save(){const t=JSON.stringify(this.#h);if(await this.#o.write(s,(new TextEncoder).encode(t).buffer),0===this.#c.size)await 
this.#o.remove(i);else{const t=4*this.#c.size+this.#c.size,e=new ArrayBuffer(t),s=new DataView(e);let n=0;for(const t of this.#c)s.setUint32(n,t,!0),n+=4,s.setUint8(n,30),n+=1;await this.#o.write(i,e)}if(0===this.#d.size)await this.#o.remove(n);else{const t=4*this.#d.size+this.#d.size,e=new ArrayBuffer(t),s=new DataView(e);let i=0;for(const t of this.#d)s.setUint32(i,t,!0),i+=4,s.setUint8(i,30),i+=1;await this.#o.write(n,e)}}getSegments(t){return"word"===t?this.#h.wordSegments:this.#h.charSegments}getDeletedIds(){return this.#c}addDeletedId(t){this.#c.add(t)}isDeleted(t){return this.#c.has(t)}addAddedId(t){this.#d.add(t)}removeAddedId(t){this.#d.delete(t)}isAdded(t){return this.#d.has(t)}getAddedIds(){return this.#d}getLastSegmentInfo(t){const e=this.getSegments(t);return 0===e.length?null:e[e.length-1]}updateSegment(t,e,s,i,n,a){const r="word"===t?this.#h.wordSegments:this.#h.charSegments;if(a)r.push({filename:e,start:s,end:i,tokenCount:n});else{const t=r[r.length-1];t&&t.filename===e&&(t.end=i,t.tokenCount=n)}}reset(){this.#h={wordSegments:[],charSegments:[]},this.#c.clear(),this.#d.clear()}}class r{static SEPARATOR=30;#o;constructor(t){this.#o=t}async appendBatch(t,e){if(0===e.length)return await this.#o.getFileSize(t);const s=new TextEncoder;let i=0;for(const t of e){i+=8;for(const e of t.tokens){i+=2+Math.min(s.encode(e).byteLength,65535)}i+=1}const n=new Uint8Array(i);let a=0;for(const t of e){const e=[];for(const i of t.tokens){const t=s.encode(i),n=t.byteLength>65535?t.slice(0,65535):t;e.push(n)}const i=new DataView(n.buffer,a);i.setUint32(0,t.id,!0),i.setUint32(4,e.length,!0),a+=8;for(const t of e)new DataView(n.buffer,a).setUint16(0,t.byteLength,!0),a+=2,n.set(t,a),a+=t.byteLength;n[a++]=r.SEPARATOR}return await this.#o.append(t,n.buffer),await this.#o.getFileSize(t)}async readRange(t,e,s){const i=await this.#o.readRange(t,e,s);if(!i||0===i.byteLength)return[];const n=new DataView(i),a=new Uint8Array(i),o=new TextDecoder,h=[];let c=0;const d=i.byteLength;for(;c<d&&!(c+8>d);){const t=n.getUint32(c,!0);c+=4;const e=n.getUint32(c,!0);c+=4;const s=[];for(let t=0;t<e&&!(c+2>d);t++){const t=n.getUint16(c,!0);if(c+=2,c+t>d)break;const e=new Uint8Array(i,c,t);s.push(o.decode(e)),c+=t}c<d&&a[c]===r.SEPARATOR&&(c+=1),h.push({id:t,tokens:s})}return h}async getCurrentSize(t){return await this.#o.getFileSize(t)}}class o{#g;#o;#l=null;#f=null;static hash(t){let e=5381;for(let s=0;s<t.length;s++)e=(e<<5)+e^t.charCodeAt(s);return e>>>0}constructor(t,e){this.#g=t,this.#o=e}async loadIndex(){return!!this.#l||(this.#l=await this.#o.read(this.#g),!!this.#l&&(this.#f=new DataView(this.#l),!0))}async buildAndSave(t){const e=new Map;for(const s of t){const t=new Map;for(const i of s.tokens)if(!t.has(i)){t.set(i,!0);const n=o.hash(i);e.has(n)||e.set(n,[]),e.get(n).push(s.id)}}const s=Array.from(e.keys()).sort((t,e)=>t-e);let i=0;const n=new Array(s.length);for(let t=0;t<s.length;t++){const a=s[t],r=e.get(a);n[t]=r,i+=r.length}const a=12*s.length,r=new ArrayBuffer(8+a+4*i),h=new DataView(r);h.setUint32(0,1229866072),h.setUint32(4,s.length);let c=8,d=8+a;for(let t=0;t<s.length;t++){const e=s[t],i=n[t];h.setUint32(c,e),h.setUint32(c+4,d),h.setUint32(c+8,i.length),c+=12;for(let t=0;t<i.length;t++)h.setUint32(d,i[t],!0),d+=4}await this.#o.write(this.#g,r),this.#l=r,this.#f=h}search(t){if(!this.#f||!this.#l)return[];const e=o.hash(t);let s=0,i=this.#f.getUint32(4)-1;for(;s<=i;){const t=s+i>>>1,n=8+12*t,a=this.#f.getUint32(n);if(a<e)s=t+1;else{if(!(a>e)){const 
t=this.#f.getUint32(n+4),e=this.#f.getUint32(n+8),s=[];for(let i=0;i<e;i++)s.push(this.#f.getUint32(t+4*i,!0));return s}i=t-1}}return[]}}const h="word_cache.bin",c="char_cache.bin";class d{#o;#h;#w;#u;#m=!1;#y;#p=!1;#S={word:0,char:0};constructor(s){if(!s.baseDir)throw new Error("SearchEngine requires 'baseDir' in config.");if(this.#y={wordSegmentTokenThreshold:1e5,charSegmentTokenThreshold:5e5,minWordTokenSave:0,minCharTokenSave:0,...s},(this.#y.minWordTokenSave||0)>=(this.#y.wordSegmentTokenThreshold||1e5))throw new Error("minWordTokenSave must be less than wordSegmentTokenThreshold");if((this.#y.minCharTokenSave||0)>=(this.#y.charSegmentTokenThreshold||5e5))throw new Error("minCharTokenSave must be less than charSegmentTokenThreshold");let i=null;if(this.#y.storage&&("object"==typeof this.#y.storage?i=this.#y.storage:"browser"===this.#y.storage?i=new t(this.#y.baseDir):"node"===this.#y.storage&&(i=new e(this.#y.baseDir))),!i){const s=typeof navigator<"u"&&navigator?.storage?.getDirectory instanceof Function,n=typeof process<"u"&&null!=process.versions&&null!=process.versions.node;s?i=new t(this.#y.baseDir):n&&(i=new e(this.#y.baseDir))}if(!i)throw new Error('Storage initialization failed. Please configure "storage" explicitly or ensure you are in a supported environment (Browser/Node).');this.#o=i,this.#h=new a(this.#o),this.#w=new r(this.#o),this.#u=new Map}async init(){if(this.#m)return;await this.#h.load();const t=[...this.#h.getSegments("word"),...this.#h.getSegments("char")];for(const e of t)this.#u.has(e.filename)||this.#u.set(e.filename,new o(e.filename,this.#o)),await this.#u.get(e.filename).loadIndex();this.#m=!0}startBatch(){this.#p=!0,this.#S={word:0,char:0}}async endBatch(){this.#p=!1,this.#S.word>0&&await this.#D("word",this.#S.word),this.#S.char>0&&await this.#D("char",this.#S.char),this.#S={word:0,char:0},await this.#h.save()}#k(t){if(typeof Intl<"u"&&Intl.Segmenter){const e=new Intl.Segmenter([],{granularity:"word"});return Array.from(e.segment(t)).filter(t=>t.isWordLike).map(t=>t.segment.toLowerCase())}return t.toLowerCase().split(/[^a-z0-9\u4e00-\u9fa5]+/g).filter(t=>t.length>0)}#b(t){return this.#y.indexingTokenizer?this.#y.indexingTokenizer(t):this.#k(t)}#T(t){return this.#y.searchTokenizer?this.#y.searchTokenizer(t):this.#y.indexingTokenizer?this.#y.indexingTokenizer(t):this.#k(t)}async addDocument(t){return this.addDocuments([t])}async addDocuments(t){if(this.#m||await this.init(),0===t.length)return;const e=this.#h.getDeletedIds(),s=[],i=[];for(const n of t){if(e.has(n.id))throw new Error(`Document ID ${n.id} has been deleted and cannot be re-added.`);if(this.#h.isAdded(n.id))throw new Error(`Document ID ${n.id} already exists.`);const t=this.#b(n.text),a=[],r=[];for(const e of t)e.length>1?a.push(e):1===e.length&&r.push(e);a.length>0&&s.push({id:n.id,tokens:a}),r.length>0&&i.push({id:n.id,tokens:r})}let n=0,a=0;if(s.length>0){await this.#w.appendBatch(h,s);for(const t of s)n+=t.tokens.length}if(i.length>0){await this.#w.appendBatch(c,i);for(const t of i)a+=t.tokens.length}for(const e of t)this.#h.addAddedId(e.id);this.#p?(this.#S.word+=n,this.#S.char+=a):(n>0&&await this.#D("word",n),a>0&&await this.#D("char",a),await this.#h.save())}async#D(t,e){const s="word"===t?h:c,i=await this.#w.getCurrentSize(s),n="word"===t?this.#y.wordSegmentTokenThreshold||1e5:this.#y.charSegmentTokenThreshold||5e5,a="word"===t?this.#y.minWordTokenSave||0:this.#y.minCharTokenSave||0,r=this.#h.getLastSegmentInfo(t);let d,g,l,f;const w=()=>{const 
e=this.#h.getSegments(t).length+1;return`${t}_seg_${e}.bin`};if(r){const t=r.tokenCount;t>=n||t+e>=n?(d=w(),l=!0,g=r.end,f=e):(d=r.filename,l=!1,g=r.start,f=t+e)}else d=w(),l=!0,g=0,f=e;if(f<a)return void this.#h.updateSegment(t,d,g,i,f,l);const u=await this.#w.readRange(s,g,i);let m=this.#u.get(d);m||(m=new o(d,this.#o),this.#u.set(d,m)),await m.buildAndSave(u),this.#h.updateSegment(t,d,g,i,f,l)}async search(t,e){this.#m||await this.init();const s=this.#T(t),i=s.filter(t=>t.length>1),n=s.filter(t=>1===t.length),a=this.#h.getDeletedIds(),r=new Map,h=new Map,c=t=>{const e=this.#h.getSegments(t);for(const t of e){const e=t.filename;!this.#u.has(e)&&!h.has(e)&&h.set(e,new o(e,this.#o))}};c("word"),c("char"),await Promise.all(Array.from(h.entries()).map(([t,e])=>e.loadIndex().then(s=>{s&&this.#u.set(t,e)})));const d=async(t,e)=>{if(0===e.length)return;const s=this.#h.getSegments(t);for(const t of s){const s=t.filename,i=this.#u.get(s);if(i)for(const t of e){const e=i.search(t),s=1+.1*t.length;for(const i of e)if(!a.has(i))if(r.has(i)){const e=r.get(i);e.score+=s,e.tokens.add(t)}else r.set(i,{score:0,tokens:new Set([t])})}}};await d("word",i),await d("char",n);const g=[];return r.forEach((t,e)=>{g.push({id:e,score:t.score,tokens:Array.from(t.tokens)})}),g.sort((t,e)=>e.score-t.score),"number"==typeof e&&e>0?g.slice(0,e):g}async removeDocument(t){this.#m||await this.init(),this.#h.addDeletedId(t),this.#h.removeAddedId(t),await this.#h.save()}async clearAll(){await this.#o.clearAll(),this.#u.clear(),this.#h.reset(),this.#m=!1,this.#p=!1,this.#S={word:0,char:0}}async getStatus(){return this.#m||await this.init(),{wordSegments:this.#h.getSegments("word").length,charSegments:this.#h.getSegments("char").length,deleted:this.#h.getDeletedIds().size,wordCacheSize:await this.#w.getCurrentSize(h),charCacheSize:await this.#w.getCurrentSize(c),inBatch:this.#p}}}exports.BrowserStorage=t,exports.NodeStorage=e,exports.SearchEngine=d,exports.SimpleSearch=class{static#I=null;static#v={baseDir:"simple_search_data",wordSegmentTokenThreshold:1e5,minWordTokenSave:0};static configure(t){const e={...this.#v,...t};this.#I=new d(e)}static#z(){return this.#I||(this.#I=new d(this.#v)),this.#I}static async startBatch(){this.#z().startBatch()}static async endBatch(){return this.#z().endBatch()}static async addDocument(t){return this.#z().addDocument(t)}static async addDocuments(t){return this.#z().addDocuments(t)}static async search(t,e){return this.#z().search(t,e)}static async removeDocument(t){return this.#z().removeDocument(t)}static async clearAll(){return this.#z().clearAll()}static async getStatus(){return this.#z().getStatus()}};
+
"use strict";Object.create,Object.defineProperty,Object.getOwnPropertyDescriptor,Object.getOwnPropertyNames,Object.getPrototypeOf,Object.prototype.hasOwnProperty;class t{#t;constructor(t){this.#t=t}async#e(){return await(await navigator.storage.getDirectory()).getDirectoryHandle(this.#t,{create:!0})}async write(t,e){const s=await(await(await this.#e()).getFileHandle(t,{create:!0})).createWritable();await s.write(e),await s.close()}async append(t,e){const s=await this.#e();let i;try{i=await s.getFileHandle(t,{create:!0})}catch{i=await s.getFileHandle(t,{create:!0})}const n=await i.getFile(),a=await i.createWritable({keepExistingData:!0});await a.seek(n.size),await a.write(e),await a.close()}async read(t){const e=await this.#e();try{return await(await(await e.getFileHandle(t)).getFile()).arrayBuffer()}catch{return null}}async readRange(t,e,s){const i=await this.#e();try{return await(await(await i.getFileHandle(t)).getFile()).slice(e,s).arrayBuffer()}catch{return null}}async remove(t){const e=await this.#e();try{await e.removeEntry(t)}catch{}}async listFiles(){const t=await this.#e(),e=[];for await(const s of t.keys())e.push(s);return e}async clearAll(){const t=await this.#e();for await(const e of t.keys())await t.removeEntry(e,{recursive:!0})}async getFileSize(t){const e=await this.#e();try{return(await(await e.getFileHandle(t)).getFile()).size}catch{return 0}}}class e{#s=null;#i=null;#t;#n="";constructor(t){this.#t=t}async#a(){if(this.#s)return;const t=await import("node:fs"),e=await import("node:path");this.#s=t.promises,this.#i=e.default||e,this.#n=this.#i.join(process.cwd(),this.#t);try{await this.#s.access(this.#n)}catch{await this.#s.mkdir(this.#n,{recursive:!0})}}#r(t){return this.#i.join(this.#n,t)}async write(t,e){await this.#a(),await this.#s.writeFile(this.#r(t),Buffer.from(e))}async append(t,e){await this.#a(),await this.#s.appendFile(this.#r(t),Buffer.from(e))}async read(t){await this.#a();try{const e=await this.#s.readFile(this.#r(t));return e.buffer.slice(e.byteOffset,e.byteOffset+e.byteLength)}catch{return null}}async readRange(t,e,s){await this.#a();try{const i=await this.#s.open(this.#r(t),"r"),n=s-e,a=Buffer.alloc(n);return await i.read(a,0,n,e),await i.close(),a.buffer.slice(a.byteOffset,a.byteOffset+a.byteLength)}catch{return null}}async remove(t){await this.#a();try{await this.#s.unlink(this.#r(t))}catch{}}async listFiles(){await this.#a();try{return await this.#s.readdir(this.#n)}catch{return[]}}async clearAll(){await this.#a();try{const t=await this.#s.readdir(this.#n);for(const e of t)await this.#s.unlink(this.#i.join(this.#n,e))}catch{}}async getFileSize(t){await this.#a();try{return(await this.#s.stat(this.#r(t))).size}catch{return 0}}}const s="search_meta.json",i="deleted_ids.bin",n="added_ids.bin";class a{#o;#h={wordSegments:[],charSegments:[]};#c=new Set;#d=new Set;constructor(t){this.#o=t}async load(){const t=await this.#o.read(s);if(t){const e=(new TextDecoder).decode(t);this.#h=JSON.parse(e)}else this.#h={wordSegments:[],charSegments:[]};const e=await this.#o.read(i);if(e){const t=new DataView(e);let s=0;const i=e.byteLength;for(;s<i&&!(s+4>i);){const e=t.getUint32(s,!0);this.#c.add(e),s+=4,s<i&&30===t.getUint8(s)&&(s+=1)}}const a=await this.#o.read(n);if(a){const t=new DataView(a);let e=0;const s=a.byteLength;for(;e<s&&!(e+4>s);){const i=t.getUint32(e,!0);this.#d.add(i),e+=4,e<s&&30===t.getUint8(e)&&(e+=1)}}}async save(){const t=JSON.stringify(this.#h);if(await this.#o.write(s,(new TextEncoder).encode(t).buffer),0===this.#c.size)await 
this.#o.remove(i);else{const t=4*this.#c.size+this.#c.size,e=new ArrayBuffer(t),s=new DataView(e);let n=0;for(const t of this.#c)s.setUint32(n,t,!0),n+=4,s.setUint8(n,30),n+=1;await this.#o.write(i,e)}if(0===this.#d.size)await this.#o.remove(n);else{const t=4*this.#d.size+this.#d.size,e=new ArrayBuffer(t),s=new DataView(e);let i=0;for(const t of this.#d)s.setUint32(i,t,!0),i+=4,s.setUint8(i,30),i+=1;await this.#o.write(n,e)}}getSegments(t){return"word"===t?this.#h.wordSegments:this.#h.charSegments}getDeletedIds(){return this.#c}addDeletedId(t){this.#c.add(t)}isDeleted(t){return this.#c.has(t)}addAddedId(t){this.#d.add(t)}removeAddedId(t){this.#d.delete(t)}isAdded(t){return this.#d.has(t)}getAddedIds(){return this.#d}hasDocument(t){return this.#d.has(t)||this.#c.has(t)}getLastSegmentInfo(t){const e=this.getSegments(t);return 0===e.length?null:e[e.length-1]}updateSegment(t,e,s,i,n,a){const r="word"===t?this.#h.wordSegments:this.#h.charSegments;if(a)r.push({filename:e,start:s,end:i,tokenCount:n});else{const t=r[r.length-1];t&&t.filename===e&&(t.end=i,t.tokenCount=n)}}reset(){this.#h={wordSegments:[],charSegments:[]},this.#c.clear(),this.#d.clear()}}class r{static SEPARATOR=30;#o;constructor(t){this.#o=t}async appendBatch(t,e){if(0===e.length)return await this.#o.getFileSize(t);const s=new TextEncoder;let i=0;for(const t of e){i+=8;for(const e of t.tokens){i+=2+Math.min(s.encode(e).byteLength,65535)}i+=1}const n=new Uint8Array(i);let a=0;for(const t of e){const e=[];for(const i of t.tokens){const t=s.encode(i),n=t.byteLength>65535?t.slice(0,65535):t;e.push(n)}const i=new DataView(n.buffer,a);i.setUint32(0,t.id,!0),i.setUint32(4,e.length,!0),a+=8;for(const t of e)new DataView(n.buffer,a).setUint16(0,t.byteLength,!0),a+=2,n.set(t,a),a+=t.byteLength;n[a++]=r.SEPARATOR}return await this.#o.append(t,n.buffer),await this.#o.getFileSize(t)}async readRange(t,e,s){const i=await this.#o.readRange(t,e,s);if(!i||0===i.byteLength)return[];const n=new DataView(i),a=new Uint8Array(i),o=new TextDecoder,h=[];let c=0;const d=i.byteLength;for(;c<d&&!(c+8>d);){const t=n.getUint32(c,!0);c+=4;const e=n.getUint32(c,!0);c+=4;const s=[];for(let t=0;t<e&&!(c+2>d);t++){const t=n.getUint16(c,!0);if(c+=2,c+t>d)break;const e=new Uint8Array(i,c,t);s.push(o.decode(e)),c+=t}c<d&&a[c]===r.SEPARATOR&&(c+=1),h.push({id:t,tokens:s})}return h}async getCurrentSize(t){return await this.#o.getFileSize(t)}}class o{#g;#o;#l=null;#f=null;static hash(t){let e=5381;for(let s=0;s<t.length;s++)e=(e<<5)+e^t.charCodeAt(s);return e>>>0}constructor(t,e){this.#g=t,this.#o=e}async loadIndex(){return!!this.#l||(this.#l=await this.#o.read(this.#g),!!this.#l&&(this.#f=new DataView(this.#l),!0))}async buildAndSave(t){const e=new Map;for(const s of t){const t=new Map;for(const i of s.tokens)if(!t.has(i)){t.set(i,!0);const n=o.hash(i);e.has(n)||e.set(n,[]),e.get(n).push(s.id)}}const s=Array.from(e.keys()).sort((t,e)=>t-e);let i=0;const n=new Array(s.length);for(let t=0;t<s.length;t++){const a=s[t],r=e.get(a);n[t]=r,i+=r.length}const a=12*s.length,r=new ArrayBuffer(8+a+4*i),h=new DataView(r);h.setUint32(0,1229866072),h.setUint32(4,s.length);let c=8,d=8+a;for(let t=0;t<s.length;t++){const e=s[t],i=n[t];h.setUint32(c,e),h.setUint32(c+4,d),h.setUint32(c+8,i.length),c+=12;for(let t=0;t<i.length;t++)h.setUint32(d,i[t],!0),d+=4}await this.#o.write(this.#g,r),this.#l=r,this.#f=h}search(t){if(!this.#f||!this.#l)return[];const e=o.hash(t);let s=0,i=this.#f.getUint32(4)-1;for(;s<=i;){const 
t=s+i>>>1,n=8+12*t,a=this.#f.getUint32(n);if(a<e)s=t+1;else{if(!(a>e)){const t=this.#f.getUint32(n+4),e=this.#f.getUint32(n+8),s=[];for(let i=0;i<e;i++)s.push(this.#f.getUint32(t+4*i,!0));return s}i=t-1}}return[]}}const h="word_cache.bin",c="char_cache.bin";class d{#o;#h;#w;#u;#m=!1;#y;#p=!1;#D={word:0,char:0};constructor(s){if(!s.baseDir)throw new Error("SearchEngine requires 'baseDir' in config.");if(this.#y={wordSegmentTokenThreshold:1e5,charSegmentTokenThreshold:5e5,minWordTokenSave:0,minCharTokenSave:0,...s},(this.#y.minWordTokenSave||0)>=(this.#y.wordSegmentTokenThreshold||1e5))throw new Error("minWordTokenSave must be less than wordSegmentTokenThreshold");if((this.#y.minCharTokenSave||0)>=(this.#y.charSegmentTokenThreshold||5e5))throw new Error("minCharTokenSave must be less than charSegmentTokenThreshold");let i=null;if(this.#y.storage&&("object"==typeof this.#y.storage?i=this.#y.storage:"browser"===this.#y.storage?i=new t(this.#y.baseDir):"node"===this.#y.storage&&(i=new e(this.#y.baseDir))),!i){const s=typeof navigator<"u"&&navigator?.storage?.getDirectory instanceof Function,n=typeof process<"u"&&null!=process.versions&&null!=process.versions.node;s?i=new t(this.#y.baseDir):n&&(i=new e(this.#y.baseDir))}if(!i)throw new Error('Storage initialization failed. Please configure "storage" explicitly or ensure you are in a supported environment (Browser/Node).');this.#o=i,this.#h=new a(this.#o),this.#w=new r(this.#o),this.#u=new Map}async init(){if(this.#m)return;await this.#h.load();const t=[...this.#h.getSegments("word"),...this.#h.getSegments("char")];for(const e of t)this.#u.has(e.filename)||this.#u.set(e.filename,new o(e.filename,this.#o)),await this.#u.get(e.filename).loadIndex();this.#m=!0}startBatch(){this.#p=!0,this.#D={word:0,char:0}}async endBatch(){this.#p=!1,this.#D.word>0&&await this.#S("word",this.#D.word),this.#D.char>0&&await this.#S("char",this.#D.char),this.#D={word:0,char:0},await this.#h.save()}#k(t){try{if(typeof Intl<"u"&&"function"==typeof Intl.Segmenter&&"function"==typeof Array.from){const e=new Intl.Segmenter([],{granularity:"word"}).segment(t);if("object"==typeof e&&null!==e)return Array.from(e).filter(t=>t?.isWordLike).map(t=>t?.segment?.toLowerCase()||"")}}catch{}return t.toLowerCase().split(/[^a-z0-9\u4e00-\u9fa5]+/g).filter(t=>t.length>0)}#I(t){return this.#y.indexingTokenizer?this.#y.indexingTokenizer(t):this.#k(t.text)}#b(t){return this.#y.searchTokenizer?this.#y.searchTokenizer(t):this.#I(t)}async addDocument(t){return this.addDocuments([t])}async addDocumentIfMissing(t){return this.addDocumentsIfMissing([t])}async addDocumentsIfMissing(t){if(this.#m||await this.init(),0===t.length)return;const e=this.#h.getDeletedIds(),s=[],i=[],n=[];for(const a of t){if(e.has(a.id)||this.#h.isAdded(a.id))continue;const t=this.#I(a),r=[],o=[];for(const e of t)e.length>1?r.push(e):1===e.length&&o.push(e);r.length>0&&s.push({id:a.id,tokens:r}),o.length>0&&i.push({id:a.id,tokens:o}),n.push(a)}if(0===n.length)return;let a=0,r=0;if(s.length>0){await this.#w.appendBatch(h,s);for(const t of s)a+=t.tokens.length}if(i.length>0){await this.#w.appendBatch(c,i);for(const t of i)r+=t.tokens.length}for(const t of n)this.#h.addAddedId(t.id);this.#p?(this.#D.word+=a,this.#D.char+=r):(a>0&&await this.#S("word",a),r>0&&await this.#S("char",r),await this.#h.save())}async addDocuments(t){if(this.#m||await this.init(),0===t.length)return;const e=this.#h.getDeletedIds(),s=[],i=[];for(const n of t){if(e.has(n.id))throw new Error(`Document ID ${n.id} has been deleted and cannot be 
re-added.`);if(this.#h.isAdded(n.id))throw new Error(`Document ID ${n.id} already exists.`);const t=this.#I(n),a=[],r=[];for(const e of t)e.length>1?a.push(e):1===e.length&&r.push(e);a.length>0&&s.push({id:n.id,tokens:a}),r.length>0&&i.push({id:n.id,tokens:r})}let n=0,a=0;if(s.length>0){await this.#w.appendBatch(h,s);for(const t of s)n+=t.tokens.length}if(i.length>0){await this.#w.appendBatch(c,i);for(const t of i)a+=t.tokens.length}for(const e of t)this.#h.addAddedId(e.id);this.#p?(this.#D.word+=n,this.#D.char+=a):(n>0&&await this.#S("word",n),a>0&&await this.#S("char",a),await this.#h.save())}async#S(t,e){const s="word"===t?h:c,i=await this.#w.getCurrentSize(s),n="word"===t?this.#y.wordSegmentTokenThreshold||1e5:this.#y.charSegmentTokenThreshold||5e5,a="word"===t?this.#y.minWordTokenSave||0:this.#y.minCharTokenSave||0,r=this.#h.getLastSegmentInfo(t);let d,g,l,f;const w=()=>{const e=this.#h.getSegments(t).length+1;return`${t}_seg_${e}.bin`};if(r){const t=r.tokenCount;t>=n||t+e>=n?(d=w(),l=!0,g=r.end,f=e):(d=r.filename,l=!1,g=r.start,f=t+e)}else d=w(),l=!0,g=0,f=e;if(f<a)return void this.#h.updateSegment(t,d,g,i,f,l);const u=await this.#w.readRange(s,g,i);let m=this.#u.get(d);m||(m=new o(d,this.#o),this.#u.set(d,m)),await m.buildAndSave(u),this.#h.updateSegment(t,d,g,i,f,l)}async search(t,e){this.#m||await this.init();const s="string"==typeof t?{text:t}:t,i=this.#b(s),n=i.filter(t=>t.length>1),a=i.filter(t=>1===t.length),r=this.#h.getDeletedIds(),h=new Map,c=new Map,d=t=>{const e=this.#h.getSegments(t);for(const t of e){const e=t.filename;!this.#u.has(e)&&!c.has(e)&&c.set(e,new o(e,this.#o))}};d("word"),d("char"),await Promise.all(Array.from(c.entries()).map(([t,e])=>e.loadIndex().then(s=>{s&&this.#u.set(t,e)})));const g=async(t,e)=>{if(0===e.length)return;const s=this.#h.getSegments(t);for(const t of s){const s=t.filename,i=this.#u.get(s);if(i)for(const t of e){const e=i.search(t),s=1+.1*t.length;for(const i of e)if(!r.has(i))if(h.has(i)){const e=h.get(i);e.score+=s,e.tokens.add(t)}else h.set(i,{score:0,tokens:new Set([t])})}}};await g("word",n),await g("char",a);const l=[];return h.forEach((t,e)=>{l.push({id:e,score:t.score,tokens:Array.from(t.tokens)})}),l.sort((t,e)=>e.score-t.score),"number"==typeof e&&e>0?l.slice(0,e):l}async removeDocument(t){this.#m||await this.init(),this.#h.addDeletedId(t),this.#h.removeAddedId(t),await this.#h.save()}async clearAll(){await this.#o.clearAll(),this.#u.clear(),this.#h.reset(),this.#m=!1,this.#p=!1,this.#D={word:0,char:0}}async getStatus(){return this.#m||await this.init(),{wordSegments:this.#h.getSegments("word").length,charSegments:this.#h.getSegments("char").length,deleted:this.#h.getDeletedIds().size,wordCacheSize:await this.#w.getCurrentSize(h),charCacheSize:await this.#w.getCurrentSize(c),inBatch:this.#p}}async hasDocument(t){return this.#m||await this.init(),this.#h.hasDocument(t)}}exports.BrowserStorage=t,exports.NodeStorage=e,exports.SearchEngine=d,exports.SimpleSearch=class{static#T=null;static#v={baseDir:"simple_search_data",wordSegmentTokenThreshold:1e5,minWordTokenSave:0};static configure(t){const e={...this.#v,...t};this.#T=new d(e)}static#z(){return this.#T||(this.#T=new d(this.#v)),this.#T}static async startBatch(){this.#z().startBatch()}static async endBatch(){return this.#z().endBatch()}static async addDocument(t){return this.#z().addDocument(t)}static async addDocumentIfMissing(t){return this.#z().addDocumentIfMissing(t)}static async addDocuments(t){return this.#z().addDocuments(t)}static async addDocumentsIfMissing(t){return 
this.#z().addDocumentsIfMissing(t)}static async search(t,e){return this.#z().search(t,e)}static async removeDocument(t){return this.#z().removeDocument(t)}static async clearAll(){return this.#z().clearAll()}static async getStatus(){return this.#z().getStatus()}static async hasDocument(t){return this.#z().hasDocument(t)}};

package/lib/index.d.ts
CHANGED

@@ -1,10 +1,12 @@
  /**
   * 核心类型定义
   */
- interface
-   id: number;
+ interface IDocumentBase {
    text: string;
  }
+ interface IDocument extends IDocumentBase {
+   id: number;
+ }
  interface IResult {
    id: number;
    score: number;

@@ -61,7 +63,7 @@ interface ISearchEngineConfig {
   * - 建议: 针对不同语言(中文/英文/日文等)使用专门的分词实现
   * - 影响: 直接决定索引的粒度和搜索的准确性
   */
- indexingTokenizer?: (
+ indexingTokenizer?: <T extends IDocument = IDocument>(doc: T) => string[];
  /**
   * 搜索时使用的分词器 (算法核心配置)
   * - 作用: 将查询文本转换为搜索用的token序列

@@ -69,7 +71,7 @@ interface ISearchEngineConfig {
   * - 建议: 与indexingTokenizer保持一致的分词策略以确保搜索准确性
   * - 影响: 直接决定搜索匹配的范围和结果的相关性
   */
- searchTokenizer?: (
+ searchTokenizer?: <T extends IDocumentBase = IDocumentBase>(doc: T) => string[];
  /**
   * 词索引分段阈值 (Token数) - 分段算法配置
   * - 作用: 控制词索引文件的大小,超过阈值时创建新的索引段

@@ -121,9 +123,19 @@ declare class SearchEngine {
   * 触发索引构建检查并保存元数据
   */
  endBatch(): Promise<void>;
- addDocument(doc:
-
-
+ addDocument<T extends IDocument = IDocument>(doc: T): Promise<void>;
+ /**
+  * 添加单个文档,如果文档ID已存在则跳过
+  * 用于在批量添加中途出错后的恢复添加行为,也可直接用于单个文档添加
+  */
+ addDocumentIfMissing<T extends IDocument = IDocument>(doc: T): Promise<void>;
+ /**
+  * 添加多个文档,跳过已存在的文档ID
+  * 用于在批量添加中途出错后的恢复添加行为,也可直接用于批量添加
+  */
+ addDocumentsIfMissing<T extends IDocument = IDocument>(docs: T[]): Promise<void>;
+ addDocuments<T extends IDocument = IDocument>(docs: T[]): Promise<void>;
+ search<T extends IDocumentBase = IDocumentBase>(query: T | string, limit?: number): Promise<IResult[]>;
  removeDocument(id: number): Promise<void>;
  clearAll(): Promise<void>;
  getStatus(): Promise<{

@@ -134,6 +146,12 @@ declare class SearchEngine {
    charCacheSize: number;
    inBatch: boolean;
  }>;
+ /**
+  * 检查文档ID是否曾经添加过(包括已删除的)
+  * @param id 文档ID
+  * @returns 文档是否曾经添加过的布尔值
+  */
+ hasDocument(id: number): Promise<boolean>;
  }

  /**

@@ -148,9 +166,11 @@ declare class SimpleSearch {
  static configure(config: Partial<ISearchEngineConfig>): void;
  static startBatch(): Promise<void>;
  static endBatch(): Promise<void>;
- static addDocument(doc:
- static
- static
+ static addDocument<T extends IDocument = IDocument>(doc: T): Promise<void>;
+ static addDocumentIfMissing<T extends IDocument = IDocument>(doc: T): Promise<void>;
+ static addDocuments<T extends IDocument = IDocument>(docs: T[]): Promise<void>;
+ static addDocumentsIfMissing<T extends IDocument = IDocument>(docs: T[]): Promise<void>;
+ static search<T extends IDocumentBase = IDocumentBase>(query: T | string, limit?: number): Promise<IResult[]>;
  static removeDocument(id: number): Promise<void>;
  static clearAll(): Promise<void>;
  static getStatus(): Promise<{

@@ -161,6 +181,12 @@ declare class SimpleSearch {
    charCacheSize: number;
    inBatch: boolean;
  }>;
+ /**
+  * 检查文档ID是否曾经添加过(包括已删除的)
+  * @param id 文档ID
+  * @returns 文档是否曾经添加过的布尔值
+  */
+ static hasDocument(id: number): Promise<boolean>;
  }

  /**

@@ -196,4 +222,4 @@ declare class NodeStorage implements IStorage {
  }

  export { BrowserStorage, NodeStorage, SearchEngine, SimpleSearch };
- export type { IDocument, IIndexMeta, IResult, ISearchEngineConfig, ISegmentMeta, IStorage, ITokenizedDoc, IndexType };
+ export type { IDocument, IDocumentBase, IIndexMeta, IResult, ISearchEngineConfig, ISegmentMeta, IStorage, ITokenizedDoc, IndexType };
package/lib/index.js
CHANGED

@@ -1 +1 @@
-
class t{#t;constructor(t){this.#t=t}async#e(){return await(await navigator.storage.getDirectory()).getDirectoryHandle(this.#t,{create:!0})}async write(t,e){const s=await(await(await this.#e()).getFileHandle(t,{create:!0})).createWritable();await s.write(e),await s.close()}async append(t,e){const s=await this.#e();let i;try{i=await s.getFileHandle(t,{create:!0})}catch{i=await s.getFileHandle(t,{create:!0})}const n=await i.getFile(),a=await i.createWritable({keepExistingData:!0});await a.seek(n.size),await a.write(e),await a.close()}async read(t){const e=await this.#e();try{return await(await(await e.getFileHandle(t)).getFile()).arrayBuffer()}catch{return null}}async readRange(t,e,s){const i=await this.#e();try{return await(await(await i.getFileHandle(t)).getFile()).slice(e,s).arrayBuffer()}catch{return null}}async remove(t){const e=await this.#e();try{await e.removeEntry(t)}catch{}}async listFiles(){const t=await this.#e(),e=[];for await(const s of t.keys())e.push(s);return e}async clearAll(){const t=await this.#e();for await(const e of t.keys())await t.removeEntry(e,{recursive:!0})}async getFileSize(t){const e=await this.#e();try{return(await(await e.getFileHandle(t)).getFile()).size}catch{return 0}}}class e{#s=null;#i=null;#t;#n="";constructor(t){this.#t=t}async#a(){if(this.#s)return;const t=await import("node:fs"),e=await import("node:path");this.#s=t.promises,this.#i=e.default||e,this.#n=this.#i.join(process.cwd(),this.#t);try{await this.#s.access(this.#n)}catch{await this.#s.mkdir(this.#n,{recursive:!0})}}#r(t){return this.#i.join(this.#n,t)}async write(t,e){await this.#a(),await this.#s.writeFile(this.#r(t),Buffer.from(e))}async append(t,e){await this.#a(),await this.#s.appendFile(this.#r(t),Buffer.from(e))}async read(t){await this.#a();try{const e=await this.#s.readFile(this.#r(t));return e.buffer.slice(e.byteOffset,e.byteOffset+e.byteLength)}catch{return null}}async readRange(t,e,s){await this.#a();try{const i=await this.#s.open(this.#r(t),"r"),n=s-e,a=Buffer.alloc(n);return await i.read(a,0,n,e),await i.close(),a.buffer.slice(a.byteOffset,a.byteOffset+a.byteLength)}catch{return null}}async remove(t){await this.#a();try{await this.#s.unlink(this.#r(t))}catch{}}async listFiles(){await this.#a();try{return await this.#s.readdir(this.#n)}catch{return[]}}async clearAll(){await this.#a();try{const t=await this.#s.readdir(this.#n);for(const e of t)await this.#s.unlink(this.#i.join(this.#n,e))}catch{}}async getFileSize(t){await this.#a();try{return(await this.#s.stat(this.#r(t))).size}catch{return 0}}}const s="search_meta.json",i="deleted_ids.bin",n="added_ids.bin";class a{#o;#h={wordSegments:[],charSegments:[]};#c=new Set;#d=new Set;constructor(t){this.#o=t}async load(){const t=await this.#o.read(s);if(t){const e=(new TextDecoder).decode(t);this.#h=JSON.parse(e)}else this.#h={wordSegments:[],charSegments:[]};const e=await this.#o.read(i);if(e){const t=new DataView(e);let s=0;const i=e.byteLength;for(;s<i&&!(s+4>i);){const e=t.getUint32(s,!0);this.#c.add(e),s+=4,s<i&&30===t.getUint8(s)&&(s+=1)}}const a=await this.#o.read(n);if(a){const t=new DataView(a);let e=0;const s=a.byteLength;for(;e<s&&!(e+4>s);){const i=t.getUint32(e,!0);this.#d.add(i),e+=4,e<s&&30===t.getUint8(e)&&(e+=1)}}}async save(){const t=JSON.stringify(this.#h);if(await this.#o.write(s,(new TextEncoder).encode(t).buffer),0===this.#c.size)await this.#o.remove(i);else{const t=4*this.#c.size+this.#c.size,e=new ArrayBuffer(t),s=new DataView(e);let n=0;for(const t of this.#c)s.setUint32(n,t,!0),n+=4,s.setUint8(n,30),n+=1;await 
this.#o.write(i,e)}if(0===this.#d.size)await this.#o.remove(n);else{const t=4*this.#d.size+this.#d.size,e=new ArrayBuffer(t),s=new DataView(e);let i=0;for(const t of this.#d)s.setUint32(i,t,!0),i+=4,s.setUint8(i,30),i+=1;await this.#o.write(n,e)}}getSegments(t){return"word"===t?this.#h.wordSegments:this.#h.charSegments}getDeletedIds(){return this.#c}addDeletedId(t){this.#c.add(t)}isDeleted(t){return this.#c.has(t)}addAddedId(t){this.#d.add(t)}removeAddedId(t){this.#d.delete(t)}isAdded(t){return this.#d.has(t)}getAddedIds(){return this.#d}getLastSegmentInfo(t){const e=this.getSegments(t);return 0===e.length?null:e[e.length-1]}updateSegment(t,e,s,i,n,a){const r="word"===t?this.#h.wordSegments:this.#h.charSegments;if(a)r.push({filename:e,start:s,end:i,tokenCount:n});else{const t=r[r.length-1];t&&t.filename===e&&(t.end=i,t.tokenCount=n)}}reset(){this.#h={wordSegments:[],charSegments:[]},this.#c.clear(),this.#d.clear()}}class r{static SEPARATOR=30;#o;constructor(t){this.#o=t}async appendBatch(t,e){if(0===e.length)return await this.#o.getFileSize(t);const s=new TextEncoder;let i=0;for(const t of e){i+=8;for(const e of t.tokens){i+=2+Math.min(s.encode(e).byteLength,65535)}i+=1}const n=new Uint8Array(i);let a=0;for(const t of e){const e=[];for(const i of t.tokens){const t=s.encode(i),n=t.byteLength>65535?t.slice(0,65535):t;e.push(n)}const i=new DataView(n.buffer,a);i.setUint32(0,t.id,!0),i.setUint32(4,e.length,!0),a+=8;for(const t of e)new DataView(n.buffer,a).setUint16(0,t.byteLength,!0),a+=2,n.set(t,a),a+=t.byteLength;n[a++]=r.SEPARATOR}return await this.#o.append(t,n.buffer),await this.#o.getFileSize(t)}async readRange(t,e,s){const i=await this.#o.readRange(t,e,s);if(!i||0===i.byteLength)return[];const n=new DataView(i),a=new Uint8Array(i),o=new TextDecoder,h=[];let c=0;const d=i.byteLength;for(;c<d&&!(c+8>d);){const t=n.getUint32(c,!0);c+=4;const e=n.getUint32(c,!0);c+=4;const s=[];for(let t=0;t<e&&!(c+2>d);t++){const t=n.getUint16(c,!0);if(c+=2,c+t>d)break;const e=new Uint8Array(i,c,t);s.push(o.decode(e)),c+=t}c<d&&a[c]===r.SEPARATOR&&(c+=1),h.push({id:t,tokens:s})}return h}async getCurrentSize(t){return await this.#o.getFileSize(t)}}class o{#g;#o;#l=null;#f=null;static hash(t){let e=5381;for(let s=0;s<t.length;s++)e=(e<<5)+e^t.charCodeAt(s);return e>>>0}constructor(t,e){this.#g=t,this.#o=e}async loadIndex(){return!!this.#l||(this.#l=await this.#o.read(this.#g),!!this.#l&&(this.#f=new DataView(this.#l),!0))}async buildAndSave(t){const e=new Map;for(const s of t){const t=new Map;for(const i of s.tokens)if(!t.has(i)){t.set(i,!0);const n=o.hash(i);e.has(n)||e.set(n,[]),e.get(n).push(s.id)}}const s=Array.from(e.keys()).sort((t,e)=>t-e);let i=0;const n=new Array(s.length);for(let t=0;t<s.length;t++){const a=s[t],r=e.get(a);n[t]=r,i+=r.length}const a=12*s.length,r=new ArrayBuffer(8+a+4*i),h=new DataView(r);h.setUint32(0,1229866072),h.setUint32(4,s.length);let c=8,d=8+a;for(let t=0;t<s.length;t++){const e=s[t],i=n[t];h.setUint32(c,e),h.setUint32(c+4,d),h.setUint32(c+8,i.length),c+=12;for(let t=0;t<i.length;t++)h.setUint32(d,i[t],!0),d+=4}await this.#o.write(this.#g,r),this.#l=r,this.#f=h}search(t){if(!this.#f||!this.#l)return[];const e=o.hash(t);let s=0,i=this.#f.getUint32(4)-1;for(;s<=i;){const t=s+i>>>1,n=8+12*t,a=this.#f.getUint32(n);if(a<e)s=t+1;else{if(!(a>e)){const t=this.#f.getUint32(n+4),e=this.#f.getUint32(n+8),s=[];for(let i=0;i<e;i++)s.push(this.#f.getUint32(t+4*i,!0));return s}i=t-1}}return[]}}const h="word_cache.bin",c="char_cache.bin";class 
d{#o;#h;#w;#u;#m=!1;#y;#p=!1;#S={word:0,char:0};constructor(s){if(!s.baseDir)throw new Error("SearchEngine requires 'baseDir' in config.");if(this.#y={wordSegmentTokenThreshold:1e5,charSegmentTokenThreshold:5e5,minWordTokenSave:0,minCharTokenSave:0,...s},(this.#y.minWordTokenSave||0)>=(this.#y.wordSegmentTokenThreshold||1e5))throw new Error("minWordTokenSave must be less than wordSegmentTokenThreshold");if((this.#y.minCharTokenSave||0)>=(this.#y.charSegmentTokenThreshold||5e5))throw new Error("minCharTokenSave must be less than charSegmentTokenThreshold");let i=null;if(this.#y.storage&&("object"==typeof this.#y.storage?i=this.#y.storage:"browser"===this.#y.storage?i=new t(this.#y.baseDir):"node"===this.#y.storage&&(i=new e(this.#y.baseDir))),!i){const s=typeof navigator<"u"&&navigator?.storage?.getDirectory instanceof Function,n=typeof process<"u"&&null!=process.versions&&null!=process.versions.node;s?i=new t(this.#y.baseDir):n&&(i=new e(this.#y.baseDir))}if(!i)throw new Error('Storage initialization failed. Please configure "storage" explicitly or ensure you are in a supported environment (Browser/Node).');this.#o=i,this.#h=new a(this.#o),this.#w=new r(this.#o),this.#u=new Map}async init(){if(this.#m)return;await this.#h.load();const t=[...this.#h.getSegments("word"),...this.#h.getSegments("char")];for(const e of t)this.#u.has(e.filename)||this.#u.set(e.filename,new o(e.filename,this.#o)),await this.#u.get(e.filename).loadIndex();this.#m=!0}startBatch(){this.#p=!0,this.#S={word:0,char:0}}async endBatch(){this.#p=!1,this.#S.word>0&&await this.#D("word",this.#S.word),this.#S.char>0&&await this.#D("char",this.#S.char),this.#S={word:0,char:0},await this.#h.save()}#k(t){if(typeof Intl<"u"&&Intl.Segmenter){const e=new Intl.Segmenter([],{granularity:"word"});return Array.from(e.segment(t)).filter(t=>t.isWordLike).map(t=>t.segment.toLowerCase())}return t.toLowerCase().split(/[^a-z0-9\u4e00-\u9fa5]+/g).filter(t=>t.length>0)}#b(t){return this.#y.indexingTokenizer?this.#y.indexingTokenizer(t):this.#k(t)}#T(t){return this.#y.searchTokenizer?this.#y.searchTokenizer(t):this.#y.indexingTokenizer?this.#y.indexingTokenizer(t):this.#k(t)}async addDocument(t){return this.addDocuments([t])}async addDocuments(t){if(this.#m||await this.init(),0===t.length)return;const e=this.#h.getDeletedIds(),s=[],i=[];for(const n of t){if(e.has(n.id))throw new Error(`Document ID ${n.id} has been deleted and cannot be re-added.`);if(this.#h.isAdded(n.id))throw new Error(`Document ID ${n.id} already exists.`);const t=this.#b(n.text),a=[],r=[];for(const e of t)e.length>1?a.push(e):1===e.length&&r.push(e);a.length>0&&s.push({id:n.id,tokens:a}),r.length>0&&i.push({id:n.id,tokens:r})}let n=0,a=0;if(s.length>0){await this.#w.appendBatch(h,s);for(const t of s)n+=t.tokens.length}if(i.length>0){await this.#w.appendBatch(c,i);for(const t of i)a+=t.tokens.length}for(const e of t)this.#h.addAddedId(e.id);this.#p?(this.#S.word+=n,this.#S.char+=a):(n>0&&await this.#D("word",n),a>0&&await this.#D("char",a),await this.#h.save())}async#D(t,e){const s="word"===t?h:c,i=await this.#w.getCurrentSize(s),n="word"===t?this.#y.wordSegmentTokenThreshold||1e5:this.#y.charSegmentTokenThreshold||5e5,a="word"===t?this.#y.minWordTokenSave||0:this.#y.minCharTokenSave||0,r=this.#h.getLastSegmentInfo(t);let d,g,l,f;const w=()=>{const e=this.#h.getSegments(t).length+1;return`${t}_seg_${e}.bin`};if(r){const t=r.tokenCount;t>=n||t+e>=n?(d=w(),l=!0,g=r.end,f=e):(d=r.filename,l=!1,g=r.start,f=t+e)}else d=w(),l=!0,g=0,f=e;if(f<a)return void 
this.#h.updateSegment(t,d,g,i,f,l);const u=await this.#w.readRange(s,g,i);let m=this.#u.get(d);m||(m=new o(d,this.#o),this.#u.set(d,m)),await m.buildAndSave(u),this.#h.updateSegment(t,d,g,i,f,l)}async search(t,e){this.#m||await this.init();const s=this.#T(t),i=s.filter(t=>t.length>1),n=s.filter(t=>1===t.length),a=this.#h.getDeletedIds(),r=new Map,h=new Map,c=t=>{const e=this.#h.getSegments(t);for(const t of e){const e=t.filename;!this.#u.has(e)&&!h.has(e)&&h.set(e,new o(e,this.#o))}};c("word"),c("char"),await Promise.all(Array.from(h.entries()).map(([t,e])=>e.loadIndex().then(s=>{s&&this.#u.set(t,e)})));const d=async(t,e)=>{if(0===e.length)return;const s=this.#h.getSegments(t);for(const t of s){const s=t.filename,i=this.#u.get(s);if(i)for(const t of e){const e=i.search(t),s=1+.1*t.length;for(const i of e)if(!a.has(i))if(r.has(i)){const e=r.get(i);e.score+=s,e.tokens.add(t)}else r.set(i,{score:0,tokens:new Set([t])})}}};await d("word",i),await d("char",n);const g=[];return r.forEach((t,e)=>{g.push({id:e,score:t.score,tokens:Array.from(t.tokens)})}),g.sort((t,e)=>e.score-t.score),"number"==typeof e&&e>0?g.slice(0,e):g}async removeDocument(t){this.#m||await this.init(),this.#h.addDeletedId(t),this.#h.removeAddedId(t),await this.#h.save()}async clearAll(){await this.#o.clearAll(),this.#u.clear(),this.#h.reset(),this.#m=!1,this.#p=!1,this.#S={word:0,char:0}}async getStatus(){return this.#m||await this.init(),{wordSegments:this.#h.getSegments("word").length,charSegments:this.#h.getSegments("char").length,deleted:this.#h.getDeletedIds().size,wordCacheSize:await this.#w.getCurrentSize(h),charCacheSize:await this.#w.getCurrentSize(c),inBatch:this.#p}}}class g{static#I=null;static#v={baseDir:"simple_search_data",wordSegmentTokenThreshold:1e5,minWordTokenSave:0};static configure(t){const e={...this.#v,...t};this.#I=new d(e)}static#z(){return this.#I||(this.#I=new d(this.#v)),this.#I}static async startBatch(){this.#z().startBatch()}static async endBatch(){return this.#z().endBatch()}static async addDocument(t){return this.#z().addDocument(t)}static async addDocuments(t){return this.#z().addDocuments(t)}static async search(t,e){return this.#z().search(t,e)}static async removeDocument(t){return this.#z().removeDocument(t)}static async clearAll(){return this.#z().clearAll()}static async getStatus(){return this.#z().getStatus()}}export{t as BrowserStorage,e as NodeStorage,d as SearchEngine,g as SimpleSearch};
class t{#t;constructor(t){this.#t=t}async#e(){return await(await navigator.storage.getDirectory()).getDirectoryHandle(this.#t,{create:!0})}async write(t,e){const s=await(await(await this.#e()).getFileHandle(t,{create:!0})).createWritable();await s.write(e),await s.close()}async append(t,e){const s=await this.#e();let i;try{i=await s.getFileHandle(t,{create:!0})}catch{i=await s.getFileHandle(t,{create:!0})}const n=await i.getFile(),a=await i.createWritable({keepExistingData:!0});await a.seek(n.size),await a.write(e),await a.close()}async read(t){const e=await this.#e();try{return await(await(await e.getFileHandle(t)).getFile()).arrayBuffer()}catch{return null}}async readRange(t,e,s){const i=await this.#e();try{return await(await(await i.getFileHandle(t)).getFile()).slice(e,s).arrayBuffer()}catch{return null}}async remove(t){const e=await this.#e();try{await e.removeEntry(t)}catch{}}async listFiles(){const t=await this.#e(),e=[];for await(const s of t.keys())e.push(s);return e}async clearAll(){const t=await this.#e();for await(const e of t.keys())await t.removeEntry(e,{recursive:!0})}async getFileSize(t){const e=await this.#e();try{return(await(await e.getFileHandle(t)).getFile()).size}catch{return 0}}}class e{#s=null;#i=null;#t;#n="";constructor(t){this.#t=t}async#a(){if(this.#s)return;const t=await import("node:fs"),e=await import("node:path");this.#s=t.promises,this.#i=e.default||e,this.#n=this.#i.join(process.cwd(),this.#t);try{await this.#s.access(this.#n)}catch{await this.#s.mkdir(this.#n,{recursive:!0})}}#r(t){return this.#i.join(this.#n,t)}async write(t,e){await this.#a(),await this.#s.writeFile(this.#r(t),Buffer.from(e))}async append(t,e){await this.#a(),await this.#s.appendFile(this.#r(t),Buffer.from(e))}async read(t){await this.#a();try{const e=await this.#s.readFile(this.#r(t));return e.buffer.slice(e.byteOffset,e.byteOffset+e.byteLength)}catch{return null}}async readRange(t,e,s){await this.#a();try{const i=await this.#s.open(this.#r(t),"r"),n=s-e,a=Buffer.alloc(n);return await i.read(a,0,n,e),await i.close(),a.buffer.slice(a.byteOffset,a.byteOffset+a.byteLength)}catch{return null}}async remove(t){await this.#a();try{await this.#s.unlink(this.#r(t))}catch{}}async listFiles(){await this.#a();try{return await this.#s.readdir(this.#n)}catch{return[]}}async clearAll(){await this.#a();try{const t=await this.#s.readdir(this.#n);for(const e of t)await this.#s.unlink(this.#i.join(this.#n,e))}catch{}}async getFileSize(t){await this.#a();try{return(await this.#s.stat(this.#r(t))).size}catch{return 0}}}const s="search_meta.json",i="deleted_ids.bin",n="added_ids.bin";class a{#o;#h={wordSegments:[],charSegments:[]};#c=new Set;#d=new Set;constructor(t){this.#o=t}async load(){const t=await this.#o.read(s);if(t){const e=(new TextDecoder).decode(t);this.#h=JSON.parse(e)}else this.#h={wordSegments:[],charSegments:[]};const e=await this.#o.read(i);if(e){const t=new DataView(e);let s=0;const i=e.byteLength;for(;s<i&&!(s+4>i);){const e=t.getUint32(s,!0);this.#c.add(e),s+=4,s<i&&30===t.getUint8(s)&&(s+=1)}}const a=await this.#o.read(n);if(a){const t=new DataView(a);let e=0;const s=a.byteLength;for(;e<s&&!(e+4>s);){const i=t.getUint32(e,!0);this.#d.add(i),e+=4,e<s&&30===t.getUint8(e)&&(e+=1)}}}async save(){const t=JSON.stringify(this.#h);if(await this.#o.write(s,(new TextEncoder).encode(t).buffer),0===this.#c.size)await this.#o.remove(i);else{const t=4*this.#c.size+this.#c.size,e=new ArrayBuffer(t),s=new DataView(e);let n=0;for(const t of this.#c)s.setUint32(n,t,!0),n+=4,s.setUint8(n,30),n+=1;await 
this.#o.write(i,e)}if(0===this.#d.size)await this.#o.remove(n);else{const t=4*this.#d.size+this.#d.size,e=new ArrayBuffer(t),s=new DataView(e);let i=0;for(const t of this.#d)s.setUint32(i,t,!0),i+=4,s.setUint8(i,30),i+=1;await this.#o.write(n,e)}}getSegments(t){return"word"===t?this.#h.wordSegments:this.#h.charSegments}getDeletedIds(){return this.#c}addDeletedId(t){this.#c.add(t)}isDeleted(t){return this.#c.has(t)}addAddedId(t){this.#d.add(t)}removeAddedId(t){this.#d.delete(t)}isAdded(t){return this.#d.has(t)}getAddedIds(){return this.#d}hasDocument(t){return this.#d.has(t)||this.#c.has(t)}getLastSegmentInfo(t){const e=this.getSegments(t);return 0===e.length?null:e[e.length-1]}updateSegment(t,e,s,i,n,a){const r="word"===t?this.#h.wordSegments:this.#h.charSegments;if(a)r.push({filename:e,start:s,end:i,tokenCount:n});else{const t=r[r.length-1];t&&t.filename===e&&(t.end=i,t.tokenCount=n)}}reset(){this.#h={wordSegments:[],charSegments:[]},this.#c.clear(),this.#d.clear()}}class r{static SEPARATOR=30;#o;constructor(t){this.#o=t}async appendBatch(t,e){if(0===e.length)return await this.#o.getFileSize(t);const s=new TextEncoder;let i=0;for(const t of e){i+=8;for(const e of t.tokens){i+=2+Math.min(s.encode(e).byteLength,65535)}i+=1}const n=new Uint8Array(i);let a=0;for(const t of e){const e=[];for(const i of t.tokens){const t=s.encode(i),n=t.byteLength>65535?t.slice(0,65535):t;e.push(n)}const i=new DataView(n.buffer,a);i.setUint32(0,t.id,!0),i.setUint32(4,e.length,!0),a+=8;for(const t of e)new DataView(n.buffer,a).setUint16(0,t.byteLength,!0),a+=2,n.set(t,a),a+=t.byteLength;n[a++]=r.SEPARATOR}return await this.#o.append(t,n.buffer),await this.#o.getFileSize(t)}async readRange(t,e,s){const i=await this.#o.readRange(t,e,s);if(!i||0===i.byteLength)return[];const n=new DataView(i),a=new Uint8Array(i),o=new TextDecoder,h=[];let c=0;const d=i.byteLength;for(;c<d&&!(c+8>d);){const t=n.getUint32(c,!0);c+=4;const e=n.getUint32(c,!0);c+=4;const s=[];for(let t=0;t<e&&!(c+2>d);t++){const t=n.getUint16(c,!0);if(c+=2,c+t>d)break;const e=new Uint8Array(i,c,t);s.push(o.decode(e)),c+=t}c<d&&a[c]===r.SEPARATOR&&(c+=1),h.push({id:t,tokens:s})}return h}async getCurrentSize(t){return await this.#o.getFileSize(t)}}class o{#g;#o;#l=null;#f=null;static hash(t){let e=5381;for(let s=0;s<t.length;s++)e=(e<<5)+e^t.charCodeAt(s);return e>>>0}constructor(t,e){this.#g=t,this.#o=e}async loadIndex(){return!!this.#l||(this.#l=await this.#o.read(this.#g),!!this.#l&&(this.#f=new DataView(this.#l),!0))}async buildAndSave(t){const e=new Map;for(const s of t){const t=new Map;for(const i of s.tokens)if(!t.has(i)){t.set(i,!0);const n=o.hash(i);e.has(n)||e.set(n,[]),e.get(n).push(s.id)}}const s=Array.from(e.keys()).sort((t,e)=>t-e);let i=0;const n=new Array(s.length);for(let t=0;t<s.length;t++){const a=s[t],r=e.get(a);n[t]=r,i+=r.length}const a=12*s.length,r=new ArrayBuffer(8+a+4*i),h=new DataView(r);h.setUint32(0,1229866072),h.setUint32(4,s.length);let c=8,d=8+a;for(let t=0;t<s.length;t++){const e=s[t],i=n[t];h.setUint32(c,e),h.setUint32(c+4,d),h.setUint32(c+8,i.length),c+=12;for(let t=0;t<i.length;t++)h.setUint32(d,i[t],!0),d+=4}await this.#o.write(this.#g,r),this.#l=r,this.#f=h}search(t){if(!this.#f||!this.#l)return[];const e=o.hash(t);let s=0,i=this.#f.getUint32(4)-1;for(;s<=i;){const t=s+i>>>1,n=8+12*t,a=this.#f.getUint32(n);if(a<e)s=t+1;else{if(!(a>e)){const t=this.#f.getUint32(n+4),e=this.#f.getUint32(n+8),s=[];for(let i=0;i<e;i++)s.push(this.#f.getUint32(t+4*i,!0));return s}i=t-1}}return[]}}const 
h="word_cache.bin",c="char_cache.bin";class d{#o;#h;#w;#u;#m=!1;#y;#p=!1;#D={word:0,char:0};constructor(s){if(!s.baseDir)throw new Error("SearchEngine requires 'baseDir' in config.");if(this.#y={wordSegmentTokenThreshold:1e5,charSegmentTokenThreshold:5e5,minWordTokenSave:0,minCharTokenSave:0,...s},(this.#y.minWordTokenSave||0)>=(this.#y.wordSegmentTokenThreshold||1e5))throw new Error("minWordTokenSave must be less than wordSegmentTokenThreshold");if((this.#y.minCharTokenSave||0)>=(this.#y.charSegmentTokenThreshold||5e5))throw new Error("minCharTokenSave must be less than charSegmentTokenThreshold");let i=null;if(this.#y.storage&&("object"==typeof this.#y.storage?i=this.#y.storage:"browser"===this.#y.storage?i=new t(this.#y.baseDir):"node"===this.#y.storage&&(i=new e(this.#y.baseDir))),!i){const s=typeof navigator<"u"&&navigator?.storage?.getDirectory instanceof Function,n=typeof process<"u"&&null!=process.versions&&null!=process.versions.node;s?i=new t(this.#y.baseDir):n&&(i=new e(this.#y.baseDir))}if(!i)throw new Error('Storage initialization failed. Please configure "storage" explicitly or ensure you are in a supported environment (Browser/Node).');this.#o=i,this.#h=new a(this.#o),this.#w=new r(this.#o),this.#u=new Map}async init(){if(this.#m)return;await this.#h.load();const t=[...this.#h.getSegments("word"),...this.#h.getSegments("char")];for(const e of t)this.#u.has(e.filename)||this.#u.set(e.filename,new o(e.filename,this.#o)),await this.#u.get(e.filename).loadIndex();this.#m=!0}startBatch(){this.#p=!0,this.#D={word:0,char:0}}async endBatch(){this.#p=!1,this.#D.word>0&&await this.#S("word",this.#D.word),this.#D.char>0&&await this.#S("char",this.#D.char),this.#D={word:0,char:0},await this.#h.save()}#k(t){try{if(typeof Intl<"u"&&"function"==typeof Intl.Segmenter&&"function"==typeof Array.from){const e=new Intl.Segmenter([],{granularity:"word"}).segment(t);if("object"==typeof e&&null!==e)return Array.from(e).filter(t=>t?.isWordLike).map(t=>t?.segment?.toLowerCase()||"")}}catch{}return t.toLowerCase().split(/[^a-z0-9\u4e00-\u9fa5]+/g).filter(t=>t.length>0)}#I(t){return this.#y.indexingTokenizer?this.#y.indexingTokenizer(t):this.#k(t.text)}#b(t){return this.#y.searchTokenizer?this.#y.searchTokenizer(t):this.#I(t)}async addDocument(t){return this.addDocuments([t])}async addDocumentIfMissing(t){return this.addDocumentsIfMissing([t])}async addDocumentsIfMissing(t){if(this.#m||await this.init(),0===t.length)return;const e=this.#h.getDeletedIds(),s=[],i=[],n=[];for(const a of t){if(e.has(a.id)||this.#h.isAdded(a.id))continue;const t=this.#I(a),r=[],o=[];for(const e of t)e.length>1?r.push(e):1===e.length&&o.push(e);r.length>0&&s.push({id:a.id,tokens:r}),o.length>0&&i.push({id:a.id,tokens:o}),n.push(a)}if(0===n.length)return;let a=0,r=0;if(s.length>0){await this.#w.appendBatch(h,s);for(const t of s)a+=t.tokens.length}if(i.length>0){await this.#w.appendBatch(c,i);for(const t of i)r+=t.tokens.length}for(const t of n)this.#h.addAddedId(t.id);this.#p?(this.#D.word+=a,this.#D.char+=r):(a>0&&await this.#S("word",a),r>0&&await this.#S("char",r),await this.#h.save())}async addDocuments(t){if(this.#m||await this.init(),0===t.length)return;const e=this.#h.getDeletedIds(),s=[],i=[];for(const n of t){if(e.has(n.id))throw new Error(`Document ID ${n.id} has been deleted and cannot be re-added.`);if(this.#h.isAdded(n.id))throw new Error(`Document ID ${n.id} already exists.`);const t=this.#I(n),a=[],r=[];for(const e of 
t)e.length>1?a.push(e):1===e.length&&r.push(e);a.length>0&&s.push({id:n.id,tokens:a}),r.length>0&&i.push({id:n.id,tokens:r})}let n=0,a=0;if(s.length>0){await this.#w.appendBatch(h,s);for(const t of s)n+=t.tokens.length}if(i.length>0){await this.#w.appendBatch(c,i);for(const t of i)a+=t.tokens.length}for(const e of t)this.#h.addAddedId(e.id);this.#p?(this.#D.word+=n,this.#D.char+=a):(n>0&&await this.#S("word",n),a>0&&await this.#S("char",a),await this.#h.save())}async#S(t,e){const s="word"===t?h:c,i=await this.#w.getCurrentSize(s),n="word"===t?this.#y.wordSegmentTokenThreshold||1e5:this.#y.charSegmentTokenThreshold||5e5,a="word"===t?this.#y.minWordTokenSave||0:this.#y.minCharTokenSave||0,r=this.#h.getLastSegmentInfo(t);let d,g,l,f;const w=()=>{const e=this.#h.getSegments(t).length+1;return`${t}_seg_${e}.bin`};if(r){const t=r.tokenCount;t>=n||t+e>=n?(d=w(),l=!0,g=r.end,f=e):(d=r.filename,l=!1,g=r.start,f=t+e)}else d=w(),l=!0,g=0,f=e;if(f<a)return void this.#h.updateSegment(t,d,g,i,f,l);const u=await this.#w.readRange(s,g,i);let m=this.#u.get(d);m||(m=new o(d,this.#o),this.#u.set(d,m)),await m.buildAndSave(u),this.#h.updateSegment(t,d,g,i,f,l)}async search(t,e){this.#m||await this.init();const s="string"==typeof t?{text:t}:t,i=this.#b(s),n=i.filter(t=>t.length>1),a=i.filter(t=>1===t.length),r=this.#h.getDeletedIds(),h=new Map,c=new Map,d=t=>{const e=this.#h.getSegments(t);for(const t of e){const e=t.filename;!this.#u.has(e)&&!c.has(e)&&c.set(e,new o(e,this.#o))}};d("word"),d("char"),await Promise.all(Array.from(c.entries()).map(([t,e])=>e.loadIndex().then(s=>{s&&this.#u.set(t,e)})));const g=async(t,e)=>{if(0===e.length)return;const s=this.#h.getSegments(t);for(const t of s){const s=t.filename,i=this.#u.get(s);if(i)for(const t of e){const e=i.search(t),s=1+.1*t.length;for(const i of e)if(!r.has(i))if(h.has(i)){const e=h.get(i);e.score+=s,e.tokens.add(t)}else h.set(i,{score:0,tokens:new Set([t])})}}};await g("word",n),await g("char",a);const l=[];return h.forEach((t,e)=>{l.push({id:e,score:t.score,tokens:Array.from(t.tokens)})}),l.sort((t,e)=>e.score-t.score),"number"==typeof e&&e>0?l.slice(0,e):l}async removeDocument(t){this.#m||await this.init(),this.#h.addDeletedId(t),this.#h.removeAddedId(t),await this.#h.save()}async clearAll(){await this.#o.clearAll(),this.#u.clear(),this.#h.reset(),this.#m=!1,this.#p=!1,this.#D={word:0,char:0}}async getStatus(){return this.#m||await this.init(),{wordSegments:this.#h.getSegments("word").length,charSegments:this.#h.getSegments("char").length,deleted:this.#h.getDeletedIds().size,wordCacheSize:await this.#w.getCurrentSize(h),charCacheSize:await this.#w.getCurrentSize(c),inBatch:this.#p}}async hasDocument(t){return this.#m||await this.init(),this.#h.hasDocument(t)}}class g{static#T=null;static#v={baseDir:"simple_search_data",wordSegmentTokenThreshold:1e5,minWordTokenSave:0};static configure(t){const e={...this.#v,...t};this.#T=new d(e)}static#z(){return this.#T||(this.#T=new d(this.#v)),this.#T}static async startBatch(){this.#z().startBatch()}static async endBatch(){return this.#z().endBatch()}static async addDocument(t){return this.#z().addDocument(t)}static async addDocumentIfMissing(t){return this.#z().addDocumentIfMissing(t)}static async addDocuments(t){return this.#z().addDocuments(t)}static async addDocumentsIfMissing(t){return this.#z().addDocumentsIfMissing(t)}static async search(t,e){return this.#z().search(t,e)}static async removeDocument(t){return this.#z().removeDocument(t)}static async clearAll(){return this.#z().clearAll()}static async 
getStatus(){return this.#z().getStatus()}static async hasDocument(t){return this.#z().hasDocument(t)}}export{t as BrowserStorage,e as NodeStorage,d as SearchEngine,g as SimpleSearch};
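Relative to the 0.1.0 bundle above, the 0.1.2 build adds `addDocumentIfMissing`/`addDocumentsIfMissing`, `hasDocument`, and a `search()` that accepts either a plain string or a query object (strings are wrapped as `{ text }` before reaching the search tokenizer). It also changes the custom-tokenizer contract: 0.1.0 passed `doc.text` to `indexingTokenizer`, while 0.1.2 passes the whole document object. A minimal usage sketch of the new calls; the IDs, texts, and query below are illustrative, not from the package:

```typescript
import { SearchEngine } from 'gs-search';

const engine = new SearchEngine({ baseDir: 'search-data' });
await engine.init();

// New in 0.1.2: documents whose IDs are already added (or deleted) are
// silently skipped, unlike addDocuments(), which throws on duplicates.
await engine.addDocumentsIfMissing([
  { id: 1, text: 'lightweight full-text search engine' },
  { id: 2, text: 'works in the browser and in Node.js' },
]);

// New in 0.1.2: ID membership check. Judging from the bundle, this returns
// true for deleted IDs as well, not only active ones.
const known = await engine.hasDocument(1);

// New in 0.1.2: object-form query, forwarded to a custom searchTokenizer
// when one is configured; a bare string is wrapped as { text } internally.
const hits = await engine.search({ text: 'browser search' }, 10);
// => [{ id, score, tokens }, ...] sorted by descending score
console.log(known, hits);
```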
package/package.json
CHANGED

@@ -1,6 +1,6 @@
 {
   "name": "gs-search",
-  "version": "0.1.0",
+  "version": "0.1.2",
   "type": "module",
   "main": "lib/index.cjs",
   "module": "lib/index.js",
@@ -13,6 +13,8 @@
   },
   "types": "lib/index.d.ts",
   "keywords": [
+    "full-text-search",
+    "browser-search-engine"
   ],
   "homepage": "https://github.com/grain-sand/gs-search",
   "repository": {
@@ -23,6 +25,5 @@
     "url": "https://github.com/grain-sand/gs-search/issues"
   },
   "author": "grain-sand",
-  "license": "MIT",
-  "dependencies": {}
+  "license": "MIT"
 }
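Finally, both bundles export `BrowserStorage` (backed by the OPFS `navigator.storage` API) and `NodeStorage` (backed by `node:fs`), and the `SearchEngine` constructor also accepts an arbitrary object as `storage`. A sketch of a custom adapter, assuming the required surface is the eight methods the two bundled classes implement; `MemoryStorage` is a hypothetical name, not part of the package:

```typescript
import { SearchEngine } from 'gs-search';

// Hypothetical in-memory adapter. The method set mirrors the bundled
// BrowserStorage/NodeStorage classes, which is what the engine calls into.
class MemoryStorage {
  #files = new Map<string, Uint8Array>();

  async write(name: string, data: ArrayBuffer): Promise<void> {
    this.#files.set(name, new Uint8Array(data.slice(0)));
  }

  async append(name: string, data: ArrayBuffer): Promise<void> {
    const prev = this.#files.get(name) ?? new Uint8Array(0);
    const next = new Uint8Array(prev.byteLength + data.byteLength);
    next.set(prev, 0);
    next.set(new Uint8Array(data), prev.byteLength);
    this.#files.set(name, next);
  }

  async read(name: string): Promise<ArrayBuffer | null> {
    const buf = this.#files.get(name);
    return buf ? (buf.slice().buffer as ArrayBuffer) : null;
  }

  // end is exclusive, matching the slice semantics in the bundled storages
  async readRange(name: string, start: number, end: number): Promise<ArrayBuffer | null> {
    const buf = this.#files.get(name);
    return buf ? (buf.slice(start, end).buffer as ArrayBuffer) : null;
  }

  async remove(name: string): Promise<void> {
    this.#files.delete(name);
  }

  async listFiles(): Promise<string[]> {
    return [...this.#files.keys()];
  }

  async clearAll(): Promise<void> {
    this.#files.clear();
  }

  async getFileSize(name: string): Promise<number> {
    return this.#files.get(name)?.byteLength ?? 0;
  }
}

// baseDir is still required by the constructor even if the adapter ignores it.
const engine = new SearchEngine({ baseDir: 'mem', storage: new MemoryStorage() });
```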