jfox-cli 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- jfox/__init__.py +5 -0
- jfox/__main__.py +6 -0
- jfox/bm25_index.py +388 -0
- jfox/cli.py +1895 -0
- jfox/config.py +180 -0
- jfox/embedding_backend.py +65 -0
- jfox/formatters.py +305 -0
- jfox/global_config.py +281 -0
- jfox/graph.py +331 -0
- jfox/indexer.py +366 -0
- jfox/kb_manager.py +316 -0
- jfox/models.py +144 -0
- jfox/note.py +464 -0
- jfox/performance.py +408 -0
- jfox/search_engine.py +237 -0
- jfox/template.py +301 -0
- jfox/template_cli.py +327 -0
- jfox/vector_store.py +200 -0
- jfox_cli-0.1.0.dist-info/METADATA +637 -0
- jfox_cli-0.1.0.dist-info/RECORD +22 -0
- jfox_cli-0.1.0.dist-info/WHEEL +4 -0
- jfox_cli-0.1.0.dist-info/entry_points.txt +2 -0
jfox/__init__.py
ADDED
jfox/__main__.py
ADDED
jfox/bm25_index.py
ADDED
|
@@ -0,0 +1,388 @@
|
|
|
1
|
+
"""
|
|
2
|
+
BM25 索引模块
|
|
3
|
+
|
|
4
|
+
提供基于 BM25 算法的关键词搜索功能,支持索引持久化和增量更新。
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
import json
|
|
8
|
+
import logging
|
|
9
|
+
import pickle
|
|
10
|
+
import re
|
|
11
|
+
from pathlib import Path
|
|
12
|
+
from typing import Dict, List, Optional, Set, Tuple
|
|
13
|
+
|
|
14
|
+
from rank_bm25 import BM25Okapi
|
|
15
|
+
|
|
16
|
+
from .config import config
|
|
17
|
+
|
|
18
|
+
logger = logging.getLogger(__name__)
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
class BM25Index:
    """
    BM25 index manager.

    Builds, persists, loads and queries a BM25 keyword index. Documents can
    be added or removed one at a time (every change re-fits the BM25 model
    over the whole corpus and saves it) or the index can be rebuilt from
    scratch from a list of notes.
    """

    INDEX_VERSION = 1
    INDEX_FILENAME = "bm25_index.pkl"
    METADATA_FILENAME = "bm25_metadata.json"

    # Token patterns, compiled once for the class. They are applied to
    # lower-cased text, hence the lower-case-only word pattern.
    _CJK_CHAR_RE = re.compile(r'[\u4e00-\u9fff]')
    _WORD_RE = re.compile(r'[a-z][a-z0-9_]{0,20}')
    _NUMBER_RE = re.compile(r'\d+')

    def __init__(self, index_dir: Optional[Path] = None):
        """
        Initialise the index and try to load a previously saved one.

        Args:
            index_dir: Directory holding the index files; falls back to
                config.zk_dir when not given.
        """
        self.index_dir = index_dir or config.zk_dir
        self.index_path = self.index_dir / self.INDEX_FILENAME
        self.metadata_path = self.index_dir / self.METADATA_FILENAME

        # In-memory index state
        self.bm25: Optional[BM25Okapi] = None
        self.documents: List[List[str]] = []  # one token list per document
        self.doc_ids: List[str] = []  # note IDs, parallel to self.documents
        self.doc_mapping: Dict[str, int] = {}  # note_id -> position

        # Pick up an existing index from disk, if any
        self._load()

    def _tokenize(self, text: str) -> List[str]:
        """
        Split text into tokens for mixed Chinese / English content.

        The text is lower-cased, then three groups of tokens are collected
        in this order: every single CJK character in U+4E00..U+9FFF, every
        ASCII word (a letter followed by up to 20 letters, digits or
        underscores), and every run of digits. Consequently a digit run
        inside a word is emitted twice (inside the word and on its own) and
        a word longer than 21 characters is split into several tokens.

        Args:
            text: Input text.

        Returns:
            The list of tokens; empty for empty input.
        """
        if not text:
            return []

        text = text.lower()

        tokens: List[str] = []
        tokens.extend(self._CJK_CHAR_RE.findall(text))
        tokens.extend(self._WORD_RE.findall(text))
        tokens.extend(self._NUMBER_RE.findall(text))
        return tokens

    def _load(self) -> bool:
        """
        Load the index from disk.

        Any failure is logged, resets the in-memory state and yields False;
        nothing is raised to the caller.

        Returns:
            True if an index was loaded, False otherwise.
        """
        try:
            if not self.index_path.exists() or not self.metadata_path.exists():
                logger.info("BM25 index not found, will create new index")
                return False

            # Metadata first: it is cheap and carries the format version
            with open(self.metadata_path, 'r', encoding='utf-8') as f:
                metadata = json.load(f)

            if metadata.get('version') != self.INDEX_VERSION:
                logger.warning(f"BM25 index version mismatch: {metadata.get('version')} != {self.INDEX_VERSION}")
                return False

            # SECURITY: unpickling executes code embedded in the file. The
            # index file must only ever be one written by _save(); never
            # point index_dir at untrusted data.
            with open(self.index_path, 'rb') as f:
                index_data = pickle.load(f)

            documents = index_data['documents']
            doc_ids = index_data['doc_ids']
            # The two lists are indexed in parallel everywhere else, so a
            # length mismatch means the file is unusable.
            if len(documents) != len(doc_ids):
                raise ValueError("documents and doc_ids length mismatch")

            self.bm25 = index_data['bm25']
            self.documents = documents
            self.doc_ids = doc_ids
            self.doc_mapping = index_data['doc_mapping']

            logger.info(f"Loaded BM25 index: {len(self.doc_ids)} documents")
            return True

        except Exception as e:
            logger.error(f"Failed to load BM25 index: {e}")
            self._reset()
            return False

    def _save(self) -> bool:
        """
        Save the index to disk.

        Both files are written to temporary siblings first and then moved
        over the real files, so an interrupted write does not leave a
        truncated index behind.

        Returns:
            True if both files were written, False otherwise.
        """
        tmp_index = self.index_path.with_name(self.index_path.name + ".tmp")
        tmp_metadata = self.metadata_path.with_name(self.metadata_path.name + ".tmp")

        try:
            # Make sure the target directory exists
            self.index_dir.mkdir(parents=True, exist_ok=True)

            metadata = {
                'version': self.INDEX_VERSION,
                'doc_count': len(self.doc_ids),
            }
            with open(tmp_metadata, 'w', encoding='utf-8') as f:
                json.dump(metadata, f, ensure_ascii=False, indent=2)

            index_data = {
                'bm25': self.bm25,
                'documents': self.documents,
                'doc_ids': self.doc_ids,
                'doc_mapping': self.doc_mapping,
            }
            with open(tmp_index, 'wb') as f:
                pickle.dump(index_data, f)

            # Publish only after both files were written completely
            tmp_index.replace(self.index_path)
            tmp_metadata.replace(self.metadata_path)

            logger.info(f"Saved BM25 index: {len(self.doc_ids)} documents")
            return True

        except Exception as e:
            logger.error(f"Failed to save BM25 index: {e}")
            # Best-effort cleanup of partial temporary files; a failure
            # here must not mask the original error.
            for leftover in (tmp_index, tmp_metadata):
                try:
                    leftover.unlink()
                except OSError:
                    pass
            return False

    def _reset(self):
        """Reset the in-memory index state to empty."""
        self.bm25 = None
        self.documents = []
        self.doc_ids = []
        self.doc_mapping = {}

    def _rebuild_index(self):
        """Re-fit the BM25 model over all documents, or clear it if there are none."""
        if self.documents:
            self.bm25 = BM25Okapi(self.documents)
        else:
            self.bm25 = None

    def _drop_entry(self, note_id: str) -> bool:
        """Remove note_id from the in-memory lists only (no re-fit, no save).

        Returns:
            True if the note was present and has been removed.
        """
        idx = self.doc_mapping.get(note_id)
        if idx is None:
            return False

        self.documents.pop(idx)
        self.doc_ids.pop(idx)
        # Every later document moved up by one position, so renumber all
        self.doc_mapping = {doc_id: i for i, doc_id in enumerate(self.doc_ids)}
        return True

    def add_document(self, note_id: str, content: str) -> bool:
        """
        Add a document to the index, replacing any existing entry.

        Content that yields no tokens is not indexed; an existing entry for
        the same note is still removed in that case.

        Args:
            note_id: Note ID.
            content: Note content.

        Returns:
            True on success, False if an error occurred (it is logged).
        """
        try:
            # Tokenize before touching any state so that a failure here
            # leaves the index exactly as it was.
            tokens = self._tokenize(content)

            replaced = self._drop_entry(note_id)

            if tokens:
                self.doc_mapping[note_id] = len(self.documents)
                self.documents.append(tokens)
                self.doc_ids.append(note_id)
            elif not replaced:
                # Nothing to index and nothing was removed
                return True

            # Re-fit and persist once for the whole update
            self._rebuild_index()
            self._save()

            return True

        except Exception as e:
            logger.error(f"Failed to add document {note_id}: {e}")
            return False

    def remove_document(self, note_id: str) -> bool:
        """
        Remove a document from the index.

        Args:
            note_id: Note ID.

        Returns:
            True if the note is no longer indexed (including when it never
            was), False if an error occurred (it is logged).
        """
        try:
            if self._drop_entry(note_id):
                self._rebuild_index()
                self._save()

            return True

        except Exception as e:
            logger.error(f"Failed to remove document {note_id}: {e}")
            return False

    def search(self, query: str, top_k: int = 5) -> List[Dict]:
        """
        Search the index.

        Args:
            query: Search query.
            top_k: Maximum number of results; zero or a negative value
                yields no results.

        Returns:
            Results ordered by descending score, each a dict with the keys
            'note_id' and 'score'. Empty when the index is empty, the query
            yields no tokens, or an error occurred (it is logged).
        """
        if not self.bm25 or not self.documents:
            return []
        # A negative value would otherwise slice "all but the last k"
        if top_k is not None and top_k <= 0:
            return []

        try:
            query_tokens = self._tokenize(query)
            if not query_tokens:
                return []

            # One score per indexed document, in document order
            scores = self.bm25.get_scores(query_tokens)

            top_indices = sorted(range(len(scores)), key=lambda i: scores[i], reverse=True)[:top_k]

            results = []
            for idx in top_indices:
                # BM25 scores may be negative, so only a loose lower bound
                # is applied.
                # NOTE(review): documents sharing no term with the query
                # presumably score 0 and so still pass this filter; confirm
                # whether callers rely on that.
                if scores[idx] > -10:
                    results.append({
                        'note_id': self.doc_ids[idx],
                        'score': float(scores[idx]),
                    })

            return results

        except Exception as e:
            logger.error(f"BM25 search failed: {e}")
            return []

    def rebuild_from_notes(self, notes: List) -> bool:
        """
        Rebuild the whole index from a list of notes.

        Each note is indexed under note.id using its title and content
        joined by a space; notes that yield no tokens are skipped.

        Args:
            notes: Note objects exposing id, title and content.

        Returns:
            True on success, False if an error occurred (it is logged).
        """
        try:
            self._reset()

            for note in notes:
                tokens = self._tokenize(f"{note.title} {note.content}")
                if not tokens:
                    continue

                self.doc_mapping[note.id] = len(self.documents)
                self.documents.append(tokens)
                self.doc_ids.append(note.id)

            self._rebuild_index()
            self._save()

            logger.info(f"Rebuilt BM25 index from {len(notes)} notes")
            return True

        except Exception as e:
            logger.error(f"Failed to rebuild BM25 index: {e}")
            return False

    def get_stats(self) -> Dict:
        """
        Return index statistics.

        Returns:
            A dict with the number of indexed documents, the index format
            version, the index file path and whether that file exists.
        """
        return {
            'indexed': len(self.doc_ids),
            'version': self.INDEX_VERSION,
            'index_path': str(self.index_path),
            'index_exists': self.index_path.exists(),
        }

    def clear(self) -> bool:
        """
        Clear the index, both in memory and on disk.

        Returns:
            True on success, False if an error occurred (it is logged).
        """
        try:
            self._reset()

            # Remove the persisted files
            if self.index_path.exists():
                self.index_path.unlink()
            if self.metadata_path.exists():
                self.metadata_path.unlink()

            logger.info("Cleared BM25 index")
            return True

        except Exception as e:
            logger.error(f"Failed to clear BM25 index: {e}")
            return False
|
|
366
|
+
|
|
367
|
+
|
|
368
|
+
# Module-level slot for the shared BM25Index instance; None until one is created
_bm25_index: Optional[BM25Index] = None
|
|
370
|
+
|
|
371
|
+
|
|
372
|
+
def get_bm25_index() -> BM25Index:
    """
    Return the shared BM25Index instance, creating it on first use.

    Returns:
        The module-wide BM25Index instance.
    """
    global _bm25_index
    if _bm25_index is not None:
        return _bm25_index

    _bm25_index = BM25Index()
    return _bm25_index
|
|
383
|
+
|
|
384
|
+
|
|
385
|
+
def reset_bm25_index():
    """Forget the shared index instance (used when switching knowledge bases)."""
    # Same effect as `global _bm25_index; _bm25_index = None`
    globals()["_bm25_index"] = None
|