jfox-cli 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
jfox/__init__.py ADDED
@@ -0,0 +1,5 @@
"""JFox - a Zettelkasten knowledge management tool."""

# Package metadata.
__version__ = "0.1.0"
__author__ = "User"  # NOTE(review): looks like a placeholder value - confirm
__email__ = "user@example.com"  # NOTE(review): looks like a placeholder value - confirm
jfox/__main__.py ADDED
@@ -0,0 +1,6 @@
"""Entry point: runs the CLI when the package is executed as a script."""

from .cli import main

# Only start the CLI when this module is the program entry point,
# not when it is merely imported.
if __name__ == "__main__":
    main()
jfox/bm25_index.py ADDED
@@ -0,0 +1,388 @@
1
+ """
2
+ BM25 索引模块
3
+
4
+ 提供基于 BM25 算法的关键词搜索功能,支持索引持久化和增量更新。
5
+ """
6
+
7
+ import json
8
+ import logging
9
+ import pickle
10
+ import re
11
+ from pathlib import Path
12
+ from typing import Dict, List, Optional, Set, Tuple
13
+
14
+ from rank_bm25 import BM25Okapi
15
+
16
+ from .config import config
17
+
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
19
+
20
+
class BM25Index:
    """
    BM25 keyword index over note contents.

    Builds, persists, loads and queries a ``BM25Okapi`` index. Every
    mutation (add, remove, full rebuild) refits the scorer from the stored
    token lists and writes the result to disk.
    """

    INDEX_VERSION = 1
    INDEX_FILENAME = "bm25_index.pkl"
    METADATA_FILENAME = "bm25_metadata.json"

    # Tokenizer patterns, compiled once instead of on every call.
    _CJK_CHAR_RE = re.compile(r'[\u4e00-\u9fff]')
    _WORD_RE = re.compile(r'[a-z][a-z0-9_]{0,20}')
    _NUMBER_RE = re.compile(r'\d+')

    def __init__(self, index_dir: Optional[Path] = None):
        """
        Create the index manager and load any index already on disk.

        Args:
            index_dir: Directory holding the index files; defaults to
                ``config.zk_dir``.
        """
        self.index_dir = index_dir or config.zk_dir
        self.index_path = self.index_dir / self.INDEX_FILENAME
        self.metadata_path = self.index_dir / self.METADATA_FILENAME

        # Index state. The three containers are kept in lockstep:
        # documents[i] is the token list of the note whose id is doc_ids[i],
        # and doc_mapping[doc_ids[i]] == i.
        self.bm25: Optional[BM25Okapi] = None
        self.documents: List[List[str]] = []
        self.doc_ids: List[str] = []
        self.doc_mapping: Dict[str, int] = {}

        self._load()

    def _tokenize(self, text: str) -> List[str]:
        """
        Split text into lowercase tokens for mixed Chinese/English content.

        Tokens are emitted in three groups: every CJK character
        (U+4E00-U+9FFF) on its own, then ASCII words (a letter followed by up
        to 20 letters, digits or underscores), then digit runs. Two quirks
        follow from the patterns and are kept so that already persisted
        indexes stay compatible with new queries: words longer than 21
        characters are split into several tokens, and digits inside a word
        ("abc123") are additionally emitted as a number token ("123").

        Args:
            text: Input text; empty or ``None`` yields no tokens.

        Returns:
            The list of tokens.
        """
        if not text:
            return []

        lowered = text.lower()
        tokens: List[str] = []
        tokens.extend(self._CJK_CHAR_RE.findall(lowered))
        tokens.extend(self._WORD_RE.findall(lowered))
        tokens.extend(self._NUMBER_RE.findall(lowered))
        return tokens

    def _load(self) -> bool:
        """
        Load the index from disk.

        Any failure (missing files, version mismatch, unreadable or
        inconsistent data) leaves the index empty.

        Returns:
            True if an index was loaded, False otherwise.
        """
        try:
            if not self.index_path.exists() or not self.metadata_path.exists():
                logger.info("BM25 index not found, will create new index")
                return False

            with open(self.metadata_path, 'r', encoding='utf-8') as f:
                metadata = json.load(f)

            stored_version = metadata.get('version')
            if stored_version != self.INDEX_VERSION:
                logger.warning(
                    "BM25 index version mismatch: %s != %s",
                    stored_version,
                    self.INDEX_VERSION,
                )
                return False

            # NOTE(security): pickle.load can execute arbitrary code embedded
            # in the file. The file is expected to be one written by _save();
            # do not point index_dir at data from an untrusted source.
            with open(self.index_path, 'rb') as f:
                index_data = pickle.load(f)

            documents = index_data['documents']
            doc_ids = index_data['doc_ids']
            if len(documents) != len(doc_ids):
                raise ValueError(
                    f"inconsistent index data: {len(documents)} documents "
                    f"but {len(doc_ids)} ids"
                )

            self.bm25 = index_data['bm25']
            self.documents = documents
            self.doc_ids = doc_ids
            # Derive the mapping from doc_ids instead of trusting the stored
            # copy, so positions can never disagree with the lists.
            self.doc_mapping = {doc_id: i for i, doc_id in enumerate(doc_ids)}

            if self.bm25 is None and self.documents:
                self._rebuild_index()

            logger.info("Loaded BM25 index: %d documents", len(self.doc_ids))
            return True

        except Exception as e:
            logger.error("Failed to load BM25 index: %s", e)
            self._reset()
            return False

    def _save(self) -> bool:
        """
        Persist the index to disk.

        Both files are written to temporary siblings first and then moved
        into place, so an interrupted write cannot leave a truncated index
        behind.

        Returns:
            True if the index was saved, False otherwise.
        """
        index_tmp = self.index_path.with_name(self.index_path.name + '.tmp')
        metadata_tmp = self.metadata_path.with_name(self.metadata_path.name + '.tmp')

        try:
            self.index_dir.mkdir(parents=True, exist_ok=True)

            index_data = {
                'bm25': self.bm25,
                'documents': self.documents,
                'doc_ids': self.doc_ids,
                'doc_mapping': self.doc_mapping,
            }
            with open(index_tmp, 'wb') as f:
                pickle.dump(index_data, f)

            metadata = {
                'version': self.INDEX_VERSION,
                'doc_count': len(self.doc_ids),
            }
            with open(metadata_tmp, 'w', encoding='utf-8') as f:
                json.dump(metadata, f, ensure_ascii=False, indent=2)

            # Publish only after both payloads were written completely.
            index_tmp.replace(self.index_path)
            metadata_tmp.replace(self.metadata_path)

            logger.info("Saved BM25 index: %d documents", len(self.doc_ids))
            return True

        except Exception as e:
            logger.error("Failed to save BM25 index: %s", e)
            # Best-effort cleanup of partially written temporary files.
            for leftover in (index_tmp, metadata_tmp):
                try:
                    leftover.unlink()
                except OSError:
                    pass
            return False

    def _reset(self):
        """Reset the in-memory index to the empty state."""
        self.bm25 = None
        self.documents = []
        self.doc_ids = []
        self.doc_mapping = {}

    def _rebuild_index(self):
        """Refit the BM25 scorer from the stored token lists."""
        if self.documents:
            self.bm25 = BM25Okapi(self.documents)
        else:
            self.bm25 = None

    def _drop_entry(self, note_id: str) -> bool:
        """Remove a note from the in-memory lists without refitting or saving.

        Returns:
            True if the note was present, False otherwise.
        """
        idx = self.doc_mapping.get(note_id)
        if idx is None:
            return False

        del self.documents[idx]
        del self.doc_ids[idx]
        # Every later entry moved up by one; recompute all positions.
        self.doc_mapping = {doc_id: i for i, doc_id in enumerate(self.doc_ids)}
        return True

    def add_document(self, note_id: str, content: str) -> bool:
        """
        Add a note to the index, replacing any existing entry with that id.

        Content that produces no tokens is not indexed; an existing entry
        for the id is still removed in that case.

        Args:
            note_id: Note id.
            content: Note content.

        Returns:
            True on success (a failed disk write is logged but not treated
            as failure), False if an exception occurred.
        """
        try:
            replaced = self._drop_entry(note_id)

            tokens = self._tokenize(content)
            if tokens:
                self.doc_mapping[note_id] = len(self.documents)
                self.documents.append(tokens)
                self.doc_ids.append(note_id)
            elif not replaced:
                # Nothing was added or removed, so there is nothing to persist.
                return True

            # Refit and persist once, even when an old entry was replaced.
            self._rebuild_index()
            self._save()
            return True

        except Exception as e:
            logger.error("Failed to add document %s: %s", note_id, e)
            return False

    def remove_document(self, note_id: str) -> bool:
        """
        Remove a note from the index.

        Args:
            note_id: Note id; an unknown id is a no-op.

        Returns:
            True on success (including the no-op case), False if an
            exception occurred.
        """
        try:
            if not self._drop_entry(note_id):
                return True

            self._rebuild_index()
            self._save()
            return True

        except Exception as e:
            logger.error("Failed to remove document %s: %s", note_id, e)
            return False

    def search(self, query: str, top_k: int = 5) -> List[Dict]:
        """
        Search the index.

        Only documents that share at least one token with the query are
        returned. BM25Okapi scores can be zero or negative on small corpora,
        so the score itself cannot tell a match from a non-match; filtering
        on token overlap keeps unrelated documents (score 0) from being
        returned or from outranking real matches with negative scores.

        Args:
            query: Search query.
            top_k: Maximum number of results; values <= 0 yield no results.

        Returns:
            List of ``{'note_id': ..., 'score': ...}`` dicts, best first.
            Empty on any failure.
        """
        if self.bm25 is None or not self.documents or top_k <= 0:
            return []

        try:
            query_tokens = self._tokenize(query)
            if not query_tokens:
                return []

            scores = self.bm25.get_scores(query_tokens)

            wanted = set(query_tokens)
            candidates = [
                i
                for i, doc_tokens in enumerate(self.documents)
                if not wanted.isdisjoint(doc_tokens)
            ]
            # Stable sort: equally scored documents keep index order.
            candidates.sort(key=lambda i: scores[i], reverse=True)

            return [
                {'note_id': self.doc_ids[i], 'score': float(scores[i])}
                for i in candidates[:top_k]
            ]

        except Exception as e:
            logger.error("BM25 search failed: %s", e)
            return []

    def rebuild_from_notes(self, notes: List) -> bool:
        """
        Rebuild the whole index from a list of notes.

        The new index is assembled separately and swapped in only once every
        note has been tokenized, so a failure part-way through leaves the
        previous index untouched. If a note id occurs more than once, the
        last occurrence wins.

        Args:
            notes: Note objects; ``id``, ``title`` and ``content`` are read.

        Returns:
            True on success, False if an exception occurred.
        """
        try:
            documents: List[List[str]] = []
            doc_ids: List[str] = []
            mapping: Dict[str, int] = {}

            for note in notes:
                # Index title and body together.
                tokens = self._tokenize(f"{note.title} {note.content}")
                if not tokens:
                    continue

                existing = mapping.get(note.id)
                if existing is not None:
                    documents[existing] = tokens
                    continue

                mapping[note.id] = len(documents)
                documents.append(tokens)
                doc_ids.append(note.id)

            self.documents = documents
            self.doc_ids = doc_ids
            self.doc_mapping = mapping

            self._rebuild_index()
            self._save()

            logger.info("Rebuilt BM25 index from %d notes", len(notes))
            return True

        except Exception as e:
            logger.error("Failed to rebuild BM25 index: %s", e)
            return False

    def get_stats(self) -> Dict:
        """
        Return index statistics.

        Returns:
            Dict with the number of indexed documents, the index format
            version, the index file path and whether that file exists.
        """
        return {
            'indexed': len(self.doc_ids),
            'version': self.INDEX_VERSION,
            'index_path': str(self.index_path),
            'index_exists': self.index_path.exists(),
        }

    def clear(self) -> bool:
        """
        Empty the index and delete its files.

        Returns:
            True on success, False if an exception occurred.
        """
        try:
            self._reset()

            for path in (self.index_path, self.metadata_path):
                try:
                    path.unlink()
                except FileNotFoundError:
                    # Already absent, which is the desired end state.
                    pass

            logger.info("Cleared BM25 index")
            return True

        except Exception as e:
            logger.error("Failed to clear BM25 index: %s", e)
            return False
366
+
367
+
# Module-wide index instance; None until get_bm25_index() creates it.
_bm25_index: Optional[BM25Index] = None
370
+
371
+
def get_bm25_index() -> BM25Index:
    """
    Return the shared BM25 index, creating it on first use.

    Returns:
        The module-wide BM25Index instance.
    """
    global _bm25_index

    instance = _bm25_index
    if instance is not None:
        return instance

    # First access: build the index and remember it for later calls.
    instance = BM25Index()
    _bm25_index = instance
    return instance
383
+
384
+
def reset_bm25_index():
    """Drop the global index instance (used when switching knowledge bases).

    The next call to get_bm25_index() constructs a fresh BM25Index.
    """
    global _bm25_index
    _bm25_index = None