jfox-cli 0.1.4__tar.gz → 0.1.5__tar.gz
This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the differences between those versions as they appear in their respective public registries.
- {jfox_cli-0.1.4 → jfox_cli-0.1.5}/.github/workflows/integration-test.yml +98 -4
- {jfox_cli-0.1.4 → jfox_cli-0.1.5}/CLAUDE.md +4 -0
- {jfox_cli-0.1.4 → jfox_cli-0.1.5}/PKG-INFO +1 -1
- {jfox_cli-0.1.4 → jfox_cli-0.1.5}/jfox/__init__.py +1 -1
- {jfox_cli-0.1.4 → jfox_cli-0.1.5}/jfox/bm25_index.py +110 -106
- {jfox_cli-0.1.4 → jfox_cli-0.1.5}/jfox/cli.py +532 -418
- {jfox_cli-0.1.4 → jfox_cli-0.1.5}/jfox/config.py +32 -29
- {jfox_cli-0.1.4 → jfox_cli-0.1.5}/jfox/embedding_backend.py +10 -11
- {jfox_cli-0.1.4 → jfox_cli-0.1.5}/jfox/formatters.py +55 -66
- {jfox_cli-0.1.4 → jfox_cli-0.1.5}/jfox/global_config.py +56 -64
- {jfox_cli-0.1.4 → jfox_cli-0.1.5}/jfox/graph.py +54 -57
- {jfox_cli-0.1.4 → jfox_cli-0.1.5}/jfox/indexer.py +114 -95
- {jfox_cli-0.1.4 → jfox_cli-0.1.5}/jfox/kb_manager.py +62 -67
- {jfox_cli-0.1.4 → jfox_cli-0.1.5}/jfox/models.py +40 -37
- {jfox_cli-0.1.4 → jfox_cli-0.1.5}/jfox/note.py +281 -110
- {jfox_cli-0.1.4 → jfox_cli-0.1.5}/jfox/performance.py +93 -90
- {jfox_cli-0.1.4 → jfox_cli-0.1.5}/jfox/search_engine.py +65 -52
- {jfox_cli-0.1.4 → jfox_cli-0.1.5}/jfox/template.py +64 -53
- {jfox_cli-0.1.4 → jfox_cli-0.1.5}/jfox/template_cli.py +47 -42
- {jfox_cli-0.1.4 → jfox_cli-0.1.5}/jfox/vector_store.py +76 -51
- {jfox_cli-0.1.4 → jfox_cli-0.1.5}/pyproject.toml +4 -2
- {jfox_cli-0.1.4 → jfox_cli-0.1.5}/skills-recommend/claude-code/jfox-insert/SKILL.md +25 -0
- {jfox_cli-0.1.4 → jfox_cli-0.1.5}/skills-recommend/claude-code/jfox-organize/SKILL.md +7 -2
- {jfox_cli-0.1.4 → jfox_cli-0.1.5}/tests/conftest.py +96 -89
- {jfox_cli-0.1.4 → jfox_cli-0.1.5}/tests/integration/test_backlinks.py +35 -51
- {jfox_cli-0.1.4 → jfox_cli-0.1.5}/tests/performance/test_performance.py +76 -75
- {jfox_cli-0.1.4 → jfox_cli-0.1.5}/tests/test_advanced_features.py +58 -29
- {jfox_cli-0.1.4 → jfox_cli-0.1.5}/tests/test_cli_format.py +40 -31
- {jfox_cli-0.1.4 → jfox_cli-0.1.5}/tests/test_config_unit.py +74 -70
- {jfox_cli-0.1.4 → jfox_cli-0.1.5}/tests/test_core_workflow.py +140 -129
- {jfox_cli-0.1.4 → jfox_cli-0.1.5}/tests/test_hybrid_search.py +40 -37
- {jfox_cli-0.1.4 → jfox_cli-0.1.5}/tests/test_integration.py +11 -11
- {jfox_cli-0.1.4 → jfox_cli-0.1.5}/tests/test_kb_current.py +18 -20
- {jfox_cli-0.1.4 → jfox_cli-0.1.5}/tests/unit/test_bm25_batch.py +11 -7
- {jfox_cli-0.1.4 → jfox_cli-0.1.5}/tests/unit/test_edit.py +9 -12
- {jfox_cli-0.1.4 → jfox_cli-0.1.5}/tests/unit/test_format_unify.py +30 -12
- {jfox_cli-0.1.4 → jfox_cli-0.1.5}/tests/unit/test_formatters.py +35 -24
- {jfox_cli-0.1.4 → jfox_cli-0.1.5}/tests/unit/test_global_config.py +157 -174
- jfox_cli-0.1.5/tests/unit/test_index_kb_param.py +92 -0
- jfox_cli-0.1.5/tests/unit/test_indexer_clear_before_rebuild.py +69 -0
- jfox_cli-0.1.5/tests/unit/test_indexer_verify.py +53 -0
- {jfox_cli-0.1.4 → jfox_cli-0.1.5}/tests/unit/test_kb_manager.py +114 -127
- {jfox_cli-0.1.4 → jfox_cli-0.1.5}/tests/unit/test_template.py +32 -43
- {jfox_cli-0.1.4 → jfox_cli-0.1.5}/tests/unit/test_template_cli.py +250 -233
- jfox_cli-0.1.5/tests/unit/test_vector_store_clear.py +76 -0
- {jfox_cli-0.1.4 → jfox_cli-0.1.5}/tests/utils/assertions.py +45 -32
- {jfox_cli-0.1.4 → jfox_cli-0.1.5}/tests/utils/jfox_cli.py +84 -105
- {jfox_cli-0.1.4 → jfox_cli-0.1.5}/tests/utils/note_generator.py +44 -51
- {jfox_cli-0.1.4 → jfox_cli-0.1.5}/tests/utils/temp_kb.py +5 -5
- {jfox_cli-0.1.4 → jfox_cli-0.1.5}/uv.lock +1 -1
- {jfox_cli-0.1.4 → jfox_cli-0.1.5}/.githooks/pre-push +0 -0
- {jfox_cli-0.1.4 → jfox_cli-0.1.5}/.github/workflows/publish.yml +0 -0
- {jfox_cli-0.1.4 → jfox_cli-0.1.5}/.gitignore +0 -0
- {jfox_cli-0.1.4 → jfox_cli-0.1.5}/.python-version +0 -0
- {jfox_cli-0.1.4 → jfox_cli-0.1.5}/AGENTS.md +0 -0
- {jfox_cli-0.1.4 → jfox_cli-0.1.5}/DEVELOPMENT_PLAN.md +0 -0
- {jfox_cli-0.1.4 → jfox_cli-0.1.5}/README.md +0 -0
- {jfox_cli-0.1.4 → jfox_cli-0.1.5}/SESSION.md +0 -0
- {jfox_cli-0.1.4 → jfox_cli-0.1.5}/SESSION_SUMMARY.md +0 -0
- {jfox_cli-0.1.4 → jfox_cli-0.1.5}/docs/superpowers/specs/2026-04-03-bugfixes-design.md +0 -0
- {jfox_cli-0.1.4 → jfox_cli-0.1.5}/jessica-jones-static-cable.md +0 -0
- {jfox_cli-0.1.4 → jfox_cli-0.1.5}/jfox/__main__.py +0 -0
- {jfox_cli-0.1.4 → jfox_cli-0.1.5}/pytest.ini +0 -0
- {jfox_cli-0.1.4 → jfox_cli-0.1.5}/run_full_test.ps1 +0 -0
- {jfox_cli-0.1.4 → jfox_cli-0.1.5}/skill/evals/evals.json +0 -0
- {jfox_cli-0.1.4 → jfox_cli-0.1.5}/skill/knowledge-base-notes/SKILL.md +0 -0
- {jfox_cli-0.1.4 → jfox_cli-0.1.5}/skill/knowledge-base-workspace/SKILL.md +0 -0
- {jfox_cli-0.1.4 → jfox_cli-0.1.5}/skills-recommend/README.md +0 -0
- {jfox_cli-0.1.4 → jfox_cli-0.1.5}/skills-recommend/claude-code/jfox-health/SKILL.md +0 -0
- {jfox_cli-0.1.4 → jfox_cli-0.1.5}/skills-recommend/claude-code/jfox-init/SKILL.md +0 -0
- {jfox_cli-0.1.4 → jfox_cli-0.1.5}/skills-recommend/claude-code/jfox-search/SKILL.md +0 -0
- {jfox_cli-0.1.4 → jfox_cli-0.1.5}/tests/COVERAGE_PLAN.md +0 -0
- {jfox_cli-0.1.4 → jfox_cli-0.1.5}/tests/MIGRATION.md +0 -0
- {jfox_cli-0.1.4 → jfox_cli-0.1.5}/tests/TESTS.md +0 -0
- {jfox_cli-0.1.4 → jfox_cli-0.1.5}/tests/integration/__init__.py +0 -0
- {jfox_cli-0.1.4 → jfox_cli-0.1.5}/tests/performance/__init__.py +0 -0
- {jfox_cli-0.1.4 → jfox_cli-0.1.5}/tests/test_suggest_links.py +0 -0
- {jfox_cli-0.1.4 → jfox_cli-0.1.5}/tests/unit/__init__.py +0 -0
- {jfox_cli-0.1.4 → jfox_cli-0.1.5}/tests/utils/__init__.py +0 -0
|
@@ -14,6 +14,7 @@ on:
|
|
|
14
14
|
- 'jfox/**'
|
|
15
15
|
- 'tests/**'
|
|
16
16
|
- 'pyproject.toml'
|
|
17
|
+
- '.github/workflows/integration-test.yml'
|
|
17
18
|
# 允许手动触发
|
|
18
19
|
workflow_dispatch:
|
|
19
20
|
inputs:
|
|
@@ -33,6 +34,32 @@ env:
|
|
|
33
34
|
PYTHONUTF8: 1
|
|
34
35
|
|
|
35
36
|
jobs:
|
|
37
|
+
# ============ Lint 检查 ============
|
|
38
|
+
lint:
|
|
39
|
+
runs-on: ubuntu-latest
|
|
40
|
+
steps:
|
|
41
|
+
- name: Checkout code
|
|
42
|
+
uses: actions/checkout@v4
|
|
43
|
+
|
|
44
|
+
- name: Set up Python
|
|
45
|
+
uses: actions/setup-python@v5
|
|
46
|
+
with:
|
|
47
|
+
python-version: '3.11'
|
|
48
|
+
|
|
49
|
+
- uses: astral-sh/setup-uv@v4
|
|
50
|
+
with:
|
|
51
|
+
version: "latest"
|
|
52
|
+
enable-cache: true
|
|
53
|
+
|
|
54
|
+
- name: Install dependencies
|
|
55
|
+
run: uv sync --extra dev
|
|
56
|
+
|
|
57
|
+
- name: Run ruff check
|
|
58
|
+
run: uv run ruff check jfox/ tests/
|
|
59
|
+
|
|
60
|
+
- name: Run black check
|
|
61
|
+
run: uv run black --check jfox/ tests/
|
|
62
|
+
|
|
36
63
|
# ============ 快速测试(PR 和 push 触发)============
|
|
37
64
|
test-fast:
|
|
38
65
|
runs-on: ${{ matrix.os }}
|
|
@@ -102,12 +129,20 @@ jobs:
|
|
|
102
129
|
version: "latest"
|
|
103
130
|
enable-cache: true
|
|
104
131
|
|
|
105
|
-
- name: Cache model
|
|
132
|
+
- name: Cache model (Unix)
|
|
133
|
+
if: runner.os != 'Windows'
|
|
106
134
|
uses: actions/cache@v4
|
|
107
135
|
with:
|
|
108
136
|
path: ~/.cache/torch/sentence_transformers
|
|
109
137
|
key: ${{ runner.os }}-sentence-transformers-all-MiniLM-L6-v2
|
|
110
138
|
|
|
139
|
+
- name: Cache model (Windows)
|
|
140
|
+
if: runner.os == 'Windows'
|
|
141
|
+
uses: actions/cache@v4
|
|
142
|
+
with:
|
|
143
|
+
path: ~\AppData\Local\torch\sentence_transformers
|
|
144
|
+
key: ${{ runner.os }}-sentence-transformers-all-MiniLM-L6-v2
|
|
145
|
+
|
|
111
146
|
- name: Install dependencies
|
|
112
147
|
run: uv sync --extra dev
|
|
113
148
|
|
|
@@ -152,12 +187,20 @@ jobs:
|
|
|
152
187
|
version: "latest"
|
|
153
188
|
enable-cache: true
|
|
154
189
|
|
|
155
|
-
- name: Cache model
|
|
190
|
+
- name: Cache model (Unix)
|
|
191
|
+
if: runner.os != 'Windows'
|
|
156
192
|
uses: actions/cache@v4
|
|
157
193
|
with:
|
|
158
194
|
path: ~/.cache/torch/sentence_transformers
|
|
159
195
|
key: ${{ runner.os }}-sentence-transformers-all-MiniLM-L6-v2
|
|
160
196
|
|
|
197
|
+
- name: Cache model (Windows)
|
|
198
|
+
if: runner.os == 'Windows'
|
|
199
|
+
uses: actions/cache@v4
|
|
200
|
+
with:
|
|
201
|
+
path: ~\AppData\Local\torch\sentence_transformers
|
|
202
|
+
key: ${{ runner.os }}-sentence-transformers-all-MiniLM-L6-v2
|
|
203
|
+
|
|
161
204
|
- name: Install dependencies
|
|
162
205
|
run: uv sync --extra dev
|
|
163
206
|
|
|
@@ -178,7 +221,25 @@ jobs:
|
|
|
178
221
|
path: |
|
|
179
222
|
.pytest_cache/
|
|
180
223
|
|
|
181
|
-
# ============
|
|
224
|
+
# ============ 质量门禁(所有必须测试通过才算成功)============
|
|
225
|
+
quality-gate:
|
|
226
|
+
runs-on: ubuntu-latest
|
|
227
|
+
needs: [lint, test-fast]
|
|
228
|
+
if: always()
|
|
229
|
+
steps:
|
|
230
|
+
- name: Check all jobs passed
|
|
231
|
+
run: |
|
|
232
|
+
echo "lint: ${{ needs.lint.result }}"
|
|
233
|
+
echo "test-fast: ${{ needs.test-fast.result }}"
|
|
234
|
+
if [[ "${{ needs.lint.result }}" == "success" && "${{ needs.test-fast.result }}" == "success" ]]; then
|
|
235
|
+
echo "Quality gate passed!"
|
|
236
|
+
exit 0
|
|
237
|
+
else
|
|
238
|
+
echo "Quality gate FAILED!"
|
|
239
|
+
exit 1
|
|
240
|
+
fi
|
|
241
|
+
|
|
242
|
+
# ============ 覆盖率报告(仅 fast 测试通过后)============
|
|
182
243
|
coverage:
|
|
183
244
|
runs-on: ubuntu-latest
|
|
184
245
|
needs: [test-fast]
|
|
@@ -203,9 +264,42 @@ jobs:
|
|
|
203
264
|
|
|
204
265
|
- name: Run coverage
|
|
205
266
|
run: |
|
|
206
|
-
uv run pytest tests/ -m "not embedding and not slow" --cov=jfox --cov-report=xml --cov-report=html -v --timeout=300
|
|
267
|
+
uv run pytest tests/ -m "not embedding and not slow" --cov=jfox --cov-report=xml --cov-report=html --cov-report=term -v --timeout=300
|
|
207
268
|
timeout-minutes: 25
|
|
208
269
|
|
|
270
|
+
- name: Post coverage comment on PR
|
|
271
|
+
if: github.event_name == 'pull_request'
|
|
272
|
+
env:
|
|
273
|
+
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
|
|
274
|
+
run: |
|
|
275
|
+
python -c "
|
|
276
|
+
import xml.etree.ElementTree as ET
|
|
277
|
+
import subprocess
|
|
278
|
+
|
|
279
|
+
tree = ET.parse('coverage.xml')
|
|
280
|
+
root = tree.getroot()
|
|
281
|
+
rate = float(root.attrib['line-rate'])
|
|
282
|
+
lines_covered = int(root.attrib['lines-covered'])
|
|
283
|
+
lines_valid = int(root.attrib['lines-valid'])
|
|
284
|
+
|
|
285
|
+
rows = []
|
|
286
|
+
for cls in root.iter('class'):
|
|
287
|
+
name = cls.attrib['filename']
|
|
288
|
+
r = float(cls.attrib['line-rate'])
|
|
289
|
+
rows.append((name, r))
|
|
290
|
+
rows.sort(key=lambda x: x[1])
|
|
291
|
+
|
|
292
|
+
comment = '## Test Coverage\n\n'
|
|
293
|
+
comment += '**Overall: {:.1f}%** ({}/{} lines)\n\n'.format(rate * 100, lines_covered, lines_valid)
|
|
294
|
+
comment += '| Module | Coverage | Status |\n|--------|----------|--------|\n'
|
|
295
|
+
for name, r in rows:
|
|
296
|
+
icon = ':green_circle:' if r >= 0.8 else ':yellow_circle:' if r >= 0.5 else ':red_circle:'
|
|
297
|
+
comment += '| {} | {:.1f}% | {} |\n'.format(name, r * 100, icon)
|
|
298
|
+
|
|
299
|
+
pr = '${{ github.event.pull_request.number }}'
|
|
300
|
+
subprocess.run(['gh', 'pr', 'comment', pr, '--body', comment])
|
|
301
|
+
"
|
|
302
|
+
|
|
209
303
|
- name: Upload coverage report
|
|
210
304
|
uses: actions/upload-artifact@v4
|
|
211
305
|
with:
|
|
@@ -126,6 +126,10 @@ Four jobs in `.github/workflows/integration-test.yml`:
|
|
|
126
126
|
- Set `PYTHONUTF8=1` and `chcp 65001` for encoding
|
|
127
127
|
- HuggingFace mirror for China: `export HF_ENDPOINT=https://hf-mirror.com`
|
|
128
128
|
|
|
129
|
+
## Branch Rules
|
|
130
|
+
|
|
131
|
+
- **main 是保护分支**,不能直接 commit 或 push。所有改动必须通过新分支 + PR 合入。
|
|
132
|
+
|
|
129
133
|
## Gotchas
|
|
130
134
|
|
|
131
135
|
- `pytest.ini` `addopts` includes `-v`, so `pytest tests/` already runs verbose — adding `-v` manually is redundant
|
|
@@ -9,7 +9,7 @@ import logging
|
|
|
9
9
|
import pickle
|
|
10
10
|
import re
|
|
11
11
|
from pathlib import Path
|
|
12
|
-
from typing import Dict, List, Optional,
|
|
12
|
+
from typing import Dict, List, Optional, Tuple
|
|
13
13
|
|
|
14
14
|
from rank_bm25 import BM25Okapi
|
|
15
15
|
|
|
@@ -21,73 +21,73 @@ logger = logging.getLogger(__name__)
|
|
|
21
21
|
class BM25Index:
|
|
22
22
|
"""
|
|
23
23
|
BM25 索引管理器
|
|
24
|
-
|
|
24
|
+
|
|
25
25
|
负责构建、保存、加载和查询 BM25 索引。
|
|
26
26
|
支持增量更新和全量重建。
|
|
27
27
|
"""
|
|
28
|
-
|
|
28
|
+
|
|
29
29
|
INDEX_VERSION = 1
|
|
30
30
|
INDEX_FILENAME = "bm25_index.pkl"
|
|
31
31
|
METADATA_FILENAME = "bm25_metadata.json"
|
|
32
|
-
|
|
32
|
+
|
|
33
33
|
def __init__(self, index_dir: Optional[Path] = None):
|
|
34
34
|
"""
|
|
35
35
|
初始化 BM25 索引
|
|
36
|
-
|
|
36
|
+
|
|
37
37
|
Args:
|
|
38
38
|
index_dir: 索引文件存放目录,默认为 config.zk_dir
|
|
39
39
|
"""
|
|
40
40
|
self.index_dir = index_dir or config.zk_dir
|
|
41
41
|
self.index_path = self.index_dir / self.INDEX_FILENAME
|
|
42
42
|
self.metadata_path = self.index_dir / self.METADATA_FILENAME
|
|
43
|
-
|
|
43
|
+
|
|
44
44
|
# 索引数据
|
|
45
45
|
self.bm25: Optional[BM25Okapi] = None
|
|
46
46
|
self.documents: List[str] = [] # 分词后的文档列表
|
|
47
|
-
self.doc_ids: List[str] = []
|
|
47
|
+
self.doc_ids: List[str] = [] # 文档 ID 列表
|
|
48
48
|
self.doc_mapping: Dict[str, int] = {} # note_id -> index
|
|
49
|
-
|
|
49
|
+
|
|
50
50
|
# 加载已有索引
|
|
51
51
|
self._load()
|
|
52
|
-
|
|
52
|
+
|
|
53
53
|
def _tokenize(self, text: str) -> List[str]:
|
|
54
54
|
"""
|
|
55
55
|
分词函数 - 适配中英文
|
|
56
|
-
|
|
56
|
+
|
|
57
57
|
Args:
|
|
58
58
|
text: 输入文本
|
|
59
|
-
|
|
59
|
+
|
|
60
60
|
Returns:
|
|
61
61
|
分词结果列表
|
|
62
62
|
"""
|
|
63
63
|
if not text:
|
|
64
64
|
return []
|
|
65
|
-
|
|
65
|
+
|
|
66
66
|
# 转换为小写
|
|
67
67
|
text = text.lower()
|
|
68
|
-
|
|
68
|
+
|
|
69
69
|
# 提取中文字符串(2-10字)和英文单词
|
|
70
70
|
# 中文按字符分割,英文按单词分割
|
|
71
71
|
tokens = []
|
|
72
|
-
|
|
72
|
+
|
|
73
73
|
# 匹配中文字符
|
|
74
|
-
chinese_chars = re.findall(r
|
|
74
|
+
chinese_chars = re.findall(r"[\u4e00-\u9fff]", text)
|
|
75
75
|
tokens.extend(chinese_chars)
|
|
76
|
-
|
|
76
|
+
|
|
77
77
|
# 匹配英文单词(包括下划线连接的变量名)
|
|
78
|
-
english_words = re.findall(r
|
|
78
|
+
english_words = re.findall(r"[a-z][a-z0-9_]{0,20}", text)
|
|
79
79
|
tokens.extend(english_words)
|
|
80
|
-
|
|
80
|
+
|
|
81
81
|
# 匹配数字
|
|
82
|
-
numbers = re.findall(r
|
|
82
|
+
numbers = re.findall(r"\d+", text)
|
|
83
83
|
tokens.extend(numbers)
|
|
84
|
-
|
|
84
|
+
|
|
85
85
|
return tokens
|
|
86
|
-
|
|
86
|
+
|
|
87
87
|
def _load(self) -> bool:
|
|
88
88
|
"""
|
|
89
89
|
从磁盘加载索引
|
|
90
|
-
|
|
90
|
+
|
|
91
91
|
Returns:
|
|
92
92
|
是否成功加载
|
|
93
93
|
"""
|
|
@@ -95,91 +95,93 @@ class BM25Index:
|
|
|
95
95
|
if not self.index_path.exists() or not self.metadata_path.exists():
|
|
96
96
|
logger.info("BM25 index not found, will create new index")
|
|
97
97
|
return False
|
|
98
|
-
|
|
98
|
+
|
|
99
99
|
# 加载元数据
|
|
100
|
-
with open(self.metadata_path,
|
|
100
|
+
with open(self.metadata_path, "r", encoding="utf-8") as f:
|
|
101
101
|
metadata = json.load(f)
|
|
102
|
-
|
|
102
|
+
|
|
103
103
|
# 检查版本
|
|
104
|
-
if metadata.get(
|
|
105
|
-
logger.warning(
|
|
104
|
+
if metadata.get("version") != self.INDEX_VERSION:
|
|
105
|
+
logger.warning(
|
|
106
|
+
f"BM25 index version mismatch: {metadata.get('version')} != {self.INDEX_VERSION}"
|
|
107
|
+
)
|
|
106
108
|
return False
|
|
107
|
-
|
|
109
|
+
|
|
108
110
|
# 加载索引
|
|
109
|
-
with open(self.index_path,
|
|
111
|
+
with open(self.index_path, "rb") as f:
|
|
110
112
|
index_data = pickle.load(f)
|
|
111
|
-
|
|
112
|
-
self.bm25 = index_data[
|
|
113
|
-
self.documents = index_data[
|
|
114
|
-
self.doc_ids = index_data[
|
|
115
|
-
self.doc_mapping = index_data[
|
|
116
|
-
|
|
113
|
+
|
|
114
|
+
self.bm25 = index_data["bm25"]
|
|
115
|
+
self.documents = index_data["documents"]
|
|
116
|
+
self.doc_ids = index_data["doc_ids"]
|
|
117
|
+
self.doc_mapping = index_data["doc_mapping"]
|
|
118
|
+
|
|
117
119
|
logger.info(f"Loaded BM25 index: {len(self.doc_ids)} documents")
|
|
118
120
|
return True
|
|
119
|
-
|
|
121
|
+
|
|
120
122
|
except Exception as e:
|
|
121
123
|
logger.error(f"Failed to load BM25 index: {e}")
|
|
122
124
|
self._reset()
|
|
123
125
|
return False
|
|
124
|
-
|
|
126
|
+
|
|
125
127
|
def _save(self) -> bool:
|
|
126
128
|
"""
|
|
127
129
|
保存索引到磁盘
|
|
128
|
-
|
|
130
|
+
|
|
129
131
|
Returns:
|
|
130
132
|
是否成功保存
|
|
131
133
|
"""
|
|
132
134
|
try:
|
|
133
135
|
# 确保目录存在
|
|
134
136
|
self.index_dir.mkdir(parents=True, exist_ok=True)
|
|
135
|
-
|
|
137
|
+
|
|
136
138
|
# 保存元数据
|
|
137
139
|
metadata = {
|
|
138
|
-
|
|
139
|
-
|
|
140
|
+
"version": self.INDEX_VERSION,
|
|
141
|
+
"doc_count": len(self.doc_ids),
|
|
140
142
|
}
|
|
141
|
-
with open(self.metadata_path,
|
|
143
|
+
with open(self.metadata_path, "w", encoding="utf-8") as f:
|
|
142
144
|
json.dump(metadata, f, ensure_ascii=False, indent=2)
|
|
143
|
-
|
|
145
|
+
|
|
144
146
|
# 保存索引数据
|
|
145
147
|
index_data = {
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
148
|
+
"bm25": self.bm25,
|
|
149
|
+
"documents": self.documents,
|
|
150
|
+
"doc_ids": self.doc_ids,
|
|
151
|
+
"doc_mapping": self.doc_mapping,
|
|
150
152
|
}
|
|
151
|
-
with open(self.index_path,
|
|
153
|
+
with open(self.index_path, "wb") as f:
|
|
152
154
|
pickle.dump(index_data, f)
|
|
153
|
-
|
|
155
|
+
|
|
154
156
|
logger.info(f"Saved BM25 index: {len(self.doc_ids)} documents")
|
|
155
157
|
return True
|
|
156
|
-
|
|
158
|
+
|
|
157
159
|
except Exception as e:
|
|
158
160
|
logger.error(f"Failed to save BM25 index: {e}")
|
|
159
161
|
return False
|
|
160
|
-
|
|
162
|
+
|
|
161
163
|
def _reset(self):
|
|
162
164
|
"""重置索引状态"""
|
|
163
165
|
self.bm25 = None
|
|
164
166
|
self.documents = []
|
|
165
167
|
self.doc_ids = []
|
|
166
168
|
self.doc_mapping = {}
|
|
167
|
-
|
|
169
|
+
|
|
168
170
|
def _rebuild_index(self):
|
|
169
171
|
"""重新构建 BM25 索引"""
|
|
170
172
|
if self.documents:
|
|
171
173
|
self.bm25 = BM25Okapi(self.documents)
|
|
172
174
|
else:
|
|
173
175
|
self.bm25 = None
|
|
174
|
-
|
|
176
|
+
|
|
175
177
|
def add_document(self, note_id: str, content: str) -> bool:
|
|
176
178
|
"""
|
|
177
179
|
添加文档到索引(增量更新)
|
|
178
|
-
|
|
180
|
+
|
|
179
181
|
Args:
|
|
180
182
|
note_id: 笔记 ID
|
|
181
183
|
content: 笔记内容
|
|
182
|
-
|
|
184
|
+
|
|
183
185
|
Returns:
|
|
184
186
|
是否成功添加
|
|
185
187
|
"""
|
|
@@ -187,68 +189,68 @@ class BM25Index:
|
|
|
187
189
|
# 如果已存在,先移除
|
|
188
190
|
if note_id in self.doc_mapping:
|
|
189
191
|
self.remove_document(note_id)
|
|
190
|
-
|
|
192
|
+
|
|
191
193
|
# 分词
|
|
192
194
|
tokens = self._tokenize(content)
|
|
193
195
|
if not tokens:
|
|
194
196
|
return True
|
|
195
|
-
|
|
197
|
+
|
|
196
198
|
# 添加到索引
|
|
197
199
|
idx = len(self.documents)
|
|
198
200
|
self.documents.append(tokens)
|
|
199
201
|
self.doc_ids.append(note_id)
|
|
200
202
|
self.doc_mapping[note_id] = idx
|
|
201
|
-
|
|
203
|
+
|
|
202
204
|
# 重建索引
|
|
203
205
|
self._rebuild_index()
|
|
204
|
-
|
|
206
|
+
|
|
205
207
|
# 保存
|
|
206
208
|
self._save()
|
|
207
|
-
|
|
209
|
+
|
|
208
210
|
return True
|
|
209
|
-
|
|
211
|
+
|
|
210
212
|
except Exception as e:
|
|
211
213
|
logger.error(f"Failed to add document {note_id}: {e}")
|
|
212
214
|
return False
|
|
213
|
-
|
|
215
|
+
|
|
214
216
|
def remove_document(self, note_id: str) -> bool:
|
|
215
217
|
"""
|
|
216
218
|
从索引中移除文档
|
|
217
|
-
|
|
219
|
+
|
|
218
220
|
Args:
|
|
219
221
|
note_id: 笔记 ID
|
|
220
|
-
|
|
222
|
+
|
|
221
223
|
Returns:
|
|
222
224
|
是否成功移除
|
|
223
225
|
"""
|
|
224
226
|
try:
|
|
225
227
|
if note_id not in self.doc_mapping:
|
|
226
228
|
return True
|
|
227
|
-
|
|
229
|
+
|
|
228
230
|
idx = self.doc_mapping[note_id]
|
|
229
|
-
|
|
231
|
+
|
|
230
232
|
# 移除数据
|
|
231
233
|
self.documents.pop(idx)
|
|
232
234
|
self.doc_ids.pop(idx)
|
|
233
235
|
del self.doc_mapping[note_id]
|
|
234
|
-
|
|
236
|
+
|
|
235
237
|
# 更新其他文档的索引
|
|
236
238
|
self.doc_mapping = {}
|
|
237
239
|
for i, doc_id in enumerate(self.doc_ids):
|
|
238
240
|
self.doc_mapping[doc_id] = i
|
|
239
|
-
|
|
241
|
+
|
|
240
242
|
# 重建索引
|
|
241
243
|
self._rebuild_index()
|
|
242
|
-
|
|
244
|
+
|
|
243
245
|
# 保存
|
|
244
246
|
self._save()
|
|
245
|
-
|
|
247
|
+
|
|
246
248
|
return True
|
|
247
|
-
|
|
249
|
+
|
|
248
250
|
except Exception as e:
|
|
249
251
|
logger.error(f"Failed to remove document {note_id}: {e}")
|
|
250
252
|
return False
|
|
251
|
-
|
|
253
|
+
|
|
252
254
|
def add_documents_batch(self, documents: List[Tuple[str, str]]) -> bool:
|
|
253
255
|
"""
|
|
254
256
|
批量添加文档到索引(高效版本)
|
|
@@ -309,7 +311,7 @@ class BM25Index:
|
|
|
309
311
|
logger.info(f"Batch added {len(documents)} documents to BM25 index")
|
|
310
312
|
return True
|
|
311
313
|
|
|
312
|
-
except Exception
|
|
314
|
+
except Exception:
|
|
313
315
|
# 恢复到批次前的状态
|
|
314
316
|
self.documents = saved_docs
|
|
315
317
|
self.doc_ids = saved_ids
|
|
@@ -324,114 +326,116 @@ class BM25Index:
|
|
|
324
326
|
def search(self, query: str, top_k: int = 5) -> List[Dict]:
|
|
325
327
|
"""
|
|
326
328
|
搜索文档
|
|
327
|
-
|
|
329
|
+
|
|
328
330
|
Args:
|
|
329
331
|
query: 搜索查询
|
|
330
332
|
top_k: 返回结果数量
|
|
331
|
-
|
|
333
|
+
|
|
332
334
|
Returns:
|
|
333
335
|
搜索结果列表,每项包含 note_id 和 score
|
|
334
336
|
"""
|
|
335
337
|
if not self.bm25 or not self.documents:
|
|
336
338
|
return []
|
|
337
|
-
|
|
339
|
+
|
|
338
340
|
try:
|
|
339
341
|
# 分词
|
|
340
342
|
query_tokens = self._tokenize(query)
|
|
341
343
|
if not query_tokens:
|
|
342
344
|
return []
|
|
343
|
-
|
|
345
|
+
|
|
344
346
|
# BM25 搜索
|
|
345
347
|
scores = self.bm25.get_scores(query_tokens)
|
|
346
|
-
|
|
348
|
+
|
|
347
349
|
# 获取 top_k 结果
|
|
348
350
|
top_indices = sorted(range(len(scores)), key=lambda i: scores[i], reverse=True)[:top_k]
|
|
349
|
-
|
|
351
|
+
|
|
350
352
|
results = []
|
|
351
353
|
for idx in top_indices:
|
|
352
354
|
# BM25 分数可能为负,只要大于最小值就返回
|
|
353
355
|
if scores[idx] > -10: # 使用合理的阈值
|
|
354
|
-
results.append(
|
|
355
|
-
|
|
356
|
-
|
|
357
|
-
|
|
358
|
-
|
|
356
|
+
results.append(
|
|
357
|
+
{
|
|
358
|
+
"note_id": self.doc_ids[idx],
|
|
359
|
+
"score": float(scores[idx]),
|
|
360
|
+
}
|
|
361
|
+
)
|
|
362
|
+
|
|
359
363
|
return results
|
|
360
|
-
|
|
364
|
+
|
|
361
365
|
except Exception as e:
|
|
362
366
|
logger.error(f"BM25 search failed: {e}")
|
|
363
367
|
return []
|
|
364
|
-
|
|
368
|
+
|
|
365
369
|
def rebuild_from_notes(self, notes: List) -> bool:
|
|
366
370
|
"""
|
|
367
371
|
从笔记列表全量重建索引
|
|
368
|
-
|
|
372
|
+
|
|
369
373
|
Args:
|
|
370
374
|
notes: Note 对象列表
|
|
371
|
-
|
|
375
|
+
|
|
372
376
|
Returns:
|
|
373
377
|
是否成功重建
|
|
374
378
|
"""
|
|
375
379
|
try:
|
|
376
380
|
self._reset()
|
|
377
|
-
|
|
381
|
+
|
|
378
382
|
for note in notes:
|
|
379
383
|
# 组合标题和内容
|
|
380
384
|
content = f"{note.title} {note.content}"
|
|
381
385
|
tokens = self._tokenize(content)
|
|
382
|
-
|
|
386
|
+
|
|
383
387
|
if tokens:
|
|
384
388
|
idx = len(self.documents)
|
|
385
389
|
self.documents.append(tokens)
|
|
386
390
|
self.doc_ids.append(note.id)
|
|
387
391
|
self.doc_mapping[note.id] = idx
|
|
388
|
-
|
|
392
|
+
|
|
389
393
|
# 构建索引
|
|
390
394
|
self._rebuild_index()
|
|
391
|
-
|
|
395
|
+
|
|
392
396
|
# 保存
|
|
393
397
|
self._save()
|
|
394
|
-
|
|
398
|
+
|
|
395
399
|
logger.info(f"Rebuilt BM25 index from {len(notes)} notes")
|
|
396
400
|
return True
|
|
397
|
-
|
|
401
|
+
|
|
398
402
|
except Exception as e:
|
|
399
403
|
logger.error(f"Failed to rebuild BM25 index: {e}")
|
|
400
404
|
return False
|
|
401
|
-
|
|
405
|
+
|
|
402
406
|
def get_stats(self) -> Dict:
|
|
403
407
|
"""
|
|
404
408
|
获取索引统计信息
|
|
405
|
-
|
|
409
|
+
|
|
406
410
|
Returns:
|
|
407
411
|
统计信息字典
|
|
408
412
|
"""
|
|
409
413
|
return {
|
|
410
|
-
|
|
411
|
-
|
|
412
|
-
|
|
413
|
-
|
|
414
|
+
"indexed": len(self.doc_ids),
|
|
415
|
+
"version": self.INDEX_VERSION,
|
|
416
|
+
"index_path": str(self.index_path),
|
|
417
|
+
"index_exists": self.index_path.exists(),
|
|
414
418
|
}
|
|
415
|
-
|
|
419
|
+
|
|
416
420
|
def clear(self) -> bool:
|
|
417
421
|
"""
|
|
418
422
|
清空索引
|
|
419
|
-
|
|
423
|
+
|
|
420
424
|
Returns:
|
|
421
425
|
是否成功清空
|
|
422
426
|
"""
|
|
423
427
|
try:
|
|
424
428
|
self._reset()
|
|
425
|
-
|
|
429
|
+
|
|
426
430
|
# 删除文件
|
|
427
431
|
if self.index_path.exists():
|
|
428
432
|
self.index_path.unlink()
|
|
429
433
|
if self.metadata_path.exists():
|
|
430
434
|
self.metadata_path.unlink()
|
|
431
|
-
|
|
435
|
+
|
|
432
436
|
logger.info("Cleared BM25 index")
|
|
433
437
|
return True
|
|
434
|
-
|
|
438
|
+
|
|
435
439
|
except Exception as e:
|
|
436
440
|
logger.error(f"Failed to clear BM25 index: {e}")
|
|
437
441
|
return False
|
|
@@ -444,7 +448,7 @@ _bm25_index: Optional[BM25Index] = None
|
|
|
444
448
|
def get_bm25_index() -> BM25Index:
|
|
445
449
|
"""
|
|
446
450
|
获取 BM25 索引实例(单例模式)
|
|
447
|
-
|
|
451
|
+
|
|
448
452
|
Returns:
|
|
449
453
|
BM25Index 实例
|
|
450
454
|
"""
|