knowledge-rag 3.6.2__tar.gz → 3.8.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {knowledge_rag-3.6.2 → knowledge_rag-3.8.0}/PKG-INFO +77 -10
- {knowledge_rag-3.6.2 → knowledge_rag-3.8.0}/README.md +75 -8
- {knowledge_rag-3.6.2 → knowledge_rag-3.8.0}/mcp_server/__init__.py +1 -1
- {knowledge_rag-3.6.2 → knowledge_rag-3.8.0}/mcp_server/config.py +5 -4
- knowledge_rag-3.8.0/mcp_server/guarded.py +10 -0
- knowledge_rag-3.8.0/mcp_server/instance_lock.py +188 -0
- knowledge_rag-3.8.0/mcp_server/preflight.py +74 -0
- {knowledge_rag-3.6.2 → knowledge_rag-3.8.0}/mcp_server/server.py +104 -54
- {knowledge_rag-3.6.2 → knowledge_rag-3.8.0}/npm/README.md +1 -1
- {knowledge_rag-3.6.2 → knowledge_rag-3.8.0}/pyproject.toml +8 -2
- {knowledge_rag-3.6.2 → knowledge_rag-3.8.0}/requirements.txt +4 -4
- {knowledge_rag-3.6.2 → knowledge_rag-3.8.0}/.gitignore +0 -0
- {knowledge_rag-3.6.2 → knowledge_rag-3.8.0}/LICENSE +0 -0
- {knowledge_rag-3.6.2 → knowledge_rag-3.8.0}/config.example.yaml +0 -0
- {knowledge_rag-3.6.2 → knowledge_rag-3.8.0}/documents/examples/sample-document.md +0 -0
- {knowledge_rag-3.6.2 → knowledge_rag-3.8.0}/mcp_server/ingestion.py +0 -0
- {knowledge_rag-3.6.2 → knowledge_rag-3.8.0}/presets/cybersecurity.yaml +0 -0
- {knowledge_rag-3.6.2 → knowledge_rag-3.8.0}/presets/developer.yaml +0 -0
- {knowledge_rag-3.6.2 → knowledge_rag-3.8.0}/presets/general.yaml +0 -0
- {knowledge_rag-3.6.2 → knowledge_rag-3.8.0}/presets/research.yaml +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: knowledge-rag
|
|
3
|
-
Version: 3.
|
|
3
|
+
Version: 3.8.0
|
|
4
4
|
Summary: Local RAG System for Claude Code — Hybrid search + Cross-encoder Reranking + 12 MCP Tools + 20 Format Parsers. Zero external servers.
|
|
5
5
|
Project-URL: Homepage, https://github.com/lyonzin/knowledge-rag
|
|
6
6
|
Project-URL: Repository, https://github.com/lyonzin/knowledge-rag
|
|
@@ -30,7 +30,7 @@ Requires-Dist: python-docx>=1.0.0
|
|
|
30
30
|
Requires-Dist: python-pptx>=1.0.0
|
|
31
31
|
Requires-Dist: pyyaml>=6.0
|
|
32
32
|
Requires-Dist: rank-bm25>=0.2.2
|
|
33
|
-
Requires-Dist: requests>=2.
|
|
33
|
+
Requires-Dist: requests>=2.33.0
|
|
34
34
|
Requires-Dist: watchdog>=4.0.0
|
|
35
35
|
Provides-Extra: gpu
|
|
36
36
|
Requires-Dist: onnxruntime-gpu>=1.14.0; extra == 'gpu'
|
|
@@ -40,7 +40,9 @@ Description-Content-Type: text/markdown
|
|
|
40
40
|
|
|
41
41
|
<div align="center">
|
|
42
42
|
|
|
43
|
-
](https://pypi.org/project/knowledge-rag/)
|
|
44
|
+
[](https://www.npmjs.com/package/knowledge-rag)
|
|
45
|
+
[](https://pepy.tech/projects/knowledge-rag)
|
|
44
46
|

|
|
45
47
|

|
|
46
48
|

|
|
@@ -48,7 +50,6 @@ Description-Content-Type: text/markdown
|
|
|
48
50
|
[](https://github.com/lyonzin/knowledge-rag/actions/workflows/ci.yml)
|
|
49
51
|
[](https://github.com/lyonzin/knowledge-rag/actions/workflows/security.yml)
|
|
50
52
|
[](https://glama.ai/mcp/servers/lyonzin/knowledge-rag)
|
|
51
|
-
[](https://pypi.org/project/knowledge-rag/)
|
|
52
53
|
|
|
53
54
|
### Your docs, your machine, zero cloud. Claude Code searches them natively.
|
|
54
55
|
|
|
@@ -70,11 +71,21 @@ pip install knowledge-rag → restart Claude Code → search_knowledge("your que
|
|
|
70
71
|
|
|
71
72
|
---
|
|
72
73
|
|
|
73
|
-
## What's New in v3.
|
|
74
|
+
## What's New in v3.8.0
|
|
75
|
+
|
|
76
|
+
### Lazy-Loaded Embeddings — Cheaper Idle Processes
|
|
74
77
|
|
|
75
|
-
|
|
78
|
+
The FastEmbed ONNX model (~200MB resident) now loads on the **first query**, not at startup. Idle `knowledge-rag` processes are now genuinely cheap. Why this matters: MCP stdio is one-process-per-client by protocol — multiple Claude Code windows, Claude Desktop + IDE simultaneously, or review/approval flows that open extra connections all spawn their own processes. Before v3.8.0, every one of them paid the full embedding-model cost up front. Now only processes that actually serve queries load the model. Public API is unchanged.
|
|
76
79
|
|
|
77
|
-
|
|
80
|
+
### Opt-In Single-Instance Guard
|
|
81
|
+
|
|
82
|
+
For users who measured their setup and want a hard cap of one server per `data_dir`:
|
|
83
|
+
|
|
84
|
+
```bash
|
|
85
|
+
export KNOWLEDGE_RAG_SINGLE_INSTANCE=1
|
|
86
|
+
```
|
|
87
|
+
|
|
88
|
+
A second instance exits immediately with code 75. The guard is **OFF by default**, so multi-client MCP usage continues to work unchanged. Stale locks left behind by a dead PID are recovered automatically, and SIGINT/SIGTERM handlers remove the lock on shutdown. Full guide in [docs/single-instance.md](docs/single-instance.md). Sample MCP config in [examples/mcp-config-single-instance.json](examples/mcp-config-single-instance.json).
|
|
78
89
|
|
|
79
90
|
### 5 Ways to Install
|
|
80
91
|
|
|
@@ -90,6 +101,7 @@ All methods produce the same MCP server. See [Installation](#installation) for f
|
|
|
90
101
|
|
|
91
102
|
### Recent Highlights
|
|
92
103
|
|
|
104
|
+
- **v3.8.0** — Lazy-load embeddings, opt-in single-instance guard, version sync across PyPI/NPM/Docker
|
|
93
105
|
- **v3.6.0** — Multi-language code parsing (C/C++/JS/TS/XML), NPM wrapper, Docker image, automated release pipeline
|
|
94
106
|
- **v3.5.2** — CUDA DLL auto-discovery from pip packages, graceful GPU→CPU fallback, explicit CPU provider (no CUDA noise when `gpu: false`), BASE_DIR resolution fix for editable installs
|
|
95
107
|
- **v3.5.1** — Remove Python `<3.13` upper bound — 3.13 and 3.14 now supported
|
|
@@ -809,7 +821,7 @@ models:
|
|
|
809
821
|
dimensions: 384
|
|
810
822
|
gpu: false # Set true + pip install knowledge-rag[gpu]
|
|
811
823
|
reranker:
|
|
812
|
-
enabled: true #
|
|
824
|
+
enabled: true # Falls back to RRF if model is unavailable
|
|
813
825
|
model: "Xenova/ms-marco-MiniLM-L-6-v2"
|
|
814
826
|
top_k_multiplier: 3 # Candidates fetched before reranking
|
|
815
827
|
|
|
@@ -896,6 +908,8 @@ For `.md` files, chunking splits at `##` and `###` header boundaries first. Sect
|
|
|
896
908
|
| `models.reranker.model` | `Xenova/ms-marco-MiniLM-L-6-v2` | Reranker model |
|
|
897
909
|
| `models.reranker.top_k_multiplier` | 3 | Fetch N*multiplier candidates for reranking |
|
|
898
910
|
|
|
911
|
+
If the reranker model is not available locally and the machine cannot download it, search now falls back to the RRF order from hybrid semantic+BM25 retrieval. This keeps `search_knowledge` available offline, but result ordering may be less precise for ambiguous queries until the reranker model is cached.
|
|
912
|
+
|
|
899
913
|
**Embedding model options** (fastest → most accurate):
|
|
900
914
|
- `BAAI/bge-small-en-v1.5` — 384D, ~33MB (default)
|
|
901
915
|
- `BAAI/bge-base-en-v1.5` — 768D, ~130MB
|
|
@@ -1026,6 +1040,31 @@ rm -rf models_cache
|
|
|
1026
1040
|
# Then restart the MCP server
|
|
1027
1041
|
```
|
|
1028
1042
|
|
|
1043
|
+
### Reranker model download fails
|
|
1044
|
+
|
|
1045
|
+
The reranker is lazy-loaded on the first query. If the model is not cached and the machine is offline, search continues without reranking and uses the RRF order from hybrid retrieval. To keep reranking enabled offline, run one query while online or pre-populate `models_cache/` on the target machine.
|
|
1046
|
+
|
|
1047
|
+
You can still disable reranking explicitly in `config.yaml`:
|
|
1048
|
+
|
|
1049
|
+
```yaml
|
|
1050
|
+
models:
|
|
1051
|
+
reranker:
|
|
1052
|
+
enabled: false
|
|
1053
|
+
```
|
|
1054
|
+
|
|
1055
|
+
Disabling reranking reduces memory use and avoids first-query model loading. The tradeoff is lower ranking precision, especially when several chunks match the same terms but only one is the best answer.
|
|
1056
|
+
|
|
1057
|
+
### ChromaDB index crashes on startup
|
|
1058
|
+
|
|
1059
|
+
Native ChromaDB failures can terminate Python before normal exception handling runs. Startup now probes ChromaDB in a child process before initializing the MCP server. If the probe crashes, the active `chroma_db/` and `index_metadata.json` are moved to `data/backups/auto-repair-*`, and the next startup can rebuild a clean index.
|
|
1060
|
+
|
|
1061
|
+
The same guarded behavior is available through either console script:
|
|
1062
|
+
|
|
1063
|
+
```bash
|
|
1064
|
+
knowledge-rag
|
|
1065
|
+
knowledge-rag-guarded
|
|
1066
|
+
```
|
|
1067
|
+
|
|
1029
1068
|
### Index is empty
|
|
1030
1069
|
|
|
1031
1070
|
```bash
|
|
@@ -1056,16 +1095,44 @@ pip install --upgrade knowledge-rag
|
|
|
1056
1095
|
|
|
1057
1096
|
### Slow first query
|
|
1058
1097
|
|
|
1059
|
-
The cross-encoder reranker model is lazy-loaded on the first query. This adds a one-time ~2-3 second delay for model download and loading. Subsequent queries are fast.
|
|
1098
|
+
The cross-encoder reranker model is lazy-loaded on the first query. This adds a one-time ~2-3 second delay for model download and loading. Subsequent queries are fast. If the model cannot be loaded, search falls back to RRF ordering and does not retry loading the reranker until the server restarts.
|
|
1060
1099
|
|
|
1061
1100
|
### Memory usage
|
|
1062
1101
|
|
|
1063
|
-
With ~200 documents, expect ~300-500MB RAM. The embedding model (~
|
|
1102
|
+
With ~200 documents, expect ~300-500MB RAM. The embedding model (~200MB ONNX runtime resident, lazy-loaded on first query since v3.8.0) and reranker (~25MB, lazy-loaded) are loaded into memory only when actually used. For very large knowledge bases (1000+ documents), consider enabling GPU acceleration and using exclude patterns to limit index scope.
|
|
1103
|
+
|
|
1104
|
+
### Multiple MCP clients spawn duplicate servers
|
|
1105
|
+
|
|
1106
|
+
MCP stdio is one process per client by protocol — multiple Claude Code windows, Claude Desktop + IDE, etc. each spawn their own `knowledge-rag` process. Since v3.8.0 idle processes are cheap (no embedding model loaded until first query). If you've measured and want a hard cap of one server per data directory, opt in:
|
|
1107
|
+
|
|
1108
|
+
```bash
|
|
1109
|
+
export KNOWLEDGE_RAG_SINGLE_INSTANCE=1
|
|
1110
|
+
```
|
|
1111
|
+
|
|
1112
|
+
A second instance exits immediately with code 75. Default is OFF (multi-client friendly). Full guide: [docs/single-instance.md](docs/single-instance.md). Sample MCP config: [examples/mcp-config-single-instance.json](examples/mcp-config-single-instance.json).
|
|
1064
1113
|
|
|
1065
1114
|
---
|
|
1066
1115
|
|
|
1067
1116
|
## Changelog
|
|
1068
1117
|
|
|
1118
|
+
### v3.8.0 (2026-05-10)
|
|
1119
|
+
|
|
1120
|
+
- **NEW**: Lazy-load FastEmbed embedding model (~200MB ONNX runtime). Loads on first query instead of startup — idle `knowledge-rag` processes are now cheap, which matters when MCP stdio clients spawn parallel server processes (multiple Claude Code windows, Claude Desktop + IDE, etc.). Public API unchanged. (#32)
|
|
1121
|
+
- **NEW**: Opt-in single-instance guard via `KNOWLEDGE_RAG_SINGLE_INSTANCE=1` env var. **OFF by default** — multi-client MCP usage continues to work unchanged. When enabled, a second server process for the same `data_dir` exits with code 75 (`EX_TEMPFAIL`). Includes stale-PID recovery and SIGINT/SIGTERM handlers. See [docs/single-instance.md](docs/single-instance.md). (#33, original concept by @Hohlas in #31)
|
|
1122
|
+
- **NEW**: `examples/mcp-config-single-instance.json` — sample MCP client config for the opt-in guard.
|
|
1123
|
+
- **DOCS**: New `docs/single-instance.md` — when to use, when NOT to use, troubleshooting, full activation reference.
|
|
1124
|
+
- **DOCS**: README troubleshooting section for "Multiple MCP clients spawn duplicate servers" + memory-usage note for lazy embeddings.
|
|
1125
|
+
- **CHORE**: Sync version across `pyproject.toml`, `mcp_server/__init__.py`, and `npm/package.json` (was drifting since v3.5.x).
|
|
1126
|
+
- **CHORE**: pytest `tmp_path_retention_count=1` to avoid Windows atexit cleanup race in CI.
|
|
1127
|
+
- **ROADMAP**: Tracked v4.0 shared-service architecture (one daemon, many thin MCP clients) as the long-term fix for multi-process resource duplication. (#34)
|
|
1128
|
+
|
|
1129
|
+
### Unreleased
|
|
1130
|
+
|
|
1131
|
+
- **FIX**: Startup preflight probes ChromaDB in a child process and moves crashing persistent indexes to `data/backups/auto-repair-*` before MCP initialization.
|
|
1132
|
+
- **FIX**: Reranker load failures now fall back to RRF ordering instead of failing `search_knowledge` on offline machines.
|
|
1133
|
+
- **FIX**: Virtualenv project-root detection now handles Python symlinks that resolve to the system interpreter.
|
|
1134
|
+
- **NEW**: `knowledge-rag-guarded` console script kept as an explicit guarded startup alias.
|
|
1135
|
+
|
|
1069
1136
|
### v3.6.2 (2026-04-23)
|
|
1070
1137
|
|
|
1071
1138
|
- **INFRA**: NPM provenance attestation (SLSA supply chain security), full README on npm page
|
|
@@ -2,7 +2,9 @@
|
|
|
2
2
|
|
|
3
3
|
<div align="center">
|
|
4
4
|
|
|
5
|
-
](https://pypi.org/project/knowledge-rag/)
|
|
6
|
+
[](https://www.npmjs.com/package/knowledge-rag)
|
|
7
|
+
[](https://pepy.tech/projects/knowledge-rag)
|
|
6
8
|

|
|
7
9
|

|
|
8
10
|

|
|
@@ -10,7 +12,6 @@
|
|
|
10
12
|
[](https://github.com/lyonzin/knowledge-rag/actions/workflows/ci.yml)
|
|
11
13
|
[](https://github.com/lyonzin/knowledge-rag/actions/workflows/security.yml)
|
|
12
14
|
[](https://glama.ai/mcp/servers/lyonzin/knowledge-rag)
|
|
13
|
-
[](https://pypi.org/project/knowledge-rag/)
|
|
14
15
|
|
|
15
16
|
### Your docs, your machine, zero cloud. Claude Code searches them natively.
|
|
16
17
|
|
|
@@ -32,11 +33,21 @@ pip install knowledge-rag → restart Claude Code → search_knowledge("your que
|
|
|
32
33
|
|
|
33
34
|
---
|
|
34
35
|
|
|
35
|
-
## What's New in v3.
|
|
36
|
+
## What's New in v3.8.0
|
|
37
|
+
|
|
38
|
+
### Lazy-Loaded Embeddings — Cheaper Idle Processes
|
|
36
39
|
|
|
37
|
-
|
|
40
|
+
The FastEmbed ONNX model (~200MB resident) now loads on the **first query**, not at startup. Idle `knowledge-rag` processes are now genuinely cheap. Why this matters: MCP stdio is one-process-per-client by protocol — multiple Claude Code windows, Claude Desktop + IDE simultaneously, or review/approval flows that open extra connections all spawn their own processes. Before v3.8.0, every one of them paid the full embedding-model cost up front. Now only processes that actually serve queries load the model. Public API is unchanged.
|
|
38
41
|
|
|
39
|
-
|
|
42
|
+
### Opt-In Single-Instance Guard
|
|
43
|
+
|
|
44
|
+
For users who measured their setup and want a hard cap of one server per `data_dir`:
|
|
45
|
+
|
|
46
|
+
```bash
|
|
47
|
+
export KNOWLEDGE_RAG_SINGLE_INSTANCE=1
|
|
48
|
+
```
|
|
49
|
+
|
|
50
|
+
A second instance exits immediately with code 75. The guard is **OFF by default**, so multi-client MCP usage continues to work unchanged. Stale locks left behind by a dead PID are recovered automatically, and SIGINT/SIGTERM handlers remove the lock on shutdown. Full guide in [docs/single-instance.md](docs/single-instance.md). Sample MCP config in [examples/mcp-config-single-instance.json](examples/mcp-config-single-instance.json).
|
|
40
51
|
|
|
41
52
|
### 5 Ways to Install
|
|
42
53
|
|
|
@@ -52,6 +63,7 @@ All methods produce the same MCP server. See [Installation](#installation) for f
|
|
|
52
63
|
|
|
53
64
|
### Recent Highlights
|
|
54
65
|
|
|
66
|
+
- **v3.8.0** — Lazy-load embeddings, opt-in single-instance guard, version sync across PyPI/NPM/Docker
|
|
55
67
|
- **v3.6.0** — Multi-language code parsing (C/C++/JS/TS/XML), NPM wrapper, Docker image, automated release pipeline
|
|
56
68
|
- **v3.5.2** — CUDA DLL auto-discovery from pip packages, graceful GPU→CPU fallback, explicit CPU provider (no CUDA noise when `gpu: false`), BASE_DIR resolution fix for editable installs
|
|
57
69
|
- **v3.5.1** — Remove Python `<3.13` upper bound — 3.13 and 3.14 now supported
|
|
@@ -771,7 +783,7 @@ models:
|
|
|
771
783
|
dimensions: 384
|
|
772
784
|
gpu: false # Set true + pip install knowledge-rag[gpu]
|
|
773
785
|
reranker:
|
|
774
|
-
enabled: true #
|
|
786
|
+
enabled: true # Falls back to RRF if model is unavailable
|
|
775
787
|
model: "Xenova/ms-marco-MiniLM-L-6-v2"
|
|
776
788
|
top_k_multiplier: 3 # Candidates fetched before reranking
|
|
777
789
|
|
|
@@ -858,6 +870,8 @@ For `.md` files, chunking splits at `##` and `###` header boundaries first. Sect
|
|
|
858
870
|
| `models.reranker.model` | `Xenova/ms-marco-MiniLM-L-6-v2` | Reranker model |
|
|
859
871
|
| `models.reranker.top_k_multiplier` | 3 | Fetch N*multiplier candidates for reranking |
|
|
860
872
|
|
|
873
|
+
If the reranker model is not available locally and the machine cannot download it, search now falls back to the RRF order from hybrid semantic+BM25 retrieval. This keeps `search_knowledge` available offline, but result ordering may be less precise for ambiguous queries until the reranker model is cached.
|
|
874
|
+
|
|
861
875
|
**Embedding model options** (fastest → most accurate):
|
|
862
876
|
- `BAAI/bge-small-en-v1.5` — 384D, ~33MB (default)
|
|
863
877
|
- `BAAI/bge-base-en-v1.5` — 768D, ~130MB
|
|
@@ -988,6 +1002,31 @@ rm -rf models_cache
|
|
|
988
1002
|
# Then restart the MCP server
|
|
989
1003
|
```
|
|
990
1004
|
|
|
1005
|
+
### Reranker model download fails
|
|
1006
|
+
|
|
1007
|
+
The reranker is lazy-loaded on the first query. If the model is not cached and the machine is offline, search continues without reranking and uses the RRF order from hybrid retrieval. To keep reranking enabled offline, run one query while online or pre-populate `models_cache/` on the target machine.
|
|
1008
|
+
|
|
1009
|
+
You can still disable reranking explicitly in `config.yaml`:
|
|
1010
|
+
|
|
1011
|
+
```yaml
|
|
1012
|
+
models:
|
|
1013
|
+
reranker:
|
|
1014
|
+
enabled: false
|
|
1015
|
+
```
|
|
1016
|
+
|
|
1017
|
+
Disabling reranking reduces memory use and avoids first-query model loading. The tradeoff is lower ranking precision, especially when several chunks match the same terms but only one is the best answer.
|
|
1018
|
+
|
|
1019
|
+
### ChromaDB index crashes on startup
|
|
1020
|
+
|
|
1021
|
+
Native ChromaDB failures can terminate Python before normal exception handling runs. Startup now probes ChromaDB in a child process before initializing the MCP server. If the probe crashes, the active `chroma_db/` and `index_metadata.json` are moved to `data/backups/auto-repair-*`, and the next startup can rebuild a clean index.
|
|
1022
|
+
|
|
1023
|
+
The same guarded behavior is available through either console script:
|
|
1024
|
+
|
|
1025
|
+
```bash
|
|
1026
|
+
knowledge-rag
|
|
1027
|
+
knowledge-rag-guarded
|
|
1028
|
+
```
|
|
1029
|
+
|
|
991
1030
|
### Index is empty
|
|
992
1031
|
|
|
993
1032
|
```bash
|
|
@@ -1018,16 +1057,44 @@ pip install --upgrade knowledge-rag
|
|
|
1018
1057
|
|
|
1019
1058
|
### Slow first query
|
|
1020
1059
|
|
|
1021
|
-
The cross-encoder reranker model is lazy-loaded on the first query. This adds a one-time ~2-3 second delay for model download and loading. Subsequent queries are fast.
|
|
1060
|
+
The cross-encoder reranker model is lazy-loaded on the first query. This adds a one-time ~2-3 second delay for model download and loading. Subsequent queries are fast. If the model cannot be loaded, search falls back to RRF ordering and does not retry loading the reranker until the server restarts.
|
|
1022
1061
|
|
|
1023
1062
|
### Memory usage
|
|
1024
1063
|
|
|
1025
|
-
With ~200 documents, expect ~300-500MB RAM. The embedding model (~
|
|
1064
|
+
With ~200 documents, expect ~300-500MB RAM. The embedding model (~200MB ONNX runtime resident, lazy-loaded on first query since v3.8.0) and reranker (~25MB, lazy-loaded) are loaded into memory only when actually used. For very large knowledge bases (1000+ documents), consider enabling GPU acceleration and using exclude patterns to limit index scope.
|
|
1065
|
+
|
|
1066
|
+
### Multiple MCP clients spawn duplicate servers
|
|
1067
|
+
|
|
1068
|
+
MCP stdio is one process per client by protocol — multiple Claude Code windows, Claude Desktop + IDE, etc. each spawn their own `knowledge-rag` process. Since v3.8.0 idle processes are cheap (no embedding model loaded until first query). If you've measured and want a hard cap of one server per data directory, opt in:
|
|
1069
|
+
|
|
1070
|
+
```bash
|
|
1071
|
+
export KNOWLEDGE_RAG_SINGLE_INSTANCE=1
|
|
1072
|
+
```
|
|
1073
|
+
|
|
1074
|
+
A second instance exits immediately with code 75. Default is OFF (multi-client friendly). Full guide: [docs/single-instance.md](docs/single-instance.md). Sample MCP config: [examples/mcp-config-single-instance.json](examples/mcp-config-single-instance.json).
|
|
1026
1075
|
|
|
1027
1076
|
---
|
|
1028
1077
|
|
|
1029
1078
|
## Changelog
|
|
1030
1079
|
|
|
1080
|
+
### v3.8.0 (2026-05-10)
|
|
1081
|
+
|
|
1082
|
+
- **NEW**: Lazy-load FastEmbed embedding model (~200MB ONNX runtime). Loads on first query instead of startup — idle `knowledge-rag` processes are now cheap, which matters when MCP stdio clients spawn parallel server processes (multiple Claude Code windows, Claude Desktop + IDE, etc.). Public API unchanged. (#32)
|
|
1083
|
+
- **NEW**: Opt-in single-instance guard via `KNOWLEDGE_RAG_SINGLE_INSTANCE=1` env var. **OFF by default** — multi-client MCP usage continues to work unchanged. When enabled, a second server process for the same `data_dir` exits with code 75 (`EX_TEMPFAIL`). Includes stale-PID recovery and SIGINT/SIGTERM handlers. See [docs/single-instance.md](docs/single-instance.md). (#33, original concept by @Hohlas in #31)
|
|
1084
|
+
- **NEW**: `examples/mcp-config-single-instance.json` — sample MCP client config for the opt-in guard.
|
|
1085
|
+
- **DOCS**: New `docs/single-instance.md` — when to use, when NOT to use, troubleshooting, full activation reference.
|
|
1086
|
+
- **DOCS**: README troubleshooting section for "Multiple MCP clients spawn duplicate servers" + memory-usage note for lazy embeddings.
|
|
1087
|
+
- **CHORE**: Sync version across `pyproject.toml`, `mcp_server/__init__.py`, and `npm/package.json` (was drifting since v3.5.x).
|
|
1088
|
+
- **CHORE**: pytest `tmp_path_retention_count=1` to avoid Windows atexit cleanup race in CI.
|
|
1089
|
+
- **ROADMAP**: Tracked v4.0 shared-service architecture (one daemon, many thin MCP clients) as the long-term fix for multi-process resource duplication. (#34)
|
|
1090
|
+
|
|
1091
|
+
### Unreleased
|
|
1092
|
+
|
|
1093
|
+
- **FIX**: Startup preflight probes ChromaDB in a child process and moves crashing persistent indexes to `data/backups/auto-repair-*` before MCP initialization.
|
|
1094
|
+
- **FIX**: Reranker load failures now fall back to RRF ordering instead of failing `search_knowledge` on offline machines.
|
|
1095
|
+
- **FIX**: Virtualenv project-root detection now handles Python symlinks that resolve to the system interpreter.
|
|
1096
|
+
- **NEW**: `knowledge-rag-guarded` console script kept as an explicit guarded startup alias.
|
|
1097
|
+
|
|
1031
1098
|
### v3.6.2 (2026-04-23)
|
|
1032
1099
|
|
|
1033
1100
|
- **INFRA**: NPM provenance attestation (SLSA supply chain security), full README on npm page
|
|
@@ -54,10 +54,11 @@ def _has_documents(path: Path) -> bool:
|
|
|
54
54
|
|
|
55
55
|
def _venv_project_dir():
    """Locate the project root by looking for a virtualenv directory.

    Checks ``sys.prefix``, ``sys.executable`` and the symlink-resolved
    executable, in that order. For each one, the path itself and then its
    ancestors are inspected for a directory named ``venv``, ``.venv``,
    ``env`` or ``.env``.

    Returns:
        The parent of the first matching directory, or ``None`` when no
        candidate path lies inside such a directory.
    """
    venv_names = ("venv", ".venv", "env", ".env")
    interpreter = Path(sys.executable)
    for start in (Path(sys.prefix), interpreter, interpreter.resolve()):
        match = next(
            (node for node in (start, *start.parents) if node.name in venv_names),
            None,
        )
        if match is not None:
            return match.parent
    return None
|
|
62
63
|
|
|
63
64
|
|
|
@@ -0,0 +1,188 @@
|
|
|
1
|
+
"""Optional single-instance guard for the MCP server process.
|
|
2
|
+
|
|
3
|
+
Background
|
|
4
|
+
----------
|
|
5
|
+
MCP stdio servers are 1-process-per-client by protocol design. Multiple
|
|
6
|
+
Claude Code windows, Claude Desktop + IDE running simultaneously, or clients
|
|
7
|
+
that open extra internal connections during approval/review flows will all
|
|
8
|
+
spawn additional `knowledge-rag` processes. Each process holds its own
|
|
9
|
+
embedding model, ChromaDB client, BM25 state, and file watcher.
|
|
10
|
+
|
|
11
|
+
Lazy-loading the embedding model (v3.8.0) reduces idle cost dramatically,
|
|
12
|
+
but some users still want a hard cap of one process per data directory.
|
|
13
|
+
This module provides that cap as an OPT-IN, never as a default.
|
|
14
|
+
|
|
15
|
+
Activation
|
|
16
|
+
----------
|
|
17
|
+
Set the environment variable in your MCP client config:
|
|
18
|
+
|
|
19
|
+
KNOWLEDGE_RAG_SINGLE_INSTANCE=1 # also accepts: true, yes, on (case-insensitive)
|
|
20
|
+
|
|
21
|
+
When unset (default), `single_instance_lock()` is a no-op and the server
|
|
22
|
+
behaves exactly as it did before this module existed.
|
|
23
|
+
|
|
24
|
+
When enabled, the server creates `<data_dir>/knowledge-rag.lock` containing
|
|
25
|
+
its PID. A second process starting against the same `data_dir` will detect
|
|
26
|
+
the live PID and exit with code 75 (EX_TEMPFAIL). Stale locks (PID gone)
|
|
27
|
+
are cleaned up automatically.
|
|
28
|
+
|
|
29
|
+
Cleanup is wired in three places so the lock does not outlive the process:
|
|
30
|
+
1. Normal exit: contextmanager `finally` block removes the lock.
|
|
31
|
+
2. SIGINT / SIGTERM: handlers remove the lock, restore the previous handler, and re-send the signal so its original disposition runs.
|
|
32
|
+
3. Crash / SIGKILL: stale-PID detection on the next startup removes it.
|
|
33
|
+
|
|
34
|
+
Authors
|
|
35
|
+
-------
|
|
36
|
+
- Concept and original guard: Sergey Khokhlov (@Hohlas) in PR #31
|
|
37
|
+
- Reworked as opt-in + signal handlers + tests: Lyon. (knowledge-rag maintainer)
|
|
38
|
+
"""
|
|
39
|
+
|
|
40
|
+
from __future__ import annotations
|
|
41
|
+
|
|
42
|
+
import os
|
|
43
|
+
import signal
|
|
44
|
+
from contextlib import contextmanager
|
|
45
|
+
from pathlib import Path
|
|
46
|
+
from typing import Iterator, Optional
|
|
47
|
+
|
|
48
|
+
from .config import config
|
|
49
|
+
|
|
50
|
+
# Name of the lock file created inside the configured data directory.
LOCK_FILENAME = "knowledge-rag.lock"
# Exit status documented for a refused second instance.
ALREADY_RUNNING_EXIT_CODE = 75  # EX_TEMPFAIL from sysexits.h
# Environment variable that switches the single-instance guard on.
ENV_VAR = "KNOWLEDGE_RAG_SINGLE_INSTANCE"
# Values of ENV_VAR (after strip + lower-casing) that count as "enabled".
_TRUTHY = {"1", "true", "yes", "on"}
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
class AlreadyRunningError(RuntimeError):
    """Signal that the single-instance lock could not be acquired.

    Raised when the lock file names a PID that is still alive, or when a
    stale lock file exists but cannot be removed.
    """
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
def single_instance_enabled() -> bool:
    """Report whether the single-instance guard has been switched on.

    The value of the ``KNOWLEDGE_RAG_SINGLE_INSTANCE`` environment variable
    is stripped of surrounding whitespace and lower-cased, then compared
    against ``1``, ``true``, ``yes`` and ``on``.

    Returns:
        True for one of the accepted values; False for everything else,
        including an unset or empty variable.
    """
    setting = os.environ.get(ENV_VAR, "")
    return setting.strip().lower() in _TRUTHY
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
def _pid_is_running_windows(pid: int) -> bool:
    """Check process liveness on Windows without sending it anything."""
    # Imported here so POSIX installs never touch the Windows-only API.
    import ctypes
    from ctypes import wintypes

    process_query_limited_information = 0x1000
    error_access_denied = 5
    still_active = 259

    kernel32 = ctypes.WinDLL("kernel32", use_last_error=True)
    kernel32.OpenProcess.argtypes = (wintypes.DWORD, wintypes.BOOL, wintypes.DWORD)
    kernel32.OpenProcess.restype = wintypes.HANDLE
    kernel32.GetExitCodeProcess.argtypes = (
        wintypes.HANDLE,
        ctypes.POINTER(wintypes.DWORD),
    )
    kernel32.GetExitCodeProcess.restype = wintypes.BOOL
    kernel32.CloseHandle.argtypes = (wintypes.HANDLE,)
    kernel32.CloseHandle.restype = wintypes.BOOL

    handle = kernel32.OpenProcess(process_query_limited_information, False, pid)
    if not handle:
        # Access denied means the process exists but belongs to someone else;
        # any other failure is treated as "no such process".
        return ctypes.get_last_error() == error_access_denied
    try:
        exit_code = wintypes.DWORD()
        if not kernel32.GetExitCodeProcess(handle, ctypes.byref(exit_code)):
            return False
        # A finished process reports its real exit code instead of STILL_ACTIVE.
        return exit_code.value == still_active
    finally:
        kernel32.CloseHandle(handle)


def _pid_is_running(pid: int) -> bool:
    """Return True if a process with PID appears to be alive.

    On POSIX this probes with ``os.kill(pid, 0)``, which delivers no signal.
    On Windows ``os.kill`` has no such probe mode: 0 is ``CTRL_C_EVENT`` and
    other values terminate the target, so the process is queried through
    the Win32 API instead.

    Args:
        pid: Process ID to check. Values <= 0 are never considered running.

    Returns:
        True if the process exists (including when it exists but cannot be
        signalled/opened for permission reasons), False otherwise.
    """
    if pid <= 0:
        return False
    if os.name == "nt":
        return _pid_is_running_windows(pid)
    try:
        os.kill(pid, 0)
    except ProcessLookupError:
        return False
    except PermissionError:
        # Process exists but is owned by another user / has tighter ACLs
        return True
    except (OSError, OverflowError):
        # OverflowError: the number does not fit a platform pid (e.g. a
        # corrupt lock file), so it cannot name a live process.
        return False
    return True
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
def _read_lock_pid(lock_path: Path) -> Optional[int]:
    """Parse the PID stored on the first line of the lock file.

    Args:
        lock_path: Location of the lock file.

    Returns:
        The PID as an int, or ``None`` when the file cannot be read, has no
        non-whitespace content, or its first line is not an integer.
    """
    try:
        contents = lock_path.read_text(encoding="utf-8")
        first_line = contents.strip().splitlines()[0]
        pid = int(first_line)
    except (IndexError, OSError, ValueError):
        return None
    return pid
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
def _lock_path() -> Path:
    """Return the lock file location: ``LOCK_FILENAME`` inside ``config.data_dir``."""
    data_dir = config.data_dir
    return data_dir / LOCK_FILENAME
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
def _remove_if_ours(lock_path: Path) -> None:
    """Delete the lock file, but only while it still records this process's PID.

    Args:
        lock_path: Location of the lock file.
    """
    owner_pid = _read_lock_pid(lock_path)
    if owner_pid != os.getpid():
        return
    try:
        lock_path.unlink()
    except OSError:
        # Best-effort (FileNotFoundError is an OSError too); a leftover file
        # is handled by the stale-PID check on the next startup.
        return
|
|
109
|
+
|
|
110
|
+
|
|
111
|
+
@contextmanager
def single_instance_lock() -> Iterator[Optional[Path]]:
    """Acquire the single-instance lock if the opt-in flag is set.

    No-op when ``KNOWLEDGE_RAG_SINGLE_INSTANCE`` is unset / falsy: yields
    ``None`` without touching the disk or any signal handler.

    When enabled:
    - Creates ``<data_dir>/knowledge-rag.lock`` containing this process's PID.
    - Recovers stale locks (unreadable PID, or PID no longer running).
    - Installs SIGINT/SIGTERM handlers that remove the lock, restore the
      previous handler and re-send the signal.
    - Removes the lock and restores the handlers on exit via ``finally``.

    Yields:
        The lock file path when the guard is active, otherwise ``None``.

    Raises:
        AlreadyRunningError: If another live PID holds the lock, or a stale
            lock file cannot be removed.
    """
    if not single_instance_enabled():
        yield None
        return

    config.data_dir.mkdir(parents=True, exist_ok=True)
    lock_path = _lock_path()

    while True:
        try:
            # O_EXCL makes creation fail if the file already exists, so only
            # one process can win the creation step.
            fd = os.open(lock_path, os.O_CREAT | os.O_EXCL | os.O_WRONLY, 0o644)
        except FileExistsError:
            pid = _read_lock_pid(lock_path)
            if pid is not None and _pid_is_running(pid):
                raise AlreadyRunningError(
                    f"knowledge-rag MCP server is already running (pid {pid}). "
                    f"Refusing to start a second instance because "
                    f"{ENV_VAR} is enabled."
                )
            # NOTE(review): a lock whose PID has not been written yet is also
            # treated as stale here; the create-then-write window is small
            # but not zero. Confirm whether that matters for your clients.
            try:
                lock_path.unlink()
            except FileNotFoundError:
                pass
            except OSError as exc:
                raise AlreadyRunningError(f"Failed to clear stale lock {lock_path}: {exc}") from exc
            continue

        with os.fdopen(fd, "w", encoding="utf-8") as f:
            f.write(f"{os.getpid()}\n")
        break

    # Wire signal handlers so SIGINT/SIGTERM clean up the lock before exit
    previous_handlers: dict[int, object] = {}

    def _signal_cleanup(signum: int, frame) -> None:
        _remove_if_ours(lock_path)
        # Restore the original handler so the re-sent signal is handled the
        # way it would have been without this guard.
        prev = previous_handlers.get(signum, signal.SIG_DFL)
        try:
            signal.signal(signum, prev)  # type: ignore[arg-type]
        except (ValueError, OSError):
            pass
        # Re-send the signal to ourselves so the original disposition fires
        os.kill(os.getpid(), signum)

    for sig_name in ("SIGINT", "SIGTERM"):
        sig = getattr(signal, sig_name, None)
        if sig is None:
            continue
        try:
            current = signal.getsignal(sig)
            if current is None:
                # Handler was installed outside Python: signal.signal() cannot
                # put it back (passing None raises TypeError), so leave it
                # alone and rely on the `finally` / stale-PID cleanup paths.
                continue
            previous_handlers[sig] = current
            signal.signal(sig, _signal_cleanup)
        except (ValueError, OSError):
            # signal.signal raises if not on the main thread; tests may hit this
            pass

    try:
        yield lock_path
    finally:
        _remove_if_ours(lock_path)
        for sig, prev in previous_handlers.items():
            try:
                signal.signal(sig, prev)  # type: ignore[arg-type]
            except (ValueError, OSError):
                pass
|
|
@@ -0,0 +1,74 @@
|
|
|
1
|
+
"""Startup preflight checks for persistent ChromaDB state."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import os
|
|
6
|
+
import shutil
|
|
7
|
+
import subprocess
|
|
8
|
+
import sys
|
|
9
|
+
from datetime import datetime
|
|
10
|
+
from pathlib import Path
|
|
11
|
+
|
|
12
|
+
from .config import BASE_DIR, config
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def _backup_active_index(reason: str) -> Path:
    """Move active ChromaDB state aside so the server can rebuild cleanly.

    A fresh ``backups/auto-repair-<timestamp>`` directory is created under
    ``config.data_dir``. The Chroma directory and ``index_metadata.json`` are
    moved into it when they exist; missing items are silently skipped.

    Args:
        reason: Short tag embedded in the backed-up names
            (``chroma_db.<reason>`` and ``index_metadata.<reason>.json``).

    Returns:
        The newly created backup directory.

    Raises:
        FileExistsError: If no unused backup directory name could be created.
    """
    stamp = datetime.now().strftime("%Y%m%d-%H%M%S")
    backups_root = config.data_dir / "backups"

    # The timestamp only has one-second resolution, so two recoveries in the
    # same second would collide. Retry with a numeric suffix instead of
    # aborting the repair; the bound keeps a broken "backups" path from
    # looping forever.
    max_attempts = 100
    for attempt in range(max_attempts):
        suffix = "" if attempt == 0 else f"-{attempt}"
        backup_dir = backups_root / f"auto-repair-{stamp}{suffix}"
        try:
            backup_dir.mkdir(parents=True, exist_ok=False)
        except FileExistsError:
            continue
        break
    else:
        raise FileExistsError(f"Could not create a unique backup directory under {backups_root}")

    if config.chroma_dir.exists():
        shutil.move(str(config.chroma_dir), str(backup_dir / f"chroma_db.{reason}"))

    metadata_file = config.data_dir / "index_metadata.json"
    if metadata_file.exists():
        shutil.move(str(metadata_file), str(backup_dir / f"index_metadata.{reason}.json"))

    return backup_dir
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def _probe_chroma(timeout_seconds: int = 30) -> subprocess.CompletedProcess[str]:
    """Open the persistent Chroma store from a throwaway child interpreter.

    The child prints ``missing`` and exits 0 when ``config.chroma_dir`` does
    not exist. Otherwise it opens a ``PersistentClient`` on that directory,
    gets or creates the configured collection, and prints its item count.
    Running this in a separate process keeps a native crash from taking the
    caller down with it.

    Args:
        timeout_seconds: Maximum time the child is allowed to run.

    Returns:
        The completed child process, with stdout and stderr captured as text.
        A non-zero exit status is reported via ``returncode``, not raised.

    Raises:
        subprocess.TimeoutExpired: If the child exceeds ``timeout_seconds``.
    """
    probe_script = r"""
import chromadb

from mcp_server.config import config

if not config.chroma_dir.exists():
    print("missing")
    raise SystemExit(0)

client = chromadb.PersistentClient(path=str(config.chroma_dir))
collection = client.get_or_create_collection(name=config.collection_name)
print(collection.count())
"""
    # Inherit the parent's environment, only filling in the base directory
    # when the caller has not already set it.
    child_env = dict(os.environ)
    if "KNOWLEDGE_RAG_DIR" not in child_env:
        child_env["KNOWLEDGE_RAG_DIR"] = str(BASE_DIR)

    command = [sys.executable, "-c", probe_script]
    return subprocess.run(
        command,
        capture_output=True,
        text=True,
        env=child_env,
        cwd=str(BASE_DIR),
        check=False,
        timeout=timeout_seconds,
    )
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
def run_preflight(timeout_seconds: int = 30) -> bool:
    """Return True when active Chroma state was moved aside for repair.

    The persistent store is probed in a child process. If the probe exits
    with status 0 nothing is touched. If it exits non-zero, or does not
    finish within ``timeout_seconds``, the active index is moved into a
    backup directory and a ``[RECOVERY]`` notice is written to stderr,
    followed by the last 2000 characters of the child's stderr when any
    was captured.

    Args:
        timeout_seconds: Maximum time the probe child may run.

    Returns:
        False when the probe succeeded; True when the index was moved aside.
    """
    try:
        result = _probe_chroma(timeout_seconds=timeout_seconds)
    except subprocess.TimeoutExpired as exc:
        # A hung probe used to propagate out of here and abort startup with a
        # traceback. Treat it like any other failed probe so recovery runs.
        reason = "timeout"
        summary = f"timed out after {timeout_seconds}s"
        stderr_tail = exc.stderr or ""
    else:
        if result.returncode == 0:
            return False
        # -11 is how subprocess reports death by signal 11; 139 is the
        # 128 + 11 form seen when a shell relays the status.
        reason = "segfault" if result.returncode in (-11, 139) else "failed"
        summary = f"failed with code {result.returncode}"
        stderr_tail = result.stderr or ""

    # Partial output attached to TimeoutExpired may still be raw bytes.
    if isinstance(stderr_tail, bytes):
        stderr_tail = stderr_tail.decode("utf-8", errors="replace")

    backup_dir = _backup_active_index(reason)
    print(
        f"[RECOVERY] Chroma preflight {summary}; moved active index to {backup_dir}",
        file=sys.stderr,
    )
    if stderr_tail:
        print(stderr_tail[-2000:], file=sys.stderr)
    return True
|
|
@@ -136,6 +136,17 @@ class FastEmbedEmbeddings:
|
|
|
136
136
|
Uses ONNX Runtime in-process for embedding generation.
|
|
137
137
|
No external server required (replaces Ollama).
|
|
138
138
|
Model: BAAI/bge-small-en-v1.5 (384-dim, MTEB score 62.x)
|
|
139
|
+
|
|
140
|
+
Lazy-loading (since v3.8.0):
|
|
141
|
+
The ONNX model (~200MB resident) is NOT loaded in __init__.
|
|
142
|
+
It loads on the first call to __call__/embed_query/embed_documents.
|
|
143
|
+
This makes idle MCP server processes cheap, which matters when
|
|
144
|
+
multiple stdio clients spawn parallel knowledge-rag processes
|
|
145
|
+
(e.g. multiple Claude Code windows). The CrossEncoderReranker
|
|
146
|
+
already follows this same pattern.
|
|
147
|
+
|
|
148
|
+
Thread-safe: load is guarded by a lock so concurrent first-callers
|
|
149
|
+
don't double-initialize the model.
|
|
139
150
|
"""
|
|
140
151
|
|
|
141
152
|
@staticmethod
|
|
@@ -174,20 +185,34 @@ class FastEmbedEmbeddings:
|
|
|
174
185
|
def __init__(self, model: str = None):
|
|
175
186
|
self.model_name = model or config.embedding_model
|
|
176
187
|
self._dim = config.embedding_dim
|
|
177
|
-
kwargs
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
|
|
183
|
-
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
|
|
187
|
-
|
|
188
|
-
|
|
189
|
-
|
|
190
|
-
|
|
188
|
+
# Build kwargs once; defer the heavy TextEmbedding(**kwargs) call to first use.
|
|
189
|
+
self._init_kwargs = {"model_name": self.model_name, "cache_dir": str(config.models_cache_dir)}
|
|
190
|
+
self._gpu = bool(config.gpu_acceleration)
|
|
191
|
+
self._model: Optional[TextEmbedding] = None
|
|
192
|
+
self._load_lock = threading.Lock()
|
|
193
|
+
|
|
194
|
+
def _load_model(self) -> None:
|
|
195
|
+
"""Load the ONNX model on demand. Idempotent and thread-safe."""
|
|
196
|
+
if self._model is not None:
|
|
197
|
+
return
|
|
198
|
+
with self._load_lock:
|
|
199
|
+
if self._model is not None: # double-checked under the lock
|
|
200
|
+
return
|
|
201
|
+
kwargs = dict(self._init_kwargs)
|
|
202
|
+
if self._gpu:
|
|
203
|
+
self._setup_cuda_dll_paths()
|
|
204
|
+
kwargs["providers"] = ["CUDAExecutionProvider", "CPUExecutionProvider"]
|
|
205
|
+
print(f"[INFO] Loading embedding model: {self.model_name} ({self._dim}D) [GPU accelerated]...")
|
|
206
|
+
try:
|
|
207
|
+
self._model = TextEmbedding(**kwargs)
|
|
208
|
+
print("[INFO] Embedding model loaded successfully [GPU]")
|
|
209
|
+
return
|
|
210
|
+
except (ValueError, RuntimeError) as e:
|
|
211
|
+
print(f"[WARN] GPU init failed ({e}), falling back to CPU...")
|
|
212
|
+
kwargs["providers"] = ["CPUExecutionProvider"]
|
|
213
|
+
self._model = TextEmbedding(**kwargs)
|
|
214
|
+
print("[INFO] Embedding model loaded successfully [CPU fallback]")
|
|
215
|
+
return
|
|
191
216
|
kwargs["providers"] = ["CPUExecutionProvider"]
|
|
192
217
|
print(f"[INFO] Loading embedding model: {self.model_name} ({self._dim}D)...")
|
|
193
218
|
self._model = TextEmbedding(**kwargs)
|
|
@@ -203,6 +228,7 @@ class FastEmbedEmbeddings:
|
|
|
203
228
|
if not input:
|
|
204
229
|
return []
|
|
205
230
|
|
|
231
|
+
self._load_model()
|
|
206
232
|
try:
|
|
207
233
|
embeddings = list(self._model.embed(input))
|
|
208
234
|
return [emb.tolist() for emb in embeddings]
|
|
@@ -248,13 +274,22 @@ class CrossEncoderReranker:
|
|
|
248
274
|
def __init__(self, model: str = None):
|
|
249
275
|
self.model_name = model or config.reranker_model
|
|
250
276
|
self._model = None # Lazy init
|
|
277
|
+
self._load_failed = False
|
|
251
278
|
|
|
252
|
-
def _ensure_model(self):
|
|
279
|
+
def _ensure_model(self) -> bool:
    """Load the cross-encoder on first use and report whether it is usable.

    Returns:
        True when a model is available on ``self._model``. False when this
        or an earlier load attempt failed; the failure is remembered in
        ``self._load_failed`` so the load is not retried.
    """
    # A previous attempt already failed: do not retry.
    if self._load_failed:
        return False

    # Already loaded: nothing to do.
    if self._model is not None:
        return True

    print(f"[INFO] Loading reranker model: {self.model_name}...")
    try:
        self._model = TextCrossEncoder(
            model_name=self.model_name,
            cache_dir=str(config.models_cache_dir),
        )
        print("[INFO] Reranker model loaded successfully")
    except Exception as e:
        # Best-effort: the caller falls back to the un-reranked order.
        self._load_failed = True
        print(f"[WARN] Reranker unavailable, using RRF order: {e}")
        return False
    return True
|
|
258
293
|
|
|
259
294
|
def rerank(self, query: str, documents: List[Dict[str, Any]], top_k: int = 5) -> List[Dict[str, Any]]:
|
|
260
295
|
"""
|
|
@@ -271,7 +306,8 @@ class CrossEncoderReranker:
|
|
|
271
306
|
if not documents or not config.reranker_enabled:
|
|
272
307
|
return documents[:top_k]
|
|
273
308
|
|
|
274
|
-
self._ensure_model()
|
|
309
|
+
if not self._ensure_model():
|
|
310
|
+
return documents[:top_k]
|
|
275
311
|
|
|
276
312
|
texts = [doc.get("document", "") for doc in documents]
|
|
277
313
|
|
|
@@ -1924,44 +1960,58 @@ def main():
|
|
|
1924
1960
|
_handle_init()
|
|
1925
1961
|
return
|
|
1926
1962
|
|
|
1927
|
-
|
|
1928
|
-
|
|
1929
|
-
|
|
1930
|
-
|
|
1931
|
-
|
|
1932
|
-
|
|
1933
|
-
try:
|
|
1934
|
-
stats = orchestrator.nuclear_rebuild()
|
|
1935
|
-
print(
|
|
1936
|
-
f"[MIGRATION] Rebuild complete: {stats['indexed']} docs, "
|
|
1937
|
-
f"{stats['chunks_added']} chunks in {stats.get('elapsed_seconds', '?')}s"
|
|
1938
|
-
)
|
|
1939
|
-
except Exception as e:
|
|
1940
|
-
print(f"[ERROR] Migration failed: {e}")
|
|
1941
|
-
print("[FALLBACK] Attempting regular index instead...")
|
|
1942
|
-
stats = orchestrator.index_all(force=True)
|
|
1943
|
-
elif orchestrator.collection.count() == 0:
|
|
1944
|
-
print("[INFO] No documents indexed. Running initial indexing...")
|
|
1945
|
-
stats = orchestrator.index_all()
|
|
1946
|
-
print(f"[INFO] Indexed {stats['indexed']} documents with {stats['chunks_added']} chunks")
|
|
1963
|
+
from .instance_lock import (
|
|
1964
|
+
ALREADY_RUNNING_EXIT_CODE,
|
|
1965
|
+
AlreadyRunningError,
|
|
1966
|
+
single_instance_lock,
|
|
1967
|
+
)
|
|
1968
|
+
from .preflight import run_preflight
|
|
1947
1969
|
|
|
1948
|
-
# Start file watcher for auto-reindex on document changes
|
|
1949
1970
|
try:
|
|
1950
|
-
|
|
1951
|
-
|
|
1952
|
-
|
|
1953
|
-
|
|
1954
|
-
|
|
1955
|
-
|
|
1956
|
-
|
|
1957
|
-
|
|
1958
|
-
|
|
1959
|
-
|
|
1960
|
-
|
|
1961
|
-
|
|
1962
|
-
|
|
1963
|
-
|
|
1964
|
-
|
|
1971
|
+
with single_instance_lock():
|
|
1972
|
+
run_preflight()
|
|
1973
|
+
|
|
1974
|
+
orchestrator = get_orchestrator()
|
|
1975
|
+
|
|
1976
|
+
# Migration: check dimension mismatch AFTER full init (avoids segfault during __init__)
|
|
1977
|
+
orchestrator._needs_rebuild = orchestrator._check_dimension_mismatch()
|
|
1978
|
+
if orchestrator._needs_rebuild:
|
|
1979
|
+
print("[MIGRATION] Running nuclear rebuild for embedding model change...")
|
|
1980
|
+
try:
|
|
1981
|
+
stats = orchestrator.nuclear_rebuild()
|
|
1982
|
+
print(
|
|
1983
|
+
f"[MIGRATION] Rebuild complete: {stats['indexed']} docs, "
|
|
1984
|
+
f"{stats['chunks_added']} chunks in {stats.get('elapsed_seconds', '?')}s"
|
|
1985
|
+
)
|
|
1986
|
+
except Exception as e:
|
|
1987
|
+
print(f"[ERROR] Migration failed: {e}")
|
|
1988
|
+
print("[FALLBACK] Attempting regular index instead...")
|
|
1989
|
+
stats = orchestrator.index_all(force=True)
|
|
1990
|
+
elif orchestrator.collection.count() == 0:
|
|
1991
|
+
print("[INFO] No documents indexed. Running initial indexing...")
|
|
1992
|
+
stats = orchestrator.index_all()
|
|
1993
|
+
print(f"[INFO] Indexed {stats['indexed']} documents with {stats['chunks_added']} chunks")
|
|
1994
|
+
|
|
1995
|
+
# Start file watcher for auto-reindex on document changes
|
|
1996
|
+
try:
|
|
1997
|
+
watcher = DocumentWatcher(get_orchestrator, debounce_seconds=5.0)
|
|
1998
|
+
observer = Observer()
|
|
1999
|
+
observer.schedule(watcher, str(config.documents_dir), recursive=True)
|
|
2000
|
+
observer.daemon = True
|
|
2001
|
+
observer.start()
|
|
2002
|
+
print(f"[WATCHER] Monitoring {config.documents_dir} for changes")
|
|
2003
|
+
except Exception as e:
|
|
2004
|
+
print(f"[WARN] Failed to start file watcher: {e}")
|
|
2005
|
+
print("[WARN] Auto-reindexing disabled. Use reindex_documents tool manually.")
|
|
2006
|
+
|
|
2007
|
+
# Restore real stdout for MCP JSON-RPC, keep print() going to stderr
|
|
2008
|
+
from . import _original_stdout
|
|
2009
|
+
|
|
2010
|
+
sys.stdout = _original_stdout
|
|
2011
|
+
mcp.run()
|
|
2012
|
+
except AlreadyRunningError as e:
|
|
2013
|
+
print(f"[ERROR] {e}", file=sys.stderr)
|
|
2014
|
+
raise SystemExit(ALREADY_RUNNING_EXIT_CODE) from e
|
|
1965
2015
|
|
|
1966
2016
|
|
|
1967
2017
|
if __name__ == "__main__":
|
|
@@ -2,7 +2,7 @@
|
|
|
2
2
|
|
|
3
3
|
# Knowledge RAG
|
|
4
4
|
|
|
5
|
-
Local RAG system for Claude Code. Hybrid BM25 + semantic search with cross-encoder reranking. 12 MCP tools. Zero external servers. Everything runs on your machine.
|
|
5
|
+
Local RAG system for Claude Code. Hybrid BM25 + semantic search with cross-encoder reranking. 12 MCP tools, 20 format parsers. Zero external servers. Everything runs on your machine.
|
|
6
6
|
|
|
7
7
|
## Quick Start
|
|
8
8
|
|
|
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "knowledge-rag"
|
|
7
|
-
version = "3.
|
|
7
|
+
version = "3.8.0"
|
|
8
8
|
description = "Local RAG System for Claude Code — Hybrid search + Cross-encoder Reranking + 12 MCP Tools + 20 Format Parsers. Zero external servers."
|
|
9
9
|
readme = "README.md"
|
|
10
10
|
license = {text = "MIT"}
|
|
@@ -34,7 +34,7 @@ dependencies = [
|
|
|
34
34
|
"fastembed[reranking]>=0.4.0",
|
|
35
35
|
"mcp>=1.0.0",
|
|
36
36
|
"rank-bm25>=0.2.2",
|
|
37
|
-
"requests>=2.
|
|
37
|
+
"requests>=2.33.0",
|
|
38
38
|
"beautifulsoup4>=4.12.0",
|
|
39
39
|
"python-docx>=1.0.0",
|
|
40
40
|
"openpyxl>=3.1.0",
|
|
@@ -54,6 +54,7 @@ Changelog = "https://github.com/lyonzin/knowledge-rag/releases"
|
|
|
54
54
|
|
|
55
55
|
[project.scripts]
|
|
56
56
|
knowledge-rag = "mcp_server.server:main"
|
|
57
|
+
knowledge-rag-guarded = "mcp_server.guarded:guarded_main"
|
|
57
58
|
|
|
58
59
|
[tool.hatch.build.targets.wheel]
|
|
59
60
|
packages = ["mcp_server"]
|
|
@@ -94,6 +95,11 @@ exclude = [
|
|
|
94
95
|
[tool.pytest.ini_options]
|
|
95
96
|
testpaths = ["tests"]
|
|
96
97
|
pythonpath = ["."]
|
|
98
|
+
# Limit retained tmp_path directories to avoid pytest's atexit cleanup race
|
|
99
|
+
# on Windows (cleanup_numbered_dir + pathlib.glob "garbage-*" can fail when
|
|
100
|
+
# many tmp dirs accumulate). Tests run isolated; we don't need history.
|
|
101
|
+
tmp_path_retention_count = 1
|
|
102
|
+
tmp_path_retention_policy = "failed"
|
|
97
103
|
|
|
98
104
|
[tool.ruff]
|
|
99
105
|
target-version = "py311"
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
# Knowledge RAG System - Python Dependencies
|
|
2
2
|
# ==========================================
|
|
3
|
-
# Requires Python 3.11
|
|
3
|
+
# Requires Python 3.11+ (3.11, 3.12, 3.13, 3.14 supported)
|
|
4
4
|
|
|
5
5
|
# Vector Database (uses new PersistentClient API)
|
|
6
6
|
chromadb>=1.4.0
|
|
@@ -19,7 +19,7 @@ mcp>=1.0.0
|
|
|
19
19
|
rank-bm25>=0.2.2
|
|
20
20
|
|
|
21
21
|
# URL content fetching (add_from_url tool)
|
|
22
|
-
requests>=2.
|
|
22
|
+
requests>=2.33.0
|
|
23
23
|
|
|
24
24
|
# HTML parsing (add_from_url tool)
|
|
25
25
|
beautifulsoup4>=4.12.0
|
|
@@ -44,6 +44,6 @@ watchdog>=4.0.0
|
|
|
44
44
|
# 2. Default embedding model: BAAI/bge-small-en-v1.5 (384-dim)
|
|
45
45
|
# Cached in ~/.cache/fastembed/
|
|
46
46
|
#
|
|
47
|
-
# 3. Python 3.13+ is
|
|
48
|
-
#
|
|
47
|
+
# 3. Python 3.13+ is supported since v3.5.1
|
|
48
|
+
# (onnxruntime now ships wheels for 3.13 and 3.14)
|
|
49
49
|
# ==========================================
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|