knowledge-rag 3.7.0__tar.gz → 3.8.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {knowledge_rag-3.7.0 → knowledge_rag-3.8.1}/PKG-INFO +46 -6
- {knowledge_rag-3.7.0 → knowledge_rag-3.8.1}/README.md +45 -5
- {knowledge_rag-3.7.0 → knowledge_rag-3.8.1}/mcp_server/__init__.py +1 -1
- knowledge_rag-3.8.1/mcp_server/instance_lock.py +188 -0
- {knowledge_rag-3.7.0 → knowledge_rag-3.8.1}/mcp_server/server.py +151 -58
- {knowledge_rag-3.7.0 → knowledge_rag-3.8.1}/pyproject.toml +6 -1
- {knowledge_rag-3.7.0 → knowledge_rag-3.8.1}/.gitignore +0 -0
- {knowledge_rag-3.7.0 → knowledge_rag-3.8.1}/LICENSE +0 -0
- {knowledge_rag-3.7.0 → knowledge_rag-3.8.1}/config.example.yaml +0 -0
- {knowledge_rag-3.7.0 → knowledge_rag-3.8.1}/documents/examples/sample-document.md +0 -0
- {knowledge_rag-3.7.0 → knowledge_rag-3.8.1}/mcp_server/config.py +0 -0
- {knowledge_rag-3.7.0 → knowledge_rag-3.8.1}/mcp_server/guarded.py +0 -0
- {knowledge_rag-3.7.0 → knowledge_rag-3.8.1}/mcp_server/ingestion.py +0 -0
- {knowledge_rag-3.7.0 → knowledge_rag-3.8.1}/mcp_server/preflight.py +0 -0
- {knowledge_rag-3.7.0 → knowledge_rag-3.8.1}/npm/README.md +0 -0
- {knowledge_rag-3.7.0 → knowledge_rag-3.8.1}/presets/cybersecurity.yaml +0 -0
- {knowledge_rag-3.7.0 → knowledge_rag-3.8.1}/presets/developer.yaml +0 -0
- {knowledge_rag-3.7.0 → knowledge_rag-3.8.1}/presets/general.yaml +0 -0
- {knowledge_rag-3.7.0 → knowledge_rag-3.8.1}/presets/research.yaml +0 -0
- {knowledge_rag-3.7.0 → knowledge_rag-3.8.1}/requirements.txt +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: knowledge-rag
|
|
3
|
-
Version: 3.7.0
|
|
3
|
+
Version: 3.8.1
|
|
4
4
|
Summary: Local RAG System for Claude Code — Hybrid search + Cross-encoder Reranking + 12 MCP Tools + 20 Format Parsers. Zero external servers.
|
|
5
5
|
Project-URL: Homepage, https://github.com/lyonzin/knowledge-rag
|
|
6
6
|
Project-URL: Repository, https://github.com/lyonzin/knowledge-rag
|
|
@@ -42,7 +42,7 @@ Description-Content-Type: text/markdown
|
|
|
42
42
|
|
|
43
43
|
[](https://pypi.org/project/knowledge-rag/)
|
|
44
44
|
[](https://www.npmjs.com/package/knowledge-rag)
|
|
45
|
-
[](https://pepy.tech/projects/knowledge-rag)
|
|
46
46
|

|
|
47
47
|

|
|
48
48
|

|
|
@@ -71,11 +71,21 @@ pip install knowledge-rag → restart Claude Code → search_knowledge("your que
|
|
|
71
71
|
|
|
72
72
|
---
|
|
73
73
|
|
|
74
|
-
## What's New in v3.
|
|
74
|
+
## What's New in v3.8.0
|
|
75
75
|
|
|
76
|
-
###
|
|
76
|
+
### Lazy-Loaded Embeddings — Cheaper Idle Processes
|
|
77
77
|
|
|
78
|
-
|
|
78
|
+
The FastEmbed ONNX model (~200MB resident) now loads on the **first query**, not at startup. Idle `knowledge-rag` processes are now genuinely cheap. Why this matters: MCP stdio is one-process-per-client by protocol — multiple Claude Code windows, Claude Desktop + IDE simultaneously, or review/approval flows that open extra connections all spawn their own processes. Before v3.8.0, every one of them paid the full embedding-model cost up front. Now only processes that actually serve queries load the model. Public API is unchanged.
|
|
79
|
+
|
|
80
|
+
### Opt-In Single-Instance Guard
|
|
81
|
+
|
|
82
|
+
For users who measured their setup and want a hard cap of one server per `data_dir`:
|
|
83
|
+
|
|
84
|
+
```bash
|
|
85
|
+
export KNOWLEDGE_RAG_SINGLE_INSTANCE=1
|
|
86
|
+
```
|
|
87
|
+
|
|
88
|
+
A second instance exits immediately with code 75 (`EX_TEMPFAIL`). The guard is **OFF by default**, so multi-client MCP usage continues to work unchanged. Stale locks left behind by a crashed process are recovered automatically on the next start, and SIGINT/SIGTERM handlers remove the lock on shutdown. Full guide in [docs/single-instance.md](docs/single-instance.md). Sample MCP config in [examples/mcp-config-single-instance.json](examples/mcp-config-single-instance.json).
|
|
79
89
|
|
|
80
90
|
### 5 Ways to Install
|
|
81
91
|
|
|
@@ -91,6 +101,7 @@ All methods produce the same MCP server. See [Installation](#installation) for f
|
|
|
91
101
|
|
|
92
102
|
### Recent Highlights
|
|
93
103
|
|
|
104
|
+
- **v3.8.0** — Lazy-load embeddings, opt-in single-instance guard, version sync across PyPI/NPM/Docker
|
|
94
105
|
- **v3.6.0** — Multi-language code parsing (C/C++/JS/TS/XML), NPM wrapper, Docker image, automated release pipeline
|
|
95
106
|
- **v3.5.2** — CUDA DLL auto-discovery from pip packages, graceful GPU→CPU fallback, explicit CPU provider (no CUDA noise when `gpu: false`), BASE_DIR resolution fix for editable installs
|
|
96
107
|
- **v3.5.1** — Remove Python `<3.13` upper bound — 3.13 and 3.14 now supported
|
|
@@ -1088,12 +1099,41 @@ The cross-encoder reranker model is lazy-loaded on the first query. This adds a
|
|
|
1088
1099
|
|
|
1089
1100
|
### Memory usage
|
|
1090
1101
|
|
|
1091
|
-
With ~200 documents, expect ~300-500MB RAM. The embedding model (~
|
|
1102
|
+
With ~200 documents, expect ~300-500MB RAM. The embedding model (~200MB ONNX runtime resident, lazy-loaded on first query since v3.8.0) and reranker (~25MB, lazy-loaded) are loaded into memory only when actually used. For very large knowledge bases (1000+ documents), consider enabling GPU acceleration and using exclude patterns to limit index scope.
|
|
1103
|
+
|
|
1104
|
+
### Multiple MCP clients spawn duplicate servers
|
|
1105
|
+
|
|
1106
|
+
MCP stdio is one process per client by protocol — multiple Claude Code windows, Claude Desktop + IDE, etc. each spawn their own `knowledge-rag` process. Since v3.8.0 idle processes are cheap (no embedding model loaded until first query). If you've measured and want a hard cap of one server per data directory, opt in:
|
|
1107
|
+
|
|
1108
|
+
```bash
|
|
1109
|
+
export KNOWLEDGE_RAG_SINGLE_INSTANCE=1
|
|
1110
|
+
```
|
|
1111
|
+
|
|
1112
|
+
A second instance exits immediately with code 75. Default is OFF (multi-client friendly). Full guide: [docs/single-instance.md](docs/single-instance.md). Sample MCP config: [examples/mcp-config-single-instance.json](examples/mcp-config-single-instance.json).
|
|
1092
1113
|
|
|
1093
1114
|
---
|
|
1094
1115
|
|
|
1095
1116
|
## Changelog
|
|
1096
1117
|
|
|
1118
|
+
### v3.8.1 (2026-05-10) — hotfix
|
|
1119
|
+
|
|
1120
|
+
- **FIX (critical)**: `FastEmbedEmbeddings.__call__` no longer returns vectors of zeros when the ONNX model fails to load or `embed()` raises. The previous behavior silently corrupted the index — ChromaDB stored zero embeddings, `count()` reported normal numbers, smart-reindex skipped the bad chunks, and queries returned garbage scores with no error visible. Now raises `EmbeddingModelLoadError` / `EmbeddingError`. (#36)
|
|
1121
|
+
- **FIX**: Sticky `_load_failed` flag — after a load failure, subsequent calls re-raise immediately instead of looping through HuggingFace download attempts (was the "frozen query" UX in v3.8.0).
|
|
1122
|
+
- **NEW**: Sanity checks in `__call__` — embed count and dim mismatches raise `EmbeddingError` instead of silently returning malformed vectors.
|
|
1123
|
+
- **TEST**: 7 new regression cases in `tests/test_lazy_embeddings.py`, including `test_does_not_return_zero_vectors_silently` as a guard for the whole class of bug.
|
|
1124
|
+
- **NOTE**: This is a pre-existing bug in master, not introduced by v3.8.0. v3.8.0 lazy-load expanded the impact (failures moved to query time). All v3.8.0 users should upgrade.
|
|
1125
|
+
|
|
1126
|
+
### v3.8.0 (2026-05-10)
|
|
1127
|
+
|
|
1128
|
+
- **NEW**: Lazy-load FastEmbed embedding model (~200MB ONNX runtime). Loads on first query instead of startup — idle `knowledge-rag` processes are now cheap, which matters when MCP stdio clients spawn parallel server processes (multiple Claude Code windows, Claude Desktop + IDE, etc.). Public API unchanged. (#32)
|
|
1129
|
+
- **NEW**: Opt-in single-instance guard via `KNOWLEDGE_RAG_SINGLE_INSTANCE=1` env var. **OFF by default** — multi-client MCP usage continues to work unchanged. When enabled, a second server process for the same `data_dir` exits with code 75 (`EX_TEMPFAIL`). Includes stale-PID recovery and SIGINT/SIGTERM handlers. See [docs/single-instance.md](docs/single-instance.md). (#33, original concept by @Hohlas in #31)
|
|
1130
|
+
- **NEW**: `examples/mcp-config-single-instance.json` — sample MCP client config for the opt-in guard.
|
|
1131
|
+
- **DOCS**: New `docs/single-instance.md` — when to use, when NOT to use, troubleshooting, full activation reference.
|
|
1132
|
+
- **DOCS**: README troubleshooting section for "Multiple MCP clients spawn duplicate servers" + memory-usage note for lazy embeddings.
|
|
1133
|
+
- **CHORE**: Sync version across `pyproject.toml`, `mcp_server/__init__.py`, and `npm/package.json` (was drifting since v3.5.x).
|
|
1134
|
+
- **CHORE**: pytest `tmp_path_retention_count=1` to avoid Windows atexit cleanup race in CI.
|
|
1135
|
+
- **ROADMAP**: Tracked v4.0 shared-service architecture (one daemon, many thin MCP clients) as the long-term fix for multi-process resource duplication. (#34)
|
|
1136
|
+
|
|
1097
1137
|
### Unreleased
|
|
1098
1138
|
|
|
1099
1139
|
- **FIX**: Startup preflight probes ChromaDB in a child process and moves crashing persistent indexes to `data/backups/auto-repair-*` before MCP initialization.
|
|
@@ -4,7 +4,7 @@
|
|
|
4
4
|
|
|
5
5
|
[](https://pypi.org/project/knowledge-rag/)
|
|
6
6
|
[](https://www.npmjs.com/package/knowledge-rag)
|
|
7
|
-
[](https://pepy.tech/projects/knowledge-rag)
|
|
8
8
|

|
|
9
9
|

|
|
10
10
|

|
|
@@ -33,11 +33,21 @@ pip install knowledge-rag → restart Claude Code → search_knowledge("your que
|
|
|
33
33
|
|
|
34
34
|
---
|
|
35
35
|
|
|
36
|
-
## What's New in v3.
|
|
36
|
+
## What's New in v3.8.0
|
|
37
37
|
|
|
38
|
-
###
|
|
38
|
+
### Lazy-Loaded Embeddings — Cheaper Idle Processes
|
|
39
39
|
|
|
40
|
-
|
|
40
|
+
The FastEmbed ONNX model (~200MB resident) now loads on the **first query**, not at startup. Idle `knowledge-rag` processes are now genuinely cheap. Why this matters: MCP stdio is one-process-per-client by protocol — multiple Claude Code windows, Claude Desktop + IDE simultaneously, or review/approval flows that open extra connections all spawn their own processes. Before v3.8.0, every one of them paid the full embedding-model cost up front. Now only processes that actually serve queries load the model. Public API is unchanged.
|
|
41
|
+
|
|
42
|
+
### Opt-In Single-Instance Guard
|
|
43
|
+
|
|
44
|
+
For users who measured their setup and want a hard cap of one server per `data_dir`:
|
|
45
|
+
|
|
46
|
+
```bash
|
|
47
|
+
export KNOWLEDGE_RAG_SINGLE_INSTANCE=1
|
|
48
|
+
```
|
|
49
|
+
|
|
50
|
+
A second instance exits immediately with code 75 (`EX_TEMPFAIL`). The guard is **OFF by default**, so multi-client MCP usage continues to work unchanged. Stale locks left behind by a crashed process are recovered automatically on the next start, and SIGINT/SIGTERM handlers remove the lock on shutdown. Full guide in [docs/single-instance.md](docs/single-instance.md). Sample MCP config in [examples/mcp-config-single-instance.json](examples/mcp-config-single-instance.json).
|
|
41
51
|
|
|
42
52
|
### 5 Ways to Install
|
|
43
53
|
|
|
@@ -53,6 +63,7 @@ All methods produce the same MCP server. See [Installation](#installation) for f
|
|
|
53
63
|
|
|
54
64
|
### Recent Highlights
|
|
55
65
|
|
|
66
|
+
- **v3.8.0** — Lazy-load embeddings, opt-in single-instance guard, version sync across PyPI/NPM/Docker
|
|
56
67
|
- **v3.6.0** — Multi-language code parsing (C/C++/JS/TS/XML), NPM wrapper, Docker image, automated release pipeline
|
|
57
68
|
- **v3.5.2** — CUDA DLL auto-discovery from pip packages, graceful GPU→CPU fallback, explicit CPU provider (no CUDA noise when `gpu: false`), BASE_DIR resolution fix for editable installs
|
|
58
69
|
- **v3.5.1** — Remove Python `<3.13` upper bound — 3.13 and 3.14 now supported
|
|
@@ -1050,12 +1061,41 @@ The cross-encoder reranker model is lazy-loaded on the first query. This adds a
|
|
|
1050
1061
|
|
|
1051
1062
|
### Memory usage
|
|
1052
1063
|
|
|
1053
|
-
With ~200 documents, expect ~300-500MB RAM. The embedding model (~
|
|
1064
|
+
With ~200 documents, expect ~300-500MB RAM. The embedding model (~200MB ONNX runtime resident, lazy-loaded on first query since v3.8.0) and reranker (~25MB, lazy-loaded) are loaded into memory only when actually used. For very large knowledge bases (1000+ documents), consider enabling GPU acceleration and using exclude patterns to limit index scope.
|
|
1065
|
+
|
|
1066
|
+
### Multiple MCP clients spawn duplicate servers
|
|
1067
|
+
|
|
1068
|
+
MCP stdio is one process per client by protocol — multiple Claude Code windows, Claude Desktop + IDE, etc. each spawn their own `knowledge-rag` process. Since v3.8.0 idle processes are cheap (no embedding model loaded until first query). If you've measured and want a hard cap of one server per data directory, opt in:
|
|
1069
|
+
|
|
1070
|
+
```bash
|
|
1071
|
+
export KNOWLEDGE_RAG_SINGLE_INSTANCE=1
|
|
1072
|
+
```
|
|
1073
|
+
|
|
1074
|
+
A second instance exits immediately with code 75. Default is OFF (multi-client friendly). Full guide: [docs/single-instance.md](docs/single-instance.md). Sample MCP config: [examples/mcp-config-single-instance.json](examples/mcp-config-single-instance.json).
|
|
1054
1075
|
|
|
1055
1076
|
---
|
|
1056
1077
|
|
|
1057
1078
|
## Changelog
|
|
1058
1079
|
|
|
1080
|
+
### v3.8.1 (2026-05-10) — hotfix
|
|
1081
|
+
|
|
1082
|
+
- **FIX (critical)**: `FastEmbedEmbeddings.__call__` no longer returns vectors of zeros when the ONNX model fails to load or `embed()` raises. The previous behavior silently corrupted the index — ChromaDB stored zero embeddings, `count()` reported normal numbers, smart-reindex skipped the bad chunks, and queries returned garbage scores with no error visible. Now raises `EmbeddingModelLoadError` / `EmbeddingError`. (#36)
|
|
1083
|
+
- **FIX**: Sticky `_load_failed` flag — after a load failure, subsequent calls re-raise immediately instead of looping through HuggingFace download attempts (was the "frozen query" UX in v3.8.0).
|
|
1084
|
+
- **NEW**: Sanity checks in `__call__` — embed count and dim mismatches raise `EmbeddingError` instead of silently returning malformed vectors.
|
|
1085
|
+
- **TEST**: 7 new regression cases in `tests/test_lazy_embeddings.py`, including `test_does_not_return_zero_vectors_silently` as a guard for the whole class of bug.
|
|
1086
|
+
- **NOTE**: This is a pre-existing bug in master, not introduced by v3.8.0. v3.8.0 lazy-load expanded the impact (failures moved to query time). All v3.8.0 users should upgrade.
|
|
1087
|
+
|
|
1088
|
+
### v3.8.0 (2026-05-10)
|
|
1089
|
+
|
|
1090
|
+
- **NEW**: Lazy-load FastEmbed embedding model (~200MB ONNX runtime). Loads on first query instead of startup — idle `knowledge-rag` processes are now cheap, which matters when MCP stdio clients spawn parallel server processes (multiple Claude Code windows, Claude Desktop + IDE, etc.). Public API unchanged. (#32)
|
|
1091
|
+
- **NEW**: Opt-in single-instance guard via `KNOWLEDGE_RAG_SINGLE_INSTANCE=1` env var. **OFF by default** — multi-client MCP usage continues to work unchanged. When enabled, a second server process for the same `data_dir` exits with code 75 (`EX_TEMPFAIL`). Includes stale-PID recovery and SIGINT/SIGTERM handlers. See [docs/single-instance.md](docs/single-instance.md). (#33, original concept by @Hohlas in #31)
|
|
1092
|
+
- **NEW**: `examples/mcp-config-single-instance.json` — sample MCP client config for the opt-in guard.
|
|
1093
|
+
- **DOCS**: New `docs/single-instance.md` — when to use, when NOT to use, troubleshooting, full activation reference.
|
|
1094
|
+
- **DOCS**: README troubleshooting section for "Multiple MCP clients spawn duplicate servers" + memory-usage note for lazy embeddings.
|
|
1095
|
+
- **CHORE**: Sync version across `pyproject.toml`, `mcp_server/__init__.py`, and `npm/package.json` (was drifting since v3.5.x).
|
|
1096
|
+
- **CHORE**: pytest `tmp_path_retention_count=1` to avoid Windows atexit cleanup race in CI.
|
|
1097
|
+
- **ROADMAP**: Tracked v4.0 shared-service architecture (one daemon, many thin MCP clients) as the long-term fix for multi-process resource duplication. (#34)
|
|
1098
|
+
|
|
1059
1099
|
### Unreleased
|
|
1060
1100
|
|
|
1061
1101
|
- **FIX**: Startup preflight probes ChromaDB in a child process and moves crashing persistent indexes to `data/backups/auto-repair-*` before MCP initialization.
|
|
@@ -0,0 +1,188 @@
|
|
|
1
|
+
"""Optional single-instance guard for the MCP server process.
|
|
2
|
+
|
|
3
|
+
Background
|
|
4
|
+
----------
|
|
5
|
+
MCP stdio servers are 1-process-per-client by protocol design. Multiple
|
|
6
|
+
Claude Code windows, Claude Desktop + IDE running simultaneously, or clients
|
|
7
|
+
that open extra internal connections during approval/review flows will all
|
|
8
|
+
spawn additional `knowledge-rag` processes. Each process holds its own
|
|
9
|
+
embedding model, ChromaDB client, BM25 state, and file watcher.
|
|
10
|
+
|
|
11
|
+
Lazy-loading the embedding model (v3.8.0) reduces idle cost dramatically,
|
|
12
|
+
but some users still want a hard cap of one process per data directory.
|
|
13
|
+
This module provides that cap as an OPT-IN, never as a default.
|
|
14
|
+
|
|
15
|
+
Activation
|
|
16
|
+
----------
|
|
17
|
+
Set the environment variable in your MCP client config:
|
|
18
|
+
|
|
19
|
+
KNOWLEDGE_RAG_SINGLE_INSTANCE=1 # also accepts: true, yes, on (case-insensitive)
|
|
20
|
+
|
|
21
|
+
When unset (default), `single_instance_lock()` is a no-op and the server
|
|
22
|
+
behaves exactly as it did before this module existed.
|
|
23
|
+
|
|
24
|
+
When enabled, the server creates `<data_dir>/knowledge-rag.lock` containing
|
|
25
|
+
its PID. A second process starting against the same `data_dir` will detect
|
|
26
|
+
the live PID and exit with code 75 (EX_TEMPFAIL). Stale locks (PID gone)
|
|
27
|
+
are cleaned up automatically.
|
|
28
|
+
|
|
29
|
+
Cleanup is wired in three places so the lock does not outlive the process:
|
|
30
|
+
1. Normal exit: contextmanager `finally` block removes the lock.
|
|
31
|
+
2. SIGINT / SIGTERM: handlers remove the lock and re-raise the default action.
|
|
32
|
+
3. Crash / SIGKILL: stale-PID detection on the next startup removes it.
|
|
33
|
+
|
|
34
|
+
Authors
|
|
35
|
+
-------
|
|
36
|
+
- Concept and original guard: Sergey Khokhlov (@Hohlas) in PR #31
|
|
37
|
+
- Reworked as opt-in + signal handlers + tests: Lyon. (knowledge-rag maintainer)
|
|
38
|
+
"""
|
|
39
|
+
|
|
40
|
+
from __future__ import annotations
|
|
41
|
+
|
|
42
|
+
import os
|
|
43
|
+
import signal
|
|
44
|
+
from contextlib import contextmanager
|
|
45
|
+
from pathlib import Path
|
|
46
|
+
from typing import Iterator, Optional
|
|
47
|
+
|
|
48
|
+
from .config import config
|
|
49
|
+
|
|
50
|
+
# File name of the lock, created directly inside the configured data directory.
LOCK_FILENAME = "knowledge-rag.lock"
# Exit status intended for a process that finds another live instance.
ALREADY_RUNNING_EXIT_CODE = 75  # EX_TEMPFAIL from sysexits.h
# Environment variable that opts in to the single-instance guard.
ENV_VAR = "KNOWLEDGE_RAG_SINGLE_INSTANCE"
# Values of ENV_VAR (compared after stripping and lower-casing) that enable the guard.
_TRUTHY = {"1", "true", "yes", "on"}
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
class AlreadyRunningError(RuntimeError):
    """Signal that the single-instance lock could not be acquired.

    Raised by ``single_instance_lock`` when the lock file names a PID that is
    still alive, or when a stale lock file cannot be deleted.
    """
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
def single_instance_enabled() -> bool:
    """Report whether the single-instance guard has been switched on.

    The decision comes from the ``KNOWLEDGE_RAG_SINGLE_INSTANCE`` environment
    variable. After trimming surrounding whitespace and lower-casing, the
    values ``1``, ``true``, ``yes`` and ``on`` enable the guard. A missing
    variable, an empty string, or any other value (``0``, ``false``, ...)
    leaves it disabled.
    """
    setting = os.environ.get(ENV_VAR, "")
    normalized = setting.strip().lower()
    return normalized in _TRUTHY
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
def _pid_is_running_windows(pid: int) -> bool:
    """Windows liveness probe that never signals the target process.

    Opens the process with the minimal query right and checks whether its exit
    code is still ``STILL_ACTIVE``. Access-denied on open is treated as "alive",
    matching the ``PermissionError`` branch of the POSIX probe.
    """
    import ctypes
    from ctypes import wintypes

    PROCESS_QUERY_LIMITED_INFORMATION = 0x1000
    ERROR_ACCESS_DENIED = 5
    STILL_ACTIVE = 259

    # Windows PIDs are DWORDs; anything larger cannot name a real process.
    if pid > 0xFFFFFFFF:
        return False

    kernel32 = ctypes.WinDLL("kernel32", use_last_error=True)
    kernel32.OpenProcess.argtypes = (wintypes.DWORD, wintypes.BOOL, wintypes.DWORD)
    kernel32.OpenProcess.restype = wintypes.HANDLE
    kernel32.GetExitCodeProcess.argtypes = (wintypes.HANDLE, ctypes.POINTER(wintypes.DWORD))
    kernel32.GetExitCodeProcess.restype = wintypes.BOOL
    kernel32.CloseHandle.argtypes = (wintypes.HANDLE,)
    kernel32.CloseHandle.restype = wintypes.BOOL

    handle = kernel32.OpenProcess(PROCESS_QUERY_LIMITED_INFORMATION, False, pid)
    if not handle:
        # No handle: either the PID does not exist, or it exists but we may not query it.
        return ctypes.get_last_error() == ERROR_ACCESS_DENIED
    try:
        exit_code = wintypes.DWORD()
        if not kernel32.GetExitCodeProcess(handle, ctypes.byref(exit_code)):
            # Unknown failure: mirror the POSIX fallback and report "not running".
            return False
        return exit_code.value == STILL_ACTIVE
    finally:
        kernel32.CloseHandle(handle)


def _pid_is_running(pid: int) -> bool:
    """Return True if a process with PID appears to be alive.

    Args:
        pid: Process ID read from a lock file. Non-positive values are never
            considered alive.

    Returns:
        True when the process exists (including when it exists but belongs to
        another user), False when it does not exist or cannot be probed.

    On POSIX the probe is ``os.kill(pid, 0)``, which only performs the
    existence/permission check. On Windows ``os.kill`` with any signal other
    than CTRL_C_EVENT / CTRL_BREAK_EVENT terminates the target via
    TerminateProcess, so a ``signal 0`` "probe" would kill the server that
    holds the lock. Windows therefore uses a read-only process query instead.
    """
    if pid <= 0:
        return False
    if os.name == "nt":
        return _pid_is_running_windows(pid)
    try:
        os.kill(pid, 0)
    except ProcessLookupError:
        return False
    except PermissionError:
        # Process exists but is owned by another user / has tighter ACLs
        return True
    except (OSError, OverflowError):
        # OverflowError: the lock file held a number too large to be a PID.
        return False
    return True
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
def _read_lock_pid(lock_path: Path) -> Optional[int]:
    """Return the PID recorded on the first line of ``lock_path``.

    Returns ``None`` when the file cannot be read, is empty, or its first line
    is not an integer.
    """
    try:
        contents = lock_path.read_text(encoding="utf-8")
        first_line = contents.strip().splitlines()[0]
        return int(first_line)
    except (IndexError, OSError, ValueError):
        return None
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
def _lock_path() -> Path:
    """Build the lock-file location: ``LOCK_FILENAME`` inside ``config.data_dir``."""
    data_dir = config.data_dir
    return data_dir / LOCK_FILENAME
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
def _remove_if_ours(lock_path: Path) -> None:
    """Delete ``lock_path``, but only while it still records this process's PID.

    A lock that names a different PID (or cannot be parsed) is left untouched.
    Deletion is best-effort: any ``OSError`` is ignored.
    """
    owner_pid = _read_lock_pid(lock_path)
    if owner_pid != os.getpid():
        return
    try:
        lock_path.unlink()
    except OSError:
        # Includes FileNotFoundError (already gone). For anything else, the
        # stale-PID check on the next startup will recover.
        pass
|
|
109
|
+
|
|
110
|
+
|
|
111
|
+
@contextmanager
def single_instance_lock() -> Iterator[Optional[Path]]:
    """Acquire the single-instance lock if the opt-in flag is set.

    No-op when ``KNOWLEDGE_RAG_SINGLE_INSTANCE`` is unset / falsy — yields ``None``
    and the caller proceeds normally with no side effects on disk.

    When enabled:
    - Creates ``<data_dir>/knowledge-rag.lock`` containing this process's PID.
    - Raises :class:`AlreadyRunningError` if another live PID already holds it.
    - Recovers stale locks: PID no longer running, unreadable content, or a
      lock that records *our own* PID (left by an earlier process whose PID
      was reused, e.g. a restarted container).
    - Registers SIGINT/SIGTERM handlers that remove the lock and re-raise.
    - Removes the lock on normal exit via ``finally``.

    Not re-entrant: entering it twice in one process re-creates the lock, and
    the inner exit removes it.

    Yields:
        The lock file path when the guard is enabled, otherwise ``None``.

    Raises:
        AlreadyRunningError: another live instance holds the lock, or a stale
            lock file could not be removed.
    """
    if not single_instance_enabled():
        yield None
        return

    config.data_dir.mkdir(parents=True, exist_ok=True)
    lock_path = _lock_path()
    own_pid = os.getpid()

    while True:
        try:
            # O_EXCL makes creation atomic: exactly one process can create the file.
            fd = os.open(lock_path, os.O_CREAT | os.O_EXCL | os.O_WRONLY, 0o644)
        except FileExistsError:
            pid = _read_lock_pid(lock_path)
            # A lock naming our own PID cannot belong to another live server:
            # this process has not written its lock yet, so the file is a
            # leftover from a dead process that happened to have the same PID.
            if pid is not None and pid != own_pid and _pid_is_running(pid):
                raise AlreadyRunningError(
                    f"knowledge-rag MCP server is already running (pid {pid}). "
                    f"Refusing to start a second instance because "
                    f"{ENV_VAR} is enabled."
                )
            try:
                lock_path.unlink()
            except FileNotFoundError:
                pass
            except OSError as exc:
                raise AlreadyRunningError(f"Failed to clear stale lock {lock_path}: {exc}") from exc
            continue

        with os.fdopen(fd, "w", encoding="utf-8") as f:
            f.write(f"{own_pid}\n")
        break

    # Wire signal handlers so SIGINT/SIGTERM clean up the lock before exit
    previous_handlers: dict[int, object] = {}

    def _signal_cleanup(signum: int, frame) -> None:
        _remove_if_ours(lock_path)
        # Restore original handler and re-raise so default action runs
        prev = previous_handlers.get(signum, signal.SIG_DFL)
        if prev is None:
            # getsignal() returns None for handlers not installed from Python;
            # those cannot be reinstalled, so fall back to the default action.
            prev = signal.SIG_DFL
        try:
            signal.signal(signum, prev)  # type: ignore[arg-type]
        except (ValueError, OSError):
            pass
        # Re-send the signal to ourselves so the original disposition fires
        os.kill(os.getpid(), signum)

    for sig_name in ("SIGINT", "SIGTERM"):
        sig = getattr(signal, sig_name, None)
        if sig is None:
            continue
        try:
            previous_handlers[sig] = signal.getsignal(sig)
            signal.signal(sig, _signal_cleanup)
        except (ValueError, OSError):
            # signal.signal raises if not on the main thread; tests may hit this
            pass

    try:
        yield lock_path
    finally:
        _remove_if_ours(lock_path)
        for sig, prev in previous_handlers.items():
            if prev is None:
                # Handler was installed outside Python; signal.signal() would
                # reject None with TypeError, so leave the disposition alone.
                continue
            try:
                signal.signal(sig, prev)  # type: ignore[arg-type]
            except (ValueError, OSError):
                pass
|
|
@@ -129,6 +129,18 @@ class QueryCache:
|
|
|
129
129
|
# =============================================================================
|
|
130
130
|
|
|
131
131
|
|
|
132
|
+
class EmbeddingError(RuntimeError):
    """Embedding generation failed even though the model had loaded successfully."""
|
|
134
|
+
|
|
135
|
+
|
|
136
|
+
class EmbeddingModelLoadError(RuntimeError):
    """The embedding model itself could not be loaded.

    Kept separate from EmbeddingError so that callers can tell a hard
    configuration problem (the model never loaded) apart from a runtime
    failure during embedding generation, which may be transient and worth
    retrying.
    """
|
|
142
|
+
|
|
143
|
+
|
|
132
144
|
class FastEmbedEmbeddings:
|
|
133
145
|
"""
|
|
134
146
|
FastEmbed-based embedding function for ChromaDB (v1.4.0+ compatible).
|
|
@@ -136,6 +148,17 @@ class FastEmbedEmbeddings:
|
|
|
136
148
|
Uses ONNX Runtime in-process for embedding generation.
|
|
137
149
|
No external server required (replaces Ollama).
|
|
138
150
|
Model: BAAI/bge-small-en-v1.5 (384-dim, MTEB score 62.x)
|
|
151
|
+
|
|
152
|
+
Lazy-loading (since v3.8.0):
|
|
153
|
+
The ONNX model (~200MB resident) is NOT loaded in __init__.
|
|
154
|
+
It loads on the first call to __call__/embed_query/embed_documents.
|
|
155
|
+
This makes idle MCP server processes cheap, which matters when
|
|
156
|
+
multiple stdio clients spawn parallel knowledge-rag processes
|
|
157
|
+
(e.g. multiple Claude Code windows). The CrossEncoderReranker
|
|
158
|
+
already follows this same pattern.
|
|
159
|
+
|
|
160
|
+
Thread-safe: load is guarded by a lock so concurrent first-callers
|
|
161
|
+
don't double-initialize the model.
|
|
139
162
|
"""
|
|
140
163
|
|
|
141
164
|
@staticmethod
|
|
@@ -174,24 +197,62 @@ class FastEmbedEmbeddings:
|
|
|
174
197
|
def __init__(self, model: str = None):
|
|
175
198
|
self.model_name = model or config.embedding_model
|
|
176
199
|
self._dim = config.embedding_dim
|
|
177
|
-
kwargs
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
|
|
200
|
+
# Build kwargs once; defer the heavy TextEmbedding(**kwargs) call to first use.
|
|
201
|
+
self._init_kwargs = {"model_name": self.model_name, "cache_dir": str(config.models_cache_dir)}
|
|
202
|
+
self._gpu = bool(config.gpu_acceleration)
|
|
203
|
+
self._model: Optional[TextEmbedding] = None
|
|
204
|
+
self._load_lock = threading.Lock()
|
|
205
|
+
# Sticky failure flag: once load fails, subsequent calls re-raise immediately
|
|
206
|
+
# instead of looping through download/retry. Same pattern as CrossEncoderReranker.
|
|
207
|
+
self._load_failed: Optional[Exception] = None
|
|
208
|
+
|
|
209
|
+
def _load_model(self) -> None:
|
|
210
|
+
"""Load the ONNX model on demand. Idempotent and thread-safe.
|
|
211
|
+
|
|
212
|
+
Raises:
|
|
213
|
+
EmbeddingModelLoadError: when the underlying ONNX runtime cannot
|
|
214
|
+
instantiate the model (missing files, hash mismatch, etc.). The
|
|
215
|
+
exception is sticky — subsequent calls raise the same error
|
|
216
|
+
without retrying so callers do not loop through HF downloads.
|
|
217
|
+
"""
|
|
218
|
+
if self._model is not None:
|
|
219
|
+
return
|
|
220
|
+
if self._load_failed is not None:
|
|
221
|
+
raise EmbeddingModelLoadError(
|
|
222
|
+
f"Embedding model previously failed to load: {self._load_failed}"
|
|
223
|
+
) from self._load_failed
|
|
224
|
+
with self._load_lock:
|
|
225
|
+
if self._model is not None: # double-checked under the lock
|
|
226
|
+
return
|
|
227
|
+
if self._load_failed is not None:
|
|
228
|
+
raise EmbeddingModelLoadError(
|
|
229
|
+
f"Embedding model previously failed to load: {self._load_failed}"
|
|
230
|
+
) from self._load_failed
|
|
231
|
+
kwargs = dict(self._init_kwargs)
|
|
182
232
|
try:
|
|
183
|
-
self.
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
|
|
187
|
-
|
|
188
|
-
|
|
189
|
-
|
|
190
|
-
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
|
|
233
|
+
if self._gpu:
|
|
234
|
+
self._setup_cuda_dll_paths()
|
|
235
|
+
kwargs["providers"] = ["CUDAExecutionProvider", "CPUExecutionProvider"]
|
|
236
|
+
print(f"[INFO] Loading embedding model: {self.model_name} ({self._dim}D) [GPU accelerated]...")
|
|
237
|
+
try:
|
|
238
|
+
self._model = TextEmbedding(**kwargs)
|
|
239
|
+
print("[INFO] Embedding model loaded successfully [GPU]")
|
|
240
|
+
except (ValueError, RuntimeError) as e:
|
|
241
|
+
print(f"[WARN] GPU init failed ({e}), falling back to CPU...")
|
|
242
|
+
kwargs["providers"] = ["CPUExecutionProvider"]
|
|
243
|
+
self._model = TextEmbedding(**kwargs)
|
|
244
|
+
print("[INFO] Embedding model loaded successfully [CPU fallback]")
|
|
245
|
+
else:
|
|
246
|
+
kwargs["providers"] = ["CPUExecutionProvider"]
|
|
247
|
+
print(f"[INFO] Loading embedding model: {self.model_name} ({self._dim}D)...")
|
|
248
|
+
self._model = TextEmbedding(**kwargs)
|
|
249
|
+
print("[INFO] Embedding model loaded successfully")
|
|
250
|
+
except Exception as exc:
|
|
251
|
+
# ONNXRuntimeError, FileNotFoundError, etc. — record and re-raise loud
|
|
252
|
+
self._load_failed = exc
|
|
253
|
+
self._model = None
|
|
254
|
+
print(f"[ERROR] Embedding model load FAILED: {exc}", file=sys.stderr)
|
|
255
|
+
raise EmbeddingModelLoadError(f"Failed to load embedding model: {exc}") from exc
|
|
195
256
|
|
|
196
257
|
def __call__(self, input: List[str]) -> List[List[float]]:
|
|
197
258
|
"""
|
|
@@ -199,16 +260,38 @@ class FastEmbedEmbeddings:
|
|
|
199
260
|
|
|
200
261
|
ChromaDB embedding_function interface: __call__(input: List[str]) -> List[List[float]]
|
|
201
262
|
FastEmbed.embed() returns a generator, so we consume it into a list.
|
|
263
|
+
|
|
264
|
+
Raises:
|
|
265
|
+
EmbeddingModelLoadError: when the model could not be loaded.
|
|
266
|
+
EmbeddingError: when embedding generation fails after a successful load.
|
|
267
|
+
|
|
268
|
+
Behavior note (changed in v3.8.1):
|
|
269
|
+
Previously this method swallowed any exception and returned vectors
|
|
270
|
+
of zeros (``[[0.0]*dim for _ in input]``). That silently corrupted
|
|
271
|
+
the index — ChromaDB stored zero vectors as document embeddings,
|
|
272
|
+
``count()`` returned the right number of chunks, smart-reindex
|
|
273
|
+
would skip them as "already indexed", and queries returned garbage
|
|
274
|
+
similarity scores. Failures are now LOUD: the caller (ChromaDB
|
|
275
|
+
``add()``, MCP search tool, etc.) sees the real error and can
|
|
276
|
+
surface it to the user.
|
|
202
277
|
"""
|
|
203
278
|
if not input:
|
|
204
279
|
return []
|
|
205
280
|
|
|
281
|
+
self._load_model() # may raise EmbeddingModelLoadError
|
|
206
282
|
try:
|
|
207
283
|
embeddings = list(self._model.embed(input))
|
|
208
|
-
|
|
209
|
-
|
|
210
|
-
|
|
211
|
-
|
|
284
|
+
except Exception as exc:
|
|
285
|
+
print(f"[ERROR] Embedding generation FAILED: {exc}", file=sys.stderr)
|
|
286
|
+
raise EmbeddingError(f"Embedding generation failed: {exc}") from exc
|
|
287
|
+
|
|
288
|
+
# Sanity check: model returned the right number of vectors with the right dim
|
|
289
|
+
if len(embeddings) != len(input):
|
|
290
|
+
raise EmbeddingError(f"Embedding count mismatch: expected {len(input)}, got {len(embeddings)}")
|
|
291
|
+
result = [emb.tolist() for emb in embeddings]
|
|
292
|
+
if result and len(result[0]) != self._dim:
|
|
293
|
+
raise EmbeddingError(f"Embedding dim mismatch: expected {self._dim}, got {len(result[0])}")
|
|
294
|
+
return result
|
|
212
295
|
|
|
213
296
|
def name(self) -> str:
|
|
214
297
|
"""Return embedding function name (required by ChromaDB v1.4.0+)"""
|
|
@@ -1934,48 +2017,58 @@ def main():
|
|
|
1934
2017
|
_handle_init()
|
|
1935
2018
|
return
|
|
1936
2019
|
|
|
2020
|
+
from .instance_lock import (
|
|
2021
|
+
ALREADY_RUNNING_EXIT_CODE,
|
|
2022
|
+
AlreadyRunningError,
|
|
2023
|
+
single_instance_lock,
|
|
2024
|
+
)
|
|
1937
2025
|
from .preflight import run_preflight
|
|
1938
2026
|
|
|
1939
|
-
|
|
2027
|
+
try:
|
|
2028
|
+
with single_instance_lock():
|
|
2029
|
+
run_preflight()
|
|
1940
2030
|
|
|
1941
|
-
|
|
2031
|
+
orchestrator = get_orchestrator()
|
|
1942
2032
|
|
|
1943
|
-
|
|
1944
|
-
|
|
1945
|
-
|
|
1946
|
-
|
|
1947
|
-
|
|
1948
|
-
|
|
1949
|
-
|
|
1950
|
-
|
|
1951
|
-
|
|
1952
|
-
|
|
1953
|
-
|
|
1954
|
-
|
|
1955
|
-
|
|
1956
|
-
|
|
1957
|
-
|
|
1958
|
-
|
|
1959
|
-
|
|
1960
|
-
|
|
2033
|
+
# Migration: check dimension mismatch AFTER full init (avoids segfault during __init__)
|
|
2034
|
+
orchestrator._needs_rebuild = orchestrator._check_dimension_mismatch()
|
|
2035
|
+
if orchestrator._needs_rebuild:
|
|
2036
|
+
print("[MIGRATION] Running nuclear rebuild for embedding model change...")
|
|
2037
|
+
try:
|
|
2038
|
+
stats = orchestrator.nuclear_rebuild()
|
|
2039
|
+
print(
|
|
2040
|
+
f"[MIGRATION] Rebuild complete: {stats['indexed']} docs, "
|
|
2041
|
+
f"{stats['chunks_added']} chunks in {stats.get('elapsed_seconds', '?')}s"
|
|
2042
|
+
)
|
|
2043
|
+
except Exception as e:
|
|
2044
|
+
print(f"[ERROR] Migration failed: {e}")
|
|
2045
|
+
print("[FALLBACK] Attempting regular index instead...")
|
|
2046
|
+
stats = orchestrator.index_all(force=True)
|
|
2047
|
+
elif orchestrator.collection.count() == 0:
|
|
2048
|
+
print("[INFO] No documents indexed. Running initial indexing...")
|
|
2049
|
+
stats = orchestrator.index_all()
|
|
2050
|
+
print(f"[INFO] Indexed {stats['indexed']} documents with {stats['chunks_added']} chunks")
|
|
2051
|
+
|
|
2052
|
+
# Start file watcher for auto-reindex on document changes
|
|
2053
|
+
try:
|
|
2054
|
+
watcher = DocumentWatcher(get_orchestrator, debounce_seconds=5.0)
|
|
2055
|
+
observer = Observer()
|
|
2056
|
+
observer.schedule(watcher, str(config.documents_dir), recursive=True)
|
|
2057
|
+
observer.daemon = True
|
|
2058
|
+
observer.start()
|
|
2059
|
+
print(f"[WATCHER] Monitoring {config.documents_dir} for changes")
|
|
2060
|
+
except Exception as e:
|
|
2061
|
+
print(f"[WARN] Failed to start file watcher: {e}")
|
|
2062
|
+
print("[WARN] Auto-reindexing disabled. Use reindex_documents tool manually.")
|
|
1961
2063
|
|
|
1962
|
-
|
|
1963
|
-
|
|
1964
|
-
|
|
1965
|
-
|
|
1966
|
-
|
|
1967
|
-
|
|
1968
|
-
|
|
1969
|
-
|
|
1970
|
-
except Exception as e:
|
|
1971
|
-
print(f"[WARN] Failed to start file watcher: {e}")
|
|
1972
|
-
print("[WARN] Auto-reindexing disabled. Use reindex_documents tool manually.")
|
|
1973
|
-
|
|
1974
|
-
# Restore real stdout for MCP JSON-RPC, keep print() going to stderr
|
|
1975
|
-
from . import _original_stdout
|
|
1976
|
-
|
|
1977
|
-
sys.stdout = _original_stdout
|
|
1978
|
-
mcp.run()
|
|
2064
|
+
# Restore real stdout for MCP JSON-RPC, keep print() going to stderr
|
|
2065
|
+
from . import _original_stdout
|
|
2066
|
+
|
|
2067
|
+
sys.stdout = _original_stdout
|
|
2068
|
+
mcp.run()
|
|
2069
|
+
except AlreadyRunningError as e:
|
|
2070
|
+
print(f"[ERROR] {e}", file=sys.stderr)
|
|
2071
|
+
raise SystemExit(ALREADY_RUNNING_EXIT_CODE) from e
|
|
1979
2072
|
|
|
1980
2073
|
|
|
1981
2074
|
if __name__ == "__main__":
|
|
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "knowledge-rag"
|
|
7
|
-
version = "3.7.0"
|
|
7
|
+
version = "3.8.1"
|
|
8
8
|
description = "Local RAG System for Claude Code — Hybrid search + Cross-encoder Reranking + 12 MCP Tools + 20 Format Parsers. Zero external servers."
|
|
9
9
|
readme = "README.md"
|
|
10
10
|
license = {text = "MIT"}
|
|
@@ -95,6 +95,11 @@ exclude = [
|
|
|
95
95
|
[tool.pytest.ini_options]
|
|
96
96
|
testpaths = ["tests"]
|
|
97
97
|
pythonpath = ["."]
|
|
98
|
+
# Limit retained tmp_path directories to avoid pytest's atexit cleanup race
|
|
99
|
+
# on Windows (cleanup_numbered_dir + pathlib.glob "garbage-*" can fail when
|
|
100
|
+
# many tmp dirs accumulate). Tests run isolated; we don't need history.
|
|
101
|
+
tmp_path_retention_count = 1
|
|
102
|
+
tmp_path_retention_policy = "failed"
|
|
98
103
|
|
|
99
104
|
[tool.ruff]
|
|
100
105
|
target-version = "py311"
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|