cfunklabs-rag-react-docs 0.1.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cfunklabs_rag_react_docs-0.1.2.dist-info/METADATA +228 -0
- cfunklabs_rag_react_docs-0.1.2.dist-info/RECORD +10 -0
- cfunklabs_rag_react_docs-0.1.2.dist-info/WHEEL +4 -0
- cfunklabs_rag_react_docs-0.1.2.dist-info/entry_points.txt +2 -0
- rag_react_docs/__init__.py +8 -0
- rag_react_docs/config.py +49 -0
- rag_react_docs/datastore.py +125 -0
- rag_react_docs/retrieval.py +48 -0
- rag_react_docs/server.py +83 -0
- rag_react_docs/source_label.py +26 -0
|
@@ -0,0 +1,228 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: cfunklabs-rag-react-docs
|
|
3
|
+
Version: 0.1.2
|
|
4
|
+
Summary: Retrieval-only MCP server over the indexed React documentation, with a prebuilt index downloaded on first run.
|
|
5
|
+
Project-URL: Homepage, https://github.com/cfunklabs/rag-react-docs
|
|
6
|
+
Project-URL: Repository, https://github.com/cfunklabs/rag-react-docs
|
|
7
|
+
Author: Corey Funk
|
|
8
|
+
License: MIT
|
|
9
|
+
Keywords: chromadb,documentation,mcp,rag,react,retrieval
|
|
10
|
+
Classifier: Development Status :: 4 - Beta
|
|
11
|
+
Classifier: Intended Audience :: Developers
|
|
12
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
13
|
+
Classifier: Programming Language :: Python :: 3
|
|
14
|
+
Classifier: Topic :: Software Development :: Documentation
|
|
15
|
+
Requires-Python: >=3.14
|
|
16
|
+
Requires-Dist: chromadb>=1.5.9
|
|
17
|
+
Requires-Dist: mcp>=1.28.1
|
|
18
|
+
Requires-Dist: platformdirs>=4.0.0
|
|
19
|
+
Description-Content-Type: text/markdown
|
|
20
|
+
|
|
21
|
+
# cfunklabs-rag-react-docs
|
|
22
|
+
|
|
23
|
+
Backend for the RAG demo, built with LangChain, LangGraph, Anthropic Claude, and ChromaDB.
|
|
24
|
+
It also ships a retrieval-only MCP server, published to PyPI as `cfunklabs-rag-react-docs`,
|
|
25
|
+
that serves grounding context from the indexed React documentation (see [MCP server](#mcp-server)).
|
|
26
|
+
|
|
27
|
+
## Prerequisites
|
|
28
|
+
|
|
29
|
+
- [Python 3.14+](https://www.python.org/downloads/)
|
|
30
|
+
- [uv](https://docs.astral.sh/uv/getting-started/installation/) — used for dependency management and running the project
|
|
31
|
+
- An [Anthropic API key](https://console.anthropic.com/)
|
|
32
|
+
|
|
33
|
+
## Setup
|
|
34
|
+
|
|
35
|
+
All commands should be run from the `backend` directory.
|
|
36
|
+
|
|
37
|
+
### 1. Install dependencies
|
|
38
|
+
|
|
39
|
+
```bash
|
|
40
|
+
uv sync
|
|
41
|
+
```
|
|
42
|
+
|
|
43
|
+
### 2. Configure environment variables
|
|
44
|
+
|
|
45
|
+
Copy the sample env file and add your Anthropic API key:
|
|
46
|
+
|
|
47
|
+
```bash
|
|
48
|
+
cp .env.sample .env
|
|
49
|
+
```
|
|
50
|
+
|
|
51
|
+
Open `.env` and set your key:
|
|
52
|
+
|
|
53
|
+
```
|
|
54
|
+
ANTHROPIC_API_KEY=your_api_key_here
|
|
55
|
+
```
|
|
56
|
+
|
|
57
|
+
### 3. Initialize the vector database
|
|
58
|
+
|
|
59
|
+
Create the ChromaDB collection used to store document embeddings:
|
|
60
|
+
|
|
61
|
+
```bash
|
|
62
|
+
uv run src/utils/init_db.py
|
|
63
|
+
```
|
|
64
|
+
|
|
65
|
+
This creates a persistent ChromaDB store under `rag_datastore/` and a collection
|
|
66
|
+
named after `tool.rag_db.rag_doc_collection_name` in [pyproject.toml](pyproject.toml).
|
|
67
|
+
The `rag_datastore/` directory is gitignored.
|
|
68
|
+
|
|
69
|
+
### 4. Fetch the React docs dataset
|
|
70
|
+
|
|
71
|
+
The corpus is a local mirror of the [React documentation](https://react.dev/) markdown files. Download it with:
|
|
72
|
+
|
|
73
|
+
```bash
|
|
74
|
+
uv run src/utils/fetch_react_docs.py
|
|
75
|
+
```
|
|
76
|
+
|
|
77
|
+
This script:
|
|
78
|
+
|
|
79
|
+
1. Fetches the index at [react.dev/llms.txt](https://react.dev/llms.txt)
|
|
80
|
+
2. Follows every linked `https://react.dev/*.md` URL
|
|
81
|
+
3. Saves each page under `docs/` at the **repository root**, using the index heading structure as directories and each file's frontmatter `title` as the filename
|
|
82
|
+
|
|
83
|
+
Example output path:
|
|
84
|
+
|
|
85
|
+
```
|
|
86
|
+
docs/API Reference/React/Components/Built-in React Components.md
|
|
87
|
+
```
|
|
88
|
+
|
|
89
|
+
The `docs/` directory is gitignored — run the script locally after cloning and re-run it anytime to refresh the dataset.
|
|
90
|
+
|
|
91
|
+
## Running
|
|
92
|
+
|
|
93
|
+
### Startup check
|
|
94
|
+
|
|
95
|
+
Run `main.py` to verify your environment is configured correctly. It prints installed package versions and performs a live health check against the Claude API:
|
|
96
|
+
|
|
97
|
+
```bash
|
|
98
|
+
uv run main.py
|
|
99
|
+
```
|
|
100
|
+
|
|
101
|
+
A passing run looks like:
|
|
102
|
+
|
|
103
|
+
```
|
|
104
|
+
Versions:
|
|
105
|
+
- langchain_core_version: x.x.x
|
|
106
|
+
- langgraph_version: x.x.x
|
|
107
|
+
- langchain_anthropic_version: x.x.x
|
|
108
|
+
|
|
109
|
+
Checking LLM service health... PASSED
|
|
110
|
+
```
|
|
111
|
+
|
|
112
|
+
If the health check fails, confirm that `ANTHROPIC_API_KEY` is set correctly in your `.env` file.
|
|
113
|
+
|
|
114
|
+
### Ingesting documents
|
|
115
|
+
|
|
116
|
+
Chunk every Markdown file under `docs/`, embed the chunks, and upsert them into the vector store:
|
|
117
|
+
|
|
118
|
+
```bash
|
|
119
|
+
uv run main.py
|
|
120
|
+
```
|
|
121
|
+
|
|
122
|
+
To process a single file (useful while iterating on chunking), pass `--md_file_path`. Add
|
|
123
|
+
`--evaluate_chunking` to write LLM-as-judge quality reports to `evals/results`, or
|
|
124
|
+
`--print_chunks` to dump each chunk to stdout.
|
|
125
|
+
|
|
126
|
+
### Querying
|
|
127
|
+
|
|
128
|
+
Ask a question against the ingested docs. The query pipeline is a LangGraph graph
|
|
129
|
+
(`retrieve` -> `generate`) that embeds your question with the same model used at ingestion,
|
|
130
|
+
retrieves the most similar chunks from ChromaDB, and has Claude answer using only that
|
|
131
|
+
context. The answer streams token-by-token and is followed by the cited sources:
|
|
132
|
+
|
|
133
|
+
```bash
|
|
134
|
+
uv run query.py "How does memo work?"
|
|
135
|
+
```
|
|
136
|
+
|
|
137
|
+
Options:
|
|
138
|
+
|
|
139
|
+
- `--k N` — number of chunks to retrieve (default `top_k` in [pyproject.toml](pyproject.toml)).
|
|
140
|
+
- `--no-stream` — wait for the full answer instead of streaming tokens.
|
|
141
|
+
- `--show-scores` — show the retrieval distance for each cited source.
|
|
142
|
+
- `-v`, `--verbose` — show diagnostic output (the preflight datastore and LLM health checks). Hidden by default.
|
|
143
|
+
|
|
144
|
+
Retrieval and generation settings live under `[tool.rag_query]` in [pyproject.toml](pyproject.toml)
|
|
145
|
+
(`top_k` and `generation_model`). Run `uv run main.py` first — querying requires a populated
|
|
146
|
+
collection.
|
|
147
|
+
|
|
148
|
+
### MCP server
|
|
149
|
+
|
|
150
|
+
In addition to the CLI, the retrieval pipeline is exposed as an [MCP](https://modelcontextprotocol.io/)
|
|
151
|
+
server over stdio, so MCP clients (Cursor, Claude Desktop, etc.) can pull grounding context
|
|
152
|
+
directly. It exposes a single **retrieval-only** tool:
|
|
153
|
+
|
|
154
|
+
- `search_docs(question, k?)` — embeds the question with the same model used at ingestion,
|
|
155
|
+
retrieves the most similar chunks from ChromaDB, and returns each chunk's `source` label,
|
|
156
|
+
`content`, and retrieval `distance`. The client LLM generates the answer from those chunks,
|
|
157
|
+
so no Anthropic key is needed to run the server.
|
|
158
|
+
|
|
159
|
+
#### For end users (published package)
|
|
160
|
+
|
|
161
|
+
The server is published to PyPI as **`cfunklabs-rag-react-docs`**. End users don't clone the
|
|
162
|
+
repo or run the ingestion pipeline — the prebuilt index (~34 MB) is downloaded from a GitHub
|
|
163
|
+
Release and cached on first run. Just register it with your MCP client:
|
|
164
|
+
|
|
165
|
+
```json
|
|
166
|
+
{
|
|
167
|
+
"mcpServers": {
|
|
168
|
+
"rag-react-docs": {
|
|
169
|
+
"command": "uvx",
|
|
170
|
+
"args": ["cfunklabs-rag-react-docs"]
|
|
171
|
+
}
|
|
172
|
+
}
|
|
173
|
+
}
|
|
174
|
+
```
|
|
175
|
+
|
|
176
|
+
The first launch needs network access to fetch the index; subsequent runs read from the local
|
|
177
|
+
cache (`platformdirs` cache dir) and work offline. Optional environment overrides: `RAG_TOP_K`,
|
|
178
|
+
`RAG_COLLECTION_NAME`, `RAG_INDEX_URL`, and `RAG_DATASTORE_DIR`.
|
|
179
|
+
|
|
180
|
+
#### Local development
|
|
181
|
+
|
|
182
|
+
Run the dev server from the `backend` directory (so `pyproject.toml` resolves) against the
|
|
183
|
+
locally-built `rag_datastore`:
|
|
184
|
+
|
|
185
|
+
```bash
|
|
186
|
+
uv run mcp_server.py
|
|
187
|
+
```
|
|
188
|
+
|
|
189
|
+
When run standalone this way, the server prints a short startup banner to stderr and then blocks
|
|
190
|
+
silently by design — the stdio transport reserves stdout for the JSON-RPC protocol, so it
|
|
191
|
+
waits for a client to connect rather than logging. Running it directly is mainly a smoke test;
|
|
192
|
+
press Ctrl+C to stop.
|
|
193
|
+
|
|
194
|
+
Run `uv run main.py` first — the dev server needs a populated collection. For interactive
|
|
195
|
+
testing, launch the MCP Inspector with `uv run mcp dev mcp_server.py`.
|
|
196
|
+
|
|
197
|
+
### Publishing to PyPI
|
|
198
|
+
|
|
199
|
+
The published package (`cfunklabs-rag-react-docs`) contains only the retrieval + MCP server
|
|
200
|
+
(the import package `rag_react_docs` under `src/`). Ingestion/query tooling and `src/utils/*`
|
|
201
|
+
are dev-only and excluded from the wheel.
|
|
202
|
+
|
|
203
|
+
Two artifacts get published: the Python package (to PyPI) and the prebuilt index (to a GitHub
|
|
204
|
+
Release). They version independently — the index version is pinned as `INDEX_VERSION` in
|
|
205
|
+
[src/rag_react_docs/config.py](src/rag_react_docs/config.py).
|
|
206
|
+
|
|
207
|
+
1. Build and upload the index archive (after `uv run main.py` has populated `rag_datastore`):
|
|
208
|
+
|
|
209
|
+
```bash
|
|
210
|
+
uv run scripts/build_index_archive.py
|
|
211
|
+
gh release create index-19-2-v1 dist/rag-index-19-2-v1.tar.gz dist/rag-index-19-2-v1.tar.gz.sha256
|
|
212
|
+
```
|
|
213
|
+
|
|
214
|
+
2. Build and publish the package (test on TestPyPI first):
|
|
215
|
+
|
|
216
|
+
```bash
|
|
217
|
+
uv build # -> dist/ wheel + sdist (only rag_react_docs)
|
|
218
|
+
uv publish --publish-url https://test.pypi.org/legacy/ # TestPyPI dry run
|
|
219
|
+
uv publish # PyPI
|
|
220
|
+
```
|
|
221
|
+
|
|
222
|
+
The index release tag follows the scheme `index-<react-version>-v<incremental>` (e.g.
|
|
223
|
+
`index-19-2-v1`), composed in [src/rag_react_docs/config.py](src/rag_react_docs/config.py) from
|
|
224
|
+
`REACT_VERSION` and `INDEX_REVISION`. Bump `REACT_VERSION` when re-fetching the docs for a new
|
|
225
|
+
React release, and bump `INDEX_REVISION` for re-chunk or embedding-model changes within the same
|
|
226
|
+
React version. Either bump changes the release tag/asset name and cache path, so clients pull a
|
|
227
|
+
fresh, compatible index instead of reusing a stale cache — re-release the archive under the new
|
|
228
|
+
`index-<react-version>-v<incremental>` tag.
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
rag_react_docs/__init__.py,sha256=nupgbeHjZH1ctdSVHnuX6mk7daPKoHGlLov9jDd0mwc,351
|
|
2
|
+
rag_react_docs/config.py,sha256=d8nR65kfBkpkbrdroAYz5qQFfqGSCAXiXBSp39bwduw,2289
|
|
3
|
+
rag_react_docs/datastore.py,sha256=lGB7EvctHa-cVayY3-YSIPaeqfHaQjtVleHNylNYnrA,4540
|
|
4
|
+
rag_react_docs/retrieval.py,sha256=wtxZ7SqbpTApKeTT_5A5eToij3Qw9LvAsC8XGB89JA8,1879
|
|
5
|
+
rag_react_docs/server.py,sha256=Z0xK7KVy06nVOy-w715DSykenMULpBYbc1gPfW8dSSg,3146
|
|
6
|
+
rag_react_docs/source_label.py,sha256=-4q3yknGmI8eT_CBZaaOABPXSPVXdwG3WiyOI3Vm1jA,910
|
|
7
|
+
cfunklabs_rag_react_docs-0.1.2.dist-info/METADATA,sha256=4vNJ61N18foC6ZgwB0DhUQ3A620qlZfpIsoVvEnFE68,8239
|
|
8
|
+
cfunklabs_rag_react_docs-0.1.2.dist-info/WHEEL,sha256=mffPy8wBnZQn2VnJUU5jE99KsxaSfiyMHV9Yt0aLVxs,87
|
|
9
|
+
cfunklabs_rag_react_docs-0.1.2.dist-info/entry_points.txt,sha256=gANoXAUJMNFOzlDJH_S93cJXYkUzYJI6ldkRtf8LXk8,72
|
|
10
|
+
cfunklabs_rag_react_docs-0.1.2.dist-info/RECORD,,
|
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
"""rag-react-docs: a retrieval-only MCP server over the indexed React documentation.
|
|
2
|
+
|
|
3
|
+
Distributed on PyPI as `cfunklabs-rag-react-docs`; imported as `rag_react_docs`. The prebuilt
|
|
4
|
+
ChromaDB index is downloaded from a GitHub Release on first run (see `datastore.py`), so end
|
|
5
|
+
users never run the ingestion pipeline themselves.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
# Package version string; it matches the `Version` field of this wheel's METADATA (0.1.2).
__version__ = "0.1.2"
|
rag_react_docs/config.py
ADDED
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
"""Runtime configuration for the published package.
|
|
2
|
+
|
|
3
|
+
Unlike the dev tooling (which reads `pyproject.toml`), the installed package has no repo
|
|
4
|
+
checkout to read from, so defaults are baked in here and overridable via environment variables.
|
|
5
|
+
This keeps the wheel self-contained while still letting power users retarget the collection,
|
|
6
|
+
retrieval depth, index location, or download URL from their MCP client config.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
import os
|
|
10
|
+
from pathlib import Path
|
|
11
|
+
|
|
12
|
+
import platformdirs
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
# The all-MiniLM-L6-v2 vectors were ingested into this collection; the name must match the
# collection stored inside the downloaded index archive.
COLLECTION_NAME = os.environ.get("RAG_COLLECTION_NAME", "rag_doc_collection")

# Default number of chunks returned per search. The value is parsed with int() when this
# module is imported, so a non-integer RAG_TOP_K raises ValueError at import time.
DEFAULT_TOP_K = int(os.environ.get("RAG_TOP_K", "5"))

# Index version standard: index-<react-version>-v<incremental> (e.g. index-19-2-v1).
# REACT_VERSION is the React docs version with dots as dashes; bump it when the corpus is
# re-fetched for a new React release. INDEX_REVISION bumps for re-chunk/embedding changes
# within the same React version. INDEX_VERSION is part of both the cache path and the release
# asset name, so any bump forces a fresh download rather than reusing a stale cached index.
# Note that INDEX_VERSION itself evaluates to "19-2-v1"; the "index-" / "rag-index-" prefixes
# are added where the download URL is assembled below.
REACT_VERSION = "19-2"
INDEX_REVISION = "v1"
INDEX_VERSION = f"{REACT_VERSION}-{INDEX_REVISION}"

# The prebuilt index is published as a GitHub Release asset. A sibling `<archive>.sha256` file
# is fetched alongside it to verify the download before extraction.
_DEFAULT_INDEX_URL = (
    "https://github.com/cfunklabs/rag-react-docs/releases/download/"
    f"index-{INDEX_VERSION}/rag-index-{INDEX_VERSION}.tar.gz"
)
# RAG_INDEX_URL replaces the whole URL, including the version-derived path segments.
INDEX_URL = os.environ.get("RAG_INDEX_URL", _DEFAULT_INDEX_URL)
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
def datastore_dir() -> Path:
    """Locate the directory where the extracted ChromaDB index lives (or will live).

    When `RAG_DATASTORE_DIR` is set to a non-empty value it wins, with a leading `~`
    expanded to the user's home directory. Otherwise the location is the per-user cache
    directory for "cfunklabs-rag-react-docs", under `index/<INDEX_VERSION>`, so each index
    version gets its own directory.
    """
    custom_location = os.getenv("RAG_DATASTORE_DIR")
    if not custom_location:
        cache_root = Path(platformdirs.user_cache_dir("cfunklabs-rag-react-docs"))
        return cache_root / "index" / INDEX_VERSION
    return Path(custom_location).expanduser()
|
|
@@ -0,0 +1,125 @@
|
|
|
1
|
+
"""Download-on-first-run access to the prebuilt ChromaDB index.
|
|
2
|
+
|
|
3
|
+
The published package ships no vectors: the ~34 MB index lives as a GitHub Release asset and is
|
|
4
|
+
fetched + cached the first time the server needs it. Subsequent runs read straight from the
|
|
5
|
+
cache and work offline. Only the standard library is used for the download so the wheel stays
|
|
6
|
+
dependency-light (no httpx/requests).
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
import hashlib
|
|
10
|
+
import os
|
|
11
|
+
import shutil
|
|
12
|
+
import tarfile
|
|
13
|
+
import tempfile
|
|
14
|
+
import urllib.request
|
|
15
|
+
from pathlib import Path
|
|
16
|
+
|
|
17
|
+
import chromadb
|
|
18
|
+
|
|
19
|
+
from .config import COLLECTION_NAME, INDEX_URL, datastore_dir
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
# File name used to recognise a usable index directory: the cache is treated as populated
# only when a file with this name exists directly inside it (presumably ChromaDB's SQLite
# database file -- confirm against the archive layout).
# NOTE(review): this is a presence check on a file that ships inside the archive, not a
# completion stamp written last. Protection against a half-written index comes from
# `ensure_index` staging the extraction in a temp directory and renaming it into place.
_MARKER = "chroma.sqlite3"
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def _download(url: str, dest: Path, timeout: float = 60.0) -> None:
    """Stream the resource at `url` into the file `dest`.

    Args:
        url: Location to fetch; anything `urllib.request.urlopen` accepts.
        dest: File path to (over)write with the response body.
        timeout: Seconds to wait on a blocking network operation before giving up. Without
            a timeout a stalled connection would block the caller indefinitely.

    Raises:
        OSError: On network failures or timeouts (includes `urllib.error.URLError`), or if
            `dest` cannot be opened for writing.
    """
    with urllib.request.urlopen(url, timeout=timeout) as response, open(dest, "wb") as out:
        # copyfileobj streams in chunks, so the archive is never held in memory whole.
        shutil.copyfileobj(response, out)
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def _sha256(path: Path) -> str:
    """Return the lowercase hex SHA-256 digest of the file at `path`.

    The file is read in 1 MiB chunks so large archives are hashed without being loaded
    into memory at once.
    """
    chunk_size = 1024 * 1024
    hasher = hashlib.sha256()
    with open(path, "rb") as stream:
        while chunk := stream.read(chunk_size):
            hasher.update(chunk)
    return hasher.hexdigest()
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
def _verify_checksum(archive: Path, url: str) -> None:
    """Verify `archive` against its published `<url>.sha256`, if one is available.

    The checksum file contents may be a bare hex digest or the common `"<digest> <filename>"`
    format; the first whitespace-delimited token is used either way. The comparison is
    case-insensitive, because some tools emit upper-case hex while the locally computed
    digest is lower-case.

    Args:
        archive: Path of the downloaded archive to hash.
        url: URL the archive was downloaded from; ".sha256" is appended to locate the
            checksum file.

    Raises:
        RuntimeError: If a checksum was fetched and it does not match the archive.
    """
    try:
        # A timeout keeps a stalled checksum request from blocking startup indefinitely.
        with urllib.request.urlopen(url + ".sha256", timeout=30) as response:
            expected = response.read().decode().strip().split()[0]
    except Exception:
        # Deliberately best-effort: a missing, unreachable, or empty checksum file is
        # tolerated (some releases may not publish one), so verification is skipped.
        return

    expected = expected.lower()
    actual = _sha256(archive)
    if actual != expected:
        raise RuntimeError(
            f"Index checksum mismatch: expected {expected}, got {actual}. "
            f"Refusing to use a corrupted download from {url}."
        )
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
def ensure_index() -> Path:
    """Return the local index directory, downloading + extracting it on first use.

    If the destination already contains the marker file it is returned immediately.
    Otherwise the archive is downloaded, checksum-verified, and extracted inside a temporary
    directory created next to the destination, and the extracted tree is renamed into place.
    Staging next to the destination keeps the rename on the same filesystem, and the temp
    directory (with the archive) is removed when the block exits.

    Returns:
        The directory containing the extracted index.

    Raises:
        RuntimeError: If the checksum does not match, the archive contains an unsafe path,
            or the extracted archive does not contain the marker file.
        OSError: On download or filesystem failures.
    """
    dest = datastore_dir()
    if (dest / _MARKER).exists():
        return dest

    dest.parent.mkdir(parents=True, exist_ok=True)

    with tempfile.TemporaryDirectory(dir=str(dest.parent)) as tmp:
        tmp_path = Path(tmp)
        archive = tmp_path / "index.tar.gz"

        _download(INDEX_URL, archive)
        _verify_checksum(archive, INDEX_URL)

        extract_dir = tmp_path / "extracted"
        extract_dir.mkdir()
        with tarfile.open(archive) as tar:
            _safe_extractall(tar, extract_dir)

        # Support archives that either contain the store files at the top level or nest them
        # under a single wrapping directory.
        root = extract_dir
        if not (root / _MARKER).exists():
            subdirs = [p for p in root.iterdir() if p.is_dir()]
            if len(subdirs) == 1 and (subdirs[0] / _MARKER).exists():
                root = subdirs[0]
        if not (root / _MARKER).exists():
            raise RuntimeError(
                f"Downloaded index archive from {INDEX_URL} did not contain '{_MARKER}'."
            )

        # Another process may have won the race and populated dest while we were downloading.
        if (dest / _MARKER).exists():
            return dest
        if dest.exists():
            # A directory without the marker is an unusable leftover; clear it.
            shutil.rmtree(dest)
        try:
            os.replace(root, dest)
        except OSError:
            # A concurrent run can still populate dest between the checks above and the
            # rename. If a usable index is now in place, use it; otherwise the failure is
            # genuine and must propagate.
            if (dest / _MARKER).exists():
                return dest
            raise

    return dest
|
|
106
|
+
|
|
107
|
+
|
|
108
|
+
def _safe_extractall(tar: tarfile.TarFile, dest: Path) -> None:
|
|
109
|
+
"""Extract `tar` into `dest`, rejecting members that would escape the target directory.
|
|
110
|
+
|
|
111
|
+
Guards against path-traversal ("tar slip") in a downloaded archive by resolving each
|
|
112
|
+
member's destination and confirming it stays within `dest`.
|
|
113
|
+
"""
|
|
114
|
+
dest_root = dest.resolve()
|
|
115
|
+
for member in tar.getmembers():
|
|
116
|
+
target = (dest / member.name).resolve()
|
|
117
|
+
if not (target == dest_root or dest_root in target.parents):
|
|
118
|
+
raise RuntimeError(f"Unsafe path in index archive: {member.name!r}")
|
|
119
|
+
tar.extractall(dest)
|
|
120
|
+
|
|
121
|
+
|
|
122
|
+
def get_rag_collection():
    """Open the persistent ChromaDB collection named by `COLLECTION_NAME`.

    The index is made available first (downloaded and extracted if it is not cached yet),
    then a persistent client is opened on that directory and the collection is looked up.
    """
    index_path = ensure_index()
    chroma_client = chromadb.PersistentClient(path=str(index_path))
    return chroma_client.get_collection(name=COLLECTION_NAME)
|
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
"""Retrieval side of the RAG pipeline for the published package.
|
|
2
|
+
|
|
3
|
+
Embeds the question with the *same* embedding function used at ingestion (Chroma's
|
|
4
|
+
DefaultEmbeddingFunction, all-MiniLM-L6-v2) and runs a nearest-neighbour query against the
|
|
5
|
+
downloaded ChromaDB collection. Using an identical embedding model is critical: the stored
|
|
6
|
+
vectors were produced by that same model, so a different embedder would make cosine distances
|
|
7
|
+
meaningless. Results are returned as plain dicts so the package needs no langchain-core.
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
from chromadb.utils import embedding_functions
|
|
11
|
+
|
|
12
|
+
from .datastore import get_rag_collection
|
|
13
|
+
from .source_label import format_source_label
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
# Constructing the embedding function loads the all-MiniLM-L6-v2 model, so build it once at
# import time rather than per query.
# NOTE(review): confirm whether the model is loaded at construction or lazily on the first
# call; either way a single shared instance avoids rebuilding it for every query.
_embedding_fn = embedding_functions.DefaultEmbeddingFunction()
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def retrieve_chunks(question: str, k: int) -> list[dict]:
    """Look up the `k` stored chunks closest to `question`.

    The question is embedded with the module-level embedding function and sent as a single
    nearest-neighbour query to the collection.

    Returns:
        One dict per hit, in the order the collection returned them, with keys `source`
        (provenance label built from the chunk metadata), `content` (raw chunk text), and
        `distance` (retrieval distance; lower is more similar).
    """
    hits = get_rag_collection().query(
        query_embeddings=_embedding_fn([question]),
        n_results=k,
        include=["documents", "metadatas", "distances"],
    )

    # Each field is a list-of-lists with one inner list per query; only one query was
    # issued, so take the first inner list of every field.
    texts, metas, dists = (
        hits.get(field, [[]])[0] for field in ("documents", "metadatas", "distances")
    )

    chunks = []
    for text, meta, dist in zip(texts, metas, dists):
        chunks.append(
            {
                "source": format_source_label(dict(meta or {})),
                "content": text,
                "distance": dist,
            }
        )
    return chunks
|
rag_react_docs/server.py
ADDED
|
@@ -0,0 +1,83 @@
|
|
|
1
|
+
"""MCP server exposing the RAG retrieval pipeline over stdio.
|
|
2
|
+
|
|
3
|
+
Published as the `cfunklabs-rag-react-docs` console script (`uvx cfunklabs-rag-react-docs`). It
|
|
4
|
+
exposes a single `search_docs` tool that performs *retrieval only* against the downloaded
|
|
5
|
+
ChromaDB collection and returns the top-k chunks with their source/heading labels. The
|
|
6
|
+
consuming LLM (Cursor, Claude Desktop, etc.) ingests those chunks and generates its own
|
|
7
|
+
grounded answer, so no Anthropic API key or generation stack is needed server-side.
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
import sys
|
|
11
|
+
|
|
12
|
+
from mcp.server.fastmcp import FastMCP
|
|
13
|
+
|
|
14
|
+
from .config import DEFAULT_TOP_K
|
|
15
|
+
from .datastore import get_rag_collection
|
|
16
|
+
from .retrieval import retrieve_chunks
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
# The MCP server instance, named "rag-react-docs". Tools are registered on it with the
# `@mcp.tool()` decorator below, and `main()` starts it with `mcp.run()`.
mcp = FastMCP("rag-react-docs")
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def _collection_is_empty() -> bool:
    """Report whether the documentation collection is unusable.

    Returns:
        True when the collection holds no entries, or when it could not be opened at all
        (missing, uninitialized, or a failed download); False when it has at least one entry.

    Any failure is treated the same as an empty collection so callers can degrade
    gracefully, but the underlying error is written to stderr so the real cause (checksum
    mismatch, bad archive, wrong collection name, ...) is not lost. stderr is used because
    the stdio transport reserves stdout for the JSON-RPC protocol.
    """
    try:
        return get_rag_collection().count() == 0
    except Exception as exc:
        print(
            f"[rag-react-docs] Could not open documentation index: {exc!r}",
            file=sys.stderr,
        )
        return True
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
@mcp.tool()
def search_docs(question: str, k: int = DEFAULT_TOP_K) -> list[dict]:
    """Search the indexed React documentation and return the most relevant chunks.

    Args:
        question: A natural-language question to search the React docs for.
        k: How many chunks to return (defaults to `DEFAULT_TOP_K`).

    Returns a list of results ordered by relevance. Each item has:
      - source: a human-readable provenance label (file path > heading path)
      - content: the raw chunk text to ground an answer on
      - distance: the retrieval distance (lower is more similar)
    """
    # NOTE: the docstring above is kept as-is on purpose; as this function is registered as
    # an MCP tool, its docstring may be surfaced to clients as the tool description.
    if not _collection_is_empty():
        return retrieve_chunks(question, k)

    # No usable index: answer with one explanatory pseudo-result in the normal result
    # shape, rather than raising.
    unavailable_notice = (
        "The documentation index is empty or could not be loaded. "
        "Check network access on first run so the index can be downloaded."
    )
    return [
        {
            "source": "rag-react-docs",
            "content": unavailable_notice,
            "distance": None,
        }
    ]
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
def main() -> None:
    """Console-script entry point: announce startup, check the index, serve on stdio.

    All status output goes to stderr, because the stdio transport reserves stdout for the
    JSON-RPC protocol and anything printed there would corrupt the stream. The index check
    is advisory only: the server starts whether or not the index is available.
    """

    def _say(message: str) -> None:
        print(message, file=sys.stderr)

    _say(f"[rag-react-docs] MCP server starting on stdio (top_k={DEFAULT_TOP_K}).")
    _say("[rag-react-docs] Ensuring documentation index is available...")
    try:
        if _collection_is_empty():
            status = "[rag-react-docs] Warning: index empty or unavailable -- check network access."
        else:
            status = "[rag-react-docs] Index ready."
        _say(status)
    except Exception as exc:  # pragma: no cover - defensive; _collection_is_empty swallows most
        _say(f"[rag-react-docs] Warning: could not verify index: {exc}")

    _say("[rag-react-docs] Ready. Press Ctrl+C to stop.")
    try:
        mcp.run()
    except KeyboardInterrupt:
        # Ctrl+C is the expected way to stop a standalone run; exit without a traceback.
        _say("\n[rag-react-docs] Shutting down.")
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
# Allows the server to be started by executing this module (for example with
# `python -m rag_react_docs.server`) in addition to the installed console script.
if __name__ == "__main__":
    main()
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
"""Build human-readable provenance labels from a chunk's stored metadata.
|
|
2
|
+
|
|
3
|
+
Ported from the dev tooling's `format_source_label`. The installed package has no repo `docs/`
|
|
4
|
+
directory, so the relative-path shortening cannot apply; instead the label always uses the
|
|
5
|
+
source file's basename, still prefixed onto the stored heading hierarchy.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from pathlib import Path
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def format_source_label(metadata: dict) -> str:
    """Return a label like "memo.md > Reference > memo(Component, arePropsEqual?)".

    Combines the source file (shortened to its basename) with the stored heading hierarchy.

    Args:
        metadata: Chunk metadata. Reads `source` plus the optional heading fields
            "Header 1", "Header 2", and "Header 3"; missing or empty fields are skipped.

    Returns:
        The basename of `source`, followed by each non-empty heading joined with " > ".
        When `source` is missing, `None`, empty, or has no basename, the label starts with
        "unknown source" instead of failing or producing an empty label.
    """
    raw_source = metadata.get("source")
    # str() guards against non-string values; a falsy source never reaches Path().
    source = Path(str(raw_source)).name if raw_source else ""
    if not source:
        source = "unknown source"

    headings = [
        str(metadata[field])  # join() needs strings; stored values may not be
        for field in ("Header 1", "Header 2", "Header 3")
        if metadata.get(field)
    ]
    return " > ".join([source, *headings])
|