document-rag-mcp 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- document_rag_mcp-0.1.0/.github/workflows/docs.yml +21 -0
- document_rag_mcp-0.1.0/.github/workflows/release.yml +59 -0
- document_rag_mcp-0.1.0/.github/workflows/test.yml +35 -0
- document_rag_mcp-0.1.0/.gitignore +138 -0
- document_rag_mcp-0.1.0/PKG-INFO +23 -0
- document_rag_mcp-0.1.0/README.md +5 -0
- document_rag_mcp-0.1.0/config.example.yaml +53 -0
- document_rag_mcp-0.1.0/docs/architecture.md +38 -0
- document_rag_mcp-0.1.0/docs/cli.md +53 -0
- document_rag_mcp-0.1.0/docs/configuration.md +40 -0
- document_rag_mcp-0.1.0/docs/getting-started.md +58 -0
- document_rag_mcp-0.1.0/docs/index.md +14 -0
- document_rag_mcp-0.1.0/mkdocs.yml +35 -0
- document_rag_mcp-0.1.0/pyproject.toml +50 -0
- document_rag_mcp-0.1.0/src/document_rag_mcp/__init__.py +2 -0
- document_rag_mcp-0.1.0/src/document_rag_mcp/cli.py +136 -0
- document_rag_mcp-0.1.0/src/document_rag_mcp/config.py +94 -0
- document_rag_mcp-0.1.0/src/document_rag_mcp/embedding/__init__.py +1 -0
- document_rag_mcp-0.1.0/src/document_rag_mcp/embedding/client.py +47 -0
- document_rag_mcp-0.1.0/src/document_rag_mcp/ingestion/__init__.py +1 -0
- document_rag_mcp-0.1.0/src/document_rag_mcp/ingestion/chunker.py +125 -0
- document_rag_mcp-0.1.0/src/document_rag_mcp/ingestion/extractor.py +181 -0
- document_rag_mcp-0.1.0/src/document_rag_mcp/ingestion/pipeline.py +172 -0
- document_rag_mcp-0.1.0/src/document_rag_mcp/ingestion/scanner.py +35 -0
- document_rag_mcp-0.1.0/src/document_rag_mcp/ingestion/watcher.py +80 -0
- document_rag_mcp-0.1.0/src/document_rag_mcp/models.py +32 -0
- document_rag_mcp-0.1.0/src/document_rag_mcp/search/__init__.py +1 -0
- document_rag_mcp-0.1.0/src/document_rag_mcp/search/engine.py +53 -0
- document_rag_mcp-0.1.0/src/document_rag_mcp/server.py +256 -0
- document_rag_mcp-0.1.0/src/document_rag_mcp/storage/__init__.py +1 -0
- document_rag_mcp-0.1.0/src/document_rag_mcp/storage/state_store.py +145 -0
- document_rag_mcp-0.1.0/src/document_rag_mcp/storage/vector_store.py +164 -0
- document_rag_mcp-0.1.0/src/document_rag_mcp/vision/__init__.py +1 -0
- document_rag_mcp-0.1.0/src/document_rag_mcp/vision/client.py +53 -0
- document_rag_mcp-0.1.0/tests/__init__.py +1 -0
- document_rag_mcp-0.1.0/tests/conftest.py +7 -0
- document_rag_mcp-0.1.0/tests/test_chunker.py +88 -0
- document_rag_mcp-0.1.0/tests/test_cli.py +66 -0
- document_rag_mcp-0.1.0/tests/test_config.py +100 -0
- document_rag_mcp-0.1.0/tests/test_embedding_client.py +90 -0
- document_rag_mcp-0.1.0/tests/test_extractor.py +133 -0
- document_rag_mcp-0.1.0/tests/test_integration.py +146 -0
- document_rag_mcp-0.1.0/tests/test_pipeline.py +192 -0
- document_rag_mcp-0.1.0/tests/test_scanner.py +87 -0
- document_rag_mcp-0.1.0/tests/test_search.py +113 -0
- document_rag_mcp-0.1.0/tests/test_server.py +168 -0
- document_rag_mcp-0.1.0/tests/test_state_store.py +108 -0
- document_rag_mcp-0.1.0/tests/test_vector_store.py +102 -0
- document_rag_mcp-0.1.0/tests/test_vision_client.py +55 -0
- document_rag_mcp-0.1.0/tests/test_watcher.py +78 -0
- document_rag_mcp-0.1.0/uv.lock +3669 -0
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
name: docs
|
|
2
|
+
on:
|
|
3
|
+
push:
|
|
4
|
+
branches:
|
|
5
|
+
- master
|
|
6
|
+
- main
|
|
7
|
+
|
|
8
|
+
permissions:
|
|
9
|
+
contents: write
|
|
10
|
+
|
|
11
|
+
jobs:
|
|
12
|
+
deploy:
|
|
13
|
+
runs-on: ubuntu-latest
|
|
14
|
+
steps:
|
|
15
|
+
- uses: actions/checkout@v4
|
|
16
|
+
- name: Install uv
|
|
17
|
+
uses: astral-sh/setup-uv@v5
|
|
18
|
+
- name: Install dependencies
|
|
19
|
+
run: uv sync --group docs
|
|
20
|
+
- name: Build and deploy docs
|
|
21
|
+
run: uv run mkdocs gh-deploy --force
|
|
@@ -0,0 +1,59 @@
|
|
|
1
|
+
name: release
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
push:
|
|
5
|
+
tags:
|
|
6
|
+
- 'v*' # Trigger on tags like v1.0.0, v0.1.0, etc.
|
|
7
|
+
|
|
8
|
+
jobs:
|
|
9
|
+
build:
|
|
10
|
+
name: Build Python distribution 📦
|
|
11
|
+
runs-on: ubuntu-latest
|
|
12
|
+
steps:
|
|
13
|
+
- uses: actions/checkout@v4
|
|
14
|
+
- name: Install uv
|
|
15
|
+
uses: astral-sh/setup-uv@v5
|
|
16
|
+
- name: Build distribution 📦
|
|
17
|
+
run: uv build
|
|
18
|
+
- name: Upload distribution 📦
|
|
19
|
+
uses: actions/upload-artifact@v4
|
|
20
|
+
with:
|
|
21
|
+
name: dist
|
|
22
|
+
path: dist/
|
|
23
|
+
|
|
24
|
+
pypi-publish:
|
|
25
|
+
name: Publish Python distribution 📦 to PyPI
|
|
26
|
+
needs: build
|
|
27
|
+
runs-on: ubuntu-latest
|
|
28
|
+
environment:
|
|
29
|
+
name: pypi
|
|
30
|
+
url: https://pypi.org/p/document-rag-mcp
|
|
31
|
+
permissions:
|
|
32
|
+
id-token: write # IMPORTANT: mandatory for trusted publishing
|
|
33
|
+
contents: read
|
|
34
|
+
steps:
|
|
35
|
+
- name: Download distribution 📦
|
|
36
|
+
uses: actions/download-artifact@v4
|
|
37
|
+
with:
|
|
38
|
+
name: dist
|
|
39
|
+
path: dist/
|
|
40
|
+
- name: Publish distribution 📦 to PyPI
|
|
41
|
+
uses: pypa/gh-action-pypi-publish@release/v1
|
|
42
|
+
|
|
43
|
+
github-release:
|
|
44
|
+
name: Create GitHub Release
|
|
45
|
+
needs: build
|
|
46
|
+
runs-on: ubuntu-latest
|
|
47
|
+
permissions:
|
|
48
|
+
contents: write
|
|
49
|
+
steps:
|
|
50
|
+
- name: Download distribution 📦
|
|
51
|
+
uses: actions/download-artifact@v4
|
|
52
|
+
with:
|
|
53
|
+
name: dist
|
|
54
|
+
path: dist/
|
|
55
|
+
- name: Create GitHub Release
|
|
56
|
+
uses: softprops/action-gh-release@v2
|
|
57
|
+
with:
|
|
58
|
+
files: dist/*
|
|
59
|
+
generate_release_notes: true
|
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
name: Tests
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
push:
|
|
5
|
+
branches: [ main, master ]
|
|
6
|
+
pull_request:
|
|
7
|
+
branches: [ main, master ]
|
|
8
|
+
|
|
9
|
+
jobs:
|
|
10
|
+
test:
|
|
11
|
+
runs-on: ubuntu-latest
|
|
12
|
+
strategy:
|
|
13
|
+
matrix:
|
|
14
|
+
python-version: ["3.11", "3.12", "3.13"]
|
|
15
|
+
|
|
16
|
+
steps:
|
|
17
|
+
- uses: actions/checkout@v4
|
|
18
|
+
|
|
19
|
+
- name: Set up uv
|
|
20
|
+
uses: astral-sh/setup-uv@v5
|
|
21
|
+
with:
|
|
22
|
+
version: "latest"
|
|
23
|
+
enable-cache: true
|
|
24
|
+
|
|
25
|
+
- name: Set up Python
|
|
26
|
+
run: uv python install ${{ matrix.python-version }}
|
|
27
|
+
|
|
28
|
+
- name: Install dependencies
|
|
29
|
+
run: uv sync --group dev
|
|
30
|
+
|
|
31
|
+
- name: Run Ruff Lint Checks
|
|
32
|
+
run: uv run ruff check src/ tests/
|
|
33
|
+
|
|
34
|
+
- name: Run Tests with Coverage
|
|
35
|
+
run: uv run pytest --cov=document_rag_mcp --cov-report=xml -v
|
|
@@ -0,0 +1,138 @@
|
|
|
1
|
+
# Byte-compiled / optimized / DLL files
|
|
2
|
+
__pycache__/
|
|
3
|
+
*.py[cod]
|
|
4
|
+
*$py.class
|
|
5
|
+
|
|
6
|
+
# C extensions
|
|
7
|
+
*.so
|
|
8
|
+
|
|
9
|
+
# Distribution / packaging
|
|
10
|
+
.Python
|
|
11
|
+
build/
|
|
12
|
+
develop-eggs/
|
|
13
|
+
dist/
|
|
14
|
+
downloads/
|
|
15
|
+
eggs/
|
|
16
|
+
.eggs/
|
|
17
|
+
lib/
|
|
18
|
+
lib64/
|
|
19
|
+
parts/
|
|
20
|
+
sdist/
|
|
21
|
+
var/
|
|
22
|
+
wheels/
|
|
23
|
+
share/python-wheels/
|
|
24
|
+
*.egg-info/
|
|
25
|
+
.installed.cfg
|
|
26
|
+
*.egg
|
|
27
|
+
MANIFEST
|
|
28
|
+
|
|
29
|
+
# Pyinstaller
|
|
30
|
+
*.manifest
|
|
31
|
+
*.spec
|
|
32
|
+
|
|
33
|
+
# Installer logs
|
|
34
|
+
pip-log.txt
|
|
35
|
+
pip-delete-this-directory.txt
|
|
36
|
+
|
|
37
|
+
# Unit test / coverage reports
|
|
38
|
+
htmlcov/
|
|
39
|
+
.tox/
|
|
40
|
+
.nosenv/
|
|
41
|
+
.pytest_cache/
|
|
42
|
+
.pickle_cache/
|
|
43
|
+
.cache/
|
|
44
|
+
.coverage
|
|
45
|
+
.coverage.*
|
|
46
|
+
.xml
|
|
47
|
+
cov.xml
|
|
48
|
+
.hypothesis/
|
|
49
|
+
.htmlcov/
|
|
50
|
+
|
|
51
|
+
# Translations
|
|
52
|
+
*.mo
|
|
53
|
+
*.pot
|
|
54
|
+
|
|
55
|
+
# Django stuff:
|
|
56
|
+
*.log
|
|
57
|
+
local_settings.py
|
|
58
|
+
db.sqlite3
|
|
59
|
+
db.sqlite3-journal
|
|
60
|
+
|
|
61
|
+
# Sphinx documentation
|
|
62
|
+
docs/_build/
|
|
63
|
+
|
|
64
|
+
# PyBuilder
|
|
65
|
+
.pybuilder/
|
|
66
|
+
target/
|
|
67
|
+
|
|
68
|
+
# Jupyter Notebook
|
|
69
|
+
.ipynb_checkpoints
|
|
70
|
+
|
|
71
|
+
# IPython
|
|
72
|
+
profile_default/
|
|
73
|
+
ipython_config.py
|
|
74
|
+
|
|
75
|
+
# pyenv
|
|
76
|
+
# For a library or app, you might want to share your .python-version.
|
|
77
|
+
# For an executable, it's typically fine to ignore.
|
|
78
|
+
# .python-version
|
|
79
|
+
|
|
80
|
+
# pipenv
|
|
81
|
+
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
|
|
82
|
+
# However, in case of collaboration, if you choose not to keep it in git, uncomment the line below:
|
|
83
|
+
#Pipfile.lock
|
|
84
|
+
|
|
85
|
+
# poetry
|
|
86
|
+
# Similarly, in case of poetry, it's recommended to include poetry.lock in version control.
|
|
87
|
+
#poetry.lock
|
|
88
|
+
|
|
89
|
+
# pdm
|
|
90
|
+
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
|
|
91
|
+
#pdm.lock
|
|
92
|
+
.pdm-plugins/
|
|
93
|
+
|
|
94
|
+
# Celery beat schedule file
|
|
95
|
+
celerybeat-schedule
|
|
96
|
+
celerybeat.pid
|
|
97
|
+
|
|
98
|
+
# SageMath parsed files
|
|
99
|
+
*.sage.py
|
|
100
|
+
|
|
101
|
+
# Environments
|
|
102
|
+
.env
|
|
103
|
+
.venv
|
|
104
|
+
env/
|
|
105
|
+
venv/
|
|
106
|
+
ENV/
|
|
107
|
+
env.bak/
|
|
108
|
+
venv.bak/
|
|
109
|
+
|
|
110
|
+
# Spyder project settings
|
|
111
|
+
.spyderproject
|
|
112
|
+
.spyproject
|
|
113
|
+
|
|
114
|
+
# Rope project settings
|
|
115
|
+
.ropeproject
|
|
116
|
+
|
|
117
|
+
# mkdocs documentation
|
|
118
|
+
/site/
|
|
119
|
+
|
|
120
|
+
# mypy
|
|
121
|
+
.mypy_cache/
|
|
122
|
+
.dmypy.json
|
|
123
|
+
dmypy.json
|
|
124
|
+
|
|
125
|
+
# Pyre type checker
|
|
126
|
+
.pyre/
|
|
127
|
+
|
|
128
|
+
# pytype static type analyzer
|
|
129
|
+
.pytype/
|
|
130
|
+
|
|
131
|
+
# Cython debug symbols
|
|
132
|
+
cython_debug/
|
|
133
|
+
|
|
134
|
+
# Local databases and runtime state data directories
|
|
135
|
+
/data/
|
|
136
|
+
*.db
|
|
137
|
+
*.db-journal
|
|
138
|
+
chroma/
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: document-rag-mcp
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: RAG MCP Server — watches folders, chunks documents, serves semantic search via MCP
|
|
5
|
+
License: AGPL-3.0-or-later
|
|
6
|
+
Requires-Python: >=3.11
|
|
7
|
+
Requires-Dist: chonkie[semantic]>=1.0
|
|
8
|
+
Requires-Dist: chromadb>=1.5
|
|
9
|
+
Requires-Dist: click>=8.0
|
|
10
|
+
Requires-Dist: mcp[cli]<2,>=1.27
|
|
11
|
+
Requires-Dist: openai>=1.80
|
|
12
|
+
Requires-Dist: pydantic-settings>=2.0
|
|
13
|
+
Requires-Dist: pydantic>=2.0
|
|
14
|
+
Requires-Dist: pymupdf>=1.25
|
|
15
|
+
Requires-Dist: pyyaml>=6.0
|
|
16
|
+
Requires-Dist: watchfiles>=1.0
|
|
17
|
+
Description-Content-Type: text/markdown
|
|
18
|
+
|
|
19
|
+
# document-rag-mcp
|
|
20
|
+
|
|
21
|
+
A RAG MCP (Model Context Protocol) server. It recursively scans and watches configured directories for `.txt`, `.md`, and `.pdf` files, semantically chunks them, computes embeddings using an OpenAI-compatible API, and stores them in an embedded ChromaDB instance.
|
|
22
|
+
|
|
23
|
+
It exposes tools to search documents and retrieve original content/metadata over the MCP protocol.
|
|
@@ -0,0 +1,5 @@
|
|
|
1
|
+
# document-rag-mcp
|
|
2
|
+
|
|
3
|
+
A RAG MCP (Model Context Protocol) server. It recursively scans and watches configured directories for `.txt`, `.md`, and `.pdf` files, semantically chunks them, computes embeddings using an OpenAI-compatible API, and stores them in an embedded ChromaDB instance.
|
|
4
|
+
|
|
5
|
+
It exposes tools to search documents and retrieve original content/metadata over the MCP protocol.
|
|
@@ -0,0 +1,53 @@
|
|
|
1
|
+
# document-rag-mcp configuration example
|
|
2
|
+
|
|
3
|
+
# Define your document collections.
|
|
4
|
+
# Each collection groups one or more folders and scans them recursively.
|
|
5
|
+
collections:
|
|
6
|
+
- name: "project-docs"
|
|
7
|
+
paths:
|
|
8
|
+
- "/absolute/path/to/my-project/docs"
|
|
9
|
+
- "/absolute/path/to/another/folder"
|
|
10
|
+
file_patterns:
|
|
11
|
+
- "*.txt"
|
|
12
|
+
- "*.md"
|
|
13
|
+
- "*.pdf"
|
|
14
|
+
|
|
15
|
+
- name: "research-papers"
|
|
16
|
+
paths:
|
|
17
|
+
- "/absolute/path/to/papers"
|
|
18
|
+
file_patterns:
|
|
19
|
+
- "*.pdf"
|
|
20
|
+
|
|
21
|
+
# Remote embedding model config via an OpenAI-compatible endpoint.
|
|
22
|
+
# Compatible with lemonade, OpenRouter, Ollama, etc.
|
|
23
|
+
embedding:
|
|
24
|
+
base_url: "http://localhost:8080/v1" # Default local lemonade endpoint
|
|
25
|
+
api_key: "unused" # Use "unused" or actual key
|
|
26
|
+
model: "embed-gemma-300m-FLM" # Default embedding model
|
|
27
|
+
dimensions: 768 # Dimensionality of the vectors
|
|
28
|
+
batch_size: 32 # Batch size for embedding requests
|
|
29
|
+
|
|
30
|
+
# Local semantic chunking configurations.
|
|
31
|
+
chunking:
|
|
32
|
+
max_chunk_size: 512 # Maximum tokens per chunk
|
|
33
|
+
similarity_threshold: 0.5 # Topic shift threshold for semantic chunking
|
|
34
|
+
local_model: "all-MiniLM-L6-v2" # Model used for local semantic boundary detection
|
|
35
|
+
# Note: Falls back to TokenChunker if sentence-transformers is not installed.
|
|
36
|
+
|
|
37
|
+
# Storage paths for metadata and vector storage.
|
|
38
|
+
storage:
|
|
39
|
+
data_dir: "./data" # Path where SQLite and ChromaDB data will be saved
|
|
40
|
+
|
|
41
|
+
# Optional Vision LLM config for scanned/un-OCRed PDFs.
|
|
42
|
+
# Set enabled to true to run page-to-image extraction via multimodal completions.
|
|
43
|
+
vision:
|
|
44
|
+
enabled: false
|
|
45
|
+
base_url: "http://localhost:8080/v1" # Vision endpoint URL
|
|
46
|
+
api_key: "unused" # Vision API Key
|
|
47
|
+
model: "gpt-4o" # Multimodal model name
|
|
48
|
+
|
|
49
|
+
# HTTP/SSE Server bind configurations.
|
|
50
|
+
# Only used when serving via HTTP transport.
|
|
51
|
+
server:
|
|
52
|
+
host: "127.0.0.1"
|
|
53
|
+
port: 8000
|
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
# System Architecture
|
|
2
|
+
|
|
3
|
+
The Document RAG MCP server is designed as a modular, lightweight, and performant pipeline.
|
|
4
|
+
|
|
5
|
+
## System Diagram
|
|
6
|
+
|
|
7
|
+
```
|
|
8
|
+
┌─────────────────────────────────────────────────────────────────┐
|
|
9
|
+
│ MCP Server (FastMCP) │
|
|
10
|
+
│ Transport: stdio | streamable-http │
|
|
11
|
+
│ │
|
|
12
|
+
│ Tools: search, list_collections, get_document_content, │
|
|
13
|
+
│ get_document_original, ingest_now │
|
|
14
|
+
└──────────┬──────────────────────────────┬───────────────────────┘
|
|
15
|
+
│ │
|
|
16
|
+
┌─────▼──────┐ ┌────────▼────────┐
|
|
17
|
+
│ Ingestion │ │ Search │
|
|
18
|
+
│ Pipeline │ │ Engine │
|
|
19
|
+
│ │ │ │
|
|
20
|
+
│ extract → │ │ embed query → │
|
|
21
|
+
│ chunk → │ │ vector query → │
|
|
22
|
+
│ hash → │ │ merge & rank │
|
|
23
|
+
│ upsert │ └────────┬────────┘
|
|
24
|
+
└─────┬──────┘ │
|
|
25
|
+
│ │
|
|
26
|
+
┌─────▼──────┐ ┌─────▼──────┐
|
|
27
|
+
│State Store │ │Vector Store│
|
|
28
|
+
│ (SQLite) │ │ (ChromaDB) │
|
|
29
|
+
└────────────┘ └────────────┘
|
|
30
|
+
```
|
|
31
|
+
|
|
32
|
+
## Modular Description
|
|
33
|
+
|
|
34
|
+
- **Extraction (`extractor.py`)**: Responsible for reading plain text, parsing frontmatter/headings in Markdown, and extracting text from PDFs via PyMuPDF. It uses font sizes and tags in the PDF layout to detect headers.
|
|
35
|
+
- **Chunking (`chunker.py`)**: Uses `chonkie` to partition text into token-bounded chunks. Markdown is recursively split at section headers and paragraph shifts, while TXT/PDF is semantically chunked.
|
|
36
|
+
- **Incremental Pipeline (`pipeline.py`)**: Checks file-level hashes (SHA-256) and stores them in SQLite. If a file is modified, it computes new chunk hashes, maps existing chunk vectors from ChromaDB, and only requests embeddings for new/modified chunks, saving API tokens.
|
|
37
|
+
- **Search Engine (`engine.py`)**: Performs semantic searches by generating a vector representation of the query and querying ChromaDB. For multi-collection queries, it merges and ranks results using ascending order of L2 distance.
|
|
38
|
+
- **Security Boundaries**: Path operations inside the MCP tools (`get_document_content`, `get_document_original`) validate that the target files reside within one of the collection directories before performing file system reads, protecting the server host from arbitrary path traversal attacks.
|
|
@@ -0,0 +1,53 @@
|
|
|
1
|
+
# CLI Reference
|
|
2
|
+
|
|
3
|
+
The `document-rag-mcp` CLI exposes subcommands to run the RAG server, test searches, and trigger manual index scanning.
|
|
4
|
+
|
|
5
|
+
## Global Options
|
|
6
|
+
|
|
7
|
+
- `--config`, `-c`: Path to the YAML configuration file. Can also be set via the `DOCRAG_CONFIG` environment variable.
|
|
8
|
+
- `--chunking-model`: Override the local model name for semantic boundary splits. Can also be set via the `DOCRAG_CHUNKING_MODEL` environment variable.
|
|
9
|
+
- `--help`: Show the help message and exit.
|
|
10
|
+
|
|
11
|
+
## Subcommands
|
|
12
|
+
|
|
13
|
+
### `serve`
|
|
14
|
+
Starts the Model Context Protocol (MCP) server.
|
|
15
|
+
|
|
16
|
+
- `--transport`: Choose `stdio` (default) or `http` (SSE).
|
|
17
|
+
- `--host`: Bind host for SSE transport (default `127.0.0.1`).
|
|
18
|
+
- `--port`: Bind port for SSE transport (default `8000`).
|
|
19
|
+
|
|
20
|
+
**Example:**
|
|
21
|
+
```bash
|
|
22
|
+
document-rag-mcp serve --transport stdio
|
|
23
|
+
```
|
|
24
|
+
|
|
25
|
+
### `ingest`
|
|
26
|
+
Triggers a one-shot, synchronous, recursive scan and index of files in all collections (or a specific collection). This command also prunes entries for deleted files.
|
|
27
|
+
|
|
28
|
+
- `--collection`, `-c`: Limit the scan to a specific collection by name.
|
|
29
|
+
|
|
30
|
+
**Example:**
|
|
31
|
+
```bash
|
|
32
|
+
document-rag-mcp ingest --collection "project-docs"
|
|
33
|
+
```
|
|
34
|
+
|
|
35
|
+
### `search`
|
|
36
|
+
Executes a semantic search against the indexed collections and prints formatted results to the terminal.
|
|
37
|
+
|
|
38
|
+
- `QUERY` (Argument, Required): The semantic search query text.
|
|
39
|
+
- `--collection`, `-c`: Filter search results to a specific collection by name.
|
|
40
|
+
- `--top-k`, `-k`: The number of nearest matches to display (default 5).
|
|
41
|
+
|
|
42
|
+
**Example:**
|
|
43
|
+
```bash
|
|
44
|
+
document-rag-mcp search "how to configure the pipeline" -k 3
|
|
45
|
+
```
|
|
46
|
+
|
|
47
|
+
### `collections`
|
|
48
|
+
Lists all collections specified in the configuration file, showing their configured directories, search file glob patterns, and the total count of indexed chunks.
|
|
49
|
+
|
|
50
|
+
**Example:**
|
|
51
|
+
```bash
|
|
52
|
+
document-rag-mcp collections
|
|
53
|
+
```
|
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
# Configuration Reference
|
|
2
|
+
|
|
3
|
+
The server loads configuration from a YAML file. You can specify the config file path using the `-c`/`--config` CLI flag or the `DOCRAG_CONFIG` environment variable.
|
|
4
|
+
|
|
5
|
+
## Environment Variables Override
|
|
6
|
+
|
|
7
|
+
All configuration fields can be overridden using environment variables prefixed with `DOCRAG_` and using a double underscore `__` for nesting.
|
|
8
|
+
|
|
9
|
+
**Examples:**
|
|
10
|
+
- Override storage directory: `DOCRAG_STORAGE__DATA_DIR="/tmp/data"`
|
|
11
|
+
- Override embedding model: `DOCRAG_EMBEDDING__MODEL="text-embedding-3-small"`
|
|
12
|
+
- Override vision enabled: `DOCRAG_VISION__ENABLED="true"`
|
|
13
|
+
|
|
14
|
+
## Detailed Configuration Options
|
|
15
|
+
|
|
16
|
+
### Collections Settings
|
|
17
|
+
Configure folders and matching patterns:
|
|
18
|
+
- `name`: Unique name of the collection (must conform to alphanumeric naming rules).
|
|
19
|
+
- `paths`: Absolute paths to folders/files to index.
|
|
20
|
+
- `file_patterns`: Glob patterns, e.g. `["*.txt", "*.md", "*.pdf"]`.
|
|
21
|
+
|
|
22
|
+
### Embedding Settings
|
|
23
|
+
Configure the OpenAI-compatible embedding API:
|
|
24
|
+
- `base_url`: Target endpoint (e.g. `http://localhost:8080/v1` for lemonade).
|
|
25
|
+
- `api_key`: Authorization API Key (use "unused" if endpoint does not require one).
|
|
26
|
+
- `model`: Embedding model name.
|
|
27
|
+
- `dimensions`: Vector dimensions size (e.g., 768 for Gemma).
|
|
28
|
+
- `batch_size`: Maximum texts sent in a single batch request.
|
|
29
|
+
|
|
30
|
+
### Local Chunking Settings
|
|
31
|
+
Configure document splitting limits:
|
|
32
|
+
- `max_chunk_size`: Maximum number of tokens per chunk.
|
|
33
|
+
- `similarity_threshold`: Threshold for semantic boundary splits.
|
|
34
|
+
- `local_model`: Local model name (e.g. `all-MiniLM-L6-v2`) used for chunking boundaries.
|
|
35
|
+
|
|
36
|
+
### Vision Settings (Optional)
|
|
37
|
+
Configure scanned PDF page-to-image extraction:
|
|
38
|
+
- `enabled`: Set `true` to enable multimodal OCR fallback.
|
|
39
|
+
- `base_url`: OpenAI-compatible vision completion endpoint.
- `api_key`: Vision API key (use "unused" if the endpoint does not require one).
|
|
40
|
+
- `model`: Multimodal model name (e.g. `gpt-4o`).
|
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
# Getting Started
|
|
2
|
+
|
|
3
|
+
Follow these steps to install, configure, and run the Document RAG MCP server.
|
|
4
|
+
|
|
5
|
+
## Installation
|
|
6
|
+
|
|
7
|
+
This project uses `uv` for Python dependency management. Ensure you have `uv` installed, then synchronize the environment:
|
|
8
|
+
|
|
9
|
+
```bash
|
|
10
|
+
git clone https://github.com/user/document-rag-mcp.git
|
|
11
|
+
cd document-rag-mcp
|
|
12
|
+
uv sync --group dev
|
|
13
|
+
```
|
|
14
|
+
|
|
15
|
+
## First Configuration
|
|
16
|
+
|
|
17
|
+
Copy the example configuration file and adjust it to your directories:
|
|
18
|
+
|
|
19
|
+
```bash
|
|
20
|
+
cp config.example.yaml config.yaml
|
|
21
|
+
```
|
|
22
|
+
|
|
23
|
+
Edit `config.yaml` to specify the folders you want to watch.
|
|
24
|
+
|
|
25
|
+
## Quick CLI Usage
|
|
26
|
+
|
|
27
|
+
You can test the ingestion and run semantic searches directly from the command line:
|
|
28
|
+
|
|
29
|
+
### 1. Ingest Documents
|
|
30
|
+
Run a one-shot ingestion scan to populate the vector store:
|
|
31
|
+
```bash
|
|
32
|
+
uv run document-rag-mcp ingest
|
|
33
|
+
```
|
|
34
|
+
|
|
35
|
+
### 2. Run a Test Search
|
|
36
|
+
Query the indexed collections:
|
|
37
|
+
```bash
|
|
38
|
+
uv run document-rag-mcp search "What is project antigravity?"
|
|
39
|
+
```
|
|
40
|
+
|
|
41
|
+
### 3. Show Collection Statistics
|
|
42
|
+
Check chunk counts and file listings:
|
|
43
|
+
```bash
|
|
44
|
+
uv run document-rag-mcp collections
|
|
45
|
+
```
|
|
46
|
+
|
|
47
|
+
## Running the MCP Server
|
|
48
|
+
|
|
49
|
+
Start the server using `stdio` transport (default) for integration with LLM clients:
|
|
50
|
+
|
|
51
|
+
```bash
|
|
52
|
+
uv run document-rag-mcp serve --transport stdio
|
|
53
|
+
```
|
|
54
|
+
|
|
55
|
+
For HTTP/SSE integration, specify the transport and bindings:
|
|
56
|
+
```bash
|
|
57
|
+
uv run document-rag-mcp serve --transport http --host 127.0.0.1 --port 8000
|
|
58
|
+
```
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
# Document RAG MCP Server
|
|
2
|
+
|
|
3
|
+
Welcome to the **document-rag-mcp** server documentation!
|
|
4
|
+
|
|
5
|
+
This Model Context Protocol (MCP) server enables semantic search and content extraction over text, Markdown, and PDF documents. It works fully in-process without requiring external database servers, using ChromaDB as the vector store and SQLite as the ingestion state tracker.
|
|
6
|
+
|
|
7
|
+
## Key Features
|
|
8
|
+
|
|
9
|
+
- **Recursive Folder Scanning**: Configured folders are recursively scanned on startup and then monitored in real time via native file-system notifications (`watchfiles`).
|
|
10
|
+
- **Incremental Indexing**: Uses content hashing (SHA-256) at both the file and chunk levels. Files that have not changed are skipped completely on startup, and modified files only re-embed chunks that actually changed.
|
|
11
|
+
- **Auto-Pruning**: Automatically detects when files are deleted from the disk (both while running and when offline) and prunes them from the index.
|
|
12
|
+
- **Multimodal PDF Processing**: Detects scanned or text-less PDF pages and routes them through an optional vision-capable LLM to extract text.
|
|
13
|
+
- **MCP Native**: Exposes tools for semantic search, collection stats, metadata analysis, and full document text/binary content retrieval.
|
|
14
|
+
- **Secure Boundaries**: Strictly validates all path parameters against configured collections folders to protect against directory traversal attacks.
|
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
site_name: document-rag-mcp Documentation
|
|
2
|
+
theme:
|
|
3
|
+
name: material
|
|
4
|
+
palette:
|
|
5
|
+
- scheme: slate
|
|
6
|
+
primary: teal
|
|
7
|
+
accent: teal
|
|
8
|
+
toggle:
|
|
9
|
+
icon: material/brightness-4
|
|
10
|
+
name: Switch to light mode
|
|
11
|
+
- scheme: default
|
|
12
|
+
primary: teal
|
|
13
|
+
accent: teal
|
|
14
|
+
toggle:
|
|
15
|
+
icon: material/brightness-7
|
|
16
|
+
name: Switch to dark mode
|
|
17
|
+
features:
|
|
18
|
+
- navigation.sections
|
|
19
|
+
- navigation.expand
|
|
20
|
+
- content.code.copy
|
|
21
|
+
|
|
22
|
+
plugins:
|
|
23
|
+
- search
|
|
24
|
+
- mkdocstrings:
|
|
25
|
+
default_handler: python
|
|
26
|
+
handlers:
|
|
27
|
+
python:
|
|
28
|
+
paths: [src]
|
|
29
|
+
|
|
30
|
+
nav:
|
|
31
|
+
- Home: index.md
|
|
32
|
+
- Getting Started: getting-started.md
|
|
33
|
+
- Configuration: configuration.md
|
|
34
|
+
- Architecture: architecture.md
|
|
35
|
+
- CLI Reference: cli.md
|
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
[project]
|
|
2
|
+
name = "document-rag-mcp"
|
|
3
|
+
version = "0.1.0"
|
|
4
|
+
description = "RAG MCP Server — watches folders, chunks documents, serves semantic search via MCP"
|
|
5
|
+
readme = "README.md"
|
|
6
|
+
license = { text = "AGPL-3.0-or-later" }
|
|
7
|
+
requires-python = ">=3.11"
|
|
8
|
+
dependencies = [
|
|
9
|
+
"mcp[cli]>=1.27,<2",
|
|
10
|
+
"chromadb>=1.5",
|
|
11
|
+
"openai>=1.80",
|
|
12
|
+
"chonkie[semantic]>=1.0",
|
|
13
|
+
"pymupdf>=1.25",
|
|
14
|
+
"watchfiles>=1.0",
|
|
15
|
+
"pydantic>=2.0",
|
|
16
|
+
"pydantic-settings>=2.0",
|
|
17
|
+
"click>=8.0",
|
|
18
|
+
"pyyaml>=6.0",
|
|
19
|
+
]
|
|
20
|
+
|
|
21
|
+
[dependency-groups]
|
|
22
|
+
dev = [
|
|
23
|
+
"pytest>=8.0",
|
|
24
|
+
"pytest-asyncio>=0.24",
|
|
25
|
+
"pytest-cov>=6.0",
|
|
26
|
+
"ruff>=0.9",
|
|
27
|
+
]
|
|
28
|
+
docs = [
|
|
29
|
+
"mkdocs-material>=9.6",
|
|
30
|
+
"mkdocstrings[python]>=0.27",
|
|
31
|
+
]
|
|
32
|
+
|
|
33
|
+
[project.scripts]
|
|
34
|
+
document-rag-mcp = "document_rag_mcp.cli:main"
|
|
35
|
+
|
|
36
|
+
[build-system]
|
|
37
|
+
requires = ["hatchling"]
|
|
38
|
+
build-backend = "hatchling.build"
|
|
39
|
+
|
|
40
|
+
[tool.hatch.build.targets.wheel]
|
|
41
|
+
packages = ["src/document_rag_mcp"]
|
|
42
|
+
|
|
43
|
+
[tool.pytest.ini_options]
|
|
44
|
+
asyncio_mode = "auto"
|
|
45
|
+
testpaths = ["tests"]
|
|
46
|
+
|
|
47
|
+
[tool.ruff]
|
|
48
|
+
target-version = "py311"
|
|
49
|
+
line-length = 100
|
|
50
|
+
src = ["src"]
|