document-rag-mcp 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (51) hide show
  1. document_rag_mcp-0.1.0/.github/workflows/docs.yml +21 -0
  2. document_rag_mcp-0.1.0/.github/workflows/release.yml +59 -0
  3. document_rag_mcp-0.1.0/.github/workflows/test.yml +35 -0
  4. document_rag_mcp-0.1.0/.gitignore +138 -0
  5. document_rag_mcp-0.1.0/PKG-INFO +23 -0
  6. document_rag_mcp-0.1.0/README.md +5 -0
  7. document_rag_mcp-0.1.0/config.example.yaml +53 -0
  8. document_rag_mcp-0.1.0/docs/architecture.md +38 -0
  9. document_rag_mcp-0.1.0/docs/cli.md +53 -0
  10. document_rag_mcp-0.1.0/docs/configuration.md +40 -0
  11. document_rag_mcp-0.1.0/docs/getting-started.md +58 -0
  12. document_rag_mcp-0.1.0/docs/index.md +14 -0
  13. document_rag_mcp-0.1.0/mkdocs.yml +35 -0
  14. document_rag_mcp-0.1.0/pyproject.toml +50 -0
  15. document_rag_mcp-0.1.0/src/document_rag_mcp/__init__.py +2 -0
  16. document_rag_mcp-0.1.0/src/document_rag_mcp/cli.py +136 -0
  17. document_rag_mcp-0.1.0/src/document_rag_mcp/config.py +94 -0
  18. document_rag_mcp-0.1.0/src/document_rag_mcp/embedding/__init__.py +1 -0
  19. document_rag_mcp-0.1.0/src/document_rag_mcp/embedding/client.py +47 -0
  20. document_rag_mcp-0.1.0/src/document_rag_mcp/ingestion/__init__.py +1 -0
  21. document_rag_mcp-0.1.0/src/document_rag_mcp/ingestion/chunker.py +125 -0
  22. document_rag_mcp-0.1.0/src/document_rag_mcp/ingestion/extractor.py +181 -0
  23. document_rag_mcp-0.1.0/src/document_rag_mcp/ingestion/pipeline.py +172 -0
  24. document_rag_mcp-0.1.0/src/document_rag_mcp/ingestion/scanner.py +35 -0
  25. document_rag_mcp-0.1.0/src/document_rag_mcp/ingestion/watcher.py +80 -0
  26. document_rag_mcp-0.1.0/src/document_rag_mcp/models.py +32 -0
  27. document_rag_mcp-0.1.0/src/document_rag_mcp/search/__init__.py +1 -0
  28. document_rag_mcp-0.1.0/src/document_rag_mcp/search/engine.py +53 -0
  29. document_rag_mcp-0.1.0/src/document_rag_mcp/server.py +256 -0
  30. document_rag_mcp-0.1.0/src/document_rag_mcp/storage/__init__.py +1 -0
  31. document_rag_mcp-0.1.0/src/document_rag_mcp/storage/state_store.py +145 -0
  32. document_rag_mcp-0.1.0/src/document_rag_mcp/storage/vector_store.py +164 -0
  33. document_rag_mcp-0.1.0/src/document_rag_mcp/vision/__init__.py +1 -0
  34. document_rag_mcp-0.1.0/src/document_rag_mcp/vision/client.py +53 -0
  35. document_rag_mcp-0.1.0/tests/__init__.py +1 -0
  36. document_rag_mcp-0.1.0/tests/conftest.py +7 -0
  37. document_rag_mcp-0.1.0/tests/test_chunker.py +88 -0
  38. document_rag_mcp-0.1.0/tests/test_cli.py +66 -0
  39. document_rag_mcp-0.1.0/tests/test_config.py +100 -0
  40. document_rag_mcp-0.1.0/tests/test_embedding_client.py +90 -0
  41. document_rag_mcp-0.1.0/tests/test_extractor.py +133 -0
  42. document_rag_mcp-0.1.0/tests/test_integration.py +146 -0
  43. document_rag_mcp-0.1.0/tests/test_pipeline.py +192 -0
  44. document_rag_mcp-0.1.0/tests/test_scanner.py +87 -0
  45. document_rag_mcp-0.1.0/tests/test_search.py +113 -0
  46. document_rag_mcp-0.1.0/tests/test_server.py +168 -0
  47. document_rag_mcp-0.1.0/tests/test_state_store.py +108 -0
  48. document_rag_mcp-0.1.0/tests/test_vector_store.py +102 -0
  49. document_rag_mcp-0.1.0/tests/test_vision_client.py +55 -0
  50. document_rag_mcp-0.1.0/tests/test_watcher.py +78 -0
  51. document_rag_mcp-0.1.0/uv.lock +3669 -0
@@ -0,0 +1,21 @@
1
+ name: docs
2
+ on:
3
+ push:
4
+ branches:
5
+ - master
6
+ - main
7
+
8
+ permissions:
9
+ contents: write
10
+
11
+ jobs:
12
+ deploy:
13
+ runs-on: ubuntu-latest
14
+ steps:
15
+ - uses: actions/checkout@v4
16
+ - name: Install uv
17
+ uses: astral-sh/setup-uv@v5
18
+ - name: Install dependencies
19
+ run: uv sync --group docs
20
+ - name: Build and deploy docs
21
+ run: uv run mkdocs gh-deploy --force
@@ -0,0 +1,59 @@
1
+ name: release
2
+
3
+ on:
4
+ push:
5
+ tags:
6
+ - 'v*' # Trigger on tags like v1.0.0, v0.1.0, etc.
7
+
8
+ jobs:
9
+ build:
10
+ name: Build Python distribution 📦
11
+ runs-on: ubuntu-latest
12
+ steps:
13
+ - uses: actions/checkout@v4
14
+ - name: Install uv
15
+ uses: astral-sh/setup-uv@v5
16
+ - name: Build distribution 📦
17
+ run: uv build
18
+ - name: Upload distribution 📦
19
+ uses: actions/upload-artifact@v4
20
+ with:
21
+ name: dist
22
+ path: dist/
23
+
24
+ pypi-publish:
25
+ name: Publish Python distribution 📦 to PyPI
26
+ needs: build
27
+ runs-on: ubuntu-latest
28
+ environment:
29
+ name: pypi
30
+ url: https://pypi.org/p/document-rag-mcp
31
+ permissions:
32
+ id-token: write # IMPORTANT: mandatory for trusted publishing
33
+ contents: read
34
+ steps:
35
+ - name: Download distribution 📦
36
+ uses: actions/download-artifact@v4
37
+ with:
38
+ name: dist
39
+ path: dist/
40
+ - name: Publish distribution 📦 to PyPI
41
+ uses: pypa/gh-action-pypi-publish@release/v1
42
+
43
+ github-release:
44
+ name: Create GitHub Release
45
+ needs: build
46
+ runs-on: ubuntu-latest
47
+ permissions:
48
+ contents: write
49
+ steps:
50
+ - name: Download distribution 📦
51
+ uses: actions/download-artifact@v4
52
+ with:
53
+ name: dist
54
+ path: dist/
55
+ - name: Create GitHub Release
56
+ uses: softprops/action-gh-release@v2
57
+ with:
58
+ files: dist/*
59
+ generate_release_notes: true
@@ -0,0 +1,35 @@
1
+ name: Tests
2
+
3
+ on:
4
+ push:
5
+ branches: [ main, master ]
6
+ pull_request:
7
+ branches: [ main, master ]
8
+
9
+ jobs:
10
+ test:
11
+ runs-on: ubuntu-latest
12
+ strategy:
13
+ matrix:
14
+ python-version: ["3.11", "3.12", "3.13"]
15
+
16
+ steps:
17
+ - uses: actions/checkout@v4
18
+
19
+ - name: Set up uv
20
+ uses: astral-sh/setup-uv@v5
21
+ with:
22
+ version: "latest"
23
+ enable-cache: true
24
+
25
+ - name: Set up Python
26
+ run: uv python install ${{ matrix.python-version }}
27
+
28
+ - name: Install dependencies
29
+ run: uv sync --group dev
30
+
31
+ - name: Run Ruff Lint and Format Checks
32
+ run: uv run ruff check src/ tests/
33
+
34
+ - name: Run Tests with Coverage
35
+ run: uv run pytest --cov=document_rag_mcp --cov-report=xml -v
@@ -0,0 +1,138 @@
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+
9
+ # Distribution / packaging
10
+ .Python
11
+ build/
12
+ develop-eggs/
13
+ dist/
14
+ downloads/
15
+ eggs/
16
+ .eggs/
17
+ lib/
18
+ lib64/
19
+ parts/
20
+ sdist/
21
+ var/
22
+ wheels/
23
+ share/python-wheels/
24
+ *.egg-info/
25
+ .installed.cfg
26
+ *.egg
27
+ MANIFEST
28
+
29
+ # Pyinstaller
30
+ *.manifest
31
+ *.spec
32
+
33
+ # Installer logs
34
+ pip-log.txt
35
+ pip-delete-this-directory.txt
36
+
37
+ # Unit test / coverage reports
38
+ htmlcov/
39
+ .tox/
40
+ .nosenv/
41
+ .pytest_cache/
42
+ .pickle_cache/
43
+ .cache/
44
+ .coverage
45
+ .coverage.*
46
+ .xml
47
+ cov.xml
48
+ .hypothesis/
49
+ .htmlcov/
50
+
51
+ # Translations
52
+ *.mo
53
+ *.pot
54
+
55
+ # Django stuff:
56
+ *.log
57
+ local_settings.py
58
+ db.sqlite3
59
+ db.sqlite3-journal
60
+
61
+ # Sphinx documentation
62
+ docs/_build/
63
+
64
+ # PyBuilder
65
+ .pybuilder/
66
+ target/
67
+
68
+ # Jupyter Notebook
69
+ .ipynb_checkpoints
70
+
71
+ # IPython
72
+ profile_default/
73
+ ipython_config.py
74
+
75
+ # pyenv
76
+ # For a library or app, you might want to share your .python-version.
77
+ # For an executable, it's typically fine to ignore.
78
+ # .python-version
79
+
80
+ # pipenv
81
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
82
+ # However, in case of collaboration, if you choose to keep it in git:
83
+ #Pipfile.lock
84
+
85
+ # poetry
86
+ # Similarly, in case of poetry, it's recommended to include poetry.lock in version control.
87
+ #poetry.lock
88
+
89
+ # pdm
90
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
91
+ #pdm.lock
92
+ .pdm-plugins/
93
+
94
+ # Celery beat schedule file
95
+ celerybeat-schedule
96
+ celerybeat.pid
97
+
98
+ # SageMath parsed files
99
+ *.sage.py
100
+
101
+ # Environments
102
+ .env
103
+ .venv
104
+ env/
105
+ venv/
106
+ ENV/
107
+ env.bak/
108
+ venv.bak/
109
+
110
+ # Spyder project settings
111
+ .spyderproject
112
+ .spyproject
113
+
114
+ # Rope project settings
115
+ .ropeproject
116
+
117
+ # mkdocs documentation
118
+ /site/
119
+
120
+ # mypy
121
+ .mypy_cache/
122
+ .dmypy.json
123
+ dmypy.json
124
+
125
+ # Pyre type checker
126
+ .pyre/
127
+
128
+ # pytype static type analyzer
129
+ .pytype/
130
+
131
+ # Cython debug symbols
132
+ cython_debug/
133
+
134
+ # Local databases and runtime state data directories
135
+ /data/
136
+ *.db
137
+ *.db-journal
138
+ chroma/
@@ -0,0 +1,23 @@
1
+ Metadata-Version: 2.4
2
+ Name: document-rag-mcp
3
+ Version: 0.1.0
4
+ Summary: RAG MCP Server — watches folders, chunks documents, serves semantic search via MCP
5
+ License: AGPL-3.0-or-later
6
+ Requires-Python: >=3.11
7
+ Requires-Dist: chonkie[semantic]>=1.0
8
+ Requires-Dist: chromadb>=1.5
9
+ Requires-Dist: click>=8.0
10
+ Requires-Dist: mcp[cli]<2,>=1.27
11
+ Requires-Dist: openai>=1.80
12
+ Requires-Dist: pydantic-settings>=2.0
13
+ Requires-Dist: pydantic>=2.0
14
+ Requires-Dist: pymupdf>=1.25
15
+ Requires-Dist: pyyaml>=6.0
16
+ Requires-Dist: watchfiles>=1.0
17
+ Description-Content-Type: text/markdown
18
+
19
+ # document-rag-mcp
20
+
21
+ A RAG MCP (Model Context Protocol) server. It recursively scans and watches configured directories for `.txt`, `.md`, and `.pdf` files, semantically chunks them, computes embeddings using an OpenAI-compatible API, and stores them in an embedded ChromaDB instance.
22
+
23
+ It exposes tools to search documents and retrieve original content/metadata over the MCP protocol.
@@ -0,0 +1,5 @@
1
+ # document-rag-mcp
2
+
3
+ A RAG MCP (Model Context Protocol) server. It recursively scans and watches configured directories for `.txt`, `.md`, and `.pdf` files, semantically chunks them, computes embeddings using an OpenAI-compatible API, and stores them in an embedded ChromaDB instance.
4
+
5
+ It exposes tools to search documents and retrieve original content/metadata over the MCP protocol.
@@ -0,0 +1,53 @@
1
+ # document-rag-mcp configuration example
2
+
3
+ # Define your document collections.
4
+ # Each collection groups one or more folders and scans them recursively.
5
+ collections:
6
+ - name: "project-docs"
7
+ paths:
8
+ - "/absolute/path/to/my-project/docs"
9
+ - "/absolute/path/to/another/folder"
10
+ file_patterns:
11
+ - "*.txt"
12
+ - "*.md"
13
+ - "*.pdf"
14
+
15
+ - name: "research-papers"
16
+ paths:
17
+ - "/absolute/path/to/papers"
18
+ file_patterns:
19
+ - "*.pdf"
20
+
21
+ # Remote embedding model config via an OpenAI-compatible endpoint.
22
+ # Compatible with lemonade, OpenRouter, Ollama, etc.
23
+ embedding:
24
+ base_url: "http://localhost:8080/v1" # Default local lemonade endpoint
25
+ api_key: "unused" # Use "unused" or actual key
26
+ model: "embed-gemma-300m-FLM" # Default embedding model
27
+ dimensions: 768 # Dimensionality of the vectors
28
+ batch_size: 32 # Batch size for embedding requests
29
+
30
+ # Local semantic chunking configurations.
31
+ chunking:
32
+ max_chunk_size: 512 # Maximum tokens per chunk
33
+ similarity_threshold: 0.5 # Topic shift threshold for semantic chunking
34
+ local_model: "all-MiniLM-L6-v2" # Model used for local semantic boundary detection
35
+ # Note: Falls back to TokenChunker if sentence-transformers is not installed.
36
+
37
+ # Storage paths for metadata and vector storage.
38
+ storage:
39
+ data_dir: "./data" # Path where SQLite and ChromaDB data will be saved
40
+
41
+ # Optional Vision LLM config for scanned/un-OCRed PDFs.
42
+ # Set enabled to true to run page-to-image extraction via multimodal completions.
43
+ vision:
44
+ enabled: false
45
+ base_url: "http://localhost:8080/v1" # Vision endpoint URL
46
+ api_key: "unused" # Vision API Key
47
+ model: "gpt-4o" # Multimodal model name
48
+
49
+ # HTTP/SSE Server bind configurations.
50
+ # Only used when serving via HTTP transport.
51
+ server:
52
+ host: "127.0.0.1"
53
+ port: 8000
@@ -0,0 +1,38 @@
1
+ # System Architecture
2
+
3
+ The Document RAG MCP server is designed as a modular, lightweight, and performant pipeline.
4
+
5
+ ## System Diagram
6
+
7
+ ```
8
+ ┌─────────────────────────────────────────────────────────────────┐
9
+ │ MCP Server (FastMCP) │
10
+ │ Transport: stdio | streamable-http │
11
+ │ │
12
+ │ Tools: search, list_collections, get_document_content, │
13
+ │ get_document_original, ingest_now │
14
+ └──────────┬──────────────────────────────┬───────────────────────┘
15
+ │ │
16
+ ┌─────▼──────┐ ┌────────▼────────┐
17
+ │ Ingestion │ │ Search │
18
+ │ Pipeline │ │ Engine │
19
+ │ │ │ │
20
+ │ extract → │ │ embed query → │
21
+ │ chunk → │ │ vector query → │
22
+ │ hash → │ │ merge & rank │
23
+ │ upsert │ └────────┬────────┘
24
+ └─────┬──────┘ │
25
+ │ │
26
+ ┌─────▼──────┐ ┌─────▼──────┐
27
+ │State Store │ │Vector Store│
28
+ │ (SQLite) │ │ (ChromaDB) │
29
+ └────────────┘ └────────────┘
30
+ ```
31
+
32
+ ## Modular Description
33
+
34
+ - **Extraction (`extractor.py`)**: Responsible for reading plain text, parsing frontmatter/headings in Markdown, and extracting text from PDFs via PyMuPDF. It uses font sizes and tags in the PDF layout to detect headers.
35
+ - **Chunking (`chunker.py`)**: Uses `chonkie` to partition text into token-bounded chunks. Markdown is recursively split at section headers and paragraph breaks, while TXT/PDF is semantically chunked.
36
+ - **Incremental Pipeline (`pipeline.py`)**: Checks file-level hashes (SHA-256) and stores them in SQLite. If a file is modified, it computes new chunk hashes, maps existing chunk vectors from ChromaDB, and only requests embeddings for new/modified chunks, saving API tokens.
37
+ - **Search Engine (`engine.py`)**: Performs semantic searches by generating a vector representation of the query and querying ChromaDB. For multi-collection queries, it merges and ranks results using ascending order of L2 distance.
38
+ - **Security Boundaries**: Path operations inside the MCP tools (`get_document_content`, `get_document_original`) validate that the target files reside within one of the collection directories before performing file system reads, protecting the server host from arbitrary path traversal attacks.
@@ -0,0 +1,53 @@
1
+ # CLI Reference
2
+
3
+ The `document-rag-mcp` CLI exposes subcommands to run the RAG server, test searches, and trigger manual index scanning.
4
+
5
+ ## Global Options
6
+
7
+ - `--config`, `-c`: Path to the YAML configuration file. Can also be set via the `DOCRAG_CONFIG` environment variable.
8
+ - `--chunking-model`: Override the local model name for semantic boundary splits. Can also be set via the `DOCRAG_CHUNKING_MODEL` environment variable.
9
+ - `--help`: Show the help message and exit.
10
+
11
+ ## Subcommands
12
+
13
+ ### `serve`
14
+ Starts the Model Context Protocol (MCP) server.
15
+
16
+ - `--transport`: Choose `stdio` (default) or `http` (SSE).
17
+ - `--host`: Bind host for SSE transport (default `127.0.0.1`).
18
+ - `--port`: Bind port for SSE transport (default `8000`).
19
+
20
+ **Example:**
21
+ ```bash
22
+ document-rag-mcp serve --transport stdio
23
+ ```
24
+
25
+ ### `ingest`
26
+ Triggers a one-shot, synchronous, recursive scan and indexing of files in all collections (or a specific collection). This command also prunes entries for deleted files.
27
+
28
+ - `--collection`, `-c`: Limit the scan to a specific collection by name.
29
+
30
+ **Example:**
31
+ ```bash
32
+ document-rag-mcp ingest --collection "project-docs"
33
+ ```
34
+
35
+ ### `search`
36
+ Executes a semantic search against the indexed collections and prints formatted results to the terminal.
37
+
38
+ - `QUERY` (Argument, Required): The semantic search query text.
39
+ - `--collection`, `-c`: Filter search results to a specific collection by name.
40
+ - `--top-k`, `-k`: The number of nearest matches to display (default 5).
41
+
42
+ **Example:**
43
+ ```bash
44
+ document-rag-mcp search "how to configure the pipeline" -k 3
45
+ ```
46
+
47
+ ### `collections`
48
+ Lists all collections specified in the configuration file, showing their configured directories, search file glob patterns, and the total count of indexed chunks.
49
+
50
+ **Example:**
51
+ ```bash
52
+ document-rag-mcp collections
53
+ ```
@@ -0,0 +1,40 @@
1
+ # Configuration Reference
2
+
3
+ The server loads configuration from a YAML file. You can specify the config file path using the `-c`/`--config` CLI flag or the `DOCRAG_CONFIG` environment variable.
4
+
5
+ ## Environment Variables Override
6
+
7
+ All configuration fields can be overridden using environment variables prefixed with `DOCRAG_` and using a double underscore `__` for nesting.
8
+
9
+ **Examples:**
10
+ - Override storage directory: `DOCRAG_STORAGE__DATA_DIR="/tmp/data"`
11
+ - Override embedding model: `DOCRAG_EMBEDDING__MODEL="text-embedding-3-small"`
12
+ - Override vision enabled: `DOCRAG_VISION__ENABLED="true"`
13
+
14
+ ## Detailed Configuration Options
15
+
16
+ ### Collections Settings
17
+ Configure folders and matching patterns:
18
+ - `name`: Unique name of the collection (must follow alphanumeric naming rules).
19
+ - `paths`: Absolute paths to folders/files to index.
20
+ - `file_patterns`: Glob patterns, e.g. `["*.txt", "*.md", "*.pdf"]`.
21
+
22
+ ### Embedding Settings
23
+ Configure the OpenAI-compatible embedding API:
24
+ - `base_url`: Target endpoint (e.g. `http://localhost:8080/v1` for lemonade).
25
+ - `api_key`: Authorization API Key (use "unused" if endpoint does not require one).
26
+ - `model`: Embedding model name.
27
+ - `dimensions`: Vector dimensions size (e.g., 768 for Gemma).
28
+ - `batch_size`: Maximum texts sent in a single batch request.
29
+
30
+ ### Local Chunking Settings
31
+ Configure document splitting limits:
32
+ - `max_chunk_size`: Maximum number of tokens per chunk.
33
+ - `similarity_threshold`: Threshold for semantic boundary splits.
34
+ - `local_model`: Local model name (e.g. `all-MiniLM-L6-v2`) used for chunking boundaries.
35
+
36
+ ### Vision Settings (Optional)
37
+ Configure scanned PDF page-to-image extraction:
38
+ - `enabled`: Set `true` to enable multimodal OCR fallback.
39
+ - `base_url`: OpenAI-compatible vision completion endpoint.
40
+ - `model`: Multimodal model name (e.g. `gpt-4o`).
@@ -0,0 +1,58 @@
1
+ # Getting Started
2
+
3
+ Follow these steps to install, configure, and run the Document RAG MCP server.
4
+
5
+ ## Installation
6
+
7
+ This project uses `uv` for Python dependency management. Ensure you have `uv` installed, then synchronize the environment:
8
+
9
+ ```bash
10
+ git clone https://github.com/user/document-rag-mcp.git
11
+ cd document-rag-mcp
12
+ uv sync --group dev
13
+ ```
14
+
15
+ ## First Configuration
16
+
17
+ Copy the example configuration file and adjust it to your directories:
18
+
19
+ ```bash
20
+ cp config.example.yaml config.yaml
21
+ ```
22
+
23
+ Edit `config.yaml` to specify the folders you want to watch.
24
+
25
+ ## Quick CLI Usage
26
+
27
+ You can test the ingestion and run semantic searches directly from the command line:
28
+
29
+ ### 1. Ingest Documents
30
+ Run a one-shot ingestion scan to populate the vector store:
31
+ ```bash
32
+ uv run document-rag-mcp ingest
33
+ ```
34
+
35
+ ### 2. Run a Test Search
36
+ Query the indexed collections:
37
+ ```bash
38
+ uv run document-rag-mcp search "What is project antigravity?"
39
+ ```
40
+
41
+ ### 3. Show Collection Statistics
42
+ Check chunk counts and file listings:
43
+ ```bash
44
+ uv run document-rag-mcp collections
45
+ ```
46
+
47
+ ## Running the MCP Server
48
+
49
+ Start the server using `stdio` transport (default) for integration with LLM clients:
50
+
51
+ ```bash
52
+ uv run document-rag-mcp serve --transport stdio
53
+ ```
54
+
55
+ For HTTP/SSE integration, specify the transport and bindings:
56
+ ```bash
57
+ uv run document-rag-mcp serve --transport http --host 127.0.0.1 --port 8000
58
+ ```
@@ -0,0 +1,14 @@
1
+ # Document RAG MCP Server
2
+
3
+ Welcome to the **document-rag-mcp** server documentation!
4
+
5
+ This Model Context Protocol (MCP) server enables semantic search and content extraction over text, Markdown, and PDF documents. It works fully in-process without requiring external database servers, using ChromaDB as the vector store and SQLite as the ingestion state tracker.
6
+
7
+ ## Key Features
8
+
9
+ - **Recursive Folder Scanning**: Configured folders are recursively scanned on startup and then monitored in real time via native file-system notifications (`watchfiles`).
10
+ - **Incremental Indexing**: Uses content hashing (SHA-256) at both the file and chunk levels. Files that have not changed are skipped completely on startup, and modified files only re-embed chunks that actually changed.
11
+ - **Auto-Pruning**: Automatically detects when files are deleted from the disk (both while running and when offline) and prunes them from the index.
12
+ - **Multimodal PDF Processing**: Detects scanned or text-less PDF pages and routes them through an optional vision-capable LLM to extract text.
13
+ - **MCP Native**: Exposes tools for semantic search, collection stats, metadata analysis, and full document text/binary content retrieval.
14
+ - **Secure Boundaries**: Strictly validates all path parameters against the configured collection folders to protect against directory traversal attacks.
@@ -0,0 +1,35 @@
1
+ site_name: document-rag-mcp Documentation
2
+ theme:
3
+ name: material
4
+ palette:
5
+ - scheme: slate
6
+ primary: teal
7
+ accent: teal
8
+ toggle:
9
+ icon: material/brightness-4
10
+ name: Switch to light mode
11
+ - scheme: default
12
+ primary: teal
13
+ accent: teal
14
+ toggle:
15
+ icon: material/brightness-7
16
+ name: Switch to dark mode
17
+ features:
18
+ - navigation.sections
19
+ - navigation.expand
20
+ - content.code.copy
21
+
22
+ plugins:
23
+ - search
24
+ - mkdocstrings:
25
+ default_handler: python
26
+ handlers:
27
+ python:
28
+ paths: [src]
29
+
30
+ nav:
31
+ - Home: index.md
32
+ - Getting Started: getting-started.md
33
+ - Configuration: configuration.md
34
+ - Architecture: architecture.md
35
+ - CLI Reference: cli.md
@@ -0,0 +1,50 @@
1
+ [project]
2
+ name = "document-rag-mcp"
3
+ version = "0.1.0"
4
+ description = "RAG MCP Server — watches folders, chunks documents, serves semantic search via MCP"
5
+ readme = "README.md"
6
+ license = { text = "AGPL-3.0-or-later" }
7
+ requires-python = ">=3.11"
8
+ dependencies = [
9
+ "mcp[cli]>=1.27,<2",
10
+ "chromadb>=1.5",
11
+ "openai>=1.80",
12
+ "chonkie[semantic]>=1.0",
13
+ "pymupdf>=1.25",
14
+ "watchfiles>=1.0",
15
+ "pydantic>=2.0",
16
+ "pydantic-settings>=2.0",
17
+ "click>=8.0",
18
+ "pyyaml>=6.0",
19
+ ]
20
+
21
+ [dependency-groups]
22
+ dev = [
23
+ "pytest>=8.0",
24
+ "pytest-asyncio>=0.24",
25
+ "pytest-cov>=6.0",
26
+ "ruff>=0.9",
27
+ ]
28
+ docs = [
29
+ "mkdocs-material>=9.6",
30
+ "mkdocstrings[python]>=0.27",
31
+ ]
32
+
33
+ [project.scripts]
34
+ document-rag-mcp = "document_rag_mcp.cli:main"
35
+
36
+ [build-system]
37
+ requires = ["hatchling"]
38
+ build-backend = "hatchling.build"
39
+
40
+ [tool.hatch.build.targets.wheel]
41
+ packages = ["src/document_rag_mcp"]
42
+
43
+ [tool.pytest.ini_options]
44
+ asyncio_mode = "auto"
45
+ testpaths = ["tests"]
46
+
47
+ [tool.ruff]
48
+ target-version = "py311"
49
+ line-length = 100
50
+ src = ["src"]
@@ -0,0 +1,2 @@
1
+ # document-rag-mcp package
2
+ __version__ = "0.1.0"