document-rag-mcp 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,2 @@
1
# document-rag-mcp package
# Version string of the document-rag-mcp package.
__version__ = "0.1.0"
@@ -0,0 +1,136 @@
1
+ import asyncio
2
+ import os
3
+ from pathlib import Path
4
+ import click
5
+
6
+
7
@click.group()
@click.option(
    "--config",
    "-c",
    type=click.Path(exists=True, path_type=Path),
    envvar="DOCRAG_CONFIG",
    help="Path to the YAML configuration file.",
)
@click.option(
    "--chunking-model",
    envvar="DOCRAG_CHUNKING_MODEL",
    help="Override the local model used for semantic chunking boundary detection.",
)
@click.pass_context
def main(ctx: click.Context, config: Path | None, chunking_model: str | None) -> None:
    """RAG MCP Server CLI — Manage document indexing and semantic search server."""
    # The docstring above is the group's help text, so it is left untouched.
    #
    # Group-level options are published as environment variables so that the
    # subcommands, which import the server module lazily, can pick them up.
    # The config path is made absolute first; unset/empty options leave the
    # environment as it was.
    exported = (
        ("DOCRAG_CONFIG", str(config.resolve()) if config else None),
        ("DOCRAG_CHUNKING_MODEL", chunking_model),
    )
    for variable, value in exported:
        if value:
            os.environ[variable] = value
27
+
28
+
29
@main.command()
@click.option(
    "--transport",
    type=click.Choice(["stdio", "http"]),
    default="stdio",
    help="Transport protocol to run the server on (stdio or http/sse).",
)
@click.option(
    "--host",
    default="127.0.0.1",
    help="Host to bind the HTTP server to (http transport only).",
)
@click.option(
    "--port",
    default=8000,
    type=int,
    help="Port to run the HTTP server on (http transport only).",
)
def serve(transport: str, host: str, port: int) -> None:
    """Start the Model Context Protocol (MCP) server."""
    # Lazy import to ensure CLI overrides are set in environment first
    import document_rag_mcp.server as mcp_server

    # With the stdio transport, stdout carries the protocol messages, so any
    # human-readable banner must go to stderr or it corrupts the stream.
    banner_to_stderr = transport == "stdio"
    click.echo(
        f"Starting document-rag-mcp server on {transport} transport...",
        err=banner_to_stderr,
    )

    if transport == "http":
        click.echo(f"Binding to http://{host}:{port}")
        # The CLI's "http" choice is served over the SSE transport.
        mcp_server.mcp.run(transport="sse", host=host, port=port)
    else:
        # --host/--port are documented as http-only; do not forward them to a
        # transport that has no network binding.
        mcp_server.mcp.run(transport="stdio")
58
+
59
+
60
@main.command()
@click.argument("query")
@click.option(
    "--collection",
    "-c",
    default=None,
    help="Filter search results by a specific collection.",
)
@click.option(
    "--top-k",
    "-k",
    default=5,
    type=int,
    help="Number of results to return.",
)
def search(query: str, collection: str | None, top_k: int) -> None:
    """Run a semantic search against the indexed collections."""
    # Imported here (not at module top) so the group callback has already
    # exported its overrides to the environment.
    import document_rag_mcp.server as mcp_server

    async def _execute() -> None:
        # Query the server's search engine and print one block per hit.
        click.echo(f"Searching for: '{query}'...")
        hits = await mcp_server.search_engine.search(
            query=query,
            collection_name=collection,
            top_k=top_k,
        )
        if not hits:
            click.echo("No matching documents found.")
            return

        # Ranks are shown 1-based; a missing page number is displayed as 1.
        for rank, hit in enumerate(hits, start=1):
            click.echo(f"\n[{rank}] Score: {hit.score:.4f} | File: {hit.metadata.file_path}")
            click.echo(
                f"Collection: {hit.metadata.collection} | Page: {hit.metadata.page_number or 1}"
            )
            click.echo(
                f"Title: {hit.metadata.title or 'N/A'} | Section: {hit.metadata.section or 'N/A'}"
            )
            click.echo("-" * 40)
            click.echo(hit.text.strip())
            click.echo("=" * 60)

    asyncio.run(_execute())
97
+
98
+
99
@main.command()
@click.option(
    "--collection",
    "-c",
    default=None,
    help="Limit ingestion to a specific collection.",
)
def ingest(collection: str | None) -> None:
    """Recursively scan folders and index new/changed files immediately."""
    # Imported here (not at module top) so the group callback has already
    # exported its overrides to the environment.
    import document_rag_mcp.server as mcp_server

    async def _scan_once() -> None:
        # Announce the scan, await the server's ingest routine, and print
        # whatever it returns.
        click.echo("Starting one-shot ingestion scan...")
        outcome = await mcp_server.ingest_now(collection=collection)
        click.echo(outcome)

    asyncio.run(_scan_once())
116
+
117
+
118
@main.command()
def collections() -> None:
    """List all configured collections and their indexing status."""
    # Imported here (not at module top) so the group callback has already
    # exported its overrides to the environment.
    import document_rag_mcp.server as mcp_server

    heavy_rule = "=" * 60
    light_rule = "-" * 60

    click.echo("Configured Collections:")
    click.echo(heavy_rule)
    for entry in mcp_server.config.collections:
        # Stats are fetched before anything is printed for this collection.
        chunk_stats = mcp_server.vector_store.collection_stats(entry.name)
        joined_paths = ", ".join(map(str, entry.paths))
        click.echo(f"Name: {entry.name}")
        click.echo(f"Paths: {joined_paths}")
        click.echo(f"Patterns: {', '.join(entry.file_patterns)}")
        click.echo(f"Indexed Chunks: {chunk_stats['count']}")
        click.echo(light_rule)
133
+
134
+
135
# Allow running this module directly as a script: invoke the click group.
if __name__ == "__main__":
    main()
@@ -0,0 +1,94 @@
1
+ import os
2
+ from pathlib import Path
3
+ from typing import Any
4
+ import yaml
5
+ from pydantic import BaseModel, Field
6
+ from pydantic_settings import (
7
+ BaseSettings,
8
+ SettingsConfigDict,
9
+ PydanticBaseSettingsSource,
10
+ )
11
+
12
class CollectionConfig(BaseModel):
    """A named collection: the folders it covers and the file patterns it accepts."""

    name: str  # e.g., "project-docs"
    paths: list[Path]  # folders to watch/scan
    # Glob-style patterns; defaults to plain text, Markdown and PDF files.
    file_patterns: list[str] = ["*.txt", "*.md", "*.pdf"]
16
+
17
+
18
class EmbeddingConfig(BaseModel):
    """Endpoint, model and batching settings consumed by the embedding client."""

    base_url: str = "http://localhost:8080/v1"  # lemonade default
    api_key: str = "unused"
    model: str = "embed-gemma-300m-FLM"
    dimensions: int = 768  # gemma embedding dims
    batch_size: int = 32  # number of texts sent per embedding request
24
+
25
+
26
class VisionConfig(BaseModel):
    """Settings for the optional vision model (disabled by default).

    NOTE(review): presumably an OpenAI-compatible endpoint used for scanned
    pages — confirm against the code that consumes this config.
    """

    enabled: bool = False
    base_url: str = "http://localhost:8080/v1"
    api_key: str = "unused"
    model: str = "gpt-4o"
31
+
32
+
33
class ChunkingConfig(BaseModel):
    """Parameters handed to the document chunkers."""

    max_chunk_size: int = 512  # tokens
    similarity_threshold: float = 0.5  # for semantic boundary detection
    local_model: str = "all-MiniLM-L6-v2"  # overridable via --chunking-model CLI flag
37
+
38
+
39
class StorageConfig(BaseModel):
    """Location of the on-disk data directory."""

    data_dir: Path = Path("./data")  # ChromaDB + SQLite storage
41
+
42
+
43
class ServerConfig(BaseModel):
    """Default bind address for the server.

    NOTE(review): presumably only relevant to the HTTP transport — confirm
    against the server module.
    """

    host: str = "127.0.0.1"
    port: int = 8000
46
+
47
+
48
class AppConfig(BaseSettings):
    """Top-level application settings.

    Values come from constructor arguments (the parsed YAML file) and from
    environment variables prefixed with ``DOCRAG_``; nested fields use a
    double underscore, e.g. ``DOCRAG_EMBEDDING__MODEL``. Unknown keys are
    ignored.
    """

    model_config = SettingsConfigDict(
        env_prefix="DOCRAG_", env_nested_delimiter="__", extra="ignore"
    )

    collections: list[CollectionConfig] = Field(default_factory=list)
    embedding: EmbeddingConfig = EmbeddingConfig()
    vision: VisionConfig = VisionConfig()
    chunking: ChunkingConfig = ChunkingConfig()
    storage: StorageConfig = StorageConfig()
    server: ServerConfig = ServerConfig()

    @classmethod
    def settings_customise_sources(
        cls,
        settings_cls: type[BaseSettings],
        init_settings: PydanticBaseSettingsSource,
        env_settings: PydanticBaseSettingsSource,
        dotenv_settings: PydanticBaseSettingsSource,
        file_secret_settings: PydanticBaseSettingsSource,
    ) -> tuple[PydanticBaseSettingsSource, ...]:
        """Return the settings sources in priority order, highest first.

        Environment variables are placed ahead of init arguments so that they
        override values loaded from the YAML file.
        """
        precedence = (
            env_settings,
            init_settings,
            dotenv_settings,
            file_secret_settings,
        )
        return precedence
73
+
74
+
75
def load_config(config_path: Path | str | None = None) -> AppConfig:
    """Loads application configuration from YAML file and overrides with environment variables.

    Environment variables must be prefixed with DOCRAG_ and nested with double underscores.
    For example: DOCRAG_EMBEDDING__MODEL="text-embedding-3-small"

    Args:
        config_path: Path to a YAML file. When omitted or empty, the
            ``DOCRAG_CONFIG`` environment variable is consulted instead.

    Returns:
        The resolved ``AppConfig``. A missing file, a path that is not a
        regular file, or a YAML document whose top level is not a mapping all
        fall back to defaults plus environment overrides.
    """
    if not config_path:
        config_path = os.environ.get("DOCRAG_CONFIG")

    yaml_data: dict[str, Any] = {}
    if config_path:
        path = Path(config_path)
        # is_file() rather than exists(): a directory would make open() raise.
        if path.is_file():
            with open(path, "r", encoding="utf-8") as f:
                loaded = yaml.safe_load(f)
            if isinstance(loaded, dict):
                yaml_data = loaded

    # AppConfig.settings_customise_sources ranks environment variables above
    # init arguments, so DOCRAG_* variables override the YAML values here.
    config = AppConfig(**yaml_data)

    # The CLI's --chunking-model flag is exported as DOCRAG_CHUNKING_MODEL,
    # which does not follow the nested DOCRAG_CHUNKING__LOCAL_MODEL naming and
    # is therefore not picked up by the settings sources. Apply it explicitly,
    # on a copy so no shared default instance is mutated.
    chunking_model = os.environ.get("DOCRAG_CHUNKING_MODEL")
    if chunking_model:
        config.chunking = config.chunking.model_copy(
            update={"local_model": chunking_model}
        )

    return config
@@ -0,0 +1 @@
1
+ # Embedding module
@@ -0,0 +1,47 @@
1
+ from openai import AsyncOpenAI
2
+ from ..config import EmbeddingConfig
3
+
4
+
5
class EmbeddingClient:
    """Thin async wrapper that requests embeddings in batches."""

    def __init__(self, config: "EmbeddingConfig"):
        """Create the API client and copy the model settings from *config*."""
        self.client = AsyncOpenAI(base_url=config.base_url, api_key=config.api_key)
        self.model = config.model
        self.dimensions = config.dimensions
        self.batch_size = config.batch_size

    async def embed(self, texts: list[str]) -> list[list[float]]:
        """Generates embeddings for a list of texts in batches.

        Args:
            texts: Input strings; an empty list returns an empty list without
                contacting the API.

        Returns:
            One embedding per input text, in the same order as *texts*.

        Raises:
            ValueError: If a response does not contain exactly one embedding
                per text in the batch (the results could not be aligned).
        """
        if not texts:
            return []

        # A non-positive batch size would make range() raise (step 0) or
        # silently produce no batches (negative step).
        step = max(1, self.batch_size)
        embeddings: list[list[float]] = []

        for start in range(0, len(texts), step):
            batch = texts[start : start + step]
            try:
                # Try sending with dimensions parameter
                response = await self.client.embeddings.create(
                    input=batch,
                    model=self.model,
                    dimensions=self.dimensions,
                )
            except Exception:
                # Deliberately broad: servers reject an unsupported
                # `dimensions` parameter in different ways. If the retry
                # fails as well, its exception propagates to the caller.
                response = await self.client.embeddings.create(
                    input=batch,
                    model=self.model,
                )

            items = list(response.data)
            if len(items) != len(batch):
                raise ValueError(
                    f"Embedding API returned {len(items)} embeddings "
                    f"for a batch of {len(batch)} texts."
                )

            # Restore input order using each item's `index` field; items
            # without an integer index keep their position in the response.
            def _order(pair: tuple[int, object]) -> int:
                position, item = pair
                index = getattr(item, "index", None)
                return index if isinstance(index, int) else position

            embeddings.extend(
                item.embedding for _, item in sorted(enumerate(items), key=_order)
            )

        return embeddings

    async def embed_query(self, query: str) -> list[float]:
        """Generates embedding for a single search query.

        Raises:
            ValueError: If no embedding was produced for the query.
        """
        results = await self.embed([query])
        if not results:
            raise ValueError("Failed to generate embedding for query.")
        return results[0]
@@ -0,0 +1 @@
1
+ # Ingestion module
@@ -0,0 +1,125 @@
1
+ from datetime import datetime, timezone
2
+ import hashlib
3
+ import os
4
+ from ..config import ChunkingConfig
5
+ from ..models import ChunkMetadata, DocumentChunk
6
+ from .extractor import ExtractedPage
7
+
8
+
9
class DocumentChunker:
    """Splits extracted pages into chunks and attaches per-chunk metadata."""

    def __init__(self, config: ChunkingConfig):
        self.config = config
        self._init_chunkers()

    def _init_chunkers(self) -> None:
        """Build the semantic chunker (with a token fallback) and the Markdown chunker."""
        import chonkie

        # Semantic chunker for TXT / PDF. If it cannot be constructed, warn on
        # stdout and fall back to a plain token chunker.
        try:
            chunker = chonkie.SemanticChunker(
                embedding_model=self.config.local_model,
                chunk_size=self.config.max_chunk_size,
                threshold=self.config.similarity_threshold,
            )
        except Exception as exc:
            print(
                f"Warning: Failed to load SemanticChunker with '{self.config.local_model}': {exc}"
            )
            print("Falling back to TokenChunker for semantic text.")
            chunker = chonkie.TokenChunker(chunk_size=self.config.max_chunk_size)
        self.semantic_chunker = chunker

        # Markdown is split by the recursive chunker rather than by semantic
        # embeddings.
        self.markdown_chunker = chonkie.RecursiveChunker(
            chunk_size=self.config.max_chunk_size,
        )

    @staticmethod
    def _resolve_title(pages: list[ExtractedPage], file_name: str) -> str:
        """Pick the document title: page metadata, then first level-1 heading, then file stem."""
        first = pages[0] if pages else None
        title = None
        if first is not None and first.metadata:
            title = first.metadata.get("title")
        if not title and first is not None and first.headings:
            title = next((text for text, level in first.headings if level == 1), None)
        return title or os.path.splitext(file_name)[0]

    @staticmethod
    def _resolve_section(chunk_text: str, headings: list[tuple[str, int]]) -> str | None:
        """Pick the last page heading found in the chunk, else the page's first heading."""
        if not headings:
            return None
        contained = next(
            (text for text, _ in reversed(headings) if text in chunk_text), None
        )
        return contained or headings[0][0]

    def chunk_pages(
        self,
        pages: list[ExtractedPage],
        file_path: str,
        collection_name: str,
        file_hash: str,
        file_type: str,
        last_modified: datetime,
    ) -> list[DocumentChunk]:
        """Transforms a list of ExtractedPages into a flat list of metadata-rich DocumentChunks.

        Pages whose text is blank are skipped. Files of type ``"md"`` go
        through the Markdown chunker, everything else through the semantic
        chunker. Chunk ids are the SHA-256 of ``"<file_path>_<index>"``.
        """
        file_name = os.path.basename(file_path)
        document_title = self._resolve_title(pages, file_name)
        chunker = self.markdown_chunker if file_type == "md" else self.semantic_chunker

        # (text, page number, page headings) for every chunk, in page order.
        pieces: list[tuple[str, int | None, list[tuple[str, int]]]] = []
        for page in pages:
            stripped = page.text.strip()
            if not stripped:
                continue
            pieces.extend(
                (piece.text, page.page_number, page.headings)
                for piece in chunker.chunk(stripped)
            )

        total_chunks = len(pieces)
        document_chunks: list[DocumentChunk] = []
        for position, (body, page_number, page_headings) in enumerate(pieces):
            content_digest = hashlib.sha256(body.encode("utf-8")).hexdigest()
            identifier = hashlib.sha256(
                f"{file_path}_{position}".encode("utf-8")
            ).hexdigest()

            chunk_meta = ChunkMetadata(
                file_path=file_path,
                file_name=file_name,
                collection=collection_name,
                file_hash=file_hash,
                chunk_hash=content_digest,
                chunk_index=position,
                total_chunks=total_chunks,
                title=document_title,
                section=self._resolve_section(body, page_headings),
                file_type=file_type,
                page_number=page_number,
                last_modified=last_modified,
                ingested_at=datetime.now(timezone.utc),
                vision_processed=False,
            )
            document_chunks.append(
                DocumentChunk(
                    id=identifier,
                    text=body,
                    metadata=chunk_meta,
                    embedding=None,
                )
            )

        return document_chunks
@@ -0,0 +1,181 @@
1
from collections import Counter
from dataclasses import dataclass, field
from pathlib import Path
from typing import Any

import fitz  # PyMuPDF
import yaml
6
+
7
+
8
+ @dataclass
9
+ class ExtractedPage:
10
+ text: str
11
+ page_number: int | None = None
12
+ headings: list[tuple[str, int]] = field(default_factory=list) # list of (heading_text, heading_level)
13
+ image_bytes: bytes | None = None
14
+ metadata: dict[str, any] = field(default_factory=dict)
15
+
16
+
17
class DocumentExtractor:
    """Extracts text, headings and metadata from ``.txt``, ``.md`` and ``.pdf`` files."""

    def __init__(self, vision_enabled: bool = False):
        # When enabled, PDF pages with very little text are also rendered to PNG.
        self.vision_enabled = vision_enabled

    def extract(self, file_path: Path | str) -> list[ExtractedPage]:
        """Extracts content from a file depending on its extension.

        Raises:
            ValueError: If the (case-insensitive) extension is not
                ``.txt``, ``.md`` or ``.pdf``.
        """
        path = Path(file_path)
        ext = path.suffix.lower()

        if ext == ".txt":
            return self._extract_txt(path)
        elif ext == ".md":
            return self._extract_md(path)
        elif ext == ".pdf":
            return self._extract_pdf(path)
        else:
            raise ValueError(f"Unsupported file extension: {ext}")

    def _extract_txt(self, path: Path) -> list[ExtractedPage]:
        """Read a plain-text file as a single page; undecodable bytes are dropped."""
        with open(path, "r", encoding="utf-8", errors="ignore") as f:
            content = f.read()

        headings = []
        lines = content.splitlines()
        if lines:
            first_line = lines[0].strip()
            # If first line looks like a title, mark it as heading level 1
            if first_line and len(first_line) < 80 and not first_line.endswith((".", ",", ";")):
                headings.append((first_line, 1))

        return [
            ExtractedPage(
                text=content,
                page_number=1,
                headings=headings,
                metadata={"title": path.stem},
            )
        ]

    @staticmethod
    def _fence_char(stripped: str) -> str | None:
        """Return the fence character if *stripped* opens/closes a fenced code block."""
        for char in ("`", "~"):
            if stripped.startswith(char * 3):
                # Backticks after the opening run mean this is an inline code
                # span on one line, not a fence.
                if char == "`" and "`" in stripped.lstrip("`"):
                    return None
                return char
        return None

    def _extract_md(self, path: Path) -> list[ExtractedPage]:
        """Read a Markdown file as a single page, parsing front matter and ATX headings.

        The title is the front matter ``title`` when present, else the file stem.
        """
        with open(path, "r", encoding="utf-8", errors="ignore") as f:
            content = f.read()

        frontmatter: dict = {}
        body = content

        # Parse frontmatter
        if content.startswith("---"):
            parts = content.split("---", 2)
            if len(parts) >= 3:
                try:
                    parsed = yaml.safe_load(parts[1])
                except Exception:
                    # Best effort: unparseable front matter leaves the whole
                    # file as the body.
                    pass
                else:
                    if parsed is None:
                        parsed = {}
                    # Only a mapping is front matter. A leading "---" rule
                    # followed by prose parses as a str/list, which must stay
                    # in the body rather than being dropped (and would break
                    # the .get() below).
                    if isinstance(parsed, dict):
                        frontmatter = parsed
                        body = parts[2]

        headings = []
        open_fence: str | None = None
        for line in body.splitlines():
            line_strip = line.strip()

            # Skip fenced code blocks so "# comment" lines are not headings.
            fence = self._fence_char(line_strip)
            if open_fence is not None:
                if fence == open_fence:
                    open_fence = None
                continue
            if fence is not None:
                open_fence = fence
                continue

            if not line_strip.startswith("#"):
                continue
            # Count the heading level
            level = len(line_strip) - len(line_strip.lstrip("#"))
            # Markdown headings have 1-6 hashes followed by a space.
            if level > 6 or line_strip[level : level + 1] != " ":
                continue
            heading_text = line_strip[level:].strip()
            if heading_text:
                headings.append((heading_text, level))

        # Default title is the frontmatter title, or file stem
        title = frontmatter.get("title", path.stem)
        meta = {"title": title, "frontmatter": frontmatter}

        return [
            ExtractedPage(
                text=body,
                page_number=1,
                headings=headings,
                metadata=meta,
            )
        ]

    @staticmethod
    def _detect_pdf_headings(page) -> list[tuple[str, int]]:
        """Typography-aware heading detection for one PDF page.

        A line counts as a heading when its first span is bold or clearly
        larger than the page's most common font size, the line is short, and
        it does not end in sentence punctuation. Any failure keeps whatever
        was collected so far instead of breaking extraction.
        """
        headings: list[tuple[str, int]] = []
        try:
            blocks = page.get_text("dict").get("blocks", [])
            # type 0 == text block
            text_lines = [
                line
                for block in blocks
                if block.get("type") == 0
                for line in block.get("lines", [])
            ]

            # Compute body text font size (most common size)
            sizes = [
                span.get("size", 10.0)
                for line in text_lines
                for span in line.get("spans", [])
                if span.get("text", "").strip()
            ]
            body_size = Counter(sizes).most_common(1)[0][0] if sizes else 10.0

            for line in text_lines:
                spans = line.get("spans", [])
                if not spans:
                    continue
                line_text = "".join(s.get("text", "") for s in spans).strip()
                if not line_text:
                    continue

                first_span = spans[0]
                size = first_span.get("size", 10.0)
                font = first_span.get("font", "").lower()

                is_bold = "bold" in font or "black" in font or "heavy" in font
                is_large = size > body_size * 1.2

                # Short line, doesn't end with typical sentence punctuation
                if (
                    (is_large or is_bold)
                    and len(line_text) < 120
                    and not line_text.endswith((".", ":", ";", ","))
                ):
                    if size > body_size * 1.5:
                        level = 1
                    elif size > body_size * 1.3:
                        level = 2
                    else:
                        level = 3
                    headings.append((line_text, level))
        except Exception:
            # Fallback: if dict parsing fails, don't break extraction
            pass
        return headings

    def _extract_pdf(self, path: Path) -> list[ExtractedPage]:
        """Extract one ExtractedPage per PDF page.

        Pages with fewer than 40 non-whitespace characters are treated as
        empty/scanned and, when vision is enabled, rendered to PNG.
        """
        pages: list[ExtractedPage] = []

        # Context manager closes the document even if a page raises.
        with fitz.open(path) as doc:
            for page_num, page in enumerate(doc, start=1):
                text = page.get_text("text")

                # Scanned check (very little text)
                non_space_chars = len("".join(text.split()))
                is_empty_or_scanned = non_space_chars < 40

                image_bytes = None
                if is_empty_or_scanned and self.vision_enabled:
                    # Render to high-quality PNG (144 DPI)
                    pix = page.get_pixmap(matrix=fitz.Matrix(2, 2))
                    image_bytes = pix.tobytes("png")

                pages.append(
                    ExtractedPage(
                        text=text,
                        page_number=page_num,
                        headings=self._detect_pdf_headings(page),
                        image_bytes=image_bytes,
                        metadata={"title": path.stem},
                    )
                )

        return pages