document-rag-mcp 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- document_rag_mcp/__init__.py +2 -0
- document_rag_mcp/cli.py +136 -0
- document_rag_mcp/config.py +94 -0
- document_rag_mcp/embedding/__init__.py +1 -0
- document_rag_mcp/embedding/client.py +47 -0
- document_rag_mcp/ingestion/__init__.py +1 -0
- document_rag_mcp/ingestion/chunker.py +125 -0
- document_rag_mcp/ingestion/extractor.py +181 -0
- document_rag_mcp/ingestion/pipeline.py +172 -0
- document_rag_mcp/ingestion/scanner.py +35 -0
- document_rag_mcp/ingestion/watcher.py +80 -0
- document_rag_mcp/models.py +32 -0
- document_rag_mcp/search/__init__.py +1 -0
- document_rag_mcp/search/engine.py +53 -0
- document_rag_mcp/server.py +256 -0
- document_rag_mcp/storage/__init__.py +1 -0
- document_rag_mcp/storage/state_store.py +145 -0
- document_rag_mcp/storage/vector_store.py +164 -0
- document_rag_mcp/vision/__init__.py +1 -0
- document_rag_mcp/vision/client.py +53 -0
- document_rag_mcp-0.1.0.dist-info/METADATA +23 -0
- document_rag_mcp-0.1.0.dist-info/RECORD +24 -0
- document_rag_mcp-0.1.0.dist-info/WHEEL +4 -0
- document_rag_mcp-0.1.0.dist-info/entry_points.txt +2 -0
document_rag_mcp/cli.py
ADDED
|
@@ -0,0 +1,136 @@
|
|
|
1
|
+
import asyncio
|
|
2
|
+
import os
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
import click
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
@click.group()
@click.option(
    "--config",
    "-c",
    type=click.Path(exists=True, path_type=Path),
    envvar="DOCRAG_CONFIG",
    help="Path to the YAML configuration file.",
)
@click.option(
    "--chunking-model",
    envvar="DOCRAG_CHUNKING_MODEL",
    help="Override the local model used for semantic chunking boundary detection.",
)
@click.pass_context
def main(ctx: click.Context, config: Path | None, chunking_model: str | None) -> None:
    """RAG MCP Server CLI — Manage document indexing and semantic search server."""
    # The docstring above is rendered by click as the --help text, so it is
    # kept verbatim; maintainer notes live in comments instead.
    #
    # Overrides are handed to the rest of the package through environment
    # variables because the sub-commands import the server module lazily,
    # after this callback has run.
    if config:
        # Resolve to an absolute path so later working-directory changes
        # cannot invalidate it.
        os.environ["DOCRAG_CONFIG"] = str(config.resolve())
    if chunking_model:
        os.environ["DOCRAG_CHUNKING_MODEL"] = chunking_model
        # The settings model reads nested fields as DOCRAG_<SECTION>__<FIELD>,
        # so the override is also exported under the name that maps to
        # `chunking.local_model`.
        # NOTE(review): server.py may already consume DOCRAG_CHUNKING_MODEL
        # directly - confirm; exporting both names is harmless either way.
        os.environ["DOCRAG_CHUNKING__LOCAL_MODEL"] = chunking_model
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
@main.command()
@click.option(
    "--transport",
    type=click.Choice(["stdio", "http"]),
    default="stdio",
    help="Transport protocol to run the server on (stdio or http/sse).",
)
@click.option(
    "--host",
    default="127.0.0.1",
    help="Host to bind the HTTP server to (http transport only).",
)
@click.option(
    "--port",
    default=8000,
    type=int,
    help="Port to run the HTTP server on (http transport only).",
)
def serve(transport: str, host: str, port: int) -> None:
    """Start the Model Context Protocol (MCP) server."""
    # Lazy import to ensure CLI overrides are set in environment first
    import document_rag_mcp.server as mcp_server

    use_http = transport == "http"
    transport_mcp = "sse" if use_http else "stdio"

    # Status messages go to stderr: with the stdio transport, stdout carries
    # the protocol stream and must not contain anything else.
    click.echo(f"Starting document-rag-mcp server on {transport} transport...", err=True)

    run_kwargs: dict[str, object] = {"transport": transport_mcp}
    if use_http:
        click.echo(f"Binding to http://{host}:{port}", err=True)
        # Network options are only meaningful (and only forwarded) for the
        # HTTP transport; the stdio runner has no use for them.
        run_kwargs.update(host=host, port=port)

    mcp_server.mcp.run(**run_kwargs)
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
@main.command()
@click.argument("query")
@click.option(
    "--collection",
    "-c",
    default=None,
    help="Filter search results by a specific collection.",
)
@click.option(
    "--top-k",
    "-k",
    default=5,
    type=int,
    help="Number of results to return.",
)
def search(query: str, collection: str | None, top_k: int) -> None:
    """Run a semantic search against the indexed collections."""
    # Imported here so that environment overrides set by the group callback
    # are in place before the server module is loaded.
    import document_rag_mcp.server as mcp_server

    async def _search_and_print() -> None:
        # Query the search engine and print one formatted entry per hit.
        click.echo(f"Searching for: '{query}'...")
        hits = await mcp_server.search_engine.search(
            query=query, collection_name=collection, top_k=top_k
        )
        if not hits:
            click.echo("No matching documents found.")
            return

        # Ranks are shown 1-based.
        for rank, hit in enumerate(hits, start=1):
            click.echo(f"\n[{rank}] Score: {hit.score:.4f} | File: {hit.metadata.file_path}")
            click.echo(
                f"Collection: {hit.metadata.collection} | Page: {hit.metadata.page_number or 1}"
            )
            click.echo(
                f"Title: {hit.metadata.title or 'N/A'} | Section: {hit.metadata.section or 'N/A'}"
            )
            click.echo("-" * 40)
            click.echo(hit.text.strip())
            click.echo("=" * 60)

    asyncio.run(_search_and_print())
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
@main.command()
@click.option(
    "--collection",
    "-c",
    default=None,
    help="Limit ingestion to a specific collection.",
)
def ingest(collection: str | None) -> None:
    """Recursively scan folders and index new/changed files immediately."""
    # Imported here so that environment overrides set by the group callback
    # are in place before the server module is loaded.
    import document_rag_mcp.server as mcp_server

    async def _ingest_once() -> None:
        # Run a single ingestion pass and print whatever summary it returns.
        click.echo("Starting one-shot ingestion scan...")
        summary = await mcp_server.ingest_now(collection=collection)
        click.echo(summary)

    asyncio.run(_ingest_once())
|
|
116
|
+
|
|
117
|
+
|
|
118
|
+
@main.command()
def collections() -> None:
    """List all configured collections and their indexing status."""
    # Imported here so that environment overrides set by the group callback
    # are in place before the server module is loaded.
    import document_rag_mcp.server as mcp_server

    heavy_rule = "=" * 60
    light_rule = "-" * 60

    click.echo("Configured Collections:")
    click.echo(heavy_rule)
    for entry in mcp_server.config.collections:
        # Look up the chunk statistics before printing anything for this entry.
        entry_stats = mcp_server.vector_store.collection_stats(entry.name)
        joined_paths = ", ".join(map(str, entry.paths))
        click.echo(f"Name: {entry.name}")
        click.echo(f"Paths: {joined_paths}")
        click.echo(f"Patterns: {', '.join(entry.file_patterns)}")
        click.echo(f"Indexed Chunks: {entry_stats['count']}")
        click.echo(light_rule)
|
|
133
|
+
|
|
134
|
+
|
|
135
|
+
# Run the click command group when this module is executed as a script.
if __name__ == "__main__":
    main()
|
|
@@ -0,0 +1,94 @@
|
|
|
1
|
+
import os
|
|
2
|
+
from pathlib import Path
|
|
3
|
+
from typing import Any
|
|
4
|
+
import yaml
|
|
5
|
+
from pydantic import BaseModel, Field
|
|
6
|
+
from pydantic_settings import (
|
|
7
|
+
BaseSettings,
|
|
8
|
+
SettingsConfigDict,
|
|
9
|
+
PydanticBaseSettingsSource,
|
|
10
|
+
)
|
|
11
|
+
|
|
12
|
+
# Settings for one named document collection.
class CollectionConfig(BaseModel):
    # Collection identifier, e.g. "project-docs".
    name: str
    # Folders to watch/scan for this collection.
    paths: list[Path]
    # File-name patterns; the defaults cover text, Markdown and PDF files.
    file_patterns: list[str] = ["*.txt", "*.md", "*.pdf"]
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
# Connection and model settings for the embedding endpoint.
class EmbeddingConfig(BaseModel):
    # Base URL of the endpoint (lemonade default).
    base_url: str = "http://localhost:8080/v1"
    # API key sent with requests; the default is a placeholder value.
    api_key: str = "unused"
    # Name of the embedding model to request.
    model: str = "embed-gemma-300m-FLM"
    # Embedding vector size (gemma embedding dims).
    dimensions: int = 768
    # Number of texts sent per embedding request.
    batch_size: int = 32
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
# Settings for the optional vision model; disabled by default.
class VisionConfig(BaseModel):
    # Master switch for vision processing.
    enabled: bool = False
    # Base URL of the vision endpoint.
    base_url: str = "http://localhost:8080/v1"
    # API key sent with requests; the default is a placeholder value.
    api_key: str = "unused"
    # Name of the vision model to request.
    model: str = "gpt-4o"
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
# Settings that control how extracted text is split into chunks.
class ChunkingConfig(BaseModel):
    # Upper bound on chunk size, in tokens.
    max_chunk_size: int = 512
    # Threshold used for semantic boundary detection.
    similarity_threshold: float = 0.5
    # Local model used by the semantic chunker; overridable via the
    # --chunking-model CLI flag.
    local_model: str = "all-MiniLM-L6-v2"
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
# Location of on-disk state.
class StorageConfig(BaseModel):
    # Directory holding the ChromaDB + SQLite storage.
    data_dir: Path = Path("./data")
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
# Default network binding for the server.
class ServerConfig(BaseModel):
    # Host/interface to bind to (loopback by default).
    host: str = "127.0.0.1"
    # TCP port to bind to.
    port: int = 8000
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
# Root application settings. Values come from keyword arguments (the parsed
# YAML file) and from DOCRAG_-prefixed environment variables, where nested
# fields are addressed with a double underscore.
class AppConfig(BaseSettings):
    model_config = SettingsConfigDict(
        extra="ignore",
        env_nested_delimiter="__",
        env_prefix="DOCRAG_",
    )

    # One entry per configured document collection; empty by default.
    collections: list[CollectionConfig] = Field(default_factory=list)
    # Per-subsystem settings, each with its own defaults.
    embedding: EmbeddingConfig = EmbeddingConfig()
    vision: VisionConfig = VisionConfig()
    chunking: ChunkingConfig = ChunkingConfig()
    storage: StorageConfig = StorageConfig()
    server: ServerConfig = ServerConfig()

    @classmethod
    def settings_customise_sources(
        cls,
        settings_cls: type[BaseSettings],
        init_settings: PydanticBaseSettingsSource,
        env_settings: PydanticBaseSettingsSource,
        dotenv_settings: PydanticBaseSettingsSource,
        file_secret_settings: PydanticBaseSettingsSource,
    ) -> tuple[PydanticBaseSettingsSource, ...]:
        """Return the settings sources ordered from highest to lowest priority.

        Environment variables are placed ahead of the init arguments so that
        they override values that were loaded from the YAML file.
        """
        ordered_sources = (
            env_settings,
            init_settings,
            dotenv_settings,
            file_secret_settings,
        )
        return ordered_sources
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
def load_config(config_path: Path | str | None = None) -> AppConfig:
    """Load the application configuration from YAML plus environment variables.

    Environment variables must be prefixed with DOCRAG_ and nested with double
    underscores, for example: DOCRAG_EMBEDDING__MODEL="text-embedding-3-small".

    Args:
        config_path: Path of the YAML file. When omitted or empty, the
            DOCRAG_CONFIG environment variable is consulted instead.

    Returns:
        An AppConfig built from the YAML mapping (if any), with environment
        variables taking precedence over the YAML values.

    A missing file, or a YAML document whose top level is not a mapping, does
    not abort loading: a warning is emitted and the defaults (plus environment
    variables) are used.
    """
    import warnings

    if not config_path:
        config_path = os.environ.get("DOCRAG_CONFIG")

    yaml_data: dict[str, Any] = {}
    if config_path:
        path = Path(config_path)
        try:
            # Open directly instead of checking exists() first, which would
            # leave a window between the check and the read.
            with open(path, encoding="utf-8") as f:
                loaded = yaml.safe_load(f)
        except FileNotFoundError:
            warnings.warn(
                f"Configuration file not found: {path}; "
                "falling back to defaults and environment variables.",
                stacklevel=2,
            )
        else:
            if isinstance(loaded, dict):
                yaml_data = loaded
            elif loaded is not None:
                warnings.warn(
                    f"Configuration file {path} does not contain a YAML mapping; ignoring it.",
                    stacklevel=2,
                )

    # Env vars override these init values only because AppConfig reorders its
    # settings sources; pydantic-settings' default gives init values priority.
    return AppConfig(**yaml_data)
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
# Embedding module
|
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
from openai import AsyncOpenAI
|
|
2
|
+
from ..config import EmbeddingConfig
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
class EmbeddingClient:
    """Async client that turns texts into embedding vectors via an OpenAI-style API."""

    def __init__(self, config: EmbeddingConfig):
        """Create the underlying API client and copy the request settings from *config*."""
        self.client = AsyncOpenAI(base_url=config.base_url, api_key=config.api_key)
        self.model = config.model
        self.dimensions = config.dimensions
        self.batch_size = config.batch_size

    async def embed(self, texts: list[str]) -> list[list[float]]:
        """Generate embeddings for *texts*, sending them in batches.

        Returns one vector per input text, in input order. An empty input
        yields an empty list without contacting the API.

        Raises:
            ValueError: if the API returns a different number of embeddings
                than texts were sent in a batch.
        """
        if not texts:
            return []

        # Guard against a non-positive batch size, which would make range() fail.
        step = max(1, self.batch_size)

        embeddings: list[list[float]] = []
        for start in range(0, len(texts), step):
            embeddings.extend(await self._embed_batch(texts[start : start + step]))
        return embeddings

    async def _embed_batch(self, batch: list[str]) -> list[list[float]]:
        """Embed one batch and return its vectors aligned with *batch*."""
        try:
            # Try sending with the dimensions parameter first.
            response = await self.client.embeddings.create(
                input=batch,
                model=self.model,
                dimensions=self.dimensions,
            )
        except Exception:
            # Deliberately broad: servers that do not support the dimensions
            # parameter reject it in different ways, so any failure triggers
            # one retry without it. A genuine outage will fail again here and
            # propagate to the caller.
            response = await self.client.embeddings.create(
                input=batch,
                model=self.model,
            )

        items = list(response.data)
        if len(items) != len(batch):
            # A short or long response would silently pair vectors with the
            # wrong texts, so fail loudly instead.
            raise ValueError(
                f"Embedding API returned {len(items)} embeddings for {len(batch)} inputs."
            )

        # Restore input order from the per-item index when every item carries
        # one; otherwise trust the order of the response as received.
        if all(isinstance(getattr(item, "index", None), int) for item in items):
            items.sort(key=lambda item: item.index)

        return [item.embedding for item in items]

    async def embed_query(self, query: str) -> list[float]:
        """Generate the embedding for a single search query.

        Raises:
            ValueError: if no embedding could be produced for the query.
        """
        results = await self.embed([query])
        if not results:
            raise ValueError("Failed to generate embedding for query.")
        return results[0]
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
# Ingestion module
|
|
@@ -0,0 +1,125 @@
|
|
|
1
|
+
from datetime import datetime, timezone
|
|
2
|
+
import hashlib
|
|
3
|
+
import os
|
|
4
|
+
from ..config import ChunkingConfig
|
|
5
|
+
from ..models import ChunkMetadata, DocumentChunk
|
|
6
|
+
from .extractor import ExtractedPage
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class DocumentChunker:
    """Splits extracted pages into chunks and attaches per-chunk metadata."""

    def __init__(self, config: ChunkingConfig):
        """Store *config* and build the underlying chunkers."""
        self.config = config
        self._init_chunkers()

    def _init_chunkers(self) -> None:
        """Create the semantic chunker (with a token-based fallback) and the Markdown chunker."""
        import sys

        import chonkie

        # 1. Semantic chunker (used for everything except Markdown).
        try:
            self.semantic_chunker = chonkie.SemanticChunker(
                embedding_model=self.config.local_model,
                chunk_size=self.config.max_chunk_size,
                threshold=self.config.similarity_threshold,
            )
        except Exception as e:
            # Best-effort fallback when the semantic chunker cannot be built.
            # Warnings go to stderr: under the stdio transport, stdout carries
            # the protocol stream and must stay clean.
            print(
                f"Warning: Failed to load SemanticChunker with '{self.config.local_model}': {e}",
                file=sys.stderr,
            )
            print("Falling back to TokenChunker for semantic text.", file=sys.stderr)
            self.semantic_chunker = chonkie.TokenChunker(chunk_size=self.config.max_chunk_size)

        # 2. Recursive chunker for Markdown (split rules rather than embeddings).
        self.markdown_chunker = chonkie.RecursiveChunker(
            chunk_size=self.config.max_chunk_size,
        )

    @staticmethod
    def _resolve_title(pages: list[ExtractedPage], file_name: str) -> str:
        """Pick the document title: metadata title, else first level-1 heading, else file stem."""
        title = None
        first_page = pages[0] if pages else None
        if first_page is not None:
            if first_page.metadata:
                title = first_page.metadata.get("title")
            if not title and first_page.headings:
                title = next(
                    (text for text, level in first_page.headings if level == 1),
                    None,
                )
        return title or os.path.splitext(file_name)[0]

    def chunk_pages(
        self,
        pages: list[ExtractedPage],
        file_path: str,
        collection_name: str,
        file_hash: str,
        file_type: str,
        last_modified: datetime,
    ) -> list[DocumentChunk]:
        """Transform extracted pages into a flat list of metadata-rich chunks.

        Args:
            pages: Extracted pages, in document order.
            file_path: Path of the source file; also seeds the chunk IDs.
            collection_name: Name of the collection the file belongs to.
            file_hash: Hash of the whole file, stored on every chunk.
            file_type: File type tag; "md" selects the Markdown chunker.
            last_modified: Modification time of the source file.

        Returns:
            One DocumentChunk per produced chunk, with embedding left unset.
            Pages whose text is empty after stripping produce no chunks.
        """
        file_name = os.path.basename(file_path)
        document_title = self._resolve_title(pages, file_name)

        # (chunk text, page number, headings of the originating page)
        raw_chunks: list[tuple[str, int | None, list[tuple[str, int]]]] = []
        chunker = self.markdown_chunker if file_type == "md" else self.semantic_chunker
        for page in pages:
            text_to_chunk = page.text.strip()
            if not text_to_chunk:
                continue
            raw_chunks.extend(
                (piece.text, page.page_number, page.headings)
                for piece in chunker.chunk(text_to_chunk)
            )

        total_chunks = len(raw_chunks)
        document_chunks: list[DocumentChunk] = []

        # Most recent heading seen so far. Chunks that contain no heading of
        # their own inherit it, so they are attributed to the closest
        # preceding heading rather than to the first heading of the page.
        current_section: str | None = None

        for idx, (chunk_text, page_num, headings) in enumerate(raw_chunks):
            # Prefer the last heading (in document order) that occurs in the chunk.
            matched = next(
                (text for text, _ in reversed(headings) if text and text in chunk_text),
                None,
            )
            if matched:
                current_section = matched
            # Before any heading has been matched, fall back to the page's first heading.
            section = current_section or (headings[0][0] if headings else None)

            # Content hash of the chunk text.
            chunk_hash = hashlib.sha256(chunk_text.encode("utf-8")).hexdigest()

            # Deterministic chunk ID derived from the file path and chunk position.
            chunk_id_seed = f"{file_path}_{idx}"
            chunk_id = hashlib.sha256(chunk_id_seed.encode("utf-8")).hexdigest()

            metadata = ChunkMetadata(
                file_path=file_path,
                file_name=file_name,
                collection=collection_name,
                file_hash=file_hash,
                chunk_hash=chunk_hash,
                chunk_index=idx,
                total_chunks=total_chunks,
                title=document_title,
                section=section,
                file_type=file_type,
                page_number=page_num,
                last_modified=last_modified,
                ingested_at=datetime.now(timezone.utc),
                vision_processed=False,
            )

            document_chunks.append(
                DocumentChunk(
                    id=chunk_id,
                    text=chunk_text,
                    metadata=metadata,
                    embedding=None,
                )
            )

        return document_chunks
|
|
@@ -0,0 +1,181 @@
|
|
|
1
|
+
from collections import Counter
from dataclasses import dataclass, field
from pathlib import Path
from typing import Any

import fitz  # PyMuPDF
import yaml
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
@dataclass
|
|
9
|
+
class ExtractedPage:
|
|
10
|
+
text: str
|
|
11
|
+
page_number: int | None = None
|
|
12
|
+
headings: list[tuple[str, int]] = field(default_factory=list) # list of (heading_text, heading_level)
|
|
13
|
+
image_bytes: bytes | None = None
|
|
14
|
+
metadata: dict[str, any] = field(default_factory=dict)
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class DocumentExtractor:
    """Extracts text, headings and optional page images from TXT, Markdown and PDF files."""

    # PDF pages with fewer non-whitespace characters than this are treated as
    # empty or scanned.
    _SCANNED_CHAR_THRESHOLD = 40
    # Font size assumed for body text when a page yields no size information.
    _DEFAULT_BODY_SIZE = 10.0

    def __init__(self, vision_enabled: bool = False):
        """Remember whether near-empty PDF pages should be rendered to images."""
        self.vision_enabled = vision_enabled

    def extract(self, file_path: Path | str) -> list[ExtractedPage]:
        """Extract content from *file_path* according to its extension.

        Raises:
            ValueError: if the extension is not .txt, .md or .pdf.
        """
        path = Path(file_path)
        ext = path.suffix.lower()

        if ext == ".txt":
            return self._extract_txt(path)
        if ext == ".md":
            return self._extract_md(path)
        if ext == ".pdf":
            return self._extract_pdf(path)
        raise ValueError(f"Unsupported file extension: {ext}")

    def _extract_txt(self, path: Path) -> list[ExtractedPage]:
        """Return the whole text file as a single page."""
        content = path.read_text(encoding="utf-8", errors="ignore")

        headings: list[tuple[str, int]] = []
        lines = content.splitlines()
        if lines:
            first_line = lines[0].strip()
            # A short first line without trailing sentence punctuation is
            # treated as a title (heading level 1).
            if first_line and len(first_line) < 80 and not first_line.endswith((".", ",", ";")):
                headings.append((first_line, 1))

        return [
            ExtractedPage(
                text=content,
                page_number=1,
                headings=headings,
                metadata={"title": path.stem},
            )
        ]

    @staticmethod
    def _split_frontmatter(content: str) -> tuple[dict, str]:
        """Split leading YAML front matter from *content*.

        Returns (frontmatter, body). When the document has no parsable front
        matter mapping, the front matter is empty and the body is the whole
        content.
        """
        if not content.startswith("---"):
            return {}, content
        parts = content.split("---", 2)
        if len(parts) < 3:
            return {}, content
        try:
            loaded = yaml.safe_load(parts[1])
        except Exception:
            # Deliberately broad: documents are arbitrary user files, and any
            # parse failure simply means "no usable front matter".
            return {}, content
        if not loaded:
            # Empty front matter block: drop it from the body.
            return {}, parts[2]
        if not isinstance(loaded, dict):
            # A leading "---" that is not a mapping (e.g. a horizontal rule
            # followed by text) is ordinary content, not front matter.
            return {}, content
        return loaded, parts[2]

    @staticmethod
    def _parse_md_headings(body: str) -> list[tuple[str, int]]:
        """Collect ATX headings ("# Title") from *body* as (text, level) pairs."""
        headings: list[tuple[str, int]] = []
        in_fence = False
        for line in body.splitlines():
            stripped = line.strip()
            # Lines inside fenced code blocks are code, not headings
            # (e.g. "# comment" in a Python snippet).
            if stripped.startswith(("```", "~~~")):
                in_fence = not in_fence
                continue
            if in_fence or not stripped.startswith("#"):
                continue
            level = len(stripped) - len(stripped.lstrip("#"))
            # Markdown headings have 1-6 '#' characters followed by a space.
            if level > 6 or level >= len(stripped) or stripped[level] != " ":
                continue
            heading_text = stripped[level:].strip()
            if heading_text:
                headings.append((heading_text, level))
        return headings

    def _extract_md(self, path: Path) -> list[ExtractedPage]:
        """Return the Markdown body as a single page, with front matter parsed out."""
        content = path.read_text(encoding="utf-8", errors="ignore")

        frontmatter, body = self._split_frontmatter(content)
        headings = self._parse_md_headings(body)

        # Default title is the front matter title, or the file stem.
        title = frontmatter.get("title", path.stem)
        meta = {"title": title, "frontmatter": frontmatter}

        return [
            ExtractedPage(
                text=body,
                page_number=1,
                headings=headings,
                metadata=meta,
            )
        ]

    @classmethod
    def _detect_pdf_headings(cls, page) -> list[tuple[str, int]]:
        """Detect headings on a PDF page from font size and weight.

        Best effort: if the structured text cannot be processed, the headings
        collected so far are returned instead of failing the extraction.
        """
        headings: list[tuple[str, int]] = []
        try:
            blocks = page.get_text("dict").get("blocks", [])
            # Only text blocks (type 0) carry lines and spans.
            text_lines = [
                line
                for block in blocks
                if block.get("type") == 0
                for line in block.get("lines", [])
            ]

            # Body font size = most common size among non-empty spans.
            sizes = [
                span.get("size", cls._DEFAULT_BODY_SIZE)
                for line in text_lines
                for span in line.get("spans", [])
                if span.get("text", "").strip()
            ]
            body_size = Counter(sizes).most_common(1)[0][0] if sizes else cls._DEFAULT_BODY_SIZE

            for line in text_lines:
                spans = line.get("spans", [])
                if not spans:
                    continue
                line_text = "".join(s.get("text", "") for s in spans).strip()
                if not line_text:
                    continue

                # The first span decides the style of the whole line.
                first_span = spans[0]
                size = first_span.get("size", cls._DEFAULT_BODY_SIZE)
                font = first_span.get("font", "").lower()

                is_bold = "bold" in font or "black" in font or "heavy" in font
                is_large = size > body_size * 1.2

                # Headings are short and do not end with sentence punctuation.
                if (
                    (is_large or is_bold)
                    and len(line_text) < 120
                    and not line_text.endswith((".", ":", ";", ","))
                ):
                    if size > body_size * 1.5:
                        level = 1
                    elif size > body_size * 1.3:
                        level = 2
                    else:
                        level = 3
                    headings.append((line_text, level))
        except Exception:
            # Fallback: if dict parsing fails, don't break extraction.
            pass
        return headings

    def _extract_pdf(self, path: Path) -> list[ExtractedPage]:
        """Return one ExtractedPage per PDF page.

        Pages with very little text are considered empty or scanned; when
        vision is enabled they are additionally rendered to a PNG image.
        """
        pages: list[ExtractedPage] = []

        # The context manager closes the document even if a page fails.
        with fitz.open(path) as doc:
            for page_num, page in enumerate(doc, start=1):
                text = page.get_text("text")

                # Scanned check (very little text).
                non_space_chars = len("".join(text.split()))
                is_empty_or_scanned = non_space_chars < self._SCANNED_CHAR_THRESHOLD

                image_bytes = None
                if is_empty_or_scanned and self.vision_enabled:
                    # Render at 2x zoom (144 DPI) to a PNG.
                    pix = page.get_pixmap(matrix=fitz.Matrix(2, 2))
                    image_bytes = pix.tobytes("png")

                pages.append(
                    ExtractedPage(
                        text=text,
                        page_number=page_num,
                        headings=self._detect_pdf_headings(page),
                        image_bytes=image_bytes,
                        metadata={"title": path.stem},
                    )
                )

        return pages
|