msaas-doc-processing 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,23 @@
1
+ node_modules/
2
+ dist/
3
+ .next/
4
+ .turbo/
5
+ *.pyc
6
+ __pycache__/
7
+ .venv/
8
+ *.egg-info/
9
+ .pytest_cache/
10
+ .ruff_cache/
11
+ .env
12
+ .env.*
13
+ !.env.example
14
+ !.env.*.example
15
+ !.env.*.template
16
+ .DS_Store
17
+ coverage/
18
+
19
+ # Runtime artifacts
20
+ logs_llm/
21
+ vectors.db
22
+ vectors.db-shm
23
+ vectors.db-wal
@@ -0,0 +1,22 @@
1
+ Metadata-Version: 2.4
2
+ Name: msaas-doc-processing
3
+ Version: 0.1.0
4
+ Summary: Document processing pipeline — upload, extract, version, convert
5
+ License: MIT
6
+ Requires-Python: >=3.12
7
+ Requires-Dist: msaas-api-core
8
+ Requires-Dist: msaas-errors
9
+ Requires-Dist: pydantic>=2.0
10
+ Requires-Dist: pypdf>=4.0
11
+ Provides-Extra: all
12
+ Requires-Dist: pillow>=10.0; extra == 'all'
13
+ Requires-Dist: python-docx>=1.0; extra == 'all'
14
+ Provides-Extra: dev
15
+ Requires-Dist: pillow>=10.0; extra == 'dev'
16
+ Requires-Dist: pytest-asyncio>=0.24.0; extra == 'dev'
17
+ Requires-Dist: pytest>=8.0; extra == 'dev'
18
+ Requires-Dist: python-docx>=1.0; extra == 'dev'
19
+ Provides-Extra: docx
20
+ Requires-Dist: python-docx>=1.0; extra == 'docx'
21
+ Provides-Extra: image
22
+ Requires-Dist: pillow>=10.0; extra == 'image'
@@ -0,0 +1,38 @@
1
+ [project]
2
+ name = "msaas-doc-processing"
3
+ version = "0.1.0"
4
+ description = "Document processing pipeline — upload, extract, version, convert"
5
+ requires-python = ">=3.12"
6
+ license = { text = "MIT" }
7
+ dependencies = [
8
+ "msaas-errors",
9
+ "msaas-api-core",
10
+ "pydantic>=2.0",
11
+ "pypdf>=4.0",
12
+ ]
13
+
14
+ [project.optional-dependencies]
15
+ docx = ["python-docx>=1.0"]
16
+ image = ["Pillow>=10.0"]
17
+ all = ["python-docx>=1.0", "Pillow>=10.0"]
18
+ dev = [
19
+ "pytest>=8.0",
20
+ "pytest-asyncio>=0.24.0",
21
+ "python-docx>=1.0",
22
+ "Pillow>=10.0",
23
+ ]
24
+
25
+ [build-system]
26
+ requires = ["hatchling"]
27
+ build-backend = "hatchling.build"
28
+
29
+ [tool.hatch.build.targets.wheel]
30
+ packages = ["src/doc_processing"]
31
+
32
+ [tool.pytest.ini_options]
33
+ testpaths = ["tests"]
34
+ asyncio_mode = "auto"
35
+
36
+ [tool.uv.sources]
37
+ msaas-errors = { workspace = true }
38
+ msaas-api-core = { workspace = true }
@@ -0,0 +1,21 @@
1
+ """Willian Doc Processing — document processing pipeline library."""
2
+
3
+ from doc_processing.models import (
4
+ DocumentConfig,
5
+ DocumentFormat,
6
+ DocumentInfo,
7
+ DocumentVersion,
8
+ TextExtractionResult,
9
+ )
10
+ from doc_processing.processor import DocumentProcessor
11
+
12
+ __all__ = [
13
+ # Processor
14
+ "DocumentProcessor",
15
+ # Models
16
+ "DocumentConfig",
17
+ "DocumentFormat",
18
+ "DocumentInfo",
19
+ "DocumentVersion",
20
+ "TextExtractionResult",
21
+ ]
@@ -0,0 +1 @@
1
+ """Document text extractors — pluggable backends for each format."""
@@ -0,0 +1,39 @@
1
+ """Abstract base extractor that all format-specific extractors must implement."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from abc import ABC, abstractmethod
6
+ from typing import TYPE_CHECKING
7
+
8
+ if TYPE_CHECKING:
9
+ from doc_processing.models import DocumentFormat, TextExtractionResult
10
+
11
+
12
class BaseExtractor(ABC):
    """Contract that every format-specific text extractor implements.

    A concrete extractor turns raw document bytes into a
    ``TextExtractionResult`` and reports, through :meth:`supports`, which
    ``DocumentFormat`` values it is able to handle.
    """

    @abstractmethod
    def supports(self, fmt: DocumentFormat) -> bool:
        """Tell whether this extractor can handle ``fmt``.

        Args:
            fmt: Document format being queried.

        Returns:
            True when the extractor handles the format, False otherwise.
        """

    @abstractmethod
    async def extract(self, content: bytes) -> TextExtractionResult:
        """Pull the text out of a document supplied as raw bytes.

        Args:
            content: Complete file contents of the document.

        Returns:
            The extracted text together with metadata and timing details.
        """
@@ -0,0 +1,88 @@
1
+ """DOCX text extractor using python-docx.
2
+
3
+ This module is optional — it requires the ``python-docx`` package.
4
+ Install with: ``pip install msaas-doc-processing[docx]``
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ import io
10
+ import time
11
+
12
+ from doc_processing.extractors.base import BaseExtractor
13
+ from doc_processing.models import DocumentFormat, TextExtractionResult
14
+
15
+ try:
16
+ import structlog
17
+
18
+ logger = structlog.get_logger(__name__)
19
+ except ImportError:
20
+ import logging
21
+
22
+ logger = logging.getLogger(__name__)
23
+
24
+
25
class DocxExtractor(BaseExtractor):
    """Text extractor for DOCX files, backed by python-docx.

    Collects the non-blank paragraphs, then one line per non-empty table
    row, and copies a handful of core document properties into the
    result metadata.
    """

    async def extract(self, content: bytes) -> TextExtractionResult:
        """Extract text and core properties from DOCX bytes.

        Args:
            content: Raw DOCX bytes.

        Returns:
            Extraction result whose text is all paragraphs followed by all
            table rows, separated by blank lines. ``page_count`` is always
            reported as 0.
        """
        # Imported here rather than at module level, so importing this
        # module does not require python-docx to be installed.
        from docx import Document

        started = time.monotonic()
        document = Document(io.BytesIO(content))

        # Paragraph text first (whitespace-only paragraphs are dropped) ...
        parts = [para.text for para in document.paragraphs if para.text.strip()]

        # ... then tables, one " | "-joined line per row with content.
        for table in document.tables:
            for row in table.rows:
                stripped = (cell.text.strip() for cell in row.cells)
                row_line = " | ".join(value for value in stripped if value)
                if row_line:
                    parts.append(row_line)

        full_text = "\n\n".join(parts).strip()

        core = document.core_properties
        metadata: dict = {}
        for key, value in (
            ("title", core.title),
            ("author", core.author),
            ("created", core.created),
            ("modified", core.modified),
            ("subject", core.subject),
        ):
            if not value:
                continue
            # Timestamps are stored as ISO-8601 strings, other values as-is.
            metadata[key] = value.isoformat() if key in ("created", "modified") else value

        return TextExtractionResult(
            text=full_text,
            format=DocumentFormat.DOCX,
            page_count=0,
            word_count=len(full_text.split()),
            extraction_method="docx_parser",
            metadata=metadata,
            processing_time_ms=int((time.monotonic() - started) * 1000),
        )

    def supports(self, fmt: DocumentFormat) -> bool:
        """Return True only for ``DocumentFormat.DOCX``."""
        return fmt == DocumentFormat.DOCX
@@ -0,0 +1,75 @@
1
+ """PDF text extractor using pypdf for digital text extraction."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import io
6
+ import time
7
+
8
+ from doc_processing.extractors.base import BaseExtractor
9
+ from doc_processing.models import DocumentFormat, TextExtractionResult
10
+
11
+ try:
12
+ import structlog
13
+
14
+ logger = structlog.get_logger(__name__)
15
+ except ImportError:
16
+ import logging
17
+
18
+ logger = logging.getLogger(__name__)
19
+
20
+
21
class PDFExtractor(BaseExtractor):
    """Text extractor for PDF files, backed by pypdf.

    Works for PDFs that carry an embedded text layer. When no text comes
    back (for example a scanned document) the result has empty ``text``
    and ``extraction_method == "empty"``; running OCR is left to the
    caller.
    """

    async def extract(self, content: bytes) -> TextExtractionResult:
        """Extract the text layer and document info from PDF bytes.

        Args:
            content: Raw PDF bytes.

        Returns:
            Extraction result with page texts joined by blank lines.
            ``text`` is empty when no page yields any text.
        """
        from pypdf import PdfReader

        started = time.monotonic()
        reader = PdfReader(io.BytesIO(content))

        # A page without extractable text contributes an empty string.
        page_texts = [page.extract_text() or "" for page in reader.pages]
        full_text = "\n\n".join(page_texts).strip()

        metadata: dict = {}
        info = reader.metadata
        if info:
            for key in ("title", "author", "subject", "creator"):
                value = getattr(info, key)
                if value:
                    metadata[key] = value

        method = "digital" if full_text else "empty"
        duration_ms = int((time.monotonic() - started) * 1000)

        return TextExtractionResult(
            text=full_text,
            format=DocumentFormat.PDF,
            page_count=len(reader.pages),
            word_count=len(full_text.split()),
            extraction_method=method,
            metadata=metadata,
            processing_time_ms=duration_ms,
        )

    def supports(self, fmt: DocumentFormat) -> bool:
        """Return True only for ``DocumentFormat.PDF``."""
        return fmt == DocumentFormat.PDF
@@ -0,0 +1,50 @@
1
+ """Plain text and markdown extractor."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import time
6
+
7
+ from doc_processing.extractors.base import BaseExtractor
8
+ from doc_processing.models import DocumentFormat, TextExtractionResult
9
+
10
+
11
class TextExtractor(BaseExtractor):
    """Extract text from plain text and markdown documents.

    Handles UTF-8 decoding (including an optional leading byte-order
    mark), word counting, and basic metadata extraction.

    NOTE(review): the result always reports ``DocumentFormat.TXT``, even
    when this extractor is used for markdown input, because ``extract``
    receives no format argument.
    """

    async def extract(self, content: bytes) -> TextExtractionResult:
        """Extract text from plain text or markdown bytes.

        Args:
            content: Raw bytes (assumed UTF-8, with or without a BOM).

        Returns:
            Extraction result with the stripped text, its word count, and
            ``line_count`` / ``char_count`` in the metadata.
        """
        start = time.monotonic()

        # "utf-8-sig" decodes exactly like "utf-8" but drops a leading BOM.
        # With plain "utf-8" the BOM survives as U+FEFF, which strip() does
        # not remove, so it would end up glued to the first word and counted
        # in char_count. Undecodable bytes are still replaced, not raised.
        text = content.decode("utf-8-sig", errors="replace").strip()
        word_count = len(text.split()) if text else 0
        line_count = text.count("\n") + 1 if text else 0

        metadata: dict = {
            "line_count": line_count,
            "char_count": len(text),
        }

        elapsed_ms = int((time.monotonic() - start) * 1000)

        return TextExtractionResult(
            text=text,
            format=DocumentFormat.TXT,
            page_count=0,
            word_count=word_count,
            extraction_method="plain_text",
            metadata=metadata,
            processing_time_ms=elapsed_ms,
        )

    def supports(self, fmt: DocumentFormat) -> bool:
        """Return True for ``DocumentFormat.TXT`` and ``DocumentFormat.MARKDOWN``."""
        return fmt in {DocumentFormat.TXT, DocumentFormat.MARKDOWN}
@@ -0,0 +1,69 @@
1
+ """Domain models for document processing, metadata, and versioning."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from datetime import datetime # noqa: TC003
6
+ from enum import StrEnum
7
+
8
+ from pydantic import BaseModel, Field
9
+
10
+
11
+ class DocumentFormat(StrEnum):
12
+ """Supported document formats."""
13
+
14
+ PDF = "pdf"
15
+ DOCX = "docx"
16
+ TXT = "txt"
17
+ IMAGE = "image"
18
+ MARKDOWN = "markdown"
19
+ HTML = "html"
20
+
21
+
22
class DocumentConfig(BaseModel):
    """Settings that control the document processing pipeline."""

    # Upper bound on upload size, in mebibytes; DocumentProcessor.validate
    # multiplies it by 1024 * 1024 before comparing against the byte count.
    max_file_size_mb: int = 50

    # Formats accepted by validation; every DocumentFormat member by default.
    # A factory is used so each config instance gets its own list.
    allowed_formats: list[DocumentFormat] = Field(default_factory=lambda: list(DocumentFormat))

    # NOTE(review): the three options below are not read by the processor
    # code visible in this package — confirm where they are consumed.
    extract_metadata: bool = True
    store_versions: bool = True
    max_versions: int = 10
32
+
33
+
34
class DocumentInfo(BaseModel):
    """Summary record describing one processed document."""

    # Identity and basic file facts (all required).
    id: str
    filename: str
    format: DocumentFormat
    size_bytes: int

    # Content statistics; zero when not reported.
    page_count: int = 0
    word_count: int = 0

    # Descriptive fields; empty string when not available.
    language: str = ""
    title: str = ""
    author: str = ""

    # Timestamp supplied by whoever builds the record (required).
    created_at: datetime

    # Free-form extra metadata; each instance gets its own dict.
    metadata: dict = Field(default_factory=dict)
48
+
49
+
50
class TextExtractionResult(BaseModel):
    """Outcome of running a text extractor over one document."""

    # Required: the extracted text and the format the extractor reports.
    text: str
    format: DocumentFormat

    # Statistics about the extracted content; zero when not reported.
    page_count: int = 0
    word_count: int = 0

    # Short label naming how the text was obtained; the bundled extractors
    # use "digital", "empty", "docx_parser" and "plain_text".
    extraction_method: str = ""

    # Extractor-specific metadata; each instance gets its own dict.
    metadata: dict = Field(default_factory=dict)

    # Wall-clock extraction time in whole milliseconds.
    processing_time_ms: int = 0
60
+
61
+
62
class DocumentVersion(BaseModel):
    """One recorded version of a document."""

    # Version number and the document it belongs to.
    version: int
    document_id: str

    # Hash of the document text for this version.
    # NOTE(review): the hash algorithm is not defined in this module.
    text_hash: str

    # Optional human-readable description of what changed.
    changes_summary: str = ""

    # When this version was recorded (required).
    created_at: datetime
@@ -0,0 +1,212 @@
1
+ """Core document processor — format detection, validation, and extraction."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import uuid
6
+ from datetime import UTC, datetime
7
+ from pathlib import Path
8
+ from typing import TYPE_CHECKING
9
+
10
+ from errors import ValidationError
11
+
12
+ from doc_processing.extractors.pdf import PDFExtractor
13
+ from doc_processing.extractors.text import TextExtractor
14
+
15
+ if TYPE_CHECKING:
16
+ from doc_processing.extractors.base import BaseExtractor
17
+ from doc_processing.models import (
18
+ DocumentConfig,
19
+ DocumentFormat,
20
+ DocumentInfo,
21
+ TextExtractionResult,
22
+ )
23
+
24
+ try:
25
+ import structlog
26
+
27
+ logger = structlog.get_logger(__name__)
28
+ except ImportError:
29
+ import logging
30
+
31
+ logger = logging.getLogger(__name__)
32
+
33
# Extension-to-format mapping, written grouped by target format and flattened
# into a {".ext": DocumentFormat} lookup. Keys are lowercase with a leading dot.
_EXTENSION_MAP: dict[str, DocumentFormat] = {
    extension: fmt
    for fmt, extensions in (
        (DocumentFormat.PDF, (".pdf",)),
        (DocumentFormat.DOCX, (".docx",)),
        (DocumentFormat.TXT, (".txt", ".text")),
        (DocumentFormat.MARKDOWN, (".md", ".markdown")),
        (DocumentFormat.HTML, (".html", ".htm")),
        (
            DocumentFormat.IMAGE,
            (".jpg", ".jpeg", ".png", ".tiff", ".tif", ".bmp", ".webp"),
        ),
    )
    for extension in extensions
}

# Magic bytes for format detection: leading file signature -> format.
_MAGIC_BYTES: dict[bytes, DocumentFormat] = {
    b"%PDF": DocumentFormat.PDF,
    # Generic ZIP signature: shared by DOCX, XLSX and other ZIP-based files,
    # all of which are reported as DOCX here.
    b"PK\x03\x04": DocumentFormat.DOCX,
}
57
+
58
+
59
class DocumentProcessor:
    """High-level document processing pipeline.

    Detects format, validates constraints, routes to the appropriate
    extractor, and returns structured results.

    Usage::

        processor = DocumentProcessor()
        result = await processor.process(raw_bytes, "report.pdf")
    """

    def __init__(self, config: DocumentConfig | None = None) -> None:
        """Create a processor.

        Args:
            config: Pipeline settings; a default ``DocumentConfig`` is used
                when omitted.
        """
        self._config = config or DocumentConfig()
        self._extractors: dict[DocumentFormat, BaseExtractor] = {}
        self._init_extractors()

    def _init_extractors(self) -> None:
        """Register available extractors, gracefully skipping optional ones."""
        import importlib.util

        self._extractors[DocumentFormat.PDF] = PDFExtractor()
        self._extractors[DocumentFormat.TXT] = TextExtractor()
        self._extractors[DocumentFormat.MARKDOWN] = TextExtractor()

        # The DOCX extractor module imports python-docx lazily inside
        # extract(), so importing the module succeeds even when python-docx
        # is missing. Probe for the "docx" package explicitly; otherwise DOCX
        # would always be registered and only fail at extraction time.
        try:
            docx_available = importlib.util.find_spec("docx") is not None
        except (ImportError, ValueError):
            docx_available = False

        if docx_available:
            try:
                from doc_processing.extractors.docx import DocxExtractor
            except ImportError:
                docx_available = False
            else:
                self._extractors[DocumentFormat.DOCX] = DocxExtractor()

        if not docx_available:
            logger.info("python-docx not installed, DOCX extraction disabled")

    @property
    def config(self) -> DocumentConfig:
        """Return the current configuration."""
        return self._config

    @property
    def supported_formats(self) -> list[DocumentFormat]:
        """Return list of formats with registered extractors."""
        return list(self._extractors)

    def detect_format(self, filename: str, content: bytes | None = None) -> DocumentFormat:
        """Detect document format from optional magic bytes, then the extension.

        Args:
            filename: Original filename with extension.
            content: Optional raw bytes for magic-byte detection. When given
                and a known signature matches, it wins over the extension.

        Returns:
            Detected document format.

        Raises:
            ValidationError: If the format cannot be determined.
        """
        # Try magic bytes first when content is available
        if content:
            for magic, fmt in _MAGIC_BYTES.items():
                if content.startswith(magic):
                    return fmt

        # Fall back to the (case-insensitive) extension
        suffix = Path(filename).suffix.lower()
        fmt = _EXTENSION_MAP.get(suffix)
        if fmt is not None:
            return fmt

        raise ValidationError(
            f"Unsupported file format: {suffix or 'unknown'}",
            details={"filename": filename, "extension": suffix},
        )

    def _ensure_allowed(self, fmt: DocumentFormat, filename: str) -> None:
        """Raise ValidationError unless ``fmt`` is in the configured allow-list."""
        if fmt not in self._config.allowed_formats:
            raise ValidationError(
                f"Format '{fmt.value}' is not allowed",
                details={
                    "filename": filename,
                    "format": fmt.value,
                    "allowed_formats": [f.value for f in self._config.allowed_formats],
                },
            )

    @staticmethod
    def _log_processing(filename: str, fmt: DocumentFormat) -> None:
        """Log the start of processing with whichever logger the module has."""
        import logging

        if isinstance(logger, logging.Logger):
            # The stdlib fallback logger rejects arbitrary keyword arguments
            # (TypeError), so the fields are folded into the message instead.
            logger.info("processing_document filename=%s format=%s", filename, fmt.value)
        else:
            logger.info("processing_document", filename=filename, format=fmt.value)

    def validate(self, filename: str, size_bytes: int) -> None:
        """Validate file against configuration constraints.

        The format is derived from the filename extension only.

        Args:
            filename: Original filename.
            size_bytes: File size in bytes.

        Raises:
            ValidationError: If the file is larger than the configured
                maximum, its format cannot be determined from the filename,
                or the format is not in the allow-list.
        """
        max_bytes = self._config.max_file_size_mb * 1024 * 1024
        if size_bytes > max_bytes:
            raise ValidationError(
                f"File exceeds maximum size of {self._config.max_file_size_mb} MB",
                details={
                    "filename": filename,
                    "size_bytes": size_bytes,
                    "max_bytes": max_bytes,
                },
            )

        self._ensure_allowed(self.detect_format(filename), filename)

    async def process(self, content: bytes, filename: str) -> TextExtractionResult:
        """Process a document: validate, detect format, and extract text.

        Args:
            content: Raw document bytes.
            filename: Original filename.

        Returns:
            Text extraction result with metadata.

        Raises:
            ValidationError: If validation fails, the detected format is not
                allowed, or no extractor is available.
        """
        self.validate(filename, len(content))

        fmt = self.detect_format(filename, content)
        # validate() judged the filename only. Magic bytes can resolve to a
        # different format (e.g. a PDF uploaded as "notes.txt"), so the
        # allow-list is enforced again on the format actually being routed.
        self._ensure_allowed(fmt, filename)

        extractor = self._extractors.get(fmt)
        if extractor is None:
            raise ValidationError(
                f"No extractor available for format '{fmt.value}'",
                details={"format": fmt.value, "filename": filename},
            )

        self._log_processing(filename, fmt)
        return await extractor.extract(content)

    async def get_info(self, content: bytes, filename: str) -> DocumentInfo:
        """Build a metadata summary for a document.

        This runs the full ``process`` pipeline (validation and text
        extraction) and condenses its result; the extracted text itself is
        not included in the returned record.

        Args:
            content: Raw document bytes.
            filename: Original filename.

        Returns:
            Document metadata with a freshly generated 12-character hex id
            and a UTC ``created_at`` timestamp.

        Raises:
            ValidationError: Under the same conditions as ``process``.
        """
        fmt = self.detect_format(filename, content)
        result = await self.process(content, filename)

        return DocumentInfo(
            id=uuid.uuid4().hex[:12],
            filename=filename,
            format=fmt,
            size_bytes=len(content),
            page_count=result.page_count,
            word_count=result.word_count,
            language="",
            title=result.metadata.get("title", ""),
            author=result.metadata.get("author", ""),
            created_at=datetime.now(UTC),
            metadata=result.metadata,
        )
@@ -0,0 +1,68 @@
1
+ """FastAPI router exposing document processing endpoints.
2
+
3
+ Usage::
4
+
5
+ from doc_processing.router import create_doc_router
6
+
7
+ app = FastAPI()
8
+ app.include_router(create_doc_router())
9
+ """
10
+
11
+ from __future__ import annotations
12
+
13
+ from doc_processing.models import DocumentInfo, TextExtractionResult
14
+ from doc_processing.processor import DocumentProcessor
15
+
16
+
17
def _get_fastapi():
    """Import and return the ``fastapi`` module, or fail with an install hint.

    The import is deferred to call time so the rest of the package can be
    used without FastAPI installed.

    Returns:
        The imported ``fastapi`` module.

    Raises:
        ImportError: If FastAPI is not installed.
    """
    try:
        import fastapi
    except ImportError:
        # None of this package's extras (docx, image, all, dev) pull in
        # FastAPI, so the hint names the packages directly. python-multipart
        # is what FastAPI needs to parse uploaded files.
        raise ImportError(
            "FastAPI is required for the API router. "
            "Install with: pip install fastapi python-multipart"
        ) from None
    return fastapi
27
+
28
+
29
def create_doc_router(
    processor: DocumentProcessor | None = None,
    *,
    prefix: str = "/documents",
    tags: list[str] | None = None,
):
    """Create a FastAPI APIRouter with document processing endpoints.

    Two POST routes are registered under ``prefix``: ``/process`` returns
    the text extraction result and ``/info`` returns document metadata.

    Args:
        processor: An optional DocumentProcessor instance. Creates one
            with default config if not provided.
        prefix: URL prefix for all routes.
        tags: OpenAPI tags for the router; defaults to ``["documents"]``.

    Returns:
        A FastAPI APIRouter ready to be included in an app.

    Raises:
        ImportError: If FastAPI is not installed.
    """
    fastapi = _get_fastapi()
    router = fastapi.APIRouter(prefix=prefix, tags=tags or ["documents"])
    _processor = processor or DocumentProcessor()

    async def process_document(file):
        """Upload a document and extract its text content."""
        content = await file.read()
        filename = file.filename or "unknown"
        return await _processor.process(content, filename)

    async def document_info(file):
        """Upload a document and retrieve its metadata."""
        content = await file.read()
        filename = file.filename or "unknown"
        return await _processor.get_info(content, filename)

    # This module uses ``from __future__ import annotations``, so inline
    # annotations are stored as strings and FastAPI resolves them against the
    # function's module globals. ``fastapi`` is only a local name here, so a
    # string "fastapi.UploadFile" could not be resolved. Attach the real
    # objects before the routes are registered.
    process_document.__annotations__ = {
        "file": fastapi.UploadFile,
        "return": TextExtractionResult,
    }
    document_info.__annotations__ = {
        "file": fastapi.UploadFile,
        "return": DocumentInfo,
    }

    router.post("/process", response_model=TextExtractionResult)(process_document)
    router.post("/info", response_model=DocumentInfo)(document_info)

    return router
File without changes
@@ -0,0 +1,142 @@
1
+ """Tests for document processing domain models."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from datetime import UTC, datetime
6
+
7
+ from doc_processing.models import (
8
+ DocumentConfig,
9
+ DocumentFormat,
10
+ DocumentInfo,
11
+ DocumentVersion,
12
+ TextExtractionResult,
13
+ )
14
+
15
+
16
class TestDocumentFormat:
    """DocumentFormat members and their string values."""

    def test_values(self):
        expected = {
            DocumentFormat.PDF: "pdf",
            DocumentFormat.DOCX: "docx",
            DocumentFormat.TXT: "txt",
            DocumentFormat.IMAGE: "image",
            DocumentFormat.MARKDOWN: "markdown",
            DocumentFormat.HTML: "html",
        }
        for member, raw in expected.items():
            assert member == raw

    def test_string_comparison(self):
        # Members compare equal to plain strings.
        for member, raw in ((DocumentFormat.PDF, "pdf"), (DocumentFormat.DOCX, "docx")):
            assert member == raw

    def test_all_formats_listed(self):
        member_count = len(DocumentFormat)
        assert member_count == 6
31
+
32
+
33
class TestDocumentConfig:
    """DocumentConfig defaults and explicit overrides."""

    def test_defaults(self):
        config = DocumentConfig()
        assert config.max_file_size_mb == 50
        assert config.extract_metadata is True
        assert config.store_versions is True
        assert config.max_versions == 10
        # By default every known format is allowed.
        assert len(config.allowed_formats) == len(DocumentFormat)

    def test_custom_values(self):
        overrides = {
            "max_file_size_mb": 10,
            "allowed_formats": [DocumentFormat.PDF, DocumentFormat.TXT],
            "extract_metadata": False,
            "store_versions": False,
            "max_versions": 5,
        }
        config = DocumentConfig(**overrides)
        assert config.max_file_size_mb == 10
        assert len(config.allowed_formats) == 2
        assert config.extract_metadata is False

    def test_allowed_formats_default_contains_all(self):
        allowed = DocumentConfig().allowed_formats
        for fmt in DocumentFormat:
            assert fmt in allowed
58
+
59
+
60
class TestDocumentInfo:
    """Construction of DocumentInfo and its default field values."""

    def test_create_info(self):
        fields = {
            "id": "abc123",
            "filename": "report.pdf",
            "format": DocumentFormat.PDF,
            "size_bytes": 1024,
            "page_count": 5,
            "word_count": 500,
        }
        info = DocumentInfo(created_at=datetime.now(UTC), **fields)
        assert info.id == "abc123"
        assert info.filename == "report.pdf"
        assert info.format == DocumentFormat.PDF
        assert info.size_bytes == 1024
        assert info.page_count == 5
        assert info.word_count == 500

    def test_defaults(self):
        # Only the required fields are supplied.
        info = DocumentInfo(
            id="x",
            filename="file.txt",
            format=DocumentFormat.TXT,
            size_bytes=100,
            created_at=datetime.now(UTC),
        )
        assert info.page_count == 0
        assert info.word_count == 0
        assert info.language == ""
        assert info.title == ""
        assert info.author == ""
        assert info.metadata == {}
94
+
95
+
96
class TestTextExtractionResult:
    """Construction of TextExtractionResult and its default field values."""

    def test_create_result(self):
        extraction = TextExtractionResult(
            text="Hello world",
            format=DocumentFormat.TXT,
            page_count=1,
            word_count=2,
            extraction_method="plain_text",
            processing_time_ms=10,
        )
        assert extraction.text == "Hello world"
        assert extraction.word_count == 2
        assert extraction.extraction_method == "plain_text"

    def test_defaults(self):
        # Only the two required fields are supplied.
        extraction = TextExtractionResult(text="", format=DocumentFormat.PDF)
        assert extraction.page_count == 0
        assert extraction.word_count == 0
        assert extraction.extraction_method == ""
        assert extraction.metadata == {}
        assert extraction.processing_time_ms == 0
117
+
118
+
119
class TestDocumentVersion:
    """Construction of DocumentVersion and its default field values."""

    def test_create_version(self):
        snapshot = DocumentVersion(
            version=1,
            document_id="doc-123",
            text_hash="abc123def456",
            changes_summary="Initial version",
            created_at=datetime.now(UTC),
        )
        assert snapshot.version == 1
        assert snapshot.document_id == "doc-123"
        assert snapshot.text_hash == "abc123def456"
        assert snapshot.changes_summary == "Initial version"

    def test_defaults(self):
        # changes_summary is optional and falls back to an empty string.
        snapshot = DocumentVersion(
            version=2,
            document_id="doc-456",
            text_hash="xyz789",
            created_at=datetime.now(UTC),
        )
        assert snapshot.changes_summary == ""
@@ -0,0 +1,144 @@
1
+ """Tests for the DocumentProcessor — format detection, validation, and extraction."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import pytest
6
+ from doc_processing.models import DocumentConfig, DocumentFormat
7
+ from doc_processing.processor import DocumentProcessor
8
+ from errors import ValidationError
9
+
10
+
11
class TestDetectFormat:
    """Format detection from filename extensions and magic bytes."""

    def test_pdf_by_extension(self):
        detected = DocumentProcessor().detect_format("report.pdf")
        assert detected == DocumentFormat.PDF

    def test_docx_by_extension(self):
        detected = DocumentProcessor().detect_format("document.docx")
        assert detected == DocumentFormat.DOCX

    def test_txt_by_extension(self):
        detected = DocumentProcessor().detect_format("notes.txt")
        assert detected == DocumentFormat.TXT

    def test_markdown_by_extension(self):
        detected = DocumentProcessor().detect_format("README.md")
        assert detected == DocumentFormat.MARKDOWN

    def test_html_by_extension(self):
        detected = DocumentProcessor().detect_format("page.html")
        assert detected == DocumentFormat.HTML

    def test_image_by_extension(self):
        processor = DocumentProcessor()
        for name in ("photo.jpg", "scan.png", "doc.tiff"):
            assert processor.detect_format(name) == DocumentFormat.IMAGE

    def test_pdf_by_magic_bytes(self):
        # The filename carries no extension; only the signature identifies it.
        pdf_bytes = b"%PDF-1.4 rest of the file..."
        detected = DocumentProcessor().detect_format("unknown", pdf_bytes)
        assert detected == DocumentFormat.PDF

    def test_case_insensitive_extension(self):
        processor = DocumentProcessor()
        cases = (("REPORT.PDF", DocumentFormat.PDF), ("doc.TXT", DocumentFormat.TXT))
        for name, expected in cases:
            assert processor.detect_format(name) == expected

    def test_unknown_format_raises(self):
        processor = DocumentProcessor()
        with pytest.raises(ValidationError, match="Unsupported file format"):
            processor.detect_format("file.xyz")

    def test_no_extension_raises(self):
        processor = DocumentProcessor()
        with pytest.raises(ValidationError, match="Unsupported file format"):
            processor.detect_format("Makefile")
57
+
58
+
59
class TestValidation:
    """Size and allow-list checks performed by DocumentProcessor.validate."""

    def test_valid_file(self):
        # Passing means no exception is raised.
        DocumentProcessor().validate("report.pdf", 1024)

    def test_file_too_large(self):
        one_mb_limit = DocumentProcessor(DocumentConfig(max_file_size_mb=1))
        two_mb = 2 * 1024 * 1024
        with pytest.raises(ValidationError, match="exceeds maximum size"):
            one_mb_limit.validate("report.pdf", two_mb)

    def test_format_not_allowed(self):
        pdf_only = DocumentProcessor(DocumentConfig(allowed_formats=[DocumentFormat.PDF]))
        with pytest.raises(ValidationError, match="not allowed"):
            pdf_only.validate("notes.txt", 100)

    def test_allowed_format_passes(self):
        allowed = [DocumentFormat.PDF, DocumentFormat.TXT]
        processor = DocumentProcessor(DocumentConfig(allowed_formats=allowed))
        # Passing means no exception is raised.
        processor.validate("notes.txt", 100)
80
+
81
+
82
class TestProcess:
    """End-to-end behaviour of DocumentProcessor.process."""

    async def test_process_text_file(self):
        raw = b"Hello world, this is a test document."
        outcome = await DocumentProcessor().process(raw, "test.txt")

        assert outcome.text == "Hello world, this is a test document."
        assert outcome.format == DocumentFormat.TXT
        assert outcome.word_count == 7
        assert outcome.extraction_method == "plain_text"

    async def test_process_markdown_file(self):
        raw = b"# Title\n\nSome paragraph text here."
        outcome = await DocumentProcessor().process(raw, "doc.md")

        assert "# Title" in outcome.text
        assert outcome.word_count > 0

    async def test_process_unknown_format_raises(self):
        processor = DocumentProcessor()
        with pytest.raises(ValidationError):
            await processor.process(b"data", "file.xyz")

    async def test_process_no_extractor_raises(self):
        every_format = DocumentConfig(allowed_formats=list(DocumentFormat))
        processor = DocumentProcessor(every_format)
        # IMAGE format has no registered extractor
        with pytest.raises(ValidationError, match="No extractor available"):
            await processor.process(b"\x89PNG", "photo.png")

    async def test_process_validates_size(self):
        # A zero-megabyte limit rejects any non-empty payload.
        processor = DocumentProcessor(DocumentConfig(max_file_size_mb=0))
        with pytest.raises(ValidationError, match="exceeds maximum size"):
            await processor.process(b"some content", "test.txt")
118
+
119
+
120
class TestGetInfo:
    """Metadata summary produced by DocumentProcessor.get_info."""

    async def test_get_info_text_file(self):
        raw = b"A simple test file with some words."
        summary = await DocumentProcessor().get_info(raw, "test.txt")

        assert summary.filename == "test.txt"
        assert summary.format == DocumentFormat.TXT
        assert summary.size_bytes == len(raw)
        assert summary.word_count == 7
        # An identifier must have been generated.
        assert summary.id
131
+
132
+
133
class TestSupportedFormats:
    """Extractor registration and configuration access on the processor."""

    def test_default_extractors(self):
        registered = DocumentProcessor().supported_formats
        # These extractors are registered unconditionally.
        for fmt in (DocumentFormat.PDF, DocumentFormat.TXT, DocumentFormat.MARKDOWN):
            assert fmt in registered

    def test_config_accessible(self):
        processor = DocumentProcessor(DocumentConfig(max_file_size_mb=25))
        assert processor.config.max_file_size_mb == 25