msaas-doc-processing 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- msaas_doc_processing-0.1.0/.gitignore +23 -0
- msaas_doc_processing-0.1.0/PKG-INFO +22 -0
- msaas_doc_processing-0.1.0/pyproject.toml +38 -0
- msaas_doc_processing-0.1.0/src/doc_processing/__init__.py +21 -0
- msaas_doc_processing-0.1.0/src/doc_processing/extractors/__init__.py +1 -0
- msaas_doc_processing-0.1.0/src/doc_processing/extractors/base.py +39 -0
- msaas_doc_processing-0.1.0/src/doc_processing/extractors/docx.py +88 -0
- msaas_doc_processing-0.1.0/src/doc_processing/extractors/pdf.py +75 -0
- msaas_doc_processing-0.1.0/src/doc_processing/extractors/text.py +50 -0
- msaas_doc_processing-0.1.0/src/doc_processing/models.py +69 -0
- msaas_doc_processing-0.1.0/src/doc_processing/processor.py +212 -0
- msaas_doc_processing-0.1.0/src/doc_processing/router.py +68 -0
- msaas_doc_processing-0.1.0/tests/__init__.py +0 -0
- msaas_doc_processing-0.1.0/tests/test_models.py +142 -0
- msaas_doc_processing-0.1.0/tests/test_processor.py +144 -0
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
node_modules/
|
|
2
|
+
dist/
|
|
3
|
+
.next/
|
|
4
|
+
.turbo/
|
|
5
|
+
*.pyc
|
|
6
|
+
__pycache__/
|
|
7
|
+
.venv/
|
|
8
|
+
*.egg-info/
|
|
9
|
+
.pytest_cache/
|
|
10
|
+
.ruff_cache/
|
|
11
|
+
.env
|
|
12
|
+
.env.*
|
|
13
|
+
!.env.example
|
|
14
|
+
!.env.*.example
|
|
15
|
+
!.env.*.template
|
|
16
|
+
.DS_Store
|
|
17
|
+
coverage/
|
|
18
|
+
|
|
19
|
+
# Runtime artifacts
|
|
20
|
+
logs_llm/
|
|
21
|
+
vectors.db
|
|
22
|
+
vectors.db-shm
|
|
23
|
+
vectors.db-wal
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: msaas-doc-processing
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Document processing pipeline — upload, extract, version, convert
|
|
5
|
+
License: MIT
|
|
6
|
+
Requires-Python: >=3.12
|
|
7
|
+
Requires-Dist: msaas-api-core
|
|
8
|
+
Requires-Dist: msaas-errors
|
|
9
|
+
Requires-Dist: pydantic>=2.0
|
|
10
|
+
Requires-Dist: pypdf>=4.0
|
|
11
|
+
Provides-Extra: all
|
|
12
|
+
Requires-Dist: pillow>=10.0; extra == 'all'
|
|
13
|
+
Requires-Dist: python-docx>=1.0; extra == 'all'
|
|
14
|
+
Provides-Extra: dev
|
|
15
|
+
Requires-Dist: pillow>=10.0; extra == 'dev'
|
|
16
|
+
Requires-Dist: pytest-asyncio>=0.24.0; extra == 'dev'
|
|
17
|
+
Requires-Dist: pytest>=8.0; extra == 'dev'
|
|
18
|
+
Requires-Dist: python-docx>=1.0; extra == 'dev'
|
|
19
|
+
Provides-Extra: docx
|
|
20
|
+
Requires-Dist: python-docx>=1.0; extra == 'docx'
|
|
21
|
+
Provides-Extra: image
|
|
22
|
+
Requires-Dist: pillow>=10.0; extra == 'image'
|
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
[project]
|
|
2
|
+
name = "msaas-doc-processing"
|
|
3
|
+
version = "0.1.0"
|
|
4
|
+
description = "Document processing pipeline — upload, extract, version, convert"
|
|
5
|
+
requires-python = ">=3.12"
|
|
6
|
+
license = { text = "MIT" }
|
|
7
|
+
dependencies = [
|
|
8
|
+
"msaas-errors",
|
|
9
|
+
"msaas-api-core",
|
|
10
|
+
"pydantic>=2.0",
|
|
11
|
+
"pypdf>=4.0",
|
|
12
|
+
]
|
|
13
|
+
|
|
14
|
+
[project.optional-dependencies]
|
|
15
|
+
docx = ["python-docx>=1.0"]
|
|
16
|
+
image = ["Pillow>=10.0"]
|
|
17
|
+
all = ["python-docx>=1.0", "Pillow>=10.0"]
|
|
18
|
+
dev = [
|
|
19
|
+
"pytest>=8.0",
|
|
20
|
+
"pytest-asyncio>=0.24.0",
|
|
21
|
+
"python-docx>=1.0",
|
|
22
|
+
"Pillow>=10.0",
|
|
23
|
+
]
|
|
24
|
+
|
|
25
|
+
[build-system]
|
|
26
|
+
requires = ["hatchling"]
|
|
27
|
+
build-backend = "hatchling.build"
|
|
28
|
+
|
|
29
|
+
[tool.hatch.build.targets.wheel]
|
|
30
|
+
packages = ["src/doc_processing"]
|
|
31
|
+
|
|
32
|
+
[tool.pytest.ini_options]
|
|
33
|
+
testpaths = ["tests"]
|
|
34
|
+
asyncio_mode = "auto"
|
|
35
|
+
|
|
36
|
+
[tool.uv.sources]
|
|
37
|
+
msaas-errors = { workspace = true }
|
|
38
|
+
msaas-api-core = { workspace = true }
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
"""Willian Doc Processing — document processing pipeline library."""
|
|
2
|
+
|
|
3
|
+
from doc_processing.models import (
|
|
4
|
+
DocumentConfig,
|
|
5
|
+
DocumentFormat,
|
|
6
|
+
DocumentInfo,
|
|
7
|
+
DocumentVersion,
|
|
8
|
+
TextExtractionResult,
|
|
9
|
+
)
|
|
10
|
+
from doc_processing.processor import DocumentProcessor
|
|
11
|
+
|
|
12
|
+
__all__ = [
|
|
13
|
+
# Processor
|
|
14
|
+
"DocumentProcessor",
|
|
15
|
+
# Models
|
|
16
|
+
"DocumentConfig",
|
|
17
|
+
"DocumentFormat",
|
|
18
|
+
"DocumentInfo",
|
|
19
|
+
"DocumentVersion",
|
|
20
|
+
"TextExtractionResult",
|
|
21
|
+
]
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
"""Document text extractors — pluggable backends for each format."""
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
"""Abstract base extractor that all format-specific extractors must implement."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from abc import ABC, abstractmethod
|
|
6
|
+
from typing import TYPE_CHECKING
|
|
7
|
+
|
|
8
|
+
if TYPE_CHECKING:
|
|
9
|
+
from doc_processing.models import DocumentFormat, TextExtractionResult
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class BaseExtractor(ABC):
    """Base class for document text extraction implementations.

    Each extractor handles one or more document formats and normalizes
    the extraction output into a common TextExtractionResult model.
    Extraction is declared ``async`` so implementations can be awaited
    directly from async web handlers.
    """

    @abstractmethod
    async def extract(self, content: bytes) -> TextExtractionResult:
        """Extract text from raw document bytes.

        Args:
            content: Raw bytes of the document file.

        Returns:
            Extraction result with text, metadata, and timing information.
        """

    @abstractmethod
    def supports(self, fmt: DocumentFormat) -> bool:
        """Check whether this extractor supports the given format.

        Args:
            fmt: The document format to check.

        Returns:
            True if this extractor can handle the format.
        """
|
|
@@ -0,0 +1,88 @@
|
|
|
1
|
+
"""DOCX text extractor using python-docx.
|
|
2
|
+
|
|
3
|
+
This module is optional — it requires the ``python-docx`` package.
|
|
4
|
+
Install with: ``pip install msaas-doc-processing[docx]``
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
import io
|
|
10
|
+
import time
|
|
11
|
+
|
|
12
|
+
from doc_processing.extractors.base import BaseExtractor
|
|
13
|
+
from doc_processing.models import DocumentFormat, TextExtractionResult
|
|
14
|
+
|
|
15
|
+
try:
|
|
16
|
+
import structlog
|
|
17
|
+
|
|
18
|
+
logger = structlog.get_logger(__name__)
|
|
19
|
+
except ImportError:
|
|
20
|
+
import logging
|
|
21
|
+
|
|
22
|
+
logger = logging.getLogger(__name__)
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
class DocxExtractor(BaseExtractor):
    """Extract text from DOCX documents using python-docx.

    Pulls text out of body paragraphs and tables, and records the
    document's core properties (title, author, dates, subject) as
    metadata when they are populated.
    """

    async def extract(self, content: bytes) -> TextExtractionResult:
        """Extract text from a DOCX file.

        Args:
            content: Raw DOCX bytes.

        Returns:
            Extraction result with paragraphs and table text combined.
        """
        # Deferred import: python-docx is an optional dependency.
        from docx import Document

        started_at = time.monotonic()
        document = Document(io.BytesIO(content))

        # Body paragraphs first, skipping whitespace-only ones.
        parts = [para.text for para in document.paragraphs if para.text.strip()]

        # Then table rows, rendered one row per entry with " | " between cells.
        for table in document.tables:
            for row in table.rows:
                row_cells = [c.text.strip() for c in row.cells if c.text.strip()]
                if row_cells:
                    parts.append(" | ".join(row_cells))

        combined = "\n\n".join(parts).strip()

        # Core document properties; only populated values are recorded.
        props = document.core_properties
        meta: dict = {}
        if props.title:
            meta["title"] = props.title
        if props.author:
            meta["author"] = props.author
        if props.created:
            meta["created"] = props.created.isoformat()
        if props.modified:
            meta["modified"] = props.modified.isoformat()
        if props.subject:
            meta["subject"] = props.subject

        return TextExtractionResult(
            text=combined,
            format=DocumentFormat.DOCX,
            page_count=0,  # DOCX has no fixed pagination before rendering
            word_count=len(combined.split()) if combined else 0,
            extraction_method="docx_parser",
            metadata=meta,
            processing_time_ms=int((time.monotonic() - started_at) * 1000),
        )

    def supports(self, fmt: DocumentFormat) -> bool:
        """Return True only for the DOCX format."""
        return fmt == DocumentFormat.DOCX
|
|
@@ -0,0 +1,75 @@
|
|
|
1
|
+
"""PDF text extractor using pypdf for digital text extraction."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import io
|
|
6
|
+
import time
|
|
7
|
+
|
|
8
|
+
from doc_processing.extractors.base import BaseExtractor
|
|
9
|
+
from doc_processing.models import DocumentFormat, TextExtractionResult
|
|
10
|
+
|
|
11
|
+
try:
|
|
12
|
+
import structlog
|
|
13
|
+
|
|
14
|
+
logger = structlog.get_logger(__name__)
|
|
15
|
+
except ImportError:
|
|
16
|
+
import logging
|
|
17
|
+
|
|
18
|
+
logger = logging.getLogger(__name__)
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
class PDFExtractor(BaseExtractor):
    """Extract text from PDF documents using pypdf.

    Handles digitally-created PDFs with embedded text. For scanned PDFs
    (where extracted text is empty), the caller should fall back to an
    OCR pipeline; ``extraction_method`` is reported as "empty" then.
    """

    async def extract(self, content: bytes) -> TextExtractionResult:
        """Extract text from a PDF file.

        Args:
            content: Raw PDF bytes.

        Returns:
            Extraction result. ``text`` will be empty for scanned PDFs.
        """
        from pypdf import PdfReader

        started_at = time.monotonic()
        reader = PdfReader(io.BytesIO(content))

        # Join per-page text with blank lines; pages with no extractable
        # text contribute empty strings.
        combined = "\n\n".join(
            page.extract_text() or "" for page in reader.pages
        ).strip()

        # Document info dictionary, when present; only populated keys kept.
        meta: dict = {}
        info = reader.metadata
        if info:
            for key in ("title", "author", "subject", "creator"):
                value = getattr(info, key)
                if value:
                    meta[key] = value

        return TextExtractionResult(
            text=combined,
            format=DocumentFormat.PDF,
            page_count=len(reader.pages),
            word_count=len(combined.split()) if combined else 0,
            extraction_method="digital" if combined else "empty",
            metadata=meta,
            processing_time_ms=int((time.monotonic() - started_at) * 1000),
        )

    def supports(self, fmt: DocumentFormat) -> bool:
        """Return True only for the PDF format."""
        return fmt == DocumentFormat.PDF
|
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
"""Plain text and markdown extractor."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import time
|
|
6
|
+
|
|
7
|
+
from doc_processing.extractors.base import BaseExtractor
|
|
8
|
+
from doc_processing.models import DocumentFormat, TextExtractionResult
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class TextExtractor(BaseExtractor):
    """Extract text from plain text and markdown documents.

    Decodes UTF-8 (replacing invalid bytes), trims surrounding
    whitespace, and records line/char counts as metadata.
    """

    async def extract(self, content: bytes) -> TextExtractionResult:
        """Extract text from plain text or markdown bytes.

        Args:
            content: Raw bytes (assumed UTF-8).

        Returns:
            Extraction result with text and word count.
        """
        started_at = time.monotonic()

        decoded = content.decode("utf-8", errors="replace").strip()
        words = len(decoded.split()) if decoded else 0
        lines = decoded.count("\n") + 1 if decoded else 0

        return TextExtractionResult(
            text=decoded,
            # NOTE(review): the result always reports TXT, even when this
            # extractor was selected for MARKDOWN input — confirm intended.
            format=DocumentFormat.TXT,
            page_count=0,  # no page concept for plain text
            word_count=words,
            extraction_method="plain_text",
            metadata={"line_count": lines, "char_count": len(decoded)},
            processing_time_ms=int((time.monotonic() - started_at) * 1000),
        )

    def supports(self, fmt: DocumentFormat) -> bool:
        """Handle both plain text and markdown."""
        return fmt in {DocumentFormat.TXT, DocumentFormat.MARKDOWN}
|
|
@@ -0,0 +1,69 @@
|
|
|
1
|
+
"""Domain models for document processing, metadata, and versioning."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from datetime import datetime # noqa: TC003
|
|
6
|
+
from enum import StrEnum
|
|
7
|
+
|
|
8
|
+
from pydantic import BaseModel, Field
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class DocumentFormat(StrEnum):
|
|
12
|
+
"""Supported document formats."""
|
|
13
|
+
|
|
14
|
+
PDF = "pdf"
|
|
15
|
+
DOCX = "docx"
|
|
16
|
+
TXT = "txt"
|
|
17
|
+
IMAGE = "image"
|
|
18
|
+
MARKDOWN = "markdown"
|
|
19
|
+
HTML = "html"
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
class DocumentConfig(BaseModel):
    """Configuration for the document processing pipeline."""

    # Hard upper bound on upload size; DocumentProcessor.validate()
    # rejects larger files.
    max_file_size_mb: int = 50
    # Formats accepted by validation; defaults to every DocumentFormat.
    allowed_formats: list[DocumentFormat] = Field(
        default_factory=lambda: list(DocumentFormat),
    )
    # NOTE(review): the three flags below are not read by DocumentProcessor
    # in this package — presumably consumed by a storage/versioning layer;
    # TODO confirm.
    extract_metadata: bool = True
    store_versions: bool = True
    max_versions: int = 10
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
class DocumentInfo(BaseModel):
    """Metadata about a processed document."""

    # Short random identifier (12 hex chars, assigned by the processor).
    id: str
    # Original filename as uploaded.
    filename: str
    # Detected document format.
    format: DocumentFormat
    # Size of the raw file in bytes.
    size_bytes: int
    # Number of pages (0 when the format has no page concept).
    page_count: int = 0
    # Whitespace-separated word count of the extracted text.
    word_count: int = 0
    # Detected language; currently always empty (detection not implemented).
    language: str = ""
    # Title from document metadata, when available.
    title: str = ""
    # Author from document metadata, when available.
    author: str = ""
    # Timestamp when this info record was created (timezone-aware).
    created_at: datetime
    # Extractor-specific metadata passthrough.
    metadata: dict = Field(default_factory=dict)
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
class TextExtractionResult(BaseModel):
    """Result of text extraction from a document."""

    # Full extracted text (empty for e.g. scanned PDFs).
    text: str
    # Format the extractor reported for this result.
    format: DocumentFormat
    # Page count (0 for formats without pagination, e.g. TXT/DOCX).
    page_count: int = 0
    # Whitespace-separated word count of ``text``.
    word_count: int = 0
    # Extractor label, e.g. "digital", "plain_text", "docx_parser".
    extraction_method: str = ""
    # Format-specific metadata (title, author, counts, ...).
    metadata: dict = Field(default_factory=dict)
    # Wall-clock extraction time in milliseconds.
    processing_time_ms: int = 0
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
class DocumentVersion(BaseModel):
    """A single version snapshot of a document."""

    # Version number — presumably monotonically increasing per document;
    # TODO confirm against the versioning layer.
    version: int
    # Identifier of the document this snapshot belongs to.
    document_id: str
    # Hash of the document text (algorithm not specified in this module).
    text_hash: str
    # Optional human-readable summary of what changed in this version.
    changes_summary: str = ""
    # Timestamp when the snapshot was created.
    created_at: datetime
|
|
@@ -0,0 +1,212 @@
|
|
|
1
|
+
"""Core document processor — format detection, validation, and extraction."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import uuid
|
|
6
|
+
from datetime import UTC, datetime
|
|
7
|
+
from pathlib import Path
|
|
8
|
+
from typing import TYPE_CHECKING
|
|
9
|
+
|
|
10
|
+
from errors import ValidationError
|
|
11
|
+
|
|
12
|
+
from doc_processing.extractors.pdf import PDFExtractor
|
|
13
|
+
from doc_processing.extractors.text import TextExtractor
|
|
14
|
+
|
|
15
|
+
if TYPE_CHECKING:
|
|
16
|
+
from doc_processing.extractors.base import BaseExtractor
|
|
17
|
+
from doc_processing.models import (
|
|
18
|
+
DocumentConfig,
|
|
19
|
+
DocumentFormat,
|
|
20
|
+
DocumentInfo,
|
|
21
|
+
TextExtractionResult,
|
|
22
|
+
)
|
|
23
|
+
|
|
24
|
+
try:
|
|
25
|
+
import structlog
|
|
26
|
+
|
|
27
|
+
logger = structlog.get_logger(__name__)
|
|
28
|
+
except ImportError:
|
|
29
|
+
import logging
|
|
30
|
+
|
|
31
|
+
logger = logging.getLogger(__name__)
|
|
32
|
+
|
|
33
|
+
# Extension-to-format mapping (lowercased suffixes, including the dot).
_EXTENSION_MAP: dict[str, DocumentFormat] = {
    ".pdf": DocumentFormat.PDF,
    ".docx": DocumentFormat.DOCX,
    ".txt": DocumentFormat.TXT,
    ".text": DocumentFormat.TXT,
    ".md": DocumentFormat.MARKDOWN,
    ".markdown": DocumentFormat.MARKDOWN,
    ".html": DocumentFormat.HTML,
    ".htm": DocumentFormat.HTML,
    ".jpg": DocumentFormat.IMAGE,
    ".jpeg": DocumentFormat.IMAGE,
    ".png": DocumentFormat.IMAGE,
    ".tiff": DocumentFormat.IMAGE,
    ".tif": DocumentFormat.IMAGE,
    ".bmp": DocumentFormat.IMAGE,
    ".webp": DocumentFormat.IMAGE,
}

# Magic bytes for format detection, matched against the file's prefix.
# NOTE(review): any ZIP container (XLSX, PPTX, plain .zip) also starts
# with PK\x03\x04 and would be classified as DOCX — confirm acceptable.
_MAGIC_BYTES: dict[bytes, DocumentFormat] = {
    b"%PDF": DocumentFormat.PDF,
    b"PK\x03\x04": DocumentFormat.DOCX,  # ZIP-based (DOCX, XLSX, etc.)
}
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
class DocumentProcessor:
    """High-level document processing pipeline.

    Detects format, validates constraints, routes to the appropriate
    extractor, and returns structured results.

    Usage::

        processor = DocumentProcessor()
        result = await processor.process(raw_bytes, "report.pdf")
    """

    def __init__(self, config: DocumentConfig | None = None) -> None:
        """Initialize the processor.

        Args:
            config: Pipeline configuration; a default DocumentConfig is
                used when omitted.
        """
        self._config = config or DocumentConfig()
        self._extractors: dict[DocumentFormat, BaseExtractor] = {}
        self._init_extractors()

    def _init_extractors(self) -> None:
        """Register available extractors, gracefully skipping optional ones."""
        self._extractors[DocumentFormat.PDF] = PDFExtractor()
        self._extractors[DocumentFormat.TXT] = TextExtractor()
        self._extractors[DocumentFormat.MARKDOWN] = TextExtractor()

        # DOCX support is optional (requires python-docx).
        try:
            from doc_processing.extractors.docx import DocxExtractor

            self._extractors[DocumentFormat.DOCX] = DocxExtractor()
        except ImportError:
            logger.info("python-docx not installed, DOCX extraction disabled")

    @property
    def config(self) -> DocumentConfig:
        """Return the current configuration."""
        return self._config

    @property
    def supported_formats(self) -> list[DocumentFormat]:
        """Return list of formats with registered extractors."""
        return list(self._extractors.keys())

    def detect_format(self, filename: str, content: bytes | None = None) -> DocumentFormat:
        """Detect document format from filename extension and optional magic bytes.

        Magic bytes take precedence over the extension when content is
        provided, so mislabelled files are still classified correctly.

        Args:
            filename: Original filename with extension.
            content: Optional raw bytes for magic-byte detection.

        Returns:
            Detected document format.

        Raises:
            ValidationError: If the format cannot be determined.
        """
        # Try magic bytes first when content is available.
        if content:
            for magic, fmt in _MAGIC_BYTES.items():
                if content[: len(magic)] == magic:
                    return fmt

        # Fall back to the (case-insensitive) extension.
        suffix = Path(filename).suffix.lower()
        fmt = _EXTENSION_MAP.get(suffix)
        if fmt is not None:
            return fmt

        raise ValidationError(
            f"Unsupported file format: {suffix or 'unknown'}",
            details={"filename": filename, "extension": suffix},
        )

    def validate(self, filename: str, size_bytes: int, content: bytes | None = None) -> None:
        """Validate file against configuration constraints.

        Args:
            filename: Original filename.
            size_bytes: File size in bytes.
            content: Optional raw bytes. When provided, magic-byte
                detection is used, so extensionless files that are
                identifiable by content still validate. (New optional
                parameter; existing two-argument callers are unaffected.)

        Raises:
            ValidationError: If the file violates any constraint.
        """
        max_bytes = self._config.max_file_size_mb * 1024 * 1024
        if size_bytes > max_bytes:
            raise ValidationError(
                f"File exceeds maximum size of {self._config.max_file_size_mb} MB",
                details={
                    "filename": filename,
                    "size_bytes": size_bytes,
                    "max_bytes": max_bytes,
                },
            )

        # Fix: validation previously ignored content, so an extensionless
        # file that process() could identify via magic bytes was rejected
        # here before extraction could even be attempted.
        fmt = self.detect_format(filename, content)
        if fmt not in self._config.allowed_formats:
            raise ValidationError(
                f"Format '{fmt.value}' is not allowed",
                details={
                    "filename": filename,
                    "format": fmt.value,
                    "allowed_formats": [f.value for f in self._config.allowed_formats],
                },
            )

    async def process(self, content: bytes, filename: str) -> TextExtractionResult:
        """Process a document: detect format, validate, and extract text.

        Args:
            content: Raw document bytes.
            filename: Original filename.

        Returns:
            Text extraction result with metadata.

        Raises:
            ValidationError: If validation fails or no extractor is available.
        """
        self.validate(filename, len(content), content)

        fmt = self.detect_format(filename, content)
        extractor = self._extractors.get(fmt)
        if extractor is None:
            raise ValidationError(
                f"No extractor available for format '{fmt.value}'",
                details={"format": fmt.value, "filename": filename},
            )

        # Fix: the previous call passed structlog-style keyword arguments
        # (filename=..., format=...), which raises TypeError when the
        # structlog import above fell back to a stdlib logging.Logger.
        # A plain message works with both backends.
        logger.info(f"processing_document filename={filename} format={fmt.value}")
        return await extractor.extract(content)

    async def get_info(self, content: bytes, filename: str) -> DocumentInfo:
        """Get document metadata without exposing the full extracted text.

        Note: this runs the full extraction pipeline internally to obtain
        counts and metadata.

        Args:
            content: Raw document bytes.
            filename: Original filename.

        Returns:
            Document metadata.

        Raises:
            ValidationError: Propagated from validation/extraction.
        """
        fmt = self.detect_format(filename, content)
        result = await self.process(content, filename)

        return DocumentInfo(
            id=uuid.uuid4().hex[:12],
            filename=filename,
            format=fmt,
            size_bytes=len(content),
            page_count=result.page_count,
            word_count=result.word_count,
            language="",  # language detection not implemented
            title=result.metadata.get("title", ""),
            author=result.metadata.get("author", ""),
            created_at=datetime.now(UTC),
            metadata=result.metadata,
        )
|
|
@@ -0,0 +1,68 @@
|
|
|
1
|
+
"""FastAPI router exposing document processing endpoints.
|
|
2
|
+
|
|
3
|
+
Usage::
|
|
4
|
+
|
|
5
|
+
from doc_processing.router import create_doc_router
|
|
6
|
+
|
|
7
|
+
app = FastAPI()
|
|
8
|
+
app.include_router(create_doc_router())
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
from __future__ import annotations
|
|
12
|
+
|
|
13
|
+
from doc_processing.models import DocumentInfo, TextExtractionResult
|
|
14
|
+
from doc_processing.processor import DocumentProcessor
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def _get_fastapi():
|
|
18
|
+
try:
|
|
19
|
+
import fastapi
|
|
20
|
+
|
|
21
|
+
return fastapi
|
|
22
|
+
except ImportError:
|
|
23
|
+
raise ImportError(
|
|
24
|
+
"FastAPI is required for the API router. "
|
|
25
|
+
"Install with: pip install msaas-doc-processing[all]"
|
|
26
|
+
) from None
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def create_doc_router(
    processor: DocumentProcessor | None = None,
    *,
    prefix: str = "/documents",
    tags: list[str] | None = None,
):
    """Create a FastAPI APIRouter with document processing endpoints.

    Args:
        processor: An optional DocumentProcessor instance. Creates one
            with default config if not provided.
        prefix: URL prefix for all routes.
        tags: OpenAPI tags for the router.

    Returns:
        A FastAPI APIRouter ready to be included in an app.

    Raises:
        ImportError: If FastAPI is not installed.
    """
    fastapi = _get_fastapi()
    router = fastapi.APIRouter(prefix=prefix, tags=tags or ["documents"])
    # One shared processor instance, captured by both endpoint closures.
    _processor = processor or DocumentProcessor()

    @router.post("/process", response_model=TextExtractionResult)
    async def process_document(
        file: fastapi.UploadFile,
    ) -> TextExtractionResult:
        """Upload a document and extract its text content."""
        content = await file.read()
        # UploadFile.filename may be None; fall back to a placeholder.
        filename = file.filename or "unknown"
        return await _processor.process(content, filename)

    @router.post("/info", response_model=DocumentInfo)
    async def document_info(
        file: fastapi.UploadFile,
    ) -> DocumentInfo:
        """Upload a document and retrieve its metadata."""
        content = await file.read()
        filename = file.filename or "unknown"
        return await _processor.get_info(content, filename)

    return router
|
|
File without changes
|
|
@@ -0,0 +1,142 @@
|
|
|
1
|
+
"""Tests for document processing domain models."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from datetime import UTC, datetime
|
|
6
|
+
|
|
7
|
+
from doc_processing.models import (
|
|
8
|
+
DocumentConfig,
|
|
9
|
+
DocumentFormat,
|
|
10
|
+
DocumentInfo,
|
|
11
|
+
DocumentVersion,
|
|
12
|
+
TextExtractionResult,
|
|
13
|
+
)
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class TestDocumentFormat:
    """DocumentFormat behaves as a string-valued enum."""

    def test_values(self):
        assert DocumentFormat.PDF == "pdf"
        assert DocumentFormat.DOCX == "docx"
        assert DocumentFormat.TXT == "txt"
        assert DocumentFormat.IMAGE == "image"
        assert DocumentFormat.MARKDOWN == "markdown"
        assert DocumentFormat.HTML == "html"

    def test_string_comparison(self):
        # StrEnum members compare equal to plain strings.
        assert DocumentFormat.PDF == "pdf"
        assert DocumentFormat.DOCX == "docx"

    def test_all_formats_listed(self):
        # Guard: adding/removing a format must update this count.
        assert len(DocumentFormat) == 6
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
class TestDocumentConfig:
    """Defaults and overrides of DocumentConfig."""

    def test_defaults(self):
        cfg = DocumentConfig()
        assert cfg.max_file_size_mb == 50
        assert cfg.extract_metadata is True
        assert cfg.store_versions is True
        assert cfg.max_versions == 10
        assert len(cfg.allowed_formats) == len(DocumentFormat)

    def test_custom_values(self):
        cfg = DocumentConfig(
            max_file_size_mb=10,
            allowed_formats=[DocumentFormat.PDF, DocumentFormat.TXT],
            extract_metadata=False,
            store_versions=False,
            max_versions=5,
        )
        assert cfg.max_file_size_mb == 10
        assert len(cfg.allowed_formats) == 2
        assert cfg.extract_metadata is False

    def test_allowed_formats_default_contains_all(self):
        # The default factory must enumerate every DocumentFormat member.
        cfg = DocumentConfig()
        for fmt in DocumentFormat:
            assert fmt in cfg.allowed_formats
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
class TestDocumentInfo:
    """Construction and field defaults of DocumentInfo."""

    def test_create_info(self):
        now = datetime.now(UTC)
        info = DocumentInfo(
            id="abc123",
            filename="report.pdf",
            format=DocumentFormat.PDF,
            size_bytes=1024,
            page_count=5,
            word_count=500,
            created_at=now,
        )
        assert info.id == "abc123"
        assert info.filename == "report.pdf"
        assert info.format == DocumentFormat.PDF
        assert info.size_bytes == 1024
        assert info.page_count == 5
        assert info.word_count == 500

    def test_defaults(self):
        # Optional fields fall back to zero/empty values.
        now = datetime.now(UTC)
        info = DocumentInfo(
            id="x",
            filename="file.txt",
            format=DocumentFormat.TXT,
            size_bytes=100,
            created_at=now,
        )
        assert info.page_count == 0
        assert info.word_count == 0
        assert info.language == ""
        assert info.title == ""
        assert info.author == ""
        assert info.metadata == {}
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
class TestTextExtractionResult:
    """Construction and field defaults of TextExtractionResult."""

    def test_create_result(self):
        result = TextExtractionResult(
            text="Hello world",
            format=DocumentFormat.TXT,
            page_count=1,
            word_count=2,
            extraction_method="plain_text",
            processing_time_ms=10,
        )
        assert result.text == "Hello world"
        assert result.word_count == 2
        assert result.extraction_method == "plain_text"

    def test_defaults(self):
        # Only text and format are required.
        result = TextExtractionResult(text="", format=DocumentFormat.PDF)
        assert result.page_count == 0
        assert result.word_count == 0
        assert result.extraction_method == ""
        assert result.metadata == {}
        assert result.processing_time_ms == 0
|
|
117
|
+
|
|
118
|
+
|
|
119
|
+
class TestDocumentVersion:
    """Construction and field defaults of DocumentVersion."""

    def test_create_version(self):
        now = datetime.now(UTC)
        version = DocumentVersion(
            version=1,
            document_id="doc-123",
            text_hash="abc123def456",
            changes_summary="Initial version",
            created_at=now,
        )
        assert version.version == 1
        assert version.document_id == "doc-123"
        assert version.text_hash == "abc123def456"
        assert version.changes_summary == "Initial version"

    def test_defaults(self):
        # changes_summary is optional and defaults to an empty string.
        now = datetime.now(UTC)
        version = DocumentVersion(
            version=2,
            document_id="doc-456",
            text_hash="xyz789",
            created_at=now,
        )
        assert version.changes_summary == ""
|
|
@@ -0,0 +1,144 @@
|
|
|
1
|
+
"""Tests for the DocumentProcessor — format detection, validation, and extraction."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import pytest
|
|
6
|
+
from doc_processing.models import DocumentConfig, DocumentFormat
|
|
7
|
+
from doc_processing.processor import DocumentProcessor
|
|
8
|
+
from errors import ValidationError
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class TestDetectFormat:
    """Extension- and magic-byte-based format detection."""

    def test_pdf_by_extension(self):
        processor = DocumentProcessor()
        assert processor.detect_format("report.pdf") == DocumentFormat.PDF

    def test_docx_by_extension(self):
        processor = DocumentProcessor()
        assert processor.detect_format("document.docx") == DocumentFormat.DOCX

    def test_txt_by_extension(self):
        processor = DocumentProcessor()
        assert processor.detect_format("notes.txt") == DocumentFormat.TXT

    def test_markdown_by_extension(self):
        processor = DocumentProcessor()
        assert processor.detect_format("README.md") == DocumentFormat.MARKDOWN

    def test_html_by_extension(self):
        processor = DocumentProcessor()
        assert processor.detect_format("page.html") == DocumentFormat.HTML

    def test_image_by_extension(self):
        processor = DocumentProcessor()
        assert processor.detect_format("photo.jpg") == DocumentFormat.IMAGE
        assert processor.detect_format("scan.png") == DocumentFormat.IMAGE
        assert processor.detect_format("doc.tiff") == DocumentFormat.IMAGE

    def test_pdf_by_magic_bytes(self):
        # Content sniffing succeeds even with no usable extension.
        processor = DocumentProcessor()
        content = b"%PDF-1.4 rest of the file..."
        assert processor.detect_format("unknown", content) == DocumentFormat.PDF

    def test_case_insensitive_extension(self):
        processor = DocumentProcessor()
        assert processor.detect_format("REPORT.PDF") == DocumentFormat.PDF
        assert processor.detect_format("doc.TXT") == DocumentFormat.TXT

    def test_unknown_format_raises(self):
        processor = DocumentProcessor()
        with pytest.raises(ValidationError, match="Unsupported file format"):
            processor.detect_format("file.xyz")

    def test_no_extension_raises(self):
        processor = DocumentProcessor()
        with pytest.raises(ValidationError, match="Unsupported file format"):
            processor.detect_format("Makefile")
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
class TestValidation:
    """Size-limit and allowed-format checks in DocumentProcessor.validate."""

    def test_valid_file(self):
        # Default configuration accepts a small PDF without raising.
        DocumentProcessor().validate("report.pdf", 1024)

    def test_file_too_large(self):
        small_limit = DocumentProcessor(DocumentConfig(max_file_size_mb=1))
        two_mb = 2 * 1024 * 1024
        with pytest.raises(ValidationError, match="exceeds maximum size"):
            small_limit.validate("report.pdf", two_mb)

    def test_format_not_allowed(self):
        pdf_only = DocumentProcessor(DocumentConfig(allowed_formats=[DocumentFormat.PDF]))
        with pytest.raises(ValidationError, match="not allowed"):
            pdf_only.validate("notes.txt", 100)

    def test_allowed_format_passes(self):
        cfg = DocumentConfig(allowed_formats=[DocumentFormat.PDF, DocumentFormat.TXT])
        DocumentProcessor(cfg).validate("notes.txt", 100)  # Should not raise
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
class TestProcess:
    """End-to-end extraction through DocumentProcessor.process."""

    async def test_process_text_file(self):
        payload = b"Hello world, this is a test document."
        outcome = await DocumentProcessor().process(payload, "test.txt")

        assert outcome.text == "Hello world, this is a test document."
        assert outcome.format == DocumentFormat.TXT
        assert outcome.word_count == 7
        assert outcome.extraction_method == "plain_text"

    async def test_process_markdown_file(self):
        outcome = await DocumentProcessor().process(
            b"# Title\n\nSome paragraph text here.", "doc.md"
        )
        assert "# Title" in outcome.text
        assert outcome.word_count > 0

    async def test_process_unknown_format_raises(self):
        with pytest.raises(ValidationError):
            await DocumentProcessor().process(b"data", "file.xyz")

    async def test_process_no_extractor_raises(self):
        # IMAGE format has no registered extractor
        everything_allowed = DocumentConfig(allowed_formats=list(DocumentFormat))
        worker = DocumentProcessor(everything_allowed)
        with pytest.raises(ValidationError, match="No extractor available"):
            await worker.process(b"\x89PNG", "photo.png")

    async def test_process_validates_size(self):
        # A zero-MB cap rejects any non-empty payload before extraction runs.
        zero_limit = DocumentProcessor(DocumentConfig(max_file_size_mb=0))
        with pytest.raises(ValidationError, match="exceeds maximum size"):
            await zero_limit.process(b"some content", "test.txt")
|
|
118
|
+
|
|
119
|
+
|
|
120
|
+
class TestGetInfo:
    """Metadata summaries produced by DocumentProcessor.get_info."""

    async def test_get_info_text_file(self):
        payload = b"A simple test file with some words."
        summary = await DocumentProcessor().get_info(payload, "test.txt")

        assert summary.filename == "test.txt"
        assert summary.format == DocumentFormat.TXT
        assert summary.size_bytes == len(payload)
        assert summary.word_count == 7
        assert summary.id  # Should have a generated ID
|
|
132
|
+
|
|
133
|
+
class TestSupportedFormats:
    """Default extractor registry and configuration exposure."""

    def test_default_extractors(self):
        registered = DocumentProcessor().supported_formats
        for fmt in (DocumentFormat.PDF, DocumentFormat.TXT, DocumentFormat.MARKDOWN):
            assert fmt in registered

    def test_config_accessible(self):
        custom = DocumentConfig(max_file_size_mb=25)
        assert DocumentProcessor(custom).config.max_file_size_mb == 25
|