msaas-doc-processing 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- doc_processing/__init__.py +21 -0
- doc_processing/extractors/__init__.py +1 -0
- doc_processing/extractors/base.py +39 -0
- doc_processing/extractors/docx.py +88 -0
- doc_processing/extractors/pdf.py +75 -0
- doc_processing/extractors/text.py +50 -0
- doc_processing/models.py +69 -0
- doc_processing/processor.py +212 -0
- doc_processing/router.py +68 -0
- msaas_doc_processing-0.1.0.dist-info/METADATA +22 -0
- msaas_doc_processing-0.1.0.dist-info/RECORD +12 -0
- msaas_doc_processing-0.1.0.dist-info/WHEEL +4 -0
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
"""Willian Doc Processing — document processing pipeline library."""
|
|
2
|
+
|
|
3
|
+
from doc_processing.models import (
|
|
4
|
+
DocumentConfig,
|
|
5
|
+
DocumentFormat,
|
|
6
|
+
DocumentInfo,
|
|
7
|
+
DocumentVersion,
|
|
8
|
+
TextExtractionResult,
|
|
9
|
+
)
|
|
10
|
+
from doc_processing.processor import DocumentProcessor
|
|
11
|
+
|
|
12
|
+
# Explicit public API: the processor entry point plus the domain models
# re-exported from doc_processing.models.
__all__ = [
    "DocumentProcessor",  # processor
    "DocumentConfig",  # models
    "DocumentFormat",
    "DocumentInfo",
    "DocumentVersion",
    "TextExtractionResult",
]
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
"""Document text extractors — pluggable backends for each format."""
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
"""Abstract base extractor that all format-specific extractors must implement."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from abc import ABC, abstractmethod
|
|
6
|
+
from typing import TYPE_CHECKING
|
|
7
|
+
|
|
8
|
+
if TYPE_CHECKING:
|
|
9
|
+
from doc_processing.models import DocumentFormat, TextExtractionResult
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class BaseExtractor(ABC):
    """Abstract interface for format-specific text extractors.

    Concrete subclasses handle one or more document formats and normalize
    their output into the shared ``TextExtractionResult`` model.
    """

    @abstractmethod
    async def extract(self, content: bytes) -> TextExtractionResult:
        """Extract text from raw document bytes.

        Args:
            content: Raw bytes of the document file.

        Returns:
            Extraction result with text, metadata, and timing information.
        """

    @abstractmethod
    def supports(self, fmt: DocumentFormat) -> bool:
        """Report whether this extractor can handle ``fmt``.

        Args:
            fmt: The document format to check.

        Returns:
            True if this extractor can handle the format.
        """
|
|
@@ -0,0 +1,88 @@
|
|
|
1
|
+
"""DOCX text extractor using python-docx.
|
|
2
|
+
|
|
3
|
+
This module is optional — it requires the ``python-docx`` package.
|
|
4
|
+
Install with: ``pip install msaas-doc-processing[docx]``
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
import io
|
|
10
|
+
import time
|
|
11
|
+
|
|
12
|
+
from doc_processing.extractors.base import BaseExtractor
|
|
13
|
+
from doc_processing.models import DocumentFormat, TextExtractionResult
|
|
14
|
+
|
|
15
|
+
# Prefer structlog when it is installed; otherwise fall back to the
# standard-library logging module so the package has no hard dependency.
try:
    import structlog
except ImportError:
    import logging

    logger = logging.getLogger(__name__)
else:
    logger = structlog.get_logger(__name__)
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
class DocxExtractor(BaseExtractor):
    """Extract text from DOCX documents using python-docx.

    Pulls text from paragraphs and table cells and collects whichever core
    properties are set (title, author, created/modified dates, subject).
    """

    async def extract(self, content: bytes) -> TextExtractionResult:
        """Extract text from a DOCX file.

        Args:
            content: Raw DOCX bytes.

        Returns:
            Extraction result with paragraphs and table text combined.
        """
        # Imported lazily so the package imports cleanly without python-docx.
        from docx import Document

        started_at = time.monotonic()
        document = Document(io.BytesIO(content))

        # Non-empty paragraphs first, then one " | "-joined line per
        # non-empty table row — same ordering as the document body.
        parts = [para.text for para in document.paragraphs if para.text.strip()]
        for table in document.tables:
            for row in table.rows:
                row_cells = [c.text.strip() for c in row.cells if c.text.strip()]
                if row_cells:
                    parts.append(" | ".join(row_cells))

        full_text = "\n\n".join(parts).strip()
        word_count = len(full_text.split()) if full_text else 0

        # Only record core properties that are actually set.
        props = document.core_properties
        metadata: dict = {}
        if props.title:
            metadata["title"] = props.title
        if props.author:
            metadata["author"] = props.author
        if props.created:
            metadata["created"] = props.created.isoformat()
        if props.modified:
            metadata["modified"] = props.modified.isoformat()
        if props.subject:
            metadata["subject"] = props.subject

        return TextExtractionResult(
            text=full_text,
            format=DocumentFormat.DOCX,
            page_count=0,  # python-docx exposes no page layout information
            word_count=word_count,
            extraction_method="docx_parser",
            metadata=metadata,
            processing_time_ms=int((time.monotonic() - started_at) * 1000),
        )

    def supports(self, fmt: DocumentFormat) -> bool:
        """Return True only for the DOCX format."""
        return fmt == DocumentFormat.DOCX
|
|
@@ -0,0 +1,75 @@
|
|
|
1
|
+
"""PDF text extractor using pypdf for digital text extraction."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import io
|
|
6
|
+
import time
|
|
7
|
+
|
|
8
|
+
from doc_processing.extractors.base import BaseExtractor
|
|
9
|
+
from doc_processing.models import DocumentFormat, TextExtractionResult
|
|
10
|
+
|
|
11
|
+
# Prefer structlog when it is installed; otherwise fall back to the
# standard-library logging module so the package has no hard dependency.
try:
    import structlog
except ImportError:
    import logging

    logger = logging.getLogger(__name__)
else:
    logger = structlog.get_logger(__name__)
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
class PDFExtractor(BaseExtractor):
    """Extract text from PDF documents using pypdf.

    Works on digitally-created PDFs that carry an embedded text layer. When
    extraction yields no text (typical for scanned PDFs) the caller is
    expected to fall back to an OCR pipeline.
    """

    async def extract(self, content: bytes) -> TextExtractionResult:
        """Extract text from a PDF file.

        Args:
            content: Raw PDF bytes.

        Returns:
            Extraction result. ``text`` will be empty for scanned PDFs.
        """
        # Imported here to keep module import cheap.
        from pypdf import PdfReader

        started_at = time.monotonic()
        reader = PdfReader(io.BytesIO(content))

        # One entry per page; extract_text() can return None for blank pages.
        page_texts = [page.extract_text() or "" for page in reader.pages]
        full_text = "\n\n".join(page_texts).strip()
        word_count = len(full_text.split()) if full_text else 0

        # Copy over whichever document-info fields are present.
        metadata: dict = {}
        doc_info = reader.metadata
        if doc_info:
            for key in ("title", "author", "subject", "creator"):
                value = getattr(doc_info, key)
                if value:
                    metadata[key] = value

        return TextExtractionResult(
            text=full_text,
            format=DocumentFormat.PDF,
            page_count=len(reader.pages),
            word_count=word_count,
            # "empty" signals the caller that an OCR fallback may be needed.
            extraction_method="digital" if full_text else "empty",
            metadata=metadata,
            processing_time_ms=int((time.monotonic() - started_at) * 1000),
        )

    def supports(self, fmt: DocumentFormat) -> bool:
        """Return True only for the PDF format."""
        return fmt == DocumentFormat.PDF
|
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
"""Plain text and markdown extractor."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import time
|
|
6
|
+
|
|
7
|
+
from doc_processing.extractors.base import BaseExtractor
|
|
8
|
+
from doc_processing.models import DocumentFormat, TextExtractionResult
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class TextExtractor(BaseExtractor):
    """Extract text from plain text and markdown documents.

    Handles UTF-8 decoding, word counting, and basic metadata extraction.
    """

    async def extract(self, content: bytes) -> TextExtractionResult:
        """Extract text from plain text or markdown bytes.

        Args:
            content: Raw bytes (assumed UTF-8).

        Returns:
            Extraction result with text and word count.
        """
        started_at = time.monotonic()

        # Undecodable bytes become U+FFFD instead of raising.
        decoded = content.decode("utf-8", errors="replace").strip()

        if decoded:
            words = len(decoded.split())
            lines = decoded.count("\n") + 1
        else:
            words = 0
            lines = 0

        return TextExtractionResult(
            text=decoded,
            # NOTE: result format is always TXT, even for markdown input.
            format=DocumentFormat.TXT,
            page_count=0,  # plain text has no page structure
            word_count=words,
            extraction_method="plain_text",
            metadata={"line_count": lines, "char_count": len(decoded)},
            processing_time_ms=int((time.monotonic() - started_at) * 1000),
        )

    def supports(self, fmt: DocumentFormat) -> bool:
        """Handle both plain text and markdown."""
        return fmt in {DocumentFormat.TXT, DocumentFormat.MARKDOWN}
|
doc_processing/models.py
ADDED
|
@@ -0,0 +1,69 @@
|
|
|
1
|
+
"""Domain models for document processing, metadata, and versioning."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from datetime import datetime # noqa: TC003
|
|
6
|
+
from enum import StrEnum
|
|
7
|
+
|
|
8
|
+
from pydantic import BaseModel, Field
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class DocumentFormat(StrEnum):
    """Supported document formats.

    As a StrEnum, members compare equal to their string values
    (e.g. ``DocumentFormat.PDF == "pdf"``).
    """

    PDF = "pdf"
    DOCX = "docx"
    TXT = "txt"
    # Raster images; the processor maps .jpg/.png/.tiff/... extensions here,
    # but no extractor is registered for IMAGE in this package.
    IMAGE = "image"
    MARKDOWN = "markdown"
    # No extractor is registered for HTML either; accepted by detection only.
    HTML = "html"
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
class DocumentConfig(BaseModel):
    """Configuration for the document processing pipeline."""

    # Upload size ceiling in megabytes; enforced by DocumentProcessor.validate().
    max_file_size_mb: int = 50
    # Formats accepted by validate(); defaults to every DocumentFormat member.
    allowed_formats: list[DocumentFormat] = Field(
        default_factory=lambda: list(DocumentFormat),
    )
    # NOTE(review): the three flags below are not read by DocumentProcessor in
    # this package — presumably consumed by a higher-level service; confirm.
    extract_metadata: bool = True
    store_versions: bool = True
    max_versions: int = 10
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
class DocumentInfo(BaseModel):
    """Metadata about a processed document."""

    # Short random identifier (12 hex chars, assigned by DocumentProcessor.get_info).
    id: str
    # Original filename as supplied by the caller.
    filename: str
    format: DocumentFormat
    size_bytes: int
    page_count: int = 0
    word_count: int = 0
    # Left empty by DocumentProcessor — no language detection in this package.
    language: str = ""
    # Populated from extractor metadata when available.
    title: str = ""
    author: str = ""
    # When this info record was created (UTC in get_info), not the document's
    # own creation date.
    created_at: datetime
    # Raw extractor metadata (same dict as TextExtractionResult.metadata).
    metadata: dict = Field(default_factory=dict)
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
class TextExtractionResult(BaseModel):
    """Result of text extraction from a document."""

    # Full extracted text; empty e.g. for scanned PDFs with no text layer.
    text: str
    format: DocumentFormat
    # 0 when the source format has no page concept (txt, docx).
    page_count: int = 0
    word_count: int = 0
    # Strategy identifier, e.g. "digital", "empty", "plain_text", "docx_parser".
    extraction_method: str = ""
    # Format-specific extras (title/author/line_count/...).
    metadata: dict = Field(default_factory=dict)
    # Wall-clock duration of the extraction step.
    processing_time_ms: int = 0
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
class DocumentVersion(BaseModel):
    """A single version snapshot of a document.

    NOTE(review): nothing in this package creates DocumentVersion instances;
    presumably a versioning service elsewhere does — confirm against callers.
    """

    # Version number of this snapshot.
    version: int
    # Identifier of the document this snapshot belongs to (DocumentInfo.id).
    document_id: str
    # Hash of the extracted text — presumably used to detect content changes.
    text_hash: str
    changes_summary: str = ""
    created_at: datetime
|
|
@@ -0,0 +1,212 @@
|
|
|
1
|
+
"""Core document processor — format detection, validation, and extraction."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations

import uuid
from datetime import UTC, datetime
from pathlib import Path
from typing import TYPE_CHECKING

from errors import ValidationError

from doc_processing.extractors.pdf import PDFExtractor
from doc_processing.extractors.text import TextExtractor

# These names are used at runtime (_EXTENSION_MAP values, DocumentConfig()
# in DocumentProcessor.__init__), so they must NOT live under TYPE_CHECKING —
# keeping them there raises NameError at import time.
from doc_processing.models import (
    DocumentConfig,
    DocumentFormat,
    DocumentInfo,
    TextExtractionResult,
)

if TYPE_CHECKING:
    from doc_processing.extractors.base import BaseExtractor
|
|
23
|
+
|
|
24
|
+
# Prefer structlog when it is installed; otherwise fall back to the
# standard-library logging module so the package has no hard dependency.
try:
    import structlog
except ImportError:
    import logging

    logger = logging.getLogger(__name__)
else:
    logger = structlog.get_logger(__name__)
|
|
32
|
+
|
|
33
|
+
# Extension-to-format mapping used as the fallback in
# DocumentProcessor.detect_format() when magic-byte detection does not match.
_EXTENSION_MAP: dict[str, DocumentFormat] = {
    ".pdf": DocumentFormat.PDF,
    ".docx": DocumentFormat.DOCX,
    ".txt": DocumentFormat.TXT,
    ".text": DocumentFormat.TXT,
    ".md": DocumentFormat.MARKDOWN,
    ".markdown": DocumentFormat.MARKDOWN,
    ".html": DocumentFormat.HTML,
    ".htm": DocumentFormat.HTML,
    ".jpg": DocumentFormat.IMAGE,
    ".jpeg": DocumentFormat.IMAGE,
    ".png": DocumentFormat.IMAGE,
    ".tiff": DocumentFormat.IMAGE,
    ".tif": DocumentFormat.IMAGE,
    ".bmp": DocumentFormat.IMAGE,
    ".webp": DocumentFormat.IMAGE,
}

# Magic bytes for format detection; checked against the file's first bytes
# before the extension fallback. Note the ZIP signature matches ANY ZIP
# container (DOCX, XLSX, plain .zip, ...), so it over-claims DOCX.
_MAGIC_BYTES: dict[bytes, DocumentFormat] = {
    b"%PDF": DocumentFormat.PDF,
    b"PK\x03\x04": DocumentFormat.DOCX,  # ZIP-based (DOCX, XLSX, etc.)
}
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
class DocumentProcessor:
    """High-level document processing pipeline.

    Detects format, validates constraints, routes to the appropriate
    extractor, and returns structured results.

    Usage::

        processor = DocumentProcessor()
        result = await processor.process(raw_bytes, "report.pdf")
    """

    def __init__(self, config: DocumentConfig | None = None) -> None:
        """Initialize the processor.

        Args:
            config: Pipeline configuration; defaults to ``DocumentConfig()``.
        """
        self._config = config or DocumentConfig()
        self._extractors: dict[DocumentFormat, BaseExtractor] = {}
        self._init_extractors()

    def _init_extractors(self) -> None:
        """Register available extractors, gracefully skipping optional ones."""
        self._extractors[DocumentFormat.PDF] = PDFExtractor()
        self._extractors[DocumentFormat.TXT] = TextExtractor()
        self._extractors[DocumentFormat.MARKDOWN] = TextExtractor()

        # DOCX support needs the optional python-docx dependency.
        try:
            from doc_processing.extractors.docx import DocxExtractor

            self._extractors[DocumentFormat.DOCX] = DocxExtractor()
        except ImportError:
            logger.info("python-docx not installed, DOCX extraction disabled")

    @property
    def config(self) -> DocumentConfig:
        """Return the current configuration."""
        return self._config

    @property
    def supported_formats(self) -> list[DocumentFormat]:
        """Return list of formats with registered extractors."""
        return list(self._extractors.keys())

    def detect_format(self, filename: str, content: bytes | None = None) -> DocumentFormat:
        """Detect document format from filename extension and optional magic bytes.

        Magic bytes win over the extension when content is provided. Note
        that the ZIP signature classifies any ZIP container as DOCX.

        Args:
            filename: Original filename with extension.
            content: Optional raw bytes for magic-byte detection.

        Returns:
            Detected document format.

        Raises:
            ValidationError: If the format cannot be determined.
        """
        # Try magic bytes first when content is available
        if content:
            for magic, fmt in _MAGIC_BYTES.items():
                if content[: len(magic)] == magic:
                    return fmt

        # Fall back to extension
        suffix = Path(filename).suffix.lower()
        fmt = _EXTENSION_MAP.get(suffix)
        if fmt is not None:
            return fmt

        raise ValidationError(
            f"Unsupported file format: {suffix or 'unknown'}",
            details={"filename": filename, "extension": suffix},
        )

    def validate(self, filename: str, size_bytes: int) -> None:
        """Validate file against configuration constraints.

        Args:
            filename: Original filename.
            size_bytes: File size in bytes.

        Raises:
            ValidationError: If the file violates any constraint.
        """
        max_bytes = self._config.max_file_size_mb * 1024 * 1024
        if size_bytes > max_bytes:
            raise ValidationError(
                f"File exceeds maximum size of {self._config.max_file_size_mb} MB",
                details={
                    "filename": filename,
                    "size_bytes": size_bytes,
                    "max_bytes": max_bytes,
                },
            )

        fmt = self.detect_format(filename)
        if fmt not in self._config.allowed_formats:
            raise ValidationError(
                f"Format '{fmt.value}' is not allowed",
                details={
                    "filename": filename,
                    "format": fmt.value,
                    "allowed_formats": [f.value for f in self._config.allowed_formats],
                },
            )

    async def process(self, content: bytes, filename: str) -> TextExtractionResult:
        """Process a document: detect format, validate, and extract text.

        Args:
            content: Raw document bytes.
            filename: Original filename.

        Returns:
            Text extraction result with metadata.

        Raises:
            ValidationError: If validation fails or no extractor is available.
        """
        self.validate(filename, len(content))

        fmt = self.detect_format(filename, content)
        extractor = self._extractors.get(fmt)
        if extractor is None:
            raise ValidationError(
                f"No extractor available for format '{fmt.value}'",
                details={"format": fmt.value, "filename": filename},
            )

        # BUG FIX: the previous structlog-style call
        # ``logger.info("processing_document", filename=..., format=...)``
        # raises TypeError on the stdlib logging fallback, which rejects
        # arbitrary keyword arguments. Lazy %-formatting works with both.
        logger.info("processing_document filename=%s format=%s", filename, fmt.value)
        return await extractor.extract(content)

    async def get_info(self, content: bytes, filename: str) -> DocumentInfo:
        """Build a metadata record for a document.

        Note: this runs the full ``process()`` pipeline (including text
        extraction) to obtain counts and metadata — it is not cheaper than
        :meth:`process`. (The previous docstring claimed "without full text
        extraction", which was incorrect.)

        Args:
            content: Raw document bytes.
            filename: Original filename.

        Returns:
            Document metadata.

        Raises:
            ValidationError: If validation fails or no extractor is available.
        """
        # Detect the format separately rather than reusing result.format:
        # extractors may normalize it (e.g. TextExtractor reports markdown
        # input as TXT) and we want the detected format here.
        fmt = self.detect_format(filename, content)
        result = await self.process(content, filename)

        return DocumentInfo(
            id=uuid.uuid4().hex[:12],  # short random identifier
            filename=filename,
            format=fmt,
            size_bytes=len(content),
            page_count=result.page_count,
            word_count=result.word_count,
            language="",  # language detection not implemented
            title=result.metadata.get("title", ""),
            author=result.metadata.get("author", ""),
            created_at=datetime.now(UTC),
            metadata=result.metadata,
        )
|
doc_processing/router.py
ADDED
|
@@ -0,0 +1,68 @@
|
|
|
1
|
+
"""FastAPI router exposing document processing endpoints.
|
|
2
|
+
|
|
3
|
+
Usage::
|
|
4
|
+
|
|
5
|
+
from doc_processing.router import create_doc_router
|
|
6
|
+
|
|
7
|
+
app = FastAPI()
|
|
8
|
+
app.include_router(create_doc_router())
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
from __future__ import annotations
|
|
12
|
+
|
|
13
|
+
from doc_processing.models import DocumentInfo, TextExtractionResult
|
|
14
|
+
from doc_processing.processor import DocumentProcessor
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def _get_fastapi():
|
|
18
|
+
try:
|
|
19
|
+
import fastapi
|
|
20
|
+
|
|
21
|
+
return fastapi
|
|
22
|
+
except ImportError:
|
|
23
|
+
raise ImportError(
|
|
24
|
+
"FastAPI is required for the API router. "
|
|
25
|
+
"Install with: pip install msaas-doc-processing[all]"
|
|
26
|
+
) from None
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def create_doc_router(
    processor: DocumentProcessor | None = None,
    *,
    prefix: str = "/documents",
    tags: list[str] | None = None,
):
    """Create a FastAPI APIRouter with document processing endpoints.

    Args:
        processor: An optional DocumentProcessor instance. Creates one
            with default config if not provided.
        prefix: URL prefix for all routes.
        tags: OpenAPI tags for the router.

    Returns:
        A FastAPI APIRouter ready to be included in an app.
    """
    fastapi = _get_fastapi()
    # The endpoints close over a single shared processor instance.
    doc_processor = processor if processor is not None else DocumentProcessor()
    router = fastapi.APIRouter(prefix=prefix, tags=tags or ["documents"])

    @router.post("/process", response_model=TextExtractionResult)
    async def process_document(file: fastapi.UploadFile) -> TextExtractionResult:
        """Upload a document and extract its text content."""
        raw = await file.read()
        return await doc_processor.process(raw, file.filename or "unknown")

    @router.post("/info", response_model=DocumentInfo)
    async def document_info(file: fastapi.UploadFile) -> DocumentInfo:
        """Upload a document and retrieve its metadata."""
        raw = await file.read()
        return await doc_processor.get_info(raw, file.filename or "unknown")

    return router
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: msaas-doc-processing
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Document processing pipeline — upload, extract, version, convert
|
|
5
|
+
License: MIT
|
|
6
|
+
Requires-Python: >=3.12
|
|
7
|
+
Requires-Dist: msaas-api-core
|
|
8
|
+
Requires-Dist: msaas-errors
|
|
9
|
+
Requires-Dist: pydantic>=2.0
|
|
10
|
+
Requires-Dist: pypdf>=4.0
|
|
11
|
+
Provides-Extra: all
|
|
12
|
+
Requires-Dist: pillow>=10.0; extra == 'all'
|
|
13
|
+
Requires-Dist: python-docx>=1.0; extra == 'all'
|
|
14
|
+
Provides-Extra: dev
|
|
15
|
+
Requires-Dist: pillow>=10.0; extra == 'dev'
|
|
16
|
+
Requires-Dist: pytest-asyncio>=0.24.0; extra == 'dev'
|
|
17
|
+
Requires-Dist: pytest>=8.0; extra == 'dev'
|
|
18
|
+
Requires-Dist: python-docx>=1.0; extra == 'dev'
|
|
19
|
+
Provides-Extra: docx
|
|
20
|
+
Requires-Dist: python-docx>=1.0; extra == 'docx'
|
|
21
|
+
Provides-Extra: image
|
|
22
|
+
Requires-Dist: pillow>=10.0; extra == 'image'
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
doc_processing/__init__.py,sha256=-3d2jTyxfFvg-s64MKaEuOr_DO6IjflefSaGxcfMiWQ,454
|
|
2
|
+
doc_processing/models.py,sha256=E2XwU2jMlNOCcYxa4TMSSSaeQz5XsUeK3_V1SLOVctM,1577
|
|
3
|
+
doc_processing/processor.py,sha256=tvj-vD2iWjpxOuhnlRlYT1H0z7LT8pXi6WqzEblRJb0,6895
|
|
4
|
+
doc_processing/router.py,sha256=C5j27jGfykB1RRUjkudnwrOO9YgzER_TZ2e7F02K794,2026
|
|
5
|
+
doc_processing/extractors/__init__.py,sha256=nVhpeUCcpwHijbtOYaoqugdGxn6CHr9Hbodwn2Pk0cA,71
|
|
6
|
+
doc_processing/extractors/base.py,sha256=lIESQwF-UUEY50E6zdl5R86diZqOaYYAQwLKbdtg-98,1125
|
|
7
|
+
doc_processing/extractors/docx.py,sha256=TDRRylQ0S7HChV7TqKDc16pV2p4XgZ64vXat_J34xsI,2576
|
|
8
|
+
doc_processing/extractors/pdf.py,sha256=rXP25nUrCcnzMWFcGQWMzDCBRJ-rnitDxfW0hyr3YWk,2248
|
|
9
|
+
doc_processing/extractors/text.py,sha256=1TkxNp6LZu1ga0epXActGT7UAMQvNqpCPYfnBw_kohA,1473
|
|
10
|
+
msaas_doc_processing-0.1.0.dist-info/METADATA,sha256=dYSFMsDbyxcKXG4ZQxtQR0dXh_TO4cycxpfWiBYn35g,749
|
|
11
|
+
msaas_doc_processing-0.1.0.dist-info/WHEEL,sha256=QccIxa26bgl1E6uMy58deGWi-0aeIkkangHcxk2kWfw,87
|
|
12
|
+
msaas_doc_processing-0.1.0.dist-info/RECORD,,
|