agent-runtime-core 0.7.1-py3-none-any.whl → 0.9.0-py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
- agent_runtime_core/__init__.py +65 -3
- agent_runtime_core/agentic_loop.py +275 -15
- agent_runtime_core/config.py +4 -0
- agent_runtime_core/contexts.py +72 -4
- agent_runtime_core/files/__init__.py +88 -0
- agent_runtime_core/files/base.py +343 -0
- agent_runtime_core/files/ocr.py +406 -0
- agent_runtime_core/files/processors.py +508 -0
- agent_runtime_core/files/tools.py +317 -0
- agent_runtime_core/files/vision.py +360 -0
- agent_runtime_core/llm/anthropic.py +83 -0
- agent_runtime_core/multi_agent.py +1408 -16
- agent_runtime_core/persistence/__init__.py +8 -0
- agent_runtime_core/persistence/base.py +318 -1
- agent_runtime_core/persistence/file.py +226 -2
- agent_runtime_core/privacy.py +250 -0
- {agent_runtime_core-0.7.1.dist-info → agent_runtime_core-0.9.0.dist-info}/METADATA +36 -1
- {agent_runtime_core-0.7.1.dist-info → agent_runtime_core-0.9.0.dist-info}/RECORD +20 -13
- {agent_runtime_core-0.7.1.dist-info → agent_runtime_core-0.9.0.dist-info}/WHEEL +0 -0
- {agent_runtime_core-0.7.1.dist-info → agent_runtime_core-0.9.0.dist-info}/licenses/LICENSE +0 -0
agent_runtime_core/contexts.py
CHANGED
@@ -24,11 +24,15 @@ import json
 import os
 from datetime import datetime
 from pathlib import Path
-from typing import Any, Callable, Optional
+from typing import Any, Callable, Optional, TYPE_CHECKING
 from uuid import UUID, uuid4
 
 from agent_runtime_core.interfaces import EventType, Message, ToolRegistry
 
+if TYPE_CHECKING:
+    from agent_runtime_core.multi_agent import SystemContext
+    from agent_runtime_core.privacy import PrivacyConfig, UserContext
+
 
 class InMemoryRunContext:
     """
@@ -65,10 +69,13 @@ class InMemoryRunContext:
         metadata: Optional[dict] = None,
         tool_registry: Optional[ToolRegistry] = None,
         on_event: Optional[Callable[[str, dict], None]] = None,
+        system_context: Optional["SystemContext"] = None,
+        user_context: Optional["UserContext"] = None,
+        privacy_config: Optional["PrivacyConfig"] = None,
     ):
         """
         Initialize an in-memory run context.
-
+
         Args:
             run_id: Unique identifier for this run (auto-generated if not provided)
             conversation_id: Associated conversation ID (optional)
@@ -77,6 +84,9 @@
             metadata: Run metadata
             tool_registry: Registry of available tools
             on_event: Optional callback for events (for testing/debugging)
+            system_context: Optional SystemContext for multi-agent systems with shared knowledge
+            user_context: Optional UserContext for user isolation and privacy
+            privacy_config: Optional PrivacyConfig for privacy settings (defaults to max privacy)
         """
         self._run_id = run_id or uuid4()
         self._conversation_id = conversation_id
@@ -88,7 +98,18 @@
         self._state: Optional[dict] = None
         self._events: list[dict] = []
         self._on_event = on_event
-
+        self._system_context = system_context
+
+        # Import here to avoid circular imports
+        from agent_runtime_core.privacy import (
+            DEFAULT_PRIVACY_CONFIG,
+            ANONYMOUS_USER,
+        )
+
+        # Default to secure settings: anonymous user + strict privacy
+        self._user_context = user_context if user_context is not None else ANONYMOUS_USER
+        self._privacy_config = privacy_config if privacy_config is not None else DEFAULT_PRIVACY_CONFIG
+
     @property
     def run_id(self) -> UUID:
         """Unique identifier for this run."""
@@ -118,7 +139,22 @@
     def tool_registry(self) -> ToolRegistry:
         """Registry of available tools for this agent."""
         return self._tool_registry
-
+
+    @property
+    def system_context(self) -> Optional["SystemContext"]:
+        """System context for multi-agent systems with shared knowledge."""
+        return self._system_context
+
+    @property
+    def user_context(self) -> "UserContext":
+        """User context for privacy and data isolation. Defaults to ANONYMOUS_USER."""
+        return self._user_context
+
+    @property
+    def privacy_config(self) -> "PrivacyConfig":
+        """Privacy configuration for this run. Defaults to DEFAULT_PRIVACY_CONFIG (strict)."""
+        return self._privacy_config
+
     async def emit(self, event_type: EventType | str, payload: dict) -> None:
         """Emit an event (stored in memory)."""
         event_type_str = event_type.value if hasattr(event_type, 'value') else str(event_type)
@@ -195,6 +231,9 @@ class FileRunContext:
         metadata: Optional[dict] = None,
         tool_registry: Optional[ToolRegistry] = None,
         on_event: Optional[Callable[[str, dict], None]] = None,
+        system_context: Optional["SystemContext"] = None,
+        user_context: Optional["UserContext"] = None,
+        privacy_config: Optional["PrivacyConfig"] = None,
     ):
         """
         Initialize a file-based run context.
@@ -208,6 +247,9 @@
             metadata: Run metadata
             tool_registry: Registry of available tools
             on_event: Optional callback for events
+            system_context: Optional SystemContext for multi-agent systems with shared knowledge
+            user_context: Optional UserContext for user isolation and privacy
+            privacy_config: Optional PrivacyConfig for privacy settings (defaults to max privacy)
         """
         self._run_id = run_id or uuid4()
         self._checkpoint_dir = Path(checkpoint_dir)
@@ -219,6 +261,17 @@
         self._cancelled = False
         self._on_event = on_event
         self._state_cache: Optional[dict] = None
+        self._system_context = system_context
+
+        # Import here to avoid circular imports
+        from agent_runtime_core.privacy import (
+            DEFAULT_PRIVACY_CONFIG,
+            ANONYMOUS_USER,
+        )
+
+        # Default to secure settings: anonymous user + strict privacy
+        self._user_context = user_context if user_context is not None else ANONYMOUS_USER
+        self._privacy_config = privacy_config if privacy_config is not None else DEFAULT_PRIVACY_CONFIG
 
         # Ensure checkpoint directory exists
         self._checkpoint_dir.mkdir(parents=True, exist_ok=True)
@@ -253,6 +306,21 @@
         """Registry of available tools for this agent."""
         return self._tool_registry
 
+    @property
+    def system_context(self) -> Optional["SystemContext"]:
+        """System context for multi-agent systems with shared knowledge."""
+        return self._system_context
+
+    @property
+    def user_context(self) -> "UserContext":
+        """User context for privacy and data isolation. Defaults to ANONYMOUS_USER."""
+        return self._user_context
+
+    @property
+    def privacy_config(self) -> "PrivacyConfig":
+        """Privacy configuration for this run. Defaults to DEFAULT_PRIVACY_CONFIG (strict)."""
+        return self._privacy_config
+
     def _checkpoint_path(self) -> Path:
         """Get the path to the checkpoint file for this run."""
         return self._checkpoint_dir / f"{self._run_id}.json"
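The new `system_context`, `user_context`, and `privacy_config` parameters are optional on both run contexts and fall back to anonymous-user, strict-privacy defaults. A minimal usage sketch against the `InMemoryRunContext` signature and properties shown in this diff; it assumes the remaining constructor parameters are all optional, as the docstring suggests, and does not construct `UserContext`/`PrivacyConfig` objects since their fields are not part of this diff:

```python
# Sketch only, based on the InMemoryRunContext changes in this release.
import asyncio

from agent_runtime_core.contexts import InMemoryRunContext


async def main() -> None:
    # With no arguments, the context falls back to ANONYMOUS_USER and
    # DEFAULT_PRIVACY_CONFIG (strict), per the defaults added in __init__.
    ctx = InMemoryRunContext()
    print(ctx.run_id)          # auto-generated UUID
    print(ctx.user_context)    # ANONYMOUS_USER by default
    print(ctx.privacy_config)  # DEFAULT_PRIVACY_CONFIG by default
    print(ctx.system_context)  # None unless a multi-agent SystemContext is passed

    # emit() accepts either an EventType or a plain string
    await ctx.emit("run_started", {"source": "example"})


asyncio.run(main())
```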
agent_runtime_core/files/__init__.py
ADDED
@@ -0,0 +1,88 @@
+"""
+File processing module for agent_runtime_core.
+
+Provides pluggable file processors for reading various file types,
+OCR integration, and AI vision capabilities.
+
+Example:
+    from agent_runtime_core.files import FileProcessorRegistry, process_file
+
+    # Register processors
+    registry = FileProcessorRegistry()
+    registry.auto_register()  # Register all available processors
+
+    # Process a file
+    result = await registry.process("document.pdf", file_bytes)
+    print(result.text)  # Extracted text
+    print(result.metadata)  # File metadata
+"""
+
+from .base import (
+    FileProcessor,
+    FileProcessorRegistry,
+    ProcessedFile,
+    FileType,
+    ProcessingOptions,
+)
+from .processors import (
+    TextFileProcessor,
+    PDFProcessor,
+    ImageProcessor,
+    DocxProcessor,
+    XlsxProcessor,
+    CsvProcessor,
+)
+from .ocr import (
+    OCRProvider,
+    TesseractOCR,
+    GoogleVisionOCR,
+    AWSTextractOCR,
+    AzureDocumentOCR,
+)
+from .vision import (
+    VisionProvider,
+    OpenAIVision,
+    AnthropicVision,
+    GeminiVision,
+)
+from .tools import (
+    FileTools,
+    FileToolsConfig,
+    get_file_read_schema,
+    get_file_write_schema,
+    get_file_list_schema,
+)
+
+__all__ = [
+    # Base classes
+    "FileProcessor",
+    "FileProcessorRegistry",
+    "ProcessedFile",
+    "FileType",
+    "ProcessingOptions",
+    # Processors
+    "TextFileProcessor",
+    "PDFProcessor",
+    "ImageProcessor",
+    "DocxProcessor",
+    "XlsxProcessor",
+    "CsvProcessor",
+    # OCR
+    "OCRProvider",
+    "TesseractOCR",
+    "GoogleVisionOCR",
+    "AWSTextractOCR",
+    "AzureDocumentOCR",
+    # Vision
+    "VisionProvider",
+    "OpenAIVision",
+    "AnthropicVision",
+    "GeminiVision",
+    # Tools
+    "FileTools",
+    "FileToolsConfig",
+    "get_file_read_schema",
+    "get_file_write_schema",
+    "get_file_list_schema",
+]
+
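The module docstring's example maps directly onto the registry API defined in `files/base.py` (shown in the next file). A hedged end-to-end sketch, relying only on `FileProcessorRegistry`, `ProcessingOptions`, `auto_register`, and `process` from this diff; the sample CSV path is illustrative:

```python
# Sketch based on the FileProcessorRegistry API added in 0.9.0.
import asyncio
from pathlib import Path

from agent_runtime_core.files import FileProcessorRegistry, ProcessingOptions


async def main() -> None:
    registry = FileProcessorRegistry()
    # Text/CSV processors are always registered; PDF, image, docx, and xlsx
    # processors are added only if their optional dependencies import cleanly.
    registry.auto_register()

    content = Path("report.csv").read_bytes()  # illustrative input file
    options = ProcessingOptions(extract_text=True, extract_metadata=True)

    result = await registry.process("report.csv", content, options=options)
    print(result.processor_used, result.file_type)
    print(result.text[:200])


asyncio.run(main())
```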
agent_runtime_core/files/base.py
ADDED
@@ -0,0 +1,343 @@
+"""
+Base classes for file processing.
+
+Provides the FileProcessor abstract base class and registry pattern
+for pluggable file type handling.
+"""
+
+from abc import ABC, abstractmethod
+from dataclasses import dataclass, field
+from enum import Enum
+from pathlib import Path
+from typing import Any, Optional, Type, Union
+import mimetypes
+
+
+class FileType(str, Enum):
+    """Supported file types."""
+    TEXT = "text"
+    PDF = "pdf"
+    IMAGE = "image"
+    DOCX = "docx"
+    XLSX = "xlsx"
+    CSV = "csv"
+    JSON = "json"
+    MARKDOWN = "markdown"
+    HTML = "html"
+    UNKNOWN = "unknown"
+
+
+@dataclass
+class ProcessingOptions:
+    """Options for file processing."""
+    # General options
+    max_size_bytes: int = 100 * 1024 * 1024  # 100MB default
+    extract_text: bool = True
+    extract_metadata: bool = True
+
+    # OCR options
+    use_ocr: bool = False
+    ocr_provider: Optional[str] = None  # tesseract, google, aws, azure
+    ocr_language: str = "eng"
+
+    # Vision AI options
+    use_vision: bool = False
+    vision_provider: Optional[str] = None  # openai, anthropic, gemini
+    vision_prompt: Optional[str] = None  # Custom prompt for vision analysis
+
+    # Image options
+    generate_thumbnail: bool = True
+    thumbnail_size: tuple[int, int] = (200, 200)
+
+    # PDF options
+    pdf_extract_images: bool = False
+    pdf_page_limit: Optional[int] = None  # Limit pages to process
+
+    # Additional provider-specific options
+    extra: dict = field(default_factory=dict)
+
+
+@dataclass
+class ProcessedFile:
+    """Result of processing a file."""
+    # Core data
+    filename: str
+    file_type: FileType
+    mime_type: str
+    size_bytes: int
+
+    # Extracted content
+    text: str = ""
+    text_chunks: list[str] = field(default_factory=list)  # For chunked processing
+
+    # Metadata
+    metadata: dict = field(default_factory=dict)
+
+    # Visual data
+    thumbnail_base64: Optional[str] = None
+    preview_url: Optional[str] = None
+
+    # OCR/Vision results
+    ocr_text: Optional[str] = None
+    vision_description: Optional[str] = None
+    vision_analysis: Optional[dict] = None
+
+    # Processing info
+    processor_used: str = ""
+    processing_time_ms: float = 0
+    warnings: list[str] = field(default_factory=list)
+
+    # Raw data (optional, for further processing)
+    raw_content: Optional[bytes] = None
+
+
+class FileProcessor(ABC):
+    """
+    Abstract base class for file processors.
+
+    Subclass this to create processors for specific file types.
+    Each processor declares which file types and MIME types it handles.
+    """
+
+    @property
+    @abstractmethod
+    def name(self) -> str:
+        """Unique name for this processor."""
+        ...
+
+    @property
+    @abstractmethod
+    def supported_types(self) -> list[FileType]:
+        """List of FileType enums this processor handles."""
+        ...
+
+    @property
+    @abstractmethod
+    def supported_extensions(self) -> list[str]:
+        """List of file extensions this processor handles (e.g., ['.pdf', '.PDF'])."""
+        ...
+
+    @property
+    def supported_mime_types(self) -> list[str]:
+        """List of MIME types this processor handles. Override if needed."""
+        return []
+
+    @abstractmethod
+    async def process(
+        self,
+        content: bytes,
+        filename: str,
+        options: ProcessingOptions,
+    ) -> ProcessedFile:
+        """
+        Process file content and extract text/metadata.
+
+        Args:
+            content: Raw file bytes
+            filename: Original filename
+            options: Processing options
+
+        Returns:
+            ProcessedFile with extracted content
+        """
+        ...
+
+    def can_process(self, filename: str, mime_type: Optional[str] = None) -> bool:
+        """Check if this processor can handle the given file."""
+        ext = Path(filename).suffix.lower()
+        if ext in [e.lower() for e in self.supported_extensions]:
+            return True
+        if mime_type and mime_type in self.supported_mime_types:
+            return True
+        return False
+
+
+class FileProcessorRegistry:
+    """
+    Registry of file processors.
+
+    Manages processor registration and selection based on file type.
+    """
+
+    def __init__(self):
+        self._processors: dict[str, FileProcessor] = {}
+        self._type_map: dict[FileType, list[str]] = {}
+        self._extension_map: dict[str, str] = {}
+
+    def register(self, processor: FileProcessor) -> None:
+        """Register a file processor."""
+        self._processors[processor.name] = processor
+
+        # Map file types to processor
+        for file_type in processor.supported_types:
+            if file_type not in self._type_map:
+                self._type_map[file_type] = []
+            self._type_map[file_type].append(processor.name)
+
+        # Map extensions to processor
+        for ext in processor.supported_extensions:
+            self._extension_map[ext.lower()] = processor.name
+
+    def get(self, name: str) -> Optional[FileProcessor]:
+        """Get a processor by name."""
+        return self._processors.get(name)
+
+    def get_for_file(
+        self,
+        filename: str,
+        mime_type: Optional[str] = None,
+    ) -> Optional[FileProcessor]:
+        """Get the best processor for a file."""
+        ext = Path(filename).suffix.lower()
+
+        # Try extension first
+        if ext in self._extension_map:
+            return self._processors[self._extension_map[ext]]
+
+        # Try MIME type
+        if mime_type:
+            for processor in self._processors.values():
+                if mime_type in processor.supported_mime_types:
+                    return processor
+
+        # Guess MIME type from filename
+        guessed_mime, _ = mimetypes.guess_type(filename)
+        if guessed_mime:
+            for processor in self._processors.values():
+                if guessed_mime in processor.supported_mime_types:
+                    return processor
+
+        return None
+
+    async def process(
+        self,
+        filename: str,
+        content: bytes,
+        options: Optional[ProcessingOptions] = None,
+        mime_type: Optional[str] = None,
+    ) -> ProcessedFile:
+        """
+        Process a file using the appropriate processor.
+
+        Args:
+            filename: Original filename
+            content: Raw file bytes
+            options: Processing options (uses defaults if not provided)
+            mime_type: Optional MIME type hint
+
+        Returns:
+            ProcessedFile with extracted content
+
+        Raises:
+            ValueError: If no processor found for file type
+            ValueError: If file exceeds size limit
+        """
+        if options is None:
+            options = ProcessingOptions()
+
+        # Check size limit
+        if len(content) > options.max_size_bytes:
+            raise ValueError(
+                f"File size ({len(content)} bytes) exceeds limit "
+                f"({options.max_size_bytes} bytes)"
+            )
+
+        # Find processor
+        processor = self.get_for_file(filename, mime_type)
+        if not processor:
+            raise ValueError(f"No processor found for file: {filename}")
+
+        # Process
+        return await processor.process(content, filename, options)
+
+    def list_processors(self) -> list[FileProcessor]:
+        """List all registered processors."""
+        return list(self._processors.values())
+
+    def supported_extensions(self) -> list[str]:
+        """List all supported file extensions."""
+        return list(self._extension_map.keys())
+
+    def auto_register(self) -> None:
+        """
+        Auto-register all available processors.
+
+        Registers built-in processors and checks for optional dependencies.
+        """
+        from .processors import (
+            TextFileProcessor,
+            PDFProcessor,
+            ImageProcessor,
+            DocxProcessor,
+            XlsxProcessor,
+            CsvProcessor,
+        )
+
+        # Always available
+        self.register(TextFileProcessor())
+        self.register(CsvProcessor())
+
+        # Check for optional dependencies
+        try:
+            import pypdf
+            self.register(PDFProcessor())
+        except ImportError:
+            pass
+
+        try:
+            from PIL import Image
+            self.register(ImageProcessor())
+        except ImportError:
+            pass
+
+        try:
+            import docx
+            self.register(DocxProcessor())
+        except ImportError:
+            pass
+
+        try:
+            import openpyxl
+            self.register(XlsxProcessor())
+        except ImportError:
+            pass
+
+
+def detect_file_type(filename: str, content: Optional[bytes] = None) -> FileType:
+    """
+    Detect file type from filename and optionally content.
+
+    Args:
+        filename: Filename with extension
+        content: Optional file content for magic number detection
+
+    Returns:
+        Detected FileType
+    """
+    ext = Path(filename).suffix.lower()
+
+    extension_map = {
+        ".txt": FileType.TEXT,
+        ".text": FileType.TEXT,
+        ".log": FileType.TEXT,
+        ".pdf": FileType.PDF,
+        ".png": FileType.IMAGE,
+        ".jpg": FileType.IMAGE,
+        ".jpeg": FileType.IMAGE,
+        ".gif": FileType.IMAGE,
+        ".webp": FileType.IMAGE,
+        ".bmp": FileType.IMAGE,
+        ".docx": FileType.DOCX,
+        ".doc": FileType.DOCX,
+        ".xlsx": FileType.XLSX,
+        ".xls": FileType.XLSX,
+        ".csv": FileType.CSV,
+        ".json": FileType.JSON,
+        ".md": FileType.MARKDOWN,
+        ".markdown": FileType.MARKDOWN,
+        ".html": FileType.HTML,
+        ".htm": FileType.HTML,
+    }
+
+    return extension_map.get(ext, FileType.UNKNOWN)
+
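Extending this registry only requires implementing the abstract members of `FileProcessor` above and calling `register()`. A sketch of a custom processor; the `YamlProcessor` name and its treat-YAML-as-text behaviour are hypothetical and not part of the package:

```python
# Hypothetical processor built against the FileProcessor ABC in files/base.py.
from agent_runtime_core.files import (
    FileProcessor,
    FileProcessorRegistry,
    FileType,
    ProcessedFile,
    ProcessingOptions,
)


class YamlProcessor(FileProcessor):
    """Illustrative processor: treats YAML files as plain text."""

    @property
    def name(self) -> str:
        return "yaml"

    @property
    def supported_types(self) -> list[FileType]:
        # FileType has no YAML member, so fall back to TEXT.
        return [FileType.TEXT]

    @property
    def supported_extensions(self) -> list[str]:
        return [".yaml", ".yml"]

    @property
    def supported_mime_types(self) -> list[str]:
        return ["application/yaml", "text/yaml"]

    async def process(
        self,
        content: bytes,
        filename: str,
        options: ProcessingOptions,
    ) -> ProcessedFile:
        text = content.decode("utf-8", errors="replace")
        return ProcessedFile(
            filename=filename,
            file_type=FileType.TEXT,
            mime_type="application/yaml",
            size_bytes=len(content),
            text=text if options.extract_text else "",
            processor_used=self.name,
        )


registry = FileProcessorRegistry()
registry.register(YamlProcessor())
assert registry.get_for_file("config.yaml") is not None  # resolved via the extension map
```

The registry resolves processors by extension first, then by declared MIME type, and finally by a `mimetypes.guess_type()` lookup, so a processor that declares both extensions and MIME types is matched even when callers pass only one of the two hints.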