slide-narrator 0.2.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of slide-narrator might be problematic.
- narrator/__init__.py +18 -0
- narrator/database/__init__.py +8 -0
- narrator/database/cli.py +66 -0
- narrator/database/migrations/__init__.py +6 -0
- narrator/database/models.py +69 -0
- narrator/database/storage_backend.py +580 -0
- narrator/database/thread_store.py +280 -0
- narrator/models/__init__.py +9 -0
- narrator/models/attachment.py +363 -0
- narrator/models/message.py +507 -0
- narrator/models/thread.py +469 -0
- narrator/storage/__init__.py +7 -0
- narrator/storage/file_store.py +535 -0
- narrator/utils/__init__.py +9 -0
- narrator/utils/logging.py +58 -0
- slide_narrator-0.2.1.dist-info/METADATA +531 -0
- slide_narrator-0.2.1.dist-info/RECORD +20 -0
- slide_narrator-0.2.1.dist-info/WHEEL +4 -0
- slide_narrator-0.2.1.dist-info/entry_points.txt +2 -0
- slide_narrator-0.2.1.dist-info/licenses/LICENSE +21 -0
narrator/database/thread_store.py
@@ -0,0 +1,280 @@
"""Thread storage implementation."""
from typing import Optional, Dict, Any, List
from ..models.thread import Thread
from ..models.message import Message
from ..utils.logging import get_logger
from .storage_backend import MemoryBackend, SQLBackend

logger = get_logger(__name__)

class ThreadStore:
    """
    Thread storage implementation with pluggable backends.
    Supports both in-memory and SQL (SQLite/PostgreSQL) storage.

    Key characteristics:
    - Unified interface for all storage types
    - Memory backend for development/testing (default)
    - SQLite for local persistence
    - PostgreSQL for production
    - Built-in connection pooling for SQLBackend

    Usage:
        # RECOMMENDED: Factory pattern for immediate connection validation
        from narrator import ThreadStore
        store = await ThreadStore.create("postgresql+asyncpg://user:pass@localhost/dbname")

        # Or for in-memory storage:
        store = await ThreadStore.create()  # Uses memory backend

        # Direct constructor (connects on first operation):
        store = ThreadStore("postgresql+asyncpg://user:pass@localhost/dbname")

    Connection pooling settings can be configured via environment variables:
    - NARRATOR_DB_POOL_SIZE: Max number of connections to keep open (default: 5)
    - NARRATOR_DB_MAX_OVERFLOW: Max number of connections to create above pool_size (default: 10)
    - NARRATOR_DB_POOL_TIMEOUT: Seconds to wait for a connection from pool (default: 30)
    - NARRATOR_DB_POOL_RECYCLE: Seconds after which a connection is recycled (default: 300)
    """

    def __init__(self, database_url=None):
        """
        Initialize thread store with optional database URL.
        If no URL is provided, uses in-memory storage by default.
        This constructor doesn't establish database connections - they happen on first use.

        For immediate connection validation, use the async factory method:
        `store = await ThreadStore.create(database_url)`

        Args:
            database_url: SQLAlchemy async database URL. Examples:
                - "postgresql+asyncpg://user:pass@localhost/dbname"
                - "sqlite+aiosqlite:///path/to/db.sqlite"
                - None for in-memory storage
        """
        if database_url is None:
            # Default to in-memory storage
            logger.info("No database URL provided. Using in-memory storage.")
            self._backend = MemoryBackend()
        else:
            # Use SQLBackend with the provided URL
            logger.info(f"Using database URL: {database_url}")
            self._backend = SQLBackend(database_url)

        # Add initialization flag
        self._initialized = False

    @classmethod
    async def create(cls, database_url=None):
        """
        Factory method to create and initialize a ThreadStore.
        This method connects to the database immediately, allowing early validation
        of connection parameters.

        Args:
            database_url: SQLAlchemy async database URL. Examples:
                - "postgresql+asyncpg://user:pass@localhost/dbname"
                - "sqlite+aiosqlite:///path/to/db.sqlite"
                - None for in-memory storage

        Returns:
            Initialized ThreadStore instance

        Raises:
            Exception: If database connection fails
        """
        # Create instance
        store = cls(database_url)

        # Initialize immediately
        try:
            await store.initialize()
        except Exception as e:
            # If a database URL was provided but initialization failed, we should raise the error
            # instead of silently falling back to memory storage
            if database_url is not None:
                raise RuntimeError(f"Failed to initialize database with URL {database_url}: {str(e)}") from e
            raise

        return store

    async def _ensure_initialized(self) -> None:
        """Ensure the storage backend is initialized."""
        if not self._initialized:
            await self.initialize()
            self._initialized = True

    async def initialize(self) -> None:
        """Initialize the storage backend."""
        await self._backend.initialize()
        self._initialized = True

    async def save(self, thread: Thread) -> Thread:
        """
        Save a thread to storage, filtering out system messages.

        System messages are not persisted to storage by design, but are kept
        in the original Thread object in memory.

        Args:
            thread: The Thread object to save

        Returns:
            The original Thread object (with system messages intact)
        """
        await self._ensure_initialized()

        # Create a filtered copy of the thread without system messages
        filtered_thread = Thread(
            id=thread.id,
            title=thread.title,
            created_at=thread.created_at,
            updated_at=thread.updated_at,
            attributes=thread.attributes.copy() if thread.attributes else {},
            platforms=thread.platforms.copy() if thread.platforms else {}
        )

        # Only copy non-system messages to the filtered thread
        for message in thread.messages:
            if message.role != "system":
                # We create a shallow copy of the message to preserve the original
                filtered_thread.messages.append(message)

        # Save the filtered thread to storage
        await self._backend.save(filtered_thread)

        # Return the original thread (with system messages intact)
        return thread

    async def get(self, thread_id: str) -> Optional[Thread]:
        """Get a thread by ID."""
        await self._ensure_initialized()
        return await self._backend.get(thread_id)

    async def delete(self, thread_id: str) -> bool:
        """Delete a thread by ID."""
        await self._ensure_initialized()
        return await self._backend.delete(thread_id)

    async def list(self, limit: int = 100, offset: int = 0) -> List[Thread]:
        """List threads with pagination."""
        await self._ensure_initialized()
        return await self._backend.list(limit, offset)

    async def find_by_attributes(self, attributes: Dict[str, Any]) -> List[Thread]:
        """Find threads by matching attributes."""
        await self._ensure_initialized()
        return await self._backend.find_by_attributes(attributes)

    async def find_by_platform(self, platform_name: str, properties: Dict[str, Any]) -> List[Thread]:
        """Find threads by platform name and properties."""
        await self._ensure_initialized()
        return await self._backend.find_by_platform(platform_name, properties)

    async def list_recent(self, limit: Optional[int] = None) -> List[Thread]:
        """List recent threads."""
        await self._ensure_initialized()
        return await self._backend.list_recent(limit)

    async def find_messages_by_attribute(self, path: str, value: Any) -> List[Message]:
        """
        Find messages with a specific attribute at a given JSON path.
        This is useful for finding messages with specific metadata (like a Slack ts).

        Args:
            path: Dot-notation path to the attribute (e.g., "platforms.slack.ts")
            value: The value to search for

        Returns:
            List of Message objects that match the criteria (possibly empty)
        """
        await self._ensure_initialized()
        if hasattr(self._backend, 'find_messages_by_attribute'):
            message_records = await self._backend.find_messages_by_attribute(path, value)

            # Convert MessageRecord objects to Message objects
            messages = []
            if hasattr(self._backend, '_create_message_from_record'):
                for record in message_records:
                    message = self._backend._create_message_from_record(record)
                    messages.append(message)

            return messages
        else:
            # Fallback implementation for backends that don't support this method
            # This is less efficient but provides compatibility
            messages = []
            threads = await self._backend.list_recent(100)  # Get recent threads

            # Check each thread's messages
            for thread in threads:
                for message in thread.messages:
                    # Navigate the path to get the value
                    current = message
                    parts = path.split('.')

                    for part in parts:
                        if isinstance(current, dict) and part in current:
                            current = current[part]
                        elif hasattr(current, part):
                            current = getattr(current, part)
                        else:
                            current = None
                            break

                    # Check if we found a match
                    if current == value:
                        messages.append(message)

            return messages

    # Add properties to expose backend attributes
    @property
    def database_url(self):
        return getattr(self._backend, "database_url", None)

    @property
    def engine(self):
        return getattr(self._backend, "engine", None)

    async def get_thread_by_message_id(self, message_id: str) -> Optional[Thread]:
        """
        Find a thread containing a specific message ID.

        Args:
            message_id: The ID of the message to find

        Returns:
            The Thread containing the message, or None if not found
        """
        await self._ensure_initialized()

        # Check if backend has native implementation
        if hasattr(self._backend, 'get_thread_by_message_id'):
            return await self._backend.get_thread_by_message_id(message_id)

        # Fallback implementation for backends that don't support this method
        threads = await self._backend.list_recent(500)  # Get recent threads

        # Check each thread's messages for the message ID
        for thread in threads:
            for message in thread.messages:
                if message.id == message_id:
                    return thread

        return None

# Optional PostgreSQL-specific implementation
try:
    import asyncpg

    class SQLAlchemyThreadStore(ThreadStore):
        """PostgreSQL-based thread storage for production use."""

        def __init__(self, database_url):
            if not database_url.startswith('postgresql+asyncpg://'):
                database_url = database_url.replace('postgresql://', 'postgresql+asyncpg://')
            super().__init__(database_url)

except ImportError:
    pass
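To make the ThreadStore API above concrete, here is a minimal usage sketch. It is illustrative only: it assumes that Thread and Message are re-exported from the top-level narrator package (only ThreadStore is confirmed by the docstring above) and that Message accepts role and content keyword arguments; neither assumption is verified by this diff. The pooling environment variables from the docstring would be exported before the process starts.

import asyncio
from narrator import ThreadStore, Thread, Message  # Thread/Message exports assumed

async def main():
    # No URL -> MemoryBackend; pass e.g. "sqlite+aiosqlite:///threads.db" to persist.
    store = await ThreadStore.create()

    thread = Thread(title="demo")  # assumes id/created_at have defaults
    thread.messages.append(Message(role="system", content="You are a narrator."))
    thread.messages.append(Message(role="user", content="Describe slide 1."))

    # save() persists a copy with the system message stripped,
    # but returns the original thread untouched.
    await store.save(thread)
    stored = await store.get(thread.id)
    print(len(thread.messages), len(stored.messages))  # expected: 2 1

    # Dot-notation metadata search, e.g. by a Slack timestamp.
    hits = await store.find_messages_by_attribute("platforms.slack.ts", "1716400000.000100")
    print(hits)

asyncio.run(main())

The factory form is preferable to the bare constructor because a bad URL fails here, as a RuntimeError, rather than on the first operation.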
narrator/models/attachment.py
@@ -0,0 +1,363 @@
from typing import Dict, Optional, Any, Union, Literal
from pydantic import BaseModel, computed_field
import base64
import io
import magic
from ..utils.logging import get_logger
from pathlib import Path
from ..storage.file_store import FileStore
import hashlib

# Get configured logger
logger = get_logger(__name__)

class Attachment(BaseModel):
    """Represents a file attached to a message"""
    filename: str
    content: Optional[Union[bytes, str]] = None  # Can be either bytes or base64 string
    mime_type: Optional[str] = None
    attributes: Optional[Dict[str, Any]] = None  # Renamed from processed_content
    file_id: Optional[str] = None  # Reference to stored file
    storage_path: Optional[str] = None  # Path in storage backend
    storage_backend: Optional[str] = None  # Storage backend type
    status: Literal["pending", "stored", "failed"] = "pending"

    @computed_field
    @property
    def id(self) -> str:
        """Generate a unique ID based on content hash"""
        if self.content is None:
            # If no content, use filename and other attributes
            hash_input = f"{self.filename}{self.mime_type or ''}"
            return hashlib.sha256(hash_input.encode()).hexdigest()[:16]

        # Get content as bytes for hashing
        if isinstance(self.content, bytes):
            content_bytes = self.content
        elif isinstance(self.content, str):
            # Try to decode as base64 first
            try:
                content_bytes = base64.b64decode(self.content)
            except Exception:
                # If not base64, encode as UTF-8
                content_bytes = self.content.encode('utf-8')
        else:
            # Fallback to filename hash
            return hashlib.sha256(self.filename.encode()).hexdigest()[:16]

        # Create hash of filename + content
        hash_input = self.filename.encode() + content_bytes
        return hashlib.sha256(hash_input).hexdigest()[:16]

    @classmethod
    def from_file_path(cls, file_path: Union[str, Path]) -> 'Attachment':
        """Create an attachment from a file path"""
        file_path = Path(file_path)

        if not file_path.exists():
            raise FileNotFoundError(f"File not found: {file_path}")

        # Read file content
        content = file_path.read_bytes()

        # Detect MIME type
        mime_type = magic.from_buffer(content, mime=True)

        return cls(
            filename=file_path.name,
            content=content,
            mime_type=mime_type
        )

    def detect_mime_type(self) -> None:
        """Detect and set MIME type from content"""
        if self.content is None:
            logger.warning(f"Cannot detect MIME type for {self.filename}: no content")
            return

        # Get content as bytes
        if isinstance(self.content, bytes):
            content_bytes = self.content
        elif isinstance(self.content, str):
            try:
                content_bytes = base64.b64decode(self.content)
            except Exception:
                content_bytes = self.content.encode('utf-8')
        else:
            logger.warning(f"Cannot detect MIME type for {self.filename}: invalid content type")
            return

        # Detect MIME type
        detected_mime_type = magic.from_buffer(content_bytes, mime=True)

        if not self.mime_type:
            self.mime_type = detected_mime_type
            logger.debug(f"Detected MIME type for {self.filename}: {self.mime_type}")
        else:
            logger.debug(f"MIME type already set for {self.filename}: {self.mime_type}")

    def model_dump(self, mode: str = "json") -> Dict[str, Any]:
        """Convert attachment to a dictionary suitable for JSON serialization

        Args:
            mode: Serialization mode, either "json" or "python".
                "json" converts datetimes to ISO strings (default).
                "python" keeps datetimes as datetime objects.
        """
        data = {
            "filename": self.filename,
            "mime_type": self.mime_type,
            "attributes": self.attributes,
            "file_id": self.file_id,
            "storage_path": self.storage_path,
            "storage_backend": self.storage_backend,
            "status": self.status
        }

        return data

    async def get_content_bytes(self, file_store: Optional[FileStore] = None) -> bytes:
        """Get the content as bytes, converting from base64 if necessary

        If file_id is present, retrieves content from file storage.
        Otherwise falls back to content field.

        Args:
            file_store: FileStore instance to use for retrieving file content.
                Required when file_id is present.
        """
        logger.debug(f"Getting content bytes for {self.filename}")

        if self.file_id:
            logger.debug(f"Retrieving content from file store for file_id: {self.file_id}")
            if file_store is None:
                raise ValueError("FileStore instance required to retrieve content for file_id")
            if self.storage_path is None:
                raise ValueError("storage_path required to retrieve content for file_id")
            return await file_store.get(self.file_id, self.storage_path)

        if isinstance(self.content, bytes):
            logger.debug(f"Content is already in bytes format for {self.filename}")
            return self.content
        elif isinstance(self.content, str):
            logger.debug(f"Converting string content for {self.filename}")
            if self.content.startswith('data:'):
                # Handle data URLs
                logger.debug("Detected data URL format")
                header, encoded = self.content.split(",", 1)
                logger.debug(f"Data URL header: {header}")
                try:
                    decoded = base64.b64decode(encoded)
                    logger.debug(f"Successfully decoded data URL content, size: {len(decoded)} bytes")
                    return decoded
                except Exception as e:
                    logger.error(f"Failed to decode data URL content: {e}")
                    raise
            else:
                try:
                    # Try base64 decode
                    logger.debug("Attempting base64 decode")
                    decoded = base64.b64decode(self.content)
                    logger.debug(f"Successfully decoded base64 content, size: {len(decoded)} bytes")
                    return decoded
                except Exception:
                    logger.debug("Not base64, treating as UTF-8 text")
                    # If not base64, encode as UTF-8
                    return self.content.encode('utf-8')

        raise ValueError("No content available - attachment has neither file_id nor content")

    def update_attributes_with_url(self) -> None:
        """Update attributes with URL after storage_path is set."""
        if self.storage_path:
            if not self.attributes:
                self.attributes = {}

            try:
                # Get the file URL from FileStore
                self.attributes["url"] = FileStore.get_file_url(self.storage_path)
                logger.debug(f"Updated attributes with URL: {self.attributes['url']}")
            except Exception as e:
                # Log the error but don't fail - the URL will be missing but that's better than crashing
                logger.error(f"Failed to construct URL for attachment: {e}")
                self.attributes["error"] = f"Failed to construct URL: {str(e)}"

    async def process_and_store(self, file_store: FileStore, force: bool = False) -> None:
        """Process the attachment content and store it in the file store.

        Args:
            file_store: FileStore instance to use for storing files
            force: Whether to force processing even if already stored
        """
        logger.debug(f"Starting process_and_store for {self.filename} (force={force})")
        logger.debug(f"Initial state - mime_type: {self.mime_type}, status: {self.status}, content type: {type(self.content)}")

        if not force and self.status == "stored":
            logger.info(f"Skipping process_and_store for {self.filename} - already stored")
            return

        if self.content is None:
            logger.error(f"Cannot process attachment {self.filename}: no content provided")
            self.status = "failed"
            raise RuntimeError(f"Cannot process attachment {self.filename}: no content provided")

        try:
            # Get content as bytes first
            logger.debug("Converting content to bytes")
            content_bytes = await self.get_content_bytes(file_store=file_store)
            logger.debug(f"Successfully converted content to bytes, size: {len(content_bytes)} bytes")

            # Detect/verify MIME type
            logger.debug("Detecting MIME type")
            detected_mime_type = magic.from_buffer(content_bytes, mime=True)
            logger.debug(f"Detected MIME type: {detected_mime_type}")

            if not self.mime_type:
                self.mime_type = detected_mime_type
                logger.debug(f"Set MIME type to detected type: {self.mime_type}")
            elif self.mime_type != detected_mime_type:
                logger.warning(f"Provided MIME type {self.mime_type} doesn't match detected type {detected_mime_type}")

            # Initialize attributes
            if not self.attributes:
                self.attributes = {}

            # Process content based on MIME type
            logger.debug(f"Processing content based on MIME type: {self.mime_type}")

            if self.mime_type.startswith('image/'):
                logger.debug("Processing as image")
                self.attributes.update({
                    "type": "image",
                    "description": f"Image file {self.filename}",
                    "mime_type": self.mime_type
                })

            elif self.mime_type.startswith('audio/'):
                logger.debug("Processing as audio")
                self.attributes.update({
                    "type": "audio",
                    "description": f"Audio file {self.filename}",
                    "mime_type": self.mime_type
                })

            elif self.mime_type == 'application/pdf':
                logger.debug("Processing as PDF")
                try:
                    from pypdf import PdfReader
                    reader = PdfReader(io.BytesIO(content_bytes))
                    text = ""
                    for page in reader.pages:
                        try:
                            extracted = page.extract_text()
                            if extracted:
                                text += extracted + "\n"
                        except Exception as e:
                            logger.warning(f"Error extracting text from PDF page: {e}")
                            continue
                    self.attributes.update({
                        "type": "document",
                        "text": text.strip(),
                        "overview": f"Extracted text from {self.filename}",
                        "mime_type": self.mime_type
                    })
                except ImportError:
                    logger.warning("pypdf not available, skipping PDF text extraction")
                    self.attributes.update({
                        "type": "document",
                        "description": f"PDF document {self.filename}",
                        "mime_type": self.mime_type
                    })

            elif self.mime_type.startswith('text/'):
                logger.debug("Processing as text")
                try:
                    text = content_bytes.decode('utf-8')
                    self.attributes.update({
                        "type": "text",
                        "text": text,
                        "mime_type": self.mime_type
                    })
                except UnicodeDecodeError:
                    logger.warning("UTF-8 decode failed, trying alternative encodings")
                    # Try alternative encodings
                    for encoding in ['latin-1', 'cp1252', 'iso-8859-1']:
                        try:
                            text = content_bytes.decode(encoding)
                            self.attributes.update({
                                "type": "text",
                                "text": text,
                                "encoding": encoding,
                                "mime_type": self.mime_type
                            })
                            logger.debug(f"Successfully decoded text using {encoding}")
                            break
                        except UnicodeDecodeError:
                            continue

            elif self.mime_type == 'application/json':
                logger.debug("Processing as JSON")
                import json
                try:
                    json_text = content_bytes.decode('utf-8')
                    json_data = json.loads(json_text)
                    self.attributes.update({
                        "type": "json",
                        "overview": "JSON data structure",
                        "parsed_content": json_data,
                        "mime_type": self.mime_type
                    })
                except Exception as e:
                    logger.warning(f"Error parsing JSON content: {e}")
                    self.attributes.update({
                        "type": "json",
                        "error": f"Failed to parse JSON: {str(e)}",
                        "mime_type": self.mime_type
                    })

            else:
                logger.debug(f"Processing as binary file with MIME type: {self.mime_type}")
                self.attributes.update({
                    "type": "binary",
                    "description": f"Binary file {self.filename}",
                    "mime_type": self.mime_type
                })

            # Store the file
            logger.debug("Storing file in FileStore")

            try:
                logger.debug(f"Saving file to storage, content size: {len(content_bytes)} bytes")
                result = await file_store.save(content_bytes, self.filename, self.mime_type)
                logger.debug(f"Successfully saved file. Result: {result}")

                self.file_id = result['id']
                self.storage_backend = result['storage_backend']
                self.storage_path = result['storage_path']
                self.status = "stored"

                # Update filename to match the one created by the file store
                # Extract the actual filename from the storage path
                new_filename = Path(self.storage_path).name
                logger.debug(f"Updating attachment filename from {self.filename} to {new_filename}")
                self.filename = new_filename

                # Add storage info to attributes
                self.attributes["storage_path"] = self.storage_path
                self.update_attributes_with_url()

                # Clear content after successful storage
                self.content = None
                logger.debug(f"Cleared content after successful storage for {self.filename}")

                logger.debug(f"Successfully processed and stored attachment {self.filename}")

            except Exception as e:
                logger.error(f"Error processing attachment {self.filename}: {e}")
                self.status = "failed"
                raise

        except Exception as e:
            logger.error(f"Failed to process attachment {self.filename}: {str(e)}")
            self.status = "failed"
            raise RuntimeError(f"Failed to process attachment {self.filename}: {str(e)}") from e
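A short end-to-end sketch of the attachment lifecycle follows. It is illustrative only: it assumes FileStore() can be constructed without arguments (the real constructor lives in narrator/storage/file_store.py, which is not shown in this hunk) and that python-magic's native libmagic library is installed.

import asyncio
from narrator.models.attachment import Attachment
from narrator.storage.file_store import FileStore  # zero-arg constructor assumed

async def main():
    att = Attachment.from_file_path("notes.txt")  # reads bytes, sniffs MIME type
    print(att.id)  # 16 hex chars: sha256(filename + content), truncated

    store = FileStore()
    await att.process_and_store(store)

    # After storage the inline content is cleared and metadata points at the store.
    print(att.status, att.file_id, att.storage_path)
    data = await att.get_content_bytes(file_store=store)  # round-trips via the store
    print(data[:20])

    # get_content_bytes also accepts data URLs and raw base64 strings directly.
    inline = Attachment(filename="pixel.png", content="data:image/png;base64,iVBORw0KGgo=")
    print(await inline.get_content_bytes())

asyncio.run(main())

Note that process_and_store renames the attachment to the filename chosen by the file store, so att.filename may differ from "notes.txt" afterwards.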