autoforge-ai 0.1.17 → 0.1.18
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/requirements-prod.txt +4 -0
- package/server/routers/expand_project.py +3 -3
- package/server/routers/spec_creation.py +3 -3
- package/server/schemas.py +42 -17
- package/server/services/chat_constants.py +36 -0
- package/server/services/expand_chat_session.py +14 -13
- package/server/services/spec_chat_session.py +12 -14
- package/server/utils/document_extraction.py +221 -0
- package/ui/dist/assets/{index-CkQ1S0MR.js → index-DXm5cuJA.js} +30 -30
- package/ui/dist/assets/index-DlYws_VI.css +1 -0
- package/ui/dist/index.html +2 -2
- package/ui/dist/assets/index-CP8iLkFV.css +0 -1
package/package.json
CHANGED
package/requirements-prod.txt
CHANGED
|
@@ -13,7 +13,7 @@ from typing import Optional
|
|
|
13
13
|
from fastapi import APIRouter, HTTPException, WebSocket, WebSocketDisconnect
|
|
14
14
|
from pydantic import BaseModel, ValidationError
|
|
15
15
|
|
|
16
|
-
from ..schemas import
|
|
16
|
+
from ..schemas import FileAttachment
|
|
17
17
|
from ..services.expand_chat_session import (
|
|
18
18
|
ExpandChatSession,
|
|
19
19
|
create_expand_session,
|
|
@@ -181,12 +181,12 @@ async def expand_project_websocket(websocket: WebSocket, project_name: str):
|
|
|
181
181
|
user_content = message.get("content", "").strip()
|
|
182
182
|
|
|
183
183
|
# Parse attachments if present
|
|
184
|
-
attachments: list[
|
|
184
|
+
attachments: list[FileAttachment] = []
|
|
185
185
|
raw_attachments = message.get("attachments", [])
|
|
186
186
|
if raw_attachments:
|
|
187
187
|
try:
|
|
188
188
|
for raw_att in raw_attachments:
|
|
189
|
-
attachments.append(
|
|
189
|
+
attachments.append(FileAttachment(**raw_att))
|
|
190
190
|
except (ValidationError, Exception) as e:
|
|
191
191
|
logger.warning(f"Invalid attachment data: {e}")
|
|
192
192
|
await websocket.send_json({
|
|
@@ -12,7 +12,7 @@ from typing import Optional
|
|
|
12
12
|
from fastapi import APIRouter, HTTPException, WebSocket, WebSocketDisconnect
|
|
13
13
|
from pydantic import BaseModel, ValidationError
|
|
14
14
|
|
|
15
|
-
from ..schemas import
|
|
15
|
+
from ..schemas import FileAttachment
|
|
16
16
|
from ..services.spec_chat_session import (
|
|
17
17
|
SpecChatSession,
|
|
18
18
|
create_session,
|
|
@@ -242,12 +242,12 @@ async def spec_chat_websocket(websocket: WebSocket, project_name: str):
|
|
|
242
242
|
user_content = message.get("content", "").strip()
|
|
243
243
|
|
|
244
244
|
# Parse attachments if present
|
|
245
|
-
attachments: list[
|
|
245
|
+
attachments: list[FileAttachment] = []
|
|
246
246
|
raw_attachments = message.get("attachments", [])
|
|
247
247
|
if raw_attachments:
|
|
248
248
|
try:
|
|
249
249
|
for raw_att in raw_attachments:
|
|
250
|
-
attachments.append(
|
|
250
|
+
attachments.append(FileAttachment(**raw_att))
|
|
251
251
|
except (ValidationError, Exception) as e:
|
|
252
252
|
logger.warning(f"Invalid attachment data: {e}")
|
|
253
253
|
await websocket.send_json({
|
package/server/schemas.py
CHANGED
|
@@ -11,7 +11,7 @@ from datetime import datetime
|
|
|
11
11
|
from pathlib import Path
|
|
12
12
|
from typing import Literal
|
|
13
13
|
|
|
14
|
-
from pydantic import BaseModel, Field, field_validator
|
|
14
|
+
from pydantic import BaseModel, Field, field_validator, model_validator
|
|
15
15
|
|
|
16
16
|
# Import model constants from registry (single source of truth)
|
|
17
17
|
_root = Path(__file__).parent.parent
|
|
@@ -331,36 +331,61 @@ class WSAgentUpdateMessage(BaseModel):
|
|
|
331
331
|
|
|
332
332
|
|
|
333
333
|
# ============================================================================
|
|
334
|
-
#
|
|
334
|
+
# Chat Attachment Schemas
|
|
335
335
|
# ============================================================================
|
|
336
336
|
|
|
337
|
-
#
|
|
338
|
-
MAX_IMAGE_SIZE = 5 * 1024 * 1024
|
|
337
|
+
# Upload size ceilings in bytes (n << 20 is n * 1024 * 1024)
MAX_IMAGE_SIZE = 5 << 20  # 5 MB for images
MAX_DOCUMENT_SIZE = 20 << 20  # 20 MB for documents

# MIME types that are treated as images
_IMAGE_MIME_TYPES = {'image/png', 'image/jpeg'}
|
|
340
342
|
|
|
341
|
-
|
|
342
|
-
|
|
343
|
+
|
|
344
|
+
class FileAttachment(BaseModel):
    """Attachment uploaded by the client in a spec creation / expand project chat.

    Holds the original filename, a MIME type restricted to the listed image
    and document formats, and the file payload as a base64 string.
    """

    filename: str = Field(..., min_length=1, max_length=255)
    mimeType: Literal[
        'image/jpeg', 'image/png',
        'text/plain', 'text/markdown', 'text/csv',
        'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
        'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
        'application/pdf',
        'application/vnd.openxmlformats-officedocument.presentationml.presentation',
    ]
    base64Data: str

    @field_validator('base64Data')
    @classmethod
    def validate_base64(cls, v: str) -> str:
        """Reject a payload that ``base64.b64decode`` cannot decode."""
        try:
            base64.b64decode(v)
        except Exception as e:
            raise ValueError(f'Invalid base64 data: {e}')
        return v

    @model_validator(mode='after')
    def validate_size(self) -> 'FileAttachment':
        """Check the decoded payload against the size limit for its MIME type."""
        try:
            payload = base64.b64decode(self.base64Data)
        except Exception:
            # Undecodable data is the field validator's job to report
            return self

        # Images get the image ceiling; every other accepted type is a document
        treat_as_image = self.mimeType in _IMAGE_MIME_TYPES
        label = "Image" if treat_as_image else "Document"
        max_size = MAX_IMAGE_SIZE if treat_as_image else MAX_DOCUMENT_SIZE

        size = len(payload)
        if size <= max_size:
            return self
        raise ValueError(
            f'{label} size ({size / (1024 * 1024):.1f} MB) exceeds '
            f'maximum of {max_size // (1024 * 1024)} MB'
        )
|
|
388
|
+
|
|
364
389
|
|
|
365
390
|
# ============================================================================
|
|
366
391
|
# Filesystem Schemas
|
|
@@ -35,6 +35,13 @@ if _root_str not in sys.path:
|
|
|
35
35
|
from env_constants import API_ENV_VARS # noqa: E402, F401
|
|
36
36
|
from rate_limit_utils import is_rate_limit_error, parse_retry_after # noqa: E402, F401
|
|
37
37
|
|
|
38
|
+
from ..schemas import FileAttachment
|
|
39
|
+
from ..utils.document_extraction import (
|
|
40
|
+
extract_text_from_document,
|
|
41
|
+
is_document,
|
|
42
|
+
is_image,
|
|
43
|
+
)
|
|
44
|
+
|
|
38
45
|
logger = logging.getLogger(__name__)
|
|
39
46
|
|
|
40
47
|
|
|
@@ -88,6 +95,35 @@ async def safe_receive_response(client: Any, log: logging.Logger) -> AsyncGenera
|
|
|
88
95
|
raise
|
|
89
96
|
|
|
90
97
|
|
|
98
|
+
def build_attachment_content_blocks(attachments: list[FileAttachment]) -> list[dict]:
    """Translate chat attachments into Claude API content blocks.

    An image attachment yields an ``image`` block that carries its base64
    payload as-is. A document attachment is passed through text extraction
    and yields a ``text`` block that starts with a line naming the file.
    An attachment whose MIME type is neither produces no block.

    Raises:
        DocumentExtractionError: If a document cannot be read.
    """
    content: list[dict] = []
    for attachment in attachments:
        mime = attachment.mimeType

        if is_image(mime):
            source = {
                "type": "base64",
                "media_type": mime,
                "data": attachment.base64Data,
            }
            content.append({"type": "image", "source": source})
            continue

        if not is_document(mime):
            continue

        extracted = extract_text_from_document(
            attachment.base64Data, mime, attachment.filename
        )
        content.append({
            "type": "text",
            "text": f"[Content of uploaded file: {attachment.filename}]\n\n{extracted}",
        })
    return content
|
|
125
|
+
|
|
126
|
+
|
|
91
127
|
async def make_multimodal_message(content_blocks: list[dict]) -> AsyncGenerator[dict, None]:
|
|
92
128
|
"""Yield a single multimodal user message in Claude Agent SDK format.
|
|
93
129
|
|
|
@@ -21,9 +21,11 @@ from typing import Any, AsyncGenerator, Optional
|
|
|
21
21
|
from claude_agent_sdk import ClaudeAgentOptions, ClaudeSDKClient
|
|
22
22
|
from dotenv import load_dotenv
|
|
23
23
|
|
|
24
|
-
from ..schemas import
|
|
24
|
+
from ..schemas import FileAttachment
|
|
25
|
+
from ..utils.document_extraction import DocumentExtractionError
|
|
25
26
|
from .chat_constants import (
|
|
26
27
|
ROOT_DIR,
|
|
28
|
+
build_attachment_content_blocks,
|
|
27
29
|
check_rate_limit_error,
|
|
28
30
|
make_multimodal_message,
|
|
29
31
|
safe_receive_response,
|
|
@@ -226,7 +228,7 @@ class ExpandChatSession:
|
|
|
226
228
|
async def send_message(
|
|
227
229
|
self,
|
|
228
230
|
user_message: str,
|
|
229
|
-
attachments: list[
|
|
231
|
+
attachments: list[FileAttachment] | None = None
|
|
230
232
|
) -> AsyncGenerator[dict, None]:
|
|
231
233
|
"""
|
|
232
234
|
Send user message and stream Claude's response.
|
|
@@ -273,7 +275,7 @@ class ExpandChatSession:
|
|
|
273
275
|
async def _query_claude(
|
|
274
276
|
self,
|
|
275
277
|
message: str,
|
|
276
|
-
attachments: list[
|
|
278
|
+
attachments: list[FileAttachment] | None = None
|
|
277
279
|
) -> AsyncGenerator[dict, None]:
|
|
278
280
|
"""
|
|
279
281
|
Internal method to query Claude and stream responses.
|
|
@@ -289,17 +291,16 @@ class ExpandChatSession:
|
|
|
289
291
|
content_blocks: list[dict[str, Any]] = []
|
|
290
292
|
if message:
|
|
291
293
|
content_blocks.append({"type": "text", "text": message})
|
|
292
|
-
|
|
293
|
-
|
|
294
|
-
|
|
295
|
-
|
|
296
|
-
|
|
297
|
-
|
|
298
|
-
|
|
299
|
-
|
|
300
|
-
})
|
|
294
|
+
|
|
295
|
+
# Add attachment blocks (images as image blocks, documents as extracted text)
|
|
296
|
+
try:
|
|
297
|
+
content_blocks.extend(build_attachment_content_blocks(attachments))
|
|
298
|
+
except DocumentExtractionError as e:
|
|
299
|
+
yield {"type": "error", "content": str(e)}
|
|
300
|
+
return
|
|
301
|
+
|
|
301
302
|
await self.client.query(make_multimodal_message(content_blocks))
|
|
302
|
-
logger.info(f"Sent multimodal message with {len(attachments)}
|
|
303
|
+
logger.info(f"Sent multimodal message with {len(attachments)} attachment(s)")
|
|
303
304
|
else:
|
|
304
305
|
await self.client.query(message)
|
|
305
306
|
|
|
@@ -18,9 +18,11 @@ from typing import Any, AsyncGenerator, Optional
|
|
|
18
18
|
from claude_agent_sdk import ClaudeAgentOptions, ClaudeSDKClient
|
|
19
19
|
from dotenv import load_dotenv
|
|
20
20
|
|
|
21
|
-
from ..schemas import
|
|
21
|
+
from ..schemas import FileAttachment
|
|
22
|
+
from ..utils.document_extraction import DocumentExtractionError
|
|
22
23
|
from .chat_constants import (
|
|
23
24
|
ROOT_DIR,
|
|
25
|
+
build_attachment_content_blocks,
|
|
24
26
|
check_rate_limit_error,
|
|
25
27
|
make_multimodal_message,
|
|
26
28
|
safe_receive_response,
|
|
@@ -201,7 +203,7 @@ class SpecChatSession:
|
|
|
201
203
|
async def send_message(
|
|
202
204
|
self,
|
|
203
205
|
user_message: str,
|
|
204
|
-
attachments: list[
|
|
206
|
+
attachments: list[FileAttachment] | None = None
|
|
205
207
|
) -> AsyncGenerator[dict, None]:
|
|
206
208
|
"""
|
|
207
209
|
Send user message and stream Claude's response.
|
|
@@ -247,7 +249,7 @@ class SpecChatSession:
|
|
|
247
249
|
async def _query_claude(
|
|
248
250
|
self,
|
|
249
251
|
message: str,
|
|
250
|
-
attachments: list[
|
|
252
|
+
attachments: list[FileAttachment] | None = None
|
|
251
253
|
) -> AsyncGenerator[dict, None]:
|
|
252
254
|
"""
|
|
253
255
|
Internal method to query Claude and stream responses.
|
|
@@ -273,21 +275,17 @@ class SpecChatSession:
|
|
|
273
275
|
if message:
|
|
274
276
|
content_blocks.append({"type": "text", "text": message})
|
|
275
277
|
|
|
276
|
-
# Add image blocks
|
|
277
|
-
|
|
278
|
-
content_blocks.
|
|
279
|
-
|
|
280
|
-
|
|
281
|
-
|
|
282
|
-
"media_type": att.mimeType,
|
|
283
|
-
"data": att.base64Data,
|
|
284
|
-
}
|
|
285
|
-
})
|
|
278
|
+
# Add attachment blocks (images as image blocks, documents as extracted text)
|
|
279
|
+
try:
|
|
280
|
+
content_blocks.extend(build_attachment_content_blocks(attachments))
|
|
281
|
+
except DocumentExtractionError as e:
|
|
282
|
+
yield {"type": "error", "content": str(e)}
|
|
283
|
+
return
|
|
286
284
|
|
|
287
285
|
# Send multimodal content to Claude using async generator format
|
|
288
286
|
# The SDK's query() accepts AsyncIterable[dict] for custom message formats
|
|
289
287
|
await self.client.query(make_multimodal_message(content_blocks))
|
|
290
|
-
logger.info(f"Sent multimodal message with {len(attachments)}
|
|
288
|
+
logger.info(f"Sent multimodal message with {len(attachments)} attachment(s)")
|
|
291
289
|
else:
|
|
292
290
|
# Text-only message: use string format
|
|
293
291
|
await self.client.query(message)
|
|
@@ -0,0 +1,221 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Document Extraction Utility
|
|
3
|
+
============================
|
|
4
|
+
|
|
5
|
+
Extracts text content from various document formats in memory (no disk I/O).
|
|
6
|
+
Supports: TXT, MD, CSV, DOCX, XLSX, PDF, PPTX.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
import base64
|
|
10
|
+
import csv
|
|
11
|
+
import io
|
|
12
|
+
import logging
|
|
13
|
+
|
|
14
|
+
logger = logging.getLogger(__name__)

# Cap, in characters, on how much extracted text is sent to Claude
MAX_EXTRACTED_CHARS = 200_000

# Excel caps: rows read per sheet, and sheets read per workbook
MAX_EXCEL_ROWS_PER_SHEET = 10_000
MAX_EXCEL_SHEETS = 50

# Supported image MIME types
IMAGE_MIME_TYPES = {"image/jpeg", "image/png"}

# Supported document MIME types, each mapped to its file extension
DOCUMENT_MIME_TYPES: dict[str, str] = {
    "text/plain": ".txt",
    "text/markdown": ".md",
    "text/csv": ".csv",
    "application/vnd.openxmlformats-officedocument.wordprocessingml.document": ".docx",
    "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet": ".xlsx",
    "application/pdf": ".pdf",
    "application/vnd.openxmlformats-officedocument.presentationml.presentation": ".pptx",
}

# Every MIME type accepted as an attachment (iterating the dict yields its keys)
ALL_ALLOWED_MIME_TYPES = IMAGE_MIME_TYPES.union(DOCUMENT_MIME_TYPES)
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
def is_image(mime_type: str) -> bool:
    """Return True when ``mime_type`` is one of ``IMAGE_MIME_TYPES``."""
    supported = mime_type in IMAGE_MIME_TYPES
    return supported
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
def is_document(mime_type: str) -> bool:
    """Return True when ``mime_type`` is a key of ``DOCUMENT_MIME_TYPES``."""
    supported = mime_type in DOCUMENT_MIME_TYPES
    return supported
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
class DocumentExtractionError(Exception):
    """Signals that text could not be extracted from a document.

    The exception message has the form ``Failed to read <filename>: <reason>``;
    both parts are also kept as attributes.
    """

    def __init__(self, filename: str, reason: str):
        message = f"Failed to read {filename}: {reason}"
        super().__init__(message)
        # Name of the file that could not be read
        self.filename = filename
        # Short explanation of the failure
        self.reason = reason
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
def _truncate(text: str) -> str:
|
|
59
|
+
"""Truncate text if it exceeds the maximum character limit."""
|
|
60
|
+
if len(text) > MAX_EXTRACTED_CHARS:
|
|
61
|
+
omitted = len(text) - MAX_EXTRACTED_CHARS
|
|
62
|
+
return text[:MAX_EXTRACTED_CHARS] + f"\n\n[... truncated, {omitted:,} characters omitted]"
|
|
63
|
+
return text
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
def _extract_plain_text(data: bytes) -> str:
    """Decode the bytes of a plain text or markdown file.

    UTF-8 is tried first, with a leading byte-order mark removed. Bytes that
    are not valid UTF-8 are decoded as Latin-1, which maps every byte to a
    character and therefore cannot fail.

    Args:
        data: Raw file content.

    Returns:
        The decoded text.
    """
    try:
        # "utf-8-sig" is UTF-8 that also drops a leading BOM, so the marker
        # does not end up as an invisible first character of the text
        return data.decode("utf-8-sig")
    except UnicodeDecodeError:
        return data.decode("latin-1")
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
def _extract_csv(data: bytes) -> str:
    """Render a CSV file as numbered rows of comma-separated cells.

    The bytes are decoded as UTF-8 (a leading byte-order mark is removed),
    falling back to Latin-1 when they are not valid UTF-8. Each parsed row
    becomes one line of the form ``Row <n>: cell, cell, ...``, numbered
    from 1.

    Args:
        data: Raw file content.

    Returns:
        The rendered rows joined with newlines; an empty string for an
        empty file.
    """
    try:
        # "utf-8-sig" drops a leading BOM so it is not glued to the first cell
        text = data.decode("utf-8-sig")
    except UnicodeDecodeError:
        text = data.decode("latin-1")

    # newline="" hands line endings to the csv module untouched, as its
    # documentation asks; this also lets CR-only files split into rows
    reader = csv.reader(io.StringIO(text, newline=""))
    return "\n".join(
        f"Row {number}: {', '.join(row)}"
        for number, row in enumerate(reader, start=1)
    )
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
def _extract_docx(data: bytes) -> str:
    """Collect the paragraph text of a Word document.

    Only ``doc.paragraphs`` is read. Paragraphs that are empty or
    whitespace-only are dropped; the rest are joined with blank lines.
    """
    from docx import Document

    document = Document(io.BytesIO(data))
    kept = []
    for para in document.paragraphs:
        content = para.text
        if content.strip():
            kept.append(content)
    return "\n\n".join(kept)
|
|
95
|
+
|
|
96
|
+
|
|
97
|
+
def _extract_xlsx(data: bytes) -> str:
    """Extract cell values from an Excel workbook as tab-separated text.

    The workbook is opened read-only with ``data_only=True``. Each sheet
    becomes a section headed ``=== Sheet: <name> ===`` followed by one line
    per row, cells separated by tabs and ``None`` cells rendered as empty
    strings. At most ``MAX_EXCEL_SHEETS`` sheets and
    ``MAX_EXCEL_ROWS_PER_SHEET`` rows per sheet are read; a marker line
    notes anything left out.

    Args:
        data: Raw file content.

    Returns:
        The sheet sections joined with blank lines.
    """
    from openpyxl import load_workbook

    wb = load_workbook(io.BytesIO(data), read_only=True, data_only=True)
    sections = []

    # Close the workbook even when reading a sheet raises
    try:
        sheet_names = wb.sheetnames
        for sheet_idx, sheet_name in enumerate(sheet_names):
            if sheet_idx >= MAX_EXCEL_SHEETS:
                sections.append(f"\n[... {len(sheet_names) - MAX_EXCEL_SHEETS} more sheets omitted]")
                break

            ws = wb[sheet_name]
            if not hasattr(ws, "iter_rows"):
                # A sheet without a cell grid (presumably a chart sheet) has
                # nothing to extract; skip it rather than fail the whole file
                continue

            rows_text = [f"=== Sheet: {sheet_name} ==="]
            for row_count, row in enumerate(ws.iter_rows(values_only=True)):
                if row_count >= MAX_EXCEL_ROWS_PER_SHEET:
                    rows_text.append(f"[... more rows omitted, limit {MAX_EXCEL_ROWS_PER_SHEET:,} rows/sheet]")
                    break
                rows_text.append("\t".join("" if cell is None else str(cell) for cell in row))

            sections.append("\n".join(rows_text))
    finally:
        wb.close()

    return "\n\n".join(sections)
|
|
125
|
+
|
|
126
|
+
|
|
127
|
+
def _extract_pdf(data: bytes, filename: str) -> str:
    """Extract the text of every page of a PDF.

    Pages with no text, or only whitespace, are left out. Each remaining
    page is headed ``--- Page <n> ---`` (numbered from 1 by position in the
    file) and pages are joined with blank lines.

    Raises:
        DocumentExtractionError: If the PDF is reported as encrypted, or if
            opening it fails with a message that mentions encryption or a
            password.
        PdfReadError: For any other failure to open the file.
    """
    from PyPDF2 import PdfReader
    from PyPDF2.errors import PdfReadError

    try:
        reader = PdfReader(io.BytesIO(data))
    except PdfReadError as e:
        detail = str(e).lower()
        if any(hint in detail for hint in ("encrypt", "password")):
            raise DocumentExtractionError(filename, "PDF is password-protected")
        raise

    if reader.is_encrypted:
        raise DocumentExtractionError(filename, "PDF is password-protected")

    numbered = (
        (number, page.extract_text())
        for number, page in enumerate(reader.pages, start=1)
    )
    return "\n\n".join(
        f"--- Page {number} ---\n{text}"
        for number, text in numbered
        if text and text.strip()
    )
|
|
149
|
+
|
|
150
|
+
|
|
151
|
+
def _extract_pptx(data: bytes) -> str:
    """Collect the text-frame paragraphs of a PowerPoint presentation.

    For each slide, the stripped, non-empty paragraphs of every shape that
    has a text frame are gathered one per line. Slides that yield text are
    headed ``--- Slide <n> ---`` (numbered from 1 by position) and joined
    with blank lines; slides without text are left out.
    """
    from pptx import Presentation

    deck = Presentation(io.BytesIO(data))
    rendered = []

    for number, slide in enumerate(deck.slides, start=1):
        lines = []
        for shape in slide.shapes:
            if not shape.has_text_frame:
                continue
            stripped = (para.text.strip() for para in shape.text_frame.paragraphs)
            lines.extend(line for line in stripped if line)

        if lines:
            rendered.append(f"--- Slide {number} ---\n" + "\n".join(lines))

    return "\n\n".join(rendered)
|
|
170
|
+
|
|
171
|
+
|
|
172
|
+
def extract_text_from_document(base64_data: str, mime_type: str, filename: str) -> str:
    """
    Decode a base64 document and return its text.

    The MIME type selects the extractor. The result is truncated when it
    is too long; a file that yields no text produces a bracketed
    placeholder naming the file instead.

    Args:
        base64_data: Base64-encoded file content
        mime_type: MIME type of the document
        filename: Original filename (for error messages)

    Returns:
        Extracted text content, truncated if necessary

    Raises:
        DocumentExtractionError: If the MIME type is unsupported, the
            base64 data cannot be decoded, or the extractor fails
    """
    if mime_type not in DOCUMENT_MIME_TYPES:
        raise DocumentExtractionError(filename, f"unsupported document type: {mime_type}")

    try:
        data = base64.b64decode(base64_data)
    except Exception as e:
        raise DocumentExtractionError(filename, f"invalid base64 data: {e}")

    # One extractor per MIME type; the PDF one also needs the filename
    extractors = {
        "text/plain": _extract_plain_text,
        "text/markdown": _extract_plain_text,
        "text/csv": _extract_csv,
        "application/vnd.openxmlformats-officedocument.wordprocessingml.document": _extract_docx,
        "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet": _extract_xlsx,
        "application/pdf": lambda raw: _extract_pdf(raw, filename),
        "application/vnd.openxmlformats-officedocument.presentationml.presentation": _extract_pptx,
    }

    try:
        extractor = extractors.get(mime_type)
        if extractor is None:
            raise DocumentExtractionError(filename, f"unsupported document type: {mime_type}")
        text = extractor(data)
    except DocumentExtractionError:
        # Already carries a specific reason; pass it through untouched
        raise
    except Exception as e:
        logger.warning(f"Document extraction failed for {filename}: {e}")
        raise DocumentExtractionError(
            filename, "file appears to be corrupt or in an unexpected format"
        )

    if text and text.strip():
        return _truncate(text)
    return f"[File {filename} is empty or contains no extractable text]"
|