jaf-py 2.3.1__py3-none-any.whl → 2.4.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,561 @@
+ """
+ Document processing utilities for the JAF framework.
+
+ This module provides robust document content extraction with support for various
+ formats including PDF, Word documents, Excel spreadsheets, and more.
+ """
+
+ import asyncio
+ import base64
+ import csv
+ import io
+ import json
+ import os
+ import zipfile
+ from pathlib import Path
+ from typing import Any, Dict, List, Optional
+
+ try:
+     import aiofiles
+     HAS_AIOFILES = True
+ except ImportError:
+     HAS_AIOFILES = False
+
+ try:
+     import httpx
+     HAS_HTTPX = True
+ except ImportError:
+     HAS_HTTPX = False
+
+ from pydantic import BaseModel
+
+ from ..core.types import Attachment
+
+ # Optional imports with graceful fallbacks
+ try:
+     import PyPDF2
+     HAS_PDF = True
+ except ImportError:
+     HAS_PDF = False
+
+ try:
+     from docx import Document
+     HAS_DOCX = True
+ except ImportError:
+     HAS_DOCX = False
+
+ try:
+     from openpyxl import load_workbook
+     HAS_EXCEL = True
+ except ImportError:
+     HAS_EXCEL = False
+
+ try:
+     import magic
+     HAS_MAGIC = True
+ except ImportError:
+     HAS_MAGIC = False
+
+ try:
+     from PIL import Image
+     HAS_PIL = True
+ except ImportError:
+     HAS_PIL = False
+
+
+ # Constants
+ FETCH_TIMEOUT = 30.0
+ MAX_DOCUMENT_SIZE = 25 * 1024 * 1024  # 25MB
+ MAX_CSV_PREVIEW_ROWS = 10
+ MAX_EXCEL_SHEETS = 3
+ MAX_EXCEL_ROWS_PER_SHEET = 20
+
+
+
+ class DocumentProcessingError(Exception):
+     """Exception raised when document processing fails."""
+
+     def __init__(self, message: str, cause: Optional[Exception] = None):
+         super().__init__(message)
+         self.cause = cause
+
+
+ class NetworkError(Exception):
+     """Exception raised when network operations fail."""
+
+     def __init__(self, message: str, status_code: Optional[int] = None):
+         super().__init__(message)
+         self.status_code = status_code
+
+
+ class ProcessedDocument(BaseModel):
+     """Result of document processing."""
+     content: str
+     metadata: Optional[Dict[str, Any]] = None
+
+
+ async def _fetch_url_content(url: str) -> tuple[bytes, Optional[str]]:
+     """
+     Fetch content from URL and return as bytes with content type.
+
+     Args:
+         url: URL to fetch
+
+     Returns:
+         Tuple of (content_bytes, content_type)
+
+     Raises:
+         NetworkError: If fetch fails
+         DocumentProcessingError: If file is too large
+     """
+     if not HAS_HTTPX:
+         raise DocumentProcessingError(
+             "URL fetching not available. Install with: pip install 'jaf-py[attachments]'"
+         )
+
+     try:
+         async with httpx.AsyncClient(timeout=FETCH_TIMEOUT) as client:
+             # First check content length with a HEAD request if possible
+             try:
+                 head_response = await client.head(
+                     url,
+                     headers={'User-Agent': 'JAF-DocumentProcessor/1.0'},
+                     timeout=FETCH_TIMEOUT / 2  # Shorter timeout for HEAD request
+                 )
+                 head_response.raise_for_status()
+
+                 # Check Content-Length header if present
+                 content_length_str = head_response.headers.get('content-length')
+                 if content_length_str and content_length_str.isdigit():
+                     content_length = int(content_length_str)
+                     if content_length > MAX_DOCUMENT_SIZE:
+                         size_mb = round(content_length / 1024 / 1024)
+                         max_mb = round(MAX_DOCUMENT_SIZE / 1024 / 1024)
+                         raise DocumentProcessingError(
+                             f"File size ({size_mb}MB) exceeds maximum allowed size ({max_mb}MB)"
+                         )
+             except (httpx.HTTPStatusError, httpx.RequestError):
+                 # HEAD request failed, we'll check size during streaming
+                 pass
+
+             # Stream the response to validate size as we download
+             content_type = None
+             accumulated_bytes = bytearray()
+             async with client.stream(
+                 'GET',
+                 url,
+                 headers={'User-Agent': 'JAF-DocumentProcessor/1.0'}
+             ) as response:
+                 response.raise_for_status()
+                 content_type = response.headers.get('content-type')
+
+                 # Process the response in chunks
+                 async for chunk in response.aiter_bytes(chunk_size=8192):
+                     accumulated_bytes.extend(chunk)
+                     if len(accumulated_bytes) > MAX_DOCUMENT_SIZE:
+                         size_mb = round(len(accumulated_bytes) / 1024 / 1024)
+                         max_mb = round(MAX_DOCUMENT_SIZE / 1024 / 1024)
+                         raise DocumentProcessingError(
+                             f"File size ({size_mb}MB) exceeds maximum allowed size ({max_mb}MB)"
+                         )
+
+             return bytes(accumulated_bytes), content_type
+
+     except DocumentProcessingError:
+         # Let size-limit errors propagate unchanged instead of being
+         # re-wrapped as NetworkError by the handlers below
+         raise
+     except httpx.HTTPStatusError as e:
+         raise NetworkError(f"HTTP {e.response.status_code}: {e.response.reason_phrase}", e.response.status_code)
+     except httpx.RequestError as e:
+         raise NetworkError(f"Failed to fetch URL content: {e}") from e
+     except Exception as e:
+         # Preserve system exceptions
+         if isinstance(e, (KeyboardInterrupt, SystemExit, GeneratorExit, MemoryError)):
+             raise
+         raise NetworkError(f"Failed to fetch URL content: {e}") from e
+
+
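+ # Illustrative sketch of the fetch helper above, called from inside an
+ # async function (requires httpx; the URL is a placeholder):
+ #
+ #     content_bytes, content_type = await _fetch_url_content(
+ #         'https://example.com/report.pdf'
+ #     )
+ #     # content_bytes is capped at MAX_DOCUMENT_SIZE; larger responses
+ #     # raise DocumentProcessingError during streaming
+
+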
+ async def extract_document_content(attachment: Attachment) -> ProcessedDocument:
+     """
+     Extract text content from various document formats.
+
+     Args:
+         attachment: Attachment to process
+
+     Returns:
+         ProcessedDocument with extracted content
+
+     Raises:
+         DocumentProcessingError: If processing fails
+     """
+     # Get content as bytes
+     if attachment.url and not attachment.data:
+         content_bytes, detected_mime_type = await _fetch_url_content(attachment.url)
+         mime_type = attachment.mime_type or detected_mime_type
+     elif attachment.data:
+         content_bytes = base64.b64decode(attachment.data)
+         mime_type = attachment.mime_type
+     else:
+         raise DocumentProcessingError('No document data or URL provided')
+
+     # Normalize MIME type
+     mime_type = mime_type.lower() if mime_type else None
+
+     # Process based on MIME type
+     if mime_type == 'application/pdf':
+         return await _extract_pdf_content(content_bytes)
+     elif mime_type == 'application/vnd.openxmlformats-officedocument.wordprocessingml.document':
+         return _extract_docx_content(content_bytes)
+     elif mime_type in ['application/vnd.openxmlformats-officedocument.spreadsheetml.sheet', 'application/vnd.ms-excel']:
+         return _extract_excel_content(content_bytes)
+     elif mime_type == 'application/json':
+         return _extract_json_content(content_bytes)
+     elif mime_type == 'application/zip':
+         return _extract_zip_content(content_bytes)
+     elif mime_type in ['text/plain', 'text/csv']:
+         return _extract_text_content(content_bytes, mime_type)
+     else:
+         # Fallback: try to extract as text
+         return _extract_text_content(content_bytes, 'text/plain')
+
+
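+ # Example usage of the entry point above (illustrative sketch; assumes
+ # Attachment accepts `mime_type` and `data` as keyword arguments, with
+ # `data` holding base64-encoded bytes as decoded above):
+ #
+ #     import asyncio
+ #     import base64
+ #
+ #     attachment = Attachment(
+ #         mime_type='text/csv',
+ #         data=base64.b64encode(b'name,age\nAda,36\n').decode('ascii'),
+ #     )
+ #     doc = asyncio.run(extract_document_content(attachment))
+ #     print(doc.content)
+
+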
+ async def _extract_pdf_content(content_bytes: bytes) -> ProcessedDocument:
+     """Extract content from PDF."""
+     if not HAS_PDF:
+         raise DocumentProcessingError(
+             "PDF processing not available. Install with: pip install 'jaf-py[attachments]'"
+         )
+
+     try:
+         # Run PDF processing in a thread pool to avoid blocking the event loop
+         def _process_pdf() -> ProcessedDocument:
+             reader = PyPDF2.PdfReader(io.BytesIO(content_bytes))
+             text_parts = []
+
+             for page in reader.pages:
+                 text_parts.append(page.extract_text())
+
+             content = '\n'.join(text_parts).strip()
+
+             return ProcessedDocument(
+                 content=content,
+                 metadata={
+                     'pages': len(reader.pages),
+                     'info': dict(reader.metadata) if reader.metadata else None
+                 }
+             )
+
+         return await asyncio.get_running_loop().run_in_executor(None, _process_pdf)
+
+     except Exception as e:
+         raise DocumentProcessingError(f"Failed to extract PDF content: {e}") from e
+
+
+ def _extract_text_content(content_bytes: bytes, mime_type: str) -> ProcessedDocument:
+     """Extract content from text files."""
+     try:
+         content = content_bytes.decode('utf-8').strip()
+
+         if mime_type == 'text/csv':
+             # Parse CSV to provide structured overview
+             try:
+                 csv_reader = csv.DictReader(io.StringIO(content))
+                 rows = list(csv_reader)
+                 columns = csv_reader.fieldnames or []
+
+                 content_lines = content.split('\n')
+                 preview = '\n'.join(content_lines[:MAX_CSV_PREVIEW_ROWS])
+
+                 formatted_content = (
+                     f"CSV File Content:\n"
+                     f"Rows: {len(rows)}, Columns: {len(columns)}\n"
+                     f"Columns: {', '.join(columns)}\n\n"
+                     f"First few rows:\n{preview}"
+                 )
+
+                 return ProcessedDocument(
+                     content=formatted_content,
+                     metadata={
+                         'rows': len(rows),
+                         'columns': len(columns),
+                         'fields': columns
+                     }
+                 )
+             except Exception:
+                 # Fallback to raw text if CSV parsing fails
+                 pass
+
+         return ProcessedDocument(content=content)
+
+     except UnicodeDecodeError as e:
+         raise DocumentProcessingError(f"Failed to decode text content: {e}") from e
+
+
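+ # Illustrative sketch of the CSV overview produced above (the sample
+ # bytes are hypothetical):
+ #
+ #     doc = _extract_text_content(b'name,age\nAda,36\nAlan,41\n', 'text/csv')
+ #     # doc.metadata == {'rows': 2, 'columns': 2, 'fields': ['name', 'age']}
+ #     # doc.content begins with "CSV File Content:\nRows: 2, Columns: 2"
+
+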
+ def _extract_excel_content(content_bytes: bytes) -> ProcessedDocument:
+     """Extract content from Excel files."""
+     if not HAS_EXCEL:
+         raise DocumentProcessingError(
+             "Excel processing not available. Install with: pip install 'jaf-py[attachments]'"
+         )
+
+     # Note: openpyxl reads .xlsx/.xlsm only; legacy .xls payloads routed
+     # here will fail and surface as DocumentProcessingError
+     try:
+         workbook = load_workbook(io.BytesIO(content_bytes), read_only=True)
+         sheet_names = workbook.sheetnames
+
+         content_parts = [f"Excel File Content:\nSheets: {', '.join(sheet_names)}\n"]
+
+         # Extract content from each sheet (limit to avoid overwhelming output)
+         for i, sheet_name in enumerate(sheet_names):
+             if i >= MAX_EXCEL_SHEETS:
+                 break
+
+             worksheet = workbook[sheet_name]
+             content_parts.append(f"\nSheet: {sheet_name}")
+
+             # Extract up to MAX_EXCEL_ROWS_PER_SHEET rows
+             rows_data = []
+             for row_num, row in enumerate(worksheet.iter_rows(values_only=True), 1):
+                 if row_num > MAX_EXCEL_ROWS_PER_SHEET:
+                     break
+                 # Convert row to strings, handling None values
+                 row_strings = [str(cell) if cell is not None else '' for cell in row]
+                 rows_data.append(','.join(row_strings))
+
+             content_parts.append('\n'.join(rows_data))
+
+         content = '\n'.join(content_parts).strip()
+
+         return ProcessedDocument(
+             content=content,
+             metadata={'sheets': sheet_names}
+         )
+
+     except Exception as e:
+         raise DocumentProcessingError(f"Failed to extract Excel content: {e}") from e
+
+
+ def _extract_docx_content(content_bytes: bytes) -> ProcessedDocument:
+     """Extract content from Word documents."""
+     if not HAS_DOCX:
+         raise DocumentProcessingError(
+             "Word document processing not available. Install with: pip install 'jaf-py[attachments]'"
+         )
+
+     try:
+         document = Document(io.BytesIO(content_bytes))
+         paragraphs = [paragraph.text for paragraph in document.paragraphs]
+         content = '\n'.join(paragraphs).strip()
+
+         return ProcessedDocument(
+             content=content,
+             metadata={'paragraphs': len(paragraphs)}
+         )
+
+     except Exception as e:
+         raise DocumentProcessingError(f"Failed to extract DOCX content: {e}") from e
+
+
+ def _extract_json_content(content_bytes: bytes) -> ProcessedDocument:
+     """Extract content from JSON files."""
+     try:
+         json_str = content_bytes.decode('utf-8')
+         json_obj = json.loads(json_str)
+
+         # Pretty print JSON with some metadata
+         formatted_content = f"JSON File Content:\n{json.dumps(json_obj, indent=2)}"
+
+         metadata = {
+             'type': 'array' if isinstance(json_obj, list) else type(json_obj).__name__
+         }
+
+         if isinstance(json_obj, dict):
+             metadata['keys'] = list(json_obj.keys())
+         elif isinstance(json_obj, list):
+             metadata['length'] = len(json_obj)
+
+         return ProcessedDocument(
+             content=formatted_content,
+             metadata=metadata
+         )
+
+     except (UnicodeDecodeError, json.JSONDecodeError):
+         # Fallback to raw text if decoding or JSON parsing fails; decode
+         # with replacement so invalid UTF-8 cannot raise a second time
+         fallback_content = content_bytes.decode('utf-8', errors='replace').strip()
+         return ProcessedDocument(content=fallback_content)
+
+
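+ # Illustrative sketch of the metadata produced above (sample bytes are
+ # hypothetical):
+ #
+ #     doc = _extract_json_content(b'{"name": "Ada", "age": 36}')
+ #     # doc.metadata == {'type': 'dict', 'keys': ['name', 'age']}
+ #
+ #     doc = _extract_json_content(b'[1, 2, 3]')
+ #     # doc.metadata == {'type': 'array', 'length': 3}
+
+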
+ def _extract_zip_content(content_bytes: bytes) -> ProcessedDocument:
+     """Extract file listing from ZIP archives."""
+     try:
+         with zipfile.ZipFile(io.BytesIO(content_bytes), 'r') as zip_file:
+             files = zip_file.namelist()
+
+             content_parts = ['ZIP File Contents:\n']
+             safe_files = []
+
+             # Virtual root for path safety checks; never actually used for extraction
+             virtual_root = Path("/safe_extract_dir")
+
+             for file_name in files:
+                 # Skip empty entries
+                 if not file_name:
+                     continue
+
+                 # Basic security checks
+                 if (file_name.startswith('/') or          # Absolute path
+                         file_name.startswith('\\') or     # Windows absolute path
+                         file_name.startswith('..') or     # Parent directory traversal
+                         '..' in file_name.split('/') or   # Parent directory traversal
+                         '..' in file_name.split('\\') or  # Windows traversal
+                         ':' in file_name or               # Windows drive letter
+                         '\0' in file_name):               # Null byte
+                     # Skip unsafe entries
+                     content_parts.append(f"WARNING: Skipped suspicious path: {file_name[:50]}...")
+                     continue
+
+                 # Normalize path for additional safety check
+                 try:
+                     norm_path = os.path.normpath(file_name)
+                     if norm_path.startswith('..'):
+                         # Skip unsafe entries that normalize to traversal
+                         content_parts.append(f"WARNING: Skipped path traversal attempt: {file_name[:50]}...")
+                         continue
+
+                     # Check if path would escape the virtual root
+                     test_path = virtual_root.joinpath(norm_path).resolve()
+                     if not str(test_path).startswith(str(virtual_root)):
+                         # Skip unsafe entries that would escape extraction root
+                         content_parts.append(f"WARNING: Skipped path traversal attempt: {file_name[:50]}...")
+                         continue
+
+                     # Passed all security checks, add to safe file list
+                     safe_files.append(file_name)
+
+                     # Get file info for display
+                     if file_name.endswith('/'):
+                         content_parts.append(f"DIR: {file_name}")
+                     else:
+                         try:
+                             file_info = zip_file.getinfo(file_name)
+                             size = file_info.file_size
+                             content_parts.append(f"FILE: {file_name} ({size} bytes)")
+                         except KeyError:
+                             content_parts.append(f"FILE: {file_name}")
+                 except Exception:
+                     # Skip any entry that causes normalization errors
+                     content_parts.append(f"WARNING: Skipped invalid path: {file_name[:50]}...")
+                     continue
+
+         content = '\n'.join(content_parts).strip()
+
+         return ProcessedDocument(
+             content=content,
+             metadata={
+                 'files': safe_files,
+                 'total_files': len(safe_files)
+             }
+         )
+
+     except Exception as e:
+         raise DocumentProcessingError(f"Failed to process ZIP file: {e}") from e
+
+
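+ # Illustrative sketch of the traversal filtering above (archive built
+ # in memory for the example):
+ #
+ #     buf = io.BytesIO()
+ #     with zipfile.ZipFile(buf, 'w') as zf:
+ #         zf.writestr('notes.txt', b'hello')
+ #         zf.writestr('../escape.txt', b'nope')
+ #     doc = _extract_zip_content(buf.getvalue())
+ #     # doc.metadata['files'] == ['notes.txt']; the '../' entry appears
+ #     # only as a skipped-path warning in doc.content
+
+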
+ def is_document_supported(mime_type: Optional[str]) -> bool:
+     """
+     Check if a MIME type is supported for content extraction.
+
+     Args:
+         mime_type: MIME type to check
+
+     Returns:
+         True if supported, False otherwise
+     """
+     if not mime_type:
+         return False
+
+     supported_types = [
+         'application/pdf',
+         'text/plain',
+         'text/csv',
+         'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
+         'application/vnd.ms-excel',
+         'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
+         'application/json',
+         'application/zip'
+     ]
+
+     return mime_type.lower() in supported_types
+
+
+ def get_document_description(mime_type: Optional[str]) -> str:
+     """
+     Get a human-readable description of what content will be extracted.
+
+     Args:
+         mime_type: MIME type to describe
+
+     Returns:
+         Human-readable description
+     """
+     if not mime_type:
+         return 'document content'
+
+     descriptions = {
+         'application/pdf': 'PDF text content',
+         'text/plain': 'plain text content',
+         'text/csv': 'CSV data structure and sample rows',
+         'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet': 'Excel spreadsheet data',
+         'application/vnd.ms-excel': 'Excel spreadsheet data',
+         'application/vnd.openxmlformats-officedocument.wordprocessingml.document': 'Word document text content',
+         'application/json': 'JSON data structure',
+         'application/zip': 'ZIP file listing'
+     }
+
+     return descriptions.get(mime_type.lower(), 'document content')
+
+
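+ # Illustrative sketch of the two helpers above:
+ #
+ #     is_document_supported('application/PDF')  # True (case-insensitive)
+ #     is_document_supported('image/png')        # False
+ #     get_document_description('text/csv')      # 'CSV data structure and sample rows'
+
+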
+ def get_missing_dependencies() -> List[str]:
+     """
+     Get list of missing optional dependencies for document processing.
+
+     Returns:
+         List of missing dependency names
+     """
+     missing = []
+
+     if not HAS_PDF:
+         missing.append('PyPDF2 (for PDF processing)')
+     if not HAS_DOCX:
+         missing.append('python-docx (for Word document processing)')
+     if not HAS_EXCEL:
+         missing.append('openpyxl (for Excel processing)')
+     if not HAS_PIL:
+         missing.append('Pillow (for image processing)')
+     if not HAS_MAGIC:
+         missing.append('python-magic (for MIME type detection)')
+     if not HAS_HTTPX:
+         missing.append('httpx (for URL fetching)')
+     if not HAS_AIOFILES:
+         missing.append('aiofiles (for async file operations)')
+
+     return missing
+
+
+ def check_dependencies() -> Dict[str, bool]:
+     """
+     Check availability of optional dependencies.
+
+     Returns:
+         Dictionary mapping dependency names to availability
+     """
+     return {
+         'pdf': HAS_PDF,
+         'docx': HAS_DOCX,
+         'excel': HAS_EXCEL,
+         'image': HAS_PIL,
+         'magic': HAS_MAGIC,
+         'httpx': HAS_HTTPX,
+         'aiofiles': HAS_AIOFILES
+     }
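+
+
+ # Illustrative sketch of a startup dependency report built from the two
+ # helpers above:
+ #
+ #     status = check_dependencies()
+ #     if not all(status.values()):
+ #         print('Missing optional features:')
+ #         for dep in get_missing_dependencies():
+ #             print(f'  - {dep}')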