jaf-py 2.5.10__py3-none-any.whl → 2.5.12__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (92)
  1. jaf/__init__.py +154 -57
  2. jaf/a2a/__init__.py +42 -21
  3. jaf/a2a/agent.py +79 -126
  4. jaf/a2a/agent_card.py +87 -78
  5. jaf/a2a/client.py +30 -66
  6. jaf/a2a/examples/client_example.py +12 -12
  7. jaf/a2a/examples/integration_example.py +38 -47
  8. jaf/a2a/examples/server_example.py +56 -53
  9. jaf/a2a/memory/__init__.py +0 -4
  10. jaf/a2a/memory/cleanup.py +28 -21
  11. jaf/a2a/memory/factory.py +155 -133
  12. jaf/a2a/memory/providers/composite.py +21 -26
  13. jaf/a2a/memory/providers/in_memory.py +89 -83
  14. jaf/a2a/memory/providers/postgres.py +117 -115
  15. jaf/a2a/memory/providers/redis.py +128 -121
  16. jaf/a2a/memory/serialization.py +77 -87
  17. jaf/a2a/memory/tests/run_comprehensive_tests.py +112 -83
  18. jaf/a2a/memory/tests/test_cleanup.py +211 -94
  19. jaf/a2a/memory/tests/test_serialization.py +73 -68
  20. jaf/a2a/memory/tests/test_stress_concurrency.py +186 -133
  21. jaf/a2a/memory/tests/test_task_lifecycle.py +138 -120
  22. jaf/a2a/memory/types.py +91 -53
  23. jaf/a2a/protocol.py +95 -125
  24. jaf/a2a/server.py +90 -118
  25. jaf/a2a/standalone_client.py +30 -43
  26. jaf/a2a/tests/__init__.py +16 -33
  27. jaf/a2a/tests/run_tests.py +17 -53
  28. jaf/a2a/tests/test_agent.py +40 -140
  29. jaf/a2a/tests/test_client.py +54 -117
  30. jaf/a2a/tests/test_integration.py +28 -82
  31. jaf/a2a/tests/test_protocol.py +54 -139
  32. jaf/a2a/tests/test_types.py +50 -136
  33. jaf/a2a/types.py +58 -34
  34. jaf/cli.py +21 -41
  35. jaf/core/__init__.py +7 -1
  36. jaf/core/agent_tool.py +93 -72
  37. jaf/core/analytics.py +257 -207
  38. jaf/core/checkpoint.py +223 -0
  39. jaf/core/composition.py +249 -235
  40. jaf/core/engine.py +817 -519
  41. jaf/core/errors.py +55 -42
  42. jaf/core/guardrails.py +276 -202
  43. jaf/core/handoff.py +47 -31
  44. jaf/core/parallel_agents.py +69 -75
  45. jaf/core/performance.py +75 -73
  46. jaf/core/proxy.py +43 -44
  47. jaf/core/proxy_helpers.py +24 -27
  48. jaf/core/regeneration.py +220 -129
  49. jaf/core/state.py +68 -66
  50. jaf/core/streaming.py +115 -108
  51. jaf/core/tool_results.py +111 -101
  52. jaf/core/tools.py +114 -116
  53. jaf/core/tracing.py +310 -210
  54. jaf/core/types.py +403 -151
  55. jaf/core/workflows.py +209 -168
  56. jaf/exceptions.py +46 -38
  57. jaf/memory/__init__.py +1 -6
  58. jaf/memory/approval_storage.py +54 -77
  59. jaf/memory/factory.py +4 -4
  60. jaf/memory/providers/in_memory.py +216 -180
  61. jaf/memory/providers/postgres.py +216 -146
  62. jaf/memory/providers/redis.py +173 -116
  63. jaf/memory/types.py +70 -51
  64. jaf/memory/utils.py +36 -34
  65. jaf/plugins/__init__.py +12 -12
  66. jaf/plugins/base.py +105 -96
  67. jaf/policies/__init__.py +0 -1
  68. jaf/policies/handoff.py +37 -46
  69. jaf/policies/validation.py +76 -52
  70. jaf/providers/__init__.py +6 -3
  71. jaf/providers/mcp.py +97 -51
  72. jaf/providers/model.py +475 -283
  73. jaf/server/__init__.py +1 -1
  74. jaf/server/main.py +7 -11
  75. jaf/server/server.py +514 -359
  76. jaf/server/types.py +208 -52
  77. jaf/utils/__init__.py +17 -18
  78. jaf/utils/attachments.py +111 -116
  79. jaf/utils/document_processor.py +175 -174
  80. jaf/visualization/__init__.py +1 -1
  81. jaf/visualization/example.py +111 -110
  82. jaf/visualization/functional_core.py +46 -71
  83. jaf/visualization/graphviz.py +154 -189
  84. jaf/visualization/imperative_shell.py +7 -16
  85. jaf/visualization/types.py +8 -4
  86. {jaf_py-2.5.10.dist-info → jaf_py-2.5.12.dist-info}/METADATA +2 -2
  87. jaf_py-2.5.12.dist-info/RECORD +97 -0
  88. jaf_py-2.5.10.dist-info/RECORD +0 -96
  89. {jaf_py-2.5.10.dist-info → jaf_py-2.5.12.dist-info}/WHEEL +0 -0
  90. {jaf_py-2.5.10.dist-info → jaf_py-2.5.12.dist-info}/entry_points.txt +0 -0
  91. {jaf_py-2.5.10.dist-info → jaf_py-2.5.12.dist-info}/licenses/LICENSE +0 -0
  92. {jaf_py-2.5.10.dist-info → jaf_py-2.5.12.dist-info}/top_level.txt +0 -0
jaf/utils/document_processor.py
@@ -15,12 +15,14 @@ from typing import Dict, Any, Optional, List
 
 try:
     import aiofiles
+
     HAS_AIOFILES = True
 except ImportError:
     HAS_AIOFILES = False
 
 try:
     import httpx
+
     HAS_HTTPX = True
 except ImportError:
     HAS_HTTPX = False
@@ -31,30 +33,35 @@ from ..core.types import Attachment
 # Optional imports with graceful fallbacks
 try:
     import PyPDF2
+
     HAS_PDF = True
 except ImportError:
     HAS_PDF = False
 
 try:
     from docx import Document
+
     HAS_DOCX = True
 except ImportError:
     HAS_DOCX = False
 
 try:
     from openpyxl import load_workbook
+
     HAS_EXCEL = True
 except ImportError:
     HAS_EXCEL = False
 
 try:
     import magic
+
     HAS_MAGIC = True
 except ImportError:
     HAS_MAGIC = False
 
 try:
     from PIL import Image
+
     HAS_PIL = True
 except ImportError:
     HAS_PIL = False
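Editor's note: the whitespace changes above touch the module's optional-import pattern, where each third-party dependency is probed at import time and recorded in a HAS_* flag so callers get an actionable error instead of a bare ImportError. A minimal standalone sketch of the same pattern; the helper function name is illustrative, not part of jaf-py:

```python
from typing import Any

try:
    import PyPDF2  # optional; shipped via the 'jaf-py[attachments]' extra

    HAS_PDF = True
except ImportError:
    HAS_PDF = False


def require_pdf_support() -> Any:
    """Return the PyPDF2 module, or raise a helpful error if it is missing."""
    if not HAS_PDF:
        raise RuntimeError(
            "PDF processing not available. Install with: pip install 'jaf-py[attachments]'"
        )
    return PyPDF2
```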
@@ -70,7 +77,7 @@ MAX_EXCEL_ROWS_PER_SHEET = 20
 
 class DocumentProcessingError(Exception):
     """Exception raised when document processing fails."""
-    
+
     def __init__(self, message: str, cause: Optional[Exception] = None):
         super().__init__(message)
         self.cause = cause
@@ -78,7 +85,7 @@ class DocumentProcessingError(Exception):
 
 class NetworkError(Exception):
     """Exception raised when network operations fail."""
-    
+
     def __init__(self, message: str, status_code: Optional[int] = None):
         super().__init__(message)
         self.status_code = status_code
@@ -86,6 +93,7 @@ class NetworkError(Exception):
 
 class ProcessedDocument(BaseModel):
     """Result of document processing."""
+
     content: str
     metadata: Optional[Dict[str, Any]] = None
 
@@ -93,13 +101,13 @@ class ProcessedDocument(BaseModel):
 async def _fetch_url_content(url: str) -> tuple[bytes, Optional[str]]:
     """
     Fetch content from URL and return as bytes with content type.
-    
+
     Args:
         url: URL to fetch
-    
+
     Returns:
         Tuple of (content_bytes, content_type)
-    
+
     Raises:
         NetworkError: If fetch fails
         DocumentProcessingError: If file is too large
@@ -108,20 +116,20 @@ async def _fetch_url_content(url: str) -> tuple[bytes, Optional[str]]:
         raise DocumentProcessingError(
             "URL fetching not available. Install with: pip install 'jaf-py[attachments]'"
         )
-    
+
     try:
         async with httpx.AsyncClient(timeout=FETCH_TIMEOUT) as client:
             # First check content length with a HEAD request if possible
             try:
                 head_response = await client.head(
                     url,
-                    headers={'User-Agent': 'JAF-DocumentProcessor/1.0'},
-                    timeout=FETCH_TIMEOUT / 2  # Shorter timeout for HEAD request
+                    headers={"User-Agent": "JAF-DocumentProcessor/1.0"},
+                    timeout=FETCH_TIMEOUT / 2,  # Shorter timeout for HEAD request
                 )
                 head_response.raise_for_status()
-                
+
                 # Check Content-Length header if present
-                content_length_str = head_response.headers.get('content-length')
+                content_length_str = head_response.headers.get("content-length")
                 if content_length_str and content_length_str.isdigit():
                     content_length = int(content_length_str)
                     if content_length > MAX_DOCUMENT_SIZE:
@@ -133,18 +141,16 @@ async def _fetch_url_content(url: str) -> tuple[bytes, Optional[str]]:
             except (httpx.HTTPStatusError, httpx.RequestError):
                 # HEAD request failed, we'll check size during streaming
                 pass
-        
+
             # Stream the response to validate size as we download
             content_type = None
             accumulated_bytes = bytearray()
             async with client.stream(
-                'GET',
-                url,
-                headers={'User-Agent': 'JAF-DocumentProcessor/1.0'}
+                "GET", url, headers={"User-Agent": "JAF-DocumentProcessor/1.0"}
             ) as response:
                 response.raise_for_status()
-                content_type = response.headers.get('content-type')
-                
+                content_type = response.headers.get("content-type")
+
                 # Process the response in chunks
                 async for chunk in response.aiter_bytes(chunk_size=8192):
                     accumulated_bytes.extend(chunk)
@@ -154,11 +160,13 @@ async def _fetch_url_content(url: str) -> tuple[bytes, Optional[str]]:
                         raise DocumentProcessingError(
                             f"File size ({size_mb}MB) exceeds maximum allowed size ({max_mb}MB)"
                         )
-        
+
         return bytes(accumulated_bytes), content_type
-    
+
     except httpx.HTTPStatusError as e:
-        raise NetworkError(f"HTTP {e.response.status_code}: {e.response.reason_phrase}", e.response.status_code)
+        raise NetworkError(
+            f"HTTP {e.response.status_code}: {e.response.reason_phrase}", e.response.status_code
+        )
     except httpx.RequestError as e:
         raise NetworkError(f"Failed to fetch URL content: {e}", cause=e)
     except Exception as e:
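Editor's note: the reworked `_fetch_url_content` enforces `MAX_DOCUMENT_SIZE` twice, optimistically via the Content-Length header from a HEAD request, then authoritatively while streaming the body. A condensed sketch of the streaming half of that check; the constants and the plain `ValueError` are stand-ins for the module's own limits and `DocumentProcessingError`:

```python
import httpx

MAX_DOCUMENT_SIZE = 50 * 1024 * 1024  # illustrative limit, in bytes
FETCH_TIMEOUT = 30.0


async def fetch_with_size_limit(url: str) -> bytes:
    """Download url, aborting as soon as the body exceeds MAX_DOCUMENT_SIZE."""
    async with httpx.AsyncClient(timeout=FETCH_TIMEOUT) as client:
        buf = bytearray()
        async with client.stream("GET", url) as response:
            response.raise_for_status()
            # Accumulate in chunks so an oversized body is rejected mid-download.
            async for chunk in response.aiter_bytes(chunk_size=8192):
                buf.extend(chunk)
                if len(buf) > MAX_DOCUMENT_SIZE:
                    raise ValueError("response body exceeds MAX_DOCUMENT_SIZE")
        return bytes(buf)
```

Streaming matters here because trusting Content-Length alone would let a server lie about, or simply omit, the size of the payload.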
@@ -171,13 +179,13 @@ async def _fetch_url_content(url: str) -> tuple[bytes, Optional[str]]:
 async def extract_document_content(attachment: Attachment) -> ProcessedDocument:
     """
     Extract text content from various document formats.
-    
+
     Args:
         attachment: Attachment to process
-    
+
     Returns:
         ProcessedDocument with extracted content
-    
+
     Raises:
         DocumentProcessingError: If processing fails
     """
@@ -189,27 +197,30 @@ async def extract_document_content(attachment: Attachment) -> ProcessedDocument:
         content_bytes = base64.b64decode(attachment.data)
         mime_type = attachment.mime_type
     else:
-        raise DocumentProcessingError('No document data or URL provided')
-    
+        raise DocumentProcessingError("No document data or URL provided")
+
     # Normalize MIME type
     mime_type = mime_type.lower() if mime_type else None
-    
+
     # Process based on MIME type
-    if mime_type == 'application/pdf':
+    if mime_type == "application/pdf":
         return await _extract_pdf_content(content_bytes)
-    elif mime_type in ['application/vnd.openxmlformats-officedocument.wordprocessingml.document']:
+    elif mime_type in ["application/vnd.openxmlformats-officedocument.wordprocessingml.document"]:
         return _extract_docx_content(content_bytes)
-    elif mime_type in ['application/vnd.openxmlformats-officedocument.spreadsheetml.sheet', 'application/vnd.ms-excel']:
+    elif mime_type in [
+        "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
+        "application/vnd.ms-excel",
+    ]:
         return _extract_excel_content(content_bytes)
-    elif mime_type == 'application/json':
+    elif mime_type == "application/json":
         return _extract_json_content(content_bytes)
-    elif mime_type == 'application/zip':
+    elif mime_type == "application/zip":
         return _extract_zip_content(content_bytes)
-    elif mime_type in ['text/plain', 'text/csv']:
+    elif mime_type in ["text/plain", "text/csv"]:
         return _extract_text_content(content_bytes, mime_type)
     else:
         # Fallback: try to extract as text
-        return _extract_text_content(content_bytes, 'text/plain')
+        return _extract_text_content(content_bytes, "text/plain")
 
 
 async def _extract_pdf_content(content_bytes: bytes) -> ProcessedDocument:
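Editor's note: the dispatch above is a plain if/elif chain keyed on the normalized MIME type, with plain text as the fallback. For comparison, a table-driven form of the same idea, sometimes preferred as the handler set grows; the stub handlers and table below are illustrative only and not part of jaf-py:

```python
from typing import Callable, Dict, Optional


def handle_json(data: bytes) -> str:
    return "json:" + data.decode("utf-8", errors="replace")


def handle_text(data: bytes) -> str:
    return data.decode("utf-8", errors="replace")


# Table-driven equivalent of the if/elif chain; unknown types fall back to text.
DISPATCH: Dict[str, Callable[[bytes], str]] = {
    "application/json": handle_json,
    "text/plain": handle_text,
    "text/csv": handle_text,
}


def extract(data: bytes, mime_type: Optional[str]) -> str:
    key = mime_type.lower() if mime_type else ""
    return DISPATCH.get(key, handle_text)(data)
```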
@@ -218,28 +229,28 @@ async def _extract_pdf_content(content_bytes: bytes) -> ProcessedDocument:
         raise DocumentProcessingError(
             "PDF processing not available. Install with: pip install 'jaf-py[attachments]'"
         )
-    
+
     try:
         # Run PDF processing in thread pool to avoid blocking
        def _process_pdf() -> ProcessedDocument:
             reader = PyPDF2.PdfReader(io.BytesIO(content_bytes))
             text_parts = []
-            
+
             for page in reader.pages:
                 text_parts.append(page.extract_text())
-            
-            content = '\n'.join(text_parts).strip()
-            
+
+            content = "\n".join(text_parts).strip()
+
             return ProcessedDocument(
                 content=content,
                 metadata={
-                    'pages': len(reader.pages),
-                    'info': dict(reader.metadata) if reader.metadata else None
-                }
+                    "pages": len(reader.pages),
+                    "info": dict(reader.metadata) if reader.metadata else None,
+                },
             )
-        
+
         return await asyncio.get_event_loop().run_in_executor(None, _process_pdf)
-    
+
     except Exception as e:
         raise DocumentProcessingError(f"Failed to extract PDF content: {e}") from e
 
@@ -247,39 +258,35 @@ async def _extract_pdf_content(content_bytes: bytes) -> ProcessedDocument:
 def _extract_text_content(content_bytes: bytes, mime_type: str) -> ProcessedDocument:
     """Extract content from text files."""
     try:
-        content = content_bytes.decode('utf-8').strip()
-        
-        if mime_type == 'text/csv':
+        content = content_bytes.decode("utf-8").strip()
+
+        if mime_type == "text/csv":
             # Parse CSV to provide structured overview
             try:
                 csv_reader = csv.DictReader(io.StringIO(content))
                 rows = list(csv_reader)
                 columns = csv_reader.fieldnames or []
-                
-                content_lines = content.split('\n')
+
+                content_lines = content.split("\n")
                 preview_lines = content_lines[:MAX_CSV_PREVIEW_ROWS]
-                
+
                 formatted_content = (
                     f"CSV File Content:\n"
                     f"Rows: {len(rows)}, Columns: {len(columns)}\n"
                     f"Columns: {', '.join(columns)}\n\n"
                     f"First few rows:\n{chr(10).join(preview_lines)}"
                 )
-                
+
                 return ProcessedDocument(
                     content=formatted_content,
-                    metadata={
-                        'rows': len(rows),
-                        'columns': len(columns),
-                        'fields': columns
-                    }
+                    metadata={"rows": len(rows), "columns": len(columns), "fields": columns},
                 )
             except Exception:
                 # Fallback to raw text if CSV parsing fails
                 pass
-        
+
         return ProcessedDocument(content=content)
-    
+
     except UnicodeDecodeError as e:
         raise DocumentProcessingError(f"Failed to decode text content: {e}") from e
 
@@ -290,39 +297,36 @@ def _extract_excel_content(content_bytes: bytes) -> ProcessedDocument:
         raise DocumentProcessingError(
             "Excel processing not available. Install with: pip install 'jaf-py[attachments]'"
         )
-    
+
     try:
         workbook = load_workbook(io.BytesIO(content_bytes), read_only=True)
         sheet_names = workbook.sheetnames
-        
+
         content_parts = [f"Excel File Content:\nSheets: {', '.join(sheet_names)}\n"]
-        
+
         # Extract content from each sheet (limit to avoid overwhelming output)
         for i, sheet_name in enumerate(sheet_names):
             if i >= MAX_EXCEL_SHEETS:
                 break
-            
+
             worksheet = workbook[sheet_name]
             content_parts.append(f"\nSheet: {sheet_name}")
-            
+
             # Extract up to MAX_EXCEL_ROWS_PER_SHEET rows
             rows_data = []
             for row_num, row in enumerate(worksheet.iter_rows(values_only=True), 1):
                 if row_num > MAX_EXCEL_ROWS_PER_SHEET:
                     break
                 # Convert row to strings, handling None values
-                row_strings = [str(cell) if cell is not None else '' for cell in row]
-                rows_data.append(','.join(row_strings))
-            
-            content_parts.append('\n'.join(rows_data))
-        
-        content = '\n'.join(content_parts).strip()
-        
-        return ProcessedDocument(
-            content=content,
-            metadata={'sheets': sheet_names}
-        )
-    
+                row_strings = [str(cell) if cell is not None else "" for cell in row]
+                rows_data.append(",".join(row_strings))
+
+            content_parts.append("\n".join(rows_data))
+
+        content = "\n".join(content_parts).strip()
+
+        return ProcessedDocument(content=content, metadata={"sheets": sheet_names})
+
     except Exception as e:
         raise DocumentProcessingError(f"Failed to extract Excel content: {e}") from e
 
@@ -333,17 +337,14 @@ def _extract_docx_content(content_bytes: bytes) -> ProcessedDocument:
         raise DocumentProcessingError(
             "Word document processing not available. Install with: pip install 'jaf-py[attachments]'"
         )
-    
+
     try:
         document = Document(io.BytesIO(content_bytes))
         paragraphs = [paragraph.text for paragraph in document.paragraphs]
-        content = '\n'.join(paragraphs).strip()
-        
-        return ProcessedDocument(
-            content=content,
-            metadata={'paragraphs': len(paragraphs)}
-        )
-        
+        content = "\n".join(paragraphs).strip()
+
+        return ProcessedDocument(content=content, metadata={"paragraphs": len(paragraphs)})
+
     except Exception as e:
         raise DocumentProcessingError(f"Failed to extract DOCX content: {e}") from e
 
@@ -351,90 +352,94 @@ def _extract_docx_content(content_bytes: bytes) -> ProcessedDocument:
 def _extract_json_content(content_bytes: bytes) -> ProcessedDocument:
     """Extract content from JSON files."""
     try:
-        json_str = content_bytes.decode('utf-8')
+        json_str = content_bytes.decode("utf-8")
         json_obj = json.loads(json_str)
-        
+
         # Pretty print JSON with some metadata
         formatted_content = f"JSON File Content:\n{json.dumps(json_obj, indent=2)}"
-        
-        metadata = {
-            'type': 'array' if isinstance(json_obj, list) else type(json_obj).__name__
-        }
-        
+
+        metadata = {"type": "array" if isinstance(json_obj, list) else type(json_obj).__name__}
+
         if isinstance(json_obj, dict):
-            metadata['keys'] = list(json_obj.keys())
+            metadata["keys"] = list(json_obj.keys())
         elif isinstance(json_obj, list):
-            metadata['length'] = len(json_obj)
-        
-        return ProcessedDocument(
-            content=formatted_content,
-            metadata=metadata
-        )
-        
+            metadata["length"] = len(json_obj)
+
+        return ProcessedDocument(content=formatted_content, metadata=metadata)
+
     except (UnicodeDecodeError, json.JSONDecodeError):
         # Fallback to raw text if JSON parsing fails
         if isinstance(content_bytes, bytes):
             # If input is bytes, decode with error handling
-            fallback_content = content_bytes.decode('utf-8', errors='replace').strip()
+            fallback_content = content_bytes.decode("utf-8", errors="replace").strip()
         else:
             # If input is already a string (from a previous decode attempt)
             fallback_content = json_str.strip() if isinstance(json_str, str) else str(content_bytes)
-        
+
         return ProcessedDocument(content=fallback_content)
 
 
 def _extract_zip_content(content_bytes: bytes) -> ProcessedDocument:
     """Extract file listing from ZIP archives."""
     try:
-        with zipfile.ZipFile(io.BytesIO(content_bytes), 'r') as zip_file:
+        with zipfile.ZipFile(io.BytesIO(content_bytes), "r") as zip_file:
             files = zip_file.namelist()
-            
-            content_parts = ['ZIP File Contents:\n']
+
+            content_parts = ["ZIP File Contents:\n"]
             safe_files = []
-            
+
             # Create virtual root for path safety checks
             from pathlib import Path
             import os
-            virtual_root = Path("/safe_extract_dir")  # Virtual root never actually used for extraction
-            
+
+            virtual_root = Path(
+                "/safe_extract_dir"
+            )  # Virtual root never actually used for extraction
+
             for file_name in files:
                 # Skip empty entries
                 if not file_name:
                     continue
-                
+
                 # Basic security checks
-                if (file_name.startswith('/') or  # Absolute path
-                    file_name.startswith('\\') or  # Windows absolute path
-                    file_name.startswith('..') or  # Parent directory traversal
-                    '..' in file_name.split('/') or  # Parent directory traversal
-                    '..' in file_name.split('\\') or  # Windows traversal
-                    ':' in file_name or  # Windows drive letter
-                    '\0' in file_name):  # Null byte
+                if (
+                    file_name.startswith("/")  # Absolute path
+                    or file_name.startswith("\\")  # Windows absolute path
+                    or file_name.startswith("..")  # Parent directory traversal
+                    or ".." in file_name.split("/")  # Parent directory traversal
+                    or ".." in file_name.split("\\")  # Windows traversal
+                    or ":" in file_name  # Windows drive letter
+                    or "\0" in file_name
+                ):  # Null byte
                     # Skip unsafe entries
                     content_parts.append(f"WARNING: Skipped suspicious path: {file_name[:50]}...")
                     continue
-                
+
                 # Normalize path for additional safety check
                 try:
                     # Create safe path relative to virtual root
                     norm_path = os.path.normpath(file_name)
-                    if norm_path.startswith('..'):
+                    if norm_path.startswith(".."):
                         # Skip unsafe entries that normalize to traversal
-                        content_parts.append(f"WARNING: Skipped path traversal attempt: {file_name[:50]}...")
+                        content_parts.append(
+                            f"WARNING: Skipped path traversal attempt: {file_name[:50]}..."
+                        )
                         continue
-                    
+
                     # Check if path would escape the virtual root
                     test_path = virtual_root.joinpath(norm_path).resolve()
                     if not str(test_path).startswith(str(virtual_root)):
                         # Skip unsafe entries that would escape extraction root
-                        content_parts.append(f"WARNING: Skipped path traversal attempt: {file_name[:50]}...")
+                        content_parts.append(
+                            f"WARNING: Skipped path traversal attempt: {file_name[:50]}..."
+                        )
                         continue
-                    
+
                     # Passed all security checks, add to safe file list
                     safe_files.append(file_name)
-                    
+
                     # Get file info for display
-                    if file_name.endswith('/'):
+                    if file_name.endswith("/"):
                         content_parts.append(f"DIR: {file_name}")
                     else:
                         try:
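Editor's note: the security checks reformatted above reject absolute paths, drive letters, null bytes, and `..` traversal before a ZIP entry is ever listed, using a virtual root purely for path arithmetic. A compact sketch of the same idea; the function name and constant are illustrative, not jaf-py API:

```python
import os
from pathlib import Path

VIRTUAL_ROOT = Path("/safe_extract_dir")  # never written to; used only for path math


def is_safe_zip_entry(name: str) -> bool:
    """Return False for ZIP entry names that could escape an extraction root."""
    if not name or name.startswith(("/", "\\")) or ":" in name or "\0" in name:
        return False
    # Reject parent-directory components under both path separators.
    if ".." in name.split("/") or ".." in name.split("\\"):
        return False
    norm = os.path.normpath(name)
    if norm.startswith(".."):
        return False
    # Final check: the joined, resolved path must stay under the virtual root.
    resolved = VIRTUAL_ROOT.joinpath(norm).resolve()
    return str(resolved).startswith(str(VIRTUAL_ROOT))
```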
@@ -447,17 +452,13 @@ def _extract_zip_content(content_bytes: bytes) -> ProcessedDocument:
                     # Skip any entry that causes normalization errors
                     content_parts.append(f"WARNING: Skipped invalid path: {file_name[:50]}...")
                     continue
-            
-            content = '\n'.join(content_parts).strip()
-        
+
+        content = "\n".join(content_parts).strip()
+
         return ProcessedDocument(
-            content=content,
-            metadata={
-                'files': safe_files,
-                'total_files': len(safe_files)
-            }
+            content=content, metadata={"files": safe_files, "total_files": len(safe_files)}
         )
-    
+
     except Exception as e:
         raise DocumentProcessingError(f"Failed to process ZIP file: {e}") from e
 
@@ -465,97 +466,97 @@ def _extract_zip_content(content_bytes: bytes) -> ProcessedDocument:
 def is_document_supported(mime_type: Optional[str]) -> bool:
     """
     Check if a MIME type is supported for content extraction.
-    
+
     Args:
         mime_type: MIME type to check
-    
+
     Returns:
         True if supported, False otherwise
     """
     if not mime_type:
         return False
-    
+
     supported_types = [
-        'application/pdf',
-        'text/plain',
-        'text/csv',
-        'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
-        'application/vnd.ms-excel',
-        'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
-        'application/json',
-        'application/zip'
+        "application/pdf",
+        "text/plain",
+        "text/csv",
+        "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
+        "application/vnd.ms-excel",
+        "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
+        "application/json",
+        "application/zip",
     ]
-    
+
     return mime_type.lower() in supported_types
 
 
 def get_document_description(mime_type: Optional[str]) -> str:
     """
     Get a human-readable description of what content will be extracted.
-    
+
     Args:
         mime_type: MIME type to describe
-    
+
     Returns:
         Human-readable description
     """
     if not mime_type:
-        return 'document content'
-    
+        return "document content"
+
     descriptions = {
-        'application/pdf': 'PDF text content',
-        'text/plain': 'plain text content',
-        'text/csv': 'CSV data structure and sample rows',
-        'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet': 'Excel spreadsheet data',
-        'application/vnd.ms-excel': 'Excel spreadsheet data',
-        'application/vnd.openxmlformats-officedocument.wordprocessingml.document': 'Word document text content',
-        'application/json': 'JSON data structure',
-        'application/zip': 'ZIP file listing'
+        "application/pdf": "PDF text content",
+        "text/plain": "plain text content",
+        "text/csv": "CSV data structure and sample rows",
+        "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet": "Excel spreadsheet data",
+        "application/vnd.ms-excel": "Excel spreadsheet data",
+        "application/vnd.openxmlformats-officedocument.wordprocessingml.document": "Word document text content",
+        "application/json": "JSON data structure",
+        "application/zip": "ZIP file listing",
     }
-    
-    return descriptions.get(mime_type.lower(), 'document content')
+
+    return descriptions.get(mime_type.lower(), "document content")
 
 
 def get_missing_dependencies() -> List[str]:
     """
     Get list of missing optional dependencies for document processing.
-    
+
     Returns:
         List of missing dependency names
     """
     missing = []
-    
+
     if not HAS_PDF:
-        missing.append('PyPDF2 (for PDF processing)')
+        missing.append("PyPDF2 (for PDF processing)")
     if not HAS_DOCX:
-        missing.append('python-docx (for Word document processing)')
+        missing.append("python-docx (for Word document processing)")
     if not HAS_EXCEL:
-        missing.append('openpyxl (for Excel processing)')
+        missing.append("openpyxl (for Excel processing)")
     if not HAS_PIL:
-        missing.append('Pillow (for image processing)')
+        missing.append("Pillow (for image processing)")
     if not HAS_MAGIC:
-        missing.append('python-magic (for MIME type detection)')
+        missing.append("python-magic (for MIME type detection)")
     if not HAS_HTTPX:
-        missing.append('httpx (for URL fetching)')
+        missing.append("httpx (for URL fetching)")
     if not HAS_AIOFILES:
-        missing.append('aiofiles (for async file operations)')
-    
+        missing.append("aiofiles (for async file operations)")
+
     return missing
 
 
 def check_dependencies() -> Dict[str, bool]:
     """
     Check availability of optional dependencies.
-    
+
     Returns:
         Dictionary mapping dependency names to availability
     """
     return {
-        'pdf': HAS_PDF,
-        'docx': HAS_DOCX,
-        'excel': HAS_EXCEL,
-        'image': HAS_PIL,
-        'magic': HAS_MAGIC,
-        'httpx': HAS_HTTPX,
-        'aiofiles': HAS_AIOFILES
-    }
+        "pdf": HAS_PDF,
+        "docx": HAS_DOCX,
+        "excel": HAS_EXCEL,
+        "image": HAS_PIL,
+        "magic": HAS_MAGIC,
+        "httpx": HAS_HTTPX,
+        "aiofiles": HAS_AIOFILES,
+    }
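Editor's note: taken together, `check_dependencies` and `get_missing_dependencies` let an application surface which optional extras are installed before accepting uploads. A plausible call site, illustrative rather than taken from the package:

```python
from jaf.utils.document_processor import check_dependencies, get_missing_dependencies

available = check_dependencies()  # e.g. {"pdf": True, "docx": False, ...}
if not all(available.values()):
    for dep in get_missing_dependencies():
        print(f"optional dependency missing: {dep}")
```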