jaf_py-2.5.10-py3-none-any.whl → jaf_py-2.5.11-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- jaf/__init__.py +154 -57
- jaf/a2a/__init__.py +42 -21
- jaf/a2a/agent.py +79 -126
- jaf/a2a/agent_card.py +87 -78
- jaf/a2a/client.py +30 -66
- jaf/a2a/examples/client_example.py +12 -12
- jaf/a2a/examples/integration_example.py +38 -47
- jaf/a2a/examples/server_example.py +56 -53
- jaf/a2a/memory/__init__.py +0 -4
- jaf/a2a/memory/cleanup.py +28 -21
- jaf/a2a/memory/factory.py +155 -133
- jaf/a2a/memory/providers/composite.py +21 -26
- jaf/a2a/memory/providers/in_memory.py +89 -83
- jaf/a2a/memory/providers/postgres.py +117 -115
- jaf/a2a/memory/providers/redis.py +128 -121
- jaf/a2a/memory/serialization.py +77 -87
- jaf/a2a/memory/tests/run_comprehensive_tests.py +112 -83
- jaf/a2a/memory/tests/test_cleanup.py +211 -94
- jaf/a2a/memory/tests/test_serialization.py +73 -68
- jaf/a2a/memory/tests/test_stress_concurrency.py +186 -133
- jaf/a2a/memory/tests/test_task_lifecycle.py +138 -120
- jaf/a2a/memory/types.py +91 -53
- jaf/a2a/protocol.py +95 -125
- jaf/a2a/server.py +90 -118
- jaf/a2a/standalone_client.py +30 -43
- jaf/a2a/tests/__init__.py +16 -33
- jaf/a2a/tests/run_tests.py +17 -53
- jaf/a2a/tests/test_agent.py +40 -140
- jaf/a2a/tests/test_client.py +54 -117
- jaf/a2a/tests/test_integration.py +28 -82
- jaf/a2a/tests/test_protocol.py +54 -139
- jaf/a2a/tests/test_types.py +50 -136
- jaf/a2a/types.py +58 -34
- jaf/cli.py +21 -41
- jaf/core/__init__.py +7 -1
- jaf/core/agent_tool.py +93 -72
- jaf/core/analytics.py +257 -207
- jaf/core/checkpoint.py +223 -0
- jaf/core/composition.py +249 -235
- jaf/core/engine.py +817 -519
- jaf/core/errors.py +55 -42
- jaf/core/guardrails.py +276 -202
- jaf/core/handoff.py +47 -31
- jaf/core/parallel_agents.py +69 -75
- jaf/core/performance.py +75 -73
- jaf/core/proxy.py +43 -44
- jaf/core/proxy_helpers.py +24 -27
- jaf/core/regeneration.py +220 -129
- jaf/core/state.py +68 -66
- jaf/core/streaming.py +115 -108
- jaf/core/tool_results.py +111 -101
- jaf/core/tools.py +114 -116
- jaf/core/tracing.py +269 -210
- jaf/core/types.py +371 -151
- jaf/core/workflows.py +209 -168
- jaf/exceptions.py +46 -38
- jaf/memory/__init__.py +1 -6
- jaf/memory/approval_storage.py +54 -77
- jaf/memory/factory.py +4 -4
- jaf/memory/providers/in_memory.py +216 -180
- jaf/memory/providers/postgres.py +216 -146
- jaf/memory/providers/redis.py +173 -116
- jaf/memory/types.py +70 -51
- jaf/memory/utils.py +36 -34
- jaf/plugins/__init__.py +12 -12
- jaf/plugins/base.py +105 -96
- jaf/policies/__init__.py +0 -1
- jaf/policies/handoff.py +37 -46
- jaf/policies/validation.py +76 -52
- jaf/providers/__init__.py +6 -3
- jaf/providers/mcp.py +97 -51
- jaf/providers/model.py +360 -279
- jaf/server/__init__.py +1 -1
- jaf/server/main.py +7 -11
- jaf/server/server.py +514 -359
- jaf/server/types.py +208 -52
- jaf/utils/__init__.py +17 -18
- jaf/utils/attachments.py +111 -116
- jaf/utils/document_processor.py +175 -174
- jaf/visualization/__init__.py +1 -1
- jaf/visualization/example.py +111 -110
- jaf/visualization/functional_core.py +46 -71
- jaf/visualization/graphviz.py +154 -189
- jaf/visualization/imperative_shell.py +7 -16
- jaf/visualization/types.py +8 -4
- {jaf_py-2.5.10.dist-info → jaf_py-2.5.11.dist-info}/METADATA +2 -2
- jaf_py-2.5.11.dist-info/RECORD +97 -0
- jaf_py-2.5.10.dist-info/RECORD +0 -96
- {jaf_py-2.5.10.dist-info → jaf_py-2.5.11.dist-info}/WHEEL +0 -0
- {jaf_py-2.5.10.dist-info → jaf_py-2.5.11.dist-info}/entry_points.txt +0 -0
- {jaf_py-2.5.10.dist-info → jaf_py-2.5.11.dist-info}/licenses/LICENSE +0 -0
- {jaf_py-2.5.10.dist-info → jaf_py-2.5.11.dist-info}/top_level.txt +0 -0
jaf/utils/document_processor.py
CHANGED
@@ -15,12 +15,14 @@ from typing import Dict, Any, Optional, List
 
 try:
     import aiofiles
+
     HAS_AIOFILES = True
 except ImportError:
     HAS_AIOFILES = False
 
 try:
     import httpx
+
     HAS_HTTPX = True
 except ImportError:
     HAS_HTTPX = False
@@ -31,30 +33,35 @@ from ..core.types import Attachment
 # Optional imports with graceful fallbacks
 try:
     import PyPDF2
+
     HAS_PDF = True
 except ImportError:
     HAS_PDF = False
 
 try:
     from docx import Document
+
     HAS_DOCX = True
 except ImportError:
     HAS_DOCX = False
 
 try:
     from openpyxl import load_workbook
+
     HAS_EXCEL = True
 except ImportError:
     HAS_EXCEL = False
 
 try:
     import magic
+
     HAS_MAGIC = True
 except ImportError:
     HAS_MAGIC = False
 
 try:
     from PIL import Image
+
     HAS_PIL = True
 except ImportError:
     HAS_PIL = False
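Both hunks apply the same optional-dependency pattern: each import is attempted once at module load, and a HAS_* flag records the outcome so later code can raise an actionable install hint instead of an ImportError at call time. A minimal sketch of the pattern as it appears above; the `extract_pdf_text` helper is hypothetical, not part of jaf-py:

```python
import io

try:
    import PyPDF2

    HAS_PDF = True
except ImportError:
    HAS_PDF = False


def extract_pdf_text(data: bytes) -> str:
    """Hypothetical helper: fail with an install hint when the extra is missing."""
    if not HAS_PDF:
        raise RuntimeError(
            "PDF processing not available. Install with: pip install 'jaf-py[attachments]'"
        )
    reader = PyPDF2.PdfReader(io.BytesIO(data))
    return "\n".join(page.extract_text() for page in reader.pages)
```

The blank lines inserted after each guarded import appear to be formatter-driven (isort/ruff style) and do not change behavior.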
@@ -70,7 +77,7 @@ MAX_EXCEL_ROWS_PER_SHEET = 20
 
 class DocumentProcessingError(Exception):
     """Exception raised when document processing fails."""
-
+
     def __init__(self, message: str, cause: Optional[Exception] = None):
         super().__init__(message)
         self.cause = cause
@@ -78,7 +85,7 @@ class DocumentProcessingError(Exception):
 
 class NetworkError(Exception):
     """Exception raised when network operations fail."""
-
+
     def __init__(self, message: str, status_code: Optional[int] = None):
         super().__init__(message)
         self.status_code = status_code
@@ -86,6 +93,7 @@ class NetworkError(Exception):
 
 class ProcessedDocument(BaseModel):
     """Result of document processing."""
+
     content: str
     metadata: Optional[Dict[str, Any]] = None
 
@@ -93,13 +101,13 @@ class ProcessedDocument(BaseModel):
 async def _fetch_url_content(url: str) -> tuple[bytes, Optional[str]]:
     """
     Fetch content from URL and return as bytes with content type.
-
+
     Args:
         url: URL to fetch
-
+
     Returns:
         Tuple of (content_bytes, content_type)
-
+
     Raises:
         NetworkError: If fetch fails
         DocumentProcessingError: If file is too large
@@ -108,20 +116,20 @@ async def _fetch_url_content(url: str) -> tuple[bytes, Optional[str]]:
         raise DocumentProcessingError(
             "URL fetching not available. Install with: pip install 'jaf-py[attachments]'"
         )
-
+
     try:
         async with httpx.AsyncClient(timeout=FETCH_TIMEOUT) as client:
             # First check content length with a HEAD request if possible
            try:
                 head_response = await client.head(
                     url,
-                    headers={'User-Agent': 'JAF-DocumentProcessor/1.0'},
-                    timeout=FETCH_TIMEOUT / 2  # Shorter timeout for HEAD request
+                    headers={"User-Agent": "JAF-DocumentProcessor/1.0"},
+                    timeout=FETCH_TIMEOUT / 2,  # Shorter timeout for HEAD request
                 )
                 head_response.raise_for_status()
-
+
                 # Check Content-Length header if present
-                content_length_str = head_response.headers.get('content-length')
+                content_length_str = head_response.headers.get("content-length")
                 if content_length_str and content_length_str.isdigit():
                     content_length = int(content_length_str)
                     if content_length > MAX_DOCUMENT_SIZE:
@@ -133,18 +141,16 @@ async def _fetch_url_content(url: str) -> tuple[bytes, Optional[str]]:
             except (httpx.HTTPStatusError, httpx.RequestError):
                 # HEAD request failed, we'll check size during streaming
                 pass
-
+
             # Stream the response to validate size as we download
             content_type = None
             accumulated_bytes = bytearray()
             async with client.stream(
-                'GET',
-                url,
-                headers={'User-Agent': 'JAF-DocumentProcessor/1.0'}
+                "GET", url, headers={"User-Agent": "JAF-DocumentProcessor/1.0"}
             ) as response:
                 response.raise_for_status()
-                content_type = response.headers.get(
-                    'content-type')
+                content_type = response.headers.get("content-type")
+
                 # Process the response in chunks
                 async for chunk in response.aiter_bytes(chunk_size=8192):
                     accumulated_bytes.extend(chunk)
@@ -154,11 +160,13 @@ async def _fetch_url_content(url: str) -> tuple[bytes, Optional[str]]:
                         raise DocumentProcessingError(
                             f"File size ({size_mb}MB) exceeds maximum allowed size ({max_mb}MB)"
                         )
-
+
            return bytes(accumulated_bytes), content_type
-
+
     except httpx.HTTPStatusError as e:
-        raise NetworkError(f"HTTP {e.response.status_code}: {e.response.reason_phrase}", e.response.status_code)
+        raise NetworkError(
+            f"HTTP {e.response.status_code}: {e.response.reason_phrase}", e.response.status_code
+        )
     except httpx.RequestError as e:
         raise NetworkError(f"Failed to fetch URL content: {e}", cause=e)
     except Exception as e:
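The reformatted function enforces MAX_DOCUMENT_SIZE twice: once cheaply via the HEAD request's Content-Length, then authoritatively while streaming the body, since Content-Length can be absent or wrong. A condensed, self-contained sketch of the streaming check; the constant values here are placeholders, not jaf-py's actual settings:

```python
from typing import Optional, Tuple

import httpx

FETCH_TIMEOUT = 30.0  # placeholder value
MAX_DOCUMENT_SIZE = 10 * 1024 * 1024  # placeholder value


async def fetch_limited(url: str) -> Tuple[bytes, Optional[str]]:
    """Download a URL, aborting as soon as the body exceeds the size cap."""
    async with httpx.AsyncClient(timeout=FETCH_TIMEOUT) as client:
        buf = bytearray()
        async with client.stream("GET", url) as response:
            response.raise_for_status()
            content_type = response.headers.get("content-type")
            async for chunk in response.aiter_bytes(chunk_size=8192):
                buf.extend(chunk)
                if len(buf) > MAX_DOCUMENT_SIZE:
                    raise ValueError("download exceeds maximum allowed size")
        return bytes(buf), content_type
```

Counting received bytes in the stream loop is the only reliable guard; the HEAD probe exists purely to fail fast before any download starts.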
@@ -171,13 +179,13 @@ async def _fetch_url_content(url: str) -> tuple[bytes, Optional[str]]:
 async def extract_document_content(attachment: Attachment) -> ProcessedDocument:
     """
     Extract text content from various document formats.
-
+
     Args:
         attachment: Attachment to process
-
+
     Returns:
         ProcessedDocument with extracted content
-
+
     Raises:
         DocumentProcessingError: If processing fails
     """
@@ -189,27 +197,30 @@ async def extract_document_content(attachment: Attachment) -> ProcessedDocument:
         content_bytes = base64.b64decode(attachment.data)
         mime_type = attachment.mime_type
     else:
-        raise DocumentProcessingError(
-            'No document data or URL provided')
+        raise DocumentProcessingError("No document data or URL provided")
+
     # Normalize MIME type
     mime_type = mime_type.lower() if mime_type else None
-
+
     # Process based on MIME type
-    if mime_type == 'application/pdf':
+    if mime_type == "application/pdf":
         return await _extract_pdf_content(content_bytes)
-    elif mime_type in ['application/vnd.openxmlformats-officedocument.wordprocessingml.document']:
+    elif mime_type in ["application/vnd.openxmlformats-officedocument.wordprocessingml.document"]:
         return _extract_docx_content(content_bytes)
-    elif mime_type in ['application/vnd.openxmlformats-officedocument.spreadsheetml.sheet', 'application/vnd.ms-excel']:
+    elif mime_type in [
+        "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
+        "application/vnd.ms-excel",
+    ]:
         return _extract_excel_content(content_bytes)
-    elif mime_type == 'application/json':
+    elif mime_type == "application/json":
         return _extract_json_content(content_bytes)
-    elif mime_type == 'application/zip':
+    elif mime_type == "application/zip":
         return _extract_zip_content(content_bytes)
-    elif mime_type in ['text/plain', 'text/csv']:
+    elif mime_type in ["text/plain", "text/csv"]:
         return _extract_text_content(content_bytes, mime_type)
     else:
         # Fallback: try to extract as text
-        return _extract_text_content(content_bytes, 'text/plain')
+        return _extract_text_content(content_bytes, "text/plain")
 
 
 async def _extract_pdf_content(content_bytes: bytes) -> ProcessedDocument:
@@ -218,28 +229,28 @@ async def _extract_pdf_content(content_bytes: bytes) -> ProcessedDocument:
         raise DocumentProcessingError(
             "PDF processing not available. Install with: pip install 'jaf-py[attachments]'"
         )
-
+
     try:
         # Run PDF processing in thread pool to avoid blocking
         def _process_pdf() -> ProcessedDocument:
             reader = PyPDF2.PdfReader(io.BytesIO(content_bytes))
             text_parts = []
-
+
             for page in reader.pages:
                 text_parts.append(page.extract_text())
-
-            content = '\n'.join(text_parts).strip()
-
+
+            content = "\n".join(text_parts).strip()
+
             return ProcessedDocument(
                 content=content,
                 metadata={
-                    'pages': len(reader.pages),
-                    'info': dict(reader.metadata) if reader.metadata else None
-                }
+                    "pages": len(reader.pages),
+                    "info": dict(reader.metadata) if reader.metadata else None,
+                },
             )
-
+
         return await asyncio.get_event_loop().run_in_executor(None, _process_pdf)
-
+
     except Exception as e:
         raise DocumentProcessingError(f"Failed to extract PDF content: {e}") from e
 
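PyPDF2 parsing is synchronous and CPU-bound, so the hunk wraps it in a closure and hands it to the event loop's default thread pool, keeping the loop responsive. A minimal sketch of that offload pattern; the `_parse` function here is a stand-in, not jaf-py code:

```python
import asyncio


def _parse(data: bytes) -> str:
    # Stand-in for blocking work such as PyPDF2.PdfReader(...)
    return data.decode("utf-8", errors="replace")


async def parse_off_loop(data: bytes) -> str:
    """Run blocking parsing in a worker thread instead of on the event loop."""
    loop = asyncio.get_event_loop()
    return await loop.run_in_executor(None, _parse, data)
```

On Python 3.9+ the same offload can be written more directly as `await asyncio.to_thread(_parse, data)`.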
@@ -247,39 +258,35 @@ async def _extract_pdf_content(content_bytes: bytes) -> ProcessedDocument:
 def _extract_text_content(content_bytes: bytes, mime_type: str) -> ProcessedDocument:
     """Extract content from text files."""
     try:
-        content = content_bytes.decode('utf-8').strip()
-
-        if mime_type == 'text/csv':
+        content = content_bytes.decode("utf-8").strip()
+
+        if mime_type == "text/csv":
             # Parse CSV to provide structured overview
             try:
                 csv_reader = csv.DictReader(io.StringIO(content))
                 rows = list(csv_reader)
                 columns = csv_reader.fieldnames or []
-
-                content_lines = content.split('\n')
+
+                content_lines = content.split("\n")
                 preview_lines = content_lines[:MAX_CSV_PREVIEW_ROWS]
-
+
                 formatted_content = (
                     f"CSV File Content:\n"
                     f"Rows: {len(rows)}, Columns: {len(columns)}\n"
                     f"Columns: {', '.join(columns)}\n\n"
                     f"First few rows:\n{chr(10).join(preview_lines)}"
                 )
-
+
                 return ProcessedDocument(
                     content=formatted_content,
-                    metadata={
-                        'rows': len(rows),
-                        'columns': len(columns),
-                        'fields': columns
-                    }
+                    metadata={"rows": len(rows), "columns": len(columns), "fields": columns},
                 )
             except Exception:
                 # Fallback to raw text if CSV parsing fails
                 pass
-
+
         return ProcessedDocument(content=content)
-
+
     except UnicodeDecodeError as e:
         raise DocumentProcessingError(f"Failed to decode text content: {e}") from e
 
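For CSV the function does not return the raw text: it parses with csv.DictReader so the model-facing string leads with row/column counts and a short preview. Roughly, as a standalone sketch; the helper name and preview limit are illustrative:

```python
import csv
import io


def preview_csv(text: str, max_preview_rows: int = 5) -> str:
    """Illustrative helper: summarize a CSV body the way the hunk above does."""
    reader = csv.DictReader(io.StringIO(text))
    rows = list(reader)
    columns = reader.fieldnames or []
    preview = "\n".join(text.split("\n")[:max_preview_rows])
    return (
        f"CSV File Content:\n"
        f"Rows: {len(rows)}, Columns: {len(columns)}\n"
        f"Columns: {', '.join(columns)}\n\n"
        f"First few rows:\n{preview}"
    )
```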
@@ -290,39 +297,36 @@ def _extract_excel_content(content_bytes: bytes) -> ProcessedDocument:
         raise DocumentProcessingError(
             "Excel processing not available. Install with: pip install 'jaf-py[attachments]'"
         )
-
+
     try:
         workbook = load_workbook(io.BytesIO(content_bytes), read_only=True)
         sheet_names = workbook.sheetnames
-
+
         content_parts = [f"Excel File Content:\nSheets: {', '.join(sheet_names)}\n"]
-
+
         # Extract content from each sheet (limit to avoid overwhelming output)
         for i, sheet_name in enumerate(sheet_names):
             if i >= MAX_EXCEL_SHEETS:
                 break
-
+
             worksheet = workbook[sheet_name]
             content_parts.append(f"\nSheet: {sheet_name}")
-
+
             # Extract up to MAX_EXCEL_ROWS_PER_SHEET rows
             rows_data = []
             for row_num, row in enumerate(worksheet.iter_rows(values_only=True), 1):
                 if row_num > MAX_EXCEL_ROWS_PER_SHEET:
                     break
                 # Convert row to strings, handling None values
-                row_strings = [str(cell) if cell is not None else '' for cell in row]
-                rows_data.append(','.join(row_strings))
-
-            content_parts.append('\n'.join(rows_data))
-
-        content = '\n'.join(content_parts).strip()
-
-        return ProcessedDocument(
-            content=content,
-            metadata={'sheets': sheet_names}
-        )
-
+                row_strings = [str(cell) if cell is not None else "" for cell in row]
+                rows_data.append(",".join(row_strings))
+
+            content_parts.append("\n".join(rows_data))
+
+        content = "\n".join(content_parts).strip()
+
+        return ProcessedDocument(content=content, metadata={"sheets": sheet_names})
+
     except Exception as e:
         raise DocumentProcessingError(f"Failed to extract Excel content: {e}") from e
 
@@ -333,17 +337,14 @@ def _extract_docx_content(content_bytes: bytes) -> ProcessedDocument:
         raise DocumentProcessingError(
             "Word document processing not available. Install with: pip install 'jaf-py[attachments]'"
         )
-
+
     try:
         document = Document(io.BytesIO(content_bytes))
         paragraphs = [paragraph.text for paragraph in document.paragraphs]
-        content = '\n'.join(paragraphs).strip()
-
-        return ProcessedDocument(
-            content=content,
-            metadata={'paragraphs': len(paragraphs)}
-        )
-
+        content = "\n".join(paragraphs).strip()
+
+        return ProcessedDocument(content=content, metadata={"paragraphs": len(paragraphs)})
+
     except Exception as e:
         raise DocumentProcessingError(f"Failed to extract DOCX content: {e}") from e
 
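The Excel path opens the workbook with read_only=True (streamed access, lower memory on large files) and truncates output at MAX_EXCEL_SHEETS and MAX_EXCEL_ROWS_PER_SHEET. A stripped-down sketch of the same iteration; the helper name and default limits are illustrative, not jaf-py's constants:

```python
import io

from openpyxl import load_workbook


def preview_workbook(data: bytes, max_sheets: int = 5, max_rows: int = 20) -> str:
    """Illustrative helper: dump the first rows of the first few sheets as CSV-ish text."""
    workbook = load_workbook(io.BytesIO(data), read_only=True)
    parts = []
    for sheet_name in workbook.sheetnames[:max_sheets]:
        parts.append(f"Sheet: {sheet_name}")
        for row_num, row in enumerate(workbook[sheet_name].iter_rows(values_only=True), 1):
            if row_num > max_rows:
                break
            parts.append(",".join("" if cell is None else str(cell) for cell in row))
    return "\n".join(parts)
```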
@@ -351,90 +352,94 @@ def _extract_docx_content(content_bytes: bytes) -> ProcessedDocument:
 def _extract_json_content(content_bytes: bytes) -> ProcessedDocument:
     """Extract content from JSON files."""
     try:
-        json_str = content_bytes.decode('utf-8')
+        json_str = content_bytes.decode("utf-8")
         json_obj = json.loads(json_str)
-
+
         # Pretty print JSON with some metadata
         formatted_content = f"JSON File Content:\n{json.dumps(json_obj, indent=2)}"
-
-        metadata = {
-            'type': 'array' if isinstance(json_obj, list) else type(json_obj).__name__
-        }
-
+
+        metadata = {"type": "array" if isinstance(json_obj, list) else type(json_obj).__name__}
+
         if isinstance(json_obj, dict):
-            metadata['keys'] = list(json_obj.keys())
+            metadata["keys"] = list(json_obj.keys())
         elif isinstance(json_obj, list):
-            metadata['length'] = len(json_obj)
-
-        return ProcessedDocument(
-            content=formatted_content,
-            metadata=metadata
-        )
-
+            metadata["length"] = len(json_obj)
+
+        return ProcessedDocument(content=formatted_content, metadata=metadata)
+
     except (UnicodeDecodeError, json.JSONDecodeError):
         # Fallback to raw text if JSON parsing fails
         if isinstance(content_bytes, bytes):
             # If input is bytes, decode with error handling
-            fallback_content = content_bytes.decode('utf-8', errors='replace').strip()
+            fallback_content = content_bytes.decode("utf-8", errors="replace").strip()
         else:
             # If input is already a string (from a previous decode attempt)
             fallback_content = json_str.strip() if isinstance(json_str, str) else str(content_bytes)
-
+
         return ProcessedDocument(content=fallback_content)
 
 
 def _extract_zip_content(content_bytes: bytes) -> ProcessedDocument:
     """Extract file listing from ZIP archives."""
     try:
-        with zipfile.ZipFile(io.BytesIO(content_bytes), 'r') as zip_file:
+        with zipfile.ZipFile(io.BytesIO(content_bytes), "r") as zip_file:
             files = zip_file.namelist()
-
-            content_parts = ['ZIP File Contents:\n']
+
+            content_parts = ["ZIP File Contents:\n"]
             safe_files = []
-
+
             # Create virtual root for path safety checks
             from pathlib import Path
             import os
-
-            virtual_root = Path('/safe_extract_dir')  # Virtual root never actually used for extraction
+
+            virtual_root = Path(
+                "/safe_extract_dir"
+            )  # Virtual root never actually used for extraction
+
             for file_name in files:
                 # Skip empty entries
                 if not file_name:
                     continue
-
+
                 # Basic security checks
-                if (
-                    file_name.startswith('/') or  # Absolute path
-                    file_name.startswith('\\') or  # Windows absolute path
-                    '..' in file_name.split('/') or  # Parent directory traversal
-                    '..' in file_name.split('\\') or  # Windows traversal
-                    ':' in file_name or  # Windows drive letter
-                    '\0' in file_name):  # Null byte
+                if (
+                    file_name.startswith("/")  # Absolute path
+                    or file_name.startswith("\\")  # Windows absolute path
+                    or file_name.startswith("..")  # Parent directory traversal
+                    or ".." in file_name.split("/")  # Parent directory traversal
+                    or ".." in file_name.split("\\")  # Windows traversal
+                    or ":" in file_name  # Windows drive letter
+                    or "\0" in file_name
+                ):  # Null byte
                     # Skip unsafe entries
                     content_parts.append(f"WARNING: Skipped suspicious path: {file_name[:50]}...")
                     continue
-
+
                 # Normalize path for additional safety check
                 try:
                     # Create safe path relative to virtual root
                     norm_path = os.path.normpath(file_name)
-                    if norm_path.startswith('..'):
+                    if norm_path.startswith(".."):
                         # Skip unsafe entries that normalize to traversal
-                        content_parts.append(f"WARNING: Skipped path traversal attempt: {file_name[:50]}...")
+                        content_parts.append(
+                            f"WARNING: Skipped path traversal attempt: {file_name[:50]}..."
+                        )
                         continue
-
+
                     # Check if path would escape the virtual root
                     test_path = virtual_root.joinpath(norm_path).resolve()
                     if not str(test_path).startswith(str(virtual_root)):
                         # Skip unsafe entries that would escape extraction root
-                        content_parts.append(f"WARNING: Skipped path traversal attempt: {file_name[:50]}...")
+                        content_parts.append(
+                            f"WARNING: Skipped path traversal attempt: {file_name[:50]}..."
+                        )
                         continue
-
+
                     # Passed all security checks, add to safe file list
                     safe_files.append(file_name)
-
+
                     # Get file info for display
-                    if file_name.endswith('/'):
+                    if file_name.endswith("/"):
                         content_parts.append(f"DIR: {file_name}")
                     else:
                         try:
@@ -447,17 +452,13 @@ def _extract_zip_content(content_bytes: bytes) -> ProcessedDocument:
                     # Skip any entry that causes normalization errors
                     content_parts.append(f"WARNING: Skipped invalid path: {file_name[:50]}...")
                     continue
-
-            content = '\n'.join(content_parts).strip()
-
+
+            content = "\n".join(content_parts).strip()
+
             return ProcessedDocument(
-                content=content,
-                metadata={
-                    'files': safe_files,
-                    'total_files': len(safe_files)
-                }
+                content=content, metadata={"files": safe_files, "total_files": len(safe_files)}
             )
-
+
     except Exception as e:
         raise DocumentProcessingError(f"Failed to process ZIP file: {e}") from e
 
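The ZIP handler never extracts anything; it only lists entries, screening each name against absolute paths, `..` traversal (in both path separators), Windows drive letters, and NUL bytes, then double-checking with normpath and a resolve() against a virtual root. The per-entry predicate, pulled out as a standalone sketch; the function name is illustrative:

```python
import os
from pathlib import Path

VIRTUAL_ROOT = Path("/safe_extract_dir")  # never used for real extraction


def is_safe_member(name: str) -> bool:
    """Illustrative: mirror the per-entry checks applied in the hunk above."""
    if not name:
        return False
    if name.startswith(("/", "\\", "..")) or ":" in name or "\0" in name:
        return False
    if ".." in name.split("/") or ".." in name.split("\\"):
        return False
    norm = os.path.normpath(name)
    if norm.startswith(".."):
        return False
    resolved = VIRTUAL_ROOT.joinpath(norm).resolve()
    return str(resolved).startswith(str(VIRTUAL_ROOT))
```

With a predicate like this, the safe listing reduces to something like `[n for n in zip_file.namelist() if is_safe_member(n)]`, which is what the loop above builds into `safe_files`.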
@@ -465,97 +466,97 @@ def _extract_zip_content(content_bytes: bytes) -> ProcessedDocument:
 def is_document_supported(mime_type: Optional[str]) -> bool:
     """
     Check if a MIME type is supported for content extraction.
-
+
     Args:
         mime_type: MIME type to check
-
+
     Returns:
         True if supported, False otherwise
     """
     if not mime_type:
         return False
-
+
     supported_types = [
-        'application/pdf',
-        'text/plain',
-        'text/csv',
-        'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
-        'application/vnd.ms-excel',
-        'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
-        'application/json',
-        'application/zip'
+        "application/pdf",
+        "text/plain",
+        "text/csv",
+        "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
+        "application/vnd.ms-excel",
+        "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
+        "application/json",
+        "application/zip",
     ]
-
+
     return mime_type.lower() in supported_types
 
 
 def get_document_description(mime_type: Optional[str]) -> str:
     """
     Get a human-readable description of what content will be extracted.
-
+
     Args:
         mime_type: MIME type to describe
-
+
     Returns:
         Human-readable description
     """
     if not mime_type:
-        return 'document content'
-
+        return "document content"
+
     descriptions = {
-        'application/pdf': 'PDF text content',
-        'text/plain': 'plain text content',
-        'text/csv': 'CSV data structure and sample rows',
-        'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet': 'Excel spreadsheet data',
-        'application/vnd.ms-excel': 'Excel spreadsheet data',
-        'application/vnd.openxmlformats-officedocument.wordprocessingml.document': 'Word document text content',
-        'application/json': 'JSON data structure',
-        'application/zip': 'ZIP file listing'
+        "application/pdf": "PDF text content",
+        "text/plain": "plain text content",
+        "text/csv": "CSV data structure and sample rows",
+        "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet": "Excel spreadsheet data",
+        "application/vnd.ms-excel": "Excel spreadsheet data",
+        "application/vnd.openxmlformats-officedocument.wordprocessingml.document": "Word document text content",
+        "application/json": "JSON data structure",
+        "application/zip": "ZIP file listing",
     }
-
-    return descriptions.get(mime_type.lower(), 'document content')
+
+    return descriptions.get(mime_type.lower(), "document content")
 
 
 def get_missing_dependencies() -> List[str]:
     """
     Get list of missing optional dependencies for document processing.
-
+
     Returns:
         List of missing dependency names
     """
     missing = []
-
+
     if not HAS_PDF:
-        missing.append('PyPDF2 (for PDF processing)')
+        missing.append("PyPDF2 (for PDF processing)")
     if not HAS_DOCX:
-        missing.append('python-docx (for Word document processing)')
+        missing.append("python-docx (for Word document processing)")
     if not HAS_EXCEL:
-        missing.append('openpyxl (for Excel processing)')
+        missing.append("openpyxl (for Excel processing)")
     if not HAS_PIL:
-        missing.append('Pillow (for image processing)')
+        missing.append("Pillow (for image processing)")
     if not HAS_MAGIC:
-        missing.append('python-magic (for MIME type detection)')
+        missing.append("python-magic (for MIME type detection)")
     if not HAS_HTTPX:
-        missing.append('httpx (for URL fetching)')
+        missing.append("httpx (for URL fetching)")
     if not HAS_AIOFILES:
-        missing.append('aiofiles (for async file operations)')
-
+        missing.append("aiofiles (for async file operations)")
+
     return missing
 
 
 def check_dependencies() -> Dict[str, bool]:
     """
     Check availability of optional dependencies.
-
+
     Returns:
         Dictionary mapping dependency names to availability
     """
     return {
-        'pdf': HAS_PDF,
-        'docx': HAS_DOCX,
-        'excel': HAS_EXCEL,
-        'image': HAS_PIL,
-        'magic': HAS_MAGIC,
-        'httpx': HAS_HTTPX,
-        'aiofiles': HAS_AIOFILES
-    }
+        "pdf": HAS_PDF,
+        "docx": HAS_DOCX,
+        "excel": HAS_EXCEL,
+        "image": HAS_PIL,
+        "magic": HAS_MAGIC,
+        "httpx": HAS_HTTPX,
+        "aiofiles": HAS_AIOFILES,
+    }