dtSpark 1.0.9__py3-none-any.whl → 1.0.11__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
dtSpark/tools/builtin.py CHANGED
@@ -63,6 +63,20 @@ def get_builtin_tools(config: Optional[Dict[str, Any]] = None) -> List[Dict[str,
63
63
  tools.extend(fs_tools)
64
64
  logging.info(f"Embedded filesystem tools enabled: {len(fs_tools)} tools added")
65
65
 
66
+ # Add document tools if enabled
67
+ doc_config = config.get('embedded_tools', {}).get('documents', {})
68
+ if doc_config.get('enabled', False):
69
+ doc_tools = _get_document_tools(doc_config)
70
+ tools.extend(doc_tools)
71
+ logging.info(f"Embedded document tools enabled: {len(doc_tools)} tools added")
72
+
73
+ # Add archive tools if enabled
74
+ archive_config = config.get('embedded_tools', {}).get('archives', {})
75
+ if archive_config.get('enabled', False):
76
+ archive_tools = _get_archive_tools(archive_config)
77
+ tools.extend(archive_tools)
78
+ logging.info(f"Embedded archive tools enabled: {len(archive_tools)} tools added")
79
+
66
80
  return tools
67
81
 
68
82
 
@@ -99,6 +113,33 @@ def execute_builtin_tool(tool_name: str, tool_input: Dict[str, Any],
99
113
  return _execute_write_file(tool_input, config)
100
114
  elif tool_name == "create_directories":
101
115
  return _execute_create_directories(tool_input, config)
116
+
117
+ # Document tools
118
+ elif tool_name == "get_file_info":
119
+ return _execute_get_file_info(tool_input, config)
120
+ elif tool_name == "read_word_document":
121
+ return _execute_read_word_document(tool_input, config)
122
+ elif tool_name == "read_excel_document":
123
+ return _execute_read_excel_document(tool_input, config)
124
+ elif tool_name == "read_powerpoint_document":
125
+ return _execute_read_powerpoint_document(tool_input, config)
126
+ elif tool_name == "read_pdf_document":
127
+ return _execute_read_pdf_document(tool_input, config)
128
+ elif tool_name == "create_word_document":
129
+ return _execute_create_word_document(tool_input, config)
130
+ elif tool_name == "create_excel_document":
131
+ return _execute_create_excel_document(tool_input, config)
132
+ elif tool_name == "create_powerpoint_document":
133
+ return _execute_create_powerpoint_document(tool_input, config)
134
+
135
+ # Archive tools
136
+ elif tool_name == "list_archive_contents":
137
+ return _execute_list_archive_contents(tool_input, config)
138
+ elif tool_name == "read_archive_file":
139
+ return _execute_read_archive_file(tool_input, config)
140
+ elif tool_name == "extract_archive":
141
+ return _execute_extract_archive(tool_input, config)
142
+
102
143
  else:
103
144
  return {
104
145
  "success": False,
@@ -831,3 +872,1400 @@ def _execute_create_directories(tool_input: Dict[str, Any],
831
872
  except Exception as e:
832
873
  logging.error(f"Error creating directories {dir_path}: {e}")
833
874
  return {"success": False, "error": str(e)}
875
+
876
+
877
+ # ============================================================================
878
+ # Document Tools
879
+ # ============================================================================
880
+
881
def _get_document_tools(doc_config: Dict[str, Any]) -> List[Dict[str, Any]]:
    """
    Get document tool definitions based on configuration.

    Builds the Anthropic-style tool specs (name / description / input_schema)
    for the embedded document tools. Read tools are always returned; the
    create_* tools are appended only when access_mode is 'read_write'.

    Args:
        doc_config: Document tools configuration dictionary

    Returns:
        List of document tool definitions
    """
    # 'read' is the safe default; only an explicit 'read_write' unlocks creation tools.
    access_mode = doc_config.get('access_mode', 'read')
    # allowed_path is interpolated into the descriptions so the model knows the sandbox root.
    allowed_path = doc_config.get('allowed_path', '.')

    # File info tool (always included)
    tools = [
        {
            "name": "get_file_info",
            "description": f"Get detailed file information including type, size, MIME type, and extension. "
                           f"Works for any file within the allowed path ({allowed_path}). "
                           "Useful for determining how to process a file before reading it.",
            "input_schema": {
                "type": "object",
                "properties": {
                    "path": {
                        "type": "string",
                        "description": "Path to the file (relative to allowed path or absolute within allowed path)"
                    }
                },
                "required": ["path"]
            }
        }
    ]

    # Read-only document tools
    tools.extend([
        {
            "name": "read_word_document",
            "description": f"Extract text content from Microsoft Word documents (.docx) within the allowed path ({allowed_path}). "
                           "Returns the document text organised by paragraphs. Also extracts headings and tables if present.",
            "input_schema": {
                "type": "object",
                "properties": {
                    "path": {
                        "type": "string",
                        "description": "Path to the .docx file"
                    },
                    "include_tables": {
                        "type": "boolean",
                        "description": "Include table content in the output",
                        "default": True
                    },
                    "include_headers_footers": {
                        "type": "boolean",
                        "description": "Include header and footer content",
                        "default": False
                    }
                },
                "required": ["path"]
            }
        },
        {
            "name": "read_excel_document",
            "description": f"Extract data from Microsoft Excel documents (.xlsx) within the allowed path ({allowed_path}). "
                           "Returns spreadsheet data as structured JSON. Can read specific sheets or all sheets.",
            "input_schema": {
                "type": "object",
                "properties": {
                    "path": {
                        "type": "string",
                        "description": "Path to the .xlsx file"
                    },
                    "sheet_name": {
                        "type": "string",
                        "description": "Specific sheet name to read. If not provided, reads the active sheet."
                    },
                    "include_all_sheets": {
                        "type": "boolean",
                        "description": "Read all sheets in the workbook",
                        "default": False
                    },
                    "max_rows": {
                        "type": "integer",
                        "description": "Maximum number of rows to read (0 = use config default)",
                        "default": 0
                    }
                },
                "required": ["path"]
            }
        },
        {
            "name": "read_powerpoint_document",
            "description": f"Extract text content from Microsoft PowerPoint documents (.pptx) within the allowed path ({allowed_path}). "
                           "Returns text organised by slide, including titles, body text, and notes.",
            "input_schema": {
                "type": "object",
                "properties": {
                    "path": {
                        "type": "string",
                        "description": "Path to the .pptx file"
                    },
                    "include_notes": {
                        "type": "boolean",
                        "description": "Include speaker notes in the output",
                        "default": True
                    }
                },
                "required": ["path"]
            }
        },
        {
            "name": "read_pdf_document",
            "description": f"Extract text content from PDF documents within the allowed path ({allowed_path}). "
                           "Returns text organised by page. Can extract metadata and specific pages.",
            "input_schema": {
                "type": "object",
                "properties": {
                    "path": {
                        "type": "string",
                        "description": "Path to the .pdf file"
                    },
                    "page_numbers": {
                        "type": "array",
                        "items": {"type": "integer"},
                        "description": "Specific page numbers to extract (1-indexed). If not provided, extracts all pages."
                    },
                    "include_metadata": {
                        "type": "boolean",
                        "description": "Include document metadata (author, title, etc.)",
                        "default": True
                    }
                },
                "required": ["path"]
            }
        }
    ])

    # Write/create tools (only if access_mode is read_write)
    if access_mode == 'read_write':
        tools.extend([
            {
                "name": "create_word_document",
                # NOTE(review): the fragment below is NOT an f-string, so the doubled
                # braces render literally as "{{{{placeholder_name}}}}" in the tool
                # description — confirm whether "{{placeholder_name}}" was intended.
                "description": f"Create a Microsoft Word document (.docx) within the allowed path ({allowed_path}). "
                               "Supports creating from scratch with structured content, or using a template with placeholder replacement. "
                               "When using a template, placeholders in the format {{{{placeholder_name}}}} will be replaced with provided values.",
                "input_schema": {
                    "type": "object",
                    "properties": {
                        "path": {
                            "type": "string",
                            "description": "Path for the output .docx file"
                        },
                        "content": {
                            "type": "object",
                            "description": "Document content structure",
                            "properties": {
                                "title": {"type": "string", "description": "Document title"},
                                "paragraphs": {
                                    "type": "array",
                                    "items": {
                                        "type": "object",
                                        "properties": {
                                            "text": {"type": "string"},
                                            "style": {"type": "string", "description": "Style: Normal, Heading 1, Heading 2, Heading 3, Title"}
                                        }
                                    },
                                    "description": "List of paragraphs with optional styles"
                                }
                            }
                        },
                        "template_path": {
                            "type": "string",
                            "description": "Path to a .docx template file. If provided, placeholders will be replaced."
                        },
                        "placeholders": {
                            "type": "object",
                            "description": "Dictionary of placeholder names to values for template replacement",
                            "additionalProperties": {"type": "string"}
                        }
                    },
                    "required": ["path"]
                }
            },
            {
                "name": "create_excel_document",
                "description": f"Create a Microsoft Excel document (.xlsx) within the allowed path ({allowed_path}). "
                               "Creates spreadsheets from structured data. Supports multiple sheets.",
                "input_schema": {
                    "type": "object",
                    "properties": {
                        "path": {
                            "type": "string",
                            "description": "Path for the output .xlsx file"
                        },
                        "sheets": {
                            "type": "array",
                            "items": {
                                "type": "object",
                                "properties": {
                                    "name": {"type": "string", "description": "Sheet name"},
                                    "headers": {"type": "array", "items": {"type": "string"}, "description": "Column headers"},
                                    "data": {
                                        "type": "array",
                                        "items": {"type": "array"},
                                        "description": "2D array of cell values (rows of columns)"
                                    }
                                },
                                "required": ["name", "data"]
                            },
                            "description": "List of sheets to create"
                        }
                    },
                    "required": ["path", "sheets"]
                }
            },
            {
                "name": "create_powerpoint_document",
                "description": f"Create a Microsoft PowerPoint document (.pptx) within the allowed path ({allowed_path}). "
                               "Creates presentations with title and content slides. Supports templates with placeholder replacement.",
                "input_schema": {
                    "type": "object",
                    "properties": {
                        "path": {
                            "type": "string",
                            "description": "Path for the output .pptx file"
                        },
                        "slides": {
                            "type": "array",
                            "items": {
                                "type": "object",
                                "properties": {
                                    "layout": {
                                        "type": "string",
                                        "description": "Slide layout: title, title_content, content, blank"
                                    },
                                    "title": {"type": "string", "description": "Slide title"},
                                    "content": {
                                        "type": "array",
                                        "items": {"type": "string"},
                                        "description": "Bullet points or paragraphs"
                                    },
                                    "notes": {"type": "string", "description": "Speaker notes"}
                                }
                            },
                            "description": "List of slides to create"
                        },
                        "template_path": {
                            "type": "string",
                            "description": "Path to a .pptx template file"
                        },
                        "placeholders": {
                            "type": "object",
                            "description": "Dictionary of placeholder names to values for template replacement",
                            "additionalProperties": {"type": "string"}
                        }
                    },
                    "required": ["path", "slides"]
                }
            }
        ])

    return tools
1142
+
1143
+
1144
+ def _execute_get_file_info(tool_input: Dict[str, Any],
1145
+ config: Optional[Dict[str, Any]]) -> Dict[str, Any]:
1146
+ """Execute the get_file_info tool."""
1147
+ if not config.get('embedded_tools'):
1148
+ return {"success": False, "error": "Document tools not configured"}
1149
+
1150
+ doc_config = config.get('embedded_tools', {}).get('documents', {})
1151
+ if not doc_config.get('enabled', False):
1152
+ return {"success": False, "error": "Document tools are not enabled"}
1153
+
1154
+ allowed_path = doc_config.get('allowed_path', '.')
1155
+ file_path = tool_input.get('path')
1156
+
1157
+ if not file_path:
1158
+ return {"success": False, "error": "File path is required"}
1159
+
1160
+ validation = _validate_path(file_path, allowed_path)
1161
+ if not validation['valid']:
1162
+ return {"success": False, "error": validation['error']}
1163
+
1164
+ full_path = Path(validation['resolved_path'])
1165
+
1166
+ if not full_path.exists():
1167
+ return {"success": False, "error": f"File does not exist: {file_path}"}
1168
+
1169
+ if not full_path.is_file():
1170
+ return {"success": False, "error": f"Path is not a file: {file_path}"}
1171
+
1172
+ try:
1173
+ import mimetypes
1174
+ stat_info = full_path.stat()
1175
+
1176
+ # Try to get MIME type
1177
+ mime_type, _ = mimetypes.guess_type(str(full_path))
1178
+
1179
+ # Try python-magic for more accurate detection
1180
+ try:
1181
+ import magic
1182
+ mime_type_magic = magic.from_file(str(full_path), mime=True)
1183
+ if mime_type_magic:
1184
+ mime_type = mime_type_magic
1185
+ except ImportError:
1186
+ pass
1187
+ except Exception:
1188
+ pass
1189
+
1190
+ result = {
1191
+ "path": file_path,
1192
+ "full_path": str(full_path),
1193
+ "filename": full_path.name,
1194
+ "extension": full_path.suffix.lower(),
1195
+ "mime_type": mime_type or "application/octet-stream",
1196
+ "size_bytes": stat_info.st_size,
1197
+ "size_human": _format_size(stat_info.st_size),
1198
+ "modified": datetime.fromtimestamp(stat_info.st_mtime).isoformat(),
1199
+ "created": datetime.fromtimestamp(stat_info.st_ctime).isoformat()
1200
+ }
1201
+
1202
+ logging.info(f"Got file info: {file_path}")
1203
+ return {"success": True, "result": result}
1204
+
1205
+ except Exception as e:
1206
+ logging.error(f"Error getting file info {file_path}: {e}")
1207
+ return {"success": False, "error": str(e)}
1208
+
1209
+
1210
+ def _format_size(size_bytes: int) -> str:
1211
+ """Format file size in human-readable format."""
1212
+ for unit in ['B', 'KB', 'MB', 'GB', 'TB']:
1213
+ if size_bytes < 1024.0:
1214
+ return f"{size_bytes:.2f} {unit}"
1215
+ size_bytes /= 1024.0
1216
+ return f"{size_bytes:.2f} PB"
1217
+
1218
+
1219
+ def _execute_read_word_document(tool_input: Dict[str, Any],
1220
+ config: Optional[Dict[str, Any]]) -> Dict[str, Any]:
1221
+ """Execute the read_word_document tool."""
1222
+ if not config.get('embedded_tools'):
1223
+ return {"success": False, "error": "Document tools not configured"}
1224
+
1225
+ doc_config = config.get('embedded_tools', {}).get('documents', {})
1226
+ if not doc_config.get('enabled', False):
1227
+ return {"success": False, "error": "Document tools are not enabled"}
1228
+
1229
+ allowed_path = doc_config.get('allowed_path', '.')
1230
+ max_size_mb = doc_config.get('max_file_size_mb', 50)
1231
+
1232
+ file_path = tool_input.get('path')
1233
+ include_tables = tool_input.get('include_tables', True)
1234
+ include_headers_footers = tool_input.get('include_headers_footers', False)
1235
+
1236
+ if not file_path:
1237
+ return {"success": False, "error": "File path is required"}
1238
+
1239
+ validation = _validate_path(file_path, allowed_path)
1240
+ if not validation['valid']:
1241
+ return {"success": False, "error": validation['error']}
1242
+
1243
+ full_path = Path(validation['resolved_path'])
1244
+
1245
+ if not full_path.exists():
1246
+ return {"success": False, "error": f"File does not exist: {file_path}"}
1247
+
1248
+ if full_path.suffix.lower() != '.docx':
1249
+ return {"success": False, "error": f"File is not a Word document (.docx): {file_path}"}
1250
+
1251
+ if full_path.stat().st_size > max_size_mb * 1024 * 1024:
1252
+ return {"success": False, "error": f"File exceeds maximum size of {max_size_mb} MB"}
1253
+
1254
+ try:
1255
+ from docx import Document
1256
+ doc = Document(str(full_path))
1257
+
1258
+ paragraphs = []
1259
+ for para in doc.paragraphs:
1260
+ if para.text.strip():
1261
+ paragraphs.append({
1262
+ "text": para.text,
1263
+ "style": para.style.name if para.style else "Normal"
1264
+ })
1265
+
1266
+ tables = []
1267
+ if include_tables:
1268
+ for table in doc.tables:
1269
+ table_data = []
1270
+ for row in table.rows:
1271
+ row_data = [cell.text for cell in row.cells]
1272
+ table_data.append(row_data)
1273
+ if table_data:
1274
+ tables.append(table_data)
1275
+
1276
+ headers_footers = []
1277
+ if include_headers_footers:
1278
+ for section in doc.sections:
1279
+ if section.header and section.header.paragraphs:
1280
+ for para in section.header.paragraphs:
1281
+ if para.text.strip():
1282
+ headers_footers.append({"type": "header", "text": para.text})
1283
+ if section.footer and section.footer.paragraphs:
1284
+ for para in section.footer.paragraphs:
1285
+ if para.text.strip():
1286
+ headers_footers.append({"type": "footer", "text": para.text})
1287
+
1288
+ result = {
1289
+ "path": file_path,
1290
+ "full_path": str(full_path),
1291
+ "paragraph_count": len(paragraphs),
1292
+ "paragraphs": paragraphs,
1293
+ "table_count": len(tables),
1294
+ "tables": tables if tables else None,
1295
+ "headers_footers": headers_footers if headers_footers else None
1296
+ }
1297
+
1298
+ logging.info(f"Read Word document: {file_path} ({len(paragraphs)} paragraphs, {len(tables)} tables)")
1299
+ return {"success": True, "result": result}
1300
+
1301
+ except Exception as e:
1302
+ logging.error(f"Error reading Word document {file_path}: {e}")
1303
+ return {"success": False, "error": str(e)}
1304
+
1305
+
1306
+ def _execute_read_excel_document(tool_input: Dict[str, Any],
1307
+ config: Optional[Dict[str, Any]]) -> Dict[str, Any]:
1308
+ """Execute the read_excel_document tool."""
1309
+ if not config.get('embedded_tools'):
1310
+ return {"success": False, "error": "Document tools not configured"}
1311
+
1312
+ doc_config = config.get('embedded_tools', {}).get('documents', {})
1313
+ if not doc_config.get('enabled', False):
1314
+ return {"success": False, "error": "Document tools are not enabled"}
1315
+
1316
+ allowed_path = doc_config.get('allowed_path', '.')
1317
+ max_size_mb = doc_config.get('max_file_size_mb', 50)
1318
+ default_max_rows = doc_config.get('reading', {}).get('max_excel_rows', 10000)
1319
+
1320
+ file_path = tool_input.get('path')
1321
+ sheet_name = tool_input.get('sheet_name')
1322
+ include_all_sheets = tool_input.get('include_all_sheets', False)
1323
+ max_rows = tool_input.get('max_rows', 0) or default_max_rows
1324
+
1325
+ if not file_path:
1326
+ return {"success": False, "error": "File path is required"}
1327
+
1328
+ validation = _validate_path(file_path, allowed_path)
1329
+ if not validation['valid']:
1330
+ return {"success": False, "error": validation['error']}
1331
+
1332
+ full_path = Path(validation['resolved_path'])
1333
+
1334
+ if not full_path.exists():
1335
+ return {"success": False, "error": f"File does not exist: {file_path}"}
1336
+
1337
+ if full_path.suffix.lower() != '.xlsx':
1338
+ return {"success": False, "error": f"File is not an Excel document (.xlsx): {file_path}"}
1339
+
1340
+ if full_path.stat().st_size > max_size_mb * 1024 * 1024:
1341
+ return {"success": False, "error": f"File exceeds maximum size of {max_size_mb} MB"}
1342
+
1343
+ try:
1344
+ from openpyxl import load_workbook
1345
+ wb = load_workbook(str(full_path), read_only=True, data_only=True)
1346
+
1347
+ sheets_data = {}
1348
+ sheet_names = wb.sheetnames
1349
+
1350
+ if include_all_sheets:
1351
+ sheets_to_read = sheet_names
1352
+ elif sheet_name:
1353
+ if sheet_name not in sheet_names:
1354
+ return {"success": False, "error": f"Sheet '{sheet_name}' not found. Available: {sheet_names}"}
1355
+ sheets_to_read = [sheet_name]
1356
+ else:
1357
+ sheets_to_read = [wb.active.title] if wb.active else sheet_names[:1]
1358
+
1359
+ for sname in sheets_to_read:
1360
+ ws = wb[sname]
1361
+ rows = []
1362
+ row_count = 0
1363
+ for row in ws.iter_rows(values_only=True):
1364
+ if row_count >= max_rows:
1365
+ break
1366
+ rows.append(list(row))
1367
+ row_count += 1
1368
+
1369
+ sheets_data[sname] = {
1370
+ "rows": rows,
1371
+ "row_count": len(rows),
1372
+ "truncated": row_count >= max_rows
1373
+ }
1374
+
1375
+ wb.close()
1376
+
1377
+ result = {
1378
+ "path": file_path,
1379
+ "full_path": str(full_path),
1380
+ "sheet_names": sheet_names,
1381
+ "sheets_read": list(sheets_data.keys()),
1382
+ "data": sheets_data,
1383
+ "max_rows_limit": max_rows
1384
+ }
1385
+
1386
+ logging.info(f"Read Excel document: {file_path} ({len(sheets_data)} sheets)")
1387
+ return {"success": True, "result": result}
1388
+
1389
+ except Exception as e:
1390
+ logging.error(f"Error reading Excel document {file_path}: {e}")
1391
+ return {"success": False, "error": str(e)}
1392
+
1393
+
1394
+ def _execute_read_powerpoint_document(tool_input: Dict[str, Any],
1395
+ config: Optional[Dict[str, Any]]) -> Dict[str, Any]:
1396
+ """Execute the read_powerpoint_document tool."""
1397
+ if not config.get('embedded_tools'):
1398
+ return {"success": False, "error": "Document tools not configured"}
1399
+
1400
+ doc_config = config.get('embedded_tools', {}).get('documents', {})
1401
+ if not doc_config.get('enabled', False):
1402
+ return {"success": False, "error": "Document tools are not enabled"}
1403
+
1404
+ allowed_path = doc_config.get('allowed_path', '.')
1405
+ max_size_mb = doc_config.get('max_file_size_mb', 50)
1406
+
1407
+ file_path = tool_input.get('path')
1408
+ include_notes = tool_input.get('include_notes', True)
1409
+
1410
+ if not file_path:
1411
+ return {"success": False, "error": "File path is required"}
1412
+
1413
+ validation = _validate_path(file_path, allowed_path)
1414
+ if not validation['valid']:
1415
+ return {"success": False, "error": validation['error']}
1416
+
1417
+ full_path = Path(validation['resolved_path'])
1418
+
1419
+ if not full_path.exists():
1420
+ return {"success": False, "error": f"File does not exist: {file_path}"}
1421
+
1422
+ if full_path.suffix.lower() != '.pptx':
1423
+ return {"success": False, "error": f"File is not a PowerPoint document (.pptx): {file_path}"}
1424
+
1425
+ if full_path.stat().st_size > max_size_mb * 1024 * 1024:
1426
+ return {"success": False, "error": f"File exceeds maximum size of {max_size_mb} MB"}
1427
+
1428
+ try:
1429
+ from pptx import Presentation
1430
+ prs = Presentation(str(full_path))
1431
+
1432
+ slides = []
1433
+ for idx, slide in enumerate(prs.slides, 1):
1434
+ slide_data = {
1435
+ "slide_number": idx,
1436
+ "title": None,
1437
+ "content": []
1438
+ }
1439
+
1440
+ for shape in slide.shapes:
1441
+ if shape.has_text_frame:
1442
+ for para in shape.text_frame.paragraphs:
1443
+ text = para.text.strip()
1444
+ if text:
1445
+ if shape.is_placeholder and hasattr(shape, 'placeholder_format'):
1446
+ if shape.placeholder_format.type == 1: # Title
1447
+ slide_data["title"] = text
1448
+ else:
1449
+ slide_data["content"].append(text)
1450
+ else:
1451
+ slide_data["content"].append(text)
1452
+
1453
+ if include_notes and slide.has_notes_slide:
1454
+ notes_frame = slide.notes_slide.notes_text_frame
1455
+ if notes_frame:
1456
+ notes_text = notes_frame.text.strip()
1457
+ if notes_text:
1458
+ slide_data["notes"] = notes_text
1459
+
1460
+ slides.append(slide_data)
1461
+
1462
+ result = {
1463
+ "path": file_path,
1464
+ "full_path": str(full_path),
1465
+ "slide_count": len(slides),
1466
+ "slides": slides
1467
+ }
1468
+
1469
+ logging.info(f"Read PowerPoint document: {file_path} ({len(slides)} slides)")
1470
+ return {"success": True, "result": result}
1471
+
1472
+ except Exception as e:
1473
+ logging.error(f"Error reading PowerPoint document {file_path}: {e}")
1474
+ return {"success": False, "error": str(e)}
1475
+
1476
+
1477
+ def _execute_read_pdf_document(tool_input: Dict[str, Any],
1478
+ config: Optional[Dict[str, Any]]) -> Dict[str, Any]:
1479
+ """Execute the read_pdf_document tool."""
1480
+ if not config.get('embedded_tools'):
1481
+ return {"success": False, "error": "Document tools not configured"}
1482
+
1483
+ doc_config = config.get('embedded_tools', {}).get('documents', {})
1484
+ if not doc_config.get('enabled', False):
1485
+ return {"success": False, "error": "Document tools are not enabled"}
1486
+
1487
+ allowed_path = doc_config.get('allowed_path', '.')
1488
+ max_size_mb = doc_config.get('max_file_size_mb', 50)
1489
+ max_pages = doc_config.get('reading', {}).get('max_pdf_pages', 100)
1490
+
1491
+ file_path = tool_input.get('path')
1492
+ page_numbers = tool_input.get('page_numbers')
1493
+ include_metadata = tool_input.get('include_metadata', True)
1494
+
1495
+ if not file_path:
1496
+ return {"success": False, "error": "File path is required"}
1497
+
1498
+ validation = _validate_path(file_path, allowed_path)
1499
+ if not validation['valid']:
1500
+ return {"success": False, "error": validation['error']}
1501
+
1502
+ full_path = Path(validation['resolved_path'])
1503
+
1504
+ if not full_path.exists():
1505
+ return {"success": False, "error": f"File does not exist: {file_path}"}
1506
+
1507
+ if full_path.suffix.lower() != '.pdf':
1508
+ return {"success": False, "error": f"File is not a PDF document (.pdf): {file_path}"}
1509
+
1510
+ if full_path.stat().st_size > max_size_mb * 1024 * 1024:
1511
+ return {"success": False, "error": f"File exceeds maximum size of {max_size_mb} MB"}
1512
+
1513
+ try:
1514
+ import pdfplumber
1515
+
1516
+ pages_data = []
1517
+ metadata = None
1518
+
1519
+ with pdfplumber.open(str(full_path)) as pdf:
1520
+ total_pages = len(pdf.pages)
1521
+
1522
+ if include_metadata:
1523
+ metadata = pdf.metadata
1524
+
1525
+ # Determine which pages to extract
1526
+ if page_numbers:
1527
+ pages_to_read = [p - 1 for p in page_numbers if 0 < p <= total_pages]
1528
+ else:
1529
+ pages_to_read = list(range(min(total_pages, max_pages)))
1530
+
1531
+ for page_idx in pages_to_read:
1532
+ page = pdf.pages[page_idx]
1533
+ text = page.extract_text() or ""
1534
+ pages_data.append({
1535
+ "page_number": page_idx + 1,
1536
+ "text": text,
1537
+ "width": page.width,
1538
+ "height": page.height
1539
+ })
1540
+
1541
+ result = {
1542
+ "path": file_path,
1543
+ "full_path": str(full_path),
1544
+ "total_pages": total_pages,
1545
+ "pages_extracted": len(pages_data),
1546
+ "pages": pages_data,
1547
+ "truncated": len(pages_data) < total_pages and not page_numbers,
1548
+ "metadata": metadata
1549
+ }
1550
+
1551
+ logging.info(f"Read PDF document: {file_path} ({len(pages_data)} pages)")
1552
+ return {"success": True, "result": result}
1553
+
1554
+ except Exception as e:
1555
+ logging.error(f"Error reading PDF document {file_path}: {e}")
1556
+ return {"success": False, "error": str(e)}
1557
+
1558
+
1559
+ def _execute_create_word_document(tool_input: Dict[str, Any],
1560
+ config: Optional[Dict[str, Any]]) -> Dict[str, Any]:
1561
+ """Execute the create_word_document tool."""
1562
+ if not config.get('embedded_tools'):
1563
+ return {"success": False, "error": "Document tools not configured"}
1564
+
1565
+ doc_config = config.get('embedded_tools', {}).get('documents', {})
1566
+ if not doc_config.get('enabled', False):
1567
+ return {"success": False, "error": "Document tools are not enabled"}
1568
+
1569
+ if doc_config.get('access_mode', 'read') != 'read_write':
1570
+ return {"success": False, "error": "Write operations require access_mode: read_write"}
1571
+
1572
+ allowed_path = doc_config.get('allowed_path', '.')
1573
+ templates_path = doc_config.get('creation', {}).get('templates_path')
1574
+
1575
+ file_path = tool_input.get('path')
1576
+ content = tool_input.get('content', {})
1577
+ template_path = tool_input.get('template_path')
1578
+ placeholders = tool_input.get('placeholders', {})
1579
+
1580
+ if not file_path:
1581
+ return {"success": False, "error": "Output file path is required"}
1582
+
1583
+ validation = _validate_path(file_path, allowed_path)
1584
+ if not validation['valid']:
1585
+ return {"success": False, "error": validation['error']}
1586
+
1587
+ full_path = Path(validation['resolved_path'])
1588
+
1589
+ if not full_path.parent.exists():
1590
+ return {"success": False, "error": f"Parent directory does not exist: {full_path.parent}"}
1591
+
1592
+ try:
1593
+ from docx import Document
1594
+
1595
+ # Use template if provided
1596
+ if template_path:
1597
+ # Validate template path
1598
+ if templates_path:
1599
+ template_full = Path(templates_path) / template_path
1600
+ else:
1601
+ template_validation = _validate_path(template_path, allowed_path)
1602
+ if not template_validation['valid']:
1603
+ return {"success": False, "error": f"Template path error: {template_validation['error']}"}
1604
+ template_full = Path(template_validation['resolved_path'])
1605
+
1606
+ if not template_full.exists():
1607
+ return {"success": False, "error": f"Template does not exist: {template_path}"}
1608
+
1609
+ doc = Document(str(template_full))
1610
+
1611
+ # Replace placeholders in paragraphs
1612
+ for para in doc.paragraphs:
1613
+ for key, value in placeholders.items():
1614
+ if f"{{{{{key}}}}}" in para.text:
1615
+ for run in para.runs:
1616
+ run.text = run.text.replace(f"{{{{{key}}}}}", str(value))
1617
+
1618
+ # Replace placeholders in tables
1619
+ for table in doc.tables:
1620
+ for row in table.rows:
1621
+ for cell in row.cells:
1622
+ for key, value in placeholders.items():
1623
+ if f"{{{{{key}}}}}" in cell.text:
1624
+ cell.text = cell.text.replace(f"{{{{{key}}}}}", str(value))
1625
+
1626
+ else:
1627
+ doc = Document()
1628
+
1629
+ # Add title if provided
1630
+ if content.get('title'):
1631
+ doc.add_heading(content['title'], 0)
1632
+
1633
+ # Add paragraphs
1634
+ for para_data in content.get('paragraphs', []):
1635
+ text = para_data.get('text', '')
1636
+ style = para_data.get('style', 'Normal')
1637
+ if style.startswith('Heading'):
1638
+ level = int(style.split()[-1]) if style.split()[-1].isdigit() else 1
1639
+ doc.add_heading(text, level)
1640
+ else:
1641
+ doc.add_paragraph(text, style=style)
1642
+
1643
+ doc.save(str(full_path))
1644
+
1645
+ result = {
1646
+ "path": file_path,
1647
+ "full_path": str(full_path),
1648
+ "size_bytes": full_path.stat().st_size,
1649
+ "used_template": template_path is not None,
1650
+ "placeholders_replaced": list(placeholders.keys()) if placeholders else []
1651
+ }
1652
+
1653
+ logging.info(f"Created Word document: {file_path}")
1654
+ return {"success": True, "result": result}
1655
+
1656
+ except Exception as e:
1657
+ logging.error(f"Error creating Word document {file_path}: {e}")
1658
+ return {"success": False, "error": str(e)}
1659
+
1660
+
1661
+ def _execute_create_excel_document(tool_input: Dict[str, Any],
1662
+ config: Optional[Dict[str, Any]]) -> Dict[str, Any]:
1663
+ """Execute the create_excel_document tool."""
1664
+ if not config.get('embedded_tools'):
1665
+ return {"success": False, "error": "Document tools not configured"}
1666
+
1667
+ doc_config = config.get('embedded_tools', {}).get('documents', {})
1668
+ if not doc_config.get('enabled', False):
1669
+ return {"success": False, "error": "Document tools are not enabled"}
1670
+
1671
+ if doc_config.get('access_mode', 'read') != 'read_write':
1672
+ return {"success": False, "error": "Write operations require access_mode: read_write"}
1673
+
1674
+ allowed_path = doc_config.get('allowed_path', '.')
1675
+
1676
+ file_path = tool_input.get('path')
1677
+ sheets = tool_input.get('sheets', [])
1678
+
1679
+ if not file_path:
1680
+ return {"success": False, "error": "Output file path is required"}
1681
+
1682
+ if not sheets:
1683
+ return {"success": False, "error": "At least one sheet is required"}
1684
+
1685
+ validation = _validate_path(file_path, allowed_path)
1686
+ if not validation['valid']:
1687
+ return {"success": False, "error": validation['error']}
1688
+
1689
+ full_path = Path(validation['resolved_path'])
1690
+
1691
+ if not full_path.parent.exists():
1692
+ return {"success": False, "error": f"Parent directory does not exist: {full_path.parent}"}
1693
+
1694
+ try:
1695
+ from openpyxl import Workbook
1696
+
1697
+ wb = Workbook()
1698
+ # Remove default sheet
1699
+ if 'Sheet' in wb.sheetnames:
1700
+ del wb['Sheet']
1701
+
1702
+ for sheet_data in sheets:
1703
+ sheet_name = sheet_data.get('name', 'Sheet')
1704
+ headers = sheet_data.get('headers', [])
1705
+ data = sheet_data.get('data', [])
1706
+
1707
+ ws = wb.create_sheet(title=sheet_name)
1708
+
1709
+ # Add headers if provided
1710
+ if headers:
1711
+ for col, header in enumerate(headers, 1):
1712
+ ws.cell(row=1, column=col, value=header)
1713
+ start_row = 2
1714
+ else:
1715
+ start_row = 1
1716
+
1717
+ # Add data
1718
+ for row_idx, row_data in enumerate(data, start_row):
1719
+ for col_idx, value in enumerate(row_data, 1):
1720
+ ws.cell(row=row_idx, column=col_idx, value=value)
1721
+
1722
+ wb.save(str(full_path))
1723
+
1724
+ result = {
1725
+ "path": file_path,
1726
+ "full_path": str(full_path),
1727
+ "size_bytes": full_path.stat().st_size,
1728
+ "sheets_created": [s.get('name', 'Sheet') for s in sheets]
1729
+ }
1730
+
1731
+ logging.info(f"Created Excel document: {file_path}")
1732
+ return {"success": True, "result": result}
1733
+
1734
+ except Exception as e:
1735
+ logging.error(f"Error creating Excel document {file_path}: {e}")
1736
+ return {"success": False, "error": str(e)}
1737
+
1738
+
1739
+ def _execute_create_powerpoint_document(tool_input: Dict[str, Any],
1740
+ config: Optional[Dict[str, Any]]) -> Dict[str, Any]:
1741
+ """Execute the create_powerpoint_document tool."""
1742
+ if not config.get('embedded_tools'):
1743
+ return {"success": False, "error": "Document tools not configured"}
1744
+
1745
+ doc_config = config.get('embedded_tools', {}).get('documents', {})
1746
+ if not doc_config.get('enabled', False):
1747
+ return {"success": False, "error": "Document tools are not enabled"}
1748
+
1749
+ if doc_config.get('access_mode', 'read') != 'read_write':
1750
+ return {"success": False, "error": "Write operations require access_mode: read_write"}
1751
+
1752
+ allowed_path = doc_config.get('allowed_path', '.')
1753
+ templates_path = doc_config.get('creation', {}).get('templates_path')
1754
+
1755
+ file_path = tool_input.get('path')
1756
+ slides_data = tool_input.get('slides', [])
1757
+ template_path = tool_input.get('template_path')
1758
+ placeholders = tool_input.get('placeholders', {})
1759
+
1760
+ if not file_path:
1761
+ return {"success": False, "error": "Output file path is required"}
1762
+
1763
+ if not slides_data and not template_path:
1764
+ return {"success": False, "error": "Either slides or template_path is required"}
1765
+
1766
+ validation = _validate_path(file_path, allowed_path)
1767
+ if not validation['valid']:
1768
+ return {"success": False, "error": validation['error']}
1769
+
1770
+ full_path = Path(validation['resolved_path'])
1771
+
1772
+ if not full_path.parent.exists():
1773
+ return {"success": False, "error": f"Parent directory does not exist: {full_path.parent}"}
1774
+
1775
+ try:
1776
+ from pptx import Presentation
1777
+ from pptx.util import Inches, Pt
1778
+
1779
+ # Use template if provided
1780
+ if template_path:
1781
+ if templates_path:
1782
+ template_full = Path(templates_path) / template_path
1783
+ else:
1784
+ template_validation = _validate_path(template_path, allowed_path)
1785
+ if not template_validation['valid']:
1786
+ return {"success": False, "error": f"Template path error: {template_validation['error']}"}
1787
+ template_full = Path(template_validation['resolved_path'])
1788
+
1789
+ if not template_full.exists():
1790
+ return {"success": False, "error": f"Template does not exist: {template_path}"}
1791
+
1792
+ prs = Presentation(str(template_full))
1793
+
1794
+ # Replace placeholders in existing slides
1795
+ for slide in prs.slides:
1796
+ for shape in slide.shapes:
1797
+ if shape.has_text_frame:
1798
+ for para in shape.text_frame.paragraphs:
1799
+ for run in para.runs:
1800
+ for key, value in placeholders.items():
1801
+ if f"{{{{{key}}}}}" in run.text:
1802
+ run.text = run.text.replace(f"{{{{{key}}}}}", str(value))
1803
+ else:
1804
+ prs = Presentation()
1805
+
1806
+ # Add new slides
1807
+ for slide_data in slides_data:
1808
+ layout_name = slide_data.get('layout', 'title_content')
1809
+ title = slide_data.get('title', '')
1810
+ content = slide_data.get('content', [])
1811
+ notes = slide_data.get('notes', '')
1812
+
1813
+ # Map layout names to indices
1814
+ layout_map = {
1815
+ 'title': 0,
1816
+ 'title_content': 1,
1817
+ 'content': 5,
1818
+ 'blank': 6
1819
+ }
1820
+ layout_idx = layout_map.get(layout_name, 1)
1821
+
1822
+ if layout_idx < len(prs.slide_layouts):
1823
+ slide_layout = prs.slide_layouts[layout_idx]
1824
+ else:
1825
+ slide_layout = prs.slide_layouts[0]
1826
+
1827
+ slide = prs.slides.add_slide(slide_layout)
1828
+
1829
+ # Set title
1830
+ if title and slide.shapes.title:
1831
+ slide.shapes.title.text = title
1832
+
1833
+ # Add content
1834
+ if content:
1835
+ for shape in slide.shapes:
1836
+ if shape.has_text_frame and shape != slide.shapes.title:
1837
+ tf = shape.text_frame
1838
+ tf.clear()
1839
+ for i, text in enumerate(content):
1840
+ if i == 0:
1841
+ tf.paragraphs[0].text = text
1842
+ else:
1843
+ p = tf.add_paragraph()
1844
+ p.text = text
1845
+ break
1846
+
1847
+ # Add notes
1848
+ if notes:
1849
+ notes_slide = slide.notes_slide
1850
+ notes_slide.notes_text_frame.text = notes
1851
+
1852
+ prs.save(str(full_path))
1853
+
1854
+ result = {
1855
+ "path": file_path,
1856
+ "full_path": str(full_path),
1857
+ "size_bytes": full_path.stat().st_size,
1858
+ "slides_added": len(slides_data),
1859
+ "used_template": template_path is not None,
1860
+ "placeholders_replaced": list(placeholders.keys()) if placeholders else []
1861
+ }
1862
+
1863
+ logging.info(f"Created PowerPoint document: {file_path}")
1864
+ return {"success": True, "result": result}
1865
+
1866
+ except Exception as e:
1867
+ logging.error(f"Error creating PowerPoint document {file_path}: {e}")
1868
+ return {"success": False, "error": str(e)}
1869
+
1870
+
1871
+ # ============================================================================
1872
+ # Archive Tools
1873
+ # ============================================================================
1874
+
1875
+ def _get_archive_tools(archive_config: Dict[str, Any]) -> List[Dict[str, Any]]:
1876
+ """
1877
+ Get archive tool definitions based on configuration.
1878
+
1879
+ Args:
1880
+ archive_config: Archive tools configuration dictionary
1881
+
1882
+ Returns:
1883
+ List of archive tool definitions
1884
+ """
1885
+ access_mode = archive_config.get('access_mode', 'read')
1886
+ allowed_path = archive_config.get('allowed_path', '.')
1887
+
1888
+ tools = [
1889
+ {
1890
+ "name": "list_archive_contents",
1891
+ "description": f"List the contents of an archive file within the allowed path ({allowed_path}). "
1892
+ "Supports .zip, .tar, .tar.gz, .tgz, and .tar.bz2 files. "
1893
+ "Returns file names, sizes, and modification times.",
1894
+ "input_schema": {
1895
+ "type": "object",
1896
+ "properties": {
1897
+ "path": {
1898
+ "type": "string",
1899
+ "description": "Path to the archive file"
1900
+ }
1901
+ },
1902
+ "required": ["path"]
1903
+ }
1904
+ },
1905
+ {
1906
+ "name": "read_archive_file",
1907
+ "description": f"Read a specific file from within an archive without extracting to disk ({allowed_path}). "
1908
+ "Returns the file content. Text files are returned as strings, binary files as base64.",
1909
+ "input_schema": {
1910
+ "type": "object",
1911
+ "properties": {
1912
+ "archive_path": {
1913
+ "type": "string",
1914
+ "description": "Path to the archive file"
1915
+ },
1916
+ "file_path": {
1917
+ "type": "string",
1918
+ "description": "Path of the file within the archive to read"
1919
+ },
1920
+ "encoding": {
1921
+ "type": "string",
1922
+ "description": "Text encoding to use when reading as text (default: utf-8)",
1923
+ "default": "utf-8"
1924
+ },
1925
+ "as_binary": {
1926
+ "type": "boolean",
1927
+ "description": "Force reading as binary (returns base64)",
1928
+ "default": False
1929
+ }
1930
+ },
1931
+ "required": ["archive_path", "file_path"]
1932
+ }
1933
+ }
1934
+ ]
1935
+
1936
+ # Extract tool (only if access_mode is read_write)
1937
+ if access_mode == 'read_write':
1938
+ tools.append({
1939
+ "name": "extract_archive",
1940
+ "description": f"Extract an archive file to a destination directory within the allowed path ({allowed_path}). "
1941
+ "Supports .zip, .tar, .tar.gz, .tgz, and .tar.bz2 files. "
1942
+ "Can extract all files or specific files only.",
1943
+ "input_schema": {
1944
+ "type": "object",
1945
+ "properties": {
1946
+ "archive_path": {
1947
+ "type": "string",
1948
+ "description": "Path to the archive file"
1949
+ },
1950
+ "destination": {
1951
+ "type": "string",
1952
+ "description": "Destination directory for extracted files"
1953
+ },
1954
+ "files": {
1955
+ "type": "array",
1956
+ "items": {"type": "string"},
1957
+ "description": "Specific files to extract. If not provided, extracts all files."
1958
+ },
1959
+ "overwrite": {
1960
+ "type": "boolean",
1961
+ "description": "Overwrite existing files",
1962
+ "default": False
1963
+ }
1964
+ },
1965
+ "required": ["archive_path", "destination"]
1966
+ }
1967
+ })
1968
+
1969
+ return tools
1970
+
1971
+
1972
+ def _get_archive_type(file_path: Path) -> Optional[str]:
1973
+ """Determine archive type from file extension."""
1974
+ suffix = file_path.suffix.lower()
1975
+ name = file_path.name.lower()
1976
+
1977
+ if suffix == '.zip':
1978
+ return 'zip'
1979
+ elif suffix == '.tar':
1980
+ return 'tar'
1981
+ elif name.endswith('.tar.gz') or suffix == '.tgz':
1982
+ return 'tar.gz'
1983
+ elif name.endswith('.tar.bz2'):
1984
+ return 'tar.bz2'
1985
+ return None
1986
+
1987
+
1988
+ def _execute_list_archive_contents(tool_input: Dict[str, Any],
1989
+ config: Optional[Dict[str, Any]]) -> Dict[str, Any]:
1990
+ """Execute the list_archive_contents tool."""
1991
+ import zipfile
1992
+ import tarfile
1993
+
1994
+ if not config.get('embedded_tools'):
1995
+ return {"success": False, "error": "Archive tools not configured"}
1996
+
1997
+ archive_config = config.get('embedded_tools', {}).get('archives', {})
1998
+ if not archive_config.get('enabled', False):
1999
+ return {"success": False, "error": "Archive tools are not enabled"}
2000
+
2001
+ allowed_path = archive_config.get('allowed_path', '.')
2002
+ max_size_mb = archive_config.get('max_file_size_mb', 100)
2003
+ max_files = archive_config.get('max_files_to_list', 1000)
2004
+
2005
+ file_path = tool_input.get('path')
2006
+
2007
+ if not file_path:
2008
+ return {"success": False, "error": "Archive path is required"}
2009
+
2010
+ validation = _validate_path(file_path, allowed_path)
2011
+ if not validation['valid']:
2012
+ return {"success": False, "error": validation['error']}
2013
+
2014
+ full_path = Path(validation['resolved_path'])
2015
+
2016
+ if not full_path.exists():
2017
+ return {"success": False, "error": f"Archive does not exist: {file_path}"}
2018
+
2019
+ if full_path.stat().st_size > max_size_mb * 1024 * 1024:
2020
+ return {"success": False, "error": f"Archive exceeds maximum size of {max_size_mb} MB"}
2021
+
2022
+ archive_type = _get_archive_type(full_path)
2023
+ if not archive_type:
2024
+ return {"success": False, "error": f"Unsupported archive format: {full_path.suffix}"}
2025
+
2026
+ try:
2027
+ files = []
2028
+
2029
+ if archive_type == 'zip':
2030
+ with zipfile.ZipFile(str(full_path), 'r') as zf:
2031
+ for info in zf.infolist()[:max_files]:
2032
+ files.append({
2033
+ "path": info.filename,
2034
+ "size_bytes": info.file_size,
2035
+ "compressed_size": info.compress_size,
2036
+ "is_directory": info.is_dir(),
2037
+ "modified": datetime(*info.date_time).isoformat() if info.date_time else None
2038
+ })
2039
+ else:
2040
+ mode = 'r:gz' if archive_type == 'tar.gz' else 'r:bz2' if archive_type == 'tar.bz2' else 'r'
2041
+ with tarfile.open(str(full_path), mode) as tf:
2042
+ count = 0
2043
+ for member in tf:
2044
+ if count >= max_files:
2045
+ break
2046
+ files.append({
2047
+ "path": member.name,
2048
+ "size_bytes": member.size,
2049
+ "is_directory": member.isdir(),
2050
+ "modified": datetime.fromtimestamp(member.mtime).isoformat() if member.mtime else None
2051
+ })
2052
+ count += 1
2053
+
2054
+ result = {
2055
+ "path": file_path,
2056
+ "full_path": str(full_path),
2057
+ "archive_type": archive_type,
2058
+ "total_files": len(files),
2059
+ "truncated": len(files) >= max_files,
2060
+ "files": files
2061
+ }
2062
+
2063
+ logging.info(f"Listed archive contents: {file_path} ({len(files)} files)")
2064
+ return {"success": True, "result": result}
2065
+
2066
+ except Exception as e:
2067
+ logging.error(f"Error listing archive {file_path}: {e}")
2068
+ return {"success": False, "error": str(e)}
2069
+
2070
+
2071
+ def _execute_read_archive_file(tool_input: Dict[str, Any],
2072
+ config: Optional[Dict[str, Any]]) -> Dict[str, Any]:
2073
+ """Execute the read_archive_file tool."""
2074
+ import zipfile
2075
+ import tarfile
2076
+
2077
+ if not config.get('embedded_tools'):
2078
+ return {"success": False, "error": "Archive tools not configured"}
2079
+
2080
+ archive_config = config.get('embedded_tools', {}).get('archives', {})
2081
+ if not archive_config.get('enabled', False):
2082
+ return {"success": False, "error": "Archive tools are not enabled"}
2083
+
2084
+ allowed_path = archive_config.get('allowed_path', '.')
2085
+ max_size_mb = archive_config.get('max_file_size_mb', 100)
2086
+
2087
+ archive_path = tool_input.get('archive_path')
2088
+ file_path = tool_input.get('file_path')
2089
+ encoding = tool_input.get('encoding', 'utf-8')
2090
+ as_binary = tool_input.get('as_binary', False)
2091
+
2092
+ if not archive_path:
2093
+ return {"success": False, "error": "Archive path is required"}
2094
+
2095
+ if not file_path:
2096
+ return {"success": False, "error": "File path within archive is required"}
2097
+
2098
+ validation = _validate_path(archive_path, allowed_path)
2099
+ if not validation['valid']:
2100
+ return {"success": False, "error": validation['error']}
2101
+
2102
+ full_path = Path(validation['resolved_path'])
2103
+
2104
+ if not full_path.exists():
2105
+ return {"success": False, "error": f"Archive does not exist: {archive_path}"}
2106
+
2107
+ if full_path.stat().st_size > max_size_mb * 1024 * 1024:
2108
+ return {"success": False, "error": f"Archive exceeds maximum size of {max_size_mb} MB"}
2109
+
2110
+ archive_type = _get_archive_type(full_path)
2111
+ if not archive_type:
2112
+ return {"success": False, "error": f"Unsupported archive format: {full_path.suffix}"}
2113
+
2114
+ try:
2115
+ content = None
2116
+
2117
+ if archive_type == 'zip':
2118
+ with zipfile.ZipFile(str(full_path), 'r') as zf:
2119
+ if file_path not in zf.namelist():
2120
+ return {"success": False, "error": f"File not found in archive: {file_path}"}
2121
+ content = zf.read(file_path)
2122
+ else:
2123
+ mode = 'r:gz' if archive_type == 'tar.gz' else 'r:bz2' if archive_type == 'tar.bz2' else 'r'
2124
+ with tarfile.open(str(full_path), mode) as tf:
2125
+ try:
2126
+ member = tf.getmember(file_path)
2127
+ f = tf.extractfile(member)
2128
+ if f:
2129
+ content = f.read()
2130
+ else:
2131
+ return {"success": False, "error": f"Cannot read directory: {file_path}"}
2132
+ except KeyError:
2133
+ return {"success": False, "error": f"File not found in archive: {file_path}"}
2134
+
2135
+ # Try to decode as text unless binary requested
2136
+ if as_binary:
2137
+ result = {
2138
+ "archive_path": archive_path,
2139
+ "file_path": file_path,
2140
+ "content_base64": base64.b64encode(content).decode('utf-8'),
2141
+ "size_bytes": len(content),
2142
+ "is_binary": True
2143
+ }
2144
+ else:
2145
+ try:
2146
+ text_content = content.decode(encoding)
2147
+ result = {
2148
+ "archive_path": archive_path,
2149
+ "file_path": file_path,
2150
+ "content": text_content,
2151
+ "size_bytes": len(content),
2152
+ "encoding": encoding,
2153
+ "is_binary": False
2154
+ }
2155
+ except UnicodeDecodeError:
2156
+ result = {
2157
+ "archive_path": archive_path,
2158
+ "file_path": file_path,
2159
+ "content_base64": base64.b64encode(content).decode('utf-8'),
2160
+ "size_bytes": len(content),
2161
+ "is_binary": True,
2162
+ "note": f"Could not decode as {encoding}, returned as base64"
2163
+ }
2164
+
2165
+ logging.info(f"Read file from archive: {archive_path}/{file_path}")
2166
+ return {"success": True, "result": result}
2167
+
2168
+ except Exception as e:
2169
+ logging.error(f"Error reading from archive {archive_path}: {e}")
2170
+ return {"success": False, "error": str(e)}
2171
+
2172
+
2173
+ def _execute_extract_archive(tool_input: Dict[str, Any],
2174
+ config: Optional[Dict[str, Any]]) -> Dict[str, Any]:
2175
+ """Execute the extract_archive tool."""
2176
+ import zipfile
2177
+ import tarfile
2178
+
2179
+ if not config.get('embedded_tools'):
2180
+ return {"success": False, "error": "Archive tools not configured"}
2181
+
2182
+ archive_config = config.get('embedded_tools', {}).get('archives', {})
2183
+ if not archive_config.get('enabled', False):
2184
+ return {"success": False, "error": "Archive tools are not enabled"}
2185
+
2186
+ if archive_config.get('access_mode', 'read') != 'read_write':
2187
+ return {"success": False, "error": "Extract operations require access_mode: read_write"}
2188
+
2189
+ allowed_path = archive_config.get('allowed_path', '.')
2190
+ max_size_mb = archive_config.get('max_file_size_mb', 100)
2191
+
2192
+ archive_path = tool_input.get('archive_path')
2193
+ destination = tool_input.get('destination')
2194
+ files_to_extract = tool_input.get('files')
2195
+ overwrite = tool_input.get('overwrite', False)
2196
+
2197
+ if not archive_path:
2198
+ return {"success": False, "error": "Archive path is required"}
2199
+
2200
+ if not destination:
2201
+ return {"success": False, "error": "Destination directory is required"}
2202
+
2203
+ # Validate archive path
2204
+ archive_validation = _validate_path(archive_path, allowed_path)
2205
+ if not archive_validation['valid']:
2206
+ return {"success": False, "error": archive_validation['error']}
2207
+
2208
+ full_archive_path = Path(archive_validation['resolved_path'])
2209
+
2210
+ # Validate destination path
2211
+ dest_validation = _validate_path(destination, allowed_path)
2212
+ if not dest_validation['valid']:
2213
+ return {"success": False, "error": dest_validation['error']}
2214
+
2215
+ full_dest_path = Path(dest_validation['resolved_path'])
2216
+
2217
+ if not full_archive_path.exists():
2218
+ return {"success": False, "error": f"Archive does not exist: {archive_path}"}
2219
+
2220
+ if full_archive_path.stat().st_size > max_size_mb * 1024 * 1024:
2221
+ return {"success": False, "error": f"Archive exceeds maximum size of {max_size_mb} MB"}
2222
+
2223
+ archive_type = _get_archive_type(full_archive_path)
2224
+ if not archive_type:
2225
+ return {"success": False, "error": f"Unsupported archive format: {full_archive_path.suffix}"}
2226
+
2227
+ try:
2228
+ # Create destination directory
2229
+ full_dest_path.mkdir(parents=True, exist_ok=True)
2230
+
2231
+ extracted_files = []
2232
+
2233
+ if archive_type == 'zip':
2234
+ with zipfile.ZipFile(str(full_archive_path), 'r') as zf:
2235
+ members = files_to_extract if files_to_extract else zf.namelist()
2236
+ for member in members:
2237
+ if member in zf.namelist():
2238
+ dest_file = full_dest_path / member
2239
+ if dest_file.exists() and not overwrite:
2240
+ continue
2241
+ zf.extract(member, str(full_dest_path))
2242
+ extracted_files.append(member)
2243
+ else:
2244
+ mode = 'r:gz' if archive_type == 'tar.gz' else 'r:bz2' if archive_type == 'tar.bz2' else 'r'
2245
+ with tarfile.open(str(full_archive_path), mode) as tf:
2246
+ if files_to_extract:
2247
+ members = [tf.getmember(f) for f in files_to_extract if f in tf.getnames()]
2248
+ else:
2249
+ members = tf.getmembers()
2250
+
2251
+ for member in members:
2252
+ dest_file = full_dest_path / member.name
2253
+ if dest_file.exists() and not overwrite:
2254
+ continue
2255
+ tf.extract(member, str(full_dest_path))
2256
+ extracted_files.append(member.name)
2257
+
2258
+ result = {
2259
+ "archive_path": archive_path,
2260
+ "destination": destination,
2261
+ "full_destination": str(full_dest_path),
2262
+ "files_extracted": len(extracted_files),
2263
+ "extracted": extracted_files
2264
+ }
2265
+
2266
+ logging.info(f"Extracted archive: {archive_path} -> {destination} ({len(extracted_files)} files)")
2267
+ return {"success": True, "result": result}
2268
+
2269
+ except Exception as e:
2270
+ logging.error(f"Error extracting archive {archive_path}: {e}")
2271
+ return {"success": False, "error": str(e)}