aiecs-1.5.1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (302)
  1. aiecs/__init__.py +72 -0
  2. aiecs/__main__.py +41 -0
  3. aiecs/aiecs_client.py +469 -0
  4. aiecs/application/__init__.py +10 -0
  5. aiecs/application/executors/__init__.py +10 -0
  6. aiecs/application/executors/operation_executor.py +363 -0
  7. aiecs/application/knowledge_graph/__init__.py +7 -0
  8. aiecs/application/knowledge_graph/builder/__init__.py +37 -0
  9. aiecs/application/knowledge_graph/builder/document_builder.py +375 -0
  10. aiecs/application/knowledge_graph/builder/graph_builder.py +356 -0
  11. aiecs/application/knowledge_graph/builder/schema_mapping.py +531 -0
  12. aiecs/application/knowledge_graph/builder/structured_pipeline.py +443 -0
  13. aiecs/application/knowledge_graph/builder/text_chunker.py +319 -0
  14. aiecs/application/knowledge_graph/extractors/__init__.py +27 -0
  15. aiecs/application/knowledge_graph/extractors/base.py +100 -0
  16. aiecs/application/knowledge_graph/extractors/llm_entity_extractor.py +327 -0
  17. aiecs/application/knowledge_graph/extractors/llm_relation_extractor.py +349 -0
  18. aiecs/application/knowledge_graph/extractors/ner_entity_extractor.py +244 -0
  19. aiecs/application/knowledge_graph/fusion/__init__.py +23 -0
  20. aiecs/application/knowledge_graph/fusion/entity_deduplicator.py +387 -0
  21. aiecs/application/knowledge_graph/fusion/entity_linker.py +343 -0
  22. aiecs/application/knowledge_graph/fusion/knowledge_fusion.py +580 -0
  23. aiecs/application/knowledge_graph/fusion/relation_deduplicator.py +189 -0
  24. aiecs/application/knowledge_graph/pattern_matching/__init__.py +21 -0
  25. aiecs/application/knowledge_graph/pattern_matching/pattern_matcher.py +344 -0
  26. aiecs/application/knowledge_graph/pattern_matching/query_executor.py +378 -0
  27. aiecs/application/knowledge_graph/profiling/__init__.py +12 -0
  28. aiecs/application/knowledge_graph/profiling/query_plan_visualizer.py +199 -0
  29. aiecs/application/knowledge_graph/profiling/query_profiler.py +223 -0
  30. aiecs/application/knowledge_graph/reasoning/__init__.py +27 -0
  31. aiecs/application/knowledge_graph/reasoning/evidence_synthesis.py +347 -0
  32. aiecs/application/knowledge_graph/reasoning/inference_engine.py +504 -0
  33. aiecs/application/knowledge_graph/reasoning/logic_form_parser.py +167 -0
  34. aiecs/application/knowledge_graph/reasoning/logic_parser/__init__.py +79 -0
  35. aiecs/application/knowledge_graph/reasoning/logic_parser/ast_builder.py +513 -0
  36. aiecs/application/knowledge_graph/reasoning/logic_parser/ast_nodes.py +630 -0
  37. aiecs/application/knowledge_graph/reasoning/logic_parser/ast_validator.py +654 -0
  38. aiecs/application/knowledge_graph/reasoning/logic_parser/error_handler.py +477 -0
  39. aiecs/application/knowledge_graph/reasoning/logic_parser/parser.py +390 -0
  40. aiecs/application/knowledge_graph/reasoning/logic_parser/query_context.py +217 -0
  41. aiecs/application/knowledge_graph/reasoning/logic_query_integration.py +169 -0
  42. aiecs/application/knowledge_graph/reasoning/query_planner.py +872 -0
  43. aiecs/application/knowledge_graph/reasoning/reasoning_engine.py +554 -0
  44. aiecs/application/knowledge_graph/retrieval/__init__.py +19 -0
  45. aiecs/application/knowledge_graph/retrieval/retrieval_strategies.py +596 -0
  46. aiecs/application/knowledge_graph/search/__init__.py +59 -0
  47. aiecs/application/knowledge_graph/search/hybrid_search.py +423 -0
  48. aiecs/application/knowledge_graph/search/reranker.py +295 -0
  49. aiecs/application/knowledge_graph/search/reranker_strategies.py +553 -0
  50. aiecs/application/knowledge_graph/search/text_similarity.py +398 -0
  51. aiecs/application/knowledge_graph/traversal/__init__.py +15 -0
  52. aiecs/application/knowledge_graph/traversal/enhanced_traversal.py +329 -0
  53. aiecs/application/knowledge_graph/traversal/path_scorer.py +269 -0
  54. aiecs/application/knowledge_graph/validators/__init__.py +13 -0
  55. aiecs/application/knowledge_graph/validators/relation_validator.py +189 -0
  56. aiecs/application/knowledge_graph/visualization/__init__.py +11 -0
  57. aiecs/application/knowledge_graph/visualization/graph_visualizer.py +321 -0
  58. aiecs/common/__init__.py +9 -0
  59. aiecs/common/knowledge_graph/__init__.py +17 -0
  60. aiecs/common/knowledge_graph/runnable.py +484 -0
  61. aiecs/config/__init__.py +16 -0
  62. aiecs/config/config.py +498 -0
  63. aiecs/config/graph_config.py +137 -0
  64. aiecs/config/registry.py +23 -0
  65. aiecs/core/__init__.py +46 -0
  66. aiecs/core/interface/__init__.py +34 -0
  67. aiecs/core/interface/execution_interface.py +152 -0
  68. aiecs/core/interface/storage_interface.py +171 -0
  69. aiecs/domain/__init__.py +289 -0
  70. aiecs/domain/agent/__init__.py +189 -0
  71. aiecs/domain/agent/base_agent.py +697 -0
  72. aiecs/domain/agent/exceptions.py +103 -0
  73. aiecs/domain/agent/graph_aware_mixin.py +559 -0
  74. aiecs/domain/agent/hybrid_agent.py +490 -0
  75. aiecs/domain/agent/integration/__init__.py +26 -0
  76. aiecs/domain/agent/integration/context_compressor.py +222 -0
  77. aiecs/domain/agent/integration/context_engine_adapter.py +252 -0
  78. aiecs/domain/agent/integration/retry_policy.py +219 -0
  79. aiecs/domain/agent/integration/role_config.py +213 -0
  80. aiecs/domain/agent/knowledge_aware_agent.py +646 -0
  81. aiecs/domain/agent/lifecycle.py +296 -0
  82. aiecs/domain/agent/llm_agent.py +300 -0
  83. aiecs/domain/agent/memory/__init__.py +12 -0
  84. aiecs/domain/agent/memory/conversation.py +197 -0
  85. aiecs/domain/agent/migration/__init__.py +14 -0
  86. aiecs/domain/agent/migration/conversion.py +160 -0
  87. aiecs/domain/agent/migration/legacy_wrapper.py +90 -0
  88. aiecs/domain/agent/models.py +317 -0
  89. aiecs/domain/agent/observability.py +407 -0
  90. aiecs/domain/agent/persistence.py +289 -0
  91. aiecs/domain/agent/prompts/__init__.py +29 -0
  92. aiecs/domain/agent/prompts/builder.py +161 -0
  93. aiecs/domain/agent/prompts/formatters.py +189 -0
  94. aiecs/domain/agent/prompts/template.py +255 -0
  95. aiecs/domain/agent/registry.py +260 -0
  96. aiecs/domain/agent/tool_agent.py +257 -0
  97. aiecs/domain/agent/tools/__init__.py +12 -0
  98. aiecs/domain/agent/tools/schema_generator.py +221 -0
  99. aiecs/domain/community/__init__.py +155 -0
  100. aiecs/domain/community/agent_adapter.py +477 -0
  101. aiecs/domain/community/analytics.py +481 -0
  102. aiecs/domain/community/collaborative_workflow.py +642 -0
  103. aiecs/domain/community/communication_hub.py +645 -0
  104. aiecs/domain/community/community_builder.py +320 -0
  105. aiecs/domain/community/community_integration.py +800 -0
  106. aiecs/domain/community/community_manager.py +813 -0
  107. aiecs/domain/community/decision_engine.py +879 -0
  108. aiecs/domain/community/exceptions.py +225 -0
  109. aiecs/domain/community/models/__init__.py +33 -0
  110. aiecs/domain/community/models/community_models.py +268 -0
  111. aiecs/domain/community/resource_manager.py +457 -0
  112. aiecs/domain/community/shared_context_manager.py +603 -0
  113. aiecs/domain/context/__init__.py +58 -0
  114. aiecs/domain/context/context_engine.py +989 -0
  115. aiecs/domain/context/conversation_models.py +354 -0
  116. aiecs/domain/context/graph_memory.py +467 -0
  117. aiecs/domain/execution/__init__.py +12 -0
  118. aiecs/domain/execution/model.py +57 -0
  119. aiecs/domain/knowledge_graph/__init__.py +19 -0
  120. aiecs/domain/knowledge_graph/models/__init__.py +52 -0
  121. aiecs/domain/knowledge_graph/models/entity.py +130 -0
  122. aiecs/domain/knowledge_graph/models/evidence.py +194 -0
  123. aiecs/domain/knowledge_graph/models/inference_rule.py +186 -0
  124. aiecs/domain/knowledge_graph/models/path.py +179 -0
  125. aiecs/domain/knowledge_graph/models/path_pattern.py +173 -0
  126. aiecs/domain/knowledge_graph/models/query.py +272 -0
  127. aiecs/domain/knowledge_graph/models/query_plan.py +187 -0
  128. aiecs/domain/knowledge_graph/models/relation.py +136 -0
  129. aiecs/domain/knowledge_graph/schema/__init__.py +23 -0
  130. aiecs/domain/knowledge_graph/schema/entity_type.py +135 -0
  131. aiecs/domain/knowledge_graph/schema/graph_schema.py +271 -0
  132. aiecs/domain/knowledge_graph/schema/property_schema.py +155 -0
  133. aiecs/domain/knowledge_graph/schema/relation_type.py +171 -0
  134. aiecs/domain/knowledge_graph/schema/schema_manager.py +496 -0
  135. aiecs/domain/knowledge_graph/schema/type_enums.py +205 -0
  136. aiecs/domain/task/__init__.py +13 -0
  137. aiecs/domain/task/dsl_processor.py +613 -0
  138. aiecs/domain/task/model.py +62 -0
  139. aiecs/domain/task/task_context.py +268 -0
  140. aiecs/infrastructure/__init__.py +24 -0
  141. aiecs/infrastructure/graph_storage/__init__.py +11 -0
  142. aiecs/infrastructure/graph_storage/base.py +601 -0
  143. aiecs/infrastructure/graph_storage/batch_operations.py +449 -0
  144. aiecs/infrastructure/graph_storage/cache.py +429 -0
  145. aiecs/infrastructure/graph_storage/distributed.py +226 -0
  146. aiecs/infrastructure/graph_storage/error_handling.py +390 -0
  147. aiecs/infrastructure/graph_storage/graceful_degradation.py +306 -0
  148. aiecs/infrastructure/graph_storage/health_checks.py +378 -0
  149. aiecs/infrastructure/graph_storage/in_memory.py +514 -0
  150. aiecs/infrastructure/graph_storage/index_optimization.py +483 -0
  151. aiecs/infrastructure/graph_storage/lazy_loading.py +410 -0
  152. aiecs/infrastructure/graph_storage/metrics.py +357 -0
  153. aiecs/infrastructure/graph_storage/migration.py +413 -0
  154. aiecs/infrastructure/graph_storage/pagination.py +471 -0
  155. aiecs/infrastructure/graph_storage/performance_monitoring.py +466 -0
  156. aiecs/infrastructure/graph_storage/postgres.py +871 -0
  157. aiecs/infrastructure/graph_storage/query_optimizer.py +635 -0
  158. aiecs/infrastructure/graph_storage/schema_cache.py +290 -0
  159. aiecs/infrastructure/graph_storage/sqlite.py +623 -0
  160. aiecs/infrastructure/graph_storage/streaming.py +495 -0
  161. aiecs/infrastructure/messaging/__init__.py +13 -0
  162. aiecs/infrastructure/messaging/celery_task_manager.py +383 -0
  163. aiecs/infrastructure/messaging/websocket_manager.py +298 -0
  164. aiecs/infrastructure/monitoring/__init__.py +34 -0
  165. aiecs/infrastructure/monitoring/executor_metrics.py +174 -0
  166. aiecs/infrastructure/monitoring/global_metrics_manager.py +213 -0
  167. aiecs/infrastructure/monitoring/structured_logger.py +48 -0
  168. aiecs/infrastructure/monitoring/tracing_manager.py +410 -0
  169. aiecs/infrastructure/persistence/__init__.py +24 -0
  170. aiecs/infrastructure/persistence/context_engine_client.py +187 -0
  171. aiecs/infrastructure/persistence/database_manager.py +333 -0
  172. aiecs/infrastructure/persistence/file_storage.py +754 -0
  173. aiecs/infrastructure/persistence/redis_client.py +220 -0
  174. aiecs/llm/__init__.py +86 -0
  175. aiecs/llm/callbacks/__init__.py +11 -0
  176. aiecs/llm/callbacks/custom_callbacks.py +264 -0
  177. aiecs/llm/client_factory.py +420 -0
  178. aiecs/llm/clients/__init__.py +33 -0
  179. aiecs/llm/clients/base_client.py +193 -0
  180. aiecs/llm/clients/googleai_client.py +181 -0
  181. aiecs/llm/clients/openai_client.py +131 -0
  182. aiecs/llm/clients/vertex_client.py +437 -0
  183. aiecs/llm/clients/xai_client.py +184 -0
  184. aiecs/llm/config/__init__.py +51 -0
  185. aiecs/llm/config/config_loader.py +275 -0
  186. aiecs/llm/config/config_validator.py +236 -0
  187. aiecs/llm/config/model_config.py +151 -0
  188. aiecs/llm/utils/__init__.py +10 -0
  189. aiecs/llm/utils/validate_config.py +91 -0
  190. aiecs/main.py +363 -0
  191. aiecs/scripts/__init__.py +3 -0
  192. aiecs/scripts/aid/VERSION_MANAGEMENT.md +97 -0
  193. aiecs/scripts/aid/__init__.py +19 -0
  194. aiecs/scripts/aid/version_manager.py +215 -0
  195. aiecs/scripts/dependance_check/DEPENDENCY_SYSTEM_SUMMARY.md +242 -0
  196. aiecs/scripts/dependance_check/README_DEPENDENCY_CHECKER.md +310 -0
  197. aiecs/scripts/dependance_check/__init__.py +17 -0
  198. aiecs/scripts/dependance_check/dependency_checker.py +938 -0
  199. aiecs/scripts/dependance_check/dependency_fixer.py +391 -0
  200. aiecs/scripts/dependance_check/download_nlp_data.py +396 -0
  201. aiecs/scripts/dependance_check/quick_dependency_check.py +270 -0
  202. aiecs/scripts/dependance_check/setup_nlp_data.sh +217 -0
  203. aiecs/scripts/dependance_patch/__init__.py +7 -0
  204. aiecs/scripts/dependance_patch/fix_weasel/README_WEASEL_PATCH.md +126 -0
  205. aiecs/scripts/dependance_patch/fix_weasel/__init__.py +11 -0
  206. aiecs/scripts/dependance_patch/fix_weasel/fix_weasel_validator.py +128 -0
  207. aiecs/scripts/dependance_patch/fix_weasel/fix_weasel_validator.sh +82 -0
  208. aiecs/scripts/dependance_patch/fix_weasel/patch_weasel_library.sh +188 -0
  209. aiecs/scripts/dependance_patch/fix_weasel/run_weasel_patch.sh +41 -0
  210. aiecs/scripts/tools_develop/README.md +449 -0
  211. aiecs/scripts/tools_develop/TOOL_AUTO_DISCOVERY.md +234 -0
  212. aiecs/scripts/tools_develop/__init__.py +21 -0
  213. aiecs/scripts/tools_develop/check_type_annotations.py +259 -0
  214. aiecs/scripts/tools_develop/validate_tool_schemas.py +422 -0
  215. aiecs/scripts/tools_develop/verify_tools.py +356 -0
  216. aiecs/tasks/__init__.py +1 -0
  217. aiecs/tasks/worker.py +172 -0
  218. aiecs/tools/__init__.py +299 -0
  219. aiecs/tools/apisource/__init__.py +99 -0
  220. aiecs/tools/apisource/intelligence/__init__.py +19 -0
  221. aiecs/tools/apisource/intelligence/data_fusion.py +381 -0
  222. aiecs/tools/apisource/intelligence/query_analyzer.py +413 -0
  223. aiecs/tools/apisource/intelligence/search_enhancer.py +388 -0
  224. aiecs/tools/apisource/monitoring/__init__.py +9 -0
  225. aiecs/tools/apisource/monitoring/metrics.py +303 -0
  226. aiecs/tools/apisource/providers/__init__.py +115 -0
  227. aiecs/tools/apisource/providers/base.py +664 -0
  228. aiecs/tools/apisource/providers/census.py +401 -0
  229. aiecs/tools/apisource/providers/fred.py +564 -0
  230. aiecs/tools/apisource/providers/newsapi.py +412 -0
  231. aiecs/tools/apisource/providers/worldbank.py +357 -0
  232. aiecs/tools/apisource/reliability/__init__.py +12 -0
  233. aiecs/tools/apisource/reliability/error_handler.py +375 -0
  234. aiecs/tools/apisource/reliability/fallback_strategy.py +391 -0
  235. aiecs/tools/apisource/tool.py +850 -0
  236. aiecs/tools/apisource/utils/__init__.py +9 -0
  237. aiecs/tools/apisource/utils/validators.py +338 -0
  238. aiecs/tools/base_tool.py +201 -0
  239. aiecs/tools/docs/__init__.py +121 -0
  240. aiecs/tools/docs/ai_document_orchestrator.py +599 -0
  241. aiecs/tools/docs/ai_document_writer_orchestrator.py +2403 -0
  242. aiecs/tools/docs/content_insertion_tool.py +1333 -0
  243. aiecs/tools/docs/document_creator_tool.py +1317 -0
  244. aiecs/tools/docs/document_layout_tool.py +1166 -0
  245. aiecs/tools/docs/document_parser_tool.py +994 -0
  246. aiecs/tools/docs/document_writer_tool.py +1818 -0
  247. aiecs/tools/knowledge_graph/__init__.py +17 -0
  248. aiecs/tools/knowledge_graph/graph_reasoning_tool.py +734 -0
  249. aiecs/tools/knowledge_graph/graph_search_tool.py +923 -0
  250. aiecs/tools/knowledge_graph/kg_builder_tool.py +476 -0
  251. aiecs/tools/langchain_adapter.py +542 -0
  252. aiecs/tools/schema_generator.py +275 -0
  253. aiecs/tools/search_tool/__init__.py +100 -0
  254. aiecs/tools/search_tool/analyzers.py +589 -0
  255. aiecs/tools/search_tool/cache.py +260 -0
  256. aiecs/tools/search_tool/constants.py +128 -0
  257. aiecs/tools/search_tool/context.py +216 -0
  258. aiecs/tools/search_tool/core.py +749 -0
  259. aiecs/tools/search_tool/deduplicator.py +123 -0
  260. aiecs/tools/search_tool/error_handler.py +271 -0
  261. aiecs/tools/search_tool/metrics.py +371 -0
  262. aiecs/tools/search_tool/rate_limiter.py +178 -0
  263. aiecs/tools/search_tool/schemas.py +277 -0
  264. aiecs/tools/statistics/__init__.py +80 -0
  265. aiecs/tools/statistics/ai_data_analysis_orchestrator.py +643 -0
  266. aiecs/tools/statistics/ai_insight_generator_tool.py +505 -0
  267. aiecs/tools/statistics/ai_report_orchestrator_tool.py +694 -0
  268. aiecs/tools/statistics/data_loader_tool.py +564 -0
  269. aiecs/tools/statistics/data_profiler_tool.py +658 -0
  270. aiecs/tools/statistics/data_transformer_tool.py +573 -0
  271. aiecs/tools/statistics/data_visualizer_tool.py +495 -0
  272. aiecs/tools/statistics/model_trainer_tool.py +487 -0
  273. aiecs/tools/statistics/statistical_analyzer_tool.py +459 -0
  274. aiecs/tools/task_tools/__init__.py +86 -0
  275. aiecs/tools/task_tools/chart_tool.py +732 -0
  276. aiecs/tools/task_tools/classfire_tool.py +922 -0
  277. aiecs/tools/task_tools/image_tool.py +447 -0
  278. aiecs/tools/task_tools/office_tool.py +684 -0
  279. aiecs/tools/task_tools/pandas_tool.py +635 -0
  280. aiecs/tools/task_tools/report_tool.py +635 -0
  281. aiecs/tools/task_tools/research_tool.py +392 -0
  282. aiecs/tools/task_tools/scraper_tool.py +715 -0
  283. aiecs/tools/task_tools/stats_tool.py +688 -0
  284. aiecs/tools/temp_file_manager.py +130 -0
  285. aiecs/tools/tool_executor/__init__.py +37 -0
  286. aiecs/tools/tool_executor/tool_executor.py +881 -0
  287. aiecs/utils/LLM_output_structor.py +445 -0
  288. aiecs/utils/__init__.py +34 -0
  289. aiecs/utils/base_callback.py +47 -0
  290. aiecs/utils/cache_provider.py +695 -0
  291. aiecs/utils/execution_utils.py +184 -0
  292. aiecs/utils/logging.py +1 -0
  293. aiecs/utils/prompt_loader.py +14 -0
  294. aiecs/utils/token_usage_repository.py +323 -0
  295. aiecs/ws/__init__.py +0 -0
  296. aiecs/ws/socket_server.py +52 -0
  297. aiecs-1.5.1.dist-info/METADATA +608 -0
  298. aiecs-1.5.1.dist-info/RECORD +302 -0
  299. aiecs-1.5.1.dist-info/WHEEL +5 -0
  300. aiecs-1.5.1.dist-info/entry_points.txt +10 -0
  301. aiecs-1.5.1.dist-info/licenses/LICENSE +225 -0
  302. aiecs-1.5.1.dist-info/top_level.txt +1 -0
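As a quick way to confirm what actually landed on a machine, here is a minimal sketch (not part of the package) that reads the installed distribution's metadata with the standard library, assuming the wheel above has been installed, e.g. via pip install aiecs==1.5.1:

from importlib.metadata import files, version

print(version("aiecs"))               # expected to report 1.5.1
record = files("aiecs") or []         # entries come from aiecs-1.5.1.dist-info/RECORD
print(len(record), "files recorded")  # the listing above shows 302 entries in RECORD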
aiecs/tools/task_tools/office_tool.py
@@ -0,0 +1,684 @@
from aiecs.tools import register_tool
from aiecs.tools.base_tool import BaseTool
from pydantic import BaseModel, field_validator, ConfigDict, Field
from pptx.util import Inches
from pptx import Presentation
from docx.shared import Pt
from docx import Document as DocxDocument
from tika import parser
import os
import logging
import warnings
from typing import List, Dict, Optional, Any

import pandas as pd
import pdfplumber
import pytesseract
from PIL import Image

# Configure Tika log path to user-writable directory before importing
os.environ["TIKA_LOG_PATH"] = os.path.expanduser("~/.cache/tika")
os.makedirs(os.path.expanduser("~/.cache/tika"), exist_ok=True)

# Suppress pkg_resources deprecation warning from tika
warnings.filterwarnings("ignore", category=UserWarning, module="tika")


# Module-level default configuration for validators
_DEFAULT_MAX_FILE_SIZE_MB = 100
_DEFAULT_ALLOWED_EXTENSIONS = [
    ".docx",
    ".pptx",
    ".xlsx",
    ".pdf",
    ".png",
    ".jpg",
    ".jpeg",
    ".tiff",
    ".bmp",
    ".gif",
]

# Exceptions


class OfficeToolError(Exception):
    """Base exception for OfficeTool errors."""


class InputValidationError(OfficeToolError):
    """Raised when input validation fails."""


class FileOperationError(OfficeToolError):
    """Raised when file operations fail."""


class SecurityError(OfficeToolError):
    """Raised for security-related issues."""


class ContentValidationError(OfficeToolError):
    """Raised when document content validation fails."""


# Base schema for common fields


class BaseFileSchema(BaseModel):
    file_path: Optional[str] = None
    output_path: Optional[str] = None
    image_path: Optional[str] = None

    @field_validator("file_path", "output_path", "image_path")
    def validate_path(cls, v: Optional[str], field) -> Optional[str]:
        """Validate file paths for existence, size, extension, and path traversal."""
        if not v:
            return v
        abs_path = os.path.abspath(os.path.normpath(v))
        # Check for path traversal
        if ".." in v or "~" in v or "%" in v:
            raise SecurityError(f"Path traversal attempt detected: {v}")
        # Ensure path is in allowed directories
        base_dir = os.path.abspath(os.getcwd())
        allowed_dirs = [
            os.path.abspath(os.path.normpath(d)) for d in ["/tmp", "./data", "./uploads"]
        ]
        if not abs_path.startswith(base_dir) and not any(
            abs_path.startswith(d) for d in allowed_dirs
        ):
            raise SecurityError(f"Path not in allowed directories: {abs_path}")
        # Check extension
        ext = os.path.splitext(abs_path)[1].lower()
        if ext not in _DEFAULT_ALLOWED_EXTENSIONS:
            raise SecurityError(
                f"Extension '{ext}' not allowed for '{field.field_name}', expected {_DEFAULT_ALLOWED_EXTENSIONS}"
            )
        # Check file existence and size for input paths
        if field.field_name == "file_path":
            if not os.path.isfile(abs_path):
                raise FileOperationError(f"{field.field_name}: File not found: {abs_path}")
            size_mb = os.path.getsize(abs_path) / (1024 * 1024)
            if size_mb > _DEFAULT_MAX_FILE_SIZE_MB:
                raise FileOperationError(
                    f"{field.field_name}: File too large: {size_mb:.1f}MB, max {_DEFAULT_MAX_FILE_SIZE_MB}MB"
                )
        # Check for existing output paths
        elif field.field_name == "output_path" and os.path.exists(abs_path):
            raise FileOperationError(f"{field.field_name}: File already exists: {abs_path}")
        return abs_path


# Schemas for operations


class ReadDocxSchema(BaseFileSchema):
    """Schema for reading DOCX files."""

    file_path: str
    include_tables: bool = False


class WriteDocxSchema(BaseFileSchema):
    """Schema for writing DOCX files."""

    text: str
    output_path: str
    table_data: Optional[List[List[str]]] = None


class ReadPptxSchema(BaseFileSchema):
    """Schema for reading PPTX files."""

    file_path: str


class WritePptxSchema(BaseFileSchema):
    """Schema for writing PPTX files."""

    slides: List[str]
    output_path: str
    image_path: Optional[str] = None


class ReadXlsxSchema(BaseFileSchema):
    """Schema for reading XLSX files."""

    file_path: str
    sheet_name: Optional[str] = None


class WriteXlsxSchema(BaseFileSchema):
    """Schema for writing XLSX files."""

    data: List[Dict]
    output_path: str
    sheet_name: str = "Sheet1"


class ExtractTextSchema(BaseFileSchema):
    """Schema for extracting text from files."""

    file_path: str


@register_tool("office")
class OfficeTool(BaseTool):
    """
    Office document processing tool supporting:
    - read_docx: Read content from DOCX files.
    - write_docx: Write content to DOCX files.
    - read_pptx: Read content from PPTX files.
    - write_pptx: Write content to PPTX files.
    - read_xlsx: Read content from XLSX files.
    - write_xlsx: Write content to XLSX files.
    - extract_text: Extract text from various file formats.

    Inherits from BaseTool to leverage ToolExecutor for caching, concurrency, and error handling.
    """

    # Configuration schema
    class Config(BaseModel):
        """Configuration for the office tool"""

        model_config = ConfigDict(env_prefix="OFFICE_TOOL_")

        max_file_size_mb: int = Field(default=100, description="Maximum file size in megabytes")
        default_font: str = Field(default="Arial", description="Default font for documents")
        default_font_size: int = Field(default=12, description="Default font size in points")
        allowed_extensions: List[str] = Field(
            default=[
                ".docx",
                ".pptx",
                ".xlsx",
                ".pdf",
                ".png",
                ".jpg",
                ".jpeg",
                ".tiff",
                ".bmp",
                ".gif",
            ],
            description="Allowed document file extensions",
        )

    def __init__(self, config: Optional[Dict[str, Any]] = None):
        """
        Initialize OfficeTool with configuration.

        Args:
            config (Dict, optional): Configuration overrides for OfficeTool.

        Raises:
            ValueError: If config contains invalid settings.
        """
        super().__init__(config)

        # Parse configuration
        self.config = self.Config(**(config or {}))

        self.logger = logging.getLogger(__name__)
        if not self.logger.handlers:
            handler = logging.StreamHandler()
            handler.setFormatter(logging.Formatter("%(asctime)s %(levelname)s %(message)s"))
            self.logger.addHandler(handler)
            self.logger.setLevel(logging.INFO)

    def _validate_document(self, file_path: str, file_type: str) -> None:
        """
        Validate document structure before processing.

        Args:
            file_path (str): Path to the document file.
            file_type (str): Type of document ('docx', 'pptx', 'xlsx', 'pdf', 'image').

        Raises:
            ContentValidationError: If document structure is invalid.
        """
        try:
            if file_type == "docx":
                doc = DocxDocument(file_path)
                if not hasattr(doc, "paragraphs"):
                    raise ContentValidationError("Invalid DOCX structure")
            elif file_type == "pptx":
                prs = Presentation(file_path)
                if not hasattr(prs, "slides"):
                    raise ContentValidationError("Invalid PPTX structure")
            elif file_type == "xlsx":
                # Just validate that file can be read - don't care about return type
                pd.read_excel(file_path, nrows=5)
            elif file_type == "pdf":
                with pdfplumber.open(file_path) as pdf:
                    if len(pdf.pages) == 0:
                        raise ContentValidationError("PDF has no pages")
            elif file_type == "image":
                img = Image.open(file_path)
                img.verify()  # Verify it's a valid image
            else:
                # Use tika as fallback for other formats
                parsed = parser.from_file(file_path)
                if not parsed or not parsed.get("content"):
                    raise ContentValidationError("Unable to parse file content")
        except Exception as e:
            raise ContentValidationError(f"Invalid {file_type.upper()} file: {str(e)}")

    def _sanitize_text(self, text: str) -> str:
        """
        Sanitize text to remove potentially harmful control characters.

        Args:
            text (str): Input text.

        Returns:
            str: Sanitized text.
        """
        if not text:
            return ""
        return "".join(char for char in text if ord(char) >= 32 or char in "\n\r\t")

    def _sanitize_table_data(
        self, table_data: Optional[List[List[str]]]
    ) -> Optional[List[List[str]]]:
        """
        Sanitize table data to remove harmful content.

        Args:
            table_data (Optional[List[List[str]]]): Table data to sanitize.

        Returns:
            Optional[List[List[str]]]: Sanitized table data.
        """
        if not table_data:
            return None
        return [[self._sanitize_text(str(cell)) for cell in row] for row in table_data]

    def _sanitize_data(self, data_list: List[Dict]) -> List[Dict]:
        """
        Sanitize Excel data to remove harmful content and enforce limits.

        Args:
            data_list (List[Dict]): List of dictionaries to sanitize.

        Returns:
            List[Dict]: Sanitized data.
        """
        if not data_list:
            return []
        sanitized = []
        for item in data_list:
            clean_item = {}
            for k, v in item.items():
                # Excel key limit with sanitization
                clean_key = self._sanitize_text(str(k))[:255]
                if isinstance(v, str):
                    clean_value = self._sanitize_text(v)[:32767]  # Excel cell limit
                else:
                    clean_value = v
                clean_item[clean_key] = clean_value
            sanitized.append(clean_item)
        return sanitized

    def _extract_pdf_text(self, file_path: str) -> str:
        """
        Extract text from PDF using pdfplumber.

        Args:
            file_path (str): Path to the PDF file.

        Returns:
            str: Extracted text content.

        Raises:
            FileOperationError: If PDF text extraction fails.
        """
        try:
            text_content = []
            with pdfplumber.open(file_path) as pdf:
                for page in pdf.pages:
                    page_text = page.extract_text()
                    if page_text:
                        text_content.append(page_text)
            return "\n".join(text_content)
        except Exception as e:
            raise FileOperationError(f"Failed to extract PDF text: {str(e)}")

    def _extract_image_text(self, file_path: str) -> str:
        """
        Extract text from image using pytesseract OCR.

        Args:
            file_path (str): Path to the image file.

        Returns:
            str: Extracted text content.

        Raises:
            FileOperationError: If image text extraction fails.
        """
        try:
            image = Image.open(file_path)
            # Convert to RGB if necessary
            if image.mode != "RGB":
                image = image.convert("RGB")
            text = pytesseract.image_to_string(image, lang="eng+chi_sim")
            return text.strip()
        except Exception as e:
            raise FileOperationError(f"Failed to extract image text: {str(e)}")

    def _extract_tika_text(self, file_path: str) -> str:
        """
        Extract text using Apache Tika as fallback.

        Args:
            file_path (str): Path to the file.

        Returns:
            str: Extracted text content.

        Raises:
            FileOperationError: If Tika text extraction fails.
        """
        try:
            parsed = parser.from_file(file_path)
            content = parsed.get("content", "")
            return content.strip() if content else ""
        except Exception as e:
            raise FileOperationError(f"Failed to extract text with Tika: {str(e)}")

    def read_docx(self, file_path: str, include_tables: bool = False) -> Dict[str, Any]:
        """
        Read content from a DOCX file.

        Args:
            file_path (str): Path to the DOCX file.
            include_tables (bool): Whether to include table data.

        Returns:
            Dict[str, Any]: Document content {'paragraphs': List[str], 'tables': Optional[List[List[List[str]]]]}.

        Raises:
            FileOperationError: If file cannot be read.
            ContentValidationError: If document structure is invalid.
        """
        try:
            self._validate_document(file_path, "docx")
            doc = DocxDocument(file_path)
            paras = [p.text for p in doc.paragraphs if p.text.strip()]
            tables = None
            if include_tables:
                tables = [
                    [[cell.text for cell in row.cells] for row in table.rows]
                    for table in doc.tables
                ]
            return {"paragraphs": paras, "tables": tables}
        except ContentValidationError:
            raise
        except Exception as e:
            raise FileOperationError(f"Failed to read DOCX: {str(e)}")

    def write_docx(
        self,
        text: str,
        output_path: str,
        table_data: Optional[List[List[str]]] = None,
    ) -> Dict[str, Any]:
        """
        Write content to a DOCX file.

        Args:
            text (str): Text content to write.
            output_path (str): Path to save the DOCX file.
            table_data (Optional[List[List[str]]]): Table data to include.

        Returns:
            Dict[str, Any]: Status {'success': bool, 'file_path': str}.

        Raises:
            FileOperationError: If file cannot be written.
        """
        try:
            sanitized_text = self._sanitize_text(text)
            sanitized_table_data = self._sanitize_table_data(table_data)
            doc = DocxDocument()
            style = doc.styles["Normal"]
            style.font.name = self.config.default_font
            style.font.size = Pt(self.config.default_font_size)
            for line in sanitized_text.splitlines():
                doc.add_paragraph(line)
            if sanitized_table_data and sanitized_table_data[0]:
                # Find maximum number of columns to handle irregular table data
                max_cols = max(len(row) for row in sanitized_table_data)
                table = doc.add_table(rows=len(sanitized_table_data), cols=max_cols)
                for i, row in enumerate(sanitized_table_data):
                    for j in range(max_cols):
                        if j < len(row):
                            table.rows[i].cells[j].text = str(row[j])
                        else:
                            # Empty cell for missing data
                            table.rows[i].cells[j].text = ""
            doc.save(output_path)
            return {"success": True, "file_path": output_path}
        except Exception as e:
            raise FileOperationError(f"Failed to write DOCX: {str(e)}")

    def read_pptx(self, file_path: str) -> List[str]:
        """
        Read content from a PPTX file.

        Args:
            file_path (str): Path to the PPTX file.

        Returns:
            List[str]: List of text content from slides.

        Raises:
            FileOperationError: If file cannot be read.
            ContentValidationError: If document structure is invalid.
        """
        try:
            self._validate_document(file_path, "pptx")
            prs = Presentation(file_path)
            texts = []
            for slide in prs.slides:
                for shape in slide.shapes:
                    if hasattr(shape, "text"):
                        txt = shape.text.strip()
                        if txt:
                            texts.append(txt)
            return texts
        except ContentValidationError:
            raise
        except Exception as e:
            raise FileOperationError(f"Failed to read PPTX: {str(e)}")

    def write_pptx(
        self,
        slides: List[str],
        output_path: str,
        image_path: Optional[str] = None,
    ) -> Dict[str, Any]:
        """
        Write content to a PPTX file.

        Args:
            slides (List[str]): List of slide contents.
            output_path (str): Path to save the PPTX file.
            image_path (Optional[str]): Path to an image to include on the first slide.

        Returns:
            Dict[str, Any]: Status {'success': bool, 'file_path': str}.

        Raises:
            FileOperationError: If file cannot be written.
        """
        try:
            sanitized_slides = [self._sanitize_text(slide) for slide in slides]
            prs = Presentation()
            blank = prs.slide_layouts[6]
            for idx, content in enumerate(sanitized_slides):
                slide = prs.slides.add_slide(blank)
                box = slide.shapes.add_textbox(Inches(1), Inches(1), Inches(8), Inches(5))
                tf = box.text_frame
                lines = content.splitlines()
                if lines:
                    # Set text for the first paragraph (which already exists)
                    tf.text = lines[0]
                    # Add additional paragraphs for remaining lines
                    for line in lines[1:]:
                        p = tf.add_paragraph()
                        p.text = line
                if idx == 0 and image_path:
                    try:
                        slide.shapes.add_picture(image_path, Inches(1), Inches(6), Inches(4))
                    except Exception as img_err:
                        self.logger.warning(f"Could not add image to slide: {img_err}")
            prs.save(output_path)
            return {"success": True, "file_path": output_path}
        except Exception as e:
            raise FileOperationError(f"Failed to write PPTX: {str(e)}")

    def read_xlsx(self, file_path: str, sheet_name: Optional[str] = None) -> List[Dict]:
        """
        Read content from an XLSX file.

        Args:
            file_path (str): Path to the XLSX file.
            sheet_name (Optional[str]): Name of the sheet to read.

        Returns:
            List[Dict]: List of dictionaries representing Excel data.

        Raises:
            FileOperationError: If file cannot be read.
            ContentValidationError: If document structure is invalid.
        """
        try:
            self._validate_document(file_path, "xlsx")
            data = pd.read_excel(file_path, sheet_name=sheet_name)

            # Handle different return types from pd.read_excel()
            if isinstance(data, pd.DataFrame):
                # Single sheet or specific sheet requested
                return data.to_dict(orient="records")
            elif isinstance(data, dict):
                # Multiple sheets returned as dict - use the first sheet
                first_sheet_name = list(data.keys())[0]
                first_df = data[first_sheet_name]
                return first_df.to_dict(orient="records")
            else:
                raise FileOperationError("Unexpected data type returned from Excel file")

        except ContentValidationError:
            raise
        except Exception as e:
            raise FileOperationError(f"Failed to read XLSX: {str(e)}")

    def write_xlsx(
        self, data: List[Dict], output_path: str, sheet_name: str = "Sheet1"
    ) -> Dict[str, Any]:
        """
        Write content to an XLSX file.

        Args:
            data (List[Dict]): Data to write.
            output_path (str): Path to save the XLSX file.
            sheet_name (str): Name of the sheet.

        Returns:
            Dict[str, Any]: Status {'success': bool, 'file_path': str}.

        Raises:
            FileOperationError: If file cannot be written.
        """
        try:
            sanitized_data = self._sanitize_data(data)
            if not sanitized_data:
                pd.DataFrame().to_excel(output_path, index=False, sheet_name=sheet_name)
            else:
                pd.DataFrame(sanitized_data).to_excel(
                    output_path, index=False, sheet_name=sheet_name
                )
            return {"success": True, "file_path": output_path}
        except Exception as e:
            raise FileOperationError(f"Failed to write XLSX: {str(e)}")

    def extract_text(self, file_path: str) -> str:
        """
        Extract text from various file formats using combination library approach.

        Args:
            file_path (str): Path to the file.

        Returns:
            str: Extracted text content.

        Raises:
            FileOperationError: If text extraction fails.
            ContentValidationError: If document structure is invalid.
        """
        try:
            file_ext = os.path.splitext(file_path)[1].lower()

            # Determine file type and validate
            if file_ext == ".pdf":
                file_type = "pdf"
            elif file_ext == ".docx":
                file_type = "docx"
            elif file_ext == ".pptx":
                file_type = "pptx"
            elif file_ext == ".xlsx":
                file_type = "xlsx"
            elif file_ext in [
                ".png",
                ".jpg",
                ".jpeg",
                ".tiff",
                ".bmp",
                ".gif",
            ]:
                file_type = "image"
            else:
                file_type = "other"

            # Validate document structure
            self._validate_document(file_path, file_type)

            # Extract text based on file type
            if file_type == "pdf":
                return self._sanitize_text(self._extract_pdf_text(file_path))
            elif file_type == "docx":
                doc = DocxDocument(file_path)
                paragraphs = [p.text for p in doc.paragraphs if p.text.strip()]
                return self._sanitize_text("\n".join(paragraphs))
            elif file_type == "pptx":
                prs = Presentation(file_path)
                texts = []
                for slide in prs.slides:
                    for shape in slide.shapes:
                        if hasattr(shape, "text") and shape.text.strip():
                            texts.append(shape.text)
                return self._sanitize_text("\n".join(texts))
            elif file_type == "xlsx":
                data = pd.read_excel(file_path)
                # Handle different return types from pd.read_excel()
                if isinstance(data, pd.DataFrame):
                    return self._sanitize_text(data.to_string(index=False))
                elif isinstance(data, dict):
                    # Multiple sheets returned as dict - use the first sheet
                    first_sheet_name = list(data.keys())[0]
                    first_df = data[first_sheet_name]
                    return self._sanitize_text(first_df.to_string(index=False))
                else:
                    # Fallback for unexpected data types
                    return self._sanitize_text("")
            elif file_type == "image":
                return self._sanitize_text(self._extract_image_text(file_path))
            else:
                # Use Tika as fallback for other formats
                return self._sanitize_text(self._extract_tika_text(file_path))

        except ContentValidationError:
            raise
        except Exception as e:
            raise FileOperationError(f"Failed to extract text: {str(e)}")
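For orientation, here is a minimal usage sketch of the OfficeTool defined above. It is not taken from the package's documentation: the ./data paths are hypothetical, the directory is created up front so doc.save() can succeed, and it assumes BaseTool.__init__ accepts the None/empty configuration that the constructor signature above implies.

import os

from aiecs.tools.task_tools.office_tool import OfficeTool

os.makedirs("./data", exist_ok=True)  # hypothetical working directory accepted by the path validator

tool = OfficeTool()  # default config: Arial, 12 pt, 100 MB size limit

# Write a DOCX with a couple of paragraphs and a small table, then read it back.
tool.write_docx(
    text="Quarterly summary\nDrafted automatically.",
    output_path="./data/summary.docx",
    table_data=[["Region", "Revenue"], ["EMEA", "1.2M"]],
)
content = tool.read_docx("./data/summary.docx", include_tables=True)
print(content["paragraphs"])

# Generic extraction dispatches on extension (pdfplumber, python-docx, OCR, Tika fallback).
print(tool.extract_text("./data/summary.docx"))

Note that the schemas above suggest these operations are normally dispatched through BaseTool/ToolExecutor, where the Pydantic validators enforce the path, extension, and size checks; calling the methods directly, as in this sketch, bypasses that validation layer.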