aiecs 1.5.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (302) hide show
  1. aiecs/__init__.py +72 -0
  2. aiecs/__main__.py +41 -0
  3. aiecs/aiecs_client.py +469 -0
  4. aiecs/application/__init__.py +10 -0
  5. aiecs/application/executors/__init__.py +10 -0
  6. aiecs/application/executors/operation_executor.py +363 -0
  7. aiecs/application/knowledge_graph/__init__.py +7 -0
  8. aiecs/application/knowledge_graph/builder/__init__.py +37 -0
  9. aiecs/application/knowledge_graph/builder/document_builder.py +375 -0
  10. aiecs/application/knowledge_graph/builder/graph_builder.py +356 -0
  11. aiecs/application/knowledge_graph/builder/schema_mapping.py +531 -0
  12. aiecs/application/knowledge_graph/builder/structured_pipeline.py +443 -0
  13. aiecs/application/knowledge_graph/builder/text_chunker.py +319 -0
  14. aiecs/application/knowledge_graph/extractors/__init__.py +27 -0
  15. aiecs/application/knowledge_graph/extractors/base.py +100 -0
  16. aiecs/application/knowledge_graph/extractors/llm_entity_extractor.py +327 -0
  17. aiecs/application/knowledge_graph/extractors/llm_relation_extractor.py +349 -0
  18. aiecs/application/knowledge_graph/extractors/ner_entity_extractor.py +244 -0
  19. aiecs/application/knowledge_graph/fusion/__init__.py +23 -0
  20. aiecs/application/knowledge_graph/fusion/entity_deduplicator.py +387 -0
  21. aiecs/application/knowledge_graph/fusion/entity_linker.py +343 -0
  22. aiecs/application/knowledge_graph/fusion/knowledge_fusion.py +580 -0
  23. aiecs/application/knowledge_graph/fusion/relation_deduplicator.py +189 -0
  24. aiecs/application/knowledge_graph/pattern_matching/__init__.py +21 -0
  25. aiecs/application/knowledge_graph/pattern_matching/pattern_matcher.py +344 -0
  26. aiecs/application/knowledge_graph/pattern_matching/query_executor.py +378 -0
  27. aiecs/application/knowledge_graph/profiling/__init__.py +12 -0
  28. aiecs/application/knowledge_graph/profiling/query_plan_visualizer.py +199 -0
  29. aiecs/application/knowledge_graph/profiling/query_profiler.py +223 -0
  30. aiecs/application/knowledge_graph/reasoning/__init__.py +27 -0
  31. aiecs/application/knowledge_graph/reasoning/evidence_synthesis.py +347 -0
  32. aiecs/application/knowledge_graph/reasoning/inference_engine.py +504 -0
  33. aiecs/application/knowledge_graph/reasoning/logic_form_parser.py +167 -0
  34. aiecs/application/knowledge_graph/reasoning/logic_parser/__init__.py +79 -0
  35. aiecs/application/knowledge_graph/reasoning/logic_parser/ast_builder.py +513 -0
  36. aiecs/application/knowledge_graph/reasoning/logic_parser/ast_nodes.py +630 -0
  37. aiecs/application/knowledge_graph/reasoning/logic_parser/ast_validator.py +654 -0
  38. aiecs/application/knowledge_graph/reasoning/logic_parser/error_handler.py +477 -0
  39. aiecs/application/knowledge_graph/reasoning/logic_parser/parser.py +390 -0
  40. aiecs/application/knowledge_graph/reasoning/logic_parser/query_context.py +217 -0
  41. aiecs/application/knowledge_graph/reasoning/logic_query_integration.py +169 -0
  42. aiecs/application/knowledge_graph/reasoning/query_planner.py +872 -0
  43. aiecs/application/knowledge_graph/reasoning/reasoning_engine.py +554 -0
  44. aiecs/application/knowledge_graph/retrieval/__init__.py +19 -0
  45. aiecs/application/knowledge_graph/retrieval/retrieval_strategies.py +596 -0
  46. aiecs/application/knowledge_graph/search/__init__.py +59 -0
  47. aiecs/application/knowledge_graph/search/hybrid_search.py +423 -0
  48. aiecs/application/knowledge_graph/search/reranker.py +295 -0
  49. aiecs/application/knowledge_graph/search/reranker_strategies.py +553 -0
  50. aiecs/application/knowledge_graph/search/text_similarity.py +398 -0
  51. aiecs/application/knowledge_graph/traversal/__init__.py +15 -0
  52. aiecs/application/knowledge_graph/traversal/enhanced_traversal.py +329 -0
  53. aiecs/application/knowledge_graph/traversal/path_scorer.py +269 -0
  54. aiecs/application/knowledge_graph/validators/__init__.py +13 -0
  55. aiecs/application/knowledge_graph/validators/relation_validator.py +189 -0
  56. aiecs/application/knowledge_graph/visualization/__init__.py +11 -0
  57. aiecs/application/knowledge_graph/visualization/graph_visualizer.py +321 -0
  58. aiecs/common/__init__.py +9 -0
  59. aiecs/common/knowledge_graph/__init__.py +17 -0
  60. aiecs/common/knowledge_graph/runnable.py +484 -0
  61. aiecs/config/__init__.py +16 -0
  62. aiecs/config/config.py +498 -0
  63. aiecs/config/graph_config.py +137 -0
  64. aiecs/config/registry.py +23 -0
  65. aiecs/core/__init__.py +46 -0
  66. aiecs/core/interface/__init__.py +34 -0
  67. aiecs/core/interface/execution_interface.py +152 -0
  68. aiecs/core/interface/storage_interface.py +171 -0
  69. aiecs/domain/__init__.py +289 -0
  70. aiecs/domain/agent/__init__.py +189 -0
  71. aiecs/domain/agent/base_agent.py +697 -0
  72. aiecs/domain/agent/exceptions.py +103 -0
  73. aiecs/domain/agent/graph_aware_mixin.py +559 -0
  74. aiecs/domain/agent/hybrid_agent.py +490 -0
  75. aiecs/domain/agent/integration/__init__.py +26 -0
  76. aiecs/domain/agent/integration/context_compressor.py +222 -0
  77. aiecs/domain/agent/integration/context_engine_adapter.py +252 -0
  78. aiecs/domain/agent/integration/retry_policy.py +219 -0
  79. aiecs/domain/agent/integration/role_config.py +213 -0
  80. aiecs/domain/agent/knowledge_aware_agent.py +646 -0
  81. aiecs/domain/agent/lifecycle.py +296 -0
  82. aiecs/domain/agent/llm_agent.py +300 -0
  83. aiecs/domain/agent/memory/__init__.py +12 -0
  84. aiecs/domain/agent/memory/conversation.py +197 -0
  85. aiecs/domain/agent/migration/__init__.py +14 -0
  86. aiecs/domain/agent/migration/conversion.py +160 -0
  87. aiecs/domain/agent/migration/legacy_wrapper.py +90 -0
  88. aiecs/domain/agent/models.py +317 -0
  89. aiecs/domain/agent/observability.py +407 -0
  90. aiecs/domain/agent/persistence.py +289 -0
  91. aiecs/domain/agent/prompts/__init__.py +29 -0
  92. aiecs/domain/agent/prompts/builder.py +161 -0
  93. aiecs/domain/agent/prompts/formatters.py +189 -0
  94. aiecs/domain/agent/prompts/template.py +255 -0
  95. aiecs/domain/agent/registry.py +260 -0
  96. aiecs/domain/agent/tool_agent.py +257 -0
  97. aiecs/domain/agent/tools/__init__.py +12 -0
  98. aiecs/domain/agent/tools/schema_generator.py +221 -0
  99. aiecs/domain/community/__init__.py +155 -0
  100. aiecs/domain/community/agent_adapter.py +477 -0
  101. aiecs/domain/community/analytics.py +481 -0
  102. aiecs/domain/community/collaborative_workflow.py +642 -0
  103. aiecs/domain/community/communication_hub.py +645 -0
  104. aiecs/domain/community/community_builder.py +320 -0
  105. aiecs/domain/community/community_integration.py +800 -0
  106. aiecs/domain/community/community_manager.py +813 -0
  107. aiecs/domain/community/decision_engine.py +879 -0
  108. aiecs/domain/community/exceptions.py +225 -0
  109. aiecs/domain/community/models/__init__.py +33 -0
  110. aiecs/domain/community/models/community_models.py +268 -0
  111. aiecs/domain/community/resource_manager.py +457 -0
  112. aiecs/domain/community/shared_context_manager.py +603 -0
  113. aiecs/domain/context/__init__.py +58 -0
  114. aiecs/domain/context/context_engine.py +989 -0
  115. aiecs/domain/context/conversation_models.py +354 -0
  116. aiecs/domain/context/graph_memory.py +467 -0
  117. aiecs/domain/execution/__init__.py +12 -0
  118. aiecs/domain/execution/model.py +57 -0
  119. aiecs/domain/knowledge_graph/__init__.py +19 -0
  120. aiecs/domain/knowledge_graph/models/__init__.py +52 -0
  121. aiecs/domain/knowledge_graph/models/entity.py +130 -0
  122. aiecs/domain/knowledge_graph/models/evidence.py +194 -0
  123. aiecs/domain/knowledge_graph/models/inference_rule.py +186 -0
  124. aiecs/domain/knowledge_graph/models/path.py +179 -0
  125. aiecs/domain/knowledge_graph/models/path_pattern.py +173 -0
  126. aiecs/domain/knowledge_graph/models/query.py +272 -0
  127. aiecs/domain/knowledge_graph/models/query_plan.py +187 -0
  128. aiecs/domain/knowledge_graph/models/relation.py +136 -0
  129. aiecs/domain/knowledge_graph/schema/__init__.py +23 -0
  130. aiecs/domain/knowledge_graph/schema/entity_type.py +135 -0
  131. aiecs/domain/knowledge_graph/schema/graph_schema.py +271 -0
  132. aiecs/domain/knowledge_graph/schema/property_schema.py +155 -0
  133. aiecs/domain/knowledge_graph/schema/relation_type.py +171 -0
  134. aiecs/domain/knowledge_graph/schema/schema_manager.py +496 -0
  135. aiecs/domain/knowledge_graph/schema/type_enums.py +205 -0
  136. aiecs/domain/task/__init__.py +13 -0
  137. aiecs/domain/task/dsl_processor.py +613 -0
  138. aiecs/domain/task/model.py +62 -0
  139. aiecs/domain/task/task_context.py +268 -0
  140. aiecs/infrastructure/__init__.py +24 -0
  141. aiecs/infrastructure/graph_storage/__init__.py +11 -0
  142. aiecs/infrastructure/graph_storage/base.py +601 -0
  143. aiecs/infrastructure/graph_storage/batch_operations.py +449 -0
  144. aiecs/infrastructure/graph_storage/cache.py +429 -0
  145. aiecs/infrastructure/graph_storage/distributed.py +226 -0
  146. aiecs/infrastructure/graph_storage/error_handling.py +390 -0
  147. aiecs/infrastructure/graph_storage/graceful_degradation.py +306 -0
  148. aiecs/infrastructure/graph_storage/health_checks.py +378 -0
  149. aiecs/infrastructure/graph_storage/in_memory.py +514 -0
  150. aiecs/infrastructure/graph_storage/index_optimization.py +483 -0
  151. aiecs/infrastructure/graph_storage/lazy_loading.py +410 -0
  152. aiecs/infrastructure/graph_storage/metrics.py +357 -0
  153. aiecs/infrastructure/graph_storage/migration.py +413 -0
  154. aiecs/infrastructure/graph_storage/pagination.py +471 -0
  155. aiecs/infrastructure/graph_storage/performance_monitoring.py +466 -0
  156. aiecs/infrastructure/graph_storage/postgres.py +871 -0
  157. aiecs/infrastructure/graph_storage/query_optimizer.py +635 -0
  158. aiecs/infrastructure/graph_storage/schema_cache.py +290 -0
  159. aiecs/infrastructure/graph_storage/sqlite.py +623 -0
  160. aiecs/infrastructure/graph_storage/streaming.py +495 -0
  161. aiecs/infrastructure/messaging/__init__.py +13 -0
  162. aiecs/infrastructure/messaging/celery_task_manager.py +383 -0
  163. aiecs/infrastructure/messaging/websocket_manager.py +298 -0
  164. aiecs/infrastructure/monitoring/__init__.py +34 -0
  165. aiecs/infrastructure/monitoring/executor_metrics.py +174 -0
  166. aiecs/infrastructure/monitoring/global_metrics_manager.py +213 -0
  167. aiecs/infrastructure/monitoring/structured_logger.py +48 -0
  168. aiecs/infrastructure/monitoring/tracing_manager.py +410 -0
  169. aiecs/infrastructure/persistence/__init__.py +24 -0
  170. aiecs/infrastructure/persistence/context_engine_client.py +187 -0
  171. aiecs/infrastructure/persistence/database_manager.py +333 -0
  172. aiecs/infrastructure/persistence/file_storage.py +754 -0
  173. aiecs/infrastructure/persistence/redis_client.py +220 -0
  174. aiecs/llm/__init__.py +86 -0
  175. aiecs/llm/callbacks/__init__.py +11 -0
  176. aiecs/llm/callbacks/custom_callbacks.py +264 -0
  177. aiecs/llm/client_factory.py +420 -0
  178. aiecs/llm/clients/__init__.py +33 -0
  179. aiecs/llm/clients/base_client.py +193 -0
  180. aiecs/llm/clients/googleai_client.py +181 -0
  181. aiecs/llm/clients/openai_client.py +131 -0
  182. aiecs/llm/clients/vertex_client.py +437 -0
  183. aiecs/llm/clients/xai_client.py +184 -0
  184. aiecs/llm/config/__init__.py +51 -0
  185. aiecs/llm/config/config_loader.py +275 -0
  186. aiecs/llm/config/config_validator.py +236 -0
  187. aiecs/llm/config/model_config.py +151 -0
  188. aiecs/llm/utils/__init__.py +10 -0
  189. aiecs/llm/utils/validate_config.py +91 -0
  190. aiecs/main.py +363 -0
  191. aiecs/scripts/__init__.py +3 -0
  192. aiecs/scripts/aid/VERSION_MANAGEMENT.md +97 -0
  193. aiecs/scripts/aid/__init__.py +19 -0
  194. aiecs/scripts/aid/version_manager.py +215 -0
  195. aiecs/scripts/dependance_check/DEPENDENCY_SYSTEM_SUMMARY.md +242 -0
  196. aiecs/scripts/dependance_check/README_DEPENDENCY_CHECKER.md +310 -0
  197. aiecs/scripts/dependance_check/__init__.py +17 -0
  198. aiecs/scripts/dependance_check/dependency_checker.py +938 -0
  199. aiecs/scripts/dependance_check/dependency_fixer.py +391 -0
  200. aiecs/scripts/dependance_check/download_nlp_data.py +396 -0
  201. aiecs/scripts/dependance_check/quick_dependency_check.py +270 -0
  202. aiecs/scripts/dependance_check/setup_nlp_data.sh +217 -0
  203. aiecs/scripts/dependance_patch/__init__.py +7 -0
  204. aiecs/scripts/dependance_patch/fix_weasel/README_WEASEL_PATCH.md +126 -0
  205. aiecs/scripts/dependance_patch/fix_weasel/__init__.py +11 -0
  206. aiecs/scripts/dependance_patch/fix_weasel/fix_weasel_validator.py +128 -0
  207. aiecs/scripts/dependance_patch/fix_weasel/fix_weasel_validator.sh +82 -0
  208. aiecs/scripts/dependance_patch/fix_weasel/patch_weasel_library.sh +188 -0
  209. aiecs/scripts/dependance_patch/fix_weasel/run_weasel_patch.sh +41 -0
  210. aiecs/scripts/tools_develop/README.md +449 -0
  211. aiecs/scripts/tools_develop/TOOL_AUTO_DISCOVERY.md +234 -0
  212. aiecs/scripts/tools_develop/__init__.py +21 -0
  213. aiecs/scripts/tools_develop/check_type_annotations.py +259 -0
  214. aiecs/scripts/tools_develop/validate_tool_schemas.py +422 -0
  215. aiecs/scripts/tools_develop/verify_tools.py +356 -0
  216. aiecs/tasks/__init__.py +1 -0
  217. aiecs/tasks/worker.py +172 -0
  218. aiecs/tools/__init__.py +299 -0
  219. aiecs/tools/apisource/__init__.py +99 -0
  220. aiecs/tools/apisource/intelligence/__init__.py +19 -0
  221. aiecs/tools/apisource/intelligence/data_fusion.py +381 -0
  222. aiecs/tools/apisource/intelligence/query_analyzer.py +413 -0
  223. aiecs/tools/apisource/intelligence/search_enhancer.py +388 -0
  224. aiecs/tools/apisource/monitoring/__init__.py +9 -0
  225. aiecs/tools/apisource/monitoring/metrics.py +303 -0
  226. aiecs/tools/apisource/providers/__init__.py +115 -0
  227. aiecs/tools/apisource/providers/base.py +664 -0
  228. aiecs/tools/apisource/providers/census.py +401 -0
  229. aiecs/tools/apisource/providers/fred.py +564 -0
  230. aiecs/tools/apisource/providers/newsapi.py +412 -0
  231. aiecs/tools/apisource/providers/worldbank.py +357 -0
  232. aiecs/tools/apisource/reliability/__init__.py +12 -0
  233. aiecs/tools/apisource/reliability/error_handler.py +375 -0
  234. aiecs/tools/apisource/reliability/fallback_strategy.py +391 -0
  235. aiecs/tools/apisource/tool.py +850 -0
  236. aiecs/tools/apisource/utils/__init__.py +9 -0
  237. aiecs/tools/apisource/utils/validators.py +338 -0
  238. aiecs/tools/base_tool.py +201 -0
  239. aiecs/tools/docs/__init__.py +121 -0
  240. aiecs/tools/docs/ai_document_orchestrator.py +599 -0
  241. aiecs/tools/docs/ai_document_writer_orchestrator.py +2403 -0
  242. aiecs/tools/docs/content_insertion_tool.py +1333 -0
  243. aiecs/tools/docs/document_creator_tool.py +1317 -0
  244. aiecs/tools/docs/document_layout_tool.py +1166 -0
  245. aiecs/tools/docs/document_parser_tool.py +994 -0
  246. aiecs/tools/docs/document_writer_tool.py +1818 -0
  247. aiecs/tools/knowledge_graph/__init__.py +17 -0
  248. aiecs/tools/knowledge_graph/graph_reasoning_tool.py +734 -0
  249. aiecs/tools/knowledge_graph/graph_search_tool.py +923 -0
  250. aiecs/tools/knowledge_graph/kg_builder_tool.py +476 -0
  251. aiecs/tools/langchain_adapter.py +542 -0
  252. aiecs/tools/schema_generator.py +275 -0
  253. aiecs/tools/search_tool/__init__.py +100 -0
  254. aiecs/tools/search_tool/analyzers.py +589 -0
  255. aiecs/tools/search_tool/cache.py +260 -0
  256. aiecs/tools/search_tool/constants.py +128 -0
  257. aiecs/tools/search_tool/context.py +216 -0
  258. aiecs/tools/search_tool/core.py +749 -0
  259. aiecs/tools/search_tool/deduplicator.py +123 -0
  260. aiecs/tools/search_tool/error_handler.py +271 -0
  261. aiecs/tools/search_tool/metrics.py +371 -0
  262. aiecs/tools/search_tool/rate_limiter.py +178 -0
  263. aiecs/tools/search_tool/schemas.py +277 -0
  264. aiecs/tools/statistics/__init__.py +80 -0
  265. aiecs/tools/statistics/ai_data_analysis_orchestrator.py +643 -0
  266. aiecs/tools/statistics/ai_insight_generator_tool.py +505 -0
  267. aiecs/tools/statistics/ai_report_orchestrator_tool.py +694 -0
  268. aiecs/tools/statistics/data_loader_tool.py +564 -0
  269. aiecs/tools/statistics/data_profiler_tool.py +658 -0
  270. aiecs/tools/statistics/data_transformer_tool.py +573 -0
  271. aiecs/tools/statistics/data_visualizer_tool.py +495 -0
  272. aiecs/tools/statistics/model_trainer_tool.py +487 -0
  273. aiecs/tools/statistics/statistical_analyzer_tool.py +459 -0
  274. aiecs/tools/task_tools/__init__.py +86 -0
  275. aiecs/tools/task_tools/chart_tool.py +732 -0
  276. aiecs/tools/task_tools/classfire_tool.py +922 -0
  277. aiecs/tools/task_tools/image_tool.py +447 -0
  278. aiecs/tools/task_tools/office_tool.py +684 -0
  279. aiecs/tools/task_tools/pandas_tool.py +635 -0
  280. aiecs/tools/task_tools/report_tool.py +635 -0
  281. aiecs/tools/task_tools/research_tool.py +392 -0
  282. aiecs/tools/task_tools/scraper_tool.py +715 -0
  283. aiecs/tools/task_tools/stats_tool.py +688 -0
  284. aiecs/tools/temp_file_manager.py +130 -0
  285. aiecs/tools/tool_executor/__init__.py +37 -0
  286. aiecs/tools/tool_executor/tool_executor.py +881 -0
  287. aiecs/utils/LLM_output_structor.py +445 -0
  288. aiecs/utils/__init__.py +34 -0
  289. aiecs/utils/base_callback.py +47 -0
  290. aiecs/utils/cache_provider.py +695 -0
  291. aiecs/utils/execution_utils.py +184 -0
  292. aiecs/utils/logging.py +1 -0
  293. aiecs/utils/prompt_loader.py +14 -0
  294. aiecs/utils/token_usage_repository.py +323 -0
  295. aiecs/ws/__init__.py +0 -0
  296. aiecs/ws/socket_server.py +52 -0
  297. aiecs-1.5.1.dist-info/METADATA +608 -0
  298. aiecs-1.5.1.dist-info/RECORD +302 -0
  299. aiecs-1.5.1.dist-info/WHEEL +5 -0
  300. aiecs-1.5.1.dist-info/entry_points.txt +10 -0
  301. aiecs-1.5.1.dist-info/licenses/LICENSE +225 -0
  302. aiecs-1.5.1.dist-info/top_level.txt +1 -0
@@ -0,0 +1,635 @@
1
+ from io import StringIO
2
+ import pandas as pd
3
+ import numpy as np
4
+ from typing import List, Dict, Union, Optional
5
+ from pydantic import BaseModel, ConfigDict, Field
6
+ import logging
7
+
8
+ from aiecs.tools.base_tool import BaseTool
9
+ from aiecs.tools import register_tool
10
+
11
+ # Custom exceptions
12
+
13
+
14
class PandasToolError(Exception):
    """Base exception for PandasTool errors.

    The other exception classes declared next to it in this module subclass
    it, so handling ``PandasToolError`` also handles those.
    """
16
+
17
+
18
class InputValidationError(PandasToolError):
    """Input validation error.

    Raised by ``PandasTool._validate_df`` when the records are empty or
    cannot be turned into a DataFrame, and by
    ``PandasTool._validate_columns`` when requested columns are missing.
    """
20
+
21
+
22
class DataFrameError(PandasToolError):
    """DataFrame operation error.

    Raised by the PandasTool methods when an underlying pandas read, write
    or transformation call fails; the original exception text is embedded
    in the message.
    """
24
+
25
+
26
class SecurityError(PandasToolError):
    """Security-related error.

    NOTE(review): no raise site for this exception is visible in the part
    of the module reviewed here; confirm where (or whether) it is used.
    """
28
+
29
+
30
class ValidationError(PandasToolError):
    """Validation error.

    Raised by the PandasTool methods for invalid argument values, such as
    an unsupported ``file_type``, ``how`` or ``join_type``.
    """
32
+
33
+
34
+ @register_tool("pandas")
35
+ class PandasTool(BaseTool):
36
+ """
37
+ Tool encapsulating pandas functionality for data processing, supporting 30+ operations including:
38
+ - Data reading/writing (CSV, JSON, Excel).
39
+ - Descriptive statistics (summary, describe, value_counts).
40
+ - Filtering and selection (filter, select_columns, drop_columns).
41
+ - Grouping and aggregation (groupby, pivot_table).
42
+ - Merging and concatenation (merge, concat).
43
+ - Data transformation (sort_values, rename_columns, replace_values, fill_na, astype, apply).
44
+ - Data reshaping (melt, pivot, stack, unstack).
45
+ - Data cleaning (strip_strings, to_numeric, to_datetime).
46
+ - Statistical computations (mean, sum, count, min, max).
47
+ - Window functions (rolling).
48
+ - Sampling and viewing (head, tail, sample).
49
+
50
+ Inherits from BaseTool to leverage ToolExecutor for caching, concurrency, and error handling.
51
+ """
52
+
53
+ # Configuration schema
54
class Config(BaseModel):
    """Settings model for the pandas tool.

    ``PandasTool.__init__`` instantiates it from the optional ``config``
    dict, so every field below can be overridden per tool instance.
    """

    # NOTE(review): ``env_prefix`` is a pydantic-settings option; on a plain
    # ``BaseModel`` it presumably has no effect, so these fields are likely
    # not populated from PANDAS_TOOL_* environment variables — confirm
    # whether a settings base class was intended.
    model_config = ConfigDict(env_prefix="PANDAS_TOOL_")

    # Passed as ``sep`` to the CSV readers and writer.
    csv_delimiter: str = Field(default=",", description="Delimiter for CSV files")
    # Passed as ``encoding`` to the CSV readers/writer and to ``open()``.
    encoding: str = Field(default="utf-8", description="Encoding for file operations")
    # Not referenced in the portion of the file reviewed here.
    default_agg: Dict[str, str] = Field(
        default={"numeric": "mean", "object": "count"},
        description="Default aggregation functions",
    )
    # Used as pandas ``chunksize`` and, in ``read_file``, as the line-count
    # threshold above which a CSV file is read in chunks.
    chunk_size: int = Field(default=10000, description="Chunk size for large file processing")
    # Compared against ``len(csv_str)`` (characters) in ``read_csv``.
    max_csv_size: int = Field(
        default=1000000, description="Threshold for chunked CSV processing"
    )
    # Not referenced in the portion of the file reviewed here.
    allowed_file_extensions: List[str] = Field(
        default=[".csv", ".xlsx", ".json"],
        description="Allowed file extensions",
    )
73
+
74
def __init__(self, config: Optional[Dict] = None):
    """
    Set up the tool: base-class initialisation, settings, and logger.

    Args:
        config (Dict, optional): Overrides for the fields of ``Config``.
            ``None`` is treated as "no overrides".

    Raises:
        ValueError: If config is invalid.
    """
    super().__init__(config)

    # Build the settings object from the overrides (or from defaults only).
    overrides = config or {}
    self.config = self.Config(**overrides)

    self.logger = logging.getLogger(__name__)
    # Attach a stream handler only once per module logger so repeated
    # instantiation does not duplicate log output.
    if not self.logger.handlers:
        stream_handler = logging.StreamHandler()
        log_format = logging.Formatter("%(asctime)s %(levelname)s %(message)s")
        stream_handler.setFormatter(log_format)
        self.logger.addHandler(stream_handler)
    self.logger.setLevel(logging.INFO)
95
+
96
+ def _validate_df(self, records: List[Dict]) -> pd.DataFrame:
97
+ """
98
+ Convert records to a DataFrame and validate.
99
+
100
+ Args:
101
+ records (List[Dict]): List of records to convert.
102
+
103
+ Returns:
104
+ pd.DataFrame: Validated DataFrame.
105
+
106
+ Raises:
107
+ InputValidationError: If records are empty or invalid.
108
+ """
109
+ if not records:
110
+ raise InputValidationError("Records list is empty")
111
+ try:
112
+ df = pd.DataFrame(records)
113
+ if df.empty:
114
+ raise InputValidationError("DataFrame is empty")
115
+ return df
116
+ except Exception as e:
117
+ raise InputValidationError(f"Failed to create DataFrame: {e}")
118
+
119
+ def _validate_columns(self, df: pd.DataFrame, columns: List[str]) -> None:
120
+ """
121
+ Validate column names exist in DataFrame.
122
+
123
+ Args:
124
+ df (pd.DataFrame): DataFrame to validate.
125
+ columns (List[str]): Columns to check.
126
+
127
+ Raises:
128
+ InputValidationError: If columns are not found.
129
+ """
130
+ if not columns:
131
+ return
132
+ available_columns = set(df.columns)
133
+ missing = [col for col in columns if col not in available_columns]
134
+ if missing:
135
+ raise InputValidationError(
136
+ f"Columns not found: {missing}. Available columns: {list(available_columns)}"
137
+ )
138
+
139
+ def _to_json_serializable(
140
+ self, result: Union[pd.DataFrame, pd.Series, Dict]
141
+ ) -> Union[List[Dict], Dict]:
142
+ """
143
+ Convert result to JSON-serializable format.
144
+
145
+ Args:
146
+ result (Union[pd.DataFrame, pd.Series, Dict]): Result to convert.
147
+
148
+ Returns:
149
+ Union[List[Dict], Dict]: JSON-serializable result.
150
+ """
151
+ if isinstance(result, pd.DataFrame):
152
+ for col in result.select_dtypes(include=["datetime64"]).columns:
153
+ result[col] = result[col].dt.strftime("%Y-%m-%d %H:%M:%S")
154
+ return result.to_dict(orient="records")
155
+ elif isinstance(result, pd.Series):
156
+ if pd.api.types.is_datetime64_any_dtype(result):
157
+ result = result.dt.strftime("%Y-%m-%d %H:%M:%S")
158
+ return result.to_dict()
159
+ elif isinstance(result, dict):
160
+
161
+ def convert_value(v):
162
+ if isinstance(v, (np.floating, np.integer)):
163
+ return float(v)
164
+ elif isinstance(v, np.bool_):
165
+ return bool(v)
166
+ elif isinstance(v, (pd.Timestamp, np.datetime64)):
167
+ return str(v)
168
+ elif isinstance(v, np.ndarray):
169
+ return v.tolist()
170
+ elif pd.isna(v):
171
+ return None
172
+ return v
173
+
174
+ return {k: convert_value(v) for k, v in result.items()}
175
+ return result
176
+
177
def read_csv(self, csv_str: str) -> List[Dict]:
    """
    Parse CSV text into JSON-serializable records.

    Text longer than ``config.max_csv_size`` characters is parsed in chunks
    of ``config.chunk_size`` rows that are concatenated afterwards.

    Args:
        csv_str (str): CSV content.

    Returns:
        List[Dict]: One dict per parsed row.

    Raises:
        DataFrameError: If any step of parsing or conversion fails.
    """
    try:
        oversized = len(csv_str) > self.config.max_csv_size
        if oversized:
            pieces = list(
                pd.read_csv(
                    StringIO(csv_str),
                    sep=self.config.csv_delimiter,
                    encoding=self.config.encoding,
                    chunksize=self.config.chunk_size,
                )
            )
            frame = pd.concat(pieces)
        else:
            frame = pd.read_csv(
                StringIO(csv_str),
                sep=self.config.csv_delimiter,
                encoding=self.config.encoding,
            )
        return self._to_json_serializable(frame)
    except Exception as e:
        raise DataFrameError(f"Failed to read CSV: {e}")
199
+
200
def read_json(self, json_str: str) -> List[Dict]:
    """
    Parse JSON text into JSON-serializable records.

    Args:
        json_str (str): JSON content readable by ``pd.read_json``.

    Returns:
        List[Dict]: One dict per parsed row.

    Raises:
        DataFrameError: If parsing or conversion fails.
    """
    try:
        buffer = StringIO(json_str)
        frame = pd.read_json(buffer)
        return self._to_json_serializable(frame)
    except Exception as e:
        raise DataFrameError(f"Failed to read JSON: {e}")
207
+
208
def read_file(self, file_path: str, file_type: str = "csv") -> List[Dict]:
    """
    Read data from a file (CSV, Excel, JSON).

    CSV files with more lines than ``config.chunk_size`` are read in chunks
    of that many rows and concatenated.

    Args:
        file_path (str): Path of the file to read.
        file_type (str): One of ``"csv"``, ``"excel"`` or ``"json"``.

    Returns:
        List[Dict]: One dict per row.

    Raises:
        ValidationError: If ``file_type`` is not supported.
        DataFrameError: If reading or conversion fails.
    """
    try:
        if file_type == "csv":
            # Count lines inside a context manager so the handle is closed
            # deterministically (previously it was left open).
            with open(file_path, "r", encoding=self.config.encoding) as handle:
                line_count = sum(1 for _ in handle)
            if line_count > self.config.chunk_size:
                chunks = list(
                    pd.read_csv(
                        file_path,
                        sep=self.config.csv_delimiter,
                        encoding=self.config.encoding,
                        chunksize=self.config.chunk_size,
                    )
                )
                df = pd.concat(chunks)
            else:
                df = pd.read_csv(
                    file_path,
                    sep=self.config.csv_delimiter,
                    encoding=self.config.encoding,
                )
        elif file_type == "excel":
            df = pd.read_excel(file_path)
        elif file_type == "json":
            df = pd.read_json(file_path)
        else:
            raise ValidationError(f"Unsupported file type: {file_type}")
        return self._to_json_serializable(df)
    except ValidationError:
        # Argument errors are surfaced unchanged rather than wrapped.
        raise
    except Exception as e:
        raise DataFrameError(f"Failed to read file: {e}") from e
240
+
241
def write_file(self, records: List[Dict], file_path: str, file_type: str = "csv") -> Dict:
    """
    Write records to a file (CSV, Excel, JSON).

    Args:
        records (List[Dict]): Rows to write.
        file_path (str): Destination path.
        file_type (str): One of ``"csv"``, ``"excel"`` or ``"json"``.

    Returns:
        Dict: ``{"success": True, "file_path": ..., "rows": ...}``.

    Raises:
        InputValidationError: If the records cannot form a DataFrame.
        ValidationError: If ``file_type`` is not supported.
        DataFrameError: If writing fails.
    """
    df = self._validate_df(records)
    try:
        if file_type == "csv":
            df.to_csv(
                file_path,
                index=False,
                sep=self.config.csv_delimiter,
                encoding=self.config.encoding,
            )
        elif file_type == "excel":
            df.to_excel(file_path, index=False)
        elif file_type == "json":
            df.to_json(file_path, orient="records")
        else:
            raise ValidationError(f"Unsupported file type: {file_type}")
        return {"success": True, "file_path": file_path, "rows": len(df)}
    except ValidationError:
        # Surface argument errors unchanged, as read_file does, instead of
        # re-wrapping them as a write failure.
        raise
    except Exception as e:
        raise DataFrameError(f"Failed to write file: {e}") from e
261
+
262
def summary(self, records: List[Dict]) -> Dict:
    """
    Return ``DataFrame.describe(include="all")`` for the records as a dict.

    Args:
        records (List[Dict]): Rows to summarise.

    Returns:
        Dict: Statistics keyed by column name.
    """
    frame = self._validate_df(records)
    stats = frame.describe(include="all")
    return self._to_json_serializable(stats.to_dict())
267
+
268
def describe(self, records: List[Dict], columns: Optional[List[str]] = None) -> Dict:
    """
    Return ``DataFrame.describe()`` statistics as a dict.

    Args:
        records (List[Dict]): Rows to describe.
        columns (List[str], optional): Restrict the statistics to these
            columns; all columns are considered when omitted.

    Returns:
        Dict: Statistics keyed by column name.
    """
    frame = self._validate_df(records)
    if not columns:
        target = frame
    else:
        self._validate_columns(frame, columns)
        target = frame[columns]
    stats = target.describe()
    return self._to_json_serializable(stats.to_dict())
276
+
277
def value_counts(self, records: List[Dict], columns: List[str]) -> Dict:
    """
    Count occurrences of each distinct value in the given columns.

    Args:
        records (List[Dict]): Rows to analyse.
        columns (List[str]): Columns to count values for.

    Returns:
        Dict: Maps each column name to a ``{value: count}`` dict.
    """
    frame = self._validate_df(records)
    self._validate_columns(frame, columns)
    counts = {}
    for name in columns:
        counts[name] = frame[name].value_counts().to_dict()
    return self._to_json_serializable(counts)
283
+
284
def filter(self, records: List[Dict], condition: str) -> List[Dict]:
    """
    Keep only the rows that satisfy a ``DataFrame.query`` expression.

    Args:
        records (List[Dict]): Rows to filter.
        condition (str): Expression passed to ``DataFrame.query``.

    Returns:
        List[Dict]: The matching rows.

    Raises:
        DataFrameError: If the expression cannot be evaluated.
    """
    frame = self._validate_df(records)
    try:
        # NOTE(review): ``condition`` is evaluated by pandas; if it can come
        # from untrusted callers, confirm that exposing the query language
        # is acceptable.
        matched = frame.query(condition, engine="python")
        return self._to_json_serializable(matched)
    except Exception as e:
        raise DataFrameError(f"Invalid query condition: {e}")
292
+
293
def select_columns(self, records: List[Dict], columns: List[str]) -> List[Dict]:
    """
    Project the records onto the given columns.

    Args:
        records (List[Dict]): Input rows.
        columns (List[str]): Columns to keep.

    Returns:
        List[Dict]: Rows containing only the selected columns.
    """
    frame = self._validate_df(records)
    self._validate_columns(frame, columns)
    projected = frame[columns]
    return self._to_json_serializable(projected)
298
+
299
def drop_columns(self, records: List[Dict], columns: List[str]) -> List[Dict]:
    """
    Remove the given columns from the records.

    Args:
        records (List[Dict]): Input rows.
        columns (List[str]): Columns to remove; each must exist.

    Returns:
        List[Dict]: Rows without the removed columns.
    """
    frame = self._validate_df(records)
    self._validate_columns(frame, columns)
    remaining = frame.drop(columns=columns)
    return self._to_json_serializable(remaining)
304
+
305
def drop_duplicates(
    self, records: List[Dict], columns: Optional[List[str]] = None
) -> List[Dict]:
    """
    Remove duplicate rows.

    Args:
        records (List[Dict]): Input rows.
        columns (List[str], optional): Columns that define a duplicate;
            passed to pandas as ``subset``.

    Returns:
        List[Dict]: Rows with duplicates removed.
    """
    frame = self._validate_df(records)
    if columns:
        self._validate_columns(frame, columns)
    unique_rows = frame.drop_duplicates(subset=columns)
    return self._to_json_serializable(unique_rows)
313
+
314
def dropna(self, records: List[Dict], axis: int = 0, how: str = "any") -> List[Dict]:
    """
    Drop rows or columns that contain missing values.

    Args:
        records (List[Dict]): Input rows.
        axis (int): Passed to ``DataFrame.dropna`` as ``axis``.
        how (str): ``"any"`` or ``"all"``.

    Returns:
        List[Dict]: Rows remaining after the drop.

    Raises:
        ValidationError: If ``how`` is neither ``"any"`` nor ``"all"``.
    """
    frame = self._validate_df(records)
    allowed_modes = ("any", "all")
    if how not in allowed_modes:
        raise ValidationError("how must be 'any' or 'all'")
    cleaned = frame.dropna(axis=axis, how=how)
    return self._to_json_serializable(cleaned)
320
+
321
def groupby(self, records: List[Dict], by: List[str], agg: Dict[str, str]) -> List[Dict]:
    """
    Group the records and aggregate per group.

    Args:
        records (List[Dict]): Input rows.
        by (List[str]): Columns to group on.
        agg (Dict[str, str]): Maps a column name to its aggregation.

    Returns:
        List[Dict]: One row per group, with group keys as regular columns.

    Raises:
        DataFrameError: If grouping or aggregation fails.
    """
    frame = self._validate_df(records)
    required = by + list(agg)
    self._validate_columns(frame, required)
    try:
        grouped = frame.groupby(by).agg(agg)
        flattened = grouped.reset_index()
        return self._to_json_serializable(flattened)
    except Exception as e:
        raise DataFrameError(f"Groupby failed: {e}")
330
+
331
def pivot_table(
    self,
    records: List[Dict],
    values: List[str],
    index: List[str],
    columns: List[str],
    aggfunc: str = "mean",
) -> List[Dict]:
    """
    Build a pivot table from the records.

    Args:
        records (List[Dict]): Input rows.
        values (List[str]): Columns to aggregate.
        index (List[str]): Columns used as the pivot index.
        columns (List[str]): Columns whose values become pivot columns.
        aggfunc (str): Aggregation passed to ``pd.pivot_table``.

    Returns:
        List[Dict]: Pivot rows, with the index restored as columns.

    Raises:
        DataFrameError: If pivoting fails.
    """
    frame = self._validate_df(records)
    referenced = values + index + columns
    self._validate_columns(frame, referenced)
    try:
        pivoted = pd.pivot_table(
            frame,
            values=values,
            index=index,
            columns=columns,
            aggfunc=aggfunc,
        )
        flattened = pivoted.reset_index()
        return self._to_json_serializable(flattened)
    except Exception as e:
        raise DataFrameError(f"Pivot table failed: {e}")
353
+
354
def merge(
    self,
    records: List[Dict],
    records_right: List[Dict],
    on: Union[str, List[str]],
    join_type: str = "inner",
) -> List[Dict]:
    """
    Join two record sets on shared key column(s).

    Args:
        records (List[Dict]): Left-hand rows.
        records_right (List[Dict]): Right-hand rows.
        on (Union[str, List[str]]): Key column or columns present in both.
        join_type (str): ``"inner"``, ``"left"``, ``"right"`` or ``"outer"``.

    Returns:
        List[Dict]: The joined rows.

    Raises:
        ValidationError: If ``join_type`` is not one of the allowed values.
        DataFrameError: If the merge itself fails.
    """
    left = self._validate_df(records)
    right = self._validate_df(records_right)
    if join_type not in ["inner", "left", "right", "outer"]:
        raise ValidationError("join_type must be one of: inner, left, right, outer")
    # The key columns must exist on both sides; the list is rebuilt per call
    # so each validation receives its own object.
    for side in (left, right):
        self._validate_columns(side, [on] if isinstance(on, str) else on)
    try:
        joined = left.merge(right, on=on, how=join_type)
        return self._to_json_serializable(joined)
    except Exception as e:
        raise DataFrameError(f"Merge failed: {e}")
373
+
374
def concat(self, records_list: List[List[Dict]], axis: int = 0) -> List[Dict]:
    """
    Concatenate several record sets.

    Args:
        records_list (List[List[Dict]]): Record sets to combine; neither the
            outer list nor any inner list may be empty.
        axis (int): Passed to ``pd.concat`` as ``axis``.

    Returns:
        List[Dict]: The combined rows.

    Raises:
        ValidationError: If the outer list or any inner list is empty.
        DataFrameError: If concatenation fails.
    """
    if not (records_list and all(records_list)):
        raise ValidationError("Records list is empty")
    frames = []
    for records in records_list:
        frames.append(self._validate_df(records))
    try:
        combined = pd.concat(frames, axis=axis, ignore_index=True)
        return self._to_json_serializable(combined)
    except Exception as e:
        raise DataFrameError(f"Concat failed: {e}")
384
+
385
def sort_values(
    self,
    records: List[Dict],
    sort_by: List[str],
    ascending: Union[bool, List[bool]] = True,
) -> List[Dict]:
    """
    Order the records by one or more columns.

    Args:
        records (List[Dict]): Input rows.
        sort_by (List[str]): Columns to sort on.
        ascending (Union[bool, List[bool]]): Passed to
            ``DataFrame.sort_values`` as ``ascending``.

    Returns:
        List[Dict]: The sorted rows.

    Raises:
        DataFrameError: If sorting fails.
    """
    frame = self._validate_df(records)
    self._validate_columns(frame, sort_by)
    try:
        ordered = frame.sort_values(by=sort_by, ascending=ascending)
        return self._to_json_serializable(ordered)
    except Exception as e:
        raise DataFrameError(f"Sort failed: {e}")
399
+
400
def rename_columns(self, records: List[Dict], mapping: Dict[str, str]) -> List[Dict]:
    """Rename columns according to ``mapping`` (old name -> new name).

    Every key of ``mapping`` is validated as an existing column before renaming.
    """
    frame = self._validate_df(records)
    old_names = [*mapping.keys()]
    self._validate_columns(frame, old_names)
    renamed = frame.rename(columns=mapping)
    return self._to_json_serializable(renamed)
405
+
406
def replace_values(
    self,
    records: List[Dict],
    to_replace: Dict,
    columns: Optional[List[str]] = None,
) -> List[Dict]:
    """Replace values in the records, optionally only within some columns.

    Args:
        records: Input records.
        to_replace: Replacement specification handed to ``DataFrame.replace``.
        columns: Columns to restrict the replacement to. When omitted (or
            empty) the replacement is applied to every column.

    Returns:
        All columns of the input, with replacements applied where requested.
    """
    df = self._validate_df(records)
    if columns:
        self._validate_columns(df, columns)
        # Update the chosen columns in place rather than slicing the frame,
        # so that untouched columns are still part of the result (this is
        # the same approach fill_na takes for its column subset).
        df[columns] = df[columns].replace(to_replace)
    else:
        df = df.replace(to_replace)
    return self._to_json_serializable(df)
418
+
419
def fill_na(
    self,
    records: List[Dict],
    value: Union[str, int, float],
    columns: Optional[List[str]] = None,
) -> List[Dict]:
    """Fill missing values with ``value``.

    When ``columns`` is given (and non-empty) only those columns are filled;
    otherwise the whole frame is filled.
    """
    frame = self._validate_df(records)
    if not columns:
        return self._to_json_serializable(frame.fillna(value))
    self._validate_columns(frame, columns)
    frame[columns] = frame[columns].fillna(value)
    return self._to_json_serializable(frame)
433
+
434
def astype(self, records: List[Dict], dtypes: Dict[str, str]) -> List[Dict]:
    """Cast columns to new dtypes.

    Args:
        records: Input records.
        dtypes: Mapping of column name to target dtype; every key must be an
            existing column.

    Raises:
        DataFrameError: If the cast or result conversion fails.
    """
    frame = self._validate_df(records)
    targets = [*dtypes.keys()]
    self._validate_columns(frame, targets)
    try:
        converted = frame.astype(dtypes)
        return self._to_json_serializable(converted)
    except Exception as e:
        raise DataFrameError(f"Type conversion failed: {e}")
443
+
444
def apply(
    self, records: List[Dict], func: str, columns: List[str], axis: int = 0
) -> List[Dict]:
    """Apply one of a fixed set of named element-wise functions to columns.

    Args:
        records: Input records.
        func: Name of the transformation. Supported: upper, lower, strip,
            capitalize, title, len, abs, round, ceil, floor, int, float,
            str, bool, date_only, year, month, day.
        columns: Columns to transform; validated against the frame first.
        axis: Kept for interface compatibility. Every supported function
            works on a single cell, so the result does not depend on it.

    Returns:
        The records with the selected columns transformed.

    Raises:
        DataFrameError: If ``func`` is not a supported name or the
            transformation fails.
    """
    df = self._validate_df(records)
    self._validate_columns(df, columns)

    def on_text(method):
        # String methods only touch str cells; anything else passes through.
        return lambda x: getattr(x, method)() if isinstance(x, str) else x

    def on_number(convert, keep_other):
        # Numeric helpers operate on non-null, non-string cells. Other cells
        # are either returned unchanged or mapped to None.
        def wrapped(x):
            if pd.notna(x) and not isinstance(x, str):
                return convert(float(x))
            return x if keep_other else None

        return wrapped

    def on_timestamp(attr):
        return lambda x: getattr(x, attr) if isinstance(x, pd.Timestamp) else None

    allowed_funcs = {
        "upper": on_text("upper"),
        "lower": on_text("lower"),
        "strip": on_text("strip"),
        "capitalize": on_text("capitalize"),
        "title": on_text("title"),
        "len": lambda x: len(str(x)) if pd.notna(x) else 0,
        "abs": on_number(abs, keep_other=True),
        "round": on_number(round, keep_other=True),
        "ceil": on_number(np.ceil, keep_other=True),
        "floor": on_number(np.floor, keep_other=True),
        "int": on_number(int, keep_other=False),
        "float": on_number(float, keep_other=False),
        "str": lambda x: str(x) if pd.notna(x) else "",
        "bool": lambda x: bool(x) if pd.notna(x) else False,
        "date_only": lambda x: (x.date() if isinstance(x, pd.Timestamp) else x),
        "year": on_timestamp("year"),
        "month": on_timestamp("month"),
        "day": on_timestamp("day"),
    }
    # Reject unknown names up front with a readable message instead of
    # letting a KeyError be wrapped as "Apply failed: 'name'".
    if not isinstance(func, str) or func not in allowed_funcs:
        raise DataFrameError(
            f"Apply failed: function '{func}' not allowed. "
            f"Available: {sorted(allowed_funcs)}"
        )
    transform = allowed_funcs[func]
    try:
        # The transforms are scalar functions. Handing them whole rows (as
        # DataFrame.apply(axis=1) would) makes the isinstance checks fail and
        # the pd.notna truth tests ambiguous, so always go cell by cell.
        for col in columns:
            df[col] = df[col].apply(transform)
        return self._to_json_serializable(df)
    except Exception as e:
        raise DataFrameError(f"Apply failed: {e}") from e
481
+
482
def melt(self, records: List[Dict], id_vars: List[str], value_vars: List[str]) -> List[Dict]:
    """Reshape the records from wide to long format via ``pd.melt``.

    Args:
        records: Input records.
        id_vars: Columns kept as identifiers.
        value_vars: Columns unpivoted into rows.

    Raises:
        DataFrameError: If melting or result conversion fails.
    """
    frame = self._validate_df(records)
    self._validate_columns(frame, id_vars + value_vars)
    try:
        long_form = pd.melt(frame, id_vars=id_vars, value_vars=value_vars)
        return self._to_json_serializable(long_form)
    except Exception as e:
        raise DataFrameError(f"Melt failed: {e}")
491
+
492
def pivot(self, records: List[Dict], index: str, columns: str, values: str) -> List[Dict]:
    """Reshape the records from long to wide format via ``DataFrame.pivot``.

    The pivoted index is turned back into a regular column before conversion.

    Raises:
        DataFrameError: If pivoting or result conversion fails.
    """
    frame = self._validate_df(records)
    self._validate_columns(frame, [index, columns, values])
    try:
        wide = frame.pivot(index=index, columns=columns, values=values)
        flattened = wide.reset_index()
        return self._to_json_serializable(flattened)
    except Exception as e:
        raise DataFrameError(f"Pivot failed: {e}")
501
+
502
def stack(self, records: List[Dict]) -> List[Dict]:
    """Stack the frame's columns into rows and flatten the resulting index.

    Raises:
        DataFrameError: If stacking or result conversion fails.
    """
    frame = self._validate_df(records)
    try:
        stacked = frame.stack()
        return self._to_json_serializable(stacked.reset_index())
    except Exception as e:
        raise DataFrameError(f"Stack failed: {e}")
510
+
511
def unstack(self, records: List[Dict], level: Union[int, str] = -1) -> List[Dict]:
    """Unstack the frame at ``level`` and flatten the resulting index.

    Raises:
        DataFrameError: If unstacking or result conversion fails.
    """
    frame = self._validate_df(records)
    try:
        unstacked = frame.unstack(level=level)
        return self._to_json_serializable(unstacked.reset_index())
    except Exception as e:
        raise DataFrameError(f"Unstack failed: {e}")
519
+
520
def strip_strings(self, records: List[Dict], columns: List[str]) -> List[Dict]:
    """Strip leading/trailing whitespace from string cells in the given columns.

    Only object- or string-typed columns are touched. Within them, only
    ``str`` cells are stripped; any other cell (numbers, None, NaN, ...) is
    left exactly as it was.

    Args:
        records: Input records.
        columns: Columns to clean; validated against the frame first.
    """
    df = self._validate_df(records)
    self._validate_columns(df, columns)
    for col in columns:
        series = df[col]
        if pd.api.types.is_object_dtype(series) or pd.api.types.is_string_dtype(series):
            # The ``.str`` accessor would turn non-string cells of a mixed
            # column into NaN (and refuses some object columns outright), so
            # strip per cell and pass non-strings through untouched.
            df[col] = series.map(lambda v: v.strip() if isinstance(v, str) else v)
    return self._to_json_serializable(df)
528
+
529
def to_numeric(self, records: List[Dict], columns: List[str]) -> List[Dict]:
    """Convert the given columns with ``pd.to_numeric(errors="coerce")``.

    Raises:
        DataFrameError: If conversion or result serialization fails.
    """
    frame = self._validate_df(records)
    self._validate_columns(frame, columns)
    try:
        for name in columns:
            converted = pd.to_numeric(frame[name], errors="coerce")
            frame[name] = converted
        return self._to_json_serializable(frame)
    except Exception as e:
        raise DataFrameError(f"To numeric failed: {e}")
539
+
540
def to_datetime(
    self,
    records: List[Dict],
    columns: List[str],
    format: Optional[str] = None,
) -> List[Dict]:
    """Convert the given columns with ``pd.to_datetime(errors="coerce")``.

    Args:
        records: Input records.
        columns: Columns to convert; validated against the frame first.
        format: Optional format string forwarded to ``pd.to_datetime``.

    Raises:
        DataFrameError: If conversion or result serialization fails.
    """
    frame = self._validate_df(records)
    self._validate_columns(frame, columns)
    try:
        for name in columns:
            parsed = pd.to_datetime(frame[name], format=format, errors="coerce")
            frame[name] = parsed
        return self._to_json_serializable(frame)
    except Exception as e:
        raise DataFrameError(f"To datetime failed: {e}")
555
+
556
def mean(self, records: List[Dict], columns: Optional[List[str]] = None) -> Dict:
    """Return the mean of each numeric column.

    If ``columns`` is given (and non-empty), only those columns are considered;
    non-numeric columns are excluded either way.
    """
    frame = self._validate_df(records)
    if columns:
        self._validate_columns(frame, columns)
        frame = frame[columns]
    numeric = frame.select_dtypes(include=np.number)
    return self._to_json_serializable(numeric.mean())
563
+
564
def sum(self, records: List[Dict], columns: Optional[List[str]] = None) -> Dict:
    """Return the sum of each numeric column.

    If ``columns`` is given (and non-empty), only those columns are considered;
    non-numeric columns are excluded either way.
    """
    frame = self._validate_df(records)
    if columns:
        self._validate_columns(frame, columns)
        frame = frame[columns]
    numeric = frame.select_dtypes(include=np.number)
    return self._to_json_serializable(numeric.sum())
571
+
572
def count(self, records: List[Dict], columns: Optional[List[str]] = None) -> Dict:
    """Return ``DataFrame.count()`` for all columns, or only for ``columns``."""
    frame = self._validate_df(records)
    if columns:
        self._validate_columns(frame, columns)
        frame = frame[columns]
    per_column = frame.count()
    return self._to_json_serializable(per_column)
579
+
580
def min(self, records: List[Dict], columns: Optional[List[str]] = None) -> Dict:
    """Return ``DataFrame.min()`` for all columns, or only for ``columns``."""
    frame = self._validate_df(records)
    if columns:
        self._validate_columns(frame, columns)
        frame = frame[columns]
    smallest = frame.min()
    return self._to_json_serializable(smallest)
587
+
588
def max(self, records: List[Dict], columns: Optional[List[str]] = None) -> Dict:
    """Return ``DataFrame.max()`` for all columns, or only for ``columns``."""
    frame = self._validate_df(records)
    if columns:
        self._validate_columns(frame, columns)
        frame = frame[columns]
    largest = frame.max()
    return self._to_json_serializable(largest)
595
+
596
def rolling(
    self,
    records: List[Dict],
    columns: List[str],
    window: int,
    function: str = "mean",
) -> List[Dict]:
    """Add rolling-window aggregates for the numeric columns among ``columns``.

    For each numeric column a new column named ``{col}_{function}_{window}``
    is added; non-numeric columns are skipped without error.

    Args:
        records: Input records.
        columns: Candidate columns; validated against the frame first.
        window: Window size passed to ``Series.rolling``.
        function: One of mean, sum, min, max, std, count, median.

    Raises:
        ValidationError: If ``function`` is not in the allowed list.
        DataFrameError: If the rolling computation or conversion fails.
    """
    frame = self._validate_df(records)
    self._validate_columns(frame, columns)
    allowed_funcs = ["mean", "sum", "min", "max", "std", "count", "median"]
    if function not in allowed_funcs:
        raise ValidationError(f"Function '{function}' not allowed. Available: {allowed_funcs}")
    try:
        for col in columns:
            if not pd.api.types.is_numeric_dtype(frame[col]):
                continue
            windowed = frame[col].rolling(window)
            aggregate = getattr(windowed, function)
            frame[f"{col}_{function}_{window}"] = aggregate()
        return self._to_json_serializable(frame)
    except Exception as e:
        raise DataFrameError(f"Rolling operation failed: {e}")
616
+
617
def head(self, records: List[Dict], n: int = 5) -> List[Dict]:
    """Return the result of ``DataFrame.head(n)`` on the records."""
    frame = self._validate_df(records)
    leading = frame.head(n)
    return self._to_json_serializable(leading)
621
+
622
def tail(self, records: List[Dict], n: int = 5) -> List[Dict]:
    """Return the result of ``DataFrame.tail(n)`` on the records."""
    frame = self._validate_df(records)
    trailing = frame.tail(n)
    return self._to_json_serializable(trailing)
626
+
627
def sample(
    self,
    records: List[Dict],
    n: int = 5,
    random_state: Optional[int] = None,
) -> List[Dict]:
    """Return a random sample of at most ``n`` rows.

    The requested size is capped at the number of rows in the frame.
    ``random_state`` is forwarded to ``DataFrame.sample``.
    """
    frame = self._validate_df(records)
    size = min(n, len(frame))
    picked = frame.sample(n=size, random_state=random_state)
    return self._to_json_serializable(picked)