aiecs-1.5.1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (302)
  1. aiecs/__init__.py +72 -0
  2. aiecs/__main__.py +41 -0
  3. aiecs/aiecs_client.py +469 -0
  4. aiecs/application/__init__.py +10 -0
  5. aiecs/application/executors/__init__.py +10 -0
  6. aiecs/application/executors/operation_executor.py +363 -0
  7. aiecs/application/knowledge_graph/__init__.py +7 -0
  8. aiecs/application/knowledge_graph/builder/__init__.py +37 -0
  9. aiecs/application/knowledge_graph/builder/document_builder.py +375 -0
  10. aiecs/application/knowledge_graph/builder/graph_builder.py +356 -0
  11. aiecs/application/knowledge_graph/builder/schema_mapping.py +531 -0
  12. aiecs/application/knowledge_graph/builder/structured_pipeline.py +443 -0
  13. aiecs/application/knowledge_graph/builder/text_chunker.py +319 -0
  14. aiecs/application/knowledge_graph/extractors/__init__.py +27 -0
  15. aiecs/application/knowledge_graph/extractors/base.py +100 -0
  16. aiecs/application/knowledge_graph/extractors/llm_entity_extractor.py +327 -0
  17. aiecs/application/knowledge_graph/extractors/llm_relation_extractor.py +349 -0
  18. aiecs/application/knowledge_graph/extractors/ner_entity_extractor.py +244 -0
  19. aiecs/application/knowledge_graph/fusion/__init__.py +23 -0
  20. aiecs/application/knowledge_graph/fusion/entity_deduplicator.py +387 -0
  21. aiecs/application/knowledge_graph/fusion/entity_linker.py +343 -0
  22. aiecs/application/knowledge_graph/fusion/knowledge_fusion.py +580 -0
  23. aiecs/application/knowledge_graph/fusion/relation_deduplicator.py +189 -0
  24. aiecs/application/knowledge_graph/pattern_matching/__init__.py +21 -0
  25. aiecs/application/knowledge_graph/pattern_matching/pattern_matcher.py +344 -0
  26. aiecs/application/knowledge_graph/pattern_matching/query_executor.py +378 -0
  27. aiecs/application/knowledge_graph/profiling/__init__.py +12 -0
  28. aiecs/application/knowledge_graph/profiling/query_plan_visualizer.py +199 -0
  29. aiecs/application/knowledge_graph/profiling/query_profiler.py +223 -0
  30. aiecs/application/knowledge_graph/reasoning/__init__.py +27 -0
  31. aiecs/application/knowledge_graph/reasoning/evidence_synthesis.py +347 -0
  32. aiecs/application/knowledge_graph/reasoning/inference_engine.py +504 -0
  33. aiecs/application/knowledge_graph/reasoning/logic_form_parser.py +167 -0
  34. aiecs/application/knowledge_graph/reasoning/logic_parser/__init__.py +79 -0
  35. aiecs/application/knowledge_graph/reasoning/logic_parser/ast_builder.py +513 -0
  36. aiecs/application/knowledge_graph/reasoning/logic_parser/ast_nodes.py +630 -0
  37. aiecs/application/knowledge_graph/reasoning/logic_parser/ast_validator.py +654 -0
  38. aiecs/application/knowledge_graph/reasoning/logic_parser/error_handler.py +477 -0
  39. aiecs/application/knowledge_graph/reasoning/logic_parser/parser.py +390 -0
  40. aiecs/application/knowledge_graph/reasoning/logic_parser/query_context.py +217 -0
  41. aiecs/application/knowledge_graph/reasoning/logic_query_integration.py +169 -0
  42. aiecs/application/knowledge_graph/reasoning/query_planner.py +872 -0
  43. aiecs/application/knowledge_graph/reasoning/reasoning_engine.py +554 -0
  44. aiecs/application/knowledge_graph/retrieval/__init__.py +19 -0
  45. aiecs/application/knowledge_graph/retrieval/retrieval_strategies.py +596 -0
  46. aiecs/application/knowledge_graph/search/__init__.py +59 -0
  47. aiecs/application/knowledge_graph/search/hybrid_search.py +423 -0
  48. aiecs/application/knowledge_graph/search/reranker.py +295 -0
  49. aiecs/application/knowledge_graph/search/reranker_strategies.py +553 -0
  50. aiecs/application/knowledge_graph/search/text_similarity.py +398 -0
  51. aiecs/application/knowledge_graph/traversal/__init__.py +15 -0
  52. aiecs/application/knowledge_graph/traversal/enhanced_traversal.py +329 -0
  53. aiecs/application/knowledge_graph/traversal/path_scorer.py +269 -0
  54. aiecs/application/knowledge_graph/validators/__init__.py +13 -0
  55. aiecs/application/knowledge_graph/validators/relation_validator.py +189 -0
  56. aiecs/application/knowledge_graph/visualization/__init__.py +11 -0
  57. aiecs/application/knowledge_graph/visualization/graph_visualizer.py +321 -0
  58. aiecs/common/__init__.py +9 -0
  59. aiecs/common/knowledge_graph/__init__.py +17 -0
  60. aiecs/common/knowledge_graph/runnable.py +484 -0
  61. aiecs/config/__init__.py +16 -0
  62. aiecs/config/config.py +498 -0
  63. aiecs/config/graph_config.py +137 -0
  64. aiecs/config/registry.py +23 -0
  65. aiecs/core/__init__.py +46 -0
  66. aiecs/core/interface/__init__.py +34 -0
  67. aiecs/core/interface/execution_interface.py +152 -0
  68. aiecs/core/interface/storage_interface.py +171 -0
  69. aiecs/domain/__init__.py +289 -0
  70. aiecs/domain/agent/__init__.py +189 -0
  71. aiecs/domain/agent/base_agent.py +697 -0
  72. aiecs/domain/agent/exceptions.py +103 -0
  73. aiecs/domain/agent/graph_aware_mixin.py +559 -0
  74. aiecs/domain/agent/hybrid_agent.py +490 -0
  75. aiecs/domain/agent/integration/__init__.py +26 -0
  76. aiecs/domain/agent/integration/context_compressor.py +222 -0
  77. aiecs/domain/agent/integration/context_engine_adapter.py +252 -0
  78. aiecs/domain/agent/integration/retry_policy.py +219 -0
  79. aiecs/domain/agent/integration/role_config.py +213 -0
  80. aiecs/domain/agent/knowledge_aware_agent.py +646 -0
  81. aiecs/domain/agent/lifecycle.py +296 -0
  82. aiecs/domain/agent/llm_agent.py +300 -0
  83. aiecs/domain/agent/memory/__init__.py +12 -0
  84. aiecs/domain/agent/memory/conversation.py +197 -0
  85. aiecs/domain/agent/migration/__init__.py +14 -0
  86. aiecs/domain/agent/migration/conversion.py +160 -0
  87. aiecs/domain/agent/migration/legacy_wrapper.py +90 -0
  88. aiecs/domain/agent/models.py +317 -0
  89. aiecs/domain/agent/observability.py +407 -0
  90. aiecs/domain/agent/persistence.py +289 -0
  91. aiecs/domain/agent/prompts/__init__.py +29 -0
  92. aiecs/domain/agent/prompts/builder.py +161 -0
  93. aiecs/domain/agent/prompts/formatters.py +189 -0
  94. aiecs/domain/agent/prompts/template.py +255 -0
  95. aiecs/domain/agent/registry.py +260 -0
  96. aiecs/domain/agent/tool_agent.py +257 -0
  97. aiecs/domain/agent/tools/__init__.py +12 -0
  98. aiecs/domain/agent/tools/schema_generator.py +221 -0
  99. aiecs/domain/community/__init__.py +155 -0
  100. aiecs/domain/community/agent_adapter.py +477 -0
  101. aiecs/domain/community/analytics.py +481 -0
  102. aiecs/domain/community/collaborative_workflow.py +642 -0
  103. aiecs/domain/community/communication_hub.py +645 -0
  104. aiecs/domain/community/community_builder.py +320 -0
  105. aiecs/domain/community/community_integration.py +800 -0
  106. aiecs/domain/community/community_manager.py +813 -0
  107. aiecs/domain/community/decision_engine.py +879 -0
  108. aiecs/domain/community/exceptions.py +225 -0
  109. aiecs/domain/community/models/__init__.py +33 -0
  110. aiecs/domain/community/models/community_models.py +268 -0
  111. aiecs/domain/community/resource_manager.py +457 -0
  112. aiecs/domain/community/shared_context_manager.py +603 -0
  113. aiecs/domain/context/__init__.py +58 -0
  114. aiecs/domain/context/context_engine.py +989 -0
  115. aiecs/domain/context/conversation_models.py +354 -0
  116. aiecs/domain/context/graph_memory.py +467 -0
  117. aiecs/domain/execution/__init__.py +12 -0
  118. aiecs/domain/execution/model.py +57 -0
  119. aiecs/domain/knowledge_graph/__init__.py +19 -0
  120. aiecs/domain/knowledge_graph/models/__init__.py +52 -0
  121. aiecs/domain/knowledge_graph/models/entity.py +130 -0
  122. aiecs/domain/knowledge_graph/models/evidence.py +194 -0
  123. aiecs/domain/knowledge_graph/models/inference_rule.py +186 -0
  124. aiecs/domain/knowledge_graph/models/path.py +179 -0
  125. aiecs/domain/knowledge_graph/models/path_pattern.py +173 -0
  126. aiecs/domain/knowledge_graph/models/query.py +272 -0
  127. aiecs/domain/knowledge_graph/models/query_plan.py +187 -0
  128. aiecs/domain/knowledge_graph/models/relation.py +136 -0
  129. aiecs/domain/knowledge_graph/schema/__init__.py +23 -0
  130. aiecs/domain/knowledge_graph/schema/entity_type.py +135 -0
  131. aiecs/domain/knowledge_graph/schema/graph_schema.py +271 -0
  132. aiecs/domain/knowledge_graph/schema/property_schema.py +155 -0
  133. aiecs/domain/knowledge_graph/schema/relation_type.py +171 -0
  134. aiecs/domain/knowledge_graph/schema/schema_manager.py +496 -0
  135. aiecs/domain/knowledge_graph/schema/type_enums.py +205 -0
  136. aiecs/domain/task/__init__.py +13 -0
  137. aiecs/domain/task/dsl_processor.py +613 -0
  138. aiecs/domain/task/model.py +62 -0
  139. aiecs/domain/task/task_context.py +268 -0
  140. aiecs/infrastructure/__init__.py +24 -0
  141. aiecs/infrastructure/graph_storage/__init__.py +11 -0
  142. aiecs/infrastructure/graph_storage/base.py +601 -0
  143. aiecs/infrastructure/graph_storage/batch_operations.py +449 -0
  144. aiecs/infrastructure/graph_storage/cache.py +429 -0
  145. aiecs/infrastructure/graph_storage/distributed.py +226 -0
  146. aiecs/infrastructure/graph_storage/error_handling.py +390 -0
  147. aiecs/infrastructure/graph_storage/graceful_degradation.py +306 -0
  148. aiecs/infrastructure/graph_storage/health_checks.py +378 -0
  149. aiecs/infrastructure/graph_storage/in_memory.py +514 -0
  150. aiecs/infrastructure/graph_storage/index_optimization.py +483 -0
  151. aiecs/infrastructure/graph_storage/lazy_loading.py +410 -0
  152. aiecs/infrastructure/graph_storage/metrics.py +357 -0
  153. aiecs/infrastructure/graph_storage/migration.py +413 -0
  154. aiecs/infrastructure/graph_storage/pagination.py +471 -0
  155. aiecs/infrastructure/graph_storage/performance_monitoring.py +466 -0
  156. aiecs/infrastructure/graph_storage/postgres.py +871 -0
  157. aiecs/infrastructure/graph_storage/query_optimizer.py +635 -0
  158. aiecs/infrastructure/graph_storage/schema_cache.py +290 -0
  159. aiecs/infrastructure/graph_storage/sqlite.py +623 -0
  160. aiecs/infrastructure/graph_storage/streaming.py +495 -0
  161. aiecs/infrastructure/messaging/__init__.py +13 -0
  162. aiecs/infrastructure/messaging/celery_task_manager.py +383 -0
  163. aiecs/infrastructure/messaging/websocket_manager.py +298 -0
  164. aiecs/infrastructure/monitoring/__init__.py +34 -0
  165. aiecs/infrastructure/monitoring/executor_metrics.py +174 -0
  166. aiecs/infrastructure/monitoring/global_metrics_manager.py +213 -0
  167. aiecs/infrastructure/monitoring/structured_logger.py +48 -0
  168. aiecs/infrastructure/monitoring/tracing_manager.py +410 -0
  169. aiecs/infrastructure/persistence/__init__.py +24 -0
  170. aiecs/infrastructure/persistence/context_engine_client.py +187 -0
  171. aiecs/infrastructure/persistence/database_manager.py +333 -0
  172. aiecs/infrastructure/persistence/file_storage.py +754 -0
  173. aiecs/infrastructure/persistence/redis_client.py +220 -0
  174. aiecs/llm/__init__.py +86 -0
  175. aiecs/llm/callbacks/__init__.py +11 -0
  176. aiecs/llm/callbacks/custom_callbacks.py +264 -0
  177. aiecs/llm/client_factory.py +420 -0
  178. aiecs/llm/clients/__init__.py +33 -0
  179. aiecs/llm/clients/base_client.py +193 -0
  180. aiecs/llm/clients/googleai_client.py +181 -0
  181. aiecs/llm/clients/openai_client.py +131 -0
  182. aiecs/llm/clients/vertex_client.py +437 -0
  183. aiecs/llm/clients/xai_client.py +184 -0
  184. aiecs/llm/config/__init__.py +51 -0
  185. aiecs/llm/config/config_loader.py +275 -0
  186. aiecs/llm/config/config_validator.py +236 -0
  187. aiecs/llm/config/model_config.py +151 -0
  188. aiecs/llm/utils/__init__.py +10 -0
  189. aiecs/llm/utils/validate_config.py +91 -0
  190. aiecs/main.py +363 -0
  191. aiecs/scripts/__init__.py +3 -0
  192. aiecs/scripts/aid/VERSION_MANAGEMENT.md +97 -0
  193. aiecs/scripts/aid/__init__.py +19 -0
  194. aiecs/scripts/aid/version_manager.py +215 -0
  195. aiecs/scripts/dependance_check/DEPENDENCY_SYSTEM_SUMMARY.md +242 -0
  196. aiecs/scripts/dependance_check/README_DEPENDENCY_CHECKER.md +310 -0
  197. aiecs/scripts/dependance_check/__init__.py +17 -0
  198. aiecs/scripts/dependance_check/dependency_checker.py +938 -0
  199. aiecs/scripts/dependance_check/dependency_fixer.py +391 -0
  200. aiecs/scripts/dependance_check/download_nlp_data.py +396 -0
  201. aiecs/scripts/dependance_check/quick_dependency_check.py +270 -0
  202. aiecs/scripts/dependance_check/setup_nlp_data.sh +217 -0
  203. aiecs/scripts/dependance_patch/__init__.py +7 -0
  204. aiecs/scripts/dependance_patch/fix_weasel/README_WEASEL_PATCH.md +126 -0
  205. aiecs/scripts/dependance_patch/fix_weasel/__init__.py +11 -0
  206. aiecs/scripts/dependance_patch/fix_weasel/fix_weasel_validator.py +128 -0
  207. aiecs/scripts/dependance_patch/fix_weasel/fix_weasel_validator.sh +82 -0
  208. aiecs/scripts/dependance_patch/fix_weasel/patch_weasel_library.sh +188 -0
  209. aiecs/scripts/dependance_patch/fix_weasel/run_weasel_patch.sh +41 -0
  210. aiecs/scripts/tools_develop/README.md +449 -0
  211. aiecs/scripts/tools_develop/TOOL_AUTO_DISCOVERY.md +234 -0
  212. aiecs/scripts/tools_develop/__init__.py +21 -0
  213. aiecs/scripts/tools_develop/check_type_annotations.py +259 -0
  214. aiecs/scripts/tools_develop/validate_tool_schemas.py +422 -0
  215. aiecs/scripts/tools_develop/verify_tools.py +356 -0
  216. aiecs/tasks/__init__.py +1 -0
  217. aiecs/tasks/worker.py +172 -0
  218. aiecs/tools/__init__.py +299 -0
  219. aiecs/tools/apisource/__init__.py +99 -0
  220. aiecs/tools/apisource/intelligence/__init__.py +19 -0
  221. aiecs/tools/apisource/intelligence/data_fusion.py +381 -0
  222. aiecs/tools/apisource/intelligence/query_analyzer.py +413 -0
  223. aiecs/tools/apisource/intelligence/search_enhancer.py +388 -0
  224. aiecs/tools/apisource/monitoring/__init__.py +9 -0
  225. aiecs/tools/apisource/monitoring/metrics.py +303 -0
  226. aiecs/tools/apisource/providers/__init__.py +115 -0
  227. aiecs/tools/apisource/providers/base.py +664 -0
  228. aiecs/tools/apisource/providers/census.py +401 -0
  229. aiecs/tools/apisource/providers/fred.py +564 -0
  230. aiecs/tools/apisource/providers/newsapi.py +412 -0
  231. aiecs/tools/apisource/providers/worldbank.py +357 -0
  232. aiecs/tools/apisource/reliability/__init__.py +12 -0
  233. aiecs/tools/apisource/reliability/error_handler.py +375 -0
  234. aiecs/tools/apisource/reliability/fallback_strategy.py +391 -0
  235. aiecs/tools/apisource/tool.py +850 -0
  236. aiecs/tools/apisource/utils/__init__.py +9 -0
  237. aiecs/tools/apisource/utils/validators.py +338 -0
  238. aiecs/tools/base_tool.py +201 -0
  239. aiecs/tools/docs/__init__.py +121 -0
  240. aiecs/tools/docs/ai_document_orchestrator.py +599 -0
  241. aiecs/tools/docs/ai_document_writer_orchestrator.py +2403 -0
  242. aiecs/tools/docs/content_insertion_tool.py +1333 -0
  243. aiecs/tools/docs/document_creator_tool.py +1317 -0
  244. aiecs/tools/docs/document_layout_tool.py +1166 -0
  245. aiecs/tools/docs/document_parser_tool.py +994 -0
  246. aiecs/tools/docs/document_writer_tool.py +1818 -0
  247. aiecs/tools/knowledge_graph/__init__.py +17 -0
  248. aiecs/tools/knowledge_graph/graph_reasoning_tool.py +734 -0
  249. aiecs/tools/knowledge_graph/graph_search_tool.py +923 -0
  250. aiecs/tools/knowledge_graph/kg_builder_tool.py +476 -0
  251. aiecs/tools/langchain_adapter.py +542 -0
  252. aiecs/tools/schema_generator.py +275 -0
  253. aiecs/tools/search_tool/__init__.py +100 -0
  254. aiecs/tools/search_tool/analyzers.py +589 -0
  255. aiecs/tools/search_tool/cache.py +260 -0
  256. aiecs/tools/search_tool/constants.py +128 -0
  257. aiecs/tools/search_tool/context.py +216 -0
  258. aiecs/tools/search_tool/core.py +749 -0
  259. aiecs/tools/search_tool/deduplicator.py +123 -0
  260. aiecs/tools/search_tool/error_handler.py +271 -0
  261. aiecs/tools/search_tool/metrics.py +371 -0
  262. aiecs/tools/search_tool/rate_limiter.py +178 -0
  263. aiecs/tools/search_tool/schemas.py +277 -0
  264. aiecs/tools/statistics/__init__.py +80 -0
  265. aiecs/tools/statistics/ai_data_analysis_orchestrator.py +643 -0
  266. aiecs/tools/statistics/ai_insight_generator_tool.py +505 -0
  267. aiecs/tools/statistics/ai_report_orchestrator_tool.py +694 -0
  268. aiecs/tools/statistics/data_loader_tool.py +564 -0
  269. aiecs/tools/statistics/data_profiler_tool.py +658 -0
  270. aiecs/tools/statistics/data_transformer_tool.py +573 -0
  271. aiecs/tools/statistics/data_visualizer_tool.py +495 -0
  272. aiecs/tools/statistics/model_trainer_tool.py +487 -0
  273. aiecs/tools/statistics/statistical_analyzer_tool.py +459 -0
  274. aiecs/tools/task_tools/__init__.py +86 -0
  275. aiecs/tools/task_tools/chart_tool.py +732 -0
  276. aiecs/tools/task_tools/classfire_tool.py +922 -0
  277. aiecs/tools/task_tools/image_tool.py +447 -0
  278. aiecs/tools/task_tools/office_tool.py +684 -0
  279. aiecs/tools/task_tools/pandas_tool.py +635 -0
  280. aiecs/tools/task_tools/report_tool.py +635 -0
  281. aiecs/tools/task_tools/research_tool.py +392 -0
  282. aiecs/tools/task_tools/scraper_tool.py +715 -0
  283. aiecs/tools/task_tools/stats_tool.py +688 -0
  284. aiecs/tools/temp_file_manager.py +130 -0
  285. aiecs/tools/tool_executor/__init__.py +37 -0
  286. aiecs/tools/tool_executor/tool_executor.py +881 -0
  287. aiecs/utils/LLM_output_structor.py +445 -0
  288. aiecs/utils/__init__.py +34 -0
  289. aiecs/utils/base_callback.py +47 -0
  290. aiecs/utils/cache_provider.py +695 -0
  291. aiecs/utils/execution_utils.py +184 -0
  292. aiecs/utils/logging.py +1 -0
  293. aiecs/utils/prompt_loader.py +14 -0
  294. aiecs/utils/token_usage_repository.py +323 -0
  295. aiecs/ws/__init__.py +0 -0
  296. aiecs/ws/socket_server.py +52 -0
  297. aiecs-1.5.1.dist-info/METADATA +608 -0
  298. aiecs-1.5.1.dist-info/RECORD +302 -0
  299. aiecs-1.5.1.dist-info/WHEEL +5 -0
  300. aiecs-1.5.1.dist-info/entry_points.txt +10 -0
  301. aiecs-1.5.1.dist-info/licenses/LICENSE +225 -0
  302. aiecs-1.5.1.dist-info/top_level.txt +1 -0
aiecs/tools/statistics/data_transformer_tool.py
@@ -0,0 +1,573 @@
+"""
+Data Transformer Tool - Data cleaning, transformation, and feature engineering
+
+This tool provides comprehensive data transformation capabilities with:
+- Data cleaning and preprocessing
+- Feature engineering and encoding
+- Normalization and standardization
+- Transformation pipelines
+- Missing value handling
+"""
+
+import logging
+from typing import Dict, Any, List, Optional, Union
+from enum import Enum
+
+import pandas as pd
+import numpy as np
+from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder
+from pydantic import BaseModel, Field, ConfigDict
+
+from aiecs.tools.base_tool import BaseTool
+from aiecs.tools import register_tool
+
+
+class TransformationType(str, Enum):
+    """Types of transformations"""
+
+    # Cleaning operations
+    REMOVE_DUPLICATES = "remove_duplicates"
+    FILL_MISSING = "fill_missing"
+    REMOVE_OUTLIERS = "remove_outliers"
+
+    # Transformation operations
+    NORMALIZE = "normalize"
+    STANDARDIZE = "standardize"
+    LOG_TRANSFORM = "log_transform"
+    BOX_COX = "box_cox"
+
+    # Encoding operations
+    ONE_HOT_ENCODE = "one_hot_encode"
+    LABEL_ENCODE = "label_encode"
+    TARGET_ENCODE = "target_encode"
+
+    # Feature engineering
+    POLYNOMIAL_FEATURES = "polynomial_features"
+    INTERACTION_FEATURES = "interaction_features"
+    BINNING = "binning"
+    AGGREGATION = "aggregation"
+
+
+class MissingValueStrategy(str, Enum):
+    """Strategies for handling missing values"""
+
+    DROP = "drop"
+    MEAN = "mean"
+    MEDIAN = "median"
+    MODE = "mode"
+    FORWARD_FILL = "forward_fill"
+    BACKWARD_FILL = "backward_fill"
+    INTERPOLATE = "interpolate"
+    CONSTANT = "constant"
+
+
+class DataTransformerError(Exception):
+    """Base exception for DataTransformer errors"""
+
+
+class TransformationError(DataTransformerError):
+    """Raised when transformation fails"""
+
+
+@register_tool("data_transformer")
+class DataTransformerTool(BaseTool):
+    """
+    Advanced data transformation tool that can:
+    1. Clean and preprocess data
+    2. Engineer features
+    3. Transform and normalize data
+    4. Build transformation pipelines
+
+    Integrates with pandas_tool for core operations.
+    """
+
+    # Configuration schema
+    class Config(BaseModel):
+        """Configuration for the data transformer tool"""
+
+        model_config = ConfigDict(env_prefix="DATA_TRANSFORMER_")
+
+        outlier_std_threshold: float = Field(
+            default=3.0,
+            description="Standard deviation threshold for outlier detection",
+        )
+        default_missing_strategy: str = Field(
+            default="mean",
+            description="Default strategy for handling missing values",
+        )
+        enable_pipeline_caching: bool = Field(
+            default=True,
+            description="Whether to enable transformation pipeline caching",
+        )
+        max_one_hot_categories: int = Field(
+            default=10,
+            description="Maximum number of categories for one-hot encoding",
+        )
+
+    def __init__(self, config: Optional[Dict[str, Any]] = None):
+        """
+        Initialize DataTransformerTool with settings.
+
+        Args:
+            config: Optional configuration overrides
+        """
+        super().__init__(config)
+
+        # Parse configuration
+        self.config = self.Config(**(config or {}))
+
+        self.logger = logging.getLogger(__name__)
+        if not self.logger.handlers:
+            handler = logging.StreamHandler()
+            handler.setFormatter(logging.Formatter("%(asctime)s %(levelname)s %(message)s"))
+            self.logger.addHandler(handler)
+        self.logger.setLevel(logging.INFO)
+
+        # Initialize external tools
+        self._init_external_tools()
+
+        # Initialize transformation pipeline cache
+        self.pipeline_cache = {}
+
+    def _init_external_tools(self):
+        """Initialize external task tools"""
+        self.external_tools = {}
+
+        # Initialize PandasTool for data operations
+        try:
+            from aiecs.tools.task_tools.pandas_tool import PandasTool
+
+            self.external_tools["pandas"] = PandasTool()
+            self.logger.info("PandasTool initialized successfully")
+        except ImportError:
+            self.logger.warning("PandasTool not available")
+            self.external_tools["pandas"] = None
+
+    # Schema definitions
+    class TransformDataSchema(BaseModel):
+        """Schema for transform_data operation"""
+
+        data: Union[Dict[str, Any], List[Dict[str, Any]]] = Field(description="Data to transform")
+        transformations: List[Dict[str, Any]] = Field(description="List of transformation steps")
+        enable_validation: bool = Field(default=True, description="Validate transformations")
+
+    class AutoTransformSchema(BaseModel):
+        """Schema for auto_transform operation"""
+
+        data: Union[Dict[str, Any], List[Dict[str, Any]]] = Field(description="Data to transform")
+        target_column: Optional[str] = Field(default=None, description="Target column name")
+        task_type: Optional[str] = Field(
+            default=None, description="Task type: classification or regression"
+        )
+
+    class HandleMissingValuesSchema(BaseModel):
+        """Schema for handle_missing_values operation"""
+
+        data: Union[Dict[str, Any], List[Dict[str, Any]]] = Field(
+            description="Data with missing values"
+        )
+        strategy: MissingValueStrategy = Field(
+            default=MissingValueStrategy.MEAN,
+            description="Strategy for handling missing values",
+        )
+        columns: Optional[List[str]] = Field(default=None, description="Specific columns to handle")
+        fill_value: Optional[Any] = Field(default=None, description="Value for constant strategy")
+
+    class EncodeFeaturesSchema(BaseModel):
+        """Schema for encode_features operation"""
+
+        data: Union[Dict[str, Any], List[Dict[str, Any]]] = Field(description="Data to encode")
+        columns: List[str] = Field(description="Columns to encode")
+        method: str = Field(default="one_hot", description="Encoding method: one_hot or label")
+
+    def transform_data(
+        self,
+        data: Union[Dict[str, Any], List[Dict[str, Any]], pd.DataFrame],
+        transformations: List[Dict[str, Any]],
+        validate: bool = True,
+    ) -> Dict[str, Any]:
+        """
+        Apply transformation pipeline to data.
+
+        Args:
+            data: Data to transform
+            transformations: List of transformation steps, each containing:
+                - type: TransformationType
+                - columns: List of columns (optional)
+                - params: Additional parameters
+            validate: Whether to validate transformations
+
+        Returns:
+            Dict containing:
+                - transformed_data: Transformed DataFrame
+                - transformation_log: Log of applied transformations
+                - quality_improvement: Quality metrics comparison
+
+        Raises:
+            TransformationError: If transformation fails
+        """
+        try:
+            df = self._to_dataframe(data)
+            original_df = df.copy()
+
+            transformation_log = []
+
+            for i, transform in enumerate(transformations):
+                trans_type = transform.get("type")
+                columns = transform.get("columns")
+                params = transform.get("params", {})
+
+                self.logger.info(
+                    f"Applying transformation {i+1}/{len(transformations)}: {trans_type}"
+                )
+
+                # Apply transformation
+                df = self._apply_single_transformation(df, trans_type, columns, params)
+
+                transformation_log.append(
+                    {
+                        "step": i + 1,
+                        "type": trans_type,
+                        "columns": columns,
+                        "params": params,
+                        "status": "success",
+                    }
+                )
+
+            # Calculate quality improvement
+            quality_improvement = self._calculate_quality_improvement(original_df, df)
+
+            return {
+                "transformed_data": df,
+                "transformation_log": transformation_log,
+                "quality_improvement": quality_improvement,
+                "original_shape": original_df.shape,
+                "new_shape": df.shape,
+            }
+
+        except Exception as e:
+            self.logger.error(f"Error in transformation pipeline: {e}")
+            raise TransformationError(f"Transformation failed: {e}")
+
+    def auto_transform(
+        self,
+        data: Union[Dict[str, Any], List[Dict[str, Any]], pd.DataFrame],
+        target_column: Optional[str] = None,
+        task_type: Optional[str] = None,
+    ) -> Dict[str, Any]:
+        """
+        Automatically determine and apply optimal transformations.
+
+        Args:
+            data: Data to transform
+            target_column: Target column for ML tasks
+            task_type: Type of task (classification or regression)
+
+        Returns:
+            Dict containing transformed data and applied transformations
+        """
+        try:
+            df = self._to_dataframe(data)
+
+            # Determine transformations needed
+            transformations = self._determine_transformations(df, target_column, task_type)
+
+            # Apply transformations
+            result = self.transform_data(df, transformations, validate=True)
+            result["auto_detected_transformations"] = transformations
+
+            return result
+
+        except Exception as e:
+            self.logger.error(f"Error in auto transform: {e}")
+            raise TransformationError(f"Auto transform failed: {e}")
+
+    def handle_missing_values(
+        self,
+        data: Union[Dict[str, Any], List[Dict[str, Any]], pd.DataFrame],
+        strategy: MissingValueStrategy = MissingValueStrategy.MEAN,
+        columns: Optional[List[str]] = None,
+        fill_value: Optional[Any] = None,
+    ) -> Dict[str, Any]:
+        """
+        Handle missing values in data.
+
+        Args:
+            data: Data with missing values
+            strategy: Strategy for handling missing values
+            columns: Specific columns to handle (None for all)
+            fill_value: Value for constant strategy
+
+        Returns:
+            Dict containing data with handled missing values
+        """
+        try:
+            df = self._to_dataframe(data)
+            original_missing = df.isnull().sum().sum()
+
+            # Select columns to handle
+            cols_to_handle = columns if columns else df.columns.tolist()
+
+            # Apply strategy
+            if strategy == MissingValueStrategy.DROP:
+                df = df.dropna(subset=cols_to_handle)
+            elif strategy == MissingValueStrategy.MEAN:
+                for col in cols_to_handle:
+                    if df[col].dtype in ["int64", "float64"]:
+                        df[col].fillna(df[col].mean(), inplace=True)
+            elif strategy == MissingValueStrategy.MEDIAN:
+                for col in cols_to_handle:
+                    if df[col].dtype in ["int64", "float64"]:
+                        df[col].fillna(df[col].median(), inplace=True)
+            elif strategy == MissingValueStrategy.MODE:
+                for col in cols_to_handle:
+                    if not df[col].mode().empty:
+                        df[col].fillna(df[col].mode()[0], inplace=True)
+            elif strategy == MissingValueStrategy.FORWARD_FILL:
+                df[cols_to_handle] = df[cols_to_handle].fillna(method="ffill")
+            elif strategy == MissingValueStrategy.BACKWARD_FILL:
+                df[cols_to_handle] = df[cols_to_handle].fillna(method="bfill")
+            elif strategy == MissingValueStrategy.INTERPOLATE:
+                for col in cols_to_handle:
+                    if df[col].dtype in ["int64", "float64"]:
+                        df[col] = df[col].interpolate()
+            elif strategy == MissingValueStrategy.CONSTANT:
+                df[cols_to_handle] = df[cols_to_handle].fillna(fill_value)
+
+            final_missing = df.isnull().sum().sum()
+
+            return {
+                "data": df,
+                "original_missing": int(original_missing),
+                "final_missing": int(final_missing),
+                "missing_handled": int(original_missing - final_missing),
+                "strategy": strategy.value,
+            }
+
+        except Exception as e:
+            self.logger.error(f"Error handling missing values: {e}")
+            raise TransformationError(f"Failed to handle missing values: {e}")
+
+    def encode_features(
+        self,
+        data: Union[Dict[str, Any], List[Dict[str, Any]], pd.DataFrame],
+        columns: List[str],
+        method: str = "one_hot",
+    ) -> Dict[str, Any]:
+        """
+        Encode categorical features.
+
+        Args:
+            data: Data to encode
+            columns: Columns to encode
+            method: Encoding method (one_hot or label)
+
+        Returns:
+            Dict containing encoded data
+        """
+        try:
+            df = self._to_dataframe(data)
+
+            if method == "one_hot":
+                # One-hot encoding
+                df_encoded = pd.get_dummies(df, columns=columns, prefix=columns)
+                encoding_info = {
+                    "method": "one_hot",
+                    "original_columns": columns,
+                    "new_columns": [col for col in df_encoded.columns if col not in df.columns],
+                }
+            elif method == "label":
+                # Label encoding
+                df_encoded = df.copy()
+                encoders = {}
+                for col in columns:
+                    le = LabelEncoder()
+                    df_encoded[col] = le.fit_transform(df[col].astype(str))
+                    encoders[col] = le
+                encoding_info = {
+                    "method": "label",
+                    "columns": columns,
+                    "encoders": encoders,
+                }
+            else:
+                raise TransformationError(f"Unsupported encoding method: {method}")
+
+            return {
+                "data": df_encoded,
+                "encoding_info": encoding_info,
+                "original_shape": df.shape,
+                "new_shape": df_encoded.shape,
+            }
+
+        except Exception as e:
+            self.logger.error(f"Error encoding features: {e}")
+            raise TransformationError(f"Feature encoding failed: {e}")
+
+    # Internal helper methods
+
+    def _to_dataframe(self, data: Union[Dict, List, pd.DataFrame]) -> pd.DataFrame:
+        """Convert data to DataFrame"""
+        if isinstance(data, pd.DataFrame):
+            return data
+        elif isinstance(data, list):
+            return pd.DataFrame(data)
+        elif isinstance(data, dict):
+            return pd.DataFrame([data])
+        else:
+            raise TransformationError(f"Unsupported data type: {type(data)}")
+
+    def _apply_single_transformation(
+        self,
+        df: pd.DataFrame,
+        trans_type: str,
+        columns: Optional[List[str]],
+        params: Dict[str, Any],
+    ) -> pd.DataFrame:
+        """Apply a single transformation"""
+        if trans_type == TransformationType.REMOVE_DUPLICATES.value:
+            return df.drop_duplicates()
+
+        elif trans_type == TransformationType.FILL_MISSING.value:
+            strategy = params.get("strategy", "mean")
+            for col in columns or df.columns:
+                if df[col].isnull().any():
+                    if strategy == "mean" and df[col].dtype in [
+                        "int64",
+                        "float64",
+                    ]:
+                        df[col].fillna(df[col].mean(), inplace=True)
+                    elif strategy == "median" and df[col].dtype in [
+                        "int64",
+                        "float64",
+                    ]:
+                        df[col].fillna(df[col].median(), inplace=True)
+                    elif strategy == "mode":
+                        if not df[col].mode().empty:
+                            df[col].fillna(df[col].mode()[0], inplace=True)
+            return df
+
+        elif trans_type == TransformationType.REMOVE_OUTLIERS.value:
+            for col in columns or df.select_dtypes(include=[np.number]).columns:
+                if df[col].std() > 0:
+                    z_scores = np.abs((df[col] - df[col].mean()) / df[col].std())
+                    df = df[z_scores < self.config.outlier_std_threshold]
+            return df
+
+        elif trans_type == TransformationType.STANDARDIZE.value:
+            scaler = StandardScaler()
+            cols = columns or df.select_dtypes(include=[np.number]).columns.tolist()
+            df[cols] = scaler.fit_transform(df[cols])
+            return df
+
+        elif trans_type == TransformationType.NORMALIZE.value:
+            scaler = MinMaxScaler()
+            cols = columns or df.select_dtypes(include=[np.number]).columns.tolist()
+            df[cols] = scaler.fit_transform(df[cols])
+            return df
+
+        elif trans_type == TransformationType.LOG_TRANSFORM.value:
+            cols = columns or df.select_dtypes(include=[np.number]).columns.tolist()
+            for col in cols:
+                if (df[col] > 0).all():
+                    df[col] = np.log(df[col])
+            return df
+
+        elif trans_type == TransformationType.ONE_HOT_ENCODE.value:
+            cols = columns or df.select_dtypes(include=["object"]).columns.tolist()
+            return pd.get_dummies(df, columns=cols)
+
+        elif trans_type == TransformationType.LABEL_ENCODE.value:
+            cols = columns or df.select_dtypes(include=["object"]).columns.tolist()
+            for col in cols:
+                le = LabelEncoder()
+                df[col] = le.fit_transform(df[col].astype(str))
+            return df
+
+        else:
+            self.logger.warning(f"Transformation type {trans_type} not implemented, skipping")
+            return df
+
+    def _determine_transformations(
+        self,
+        df: pd.DataFrame,
+        target_column: Optional[str],
+        task_type: Optional[str],
+    ) -> List[Dict[str, Any]]:
+        """Determine transformations needed for data"""
+        transformations = []
+
+        # Remove duplicates if present
+        if df.duplicated().sum() > 0:
+            transformations.append(
+                {
+                    "type": TransformationType.REMOVE_DUPLICATES.value,
+                    "columns": None,
+                    "params": {},
+                }
+            )
+
+        # Handle missing values
+        if df.isnull().sum().sum() > 0:
+            transformations.append(
+                {
+                    "type": TransformationType.FILL_MISSING.value,
+                    "columns": None,
+                    "params": {"strategy": "mean"},
+                }
+            )
+
+        # Encode categorical variables
+        categorical_cols = df.select_dtypes(include=["object"]).columns.tolist()
+        if target_column and target_column in categorical_cols:
+            categorical_cols.remove(target_column)
+
+        if len(categorical_cols) > 0:
+            # Use label encoding if too many categories, otherwise one-hot
+            for col in categorical_cols:
+                if df[col].nunique() > self.config.max_one_hot_categories:
+                    transformations.append(
+                        {
+                            "type": TransformationType.LABEL_ENCODE.value,
+                            "columns": [col],
+                            "params": {},
+                        }
+                    )
+                else:
+                    transformations.append(
+                        {
+                            "type": TransformationType.ONE_HOT_ENCODE.value,
+                            "columns": [col],
+                            "params": {},
+                        }
+                    )
+
+        # Standardize numeric features
+        numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
+        if target_column and target_column in numeric_cols:
+            numeric_cols.remove(target_column)
+
+        if len(numeric_cols) > 0:
+            transformations.append(
+                {
+                    "type": TransformationType.STANDARDIZE.value,
+                    "columns": numeric_cols,
+                    "params": {},
+                }
+            )
+
+        return transformations
+
+    def _calculate_quality_improvement(
+        self, original_df: pd.DataFrame, transformed_df: pd.DataFrame
+    ) -> Dict[str, Any]:
+        """Calculate quality improvement metrics"""
+        return {
+            "missing_before": int(original_df.isnull().sum().sum()),
+            "missing_after": int(transformed_df.isnull().sum().sum()),
+            "duplicates_before": int(original_df.duplicated().sum()),
+            "duplicates_after": int(transformed_df.duplicated().sum()),
+            "rows_before": len(original_df),
+            "rows_after": len(transformed_df),
+            "columns_before": len(original_df.columns),
+            "columns_after": len(transformed_df.columns),
+        }
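
For orientation, here is a minimal usage sketch of the DataTransformerTool introduced in this file. It is not part of the diff above: it assumes aiecs 1.5.1 is installed alongside pandas and scikit-learn, and the sample records, column names, and strategy choices are purely illustrative.

from aiecs.tools.statistics.data_transformer_tool import (
    DataTransformerTool,
    MissingValueStrategy,
    TransformationType,
)

tool = DataTransformerTool()

# Illustrative records with one missing age and one missing income value.
records = [
    {"age": 34, "income": 52000.0, "city": "Oslo"},
    {"age": None, "income": 61000.0, "city": "Bergen"},
    {"age": 29, "income": None, "city": "Oslo"},
]

# Fill numeric gaps with column means, then one-hot encode the categorical column.
filled = tool.handle_missing_values(records, strategy=MissingValueStrategy.MEAN)
encoded = tool.encode_features(filled["data"], columns=["city"], method="one_hot")
print(encoded["new_shape"], encoded["encoding_info"]["new_columns"])

# Alternatively, drive an explicit pipeline; each step dict follows the
# transform_data docstring (type / columns / params).
pipeline = [
    {"type": TransformationType.REMOVE_DUPLICATES.value, "columns": None, "params": {}},
    {"type": TransformationType.FILL_MISSING.value, "columns": ["age", "income"], "params": {"strategy": "median"}},
    {"type": TransformationType.STANDARDIZE.value, "columns": ["age", "income"], "params": {}},
]
result = tool.transform_data(records, transformations=pipeline)
print(result["new_shape"], result["quality_improvement"]["missing_after"])

Calling auto_transform(records, target_column="income", task_type="regression") would instead let the tool pick these steps itself, using the heuristics in _determine_transformations.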