aiecs-1.5.1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (302)
  1. aiecs/__init__.py +72 -0
  2. aiecs/__main__.py +41 -0
  3. aiecs/aiecs_client.py +469 -0
  4. aiecs/application/__init__.py +10 -0
  5. aiecs/application/executors/__init__.py +10 -0
  6. aiecs/application/executors/operation_executor.py +363 -0
  7. aiecs/application/knowledge_graph/__init__.py +7 -0
  8. aiecs/application/knowledge_graph/builder/__init__.py +37 -0
  9. aiecs/application/knowledge_graph/builder/document_builder.py +375 -0
  10. aiecs/application/knowledge_graph/builder/graph_builder.py +356 -0
  11. aiecs/application/knowledge_graph/builder/schema_mapping.py +531 -0
  12. aiecs/application/knowledge_graph/builder/structured_pipeline.py +443 -0
  13. aiecs/application/knowledge_graph/builder/text_chunker.py +319 -0
  14. aiecs/application/knowledge_graph/extractors/__init__.py +27 -0
  15. aiecs/application/knowledge_graph/extractors/base.py +100 -0
  16. aiecs/application/knowledge_graph/extractors/llm_entity_extractor.py +327 -0
  17. aiecs/application/knowledge_graph/extractors/llm_relation_extractor.py +349 -0
  18. aiecs/application/knowledge_graph/extractors/ner_entity_extractor.py +244 -0
  19. aiecs/application/knowledge_graph/fusion/__init__.py +23 -0
  20. aiecs/application/knowledge_graph/fusion/entity_deduplicator.py +387 -0
  21. aiecs/application/knowledge_graph/fusion/entity_linker.py +343 -0
  22. aiecs/application/knowledge_graph/fusion/knowledge_fusion.py +580 -0
  23. aiecs/application/knowledge_graph/fusion/relation_deduplicator.py +189 -0
  24. aiecs/application/knowledge_graph/pattern_matching/__init__.py +21 -0
  25. aiecs/application/knowledge_graph/pattern_matching/pattern_matcher.py +344 -0
  26. aiecs/application/knowledge_graph/pattern_matching/query_executor.py +378 -0
  27. aiecs/application/knowledge_graph/profiling/__init__.py +12 -0
  28. aiecs/application/knowledge_graph/profiling/query_plan_visualizer.py +199 -0
  29. aiecs/application/knowledge_graph/profiling/query_profiler.py +223 -0
  30. aiecs/application/knowledge_graph/reasoning/__init__.py +27 -0
  31. aiecs/application/knowledge_graph/reasoning/evidence_synthesis.py +347 -0
  32. aiecs/application/knowledge_graph/reasoning/inference_engine.py +504 -0
  33. aiecs/application/knowledge_graph/reasoning/logic_form_parser.py +167 -0
  34. aiecs/application/knowledge_graph/reasoning/logic_parser/__init__.py +79 -0
  35. aiecs/application/knowledge_graph/reasoning/logic_parser/ast_builder.py +513 -0
  36. aiecs/application/knowledge_graph/reasoning/logic_parser/ast_nodes.py +630 -0
  37. aiecs/application/knowledge_graph/reasoning/logic_parser/ast_validator.py +654 -0
  38. aiecs/application/knowledge_graph/reasoning/logic_parser/error_handler.py +477 -0
  39. aiecs/application/knowledge_graph/reasoning/logic_parser/parser.py +390 -0
  40. aiecs/application/knowledge_graph/reasoning/logic_parser/query_context.py +217 -0
  41. aiecs/application/knowledge_graph/reasoning/logic_query_integration.py +169 -0
  42. aiecs/application/knowledge_graph/reasoning/query_planner.py +872 -0
  43. aiecs/application/knowledge_graph/reasoning/reasoning_engine.py +554 -0
  44. aiecs/application/knowledge_graph/retrieval/__init__.py +19 -0
  45. aiecs/application/knowledge_graph/retrieval/retrieval_strategies.py +596 -0
  46. aiecs/application/knowledge_graph/search/__init__.py +59 -0
  47. aiecs/application/knowledge_graph/search/hybrid_search.py +423 -0
  48. aiecs/application/knowledge_graph/search/reranker.py +295 -0
  49. aiecs/application/knowledge_graph/search/reranker_strategies.py +553 -0
  50. aiecs/application/knowledge_graph/search/text_similarity.py +398 -0
  51. aiecs/application/knowledge_graph/traversal/__init__.py +15 -0
  52. aiecs/application/knowledge_graph/traversal/enhanced_traversal.py +329 -0
  53. aiecs/application/knowledge_graph/traversal/path_scorer.py +269 -0
  54. aiecs/application/knowledge_graph/validators/__init__.py +13 -0
  55. aiecs/application/knowledge_graph/validators/relation_validator.py +189 -0
  56. aiecs/application/knowledge_graph/visualization/__init__.py +11 -0
  57. aiecs/application/knowledge_graph/visualization/graph_visualizer.py +321 -0
  58. aiecs/common/__init__.py +9 -0
  59. aiecs/common/knowledge_graph/__init__.py +17 -0
  60. aiecs/common/knowledge_graph/runnable.py +484 -0
  61. aiecs/config/__init__.py +16 -0
  62. aiecs/config/config.py +498 -0
  63. aiecs/config/graph_config.py +137 -0
  64. aiecs/config/registry.py +23 -0
  65. aiecs/core/__init__.py +46 -0
  66. aiecs/core/interface/__init__.py +34 -0
  67. aiecs/core/interface/execution_interface.py +152 -0
  68. aiecs/core/interface/storage_interface.py +171 -0
  69. aiecs/domain/__init__.py +289 -0
  70. aiecs/domain/agent/__init__.py +189 -0
  71. aiecs/domain/agent/base_agent.py +697 -0
  72. aiecs/domain/agent/exceptions.py +103 -0
  73. aiecs/domain/agent/graph_aware_mixin.py +559 -0
  74. aiecs/domain/agent/hybrid_agent.py +490 -0
  75. aiecs/domain/agent/integration/__init__.py +26 -0
  76. aiecs/domain/agent/integration/context_compressor.py +222 -0
  77. aiecs/domain/agent/integration/context_engine_adapter.py +252 -0
  78. aiecs/domain/agent/integration/retry_policy.py +219 -0
  79. aiecs/domain/agent/integration/role_config.py +213 -0
  80. aiecs/domain/agent/knowledge_aware_agent.py +646 -0
  81. aiecs/domain/agent/lifecycle.py +296 -0
  82. aiecs/domain/agent/llm_agent.py +300 -0
  83. aiecs/domain/agent/memory/__init__.py +12 -0
  84. aiecs/domain/agent/memory/conversation.py +197 -0
  85. aiecs/domain/agent/migration/__init__.py +14 -0
  86. aiecs/domain/agent/migration/conversion.py +160 -0
  87. aiecs/domain/agent/migration/legacy_wrapper.py +90 -0
  88. aiecs/domain/agent/models.py +317 -0
  89. aiecs/domain/agent/observability.py +407 -0
  90. aiecs/domain/agent/persistence.py +289 -0
  91. aiecs/domain/agent/prompts/__init__.py +29 -0
  92. aiecs/domain/agent/prompts/builder.py +161 -0
  93. aiecs/domain/agent/prompts/formatters.py +189 -0
  94. aiecs/domain/agent/prompts/template.py +255 -0
  95. aiecs/domain/agent/registry.py +260 -0
  96. aiecs/domain/agent/tool_agent.py +257 -0
  97. aiecs/domain/agent/tools/__init__.py +12 -0
  98. aiecs/domain/agent/tools/schema_generator.py +221 -0
  99. aiecs/domain/community/__init__.py +155 -0
  100. aiecs/domain/community/agent_adapter.py +477 -0
  101. aiecs/domain/community/analytics.py +481 -0
  102. aiecs/domain/community/collaborative_workflow.py +642 -0
  103. aiecs/domain/community/communication_hub.py +645 -0
  104. aiecs/domain/community/community_builder.py +320 -0
  105. aiecs/domain/community/community_integration.py +800 -0
  106. aiecs/domain/community/community_manager.py +813 -0
  107. aiecs/domain/community/decision_engine.py +879 -0
  108. aiecs/domain/community/exceptions.py +225 -0
  109. aiecs/domain/community/models/__init__.py +33 -0
  110. aiecs/domain/community/models/community_models.py +268 -0
  111. aiecs/domain/community/resource_manager.py +457 -0
  112. aiecs/domain/community/shared_context_manager.py +603 -0
  113. aiecs/domain/context/__init__.py +58 -0
  114. aiecs/domain/context/context_engine.py +989 -0
  115. aiecs/domain/context/conversation_models.py +354 -0
  116. aiecs/domain/context/graph_memory.py +467 -0
  117. aiecs/domain/execution/__init__.py +12 -0
  118. aiecs/domain/execution/model.py +57 -0
  119. aiecs/domain/knowledge_graph/__init__.py +19 -0
  120. aiecs/domain/knowledge_graph/models/__init__.py +52 -0
  121. aiecs/domain/knowledge_graph/models/entity.py +130 -0
  122. aiecs/domain/knowledge_graph/models/evidence.py +194 -0
  123. aiecs/domain/knowledge_graph/models/inference_rule.py +186 -0
  124. aiecs/domain/knowledge_graph/models/path.py +179 -0
  125. aiecs/domain/knowledge_graph/models/path_pattern.py +173 -0
  126. aiecs/domain/knowledge_graph/models/query.py +272 -0
  127. aiecs/domain/knowledge_graph/models/query_plan.py +187 -0
  128. aiecs/domain/knowledge_graph/models/relation.py +136 -0
  129. aiecs/domain/knowledge_graph/schema/__init__.py +23 -0
  130. aiecs/domain/knowledge_graph/schema/entity_type.py +135 -0
  131. aiecs/domain/knowledge_graph/schema/graph_schema.py +271 -0
  132. aiecs/domain/knowledge_graph/schema/property_schema.py +155 -0
  133. aiecs/domain/knowledge_graph/schema/relation_type.py +171 -0
  134. aiecs/domain/knowledge_graph/schema/schema_manager.py +496 -0
  135. aiecs/domain/knowledge_graph/schema/type_enums.py +205 -0
  136. aiecs/domain/task/__init__.py +13 -0
  137. aiecs/domain/task/dsl_processor.py +613 -0
  138. aiecs/domain/task/model.py +62 -0
  139. aiecs/domain/task/task_context.py +268 -0
  140. aiecs/infrastructure/__init__.py +24 -0
  141. aiecs/infrastructure/graph_storage/__init__.py +11 -0
  142. aiecs/infrastructure/graph_storage/base.py +601 -0
  143. aiecs/infrastructure/graph_storage/batch_operations.py +449 -0
  144. aiecs/infrastructure/graph_storage/cache.py +429 -0
  145. aiecs/infrastructure/graph_storage/distributed.py +226 -0
  146. aiecs/infrastructure/graph_storage/error_handling.py +390 -0
  147. aiecs/infrastructure/graph_storage/graceful_degradation.py +306 -0
  148. aiecs/infrastructure/graph_storage/health_checks.py +378 -0
  149. aiecs/infrastructure/graph_storage/in_memory.py +514 -0
  150. aiecs/infrastructure/graph_storage/index_optimization.py +483 -0
  151. aiecs/infrastructure/graph_storage/lazy_loading.py +410 -0
  152. aiecs/infrastructure/graph_storage/metrics.py +357 -0
  153. aiecs/infrastructure/graph_storage/migration.py +413 -0
  154. aiecs/infrastructure/graph_storage/pagination.py +471 -0
  155. aiecs/infrastructure/graph_storage/performance_monitoring.py +466 -0
  156. aiecs/infrastructure/graph_storage/postgres.py +871 -0
  157. aiecs/infrastructure/graph_storage/query_optimizer.py +635 -0
  158. aiecs/infrastructure/graph_storage/schema_cache.py +290 -0
  159. aiecs/infrastructure/graph_storage/sqlite.py +623 -0
  160. aiecs/infrastructure/graph_storage/streaming.py +495 -0
  161. aiecs/infrastructure/messaging/__init__.py +13 -0
  162. aiecs/infrastructure/messaging/celery_task_manager.py +383 -0
  163. aiecs/infrastructure/messaging/websocket_manager.py +298 -0
  164. aiecs/infrastructure/monitoring/__init__.py +34 -0
  165. aiecs/infrastructure/monitoring/executor_metrics.py +174 -0
  166. aiecs/infrastructure/monitoring/global_metrics_manager.py +213 -0
  167. aiecs/infrastructure/monitoring/structured_logger.py +48 -0
  168. aiecs/infrastructure/monitoring/tracing_manager.py +410 -0
  169. aiecs/infrastructure/persistence/__init__.py +24 -0
  170. aiecs/infrastructure/persistence/context_engine_client.py +187 -0
  171. aiecs/infrastructure/persistence/database_manager.py +333 -0
  172. aiecs/infrastructure/persistence/file_storage.py +754 -0
  173. aiecs/infrastructure/persistence/redis_client.py +220 -0
  174. aiecs/llm/__init__.py +86 -0
  175. aiecs/llm/callbacks/__init__.py +11 -0
  176. aiecs/llm/callbacks/custom_callbacks.py +264 -0
  177. aiecs/llm/client_factory.py +420 -0
  178. aiecs/llm/clients/__init__.py +33 -0
  179. aiecs/llm/clients/base_client.py +193 -0
  180. aiecs/llm/clients/googleai_client.py +181 -0
  181. aiecs/llm/clients/openai_client.py +131 -0
  182. aiecs/llm/clients/vertex_client.py +437 -0
  183. aiecs/llm/clients/xai_client.py +184 -0
  184. aiecs/llm/config/__init__.py +51 -0
  185. aiecs/llm/config/config_loader.py +275 -0
  186. aiecs/llm/config/config_validator.py +236 -0
  187. aiecs/llm/config/model_config.py +151 -0
  188. aiecs/llm/utils/__init__.py +10 -0
  189. aiecs/llm/utils/validate_config.py +91 -0
  190. aiecs/main.py +363 -0
  191. aiecs/scripts/__init__.py +3 -0
  192. aiecs/scripts/aid/VERSION_MANAGEMENT.md +97 -0
  193. aiecs/scripts/aid/__init__.py +19 -0
  194. aiecs/scripts/aid/version_manager.py +215 -0
  195. aiecs/scripts/dependance_check/DEPENDENCY_SYSTEM_SUMMARY.md +242 -0
  196. aiecs/scripts/dependance_check/README_DEPENDENCY_CHECKER.md +310 -0
  197. aiecs/scripts/dependance_check/__init__.py +17 -0
  198. aiecs/scripts/dependance_check/dependency_checker.py +938 -0
  199. aiecs/scripts/dependance_check/dependency_fixer.py +391 -0
  200. aiecs/scripts/dependance_check/download_nlp_data.py +396 -0
  201. aiecs/scripts/dependance_check/quick_dependency_check.py +270 -0
  202. aiecs/scripts/dependance_check/setup_nlp_data.sh +217 -0
  203. aiecs/scripts/dependance_patch/__init__.py +7 -0
  204. aiecs/scripts/dependance_patch/fix_weasel/README_WEASEL_PATCH.md +126 -0
  205. aiecs/scripts/dependance_patch/fix_weasel/__init__.py +11 -0
  206. aiecs/scripts/dependance_patch/fix_weasel/fix_weasel_validator.py +128 -0
  207. aiecs/scripts/dependance_patch/fix_weasel/fix_weasel_validator.sh +82 -0
  208. aiecs/scripts/dependance_patch/fix_weasel/patch_weasel_library.sh +188 -0
  209. aiecs/scripts/dependance_patch/fix_weasel/run_weasel_patch.sh +41 -0
  210. aiecs/scripts/tools_develop/README.md +449 -0
  211. aiecs/scripts/tools_develop/TOOL_AUTO_DISCOVERY.md +234 -0
  212. aiecs/scripts/tools_develop/__init__.py +21 -0
  213. aiecs/scripts/tools_develop/check_type_annotations.py +259 -0
  214. aiecs/scripts/tools_develop/validate_tool_schemas.py +422 -0
  215. aiecs/scripts/tools_develop/verify_tools.py +356 -0
  216. aiecs/tasks/__init__.py +1 -0
  217. aiecs/tasks/worker.py +172 -0
  218. aiecs/tools/__init__.py +299 -0
  219. aiecs/tools/apisource/__init__.py +99 -0
  220. aiecs/tools/apisource/intelligence/__init__.py +19 -0
  221. aiecs/tools/apisource/intelligence/data_fusion.py +381 -0
  222. aiecs/tools/apisource/intelligence/query_analyzer.py +413 -0
  223. aiecs/tools/apisource/intelligence/search_enhancer.py +388 -0
  224. aiecs/tools/apisource/monitoring/__init__.py +9 -0
  225. aiecs/tools/apisource/monitoring/metrics.py +303 -0
  226. aiecs/tools/apisource/providers/__init__.py +115 -0
  227. aiecs/tools/apisource/providers/base.py +664 -0
  228. aiecs/tools/apisource/providers/census.py +401 -0
  229. aiecs/tools/apisource/providers/fred.py +564 -0
  230. aiecs/tools/apisource/providers/newsapi.py +412 -0
  231. aiecs/tools/apisource/providers/worldbank.py +357 -0
  232. aiecs/tools/apisource/reliability/__init__.py +12 -0
  233. aiecs/tools/apisource/reliability/error_handler.py +375 -0
  234. aiecs/tools/apisource/reliability/fallback_strategy.py +391 -0
  235. aiecs/tools/apisource/tool.py +850 -0
  236. aiecs/tools/apisource/utils/__init__.py +9 -0
  237. aiecs/tools/apisource/utils/validators.py +338 -0
  238. aiecs/tools/base_tool.py +201 -0
  239. aiecs/tools/docs/__init__.py +121 -0
  240. aiecs/tools/docs/ai_document_orchestrator.py +599 -0
  241. aiecs/tools/docs/ai_document_writer_orchestrator.py +2403 -0
  242. aiecs/tools/docs/content_insertion_tool.py +1333 -0
  243. aiecs/tools/docs/document_creator_tool.py +1317 -0
  244. aiecs/tools/docs/document_layout_tool.py +1166 -0
  245. aiecs/tools/docs/document_parser_tool.py +994 -0
  246. aiecs/tools/docs/document_writer_tool.py +1818 -0
  247. aiecs/tools/knowledge_graph/__init__.py +17 -0
  248. aiecs/tools/knowledge_graph/graph_reasoning_tool.py +734 -0
  249. aiecs/tools/knowledge_graph/graph_search_tool.py +923 -0
  250. aiecs/tools/knowledge_graph/kg_builder_tool.py +476 -0
  251. aiecs/tools/langchain_adapter.py +542 -0
  252. aiecs/tools/schema_generator.py +275 -0
  253. aiecs/tools/search_tool/__init__.py +100 -0
  254. aiecs/tools/search_tool/analyzers.py +589 -0
  255. aiecs/tools/search_tool/cache.py +260 -0
  256. aiecs/tools/search_tool/constants.py +128 -0
  257. aiecs/tools/search_tool/context.py +216 -0
  258. aiecs/tools/search_tool/core.py +749 -0
  259. aiecs/tools/search_tool/deduplicator.py +123 -0
  260. aiecs/tools/search_tool/error_handler.py +271 -0
  261. aiecs/tools/search_tool/metrics.py +371 -0
  262. aiecs/tools/search_tool/rate_limiter.py +178 -0
  263. aiecs/tools/search_tool/schemas.py +277 -0
  264. aiecs/tools/statistics/__init__.py +80 -0
  265. aiecs/tools/statistics/ai_data_analysis_orchestrator.py +643 -0
  266. aiecs/tools/statistics/ai_insight_generator_tool.py +505 -0
  267. aiecs/tools/statistics/ai_report_orchestrator_tool.py +694 -0
  268. aiecs/tools/statistics/data_loader_tool.py +564 -0
  269. aiecs/tools/statistics/data_profiler_tool.py +658 -0
  270. aiecs/tools/statistics/data_transformer_tool.py +573 -0
  271. aiecs/tools/statistics/data_visualizer_tool.py +495 -0
  272. aiecs/tools/statistics/model_trainer_tool.py +487 -0
  273. aiecs/tools/statistics/statistical_analyzer_tool.py +459 -0
  274. aiecs/tools/task_tools/__init__.py +86 -0
  275. aiecs/tools/task_tools/chart_tool.py +732 -0
  276. aiecs/tools/task_tools/classfire_tool.py +922 -0
  277. aiecs/tools/task_tools/image_tool.py +447 -0
  278. aiecs/tools/task_tools/office_tool.py +684 -0
  279. aiecs/tools/task_tools/pandas_tool.py +635 -0
  280. aiecs/tools/task_tools/report_tool.py +635 -0
  281. aiecs/tools/task_tools/research_tool.py +392 -0
  282. aiecs/tools/task_tools/scraper_tool.py +715 -0
  283. aiecs/tools/task_tools/stats_tool.py +688 -0
  284. aiecs/tools/temp_file_manager.py +130 -0
  285. aiecs/tools/tool_executor/__init__.py +37 -0
  286. aiecs/tools/tool_executor/tool_executor.py +881 -0
  287. aiecs/utils/LLM_output_structor.py +445 -0
  288. aiecs/utils/__init__.py +34 -0
  289. aiecs/utils/base_callback.py +47 -0
  290. aiecs/utils/cache_provider.py +695 -0
  291. aiecs/utils/execution_utils.py +184 -0
  292. aiecs/utils/logging.py +1 -0
  293. aiecs/utils/prompt_loader.py +14 -0
  294. aiecs/utils/token_usage_repository.py +323 -0
  295. aiecs/ws/__init__.py +0 -0
  296. aiecs/ws/socket_server.py +52 -0
  297. aiecs-1.5.1.dist-info/METADATA +608 -0
  298. aiecs-1.5.1.dist-info/RECORD +302 -0
  299. aiecs-1.5.1.dist-info/WHEEL +5 -0
  300. aiecs-1.5.1.dist-info/entry_points.txt +10 -0
  301. aiecs-1.5.1.dist-info/licenses/LICENSE +225 -0
  302. aiecs-1.5.1.dist-info/top_level.txt +1 -0
aiecs/tools/statistics/data_profiler_tool.py
@@ -0,0 +1,658 @@
+"""
+Data Profiler Tool - Comprehensive data profiling and quality assessment
+
+This tool provides advanced data profiling capabilities with:
+- Statistical summaries and distributions
+- Data quality issue detection
+- Pattern and anomaly identification
+- Preprocessing recommendations
+- Column-level and dataset-level analysis
+"""
+
+import logging
+from typing import Dict, Any, List, Optional, Union
+from enum import Enum
+
+import pandas as pd
+import numpy as np
+from pydantic import BaseModel, Field, ConfigDict
+
+from aiecs.tools.base_tool import BaseTool
+from aiecs.tools import register_tool
+
+
+class ProfileLevel(str, Enum):
+    """Data profiling depth levels"""
+
+    BASIC = "basic"
+    STANDARD = "standard"
+    COMPREHENSIVE = "comprehensive"
+    DEEP = "deep"
+
+
+class DataQualityCheck(str, Enum):
+    """Types of data quality checks"""
+
+    MISSING_VALUES = "missing_values"
+    DUPLICATES = "duplicates"
+    OUTLIERS = "outliers"
+    INCONSISTENCIES = "inconsistencies"
+    DATA_TYPES = "data_types"
+    DISTRIBUTIONS = "distributions"
+    CORRELATIONS = "correlations"
+
+
+class DataProfilerError(Exception):
+    """Base exception for DataProfiler errors"""
+
+
+class ProfilingError(DataProfilerError):
+    """Raised when profiling operation fails"""
+
+
+@register_tool("data_profiler")
+class DataProfilerTool(BaseTool):
+    """
+    Comprehensive data profiling tool that can:
+    1. Generate statistical summaries
+    2. Detect data quality issues
+    3. Identify patterns and anomalies
+    4. Recommend preprocessing steps
+
+    Integrates with stats_tool and pandas_tool for core operations.
+    """
+
+    # Configuration schema
+    class Config(BaseModel):
+        """Configuration for the data profiler tool"""
+
+        model_config = ConfigDict(env_prefix="DATA_PROFILER_")
+
+        default_profile_level: str = Field(
+            default="standard", description="Default profiling depth level"
+        )
+        outlier_std_threshold: float = Field(
+            default=3.0,
+            description="Standard deviation threshold for outlier detection",
+        )
+        correlation_threshold: float = Field(
+            default=0.7,
+            description="Correlation threshold for identifying strong relationships",
+        )
+        missing_threshold: float = Field(
+            default=0.5,
+            description="Missing value threshold for quality assessment",
+        )
+        enable_visualizations: bool = Field(
+            default=True,
+            description="Whether to enable visualization generation",
+        )
+        max_unique_values_categorical: int = Field(
+            default=50,
+            description="Maximum unique values for categorical analysis",
+        )
+
+    def __init__(self, config: Optional[Dict[str, Any]] = None):
+        """
+        Initialize DataProfilerTool with settings.
+
+        Args:
+            config: Optional configuration overrides
+        """
+        super().__init__(config)
+
+        # Parse configuration
+        self.config = self.Config(**(config or {}))
+
+        self.logger = logging.getLogger(__name__)
+        if not self.logger.handlers:
+            handler = logging.StreamHandler()
+            handler.setFormatter(logging.Formatter("%(asctime)s %(levelname)s %(message)s"))
+            self.logger.addHandler(handler)
+            self.logger.setLevel(logging.INFO)
+
+        # Initialize external tools
+        self._init_external_tools()
+
+    def _init_external_tools(self):
+        """Initialize external task tools"""
+        self.external_tools = {}
+
+        # Initialize StatsTool for statistical operations
+        try:
+            from aiecs.tools.task_tools.stats_tool import StatsTool
+
+            self.external_tools["stats"] = StatsTool()
+            self.logger.info("StatsTool initialized successfully")
+        except ImportError:
+            self.logger.warning("StatsTool not available")
+            self.external_tools["stats"] = None
+
+        # Initialize PandasTool for data operations
+        try:
+            from aiecs.tools.task_tools.pandas_tool import PandasTool
+
+            self.external_tools["pandas"] = PandasTool()
+            self.logger.info("PandasTool initialized successfully")
+        except ImportError:
+            self.logger.warning("PandasTool not available")
+            self.external_tools["pandas"] = None
+
+    # Schema definitions
+    class ProfileDatasetSchema(BaseModel):
+        """Schema for profile_dataset operation"""
+
+        data: Union[Dict[str, Any], List[Dict[str, Any]]] = Field(description="Data to profile")
+        level: ProfileLevel = Field(
+            default=ProfileLevel.STANDARD, description="Profiling depth level"
+        )
+        checks: Optional[List[DataQualityCheck]] = Field(
+            default=None, description="Specific quality checks to perform"
+        )
+        generate_visualizations: bool = Field(
+            default=False, description="Generate visualization data"
+        )
+
+    class DetectQualityIssuesSchema(BaseModel):
+        """Schema for detect_quality_issues operation"""
+
+        data: Union[Dict[str, Any], List[Dict[str, Any]]] = Field(description="Data to check")
+        checks: Optional[List[DataQualityCheck]] = Field(
+            default=None, description="Specific checks to perform"
+        )
+
+    class RecommendPreprocessingSchema(BaseModel):
+        """Schema for recommend_preprocessing operation"""
+
+        data: Union[Dict[str, Any], List[Dict[str, Any]]] = Field(description="Data to analyze")
+        target_column: Optional[str] = Field(default=None, description="Target column for ML tasks")
+
+    def profile_dataset(
+        self,
+        data: Union[Dict[str, Any], List[Dict[str, Any]], pd.DataFrame],
+        level: ProfileLevel = ProfileLevel.STANDARD,
+        checks: Optional[List[DataQualityCheck]] = None,
+        generate_visualizations: bool = False,
+    ) -> Dict[str, Any]:
+        """
+        Generate comprehensive data profile.
+
+        Args:
+            data: Data to profile (dict, list of dicts, or DataFrame)
+            level: Profiling depth level
+            checks: Specific quality checks to perform (all if None)
+            generate_visualizations: Whether to generate visualization data
+
+        Returns:
+            Dict containing:
+            - summary: Dataset-level summary
+            - column_profiles: Column-level profiles
+            - quality_issues: Detected quality issues
+            - correlations: Correlation analysis
+            - recommendations: Preprocessing recommendations
+
+        Raises:
+            ProfilingError: If profiling fails
+        """
+        try:
+            # Convert to DataFrame if needed
+            df = self._to_dataframe(data)
+
+            self.logger.info(f"Profiling dataset with {len(df)} rows and {len(df.columns)} columns")
+
+            # Generate summary
+            summary = self._generate_summary(df)
+
+            # Generate column profiles
+            column_profiles = self._profile_columns(df, level)
+
+            # Detect quality issues
+            quality_issues = self._detect_quality_issues(df, checks)
+
+            # Correlation analysis (for comprehensive and deep levels)
+            correlations = {}
+            if level in [ProfileLevel.COMPREHENSIVE, ProfileLevel.DEEP]:
+                correlations = self._analyze_correlations(df)
+
+            # Generate recommendations
+            recommendations = self._generate_recommendations(df, quality_issues, level)
+
+            # Generate visualization data if requested
+            visualization_data = {}
+            if generate_visualizations:
+                visualization_data = self._generate_visualization_data(df)
+
+            result = {
+                "summary": summary,
+                "column_profiles": column_profiles,
+                "quality_issues": quality_issues,
+                "correlations": correlations,
+                "recommendations": recommendations,
+                "profile_level": level.value,
+            }
+
+            if visualization_data:
+                result["visualization_data"] = visualization_data

+            self.logger.info("Dataset profiling completed successfully")
+            return result
+
+        except Exception as e:
+            self.logger.error(f"Error profiling dataset: {e}")
+            raise ProfilingError(f"Failed to profile dataset: {e}")
+
+    def detect_quality_issues(
+        self,
+        data: Union[Dict[str, Any], List[Dict[str, Any]], pd.DataFrame],
+        checks: Optional[List[DataQualityCheck]] = None,
+    ) -> Dict[str, Any]:
+        """
+        Detect data quality issues.
+
+        Args:
+            data: Data to check
+            checks: Specific checks to perform (all if None)
+
+        Returns:
+            Dict containing detected issues by category
+        """
+        try:
+            df = self._to_dataframe(data)
+            issues = self._detect_quality_issues(df, checks)
+
+            return {
+                "issues": issues,
+                "total_issues": sum(len(v) for v in issues.values()),
+                "severity_counts": self._categorize_severity(issues),
+            }
+
+        except Exception as e:
+            self.logger.error(f"Error detecting quality issues: {e}")
+            raise ProfilingError(f"Failed to detect quality issues: {e}")
+
+    def recommend_preprocessing(
+        self,
+        data: Union[Dict[str, Any], List[Dict[str, Any]], pd.DataFrame],
+        target_column: Optional[str] = None,
+    ) -> Dict[str, Any]:
+        """
+        Recommend preprocessing steps based on data analysis.
+
+        Args:
+            data: Data to analyze
+            target_column: Target column for ML tasks (if applicable)
+
+        Returns:
+            Dict containing recommended preprocessing steps
+        """
+        try:
+            df = self._to_dataframe(data)
+
+            # Detect quality issues
+            quality_issues = self._detect_quality_issues(df, None)
+
+            # Generate recommendations
+            recommendations = self._generate_recommendations(
+                df, quality_issues, ProfileLevel.COMPREHENSIVE
+            )
+
+            # Add task-specific recommendations
+            if target_column and target_column in df.columns:
+                task_recommendations = self._generate_task_recommendations(df, target_column)
+                recommendations.extend(task_recommendations)
+
+            # Prioritize recommendations
+            prioritized = self._prioritize_recommendations(recommendations)
+
+            return {
+                "recommendations": prioritized,
+                "total_steps": len(prioritized),
+                "estimated_impact": "medium",  # Placeholder for impact estimation
+            }
+
+        except Exception as e:
+            self.logger.error(f"Error generating recommendations: {e}")
+            raise ProfilingError(f"Failed to generate recommendations: {e}")
+
+    # Internal helper methods
+
+    def _to_dataframe(self, data: Union[Dict, List, pd.DataFrame]) -> pd.DataFrame:
+        """Convert data to DataFrame"""
+        if isinstance(data, pd.DataFrame):
+            return data
+        elif isinstance(data, list):
+            return pd.DataFrame(data)
+        elif isinstance(data, dict):
+            return pd.DataFrame([data])
+        else:
+            raise ProfilingError(f"Unsupported data type: {type(data)}")
+
+    def _generate_summary(self, df: pd.DataFrame) -> Dict[str, Any]:
+        """Generate dataset-level summary"""
+        numeric_cols = df.select_dtypes(include=[np.number]).columns
+        categorical_cols = df.select_dtypes(include=["object", "category"]).columns
+
+        return {
+            "rows": len(df),
+            "columns": len(df.columns),
+            "numeric_columns": len(numeric_cols),
+            "categorical_columns": len(categorical_cols),
+            "memory_usage_mb": df.memory_usage(deep=True).sum() / (1024 * 1024),
+            "missing_cells": df.isnull().sum().sum(),
+            "missing_percentage": (
+                (df.isnull().sum().sum() / (len(df) * len(df.columns)) * 100) if len(df) > 0 else 0
+            ),
+            "duplicate_rows": df.duplicated().sum(),
+            "duplicate_percentage": ((df.duplicated().sum() / len(df) * 100) if len(df) > 0 else 0),
+        }
+
+    def _profile_columns(self, df: pd.DataFrame, level: ProfileLevel) -> Dict[str, Dict[str, Any]]:
+        """Generate column-level profiles"""
+        profiles = {}
+
+        for col in df.columns:
+            profile = {
+                "name": col,
+                "dtype": str(df[col].dtype),
+                "missing_count": df[col].isnull().sum(),
+                "missing_percentage": (
+                    (df[col].isnull().sum() / len(df) * 100) if len(df) > 0 else 0
+                ),
+                "unique_count": df[col].nunique(),
+                "unique_percentage": ((df[col].nunique() / len(df) * 100) if len(df) > 0 else 0),
+            }
+
+            # Add type-specific statistics
+            if df[col].dtype in ["int64", "float64"]:
+                profile.update(self._profile_numeric_column(df[col], level))
+            else:
+                profile.update(self._profile_categorical_column(df[col], level))
+
+            profiles[col] = profile
+
+        return profiles
+
+    def _profile_numeric_column(self, series: pd.Series, level: ProfileLevel) -> Dict[str, Any]:
+        """Profile numeric column"""
+        profile = {
+            "type": "numeric",
+            "min": float(series.min()) if not series.empty else None,
+            "max": float(series.max()) if not series.empty else None,
+            "mean": float(series.mean()) if not series.empty else None,
+            "median": float(series.median()) if not series.empty else None,
+            "std": float(series.std()) if not series.empty else None,
+        }
+
+        if level in [ProfileLevel.COMPREHENSIVE, ProfileLevel.DEEP]:
+            profile.update(
+                {
+                    "q25": (float(series.quantile(0.25)) if not series.empty else None),
+                    "q75": (float(series.quantile(0.75)) if not series.empty else None),
+                    "skewness": (float(series.skew()) if not series.empty else None),
+                    "kurtosis": (float(series.kurt()) if not series.empty else None),
+                }
+            )
+
+        # Detect outliers
+        if not series.empty and series.std() > 0:
+            z_scores = np.abs((series - series.mean()) / series.std())
+            outlier_count = (z_scores > self.config.outlier_std_threshold).sum()
+            profile["outlier_count"] = int(outlier_count)
+            profile["outlier_percentage"] = float(outlier_count / len(series) * 100)
+
+        return profile
+
+    def _profile_categorical_column(self, series: pd.Series, level: ProfileLevel) -> Dict[str, Any]:
+        """Profile categorical column"""
+        value_counts = series.value_counts()
+
+        profile = {
+            "type": "categorical",
+            "unique_values": int(series.nunique()),
+            "most_common": (str(value_counts.index[0]) if not value_counts.empty else None),
+            "most_common_count": (int(value_counts.iloc[0]) if not value_counts.empty else None),
+        }
+
+        if level in [ProfileLevel.COMPREHENSIVE, ProfileLevel.DEEP]:
+            # Add top categories
+            top_n = min(10, len(value_counts))
+            profile["top_categories"] = {
+                str(k): int(v) for k, v in value_counts.head(top_n).items()
+            }
+
+        return profile
+
+    def _detect_quality_issues(
+        self, df: pd.DataFrame, checks: Optional[List[DataQualityCheck]]
+    ) -> Dict[str, List[Dict[str, Any]]]:
+        """Detect data quality issues"""
+        issues = {
+            "missing_values": [],
+            "duplicates": [],
+            "outliers": [],
+            "inconsistencies": [],
+            "data_types": [],
+            "distributions": [],
+            "correlations": [],
+        }
+
+        # All checks by default
+        if checks is None:
+            checks = list(DataQualityCheck)
+
+        # Missing values check
+        if DataQualityCheck.MISSING_VALUES in checks:
+            for col in df.columns:
+                missing_pct = (df[col].isnull().sum() / len(df) * 100) if len(df) > 0 else 0
+                if missing_pct > 0:
+                    issues["missing_values"].append(
+                        {
+                            "column": col,
+                            "missing_percentage": missing_pct,
+                            "severity": (
+                                "high"
+                                if missing_pct > self.config.missing_threshold * 100
+                                else "medium"
+                            ),
+                        }
+                    )
+
+        # Duplicates check
+        if DataQualityCheck.DUPLICATES in checks:
+            dup_count = df.duplicated().sum()
+            if dup_count > 0:
+                issues["duplicates"].append(
+                    {
+                        "type": "row_duplicates",
+                        "count": int(dup_count),
+                        "percentage": (float(dup_count / len(df) * 100) if len(df) > 0 else 0),
+                        "severity": "medium",
+                    }
+                )
+
+        # Outliers check
+        if DataQualityCheck.OUTLIERS in checks:
+            numeric_cols = df.select_dtypes(include=[np.number]).columns
+            for col in numeric_cols:
+                if df[col].std() > 0:
+                    z_scores = np.abs((df[col] - df[col].mean()) / df[col].std())
+                    outlier_count = (z_scores > self.config.outlier_std_threshold).sum()
+                    if outlier_count > 0:
+                        issues["outliers"].append(
+                            {
+                                "column": col,
+                                "count": int(outlier_count),
+                                "percentage": float(outlier_count / len(df) * 100),
+                                "severity": "low",
+                            }
+                        )
+
+        return issues
+
+    def _analyze_correlations(self, df: pd.DataFrame) -> Dict[str, Any]:
+        """Analyze correlations between numeric columns"""
+        numeric_df = df.select_dtypes(include=[np.number])
+
+        if numeric_df.shape[1] < 2:
+            return {"message": "Insufficient numeric columns for correlation analysis"}
+
+        corr_matrix = numeric_df.corr()
+
+        # Find high correlations
+        high_corr_pairs = []
+        for i in range(len(corr_matrix.columns)):
+            for j in range(i + 1, len(corr_matrix.columns)):
+                corr_value = corr_matrix.iloc[i, j]
+                if abs(corr_value) > self.config.correlation_threshold:
+                    high_corr_pairs.append(
+                        {
+                            "column1": corr_matrix.columns[i],
+                            "column2": corr_matrix.columns[j],
+                            "correlation": float(corr_value),
+                        }
+                    )
+
+        return {
+            "correlation_matrix": corr_matrix.to_dict(),
+            "high_correlations": high_corr_pairs,
+            "num_high_correlations": len(high_corr_pairs),
+        }
+
+    def _generate_recommendations(
+        self,
+        df: pd.DataFrame,
+        quality_issues: Dict[str, List],
+        level: ProfileLevel,
+    ) -> List[Dict[str, Any]]:
+        """Generate preprocessing recommendations"""
+        recommendations = []
+
+        # Missing value recommendations
+        for issue in quality_issues.get("missing_values", []):
+            if issue["missing_percentage"] < 5:
+                recommendations.append(
+                    {
+                        "action": "drop_missing_rows",
+                        "column": issue["column"],
+                        "reason": f"Low missing percentage ({issue['missing_percentage']:.2f}%)",
+                        "priority": "medium",
+                    }
+                )
+            elif issue["missing_percentage"] < 50:
+                recommendations.append(
+                    {
+                        "action": "impute_missing",
+                        "column": issue["column"],
+                        "method": (
+                            "mean" if df[issue["column"]].dtype in ["int64", "float64"] else "mode"
+                        ),
+                        "reason": f"Moderate missing percentage ({issue['missing_percentage']:.2f}%)",
+                        "priority": "high",
+                    }
+                )
+            else:
+                recommendations.append(
+                    {
+                        "action": "consider_dropping_column",
+                        "column": issue["column"],
+                        "reason": f"High missing percentage ({issue['missing_percentage']:.2f}%)",
+                        "priority": "high",
+                    }
+                )
+
+        # Duplicate recommendations
+        if quality_issues.get("duplicates"):
+            recommendations.append(
+                {
+                    "action": "remove_duplicates",
+                    "reason": f"{quality_issues['duplicates'][0]['count']} duplicate rows found",
+                    "priority": "high",
+                }
+            )
+
+        # Outlier recommendations
+        if quality_issues.get("outliers"):
+            for issue in quality_issues["outliers"]:
+                if issue["percentage"] > 5:
+                    recommendations.append(
+                        {
+                            "action": "handle_outliers",
+                            "column": issue["column"],
+                            "method": "winsorize or cap",
+                            "reason": f"Significant outliers detected ({issue['percentage']:.2f}%)",
+                            "priority": "medium",
+                        }
+                    )
+
+        return recommendations
+
+    def _generate_task_recommendations(
+        self, df: pd.DataFrame, target_column: str
+    ) -> List[Dict[str, Any]]:
+        """Generate task-specific recommendations"""
+        recommendations = []
+
+        # Check if target is numeric or categorical
+        if df[target_column].dtype in ["int64", "float64"]:
+            task_type = "regression"
+        else:
+            task_type = "classification"
+
+        recommendations.append(
+            {
+                "action": "task_identified",
+                "task_type": task_type,
+                "target_column": target_column,
+                "reason": f"Based on target column type: {df[target_column].dtype}",
+                "priority": "info",
+            }
+        )
+
+        return recommendations
+
+    def _prioritize_recommendations(
+        self, recommendations: List[Dict[str, Any]]
+    ) -> List[Dict[str, Any]]:
+        """Prioritize recommendations by importance"""
+        priority_order = {"high": 0, "medium": 1, "low": 2, "info": 3}
+        return sorted(
+            recommendations,
+            key=lambda x: priority_order.get(x.get("priority", "low"), 2),
+        )
+
+    def _categorize_severity(self, issues: Dict[str, List]) -> Dict[str, int]:
+        """Categorize issues by severity"""
+        severity_counts = {"high": 0, "medium": 0, "low": 0}
+
+        for issue_list in issues.values():
+            for issue in issue_list:
+                severity = issue.get("severity", "low")
+                severity_counts[severity] = severity_counts.get(severity, 0) + 1
+
+        return severity_counts
+
+    def _generate_visualization_data(self, df: pd.DataFrame) -> Dict[str, Any]:
+        """Generate data for visualizations"""
+        viz_data = {}
+
+        # Numeric distributions
+        numeric_cols = df.select_dtypes(include=[np.number]).columns
+        if len(numeric_cols) > 0:
+            viz_data["numeric_distributions"] = {
+                col: {
+                    # Sample for performance
+                    "values": df[col].dropna().tolist()[:1000],
+                    "bins": 30,
+                }
+                for col in numeric_cols[:5]  # Limit to first 5
+            }
+
+        # Categorical distributions
+        categorical_cols = df.select_dtypes(include=["object", "category"]).columns
+        if len(categorical_cols) > 0:
+            viz_data["categorical_distributions"] = {
+                col: df[col].value_counts().head(10).to_dict()
+                for col in categorical_cols[:5]  # Limit to first 5
+            }
+
+        return viz_data
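
For orientation, a minimal usage sketch of the DataProfilerTool added in this release. This is illustrative only: the sample DataFrame and the config override are assumptions, and it presumes BaseTool.__init__ accepts the config dict exactly as the class above passes it.

import pandas as pd

from aiecs.tools.statistics.data_profiler_tool import DataProfilerTool, ProfileLevel

# Hypothetical sample data: one missing value in "age", one duplicated row
df = pd.DataFrame(
    {
        "age": [34.0, 45.0, None, 45.0],
        "city": ["Oslo", "Lima", "Oslo", "Lima"],
    }
)

# Config keys mirror the Config model above (outlier_std_threshold, etc.)
profiler = DataProfilerTool(config={"outlier_std_threshold": 3.0})
report = profiler.profile_dataset(df, level=ProfileLevel.COMPREHENSIVE)

print(report["summary"]["rows"])                   # 4
print(report["summary"]["duplicate_rows"])         # 1
print(report["quality_issues"]["missing_values"])  # "age" flagged at 25% missing
print(report["recommendations"][0]["action"])      # e.g. "impute_missing"

The same DataFrame can also be passed to detect_quality_issues or recommend_preprocessing, which reuse the _detect_quality_issues and _generate_recommendations helpers shown in the diff.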