aiecs-1.5.1-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- aiecs/__init__.py +72 -0
- aiecs/__main__.py +41 -0
- aiecs/aiecs_client.py +469 -0
- aiecs/application/__init__.py +10 -0
- aiecs/application/executors/__init__.py +10 -0
- aiecs/application/executors/operation_executor.py +363 -0
- aiecs/application/knowledge_graph/__init__.py +7 -0
- aiecs/application/knowledge_graph/builder/__init__.py +37 -0
- aiecs/application/knowledge_graph/builder/document_builder.py +375 -0
- aiecs/application/knowledge_graph/builder/graph_builder.py +356 -0
- aiecs/application/knowledge_graph/builder/schema_mapping.py +531 -0
- aiecs/application/knowledge_graph/builder/structured_pipeline.py +443 -0
- aiecs/application/knowledge_graph/builder/text_chunker.py +319 -0
- aiecs/application/knowledge_graph/extractors/__init__.py +27 -0
- aiecs/application/knowledge_graph/extractors/base.py +100 -0
- aiecs/application/knowledge_graph/extractors/llm_entity_extractor.py +327 -0
- aiecs/application/knowledge_graph/extractors/llm_relation_extractor.py +349 -0
- aiecs/application/knowledge_graph/extractors/ner_entity_extractor.py +244 -0
- aiecs/application/knowledge_graph/fusion/__init__.py +23 -0
- aiecs/application/knowledge_graph/fusion/entity_deduplicator.py +387 -0
- aiecs/application/knowledge_graph/fusion/entity_linker.py +343 -0
- aiecs/application/knowledge_graph/fusion/knowledge_fusion.py +580 -0
- aiecs/application/knowledge_graph/fusion/relation_deduplicator.py +189 -0
- aiecs/application/knowledge_graph/pattern_matching/__init__.py +21 -0
- aiecs/application/knowledge_graph/pattern_matching/pattern_matcher.py +344 -0
- aiecs/application/knowledge_graph/pattern_matching/query_executor.py +378 -0
- aiecs/application/knowledge_graph/profiling/__init__.py +12 -0
- aiecs/application/knowledge_graph/profiling/query_plan_visualizer.py +199 -0
- aiecs/application/knowledge_graph/profiling/query_profiler.py +223 -0
- aiecs/application/knowledge_graph/reasoning/__init__.py +27 -0
- aiecs/application/knowledge_graph/reasoning/evidence_synthesis.py +347 -0
- aiecs/application/knowledge_graph/reasoning/inference_engine.py +504 -0
- aiecs/application/knowledge_graph/reasoning/logic_form_parser.py +167 -0
- aiecs/application/knowledge_graph/reasoning/logic_parser/__init__.py +79 -0
- aiecs/application/knowledge_graph/reasoning/logic_parser/ast_builder.py +513 -0
- aiecs/application/knowledge_graph/reasoning/logic_parser/ast_nodes.py +630 -0
- aiecs/application/knowledge_graph/reasoning/logic_parser/ast_validator.py +654 -0
- aiecs/application/knowledge_graph/reasoning/logic_parser/error_handler.py +477 -0
- aiecs/application/knowledge_graph/reasoning/logic_parser/parser.py +390 -0
- aiecs/application/knowledge_graph/reasoning/logic_parser/query_context.py +217 -0
- aiecs/application/knowledge_graph/reasoning/logic_query_integration.py +169 -0
- aiecs/application/knowledge_graph/reasoning/query_planner.py +872 -0
- aiecs/application/knowledge_graph/reasoning/reasoning_engine.py +554 -0
- aiecs/application/knowledge_graph/retrieval/__init__.py +19 -0
- aiecs/application/knowledge_graph/retrieval/retrieval_strategies.py +596 -0
- aiecs/application/knowledge_graph/search/__init__.py +59 -0
- aiecs/application/knowledge_graph/search/hybrid_search.py +423 -0
- aiecs/application/knowledge_graph/search/reranker.py +295 -0
- aiecs/application/knowledge_graph/search/reranker_strategies.py +553 -0
- aiecs/application/knowledge_graph/search/text_similarity.py +398 -0
- aiecs/application/knowledge_graph/traversal/__init__.py +15 -0
- aiecs/application/knowledge_graph/traversal/enhanced_traversal.py +329 -0
- aiecs/application/knowledge_graph/traversal/path_scorer.py +269 -0
- aiecs/application/knowledge_graph/validators/__init__.py +13 -0
- aiecs/application/knowledge_graph/validators/relation_validator.py +189 -0
- aiecs/application/knowledge_graph/visualization/__init__.py +11 -0
- aiecs/application/knowledge_graph/visualization/graph_visualizer.py +321 -0
- aiecs/common/__init__.py +9 -0
- aiecs/common/knowledge_graph/__init__.py +17 -0
- aiecs/common/knowledge_graph/runnable.py +484 -0
- aiecs/config/__init__.py +16 -0
- aiecs/config/config.py +498 -0
- aiecs/config/graph_config.py +137 -0
- aiecs/config/registry.py +23 -0
- aiecs/core/__init__.py +46 -0
- aiecs/core/interface/__init__.py +34 -0
- aiecs/core/interface/execution_interface.py +152 -0
- aiecs/core/interface/storage_interface.py +171 -0
- aiecs/domain/__init__.py +289 -0
- aiecs/domain/agent/__init__.py +189 -0
- aiecs/domain/agent/base_agent.py +697 -0
- aiecs/domain/agent/exceptions.py +103 -0
- aiecs/domain/agent/graph_aware_mixin.py +559 -0
- aiecs/domain/agent/hybrid_agent.py +490 -0
- aiecs/domain/agent/integration/__init__.py +26 -0
- aiecs/domain/agent/integration/context_compressor.py +222 -0
- aiecs/domain/agent/integration/context_engine_adapter.py +252 -0
- aiecs/domain/agent/integration/retry_policy.py +219 -0
- aiecs/domain/agent/integration/role_config.py +213 -0
- aiecs/domain/agent/knowledge_aware_agent.py +646 -0
- aiecs/domain/agent/lifecycle.py +296 -0
- aiecs/domain/agent/llm_agent.py +300 -0
- aiecs/domain/agent/memory/__init__.py +12 -0
- aiecs/domain/agent/memory/conversation.py +197 -0
- aiecs/domain/agent/migration/__init__.py +14 -0
- aiecs/domain/agent/migration/conversion.py +160 -0
- aiecs/domain/agent/migration/legacy_wrapper.py +90 -0
- aiecs/domain/agent/models.py +317 -0
- aiecs/domain/agent/observability.py +407 -0
- aiecs/domain/agent/persistence.py +289 -0
- aiecs/domain/agent/prompts/__init__.py +29 -0
- aiecs/domain/agent/prompts/builder.py +161 -0
- aiecs/domain/agent/prompts/formatters.py +189 -0
- aiecs/domain/agent/prompts/template.py +255 -0
- aiecs/domain/agent/registry.py +260 -0
- aiecs/domain/agent/tool_agent.py +257 -0
- aiecs/domain/agent/tools/__init__.py +12 -0
- aiecs/domain/agent/tools/schema_generator.py +221 -0
- aiecs/domain/community/__init__.py +155 -0
- aiecs/domain/community/agent_adapter.py +477 -0
- aiecs/domain/community/analytics.py +481 -0
- aiecs/domain/community/collaborative_workflow.py +642 -0
- aiecs/domain/community/communication_hub.py +645 -0
- aiecs/domain/community/community_builder.py +320 -0
- aiecs/domain/community/community_integration.py +800 -0
- aiecs/domain/community/community_manager.py +813 -0
- aiecs/domain/community/decision_engine.py +879 -0
- aiecs/domain/community/exceptions.py +225 -0
- aiecs/domain/community/models/__init__.py +33 -0
- aiecs/domain/community/models/community_models.py +268 -0
- aiecs/domain/community/resource_manager.py +457 -0
- aiecs/domain/community/shared_context_manager.py +603 -0
- aiecs/domain/context/__init__.py +58 -0
- aiecs/domain/context/context_engine.py +989 -0
- aiecs/domain/context/conversation_models.py +354 -0
- aiecs/domain/context/graph_memory.py +467 -0
- aiecs/domain/execution/__init__.py +12 -0
- aiecs/domain/execution/model.py +57 -0
- aiecs/domain/knowledge_graph/__init__.py +19 -0
- aiecs/domain/knowledge_graph/models/__init__.py +52 -0
- aiecs/domain/knowledge_graph/models/entity.py +130 -0
- aiecs/domain/knowledge_graph/models/evidence.py +194 -0
- aiecs/domain/knowledge_graph/models/inference_rule.py +186 -0
- aiecs/domain/knowledge_graph/models/path.py +179 -0
- aiecs/domain/knowledge_graph/models/path_pattern.py +173 -0
- aiecs/domain/knowledge_graph/models/query.py +272 -0
- aiecs/domain/knowledge_graph/models/query_plan.py +187 -0
- aiecs/domain/knowledge_graph/models/relation.py +136 -0
- aiecs/domain/knowledge_graph/schema/__init__.py +23 -0
- aiecs/domain/knowledge_graph/schema/entity_type.py +135 -0
- aiecs/domain/knowledge_graph/schema/graph_schema.py +271 -0
- aiecs/domain/knowledge_graph/schema/property_schema.py +155 -0
- aiecs/domain/knowledge_graph/schema/relation_type.py +171 -0
- aiecs/domain/knowledge_graph/schema/schema_manager.py +496 -0
- aiecs/domain/knowledge_graph/schema/type_enums.py +205 -0
- aiecs/domain/task/__init__.py +13 -0
- aiecs/domain/task/dsl_processor.py +613 -0
- aiecs/domain/task/model.py +62 -0
- aiecs/domain/task/task_context.py +268 -0
- aiecs/infrastructure/__init__.py +24 -0
- aiecs/infrastructure/graph_storage/__init__.py +11 -0
- aiecs/infrastructure/graph_storage/base.py +601 -0
- aiecs/infrastructure/graph_storage/batch_operations.py +449 -0
- aiecs/infrastructure/graph_storage/cache.py +429 -0
- aiecs/infrastructure/graph_storage/distributed.py +226 -0
- aiecs/infrastructure/graph_storage/error_handling.py +390 -0
- aiecs/infrastructure/graph_storage/graceful_degradation.py +306 -0
- aiecs/infrastructure/graph_storage/health_checks.py +378 -0
- aiecs/infrastructure/graph_storage/in_memory.py +514 -0
- aiecs/infrastructure/graph_storage/index_optimization.py +483 -0
- aiecs/infrastructure/graph_storage/lazy_loading.py +410 -0
- aiecs/infrastructure/graph_storage/metrics.py +357 -0
- aiecs/infrastructure/graph_storage/migration.py +413 -0
- aiecs/infrastructure/graph_storage/pagination.py +471 -0
- aiecs/infrastructure/graph_storage/performance_monitoring.py +466 -0
- aiecs/infrastructure/graph_storage/postgres.py +871 -0
- aiecs/infrastructure/graph_storage/query_optimizer.py +635 -0
- aiecs/infrastructure/graph_storage/schema_cache.py +290 -0
- aiecs/infrastructure/graph_storage/sqlite.py +623 -0
- aiecs/infrastructure/graph_storage/streaming.py +495 -0
- aiecs/infrastructure/messaging/__init__.py +13 -0
- aiecs/infrastructure/messaging/celery_task_manager.py +383 -0
- aiecs/infrastructure/messaging/websocket_manager.py +298 -0
- aiecs/infrastructure/monitoring/__init__.py +34 -0
- aiecs/infrastructure/monitoring/executor_metrics.py +174 -0
- aiecs/infrastructure/monitoring/global_metrics_manager.py +213 -0
- aiecs/infrastructure/monitoring/structured_logger.py +48 -0
- aiecs/infrastructure/monitoring/tracing_manager.py +410 -0
- aiecs/infrastructure/persistence/__init__.py +24 -0
- aiecs/infrastructure/persistence/context_engine_client.py +187 -0
- aiecs/infrastructure/persistence/database_manager.py +333 -0
- aiecs/infrastructure/persistence/file_storage.py +754 -0
- aiecs/infrastructure/persistence/redis_client.py +220 -0
- aiecs/llm/__init__.py +86 -0
- aiecs/llm/callbacks/__init__.py +11 -0
- aiecs/llm/callbacks/custom_callbacks.py +264 -0
- aiecs/llm/client_factory.py +420 -0
- aiecs/llm/clients/__init__.py +33 -0
- aiecs/llm/clients/base_client.py +193 -0
- aiecs/llm/clients/googleai_client.py +181 -0
- aiecs/llm/clients/openai_client.py +131 -0
- aiecs/llm/clients/vertex_client.py +437 -0
- aiecs/llm/clients/xai_client.py +184 -0
- aiecs/llm/config/__init__.py +51 -0
- aiecs/llm/config/config_loader.py +275 -0
- aiecs/llm/config/config_validator.py +236 -0
- aiecs/llm/config/model_config.py +151 -0
- aiecs/llm/utils/__init__.py +10 -0
- aiecs/llm/utils/validate_config.py +91 -0
- aiecs/main.py +363 -0
- aiecs/scripts/__init__.py +3 -0
- aiecs/scripts/aid/VERSION_MANAGEMENT.md +97 -0
- aiecs/scripts/aid/__init__.py +19 -0
- aiecs/scripts/aid/version_manager.py +215 -0
- aiecs/scripts/dependance_check/DEPENDENCY_SYSTEM_SUMMARY.md +242 -0
- aiecs/scripts/dependance_check/README_DEPENDENCY_CHECKER.md +310 -0
- aiecs/scripts/dependance_check/__init__.py +17 -0
- aiecs/scripts/dependance_check/dependency_checker.py +938 -0
- aiecs/scripts/dependance_check/dependency_fixer.py +391 -0
- aiecs/scripts/dependance_check/download_nlp_data.py +396 -0
- aiecs/scripts/dependance_check/quick_dependency_check.py +270 -0
- aiecs/scripts/dependance_check/setup_nlp_data.sh +217 -0
- aiecs/scripts/dependance_patch/__init__.py +7 -0
- aiecs/scripts/dependance_patch/fix_weasel/README_WEASEL_PATCH.md +126 -0
- aiecs/scripts/dependance_patch/fix_weasel/__init__.py +11 -0
- aiecs/scripts/dependance_patch/fix_weasel/fix_weasel_validator.py +128 -0
- aiecs/scripts/dependance_patch/fix_weasel/fix_weasel_validator.sh +82 -0
- aiecs/scripts/dependance_patch/fix_weasel/patch_weasel_library.sh +188 -0
- aiecs/scripts/dependance_patch/fix_weasel/run_weasel_patch.sh +41 -0
- aiecs/scripts/tools_develop/README.md +449 -0
- aiecs/scripts/tools_develop/TOOL_AUTO_DISCOVERY.md +234 -0
- aiecs/scripts/tools_develop/__init__.py +21 -0
- aiecs/scripts/tools_develop/check_type_annotations.py +259 -0
- aiecs/scripts/tools_develop/validate_tool_schemas.py +422 -0
- aiecs/scripts/tools_develop/verify_tools.py +356 -0
- aiecs/tasks/__init__.py +1 -0
- aiecs/tasks/worker.py +172 -0
- aiecs/tools/__init__.py +299 -0
- aiecs/tools/apisource/__init__.py +99 -0
- aiecs/tools/apisource/intelligence/__init__.py +19 -0
- aiecs/tools/apisource/intelligence/data_fusion.py +381 -0
- aiecs/tools/apisource/intelligence/query_analyzer.py +413 -0
- aiecs/tools/apisource/intelligence/search_enhancer.py +388 -0
- aiecs/tools/apisource/monitoring/__init__.py +9 -0
- aiecs/tools/apisource/monitoring/metrics.py +303 -0
- aiecs/tools/apisource/providers/__init__.py +115 -0
- aiecs/tools/apisource/providers/base.py +664 -0
- aiecs/tools/apisource/providers/census.py +401 -0
- aiecs/tools/apisource/providers/fred.py +564 -0
- aiecs/tools/apisource/providers/newsapi.py +412 -0
- aiecs/tools/apisource/providers/worldbank.py +357 -0
- aiecs/tools/apisource/reliability/__init__.py +12 -0
- aiecs/tools/apisource/reliability/error_handler.py +375 -0
- aiecs/tools/apisource/reliability/fallback_strategy.py +391 -0
- aiecs/tools/apisource/tool.py +850 -0
- aiecs/tools/apisource/utils/__init__.py +9 -0
- aiecs/tools/apisource/utils/validators.py +338 -0
- aiecs/tools/base_tool.py +201 -0
- aiecs/tools/docs/__init__.py +121 -0
- aiecs/tools/docs/ai_document_orchestrator.py +599 -0
- aiecs/tools/docs/ai_document_writer_orchestrator.py +2403 -0
- aiecs/tools/docs/content_insertion_tool.py +1333 -0
- aiecs/tools/docs/document_creator_tool.py +1317 -0
- aiecs/tools/docs/document_layout_tool.py +1166 -0
- aiecs/tools/docs/document_parser_tool.py +994 -0
- aiecs/tools/docs/document_writer_tool.py +1818 -0
- aiecs/tools/knowledge_graph/__init__.py +17 -0
- aiecs/tools/knowledge_graph/graph_reasoning_tool.py +734 -0
- aiecs/tools/knowledge_graph/graph_search_tool.py +923 -0
- aiecs/tools/knowledge_graph/kg_builder_tool.py +476 -0
- aiecs/tools/langchain_adapter.py +542 -0
- aiecs/tools/schema_generator.py +275 -0
- aiecs/tools/search_tool/__init__.py +100 -0
- aiecs/tools/search_tool/analyzers.py +589 -0
- aiecs/tools/search_tool/cache.py +260 -0
- aiecs/tools/search_tool/constants.py +128 -0
- aiecs/tools/search_tool/context.py +216 -0
- aiecs/tools/search_tool/core.py +749 -0
- aiecs/tools/search_tool/deduplicator.py +123 -0
- aiecs/tools/search_tool/error_handler.py +271 -0
- aiecs/tools/search_tool/metrics.py +371 -0
- aiecs/tools/search_tool/rate_limiter.py +178 -0
- aiecs/tools/search_tool/schemas.py +277 -0
- aiecs/tools/statistics/__init__.py +80 -0
- aiecs/tools/statistics/ai_data_analysis_orchestrator.py +643 -0
- aiecs/tools/statistics/ai_insight_generator_tool.py +505 -0
- aiecs/tools/statistics/ai_report_orchestrator_tool.py +694 -0
- aiecs/tools/statistics/data_loader_tool.py +564 -0
- aiecs/tools/statistics/data_profiler_tool.py +658 -0
- aiecs/tools/statistics/data_transformer_tool.py +573 -0
- aiecs/tools/statistics/data_visualizer_tool.py +495 -0
- aiecs/tools/statistics/model_trainer_tool.py +487 -0
- aiecs/tools/statistics/statistical_analyzer_tool.py +459 -0
- aiecs/tools/task_tools/__init__.py +86 -0
- aiecs/tools/task_tools/chart_tool.py +732 -0
- aiecs/tools/task_tools/classfire_tool.py +922 -0
- aiecs/tools/task_tools/image_tool.py +447 -0
- aiecs/tools/task_tools/office_tool.py +684 -0
- aiecs/tools/task_tools/pandas_tool.py +635 -0
- aiecs/tools/task_tools/report_tool.py +635 -0
- aiecs/tools/task_tools/research_tool.py +392 -0
- aiecs/tools/task_tools/scraper_tool.py +715 -0
- aiecs/tools/task_tools/stats_tool.py +688 -0
- aiecs/tools/temp_file_manager.py +130 -0
- aiecs/tools/tool_executor/__init__.py +37 -0
- aiecs/tools/tool_executor/tool_executor.py +881 -0
- aiecs/utils/LLM_output_structor.py +445 -0
- aiecs/utils/__init__.py +34 -0
- aiecs/utils/base_callback.py +47 -0
- aiecs/utils/cache_provider.py +695 -0
- aiecs/utils/execution_utils.py +184 -0
- aiecs/utils/logging.py +1 -0
- aiecs/utils/prompt_loader.py +14 -0
- aiecs/utils/token_usage_repository.py +323 -0
- aiecs/ws/__init__.py +0 -0
- aiecs/ws/socket_server.py +52 -0
- aiecs-1.5.1.dist-info/METADATA +608 -0
- aiecs-1.5.1.dist-info/RECORD +302 -0
- aiecs-1.5.1.dist-info/WHEEL +5 -0
- aiecs-1.5.1.dist-info/entry_points.txt +10 -0
- aiecs-1.5.1.dist-info/licenses/LICENSE +225 -0
- aiecs-1.5.1.dist-info/top_level.txt +1 -0
aiecs/tools/statistics/data_profiler_tool.py
@@ -0,0 +1,658 @@

"""
Data Profiler Tool - Comprehensive data profiling and quality assessment

This tool provides advanced data profiling capabilities with:
- Statistical summaries and distributions
- Data quality issue detection
- Pattern and anomaly identification
- Preprocessing recommendations
- Column-level and dataset-level analysis
"""

import logging
from typing import Dict, Any, List, Optional, Union
from enum import Enum

import pandas as pd
import numpy as np
from pydantic import BaseModel, Field, ConfigDict

from aiecs.tools.base_tool import BaseTool
from aiecs.tools import register_tool


class ProfileLevel(str, Enum):
    """Data profiling depth levels"""

    BASIC = "basic"
    STANDARD = "standard"
    COMPREHENSIVE = "comprehensive"
    DEEP = "deep"


class DataQualityCheck(str, Enum):
    """Types of data quality checks"""

    MISSING_VALUES = "missing_values"
    DUPLICATES = "duplicates"
    OUTLIERS = "outliers"
    INCONSISTENCIES = "inconsistencies"
    DATA_TYPES = "data_types"
    DISTRIBUTIONS = "distributions"
    CORRELATIONS = "correlations"


class DataProfilerError(Exception):
    """Base exception for DataProfiler errors"""


class ProfilingError(DataProfilerError):
    """Raised when profiling operation fails"""


@register_tool("data_profiler")
class DataProfilerTool(BaseTool):
    """
    Comprehensive data profiling tool that can:
    1. Generate statistical summaries
    2. Detect data quality issues
    3. Identify patterns and anomalies
    4. Recommend preprocessing steps

    Integrates with stats_tool and pandas_tool for core operations.
    """

    # Configuration schema
    class Config(BaseModel):
        """Configuration for the data profiler tool"""

        model_config = ConfigDict(env_prefix="DATA_PROFILER_")

        default_profile_level: str = Field(
            default="standard", description="Default profiling depth level"
        )
        outlier_std_threshold: float = Field(
            default=3.0,
            description="Standard deviation threshold for outlier detection",
        )
        correlation_threshold: float = Field(
            default=0.7,
            description="Correlation threshold for identifying strong relationships",
        )
        missing_threshold: float = Field(
            default=0.5,
            description="Missing value threshold for quality assessment",
        )
        enable_visualizations: bool = Field(
            default=True,
            description="Whether to enable visualization generation",
        )
        max_unique_values_categorical: int = Field(
            default=50,
            description="Maximum unique values for categorical analysis",
        )

    def __init__(self, config: Optional[Dict[str, Any]] = None):
        """
        Initialize DataProfilerTool with settings.

        Args:
            config: Optional configuration overrides
        """
        super().__init__(config)

        # Parse configuration
        self.config = self.Config(**(config or {}))

        self.logger = logging.getLogger(__name__)
        if not self.logger.handlers:
            handler = logging.StreamHandler()
            handler.setFormatter(logging.Formatter("%(asctime)s %(levelname)s %(message)s"))
            self.logger.addHandler(handler)
        self.logger.setLevel(logging.INFO)

        # Initialize external tools
        self._init_external_tools()

    def _init_external_tools(self):
        """Initialize external task tools"""
        self.external_tools = {}

        # Initialize StatsTool for statistical operations
        try:
            from aiecs.tools.task_tools.stats_tool import StatsTool

            self.external_tools["stats"] = StatsTool()
            self.logger.info("StatsTool initialized successfully")
        except ImportError:
            self.logger.warning("StatsTool not available")
            self.external_tools["stats"] = None

        # Initialize PandasTool for data operations
        try:
            from aiecs.tools.task_tools.pandas_tool import PandasTool

            self.external_tools["pandas"] = PandasTool()
            self.logger.info("PandasTool initialized successfully")
        except ImportError:
            self.logger.warning("PandasTool not available")
            self.external_tools["pandas"] = None

    # Schema definitions
    class ProfileDatasetSchema(BaseModel):
        """Schema for profile_dataset operation"""

        data: Union[Dict[str, Any], List[Dict[str, Any]]] = Field(description="Data to profile")
        level: ProfileLevel = Field(
            default=ProfileLevel.STANDARD, description="Profiling depth level"
        )
        checks: Optional[List[DataQualityCheck]] = Field(
            default=None, description="Specific quality checks to perform"
        )
        generate_visualizations: bool = Field(
            default=False, description="Generate visualization data"
        )

    class DetectQualityIssuesSchema(BaseModel):
        """Schema for detect_quality_issues operation"""

        data: Union[Dict[str, Any], List[Dict[str, Any]]] = Field(description="Data to check")
        checks: Optional[List[DataQualityCheck]] = Field(
            default=None, description="Specific checks to perform"
        )

    class RecommendPreprocessingSchema(BaseModel):
        """Schema for recommend_preprocessing operation"""

        data: Union[Dict[str, Any], List[Dict[str, Any]]] = Field(description="Data to analyze")
        target_column: Optional[str] = Field(default=None, description="Target column for ML tasks")

    def profile_dataset(
        self,
        data: Union[Dict[str, Any], List[Dict[str, Any]], pd.DataFrame],
        level: ProfileLevel = ProfileLevel.STANDARD,
        checks: Optional[List[DataQualityCheck]] = None,
        generate_visualizations: bool = False,
    ) -> Dict[str, Any]:
        """
        Generate comprehensive data profile.

        Args:
            data: Data to profile (dict, list of dicts, or DataFrame)
            level: Profiling depth level
            checks: Specific quality checks to perform (all if None)
            generate_visualizations: Whether to generate visualization data

        Returns:
            Dict containing:
            - summary: Dataset-level summary
            - column_profiles: Column-level profiles
            - quality_issues: Detected quality issues
            - correlations: Correlation analysis
            - recommendations: Preprocessing recommendations

        Raises:
            ProfilingError: If profiling fails
        """
        try:
            # Convert to DataFrame if needed
            df = self._to_dataframe(data)

            self.logger.info(f"Profiling dataset with {len(df)} rows and {len(df.columns)} columns")

            # Generate summary
            summary = self._generate_summary(df)

            # Generate column profiles
            column_profiles = self._profile_columns(df, level)

            # Detect quality issues
            quality_issues = self._detect_quality_issues(df, checks)

            # Correlation analysis (for comprehensive and deep levels)
            correlations = {}
            if level in [ProfileLevel.COMPREHENSIVE, ProfileLevel.DEEP]:
                correlations = self._analyze_correlations(df)

            # Generate recommendations
            recommendations = self._generate_recommendations(df, quality_issues, level)

            # Generate visualization data if requested
            visualization_data = {}
            if generate_visualizations:
                visualization_data = self._generate_visualization_data(df)

            result = {
                "summary": summary,
                "column_profiles": column_profiles,
                "quality_issues": quality_issues,
                "correlations": correlations,
                "recommendations": recommendations,
                "profile_level": level.value,
            }

            if visualization_data:
                result["visualization_data"] = visualization_data

            self.logger.info("Dataset profiling completed successfully")
            return result

        except Exception as e:
            self.logger.error(f"Error profiling dataset: {e}")
            raise ProfilingError(f"Failed to profile dataset: {e}")

    def detect_quality_issues(
        self,
        data: Union[Dict[str, Any], List[Dict[str, Any]], pd.DataFrame],
        checks: Optional[List[DataQualityCheck]] = None,
    ) -> Dict[str, Any]:
        """
        Detect data quality issues.

        Args:
            data: Data to check
            checks: Specific checks to perform (all if None)

        Returns:
            Dict containing detected issues by category
        """
        try:
            df = self._to_dataframe(data)
            issues = self._detect_quality_issues(df, checks)

            return {
                "issues": issues,
                "total_issues": sum(len(v) for v in issues.values()),
                "severity_counts": self._categorize_severity(issues),
            }

        except Exception as e:
            self.logger.error(f"Error detecting quality issues: {e}")
            raise ProfilingError(f"Failed to detect quality issues: {e}")

    def recommend_preprocessing(
        self,
        data: Union[Dict[str, Any], List[Dict[str, Any]], pd.DataFrame],
        target_column: Optional[str] = None,
    ) -> Dict[str, Any]:
        """
        Recommend preprocessing steps based on data analysis.

        Args:
            data: Data to analyze
            target_column: Target column for ML tasks (if applicable)

        Returns:
            Dict containing recommended preprocessing steps
        """
        try:
            df = self._to_dataframe(data)

            # Detect quality issues
            quality_issues = self._detect_quality_issues(df, None)

            # Generate recommendations
            recommendations = self._generate_recommendations(
                df, quality_issues, ProfileLevel.COMPREHENSIVE
            )

            # Add task-specific recommendations
            if target_column and target_column in df.columns:
                task_recommendations = self._generate_task_recommendations(df, target_column)
                recommendations.extend(task_recommendations)

            # Prioritize recommendations
            prioritized = self._prioritize_recommendations(recommendations)

            return {
                "recommendations": prioritized,
                "total_steps": len(prioritized),
                "estimated_impact": "medium",  # Placeholder for impact estimation
            }

        except Exception as e:
            self.logger.error(f"Error generating recommendations: {e}")
            raise ProfilingError(f"Failed to generate recommendations: {e}")

    # Internal helper methods

    def _to_dataframe(self, data: Union[Dict, List, pd.DataFrame]) -> pd.DataFrame:
        """Convert data to DataFrame"""
        if isinstance(data, pd.DataFrame):
            return data
        elif isinstance(data, list):
            return pd.DataFrame(data)
        elif isinstance(data, dict):
            return pd.DataFrame([data])
        else:
            raise ProfilingError(f"Unsupported data type: {type(data)}")

    def _generate_summary(self, df: pd.DataFrame) -> Dict[str, Any]:
        """Generate dataset-level summary"""
        numeric_cols = df.select_dtypes(include=[np.number]).columns
        categorical_cols = df.select_dtypes(include=["object", "category"]).columns

        return {
            "rows": len(df),
            "columns": len(df.columns),
            "numeric_columns": len(numeric_cols),
            "categorical_columns": len(categorical_cols),
            "memory_usage_mb": df.memory_usage(deep=True).sum() / (1024 * 1024),
            "missing_cells": df.isnull().sum().sum(),
            "missing_percentage": (
                (df.isnull().sum().sum() / (len(df) * len(df.columns)) * 100) if len(df) > 0 else 0
            ),
            "duplicate_rows": df.duplicated().sum(),
            "duplicate_percentage": ((df.duplicated().sum() / len(df) * 100) if len(df) > 0 else 0),
        }

    def _profile_columns(self, df: pd.DataFrame, level: ProfileLevel) -> Dict[str, Dict[str, Any]]:
        """Generate column-level profiles"""
        profiles = {}

        for col in df.columns:
            profile = {
                "name": col,
                "dtype": str(df[col].dtype),
                "missing_count": df[col].isnull().sum(),
                "missing_percentage": (
                    (df[col].isnull().sum() / len(df) * 100) if len(df) > 0 else 0
                ),
                "unique_count": df[col].nunique(),
                "unique_percentage": ((df[col].nunique() / len(df) * 100) if len(df) > 0 else 0),
            }

            # Add type-specific statistics
            if df[col].dtype in ["int64", "float64"]:
                profile.update(self._profile_numeric_column(df[col], level))
            else:
                profile.update(self._profile_categorical_column(df[col], level))

            profiles[col] = profile

        return profiles

    def _profile_numeric_column(self, series: pd.Series, level: ProfileLevel) -> Dict[str, Any]:
        """Profile numeric column"""
        profile = {
            "type": "numeric",
            "min": float(series.min()) if not series.empty else None,
            "max": float(series.max()) if not series.empty else None,
            "mean": float(series.mean()) if not series.empty else None,
            "median": float(series.median()) if not series.empty else None,
            "std": float(series.std()) if not series.empty else None,
        }

        if level in [ProfileLevel.COMPREHENSIVE, ProfileLevel.DEEP]:
            profile.update(
                {
                    "q25": (float(series.quantile(0.25)) if not series.empty else None),
                    "q75": (float(series.quantile(0.75)) if not series.empty else None),
                    "skewness": (float(series.skew()) if not series.empty else None),
                    "kurtosis": (float(series.kurt()) if not series.empty else None),
                }
            )

        # Detect outliers
        if not series.empty and series.std() > 0:
            z_scores = np.abs((series - series.mean()) / series.std())
            outlier_count = (z_scores > self.config.outlier_std_threshold).sum()
            profile["outlier_count"] = int(outlier_count)
            profile["outlier_percentage"] = float(outlier_count / len(series) * 100)

        return profile

    def _profile_categorical_column(self, series: pd.Series, level: ProfileLevel) -> Dict[str, Any]:
        """Profile categorical column"""
        value_counts = series.value_counts()

        profile = {
            "type": "categorical",
            "unique_values": int(series.nunique()),
            "most_common": (str(value_counts.index[0]) if not value_counts.empty else None),
            "most_common_count": (int(value_counts.iloc[0]) if not value_counts.empty else None),
        }

        if level in [ProfileLevel.COMPREHENSIVE, ProfileLevel.DEEP]:
            # Add top categories
            top_n = min(10, len(value_counts))
            profile["top_categories"] = {
                str(k): int(v) for k, v in value_counts.head(top_n).items()
            }

        return profile

    def _detect_quality_issues(
        self, df: pd.DataFrame, checks: Optional[List[DataQualityCheck]]
    ) -> Dict[str, List[Dict[str, Any]]]:
        """Detect data quality issues"""
        issues = {
            "missing_values": [],
            "duplicates": [],
            "outliers": [],
            "inconsistencies": [],
            "data_types": [],
            "distributions": [],
            "correlations": [],
        }

        # All checks by default
        if checks is None:
            checks = list(DataQualityCheck)

        # Missing values check
        if DataQualityCheck.MISSING_VALUES in checks:
            for col in df.columns:
                missing_pct = (df[col].isnull().sum() / len(df) * 100) if len(df) > 0 else 0
                if missing_pct > 0:
                    issues["missing_values"].append(
                        {
                            "column": col,
                            "missing_percentage": missing_pct,
                            "severity": (
                                "high"
                                if missing_pct > self.config.missing_threshold * 100
                                else "medium"
                            ),
                        }
                    )

        # Duplicates check
        if DataQualityCheck.DUPLICATES in checks:
            dup_count = df.duplicated().sum()
            if dup_count > 0:
                issues["duplicates"].append(
                    {
                        "type": "row_duplicates",
                        "count": int(dup_count),
                        "percentage": (float(dup_count / len(df) * 100) if len(df) > 0 else 0),
                        "severity": "medium",
                    }
                )

        # Outliers check
        if DataQualityCheck.OUTLIERS in checks:
            numeric_cols = df.select_dtypes(include=[np.number]).columns
            for col in numeric_cols:
                if df[col].std() > 0:
                    z_scores = np.abs((df[col] - df[col].mean()) / df[col].std())
                    outlier_count = (z_scores > self.config.outlier_std_threshold).sum()
                    if outlier_count > 0:
                        issues["outliers"].append(
                            {
                                "column": col,
                                "count": int(outlier_count),
                                "percentage": float(outlier_count / len(df) * 100),
                                "severity": "low",
                            }
                        )

        return issues

    def _analyze_correlations(self, df: pd.DataFrame) -> Dict[str, Any]:
        """Analyze correlations between numeric columns"""
        numeric_df = df.select_dtypes(include=[np.number])

        if numeric_df.shape[1] < 2:
            return {"message": "Insufficient numeric columns for correlation analysis"}

        corr_matrix = numeric_df.corr()

        # Find high correlations
        high_corr_pairs = []
        for i in range(len(corr_matrix.columns)):
            for j in range(i + 1, len(corr_matrix.columns)):
                corr_value = corr_matrix.iloc[i, j]
                if abs(corr_value) > self.config.correlation_threshold:
                    high_corr_pairs.append(
                        {
                            "column1": corr_matrix.columns[i],
                            "column2": corr_matrix.columns[j],
                            "correlation": float(corr_value),
                        }
                    )

        return {
            "correlation_matrix": corr_matrix.to_dict(),
            "high_correlations": high_corr_pairs,
            "num_high_correlations": len(high_corr_pairs),
        }

    def _generate_recommendations(
        self,
        df: pd.DataFrame,
        quality_issues: Dict[str, List],
        level: ProfileLevel,
    ) -> List[Dict[str, Any]]:
        """Generate preprocessing recommendations"""
        recommendations = []

        # Missing value recommendations
        for issue in quality_issues.get("missing_values", []):
            if issue["missing_percentage"] < 5:
                recommendations.append(
                    {
                        "action": "drop_missing_rows",
                        "column": issue["column"],
                        "reason": f"Low missing percentage ({issue['missing_percentage']:.2f}%)",
                        "priority": "medium",
                    }
                )
            elif issue["missing_percentage"] < 50:
                recommendations.append(
                    {
                        "action": "impute_missing",
                        "column": issue["column"],
                        "method": (
                            "mean" if df[issue["column"]].dtype in ["int64", "float64"] else "mode"
                        ),
                        "reason": f"Moderate missing percentage ({issue['missing_percentage']:.2f}%)",
                        "priority": "high",
                    }
                )
            else:
                recommendations.append(
                    {
                        "action": "consider_dropping_column",
                        "column": issue["column"],
                        "reason": f"High missing percentage ({issue['missing_percentage']:.2f}%)",
                        "priority": "high",
                    }
                )

        # Duplicate recommendations
        if quality_issues.get("duplicates"):
            recommendations.append(
                {
                    "action": "remove_duplicates",
                    "reason": f"{quality_issues['duplicates'][0]['count']} duplicate rows found",
                    "priority": "high",
                }
            )

        # Outlier recommendations
        if quality_issues.get("outliers"):
            for issue in quality_issues["outliers"]:
                if issue["percentage"] > 5:
                    recommendations.append(
                        {
                            "action": "handle_outliers",
                            "column": issue["column"],
                            "method": "winsorize or cap",
                            "reason": f"Significant outliers detected ({issue['percentage']:.2f}%)",
                            "priority": "medium",
                        }
                    )

        return recommendations

    def _generate_task_recommendations(
        self, df: pd.DataFrame, target_column: str
    ) -> List[Dict[str, Any]]:
        """Generate task-specific recommendations"""
        recommendations = []

        # Check if target is numeric or categorical
        if df[target_column].dtype in ["int64", "float64"]:
            task_type = "regression"
        else:
            task_type = "classification"

        recommendations.append(
            {
                "action": "task_identified",
                "task_type": task_type,
                "target_column": target_column,
                "reason": f"Based on target column type: {df[target_column].dtype}",
                "priority": "info",
            }
        )

        return recommendations

    def _prioritize_recommendations(
        self, recommendations: List[Dict[str, Any]]
    ) -> List[Dict[str, Any]]:
        """Prioritize recommendations by importance"""
        priority_order = {"high": 0, "medium": 1, "low": 2, "info": 3}
        return sorted(
            recommendations,
            key=lambda x: priority_order.get(x.get("priority", "low"), 2),
        )

    def _categorize_severity(self, issues: Dict[str, List]) -> Dict[str, int]:
        """Categorize issues by severity"""
        severity_counts = {"high": 0, "medium": 0, "low": 0}

        for issue_list in issues.values():
            for issue in issue_list:
                severity = issue.get("severity", "low")
                severity_counts[severity] = severity_counts.get(severity, 0) + 1

        return severity_counts

    def _generate_visualization_data(self, df: pd.DataFrame) -> Dict[str, Any]:
        """Generate data for visualizations"""
        viz_data = {}

        # Numeric distributions
        numeric_cols = df.select_dtypes(include=[np.number]).columns
        if len(numeric_cols) > 0:
            viz_data["numeric_distributions"] = {
                col: {
                    # Sample for performance
                    "values": df[col].dropna().tolist()[:1000],
                    "bins": 30,
                }
                for col in numeric_cols[:5]  # Limit to first 5
            }

        # Categorical distributions
        categorical_cols = df.select_dtypes(include=["object", "category"]).columns
        if len(categorical_cols) > 0:
            viz_data["categorical_distributions"] = {
                col: df[col].value_counts().head(10).to_dict()
                for col in categorical_cols[:5]  # Limit to first 5
            }

        return viz_data
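
For orientation, here is a minimal usage sketch of the `data_profiler` tool added in this release. It is not part of the diff itself: it assumes `aiecs==1.5.1` is installed with its pandas/numpy dependencies, and the sample records, column names, and config override below are invented for illustration.

```python
# Minimal usage sketch (not part of the package diff). Assumes aiecs==1.5.1
# is installed; the records and column names are invented for illustration.
from aiecs.tools.statistics.data_profiler_tool import DataProfilerTool, ProfileLevel

rows = [
    {"age": 34, "income": 72000.0, "segment": "a"},
    {"age": 41, "income": None, "segment": "b"},
    {"age": 34, "income": 72000.0, "segment": "a"},  # exact duplicate of row 0
]

# Config keys mirror the nested Config model above; the override is optional.
profiler = DataProfilerTool({"outlier_std_threshold": 2.5})

# Full profile: summary, per-column stats, quality issues, recommendations.
report = profiler.profile_dataset(rows, level=ProfileLevel.COMPREHENSIVE)
print(report["summary"]["duplicate_rows"])         # 1
print(report["quality_issues"]["missing_values"])  # 'income' flagged (~33% missing)

# Targeted checks, plus preprocessing advice sorted high -> info priority.
issues = profiler.detect_quality_issues(rows)
print(issues["total_issues"], issues["severity_counts"])
for step in profiler.recommend_preprocessing(rows, target_column="income")["recommendations"]:
    print(step["priority"], step["action"], step.get("column"))
```

On this tiny dataset the tool would report one duplicate row, recommend mean-imputing `income` (float column, missing below 50%), recommend removing duplicates, and identify the target task as regression from the float64 target column.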