aiecs 1.5.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- aiecs/__init__.py +72 -0
- aiecs/__main__.py +41 -0
- aiecs/aiecs_client.py +469 -0
- aiecs/application/__init__.py +10 -0
- aiecs/application/executors/__init__.py +10 -0
- aiecs/application/executors/operation_executor.py +363 -0
- aiecs/application/knowledge_graph/__init__.py +7 -0
- aiecs/application/knowledge_graph/builder/__init__.py +37 -0
- aiecs/application/knowledge_graph/builder/document_builder.py +375 -0
- aiecs/application/knowledge_graph/builder/graph_builder.py +356 -0
- aiecs/application/knowledge_graph/builder/schema_mapping.py +531 -0
- aiecs/application/knowledge_graph/builder/structured_pipeline.py +443 -0
- aiecs/application/knowledge_graph/builder/text_chunker.py +319 -0
- aiecs/application/knowledge_graph/extractors/__init__.py +27 -0
- aiecs/application/knowledge_graph/extractors/base.py +100 -0
- aiecs/application/knowledge_graph/extractors/llm_entity_extractor.py +327 -0
- aiecs/application/knowledge_graph/extractors/llm_relation_extractor.py +349 -0
- aiecs/application/knowledge_graph/extractors/ner_entity_extractor.py +244 -0
- aiecs/application/knowledge_graph/fusion/__init__.py +23 -0
- aiecs/application/knowledge_graph/fusion/entity_deduplicator.py +387 -0
- aiecs/application/knowledge_graph/fusion/entity_linker.py +343 -0
- aiecs/application/knowledge_graph/fusion/knowledge_fusion.py +580 -0
- aiecs/application/knowledge_graph/fusion/relation_deduplicator.py +189 -0
- aiecs/application/knowledge_graph/pattern_matching/__init__.py +21 -0
- aiecs/application/knowledge_graph/pattern_matching/pattern_matcher.py +344 -0
- aiecs/application/knowledge_graph/pattern_matching/query_executor.py +378 -0
- aiecs/application/knowledge_graph/profiling/__init__.py +12 -0
- aiecs/application/knowledge_graph/profiling/query_plan_visualizer.py +199 -0
- aiecs/application/knowledge_graph/profiling/query_profiler.py +223 -0
- aiecs/application/knowledge_graph/reasoning/__init__.py +27 -0
- aiecs/application/knowledge_graph/reasoning/evidence_synthesis.py +347 -0
- aiecs/application/knowledge_graph/reasoning/inference_engine.py +504 -0
- aiecs/application/knowledge_graph/reasoning/logic_form_parser.py +167 -0
- aiecs/application/knowledge_graph/reasoning/logic_parser/__init__.py +79 -0
- aiecs/application/knowledge_graph/reasoning/logic_parser/ast_builder.py +513 -0
- aiecs/application/knowledge_graph/reasoning/logic_parser/ast_nodes.py +630 -0
- aiecs/application/knowledge_graph/reasoning/logic_parser/ast_validator.py +654 -0
- aiecs/application/knowledge_graph/reasoning/logic_parser/error_handler.py +477 -0
- aiecs/application/knowledge_graph/reasoning/logic_parser/parser.py +390 -0
- aiecs/application/knowledge_graph/reasoning/logic_parser/query_context.py +217 -0
- aiecs/application/knowledge_graph/reasoning/logic_query_integration.py +169 -0
- aiecs/application/knowledge_graph/reasoning/query_planner.py +872 -0
- aiecs/application/knowledge_graph/reasoning/reasoning_engine.py +554 -0
- aiecs/application/knowledge_graph/retrieval/__init__.py +19 -0
- aiecs/application/knowledge_graph/retrieval/retrieval_strategies.py +596 -0
- aiecs/application/knowledge_graph/search/__init__.py +59 -0
- aiecs/application/knowledge_graph/search/hybrid_search.py +423 -0
- aiecs/application/knowledge_graph/search/reranker.py +295 -0
- aiecs/application/knowledge_graph/search/reranker_strategies.py +553 -0
- aiecs/application/knowledge_graph/search/text_similarity.py +398 -0
- aiecs/application/knowledge_graph/traversal/__init__.py +15 -0
- aiecs/application/knowledge_graph/traversal/enhanced_traversal.py +329 -0
- aiecs/application/knowledge_graph/traversal/path_scorer.py +269 -0
- aiecs/application/knowledge_graph/validators/__init__.py +13 -0
- aiecs/application/knowledge_graph/validators/relation_validator.py +189 -0
- aiecs/application/knowledge_graph/visualization/__init__.py +11 -0
- aiecs/application/knowledge_graph/visualization/graph_visualizer.py +321 -0
- aiecs/common/__init__.py +9 -0
- aiecs/common/knowledge_graph/__init__.py +17 -0
- aiecs/common/knowledge_graph/runnable.py +484 -0
- aiecs/config/__init__.py +16 -0
- aiecs/config/config.py +498 -0
- aiecs/config/graph_config.py +137 -0
- aiecs/config/registry.py +23 -0
- aiecs/core/__init__.py +46 -0
- aiecs/core/interface/__init__.py +34 -0
- aiecs/core/interface/execution_interface.py +152 -0
- aiecs/core/interface/storage_interface.py +171 -0
- aiecs/domain/__init__.py +289 -0
- aiecs/domain/agent/__init__.py +189 -0
- aiecs/domain/agent/base_agent.py +697 -0
- aiecs/domain/agent/exceptions.py +103 -0
- aiecs/domain/agent/graph_aware_mixin.py +559 -0
- aiecs/domain/agent/hybrid_agent.py +490 -0
- aiecs/domain/agent/integration/__init__.py +26 -0
- aiecs/domain/agent/integration/context_compressor.py +222 -0
- aiecs/domain/agent/integration/context_engine_adapter.py +252 -0
- aiecs/domain/agent/integration/retry_policy.py +219 -0
- aiecs/domain/agent/integration/role_config.py +213 -0
- aiecs/domain/agent/knowledge_aware_agent.py +646 -0
- aiecs/domain/agent/lifecycle.py +296 -0
- aiecs/domain/agent/llm_agent.py +300 -0
- aiecs/domain/agent/memory/__init__.py +12 -0
- aiecs/domain/agent/memory/conversation.py +197 -0
- aiecs/domain/agent/migration/__init__.py +14 -0
- aiecs/domain/agent/migration/conversion.py +160 -0
- aiecs/domain/agent/migration/legacy_wrapper.py +90 -0
- aiecs/domain/agent/models.py +317 -0
- aiecs/domain/agent/observability.py +407 -0
- aiecs/domain/agent/persistence.py +289 -0
- aiecs/domain/agent/prompts/__init__.py +29 -0
- aiecs/domain/agent/prompts/builder.py +161 -0
- aiecs/domain/agent/prompts/formatters.py +189 -0
- aiecs/domain/agent/prompts/template.py +255 -0
- aiecs/domain/agent/registry.py +260 -0
- aiecs/domain/agent/tool_agent.py +257 -0
- aiecs/domain/agent/tools/__init__.py +12 -0
- aiecs/domain/agent/tools/schema_generator.py +221 -0
- aiecs/domain/community/__init__.py +155 -0
- aiecs/domain/community/agent_adapter.py +477 -0
- aiecs/domain/community/analytics.py +481 -0
- aiecs/domain/community/collaborative_workflow.py +642 -0
- aiecs/domain/community/communication_hub.py +645 -0
- aiecs/domain/community/community_builder.py +320 -0
- aiecs/domain/community/community_integration.py +800 -0
- aiecs/domain/community/community_manager.py +813 -0
- aiecs/domain/community/decision_engine.py +879 -0
- aiecs/domain/community/exceptions.py +225 -0
- aiecs/domain/community/models/__init__.py +33 -0
- aiecs/domain/community/models/community_models.py +268 -0
- aiecs/domain/community/resource_manager.py +457 -0
- aiecs/domain/community/shared_context_manager.py +603 -0
- aiecs/domain/context/__init__.py +58 -0
- aiecs/domain/context/context_engine.py +989 -0
- aiecs/domain/context/conversation_models.py +354 -0
- aiecs/domain/context/graph_memory.py +467 -0
- aiecs/domain/execution/__init__.py +12 -0
- aiecs/domain/execution/model.py +57 -0
- aiecs/domain/knowledge_graph/__init__.py +19 -0
- aiecs/domain/knowledge_graph/models/__init__.py +52 -0
- aiecs/domain/knowledge_graph/models/entity.py +130 -0
- aiecs/domain/knowledge_graph/models/evidence.py +194 -0
- aiecs/domain/knowledge_graph/models/inference_rule.py +186 -0
- aiecs/domain/knowledge_graph/models/path.py +179 -0
- aiecs/domain/knowledge_graph/models/path_pattern.py +173 -0
- aiecs/domain/knowledge_graph/models/query.py +272 -0
- aiecs/domain/knowledge_graph/models/query_plan.py +187 -0
- aiecs/domain/knowledge_graph/models/relation.py +136 -0
- aiecs/domain/knowledge_graph/schema/__init__.py +23 -0
- aiecs/domain/knowledge_graph/schema/entity_type.py +135 -0
- aiecs/domain/knowledge_graph/schema/graph_schema.py +271 -0
- aiecs/domain/knowledge_graph/schema/property_schema.py +155 -0
- aiecs/domain/knowledge_graph/schema/relation_type.py +171 -0
- aiecs/domain/knowledge_graph/schema/schema_manager.py +496 -0
- aiecs/domain/knowledge_graph/schema/type_enums.py +205 -0
- aiecs/domain/task/__init__.py +13 -0
- aiecs/domain/task/dsl_processor.py +613 -0
- aiecs/domain/task/model.py +62 -0
- aiecs/domain/task/task_context.py +268 -0
- aiecs/infrastructure/__init__.py +24 -0
- aiecs/infrastructure/graph_storage/__init__.py +11 -0
- aiecs/infrastructure/graph_storage/base.py +601 -0
- aiecs/infrastructure/graph_storage/batch_operations.py +449 -0
- aiecs/infrastructure/graph_storage/cache.py +429 -0
- aiecs/infrastructure/graph_storage/distributed.py +226 -0
- aiecs/infrastructure/graph_storage/error_handling.py +390 -0
- aiecs/infrastructure/graph_storage/graceful_degradation.py +306 -0
- aiecs/infrastructure/graph_storage/health_checks.py +378 -0
- aiecs/infrastructure/graph_storage/in_memory.py +514 -0
- aiecs/infrastructure/graph_storage/index_optimization.py +483 -0
- aiecs/infrastructure/graph_storage/lazy_loading.py +410 -0
- aiecs/infrastructure/graph_storage/metrics.py +357 -0
- aiecs/infrastructure/graph_storage/migration.py +413 -0
- aiecs/infrastructure/graph_storage/pagination.py +471 -0
- aiecs/infrastructure/graph_storage/performance_monitoring.py +466 -0
- aiecs/infrastructure/graph_storage/postgres.py +871 -0
- aiecs/infrastructure/graph_storage/query_optimizer.py +635 -0
- aiecs/infrastructure/graph_storage/schema_cache.py +290 -0
- aiecs/infrastructure/graph_storage/sqlite.py +623 -0
- aiecs/infrastructure/graph_storage/streaming.py +495 -0
- aiecs/infrastructure/messaging/__init__.py +13 -0
- aiecs/infrastructure/messaging/celery_task_manager.py +383 -0
- aiecs/infrastructure/messaging/websocket_manager.py +298 -0
- aiecs/infrastructure/monitoring/__init__.py +34 -0
- aiecs/infrastructure/monitoring/executor_metrics.py +174 -0
- aiecs/infrastructure/monitoring/global_metrics_manager.py +213 -0
- aiecs/infrastructure/monitoring/structured_logger.py +48 -0
- aiecs/infrastructure/monitoring/tracing_manager.py +410 -0
- aiecs/infrastructure/persistence/__init__.py +24 -0
- aiecs/infrastructure/persistence/context_engine_client.py +187 -0
- aiecs/infrastructure/persistence/database_manager.py +333 -0
- aiecs/infrastructure/persistence/file_storage.py +754 -0
- aiecs/infrastructure/persistence/redis_client.py +220 -0
- aiecs/llm/__init__.py +86 -0
- aiecs/llm/callbacks/__init__.py +11 -0
- aiecs/llm/callbacks/custom_callbacks.py +264 -0
- aiecs/llm/client_factory.py +420 -0
- aiecs/llm/clients/__init__.py +33 -0
- aiecs/llm/clients/base_client.py +193 -0
- aiecs/llm/clients/googleai_client.py +181 -0
- aiecs/llm/clients/openai_client.py +131 -0
- aiecs/llm/clients/vertex_client.py +437 -0
- aiecs/llm/clients/xai_client.py +184 -0
- aiecs/llm/config/__init__.py +51 -0
- aiecs/llm/config/config_loader.py +275 -0
- aiecs/llm/config/config_validator.py +236 -0
- aiecs/llm/config/model_config.py +151 -0
- aiecs/llm/utils/__init__.py +10 -0
- aiecs/llm/utils/validate_config.py +91 -0
- aiecs/main.py +363 -0
- aiecs/scripts/__init__.py +3 -0
- aiecs/scripts/aid/VERSION_MANAGEMENT.md +97 -0
- aiecs/scripts/aid/__init__.py +19 -0
- aiecs/scripts/aid/version_manager.py +215 -0
- aiecs/scripts/dependance_check/DEPENDENCY_SYSTEM_SUMMARY.md +242 -0
- aiecs/scripts/dependance_check/README_DEPENDENCY_CHECKER.md +310 -0
- aiecs/scripts/dependance_check/__init__.py +17 -0
- aiecs/scripts/dependance_check/dependency_checker.py +938 -0
- aiecs/scripts/dependance_check/dependency_fixer.py +391 -0
- aiecs/scripts/dependance_check/download_nlp_data.py +396 -0
- aiecs/scripts/dependance_check/quick_dependency_check.py +270 -0
- aiecs/scripts/dependance_check/setup_nlp_data.sh +217 -0
- aiecs/scripts/dependance_patch/__init__.py +7 -0
- aiecs/scripts/dependance_patch/fix_weasel/README_WEASEL_PATCH.md +126 -0
- aiecs/scripts/dependance_patch/fix_weasel/__init__.py +11 -0
- aiecs/scripts/dependance_patch/fix_weasel/fix_weasel_validator.py +128 -0
- aiecs/scripts/dependance_patch/fix_weasel/fix_weasel_validator.sh +82 -0
- aiecs/scripts/dependance_patch/fix_weasel/patch_weasel_library.sh +188 -0
- aiecs/scripts/dependance_patch/fix_weasel/run_weasel_patch.sh +41 -0
- aiecs/scripts/tools_develop/README.md +449 -0
- aiecs/scripts/tools_develop/TOOL_AUTO_DISCOVERY.md +234 -0
- aiecs/scripts/tools_develop/__init__.py +21 -0
- aiecs/scripts/tools_develop/check_type_annotations.py +259 -0
- aiecs/scripts/tools_develop/validate_tool_schemas.py +422 -0
- aiecs/scripts/tools_develop/verify_tools.py +356 -0
- aiecs/tasks/__init__.py +1 -0
- aiecs/tasks/worker.py +172 -0
- aiecs/tools/__init__.py +299 -0
- aiecs/tools/apisource/__init__.py +99 -0
- aiecs/tools/apisource/intelligence/__init__.py +19 -0
- aiecs/tools/apisource/intelligence/data_fusion.py +381 -0
- aiecs/tools/apisource/intelligence/query_analyzer.py +413 -0
- aiecs/tools/apisource/intelligence/search_enhancer.py +388 -0
- aiecs/tools/apisource/monitoring/__init__.py +9 -0
- aiecs/tools/apisource/monitoring/metrics.py +303 -0
- aiecs/tools/apisource/providers/__init__.py +115 -0
- aiecs/tools/apisource/providers/base.py +664 -0
- aiecs/tools/apisource/providers/census.py +401 -0
- aiecs/tools/apisource/providers/fred.py +564 -0
- aiecs/tools/apisource/providers/newsapi.py +412 -0
- aiecs/tools/apisource/providers/worldbank.py +357 -0
- aiecs/tools/apisource/reliability/__init__.py +12 -0
- aiecs/tools/apisource/reliability/error_handler.py +375 -0
- aiecs/tools/apisource/reliability/fallback_strategy.py +391 -0
- aiecs/tools/apisource/tool.py +850 -0
- aiecs/tools/apisource/utils/__init__.py +9 -0
- aiecs/tools/apisource/utils/validators.py +338 -0
- aiecs/tools/base_tool.py +201 -0
- aiecs/tools/docs/__init__.py +121 -0
- aiecs/tools/docs/ai_document_orchestrator.py +599 -0
- aiecs/tools/docs/ai_document_writer_orchestrator.py +2403 -0
- aiecs/tools/docs/content_insertion_tool.py +1333 -0
- aiecs/tools/docs/document_creator_tool.py +1317 -0
- aiecs/tools/docs/document_layout_tool.py +1166 -0
- aiecs/tools/docs/document_parser_tool.py +994 -0
- aiecs/tools/docs/document_writer_tool.py +1818 -0
- aiecs/tools/knowledge_graph/__init__.py +17 -0
- aiecs/tools/knowledge_graph/graph_reasoning_tool.py +734 -0
- aiecs/tools/knowledge_graph/graph_search_tool.py +923 -0
- aiecs/tools/knowledge_graph/kg_builder_tool.py +476 -0
- aiecs/tools/langchain_adapter.py +542 -0
- aiecs/tools/schema_generator.py +275 -0
- aiecs/tools/search_tool/__init__.py +100 -0
- aiecs/tools/search_tool/analyzers.py +589 -0
- aiecs/tools/search_tool/cache.py +260 -0
- aiecs/tools/search_tool/constants.py +128 -0
- aiecs/tools/search_tool/context.py +216 -0
- aiecs/tools/search_tool/core.py +749 -0
- aiecs/tools/search_tool/deduplicator.py +123 -0
- aiecs/tools/search_tool/error_handler.py +271 -0
- aiecs/tools/search_tool/metrics.py +371 -0
- aiecs/tools/search_tool/rate_limiter.py +178 -0
- aiecs/tools/search_tool/schemas.py +277 -0
- aiecs/tools/statistics/__init__.py +80 -0
- aiecs/tools/statistics/ai_data_analysis_orchestrator.py +643 -0
- aiecs/tools/statistics/ai_insight_generator_tool.py +505 -0
- aiecs/tools/statistics/ai_report_orchestrator_tool.py +694 -0
- aiecs/tools/statistics/data_loader_tool.py +564 -0
- aiecs/tools/statistics/data_profiler_tool.py +658 -0
- aiecs/tools/statistics/data_transformer_tool.py +573 -0
- aiecs/tools/statistics/data_visualizer_tool.py +495 -0
- aiecs/tools/statistics/model_trainer_tool.py +487 -0
- aiecs/tools/statistics/statistical_analyzer_tool.py +459 -0
- aiecs/tools/task_tools/__init__.py +86 -0
- aiecs/tools/task_tools/chart_tool.py +732 -0
- aiecs/tools/task_tools/classfire_tool.py +922 -0
- aiecs/tools/task_tools/image_tool.py +447 -0
- aiecs/tools/task_tools/office_tool.py +684 -0
- aiecs/tools/task_tools/pandas_tool.py +635 -0
- aiecs/tools/task_tools/report_tool.py +635 -0
- aiecs/tools/task_tools/research_tool.py +392 -0
- aiecs/tools/task_tools/scraper_tool.py +715 -0
- aiecs/tools/task_tools/stats_tool.py +688 -0
- aiecs/tools/temp_file_manager.py +130 -0
- aiecs/tools/tool_executor/__init__.py +37 -0
- aiecs/tools/tool_executor/tool_executor.py +881 -0
- aiecs/utils/LLM_output_structor.py +445 -0
- aiecs/utils/__init__.py +34 -0
- aiecs/utils/base_callback.py +47 -0
- aiecs/utils/cache_provider.py +695 -0
- aiecs/utils/execution_utils.py +184 -0
- aiecs/utils/logging.py +1 -0
- aiecs/utils/prompt_loader.py +14 -0
- aiecs/utils/token_usage_repository.py +323 -0
- aiecs/ws/__init__.py +0 -0
- aiecs/ws/socket_server.py +52 -0
- aiecs-1.5.1.dist-info/METADATA +608 -0
- aiecs-1.5.1.dist-info/RECORD +302 -0
- aiecs-1.5.1.dist-info/WHEEL +5 -0
- aiecs-1.5.1.dist-info/entry_points.txt +10 -0
- aiecs-1.5.1.dist-info/licenses/LICENSE +225 -0
- aiecs-1.5.1.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,573 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Data Transformer Tool - Data cleaning, transformation, and feature engineering
|
|
3
|
+
|
|
4
|
+
This tool provides comprehensive data transformation capabilities with:
|
|
5
|
+
- Data cleaning and preprocessing
|
|
6
|
+
- Feature engineering and encoding
|
|
7
|
+
- Normalization and standardization
|
|
8
|
+
- Transformation pipelines
|
|
9
|
+
- Missing value handling
|
|
10
|
+
"""
|
|
11
|
+
|
|
12
|
+
import logging
|
|
13
|
+
from typing import Dict, Any, List, Optional, Union
|
|
14
|
+
from enum import Enum
|
|
15
|
+
|
|
16
|
+
import pandas as pd
|
|
17
|
+
import numpy as np
|
|
18
|
+
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder
|
|
19
|
+
from pydantic import BaseModel, Field, ConfigDict
|
|
20
|
+
|
|
21
|
+
from aiecs.tools.base_tool import BaseTool
|
|
22
|
+
from aiecs.tools import register_tool
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
class TransformationType(str, Enum):
    """Names of the transformation steps a pipeline entry may request.

    Each member is also a ``str``, so it compares equal to its plain
    string value.
    """

    # -- data cleaning --
    REMOVE_DUPLICATES = "remove_duplicates"
    FILL_MISSING = "fill_missing"
    REMOVE_OUTLIERS = "remove_outliers"

    # -- value transformations --
    NORMALIZE = "normalize"
    STANDARDIZE = "standardize"
    LOG_TRANSFORM = "log_transform"
    BOX_COX = "box_cox"

    # -- categorical encodings --
    ONE_HOT_ENCODE = "one_hot_encode"
    LABEL_ENCODE = "label_encode"
    TARGET_ENCODE = "target_encode"

    # -- feature engineering --
    POLYNOMIAL_FEATURES = "polynomial_features"
    INTERACTION_FEATURES = "interaction_features"
    BINNING = "binning"
    AGGREGATION = "aggregation"
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
class MissingValueStrategy(str, Enum):
    """Available ways of dealing with missing values.

    Each member is also a ``str``, so it compares equal to its plain
    string value.
    """

    DROP = "drop"
    MEAN = "mean"
    MEDIAN = "median"
    MODE = "mode"
    FORWARD_FILL = "forward_fill"
    BACKWARD_FILL = "backward_fill"
    INTERPOLATE = "interpolate"
    CONSTANT = "constant"
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
class DataTransformerError(Exception):
    """Root of the exception hierarchy used by the data transformer."""
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
class TransformationError(DataTransformerError):
    """Signals that a transformation could not be completed."""
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
@register_tool("data_transformer")
|
|
73
|
+
class DataTransformerTool(BaseTool):
|
|
74
|
+
"""
|
|
75
|
+
Advanced data transformation tool that can:
|
|
76
|
+
1. Clean and preprocess data
|
|
77
|
+
2. Engineer features
|
|
78
|
+
3. Transform and normalize data
|
|
79
|
+
4. Build transformation pipelines
|
|
80
|
+
|
|
81
|
+
Integrates with pandas_tool for core operations.
|
|
82
|
+
"""
|
|
83
|
+
|
|
84
|
+
# Configuration schema
|
|
85
|
+
class Config(BaseModel):
    """Settings model for the data transformer tool."""

    # NOTE(review): ``env_prefix`` is normally a pydantic-settings option;
    # confirm that it has an effect on a plain BaseModel.
    model_config = ConfigDict(env_prefix="DATA_TRANSFORMER_")

    # Compared against z-scores when the tool removes outliers.
    outlier_std_threshold: float = Field(
        description="Standard deviation threshold for outlier detection",
        default=3.0,
    )
    default_missing_strategy: str = Field(
        description="Default strategy for handling missing values",
        default="mean",
    )
    enable_pipeline_caching: bool = Field(
        description="Whether to enable transformation pipeline caching",
        default=True,
    )
    max_one_hot_categories: int = Field(
        description="Maximum number of categories for one-hot encoding",
        default=10,
    )
|
|
106
|
+
|
|
107
|
+
def __init__(self, config: Optional[Dict[str, Any]] = None):
    """
    Set up configuration, logging, helper tools and the pipeline cache.

    Args:
        config: Optional mapping of configuration overrides; its entries
            are passed as keyword arguments to ``Config``.
    """
    super().__init__(config)

    # Build the typed settings object from the raw overrides
    overrides = config or {}
    self.config = self.Config(**overrides)

    # Attach a stream handler only when the module logger has none yet
    log = logging.getLogger(__name__)
    self.logger = log
    if not log.handlers:
        stream = logging.StreamHandler()
        stream.setFormatter(logging.Formatter("%(asctime)s %(levelname)s %(message)s"))
        log.addHandler(stream)
        log.setLevel(logging.INFO)

    # Helper tools are set up after the logger because their setup logs
    self._init_external_tools()

    # Starts empty; nothing in this method fills it
    self.pipeline_cache = {}
|
|
131
|
+
|
|
132
|
+
def _init_external_tools(self):
|
|
133
|
+
"""Initialize external task tools"""
|
|
134
|
+
self.external_tools = {}
|
|
135
|
+
|
|
136
|
+
# Initialize PandasTool for data operations
|
|
137
|
+
try:
|
|
138
|
+
from aiecs.tools.task_tools.pandas_tool import PandasTool
|
|
139
|
+
|
|
140
|
+
self.external_tools["pandas"] = PandasTool()
|
|
141
|
+
self.logger.info("PandasTool initialized successfully")
|
|
142
|
+
except ImportError:
|
|
143
|
+
self.logger.warning("PandasTool not available")
|
|
144
|
+
self.external_tools["pandas"] = None
|
|
145
|
+
|
|
146
|
+
# Schema definitions
|
|
147
|
+
class TransformDataSchema(BaseModel):
    """Input schema for the transform_data operation."""

    data: Union[Dict[str, Any], List[Dict[str, Any]]] = Field(
        description="Data to transform",
    )
    transformations: List[Dict[str, Any]] = Field(
        description="List of transformation steps",
    )
    # NOTE(review): transform_data names its matching parameter
    # ``validate``; confirm how this field is mapped onto it.
    enable_validation: bool = Field(
        description="Validate transformations",
        default=True,
    )
|
|
153
|
+
|
|
154
|
+
class AutoTransformSchema(BaseModel):
    """Input schema for the auto_transform operation."""

    data: Union[Dict[str, Any], List[Dict[str, Any]]] = Field(
        description="Data to transform",
    )
    target_column: Optional[str] = Field(
        description="Target column name",
        default=None,
    )
    task_type: Optional[str] = Field(
        description="Task type: classification or regression",
        default=None,
    )
|
|
162
|
+
|
|
163
|
+
class HandleMissingValuesSchema(BaseModel):
    """Input schema for the handle_missing_values operation."""

    data: Union[Dict[str, Any], List[Dict[str, Any]]] = Field(
        description="Data with missing values",
    )
    strategy: MissingValueStrategy = Field(
        description="Strategy for handling missing values",
        default=MissingValueStrategy.MEAN,
    )
    columns: Optional[List[str]] = Field(
        description="Specific columns to handle",
        default=None,
    )
    fill_value: Optional[Any] = Field(
        description="Value for constant strategy",
        default=None,
    )
|
|
175
|
+
|
|
176
|
+
class EncodeFeaturesSchema(BaseModel):
    """Input schema for the encode_features operation."""

    data: Union[Dict[str, Any], List[Dict[str, Any]]] = Field(
        description="Data to encode",
    )
    columns: List[str] = Field(
        description="Columns to encode",
    )
    method: str = Field(
        description="Encoding method: one_hot or label",
        default="one_hot",
    )
|
|
182
|
+
|
|
183
|
+
def transform_data(
    self,
    data: Union[Dict[str, Any], List[Dict[str, Any]], pd.DataFrame],
    transformations: List[Dict[str, Any]],
    validate: bool = True,
) -> Dict[str, Any]:
    """
    Apply transformation pipeline to data.

    The pipeline runs on a copy, so a DataFrame passed in by the caller
    is not modified by the transformation steps.

    Args:
        data: Data to transform
        transformations: List of transformation steps, each containing:
            - type: TransformationType
            - columns: List of columns (optional)
            - params: Additional parameters (optional; None is treated
              as an empty dict)
        validate: Whether to validate transformations. Accepted for
            interface compatibility; this method does not read it.

    Returns:
        Dict containing:
            - transformed_data: Transformed DataFrame
            - transformation_log: Log of applied transformations
            - quality_improvement: Quality metrics comparison
            - original_shape: Shape of the data before the pipeline
            - new_shape: Shape of the data after the pipeline

    Raises:
        TransformationError: If transformation fails; the underlying
            exception is attached as ``__cause__``.
    """
    try:
        original_df = self._to_dataframe(data)
        # _to_dataframe returns a caller-supplied DataFrame as-is and some
        # steps fill values in place, so the pipeline works on a copy.
        df = original_df.copy()

        total_steps = len(transformations)
        transformation_log = []

        for step, transform in enumerate(transformations, start=1):
            trans_type = transform.get("type")
            columns = transform.get("columns")
            # An explicit "params": None would break later params.get() calls
            params = transform.get("params") or {}

            self.logger.info(f"Applying transformation {step}/{total_steps}: {trans_type}")

            # Apply transformation
            df = self._apply_single_transformation(df, trans_type, columns, params)

            transformation_log.append(
                {
                    "step": step,
                    "type": trans_type,
                    "columns": columns,
                    "params": params,
                    "status": "success",
                }
            )

        # Calculate quality improvement
        quality_improvement = self._calculate_quality_improvement(original_df, df)

        return {
            "transformed_data": df,
            "transformation_log": transformation_log,
            "quality_improvement": quality_improvement,
            "original_shape": original_df.shape,
            "new_shape": df.shape,
        }

    except Exception as e:
        self.logger.error(f"Error in transformation pipeline: {e}")
        raise TransformationError(f"Transformation failed: {e}") from e
|
|
251
|
+
|
|
252
|
+
def auto_transform(
    self,
    data: Union[Dict[str, Any], List[Dict[str, Any]], pd.DataFrame],
    target_column: Optional[str] = None,
    task_type: Optional[str] = None,
) -> Dict[str, Any]:
    """
    Pick a set of transformations for the data and run them.

    Args:
        data: Data to transform
        target_column: Target column for ML tasks
        task_type: Type of task (classification or regression)

    Returns:
        The dict produced by transform_data, extended with an
        ``auto_detected_transformations`` entry holding the steps that
        were selected.

    Raises:
        TransformationError: If selecting or applying the steps fails
    """
    try:
        frame = self._to_dataframe(data)

        # Let the helper decide which steps this dataset needs
        plan = self._determine_transformations(frame, target_column, task_type)

        # Run the selected steps through the regular pipeline
        outcome = self.transform_data(frame, plan, validate=True)
        outcome["auto_detected_transformations"] = plan
        return outcome

    except Exception as exc:
        self.logger.error(f"Error in auto transform: {exc}")
        raise TransformationError(f"Auto transform failed: {exc}")
|
|
284
|
+
|
|
285
|
+
def handle_missing_values(
    self,
    data: Union[Dict[str, Any], List[Dict[str, Any]], pd.DataFrame],
    strategy: MissingValueStrategy = MissingValueStrategy.MEAN,
    columns: Optional[List[str]] = None,
    fill_value: Optional[Any] = None,
) -> Dict[str, Any]:
    """
    Handle missing values in data.

    All work happens on a copy, so a DataFrame passed in by the caller
    is left unchanged.

    Args:
        data: Data with missing values
        strategy: Strategy for handling missing values; either a
            MissingValueStrategy member or its string value
        columns: Specific columns to handle (None for all)
        fill_value: Value for constant strategy (required by it)

    Returns:
        Dict containing:
            - data: DataFrame with missing values handled
            - original_missing: Number of null cells before handling
            - final_missing: Number of null cells after handling
            - missing_handled: Difference between the two counts
            - strategy: String value of the strategy that was applied

    Raises:
        TransformationError: If any step of the handling fails; the
            underlying exception is attached as ``__cause__``.
    """

    def _is_plain_numeric(series: pd.Series) -> bool:
        # Only NumPy int/uint/float columns of any width; extension
        # dtypes are skipped, as in the previous int64/float64 check.
        dtype = series.dtype
        return isinstance(dtype, np.dtype) and dtype.kind in "iuf"

    try:
        # _to_dataframe returns a caller-supplied DataFrame as-is
        df = self._to_dataframe(data).copy()
        # Normalise so plain strings such as "mean" work, including the
        # ``strategy.value`` lookup in the result below
        strategy = MissingValueStrategy(strategy)
        original_missing = df.isnull().sum().sum()

        # Select columns to handle
        cols_to_handle = list(columns) if columns else df.columns.tolist()

        # Apply strategy. Columns are reassigned rather than filled with
        # ``inplace=True`` on ``df[col]``: that chained form does not
        # update the frame under pandas Copy-on-Write.
        if strategy == MissingValueStrategy.DROP:
            df = df.dropna(subset=cols_to_handle)
        elif strategy == MissingValueStrategy.MEAN:
            for col in cols_to_handle:
                if _is_plain_numeric(df[col]):
                    df[col] = df[col].fillna(df[col].mean())
        elif strategy == MissingValueStrategy.MEDIAN:
            for col in cols_to_handle:
                if _is_plain_numeric(df[col]):
                    df[col] = df[col].fillna(df[col].median())
        elif strategy == MissingValueStrategy.MODE:
            for col in cols_to_handle:
                modes = df[col].mode()
                if not modes.empty:
                    df[col] = df[col].fillna(modes.iloc[0])
        elif strategy == MissingValueStrategy.FORWARD_FILL:
            # fillna(method=...) is deprecated; ffill()/bfill() replace it
            df[cols_to_handle] = df[cols_to_handle].ffill()
        elif strategy == MissingValueStrategy.BACKWARD_FILL:
            df[cols_to_handle] = df[cols_to_handle].bfill()
        elif strategy == MissingValueStrategy.INTERPOLATE:
            for col in cols_to_handle:
                if _is_plain_numeric(df[col]):
                    df[col] = df[col].interpolate()
        elif strategy == MissingValueStrategy.CONSTANT:
            if fill_value is None:
                raise ValueError("fill_value is required for the constant strategy")
            df[cols_to_handle] = df[cols_to_handle].fillna(fill_value)

        final_missing = df.isnull().sum().sum()

        return {
            "data": df,
            "original_missing": int(original_missing),
            "final_missing": int(final_missing),
            "missing_handled": int(original_missing - final_missing),
            "strategy": strategy.value,
        }

    except Exception as e:
        self.logger.error(f"Error handling missing values: {e}")
        raise TransformationError(f"Failed to handle missing values: {e}") from e
|
|
350
|
+
|
|
351
|
+
def encode_features(
    self,
    data: Union[Dict[str, Any], List[Dict[str, Any]], pd.DataFrame],
    columns: List[str],
    method: str = "one_hot",
) -> Dict[str, Any]:
    """
    Turn categorical columns into numeric representations.

    Args:
        data: Data to encode
        columns: Columns to encode
        method: Encoding method (one_hot or label)

    Returns:
        Dict containing:
            - data: Encoded DataFrame
            - encoding_info: Details of the encoding that was applied
            - original_shape: Shape before encoding
            - new_shape: Shape after encoding

    Raises:
        TransformationError: If the method is unsupported or encoding fails
    """
    try:
        source = self._to_dataframe(data)

        if method not in ("one_hot", "label"):
            raise TransformationError(f"Unsupported encoding method: {method}")

        if method == "one_hot":
            # Each listed column is replaced by indicator columns
            encoded = pd.get_dummies(source, columns=columns, prefix=columns)
            existing = source.columns
            added = [name for name in encoded.columns if name not in existing]
            info = {
                "method": "one_hot",
                "original_columns": columns,
                "new_columns": added,
            }
        else:
            # Values are cast to str and mapped to integer codes; the
            # fitted encoder for each column is returned to the caller
            encoded = source.copy()
            fitted = {}
            for name in columns:
                encoder = LabelEncoder()
                encoded[name] = encoder.fit_transform(source[name].astype(str))
                fitted[name] = encoder
            info = {
                "method": "label",
                "columns": columns,
                "encoders": fitted,
            }

        return {
            "data": encoded,
            "encoding_info": info,
            "original_shape": source.shape,
            "new_shape": encoded.shape,
        }

    except Exception as exc:
        self.logger.error(f"Error encoding features: {exc}")
        raise TransformationError(f"Feature encoding failed: {exc}")
|
|
405
|
+
|
|
406
|
+
# Internal helper methods
|
|
407
|
+
|
|
408
|
+
def _to_dataframe(self, data: Union[Dict, List, pd.DataFrame]) -> pd.DataFrame:
|
|
409
|
+
"""Convert data to DataFrame"""
|
|
410
|
+
if isinstance(data, pd.DataFrame):
|
|
411
|
+
return data
|
|
412
|
+
elif isinstance(data, list):
|
|
413
|
+
return pd.DataFrame(data)
|
|
414
|
+
elif isinstance(data, dict):
|
|
415
|
+
return pd.DataFrame([data])
|
|
416
|
+
else:
|
|
417
|
+
raise TransformationError(f"Unsupported data type: {type(data)}")
|
|
418
|
+
|
|
419
|
+
def _apply_single_transformation(
    self,
    df: pd.DataFrame,
    trans_type: str,
    columns: Optional[List[str]],
    params: Dict[str, Any],
) -> pd.DataFrame:
    """Apply a single transformation step to a DataFrame.

    Args:
        df: Frame to transform. Several branches assign into it and return
            the same object, so callers that need the input preserved
            should pass a copy.
        trans_type: ``TransformationType`` value naming the step.
        columns: Columns to act on. ``None`` or an empty list selects a
            per-step default (all columns, numeric columns or object
            columns depending on the step).
        params: Step options; only ``"strategy"`` (used when filling
            missing values, default ``"mean"``) is read here.

    Returns:
        The transformed frame. An unrecognised ``trans_type`` is logged as
        a warning and the frame is returned unchanged.
    """
    if trans_type == TransformationType.REMOVE_DUPLICATES.value:
        return df.drop_duplicates()

    if trans_type == TransformationType.FILL_MISSING.value:
        strategy = params.get("strategy", "mean")
        for col in columns or df.columns:
            series = df[col]
            if not series.isnull().any():
                continue

            # Mean/median are only applied to these two dtypes; other
            # columns are left untouched for those strategies.
            is_numeric = series.dtype in ["int64", "float64"]
            if strategy == "mean" and is_numeric:
                fill_value = series.mean()
            elif strategy == "median" and is_numeric:
                fill_value = series.median()
            elif strategy == "mode":
                modes = series.mode()
                if modes.empty:
                    continue
                fill_value = modes.iloc[0]
            else:
                continue

            # Assign the filled column back instead of calling
            # fillna(inplace=True) on the selection: the in-place form is a
            # chained update that never reaches `df` under pandas
            # Copy-on-Write and is deprecated in pandas 2.x.
            df[col] = series.fillna(fill_value)
        return df

    if trans_type == TransformationType.REMOVE_OUTLIERS.value:
        for col in columns or df.select_dtypes(include=[np.number]).columns:
            spread = df[col].std()
            # Constant (or single-row) columns have no usable spread.
            if spread > 0:
                z_scores = np.abs((df[col] - df[col].mean()) / spread)
                # NOTE(review): a missing value yields a NaN z-score, which
                # compares False, so such rows are dropped as well - confirm
                # that this is intended.
                df = df[z_scores < self.config.outlier_std_threshold]
        return df

    if trans_type == TransformationType.STANDARDIZE.value:
        cols = columns or df.select_dtypes(include=[np.number]).columns.tolist()
        # Nothing to scale: behave like the other steps and return the
        # frame untouched rather than fitting a scaler on zero features.
        if cols:
            df[cols] = StandardScaler().fit_transform(df[cols])
        return df

    if trans_type == TransformationType.NORMALIZE.value:
        cols = columns or df.select_dtypes(include=[np.number]).columns.tolist()
        if cols:
            df[cols] = MinMaxScaler().fit_transform(df[cols])
        return df

    if trans_type == TransformationType.LOG_TRANSFORM.value:
        cols = columns or df.select_dtypes(include=[np.number]).columns.tolist()
        for col in cols:
            # The log is only taken when every value is strictly positive;
            # other columns are skipped silently.
            if (df[col] > 0).all():
                df[col] = np.log(df[col])
        return df

    if trans_type == TransformationType.ONE_HOT_ENCODE.value:
        cols = columns or df.select_dtypes(include=["object"]).columns.tolist()
        return pd.get_dummies(df, columns=cols)

    if trans_type == TransformationType.LABEL_ENCODE.value:
        cols = columns or df.select_dtypes(include=["object"]).columns.tolist()
        for col in cols:
            encoder = LabelEncoder()
            # Values are stringified first so mixed-type columns can be fitted.
            df[col] = encoder.fit_transform(df[col].astype(str))
        return df

    self.logger.warning(f"Transformation type {trans_type} not implemented, skipping")
    return df
|
|
489
|
+
|
|
490
|
+
def _determine_transformations(
|
|
491
|
+
self,
|
|
492
|
+
df: pd.DataFrame,
|
|
493
|
+
target_column: Optional[str],
|
|
494
|
+
task_type: Optional[str],
|
|
495
|
+
) -> List[Dict[str, Any]]:
|
|
496
|
+
"""Determine transformations needed for data"""
|
|
497
|
+
transformations = []
|
|
498
|
+
|
|
499
|
+
# Remove duplicates if present
|
|
500
|
+
if df.duplicated().sum() > 0:
|
|
501
|
+
transformations.append(
|
|
502
|
+
{
|
|
503
|
+
"type": TransformationType.REMOVE_DUPLICATES.value,
|
|
504
|
+
"columns": None,
|
|
505
|
+
"params": {},
|
|
506
|
+
}
|
|
507
|
+
)
|
|
508
|
+
|
|
509
|
+
# Handle missing values
|
|
510
|
+
if df.isnull().sum().sum() > 0:
|
|
511
|
+
transformations.append(
|
|
512
|
+
{
|
|
513
|
+
"type": TransformationType.FILL_MISSING.value,
|
|
514
|
+
"columns": None,
|
|
515
|
+
"params": {"strategy": "mean"},
|
|
516
|
+
}
|
|
517
|
+
)
|
|
518
|
+
|
|
519
|
+
# Encode categorical variables
|
|
520
|
+
categorical_cols = df.select_dtypes(include=["object"]).columns.tolist()
|
|
521
|
+
if target_column and target_column in categorical_cols:
|
|
522
|
+
categorical_cols.remove(target_column)
|
|
523
|
+
|
|
524
|
+
if len(categorical_cols) > 0:
|
|
525
|
+
# Use label encoding if too many categories, otherwise one-hot
|
|
526
|
+
for col in categorical_cols:
|
|
527
|
+
if df[col].nunique() > self.config.max_one_hot_categories:
|
|
528
|
+
transformations.append(
|
|
529
|
+
{
|
|
530
|
+
"type": TransformationType.LABEL_ENCODE.value,
|
|
531
|
+
"columns": [col],
|
|
532
|
+
"params": {},
|
|
533
|
+
}
|
|
534
|
+
)
|
|
535
|
+
else:
|
|
536
|
+
transformations.append(
|
|
537
|
+
{
|
|
538
|
+
"type": TransformationType.ONE_HOT_ENCODE.value,
|
|
539
|
+
"columns": [col],
|
|
540
|
+
"params": {},
|
|
541
|
+
}
|
|
542
|
+
)
|
|
543
|
+
|
|
544
|
+
# Standardize numeric features
|
|
545
|
+
numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
|
|
546
|
+
if target_column and target_column in numeric_cols:
|
|
547
|
+
numeric_cols.remove(target_column)
|
|
548
|
+
|
|
549
|
+
if len(numeric_cols) > 0:
|
|
550
|
+
transformations.append(
|
|
551
|
+
{
|
|
552
|
+
"type": TransformationType.STANDARDIZE.value,
|
|
553
|
+
"columns": numeric_cols,
|
|
554
|
+
"params": {},
|
|
555
|
+
}
|
|
556
|
+
)
|
|
557
|
+
|
|
558
|
+
return transformations
|
|
559
|
+
|
|
560
|
+
def _calculate_quality_improvement(
|
|
561
|
+
self, original_df: pd.DataFrame, transformed_df: pd.DataFrame
|
|
562
|
+
) -> Dict[str, Any]:
|
|
563
|
+
"""Calculate quality improvement metrics"""
|
|
564
|
+
return {
|
|
565
|
+
"missing_before": int(original_df.isnull().sum().sum()),
|
|
566
|
+
"missing_after": int(transformed_df.isnull().sum().sum()),
|
|
567
|
+
"duplicates_before": int(original_df.duplicated().sum()),
|
|
568
|
+
"duplicates_after": int(transformed_df.duplicated().sum()),
|
|
569
|
+
"rows_before": len(original_df),
|
|
570
|
+
"rows_after": len(transformed_df),
|
|
571
|
+
"columns_before": len(original_df.columns),
|
|
572
|
+
"columns_after": len(transformed_df.columns),
|
|
573
|
+
}
|