aiecs 1.5.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- aiecs/__init__.py +72 -0
- aiecs/__main__.py +41 -0
- aiecs/aiecs_client.py +469 -0
- aiecs/application/__init__.py +10 -0
- aiecs/application/executors/__init__.py +10 -0
- aiecs/application/executors/operation_executor.py +363 -0
- aiecs/application/knowledge_graph/__init__.py +7 -0
- aiecs/application/knowledge_graph/builder/__init__.py +37 -0
- aiecs/application/knowledge_graph/builder/document_builder.py +375 -0
- aiecs/application/knowledge_graph/builder/graph_builder.py +356 -0
- aiecs/application/knowledge_graph/builder/schema_mapping.py +531 -0
- aiecs/application/knowledge_graph/builder/structured_pipeline.py +443 -0
- aiecs/application/knowledge_graph/builder/text_chunker.py +319 -0
- aiecs/application/knowledge_graph/extractors/__init__.py +27 -0
- aiecs/application/knowledge_graph/extractors/base.py +100 -0
- aiecs/application/knowledge_graph/extractors/llm_entity_extractor.py +327 -0
- aiecs/application/knowledge_graph/extractors/llm_relation_extractor.py +349 -0
- aiecs/application/knowledge_graph/extractors/ner_entity_extractor.py +244 -0
- aiecs/application/knowledge_graph/fusion/__init__.py +23 -0
- aiecs/application/knowledge_graph/fusion/entity_deduplicator.py +387 -0
- aiecs/application/knowledge_graph/fusion/entity_linker.py +343 -0
- aiecs/application/knowledge_graph/fusion/knowledge_fusion.py +580 -0
- aiecs/application/knowledge_graph/fusion/relation_deduplicator.py +189 -0
- aiecs/application/knowledge_graph/pattern_matching/__init__.py +21 -0
- aiecs/application/knowledge_graph/pattern_matching/pattern_matcher.py +344 -0
- aiecs/application/knowledge_graph/pattern_matching/query_executor.py +378 -0
- aiecs/application/knowledge_graph/profiling/__init__.py +12 -0
- aiecs/application/knowledge_graph/profiling/query_plan_visualizer.py +199 -0
- aiecs/application/knowledge_graph/profiling/query_profiler.py +223 -0
- aiecs/application/knowledge_graph/reasoning/__init__.py +27 -0
- aiecs/application/knowledge_graph/reasoning/evidence_synthesis.py +347 -0
- aiecs/application/knowledge_graph/reasoning/inference_engine.py +504 -0
- aiecs/application/knowledge_graph/reasoning/logic_form_parser.py +167 -0
- aiecs/application/knowledge_graph/reasoning/logic_parser/__init__.py +79 -0
- aiecs/application/knowledge_graph/reasoning/logic_parser/ast_builder.py +513 -0
- aiecs/application/knowledge_graph/reasoning/logic_parser/ast_nodes.py +630 -0
- aiecs/application/knowledge_graph/reasoning/logic_parser/ast_validator.py +654 -0
- aiecs/application/knowledge_graph/reasoning/logic_parser/error_handler.py +477 -0
- aiecs/application/knowledge_graph/reasoning/logic_parser/parser.py +390 -0
- aiecs/application/knowledge_graph/reasoning/logic_parser/query_context.py +217 -0
- aiecs/application/knowledge_graph/reasoning/logic_query_integration.py +169 -0
- aiecs/application/knowledge_graph/reasoning/query_planner.py +872 -0
- aiecs/application/knowledge_graph/reasoning/reasoning_engine.py +554 -0
- aiecs/application/knowledge_graph/retrieval/__init__.py +19 -0
- aiecs/application/knowledge_graph/retrieval/retrieval_strategies.py +596 -0
- aiecs/application/knowledge_graph/search/__init__.py +59 -0
- aiecs/application/knowledge_graph/search/hybrid_search.py +423 -0
- aiecs/application/knowledge_graph/search/reranker.py +295 -0
- aiecs/application/knowledge_graph/search/reranker_strategies.py +553 -0
- aiecs/application/knowledge_graph/search/text_similarity.py +398 -0
- aiecs/application/knowledge_graph/traversal/__init__.py +15 -0
- aiecs/application/knowledge_graph/traversal/enhanced_traversal.py +329 -0
- aiecs/application/knowledge_graph/traversal/path_scorer.py +269 -0
- aiecs/application/knowledge_graph/validators/__init__.py +13 -0
- aiecs/application/knowledge_graph/validators/relation_validator.py +189 -0
- aiecs/application/knowledge_graph/visualization/__init__.py +11 -0
- aiecs/application/knowledge_graph/visualization/graph_visualizer.py +321 -0
- aiecs/common/__init__.py +9 -0
- aiecs/common/knowledge_graph/__init__.py +17 -0
- aiecs/common/knowledge_graph/runnable.py +484 -0
- aiecs/config/__init__.py +16 -0
- aiecs/config/config.py +498 -0
- aiecs/config/graph_config.py +137 -0
- aiecs/config/registry.py +23 -0
- aiecs/core/__init__.py +46 -0
- aiecs/core/interface/__init__.py +34 -0
- aiecs/core/interface/execution_interface.py +152 -0
- aiecs/core/interface/storage_interface.py +171 -0
- aiecs/domain/__init__.py +289 -0
- aiecs/domain/agent/__init__.py +189 -0
- aiecs/domain/agent/base_agent.py +697 -0
- aiecs/domain/agent/exceptions.py +103 -0
- aiecs/domain/agent/graph_aware_mixin.py +559 -0
- aiecs/domain/agent/hybrid_agent.py +490 -0
- aiecs/domain/agent/integration/__init__.py +26 -0
- aiecs/domain/agent/integration/context_compressor.py +222 -0
- aiecs/domain/agent/integration/context_engine_adapter.py +252 -0
- aiecs/domain/agent/integration/retry_policy.py +219 -0
- aiecs/domain/agent/integration/role_config.py +213 -0
- aiecs/domain/agent/knowledge_aware_agent.py +646 -0
- aiecs/domain/agent/lifecycle.py +296 -0
- aiecs/domain/agent/llm_agent.py +300 -0
- aiecs/domain/agent/memory/__init__.py +12 -0
- aiecs/domain/agent/memory/conversation.py +197 -0
- aiecs/domain/agent/migration/__init__.py +14 -0
- aiecs/domain/agent/migration/conversion.py +160 -0
- aiecs/domain/agent/migration/legacy_wrapper.py +90 -0
- aiecs/domain/agent/models.py +317 -0
- aiecs/domain/agent/observability.py +407 -0
- aiecs/domain/agent/persistence.py +289 -0
- aiecs/domain/agent/prompts/__init__.py +29 -0
- aiecs/domain/agent/prompts/builder.py +161 -0
- aiecs/domain/agent/prompts/formatters.py +189 -0
- aiecs/domain/agent/prompts/template.py +255 -0
- aiecs/domain/agent/registry.py +260 -0
- aiecs/domain/agent/tool_agent.py +257 -0
- aiecs/domain/agent/tools/__init__.py +12 -0
- aiecs/domain/agent/tools/schema_generator.py +221 -0
- aiecs/domain/community/__init__.py +155 -0
- aiecs/domain/community/agent_adapter.py +477 -0
- aiecs/domain/community/analytics.py +481 -0
- aiecs/domain/community/collaborative_workflow.py +642 -0
- aiecs/domain/community/communication_hub.py +645 -0
- aiecs/domain/community/community_builder.py +320 -0
- aiecs/domain/community/community_integration.py +800 -0
- aiecs/domain/community/community_manager.py +813 -0
- aiecs/domain/community/decision_engine.py +879 -0
- aiecs/domain/community/exceptions.py +225 -0
- aiecs/domain/community/models/__init__.py +33 -0
- aiecs/domain/community/models/community_models.py +268 -0
- aiecs/domain/community/resource_manager.py +457 -0
- aiecs/domain/community/shared_context_manager.py +603 -0
- aiecs/domain/context/__init__.py +58 -0
- aiecs/domain/context/context_engine.py +989 -0
- aiecs/domain/context/conversation_models.py +354 -0
- aiecs/domain/context/graph_memory.py +467 -0
- aiecs/domain/execution/__init__.py +12 -0
- aiecs/domain/execution/model.py +57 -0
- aiecs/domain/knowledge_graph/__init__.py +19 -0
- aiecs/domain/knowledge_graph/models/__init__.py +52 -0
- aiecs/domain/knowledge_graph/models/entity.py +130 -0
- aiecs/domain/knowledge_graph/models/evidence.py +194 -0
- aiecs/domain/knowledge_graph/models/inference_rule.py +186 -0
- aiecs/domain/knowledge_graph/models/path.py +179 -0
- aiecs/domain/knowledge_graph/models/path_pattern.py +173 -0
- aiecs/domain/knowledge_graph/models/query.py +272 -0
- aiecs/domain/knowledge_graph/models/query_plan.py +187 -0
- aiecs/domain/knowledge_graph/models/relation.py +136 -0
- aiecs/domain/knowledge_graph/schema/__init__.py +23 -0
- aiecs/domain/knowledge_graph/schema/entity_type.py +135 -0
- aiecs/domain/knowledge_graph/schema/graph_schema.py +271 -0
- aiecs/domain/knowledge_graph/schema/property_schema.py +155 -0
- aiecs/domain/knowledge_graph/schema/relation_type.py +171 -0
- aiecs/domain/knowledge_graph/schema/schema_manager.py +496 -0
- aiecs/domain/knowledge_graph/schema/type_enums.py +205 -0
- aiecs/domain/task/__init__.py +13 -0
- aiecs/domain/task/dsl_processor.py +613 -0
- aiecs/domain/task/model.py +62 -0
- aiecs/domain/task/task_context.py +268 -0
- aiecs/infrastructure/__init__.py +24 -0
- aiecs/infrastructure/graph_storage/__init__.py +11 -0
- aiecs/infrastructure/graph_storage/base.py +601 -0
- aiecs/infrastructure/graph_storage/batch_operations.py +449 -0
- aiecs/infrastructure/graph_storage/cache.py +429 -0
- aiecs/infrastructure/graph_storage/distributed.py +226 -0
- aiecs/infrastructure/graph_storage/error_handling.py +390 -0
- aiecs/infrastructure/graph_storage/graceful_degradation.py +306 -0
- aiecs/infrastructure/graph_storage/health_checks.py +378 -0
- aiecs/infrastructure/graph_storage/in_memory.py +514 -0
- aiecs/infrastructure/graph_storage/index_optimization.py +483 -0
- aiecs/infrastructure/graph_storage/lazy_loading.py +410 -0
- aiecs/infrastructure/graph_storage/metrics.py +357 -0
- aiecs/infrastructure/graph_storage/migration.py +413 -0
- aiecs/infrastructure/graph_storage/pagination.py +471 -0
- aiecs/infrastructure/graph_storage/performance_monitoring.py +466 -0
- aiecs/infrastructure/graph_storage/postgres.py +871 -0
- aiecs/infrastructure/graph_storage/query_optimizer.py +635 -0
- aiecs/infrastructure/graph_storage/schema_cache.py +290 -0
- aiecs/infrastructure/graph_storage/sqlite.py +623 -0
- aiecs/infrastructure/graph_storage/streaming.py +495 -0
- aiecs/infrastructure/messaging/__init__.py +13 -0
- aiecs/infrastructure/messaging/celery_task_manager.py +383 -0
- aiecs/infrastructure/messaging/websocket_manager.py +298 -0
- aiecs/infrastructure/monitoring/__init__.py +34 -0
- aiecs/infrastructure/monitoring/executor_metrics.py +174 -0
- aiecs/infrastructure/monitoring/global_metrics_manager.py +213 -0
- aiecs/infrastructure/monitoring/structured_logger.py +48 -0
- aiecs/infrastructure/monitoring/tracing_manager.py +410 -0
- aiecs/infrastructure/persistence/__init__.py +24 -0
- aiecs/infrastructure/persistence/context_engine_client.py +187 -0
- aiecs/infrastructure/persistence/database_manager.py +333 -0
- aiecs/infrastructure/persistence/file_storage.py +754 -0
- aiecs/infrastructure/persistence/redis_client.py +220 -0
- aiecs/llm/__init__.py +86 -0
- aiecs/llm/callbacks/__init__.py +11 -0
- aiecs/llm/callbacks/custom_callbacks.py +264 -0
- aiecs/llm/client_factory.py +420 -0
- aiecs/llm/clients/__init__.py +33 -0
- aiecs/llm/clients/base_client.py +193 -0
- aiecs/llm/clients/googleai_client.py +181 -0
- aiecs/llm/clients/openai_client.py +131 -0
- aiecs/llm/clients/vertex_client.py +437 -0
- aiecs/llm/clients/xai_client.py +184 -0
- aiecs/llm/config/__init__.py +51 -0
- aiecs/llm/config/config_loader.py +275 -0
- aiecs/llm/config/config_validator.py +236 -0
- aiecs/llm/config/model_config.py +151 -0
- aiecs/llm/utils/__init__.py +10 -0
- aiecs/llm/utils/validate_config.py +91 -0
- aiecs/main.py +363 -0
- aiecs/scripts/__init__.py +3 -0
- aiecs/scripts/aid/VERSION_MANAGEMENT.md +97 -0
- aiecs/scripts/aid/__init__.py +19 -0
- aiecs/scripts/aid/version_manager.py +215 -0
- aiecs/scripts/dependance_check/DEPENDENCY_SYSTEM_SUMMARY.md +242 -0
- aiecs/scripts/dependance_check/README_DEPENDENCY_CHECKER.md +310 -0
- aiecs/scripts/dependance_check/__init__.py +17 -0
- aiecs/scripts/dependance_check/dependency_checker.py +938 -0
- aiecs/scripts/dependance_check/dependency_fixer.py +391 -0
- aiecs/scripts/dependance_check/download_nlp_data.py +396 -0
- aiecs/scripts/dependance_check/quick_dependency_check.py +270 -0
- aiecs/scripts/dependance_check/setup_nlp_data.sh +217 -0
- aiecs/scripts/dependance_patch/__init__.py +7 -0
- aiecs/scripts/dependance_patch/fix_weasel/README_WEASEL_PATCH.md +126 -0
- aiecs/scripts/dependance_patch/fix_weasel/__init__.py +11 -0
- aiecs/scripts/dependance_patch/fix_weasel/fix_weasel_validator.py +128 -0
- aiecs/scripts/dependance_patch/fix_weasel/fix_weasel_validator.sh +82 -0
- aiecs/scripts/dependance_patch/fix_weasel/patch_weasel_library.sh +188 -0
- aiecs/scripts/dependance_patch/fix_weasel/run_weasel_patch.sh +41 -0
- aiecs/scripts/tools_develop/README.md +449 -0
- aiecs/scripts/tools_develop/TOOL_AUTO_DISCOVERY.md +234 -0
- aiecs/scripts/tools_develop/__init__.py +21 -0
- aiecs/scripts/tools_develop/check_type_annotations.py +259 -0
- aiecs/scripts/tools_develop/validate_tool_schemas.py +422 -0
- aiecs/scripts/tools_develop/verify_tools.py +356 -0
- aiecs/tasks/__init__.py +1 -0
- aiecs/tasks/worker.py +172 -0
- aiecs/tools/__init__.py +299 -0
- aiecs/tools/apisource/__init__.py +99 -0
- aiecs/tools/apisource/intelligence/__init__.py +19 -0
- aiecs/tools/apisource/intelligence/data_fusion.py +381 -0
- aiecs/tools/apisource/intelligence/query_analyzer.py +413 -0
- aiecs/tools/apisource/intelligence/search_enhancer.py +388 -0
- aiecs/tools/apisource/monitoring/__init__.py +9 -0
- aiecs/tools/apisource/monitoring/metrics.py +303 -0
- aiecs/tools/apisource/providers/__init__.py +115 -0
- aiecs/tools/apisource/providers/base.py +664 -0
- aiecs/tools/apisource/providers/census.py +401 -0
- aiecs/tools/apisource/providers/fred.py +564 -0
- aiecs/tools/apisource/providers/newsapi.py +412 -0
- aiecs/tools/apisource/providers/worldbank.py +357 -0
- aiecs/tools/apisource/reliability/__init__.py +12 -0
- aiecs/tools/apisource/reliability/error_handler.py +375 -0
- aiecs/tools/apisource/reliability/fallback_strategy.py +391 -0
- aiecs/tools/apisource/tool.py +850 -0
- aiecs/tools/apisource/utils/__init__.py +9 -0
- aiecs/tools/apisource/utils/validators.py +338 -0
- aiecs/tools/base_tool.py +201 -0
- aiecs/tools/docs/__init__.py +121 -0
- aiecs/tools/docs/ai_document_orchestrator.py +599 -0
- aiecs/tools/docs/ai_document_writer_orchestrator.py +2403 -0
- aiecs/tools/docs/content_insertion_tool.py +1333 -0
- aiecs/tools/docs/document_creator_tool.py +1317 -0
- aiecs/tools/docs/document_layout_tool.py +1166 -0
- aiecs/tools/docs/document_parser_tool.py +994 -0
- aiecs/tools/docs/document_writer_tool.py +1818 -0
- aiecs/tools/knowledge_graph/__init__.py +17 -0
- aiecs/tools/knowledge_graph/graph_reasoning_tool.py +734 -0
- aiecs/tools/knowledge_graph/graph_search_tool.py +923 -0
- aiecs/tools/knowledge_graph/kg_builder_tool.py +476 -0
- aiecs/tools/langchain_adapter.py +542 -0
- aiecs/tools/schema_generator.py +275 -0
- aiecs/tools/search_tool/__init__.py +100 -0
- aiecs/tools/search_tool/analyzers.py +589 -0
- aiecs/tools/search_tool/cache.py +260 -0
- aiecs/tools/search_tool/constants.py +128 -0
- aiecs/tools/search_tool/context.py +216 -0
- aiecs/tools/search_tool/core.py +749 -0
- aiecs/tools/search_tool/deduplicator.py +123 -0
- aiecs/tools/search_tool/error_handler.py +271 -0
- aiecs/tools/search_tool/metrics.py +371 -0
- aiecs/tools/search_tool/rate_limiter.py +178 -0
- aiecs/tools/search_tool/schemas.py +277 -0
- aiecs/tools/statistics/__init__.py +80 -0
- aiecs/tools/statistics/ai_data_analysis_orchestrator.py +643 -0
- aiecs/tools/statistics/ai_insight_generator_tool.py +505 -0
- aiecs/tools/statistics/ai_report_orchestrator_tool.py +694 -0
- aiecs/tools/statistics/data_loader_tool.py +564 -0
- aiecs/tools/statistics/data_profiler_tool.py +658 -0
- aiecs/tools/statistics/data_transformer_tool.py +573 -0
- aiecs/tools/statistics/data_visualizer_tool.py +495 -0
- aiecs/tools/statistics/model_trainer_tool.py +487 -0
- aiecs/tools/statistics/statistical_analyzer_tool.py +459 -0
- aiecs/tools/task_tools/__init__.py +86 -0
- aiecs/tools/task_tools/chart_tool.py +732 -0
- aiecs/tools/task_tools/classfire_tool.py +922 -0
- aiecs/tools/task_tools/image_tool.py +447 -0
- aiecs/tools/task_tools/office_tool.py +684 -0
- aiecs/tools/task_tools/pandas_tool.py +635 -0
- aiecs/tools/task_tools/report_tool.py +635 -0
- aiecs/tools/task_tools/research_tool.py +392 -0
- aiecs/tools/task_tools/scraper_tool.py +715 -0
- aiecs/tools/task_tools/stats_tool.py +688 -0
- aiecs/tools/temp_file_manager.py +130 -0
- aiecs/tools/tool_executor/__init__.py +37 -0
- aiecs/tools/tool_executor/tool_executor.py +881 -0
- aiecs/utils/LLM_output_structor.py +445 -0
- aiecs/utils/__init__.py +34 -0
- aiecs/utils/base_callback.py +47 -0
- aiecs/utils/cache_provider.py +695 -0
- aiecs/utils/execution_utils.py +184 -0
- aiecs/utils/logging.py +1 -0
- aiecs/utils/prompt_loader.py +14 -0
- aiecs/utils/token_usage_repository.py +323 -0
- aiecs/ws/__init__.py +0 -0
- aiecs/ws/socket_server.py +52 -0
- aiecs-1.5.1.dist-info/METADATA +608 -0
- aiecs-1.5.1.dist-info/RECORD +302 -0
- aiecs-1.5.1.dist-info/WHEEL +5 -0
- aiecs-1.5.1.dist-info/entry_points.txt +10 -0
- aiecs-1.5.1.dist-info/licenses/LICENSE +225 -0
- aiecs-1.5.1.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,564 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Data Loader Tool - Universal data loading from multiple file formats
|
|
3
|
+
|
|
4
|
+
This tool provides comprehensive data loading capabilities with:
|
|
5
|
+
- Auto-detection of file formats
|
|
6
|
+
- Multiple loading strategies (full, streaming, chunked, lazy)
|
|
7
|
+
- Data quality validation on load
|
|
8
|
+
- Schema inference and validation
|
|
9
|
+
- Support for CSV, Excel, JSON, Parquet, and other formats
|
|
10
|
+
"""
|
|
11
|
+
|
|
12
|
+
import os
|
|
13
|
+
import logging
|
|
14
|
+
from typing import Dict, Any, List, Optional, Union, Iterator
|
|
15
|
+
from enum import Enum
|
|
16
|
+
from pathlib import Path
|
|
17
|
+
|
|
18
|
+
import pandas as pd
|
|
19
|
+
from pydantic import BaseModel, Field, ConfigDict
|
|
20
|
+
|
|
21
|
+
from aiecs.tools.base_tool import BaseTool
|
|
22
|
+
from aiecs.tools import register_tool
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
class DataSourceType(str, Enum):
    """File formats the loader understands.

    The string values double as format identifiers in result payloads;
    AUTO asks the loader to infer the real format from the file extension.
    """

    CSV = "csv"
    EXCEL = "excel"
    JSON = "json"
    PARQUET = "parquet"
    FEATHER = "feather"
    HDF5 = "hdf5"
    STATA = "stata"
    SAS = "sas"
    SPSS = "spss"
    AUTO = "auto"  # sentinel: detect the concrete format at load time
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
class LoadStrategy(str, Enum):
    """How a data source should be materialised in memory.

    FULL_LOAD reads everything at once; STREAMING and CHUNKED read in
    pieces; LAZY defers reading; INCREMENTAL loads only new data.
    """

    FULL_LOAD = "full_load"
    STREAMING = "streaming"
    CHUNKED = "chunked"
    LAZY = "lazy"
    INCREMENTAL = "incremental"
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
class DataLoaderError(Exception):
    """Root of the data-loader exception hierarchy."""
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
class FileFormatError(DataLoaderError):
    """Signals an unsupported or otherwise invalid file format."""
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
class SchemaValidationError(DataLoaderError):
    """Signals that loaded data does not match the expected schema."""
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
class DataQualityError(DataLoaderError):
    """Signals that quality checks found problems in the loaded data."""
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
@register_tool("data_loader")
|
|
67
|
+
class DataLoaderTool(BaseTool):
|
|
68
|
+
"""
|
|
69
|
+
Universal data loading tool that can:
|
|
70
|
+
1. Load data from multiple file formats
|
|
71
|
+
2. Auto-detect data formats and schemas
|
|
72
|
+
3. Handle large datasets with streaming
|
|
73
|
+
4. Validate data quality on load
|
|
74
|
+
|
|
75
|
+
Integrates with pandas_tool for core data operations.
|
|
76
|
+
"""
|
|
77
|
+
|
|
78
|
+
# Configuration schema
|
|
79
|
+
    class Config(BaseModel):
        """Configuration for the data loader tool"""

        # NOTE(review): env_prefix is a pydantic-settings (BaseSettings) option;
        # on a plain BaseModel ConfigDict it is not a recognised key — confirm
        # whether environment-variable loading is actually expected here.
        model_config = ConfigDict(env_prefix="DATA_LOADER_")

        # Advisory limit: load_data only logs a warning when exceeded.
        max_file_size_mb: int = Field(default=500, description="Maximum file size in megabytes")
        # Used by chunked/streaming strategies when no chunk_size is passed.
        default_chunk_size: int = Field(
            default=10000, description="Default chunk size for chunked loading"
        )
        max_memory_usage_mb: int = Field(
            default=2000, description="Maximum memory usage in megabytes"
        )
        # NOTE(review): load_data gates *schema validation* on this inference
        # flag — confirm that coupling is intended.
        enable_schema_inference: bool = Field(
            default=True,
            description="Whether to enable automatic schema inference",
        )
        enable_quality_validation: bool = Field(
            default=True,
            description="Whether to enable data quality validation",
        )
        default_encoding: str = Field(
            default="utf-8",
            description="Default text encoding for file operations",
        )
|
|
103
|
+
|
|
104
|
+
def __init__(self, config: Optional[Dict[str, Any]] = None):
|
|
105
|
+
"""
|
|
106
|
+
Initialize DataLoaderTool with settings.
|
|
107
|
+
|
|
108
|
+
Args:
|
|
109
|
+
config: Optional configuration overrides
|
|
110
|
+
"""
|
|
111
|
+
super().__init__(config)
|
|
112
|
+
|
|
113
|
+
# Parse configuration
|
|
114
|
+
self.config = self.Config(**(config or {}))
|
|
115
|
+
|
|
116
|
+
self.logger = logging.getLogger(__name__)
|
|
117
|
+
if not self.logger.handlers:
|
|
118
|
+
handler = logging.StreamHandler()
|
|
119
|
+
handler.setFormatter(logging.Formatter("%(asctime)s %(levelname)s %(message)s"))
|
|
120
|
+
self.logger.addHandler(handler)
|
|
121
|
+
self.logger.setLevel(logging.INFO)
|
|
122
|
+
|
|
123
|
+
# Initialize external tools
|
|
124
|
+
self._init_external_tools()
|
|
125
|
+
|
|
126
|
+
def _init_external_tools(self):
|
|
127
|
+
"""Initialize external task tools"""
|
|
128
|
+
self.external_tools = {}
|
|
129
|
+
|
|
130
|
+
# Initialize PandasTool for data operations
|
|
131
|
+
try:
|
|
132
|
+
from aiecs.tools.task_tools.pandas_tool import PandasTool
|
|
133
|
+
|
|
134
|
+
self.external_tools["pandas"] = PandasTool()
|
|
135
|
+
self.logger.info("PandasTool initialized successfully")
|
|
136
|
+
except ImportError:
|
|
137
|
+
self.logger.warning("PandasTool not available")
|
|
138
|
+
self.external_tools["pandas"] = None
|
|
139
|
+
|
|
140
|
+
# Schema definitions
|
|
141
|
+
    class LoadDataSchema(BaseModel):
        """Schema for load_data operation"""

        # Path to the file to load; existence is checked by load_data itself.
        source: str = Field(description="Path to data source file")
        source_type: Optional[DataSourceType] = Field(
            default=DataSourceType.AUTO, description="Data source type"
        )
        strategy: LoadStrategy = Field(
            default=LoadStrategy.FULL_LOAD, description="Loading strategy"
        )
        # NOTE(review): field is named data_schema while load_data's parameter
        # is `schema` — presumably renamed to avoid clashing with
        # BaseModel.schema; confirm the mapping between the two.
        data_schema: Optional[Dict[str, Any]] = Field(
            default=None, description="Expected schema for validation"
        )
        validation_rules: Optional[Dict[str, Any]] = Field(
            default=None, description="Data quality validation rules"
        )
        # None means "load all rows".
        nrows: Optional[int] = Field(default=None, description="Number of rows to load")
        # Falls back to Config.default_chunk_size when None.
        chunk_size: Optional[int] = Field(
            default=None, description="Chunk size for chunked loading"
        )
        # Falls back to Config.default_encoding when None.
        encoding: Optional[str] = Field(default=None, description="File encoding")
|
|
162
|
+
|
|
163
|
+
    class DetectFormatSchema(BaseModel):
        """Schema for detect_format operation"""

        # Only the extension is inspected; the file need not exist.
        source: str = Field(description="Path to data source file")
|
|
167
|
+
|
|
168
|
+
    class ValidateSchemaSchema(BaseModel):
        """Schema for validate_schema operation"""

        # A single record (dict) or a list of records; converted to a
        # DataFrame before column comparison.
        data: Union[Dict[str, Any], List[Dict[str, Any]]] = Field(description="Data to validate")
        # Expected to contain a "columns" mapping keyed by column name.
        data_schema: Dict[str, Any] = Field(description="Expected schema")
|
|
173
|
+
|
|
174
|
+
    class StreamDataSchema(BaseModel):
        """Schema for stream_data operation"""

        source: str = Field(description="Path to data source file")
        # Number of rows per chunk yielded by the streaming iterator.
        chunk_size: int = Field(default=10000, description="Chunk size for streaming")
        # Streaming supports CSV and (line-delimited) JSON only.
        source_type: Optional[DataSourceType] = Field(
            default=DataSourceType.AUTO, description="Data source type"
        )
|
|
182
|
+
|
|
183
|
+
def load_data(
|
|
184
|
+
self,
|
|
185
|
+
source: str,
|
|
186
|
+
source_type: DataSourceType = DataSourceType.AUTO,
|
|
187
|
+
strategy: LoadStrategy = LoadStrategy.FULL_LOAD,
|
|
188
|
+
schema: Optional[Dict[str, Any]] = None,
|
|
189
|
+
validation_rules: Optional[Dict[str, Any]] = None,
|
|
190
|
+
nrows: Optional[int] = None,
|
|
191
|
+
chunk_size: Optional[int] = None,
|
|
192
|
+
encoding: Optional[str] = None,
|
|
193
|
+
) -> Dict[str, Any]:
|
|
194
|
+
"""
|
|
195
|
+
Load data from source with automatic format detection.
|
|
196
|
+
|
|
197
|
+
Args:
|
|
198
|
+
source: Path to data source file
|
|
199
|
+
source_type: Type of data source (auto-detected if not specified)
|
|
200
|
+
strategy: Loading strategy to use
|
|
201
|
+
schema: Expected schema for validation
|
|
202
|
+
validation_rules: Data quality validation rules
|
|
203
|
+
nrows: Number of rows to load (None for all)
|
|
204
|
+
chunk_size: Chunk size for chunked loading
|
|
205
|
+
encoding: File encoding
|
|
206
|
+
|
|
207
|
+
Returns:
|
|
208
|
+
Dict containing:
|
|
209
|
+
- data: Loaded DataFrame or data structure
|
|
210
|
+
- metadata: Metadata about loaded data
|
|
211
|
+
- quality_report: Quality assessment results
|
|
212
|
+
|
|
213
|
+
Raises:
|
|
214
|
+
DataLoaderError: If loading fails
|
|
215
|
+
FileFormatError: If format is unsupported
|
|
216
|
+
"""
|
|
217
|
+
try:
|
|
218
|
+
# Validate source exists
|
|
219
|
+
if not os.path.exists(source):
|
|
220
|
+
raise DataLoaderError(f"Source file not found: {source}")
|
|
221
|
+
|
|
222
|
+
# Detect format if auto
|
|
223
|
+
if source_type == DataSourceType.AUTO:
|
|
224
|
+
source_type = self._detect_format(source)
|
|
225
|
+
|
|
226
|
+
# Check file size
|
|
227
|
+
file_size_mb = os.path.getsize(source) / (1024 * 1024)
|
|
228
|
+
if file_size_mb > self.config.max_file_size_mb:
|
|
229
|
+
self.logger.warning(f"File size {file_size_mb:.2f}MB exceeds recommended limit")
|
|
230
|
+
|
|
231
|
+
# Load data based on strategy
|
|
232
|
+
if strategy == LoadStrategy.FULL_LOAD:
|
|
233
|
+
data = self._load_full(source, source_type, nrows, encoding)
|
|
234
|
+
elif strategy == LoadStrategy.CHUNKED:
|
|
235
|
+
data = self._load_chunked(
|
|
236
|
+
source,
|
|
237
|
+
source_type,
|
|
238
|
+
chunk_size or self.config.default_chunk_size,
|
|
239
|
+
encoding,
|
|
240
|
+
)
|
|
241
|
+
elif strategy == LoadStrategy.STREAMING:
|
|
242
|
+
data = self._load_streaming(
|
|
243
|
+
source,
|
|
244
|
+
source_type,
|
|
245
|
+
chunk_size or self.config.default_chunk_size,
|
|
246
|
+
encoding,
|
|
247
|
+
)
|
|
248
|
+
elif strategy == LoadStrategy.LAZY:
|
|
249
|
+
data = self._load_lazy(source, source_type, encoding)
|
|
250
|
+
else:
|
|
251
|
+
raise DataLoaderError(f"Unsupported loading strategy: {strategy}")
|
|
252
|
+
|
|
253
|
+
# Generate metadata
|
|
254
|
+
metadata = self._generate_metadata(data, source, source_type)
|
|
255
|
+
|
|
256
|
+
# Validate schema if provided
|
|
257
|
+
if schema and self.config.enable_schema_inference:
|
|
258
|
+
schema_valid = self._validate_schema_internal(data, schema)
|
|
259
|
+
metadata["schema_valid"] = schema_valid
|
|
260
|
+
|
|
261
|
+
# Validate quality if enabled
|
|
262
|
+
quality_report = {}
|
|
263
|
+
if self.config.enable_quality_validation and isinstance(data, pd.DataFrame):
|
|
264
|
+
quality_report = self._validate_quality(data, validation_rules)
|
|
265
|
+
|
|
266
|
+
self.logger.info(f"Successfully loaded data from {source}")
|
|
267
|
+
|
|
268
|
+
return {
|
|
269
|
+
"data": data,
|
|
270
|
+
"metadata": metadata,
|
|
271
|
+
"quality_report": quality_report,
|
|
272
|
+
"source": source,
|
|
273
|
+
"source_type": source_type.value,
|
|
274
|
+
"strategy": strategy.value,
|
|
275
|
+
}
|
|
276
|
+
|
|
277
|
+
except Exception as e:
|
|
278
|
+
self.logger.error(f"Error loading data from {source}: {e}")
|
|
279
|
+
raise DataLoaderError(f"Failed to load data: {e}")
|
|
280
|
+
|
|
281
|
+
def detect_format(self, source: str) -> Dict[str, Any]:
|
|
282
|
+
"""
|
|
283
|
+
Detect file format from source.
|
|
284
|
+
|
|
285
|
+
Args:
|
|
286
|
+
source: Path to data source file
|
|
287
|
+
|
|
288
|
+
Returns:
|
|
289
|
+
Dict containing detected format information
|
|
290
|
+
"""
|
|
291
|
+
try:
|
|
292
|
+
detected_type = self._detect_format(source)
|
|
293
|
+
|
|
294
|
+
return {
|
|
295
|
+
"source": source,
|
|
296
|
+
"detected_type": detected_type.value,
|
|
297
|
+
"file_extension": Path(source).suffix.lower(),
|
|
298
|
+
"confidence": "high",
|
|
299
|
+
}
|
|
300
|
+
except Exception as e:
|
|
301
|
+
self.logger.error(f"Error detecting format: {e}")
|
|
302
|
+
raise FileFormatError(f"Failed to detect format: {e}")
|
|
303
|
+
|
|
304
|
+
def validate_schema(
|
|
305
|
+
self,
|
|
306
|
+
data: Union[Dict[str, Any], List[Dict[str, Any]]],
|
|
307
|
+
schema: Dict[str, Any],
|
|
308
|
+
) -> Dict[str, Any]:
|
|
309
|
+
"""
|
|
310
|
+
Validate data against expected schema.
|
|
311
|
+
|
|
312
|
+
Args:
|
|
313
|
+
data: Data to validate
|
|
314
|
+
schema: Expected schema definition
|
|
315
|
+
|
|
316
|
+
Returns:
|
|
317
|
+
Dict containing validation results
|
|
318
|
+
"""
|
|
319
|
+
try:
|
|
320
|
+
# Convert to DataFrame if needed
|
|
321
|
+
if isinstance(data, list):
|
|
322
|
+
df = pd.DataFrame(data)
|
|
323
|
+
elif isinstance(data, dict):
|
|
324
|
+
df = pd.DataFrame([data])
|
|
325
|
+
else:
|
|
326
|
+
df = data
|
|
327
|
+
|
|
328
|
+
is_valid = self._validate_schema_internal(df, schema)
|
|
329
|
+
|
|
330
|
+
issues = []
|
|
331
|
+
if not is_valid:
|
|
332
|
+
# Check column presence
|
|
333
|
+
expected_columns = set(schema.get("columns", {}).keys())
|
|
334
|
+
actual_columns = set(df.columns)
|
|
335
|
+
missing = expected_columns - actual_columns
|
|
336
|
+
extra = actual_columns - expected_columns
|
|
337
|
+
|
|
338
|
+
if missing:
|
|
339
|
+
issues.append(f"Missing columns: {missing}")
|
|
340
|
+
if extra:
|
|
341
|
+
issues.append(f"Extra columns: {extra}")
|
|
342
|
+
|
|
343
|
+
return {
|
|
344
|
+
"valid": is_valid,
|
|
345
|
+
"issues": issues,
|
|
346
|
+
"expected_columns": list(schema.get("columns", {}).keys()),
|
|
347
|
+
"actual_columns": list(df.columns),
|
|
348
|
+
}
|
|
349
|
+
|
|
350
|
+
except Exception as e:
|
|
351
|
+
self.logger.error(f"Error validating schema: {e}")
|
|
352
|
+
raise SchemaValidationError(f"Schema validation failed: {e}")
|
|
353
|
+
|
|
354
|
+
def stream_data(
|
|
355
|
+
self,
|
|
356
|
+
source: str,
|
|
357
|
+
chunk_size: int = 10000,
|
|
358
|
+
source_type: DataSourceType = DataSourceType.AUTO,
|
|
359
|
+
) -> Dict[str, Any]:
|
|
360
|
+
"""
|
|
361
|
+
Stream data in chunks for large files.
|
|
362
|
+
|
|
363
|
+
Args:
|
|
364
|
+
source: Path to data source file
|
|
365
|
+
chunk_size: Size of each chunk
|
|
366
|
+
source_type: Type of data source
|
|
367
|
+
|
|
368
|
+
Returns:
|
|
369
|
+
Dict containing streaming iterator information
|
|
370
|
+
"""
|
|
371
|
+
try:
|
|
372
|
+
if source_type == DataSourceType.AUTO:
|
|
373
|
+
source_type = self._detect_format(source)
|
|
374
|
+
|
|
375
|
+
# Create iterator based on format
|
|
376
|
+
if source_type == DataSourceType.CSV:
|
|
377
|
+
iterator = pd.read_csv(source, chunksize=chunk_size)
|
|
378
|
+
elif source_type == DataSourceType.JSON:
|
|
379
|
+
iterator = pd.read_json(source, lines=True, chunksize=chunk_size)
|
|
380
|
+
else:
|
|
381
|
+
raise FileFormatError(f"Streaming not supported for format: {source_type}")
|
|
382
|
+
|
|
383
|
+
return {
|
|
384
|
+
"iterator": iterator,
|
|
385
|
+
"chunk_size": chunk_size,
|
|
386
|
+
"source_type": source_type.value,
|
|
387
|
+
"message": "Streaming iterator created successfully",
|
|
388
|
+
}
|
|
389
|
+
|
|
390
|
+
except Exception as e:
|
|
391
|
+
self.logger.error(f"Error creating stream: {e}")
|
|
392
|
+
raise DataLoaderError(f"Failed to create stream: {e}")
|
|
393
|
+
|
|
394
|
+
# Internal helper methods
|
|
395
|
+
|
|
396
|
+
def _detect_format(self, source: str) -> DataSourceType:
    """Detect the data source format from the file extension.

    Args:
        source: Path to the data source file.

    Returns:
        The matching DataSourceType for the file's extension.

    Raises:
        FileFormatError: If the extension is not recognized.
    """
    ext = Path(source).suffix.lower()

    format_map = {
        ".csv": DataSourceType.CSV,
        ".xlsx": DataSourceType.EXCEL,
        ".xls": DataSourceType.EXCEL,
        ".json": DataSourceType.JSON,
        ".parquet": DataSourceType.PARQUET,
        ".feather": DataSourceType.FEATHER,
        ".h5": DataSourceType.HDF5,
        ".hdf": DataSourceType.HDF5,
        ".hdf5": DataSourceType.HDF5,  # common alternate HDF5 extension
        ".dta": DataSourceType.STATA,
        ".sas7bdat": DataSourceType.SAS,
        ".sav": DataSourceType.SPSS,
    }

    detected = format_map.get(ext)
    # Explicit None check: enum members are truthy/falsy by implementation
    # detail, so do not rely on `not detected`.
    if detected is None:
        raise FileFormatError(f"Unsupported file format: {ext}")

    return detected
|
|
419
|
+
|
|
420
|
+
def _load_full(
    self,
    source: str,
    source_type: DataSourceType,
    nrows: Optional[int],
    encoding: Optional[str],
) -> pd.DataFrame:
    """Load an entire dataset into memory.

    Args:
        source: Path to the data source file.
        source_type: Resolved (non-AUTO) data source type.
        nrows: Optional row limit. Passed natively to readers that support
            it (CSV, Excel); for all other formats the data is loaded fully
            and truncated with ``head()`` so the limit behaves uniformly.
        encoding: Text encoding; falls back to the configured default.

    Returns:
        The loaded DataFrame, truncated to ``nrows`` rows when requested.

    Raises:
        DataLoaderError: If the optional ``pyreadstat`` dependency needed
            for SPSS files is not installed.
        FileFormatError: If the format does not support full loading.
    """
    encoding = encoding or self.config.default_encoding

    if source_type == DataSourceType.CSV:
        return pd.read_csv(source, nrows=nrows, encoding=encoding)
    elif source_type == DataSourceType.EXCEL:
        return pd.read_excel(source, nrows=nrows)
    elif source_type == DataSourceType.JSON:
        # pd.read_json only accepts nrows together with lines=True, so the
        # row limit is applied after loading instead.
        df = pd.read_json(source, encoding=encoding)
    elif source_type == DataSourceType.PARQUET:
        df = pd.read_parquet(source)
    elif source_type == DataSourceType.FEATHER:
        df = pd.read_feather(source)
    elif source_type == DataSourceType.HDF5:
        df = pd.read_hdf(source)
    elif source_type == DataSourceType.STATA:
        df = pd.read_stata(source)
    elif source_type == DataSourceType.SAS:
        df = pd.read_sas(source)
    elif source_type == DataSourceType.SPSS:
        # Keep the try body minimal: only the import can raise ImportError.
        try:
            import pyreadstat
        except ImportError:
            raise DataLoaderError("pyreadstat required for SPSS files")
        df, _meta = pyreadstat.read_sav(source)
    else:
        raise FileFormatError(f"Unsupported format for full load: {source_type}")

    # Honor nrows uniformly for readers without native support (previously
    # it was silently ignored for parquet/feather/HDF5/SAS/SPSS).
    return df.head(nrows) if nrows else df
|
|
459
|
+
|
|
460
|
+
def _load_chunked(
    self,
    source: str,
    source_type: DataSourceType,
    chunk_size: int,
    encoding: Optional[str],
) -> pd.DataFrame:
    """Load data in chunks and combine into a single DataFrame.

    Args:
        source: Path to the data source file.
        source_type: Resolved (non-AUTO) data source type.
        chunk_size: Number of rows per chunk.
        encoding: Text encoding; falls back to the configured default.

    Returns:
        Concatenated DataFrame (empty if the source yields no chunks).

    Raises:
        FileFormatError: If the format does not support chunked loading.
    """
    if source_type not in (DataSourceType.CSV, DataSourceType.JSON):
        raise FileFormatError(f"Chunked loading not supported for: {source_type}")

    # Reuse the streaming reader so the chunked and streaming paths share
    # one source of truth for per-format reader construction.
    chunks = list(self._load_streaming(source, source_type, chunk_size, encoding))

    if not chunks:
        # pd.concat raises ValueError on an empty list; return an empty
        # frame instead for an empty source.
        return pd.DataFrame()
    return pd.concat(chunks, ignore_index=True)
|
|
481
|
+
|
|
482
|
+
def _load_streaming(
    self,
    source: str,
    source_type: DataSourceType,
    chunk_size: int,
    encoding: Optional[str],
) -> Iterator[pd.DataFrame]:
    """Build a chunked iterator over the source without loading it fully."""
    enc = encoding or self.config.default_encoding

    # Guard-clause dispatch: only row-oriented text formats can stream.
    if source_type == DataSourceType.CSV:
        return pd.read_csv(source, chunksize=chunk_size, encoding=enc)
    if source_type == DataSourceType.JSON:
        return pd.read_json(source, lines=True, chunksize=chunk_size, encoding=enc)
    raise FileFormatError(f"Streaming not supported for: {source_type}")
|
|
498
|
+
|
|
499
|
+
def _load_lazy(self, source: str, source_type: DataSourceType, encoding: Optional[str]) -> Any:
    """Create lazy loading wrapper."""
    # True lazy semantics are not implemented yet: warn, then fall back to
    # an eager full load with no row limit.
    self.logger.warning("Lazy loading not fully implemented, using full load")
    eager = self._load_full(source, source_type, None, encoding)
    return eager
|
|
504
|
+
|
|
505
|
+
def _generate_metadata(
    self, data: Any, source: str, source_type: DataSourceType
) -> Dict[str, Any]:
    """Generate metadata about loaded data.

    Args:
        data: The loaded object; DataFrames get a detailed report,
            anything else only a type and file-size summary.
        source: Path to the originating file (used for on-disk size).
        source_type: Resolved data source type (currently unused here,
            kept for interface stability).

    Returns:
        Dict of metadata; for DataFrames: row/column counts, column names,
        dtypes, in-memory footprint, and on-disk file size (both in MB).
    """
    mb = 1024 * 1024
    # Stat the file once; the size is reported in both branches.
    file_size_mb = os.path.getsize(source) / mb

    if isinstance(data, pd.DataFrame):
        return {
            "rows": len(data),
            "columns": len(data.columns),
            "column_names": list(data.columns),
            "dtypes": {col: str(dtype) for col, dtype in data.dtypes.items()},
            "memory_usage_mb": data.memory_usage(deep=True).sum() / mb,
            "file_size_mb": file_size_mb,
        }
    return {
        "type": str(type(data)),
        "file_size_mb": file_size_mb,
    }
|
|
523
|
+
|
|
524
|
+
def _validate_schema_internal(self, data: pd.DataFrame, schema: Dict[str, Any]) -> bool:
|
|
525
|
+
"""Internal schema validation"""
|
|
526
|
+
if "columns" not in schema:
|
|
527
|
+
return True
|
|
528
|
+
|
|
529
|
+
expected_columns = set(schema["columns"].keys())
|
|
530
|
+
actual_columns = set(data.columns)
|
|
531
|
+
|
|
532
|
+
return expected_columns.issubset(actual_columns)
|
|
533
|
+
|
|
534
|
+
def _validate_quality(
|
|
535
|
+
self, data: pd.DataFrame, validation_rules: Optional[Dict[str, Any]]
|
|
536
|
+
) -> Dict[str, Any]:
|
|
537
|
+
"""Validate data quality"""
|
|
538
|
+
quality_report = {
|
|
539
|
+
"total_rows": len(data),
|
|
540
|
+
"total_columns": len(data.columns),
|
|
541
|
+
"missing_values": data.isnull().sum().to_dict(),
|
|
542
|
+
"duplicate_rows": data.duplicated().sum(),
|
|
543
|
+
"quality_score": 1.0,
|
|
544
|
+
}
|
|
545
|
+
|
|
546
|
+
# Calculate quality score
|
|
547
|
+
missing_ratio = (
|
|
548
|
+
data.isnull().sum().sum() / (len(data) * len(data.columns)) if len(data) > 0 else 0
|
|
549
|
+
)
|
|
550
|
+
duplicate_ratio = quality_report["duplicate_rows"] / len(data) if len(data) > 0 else 0
|
|
551
|
+
|
|
552
|
+
quality_score = 1.0 - (missing_ratio * 0.5 + duplicate_ratio * 0.5)
|
|
553
|
+
quality_report["quality_score"] = max(0.0, min(1.0, quality_score))
|
|
554
|
+
|
|
555
|
+
# Add issues list
|
|
556
|
+
issues = []
|
|
557
|
+
if missing_ratio > 0.1:
|
|
558
|
+
issues.append(f"High missing value ratio: {missing_ratio:.2%}")
|
|
559
|
+
if duplicate_ratio > 0.05:
|
|
560
|
+
issues.append(f"High duplicate ratio: {duplicate_ratio:.2%}")
|
|
561
|
+
|
|
562
|
+
quality_report["issues"] = issues
|
|
563
|
+
|
|
564
|
+
return quality_report
|