aiecs 1.0.1__py3-none-any.whl → 1.7.17__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: the registry has flagged this version of aiecs as possibly problematic.
- aiecs/__init__.py +13 -16
- aiecs/__main__.py +7 -7
- aiecs/aiecs_client.py +269 -75
- aiecs/application/executors/operation_executor.py +79 -54
- aiecs/application/knowledge_graph/__init__.py +7 -0
- aiecs/application/knowledge_graph/builder/__init__.py +37 -0
- aiecs/application/knowledge_graph/builder/data_quality.py +302 -0
- aiecs/application/knowledge_graph/builder/data_reshaping.py +293 -0
- aiecs/application/knowledge_graph/builder/document_builder.py +369 -0
- aiecs/application/knowledge_graph/builder/graph_builder.py +490 -0
- aiecs/application/knowledge_graph/builder/import_optimizer.py +396 -0
- aiecs/application/knowledge_graph/builder/schema_inference.py +462 -0
- aiecs/application/knowledge_graph/builder/schema_mapping.py +563 -0
- aiecs/application/knowledge_graph/builder/structured_pipeline.py +1384 -0
- aiecs/application/knowledge_graph/builder/text_chunker.py +317 -0
- aiecs/application/knowledge_graph/extractors/__init__.py +27 -0
- aiecs/application/knowledge_graph/extractors/base.py +98 -0
- aiecs/application/knowledge_graph/extractors/llm_entity_extractor.py +422 -0
- aiecs/application/knowledge_graph/extractors/llm_relation_extractor.py +347 -0
- aiecs/application/knowledge_graph/extractors/ner_entity_extractor.py +241 -0
- aiecs/application/knowledge_graph/fusion/__init__.py +78 -0
- aiecs/application/knowledge_graph/fusion/ab_testing.py +395 -0
- aiecs/application/knowledge_graph/fusion/abbreviation_expander.py +327 -0
- aiecs/application/knowledge_graph/fusion/alias_index.py +597 -0
- aiecs/application/knowledge_graph/fusion/alias_matcher.py +384 -0
- aiecs/application/knowledge_graph/fusion/cache_coordinator.py +343 -0
- aiecs/application/knowledge_graph/fusion/entity_deduplicator.py +433 -0
- aiecs/application/knowledge_graph/fusion/entity_linker.py +511 -0
- aiecs/application/knowledge_graph/fusion/evaluation_dataset.py +240 -0
- aiecs/application/knowledge_graph/fusion/knowledge_fusion.py +632 -0
- aiecs/application/knowledge_graph/fusion/matching_config.py +489 -0
- aiecs/application/knowledge_graph/fusion/name_normalizer.py +352 -0
- aiecs/application/knowledge_graph/fusion/relation_deduplicator.py +183 -0
- aiecs/application/knowledge_graph/fusion/semantic_name_matcher.py +464 -0
- aiecs/application/knowledge_graph/fusion/similarity_pipeline.py +534 -0
- aiecs/application/knowledge_graph/pattern_matching/__init__.py +21 -0
- aiecs/application/knowledge_graph/pattern_matching/pattern_matcher.py +342 -0
- aiecs/application/knowledge_graph/pattern_matching/query_executor.py +366 -0
- aiecs/application/knowledge_graph/profiling/__init__.py +12 -0
- aiecs/application/knowledge_graph/profiling/query_plan_visualizer.py +195 -0
- aiecs/application/knowledge_graph/profiling/query_profiler.py +223 -0
- aiecs/application/knowledge_graph/reasoning/__init__.py +27 -0
- aiecs/application/knowledge_graph/reasoning/evidence_synthesis.py +341 -0
- aiecs/application/knowledge_graph/reasoning/inference_engine.py +500 -0
- aiecs/application/knowledge_graph/reasoning/logic_form_parser.py +163 -0
- aiecs/application/knowledge_graph/reasoning/logic_parser/__init__.py +79 -0
- aiecs/application/knowledge_graph/reasoning/logic_parser/ast_builder.py +513 -0
- aiecs/application/knowledge_graph/reasoning/logic_parser/ast_nodes.py +913 -0
- aiecs/application/knowledge_graph/reasoning/logic_parser/ast_validator.py +866 -0
- aiecs/application/knowledge_graph/reasoning/logic_parser/error_handler.py +475 -0
- aiecs/application/knowledge_graph/reasoning/logic_parser/parser.py +396 -0
- aiecs/application/knowledge_graph/reasoning/logic_parser/query_context.py +208 -0
- aiecs/application/knowledge_graph/reasoning/logic_query_integration.py +170 -0
- aiecs/application/knowledge_graph/reasoning/query_planner.py +855 -0
- aiecs/application/knowledge_graph/reasoning/reasoning_engine.py +518 -0
- aiecs/application/knowledge_graph/retrieval/__init__.py +27 -0
- aiecs/application/knowledge_graph/retrieval/query_intent_classifier.py +211 -0
- aiecs/application/knowledge_graph/retrieval/retrieval_strategies.py +592 -0
- aiecs/application/knowledge_graph/retrieval/strategy_types.py +23 -0
- aiecs/application/knowledge_graph/search/__init__.py +59 -0
- aiecs/application/knowledge_graph/search/hybrid_search.py +457 -0
- aiecs/application/knowledge_graph/search/reranker.py +293 -0
- aiecs/application/knowledge_graph/search/reranker_strategies.py +535 -0
- aiecs/application/knowledge_graph/search/text_similarity.py +392 -0
- aiecs/application/knowledge_graph/traversal/__init__.py +15 -0
- aiecs/application/knowledge_graph/traversal/enhanced_traversal.py +305 -0
- aiecs/application/knowledge_graph/traversal/path_scorer.py +271 -0
- aiecs/application/knowledge_graph/validators/__init__.py +13 -0
- aiecs/application/knowledge_graph/validators/relation_validator.py +239 -0
- aiecs/application/knowledge_graph/visualization/__init__.py +11 -0
- aiecs/application/knowledge_graph/visualization/graph_visualizer.py +313 -0
- aiecs/common/__init__.py +9 -0
- aiecs/common/knowledge_graph/__init__.py +17 -0
- aiecs/common/knowledge_graph/runnable.py +471 -0
- aiecs/config/__init__.py +20 -5
- aiecs/config/config.py +762 -31
- aiecs/config/graph_config.py +131 -0
- aiecs/config/tool_config.py +435 -0
- aiecs/core/__init__.py +29 -13
- aiecs/core/interface/__init__.py +2 -2
- aiecs/core/interface/execution_interface.py +22 -22
- aiecs/core/interface/storage_interface.py +37 -88
- aiecs/core/registry/__init__.py +31 -0
- aiecs/core/registry/service_registry.py +92 -0
- aiecs/domain/__init__.py +270 -1
- aiecs/domain/agent/__init__.py +191 -0
- aiecs/domain/agent/base_agent.py +3949 -0
- aiecs/domain/agent/exceptions.py +99 -0
- aiecs/domain/agent/graph_aware_mixin.py +569 -0
- aiecs/domain/agent/hybrid_agent.py +1731 -0
- aiecs/domain/agent/integration/__init__.py +29 -0
- aiecs/domain/agent/integration/context_compressor.py +216 -0
- aiecs/domain/agent/integration/context_engine_adapter.py +587 -0
- aiecs/domain/agent/integration/protocols.py +281 -0
- aiecs/domain/agent/integration/retry_policy.py +218 -0
- aiecs/domain/agent/integration/role_config.py +213 -0
- aiecs/domain/agent/knowledge_aware_agent.py +1892 -0
- aiecs/domain/agent/lifecycle.py +291 -0
- aiecs/domain/agent/llm_agent.py +692 -0
- aiecs/domain/agent/memory/__init__.py +12 -0
- aiecs/domain/agent/memory/conversation.py +1124 -0
- aiecs/domain/agent/migration/__init__.py +14 -0
- aiecs/domain/agent/migration/conversion.py +163 -0
- aiecs/domain/agent/migration/legacy_wrapper.py +86 -0
- aiecs/domain/agent/models.py +894 -0
- aiecs/domain/agent/observability.py +479 -0
- aiecs/domain/agent/persistence.py +449 -0
- aiecs/domain/agent/prompts/__init__.py +29 -0
- aiecs/domain/agent/prompts/builder.py +159 -0
- aiecs/domain/agent/prompts/formatters.py +187 -0
- aiecs/domain/agent/prompts/template.py +255 -0
- aiecs/domain/agent/registry.py +253 -0
- aiecs/domain/agent/tool_agent.py +444 -0
- aiecs/domain/agent/tools/__init__.py +15 -0
- aiecs/domain/agent/tools/schema_generator.py +377 -0
- aiecs/domain/community/__init__.py +155 -0
- aiecs/domain/community/agent_adapter.py +469 -0
- aiecs/domain/community/analytics.py +432 -0
- aiecs/domain/community/collaborative_workflow.py +648 -0
- aiecs/domain/community/communication_hub.py +634 -0
- aiecs/domain/community/community_builder.py +320 -0
- aiecs/domain/community/community_integration.py +796 -0
- aiecs/domain/community/community_manager.py +803 -0
- aiecs/domain/community/decision_engine.py +849 -0
- aiecs/domain/community/exceptions.py +231 -0
- aiecs/domain/community/models/__init__.py +33 -0
- aiecs/domain/community/models/community_models.py +234 -0
- aiecs/domain/community/resource_manager.py +461 -0
- aiecs/domain/community/shared_context_manager.py +589 -0
- aiecs/domain/context/__init__.py +40 -10
- aiecs/domain/context/context_engine.py +1910 -0
- aiecs/domain/context/conversation_models.py +87 -53
- aiecs/domain/context/graph_memory.py +582 -0
- aiecs/domain/execution/model.py +12 -4
- aiecs/domain/knowledge_graph/__init__.py +19 -0
- aiecs/domain/knowledge_graph/models/__init__.py +52 -0
- aiecs/domain/knowledge_graph/models/entity.py +148 -0
- aiecs/domain/knowledge_graph/models/evidence.py +178 -0
- aiecs/domain/knowledge_graph/models/inference_rule.py +184 -0
- aiecs/domain/knowledge_graph/models/path.py +171 -0
- aiecs/domain/knowledge_graph/models/path_pattern.py +171 -0
- aiecs/domain/knowledge_graph/models/query.py +261 -0
- aiecs/domain/knowledge_graph/models/query_plan.py +181 -0
- aiecs/domain/knowledge_graph/models/relation.py +202 -0
- aiecs/domain/knowledge_graph/schema/__init__.py +23 -0
- aiecs/domain/knowledge_graph/schema/entity_type.py +131 -0
- aiecs/domain/knowledge_graph/schema/graph_schema.py +253 -0
- aiecs/domain/knowledge_graph/schema/property_schema.py +143 -0
- aiecs/domain/knowledge_graph/schema/relation_type.py +163 -0
- aiecs/domain/knowledge_graph/schema/schema_manager.py +691 -0
- aiecs/domain/knowledge_graph/schema/type_enums.py +209 -0
- aiecs/domain/task/dsl_processor.py +172 -56
- aiecs/domain/task/model.py +20 -8
- aiecs/domain/task/task_context.py +27 -24
- aiecs/infrastructure/__init__.py +0 -2
- aiecs/infrastructure/graph_storage/__init__.py +11 -0
- aiecs/infrastructure/graph_storage/base.py +837 -0
- aiecs/infrastructure/graph_storage/batch_operations.py +458 -0
- aiecs/infrastructure/graph_storage/cache.py +424 -0
- aiecs/infrastructure/graph_storage/distributed.py +223 -0
- aiecs/infrastructure/graph_storage/error_handling.py +380 -0
- aiecs/infrastructure/graph_storage/graceful_degradation.py +294 -0
- aiecs/infrastructure/graph_storage/health_checks.py +378 -0
- aiecs/infrastructure/graph_storage/in_memory.py +1197 -0
- aiecs/infrastructure/graph_storage/index_optimization.py +446 -0
- aiecs/infrastructure/graph_storage/lazy_loading.py +431 -0
- aiecs/infrastructure/graph_storage/metrics.py +344 -0
- aiecs/infrastructure/graph_storage/migration.py +400 -0
- aiecs/infrastructure/graph_storage/pagination.py +483 -0
- aiecs/infrastructure/graph_storage/performance_monitoring.py +456 -0
- aiecs/infrastructure/graph_storage/postgres.py +1563 -0
- aiecs/infrastructure/graph_storage/property_storage.py +353 -0
- aiecs/infrastructure/graph_storage/protocols.py +76 -0
- aiecs/infrastructure/graph_storage/query_optimizer.py +642 -0
- aiecs/infrastructure/graph_storage/schema_cache.py +290 -0
- aiecs/infrastructure/graph_storage/sqlite.py +1373 -0
- aiecs/infrastructure/graph_storage/streaming.py +487 -0
- aiecs/infrastructure/graph_storage/tenant.py +412 -0
- aiecs/infrastructure/messaging/celery_task_manager.py +92 -54
- aiecs/infrastructure/messaging/websocket_manager.py +51 -35
- aiecs/infrastructure/monitoring/__init__.py +22 -0
- aiecs/infrastructure/monitoring/executor_metrics.py +45 -11
- aiecs/infrastructure/monitoring/global_metrics_manager.py +212 -0
- aiecs/infrastructure/monitoring/structured_logger.py +3 -7
- aiecs/infrastructure/monitoring/tracing_manager.py +63 -35
- aiecs/infrastructure/persistence/__init__.py +14 -1
- aiecs/infrastructure/persistence/context_engine_client.py +184 -0
- aiecs/infrastructure/persistence/database_manager.py +67 -43
- aiecs/infrastructure/persistence/file_storage.py +180 -103
- aiecs/infrastructure/persistence/redis_client.py +74 -21
- aiecs/llm/__init__.py +73 -25
- aiecs/llm/callbacks/__init__.py +11 -0
- aiecs/llm/{custom_callbacks.py → callbacks/custom_callbacks.py} +26 -19
- aiecs/llm/client_factory.py +230 -37
- aiecs/llm/client_resolver.py +155 -0
- aiecs/llm/clients/__init__.py +38 -0
- aiecs/llm/clients/base_client.py +328 -0
- aiecs/llm/clients/google_function_calling_mixin.py +415 -0
- aiecs/llm/clients/googleai_client.py +314 -0
- aiecs/llm/clients/openai_client.py +158 -0
- aiecs/llm/clients/openai_compatible_mixin.py +367 -0
- aiecs/llm/clients/vertex_client.py +1186 -0
- aiecs/llm/clients/xai_client.py +201 -0
- aiecs/llm/config/__init__.py +51 -0
- aiecs/llm/config/config_loader.py +272 -0
- aiecs/llm/config/config_validator.py +206 -0
- aiecs/llm/config/model_config.py +143 -0
- aiecs/llm/protocols.py +149 -0
- aiecs/llm/utils/__init__.py +10 -0
- aiecs/llm/utils/validate_config.py +89 -0
- aiecs/main.py +140 -121
- aiecs/scripts/aid/VERSION_MANAGEMENT.md +138 -0
- aiecs/scripts/aid/__init__.py +19 -0
- aiecs/scripts/aid/module_checker.py +499 -0
- aiecs/scripts/aid/version_manager.py +235 -0
- aiecs/scripts/{DEPENDENCY_SYSTEM_SUMMARY.md → dependance_check/DEPENDENCY_SYSTEM_SUMMARY.md} +1 -0
- aiecs/scripts/{README_DEPENDENCY_CHECKER.md → dependance_check/README_DEPENDENCY_CHECKER.md} +1 -0
- aiecs/scripts/dependance_check/__init__.py +15 -0
- aiecs/scripts/dependance_check/dependency_checker.py +1835 -0
- aiecs/scripts/{dependency_fixer.py → dependance_check/dependency_fixer.py} +192 -90
- aiecs/scripts/{download_nlp_data.py → dependance_check/download_nlp_data.py} +203 -71
- aiecs/scripts/dependance_patch/__init__.py +7 -0
- aiecs/scripts/dependance_patch/fix_weasel/__init__.py +11 -0
- aiecs/scripts/{fix_weasel_validator.py → dependance_patch/fix_weasel/fix_weasel_validator.py} +21 -14
- aiecs/scripts/{patch_weasel_library.sh → dependance_patch/fix_weasel/patch_weasel_library.sh} +1 -1
- aiecs/scripts/knowledge_graph/__init__.py +3 -0
- aiecs/scripts/knowledge_graph/run_threshold_experiments.py +212 -0
- aiecs/scripts/migrations/multi_tenancy/README.md +142 -0
- aiecs/scripts/tools_develop/README.md +671 -0
- aiecs/scripts/tools_develop/README_CONFIG_CHECKER.md +273 -0
- aiecs/scripts/tools_develop/TOOLS_CONFIG_GUIDE.md +1287 -0
- aiecs/scripts/tools_develop/TOOL_AUTO_DISCOVERY.md +234 -0
- aiecs/scripts/tools_develop/__init__.py +21 -0
- aiecs/scripts/tools_develop/check_all_tools_config.py +548 -0
- aiecs/scripts/tools_develop/check_type_annotations.py +257 -0
- aiecs/scripts/tools_develop/pre-commit-schema-coverage.sh +66 -0
- aiecs/scripts/tools_develop/schema_coverage.py +511 -0
- aiecs/scripts/tools_develop/validate_tool_schemas.py +475 -0
- aiecs/scripts/tools_develop/verify_executor_config_fix.py +98 -0
- aiecs/scripts/tools_develop/verify_tools.py +352 -0
- aiecs/tasks/__init__.py +0 -1
- aiecs/tasks/worker.py +115 -47
- aiecs/tools/__init__.py +194 -72
- aiecs/tools/apisource/__init__.py +99 -0
- aiecs/tools/apisource/intelligence/__init__.py +19 -0
- aiecs/tools/apisource/intelligence/data_fusion.py +632 -0
- aiecs/tools/apisource/intelligence/query_analyzer.py +417 -0
- aiecs/tools/apisource/intelligence/search_enhancer.py +385 -0
- aiecs/tools/apisource/monitoring/__init__.py +9 -0
- aiecs/tools/apisource/monitoring/metrics.py +330 -0
- aiecs/tools/apisource/providers/__init__.py +112 -0
- aiecs/tools/apisource/providers/base.py +671 -0
- aiecs/tools/apisource/providers/census.py +397 -0
- aiecs/tools/apisource/providers/fred.py +535 -0
- aiecs/tools/apisource/providers/newsapi.py +409 -0
- aiecs/tools/apisource/providers/worldbank.py +352 -0
- aiecs/tools/apisource/reliability/__init__.py +12 -0
- aiecs/tools/apisource/reliability/error_handler.py +363 -0
- aiecs/tools/apisource/reliability/fallback_strategy.py +376 -0
- aiecs/tools/apisource/tool.py +832 -0
- aiecs/tools/apisource/utils/__init__.py +9 -0
- aiecs/tools/apisource/utils/validators.py +334 -0
- aiecs/tools/base_tool.py +415 -21
- aiecs/tools/docs/__init__.py +121 -0
- aiecs/tools/docs/ai_document_orchestrator.py +607 -0
- aiecs/tools/docs/ai_document_writer_orchestrator.py +2350 -0
- aiecs/tools/docs/content_insertion_tool.py +1320 -0
- aiecs/tools/docs/document_creator_tool.py +1464 -0
- aiecs/tools/docs/document_layout_tool.py +1160 -0
- aiecs/tools/docs/document_parser_tool.py +1016 -0
- aiecs/tools/docs/document_writer_tool.py +2008 -0
- aiecs/tools/knowledge_graph/__init__.py +17 -0
- aiecs/tools/knowledge_graph/graph_reasoning_tool.py +807 -0
- aiecs/tools/knowledge_graph/graph_search_tool.py +944 -0
- aiecs/tools/knowledge_graph/kg_builder_tool.py +524 -0
- aiecs/tools/langchain_adapter.py +300 -138
- aiecs/tools/schema_generator.py +455 -0
- aiecs/tools/search_tool/__init__.py +100 -0
- aiecs/tools/search_tool/analyzers.py +581 -0
- aiecs/tools/search_tool/cache.py +264 -0
- aiecs/tools/search_tool/constants.py +128 -0
- aiecs/tools/search_tool/context.py +224 -0
- aiecs/tools/search_tool/core.py +778 -0
- aiecs/tools/search_tool/deduplicator.py +119 -0
- aiecs/tools/search_tool/error_handler.py +242 -0
- aiecs/tools/search_tool/metrics.py +343 -0
- aiecs/tools/search_tool/rate_limiter.py +172 -0
- aiecs/tools/search_tool/schemas.py +275 -0
- aiecs/tools/statistics/__init__.py +80 -0
- aiecs/tools/statistics/ai_data_analysis_orchestrator.py +646 -0
- aiecs/tools/statistics/ai_insight_generator_tool.py +508 -0
- aiecs/tools/statistics/ai_report_orchestrator_tool.py +684 -0
- aiecs/tools/statistics/data_loader_tool.py +555 -0
- aiecs/tools/statistics/data_profiler_tool.py +638 -0
- aiecs/tools/statistics/data_transformer_tool.py +580 -0
- aiecs/tools/statistics/data_visualizer_tool.py +498 -0
- aiecs/tools/statistics/model_trainer_tool.py +507 -0
- aiecs/tools/statistics/statistical_analyzer_tool.py +472 -0
- aiecs/tools/task_tools/__init__.py +49 -36
- aiecs/tools/task_tools/chart_tool.py +200 -184
- aiecs/tools/task_tools/classfire_tool.py +268 -267
- aiecs/tools/task_tools/image_tool.py +220 -141
- aiecs/tools/task_tools/office_tool.py +226 -146
- aiecs/tools/task_tools/pandas_tool.py +477 -121
- aiecs/tools/task_tools/report_tool.py +390 -142
- aiecs/tools/task_tools/research_tool.py +149 -79
- aiecs/tools/task_tools/scraper_tool.py +339 -145
- aiecs/tools/task_tools/stats_tool.py +448 -209
- aiecs/tools/temp_file_manager.py +26 -24
- aiecs/tools/tool_executor/__init__.py +18 -16
- aiecs/tools/tool_executor/tool_executor.py +364 -52
- aiecs/utils/LLM_output_structor.py +74 -48
- aiecs/utils/__init__.py +14 -3
- aiecs/utils/base_callback.py +0 -3
- aiecs/utils/cache_provider.py +696 -0
- aiecs/utils/execution_utils.py +50 -31
- aiecs/utils/prompt_loader.py +1 -0
- aiecs/utils/token_usage_repository.py +37 -11
- aiecs/ws/socket_server.py +14 -4
- {aiecs-1.0.1.dist-info → aiecs-1.7.17.dist-info}/METADATA +52 -15
- aiecs-1.7.17.dist-info/RECORD +337 -0
- aiecs-1.7.17.dist-info/entry_points.txt +13 -0
- aiecs/config/registry.py +0 -19
- aiecs/domain/context/content_engine.py +0 -982
- aiecs/llm/base_client.py +0 -99
- aiecs/llm/openai_client.py +0 -125
- aiecs/llm/vertex_client.py +0 -186
- aiecs/llm/xai_client.py +0 -184
- aiecs/scripts/dependency_checker.py +0 -857
- aiecs/scripts/quick_dependency_check.py +0 -269
- aiecs/tools/task_tools/search_api.py +0 -7
- aiecs-1.0.1.dist-info/RECORD +0 -90
- aiecs-1.0.1.dist-info/entry_points.txt +0 -7
- /aiecs/scripts/{setup_nlp_data.sh → dependance_check/setup_nlp_data.sh} +0 -0
- /aiecs/scripts/{README_WEASEL_PATCH.md → dependance_patch/fix_weasel/README_WEASEL_PATCH.md} +0 -0
- /aiecs/scripts/{fix_weasel_validator.sh → dependance_patch/fix_weasel/fix_weasel_validator.sh} +0 -0
- /aiecs/scripts/{run_weasel_patch.sh → dependance_patch/fix_weasel/run_weasel_patch.sh} +0 -0
- {aiecs-1.0.1.dist-info → aiecs-1.7.17.dist-info}/WHEEL +0 -0
- {aiecs-1.0.1.dist-info → aiecs-1.7.17.dist-info}/licenses/LICENSE +0 -0
- {aiecs-1.0.1.dist-info → aiecs-1.7.17.dist-info}/top_level.txt +0 -0
aiecs/tools/task_tools/stats_tool.py
@@ -5,38 +5,44 @@ from typing import Dict, Any, List, Optional, Union, Tuple
 from enum import Enum
 from dataclasses import dataclass

-import pandas as pd
+import pandas as pd  # type: ignore[import-untyped]
 import numpy as np
-from
-from
+from pydantic import Field, BaseModel
+from pydantic_settings import BaseSettings, SettingsConfigDict

 from aiecs.tools.base_tool import BaseTool
 from aiecs.tools import register_tool

 # Enums for configuration options
+
+
 class ScalerType(str, Enum):
     STANDARD = "standard"
     MINMAX = "minmax"
     ROBUST = "robust"
     NONE = "none"

-class StatsSettings(BaseSettings):
-    """Configuration for StatsTool."""
-    max_file_size_mb: int = 200
-    allowed_extensions: List[str] = ['.sav', '.sas7bdat', '.por', '.csv', '.xlsx', '.xls', '.json', '.parquet', '.feather']
-    env_prefix: str = 'STATS_TOOL_'
-
-    model_config = ConfigDict(env_prefix='STATS_TOOL_')

 # Exceptions
-class StatsToolError(Exception):
-
-
+class StatsToolError(Exception):
+    pass
+
+
+class FileOperationError(StatsToolError):
+    pass
+
+
+class AnalysisError(StatsToolError):
+    pass
+

 # Utility Dataclass for Statistical Results
+
+
 @dataclass
 class StatsResult:
     """Structured statistical result."""
+
     test_type: str
     statistic: float
     pvalue: float
@@ -45,53 +51,186 @@ class StatsResult:

     def to_dict(self) -> Dict[str, Any]:
         return {
-
-
-
-
-            **self.additional_metrics
+            "test_type": self.test_type,
+            "statistic": self.statistic,
+            "pvalue": self.pvalue,
+            "significant": self.significant,
+            **self.additional_metrics,
         }

-
+
+@register_tool("stats")
 class StatsTool(BaseTool):
     """Enhanced statistical analysis tool for various data formats and operations."""
-
-
-
-
-
-
-
-
+
+    # Configuration schema
+    class Config(BaseSettings):
+        """Configuration for the stats tool
+
+        Automatically reads from environment variables with STATS_TOOL_ prefix.
+        Example: STATS_TOOL_MAX_FILE_SIZE_MB -> max_file_size_mb
+        """
+
+        model_config = SettingsConfigDict(env_prefix="STATS_TOOL_")
+
+        max_file_size_mb: int = Field(default=200, description="Maximum file size in megabytes")
+        allowed_extensions: List[str] = Field(
+            default=[
+                ".sav",
+                ".sas7bdat",
+                ".por",
+                ".csv",
+                ".xlsx",
+                ".xls",
+                ".json",
+                ".parquet",
+                ".feather",
+            ],
+            description="Allowed file extensions",
+        )
+
+    # Schema definitions
+    class Read_dataSchema(BaseModel):
+        """Schema for read_data operation"""
+
+        file_path: str = Field(description="Path to the data file to read")
+        nrows: Optional[int] = Field(default=None, description="Optional number of rows to read from the file. If None, reads all rows")
+        sheet_name: Optional[Union[str, int]] = Field(default=0, description="Sheet name or index for Excel files. Can be a string name or integer index (0-based)")
+
+    class DescribeSchema(BaseModel):
+        """Schema for describe operation"""
+
+        file_path: str = Field(description="Path to the data file")
+        variables: Optional[List[str]] = Field(default=None, description="Optional list of variable names to describe. If None, describes all variables")
+        include_percentiles: bool = Field(default=False, description="Whether to include custom percentiles in the descriptive statistics")
+        percentiles: Optional[List[float]] = Field(default=None, description="Optional list of percentile values (0.0 to 1.0) to include. Only used if include_percentiles is True")
+
+    class TtestSchema(BaseModel):
+        """Schema for ttest operation"""
+
+        file_path: str = Field(description="Path to the data file")
+        var1: str = Field(description="Name of the first variable for the t-test")
+        var2: str = Field(description="Name of the second variable for the t-test")
+        equal_var: bool = Field(default=True, description="Whether to assume equal variances. If True, uses standard t-test; if False, uses Welch's t-test")
+        paired: bool = Field(default=False, description="Whether to perform a paired t-test. If True, performs paired t-test; if False, performs independent t-test")
+
+    class CorrelationSchema(BaseModel):
+        """Schema for correlation operation"""
+
+        file_path: str = Field(description="Path to the data file")
+        variables: Optional[List[str]] = Field(default=None, description="Optional list of variable names for correlation matrix. If provided, computes correlation matrix for all pairs")
+        var1: Optional[str] = Field(default=None, description="First variable name for pairwise correlation. Must be used together with var2")
+        var2: Optional[str] = Field(default=None, description="Second variable name for pairwise correlation. Must be used together with var1")
+        method: str = Field(default="pearson", description="Correlation method: 'pearson' (linear), 'spearman' (rank-based), or 'kendall' (tau)")
+
+    class AnovaSchema(BaseModel):
+        """Schema for anova operation"""
+
+        file_path: str = Field(description="Path to the data file")
+        dependent: str = Field(description="Name of the dependent variable (continuous)")
+        factor: str = Field(description="Name of the factor/grouping variable (categorical)")
+        post_hoc: bool = Field(default=False, description="Whether to perform post-hoc tests (Tukey HSD) to identify which groups differ significantly")
+
+    class Chi_squareSchema(BaseModel):
+        """Schema for chi_square operation"""
+
+        file_path: str = Field(description="Path to the data file")
+        var1: str = Field(description="Name of the first categorical variable")
+        var2: str = Field(description="Name of the second categorical variable")
+        correction: bool = Field(default=True, description="Whether to apply Yates' correction for continuity. Recommended for 2x2 contingency tables")
+
+    class Non_parametricSchema(BaseModel):
+        """Schema for non_parametric operation"""
+
+        file_path: str = Field(description="Path to the data file")
+        test_type: str = Field(description="Type of non-parametric test: 'mann_whitney' (2 groups), 'wilcoxon' (paired), 'kruskal' (multiple groups), or 'friedman' (repeated measures)")
+        variables: List[str] = Field(description="List of variable names to test. Number of variables depends on test_type")
+        grouping: Optional[str] = Field(default=None, description="Optional grouping variable name. Required for 'kruskal' test, not used for other tests")
+
+    class RegressionSchema(BaseModel):
+        """Schema for regression operation"""
+
+        file_path: str = Field(description="Path to the data file")
+        formula: str = Field(description="Regression formula string (e.g., 'y ~ x1 + x2'). Uses R-style formula syntax")
+        regression_type: str = Field(default="ols", description="Type of regression model: 'ols' (ordinary least squares), 'logit' (logistic), 'probit', or 'poisson'")
+        robust: bool = Field(default=False, description="Whether to use robust standard errors (HC3 heteroscedasticity-consistent)")
+        structured_output: bool = Field(default=True, description="Whether to return structured output with coefficients, p-values, and confidence intervals. If False, returns summary text only")
+
+    class Time_seriesSchema(BaseModel):
+        """Schema for time_series operation"""
+
+        file_path: str = Field(description="Path to the data file")
+        variable: str = Field(description="Name of the time series variable to analyze")
+        date_variable: Optional[str] = Field(default=None, description="Optional name of the date/time variable. If provided, uses it as the time index")
+        model_type: str = Field(default="arima", description="Type of time series model: 'arima' or 'sarima' (seasonal ARIMA)")
+        order: Optional[Tuple[int, int, int]] = Field(default=(1, 1, 1), description="ARIMA order tuple (p, d, q) where p=autoregressive, d=differencing, q=moving average")
+        seasonal_order: Optional[Tuple[int, int, int, int]] = Field(default=None, description="Optional SARIMA seasonal order tuple (P, D, Q, s). Required for 'sarima' model type")
+        forecast_periods: int = Field(default=10, description="Number of periods to forecast into the future")
+
+    class PreprocessSchema(BaseModel):
+        """Schema for preprocess operation"""
+
+        file_path: str = Field(description="Path to the data file")
+        variables: List[str] = Field(description="List of variable names to preprocess")
+        operation: str = Field(description="Preprocessing operation: 'scale' (normalize) or 'impute' (fill missing values)")
+        scaler_type: ScalerType = Field(default=ScalerType.STANDARD, description="Type of scaler to use for scaling operation: 'standard' (z-score), 'minmax' (0-1), 'robust' (median/IQR), or 'none'")
+        output_path: Optional[str] = Field(default=None, description="Optional path to save the preprocessed data. If None, data is not saved to file")
+
+    def __init__(self, config: Optional[Dict[str, Any]] = None, **kwargs):
+        """
+        Initialize StatsTool with settings and resources.
+
+        Args:
+            config (Dict, optional): Configuration overrides for StatsTool.
+            **kwargs: Additional arguments passed to BaseTool (e.g., tool_name)
+
+        Configuration is automatically loaded by BaseTool from:
+        1. Explicit config dict (highest priority)
+        2. YAML config files (config/tools/stats.yaml)
+        3. Environment variables (via dotenv from .env files)
+        4. Tool defaults (lowest priority)
+        """
+        super().__init__(config, **kwargs)
+
+        # Configuration is automatically loaded by BaseTool into self._config_obj
+        # Access config via self._config_obj (BaseSettings instance)
+        self.config = self._config_obj if self._config_obj else self.Config()
+
         self.logger = logging.getLogger(__name__)
         if not self.logger.handlers:
             h = logging.StreamHandler()
-            h.setFormatter(logging.Formatter(
+            h.setFormatter(logging.Formatter("%(asctime)s %(levelname)s %(message)s"))
             self.logger.addHandler(h)
             self.logger.setLevel(logging.INFO)

-    def _load_data(
+    def _load_data(
+        self,
+        file_path: str,
+        nrows: Optional[int] = None,
+        sheet_name: Optional[Union[str, int]] = 0,
+    ) -> pd.DataFrame:
         """Load data from various file formats into a pandas DataFrame."""
         try:
             ext = os.path.splitext(file_path)[1].lower()
-            if ext in [
-                import pyreadstat
-
+            if ext in [".sav", ".sas7bdat", ".por"]:
+                import pyreadstat  # type: ignore[import-untyped]
+
+                if ext == ".sav":
                     df, meta = pyreadstat.read_sav(file_path)
-            elif ext ==
+                elif ext == ".sas7bdat":
                     df, meta = pyreadstat.read_sas7bdat(file_path)
                 else:
                     df, meta = pyreadstat.read_por(file_path)
                 return df
-            elif ext ==
+            elif ext == ".csv":
                 return pd.read_csv(file_path, nrows=nrows)
-            elif ext in [
+            elif ext in [".xlsx", ".xls"]:
                 return pd.read_excel(file_path, sheet_name=sheet_name, nrows=nrows)
-            elif ext ==
+            elif ext == ".json":
                 return pd.read_json(file_path)
-            elif ext ==
+            elif ext == ".parquet":
                 return pd.read_parquet(file_path)
-            elif ext ==
+            elif ext == ".feather":
                 return pd.read_feather(file_path)
             else:
                 raise FileOperationError(f"Unsupported file format: {ext}")
@@ -115,18 +254,29 @@ class StatsTool(BaseTool):
                 return label
         return "large"

-    def read_data(
+    def read_data(
+        self,
+        file_path: str,
+        nrows: Optional[int] = None,
+        sheet_name: Optional[Union[str, int]] = 0,
+    ) -> Dict[str, Any]:
         """Read data from various file formats."""
         df = self._load_data(file_path, nrows, sheet_name)
         return {
-
-
-
-
-
+            "variables": df.columns.tolist(),
+            "observations": len(df),
+            "dtypes": {col: str(dtype) for col, dtype in df.dtypes.items()},
+            "memory_usage": df.memory_usage(deep=True).sum() / (1024 * 1024),
+            "preview": df.head(5).to_dict(orient="records"),
         }

-    def describe(
+    def describe(
+        self,
+        file_path: str,
+        variables: Optional[List[str]] = None,
+        include_percentiles: bool = False,
+        percentiles: Optional[List[float]] = None,
+    ) -> Dict[str, Any]:
         """Generate descriptive statistics for variables."""
         df = self._load_data(file_path)
         if variables:
@@ -137,21 +287,31 @@ class StatsTool(BaseTool):
            additional_percentiles = [p for p in percentiles if p not in [0.25, 0.5, 0.75]]
            if additional_percentiles:
                additional_desc = df.describe(percentiles=percentiles)
-                desc = pd.concat(
+                desc = pd.concat(
+                    [
+                        desc,
+                        additional_desc.loc[[f"{int(p*100)}%" for p in additional_percentiles]],
+                    ]
+                )
         numeric_cols = df.select_dtypes(include=[np.number]).columns
         if numeric_cols.any():
-            desc.loc[
-            desc.loc[
-        return {
-            'statistics': desc.to_dict(),
-            'summary': desc.to_string()
-        }
+            desc.loc["skew"] = df[numeric_cols].skew()
+            desc.loc["kurtosis"] = df[numeric_cols].kurt()
+        return {"statistics": desc.to_dict(), "summary": desc.to_string()}

-    def ttest(
+    def ttest(
+        self,
+        file_path: str,
+        var1: str,
+        var2: str,
+        equal_var: bool = True,
+        paired: bool = False,
+    ) -> Dict[str, Any]:
         """Perform t-tests (independent or paired). Also handles legacy ttest_ind."""
         df = self._load_data(file_path)
         self._validate_variables(df, [var1, var2])
-        import scipy.stats as stats
+        import scipy.stats as stats  # type: ignore[import-untyped]
+
         a = df[var1].dropna().values
         b = df[var2].dropna().values
         if paired:
@@ -176,64 +336,85 @@ class StatsTool(BaseTool):
             pvalue=float(p),
             significant=p < 0.05,
             additional_metrics={
-
-
-
-
-
-
-
-
-            }
+                "cohens_d": float(cohens_d),
+                "effect_size_interpretation": self._interpret_effect_size(cohens_d),
+                "group1_mean": float(mean_a),
+                "group2_mean": float(mean_b),
+                "group1_std": float(std_a),
+                "group2_std": float(std_b),
+                "group1_n": int(len(a)),
+                "group2_n": int(len(b)),
+            },
         ).to_dict()

     # Legacy method (now an alias)
     ttest_ind = ttest

-    def correlation(
+    def correlation(
+        self,
+        file_path: str,
+        variables: Optional[List[str]] = None,
+        var1: Optional[str] = None,
+        var2: Optional[str] = None,
+        method: str = "pearson",
+    ) -> Dict[str, Any]:
         """Perform correlation analysis."""
         df = self._load_data(file_path)
         if variables:
             self._validate_variables(df, variables)
         if var1 and var2:
             self._validate_variables(df, [var1, var2])
-        import scipy.stats as stats
+        import scipy.stats as stats  # type: ignore[import-untyped]
+
         result = {}
         if variables:
             corr_matrix = df[variables].corr(method=method)
-            result[
+            result["correlation_matrix"] = corr_matrix.to_dict()
             flat_corrs = [
-                {
+                {
+                    "var1": v1,
+                    "var2": v2,
+                    "correlation": corr_matrix.loc[v1, v2],
+                    "abs_correlation": abs(corr_matrix.loc[v1, v2]),
+                }
                 for i, v1 in enumerate(variables)
-                for j, v2 in enumerate(variables)
+                for j, v2 in enumerate(variables)
+                if i < j
             ]
-            flat_corrs.sort(key=lambda x: x[
-            result[
+            flat_corrs.sort(key=lambda x: x["abs_correlation"], reverse=True)
+            result["pairs"] = flat_corrs
         elif var1 and var2:
             x = df[var1].dropna()
             y = df[var2].dropna()
             method_map = {
-
-
-
+                "pearson": (stats.pearsonr, "Pearson's r"),
+                "spearman": (stats.spearmanr, "Spearman's rho"),
+                "kendall": (stats.kendalltau, "Kendall's tau"),
             }
             func, method_name = method_map[method]
             corr, p = func(x, y)
             result = {
-
-
-
-
-
+                "method": method_name,
+                "correlation": float(corr),
+                "pvalue": float(p),
+                "significant": p < 0.05,
+                "n": len(x),
             }
         return result

-    def anova(
+    def anova(
+        self,
+        file_path: str,
+        dependent: str,
+        factor: str,
+        post_hoc: bool = False,
+    ) -> Dict[str, Any]:
         """Perform one-way ANOVA with optional post-hoc tests."""
         df = self._load_data(file_path)
         self._validate_variables(df, [dependent, factor])
-        import scipy.stats as stats
-        from statsmodels.stats.multicomp import pairwise_tukeyhsd
+        import scipy.stats as stats  # type: ignore[import-untyped] # type: ignore[import-untyped]
+        from statsmodels.stats.multicomp import pairwise_tukeyhsd  # type: ignore[import-untyped]
+
         dependent_var = df[dependent].dropna()
         factor_var = df[factor].dropna()
         min_len = min(len(dependent_var), len(factor_var))
@@ -242,42 +423,46 @@ class StatsTool(BaseTool):
         groups = {name: group[dependent].dropna().values for name, group in df.groupby(factor)}
         stat, p = stats.f_oneway(*groups.values())
         result = {
-
-
-
-
-
-
-
+            "F": float(stat),
+            "pvalue": float(p),
+            "significant": p < 0.05,
+            "groups": len(groups),
+            "group_sizes": {name: len(values) for name, values in groups.items()},
+            "group_means": {name: float(np.mean(values)) for name, values in groups.items()},
+            "group_std": {name: float(np.std(values, ddof=1)) for name, values in groups.items()},
         }
         if post_hoc:
-            post_hoc_df = pd.DataFrame({
-            tukey = pairwise_tukeyhsd(post_hoc_df[
+            post_hoc_df = pd.DataFrame({"value": dependent_var, "group": factor_var})
+            tukey = pairwise_tukeyhsd(post_hoc_df["value"], post_hoc_df["group"])
             from itertools import combinations
+
             group_pairs = list(combinations(tukey.groupsunique, 2))
             tukey_results = [
                 {
-
-
-
-
-
-
-
+                    "group1": str(group1),
+                    "group2": str(group2),
+                    "mean_difference": float(mean_diff),
+                    "p_adjusted": float(p_adj),
+                    "significant": bool(reject),
+                    "conf_lower": float(lower),
+                    "conf_upper": float(upper),
                 }
-                for (
+                for (
+                    group1,
+                    group2,
+                ), mean_diff, p_adj, lower, upper, reject in zip(
                     group_pairs,
                     tukey.meandiffs,
                     tukey.pvalues,
-                    tukey.confint[:,0],
-                    tukey.confint[:,1],
-                    tukey.reject
+                    tukey.confint[:, 0],
+                    tukey.confint[:, 1],
+                    tukey.reject,
                 )
             ]
-            result[
-
-
-
+            result["post_hoc"] = {
+                "method": "Tukey HSD",
+                "alpha": 0.05,  # Standard significance level for Tukey HSD
+                "comparisons": tukey_results,
             }
         return result

@@ -285,48 +470,56 @@ class StatsTool(BaseTool):
         """Perform chi-square test of independence."""
         df = self._load_data(file_path)
         self._validate_variables(df, [var1, var2])
-        import scipy.stats as stats
+        import scipy.stats as stats  # type: ignore[import-untyped]
+
         contingency = pd.crosstab(df[var1], df[var2])
         chi2, p, dof, expected = stats.chi2_contingency(contingency, correction=correction)
         n = contingency.sum().sum()
         min_dim = min(contingency.shape) - 1
         cramers_v = np.sqrt(chi2 / (n * min_dim))
         return {
-
-
-
-
-
-
-
-
-
+            "chi2": float(chi2),
+            "pvalue": float(p),
+            "dof": int(dof),
+            "significant": p < 0.05,
+            "cramers_v": float(cramers_v),
+            "effect_size_interpretation": self._interpret_effect_size(cramers_v),
+            "contingency_table": contingency.to_dict(),
+            "expected_frequencies": pd.DataFrame(expected, index=contingency.index, columns=contingency.columns).to_dict(),
+            "test_type": ("Chi-square test with Yates correction" if correction else "Chi-square test"),
         }

-    def non_parametric(
+    def non_parametric(
+        self,
+        file_path: str,
+        test_type: str,
+        variables: List[str],
+        grouping: Optional[str] = None,
+    ) -> Dict[str, Any]:
         """Perform non-parametric statistical tests."""
         df = self._load_data(file_path)
         self._validate_variables(df, variables + ([grouping] if grouping else []))
-        import scipy.stats as stats
-
+        import scipy.stats as stats  # type: ignore[import-untyped]
+
+        if test_type == "mann_whitney":
             if len(variables) != 2:
                 raise AnalysisError("Mann-Whitney U test requires exactly 2 variables")
             x = df[variables[0]].dropna().values
             y = df[variables[1]].dropna().values
             u_stat, p_value = stats.mannwhitneyu(x, y)
             return StatsResult(
-                test_type=
+                test_type="Mann-Whitney U test",
                 statistic=float(u_stat),
                 pvalue=float(p_value),
                 significant=p_value < 0.05,
                 additional_metrics={
-
-
-
-
-                }
+                    "n1": len(x),
+                    "n2": len(y),
+                    "median1": float(np.median(x)),
+                    "median2": float(np.median(y)),
+                },
             ).to_dict()
-            elif test_type ==
+        elif test_type == "wilcoxon":
             if len(variables) != 2:
                 raise AnalysisError("Wilcoxon signed-rank test requires exactly 2 variables")
             x = df[variables[0]].dropna().values
@@ -336,161 +529,202 @@ class StatsTool(BaseTool):
             y = y[:min_len]
             w_stat, p_value = stats.wilcoxon(x, y)
             return StatsResult(
-                test_type=
+                test_type="Wilcoxon signed-rank test",
                 statistic=float(w_stat),
                 pvalue=float(p_value),
                 significant=p_value < 0.05,
                 additional_metrics={
-
-
-                }
+                    "n_pairs": min_len,
+                    "median_difference": float(np.median(x - y)),
+                },
             ).to_dict()
-            elif test_type ==
+        elif test_type == "kruskal":
             if not grouping:
                 raise AnalysisError("Kruskal-Wallis test requires a grouping variable")
             groups = {f"{var}_{name}": group[var].dropna().values for name, group in df.groupby(grouping) for var in variables}
             h_stat, p_value = stats.kruskal(*groups.values())
             return StatsResult(
-                test_type=
+                test_type="Kruskal-Wallis H test",
                 statistic=float(h_stat),
                 pvalue=float(p_value),
                 significant=p_value < 0.05,
                 additional_metrics={
-
-
-
-                }
+                    "groups": len(groups),
+                    "group_sizes": {name: len(values) for name, values in groups.items()},
+                    "group_medians": {name: float(np.median(values)) for name, values in groups.items()},
+                },
             ).to_dict()
-            elif test_type ==
+        elif test_type == "friedman":
             if len(variables) < 2:
                 raise AnalysisError("Friedman test requires at least 2 variables")
             data = df[variables].dropna()
             chi2, p_value = stats.friedmanchisquare(*[data[var].values for var in variables])
             return StatsResult(
-                test_type=
+                test_type="Friedman test",
                 statistic=float(chi2),
                 pvalue=float(p_value),
                 significant=p_value < 0.05,
                 additional_metrics={
-
-
-
-                }
+                    "n_measures": len(variables),
+                    "n_samples": len(data),
+                    "variable_medians": {var: float(np.median(data[var])) for var in variables},
+                },
             ).to_dict()
         else:
             raise AnalysisError(f"Unsupported non-parametric test type: {test_type}. Supported types: mann_whitney, wilcoxon, kruskal, friedman")

-    def regression(
+    def regression(
+        self,
+        file_path: str,
+        formula: str,
+        regression_type: str = "ols",
+        robust: bool = False,
+        structured_output: bool = True,
+    ) -> Dict[str, Any]:
         """Perform regression analysis with various models."""
         df = self._load_data(file_path)
-        import statsmodels.formula.api as smf
+        import statsmodels.formula.api as smf  # type: ignore[import-untyped]
+
         try:
             model_map = {
-
-
-
-
+                "ols": smf.ols,
+                "logit": smf.logit,
+                "probit": smf.probit,
+                "poisson": smf.poisson,
             }
             model = model_map[regression_type](formula=formula, data=df)
-            fit = model.fit(cov_type=
+            fit = model.fit(cov_type="HC3" if robust else "nonrobust")
             if structured_output:
                 result = {
-
-
-
-
-
-
-
-
-
-
-
+                    "model_type": regression_type,
+                    "formula": formula,
+                    "n_observations": int(fit.nobs),
+                    "r_squared": (float(fit.rsquared) if hasattr(fit, "rsquared") else None),
+                    "adj_r_squared": (float(fit.rsquared_adj) if hasattr(fit, "rsquared_adj") else None),
+                    "aic": float(fit.aic) if hasattr(fit, "aic") else None,
+                    "bic": float(fit.bic) if hasattr(fit, "bic") else None,
+                    "f_statistic": (float(fit.fvalue) if hasattr(fit, "fvalue") else None),
+                    "f_pvalue": (float(fit.f_pvalue) if hasattr(fit, "f_pvalue") else None),
+                    "log_likelihood": (float(fit.llf) if hasattr(fit, "llf") else None),
+                    "coefficients": {
                         var: {
-
-
-
-
-
-
-
-                        }
-
+                            "coef": float(fit.params[var]),
+                            "std_err": float(fit.bse[var]),
+                            "t_value": (float(fit.tvalues[var]) if hasattr(fit, "tvalues") else None),
+                            "p_value": float(fit.pvalues[var]),
+                            "significant": fit.pvalues[var] < 0.05,
+                            "conf_lower": float(fit.conf_int().loc[var, 0]),
+                            "conf_upper": float(fit.conf_int().loc[var, 1]),
+                        }
+                        for var in fit.params.index
+                    },
                 }
-                return {
-
+                return {
+                    "summary_text": fit.summary().as_text(),
+                    "structured": result,
+                }
+            return {"summary": fit.summary().as_text()}
         except Exception as e:
             raise AnalysisError(f"Regression error: {str(e)}")

-    def time_series(
+    def time_series(
+        self,
+        file_path: str,
+        variable: str,
+        date_variable: Optional[str] = None,
+        model_type: str = "arima",
+        order: Optional[Tuple[int, int, int]] = (1, 1, 1),
+        seasonal_order: Optional[Tuple[int, int, int, int]] = None,
+        forecast_periods: int = 10,
+    ) -> Dict[str, Any]:
         """Perform time series analysis."""
         df = self._load_data(file_path)
         self._validate_variables(df, [variable] + ([date_variable] if date_variable else []))
-        from statsmodels.tsa.arima.model import ARIMA
-        from statsmodels.tsa.statespace.sarimax import SARIMAX
+        from statsmodels.tsa.arima.model import ARIMA  # type: ignore[import-untyped]
+        from statsmodels.tsa.statespace.sarimax import SARIMAX  # type: ignore[import-untyped]
+
         try:
             ts_data = df[variable].dropna()
             if date_variable and date_variable in df.columns:
                 ts_data.index = df[date_variable]
-            if model_type ==
+            if model_type == "arima":
                 model = ARIMA(ts_data, order=order)
                 fit = model.fit()
-                model_type_name =
-            elif model_type ==
+                model_type_name = "ARIMA"
+            elif model_type == "sarima":
                 if not seasonal_order:
                     raise AnalysisError("seasonal_order must be provided for SARIMA model")
                 model = SARIMAX(ts_data, order=order, seasonal_order=seasonal_order)
                 fit = model.fit(disp=False)
-                model_type_name =
+                model_type_name = "SARIMA"
             else:
                 raise AnalysisError(f"Unsupported time series model: {model_type}")
             forecast = fit.forecast(steps=forecast_periods)
             forecast_index = pd.date_range(
-                start=ts_data.index[-1] if isinstance(ts_data.index, pd.DatetimeIndex) else len(ts_data),
+                start=(ts_data.index[-1] if isinstance(ts_data.index, pd.DatetimeIndex) else len(ts_data)),
                 periods=forecast_periods + 1,
-                freq=
+                freq="D",
             )[1:]
             return {
-
-
-
-
-
-
-
-
+                "model_type": model_type_name,
+                "order": order,
+                "seasonal_order": (seasonal_order if model_type == "sarima" else None),
+                "aic": float(fit.aic),
+                "bic": float(fit.bic),
+                "forecast": {
+                    "values": (forecast.tolist() if isinstance(forecast, np.ndarray) else forecast.values.tolist()),
+                    "index": (forecast_index.strftime("%Y-%m-%d").tolist() if isinstance(forecast_index, pd.DatetimeIndex) else list(range(len(forecast)))),
                 },
-
+                "summary": str(fit.summary()),
             }
         except Exception as e:
             raise AnalysisError(f"Time series analysis error: {str(e)}")

-    def preprocess(
+    def preprocess(
+        self,
+        file_path: str,
+        variables: List[str],
+        operation: str,
+        scaler_type: ScalerType = ScalerType.STANDARD,
+        output_path: Optional[str] = None,
+    ) -> Dict[str, Any]:
         """Preprocess data with various operations."""
         df = self._load_data(file_path)
         self._validate_variables(df, variables)
         data = df[variables].copy()
-        result = {
-        if operation ==
-        from sklearn.preprocessing import
+        result: Dict[str, Any] = {"operation": operation}
+        if operation == "scale":
+            from sklearn.preprocessing import (  # type: ignore[import-untyped]
+                StandardScaler,
+                MinMaxScaler,
+                RobustScaler,
+            )
+
             scaler_map = {
                 ScalerType.STANDARD: (StandardScaler, "StandardScaler"),
                 ScalerType.MINMAX: (MinMaxScaler, "MinMaxScaler"),
-                ScalerType.ROBUST: (RobustScaler, "RobustScaler")
+                ScalerType.ROBUST: (RobustScaler, "RobustScaler"),
             }
             scaler_cls, scaler_name = scaler_map[scaler_type]
             scaler = scaler_cls()
             scaled_data = scaler.fit_transform(data)
-            scaled_df = pd.DataFrame(
-
-
-
-
-
-
+            scaled_df = pd.DataFrame(
+                scaled_data,
+                columns=[f"{col}_scaled" for col in data.columns],
+                index=data.index,
+            )
+            result.update(
+                {
+                    "scaler": scaler_name,
+                    "original_stats": data.describe().to_dict(),
+                    "scaled_stats": scaled_df.describe().to_dict(),
+                    "preview": scaled_df.head(5).to_dict(orient="records"),
+                }
+            )
             processed_df = scaled_df
-        elif operation ==
+        elif operation == "impute":
             import numpy as np
+
             imputed_df = data.copy()
             numeric_cols = data.select_dtypes(include=[np.number]).columns
             for col in numeric_cols:
@@ -498,16 +732,21 @@ class StatsTool(BaseTool):
             cat_cols = data.select_dtypes(exclude=[np.number]).columns
             for col in cat_cols:
                 imputed_df[col] = data[col].fillna(data[col].mode()[0] if not data[col].mode().empty else None)
-            result.update(
-
-
-
-
-
+            result.update(
+                {
+                    "imputation_method": {
+                        "numeric": "mean",
+                        "categorical": "mode",
+                    },
+                    "missing_counts_before": data.isna().sum().to_dict(),
+                    "missing_counts_after": imputed_df.isna().sum().to_dict(),
+                    "preview": imputed_df.head(5).to_dict(orient="records"),
+                }
+            )
             processed_df = imputed_df
         if output_path:
-            output_path = os.path.abspath(output_path) if os.path.isabs(output_path) else os.path.join(tempfile.gettempdir(),
+            output_path = os.path.abspath(output_path) if os.path.isabs(output_path) else os.path.join(tempfile.gettempdir(), "stats_outputs", output_path)
             os.makedirs(os.path.dirname(output_path), exist_ok=True)
             processed_df.to_csv(output_path)
-            result[
+            result["output_file"] = output_path
         return result
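Taken together, these hunks show the stats tool moving from a module-level StatsSettings class to a nested pydantic-settings Config plus per-operation Pydantic schemas, with operations exposed as typed methods. Below is a minimal sketch of how the refactored tool could be exercised. It is not an official usage example from the package: the module import path and the environment-variable override are assumptions inferred from the file layout and the STATS_TOOL_ prefix documented in the Config docstring, and the example data file name is hypothetical.

# Sketch only: exercises StatsTool based on the signatures added in the diff above.
# The module path, env-var override, and "survey.sav" file are assumptions.
import os

# Assumed override, picked up via the STATS_TOOL_ prefix documented in Config.
os.environ["STATS_TOOL_MAX_FILE_SIZE_MB"] = "500"

from aiecs.tools.task_tools.stats_tool import StatsTool  # assumed module path

# Per the new __init__ docstring, configuration resolution is handled by BaseTool.
tool = StatsTool()

# describe() and ttest() follow the keyword signatures shown in the diff.
summary = tool.describe("survey.sav", variables=["age", "income"])
result = tool.ttest("survey.sav", var1="score_pre", var2="score_post", paired=True)

# ttest() returns StatsResult.to_dict(), so "pvalue" and "significant" are present.
print(result["pvalue"], result["significant"])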