aiecs 1.0.1__py3-none-any.whl → 1.7.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of aiecs might be problematic.
- aiecs/__init__.py +13 -16
- aiecs/__main__.py +7 -7
- aiecs/aiecs_client.py +269 -75
- aiecs/application/executors/operation_executor.py +79 -54
- aiecs/application/knowledge_graph/__init__.py +7 -0
- aiecs/application/knowledge_graph/builder/__init__.py +37 -0
- aiecs/application/knowledge_graph/builder/data_quality.py +302 -0
- aiecs/application/knowledge_graph/builder/data_reshaping.py +293 -0
- aiecs/application/knowledge_graph/builder/document_builder.py +369 -0
- aiecs/application/knowledge_graph/builder/graph_builder.py +490 -0
- aiecs/application/knowledge_graph/builder/import_optimizer.py +396 -0
- aiecs/application/knowledge_graph/builder/schema_inference.py +462 -0
- aiecs/application/knowledge_graph/builder/schema_mapping.py +563 -0
- aiecs/application/knowledge_graph/builder/structured_pipeline.py +1384 -0
- aiecs/application/knowledge_graph/builder/text_chunker.py +317 -0
- aiecs/application/knowledge_graph/extractors/__init__.py +27 -0
- aiecs/application/knowledge_graph/extractors/base.py +98 -0
- aiecs/application/knowledge_graph/extractors/llm_entity_extractor.py +422 -0
- aiecs/application/knowledge_graph/extractors/llm_relation_extractor.py +347 -0
- aiecs/application/knowledge_graph/extractors/ner_entity_extractor.py +241 -0
- aiecs/application/knowledge_graph/fusion/__init__.py +78 -0
- aiecs/application/knowledge_graph/fusion/ab_testing.py +395 -0
- aiecs/application/knowledge_graph/fusion/abbreviation_expander.py +327 -0
- aiecs/application/knowledge_graph/fusion/alias_index.py +597 -0
- aiecs/application/knowledge_graph/fusion/alias_matcher.py +384 -0
- aiecs/application/knowledge_graph/fusion/cache_coordinator.py +343 -0
- aiecs/application/knowledge_graph/fusion/entity_deduplicator.py +433 -0
- aiecs/application/knowledge_graph/fusion/entity_linker.py +511 -0
- aiecs/application/knowledge_graph/fusion/evaluation_dataset.py +240 -0
- aiecs/application/knowledge_graph/fusion/knowledge_fusion.py +632 -0
- aiecs/application/knowledge_graph/fusion/matching_config.py +489 -0
- aiecs/application/knowledge_graph/fusion/name_normalizer.py +352 -0
- aiecs/application/knowledge_graph/fusion/relation_deduplicator.py +183 -0
- aiecs/application/knowledge_graph/fusion/semantic_name_matcher.py +464 -0
- aiecs/application/knowledge_graph/fusion/similarity_pipeline.py +534 -0
- aiecs/application/knowledge_graph/pattern_matching/__init__.py +21 -0
- aiecs/application/knowledge_graph/pattern_matching/pattern_matcher.py +342 -0
- aiecs/application/knowledge_graph/pattern_matching/query_executor.py +366 -0
- aiecs/application/knowledge_graph/profiling/__init__.py +12 -0
- aiecs/application/knowledge_graph/profiling/query_plan_visualizer.py +195 -0
- aiecs/application/knowledge_graph/profiling/query_profiler.py +223 -0
- aiecs/application/knowledge_graph/reasoning/__init__.py +27 -0
- aiecs/application/knowledge_graph/reasoning/evidence_synthesis.py +341 -0
- aiecs/application/knowledge_graph/reasoning/inference_engine.py +500 -0
- aiecs/application/knowledge_graph/reasoning/logic_form_parser.py +163 -0
- aiecs/application/knowledge_graph/reasoning/logic_parser/__init__.py +79 -0
- aiecs/application/knowledge_graph/reasoning/logic_parser/ast_builder.py +513 -0
- aiecs/application/knowledge_graph/reasoning/logic_parser/ast_nodes.py +913 -0
- aiecs/application/knowledge_graph/reasoning/logic_parser/ast_validator.py +866 -0
- aiecs/application/knowledge_graph/reasoning/logic_parser/error_handler.py +475 -0
- aiecs/application/knowledge_graph/reasoning/logic_parser/parser.py +396 -0
- aiecs/application/knowledge_graph/reasoning/logic_parser/query_context.py +208 -0
- aiecs/application/knowledge_graph/reasoning/logic_query_integration.py +170 -0
- aiecs/application/knowledge_graph/reasoning/query_planner.py +855 -0
- aiecs/application/knowledge_graph/reasoning/reasoning_engine.py +518 -0
- aiecs/application/knowledge_graph/retrieval/__init__.py +27 -0
- aiecs/application/knowledge_graph/retrieval/query_intent_classifier.py +211 -0
- aiecs/application/knowledge_graph/retrieval/retrieval_strategies.py +592 -0
- aiecs/application/knowledge_graph/retrieval/strategy_types.py +23 -0
- aiecs/application/knowledge_graph/search/__init__.py +59 -0
- aiecs/application/knowledge_graph/search/hybrid_search.py +457 -0
- aiecs/application/knowledge_graph/search/reranker.py +293 -0
- aiecs/application/knowledge_graph/search/reranker_strategies.py +535 -0
- aiecs/application/knowledge_graph/search/text_similarity.py +392 -0
- aiecs/application/knowledge_graph/traversal/__init__.py +15 -0
- aiecs/application/knowledge_graph/traversal/enhanced_traversal.py +305 -0
- aiecs/application/knowledge_graph/traversal/path_scorer.py +271 -0
- aiecs/application/knowledge_graph/validators/__init__.py +13 -0
- aiecs/application/knowledge_graph/validators/relation_validator.py +239 -0
- aiecs/application/knowledge_graph/visualization/__init__.py +11 -0
- aiecs/application/knowledge_graph/visualization/graph_visualizer.py +313 -0
- aiecs/common/__init__.py +9 -0
- aiecs/common/knowledge_graph/__init__.py +17 -0
- aiecs/common/knowledge_graph/runnable.py +471 -0
- aiecs/config/__init__.py +20 -5
- aiecs/config/config.py +762 -31
- aiecs/config/graph_config.py +131 -0
- aiecs/config/tool_config.py +399 -0
- aiecs/core/__init__.py +29 -13
- aiecs/core/interface/__init__.py +2 -2
- aiecs/core/interface/execution_interface.py +22 -22
- aiecs/core/interface/storage_interface.py +37 -88
- aiecs/core/registry/__init__.py +31 -0
- aiecs/core/registry/service_registry.py +92 -0
- aiecs/domain/__init__.py +270 -1
- aiecs/domain/agent/__init__.py +191 -0
- aiecs/domain/agent/base_agent.py +3870 -0
- aiecs/domain/agent/exceptions.py +99 -0
- aiecs/domain/agent/graph_aware_mixin.py +569 -0
- aiecs/domain/agent/hybrid_agent.py +1435 -0
- aiecs/domain/agent/integration/__init__.py +29 -0
- aiecs/domain/agent/integration/context_compressor.py +216 -0
- aiecs/domain/agent/integration/context_engine_adapter.py +587 -0
- aiecs/domain/agent/integration/protocols.py +281 -0
- aiecs/domain/agent/integration/retry_policy.py +218 -0
- aiecs/domain/agent/integration/role_config.py +213 -0
- aiecs/domain/agent/knowledge_aware_agent.py +1892 -0
- aiecs/domain/agent/lifecycle.py +291 -0
- aiecs/domain/agent/llm_agent.py +692 -0
- aiecs/domain/agent/memory/__init__.py +12 -0
- aiecs/domain/agent/memory/conversation.py +1124 -0
- aiecs/domain/agent/migration/__init__.py +14 -0
- aiecs/domain/agent/migration/conversion.py +163 -0
- aiecs/domain/agent/migration/legacy_wrapper.py +86 -0
- aiecs/domain/agent/models.py +884 -0
- aiecs/domain/agent/observability.py +479 -0
- aiecs/domain/agent/persistence.py +449 -0
- aiecs/domain/agent/prompts/__init__.py +29 -0
- aiecs/domain/agent/prompts/builder.py +159 -0
- aiecs/domain/agent/prompts/formatters.py +187 -0
- aiecs/domain/agent/prompts/template.py +255 -0
- aiecs/domain/agent/registry.py +253 -0
- aiecs/domain/agent/tool_agent.py +444 -0
- aiecs/domain/agent/tools/__init__.py +15 -0
- aiecs/domain/agent/tools/schema_generator.py +364 -0
- aiecs/domain/community/__init__.py +155 -0
- aiecs/domain/community/agent_adapter.py +469 -0
- aiecs/domain/community/analytics.py +432 -0
- aiecs/domain/community/collaborative_workflow.py +648 -0
- aiecs/domain/community/communication_hub.py +634 -0
- aiecs/domain/community/community_builder.py +320 -0
- aiecs/domain/community/community_integration.py +796 -0
- aiecs/domain/community/community_manager.py +803 -0
- aiecs/domain/community/decision_engine.py +849 -0
- aiecs/domain/community/exceptions.py +231 -0
- aiecs/domain/community/models/__init__.py +33 -0
- aiecs/domain/community/models/community_models.py +234 -0
- aiecs/domain/community/resource_manager.py +461 -0
- aiecs/domain/community/shared_context_manager.py +589 -0
- aiecs/domain/context/__init__.py +40 -10
- aiecs/domain/context/context_engine.py +1910 -0
- aiecs/domain/context/conversation_models.py +87 -53
- aiecs/domain/context/graph_memory.py +582 -0
- aiecs/domain/execution/model.py +12 -4
- aiecs/domain/knowledge_graph/__init__.py +19 -0
- aiecs/domain/knowledge_graph/models/__init__.py +52 -0
- aiecs/domain/knowledge_graph/models/entity.py +148 -0
- aiecs/domain/knowledge_graph/models/evidence.py +178 -0
- aiecs/domain/knowledge_graph/models/inference_rule.py +184 -0
- aiecs/domain/knowledge_graph/models/path.py +171 -0
- aiecs/domain/knowledge_graph/models/path_pattern.py +171 -0
- aiecs/domain/knowledge_graph/models/query.py +261 -0
- aiecs/domain/knowledge_graph/models/query_plan.py +181 -0
- aiecs/domain/knowledge_graph/models/relation.py +202 -0
- aiecs/domain/knowledge_graph/schema/__init__.py +23 -0
- aiecs/domain/knowledge_graph/schema/entity_type.py +131 -0
- aiecs/domain/knowledge_graph/schema/graph_schema.py +253 -0
- aiecs/domain/knowledge_graph/schema/property_schema.py +143 -0
- aiecs/domain/knowledge_graph/schema/relation_type.py +163 -0
- aiecs/domain/knowledge_graph/schema/schema_manager.py +691 -0
- aiecs/domain/knowledge_graph/schema/type_enums.py +209 -0
- aiecs/domain/task/dsl_processor.py +172 -56
- aiecs/domain/task/model.py +20 -8
- aiecs/domain/task/task_context.py +27 -24
- aiecs/infrastructure/__init__.py +0 -2
- aiecs/infrastructure/graph_storage/__init__.py +11 -0
- aiecs/infrastructure/graph_storage/base.py +837 -0
- aiecs/infrastructure/graph_storage/batch_operations.py +458 -0
- aiecs/infrastructure/graph_storage/cache.py +424 -0
- aiecs/infrastructure/graph_storage/distributed.py +223 -0
- aiecs/infrastructure/graph_storage/error_handling.py +380 -0
- aiecs/infrastructure/graph_storage/graceful_degradation.py +294 -0
- aiecs/infrastructure/graph_storage/health_checks.py +378 -0
- aiecs/infrastructure/graph_storage/in_memory.py +1197 -0
- aiecs/infrastructure/graph_storage/index_optimization.py +446 -0
- aiecs/infrastructure/graph_storage/lazy_loading.py +431 -0
- aiecs/infrastructure/graph_storage/metrics.py +344 -0
- aiecs/infrastructure/graph_storage/migration.py +400 -0
- aiecs/infrastructure/graph_storage/pagination.py +483 -0
- aiecs/infrastructure/graph_storage/performance_monitoring.py +456 -0
- aiecs/infrastructure/graph_storage/postgres.py +1563 -0
- aiecs/infrastructure/graph_storage/property_storage.py +353 -0
- aiecs/infrastructure/graph_storage/protocols.py +76 -0
- aiecs/infrastructure/graph_storage/query_optimizer.py +642 -0
- aiecs/infrastructure/graph_storage/schema_cache.py +290 -0
- aiecs/infrastructure/graph_storage/sqlite.py +1373 -0
- aiecs/infrastructure/graph_storage/streaming.py +487 -0
- aiecs/infrastructure/graph_storage/tenant.py +412 -0
- aiecs/infrastructure/messaging/celery_task_manager.py +92 -54
- aiecs/infrastructure/messaging/websocket_manager.py +51 -35
- aiecs/infrastructure/monitoring/__init__.py +22 -0
- aiecs/infrastructure/monitoring/executor_metrics.py +45 -11
- aiecs/infrastructure/monitoring/global_metrics_manager.py +212 -0
- aiecs/infrastructure/monitoring/structured_logger.py +3 -7
- aiecs/infrastructure/monitoring/tracing_manager.py +63 -35
- aiecs/infrastructure/persistence/__init__.py +14 -1
- aiecs/infrastructure/persistence/context_engine_client.py +184 -0
- aiecs/infrastructure/persistence/database_manager.py +67 -43
- aiecs/infrastructure/persistence/file_storage.py +180 -103
- aiecs/infrastructure/persistence/redis_client.py +74 -21
- aiecs/llm/__init__.py +73 -25
- aiecs/llm/callbacks/__init__.py +11 -0
- aiecs/llm/{custom_callbacks.py → callbacks/custom_callbacks.py} +26 -19
- aiecs/llm/client_factory.py +224 -36
- aiecs/llm/client_resolver.py +155 -0
- aiecs/llm/clients/__init__.py +38 -0
- aiecs/llm/clients/base_client.py +324 -0
- aiecs/llm/clients/google_function_calling_mixin.py +457 -0
- aiecs/llm/clients/googleai_client.py +241 -0
- aiecs/llm/clients/openai_client.py +158 -0
- aiecs/llm/clients/openai_compatible_mixin.py +367 -0
- aiecs/llm/clients/vertex_client.py +897 -0
- aiecs/llm/clients/xai_client.py +201 -0
- aiecs/llm/config/__init__.py +51 -0
- aiecs/llm/config/config_loader.py +272 -0
- aiecs/llm/config/config_validator.py +206 -0
- aiecs/llm/config/model_config.py +143 -0
- aiecs/llm/protocols.py +149 -0
- aiecs/llm/utils/__init__.py +10 -0
- aiecs/llm/utils/validate_config.py +89 -0
- aiecs/main.py +140 -121
- aiecs/scripts/aid/VERSION_MANAGEMENT.md +138 -0
- aiecs/scripts/aid/__init__.py +19 -0
- aiecs/scripts/aid/module_checker.py +499 -0
- aiecs/scripts/aid/version_manager.py +235 -0
- aiecs/scripts/{DEPENDENCY_SYSTEM_SUMMARY.md → dependance_check/DEPENDENCY_SYSTEM_SUMMARY.md} +1 -0
- aiecs/scripts/{README_DEPENDENCY_CHECKER.md → dependance_check/README_DEPENDENCY_CHECKER.md} +1 -0
- aiecs/scripts/dependance_check/__init__.py +15 -0
- aiecs/scripts/dependance_check/dependency_checker.py +1835 -0
- aiecs/scripts/{dependency_fixer.py → dependance_check/dependency_fixer.py} +192 -90
- aiecs/scripts/{download_nlp_data.py → dependance_check/download_nlp_data.py} +203 -71
- aiecs/scripts/dependance_patch/__init__.py +7 -0
- aiecs/scripts/dependance_patch/fix_weasel/__init__.py +11 -0
- aiecs/scripts/{fix_weasel_validator.py → dependance_patch/fix_weasel/fix_weasel_validator.py} +21 -14
- aiecs/scripts/{patch_weasel_library.sh → dependance_patch/fix_weasel/patch_weasel_library.sh} +1 -1
- aiecs/scripts/knowledge_graph/__init__.py +3 -0
- aiecs/scripts/knowledge_graph/run_threshold_experiments.py +212 -0
- aiecs/scripts/migrations/multi_tenancy/README.md +142 -0
- aiecs/scripts/tools_develop/README.md +671 -0
- aiecs/scripts/tools_develop/README_CONFIG_CHECKER.md +273 -0
- aiecs/scripts/tools_develop/TOOLS_CONFIG_GUIDE.md +1287 -0
- aiecs/scripts/tools_develop/TOOL_AUTO_DISCOVERY.md +234 -0
- aiecs/scripts/tools_develop/__init__.py +21 -0
- aiecs/scripts/tools_develop/check_all_tools_config.py +548 -0
- aiecs/scripts/tools_develop/check_type_annotations.py +257 -0
- aiecs/scripts/tools_develop/pre-commit-schema-coverage.sh +66 -0
- aiecs/scripts/tools_develop/schema_coverage.py +511 -0
- aiecs/scripts/tools_develop/validate_tool_schemas.py +475 -0
- aiecs/scripts/tools_develop/verify_executor_config_fix.py +98 -0
- aiecs/scripts/tools_develop/verify_tools.py +352 -0
- aiecs/tasks/__init__.py +0 -1
- aiecs/tasks/worker.py +115 -47
- aiecs/tools/__init__.py +194 -72
- aiecs/tools/apisource/__init__.py +99 -0
- aiecs/tools/apisource/intelligence/__init__.py +19 -0
- aiecs/tools/apisource/intelligence/data_fusion.py +632 -0
- aiecs/tools/apisource/intelligence/query_analyzer.py +417 -0
- aiecs/tools/apisource/intelligence/search_enhancer.py +385 -0
- aiecs/tools/apisource/monitoring/__init__.py +9 -0
- aiecs/tools/apisource/monitoring/metrics.py +330 -0
- aiecs/tools/apisource/providers/__init__.py +112 -0
- aiecs/tools/apisource/providers/base.py +671 -0
- aiecs/tools/apisource/providers/census.py +397 -0
- aiecs/tools/apisource/providers/fred.py +535 -0
- aiecs/tools/apisource/providers/newsapi.py +409 -0
- aiecs/tools/apisource/providers/worldbank.py +352 -0
- aiecs/tools/apisource/reliability/__init__.py +12 -0
- aiecs/tools/apisource/reliability/error_handler.py +363 -0
- aiecs/tools/apisource/reliability/fallback_strategy.py +376 -0
- aiecs/tools/apisource/tool.py +832 -0
- aiecs/tools/apisource/utils/__init__.py +9 -0
- aiecs/tools/apisource/utils/validators.py +334 -0
- aiecs/tools/base_tool.py +415 -21
- aiecs/tools/docs/__init__.py +121 -0
- aiecs/tools/docs/ai_document_orchestrator.py +607 -0
- aiecs/tools/docs/ai_document_writer_orchestrator.py +2350 -0
- aiecs/tools/docs/content_insertion_tool.py +1320 -0
- aiecs/tools/docs/document_creator_tool.py +1323 -0
- aiecs/tools/docs/document_layout_tool.py +1160 -0
- aiecs/tools/docs/document_parser_tool.py +1011 -0
- aiecs/tools/docs/document_writer_tool.py +1829 -0
- aiecs/tools/knowledge_graph/__init__.py +17 -0
- aiecs/tools/knowledge_graph/graph_reasoning_tool.py +807 -0
- aiecs/tools/knowledge_graph/graph_search_tool.py +944 -0
- aiecs/tools/knowledge_graph/kg_builder_tool.py +524 -0
- aiecs/tools/langchain_adapter.py +300 -138
- aiecs/tools/schema_generator.py +455 -0
- aiecs/tools/search_tool/__init__.py +100 -0
- aiecs/tools/search_tool/analyzers.py +581 -0
- aiecs/tools/search_tool/cache.py +264 -0
- aiecs/tools/search_tool/constants.py +128 -0
- aiecs/tools/search_tool/context.py +224 -0
- aiecs/tools/search_tool/core.py +778 -0
- aiecs/tools/search_tool/deduplicator.py +119 -0
- aiecs/tools/search_tool/error_handler.py +242 -0
- aiecs/tools/search_tool/metrics.py +343 -0
- aiecs/tools/search_tool/rate_limiter.py +172 -0
- aiecs/tools/search_tool/schemas.py +275 -0
- aiecs/tools/statistics/__init__.py +80 -0
- aiecs/tools/statistics/ai_data_analysis_orchestrator.py +646 -0
- aiecs/tools/statistics/ai_insight_generator_tool.py +508 -0
- aiecs/tools/statistics/ai_report_orchestrator_tool.py +684 -0
- aiecs/tools/statistics/data_loader_tool.py +555 -0
- aiecs/tools/statistics/data_profiler_tool.py +638 -0
- aiecs/tools/statistics/data_transformer_tool.py +580 -0
- aiecs/tools/statistics/data_visualizer_tool.py +498 -0
- aiecs/tools/statistics/model_trainer_tool.py +507 -0
- aiecs/tools/statistics/statistical_analyzer_tool.py +472 -0
- aiecs/tools/task_tools/__init__.py +49 -36
- aiecs/tools/task_tools/chart_tool.py +200 -184
- aiecs/tools/task_tools/classfire_tool.py +268 -267
- aiecs/tools/task_tools/image_tool.py +175 -131
- aiecs/tools/task_tools/office_tool.py +226 -146
- aiecs/tools/task_tools/pandas_tool.py +477 -121
- aiecs/tools/task_tools/report_tool.py +390 -142
- aiecs/tools/task_tools/research_tool.py +149 -79
- aiecs/tools/task_tools/scraper_tool.py +339 -145
- aiecs/tools/task_tools/stats_tool.py +448 -209
- aiecs/tools/temp_file_manager.py +26 -24
- aiecs/tools/tool_executor/__init__.py +18 -16
- aiecs/tools/tool_executor/tool_executor.py +364 -52
- aiecs/utils/LLM_output_structor.py +74 -48
- aiecs/utils/__init__.py +14 -3
- aiecs/utils/base_callback.py +0 -3
- aiecs/utils/cache_provider.py +696 -0
- aiecs/utils/execution_utils.py +50 -31
- aiecs/utils/prompt_loader.py +1 -0
- aiecs/utils/token_usage_repository.py +37 -11
- aiecs/ws/socket_server.py +14 -4
- {aiecs-1.0.1.dist-info → aiecs-1.7.6.dist-info}/METADATA +52 -15
- aiecs-1.7.6.dist-info/RECORD +337 -0
- aiecs-1.7.6.dist-info/entry_points.txt +13 -0
- aiecs/config/registry.py +0 -19
- aiecs/domain/context/content_engine.py +0 -982
- aiecs/llm/base_client.py +0 -99
- aiecs/llm/openai_client.py +0 -125
- aiecs/llm/vertex_client.py +0 -186
- aiecs/llm/xai_client.py +0 -184
- aiecs/scripts/dependency_checker.py +0 -857
- aiecs/scripts/quick_dependency_check.py +0 -269
- aiecs/tools/task_tools/search_api.py +0 -7
- aiecs-1.0.1.dist-info/RECORD +0 -90
- aiecs-1.0.1.dist-info/entry_points.txt +0 -7
- /aiecs/scripts/{setup_nlp_data.sh → dependance_check/setup_nlp_data.sh} +0 -0
- /aiecs/scripts/{README_WEASEL_PATCH.md → dependance_patch/fix_weasel/README_WEASEL_PATCH.md} +0 -0
- /aiecs/scripts/{fix_weasel_validator.sh → dependance_patch/fix_weasel/fix_weasel_validator.sh} +0 -0
- /aiecs/scripts/{run_weasel_patch.sh → dependance_patch/fix_weasel/run_weasel_patch.sh} +0 -0
- {aiecs-1.0.1.dist-info → aiecs-1.7.6.dist-info}/WHEEL +0 -0
- {aiecs-1.0.1.dist-info → aiecs-1.7.6.dist-info}/licenses/LICENSE +0 -0
- {aiecs-1.0.1.dist-info → aiecs-1.7.6.dist-info}/top_level.txt +0 -0
aiecs/application/executors/operation_executor.py

@@ -1,6 +1,6 @@
 import asyncio
 import logging
-from typing import Dict, List, Any
+from typing import Dict, List, Any
 from aiecs.tools import get_tool
 from aiecs.tools.tool_executor import ToolExecutor
 from aiecs.utils.execution_utils import ExecutionUtils
@@ -14,27 +14,33 @@ class OperationExecutor:
     Core logic for handling operation execution
     """

-    def __init__(
+    def __init__(
+        self,
+        tool_executor: ToolExecutor,
+        execution_utils: ExecutionUtils,
+        config: Dict[str, Any],
+    ):
         self.tool_executor = tool_executor
         self.execution_utils = execution_utils
         self.config = config
-        self._tool_instances = {}
-        self.semaphore = asyncio.Semaphore(config.get(
+        self._tool_instances: Dict[str, Any] = {}
+        self.semaphore = asyncio.Semaphore(config.get("rate_limit_requests_per_second", 5))

     def _filter_tool_params(self, params: Dict[str, Any]) -> Dict[str, Any]:
         """
         Filter out system-related parameters, keeping only parameters needed by tool methods
         """
         # System-related parameters that should not be passed to tool methods
-        system_params = {
+        system_params = {"user_id", "task_id", "op"}
         return {k: v for k, v in params.items() if k not in system_params}

     def _filter_tool_call_params(self, params: Dict[str, Any]) -> Dict[str, Any]:
         """
         Filter out system-related parameters in tool calls, but keep 'op' parameter (needed by BaseTool.run())
         """
-        # Only filter user and task IDs, keep 'op' parameter for BaseTool.run()
-
+        # Only filter user and task IDs, keep 'op' parameter for BaseTool.run()
+        # to use
+        system_params = {"user_id", "task_id"}
         return {k: v for k, v in params.items() if k not in system_params}

     async def execute_operation(self, operation_spec: str, params: Dict[str, Any]) -> Any:
@@ -44,7 +50,9 @@ class OperationExecutor:
         if "." not in operation_spec:
             raise ValueError(f"Invalid operation spec: {operation_spec}, expected 'tool_name.operation_name'")

-
+        parts = operation_spec.split(".", 1)
+        tool_name: str = parts[0]
+        operation_name: str = parts[1]

         # Get or create tool instance
         if tool_name not in self._tool_instances:
@@ -69,29 +77,37 @@ class OperationExecutor:
         Batch execute operations with rate limiting
         """
         results = []
-        batch_size = self.config.get(
-        rate_limit = self.config.get(
+        batch_size = self.config.get("batch_size", 10)
+        rate_limit = self.config.get("rate_limit_requests_per_second", 5)

         for i in range(0, len(operations), batch_size):
-            batch = operations[i:i + batch_size]
+            batch = operations[i : i + batch_size]
             batch_results = await asyncio.gather(
                 *[self.execute_operation(op["operation"], op.get("params", {})) for op in batch],
-                return_exceptions=True
+                return_exceptions=True,
             )
             results.extend(batch_results)
             await asyncio.sleep(1.0 / rate_limit)

         return results

-    async def execute_operations_sequence(
-
+    async def execute_operations_sequence(
+        self,
+        operations: List[Dict[str, Any]],
+        user_id: str,
+        task_id: str,
+        stop_on_failure: bool = False,
+        save_callback=None,
+    ) -> List[TaskStepResult]:
         """
         Execute operations sequence sequentially, with option to stop on failure
         """
-        results = []
+        results: List[TaskStepResult] = []

         for step, op_info in enumerate(operations):
             operation_spec = op_info.get("operation")
+            if not isinstance(operation_spec, str):
+                raise ValueError(f"Invalid operation spec: {operation_spec}, expected string")
             params = op_info.get("params", {})

             # Process parameter references
@@ -104,7 +120,7 @@ class OperationExecutor:
                     result=result,
                     completed=True,
                     message=f"Completed operation {operation_spec}",
-                    status=TaskStatus.COMPLETED.value
+                    status=TaskStatus.COMPLETED.value,
                 )
             except Exception as e:
                 step_result = TaskStepResult(
@@ -114,7 +130,7 @@ class OperationExecutor:
                     message=f"Failed to execute {operation_spec}",
                     status=TaskStatus.FAILED.value,
                     error_code=ErrorCode.EXECUTION_ERROR.value,
-                    error_message=str(e)
+                    error_message=str(e),
                 )

                 if stop_on_failure:
@@ -138,9 +154,9 @@ class OperationExecutor:
         processed = {}

         for name, value in params.items():
-            if isinstance(value, str) and value.startswith(
+            if isinstance(value, str) and value.startswith("$result["):
                 try:
-                    ref_parts = value[8:].split(
+                    ref_parts = value[8:].split("]", 1)
                     idx = int(ref_parts[0])

                     if idx >= len(results):
@@ -148,9 +164,10 @@ class OperationExecutor:

                     ref_value = results[idx].result

-                    # Handle nested attribute access, such as
-
-
+                    # Handle nested attribute access, such as
+                    # $result[0].data.field
+                    if len(ref_parts) > 1 and ref_parts[1].startswith("."):
+                        for attr in ref_parts[1][1:].split("."):
                             if attr:
                                 if isinstance(ref_value, dict):
                                     ref_value = ref_value.get(attr)
@@ -171,14 +188,14 @@ class OperationExecutor:
         Execute batch tool calls with rate limiting
         """
         results = []
-        batch_size = self.config.get(
-        rate_limit = self.config.get(
+        batch_size = self.config.get("batch_size", 10)
+        rate_limit = self.config.get("rate_limit_requests_per_second", 5)

         for i in range(0, len(tool_calls), batch_size):
-            batch = tool_calls[i:i + batch_size]
+            batch = tool_calls[i : i + batch_size]
             batch_results = await asyncio.gather(
                 *[self._execute_tool_call(call, tool_executor_func) for call in batch],
-                return_exceptions=True
+                return_exceptions=True,
             )
             results.extend(batch_results)
             await asyncio.sleep(1.0 / rate_limit)
@@ -190,11 +207,14 @@ class OperationExecutor:
         Execute a single tool call with rate limiting
         """
         async with self.semaphore:
-
+            tool_name_raw = call.get("tool")
+            if not isinstance(tool_name_raw, str):
+                raise ValueError(f"Invalid tool name: {tool_name_raw}, expected string")
+            tool_name: str = tool_name_raw
             params = call.get("params", {})

             # Use context-aware caching
-            if self.config.get(
+            if self.config.get("enable_cache", True):
                 user_id = params.get("user_id", "anonymous")
                 task_id = params.get("task_id", "none")
                 cache_key = self.execution_utils.generate_cache_key("tool_call", user_id, task_id, (), params)
@@ -211,14 +231,16 @@ class OperationExecutor:
             if tool_name not in self._tool_instances:
                 self._tool_instances[tool_name] = get_tool(tool_name)
             tool = self._tool_instances[tool_name]
-
-            # Filter parameters, remove system-related parameters (but keep
+
+            # Filter parameters, remove system-related parameters (but keep
+            # 'op' parameter)
             tool_params = self._filter_tool_call_params(params)
-            # Execute through BaseTool.run method, passing filtered
+            # Execute through BaseTool.run method, passing filtered
+            # parameters
             result = await self.tool_executor.execute_async(tool, "run", **tool_params)

             # Cache result
-            if self.config.get(
+            if self.config.get("enable_cache", True):
                 self.execution_utils.add_to_cache(cache_key, result)

             return result
@@ -230,7 +252,7 @@ class OperationExecutor:
         import re

         tool_calls = []
-        tool_pattern = r
+        tool_pattern = r"\{\{(\w+)\((.*?)\)\}\}"
         matches = re.finditer(tool_pattern, description)

         for match in matches:
@@ -256,10 +278,7 @@ class OperationExecutor:

                 params[param_name] = param_value

-            tool_calls.append({
-                "tool": tool_name,
-                "params": params
-            })
+            tool_calls.append({"tool": tool_name, "params": params})

         return tool_calls

@@ -271,9 +290,11 @@ class OperationExecutor:

         for i, op_info in enumerate(operations):
             operation_spec = op_info.get("operation")
+            if not isinstance(operation_spec, str):
+                raise ValueError(f"Invalid operation spec: {operation_spec}, expected string")
             params = op_info.get("params", {})

-            async def execute_single_op(spec, p, index):
+            async def execute_single_op(spec: str, p: Dict[str, Any], index: int) -> TaskStepResult:
                 try:
                     result = await self.execute_operation(spec, p)
                     return TaskStepResult(
@@ -281,7 +302,7 @@ class OperationExecutor:
                         result=result,
                         completed=True,
                         message=f"Completed parallel operation {spec}",
-                        status=TaskStatus.COMPLETED.value
+                        status=TaskStatus.COMPLETED.value,
                     )
                 except Exception as e:
                     return TaskStepResult(
@@ -291,7 +312,7 @@ class OperationExecutor:
                         message=f"Failed parallel operation {spec}",
                         status=TaskStatus.FAILED.value,
                         error_code=ErrorCode.EXECUTION_ERROR.value,
-                        error_message=str(e)
+                        error_message=str(e),
                     )

             tasks.append(execute_single_op(operation_spec, params, i))
@@ -299,19 +320,23 @@ class OperationExecutor:
         results = await asyncio.gather(*tasks, return_exceptions=True)

         # Handle exception results
-        processed_results = []
+        processed_results: List[TaskStepResult] = []
         for i, result in enumerate(results):
             if isinstance(result, Exception):
-                processed_results.append(
-
-
-
-
-
-
-
-
+                processed_results.append(
+                    TaskStepResult(
+                        step=f"parallel_{i}_error",
+                        result=None,
+                        completed=False,
+                        message="Parallel operation failed with exception",
+                        status=TaskStatus.FAILED.value,
+                        error_code=ErrorCode.EXECUTION_ERROR.value,
+                        error_message=str(result),
+                    )
+                )
             else:
+                # result is TaskStepResult here because execute_single_op always returns TaskStepResult
+                assert isinstance(result, TaskStepResult), f"Expected TaskStepResult, got {type(result)}"
                 processed_results.append(result)

         return processed_results
@@ -334,8 +359,8 @@ class OperationExecutor:
             "tool_names": list(self._tool_instances.keys()),
             "semaphore_value": self.semaphore._value,
             "config": {
-                "batch_size": self.config.get(
-                "rate_limit": self.config.get(
-                "enable_cache": self.config.get(
-            }
+                "batch_size": self.config.get("batch_size", 10),
+                "rate_limit": self.config.get("rate_limit_requests_per_second", 5),
+                "enable_cache": self.config.get("enable_cache", True),
+            },
         }
aiecs/application/knowledge_graph/builder/__init__.py

@@ -0,0 +1,37 @@
+"""
+Knowledge Graph Builder Pipeline
+
+Orchestrates document-to-graph conversion workflow.
+"""
+
+from aiecs.application.knowledge_graph.builder.graph_builder import (
+    GraphBuilder,
+)
+from aiecs.application.knowledge_graph.builder.document_builder import (
+    DocumentGraphBuilder,
+)
+from aiecs.application.knowledge_graph.builder.text_chunker import TextChunker
+from aiecs.application.knowledge_graph.builder.schema_mapping import (
+    SchemaMapping,
+    EntityMapping,
+    RelationMapping,
+    PropertyTransformation,
+    TransformationType,
+)
+from aiecs.application.knowledge_graph.builder.structured_pipeline import (
+    StructuredDataPipeline,
+    ImportResult,
+)
+
+__all__ = [
+    "GraphBuilder",
+    "DocumentGraphBuilder",
+    "TextChunker",
+    "SchemaMapping",
+    "EntityMapping",
+    "RelationMapping",
+    "PropertyTransformation",
+    "TransformationType",
+    "StructuredDataPipeline",
+    "ImportResult",
+]
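
Since this __init__.py re-exports the builder components, callers can import them from the package root rather than the individual modules, for example:

from aiecs.application.knowledge_graph.builder import (
    GraphBuilder,
    StructuredDataPipeline,
    TextChunker,
)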
aiecs/application/knowledge_graph/builder/data_quality.py

@@ -0,0 +1,302 @@
+"""
+Data Quality Validation for Knowledge Graph Import
+
+Provides validation capabilities to ensure data quality during import,
+including range validation, outlier detection, completeness checks, and
+type consistency validation.
+"""
+
+from typing import Dict, List, Optional, Any, Set, Union
+from dataclasses import dataclass, field
+from enum import Enum
+import logging
+
+logger = logging.getLogger(__name__)
+
+# Check for pandas and numpy availability
+try:
+    import pandas as pd
+    import numpy as np
+    PANDAS_AVAILABLE = True
+except ImportError:
+    PANDAS_AVAILABLE = False
+
+
+class ViolationType(Enum):
+    """Types of data quality violations"""
+    RANGE_VIOLATION = "range_violation"
+    OUTLIER = "outlier"
+    MISSING_VALUE = "missing_value"
+    TYPE_MISMATCH = "type_mismatch"
+
+
+@dataclass
+class ValidationViolation:
+    """
+    Represents a single data quality violation
+
+    Attributes:
+        violation_type: Type of violation
+        property_name: Property that violated the rule
+        row_id: Identifier of the row with violation
+        value: The violating value
+        expected: Expected value or constraint
+        message: Human-readable description
+    """
+    violation_type: ViolationType
+    property_name: str
+    row_id: Any
+    value: Any
+    expected: Any
+    message: str
+
+
+@dataclass
+class QualityReport:
+    """
+    Data quality validation report
+
+    Attributes:
+        total_rows: Total number of rows validated
+        violations: List of all violations found
+        completeness: Completeness percentage per property
+        outlier_count: Number of outliers detected per property
+        range_violations: Number of range violations per property
+        type_violations: Number of type violations per property
+        passed: Whether validation passed (no critical violations)
+    """
+    total_rows: int
+    violations: List[ValidationViolation] = field(default_factory=list)
+    completeness: Dict[str, float] = field(default_factory=dict)
+    outlier_count: Dict[str, int] = field(default_factory=dict)
+    range_violations: Dict[str, int] = field(default_factory=dict)
+    type_violations: Dict[str, int] = field(default_factory=dict)
+    passed: bool = True
+
+    def add_violation(self, violation: ValidationViolation):
+        """Add a violation to the report"""
+        self.violations.append(violation)
+
+        # Update counts
+        if violation.violation_type == ViolationType.RANGE_VIOLATION:
+            self.range_violations[violation.property_name] = \
+                self.range_violations.get(violation.property_name, 0) + 1
+        elif violation.violation_type == ViolationType.OUTLIER:
+            self.outlier_count[violation.property_name] = \
+                self.outlier_count.get(violation.property_name, 0) + 1
+        elif violation.violation_type == ViolationType.TYPE_MISMATCH:
+            self.type_violations[violation.property_name] = \
+                self.type_violations.get(violation.property_name, 0) + 1
+
+    def get_summary(self) -> Dict[str, Any]:
+        """Get a summary of the quality report"""
+        return {
+            "total_rows": self.total_rows,
+            "total_violations": len(self.violations),
+            "range_violations": sum(self.range_violations.values()),
+            "outliers": sum(self.outlier_count.values()),
+            "type_violations": sum(self.type_violations.values()),
+            "completeness": self.completeness,
+            "passed": self.passed
+        }
+
+
+@dataclass
+class RangeRule:
+    """Range validation rule for numeric properties"""
+    min_value: Optional[float] = None
+    max_value: Optional[float] = None
+
+
+@dataclass
+class ValidationConfig:
+    """
+    Configuration for data quality validation
+
+    Attributes:
+        range_rules: Range validation rules per property
+        required_properties: Set of required properties
+        detect_outliers: Whether to detect outliers (3 std devs)
+        fail_on_violations: Whether to fail import on violations
+        max_violation_rate: Maximum allowed violation rate (0.0-1.0)
+    """
+    range_rules: Dict[str, RangeRule] = field(default_factory=dict)
+    required_properties: Set[str] = field(default_factory=set)
+    detect_outliers: bool = False
+    fail_on_violations: bool = False
+    max_violation_rate: float = 0.1  # 10% by default
+
+
+class DataQualityValidator:
+    """
+    Validates data quality during knowledge graph import
+
+    Provides range validation, outlier detection, completeness checks,
+    and type consistency validation.
+    """
+
+    def __init__(self, config: Optional[ValidationConfig] = None):
+        """
+        Initialize validator with configuration
+
+        Args:
+            config: Validation configuration
+        """
+        self.config = config or ValidationConfig()
+        self._property_stats: Dict[str, Dict[str, float]] = {}
+
+    def validate_dataframe(self, df: 'pd.DataFrame', id_column: Optional[str] = None) -> QualityReport:
+        """
+        Validate a pandas DataFrame
+
+        Args:
+            df: DataFrame to validate
+            id_column: Column to use as row identifier
+
+        Returns:
+            QualityReport with validation results
+        """
+        if not PANDAS_AVAILABLE:
+            raise ImportError("pandas and numpy are required for data quality validation")
+
+        report = QualityReport(total_rows=len(df))
+
+        # Use index as row ID if no id_column specified
+        row_ids = df[id_column] if id_column and id_column in df.columns else df.index
+
+        # Check completeness
+        self._check_completeness(df, report)
+
+        # Check required properties
+        self._check_required_properties(df, row_ids, report)
+
+        # Validate ranges
+        self._validate_ranges(df, row_ids, report)
+
+        # Detect outliers
+        if self.config.detect_outliers:
+            self._detect_outliers(df, row_ids, report)
+
+        # Check if validation passed
+        violation_rate = len(report.violations) / max(report.total_rows, 1)
+        if self.config.fail_on_violations and violation_rate > self.config.max_violation_rate:
+            report.passed = False
+
+        return report
+
+    def _check_completeness(self, df: 'pd.DataFrame', report: QualityReport):
+        """Check completeness of properties"""
+        for col in df.columns:
+            non_null_count = df[col].notna().sum()
+            completeness = non_null_count / len(df) if len(df) > 0 else 0.0
+            report.completeness[col] = completeness
+
+    def _check_required_properties(self, df: 'pd.DataFrame', row_ids: Any, report: QualityReport):
+        """Check that required properties are present and non-null"""
+        for prop in self.config.required_properties:
+            if prop not in df.columns:
+                # Property missing entirely
+                violation = ValidationViolation(
+                    violation_type=ViolationType.MISSING_VALUE,
+                    property_name=prop,
+                    row_id="ALL",
+                    value=None,
+                    expected="required property",
+                    message=f"Required property '{prop}' is missing from dataset"
+                )
+                report.add_violation(violation)
+            else:
+                # Check for null values in required property
+                null_mask = df[prop].isna()
+                for idx in df[null_mask].index:
+                    row_id = row_ids.iloc[idx] if hasattr(row_ids, 'iloc') else row_ids[idx]
+                    violation = ValidationViolation(
+                        violation_type=ViolationType.MISSING_VALUE,
+                        property_name=prop,
+                        row_id=row_id,
+                        value=None,
+                        expected="non-null value",
+                        message=f"Required property '{prop}' is null in row {row_id}"
+                    )
+                    report.add_violation(violation)
+
+    def _validate_ranges(self, df: 'pd.DataFrame', row_ids: Any, report: QualityReport):
+        """Validate numeric properties are within specified ranges"""
+        for prop, rule in self.config.range_rules.items():
+            if prop not in df.columns:
+                continue
+
+            # Only validate numeric columns
+            if not pd.api.types.is_numeric_dtype(df[prop]):
+                continue
+
+            # Check min value
+            if rule.min_value is not None:
+                violations_mask = df[prop] < rule.min_value
+                for idx in df[violations_mask].index:
+                    row_id = row_ids.iloc[idx] if hasattr(row_ids, 'iloc') else row_ids[idx]
+                    value = df[prop].iloc[idx]
+                    violation = ValidationViolation(
+                        violation_type=ViolationType.RANGE_VIOLATION,
+                        property_name=prop,
+                        row_id=row_id,
+                        value=value,
+                        expected=f">= {rule.min_value}",
+                        message=f"Value {value} is below minimum {rule.min_value} for property '{prop}' in row {row_id}"
+                    )
+                    report.add_violation(violation)
+
+            # Check max value
+            if rule.max_value is not None:
+                violations_mask = df[prop] > rule.max_value
+                for idx in df[violations_mask].index:
+                    row_id = row_ids.iloc[idx] if hasattr(row_ids, 'iloc') else row_ids[idx]
+                    value = df[prop].iloc[idx]
+                    violation = ValidationViolation(
+                        violation_type=ViolationType.RANGE_VIOLATION,
+                        property_name=prop,
+                        row_id=row_id,
+                        value=value,
+                        expected=f"<= {rule.max_value}",
+                        message=f"Value {value} is above maximum {rule.max_value} for property '{prop}' in row {row_id}"
+                    )
+                    report.add_violation(violation)
+
+    def _detect_outliers(self, df: 'pd.DataFrame', row_ids: Any, report: QualityReport):
+        """Detect outliers using 3 standard deviations rule"""
+        numeric_cols = df.select_dtypes(include=[np.number]).columns
+
+        for col in numeric_cols:
+            # Skip if all values are null
+            if df[col].isna().all():
+                continue
+
+            # Calculate mean and std
+            mean = df[col].mean()
+            std = df[col].std()
+
+            # Skip if std is 0 or NaN
+            if pd.isna(std) or std == 0:
+                continue
+
+            # Store stats for later use
+            self._property_stats[col] = {"mean": mean, "std": std}
+
+            # Detect outliers (beyond 3 standard deviations)
+            lower_bound = mean - 3 * std
+            upper_bound = mean + 3 * std
+            outliers_mask = (df[col] < lower_bound) | (df[col] > upper_bound)
+
+            for idx in df[outliers_mask].index:
+                row_id = row_ids.iloc[idx] if hasattr(row_ids, 'iloc') else row_ids[idx]
+                value = df[col].iloc[idx]
+                violation = ValidationViolation(
+                    violation_type=ViolationType.OUTLIER,
+                    property_name=col,
+                    row_id=row_id,
+                    value=value,
+                    expected=f"within [{lower_bound:.2f}, {upper_bound:.2f}]",
+                    message=f"Value {value} is an outlier (>3 std devs) for property '{col}' in row {row_id}"
+                )
+                report.add_violation(violation)
+