aiecs 1.0.1__py3-none-any.whl → 1.7.17__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of aiecs might be problematic. Click here for more details.
- aiecs/__init__.py +13 -16
- aiecs/__main__.py +7 -7
- aiecs/aiecs_client.py +269 -75
- aiecs/application/executors/operation_executor.py +79 -54
- aiecs/application/knowledge_graph/__init__.py +7 -0
- aiecs/application/knowledge_graph/builder/__init__.py +37 -0
- aiecs/application/knowledge_graph/builder/data_quality.py +302 -0
- aiecs/application/knowledge_graph/builder/data_reshaping.py +293 -0
- aiecs/application/knowledge_graph/builder/document_builder.py +369 -0
- aiecs/application/knowledge_graph/builder/graph_builder.py +490 -0
- aiecs/application/knowledge_graph/builder/import_optimizer.py +396 -0
- aiecs/application/knowledge_graph/builder/schema_inference.py +462 -0
- aiecs/application/knowledge_graph/builder/schema_mapping.py +563 -0
- aiecs/application/knowledge_graph/builder/structured_pipeline.py +1384 -0
- aiecs/application/knowledge_graph/builder/text_chunker.py +317 -0
- aiecs/application/knowledge_graph/extractors/__init__.py +27 -0
- aiecs/application/knowledge_graph/extractors/base.py +98 -0
- aiecs/application/knowledge_graph/extractors/llm_entity_extractor.py +422 -0
- aiecs/application/knowledge_graph/extractors/llm_relation_extractor.py +347 -0
- aiecs/application/knowledge_graph/extractors/ner_entity_extractor.py +241 -0
- aiecs/application/knowledge_graph/fusion/__init__.py +78 -0
- aiecs/application/knowledge_graph/fusion/ab_testing.py +395 -0
- aiecs/application/knowledge_graph/fusion/abbreviation_expander.py +327 -0
- aiecs/application/knowledge_graph/fusion/alias_index.py +597 -0
- aiecs/application/knowledge_graph/fusion/alias_matcher.py +384 -0
- aiecs/application/knowledge_graph/fusion/cache_coordinator.py +343 -0
- aiecs/application/knowledge_graph/fusion/entity_deduplicator.py +433 -0
- aiecs/application/knowledge_graph/fusion/entity_linker.py +511 -0
- aiecs/application/knowledge_graph/fusion/evaluation_dataset.py +240 -0
- aiecs/application/knowledge_graph/fusion/knowledge_fusion.py +632 -0
- aiecs/application/knowledge_graph/fusion/matching_config.py +489 -0
- aiecs/application/knowledge_graph/fusion/name_normalizer.py +352 -0
- aiecs/application/knowledge_graph/fusion/relation_deduplicator.py +183 -0
- aiecs/application/knowledge_graph/fusion/semantic_name_matcher.py +464 -0
- aiecs/application/knowledge_graph/fusion/similarity_pipeline.py +534 -0
- aiecs/application/knowledge_graph/pattern_matching/__init__.py +21 -0
- aiecs/application/knowledge_graph/pattern_matching/pattern_matcher.py +342 -0
- aiecs/application/knowledge_graph/pattern_matching/query_executor.py +366 -0
- aiecs/application/knowledge_graph/profiling/__init__.py +12 -0
- aiecs/application/knowledge_graph/profiling/query_plan_visualizer.py +195 -0
- aiecs/application/knowledge_graph/profiling/query_profiler.py +223 -0
- aiecs/application/knowledge_graph/reasoning/__init__.py +27 -0
- aiecs/application/knowledge_graph/reasoning/evidence_synthesis.py +341 -0
- aiecs/application/knowledge_graph/reasoning/inference_engine.py +500 -0
- aiecs/application/knowledge_graph/reasoning/logic_form_parser.py +163 -0
- aiecs/application/knowledge_graph/reasoning/logic_parser/__init__.py +79 -0
- aiecs/application/knowledge_graph/reasoning/logic_parser/ast_builder.py +513 -0
- aiecs/application/knowledge_graph/reasoning/logic_parser/ast_nodes.py +913 -0
- aiecs/application/knowledge_graph/reasoning/logic_parser/ast_validator.py +866 -0
- aiecs/application/knowledge_graph/reasoning/logic_parser/error_handler.py +475 -0
- aiecs/application/knowledge_graph/reasoning/logic_parser/parser.py +396 -0
- aiecs/application/knowledge_graph/reasoning/logic_parser/query_context.py +208 -0
- aiecs/application/knowledge_graph/reasoning/logic_query_integration.py +170 -0
- aiecs/application/knowledge_graph/reasoning/query_planner.py +855 -0
- aiecs/application/knowledge_graph/reasoning/reasoning_engine.py +518 -0
- aiecs/application/knowledge_graph/retrieval/__init__.py +27 -0
- aiecs/application/knowledge_graph/retrieval/query_intent_classifier.py +211 -0
- aiecs/application/knowledge_graph/retrieval/retrieval_strategies.py +592 -0
- aiecs/application/knowledge_graph/retrieval/strategy_types.py +23 -0
- aiecs/application/knowledge_graph/search/__init__.py +59 -0
- aiecs/application/knowledge_graph/search/hybrid_search.py +457 -0
- aiecs/application/knowledge_graph/search/reranker.py +293 -0
- aiecs/application/knowledge_graph/search/reranker_strategies.py +535 -0
- aiecs/application/knowledge_graph/search/text_similarity.py +392 -0
- aiecs/application/knowledge_graph/traversal/__init__.py +15 -0
- aiecs/application/knowledge_graph/traversal/enhanced_traversal.py +305 -0
- aiecs/application/knowledge_graph/traversal/path_scorer.py +271 -0
- aiecs/application/knowledge_graph/validators/__init__.py +13 -0
- aiecs/application/knowledge_graph/validators/relation_validator.py +239 -0
- aiecs/application/knowledge_graph/visualization/__init__.py +11 -0
- aiecs/application/knowledge_graph/visualization/graph_visualizer.py +313 -0
- aiecs/common/__init__.py +9 -0
- aiecs/common/knowledge_graph/__init__.py +17 -0
- aiecs/common/knowledge_graph/runnable.py +471 -0
- aiecs/config/__init__.py +20 -5
- aiecs/config/config.py +762 -31
- aiecs/config/graph_config.py +131 -0
- aiecs/config/tool_config.py +435 -0
- aiecs/core/__init__.py +29 -13
- aiecs/core/interface/__init__.py +2 -2
- aiecs/core/interface/execution_interface.py +22 -22
- aiecs/core/interface/storage_interface.py +37 -88
- aiecs/core/registry/__init__.py +31 -0
- aiecs/core/registry/service_registry.py +92 -0
- aiecs/domain/__init__.py +270 -1
- aiecs/domain/agent/__init__.py +191 -0
- aiecs/domain/agent/base_agent.py +3949 -0
- aiecs/domain/agent/exceptions.py +99 -0
- aiecs/domain/agent/graph_aware_mixin.py +569 -0
- aiecs/domain/agent/hybrid_agent.py +1731 -0
- aiecs/domain/agent/integration/__init__.py +29 -0
- aiecs/domain/agent/integration/context_compressor.py +216 -0
- aiecs/domain/agent/integration/context_engine_adapter.py +587 -0
- aiecs/domain/agent/integration/protocols.py +281 -0
- aiecs/domain/agent/integration/retry_policy.py +218 -0
- aiecs/domain/agent/integration/role_config.py +213 -0
- aiecs/domain/agent/knowledge_aware_agent.py +1892 -0
- aiecs/domain/agent/lifecycle.py +291 -0
- aiecs/domain/agent/llm_agent.py +692 -0
- aiecs/domain/agent/memory/__init__.py +12 -0
- aiecs/domain/agent/memory/conversation.py +1124 -0
- aiecs/domain/agent/migration/__init__.py +14 -0
- aiecs/domain/agent/migration/conversion.py +163 -0
- aiecs/domain/agent/migration/legacy_wrapper.py +86 -0
- aiecs/domain/agent/models.py +894 -0
- aiecs/domain/agent/observability.py +479 -0
- aiecs/domain/agent/persistence.py +449 -0
- aiecs/domain/agent/prompts/__init__.py +29 -0
- aiecs/domain/agent/prompts/builder.py +159 -0
- aiecs/domain/agent/prompts/formatters.py +187 -0
- aiecs/domain/agent/prompts/template.py +255 -0
- aiecs/domain/agent/registry.py +253 -0
- aiecs/domain/agent/tool_agent.py +444 -0
- aiecs/domain/agent/tools/__init__.py +15 -0
- aiecs/domain/agent/tools/schema_generator.py +377 -0
- aiecs/domain/community/__init__.py +155 -0
- aiecs/domain/community/agent_adapter.py +469 -0
- aiecs/domain/community/analytics.py +432 -0
- aiecs/domain/community/collaborative_workflow.py +648 -0
- aiecs/domain/community/communication_hub.py +634 -0
- aiecs/domain/community/community_builder.py +320 -0
- aiecs/domain/community/community_integration.py +796 -0
- aiecs/domain/community/community_manager.py +803 -0
- aiecs/domain/community/decision_engine.py +849 -0
- aiecs/domain/community/exceptions.py +231 -0
- aiecs/domain/community/models/__init__.py +33 -0
- aiecs/domain/community/models/community_models.py +234 -0
- aiecs/domain/community/resource_manager.py +461 -0
- aiecs/domain/community/shared_context_manager.py +589 -0
- aiecs/domain/context/__init__.py +40 -10
- aiecs/domain/context/context_engine.py +1910 -0
- aiecs/domain/context/conversation_models.py +87 -53
- aiecs/domain/context/graph_memory.py +582 -0
- aiecs/domain/execution/model.py +12 -4
- aiecs/domain/knowledge_graph/__init__.py +19 -0
- aiecs/domain/knowledge_graph/models/__init__.py +52 -0
- aiecs/domain/knowledge_graph/models/entity.py +148 -0
- aiecs/domain/knowledge_graph/models/evidence.py +178 -0
- aiecs/domain/knowledge_graph/models/inference_rule.py +184 -0
- aiecs/domain/knowledge_graph/models/path.py +171 -0
- aiecs/domain/knowledge_graph/models/path_pattern.py +171 -0
- aiecs/domain/knowledge_graph/models/query.py +261 -0
- aiecs/domain/knowledge_graph/models/query_plan.py +181 -0
- aiecs/domain/knowledge_graph/models/relation.py +202 -0
- aiecs/domain/knowledge_graph/schema/__init__.py +23 -0
- aiecs/domain/knowledge_graph/schema/entity_type.py +131 -0
- aiecs/domain/knowledge_graph/schema/graph_schema.py +253 -0
- aiecs/domain/knowledge_graph/schema/property_schema.py +143 -0
- aiecs/domain/knowledge_graph/schema/relation_type.py +163 -0
- aiecs/domain/knowledge_graph/schema/schema_manager.py +691 -0
- aiecs/domain/knowledge_graph/schema/type_enums.py +209 -0
- aiecs/domain/task/dsl_processor.py +172 -56
- aiecs/domain/task/model.py +20 -8
- aiecs/domain/task/task_context.py +27 -24
- aiecs/infrastructure/__init__.py +0 -2
- aiecs/infrastructure/graph_storage/__init__.py +11 -0
- aiecs/infrastructure/graph_storage/base.py +837 -0
- aiecs/infrastructure/graph_storage/batch_operations.py +458 -0
- aiecs/infrastructure/graph_storage/cache.py +424 -0
- aiecs/infrastructure/graph_storage/distributed.py +223 -0
- aiecs/infrastructure/graph_storage/error_handling.py +380 -0
- aiecs/infrastructure/graph_storage/graceful_degradation.py +294 -0
- aiecs/infrastructure/graph_storage/health_checks.py +378 -0
- aiecs/infrastructure/graph_storage/in_memory.py +1197 -0
- aiecs/infrastructure/graph_storage/index_optimization.py +446 -0
- aiecs/infrastructure/graph_storage/lazy_loading.py +431 -0
- aiecs/infrastructure/graph_storage/metrics.py +344 -0
- aiecs/infrastructure/graph_storage/migration.py +400 -0
- aiecs/infrastructure/graph_storage/pagination.py +483 -0
- aiecs/infrastructure/graph_storage/performance_monitoring.py +456 -0
- aiecs/infrastructure/graph_storage/postgres.py +1563 -0
- aiecs/infrastructure/graph_storage/property_storage.py +353 -0
- aiecs/infrastructure/graph_storage/protocols.py +76 -0
- aiecs/infrastructure/graph_storage/query_optimizer.py +642 -0
- aiecs/infrastructure/graph_storage/schema_cache.py +290 -0
- aiecs/infrastructure/graph_storage/sqlite.py +1373 -0
- aiecs/infrastructure/graph_storage/streaming.py +487 -0
- aiecs/infrastructure/graph_storage/tenant.py +412 -0
- aiecs/infrastructure/messaging/celery_task_manager.py +92 -54
- aiecs/infrastructure/messaging/websocket_manager.py +51 -35
- aiecs/infrastructure/monitoring/__init__.py +22 -0
- aiecs/infrastructure/monitoring/executor_metrics.py +45 -11
- aiecs/infrastructure/monitoring/global_metrics_manager.py +212 -0
- aiecs/infrastructure/monitoring/structured_logger.py +3 -7
- aiecs/infrastructure/monitoring/tracing_manager.py +63 -35
- aiecs/infrastructure/persistence/__init__.py +14 -1
- aiecs/infrastructure/persistence/context_engine_client.py +184 -0
- aiecs/infrastructure/persistence/database_manager.py +67 -43
- aiecs/infrastructure/persistence/file_storage.py +180 -103
- aiecs/infrastructure/persistence/redis_client.py +74 -21
- aiecs/llm/__init__.py +73 -25
- aiecs/llm/callbacks/__init__.py +11 -0
- aiecs/llm/{custom_callbacks.py → callbacks/custom_callbacks.py} +26 -19
- aiecs/llm/client_factory.py +230 -37
- aiecs/llm/client_resolver.py +155 -0
- aiecs/llm/clients/__init__.py +38 -0
- aiecs/llm/clients/base_client.py +328 -0
- aiecs/llm/clients/google_function_calling_mixin.py +415 -0
- aiecs/llm/clients/googleai_client.py +314 -0
- aiecs/llm/clients/openai_client.py +158 -0
- aiecs/llm/clients/openai_compatible_mixin.py +367 -0
- aiecs/llm/clients/vertex_client.py +1186 -0
- aiecs/llm/clients/xai_client.py +201 -0
- aiecs/llm/config/__init__.py +51 -0
- aiecs/llm/config/config_loader.py +272 -0
- aiecs/llm/config/config_validator.py +206 -0
- aiecs/llm/config/model_config.py +143 -0
- aiecs/llm/protocols.py +149 -0
- aiecs/llm/utils/__init__.py +10 -0
- aiecs/llm/utils/validate_config.py +89 -0
- aiecs/main.py +140 -121
- aiecs/scripts/aid/VERSION_MANAGEMENT.md +138 -0
- aiecs/scripts/aid/__init__.py +19 -0
- aiecs/scripts/aid/module_checker.py +499 -0
- aiecs/scripts/aid/version_manager.py +235 -0
- aiecs/scripts/{DEPENDENCY_SYSTEM_SUMMARY.md → dependance_check/DEPENDENCY_SYSTEM_SUMMARY.md} +1 -0
- aiecs/scripts/{README_DEPENDENCY_CHECKER.md → dependance_check/README_DEPENDENCY_CHECKER.md} +1 -0
- aiecs/scripts/dependance_check/__init__.py +15 -0
- aiecs/scripts/dependance_check/dependency_checker.py +1835 -0
- aiecs/scripts/{dependency_fixer.py → dependance_check/dependency_fixer.py} +192 -90
- aiecs/scripts/{download_nlp_data.py → dependance_check/download_nlp_data.py} +203 -71
- aiecs/scripts/dependance_patch/__init__.py +7 -0
- aiecs/scripts/dependance_patch/fix_weasel/__init__.py +11 -0
- aiecs/scripts/{fix_weasel_validator.py → dependance_patch/fix_weasel/fix_weasel_validator.py} +21 -14
- aiecs/scripts/{patch_weasel_library.sh → dependance_patch/fix_weasel/patch_weasel_library.sh} +1 -1
- aiecs/scripts/knowledge_graph/__init__.py +3 -0
- aiecs/scripts/knowledge_graph/run_threshold_experiments.py +212 -0
- aiecs/scripts/migrations/multi_tenancy/README.md +142 -0
- aiecs/scripts/tools_develop/README.md +671 -0
- aiecs/scripts/tools_develop/README_CONFIG_CHECKER.md +273 -0
- aiecs/scripts/tools_develop/TOOLS_CONFIG_GUIDE.md +1287 -0
- aiecs/scripts/tools_develop/TOOL_AUTO_DISCOVERY.md +234 -0
- aiecs/scripts/tools_develop/__init__.py +21 -0
- aiecs/scripts/tools_develop/check_all_tools_config.py +548 -0
- aiecs/scripts/tools_develop/check_type_annotations.py +257 -0
- aiecs/scripts/tools_develop/pre-commit-schema-coverage.sh +66 -0
- aiecs/scripts/tools_develop/schema_coverage.py +511 -0
- aiecs/scripts/tools_develop/validate_tool_schemas.py +475 -0
- aiecs/scripts/tools_develop/verify_executor_config_fix.py +98 -0
- aiecs/scripts/tools_develop/verify_tools.py +352 -0
- aiecs/tasks/__init__.py +0 -1
- aiecs/tasks/worker.py +115 -47
- aiecs/tools/__init__.py +194 -72
- aiecs/tools/apisource/__init__.py +99 -0
- aiecs/tools/apisource/intelligence/__init__.py +19 -0
- aiecs/tools/apisource/intelligence/data_fusion.py +632 -0
- aiecs/tools/apisource/intelligence/query_analyzer.py +417 -0
- aiecs/tools/apisource/intelligence/search_enhancer.py +385 -0
- aiecs/tools/apisource/monitoring/__init__.py +9 -0
- aiecs/tools/apisource/monitoring/metrics.py +330 -0
- aiecs/tools/apisource/providers/__init__.py +112 -0
- aiecs/tools/apisource/providers/base.py +671 -0
- aiecs/tools/apisource/providers/census.py +397 -0
- aiecs/tools/apisource/providers/fred.py +535 -0
- aiecs/tools/apisource/providers/newsapi.py +409 -0
- aiecs/tools/apisource/providers/worldbank.py +352 -0
- aiecs/tools/apisource/reliability/__init__.py +12 -0
- aiecs/tools/apisource/reliability/error_handler.py +363 -0
- aiecs/tools/apisource/reliability/fallback_strategy.py +376 -0
- aiecs/tools/apisource/tool.py +832 -0
- aiecs/tools/apisource/utils/__init__.py +9 -0
- aiecs/tools/apisource/utils/validators.py +334 -0
- aiecs/tools/base_tool.py +415 -21
- aiecs/tools/docs/__init__.py +121 -0
- aiecs/tools/docs/ai_document_orchestrator.py +607 -0
- aiecs/tools/docs/ai_document_writer_orchestrator.py +2350 -0
- aiecs/tools/docs/content_insertion_tool.py +1320 -0
- aiecs/tools/docs/document_creator_tool.py +1464 -0
- aiecs/tools/docs/document_layout_tool.py +1160 -0
- aiecs/tools/docs/document_parser_tool.py +1016 -0
- aiecs/tools/docs/document_writer_tool.py +2008 -0
- aiecs/tools/knowledge_graph/__init__.py +17 -0
- aiecs/tools/knowledge_graph/graph_reasoning_tool.py +807 -0
- aiecs/tools/knowledge_graph/graph_search_tool.py +944 -0
- aiecs/tools/knowledge_graph/kg_builder_tool.py +524 -0
- aiecs/tools/langchain_adapter.py +300 -138
- aiecs/tools/schema_generator.py +455 -0
- aiecs/tools/search_tool/__init__.py +100 -0
- aiecs/tools/search_tool/analyzers.py +581 -0
- aiecs/tools/search_tool/cache.py +264 -0
- aiecs/tools/search_tool/constants.py +128 -0
- aiecs/tools/search_tool/context.py +224 -0
- aiecs/tools/search_tool/core.py +778 -0
- aiecs/tools/search_tool/deduplicator.py +119 -0
- aiecs/tools/search_tool/error_handler.py +242 -0
- aiecs/tools/search_tool/metrics.py +343 -0
- aiecs/tools/search_tool/rate_limiter.py +172 -0
- aiecs/tools/search_tool/schemas.py +275 -0
- aiecs/tools/statistics/__init__.py +80 -0
- aiecs/tools/statistics/ai_data_analysis_orchestrator.py +646 -0
- aiecs/tools/statistics/ai_insight_generator_tool.py +508 -0
- aiecs/tools/statistics/ai_report_orchestrator_tool.py +684 -0
- aiecs/tools/statistics/data_loader_tool.py +555 -0
- aiecs/tools/statistics/data_profiler_tool.py +638 -0
- aiecs/tools/statistics/data_transformer_tool.py +580 -0
- aiecs/tools/statistics/data_visualizer_tool.py +498 -0
- aiecs/tools/statistics/model_trainer_tool.py +507 -0
- aiecs/tools/statistics/statistical_analyzer_tool.py +472 -0
- aiecs/tools/task_tools/__init__.py +49 -36
- aiecs/tools/task_tools/chart_tool.py +200 -184
- aiecs/tools/task_tools/classfire_tool.py +268 -267
- aiecs/tools/task_tools/image_tool.py +220 -141
- aiecs/tools/task_tools/office_tool.py +226 -146
- aiecs/tools/task_tools/pandas_tool.py +477 -121
- aiecs/tools/task_tools/report_tool.py +390 -142
- aiecs/tools/task_tools/research_tool.py +149 -79
- aiecs/tools/task_tools/scraper_tool.py +339 -145
- aiecs/tools/task_tools/stats_tool.py +448 -209
- aiecs/tools/temp_file_manager.py +26 -24
- aiecs/tools/tool_executor/__init__.py +18 -16
- aiecs/tools/tool_executor/tool_executor.py +364 -52
- aiecs/utils/LLM_output_structor.py +74 -48
- aiecs/utils/__init__.py +14 -3
- aiecs/utils/base_callback.py +0 -3
- aiecs/utils/cache_provider.py +696 -0
- aiecs/utils/execution_utils.py +50 -31
- aiecs/utils/prompt_loader.py +1 -0
- aiecs/utils/token_usage_repository.py +37 -11
- aiecs/ws/socket_server.py +14 -4
- {aiecs-1.0.1.dist-info → aiecs-1.7.17.dist-info}/METADATA +52 -15
- aiecs-1.7.17.dist-info/RECORD +337 -0
- aiecs-1.7.17.dist-info/entry_points.txt +13 -0
- aiecs/config/registry.py +0 -19
- aiecs/domain/context/content_engine.py +0 -982
- aiecs/llm/base_client.py +0 -99
- aiecs/llm/openai_client.py +0 -125
- aiecs/llm/vertex_client.py +0 -186
- aiecs/llm/xai_client.py +0 -184
- aiecs/scripts/dependency_checker.py +0 -857
- aiecs/scripts/quick_dependency_check.py +0 -269
- aiecs/tools/task_tools/search_api.py +0 -7
- aiecs-1.0.1.dist-info/RECORD +0 -90
- aiecs-1.0.1.dist-info/entry_points.txt +0 -7
- /aiecs/scripts/{setup_nlp_data.sh → dependance_check/setup_nlp_data.sh} +0 -0
- /aiecs/scripts/{README_WEASEL_PATCH.md → dependance_patch/fix_weasel/README_WEASEL_PATCH.md} +0 -0
- /aiecs/scripts/{fix_weasel_validator.sh → dependance_patch/fix_weasel/fix_weasel_validator.sh} +0 -0
- /aiecs/scripts/{run_weasel_patch.sh → dependance_patch/fix_weasel/run_weasel_patch.sh} +0 -0
- {aiecs-1.0.1.dist-info → aiecs-1.7.17.dist-info}/WHEEL +0 -0
- {aiecs-1.0.1.dist-info → aiecs-1.7.17.dist-info}/licenses/LICENSE +0 -0
- {aiecs-1.0.1.dist-info → aiecs-1.7.17.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,462 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Schema Inference for Structured Data Import
|
|
3
|
+
|
|
4
|
+
Automatically infers schema mappings from data structure, reducing manual configuration effort.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from typing import Dict, List, Optional, Any, Union, Set
|
|
8
|
+
from pathlib import Path
|
|
9
|
+
from dataclasses import dataclass
|
|
10
|
+
import re
|
|
11
|
+
import logging
|
|
12
|
+
import warnings
|
|
13
|
+
|
|
14
|
+
from aiecs.application.knowledge_graph.builder.schema_mapping import (
|
|
15
|
+
SchemaMapping,
|
|
16
|
+
EntityMapping,
|
|
17
|
+
RelationMapping,
|
|
18
|
+
)
|
|
19
|
+
from aiecs.domain.knowledge_graph.schema.property_schema import PropertyType
|
|
20
|
+
|
|
21
|
+
logger = logging.getLogger(__name__)
|
|
22
|
+
|
|
23
|
+
# Check for pandas availability
|
|
24
|
+
try:
|
|
25
|
+
import pandas as pd
|
|
26
|
+
PANDAS_AVAILABLE = True
|
|
27
|
+
except ImportError:
|
|
28
|
+
PANDAS_AVAILABLE = False
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
@dataclass
class InferredSchema:
    """
    Outcome of an automatic schema inference run

    Bundles the inferred entity and relation mappings with per-mapping
    confidence scores and any warnings raised during inference, so the
    result can be inspected or adjusted before it is used.
    """
    # Entity mappings produced by inference
    entity_mappings: List[EntityMapping]
    # Relation mappings produced by inference
    relation_mappings: List[RelationMapping]
    # Mapping name -> confidence score (0-1)
    confidence_scores: Dict[str, float]
    # Human-readable notes collected while inferring
    warnings: List[str]

    def to_schema_mapping(self) -> SchemaMapping:
        """Build a SchemaMapping from the inferred entity and relation mappings"""
        schema_mapping = SchemaMapping(
            entity_mappings=self.entity_mappings,
            relation_mappings=self.relation_mappings,
        )
        return schema_mapping
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
class SchemaInference:
|
|
52
|
+
"""
|
|
53
|
+
Automatic schema inference from structured data
|
|
54
|
+
|
|
55
|
+
Analyzes data structure and content to automatically generate schema mappings.
|
|
56
|
+
"""
|
|
57
|
+
|
|
58
|
+
# Common ID column patterns
|
|
59
|
+
ID_PATTERNS = [
|
|
60
|
+
r'^id$',
|
|
61
|
+
r'^.*_id$',
|
|
62
|
+
r'^key$',
|
|
63
|
+
r'^.*_key$',
|
|
64
|
+
r'^pk$',
|
|
65
|
+
r'^.*_pk$',
|
|
66
|
+
]
|
|
67
|
+
|
|
68
|
+
# Foreign key patterns (for relation inference)
|
|
69
|
+
FK_PATTERNS = [
|
|
70
|
+
r'^(.+)_id$', # e.g., dept_id -> dept
|
|
71
|
+
r'^(.+)_key$', # e.g., dept_key -> dept
|
|
72
|
+
r'^fk_(.+)$', # e.g., fk_dept -> dept
|
|
73
|
+
]
|
|
74
|
+
|
|
75
|
+
def __init__(self, sample_size: int = 1000):
|
|
76
|
+
"""
|
|
77
|
+
Initialize schema inference
|
|
78
|
+
|
|
79
|
+
Args:
|
|
80
|
+
sample_size: Number of rows to sample for inference (default: 1000)
|
|
81
|
+
"""
|
|
82
|
+
self.sample_size = sample_size
|
|
83
|
+
|
|
84
|
+
def infer_from_dataframe(
    self,
    df: 'pd.DataFrame',
    entity_type_hint: Optional[str] = None,
    metadata: Optional[Dict[str, Any]] = None,
) -> InferredSchema:
    """
    Infer schema from pandas DataFrame

    Produces a single entity mapping covering every column (each column is
    mapped to a property of the same name) plus relation mappings for
    columns that look like foreign keys.

    Args:
        df: DataFrame to analyze
        entity_type_hint: Optional hint for entity type name; when given it
            is used instead of the name derived from the ID column
        metadata: Optional metadata (e.g., SPSS variable labels). Accepted
            for API compatibility; it does not influence the returned schema.

    Returns:
        InferredSchema with entity and relation mappings

    Raises:
        ImportError: If pandas is not installed
    """
    if not PANDAS_AVAILABLE:
        raise ImportError("pandas is required for schema inference")

    # Deliberately not called ``warnings`` so the imported module of that
    # name is not shadowed inside this function.
    inference_warnings: List[str] = []
    confidence_scores: Dict[str, float] = {}

    # Sample data if too large (fixed seed keeps inference reproducible)
    if len(df) > self.sample_size:
        df_sample = df.sample(n=self.sample_size, random_state=42)
        inference_warnings.append(f"Sampled {self.sample_size} rows from {len(df)} for inference")
    else:
        df_sample = df

    columns = df.columns.tolist()

    # Detect ID column. Compare against None rather than truthiness so a
    # falsy label (e.g. integer column 0) still counts as detected.
    id_column = self._detect_id_column(df_sample)
    if id_column is not None:
        confidence_scores['id_column'] = 0.9
    elif columns:
        inference_warnings.append("No clear ID column detected, will use first column")
        id_column = columns[0]
        confidence_scores['id_column'] = 0.5
    else:
        # Nothing to fall back on: say so instead of claiming a first column.
        inference_warnings.append("DataFrame has no columns, no ID column available")
        confidence_scores['id_column'] = 0.0

    # NOTE: per-column property types were previously computed here and then
    # discarded; the unused computation has been removed.

    # Determine entity type
    entity_type = entity_type_hint or self._infer_entity_type(columns, id_column)

    # Create entity mapping (identity mapping: column name -> property name)
    entity_mapping = EntityMapping(
        source_columns=columns,
        entity_type=entity_type,
        property_mapping={col: col for col in columns},
        id_column=id_column,
    )

    # Infer relations from foreign key patterns
    relation_mappings = self._infer_relations(df_sample, id_column)
    if relation_mappings:
        confidence_scores['relations'] = 0.7

    return InferredSchema(
        entity_mappings=[entity_mapping],
        relation_mappings=relation_mappings,
        confidence_scores=confidence_scores,
        warnings=inference_warnings,
    )
|
|
148
|
+
|
|
149
|
+
def _detect_id_column(self, df: 'pd.DataFrame') -> Optional[str]:
|
|
150
|
+
"""
|
|
151
|
+
Detect ID column from DataFrame
|
|
152
|
+
|
|
153
|
+
Looks for columns matching ID patterns or columns with unique values.
|
|
154
|
+
|
|
155
|
+
Args:
|
|
156
|
+
df: DataFrame to analyze
|
|
157
|
+
|
|
158
|
+
Returns:
|
|
159
|
+
Name of ID column, or None if not found
|
|
160
|
+
"""
|
|
161
|
+
# Check for columns matching ID patterns
|
|
162
|
+
for col in df.columns:
|
|
163
|
+
col_lower = col.lower()
|
|
164
|
+
for pattern in self.ID_PATTERNS:
|
|
165
|
+
if re.match(pattern, col_lower):
|
|
166
|
+
return col
|
|
167
|
+
|
|
168
|
+
# Check for columns with all unique values
|
|
169
|
+
for col in df.columns:
|
|
170
|
+
if df[col].nunique() == len(df):
|
|
171
|
+
return col
|
|
172
|
+
|
|
173
|
+
return None
|
|
174
|
+
|
|
175
|
+
def _infer_property_types(
|
|
176
|
+
self,
|
|
177
|
+
df: 'pd.DataFrame',
|
|
178
|
+
metadata: Optional[Dict[str, Any]] = None,
|
|
179
|
+
) -> Dict[str, PropertyType]:
|
|
180
|
+
"""
|
|
181
|
+
Infer property types from DataFrame columns
|
|
182
|
+
|
|
183
|
+
Args:
|
|
184
|
+
df: DataFrame to analyze
|
|
185
|
+
metadata: Optional metadata (e.g., SPSS variable labels)
|
|
186
|
+
|
|
187
|
+
Returns:
|
|
188
|
+
Dictionary mapping column name to PropertyType
|
|
189
|
+
"""
|
|
190
|
+
property_types = {}
|
|
191
|
+
|
|
192
|
+
for col in df.columns:
|
|
193
|
+
# Get pandas dtype
|
|
194
|
+
dtype = df[col].dtype
|
|
195
|
+
|
|
196
|
+
# Infer PropertyType from pandas dtype
|
|
197
|
+
if pd.api.types.is_integer_dtype(dtype):
|
|
198
|
+
property_types[col] = PropertyType.INTEGER
|
|
199
|
+
elif pd.api.types.is_float_dtype(dtype):
|
|
200
|
+
property_types[col] = PropertyType.FLOAT
|
|
201
|
+
elif pd.api.types.is_bool_dtype(dtype):
|
|
202
|
+
property_types[col] = PropertyType.BOOLEAN
|
|
203
|
+
elif pd.api.types.is_datetime64_any_dtype(dtype):
|
|
204
|
+
property_types[col] = PropertyType.DATETIME
|
|
205
|
+
else:
|
|
206
|
+
# Default to string, but check if it could be a date
|
|
207
|
+
if self._could_be_date(df[col]):
|
|
208
|
+
property_types[col] = PropertyType.DATETIME
|
|
209
|
+
else:
|
|
210
|
+
property_types[col] = PropertyType.STRING
|
|
211
|
+
|
|
212
|
+
return property_types
|
|
213
|
+
|
|
214
|
+
def _could_be_date(self, series: 'pd.Series') -> bool:
|
|
215
|
+
"""
|
|
216
|
+
Check if a string series could be dates
|
|
217
|
+
|
|
218
|
+
Args:
|
|
219
|
+
series: Pandas series to check
|
|
220
|
+
|
|
221
|
+
Returns:
|
|
222
|
+
True if series looks like dates
|
|
223
|
+
"""
|
|
224
|
+
# Sample a few non-null values
|
|
225
|
+
sample = series.dropna().head(10)
|
|
226
|
+
if len(sample) == 0:
|
|
227
|
+
return False
|
|
228
|
+
|
|
229
|
+
# Try to parse as dates
|
|
230
|
+
# Suppress UserWarning about dateutil fallback - this is expected behavior
|
|
231
|
+
# when pandas can't infer the date format automatically
|
|
232
|
+
try:
|
|
233
|
+
with warnings.catch_warnings():
|
|
234
|
+
warnings.simplefilter("ignore", UserWarning)
|
|
235
|
+
pd.to_datetime(sample, errors='raise')
|
|
236
|
+
return True
|
|
237
|
+
except (ValueError, TypeError):
|
|
238
|
+
return False
|
|
239
|
+
|
|
240
|
+
def _infer_entity_type(self, columns: List[str], id_column: Optional[str]) -> str:
|
|
241
|
+
"""
|
|
242
|
+
Infer entity type name from column names
|
|
243
|
+
|
|
244
|
+
Args:
|
|
245
|
+
columns: List of column names
|
|
246
|
+
id_column: Name of ID column (if detected)
|
|
247
|
+
|
|
248
|
+
Returns:
|
|
249
|
+
Inferred entity type name
|
|
250
|
+
"""
|
|
251
|
+
# If ID column has a prefix, use that as entity type
|
|
252
|
+
if id_column:
|
|
253
|
+
# Try to extract entity type from ID column name
|
|
254
|
+
for pattern in self.FK_PATTERNS:
|
|
255
|
+
match = re.match(pattern, id_column.lower())
|
|
256
|
+
if match:
|
|
257
|
+
entity_type = match.group(1)
|
|
258
|
+
# Capitalize first letter
|
|
259
|
+
return entity_type.capitalize()
|
|
260
|
+
|
|
261
|
+
# Default to "Entity"
|
|
262
|
+
return "Entity"
|
|
263
|
+
|
|
264
|
+
def infer_from_csv(
    self,
    file_path: Union[str, Path],
    encoding: str = "utf-8",
    entity_type_hint: Optional[str] = None,
) -> InferredSchema:
    """
    Infer schema from CSV file

    Only the first ``sample_size`` rows of the file are read.

    Args:
        file_path: Path to CSV file
        encoding: File encoding
        entity_type_hint: Optional entity type name, forwarded to
            infer_from_dataframe

    Returns:
        InferredSchema with entity and relation mappings

    Raises:
        ImportError: If pandas is not installed
    """
    if not PANDAS_AVAILABLE:
        raise ImportError("pandas is required for CSV schema inference")

    df = pd.read_csv(file_path, encoding=encoding, nrows=self.sample_size)
    return self.infer_from_dataframe(df, entity_type_hint=entity_type_hint)
|
|
284
|
+
|
|
285
|
+
def infer_from_spss(
    self,
    file_path: Union[str, Path],
    encoding: str = "utf-8",
    entity_type_hint: Optional[str] = None,
) -> InferredSchema:
    """
    Infer schema from SPSS file

    Reads the file with pyreadstat (passing ``sample_size`` as the row
    limit), collects the column names, column labels and value labels
    exposed by the reader, and forwards them as ``metadata`` to
    infer_from_dataframe.

    Args:
        file_path: Path to SPSS file
        encoding: File encoding
        entity_type_hint: Optional entity type name, forwarded to
            infer_from_dataframe

    Returns:
        InferredSchema with entity and relation mappings

    Raises:
        ImportError: If pyreadstat or pandas is not installed
    """
    try:
        import pyreadstat  # type: ignore[import-untyped]
    except ImportError as exc:
        # Chain the original error so the real import failure stays visible
        raise ImportError("pyreadstat is required for SPSS schema inference") from exc

    if not PANDAS_AVAILABLE:
        raise ImportError("pandas is required for SPSS schema inference")

    # Read SPSS file with metadata
    df, meta = pyreadstat.read_sav(str(file_path), encoding=encoding, row_limit=self.sample_size)

    # Extract metadata, tolerating reader objects that lack an attribute
    metadata = {
        "column_names": getattr(meta, 'column_names', []),
        "column_labels": getattr(meta, 'column_labels', []),
        "variable_value_labels": getattr(meta, 'variable_value_labels', {}),
    }

    return self.infer_from_dataframe(df, entity_type_hint=entity_type_hint, metadata=metadata)
|
|
321
|
+
|
|
322
|
+
def _infer_relations(
|
|
323
|
+
self,
|
|
324
|
+
df: 'pd.DataFrame',
|
|
325
|
+
id_column: Optional[str],
|
|
326
|
+
) -> List[RelationMapping]:
|
|
327
|
+
"""
|
|
328
|
+
Infer relation mappings from foreign key patterns
|
|
329
|
+
|
|
330
|
+
Detects columns that look like foreign keys and creates relation mappings.
|
|
331
|
+
|
|
332
|
+
Args:
|
|
333
|
+
df: DataFrame to analyze
|
|
334
|
+
id_column: Name of ID column (source entity)
|
|
335
|
+
|
|
336
|
+
Returns:
|
|
337
|
+
List of inferred RelationMapping objects
|
|
338
|
+
"""
|
|
339
|
+
if not id_column:
|
|
340
|
+
return []
|
|
341
|
+
|
|
342
|
+
relation_mappings = []
|
|
343
|
+
|
|
344
|
+
# Look for foreign key columns
|
|
345
|
+
for col in df.columns:
|
|
346
|
+
if col == id_column:
|
|
347
|
+
continue
|
|
348
|
+
|
|
349
|
+
col_lower = col.lower()
|
|
350
|
+
|
|
351
|
+
# Check if column matches FK pattern
|
|
352
|
+
for pattern in self.FK_PATTERNS:
|
|
353
|
+
match = re.match(pattern, col_lower)
|
|
354
|
+
if match:
|
|
355
|
+
# Extract target entity type from FK column name
|
|
356
|
+
target_entity_type = match.group(1).capitalize()
|
|
357
|
+
|
|
358
|
+
# Infer relation type from column name
|
|
359
|
+
# e.g., "dept_id" -> "BELONGS_TO_DEPT" or "HAS_DEPT"
|
|
360
|
+
relation_type = self._infer_relation_type(id_column, col, target_entity_type)
|
|
361
|
+
|
|
362
|
+
# Create relation mapping
|
|
363
|
+
relation_mapping = RelationMapping(
|
|
364
|
+
source_columns=[id_column, col],
|
|
365
|
+
relation_type=relation_type,
|
|
366
|
+
source_entity_column=id_column,
|
|
367
|
+
target_entity_column=col,
|
|
368
|
+
property_mapping={},
|
|
369
|
+
)
|
|
370
|
+
|
|
371
|
+
relation_mappings.append(relation_mapping)
|
|
372
|
+
break
|
|
373
|
+
|
|
374
|
+
return relation_mappings
|
|
375
|
+
|
|
376
|
+
def _infer_relation_type(
|
|
377
|
+
self,
|
|
378
|
+
source_column: str,
|
|
379
|
+
target_column: str,
|
|
380
|
+
target_entity_type: str,
|
|
381
|
+
) -> str:
|
|
382
|
+
"""
|
|
383
|
+
Infer relation type name from column names
|
|
384
|
+
|
|
385
|
+
Args:
|
|
386
|
+
source_column: Source entity column name
|
|
387
|
+
target_column: Target entity (FK) column name
|
|
388
|
+
target_entity_type: Inferred target entity type
|
|
389
|
+
|
|
390
|
+
Returns:
|
|
391
|
+
Inferred relation type name (e.g., "BELONGS_TO", "HAS")
|
|
392
|
+
"""
|
|
393
|
+
# Common relation patterns
|
|
394
|
+
# e.g., "emp_id" -> "dept_id" = "WORKS_IN" or "BELONGS_TO"
|
|
395
|
+
|
|
396
|
+
# Extract base names
|
|
397
|
+
source_base = source_column.lower().replace('_id', '').replace('_key', '')
|
|
398
|
+
target_base = target_column.lower().replace('_id', '').replace('_key', '').replace('fk_', '')
|
|
399
|
+
|
|
400
|
+
# Common relation verbs based on context
|
|
401
|
+
if 'dept' in target_base or 'department' in target_base:
|
|
402
|
+
return "WORKS_IN"
|
|
403
|
+
elif 'manager' in target_base or 'supervisor' in target_base:
|
|
404
|
+
return "REPORTS_TO"
|
|
405
|
+
elif 'company' in target_base or 'organization' in target_base:
|
|
406
|
+
return "BELONGS_TO"
|
|
407
|
+
elif 'project' in target_base:
|
|
408
|
+
return "ASSIGNED_TO"
|
|
409
|
+
elif 'team' in target_base or 'group' in target_base:
|
|
410
|
+
return "MEMBER_OF"
|
|
411
|
+
else:
|
|
412
|
+
# Generic relation type
|
|
413
|
+
return f"HAS_{target_entity_type.upper()}"
|
|
414
|
+
|
|
415
|
+
def merge_with_partial_schema(
    self,
    inferred: InferredSchema,
    partial_mapping: SchemaMapping,
) -> InferredSchema:
    """
    Combine an inferred schema with a user-provided partial schema

    User-defined mappings come first and always win: an inferred entity
    mapping is kept only if its entity type is not already defined by the
    user, and an inferred relation mapping is kept only if the user has no
    relation with the same (source column, target column, relation type).

    Args:
        inferred: Inferred schema
        partial_mapping: User-provided partial schema mapping

    Returns:
        Merged InferredSchema carrying the inferred confidence scores and
        the inferred warnings plus a note about the merge
    """

    def relation_key(mapping):
        # Identity of a relation for conflict detection
        return (
            mapping.source_entity_column,
            mapping.target_entity_column,
            mapping.relation_type,
        )

    # Entities: user mappings first, then non-conflicting inferred ones
    taken_entity_types = {
        user_em.entity_type for user_em in partial_mapping.entity_mappings
    }
    merged_entities = list(partial_mapping.entity_mappings)
    merged_entities.extend(
        candidate
        for candidate in inferred.entity_mappings
        if candidate.entity_type not in taken_entity_types
    )

    # Relations: same precedence rule, keyed by columns and relation type
    taken_relation_keys = {
        relation_key(user_rm) for user_rm in partial_mapping.relation_mappings
    }
    merged_relations = list(partial_mapping.relation_mappings)
    merged_relations.extend(
        candidate
        for candidate in inferred.relation_mappings
        if relation_key(candidate) not in taken_relation_keys
    )

    merged_warnings = inferred.warnings + ["Merged with user-provided partial schema"]

    return InferredSchema(
        entity_mappings=merged_entities,
        relation_mappings=merged_relations,
        confidence_scores=inferred.confidence_scores,
        warnings=merged_warnings,
    )
|
|
462
|
+
|