aiecs 1.0.1__py3-none-any.whl → 1.7.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of aiecs might be problematic. Click here for more details.
- aiecs/__init__.py +13 -16
- aiecs/__main__.py +7 -7
- aiecs/aiecs_client.py +269 -75
- aiecs/application/executors/operation_executor.py +79 -54
- aiecs/application/knowledge_graph/__init__.py +7 -0
- aiecs/application/knowledge_graph/builder/__init__.py +37 -0
- aiecs/application/knowledge_graph/builder/data_quality.py +302 -0
- aiecs/application/knowledge_graph/builder/data_reshaping.py +293 -0
- aiecs/application/knowledge_graph/builder/document_builder.py +369 -0
- aiecs/application/knowledge_graph/builder/graph_builder.py +490 -0
- aiecs/application/knowledge_graph/builder/import_optimizer.py +396 -0
- aiecs/application/knowledge_graph/builder/schema_inference.py +462 -0
- aiecs/application/knowledge_graph/builder/schema_mapping.py +563 -0
- aiecs/application/knowledge_graph/builder/structured_pipeline.py +1384 -0
- aiecs/application/knowledge_graph/builder/text_chunker.py +317 -0
- aiecs/application/knowledge_graph/extractors/__init__.py +27 -0
- aiecs/application/knowledge_graph/extractors/base.py +98 -0
- aiecs/application/knowledge_graph/extractors/llm_entity_extractor.py +422 -0
- aiecs/application/knowledge_graph/extractors/llm_relation_extractor.py +347 -0
- aiecs/application/knowledge_graph/extractors/ner_entity_extractor.py +241 -0
- aiecs/application/knowledge_graph/fusion/__init__.py +78 -0
- aiecs/application/knowledge_graph/fusion/ab_testing.py +395 -0
- aiecs/application/knowledge_graph/fusion/abbreviation_expander.py +327 -0
- aiecs/application/knowledge_graph/fusion/alias_index.py +597 -0
- aiecs/application/knowledge_graph/fusion/alias_matcher.py +384 -0
- aiecs/application/knowledge_graph/fusion/cache_coordinator.py +343 -0
- aiecs/application/knowledge_graph/fusion/entity_deduplicator.py +433 -0
- aiecs/application/knowledge_graph/fusion/entity_linker.py +511 -0
- aiecs/application/knowledge_graph/fusion/evaluation_dataset.py +240 -0
- aiecs/application/knowledge_graph/fusion/knowledge_fusion.py +632 -0
- aiecs/application/knowledge_graph/fusion/matching_config.py +489 -0
- aiecs/application/knowledge_graph/fusion/name_normalizer.py +352 -0
- aiecs/application/knowledge_graph/fusion/relation_deduplicator.py +183 -0
- aiecs/application/knowledge_graph/fusion/semantic_name_matcher.py +464 -0
- aiecs/application/knowledge_graph/fusion/similarity_pipeline.py +534 -0
- aiecs/application/knowledge_graph/pattern_matching/__init__.py +21 -0
- aiecs/application/knowledge_graph/pattern_matching/pattern_matcher.py +342 -0
- aiecs/application/knowledge_graph/pattern_matching/query_executor.py +366 -0
- aiecs/application/knowledge_graph/profiling/__init__.py +12 -0
- aiecs/application/knowledge_graph/profiling/query_plan_visualizer.py +195 -0
- aiecs/application/knowledge_graph/profiling/query_profiler.py +223 -0
- aiecs/application/knowledge_graph/reasoning/__init__.py +27 -0
- aiecs/application/knowledge_graph/reasoning/evidence_synthesis.py +341 -0
- aiecs/application/knowledge_graph/reasoning/inference_engine.py +500 -0
- aiecs/application/knowledge_graph/reasoning/logic_form_parser.py +163 -0
- aiecs/application/knowledge_graph/reasoning/logic_parser/__init__.py +79 -0
- aiecs/application/knowledge_graph/reasoning/logic_parser/ast_builder.py +513 -0
- aiecs/application/knowledge_graph/reasoning/logic_parser/ast_nodes.py +913 -0
- aiecs/application/knowledge_graph/reasoning/logic_parser/ast_validator.py +866 -0
- aiecs/application/knowledge_graph/reasoning/logic_parser/error_handler.py +475 -0
- aiecs/application/knowledge_graph/reasoning/logic_parser/parser.py +396 -0
- aiecs/application/knowledge_graph/reasoning/logic_parser/query_context.py +208 -0
- aiecs/application/knowledge_graph/reasoning/logic_query_integration.py +170 -0
- aiecs/application/knowledge_graph/reasoning/query_planner.py +855 -0
- aiecs/application/knowledge_graph/reasoning/reasoning_engine.py +518 -0
- aiecs/application/knowledge_graph/retrieval/__init__.py +27 -0
- aiecs/application/knowledge_graph/retrieval/query_intent_classifier.py +211 -0
- aiecs/application/knowledge_graph/retrieval/retrieval_strategies.py +592 -0
- aiecs/application/knowledge_graph/retrieval/strategy_types.py +23 -0
- aiecs/application/knowledge_graph/search/__init__.py +59 -0
- aiecs/application/knowledge_graph/search/hybrid_search.py +457 -0
- aiecs/application/knowledge_graph/search/reranker.py +293 -0
- aiecs/application/knowledge_graph/search/reranker_strategies.py +535 -0
- aiecs/application/knowledge_graph/search/text_similarity.py +392 -0
- aiecs/application/knowledge_graph/traversal/__init__.py +15 -0
- aiecs/application/knowledge_graph/traversal/enhanced_traversal.py +305 -0
- aiecs/application/knowledge_graph/traversal/path_scorer.py +271 -0
- aiecs/application/knowledge_graph/validators/__init__.py +13 -0
- aiecs/application/knowledge_graph/validators/relation_validator.py +239 -0
- aiecs/application/knowledge_graph/visualization/__init__.py +11 -0
- aiecs/application/knowledge_graph/visualization/graph_visualizer.py +313 -0
- aiecs/common/__init__.py +9 -0
- aiecs/common/knowledge_graph/__init__.py +17 -0
- aiecs/common/knowledge_graph/runnable.py +471 -0
- aiecs/config/__init__.py +20 -5
- aiecs/config/config.py +762 -31
- aiecs/config/graph_config.py +131 -0
- aiecs/config/tool_config.py +399 -0
- aiecs/core/__init__.py +29 -13
- aiecs/core/interface/__init__.py +2 -2
- aiecs/core/interface/execution_interface.py +22 -22
- aiecs/core/interface/storage_interface.py +37 -88
- aiecs/core/registry/__init__.py +31 -0
- aiecs/core/registry/service_registry.py +92 -0
- aiecs/domain/__init__.py +270 -1
- aiecs/domain/agent/__init__.py +191 -0
- aiecs/domain/agent/base_agent.py +3870 -0
- aiecs/domain/agent/exceptions.py +99 -0
- aiecs/domain/agent/graph_aware_mixin.py +569 -0
- aiecs/domain/agent/hybrid_agent.py +1435 -0
- aiecs/domain/agent/integration/__init__.py +29 -0
- aiecs/domain/agent/integration/context_compressor.py +216 -0
- aiecs/domain/agent/integration/context_engine_adapter.py +587 -0
- aiecs/domain/agent/integration/protocols.py +281 -0
- aiecs/domain/agent/integration/retry_policy.py +218 -0
- aiecs/domain/agent/integration/role_config.py +213 -0
- aiecs/domain/agent/knowledge_aware_agent.py +1892 -0
- aiecs/domain/agent/lifecycle.py +291 -0
- aiecs/domain/agent/llm_agent.py +692 -0
- aiecs/domain/agent/memory/__init__.py +12 -0
- aiecs/domain/agent/memory/conversation.py +1124 -0
- aiecs/domain/agent/migration/__init__.py +14 -0
- aiecs/domain/agent/migration/conversion.py +163 -0
- aiecs/domain/agent/migration/legacy_wrapper.py +86 -0
- aiecs/domain/agent/models.py +884 -0
- aiecs/domain/agent/observability.py +479 -0
- aiecs/domain/agent/persistence.py +449 -0
- aiecs/domain/agent/prompts/__init__.py +29 -0
- aiecs/domain/agent/prompts/builder.py +159 -0
- aiecs/domain/agent/prompts/formatters.py +187 -0
- aiecs/domain/agent/prompts/template.py +255 -0
- aiecs/domain/agent/registry.py +253 -0
- aiecs/domain/agent/tool_agent.py +444 -0
- aiecs/domain/agent/tools/__init__.py +15 -0
- aiecs/domain/agent/tools/schema_generator.py +364 -0
- aiecs/domain/community/__init__.py +155 -0
- aiecs/domain/community/agent_adapter.py +469 -0
- aiecs/domain/community/analytics.py +432 -0
- aiecs/domain/community/collaborative_workflow.py +648 -0
- aiecs/domain/community/communication_hub.py +634 -0
- aiecs/domain/community/community_builder.py +320 -0
- aiecs/domain/community/community_integration.py +796 -0
- aiecs/domain/community/community_manager.py +803 -0
- aiecs/domain/community/decision_engine.py +849 -0
- aiecs/domain/community/exceptions.py +231 -0
- aiecs/domain/community/models/__init__.py +33 -0
- aiecs/domain/community/models/community_models.py +234 -0
- aiecs/domain/community/resource_manager.py +461 -0
- aiecs/domain/community/shared_context_manager.py +589 -0
- aiecs/domain/context/__init__.py +40 -10
- aiecs/domain/context/context_engine.py +1910 -0
- aiecs/domain/context/conversation_models.py +87 -53
- aiecs/domain/context/graph_memory.py +582 -0
- aiecs/domain/execution/model.py +12 -4
- aiecs/domain/knowledge_graph/__init__.py +19 -0
- aiecs/domain/knowledge_graph/models/__init__.py +52 -0
- aiecs/domain/knowledge_graph/models/entity.py +148 -0
- aiecs/domain/knowledge_graph/models/evidence.py +178 -0
- aiecs/domain/knowledge_graph/models/inference_rule.py +184 -0
- aiecs/domain/knowledge_graph/models/path.py +171 -0
- aiecs/domain/knowledge_graph/models/path_pattern.py +171 -0
- aiecs/domain/knowledge_graph/models/query.py +261 -0
- aiecs/domain/knowledge_graph/models/query_plan.py +181 -0
- aiecs/domain/knowledge_graph/models/relation.py +202 -0
- aiecs/domain/knowledge_graph/schema/__init__.py +23 -0
- aiecs/domain/knowledge_graph/schema/entity_type.py +131 -0
- aiecs/domain/knowledge_graph/schema/graph_schema.py +253 -0
- aiecs/domain/knowledge_graph/schema/property_schema.py +143 -0
- aiecs/domain/knowledge_graph/schema/relation_type.py +163 -0
- aiecs/domain/knowledge_graph/schema/schema_manager.py +691 -0
- aiecs/domain/knowledge_graph/schema/type_enums.py +209 -0
- aiecs/domain/task/dsl_processor.py +172 -56
- aiecs/domain/task/model.py +20 -8
- aiecs/domain/task/task_context.py +27 -24
- aiecs/infrastructure/__init__.py +0 -2
- aiecs/infrastructure/graph_storage/__init__.py +11 -0
- aiecs/infrastructure/graph_storage/base.py +837 -0
- aiecs/infrastructure/graph_storage/batch_operations.py +458 -0
- aiecs/infrastructure/graph_storage/cache.py +424 -0
- aiecs/infrastructure/graph_storage/distributed.py +223 -0
- aiecs/infrastructure/graph_storage/error_handling.py +380 -0
- aiecs/infrastructure/graph_storage/graceful_degradation.py +294 -0
- aiecs/infrastructure/graph_storage/health_checks.py +378 -0
- aiecs/infrastructure/graph_storage/in_memory.py +1197 -0
- aiecs/infrastructure/graph_storage/index_optimization.py +446 -0
- aiecs/infrastructure/graph_storage/lazy_loading.py +431 -0
- aiecs/infrastructure/graph_storage/metrics.py +344 -0
- aiecs/infrastructure/graph_storage/migration.py +400 -0
- aiecs/infrastructure/graph_storage/pagination.py +483 -0
- aiecs/infrastructure/graph_storage/performance_monitoring.py +456 -0
- aiecs/infrastructure/graph_storage/postgres.py +1563 -0
- aiecs/infrastructure/graph_storage/property_storage.py +353 -0
- aiecs/infrastructure/graph_storage/protocols.py +76 -0
- aiecs/infrastructure/graph_storage/query_optimizer.py +642 -0
- aiecs/infrastructure/graph_storage/schema_cache.py +290 -0
- aiecs/infrastructure/graph_storage/sqlite.py +1373 -0
- aiecs/infrastructure/graph_storage/streaming.py +487 -0
- aiecs/infrastructure/graph_storage/tenant.py +412 -0
- aiecs/infrastructure/messaging/celery_task_manager.py +92 -54
- aiecs/infrastructure/messaging/websocket_manager.py +51 -35
- aiecs/infrastructure/monitoring/__init__.py +22 -0
- aiecs/infrastructure/monitoring/executor_metrics.py +45 -11
- aiecs/infrastructure/monitoring/global_metrics_manager.py +212 -0
- aiecs/infrastructure/monitoring/structured_logger.py +3 -7
- aiecs/infrastructure/monitoring/tracing_manager.py +63 -35
- aiecs/infrastructure/persistence/__init__.py +14 -1
- aiecs/infrastructure/persistence/context_engine_client.py +184 -0
- aiecs/infrastructure/persistence/database_manager.py +67 -43
- aiecs/infrastructure/persistence/file_storage.py +180 -103
- aiecs/infrastructure/persistence/redis_client.py +74 -21
- aiecs/llm/__init__.py +73 -25
- aiecs/llm/callbacks/__init__.py +11 -0
- aiecs/llm/{custom_callbacks.py → callbacks/custom_callbacks.py} +26 -19
- aiecs/llm/client_factory.py +224 -36
- aiecs/llm/client_resolver.py +155 -0
- aiecs/llm/clients/__init__.py +38 -0
- aiecs/llm/clients/base_client.py +324 -0
- aiecs/llm/clients/google_function_calling_mixin.py +457 -0
- aiecs/llm/clients/googleai_client.py +241 -0
- aiecs/llm/clients/openai_client.py +158 -0
- aiecs/llm/clients/openai_compatible_mixin.py +367 -0
- aiecs/llm/clients/vertex_client.py +897 -0
- aiecs/llm/clients/xai_client.py +201 -0
- aiecs/llm/config/__init__.py +51 -0
- aiecs/llm/config/config_loader.py +272 -0
- aiecs/llm/config/config_validator.py +206 -0
- aiecs/llm/config/model_config.py +143 -0
- aiecs/llm/protocols.py +149 -0
- aiecs/llm/utils/__init__.py +10 -0
- aiecs/llm/utils/validate_config.py +89 -0
- aiecs/main.py +140 -121
- aiecs/scripts/aid/VERSION_MANAGEMENT.md +138 -0
- aiecs/scripts/aid/__init__.py +19 -0
- aiecs/scripts/aid/module_checker.py +499 -0
- aiecs/scripts/aid/version_manager.py +235 -0
- aiecs/scripts/{DEPENDENCY_SYSTEM_SUMMARY.md → dependance_check/DEPENDENCY_SYSTEM_SUMMARY.md} +1 -0
- aiecs/scripts/{README_DEPENDENCY_CHECKER.md → dependance_check/README_DEPENDENCY_CHECKER.md} +1 -0
- aiecs/scripts/dependance_check/__init__.py +15 -0
- aiecs/scripts/dependance_check/dependency_checker.py +1835 -0
- aiecs/scripts/{dependency_fixer.py → dependance_check/dependency_fixer.py} +192 -90
- aiecs/scripts/{download_nlp_data.py → dependance_check/download_nlp_data.py} +203 -71
- aiecs/scripts/dependance_patch/__init__.py +7 -0
- aiecs/scripts/dependance_patch/fix_weasel/__init__.py +11 -0
- aiecs/scripts/{fix_weasel_validator.py → dependance_patch/fix_weasel/fix_weasel_validator.py} +21 -14
- aiecs/scripts/{patch_weasel_library.sh → dependance_patch/fix_weasel/patch_weasel_library.sh} +1 -1
- aiecs/scripts/knowledge_graph/__init__.py +3 -0
- aiecs/scripts/knowledge_graph/run_threshold_experiments.py +212 -0
- aiecs/scripts/migrations/multi_tenancy/README.md +142 -0
- aiecs/scripts/tools_develop/README.md +671 -0
- aiecs/scripts/tools_develop/README_CONFIG_CHECKER.md +273 -0
- aiecs/scripts/tools_develop/TOOLS_CONFIG_GUIDE.md +1287 -0
- aiecs/scripts/tools_develop/TOOL_AUTO_DISCOVERY.md +234 -0
- aiecs/scripts/tools_develop/__init__.py +21 -0
- aiecs/scripts/tools_develop/check_all_tools_config.py +548 -0
- aiecs/scripts/tools_develop/check_type_annotations.py +257 -0
- aiecs/scripts/tools_develop/pre-commit-schema-coverage.sh +66 -0
- aiecs/scripts/tools_develop/schema_coverage.py +511 -0
- aiecs/scripts/tools_develop/validate_tool_schemas.py +475 -0
- aiecs/scripts/tools_develop/verify_executor_config_fix.py +98 -0
- aiecs/scripts/tools_develop/verify_tools.py +352 -0
- aiecs/tasks/__init__.py +0 -1
- aiecs/tasks/worker.py +115 -47
- aiecs/tools/__init__.py +194 -72
- aiecs/tools/apisource/__init__.py +99 -0
- aiecs/tools/apisource/intelligence/__init__.py +19 -0
- aiecs/tools/apisource/intelligence/data_fusion.py +632 -0
- aiecs/tools/apisource/intelligence/query_analyzer.py +417 -0
- aiecs/tools/apisource/intelligence/search_enhancer.py +385 -0
- aiecs/tools/apisource/monitoring/__init__.py +9 -0
- aiecs/tools/apisource/monitoring/metrics.py +330 -0
- aiecs/tools/apisource/providers/__init__.py +112 -0
- aiecs/tools/apisource/providers/base.py +671 -0
- aiecs/tools/apisource/providers/census.py +397 -0
- aiecs/tools/apisource/providers/fred.py +535 -0
- aiecs/tools/apisource/providers/newsapi.py +409 -0
- aiecs/tools/apisource/providers/worldbank.py +352 -0
- aiecs/tools/apisource/reliability/__init__.py +12 -0
- aiecs/tools/apisource/reliability/error_handler.py +363 -0
- aiecs/tools/apisource/reliability/fallback_strategy.py +376 -0
- aiecs/tools/apisource/tool.py +832 -0
- aiecs/tools/apisource/utils/__init__.py +9 -0
- aiecs/tools/apisource/utils/validators.py +334 -0
- aiecs/tools/base_tool.py +415 -21
- aiecs/tools/docs/__init__.py +121 -0
- aiecs/tools/docs/ai_document_orchestrator.py +607 -0
- aiecs/tools/docs/ai_document_writer_orchestrator.py +2350 -0
- aiecs/tools/docs/content_insertion_tool.py +1320 -0
- aiecs/tools/docs/document_creator_tool.py +1323 -0
- aiecs/tools/docs/document_layout_tool.py +1160 -0
- aiecs/tools/docs/document_parser_tool.py +1011 -0
- aiecs/tools/docs/document_writer_tool.py +1829 -0
- aiecs/tools/knowledge_graph/__init__.py +17 -0
- aiecs/tools/knowledge_graph/graph_reasoning_tool.py +807 -0
- aiecs/tools/knowledge_graph/graph_search_tool.py +944 -0
- aiecs/tools/knowledge_graph/kg_builder_tool.py +524 -0
- aiecs/tools/langchain_adapter.py +300 -138
- aiecs/tools/schema_generator.py +455 -0
- aiecs/tools/search_tool/__init__.py +100 -0
- aiecs/tools/search_tool/analyzers.py +581 -0
- aiecs/tools/search_tool/cache.py +264 -0
- aiecs/tools/search_tool/constants.py +128 -0
- aiecs/tools/search_tool/context.py +224 -0
- aiecs/tools/search_tool/core.py +778 -0
- aiecs/tools/search_tool/deduplicator.py +119 -0
- aiecs/tools/search_tool/error_handler.py +242 -0
- aiecs/tools/search_tool/metrics.py +343 -0
- aiecs/tools/search_tool/rate_limiter.py +172 -0
- aiecs/tools/search_tool/schemas.py +275 -0
- aiecs/tools/statistics/__init__.py +80 -0
- aiecs/tools/statistics/ai_data_analysis_orchestrator.py +646 -0
- aiecs/tools/statistics/ai_insight_generator_tool.py +508 -0
- aiecs/tools/statistics/ai_report_orchestrator_tool.py +684 -0
- aiecs/tools/statistics/data_loader_tool.py +555 -0
- aiecs/tools/statistics/data_profiler_tool.py +638 -0
- aiecs/tools/statistics/data_transformer_tool.py +580 -0
- aiecs/tools/statistics/data_visualizer_tool.py +498 -0
- aiecs/tools/statistics/model_trainer_tool.py +507 -0
- aiecs/tools/statistics/statistical_analyzer_tool.py +472 -0
- aiecs/tools/task_tools/__init__.py +49 -36
- aiecs/tools/task_tools/chart_tool.py +200 -184
- aiecs/tools/task_tools/classfire_tool.py +268 -267
- aiecs/tools/task_tools/image_tool.py +175 -131
- aiecs/tools/task_tools/office_tool.py +226 -146
- aiecs/tools/task_tools/pandas_tool.py +477 -121
- aiecs/tools/task_tools/report_tool.py +390 -142
- aiecs/tools/task_tools/research_tool.py +149 -79
- aiecs/tools/task_tools/scraper_tool.py +339 -145
- aiecs/tools/task_tools/stats_tool.py +448 -209
- aiecs/tools/temp_file_manager.py +26 -24
- aiecs/tools/tool_executor/__init__.py +18 -16
- aiecs/tools/tool_executor/tool_executor.py +364 -52
- aiecs/utils/LLM_output_structor.py +74 -48
- aiecs/utils/__init__.py +14 -3
- aiecs/utils/base_callback.py +0 -3
- aiecs/utils/cache_provider.py +696 -0
- aiecs/utils/execution_utils.py +50 -31
- aiecs/utils/prompt_loader.py +1 -0
- aiecs/utils/token_usage_repository.py +37 -11
- aiecs/ws/socket_server.py +14 -4
- {aiecs-1.0.1.dist-info → aiecs-1.7.6.dist-info}/METADATA +52 -15
- aiecs-1.7.6.dist-info/RECORD +337 -0
- aiecs-1.7.6.dist-info/entry_points.txt +13 -0
- aiecs/config/registry.py +0 -19
- aiecs/domain/context/content_engine.py +0 -982
- aiecs/llm/base_client.py +0 -99
- aiecs/llm/openai_client.py +0 -125
- aiecs/llm/vertex_client.py +0 -186
- aiecs/llm/xai_client.py +0 -184
- aiecs/scripts/dependency_checker.py +0 -857
- aiecs/scripts/quick_dependency_check.py +0 -269
- aiecs/tools/task_tools/search_api.py +0 -7
- aiecs-1.0.1.dist-info/RECORD +0 -90
- aiecs-1.0.1.dist-info/entry_points.txt +0 -7
- /aiecs/scripts/{setup_nlp_data.sh → dependance_check/setup_nlp_data.sh} +0 -0
- /aiecs/scripts/{README_WEASEL_PATCH.md → dependance_patch/fix_weasel/README_WEASEL_PATCH.md} +0 -0
- /aiecs/scripts/{fix_weasel_validator.sh → dependance_patch/fix_weasel/fix_weasel_validator.sh} +0 -0
- /aiecs/scripts/{run_weasel_patch.sh → dependance_patch/fix_weasel/run_weasel_patch.sh} +0 -0
- {aiecs-1.0.1.dist-info → aiecs-1.7.6.dist-info}/WHEEL +0 -0
- {aiecs-1.0.1.dist-info → aiecs-1.7.6.dist-info}/licenses/LICENSE +0 -0
- {aiecs-1.0.1.dist-info → aiecs-1.7.6.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,632 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Knowledge Fusion Orchestrator
|
|
3
|
+
|
|
4
|
+
High-level orchestrator for cross-document entity merging and knowledge fusion.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from typing import List, Dict, Set, Tuple, Any, Optional
|
|
8
|
+
from aiecs.domain.knowledge_graph.models.entity import Entity
|
|
9
|
+
from aiecs.infrastructure.graph_storage.base import GraphStore
|
|
10
|
+
from aiecs.infrastructure.graph_storage.tenant import (
|
|
11
|
+
TenantContext,
|
|
12
|
+
CrossTenantFusionError,
|
|
13
|
+
)
|
|
14
|
+
from aiecs.application.knowledge_graph.fusion.entity_deduplicator import (
|
|
15
|
+
EntityDeduplicator,
|
|
16
|
+
)
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
class KnowledgeFusion:
|
|
20
|
+
"""
|
|
21
|
+
Orchestrate knowledge fusion across multiple documents
|
|
22
|
+
|
|
23
|
+
After extracting entities and relations from multiple documents,
|
|
24
|
+
this class performs cross-document fusion to:
|
|
25
|
+
- Identify entities that appear in multiple documents
|
|
26
|
+
- Merge duplicate entities across documents
|
|
27
|
+
- Resolve conflicts in entity properties
|
|
28
|
+
- Track provenance (which documents contributed to each entity)
|
|
29
|
+
|
|
30
|
+
Example:
|
|
31
|
+
```python
|
|
32
|
+
fusion = KnowledgeFusion(graph_store)
|
|
33
|
+
|
|
34
|
+
# After processing multiple documents
|
|
35
|
+
await fusion.fuse_cross_document_entities(
|
|
36
|
+
entity_types=None,  # None = fuse all entity types; the similarity threshold is a constructor argument
|
|
37
|
+
)
|
|
38
|
+
|
|
39
|
+
print(f"Merged {fusion.entities_merged} entities across documents")
|
|
40
|
+
```
|
|
41
|
+
"""
|
|
42
|
+
|
|
43
|
+
def __init__(
    self,
    graph_store: GraphStore,
    similarity_threshold: float = 0.90,  # High threshold for cross-document fusion
):
    """
    Set up the fusion orchestrator

    Args:
        graph_store: Graph storage holding the entities that will be fused
        similarity_threshold: Minimum similarity for cross-document merging
    """
    # Both run counters start at zero.
    self.entities_merged = self.conflicts_resolved = 0

    self.similarity_threshold = similarity_threshold
    self.graph_store = graph_store
|
|
59
|
+
|
|
60
|
+
async def fuse_cross_document_entities(
    self,
    entity_types: Optional[List[str]] = None,
    context: Optional[TenantContext] = None,
) -> Dict[str, int]:
    """
    Run one cross-document fusion pass over the graph

    Steps performed here:
    1. Reset the per-instance counters
    2. Load candidate entities via ``_query_entities`` (type / tenant aware)
    3. When a context is given, re-filter them by ``context.tenant_id``
       (defense-in-depth on top of the query)
    4. Bucket the entities by type; only same-type entities are compared
    5. For every bucket with at least two entities, ask
       ``_find_merge_groups`` for groups and hand each group of two or more
       members to ``_merge_entity_group``

    How similarity is computed and how the graph is rewritten is decided by
    those helpers, which are defined elsewhere in this class.

    Args:
        entity_types: Optional list of entity types to fuse (None = all types)
        context: Optional tenant context for multi-tenant isolation

    Returns:
        Dictionary with fusion statistics:
        - entities_analyzed: Entities considered after tenant filtering
        - entities_merged: Sum of ``len(group) - 1`` over merged groups
        - conflicts_resolved: Value of ``self.conflicts_resolved`` after the run
        - merge_groups: Number of groups returned by ``_find_merge_groups``
          (groups with fewer than two members are counted but not merged)

    Raises:
        CrossTenantFusionError: Presumably raised by the tenant helpers when
            entities from multiple tenants are detected - nothing in this
            method raises it directly (TODO confirm)
    """
    # Start every run from clean counters.
    self.entities_merged = 0
    self.conflicts_resolved = 0

    candidates = await self._query_entities(entity_types, context)
    if context:
        candidates = self._filter_entities_by_tenant(candidates, context.tenant_id)

    stats = {
        "entities_analyzed": len(candidates),
        "entities_merged": 0,
        "conflicts_resolved": 0,
        "merge_groups": 0,
    }

    # A merge needs at least two entities.
    if len(candidates) < 2:
        return stats

    buckets = self._group_entities_by_type(candidates)
    for same_type in buckets.values():
        if len(same_type) < 2:
            continue

        groups = await self._find_merge_groups(same_type)
        stats["merge_groups"] += len(groups)

        for members in groups:
            if len(members) >= 2:
                await self._merge_entity_group(members)
                # N entities collapse into 1, so N - 1 disappear.
                stats["entities_merged"] += len(members) - 1

    stats["conflicts_resolved"] = self.conflicts_resolved
    return stats
|
|
147
|
+
|
|
148
|
+
async def resolve_property_conflicts(self, entities: List[Entity], strategy: str = "most_complete") -> Entity:
    """
    Resolve conflicts when merging entities with different property values

    The first entity supplies the id, entity type, tenant id and the starting
    property set. Properties of the remaining entities are folded in one by
    one; keys starting with "_" are treated as internal and skipped. When a
    key already holds a different value, ``_resolve_conflict`` picks the
    winner.

    Strategies:
    - "most_complete": Prefer non-empty over empty values (default)
    - "most_recent": Prefer most recent value (requires timestamp in provenance)
    - "most_confident": Prefer value from most confident source (requires confidence score)
    - "longest": Prefer longest string value
    - "keep_all": Keep all conflicting values as a list

    Side effects: ``self.conflicts_resolved`` grows by the number of keys that
    had at least one conflict.

    Args:
        entities: List of entities to merge
        strategy: Conflict resolution strategy

    Returns:
        Merged entity with resolved conflicts. A single-element input is
        returned as-is (same object, not a copy). The result may carry
        "_property_conflicts" (every value seen for each conflicting key) and
        "_provenance_merged" (the "_provenance" entries of all inputs).

    Raises:
        ValueError: If ``entities`` is empty
    """
    if not entities:
        raise ValueError("Cannot merge empty entity list")

    if len(entities) == 1:
        return entities[0]

    # Create a new merged entity (copy first entity as base)
    base = entities[0]
    merged = Entity(
        id=base.id,
        entity_type=base.entity_type,
        properties=base.properties.copy(),
        embedding=base.embedding,
        tenant_id=base.tenant_id,
    )

    conflicting_properties: Dict[str, List[Any]] = {}

    # Remember which input entity supplied the value currently held for each
    # key. Provenance-based strategies ("most_recent", "most_confident") must
    # rank the held value by the provenance of the entity it really came
    # from, which is not necessarily the first entity once three or more
    # entities are merged.
    value_owner: Dict[str, Entity] = {key: base for key in merged.properties}

    # Merge properties from all entities
    for entity in entities[1:]:
        for key, value in entity.properties.items():
            if key.startswith("_"):
                # Skip internal properties (provenance is handled below)
                continue

            if key not in merged.properties:
                # Property doesn't exist in merged, add it
                merged.properties[key] = value
                value_owner[key] = entity
            elif merged.properties[key] != value:
                current = merged.properties[key]

                # Conflict detected - apply resolution strategy
                resolved_value = self._resolve_conflict(
                    key=key,
                    values=[current, value],
                    entities=[value_owner[key], entity],
                    strategy=strategy,
                )

                # Track conflict (first entry is the value held before it)
                conflicting_properties.setdefault(key, [current]).append(value)

                # The incoming entity owns the key only if its value won.
                if resolved_value is value:
                    value_owner[key] = entity

                # Update with resolved value
                merged.properties[key] = resolved_value

    # Store conflicting values for transparency
    if conflicting_properties:
        merged.properties["_property_conflicts"] = conflicting_properties
        self.conflicts_resolved += len(conflicting_properties)

    # Merge provenance information
    provenances = [
        entity.properties["_provenance"]
        for entity in entities
        if entity.properties.get("_provenance")
    ]
    if provenances:
        merged.properties["_provenance_merged"] = provenances

    # Merge embeddings (average if multiple)
    embeddings = [e.embedding for e in entities if e.embedding]
    if len(embeddings) > 1:
        import numpy as np

        # tolist() yields plain Python floats rather than numpy scalars.
        merged.embedding = np.mean(embeddings, axis=0).tolist()
    elif embeddings:
        merged.embedding = embeddings[0]

    return merged
|
|
235
|
+
|
|
236
|
+
def _resolve_conflict(
    self,
    key: str,
    values: List[Any],
    entities: List[Entity],
    strategy: str,
) -> Any:
    """
    Resolve a single property conflict using specified strategy

    ``values[i]`` is the candidate contributed by ``entities[i]``.

    Strategies:
    - "most_complete": drop None / "" / [] / {}; if every remaining value is
      a string return the longest, otherwise the first remaining value
    - "most_recent": value whose entity has the greatest
      ``_provenance["timestamp"]``
    - "most_confident": value whose entity has the greatest
      ``_provenance["confidence"]``
    - "longest": longest value when all values are strings, else the first
    - "keep_all": the ``values`` list itself
    - anything else: the first value

    For the two provenance strategies, entities without the field (or with
    it set to None) are treated as lowest priority and never compared, so a
    mix of "has a timestamp" and "has none" cannot raise a type error. Ties
    go to the earliest entity; if no entity carries the field the first
    value wins.

    Args:
        key: Property key (not consulted when choosing a value)
        values: Conflicting values
        entities: Entities that have these values
        strategy: Resolution strategy

    Returns:
        Resolved value
    """

    def pick_by_provenance(field: str) -> Any:
        # Select the value whose entity has the largest _provenance[field].
        winner = None
        best = None
        for idx, entity in enumerate(entities):
            prov = entity.properties.get("_provenance", {})
            score = prov.get(field) if isinstance(prov, dict) else None
            if score is None:
                continue
            # Strict ">" keeps the earliest entity on ties.
            if best is None or score > best:
                winner, best = idx, score
        return values[0] if winner is None else values[winner]

    if strategy == "most_complete":
        # Prefer non-empty, non-None values; among strings prefer the longest
        non_empty = [v for v in values if v not in (None, "", [], {})]
        if not non_empty:
            return values[0]
        if all(isinstance(v, str) for v in non_empty):
            return max(non_empty, key=len)
        return non_empty[0]

    if strategy == "most_recent":
        return pick_by_provenance("timestamp")

    if strategy == "most_confident":
        return pick_by_provenance("confidence")

    if strategy == "longest":
        # Only meaningful for strings; otherwise fall back to the first value
        if all(isinstance(v, str) for v in values):
            return max(values, key=len)
        return values[0]

    if strategy == "keep_all":
        # Keep all values as a list
        return values

    # Unknown strategy: return first value
    return values[0]
|
|
309
|
+
|
|
310
|
+
async def track_entity_provenance(self, entity_id: str) -> List[str]:
    """
    Get list of documents that contributed to an entity

    Sources are read from the entity's ``_provenance`` property (a single
    provenance dict) and from ``_provenance_merged`` (a list of provenance
    dicts). Records that are not dicts, or that have no ``source`` key,
    are ignored.

    Args:
        entity_id: Entity ID

    Returns:
        List of distinct document sources in first-seen order
        (``_provenance`` first, then ``_provenance_merged`` in list order).
        Empty list if the graph store returns no entity for ``entity_id``.
    """
    entity = await self.graph_store.get_entity(entity_id)
    if not entity:
        return []

    properties = entity.properties

    # Single provenance record first, then any records kept from merges.
    records = [properties.get("_provenance")]
    merged_records = properties.get("_provenance_merged")
    if isinstance(merged_records, list):
        records.extend(merged_records)

    sources = [
        record["source"]
        for record in records
        if isinstance(record, dict) and "source" in record
    ]

    # dict.fromkeys drops duplicates while keeping insertion order, so the
    # result is deterministic (a plain set would return an arbitrary order).
    return list(dict.fromkeys(sources))
|
|
341
|
+
|
|
342
|
+
# =========================================================================
|
|
343
|
+
# Helper Methods for Cross-Document Fusion
|
|
344
|
+
# =========================================================================
|
|
345
|
+
|
|
346
|
+
async def _query_entities(
|
|
347
|
+
self,
|
|
348
|
+
entity_types: Optional[List[str]] = None,
|
|
349
|
+
context: Optional[TenantContext] = None,
|
|
350
|
+
) -> List[Entity]:
|
|
351
|
+
"""
|
|
352
|
+
Query entities from graph store with tenant filtering
|
|
353
|
+
|
|
354
|
+
Args:
|
|
355
|
+
entity_types: Optional list of entity types to query
|
|
356
|
+
context: Optional tenant context for filtering
|
|
357
|
+
|
|
358
|
+
Returns:
|
|
359
|
+
List of entities (filtered by tenant if context provided)
|
|
360
|
+
"""
|
|
361
|
+
entities = []
|
|
362
|
+
|
|
363
|
+
# Check if graph store has get_all_entities method
|
|
364
|
+
if hasattr(self.graph_store, "get_all_entities"):
|
|
365
|
+
if entity_types:
|
|
366
|
+
# Query each type separately
|
|
367
|
+
for entity_type in entity_types:
|
|
368
|
+
# Pass context to ensure tenant filtering at storage layer
|
|
369
|
+
if context:
|
|
370
|
+
type_entities = await self.graph_store.get_all_entities(
|
|
371
|
+
entity_type=entity_type, context=context
|
|
372
|
+
)
|
|
373
|
+
else:
|
|
374
|
+
type_entities = await self.graph_store.get_all_entities(
|
|
375
|
+
entity_type=entity_type
|
|
376
|
+
)
|
|
377
|
+
entities.extend(type_entities)
|
|
378
|
+
else:
|
|
379
|
+
# Query all entities
|
|
380
|
+
if context:
|
|
381
|
+
entities = await self.graph_store.get_all_entities(context=context)
|
|
382
|
+
else:
|
|
383
|
+
entities = await self.graph_store.get_all_entities()
|
|
384
|
+
else:
|
|
385
|
+
# Fallback: graph store doesn't support bulk queries
|
|
386
|
+
# This is a limitation - we can't efficiently query all entities
|
|
387
|
+
# In this case, return empty list
|
|
388
|
+
# Note: Implementations should add get_all_entities() method
|
|
389
|
+
pass
|
|
390
|
+
|
|
391
|
+
return entities
|
|
392
|
+
|
|
393
|
+
def _filter_entities_by_tenant(
|
|
394
|
+
self, entities: List[Entity], tenant_id: str
|
|
395
|
+
) -> List[Entity]:
|
|
396
|
+
"""
|
|
397
|
+
Filter entities to only those belonging to the specified tenant.
|
|
398
|
+
|
|
399
|
+
This is a defense-in-depth mechanism in addition to storage-level filtering.
|
|
400
|
+
Silently filters out entities from other tenants.
|
|
401
|
+
|
|
402
|
+
Args:
|
|
403
|
+
entities: List of entities to filter
|
|
404
|
+
tenant_id: Target tenant ID
|
|
405
|
+
|
|
406
|
+
Returns:
|
|
407
|
+
List of entities belonging to the specified tenant
|
|
408
|
+
"""
|
|
409
|
+
return [e for e in entities if e.tenant_id == tenant_id]
|
|
410
|
+
|
|
411
|
+
def _group_entities_by_type(self, entities: List[Entity]) -> Dict[str, List[Entity]]:
|
|
412
|
+
"""
|
|
413
|
+
Group entities by their type
|
|
414
|
+
|
|
415
|
+
Args:
|
|
416
|
+
entities: List of entities
|
|
417
|
+
|
|
418
|
+
Returns:
|
|
419
|
+
Dictionary mapping entity type to list of entities
|
|
420
|
+
"""
|
|
421
|
+
entities_by_type: Dict[str, List[Entity]] = {}
|
|
422
|
+
|
|
423
|
+
for entity in entities:
|
|
424
|
+
entity_type = entity.entity_type
|
|
425
|
+
if entity_type not in entities_by_type:
|
|
426
|
+
entities_by_type[entity_type] = []
|
|
427
|
+
entities_by_type[entity_type].append(entity)
|
|
428
|
+
|
|
429
|
+
return entities_by_type
|
|
430
|
+
|
|
431
|
+
async def _find_merge_groups(self, entities: List[Entity]) -> List[List[Entity]]:
|
|
432
|
+
"""
|
|
433
|
+
Find groups of entities that should be merged together
|
|
434
|
+
|
|
435
|
+
Uses similarity matching to identify clusters of similar entities.
|
|
436
|
+
Entities are grouped using connected components algorithm.
|
|
437
|
+
|
|
438
|
+
Args:
|
|
439
|
+
entities: List of entities (all same type)
|
|
440
|
+
|
|
441
|
+
Returns:
|
|
442
|
+
List of merge groups (each group is a list of entities)
|
|
443
|
+
"""
|
|
444
|
+
if len(entities) < 2:
|
|
445
|
+
return []
|
|
446
|
+
|
|
447
|
+
# Build similarity graph
|
|
448
|
+
n = len(entities)
|
|
449
|
+
similar_pairs: Set[Tuple[int, int]] = set()
|
|
450
|
+
|
|
451
|
+
# Compare all pairs
|
|
452
|
+
for i in range(n):
|
|
453
|
+
for j in range(i + 1, n):
|
|
454
|
+
similarity = await self._compute_entity_similarity(entities[i], entities[j])
|
|
455
|
+
if similarity >= self.similarity_threshold:
|
|
456
|
+
similar_pairs.add((i, j))
|
|
457
|
+
|
|
458
|
+
# Find connected components (merge groups)
|
|
459
|
+
merge_groups = self._find_connected_components(n, similar_pairs)
|
|
460
|
+
|
|
461
|
+
# Convert indices to entities
|
|
462
|
+
entity_groups = []
|
|
463
|
+
for group_indices in merge_groups:
|
|
464
|
+
if len(group_indices) >= 2: # Only groups with 2+ entities
|
|
465
|
+
entity_group = [entities[i] for i in group_indices]
|
|
466
|
+
entity_groups.append(entity_group)
|
|
467
|
+
|
|
468
|
+
return entity_groups
|
|
469
|
+
|
|
470
|
+
def _find_connected_components(self, n: int, edges: Set[Tuple[int, int]]) -> List[List[int]]:
|
|
471
|
+
"""
|
|
472
|
+
Find connected components in an undirected graph
|
|
473
|
+
|
|
474
|
+
Uses Union-Find (Disjoint Set Union) algorithm.
|
|
475
|
+
|
|
476
|
+
Args:
|
|
477
|
+
n: Number of nodes
|
|
478
|
+
edges: Set of edges (pairs of node indices)
|
|
479
|
+
|
|
480
|
+
Returns:
|
|
481
|
+
List of components (each component is a list of node indices)
|
|
482
|
+
"""
|
|
483
|
+
# Initialize parent array for Union-Find
|
|
484
|
+
parent = list(range(n))
|
|
485
|
+
|
|
486
|
+
def find(x: int) -> int:
|
|
487
|
+
"""Find root of x with path compression"""
|
|
488
|
+
if parent[x] != x:
|
|
489
|
+
parent[x] = find(parent[x])
|
|
490
|
+
return parent[x]
|
|
491
|
+
|
|
492
|
+
def union(x: int, y: int) -> None:
|
|
493
|
+
"""Union two sets"""
|
|
494
|
+
root_x = find(x)
|
|
495
|
+
root_y = find(y)
|
|
496
|
+
if root_x != root_y:
|
|
497
|
+
parent[root_x] = root_y
|
|
498
|
+
|
|
499
|
+
# Build connected components
|
|
500
|
+
for i, j in edges:
|
|
501
|
+
union(i, j)
|
|
502
|
+
|
|
503
|
+
# Group nodes by their root
|
|
504
|
+
components: Dict[int, List[int]] = {}
|
|
505
|
+
for i in range(n):
|
|
506
|
+
root = find(i)
|
|
507
|
+
if root not in components:
|
|
508
|
+
components[root] = []
|
|
509
|
+
components[root].append(i)
|
|
510
|
+
|
|
511
|
+
return list(components.values())
|
|
512
|
+
|
|
513
|
+
async def _compute_entity_similarity(self, entity1: Entity, entity2: Entity) -> float:
    """
    Score how similar two entities are

    Delegates to a freshly constructed EntityDeduplicator, configured
    with this object's ``similarity_threshold``, and returns whatever its
    ``_compute_similarity`` coroutine produces.

    Args:
        entity1: First entity
        entity2: Second entity

    Returns:
        Similarity score (0.0-1.0)
    """
    scorer = EntityDeduplicator(similarity_threshold=self.similarity_threshold)
    score = await scorer._compute_similarity(entity1, entity2)
    return score
|
|
529
|
+
|
|
530
|
+
async def _merge_entity_group(self, entities: List[Entity]) -> None:
|
|
531
|
+
"""
|
|
532
|
+
Merge a group of entities into a single canonical entity
|
|
533
|
+
|
|
534
|
+
Steps:
|
|
535
|
+
1. Resolve property conflicts to create merged entity
|
|
536
|
+
2. Update graph: replace all entities with merged entity
|
|
537
|
+
3. Update relations: redirect to merged entity
|
|
538
|
+
4. Delete old entities
|
|
539
|
+
|
|
540
|
+
Args:
|
|
541
|
+
entities: List of entities to merge (2 or more)
|
|
542
|
+
"""
|
|
543
|
+
if len(entities) < 2:
|
|
544
|
+
return
|
|
545
|
+
|
|
546
|
+
# Step 1: Resolve conflicts and create merged entity
|
|
547
|
+
merged_entity = await self.resolve_property_conflicts(entities)
|
|
548
|
+
|
|
549
|
+
# Track merge provenance
|
|
550
|
+
merged_entity.properties["_merged_from"] = [e.id for e in entities]
|
|
551
|
+
merged_entity.properties["_merge_count"] = len(entities)
|
|
552
|
+
|
|
553
|
+
# Step 2: Add merged entity to graph (use first entity's ID as
|
|
554
|
+
# canonical)
|
|
555
|
+
canonical_id = entities[0].id
|
|
556
|
+
merged_entity.id = canonical_id
|
|
557
|
+
|
|
558
|
+
# Update entity in graph
|
|
559
|
+
# Try update_entity if available, otherwise delete and re-add
|
|
560
|
+
if hasattr(self.graph_store, "update_entity"):
|
|
561
|
+
await self.graph_store.update_entity(merged_entity)
|
|
562
|
+
else:
|
|
563
|
+
# Delete old entity and add merged one
|
|
564
|
+
# For InMemoryGraphStore, we need to manually update
|
|
565
|
+
if hasattr(self.graph_store, "entities"):
|
|
566
|
+
# Direct update for InMemoryGraphStore
|
|
567
|
+
self.graph_store.entities[canonical_id] = merged_entity
|
|
568
|
+
if hasattr(self.graph_store, "graph") and self.graph_store.graph:
|
|
569
|
+
self.graph_store.graph.nodes[canonical_id]["entity"] = merged_entity
|
|
570
|
+
else:
|
|
571
|
+
# Fallback: try to add (may fail if exists)
|
|
572
|
+
try:
|
|
573
|
+
await self.graph_store.add_entity(merged_entity)
|
|
574
|
+
except ValueError:
|
|
575
|
+
# Entity already exists, skip
|
|
576
|
+
pass
|
|
577
|
+
|
|
578
|
+
# Step 3: Update relations pointing to merged entities
|
|
579
|
+
await self._update_relations_for_merge(entities, canonical_id)
|
|
580
|
+
|
|
581
|
+
# Step 4: Delete old entities (except canonical)
|
|
582
|
+
for entity in entities[1:]:
|
|
583
|
+
# Delete entity from graph
|
|
584
|
+
if hasattr(self.graph_store, "delete_entity"):
|
|
585
|
+
await self.graph_store.delete_entity(entity.id)
|
|
586
|
+
|
|
587
|
+
# Update counter
|
|
588
|
+
self.entities_merged += len(entities) - 1
|
|
589
|
+
|
|
590
|
+
async def _update_relations_for_merge(self, merged_entities: List[Entity], canonical_id: str) -> None:
|
|
591
|
+
"""
|
|
592
|
+
Update relations to point to canonical merged entity
|
|
593
|
+
|
|
594
|
+
For each merged entity (except canonical):
|
|
595
|
+
- Find all relations where it's source or target
|
|
596
|
+
- Update relation to use canonical_id instead
|
|
597
|
+
- Remove duplicate relations
|
|
598
|
+
|
|
599
|
+
Args:
|
|
600
|
+
merged_entities: List of entities that were merged
|
|
601
|
+
canonical_id: ID of canonical entity
|
|
602
|
+
"""
|
|
603
|
+
{e.id for e in merged_entities}
|
|
604
|
+
|
|
605
|
+
# For each merged entity (except canonical)
|
|
606
|
+
for entity in merged_entities:
|
|
607
|
+
if entity.id == canonical_id:
|
|
608
|
+
continue
|
|
609
|
+
|
|
610
|
+
# Get outgoing relations
|
|
611
|
+
if hasattr(self.graph_store, "get_outgoing_relations"):
|
|
612
|
+
outgoing = await self.graph_store.get_outgoing_relations(entity.id)
|
|
613
|
+
for relation in outgoing:
|
|
614
|
+
# Update source to canonical
|
|
615
|
+
relation.source_id = canonical_id
|
|
616
|
+
await self.graph_store.add_relation(relation)
|
|
617
|
+
|
|
618
|
+
# Get incoming relations
|
|
619
|
+
if hasattr(self.graph_store, "get_incoming_relations"):
|
|
620
|
+
incoming = await self.graph_store.get_incoming_relations(entity.id)
|
|
621
|
+
for relation in incoming:
|
|
622
|
+
# Update target to canonical
|
|
623
|
+
relation.target_id = canonical_id
|
|
624
|
+
await self.graph_store.add_relation(relation)
|
|
625
|
+
|
|
626
|
+
# Alternative: use get_neighbors to find relations
|
|
627
|
+
# This is less efficient but works with basic GraphStore interface
|
|
628
|
+
if not hasattr(self.graph_store, "get_outgoing_relations"):
|
|
629
|
+
# Get neighbors (this implicitly uses relations)
|
|
630
|
+
# We can't easily update relations without direct access
|
|
631
|
+
# This is a limitation of the basic interface
|
|
632
|
+
pass
|