aiecs 1.0.1__py3-none-any.whl → 1.7.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of aiecs might be problematic. Click here for more details.
- aiecs/__init__.py +13 -16
- aiecs/__main__.py +7 -7
- aiecs/aiecs_client.py +269 -75
- aiecs/application/executors/operation_executor.py +79 -54
- aiecs/application/knowledge_graph/__init__.py +7 -0
- aiecs/application/knowledge_graph/builder/__init__.py +37 -0
- aiecs/application/knowledge_graph/builder/data_quality.py +302 -0
- aiecs/application/knowledge_graph/builder/data_reshaping.py +293 -0
- aiecs/application/knowledge_graph/builder/document_builder.py +369 -0
- aiecs/application/knowledge_graph/builder/graph_builder.py +490 -0
- aiecs/application/knowledge_graph/builder/import_optimizer.py +396 -0
- aiecs/application/knowledge_graph/builder/schema_inference.py +462 -0
- aiecs/application/knowledge_graph/builder/schema_mapping.py +563 -0
- aiecs/application/knowledge_graph/builder/structured_pipeline.py +1384 -0
- aiecs/application/knowledge_graph/builder/text_chunker.py +317 -0
- aiecs/application/knowledge_graph/extractors/__init__.py +27 -0
- aiecs/application/knowledge_graph/extractors/base.py +98 -0
- aiecs/application/knowledge_graph/extractors/llm_entity_extractor.py +422 -0
- aiecs/application/knowledge_graph/extractors/llm_relation_extractor.py +347 -0
- aiecs/application/knowledge_graph/extractors/ner_entity_extractor.py +241 -0
- aiecs/application/knowledge_graph/fusion/__init__.py +78 -0
- aiecs/application/knowledge_graph/fusion/ab_testing.py +395 -0
- aiecs/application/knowledge_graph/fusion/abbreviation_expander.py +327 -0
- aiecs/application/knowledge_graph/fusion/alias_index.py +597 -0
- aiecs/application/knowledge_graph/fusion/alias_matcher.py +384 -0
- aiecs/application/knowledge_graph/fusion/cache_coordinator.py +343 -0
- aiecs/application/knowledge_graph/fusion/entity_deduplicator.py +433 -0
- aiecs/application/knowledge_graph/fusion/entity_linker.py +511 -0
- aiecs/application/knowledge_graph/fusion/evaluation_dataset.py +240 -0
- aiecs/application/knowledge_graph/fusion/knowledge_fusion.py +632 -0
- aiecs/application/knowledge_graph/fusion/matching_config.py +489 -0
- aiecs/application/knowledge_graph/fusion/name_normalizer.py +352 -0
- aiecs/application/knowledge_graph/fusion/relation_deduplicator.py +183 -0
- aiecs/application/knowledge_graph/fusion/semantic_name_matcher.py +464 -0
- aiecs/application/knowledge_graph/fusion/similarity_pipeline.py +534 -0
- aiecs/application/knowledge_graph/pattern_matching/__init__.py +21 -0
- aiecs/application/knowledge_graph/pattern_matching/pattern_matcher.py +342 -0
- aiecs/application/knowledge_graph/pattern_matching/query_executor.py +366 -0
- aiecs/application/knowledge_graph/profiling/__init__.py +12 -0
- aiecs/application/knowledge_graph/profiling/query_plan_visualizer.py +195 -0
- aiecs/application/knowledge_graph/profiling/query_profiler.py +223 -0
- aiecs/application/knowledge_graph/reasoning/__init__.py +27 -0
- aiecs/application/knowledge_graph/reasoning/evidence_synthesis.py +341 -0
- aiecs/application/knowledge_graph/reasoning/inference_engine.py +500 -0
- aiecs/application/knowledge_graph/reasoning/logic_form_parser.py +163 -0
- aiecs/application/knowledge_graph/reasoning/logic_parser/__init__.py +79 -0
- aiecs/application/knowledge_graph/reasoning/logic_parser/ast_builder.py +513 -0
- aiecs/application/knowledge_graph/reasoning/logic_parser/ast_nodes.py +913 -0
- aiecs/application/knowledge_graph/reasoning/logic_parser/ast_validator.py +866 -0
- aiecs/application/knowledge_graph/reasoning/logic_parser/error_handler.py +475 -0
- aiecs/application/knowledge_graph/reasoning/logic_parser/parser.py +396 -0
- aiecs/application/knowledge_graph/reasoning/logic_parser/query_context.py +208 -0
- aiecs/application/knowledge_graph/reasoning/logic_query_integration.py +170 -0
- aiecs/application/knowledge_graph/reasoning/query_planner.py +855 -0
- aiecs/application/knowledge_graph/reasoning/reasoning_engine.py +518 -0
- aiecs/application/knowledge_graph/retrieval/__init__.py +27 -0
- aiecs/application/knowledge_graph/retrieval/query_intent_classifier.py +211 -0
- aiecs/application/knowledge_graph/retrieval/retrieval_strategies.py +592 -0
- aiecs/application/knowledge_graph/retrieval/strategy_types.py +23 -0
- aiecs/application/knowledge_graph/search/__init__.py +59 -0
- aiecs/application/knowledge_graph/search/hybrid_search.py +457 -0
- aiecs/application/knowledge_graph/search/reranker.py +293 -0
- aiecs/application/knowledge_graph/search/reranker_strategies.py +535 -0
- aiecs/application/knowledge_graph/search/text_similarity.py +392 -0
- aiecs/application/knowledge_graph/traversal/__init__.py +15 -0
- aiecs/application/knowledge_graph/traversal/enhanced_traversal.py +305 -0
- aiecs/application/knowledge_graph/traversal/path_scorer.py +271 -0
- aiecs/application/knowledge_graph/validators/__init__.py +13 -0
- aiecs/application/knowledge_graph/validators/relation_validator.py +239 -0
- aiecs/application/knowledge_graph/visualization/__init__.py +11 -0
- aiecs/application/knowledge_graph/visualization/graph_visualizer.py +313 -0
- aiecs/common/__init__.py +9 -0
- aiecs/common/knowledge_graph/__init__.py +17 -0
- aiecs/common/knowledge_graph/runnable.py +471 -0
- aiecs/config/__init__.py +20 -5
- aiecs/config/config.py +762 -31
- aiecs/config/graph_config.py +131 -0
- aiecs/config/tool_config.py +399 -0
- aiecs/core/__init__.py +29 -13
- aiecs/core/interface/__init__.py +2 -2
- aiecs/core/interface/execution_interface.py +22 -22
- aiecs/core/interface/storage_interface.py +37 -88
- aiecs/core/registry/__init__.py +31 -0
- aiecs/core/registry/service_registry.py +92 -0
- aiecs/domain/__init__.py +270 -1
- aiecs/domain/agent/__init__.py +191 -0
- aiecs/domain/agent/base_agent.py +3870 -0
- aiecs/domain/agent/exceptions.py +99 -0
- aiecs/domain/agent/graph_aware_mixin.py +569 -0
- aiecs/domain/agent/hybrid_agent.py +1435 -0
- aiecs/domain/agent/integration/__init__.py +29 -0
- aiecs/domain/agent/integration/context_compressor.py +216 -0
- aiecs/domain/agent/integration/context_engine_adapter.py +587 -0
- aiecs/domain/agent/integration/protocols.py +281 -0
- aiecs/domain/agent/integration/retry_policy.py +218 -0
- aiecs/domain/agent/integration/role_config.py +213 -0
- aiecs/domain/agent/knowledge_aware_agent.py +1892 -0
- aiecs/domain/agent/lifecycle.py +291 -0
- aiecs/domain/agent/llm_agent.py +692 -0
- aiecs/domain/agent/memory/__init__.py +12 -0
- aiecs/domain/agent/memory/conversation.py +1124 -0
- aiecs/domain/agent/migration/__init__.py +14 -0
- aiecs/domain/agent/migration/conversion.py +163 -0
- aiecs/domain/agent/migration/legacy_wrapper.py +86 -0
- aiecs/domain/agent/models.py +884 -0
- aiecs/domain/agent/observability.py +479 -0
- aiecs/domain/agent/persistence.py +449 -0
- aiecs/domain/agent/prompts/__init__.py +29 -0
- aiecs/domain/agent/prompts/builder.py +159 -0
- aiecs/domain/agent/prompts/formatters.py +187 -0
- aiecs/domain/agent/prompts/template.py +255 -0
- aiecs/domain/agent/registry.py +253 -0
- aiecs/domain/agent/tool_agent.py +444 -0
- aiecs/domain/agent/tools/__init__.py +15 -0
- aiecs/domain/agent/tools/schema_generator.py +364 -0
- aiecs/domain/community/__init__.py +155 -0
- aiecs/domain/community/agent_adapter.py +469 -0
- aiecs/domain/community/analytics.py +432 -0
- aiecs/domain/community/collaborative_workflow.py +648 -0
- aiecs/domain/community/communication_hub.py +634 -0
- aiecs/domain/community/community_builder.py +320 -0
- aiecs/domain/community/community_integration.py +796 -0
- aiecs/domain/community/community_manager.py +803 -0
- aiecs/domain/community/decision_engine.py +849 -0
- aiecs/domain/community/exceptions.py +231 -0
- aiecs/domain/community/models/__init__.py +33 -0
- aiecs/domain/community/models/community_models.py +234 -0
- aiecs/domain/community/resource_manager.py +461 -0
- aiecs/domain/community/shared_context_manager.py +589 -0
- aiecs/domain/context/__init__.py +40 -10
- aiecs/domain/context/context_engine.py +1910 -0
- aiecs/domain/context/conversation_models.py +87 -53
- aiecs/domain/context/graph_memory.py +582 -0
- aiecs/domain/execution/model.py +12 -4
- aiecs/domain/knowledge_graph/__init__.py +19 -0
- aiecs/domain/knowledge_graph/models/__init__.py +52 -0
- aiecs/domain/knowledge_graph/models/entity.py +148 -0
- aiecs/domain/knowledge_graph/models/evidence.py +178 -0
- aiecs/domain/knowledge_graph/models/inference_rule.py +184 -0
- aiecs/domain/knowledge_graph/models/path.py +171 -0
- aiecs/domain/knowledge_graph/models/path_pattern.py +171 -0
- aiecs/domain/knowledge_graph/models/query.py +261 -0
- aiecs/domain/knowledge_graph/models/query_plan.py +181 -0
- aiecs/domain/knowledge_graph/models/relation.py +202 -0
- aiecs/domain/knowledge_graph/schema/__init__.py +23 -0
- aiecs/domain/knowledge_graph/schema/entity_type.py +131 -0
- aiecs/domain/knowledge_graph/schema/graph_schema.py +253 -0
- aiecs/domain/knowledge_graph/schema/property_schema.py +143 -0
- aiecs/domain/knowledge_graph/schema/relation_type.py +163 -0
- aiecs/domain/knowledge_graph/schema/schema_manager.py +691 -0
- aiecs/domain/knowledge_graph/schema/type_enums.py +209 -0
- aiecs/domain/task/dsl_processor.py +172 -56
- aiecs/domain/task/model.py +20 -8
- aiecs/domain/task/task_context.py +27 -24
- aiecs/infrastructure/__init__.py +0 -2
- aiecs/infrastructure/graph_storage/__init__.py +11 -0
- aiecs/infrastructure/graph_storage/base.py +837 -0
- aiecs/infrastructure/graph_storage/batch_operations.py +458 -0
- aiecs/infrastructure/graph_storage/cache.py +424 -0
- aiecs/infrastructure/graph_storage/distributed.py +223 -0
- aiecs/infrastructure/graph_storage/error_handling.py +380 -0
- aiecs/infrastructure/graph_storage/graceful_degradation.py +294 -0
- aiecs/infrastructure/graph_storage/health_checks.py +378 -0
- aiecs/infrastructure/graph_storage/in_memory.py +1197 -0
- aiecs/infrastructure/graph_storage/index_optimization.py +446 -0
- aiecs/infrastructure/graph_storage/lazy_loading.py +431 -0
- aiecs/infrastructure/graph_storage/metrics.py +344 -0
- aiecs/infrastructure/graph_storage/migration.py +400 -0
- aiecs/infrastructure/graph_storage/pagination.py +483 -0
- aiecs/infrastructure/graph_storage/performance_monitoring.py +456 -0
- aiecs/infrastructure/graph_storage/postgres.py +1563 -0
- aiecs/infrastructure/graph_storage/property_storage.py +353 -0
- aiecs/infrastructure/graph_storage/protocols.py +76 -0
- aiecs/infrastructure/graph_storage/query_optimizer.py +642 -0
- aiecs/infrastructure/graph_storage/schema_cache.py +290 -0
- aiecs/infrastructure/graph_storage/sqlite.py +1373 -0
- aiecs/infrastructure/graph_storage/streaming.py +487 -0
- aiecs/infrastructure/graph_storage/tenant.py +412 -0
- aiecs/infrastructure/messaging/celery_task_manager.py +92 -54
- aiecs/infrastructure/messaging/websocket_manager.py +51 -35
- aiecs/infrastructure/monitoring/__init__.py +22 -0
- aiecs/infrastructure/monitoring/executor_metrics.py +45 -11
- aiecs/infrastructure/monitoring/global_metrics_manager.py +212 -0
- aiecs/infrastructure/monitoring/structured_logger.py +3 -7
- aiecs/infrastructure/monitoring/tracing_manager.py +63 -35
- aiecs/infrastructure/persistence/__init__.py +14 -1
- aiecs/infrastructure/persistence/context_engine_client.py +184 -0
- aiecs/infrastructure/persistence/database_manager.py +67 -43
- aiecs/infrastructure/persistence/file_storage.py +180 -103
- aiecs/infrastructure/persistence/redis_client.py +74 -21
- aiecs/llm/__init__.py +73 -25
- aiecs/llm/callbacks/__init__.py +11 -0
- aiecs/llm/{custom_callbacks.py → callbacks/custom_callbacks.py} +26 -19
- aiecs/llm/client_factory.py +224 -36
- aiecs/llm/client_resolver.py +155 -0
- aiecs/llm/clients/__init__.py +38 -0
- aiecs/llm/clients/base_client.py +324 -0
- aiecs/llm/clients/google_function_calling_mixin.py +457 -0
- aiecs/llm/clients/googleai_client.py +241 -0
- aiecs/llm/clients/openai_client.py +158 -0
- aiecs/llm/clients/openai_compatible_mixin.py +367 -0
- aiecs/llm/clients/vertex_client.py +897 -0
- aiecs/llm/clients/xai_client.py +201 -0
- aiecs/llm/config/__init__.py +51 -0
- aiecs/llm/config/config_loader.py +272 -0
- aiecs/llm/config/config_validator.py +206 -0
- aiecs/llm/config/model_config.py +143 -0
- aiecs/llm/protocols.py +149 -0
- aiecs/llm/utils/__init__.py +10 -0
- aiecs/llm/utils/validate_config.py +89 -0
- aiecs/main.py +140 -121
- aiecs/scripts/aid/VERSION_MANAGEMENT.md +138 -0
- aiecs/scripts/aid/__init__.py +19 -0
- aiecs/scripts/aid/module_checker.py +499 -0
- aiecs/scripts/aid/version_manager.py +235 -0
- aiecs/scripts/{DEPENDENCY_SYSTEM_SUMMARY.md → dependance_check/DEPENDENCY_SYSTEM_SUMMARY.md} +1 -0
- aiecs/scripts/{README_DEPENDENCY_CHECKER.md → dependance_check/README_DEPENDENCY_CHECKER.md} +1 -0
- aiecs/scripts/dependance_check/__init__.py +15 -0
- aiecs/scripts/dependance_check/dependency_checker.py +1835 -0
- aiecs/scripts/{dependency_fixer.py → dependance_check/dependency_fixer.py} +192 -90
- aiecs/scripts/{download_nlp_data.py → dependance_check/download_nlp_data.py} +203 -71
- aiecs/scripts/dependance_patch/__init__.py +7 -0
- aiecs/scripts/dependance_patch/fix_weasel/__init__.py +11 -0
- aiecs/scripts/{fix_weasel_validator.py → dependance_patch/fix_weasel/fix_weasel_validator.py} +21 -14
- aiecs/scripts/{patch_weasel_library.sh → dependance_patch/fix_weasel/patch_weasel_library.sh} +1 -1
- aiecs/scripts/knowledge_graph/__init__.py +3 -0
- aiecs/scripts/knowledge_graph/run_threshold_experiments.py +212 -0
- aiecs/scripts/migrations/multi_tenancy/README.md +142 -0
- aiecs/scripts/tools_develop/README.md +671 -0
- aiecs/scripts/tools_develop/README_CONFIG_CHECKER.md +273 -0
- aiecs/scripts/tools_develop/TOOLS_CONFIG_GUIDE.md +1287 -0
- aiecs/scripts/tools_develop/TOOL_AUTO_DISCOVERY.md +234 -0
- aiecs/scripts/tools_develop/__init__.py +21 -0
- aiecs/scripts/tools_develop/check_all_tools_config.py +548 -0
- aiecs/scripts/tools_develop/check_type_annotations.py +257 -0
- aiecs/scripts/tools_develop/pre-commit-schema-coverage.sh +66 -0
- aiecs/scripts/tools_develop/schema_coverage.py +511 -0
- aiecs/scripts/tools_develop/validate_tool_schemas.py +475 -0
- aiecs/scripts/tools_develop/verify_executor_config_fix.py +98 -0
- aiecs/scripts/tools_develop/verify_tools.py +352 -0
- aiecs/tasks/__init__.py +0 -1
- aiecs/tasks/worker.py +115 -47
- aiecs/tools/__init__.py +194 -72
- aiecs/tools/apisource/__init__.py +99 -0
- aiecs/tools/apisource/intelligence/__init__.py +19 -0
- aiecs/tools/apisource/intelligence/data_fusion.py +632 -0
- aiecs/tools/apisource/intelligence/query_analyzer.py +417 -0
- aiecs/tools/apisource/intelligence/search_enhancer.py +385 -0
- aiecs/tools/apisource/monitoring/__init__.py +9 -0
- aiecs/tools/apisource/monitoring/metrics.py +330 -0
- aiecs/tools/apisource/providers/__init__.py +112 -0
- aiecs/tools/apisource/providers/base.py +671 -0
- aiecs/tools/apisource/providers/census.py +397 -0
- aiecs/tools/apisource/providers/fred.py +535 -0
- aiecs/tools/apisource/providers/newsapi.py +409 -0
- aiecs/tools/apisource/providers/worldbank.py +352 -0
- aiecs/tools/apisource/reliability/__init__.py +12 -0
- aiecs/tools/apisource/reliability/error_handler.py +363 -0
- aiecs/tools/apisource/reliability/fallback_strategy.py +376 -0
- aiecs/tools/apisource/tool.py +832 -0
- aiecs/tools/apisource/utils/__init__.py +9 -0
- aiecs/tools/apisource/utils/validators.py +334 -0
- aiecs/tools/base_tool.py +415 -21
- aiecs/tools/docs/__init__.py +121 -0
- aiecs/tools/docs/ai_document_orchestrator.py +607 -0
- aiecs/tools/docs/ai_document_writer_orchestrator.py +2350 -0
- aiecs/tools/docs/content_insertion_tool.py +1320 -0
- aiecs/tools/docs/document_creator_tool.py +1323 -0
- aiecs/tools/docs/document_layout_tool.py +1160 -0
- aiecs/tools/docs/document_parser_tool.py +1011 -0
- aiecs/tools/docs/document_writer_tool.py +1829 -0
- aiecs/tools/knowledge_graph/__init__.py +17 -0
- aiecs/tools/knowledge_graph/graph_reasoning_tool.py +807 -0
- aiecs/tools/knowledge_graph/graph_search_tool.py +944 -0
- aiecs/tools/knowledge_graph/kg_builder_tool.py +524 -0
- aiecs/tools/langchain_adapter.py +300 -138
- aiecs/tools/schema_generator.py +455 -0
- aiecs/tools/search_tool/__init__.py +100 -0
- aiecs/tools/search_tool/analyzers.py +581 -0
- aiecs/tools/search_tool/cache.py +264 -0
- aiecs/tools/search_tool/constants.py +128 -0
- aiecs/tools/search_tool/context.py +224 -0
- aiecs/tools/search_tool/core.py +778 -0
- aiecs/tools/search_tool/deduplicator.py +119 -0
- aiecs/tools/search_tool/error_handler.py +242 -0
- aiecs/tools/search_tool/metrics.py +343 -0
- aiecs/tools/search_tool/rate_limiter.py +172 -0
- aiecs/tools/search_tool/schemas.py +275 -0
- aiecs/tools/statistics/__init__.py +80 -0
- aiecs/tools/statistics/ai_data_analysis_orchestrator.py +646 -0
- aiecs/tools/statistics/ai_insight_generator_tool.py +508 -0
- aiecs/tools/statistics/ai_report_orchestrator_tool.py +684 -0
- aiecs/tools/statistics/data_loader_tool.py +555 -0
- aiecs/tools/statistics/data_profiler_tool.py +638 -0
- aiecs/tools/statistics/data_transformer_tool.py +580 -0
- aiecs/tools/statistics/data_visualizer_tool.py +498 -0
- aiecs/tools/statistics/model_trainer_tool.py +507 -0
- aiecs/tools/statistics/statistical_analyzer_tool.py +472 -0
- aiecs/tools/task_tools/__init__.py +49 -36
- aiecs/tools/task_tools/chart_tool.py +200 -184
- aiecs/tools/task_tools/classfire_tool.py +268 -267
- aiecs/tools/task_tools/image_tool.py +175 -131
- aiecs/tools/task_tools/office_tool.py +226 -146
- aiecs/tools/task_tools/pandas_tool.py +477 -121
- aiecs/tools/task_tools/report_tool.py +390 -142
- aiecs/tools/task_tools/research_tool.py +149 -79
- aiecs/tools/task_tools/scraper_tool.py +339 -145
- aiecs/tools/task_tools/stats_tool.py +448 -209
- aiecs/tools/temp_file_manager.py +26 -24
- aiecs/tools/tool_executor/__init__.py +18 -16
- aiecs/tools/tool_executor/tool_executor.py +364 -52
- aiecs/utils/LLM_output_structor.py +74 -48
- aiecs/utils/__init__.py +14 -3
- aiecs/utils/base_callback.py +0 -3
- aiecs/utils/cache_provider.py +696 -0
- aiecs/utils/execution_utils.py +50 -31
- aiecs/utils/prompt_loader.py +1 -0
- aiecs/utils/token_usage_repository.py +37 -11
- aiecs/ws/socket_server.py +14 -4
- {aiecs-1.0.1.dist-info → aiecs-1.7.6.dist-info}/METADATA +52 -15
- aiecs-1.7.6.dist-info/RECORD +337 -0
- aiecs-1.7.6.dist-info/entry_points.txt +13 -0
- aiecs/config/registry.py +0 -19
- aiecs/domain/context/content_engine.py +0 -982
- aiecs/llm/base_client.py +0 -99
- aiecs/llm/openai_client.py +0 -125
- aiecs/llm/vertex_client.py +0 -186
- aiecs/llm/xai_client.py +0 -184
- aiecs/scripts/dependency_checker.py +0 -857
- aiecs/scripts/quick_dependency_check.py +0 -269
- aiecs/tools/task_tools/search_api.py +0 -7
- aiecs-1.0.1.dist-info/RECORD +0 -90
- aiecs-1.0.1.dist-info/entry_points.txt +0 -7
- /aiecs/scripts/{setup_nlp_data.sh → dependance_check/setup_nlp_data.sh} +0 -0
- /aiecs/scripts/{README_WEASEL_PATCH.md → dependance_patch/fix_weasel/README_WEASEL_PATCH.md} +0 -0
- /aiecs/scripts/{fix_weasel_validator.sh → dependance_patch/fix_weasel/fix_weasel_validator.sh} +0 -0
- /aiecs/scripts/{run_weasel_patch.sh → dependance_patch/fix_weasel/run_weasel_patch.sh} +0 -0
- {aiecs-1.0.1.dist-info → aiecs-1.7.6.dist-info}/WHEEL +0 -0
- {aiecs-1.0.1.dist-info → aiecs-1.7.6.dist-info}/licenses/LICENSE +0 -0
- {aiecs-1.0.1.dist-info → aiecs-1.7.6.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,433 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Entity Deduplicator
|
|
3
|
+
|
|
4
|
+
Identifies and merges duplicate entities based on similarity matching.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from typing import List, Dict, Optional, Tuple, Set, TYPE_CHECKING
|
|
8
|
+
from difflib import SequenceMatcher
|
|
9
|
+
from aiecs.domain.knowledge_graph.models.entity import Entity
|
|
10
|
+
from aiecs.infrastructure.graph_storage.tenant import (
|
|
11
|
+
TenantContext,
|
|
12
|
+
CrossTenantFusionError,
|
|
13
|
+
)
|
|
14
|
+
|
|
15
|
+
if TYPE_CHECKING:
|
|
16
|
+
from aiecs.application.knowledge_graph.fusion.similarity_pipeline import (
|
|
17
|
+
SimilarityPipeline,
|
|
18
|
+
)
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
class EntityDeduplicator:
|
|
22
|
+
"""
|
|
23
|
+
Deduplicate entities based on similarity
|
|
24
|
+
|
|
25
|
+
When extracting entities from text, it's common to get duplicates:
|
|
26
|
+
- "Apple Inc." vs "Apple" vs "Apple Incorporated"
|
|
27
|
+
- "John Smith" vs "J. Smith" vs "Smith, John"
|
|
28
|
+
- "New York" vs "New York City" vs "NYC"
|
|
29
|
+
|
|
30
|
+
This class identifies such duplicates and merges them into canonical entities.
|
|
31
|
+
|
|
32
|
+
Features:
|
|
33
|
+
- Name-based fuzzy matching
|
|
34
|
+
- Type-aware matching (only match entities of same type)
|
|
35
|
+
- Property-based matching (use properties to improve matching)
|
|
36
|
+
- Configurable similarity threshold
|
|
37
|
+
- Embedding-based matching (when embeddings available)
|
|
38
|
+
|
|
39
|
+
Example:
|
|
40
|
+
```python
|
|
41
|
+
deduplicator = EntityDeduplicator(similarity_threshold=0.85)
|
|
42
|
+
|
|
43
|
+
entities = [
|
|
44
|
+
Entity(type="Company", properties={"name": "Apple Inc."}),
|
|
45
|
+
Entity(type="Company", properties={"name": "Apple"}),
|
|
46
|
+
Entity(type="Company", properties={"name": "Microsoft"})
|
|
47
|
+
]
|
|
48
|
+
|
|
49
|
+
deduplicated = await deduplicator.deduplicate(entities)
|
|
50
|
+
# Returns: [
|
|
51
|
+
# Entity(type="Company", properties={"name": "Apple Inc.", "_aliases": ["Apple"]}),
|
|
52
|
+
# Entity(type="Company", properties={"name": "Microsoft"})
|
|
53
|
+
# ]
|
|
54
|
+
```
|
|
55
|
+
"""
|
|
56
|
+
|
|
57
|
+
def __init__(
|
|
58
|
+
self,
|
|
59
|
+
similarity_threshold: float = 0.85,
|
|
60
|
+
use_embeddings: bool = True,
|
|
61
|
+
embedding_threshold: float = 0.90,
|
|
62
|
+
similarity_pipeline: Optional["SimilarityPipeline"] = None,
|
|
63
|
+
):
|
|
64
|
+
"""
|
|
65
|
+
Initialize entity deduplicator
|
|
66
|
+
|
|
67
|
+
Args:
|
|
68
|
+
similarity_threshold: Minimum similarity score to consider entities as duplicates (0.0-1.0)
|
|
69
|
+
use_embeddings: Whether to use embeddings for similarity (if available)
|
|
70
|
+
embedding_threshold: Minimum embedding similarity for duplicates (0.0-1.0)
|
|
71
|
+
similarity_pipeline: Optional SimilarityPipeline for enhanced matching
|
|
72
|
+
"""
|
|
73
|
+
self.similarity_threshold = similarity_threshold
|
|
74
|
+
self.use_embeddings = use_embeddings
|
|
75
|
+
self.embedding_threshold = embedding_threshold
|
|
76
|
+
self._similarity_pipeline = similarity_pipeline
|
|
77
|
+
|
|
78
|
+
async def deduplicate(
    self, entities: List[Entity], context: Optional[TenantContext] = None
) -> List[Entity]:
    """
    Collapse duplicate entities in a list.

    **Tenant Isolation**: if a context is supplied, only entities whose
    ``tenant_id`` equals ``context.tenant_id`` are considered; everything
    else is dropped before any comparison happens (defense-in-depth).

    Entities are bucketed by ``entity_type`` and each bucket is deduplicated
    on its own, so entities of different types are never compared.

    Args:
        entities: Entities to deduplicate
        context: Optional tenant context for multi-tenant isolation

    Returns:
        Deduplicated entities, bucket by bucket in the order each entity
        type was first seen
    """
    if not entities:
        return []

    # Keep only the caller's tenant when a context is given.
    candidates = entities
    if context:
        candidates = [item for item in entities if item.tenant_id == context.tenant_id]

    # Bucket by type; dict insertion order keeps first-seen type order.
    grouped: Dict[str, List[Entity]] = {}
    for item in candidates:
        grouped.setdefault(item.entity_type, []).append(item)

    # Deduplicate each bucket in turn and concatenate the results.
    result: List[Entity] = []
    for group in grouped.values():
        result.extend(await self._deduplicate_type_group(group))
    return result
|
|
116
|
+
|
|
117
|
+
async def _deduplicate_type_group(self, entities: List[Entity]) -> List[Entity]:
|
|
118
|
+
"""
|
|
119
|
+
Deduplicate entities of the same type
|
|
120
|
+
|
|
121
|
+
Algorithm:
|
|
122
|
+
1. Build similarity matrix between all pairs
|
|
123
|
+
2. Find clusters of similar entities (connected components)
|
|
124
|
+
3. Merge each cluster into a single canonical entity
|
|
125
|
+
|
|
126
|
+
Note: Assumes all entities in the group are from the same tenant
|
|
127
|
+
(validated by caller if in multi-tenant mode)
|
|
128
|
+
|
|
129
|
+
Args:
|
|
130
|
+
entities: List of entities (all same type and same tenant)
|
|
131
|
+
|
|
132
|
+
Returns:
|
|
133
|
+
List of deduplicated entities
|
|
134
|
+
"""
|
|
135
|
+
if len(entities) <= 1:
|
|
136
|
+
return entities
|
|
137
|
+
|
|
138
|
+
# Build similarity graph
|
|
139
|
+
n = len(entities)
|
|
140
|
+
similar_pairs: Set[Tuple[int, int]] = set()
|
|
141
|
+
|
|
142
|
+
for i in range(n):
|
|
143
|
+
for j in range(i + 1, n):
|
|
144
|
+
similarity = await self._compute_similarity(entities[i], entities[j])
|
|
145
|
+
if similarity >= self.similarity_threshold:
|
|
146
|
+
similar_pairs.add((i, j))
|
|
147
|
+
|
|
148
|
+
# Find connected components (clusters of similar entities)
|
|
149
|
+
clusters = self._find_clusters(n, similar_pairs)
|
|
150
|
+
|
|
151
|
+
# Merge each cluster into canonical entity
|
|
152
|
+
deduplicated = []
|
|
153
|
+
for cluster in clusters:
|
|
154
|
+
cluster_entities = [entities[idx] for idx in cluster]
|
|
155
|
+
merged_entity = self._merge_entities(cluster_entities)
|
|
156
|
+
deduplicated.append(merged_entity)
|
|
157
|
+
|
|
158
|
+
return deduplicated
|
|
159
|
+
|
|
160
|
+
async def _compute_similarity(self, entity1: Entity, entity2: Entity) -> float:
    """
    Score how likely two entities are the same thing.

    Signals combined:
    1. Name similarity (SimilarityPipeline when one is set, otherwise the
       built-in string comparison)
    2. Property overlap
    3. Embedding cosine similarity (only when embeddings are enabled and
       both entities have one)

    Weights are 0.3 / 0.2 / 0.5 (name / properties / embedding) when the
    embedding signal is used, and 0.7 / 0.3 (name / properties) otherwise.

    Args:
        entity1: First entity
        entity2: Second entity

    Returns:
        Weighted similarity score; 0.0 if either entity has no name
    """
    first_name = self._get_entity_name(entity1)
    second_name = self._get_entity_name(entity2)
    if not (first_name and second_name):
        return 0.0

    # Name signal: the pipeline gets the first entity's type for its
    # per-entity-type configuration.
    pipeline = self._similarity_pipeline
    if pipeline is None:
        name_score = self._string_similarity(first_name, second_name)
    else:
        outcome = await pipeline.compute_similarity(
            name1=first_name,
            name2=second_name,
            entity_type=entity1.entity_type,
        )
        name_score = outcome.final_score

    # Property signal.
    property_score = self._property_similarity(entity1.properties, entity2.properties)

    # Without usable embeddings, names and properties carry the whole score.
    embeddings_usable = self.use_embeddings and entity1.embedding and entity2.embedding
    if not embeddings_usable:
        return 0.7 * name_score + 0.3 * property_score

    # With embeddings, they get the largest weight.
    embedding_score = self._cosine_similarity(entity1.embedding, entity2.embedding)
    return 0.3 * name_score + 0.2 * property_score + 0.5 * embedding_score
|
|
211
|
+
|
|
212
|
+
def set_similarity_pipeline(self, pipeline: "SimilarityPipeline") -> None:
    """
    Install (or replace) the pipeline used for name matching.

    Args:
        pipeline: SimilarityPipeline instance to use from now on
    """
    self._similarity_pipeline = pipeline
|
|
220
|
+
|
|
221
|
+
@property
def similarity_pipeline(self) -> Optional["SimilarityPipeline"]:
    """Return the configured similarity pipeline, or None when none is set."""
    return self._similarity_pipeline
|
|
225
|
+
|
|
226
|
+
def _get_entity_name(self, entity: Entity) -> str:
|
|
227
|
+
"""Extract entity name from properties"""
|
|
228
|
+
return entity.properties.get("name") or entity.properties.get("title") or entity.properties.get("text") or ""
|
|
229
|
+
|
|
230
|
+
def _string_similarity(self, str1: str, str2: str) -> float:
|
|
231
|
+
"""
|
|
232
|
+
Compute string similarity using multiple methods
|
|
233
|
+
|
|
234
|
+
Combines:
|
|
235
|
+
- Exact match (normalized)
|
|
236
|
+
- SequenceMatcher ratio
|
|
237
|
+
- Token overlap (for multi-word entities)
|
|
238
|
+
|
|
239
|
+
Args:
|
|
240
|
+
str1: First string
|
|
241
|
+
str2: Second string
|
|
242
|
+
|
|
243
|
+
Returns:
|
|
244
|
+
Similarity score (0.0-1.0)
|
|
245
|
+
"""
|
|
246
|
+
# Normalize strings
|
|
247
|
+
s1 = str1.lower().strip()
|
|
248
|
+
s2 = str2.lower().strip()
|
|
249
|
+
|
|
250
|
+
# Exact match
|
|
251
|
+
if s1 == s2:
|
|
252
|
+
return 1.0
|
|
253
|
+
|
|
254
|
+
# One is substring of other
|
|
255
|
+
if s1 in s2 or s2 in s1:
|
|
256
|
+
return 0.95
|
|
257
|
+
|
|
258
|
+
# Sequence matcher
|
|
259
|
+
seq_similarity = SequenceMatcher(None, s1, s2).ratio()
|
|
260
|
+
|
|
261
|
+
# Token overlap (for multi-word names)
|
|
262
|
+
tokens1 = set(s1.split())
|
|
263
|
+
tokens2 = set(s2.split())
|
|
264
|
+
if tokens1 and tokens2:
|
|
265
|
+
token_overlap = len(tokens1 & tokens2) / len(tokens1 | tokens2)
|
|
266
|
+
else:
|
|
267
|
+
token_overlap = 0.0
|
|
268
|
+
|
|
269
|
+
# Combine
|
|
270
|
+
return max(seq_similarity, token_overlap)
|
|
271
|
+
|
|
272
|
+
def _property_similarity(self, props1: Dict, props2: Dict) -> float:
|
|
273
|
+
"""
|
|
274
|
+
Compute similarity based on property overlap
|
|
275
|
+
|
|
276
|
+
Args:
|
|
277
|
+
props1: Properties of first entity
|
|
278
|
+
props2: Properties of second entity
|
|
279
|
+
|
|
280
|
+
Returns:
|
|
281
|
+
Similarity score (0.0-1.0)
|
|
282
|
+
"""
|
|
283
|
+
# Remove internal properties
|
|
284
|
+
keys1 = {k for k in props1.keys() if not k.startswith("_")}
|
|
285
|
+
keys2 = {k for k in props2.keys() if not k.startswith("_")}
|
|
286
|
+
|
|
287
|
+
if not keys1 and not keys2:
|
|
288
|
+
return 0.5 # No properties to compare
|
|
289
|
+
|
|
290
|
+
# Key overlap
|
|
291
|
+
common_keys = keys1 & keys2
|
|
292
|
+
all_keys = keys1 | keys2
|
|
293
|
+
|
|
294
|
+
if not all_keys:
|
|
295
|
+
return 0.5
|
|
296
|
+
|
|
297
|
+
key_overlap = len(common_keys) / len(all_keys)
|
|
298
|
+
|
|
299
|
+
# Value similarity for common keys
|
|
300
|
+
value_matches = 0
|
|
301
|
+
for key in common_keys:
|
|
302
|
+
val1 = str(props1[key]).lower()
|
|
303
|
+
val2 = str(props2[key]).lower()
|
|
304
|
+
if val1 == val2:
|
|
305
|
+
value_matches += 1
|
|
306
|
+
|
|
307
|
+
value_similarity = value_matches / len(common_keys) if common_keys else 0.0
|
|
308
|
+
|
|
309
|
+
# Combine
|
|
310
|
+
return 0.5 * key_overlap + 0.5 * value_similarity
|
|
311
|
+
|
|
312
|
+
def _cosine_similarity(self, vec1: List[float], vec2: List[float]) -> float:
|
|
313
|
+
"""
|
|
314
|
+
Compute cosine similarity between two vectors
|
|
315
|
+
|
|
316
|
+
Args:
|
|
317
|
+
vec1: First vector
|
|
318
|
+
vec2: Second vector
|
|
319
|
+
|
|
320
|
+
Returns:
|
|
321
|
+
Cosine similarity (0.0-1.0)
|
|
322
|
+
"""
|
|
323
|
+
if len(vec1) != len(vec2):
|
|
324
|
+
return 0.0
|
|
325
|
+
|
|
326
|
+
dot_product = sum(a * b for a, b in zip(vec1, vec2))
|
|
327
|
+
magnitude1 = sum(a * a for a in vec1) ** 0.5
|
|
328
|
+
magnitude2 = sum(b * b for b in vec2) ** 0.5
|
|
329
|
+
|
|
330
|
+
if magnitude1 == 0 or magnitude2 == 0:
|
|
331
|
+
return 0.0
|
|
332
|
+
|
|
333
|
+
# Cosine similarity ranges from -1 to 1, normalize to 0 to 1
|
|
334
|
+
similarity = dot_product / (magnitude1 * magnitude2)
|
|
335
|
+
return (similarity + 1) / 2
|
|
336
|
+
|
|
337
|
+
def _find_clusters(self, n: int, edges: Set[Tuple[int, int]]) -> List[List[int]]:
|
|
338
|
+
"""
|
|
339
|
+
Find connected components using Union-Find
|
|
340
|
+
|
|
341
|
+
Args:
|
|
342
|
+
n: Number of nodes
|
|
343
|
+
edges: Set of edges (i, j) indicating similarity
|
|
344
|
+
|
|
345
|
+
Returns:
|
|
346
|
+
List of clusters, where each cluster is a list of node indices
|
|
347
|
+
"""
|
|
348
|
+
# Union-Find data structure
|
|
349
|
+
parent = list(range(n))
|
|
350
|
+
|
|
351
|
+
def find(x):
|
|
352
|
+
if parent[x] != x:
|
|
353
|
+
parent[x] = find(parent[x]) # Path compression
|
|
354
|
+
return parent[x]
|
|
355
|
+
|
|
356
|
+
def union(x, y):
|
|
357
|
+
px, py = find(x), find(y)
|
|
358
|
+
if px != py:
|
|
359
|
+
parent[px] = py
|
|
360
|
+
|
|
361
|
+
# Build connected components
|
|
362
|
+
for i, j in edges:
|
|
363
|
+
union(i, j)
|
|
364
|
+
|
|
365
|
+
# Group by root
|
|
366
|
+
clusters_dict: Dict[int, List[int]] = {}
|
|
367
|
+
for i in range(n):
|
|
368
|
+
root = find(i)
|
|
369
|
+
if root not in clusters_dict:
|
|
370
|
+
clusters_dict[root] = []
|
|
371
|
+
clusters_dict[root].append(i)
|
|
372
|
+
|
|
373
|
+
return list(clusters_dict.values())
|
|
374
|
+
|
|
375
|
+
def _merge_entities(self, entities: List[Entity]) -> Entity:
    """
    Merge a cluster of similar entities into one canonical entity

    Strategy:
    - Use the first entity as base
    - Merge all properties (prefer non-empty values)
    - Store alternative names as aliases
    - Keep highest confidence score

    Args:
        entities: List of entities to merge (must not be empty)

    Returns:
        Merged canonical entity. A single-element list is returned
        unchanged. Otherwise a new Entity is built that reuses the first
        entity's id, type, embedding, source and tenant_id, and whose
        properties additionally carry "_aliases" (only when alternative
        names exist), "_extraction_confidence" and "_merged_count".
    """
    if len(entities) == 1:
        return entities[0]

    def is_missing(value) -> bool:
        # Only None and empty strings/containers count as "no value".
        # Legitimate falsy values such as 0 or False must survive the merge.
        if value is None:
            return True
        if isinstance(value, (str, bytes, list, tuple, dict, set, frozenset)):
            return len(value) == 0
        return False

    # Use first entity as base
    canonical = entities[0]
    canonical_name = self._get_entity_name(canonical)

    # Collect all differing names as aliases. A dict is used as an ordered
    # set so the alias list is deterministic (first-seen order).
    aliases = {}
    for entity in entities:
        name = self._get_entity_name(entity)
        if name and name != canonical_name:
            aliases.setdefault(name, None)

    # Merge properties (prefer non-empty, non-None values); earlier
    # entities win when both sides carry a real value.
    merged_properties = dict(canonical.properties)
    for entity in entities[1:]:
        for key, value in entity.properties.items():
            if key not in merged_properties or is_missing(merged_properties[key]):
                merged_properties[key] = value

    # Add aliases
    if aliases:
        merged_properties["_aliases"] = list(aliases)

    # Take highest confidence (entities without one count as 0.5)
    merged_properties["_extraction_confidence"] = max(
        e.properties.get("_extraction_confidence", 0.5) for e in entities
    )

    # Track merge count
    merged_properties["_merged_count"] = len(entities)

    # Create merged entity (preserve tenant_id from canonical entity)
    return Entity(
        id=canonical.id,
        entity_type=canonical.entity_type,
        properties=merged_properties,
        embedding=canonical.embedding,
        source=canonical.source,
        tenant_id=canonical.tenant_id,
    )
|