aiecs 1.5.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- aiecs/__init__.py +72 -0
- aiecs/__main__.py +41 -0
- aiecs/aiecs_client.py +469 -0
- aiecs/application/__init__.py +10 -0
- aiecs/application/executors/__init__.py +10 -0
- aiecs/application/executors/operation_executor.py +363 -0
- aiecs/application/knowledge_graph/__init__.py +7 -0
- aiecs/application/knowledge_graph/builder/__init__.py +37 -0
- aiecs/application/knowledge_graph/builder/document_builder.py +375 -0
- aiecs/application/knowledge_graph/builder/graph_builder.py +356 -0
- aiecs/application/knowledge_graph/builder/schema_mapping.py +531 -0
- aiecs/application/knowledge_graph/builder/structured_pipeline.py +443 -0
- aiecs/application/knowledge_graph/builder/text_chunker.py +319 -0
- aiecs/application/knowledge_graph/extractors/__init__.py +27 -0
- aiecs/application/knowledge_graph/extractors/base.py +100 -0
- aiecs/application/knowledge_graph/extractors/llm_entity_extractor.py +327 -0
- aiecs/application/knowledge_graph/extractors/llm_relation_extractor.py +349 -0
- aiecs/application/knowledge_graph/extractors/ner_entity_extractor.py +244 -0
- aiecs/application/knowledge_graph/fusion/__init__.py +23 -0
- aiecs/application/knowledge_graph/fusion/entity_deduplicator.py +387 -0
- aiecs/application/knowledge_graph/fusion/entity_linker.py +343 -0
- aiecs/application/knowledge_graph/fusion/knowledge_fusion.py +580 -0
- aiecs/application/knowledge_graph/fusion/relation_deduplicator.py +189 -0
- aiecs/application/knowledge_graph/pattern_matching/__init__.py +21 -0
- aiecs/application/knowledge_graph/pattern_matching/pattern_matcher.py +344 -0
- aiecs/application/knowledge_graph/pattern_matching/query_executor.py +378 -0
- aiecs/application/knowledge_graph/profiling/__init__.py +12 -0
- aiecs/application/knowledge_graph/profiling/query_plan_visualizer.py +199 -0
- aiecs/application/knowledge_graph/profiling/query_profiler.py +223 -0
- aiecs/application/knowledge_graph/reasoning/__init__.py +27 -0
- aiecs/application/knowledge_graph/reasoning/evidence_synthesis.py +347 -0
- aiecs/application/knowledge_graph/reasoning/inference_engine.py +504 -0
- aiecs/application/knowledge_graph/reasoning/logic_form_parser.py +167 -0
- aiecs/application/knowledge_graph/reasoning/logic_parser/__init__.py +79 -0
- aiecs/application/knowledge_graph/reasoning/logic_parser/ast_builder.py +513 -0
- aiecs/application/knowledge_graph/reasoning/logic_parser/ast_nodes.py +630 -0
- aiecs/application/knowledge_graph/reasoning/logic_parser/ast_validator.py +654 -0
- aiecs/application/knowledge_graph/reasoning/logic_parser/error_handler.py +477 -0
- aiecs/application/knowledge_graph/reasoning/logic_parser/parser.py +390 -0
- aiecs/application/knowledge_graph/reasoning/logic_parser/query_context.py +217 -0
- aiecs/application/knowledge_graph/reasoning/logic_query_integration.py +169 -0
- aiecs/application/knowledge_graph/reasoning/query_planner.py +872 -0
- aiecs/application/knowledge_graph/reasoning/reasoning_engine.py +554 -0
- aiecs/application/knowledge_graph/retrieval/__init__.py +19 -0
- aiecs/application/knowledge_graph/retrieval/retrieval_strategies.py +596 -0
- aiecs/application/knowledge_graph/search/__init__.py +59 -0
- aiecs/application/knowledge_graph/search/hybrid_search.py +423 -0
- aiecs/application/knowledge_graph/search/reranker.py +295 -0
- aiecs/application/knowledge_graph/search/reranker_strategies.py +553 -0
- aiecs/application/knowledge_graph/search/text_similarity.py +398 -0
- aiecs/application/knowledge_graph/traversal/__init__.py +15 -0
- aiecs/application/knowledge_graph/traversal/enhanced_traversal.py +329 -0
- aiecs/application/knowledge_graph/traversal/path_scorer.py +269 -0
- aiecs/application/knowledge_graph/validators/__init__.py +13 -0
- aiecs/application/knowledge_graph/validators/relation_validator.py +189 -0
- aiecs/application/knowledge_graph/visualization/__init__.py +11 -0
- aiecs/application/knowledge_graph/visualization/graph_visualizer.py +321 -0
- aiecs/common/__init__.py +9 -0
- aiecs/common/knowledge_graph/__init__.py +17 -0
- aiecs/common/knowledge_graph/runnable.py +484 -0
- aiecs/config/__init__.py +16 -0
- aiecs/config/config.py +498 -0
- aiecs/config/graph_config.py +137 -0
- aiecs/config/registry.py +23 -0
- aiecs/core/__init__.py +46 -0
- aiecs/core/interface/__init__.py +34 -0
- aiecs/core/interface/execution_interface.py +152 -0
- aiecs/core/interface/storage_interface.py +171 -0
- aiecs/domain/__init__.py +289 -0
- aiecs/domain/agent/__init__.py +189 -0
- aiecs/domain/agent/base_agent.py +697 -0
- aiecs/domain/agent/exceptions.py +103 -0
- aiecs/domain/agent/graph_aware_mixin.py +559 -0
- aiecs/domain/agent/hybrid_agent.py +490 -0
- aiecs/domain/agent/integration/__init__.py +26 -0
- aiecs/domain/agent/integration/context_compressor.py +222 -0
- aiecs/domain/agent/integration/context_engine_adapter.py +252 -0
- aiecs/domain/agent/integration/retry_policy.py +219 -0
- aiecs/domain/agent/integration/role_config.py +213 -0
- aiecs/domain/agent/knowledge_aware_agent.py +646 -0
- aiecs/domain/agent/lifecycle.py +296 -0
- aiecs/domain/agent/llm_agent.py +300 -0
- aiecs/domain/agent/memory/__init__.py +12 -0
- aiecs/domain/agent/memory/conversation.py +197 -0
- aiecs/domain/agent/migration/__init__.py +14 -0
- aiecs/domain/agent/migration/conversion.py +160 -0
- aiecs/domain/agent/migration/legacy_wrapper.py +90 -0
- aiecs/domain/agent/models.py +317 -0
- aiecs/domain/agent/observability.py +407 -0
- aiecs/domain/agent/persistence.py +289 -0
- aiecs/domain/agent/prompts/__init__.py +29 -0
- aiecs/domain/agent/prompts/builder.py +161 -0
- aiecs/domain/agent/prompts/formatters.py +189 -0
- aiecs/domain/agent/prompts/template.py +255 -0
- aiecs/domain/agent/registry.py +260 -0
- aiecs/domain/agent/tool_agent.py +257 -0
- aiecs/domain/agent/tools/__init__.py +12 -0
- aiecs/domain/agent/tools/schema_generator.py +221 -0
- aiecs/domain/community/__init__.py +155 -0
- aiecs/domain/community/agent_adapter.py +477 -0
- aiecs/domain/community/analytics.py +481 -0
- aiecs/domain/community/collaborative_workflow.py +642 -0
- aiecs/domain/community/communication_hub.py +645 -0
- aiecs/domain/community/community_builder.py +320 -0
- aiecs/domain/community/community_integration.py +800 -0
- aiecs/domain/community/community_manager.py +813 -0
- aiecs/domain/community/decision_engine.py +879 -0
- aiecs/domain/community/exceptions.py +225 -0
- aiecs/domain/community/models/__init__.py +33 -0
- aiecs/domain/community/models/community_models.py +268 -0
- aiecs/domain/community/resource_manager.py +457 -0
- aiecs/domain/community/shared_context_manager.py +603 -0
- aiecs/domain/context/__init__.py +58 -0
- aiecs/domain/context/context_engine.py +989 -0
- aiecs/domain/context/conversation_models.py +354 -0
- aiecs/domain/context/graph_memory.py +467 -0
- aiecs/domain/execution/__init__.py +12 -0
- aiecs/domain/execution/model.py +57 -0
- aiecs/domain/knowledge_graph/__init__.py +19 -0
- aiecs/domain/knowledge_graph/models/__init__.py +52 -0
- aiecs/domain/knowledge_graph/models/entity.py +130 -0
- aiecs/domain/knowledge_graph/models/evidence.py +194 -0
- aiecs/domain/knowledge_graph/models/inference_rule.py +186 -0
- aiecs/domain/knowledge_graph/models/path.py +179 -0
- aiecs/domain/knowledge_graph/models/path_pattern.py +173 -0
- aiecs/domain/knowledge_graph/models/query.py +272 -0
- aiecs/domain/knowledge_graph/models/query_plan.py +187 -0
- aiecs/domain/knowledge_graph/models/relation.py +136 -0
- aiecs/domain/knowledge_graph/schema/__init__.py +23 -0
- aiecs/domain/knowledge_graph/schema/entity_type.py +135 -0
- aiecs/domain/knowledge_graph/schema/graph_schema.py +271 -0
- aiecs/domain/knowledge_graph/schema/property_schema.py +155 -0
- aiecs/domain/knowledge_graph/schema/relation_type.py +171 -0
- aiecs/domain/knowledge_graph/schema/schema_manager.py +496 -0
- aiecs/domain/knowledge_graph/schema/type_enums.py +205 -0
- aiecs/domain/task/__init__.py +13 -0
- aiecs/domain/task/dsl_processor.py +613 -0
- aiecs/domain/task/model.py +62 -0
- aiecs/domain/task/task_context.py +268 -0
- aiecs/infrastructure/__init__.py +24 -0
- aiecs/infrastructure/graph_storage/__init__.py +11 -0
- aiecs/infrastructure/graph_storage/base.py +601 -0
- aiecs/infrastructure/graph_storage/batch_operations.py +449 -0
- aiecs/infrastructure/graph_storage/cache.py +429 -0
- aiecs/infrastructure/graph_storage/distributed.py +226 -0
- aiecs/infrastructure/graph_storage/error_handling.py +390 -0
- aiecs/infrastructure/graph_storage/graceful_degradation.py +306 -0
- aiecs/infrastructure/graph_storage/health_checks.py +378 -0
- aiecs/infrastructure/graph_storage/in_memory.py +514 -0
- aiecs/infrastructure/graph_storage/index_optimization.py +483 -0
- aiecs/infrastructure/graph_storage/lazy_loading.py +410 -0
- aiecs/infrastructure/graph_storage/metrics.py +357 -0
- aiecs/infrastructure/graph_storage/migration.py +413 -0
- aiecs/infrastructure/graph_storage/pagination.py +471 -0
- aiecs/infrastructure/graph_storage/performance_monitoring.py +466 -0
- aiecs/infrastructure/graph_storage/postgres.py +871 -0
- aiecs/infrastructure/graph_storage/query_optimizer.py +635 -0
- aiecs/infrastructure/graph_storage/schema_cache.py +290 -0
- aiecs/infrastructure/graph_storage/sqlite.py +623 -0
- aiecs/infrastructure/graph_storage/streaming.py +495 -0
- aiecs/infrastructure/messaging/__init__.py +13 -0
- aiecs/infrastructure/messaging/celery_task_manager.py +383 -0
- aiecs/infrastructure/messaging/websocket_manager.py +298 -0
- aiecs/infrastructure/monitoring/__init__.py +34 -0
- aiecs/infrastructure/monitoring/executor_metrics.py +174 -0
- aiecs/infrastructure/monitoring/global_metrics_manager.py +213 -0
- aiecs/infrastructure/monitoring/structured_logger.py +48 -0
- aiecs/infrastructure/monitoring/tracing_manager.py +410 -0
- aiecs/infrastructure/persistence/__init__.py +24 -0
- aiecs/infrastructure/persistence/context_engine_client.py +187 -0
- aiecs/infrastructure/persistence/database_manager.py +333 -0
- aiecs/infrastructure/persistence/file_storage.py +754 -0
- aiecs/infrastructure/persistence/redis_client.py +220 -0
- aiecs/llm/__init__.py +86 -0
- aiecs/llm/callbacks/__init__.py +11 -0
- aiecs/llm/callbacks/custom_callbacks.py +264 -0
- aiecs/llm/client_factory.py +420 -0
- aiecs/llm/clients/__init__.py +33 -0
- aiecs/llm/clients/base_client.py +193 -0
- aiecs/llm/clients/googleai_client.py +181 -0
- aiecs/llm/clients/openai_client.py +131 -0
- aiecs/llm/clients/vertex_client.py +437 -0
- aiecs/llm/clients/xai_client.py +184 -0
- aiecs/llm/config/__init__.py +51 -0
- aiecs/llm/config/config_loader.py +275 -0
- aiecs/llm/config/config_validator.py +236 -0
- aiecs/llm/config/model_config.py +151 -0
- aiecs/llm/utils/__init__.py +10 -0
- aiecs/llm/utils/validate_config.py +91 -0
- aiecs/main.py +363 -0
- aiecs/scripts/__init__.py +3 -0
- aiecs/scripts/aid/VERSION_MANAGEMENT.md +97 -0
- aiecs/scripts/aid/__init__.py +19 -0
- aiecs/scripts/aid/version_manager.py +215 -0
- aiecs/scripts/dependance_check/DEPENDENCY_SYSTEM_SUMMARY.md +242 -0
- aiecs/scripts/dependance_check/README_DEPENDENCY_CHECKER.md +310 -0
- aiecs/scripts/dependance_check/__init__.py +17 -0
- aiecs/scripts/dependance_check/dependency_checker.py +938 -0
- aiecs/scripts/dependance_check/dependency_fixer.py +391 -0
- aiecs/scripts/dependance_check/download_nlp_data.py +396 -0
- aiecs/scripts/dependance_check/quick_dependency_check.py +270 -0
- aiecs/scripts/dependance_check/setup_nlp_data.sh +217 -0
- aiecs/scripts/dependance_patch/__init__.py +7 -0
- aiecs/scripts/dependance_patch/fix_weasel/README_WEASEL_PATCH.md +126 -0
- aiecs/scripts/dependance_patch/fix_weasel/__init__.py +11 -0
- aiecs/scripts/dependance_patch/fix_weasel/fix_weasel_validator.py +128 -0
- aiecs/scripts/dependance_patch/fix_weasel/fix_weasel_validator.sh +82 -0
- aiecs/scripts/dependance_patch/fix_weasel/patch_weasel_library.sh +188 -0
- aiecs/scripts/dependance_patch/fix_weasel/run_weasel_patch.sh +41 -0
- aiecs/scripts/tools_develop/README.md +449 -0
- aiecs/scripts/tools_develop/TOOL_AUTO_DISCOVERY.md +234 -0
- aiecs/scripts/tools_develop/__init__.py +21 -0
- aiecs/scripts/tools_develop/check_type_annotations.py +259 -0
- aiecs/scripts/tools_develop/validate_tool_schemas.py +422 -0
- aiecs/scripts/tools_develop/verify_tools.py +356 -0
- aiecs/tasks/__init__.py +1 -0
- aiecs/tasks/worker.py +172 -0
- aiecs/tools/__init__.py +299 -0
- aiecs/tools/apisource/__init__.py +99 -0
- aiecs/tools/apisource/intelligence/__init__.py +19 -0
- aiecs/tools/apisource/intelligence/data_fusion.py +381 -0
- aiecs/tools/apisource/intelligence/query_analyzer.py +413 -0
- aiecs/tools/apisource/intelligence/search_enhancer.py +388 -0
- aiecs/tools/apisource/monitoring/__init__.py +9 -0
- aiecs/tools/apisource/monitoring/metrics.py +303 -0
- aiecs/tools/apisource/providers/__init__.py +115 -0
- aiecs/tools/apisource/providers/base.py +664 -0
- aiecs/tools/apisource/providers/census.py +401 -0
- aiecs/tools/apisource/providers/fred.py +564 -0
- aiecs/tools/apisource/providers/newsapi.py +412 -0
- aiecs/tools/apisource/providers/worldbank.py +357 -0
- aiecs/tools/apisource/reliability/__init__.py +12 -0
- aiecs/tools/apisource/reliability/error_handler.py +375 -0
- aiecs/tools/apisource/reliability/fallback_strategy.py +391 -0
- aiecs/tools/apisource/tool.py +850 -0
- aiecs/tools/apisource/utils/__init__.py +9 -0
- aiecs/tools/apisource/utils/validators.py +338 -0
- aiecs/tools/base_tool.py +201 -0
- aiecs/tools/docs/__init__.py +121 -0
- aiecs/tools/docs/ai_document_orchestrator.py +599 -0
- aiecs/tools/docs/ai_document_writer_orchestrator.py +2403 -0
- aiecs/tools/docs/content_insertion_tool.py +1333 -0
- aiecs/tools/docs/document_creator_tool.py +1317 -0
- aiecs/tools/docs/document_layout_tool.py +1166 -0
- aiecs/tools/docs/document_parser_tool.py +994 -0
- aiecs/tools/docs/document_writer_tool.py +1818 -0
- aiecs/tools/knowledge_graph/__init__.py +17 -0
- aiecs/tools/knowledge_graph/graph_reasoning_tool.py +734 -0
- aiecs/tools/knowledge_graph/graph_search_tool.py +923 -0
- aiecs/tools/knowledge_graph/kg_builder_tool.py +476 -0
- aiecs/tools/langchain_adapter.py +542 -0
- aiecs/tools/schema_generator.py +275 -0
- aiecs/tools/search_tool/__init__.py +100 -0
- aiecs/tools/search_tool/analyzers.py +589 -0
- aiecs/tools/search_tool/cache.py +260 -0
- aiecs/tools/search_tool/constants.py +128 -0
- aiecs/tools/search_tool/context.py +216 -0
- aiecs/tools/search_tool/core.py +749 -0
- aiecs/tools/search_tool/deduplicator.py +123 -0
- aiecs/tools/search_tool/error_handler.py +271 -0
- aiecs/tools/search_tool/metrics.py +371 -0
- aiecs/tools/search_tool/rate_limiter.py +178 -0
- aiecs/tools/search_tool/schemas.py +277 -0
- aiecs/tools/statistics/__init__.py +80 -0
- aiecs/tools/statistics/ai_data_analysis_orchestrator.py +643 -0
- aiecs/tools/statistics/ai_insight_generator_tool.py +505 -0
- aiecs/tools/statistics/ai_report_orchestrator_tool.py +694 -0
- aiecs/tools/statistics/data_loader_tool.py +564 -0
- aiecs/tools/statistics/data_profiler_tool.py +658 -0
- aiecs/tools/statistics/data_transformer_tool.py +573 -0
- aiecs/tools/statistics/data_visualizer_tool.py +495 -0
- aiecs/tools/statistics/model_trainer_tool.py +487 -0
- aiecs/tools/statistics/statistical_analyzer_tool.py +459 -0
- aiecs/tools/task_tools/__init__.py +86 -0
- aiecs/tools/task_tools/chart_tool.py +732 -0
- aiecs/tools/task_tools/classfire_tool.py +922 -0
- aiecs/tools/task_tools/image_tool.py +447 -0
- aiecs/tools/task_tools/office_tool.py +684 -0
- aiecs/tools/task_tools/pandas_tool.py +635 -0
- aiecs/tools/task_tools/report_tool.py +635 -0
- aiecs/tools/task_tools/research_tool.py +392 -0
- aiecs/tools/task_tools/scraper_tool.py +715 -0
- aiecs/tools/task_tools/stats_tool.py +688 -0
- aiecs/tools/temp_file_manager.py +130 -0
- aiecs/tools/tool_executor/__init__.py +37 -0
- aiecs/tools/tool_executor/tool_executor.py +881 -0
- aiecs/utils/LLM_output_structor.py +445 -0
- aiecs/utils/__init__.py +34 -0
- aiecs/utils/base_callback.py +47 -0
- aiecs/utils/cache_provider.py +695 -0
- aiecs/utils/execution_utils.py +184 -0
- aiecs/utils/logging.py +1 -0
- aiecs/utils/prompt_loader.py +14 -0
- aiecs/utils/token_usage_repository.py +323 -0
- aiecs/ws/__init__.py +0 -0
- aiecs/ws/socket_server.py +52 -0
- aiecs-1.5.1.dist-info/METADATA +608 -0
- aiecs-1.5.1.dist-info/RECORD +302 -0
- aiecs-1.5.1.dist-info/WHEEL +5 -0
- aiecs-1.5.1.dist-info/entry_points.txt +10 -0
- aiecs-1.5.1.dist-info/licenses/LICENSE +225 -0
- aiecs-1.5.1.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,343 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Entity Linker
|
|
3
|
+
|
|
4
|
+
Links newly extracted entities to existing entities in the knowledge graph.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
import logging
from difflib import SequenceMatcher
from typing import List, Optional

from aiecs.domain.knowledge_graph.models.entity import Entity
from aiecs.infrastructure.graph_storage.base import GraphStore
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class EntityLinker:
    """
    Link new entities to existing entities in the graph

    When extracting entities from new documents, many entities may already exist
    in the knowledge graph. This class identifies such matches and links them,
    preventing duplication across the entire graph.

    Matching is attempted in this order, stopping at the first hit:

    1. Exact ID match via ``graph_store.get_entity``
    2. Embedding similarity via ``graph_store.vector_search`` (only when
       ``use_embeddings`` is set and the new entity carries an embedding)
    3. Name-based fuzzy matching over candidates of the same entity type

    Note:
        ``_get_candidate_entities`` is still a placeholder that returns an
        empty list, so step 3 cannot currently produce a link.

    Example:
        ```python
        linker = EntityLinker(graph_store, similarity_threshold=0.85)

        new_entity = Entity(
            id="person_alice",
            entity_type="Person",
            properties={"name": "Alice Smith"},
        )

        # Check if Alice already exists
        link_result = await linker.link_entity(new_entity)

        if link_result.linked:
            print(f"Linked to existing entity: {link_result.existing_entity.id}")
            # Use existing entity instead of creating new one
        else:
            print("New entity - add to graph")
            # Add new_entity to graph
        ```
    """

    # Embedding score at or above which a candidate is accepted even when the
    # name sanity check does not pass.
    HIGH_CONFIDENCE_EMBEDDING_SCORE = 0.95

    # Score reported when one normalized name is contained in the other.
    SUBSTRING_MATCH_SCORE = 0.95

    # Property keys consulted, in order, when looking for an entity's name.
    NAME_PROPERTY_KEYS = ("name", "title", "text")

    _logger = logging.getLogger(__name__)

    def __init__(
        self,
        graph_store: "GraphStore",
        similarity_threshold: float = 0.85,
        use_embeddings: bool = True,
        embedding_threshold: float = 0.90,
    ):
        """
        Initialize entity linker

        Args:
            graph_store: Graph storage to search for existing entities
            similarity_threshold: Minimum name similarity to link entities (0.0-1.0)
            use_embeddings: Use embedding similarity for matching
            embedding_threshold: Minimum embedding similarity for linking (0.0-1.0)
        """
        self.graph_store = graph_store
        self.similarity_threshold = similarity_threshold
        self.use_embeddings = use_embeddings
        self.embedding_threshold = embedding_threshold

    async def link_entity(self, new_entity: "Entity", candidate_limit: int = 10) -> "LinkResult":
        """
        Link a new entity to existing entity in graph (if match found)

        Args:
            new_entity: Entity to link
            candidate_limit: Maximum number of candidates to consider

        Returns:
            LinkResult with linking decision and matched entity (if any)
        """
        # An entity already stored under the same ID is a certain match.
        existing = await self.graph_store.get_entity(new_entity.id)
        if existing:
            return LinkResult(
                linked=True,
                existing_entity=existing,
                new_entity=new_entity,
                similarity=1.0,
                link_type="exact_id",
            )

        # Semantic match; skipped when disabled or when there is no embedding.
        if self.use_embeddings and new_entity.embedding:
            link_result = await self._link_by_embedding(new_entity, candidate_limit)
            if link_result.linked:
                return link_result

        # Fallback: whatever the name-based search decides is the final answer.
        return await self._link_by_name(new_entity, candidate_limit)

    async def link_entities(
        self, new_entities: List["Entity"], candidate_limit: int = 10
    ) -> List["LinkResult"]:
        """
        Link multiple entities in batch

        Entities are processed one after another, in input order.

        Args:
            new_entities: List of entities to link
            candidate_limit: Maximum candidates per entity

        Returns:
            List of LinkResult objects (one per input entity)
        """
        return [await self.link_entity(entity, candidate_limit) for entity in new_entities]

    async def _link_by_embedding(self, new_entity: "Entity", candidate_limit: int) -> "LinkResult":
        """
        Link entity using embedding similarity search

        A candidate is accepted when its score reaches ``embedding_threshold``
        and either its name is similar to the new entity's name or its score
        reaches ``HIGH_CONFIDENCE_EMBEDDING_SCORE``.

        Any failure of the search is logged and reported as "not linked" so
        that the caller can fall back to name-based matching.

        Args:
            new_entity: Entity to link
            candidate_limit: Maximum candidates to consider

        Returns:
            LinkResult
        """
        if not new_entity.embedding:
            return LinkResult(linked=False, new_entity=new_entity)

        try:
            candidates = await self.graph_store.vector_search(
                query_embedding=new_entity.embedding,
                entity_type=new_entity.entity_type,
                max_results=candidate_limit,
                score_threshold=self.embedding_threshold,
            )

            if not candidates:
                return LinkResult(linked=False, new_entity=new_entity)

            # Pick the top-scoring (entity, score) pair explicitly instead of
            # relying on the store returning results best-first. For sorted
            # results this selects the same pair as candidates[0].
            best_entity, best_score = max(candidates, key=lambda pair: pair[1])

            if best_score >= self.embedding_threshold:
                # Name check guards against semantically close but distinct
                # entities; a very high embedding score is trusted on its own.
                name_match = self._check_name_similarity(new_entity, best_entity)

                if name_match or best_score >= self.HIGH_CONFIDENCE_EMBEDDING_SCORE:
                    return LinkResult(
                        linked=True,
                        existing_entity=best_entity,
                        new_entity=new_entity,
                        similarity=best_score,
                        link_type="embedding",
                    )

        except NotImplementedError:
            # Graph store doesn't support vector search
            pass
        except Exception as exc:
            # Best effort: report the problem but don't fail the linking run
            self._logger.warning("Embedding search failed: %s", exc)

        return LinkResult(linked=False, new_entity=new_entity)

    async def _link_by_name(self, new_entity: "Entity", candidate_limit: int) -> "LinkResult":
        """
        Link entity using name-based matching

        Strategy:
        1. Get candidate entities of the same type
        2. Compare names using fuzzy matching
        3. Return best match if it reaches ``similarity_threshold``

        Any failure is logged and reported as "not linked".

        Args:
            new_entity: Entity to link
            candidate_limit: Maximum candidates to consider

        Returns:
            LinkResult
        """
        new_name = self._get_entity_name(new_entity)
        if not new_name:
            return LinkResult(linked=False, new_entity=new_entity)

        try:
            # Note: This is a simplified implementation
            # In production, you'd want an indexed search or LIKE query
            candidates = await self._get_candidate_entities(
                new_entity.entity_type, candidate_limit
            )

            if not candidates:
                return LinkResult(linked=False, new_entity=new_entity)

            best_match = None
            best_score = 0.0

            for candidate in candidates:
                candidate_name = self._get_entity_name(candidate)
                if not candidate_name:
                    continue
                score = self._name_similarity(new_name, candidate_name)
                # Strict comparison keeps the earliest candidate on ties.
                if score > best_score:
                    best_score = score
                    best_match = candidate

            if best_match and best_score >= self.similarity_threshold:
                return LinkResult(
                    linked=True,
                    existing_entity=best_match,
                    new_entity=new_entity,
                    similarity=best_score,
                    link_type="name",
                )

        except Exception as exc:
            # Best effort: report the problem but don't fail the linking run
            self._logger.warning("Name-based linking failed: %s", exc)

        return LinkResult(linked=False, new_entity=new_entity)

    async def _get_candidate_entities(self, entity_type: str, limit: int) -> List["Entity"]:
        """
        Get candidate entities for linking

        This is a placeholder - in production, you'd want:
        - Indexed search by entity type
        - LIKE queries for name matching
        - Pagination for large result sets

        Args:
            entity_type: Entity type to filter by (currently unused)
            limit: Maximum candidates (currently unused)

        Returns:
            List of candidate entities; always empty for now
        """
        # TODO: Implement efficient candidate retrieval
        # For now, return empty list (will rely on embedding search primarily)
        # In Phase 3 (SQLite) and Phase 6 (PostgreSQL), we'll implement
        # efficient queries for this
        return []

    def _check_name_similarity(self, entity1: "Entity", entity2: "Entity") -> bool:
        """
        Quick name similarity check

        Args:
            entity1: First entity
            entity2: Second entity

        Returns:
            True if both entities have a name and the names' similarity
            reaches ``similarity_threshold``
        """
        name1 = self._get_entity_name(entity1)
        name2 = self._get_entity_name(entity2)

        if not name1 or not name2:
            return False

        return self._name_similarity(name1, name2) >= self.similarity_threshold

    def _get_entity_name(self, entity: "Entity") -> str:
        """
        Extract entity name from properties

        The first truthy value among the "name", "title" and "text"
        properties is used. Non-string values are converted with ``str`` so
        that callers can always apply string operations to the result.

        Args:
            entity: Entity whose properties are inspected

        Returns:
            The name, or "" when none of the properties holds a value
        """
        for key in self.NAME_PROPERTY_KEYS:
            value = entity.properties.get(key)
            if value:
                return value if isinstance(value, str) else str(value)
        return ""

    def _name_similarity(self, name1: str, name2: str) -> float:
        """
        Compute name similarity using fuzzy matching

        Names are compared case-insensitively with surrounding whitespace
        removed.

        Args:
            name1: First name
            name2: Second name

        Returns:
            Similarity score (0.0-1.0): 1.0 for equal names,
            ``SUBSTRING_MATCH_SCORE`` when one name contains the other,
            otherwise the ``difflib.SequenceMatcher`` ratio. 0.0 when either
            name is empty after normalization.
        """
        n1 = name1.lower().strip()
        n2 = name2.lower().strip()

        # An empty string is a substring of every string, so without this
        # guard a blank name would score as a near-certain match.
        if not n1 or not n2:
            return 0.0

        if n1 == n2:
            return 1.0

        if n1 in n2 or n2 in n1:
            return self.SUBSTRING_MATCH_SCORE

        return SequenceMatcher(None, n1, n2).ratio()
|
|
310
|
+
|
|
311
|
+
|
|
312
|
+
class LinkResult:
    """
    Outcome of trying to link one entity against the graph

    Attributes:
        linked: True when a matching existing entity was found
        existing_entity: The matched entity from the graph (None when unlinked)
        new_entity: The entity that was being linked
        similarity: Score of the match (0.0-1.0); 0.0 by default
        link_type: How the match was made ("exact_id", "embedding", "name", "none")
    """

    def __init__(
        self,
        linked: bool,
        new_entity: Entity,
        existing_entity: Optional[Entity] = None,
        similarity: float = 0.0,
        link_type: str = "none",
    ):
        # The two entities involved in the decision
        self.new_entity = new_entity
        self.existing_entity = existing_entity
        # The decision itself and how it was reached
        self.linked = linked
        self.link_type = link_type
        self.similarity = similarity

    def __repr__(self) -> str:
        # Unlinked results carry no useful detail beyond the flag.
        if not self.linked:
            return "LinkResult(linked=False)"
        return f"LinkResult(linked=True, type={self.link_type}, similarity={self.similarity:.2f})"
|