aiecs 1.5.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- aiecs/__init__.py +72 -0
- aiecs/__main__.py +41 -0
- aiecs/aiecs_client.py +469 -0
- aiecs/application/__init__.py +10 -0
- aiecs/application/executors/__init__.py +10 -0
- aiecs/application/executors/operation_executor.py +363 -0
- aiecs/application/knowledge_graph/__init__.py +7 -0
- aiecs/application/knowledge_graph/builder/__init__.py +37 -0
- aiecs/application/knowledge_graph/builder/document_builder.py +375 -0
- aiecs/application/knowledge_graph/builder/graph_builder.py +356 -0
- aiecs/application/knowledge_graph/builder/schema_mapping.py +531 -0
- aiecs/application/knowledge_graph/builder/structured_pipeline.py +443 -0
- aiecs/application/knowledge_graph/builder/text_chunker.py +319 -0
- aiecs/application/knowledge_graph/extractors/__init__.py +27 -0
- aiecs/application/knowledge_graph/extractors/base.py +100 -0
- aiecs/application/knowledge_graph/extractors/llm_entity_extractor.py +327 -0
- aiecs/application/knowledge_graph/extractors/llm_relation_extractor.py +349 -0
- aiecs/application/knowledge_graph/extractors/ner_entity_extractor.py +244 -0
- aiecs/application/knowledge_graph/fusion/__init__.py +23 -0
- aiecs/application/knowledge_graph/fusion/entity_deduplicator.py +387 -0
- aiecs/application/knowledge_graph/fusion/entity_linker.py +343 -0
- aiecs/application/knowledge_graph/fusion/knowledge_fusion.py +580 -0
- aiecs/application/knowledge_graph/fusion/relation_deduplicator.py +189 -0
- aiecs/application/knowledge_graph/pattern_matching/__init__.py +21 -0
- aiecs/application/knowledge_graph/pattern_matching/pattern_matcher.py +344 -0
- aiecs/application/knowledge_graph/pattern_matching/query_executor.py +378 -0
- aiecs/application/knowledge_graph/profiling/__init__.py +12 -0
- aiecs/application/knowledge_graph/profiling/query_plan_visualizer.py +199 -0
- aiecs/application/knowledge_graph/profiling/query_profiler.py +223 -0
- aiecs/application/knowledge_graph/reasoning/__init__.py +27 -0
- aiecs/application/knowledge_graph/reasoning/evidence_synthesis.py +347 -0
- aiecs/application/knowledge_graph/reasoning/inference_engine.py +504 -0
- aiecs/application/knowledge_graph/reasoning/logic_form_parser.py +167 -0
- aiecs/application/knowledge_graph/reasoning/logic_parser/__init__.py +79 -0
- aiecs/application/knowledge_graph/reasoning/logic_parser/ast_builder.py +513 -0
- aiecs/application/knowledge_graph/reasoning/logic_parser/ast_nodes.py +630 -0
- aiecs/application/knowledge_graph/reasoning/logic_parser/ast_validator.py +654 -0
- aiecs/application/knowledge_graph/reasoning/logic_parser/error_handler.py +477 -0
- aiecs/application/knowledge_graph/reasoning/logic_parser/parser.py +390 -0
- aiecs/application/knowledge_graph/reasoning/logic_parser/query_context.py +217 -0
- aiecs/application/knowledge_graph/reasoning/logic_query_integration.py +169 -0
- aiecs/application/knowledge_graph/reasoning/query_planner.py +872 -0
- aiecs/application/knowledge_graph/reasoning/reasoning_engine.py +554 -0
- aiecs/application/knowledge_graph/retrieval/__init__.py +19 -0
- aiecs/application/knowledge_graph/retrieval/retrieval_strategies.py +596 -0
- aiecs/application/knowledge_graph/search/__init__.py +59 -0
- aiecs/application/knowledge_graph/search/hybrid_search.py +423 -0
- aiecs/application/knowledge_graph/search/reranker.py +295 -0
- aiecs/application/knowledge_graph/search/reranker_strategies.py +553 -0
- aiecs/application/knowledge_graph/search/text_similarity.py +398 -0
- aiecs/application/knowledge_graph/traversal/__init__.py +15 -0
- aiecs/application/knowledge_graph/traversal/enhanced_traversal.py +329 -0
- aiecs/application/knowledge_graph/traversal/path_scorer.py +269 -0
- aiecs/application/knowledge_graph/validators/__init__.py +13 -0
- aiecs/application/knowledge_graph/validators/relation_validator.py +189 -0
- aiecs/application/knowledge_graph/visualization/__init__.py +11 -0
- aiecs/application/knowledge_graph/visualization/graph_visualizer.py +321 -0
- aiecs/common/__init__.py +9 -0
- aiecs/common/knowledge_graph/__init__.py +17 -0
- aiecs/common/knowledge_graph/runnable.py +484 -0
- aiecs/config/__init__.py +16 -0
- aiecs/config/config.py +498 -0
- aiecs/config/graph_config.py +137 -0
- aiecs/config/registry.py +23 -0
- aiecs/core/__init__.py +46 -0
- aiecs/core/interface/__init__.py +34 -0
- aiecs/core/interface/execution_interface.py +152 -0
- aiecs/core/interface/storage_interface.py +171 -0
- aiecs/domain/__init__.py +289 -0
- aiecs/domain/agent/__init__.py +189 -0
- aiecs/domain/agent/base_agent.py +697 -0
- aiecs/domain/agent/exceptions.py +103 -0
- aiecs/domain/agent/graph_aware_mixin.py +559 -0
- aiecs/domain/agent/hybrid_agent.py +490 -0
- aiecs/domain/agent/integration/__init__.py +26 -0
- aiecs/domain/agent/integration/context_compressor.py +222 -0
- aiecs/domain/agent/integration/context_engine_adapter.py +252 -0
- aiecs/domain/agent/integration/retry_policy.py +219 -0
- aiecs/domain/agent/integration/role_config.py +213 -0
- aiecs/domain/agent/knowledge_aware_agent.py +646 -0
- aiecs/domain/agent/lifecycle.py +296 -0
- aiecs/domain/agent/llm_agent.py +300 -0
- aiecs/domain/agent/memory/__init__.py +12 -0
- aiecs/domain/agent/memory/conversation.py +197 -0
- aiecs/domain/agent/migration/__init__.py +14 -0
- aiecs/domain/agent/migration/conversion.py +160 -0
- aiecs/domain/agent/migration/legacy_wrapper.py +90 -0
- aiecs/domain/agent/models.py +317 -0
- aiecs/domain/agent/observability.py +407 -0
- aiecs/domain/agent/persistence.py +289 -0
- aiecs/domain/agent/prompts/__init__.py +29 -0
- aiecs/domain/agent/prompts/builder.py +161 -0
- aiecs/domain/agent/prompts/formatters.py +189 -0
- aiecs/domain/agent/prompts/template.py +255 -0
- aiecs/domain/agent/registry.py +260 -0
- aiecs/domain/agent/tool_agent.py +257 -0
- aiecs/domain/agent/tools/__init__.py +12 -0
- aiecs/domain/agent/tools/schema_generator.py +221 -0
- aiecs/domain/community/__init__.py +155 -0
- aiecs/domain/community/agent_adapter.py +477 -0
- aiecs/domain/community/analytics.py +481 -0
- aiecs/domain/community/collaborative_workflow.py +642 -0
- aiecs/domain/community/communication_hub.py +645 -0
- aiecs/domain/community/community_builder.py +320 -0
- aiecs/domain/community/community_integration.py +800 -0
- aiecs/domain/community/community_manager.py +813 -0
- aiecs/domain/community/decision_engine.py +879 -0
- aiecs/domain/community/exceptions.py +225 -0
- aiecs/domain/community/models/__init__.py +33 -0
- aiecs/domain/community/models/community_models.py +268 -0
- aiecs/domain/community/resource_manager.py +457 -0
- aiecs/domain/community/shared_context_manager.py +603 -0
- aiecs/domain/context/__init__.py +58 -0
- aiecs/domain/context/context_engine.py +989 -0
- aiecs/domain/context/conversation_models.py +354 -0
- aiecs/domain/context/graph_memory.py +467 -0
- aiecs/domain/execution/__init__.py +12 -0
- aiecs/domain/execution/model.py +57 -0
- aiecs/domain/knowledge_graph/__init__.py +19 -0
- aiecs/domain/knowledge_graph/models/__init__.py +52 -0
- aiecs/domain/knowledge_graph/models/entity.py +130 -0
- aiecs/domain/knowledge_graph/models/evidence.py +194 -0
- aiecs/domain/knowledge_graph/models/inference_rule.py +186 -0
- aiecs/domain/knowledge_graph/models/path.py +179 -0
- aiecs/domain/knowledge_graph/models/path_pattern.py +173 -0
- aiecs/domain/knowledge_graph/models/query.py +272 -0
- aiecs/domain/knowledge_graph/models/query_plan.py +187 -0
- aiecs/domain/knowledge_graph/models/relation.py +136 -0
- aiecs/domain/knowledge_graph/schema/__init__.py +23 -0
- aiecs/domain/knowledge_graph/schema/entity_type.py +135 -0
- aiecs/domain/knowledge_graph/schema/graph_schema.py +271 -0
- aiecs/domain/knowledge_graph/schema/property_schema.py +155 -0
- aiecs/domain/knowledge_graph/schema/relation_type.py +171 -0
- aiecs/domain/knowledge_graph/schema/schema_manager.py +496 -0
- aiecs/domain/knowledge_graph/schema/type_enums.py +205 -0
- aiecs/domain/task/__init__.py +13 -0
- aiecs/domain/task/dsl_processor.py +613 -0
- aiecs/domain/task/model.py +62 -0
- aiecs/domain/task/task_context.py +268 -0
- aiecs/infrastructure/__init__.py +24 -0
- aiecs/infrastructure/graph_storage/__init__.py +11 -0
- aiecs/infrastructure/graph_storage/base.py +601 -0
- aiecs/infrastructure/graph_storage/batch_operations.py +449 -0
- aiecs/infrastructure/graph_storage/cache.py +429 -0
- aiecs/infrastructure/graph_storage/distributed.py +226 -0
- aiecs/infrastructure/graph_storage/error_handling.py +390 -0
- aiecs/infrastructure/graph_storage/graceful_degradation.py +306 -0
- aiecs/infrastructure/graph_storage/health_checks.py +378 -0
- aiecs/infrastructure/graph_storage/in_memory.py +514 -0
- aiecs/infrastructure/graph_storage/index_optimization.py +483 -0
- aiecs/infrastructure/graph_storage/lazy_loading.py +410 -0
- aiecs/infrastructure/graph_storage/metrics.py +357 -0
- aiecs/infrastructure/graph_storage/migration.py +413 -0
- aiecs/infrastructure/graph_storage/pagination.py +471 -0
- aiecs/infrastructure/graph_storage/performance_monitoring.py +466 -0
- aiecs/infrastructure/graph_storage/postgres.py +871 -0
- aiecs/infrastructure/graph_storage/query_optimizer.py +635 -0
- aiecs/infrastructure/graph_storage/schema_cache.py +290 -0
- aiecs/infrastructure/graph_storage/sqlite.py +623 -0
- aiecs/infrastructure/graph_storage/streaming.py +495 -0
- aiecs/infrastructure/messaging/__init__.py +13 -0
- aiecs/infrastructure/messaging/celery_task_manager.py +383 -0
- aiecs/infrastructure/messaging/websocket_manager.py +298 -0
- aiecs/infrastructure/monitoring/__init__.py +34 -0
- aiecs/infrastructure/monitoring/executor_metrics.py +174 -0
- aiecs/infrastructure/monitoring/global_metrics_manager.py +213 -0
- aiecs/infrastructure/monitoring/structured_logger.py +48 -0
- aiecs/infrastructure/monitoring/tracing_manager.py +410 -0
- aiecs/infrastructure/persistence/__init__.py +24 -0
- aiecs/infrastructure/persistence/context_engine_client.py +187 -0
- aiecs/infrastructure/persistence/database_manager.py +333 -0
- aiecs/infrastructure/persistence/file_storage.py +754 -0
- aiecs/infrastructure/persistence/redis_client.py +220 -0
- aiecs/llm/__init__.py +86 -0
- aiecs/llm/callbacks/__init__.py +11 -0
- aiecs/llm/callbacks/custom_callbacks.py +264 -0
- aiecs/llm/client_factory.py +420 -0
- aiecs/llm/clients/__init__.py +33 -0
- aiecs/llm/clients/base_client.py +193 -0
- aiecs/llm/clients/googleai_client.py +181 -0
- aiecs/llm/clients/openai_client.py +131 -0
- aiecs/llm/clients/vertex_client.py +437 -0
- aiecs/llm/clients/xai_client.py +184 -0
- aiecs/llm/config/__init__.py +51 -0
- aiecs/llm/config/config_loader.py +275 -0
- aiecs/llm/config/config_validator.py +236 -0
- aiecs/llm/config/model_config.py +151 -0
- aiecs/llm/utils/__init__.py +10 -0
- aiecs/llm/utils/validate_config.py +91 -0
- aiecs/main.py +363 -0
- aiecs/scripts/__init__.py +3 -0
- aiecs/scripts/aid/VERSION_MANAGEMENT.md +97 -0
- aiecs/scripts/aid/__init__.py +19 -0
- aiecs/scripts/aid/version_manager.py +215 -0
- aiecs/scripts/dependance_check/DEPENDENCY_SYSTEM_SUMMARY.md +242 -0
- aiecs/scripts/dependance_check/README_DEPENDENCY_CHECKER.md +310 -0
- aiecs/scripts/dependance_check/__init__.py +17 -0
- aiecs/scripts/dependance_check/dependency_checker.py +938 -0
- aiecs/scripts/dependance_check/dependency_fixer.py +391 -0
- aiecs/scripts/dependance_check/download_nlp_data.py +396 -0
- aiecs/scripts/dependance_check/quick_dependency_check.py +270 -0
- aiecs/scripts/dependance_check/setup_nlp_data.sh +217 -0
- aiecs/scripts/dependance_patch/__init__.py +7 -0
- aiecs/scripts/dependance_patch/fix_weasel/README_WEASEL_PATCH.md +126 -0
- aiecs/scripts/dependance_patch/fix_weasel/__init__.py +11 -0
- aiecs/scripts/dependance_patch/fix_weasel/fix_weasel_validator.py +128 -0
- aiecs/scripts/dependance_patch/fix_weasel/fix_weasel_validator.sh +82 -0
- aiecs/scripts/dependance_patch/fix_weasel/patch_weasel_library.sh +188 -0
- aiecs/scripts/dependance_patch/fix_weasel/run_weasel_patch.sh +41 -0
- aiecs/scripts/tools_develop/README.md +449 -0
- aiecs/scripts/tools_develop/TOOL_AUTO_DISCOVERY.md +234 -0
- aiecs/scripts/tools_develop/__init__.py +21 -0
- aiecs/scripts/tools_develop/check_type_annotations.py +259 -0
- aiecs/scripts/tools_develop/validate_tool_schemas.py +422 -0
- aiecs/scripts/tools_develop/verify_tools.py +356 -0
- aiecs/tasks/__init__.py +1 -0
- aiecs/tasks/worker.py +172 -0
- aiecs/tools/__init__.py +299 -0
- aiecs/tools/apisource/__init__.py +99 -0
- aiecs/tools/apisource/intelligence/__init__.py +19 -0
- aiecs/tools/apisource/intelligence/data_fusion.py +381 -0
- aiecs/tools/apisource/intelligence/query_analyzer.py +413 -0
- aiecs/tools/apisource/intelligence/search_enhancer.py +388 -0
- aiecs/tools/apisource/monitoring/__init__.py +9 -0
- aiecs/tools/apisource/monitoring/metrics.py +303 -0
- aiecs/tools/apisource/providers/__init__.py +115 -0
- aiecs/tools/apisource/providers/base.py +664 -0
- aiecs/tools/apisource/providers/census.py +401 -0
- aiecs/tools/apisource/providers/fred.py +564 -0
- aiecs/tools/apisource/providers/newsapi.py +412 -0
- aiecs/tools/apisource/providers/worldbank.py +357 -0
- aiecs/tools/apisource/reliability/__init__.py +12 -0
- aiecs/tools/apisource/reliability/error_handler.py +375 -0
- aiecs/tools/apisource/reliability/fallback_strategy.py +391 -0
- aiecs/tools/apisource/tool.py +850 -0
- aiecs/tools/apisource/utils/__init__.py +9 -0
- aiecs/tools/apisource/utils/validators.py +338 -0
- aiecs/tools/base_tool.py +201 -0
- aiecs/tools/docs/__init__.py +121 -0
- aiecs/tools/docs/ai_document_orchestrator.py +599 -0
- aiecs/tools/docs/ai_document_writer_orchestrator.py +2403 -0
- aiecs/tools/docs/content_insertion_tool.py +1333 -0
- aiecs/tools/docs/document_creator_tool.py +1317 -0
- aiecs/tools/docs/document_layout_tool.py +1166 -0
- aiecs/tools/docs/document_parser_tool.py +994 -0
- aiecs/tools/docs/document_writer_tool.py +1818 -0
- aiecs/tools/knowledge_graph/__init__.py +17 -0
- aiecs/tools/knowledge_graph/graph_reasoning_tool.py +734 -0
- aiecs/tools/knowledge_graph/graph_search_tool.py +923 -0
- aiecs/tools/knowledge_graph/kg_builder_tool.py +476 -0
- aiecs/tools/langchain_adapter.py +542 -0
- aiecs/tools/schema_generator.py +275 -0
- aiecs/tools/search_tool/__init__.py +100 -0
- aiecs/tools/search_tool/analyzers.py +589 -0
- aiecs/tools/search_tool/cache.py +260 -0
- aiecs/tools/search_tool/constants.py +128 -0
- aiecs/tools/search_tool/context.py +216 -0
- aiecs/tools/search_tool/core.py +749 -0
- aiecs/tools/search_tool/deduplicator.py +123 -0
- aiecs/tools/search_tool/error_handler.py +271 -0
- aiecs/tools/search_tool/metrics.py +371 -0
- aiecs/tools/search_tool/rate_limiter.py +178 -0
- aiecs/tools/search_tool/schemas.py +277 -0
- aiecs/tools/statistics/__init__.py +80 -0
- aiecs/tools/statistics/ai_data_analysis_orchestrator.py +643 -0
- aiecs/tools/statistics/ai_insight_generator_tool.py +505 -0
- aiecs/tools/statistics/ai_report_orchestrator_tool.py +694 -0
- aiecs/tools/statistics/data_loader_tool.py +564 -0
- aiecs/tools/statistics/data_profiler_tool.py +658 -0
- aiecs/tools/statistics/data_transformer_tool.py +573 -0
- aiecs/tools/statistics/data_visualizer_tool.py +495 -0
- aiecs/tools/statistics/model_trainer_tool.py +487 -0
- aiecs/tools/statistics/statistical_analyzer_tool.py +459 -0
- aiecs/tools/task_tools/__init__.py +86 -0
- aiecs/tools/task_tools/chart_tool.py +732 -0
- aiecs/tools/task_tools/classfire_tool.py +922 -0
- aiecs/tools/task_tools/image_tool.py +447 -0
- aiecs/tools/task_tools/office_tool.py +684 -0
- aiecs/tools/task_tools/pandas_tool.py +635 -0
- aiecs/tools/task_tools/report_tool.py +635 -0
- aiecs/tools/task_tools/research_tool.py +392 -0
- aiecs/tools/task_tools/scraper_tool.py +715 -0
- aiecs/tools/task_tools/stats_tool.py +688 -0
- aiecs/tools/temp_file_manager.py +130 -0
- aiecs/tools/tool_executor/__init__.py +37 -0
- aiecs/tools/tool_executor/tool_executor.py +881 -0
- aiecs/utils/LLM_output_structor.py +445 -0
- aiecs/utils/__init__.py +34 -0
- aiecs/utils/base_callback.py +47 -0
- aiecs/utils/cache_provider.py +695 -0
- aiecs/utils/execution_utils.py +184 -0
- aiecs/utils/logging.py +1 -0
- aiecs/utils/prompt_loader.py +14 -0
- aiecs/utils/token_usage_repository.py +323 -0
- aiecs/ws/__init__.py +0 -0
- aiecs/ws/socket_server.py +52 -0
- aiecs-1.5.1.dist-info/METADATA +608 -0
- aiecs-1.5.1.dist-info/RECORD +302 -0
- aiecs-1.5.1.dist-info/WHEEL +5 -0
- aiecs-1.5.1.dist-info/entry_points.txt +10 -0
- aiecs-1.5.1.dist-info/licenses/LICENSE +225 -0
- aiecs-1.5.1.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,244 @@
|
|
|
1
|
+
"""
|
|
2
|
+
spaCy NER-based Entity Extractor
|
|
3
|
+
|
|
4
|
+
Extracts entities using spaCy's Named Entity Recognition.
|
|
5
|
+
Fast, offline, and cost-free alternative to LLM extraction.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from typing import List, Optional
|
|
9
|
+
import spacy
|
|
10
|
+
from spacy.language import Language
|
|
11
|
+
|
|
12
|
+
from aiecs.application.knowledge_graph.extractors.base import EntityExtractor
|
|
13
|
+
from aiecs.domain.knowledge_graph.models.entity import Entity
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class NEREntityExtractor(EntityExtractor):
    """
    Extract entities using spaCy Named Entity Recognition.

    Uses spaCy's pre-trained NER models to identify entities. Fast, free,
    and fully offline, but limited to the standard NER label set.

    Features:
        - Fast extraction (no API calls)
        - Works offline, at no cost
        - Standard NER types (PERSON, ORG, GPE, LOC, DATE, etc.)

    Limitations:
        - Only standard entity types (no custom types)
        - Limited property extraction (mainly the entity text)
        - Lower quality than LLM extraction

    Use Cases:
        - Development and testing
        - Cost-sensitive or high-volume scenarios where LLM extraction
          is too expensive
        - Baseline for comparison

    Example:
        ```python
        extractor = NEREntityExtractor(model="en_core_web_sm")

        entities = await extractor.extract_entities(
            "Alice works at Tech Corp in San Francisco."
        )
        # Returns: [
        #   Entity(type="Person", properties={"name": "Alice", "text": "Alice"}),
        #   Entity(type="Organization", properties={"name": "Tech Corp", "text": "Tech Corp"}),
        #   Entity(type="Location", properties={"name": "San Francisco", "text": "San Francisco"})
        # ]
        ```
    """

    # Mapping from spaCy NER labels to generic entity types.
    # Unknown labels fall through unchanged (see extract_entities).
    LABEL_MAPPING = {
        "PERSON": "Person",
        "PER": "Person",
        "ORG": "Organization",
        "ORGANIZATION": "Organization",
        "GPE": "Location",  # Geo-Political Entity
        "LOC": "Location",
        "LOCATION": "Location",
        "FAC": "Facility",
        "FACILITY": "Facility",
        "PRODUCT": "Product",
        "EVENT": "Event",
        "WORK_OF_ART": "WorkOfArt",
        "LAW": "Law",
        "LANGUAGE": "Language",
        "DATE": "Date",
        "TIME": "Time",
        "PERCENT": "Percentage",
        "MONEY": "Money",
        "QUANTITY": "Quantity",
        "ORDINAL": "Ordinal",
        "CARDINAL": "Cardinal",
    }

    def __init__(
        self,
        model: str = "en_core_web_sm",
        disable_components: Optional[List[str]] = None,
    ):
        """
        Initialize NER entity extractor.

        Args:
            model: spaCy model name (default: "en_core_web_sm")
                Available models:
                - en_core_web_sm: Small English model (~13MB)
                - en_core_web_md: Medium English model (~40MB)
                - en_core_web_lg: Large English model (~560MB)
            disable_components: spaCy pipeline components to disable (for speed).
                Default: disable all except NER.

        Raises:
            RuntimeError: If the requested spaCy model is not installed.
        """
        self.model_name = model

        if disable_components is None:
            # Disable everything except NER for speed.
            disable_components = [
                "tok2vec",
                "tagger",
                "parser",
                "attribute_ruler",
                "lemmatizer",
            ]

        # Keep the try body minimal: only spacy.load can raise OSError here.
        try:
            self.nlp: Language = spacy.load(model, disable=disable_components)
        except OSError as e:
            raise RuntimeError(
                f"spaCy model '{model}' not found. "
                f"Install it with: python -m spacy download {model}"
            ) from e

    async def extract_entities(
        self, text: str, entity_types: Optional[List[str]] = None, **kwargs
    ) -> List[Entity]:
        """
        Extract entities from text using spaCy NER.

        Args:
            text: Input text to extract entities from
            entity_types: Optional filter for specific entity types
                (matched against LABEL_MAPPING values)
            **kwargs: Additional parameters (unused for NER)

        Returns:
            List of extracted Entity objects

        Raises:
            ValueError: If text is empty
        """
        if not text or not text.strip():
            raise ValueError("Input text cannot be empty")

        # Process text with spaCy
        doc = self.nlp(text)

        entities = []
        # Dedup key includes the mapped type so the same surface text with
        # different types (e.g. "Apple" as ORG vs PRODUCT) is not dropped.
        seen_keys = set()

        for ent in doc.ents:
            # Map spaCy label to generic entity type; unknown labels pass through.
            entity_type = self.LABEL_MAPPING.get(ent.label_, ent.label_)

            # Filter by entity type if requested
            if entity_types and entity_type not in entity_types:
                continue

            entity_text = ent.text.strip()
            if not entity_text:
                continue

            # Simple deduplication: skip repeats of the same (type, text) pair.
            key = (entity_type, entity_text)
            if key in seen_keys:
                continue
            seen_keys.add(key)

            entity = Entity(
                id=self._generate_entity_id(entity_type, entity_text),
                entity_type=entity_type,
                properties={
                    "name": entity_text,
                    "text": entity_text,
                    "label": ent.label_,  # Original spaCy label
                    "start_char": ent.start_char,
                    "end_char": ent.end_char,
                    "_extraction_confidence": self._estimate_confidence(ent),
                },
            )
            entities.append(entity)

        return entities

    def _generate_entity_id(self, entity_type: str, text: str) -> str:
        """
        Generate a unique ID for an entity.

        Args:
            entity_type: Entity type name
            text: Entity text

        Returns:
            Unique entity ID string
        """
        # Create deterministic ID from type + text
        normalized = f"{entity_type}_{text}".lower().replace(" ", "_")
        # Add short hash for uniqueness
        import hashlib

        hash_suffix = hashlib.md5(normalized.encode()).hexdigest()[:8]
        return f"{normalized}_{hash_suffix}"

    def _estimate_confidence(self, ent) -> float:
        """
        Estimate confidence for NER extraction.

        spaCy doesn't provide confidence scores directly, so we use heuristics:
        - Longer entities are generally more confident
        - Very short entities are penalized
        - Capitalized entities (proper nouns) are more confident

        Args:
            ent: spaCy entity span

        Returns:
            Confidence score (0.0-1.0)
        """
        # Base confidence
        confidence = 0.7

        # Adjust based on entity length
        if len(ent.text) > 20:
            confidence += 0.1
        elif len(ent.text) < 3:
            confidence -= 0.2

        # Adjust based on capitalization (proper nouns).
        # Slice instead of index so an empty span cannot raise IndexError.
        if ent.text[:1].isupper():
            confidence += 0.1

        # Clamp to [0.0, 1.0]
        return max(0.0, min(1.0, confidence))

    def get_supported_types(self) -> List[str]:
        """
        Get list of entity types that this extractor can produce.

        Returns:
            Sorted list of unique entity type names (sorted so the result
            is deterministic across calls and Python runs).
        """
        return sorted(set(self.LABEL_MAPPING.values()))

    def get_available_labels(self) -> List[str]:
        """
        Get list of NER labels available in the loaded model.

        Returns:
            List of spaCy NER labels (converted from spaCy's tuple so the
            return value matches the declared List[str] type).
        """
        return list(self.nlp.get_pipe("ner").labels)
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Knowledge Fusion Components
|
|
3
|
+
|
|
4
|
+
Components for deduplicating, merging, and linking entities across documents.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from aiecs.application.knowledge_graph.fusion.entity_deduplicator import (
|
|
8
|
+
EntityDeduplicator,
|
|
9
|
+
)
|
|
10
|
+
from aiecs.application.knowledge_graph.fusion.entity_linker import EntityLinker
|
|
11
|
+
from aiecs.application.knowledge_graph.fusion.relation_deduplicator import (
|
|
12
|
+
RelationDeduplicator,
|
|
13
|
+
)
|
|
14
|
+
from aiecs.application.knowledge_graph.fusion.knowledge_fusion import (
|
|
15
|
+
KnowledgeFusion,
|
|
16
|
+
)
|
|
17
|
+
|
|
18
|
+
__all__ = [
|
|
19
|
+
"EntityDeduplicator",
|
|
20
|
+
"EntityLinker",
|
|
21
|
+
"RelationDeduplicator",
|
|
22
|
+
"KnowledgeFusion",
|
|
23
|
+
]
|
|
@@ -0,0 +1,387 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Entity Deduplicator
|
|
3
|
+
|
|
4
|
+
Identifies and merges duplicate entities based on similarity matching.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from typing import List, Dict, Tuple, Set
|
|
8
|
+
from difflib import SequenceMatcher
|
|
9
|
+
from aiecs.domain.knowledge_graph.models.entity import Entity
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class EntityDeduplicator:
|
|
13
|
+
"""
|
|
14
|
+
Deduplicate entities based on similarity
|
|
15
|
+
|
|
16
|
+
When extracting entities from text, it's common to get duplicates:
|
|
17
|
+
- "Apple Inc." vs "Apple" vs "Apple Incorporated"
|
|
18
|
+
- "John Smith" vs "J. Smith" vs "Smith, John"
|
|
19
|
+
- "New York" vs "New York City" vs "NYC"
|
|
20
|
+
|
|
21
|
+
This class identifies such duplicates and merges them into canonical entities.
|
|
22
|
+
|
|
23
|
+
Features:
|
|
24
|
+
- Name-based fuzzy matching
|
|
25
|
+
- Type-aware matching (only match entities of same type)
|
|
26
|
+
- Property-based matching (use properties to improve matching)
|
|
27
|
+
- Configurable similarity threshold
|
|
28
|
+
- Embedding-based matching (when embeddings available)
|
|
29
|
+
|
|
30
|
+
Example:
|
|
31
|
+
```python
|
|
32
|
+
deduplicator = EntityDeduplicator(similarity_threshold=0.85)
|
|
33
|
+
|
|
34
|
+
entities = [
|
|
35
|
+
Entity(type="Company", properties={"name": "Apple Inc."}),
|
|
36
|
+
Entity(type="Company", properties={"name": "Apple"}),
|
|
37
|
+
Entity(type="Company", properties={"name": "Microsoft"})
|
|
38
|
+
]
|
|
39
|
+
|
|
40
|
+
deduplicated = await deduplicator.deduplicate(entities)
|
|
41
|
+
# Returns: [
|
|
42
|
+
# Entity(type="Company", properties={"name": "Apple Inc.", "_aliases": ["Apple"]}),
|
|
43
|
+
# Entity(type="Company", properties={"name": "Microsoft"})
|
|
44
|
+
# ]
|
|
45
|
+
```
|
|
46
|
+
"""
|
|
47
|
+
|
|
48
|
+
def __init__(
    self,
    similarity_threshold: float = 0.85,
    use_embeddings: bool = True,
    embedding_threshold: float = 0.90,
):
    """
    Create an entity deduplicator.

    Args:
        similarity_threshold: Minimum combined similarity score (0.0-1.0)
            for two entities to be treated as duplicates.
        use_embeddings: When True, embedding vectors (if present on both
            entities) contribute to the similarity score.
        embedding_threshold: Minimum embedding similarity (0.0-1.0) for
            duplicates.
    """
    # Store matching configuration; all comparison work happens lazily
    # in deduplicate().
    self.use_embeddings = use_embeddings
    self.similarity_threshold = similarity_threshold
    self.embedding_threshold = embedding_threshold
|
|
65
|
+
|
|
66
|
+
async def deduplicate(self, entities: List[Entity]) -> List[Entity]:
    """
    Deduplicate a list of entities.

    Entities are bucketed by type first — only same-typed entities can be
    duplicates of one another — and each bucket is merged independently.

    Args:
        entities: Entities to deduplicate.

    Returns:
        Deduplicated entities (with merged properties and aliases).
    """
    if not entities:
        return []

    # Bucket entities by type; cross-type pairs are never compared.
    buckets: Dict[str, List[Entity]] = {}
    for ent in entities:
        buckets.setdefault(ent.entity_type, []).append(ent)

    # Deduplicate each bucket and concatenate the results.
    result: List[Entity] = []
    for bucket in buckets.values():
        result.extend(await self._deduplicate_type_group(bucket))

    return result
|
|
93
|
+
|
|
94
|
+
async def _deduplicate_type_group(self, entities: List[Entity]) -> List[Entity]:
    """
    Deduplicate entities that all share one type.

    Algorithm:
    1. Score every pair; keep index pairs at or above the threshold.
    2. Treat those pairs as graph edges and find connected components.
    3. Collapse each component into a single canonical entity.

    Args:
        entities: Entities of a single type.

    Returns:
        Deduplicated entities.
    """
    if len(entities) <= 1:
        return entities

    # Pairwise scoring: record each pair of indices that is similar
    # enough to be considered a duplicate link.
    count = len(entities)
    edges: Set[Tuple[int, int]] = set()
    for left in range(count):
        for right in range(left + 1, count):
            score = await self._compute_similarity(entities[left], entities[right])
            if score >= self.similarity_threshold:
                edges.add((left, right))

    # Connected components of the similarity graph are duplicate clusters;
    # each one merges down to a single canonical entity.
    merged: List[Entity] = []
    for component in self._find_clusters(count, edges):
        members = [entities[pos] for pos in component]
        merged.append(self._merge_entities(members))

    return merged
|
|
133
|
+
|
|
134
|
+
async def _compute_similarity(self, entity1: Entity, entity2: Entity) -> float:
    """
    Compute similarity between two entities.

    Uses multiple signals:
    1. Name similarity (fuzzy string matching)
    2. Property overlap
    3. Embedding similarity (if available)

    Args:
        entity1: First entity
        entity2: Second entity

    Returns:
        Similarity score (0.0-1.0); 0.0 when either entity has no name
    """
    # Get entity names; with no name on either side there is nothing to compare
    name1 = self._get_entity_name(entity1)
    name2 = self._get_entity_name(entity2)

    if not name1 or not name2:
        return 0.0

    # 1. Name-based similarity
    name_similarity = self._string_similarity(name1, name2)

    # 2. Property overlap
    property_similarity = self._property_similarity(entity1.properties, entity2.properties)

    # 3. Embedding availability — decide once. (The original evaluated this
    # same condition twice, with reordered operands, once to compute the
    # embedding similarity and again to pick the weighting.)
    if self.use_embeddings and entity1.embedding and entity2.embedding:
        embedding_similarity = self._cosine_similarity(entity1.embedding, entity2.embedding)
        # Embeddings available: give them the dominant weight
        return 0.3 * name_similarity + 0.2 * property_similarity + 0.5 * embedding_similarity

    # No embeddings: rely on name and properties only
    return 0.7 * name_similarity + 0.3 * property_similarity
def _get_entity_name(self, entity: Entity) -> str:
    """Return the entity's display name from its properties.

    Checks the "name", "title", and "text" properties in that order and
    returns the first truthy value, or an empty string when none is set.
    """
    for key in ("name", "title", "text"):
        value = entity.properties.get(key)
        if value:
            return value
    return ""
def _string_similarity(self, str1: str, str2: str) -> float:
|
|
186
|
+
"""
|
|
187
|
+
Compute string similarity using multiple methods
|
|
188
|
+
|
|
189
|
+
Combines:
|
|
190
|
+
- Exact match (normalized)
|
|
191
|
+
- SequenceMatcher ratio
|
|
192
|
+
- Token overlap (for multi-word entities)
|
|
193
|
+
|
|
194
|
+
Args:
|
|
195
|
+
str1: First string
|
|
196
|
+
str2: Second string
|
|
197
|
+
|
|
198
|
+
Returns:
|
|
199
|
+
Similarity score (0.0-1.0)
|
|
200
|
+
"""
|
|
201
|
+
# Normalize strings
|
|
202
|
+
s1 = str1.lower().strip()
|
|
203
|
+
s2 = str2.lower().strip()
|
|
204
|
+
|
|
205
|
+
# Exact match
|
|
206
|
+
if s1 == s2:
|
|
207
|
+
return 1.0
|
|
208
|
+
|
|
209
|
+
# One is substring of other
|
|
210
|
+
if s1 in s2 or s2 in s1:
|
|
211
|
+
return 0.95
|
|
212
|
+
|
|
213
|
+
# Sequence matcher
|
|
214
|
+
seq_similarity = SequenceMatcher(None, s1, s2).ratio()
|
|
215
|
+
|
|
216
|
+
# Token overlap (for multi-word names)
|
|
217
|
+
tokens1 = set(s1.split())
|
|
218
|
+
tokens2 = set(s2.split())
|
|
219
|
+
if tokens1 and tokens2:
|
|
220
|
+
token_overlap = len(tokens1 & tokens2) / len(tokens1 | tokens2)
|
|
221
|
+
else:
|
|
222
|
+
token_overlap = 0.0
|
|
223
|
+
|
|
224
|
+
# Combine
|
|
225
|
+
return max(seq_similarity, token_overlap)
|
|
226
|
+
|
|
227
|
+
def _property_similarity(self, props1: Dict, props2: Dict) -> float:
|
|
228
|
+
"""
|
|
229
|
+
Compute similarity based on property overlap
|
|
230
|
+
|
|
231
|
+
Args:
|
|
232
|
+
props1: Properties of first entity
|
|
233
|
+
props2: Properties of second entity
|
|
234
|
+
|
|
235
|
+
Returns:
|
|
236
|
+
Similarity score (0.0-1.0)
|
|
237
|
+
"""
|
|
238
|
+
# Remove internal properties
|
|
239
|
+
keys1 = {k for k in props1.keys() if not k.startswith("_")}
|
|
240
|
+
keys2 = {k for k in props2.keys() if not k.startswith("_")}
|
|
241
|
+
|
|
242
|
+
if not keys1 and not keys2:
|
|
243
|
+
return 0.5 # No properties to compare
|
|
244
|
+
|
|
245
|
+
# Key overlap
|
|
246
|
+
common_keys = keys1 & keys2
|
|
247
|
+
all_keys = keys1 | keys2
|
|
248
|
+
|
|
249
|
+
if not all_keys:
|
|
250
|
+
return 0.5
|
|
251
|
+
|
|
252
|
+
key_overlap = len(common_keys) / len(all_keys)
|
|
253
|
+
|
|
254
|
+
# Value similarity for common keys
|
|
255
|
+
value_matches = 0
|
|
256
|
+
for key in common_keys:
|
|
257
|
+
val1 = str(props1[key]).lower()
|
|
258
|
+
val2 = str(props2[key]).lower()
|
|
259
|
+
if val1 == val2:
|
|
260
|
+
value_matches += 1
|
|
261
|
+
|
|
262
|
+
value_similarity = value_matches / len(common_keys) if common_keys else 0.0
|
|
263
|
+
|
|
264
|
+
# Combine
|
|
265
|
+
return 0.5 * key_overlap + 0.5 * value_similarity
|
|
266
|
+
|
|
267
|
+
def _cosine_similarity(self, vec1: List[float], vec2: List[float]) -> float:
|
|
268
|
+
"""
|
|
269
|
+
Compute cosine similarity between two vectors
|
|
270
|
+
|
|
271
|
+
Args:
|
|
272
|
+
vec1: First vector
|
|
273
|
+
vec2: Second vector
|
|
274
|
+
|
|
275
|
+
Returns:
|
|
276
|
+
Cosine similarity (0.0-1.0)
|
|
277
|
+
"""
|
|
278
|
+
if len(vec1) != len(vec2):
|
|
279
|
+
return 0.0
|
|
280
|
+
|
|
281
|
+
dot_product = sum(a * b for a, b in zip(vec1, vec2))
|
|
282
|
+
magnitude1 = sum(a * a for a in vec1) ** 0.5
|
|
283
|
+
magnitude2 = sum(b * b for b in vec2) ** 0.5
|
|
284
|
+
|
|
285
|
+
if magnitude1 == 0 or magnitude2 == 0:
|
|
286
|
+
return 0.0
|
|
287
|
+
|
|
288
|
+
# Cosine similarity ranges from -1 to 1, normalize to 0 to 1
|
|
289
|
+
similarity = dot_product / (magnitude1 * magnitude2)
|
|
290
|
+
return (similarity + 1) / 2
|
|
291
|
+
|
|
292
|
+
def _find_clusters(self, n: int, edges: Set[Tuple[int, int]]) -> List[List[int]]:
|
|
293
|
+
"""
|
|
294
|
+
Find connected components using Union-Find
|
|
295
|
+
|
|
296
|
+
Args:
|
|
297
|
+
n: Number of nodes
|
|
298
|
+
edges: Set of edges (i, j) indicating similarity
|
|
299
|
+
|
|
300
|
+
Returns:
|
|
301
|
+
List of clusters, where each cluster is a list of node indices
|
|
302
|
+
"""
|
|
303
|
+
# Union-Find data structure
|
|
304
|
+
parent = list(range(n))
|
|
305
|
+
|
|
306
|
+
def find(x):
|
|
307
|
+
if parent[x] != x:
|
|
308
|
+
parent[x] = find(parent[x]) # Path compression
|
|
309
|
+
return parent[x]
|
|
310
|
+
|
|
311
|
+
def union(x, y):
|
|
312
|
+
px, py = find(x), find(y)
|
|
313
|
+
if px != py:
|
|
314
|
+
parent[px] = py
|
|
315
|
+
|
|
316
|
+
# Build connected components
|
|
317
|
+
for i, j in edges:
|
|
318
|
+
union(i, j)
|
|
319
|
+
|
|
320
|
+
# Group by root
|
|
321
|
+
clusters_dict: Dict[int, List[int]] = {}
|
|
322
|
+
for i in range(n):
|
|
323
|
+
root = find(i)
|
|
324
|
+
if root not in clusters_dict:
|
|
325
|
+
clusters_dict[root] = []
|
|
326
|
+
clusters_dict[root].append(i)
|
|
327
|
+
|
|
328
|
+
return list(clusters_dict.values())
|
|
329
|
+
|
|
330
|
+
def _merge_entities(self, entities: List[Entity]) -> Entity:
    """
    Merge a cluster of similar entities into one canonical entity.

    Strategy:
    - Use the first entity as base
    - Merge all properties (prefer non-empty values)
    - Store alternative names as aliases under "_aliases"
    - Keep highest confidence score

    Args:
        entities: List of entities to merge (non-empty)

    Returns:
        Merged canonical entity
    """
    if len(entities) == 1:
        return entities[0]

    # Use first entity as base
    canonical = entities[0]

    # Collect all differing names as aliases. The canonical name is
    # loop-invariant; compute it once instead of re-deriving it on every
    # iteration as the original did.
    canonical_name = self._get_entity_name(canonical)
    aliases = {
        name
        for name in (self._get_entity_name(entity) for entity in entities)
        if name and name != canonical_name
    }

    # Merge properties: a later entity's value fills a key only when the
    # current value is missing or falsy.
    # NOTE(review): the falsy test also overwrites legitimate 0/False
    # values — preserved from the original; confirm this is intended.
    merged_properties = dict(canonical.properties)
    for entity in entities[1:]:
        for key, value in entity.properties.items():
            if key not in merged_properties or not merged_properties[key]:
                merged_properties[key] = value

    # Add aliases
    if aliases:
        merged_properties["_aliases"] = list(aliases)

    # Keep the best extraction confidence seen across the cluster
    merged_properties["_extraction_confidence"] = max(
        e.properties.get("_extraction_confidence", 0.5) for e in entities
    )

    # Record how many entities were collapsed into this one
    merged_properties["_merged_count"] = len(entities)

    # Create merged entity, inheriting identity and embedding from the base
    return Entity(
        id=canonical.id,
        entity_type=canonical.entity_type,
        properties=merged_properties,
        embedding=canonical.embedding,
        source=canonical.source,
    )