aiecs 1.0.1__py3-none-any.whl → 1.7.17__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of aiecs might be problematic.
- aiecs/__init__.py +13 -16
- aiecs/__main__.py +7 -7
- aiecs/aiecs_client.py +269 -75
- aiecs/application/executors/operation_executor.py +79 -54
- aiecs/application/knowledge_graph/__init__.py +7 -0
- aiecs/application/knowledge_graph/builder/__init__.py +37 -0
- aiecs/application/knowledge_graph/builder/data_quality.py +302 -0
- aiecs/application/knowledge_graph/builder/data_reshaping.py +293 -0
- aiecs/application/knowledge_graph/builder/document_builder.py +369 -0
- aiecs/application/knowledge_graph/builder/graph_builder.py +490 -0
- aiecs/application/knowledge_graph/builder/import_optimizer.py +396 -0
- aiecs/application/knowledge_graph/builder/schema_inference.py +462 -0
- aiecs/application/knowledge_graph/builder/schema_mapping.py +563 -0
- aiecs/application/knowledge_graph/builder/structured_pipeline.py +1384 -0
- aiecs/application/knowledge_graph/builder/text_chunker.py +317 -0
- aiecs/application/knowledge_graph/extractors/__init__.py +27 -0
- aiecs/application/knowledge_graph/extractors/base.py +98 -0
- aiecs/application/knowledge_graph/extractors/llm_entity_extractor.py +422 -0
- aiecs/application/knowledge_graph/extractors/llm_relation_extractor.py +347 -0
- aiecs/application/knowledge_graph/extractors/ner_entity_extractor.py +241 -0
- aiecs/application/knowledge_graph/fusion/__init__.py +78 -0
- aiecs/application/knowledge_graph/fusion/ab_testing.py +395 -0
- aiecs/application/knowledge_graph/fusion/abbreviation_expander.py +327 -0
- aiecs/application/knowledge_graph/fusion/alias_index.py +597 -0
- aiecs/application/knowledge_graph/fusion/alias_matcher.py +384 -0
- aiecs/application/knowledge_graph/fusion/cache_coordinator.py +343 -0
- aiecs/application/knowledge_graph/fusion/entity_deduplicator.py +433 -0
- aiecs/application/knowledge_graph/fusion/entity_linker.py +511 -0
- aiecs/application/knowledge_graph/fusion/evaluation_dataset.py +240 -0
- aiecs/application/knowledge_graph/fusion/knowledge_fusion.py +632 -0
- aiecs/application/knowledge_graph/fusion/matching_config.py +489 -0
- aiecs/application/knowledge_graph/fusion/name_normalizer.py +352 -0
- aiecs/application/knowledge_graph/fusion/relation_deduplicator.py +183 -0
- aiecs/application/knowledge_graph/fusion/semantic_name_matcher.py +464 -0
- aiecs/application/knowledge_graph/fusion/similarity_pipeline.py +534 -0
- aiecs/application/knowledge_graph/pattern_matching/__init__.py +21 -0
- aiecs/application/knowledge_graph/pattern_matching/pattern_matcher.py +342 -0
- aiecs/application/knowledge_graph/pattern_matching/query_executor.py +366 -0
- aiecs/application/knowledge_graph/profiling/__init__.py +12 -0
- aiecs/application/knowledge_graph/profiling/query_plan_visualizer.py +195 -0
- aiecs/application/knowledge_graph/profiling/query_profiler.py +223 -0
- aiecs/application/knowledge_graph/reasoning/__init__.py +27 -0
- aiecs/application/knowledge_graph/reasoning/evidence_synthesis.py +341 -0
- aiecs/application/knowledge_graph/reasoning/inference_engine.py +500 -0
- aiecs/application/knowledge_graph/reasoning/logic_form_parser.py +163 -0
- aiecs/application/knowledge_graph/reasoning/logic_parser/__init__.py +79 -0
- aiecs/application/knowledge_graph/reasoning/logic_parser/ast_builder.py +513 -0
- aiecs/application/knowledge_graph/reasoning/logic_parser/ast_nodes.py +913 -0
- aiecs/application/knowledge_graph/reasoning/logic_parser/ast_validator.py +866 -0
- aiecs/application/knowledge_graph/reasoning/logic_parser/error_handler.py +475 -0
- aiecs/application/knowledge_graph/reasoning/logic_parser/parser.py +396 -0
- aiecs/application/knowledge_graph/reasoning/logic_parser/query_context.py +208 -0
- aiecs/application/knowledge_graph/reasoning/logic_query_integration.py +170 -0
- aiecs/application/knowledge_graph/reasoning/query_planner.py +855 -0
- aiecs/application/knowledge_graph/reasoning/reasoning_engine.py +518 -0
- aiecs/application/knowledge_graph/retrieval/__init__.py +27 -0
- aiecs/application/knowledge_graph/retrieval/query_intent_classifier.py +211 -0
- aiecs/application/knowledge_graph/retrieval/retrieval_strategies.py +592 -0
- aiecs/application/knowledge_graph/retrieval/strategy_types.py +23 -0
- aiecs/application/knowledge_graph/search/__init__.py +59 -0
- aiecs/application/knowledge_graph/search/hybrid_search.py +457 -0
- aiecs/application/knowledge_graph/search/reranker.py +293 -0
- aiecs/application/knowledge_graph/search/reranker_strategies.py +535 -0
- aiecs/application/knowledge_graph/search/text_similarity.py +392 -0
- aiecs/application/knowledge_graph/traversal/__init__.py +15 -0
- aiecs/application/knowledge_graph/traversal/enhanced_traversal.py +305 -0
- aiecs/application/knowledge_graph/traversal/path_scorer.py +271 -0
- aiecs/application/knowledge_graph/validators/__init__.py +13 -0
- aiecs/application/knowledge_graph/validators/relation_validator.py +239 -0
- aiecs/application/knowledge_graph/visualization/__init__.py +11 -0
- aiecs/application/knowledge_graph/visualization/graph_visualizer.py +313 -0
- aiecs/common/__init__.py +9 -0
- aiecs/common/knowledge_graph/__init__.py +17 -0
- aiecs/common/knowledge_graph/runnable.py +471 -0
- aiecs/config/__init__.py +20 -5
- aiecs/config/config.py +762 -31
- aiecs/config/graph_config.py +131 -0
- aiecs/config/tool_config.py +435 -0
- aiecs/core/__init__.py +29 -13
- aiecs/core/interface/__init__.py +2 -2
- aiecs/core/interface/execution_interface.py +22 -22
- aiecs/core/interface/storage_interface.py +37 -88
- aiecs/core/registry/__init__.py +31 -0
- aiecs/core/registry/service_registry.py +92 -0
- aiecs/domain/__init__.py +270 -1
- aiecs/domain/agent/__init__.py +191 -0
- aiecs/domain/agent/base_agent.py +3949 -0
- aiecs/domain/agent/exceptions.py +99 -0
- aiecs/domain/agent/graph_aware_mixin.py +569 -0
- aiecs/domain/agent/hybrid_agent.py +1731 -0
- aiecs/domain/agent/integration/__init__.py +29 -0
- aiecs/domain/agent/integration/context_compressor.py +216 -0
- aiecs/domain/agent/integration/context_engine_adapter.py +587 -0
- aiecs/domain/agent/integration/protocols.py +281 -0
- aiecs/domain/agent/integration/retry_policy.py +218 -0
- aiecs/domain/agent/integration/role_config.py +213 -0
- aiecs/domain/agent/knowledge_aware_agent.py +1892 -0
- aiecs/domain/agent/lifecycle.py +291 -0
- aiecs/domain/agent/llm_agent.py +692 -0
- aiecs/domain/agent/memory/__init__.py +12 -0
- aiecs/domain/agent/memory/conversation.py +1124 -0
- aiecs/domain/agent/migration/__init__.py +14 -0
- aiecs/domain/agent/migration/conversion.py +163 -0
- aiecs/domain/agent/migration/legacy_wrapper.py +86 -0
- aiecs/domain/agent/models.py +894 -0
- aiecs/domain/agent/observability.py +479 -0
- aiecs/domain/agent/persistence.py +449 -0
- aiecs/domain/agent/prompts/__init__.py +29 -0
- aiecs/domain/agent/prompts/builder.py +159 -0
- aiecs/domain/agent/prompts/formatters.py +187 -0
- aiecs/domain/agent/prompts/template.py +255 -0
- aiecs/domain/agent/registry.py +253 -0
- aiecs/domain/agent/tool_agent.py +444 -0
- aiecs/domain/agent/tools/__init__.py +15 -0
- aiecs/domain/agent/tools/schema_generator.py +377 -0
- aiecs/domain/community/__init__.py +155 -0
- aiecs/domain/community/agent_adapter.py +469 -0
- aiecs/domain/community/analytics.py +432 -0
- aiecs/domain/community/collaborative_workflow.py +648 -0
- aiecs/domain/community/communication_hub.py +634 -0
- aiecs/domain/community/community_builder.py +320 -0
- aiecs/domain/community/community_integration.py +796 -0
- aiecs/domain/community/community_manager.py +803 -0
- aiecs/domain/community/decision_engine.py +849 -0
- aiecs/domain/community/exceptions.py +231 -0
- aiecs/domain/community/models/__init__.py +33 -0
- aiecs/domain/community/models/community_models.py +234 -0
- aiecs/domain/community/resource_manager.py +461 -0
- aiecs/domain/community/shared_context_manager.py +589 -0
- aiecs/domain/context/__init__.py +40 -10
- aiecs/domain/context/context_engine.py +1910 -0
- aiecs/domain/context/conversation_models.py +87 -53
- aiecs/domain/context/graph_memory.py +582 -0
- aiecs/domain/execution/model.py +12 -4
- aiecs/domain/knowledge_graph/__init__.py +19 -0
- aiecs/domain/knowledge_graph/models/__init__.py +52 -0
- aiecs/domain/knowledge_graph/models/entity.py +148 -0
- aiecs/domain/knowledge_graph/models/evidence.py +178 -0
- aiecs/domain/knowledge_graph/models/inference_rule.py +184 -0
- aiecs/domain/knowledge_graph/models/path.py +171 -0
- aiecs/domain/knowledge_graph/models/path_pattern.py +171 -0
- aiecs/domain/knowledge_graph/models/query.py +261 -0
- aiecs/domain/knowledge_graph/models/query_plan.py +181 -0
- aiecs/domain/knowledge_graph/models/relation.py +202 -0
- aiecs/domain/knowledge_graph/schema/__init__.py +23 -0
- aiecs/domain/knowledge_graph/schema/entity_type.py +131 -0
- aiecs/domain/knowledge_graph/schema/graph_schema.py +253 -0
- aiecs/domain/knowledge_graph/schema/property_schema.py +143 -0
- aiecs/domain/knowledge_graph/schema/relation_type.py +163 -0
- aiecs/domain/knowledge_graph/schema/schema_manager.py +691 -0
- aiecs/domain/knowledge_graph/schema/type_enums.py +209 -0
- aiecs/domain/task/dsl_processor.py +172 -56
- aiecs/domain/task/model.py +20 -8
- aiecs/domain/task/task_context.py +27 -24
- aiecs/infrastructure/__init__.py +0 -2
- aiecs/infrastructure/graph_storage/__init__.py +11 -0
- aiecs/infrastructure/graph_storage/base.py +837 -0
- aiecs/infrastructure/graph_storage/batch_operations.py +458 -0
- aiecs/infrastructure/graph_storage/cache.py +424 -0
- aiecs/infrastructure/graph_storage/distributed.py +223 -0
- aiecs/infrastructure/graph_storage/error_handling.py +380 -0
- aiecs/infrastructure/graph_storage/graceful_degradation.py +294 -0
- aiecs/infrastructure/graph_storage/health_checks.py +378 -0
- aiecs/infrastructure/graph_storage/in_memory.py +1197 -0
- aiecs/infrastructure/graph_storage/index_optimization.py +446 -0
- aiecs/infrastructure/graph_storage/lazy_loading.py +431 -0
- aiecs/infrastructure/graph_storage/metrics.py +344 -0
- aiecs/infrastructure/graph_storage/migration.py +400 -0
- aiecs/infrastructure/graph_storage/pagination.py +483 -0
- aiecs/infrastructure/graph_storage/performance_monitoring.py +456 -0
- aiecs/infrastructure/graph_storage/postgres.py +1563 -0
- aiecs/infrastructure/graph_storage/property_storage.py +353 -0
- aiecs/infrastructure/graph_storage/protocols.py +76 -0
- aiecs/infrastructure/graph_storage/query_optimizer.py +642 -0
- aiecs/infrastructure/graph_storage/schema_cache.py +290 -0
- aiecs/infrastructure/graph_storage/sqlite.py +1373 -0
- aiecs/infrastructure/graph_storage/streaming.py +487 -0
- aiecs/infrastructure/graph_storage/tenant.py +412 -0
- aiecs/infrastructure/messaging/celery_task_manager.py +92 -54
- aiecs/infrastructure/messaging/websocket_manager.py +51 -35
- aiecs/infrastructure/monitoring/__init__.py +22 -0
- aiecs/infrastructure/monitoring/executor_metrics.py +45 -11
- aiecs/infrastructure/monitoring/global_metrics_manager.py +212 -0
- aiecs/infrastructure/monitoring/structured_logger.py +3 -7
- aiecs/infrastructure/monitoring/tracing_manager.py +63 -35
- aiecs/infrastructure/persistence/__init__.py +14 -1
- aiecs/infrastructure/persistence/context_engine_client.py +184 -0
- aiecs/infrastructure/persistence/database_manager.py +67 -43
- aiecs/infrastructure/persistence/file_storage.py +180 -103
- aiecs/infrastructure/persistence/redis_client.py +74 -21
- aiecs/llm/__init__.py +73 -25
- aiecs/llm/callbacks/__init__.py +11 -0
- aiecs/llm/{custom_callbacks.py → callbacks/custom_callbacks.py} +26 -19
- aiecs/llm/client_factory.py +230 -37
- aiecs/llm/client_resolver.py +155 -0
- aiecs/llm/clients/__init__.py +38 -0
- aiecs/llm/clients/base_client.py +328 -0
- aiecs/llm/clients/google_function_calling_mixin.py +415 -0
- aiecs/llm/clients/googleai_client.py +314 -0
- aiecs/llm/clients/openai_client.py +158 -0
- aiecs/llm/clients/openai_compatible_mixin.py +367 -0
- aiecs/llm/clients/vertex_client.py +1186 -0
- aiecs/llm/clients/xai_client.py +201 -0
- aiecs/llm/config/__init__.py +51 -0
- aiecs/llm/config/config_loader.py +272 -0
- aiecs/llm/config/config_validator.py +206 -0
- aiecs/llm/config/model_config.py +143 -0
- aiecs/llm/protocols.py +149 -0
- aiecs/llm/utils/__init__.py +10 -0
- aiecs/llm/utils/validate_config.py +89 -0
- aiecs/main.py +140 -121
- aiecs/scripts/aid/VERSION_MANAGEMENT.md +138 -0
- aiecs/scripts/aid/__init__.py +19 -0
- aiecs/scripts/aid/module_checker.py +499 -0
- aiecs/scripts/aid/version_manager.py +235 -0
- aiecs/scripts/{DEPENDENCY_SYSTEM_SUMMARY.md → dependance_check/DEPENDENCY_SYSTEM_SUMMARY.md} +1 -0
- aiecs/scripts/{README_DEPENDENCY_CHECKER.md → dependance_check/README_DEPENDENCY_CHECKER.md} +1 -0
- aiecs/scripts/dependance_check/__init__.py +15 -0
- aiecs/scripts/dependance_check/dependency_checker.py +1835 -0
- aiecs/scripts/{dependency_fixer.py → dependance_check/dependency_fixer.py} +192 -90
- aiecs/scripts/{download_nlp_data.py → dependance_check/download_nlp_data.py} +203 -71
- aiecs/scripts/dependance_patch/__init__.py +7 -0
- aiecs/scripts/dependance_patch/fix_weasel/__init__.py +11 -0
- aiecs/scripts/{fix_weasel_validator.py → dependance_patch/fix_weasel/fix_weasel_validator.py} +21 -14
- aiecs/scripts/{patch_weasel_library.sh → dependance_patch/fix_weasel/patch_weasel_library.sh} +1 -1
- aiecs/scripts/knowledge_graph/__init__.py +3 -0
- aiecs/scripts/knowledge_graph/run_threshold_experiments.py +212 -0
- aiecs/scripts/migrations/multi_tenancy/README.md +142 -0
- aiecs/scripts/tools_develop/README.md +671 -0
- aiecs/scripts/tools_develop/README_CONFIG_CHECKER.md +273 -0
- aiecs/scripts/tools_develop/TOOLS_CONFIG_GUIDE.md +1287 -0
- aiecs/scripts/tools_develop/TOOL_AUTO_DISCOVERY.md +234 -0
- aiecs/scripts/tools_develop/__init__.py +21 -0
- aiecs/scripts/tools_develop/check_all_tools_config.py +548 -0
- aiecs/scripts/tools_develop/check_type_annotations.py +257 -0
- aiecs/scripts/tools_develop/pre-commit-schema-coverage.sh +66 -0
- aiecs/scripts/tools_develop/schema_coverage.py +511 -0
- aiecs/scripts/tools_develop/validate_tool_schemas.py +475 -0
- aiecs/scripts/tools_develop/verify_executor_config_fix.py +98 -0
- aiecs/scripts/tools_develop/verify_tools.py +352 -0
- aiecs/tasks/__init__.py +0 -1
- aiecs/tasks/worker.py +115 -47
- aiecs/tools/__init__.py +194 -72
- aiecs/tools/apisource/__init__.py +99 -0
- aiecs/tools/apisource/intelligence/__init__.py +19 -0
- aiecs/tools/apisource/intelligence/data_fusion.py +632 -0
- aiecs/tools/apisource/intelligence/query_analyzer.py +417 -0
- aiecs/tools/apisource/intelligence/search_enhancer.py +385 -0
- aiecs/tools/apisource/monitoring/__init__.py +9 -0
- aiecs/tools/apisource/monitoring/metrics.py +330 -0
- aiecs/tools/apisource/providers/__init__.py +112 -0
- aiecs/tools/apisource/providers/base.py +671 -0
- aiecs/tools/apisource/providers/census.py +397 -0
- aiecs/tools/apisource/providers/fred.py +535 -0
- aiecs/tools/apisource/providers/newsapi.py +409 -0
- aiecs/tools/apisource/providers/worldbank.py +352 -0
- aiecs/tools/apisource/reliability/__init__.py +12 -0
- aiecs/tools/apisource/reliability/error_handler.py +363 -0
- aiecs/tools/apisource/reliability/fallback_strategy.py +376 -0
- aiecs/tools/apisource/tool.py +832 -0
- aiecs/tools/apisource/utils/__init__.py +9 -0
- aiecs/tools/apisource/utils/validators.py +334 -0
- aiecs/tools/base_tool.py +415 -21
- aiecs/tools/docs/__init__.py +121 -0
- aiecs/tools/docs/ai_document_orchestrator.py +607 -0
- aiecs/tools/docs/ai_document_writer_orchestrator.py +2350 -0
- aiecs/tools/docs/content_insertion_tool.py +1320 -0
- aiecs/tools/docs/document_creator_tool.py +1464 -0
- aiecs/tools/docs/document_layout_tool.py +1160 -0
- aiecs/tools/docs/document_parser_tool.py +1016 -0
- aiecs/tools/docs/document_writer_tool.py +2008 -0
- aiecs/tools/knowledge_graph/__init__.py +17 -0
- aiecs/tools/knowledge_graph/graph_reasoning_tool.py +807 -0
- aiecs/tools/knowledge_graph/graph_search_tool.py +944 -0
- aiecs/tools/knowledge_graph/kg_builder_tool.py +524 -0
- aiecs/tools/langchain_adapter.py +300 -138
- aiecs/tools/schema_generator.py +455 -0
- aiecs/tools/search_tool/__init__.py +100 -0
- aiecs/tools/search_tool/analyzers.py +581 -0
- aiecs/tools/search_tool/cache.py +264 -0
- aiecs/tools/search_tool/constants.py +128 -0
- aiecs/tools/search_tool/context.py +224 -0
- aiecs/tools/search_tool/core.py +778 -0
- aiecs/tools/search_tool/deduplicator.py +119 -0
- aiecs/tools/search_tool/error_handler.py +242 -0
- aiecs/tools/search_tool/metrics.py +343 -0
- aiecs/tools/search_tool/rate_limiter.py +172 -0
- aiecs/tools/search_tool/schemas.py +275 -0
- aiecs/tools/statistics/__init__.py +80 -0
- aiecs/tools/statistics/ai_data_analysis_orchestrator.py +646 -0
- aiecs/tools/statistics/ai_insight_generator_tool.py +508 -0
- aiecs/tools/statistics/ai_report_orchestrator_tool.py +684 -0
- aiecs/tools/statistics/data_loader_tool.py +555 -0
- aiecs/tools/statistics/data_profiler_tool.py +638 -0
- aiecs/tools/statistics/data_transformer_tool.py +580 -0
- aiecs/tools/statistics/data_visualizer_tool.py +498 -0
- aiecs/tools/statistics/model_trainer_tool.py +507 -0
- aiecs/tools/statistics/statistical_analyzer_tool.py +472 -0
- aiecs/tools/task_tools/__init__.py +49 -36
- aiecs/tools/task_tools/chart_tool.py +200 -184
- aiecs/tools/task_tools/classfire_tool.py +268 -267
- aiecs/tools/task_tools/image_tool.py +220 -141
- aiecs/tools/task_tools/office_tool.py +226 -146
- aiecs/tools/task_tools/pandas_tool.py +477 -121
- aiecs/tools/task_tools/report_tool.py +390 -142
- aiecs/tools/task_tools/research_tool.py +149 -79
- aiecs/tools/task_tools/scraper_tool.py +339 -145
- aiecs/tools/task_tools/stats_tool.py +448 -209
- aiecs/tools/temp_file_manager.py +26 -24
- aiecs/tools/tool_executor/__init__.py +18 -16
- aiecs/tools/tool_executor/tool_executor.py +364 -52
- aiecs/utils/LLM_output_structor.py +74 -48
- aiecs/utils/__init__.py +14 -3
- aiecs/utils/base_callback.py +0 -3
- aiecs/utils/cache_provider.py +696 -0
- aiecs/utils/execution_utils.py +50 -31
- aiecs/utils/prompt_loader.py +1 -0
- aiecs/utils/token_usage_repository.py +37 -11
- aiecs/ws/socket_server.py +14 -4
- {aiecs-1.0.1.dist-info → aiecs-1.7.17.dist-info}/METADATA +52 -15
- aiecs-1.7.17.dist-info/RECORD +337 -0
- aiecs-1.7.17.dist-info/entry_points.txt +13 -0
- aiecs/config/registry.py +0 -19
- aiecs/domain/context/content_engine.py +0 -982
- aiecs/llm/base_client.py +0 -99
- aiecs/llm/openai_client.py +0 -125
- aiecs/llm/vertex_client.py +0 -186
- aiecs/llm/xai_client.py +0 -184
- aiecs/scripts/dependency_checker.py +0 -857
- aiecs/scripts/quick_dependency_check.py +0 -269
- aiecs/tools/task_tools/search_api.py +0 -7
- aiecs-1.0.1.dist-info/RECORD +0 -90
- aiecs-1.0.1.dist-info/entry_points.txt +0 -7
- /aiecs/scripts/{setup_nlp_data.sh → dependance_check/setup_nlp_data.sh} +0 -0
- /aiecs/scripts/{README_WEASEL_PATCH.md → dependance_patch/fix_weasel/README_WEASEL_PATCH.md} +0 -0
- /aiecs/scripts/{fix_weasel_validator.sh → dependance_patch/fix_weasel/fix_weasel_validator.sh} +0 -0
- /aiecs/scripts/{run_weasel_patch.sh → dependance_patch/fix_weasel/run_weasel_patch.sh} +0 -0
- {aiecs-1.0.1.dist-info → aiecs-1.7.17.dist-info}/WHEEL +0 -0
- {aiecs-1.0.1.dist-info → aiecs-1.7.17.dist-info}/licenses/LICENSE +0 -0
- {aiecs-1.0.1.dist-info → aiecs-1.7.17.dist-info}/top_level.txt +0 -0
aiecs/application/knowledge_graph/fusion/name_normalizer.py (new file)

@@ -0,0 +1,352 @@

"""
Name Normalizer

Normalizes entity names for comparison to handle common variations:
- Prefixes (Dr., Prof., Mr., Mrs., Ms.)
- Suffixes (Jr., Sr., PhD, MD, III)
- Initials (J. Smith → John Smith)
- Whitespace and punctuation
"""

import re
from typing import List, Set, Tuple, Optional
from dataclasses import dataclass, field


@dataclass
class NormalizationResult:
    """Result of name normalization"""
    normalized: str
    original: str
    stripped_prefixes: List[str] = field(default_factory=list)
    stripped_suffixes: List[str] = field(default_factory=list)
    has_initials: bool = False


class NameNormalizer:
    """
    Normalize entity names for improved matching

    Handles common name variations:
    - Title prefixes (Dr., Prof., Mr., Mrs., Ms.)
    - Name suffixes (Jr., Sr., PhD, MD, III, IV)
    - Initial patterns (J. Smith, A. Einstein)
    - Whitespace and punctuation normalization

    Example:
        ```python
        normalizer = NameNormalizer()

        # Strip prefixes and suffixes
        result = normalizer.normalize("Dr. John Smith, PhD")
        assert result.normalized == "john smith"
        assert "Dr." in result.stripped_prefixes
        assert "PhD" in result.stripped_suffixes

        # Match initials with full names
        assert normalizer.names_match_with_initials("J. Smith", "John Smith")
        ```
    """

    # Common prefixes to strip (case-insensitive)
    DEFAULT_PREFIXES: Set[str] = {
        "dr", "dr.", "doctor",
        "prof", "prof.", "professor",
        "mr", "mr.", "mister",
        "mrs", "mrs.",
        "ms", "ms.", "miss",
        "sir", "dame", "lord", "lady",
        "rev", "rev.", "reverend",
        "hon", "hon.", "honorable",
        "capt", "capt.", "captain",
        "col", "col.", "colonel",
        "gen", "gen.", "general",
        "lt", "lt.", "lieutenant",
        "sgt", "sgt.", "sergeant",
    }

    # Common suffixes to strip (case-insensitive)
    DEFAULT_SUFFIXES: Set[str] = {
        "jr", "jr.", "junior",
        "sr", "sr.", "senior",
        "phd", "ph.d", "ph.d.",
        "md", "m.d", "m.d.",
        "esq", "esq.", "esquire",
        "ii", "iii", "iv", "v",
        "2nd", "3rd", "4th", "5th",
        "cpa", "c.p.a", "c.p.a.",
        "mba", "m.b.a", "m.b.a.",
        "jd", "j.d", "j.d.",
        "llb", "ll.b", "ll.b.",
        "dds", "d.d.s", "d.d.s.",
        "rn", "r.n", "r.n.",
    }

    # Pattern for detecting initials (e.g., "J.", "A. B.")
    INITIAL_PATTERN = re.compile(r'^([A-Za-z])\.$')

    def __init__(
        self,
        custom_prefixes: Optional[Set[str]] = None,
        custom_suffixes: Optional[Set[str]] = None,
    ):
        """
        Initialize name normalizer

        Args:
            custom_prefixes: Additional prefixes to strip (merged with defaults)
            custom_suffixes: Additional suffixes to strip (merged with defaults)
        """
        self.prefixes = self.DEFAULT_PREFIXES.copy()
        self.suffixes = self.DEFAULT_SUFFIXES.copy()

        if custom_prefixes:
            self.prefixes.update(p.lower() for p in custom_prefixes)
        if custom_suffixes:
            self.suffixes.update(s.lower() for s in custom_suffixes)

    def normalize(self, name: str) -> NormalizationResult:
        """
        Normalize a name for comparison

        Steps:
        1. Normalize whitespace (multiple spaces, tabs → single space)
        2. Normalize punctuation (remove extra, standardize)
        3. Strip prefixes (Dr., Prof., etc.)
        4. Strip suffixes (Jr., PhD, etc.)
        5. Lowercase
        6. Detect initials

        Args:
            name: Original name string

        Returns:
            NormalizationResult with normalized name and metadata
        """
        if not name:
            return NormalizationResult(normalized="", original=name)

        original = name

        # Step 1: Normalize whitespace
        normalized = self._normalize_whitespace(name)

        # Step 2: Normalize punctuation
        normalized = self._normalize_punctuation(normalized)

        # Step 3: Strip prefixes
        normalized, stripped_prefixes = self._strip_prefixes(normalized)

        # Step 4: Strip suffixes
        normalized, stripped_suffixes = self._strip_suffixes(normalized)

        # Step 5: Lowercase
        normalized = normalized.lower().strip()

        # Step 6: Detect initials
        has_initials = self._has_initials(normalized)

        return NormalizationResult(
            normalized=normalized,
            original=original,
            stripped_prefixes=stripped_prefixes,
            stripped_suffixes=stripped_suffixes,
            has_initials=has_initials,
        )

    def _normalize_whitespace(self, name: str) -> str:
        """Normalize whitespace: multiple spaces/tabs → single space"""
        return re.sub(r'\s+', ' ', name).strip()

    def _normalize_punctuation(self, name: str) -> str:
        """
        Normalize punctuation for comparison

        - "Smith, John" → "Smith John"
        - "O'Brien" → "O'Brien" (preserve apostrophes in names)
        - "Smith-Jones" → "Smith-Jones" (preserve hyphens)
        """
        # Remove commas (handles "Smith, John" format)
        name = name.replace(',', ' ')
        # Normalize whitespace after comma removal
        name = re.sub(r'\s+', ' ', name).strip()
        return name

    def _strip_prefixes(self, name: str) -> Tuple[str, List[str]]:
        """
        Strip common prefixes from name

        Returns:
            Tuple of (name_without_prefixes, list_of_stripped_prefixes)
        """
        stripped = []
        tokens = name.split()

        while tokens:
            token_lower = tokens[0].lower().rstrip('.')
            # Check if first token is a prefix
            if token_lower in self.prefixes or f"{token_lower}." in self.prefixes:
                stripped.append(tokens.pop(0))
            else:
                break

        return ' '.join(tokens), stripped

    def _strip_suffixes(self, name: str) -> Tuple[str, List[str]]:
        """
        Strip common suffixes from name

        Returns:
            Tuple of (name_without_suffixes, list_of_stripped_suffixes)
        """
        stripped = []
        tokens = name.split()

        while tokens:
            # Check last token
            token_lower = tokens[-1].lower().rstrip('.').rstrip(',')
            if token_lower in self.suffixes or f"{token_lower}." in self.suffixes:
                stripped.insert(0, tokens.pop())  # Insert at beginning to preserve order
            else:
                break

        return ' '.join(tokens), stripped

    def _has_initials(self, name: str) -> bool:
        """Check if name contains initials (e.g., 'J.' or 'A. B.')"""
        tokens = name.split()
        for token in tokens:
            if self.INITIAL_PATTERN.match(token):
                return True
            # Also check for single letters without period
            if len(token) == 1 and token.isalpha():
                return True
        return False

    def names_match_with_initials(self, name1: str, name2: str) -> bool:
        """
        Check if two names match, allowing initials to match full names

        "J. Smith" matches "John Smith", "James Smith", etc.
        "A. Einstein" matches "Albert Einstein"

        Args:
            name1: First name
            name2: Second name

        Returns:
            True if names match (accounting for initials)
        """
        # Normalize both names
        result1 = self.normalize(name1)
        result2 = self.normalize(name2)

        # Exact match after normalization
        if result1.normalized == result2.normalized:
            return True

        # If neither has initials, no match
        if not result1.has_initials and not result2.has_initials:
            return False

        # Try matching with initial expansion
        tokens1 = result1.normalized.split()
        tokens2 = result2.normalized.split()

        # Must have same number of tokens (or be comparable)
        if abs(len(tokens1) - len(tokens2)) > 1:
            return False

        return self._tokens_match_with_initials(tokens1, tokens2)

    def _tokens_match_with_initials(self, tokens1: List[str], tokens2: List[str]) -> bool:
        """
        Check if token lists match, allowing initials

        "j" matches "john" (initial to full name)
        "j." matches "john"
        """
        # Handle different lengths - pad shorter list
        len1, len2 = len(tokens1), len(tokens2)
        if len1 != len2:
            # Try to match with potential middle name difference
            if abs(len1 - len2) == 1:
                # Try skipping middle name in longer list
                if len1 > len2:
                    # Try removing middle token from tokens1
                    for i in range(len(tokens1)):
                        test_tokens = tokens1[:i] + tokens1[i+1:]
                        if self._tokens_match_exact(test_tokens, tokens2):
                            return True
                else:
                    # Try removing middle token from tokens2
                    for i in range(len(tokens2)):
                        test_tokens = tokens2[:i] + tokens2[i+1:]
                        if self._tokens_match_exact(tokens1, test_tokens):
                            return True
            return False

        return self._tokens_match_exact(tokens1, tokens2)

    def _tokens_match_exact(self, tokens1: List[str], tokens2: List[str]) -> bool:
        """Check if token lists match exactly (with initial expansion)"""
        if len(tokens1) != len(tokens2):
            return False

        for t1, t2 in zip(tokens1, tokens2):
            if not self._token_matches(t1, t2):
                return False
        return True

    def _token_matches(self, token1: str, token2: str) -> bool:
        """
        Check if two tokens match (with initial expansion)

        Returns True if:
        - Tokens are equal
        - One is an initial that matches the first letter of the other
        """
        t1 = token1.rstrip('.')
        t2 = token2.rstrip('.')

        # Exact match
        if t1 == t2:
            return True

        # Initial match: single letter matches first letter of other token
        if len(t1) == 1 and t2.startswith(t1):
            return True
        if len(t2) == 1 and t1.startswith(t2):
            return True

        return False

    def get_initial_variants(self, name: str) -> List[str]:
        """
        Generate possible variants with initials for a name

        "John Smith" → ["John Smith", "J. Smith", "J Smith"]

        Args:
            name: Full name

        Returns:
            List of name variants (including original)
        """
        result = self.normalize(name)
        tokens = result.normalized.split()

        if not tokens:
            return [name]

        variants = [result.normalized]

        # Generate variants with first name as initial
        if len(tokens) >= 2 and len(tokens[0]) > 1:
            initial_variant = f"{tokens[0][0]}. {' '.join(tokens[1:])}"
            variants.append(initial_variant)
            # Also without period
            variants.append(f"{tokens[0][0]} {' '.join(tokens[1:])}")

        return variants
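Reviewer note: a minimal usage sketch of the new NameNormalizer follows. It is not part of the diff; the import path comes from the file listing above, and the "Sen." prefix is a hypothetical illustration of the custom_prefixes hook.

```python
# Usage sketch for NameNormalizer (module path assumed from the file listing above).
from aiecs.application.knowledge_graph.fusion.name_normalizer import NameNormalizer

# Extra prefixes are lowercased and merged with the defaults; "Sen." is not a default.
normalizer = NameNormalizer(custom_prefixes={"sen", "sen."})

result = normalizer.normalize("Sen. J. R. Smith, Jr.")
print(result.normalized)          # "j. r. smith" (lowercased, prefix and suffix stripped)
print(result.stripped_prefixes)   # ["Sen."]
print(result.stripped_suffixes)   # ["Jr."]
print(result.has_initials)        # True

# Initials match full names when the remaining tokens line up.
print(normalizer.names_match_with_initials("A. Einstein", "Albert Einstein"))  # True
print(normalizer.names_match_with_initials("A. Einstein", "Alfred Nobel"))     # False

# Variants are useful as alias keys when indexing entities.
print(normalizer.get_initial_variants("John Smith"))
# ["john smith", "j. smith", "j smith"]
```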
aiecs/application/knowledge_graph/fusion/relation_deduplicator.py (new file)

@@ -0,0 +1,183 @@

"""
Relation Deduplicator

Identifies and removes duplicate relations.
"""

from typing import List, Tuple, Dict
from aiecs.domain.knowledge_graph.models.relation import Relation


class RelationDeduplicator:
    """
    Deduplicate relations based on equivalence

    Two relations are considered duplicates if they have:
    - Same source entity
    - Same target entity
    - Same relation type
    - (Optionally) Similar properties

    This handles cases like:
    - Extracting "Alice WORKS_FOR Tech Corp" multiple times from different sentences
    - Multiple mentions of the same relationship with slight variations

    Example:
        ```python
        deduplicator = RelationDeduplicator()

        relations = [
            Relation(source="e1", target="e2", type="WORKS_FOR"),
            Relation(source="e1", target="e2", type="WORKS_FOR", properties={"since": "2020"}),
            Relation(source="e1", target="e3", type="KNOWS")
        ]

        deduplicated = await deduplicator.deduplicate(relations)
        # Returns: [
        #     Relation(source="e1", target="e2", type="WORKS_FOR", properties={"since": "2020"}),
        #     Relation(source="e1", target="e3", type="KNOWS")
        # ]
        ```
    """

    def __init__(self, merge_properties: bool = True):
        """
        Initialize relation deduplicator

        Args:
            merge_properties: If True, merge properties from duplicate relations
        """
        self.merge_properties = merge_properties

    async def deduplicate(self, relations: List[Relation]) -> List[Relation]:
        """
        Deduplicate a list of relations

        Args:
            relations: List of relations to deduplicate

        Returns:
            List of unique relations (with merged properties if enabled)
        """
        if not relations:
            return []

        # Group relations by (source, target, type) tuple
        relation_groups: Dict[Tuple[str, str, str], List[Relation]] = {}

        for relation in relations:
            key = (
                relation.source_id,
                relation.target_id,
                relation.relation_type,
            )

            if key not in relation_groups:
                relation_groups[key] = []
            relation_groups[key].append(relation)

        # For each group, merge duplicates
        deduplicated = []
        for key, group in relation_groups.items():
            if len(group) == 1:
                deduplicated.append(group[0])
            else:
                merged = self._merge_relations(group)
                deduplicated.append(merged)

        return deduplicated

    def _merge_relations(self, relations: List[Relation]) -> Relation:
        """
        Merge a group of duplicate relations into one

        Strategy:
        - Use first relation as base
        - Merge properties (prefer non-empty values)
        - Keep highest weight
        - Keep highest confidence

        Args:
            relations: List of duplicate relations

        Returns:
            Merged relation
        """
        if len(relations) == 1:
            return relations[0]

        # Use first relation as base
        base = relations[0]

        # Merge properties
        merged_properties = dict(base.properties) if base.properties else {}

        if self.merge_properties:
            for relation in relations[1:]:
                if relation.properties:
                    for key, value in relation.properties.items():
                        # Add property if not exists or current value is empty
                        if key not in merged_properties or not merged_properties[key]:
                            merged_properties[key] = value

        # Take highest weight
        max_weight = max(r.weight for r in relations)

        # Take highest confidence (if present in properties)
        confidences = [r.properties.get("_extraction_confidence", 0.5) for r in relations if r.properties]
        if confidences:
            merged_properties["_extraction_confidence"] = max(confidences)

        # Track merge count
        merged_properties["_merged_count"] = len(relations)

        # Create merged relation
        merged = Relation(
            id=base.id,
            relation_type=base.relation_type,
            source_id=base.source_id,
            target_id=base.target_id,
            properties=merged_properties,
            weight=max_weight,
            source=base.source,
        )

        return merged

    def find_duplicates(self, relations: List[Relation]) -> List[Tuple[Relation, Relation]]:
        """
        Find pairs of duplicate relations without merging

        Useful for debugging or manual review.

        Args:
            relations: List of relations to check

        Returns:
            List of (relation1, relation2) tuples that are duplicates
        """
        duplicates = []
        n = len(relations)

        for i in range(n):
            for j in range(i + 1, n):
                r1 = relations[i]
                r2 = relations[j]

                if self._are_duplicates(r1, r2):
                    duplicates.append((r1, r2))

        return duplicates

    def _are_duplicates(self, r1: Relation, r2: Relation) -> bool:
        """
        Check if two relations are duplicates

        Args:
            r1: First relation
            r2: Second relation

        Returns:
            True if relations are duplicates
        """
        return r1.source_id == r2.source_id and r1.target_id == r2.target_id and r1.relation_type == r2.relation_type
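Reviewer note: a similar sketch for the new RelationDeduplicator. It is not part of the diff; the Relation keyword arguments below mirror the fields referenced in _merge_relations above, but the actual model in aiecs.domain.knowledge_graph.models.relation may have different defaults or additional required fields.

```python
# Usage sketch for RelationDeduplicator (Relation fields assumed from _merge_relations above).
import asyncio

from aiecs.application.knowledge_graph.fusion.relation_deduplicator import RelationDeduplicator
from aiecs.domain.knowledge_graph.models.relation import Relation


async def main() -> None:
    relations = [
        Relation(id="r1", relation_type="WORKS_FOR", source_id="e1", target_id="e2",
                 properties={}, weight=0.8, source="sentence-1"),
        Relation(id="r2", relation_type="WORKS_FOR", source_id="e1", target_id="e2",
                 properties={"since": "2020", "_extraction_confidence": 0.9},
                 weight=0.6, source="sentence-7"),
        Relation(id="r3", relation_type="KNOWS", source_id="e1", target_id="e3",
                 properties={}, weight=0.5, source="sentence-2"),
    ]

    deduplicator = RelationDeduplicator(merge_properties=True)
    unique = await deduplicator.deduplicate(relations)

    # Two relations survive: the WORKS_FOR pair is merged (keeping the "since" property,
    # the higher weight 0.8, and "_merged_count": 2), while KNOWS passes through unchanged.
    for rel in unique:
        print(rel.relation_type, rel.source_id, "->", rel.target_id, rel.properties)


asyncio.run(main())
```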