aiecs 1.0.1__py3-none-any.whl → 1.7.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of aiecs might be problematic; consult the package registry's advisory for details.
- aiecs/__init__.py +13 -16
- aiecs/__main__.py +7 -7
- aiecs/aiecs_client.py +269 -75
- aiecs/application/executors/operation_executor.py +79 -54
- aiecs/application/knowledge_graph/__init__.py +7 -0
- aiecs/application/knowledge_graph/builder/__init__.py +37 -0
- aiecs/application/knowledge_graph/builder/data_quality.py +302 -0
- aiecs/application/knowledge_graph/builder/data_reshaping.py +293 -0
- aiecs/application/knowledge_graph/builder/document_builder.py +369 -0
- aiecs/application/knowledge_graph/builder/graph_builder.py +490 -0
- aiecs/application/knowledge_graph/builder/import_optimizer.py +396 -0
- aiecs/application/knowledge_graph/builder/schema_inference.py +462 -0
- aiecs/application/knowledge_graph/builder/schema_mapping.py +563 -0
- aiecs/application/knowledge_graph/builder/structured_pipeline.py +1384 -0
- aiecs/application/knowledge_graph/builder/text_chunker.py +317 -0
- aiecs/application/knowledge_graph/extractors/__init__.py +27 -0
- aiecs/application/knowledge_graph/extractors/base.py +98 -0
- aiecs/application/knowledge_graph/extractors/llm_entity_extractor.py +422 -0
- aiecs/application/knowledge_graph/extractors/llm_relation_extractor.py +347 -0
- aiecs/application/knowledge_graph/extractors/ner_entity_extractor.py +241 -0
- aiecs/application/knowledge_graph/fusion/__init__.py +78 -0
- aiecs/application/knowledge_graph/fusion/ab_testing.py +395 -0
- aiecs/application/knowledge_graph/fusion/abbreviation_expander.py +327 -0
- aiecs/application/knowledge_graph/fusion/alias_index.py +597 -0
- aiecs/application/knowledge_graph/fusion/alias_matcher.py +384 -0
- aiecs/application/knowledge_graph/fusion/cache_coordinator.py +343 -0
- aiecs/application/knowledge_graph/fusion/entity_deduplicator.py +433 -0
- aiecs/application/knowledge_graph/fusion/entity_linker.py +511 -0
- aiecs/application/knowledge_graph/fusion/evaluation_dataset.py +240 -0
- aiecs/application/knowledge_graph/fusion/knowledge_fusion.py +632 -0
- aiecs/application/knowledge_graph/fusion/matching_config.py +489 -0
- aiecs/application/knowledge_graph/fusion/name_normalizer.py +352 -0
- aiecs/application/knowledge_graph/fusion/relation_deduplicator.py +183 -0
- aiecs/application/knowledge_graph/fusion/semantic_name_matcher.py +464 -0
- aiecs/application/knowledge_graph/fusion/similarity_pipeline.py +534 -0
- aiecs/application/knowledge_graph/pattern_matching/__init__.py +21 -0
- aiecs/application/knowledge_graph/pattern_matching/pattern_matcher.py +342 -0
- aiecs/application/knowledge_graph/pattern_matching/query_executor.py +366 -0
- aiecs/application/knowledge_graph/profiling/__init__.py +12 -0
- aiecs/application/knowledge_graph/profiling/query_plan_visualizer.py +195 -0
- aiecs/application/knowledge_graph/profiling/query_profiler.py +223 -0
- aiecs/application/knowledge_graph/reasoning/__init__.py +27 -0
- aiecs/application/knowledge_graph/reasoning/evidence_synthesis.py +341 -0
- aiecs/application/knowledge_graph/reasoning/inference_engine.py +500 -0
- aiecs/application/knowledge_graph/reasoning/logic_form_parser.py +163 -0
- aiecs/application/knowledge_graph/reasoning/logic_parser/__init__.py +79 -0
- aiecs/application/knowledge_graph/reasoning/logic_parser/ast_builder.py +513 -0
- aiecs/application/knowledge_graph/reasoning/logic_parser/ast_nodes.py +913 -0
- aiecs/application/knowledge_graph/reasoning/logic_parser/ast_validator.py +866 -0
- aiecs/application/knowledge_graph/reasoning/logic_parser/error_handler.py +475 -0
- aiecs/application/knowledge_graph/reasoning/logic_parser/parser.py +396 -0
- aiecs/application/knowledge_graph/reasoning/logic_parser/query_context.py +208 -0
- aiecs/application/knowledge_graph/reasoning/logic_query_integration.py +170 -0
- aiecs/application/knowledge_graph/reasoning/query_planner.py +855 -0
- aiecs/application/knowledge_graph/reasoning/reasoning_engine.py +518 -0
- aiecs/application/knowledge_graph/retrieval/__init__.py +27 -0
- aiecs/application/knowledge_graph/retrieval/query_intent_classifier.py +211 -0
- aiecs/application/knowledge_graph/retrieval/retrieval_strategies.py +592 -0
- aiecs/application/knowledge_graph/retrieval/strategy_types.py +23 -0
- aiecs/application/knowledge_graph/search/__init__.py +59 -0
- aiecs/application/knowledge_graph/search/hybrid_search.py +457 -0
- aiecs/application/knowledge_graph/search/reranker.py +293 -0
- aiecs/application/knowledge_graph/search/reranker_strategies.py +535 -0
- aiecs/application/knowledge_graph/search/text_similarity.py +392 -0
- aiecs/application/knowledge_graph/traversal/__init__.py +15 -0
- aiecs/application/knowledge_graph/traversal/enhanced_traversal.py +305 -0
- aiecs/application/knowledge_graph/traversal/path_scorer.py +271 -0
- aiecs/application/knowledge_graph/validators/__init__.py +13 -0
- aiecs/application/knowledge_graph/validators/relation_validator.py +239 -0
- aiecs/application/knowledge_graph/visualization/__init__.py +11 -0
- aiecs/application/knowledge_graph/visualization/graph_visualizer.py +313 -0
- aiecs/common/__init__.py +9 -0
- aiecs/common/knowledge_graph/__init__.py +17 -0
- aiecs/common/knowledge_graph/runnable.py +471 -0
- aiecs/config/__init__.py +20 -5
- aiecs/config/config.py +762 -31
- aiecs/config/graph_config.py +131 -0
- aiecs/config/tool_config.py +399 -0
- aiecs/core/__init__.py +29 -13
- aiecs/core/interface/__init__.py +2 -2
- aiecs/core/interface/execution_interface.py +22 -22
- aiecs/core/interface/storage_interface.py +37 -88
- aiecs/core/registry/__init__.py +31 -0
- aiecs/core/registry/service_registry.py +92 -0
- aiecs/domain/__init__.py +270 -1
- aiecs/domain/agent/__init__.py +191 -0
- aiecs/domain/agent/base_agent.py +3870 -0
- aiecs/domain/agent/exceptions.py +99 -0
- aiecs/domain/agent/graph_aware_mixin.py +569 -0
- aiecs/domain/agent/hybrid_agent.py +1435 -0
- aiecs/domain/agent/integration/__init__.py +29 -0
- aiecs/domain/agent/integration/context_compressor.py +216 -0
- aiecs/domain/agent/integration/context_engine_adapter.py +587 -0
- aiecs/domain/agent/integration/protocols.py +281 -0
- aiecs/domain/agent/integration/retry_policy.py +218 -0
- aiecs/domain/agent/integration/role_config.py +213 -0
- aiecs/domain/agent/knowledge_aware_agent.py +1892 -0
- aiecs/domain/agent/lifecycle.py +291 -0
- aiecs/domain/agent/llm_agent.py +692 -0
- aiecs/domain/agent/memory/__init__.py +12 -0
- aiecs/domain/agent/memory/conversation.py +1124 -0
- aiecs/domain/agent/migration/__init__.py +14 -0
- aiecs/domain/agent/migration/conversion.py +163 -0
- aiecs/domain/agent/migration/legacy_wrapper.py +86 -0
- aiecs/domain/agent/models.py +884 -0
- aiecs/domain/agent/observability.py +479 -0
- aiecs/domain/agent/persistence.py +449 -0
- aiecs/domain/agent/prompts/__init__.py +29 -0
- aiecs/domain/agent/prompts/builder.py +159 -0
- aiecs/domain/agent/prompts/formatters.py +187 -0
- aiecs/domain/agent/prompts/template.py +255 -0
- aiecs/domain/agent/registry.py +253 -0
- aiecs/domain/agent/tool_agent.py +444 -0
- aiecs/domain/agent/tools/__init__.py +15 -0
- aiecs/domain/agent/tools/schema_generator.py +364 -0
- aiecs/domain/community/__init__.py +155 -0
- aiecs/domain/community/agent_adapter.py +469 -0
- aiecs/domain/community/analytics.py +432 -0
- aiecs/domain/community/collaborative_workflow.py +648 -0
- aiecs/domain/community/communication_hub.py +634 -0
- aiecs/domain/community/community_builder.py +320 -0
- aiecs/domain/community/community_integration.py +796 -0
- aiecs/domain/community/community_manager.py +803 -0
- aiecs/domain/community/decision_engine.py +849 -0
- aiecs/domain/community/exceptions.py +231 -0
- aiecs/domain/community/models/__init__.py +33 -0
- aiecs/domain/community/models/community_models.py +234 -0
- aiecs/domain/community/resource_manager.py +461 -0
- aiecs/domain/community/shared_context_manager.py +589 -0
- aiecs/domain/context/__init__.py +40 -10
- aiecs/domain/context/context_engine.py +1910 -0
- aiecs/domain/context/conversation_models.py +87 -53
- aiecs/domain/context/graph_memory.py +582 -0
- aiecs/domain/execution/model.py +12 -4
- aiecs/domain/knowledge_graph/__init__.py +19 -0
- aiecs/domain/knowledge_graph/models/__init__.py +52 -0
- aiecs/domain/knowledge_graph/models/entity.py +148 -0
- aiecs/domain/knowledge_graph/models/evidence.py +178 -0
- aiecs/domain/knowledge_graph/models/inference_rule.py +184 -0
- aiecs/domain/knowledge_graph/models/path.py +171 -0
- aiecs/domain/knowledge_graph/models/path_pattern.py +171 -0
- aiecs/domain/knowledge_graph/models/query.py +261 -0
- aiecs/domain/knowledge_graph/models/query_plan.py +181 -0
- aiecs/domain/knowledge_graph/models/relation.py +202 -0
- aiecs/domain/knowledge_graph/schema/__init__.py +23 -0
- aiecs/domain/knowledge_graph/schema/entity_type.py +131 -0
- aiecs/domain/knowledge_graph/schema/graph_schema.py +253 -0
- aiecs/domain/knowledge_graph/schema/property_schema.py +143 -0
- aiecs/domain/knowledge_graph/schema/relation_type.py +163 -0
- aiecs/domain/knowledge_graph/schema/schema_manager.py +691 -0
- aiecs/domain/knowledge_graph/schema/type_enums.py +209 -0
- aiecs/domain/task/dsl_processor.py +172 -56
- aiecs/domain/task/model.py +20 -8
- aiecs/domain/task/task_context.py +27 -24
- aiecs/infrastructure/__init__.py +0 -2
- aiecs/infrastructure/graph_storage/__init__.py +11 -0
- aiecs/infrastructure/graph_storage/base.py +837 -0
- aiecs/infrastructure/graph_storage/batch_operations.py +458 -0
- aiecs/infrastructure/graph_storage/cache.py +424 -0
- aiecs/infrastructure/graph_storage/distributed.py +223 -0
- aiecs/infrastructure/graph_storage/error_handling.py +380 -0
- aiecs/infrastructure/graph_storage/graceful_degradation.py +294 -0
- aiecs/infrastructure/graph_storage/health_checks.py +378 -0
- aiecs/infrastructure/graph_storage/in_memory.py +1197 -0
- aiecs/infrastructure/graph_storage/index_optimization.py +446 -0
- aiecs/infrastructure/graph_storage/lazy_loading.py +431 -0
- aiecs/infrastructure/graph_storage/metrics.py +344 -0
- aiecs/infrastructure/graph_storage/migration.py +400 -0
- aiecs/infrastructure/graph_storage/pagination.py +483 -0
- aiecs/infrastructure/graph_storage/performance_monitoring.py +456 -0
- aiecs/infrastructure/graph_storage/postgres.py +1563 -0
- aiecs/infrastructure/graph_storage/property_storage.py +353 -0
- aiecs/infrastructure/graph_storage/protocols.py +76 -0
- aiecs/infrastructure/graph_storage/query_optimizer.py +642 -0
- aiecs/infrastructure/graph_storage/schema_cache.py +290 -0
- aiecs/infrastructure/graph_storage/sqlite.py +1373 -0
- aiecs/infrastructure/graph_storage/streaming.py +487 -0
- aiecs/infrastructure/graph_storage/tenant.py +412 -0
- aiecs/infrastructure/messaging/celery_task_manager.py +92 -54
- aiecs/infrastructure/messaging/websocket_manager.py +51 -35
- aiecs/infrastructure/monitoring/__init__.py +22 -0
- aiecs/infrastructure/monitoring/executor_metrics.py +45 -11
- aiecs/infrastructure/monitoring/global_metrics_manager.py +212 -0
- aiecs/infrastructure/monitoring/structured_logger.py +3 -7
- aiecs/infrastructure/monitoring/tracing_manager.py +63 -35
- aiecs/infrastructure/persistence/__init__.py +14 -1
- aiecs/infrastructure/persistence/context_engine_client.py +184 -0
- aiecs/infrastructure/persistence/database_manager.py +67 -43
- aiecs/infrastructure/persistence/file_storage.py +180 -103
- aiecs/infrastructure/persistence/redis_client.py +74 -21
- aiecs/llm/__init__.py +73 -25
- aiecs/llm/callbacks/__init__.py +11 -0
- aiecs/llm/{custom_callbacks.py → callbacks/custom_callbacks.py} +26 -19
- aiecs/llm/client_factory.py +224 -36
- aiecs/llm/client_resolver.py +155 -0
- aiecs/llm/clients/__init__.py +38 -0
- aiecs/llm/clients/base_client.py +324 -0
- aiecs/llm/clients/google_function_calling_mixin.py +457 -0
- aiecs/llm/clients/googleai_client.py +241 -0
- aiecs/llm/clients/openai_client.py +158 -0
- aiecs/llm/clients/openai_compatible_mixin.py +367 -0
- aiecs/llm/clients/vertex_client.py +897 -0
- aiecs/llm/clients/xai_client.py +201 -0
- aiecs/llm/config/__init__.py +51 -0
- aiecs/llm/config/config_loader.py +272 -0
- aiecs/llm/config/config_validator.py +206 -0
- aiecs/llm/config/model_config.py +143 -0
- aiecs/llm/protocols.py +149 -0
- aiecs/llm/utils/__init__.py +10 -0
- aiecs/llm/utils/validate_config.py +89 -0
- aiecs/main.py +140 -121
- aiecs/scripts/aid/VERSION_MANAGEMENT.md +138 -0
- aiecs/scripts/aid/__init__.py +19 -0
- aiecs/scripts/aid/module_checker.py +499 -0
- aiecs/scripts/aid/version_manager.py +235 -0
- aiecs/scripts/{DEPENDENCY_SYSTEM_SUMMARY.md → dependance_check/DEPENDENCY_SYSTEM_SUMMARY.md} +1 -0
- aiecs/scripts/{README_DEPENDENCY_CHECKER.md → dependance_check/README_DEPENDENCY_CHECKER.md} +1 -0
- aiecs/scripts/dependance_check/__init__.py +15 -0
- aiecs/scripts/dependance_check/dependency_checker.py +1835 -0
- aiecs/scripts/{dependency_fixer.py → dependance_check/dependency_fixer.py} +192 -90
- aiecs/scripts/{download_nlp_data.py → dependance_check/download_nlp_data.py} +203 -71
- aiecs/scripts/dependance_patch/__init__.py +7 -0
- aiecs/scripts/dependance_patch/fix_weasel/__init__.py +11 -0
- aiecs/scripts/{fix_weasel_validator.py → dependance_patch/fix_weasel/fix_weasel_validator.py} +21 -14
- aiecs/scripts/{patch_weasel_library.sh → dependance_patch/fix_weasel/patch_weasel_library.sh} +1 -1
- aiecs/scripts/knowledge_graph/__init__.py +3 -0
- aiecs/scripts/knowledge_graph/run_threshold_experiments.py +212 -0
- aiecs/scripts/migrations/multi_tenancy/README.md +142 -0
- aiecs/scripts/tools_develop/README.md +671 -0
- aiecs/scripts/tools_develop/README_CONFIG_CHECKER.md +273 -0
- aiecs/scripts/tools_develop/TOOLS_CONFIG_GUIDE.md +1287 -0
- aiecs/scripts/tools_develop/TOOL_AUTO_DISCOVERY.md +234 -0
- aiecs/scripts/tools_develop/__init__.py +21 -0
- aiecs/scripts/tools_develop/check_all_tools_config.py +548 -0
- aiecs/scripts/tools_develop/check_type_annotations.py +257 -0
- aiecs/scripts/tools_develop/pre-commit-schema-coverage.sh +66 -0
- aiecs/scripts/tools_develop/schema_coverage.py +511 -0
- aiecs/scripts/tools_develop/validate_tool_schemas.py +475 -0
- aiecs/scripts/tools_develop/verify_executor_config_fix.py +98 -0
- aiecs/scripts/tools_develop/verify_tools.py +352 -0
- aiecs/tasks/__init__.py +0 -1
- aiecs/tasks/worker.py +115 -47
- aiecs/tools/__init__.py +194 -72
- aiecs/tools/apisource/__init__.py +99 -0
- aiecs/tools/apisource/intelligence/__init__.py +19 -0
- aiecs/tools/apisource/intelligence/data_fusion.py +632 -0
- aiecs/tools/apisource/intelligence/query_analyzer.py +417 -0
- aiecs/tools/apisource/intelligence/search_enhancer.py +385 -0
- aiecs/tools/apisource/monitoring/__init__.py +9 -0
- aiecs/tools/apisource/monitoring/metrics.py +330 -0
- aiecs/tools/apisource/providers/__init__.py +112 -0
- aiecs/tools/apisource/providers/base.py +671 -0
- aiecs/tools/apisource/providers/census.py +397 -0
- aiecs/tools/apisource/providers/fred.py +535 -0
- aiecs/tools/apisource/providers/newsapi.py +409 -0
- aiecs/tools/apisource/providers/worldbank.py +352 -0
- aiecs/tools/apisource/reliability/__init__.py +12 -0
- aiecs/tools/apisource/reliability/error_handler.py +363 -0
- aiecs/tools/apisource/reliability/fallback_strategy.py +376 -0
- aiecs/tools/apisource/tool.py +832 -0
- aiecs/tools/apisource/utils/__init__.py +9 -0
- aiecs/tools/apisource/utils/validators.py +334 -0
- aiecs/tools/base_tool.py +415 -21
- aiecs/tools/docs/__init__.py +121 -0
- aiecs/tools/docs/ai_document_orchestrator.py +607 -0
- aiecs/tools/docs/ai_document_writer_orchestrator.py +2350 -0
- aiecs/tools/docs/content_insertion_tool.py +1320 -0
- aiecs/tools/docs/document_creator_tool.py +1323 -0
- aiecs/tools/docs/document_layout_tool.py +1160 -0
- aiecs/tools/docs/document_parser_tool.py +1011 -0
- aiecs/tools/docs/document_writer_tool.py +1829 -0
- aiecs/tools/knowledge_graph/__init__.py +17 -0
- aiecs/tools/knowledge_graph/graph_reasoning_tool.py +807 -0
- aiecs/tools/knowledge_graph/graph_search_tool.py +944 -0
- aiecs/tools/knowledge_graph/kg_builder_tool.py +524 -0
- aiecs/tools/langchain_adapter.py +300 -138
- aiecs/tools/schema_generator.py +455 -0
- aiecs/tools/search_tool/__init__.py +100 -0
- aiecs/tools/search_tool/analyzers.py +581 -0
- aiecs/tools/search_tool/cache.py +264 -0
- aiecs/tools/search_tool/constants.py +128 -0
- aiecs/tools/search_tool/context.py +224 -0
- aiecs/tools/search_tool/core.py +778 -0
- aiecs/tools/search_tool/deduplicator.py +119 -0
- aiecs/tools/search_tool/error_handler.py +242 -0
- aiecs/tools/search_tool/metrics.py +343 -0
- aiecs/tools/search_tool/rate_limiter.py +172 -0
- aiecs/tools/search_tool/schemas.py +275 -0
- aiecs/tools/statistics/__init__.py +80 -0
- aiecs/tools/statistics/ai_data_analysis_orchestrator.py +646 -0
- aiecs/tools/statistics/ai_insight_generator_tool.py +508 -0
- aiecs/tools/statistics/ai_report_orchestrator_tool.py +684 -0
- aiecs/tools/statistics/data_loader_tool.py +555 -0
- aiecs/tools/statistics/data_profiler_tool.py +638 -0
- aiecs/tools/statistics/data_transformer_tool.py +580 -0
- aiecs/tools/statistics/data_visualizer_tool.py +498 -0
- aiecs/tools/statistics/model_trainer_tool.py +507 -0
- aiecs/tools/statistics/statistical_analyzer_tool.py +472 -0
- aiecs/tools/task_tools/__init__.py +49 -36
- aiecs/tools/task_tools/chart_tool.py +200 -184
- aiecs/tools/task_tools/classfire_tool.py +268 -267
- aiecs/tools/task_tools/image_tool.py +175 -131
- aiecs/tools/task_tools/office_tool.py +226 -146
- aiecs/tools/task_tools/pandas_tool.py +477 -121
- aiecs/tools/task_tools/report_tool.py +390 -142
- aiecs/tools/task_tools/research_tool.py +149 -79
- aiecs/tools/task_tools/scraper_tool.py +339 -145
- aiecs/tools/task_tools/stats_tool.py +448 -209
- aiecs/tools/temp_file_manager.py +26 -24
- aiecs/tools/tool_executor/__init__.py +18 -16
- aiecs/tools/tool_executor/tool_executor.py +364 -52
- aiecs/utils/LLM_output_structor.py +74 -48
- aiecs/utils/__init__.py +14 -3
- aiecs/utils/base_callback.py +0 -3
- aiecs/utils/cache_provider.py +696 -0
- aiecs/utils/execution_utils.py +50 -31
- aiecs/utils/prompt_loader.py +1 -0
- aiecs/utils/token_usage_repository.py +37 -11
- aiecs/ws/socket_server.py +14 -4
- {aiecs-1.0.1.dist-info → aiecs-1.7.6.dist-info}/METADATA +52 -15
- aiecs-1.7.6.dist-info/RECORD +337 -0
- aiecs-1.7.6.dist-info/entry_points.txt +13 -0
- aiecs/config/registry.py +0 -19
- aiecs/domain/context/content_engine.py +0 -982
- aiecs/llm/base_client.py +0 -99
- aiecs/llm/openai_client.py +0 -125
- aiecs/llm/vertex_client.py +0 -186
- aiecs/llm/xai_client.py +0 -184
- aiecs/scripts/dependency_checker.py +0 -857
- aiecs/scripts/quick_dependency_check.py +0 -269
- aiecs/tools/task_tools/search_api.py +0 -7
- aiecs-1.0.1.dist-info/RECORD +0 -90
- aiecs-1.0.1.dist-info/entry_points.txt +0 -7
- /aiecs/scripts/{setup_nlp_data.sh → dependance_check/setup_nlp_data.sh} +0 -0
- /aiecs/scripts/{README_WEASEL_PATCH.md → dependance_patch/fix_weasel/README_WEASEL_PATCH.md} +0 -0
- /aiecs/scripts/{fix_weasel_validator.sh → dependance_patch/fix_weasel/fix_weasel_validator.sh} +0 -0
- /aiecs/scripts/{run_weasel_patch.sh → dependance_patch/fix_weasel/run_weasel_patch.sh} +0 -0
- {aiecs-1.0.1.dist-info → aiecs-1.7.6.dist-info}/WHEEL +0 -0
- {aiecs-1.0.1.dist-info → aiecs-1.7.6.dist-info}/licenses/LICENSE +0 -0
- {aiecs-1.0.1.dist-info → aiecs-1.7.6.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,632 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Data Fusion Engine for Cross-Provider Results
|
|
3
|
+
|
|
4
|
+
Intelligently merges results from multiple API providers:
|
|
5
|
+
- Detect and handle duplicate data
|
|
6
|
+
- Resolve conflicts based on quality scores
|
|
7
|
+
- Support multiple fusion strategies
|
|
8
|
+
- Preserve provenance information
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
import logging
|
|
12
|
+
from typing import Any, Dict, List, Optional, Tuple, cast
|
|
13
|
+
|
|
14
|
+
logger = logging.getLogger(__name__)
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class DataFusionEngine:
|
|
18
|
+
"""
|
|
19
|
+
Fuses data from multiple providers intelligently.
|
|
20
|
+
|
|
21
|
+
Handles duplicate detection, conflict resolution, and data quality
|
|
22
|
+
optimization when combining results from different sources.
|
|
23
|
+
"""
|
|
24
|
+
|
|
25
|
+
# Fusion strategies
|
|
26
|
+
STRATEGY_BEST_QUALITY = "best_quality"
|
|
27
|
+
STRATEGY_MERGE_ALL = "merge_all"
|
|
28
|
+
STRATEGY_CONSENSUS = "consensus"
|
|
29
|
+
STRATEGY_FIRST_SUCCESS = "first_success"
|
|
30
|
+
|
|
31
|
+
def __init__(self) -> None:
    """Initialize the data fusion engine.

    The engine is stateless as far as this constructor is concerned:
    no per-instance configuration or caches are set up here.
    """
|
|
33
|
+
|
|
34
|
+
def fuse_multi_provider_results(
    self,
    results: List[Dict[str, Any]],
    fusion_strategy: str = STRATEGY_BEST_QUALITY,
) -> Optional[Dict[str, Any]]:
    """
    Fuse results from multiple providers.

    Args:
        results: List of results from different providers
        fusion_strategy: Strategy to use for fusion:
            - 'best_quality': Select result with highest quality score
            - 'merge_all': Merge all results, preserving sources
            - 'consensus': Use data points agreed upon by multiple sources
            - 'first_success': Use first successful result

    Returns:
        Fused result dictionary or None if no valid results
    """
    if not results:
        return None

    # A result is usable only when it actually carries a payload.
    valid_results = [entry for entry in results if entry.get("data") is not None]
    if not valid_results:
        return None

    # Dispatch on strategy via guard clauses; best_quality doubles as
    # the fallback for unrecognized strategy names.
    if fusion_strategy == self.STRATEGY_FIRST_SUCCESS:
        return valid_results[0]
    if fusion_strategy == self.STRATEGY_MERGE_ALL:
        return self._fuse_merge_all(valid_results)
    if fusion_strategy == self.STRATEGY_CONSENSUS:
        return self._fuse_consensus(valid_results)
    if fusion_strategy != self.STRATEGY_BEST_QUALITY:
        logger.warning(f"Unknown fusion strategy: {fusion_strategy}, using best_quality")
    return self._fuse_best_quality(valid_results)
|
|
77
|
+
|
|
78
|
+
def _fuse_best_quality(self, results: List[Dict[str, Any]]) -> Dict[str, Any]:
|
|
79
|
+
"""
|
|
80
|
+
Select result with highest quality score.
|
|
81
|
+
|
|
82
|
+
Args:
|
|
83
|
+
results: List of valid results
|
|
84
|
+
|
|
85
|
+
Returns:
|
|
86
|
+
Result with best quality
|
|
87
|
+
"""
|
|
88
|
+
|
|
89
|
+
def get_quality_score(result: Dict[str, Any]) -> float:
|
|
90
|
+
"""Extract quality score from result"""
|
|
91
|
+
metadata = result.get("metadata", {})
|
|
92
|
+
quality = metadata.get("quality", {})
|
|
93
|
+
return quality.get("score", 0.5)
|
|
94
|
+
|
|
95
|
+
best_result = max(results, key=get_quality_score)
|
|
96
|
+
|
|
97
|
+
# Add fusion metadata
|
|
98
|
+
best_result["metadata"]["fusion_info"] = {
|
|
99
|
+
"strategy": self.STRATEGY_BEST_QUALITY,
|
|
100
|
+
"total_providers_queried": len(results),
|
|
101
|
+
"selected_provider": best_result.get("provider"),
|
|
102
|
+
"quality_score": get_quality_score(best_result),
|
|
103
|
+
"alternative_providers": [r.get("provider") for r in results if r.get("provider") != best_result.get("provider")],
|
|
104
|
+
}
|
|
105
|
+
|
|
106
|
+
return best_result
|
|
107
|
+
|
|
108
|
+
def _fuse_merge_all(self, results: List[Dict[str, Any]]) -> Dict[str, Any]:
|
|
109
|
+
"""
|
|
110
|
+
Merge all results, preserving source information.
|
|
111
|
+
|
|
112
|
+
Args:
|
|
113
|
+
results: List of valid results
|
|
114
|
+
|
|
115
|
+
Returns:
|
|
116
|
+
Merged result with all data
|
|
117
|
+
"""
|
|
118
|
+
merged: Dict[str, Any] = {
|
|
119
|
+
"operation": "multi_provider_search",
|
|
120
|
+
"data": [],
|
|
121
|
+
"metadata": {
|
|
122
|
+
"fusion_info": {
|
|
123
|
+
"strategy": self.STRATEGY_MERGE_ALL,
|
|
124
|
+
"total_providers": len(results),
|
|
125
|
+
"sources": [],
|
|
126
|
+
}
|
|
127
|
+
},
|
|
128
|
+
}
|
|
129
|
+
|
|
130
|
+
# Collect all data with source tags
|
|
131
|
+
for result in results:
|
|
132
|
+
provider = result.get("provider", "unknown")
|
|
133
|
+
data = result.get("data", [])
|
|
134
|
+
metadata = result.get("metadata", {})
|
|
135
|
+
|
|
136
|
+
# Handle different data structures
|
|
137
|
+
if isinstance(data, list):
|
|
138
|
+
for item in data:
|
|
139
|
+
if isinstance(item, dict):
|
|
140
|
+
# Add source information to each item
|
|
141
|
+
enriched_item = item.copy()
|
|
142
|
+
enriched_item["_source_provider"] = provider
|
|
143
|
+
enriched_item["_source_quality"] = metadata.get("quality", {})
|
|
144
|
+
enriched_item["_source_timestamp"] = metadata.get("timestamp")
|
|
145
|
+
merged["data"].append(enriched_item)
|
|
146
|
+
else:
|
|
147
|
+
# Handle non-dict items
|
|
148
|
+
merged["data"].append(
|
|
149
|
+
{
|
|
150
|
+
"value": item,
|
|
151
|
+
"_source_provider": provider,
|
|
152
|
+
"_source_quality": metadata.get("quality", {}),
|
|
153
|
+
}
|
|
154
|
+
)
|
|
155
|
+
elif isinstance(data, dict):
|
|
156
|
+
# Single dict result
|
|
157
|
+
enriched_data = data.copy()
|
|
158
|
+
enriched_data["_source_provider"] = provider
|
|
159
|
+
enriched_data["_source_quality"] = metadata.get("quality", {})
|
|
160
|
+
merged["data"].append(enriched_data)
|
|
161
|
+
|
|
162
|
+
# Record source info
|
|
163
|
+
fusion_info = cast(Dict[str, Any], merged["metadata"]["fusion_info"])
|
|
164
|
+
sources = cast(List[Dict[str, Any]], fusion_info["sources"])
|
|
165
|
+
sources.append(
|
|
166
|
+
{
|
|
167
|
+
"provider": provider,
|
|
168
|
+
"operation": result.get("operation"),
|
|
169
|
+
"record_count": len(data) if isinstance(data, list) else 1,
|
|
170
|
+
"quality": metadata.get("quality", {}),
|
|
171
|
+
}
|
|
172
|
+
)
|
|
173
|
+
|
|
174
|
+
return merged
|
|
175
|
+
|
|
176
|
+
def _fuse_consensus(self, results: List[Dict[str, Any]]) -> Dict[str, Any]:
    """
    Use consensus-based fusion (data agreed upon by multiple sources).

    Implements sophisticated consensus logic:
    - Detects data point agreement across providers
    - Uses majority voting for conflicting values
    - Applies quality-weighted consensus calculation
    - Handles partial agreement scenarios
    - Calculates confidence scores

    Args:
        results: List of valid results

    Returns:
        Consensus result with confidence scores
    """
    if not results:
        return {}

    # Extract all data points with provider and quality information.
    # Each point is tagged with "_provider"/"_quality" metadata fields,
    # which _build_consensus_item later strips/consumes.
    all_data_points: List[Dict[str, Any]] = []
    for result in results:
        provider = result.get("provider", "unknown")
        data = result.get("data", [])
        metadata = result.get("metadata", {})
        # 0.5 is the neutral default quality, matching _fuse_best_quality.
        quality_score = metadata.get("quality", {}).get("score", 0.5)

        # Normalize data to list format: list items and single-dict
        # payloads both become tagged dicts; scalars are wrapped.
        if isinstance(data, list):
            for item in data:
                if isinstance(item, dict):
                    enriched_item = item.copy()
                    enriched_item["_provider"] = provider
                    enriched_item["_quality"] = quality_score
                    all_data_points.append(enriched_item)
                else:
                    all_data_points.append({
                        "value": item,
                        "_provider": provider,
                        "_quality": quality_score
                    })
        elif isinstance(data, dict):
            enriched_data = data.copy()
            enriched_data["_provider"] = provider
            enriched_data["_quality"] = quality_score
            all_data_points.append(enriched_data)

    if not all_data_points:
        # Fallback to best quality if no data points (e.g. every
        # provider returned an empty list).
        return self._fuse_best_quality(results)

    # Group matching data points (agreement detection) via pairwise
    # duplicate detection.
    data_groups = self._group_matching_data_points(all_data_points)

    # Build consensus result.
    consensus_data: List[Dict[str, Any]] = []
    total_confidence = 0.0
    agreement_stats: Dict[str, int] = {
        "full_agreement": 0,
        "partial_agreement": 0,
        "conflicts": 0,
        "single_source": 0
    }

    for group in data_groups:
        if len(group) == 0:
            continue

        # Build consensus item from group.
        # NOTE(review): agreement_type is assumed to always be one of the
        # four agreement_stats keys above — this is an implicit contract
        # with _build_consensus_item (defined later in this file); any
        # other value would raise KeyError here. Confirm against that
        # method's full implementation.
        consensus_item, confidence, agreement_type = self._build_consensus_item(group)
        consensus_data.append(consensus_item)
        total_confidence += confidence
        agreement_stats[agreement_type] += 1

    # Calculate average confidence across consensus items (0.0 when
    # every group was skipped).
    avg_confidence = total_confidence / len(consensus_data) if consensus_data else 0.0

    # Build consensus result.
    consensus_result: Dict[str, Any] = {
        "operation": "multi_provider_search",
        "data": consensus_data,
        "metadata": {
            "fusion_info": {
                "strategy": self.STRATEGY_CONSENSUS,
                "total_providers": len(results),
                "providers": [r.get("provider", "unknown") for r in results],
                "consensus_confidence": avg_confidence,
                "agreement_stats": agreement_stats,
                "data_points_analyzed": len(all_data_points),
                "consensus_groups": len(data_groups),
            }
        }
    }

    return consensus_result
|
|
272
|
+
|
|
273
|
+
def _group_matching_data_points(self, data_points: List[Dict[str, Any]]) -> List[List[Dict[str, Any]]]:
|
|
274
|
+
"""
|
|
275
|
+
Group data points that represent the same entity/data point.
|
|
276
|
+
|
|
277
|
+
Uses duplicate detection to identify matching data points across providers.
|
|
278
|
+
|
|
279
|
+
Args:
|
|
280
|
+
data_points: List of data points with provider info
|
|
281
|
+
|
|
282
|
+
Returns:
|
|
283
|
+
List of groups, where each group contains matching data points
|
|
284
|
+
"""
|
|
285
|
+
groups: List[List[Dict[str, Any]]] = []
|
|
286
|
+
processed = set()
|
|
287
|
+
|
|
288
|
+
for i, data_point in enumerate(data_points):
|
|
289
|
+
if i in processed:
|
|
290
|
+
continue
|
|
291
|
+
|
|
292
|
+
# Start a new group with this data point
|
|
293
|
+
group = [data_point]
|
|
294
|
+
processed.add(i)
|
|
295
|
+
|
|
296
|
+
# Find matching data points
|
|
297
|
+
for j, other_point in enumerate(data_points[i + 1:], start=i + 1):
|
|
298
|
+
if j in processed:
|
|
299
|
+
continue
|
|
300
|
+
|
|
301
|
+
is_duplicate, similarity = self.detect_duplicate_data(data_point, other_point)
|
|
302
|
+
if is_duplicate:
|
|
303
|
+
group.append(other_point)
|
|
304
|
+
processed.add(j)
|
|
305
|
+
|
|
306
|
+
groups.append(group)
|
|
307
|
+
|
|
308
|
+
return groups
|
|
309
|
+
|
|
310
|
+
def _build_consensus_item(
|
|
311
|
+
self, group: List[Dict[str, Any]]
|
|
312
|
+
) -> Tuple[Dict[str, Any], float, str]:
|
|
313
|
+
"""
|
|
314
|
+
Build a consensus item from a group of matching data points.
|
|
315
|
+
|
|
316
|
+
Args:
|
|
317
|
+
group: List of matching data points from different providers
|
|
318
|
+
|
|
319
|
+
Returns:
|
|
320
|
+
Tuple of (consensus_item, confidence_score, agreement_type)
|
|
321
|
+
"""
|
|
322
|
+
if len(group) == 1:
|
|
323
|
+
# Single source - use as-is with lower confidence
|
|
324
|
+
item = group[0].copy()
|
|
325
|
+
item.pop("_provider", None)
|
|
326
|
+
item.pop("_quality", None)
|
|
327
|
+
return item, 0.5, "single_source"
|
|
328
|
+
|
|
329
|
+
# Multiple sources - build consensus
|
|
330
|
+
consensus_item: Dict[str, Any] = {}
|
|
331
|
+
field_agreements: Dict[str, List[Tuple[Any, float]]] = {} # field -> [(value, quality), ...]
|
|
332
|
+
|
|
333
|
+
# Collect all field values with their quality scores
|
|
334
|
+
for data_point in group:
|
|
335
|
+
quality = data_point.get("_quality", 0.5)
|
|
336
|
+
for key, value in data_point.items():
|
|
337
|
+
if key.startswith("_"): # Skip metadata fields
|
|
338
|
+
continue
|
|
339
|
+
if key not in field_agreements:
|
|
340
|
+
field_agreements[key] = []
|
|
341
|
+
field_agreements[key].append((value, quality))
|
|
342
|
+
|
|
343
|
+
# Build consensus for each field
|
|
344
|
+
field_confidences: Dict[str, float] = {}
|
|
345
|
+
full_agreement_count = 0
|
|
346
|
+
partial_agreement_count = 0
|
|
347
|
+
conflict_count = 0
|
|
348
|
+
|
|
349
|
+
for field, value_quality_pairs in field_agreements.items():
|
|
350
|
+
# Detect agreement
|
|
351
|
+
unique_values = {}
|
|
352
|
+
for value, quality in value_quality_pairs:
|
|
353
|
+
value_key = str(value) # Use string for comparison
|
|
354
|
+
if value_key not in unique_values:
|
|
355
|
+
unique_values[value_key] = []
|
|
356
|
+
unique_values[value_key].append((value, quality))
|
|
357
|
+
|
|
358
|
+
if len(unique_values) == 1:
|
|
359
|
+
# Full agreement - all providers have same value
|
|
360
|
+
consensus_item[field] = value_quality_pairs[0][0]
|
|
361
|
+
# Confidence based on number of agreeing sources and quality
|
|
362
|
+
avg_quality = sum(q for _, q in value_quality_pairs) / len(value_quality_pairs)
|
|
363
|
+
agreement_ratio = len(value_quality_pairs) / len(group)
|
|
364
|
+
field_confidences[field] = avg_quality * agreement_ratio
|
|
365
|
+
full_agreement_count += 1
|
|
366
|
+
else:
|
|
367
|
+
# Conflict - resolve using majority voting or quality weighting
|
|
368
|
+
consensus_value, field_confidence = self._resolve_field_conflict(
|
|
369
|
+
unique_values, len(group)
|
|
370
|
+
)
|
|
371
|
+
consensus_item[field] = consensus_value
|
|
372
|
+
field_confidences[field] = field_confidence
|
|
373
|
+
|
|
374
|
+
# Check if majority agrees (>= 50%)
|
|
375
|
+
max_agreement = max(len(vals) for vals in unique_values.values())
|
|
376
|
+
if max_agreement >= len(group) * 0.5:
|
|
377
|
+
partial_agreement_count += 1
|
|
378
|
+
else:
|
|
379
|
+
conflict_count += 1
|
|
380
|
+
|
|
381
|
+
# Calculate overall confidence
|
|
382
|
+
if field_confidences:
|
|
383
|
+
overall_confidence = sum(field_confidences.values()) / len(field_confidences)
|
|
384
|
+
else:
|
|
385
|
+
overall_confidence = 0.5
|
|
386
|
+
|
|
387
|
+
# Determine agreement type
|
|
388
|
+
if conflict_count == 0 and partial_agreement_count == 0:
|
|
389
|
+
agreement_type = "full_agreement"
|
|
390
|
+
elif conflict_count == 0:
|
|
391
|
+
agreement_type = "partial_agreement"
|
|
392
|
+
else:
|
|
393
|
+
agreement_type = "conflicts"
|
|
394
|
+
|
|
395
|
+
# Add consensus metadata
|
|
396
|
+
consensus_item["_consensus_metadata"] = {
|
|
397
|
+
"sources_count": len(group),
|
|
398
|
+
"providers": [dp.get("_provider", "unknown") for dp in group],
|
|
399
|
+
"field_confidences": field_confidences,
|
|
400
|
+
"overall_confidence": overall_confidence,
|
|
401
|
+
"agreement_type": agreement_type,
|
|
402
|
+
}
|
|
403
|
+
|
|
404
|
+
return consensus_item, overall_confidence, agreement_type
|
|
405
|
+
|
|
406
|
+
def _resolve_field_conflict(
|
|
407
|
+
self, unique_values: Dict[str, List[Tuple[Any, float]]], total_sources: int
|
|
408
|
+
) -> Tuple[Any, float]:
|
|
409
|
+
"""
|
|
410
|
+
Resolve conflict for a single field using majority voting and quality weighting.
|
|
411
|
+
|
|
412
|
+
Args:
|
|
413
|
+
unique_values: Dict mapping value strings to list of (value, quality) tuples
|
|
414
|
+
total_sources: Total number of sources
|
|
415
|
+
|
|
416
|
+
Returns:
|
|
417
|
+
Tuple of (resolved_value, confidence_score)
|
|
418
|
+
"""
|
|
419
|
+
# Calculate support (count) and quality-weighted scores for each value
|
|
420
|
+
value_scores: List[Tuple[Any, float, int]] = [] # (value, quality_weighted_score, count)
|
|
421
|
+
|
|
422
|
+
for value_str, value_quality_pairs in unique_values.items():
|
|
423
|
+
count = len(value_quality_pairs)
|
|
424
|
+
# Quality-weighted score: average quality * support ratio
|
|
425
|
+
avg_quality = sum(q for _, q in value_quality_pairs) / count
|
|
426
|
+
support_ratio = count / total_sources
|
|
427
|
+
quality_weighted_score = avg_quality * support_ratio
|
|
428
|
+
|
|
429
|
+
# Get original value (not string)
|
|
430
|
+
original_value = value_quality_pairs[0][0]
|
|
431
|
+
value_scores.append((original_value, quality_weighted_score, count))
|
|
432
|
+
|
|
433
|
+
# Sort by quality-weighted score (descending), then by count (descending)
|
|
434
|
+
value_scores.sort(key=lambda x: (x[1], x[2]), reverse=True)
|
|
435
|
+
|
|
436
|
+
# Use majority voting: if majority agrees (>50%), use that value
|
|
437
|
+
best_value, best_score, best_count = value_scores[0]
|
|
438
|
+
|
|
439
|
+
# Check if majority agrees
|
|
440
|
+
if best_count > total_sources / 2:
|
|
441
|
+
# Majority vote wins
|
|
442
|
+
confidence = best_score * (best_count / total_sources)
|
|
443
|
+
else:
|
|
444
|
+
# No clear majority - use quality-weighted consensus
|
|
445
|
+
# Confidence is lower when no majority
|
|
446
|
+
confidence = best_score * 0.7 # Penalty for no majority
|
|
447
|
+
|
|
448
|
+
return best_value, confidence
|
|
449
|
+
|
|
450
|
+
def detect_duplicate_data(
    self,
    data1: Dict[str, Any],
    data2: Dict[str, Any],
    key_fields: Optional[List[str]] = None,
) -> Tuple[bool, float]:
    """
    Decide whether two data items describe the same record.

    Compares identifier-like fields present in both items; when none are
    shared, falls back to fuzzy text comparison of title/name fields.

    Args:
        data1: First data item
        data2: Second data item
        key_fields: Fields to compare (auto-detected if None)

    Returns:
        Tuple of (is_duplicate, similarity_score)
    """
    if key_fields is None:
        # Identifier-like fields commonly emitted by the providers.
        key_fields = [
            "id",
            "series_id",
            "indicator_code",
            "indicator_id",
            "title",
            "name",
            "code",
        ]

    shared = [field for field in key_fields if field in data1 and field in data2]
    if not shared:
        # Nothing directly comparable; fall back to fuzzy text matching.
        return self._check_text_similarity(data1, data2)

    matching = sum(1 for field in shared if data1[field] == data2[field])
    score = matching / len(shared)
    return score > 0.8, score
|
|
496
|
+
|
|
497
|
+
def _check_text_similarity(self, data1: Dict[str, Any], data2: Dict[str, Any]) -> Tuple[bool, float]:
|
|
498
|
+
"""
|
|
499
|
+
Check text similarity for title/name fields.
|
|
500
|
+
|
|
501
|
+
Args:
|
|
502
|
+
data1: First data item
|
|
503
|
+
data2: Second data item
|
|
504
|
+
|
|
505
|
+
Returns:
|
|
506
|
+
Tuple of (is_duplicate, similarity_score)
|
|
507
|
+
"""
|
|
508
|
+
text_fields = ["title", "name", "description"]
|
|
509
|
+
|
|
510
|
+
for field in text_fields:
|
|
511
|
+
if field in data1 and field in data2:
|
|
512
|
+
text1 = str(data1[field]).lower()
|
|
513
|
+
text2 = str(data2[field]).lower()
|
|
514
|
+
|
|
515
|
+
# Simple word-based similarity
|
|
516
|
+
words1 = set(text1.split())
|
|
517
|
+
words2 = set(text2.split())
|
|
518
|
+
|
|
519
|
+
if not words1 or not words2:
|
|
520
|
+
continue
|
|
521
|
+
|
|
522
|
+
intersection = len(words1 & words2)
|
|
523
|
+
union = len(words1 | words2)
|
|
524
|
+
|
|
525
|
+
similarity = intersection / union if union > 0 else 0.0
|
|
526
|
+
|
|
527
|
+
if similarity > 0.7:
|
|
528
|
+
return True, similarity
|
|
529
|
+
|
|
530
|
+
return False, 0.0
|
|
531
|
+
|
|
532
|
+
def resolve_conflict(
    self,
    values: List[Dict[str, Any]],
    resolution_strategy: str = "quality",
) -> Any:
    """
    Resolve conflicts when multiple sources provide different values.

    Args:
        values: List of value dictionaries with {'value': ..., 'quality': ..., 'source': ...}
        resolution_strategy: Strategy for resolution ('quality', 'majority', 'average')

    Returns:
        Resolved value (None for empty input; first value for an
        unrecognized strategy or when 'average' finds nothing numeric)
    """
    if not values:
        return None

    if len(values) == 1:
        return values[0].get("value")

    if resolution_strategy == "quality":
        # Choose the value from the source with the highest quality.
        # 'quality' may be a dict carrying a 'score' key or a bare number;
        # anything else counts as zero instead of raising AttributeError
        # (the old `.get("quality", {}).get("score", 0)` crashed on floats).
        def quality_score(entry: Dict[str, Any]) -> float:
            quality = entry.get("quality", 0)
            if isinstance(quality, dict):
                return quality.get("score", 0)
            if isinstance(quality, (int, float)):
                return quality
            return 0

        best = max(values, key=quality_score)
        return best.get("value")

    elif resolution_strategy == "majority":
        # Use the most common value; compare by string form so unhashable
        # values (lists, dicts) still work, but return the original object.
        from collections import Counter

        value_counts = Counter([str(v.get("value")) for v in values])
        most_common = value_counts.most_common(1)[0][0]
        for v in values:
            if str(v.get("value")) == most_common:
                return v.get("value")

    elif resolution_strategy == "average":
        # Average numeric values; entries that cannot be coerced to float
        # are skipped rather than aborting the whole computation.
        numeric_values = []
        for v in values:
            value = v.get("value")
            if value is None:
                continue
            try:
                numeric_values.append(float(value))
            except (ValueError, TypeError):
                continue
        if numeric_values:
            return sum(numeric_values) / len(numeric_values)
        # Nothing numeric - fall through to the default below.

    # Default: return first value
    return values[0].get("value")
|
|
588
|
+
|
|
589
|
+
def deduplicate_results(
    self,
    data_list: List[Dict[str, Any]],
    key_fields: Optional[List[str]] = None,
) -> List[Dict[str, Any]]:
    """
    Remove duplicate entries from a data list.

    Duplicates are detected via a signature built from identifying fields.
    The signature includes the field *names* as well as the values, so
    items identified by different fields (e.g. {'id': 1} vs
    {'series_id': 1}) are never conflated - the old value-only tuples made
    both collapse into (1,) and silently dropped distinct records.

    Args:
        data_list: List of data items
        key_fields: Fields to use for duplicate detection

    Returns:
        Deduplicated list (first occurrence of each signature wins)
    """
    if not data_list:
        return []

    # Default identifying fields when none are given.
    default_fields = [
        "id",
        "series_id",
        "indicator_code",
        "title",
        "name",
    ]
    fields = key_fields if key_fields else default_fields

    unique_data: List[Dict[str, Any]] = []
    seen_signatures = set()

    for item in data_list:
        # (field, value) pairs so equal values under different keys
        # do not collide into the same signature.
        signature = tuple((field, item.get(field)) for field in fields if field in item)

        if not signature:
            # No identifiable fields at all - keep the item.
            unique_data.append(item)
            continue

        try:
            is_new = signature not in seen_signatures
            if is_new:
                seen_signatures.add(signature)
        except TypeError:
            # Unhashable value (list/dict) in a key field - cannot be
            # tracked in the seen-set; keep the item rather than crash.
            is_new = True

        if is_new:
            unique_data.append(item)

    return unique_data
|