aiecs 1.0.1__py3-none-any.whl → 1.7.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of aiecs might be problematic. Click here for more details.
- aiecs/__init__.py +13 -16
- aiecs/__main__.py +7 -7
- aiecs/aiecs_client.py +269 -75
- aiecs/application/executors/operation_executor.py +79 -54
- aiecs/application/knowledge_graph/__init__.py +7 -0
- aiecs/application/knowledge_graph/builder/__init__.py +37 -0
- aiecs/application/knowledge_graph/builder/data_quality.py +302 -0
- aiecs/application/knowledge_graph/builder/data_reshaping.py +293 -0
- aiecs/application/knowledge_graph/builder/document_builder.py +369 -0
- aiecs/application/knowledge_graph/builder/graph_builder.py +490 -0
- aiecs/application/knowledge_graph/builder/import_optimizer.py +396 -0
- aiecs/application/knowledge_graph/builder/schema_inference.py +462 -0
- aiecs/application/knowledge_graph/builder/schema_mapping.py +563 -0
- aiecs/application/knowledge_graph/builder/structured_pipeline.py +1384 -0
- aiecs/application/knowledge_graph/builder/text_chunker.py +317 -0
- aiecs/application/knowledge_graph/extractors/__init__.py +27 -0
- aiecs/application/knowledge_graph/extractors/base.py +98 -0
- aiecs/application/knowledge_graph/extractors/llm_entity_extractor.py +422 -0
- aiecs/application/knowledge_graph/extractors/llm_relation_extractor.py +347 -0
- aiecs/application/knowledge_graph/extractors/ner_entity_extractor.py +241 -0
- aiecs/application/knowledge_graph/fusion/__init__.py +78 -0
- aiecs/application/knowledge_graph/fusion/ab_testing.py +395 -0
- aiecs/application/knowledge_graph/fusion/abbreviation_expander.py +327 -0
- aiecs/application/knowledge_graph/fusion/alias_index.py +597 -0
- aiecs/application/knowledge_graph/fusion/alias_matcher.py +384 -0
- aiecs/application/knowledge_graph/fusion/cache_coordinator.py +343 -0
- aiecs/application/knowledge_graph/fusion/entity_deduplicator.py +433 -0
- aiecs/application/knowledge_graph/fusion/entity_linker.py +511 -0
- aiecs/application/knowledge_graph/fusion/evaluation_dataset.py +240 -0
- aiecs/application/knowledge_graph/fusion/knowledge_fusion.py +632 -0
- aiecs/application/knowledge_graph/fusion/matching_config.py +489 -0
- aiecs/application/knowledge_graph/fusion/name_normalizer.py +352 -0
- aiecs/application/knowledge_graph/fusion/relation_deduplicator.py +183 -0
- aiecs/application/knowledge_graph/fusion/semantic_name_matcher.py +464 -0
- aiecs/application/knowledge_graph/fusion/similarity_pipeline.py +534 -0
- aiecs/application/knowledge_graph/pattern_matching/__init__.py +21 -0
- aiecs/application/knowledge_graph/pattern_matching/pattern_matcher.py +342 -0
- aiecs/application/knowledge_graph/pattern_matching/query_executor.py +366 -0
- aiecs/application/knowledge_graph/profiling/__init__.py +12 -0
- aiecs/application/knowledge_graph/profiling/query_plan_visualizer.py +195 -0
- aiecs/application/knowledge_graph/profiling/query_profiler.py +223 -0
- aiecs/application/knowledge_graph/reasoning/__init__.py +27 -0
- aiecs/application/knowledge_graph/reasoning/evidence_synthesis.py +341 -0
- aiecs/application/knowledge_graph/reasoning/inference_engine.py +500 -0
- aiecs/application/knowledge_graph/reasoning/logic_form_parser.py +163 -0
- aiecs/application/knowledge_graph/reasoning/logic_parser/__init__.py +79 -0
- aiecs/application/knowledge_graph/reasoning/logic_parser/ast_builder.py +513 -0
- aiecs/application/knowledge_graph/reasoning/logic_parser/ast_nodes.py +913 -0
- aiecs/application/knowledge_graph/reasoning/logic_parser/ast_validator.py +866 -0
- aiecs/application/knowledge_graph/reasoning/logic_parser/error_handler.py +475 -0
- aiecs/application/knowledge_graph/reasoning/logic_parser/parser.py +396 -0
- aiecs/application/knowledge_graph/reasoning/logic_parser/query_context.py +208 -0
- aiecs/application/knowledge_graph/reasoning/logic_query_integration.py +170 -0
- aiecs/application/knowledge_graph/reasoning/query_planner.py +855 -0
- aiecs/application/knowledge_graph/reasoning/reasoning_engine.py +518 -0
- aiecs/application/knowledge_graph/retrieval/__init__.py +27 -0
- aiecs/application/knowledge_graph/retrieval/query_intent_classifier.py +211 -0
- aiecs/application/knowledge_graph/retrieval/retrieval_strategies.py +592 -0
- aiecs/application/knowledge_graph/retrieval/strategy_types.py +23 -0
- aiecs/application/knowledge_graph/search/__init__.py +59 -0
- aiecs/application/knowledge_graph/search/hybrid_search.py +457 -0
- aiecs/application/knowledge_graph/search/reranker.py +293 -0
- aiecs/application/knowledge_graph/search/reranker_strategies.py +535 -0
- aiecs/application/knowledge_graph/search/text_similarity.py +392 -0
- aiecs/application/knowledge_graph/traversal/__init__.py +15 -0
- aiecs/application/knowledge_graph/traversal/enhanced_traversal.py +305 -0
- aiecs/application/knowledge_graph/traversal/path_scorer.py +271 -0
- aiecs/application/knowledge_graph/validators/__init__.py +13 -0
- aiecs/application/knowledge_graph/validators/relation_validator.py +239 -0
- aiecs/application/knowledge_graph/visualization/__init__.py +11 -0
- aiecs/application/knowledge_graph/visualization/graph_visualizer.py +313 -0
- aiecs/common/__init__.py +9 -0
- aiecs/common/knowledge_graph/__init__.py +17 -0
- aiecs/common/knowledge_graph/runnable.py +471 -0
- aiecs/config/__init__.py +20 -5
- aiecs/config/config.py +762 -31
- aiecs/config/graph_config.py +131 -0
- aiecs/config/tool_config.py +399 -0
- aiecs/core/__init__.py +29 -13
- aiecs/core/interface/__init__.py +2 -2
- aiecs/core/interface/execution_interface.py +22 -22
- aiecs/core/interface/storage_interface.py +37 -88
- aiecs/core/registry/__init__.py +31 -0
- aiecs/core/registry/service_registry.py +92 -0
- aiecs/domain/__init__.py +270 -1
- aiecs/domain/agent/__init__.py +191 -0
- aiecs/domain/agent/base_agent.py +3870 -0
- aiecs/domain/agent/exceptions.py +99 -0
- aiecs/domain/agent/graph_aware_mixin.py +569 -0
- aiecs/domain/agent/hybrid_agent.py +1435 -0
- aiecs/domain/agent/integration/__init__.py +29 -0
- aiecs/domain/agent/integration/context_compressor.py +216 -0
- aiecs/domain/agent/integration/context_engine_adapter.py +587 -0
- aiecs/domain/agent/integration/protocols.py +281 -0
- aiecs/domain/agent/integration/retry_policy.py +218 -0
- aiecs/domain/agent/integration/role_config.py +213 -0
- aiecs/domain/agent/knowledge_aware_agent.py +1892 -0
- aiecs/domain/agent/lifecycle.py +291 -0
- aiecs/domain/agent/llm_agent.py +692 -0
- aiecs/domain/agent/memory/__init__.py +12 -0
- aiecs/domain/agent/memory/conversation.py +1124 -0
- aiecs/domain/agent/migration/__init__.py +14 -0
- aiecs/domain/agent/migration/conversion.py +163 -0
- aiecs/domain/agent/migration/legacy_wrapper.py +86 -0
- aiecs/domain/agent/models.py +884 -0
- aiecs/domain/agent/observability.py +479 -0
- aiecs/domain/agent/persistence.py +449 -0
- aiecs/domain/agent/prompts/__init__.py +29 -0
- aiecs/domain/agent/prompts/builder.py +159 -0
- aiecs/domain/agent/prompts/formatters.py +187 -0
- aiecs/domain/agent/prompts/template.py +255 -0
- aiecs/domain/agent/registry.py +253 -0
- aiecs/domain/agent/tool_agent.py +444 -0
- aiecs/domain/agent/tools/__init__.py +15 -0
- aiecs/domain/agent/tools/schema_generator.py +364 -0
- aiecs/domain/community/__init__.py +155 -0
- aiecs/domain/community/agent_adapter.py +469 -0
- aiecs/domain/community/analytics.py +432 -0
- aiecs/domain/community/collaborative_workflow.py +648 -0
- aiecs/domain/community/communication_hub.py +634 -0
- aiecs/domain/community/community_builder.py +320 -0
- aiecs/domain/community/community_integration.py +796 -0
- aiecs/domain/community/community_manager.py +803 -0
- aiecs/domain/community/decision_engine.py +849 -0
- aiecs/domain/community/exceptions.py +231 -0
- aiecs/domain/community/models/__init__.py +33 -0
- aiecs/domain/community/models/community_models.py +234 -0
- aiecs/domain/community/resource_manager.py +461 -0
- aiecs/domain/community/shared_context_manager.py +589 -0
- aiecs/domain/context/__init__.py +40 -10
- aiecs/domain/context/context_engine.py +1910 -0
- aiecs/domain/context/conversation_models.py +87 -53
- aiecs/domain/context/graph_memory.py +582 -0
- aiecs/domain/execution/model.py +12 -4
- aiecs/domain/knowledge_graph/__init__.py +19 -0
- aiecs/domain/knowledge_graph/models/__init__.py +52 -0
- aiecs/domain/knowledge_graph/models/entity.py +148 -0
- aiecs/domain/knowledge_graph/models/evidence.py +178 -0
- aiecs/domain/knowledge_graph/models/inference_rule.py +184 -0
- aiecs/domain/knowledge_graph/models/path.py +171 -0
- aiecs/domain/knowledge_graph/models/path_pattern.py +171 -0
- aiecs/domain/knowledge_graph/models/query.py +261 -0
- aiecs/domain/knowledge_graph/models/query_plan.py +181 -0
- aiecs/domain/knowledge_graph/models/relation.py +202 -0
- aiecs/domain/knowledge_graph/schema/__init__.py +23 -0
- aiecs/domain/knowledge_graph/schema/entity_type.py +131 -0
- aiecs/domain/knowledge_graph/schema/graph_schema.py +253 -0
- aiecs/domain/knowledge_graph/schema/property_schema.py +143 -0
- aiecs/domain/knowledge_graph/schema/relation_type.py +163 -0
- aiecs/domain/knowledge_graph/schema/schema_manager.py +691 -0
- aiecs/domain/knowledge_graph/schema/type_enums.py +209 -0
- aiecs/domain/task/dsl_processor.py +172 -56
- aiecs/domain/task/model.py +20 -8
- aiecs/domain/task/task_context.py +27 -24
- aiecs/infrastructure/__init__.py +0 -2
- aiecs/infrastructure/graph_storage/__init__.py +11 -0
- aiecs/infrastructure/graph_storage/base.py +837 -0
- aiecs/infrastructure/graph_storage/batch_operations.py +458 -0
- aiecs/infrastructure/graph_storage/cache.py +424 -0
- aiecs/infrastructure/graph_storage/distributed.py +223 -0
- aiecs/infrastructure/graph_storage/error_handling.py +380 -0
- aiecs/infrastructure/graph_storage/graceful_degradation.py +294 -0
- aiecs/infrastructure/graph_storage/health_checks.py +378 -0
- aiecs/infrastructure/graph_storage/in_memory.py +1197 -0
- aiecs/infrastructure/graph_storage/index_optimization.py +446 -0
- aiecs/infrastructure/graph_storage/lazy_loading.py +431 -0
- aiecs/infrastructure/graph_storage/metrics.py +344 -0
- aiecs/infrastructure/graph_storage/migration.py +400 -0
- aiecs/infrastructure/graph_storage/pagination.py +483 -0
- aiecs/infrastructure/graph_storage/performance_monitoring.py +456 -0
- aiecs/infrastructure/graph_storage/postgres.py +1563 -0
- aiecs/infrastructure/graph_storage/property_storage.py +353 -0
- aiecs/infrastructure/graph_storage/protocols.py +76 -0
- aiecs/infrastructure/graph_storage/query_optimizer.py +642 -0
- aiecs/infrastructure/graph_storage/schema_cache.py +290 -0
- aiecs/infrastructure/graph_storage/sqlite.py +1373 -0
- aiecs/infrastructure/graph_storage/streaming.py +487 -0
- aiecs/infrastructure/graph_storage/tenant.py +412 -0
- aiecs/infrastructure/messaging/celery_task_manager.py +92 -54
- aiecs/infrastructure/messaging/websocket_manager.py +51 -35
- aiecs/infrastructure/monitoring/__init__.py +22 -0
- aiecs/infrastructure/monitoring/executor_metrics.py +45 -11
- aiecs/infrastructure/monitoring/global_metrics_manager.py +212 -0
- aiecs/infrastructure/monitoring/structured_logger.py +3 -7
- aiecs/infrastructure/monitoring/tracing_manager.py +63 -35
- aiecs/infrastructure/persistence/__init__.py +14 -1
- aiecs/infrastructure/persistence/context_engine_client.py +184 -0
- aiecs/infrastructure/persistence/database_manager.py +67 -43
- aiecs/infrastructure/persistence/file_storage.py +180 -103
- aiecs/infrastructure/persistence/redis_client.py +74 -21
- aiecs/llm/__init__.py +73 -25
- aiecs/llm/callbacks/__init__.py +11 -0
- aiecs/llm/{custom_callbacks.py → callbacks/custom_callbacks.py} +26 -19
- aiecs/llm/client_factory.py +224 -36
- aiecs/llm/client_resolver.py +155 -0
- aiecs/llm/clients/__init__.py +38 -0
- aiecs/llm/clients/base_client.py +324 -0
- aiecs/llm/clients/google_function_calling_mixin.py +457 -0
- aiecs/llm/clients/googleai_client.py +241 -0
- aiecs/llm/clients/openai_client.py +158 -0
- aiecs/llm/clients/openai_compatible_mixin.py +367 -0
- aiecs/llm/clients/vertex_client.py +897 -0
- aiecs/llm/clients/xai_client.py +201 -0
- aiecs/llm/config/__init__.py +51 -0
- aiecs/llm/config/config_loader.py +272 -0
- aiecs/llm/config/config_validator.py +206 -0
- aiecs/llm/config/model_config.py +143 -0
- aiecs/llm/protocols.py +149 -0
- aiecs/llm/utils/__init__.py +10 -0
- aiecs/llm/utils/validate_config.py +89 -0
- aiecs/main.py +140 -121
- aiecs/scripts/aid/VERSION_MANAGEMENT.md +138 -0
- aiecs/scripts/aid/__init__.py +19 -0
- aiecs/scripts/aid/module_checker.py +499 -0
- aiecs/scripts/aid/version_manager.py +235 -0
- aiecs/scripts/{DEPENDENCY_SYSTEM_SUMMARY.md → dependance_check/DEPENDENCY_SYSTEM_SUMMARY.md} +1 -0
- aiecs/scripts/{README_DEPENDENCY_CHECKER.md → dependance_check/README_DEPENDENCY_CHECKER.md} +1 -0
- aiecs/scripts/dependance_check/__init__.py +15 -0
- aiecs/scripts/dependance_check/dependency_checker.py +1835 -0
- aiecs/scripts/{dependency_fixer.py → dependance_check/dependency_fixer.py} +192 -90
- aiecs/scripts/{download_nlp_data.py → dependance_check/download_nlp_data.py} +203 -71
- aiecs/scripts/dependance_patch/__init__.py +7 -0
- aiecs/scripts/dependance_patch/fix_weasel/__init__.py +11 -0
- aiecs/scripts/{fix_weasel_validator.py → dependance_patch/fix_weasel/fix_weasel_validator.py} +21 -14
- aiecs/scripts/{patch_weasel_library.sh → dependance_patch/fix_weasel/patch_weasel_library.sh} +1 -1
- aiecs/scripts/knowledge_graph/__init__.py +3 -0
- aiecs/scripts/knowledge_graph/run_threshold_experiments.py +212 -0
- aiecs/scripts/migrations/multi_tenancy/README.md +142 -0
- aiecs/scripts/tools_develop/README.md +671 -0
- aiecs/scripts/tools_develop/README_CONFIG_CHECKER.md +273 -0
- aiecs/scripts/tools_develop/TOOLS_CONFIG_GUIDE.md +1287 -0
- aiecs/scripts/tools_develop/TOOL_AUTO_DISCOVERY.md +234 -0
- aiecs/scripts/tools_develop/__init__.py +21 -0
- aiecs/scripts/tools_develop/check_all_tools_config.py +548 -0
- aiecs/scripts/tools_develop/check_type_annotations.py +257 -0
- aiecs/scripts/tools_develop/pre-commit-schema-coverage.sh +66 -0
- aiecs/scripts/tools_develop/schema_coverage.py +511 -0
- aiecs/scripts/tools_develop/validate_tool_schemas.py +475 -0
- aiecs/scripts/tools_develop/verify_executor_config_fix.py +98 -0
- aiecs/scripts/tools_develop/verify_tools.py +352 -0
- aiecs/tasks/__init__.py +0 -1
- aiecs/tasks/worker.py +115 -47
- aiecs/tools/__init__.py +194 -72
- aiecs/tools/apisource/__init__.py +99 -0
- aiecs/tools/apisource/intelligence/__init__.py +19 -0
- aiecs/tools/apisource/intelligence/data_fusion.py +632 -0
- aiecs/tools/apisource/intelligence/query_analyzer.py +417 -0
- aiecs/tools/apisource/intelligence/search_enhancer.py +385 -0
- aiecs/tools/apisource/monitoring/__init__.py +9 -0
- aiecs/tools/apisource/monitoring/metrics.py +330 -0
- aiecs/tools/apisource/providers/__init__.py +112 -0
- aiecs/tools/apisource/providers/base.py +671 -0
- aiecs/tools/apisource/providers/census.py +397 -0
- aiecs/tools/apisource/providers/fred.py +535 -0
- aiecs/tools/apisource/providers/newsapi.py +409 -0
- aiecs/tools/apisource/providers/worldbank.py +352 -0
- aiecs/tools/apisource/reliability/__init__.py +12 -0
- aiecs/tools/apisource/reliability/error_handler.py +363 -0
- aiecs/tools/apisource/reliability/fallback_strategy.py +376 -0
- aiecs/tools/apisource/tool.py +832 -0
- aiecs/tools/apisource/utils/__init__.py +9 -0
- aiecs/tools/apisource/utils/validators.py +334 -0
- aiecs/tools/base_tool.py +415 -21
- aiecs/tools/docs/__init__.py +121 -0
- aiecs/tools/docs/ai_document_orchestrator.py +607 -0
- aiecs/tools/docs/ai_document_writer_orchestrator.py +2350 -0
- aiecs/tools/docs/content_insertion_tool.py +1320 -0
- aiecs/tools/docs/document_creator_tool.py +1323 -0
- aiecs/tools/docs/document_layout_tool.py +1160 -0
- aiecs/tools/docs/document_parser_tool.py +1011 -0
- aiecs/tools/docs/document_writer_tool.py +1829 -0
- aiecs/tools/knowledge_graph/__init__.py +17 -0
- aiecs/tools/knowledge_graph/graph_reasoning_tool.py +807 -0
- aiecs/tools/knowledge_graph/graph_search_tool.py +944 -0
- aiecs/tools/knowledge_graph/kg_builder_tool.py +524 -0
- aiecs/tools/langchain_adapter.py +300 -138
- aiecs/tools/schema_generator.py +455 -0
- aiecs/tools/search_tool/__init__.py +100 -0
- aiecs/tools/search_tool/analyzers.py +581 -0
- aiecs/tools/search_tool/cache.py +264 -0
- aiecs/tools/search_tool/constants.py +128 -0
- aiecs/tools/search_tool/context.py +224 -0
- aiecs/tools/search_tool/core.py +778 -0
- aiecs/tools/search_tool/deduplicator.py +119 -0
- aiecs/tools/search_tool/error_handler.py +242 -0
- aiecs/tools/search_tool/metrics.py +343 -0
- aiecs/tools/search_tool/rate_limiter.py +172 -0
- aiecs/tools/search_tool/schemas.py +275 -0
- aiecs/tools/statistics/__init__.py +80 -0
- aiecs/tools/statistics/ai_data_analysis_orchestrator.py +646 -0
- aiecs/tools/statistics/ai_insight_generator_tool.py +508 -0
- aiecs/tools/statistics/ai_report_orchestrator_tool.py +684 -0
- aiecs/tools/statistics/data_loader_tool.py +555 -0
- aiecs/tools/statistics/data_profiler_tool.py +638 -0
- aiecs/tools/statistics/data_transformer_tool.py +580 -0
- aiecs/tools/statistics/data_visualizer_tool.py +498 -0
- aiecs/tools/statistics/model_trainer_tool.py +507 -0
- aiecs/tools/statistics/statistical_analyzer_tool.py +472 -0
- aiecs/tools/task_tools/__init__.py +49 -36
- aiecs/tools/task_tools/chart_tool.py +200 -184
- aiecs/tools/task_tools/classfire_tool.py +268 -267
- aiecs/tools/task_tools/image_tool.py +175 -131
- aiecs/tools/task_tools/office_tool.py +226 -146
- aiecs/tools/task_tools/pandas_tool.py +477 -121
- aiecs/tools/task_tools/report_tool.py +390 -142
- aiecs/tools/task_tools/research_tool.py +149 -79
- aiecs/tools/task_tools/scraper_tool.py +339 -145
- aiecs/tools/task_tools/stats_tool.py +448 -209
- aiecs/tools/temp_file_manager.py +26 -24
- aiecs/tools/tool_executor/__init__.py +18 -16
- aiecs/tools/tool_executor/tool_executor.py +364 -52
- aiecs/utils/LLM_output_structor.py +74 -48
- aiecs/utils/__init__.py +14 -3
- aiecs/utils/base_callback.py +0 -3
- aiecs/utils/cache_provider.py +696 -0
- aiecs/utils/execution_utils.py +50 -31
- aiecs/utils/prompt_loader.py +1 -0
- aiecs/utils/token_usage_repository.py +37 -11
- aiecs/ws/socket_server.py +14 -4
- {aiecs-1.0.1.dist-info → aiecs-1.7.6.dist-info}/METADATA +52 -15
- aiecs-1.7.6.dist-info/RECORD +337 -0
- aiecs-1.7.6.dist-info/entry_points.txt +13 -0
- aiecs/config/registry.py +0 -19
- aiecs/domain/context/content_engine.py +0 -982
- aiecs/llm/base_client.py +0 -99
- aiecs/llm/openai_client.py +0 -125
- aiecs/llm/vertex_client.py +0 -186
- aiecs/llm/xai_client.py +0 -184
- aiecs/scripts/dependency_checker.py +0 -857
- aiecs/scripts/quick_dependency_check.py +0 -269
- aiecs/tools/task_tools/search_api.py +0 -7
- aiecs-1.0.1.dist-info/RECORD +0 -90
- aiecs-1.0.1.dist-info/entry_points.txt +0 -7
- /aiecs/scripts/{setup_nlp_data.sh → dependance_check/setup_nlp_data.sh} +0 -0
- /aiecs/scripts/{README_WEASEL_PATCH.md → dependance_patch/fix_weasel/README_WEASEL_PATCH.md} +0 -0
- /aiecs/scripts/{fix_weasel_validator.sh → dependance_patch/fix_weasel/fix_weasel_validator.sh} +0 -0
- /aiecs/scripts/{run_weasel_patch.sh → dependance_patch/fix_weasel/run_weasel_patch.sh} +0 -0
- {aiecs-1.0.1.dist-info → aiecs-1.7.6.dist-info}/WHEEL +0 -0
- {aiecs-1.0.1.dist-info → aiecs-1.7.6.dist-info}/licenses/LICENSE +0 -0
- {aiecs-1.0.1.dist-info → aiecs-1.7.6.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,464 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Semantic Name Matcher for Knowledge Graph Entity Fusion.
|
|
3
|
+
|
|
4
|
+
Provides embedding-based semantic matching for entity names using LLM embeddings.
|
|
5
|
+
Supports configurable similarity thresholds and caching to minimize API calls.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
import asyncio
|
|
9
|
+
import logging
|
|
10
|
+
import math
|
|
11
|
+
from dataclasses import dataclass, field
|
|
12
|
+
from typing import Dict, List, Optional, Tuple, Any
|
|
13
|
+
from collections import OrderedDict
|
|
14
|
+
import threading
|
|
15
|
+
|
|
16
|
+
from aiecs.llm import LLMClientFactory, AIProvider
|
|
17
|
+
from aiecs.llm.protocols import LLMClientProtocol
|
|
18
|
+
|
|
19
|
+
logger = logging.getLogger(__name__)
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
@dataclass
class SemanticMatchResult:
    """Outcome of comparing two entity names by embedding similarity."""

    # The two names that were compared, in the order they were supplied.
    name1: str
    name2: str
    # Similarity score computed for the pair.
    similarity: float
    # Whether the pair was judged to be a match.
    is_match: bool
    # Embedding vectors for name1 / name2; left as None when not supplied.
    embedding1: Optional[List[float]] = None
    embedding2: Optional[List[float]] = None
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
@dataclass
class EmbeddingCacheConfig:
    """Settings describing an embedding cache."""

    # Upper bound on the number of cached embeddings.
    max_size: int = 10000
    # Entry time-to-live in seconds; None means entries carry no TTL.
    ttl_seconds: Optional[int] = None
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
@dataclass
class SemanticMatcherConfig:
    """Tunable settings consumed by SemanticNameMatcher."""

    # Minimum similarity at which two names count as a match.
    similarity_threshold: float = 0.85
    # Name of the LLM provider that supplies embeddings.
    embedding_provider: str = "OpenAI"
    # Embedding model name; None defers to the provider's default.
    embedding_model: Optional[str] = None
    # Maximum number of embeddings held in the cache.
    cache_max_size: int = 10000
    # Number of names sent per embedding API call.
    batch_size: int = 100
    # Master switch for semantic matching.
    enabled: bool = True
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
class LRUEmbeddingCache:
    """
    LRU cache mapping entity names to embedding vectors.

    Keys are normalized (lower-cased, surrounding whitespace stripped) before
    every operation, so ``" Apple "`` and ``"apple"`` share a single entry.
    All access to the underlying OrderedDict happens while holding a
    ``threading.Lock``.
    """

    def __init__(self, max_size: int = 10000):
        """
        Create an empty cache.

        Args:
            max_size: Maximum number of entries to keep. A value of zero or
                less disables storage entirely (``set`` becomes a no-op).
        """
        self._cache: OrderedDict[str, List[float]] = OrderedDict()
        self._max_size = max_size
        self._lock = threading.Lock()
        self._hits = 0
        self._misses = 0

    @staticmethod
    def _normalize(key: str) -> str:
        """Return the canonical cache key for *key*."""
        return key.lower().strip()

    def get(self, key: str) -> Optional[List[float]]:
        """Return the cached embedding for *key*, or None on a miss.

        A hit marks the entry as most recently used. Hit and miss counters
        are updated on every call.
        """
        normalized_key = self._normalize(key)
        with self._lock:
            if normalized_key in self._cache:
                # Move to end (most recently used)
                self._cache.move_to_end(normalized_key)
                self._hits += 1
                return self._cache[normalized_key]
            self._misses += 1
            return None

    def set(self, key: str, embedding: List[float]) -> None:
        """Store *embedding* under *key*, evicting the LRU entry if full."""
        normalized_key = self._normalize(key)
        with self._lock:
            if normalized_key in self._cache:
                # Refresh the value and mark it as most recently used.
                self._cache[normalized_key] = embedding
                self._cache.move_to_end(normalized_key)
                return
            if self._max_size <= 0:
                # A zero-capacity cache stores nothing. Without this guard
                # the eviction below would call popitem() on an empty
                # OrderedDict and raise KeyError.
                return
            if len(self._cache) >= self._max_size:
                # Evict least recently used
                self._cache.popitem(last=False)
            self._cache[normalized_key] = embedding

    def invalidate(self, key: str) -> bool:
        """Remove *key* from the cache. Returns True if an entry was removed."""
        normalized_key = self._normalize(key)
        with self._lock:
            if normalized_key in self._cache:
                del self._cache[normalized_key]
                return True
            return False

    def invalidate_many(self, keys: List[str]) -> int:
        """Remove several keys from the cache. Returns the number removed."""
        removed = 0
        with self._lock:
            for key in keys:
                normalized_key = self._normalize(key)
                if normalized_key in self._cache:
                    del self._cache[normalized_key]
                    removed += 1
        return removed

    def clear(self) -> None:
        """Drop every entry and reset the hit/miss counters."""
        with self._lock:
            self._cache.clear()
            self._hits = 0
            self._misses = 0

    def size(self) -> int:
        """Return the number of entries currently cached."""
        with self._lock:
            return len(self._cache)

    def get_stats(self) -> Dict[str, Any]:
        """Return size, max_size, hits, misses and hit_rate as a dict.

        ``hit_rate`` is 0.0 when no lookups have been recorded.
        """
        with self._lock:
            total = self._hits + self._misses
            hit_rate = self._hits / total if total > 0 else 0.0
            return {
                "size": len(self._cache),
                "max_size": self._max_size,
                "hits": self._hits,
                "misses": self._misses,
                "hit_rate": hit_rate,
            }

    def contains(self, key: str) -> bool:
        """Return True if *key* is cached. Does not affect LRU order or stats."""
        normalized_key = self._normalize(key)
        with self._lock:
            return normalized_key in self._cache

    def get_all_keys(self) -> List[str]:
        """Return the normalized keys, least recently used first (for debugging/testing)."""
        with self._lock:
            return list(self._cache.keys())
|
|
153
|
+
|
|
154
|
+
|
|
155
|
+
class SemanticNameMatcher:
|
|
156
|
+
"""
|
|
157
|
+
Semantic name matcher using LLM embeddings.
|
|
158
|
+
|
|
159
|
+
Provides embedding-based entity name matching with:
|
|
160
|
+
- Configurable similarity threshold
|
|
161
|
+
- LRU embedding cache to minimize API calls
|
|
162
|
+
- Batch embedding generation
|
|
163
|
+
- Cosine similarity calculation
|
|
164
|
+
|
|
165
|
+
Example:
|
|
166
|
+
```python
|
|
167
|
+
config = SemanticMatcherConfig(similarity_threshold=0.85)
|
|
168
|
+
matcher = SemanticNameMatcher(config)
|
|
169
|
+
|
|
170
|
+
# Check if two names match semantically
|
|
171
|
+
result = await matcher.match("Albert Einstein", "A. Einstein")
|
|
172
|
+
if result.is_match:
|
|
173
|
+
print(f"Match! Similarity: {result.similarity}")
|
|
174
|
+
|
|
175
|
+
# Get embedding for a name (cached)
|
|
176
|
+
embedding = await matcher.get_embedding("Albert Einstein")
|
|
177
|
+
```
|
|
178
|
+
"""
|
|
179
|
+
|
|
180
|
+
def __init__(
    self,
    config: Optional[SemanticMatcherConfig] = None,
    llm_client: Optional[LLMClientProtocol] = None,
):
    """
    Set up the matcher.

    Args:
        config: Matching settings; a default SemanticMatcherConfig is
            created when this is omitted.
        llm_client: Client used to request embeddings. When omitted, a
            client is obtained from LLMClientFactory on first use.
    """
    resolved_config = config if config else SemanticMatcherConfig()
    self._config = resolved_config
    # The embedding cache is sized from the resolved configuration.
    self._cache = LRUEmbeddingCache(max_size=resolved_config.cache_max_size)
    self._llm_client = llm_client
    # Serializes lazy creation of the LLM client.
    self._lock = asyncio.Lock()
|
|
196
|
+
|
|
197
|
+
async def _get_llm_client(self) -> LLMClientProtocol:
    """Return the embedding client, creating it through the factory if needed."""
    async with self._lock:
        if self._llm_client is not None:
            return self._llm_client

        provider_name = self._config.embedding_provider
        try:
            resolved_provider = AIProvider(provider_name)
        except ValueError:
            # Not an AIProvider member: hand the raw name to the factory
            # so it can be tried as a custom provider.
            resolved_provider = provider_name

        self._llm_client = LLMClientFactory.get_client(resolved_provider)
        return self._llm_client
|
|
208
|
+
|
|
209
|
+
async def get_embedding(self, name: str) -> List[float]:
    """
    Return the embedding vector for a single name.

    The cache is consulted first; on a miss the LLM client is asked for
    the embedding and a non-empty result is stored in the cache.

    Args:
        name: Name to embed

    Returns:
        The embedding vector, or an empty list when matching is disabled,
        the client returns nothing usable, or the request fails.
    """
    if not self._config.enabled:
        return []

    hit = self._cache.get(name)
    if hit is not None:
        return hit

    client = await self._get_llm_client()
    try:
        vectors = await client.get_embeddings(
            [name],
            model=self._config.embedding_model,
        )
        first = vectors[0] if vectors else None
        if first:
            self._cache.set(name, first)
            return first
    except Exception as e:
        # Best effort: a failed lookup is logged and reported as "no embedding".
        logger.warning(f"Failed to generate embedding for '{name}': {e}")

    return []
|
|
244
|
+
|
|
245
|
+
async def get_embeddings_batch(
    self, names: List[str]
) -> Dict[str, List[float]]:
    """
    Get embeddings for multiple names in batch.

    Cached names are served from the cache; the remaining distinct names
    are sent to the LLM client in chunks of ``config.batch_size``.

    Args:
        names: List of names to embed

    Returns:
        Dict mapping every requested name to its embedding. A name maps to
        an empty list when matching is disabled, the provider returned no
        vector for it, or the request failed before it was embedded.
    """
    if not self._config.enabled:
        return {name: [] for name in names}

    results: Dict[str, List[float]] = {}
    pending: List[str] = []
    queued = set()

    # Resolve from cache; queue each uncached name only once so duplicates
    # in the input do not cause duplicate API work.
    for name in names:
        if name in results or name in queued:
            continue
        cached = self._cache.get(name)
        if cached is not None:
            results[name] = cached
        else:
            queued.add(name)
            pending.append(name)

    if pending:
        client = await self._get_llm_client()
        # Guard against a zero/negative batch size, which would otherwise
        # make range() raise or yield no batches at all.
        step = max(1, self._config.batch_size)
        try:
            for start in range(0, len(pending), step):
                batch = pending[start:start + step]
                embeddings = await client.get_embeddings(
                    batch,
                    model=self._config.embedding_model,
                )

                for name, embedding in zip(batch, embeddings):
                    if embedding:
                        self._cache.set(name, embedding)
                        results[name] = embedding
                    else:
                        results[name] = []
        except Exception as e:
            # Best effort: stop embedding, keep what already succeeded.
            logger.warning(f"Failed to generate batch embeddings: {e}")

        # Every requested name must appear in the result. setdefault keeps
        # embeddings from batches that succeeded before a failure and
        # fills names the provider skipped (short response).
        for name in pending:
            results.setdefault(name, [])

    return results
|
|
297
|
+
|
|
298
|
+
def cosine_similarity(
    self, embedding1: List[float], embedding2: List[float]
) -> float:
    """
    Compute the cosine similarity of two embedding vectors.

    Args:
        embedding1: First embedding vector
        embedding2: Second embedding vector

    Returns:
        The cosine of the angle between the vectors. 0.0 is returned when
        either vector is empty, the lengths differ (a warning is logged),
        or either vector has zero magnitude.
    """
    if not (embedding1 and embedding2):
        return 0.0

    size1, size2 = len(embedding1), len(embedding2)
    if size1 != size2:
        logger.warning(
            f"Embedding dimension mismatch: {len(embedding1)} vs {len(embedding2)}"
        )
        return 0.0

    # Euclidean norms of both vectors.
    norm1 = math.sqrt(sum(x * x for x in embedding1))
    norm2 = math.sqrt(sum(y * y for y in embedding2))
    if norm1 == 0 or norm2 == 0:
        # Direction is undefined for a zero vector.
        return 0.0

    inner = sum(x * y for x, y in zip(embedding1, embedding2))
    return inner / (norm1 * norm2)
|
|
329
|
+
|
|
330
|
+
async def match(
    self,
    name1: str,
    name2: str,
    threshold: Optional[float] = None,
) -> SemanticMatchResult:
    """
    Check if two names match semantically.

    Args:
        name1: First name
        name2: Second name
        threshold: Override similarity threshold (uses config default if None).
            An explicit 0.0 is honoured rather than replaced by the default.

    Returns:
        SemanticMatchResult with similarity score and match status. When
        matching is disabled in the config, the result has similarity 0.0,
        is_match False and no embeddings attached.
    """
    # Test against None explicitly: `threshold or default` would throw away
    # a caller-supplied 0.0 because 0.0 is falsy.
    if threshold is None:
        effective_threshold = self._config.similarity_threshold
    else:
        effective_threshold = threshold

    if not self._config.enabled:
        return SemanticMatchResult(
            name1=name1,
            name2=name2,
            similarity=0.0,
            is_match=False,
        )

    # Get embeddings
    embedding1 = await self.get_embedding(name1)
    embedding2 = await self.get_embedding(name2)

    # Calculate similarity
    similarity = self.cosine_similarity(embedding1, embedding2)

    # cosine_similarity() yields 0.0 when either embedding is empty; that
    # placeholder score must not count as a match for thresholds <= 0.
    has_both_embeddings = bool(embedding1) and bool(embedding2)
    is_match = has_both_embeddings and similarity >= effective_threshold

    return SemanticMatchResult(
        name1=name1,
        name2=name2,
        similarity=similarity,
        is_match=is_match,
        embedding1=embedding1,
        embedding2=embedding2,
    )
|
|
373
|
+
|
|
374
|
+
async def find_best_match(
    self,
    name: str,
    candidates: List[str],
    threshold: Optional[float] = None,
) -> Optional[Tuple[str, float]]:
    """
    Find the best semantic match for a name among candidates.

    Args:
        name: Name to match
        candidates: List of candidate names
        threshold: Minimum similarity threshold (inclusive). Uses the config
            default if None; an explicit 0.0 is honoured.

    Returns:
        Tuple of (best_match_name, similarity) or None if no candidate
        reaches the threshold. When several candidates share the highest
        similarity, the one appearing first in ``candidates`` is returned.
    """
    if not candidates or not self._config.enabled:
        return None

    # Test against None explicitly: `threshold or default` would throw away
    # a caller-supplied 0.0 because 0.0 is falsy.
    if threshold is None:
        effective_threshold = self._config.similarity_threshold
    else:
        effective_threshold = threshold

    # Get embedding for target name
    target_embedding = await self.get_embedding(name)
    if not target_embedding:
        return None

    # Get embeddings for all candidates in batch
    candidate_embeddings = await self.get_embeddings_batch(candidates)

    # Find best match
    best_match: Optional[str] = None
    best_similarity = effective_threshold

    for candidate in candidates:
        candidate_embedding = candidate_embeddings.get(candidate, [])
        if not candidate_embedding:
            continue

        similarity = self.cosine_similarity(target_embedding, candidate_embedding)
        if best_match is None:
            # First qualifying candidate: the threshold is inclusive, the
            # same comparison match() uses.
            is_better = similarity >= effective_threshold
        else:
            # Later candidates must strictly beat the current best so that
            # ties keep the earliest candidate.
            is_better = similarity > best_similarity

        if is_better:
            best_match = candidate
            best_similarity = similarity

    # Compare with None, not truthiness: an empty-string candidate is a
    # valid winner.
    if best_match is None:
        return None
    return (best_match, best_similarity)
|
|
419
|
+
|
|
420
|
+
def invalidate_cache(self, name: str) -> bool:
    """
    Drop the cached entry stored for a single name.

    Delegates to the ``invalidate`` method of the underlying cache.

    Args:
        name: Name whose cache entry should be dropped

    Returns:
        True if entry was removed
    """
    was_removed = self._cache.invalidate(name)
    return was_removed
|
|
431
|
+
|
|
432
|
+
def invalidate_cache_many(self, names: List[str]) -> int:
    """
    Drop the cached entries stored for several names at once.

    Delegates to the ``invalidate_many`` method of the underlying cache.

    Args:
        names: Names whose cache entries should be dropped

    Returns:
        Number of entries removed
    """
    removed_count = self._cache.invalidate_many(names)
    return removed_count
|
|
443
|
+
|
|
444
|
+
def clear_cache(self) -> None:
    """Empty the embedding cache by calling its ``clear`` method."""
    embedding_cache = self._cache
    embedding_cache.clear()
|
|
447
|
+
|
|
448
|
+
def get_cache_stats(self) -> Dict[str, Any]:
    """Return the statistics reported by the cache's ``get_stats`` method."""
    stats = self._cache.get_stats()
    return stats
|
|
451
|
+
|
|
452
|
+
@property
def cache(self) -> LRUEmbeddingCache:
    """Return the embedding cache object held in ``self._cache``."""
    embedding_cache = self._cache
    return embedding_cache
|
|
456
|
+
|
|
457
|
+
@property
def config(self) -> SemanticMatcherConfig:
    """Return the configuration object held in ``self._config``."""
    current_config = self._config
    return current_config
|
|
461
|
+
|
|
462
|
+
|
|
463
|
+
|
|
464
|
+
|