aiecs 1.0.1__py3-none-any.whl → 1.7.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of aiecs might be problematic. Click here for more details.
- aiecs/__init__.py +13 -16
- aiecs/__main__.py +7 -7
- aiecs/aiecs_client.py +269 -75
- aiecs/application/executors/operation_executor.py +79 -54
- aiecs/application/knowledge_graph/__init__.py +7 -0
- aiecs/application/knowledge_graph/builder/__init__.py +37 -0
- aiecs/application/knowledge_graph/builder/data_quality.py +302 -0
- aiecs/application/knowledge_graph/builder/data_reshaping.py +293 -0
- aiecs/application/knowledge_graph/builder/document_builder.py +369 -0
- aiecs/application/knowledge_graph/builder/graph_builder.py +490 -0
- aiecs/application/knowledge_graph/builder/import_optimizer.py +396 -0
- aiecs/application/knowledge_graph/builder/schema_inference.py +462 -0
- aiecs/application/knowledge_graph/builder/schema_mapping.py +563 -0
- aiecs/application/knowledge_graph/builder/structured_pipeline.py +1384 -0
- aiecs/application/knowledge_graph/builder/text_chunker.py +317 -0
- aiecs/application/knowledge_graph/extractors/__init__.py +27 -0
- aiecs/application/knowledge_graph/extractors/base.py +98 -0
- aiecs/application/knowledge_graph/extractors/llm_entity_extractor.py +422 -0
- aiecs/application/knowledge_graph/extractors/llm_relation_extractor.py +347 -0
- aiecs/application/knowledge_graph/extractors/ner_entity_extractor.py +241 -0
- aiecs/application/knowledge_graph/fusion/__init__.py +78 -0
- aiecs/application/knowledge_graph/fusion/ab_testing.py +395 -0
- aiecs/application/knowledge_graph/fusion/abbreviation_expander.py +327 -0
- aiecs/application/knowledge_graph/fusion/alias_index.py +597 -0
- aiecs/application/knowledge_graph/fusion/alias_matcher.py +384 -0
- aiecs/application/knowledge_graph/fusion/cache_coordinator.py +343 -0
- aiecs/application/knowledge_graph/fusion/entity_deduplicator.py +433 -0
- aiecs/application/knowledge_graph/fusion/entity_linker.py +511 -0
- aiecs/application/knowledge_graph/fusion/evaluation_dataset.py +240 -0
- aiecs/application/knowledge_graph/fusion/knowledge_fusion.py +632 -0
- aiecs/application/knowledge_graph/fusion/matching_config.py +489 -0
- aiecs/application/knowledge_graph/fusion/name_normalizer.py +352 -0
- aiecs/application/knowledge_graph/fusion/relation_deduplicator.py +183 -0
- aiecs/application/knowledge_graph/fusion/semantic_name_matcher.py +464 -0
- aiecs/application/knowledge_graph/fusion/similarity_pipeline.py +534 -0
- aiecs/application/knowledge_graph/pattern_matching/__init__.py +21 -0
- aiecs/application/knowledge_graph/pattern_matching/pattern_matcher.py +342 -0
- aiecs/application/knowledge_graph/pattern_matching/query_executor.py +366 -0
- aiecs/application/knowledge_graph/profiling/__init__.py +12 -0
- aiecs/application/knowledge_graph/profiling/query_plan_visualizer.py +195 -0
- aiecs/application/knowledge_graph/profiling/query_profiler.py +223 -0
- aiecs/application/knowledge_graph/reasoning/__init__.py +27 -0
- aiecs/application/knowledge_graph/reasoning/evidence_synthesis.py +341 -0
- aiecs/application/knowledge_graph/reasoning/inference_engine.py +500 -0
- aiecs/application/knowledge_graph/reasoning/logic_form_parser.py +163 -0
- aiecs/application/knowledge_graph/reasoning/logic_parser/__init__.py +79 -0
- aiecs/application/knowledge_graph/reasoning/logic_parser/ast_builder.py +513 -0
- aiecs/application/knowledge_graph/reasoning/logic_parser/ast_nodes.py +913 -0
- aiecs/application/knowledge_graph/reasoning/logic_parser/ast_validator.py +866 -0
- aiecs/application/knowledge_graph/reasoning/logic_parser/error_handler.py +475 -0
- aiecs/application/knowledge_graph/reasoning/logic_parser/parser.py +396 -0
- aiecs/application/knowledge_graph/reasoning/logic_parser/query_context.py +208 -0
- aiecs/application/knowledge_graph/reasoning/logic_query_integration.py +170 -0
- aiecs/application/knowledge_graph/reasoning/query_planner.py +855 -0
- aiecs/application/knowledge_graph/reasoning/reasoning_engine.py +518 -0
- aiecs/application/knowledge_graph/retrieval/__init__.py +27 -0
- aiecs/application/knowledge_graph/retrieval/query_intent_classifier.py +211 -0
- aiecs/application/knowledge_graph/retrieval/retrieval_strategies.py +592 -0
- aiecs/application/knowledge_graph/retrieval/strategy_types.py +23 -0
- aiecs/application/knowledge_graph/search/__init__.py +59 -0
- aiecs/application/knowledge_graph/search/hybrid_search.py +457 -0
- aiecs/application/knowledge_graph/search/reranker.py +293 -0
- aiecs/application/knowledge_graph/search/reranker_strategies.py +535 -0
- aiecs/application/knowledge_graph/search/text_similarity.py +392 -0
- aiecs/application/knowledge_graph/traversal/__init__.py +15 -0
- aiecs/application/knowledge_graph/traversal/enhanced_traversal.py +305 -0
- aiecs/application/knowledge_graph/traversal/path_scorer.py +271 -0
- aiecs/application/knowledge_graph/validators/__init__.py +13 -0
- aiecs/application/knowledge_graph/validators/relation_validator.py +239 -0
- aiecs/application/knowledge_graph/visualization/__init__.py +11 -0
- aiecs/application/knowledge_graph/visualization/graph_visualizer.py +313 -0
- aiecs/common/__init__.py +9 -0
- aiecs/common/knowledge_graph/__init__.py +17 -0
- aiecs/common/knowledge_graph/runnable.py +471 -0
- aiecs/config/__init__.py +20 -5
- aiecs/config/config.py +762 -31
- aiecs/config/graph_config.py +131 -0
- aiecs/config/tool_config.py +399 -0
- aiecs/core/__init__.py +29 -13
- aiecs/core/interface/__init__.py +2 -2
- aiecs/core/interface/execution_interface.py +22 -22
- aiecs/core/interface/storage_interface.py +37 -88
- aiecs/core/registry/__init__.py +31 -0
- aiecs/core/registry/service_registry.py +92 -0
- aiecs/domain/__init__.py +270 -1
- aiecs/domain/agent/__init__.py +191 -0
- aiecs/domain/agent/base_agent.py +3870 -0
- aiecs/domain/agent/exceptions.py +99 -0
- aiecs/domain/agent/graph_aware_mixin.py +569 -0
- aiecs/domain/agent/hybrid_agent.py +1435 -0
- aiecs/domain/agent/integration/__init__.py +29 -0
- aiecs/domain/agent/integration/context_compressor.py +216 -0
- aiecs/domain/agent/integration/context_engine_adapter.py +587 -0
- aiecs/domain/agent/integration/protocols.py +281 -0
- aiecs/domain/agent/integration/retry_policy.py +218 -0
- aiecs/domain/agent/integration/role_config.py +213 -0
- aiecs/domain/agent/knowledge_aware_agent.py +1892 -0
- aiecs/domain/agent/lifecycle.py +291 -0
- aiecs/domain/agent/llm_agent.py +692 -0
- aiecs/domain/agent/memory/__init__.py +12 -0
- aiecs/domain/agent/memory/conversation.py +1124 -0
- aiecs/domain/agent/migration/__init__.py +14 -0
- aiecs/domain/agent/migration/conversion.py +163 -0
- aiecs/domain/agent/migration/legacy_wrapper.py +86 -0
- aiecs/domain/agent/models.py +884 -0
- aiecs/domain/agent/observability.py +479 -0
- aiecs/domain/agent/persistence.py +449 -0
- aiecs/domain/agent/prompts/__init__.py +29 -0
- aiecs/domain/agent/prompts/builder.py +159 -0
- aiecs/domain/agent/prompts/formatters.py +187 -0
- aiecs/domain/agent/prompts/template.py +255 -0
- aiecs/domain/agent/registry.py +253 -0
- aiecs/domain/agent/tool_agent.py +444 -0
- aiecs/domain/agent/tools/__init__.py +15 -0
- aiecs/domain/agent/tools/schema_generator.py +364 -0
- aiecs/domain/community/__init__.py +155 -0
- aiecs/domain/community/agent_adapter.py +469 -0
- aiecs/domain/community/analytics.py +432 -0
- aiecs/domain/community/collaborative_workflow.py +648 -0
- aiecs/domain/community/communication_hub.py +634 -0
- aiecs/domain/community/community_builder.py +320 -0
- aiecs/domain/community/community_integration.py +796 -0
- aiecs/domain/community/community_manager.py +803 -0
- aiecs/domain/community/decision_engine.py +849 -0
- aiecs/domain/community/exceptions.py +231 -0
- aiecs/domain/community/models/__init__.py +33 -0
- aiecs/domain/community/models/community_models.py +234 -0
- aiecs/domain/community/resource_manager.py +461 -0
- aiecs/domain/community/shared_context_manager.py +589 -0
- aiecs/domain/context/__init__.py +40 -10
- aiecs/domain/context/context_engine.py +1910 -0
- aiecs/domain/context/conversation_models.py +87 -53
- aiecs/domain/context/graph_memory.py +582 -0
- aiecs/domain/execution/model.py +12 -4
- aiecs/domain/knowledge_graph/__init__.py +19 -0
- aiecs/domain/knowledge_graph/models/__init__.py +52 -0
- aiecs/domain/knowledge_graph/models/entity.py +148 -0
- aiecs/domain/knowledge_graph/models/evidence.py +178 -0
- aiecs/domain/knowledge_graph/models/inference_rule.py +184 -0
- aiecs/domain/knowledge_graph/models/path.py +171 -0
- aiecs/domain/knowledge_graph/models/path_pattern.py +171 -0
- aiecs/domain/knowledge_graph/models/query.py +261 -0
- aiecs/domain/knowledge_graph/models/query_plan.py +181 -0
- aiecs/domain/knowledge_graph/models/relation.py +202 -0
- aiecs/domain/knowledge_graph/schema/__init__.py +23 -0
- aiecs/domain/knowledge_graph/schema/entity_type.py +131 -0
- aiecs/domain/knowledge_graph/schema/graph_schema.py +253 -0
- aiecs/domain/knowledge_graph/schema/property_schema.py +143 -0
- aiecs/domain/knowledge_graph/schema/relation_type.py +163 -0
- aiecs/domain/knowledge_graph/schema/schema_manager.py +691 -0
- aiecs/domain/knowledge_graph/schema/type_enums.py +209 -0
- aiecs/domain/task/dsl_processor.py +172 -56
- aiecs/domain/task/model.py +20 -8
- aiecs/domain/task/task_context.py +27 -24
- aiecs/infrastructure/__init__.py +0 -2
- aiecs/infrastructure/graph_storage/__init__.py +11 -0
- aiecs/infrastructure/graph_storage/base.py +837 -0
- aiecs/infrastructure/graph_storage/batch_operations.py +458 -0
- aiecs/infrastructure/graph_storage/cache.py +424 -0
- aiecs/infrastructure/graph_storage/distributed.py +223 -0
- aiecs/infrastructure/graph_storage/error_handling.py +380 -0
- aiecs/infrastructure/graph_storage/graceful_degradation.py +294 -0
- aiecs/infrastructure/graph_storage/health_checks.py +378 -0
- aiecs/infrastructure/graph_storage/in_memory.py +1197 -0
- aiecs/infrastructure/graph_storage/index_optimization.py +446 -0
- aiecs/infrastructure/graph_storage/lazy_loading.py +431 -0
- aiecs/infrastructure/graph_storage/metrics.py +344 -0
- aiecs/infrastructure/graph_storage/migration.py +400 -0
- aiecs/infrastructure/graph_storage/pagination.py +483 -0
- aiecs/infrastructure/graph_storage/performance_monitoring.py +456 -0
- aiecs/infrastructure/graph_storage/postgres.py +1563 -0
- aiecs/infrastructure/graph_storage/property_storage.py +353 -0
- aiecs/infrastructure/graph_storage/protocols.py +76 -0
- aiecs/infrastructure/graph_storage/query_optimizer.py +642 -0
- aiecs/infrastructure/graph_storage/schema_cache.py +290 -0
- aiecs/infrastructure/graph_storage/sqlite.py +1373 -0
- aiecs/infrastructure/graph_storage/streaming.py +487 -0
- aiecs/infrastructure/graph_storage/tenant.py +412 -0
- aiecs/infrastructure/messaging/celery_task_manager.py +92 -54
- aiecs/infrastructure/messaging/websocket_manager.py +51 -35
- aiecs/infrastructure/monitoring/__init__.py +22 -0
- aiecs/infrastructure/monitoring/executor_metrics.py +45 -11
- aiecs/infrastructure/monitoring/global_metrics_manager.py +212 -0
- aiecs/infrastructure/monitoring/structured_logger.py +3 -7
- aiecs/infrastructure/monitoring/tracing_manager.py +63 -35
- aiecs/infrastructure/persistence/__init__.py +14 -1
- aiecs/infrastructure/persistence/context_engine_client.py +184 -0
- aiecs/infrastructure/persistence/database_manager.py +67 -43
- aiecs/infrastructure/persistence/file_storage.py +180 -103
- aiecs/infrastructure/persistence/redis_client.py +74 -21
- aiecs/llm/__init__.py +73 -25
- aiecs/llm/callbacks/__init__.py +11 -0
- aiecs/llm/{custom_callbacks.py → callbacks/custom_callbacks.py} +26 -19
- aiecs/llm/client_factory.py +224 -36
- aiecs/llm/client_resolver.py +155 -0
- aiecs/llm/clients/__init__.py +38 -0
- aiecs/llm/clients/base_client.py +324 -0
- aiecs/llm/clients/google_function_calling_mixin.py +457 -0
- aiecs/llm/clients/googleai_client.py +241 -0
- aiecs/llm/clients/openai_client.py +158 -0
- aiecs/llm/clients/openai_compatible_mixin.py +367 -0
- aiecs/llm/clients/vertex_client.py +897 -0
- aiecs/llm/clients/xai_client.py +201 -0
- aiecs/llm/config/__init__.py +51 -0
- aiecs/llm/config/config_loader.py +272 -0
- aiecs/llm/config/config_validator.py +206 -0
- aiecs/llm/config/model_config.py +143 -0
- aiecs/llm/protocols.py +149 -0
- aiecs/llm/utils/__init__.py +10 -0
- aiecs/llm/utils/validate_config.py +89 -0
- aiecs/main.py +140 -121
- aiecs/scripts/aid/VERSION_MANAGEMENT.md +138 -0
- aiecs/scripts/aid/__init__.py +19 -0
- aiecs/scripts/aid/module_checker.py +499 -0
- aiecs/scripts/aid/version_manager.py +235 -0
- aiecs/scripts/{DEPENDENCY_SYSTEM_SUMMARY.md → dependance_check/DEPENDENCY_SYSTEM_SUMMARY.md} +1 -0
- aiecs/scripts/{README_DEPENDENCY_CHECKER.md → dependance_check/README_DEPENDENCY_CHECKER.md} +1 -0
- aiecs/scripts/dependance_check/__init__.py +15 -0
- aiecs/scripts/dependance_check/dependency_checker.py +1835 -0
- aiecs/scripts/{dependency_fixer.py → dependance_check/dependency_fixer.py} +192 -90
- aiecs/scripts/{download_nlp_data.py → dependance_check/download_nlp_data.py} +203 -71
- aiecs/scripts/dependance_patch/__init__.py +7 -0
- aiecs/scripts/dependance_patch/fix_weasel/__init__.py +11 -0
- aiecs/scripts/{fix_weasel_validator.py → dependance_patch/fix_weasel/fix_weasel_validator.py} +21 -14
- aiecs/scripts/{patch_weasel_library.sh → dependance_patch/fix_weasel/patch_weasel_library.sh} +1 -1
- aiecs/scripts/knowledge_graph/__init__.py +3 -0
- aiecs/scripts/knowledge_graph/run_threshold_experiments.py +212 -0
- aiecs/scripts/migrations/multi_tenancy/README.md +142 -0
- aiecs/scripts/tools_develop/README.md +671 -0
- aiecs/scripts/tools_develop/README_CONFIG_CHECKER.md +273 -0
- aiecs/scripts/tools_develop/TOOLS_CONFIG_GUIDE.md +1287 -0
- aiecs/scripts/tools_develop/TOOL_AUTO_DISCOVERY.md +234 -0
- aiecs/scripts/tools_develop/__init__.py +21 -0
- aiecs/scripts/tools_develop/check_all_tools_config.py +548 -0
- aiecs/scripts/tools_develop/check_type_annotations.py +257 -0
- aiecs/scripts/tools_develop/pre-commit-schema-coverage.sh +66 -0
- aiecs/scripts/tools_develop/schema_coverage.py +511 -0
- aiecs/scripts/tools_develop/validate_tool_schemas.py +475 -0
- aiecs/scripts/tools_develop/verify_executor_config_fix.py +98 -0
- aiecs/scripts/tools_develop/verify_tools.py +352 -0
- aiecs/tasks/__init__.py +0 -1
- aiecs/tasks/worker.py +115 -47
- aiecs/tools/__init__.py +194 -72
- aiecs/tools/apisource/__init__.py +99 -0
- aiecs/tools/apisource/intelligence/__init__.py +19 -0
- aiecs/tools/apisource/intelligence/data_fusion.py +632 -0
- aiecs/tools/apisource/intelligence/query_analyzer.py +417 -0
- aiecs/tools/apisource/intelligence/search_enhancer.py +385 -0
- aiecs/tools/apisource/monitoring/__init__.py +9 -0
- aiecs/tools/apisource/monitoring/metrics.py +330 -0
- aiecs/tools/apisource/providers/__init__.py +112 -0
- aiecs/tools/apisource/providers/base.py +671 -0
- aiecs/tools/apisource/providers/census.py +397 -0
- aiecs/tools/apisource/providers/fred.py +535 -0
- aiecs/tools/apisource/providers/newsapi.py +409 -0
- aiecs/tools/apisource/providers/worldbank.py +352 -0
- aiecs/tools/apisource/reliability/__init__.py +12 -0
- aiecs/tools/apisource/reliability/error_handler.py +363 -0
- aiecs/tools/apisource/reliability/fallback_strategy.py +376 -0
- aiecs/tools/apisource/tool.py +832 -0
- aiecs/tools/apisource/utils/__init__.py +9 -0
- aiecs/tools/apisource/utils/validators.py +334 -0
- aiecs/tools/base_tool.py +415 -21
- aiecs/tools/docs/__init__.py +121 -0
- aiecs/tools/docs/ai_document_orchestrator.py +607 -0
- aiecs/tools/docs/ai_document_writer_orchestrator.py +2350 -0
- aiecs/tools/docs/content_insertion_tool.py +1320 -0
- aiecs/tools/docs/document_creator_tool.py +1323 -0
- aiecs/tools/docs/document_layout_tool.py +1160 -0
- aiecs/tools/docs/document_parser_tool.py +1011 -0
- aiecs/tools/docs/document_writer_tool.py +1829 -0
- aiecs/tools/knowledge_graph/__init__.py +17 -0
- aiecs/tools/knowledge_graph/graph_reasoning_tool.py +807 -0
- aiecs/tools/knowledge_graph/graph_search_tool.py +944 -0
- aiecs/tools/knowledge_graph/kg_builder_tool.py +524 -0
- aiecs/tools/langchain_adapter.py +300 -138
- aiecs/tools/schema_generator.py +455 -0
- aiecs/tools/search_tool/__init__.py +100 -0
- aiecs/tools/search_tool/analyzers.py +581 -0
- aiecs/tools/search_tool/cache.py +264 -0
- aiecs/tools/search_tool/constants.py +128 -0
- aiecs/tools/search_tool/context.py +224 -0
- aiecs/tools/search_tool/core.py +778 -0
- aiecs/tools/search_tool/deduplicator.py +119 -0
- aiecs/tools/search_tool/error_handler.py +242 -0
- aiecs/tools/search_tool/metrics.py +343 -0
- aiecs/tools/search_tool/rate_limiter.py +172 -0
- aiecs/tools/search_tool/schemas.py +275 -0
- aiecs/tools/statistics/__init__.py +80 -0
- aiecs/tools/statistics/ai_data_analysis_orchestrator.py +646 -0
- aiecs/tools/statistics/ai_insight_generator_tool.py +508 -0
- aiecs/tools/statistics/ai_report_orchestrator_tool.py +684 -0
- aiecs/tools/statistics/data_loader_tool.py +555 -0
- aiecs/tools/statistics/data_profiler_tool.py +638 -0
- aiecs/tools/statistics/data_transformer_tool.py +580 -0
- aiecs/tools/statistics/data_visualizer_tool.py +498 -0
- aiecs/tools/statistics/model_trainer_tool.py +507 -0
- aiecs/tools/statistics/statistical_analyzer_tool.py +472 -0
- aiecs/tools/task_tools/__init__.py +49 -36
- aiecs/tools/task_tools/chart_tool.py +200 -184
- aiecs/tools/task_tools/classfire_tool.py +268 -267
- aiecs/tools/task_tools/image_tool.py +175 -131
- aiecs/tools/task_tools/office_tool.py +226 -146
- aiecs/tools/task_tools/pandas_tool.py +477 -121
- aiecs/tools/task_tools/report_tool.py +390 -142
- aiecs/tools/task_tools/research_tool.py +149 -79
- aiecs/tools/task_tools/scraper_tool.py +339 -145
- aiecs/tools/task_tools/stats_tool.py +448 -209
- aiecs/tools/temp_file_manager.py +26 -24
- aiecs/tools/tool_executor/__init__.py +18 -16
- aiecs/tools/tool_executor/tool_executor.py +364 -52
- aiecs/utils/LLM_output_structor.py +74 -48
- aiecs/utils/__init__.py +14 -3
- aiecs/utils/base_callback.py +0 -3
- aiecs/utils/cache_provider.py +696 -0
- aiecs/utils/execution_utils.py +50 -31
- aiecs/utils/prompt_loader.py +1 -0
- aiecs/utils/token_usage_repository.py +37 -11
- aiecs/ws/socket_server.py +14 -4
- {aiecs-1.0.1.dist-info → aiecs-1.7.6.dist-info}/METADATA +52 -15
- aiecs-1.7.6.dist-info/RECORD +337 -0
- aiecs-1.7.6.dist-info/entry_points.txt +13 -0
- aiecs/config/registry.py +0 -19
- aiecs/domain/context/content_engine.py +0 -982
- aiecs/llm/base_client.py +0 -99
- aiecs/llm/openai_client.py +0 -125
- aiecs/llm/vertex_client.py +0 -186
- aiecs/llm/xai_client.py +0 -184
- aiecs/scripts/dependency_checker.py +0 -857
- aiecs/scripts/quick_dependency_check.py +0 -269
- aiecs/tools/task_tools/search_api.py +0 -7
- aiecs-1.0.1.dist-info/RECORD +0 -90
- aiecs-1.0.1.dist-info/entry_points.txt +0 -7
- /aiecs/scripts/{setup_nlp_data.sh → dependance_check/setup_nlp_data.sh} +0 -0
- /aiecs/scripts/{README_WEASEL_PATCH.md → dependance_patch/fix_weasel/README_WEASEL_PATCH.md} +0 -0
- /aiecs/scripts/{fix_weasel_validator.sh → dependance_patch/fix_weasel/fix_weasel_validator.sh} +0 -0
- /aiecs/scripts/{run_weasel_patch.sh → dependance_patch/fix_weasel/run_weasel_patch.sh} +0 -0
- {aiecs-1.0.1.dist-info → aiecs-1.7.6.dist-info}/WHEEL +0 -0
- {aiecs-1.0.1.dist-info → aiecs-1.7.6.dist-info}/licenses/LICENSE +0 -0
- {aiecs-1.0.1.dist-info → aiecs-1.7.6.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,240 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Evaluation Dataset for Knowledge Fusion Matching.
|
|
3
|
+
|
|
4
|
+
Contains curated test cases with known entity matches and non-matches,
|
|
5
|
+
including edge cases for threshold validation and A/B testing.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from dataclasses import dataclass
|
|
9
|
+
from typing import List, Optional, Tuple
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
@dataclass
class EntityPair:
    """
    A labelled pair of entity names used as one evaluation case.

    Attributes:
        name1: First entity name.
        name2: Second entity name.
        entity_type: Type of the entities (e.g., "Person", "Organization").
        should_match: Expected label - True when the two names are expected
            to be treated as the same entity, False otherwise.
        match_reason: Free-text explanation of the label (documentation only).
        domain: Domain context (e.g., "academic", "corporate", "medical").
    """

    # Field order is part of the interface: the dataset builders in this
    # module construct pairs positionally.
    name1: str
    name2: str
    entity_type: str = "Person"
    should_match: bool = True
    match_reason: str = ""
    domain: str = "general"
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
@dataclass
class EvaluationDataset:
    """
    Named collection of entity pairs for evaluation.

    Attributes:
        pairs: Entity pairs to evaluate.
        name: Name/description of the dataset.
    """

    # "EntityPair" is written as a forward reference (like the quoted
    # "EvaluationDataset" return types below) so the annotation is not
    # evaluated while this class body executes.
    pairs: List["EntityPair"]
    name: str = "default"

    def __len__(self) -> int:
        """Return the number of pairs in the dataset."""
        return len(self.pairs)

    def get_by_domain(self, domain: str) -> "EvaluationDataset":
        """
        Return a new dataset holding only the pairs whose ``domain`` equals
        ``domain``.

        The new dataset is named ``"<self.name>_<domain>"``; this dataset is
        left unchanged.
        """
        filtered = [p for p in self.pairs if p.domain == domain]
        return EvaluationDataset(pairs=filtered, name=f"{self.name}_{domain}")

    def get_by_type(self, entity_type: str) -> "EvaluationDataset":
        """
        Return a new dataset holding only the pairs whose ``entity_type``
        equals ``entity_type``.

        The new dataset is named ``"<self.name>_<entity_type>"``; this dataset
        is left unchanged.
        """
        filtered = [p for p in self.pairs if p.entity_type == entity_type]
        return EvaluationDataset(pairs=filtered, name=f"{self.name}_{entity_type}")

    def get_positive_pairs(self) -> List["EntityPair"]:
        """Return the pairs labelled as matches (``should_match`` is truthy)."""
        return [p for p in self.pairs if p.should_match]

    def get_negative_pairs(self) -> List["EntityPair"]:
        """Return the pairs labelled as non-matches (``should_match`` is falsy)."""
        return [p for p in self.pairs if not p.should_match]
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
def create_default_evaluation_dataset() -> EvaluationDataset:
    """
    Create the default evaluation dataset of labelled entity-name pairs.

    Includes:
    - Name variations (initials, titles, suffixes)
    - Abbreviations/acronyms
    - Normalization cases (case, whitespace, accents, punctuation)
    - Semantic matches
    - False positives (similar but different entities)

    Returns:
        An ``EvaluationDataset`` named ``"default_evaluation"``. A fresh
        list of freshly constructed ``EntityPair`` objects is built on
        every call.
    """
    pairs: List[EntityPair] = []

    # ============================================================================
    # Person Entity Matches (Academic Domain)
    # ============================================================================
    academic_person_pairs = [
        # Exact matches and title/suffix variations
        EntityPair("Albert Einstein", "Albert Einstein", "Person", True, "Exact match", "academic"),
        EntityPair("Dr. John Smith", "John Smith", "Person", True, "Title prefix", "academic"),
        EntityPair("John Smith, PhD", "John Smith", "Person", True, "Suffix", "academic"),
        EntityPair("Prof. Jane Doe", "Jane Doe", "Person", True, "Professor title", "academic"),

        # Initial variations
        EntityPair("A. Einstein", "Albert Einstein", "Person", True, "Initial expansion", "academic"),
        EntityPair("J. Smith", "John Smith", "Person", True, "Initial expansion", "academic"),
        EntityPair("J. K. Rowling", "Joanne Rowling", "Person", True, "Initial expansion", "academic"),
        EntityPair("M. L. King", "Martin Luther King", "Person", True, "Initial expansion", "academic"),

        # Name order variations
        EntityPair("Smith, John", "John Smith", "Person", True, "Name order", "academic"),
        EntityPair("Einstein, Albert", "Albert Einstein", "Person", True, "Name order", "academic"),

        # Title combinations
        EntityPair("Dr. A. Einstein", "Albert Einstein", "Person", True, "Title + initial", "academic"),
        EntityPair("Prof. J. Smith, PhD", "John Smith", "Person", True, "Title + initial + suffix", "academic"),

        # False positives (should NOT match)
        EntityPair("John Smith", "Jane Smith", "Person", False, "Different first names", "academic"),
        EntityPair("A. Einstein", "A. Newton", "Person", False, "Different surnames", "academic"),
        EntityPair("John Smith", "John Smyth", "Person", False, "Similar but different surname", "academic"),
    ]
    pairs.extend(academic_person_pairs)

    # ============================================================================
    # Organization Entity Matches (Corporate Domain)
    # ============================================================================
    corporate_org_pairs = [
        # Abbreviation matches
        EntityPair("MIT", "Massachusetts Institute of Technology", "Organization", True, "Abbreviation expansion", "corporate"),
        EntityPair("IBM", "International Business Machines", "Organization", True, "Abbreviation expansion", "corporate"),
        EntityPair("NASA", "National Aeronautics and Space Administration", "Organization", True, "Abbreviation expansion", "corporate"),
        EntityPair("NYC", "New York City", "Organization", True, "Abbreviation expansion", "corporate"),
        EntityPair("USA", "United States of America", "Organization", True, "Abbreviation expansion", "corporate"),

        # Name variations
        EntityPair("Apple Inc.", "Apple", "Organization", True, "Incorporation suffix", "corporate"),
        EntityPair("Apple Incorporated", "Apple Inc.", "Organization", True, "Full vs abbreviated suffix", "corporate"),
        EntityPair("Microsoft Corporation", "Microsoft", "Organization", True, "Corporation suffix", "corporate"),
        EntityPair("Microsoft Corp.", "Microsoft Corporation", "Organization", True, "Corp abbreviation", "corporate"),

        # Common name variations
        EntityPair("The New York Times", "New York Times", "Organization", True, "Article prefix", "corporate"),
        EntityPair("AT&T", "AT and T", "Organization", True, "Symbol expansion", "corporate"),

        # False positives
        EntityPair("Apple Inc.", "Apple Computer", "Organization", False, "Different company names", "corporate"),
        EntityPair("Microsoft", "Microsystems", "Organization", False, "Similar but different", "corporate"),
        EntityPair("IBM", "HP", "Organization", False, "Different abbreviations", "corporate"),
    ]
    pairs.extend(corporate_org_pairs)

    # ============================================================================
    # Medical Domain Entity Matches
    # ============================================================================
    medical_pairs = [
        # Medical abbreviations
        EntityPair("COVID-19", "Coronavirus Disease 2019", "Concept", True, "Medical abbreviation", "medical"),
        EntityPair("HIV", "Human Immunodeficiency Virus", "Concept", True, "Medical abbreviation", "medical"),
        EntityPair("AIDS", "Acquired Immunodeficiency Syndrome", "Concept", True, "Medical abbreviation", "medical"),
        EntityPair("DNA", "Deoxyribonucleic Acid", "Concept", True, "Scientific abbreviation", "medical"),
        EntityPair("RNA", "Ribonucleic Acid", "Concept", True, "Scientific abbreviation", "medical"),

        # Medical professional titles
        EntityPair("Dr. Sarah Johnson", "Sarah Johnson, MD", "Person", True, "MD suffix", "medical"),
        EntityPair("Dr. Michael Chen", "Michael Chen, M.D.", "Person", True, "M.D. suffix", "medical"),
        EntityPair("Dr. Emily Brown", "Emily Brown, Doctor", "Person", True, "Doctor title", "medical"),

        # Medical institution variations
        EntityPair("Mayo Clinic", "Mayo Medical Center", "Organization", True, "Clinic vs center", "medical"),
        EntityPair("Johns Hopkins Hospital", "Johns Hopkins", "Organization", True, "Hospital suffix", "medical"),

        # False positives
        EntityPair("COVID-19", "COVID-20", "Concept", False, "Different disease variant", "medical"),
        EntityPair("HIV", "HPV", "Concept", False, "Different viruses", "medical"),
    ]
    pairs.extend(medical_pairs)

    # ============================================================================
    # Edge Cases - Challenging Matches
    # ============================================================================
    edge_case_pairs = [
        # Very similar but different
        EntityPair("John Smith", "Jon Smith", "Person", False, "Different spelling", "general"),
        EntityPair("Steven", "Stephen", "Person", False, "Different spelling", "general"),
        EntityPair("Catherine", "Katherine", "Person", False, "Different spelling", "general"),

        # Substring cases
        EntityPair("New York", "New York City", "Organization", True, "Substring match", "general"),
        EntityPair("University", "State University", "Organization", False, "Too generic", "general"),

        # Special characters
        EntityPair("O'Brien", "OBrien", "Person", True, "Apostrophe normalization", "general"),
        EntityPair("José", "Jose", "Person", True, "Accent normalization", "general"),
        EntityPair("Müller", "Mueller", "Person", True, "Umlaut normalization", "general"),

        # Multiple word variations
        EntityPair("New York University", "NYU", "Organization", True, "Multi-word abbreviation", "general"),
        EntityPair("United States", "US", "Organization", True, "Country abbreviation", "general"),
        EntityPair("United Kingdom", "UK", "Organization", True, "Country abbreviation", "general"),

        # Case variations
        EntityPair("APPLE INC.", "apple inc.", "Organization", True, "Case normalization", "general"),
        EntityPair("JOHN SMITH", "john smith", "Person", True, "Case normalization", "general"),

        # Whitespace variations: the first name deliberately contains TWO
        # consecutive spaces. With identical strings on both sides these
        # cases would only repeat the exact-match check and never exercise
        # whitespace normalization.
        EntityPair("John  Smith", "John Smith", "Person", True, "Whitespace normalization", "general"),
        EntityPair("New  York", "New York", "Organization", True, "Whitespace normalization", "general"),
    ]
    pairs.extend(edge_case_pairs)

    # ============================================================================
    # Semantic Similarity Cases (should match via embeddings)
    # ============================================================================
    semantic_pairs = [
        # Synonyms and related terms
        EntityPair("Doctor", "Physician", "Person", True, "Semantic synonym", "medical"),
        EntityPair("Hospital", "Medical Center", "Organization", True, "Semantic similarity", "medical"),
        EntityPair("University", "College", "Organization", True, "Semantic similarity", "academic"),

        # Transliterations (if supported)
        EntityPair("München", "Munich", "Organization", True, "Transliteration", "general"),
        EntityPair("Moskva", "Moscow", "Organization", True, "Transliteration", "general"),

        # False semantic matches
        EntityPair("Apple", "Orange", "Organization", False, "Different fruits", "general"),
        EntityPair("Microsoft", "Apple", "Organization", False, "Different companies", "corporate"),
    ]
    pairs.extend(semantic_pairs)

    return EvaluationDataset(pairs=pairs, name="default_evaluation")
|
|
221
|
+
|
|
222
|
+
|
|
223
|
+
def create_minimal_evaluation_dataset() -> EvaluationDataset:
    """
    Create a minimal dataset for quick testing.

    Contains five hand-picked pairs (three expected matches, two expected
    non-matches) covering initial expansion, abbreviation, title prefix and
    clearly different names. The pairs mirror categories found in the default
    dataset but are defined independently here, so they are not guaranteed to
    be an exact subset of it.

    Returns:
        An ``EvaluationDataset`` named ``"minimal_evaluation"``.
    """
    pairs = [
        # Positive matches
        EntityPair("Albert Einstein", "A. Einstein", "Person", True, "Initial expansion", "academic"),
        EntityPair("MIT", "Massachusetts Institute of Technology", "Organization", True, "Abbreviation", "corporate"),
        EntityPair("Dr. John Smith", "John Smith", "Person", True, "Title prefix", "academic"),

        # Negative matches
        EntityPair("John Smith", "Jane Smith", "Person", False, "Different names", "academic"),
        EntityPair("Apple Inc.", "Microsoft", "Organization", False, "Different companies", "corporate"),
    ]

    return EvaluationDataset(pairs=pairs, name="minimal_evaluation")
|