aiecs 1.0.1__py3-none-any.whl → 1.7.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of aiecs might be problematic. Click here for more details.
- aiecs/__init__.py +13 -16
- aiecs/__main__.py +7 -7
- aiecs/aiecs_client.py +269 -75
- aiecs/application/executors/operation_executor.py +79 -54
- aiecs/application/knowledge_graph/__init__.py +7 -0
- aiecs/application/knowledge_graph/builder/__init__.py +37 -0
- aiecs/application/knowledge_graph/builder/data_quality.py +302 -0
- aiecs/application/knowledge_graph/builder/data_reshaping.py +293 -0
- aiecs/application/knowledge_graph/builder/document_builder.py +369 -0
- aiecs/application/knowledge_graph/builder/graph_builder.py +490 -0
- aiecs/application/knowledge_graph/builder/import_optimizer.py +396 -0
- aiecs/application/knowledge_graph/builder/schema_inference.py +462 -0
- aiecs/application/knowledge_graph/builder/schema_mapping.py +563 -0
- aiecs/application/knowledge_graph/builder/structured_pipeline.py +1384 -0
- aiecs/application/knowledge_graph/builder/text_chunker.py +317 -0
- aiecs/application/knowledge_graph/extractors/__init__.py +27 -0
- aiecs/application/knowledge_graph/extractors/base.py +98 -0
- aiecs/application/knowledge_graph/extractors/llm_entity_extractor.py +422 -0
- aiecs/application/knowledge_graph/extractors/llm_relation_extractor.py +347 -0
- aiecs/application/knowledge_graph/extractors/ner_entity_extractor.py +241 -0
- aiecs/application/knowledge_graph/fusion/__init__.py +78 -0
- aiecs/application/knowledge_graph/fusion/ab_testing.py +395 -0
- aiecs/application/knowledge_graph/fusion/abbreviation_expander.py +327 -0
- aiecs/application/knowledge_graph/fusion/alias_index.py +597 -0
- aiecs/application/knowledge_graph/fusion/alias_matcher.py +384 -0
- aiecs/application/knowledge_graph/fusion/cache_coordinator.py +343 -0
- aiecs/application/knowledge_graph/fusion/entity_deduplicator.py +433 -0
- aiecs/application/knowledge_graph/fusion/entity_linker.py +511 -0
- aiecs/application/knowledge_graph/fusion/evaluation_dataset.py +240 -0
- aiecs/application/knowledge_graph/fusion/knowledge_fusion.py +632 -0
- aiecs/application/knowledge_graph/fusion/matching_config.py +489 -0
- aiecs/application/knowledge_graph/fusion/name_normalizer.py +352 -0
- aiecs/application/knowledge_graph/fusion/relation_deduplicator.py +183 -0
- aiecs/application/knowledge_graph/fusion/semantic_name_matcher.py +464 -0
- aiecs/application/knowledge_graph/fusion/similarity_pipeline.py +534 -0
- aiecs/application/knowledge_graph/pattern_matching/__init__.py +21 -0
- aiecs/application/knowledge_graph/pattern_matching/pattern_matcher.py +342 -0
- aiecs/application/knowledge_graph/pattern_matching/query_executor.py +366 -0
- aiecs/application/knowledge_graph/profiling/__init__.py +12 -0
- aiecs/application/knowledge_graph/profiling/query_plan_visualizer.py +195 -0
- aiecs/application/knowledge_graph/profiling/query_profiler.py +223 -0
- aiecs/application/knowledge_graph/reasoning/__init__.py +27 -0
- aiecs/application/knowledge_graph/reasoning/evidence_synthesis.py +341 -0
- aiecs/application/knowledge_graph/reasoning/inference_engine.py +500 -0
- aiecs/application/knowledge_graph/reasoning/logic_form_parser.py +163 -0
- aiecs/application/knowledge_graph/reasoning/logic_parser/__init__.py +79 -0
- aiecs/application/knowledge_graph/reasoning/logic_parser/ast_builder.py +513 -0
- aiecs/application/knowledge_graph/reasoning/logic_parser/ast_nodes.py +913 -0
- aiecs/application/knowledge_graph/reasoning/logic_parser/ast_validator.py +866 -0
- aiecs/application/knowledge_graph/reasoning/logic_parser/error_handler.py +475 -0
- aiecs/application/knowledge_graph/reasoning/logic_parser/parser.py +396 -0
- aiecs/application/knowledge_graph/reasoning/logic_parser/query_context.py +208 -0
- aiecs/application/knowledge_graph/reasoning/logic_query_integration.py +170 -0
- aiecs/application/knowledge_graph/reasoning/query_planner.py +855 -0
- aiecs/application/knowledge_graph/reasoning/reasoning_engine.py +518 -0
- aiecs/application/knowledge_graph/retrieval/__init__.py +27 -0
- aiecs/application/knowledge_graph/retrieval/query_intent_classifier.py +211 -0
- aiecs/application/knowledge_graph/retrieval/retrieval_strategies.py +592 -0
- aiecs/application/knowledge_graph/retrieval/strategy_types.py +23 -0
- aiecs/application/knowledge_graph/search/__init__.py +59 -0
- aiecs/application/knowledge_graph/search/hybrid_search.py +457 -0
- aiecs/application/knowledge_graph/search/reranker.py +293 -0
- aiecs/application/knowledge_graph/search/reranker_strategies.py +535 -0
- aiecs/application/knowledge_graph/search/text_similarity.py +392 -0
- aiecs/application/knowledge_graph/traversal/__init__.py +15 -0
- aiecs/application/knowledge_graph/traversal/enhanced_traversal.py +305 -0
- aiecs/application/knowledge_graph/traversal/path_scorer.py +271 -0
- aiecs/application/knowledge_graph/validators/__init__.py +13 -0
- aiecs/application/knowledge_graph/validators/relation_validator.py +239 -0
- aiecs/application/knowledge_graph/visualization/__init__.py +11 -0
- aiecs/application/knowledge_graph/visualization/graph_visualizer.py +313 -0
- aiecs/common/__init__.py +9 -0
- aiecs/common/knowledge_graph/__init__.py +17 -0
- aiecs/common/knowledge_graph/runnable.py +471 -0
- aiecs/config/__init__.py +20 -5
- aiecs/config/config.py +762 -31
- aiecs/config/graph_config.py +131 -0
- aiecs/config/tool_config.py +399 -0
- aiecs/core/__init__.py +29 -13
- aiecs/core/interface/__init__.py +2 -2
- aiecs/core/interface/execution_interface.py +22 -22
- aiecs/core/interface/storage_interface.py +37 -88
- aiecs/core/registry/__init__.py +31 -0
- aiecs/core/registry/service_registry.py +92 -0
- aiecs/domain/__init__.py +270 -1
- aiecs/domain/agent/__init__.py +191 -0
- aiecs/domain/agent/base_agent.py +3870 -0
- aiecs/domain/agent/exceptions.py +99 -0
- aiecs/domain/agent/graph_aware_mixin.py +569 -0
- aiecs/domain/agent/hybrid_agent.py +1435 -0
- aiecs/domain/agent/integration/__init__.py +29 -0
- aiecs/domain/agent/integration/context_compressor.py +216 -0
- aiecs/domain/agent/integration/context_engine_adapter.py +587 -0
- aiecs/domain/agent/integration/protocols.py +281 -0
- aiecs/domain/agent/integration/retry_policy.py +218 -0
- aiecs/domain/agent/integration/role_config.py +213 -0
- aiecs/domain/agent/knowledge_aware_agent.py +1892 -0
- aiecs/domain/agent/lifecycle.py +291 -0
- aiecs/domain/agent/llm_agent.py +692 -0
- aiecs/domain/agent/memory/__init__.py +12 -0
- aiecs/domain/agent/memory/conversation.py +1124 -0
- aiecs/domain/agent/migration/__init__.py +14 -0
- aiecs/domain/agent/migration/conversion.py +163 -0
- aiecs/domain/agent/migration/legacy_wrapper.py +86 -0
- aiecs/domain/agent/models.py +884 -0
- aiecs/domain/agent/observability.py +479 -0
- aiecs/domain/agent/persistence.py +449 -0
- aiecs/domain/agent/prompts/__init__.py +29 -0
- aiecs/domain/agent/prompts/builder.py +159 -0
- aiecs/domain/agent/prompts/formatters.py +187 -0
- aiecs/domain/agent/prompts/template.py +255 -0
- aiecs/domain/agent/registry.py +253 -0
- aiecs/domain/agent/tool_agent.py +444 -0
- aiecs/domain/agent/tools/__init__.py +15 -0
- aiecs/domain/agent/tools/schema_generator.py +364 -0
- aiecs/domain/community/__init__.py +155 -0
- aiecs/domain/community/agent_adapter.py +469 -0
- aiecs/domain/community/analytics.py +432 -0
- aiecs/domain/community/collaborative_workflow.py +648 -0
- aiecs/domain/community/communication_hub.py +634 -0
- aiecs/domain/community/community_builder.py +320 -0
- aiecs/domain/community/community_integration.py +796 -0
- aiecs/domain/community/community_manager.py +803 -0
- aiecs/domain/community/decision_engine.py +849 -0
- aiecs/domain/community/exceptions.py +231 -0
- aiecs/domain/community/models/__init__.py +33 -0
- aiecs/domain/community/models/community_models.py +234 -0
- aiecs/domain/community/resource_manager.py +461 -0
- aiecs/domain/community/shared_context_manager.py +589 -0
- aiecs/domain/context/__init__.py +40 -10
- aiecs/domain/context/context_engine.py +1910 -0
- aiecs/domain/context/conversation_models.py +87 -53
- aiecs/domain/context/graph_memory.py +582 -0
- aiecs/domain/execution/model.py +12 -4
- aiecs/domain/knowledge_graph/__init__.py +19 -0
- aiecs/domain/knowledge_graph/models/__init__.py +52 -0
- aiecs/domain/knowledge_graph/models/entity.py +148 -0
- aiecs/domain/knowledge_graph/models/evidence.py +178 -0
- aiecs/domain/knowledge_graph/models/inference_rule.py +184 -0
- aiecs/domain/knowledge_graph/models/path.py +171 -0
- aiecs/domain/knowledge_graph/models/path_pattern.py +171 -0
- aiecs/domain/knowledge_graph/models/query.py +261 -0
- aiecs/domain/knowledge_graph/models/query_plan.py +181 -0
- aiecs/domain/knowledge_graph/models/relation.py +202 -0
- aiecs/domain/knowledge_graph/schema/__init__.py +23 -0
- aiecs/domain/knowledge_graph/schema/entity_type.py +131 -0
- aiecs/domain/knowledge_graph/schema/graph_schema.py +253 -0
- aiecs/domain/knowledge_graph/schema/property_schema.py +143 -0
- aiecs/domain/knowledge_graph/schema/relation_type.py +163 -0
- aiecs/domain/knowledge_graph/schema/schema_manager.py +691 -0
- aiecs/domain/knowledge_graph/schema/type_enums.py +209 -0
- aiecs/domain/task/dsl_processor.py +172 -56
- aiecs/domain/task/model.py +20 -8
- aiecs/domain/task/task_context.py +27 -24
- aiecs/infrastructure/__init__.py +0 -2
- aiecs/infrastructure/graph_storage/__init__.py +11 -0
- aiecs/infrastructure/graph_storage/base.py +837 -0
- aiecs/infrastructure/graph_storage/batch_operations.py +458 -0
- aiecs/infrastructure/graph_storage/cache.py +424 -0
- aiecs/infrastructure/graph_storage/distributed.py +223 -0
- aiecs/infrastructure/graph_storage/error_handling.py +380 -0
- aiecs/infrastructure/graph_storage/graceful_degradation.py +294 -0
- aiecs/infrastructure/graph_storage/health_checks.py +378 -0
- aiecs/infrastructure/graph_storage/in_memory.py +1197 -0
- aiecs/infrastructure/graph_storage/index_optimization.py +446 -0
- aiecs/infrastructure/graph_storage/lazy_loading.py +431 -0
- aiecs/infrastructure/graph_storage/metrics.py +344 -0
- aiecs/infrastructure/graph_storage/migration.py +400 -0
- aiecs/infrastructure/graph_storage/pagination.py +483 -0
- aiecs/infrastructure/graph_storage/performance_monitoring.py +456 -0
- aiecs/infrastructure/graph_storage/postgres.py +1563 -0
- aiecs/infrastructure/graph_storage/property_storage.py +353 -0
- aiecs/infrastructure/graph_storage/protocols.py +76 -0
- aiecs/infrastructure/graph_storage/query_optimizer.py +642 -0
- aiecs/infrastructure/graph_storage/schema_cache.py +290 -0
- aiecs/infrastructure/graph_storage/sqlite.py +1373 -0
- aiecs/infrastructure/graph_storage/streaming.py +487 -0
- aiecs/infrastructure/graph_storage/tenant.py +412 -0
- aiecs/infrastructure/messaging/celery_task_manager.py +92 -54
- aiecs/infrastructure/messaging/websocket_manager.py +51 -35
- aiecs/infrastructure/monitoring/__init__.py +22 -0
- aiecs/infrastructure/monitoring/executor_metrics.py +45 -11
- aiecs/infrastructure/monitoring/global_metrics_manager.py +212 -0
- aiecs/infrastructure/monitoring/structured_logger.py +3 -7
- aiecs/infrastructure/monitoring/tracing_manager.py +63 -35
- aiecs/infrastructure/persistence/__init__.py +14 -1
- aiecs/infrastructure/persistence/context_engine_client.py +184 -0
- aiecs/infrastructure/persistence/database_manager.py +67 -43
- aiecs/infrastructure/persistence/file_storage.py +180 -103
- aiecs/infrastructure/persistence/redis_client.py +74 -21
- aiecs/llm/__init__.py +73 -25
- aiecs/llm/callbacks/__init__.py +11 -0
- aiecs/llm/{custom_callbacks.py → callbacks/custom_callbacks.py} +26 -19
- aiecs/llm/client_factory.py +224 -36
- aiecs/llm/client_resolver.py +155 -0
- aiecs/llm/clients/__init__.py +38 -0
- aiecs/llm/clients/base_client.py +324 -0
- aiecs/llm/clients/google_function_calling_mixin.py +457 -0
- aiecs/llm/clients/googleai_client.py +241 -0
- aiecs/llm/clients/openai_client.py +158 -0
- aiecs/llm/clients/openai_compatible_mixin.py +367 -0
- aiecs/llm/clients/vertex_client.py +897 -0
- aiecs/llm/clients/xai_client.py +201 -0
- aiecs/llm/config/__init__.py +51 -0
- aiecs/llm/config/config_loader.py +272 -0
- aiecs/llm/config/config_validator.py +206 -0
- aiecs/llm/config/model_config.py +143 -0
- aiecs/llm/protocols.py +149 -0
- aiecs/llm/utils/__init__.py +10 -0
- aiecs/llm/utils/validate_config.py +89 -0
- aiecs/main.py +140 -121
- aiecs/scripts/aid/VERSION_MANAGEMENT.md +138 -0
- aiecs/scripts/aid/__init__.py +19 -0
- aiecs/scripts/aid/module_checker.py +499 -0
- aiecs/scripts/aid/version_manager.py +235 -0
- aiecs/scripts/{DEPENDENCY_SYSTEM_SUMMARY.md → dependance_check/DEPENDENCY_SYSTEM_SUMMARY.md} +1 -0
- aiecs/scripts/{README_DEPENDENCY_CHECKER.md → dependance_check/README_DEPENDENCY_CHECKER.md} +1 -0
- aiecs/scripts/dependance_check/__init__.py +15 -0
- aiecs/scripts/dependance_check/dependency_checker.py +1835 -0
- aiecs/scripts/{dependency_fixer.py → dependance_check/dependency_fixer.py} +192 -90
- aiecs/scripts/{download_nlp_data.py → dependance_check/download_nlp_data.py} +203 -71
- aiecs/scripts/dependance_patch/__init__.py +7 -0
- aiecs/scripts/dependance_patch/fix_weasel/__init__.py +11 -0
- aiecs/scripts/{fix_weasel_validator.py → dependance_patch/fix_weasel/fix_weasel_validator.py} +21 -14
- aiecs/scripts/{patch_weasel_library.sh → dependance_patch/fix_weasel/patch_weasel_library.sh} +1 -1
- aiecs/scripts/knowledge_graph/__init__.py +3 -0
- aiecs/scripts/knowledge_graph/run_threshold_experiments.py +212 -0
- aiecs/scripts/migrations/multi_tenancy/README.md +142 -0
- aiecs/scripts/tools_develop/README.md +671 -0
- aiecs/scripts/tools_develop/README_CONFIG_CHECKER.md +273 -0
- aiecs/scripts/tools_develop/TOOLS_CONFIG_GUIDE.md +1287 -0
- aiecs/scripts/tools_develop/TOOL_AUTO_DISCOVERY.md +234 -0
- aiecs/scripts/tools_develop/__init__.py +21 -0
- aiecs/scripts/tools_develop/check_all_tools_config.py +548 -0
- aiecs/scripts/tools_develop/check_type_annotations.py +257 -0
- aiecs/scripts/tools_develop/pre-commit-schema-coverage.sh +66 -0
- aiecs/scripts/tools_develop/schema_coverage.py +511 -0
- aiecs/scripts/tools_develop/validate_tool_schemas.py +475 -0
- aiecs/scripts/tools_develop/verify_executor_config_fix.py +98 -0
- aiecs/scripts/tools_develop/verify_tools.py +352 -0
- aiecs/tasks/__init__.py +0 -1
- aiecs/tasks/worker.py +115 -47
- aiecs/tools/__init__.py +194 -72
- aiecs/tools/apisource/__init__.py +99 -0
- aiecs/tools/apisource/intelligence/__init__.py +19 -0
- aiecs/tools/apisource/intelligence/data_fusion.py +632 -0
- aiecs/tools/apisource/intelligence/query_analyzer.py +417 -0
- aiecs/tools/apisource/intelligence/search_enhancer.py +385 -0
- aiecs/tools/apisource/monitoring/__init__.py +9 -0
- aiecs/tools/apisource/monitoring/metrics.py +330 -0
- aiecs/tools/apisource/providers/__init__.py +112 -0
- aiecs/tools/apisource/providers/base.py +671 -0
- aiecs/tools/apisource/providers/census.py +397 -0
- aiecs/tools/apisource/providers/fred.py +535 -0
- aiecs/tools/apisource/providers/newsapi.py +409 -0
- aiecs/tools/apisource/providers/worldbank.py +352 -0
- aiecs/tools/apisource/reliability/__init__.py +12 -0
- aiecs/tools/apisource/reliability/error_handler.py +363 -0
- aiecs/tools/apisource/reliability/fallback_strategy.py +376 -0
- aiecs/tools/apisource/tool.py +832 -0
- aiecs/tools/apisource/utils/__init__.py +9 -0
- aiecs/tools/apisource/utils/validators.py +334 -0
- aiecs/tools/base_tool.py +415 -21
- aiecs/tools/docs/__init__.py +121 -0
- aiecs/tools/docs/ai_document_orchestrator.py +607 -0
- aiecs/tools/docs/ai_document_writer_orchestrator.py +2350 -0
- aiecs/tools/docs/content_insertion_tool.py +1320 -0
- aiecs/tools/docs/document_creator_tool.py +1323 -0
- aiecs/tools/docs/document_layout_tool.py +1160 -0
- aiecs/tools/docs/document_parser_tool.py +1011 -0
- aiecs/tools/docs/document_writer_tool.py +1829 -0
- aiecs/tools/knowledge_graph/__init__.py +17 -0
- aiecs/tools/knowledge_graph/graph_reasoning_tool.py +807 -0
- aiecs/tools/knowledge_graph/graph_search_tool.py +944 -0
- aiecs/tools/knowledge_graph/kg_builder_tool.py +524 -0
- aiecs/tools/langchain_adapter.py +300 -138
- aiecs/tools/schema_generator.py +455 -0
- aiecs/tools/search_tool/__init__.py +100 -0
- aiecs/tools/search_tool/analyzers.py +581 -0
- aiecs/tools/search_tool/cache.py +264 -0
- aiecs/tools/search_tool/constants.py +128 -0
- aiecs/tools/search_tool/context.py +224 -0
- aiecs/tools/search_tool/core.py +778 -0
- aiecs/tools/search_tool/deduplicator.py +119 -0
- aiecs/tools/search_tool/error_handler.py +242 -0
- aiecs/tools/search_tool/metrics.py +343 -0
- aiecs/tools/search_tool/rate_limiter.py +172 -0
- aiecs/tools/search_tool/schemas.py +275 -0
- aiecs/tools/statistics/__init__.py +80 -0
- aiecs/tools/statistics/ai_data_analysis_orchestrator.py +646 -0
- aiecs/tools/statistics/ai_insight_generator_tool.py +508 -0
- aiecs/tools/statistics/ai_report_orchestrator_tool.py +684 -0
- aiecs/tools/statistics/data_loader_tool.py +555 -0
- aiecs/tools/statistics/data_profiler_tool.py +638 -0
- aiecs/tools/statistics/data_transformer_tool.py +580 -0
- aiecs/tools/statistics/data_visualizer_tool.py +498 -0
- aiecs/tools/statistics/model_trainer_tool.py +507 -0
- aiecs/tools/statistics/statistical_analyzer_tool.py +472 -0
- aiecs/tools/task_tools/__init__.py +49 -36
- aiecs/tools/task_tools/chart_tool.py +200 -184
- aiecs/tools/task_tools/classfire_tool.py +268 -267
- aiecs/tools/task_tools/image_tool.py +175 -131
- aiecs/tools/task_tools/office_tool.py +226 -146
- aiecs/tools/task_tools/pandas_tool.py +477 -121
- aiecs/tools/task_tools/report_tool.py +390 -142
- aiecs/tools/task_tools/research_tool.py +149 -79
- aiecs/tools/task_tools/scraper_tool.py +339 -145
- aiecs/tools/task_tools/stats_tool.py +448 -209
- aiecs/tools/temp_file_manager.py +26 -24
- aiecs/tools/tool_executor/__init__.py +18 -16
- aiecs/tools/tool_executor/tool_executor.py +364 -52
- aiecs/utils/LLM_output_structor.py +74 -48
- aiecs/utils/__init__.py +14 -3
- aiecs/utils/base_callback.py +0 -3
- aiecs/utils/cache_provider.py +696 -0
- aiecs/utils/execution_utils.py +50 -31
- aiecs/utils/prompt_loader.py +1 -0
- aiecs/utils/token_usage_repository.py +37 -11
- aiecs/ws/socket_server.py +14 -4
- {aiecs-1.0.1.dist-info → aiecs-1.7.6.dist-info}/METADATA +52 -15
- aiecs-1.7.6.dist-info/RECORD +337 -0
- aiecs-1.7.6.dist-info/entry_points.txt +13 -0
- aiecs/config/registry.py +0 -19
- aiecs/domain/context/content_engine.py +0 -982
- aiecs/llm/base_client.py +0 -99
- aiecs/llm/openai_client.py +0 -125
- aiecs/llm/vertex_client.py +0 -186
- aiecs/llm/xai_client.py +0 -184
- aiecs/scripts/dependency_checker.py +0 -857
- aiecs/scripts/quick_dependency_check.py +0 -269
- aiecs/tools/task_tools/search_api.py +0 -7
- aiecs-1.0.1.dist-info/RECORD +0 -90
- aiecs-1.0.1.dist-info/entry_points.txt +0 -7
- /aiecs/scripts/{setup_nlp_data.sh → dependance_check/setup_nlp_data.sh} +0 -0
- /aiecs/scripts/{README_WEASEL_PATCH.md → dependance_patch/fix_weasel/README_WEASEL_PATCH.md} +0 -0
- /aiecs/scripts/{fix_weasel_validator.sh → dependance_patch/fix_weasel/fix_weasel_validator.sh} +0 -0
- /aiecs/scripts/{run_weasel_patch.sh → dependance_patch/fix_weasel/run_weasel_patch.sh} +0 -0
- {aiecs-1.0.1.dist-info → aiecs-1.7.6.dist-info}/WHEEL +0 -0
- {aiecs-1.0.1.dist-info → aiecs-1.7.6.dist-info}/licenses/LICENSE +0 -0
- {aiecs-1.0.1.dist-info → aiecs-1.7.6.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,347 @@
|
|
|
1
|
+
"""
|
|
2
|
+
LLM-based Relation Extractor
|
|
3
|
+
|
|
4
|
+
Extracts relations between entities using Large Language Models.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
import json
|
|
8
|
+
import uuid
|
|
9
|
+
from typing import List, Optional
|
|
10
|
+
from aiecs.application.knowledge_graph.extractors.base import RelationExtractor
|
|
11
|
+
from aiecs.domain.knowledge_graph.models.entity import Entity
|
|
12
|
+
from aiecs.domain.knowledge_graph.models.relation import Relation
|
|
13
|
+
from aiecs.domain.knowledge_graph.schema.graph_schema import GraphSchema
|
|
14
|
+
from aiecs.llm import get_llm_manager, AIProvider, LLMClientManager
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class LLMRelationExtractor(RelationExtractor):
|
|
18
|
+
"""
|
|
19
|
+
Extract relations between entities using LLMs
|
|
20
|
+
|
|
21
|
+
Given text and a list of entities, identifies relationships between them.
|
|
22
|
+
Uses LLMs to understand semantic relationships and extract structured relations.
|
|
23
|
+
|
|
24
|
+
Features:
|
|
25
|
+
- Schema-aware extraction (uses relation types from schema)
|
|
26
|
+
- Entity-aware (only extracts relations between known entities)
|
|
27
|
+
- Property extraction (relation properties/attributes)
|
|
28
|
+
- Confidence scoring
|
|
29
|
+
- Directional relation support
|
|
30
|
+
|
|
31
|
+
Example:
|
|
32
|
+
```python
|
|
33
|
+
extractor = LLMRelationExtractor(schema=graph_schema)
|
|
34
|
+
|
|
35
|
+
alice = Entity(id="e1", type="Person", properties={"name": "Alice"})
|
|
36
|
+
tech_corp = Entity(id="e2", type="Company", properties={"name": "Tech Corp"})
|
|
37
|
+
|
|
38
|
+
relations = await extractor.extract_relations(
|
|
39
|
+
text="Alice works as a senior engineer at Tech Corp.",
|
|
40
|
+
entities=[alice, tech_corp]
|
|
41
|
+
)
|
|
42
|
+
# Returns: [
|
|
43
|
+
# Relation(
|
|
44
|
+
# source_id="e1",
|
|
45
|
+
# target_id="e2",
|
|
46
|
+
# relation_type="WORKS_FOR",
|
|
47
|
+
# properties={"title": "senior engineer"}
|
|
48
|
+
# )
|
|
49
|
+
# ]
|
|
50
|
+
```
|
|
51
|
+
"""
|
|
52
|
+
|
|
53
|
+
def __init__(
|
|
54
|
+
self,
|
|
55
|
+
schema: Optional[GraphSchema] = None,
|
|
56
|
+
provider: Optional[AIProvider] = None,
|
|
57
|
+
model: Optional[str] = None,
|
|
58
|
+
temperature: float = 0.1,
|
|
59
|
+
max_tokens: Optional[int] = 2000,
|
|
60
|
+
):
|
|
61
|
+
"""
|
|
62
|
+
Initialize LLM relation extractor
|
|
63
|
+
|
|
64
|
+
Args:
|
|
65
|
+
schema: Optional GraphSchema to guide extraction
|
|
66
|
+
provider: LLM provider (default: Vertex AI)
|
|
67
|
+
model: Specific model to use
|
|
68
|
+
temperature: LLM temperature (low for deterministic extraction)
|
|
69
|
+
max_tokens: Maximum tokens in response
|
|
70
|
+
"""
|
|
71
|
+
self.schema = schema
|
|
72
|
+
self.provider = provider
|
|
73
|
+
self.model = model
|
|
74
|
+
self.temperature = temperature
|
|
75
|
+
self.max_tokens = max_tokens
|
|
76
|
+
self._llm_manager: Optional[LLMClientManager] = None # Lazy-loaded in async methods
|
|
77
|
+
|
|
78
|
+
async def extract_relations(
    self,
    text: str,
    entities: List[Entity],
    relation_types: Optional[List[str]] = None,
    **kwargs,
) -> List[Relation]:
    """
    Ask the LLM for relations between the given entities in ``text``.

    Args:
        text: Input text that mentions the entities
        entities: Entities that were already extracted from the text
        relation_types: Optional list restricting which relation types to request
        **kwargs: Accepted for interface compatibility; not used here

    Returns:
        List of Relation objects. Empty when fewer than two entities
        are supplied, since a relation needs two endpoints.

    Raises:
        ValueError: If text is empty or only whitespace
        RuntimeError: If the LLM call or the response parsing fails
    """
    if not (text and text.strip()):
        raise ValueError("Input text cannot be empty")

    if not (entities and len(entities) >= 2):
        return []

    # Obtain the shared LLM manager on first use and remember it.
    manager = self._llm_manager
    if manager is None:
        manager = await get_llm_manager()
        self._llm_manager = manager

    # Prompt construction happens outside the try block, so its errors
    # propagate unchanged instead of being wrapped in RuntimeError.
    llm_prompt = self._build_extraction_prompt(text, entities, relation_types)

    try:
        llm_reply = await manager.generate_text(
            messages=llm_prompt,
            provider=self.provider,
            model=self.model,
            temperature=self.temperature,
            max_tokens=self.max_tokens,
        )
        return self._parse_llm_response(llm_reply.content, entities)
    except Exception as err:
        raise RuntimeError(f"LLM relation extraction failed: {str(err)}") from err
|
|
132
|
+
|
|
133
|
+
def _build_extraction_prompt(
|
|
134
|
+
self,
|
|
135
|
+
text: str,
|
|
136
|
+
entities: List[Entity],
|
|
137
|
+
relation_types: Optional[List[str]] = None,
|
|
138
|
+
) -> str:
|
|
139
|
+
"""
|
|
140
|
+
Build prompt for LLM relation extraction
|
|
141
|
+
|
|
142
|
+
The prompt includes:
|
|
143
|
+
1. Task description
|
|
144
|
+
2. List of entities to consider
|
|
145
|
+
3. Relation types to extract (from schema)
|
|
146
|
+
4. Output format specification
|
|
147
|
+
5. The text to analyze
|
|
148
|
+
|
|
149
|
+
Args:
|
|
150
|
+
text: Input text
|
|
151
|
+
entities: List of known entities
|
|
152
|
+
relation_types: Optional filter for relation types
|
|
153
|
+
|
|
154
|
+
Returns:
|
|
155
|
+
Formatted prompt string
|
|
156
|
+
"""
|
|
157
|
+
# Build entity reference list
|
|
158
|
+
entity_list = []
|
|
159
|
+
entity_index = {}
|
|
160
|
+
for idx, entity in enumerate(entities):
|
|
161
|
+
entity_name = self._get_entity_name(entity)
|
|
162
|
+
entity_list.append(f" [{idx}] {entity.entity_type}: {entity_name} (ID: {entity.id})")
|
|
163
|
+
entity_index[entity.id] = idx
|
|
164
|
+
|
|
165
|
+
entities_section = "\n".join(entity_list)
|
|
166
|
+
|
|
167
|
+
# Build relation type descriptions
|
|
168
|
+
types_to_extract = []
|
|
169
|
+
if self.schema:
|
|
170
|
+
available_types = self.schema.get_relation_type_names()
|
|
171
|
+
if relation_types:
|
|
172
|
+
types_to_extract = [t for t in relation_types if t in available_types]
|
|
173
|
+
else:
|
|
174
|
+
types_to_extract = available_types
|
|
175
|
+
elif relation_types:
|
|
176
|
+
types_to_extract = relation_types
|
|
177
|
+
else:
|
|
178
|
+
# No schema, use common relation types
|
|
179
|
+
types_to_extract = [
|
|
180
|
+
"WORKS_FOR",
|
|
181
|
+
"LOCATED_IN",
|
|
182
|
+
"PART_OF",
|
|
183
|
+
"KNOWS",
|
|
184
|
+
"OWNS",
|
|
185
|
+
"MANAGES",
|
|
186
|
+
"PRODUCES",
|
|
187
|
+
"RELATED_TO",
|
|
188
|
+
]
|
|
189
|
+
|
|
190
|
+
# Build relation type descriptions
|
|
191
|
+
relation_descriptions = []
|
|
192
|
+
for rel_type in types_to_extract:
|
|
193
|
+
if self.schema and self.schema.has_relation_type(rel_type):
|
|
194
|
+
schema_rel = self.schema.get_relation_type(rel_type)
|
|
195
|
+
if schema_rel is not None:
|
|
196
|
+
desc = schema_rel.description or f"'{rel_type}' relation"
|
|
197
|
+
relation_descriptions.append(f"- {rel_type}: {desc}")
|
|
198
|
+
else:
|
|
199
|
+
relation_descriptions.append(f"- {rel_type}: Extract this type of relationship")
|
|
200
|
+
else:
|
|
201
|
+
relation_descriptions.append(f"- {rel_type}: Extract this type of relationship")
|
|
202
|
+
|
|
203
|
+
relations_section = "\n".join(relation_descriptions)
|
|
204
|
+
|
|
205
|
+
# Build prompt
|
|
206
|
+
prompt = f"""You are an expert at extracting relationships between entities from text.
|
|
207
|
+
|
|
208
|
+
Given the following entities:
|
|
209
|
+
{entities_section}
|
|
210
|
+
|
|
211
|
+
Extract all relationships between these entities from the text.
|
|
212
|
+
|
|
213
|
+
Allowed relation types:
|
|
214
|
+
{relations_section}
|
|
215
|
+
|
|
216
|
+
For each relation, provide:
|
|
217
|
+
1. source_id: ID of the source entity (from list above)
|
|
218
|
+
2. target_id: ID of the target entity (from list above)
|
|
219
|
+
3. relation_type: Type of relation (one of the allowed types)
|
|
220
|
+
4. properties: Optional dictionary of relation properties (e.g., since="2020", role="engineer")
|
|
221
|
+
5. confidence: Your confidence in this extraction (0.0 to 1.0)
|
|
222
|
+
|
|
223
|
+
Return ONLY a valid JSON array with this structure:
|
|
224
|
+
[
|
|
225
|
+
{{
|
|
226
|
+
"source_id": "entity_id_here",
|
|
227
|
+
"target_id": "entity_id_here",
|
|
228
|
+
"relation_type": "RELATION_TYPE",
|
|
229
|
+
"properties": {{"property1": "value1"}},
|
|
230
|
+
"confidence": 0.95
|
|
231
|
+
}}
|
|
232
|
+
]
|
|
233
|
+
|
|
234
|
+
Important:
|
|
235
|
+
- Only extract relations that are explicitly stated or strongly implied in the text
|
|
236
|
+
- Use the exact entity IDs from the list above
|
|
237
|
+
- Relations should be directional (source -> target matters)
|
|
238
|
+
- If unsure about a property, omit it
|
|
239
|
+
- Return empty array [] if no relations found
|
|
240
|
+
|
|
241
|
+
Text to analyze:
|
|
242
|
+
\"\"\"{text}\"\"\"
|
|
243
|
+
|
|
244
|
+
JSON output:"""
|
|
245
|
+
|
|
246
|
+
return prompt
|
|
247
|
+
|
|
248
|
+
def _parse_llm_response(self, response_text: str, entities: List[Entity]) -> List[Relation]:
|
|
249
|
+
"""
|
|
250
|
+
Parse LLM response to Relation objects
|
|
251
|
+
|
|
252
|
+
Expected JSON format:
|
|
253
|
+
[
|
|
254
|
+
{
|
|
255
|
+
"source_id": "e1",
|
|
256
|
+
"target_id": "e2",
|
|
257
|
+
"relation_type": "WORKS_FOR",
|
|
258
|
+
"properties": {"title": "engineer"},
|
|
259
|
+
"confidence": 0.95
|
|
260
|
+
}
|
|
261
|
+
]
|
|
262
|
+
|
|
263
|
+
Args:
|
|
264
|
+
response_text: LLM response string
|
|
265
|
+
entities: List of entities for validation
|
|
266
|
+
|
|
267
|
+
Returns:
|
|
268
|
+
List of Relation objects
|
|
269
|
+
"""
|
|
270
|
+
relations = []
|
|
271
|
+
entity_ids = {e.id for e in entities}
|
|
272
|
+
|
|
273
|
+
try:
|
|
274
|
+
# Extract JSON from response
|
|
275
|
+
json_str = self._extract_json_from_text(response_text)
|
|
276
|
+
|
|
277
|
+
# Parse JSON
|
|
278
|
+
extracted_data = json.loads(json_str)
|
|
279
|
+
|
|
280
|
+
if not isinstance(extracted_data, list):
|
|
281
|
+
extracted_data = [extracted_data]
|
|
282
|
+
|
|
283
|
+
# Convert to Relation objects
|
|
284
|
+
for item in extracted_data:
|
|
285
|
+
source_id = item.get("source_id")
|
|
286
|
+
target_id = item.get("target_id")
|
|
287
|
+
relation_type = item.get("relation_type")
|
|
288
|
+
properties = item.get("properties", {})
|
|
289
|
+
confidence = item.get("confidence", 0.5)
|
|
290
|
+
|
|
291
|
+
# Validate required fields
|
|
292
|
+
if not source_id or not target_id:
|
|
293
|
+
continue
|
|
294
|
+
if not relation_type: # relation_type is required and cannot be None
|
|
295
|
+
continue
|
|
296
|
+
if source_id not in entity_ids or target_id not in entity_ids:
|
|
297
|
+
# LLM hallucinated entity IDs
|
|
298
|
+
continue
|
|
299
|
+
if source_id == target_id:
|
|
300
|
+
# Self-loop, skip
|
|
301
|
+
continue
|
|
302
|
+
|
|
303
|
+
# Generate unique ID
|
|
304
|
+
relation_id = str(uuid.uuid4())
|
|
305
|
+
|
|
306
|
+
# Create Relation
|
|
307
|
+
relation = Relation(
|
|
308
|
+
id=relation_id,
|
|
309
|
+
relation_type=relation_type,
|
|
310
|
+
source_id=source_id,
|
|
311
|
+
target_id=target_id,
|
|
312
|
+
properties=properties,
|
|
313
|
+
)
|
|
314
|
+
|
|
315
|
+
# Store confidence
|
|
316
|
+
relation.properties["_extraction_confidence"] = confidence
|
|
317
|
+
|
|
318
|
+
relations.append(relation)
|
|
319
|
+
|
|
320
|
+
except json.JSONDecodeError as e:
|
|
321
|
+
print(f"Warning: Failed to parse LLM response as JSON: {e}")
|
|
322
|
+
print(f"Response was: {response_text[:200]}...")
|
|
323
|
+
return []
|
|
324
|
+
|
|
325
|
+
return relations
|
|
326
|
+
|
|
327
|
+
def _extract_json_from_text(self, text: str) -> str:
|
|
328
|
+
"""Extract JSON array from text"""
|
|
329
|
+
# Find JSON array boundaries
|
|
330
|
+
start = text.find("[")
|
|
331
|
+
end = text.rfind("]") + 1
|
|
332
|
+
|
|
333
|
+
if start != -1 and end > start:
|
|
334
|
+
return text[start:end]
|
|
335
|
+
|
|
336
|
+
# Try single object
|
|
337
|
+
start = text.find("{")
|
|
338
|
+
end = text.rfind("}") + 1
|
|
339
|
+
|
|
340
|
+
if start != -1 and end > start:
|
|
341
|
+
return text[start:end]
|
|
342
|
+
|
|
343
|
+
return text
|
|
344
|
+
|
|
345
|
+
def _get_entity_name(self, entity: Entity) -> str:
|
|
346
|
+
"""Extract entity name from properties"""
|
|
347
|
+
return entity.properties.get("name") or entity.properties.get("title") or entity.properties.get("text") or f"{entity.entity_type}_{entity.id[:8]}"
|
|
@@ -0,0 +1,241 @@
|
|
|
1
|
+
"""
|
|
2
|
+
spaCy NER-based Entity Extractor
|
|
3
|
+
|
|
4
|
+
Extracts entities using spaCy's Named Entity Recognition.
|
|
5
|
+
Fast, offline, and cost-free alternative to LLM extraction.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from typing import List, Optional
|
|
9
|
+
import spacy
|
|
10
|
+
from spacy.language import Language
|
|
11
|
+
|
|
12
|
+
from aiecs.application.knowledge_graph.extractors.base import EntityExtractor
|
|
13
|
+
from aiecs.domain.knowledge_graph.models.entity import Entity
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class NEREntityExtractor(EntityExtractor):
    """
    Extract entities using spaCy Named Entity Recognition

    This extractor uses spaCy's pre-trained NER models to identify entities.
    It's fast, free, and works offline, but limited to standard NER types.

    Features:
    - Fast extraction (no API calls)
    - Works offline
    - No cost
    - Standard NER types (PERSON, ORG, GPE, LOC, DATE, etc.)

    Limitations:
    - Only standard entity types (no custom types)
    - Limited property extraction (mainly just entity text)
    - Lower quality than LLM extraction

    Use Cases:
    - Development and testing
    - Cost-sensitive scenarios
    - High-volume extraction where LLM is too expensive
    - Baseline for comparison

    Example:
        ```python
        extractor = NEREntityExtractor(model="en_core_web_sm")

        entities = await extractor.extract_entities(
            "Alice works at Tech Corp in San Francisco."
        )
        # Returns: [
        #   Entity(type="Person", properties={"name": "Alice", "text": "Alice"}),
        #   Entity(type="Organization", properties={"name": "Tech Corp", "text": "Tech Corp"}),
        #   Entity(type="Location", properties={"name": "San Francisco", "text": "San Francisco"})
        # ]
        ```
    """

    # Mapping from spaCy NER labels to generic entity types
    LABEL_MAPPING = {
        "PERSON": "Person",
        "PER": "Person",
        "ORG": "Organization",
        "ORGANIZATION": "Organization",
        "GPE": "Location",  # Geo-Political Entity
        "LOC": "Location",
        "LOCATION": "Location",
        "FAC": "Facility",
        "FACILITY": "Facility",
        "PRODUCT": "Product",
        "EVENT": "Event",
        "WORK_OF_ART": "WorkOfArt",
        "LAW": "Law",
        "LANGUAGE": "Language",
        "DATE": "Date",
        "TIME": "Time",
        "PERCENT": "Percentage",
        "MONEY": "Money",
        "QUANTITY": "Quantity",
        "ORDINAL": "Ordinal",
        "CARDINAL": "Cardinal",
    }

    def __init__(
        self,
        model: str = "en_core_web_sm",
        disable_components: Optional[List[str]] = None,
    ):
        """
        Initialize NER entity extractor

        Args:
            model: spaCy model name (default: "en_core_web_sm")
                Available models:
                - en_core_web_sm: Small English model (~13MB)
                - en_core_web_md: Medium English model (~40MB)
                - en_core_web_lg: Large English model (~560MB)
            disable_components: spaCy pipeline components to disable (for speed)
                Default: disable all except NER

        Raises:
            RuntimeError: If spaCy cannot load the model (OSError from spacy.load)
        """
        self.model_name = model

        if disable_components is None:
            # Disable everything except NER for speed
            disable_components = [
                "tok2vec",
                "tagger",
                "parser",
                "attribute_ruler",
                "lemmatizer",
            ]

        # Only the load itself is guarded, so unrelated errors are not
        # reported as a missing model.
        try:
            self.nlp: Language = spacy.load(model, disable=disable_components)
        except OSError as e:
            raise RuntimeError(f"spaCy model '{model}' not found. " f"Install it with: python -m spacy download {model}") from e

    async def extract_entities(self, text: str, entity_types: Optional[List[str]] = None, **kwargs) -> List[Entity]:
        """
        Extract entities from text using spaCy NER

        Args:
            text: Input text to extract entities from
            entity_types: Optional filter for specific entity types
                         (will be matched against LABEL_MAPPING values)
            **kwargs: Additional parameters (unused for NER)

        Returns:
            List of extracted Entity objects, one per distinct entity text

        Raises:
            ValueError: If text is empty
        """
        if not text or not text.strip():
            raise ValueError("Input text cannot be empty")

        # Build the type filter once; None (or an empty list) means "keep all"
        allowed_types = frozenset(entity_types) if entity_types else None

        # Process text with spaCy
        doc = self.nlp(text)

        entities: List[Entity] = []
        seen_texts = set()  # Simple deduplication within same text

        for ent in doc.ents:
            # Map spaCy label to generic entity type; unmapped labels pass through
            entity_type = self.LABEL_MAPPING.get(ent.label_, ent.label_)

            # Filter by entity type if requested
            if allowed_types is not None and entity_type not in allowed_types:
                continue

            # Simple deduplication: skip if we've seen this exact text already
            entity_text = ent.text.strip()
            if entity_text in seen_texts:
                continue
            seen_texts.add(entity_text)

            entities.append(
                Entity(
                    id=self._generate_entity_id(entity_type, entity_text),
                    entity_type=entity_type,
                    properties={
                        "name": entity_text,
                        "text": entity_text,
                        "label": ent.label_,  # Original spaCy label
                        "start_char": ent.start_char,
                        "end_char": ent.end_char,
                        "_extraction_confidence": self._estimate_confidence(ent),
                    },
                )
            )

        return entities

    def _generate_entity_id(self, entity_type: str, text: str) -> str:
        """
        Generate a deterministic ID for an entity

        The same (entity_type, text) pair always yields the same ID.

        Args:
            entity_type: Entity type name
            text: Entity text

        Returns:
            Entity ID string of the form "<normalized>_<8 hex chars>"
        """
        import hashlib

        # Create deterministic ID from type + text
        normalized = f"{entity_type}_{text}".lower().replace(" ", "_")
        # The suffix is a digest of the normalized string itself, so it is
        # fully determined by the type and text
        hash_suffix = hashlib.md5(normalized.encode()).hexdigest()[:8]
        return f"{normalized}_{hash_suffix}"

    def _estimate_confidence(self, ent) -> float:
        """
        Estimate confidence for NER extraction

        spaCy doesn't provide confidence scores directly, so we use heuristics:
        - Longer entity text (> 20 chars) raises the score
        - Very short entity text (< 3 chars) lowers the score
        - A capitalized first character (proper noun) raises the score

        Args:
            ent: spaCy entity

        Returns:
            Confidence score (0.0-1.0)
        """
        ent_text = ent.text

        # Base confidence
        confidence = 0.7

        # Adjust based on entity length
        if len(ent_text) > 20:
            confidence += 0.1
        elif len(ent_text) < 3:
            confidence -= 0.2

        # Adjust based on capitalization (proper nouns); slicing instead of
        # indexing keeps an empty text from raising IndexError
        if ent_text[:1].isupper():
            confidence += 0.1

        # Clamp to [0.0, 1.0]
        return max(0.0, min(1.0, confidence))

    def get_supported_types(self) -> List[str]:
        """
        Get list of entity types that this extractor can produce

        Returns:
            Distinct LABEL_MAPPING values, sorted so the order is stable
            across interpreter runs
        """
        return sorted(set(self.LABEL_MAPPING.values()))

    def get_available_labels(self) -> List[str]:
        """
        Get list of NER labels available in the loaded model

        Returns:
            List of spaCy NER labels; empty when the pipeline has no "ner"
            component or the component exposes no labels
        """
        try:
            ner_pipe = self.nlp.get_pipe("ner")
        except KeyError:
            # The pipeline has no component named "ner"
            return []

        # spaCy exposes labels as a tuple; convert to match the declared type
        return list(getattr(ner_pipe, "labels", ()))
|
|
@@ -0,0 +1,78 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Knowledge Fusion Components
|
|
3
|
+
|
|
4
|
+
Components for deduplicating, merging, and linking entities across documents.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from aiecs.application.knowledge_graph.fusion.entity_deduplicator import (
|
|
8
|
+
EntityDeduplicator,
|
|
9
|
+
)
|
|
10
|
+
from aiecs.application.knowledge_graph.fusion.entity_linker import EntityLinker
|
|
11
|
+
from aiecs.application.knowledge_graph.fusion.relation_deduplicator import (
|
|
12
|
+
RelationDeduplicator,
|
|
13
|
+
)
|
|
14
|
+
from aiecs.application.knowledge_graph.fusion.knowledge_fusion import (
|
|
15
|
+
KnowledgeFusion,
|
|
16
|
+
)
|
|
17
|
+
from aiecs.application.knowledge_graph.fusion.matching_config import (
|
|
18
|
+
EntityTypeConfig,
|
|
19
|
+
FusionMatchingConfig,
|
|
20
|
+
load_matching_config,
|
|
21
|
+
load_matching_config_from_dict,
|
|
22
|
+
load_matching_config_from_json,
|
|
23
|
+
load_matching_config_from_yaml,
|
|
24
|
+
save_matching_config_to_dict,
|
|
25
|
+
save_matching_config_to_json,
|
|
26
|
+
save_matching_config_to_yaml,
|
|
27
|
+
VALID_STAGES,
|
|
28
|
+
DEFAULT_ENABLED_STAGES,
|
|
29
|
+
)
|
|
30
|
+
from aiecs.application.knowledge_graph.fusion.similarity_pipeline import (
|
|
31
|
+
SimilarityPipeline,
|
|
32
|
+
MatchStage,
|
|
33
|
+
MatchResult,
|
|
34
|
+
PipelineResult,
|
|
35
|
+
)
|
|
36
|
+
from aiecs.application.knowledge_graph.fusion.ab_testing import (
|
|
37
|
+
ABTestingFramework,
|
|
38
|
+
EvaluationMetrics,
|
|
39
|
+
ExperimentResult,
|
|
40
|
+
)
|
|
41
|
+
from aiecs.application.knowledge_graph.fusion.evaluation_dataset import (
|
|
42
|
+
EntityPair,
|
|
43
|
+
EvaluationDataset,
|
|
44
|
+
create_default_evaluation_dataset,
|
|
45
|
+
create_minimal_evaluation_dataset,
|
|
46
|
+
)
|
|
47
|
+
|
|
48
|
+
# Public API of the fusion package: every name listed here is imported above
# and re-exported for `from ... import *` consumers.
__all__ = [
    # Core fusion components
    "EntityDeduplicator",
    "EntityLinker",
    "RelationDeduplicator",
    "KnowledgeFusion",
    # Matching configuration
    "EntityTypeConfig",
    "FusionMatchingConfig",
    "load_matching_config",
    "load_matching_config_from_dict",
    "load_matching_config_from_json",
    "load_matching_config_from_yaml",
    "save_matching_config_to_dict",
    "save_matching_config_to_json",
    "save_matching_config_to_yaml",
    "VALID_STAGES",
    "DEFAULT_ENABLED_STAGES",
    # Similarity pipeline
    "SimilarityPipeline",
    "MatchStage",
    "MatchResult",
    "PipelineResult",
    # Evaluation and testing
    "ABTestingFramework",
    "EvaluationMetrics",
    "ExperimentResult",
    "EntityPair",
    "EvaluationDataset",
    "create_default_evaluation_dataset",
    "create_minimal_evaluation_dataset",
]
|