aiecs 1.5.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (302) hide show
  1. aiecs/__init__.py +72 -0
  2. aiecs/__main__.py +41 -0
  3. aiecs/aiecs_client.py +469 -0
  4. aiecs/application/__init__.py +10 -0
  5. aiecs/application/executors/__init__.py +10 -0
  6. aiecs/application/executors/operation_executor.py +363 -0
  7. aiecs/application/knowledge_graph/__init__.py +7 -0
  8. aiecs/application/knowledge_graph/builder/__init__.py +37 -0
  9. aiecs/application/knowledge_graph/builder/document_builder.py +375 -0
  10. aiecs/application/knowledge_graph/builder/graph_builder.py +356 -0
  11. aiecs/application/knowledge_graph/builder/schema_mapping.py +531 -0
  12. aiecs/application/knowledge_graph/builder/structured_pipeline.py +443 -0
  13. aiecs/application/knowledge_graph/builder/text_chunker.py +319 -0
  14. aiecs/application/knowledge_graph/extractors/__init__.py +27 -0
  15. aiecs/application/knowledge_graph/extractors/base.py +100 -0
  16. aiecs/application/knowledge_graph/extractors/llm_entity_extractor.py +327 -0
  17. aiecs/application/knowledge_graph/extractors/llm_relation_extractor.py +349 -0
  18. aiecs/application/knowledge_graph/extractors/ner_entity_extractor.py +244 -0
  19. aiecs/application/knowledge_graph/fusion/__init__.py +23 -0
  20. aiecs/application/knowledge_graph/fusion/entity_deduplicator.py +387 -0
  21. aiecs/application/knowledge_graph/fusion/entity_linker.py +343 -0
  22. aiecs/application/knowledge_graph/fusion/knowledge_fusion.py +580 -0
  23. aiecs/application/knowledge_graph/fusion/relation_deduplicator.py +189 -0
  24. aiecs/application/knowledge_graph/pattern_matching/__init__.py +21 -0
  25. aiecs/application/knowledge_graph/pattern_matching/pattern_matcher.py +344 -0
  26. aiecs/application/knowledge_graph/pattern_matching/query_executor.py +378 -0
  27. aiecs/application/knowledge_graph/profiling/__init__.py +12 -0
  28. aiecs/application/knowledge_graph/profiling/query_plan_visualizer.py +199 -0
  29. aiecs/application/knowledge_graph/profiling/query_profiler.py +223 -0
  30. aiecs/application/knowledge_graph/reasoning/__init__.py +27 -0
  31. aiecs/application/knowledge_graph/reasoning/evidence_synthesis.py +347 -0
  32. aiecs/application/knowledge_graph/reasoning/inference_engine.py +504 -0
  33. aiecs/application/knowledge_graph/reasoning/logic_form_parser.py +167 -0
  34. aiecs/application/knowledge_graph/reasoning/logic_parser/__init__.py +79 -0
  35. aiecs/application/knowledge_graph/reasoning/logic_parser/ast_builder.py +513 -0
  36. aiecs/application/knowledge_graph/reasoning/logic_parser/ast_nodes.py +630 -0
  37. aiecs/application/knowledge_graph/reasoning/logic_parser/ast_validator.py +654 -0
  38. aiecs/application/knowledge_graph/reasoning/logic_parser/error_handler.py +477 -0
  39. aiecs/application/knowledge_graph/reasoning/logic_parser/parser.py +390 -0
  40. aiecs/application/knowledge_graph/reasoning/logic_parser/query_context.py +217 -0
  41. aiecs/application/knowledge_graph/reasoning/logic_query_integration.py +169 -0
  42. aiecs/application/knowledge_graph/reasoning/query_planner.py +872 -0
  43. aiecs/application/knowledge_graph/reasoning/reasoning_engine.py +554 -0
  44. aiecs/application/knowledge_graph/retrieval/__init__.py +19 -0
  45. aiecs/application/knowledge_graph/retrieval/retrieval_strategies.py +596 -0
  46. aiecs/application/knowledge_graph/search/__init__.py +59 -0
  47. aiecs/application/knowledge_graph/search/hybrid_search.py +423 -0
  48. aiecs/application/knowledge_graph/search/reranker.py +295 -0
  49. aiecs/application/knowledge_graph/search/reranker_strategies.py +553 -0
  50. aiecs/application/knowledge_graph/search/text_similarity.py +398 -0
  51. aiecs/application/knowledge_graph/traversal/__init__.py +15 -0
  52. aiecs/application/knowledge_graph/traversal/enhanced_traversal.py +329 -0
  53. aiecs/application/knowledge_graph/traversal/path_scorer.py +269 -0
  54. aiecs/application/knowledge_graph/validators/__init__.py +13 -0
  55. aiecs/application/knowledge_graph/validators/relation_validator.py +189 -0
  56. aiecs/application/knowledge_graph/visualization/__init__.py +11 -0
  57. aiecs/application/knowledge_graph/visualization/graph_visualizer.py +321 -0
  58. aiecs/common/__init__.py +9 -0
  59. aiecs/common/knowledge_graph/__init__.py +17 -0
  60. aiecs/common/knowledge_graph/runnable.py +484 -0
  61. aiecs/config/__init__.py +16 -0
  62. aiecs/config/config.py +498 -0
  63. aiecs/config/graph_config.py +137 -0
  64. aiecs/config/registry.py +23 -0
  65. aiecs/core/__init__.py +46 -0
  66. aiecs/core/interface/__init__.py +34 -0
  67. aiecs/core/interface/execution_interface.py +152 -0
  68. aiecs/core/interface/storage_interface.py +171 -0
  69. aiecs/domain/__init__.py +289 -0
  70. aiecs/domain/agent/__init__.py +189 -0
  71. aiecs/domain/agent/base_agent.py +697 -0
  72. aiecs/domain/agent/exceptions.py +103 -0
  73. aiecs/domain/agent/graph_aware_mixin.py +559 -0
  74. aiecs/domain/agent/hybrid_agent.py +490 -0
  75. aiecs/domain/agent/integration/__init__.py +26 -0
  76. aiecs/domain/agent/integration/context_compressor.py +222 -0
  77. aiecs/domain/agent/integration/context_engine_adapter.py +252 -0
  78. aiecs/domain/agent/integration/retry_policy.py +219 -0
  79. aiecs/domain/agent/integration/role_config.py +213 -0
  80. aiecs/domain/agent/knowledge_aware_agent.py +646 -0
  81. aiecs/domain/agent/lifecycle.py +296 -0
  82. aiecs/domain/agent/llm_agent.py +300 -0
  83. aiecs/domain/agent/memory/__init__.py +12 -0
  84. aiecs/domain/agent/memory/conversation.py +197 -0
  85. aiecs/domain/agent/migration/__init__.py +14 -0
  86. aiecs/domain/agent/migration/conversion.py +160 -0
  87. aiecs/domain/agent/migration/legacy_wrapper.py +90 -0
  88. aiecs/domain/agent/models.py +317 -0
  89. aiecs/domain/agent/observability.py +407 -0
  90. aiecs/domain/agent/persistence.py +289 -0
  91. aiecs/domain/agent/prompts/__init__.py +29 -0
  92. aiecs/domain/agent/prompts/builder.py +161 -0
  93. aiecs/domain/agent/prompts/formatters.py +189 -0
  94. aiecs/domain/agent/prompts/template.py +255 -0
  95. aiecs/domain/agent/registry.py +260 -0
  96. aiecs/domain/agent/tool_agent.py +257 -0
  97. aiecs/domain/agent/tools/__init__.py +12 -0
  98. aiecs/domain/agent/tools/schema_generator.py +221 -0
  99. aiecs/domain/community/__init__.py +155 -0
  100. aiecs/domain/community/agent_adapter.py +477 -0
  101. aiecs/domain/community/analytics.py +481 -0
  102. aiecs/domain/community/collaborative_workflow.py +642 -0
  103. aiecs/domain/community/communication_hub.py +645 -0
  104. aiecs/domain/community/community_builder.py +320 -0
  105. aiecs/domain/community/community_integration.py +800 -0
  106. aiecs/domain/community/community_manager.py +813 -0
  107. aiecs/domain/community/decision_engine.py +879 -0
  108. aiecs/domain/community/exceptions.py +225 -0
  109. aiecs/domain/community/models/__init__.py +33 -0
  110. aiecs/domain/community/models/community_models.py +268 -0
  111. aiecs/domain/community/resource_manager.py +457 -0
  112. aiecs/domain/community/shared_context_manager.py +603 -0
  113. aiecs/domain/context/__init__.py +58 -0
  114. aiecs/domain/context/context_engine.py +989 -0
  115. aiecs/domain/context/conversation_models.py +354 -0
  116. aiecs/domain/context/graph_memory.py +467 -0
  117. aiecs/domain/execution/__init__.py +12 -0
  118. aiecs/domain/execution/model.py +57 -0
  119. aiecs/domain/knowledge_graph/__init__.py +19 -0
  120. aiecs/domain/knowledge_graph/models/__init__.py +52 -0
  121. aiecs/domain/knowledge_graph/models/entity.py +130 -0
  122. aiecs/domain/knowledge_graph/models/evidence.py +194 -0
  123. aiecs/domain/knowledge_graph/models/inference_rule.py +186 -0
  124. aiecs/domain/knowledge_graph/models/path.py +179 -0
  125. aiecs/domain/knowledge_graph/models/path_pattern.py +173 -0
  126. aiecs/domain/knowledge_graph/models/query.py +272 -0
  127. aiecs/domain/knowledge_graph/models/query_plan.py +187 -0
  128. aiecs/domain/knowledge_graph/models/relation.py +136 -0
  129. aiecs/domain/knowledge_graph/schema/__init__.py +23 -0
  130. aiecs/domain/knowledge_graph/schema/entity_type.py +135 -0
  131. aiecs/domain/knowledge_graph/schema/graph_schema.py +271 -0
  132. aiecs/domain/knowledge_graph/schema/property_schema.py +155 -0
  133. aiecs/domain/knowledge_graph/schema/relation_type.py +171 -0
  134. aiecs/domain/knowledge_graph/schema/schema_manager.py +496 -0
  135. aiecs/domain/knowledge_graph/schema/type_enums.py +205 -0
  136. aiecs/domain/task/__init__.py +13 -0
  137. aiecs/domain/task/dsl_processor.py +613 -0
  138. aiecs/domain/task/model.py +62 -0
  139. aiecs/domain/task/task_context.py +268 -0
  140. aiecs/infrastructure/__init__.py +24 -0
  141. aiecs/infrastructure/graph_storage/__init__.py +11 -0
  142. aiecs/infrastructure/graph_storage/base.py +601 -0
  143. aiecs/infrastructure/graph_storage/batch_operations.py +449 -0
  144. aiecs/infrastructure/graph_storage/cache.py +429 -0
  145. aiecs/infrastructure/graph_storage/distributed.py +226 -0
  146. aiecs/infrastructure/graph_storage/error_handling.py +390 -0
  147. aiecs/infrastructure/graph_storage/graceful_degradation.py +306 -0
  148. aiecs/infrastructure/graph_storage/health_checks.py +378 -0
  149. aiecs/infrastructure/graph_storage/in_memory.py +514 -0
  150. aiecs/infrastructure/graph_storage/index_optimization.py +483 -0
  151. aiecs/infrastructure/graph_storage/lazy_loading.py +410 -0
  152. aiecs/infrastructure/graph_storage/metrics.py +357 -0
  153. aiecs/infrastructure/graph_storage/migration.py +413 -0
  154. aiecs/infrastructure/graph_storage/pagination.py +471 -0
  155. aiecs/infrastructure/graph_storage/performance_monitoring.py +466 -0
  156. aiecs/infrastructure/graph_storage/postgres.py +871 -0
  157. aiecs/infrastructure/graph_storage/query_optimizer.py +635 -0
  158. aiecs/infrastructure/graph_storage/schema_cache.py +290 -0
  159. aiecs/infrastructure/graph_storage/sqlite.py +623 -0
  160. aiecs/infrastructure/graph_storage/streaming.py +495 -0
  161. aiecs/infrastructure/messaging/__init__.py +13 -0
  162. aiecs/infrastructure/messaging/celery_task_manager.py +383 -0
  163. aiecs/infrastructure/messaging/websocket_manager.py +298 -0
  164. aiecs/infrastructure/monitoring/__init__.py +34 -0
  165. aiecs/infrastructure/monitoring/executor_metrics.py +174 -0
  166. aiecs/infrastructure/monitoring/global_metrics_manager.py +213 -0
  167. aiecs/infrastructure/monitoring/structured_logger.py +48 -0
  168. aiecs/infrastructure/monitoring/tracing_manager.py +410 -0
  169. aiecs/infrastructure/persistence/__init__.py +24 -0
  170. aiecs/infrastructure/persistence/context_engine_client.py +187 -0
  171. aiecs/infrastructure/persistence/database_manager.py +333 -0
  172. aiecs/infrastructure/persistence/file_storage.py +754 -0
  173. aiecs/infrastructure/persistence/redis_client.py +220 -0
  174. aiecs/llm/__init__.py +86 -0
  175. aiecs/llm/callbacks/__init__.py +11 -0
  176. aiecs/llm/callbacks/custom_callbacks.py +264 -0
  177. aiecs/llm/client_factory.py +420 -0
  178. aiecs/llm/clients/__init__.py +33 -0
  179. aiecs/llm/clients/base_client.py +193 -0
  180. aiecs/llm/clients/googleai_client.py +181 -0
  181. aiecs/llm/clients/openai_client.py +131 -0
  182. aiecs/llm/clients/vertex_client.py +437 -0
  183. aiecs/llm/clients/xai_client.py +184 -0
  184. aiecs/llm/config/__init__.py +51 -0
  185. aiecs/llm/config/config_loader.py +275 -0
  186. aiecs/llm/config/config_validator.py +236 -0
  187. aiecs/llm/config/model_config.py +151 -0
  188. aiecs/llm/utils/__init__.py +10 -0
  189. aiecs/llm/utils/validate_config.py +91 -0
  190. aiecs/main.py +363 -0
  191. aiecs/scripts/__init__.py +3 -0
  192. aiecs/scripts/aid/VERSION_MANAGEMENT.md +97 -0
  193. aiecs/scripts/aid/__init__.py +19 -0
  194. aiecs/scripts/aid/version_manager.py +215 -0
  195. aiecs/scripts/dependance_check/DEPENDENCY_SYSTEM_SUMMARY.md +242 -0
  196. aiecs/scripts/dependance_check/README_DEPENDENCY_CHECKER.md +310 -0
  197. aiecs/scripts/dependance_check/__init__.py +17 -0
  198. aiecs/scripts/dependance_check/dependency_checker.py +938 -0
  199. aiecs/scripts/dependance_check/dependency_fixer.py +391 -0
  200. aiecs/scripts/dependance_check/download_nlp_data.py +396 -0
  201. aiecs/scripts/dependance_check/quick_dependency_check.py +270 -0
  202. aiecs/scripts/dependance_check/setup_nlp_data.sh +217 -0
  203. aiecs/scripts/dependance_patch/__init__.py +7 -0
  204. aiecs/scripts/dependance_patch/fix_weasel/README_WEASEL_PATCH.md +126 -0
  205. aiecs/scripts/dependance_patch/fix_weasel/__init__.py +11 -0
  206. aiecs/scripts/dependance_patch/fix_weasel/fix_weasel_validator.py +128 -0
  207. aiecs/scripts/dependance_patch/fix_weasel/fix_weasel_validator.sh +82 -0
  208. aiecs/scripts/dependance_patch/fix_weasel/patch_weasel_library.sh +188 -0
  209. aiecs/scripts/dependance_patch/fix_weasel/run_weasel_patch.sh +41 -0
  210. aiecs/scripts/tools_develop/README.md +449 -0
  211. aiecs/scripts/tools_develop/TOOL_AUTO_DISCOVERY.md +234 -0
  212. aiecs/scripts/tools_develop/__init__.py +21 -0
  213. aiecs/scripts/tools_develop/check_type_annotations.py +259 -0
  214. aiecs/scripts/tools_develop/validate_tool_schemas.py +422 -0
  215. aiecs/scripts/tools_develop/verify_tools.py +356 -0
  216. aiecs/tasks/__init__.py +1 -0
  217. aiecs/tasks/worker.py +172 -0
  218. aiecs/tools/__init__.py +299 -0
  219. aiecs/tools/apisource/__init__.py +99 -0
  220. aiecs/tools/apisource/intelligence/__init__.py +19 -0
  221. aiecs/tools/apisource/intelligence/data_fusion.py +381 -0
  222. aiecs/tools/apisource/intelligence/query_analyzer.py +413 -0
  223. aiecs/tools/apisource/intelligence/search_enhancer.py +388 -0
  224. aiecs/tools/apisource/monitoring/__init__.py +9 -0
  225. aiecs/tools/apisource/monitoring/metrics.py +303 -0
  226. aiecs/tools/apisource/providers/__init__.py +115 -0
  227. aiecs/tools/apisource/providers/base.py +664 -0
  228. aiecs/tools/apisource/providers/census.py +401 -0
  229. aiecs/tools/apisource/providers/fred.py +564 -0
  230. aiecs/tools/apisource/providers/newsapi.py +412 -0
  231. aiecs/tools/apisource/providers/worldbank.py +357 -0
  232. aiecs/tools/apisource/reliability/__init__.py +12 -0
  233. aiecs/tools/apisource/reliability/error_handler.py +375 -0
  234. aiecs/tools/apisource/reliability/fallback_strategy.py +391 -0
  235. aiecs/tools/apisource/tool.py +850 -0
  236. aiecs/tools/apisource/utils/__init__.py +9 -0
  237. aiecs/tools/apisource/utils/validators.py +338 -0
  238. aiecs/tools/base_tool.py +201 -0
  239. aiecs/tools/docs/__init__.py +121 -0
  240. aiecs/tools/docs/ai_document_orchestrator.py +599 -0
  241. aiecs/tools/docs/ai_document_writer_orchestrator.py +2403 -0
  242. aiecs/tools/docs/content_insertion_tool.py +1333 -0
  243. aiecs/tools/docs/document_creator_tool.py +1317 -0
  244. aiecs/tools/docs/document_layout_tool.py +1166 -0
  245. aiecs/tools/docs/document_parser_tool.py +994 -0
  246. aiecs/tools/docs/document_writer_tool.py +1818 -0
  247. aiecs/tools/knowledge_graph/__init__.py +17 -0
  248. aiecs/tools/knowledge_graph/graph_reasoning_tool.py +734 -0
  249. aiecs/tools/knowledge_graph/graph_search_tool.py +923 -0
  250. aiecs/tools/knowledge_graph/kg_builder_tool.py +476 -0
  251. aiecs/tools/langchain_adapter.py +542 -0
  252. aiecs/tools/schema_generator.py +275 -0
  253. aiecs/tools/search_tool/__init__.py +100 -0
  254. aiecs/tools/search_tool/analyzers.py +589 -0
  255. aiecs/tools/search_tool/cache.py +260 -0
  256. aiecs/tools/search_tool/constants.py +128 -0
  257. aiecs/tools/search_tool/context.py +216 -0
  258. aiecs/tools/search_tool/core.py +749 -0
  259. aiecs/tools/search_tool/deduplicator.py +123 -0
  260. aiecs/tools/search_tool/error_handler.py +271 -0
  261. aiecs/tools/search_tool/metrics.py +371 -0
  262. aiecs/tools/search_tool/rate_limiter.py +178 -0
  263. aiecs/tools/search_tool/schemas.py +277 -0
  264. aiecs/tools/statistics/__init__.py +80 -0
  265. aiecs/tools/statistics/ai_data_analysis_orchestrator.py +643 -0
  266. aiecs/tools/statistics/ai_insight_generator_tool.py +505 -0
  267. aiecs/tools/statistics/ai_report_orchestrator_tool.py +694 -0
  268. aiecs/tools/statistics/data_loader_tool.py +564 -0
  269. aiecs/tools/statistics/data_profiler_tool.py +658 -0
  270. aiecs/tools/statistics/data_transformer_tool.py +573 -0
  271. aiecs/tools/statistics/data_visualizer_tool.py +495 -0
  272. aiecs/tools/statistics/model_trainer_tool.py +487 -0
  273. aiecs/tools/statistics/statistical_analyzer_tool.py +459 -0
  274. aiecs/tools/task_tools/__init__.py +86 -0
  275. aiecs/tools/task_tools/chart_tool.py +732 -0
  276. aiecs/tools/task_tools/classfire_tool.py +922 -0
  277. aiecs/tools/task_tools/image_tool.py +447 -0
  278. aiecs/tools/task_tools/office_tool.py +684 -0
  279. aiecs/tools/task_tools/pandas_tool.py +635 -0
  280. aiecs/tools/task_tools/report_tool.py +635 -0
  281. aiecs/tools/task_tools/research_tool.py +392 -0
  282. aiecs/tools/task_tools/scraper_tool.py +715 -0
  283. aiecs/tools/task_tools/stats_tool.py +688 -0
  284. aiecs/tools/temp_file_manager.py +130 -0
  285. aiecs/tools/tool_executor/__init__.py +37 -0
  286. aiecs/tools/tool_executor/tool_executor.py +881 -0
  287. aiecs/utils/LLM_output_structor.py +445 -0
  288. aiecs/utils/__init__.py +34 -0
  289. aiecs/utils/base_callback.py +47 -0
  290. aiecs/utils/cache_provider.py +695 -0
  291. aiecs/utils/execution_utils.py +184 -0
  292. aiecs/utils/logging.py +1 -0
  293. aiecs/utils/prompt_loader.py +14 -0
  294. aiecs/utils/token_usage_repository.py +323 -0
  295. aiecs/ws/__init__.py +0 -0
  296. aiecs/ws/socket_server.py +52 -0
  297. aiecs-1.5.1.dist-info/METADATA +608 -0
  298. aiecs-1.5.1.dist-info/RECORD +302 -0
  299. aiecs-1.5.1.dist-info/WHEEL +5 -0
  300. aiecs-1.5.1.dist-info/entry_points.txt +10 -0
  301. aiecs-1.5.1.dist-info/licenses/LICENSE +225 -0
  302. aiecs-1.5.1.dist-info/top_level.txt +1 -0
@@ -0,0 +1,580 @@
1
+ """
2
+ Knowledge Fusion Orchestrator
3
+
4
+ High-level orchestrator for cross-document entity merging and knowledge fusion.
5
+ """
6
+
7
+ from typing import List, Dict, Set, Tuple, Any
8
+ from aiecs.domain.knowledge_graph.models.entity import Entity
9
+ from aiecs.infrastructure.graph_storage.base import GraphStore
10
+ from aiecs.application.knowledge_graph.fusion.entity_deduplicator import (
11
+ EntityDeduplicator,
12
+ )
13
+
14
+
15
+ class KnowledgeFusion:
16
+ """
17
+ Orchestrate knowledge fusion across multiple documents
18
+
19
+ After extracting entities and relations from multiple documents,
20
+ this class performs cross-document fusion to:
21
+ - Identify entities that appear in multiple documents
22
+ - Merge duplicate entities across documents
23
+ - Resolve conflicts in entity properties
24
+ - Track provenance (which documents contributed to each entity)
25
+
26
+ Example:
27
+ ```python
28
+ fusion = KnowledgeFusion(graph_store, similarity_threshold=0.9)
29
+
30
+ # After processing multiple documents
31
+ stats = await fusion.fuse_cross_document_entities(
32
+ entity_types=None  # or e.g. ["Person"] to restrict fusion
33
+ )
34
+
35
+ print(f"Merged {stats['entities_merged']} entities across documents")
36
+ ```
37
+ """
38
+
39
+ def __init__(
40
+ self,
41
+ graph_store: GraphStore,
42
+ similarity_threshold: float = 0.90, # High threshold for cross-document fusion
43
+ ):
44
+ """
45
+ Initialize knowledge fusion orchestrator
46
+
47
+ Args:
48
+ graph_store: Graph storage containing entities to fuse
49
+ similarity_threshold: Minimum similarity for cross-document merging
50
+ """
51
+ self.graph_store = graph_store
52
+ self.similarity_threshold = similarity_threshold
53
+ self.entities_merged = 0
54
+ self.conflicts_resolved = 0
55
+
56
async def fuse_cross_document_entities(self, entity_types: List[str] = None) -> Dict[str, int]:
    """
    Run one cross-document fusion pass over the graph

    Entities are fetched through ``_query_entities``, bucketed by type with
    ``_group_entities_by_type`` (entities of different types are never
    merged), clustered with ``_find_merge_groups`` and every cluster of two
    or more entities is handed to ``_merge_entity_group``.

    Both instance counters (``entities_merged`` and ``conflicts_resolved``)
    are reset to zero at the start of every call.

    Args:
        entity_types: Optional list of entity types to fuse (None = all types)

    Returns:
        Dictionary with fusion statistics:
        - entities_analyzed: Total entities fetched for this pass
        - entities_merged: Entities removed by merging (a cluster of N counts N - 1)
        - conflicts_resolved: Value of ``self.conflicts_resolved`` after the pass
        - merge_groups: Number of clusters returned by ``_find_merge_groups``
    """
    # Every run starts from a clean slate.
    self.entities_merged = 0
    self.conflicts_resolved = 0

    candidates = await self._query_entities(entity_types)
    stats = {
        "entities_analyzed": len(candidates),
        "entities_merged": 0,
        "conflicts_resolved": 0,
        "merge_groups": 0,
    }

    # A lone entity (or an empty graph) has nothing to be merged with.
    if len(candidates) >= 2:
        for same_type in self._group_entities_by_type(candidates).values():
            if len(same_type) < 2:
                continue

            clusters = await self._find_merge_groups(same_type)
            stats["merge_groups"] += len(clusters)

            for cluster in clusters:
                if len(cluster) >= 2:
                    await self._merge_entity_group(cluster)
                    # N entities collapse into 1, so N - 1 disappear.
                    stats["entities_merged"] += len(cluster) - 1

        stats["conflicts_resolved"] = self.conflicts_resolved

    return stats
126
+
127
async def resolve_property_conflicts(
    self, entities: List[Entity], strategy: str = "most_complete"
) -> Entity:
    """
    Merge several entities into one, resolving differing property values

    The first entity is the base: its id, type, embedding and a shallow copy
    of its properties seed the result. Public properties (keys not starting
    with "_") of the remaining entities are folded in one entity at a time;
    whenever a value differs from the one currently held, the pair is handed
    to ``_resolve_conflict`` together with the two entities that actually
    supplied those values.

    Strategies:
    - "most_complete": Prefer non-empty over empty values (default)
    - "most_recent": Prefer most recent value (requires timestamp in provenance)
    - "most_confident": Prefer value from most confident source (requires confidence score)
    - "longest": Prefer longest string value
    - "keep_all": Keep all conflicting values as a single flat list

    Internal properties written on the result:
    - "_property_conflicts": for every conflicting key, the values seen
    - "_provenance_merged": the "_provenance" record of every input entity,
      followed by any records those entities already carried in their own
      "_provenance_merged" list that are not present yet

    Args:
        entities: List of entities to merge
        strategy: Conflict resolution strategy

    Returns:
        Merged entity with resolved conflicts. When exactly one entity is
        passed, that same object is returned unchanged.

    Raises:
        ValueError: If ``entities`` is empty
    """
    if not entities:
        raise ValueError("Cannot merge empty entity list")

    if len(entities) == 1:
        return entities[0]

    # Create a new merged entity (copy first entity as base)
    base = entities[0]
    merged = Entity(
        id=base.id,
        entity_type=base.entity_type,
        properties=base.properties.copy(),
        embedding=base.embedding,
    )

    # Which entity supplied the value currently stored under each key.
    # Provenance-based strategies need this: once an earlier conflict has
    # been won by a later entity, the stored value is no longer the base's.
    value_owner: Dict[str, Entity] = {key: base for key in merged.properties}
    # Keys whose stored value is the list accumulated by "keep_all".
    accumulated_keys: Set[str] = set()
    conflicting_properties: Dict[str, List[Any]] = {}

    for entity in entities[1:]:
        for key, value in entity.properties.items():
            if key.startswith("_"):
                # Internal properties are handled separately below
                continue

            if key not in merged.properties:
                merged.properties[key] = value
                value_owner[key] = entity
                continue

            current = merged.properties[key]

            if key in accumulated_keys:
                # Extend the existing list instead of nesting it again.
                if value not in current:
                    current.append(value)
                    conflicting_properties[key].append(value)
                continue

            if current != value:
                candidates = [current, value]
                resolved_value = self._resolve_conflict(
                    key=key,
                    values=candidates,
                    entities=[value_owner[key], entity],
                    strategy=strategy,
                )

                # Track conflict
                conflicting_properties.setdefault(key, [current]).append(value)

                if resolved_value is candidates:
                    # The resolver kept everything ("keep_all")
                    accumulated_keys.add(key)
                elif resolved_value is value:
                    value_owner[key] = entity

                merged.properties[key] = resolved_value

    # Store conflicting values for transparency
    if conflicting_properties:
        merged.properties["_property_conflicts"] = conflicting_properties
        self.conflicts_resolved += len(conflicting_properties)

    # Merge provenance information, keeping records from earlier merges
    provenances: List[Any] = []
    for entity in entities:
        own = entity.properties.get("_provenance")
        if own:
            provenances.append(own)
        earlier = entity.properties.get("_provenance_merged")
        if isinstance(earlier, list):
            for record in earlier:
                if record and record not in provenances:
                    provenances.append(record)
    if provenances:
        merged.properties["_provenance_merged"] = provenances

    # Merge embeddings (element-wise average if more than one is present)
    embeddings = [e.embedding for e in entities if e.embedding]
    if len(embeddings) > 1:
        import numpy as np

        # tolist() yields plain Python floats rather than numpy scalars
        merged.embedding = np.mean(embeddings, axis=0).tolist()
    elif embeddings:
        merged.embedding = embeddings[0]

    return merged
215
+
216
+ def _resolve_conflict(
217
+ self,
218
+ key: str,
219
+ values: List[Any],
220
+ entities: List[Entity],
221
+ strategy: str,
222
+ ) -> Any:
223
+ """
224
+ Resolve a single property conflict using specified strategy
225
+
226
+ Args:
227
+ key: Property key
228
+ values: Conflicting values
229
+ entities: Entities that have these values
230
+ strategy: Resolution strategy
231
+
232
+ Returns:
233
+ Resolved value
234
+ """
235
+ if strategy == "most_complete":
236
+ # Prefer non-empty, non-None values
237
+ # Prefer longer strings
238
+ non_empty = [v for v in values if v not in (None, "", [], {})]
239
+ if non_empty:
240
+ # If strings, prefer longest
241
+ if all(isinstance(v, str) for v in non_empty):
242
+ return max(non_empty, key=len)
243
+ return non_empty[0]
244
+ return values[0]
245
+
246
+ elif strategy == "most_recent":
247
+ # Prefer value from entity with most recent timestamp
248
+ timestamps = []
249
+ for entity in entities:
250
+ prov = entity.properties.get("_provenance", {})
251
+ if isinstance(prov, dict) and "timestamp" in prov:
252
+ timestamps.append(prov["timestamp"])
253
+ else:
254
+ timestamps.append(0) # No timestamp = oldest
255
+
256
+ if timestamps:
257
+ most_recent_idx = timestamps.index(max(timestamps))
258
+ return values[most_recent_idx]
259
+ return values[0]
260
+
261
+ elif strategy == "most_confident":
262
+ # Prefer value from entity with highest confidence
263
+ confidences = []
264
+ for entity in entities:
265
+ prov = entity.properties.get("_provenance", {})
266
+ if isinstance(prov, dict) and "confidence" in prov:
267
+ confidences.append(prov["confidence"])
268
+ else:
269
+ confidences.append(0.0) # No confidence = lowest
270
+
271
+ if confidences:
272
+ most_confident_idx = confidences.index(max(confidences))
273
+ return values[most_confident_idx]
274
+ return values[0]
275
+
276
+ elif strategy == "longest":
277
+ # Prefer longest value (for strings)
278
+ if all(isinstance(v, str) for v in values):
279
+ return max(values, key=len)
280
+ return values[0]
281
+
282
+ elif strategy == "keep_all":
283
+ # Keep all values as a list
284
+ return values
285
+
286
+ else:
287
+ # Default: return first value
288
+ return values[0]
289
+
290
async def track_entity_provenance(self, entity_id: str) -> List[str]:
    """
    Get list of documents that contributed to an entity

    Sources are read from the entity's "_provenance" record first and then
    from each record in its "_provenance_merged" list. Records that are not
    dictionaries or have no "source" key are ignored.

    Args:
        entity_id: Entity ID

    Returns:
        Distinct document sources in first-seen order; an empty list when the
        graph store returns no entity for ``entity_id``
    """
    entity = await self.graph_store.get_entity(entity_id)
    if not entity:
        return []

    # Single provenance first, merged provenances after it
    records = [entity.properties.get("_provenance")]
    merged_provs = entity.properties.get("_provenance_merged")
    if isinstance(merged_provs, list):
        records.extend(merged_provs)

    sources = [
        record["source"]
        for record in records
        if isinstance(record, dict) and "source" in record
    ]

    # dict.fromkeys drops duplicates while keeping a stable, first-seen order
    return list(dict.fromkeys(sources))
321
+
322
+ # =========================================================================
323
+ # Helper Methods for Cross-Document Fusion
324
+ # =========================================================================
325
+
326
+ async def _query_entities(self, entity_types: List[str] = None) -> List[Entity]:
327
+ """
328
+ Query entities from graph store
329
+
330
+ Args:
331
+ entity_types: Optional list of entity types to query
332
+
333
+ Returns:
334
+ List of entities
335
+ """
336
+ entities = []
337
+
338
+ # Check if graph store has get_all_entities method
339
+ if hasattr(self.graph_store, "get_all_entities"):
340
+ if entity_types:
341
+ # Query each type separately
342
+ for entity_type in entity_types:
343
+ type_entities = await self.graph_store.get_all_entities(entity_type=entity_type)
344
+ entities.extend(type_entities)
345
+ else:
346
+ # Query all entities
347
+ entities = await self.graph_store.get_all_entities()
348
+ else:
349
+ # Fallback: graph store doesn't support bulk queries
350
+ # This is a limitation - we can't efficiently query all entities
351
+ # In this case, return empty list
352
+ # Note: Implementations should add get_all_entities() method
353
+ pass
354
+
355
+ return entities
356
+
357
+ def _group_entities_by_type(self, entities: List[Entity]) -> Dict[str, List[Entity]]:
358
+ """
359
+ Group entities by their type
360
+
361
+ Args:
362
+ entities: List of entities
363
+
364
+ Returns:
365
+ Dictionary mapping entity type to list of entities
366
+ """
367
+ entities_by_type: Dict[str, List[Entity]] = {}
368
+
369
+ for entity in entities:
370
+ entity_type = entity.entity_type
371
+ if entity_type not in entities_by_type:
372
+ entities_by_type[entity_type] = []
373
+ entities_by_type[entity_type].append(entity)
374
+
375
+ return entities_by_type
376
+
377
+ async def _find_merge_groups(self, entities: List[Entity]) -> List[List[Entity]]:
378
+ """
379
+ Find groups of entities that should be merged together
380
+
381
+ Uses similarity matching to identify clusters of similar entities.
382
+ Entities are grouped using connected components algorithm.
383
+
384
+ Args:
385
+ entities: List of entities (all same type)
386
+
387
+ Returns:
388
+ List of merge groups (each group is a list of entities)
389
+ """
390
+ if len(entities) < 2:
391
+ return []
392
+
393
+ # Build similarity graph
394
+ n = len(entities)
395
+ similar_pairs: Set[Tuple[int, int]] = set()
396
+
397
+ # Compare all pairs
398
+ for i in range(n):
399
+ for j in range(i + 1, n):
400
+ similarity = await self._compute_entity_similarity(entities[i], entities[j])
401
+ if similarity >= self.similarity_threshold:
402
+ similar_pairs.add((i, j))
403
+
404
+ # Find connected components (merge groups)
405
+ merge_groups = self._find_connected_components(n, similar_pairs)
406
+
407
+ # Convert indices to entities
408
+ entity_groups = []
409
+ for group_indices in merge_groups:
410
+ if len(group_indices) >= 2: # Only groups with 2+ entities
411
+ entity_group = [entities[i] for i in group_indices]
412
+ entity_groups.append(entity_group)
413
+
414
+ return entity_groups
415
+
416
+ def _find_connected_components(self, n: int, edges: Set[Tuple[int, int]]) -> List[List[int]]:
417
+ """
418
+ Find connected components in an undirected graph
419
+
420
+ Uses Union-Find (Disjoint Set Union) algorithm.
421
+
422
+ Args:
423
+ n: Number of nodes
424
+ edges: Set of edges (pairs of node indices)
425
+
426
+ Returns:
427
+ List of components (each component is a list of node indices)
428
+ """
429
+ # Initialize parent array for Union-Find
430
+ parent = list(range(n))
431
+
432
+ def find(x: int) -> int:
433
+ """Find root of x with path compression"""
434
+ if parent[x] != x:
435
+ parent[x] = find(parent[x])
436
+ return parent[x]
437
+
438
+ def union(x: int, y: int) -> None:
439
+ """Union two sets"""
440
+ root_x = find(x)
441
+ root_y = find(y)
442
+ if root_x != root_y:
443
+ parent[root_x] = root_y
444
+
445
+ # Build connected components
446
+ for i, j in edges:
447
+ union(i, j)
448
+
449
+ # Group nodes by their root
450
+ components: Dict[int, List[int]] = {}
451
+ for i in range(n):
452
+ root = find(i)
453
+ if root not in components:
454
+ components[root] = []
455
+ components[root].append(i)
456
+
457
+ return list(components.values())
458
+
459
async def _compute_entity_similarity(self, entity1: Entity, entity2: Entity) -> float:
    """
    Score how similar two entities are

    Delegates to ``EntityDeduplicator._compute_similarity``, using a
    deduplicator constructed with this instance's
    ``similarity_threshold``.

    Args:
        entity1: First entity
        entity2: Second entity

    Returns:
        Similarity score (0.0-1.0)
    """
    # A fresh deduplicator is built on every call, mirroring the threshold
    # configured on this instance
    scorer = EntityDeduplicator(similarity_threshold=self.similarity_threshold)
    score = await scorer._compute_similarity(entity1, entity2)
    return score
475
+
476
async def _merge_entity_group(self, entities: List[Entity]) -> None:
    """
    Collapse a group of entities into one canonical entity

    Steps:
    1. Build the merged entity via ``resolve_property_conflicts``
    2. Store it in the graph under the first entity's ID
    3. Redirect relations of the other entities to that ID
    4. Delete the other entities and bump ``entities_merged``

    Args:
        entities: List of entities to merge (2 or more); shorter
            lists are ignored
    """
    group_size = len(entities)
    if group_size < 2:
        return

    store = self.graph_store

    # Step 1: resolve conflicts, then record provenance. The source IDs are
    # captured before the merged entity's own ID is overwritten below.
    merged = await self.resolve_property_conflicts(entities)
    merged.properties["_merged_from"] = [member.id for member in entities]
    merged.properties["_merge_count"] = group_size

    # Step 2: the first entity's ID becomes the canonical one
    canonical_id = entities[0].id
    merged.id = canonical_id

    if hasattr(store, "update_entity"):
        await store.update_entity(merged)
    elif hasattr(store, "entities"):
        # No update API: write straight into the store's entity mapping
        # (the InMemoryGraphStore case) and mirror it onto the graph node
        store.entities[canonical_id] = merged
        if hasattr(store, "graph") and store.graph:
            store.graph.nodes[canonical_id]["entity"] = merged
    else:
        # Last resort: try to add; an already-existing entity is tolerated
        try:
            await store.add_entity(merged)
        except ValueError:
            pass

    # Step 3: redirect relations to the canonical entity
    await self._update_relations_for_merge(entities, canonical_id)

    # Step 4: remove every entity except the canonical one, when the store
    # supports deletion
    if hasattr(store, "delete_entity"):
        for duplicate in entities[1:]:
            await store.delete_entity(duplicate.id)

    # All but the canonical entity count as merged away
    self.entities_merged += group_size - 1
535
+
536
async def _update_relations_for_merge(
    self, merged_entities: List[Entity], canonical_id: str
) -> None:
    """
    Redirect relations of merged entities to the canonical entity

    For each merged entity other than the canonical one:
    - Outgoing relations get ``source_id`` set to ``canonical_id`` and
      are re-added to the graph store
    - Incoming relations get ``target_id`` set to ``canonical_id`` and
      are re-added to the graph store

    Each direction is only processed when the graph store exposes the
    matching accessor (``get_outgoing_relations`` /
    ``get_incoming_relations``); a store with neither is left untouched.

    Duplicate relations that result from the redirect are NOT removed
    here.

    Args:
        merged_entities: List of entities that were merged
        canonical_id: ID of canonical entity
    """
    store = self.graph_store

    # Capability checks do not depend on the entity, so do them once
    can_read_outgoing = hasattr(store, "get_outgoing_relations")
    can_read_incoming = hasattr(store, "get_incoming_relations")
    if not (can_read_outgoing or can_read_incoming):
        # Basic store interface: relations cannot be enumerated per entity,
        # so there is nothing this method can redirect
        return

    for entity in merged_entities:
        if entity.id == canonical_id:
            continue

        if can_read_outgoing:
            # Snapshot first: add_relation may touch the collection being
            # iterated if the store hands back a live view (defensive)
            outgoing = list(await store.get_outgoing_relations(entity.id))
            for relation in outgoing:
                relation.source_id = canonical_id
                await store.add_relation(relation)

        if can_read_incoming:
            incoming = list(await store.get_incoming_relations(entity.id))
            for relation in incoming:
                relation.target_id = canonical_id
                await store.add_relation(relation)