alita-sdk 0.3.462__py3-none-any.whl → 0.3.627__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (261)
  1. alita_sdk/cli/agent/__init__.py +5 -0
  2. alita_sdk/cli/agent/default.py +258 -0
  3. alita_sdk/cli/agent_executor.py +15 -3
  4. alita_sdk/cli/agent_loader.py +56 -8
  5. alita_sdk/cli/agent_ui.py +93 -31
  6. alita_sdk/cli/agents.py +2274 -230
  7. alita_sdk/cli/callbacks.py +96 -25
  8. alita_sdk/cli/cli.py +10 -1
  9. alita_sdk/cli/config.py +162 -9
  10. alita_sdk/cli/context/__init__.py +30 -0
  11. alita_sdk/cli/context/cleanup.py +198 -0
  12. alita_sdk/cli/context/manager.py +731 -0
  13. alita_sdk/cli/context/message.py +285 -0
  14. alita_sdk/cli/context/strategies.py +289 -0
  15. alita_sdk/cli/context/token_estimation.py +127 -0
  16. alita_sdk/cli/input_handler.py +419 -0
  17. alita_sdk/cli/inventory.py +1073 -0
  18. alita_sdk/cli/testcases/__init__.py +94 -0
  19. alita_sdk/cli/testcases/data_generation.py +119 -0
  20. alita_sdk/cli/testcases/discovery.py +96 -0
  21. alita_sdk/cli/testcases/executor.py +84 -0
  22. alita_sdk/cli/testcases/logger.py +85 -0
  23. alita_sdk/cli/testcases/parser.py +172 -0
  24. alita_sdk/cli/testcases/prompts.py +91 -0
  25. alita_sdk/cli/testcases/reporting.py +125 -0
  26. alita_sdk/cli/testcases/setup.py +108 -0
  27. alita_sdk/cli/testcases/test_runner.py +282 -0
  28. alita_sdk/cli/testcases/utils.py +39 -0
  29. alita_sdk/cli/testcases/validation.py +90 -0
  30. alita_sdk/cli/testcases/workflow.py +196 -0
  31. alita_sdk/cli/toolkit.py +14 -17
  32. alita_sdk/cli/toolkit_loader.py +35 -5
  33. alita_sdk/cli/tools/__init__.py +36 -2
  34. alita_sdk/cli/tools/approval.py +224 -0
  35. alita_sdk/cli/tools/filesystem.py +910 -64
  36. alita_sdk/cli/tools/planning.py +389 -0
  37. alita_sdk/cli/tools/terminal.py +414 -0
  38. alita_sdk/community/__init__.py +72 -12
  39. alita_sdk/community/inventory/__init__.py +236 -0
  40. alita_sdk/community/inventory/config.py +257 -0
  41. alita_sdk/community/inventory/enrichment.py +2137 -0
  42. alita_sdk/community/inventory/extractors.py +1469 -0
  43. alita_sdk/community/inventory/ingestion.py +3172 -0
  44. alita_sdk/community/inventory/knowledge_graph.py +1457 -0
  45. alita_sdk/community/inventory/parsers/__init__.py +218 -0
  46. alita_sdk/community/inventory/parsers/base.py +295 -0
  47. alita_sdk/community/inventory/parsers/csharp_parser.py +907 -0
  48. alita_sdk/community/inventory/parsers/go_parser.py +851 -0
  49. alita_sdk/community/inventory/parsers/html_parser.py +389 -0
  50. alita_sdk/community/inventory/parsers/java_parser.py +593 -0
  51. alita_sdk/community/inventory/parsers/javascript_parser.py +629 -0
  52. alita_sdk/community/inventory/parsers/kotlin_parser.py +768 -0
  53. alita_sdk/community/inventory/parsers/markdown_parser.py +362 -0
  54. alita_sdk/community/inventory/parsers/python_parser.py +604 -0
  55. alita_sdk/community/inventory/parsers/rust_parser.py +858 -0
  56. alita_sdk/community/inventory/parsers/swift_parser.py +832 -0
  57. alita_sdk/community/inventory/parsers/text_parser.py +322 -0
  58. alita_sdk/community/inventory/parsers/yaml_parser.py +370 -0
  59. alita_sdk/community/inventory/patterns/__init__.py +61 -0
  60. alita_sdk/community/inventory/patterns/ast_adapter.py +380 -0
  61. alita_sdk/community/inventory/patterns/loader.py +348 -0
  62. alita_sdk/community/inventory/patterns/registry.py +198 -0
  63. alita_sdk/community/inventory/presets.py +535 -0
  64. alita_sdk/community/inventory/retrieval.py +1403 -0
  65. alita_sdk/community/inventory/toolkit.py +173 -0
  66. alita_sdk/community/inventory/toolkit_utils.py +176 -0
  67. alita_sdk/community/inventory/visualize.py +1370 -0
  68. alita_sdk/configurations/__init__.py +1 -1
  69. alita_sdk/configurations/ado.py +141 -20
  70. alita_sdk/configurations/bitbucket.py +0 -3
  71. alita_sdk/configurations/confluence.py +76 -42
  72. alita_sdk/configurations/figma.py +76 -0
  73. alita_sdk/configurations/gitlab.py +17 -5
  74. alita_sdk/configurations/openapi.py +329 -0
  75. alita_sdk/configurations/qtest.py +72 -1
  76. alita_sdk/configurations/report_portal.py +96 -0
  77. alita_sdk/configurations/sharepoint.py +148 -0
  78. alita_sdk/configurations/testio.py +83 -0
  79. alita_sdk/runtime/clients/artifact.py +3 -3
  80. alita_sdk/runtime/clients/client.py +353 -48
  81. alita_sdk/runtime/clients/sandbox_client.py +0 -21
  82. alita_sdk/runtime/langchain/_constants_bkup.py +1318 -0
  83. alita_sdk/runtime/langchain/assistant.py +123 -26
  84. alita_sdk/runtime/langchain/constants.py +642 -1
  85. alita_sdk/runtime/langchain/document_loaders/AlitaExcelLoader.py +103 -60
  86. alita_sdk/runtime/langchain/document_loaders/AlitaJSONLinesLoader.py +77 -0
  87. alita_sdk/runtime/langchain/document_loaders/AlitaJSONLoader.py +6 -3
  88. alita_sdk/runtime/langchain/document_loaders/AlitaPowerPointLoader.py +226 -7
  89. alita_sdk/runtime/langchain/document_loaders/AlitaTextLoader.py +5 -2
  90. alita_sdk/runtime/langchain/document_loaders/constants.py +12 -7
  91. alita_sdk/runtime/langchain/langraph_agent.py +279 -73
  92. alita_sdk/runtime/langchain/utils.py +82 -15
  93. alita_sdk/runtime/llms/preloaded.py +2 -6
  94. alita_sdk/runtime/skills/__init__.py +91 -0
  95. alita_sdk/runtime/skills/callbacks.py +498 -0
  96. alita_sdk/runtime/skills/discovery.py +540 -0
  97. alita_sdk/runtime/skills/executor.py +610 -0
  98. alita_sdk/runtime/skills/input_builder.py +371 -0
  99. alita_sdk/runtime/skills/models.py +330 -0
  100. alita_sdk/runtime/skills/registry.py +355 -0
  101. alita_sdk/runtime/skills/skill_runner.py +330 -0
  102. alita_sdk/runtime/toolkits/__init__.py +7 -0
  103. alita_sdk/runtime/toolkits/application.py +21 -9
  104. alita_sdk/runtime/toolkits/artifact.py +15 -5
  105. alita_sdk/runtime/toolkits/datasource.py +13 -6
  106. alita_sdk/runtime/toolkits/mcp.py +139 -251
  107. alita_sdk/runtime/toolkits/mcp_config.py +1048 -0
  108. alita_sdk/runtime/toolkits/planning.py +178 -0
  109. alita_sdk/runtime/toolkits/skill_router.py +238 -0
  110. alita_sdk/runtime/toolkits/subgraph.py +251 -6
  111. alita_sdk/runtime/toolkits/tools.py +238 -32
  112. alita_sdk/runtime/toolkits/vectorstore.py +11 -5
  113. alita_sdk/runtime/tools/__init__.py +3 -1
  114. alita_sdk/runtime/tools/application.py +20 -6
  115. alita_sdk/runtime/tools/artifact.py +511 -28
  116. alita_sdk/runtime/tools/data_analysis.py +183 -0
  117. alita_sdk/runtime/tools/function.py +43 -15
  118. alita_sdk/runtime/tools/image_generation.py +50 -44
  119. alita_sdk/runtime/tools/llm.py +852 -67
  120. alita_sdk/runtime/tools/loop.py +3 -1
  121. alita_sdk/runtime/tools/loop_output.py +3 -1
  122. alita_sdk/runtime/tools/mcp_remote_tool.py +25 -10
  123. alita_sdk/runtime/tools/mcp_server_tool.py +7 -6
  124. alita_sdk/runtime/tools/planning/__init__.py +36 -0
  125. alita_sdk/runtime/tools/planning/models.py +246 -0
  126. alita_sdk/runtime/tools/planning/wrapper.py +607 -0
  127. alita_sdk/runtime/tools/router.py +2 -4
  128. alita_sdk/runtime/tools/sandbox.py +9 -6
  129. alita_sdk/runtime/tools/skill_router.py +776 -0
  130. alita_sdk/runtime/tools/tool.py +3 -1
  131. alita_sdk/runtime/tools/vectorstore.py +7 -2
  132. alita_sdk/runtime/tools/vectorstore_base.py +51 -11
  133. alita_sdk/runtime/utils/AlitaCallback.py +137 -21
  134. alita_sdk/runtime/utils/constants.py +5 -1
  135. alita_sdk/runtime/utils/mcp_client.py +492 -0
  136. alita_sdk/runtime/utils/mcp_oauth.py +202 -5
  137. alita_sdk/runtime/utils/mcp_sse_client.py +36 -7
  138. alita_sdk/runtime/utils/mcp_tools_discovery.py +124 -0
  139. alita_sdk/runtime/utils/serialization.py +155 -0
  140. alita_sdk/runtime/utils/streamlit.py +6 -10
  141. alita_sdk/runtime/utils/toolkit_utils.py +16 -5
  142. alita_sdk/runtime/utils/utils.py +36 -0
  143. alita_sdk/tools/__init__.py +113 -29
  144. alita_sdk/tools/ado/repos/__init__.py +51 -33
  145. alita_sdk/tools/ado/repos/repos_wrapper.py +148 -89
  146. alita_sdk/tools/ado/test_plan/__init__.py +25 -9
  147. alita_sdk/tools/ado/test_plan/test_plan_wrapper.py +23 -1
  148. alita_sdk/tools/ado/utils.py +1 -18
  149. alita_sdk/tools/ado/wiki/__init__.py +25 -8
  150. alita_sdk/tools/ado/wiki/ado_wrapper.py +291 -22
  151. alita_sdk/tools/ado/work_item/__init__.py +26 -9
  152. alita_sdk/tools/ado/work_item/ado_wrapper.py +56 -3
  153. alita_sdk/tools/advanced_jira_mining/__init__.py +11 -8
  154. alita_sdk/tools/aws/delta_lake/__init__.py +13 -9
  155. alita_sdk/tools/aws/delta_lake/tool.py +5 -1
  156. alita_sdk/tools/azure_ai/search/__init__.py +11 -8
  157. alita_sdk/tools/azure_ai/search/api_wrapper.py +1 -1
  158. alita_sdk/tools/base/tool.py +5 -1
  159. alita_sdk/tools/base_indexer_toolkit.py +170 -45
  160. alita_sdk/tools/bitbucket/__init__.py +17 -12
  161. alita_sdk/tools/bitbucket/api_wrapper.py +59 -11
  162. alita_sdk/tools/bitbucket/cloud_api_wrapper.py +49 -35
  163. alita_sdk/tools/browser/__init__.py +5 -4
  164. alita_sdk/tools/carrier/__init__.py +5 -6
  165. alita_sdk/tools/carrier/backend_reports_tool.py +6 -6
  166. alita_sdk/tools/carrier/run_ui_test_tool.py +6 -6
  167. alita_sdk/tools/carrier/ui_reports_tool.py +5 -5
  168. alita_sdk/tools/chunkers/__init__.py +3 -1
  169. alita_sdk/tools/chunkers/code/treesitter/treesitter.py +37 -13
  170. alita_sdk/tools/chunkers/sematic/json_chunker.py +1 -0
  171. alita_sdk/tools/chunkers/sematic/markdown_chunker.py +97 -6
  172. alita_sdk/tools/chunkers/universal_chunker.py +270 -0
  173. alita_sdk/tools/cloud/aws/__init__.py +10 -7
  174. alita_sdk/tools/cloud/azure/__init__.py +10 -7
  175. alita_sdk/tools/cloud/gcp/__init__.py +10 -7
  176. alita_sdk/tools/cloud/k8s/__init__.py +10 -7
  177. alita_sdk/tools/code/linter/__init__.py +10 -8
  178. alita_sdk/tools/code/loaders/codesearcher.py +3 -2
  179. alita_sdk/tools/code/sonar/__init__.py +10 -7
  180. alita_sdk/tools/code_indexer_toolkit.py +73 -23
  181. alita_sdk/tools/confluence/__init__.py +21 -15
  182. alita_sdk/tools/confluence/api_wrapper.py +78 -23
  183. alita_sdk/tools/confluence/loader.py +4 -2
  184. alita_sdk/tools/custom_open_api/__init__.py +12 -5
  185. alita_sdk/tools/elastic/__init__.py +11 -8
  186. alita_sdk/tools/elitea_base.py +493 -30
  187. alita_sdk/tools/figma/__init__.py +58 -11
  188. alita_sdk/tools/figma/api_wrapper.py +1235 -143
  189. alita_sdk/tools/figma/figma_client.py +73 -0
  190. alita_sdk/tools/figma/toon_tools.py +2748 -0
  191. alita_sdk/tools/github/__init__.py +13 -14
  192. alita_sdk/tools/github/github_client.py +224 -100
  193. alita_sdk/tools/github/graphql_client_wrapper.py +119 -33
  194. alita_sdk/tools/github/schemas.py +14 -5
  195. alita_sdk/tools/github/tool.py +5 -1
  196. alita_sdk/tools/github/tool_prompts.py +9 -22
  197. alita_sdk/tools/gitlab/__init__.py +15 -11
  198. alita_sdk/tools/gitlab/api_wrapper.py +207 -41
  199. alita_sdk/tools/gitlab_org/__init__.py +10 -8
  200. alita_sdk/tools/gitlab_org/api_wrapper.py +63 -64
  201. alita_sdk/tools/google/bigquery/__init__.py +13 -12
  202. alita_sdk/tools/google/bigquery/tool.py +5 -1
  203. alita_sdk/tools/google_places/__init__.py +10 -8
  204. alita_sdk/tools/google_places/api_wrapper.py +1 -1
  205. alita_sdk/tools/jira/__init__.py +17 -11
  206. alita_sdk/tools/jira/api_wrapper.py +91 -40
  207. alita_sdk/tools/keycloak/__init__.py +11 -8
  208. alita_sdk/tools/localgit/__init__.py +9 -3
  209. alita_sdk/tools/localgit/local_git.py +62 -54
  210. alita_sdk/tools/localgit/tool.py +5 -1
  211. alita_sdk/tools/memory/__init__.py +11 -3
  212. alita_sdk/tools/non_code_indexer_toolkit.py +1 -0
  213. alita_sdk/tools/ocr/__init__.py +11 -8
  214. alita_sdk/tools/openapi/__init__.py +490 -114
  215. alita_sdk/tools/openapi/api_wrapper.py +1368 -0
  216. alita_sdk/tools/openapi/tool.py +20 -0
  217. alita_sdk/tools/pandas/__init__.py +20 -12
  218. alita_sdk/tools/pandas/api_wrapper.py +38 -25
  219. alita_sdk/tools/pandas/dataframe/generator/base.py +3 -1
  220. alita_sdk/tools/postman/__init__.py +11 -11
  221. alita_sdk/tools/pptx/__init__.py +10 -9
  222. alita_sdk/tools/pptx/pptx_wrapper.py +1 -1
  223. alita_sdk/tools/qtest/__init__.py +30 -10
  224. alita_sdk/tools/qtest/api_wrapper.py +430 -13
  225. alita_sdk/tools/rally/__init__.py +10 -8
  226. alita_sdk/tools/rally/api_wrapper.py +1 -1
  227. alita_sdk/tools/report_portal/__init__.py +12 -9
  228. alita_sdk/tools/salesforce/__init__.py +10 -9
  229. alita_sdk/tools/servicenow/__init__.py +17 -14
  230. alita_sdk/tools/servicenow/api_wrapper.py +1 -1
  231. alita_sdk/tools/sharepoint/__init__.py +10 -8
  232. alita_sdk/tools/sharepoint/api_wrapper.py +4 -4
  233. alita_sdk/tools/slack/__init__.py +10 -8
  234. alita_sdk/tools/slack/api_wrapper.py +2 -2
  235. alita_sdk/tools/sql/__init__.py +11 -9
  236. alita_sdk/tools/testio/__init__.py +10 -8
  237. alita_sdk/tools/testrail/__init__.py +11 -8
  238. alita_sdk/tools/testrail/api_wrapper.py +1 -1
  239. alita_sdk/tools/utils/__init__.py +9 -4
  240. alita_sdk/tools/utils/content_parser.py +77 -3
  241. alita_sdk/tools/utils/text_operations.py +410 -0
  242. alita_sdk/tools/utils/tool_prompts.py +79 -0
  243. alita_sdk/tools/vector_adapters/VectorStoreAdapter.py +17 -13
  244. alita_sdk/tools/xray/__init__.py +12 -9
  245. alita_sdk/tools/yagmail/__init__.py +9 -3
  246. alita_sdk/tools/zephyr/__init__.py +9 -7
  247. alita_sdk/tools/zephyr_enterprise/__init__.py +11 -8
  248. alita_sdk/tools/zephyr_essential/__init__.py +10 -8
  249. alita_sdk/tools/zephyr_essential/api_wrapper.py +30 -13
  250. alita_sdk/tools/zephyr_essential/client.py +2 -2
  251. alita_sdk/tools/zephyr_scale/__init__.py +11 -9
  252. alita_sdk/tools/zephyr_scale/api_wrapper.py +2 -2
  253. alita_sdk/tools/zephyr_squad/__init__.py +10 -8
  254. {alita_sdk-0.3.462.dist-info → alita_sdk-0.3.627.dist-info}/METADATA +147 -7
  255. alita_sdk-0.3.627.dist-info/RECORD +468 -0
  256. alita_sdk-0.3.627.dist-info/entry_points.txt +2 -0
  257. alita_sdk-0.3.462.dist-info/RECORD +0 -384
  258. alita_sdk-0.3.462.dist-info/entry_points.txt +0 -2
  259. {alita_sdk-0.3.462.dist-info → alita_sdk-0.3.627.dist-info}/WHEEL +0 -0
  260. {alita_sdk-0.3.462.dist-info → alita_sdk-0.3.627.dist-info}/licenses/LICENSE +0 -0
  261. {alita_sdk-0.3.462.dist-info → alita_sdk-0.3.627.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,2137 @@
1
+ """
2
+ Knowledge Graph Enrichment Utilities.
3
+
4
+ Post-processing tools to improve graph connectivity by:
5
+ 1. Soft entity deduplication (merging same/similar entities with different types)
6
+ 2. Linking semantically similar entities across sources
7
+ 3. Creating cross-reference relationships (implements, documents, etc.)
8
+ 4. Connecting orphan nodes to parent concepts
9
+
10
+ Usage:
11
+ from alita_sdk.community.inventory.enrichment import GraphEnricher
12
+
13
+ enricher = GraphEnricher(graph_path="./graph.json")
14
+ enricher.enrich()
15
+ enricher.save()
16
+ """
17
+
18
+ import json
19
+ import logging
20
+ import re
21
+ import hashlib
22
+ from collections import defaultdict
23
+ from pathlib import Path
24
+ from typing import Dict, List, Set, Tuple, Optional, Any
25
+ from difflib import SequenceMatcher
26
+
27
+ logger = logging.getLogger(__name__)
28
+
29
+
30
+ # ============================================================================
31
+ # TYPE NORMALIZATION FOR ENRICHMENT
32
+ # ============================================================================
33
+
34
+ # Comprehensive type consolidation map
35
+ # Maps many ad-hoc LLM types to a smaller set of canonical types
36
+ # NOTE: All keys should be lowercase - normalize_type() lowercases input first
37
+ TYPE_NORMALIZATION_MAP = {
38
+ # ==========================================================================
39
+ # IDENTITY MAPPINGS - Types that MUST be preserved as-is
40
+ # ==========================================================================
41
+ "fact": "fact",
42
+ "source_file": "source_file",
43
+ "feature": "feature",
44
+ "module": "module",
45
+ "constant": "constant",
46
+ "rule": "rule",
47
+ "parameter": "parameter",
48
+ "error_handling": "error_handling",
49
+ "todo": "todo",
50
+ "property": "property",
51
+ "configuration": "configuration",
52
+ "process": "process",
53
+ "integration": "integration",
54
+ "interface": "interface",
55
+ "user_story": "user_story",
56
+ "test": "test",
57
+ "variable": "variable",
58
+ "function": "function",
59
+
60
+ # ==========================================================================
61
+ # CODE STRUCTURE FAMILY → map to preserved types
62
+ # ==========================================================================
63
+ "named": "export",
64
+ "default": "export",
65
+ "business_rule": "rule",
66
+ "domain_concept": "concept",
67
+ "business_concept": "concept",
68
+ "integration_point": "integration",
69
+ "user_interface_element": "interface",
70
+ "user_interface_component": "interface",
71
+ "user_interaction": "interface",
72
+ "user_action": "interface",
73
+ "api_contract": "rest_api",
74
+ "technical_debt": "todo",
75
+ "test_scenario": "test",
76
+ "test_case": "test",
77
+ "tooltype": "tool",
78
+
79
+ # ==========================================================================
80
+ # TOOL & TOOLKIT FAMILY → tool, toolkit
81
+ # ==========================================================================
82
+ "tool": "tool",
83
+ "tools": "tool",
84
+ "tool_used": "tool",
85
+ "tool_example": "tool",
86
+ "tool_category": "tool",
87
+ "internal_tool": "tool",
88
+ "documentationtool": "tool",
89
+ "toolkit": "toolkit",
90
+ "toolkits": "toolkit",
91
+ "toolkit_type": "toolkit",
92
+
93
+ # ==========================================================================
94
+ # FEATURE & CAPABILITY FAMILY → feature
95
+ # ==========================================================================
96
+ "features": "feature",
97
+ "functionality": "feature",
98
+ "capability": "feature",
99
+ "benefit": "feature",
100
+ "characteristic": "feature",
101
+
102
+ # ==========================================================================
103
+ # PROCESS & WORKFLOW FAMILY → process
104
+ # ==========================================================================
105
+ "processes": "process",
106
+ "procedure": "process",
107
+ "workflow": "workflow",
108
+ "flow": "process",
109
+ "pipeline": "process",
110
+
111
+ # ==========================================================================
112
+ # CONCEPT & ENTITY FAMILY → concept
113
+ # ==========================================================================
114
+ "concept": "concept",
115
+ "concepts": "concept",
116
+ "entity": "entity",
117
+ "entities": "entity",
118
+ "entity_type": "entity",
119
+ "entitytype": "entity",
120
+ "domain_entity": "entity",
121
+ "domain": "concept",
122
+ "topic": "concept",
123
+ "term": "concept",
124
+ "glossary_term": "concept",
125
+ "key_concept": "concept",
126
+
127
+ # ==========================================================================
128
+ # CONFIGURATION FAMILY → configuration
129
+ # ==========================================================================
130
+ "config": "configuration",
131
+ "configuration_section": "configuration",
132
+ "configuration_field": "configuration",
133
+ "configuration_option": "configuration",
134
+ "configuration_file": "configuration",
135
+ "configurationfile": "configuration",
136
+ "configurationchange": "configuration",
137
+ "configuration_command": "configuration",
138
+ "setting": "configuration",
139
+ "environment": "configuration",
140
+
141
+ # ==========================================================================
142
+ # DOCUMENTATION & GUIDE FAMILY → documentation
143
+ # ==========================================================================
144
+ "documentation": "documentation",
145
+ "documentation_section": "documentation",
146
+ "documentation_template": "documentation",
147
+ "guide": "documentation",
148
+ "guideline": "documentation",
149
+ "instruction": "documentation",
150
+ "tip": "documentation",
151
+ "note": "documentation",
152
+ "faq": "documentation",
153
+ "overview": "documentation",
154
+ "summary": "documentation",
155
+ "best_practice": "documentation",
156
+
157
+ # ==========================================================================
158
+ # SECTION & STRUCTURE FAMILY → section
159
+ # ==========================================================================
160
+ "section": "section",
161
+ "sections": "section",
162
+ "interface_section": "section",
163
+ "navigation_structure": "section",
164
+ "navigation_group": "section",
165
+ "navigation": "section",
166
+
167
+ # ==========================================================================
168
+ # COMPONENT & UI FAMILY → component
169
+ # ==========================================================================
170
+ "component": "component",
171
+ "components": "component",
172
+ "ui_component": "component",
173
+ "ui_element": "component",
174
+ "ui_layout": "component",
175
+ "interface_element": "component",
176
+ "button": "component",
177
+ "menu": "component",
178
+ "tab": "component",
179
+ "panel": "component",
180
+ "editor": "component",
181
+ "view": "component",
182
+
183
+ # ==========================================================================
184
+ # ISSUE & PROBLEM FAMILY → issue
185
+ # ==========================================================================
186
+ "issue": "issue",
187
+ "issues": "issue",
188
+ "issue_type": "issue",
189
+ "issuetype": "issue",
190
+ "known_issue": "issue",
191
+ "fixed_issue": "issue",
192
+ "limitation": "issue",
193
+ "challenge": "issue",
194
+ "problem": "issue",
195
+ "error_message": "issue",
196
+ "troubleshooting": "issue",
197
+ "compatibilityissue": "issue",
198
+
199
+ # ==========================================================================
200
+ # ACTION & COMMAND FAMILY → action
201
+ # ==========================================================================
202
+ "action": "action",
203
+ "actions": "action",
204
+ "command": "action",
205
+ "operation": "action",
206
+ "task": "action",
207
+ "trigger": "action",
208
+ "automation_rule": "action",
209
+
210
+ # ==========================================================================
211
+ # PARAMETER & FIELD FAMILY → parameter
212
+ # ==========================================================================
213
+ "parameters": "parameter",
214
+ "field": "parameter",
215
+ "field_identifier": "parameter",
216
+ "placeholder": "parameter",
217
+ "value": "parameter",
218
+ "label": "parameter",
219
+ "tag": "parameter",
220
+
221
+ # ==========================================================================
222
+ # CREDENTIAL & AUTH FAMILY → credential
223
+ # ==========================================================================
224
+ "credential": "credential",
225
+ "credential_type": "credential",
226
+ "secret": "credential",
227
+ "token": "credential",
228
+ "api_key": "credential",
229
+ "api_token": "credential",
230
+ "key": "credential",
231
+ "authentication": "credential",
232
+ "authentication_method": "credential",
233
+ "permission": "credential",
234
+ "access_control": "credential",
235
+ "access_requirement": "credential",
236
+
237
+ # ==========================================================================
238
+ # RESOURCE & FILE FAMILY → resource
239
+ # ==========================================================================
240
+ "resource": "resource",
241
+ "resources": "resource",
242
+ "file": "resource",
243
+ "file_type": "resource",
244
+ "file_format": "resource",
245
+ "file_path": "resource",
246
+ "folder": "resource",
247
+ "artifact": "resource",
248
+ "artifact_type": "resource",
249
+ "document": "resource",
250
+ "template": "resource",
251
+ "script": "resource",
252
+
253
+ # ==========================================================================
254
+ # PLATFORM & SOFTWARE FAMILY → platform
255
+ # ==========================================================================
256
+ "platform": "platform",
257
+ "platforms": "platform",
258
+ "software": "platform",
259
+ "softwareversion": "platform",
260
+ "application": "platform",
261
+ "app": "platform",
262
+ "system": "platform",
263
+ "framework": "platform",
264
+ "library": "platform",
265
+ "technology": "platform",
266
+ "product": "platform",
267
+
268
+ # ==========================================================================
269
+ # SERVICE & API FAMILY → Keep distinct types for different communication patterns
270
+ # ==========================================================================
271
+ "service": "service",
272
+ "services": "service",
273
+ "microservice": "service",
274
+ "web_service": "service",
275
+ "server": "service",
276
+ "client": "service",
277
+ "hostingservice": "service",
278
+
279
+ # REST API (do NOT normalize to generic 'service')
280
+ "rest api": "rest_api",
281
+ "rest_api": "rest_api",
282
+ "restapi": "rest_api",
283
+ "rest": "rest_api",
284
+ "api": "rest_api",
285
+ "openapi": "rest_api",
286
+ "swagger": "rest_api",
287
+ "rest endpoint": "rest_endpoint",
288
+ "rest_endpoint": "rest_endpoint",
289
+ "endpoint": "rest_endpoint",
290
+ "api_endpoint": "rest_endpoint",
291
+ "http_endpoint": "rest_endpoint",
292
+ "rest_resource": "rest_resource",
293
+
294
+ # GraphQL (do NOT normalize to 'service')
295
+ "graphql api": "graphql_api",
296
+ "graphql_api": "graphql_api",
297
+ "graphql": "graphql_api",
298
+ "graphql_schema": "graphql_api",
299
+ "graphql query": "graphql_query",
300
+ "graphql_query": "graphql_query",
301
+ "query": "graphql_query",
302
+ "graphql mutation": "graphql_mutation",
303
+ "graphql_mutation": "graphql_mutation",
304
+ "mutation": "graphql_mutation",
305
+ "graphql subscription": "graphql_subscription",
306
+ "graphql_subscription": "graphql_subscription",
307
+ "subscription": "graphql_subscription",
308
+ "graphql type": "graphql_type",
309
+ "graphql_type": "graphql_type",
310
+
311
+ # gRPC (do NOT normalize to 'service')
312
+ "grpc service": "grpc_service",
313
+ "grpc_service": "grpc_service",
314
+ "grpc": "grpc_service",
315
+ "grpc method": "grpc_method",
316
+ "grpc_method": "grpc_method",
317
+ "rpc_method": "grpc_method",
318
+ "protobuf_message": "protobuf_message",
319
+ "protobuf": "protobuf_message",
320
+ "proto_message": "protobuf_message",
321
+ "protocol buffer": "protobuf_message",
322
+
323
+ # Event-Driven Architecture (do NOT normalize to 'service')
324
+ "event bus": "event_bus",
325
+ "event_bus": "event_bus",
326
+ "message_broker": "event_bus",
327
+ "message_queue": "event_bus",
328
+ "kafka": "event_bus",
329
+ "rabbitmq": "event_bus",
330
+ "event type": "event_type",
331
+ "event_type": "event_type",
332
+ "event": "event_type",
333
+ "message_type": "event_type",
334
+ "event producer": "event_producer",
335
+ "event_producer": "event_producer",
336
+ "publisher": "event_producer",
337
+ "event consumer": "event_consumer",
338
+ "event_consumer": "event_consumer",
339
+ "subscriber": "event_consumer",
340
+ "listener": "event_consumer",
341
+ "event handler": "event_handler",
342
+ "event_handler": "event_handler",
343
+ "message_handler": "event_handler",
344
+ "handler": "event_handler",
345
+
346
+ # ==========================================================================
347
+ # INTEGRATION & CONNECTION FAMILY → integration
348
+ # ==========================================================================
349
+ "integrations": "integration",
350
+ "connection": "integration",
351
+ "connection_type": "integration",
352
+ "connector": "integration",
353
+ "adapter": "integration",
354
+ "datasource": "integration",
355
+ "database": "integration",
356
+
357
+ # ==========================================================================
358
+ # EXAMPLE & USE CASE FAMILY → example
359
+ # ==========================================================================
360
+ "example": "example",
361
+ "examples": "example",
362
+ "example_type": "example",
363
+ "example_request": "example",
364
+ "use_case": "example",
365
+ "use_case_category": "example",
366
+ "code_sample": "example",
367
+ "sample_prompt": "example",
368
+
369
+ # ==========================================================================
370
+ # NODE & GRAPH FAMILY → node
371
+ # ==========================================================================
372
+ "node": "node",
373
+ "nodetype": "node",
374
+ "node_type": "node",
375
+ "execution_node": "node",
376
+ "iteration_node": "node",
377
+ "interaction_node": "node",
378
+ "utilitynode": "node",
379
+
380
+ # ==========================================================================
381
+ # STEP & PROCEDURE FAMILY → step
382
+ # ==========================================================================
383
+ "step": "step",
384
+ "steps": "step",
385
+ "number_of_step": "step",
386
+ "prerequisite": "step",
387
+
388
+ # ==========================================================================
389
+ # STATUS & STATE FAMILY → status
390
+ # ==========================================================================
391
+ "status": "status",
392
+ "state": "status",
393
+ "state_type": "status",
394
+ "mode": "status",
395
+ "session_mode": "status",
396
+
397
+ # ==========================================================================
398
+ # PROJECT & WORKSPACE FAMILY → project
399
+ # ==========================================================================
400
+ "project": "project",
401
+ "workspace": "project",
402
+ "project_scope": "project",
403
+ "repository": "project",
404
+ "space": "project",
405
+
406
+ # ==========================================================================
407
+ # ROLE & USER FAMILY → role
408
+ # ==========================================================================
409
+ "role": "role",
410
+ "user_role": "role",
411
+ "team": "role",
412
+ "person": "role",
413
+ "audience": "role",
414
+ "stakeholder": "role",
415
+ "owner": "role",
416
+
417
+ # ==========================================================================
418
+ # AGENT FAMILY → agent
419
+ # ==========================================================================
420
+ "agent": "agent",
421
+ "agents": "agent",
422
+ "agent_type": "agent",
423
+ "agent_configuration": "agent",
424
+ "ai_agent": "agent",
425
+ "public_agent": "agent",
426
+
427
+ # ==========================================================================
428
+ # DATA & TYPE FAMILY → data_type
429
+ # ==========================================================================
430
+ "data_type": "data_type",
431
+ "datatype": "data_type",
432
+ "data_structure": "data_type",
433
+ "schema": "data_type",
434
+ "format": "data_type",
435
+ "content_type": "data_type",
436
+ "collection": "data_type",
437
+ "collectiontype": "data_type",
438
+ "list": "data_type",
439
+ "table": "data_type",
440
+
441
+ # ==========================================================================
442
+ # RELEASE & VERSION FAMILY → release
443
+ # ==========================================================================
444
+ "release": "release",
445
+ "version": "release",
446
+ "change": "release",
447
+ "feature_change": "release",
448
+ "migration": "release",
449
+ "deployment": "release",
450
+ "fix": "release",
451
+
452
+ # ==========================================================================
453
+ # REFERENCE & LINK FAMILY → reference
454
+ # ==========================================================================
455
+ "reference": "reference",
456
+ "related_page": "reference",
457
+ "url": "reference",
458
+ "webpage": "reference",
459
+ "website": "reference",
460
+ "page": "reference",
461
+ "link": "reference",
462
+
463
+ # ==========================================================================
464
+ # RULE & POLICY FAMILY → rule
465
+ # ==========================================================================
466
+ "rules": "rule",
467
+ "policy": "rule",
468
+ "formatting_rule": "rule",
469
+ "directive": "rule",
470
+ "requirement": "rule",
471
+ "specification": "rule",
472
+
473
+ # ==========================================================================
474
+ # MCP FAMILY → mcp_server
475
+ # ==========================================================================
476
+ "mcp server": "mcp_server",
477
+ "mcp_server": "mcp_server",
478
+ "mcp tool": "mcp_tool",
479
+ "mcp_tool": "mcp_tool",
480
+ "mcp resource": "mcp_resource",
481
+ "mcp_resource": "mcp_resource",
482
+ "mcp_type": "mcp_server",
483
+ "transport": "mcp_server",
484
+
485
+ # ==========================================================================
486
+ # MISCELLANEOUS → map to closest canonical type
487
+ # ==========================================================================
488
+ "method": "method",
489
+ "model": "concept",
490
+ "category": "concept",
491
+ "metric": "parameter",
492
+ "identifier": "parameter",
493
+ "port": "parameter",
494
+ "protocol": "service",
495
+ "security": "credential",
496
+ "support": "documentation",
497
+ "community": "documentation",
498
+ "contact": "reference",
499
+ "contactmethod": "reference",
500
+ "contact_information": "reference",
501
+ "contactinfo": "reference",
502
+ "building_block": "component",
503
+ "container": "component",
504
+ "instance": "entity",
505
+ "object": "entity",
506
+ "sourcetype": "data_type",
507
+ "input_mapping_type": "data_type",
508
+ "control_flow_feature": "feature",
509
+ "export_option": "action",
510
+ "export_format": "data_type",
511
+ "conversion": "action",
512
+ "customization": "configuration",
513
+ "viewing_option": "configuration",
514
+ "review_outcome": "status",
515
+ "goal": "feature",
516
+ "engagement": "action",
517
+ "output": "data_type",
518
+ "effect": "action",
519
+ "solution": "documentation",
520
+ "cause": "issue",
521
+ "indicator": "status",
522
+ "date": "parameter",
523
+ "screenshot": "resource",
524
+ "open_question": "issue",
525
+ "static_site_generator": "platform",
526
+ "theme": "configuration",
527
+ "theme_convention": "rule",
528
+ "file_naming_convention": "rule",
529
+ "metadata_guideline": "rule",
530
+ "linking_guideline": "rule",
531
+ "media_guideline": "rule",
532
+ "accessibility_guideline": "rule",
533
+ "page_type": "section",
534
+ "document_category": "section",
535
+ "prompt": "example",
536
+ "chat": "feature",
537
+ "ide": "platform",
538
+ "tagging": "action",
539
+ "account": "credential",
540
+ "installation_command": "action",
541
+ "usage": "documentation",
542
+ "mechanism": "concept",
543
+ "ai_component": "component",
544
+ "communication_method": "integration",
545
+ "dns_record": "configuration",
546
+ "tone": "rule",
547
+ "voice": "rule",
548
+
549
+ # ==========================================================================
550
+ # FACT & KNOWLEDGE FAMILY → fact (semantic facts extracted by LLM)
551
+ # ==========================================================================
552
+ "facts": "fact",
553
+ "algorithm": "fact",
554
+ "behavior": "fact",
555
+ "validation": "fact",
556
+ "decision": "fact",
557
+ "definition": "fact",
558
+
559
+ # ==========================================================================
560
+ # FILE & STRUCTURE FAMILY → file types (container nodes for entities)
561
+ # ==========================================================================
562
+ "document_file": "document_file",
563
+ "config_file": "config_file",
564
+ "web_file": "web_file",
565
+ "directory": "directory",
566
+ "package": "package",
567
+ }
568
+
569
# Types that should NEVER be normalized - they pass through as-is.
# normalize_type() checks this set first, before the explicit mapping and
# before any plural/suffix heuristics, so these names are already canonical.
PRESERVED_TYPES = {
    "fact", "source_file", "feature", "module", "constant", "rule",
    "parameter", "error_handling", "todo", "property", "configuration",
    "process", "integration", "interface", "user_story", "test",
    "export", "rest_api", "concept", "component", "workflow",
    # File/container node types that hold extracted entities.
    "document_file", "config_file", "web_file", "directory", "package",
    "variable", "function",  # Code entities - preserve for impact analysis
}
578
+
579
def normalize_type(entity_type: str) -> str:
    """
    Normalize entity type to canonical lowercase form.

    Aggressively consolidates types to a small set of ~25 canonical types:
    - feature, tool, toolkit, process, concept, entity
    - section, component, issue, action, parameter, credential
    - resource, platform, service, integration, example, node
    - step, status, project, role, agent, data_type, release
    - reference, rule, documentation, configuration, mcp_server

    Resolution order:
      1. falsy input defaults to "concept"
      2. preserved canonical types pass through unchanged
      3. explicit TYPE_NORMALIZATION_MAP lookup
      4. naive singular form of a plural retried against both tables
      5. suffix heuristics (type/section/field/node/issue/guide/config/tool/service)
      6. generic fallback: "concept"

    Args:
        entity_type: Raw entity type (any casing, spaces or dashes allowed)

    Returns:
        Canonical lowercase entity type
    """
    if not entity_type:
        return "concept"  # Default to concept for unknown

    # Canonicalize: lowercase, trim, unify separators to underscores.
    key = entity_type.lower().strip().replace(" ", "_").replace("-", "_")

    # Already-canonical types pass through untouched.
    if key in PRESERVED_TYPES:
        return key

    # Explicit mapping (all keys are lowercase).
    if key in TYPE_NORMALIZATION_MAP:
        return TYPE_NORMALIZATION_MAP[key]

    # Retry with a naive singular form: strip one trailing 's' (but not 'ss',
    # which would mangle words like "process"); very short words are skipped.
    if len(key) > 3 and key.endswith('s') and not key.endswith('ss'):
        singular = key[:-1]
        if singular in PRESERVED_TYPES:
            return singular
        if singular in TYPE_NORMALIZATION_MAP:
            return TYPE_NORMALIZATION_MAP[singular]

    # Suffix heuristics for otherwise-unknown types; order matters and
    # mirrors the original first-match-wins chain.
    suffix_rules = (
        ('type', "data_type"),
        ('section', "section"),
        ('field', "parameter"),
        ('node', "node"),
        ('issue', "issue"),
        ('guide', "documentation"),
        ('config', "configuration"),
        ('tool', "tool"),
        ('service', "service"),
    )
    for suffix, canonical in suffix_rules:
        if f'_{suffix}' in key or key.endswith(suffix):
            return canonical

    # Still unknown: collapse into the generic catch-all.
    return "concept"
640
+
641
# Relationship types for cross-source linking.
# Keyed by (code-side type, doc-side type); pairs not listed here fall back
# to "related_to" in enrich_cross_source_links().
CROSS_SOURCE_RELATIONS = {
    # (source_type, target_type): relation_type
    ("class", "concept"): "implements",
    ("module", "concept"): "implements",
    ("function", "concept"): "implements",
    ("method", "concept"): "implements",
    ("class", "entity"): "implements",
    ("module", "feature"): "implements",
    ("command", "feature"): "provides",
    ("toolkit", "toolkit_type"): "is_type_of",
    ("source_toolkit", "toolkit_type"): "is_type_of",
    # NOTE(review): enrich_cross_source_links lowercases both types before
    # lookup, so this mixed-case key appears unreachable there — confirm
    # whether another caller relies on it before removing.
    ("SourceToolkit", "toolkit_type"): "is_type_of",
    ("import", "module"): "imports",
    ("import", "class"): "imports",
}
657
+
658
# Types that represent code vs documentation.
# Membership tests go through _is_code_type(), which lowercases both sides,
# so the mixed-case "SourceToolkit" entry matches case-insensitively.
CODE_TYPES = {
    "class", "module", "function", "method", "variable", "constant",
    "import", "attribute", "property", "command", "command_group",
    "SourceToolkit", "source_toolkit", "toolkit"
}
664
+
665
# Documentation-side entity types; checked case-insensitively by
# _is_doc_type(), so "Feature" and "feature" are equivalent members.
DOC_TYPES = {
    "concept", "entity", "feature", "Feature", "guide", "section",
    "step", "process", "guideline", "tutorial", "example", "overview",
    "toolkit_type", "platform", "software", "integration"
}
670
+
671
# Type priority for deduplication - higher priority types are preferred.
# When merging entities with different types, the higher priority type wins.
# Looked up via _get_type_priority(), which tries the lowercase form first
# and falls back to the exact string (covers the mixed-case "SourceToolkit"
# entry); unknown types score 0.
TYPE_PRIORITY = {
    # Code layer - highest priority (most specific)
    "class": 100,
    "function": 99,
    "method": 98,
    "module": 97,
    "interface": 96,
    "constant": 95,
    "variable": 94,
    "configuration": 93,

    # Service layer - specific communication patterns have higher priority than generic
    "service": 90,

    # REST API types
    "rest_api": 89,
    "rest_endpoint": 88,
    "rest_resource": 87,

    # GraphQL types
    "graphql_api": 89,
    "graphql_mutation": 88,
    "graphql_query": 87,
    "graphql_subscription": 86,
    "graphql_type": 85,

    # gRPC types
    "grpc_service": 89,
    "grpc_method": 88,
    "protobuf_message": 87,

    # Event-driven types
    "event_bus": 89,
    "event_type": 88,
    "event_producer": 87,
    "event_consumer": 87,
    "event_handler": 86,

    # Generic fallbacks (lower priority)
    "integration": 84,
    "payload": 83,

    # Data layer
    "database": 85,
    "table": 84,
    "column": 83,
    "constraint": 82,
    "index": 81,
    "migration": 80,
    "enum": 79,

    # Product layer
    "feature": 75,
    "epic": 74,
    "user_story": 73,
    "screen": 72,
    "ux_flow": 71,
    "ui_component": 70,
    "ui_field": 69,

    # Domain layer
    "domain_entity": 65,
    "attribute": 64,
    "business_rule": 63,
    "business_event": 62,
    "glossary_term": 61,
    "workflow": 60,

    # Testing layer
    "test_suite": 55,
    "test_case": 54,
    "test_step": 53,
    "assertion": 52,
    "test_data": 51,
    "defect": 50,
    "incident": 49,

    # Delivery layer
    "release": 45,
    "sprint": 44,
    "commit": 43,
    "pull_request": 42,
    "ticket": 41,
    "deployment": 40,

    # Organization layer
    "team": 35,
    "owner": 34,
    "stakeholder": 33,
    "repository": 32,
    "documentation": 31,

    # Toolkits (specific types)
    "toolkit": 28,
    "source_toolkit": 27,
    "SourceToolkit": 26,
    "command": 25,
    "command_group": 24,

    # Generic types - lowest priority
    "concept": 15,
    "entity": 14,
    "component": 13,
    "object": 12,
    "item": 11,
    "element": 10,
    "thing": 5,
    "unknown": 1,
}
782
+
783
# Types that should NOT be merged even with same name.
# These represent fundamentally different concepts.
# Pairs are unordered in effect: _are_types_mergeable() checks both
# (a, b) and (b, a), so each pair only needs to be listed once.
NON_MERGEABLE_TYPES = {
    # Don't merge tests with the things they test
    ("test_case", "function"),
    ("test_case", "class"),
    ("test_case", "endpoint"),
    ("test_suite", "module"),

    # Don't merge documentation with code
    ("documentation", "module"),
    ("documentation", "class"),

    # Don't merge defects with features
    ("defect", "feature"),
    ("incident", "feature"),

    # Don't merge owners with owned items
    ("owner", "module"),
    ("owner", "service"),
    ("team", "repository"),
}
805
+
806
# Types that should NEVER be deduplicated even with exact same name.
# These are context-dependent - same name in different files means different things
# e.g., "Get Tests" tool in Xray toolkit != "Get Tests" tool in Zephyr toolkit.
# deduplicate_entities() compares lowercased node types against this set.
NEVER_DEDUPLICATE_TYPES = {
    "tool",  # Tools belong to specific toolkits
    "property",  # Properties belong to specific entities
    "properties",  # Same as above
    "parameter",  # Parameters belong to specific functions/methods
    "argument",  # Arguments belong to specific functions
    "field",  # Fields belong to specific tables/forms
    "column",  # Columns belong to specific tables
    "attribute",  # Attributes belong to specific entities
    "option",  # Options belong to specific settings
    "setting",  # Settings may have same name in different contexts
    "step",  # Steps belong to specific workflows/processes
    "test_step",  # Test steps belong to specific test cases
    "ui_field",  # UI fields belong to specific screens
    "method",  # Methods belong to specific classes

    # API types - same name can exist in different API contexts
    "rest_endpoint",  # /users endpoint in API A != /users in API B
    "rest_resource",  # Same resource name in different REST APIs
    "graphql_query",  # Same query name in different GraphQL schemas
    "graphql_mutation",  # Same mutation name in different GraphQL schemas
    "graphql_subscription",  # Same subscription in different GraphQL schemas
    "graphql_type",  # Same type name in different GraphQL schemas
    "grpc_method",  # Same method name in different gRPC services
    "protobuf_message",  # Same message name in different proto files
    "event_type",  # Same event name in different event busses
    "event_handler",  # Same handler name in different services
}
837
+
838
+
839
+ class GraphEnricher:
840
+ """
841
+ Enriches a knowledge graph with cross-source relationships.
842
+ """
843
+
844
+ def __init__(self, graph_path: str):
845
+ """
846
+ Initialize enricher with a graph file.
847
+
848
+ Args:
849
+ graph_path: Path to the graph JSON file
850
+ """
851
+ self.graph_path = Path(graph_path)
852
+ self.graph_data: Dict[str, Any] = {}
853
+ self.nodes_by_id: Dict[str, Dict] = {}
854
+ self.nodes_by_name: Dict[str, List[Dict]] = defaultdict(list)
855
+ self.existing_links: Set[Tuple[str, str]] = set()
856
+ self.new_links: List[Dict] = []
857
+ self.id_mapping: Dict[str, str] = {} # old_id -> new_id for merged nodes
858
+ self.merged_nodes: List[Dict] = [] # Track merged node info
859
+ self.stats = {
860
+ "cross_source_links": 0,
861
+ "orphan_links": 0,
862
+ "similarity_links": 0,
863
+ "entities_merged": 0,
864
+ "merge_groups": 0,
865
+ }
866
+
867
+ self._load_graph()
868
+
869
+ def _load_graph(self):
870
+ """Load graph from JSON file."""
871
+ with open(self.graph_path) as f:
872
+ self.graph_data = json.load(f)
873
+
874
+ # Build indices
875
+ for node in self.graph_data.get("nodes", []):
876
+ self.nodes_by_id[node["id"]] = node
877
+ name_key = self._normalize_name(node.get("name", ""))
878
+ self.nodes_by_name[name_key].append(node)
879
+
880
+ # Track existing links
881
+ for link in self.graph_data.get("links", []):
882
+ self.existing_links.add((link["source"], link["target"]))
883
+ self.existing_links.add((link["target"], link["source"])) # bidirectional check
884
+
885
+ logger.info(f"Loaded graph: {len(self.nodes_by_id)} nodes, {len(self.existing_links)//2} links")
886
+
887
+ def normalize_entity_types(self):
888
+ """
889
+ Normalize all entity types in the graph to canonical lowercase forms.
890
+
891
+ This fixes inconsistencies like Tool/tool/Tools all becoming 'tool'.
892
+ Should be run before other enrichment steps.
893
+ """
894
+ logger.info("Normalizing entity types...")
895
+ types_normalized = 0
896
+ type_changes: Dict[str, str] = {} # original -> normalized
897
+
898
+ for node in self.graph_data.get("nodes", []):
899
+ original_type = node.get("type", "")
900
+ normalized = normalize_type(original_type)
901
+
902
+ if normalized != original_type:
903
+ if original_type not in type_changes:
904
+ type_changes[original_type] = normalized
905
+ node["type"] = normalized
906
+ types_normalized += 1
907
+
908
+ # Log what was changed
909
+ if type_changes:
910
+ logger.info(f"Normalized {types_normalized} entity types:")
911
+ for orig, norm in sorted(type_changes.items()):
912
+ logger.debug(f" {orig} -> {norm}")
913
+
914
+ self.stats["types_normalized"] = types_normalized
915
+ self.stats["type_changes"] = len(type_changes)
916
+
917
+ # Rebuild indices after type normalization
918
+ self.nodes_by_id.clear()
919
+ self.nodes_by_name.clear()
920
+ for node in self.graph_data.get("nodes", []):
921
+ self.nodes_by_id[node["id"]] = node
922
+ name_key = self._normalize_name(node.get("name", ""))
923
+ self.nodes_by_name[name_key].append(node)
924
+
925
+ logger.info(f"Normalized {len(type_changes)} distinct type variations")
926
+
927
+ def _normalize_name(self, name: str) -> str:
928
+ """Normalize entity name for matching."""
929
+ # Convert to lowercase, replace separators with spaces
930
+ name = name.lower().strip()
931
+ name = re.sub(r'[_\-\.]+', ' ', name)
932
+ name = re.sub(r'\s+', ' ', name)
933
+ return name
934
+
935
+ def _tokenize_name(self, name: str) -> Set[str]:
936
+ """Tokenize name into significant words."""
937
+ normalized = self._normalize_name(name)
938
+ # Remove common stop words
939
+ stop_words = {'the', 'a', 'an', 'and', 'or', 'of', 'to', 'in', 'for', 'on', 'with', 'by', 'is', 'it'}
940
+ words = set(normalized.split())
941
+ return words - stop_words
942
+
943
+ def _get_source(self, node: Dict) -> str:
944
+ """Determine source category for a node."""
945
+ citations = node.get("citations", [])
946
+ if node.get("citation"):
947
+ citations = [node["citation"]]
948
+
949
+ if not citations:
950
+ return "unknown"
951
+
952
+ fp = citations[0].get("file_path", "")
953
+ if "alita-sdk" in fp or "alita_sdk" in fp:
954
+ return "sdk"
955
+ elif "elitea_core" in fp:
956
+ return "core"
957
+ elif "AlitaUI" in fp:
958
+ return "ui"
959
+ elif "docs/" in fp or fp.endswith(".md"):
960
+ return "docs"
961
+ else:
962
+ return "other"
963
+
964
+ def _is_code_type(self, entity_type: str) -> bool:
965
+ """Check if entity type represents code."""
966
+ return entity_type.lower() in {t.lower() for t in CODE_TYPES}
967
+
968
+ def _is_doc_type(self, entity_type: str) -> bool:
969
+ """Check if entity type represents documentation."""
970
+ return entity_type.lower() in {t.lower() for t in DOC_TYPES}
971
+
972
+ def _get_type_priority(self, entity_type: str) -> int:
973
+ """Get priority score for entity type."""
974
+ return TYPE_PRIORITY.get(entity_type.lower(), TYPE_PRIORITY.get(entity_type, 0))
975
+
976
+ def _are_types_mergeable(self, type1: str, type2: str) -> bool:
977
+ """Check if two entity types can be merged."""
978
+ t1, t2 = type1.lower(), type2.lower()
979
+ pair1 = (t1, t2)
980
+ pair2 = (t2, t1)
981
+ return pair1 not in NON_MERGEABLE_TYPES and pair2 not in NON_MERGEABLE_TYPES
982
+
983
+ def _generate_merged_id(self, name: str, entity_type: str) -> str:
984
+ """Generate a consistent ID for merged entity."""
985
+ normalized = self._normalize_name(name)
986
+ key = f"{entity_type}:{normalized}"
987
+ return hashlib.md5(key.encode()).hexdigest()[:16]
988
+
989
+ def _add_link(self, source_id: str, target_id: str, relation_type: str, reason: str):
990
+ """Add a new link if it doesn't exist."""
991
+ # Apply ID mapping for merged nodes
992
+ source_id = self.id_mapping.get(source_id, source_id)
993
+ target_id = self.id_mapping.get(target_id, target_id)
994
+
995
+ if source_id == target_id:
996
+ return False
997
+ if (source_id, target_id) in self.existing_links:
998
+ return False
999
+
1000
+ self.new_links.append({
1001
+ "source": source_id,
1002
+ "target": target_id,
1003
+ "relation_type": relation_type,
1004
+ "enrichment_reason": reason,
1005
+ })
1006
+ self.existing_links.add((source_id, target_id))
1007
+ self.existing_links.add((target_id, source_id))
1008
+ return True
1009
+
1010
+ def _similarity(self, s1: str, s2: str) -> float:
1011
+ """Calculate string similarity ratio."""
1012
+ return SequenceMatcher(None, s1.lower(), s2.lower()).ratio()
1013
+
1014
+ def _word_overlap_score(self, name1: str, name2: str) -> float:
1015
+ """Calculate word overlap score between two names."""
1016
+ words1 = self._tokenize_name(name1)
1017
+ words2 = self._tokenize_name(name2)
1018
+ if not words1 or not words2:
1019
+ return 0.0
1020
+ overlap = len(words1 & words2)
1021
+ return overlap / max(len(words1), len(words2))
1022
+
1023
    def deduplicate_entities(self,
                             name_similarity_threshold: float = 0.95,
                             require_exact_match: bool = True) -> int:
        """
        Soft entity deduplication - merge entities that represent the same concept.

        CONSERVATIVE APPROACH: Only merges entities with EXACT same name (after normalization).
        This prevents incorrectly merging related but distinct concepts like:
        - "Artifact Toolkit" vs "Artifact Toolkit Guide"
        - "Feature X" vs "Configure Feature X"

        Entities with different names but similar concepts should be LINKED, not merged.

        When merging, it:
        - Selects the best type based on TYPE_PRIORITY
        - Consolidates all citations from merged entities
        - Preserves all properties/attributes
        - Updates all links to point to the merged entity

        Args:
            name_similarity_threshold: Min similarity for fuzzy matching (only if require_exact_match=False)
            require_exact_match: If True (default), only merge exact name matches

        Returns:
            Number of entities merged
        """
        logger.info("Starting soft entity deduplication (exact match only)...")

        nodes = self.graph_data.get("nodes", [])
        if not nodes:
            return 0

        # Group entities by normalized name for exact matches
        name_groups: Dict[str, List[Dict]] = defaultdict(list)
        for node in nodes:
            name_key = self._normalize_name(node.get("name", ""))
            if len(name_key) >= 2:  # Skip very short names
                name_groups[name_key].append(node)

        # Find merge candidates - ONLY exact name matches
        merge_groups: List[List[Dict]] = []
        processed_ids: Set[str] = set()

        for name_key, group_nodes in name_groups.items():
            if len(group_nodes) < 2:
                continue

            # Skip types that should NEVER be deduplicated (context-dependent)
            # e.g., "Get Tests" tool in Xray != "Get Tests" tool in Zephyr
            group_nodes = [
                n for n in group_nodes
                if n.get("type", "").lower() not in NEVER_DEDUPLICATE_TYPES
            ]
            if len(group_nodes) < 2:
                continue

            # Filter to only mergeable types within exact name matches.
            # Greedy grouping: a node joins the first group where its type is
            # compatible with EVERY member already in that group.
            mergeable_groups: List[List[Dict]] = []
            for node in group_nodes:
                if node["id"] in processed_ids:
                    continue

                # Try to add to existing group if types are compatible
                added = False
                for mg in mergeable_groups:
                    if all(self._are_types_mergeable(node.get("type", ""), m.get("type", "")) for m in mg):
                        mg.append(node)
                        added = True
                        break

                if not added:
                    mergeable_groups.append([node])

            # Add groups with multiple nodes (singletons need no merging)
            for mg in mergeable_groups:
                if len(mg) >= 2:
                    merge_groups.append(mg)
                    for node in mg:
                        processed_ids.add(node["id"])

        # Optional: Phase 2 - Very high similarity fuzzy matches (disabled by default)
        if not require_exact_match:
            remaining_nodes = [n for n in nodes if n["id"] not in processed_ids]

            for i, node1 in enumerate(remaining_nodes):
                if node1["id"] in processed_ids:
                    continue

                name1 = self._normalize_name(node1.get("name", ""))
                if len(name1) < 3:
                    continue

                candidates = [node1]

                # O(n^2) pairwise scan over the remaining (unmerged) nodes.
                for node2 in remaining_nodes[i+1:]:
                    if node2["id"] in processed_ids:
                        continue

                    name2 = self._normalize_name(node2.get("name", ""))
                    if len(name2) < 3:
                        continue

                    # Check if types are mergeable
                    if not self._are_types_mergeable(node1.get("type", ""), node2.get("type", "")):
                        continue

                    # Only merge on VERY high similarity (almost identical names)
                    str_sim = self._similarity(name1, name2)
                    if str_sim >= name_similarity_threshold:
                        candidates.append(node2)

                if len(candidates) >= 2:
                    merge_groups.append(candidates)
                    for node in candidates:
                        processed_ids.add(node["id"])

        # Execute merges
        logger.info(f"Found {len(merge_groups)} merge groups")

        nodes_to_remove: Set[str] = set()
        nodes_to_add: List[Dict] = []

        for group in merge_groups:
            merged = self._merge_entity_group(group)
            if merged:
                nodes_to_add.append(merged["new_node"])
                nodes_to_remove.update(merged["removed_ids"])
                self.merged_nodes.append(merged)
                self.stats["entities_merged"] += len(merged["removed_ids"])

        self.stats["merge_groups"] = len(merge_groups)

        # Update nodes list: drop all originals of each group, append the merged node
        self.graph_data["nodes"] = [n for n in nodes if n["id"] not in nodes_to_remove]
        self.graph_data["nodes"].extend(nodes_to_add)

        # Update links to use new IDs
        self._update_links_after_merge()

        # Rebuild indices
        self._rebuild_indices()

        logger.info(f"Deduplication complete: {self.stats['entities_merged']} entities merged into {self.stats['merge_groups']} groups")

        return self.stats["entities_merged"]
1168
+
1169
    def _merge_entity_group(self, group: List[Dict]) -> Optional[Dict]:
        """
        Merge a group of entities into a single entity.

        The surviving name/type come from the highest-priority node
        (per TYPE_PRIORITY); citations are consolidated and deduplicated,
        properties are merged, and every old id is mapped to the new id.

        Returns merge info dict or None if merge failed.
        """
        if len(group) < 2:
            return None

        # Select best type based on priority
        best_node = max(group, key=lambda n: self._get_type_priority(n.get("type", "")))
        best_type = best_node.get("type", "entity")

        # Use the name from the highest priority node
        best_name = best_node.get("name", "")

        # Generate merged ID (deterministic: derived from type + normalized name)
        new_id = self._generate_merged_id(best_name, best_type)

        # Collect all citations (both the list form and the singular form)
        all_citations = []
        all_sources = set()
        for node in group:
            if "citations" in node:
                all_citations.extend(node["citations"])
            if "citation" in node:
                all_citations.append(node["citation"])
            all_sources.add(self._get_source(node))

        # Remove duplicate citations, keyed by (file_path, chunk_index)
        seen_citations = set()
        unique_citations = []
        for cit in all_citations:
            cit_key = (cit.get("file_path", ""), cit.get("chunk_index", 0))
            if cit_key not in seen_citations:
                seen_citations.add(cit_key)
                unique_citations.append(cit)

        # Collect all properties
        # NOTE: later nodes in the group overwrite earlier ones on key clashes.
        all_properties = {}
        for node in group:
            if "properties" in node:
                all_properties.update(node["properties"])

        # Collect all types as alternative_types (excluding the winning type;
        # order is not deterministic because it passes through a set)
        all_types = list(set(n.get("type", "") for n in group if n.get("type")))
        all_types = [t for t in all_types if t != best_type]

        # Create merged node
        merged_node = {
            "id": new_id,
            "name": best_name,
            "type": best_type,
            "citations": unique_citations,
            "sources": list(all_sources),
            "merged_from": [n["id"] for n in group],
            "alternative_types": all_types,
        }

        if all_properties:
            merged_node["properties"] = all_properties

        # Add description from best node
        if "description" in best_node:
            merged_node["description"] = best_node["description"]
        else:
            # Try to get description from any node (first one found wins)
            for node in group:
                if "description" in node:
                    merged_node["description"] = node["description"]
                    break

        # Map old IDs to new ID so link rewriting can follow the merge
        removed_ids = []
        for node in group:
            old_id = node["id"]
            self.id_mapping[old_id] = new_id
            removed_ids.append(old_id)

        return {
            "new_node": merged_node,
            "removed_ids": removed_ids,
            "merged_types": [n.get("type", "") for n in group],
        }
1253
+
1254
+ def _update_links_after_merge(self):
1255
+ """Update all links to use merged node IDs."""
1256
+ updated_links = []
1257
+ seen_links = set()
1258
+
1259
+ for link in self.graph_data.get("links", []):
1260
+ source = self.id_mapping.get(link["source"], link["source"])
1261
+ target = self.id_mapping.get(link["target"], link["target"])
1262
+
1263
+ # Skip self-links and duplicates
1264
+ if source == target:
1265
+ continue
1266
+
1267
+ link_key = (source, target, link.get("relation_type", ""))
1268
+ if link_key in seen_links:
1269
+ continue
1270
+ seen_links.add(link_key)
1271
+
1272
+ updated_link = link.copy()
1273
+ updated_link["source"] = source
1274
+ updated_link["target"] = target
1275
+ updated_links.append(updated_link)
1276
+
1277
+ self.graph_data["links"] = updated_links
1278
+
1279
+ def _rebuild_indices(self):
1280
+ """Rebuild internal indices after modifications."""
1281
+ self.nodes_by_id.clear()
1282
+ self.nodes_by_name.clear()
1283
+ self.existing_links.clear()
1284
+
1285
+ for node in self.graph_data.get("nodes", []):
1286
+ self.nodes_by_id[node["id"]] = node
1287
+ name_key = self._normalize_name(node.get("name", ""))
1288
+ self.nodes_by_name[name_key].append(node)
1289
+
1290
+ for link in self.graph_data.get("links", []):
1291
+ self.existing_links.add((link["source"], link["target"]))
1292
+ self.existing_links.add((link["target"], link["source"]))
1293
+
1294
+ def enrich_cross_source_links(self, min_similarity: float = 0.85):
1295
+ """
1296
+ Create links between entities with similar names across different sources.
1297
+
1298
+ For example, link SDK class "Toolkit" to docs concept "Toolkit".
1299
+ """
1300
+ logger.info("Creating cross-source links...")
1301
+
1302
+ for name_key, nodes in self.nodes_by_name.items():
1303
+ if len(nodes) < 2:
1304
+ continue
1305
+
1306
+ # Group by source
1307
+ by_source: Dict[str, List[Dict]] = defaultdict(list)
1308
+ for node in nodes:
1309
+ source = self._get_source(node)
1310
+ by_source[source].append(node)
1311
+
1312
+ if len(by_source) < 2:
1313
+ continue # All from same source
1314
+
1315
+ # Link code entities to doc entities
1316
+ code_nodes = []
1317
+ doc_nodes = []
1318
+
1319
+ for source, source_nodes in by_source.items():
1320
+ for node in source_nodes:
1321
+ if self._is_code_type(node.get("type", "")):
1322
+ code_nodes.append(node)
1323
+ elif self._is_doc_type(node.get("type", "")):
1324
+ doc_nodes.append(node)
1325
+
1326
+ # Create cross-links
1327
+ for code_node in code_nodes:
1328
+ for doc_node in doc_nodes:
1329
+ code_type = code_node.get("type", "").lower()
1330
+ doc_type = doc_node.get("type", "").lower()
1331
+
1332
+ # Determine relationship type
1333
+ rel_type = CROSS_SOURCE_RELATIONS.get(
1334
+ (code_type, doc_type),
1335
+ "related_to"
1336
+ )
1337
+
1338
+ if self._add_link(
1339
+ code_node["id"],
1340
+ doc_node["id"],
1341
+ rel_type,
1342
+ f"cross_source:{name_key}"
1343
+ ):
1344
+ self.stats["cross_source_links"] += 1
1345
+
1346
+ logger.info(f"Created {self.stats['cross_source_links']} cross-source links")
1347
+
1348
+ def enrich_semantic_links(self,
1349
+ min_word_overlap: float = 0.5,
1350
+ max_links_per_entity: int = 5):
1351
+ """
1352
+ Create semantic links between entities based on shared concepts.
1353
+
1354
+ This enhanced cross-linking finds relationships by:
1355
+ 1. Shared significant words in entity names
1356
+ 2. Similar context (source/type combinations)
1357
+ 3. Hierarchical relationships (parent-child by naming)
1358
+
1359
+ Args:
1360
+ min_word_overlap: Minimum word overlap ratio
1361
+ max_links_per_entity: Maximum new links per entity
1362
+ """
1363
+ logger.info("Creating semantic cross-links...")
1364
+
1365
+ nodes = self.graph_data.get("nodes", [])
1366
+ links_created = 0
1367
+
1368
+ # Build word index for efficient lookup
1369
+ word_to_nodes: Dict[str, List[Dict]] = defaultdict(list)
1370
+ for node in nodes:
1371
+ words = self._tokenize_name(node.get("name", ""))
1372
+ for word in words:
1373
+ if len(word) >= 3: # Skip very short words
1374
+ word_to_nodes[word].append(node)
1375
+
1376
+ # Find semantic relationships
1377
+ processed_pairs: Set[Tuple[str, str]] = set()
1378
+ entity_link_count: Dict[str, int] = defaultdict(int)
1379
+
1380
+ for node in nodes:
1381
+ if entity_link_count[node["id"]] >= max_links_per_entity:
1382
+ continue
1383
+
1384
+ node_words = self._tokenize_name(node.get("name", ""))
1385
+ if not node_words:
1386
+ continue
1387
+
1388
+ # Find candidate nodes sharing words
1389
+ candidates: Dict[str, float] = {}
1390
+ for word in node_words:
1391
+ for other in word_to_nodes.get(word, []):
1392
+ if other["id"] == node["id"]:
1393
+ continue
1394
+
1395
+ pair = tuple(sorted([node["id"], other["id"]]))
1396
+ if pair in processed_pairs:
1397
+ continue
1398
+ if pair in self.existing_links:
1399
+ continue
1400
+
1401
+ other_words = self._tokenize_name(other.get("name", ""))
1402
+ if not other_words:
1403
+ continue
1404
+
1405
+ # Calculate overlap
1406
+ overlap = len(node_words & other_words)
1407
+ overlap_ratio = overlap / max(len(node_words), len(other_words))
1408
+
1409
+ if overlap_ratio >= min_word_overlap:
1410
+ if other["id"] not in candidates:
1411
+ candidates[other["id"]] = overlap_ratio
1412
+ else:
1413
+ candidates[other["id"]] = max(candidates[other["id"]], overlap_ratio)
1414
+
1415
+ # Create links to top candidates
1416
+ sorted_candidates = sorted(candidates.items(), key=lambda x: x[1], reverse=True)
1417
+
1418
+ for other_id, overlap in sorted_candidates[:max_links_per_entity]:
1419
+ if entity_link_count[node["id"]] >= max_links_per_entity:
1420
+ break
1421
+ if entity_link_count[other_id] >= max_links_per_entity:
1422
+ continue
1423
+
1424
+ pair = tuple(sorted([node["id"], other_id]))
1425
+ processed_pairs.add(pair)
1426
+
1427
+ other_node = self.nodes_by_id.get(other_id)
1428
+ if not other_node:
1429
+ continue
1430
+
1431
+ # Determine relationship type
1432
+ rel_type = self._infer_relationship_type(node, other_node)
1433
+
1434
+ if self._add_link(
1435
+ node["id"],
1436
+ other_id,
1437
+ rel_type,
1438
+ f"semantic_overlap:{overlap:.2f}"
1439
+ ):
1440
+ links_created += 1
1441
+ entity_link_count[node["id"]] += 1
1442
+ entity_link_count[other_id] += 1
1443
+
1444
+ self.stats["semantic_links"] = links_created
1445
+ logger.info(f"Created {links_created} semantic cross-links")
1446
+ return links_created
1447
+
1448
+ def _infer_relationship_type(self, node1: Dict, node2: Dict) -> str:
1449
+ """Infer the best relationship type between two entities."""
1450
+ type1 = node1.get("type", "").lower()
1451
+ type2 = node2.get("type", "").lower()
1452
+ name1 = self._normalize_name(node1.get("name", ""))
1453
+ name2 = self._normalize_name(node2.get("name", ""))
1454
+
1455
+ # Tool/Toolkit relationships - highest priority
1456
+ if type1 == "toolkit" and type2 == "tool":
1457
+ return "contains"
1458
+ if type2 == "toolkit" and type1 == "tool":
1459
+ return "part_of"
1460
+ if type1 == "mcp_server" and type2 == "mcp_tool":
1461
+ return "provides"
1462
+ if type2 == "mcp_server" and type1 == "mcp_tool":
1463
+ return "provided_by"
1464
+
1465
+ # Check for hierarchical relationship (one name contains the other)
1466
+ if name1 in name2 or name2 in name1:
1467
+ if len(name1) < len(name2):
1468
+ return "part_of"
1469
+ else:
1470
+ return "contains"
1471
+
1472
+ # Check for type-based relationships
1473
+ type_pairs = [
1474
+ ({"class", "function", "method", "module"}, {"feature", "concept"}, "implements"),
1475
+ ({"endpoint", "api"}, {"service"}, "part_of"),
1476
+ ({"test_case", "test_suite"}, {"feature", "function", "class"}, "tests"),
1477
+ ({"defect", "incident"}, {"feature", "component"}, "affects"),
1478
+ ({"ticket"}, {"feature", "epic", "user_story"}, "implements"),
1479
+ ({"documentation"}, {"feature", "api", "class"}, "documents"),
1480
+ ({"toolkit"}, {"feature", "capability", "function"}, "provides"),
1481
+ ({"tool"}, {"feature", "capability", "function"}, "implements"),
1482
+ ]
1483
+
1484
+ for types_a, types_b, rel in type_pairs:
1485
+ if (type1 in types_a and type2 in types_b) or (type2 in types_a and type1 in types_b):
1486
+ return rel
1487
+
1488
+ # Check cross-source relation map
1489
+ if (type1, type2) in CROSS_SOURCE_RELATIONS:
1490
+ return CROSS_SOURCE_RELATIONS[(type1, type2)]
1491
+ if (type2, type1) in CROSS_SOURCE_RELATIONS:
1492
+ return CROSS_SOURCE_RELATIONS[(type2, type1)]
1493
+
1494
+ return "related_to"
1495
+
1496
+ def enrich_toolkit_tool_links(self):
1497
+ """
1498
+ Create explicit links between toolkits and their tools.
1499
+
1500
+ This method specifically handles the toolkit → tool relationship by:
1501
+ 1. Finding all toolkit and tool entities
1502
+ 2. Matching tools to toolkits based on:
1503
+ - Same file path (tools defined in toolkit's documentation)
1504
+ - Toolkit name appearing in tool's parent_toolkit property
1505
+ - Tool name containing toolkit name prefix
1506
+ """
1507
+ logger.info("Linking tools to toolkits...")
1508
+
1509
+ nodes = self.graph_data.get("nodes", [])
1510
+ links_created = 0
1511
+
1512
+ # Index toolkits and tools
1513
+ toolkits = [n for n in nodes if n.get("type", "").lower() == "toolkit"]
1514
+ tools = [n for n in nodes if n.get("type", "").lower() == "tool"]
1515
+
1516
+ # Index toolkits by file_path and name
1517
+ toolkit_by_file: Dict[str, List[Dict]] = defaultdict(list)
1518
+ toolkit_by_name: Dict[str, Dict] = {}
1519
+
1520
+ for tk in toolkits:
1521
+ file_path = tk.get("file_path", "")
1522
+ if file_path:
1523
+ toolkit_by_file[file_path].append(tk)
1524
+ name = tk.get("name", "").lower()
1525
+ if name:
1526
+ toolkit_by_name[name] = tk
1527
+ # Also index by common variations
1528
+ # e.g., "GitHub Toolkit" → "github", "github toolkit"
1529
+ short_name = name.replace(" toolkit", "").replace("_toolkit", "")
1530
+ toolkit_by_name[short_name] = tk
1531
+
1532
+ for tool in tools:
1533
+ tool_id = tool["id"]
1534
+ tool_file = tool.get("file_path", "")
1535
+ tool_name = tool.get("name", "").lower()
1536
+ tool_props = tool.get("properties", {})
1537
+ parent_toolkit = tool_props.get("parent_toolkit", "").lower()
1538
+
1539
+ matched_toolkit = None
1540
+ match_reason = ""
1541
+
1542
+ # Strategy 1: Match by parent_toolkit property
1543
+ if parent_toolkit:
1544
+ for tk_name, tk in toolkit_by_name.items():
1545
+ if tk_name in parent_toolkit or parent_toolkit in tk_name:
1546
+ matched_toolkit = tk
1547
+ match_reason = f"parent_toolkit:{parent_toolkit}"
1548
+ break
1549
+
1550
+ # Strategy 2: Match by same file path
1551
+ if not matched_toolkit and tool_file:
1552
+ if tool_file in toolkit_by_file:
1553
+ # Pick first matching toolkit in same file
1554
+ matched_toolkit = toolkit_by_file[tool_file][0]
1555
+ match_reason = f"same_file:{tool_file}"
1556
+
1557
+ # Strategy 3: Match by tool name containing toolkit name
1558
+ if not matched_toolkit:
1559
+ for tk_name, tk in toolkit_by_name.items():
1560
+ if tk_name in tool_name:
1561
+ matched_toolkit = tk
1562
+ match_reason = f"name_match:{tk_name}"
1563
+ break
1564
+
1565
+ # Create link if matched
1566
+ if matched_toolkit:
1567
+ pair = tuple(sorted([matched_toolkit["id"], tool_id]))
1568
+ if pair not in self.existing_links:
1569
+ if self._add_link(
1570
+ matched_toolkit["id"],
1571
+ tool_id,
1572
+ "contains",
1573
+ f"toolkit_tool:{match_reason}"
1574
+ ):
1575
+ links_created += 1
1576
+
1577
+ self.stats["toolkit_tool_links"] = links_created
1578
+ logger.info(f"Created {links_created} toolkit → tool links")
1579
+
1580
+ def enrich_orphan_nodes(self, max_links_per_orphan: int = 3):
1581
+ """
1582
+ Connect orphan nodes to related entities based on name similarity.
1583
+ """
1584
+ logger.info("Connecting orphan nodes...")
1585
+
1586
+ # Find orphans
1587
+ connected = set()
1588
+ for link in self.graph_data.get("links", []):
1589
+ connected.add(link["source"])
1590
+ connected.add(link["target"])
1591
+ for link in self.new_links:
1592
+ connected.add(link["source"])
1593
+ connected.add(link["target"])
1594
+
1595
+ orphans = [
1596
+ node for node in self.graph_data.get("nodes", [])
1597
+ if node["id"] not in connected
1598
+ ]
1599
+
1600
+ logger.info(f"Found {len(orphans)} orphan nodes")
1601
+
1602
+ # For each orphan, find potential parents
1603
+ for orphan in orphans:
1604
+ orphan_name = self._normalize_name(orphan.get("name", ""))
1605
+ orphan_words = set(orphan_name.split())
1606
+
1607
+ candidates = []
1608
+
1609
+ for node in self.graph_data.get("nodes", []):
1610
+ if node["id"] == orphan["id"]:
1611
+ continue
1612
+ if node["id"] not in connected:
1613
+ continue # Don't link orphans to orphans
1614
+
1615
+ node_name = self._normalize_name(node.get("name", ""))
1616
+ node_words = set(node_name.split())
1617
+
1618
+ # Check word overlap
1619
+ overlap = len(orphan_words & node_words)
1620
+ if overlap > 0:
1621
+ # Calculate similarity score
1622
+ sim = self._similarity(orphan_name, node_name)
1623
+ word_score = overlap / max(len(orphan_words), 1)
1624
+ score = (sim + word_score) / 2
1625
+
1626
+ if score > 0.3: # Minimum threshold
1627
+ candidates.append((node, score))
1628
+
1629
+ # Sort by score and take top matches
1630
+ candidates.sort(key=lambda x: x[1], reverse=True)
1631
+
1632
+ for node, score in candidates[:max_links_per_orphan]:
1633
+ if self._add_link(
1634
+ orphan["id"],
1635
+ node["id"],
1636
+ "related_to",
1637
+ f"orphan_link:score={score:.2f}"
1638
+ ):
1639
+ self.stats["orphan_links"] += 1
1640
+
1641
+ logger.info(f"Created {self.stats['orphan_links']} orphan links")
1642
+
1643
+ def enrich_similarity_links(self, min_similarity: float = 0.9):
1644
+ """
1645
+ Create links between entities with very similar names.
1646
+
1647
+ This catches variations like "Create Toolkit" and "Toolkit Creation".
1648
+ """
1649
+ logger.info(f"Creating similarity links (threshold={min_similarity})...")
1650
+
1651
+ nodes = self.graph_data.get("nodes", [])
1652
+ processed = set()
1653
+
1654
+ for i, node1 in enumerate(nodes):
1655
+ name1 = self._normalize_name(node1.get("name", ""))
1656
+ if len(name1) < 3:
1657
+ continue
1658
+
1659
+ for j, node2 in enumerate(nodes[i+1:], i+1):
1660
+ pair = (node1["id"], node2["id"])
1661
+ if pair in processed:
1662
+ continue
1663
+ processed.add(pair)
1664
+
1665
+ name2 = self._normalize_name(node2.get("name", ""))
1666
+ if len(name2) < 3:
1667
+ continue
1668
+
1669
+ # Calculate similarity
1670
+ sim = self._similarity(name1, name2)
1671
+
1672
+ if sim >= min_similarity:
1673
+ if self._add_link(
1674
+ node1["id"],
1675
+ node2["id"],
1676
+ "similar_to",
1677
+ f"similarity:{sim:.2f}"
1678
+ ):
1679
+ self.stats["similarity_links"] += 1
1680
+
1681
+ logger.info(f"Created {self.stats['similarity_links']} similarity links")
1682
+
1683
    def validate_low_confidence_relationships(
        self,
        confidence_threshold: float = 0.7,
        llm: Optional[Any] = None
    ) -> Dict[str, Any]:
        """
        Validate and re-evaluate relationships with confidence below threshold.

        This method routes low-confidence relationships through additional validation:
        1. Gather context from both source and target entities
        2. Check if relationship makes semantic sense given entity types
        3. Optionally use LLM to validate ambiguous relationships

        Rewrites ``self.graph_data["links"]`` in place: validated links are
        kept (possibly with an adjusted ``confidence``), rejected links are
        dropped.

        Args:
            confidence_threshold: Relationships below this are candidates for validation
            llm: Optional LLM for re-evaluation (if None, uses heuristics only)

        Returns:
            Dictionary with validation stats:
            - candidates: Number of links that entered validation
            - validated: Number of relationships confirmed
            - rejected: Number of relationships removed
            - upgraded: Number of relationships with increased confidence
            - downgraded: Number of relationships with decreased confidence
        """
        logger.info(f"Validating low-confidence relationships (threshold={confidence_threshold})...")

        stats = {
            "candidates": 0,
            "validated": 0,
            "rejected": 0,
            "upgraded": 0,
            "downgraded": 0,
        }

        links_to_keep = []
        links_to_remove = []  # kept only for the summary log below

        for link in self.graph_data.get("links", []):
            # Missing confidence is treated as fully trusted.
            confidence = link.get("confidence", 1.0)

            # Skip high-confidence links
            if confidence >= confidence_threshold:
                links_to_keep.append(link)
                continue

            # Skip parser-extracted relationships (already validated by code structure)
            # NOTE(review): "source" is also the link's endpoint-id field used
            # below — a link originating from a node literally named "parser"
            # would be skipped here too; presumably a provenance field is
            # intended. Confirm against the link schema.
            if link.get("source") == "parser":
                links_to_keep.append(link)
                continue

            stats["candidates"] += 1

            # Get source and target entities
            source_id = link.get("source")
            target_id = link.get("target")
            source_node = self.nodes_by_id.get(source_id)
            target_node = self.nodes_by_id.get(target_id)

            if not source_node or not target_node:
                # Invalid link (dangling endpoint) - remove
                stats["rejected"] += 1
                links_to_remove.append(link)
                continue

            # Validate using heuristics
            validation_result = self._validate_relationship_heuristic(
                source_node, target_node, link
            )

            if validation_result["action"] == "keep":
                # Update confidence if suggested; track direction of change
                if "new_confidence" in validation_result:
                    link["confidence"] = validation_result["new_confidence"]
                    link["validation_reason"] = validation_result.get("reason", "heuristic")
                    if validation_result["new_confidence"] > confidence:
                        stats["upgraded"] += 1
                    elif validation_result["new_confidence"] < confidence:
                        stats["downgraded"] += 1
                stats["validated"] += 1
                links_to_keep.append(link)

            elif validation_result["action"] == "remove":
                stats["rejected"] += 1
                links_to_remove.append(link)
                logger.debug(
                    f"Removing low-confidence relationship: {source_node.get('name')} "
                    f"--[{link.get('relation_type')}]--> {target_node.get('name')} "
                    f"(reason: {validation_result.get('reason', 'unknown')})"
                )

            elif validation_result["action"] == "llm_validate" and llm:
                # Use LLM for ambiguous cases
                llm_result = self._validate_relationship_with_llm(
                    source_node, target_node, link, llm
                )
                if llm_result["valid"]:
                    link["confidence"] = llm_result.get("confidence", confidence)
                    link["validation_reason"] = "llm_validated"
                    stats["validated"] += 1
                    links_to_keep.append(link)
                else:
                    stats["rejected"] += 1
                    links_to_remove.append(link)
            else:
                # Default (including llm_validate with no LLM available):
                # keep with the same confidence
                links_to_keep.append(link)
                stats["validated"] += 1

        # Update links
        # NOTE(review): removed links are not purged from self.existing_links,
        # so later enrichment passes will not re-create them — verify that is
        # the intended behavior.
        self.graph_data["links"] = links_to_keep

        # Log removed links for analysis
        if links_to_remove:
            logger.info(f"Removed {len(links_to_remove)} invalid low-confidence relationships")

        self.stats["low_confidence_validation"] = stats
        logger.info(
            f"Low-confidence validation: {stats['candidates']} candidates, "
            f"{stats['validated']} validated, {stats['rejected']} rejected, "
            f"{stats['upgraded']} upgraded, {stats['downgraded']} downgraded"
        )

        return stats
1806
+
1807
+ def _validate_relationship_heuristic(
1808
+ self,
1809
+ source_node: Dict,
1810
+ target_node: Dict,
1811
+ link: Dict
1812
+ ) -> Dict[str, Any]:
1813
+ """
1814
+ Validate a relationship using heuristic rules.
1815
+
1816
+ Returns:
1817
+ Dict with 'action' (keep/remove/llm_validate) and optional 'new_confidence'
1818
+ """
1819
+ source_type = source_node.get("type", "").lower()
1820
+ target_type = target_node.get("type", "").lower()
1821
+ relation_type = link.get("relation_type", "").lower()
1822
+ confidence = link.get("confidence", 0.5)
1823
+
1824
+ # Rule 1: Invalid type combinations for specific relationships
1825
+ invalid_combinations = {
1826
+ # imports should be between code entities
1827
+ "imports": {
1828
+ "invalid_source": {"feature", "concept", "documentation", "requirement"},
1829
+ "invalid_target": {"feature", "concept", "documentation", "requirement"},
1830
+ },
1831
+ # implements should have code as source
1832
+ "implements": {
1833
+ "invalid_source": {"documentation", "concept", "glossary_term"},
1834
+ },
1835
+ # contains should have container as source
1836
+ "contains": {
1837
+ "invalid_source": {"constant", "variable", "field", "property"},
1838
+ },
1839
+ # tests should have test as source
1840
+ "tests": {
1841
+ "invalid_source": {"class", "function", "method", "module"},
1842
+ },
1843
+ }
1844
+
1845
+ if relation_type in invalid_combinations:
1846
+ rules = invalid_combinations[relation_type]
1847
+ if source_type in rules.get("invalid_source", set()):
1848
+ return {"action": "remove", "reason": f"invalid_source_type:{source_type}"}
1849
+ if target_type in rules.get("invalid_target", set()):
1850
+ return {"action": "remove", "reason": f"invalid_target_type:{target_type}"}
1851
+
1852
+ # Rule 2: Boost confidence for semantically valid combinations
1853
+ valid_combinations = {
1854
+ ("class", "interface", "implements"): 0.9,
1855
+ ("method", "function", "calls"): 0.85,
1856
+ ("test_case", "function", "tests"): 0.9,
1857
+ ("test_case", "class", "tests"): 0.9,
1858
+ ("documentation", "class", "documents"): 0.85,
1859
+ ("documentation", "function", "documents"): 0.85,
1860
+ ("ticket", "feature", "implements"): 0.8,
1861
+ ("feature", "requirement", "implements"): 0.85,
1862
+ ("toolkit", "tool", "contains"): 0.95,
1863
+ ("module", "class", "contains"): 0.9,
1864
+ ("class", "method", "contains"): 0.95,
1865
+ }
1866
+
1867
+ combo_key = (source_type, target_type, relation_type)
1868
+ if combo_key in valid_combinations:
1869
+ suggested_confidence = valid_combinations[combo_key]
1870
+ return {
1871
+ "action": "keep",
1872
+ "new_confidence": max(confidence, suggested_confidence),
1873
+ "reason": f"valid_combination:{combo_key}"
1874
+ }
1875
+
1876
+ # Rule 3: Check name overlap for related_to relationships
1877
+ if relation_type == "related_to":
1878
+ source_words = self._tokenize_name(source_node.get("name", ""))
1879
+ target_words = self._tokenize_name(target_node.get("name", ""))
1880
+
1881
+ if source_words and target_words:
1882
+ overlap = len(source_words & target_words)
1883
+ if overlap >= 2:
1884
+ # Good overlap - boost confidence
1885
+ return {
1886
+ "action": "keep",
1887
+ "new_confidence": min(confidence + 0.2, 0.9),
1888
+ "reason": f"name_overlap:{overlap}"
1889
+ }
1890
+ elif overlap == 0 and confidence < 0.5:
1891
+ # No overlap and low confidence - consider removal
1892
+ return {"action": "llm_validate", "reason": "no_name_overlap"}
1893
+
1894
+ # Rule 4: Very low confidence with no semantic support
1895
+ if confidence < 0.4:
1896
+ # Check if there's any semantic basis
1897
+ source_name = source_node.get("name", "").lower()
1898
+ target_name = target_node.get("name", "").lower()
1899
+
1900
+ if (source_name not in target_name and
1901
+ target_name not in source_name and
1902
+ self._word_overlap_score(source_name, target_name) < 0.3):
1903
+ return {"action": "remove", "reason": "very_low_confidence_no_semantic_support"}
1904
+
1905
+ # Default: keep with same confidence
1906
+ return {"action": "keep", "reason": "default"}
1907
+
1908
    def _validate_relationship_with_llm(
        self,
        source_node: Dict,
        target_node: Dict,
        link: Dict,
        llm: Any
    ) -> Dict[str, Any]:
        """
        Use LLM to validate an ambiguous relationship.

        Args:
            source_node: Source entity
            target_node: Target entity
            link: The relationship to validate
            llm: LLM instance for validation

        Returns:
            Dict with 'valid' (bool), 'confidence' (float) and 'reason' (str).
            On any LLM failure the relationship is treated as valid with its
            current confidence (fail-open).
        """
        # Imported lazily so the enricher works without langchain installed
        # unless LLM validation is actually requested.
        from langchain_core.prompts import ChatPromptTemplate
        from langchain_core.output_parsers import JsonOutputParser

        prompt_template = """Validate if the following relationship makes semantic sense.

Source Entity:
- Name: {source_name}
- Type: {source_type}
- Description: {source_desc}

Relationship: {relation_type}

Target Entity:
- Name: {target_name}
- Type: {target_type}
- Description: {target_desc}

Question: Does it make sense that "{source_name}" {relation_type} "{target_name}"?

Respond with ONLY a JSON object:
{{"valid": true/false, "confidence": 0.0-1.0, "reason": "<brief explanation>"}}
"""

        try:
            prompt = ChatPromptTemplate.from_template(prompt_template)
            parser = JsonOutputParser()
            # LCEL pipeline: prompt -> model -> JSON parsing
            chain = prompt | llm | parser

            result = chain.invoke({
                "source_name": source_node.get("name", ""),
                "source_type": source_node.get("type", ""),
                "source_desc": source_node.get("description", "No description"),
                "relation_type": link.get("relation_type", "related_to"),
                "target_name": target_node.get("name", ""),
                "target_type": target_node.get("type", ""),
                "target_desc": target_node.get("description", "No description"),
            })

            # Defensive defaults in case the model omits fields.
            return {
                "valid": result.get("valid", False),
                "confidence": result.get("confidence", 0.5),
                "reason": result.get("reason", "llm_validated")
            }

        except Exception as e:
            logger.warning(f"LLM validation failed: {e}")
            # On LLM failure, keep the relationship
            return {"valid": True, "confidence": link.get("confidence", 0.5)}
1975
+
1976
+ def enrich(
1977
+ self,
1978
+ normalize_types: bool = True, # Normalize entity types first
1979
+ deduplicate: bool = False, # Disabled by default - can lose semantic meaning
1980
+ cross_source: bool = True,
1981
+ semantic_links: bool = True,
1982
+ toolkit_tools: bool = True, # Link tools to their toolkits
1983
+ orphans: bool = True,
1984
+ similarity: bool = False, # Disabled by default - can create too many links
1985
+ validate_low_confidence: bool = True, # Validate relationships with confidence < 0.7
1986
+ confidence_threshold: float = 0.7, # Threshold for low-confidence validation
1987
+ min_similarity: float = 0.9,
1988
+ exact_match_only: bool = True,
1989
+ llm: Optional[Any] = None, # Optional LLM for relationship validation
1990
+ ):
1991
+ """
1992
+ Run all enrichment steps.
1993
+
1994
+ The recommended order is:
1995
+ 0. Normalize entity types (Tool/tool/Tools → tool)
1996
+ 1. Deduplicate entities (DISABLED by default - use with caution)
1997
+ 2. Link tools to toolkits (explicit toolkit → tool relationships)
1998
+ 3. Create cross-source links (code ↔ docs)
1999
+ 4. Create semantic links (shared concepts) - LINKS related entities
2000
+ 5. Connect orphans
2001
+ 6. Similarity links (optional)
2002
+ 7. Validate low-confidence relationships
2003
+
2004
+ Args:
2005
+ normalize_types: Normalize entity types to canonical forms
2006
+ deduplicate: Merge entities with exact same name (DISABLED by default)
2007
+ cross_source: Link same-named entities across sources
2008
+ semantic_links: Link entities sharing significant words
2009
+ toolkit_tools: Create explicit toolkit → tool relationships
2010
+ orphans: Connect orphan nodes to related entities
2011
+ similarity: Link highly similar entity names
2012
+ validate_low_confidence: Validate relationships below confidence_threshold
2013
+ confidence_threshold: Threshold for low-confidence validation (default: 0.7)
2014
+ min_similarity: Threshold for similarity matching
2015
+ exact_match_only: Only merge exact name matches if dedup enabled
2016
+ llm: Optional LLM instance for validating ambiguous relationships
2017
+ """
2018
+ # Step 0: Normalize entity types (Tool/tool/Tools → tool)
2019
+ if normalize_types:
2020
+ self.normalize_entity_types()
2021
+
2022
+ # Step 1: Deduplication (DISABLED by default - can lose semantic meaning)
2023
+ if deduplicate:
2024
+ self.deduplicate_entities(require_exact_match=exact_match_only)
2025
+
2026
+ # Step 2: Link tools to their toolkits (high priority - structural)
2027
+ if toolkit_tools:
2028
+ self.enrich_toolkit_tool_links()
2029
+
2030
+ # Step 3: Cross-source linking
2031
+ if cross_source:
2032
+ self.enrich_cross_source_links()
2033
+
2034
+ # Step 4: Semantic cross-linking (LINKS related entities, doesn't merge)
2035
+ if semantic_links:
2036
+ self.enrich_semantic_links()
2037
+
2038
+ # Step 5: Orphan connections
2039
+ if orphans:
2040
+ self.enrich_orphan_nodes()
2041
+
2042
+ # Step 6: High similarity links (optional)
2043
+ if similarity:
2044
+ self.enrich_similarity_links(min_similarity)
2045
+
2046
+ # Step 7: Validate low-confidence relationships
2047
+ if validate_low_confidence:
2048
+ self.validate_low_confidence_relationships(
2049
+ confidence_threshold=confidence_threshold,
2050
+ llm=llm
2051
+ )
2052
+
2053
+ logger.info(f"Enrichment complete: {len(self.new_links)} new links added")
2054
+ return self.stats
2055
+
2056
+ def save(self, output_path: Optional[str] = None):
2057
+ """
2058
+ Save enriched graph.
2059
+
2060
+ Args:
2061
+ output_path: Optional output path. If None, overwrites input file.
2062
+ """
2063
+ output = Path(output_path) if output_path else self.graph_path
2064
+
2065
+ # Merge new links
2066
+ all_links = self.graph_data.get("links", []) + self.new_links
2067
+ self.graph_data["links"] = all_links
2068
+
2069
+ # Add enrichment metadata
2070
+ if "metadata" not in self.graph_data:
2071
+ self.graph_data["metadata"] = {}
2072
+ self.graph_data["metadata"]["enrichment_stats"] = self.stats
2073
+
2074
+ with open(output, "w") as f:
2075
+ json.dump(self.graph_data, f, indent=2)
2076
+
2077
+ logger.info(f"Saved enriched graph to {output}")
2078
+ return str(output)
2079
+
2080
+ def get_stats(self) -> Dict[str, Any]:
2081
+ """Get enrichment statistics."""
2082
+ return {
2083
+ **self.stats,
2084
+ "total_new_links": len(self.new_links),
2085
+ "original_nodes": len(self.nodes_by_id) + self.stats.get("entities_merged", 0),
2086
+ "final_nodes": len(self.nodes_by_id),
2087
+ "original_links": len(self.graph_data.get("links", [])) - len(self.new_links),
2088
+ "final_links": len(self.graph_data.get("links", [])),
2089
+ }
2090
+
2091
+
2092
def enrich_graph(
    graph_path: str,
    output_path: Optional[str] = None,
    deduplicate: bool = False,  # Disabled by default
    cross_source: bool = True,
    semantic_links: bool = True,
    toolkit_tools: bool = True,
    orphans: bool = True,
    similarity: bool = False,
    validate_low_confidence: bool = True,
    confidence_threshold: float = 0.7,
    llm: Optional[Any] = None,
    normalize_types: bool = True,
    min_similarity: float = 0.9,
    exact_match_only: bool = True,
) -> Dict[str, Any]:
    """
    Convenience function to enrich a graph file.

    Previously this wrapper silently pinned ``normalize_types``,
    ``min_similarity`` and ``exact_match_only`` to GraphEnricher.enrich's
    defaults; they are now exposed (appended with the same defaults, so
    existing callers are unaffected) and forwarded.

    Args:
        graph_path: Path to input graph JSON
        output_path: Path to output (default: overwrite input)
        deduplicate: Merge same/similar entities (disabled by default)
        cross_source: Create cross-source links
        semantic_links: Create semantic cross-links
        toolkit_tools: Link tools to their toolkits
        orphans: Connect orphan nodes
        similarity: Create similarity links
        validate_low_confidence: Validate relationships below confidence_threshold
        confidence_threshold: Threshold for low-confidence validation (default: 0.7)
        llm: Optional LLM instance for validating ambiguous relationships
        normalize_types: Normalize entity types to canonical forms
        min_similarity: Threshold used when similarity linking is enabled
        exact_match_only: Only merge exact name matches if dedup enabled

    Returns:
        Enrichment statistics
    """
    enricher = GraphEnricher(graph_path)
    stats = enricher.enrich(
        normalize_types=normalize_types,
        deduplicate=deduplicate,
        cross_source=cross_source,
        semantic_links=semantic_links,
        toolkit_tools=toolkit_tools,
        orphans=orphans,
        similarity=similarity,
        validate_low_confidence=validate_low_confidence,
        confidence_threshold=confidence_threshold,
        min_similarity=min_similarity,
        exact_match_only=exact_match_only,
        llm=llm,
    )
    enricher.save(output_path)
    return stats