alita-sdk 0.3.379__py3-none-any.whl → 0.3.627__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (278) hide show
  1. alita_sdk/cli/__init__.py +10 -0
  2. alita_sdk/cli/__main__.py +17 -0
  3. alita_sdk/cli/agent/__init__.py +5 -0
  4. alita_sdk/cli/agent/default.py +258 -0
  5. alita_sdk/cli/agent_executor.py +156 -0
  6. alita_sdk/cli/agent_loader.py +245 -0
  7. alita_sdk/cli/agent_ui.py +228 -0
  8. alita_sdk/cli/agents.py +3113 -0
  9. alita_sdk/cli/callbacks.py +647 -0
  10. alita_sdk/cli/cli.py +168 -0
  11. alita_sdk/cli/config.py +306 -0
  12. alita_sdk/cli/context/__init__.py +30 -0
  13. alita_sdk/cli/context/cleanup.py +198 -0
  14. alita_sdk/cli/context/manager.py +731 -0
  15. alita_sdk/cli/context/message.py +285 -0
  16. alita_sdk/cli/context/strategies.py +289 -0
  17. alita_sdk/cli/context/token_estimation.py +127 -0
  18. alita_sdk/cli/formatting.py +182 -0
  19. alita_sdk/cli/input_handler.py +419 -0
  20. alita_sdk/cli/inventory.py +1073 -0
  21. alita_sdk/cli/mcp_loader.py +315 -0
  22. alita_sdk/cli/testcases/__init__.py +94 -0
  23. alita_sdk/cli/testcases/data_generation.py +119 -0
  24. alita_sdk/cli/testcases/discovery.py +96 -0
  25. alita_sdk/cli/testcases/executor.py +84 -0
  26. alita_sdk/cli/testcases/logger.py +85 -0
  27. alita_sdk/cli/testcases/parser.py +172 -0
  28. alita_sdk/cli/testcases/prompts.py +91 -0
  29. alita_sdk/cli/testcases/reporting.py +125 -0
  30. alita_sdk/cli/testcases/setup.py +108 -0
  31. alita_sdk/cli/testcases/test_runner.py +282 -0
  32. alita_sdk/cli/testcases/utils.py +39 -0
  33. alita_sdk/cli/testcases/validation.py +90 -0
  34. alita_sdk/cli/testcases/workflow.py +196 -0
  35. alita_sdk/cli/toolkit.py +327 -0
  36. alita_sdk/cli/toolkit_loader.py +85 -0
  37. alita_sdk/cli/tools/__init__.py +43 -0
  38. alita_sdk/cli/tools/approval.py +224 -0
  39. alita_sdk/cli/tools/filesystem.py +1751 -0
  40. alita_sdk/cli/tools/planning.py +389 -0
  41. alita_sdk/cli/tools/terminal.py +414 -0
  42. alita_sdk/community/__init__.py +72 -12
  43. alita_sdk/community/inventory/__init__.py +236 -0
  44. alita_sdk/community/inventory/config.py +257 -0
  45. alita_sdk/community/inventory/enrichment.py +2137 -0
  46. alita_sdk/community/inventory/extractors.py +1469 -0
  47. alita_sdk/community/inventory/ingestion.py +3172 -0
  48. alita_sdk/community/inventory/knowledge_graph.py +1457 -0
  49. alita_sdk/community/inventory/parsers/__init__.py +218 -0
  50. alita_sdk/community/inventory/parsers/base.py +295 -0
  51. alita_sdk/community/inventory/parsers/csharp_parser.py +907 -0
  52. alita_sdk/community/inventory/parsers/go_parser.py +851 -0
  53. alita_sdk/community/inventory/parsers/html_parser.py +389 -0
  54. alita_sdk/community/inventory/parsers/java_parser.py +593 -0
  55. alita_sdk/community/inventory/parsers/javascript_parser.py +629 -0
  56. alita_sdk/community/inventory/parsers/kotlin_parser.py +768 -0
  57. alita_sdk/community/inventory/parsers/markdown_parser.py +362 -0
  58. alita_sdk/community/inventory/parsers/python_parser.py +604 -0
  59. alita_sdk/community/inventory/parsers/rust_parser.py +858 -0
  60. alita_sdk/community/inventory/parsers/swift_parser.py +832 -0
  61. alita_sdk/community/inventory/parsers/text_parser.py +322 -0
  62. alita_sdk/community/inventory/parsers/yaml_parser.py +370 -0
  63. alita_sdk/community/inventory/patterns/__init__.py +61 -0
  64. alita_sdk/community/inventory/patterns/ast_adapter.py +380 -0
  65. alita_sdk/community/inventory/patterns/loader.py +348 -0
  66. alita_sdk/community/inventory/patterns/registry.py +198 -0
  67. alita_sdk/community/inventory/presets.py +535 -0
  68. alita_sdk/community/inventory/retrieval.py +1403 -0
  69. alita_sdk/community/inventory/toolkit.py +173 -0
  70. alita_sdk/community/inventory/toolkit_utils.py +176 -0
  71. alita_sdk/community/inventory/visualize.py +1370 -0
  72. alita_sdk/configurations/__init__.py +1 -1
  73. alita_sdk/configurations/ado.py +141 -20
  74. alita_sdk/configurations/bitbucket.py +94 -2
  75. alita_sdk/configurations/confluence.py +130 -1
  76. alita_sdk/configurations/figma.py +76 -0
  77. alita_sdk/configurations/gitlab.py +91 -0
  78. alita_sdk/configurations/jira.py +103 -0
  79. alita_sdk/configurations/openapi.py +329 -0
  80. alita_sdk/configurations/qtest.py +72 -1
  81. alita_sdk/configurations/report_portal.py +96 -0
  82. alita_sdk/configurations/sharepoint.py +148 -0
  83. alita_sdk/configurations/testio.py +83 -0
  84. alita_sdk/configurations/testrail.py +88 -0
  85. alita_sdk/configurations/xray.py +93 -0
  86. alita_sdk/configurations/zephyr_enterprise.py +93 -0
  87. alita_sdk/configurations/zephyr_essential.py +75 -0
  88. alita_sdk/runtime/clients/artifact.py +3 -3
  89. alita_sdk/runtime/clients/client.py +388 -46
  90. alita_sdk/runtime/clients/mcp_discovery.py +342 -0
  91. alita_sdk/runtime/clients/mcp_manager.py +262 -0
  92. alita_sdk/runtime/clients/sandbox_client.py +8 -21
  93. alita_sdk/runtime/langchain/_constants_bkup.py +1318 -0
  94. alita_sdk/runtime/langchain/assistant.py +157 -39
  95. alita_sdk/runtime/langchain/constants.py +647 -1
  96. alita_sdk/runtime/langchain/document_loaders/AlitaDocxMammothLoader.py +315 -3
  97. alita_sdk/runtime/langchain/document_loaders/AlitaExcelLoader.py +103 -60
  98. alita_sdk/runtime/langchain/document_loaders/AlitaJSONLinesLoader.py +77 -0
  99. alita_sdk/runtime/langchain/document_loaders/AlitaJSONLoader.py +10 -4
  100. alita_sdk/runtime/langchain/document_loaders/AlitaPowerPointLoader.py +226 -7
  101. alita_sdk/runtime/langchain/document_loaders/AlitaTextLoader.py +5 -2
  102. alita_sdk/runtime/langchain/document_loaders/constants.py +40 -19
  103. alita_sdk/runtime/langchain/langraph_agent.py +405 -84
  104. alita_sdk/runtime/langchain/utils.py +106 -7
  105. alita_sdk/runtime/llms/preloaded.py +2 -6
  106. alita_sdk/runtime/models/mcp_models.py +61 -0
  107. alita_sdk/runtime/skills/__init__.py +91 -0
  108. alita_sdk/runtime/skills/callbacks.py +498 -0
  109. alita_sdk/runtime/skills/discovery.py +540 -0
  110. alita_sdk/runtime/skills/executor.py +610 -0
  111. alita_sdk/runtime/skills/input_builder.py +371 -0
  112. alita_sdk/runtime/skills/models.py +330 -0
  113. alita_sdk/runtime/skills/registry.py +355 -0
  114. alita_sdk/runtime/skills/skill_runner.py +330 -0
  115. alita_sdk/runtime/toolkits/__init__.py +31 -0
  116. alita_sdk/runtime/toolkits/application.py +29 -10
  117. alita_sdk/runtime/toolkits/artifact.py +20 -11
  118. alita_sdk/runtime/toolkits/datasource.py +13 -6
  119. alita_sdk/runtime/toolkits/mcp.py +783 -0
  120. alita_sdk/runtime/toolkits/mcp_config.py +1048 -0
  121. alita_sdk/runtime/toolkits/planning.py +178 -0
  122. alita_sdk/runtime/toolkits/skill_router.py +238 -0
  123. alita_sdk/runtime/toolkits/subgraph.py +251 -6
  124. alita_sdk/runtime/toolkits/tools.py +356 -69
  125. alita_sdk/runtime/toolkits/vectorstore.py +11 -5
  126. alita_sdk/runtime/tools/__init__.py +10 -3
  127. alita_sdk/runtime/tools/application.py +27 -6
  128. alita_sdk/runtime/tools/artifact.py +511 -28
  129. alita_sdk/runtime/tools/data_analysis.py +183 -0
  130. alita_sdk/runtime/tools/function.py +67 -35
  131. alita_sdk/runtime/tools/graph.py +10 -4
  132. alita_sdk/runtime/tools/image_generation.py +148 -46
  133. alita_sdk/runtime/tools/llm.py +1003 -128
  134. alita_sdk/runtime/tools/loop.py +3 -1
  135. alita_sdk/runtime/tools/loop_output.py +3 -1
  136. alita_sdk/runtime/tools/mcp_inspect_tool.py +284 -0
  137. alita_sdk/runtime/tools/mcp_remote_tool.py +181 -0
  138. alita_sdk/runtime/tools/mcp_server_tool.py +8 -5
  139. alita_sdk/runtime/tools/planning/__init__.py +36 -0
  140. alita_sdk/runtime/tools/planning/models.py +246 -0
  141. alita_sdk/runtime/tools/planning/wrapper.py +607 -0
  142. alita_sdk/runtime/tools/router.py +2 -4
  143. alita_sdk/runtime/tools/sandbox.py +65 -48
  144. alita_sdk/runtime/tools/skill_router.py +776 -0
  145. alita_sdk/runtime/tools/tool.py +3 -1
  146. alita_sdk/runtime/tools/vectorstore.py +9 -3
  147. alita_sdk/runtime/tools/vectorstore_base.py +70 -14
  148. alita_sdk/runtime/utils/AlitaCallback.py +137 -21
  149. alita_sdk/runtime/utils/constants.py +5 -1
  150. alita_sdk/runtime/utils/mcp_client.py +492 -0
  151. alita_sdk/runtime/utils/mcp_oauth.py +361 -0
  152. alita_sdk/runtime/utils/mcp_sse_client.py +434 -0
  153. alita_sdk/runtime/utils/mcp_tools_discovery.py +124 -0
  154. alita_sdk/runtime/utils/serialization.py +155 -0
  155. alita_sdk/runtime/utils/streamlit.py +40 -13
  156. alita_sdk/runtime/utils/toolkit_utils.py +30 -9
  157. alita_sdk/runtime/utils/utils.py +36 -0
  158. alita_sdk/tools/__init__.py +134 -35
  159. alita_sdk/tools/ado/repos/__init__.py +51 -32
  160. alita_sdk/tools/ado/repos/repos_wrapper.py +148 -89
  161. alita_sdk/tools/ado/test_plan/__init__.py +25 -9
  162. alita_sdk/tools/ado/test_plan/test_plan_wrapper.py +23 -1
  163. alita_sdk/tools/ado/utils.py +1 -18
  164. alita_sdk/tools/ado/wiki/__init__.py +25 -12
  165. alita_sdk/tools/ado/wiki/ado_wrapper.py +291 -22
  166. alita_sdk/tools/ado/work_item/__init__.py +26 -13
  167. alita_sdk/tools/ado/work_item/ado_wrapper.py +73 -11
  168. alita_sdk/tools/advanced_jira_mining/__init__.py +11 -8
  169. alita_sdk/tools/aws/delta_lake/__init__.py +13 -9
  170. alita_sdk/tools/aws/delta_lake/tool.py +5 -1
  171. alita_sdk/tools/azure_ai/search/__init__.py +11 -8
  172. alita_sdk/tools/azure_ai/search/api_wrapper.py +1 -1
  173. alita_sdk/tools/base/tool.py +5 -1
  174. alita_sdk/tools/base_indexer_toolkit.py +271 -84
  175. alita_sdk/tools/bitbucket/__init__.py +17 -11
  176. alita_sdk/tools/bitbucket/api_wrapper.py +59 -11
  177. alita_sdk/tools/bitbucket/cloud_api_wrapper.py +49 -35
  178. alita_sdk/tools/browser/__init__.py +5 -4
  179. alita_sdk/tools/carrier/__init__.py +5 -6
  180. alita_sdk/tools/carrier/backend_reports_tool.py +6 -6
  181. alita_sdk/tools/carrier/run_ui_test_tool.py +6 -6
  182. alita_sdk/tools/carrier/ui_reports_tool.py +5 -5
  183. alita_sdk/tools/chunkers/__init__.py +3 -1
  184. alita_sdk/tools/chunkers/code/treesitter/treesitter.py +37 -13
  185. alita_sdk/tools/chunkers/sematic/json_chunker.py +1 -0
  186. alita_sdk/tools/chunkers/sematic/markdown_chunker.py +97 -6
  187. alita_sdk/tools/chunkers/sematic/proposal_chunker.py +1 -1
  188. alita_sdk/tools/chunkers/universal_chunker.py +270 -0
  189. alita_sdk/tools/cloud/aws/__init__.py +10 -7
  190. alita_sdk/tools/cloud/azure/__init__.py +10 -7
  191. alita_sdk/tools/cloud/gcp/__init__.py +10 -7
  192. alita_sdk/tools/cloud/k8s/__init__.py +10 -7
  193. alita_sdk/tools/code/linter/__init__.py +10 -8
  194. alita_sdk/tools/code/loaders/codesearcher.py +3 -2
  195. alita_sdk/tools/code/sonar/__init__.py +11 -8
  196. alita_sdk/tools/code_indexer_toolkit.py +82 -22
  197. alita_sdk/tools/confluence/__init__.py +22 -16
  198. alita_sdk/tools/confluence/api_wrapper.py +107 -30
  199. alita_sdk/tools/confluence/loader.py +14 -2
  200. alita_sdk/tools/custom_open_api/__init__.py +12 -5
  201. alita_sdk/tools/elastic/__init__.py +11 -8
  202. alita_sdk/tools/elitea_base.py +493 -30
  203. alita_sdk/tools/figma/__init__.py +58 -11
  204. alita_sdk/tools/figma/api_wrapper.py +1235 -143
  205. alita_sdk/tools/figma/figma_client.py +73 -0
  206. alita_sdk/tools/figma/toon_tools.py +2748 -0
  207. alita_sdk/tools/github/__init__.py +14 -15
  208. alita_sdk/tools/github/github_client.py +224 -100
  209. alita_sdk/tools/github/graphql_client_wrapper.py +119 -33
  210. alita_sdk/tools/github/schemas.py +14 -5
  211. alita_sdk/tools/github/tool.py +5 -1
  212. alita_sdk/tools/github/tool_prompts.py +9 -22
  213. alita_sdk/tools/gitlab/__init__.py +16 -11
  214. alita_sdk/tools/gitlab/api_wrapper.py +218 -48
  215. alita_sdk/tools/gitlab_org/__init__.py +10 -9
  216. alita_sdk/tools/gitlab_org/api_wrapper.py +63 -64
  217. alita_sdk/tools/google/bigquery/__init__.py +13 -12
  218. alita_sdk/tools/google/bigquery/tool.py +5 -1
  219. alita_sdk/tools/google_places/__init__.py +11 -8
  220. alita_sdk/tools/google_places/api_wrapper.py +1 -1
  221. alita_sdk/tools/jira/__init__.py +17 -10
  222. alita_sdk/tools/jira/api_wrapper.py +92 -41
  223. alita_sdk/tools/keycloak/__init__.py +11 -8
  224. alita_sdk/tools/localgit/__init__.py +9 -3
  225. alita_sdk/tools/localgit/local_git.py +62 -54
  226. alita_sdk/tools/localgit/tool.py +5 -1
  227. alita_sdk/tools/memory/__init__.py +12 -4
  228. alita_sdk/tools/non_code_indexer_toolkit.py +1 -0
  229. alita_sdk/tools/ocr/__init__.py +11 -8
  230. alita_sdk/tools/openapi/__init__.py +491 -106
  231. alita_sdk/tools/openapi/api_wrapper.py +1368 -0
  232. alita_sdk/tools/openapi/tool.py +20 -0
  233. alita_sdk/tools/pandas/__init__.py +20 -12
  234. alita_sdk/tools/pandas/api_wrapper.py +38 -25
  235. alita_sdk/tools/pandas/dataframe/generator/base.py +3 -1
  236. alita_sdk/tools/postman/__init__.py +10 -9
  237. alita_sdk/tools/pptx/__init__.py +11 -10
  238. alita_sdk/tools/pptx/pptx_wrapper.py +1 -1
  239. alita_sdk/tools/qtest/__init__.py +31 -11
  240. alita_sdk/tools/qtest/api_wrapper.py +2135 -86
  241. alita_sdk/tools/rally/__init__.py +10 -9
  242. alita_sdk/tools/rally/api_wrapper.py +1 -1
  243. alita_sdk/tools/report_portal/__init__.py +12 -8
  244. alita_sdk/tools/salesforce/__init__.py +10 -8
  245. alita_sdk/tools/servicenow/__init__.py +17 -15
  246. alita_sdk/tools/servicenow/api_wrapper.py +1 -1
  247. alita_sdk/tools/sharepoint/__init__.py +10 -7
  248. alita_sdk/tools/sharepoint/api_wrapper.py +129 -38
  249. alita_sdk/tools/sharepoint/authorization_helper.py +191 -1
  250. alita_sdk/tools/sharepoint/utils.py +8 -2
  251. alita_sdk/tools/slack/__init__.py +10 -7
  252. alita_sdk/tools/slack/api_wrapper.py +2 -2
  253. alita_sdk/tools/sql/__init__.py +12 -9
  254. alita_sdk/tools/testio/__init__.py +10 -7
  255. alita_sdk/tools/testrail/__init__.py +11 -10
  256. alita_sdk/tools/testrail/api_wrapper.py +1 -1
  257. alita_sdk/tools/utils/__init__.py +9 -4
  258. alita_sdk/tools/utils/content_parser.py +103 -18
  259. alita_sdk/tools/utils/text_operations.py +410 -0
  260. alita_sdk/tools/utils/tool_prompts.py +79 -0
  261. alita_sdk/tools/vector_adapters/VectorStoreAdapter.py +30 -13
  262. alita_sdk/tools/xray/__init__.py +13 -9
  263. alita_sdk/tools/yagmail/__init__.py +9 -3
  264. alita_sdk/tools/zephyr/__init__.py +10 -7
  265. alita_sdk/tools/zephyr_enterprise/__init__.py +11 -7
  266. alita_sdk/tools/zephyr_essential/__init__.py +10 -7
  267. alita_sdk/tools/zephyr_essential/api_wrapper.py +30 -13
  268. alita_sdk/tools/zephyr_essential/client.py +2 -2
  269. alita_sdk/tools/zephyr_scale/__init__.py +11 -8
  270. alita_sdk/tools/zephyr_scale/api_wrapper.py +2 -2
  271. alita_sdk/tools/zephyr_squad/__init__.py +10 -7
  272. {alita_sdk-0.3.379.dist-info → alita_sdk-0.3.627.dist-info}/METADATA +154 -8
  273. alita_sdk-0.3.627.dist-info/RECORD +468 -0
  274. alita_sdk-0.3.627.dist-info/entry_points.txt +2 -0
  275. alita_sdk-0.3.379.dist-info/RECORD +0 -360
  276. {alita_sdk-0.3.379.dist-info → alita_sdk-0.3.627.dist-info}/WHEEL +0 -0
  277. {alita_sdk-0.3.379.dist-info → alita_sdk-0.3.627.dist-info}/licenses/LICENSE +0 -0
  278. {alita_sdk-0.3.379.dist-info → alita_sdk-0.3.627.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,1457 @@
1
+ """
2
+ NetworkX-based Knowledge Graph implementation.
3
+
4
+ Provides lightweight in-memory graph storage with JSON persistence.
5
+ Entities contain citations (source file, line numbers) instead of raw content.
6
+ Raw data should be retrieved on-demand using filesystem tools.
7
+ """
8
+
9
+ import json
10
+ import logging
11
+ from datetime import datetime
12
+ from typing import Any, Optional, List, Dict, Set
13
+ from collections import defaultdict
14
+
15
+ try:
16
+ import networkx as nx
17
+ from networkx import DiGraph
18
+ except ImportError:
19
+ nx = None
20
+
21
+ logger = logging.getLogger(__name__)
22
+
23
+
24
class Citation:
    """
    Lightweight reference to the source location of an entity.

    A citation records only *where* an entity came from (a file path plus
    an optional line range and provenance metadata); the referenced content
    itself is fetched on demand with filesystem tools rather than stored
    in the graph.
    """

    def __init__(
        self,
        file_path: str,
        line_start: Optional[int] = None,
        line_end: Optional[int] = None,
        source_toolkit: Optional[str] = None,
        doc_id: Optional[str] = None,
        content_hash: Optional[str] = None,
    ):
        self.file_path = file_path
        self.line_start = line_start
        self.line_end = line_end
        self.source_toolkit = source_toolkit
        self.doc_id = doc_id
        self.content_hash = content_hash

    # Field names serialized by to_dict()/consumed by from_dict(), in order.
    _FIELDS = (
        'file_path', 'line_start', 'line_end',
        'source_toolkit', 'doc_id', 'content_hash',
    )

    def to_dict(self) -> Dict[str, Any]:
        """Serialize this citation as a plain dictionary."""
        return {field: getattr(self, field) for field in self._FIELDS}

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> 'Citation':
        """Rebuild a Citation from a dictionary produced by ``to_dict``."""
        return cls(
            data.get('file_path', ''),
            data.get('line_start'),
            data.get('line_end'),
            data.get('source_toolkit'),
            data.get('doc_id'),
            data.get('content_hash'),
        )

    def __repr__(self) -> str:
        # Render as "path", "path:start" or "path:start-end" depending on
        # which line fields are populated (truthiness, matching callers).
        location = self.file_path
        if self.line_start:
            if self.line_end:
                location = f"{self.file_path}:{self.line_start}-{self.line_end}"
            else:
                location = f"{self.file_path}:{self.line_start}"
        return location
77
+
78
+
79
class KnowledgeGraph:
    """
    Lightweight NetworkX-based knowledge graph for storing entities and relationships.

    Design principles:
    - Graph contains only entity metadata and citations (not raw content)
    - Citations reference source files and line numbers
    - Raw content is retrieved on-demand via filesystem tools
    - Graph file stays small and portable

    Features:
    - In-memory property graph using NetworkX
    - JSON persistence via node_link_data format
    - Delta update support with source document tracking
    - Entity deduplication with merge strategies
    - Impact analysis via graph traversal
    - Enhanced search with fuzzy matching, token-based search, and file path patterns
    """

    # Layer classification based on entity types
    LAYER_TYPE_MAPPING = {
        'code': {
            'class', 'function', 'method', 'module', 'import', 'variable',
            'constant', 'attribute', 'decorator', 'exception', 'enum',
            'class_reference', 'class_import', 'function_import', 'function_reference',
            'function_call', 'method_call', 'test_function', 'pydanticmodel'
        },
        'service': {
            'api_endpoint', 'rpc_method', 'route', 'service', 'handler',
            'controller', 'middleware', 'event', 'sio', 'rpc'
        },
        'data': {
            'model', 'schema', 'field', 'table', 'database', 'migration',
            'entity', 'pydantic_model', 'dictionary', 'list', 'object'
        },
        'product': {
            'feature', 'capability', 'platform', 'product', 'application',
            'menu', 'ui_element', 'ui_component', 'interface_element'
        },
        'domain': {
            'concept', 'process', 'action', 'use_case', 'workflow',
            'requirement', 'guideline', 'best_practice'
        },
        'documentation': {
            'document', 'guide', 'section', 'subsection', 'tip',
            'example', 'resource', 'reference', 'documentation'
        },
        'configuration': {
            'configuration', 'configuration_option', 'configuration_section',
            'setting', 'credential', 'secret', 'integration'
        },
        'testing': {
            'test', 'test_case', 'test_function', 'fixture', 'mock'
        },
        'tooling': {
            'tool', 'toolkit', 'command', 'node_type', 'node'
        },
        'knowledge': {
            # Facts extracted from code and documentation
            'fact',
            # Code-specific fact types
            'algorithm', 'behavior', 'validation', 'dependency', 'error_handling',
            # Text-specific fact types
            'decision', 'definition', 'date', 'contact',
        },
        'structure': {
            # File-level container nodes
            'file', 'source_file', 'document_file', 'config_file', 'web_file',
            # Directory/package structure
            'directory', 'package',
        }
    }

    # Reverse mapping: type -> layer. Built with a comprehension rather
    # than a bare class-body ``for`` loop, which would leak the loop
    # variables (``layer``, ``types``, ``t``) as class attributes.
    # A type listed under several layers resolves to the last layer in
    # mapping order (e.g. 'test_function' -> 'testing'), matching the
    # last-write-wins behavior of the original loop.
    TYPE_TO_LAYER = {
        t: layer
        for layer, types in LAYER_TYPE_MAPPING.items()
        for t in types
    }
157
+
158
+ def __init__(self):
159
+ """Initialize an empty knowledge graph."""
160
+ if nx is None:
161
+ raise ImportError("networkx is required for KnowledgeGraph. Install with: pip install networkx>=3.0")
162
+
163
+ self._graph: DiGraph = DiGraph()
164
+ self._entity_index: Dict[str, Set[str]] = defaultdict(set) # name -> set of node_ids (handles duplicates)
165
+ self._type_index: Dict[str, Set[str]] = defaultdict(set) # type (lowercase) -> node_ids
166
+ self._file_index: Dict[str, Set[str]] = defaultdict(set) # file_path -> node_ids
167
+ self._source_doc_index: Dict[str, Set[str]] = defaultdict(set) # source_doc_id -> node_ids
168
+ self._metadata: Dict[str, Any] = {} # Graph metadata (sources, timestamps)
169
+ self._schema: Optional[Dict[str, Any]] = None # Discovered entity schema
170
+
171
+ # ========== Entity Operations ==========
172
+
173
+ def add_entity(
174
+ self,
175
+ entity_id: str,
176
+ name: str,
177
+ entity_type: str,
178
+ citation: Optional[Citation] = None,
179
+ properties: Optional[Dict[str, Any]] = None,
180
+ ) -> str:
181
+ """
182
+ Add an entity to the graph with optional citation.
183
+
184
+ If an entity with this ID already exists, the citation is merged
185
+ into the existing entity's citations list (enabling same-named
186
+ entities from different files to be unified).
187
+
188
+ Args:
189
+ entity_id: Unique identifier for the entity
190
+ name: Human-readable entity name
191
+ entity_type: Type classification (e.g., 'Class', 'Function', 'Service')
192
+ citation: Source citation (file path, line numbers)
193
+ properties: Additional properties (no raw content, only metadata)
194
+
195
+ Returns:
196
+ The entity_id (node ID in graph)
197
+ """
198
+ # Check if entity already exists (for merging citations)
199
+ existing = self._graph.nodes.get(entity_id)
200
+
201
+ if existing:
202
+ # Entity exists - merge the new citation
203
+ if citation:
204
+ new_citation_dict = citation.to_dict()
205
+ existing_citations = existing.get('citations', [])
206
+
207
+ # Migrate legacy single 'citation' to list
208
+ if 'citation' in existing and existing['citation']:
209
+ legacy = existing['citation']
210
+ if legacy not in existing_citations:
211
+ existing_citations.append(legacy)
212
+
213
+ # Add new citation if not duplicate
214
+ if new_citation_dict not in existing_citations:
215
+ existing_citations.append(new_citation_dict)
216
+
217
+ # Update node with merged citations
218
+ self._graph.nodes[entity_id]['citations'] = existing_citations
219
+ self._graph.nodes[entity_id].pop('citation', None) # Remove legacy field
220
+
221
+ # Track source document
222
+ if citation.doc_id:
223
+ self._source_doc_index[citation.doc_id].add(entity_id)
224
+
225
+ logger.debug(f"Merged citation into existing entity: {entity_type} '{name}' ({entity_id})")
226
+ return entity_id
227
+
228
+ # New entity - prepare node data
229
+ node_data = {
230
+ 'id': entity_id,
231
+ 'name': name,
232
+ 'type': entity_type,
233
+ }
234
+
235
+ # Auto-assign layer based on entity type
236
+ inferred_layer = self.TYPE_TO_LAYER.get(entity_type.lower())
237
+ if inferred_layer:
238
+ node_data['layer'] = inferred_layer
239
+
240
+ # Store citation in list format from the start
241
+ if citation:
242
+ node_data['citations'] = [citation.to_dict()]
243
+ # Track source document
244
+ if citation.doc_id:
245
+ self._source_doc_index[citation.doc_id].add(entity_id)
246
+ # Track file index
247
+ if citation.file_path:
248
+ self._file_index[citation.file_path].add(entity_id)
249
+
250
+ # Add other properties (excluding any large content)
251
+ if properties:
252
+ # Filter out raw content fields
253
+ excluded_keys = {'content', 'text', 'raw', 'body', 'source_content'}
254
+ for key, value in properties.items():
255
+ if key not in excluded_keys:
256
+ # Only store if serializable and reasonably sized
257
+ if isinstance(value, (str, int, float, bool, list, dict)) and \
258
+ (not isinstance(value, str) or len(value) < 1000):
259
+ node_data[key] = value
260
+
261
+ # Add new node
262
+ self._graph.add_node(entity_id, **node_data)
263
+
264
+ # Update indices - store ALL entities with this name (not just one)
265
+ self._entity_index[name.lower()].add(entity_id)
266
+ self._type_index[entity_type.lower()].add(entity_id)
267
+
268
+ logger.debug(f"Added entity: {entity_type} '{name}' ({entity_id})")
269
+ return entity_id
270
+
271
+ def get_entity(self, entity_id: str) -> Optional[Dict[str, Any]]:
272
+ """Get entity by ID."""
273
+ if self._graph.has_node(entity_id):
274
+ return dict(self._graph.nodes[entity_id])
275
+ return None
276
+
277
+ def find_entity_by_name(self, name: str) -> Optional[Dict[str, Any]]:
278
+ """
279
+ Find entity by name (case-insensitive).
280
+
281
+ If multiple entities have the same name, returns the first one found.
282
+ Use find_all_entities_by_name to get all matches.
283
+ """
284
+ node_ids = self._entity_index.get(name.lower(), set())
285
+ if node_ids:
286
+ # Return first match
287
+ return self.get_entity(next(iter(node_ids)))
288
+ return None
289
+
290
+ def find_all_entities_by_name(self, name: str) -> List[Dict[str, Any]]:
291
+ """
292
+ Find all entities with the given name (case-insensitive).
293
+
294
+ Returns all entities if multiple have the same name but different types.
295
+ """
296
+ node_ids = self._entity_index.get(name.lower(), set())
297
+ return [self.get_entity(nid) for nid in node_ids if nid]
298
+
299
+ def get_entities_by_type(self, entity_type: str, limit: Optional[int] = None) -> List[Dict[str, Any]]:
300
+ """
301
+ Get all entities of a specific type (case-insensitive).
302
+
303
+ Also checks layer-based type groups. For example, searching for 'code'
304
+ will return classes, functions, methods, etc.
305
+ """
306
+ entity_type_lower = entity_type.lower()
307
+
308
+ # Check if this is a layer name
309
+ if entity_type_lower in self.LAYER_TYPE_MAPPING:
310
+ # Get all types in this layer
311
+ results = []
312
+ for t in self.LAYER_TYPE_MAPPING[entity_type_lower]:
313
+ node_ids = self._type_index.get(t, set())
314
+ for nid in node_ids:
315
+ entity = self.get_entity(nid)
316
+ if entity:
317
+ results.append(entity)
318
+ if limit:
319
+ return results[:limit]
320
+ return results
321
+
322
+ # Use type index for fast lookup
323
+ node_ids = self._type_index.get(entity_type_lower, set())
324
+ if node_ids:
325
+ results = [self.get_entity(nid) for nid in node_ids if nid]
326
+ if limit:
327
+ return results[:limit]
328
+ return results
329
+
330
+ # Fallback: linear scan (for types not in index)
331
+ results = [
332
+ dict(data)
333
+ for _, data in self._graph.nodes(data=True)
334
+ if data.get('type', '').lower() == entity_type_lower
335
+ ]
336
+ if limit:
337
+ return results[:limit]
338
+ return results
339
+
340
+ def get_entities_by_layer(self, layer: str, limit: Optional[int] = None) -> List[Dict[str, Any]]:
341
+ """
342
+ Get all entities in a specific layer (product, domain, service, code, data, etc.).
343
+
344
+ Layer is inferred from entity type if not explicitly set on the entity.
345
+ """
346
+ layer_lower = layer.lower()
347
+
348
+ # Get types that belong to this layer
349
+ layer_types = self.LAYER_TYPE_MAPPING.get(layer_lower, set())
350
+
351
+ results = []
352
+ for _, data in self._graph.nodes(data=True):
353
+ # Check explicit layer
354
+ if data.get('layer', '').lower() == layer_lower:
355
+ results.append(dict(data))
356
+ continue
357
+
358
+ # Check if type belongs to this layer
359
+ entity_type = data.get('type', '').lower()
360
+ if entity_type in layer_types:
361
+ results.append(dict(data))
362
+
363
+ if limit:
364
+ return results[:limit]
365
+ return results
366
+
367
+ def get_all_entities(self, limit: Optional[int] = None) -> List[Dict[str, Any]]:
368
+ """Get all entities in the graph."""
369
+ results = [
370
+ {'id': node_id, **dict(data)}
371
+ for node_id, data in self._graph.nodes(data=True)
372
+ ]
373
+ if limit:
374
+ return results[:limit]
375
+ return results
376
+
377
+ def get_all_entity_types(self) -> List[str]:
378
+ """Get list of all entity types in the graph."""
379
+ types = set()
380
+ for _, data in self._graph.nodes(data=True):
381
+ if 'type' in data:
382
+ types.add(data['type'])
383
+ return sorted(types)
384
+
385
+ def update_entity(self, entity_id: str, updates: Dict[str, Any]) -> bool:
386
+ """
387
+ Update entity properties.
388
+
389
+ Args:
390
+ entity_id: Entity to update
391
+ updates: Properties to update (merged with existing)
392
+
393
+ Returns:
394
+ True if entity exists and was updated
395
+ """
396
+ if not self._graph.has_node(entity_id):
397
+ return False
398
+
399
+ # Filter out raw content
400
+ excluded_keys = {'content', 'text', 'raw', 'body', 'source_content'}
401
+ filtered_updates = {
402
+ k: v for k, v in updates.items()
403
+ if k not in excluded_keys
404
+ }
405
+
406
+ current = dict(self._graph.nodes[entity_id])
407
+ current.update(filtered_updates)
408
+
409
+ for key, value in current.items():
410
+ self._graph.nodes[entity_id][key] = value
411
+
412
+ return True
413
+
414
+ def remove_entity(self, entity_id: str) -> bool:
415
+ """Remove entity and its edges from the graph."""
416
+ if not self._graph.has_node(entity_id):
417
+ return False
418
+
419
+ # Remove from all indices
420
+ entity = self.get_entity(entity_id)
421
+ if entity:
422
+ # Remove from name index
423
+ name = entity.get('name', '').lower()
424
+ if name in self._entity_index:
425
+ self._entity_index[name].discard(entity_id)
426
+ if not self._entity_index[name]:
427
+ del self._entity_index[name]
428
+
429
+ # Remove from type index
430
+ entity_type = entity.get('type', '').lower()
431
+ if entity_type in self._type_index:
432
+ self._type_index[entity_type].discard(entity_id)
433
+ if not self._type_index[entity_type]:
434
+ del self._type_index[entity_type]
435
+
436
+ # Remove from file index
437
+ file_path = entity.get('file_path', '')
438
+ if file_path in self._file_index:
439
+ self._file_index[file_path].discard(entity_id)
440
+ if not self._file_index[file_path]:
441
+ del self._file_index[file_path]
442
+
443
+ # Remove from source doc index
444
+ for citation in entity.get('citations', []):
445
+ if isinstance(citation, dict):
446
+ doc_id = citation.get('doc_id')
447
+ if doc_id and entity_id in self._source_doc_index.get(doc_id, set()):
448
+ self._source_doc_index[doc_id].discard(entity_id)
449
+
450
+ self._graph.remove_node(entity_id)
451
+ return True
452
+
453
+ # ========== Relation Operations ==========
454
+
455
+ def add_relation(
456
+ self,
457
+ source_id: str,
458
+ target_id: str,
459
+ relation_type: str,
460
+ properties: Optional[Dict[str, Any]] = None,
461
+ ) -> bool:
462
+ """
463
+ Add a directed relation between entities.
464
+
465
+ Args:
466
+ source_id: Source entity ID
467
+ target_id: Target entity ID
468
+ relation_type: Type of relationship (e.g., 'CALLS', 'IMPORTS', 'INHERITS')
469
+ properties: Additional edge properties
470
+
471
+ Returns:
472
+ True if relation was added
473
+ """
474
+ if not self._graph.has_node(source_id):
475
+ logger.warning(f"Source entity {source_id} not found")
476
+ return False
477
+ if not self._graph.has_node(target_id):
478
+ logger.warning(f"Target entity {target_id} not found")
479
+ return False
480
+
481
+ edge_data = {'relation_type': relation_type}
482
+ if properties:
483
+ edge_data.update(properties)
484
+
485
+ self._graph.add_edge(source_id, target_id, **edge_data)
486
+ logger.debug(f"Added relation: {source_id} --[{relation_type}]--> {target_id}")
487
+ return True
488
+
489
+ def get_relations(self, entity_id: str, direction: str = 'both') -> List[Dict[str, Any]]:
490
+ """
491
+ Get relations for an entity.
492
+
493
+ Args:
494
+ entity_id: Entity ID
495
+ direction: 'outgoing', 'incoming', or 'both'
496
+
497
+ Returns:
498
+ List of relation dicts with source, target, type, properties
499
+ """
500
+ relations = []
501
+
502
+ if direction in ('outgoing', 'both'):
503
+ for _, target, data in self._graph.out_edges(entity_id, data=True):
504
+ relations.append({
505
+ 'source': entity_id,
506
+ 'target': target,
507
+ 'relation_type': data.get('relation_type'),
508
+ 'properties': {k: v for k, v in data.items() if k != 'relation_type'}
509
+ })
510
+
511
+ if direction in ('incoming', 'both'):
512
+ for source, _, data in self._graph.in_edges(entity_id, data=True):
513
+ relations.append({
514
+ 'source': source,
515
+ 'target': entity_id,
516
+ 'relation_type': data.get('relation_type'),
517
+ 'properties': {k: v for k, v in data.items() if k != 'relation_type'}
518
+ })
519
+
520
+ return relations
521
+
522
+ def remove_relation(self, source_id: str, target_id: str) -> bool:
523
+ """Remove a relation between entities."""
524
+ if self._graph.has_edge(source_id, target_id):
525
+ self._graph.remove_edge(source_id, target_id)
526
+ return True
527
+ return False
528
+
529
+ def get_relations_by_source(
530
+ self,
531
+ source_toolkit: str,
532
+ relation_type: Optional[str] = None
533
+ ) -> List[Dict[str, Any]]:
534
+ """
535
+ Get all relations from a specific source toolkit.
536
+
537
+ Args:
538
+ source_toolkit: Name of source toolkit (e.g., 'github', 'jira')
539
+ relation_type: Optional filter by relation type
540
+
541
+ Returns:
542
+ List of relations with their properties
543
+ """
544
+ relations = []
545
+
546
+ for source, target, data in self._graph.edges(data=True):
547
+ # Check if this relation is from the specified source
548
+ rel_source = data.get('source_toolkit')
549
+ if rel_source == source_toolkit:
550
+ # Filter by relation type if specified
551
+ if relation_type is None or data.get('relation_type') == relation_type:
552
+ relations.append({
553
+ 'source': source,
554
+ 'target': target,
555
+ 'relation_type': data.get('relation_type'),
556
+ 'source_toolkit': rel_source,
557
+ 'properties': {k: v for k, v in data.items()
558
+ if k not in ('relation_type', 'source_toolkit')}
559
+ })
560
+
561
+ return relations
562
+
563
+ def get_cross_source_relations(self) -> List[Dict[str, Any]]:
564
+ """
565
+ Get relations that connect entities from different sources.
566
+
567
+ These are particularly valuable for understanding how different
568
+ data sources relate to each other (e.g., Jira ticket references GitHub PR).
569
+
570
+ Returns:
571
+ List of cross-source relations
572
+ """
573
+ cross_source = []
574
+
575
+ for source, target, data in self._graph.edges(data=True):
576
+ source_node = self._graph.nodes.get(source, {})
577
+ target_node = self._graph.nodes.get(target, {})
578
+
579
+ # Get source toolkits from entity citations
580
+ source_citations = source_node.get('citations', [])
581
+ target_citations = target_node.get('citations', [])
582
+
583
+ if not source_citations or not target_citations:
584
+ continue
585
+
586
+ # Get unique source toolkits for each entity
587
+ source_toolkits = set()
588
+ target_toolkits = set()
589
+
590
+ for citation in source_citations:
591
+ if isinstance(citation, dict):
592
+ toolkit = citation.get('source_toolkit')
593
+ elif hasattr(citation, 'source_toolkit'):
594
+ toolkit = citation.source_toolkit
595
+ else:
596
+ toolkit = None
597
+ if toolkit:
598
+ source_toolkits.add(toolkit)
599
+
600
+ for citation in target_citations:
601
+ if isinstance(citation, dict):
602
+ toolkit = citation.get('source_toolkit')
603
+ elif hasattr(citation, 'source_toolkit'):
604
+ toolkit = citation.source_toolkit
605
+ else:
606
+ toolkit = None
607
+ if toolkit:
608
+ target_toolkits.add(toolkit)
609
+
610
+ # Check if entities come from different sources
611
+ if source_toolkits and target_toolkits and source_toolkits != target_toolkits:
612
+ cross_source.append({
613
+ 'source': source,
614
+ 'target': target,
615
+ 'source_toolkits': list(source_toolkits),
616
+ 'target_toolkits': list(target_toolkits),
617
+ 'relation_type': data.get('relation_type'),
618
+ 'relation_source': data.get('source_toolkit'),
619
+ 'properties': {k: v for k, v in data.items()
620
+ if k not in ('relation_type', 'source_toolkit')}
621
+ })
622
+
623
+ return cross_source
624
+
625
+ # ========== Graph Analysis ==========
626
+
627
+ def get_neighbors(
628
+ self,
629
+ entity_id: str,
630
+ max_depth: int = 1,
631
+ relation_types: Optional[List[str]] = None,
632
+ ) -> Dict[str, Any]:
633
+ """
634
+ Get neighboring entities up to a certain depth.
635
+
636
+ Args:
637
+ entity_id: Starting entity
638
+ max_depth: How many hops to traverse
639
+ relation_types: Filter by relation types
640
+
641
+ Returns:
642
+ Dict with entities and relations
643
+ """
644
+ if not self._graph.has_node(entity_id):
645
+ return {'entities': [], 'relations': []}
646
+
647
+ visited = {entity_id}
648
+ entities = [self.get_entity(entity_id)]
649
+ relations = []
650
+
651
+ current_level = [entity_id]
652
+
653
+ for _ in range(max_depth):
654
+ next_level = []
655
+
656
+ for node in current_level:
657
+ # Outgoing edges
658
+ for _, target, data in self._graph.out_edges(node, data=True):
659
+ rel_type = data.get('relation_type')
660
+ if relation_types and rel_type not in relation_types:
661
+ continue
662
+
663
+ relations.append({
664
+ 'source': node,
665
+ 'target': target,
666
+ 'relation_type': rel_type,
667
+ })
668
+
669
+ if target not in visited:
670
+ visited.add(target)
671
+ next_level.append(target)
672
+ entities.append(self.get_entity(target))
673
+
674
+ # Incoming edges
675
+ for source, _, data in self._graph.in_edges(node, data=True):
676
+ rel_type = data.get('relation_type')
677
+ if relation_types and rel_type not in relation_types:
678
+ continue
679
+
680
+ relations.append({
681
+ 'source': source,
682
+ 'target': node,
683
+ 'relation_type': rel_type,
684
+ })
685
+
686
+ if source not in visited:
687
+ visited.add(source)
688
+ next_level.append(source)
689
+ entities.append(self.get_entity(source))
690
+
691
+ current_level = next_level
692
+
693
+ return {'entities': entities, 'relations': relations}
694
+
695
+ def find_path(self, source_id: str, target_id: str) -> Optional[List[str]]:
696
+ """Find shortest path between two entities."""
697
+ if not self._graph.has_node(source_id) or not self._graph.has_node(target_id):
698
+ return None
699
+
700
+ try:
701
+ path = nx.shortest_path(self._graph, source_id, target_id)
702
+ return path
703
+ except nx.NetworkXNoPath:
704
+ return None
705
+
706
+ def impact_analysis(
707
+ self,
708
+ entity_id: str,
709
+ direction: str = 'downstream',
710
+ max_depth: int = 3,
711
+ ) -> Dict[str, Any]:
712
+ """
713
+ Analyze impact of changes to an entity.
714
+
715
+ Args:
716
+ entity_id: Entity to analyze
717
+ direction: 'downstream' (what depends on this) or 'upstream' (what this depends on)
718
+ max_depth: Maximum traversal depth
719
+
720
+ Returns:
721
+ Dict with impacted entities and paths
722
+ """
723
+ if not self._graph.has_node(entity_id):
724
+ return {'impacted': [], 'paths': []}
725
+
726
+ impacted = []
727
+ paths = []
728
+
729
+ # Use BFS for level-by-level analysis
730
+ visited = {entity_id}
731
+ queue = [(entity_id, [entity_id], 0)]
732
+
733
+ while queue:
734
+ current, path, depth = queue.pop(0)
735
+
736
+ if depth >= max_depth:
737
+ continue
738
+
739
+ # Get edges based on direction
740
+ if direction == 'downstream':
741
+ edges = self._graph.in_edges(current, data=True)
742
+ else: # upstream
743
+ edges = self._graph.out_edges(current, data=True)
744
+
745
+ for edge in edges:
746
+ if direction == 'downstream':
747
+ neighbor = edge[0]
748
+ else:
749
+ neighbor = edge[1]
750
+
751
+ if neighbor not in visited:
752
+ visited.add(neighbor)
753
+ new_path = path + [neighbor]
754
+
755
+ entity = self.get_entity(neighbor)
756
+ impacted.append({
757
+ 'entity': entity,
758
+ 'depth': depth + 1,
759
+ 'path': new_path,
760
+ })
761
+ paths.append(new_path)
762
+
763
+ queue.append((neighbor, new_path, depth + 1))
764
+
765
+ return {'impacted': impacted, 'paths': paths}
766
+
767
+ # ========== Search Operations ==========
768
+
769
+ def _tokenize(self, text: str) -> Set[str]:
770
+ """Tokenize text into searchable tokens (handles camelCase, snake_case, etc.)."""
771
+ import re
772
+ if not text:
773
+ return set()
774
+
775
+ # Split on non-alphanumeric
776
+ words = re.split(r'[^a-zA-Z0-9]+', text.lower())
777
+
778
+ # Also split camelCase
779
+ tokens = set()
780
+ for word in words:
781
+ if word:
782
+ tokens.add(word)
783
+ # Split camelCase: "ChatMessageHandler" -> ["chat", "message", "handler"]
784
+ camel_parts = re.findall(r'[a-z]+|[A-Z][a-z]*|[0-9]+', word)
785
+ tokens.update(p.lower() for p in camel_parts if p)
786
+
787
+ return tokens
788
+
789
+ def _calculate_match_score(
790
+ self,
791
+ query_tokens: Set[str],
792
+ query_lower: str,
793
+ name: str,
794
+ entity_type: str,
795
+ description: str,
796
+ file_path: str,
797
+ ) -> tuple:
798
+ """
799
+ Calculate match score for an entity.
800
+
801
+ Returns (score, match_field) tuple.
802
+ Higher scores mean better matches.
803
+ """
804
+ name_lower = name.lower()
805
+ name_tokens = self._tokenize(name)
806
+
807
+ # Exact name match (highest priority)
808
+ if query_lower == name_lower:
809
+ return (1.0, 'name_exact')
810
+
811
+ # Exact substring in name
812
+ if query_lower in name_lower:
813
+ # Prefer matches at word boundaries
814
+ score = 0.85 if name_lower.startswith(query_lower) else 0.75
815
+ return (score, 'name_contains')
816
+
817
+ # Token overlap in name (for camelCase matching)
818
+ if query_tokens and name_tokens:
819
+ overlap = len(query_tokens & name_tokens)
820
+ if overlap > 0:
821
+ # Score based on percentage of query tokens matched
822
+ score = 0.6 * (overlap / len(query_tokens))
823
+ if overlap == len(query_tokens): # All query tokens found
824
+ score = 0.7
825
+ return (score, 'name_tokens')
826
+
827
+ # Check file path
828
+ if file_path and query_lower in file_path.lower():
829
+ return (0.55, 'file_path')
830
+
831
+ # Check description
832
+ if description:
833
+ desc_lower = description.lower()
834
+ if query_lower in desc_lower:
835
+ return (0.5, 'description')
836
+ # Token match in description
837
+ desc_tokens = self._tokenize(description)
838
+ if query_tokens and desc_tokens:
839
+ overlap = len(query_tokens & desc_tokens)
840
+ if overlap > 0:
841
+ score = 0.35 * (overlap / len(query_tokens))
842
+ return (score, 'description_tokens')
843
+
844
+ # Check entity type
845
+ if query_lower in entity_type.lower():
846
+ return (0.3, 'type')
847
+
848
+ return (0.0, None)
849
+
850
    def search(
        self,
        query: str,
        top_k: int = 10,
        entity_type: Optional[str] = None,
        layer: Optional[str] = None,
        file_pattern: Optional[str] = None,
    ) -> List[Dict[str, Any]]:
        """
        Search entities with enhanced matching capabilities.

        Supports:
        - Exact and partial name matching
        - Token-based matching (handles camelCase, snake_case)
        - Description and property search
        - File path pattern matching
        - Type and layer filtering

        Args:
            query: Search query string
            top_k: Maximum results to return
            entity_type: Filter by entity type (case-insensitive)
            layer: Filter by layer (code, service, data, product, etc.)
            file_pattern: Filter by file path pattern (glob-like)

        Returns:
            List of dicts with 'entity', 'score' and 'match_field' keys,
            sorted by descending score then lowercased name.
        """
        import re

        results = []
        query_lower = query.lower().strip()
        query_tokens = self._tokenize(query)

        # Get layer types for filtering
        layer_types = set()
        if layer:
            layer_types = self.LAYER_TYPE_MAPPING.get(layer.lower(), set())

        # Compile file pattern if provided
        file_regex = None
        if file_pattern:
            # Convert glob pattern to regex
            pattern = file_pattern.replace('.', r'\.').replace('*', '.*').replace('?', '.')
            try:
                file_regex = re.compile(pattern, re.IGNORECASE)
            except re.error:
                # Invalid pattern: the file filter is silently ignored.
                pass

        for node_id, data in self._graph.nodes(data=True):
            # Type filter (case-insensitive)
            data_type = data.get('type', '').lower()
            if entity_type and data_type != entity_type.lower():
                continue

            # Layer filter: an entity passes via an explicit 'layer'
            # attribute match OR via a type that belongs to the layer.
            if layer:
                entity_layer = data.get('layer', '').lower()
                if entity_layer != layer.lower() and data_type not in layer_types:
                    continue

            # File pattern filter - prefer the 'citations' list, falling
            # back to the legacy singular 'citation' key.
            citations = data.get('citations', [])
            if not citations and 'citation' in data:
                citations = [data['citation']]

            file_paths = [c.get('file_path', '') for c in citations if isinstance(c, dict)]
            primary_file = file_paths[0] if file_paths else data.get('file_path', '')

            # NOTE: only the *primary* file is matched against the pattern;
            # entities with no resolvable file bypass the filter entirely.
            if file_regex and primary_file:
                if not file_regex.search(primary_file):
                    continue

            # Calculate match score
            name = data.get('name', '')
            description = data.get('description', '')
            if isinstance(data.get('properties'), dict):
                # Fall back to a description nested under 'properties'.
                description = description or data['properties'].get('description', '')

            score, match_field = self._calculate_match_score(
                query_tokens, query_lower, name, data_type, description, primary_file
            )

            if score > 0:
                results.append({
                    'entity': dict(data),
                    'score': score,
                    'match_field': match_field,
                })

        # Sort by score (descending), then by name
        results.sort(key=lambda x: (-x['score'], x['entity'].get('name', '').lower()))
        return results[:top_k]
943
+
944
+ def search_by_file(self, file_path_pattern: str, limit: int = 50) -> List[Dict[str, Any]]:
945
+ """
946
+ Search entities by file path pattern.
947
+
948
+ Args:
949
+ file_path_pattern: Glob-like pattern (e.g., "api/*.py", "**/chat*.py")
950
+ limit: Maximum results
951
+
952
+ Returns:
953
+ List of entities from matching files
954
+ """
955
+ import re
956
+
957
+ # Convert glob to regex
958
+ pattern = file_path_pattern.replace('.', r'\.').replace('**', '.*').replace('*', '[^/]*').replace('?', '.')
959
+ try:
960
+ file_regex = re.compile(pattern, re.IGNORECASE)
961
+ except re.error:
962
+ return []
963
+
964
+ results = []
965
+ for file_path, node_ids in self._file_index.items():
966
+ if file_regex.search(file_path):
967
+ for nid in node_ids:
968
+ entity = self.get_entity(nid)
969
+ if entity:
970
+ results.append(entity)
971
+ if len(results) >= limit:
972
+ return results
973
+
974
+ # Also check entities with file_path attribute (backup)
975
+ if not results:
976
+ for _, data in self._graph.nodes(data=True):
977
+ fp = data.get('file_path', '')
978
+ if fp and file_regex.search(fp):
979
+ results.append(dict(data))
980
+ if len(results) >= limit:
981
+ break
982
+
983
+ return results
984
+
985
+ def search_advanced(
986
+ self,
987
+ query: Optional[str] = None,
988
+ entity_types: Optional[List[str]] = None,
989
+ layers: Optional[List[str]] = None,
990
+ file_patterns: Optional[List[str]] = None,
991
+ has_relations: Optional[bool] = None,
992
+ min_citations: Optional[int] = None,
993
+ top_k: int = 20,
994
+ ) -> List[Dict[str, Any]]:
995
+ """
996
+ Advanced search with multiple filter criteria.
997
+
998
+ Args:
999
+ query: Text search query (optional)
1000
+ entity_types: List of types to include (OR logic)
1001
+ layers: List of layers to include (OR logic)
1002
+ file_patterns: List of file patterns to include (OR logic)
1003
+ has_relations: If True, only entities with relations; if False, isolated entities
1004
+ min_citations: Minimum number of citations required
1005
+ top_k: Maximum results
1006
+
1007
+ Returns:
1008
+ List of matching entities
1009
+ """
1010
+ import re
1011
+
1012
+ # Build type filter set
1013
+ type_filter = set()
1014
+ if entity_types:
1015
+ for t in entity_types:
1016
+ type_filter.add(t.lower())
1017
+ # Expand layer names to types
1018
+ if t.lower() in self.LAYER_TYPE_MAPPING:
1019
+ type_filter.update(self.LAYER_TYPE_MAPPING[t.lower()])
1020
+
1021
+ # Build layer filter set
1022
+ layer_filter = set()
1023
+ if layers:
1024
+ for l in layers:
1025
+ layer_filter.add(l.lower())
1026
+
1027
+ # Build file regex patterns
1028
+ file_regexes = []
1029
+ if file_patterns:
1030
+ for fp in file_patterns:
1031
+ pattern = fp.replace('.', r'\.').replace('**', '.*').replace('*', '[^/]*')
1032
+ try:
1033
+ file_regexes.append(re.compile(pattern, re.IGNORECASE))
1034
+ except re.error:
1035
+ pass
1036
+
1037
+ query_tokens = self._tokenize(query) if query else set()
1038
+ query_lower = query.lower().strip() if query else ''
1039
+
1040
+ results = []
1041
+
1042
+ for node_id, data in self._graph.nodes(data=True):
1043
+ data_type = data.get('type', '').lower()
1044
+ data_layer = data.get('layer', '').lower() or self.TYPE_TO_LAYER.get(data_type, '')
1045
+
1046
+ # Type filter
1047
+ if type_filter and data_type not in type_filter:
1048
+ continue
1049
+
1050
+ # Layer filter
1051
+ if layer_filter and data_layer not in layer_filter:
1052
+ continue
1053
+
1054
+ # File pattern filter
1055
+ file_path = data.get('file_path', '')
1056
+ if file_regexes:
1057
+ if not any(rx.search(file_path) for rx in file_regexes):
1058
+ continue
1059
+
1060
+ # Relations filter
1061
+ if has_relations is not None:
1062
+ has_edges = (
1063
+ self._graph.in_degree(node_id) > 0 or
1064
+ self._graph.out_degree(node_id) > 0
1065
+ )
1066
+ if has_relations and not has_edges:
1067
+ continue
1068
+ if not has_relations and has_edges:
1069
+ continue
1070
+
1071
+ # Citations filter
1072
+ if min_citations:
1073
+ citations = data.get('citations', [])
1074
+ if len(citations) < min_citations:
1075
+ continue
1076
+
1077
+ # Text search
1078
+ score = 1.0
1079
+ match_field = 'filter'
1080
+
1081
+ if query:
1082
+ name = data.get('name', '')
1083
+ description = data.get('description', '')
1084
+ if isinstance(data.get('properties'), dict):
1085
+ description = description or data['properties'].get('description', '')
1086
+
1087
+ score, match_field = self._calculate_match_score(
1088
+ query_tokens, query_lower, name, data_type, description, file_path
1089
+ )
1090
+
1091
+ if score == 0:
1092
+ continue
1093
+
1094
+ results.append({
1095
+ 'entity': dict(data),
1096
+ 'score': score,
1097
+ 'match_field': match_field,
1098
+ })
1099
+
1100
+ results.sort(key=lambda x: (-x['score'], x['entity'].get('name', '').lower()))
1101
+ return results[:top_k]
1102
+
1103
+ def get_entities_by_source(self, doc_id: str) -> List[Dict[str, Any]]:
1104
+ """Get all entities from a specific source document."""
1105
+ node_ids = self._source_doc_index.get(doc_id, set())
1106
+ return [self.get_entity(nid) for nid in node_ids if nid]
1107
+
1108
+ def get_entities_by_file(self, file_path: str) -> List[Dict[str, Any]]:
1109
+ """Get all entities with citations from a specific file."""
1110
+ # First try the file index
1111
+ node_ids = self._file_index.get(file_path, set())
1112
+ if node_ids:
1113
+ return [self.get_entity(nid) for nid in node_ids if nid]
1114
+
1115
+ # Fallback to linear scan for partial matches
1116
+ results = []
1117
+ for _, data in self._graph.nodes(data=True):
1118
+ # Check file_path attribute
1119
+ if data.get('file_path') == file_path:
1120
+ results.append(dict(data))
1121
+ continue
1122
+
1123
+ # Check citations
1124
+ for citation in data.get('citations', []):
1125
+ if isinstance(citation, dict) and citation.get('file_path') == file_path:
1126
+ results.append(dict(data))
1127
+ break
1128
+
1129
+ return results
1130
+
1131
+ # ========== Delta Operations ==========
1132
+
1133
+ def remove_entities_by_source(self, doc_id: str) -> int:
1134
+ """
1135
+ Remove all entities from a specific source document.
1136
+ Used for delta updates to clean stale entities.
1137
+
1138
+ Returns:
1139
+ Number of entities removed
1140
+ """
1141
+ node_ids = list(self._source_doc_index.get(doc_id, set()))
1142
+ for node_id in node_ids:
1143
+ self.remove_entity(node_id)
1144
+ return len(node_ids)
1145
+
1146
+ def remove_entities_by_file(self, file_path: str) -> int:
1147
+ """
1148
+ Remove all entities with citations from a specific file.
1149
+ Used for delta updates when a file changes.
1150
+
1151
+ Returns:
1152
+ Number of entities removed
1153
+ """
1154
+ to_remove = []
1155
+ for node_id, data in self._graph.nodes(data=True):
1156
+ citation = data.get('citation', {})
1157
+ if isinstance(citation, dict) and citation.get('file_path') == file_path:
1158
+ to_remove.append(node_id)
1159
+
1160
+ for node_id in to_remove:
1161
+ self.remove_entity(node_id)
1162
+
1163
+ return len(to_remove)
1164
+
1165
+ # ========== Schema Operations ==========
1166
+
1167
+ def set_schema(self, schema: Dict[str, Any]) -> None:
1168
+ """Store the discovered entity schema."""
1169
+ self._schema = schema
1170
+
1171
+ def get_schema(self) -> Optional[Dict[str, Any]]:
1172
+ """Get the discovered schema."""
1173
+ return self._schema
1174
+
1175
+ # ========== Statistics ==========
1176
+
1177
+ def get_stats(self) -> Dict[str, Any]:
1178
+ """Get graph statistics."""
1179
+ entity_types = defaultdict(int)
1180
+ relation_types = defaultdict(int)
1181
+ sources = set()
1182
+ relations_by_source = defaultdict(int)
1183
+
1184
+ for _, data in self._graph.nodes(data=True):
1185
+ if 'type' in data:
1186
+ entity_types[data['type']] += 1
1187
+ citation = data.get('citation', {})
1188
+ if isinstance(citation, dict) and citation.get('source_toolkit'):
1189
+ sources.add(citation['source_toolkit'])
1190
+
1191
+ for _, _, data in self._graph.edges(data=True):
1192
+ if 'relation_type' in data:
1193
+ relation_types[data['relation_type']] += 1
1194
+ # Track relations by source
1195
+ rel_source = data.get('source_toolkit')
1196
+ if rel_source:
1197
+ relations_by_source[rel_source] += 1
1198
+
1199
+ return {
1200
+ 'node_count': self._graph.number_of_nodes(),
1201
+ 'edge_count': self._graph.number_of_edges(),
1202
+ 'entity_types': dict(entity_types),
1203
+ 'relation_types': dict(relation_types),
1204
+ 'source_toolkits': sorted(sources),
1205
+ 'relations_by_source': dict(relations_by_source),
1206
+ 'cross_source_relations': len(self.get_cross_source_relations()),
1207
+ 'last_saved': self._metadata.get('last_saved'),
1208
+ }
1209
+
1210
+ # ========== Persistence ==========
1211
+
1212
+ def dump_to_json(self, path: str) -> None:
1213
+ """
1214
+ Export graph to JSON file using node_link format.
1215
+
1216
+ The graph file is lightweight - contains only:
1217
+ - Entity metadata and citations (no raw content)
1218
+ - Relationships
1219
+ - Schema and indices
1220
+
1221
+ Args:
1222
+ path: File path to write JSON
1223
+ """
1224
+ # Use edges="links" explicitly for NetworkX 3.5+ compatibility
1225
+ # This ensures consistent format that visualize.py and load_from_json expect
1226
+ data = nx.node_link_data(self._graph, edges="links")
1227
+
1228
+ # Add index data for persistence
1229
+ data['_indices'] = {
1230
+ 'entity_index': {k: list(v) for k, v in self._entity_index.items()},
1231
+ 'type_index': {k: list(v) for k, v in self._type_index.items()},
1232
+ 'file_index': {k: list(v) for k, v in self._file_index.items()},
1233
+ 'source_doc_index': {k: list(v) for k, v in self._source_doc_index.items()}
1234
+ }
1235
+
1236
+ # Add schema if discovered
1237
+ if self._schema:
1238
+ data['_schema'] = self._schema
1239
+
1240
+ # Add metadata
1241
+ self._metadata['last_saved'] = datetime.now().isoformat()
1242
+ self._metadata['version'] = '2.1' # Enhanced indices version
1243
+ data['_metadata'] = self._metadata
1244
+
1245
+ with open(path, 'w', encoding='utf-8') as f:
1246
+ json.dump(data, f, indent=2, default=str)
1247
+
1248
+ logger.info(f"Saved graph to {path} ({self._graph.number_of_nodes()} entities, {self._graph.number_of_edges()} relations)")
1249
+
1250
+ def load_from_json(self, path: str) -> None:
1251
+ """
1252
+ Load graph from JSON file.
1253
+
1254
+ Args:
1255
+ path: File path to read JSON from
1256
+
1257
+ Raises:
1258
+ FileNotFoundError: If file doesn't exist
1259
+ """
1260
+ with open(path, 'r', encoding='utf-8') as f:
1261
+ data = json.load(f)
1262
+
1263
+ # Restore indices
1264
+ indices = data.pop('_indices', {})
1265
+
1266
+ # Entity index - convert to set (handles both old string format and new list format)
1267
+ self._entity_index = defaultdict(set)
1268
+ for k, v in indices.get('entity_index', {}).items():
1269
+ if isinstance(v, list):
1270
+ self._entity_index[k] = set(v)
1271
+ elif isinstance(v, str):
1272
+ self._entity_index[k] = {v} # Legacy format
1273
+
1274
+ # Type index
1275
+ self._type_index = defaultdict(set)
1276
+ for k, v in indices.get('type_index', {}).items():
1277
+ self._type_index[k] = set(v) if isinstance(v, list) else set()
1278
+
1279
+ # File index
1280
+ self._file_index = defaultdict(set)
1281
+ for k, v in indices.get('file_index', {}).items():
1282
+ self._file_index[k] = set(v) if isinstance(v, list) else set()
1283
+
1284
+ # Source doc index
1285
+ self._source_doc_index = defaultdict(set)
1286
+ for k, v in indices.get('source_doc_index', {}).items():
1287
+ self._source_doc_index[k] = set(v) if isinstance(v, list) else set()
1288
+
1289
+ # Restore schema
1290
+ self._schema = data.pop('_schema', None)
1291
+
1292
+ # Restore metadata
1293
+ self._metadata = data.pop('_metadata', {})
1294
+
1295
+ # Restore graph - handle both "links" and "edges" keys for compatibility
1296
+ # NetworkX 3.5+ defaults to "edges", but we write "links" for visualization compatibility
1297
+ if 'edges' in data and 'links' not in data:
1298
+ # Data uses new NetworkX 3.5+ default "edges" key - rename to "links" for node_link_graph
1299
+ data['links'] = data.pop('edges')
1300
+
1301
+ self._graph = nx.node_link_graph(data, edges="links")
1302
+
1303
+ # Rebuild missing indices if needed (for legacy graphs)
1304
+ if not self._type_index or not self._file_index:
1305
+ self._rebuild_indices()
1306
+
1307
+ logger.info(f"Loaded graph from {path} ({self._graph.number_of_nodes()} entities, {self._graph.number_of_edges()} relations)")
1308
+
1309
+ def _rebuild_indices(self) -> None:
1310
+ """Rebuild all indices from graph data (for legacy graph files)."""
1311
+ self._entity_index = defaultdict(set)
1312
+ self._type_index = defaultdict(set)
1313
+ self._file_index = defaultdict(set)
1314
+ self._source_doc_index = defaultdict(set)
1315
+
1316
+ for node_id, data in self._graph.nodes(data=True):
1317
+ # Name index
1318
+ name = data.get('name', '').lower()
1319
+ if name:
1320
+ self._entity_index[name].add(node_id)
1321
+
1322
+ # Type index
1323
+ entity_type = data.get('type', '').lower()
1324
+ if entity_type:
1325
+ self._type_index[entity_type].add(node_id)
1326
+
1327
+ # File index (from file_path attribute)
1328
+ file_path = data.get('file_path', '')
1329
+ if file_path:
1330
+ self._file_index[file_path].add(node_id)
1331
+
1332
+ # Also index from citations
1333
+ for citation in data.get('citations', []):
1334
+ if isinstance(citation, dict):
1335
+ fp = citation.get('file_path', '')
1336
+ if fp:
1337
+ self._file_index[fp].add(node_id)
1338
+ doc_id = citation.get('doc_id', '')
1339
+ if doc_id:
1340
+ self._source_doc_index[doc_id].add(node_id)
1341
+
1342
+ logger.info(f"Rebuilt indices: {len(self._entity_index)} names, {len(self._type_index)} types, {len(self._file_index)} files")
1343
+
1344
+ def clear(self) -> None:
1345
+ """Clear all data from the graph."""
1346
+ self._graph.clear()
1347
+ self._entity_index.clear()
1348
+ self._type_index.clear()
1349
+ self._file_index.clear()
1350
+ self._source_doc_index.clear()
1351
+ self._schema = None
1352
+ self._metadata = {}
1353
+
1354
+ # ========== Subgraph Operations ==========
1355
+
1356
+ def get_subgraph(self, node_ids: List[str]) -> 'KnowledgeGraph':
1357
+ """
1358
+ Get a subgraph containing only specified nodes and their edges.
1359
+
1360
+ Args:
1361
+ node_ids: List of node IDs to include
1362
+
1363
+ Returns:
1364
+ New KnowledgeGraph instance with subgraph
1365
+ """
1366
+ subgraph = KnowledgeGraph()
1367
+ subgraph._graph = self._graph.subgraph(node_ids).copy()
1368
+
1369
+ # Rebuild indices for subgraph
1370
+ for node_id, data in subgraph._graph.nodes(data=True):
1371
+ name = data.get('name', '').lower()
1372
+ if name:
1373
+ subgraph._entity_index[name] = node_id
1374
+
1375
+ citation = data.get('citation', {})
1376
+ if isinstance(citation, dict):
1377
+ doc_id = citation.get('doc_id')
1378
+ if doc_id:
1379
+ subgraph._source_doc_index[doc_id].add(node_id)
1380
+
1381
+ return subgraph
1382
+
1383
+ def get_connected_component(self, node_id: str) -> List[str]:
1384
+ """
1385
+ Get all nodes in the same connected component as the given node.
1386
+
1387
+ Args:
1388
+ node_id: Starting node ID
1389
+
1390
+ Returns:
1391
+ List of node IDs in the connected component
1392
+ """
1393
+ if not self._graph.has_node(node_id):
1394
+ return []
1395
+
1396
+ # For directed graphs, use weakly connected components
1397
+ undirected = self._graph.to_undirected()
1398
+ component = nx.node_connected_component(undirected, node_id)
1399
+ return list(component)
1400
+
1401
+ # ========== Citation Helpers ==========
1402
+
1403
+ def get_citation(self, entity_id: str) -> Optional[Citation]:
1404
+ """Get citation for an entity."""
1405
+ entity = self.get_entity(entity_id)
1406
+ if entity and 'citation' in entity:
1407
+ return Citation.from_dict(entity['citation'])
1408
+ return None
1409
+
1410
+ def get_citations_for_query(self, query: str, top_k: int = 5) -> List[Citation]:
1411
+ """
1412
+ Get citations for entities matching a query.
1413
+
1414
+ Useful for the LLM to retrieve source content on-demand.
1415
+
1416
+ Args:
1417
+ query: Search query
1418
+ top_k: Maximum citations to return
1419
+
1420
+ Returns:
1421
+ List of Citation objects
1422
+ """
1423
+ results = self.search(query, top_k=top_k)
1424
+ citations = []
1425
+
1426
+ for result in results:
1427
+ entity = result['entity']
1428
+ if 'citation' in entity:
1429
+ citations.append(Citation.from_dict(entity['citation']))
1430
+
1431
+ return citations
1432
+
1433
+ def export_citations_summary(self) -> Dict[str, List[Dict[str, Any]]]:
1434
+ """
1435
+ Export a summary of all citations grouped by file.
1436
+
1437
+ Returns:
1438
+ Dict mapping file paths to lists of entity summaries
1439
+ """
1440
+ by_file: Dict[str, List[Dict[str, Any]]] = defaultdict(list)
1441
+
1442
+ for node_id, data in self._graph.nodes(data=True):
1443
+ citation = data.get('citation', {})
1444
+ if isinstance(citation, dict) and citation.get('file_path'):
1445
+ by_file[citation['file_path']].append({
1446
+ 'entity_id': node_id,
1447
+ 'name': data.get('name'),
1448
+ 'type': data.get('type'),
1449
+ 'line_start': citation.get('line_start'),
1450
+ 'line_end': citation.get('line_end'),
1451
+ })
1452
+
1453
+ # Sort entities within each file by line number
1454
+ for file_path in by_file:
1455
+ by_file[file_path].sort(key=lambda x: x.get('line_start') or 0)
1456
+
1457
+ return dict(by_file)