alita-sdk 0.3.379__py3-none-any.whl → 0.3.627__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (278)
  1. alita_sdk/cli/__init__.py +10 -0
  2. alita_sdk/cli/__main__.py +17 -0
  3. alita_sdk/cli/agent/__init__.py +5 -0
  4. alita_sdk/cli/agent/default.py +258 -0
  5. alita_sdk/cli/agent_executor.py +156 -0
  6. alita_sdk/cli/agent_loader.py +245 -0
  7. alita_sdk/cli/agent_ui.py +228 -0
  8. alita_sdk/cli/agents.py +3113 -0
  9. alita_sdk/cli/callbacks.py +647 -0
  10. alita_sdk/cli/cli.py +168 -0
  11. alita_sdk/cli/config.py +306 -0
  12. alita_sdk/cli/context/__init__.py +30 -0
  13. alita_sdk/cli/context/cleanup.py +198 -0
  14. alita_sdk/cli/context/manager.py +731 -0
  15. alita_sdk/cli/context/message.py +285 -0
  16. alita_sdk/cli/context/strategies.py +289 -0
  17. alita_sdk/cli/context/token_estimation.py +127 -0
  18. alita_sdk/cli/formatting.py +182 -0
  19. alita_sdk/cli/input_handler.py +419 -0
  20. alita_sdk/cli/inventory.py +1073 -0
  21. alita_sdk/cli/mcp_loader.py +315 -0
  22. alita_sdk/cli/testcases/__init__.py +94 -0
  23. alita_sdk/cli/testcases/data_generation.py +119 -0
  24. alita_sdk/cli/testcases/discovery.py +96 -0
  25. alita_sdk/cli/testcases/executor.py +84 -0
  26. alita_sdk/cli/testcases/logger.py +85 -0
  27. alita_sdk/cli/testcases/parser.py +172 -0
  28. alita_sdk/cli/testcases/prompts.py +91 -0
  29. alita_sdk/cli/testcases/reporting.py +125 -0
  30. alita_sdk/cli/testcases/setup.py +108 -0
  31. alita_sdk/cli/testcases/test_runner.py +282 -0
  32. alita_sdk/cli/testcases/utils.py +39 -0
  33. alita_sdk/cli/testcases/validation.py +90 -0
  34. alita_sdk/cli/testcases/workflow.py +196 -0
  35. alita_sdk/cli/toolkit.py +327 -0
  36. alita_sdk/cli/toolkit_loader.py +85 -0
  37. alita_sdk/cli/tools/__init__.py +43 -0
  38. alita_sdk/cli/tools/approval.py +224 -0
  39. alita_sdk/cli/tools/filesystem.py +1751 -0
  40. alita_sdk/cli/tools/planning.py +389 -0
  41. alita_sdk/cli/tools/terminal.py +414 -0
  42. alita_sdk/community/__init__.py +72 -12
  43. alita_sdk/community/inventory/__init__.py +236 -0
  44. alita_sdk/community/inventory/config.py +257 -0
  45. alita_sdk/community/inventory/enrichment.py +2137 -0
  46. alita_sdk/community/inventory/extractors.py +1469 -0
  47. alita_sdk/community/inventory/ingestion.py +3172 -0
  48. alita_sdk/community/inventory/knowledge_graph.py +1457 -0
  49. alita_sdk/community/inventory/parsers/__init__.py +218 -0
  50. alita_sdk/community/inventory/parsers/base.py +295 -0
  51. alita_sdk/community/inventory/parsers/csharp_parser.py +907 -0
  52. alita_sdk/community/inventory/parsers/go_parser.py +851 -0
  53. alita_sdk/community/inventory/parsers/html_parser.py +389 -0
  54. alita_sdk/community/inventory/parsers/java_parser.py +593 -0
  55. alita_sdk/community/inventory/parsers/javascript_parser.py +629 -0
  56. alita_sdk/community/inventory/parsers/kotlin_parser.py +768 -0
  57. alita_sdk/community/inventory/parsers/markdown_parser.py +362 -0
  58. alita_sdk/community/inventory/parsers/python_parser.py +604 -0
  59. alita_sdk/community/inventory/parsers/rust_parser.py +858 -0
  60. alita_sdk/community/inventory/parsers/swift_parser.py +832 -0
  61. alita_sdk/community/inventory/parsers/text_parser.py +322 -0
  62. alita_sdk/community/inventory/parsers/yaml_parser.py +370 -0
  63. alita_sdk/community/inventory/patterns/__init__.py +61 -0
  64. alita_sdk/community/inventory/patterns/ast_adapter.py +380 -0
  65. alita_sdk/community/inventory/patterns/loader.py +348 -0
  66. alita_sdk/community/inventory/patterns/registry.py +198 -0
  67. alita_sdk/community/inventory/presets.py +535 -0
  68. alita_sdk/community/inventory/retrieval.py +1403 -0
  69. alita_sdk/community/inventory/toolkit.py +173 -0
  70. alita_sdk/community/inventory/toolkit_utils.py +176 -0
  71. alita_sdk/community/inventory/visualize.py +1370 -0
  72. alita_sdk/configurations/__init__.py +1 -1
  73. alita_sdk/configurations/ado.py +141 -20
  74. alita_sdk/configurations/bitbucket.py +94 -2
  75. alita_sdk/configurations/confluence.py +130 -1
  76. alita_sdk/configurations/figma.py +76 -0
  77. alita_sdk/configurations/gitlab.py +91 -0
  78. alita_sdk/configurations/jira.py +103 -0
  79. alita_sdk/configurations/openapi.py +329 -0
  80. alita_sdk/configurations/qtest.py +72 -1
  81. alita_sdk/configurations/report_portal.py +96 -0
  82. alita_sdk/configurations/sharepoint.py +148 -0
  83. alita_sdk/configurations/testio.py +83 -0
  84. alita_sdk/configurations/testrail.py +88 -0
  85. alita_sdk/configurations/xray.py +93 -0
  86. alita_sdk/configurations/zephyr_enterprise.py +93 -0
  87. alita_sdk/configurations/zephyr_essential.py +75 -0
  88. alita_sdk/runtime/clients/artifact.py +3 -3
  89. alita_sdk/runtime/clients/client.py +388 -46
  90. alita_sdk/runtime/clients/mcp_discovery.py +342 -0
  91. alita_sdk/runtime/clients/mcp_manager.py +262 -0
  92. alita_sdk/runtime/clients/sandbox_client.py +8 -21
  93. alita_sdk/runtime/langchain/_constants_bkup.py +1318 -0
  94. alita_sdk/runtime/langchain/assistant.py +157 -39
  95. alita_sdk/runtime/langchain/constants.py +647 -1
  96. alita_sdk/runtime/langchain/document_loaders/AlitaDocxMammothLoader.py +315 -3
  97. alita_sdk/runtime/langchain/document_loaders/AlitaExcelLoader.py +103 -60
  98. alita_sdk/runtime/langchain/document_loaders/AlitaJSONLinesLoader.py +77 -0
  99. alita_sdk/runtime/langchain/document_loaders/AlitaJSONLoader.py +10 -4
  100. alita_sdk/runtime/langchain/document_loaders/AlitaPowerPointLoader.py +226 -7
  101. alita_sdk/runtime/langchain/document_loaders/AlitaTextLoader.py +5 -2
  102. alita_sdk/runtime/langchain/document_loaders/constants.py +40 -19
  103. alita_sdk/runtime/langchain/langraph_agent.py +405 -84
  104. alita_sdk/runtime/langchain/utils.py +106 -7
  105. alita_sdk/runtime/llms/preloaded.py +2 -6
  106. alita_sdk/runtime/models/mcp_models.py +61 -0
  107. alita_sdk/runtime/skills/__init__.py +91 -0
  108. alita_sdk/runtime/skills/callbacks.py +498 -0
  109. alita_sdk/runtime/skills/discovery.py +540 -0
  110. alita_sdk/runtime/skills/executor.py +610 -0
  111. alita_sdk/runtime/skills/input_builder.py +371 -0
  112. alita_sdk/runtime/skills/models.py +330 -0
  113. alita_sdk/runtime/skills/registry.py +355 -0
  114. alita_sdk/runtime/skills/skill_runner.py +330 -0
  115. alita_sdk/runtime/toolkits/__init__.py +31 -0
  116. alita_sdk/runtime/toolkits/application.py +29 -10
  117. alita_sdk/runtime/toolkits/artifact.py +20 -11
  118. alita_sdk/runtime/toolkits/datasource.py +13 -6
  119. alita_sdk/runtime/toolkits/mcp.py +783 -0
  120. alita_sdk/runtime/toolkits/mcp_config.py +1048 -0
  121. alita_sdk/runtime/toolkits/planning.py +178 -0
  122. alita_sdk/runtime/toolkits/skill_router.py +238 -0
  123. alita_sdk/runtime/toolkits/subgraph.py +251 -6
  124. alita_sdk/runtime/toolkits/tools.py +356 -69
  125. alita_sdk/runtime/toolkits/vectorstore.py +11 -5
  126. alita_sdk/runtime/tools/__init__.py +10 -3
  127. alita_sdk/runtime/tools/application.py +27 -6
  128. alita_sdk/runtime/tools/artifact.py +511 -28
  129. alita_sdk/runtime/tools/data_analysis.py +183 -0
  130. alita_sdk/runtime/tools/function.py +67 -35
  131. alita_sdk/runtime/tools/graph.py +10 -4
  132. alita_sdk/runtime/tools/image_generation.py +148 -46
  133. alita_sdk/runtime/tools/llm.py +1003 -128
  134. alita_sdk/runtime/tools/loop.py +3 -1
  135. alita_sdk/runtime/tools/loop_output.py +3 -1
  136. alita_sdk/runtime/tools/mcp_inspect_tool.py +284 -0
  137. alita_sdk/runtime/tools/mcp_remote_tool.py +181 -0
  138. alita_sdk/runtime/tools/mcp_server_tool.py +8 -5
  139. alita_sdk/runtime/tools/planning/__init__.py +36 -0
  140. alita_sdk/runtime/tools/planning/models.py +246 -0
  141. alita_sdk/runtime/tools/planning/wrapper.py +607 -0
  142. alita_sdk/runtime/tools/router.py +2 -4
  143. alita_sdk/runtime/tools/sandbox.py +65 -48
  144. alita_sdk/runtime/tools/skill_router.py +776 -0
  145. alita_sdk/runtime/tools/tool.py +3 -1
  146. alita_sdk/runtime/tools/vectorstore.py +9 -3
  147. alita_sdk/runtime/tools/vectorstore_base.py +70 -14
  148. alita_sdk/runtime/utils/AlitaCallback.py +137 -21
  149. alita_sdk/runtime/utils/constants.py +5 -1
  150. alita_sdk/runtime/utils/mcp_client.py +492 -0
  151. alita_sdk/runtime/utils/mcp_oauth.py +361 -0
  152. alita_sdk/runtime/utils/mcp_sse_client.py +434 -0
  153. alita_sdk/runtime/utils/mcp_tools_discovery.py +124 -0
  154. alita_sdk/runtime/utils/serialization.py +155 -0
  155. alita_sdk/runtime/utils/streamlit.py +40 -13
  156. alita_sdk/runtime/utils/toolkit_utils.py +30 -9
  157. alita_sdk/runtime/utils/utils.py +36 -0
  158. alita_sdk/tools/__init__.py +134 -35
  159. alita_sdk/tools/ado/repos/__init__.py +51 -32
  160. alita_sdk/tools/ado/repos/repos_wrapper.py +148 -89
  161. alita_sdk/tools/ado/test_plan/__init__.py +25 -9
  162. alita_sdk/tools/ado/test_plan/test_plan_wrapper.py +23 -1
  163. alita_sdk/tools/ado/utils.py +1 -18
  164. alita_sdk/tools/ado/wiki/__init__.py +25 -12
  165. alita_sdk/tools/ado/wiki/ado_wrapper.py +291 -22
  166. alita_sdk/tools/ado/work_item/__init__.py +26 -13
  167. alita_sdk/tools/ado/work_item/ado_wrapper.py +73 -11
  168. alita_sdk/tools/advanced_jira_mining/__init__.py +11 -8
  169. alita_sdk/tools/aws/delta_lake/__init__.py +13 -9
  170. alita_sdk/tools/aws/delta_lake/tool.py +5 -1
  171. alita_sdk/tools/azure_ai/search/__init__.py +11 -8
  172. alita_sdk/tools/azure_ai/search/api_wrapper.py +1 -1
  173. alita_sdk/tools/base/tool.py +5 -1
  174. alita_sdk/tools/base_indexer_toolkit.py +271 -84
  175. alita_sdk/tools/bitbucket/__init__.py +17 -11
  176. alita_sdk/tools/bitbucket/api_wrapper.py +59 -11
  177. alita_sdk/tools/bitbucket/cloud_api_wrapper.py +49 -35
  178. alita_sdk/tools/browser/__init__.py +5 -4
  179. alita_sdk/tools/carrier/__init__.py +5 -6
  180. alita_sdk/tools/carrier/backend_reports_tool.py +6 -6
  181. alita_sdk/tools/carrier/run_ui_test_tool.py +6 -6
  182. alita_sdk/tools/carrier/ui_reports_tool.py +5 -5
  183. alita_sdk/tools/chunkers/__init__.py +3 -1
  184. alita_sdk/tools/chunkers/code/treesitter/treesitter.py +37 -13
  185. alita_sdk/tools/chunkers/sematic/json_chunker.py +1 -0
  186. alita_sdk/tools/chunkers/sematic/markdown_chunker.py +97 -6
  187. alita_sdk/tools/chunkers/sematic/proposal_chunker.py +1 -1
  188. alita_sdk/tools/chunkers/universal_chunker.py +270 -0
  189. alita_sdk/tools/cloud/aws/__init__.py +10 -7
  190. alita_sdk/tools/cloud/azure/__init__.py +10 -7
  191. alita_sdk/tools/cloud/gcp/__init__.py +10 -7
  192. alita_sdk/tools/cloud/k8s/__init__.py +10 -7
  193. alita_sdk/tools/code/linter/__init__.py +10 -8
  194. alita_sdk/tools/code/loaders/codesearcher.py +3 -2
  195. alita_sdk/tools/code/sonar/__init__.py +11 -8
  196. alita_sdk/tools/code_indexer_toolkit.py +82 -22
  197. alita_sdk/tools/confluence/__init__.py +22 -16
  198. alita_sdk/tools/confluence/api_wrapper.py +107 -30
  199. alita_sdk/tools/confluence/loader.py +14 -2
  200. alita_sdk/tools/custom_open_api/__init__.py +12 -5
  201. alita_sdk/tools/elastic/__init__.py +11 -8
  202. alita_sdk/tools/elitea_base.py +493 -30
  203. alita_sdk/tools/figma/__init__.py +58 -11
  204. alita_sdk/tools/figma/api_wrapper.py +1235 -143
  205. alita_sdk/tools/figma/figma_client.py +73 -0
  206. alita_sdk/tools/figma/toon_tools.py +2748 -0
  207. alita_sdk/tools/github/__init__.py +14 -15
  208. alita_sdk/tools/github/github_client.py +224 -100
  209. alita_sdk/tools/github/graphql_client_wrapper.py +119 -33
  210. alita_sdk/tools/github/schemas.py +14 -5
  211. alita_sdk/tools/github/tool.py +5 -1
  212. alita_sdk/tools/github/tool_prompts.py +9 -22
  213. alita_sdk/tools/gitlab/__init__.py +16 -11
  214. alita_sdk/tools/gitlab/api_wrapper.py +218 -48
  215. alita_sdk/tools/gitlab_org/__init__.py +10 -9
  216. alita_sdk/tools/gitlab_org/api_wrapper.py +63 -64
  217. alita_sdk/tools/google/bigquery/__init__.py +13 -12
  218. alita_sdk/tools/google/bigquery/tool.py +5 -1
  219. alita_sdk/tools/google_places/__init__.py +11 -8
  220. alita_sdk/tools/google_places/api_wrapper.py +1 -1
  221. alita_sdk/tools/jira/__init__.py +17 -10
  222. alita_sdk/tools/jira/api_wrapper.py +92 -41
  223. alita_sdk/tools/keycloak/__init__.py +11 -8
  224. alita_sdk/tools/localgit/__init__.py +9 -3
  225. alita_sdk/tools/localgit/local_git.py +62 -54
  226. alita_sdk/tools/localgit/tool.py +5 -1
  227. alita_sdk/tools/memory/__init__.py +12 -4
  228. alita_sdk/tools/non_code_indexer_toolkit.py +1 -0
  229. alita_sdk/tools/ocr/__init__.py +11 -8
  230. alita_sdk/tools/openapi/__init__.py +491 -106
  231. alita_sdk/tools/openapi/api_wrapper.py +1368 -0
  232. alita_sdk/tools/openapi/tool.py +20 -0
  233. alita_sdk/tools/pandas/__init__.py +20 -12
  234. alita_sdk/tools/pandas/api_wrapper.py +38 -25
  235. alita_sdk/tools/pandas/dataframe/generator/base.py +3 -1
  236. alita_sdk/tools/postman/__init__.py +10 -9
  237. alita_sdk/tools/pptx/__init__.py +11 -10
  238. alita_sdk/tools/pptx/pptx_wrapper.py +1 -1
  239. alita_sdk/tools/qtest/__init__.py +31 -11
  240. alita_sdk/tools/qtest/api_wrapper.py +2135 -86
  241. alita_sdk/tools/rally/__init__.py +10 -9
  242. alita_sdk/tools/rally/api_wrapper.py +1 -1
  243. alita_sdk/tools/report_portal/__init__.py +12 -8
  244. alita_sdk/tools/salesforce/__init__.py +10 -8
  245. alita_sdk/tools/servicenow/__init__.py +17 -15
  246. alita_sdk/tools/servicenow/api_wrapper.py +1 -1
  247. alita_sdk/tools/sharepoint/__init__.py +10 -7
  248. alita_sdk/tools/sharepoint/api_wrapper.py +129 -38
  249. alita_sdk/tools/sharepoint/authorization_helper.py +191 -1
  250. alita_sdk/tools/sharepoint/utils.py +8 -2
  251. alita_sdk/tools/slack/__init__.py +10 -7
  252. alita_sdk/tools/slack/api_wrapper.py +2 -2
  253. alita_sdk/tools/sql/__init__.py +12 -9
  254. alita_sdk/tools/testio/__init__.py +10 -7
  255. alita_sdk/tools/testrail/__init__.py +11 -10
  256. alita_sdk/tools/testrail/api_wrapper.py +1 -1
  257. alita_sdk/tools/utils/__init__.py +9 -4
  258. alita_sdk/tools/utils/content_parser.py +103 -18
  259. alita_sdk/tools/utils/text_operations.py +410 -0
  260. alita_sdk/tools/utils/tool_prompts.py +79 -0
  261. alita_sdk/tools/vector_adapters/VectorStoreAdapter.py +30 -13
  262. alita_sdk/tools/xray/__init__.py +13 -9
  263. alita_sdk/tools/yagmail/__init__.py +9 -3
  264. alita_sdk/tools/zephyr/__init__.py +10 -7
  265. alita_sdk/tools/zephyr_enterprise/__init__.py +11 -7
  266. alita_sdk/tools/zephyr_essential/__init__.py +10 -7
  267. alita_sdk/tools/zephyr_essential/api_wrapper.py +30 -13
  268. alita_sdk/tools/zephyr_essential/client.py +2 -2
  269. alita_sdk/tools/zephyr_scale/__init__.py +11 -8
  270. alita_sdk/tools/zephyr_scale/api_wrapper.py +2 -2
  271. alita_sdk/tools/zephyr_squad/__init__.py +10 -7
  272. {alita_sdk-0.3.379.dist-info → alita_sdk-0.3.627.dist-info}/METADATA +154 -8
  273. alita_sdk-0.3.627.dist-info/RECORD +468 -0
  274. alita_sdk-0.3.627.dist-info/entry_points.txt +2 -0
  275. alita_sdk-0.3.379.dist-info/RECORD +0 -360
  276. {alita_sdk-0.3.379.dist-info → alita_sdk-0.3.627.dist-info}/WHEEL +0 -0
  277. {alita_sdk-0.3.379.dist-info → alita_sdk-0.3.627.dist-info}/licenses/LICENSE +0 -0
  278. {alita_sdk-0.3.379.dist-info → alita_sdk-0.3.627.dist-info}/top_level.txt +0 -0
alita_sdk/community/inventory/ingestion.py
@@ -0,0 +1,3172 @@
+ """
+ Inventory Ingestion Pipeline.
+
+ This module provides a workflow/pipeline for building and updating knowledge graphs
+ from source code repositories. It is NOT a toolkit - it's a defined process that:
+
+ 1. Connects to source toolkits (GitHub, ADO, LocalGit, etc.)
+ 2. Fetches documents via their loader() methods
+ 3. Extracts entities using LLM
+ 4. Extracts relations between entities
+ 5. Tracks source information for both entities (via citations) and relations
+ 6. Persists the graph to JSON
+
+ The result is a graph dump that can be queried by the RetrievalToolkit.
+
+ Multi-Source Support:
+ - Entities from different sources are merged when they have the same (type, name)
+ - Each entity maintains citations from all sources that reference it
+ - Relations are tagged with source_toolkit to track which source created them
+ - Cross-source relations are automatically tracked (e.g., Jira ticket -> GitHub PR)
+ - Query relations by source: graph.get_relations_by_source('github')
+ - Find cross-source relations: graph.get_cross_source_relations()
+
+ Usage:
+     # With full configuration
+     from alita_sdk.community.inventory import IngestionConfig, IngestionPipeline
+
+     config = IngestionConfig.from_env()  # or .from_yaml("config.yml")
+     pipeline = IngestionPipeline.from_config(config)
+     pipeline.register_toolkit('github', github_toolkit)
+     result = pipeline.run(source='github', branch='main')
+
+     # Or simpler approach
+     pipeline = IngestionPipeline(
+         llm=llm,
+         graph_path="/path/to/graph.json",
+         source_toolkits={'github': github_toolkit}
+     )
+     result = pipeline.run(source='github')
+
+     # Or delta update for changed files
+     result = pipeline.delta_update(
+         source='github',
+         file_paths=['src/app.py', 'src/utils.py']
+     )
+ """
+
+ import logging
+ import hashlib
+ import re
+ import time
+ import asyncio
+ from concurrent.futures import ThreadPoolExecutor, as_completed
+ from pathlib import Path
+ from typing import Any, Optional, List, Dict, Generator, Callable, TYPE_CHECKING, Tuple
+ from datetime import datetime
+
+ from pydantic import BaseModel, Field, PrivateAttr
+ from langchain_core.documents import Document
+
+ from .knowledge_graph import KnowledgeGraph, Citation
+ from .extractors import (
+     DocumentClassifier,
+     EntitySchemaDiscoverer,
+     EntityExtractor,
+     RelationExtractor,
+     FactExtractor,
+     ENTITY_TAXONOMY,
+     RELATIONSHIP_TAXONOMY,
+ )
+ from .parsers import (
+     parse_file as parser_parse_file,
+     get_parser_for_file,
+     ParseResult,
+     Symbol,
+     Relationship as ParserRelationship,
+     SymbolType,
+     RelationshipType as ParserRelationshipType,
+ )
+
+ if TYPE_CHECKING:
+     from .config import GuardrailsConfig, IngestionConfig
+
+ logger = logging.getLogger(__name__)
+
+ # ============================================================================
+ # PARSER-BASED EXTRACTION (AST/Regex - No LLM)
+ # ============================================================================
+
+ # Symbol types that parsers extract (skip LLM for these)
+ PARSER_EXTRACTED_TYPES = {
+     SymbolType.CLASS, SymbolType.FUNCTION, SymbolType.METHOD,
+     SymbolType.MODULE, SymbolType.INTERFACE, SymbolType.CONSTANT,
+     SymbolType.VARIABLE, SymbolType.IMPORT, SymbolType.PROPERTY,
+     SymbolType.FIELD, SymbolType.ENUM, SymbolType.TYPE_ALIAS,
+     SymbolType.DECORATOR, SymbolType.NAMESPACE, SymbolType.PARAMETER,
+ }
+
+ # Map parser SymbolType to entity type strings
+ SYMBOL_TYPE_TO_ENTITY_TYPE = {
+     SymbolType.CLASS: "class",
+     SymbolType.FUNCTION: "function",
+     SymbolType.METHOD: "method",
+     SymbolType.MODULE: "module",
+     SymbolType.INTERFACE: "interface",
+     SymbolType.CONSTANT: "constant",
+     SymbolType.VARIABLE: "variable",
+     SymbolType.IMPORT: "import",
+     SymbolType.PROPERTY: "property",
+     SymbolType.FIELD: "field",
+     SymbolType.ENUM: "enum",
+     SymbolType.TYPE_ALIAS: "type_alias",
+     SymbolType.DECORATOR: "decorator",
+     SymbolType.NAMESPACE: "namespace",
+     SymbolType.PARAMETER: "parameter",
+ }
+
+ # Map parser RelationshipType to relation type strings
+ PARSER_REL_TYPE_TO_STRING = {
+     ParserRelationshipType.IMPORTS: "imports",
+     ParserRelationshipType.EXPORTS: "exports",
+     ParserRelationshipType.CALLS: "calls",
+     ParserRelationshipType.RETURNS: "returns",
+     ParserRelationshipType.INHERITANCE: "extends",
+     ParserRelationshipType.IMPLEMENTATION: "implements",
+     ParserRelationshipType.COMPOSITION: "contains",
+     ParserRelationshipType.AGGREGATION: "uses",
+     ParserRelationshipType.DEFINES: "defines",
+     ParserRelationshipType.CONTAINS: "contains",
+     ParserRelationshipType.DECORATES: "decorates",
+     ParserRelationshipType.ANNOTATES: "annotates",
+     ParserRelationshipType.REFERENCES: "references",
+     ParserRelationshipType.USES: "uses",
+ }
+
+
+ def _is_code_file(file_path: str) -> bool:
+     """Check if file is a code file that parsers can handle."""
+     code_extensions = {
+         '.py', '.pyx', '.pyi',  # Python
+         '.js', '.jsx', '.ts', '.tsx', '.mjs', '.cjs',  # JavaScript/TypeScript
+         '.java',  # Java
+         '.kt', '.kts',  # Kotlin
+         '.cs',  # C#
+         '.rs',  # Rust
+         '.swift',  # Swift
+         '.go',  # Go
+     }
+     ext = Path(file_path).suffix.lower()
+     return ext in code_extensions
+
+
+ def _is_code_like_file(file_path: str) -> bool:
+     """
+     Check if file looks like code but may not have a specific parser.
+
+     This includes:
+     - Supported code files (with parsers)
+     - Unsupported code files (no parser - use hybrid fallback)
+     - Script files that contain code structure
+     """
+     # All supported code files
+     supported_extensions = {
+         '.py', '.pyx', '.pyi',  # Python
+         '.js', '.jsx', '.ts', '.tsx', '.mjs', '.cjs',  # JavaScript/TypeScript
+         '.java',  # Java
+         '.kt', '.kts',  # Kotlin
+         '.cs',  # C#
+         '.rs',  # Rust
+         '.swift',  # Swift
+         '.go',  # Go
+     }
+
+     # Additional code-like files that need hybrid fallback
+     unsupported_code_extensions = {
+         # Scripting languages
+         '.lua', '.pl', '.pm', '.perl', '.rb', '.php',
+         '.sh', '.bash', '.zsh', '.fish', '.ps1', '.bat', '.cmd',
+         # Other programming languages
+         '.scala', '.clj', '.cljs', '.ex', '.exs', '.erl', '.hrl',
+         '.hs', '.ml', '.fs', '.fsx', '.r', '.R', '.jl',
+         '.dart', '.nim', '.v', '.zig', '.cr', '.d',
+         '.c', '.cpp', '.cc', '.cxx', '.h', '.hpp', '.hxx',
+         '.m', '.mm',  # Objective-C
+         '.groovy', '.gradle',
+         # Data/Config that may contain code
+         '.cmake', '.makefile', '.mk',
+     }
+
+     ext = Path(file_path).suffix.lower()
+
+     # Also check for Makefile without extension
+     file_name = Path(file_path).name.lower()
+     if file_name in {'makefile', 'gnumakefile'}:
+         return True
+
+     return ext in supported_extensions or ext in unsupported_code_extensions
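The two helpers above drive the parser-first routing: files matched by _is_code_file get a dedicated AST/regex parser, while _is_code_like_file additionally catches scripts and languages without a parser so they take the TextParser-plus-LLM hybrid path. A minimal sketch of the expected behaviour, assuming the module path from this wheel (the helpers are module-private and imported here only for illustration):

    from alita_sdk.community.inventory.ingestion import _is_code_file, _is_code_like_file

    assert _is_code_file("src/service.go") is True           # dedicated Go parser ships in this release
    assert _is_code_file("scripts/deploy.sh") is False       # no shell parser
    assert _is_code_like_file("scripts/deploy.sh") is True   # still code-like -> hybrid fallback
    assert _is_code_like_file("build/Makefile") is True      # special-cased by file name
    assert _is_code_like_file("docs/notes.txt") is False     # plain prose, LLM-only path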
+
+
+ def _symbol_to_entity(
+     symbol: Symbol,
+     source_toolkit: str,
+     generate_id_func: Callable[[str, str, str], str]
+ ) -> Dict[str, Any]:
+     """
+     Convert a parser Symbol to an entity dict.
+
+     Args:
+         symbol: Parsed symbol from code parser
+         source_toolkit: Source toolkit name
+         generate_id_func: Function to generate entity ID
+
+     Returns:
+         Entity dictionary compatible with graph
+     """
+     entity_type = SYMBOL_TYPE_TO_ENTITY_TYPE.get(symbol.symbol_type, "unknown")
+
+     # Generate entity ID
+     entity_id = generate_id_func(entity_type, symbol.name, symbol.file_path)
+
+     # Build properties from symbol metadata
+     properties = {
+         'description': symbol.docstring or '',
+         'parent_symbol': symbol.parent_symbol,
+         'full_name': symbol.full_name or symbol.get_qualified_name(),
+         'visibility': symbol.visibility,
+         'is_static': symbol.is_static,
+         'is_async': symbol.is_async,
+         'is_exported': symbol.is_exported,
+         'signature': symbol.signature,
+         'return_type': symbol.return_type,
+     }
+     # Add any extra metadata
+     properties.update(symbol.metadata)
+     # Remove None values
+     properties = {k: v for k, v in properties.items() if v is not None}
+
+     # Create citation with line range
+     line_start = symbol.range.start.line if symbol.range else 1
+     line_end = symbol.range.end.line if symbol.range else line_start
+
+     citation = Citation(
+         file_path=symbol.file_path,
+         line_start=line_start,
+         line_end=line_end,
+         source_toolkit=source_toolkit,
+         doc_id=f"{source_toolkit}://{symbol.file_path}",
+     )
+
+     return {
+         'id': entity_id,
+         'name': symbol.name,
+         'type': entity_type,
+         'citation': citation,
+         'properties': properties,
+         'source': 'parser',  # Mark as parser-extracted
+     }
+
+
+ def _parser_relationship_to_dict(
+     rel: ParserRelationship,
+     source_toolkit: str,
+ ) -> Dict[str, Any]:
+     """
+     Convert a parser Relationship to a relation dict.
+
+     Args:
+         rel: Parsed relationship from code parser
+         source_toolkit: Source toolkit name
+
+     Returns:
+         Relation dictionary compatible with graph
+     """
+     rel_type = PARSER_REL_TYPE_TO_STRING.get(rel.relationship_type, "references")
+
+     return {
+         'source_symbol': rel.source_symbol,
+         'target_symbol': rel.target_symbol,
+         'relation_type': rel_type,
+         'source_file': rel.source_file,
+         'target_file': rel.target_file,
+         'confidence': rel.confidence,
+         'is_cross_file': rel.is_cross_file,
+         'source': 'parser',  # Mark as parser-extracted
+         'source_toolkit': source_toolkit,
+     }
+
+ # ============================================================================
+ # ENTITY TYPE NORMALIZATION
+ # ============================================================================
+
+ # Types that should never be deduplicated (context-dependent)
+ CONTEXT_DEPENDENT_TYPES = {
+     "tool", "property", "properties", "parameter", "argument",
+     "field", "column", "attribute", "option", "setting",
+     "step", "test_step", "ui_field", "endpoint", "method",
+     "mcp_tool", "mcp_resource",
+     # File-level nodes are unique per file path
+     "file", "source_file", "document_file", "config_file", "web_file",
+ }
+
+ # Build canonical type set from ENTITY_TAXONOMY
+ _CANONICAL_TYPES = set()
+ for layer_data in ENTITY_TAXONOMY.values():
+     for type_def in layer_data["types"]:
+         _CANONICAL_TYPES.add(type_def["name"].lower())
+
+ # Map common variations to canonical forms
+ TYPE_NORMALIZATION_MAP = {
+     # Tool/Toolkit variations
+     "tools": "tool",
+     "Tool": "tool",
+     "Tools": "tool",
+     "Toolkit": "toolkit",
+     "toolkits": "toolkit",
+     # MCP variations
+     "MCP Server": "mcp_server",
+     "MCP Tool": "mcp_tool",
+     "MCP Resource": "mcp_resource",
+     # Common variations
+     "Feature": "feature",
+     "Features": "feature",
+     "API": "api",
+     "APIs": "api",
+     "Service": "service",
+     "Services": "service",
+     "Endpoint": "endpoint",
+     "Endpoints": "endpoint",
+     "Configuration": "configuration",
+     "Config": "configuration",
+     "Test Case": "test_case",
+     "Test Cases": "test_case",
+     "test case": "test_case",
+     "User Story": "user_story",
+     "User Stories": "user_story",
+     "user story": "user_story",
+     "Business Rule": "business_rule",
+     "business rule": "business_rule",
+     "UI Component": "ui_component",
+     "ui component": "ui_component",
+     "UI Field": "ui_field",
+     "ui field": "ui_field",
+     "Test Suite": "test_suite",
+     "test suite": "test_suite",
+     "Test Step": "test_step",
+     "test step": "test_step",
+     "Glossary Term": "glossary_term",
+     "glossary term": "glossary_term",
+     "Domain Entity": "domain_entity",
+     "domain entity": "domain_entity",
+     "Pull Request": "pull_request",
+     "pull request": "pull_request",
+ }
+
+ def normalize_entity_type(entity_type: str) -> str:
+     """
+     Normalize entity type to canonical lowercase form.
+
+     Args:
+         entity_type: Raw entity type from LLM extraction
+
+     Returns:
+         Canonical lowercase entity type
+     """
+     if not entity_type:
+         return "unknown"
+
+     # Check explicit mapping first
+     if entity_type in TYPE_NORMALIZATION_MAP:
+         return TYPE_NORMALIZATION_MAP[entity_type]
+
+     # Normalize: lowercase, replace spaces with underscores
+     normalized = entity_type.lower().strip().replace(" ", "_").replace("-", "_")
+
+     # If it's already canonical, return it
+     if normalized in _CANONICAL_TYPES:
+         return normalized
+
+     # Handle plural forms by removing trailing 's' (but not 'ss' like 'class')
+     if normalized.endswith('s') and not normalized.endswith('ss') and len(normalized) > 3:
+         singular = normalized[:-1]
+         if singular in _CANONICAL_TYPES:
+             return singular
+
+     # Return the normalized form even if not in taxonomy
+     # (allows for custom types while maintaining consistency)
+     return normalized
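To make the normalization rules above concrete (explicit map lookup first, then lowercasing and underscoring, then canonical and singular checks), a small sketch assuming the same module path:

    from alita_sdk.community.inventory.ingestion import normalize_entity_type

    assert normalize_entity_type("Test Cases") == "test_case"    # explicit TYPE_NORMALIZATION_MAP entry
    assert normalize_entity_type("APIs") == "api"
    assert normalize_entity_type("Pull Request") == "pull_request"
    # Anything unmapped is lowercased with spaces/hyphens turned into underscores
    assert normalize_entity_type("Deployment Target") == "deployment_target"
    assert normalize_entity_type("") == "unknown"                # empty input falls back to "unknown"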
+
+
+ class IngestionResult(BaseModel):
+     """Result of an ingestion run."""
+     success: bool = True
+     source: str = "unknown"
+     documents_processed: int = 0
+     documents_skipped: int = 0
+     entities_added: int = 0
+     entities_removed: int = 0
+     relations_added: int = 0
+     duration_seconds: float = 0.0
+     errors: List[str] = Field(default_factory=list)
+     failed_documents: List[str] = Field(default_factory=list)
+     graph_stats: Dict[str, Any] = Field(default_factory=dict)
+     resumed_from_checkpoint: bool = False
+
+     def __str__(self) -> str:
+         status = "✅ Success" if self.success else "❌ Failed"
+         resumed = " (resumed)" if self.resumed_from_checkpoint else ""
+         skipped_info = f"\n Documents skipped: {self.documents_skipped}" if self.documents_skipped else ""
+         failed_info = f"\n Failed documents: {len(self.failed_documents)}" if self.failed_documents else ""
+         return (
+             f"{status}: Ingestion from {self.source}{resumed}\n"
+             f" Documents processed: {self.documents_processed}{skipped_info}{failed_info}\n"
+             f" Entities added: {self.entities_added}\n"
+             f" Relations added: {self.relations_added}\n"
+             f" Duration: {self.duration_seconds:.1f}s\n"
+             f" Graph: {self.graph_stats.get('node_count', 0)} entities, "
+             f"{self.graph_stats.get('edge_count', 0)} relations"
+         )
+
+
+ class IngestionCheckpoint(BaseModel):
+     """
+     Checkpoint for resumable ingestion.
+
+     Saved periodically during ingestion to allow recovery from failures.
+     """
+     # Run identification
+     run_id: str = Field(description="Unique identifier for this ingestion run")
+     source: str = Field(description="Source toolkit name")
+     started_at: str = Field(description="ISO timestamp when ingestion started")
+     updated_at: str = Field(description="ISO timestamp of last checkpoint update")
+
+     # Configuration
+     branch: Optional[str] = None
+     whitelist: Optional[List[str]] = None
+     blacklist: Optional[List[str]] = None
+     extract_relations: bool = True
+
+     # Progress tracking
+     phase: str = Field(default="fetch", description="Current phase: fetch, extract, relations, complete")
+     documents_processed: int = 0
+     entities_added: int = 0
+     relations_added: int = 0
+
+     # Processed document tracking with content hashes for incremental updates
+     # Maps file_path -> content_hash (allows detecting changed files)
+     processed_files: List[str] = Field(default_factory=list)  # Legacy: just paths
+     file_hashes: Dict[str, str] = Field(default_factory=dict)  # New: path -> content_hash
+
+     # Failed document tracking for retry
+     failed_files: List[Dict[str, Any]] = Field(default_factory=list)  # [{file_path, error, attempts}]
+
+     # Collected entities for relation extraction (stored if phase changes)
+     pending_entities: List[Dict[str, Any]] = Field(default_factory=list)
+
+     # Status
+     completed: bool = False
+     errors: List[str] = Field(default_factory=list)
+
+     @classmethod
+     def create(cls, source: str, branch: Optional[str] = None,
+                whitelist: Optional[List[str]] = None,
+                blacklist: Optional[List[str]] = None,
+                extract_relations: bool = True) -> 'IngestionCheckpoint':
+         """Create a new checkpoint for a fresh ingestion run."""
+         import uuid
+         now = datetime.utcnow().isoformat()
+         return cls(
+             run_id=str(uuid.uuid4())[:8],
+             source=source,
+             started_at=now,
+             updated_at=now,
+             branch=branch,
+             whitelist=whitelist,
+             blacklist=blacklist,
+             extract_relations=extract_relations,
+         )
+
+     def save(self, checkpoint_path: str) -> None:
+         """Save checkpoint to disk."""
+         import json
+         self.updated_at = datetime.utcnow().isoformat()
+         path = Path(checkpoint_path)
+         path.parent.mkdir(parents=True, exist_ok=True)
+
+         # Write to temp file first, then rename for atomicity
+         temp_path = path.with_suffix('.tmp')
+         with open(temp_path, 'w') as f:
+             json.dump(self.model_dump(), f, indent=2, default=str)
+         temp_path.rename(path)
+
+         logger.debug(f"Checkpoint saved: {self.documents_processed} docs, {self.entities_added} entities")
+
+     @classmethod
+     def load(cls, checkpoint_path: str) -> Optional['IngestionCheckpoint']:
+         """Load checkpoint from disk. Returns None if not found."""
+         import json
+         path = Path(checkpoint_path)
+         if not path.exists():
+             return None
+
+         try:
+             with open(path) as f:
+                 data = json.load(f)
+             return cls(**data)
+         except Exception as e:
+             logger.warning(f"Failed to load checkpoint: {e}")
+             return None
+
+     def mark_file_processed(self, file_path: str, content_hash: Optional[str] = None) -> None:
+         """Mark a file as successfully processed with optional content hash."""
+         if file_path not in self.processed_files:
+             self.processed_files.append(file_path)
+         if content_hash:
+             self.file_hashes[file_path] = content_hash
+
+     def mark_file_failed(self, file_path: str, error: str) -> None:
+         """Mark a file as failed with error details."""
+         # Check if already in failed list
+         for failed in self.failed_files:
+             if failed['file_path'] == file_path:
+                 failed['attempts'] = failed.get('attempts', 1) + 1
+                 failed['last_error'] = error
+                 return
+
+         self.failed_files.append({
+             'file_path': file_path,
+             'error': error,
+             'attempts': 1
+         })
+
+     def is_file_processed(self, file_path: str) -> bool:
+         """Check if a file has already been processed."""
+         return file_path in self.processed_files
+
+     def has_file_changed(self, file_path: str, content_hash: str) -> bool:
+         """
+         Check if a file has changed since last processing.
+
+         Returns True if:
+         - File was never processed before
+         - File was processed but we don't have its hash (legacy)
+         - File content hash differs from stored hash
+         """
+         if file_path not in self.file_hashes:
+             return True  # Never seen or no hash stored
+         return self.file_hashes.get(file_path) != content_hash
+
+     def get_file_hash(self, file_path: str) -> Optional[str]:
+         """Get stored content hash for a file."""
+         return self.file_hashes.get(file_path)
+
+     def get_retry_files(self, max_attempts: int = 3) -> List[str]:
+         """Get files that should be retried (under max attempts)."""
+         return [
+             f['file_path'] for f in self.failed_files
+             if f.get('attempts', 1) < max_attempts
+         ]
+
+
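A short sketch of how the checkpoint above supports resumable and incremental runs; the paths and content hash are illustrative, only the API calls come from the class:

    import hashlib
    from alita_sdk.community.inventory.ingestion import IngestionCheckpoint

    ckpt = IngestionCheckpoint.create(source="github", branch="main")
    digest = hashlib.md5(b"def handler(): ...").hexdigest()  # any stable content hash works

    ckpt.mark_file_processed("src/app.py", content_hash=digest)
    ckpt.mark_file_failed("src/broken.py", error="timeout")
    ckpt.save("/tmp/.ingestion-checkpoint-github.json")

    resumed = IngestionCheckpoint.load("/tmp/.ingestion-checkpoint-github.json")
    assert resumed is not None and resumed.is_file_processed("src/app.py")
    assert not resumed.has_file_changed("src/app.py", digest)    # same hash -> can be skipped
    assert resumed.has_file_changed("src/app.py", "other-hash")  # changed -> re-ingest
    assert resumed.get_retry_files() == ["src/broken.py"]        # 1 attempt < default max of 3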
+ class IngestionPipeline(BaseModel):
+     """
+     Pipeline for ingesting source code into a knowledge graph.
+
+     This is a workflow, not a toolkit. It processes sources and produces
+     a graph dump that can be queried by the RetrievalToolkit.
+
+     The pipeline:
+     1. Connects to source toolkits (GitHub, ADO, LocalGit, etc.)
+     2. Fetches documents via their loader() methods
+     3. Uses LLM to extract entities based on ENTITY_TAXONOMY
+     4. Uses LLM to extract relations based on RELATIONSHIP_TAXONOMY
+     5. Persists graph to JSON (auto-save after mutations)
+
+     Configuration can be provided directly or via IngestionConfig:
+
+         # Direct configuration
+         pipeline = IngestionPipeline(
+             llm=llm,
+             graph_path="./graph.json",
+             guardrails=GuardrailsConfig(max_tokens_per_doc=4000),
+         )
+
+         # From config file
+         config = IngestionConfig.from_yaml("config.yml")
+         pipeline = IngestionPipeline.from_config(config)
+     """
+
+     # Core dependencies
+     llm: Any = None
+     alita: Any = None
+
+     # Graph persistence path
+     graph_path: str = Field(description="Path to persist the knowledge graph JSON")
+
+     # Source toolkits (injected by runtime)
+     # Maps toolkit name -> toolkit instance (e.g., {'github': GitHubApiWrapper})
+     source_toolkits: Dict[str, Any] = Field(default_factory=dict)
+
+     # Optional embedding for semantic search
+     embedding: Optional[Any] = Field(default=None, description="Embedding model instance")
+     embedding_model: Optional[str] = Field(default=None, description="Embedding model name (for Alita)")
+
+     # Guardrails configuration
+     guardrails: Optional[Any] = Field(
+         default=None,
+         description="GuardrailsConfig for rate limiting, content filtering, etc."
+     )
+
+     # Checkpoint configuration for resumable ingestion
+     checkpoint_dir: Optional[str] = Field(
+         default=None,
+         description="Directory to store checkpoints. If None, uses graph_path directory."
+     )
+     checkpoint_interval: int = Field(
+         default=10,
+         description="Save checkpoint every N documents processed"
+     )
+
+     # Parallel processing configuration
+     max_parallel_extractions: int = Field(
+         default=10,
+         description="Maximum number of parallel entity extraction requests (default: 10)"
+     )
+     batch_size: int = Field(
+         default=10,
+         description="Number of documents to process in each parallel batch (default: 10)"
+     )
+
+     # Skip trivial files configuration
+     min_file_lines: int = Field(
+         default=20,
+         description="Minimum number of lines for LLM extraction (smaller files only use parser)"
+     )
+     min_file_chars: int = Field(
+         default=300,
+         description="Minimum number of characters for LLM extraction (smaller files only use parser)"
+     )
+
+     # Progress callback (optional)
+     # Signature: callback(message: str, phase: str) -> None
+     progress_callback: Optional[Callable[[str, str], None]] = None
+
+     # Private attributes
+     _embedding: Optional[Any] = PrivateAttr(default=None)
+     _knowledge_graph: Optional[KnowledgeGraph] = PrivateAttr(default=None)
+     _document_classifier: Optional[DocumentClassifier] = PrivateAttr(default=None)
+     _schema_discoverer: Optional[EntitySchemaDiscoverer] = PrivateAttr(default=None)
+     _entity_extractor: Optional[EntityExtractor] = PrivateAttr(default=None)
+     _relation_extractor: Optional[RelationExtractor] = PrivateAttr(default=None)
+     _initialized: bool = PrivateAttr(default=False)
+     _last_request_time: float = PrivateAttr(default=0.0)
+     _request_count: int = PrivateAttr(default=0)
+     _current_checkpoint: Optional[IngestionCheckpoint] = PrivateAttr(default=None)
+
+     class Config:
+         arbitrary_types_allowed = True
+
+     def model_post_init(self, __context) -> None:
+         """Initialize after model construction."""
+         # Initialize knowledge graph
+         self._knowledge_graph = KnowledgeGraph()
+
+         # Handle model_construct case where graph_path may not be set
+         graph_path = getattr(self, 'graph_path', None)
+         if graph_path:
+             try:
+                 path = Path(graph_path)
+                 if path.exists():
+                     self._knowledge_graph.load_from_json(graph_path)
+                     stats = self._knowledge_graph.get_stats()
+                     logger.info(f"Loaded existing graph: {stats['node_count']} entities, {stats['edge_count']} relations")
+             except Exception as e:
+                 logger.warning(f"Could not load existing graph: {e}")
+
+         self._init_extractors()
+
+     def _init_extractors(self) -> bool:
+         """Initialize LLM-based extractors."""
+         if self._initialized:
+             return True
+
+         if not self.llm:
+             logger.warning("LLM not configured - extraction will fail")
+             return False
+
+         # Initialize embedding if configured (either directly or via Alita)
+         if self.embedding:
+             self._embedding = self.embedding
+         elif self.alita and self.embedding_model:
+             try:
+                 self._embedding = self.alita.get_embeddings(self.embedding_model)
+             except Exception as e:
+                 logger.warning(f"Could not initialize embeddings: {e}")
+
+         # Initialize extractors
+         self._document_classifier = DocumentClassifier(llm=self.llm)
+         self._schema_discoverer = EntitySchemaDiscoverer(llm=self.llm)
+         self._entity_extractor = EntityExtractor(llm=self.llm, embedding=self._embedding)
+         self._relation_extractor = RelationExtractor(llm=self.llm)
+         self._initialized = True
+
+         logger.info("Ingestion extractors initialized")
+         return True
+
+     def _apply_rate_limit(self) -> None:
+         """Apply rate limiting if configured in guardrails."""
+         if not self.guardrails:
+             return
+
+         rpm = getattr(self.guardrails, 'rate_limit_requests_per_minute', None)
+         if not rpm:
+             return
+
+         # Calculate minimum interval between requests
+         min_interval = 60.0 / rpm
+         elapsed = time.time() - self._last_request_time
+
+         if elapsed < min_interval:
+             sleep_time = min_interval - elapsed
+             logger.debug(f"Rate limiting: sleeping {sleep_time:.2f}s")
+             time.sleep(sleep_time)
+
+         self._last_request_time = time.time()
+         self._request_count += 1
+
+     def _filter_content(self, content: str) -> str:
+         """Apply content filtering based on guardrails."""
+         if not self.guardrails:
+             return content
+
+         if not getattr(self.guardrails, 'content_filter_enabled', False):
+             return content
+
+         filtered = content
+         patterns = getattr(self.guardrails, 'filter_patterns', [])
+
+         for pattern in patterns:
+             try:
+                 filtered = re.sub(pattern, '[FILTERED]', filtered, flags=re.IGNORECASE)
+             except re.error as e:
+                 logger.warning(f"Invalid filter pattern '{pattern}': {e}")
+
+         if filtered != content:
+             logger.debug("Content filtered for PII/secrets")
+
+         return filtered
+
+     def _get_max_entities(self) -> int:
+         """Get max entities per doc from guardrails."""
+         if self.guardrails:
+             return getattr(self.guardrails, 'max_entities_per_doc', 50)
+         return 50
+
+     def _get_max_relations(self) -> int:
+         """Get max relations per doc from guardrails."""
+         if self.guardrails:
+             return getattr(self.guardrails, 'max_relations_per_doc', 100)
+         return 100
+
+     def _get_confidence_threshold(self, for_relations: bool = False) -> float:
+         """Get confidence threshold from guardrails."""
+         if not self.guardrails:
+             return 0.5
+
+         if for_relations:
+             return getattr(self.guardrails, 'relation_confidence_threshold', 0.5)
+         return getattr(self.guardrails, 'entity_confidence_threshold', 0.5)
+
+     def _log_progress(self, message: str, phase: str = "ingestion") -> None:
+         """Log progress and call callback if set."""
+         logger.info(f"[{phase}] {message}")
+         if self.progress_callback:
+             try:
+                 self.progress_callback(message, phase)
+             except Exception as e:
+                 logger.debug(f"Progress callback failed: {e}")
+
+     def _auto_save(self) -> None:
+         """Auto-save graph after mutations."""
+         if self.graph_path:
+             try:
+                 self._knowledge_graph.dump_to_json(self.graph_path)
+                 logger.debug(f"Auto-saved graph to {self.graph_path}")
+             except Exception as e:
+                 logger.warning(f"Failed to auto-save: {e}")
+
+     def _get_checkpoint_path(self, source: str) -> str:
+         """Get checkpoint file path for a source."""
+         if self.checkpoint_dir:
+             base_dir = Path(self.checkpoint_dir)
+         else:
+             base_dir = Path(self.graph_path).parent
+
+         return str(base_dir / f".ingestion-checkpoint-{source}.json")
+
+     def _save_checkpoint(self, checkpoint: IngestionCheckpoint) -> None:
+         """Save checkpoint to disk."""
+         try:
+             checkpoint_path = self._get_checkpoint_path(checkpoint.source)
+             checkpoint.save(checkpoint_path)
+         except Exception as e:
+             logger.warning(f"Failed to save checkpoint: {e}")
+
+     def _load_checkpoint(self, source: str) -> Optional[IngestionCheckpoint]:
+         """Load checkpoint from disk if exists."""
+         checkpoint_path = self._get_checkpoint_path(source)
+         return IngestionCheckpoint.load(checkpoint_path)
+
+     def _clear_checkpoint(self, source: str) -> None:
+         """Clear checkpoint file after successful completion."""
+         try:
+             checkpoint_path = Path(self._get_checkpoint_path(source))
+             if checkpoint_path.exists():
+                 checkpoint_path.unlink()
+                 logger.debug(f"Cleared checkpoint for {source}")
+         except Exception as e:
+             logger.warning(f"Failed to clear checkpoint: {e}")
+
+     def clear_checkpoint(self, source: str) -> bool:
+         """
+         Clear checkpoint for a source to force fresh ingestion.
+
+         Use this when you want to re-ingest everything from scratch,
+         ignoring previous file hashes and processing state.
+
+         Args:
+             source: Name of source toolkit
+
+         Returns:
+             True if checkpoint was cleared, False if no checkpoint existed
+         """
+         checkpoint_path = Path(self._get_checkpoint_path(source))
+         if checkpoint_path.exists():
+             self._clear_checkpoint(source)
+             self._log_progress(f"🗑️ Cleared checkpoint for {source}", "reset")
+             return True
+         return False
+
+     def get_checkpoint_info(self, source: str) -> Optional[Dict[str, Any]]:
+         """
+         Get information about existing checkpoint for a source.
+
+         Useful for checking if incremental update is available and
+         how many files are being tracked.
+
+         Args:
+             source: Name of source toolkit
+
+         Returns:
+             Dict with checkpoint info or None if no checkpoint exists
+         """
+         checkpoint = self._load_checkpoint(source)
+         if not checkpoint:
+             return None
+
+         return {
+             'run_id': checkpoint.run_id,
+             'completed': checkpoint.completed,
+             'phase': checkpoint.phase,
+             'started_at': checkpoint.started_at,
+             'updated_at': checkpoint.updated_at,
+             'documents_processed': checkpoint.documents_processed,
+             'entities_added': checkpoint.entities_added,
+             'relations_added': checkpoint.relations_added,
+             'files_tracked': len(checkpoint.file_hashes),
+             'files_processed': len(checkpoint.processed_files),
+             'files_failed': len(checkpoint.failed_files),
+         }
+
+     def _generate_entity_id(self, entity_type: str, name: str, file_path: str = None) -> str:
+         """
+         Generate unique entity ID.
+
+         For most entity types, IDs are based on (type, name) only - NOT file_path.
+         This enables same-named entities from different files to be merged,
+         creating a unified knowledge graph with multiple citations per entity.
+
+         HOWEVER, for context-dependent types (tools, properties, etc.), the file_path
+         IS included because the same name in different files means different things:
+         - "Get Tests" tool in Xray toolkit != "Get Tests" tool in Zephyr toolkit
+         - "name" property in User entity != "name" property in Project entity
+         """
+         # Types that are context-dependent - same name in different files = different entities
+         CONTEXT_DEPENDENT_TYPES = {
+             "tool", "property", "properties", "parameter", "argument",
+             "field", "column", "attribute", "option", "setting",
+             "step", "test_step", "ui_field", "endpoint", "method",
+             # File-level nodes are unique per file path
+             "file", "source_file", "document_file", "config_file", "web_file",
+         }
+
+         # Normalize name for consistent hashing
+         normalized_name = name.lower().strip()
+         normalized_type = entity_type.lower().strip()
+
+         # Include file_path for context-dependent types
+         if normalized_type in CONTEXT_DEPENDENT_TYPES and file_path:
+             # Use file path to differentiate same-named entities from different contexts
+             content = f"{entity_type}:{normalized_name}:{file_path}"
+         else:
+             # Standard: merge same-named entities across files
+             content = f"{entity_type}:{normalized_name}"
+
+         return hashlib.md5(content.encode()).hexdigest()[:12]
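To illustrate the merge behaviour of _generate_entity_id above, the sketch below mirrors its hashing recipe directly, with hypothetical file names and an excerpt (not the full set) of the context-dependent types:

    import hashlib
    from typing import Optional

    def entity_id(entity_type: str, name: str, file_path: Optional[str] = None) -> str:
        # Same recipe as IngestionPipeline._generate_entity_id: context-dependent
        # types mix the file path into the hash; everything else hashes (type, name).
        context_dependent = {"tool", "property", "field", "endpoint", "method"}  # excerpt
        key = f"{entity_type}:{name.lower().strip()}"
        if entity_type.lower().strip() in context_dependent and file_path:
            key += f":{file_path}"
        return hashlib.md5(key.encode()).hexdigest()[:12]

    # A class named PaymentService merges across files: one node, multiple citations
    assert entity_id("class", "PaymentService", "a.py") == entity_id("class", "PaymentService", "b.py")
    # A tool named "Get Tests" stays separate per toolkit definition file
    assert entity_id("tool", "Get Tests", "xray.py") != entity_id("tool", "Get Tests", "zephyr.py")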
+
+     def _normalize_document(self, doc: Any, source_toolkit: str) -> Optional[Document]:
+         """Normalize various document formats to LangChain Document."""
+         if isinstance(doc, Document):
+             # Already a Document, ensure metadata has source_toolkit
+             doc.metadata['source_toolkit'] = source_toolkit
+             return doc
+
+         if isinstance(doc, dict):
+             # Dict from loader generator
+             content = doc.get('file_content') or doc.get('page_content', '')
+             if not content:
+                 return None
+
+             metadata = {
+                 'file_path': doc.get('file_name') or doc.get('source', 'unknown'),
+                 'commit_hash': doc.get('commit_hash'),
+                 'source_toolkit': source_toolkit,
+             }
+             # Merge additional metadata
+             for k, v in doc.items():
+                 if k not in ('file_content', 'page_content', 'file_name', 'source', 'commit_hash'):
+                     metadata[k] = v
+
+             return Document(page_content=content, metadata=metadata)
+
+         logger.warning(f"Unknown document type: {type(doc)}")
+         return None
+
+     def _extract_entities_from_doc(
+         self,
+         doc: Document,
+         source_toolkit: str,
+         schema: Optional[Dict] = None
+     ) -> Tuple[List[Dict[str, Any]], List[str], List[Dict[str, Any]]]:
+         """Extract entities from a single document.
+
+         Uses parser-first approach:
+         1. For code files with parser: Use AST/regex parsers to extract symbols (no LLM)
+         2. For code files without parser: HYBRID FALLBACK - TextParser + full LLM
+         3. For non-code files: LLM extracts semantic entities
+         4. For all files with parser: Also run LLM for semantic entities not in code structure
+
+         Returns:
+             Tuple of (entities, failed_file_paths, parser_relationships) where:
+             - entities: List of extracted entity dicts
+             - failed_file_paths: File path if extraction failed, empty list otherwise
+             - parser_relationships: List of relationships from parser (for code files)
+         """
+         file_path = (doc.metadata.get('file_path') or
+                      doc.metadata.get('file_name') or
+                      doc.metadata.get('source', 'unknown'))
+
+         entities = []
+         parser_relationships = []
+         failed_docs = []
+
+         # Get chunk position info for line number adjustment
+         start_line = doc.metadata.get('start_line') or doc.metadata.get('line_start')
+
+         # ========== PARSER-FIRST EXTRACTION ==========
+         # Try to use parser for code files (AST/regex - no LLM needed)
+         parser = get_parser_for_file(file_path)
+         parser_extracted_names = set()  # Track what parser extracted to avoid LLM duplication
+         use_full_llm_extraction = False  # Flag for hybrid fallback
+
+         if parser and _is_code_file(file_path):
+             try:
+                 # Parse file content with language-specific parser
+                 parse_result = parser_parse_file(file_path, content=doc.page_content)
+
+                 # Build symbol name to entity ID mapping for containment edges
+                 symbol_name_to_entity_id = {}
+
+                 # Convert symbols to entities
+                 for symbol in parse_result.symbols:
+                     entity = _symbol_to_entity(
+                         symbol,
+                         source_toolkit,
+                         self._generate_entity_id
+                     )
+                     # Update citation with commit hash if available
+                     if doc.metadata.get('commit_hash'):
+                         entity['citation'].content_hash = doc.metadata.get('commit_hash')
+
+                     entities.append(entity)
+                     parser_extracted_names.add(symbol.name.lower())
+
+                     # Track symbol full name to entity ID for containment edges
+                     full_name = symbol.full_name or symbol.get_qualified_name() or symbol.name
+                     symbol_name_to_entity_id[full_name] = entity['id']
+                     # Also track by simple name for fallback matching
+                     symbol_name_to_entity_id[symbol.name] = entity['id']
+
+                 # Convert relationships from parser
+                 for rel in parse_result.relationships:
+                     parser_relationships.append(
+                         _parser_relationship_to_dict(rel, source_toolkit)
+                     )
+
+                 # ========== INTRA-FILE CONTAINMENT EDGES ==========
+                 # Create containment relationships based on Symbol.parent_symbol
+                 containment_count = 0
+                 for symbol in parse_result.symbols:
+                     if symbol.parent_symbol:
+                         # Find parent entity ID
+                         child_full_name = symbol.full_name or symbol.get_qualified_name() or symbol.name
+                         child_id = symbol_name_to_entity_id.get(child_full_name) or symbol_name_to_entity_id.get(symbol.name)
+
+                         # Try to find parent by full name or simple name
+                         parent_id = symbol_name_to_entity_id.get(symbol.parent_symbol)
+
+                         if child_id and parent_id and child_id != parent_id:
+                             parser_relationships.append({
+                                 'source_symbol': symbol.parent_symbol,
+                                 'target_symbol': child_full_name,
+                                 'relation_type': 'contains',
+                                 'source_file': file_path,
+                                 'target_file': file_path,
+                                 'confidence': 1.0,  # High confidence - structural
+                                 'is_cross_file': False,
+                                 'source': 'parser',
+                                 'source_toolkit': source_toolkit,
+                                 # Pre-resolved IDs for graph insertion
+                                 '_resolved_source_id': parent_id,
+                                 '_resolved_target_id': child_id,
+                             })
+                             containment_count += 1
+
+                 logger.debug(f"Parser extracted {len(entities)} entities, {len(parser_relationships)} relationships ({containment_count} containment) from {file_path}")
+
+             except Exception as e:
+                 logger.warning(f"Parser failed for {file_path}: {e}, using hybrid fallback")
+                 use_full_llm_extraction = True  # Enable full LLM extraction
+
+         elif _is_code_like_file(file_path) and not parser:
+             # ========== HYBRID FALLBACK ==========
+             # File looks like code but no parser available (e.g., .lua, .perl, .sh)
+             # Use TextParser to extract textual references + full LLM extraction
+             logger.info(f"Hybrid fallback for unsupported code file: {file_path}")
+             use_full_llm_extraction = True
+
+             try:
+                 # Use TextParser to extract textual references
+                 from .parsers import TextParser
+                 text_parser = TextParser()
+                 parse_result = text_parser.parse_file(file_path, content=doc.page_content)
+
+                 # Extract any textual relationships (See X, Depends on Y, etc.)
+                 for rel in parse_result.relationships:
+                     parser_relationships.append(
+                         _parser_relationship_to_dict(rel, source_toolkit)
+                     )
+
+                 logger.debug(f"TextParser extracted {len(parse_result.relationships)} textual references from {file_path}")
+
+             except Exception as e:
+                 logger.warning(f"TextParser failed for {file_path}: {e}")
+
+         # ========== LLM EXTRACTION (semantic entities) ==========
+         # For code files with parser: LLM extracts only semantic entities (features, requirements, etc.)
+         # For hybrid fallback: LLM does full extraction including code structure
+         # For non-code files: LLM does full extraction
+
+         if self._entity_extractor:
+             try:
+                 # Extract entities - skip_on_error=True returns (entities, failed_docs)
+                 extracted, llm_failed_docs = self._entity_extractor.extract_batch(
+                     [doc], schema=schema, skip_on_error=True
+                 )
+                 failed_docs.extend(llm_failed_docs)
+
+                 for entity in extracted:
+                     entity_name = entity.get('name', '').lower()
+                     raw_type = entity.get('type', 'unknown')
+                     normalized_type = normalize_entity_type(raw_type)
+
+                     # Skip if parser already extracted this (avoid duplicates for code entities)
+                     # Only skip for code_layer types that parsers handle, and only if not hybrid fallback
+                     code_layer_types = {'class', 'function', 'method', 'module', 'interface',
+                                         'constant', 'variable', 'import', 'property', 'field'}
+                     if (not use_full_llm_extraction and
+                             entity_name in parser_extracted_names and
+                             normalized_type in code_layer_types):
+                         continue
+
+                     # Adjust line numbers if this is a chunk with offset
+                     entity_line_start = entity.get('line_start')
+                     entity_line_end = entity.get('line_end')
+
+                     if start_line and entity_line_start:
+                         entity_line_start = start_line + entity_line_start - 1
+                         if entity_line_end:
+                             entity_line_end = start_line + entity_line_end - 1
+
+                     entity_id = self._generate_entity_id(
+                         normalized_type,
+                         entity.get('name', 'unnamed'),
+                         file_path
+                     )
+
+                     # Create citation
+                     citation = Citation(
+                         file_path=file_path,
+                         line_start=entity_line_start or entity.get('line_start'),
+                         line_end=entity_line_end or entity.get('line_end'),
+                         source_toolkit=source_toolkit,
+                         doc_id=f"{source_toolkit}://{file_path}",
+                         content_hash=doc.metadata.get('commit_hash'),
+                     )
+
+                     entities.append({
+                         'id': entity_id,
+                         'name': entity.get('name', 'unnamed'),
+                         'type': normalized_type,
+                         'citation': citation,
+                         'properties': {
+                             k: v for k, v in entity.items()
+                             if k not in ('id', 'name', 'type', 'content', 'text', 'line_start', 'line_end')
+                         },
+                         'source_doc': doc,
+                         'source': 'llm_hybrid' if use_full_llm_extraction else 'llm',
+                     })
+
+             except Exception as e:
+                 logger.error(f"LLM extraction failed for {file_path}: {e}")
+                 failed_docs.append(file_path)
+
+         # =====================================================================
+         # FACT EXTRACTION - Lightweight LLM for semantic insights
+         # Code files: extract algorithms, behaviors, validations, dependencies
+         # Text files: extract decisions, requirements, definitions, dates
+         # =====================================================================
+         if self.llm:
+             try:
+                 fact_extractor = FactExtractor(self.llm)
+                 is_code = _is_code_file(file_path) or _is_code_like_file(file_path)
+
+                 # Use appropriate extraction method based on file type
+                 if is_code:
+                     facts = fact_extractor.extract_code(doc)
+                 else:
+                     facts = fact_extractor.extract(doc)
+
+                 for fact in facts:
+                     fact_id = self._generate_entity_id(
+                         'fact',
+                         f"{fact.get('fact_type', 'unknown')}_{fact.get('subject', 'unknown')[:30]}",
+                         file_path
+                     )
+
+                     # Create citation for the fact
+                     citation = Citation(
+                         file_path=file_path,
+                         line_start=fact.get('line_start'),
+                         line_end=fact.get('line_end'),
+                         source_toolkit=source_toolkit,
+                         doc_id=f"{source_toolkit}://{file_path}",
+                         content_hash=doc.metadata.get('commit_hash'),
+                     )
+
+                     entities.append({
+                         'id': fact_id,
+                         'name': fact.get('subject', 'unknown fact'),
+                         'type': 'fact',
+                         'citation': citation,
+                         'properties': {
+                             'fact_type': fact.get('fact_type'),
+                             'subject': fact.get('subject'),
+                             'predicate': fact.get('predicate'),
+                             'object': fact.get('object'),
+                             'confidence': fact.get('confidence', 0.8),
+                         },
+                         'source_doc': doc,
+                         'source': 'llm_fact',
+                     })
+
+                 logger.debug(f"Extracted {len(facts)} facts from {file_path}")
+             except Exception as e:
+                 logger.warning(f"Fact extraction failed for {file_path}: {e}")
1187
+ return entities, failed_docs, parser_relationships
1188
+
1189
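For orientation, the extraction method above returns a three-tuple of (entities, failed_docs, parser_relationships), and containment edges carry pre-resolved IDs under the `_resolved_source_id` / `_resolved_target_id` keys so graph insertion can skip name resolution. A minimal sketch of consuming that shape; the dictionary below is hand-built and illustrative (the entity ID format in particular is an assumption, not real SDK output):

# Illustrative only: a hand-built relationship dict in the shape produced above.
containment_rel = {
    'source_symbol': 'OrderService',
    'target_symbol': 'OrderService.place_order',
    'relation_type': 'contains',
    'source_file': 'services/order.py',
    'target_file': 'services/order.py',
    'confidence': 1.0,
    'is_cross_file': False,
    'source': 'parser',
    'source_toolkit': 'github',
    '_resolved_source_id': 'class:orderservice:services/order.py',   # assumed format
    '_resolved_target_id': 'method:place_order:services/order.py',   # assumed format
}

def edge_endpoints(rel: dict) -> tuple:
    """Prefer the pre-resolved IDs; fall back to the symbol names."""
    src = rel.get('_resolved_source_id') or rel['source_symbol']
    dst = rel.get('_resolved_target_id') or rel['target_symbol']
    return src, dst

print(edge_endpoints(containment_rel))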
+ def _process_documents_batch(
1190
+ self,
1191
+ documents: List[Document],
1192
+ source_toolkit: str,
1193
+ schema: Optional[Dict] = None
1194
+ ) -> Tuple[List[Dict[str, Any]], List[str], Dict[str, str], List[Dict[str, Any]]]:
1195
+ """
1196
+ Process a batch of documents in parallel for entity extraction.
1197
+
1198
+ Args:
1199
+ documents: List of documents to process
1200
+ source_toolkit: Source toolkit name
1201
+ schema: Optional schema for extraction
1202
+
1203
+ Returns:
1204
+ Tuple of (all_entities, failed_files, file_hashes, parser_relationships) where:
1205
+ - all_entities: Combined list of entities from all documents
1206
+ - failed_files: List of file paths that failed extraction
1207
+ - file_hashes: Dict mapping file_path to content_hash
1208
+ - parser_relationships: List of relationships from parsers (AST/regex extracted)
1209
+ """
1210
+ all_entities = []
1211
+ failed_files = []
1212
+ file_hashes = {}
1213
+ all_parser_relationships = []
1214
+
1215
+ # Use ThreadPoolExecutor for parallel extraction
1216
+ with ThreadPoolExecutor(max_workers=self.max_parallel_extractions) as executor:
1217
+ # Submit all extraction tasks
1218
+ future_to_doc = {
1219
+ executor.submit(self._extract_entities_from_doc, doc, source_toolkit, schema): doc
1220
+ for doc in documents
1221
+ }
1222
+
1223
+ # Process completed tasks as they finish
1224
+ for future in as_completed(future_to_doc):
1225
+ doc = future_to_doc[future]
1226
+ file_path = (doc.metadata.get('file_path') or
1227
+ doc.metadata.get('file_name') or
1228
+ doc.metadata.get('source', 'unknown'))
1229
+
1230
+ try:
1231
+ entities, extraction_failures, parser_relationships = future.result()
1232
+
1233
+ # Track content hash
1234
+ content_hash = hashlib.sha256(doc.page_content.encode()).hexdigest()
1235
+ file_hashes[file_path] = content_hash
1236
+
1237
+ # Add entities to batch results
1238
+ all_entities.extend(entities)
1239
+
1240
+ # Collect parser relationships
1241
+ all_parser_relationships.extend(parser_relationships)
1242
+
1243
+ # Track failures
1244
+ if extraction_failures:
1245
+ failed_files.extend(extraction_failures)
1246
+
1247
+ except Exception as e:
1248
+ logger.warning(f"Failed to process document '{file_path}': {e}")
1249
+ failed_files.append(file_path)
1250
+
1251
+ return all_entities, failed_files, file_hashes, all_parser_relationships
1252
+
1253
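The batch method above follows a standard fan-out/fan-in pattern: submit one extraction task per document, then collect results with `as_completed` so a slow or failing document does not block the rest, and record failures instead of aborting the batch. A self-contained sketch of the same pattern with a stand-in extractor (`fake_extract` is hypothetical, not an SDK function):

from concurrent.futures import ThreadPoolExecutor, as_completed

def fake_extract(doc_id: str) -> list:
    # Stand-in for per-document entity extraction.
    if doc_id == "broken.py":
        raise ValueError("unparseable")
    return [f"entity-from-{doc_id}"]

docs = ["a.py", "b.py", "broken.py"]
entities, failed = [], []

with ThreadPoolExecutor(max_workers=4) as executor:
    futures = {executor.submit(fake_extract, d): d for d in docs}
    for future in as_completed(futures):
        doc_id = futures[future]
        try:
            entities.extend(future.result())
        except Exception as exc:
            # Record the failure and keep processing the remaining documents.
            failed.append((doc_id, str(exc)))

print(entities, failed)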
+ def _process_batch_and_update_graph(
1254
+ self,
1255
+ doc_batch: List[Document],
1256
+ source: str,
1257
+ schema: Optional[Dict],
1258
+ checkpoint: IngestionCheckpoint,
1259
+ result: IngestionResult,
1260
+ all_entities: List[Dict[str, Any]],
1261
+ all_parser_relationships: List[Dict[str, Any]],
1262
+ is_incremental_update: bool
1263
+ ) -> None:
1264
+ """
1265
+ Process a batch of documents in parallel and update the graph.
1266
+
1267
+ This method extracts entities from all documents in the batch concurrently,
1268
+ then adds them to the graph sequentially (graph operations are not thread-safe).
1269
+
1270
+ Args:
1271
+ doc_batch: List of documents to process
1272
+ source: Source toolkit name
1273
+ schema: Optional schema for extraction
1274
+ checkpoint: Checkpoint for progress tracking
1275
+ result: IngestionResult to update
1276
+ all_entities: List to accumulate all entities
1277
+ all_parser_relationships: List to accumulate parser-extracted relationships
1278
+ is_incremental_update: Whether this is an incremental update
1279
+ """
1280
+ # Extract entities from all docs in parallel
1281
+ batch_entities, failed_files, file_hashes, parser_rels = self._process_documents_batch(
1282
+ doc_batch, source, schema
1283
+ )
1284
+
1285
+ # Update graph with batch results (sequential - graph is not thread-safe)
1286
+ for entity in batch_entities:
1287
+ self._knowledge_graph.add_entity(
1288
+ entity_id=entity['id'],
1289
+ name=entity['name'],
1290
+ entity_type=entity['type'],
1291
+ citation=entity['citation'],
1292
+ properties=entity['properties']
1293
+ )
1294
+ result.entities_added += 1
1295
+ all_entities.append(entity)
1296
+
1297
+ # Collect parser relationships for later processing
1298
+ all_parser_relationships.extend(parser_rels)
1299
+
1300
+ # Update checkpoint with processed files and hashes
1301
+ for file_path, content_hash in file_hashes.items():
1302
+ if file_path not in failed_files:
1303
+ checkpoint.mark_file_processed(file_path, content_hash)
1304
+ result.documents_processed += 1
1305
+
1306
+ # Track failed files
1307
+ for failed_file in failed_files:
1308
+ checkpoint.mark_file_failed(failed_file, "Entity extraction failed")
1309
+ if failed_file not in result.failed_documents:
1310
+ result.failed_documents.append(failed_file)
1311
+
1312
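Both batch helpers above key checkpoint progress on a SHA-256 hash of each file's content, which is also what the incremental-update path (`has_file_changed`) relies on later in `run()`. A rough sketch of that idea, with a plain dict standing in for the checkpoint object (the real checkpoint API is richer than this):

import hashlib

processed_hashes = {}  # stand-in for checkpoint.file_hashes

def content_hash(text: str) -> str:
    return hashlib.sha256(text.encode()).hexdigest()

def has_file_changed(path: str, text: str) -> bool:
    # A file is "changed" if its current hash differs from the recorded one.
    return processed_hashes.get(path) != content_hash(text)

processed_hashes["README.md"] = content_hash("v1 contents")
print(has_file_changed("README.md", "v1 contents"))  # False -> skip on next run
print(has_file_changed("README.md", "v2 contents"))  # True  -> reprocess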
+ def _process_file_batch_and_update_graph(
1313
+ self,
1314
+ file_batch: List[Tuple[str, List[Document], Document]],
1315
+ _raw_doc_by_file: Dict[str, Document], # Deprecated, kept for compatibility
1316
+ source: str,
1317
+ schema: Optional[Dict],
1318
+ checkpoint: IngestionCheckpoint,
1319
+ result: IngestionResult,
1320
+ all_entities: List[Dict[str, Any]],
1321
+ all_parser_relationships: List[Dict[str, Any]],
1322
+ is_incremental_update: bool
1323
+ ) -> None:
1324
+ """
1325
+ Process a batch of files with their chunks and update the graph.
1326
+
1327
+ For each file:
1328
+ 1. Run parser on whole file (AST/regex extraction - no LLM)
1329
+ 2. Run LLM on each chunk (facts + entities)
1330
+ 3. Deduplicate facts/entities at file level
1331
+ 4. Add to graph
1332
+
1333
+ Args:
1334
+ file_batch: List of (file_path, chunks, raw_doc) tuples
1335
+ _raw_doc_by_file: DEPRECATED - raw_doc is now passed in file_batch tuple
1336
+ source: Source toolkit name
1337
+ schema: Optional schema for extraction
1338
+ checkpoint: Checkpoint for progress tracking
1339
+ result: IngestionResult to update
1340
+ all_entities: List to accumulate all entities
1341
+ all_parser_relationships: List to accumulate parser-extracted relationships
1342
+ is_incremental_update: Whether this is an incremental update
1343
+ """
1344
+ # Process files in parallel
1345
+ batch_start_time = time.time()
1346
+ logger.info(f"⏱️ [TIMING] Batch start: {len(file_batch)} files")
1347
+
1348
+ with ThreadPoolExecutor(max_workers=self.max_parallel_extractions) as executor:
1349
+ future_to_file = {
1350
+ executor.submit(
1351
+ self._process_file_with_chunks,
1352
+ file_path, chunks, raw_doc, source, schema
1353
+ ): file_path
1354
+ for file_path, chunks, raw_doc in file_batch
1355
+ }
1356
+
1357
+ for future in as_completed(future_to_file):
1358
+ file_path = future_to_file[future]
1359
+
1360
+ try:
1361
+ file_entities, parser_rels, content_hash = future.result()
1362
+
1363
+ # Update graph with file results (sequential - graph is not thread-safe)
1364
+ for entity in file_entities:
1365
+ self._knowledge_graph.add_entity(
1366
+ entity_id=entity['id'],
1367
+ name=entity['name'],
1368
+ entity_type=entity['type'],
1369
+ citation=entity['citation'],
1370
+ properties=entity['properties']
1371
+ )
1372
+ result.entities_added += 1
1373
+ all_entities.append(entity)
1374
+
1375
+ # Collect parser relationships
1376
+ all_parser_relationships.extend(parser_rels)
1377
+
1378
+ # Mark file as processed
1379
+ checkpoint.mark_file_processed(file_path, content_hash)
1380
+ result.documents_processed += 1
1381
+
1382
+ except Exception as e:
1383
+ logger.warning(f"Failed to process file '{file_path}': {e}")
1384
+ checkpoint.mark_file_failed(file_path, str(e))
1385
+ if file_path not in result.failed_documents:
1386
+ result.failed_documents.append(file_path)
1387
+ result.documents_processed += 1
1388
+
1389
+ batch_duration = time.time() - batch_start_time
1390
+ logger.info(f"⏱️ [TIMING] Batch complete: {len(file_batch)} files in {batch_duration:.3f}s ({batch_duration/len(file_batch):.3f}s/file avg)")
1391
+
1392
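The method below guards LLM calls with cheap heuristics: very small files, license/boilerplate-dominated files, and barrel/re-export files are still parsed but not sent to the LLM. A standalone sketch of the same checks; the thresholds and indicator strings are illustrative, not the SDK's configured values:

def should_skip_llm(content: str,
                    min_lines: int = 5,
                    min_chars: int = 200) -> tuple:
    lines = content.split("\n")
    stripped = [l.strip() for l in lines if l.strip()]

    # 1. Too small to be worth an LLM call.
    if len(lines) < min_lines or len(content) < min_chars:
        return True, "small"

    # 2. Mostly license/boilerplate text.
    indicators = ("mit license", "apache license", "all rights reserved")
    hits = sum(1 for ind in indicators if ind in content.lower())
    code_lines = [l for l in stripped if not l.startswith(("#", "//", "/*", "*"))]
    if hits >= 2 and len(code_lines) < 0.2 * len(lines):
        return True, "license/boilerplate"

    # 3. Barrel / re-export file: almost every line is an import or export.
    reexports = sum(1 for l in stripped
                    if l.startswith(("import ", "export ", "from ")))
    if stripped and reexports / len(stripped) > 0.8:
        return True, "barrel/re-export"

    return False, ""

barrel = "export { a } from './a'\nexport { b } from './b'"
print(should_skip_llm(barrel, min_lines=1, min_chars=1))  # (True, 'barrel/re-export')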
+ def _process_file_with_chunks(
1393
+ self,
1394
+ file_path: str,
1395
+ chunks: List[Document],
1396
+ raw_doc: Document,
1397
+ source_toolkit: str,
1398
+ schema: Optional[Dict] = None
1399
+ ) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]], str]:
1400
+ """
1401
+ Process a single file: parser on whole file, LLM on chunks, dedupe at file level.
1402
+
1403
+ Args:
1404
+ file_path: Path to the file
1405
+ chunks: List of chunk Documents for this file
1406
+ raw_doc: Raw (whole file) Document for parser
1407
+ source_toolkit: Source toolkit name
1408
+ schema: Optional schema for extraction
1409
+
1410
+ Returns:
1411
+ Tuple of (deduplicated_entities, parser_relationships, content_hash)
1412
+ """
1413
+ all_entities = []
1414
+ parser_relationships = []
1415
+ content_hash = hashlib.sha256(raw_doc.page_content.encode()).hexdigest()
1416
+
1417
+ # Check if file is too small or trivial for LLM extraction
1418
+ file_content = raw_doc.page_content
1419
+ line_count = file_content.count('\n') + 1
1420
+ char_count = len(file_content)
1421
+
1422
+ # Detect trivial/boilerplate content
1423
+ skip_llm = False
1424
+ skip_reason = ""
1425
+
1426
+ # 1. Too small
1427
+ if line_count < self.min_file_lines or char_count < self.min_file_chars:
1428
+ skip_llm = True
1429
+ skip_reason = f"small ({line_count} lines, {char_count} chars)"
1430
+
1431
+ # 2. License-only files, or files where a license header makes up most of the content
1432
+ if not skip_llm:
1433
+ content_lower = file_content.lower()
1434
+ license_indicators = [
1435
+ 'apache license', 'mit license', 'bsd license', 'gpl license',
1436
+ 'licensed under the', 'permission is hereby granted',
1437
+ 'copyright (c)', 'copyright 20', 'all rights reserved',
1438
+ 'without warranties or conditions', 'provided "as is"',
1439
+ ]
1440
+ license_matches = sum(1 for ind in license_indicators if ind in content_lower)
1441
+
1442
+ # If 3+ license indicators and file is mostly comments/license text
1443
+ if license_matches >= 3:
1444
+ # Count actual code lines (non-empty, non-comment)
1445
+ code_lines = 0
1446
+ for line in file_content.split('\n'):
1447
+ stripped = line.strip()
1448
+ if stripped and not stripped.startswith(('#', '//', '/*', '*', '<!--', '"""', "'''")):
1449
+ code_lines += 1
1450
+
1451
+ # If less than 20% is actual code, it's mostly license/boilerplate
1452
+ if code_lines < line_count * 0.2:
1453
+ skip_llm = True
1454
+ skip_reason = f"license/boilerplate ({code_lines} code lines of {line_count})"
1455
+
1456
+ # 3. Re-export / barrel files (e.g., index.js with only exports)
1457
+ if not skip_llm:
1458
+ content_stripped = file_content.strip()
1459
+ lines = [l.strip() for l in content_stripped.split('\n') if l.strip()]
1460
+
1461
+ # Check if file is mostly import/export statements
1462
+ export_import_lines = sum(1 for l in lines if
1463
+ l.startswith(('export ', 'import ', 'from ', 'module.exports', 'exports.'))
1464
+ or l.startswith('export {') or l.startswith('export default')
1465
+ or 'require(' in l)
1466
+
1467
+ if len(lines) > 0 and export_import_lines / len(lines) > 0.8:
1468
+ skip_llm = True
1469
+ skip_reason = f"barrel/re-export file ({export_import_lines}/{len(lines)} export lines)"
1470
+
1471
+ if skip_llm:
1472
+ logger.debug(f"Skipping LLM for {Path(file_path).name}: {skip_reason}")
1473
+
1474
+ # ========== PARSER EXTRACTION (whole file, no LLM) ==========
1475
+ parser_start = time.time()
1476
+ parser = get_parser_for_file(file_path)
1477
+ parser_extracted_names = set()
1478
+
1479
+ if parser and _is_code_file(file_path):
1480
+ try:
1481
+ parse_result = parser_parse_file(file_path, content=raw_doc.page_content)
1482
+
1483
+ # Build symbol name to entity ID mapping for containment edges
1484
+ symbol_name_to_entity_id = {}
1485
+
1486
+ # Convert symbols to entities
1487
+ for symbol in parse_result.symbols:
1488
+ entity = _symbol_to_entity(
1489
+ symbol,
1490
+ source_toolkit,
1491
+ self._generate_entity_id
1492
+ )
1493
+ if raw_doc.metadata.get('commit_hash'):
1494
+ entity['citation'].content_hash = raw_doc.metadata.get('commit_hash')
1495
+
1496
+ all_entities.append(entity)
1497
+ parser_extracted_names.add(symbol.name.lower())
1498
+
1499
+ full_name = symbol.full_name or symbol.get_qualified_name()
1500
+ if full_name:
1501
+ symbol_name_to_entity_id[full_name] = entity['id']
1502
+
1503
+ # Convert relationships
1504
+ for rel in parse_result.relationships:
1505
+ parser_relationships.append(
1506
+ _parser_relationship_to_dict(rel, source_toolkit)
1507
+ )
1508
+
1509
+ # Add containment edges from parent_symbol
1510
+ containment_count = 0
1511
+ for symbol in parse_result.symbols:
1512
+ if symbol.parent_symbol:
1513
+ child_name = symbol.full_name or symbol.get_qualified_name()
1514
+ parent_name = symbol.parent_symbol
1515
+
1516
+ child_id = symbol_name_to_entity_id.get(child_name)
1517
+ parent_id = symbol_name_to_entity_id.get(parent_name)
1518
+
1519
+ if child_id and parent_id:
1520
+ parser_relationships.append({
1521
+ 'source_id': parent_id,
1522
+ 'target_id': child_id,
1523
+ 'relation_type': 'contains',
1524
+ 'properties': {
1525
+ 'source': 'parser',
1526
+ 'source_toolkit': source_toolkit,
1527
+ 'file_path': file_path,
1528
+ },
1529
+ 'source': 'parser',
1530
+ })
1531
+ containment_count += 1
1532
+
1533
+ logger.debug(f"Parser extracted {len(all_entities)} symbols, {len(parser_relationships)} relationships from {file_path}")
1534
+
1535
+ except Exception as e:
1536
+ logger.warning(f"Parser failed for {file_path}: {e}")
1537
+
1538
+ parser_duration = time.time() - parser_start
1539
+ if parser_duration > 0.1: # Only log if > 100ms
1540
+ logger.info(f"⏱️ [TIMING] Parser: {parser_duration:.3f}s for {file_path}")
1541
+
1542
+ # ========== LLM EXTRACTION (entity and fact extraction run in parallel) ==========
1543
+ chunk_entities = []
1544
+ chunk_facts = []
1545
+ entity_llm_duration = 0.0
1546
+ fact_llm_duration = 0.0
1547
+
1548
+ # Build chunk metadata for line number adjustment
1549
+ chunk_offsets = []
1550
+ for chunk in chunks:
1551
+ start_line = chunk.metadata.get('start_line') or chunk.metadata.get('line_start') or 1
1552
+ chunk_offsets.append(start_line)
1553
+
1554
+ # Helper functions for parallel execution
1555
+ def extract_entities():
1556
+ """Extract entities from chunks - runs in parallel thread."""
1557
+ entities = []
1558
+ if not self._entity_extractor or not chunks:
1559
+ return entities, 0.0
1560
+
1561
+ start = time.time()
1562
+ try:
1563
+ extracted, _ = self._entity_extractor.extract_batch(
1564
+ chunks, schema=schema, skip_on_error=True
1565
+ )
1566
+
1567
+ for entity in extracted:
1568
+ entity_name = entity.get('name', '').lower()
1569
+ raw_type = entity.get('type', 'unknown')
1570
+ normalized_type = normalize_entity_type(raw_type)
1571
+
1572
+ # Skip if parser already extracted this
1573
+ code_layer_types = {'class', 'function', 'method', 'module', 'interface',
1574
+ 'constant', 'variable', 'import', 'property', 'field'}
1575
+ if (entity_name in parser_extracted_names and
1576
+ normalized_type in code_layer_types):
1577
+ continue
1578
+
1579
+ entity_id = self._generate_entity_id(
1580
+ normalized_type,
1581
+ entity.get('name', 'unnamed'),
1582
+ file_path
1583
+ )
1584
+
1585
+ citation = Citation(
1586
+ file_path=file_path,
1587
+ line_start=entity.get('line_start'),
1588
+ line_end=entity.get('line_end'),
1589
+ source_toolkit=source_toolkit,
1590
+ doc_id=f"{source_toolkit}://{file_path}",
1591
+ content_hash=raw_doc.metadata.get('commit_hash'),
1592
+ )
1593
+
1594
+ entities.append({
1595
+ 'id': entity_id,
1596
+ 'name': entity.get('name', 'unnamed'),
1597
+ 'type': normalized_type,
1598
+ 'citation': citation,
1599
+ 'properties': {
1600
+ k: v for k, v in entity.items()
1601
+ if k not in ('id', 'name', 'type', 'content', 'text', 'line_start', 'line_end')
1602
+ },
1603
+ 'source_doc': chunks[0] if chunks else None,
1604
+ 'source': 'llm',
1605
+ })
1606
+ except Exception as e:
1607
+ logger.warning(f"Batched entity extraction failed for {file_path}: {e}")
1608
+
1609
+ return entities, time.time() - start
1610
+
1611
+ def extract_facts():
1612
+ """Extract facts from chunks - runs in parallel thread."""
1613
+ facts = []
1614
+ if not self.llm or not chunks:
1615
+ return facts, 0.0
1616
+
1617
+ start = time.time()
1618
+ try:
1619
+ fact_extractor = FactExtractor(self.llm)
1620
+ is_code = _is_code_file(file_path) or _is_code_like_file(file_path)
1621
+
1622
+ if is_code:
1623
+ all_facts = fact_extractor.extract_batch_code(chunks)
1624
+ else:
1625
+ all_facts = fact_extractor.extract_batch(chunks)
1626
+
1627
+ for fact in all_facts:
1628
+ fact_id = self._generate_entity_id(
1629
+ 'fact',
1630
+ f"{fact.get('fact_type', 'unknown')}_{fact.get('subject', 'unknown')[:30]}",
1631
+ file_path
1632
+ )
1633
+
1634
+ citation = Citation(
1635
+ file_path=file_path,
1636
+ line_start=fact.get('line_start'),
1637
+ line_end=fact.get('line_end'),
1638
+ source_toolkit=source_toolkit,
1639
+ doc_id=f"{source_toolkit}://{file_path}",
1640
+ content_hash=raw_doc.metadata.get('commit_hash'),
1641
+ )
1642
+
1643
+ facts.append({
1644
+ 'id': fact_id,
1645
+ 'name': fact.get('subject', 'unknown fact'),
1646
+ 'type': 'fact',
1647
+ 'citation': citation,
1648
+ 'properties': {
1649
+ 'fact_type': fact.get('fact_type'),
1650
+ 'subject': fact.get('subject'),
1651
+ 'predicate': fact.get('predicate'),
1652
+ 'object': fact.get('object'),
1653
+ 'confidence': fact.get('confidence', 0.8),
1654
+ },
1655
+ 'source_doc': chunks[0] if chunks else None,
1656
+ 'source': 'llm_fact',
1657
+ })
1658
+ except Exception as e:
1659
+ logger.warning(f"Batched fact extraction failed for {file_path}: {e}")
1660
+
1661
+ return facts, time.time() - start
1662
+
1663
+ # Run entity and fact extraction in PARALLEL (skip for trivial files)
1664
+ llm_start = time.time()
1665
+ if chunks and not skip_llm:
1666
+ with ThreadPoolExecutor(max_workers=2) as executor:
1667
+ entity_future = executor.submit(extract_entities)
1668
+ fact_future = executor.submit(extract_facts)
1669
+
1670
+ chunk_entities, entity_llm_duration = entity_future.result()
1671
+ chunk_facts, fact_llm_duration = fact_future.result()
1672
+
1673
+ llm_total = time.time() - llm_start
1674
+ logger.info(f"⏱️ [TIMING] LLM parallel: {llm_total:.3f}s (entity: {entity_llm_duration:.3f}s, fact: {fact_llm_duration:.3f}s, {len(chunks)} chunks) for {Path(file_path).name}")
1675
+ elif skip_llm:
1676
+ logger.info(f"⏱️ [TIMING] LLM skipped ({skip_reason}) for {Path(file_path).name}")
1677
+
1678
+ # ========== FILE-LEVEL DEDUPLICATION ==========
1679
+ # Deduplicate entities by (type, name)
1680
+ seen_entities = {}
1681
+ for entity in chunk_entities:
1682
+ key = (entity['type'], entity['name'].lower())
1683
+ if key not in seen_entities:
1684
+ seen_entities[key] = entity
1685
+ else:
1686
+ # Merge properties, keep first citation
1687
+ existing = seen_entities[key]
1688
+ for prop_key, prop_value in entity.get('properties', {}).items():
1689
+ if prop_key not in existing.get('properties', {}):
1690
+ existing.setdefault('properties', {})[prop_key] = prop_value
1691
+
1692
+ # Deduplicate facts by (fact_type, subject)
1693
+ seen_facts = {}
1694
+ for fact in chunk_facts:
1695
+ key = (fact['properties'].get('fact_type'), fact['name'].lower())
1696
+ if key not in seen_facts:
1697
+ seen_facts[key] = fact
1698
+ else:
1699
+ # Keep higher confidence
1700
+ existing = seen_facts[key]
1701
+ if fact['properties'].get('confidence', 0) > existing['properties'].get('confidence', 0):
1702
+ seen_facts[key] = fact
1703
+
1704
+ # Combine: parser entities + deduplicated chunk entities + deduplicated facts
1705
+ all_entities.extend(seen_entities.values())
1706
+ all_entities.extend(seen_facts.values())
1707
+
1708
+ # ========== CREATE FILE-LEVEL NODE ==========
1709
+ # File node acts as a container for all entities/facts from this file
1710
+ file_name = Path(file_path).name
1711
+ file_ext = Path(file_path).suffix.lower()
1712
+
1713
+ # Determine file type based on extension
1714
+ file_type = 'file'
1715
+ if file_ext in {'.py', '.js', '.ts', '.jsx', '.tsx', '.java', '.go', '.rs', '.kt', '.swift', '.cs', '.c', '.cpp', '.h'}:
1716
+ file_type = 'source_file'
1717
+ elif file_ext in {'.md', '.rst', '.txt'}:
1718
+ file_type = 'document_file'
1719
+ elif file_ext in {'.yml', '.yaml', '.json', '.toml', '.ini', '.cfg'}:
1720
+ file_type = 'config_file'
1721
+ elif file_ext in {'.html', '.css', '.scss', '.less'}:
1722
+ file_type = 'web_file'
1723
+
1724
+ file_entity_id = self._generate_entity_id('file', file_path, file_path)
1725
+
1726
+ file_citation = Citation(
1727
+ file_path=file_path,
1728
+ line_start=1,
1729
+ line_end=raw_doc.page_content.count('\n') + 1,
1730
+ source_toolkit=source_toolkit,
1731
+ doc_id=f"{source_toolkit}://{file_path}",
1732
+ content_hash=content_hash,
1733
+ )
1734
+
1735
+ # Count entities by category for file properties
1736
+ code_entity_count = sum(1 for e in all_entities if e['type'] in {'class', 'function', 'method', 'module', 'interface'})
1737
+ fact_count = sum(1 for e in all_entities if e['type'] == 'fact')
1738
+ other_entity_count = len(all_entities) - code_entity_count - fact_count
1739
+
1740
+ file_entity = {
1741
+ 'id': file_entity_id,
1742
+ 'name': file_name,
1743
+ 'type': file_type,
1744
+ 'citation': file_citation,
1745
+ 'properties': {
1746
+ 'full_path': file_path,
1747
+ 'extension': file_ext,
1748
+ 'line_count': raw_doc.page_content.count('\n') + 1,
1749
+ 'size_bytes': len(raw_doc.page_content.encode('utf-8')),
1750
+ 'content_hash': content_hash,
1751
+ 'entity_count': len(all_entities),
1752
+ 'code_entity_count': code_entity_count,
1753
+ 'fact_count': fact_count,
1754
+ 'other_entity_count': other_entity_count,
1755
+ },
1756
+ 'source': 'parser',
1757
+ }
1758
+
1759
+ # Add file entity to the beginning (it's the container)
1760
+ all_entities.insert(0, file_entity)
1761
+
1762
+ # Create DEFINED_IN relationships from all entities to file
1763
+ for entity in all_entities[1:]: # Skip the file entity itself
1764
+ parser_relationships.append({
1765
+ 'source_id': entity['id'],
1766
+ 'target_id': file_entity_id,
1767
+ 'relation_type': 'defined_in',
1768
+ 'properties': {
1769
+ 'source': 'parser',
1770
+ 'source_toolkit': source_toolkit,
1771
+ },
1772
+ })
1773
+
1774
+ file_total_time = (time.time() - parser_start)
1775
+ logger.info(f"⏱️ [TIMING] File total: {file_total_time:.3f}s (parser: {parser_duration:.3f}s, llm_max: {max(entity_llm_duration, fact_llm_duration):.3f}s) for {Path(file_path).name}")
1776
+ logger.debug(f"File {file_path}: {len(all_entities)} total entities ({len(seen_entities)} from LLM, {len(seen_facts)} facts)")
1777
+
1778
+ return all_entities, parser_relationships, content_hash
1779
+
1780
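A key step above is file-level deduplication: chunk results are merged by (type, lowercased name) for entities and by (fact_type, subject) for facts, keeping the first citation and the highest-confidence fact. A compact sketch of the fact side of that merge, with illustrative sample dicts:

facts = [
    {"name": "retry policy", "properties": {"fact_type": "behavior", "confidence": 0.6}},
    {"name": "Retry Policy", "properties": {"fact_type": "behavior", "confidence": 0.9}},
    {"name": "uses sha256",  "properties": {"fact_type": "dependency", "confidence": 0.8}},
]

seen = {}
for fact in facts:
    # Same fact_type + case-insensitive subject -> same logical fact.
    key = (fact["properties"].get("fact_type"), fact["name"].lower())
    if (key not in seen or
            fact["properties"].get("confidence", 0) > seen[key]["properties"].get("confidence", 0)):
        seen[key] = fact

print([f["properties"]["confidence"] for f in seen.values()])  # [0.9, 0.8]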
+ def _extract_relations_from_file(
1781
+ self,
1782
+ file_path: str,
1783
+ file_entities: List[Dict[str, Any]],
1784
+ all_entity_dicts: List[Dict[str, Any]],
1785
+ schema: Optional[Dict] = None,
1786
+ max_retries: int = 3
1787
+ ) -> Tuple[List[Dict[str, Any]], Optional[str]]:
1788
+ """
1789
+ Extract relations from entities in a single file with retry logic.
1790
+
1791
+ Args:
1792
+ file_path: Path to the file being processed
1793
+ file_entities: Entities from this file
1794
+ all_entity_dicts: All graph entities for ID resolution
1795
+ schema: Optional schema to guide extraction
1796
+ max_retries: Maximum number of retry attempts (default: 3)
1797
+
1798
+ Returns:
1799
+ Tuple of (relations_list, error_message)
1800
+ error_message is None on success
1801
+ """
1802
+ # Use first entity's doc for context
1803
+ doc = file_entities[0].get('source_doc')
1804
+ if not doc or not doc.page_content:
1805
+ # Try to reload content from file if source_doc is missing
1806
+ # This happens when resuming from checkpoint (source_doc isn't serialized)
1807
+ try:
1808
+ if file_path and Path(file_path).exists():
1809
+ with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
1810
+ content = f.read()
1811
+ doc = Document(page_content=content, metadata={'file_path': file_path})
1812
+ logger.debug(f"Reloaded content from file: {file_path} ({len(content)} chars)")
1813
+ else:
1814
+ # Can't reload - return empty (no relations can be extracted without content)
1815
+ logger.debug(f"Cannot reload content for relation extraction: {file_path}")
1816
+ return [], None # Return empty but not an error
1817
+ except Exception as e:
1818
+ logger.warning(f"Failed to reload content from {file_path}: {e}")
1819
+ return [], None # Return empty but not an error
1820
+
1821
+ # Convert to format expected by relation extractor
1822
+ entity_dicts = [
1823
+ {'id': e['id'], 'name': e['name'], 'type': e['type'], **e.get('properties', {})}
1824
+ for e in file_entities
1825
+ ]
1826
+
1827
+ # Retry logic with exponential backoff
1828
+ last_error = None
1829
+ for attempt in range(max_retries):
1830
+ try:
1831
+ file_relations = self._relation_extractor.extract(
1832
+ doc, entity_dicts, schema=schema, confidence_threshold=0.5,
1833
+ all_entities=all_entity_dicts
1834
+ )
1835
+
1836
+ # Add source tracking to each relation
1837
+ source_toolkit = file_entities[0].get('source_toolkit') if file_entities else None
1838
+ for rel in file_relations:
1839
+ if source_toolkit:
1840
+ if 'properties' not in rel:
1841
+ rel['properties'] = {}
1842
+ rel['properties']['source_toolkit'] = source_toolkit
1843
+ rel['properties']['discovered_in_file'] = file_path
1844
+
1845
+ return file_relations, None
1846
+
1847
+ except Exception as e:
1848
+ last_error = str(e)
1849
+ logger.warning(
1850
+ f"Relation extraction failed for '{file_path}' "
1851
+ f"(attempt {attempt + 1}/{max_retries}): {e}"
1852
+ )
1853
+
1854
+ # Exponential backoff: 1s, 2s, 4s
1855
+ if attempt < max_retries - 1:
1856
+ wait_time = 2 ** attempt
1857
+ time.sleep(wait_time)
1858
+
1859
+ # All retries failed
1860
+ logger.error(f"Failed to extract relations from '{file_path}' after {max_retries} attempts: {last_error}")
1861
+ return [], f"Failed after {max_retries} attempts: {last_error}"
1862
+
1863
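The retry loop above waits 1s, 2s, 4s between attempts (`2 ** attempt`) and reports the last error only after every retry has failed. The same pattern in isolation, with a deliberately flaky stand-in for the extractor call:

import time

def with_retries(fn, max_retries: int = 3):
    last_error = None
    for attempt in range(max_retries):
        try:
            return fn(), None
        except Exception as exc:
            last_error = str(exc)
            if attempt < max_retries - 1:
                time.sleep(2 ** attempt)  # exponential backoff: 1s, 2s, 4s
    return None, f"Failed after {max_retries} attempts: {last_error}"

calls = {"n": 0}
def flaky():
    calls["n"] += 1
    if calls["n"] < 3:
        raise RuntimeError("transient failure")
    return "ok"

print(with_retries(flaky))  # ('ok', None) on the third attempt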
+ def _extract_relations(
1864
+ self,
1865
+ entities: List[Dict[str, Any]],
1866
+ schema: Optional[Dict] = None,
1867
+ all_graph_entities: Optional[List[Dict[str, Any]]] = None
1868
+ ) -> List[Dict[str, Any]]:
1869
+ """
1870
+ Extract relations between entities in parallel with robust error handling.
1871
+
1872
+ Uses ThreadPoolExecutor to process multiple files concurrently, with automatic
1873
+ retry logic for failed extractions. Progress is reported as tasks complete.
1874
+
1875
+ Args:
1876
+ entities: New entities to extract relations from
1877
+ schema: Optional schema to guide extraction
1878
+ all_graph_entities: All entities in graph (for ID resolution across sources)
1879
+
1880
+ Returns:
1881
+ List of extracted relations
1882
+ """
1883
+ if not self._relation_extractor or len(entities) < 2:
1884
+ return []
1885
+
1886
+ extract_rel_start = time.time()
1887
+ relations = []
1888
+ failed_files = []
1889
+
1890
+ # Build ID lookup from ALL graph entities (enables cross-source relations)
1891
+ all_entities_for_lookup = all_graph_entities or entities
1892
+
1893
+ # Group entities by file for relation extraction
1894
+ by_file: Dict[str, List] = {}
1895
+ for ent in entities:
1896
+ citation = ent.get('citation')
1897
+ if isinstance(citation, dict):
1898
+ fpath = citation.get('file_path', '')
1899
+ elif hasattr(citation, 'file_path'):
1900
+ fpath = citation.file_path
1901
+ else:
1902
+ fpath = ent.get('file_path', '')
1903
+
1904
+ if not fpath:
1905
+ continue
1906
+
1907
+ if fpath not in by_file:
1908
+ by_file[fpath] = []
1909
+ by_file[fpath].append(ent)
1910
+
1911
+ # Filter files with enough entities for relation extraction
1912
+ files_to_process = [(fp, ents) for fp, ents in by_file.items() if len(ents) >= 2]
1913
+ total_files = len(files_to_process)
1914
+
1915
+ if total_files == 0:
1916
+ return []
1917
+
1918
+ # Prepare all_entity_dicts for cross-source ID resolution
1919
+ # Use all_graph_entities if provided, otherwise use the entities we're processing
1920
+ all_entity_dicts = [
1921
+ {'id': e.get('id'), 'name': e.get('name'), 'type': e.get('type')}
1922
+ for e in all_entities_for_lookup
1923
+ if e.get('id')
1924
+ ]
1925
+
1926
+ # Use ThreadPoolExecutor for parallel relation extraction
1927
+ completed_files = 0
1928
+
1929
+ with ThreadPoolExecutor(max_workers=self.max_parallel_extractions) as executor:
1930
+ # Submit all extraction tasks
1931
+ future_to_file = {
1932
+ executor.submit(
1933
+ self._extract_relations_from_file,
1934
+ file_path,
1935
+ file_entities,
1936
+ all_entity_dicts,
1937
+ schema
1938
+ ): (file_path, file_entities)
1939
+ for file_path, file_entities in files_to_process
1940
+ }
1941
+
1942
+ # Process completed tasks as they finish
1943
+ for future in as_completed(future_to_file):
1944
+ file_path, file_entities = future_to_file[future]
1945
+ completed_files += 1
1946
+
1947
+ try:
1948
+ file_relations, error = future.result()
1949
+
1950
+ if error:
1951
+ # Log failed file but continue processing
1952
+ failed_files.append({
1953
+ 'file_path': file_path,
1954
+ 'error': error,
1955
+ 'entity_count': len(file_entities)
1956
+ })
1957
+ else:
1958
+ relations.extend(file_relations)
1959
+
1960
+ except Exception as e:
1961
+ # Unexpected error (shouldn't happen since we catch in _extract_relations_from_file)
1962
+ logger.error(f"Unexpected error processing '{file_path}': {e}")
1963
+ failed_files.append({
1964
+ 'file_path': file_path,
1965
+ 'error': f"Unexpected error: {str(e)}",
1966
+ 'entity_count': len(file_entities)
1967
+ })
1968
+
1969
+ # Log progress periodically
1970
+ if completed_files % 10 == 0 or completed_files == total_files or completed_files == 1:
1971
+ pct = (completed_files / total_files) * 100
1972
+ status_msg = f"🔗 Relations: {completed_files}/{total_files} files ({pct:.0f}%) | Found {len(relations)} relations"
1973
+ if failed_files:
1974
+ status_msg += f" | {len(failed_files)} files failed"
1975
+ self._log_progress(status_msg, "relations")
1976
+
1977
+ # Log summary of failures if any
1978
+ if failed_files:
1979
+ self._log_progress(
1980
+ f"⚠️ Relation extraction failed for {len(failed_files)}/{total_files} files. "
1981
+ f"Successfully extracted {len(relations)} relations from {total_files - len(failed_files)} files.",
1982
+ "relations"
1983
+ )
1984
+ # Log first few failures for debugging
1985
+ for failed in failed_files[:3]:
1986
+ logger.warning(
1987
+ f"Failed to extract relations from '{failed['file_path']}' "
1988
+ f"({failed['entity_count']} entities): {failed['error']}"
1989
+ )
1990
+
1991
+ file_rel_duration = time.time() - extract_rel_start
1992
+ logger.info(f"⏱️ [TIMING] Per-file relation extraction: {file_rel_duration:.3f}s for {total_files} files")
1993
+
1994
+ # Phase 2: Extract cross-file relations (imports, dependencies between modules)
1995
+ cross_file_start = time.time()
1996
+ cross_file_relations = self._extract_cross_file_relations(entities, all_entity_dicts, by_file)
1997
+ if cross_file_relations:
1998
+ relations.extend(cross_file_relations)
1999
+ self._log_progress(
2000
+ f"🔗 Cross-file: Found {len(cross_file_relations)} inter-module relations",
2001
+ "relations"
2002
+ )
2003
+ cross_file_duration = time.time() - cross_file_start
2004
+ logger.info(f"⏱️ [TIMING] Cross-file relation extraction: {cross_file_duration:.3f}s")
2005
+
2006
+ total_rel_duration = time.time() - extract_rel_start
2007
+ logger.info(f"⏱️ [TIMING] _extract_relations total: {total_rel_duration:.3f}s ({len(relations)} relations)")
2008
+
2009
+ return relations
2010
+
2011
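Relation extraction above is scoped per file: entities are grouped by their citation's file_path, and files with fewer than two entities are skipped because no intra-file relation is possible there. A small sketch of that grouping; citations here are plain dicts for brevity, whereas in the pipeline they may also be Citation objects:

from collections import defaultdict

entities = [
    {"id": "e1", "citation": {"file_path": "a.py"}},
    {"id": "e2", "citation": {"file_path": "a.py"}},
    {"id": "e3", "citation": {"file_path": "b.py"}},
]

by_file = defaultdict(list)
for ent in entities:
    citation = ent.get("citation") or {}
    fpath = (citation.get("file_path", "") if isinstance(citation, dict)
             else getattr(citation, "file_path", ""))
    if fpath:
        by_file[fpath].append(ent)

# Only files with at least two entities are worth a relation-extraction pass.
files_to_process = [(fp, ents) for fp, ents in by_file.items() if len(ents) >= 2]
print([fp for fp, _ in files_to_process])  # ['a.py'] -- b.py has a single entity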
+ def _extract_cross_file_relations(
2012
+ self,
2013
+ entities: List[Dict[str, Any]],
2014
+ all_entity_dicts: List[Dict[str, Any]],
2015
+ by_file: Dict[str, List[Dict[str, Any]]]
2016
+ ) -> List[Dict[str, Any]]:
2017
+ """
2018
+ Extract cross-file relationships by analyzing imports, references, and dependencies.
2019
+
2020
+ Uses the patterns module for extensible, language-specific pattern matching.
2021
+ Patterns cover:
2022
+ - Import statements (JS/TS, Python, Java, C#, Go, Ruby, Rust, PHP, etc.)
2023
+ - Documentation links (Markdown, Wiki, HTML, RST)
2024
+ - Text citations and references ("see X", "@see X", etc.)
2025
+ - Inheritance patterns
2026
+ - Entity name mentions in content
2027
+
2028
+ Args:
2029
+ entities: All entities to analyze
2030
+ all_entity_dicts: Entity dictionaries for lookup
2031
+ by_file: Entities grouped by file path
2032
+
2033
+ Returns:
2034
+ List of cross-file relations
2035
+ """
2036
+ from .patterns import get_patterns_for_file, PatternCategory
2037
+ import re
2038
+
2039
+ cross_relations = []
2040
+
2041
+ # Build lookup tables
2042
+ entity_by_name: Dict[str, Dict] = {}
2043
+ entity_by_id: Dict[str, Dict] = {}
2044
+ file_to_entities: Dict[str, List[Dict]] = {}
2045
+ module_to_file: Dict[str, str] = {}
2046
+
2047
+ # For entity mention matching
2048
+ significant_entities: List[Tuple[str, Dict]] = []
2049
+
2050
+ for ent in entities:
2051
+ name = ent.get('name', '')
2052
+ ent_id = ent.get('id', '')
2053
+
2054
+ if name:
2055
+ name_lower = name.lower()
2056
+ entity_by_name[name_lower] = ent
2057
+ entity_by_name[name] = ent
2058
+
2059
+ # Track significant entities for mention detection
2060
+ ent_type = ent.get('type', '').lower()
2061
+ if ent_type in ('class', 'component', 'service', 'module', 'api', 'endpoint',
2062
+ 'feature', 'epic', 'requirement', 'interface', 'schema', 'table'):
2063
+ if len(name) >= 3: # Min 3 chars to reduce noise
2064
+ significant_entities.append((name_lower, ent))
2065
+
2066
+ if ent_id:
2067
+ entity_by_id[ent_id] = ent
2068
+
2069
+ # Build file -> entities and module -> file mappings
2070
+ citation = ent.get('citation')
2071
+ if citation:
2072
+ file_path = citation.get('file_path', '') if isinstance(citation, dict) else getattr(citation, 'file_path', '')
2073
+ if file_path:
2074
+ if file_path not in file_to_entities:
2075
+ file_to_entities[file_path] = []
2076
+ file_to_entities[file_path].append(ent)
2077
+
2078
+ p = Path(file_path)
2079
+ stem = p.stem
2080
+ module_to_file[stem.lower()] = file_path
2081
+ if stem.lower() == 'index':
2082
+ module_to_file[p.parent.name.lower()] = file_path
2083
+
2084
+ # Sort significant entities by length for greedy matching
2085
+ significant_entities.sort(key=lambda x: len(x[0]), reverse=True)
2086
+
2087
+ # ========================================================================
2088
+ # PHASE 1: Pattern-based extraction from file content
2089
+ # ========================================================================
2090
+
2091
+ for file_path, file_ents in by_file.items():
2092
+ if not file_ents:
2093
+ continue
2094
+
2095
+ # Read file content
2096
+ file_content = ""
2097
+ try:
2098
+ for ent in file_ents:
2099
+ doc = ent.get('source_doc')
2100
+ if doc and hasattr(doc, 'page_content') and doc.page_content:
2101
+ file_content = doc.page_content
2102
+ break
2103
+
2104
+ if not file_content and Path(file_path).exists():
2105
+ with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
2106
+ file_content = f.read()
2107
+ except Exception:
2108
+ pass
2109
+
2110
+ if not file_content:
2111
+ continue
2112
+
2113
+ # Get source entity for this file
2114
+ source_ent = next(
2115
+ (e for e in file_ents if e.get('type', '').lower() in
2116
+ ('module', 'component', 'class', 'file', 'page', 'document')),
2117
+ file_ents[0] if file_ents else None
2118
+ )
2119
+ if not source_ent:
2120
+ continue
2121
+
2122
+ source_id = source_ent.get('id')
2123
+
2124
+ # Get patterns for this file type
2125
+ patterns = get_patterns_for_file(file_path)
2126
+
2127
+ # Apply each pattern
2128
+ for pattern in patterns:
2129
+ matches = pattern.match(file_content)
2130
+
2131
+ for match_value in matches:
2132
+ if not match_value or len(match_value) < 2:
2133
+ continue
2134
+
2135
+ # Try to resolve match to an entity
2136
+ target_ent = None
2137
+ match_lower = match_value.lower()
2138
+
2139
+ # For imports, try module-to-file mapping
2140
+ if pattern.category == PatternCategory.IMPORT:
2141
+ target_file = module_to_file.get(match_lower)
2142
+ if target_file and target_file != file_path:
2143
+ target_ents = file_to_entities.get(target_file, [])
2144
+ target_ent = next(
2145
+ (e for e in target_ents if e.get('type', '').lower() in
2146
+ ('module', 'component', 'class', 'file')),
2147
+ target_ents[0] if target_ents else None
2148
+ )
2149
+
2150
+ # For links/citations, try entity name lookup
2151
+ if pattern.category in (PatternCategory.LINK, PatternCategory.CITATION,
2152
+ PatternCategory.INHERITANCE, PatternCategory.TYPE_REF):
2153
+ # Skip external URLs
2154
+ if match_value.startswith(('http://', 'https://', '#')):
2155
+ continue
2156
+
2157
+ # Try direct name match
2158
+ target_ent = entity_by_name.get(match_lower) or entity_by_name.get(match_value)
2159
+
2160
+ # Try as file path
2161
+ if not target_ent:
2162
+ target_file = module_to_file.get(Path(match_value).stem.lower())
2163
+ if target_file:
2164
+ target_ents = file_to_entities.get(target_file, [])
2165
+ target_ent = target_ents[0] if target_ents else None
2166
+
2167
+ if target_ent and source_id != target_ent.get('id'):
2168
+ target_citation = target_ent.get('citation')
2169
+ target_file = ''
2170
+ if target_citation:
2171
+ target_file = (target_citation.get('file_path', '')
2172
+ if isinstance(target_citation, dict)
2173
+ else getattr(target_citation, 'file_path', ''))
2174
+
2175
+ if target_file != file_path:
2176
+ cross_relations.append({
2177
+ 'source_id': source_id,
2178
+ 'target_id': target_ent.get('id'),
2179
+ 'type': pattern.relation_type.value,
2180
+ 'properties': {
2181
+ 'source_file': file_path,
2182
+ 'target_file': target_file,
2183
+ 'discovered_by': f'pattern:{pattern.name}',
2184
+ 'matched_value': match_value
2185
+ },
2186
+ 'confidence': pattern.confidence
2187
+ })
2188
+
2189
+ # --- Entity mention detection ---
2190
+ content_lower = file_content.lower()
2191
+ for name_lower, target_ent in significant_entities:
2192
+ if target_ent.get('id') == source_id:
2193
+ continue
2194
+
2195
+ if name_lower in content_lower:
2196
+ # Verify word boundary
2197
+ if re.search(r'\b' + re.escape(name_lower) + r'\b', content_lower):
2198
+ target_citation = target_ent.get('citation')
2199
+ target_file = ''
2200
+ if target_citation:
2201
+ target_file = (target_citation.get('file_path', '')
2202
+ if isinstance(target_citation, dict)
2203
+ else getattr(target_citation, 'file_path', ''))
2204
+
2205
+ if target_file and target_file != file_path:
2206
+ cross_relations.append({
2207
+ 'source_id': source_id,
2208
+ 'target_id': target_ent.get('id'),
2209
+ 'type': 'MENTIONS',
2210
+ 'properties': {
2211
+ 'source_file': file_path,
2212
+ 'target_file': target_file,
2213
+ 'discovered_by': 'content_mention',
2214
+ 'mentioned_name': target_ent.get('name', '')
2215
+ },
2216
+ 'confidence': 0.7
2217
+ })
2218
+
2219
+ # ========================================================================
2220
+ # PHASE 1.5: AST-based analysis (when available)
2221
+ # ========================================================================
2222
+ # Uses deepwiki parsers for more accurate code analysis
2223
+
2224
+ try:
2225
+ from .patterns import is_ast_available, extract_ast_cross_file_relations
2226
+
2227
+ if is_ast_available():
2228
+ # Collect file contents for AST analysis
2229
+ ast_file_contents: Dict[str, str] = {}
2230
+ ast_file_paths = []
2231
+
2232
+ for file_path, file_ents in by_file.items():
2233
+ # Only process code files that benefit from AST
2234
+ ext = Path(file_path).suffix.lower()
2235
+ if ext in ('.py', '.js', '.jsx', '.ts', '.tsx', '.java'):
2236
+ # Get content from entity or file
2237
+ file_content = ""
2238
+ try:
2239
+ for ent in file_ents:
2240
+ doc = ent.get('source_doc')
2241
+ if doc and hasattr(doc, 'page_content') and doc.page_content:
2242
+ file_content = doc.page_content
2243
+ break
2244
+
2245
+ if not file_content and Path(file_path).exists():
2246
+ with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
2247
+ file_content = f.read()
2248
+ except Exception:
2249
+ pass
2250
+
2251
+ if file_content:
2252
+ ast_file_contents[file_path] = file_content
2253
+ ast_file_paths.append(file_path)
2254
+
2255
+ if ast_file_paths:
2256
+ # Extract AST-based relations
2257
+ ast_relations = extract_ast_cross_file_relations(
2258
+ ast_file_paths,
2259
+ ast_file_contents,
2260
+ entities
2261
+ )
2262
+
2263
+ # Convert to standard format and add
2264
+ for ast_rel in ast_relations:
2265
+ source_name = ast_rel.get('source_entity', '')
2266
+ target_name = ast_rel.get('target_entity', '')
2267
+
2268
+ # Resolve to entity IDs
2269
+ source_ent = entity_by_name.get(source_name.lower()) or entity_by_name.get(source_name)
2270
+ target_ent = entity_by_name.get(target_name.lower()) or entity_by_name.get(target_name)
2271
+
2272
+ if source_ent and target_ent and source_ent.get('id') != target_ent.get('id'):
2273
+ cross_relations.append({
2274
+ 'source_id': source_ent.get('id'),
2275
+ 'target_id': target_ent.get('id'),
2276
+ 'type': ast_rel.get('relationship_type', 'REFERENCES').upper(),
2277
+ 'properties': {
2278
+ 'source_file': ast_rel.get('metadata', {}).get('source_file', ''),
2279
+ 'target_file': ast_rel.get('metadata', {}).get('target_file', ''),
2280
+ 'discovered_by': 'ast_analysis',
2281
+ 'line': ast_rel.get('metadata', {}).get('line', 0)
2282
+ },
2283
+ 'confidence': ast_rel.get('relationship_strength', 0.95)
2284
+ })
2285
+
2286
+ if ast_relations:
2287
+ self._log_progress(
2288
+ f"🌳 AST analysis found {len(ast_relations)} relations",
2289
+ "relations"
2290
+ )
2291
+ except ImportError:
2292
+ pass # AST adapter not available
2293
+ except Exception as e:
2294
+ import traceback
2295
+ self._log_progress(f"AST analysis failed: {e}", "debug")
2296
+
2297
+ # ========================================================================
2298
+ # PHASE 2: Entity property analysis
2299
+ # ========================================================================
2300
+
2301
+ def to_list(val):
2302
+ if isinstance(val, str):
2303
+ return [val] if val else []
2304
+ if isinstance(val, list):
2305
+ return val
2306
+ return []
2307
+
2308
+ for ent in entities:
2309
+ props = ent.get('properties', {})
2310
+ ent_id = ent.get('id', '')
2311
+
2312
+ citation = ent.get('citation')
2313
+ source_file = ''
2314
+ if citation:
2315
+ source_file = (citation.get('file_path', '')
2316
+ if isinstance(citation, dict)
2317
+ else getattr(citation, 'file_path', ''))
2318
+
2319
+ # Property-based references
2320
+ all_refs = [
2321
+ (to_list(props.get('imports', [])), 'IMPORTS'),
2322
+ (to_list(props.get('dependencies', [])), 'DEPENDS_ON'),
2323
+ (to_list(props.get('extends', props.get('parent_class', ''))), 'EXTENDS'),
2324
+ (to_list(props.get('implements', [])), 'IMPLEMENTS'),
2325
+ (to_list(props.get('uses', props.get('calls', []))), 'USES'),
2326
+ (to_list(props.get('references', props.get('links', []))), 'REFERENCES'),
2327
+ ]
2328
+
2329
+ for ref_list, rel_type in all_refs:
2330
+ for ref in ref_list:
2331
+ if not ref:
2332
+ continue
2333
+
2334
+ ref_lower = ref.lower() if isinstance(ref, str) else str(ref).lower()
2335
+ target_ent = entity_by_name.get(ref_lower) or entity_by_name.get(ref)
2336
+
2337
+ if not target_ent and ('/' in ref_lower or '.' in ref_lower):
2338
+ clean_ref = ref_lower.split('/')[-1].split('.')[-1]
2339
+ target_ent = entity_by_name.get(clean_ref)
2340
+
2341
+ if target_ent and target_ent.get('id') != ent_id:
2342
+ target_citation = target_ent.get('citation')
2343
+ target_file = ''
2344
+ if target_citation:
2345
+ target_file = (target_citation.get('file_path', '')
2346
+ if isinstance(target_citation, dict)
2347
+ else getattr(target_citation, 'file_path', ''))
2348
+
2349
+ if target_file and source_file and target_file != source_file:
2350
+ cross_relations.append({
2351
+ 'source_id': ent_id,
2352
+ 'target_id': target_ent.get('id'),
2353
+ 'type': rel_type,
2354
+ 'properties': {
2355
+ 'source_file': source_file,
2356
+ 'target_file': target_file,
2357
+ 'discovered_by': 'property_analysis',
2358
+ 'reference_name': ref
2359
+ },
2360
+ 'confidence': 0.9
2361
+ })
2362
+
2363
+ # Deduplicate
2364
+ seen = set()
2365
+ unique_relations = []
2366
+ for rel in cross_relations:
2367
+ key = (rel['source_id'], rel['target_id'], rel['type'])
2368
+ if key not in seen:
2369
+ seen.add(key)
2370
+ unique_relations.append(rel)
2371
+
2372
+ return unique_relations
2373
+
2374
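Two details from the cross-file pass above are worth isolating: mention detection only counts a hit when the entity name matches on a word boundary, and the final relation list is deduplicated on (source_id, target_id, type). Both in a standalone sketch with made-up sample values:

import re

def is_mentioned(name: str, content: str) -> bool:
    # Word-boundary match so 'Cart' does not match inside 'CartesianGrid'.
    return re.search(r"\b" + re.escape(name.lower()) + r"\b", content.lower()) is not None

print(is_mentioned("Cart", "The CartesianGrid component"))    # False
print(is_mentioned("Cart", "Adds an item to the Cart page"))  # True

relations = [
    {"source_id": "a", "target_id": "b", "type": "MENTIONS"},
    {"source_id": "a", "target_id": "b", "type": "MENTIONS"},
    {"source_id": "a", "target_id": "b", "type": "IMPORTS"},
]
seen, unique = set(), []
for rel in relations:
    key = (rel["source_id"], rel["target_id"], rel["type"])
    if key not in seen:
        seen.add(key)
        unique.append(rel)
print(len(unique))  # 2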
+ def run(
2375
+ self,
2376
+ source: str,
2377
+ branch: Optional[str] = None,
2378
+ whitelist: Optional[List[str]] = None,
2379
+ blacklist: Optional[List[str]] = None,
2380
+ extract_relations: bool = True,
2381
+ resume: bool = True,
2382
+ max_documents: Optional[int] = None,
2383
+ **loader_kwargs
2384
+ ) -> IngestionResult:
2385
+ """
2386
+ Run the full ingestion pipeline with checkpoint support for resumability.
2387
+
2388
+ Args:
2389
+ source: Name of source toolkit (must be in source_toolkits)
2390
+ branch: Branch to analyze (optional, uses default if not specified)
2391
+ whitelist: File patterns to include (e.g., ['*.py', '*.js'])
2392
+ blacklist: File patterns to exclude (e.g., ['*test*', '*vendor*'])
2393
+ extract_relations: Whether to extract relations between entities
2394
+ resume: If True, try to resume from last checkpoint
2395
+ max_documents: Maximum number of documents to process (for testing)
2396
+ **loader_kwargs: Additional arguments for the toolkit's loader
2397
+
2398
+ Returns:
2399
+ IngestionResult with statistics and any errors
2400
+ """
2401
+ import time
2402
+ start_time = time.time()
2403
+ result = IngestionResult(source=source)
2404
+
2405
+ # Validate source toolkit
2406
+ if source not in self.source_toolkits:
2407
+ available = list(self.source_toolkits.keys()) if self.source_toolkits else ['none']
2408
+ result.success = False
2409
+ result.errors.append(f"Toolkit '{source}' not found. Available: {', '.join(available)}")
2410
+ return result
2411
+
2412
+ toolkit = self.source_toolkits[source]
2413
+
2414
+ # Check for loader method
2415
+ if not hasattr(toolkit, 'loader'):
2416
+ result.success = False
2417
+ result.errors.append(f"Toolkit '{source}' does not have a loader method")
2418
+ return result
2419
+
2420
+ # Ensure extractors are initialized
2421
+ if not self._init_extractors():
2422
+ result.success = False
2423
+ result.errors.append("LLM not configured - cannot extract entities")
2424
+ return result
2425
+
2426
+ # Try to load existing checkpoint if resume is enabled
2427
+ checkpoint = None
2428
+ is_incremental_update = False
2429
+ if resume:
2430
+ checkpoint = self._load_checkpoint(source)
2431
+ if checkpoint:
2432
+ if checkpoint.completed:
2433
+ # Completed checkpoint - use for incremental update
2434
+ is_incremental_update = True
2435
+ num_tracked = len(checkpoint.file_hashes)
2436
+ self._log_progress(
2437
+ f"📋 Incremental update: tracking {num_tracked} files for changes",
2438
+ "incremental"
2439
+ )
2440
+ # Reset counters for new run but keep file hashes
2441
+ checkpoint.completed = False
2442
+ checkpoint.phase = "extract"
2443
+ checkpoint.pending_entities = []
2444
+ checkpoint.errors = []
2445
+ else:
2446
+ # Incomplete checkpoint - resume from failure
2447
+ self._log_progress(
2448
+ f"📋 Resuming from checkpoint: {checkpoint.documents_processed} docs already processed",
2449
+ "resume"
2450
+ )
2451
+ result.resumed_from_checkpoint = True
2452
+ # Restore progress from checkpoint
2453
+ result.documents_processed = checkpoint.documents_processed
2454
+ result.entities_added = checkpoint.entities_added
2455
+
2456
+ # Create new checkpoint if no existing one
2457
+ if not checkpoint:
2458
+ checkpoint = IngestionCheckpoint.create(
2459
+ source=source,
2460
+ branch=branch,
2461
+ whitelist=whitelist,
2462
+ blacklist=blacklist,
2463
+ extract_relations=extract_relations,
2464
+ )
2465
+
2466
+ self._current_checkpoint = checkpoint
2467
+
2468
+ self._log_progress(f"🚀 Starting ingestion from {source}", "start")
2469
+
2470
+ # Build loader kwargs
2471
+ loader_args = {**loader_kwargs}
2472
+ if branch:
2473
+ loader_args['branch'] = branch
2474
+ if whitelist:
2475
+ loader_args['whitelist'] = whitelist
2476
+ if blacklist:
2477
+ loader_args['blacklist'] = blacklist
2478
+
2479
+ if loader_args:
2480
+ params_str = ", ".join(f"{k}={v}" for k, v in loader_args.items() if v is not None)
2481
+ self._log_progress(f"📋 Loader params: {params_str}", "config")
2482
+
2483
+ try:
2484
+ # ========== STREAMING APPROACH ==========
2485
+ # Read files once, create raw doc + chunks on the fly
2486
+ # Process in batches to limit memory usage
2487
+
2488
+ self._log_progress(f"📥 Fetching documents from {source}...", "fetch")
2489
+
2490
+ # Note: We don't pre-count files to avoid iterating twice
2491
+ # The toolkit's loader() will log progress as it goes
2492
+
2493
+ # Import chunker for on-the-fly chunking
2494
+ try:
2495
+ from alita_sdk.tools.chunkers.universal_chunker import chunk_single_document
2496
+ from alita_sdk.tools.chunkers.code.codeparser import parse_code_files_for_db
2497
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
2498
+ has_chunker = True
2499
+
2500
+ # Create text splitter for non-code files
2501
+ text_splitter = RecursiveCharacterTextSplitter(
2502
+ chunk_size=1000,
2503
+ chunk_overlap=100,
2504
+ length_function=len,
2505
+ )
2506
+
2507
+ # Code extensions that use tree-sitter
2508
+ CODE_EXTENSIONS = {
2509
+ '.py', '.js', '.jsx', '.mjs', '.cjs', '.ts', '.tsx',
2510
+ '.java', '.kt', '.rs', '.go', '.cpp', '.c', '.cs',
2511
+ '.hs', '.rb', '.scala', '.lua'
2512
+ }
2513
+
2514
+ def chunk_document_direct(doc: Document) -> List[Document]:
2515
+ """Chunk a single document directly without buffering."""
2516
+ file_path = doc.metadata.get('file_path', '')
2517
+ ext = Path(file_path).suffix.lower()
2518
+
2519
+ if ext in CODE_EXTENSIONS:
2520
+ # Use code parser directly
2521
+ try:
2522
+ chunks = list(parse_code_files_for_db([{
2523
+ 'file_name': file_path,
2524
+ 'file_content': doc.page_content,
2525
+ 'commit_hash': doc.metadata.get('commit_hash', ''),
2526
+ }]))
2527
+ # Ensure file_path is preserved
2528
+ for chunk in chunks:
2529
+ if 'file_path' not in chunk.metadata:
2530
+ chunk.metadata['file_path'] = file_path
2531
+ return chunks if chunks else [doc]
2532
+ except Exception as e:
2533
+ logger.debug(f"Code chunking failed for {file_path}: {e}")
2534
+ return [doc]
2535
+ else:
2536
+ # Use text splitter
2537
+ try:
2538
+ chunks = text_splitter.split_documents([doc])
2539
+ for idx, chunk in enumerate(chunks, 1):
2540
+ chunk.metadata['chunk_id'] = idx
2541
+ return chunks if chunks else [doc]
2542
+ except Exception:
2543
+ return [doc]
2544
+
2545
+ except ImportError:
2546
+ has_chunker = False
2547
+ chunk_document_direct = None
2548
+ logger.warning("Chunkers not available, using raw documents")
2549
+
2550
+ # Get schema
+ schema = self._knowledge_graph.get_schema()
+ all_entities = list(checkpoint.pending_entities) if checkpoint.pending_entities else []
+ all_parser_relationships = [] # Collect parser-extracted relationships
+
+ checkpoint.phase = "extract"
+ self._log_progress(
+ f"🔍 Extracting entities (parallel batches of {self.batch_size}, "
+ f"max {self.max_parallel_extractions} concurrent)...",
+ "extract"
+ )
+
+ # ========== STREAMING FILE PROCESSING ==========
+ # Process files one at a time, creating chunks on-the-fly
+ file_batch = []
+ total_batches_processed = 0
+ files_seen = 0
+ streaming_start = time.time()
+ total_chunk_time = 0.0
+
+ # Stream raw documents (read once)
+ loader_args['chunked'] = False
+ for raw_doc in toolkit.loader(**loader_args):
+ file_path = (raw_doc.metadata.get('file_path') or
+ raw_doc.metadata.get('file_name') or
+ raw_doc.metadata.get('source', 'unknown'))
+ files_seen += 1
+
+ # Check document limit (for testing)
+ if max_documents and result.documents_processed >= max_documents:
+ # Process remaining batch if any
+ if file_batch:
+ self._process_file_batch_and_update_graph(
+ file_batch, {}, source, schema, checkpoint, result,
+ all_entities, all_parser_relationships, is_incremental_update
+ )
+ self._log_progress(
+ f"⚠️ Reached document limit ({max_documents}), stopping...",
+ "limit"
+ )
+ break
+
+ # Normalize document
+ normalized = self._normalize_document(raw_doc, source)
+ if not normalized:
+ continue
+
+ # For incremental updates, check if file changed
+ if is_incremental_update:
+ content_hash = hashlib.sha256(normalized.page_content.encode()).hexdigest()
+ if not checkpoint.has_file_changed(file_path, content_hash):
+ result.documents_skipped += 1
+ continue
+ else:
+ # File has changed - remove old entities before reprocessing
+ removed = self._knowledge_graph.remove_entities_by_file(file_path)
+ if removed > 0:
+ result.entities_removed += removed
+ logger.debug(f"Removed {removed} stale entities from {file_path}")
+
+ # Skip if already processed in current run (resuming from checkpoint)
+ if not is_incremental_update and checkpoint.is_file_processed(file_path):
+ result.documents_skipped += 1
+ continue
+
+ # Create chunks on-the-fly from this single document
+ chunk_start = time.time()
+ if has_chunker and chunk_document_direct:
+ # Direct chunking - no buffering overhead
+ chunks = chunk_document_direct(normalized)
+ else:
+ # No chunker - use raw doc as single chunk
+ chunks = [normalized]
+ chunk_time = time.time() - chunk_start
+ total_chunk_time += chunk_time
+ if chunk_time > 0.1: # Log if chunking takes > 100ms
+ logger.info(f"⏱️ [TIMING] Chunking: {chunk_time:.3f}s ({len(chunks)} chunks) for {Path(file_path).name}")
+
+ # Add to current batch: (file_path, chunks, raw_doc)
+ file_batch.append((file_path, chunks, normalized))
+
+ # Process batch when it reaches batch_size
+ if len(file_batch) >= self.batch_size:
+ batch_num = total_batches_processed + 1
+ self._log_progress(
+ f"⚡ Processing batch {batch_num} ({len(file_batch)} files, file #{files_seen})...",
+ "batch"
+ )
+
+ self._process_file_batch_and_update_graph(
+ file_batch, {}, source, schema, checkpoint, result,
+ all_entities, all_parser_relationships, is_incremental_update
+ )
+
+ total_batches_processed += 1
+ file_batch = [] # Reset batch
+
+ # Save checkpoint after each batch
+ checkpoint.documents_processed = result.documents_processed
+ checkpoint.entities_added = result.entities_added
+ self._save_checkpoint(checkpoint)
+ self._auto_save()
+
+ self._log_progress(
+ f"📄 Processed {result.documents_processed} files | "
+ f"📊 {result.entities_added} entities | 💾 Checkpoint saved",
+ "progress"
+ )
+
+ # Process remaining files in final batch
+ if file_batch:
+ batch_num = total_batches_processed + 1
+ self._log_progress(
+ f"⚡ Processing final batch {batch_num} ({len(file_batch)} files)...",
+ "batch"
+ )
+ self._process_file_batch_and_update_graph(
+ file_batch, {}, source, schema, checkpoint, result,
+ all_entities, all_parser_relationships, is_incremental_update
+ )
+
+ streaming_duration = time.time() - streaming_start
+ logger.info(f"⏱️ [TIMING] Streaming phase complete: {streaming_duration:.3f}s total, {total_chunk_time:.3f}s chunking, {total_batches_processed + 1} batches")
+
+ # Report skipped files before relation extraction
+ if result.documents_skipped > 0:
+ self._log_progress(
+ f"⏭️ Skipped {result.documents_skipped} unchanged files",
+ "progress"
+ )
+
+ # Update checkpoint before relation extraction
+ checkpoint.documents_processed = result.documents_processed
+ checkpoint.entities_added = result.entities_added
+ checkpoint.pending_entities = [
+ {'id': e['id'], 'name': e['name'], 'type': e['type'],
+ 'file_path': (e['citation'].file_path if hasattr(e.get('citation'), 'file_path')
+ else e.get('citation', {}).get('file_path', e.get('file_path', ''))),
+ 'properties': e.get('properties', {})}
+ for e in all_entities
+ ]
+ self._save_checkpoint(checkpoint)
+
+ # Extract relations
+ if extract_relations and all_entities:
+ checkpoint.phase = "relations"
+ self._save_checkpoint(checkpoint)
+ relations_phase_start = time.time()
+
+ # Get ALL entities from graph (existing + new) for relation resolution
+ # This enables cross-source relations (e.g., github entities referencing confluence entities)
+ graph_entities = self._knowledge_graph.get_all_entities()
+
+ # ========== PARSER RELATIONSHIPS (no LLM) ==========
+ # Add parser-extracted relationships directly to graph
+ parser_rel_start = time.time()
+ if all_parser_relationships:
+ self._log_progress(
+ f"🔗 Adding {len(all_parser_relationships)} parser-extracted relationships...",
+ "relations"
+ )
+
+ # Build entity lookup for ID resolution
+ entity_by_name = {}
+ for e in graph_entities:
+ name_lower = e.get('name', '').lower()
+ entity_by_name[name_lower] = e.get('id')
+ # Also map full qualified names
+ full_name = e.get('properties', {}).get('full_name', '')
+ if full_name:
+ entity_by_name[full_name.lower()] = e.get('id')
+
+ for rel in all_parser_relationships:
+ # Check for pre-resolved IDs (used for containment edges)
+ source_id = rel.get('_resolved_source_id')
+ target_id = rel.get('_resolved_target_id')
+
+ # Fall back to name-based resolution if not pre-resolved
+ if not source_id or not target_id:
+ source_name = rel.get('source_symbol', '').lower()
+ target_name = rel.get('target_symbol', '').lower()
+
+ source_id = source_id or entity_by_name.get(source_name)
+ target_id = target_id or entity_by_name.get(target_name)
+
+ if source_id and target_id:
+ properties = {
+ 'source_toolkit': rel.get('source_toolkit', source),
+ 'confidence': rel.get('confidence', 1.0),
+ 'source': 'parser',
+ 'discovered_in_file': rel.get('source_file'),
+ }
+ if rel.get('is_cross_file'):
+ properties['is_cross_file'] = True
+
+ success = self._knowledge_graph.add_relation(
+ source_id=source_id,
+ target_id=target_id,
+ relation_type=rel.get('relation_type', 'references'),
+ properties=properties
+ )
+ if success:
+ result.relations_added += 1
+
+ parser_rel_duration = time.time() - parser_rel_start
+ logger.info(f"⏱️ [TIMING] Parser relations: {parser_rel_duration:.3f}s for {len(all_parser_relationships)} relationships")
+
+ # ========== LLM RELATIONSHIPS (semantic) ==========
+ llm_rel_start = time.time()
+ self._log_progress(
+ f"🔗 Extracting semantic relations from {len(all_entities)} new entities "
+ f"(graph has {len(graph_entities)} total)...",
+ "relations"
+ )
+
+ # Pass all graph entities for ID resolution, but only extract from new docs
+ relations = self._extract_relations(all_entities, schema, all_graph_entities=graph_entities)
+
+ for rel in relations:
+ # Merge source information into properties
+ properties = rel.get('properties', {})
+ if 'source_toolkit' not in properties:
+ # Fallback: add current source if not already set
+ properties['source_toolkit'] = source
+ properties['source'] = 'llm' # Mark as LLM-extracted
+
+ success = self._knowledge_graph.add_relation(
+ source_id=rel.get('source_id'),
+ target_id=rel.get('target_id'),
+ relation_type=rel.get('relation_type', 'RELATED_TO'),
+ properties=properties
+ )
+ if success:
+ result.relations_added += 1
+
+ llm_rel_duration = time.time() - llm_rel_start
+ relations_phase_duration = time.time() - relations_phase_start
+ logger.info(f"⏱️ [TIMING] LLM relations: {llm_rel_duration:.3f}s")
+ logger.info(f"⏱️ [TIMING] Relations phase total: {relations_phase_duration:.3f}s")
+
+ # Save final graph
+ self._auto_save()
+
+ # Mark checkpoint as complete - keep it for incremental updates
+ checkpoint.completed = True
+ checkpoint.phase = "complete"
+ checkpoint.relations_added = result.relations_added
+ checkpoint.pending_entities = [] # Clear pending entities to save space
+ self._save_checkpoint(checkpoint)
+ # Note: We keep the checkpoint for incremental updates (file hash tracking)
+
+ result.graph_stats = self._knowledge_graph.get_stats()
+ result.duration_seconds = time.time() - start_time
+
+ # Report any failed documents
+ if result.failed_documents:
+ self._log_progress(
+ f"⚠️ {len(result.failed_documents)} documents failed to process",
+ "warning"
+ )
+
+ # Build completion message
+ completion_msg = (
+ f"✅ Ingestion complete! {result.entities_added} entities, "
+ f"{result.relations_added} relations in {result.duration_seconds:.1f}s"
+ )
+ if result.documents_skipped > 0:
+ completion_msg += f" ({result.documents_skipped} unchanged files skipped)"
+
+ self._log_progress(completion_msg, "complete")
+
+ except Exception as e:
+ logger.exception(f"Ingestion failed: {e}")
+ result.success = False
+ result.errors.append(str(e))
+ result.duration_seconds = time.time() - start_time
+
+ # Save checkpoint on failure for resume
+ checkpoint.errors.append(str(e))
+ checkpoint.documents_processed = result.documents_processed
+ checkpoint.entities_added = result.entities_added
+ self._save_checkpoint(checkpoint)
+ self._auto_save() # Save graph progress
+
+ self._log_progress(
+ f"❌ Ingestion failed. Checkpoint saved for resume. "
+ f"Processed {result.documents_processed} docs before failure.",
+ "error"
+ )
+
+ return result
+
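# [Editor's note - illustrative sketch, not part of the package diff]
# On failure, run() saves its checkpoint and the partially built graph, so a
# follow-up call with the same arguments can skip files the checkpoint already
# marks as processed. A minimal retry, assuming `pipeline` was built as in
# ingest_repository() below and that run()'s keyword arguments match that call:
#
#   result = pipeline.run(source="github", branch="main", whitelist=["*.py"])
#   if not result.success:
#       # Checkpoint and graph were saved in the except-branch above;
#       # re-running resumes instead of re-extracting every file.
#       result = pipeline.run(source="github", branch="main", whitelist=["*.py"])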
+ def run_from_generator(
+ self,
+ documents: Generator[Document, None, None],
+ source: str = "custom",
+ extract_relations: bool = True
+ ) -> IngestionResult:
+ """
+ Run ingestion from a pre-built document generator.
+
+ Use this when you have your own document source that's not
+ a standard toolkit (e.g., custom loader, S3 files, etc.).
+
+ Args:
+ documents: Generator yielding LangChain Documents
+ source: Name to identify the source in citations
+ extract_relations: Whether to extract relations
+
+ Returns:
+ IngestionResult with statistics
+ """
+ import time
+ start_time = time.time()
+ result = IngestionResult(source=source)
+
+ if not self._init_extractors():
+ result.success = False
+ result.errors.append("LLM not configured")
+ return result
+
+ self._log_progress(f"🚀 Starting ingestion from {source} generator", "start")
+
+ schema = self._knowledge_graph.get_schema()
+ all_entities = []
+
+ try:
+ for doc in documents:
+ normalized = self._normalize_document(doc, source)
+ if not normalized:
+ continue
+
+ result.documents_processed += 1
+ entities, extraction_failures = self._extract_entities_from_doc(normalized, source, schema)
+
+ # Track extraction failures
+ if extraction_failures:
+ for failed_path in extraction_failures:
+ if failed_path not in result.failed_documents:
+ result.failed_documents.append(failed_path)
+
+ for entity in entities:
+ self._knowledge_graph.add_entity(
+ entity_id=entity['id'],
+ name=entity['name'],
+ entity_type=entity['type'],
+ citation=entity['citation'],
+ properties=entity['properties']
+ )
+ result.entities_added += 1
+ all_entities.append(entity)
+
+ if result.documents_processed % 10 == 0:
+ self._log_progress(
+ f"📄 {result.documents_processed} docs | 📊 {result.entities_added} entities",
+ "progress"
+ )
+
+ if extract_relations and all_entities:
+ graph_entities = self._knowledge_graph.get_all_entities()
+ self._log_progress(
+ f"🔗 Extracting relations from {len(all_entities)} new entities "
+ f"(graph has {len(graph_entities)} total)...",
+ "relations"
+ )
+ relations = self._extract_relations(all_entities, schema, all_graph_entities=graph_entities)
+
+ for rel in relations:
+ # Merge source information into properties
+ properties = rel.get('properties', {})
+ if 'source_toolkit' not in properties:
+ # Add current source if not already set
+ properties['source_toolkit'] = source
+
+ if self._knowledge_graph.add_relation(
+ source_id=rel.get('source_id'),
+ target_id=rel.get('target_id'),
+ relation_type=rel.get('relation_type', 'RELATED_TO'),
+ properties=properties
+ ):
+ result.relations_added += 1
+
+ self._auto_save()
+ result.graph_stats = self._knowledge_graph.get_stats()
+ result.duration_seconds = time.time() - start_time
+
+ self._log_progress(f"✅ Complete! {result}", "complete")
+
+ except Exception as e:
+ logger.exception(f"Ingestion failed: {e}")
+ result.success = False
+ result.errors.append(str(e))
+ result.duration_seconds = time.time() - start_time
+
+ return result
+
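# [Editor's note - illustrative usage sketch, not part of the package diff]
# Feeding run_from_generator() a custom document source. The folder path,
# helper name, and pre-configured `pipeline` are hypothetical; the Document
# import path may differ with the installed LangChain version:
#
#   from pathlib import Path
#   from langchain_core.documents import Document
#
#   def iter_markdown_docs(root: str):
#       for path in Path(root).rglob("*.md"):
#           yield Document(page_content=path.read_text(encoding="utf-8"),
#                          metadata={"file_path": str(path)})
#
#   result = pipeline.run_from_generator(
#       documents=iter_markdown_docs("./docs"),
#       source="local-docs",
#       extract_relations=True,
#   )
#   print(result.entities_added, result.relations_added)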
+ def delta_update(
+ self,
+ source: str,
+ file_paths: List[str],
+ extract_relations: bool = True
+ ) -> IngestionResult:
+ """
+ Perform delta update for changed files.
+
+ 1. Removes existing entities from the specified files
+ 2. Re-fetches and re-analyzes those files
+ 3. Adds new entities with fresh citations
+
+ Args:
+ source: Name of source toolkit
+ file_paths: List of file paths that have changed
+ extract_relations: Whether to extract relations
+
+ Returns:
+ IngestionResult with statistics including entities removed
+ """
+ import time
+ start_time = time.time()
+ result = IngestionResult(source=source)
+
+ self._log_progress(f"🔄 Delta update for {len(file_paths)} files from {source}", "start")
+
+ # Remove stale entities
+ for file_path in file_paths:
+ removed = self._knowledge_graph.remove_entities_by_file(file_path)
+ result.entities_removed += removed
+
+ self._log_progress(f"🗑️ Removed {result.entities_removed} stale entities", "cleanup")
+
+ # Re-ingest the changed files
+ if source not in self.source_toolkits:
+ # Fall back to local file read if toolkit not available
+ self._log_progress("📁 Reading files locally (toolkit not available)", "local")
+
+ from pathlib import Path
+
+ def local_loader():
+ for file_path in file_paths:
+ try:
+ content = Path(file_path).read_text(encoding='utf-8')
+ yield Document(
+ page_content=content,
+ metadata={'file_path': file_path, 'source_toolkit': 'filesystem'}
+ )
+ except Exception as e:
+ logger.warning(f"Could not read {file_path}: {e}")
+
+ ingest_result = self.run_from_generator(
+ documents=local_loader(),
+ source='filesystem',
+ extract_relations=extract_relations
+ )
+ else:
+ # Use toolkit to fetch specific files
+ toolkit = self.source_toolkits[source]
+
+ # Try to use toolkit's file-specific loader if available
+ if hasattr(toolkit, 'get_files_content'):
+ def file_loader():
+ for file_path in file_paths:
+ try:
+ content = toolkit.get_files_content(file_path)
+ if content:
+ yield Document(
+ page_content=content,
+ metadata={'file_path': file_path, 'source_toolkit': source}
+ )
+ except Exception as e:
+ logger.warning(f"Could not fetch {file_path}: {e}")
+
+ ingest_result = self.run_from_generator(
+ documents=file_loader(),
+ source=source,
+ extract_relations=extract_relations
+ )
+ else:
+ # Run full ingestion with whitelist
+ ingest_result = self.run(
+ source=source,
+ whitelist=file_paths,
+ extract_relations=extract_relations
+ )
+
+ # Merge results
+ result.documents_processed = ingest_result.documents_processed
+ result.entities_added = ingest_result.entities_added
+ result.relations_added = ingest_result.relations_added
+ result.errors.extend(ingest_result.errors)
+ result.success = ingest_result.success
+ result.graph_stats = ingest_result.graph_stats
+ result.duration_seconds = time.time() - start_time
+
+ self._log_progress(
+ f"✅ Delta update complete: -{result.entities_removed} +{result.entities_added}",
+ "complete"
+ )
+
+ return result
+
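# [Editor's note - illustrative usage sketch, not part of the package diff]
# Refreshing the graph after a few files change. The file paths and the
# pre-configured `pipeline` are hypothetical; if the "github" toolkit was
# registered, delta_update() re-fetches through it, otherwise it reads the
# paths from the local filesystem as shown in the fallback branch above:
#
#   changed = ["src/api/handlers.py", "src/api/models.py"]
#   result = pipeline.delta_update(source="github", file_paths=changed)
#   print(f"-{result.entities_removed} stale, +{result.entities_added} new entities")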
+ def discover_schema(self, sample_file_paths: List[str]) -> Dict[str, Any]:
+ """
+ Discover entity types from sample files using LLM.
+
+ Useful for customizing extraction for domain-specific codebases.
+
+ Args:
+ sample_file_paths: Paths to sample files for schema discovery
+
+ Returns:
+ Discovered schema with entity_types and relation_types
+ """
+ if not self._init_extractors():
+ return {'error': 'LLM not configured'}
+
+ self._log_progress(f"🔍 Discovering schema from {len(sample_file_paths)} samples", "schema")
+
+ from pathlib import Path
+ docs = []
+
+ for file_path in sample_file_paths[:10]:
+ try:
+ content = Path(file_path).read_text(encoding='utf-8')
+ docs.append(Document(
+ page_content=content[:5000],
+ metadata={'file_path': file_path}
+ ))
+ except Exception as e:
+ logger.warning(f"Could not read {file_path}: {e}")
+
+ if not docs:
+ return {'error': 'Could not read any sample files'}
+
+ schema = self._schema_discoverer.discover(docs)
+ self._knowledge_graph.set_schema(schema)
+ self._auto_save()
+
+ self._log_progress(
+ f"✅ Discovered {len(schema.get('entity_types', []))} entity types, "
+ f"{len(schema.get('relation_types', []))} relation types",
+ "schema"
+ )
+
+ return schema
+
+ def get_stats(self) -> Dict[str, Any]:
+ """Get current graph statistics."""
+ return self._knowledge_graph.get_stats()
+
+ def export(self, path: Optional[str] = None) -> str:
+ """Export graph to JSON."""
+ export_path = path or self.graph_path
+ self._knowledge_graph.dump_to_json(export_path)
+ return export_path
+
+ def register_toolkit(self, name: str, toolkit: Any) -> None:
+ """Register a source toolkit for ingestion."""
+ self.source_toolkits[name] = toolkit
+ logger.info(f"Registered toolkit: {name}")
+
+
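# [Editor's note - illustrative usage sketch, not part of the package diff]
# Registering an extra source and exporting the graph afterwards;
# `confluence_toolkit`, the export path, and the pre-configured `pipeline`
# are hypothetical, and the run() call mirrors ingest_repository() below:
#
#   pipeline.register_toolkit("confluence", confluence_toolkit)
#   pipeline.run(source="confluence", extract_relations=True)
#   print(pipeline.get_stats())
#   saved_path = pipeline.export("./exports/graph.json")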
+ # Convenience function for one-shot ingestion
+ def ingest_repository(
+ llm: Any,
+ graph_path: str,
+ source_toolkit: Any,
+ source_name: str = "repository",
+ branch: Optional[str] = None,
+ whitelist: Optional[List[str]] = None,
+ blacklist: Optional[List[str]] = None,
+ extract_relations: bool = True,
+ progress_callback: Optional[Callable] = None,
+ ) -> IngestionResult:
+ """
+ Convenience function for one-shot repository ingestion.
+
+ Args:
+ llm: LangChain LLM instance
+ graph_path: Where to save the graph JSON
+ source_toolkit: Toolkit instance with loader() method
+ source_name: Name for the source in citations
+ branch: Branch to analyze
+ whitelist: File patterns to include
+ blacklist: File patterns to exclude
+ extract_relations: Whether to extract relations
+ progress_callback: Optional callback for progress updates
+
+ Returns:
+ IngestionResult with statistics
+
+ Example:
+ from alita_sdk.community.github.api_wrapper import GitHubApiWrapper
+
+ github = GitHubApiWrapper(
+ api_base="...",
+ api_key="...",
+ repository="owner/repo"
+ )
+
+ result = ingest_repository(
+ llm=llm,
+ graph_path="./graph.json",
+ source_toolkit=github,
+ source_name="github",
+ branch="main",
+ whitelist=["*.py"],
+ progress_callback=lambda msg, phase: print(f"[{phase}] {msg}")
+ )
+ """
+ pipeline = IngestionPipeline(
+ llm=llm,
+ graph_path=graph_path,
+ source_toolkits={source_name: source_toolkit},
+ progress_callback=progress_callback,
+ )
+
+ return pipeline.run(
+ source=source_name,
+ branch=branch,
+ whitelist=whitelist,
+ blacklist=blacklist,
+ extract_relations=extract_relations,
+ )