alita-sdk 0.3.462__py3-none-any.whl → 0.3.627__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (261) hide show
  1. alita_sdk/cli/agent/__init__.py +5 -0
  2. alita_sdk/cli/agent/default.py +258 -0
  3. alita_sdk/cli/agent_executor.py +15 -3
  4. alita_sdk/cli/agent_loader.py +56 -8
  5. alita_sdk/cli/agent_ui.py +93 -31
  6. alita_sdk/cli/agents.py +2274 -230
  7. alita_sdk/cli/callbacks.py +96 -25
  8. alita_sdk/cli/cli.py +10 -1
  9. alita_sdk/cli/config.py +162 -9
  10. alita_sdk/cli/context/__init__.py +30 -0
  11. alita_sdk/cli/context/cleanup.py +198 -0
  12. alita_sdk/cli/context/manager.py +731 -0
  13. alita_sdk/cli/context/message.py +285 -0
  14. alita_sdk/cli/context/strategies.py +289 -0
  15. alita_sdk/cli/context/token_estimation.py +127 -0
  16. alita_sdk/cli/input_handler.py +419 -0
  17. alita_sdk/cli/inventory.py +1073 -0
  18. alita_sdk/cli/testcases/__init__.py +94 -0
  19. alita_sdk/cli/testcases/data_generation.py +119 -0
  20. alita_sdk/cli/testcases/discovery.py +96 -0
  21. alita_sdk/cli/testcases/executor.py +84 -0
  22. alita_sdk/cli/testcases/logger.py +85 -0
  23. alita_sdk/cli/testcases/parser.py +172 -0
  24. alita_sdk/cli/testcases/prompts.py +91 -0
  25. alita_sdk/cli/testcases/reporting.py +125 -0
  26. alita_sdk/cli/testcases/setup.py +108 -0
  27. alita_sdk/cli/testcases/test_runner.py +282 -0
  28. alita_sdk/cli/testcases/utils.py +39 -0
  29. alita_sdk/cli/testcases/validation.py +90 -0
  30. alita_sdk/cli/testcases/workflow.py +196 -0
  31. alita_sdk/cli/toolkit.py +14 -17
  32. alita_sdk/cli/toolkit_loader.py +35 -5
  33. alita_sdk/cli/tools/__init__.py +36 -2
  34. alita_sdk/cli/tools/approval.py +224 -0
  35. alita_sdk/cli/tools/filesystem.py +910 -64
  36. alita_sdk/cli/tools/planning.py +389 -0
  37. alita_sdk/cli/tools/terminal.py +414 -0
  38. alita_sdk/community/__init__.py +72 -12
  39. alita_sdk/community/inventory/__init__.py +236 -0
  40. alita_sdk/community/inventory/config.py +257 -0
  41. alita_sdk/community/inventory/enrichment.py +2137 -0
  42. alita_sdk/community/inventory/extractors.py +1469 -0
  43. alita_sdk/community/inventory/ingestion.py +3172 -0
  44. alita_sdk/community/inventory/knowledge_graph.py +1457 -0
  45. alita_sdk/community/inventory/parsers/__init__.py +218 -0
  46. alita_sdk/community/inventory/parsers/base.py +295 -0
  47. alita_sdk/community/inventory/parsers/csharp_parser.py +907 -0
  48. alita_sdk/community/inventory/parsers/go_parser.py +851 -0
  49. alita_sdk/community/inventory/parsers/html_parser.py +389 -0
  50. alita_sdk/community/inventory/parsers/java_parser.py +593 -0
  51. alita_sdk/community/inventory/parsers/javascript_parser.py +629 -0
  52. alita_sdk/community/inventory/parsers/kotlin_parser.py +768 -0
  53. alita_sdk/community/inventory/parsers/markdown_parser.py +362 -0
  54. alita_sdk/community/inventory/parsers/python_parser.py +604 -0
  55. alita_sdk/community/inventory/parsers/rust_parser.py +858 -0
  56. alita_sdk/community/inventory/parsers/swift_parser.py +832 -0
  57. alita_sdk/community/inventory/parsers/text_parser.py +322 -0
  58. alita_sdk/community/inventory/parsers/yaml_parser.py +370 -0
  59. alita_sdk/community/inventory/patterns/__init__.py +61 -0
  60. alita_sdk/community/inventory/patterns/ast_adapter.py +380 -0
  61. alita_sdk/community/inventory/patterns/loader.py +348 -0
  62. alita_sdk/community/inventory/patterns/registry.py +198 -0
  63. alita_sdk/community/inventory/presets.py +535 -0
  64. alita_sdk/community/inventory/retrieval.py +1403 -0
  65. alita_sdk/community/inventory/toolkit.py +173 -0
  66. alita_sdk/community/inventory/toolkit_utils.py +176 -0
  67. alita_sdk/community/inventory/visualize.py +1370 -0
  68. alita_sdk/configurations/__init__.py +1 -1
  69. alita_sdk/configurations/ado.py +141 -20
  70. alita_sdk/configurations/bitbucket.py +0 -3
  71. alita_sdk/configurations/confluence.py +76 -42
  72. alita_sdk/configurations/figma.py +76 -0
  73. alita_sdk/configurations/gitlab.py +17 -5
  74. alita_sdk/configurations/openapi.py +329 -0
  75. alita_sdk/configurations/qtest.py +72 -1
  76. alita_sdk/configurations/report_portal.py +96 -0
  77. alita_sdk/configurations/sharepoint.py +148 -0
  78. alita_sdk/configurations/testio.py +83 -0
  79. alita_sdk/runtime/clients/artifact.py +3 -3
  80. alita_sdk/runtime/clients/client.py +353 -48
  81. alita_sdk/runtime/clients/sandbox_client.py +0 -21
  82. alita_sdk/runtime/langchain/_constants_bkup.py +1318 -0
  83. alita_sdk/runtime/langchain/assistant.py +123 -26
  84. alita_sdk/runtime/langchain/constants.py +642 -1
  85. alita_sdk/runtime/langchain/document_loaders/AlitaExcelLoader.py +103 -60
  86. alita_sdk/runtime/langchain/document_loaders/AlitaJSONLinesLoader.py +77 -0
  87. alita_sdk/runtime/langchain/document_loaders/AlitaJSONLoader.py +6 -3
  88. alita_sdk/runtime/langchain/document_loaders/AlitaPowerPointLoader.py +226 -7
  89. alita_sdk/runtime/langchain/document_loaders/AlitaTextLoader.py +5 -2
  90. alita_sdk/runtime/langchain/document_loaders/constants.py +12 -7
  91. alita_sdk/runtime/langchain/langraph_agent.py +279 -73
  92. alita_sdk/runtime/langchain/utils.py +82 -15
  93. alita_sdk/runtime/llms/preloaded.py +2 -6
  94. alita_sdk/runtime/skills/__init__.py +91 -0
  95. alita_sdk/runtime/skills/callbacks.py +498 -0
  96. alita_sdk/runtime/skills/discovery.py +540 -0
  97. alita_sdk/runtime/skills/executor.py +610 -0
  98. alita_sdk/runtime/skills/input_builder.py +371 -0
  99. alita_sdk/runtime/skills/models.py +330 -0
  100. alita_sdk/runtime/skills/registry.py +355 -0
  101. alita_sdk/runtime/skills/skill_runner.py +330 -0
  102. alita_sdk/runtime/toolkits/__init__.py +7 -0
  103. alita_sdk/runtime/toolkits/application.py +21 -9
  104. alita_sdk/runtime/toolkits/artifact.py +15 -5
  105. alita_sdk/runtime/toolkits/datasource.py +13 -6
  106. alita_sdk/runtime/toolkits/mcp.py +139 -251
  107. alita_sdk/runtime/toolkits/mcp_config.py +1048 -0
  108. alita_sdk/runtime/toolkits/planning.py +178 -0
  109. alita_sdk/runtime/toolkits/skill_router.py +238 -0
  110. alita_sdk/runtime/toolkits/subgraph.py +251 -6
  111. alita_sdk/runtime/toolkits/tools.py +238 -32
  112. alita_sdk/runtime/toolkits/vectorstore.py +11 -5
  113. alita_sdk/runtime/tools/__init__.py +3 -1
  114. alita_sdk/runtime/tools/application.py +20 -6
  115. alita_sdk/runtime/tools/artifact.py +511 -28
  116. alita_sdk/runtime/tools/data_analysis.py +183 -0
  117. alita_sdk/runtime/tools/function.py +43 -15
  118. alita_sdk/runtime/tools/image_generation.py +50 -44
  119. alita_sdk/runtime/tools/llm.py +852 -67
  120. alita_sdk/runtime/tools/loop.py +3 -1
  121. alita_sdk/runtime/tools/loop_output.py +3 -1
  122. alita_sdk/runtime/tools/mcp_remote_tool.py +25 -10
  123. alita_sdk/runtime/tools/mcp_server_tool.py +7 -6
  124. alita_sdk/runtime/tools/planning/__init__.py +36 -0
  125. alita_sdk/runtime/tools/planning/models.py +246 -0
  126. alita_sdk/runtime/tools/planning/wrapper.py +607 -0
  127. alita_sdk/runtime/tools/router.py +2 -4
  128. alita_sdk/runtime/tools/sandbox.py +9 -6
  129. alita_sdk/runtime/tools/skill_router.py +776 -0
  130. alita_sdk/runtime/tools/tool.py +3 -1
  131. alita_sdk/runtime/tools/vectorstore.py +7 -2
  132. alita_sdk/runtime/tools/vectorstore_base.py +51 -11
  133. alita_sdk/runtime/utils/AlitaCallback.py +137 -21
  134. alita_sdk/runtime/utils/constants.py +5 -1
  135. alita_sdk/runtime/utils/mcp_client.py +492 -0
  136. alita_sdk/runtime/utils/mcp_oauth.py +202 -5
  137. alita_sdk/runtime/utils/mcp_sse_client.py +36 -7
  138. alita_sdk/runtime/utils/mcp_tools_discovery.py +124 -0
  139. alita_sdk/runtime/utils/serialization.py +155 -0
  140. alita_sdk/runtime/utils/streamlit.py +6 -10
  141. alita_sdk/runtime/utils/toolkit_utils.py +16 -5
  142. alita_sdk/runtime/utils/utils.py +36 -0
  143. alita_sdk/tools/__init__.py +113 -29
  144. alita_sdk/tools/ado/repos/__init__.py +51 -33
  145. alita_sdk/tools/ado/repos/repos_wrapper.py +148 -89
  146. alita_sdk/tools/ado/test_plan/__init__.py +25 -9
  147. alita_sdk/tools/ado/test_plan/test_plan_wrapper.py +23 -1
  148. alita_sdk/tools/ado/utils.py +1 -18
  149. alita_sdk/tools/ado/wiki/__init__.py +25 -8
  150. alita_sdk/tools/ado/wiki/ado_wrapper.py +291 -22
  151. alita_sdk/tools/ado/work_item/__init__.py +26 -9
  152. alita_sdk/tools/ado/work_item/ado_wrapper.py +56 -3
  153. alita_sdk/tools/advanced_jira_mining/__init__.py +11 -8
  154. alita_sdk/tools/aws/delta_lake/__init__.py +13 -9
  155. alita_sdk/tools/aws/delta_lake/tool.py +5 -1
  156. alita_sdk/tools/azure_ai/search/__init__.py +11 -8
  157. alita_sdk/tools/azure_ai/search/api_wrapper.py +1 -1
  158. alita_sdk/tools/base/tool.py +5 -1
  159. alita_sdk/tools/base_indexer_toolkit.py +170 -45
  160. alita_sdk/tools/bitbucket/__init__.py +17 -12
  161. alita_sdk/tools/bitbucket/api_wrapper.py +59 -11
  162. alita_sdk/tools/bitbucket/cloud_api_wrapper.py +49 -35
  163. alita_sdk/tools/browser/__init__.py +5 -4
  164. alita_sdk/tools/carrier/__init__.py +5 -6
  165. alita_sdk/tools/carrier/backend_reports_tool.py +6 -6
  166. alita_sdk/tools/carrier/run_ui_test_tool.py +6 -6
  167. alita_sdk/tools/carrier/ui_reports_tool.py +5 -5
  168. alita_sdk/tools/chunkers/__init__.py +3 -1
  169. alita_sdk/tools/chunkers/code/treesitter/treesitter.py +37 -13
  170. alita_sdk/tools/chunkers/sematic/json_chunker.py +1 -0
  171. alita_sdk/tools/chunkers/sematic/markdown_chunker.py +97 -6
  172. alita_sdk/tools/chunkers/universal_chunker.py +270 -0
  173. alita_sdk/tools/cloud/aws/__init__.py +10 -7
  174. alita_sdk/tools/cloud/azure/__init__.py +10 -7
  175. alita_sdk/tools/cloud/gcp/__init__.py +10 -7
  176. alita_sdk/tools/cloud/k8s/__init__.py +10 -7
  177. alita_sdk/tools/code/linter/__init__.py +10 -8
  178. alita_sdk/tools/code/loaders/codesearcher.py +3 -2
  179. alita_sdk/tools/code/sonar/__init__.py +10 -7
  180. alita_sdk/tools/code_indexer_toolkit.py +73 -23
  181. alita_sdk/tools/confluence/__init__.py +21 -15
  182. alita_sdk/tools/confluence/api_wrapper.py +78 -23
  183. alita_sdk/tools/confluence/loader.py +4 -2
  184. alita_sdk/tools/custom_open_api/__init__.py +12 -5
  185. alita_sdk/tools/elastic/__init__.py +11 -8
  186. alita_sdk/tools/elitea_base.py +493 -30
  187. alita_sdk/tools/figma/__init__.py +58 -11
  188. alita_sdk/tools/figma/api_wrapper.py +1235 -143
  189. alita_sdk/tools/figma/figma_client.py +73 -0
  190. alita_sdk/tools/figma/toon_tools.py +2748 -0
  191. alita_sdk/tools/github/__init__.py +13 -14
  192. alita_sdk/tools/github/github_client.py +224 -100
  193. alita_sdk/tools/github/graphql_client_wrapper.py +119 -33
  194. alita_sdk/tools/github/schemas.py +14 -5
  195. alita_sdk/tools/github/tool.py +5 -1
  196. alita_sdk/tools/github/tool_prompts.py +9 -22
  197. alita_sdk/tools/gitlab/__init__.py +15 -11
  198. alita_sdk/tools/gitlab/api_wrapper.py +207 -41
  199. alita_sdk/tools/gitlab_org/__init__.py +10 -8
  200. alita_sdk/tools/gitlab_org/api_wrapper.py +63 -64
  201. alita_sdk/tools/google/bigquery/__init__.py +13 -12
  202. alita_sdk/tools/google/bigquery/tool.py +5 -1
  203. alita_sdk/tools/google_places/__init__.py +10 -8
  204. alita_sdk/tools/google_places/api_wrapper.py +1 -1
  205. alita_sdk/tools/jira/__init__.py +17 -11
  206. alita_sdk/tools/jira/api_wrapper.py +91 -40
  207. alita_sdk/tools/keycloak/__init__.py +11 -8
  208. alita_sdk/tools/localgit/__init__.py +9 -3
  209. alita_sdk/tools/localgit/local_git.py +62 -54
  210. alita_sdk/tools/localgit/tool.py +5 -1
  211. alita_sdk/tools/memory/__init__.py +11 -3
  212. alita_sdk/tools/non_code_indexer_toolkit.py +1 -0
  213. alita_sdk/tools/ocr/__init__.py +11 -8
  214. alita_sdk/tools/openapi/__init__.py +490 -114
  215. alita_sdk/tools/openapi/api_wrapper.py +1368 -0
  216. alita_sdk/tools/openapi/tool.py +20 -0
  217. alita_sdk/tools/pandas/__init__.py +20 -12
  218. alita_sdk/tools/pandas/api_wrapper.py +38 -25
  219. alita_sdk/tools/pandas/dataframe/generator/base.py +3 -1
  220. alita_sdk/tools/postman/__init__.py +11 -11
  221. alita_sdk/tools/pptx/__init__.py +10 -9
  222. alita_sdk/tools/pptx/pptx_wrapper.py +1 -1
  223. alita_sdk/tools/qtest/__init__.py +30 -10
  224. alita_sdk/tools/qtest/api_wrapper.py +430 -13
  225. alita_sdk/tools/rally/__init__.py +10 -8
  226. alita_sdk/tools/rally/api_wrapper.py +1 -1
  227. alita_sdk/tools/report_portal/__init__.py +12 -9
  228. alita_sdk/tools/salesforce/__init__.py +10 -9
  229. alita_sdk/tools/servicenow/__init__.py +17 -14
  230. alita_sdk/tools/servicenow/api_wrapper.py +1 -1
  231. alita_sdk/tools/sharepoint/__init__.py +10 -8
  232. alita_sdk/tools/sharepoint/api_wrapper.py +4 -4
  233. alita_sdk/tools/slack/__init__.py +10 -8
  234. alita_sdk/tools/slack/api_wrapper.py +2 -2
  235. alita_sdk/tools/sql/__init__.py +11 -9
  236. alita_sdk/tools/testio/__init__.py +10 -8
  237. alita_sdk/tools/testrail/__init__.py +11 -8
  238. alita_sdk/tools/testrail/api_wrapper.py +1 -1
  239. alita_sdk/tools/utils/__init__.py +9 -4
  240. alita_sdk/tools/utils/content_parser.py +77 -3
  241. alita_sdk/tools/utils/text_operations.py +410 -0
  242. alita_sdk/tools/utils/tool_prompts.py +79 -0
  243. alita_sdk/tools/vector_adapters/VectorStoreAdapter.py +17 -13
  244. alita_sdk/tools/xray/__init__.py +12 -9
  245. alita_sdk/tools/yagmail/__init__.py +9 -3
  246. alita_sdk/tools/zephyr/__init__.py +9 -7
  247. alita_sdk/tools/zephyr_enterprise/__init__.py +11 -8
  248. alita_sdk/tools/zephyr_essential/__init__.py +10 -8
  249. alita_sdk/tools/zephyr_essential/api_wrapper.py +30 -13
  250. alita_sdk/tools/zephyr_essential/client.py +2 -2
  251. alita_sdk/tools/zephyr_scale/__init__.py +11 -9
  252. alita_sdk/tools/zephyr_scale/api_wrapper.py +2 -2
  253. alita_sdk/tools/zephyr_squad/__init__.py +10 -8
  254. {alita_sdk-0.3.462.dist-info → alita_sdk-0.3.627.dist-info}/METADATA +147 -7
  255. alita_sdk-0.3.627.dist-info/RECORD +468 -0
  256. alita_sdk-0.3.627.dist-info/entry_points.txt +2 -0
  257. alita_sdk-0.3.462.dist-info/RECORD +0 -384
  258. alita_sdk-0.3.462.dist-info/entry_points.txt +0 -2
  259. {alita_sdk-0.3.462.dist-info → alita_sdk-0.3.627.dist-info}/WHEEL +0 -0
  260. {alita_sdk-0.3.462.dist-info → alita_sdk-0.3.627.dist-info}/licenses/LICENSE +0 -0
  261. {alita_sdk-0.3.462.dist-info → alita_sdk-0.3.627.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,1469 @@
1
+ """
2
+ LLM-based extractors for document classification, schema discovery,
3
+ entity extraction, and relation extraction.
4
+
5
+ Supports comprehensive entity types across multiple layers:
6
+ - Product Layer: Features, Epics, User Stories, Screens, UX Flows
7
+ - Domain Layer: Business Objects, Rules, Glossary Terms
8
+ - Service Layer: APIs, Endpoints, Services, Methods
9
+ - Code Layer: Modules, Classes, Functions
10
+ - Data Layer: Tables, Columns, Constraints
11
+ - Testing Layer: Test Cases, Test Suites, Defects
12
+ - Delivery Layer: Releases, Commits, Tickets
13
+ - Organization Layer: Teams, Owners, Repositories
14
+ """
15
+
16
+ import json
17
+ import logging
18
+ import hashlib
19
+ from typing import Any, Optional, List, Dict, Union, Tuple
20
+
21
+ from langchain_core.documents import Document
22
+ from langchain_core.prompts import ChatPromptTemplate
23
+ from langchain_core.output_parsers import JsonOutputParser
24
+
25
+ logger = logging.getLogger(__name__)
26
+
27
+
28
+ # ============================================================================
29
+ # COMPREHENSIVE ENTITY & RELATIONSHIP TAXONOMY
30
+ # ============================================================================
31
+
32
# Canonical entity taxonomy for knowledge-graph extraction.
# Keys are layer identifiers; each layer carries a human-readable description
# and a list of entity type specs ({"name", "description", "properties"}).
# Insertion order is meaningful to consumers that iterate layers, so the
# layer and type ordering below must be preserved.
ENTITY_TAXONOMY = {
    "product_layer": {
        "description": "Product and UX artifacts",
        "types": [
            {"name": "epic", "description": "Large feature grouping or initiative", "properties": ["name", "description", "acceptance_criteria", "priority"]},
            {"name": "feature", "description": "Product capability or functionality", "properties": ["name", "description", "acceptance_criteria", "related_screens"]},
            {"name": "user_story", "description": "User requirement in story format", "properties": ["name", "description", "persona", "acceptance_criteria", "story_points"]},
            {"name": "screen", "description": "UI page, screen, or view", "properties": ["name", "description", "url_path", "parent_screen"]},
            {"name": "ux_flow", "description": "User journey or navigation flow", "properties": ["name", "description", "start_screen", "end_screen", "steps"]},
            {"name": "ui_component", "description": "Reusable UI element (form, button, modal)", "properties": ["name", "description", "component_type", "parent_screen"]},
            {"name": "ui_field", "description": "Input field, dropdown, or display element", "properties": ["name", "field_type", "validation_rules", "api_mapping", "db_mapping"]},
        ]
    },
    "domain_layer": {
        "description": "Business domain concepts",
        "types": [
            {"name": "domain_entity", "description": "Core business object (Customer, Order, Product)", "properties": ["name", "description", "attributes", "lifecycle_states"]},
            {"name": "attribute", "description": "Property of a domain entity", "properties": ["name", "data_type", "constraints", "parent_entity"]},
            {"name": "business_rule", "description": "Business logic or constraint", "properties": ["name", "description", "trigger_event", "conditions", "actions", "exceptions"]},
            {"name": "business_event", "description": "Domain event that triggers actions", "properties": ["name", "description", "trigger", "payload", "handlers"]},
            {"name": "glossary_term", "description": "Domain vocabulary definition", "properties": ["name", "definition", "synonyms", "related_terms"]},
            {"name": "workflow", "description": "Business process or workflow", "properties": ["name", "description", "steps", "triggers", "outcomes"]},
        ]
    },
    "service_layer": {
        "description": "APIs and services (semantic descriptions, not code structure)",
        "types": [
            {"name": "service", "description": "Software service or microservice", "properties": ["name", "description", "tech_stack", "owner_team"]},

            # REST API types (from specs/docs, not code)
            {"name": "rest_api", "description": "REST API specification", "properties": ["name", "description", "version", "auth_schema", "base_url", "content_type"]},
            {"name": "rest_endpoint", "description": "REST API endpoint", "properties": ["name", "method", "path", "request_schema", "response_schema", "auth_required", "status_codes"]},

            # GraphQL types (from specs/docs)
            {"name": "graphql_api", "description": "GraphQL API schema", "properties": ["name", "description", "version", "endpoint", "auth_schema"]},
            {"name": "graphql_query", "description": "GraphQL query operation", "properties": ["name", "description", "arguments", "return_type"]},
            {"name": "graphql_mutation", "description": "GraphQL mutation operation", "properties": ["name", "description", "arguments", "return_type"]},

            # Event-driven types (semantic)
            {"name": "event_type", "description": "Event type or message schema", "properties": ["name", "description", "version", "schema", "payload_fields"]},

            # Integration types
            {"name": "integration", "description": "External system integration", "properties": ["name", "description", "protocol", "external_system", "direction"]},
        ]
    },
    # NOTE: code_layer removed - classes, functions, methods, modules, interfaces, constants
    # are now extracted by AST/regex parsers, not LLM entity extraction
    "data_layer": {
        "description": "Database and data artifacts (from docs/specs, not code)",
        "types": [
            {"name": "database", "description": "Database or data store", "properties": ["name", "type", "description"]},
            {"name": "table", "description": "Database table or collection", "properties": ["name", "description", "primary_key", "indexes"]},
            {"name": "column", "description": "Table column or field", "properties": ["name", "data_type", "nullable", "default_value", "constraints", "parent_table"]},
            {"name": "migration", "description": "Database migration script", "properties": ["name", "version", "description", "changes"]},
            {"name": "enum", "description": "Enumeration or lookup values", "properties": ["name", "values", "description"]},
        ]
    },
    "testing_layer": {
        "description": "Testing artifacts",
        "types": [
            {"name": "test_suite", "description": "Collection of related test cases", "properties": ["name", "description", "test_type", "coverage_area"]},
            {"name": "test_case", "description": "Individual test case", "properties": ["name", "description", "preconditions", "steps", "expected_result", "priority", "automated"]},
            {"name": "test_data", "description": "Test data set or fixture", "properties": ["name", "description", "data_format", "scope"]},
            {"name": "defect", "description": "Bug or defect report", "properties": ["name", "description", "severity", "status", "steps_to_reproduce", "affected_version"]},
            {"name": "incident", "description": "Production incident", "properties": ["name", "description", "severity", "impact", "root_cause", "resolution"]},
        ]
    },
    "delivery_layer": {
        "description": "Delivery and release artifacts",
        "types": [
            {"name": "release", "description": "Software release or version", "properties": ["name", "version", "release_date", "changes", "status"]},
            {"name": "sprint", "description": "Development sprint or iteration", "properties": ["name", "start_date", "end_date", "goals"]},
            {"name": "ticket", "description": "Work item or task ticket", "properties": ["name", "description", "type", "status", "priority", "assignee"]},
            {"name": "deployment", "description": "Deployment to environment", "properties": ["name", "environment", "version", "timestamp", "status"]},
        ]
    },
    "organization_layer": {
        "description": "People and organizational artifacts",
        "types": [
            {"name": "team", "description": "Development team or squad", "properties": ["name", "description", "members", "responsibilities"]},
            {"name": "owner", "description": "Feature or component owner", "properties": ["name", "email", "role", "owned_components"]},
            {"name": "stakeholder", "description": "Business stakeholder", "properties": ["name", "role", "interests", "contact"]},
            {"name": "repository", "description": "Code repository", "properties": ["name", "url", "description", "language", "owner_team"]},
            {"name": "documentation", "description": "Documentation page or article", "properties": ["name", "url", "description", "doc_type", "last_updated"]},
        ]
    },
    "tooling_layer": {
        "description": "Tools and integration toolkits",
        "types": [
            {"name": "toolkit", "description": "Integration toolkit or connector (e.g., Jira Toolkit, GitHub Toolkit)", "properties": ["name", "description", "tools", "configuration_fields", "authentication"]},
            {"name": "tool", "description": "Individual tool or capability within a toolkit", "properties": ["name", "description", "parameters", "return_type", "parent_toolkit"]},
            {"name": "mcp_server", "description": "MCP (Model Context Protocol) server", "properties": ["name", "description", "transport", "tools", "resources"]},
            {"name": "mcp_tool", "description": "Tool exposed by an MCP server", "properties": ["name", "description", "input_schema", "parent_server"]},
            {"name": "connector", "description": "External system connector or adapter", "properties": ["name", "description", "target_system", "auth_type", "capabilities"]},
        ]
    },
}
129
+
130
# Canonical relationship taxonomy, grouped by category. Each category holds a
# description plus type specs ({"name", "description", "examples"}); the
# "examples" entries use the "<source> <relation> <target>" phrasing the
# extraction prompts expect. Ordering is preserved for prompt rendering.
RELATIONSHIP_TAXONOMY = {
    # NOTE: Code structural relationships (imports, extends, implements, calls, contains for code)
    # are now extracted by AST/regex parsers, not LLM. This taxonomy is for semantic relationships.
    "structural": {
        "description": "Structural relationships (for non-code entities)",
        "types": [
            {"name": "contains", "description": "Parent contains child (non-code)", "examples": ["screen contains ui_component", "toolkit contains tool", "epic contains feature"]},
            {"name": "part_of", "description": "Part of larger whole", "examples": ["column part_of table", "tool part_of toolkit", "ui_field part_of screen"]},
            {"name": "provides", "description": "Provides capability or resource", "examples": ["toolkit provides tool", "mcp_server provides mcp_tool", "service provides api"]},
        ]
    },
    "behavioral": {
        "description": "Behavioral and runtime relationships (semantic, not code-level)",
        "types": [
            {"name": "triggers", "description": "Triggers event or action", "examples": ["business_rule triggers workflow", "event triggers handler"]},
            {"name": "depends_on", "description": "Business/feature dependency", "examples": ["service depends_on service", "feature depends_on feature"]},
            {"name": "uses", "description": "Uses or references", "examples": ["feature uses service", "test_case uses test_data"]},
            {"name": "publishes", "description": "Publishes event", "examples": ["service publishes event_type"]},
            {"name": "subscribes_to", "description": "Subscribes to event", "examples": ["service subscribes_to event_type"]},
        ]
    },
    "data_lineage": {
        "description": "Data flow relationships",
        "types": [
            {"name": "stores_in", "description": "Data stored in", "examples": ["ui_field stores_in column", "endpoint stores_in table"]},
            {"name": "reads_from", "description": "Reads data from", "examples": ["endpoint reads_from table", "screen reads_from api"]},
            {"name": "maps_to", "description": "Data mapping", "examples": ["ui_field maps_to column", "attribute maps_to column"]},
        ]
    },
    "ui_product": {
        "description": "UI and product relationships",
        "types": [
            {"name": "shown_on", "description": "Displayed on screen/UI", "examples": ["ui_field shown_on screen", "domain_entity shown_on screen"]},
            {"name": "navigates_to", "description": "Navigation link", "examples": ["screen navigates_to screen", "button navigates_to screen"]},
            {"name": "validates", "description": "Validates input", "examples": ["business_rule validates ui_field"]},
        ]
    },
    "testing": {
        "description": "Testing relationships",
        "types": [
            {"name": "tests", "description": "Tests functionality", "examples": ["test_case tests feature", "test_case tests endpoint"]},
            {"name": "covers", "description": "Test coverage", "examples": ["test_suite covers feature", "test_case covers user_story"]},
            {"name": "reproduces", "description": "Reproduces defect", "examples": ["test_case reproduces defect"]},
        ]
    },
    "ownership": {
        "description": "Ownership and responsibility",
        "types": [
            {"name": "owned_by", "description": "Owned by team/person", "examples": ["service owned_by team", "feature owned_by owner"]},
            {"name": "maintained_by", "description": "Maintained by", "examples": ["repository maintained_by team"]},
            {"name": "assigned_to", "description": "Assigned to person", "examples": ["ticket assigned_to owner", "defect assigned_to owner"]},
        ]
    },
    "temporal": {
        "description": "Temporal and versioning relationships",
        "types": [
            {"name": "introduced_in", "description": "Introduced in release", "examples": ["feature introduced_in release", "api introduced_in release"]},
            {"name": "modified_in", "description": "Modified in release", "examples": ["table modified_in migration"]},
            {"name": "blocks", "description": "Blocks progress", "examples": ["defect blocks feature", "ticket blocks ticket"]},
        ]
    },
    "semantic": {
        "description": "Semantic and knowledge relationships",
        "types": [
            {"name": "related_to", "description": "General relationship", "examples": ["feature related_to feature", "ticket related_to defect"]},
            {"name": "duplicates", "description": "Duplicate of another", "examples": ["defect duplicates defect"]},
            {"name": "references", "description": "References document", "examples": ["ticket references documentation", "test_case references user_story"]},
            {"name": "documents", "description": "Documents or describes", "examples": ["documentation documents feature", "wiki documents api"]},
        ]
    },
}
201
+
202
+
203
+ # ============================================================================
204
+ # PROMPTS
205
+ # ============================================================================
206
+
207
# Prompt for single-label document-type classification. Rendered with
# str.format(content=..., metadata=...); the doubled braces on the final line
# escape to literal JSON braces so .format() leaves them intact. The model is
# instructed to answer with a bare JSON object (doc_type + confidence).
DOCUMENT_CLASSIFIER_PROMPT = """Analyze the following document chunk and classify it into one of these document types:
- code: Source code files (Python, JavaScript, Java, etc.)
- api_spec: API specifications (OpenAPI, Swagger, GraphQL schemas)
- requirements: Requirements documents, user stories, specs
- architecture: Architecture documentation, design documents
- config: Configuration files (YAML, JSON config, env files)
- database: Database schemas, migrations, SQL
- test: Test files, test cases
- documentation: General documentation, READMEs, guides
- ticket: Issue tickets, bug reports, feature requests (Jira, GitHub issues)
- commit: Git commits, changelogs
- ui: UI component definitions, screen layouts, UX flows
- other: Anything that doesn't fit the above

Document content:
---
{content}
---

Metadata:
{metadata}

Respond with ONLY a JSON object:
{{"doc_type": "<type>", "confidence": <0.0-1.0>}}
"""
232
+
233
+
234
# Prompt for schema discovery over a sample of documents. Rendered with
# str.format(samples=...); all literal braces in the JSON response template
# are doubled so .format() emits them verbatim. The model is asked to return
# entity_types and relation_types grouped by layer/category.
SCHEMA_DISCOVERY_PROMPT = """Analyze the following document samples to discover entity types and relationship types for a comprehensive knowledge graph.

## Entity Layers to Consider

### Product Layer (UI/UX artifacts)
- Epic, Feature, User Story (product requirements)
- Screen, Page, View (UI containers)
- UX Flow, Journey (navigation flows)
- UI Component, Field (interactive elements with validation rules)

### Domain Layer (Business concepts)
- Domain Entity (Customer, Order, Product - core business objects)
- Attribute (properties of domain entities)
- Business Rule (conditions, triggers, exceptions)
- Business Event (domain events that trigger actions)
- Glossary Term (vocabulary definitions, synonyms)

### Service Layer (APIs and integrations)
- Service, Microservice
- API, Endpoint (with method, path, auth)
- Payload, Schema (request/response structures)
- Integration (external system connections)

### Code Layer (Implementation)
- Module, Package
- Class, Interface
- Function, Method
- Configuration, Constant

### Data Layer (Storage)
- Database, Table, Collection
- Column, Field (with type, constraints, nullable)
- Constraint, Index, Enum
- Migration Script

### Testing Layer
- Test Suite, Test Case (with preconditions, steps, expected results)
- Test Data, Fixture
- Defect, Bug (with severity, reproduction steps)
- Incident (production issues)

### Delivery Layer
- Release, Version
- Sprint, Iteration
- Commit, Pull Request
- Ticket, Task

### Organization Layer
- Team, Squad (ownership)
- Owner, SME (subject matter experts)
- Repository (code location)
- Documentation (wiki, guides)

## Relationship Categories

- Structural: contains, extends, implements, imports, part_of
- Behavioral: calls, triggers, depends_on, uses
- Data Lineage: stores_in, reads_from, maps_to, transforms
- UI/Product: shown_on, navigates_to, validates
- Testing: tests, validates, covers, reproduces
- Ownership: owned_by, maintained_by, assigned_to, reviewed_by
- Temporal: introduced_in, removed_in, modified_in, supersedes, blocks
- Semantic: related_to, duplicates, contradicts, references, synonym_of

---

Document samples:
---
{samples}
---

Based on these samples, identify which entity types and relationships are most relevant.
Group by layer and include properties that would be valuable to extract.

Respond with ONLY a JSON object:
{{
  "entity_types": [
    {{"name": "<type_name>", "layer": "<layer_name>", "description": "<description>", "properties": ["<prop1>", "<prop2>"]}}
  ],
  "relation_types": [
    {{"name": "<relation_name>", "category": "<category>", "description": "<description>", "source_types": ["<entity_type>"], "target_types": ["<entity_type>"]}}
  ]
}}
"""
318
+
319
+
320
# Prompt for per-document entity extraction (used by EntityExtractor).
# Template variables: {schema_section} (optional list of allowed entity
# types), {content} (page content with "NNNN | " line-number prefixes),
# {file_path}, {source_toolkit}. Double braces escape literal JSON braces.
# Expected reply: a JSON array of entity objects with id/type/name,
# line_start/line_end (for citations), and a properties dict.
ENTITY_EXTRACTION_PROMPT = """Extract semantic entities from the following document for a knowledge graph.

NOTE: Code structure entities (classes, functions, methods, modules, interfaces, constants, variables, imports)
are automatically extracted by AST/regex parsers - DO NOT extract these from code files.
Focus on extracting SEMANTIC entities that represent business concepts, requirements, and domain knowledge.

{schema_section}

Document content (with line numbers):
---
{content}
---

Source file: {file_path}
Source toolkit: {source_toolkit}

## What to Extract:
For CODE files (.py, .js, .java, etc.): Extract only semantic entities like:
- Features, requirements, business rules mentioned in comments/docstrings
- Domain concepts, glossary terms explained in documentation strings
- TODOs, FIXMEs, or technical debt notes
- API contracts or integration points described in comments
- Test scenarios or acceptance criteria in docstrings

For DOCUMENTATION files (.md, .rst, .txt, confluence, etc.): Extract all entities including:
- Features, requirements, user stories, epics
- Domain entities, business rules, glossary terms
- Workflows, processes, procedures
- Services, APIs, integrations (as described, not as code)
- Test cases, test suites, defects
- Teams, owners, stakeholders

For each entity provide:
- A unique ID (use existing identifiers when available)
- The entity type (from semantic types above, NOT code types like class/function)
- The line range where this entity is defined or described (at least 3-5 lines minimum)
- Properties including at minimum: name, description

IMPORTANT: line_start and line_end should capture the full context of the entity definition.
Single-line references (line_start == line_end) are discouraged - expand to include surrounding context.

Respond with ONLY a JSON array:
[
{{
"id": "<unique_id>",
"type": "<entity_type>",
"name": "<entity_name>",
"line_start": <start_line_number>,
"line_end": <end_line_number>,
"properties": {{
"description": "<brief_description>",
...
}}
}}
]
"""
376
+
377
+
378
# Prompt for per-document relation extraction (used by RelationExtractor).
# Template variables: {content}, {entities_list} (lines of the form
# "- <id> -> <name> (<type>)"), {schema_section}. The LLM is instructed to
# echo the exact IDs from {entities_list}; RelationExtractor still runs a
# fuzzy resolver afterwards because models frequently return names instead.
# Expected reply: a JSON array of {source_id, relation_type, target_id,
# confidence} objects, or [].
RELATION_EXTRACTION_PROMPT = """Extract SEMANTIC relationships between the entities listed below based on the document content.

NOTE: Structural code relationships (imports, extends, implements, calls, contains for code elements)
are automatically extracted by AST/regex parsers - DO NOT extract these from code files.
Focus on extracting SEMANTIC relationships that represent business logic and domain connections.

## Document content:
---
{content}
---

## Available Entities (ID -> Name):
{entities_list}

{schema_section}

## Instructions:
1. Look for semantic relationships mentioned or implied in the document
2. For source_id and target_id, you MUST use EXACTLY the ID shown before the arrow (->)

## Relationship Types to Extract:
For CODE files: Focus on semantic relationships like:
- tests (test_case tests feature)
- validates (test validates business_rule)
- documents (code documents requirement)
- related_to (feature related_to feature)
- depends_on (business dependency, not code import)

For DOCUMENTATION files: Extract all relationship types:
- tests, validates, covers (testing relationships)
- owned_by, maintained_by, assigned_to (ownership)
- introduced_in, modified_in, removed_in (temporal)
- related_to, references, duplicates (semantic)
- navigates_to, shown_on (UI relationships)
- triggers, depends_on (behavioral - for business logic)

DO NOT extract for code files:
- imports (handled by parser)
- extends/implements (handled by parser)
- calls (handled by parser)
- contains (for code structure - handled by parser)

## Output Format:
Respond with ONLY a JSON array. Use the EXACT entity IDs from the list above:
[
{{
"source_id": "<exact-id-from-list>",
"relation_type": "<relationship_type>",
"target_id": "<exact-id-from-list>",
"confidence": <0.0-1.0>
}}
]

If no relationships are found, return an empty array: []

EXAMPLE: If entities are "Migration Guide (407b9c0c2048)" and "Before State (bc4612fc3d87)",
a valid relation would be: {{"source_id": "407b9c0c2048", "relation_type": "describes", "target_id": "bc4612fc3d87", "confidence": 0.9}}
"""
436
+
437
+
438
class DocumentClassifier:
    """Assigns a document-type label to documents via an LLM chain.

    Builds a prompt | llm | JSON-parser chain from DOCUMENT_CLASSIFIER_PROMPT
    and returns the 'doc_type' field of the parsed reply, defaulting to
    'other' whenever classification fails for any reason.
    """

    def __init__(self, llm: Any):
        self.llm = llm
        self.prompt = ChatPromptTemplate.from_template(DOCUMENT_CLASSIFIER_PROMPT)
        self.parser = JsonOutputParser()

    def classify(self, document: Document) -> str:
        """Classify a single document; returns 'other' on any failure."""
        try:
            # Truncate content and serialized metadata so the LLM request
            # stays small regardless of document size.
            payload = {
                "content": document.page_content[:3000],
                "metadata": json.dumps(document.metadata, default=str)[:500],
            }
            pipeline = self.prompt | self.llm | self.parser
            reply = pipeline.invoke(payload)
            return reply.get('doc_type', 'other')
        except Exception as e:
            # Classification is best-effort; log and fall back to 'other'.
            logger.warning(f"Classification failed: {e}")
            return 'other'

    def classify_batch(self, documents: List[Document]) -> List[str]:
        """Classify each document in order, returning one label per input."""
        return [self.classify(item) for item in documents]
466
+
467
+
468
class EntitySchemaDiscoverer:
    """Discovers entity and relation schemas from document samples using LLM.

    Feeds up to 20 truncated document samples through the schema-discovery
    prompt and returns the parsed JSON schema; falls back to a small built-in
    default schema when the LLM call or parsing fails.
    """

    def __init__(self, llm: Any):
        self.llm = llm
        self.prompt = ChatPromptTemplate.from_template(SCHEMA_DISCOVERY_PROMPT)
        self.parser = JsonOutputParser()

    def discover(self, documents: List[Document]) -> Dict[str, Any]:
        """
        Discover entity and relation types from document samples.

        Args:
            documents: Sample documents to analyze

        Returns:
            Schema dictionary with entity_types and relation_types
        """
        try:
            # Assemble up to 20 samples, each truncated to 500 chars and
            # tagged with its doc_type and originating toolkit.
            chunks = []
            for idx, doc in enumerate(documents[:20], start=1):
                snippet = doc.page_content[:500]
                dtype = doc.metadata.get('doc_type', 'unknown')
                origin = doc.metadata.get('source_toolkit', 'unknown')
                chunks.append(f"[Sample {idx} - {dtype} from {origin}]\n{snippet}\n")

            pipeline = self.prompt | self.llm | self.parser
            schema = pipeline.invoke({"samples": "\n---\n".join(chunks)})

            # Guarantee both top-level keys exist even if the LLM omitted one.
            schema.setdefault('entity_types', [])
            schema.setdefault('relation_types', [])
            return schema
        except Exception as e:
            logger.error(f"Schema discovery failed: {e}")
            return self._default_schema()

    def _default_schema(self) -> Dict[str, Any]:
        """Return a default schema as fallback."""
        # Minimal generic schema covering common code/product concepts.
        return {
            "entity_types": [
                {"name": "service", "description": "A software service or microservice", "properties": ["name", "description"]},
                {"name": "module", "description": "A code module or package", "properties": ["name", "path"]},
                {"name": "function", "description": "A function or method", "properties": ["name", "signature"]},
                {"name": "api", "description": "An API endpoint", "properties": ["name", "path", "method"]},
                {"name": "feature", "description": "A product feature", "properties": ["name", "description"]},
                {"name": "requirement", "description": "A requirement or user story", "properties": ["name", "description"]},
            ],
            "relation_types": [
                {"name": "depends_on", "description": "Dependency relationship", "source_types": ["*"], "target_types": ["*"]},
                {"name": "calls", "description": "Function/API call", "source_types": ["function", "service"], "target_types": ["function", "api"]},
                {"name": "implements", "description": "Implementation relationship", "source_types": ["module", "function"], "target_types": ["feature", "requirement"]},
                {"name": "contains", "description": "Containment relationship", "source_types": ["service", "module"], "target_types": ["module", "function"]},
            ]
        }
529
+
530
+
531
class EntityExtractor:
    """Extracts entities from documents using LLM.

    Wraps ENTITY_EXTRACTION_PROMPT | llm | JsonOutputParser. Single-document
    extraction retries on failure and raises RuntimeError once retries are
    exhausted; batch extraction can either propagate that error or skip the
    failing document (skip_on_error).
    """

    def __init__(self, llm: Any, embedding: Optional[Any] = None, max_retries: int = 3, retry_delay: float = 2.0):
        # NOTE(review): retry_delay is stored but extract() computes its own
        # delay (10 * attempt number); confirm whether retry_delay should be
        # wired into the backoff. embedding and _entity_cache are likewise
        # stored but not used anywhere in this class as shown here.
        self.llm = llm
        self.embedding = embedding
        self.prompt = ChatPromptTemplate.from_template(ENTITY_EXTRACTION_PROMPT)
        self.parser = JsonOutputParser()
        self._entity_cache: Dict[str, Dict] = {}
        self.max_retries = max_retries
        self.retry_delay = retry_delay

    def extract(
        self,
        document: Document,
        schema: Optional[Dict[str, Any]] = None
    ) -> List[Dict[str, Any]]:
        """
        Extract entities from a single document with retry logic.

        Args:
            document: Document to extract from
            schema: Optional schema to guide extraction

        Returns:
            List of extracted entities with line numbers for citations

        Raises:
            RuntimeError: when all max_retries attempts fail.
        """
        import time

        file_path = document.metadata.get('file_path', document.metadata.get('source', 'unknown'))
        source_toolkit = document.metadata.get('source_toolkit', 'filesystem')

        last_error = None
        for attempt in range(self.max_retries):
            try:
                content = document.page_content

                # Add line numbers to content for better extraction
                # (only the first 200 lines are sent to the LLM).
                lines = content.split('\n')
                numbered_content = '\n'.join(
                    f"{i+1:4d} | {line}"
                    for i, line in enumerate(lines[:200])  # Limit lines
                )

                # Build schema section listing allowed entity types, if any.
                schema_section = ""
                if schema and schema.get('entity_types'):
                    types_str = ", ".join([et['name'] for et in schema['entity_types']])
                    schema_section = f"Entity types to extract: {types_str}\n"
                    for et in schema['entity_types']:
                        schema_section += f"- {et['name']}: {et.get('description', '')}\n"

                chain = self.prompt | self.llm | self.parser
                result = chain.invoke({
                    "content": numbered_content,
                    "file_path": file_path,
                    "source_toolkit": source_toolkit,
                    "schema_section": schema_section
                })

                # The parser may yield a single object; normalize to a list.
                if not isinstance(result, list):
                    result = [result] if result else []

                # Track total lines in document for boundary checks
                total_lines = len(lines)

                # Add source tracking and normalize structure
                for entity in result:
                    entity['source_toolkit'] = source_toolkit
                    entity['file_path'] = file_path

                    # Ensure name is at top level
                    if 'name' not in entity and 'properties' in entity:
                        entity['name'] = entity['properties'].get('name', entity.get('id', 'unnamed'))

                    # Expand small line ranges to provide meaningful context
                    # Minimum span should be 3 lines
                    line_start = entity.get('line_start', 1)
                    line_end = entity.get('line_end', line_start)
                    span = line_end - line_start

                    if span < 2:  # Less than 3 lines of context
                        # Expand range symmetrically around the center
                        center = (line_start + line_end) // 2
                        # Add 2 lines on each side (for 5 line minimum),
                        # clamped to the document's bounds.
                        new_start = max(1, center - 2)
                        new_end = min(total_lines, center + 2)
                        entity['line_start'] = new_start
                        entity['line_end'] = new_end

                return result

            except Exception as e:
                last_error = e
                attempt_num = attempt + 1

                if attempt_num < self.max_retries:
                    # Linear backoff: 10s, 20s, ... between attempts.
                    delay = 10 * attempt_num
                    logger.warning(
                        f"Entity extraction failed for '{file_path}' (attempt {attempt_num}/{self.max_retries}): {e}. "
                        f"Retrying in {delay}s..."
                    )
                    time.sleep(delay)
                else:
                    logger.error(
                        f"Entity extraction failed for '{file_path}' after {self.max_retries} attempts: {e}"
                    )

        # All retries exhausted - raise exception to signal failure
        raise RuntimeError(
            f"Entity extraction failed for '{file_path}' after {self.max_retries} attempts: {last_error}"
        )

    def extract_batch(
        self,
        documents: List[Document],
        schema: Optional[Dict[str, Any]] = None,
        skip_on_error: bool = False
    ) -> Union[List[Dict[str, Any]], Tuple[List[Dict[str, Any]], List[str]]]:
        """
        Extract entities from multiple documents with deduplication.

        Args:
            documents: List of documents to extract from
            schema: Optional schema to guide extraction
            skip_on_error: If True, skip documents that fail extraction after retries
                           and return tuple of (entities, failed_file_paths).
                           If False (default), raise exception on first failure.

        Returns:
            If skip_on_error=False: List of extracted entities
            If skip_on_error=True: Tuple of (entities, failed_file_paths)
        """
        all_entities = []
        failed_docs = []

        for doc in documents:
            try:
                entities = self.extract(doc, schema)
                all_entities.extend(entities)
            except RuntimeError as e:
                # extract() raises RuntimeError only after exhausting retries.
                file_path = doc.metadata.get('file_path', doc.metadata.get('source', 'unknown'))
                if skip_on_error:
                    logger.warning(f"Skipping document '{file_path}' due to extraction failure: {e}")
                    failed_docs.append(file_path)
                else:
                    raise

        if failed_docs:
            logger.warning(f"Skipped {len(failed_docs)} documents due to extraction failures: {failed_docs[:5]}{'...' if len(failed_docs) > 5 else ''}")

        # Deduplicate
        deduped = self._deduplicate_entities(all_entities)

        # Return tuple when skip_on_error is enabled so caller can track failures
        if skip_on_error:
            return deduped, failed_docs

        return deduped

    def _deduplicate_entities(self, entities: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """
        Deduplicate entities using simple heuristics.

        Entities are considered duplicates when they share the same type and
        a case/separator-insensitive name; duplicates merge their properties
        into the first occurrence (first value wins on conflicts).

        For more complex deduplication, LLM-based approach can be used.
        """
        seen = {}  # (type, normalized_name) -> entity
        deduped = []

        for entity in entities:
            etype = entity.get('type', 'unknown')
            # NOTE(review): reads the name from properties first, falling back
            # to id — not the top-level 'name' that extract() normalizes.
            name = entity.get('properties', {}).get('name', entity.get('id', ''))

            # Normalize name: lowercase and treat _ and - as spaces.
            normalized = name.lower().strip().replace('_', ' ').replace('-', ' ')
            key = (etype, normalized)

            if key in seen:
                # Merge properties from the duplicate, keeping existing values.
                existing = seen[key]
                for prop_key, prop_value in entity.get('properties', {}).items():
                    if prop_key not in existing.get('properties', {}):
                        existing.setdefault('properties', {})[prop_key] = prop_value
            else:
                seen[key] = entity
                deduped.append(entity)

        return deduped
719
+
720
+
721
class RelationExtractor:
    """Extracts relationships between entities using LLM.

    Sends document content plus a formatted entity list through
    RELATION_EXTRACTION_PROMPT, then resolves the IDs the LLM returns back to
    real entity IDs (exact, normalized, and fuzzy word-overlap matching),
    filters by confidence, and retries with exponential backoff on failure.
    Relation extraction is best-effort: exhausted retries yield [].
    """

    def __init__(self, llm: Any, max_retries: int = 3, retry_delay: float = 2.0):
        # NOTE(review): retry_delay is stored but extract() uses 10 ** attempt
        # for its backoff; confirm whether retry_delay should participate.
        self.llm = llm
        self.prompt = ChatPromptTemplate.from_template(RELATION_EXTRACTION_PROMPT)
        self.parser = JsonOutputParser()
        self.max_retries = max_retries
        self.retry_delay = retry_delay

    def extract(
        self,
        document: Document,
        entities: List[Dict[str, Any]],
        schema: Optional[Dict[str, Any]] = None,
        confidence_threshold: float = 0.5,
        all_entities: Optional[List[Dict[str, Any]]] = None
    ) -> List[Dict[str, Any]]:
        """
        Extract relationships from a document given known entities with retry logic.

        Args:
            document: Document to analyze
            entities: Entities known to be in this document (for LLM context)
            schema: Optional schema to guide extraction
            confidence_threshold: Minimum confidence to include
            all_entities: All entities in graph (for ID resolution across sources)

        Returns:
            List of extracted relations (empty when no entities are given or
            extraction ultimately fails)
        """
        import time

        if not entities:
            return []

        # Use all_entities for ID resolution if provided, otherwise just doc entities
        entities_for_lookup = all_entities if all_entities else entities

        file_path = document.metadata.get('file_path', document.metadata.get('source', 'unknown'))
        last_error = None

        for attempt in range(self.max_retries):
            try:
                # Only the first 4000 chars of the document are sent.
                content = document.page_content[:4000]

                # Filter entities from this document
                doc_id = document.metadata.get('source')
                doc_entities = [e for e in entities if e.get('source_doc_id') == doc_id]

                if not doc_entities:
                    doc_entities = entities[:20]  # Fall back to first N entities

                # Format entities with ID first for clarity: "ID -> Name (type)"
                # (capped at 30 entities to bound prompt size).
                entities_list = "\n".join([
                    f"- {e.get('id')} -> {e.get('name', 'unnamed')} ({e.get('type', 'unknown')})"
                    for e in doc_entities[:30]
                ])

                # Build schema section listing allowed relation types, if any.
                schema_section = ""
                if schema and schema.get('relation_types'):
                    types_str = ", ".join([rt['name'] for rt in schema['relation_types']])
                    schema_section = f"## Relationship types: {types_str}\n"
                    for rt in schema['relation_types']:
                        schema_section += f"- {rt['name']}: {rt.get('description', '')}\n"

                chain = self.prompt | self.llm | self.parser
                result = chain.invoke({
                    "content": content,
                    "entities_list": entities_list,
                    "schema_section": schema_section
                })

                # The parser may yield a single object; normalize to a list.
                if not isinstance(result, list):
                    result = [result] if result else []

                # Build lookup tables from ALL entities (enables cross-source resolution)
                # LLMs often use names instead of hex IDs, so we map both
                id_lookup = {}
                name_to_id = {}  # For fuzzy matching fallback

                for e in entities_for_lookup:
                    entity_id = e.get('id', '')
                    entity_name = e.get('name', '')
                    entity_type = e.get('type', '')

                    if not entity_id:
                        continue

                    # Direct ID match
                    id_lookup[entity_id] = entity_id
                    id_lookup[entity_id.lower()] = entity_id

                    # Name-based lookups (what LLM often returns)
                    if entity_name:
                        # Exact name
                        id_lookup[entity_name] = entity_id
                        id_lookup[entity_name.lower()] = entity_id
                        # snake_case version
                        snake_name = entity_name.lower().replace(' ', '_').replace('-', '_').replace(':', '_')
                        id_lookup[snake_name] = entity_id
                        # Remove articles/filler words for matching
                        short_snake = snake_name.replace('_a_', '_').replace('_an_', '_').replace('_the_', '_').replace('_your_', '_').replace('_my_', '_')
                        id_lookup[short_snake] = entity_id
                        # type:name format
                        type_name = f"{entity_type}:{snake_name}"
                        id_lookup[type_name] = entity_id
                        id_lookup[type_name.lower()] = entity_id
                        # Store for fuzzy matching with word sets
                        words = set(snake_name.split('_'))
                        name_to_id[snake_name] = (entity_id, words)

                def resolve_id(ref: str) -> Optional[str]:
                    """Resolve an entity reference to its actual ID.

                    Tries, in order: exact lookup, lowercase, snake_case,
                    substring containment, then best word-overlap (>= 2
                    shared words). Returns None when nothing matches.
                    """
                    if not ref:
                        return None
                    # Direct lookup
                    if ref in id_lookup:
                        return id_lookup[ref]
                    ref_lower = ref.lower()
                    if ref_lower in id_lookup:
                        return id_lookup[ref_lower]
                    # Snake case the reference
                    ref_snake = ref_lower.replace(' ', '_').replace('-', '_').replace(':', '_')
                    if ref_snake in id_lookup:
                        return id_lookup[ref_snake]

                    # Fuzzy matching: substring or word overlap
                    ref_words = set(ref_snake.split('_'))
                    best_match = None
                    best_score = 0

                    for name, (eid, name_words) in name_to_id.items():
                        # Substring match (returns on first hit; dict order)
                        if ref_snake in name or name in ref_snake:
                            return eid

                        # Word overlap score
                        overlap = len(ref_words & name_words)
                        if overlap >= 2 and overlap > best_score:
                            # At least 2 words must match
                            best_score = overlap
                            best_match = eid

                    return best_match

                # Resolve relations to actual entity IDs
                resolved = []
                logger.info(f"Relation extraction got {len(result)} raw relations from LLM")
                logger.info(f"ID lookup has {len(id_lookup)} entries, name_to_id has {len(name_to_id)} entries")

                for r in result:
                    source = r.get('source_id', '')
                    target = r.get('target_id', '')

                    # Try to resolve source and target
                    resolved_source = resolve_id(source)
                    resolved_target = resolve_id(target)

                    logger.debug(f"Resolving: {source} -> {resolved_source}, {target} -> {resolved_target}")

                    if resolved_source and resolved_target:
                        r['source_id'] = resolved_source
                        r['target_id'] = resolved_target
                        resolved.append(r)
                    else:
                        # Drop relations whose endpoints cannot be mapped.
                        logger.warning(f"Could not resolve relation: {source} ({resolved_source}) -> {target} ({resolved_target})")

                logger.info(f"Resolved {len(resolved)} relations successfully")

                # Filter by confidence
                filtered = [
                    r for r in resolved
                    if r.get('confidence', 0) >= confidence_threshold
                ]

                return filtered

            except Exception as e:
                last_error = e
                attempt_num = attempt + 1

                if attempt_num < self.max_retries:
                    # Exponential backoff: 10^0=1s, 10^1=10s, 10^2=100s
                    delay = 10 ** attempt
                    logger.warning(
                        f"Relation extraction failed for '{file_path}' (attempt {attempt_num}/{self.max_retries}): {e}. "
                        f"Retrying in {delay}s..."
                    )
                    time.sleep(delay)
                else:
                    logger.warning(
                        f"Relation extraction failed for '{file_path}' after {self.max_retries} attempts: {e}. Skipping."
                    )

        # Return empty list on failure (relations are optional)
        return []

    def extract_batch(
        self,
        documents: List[Document],
        entities: List[Dict[str, Any]],
        schema: Optional[Dict[str, Any]] = None,
        confidence_threshold: float = 0.5
    ) -> List[Dict[str, Any]]:
        """Extract relations from multiple documents and deduplicate them.

        Args:
            documents: Documents to analyze
            entities: Entities available for relation endpoints
            schema: Optional schema to guide extraction
            confidence_threshold: Minimum confidence to include

        Returns:
            Deduplicated list of relations across all documents
        """
        all_relations = []

        for doc in documents:
            relations = self.extract(
                doc,
                entities,
                schema=schema,
                confidence_threshold=confidence_threshold
            )
            all_relations.extend(relations)

        # Deduplicate relations
        return self._deduplicate_relations(all_relations)

    def _deduplicate_relations(self, relations: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """Deduplicate relations by source-type-target key.

        When the same (source_id, relation_type, target_id) triple appears
        more than once, the occurrence with the higher confidence wins.
        """
        seen = {}
        deduped = []

        for rel in relations:
            key = (
                rel.get('source_id'),
                rel.get('relation_type'),
                rel.get('target_id')
            )

            if key not in seen:
                seen[key] = rel
                deduped.append(rel)
            else:
                # Keep higher confidence
                if rel.get('confidence', 0) > seen[key].get('confidence', 0):
                    seen[key] = rel
                    # Update in deduped list
                    for i, r in enumerate(deduped):
                        if (r.get('source_id'), r.get('relation_type'), r.get('target_id')) == key:
                            deduped[i] = rel
                            break

        return deduped
968
+
969
+
970
# ============================================================================
# FACT EXTRACTION (for non-code files)
# ============================================================================

# Prompt for extracting structured facts from text/documentation sources
# (used by FactExtractor.extract). Template variables: {content} (with line
# numbers), {file_path}, {source_toolkit}. The six fact types mirror
# CANONICAL_FACT_TYPES below. Expected reply: a JSON array of fact objects
# with id, fact_type, title, properties, line range, and confidence, or [].
FACT_EXTRACTION_PROMPT = """Extract factual information from the following document.

## Document content:
---
{content}
---

## Document metadata:
- File: {file_path}
- Source: {source_toolkit}

## Canonical Fact Types:
Extract facts using these canonical types:

1. **decision** - Architectural or business decisions
- Properties: title, rationale, alternatives, outcome, date, stakeholders
- Example: "We chose PostgreSQL over MongoDB for transactional consistency"

2. **requirement** - Functional or non-functional requirements
- Properties: title, description, priority, status, acceptance_criteria
- Example: "System must handle 1000 concurrent users"

3. **definition** - Definitions of terms, concepts, or standards
- Properties: term, definition, context, synonyms
- Example: "A 'tenant' refers to an organization using our SaaS platform"

4. **date** - Important dates, deadlines, milestones
- Properties: event, date, description, status
- Example: "MVP launch scheduled for Q2 2024"

5. **reference** - External references, links, citations
- Properties: title, url, description, type (spec, documentation, standard)
- Example: "OAuth 2.0 specification: RFC 6749"

6. **contact** - People, teams, ownership information
- Properties: name, role, team, email, responsibilities
- Example: "John Smith is the product owner for the payments module"

## Instructions:
1. Identify facts from the document that match the canonical types
2. Each fact must have a unique ID (short hash-like string)
3. Include confidence score (0.0-1.0) based on how explicit the fact is
4. Include line_start and line_end for citation support

## Output Format:
Respond with ONLY a JSON array:
[
{{
"id": "<unique_id>",
"fact_type": "<decision|requirement|definition|date|reference|contact>",
"title": "<brief title>",
"properties": {{
"<property_name>": "<value>"
}},
"line_start": <line_number>,
"line_end": <line_number>,
"confidence": <0.0-1.0>
}}
]

If no facts are found, return an empty array: []
"""
1036
+
1037
# Canonical fact types for validation. Keys are the fact_type values the
# FACT_EXTRACTION_PROMPT asks the LLM to use; each entry documents the
# expected property names for that type.
CANONICAL_FACT_TYPES = {
    "decision": {
        "description": "Architectural or business decisions",
        "properties": ["title", "rationale", "alternatives", "outcome", "date", "stakeholders"]
    },
    "requirement": {
        "description": "Functional or non-functional requirements",
        "properties": ["title", "description", "priority", "status", "acceptance_criteria"]
    },
    "definition": {
        "description": "Definitions of terms, concepts, or standards",
        "properties": ["term", "definition", "context", "synonyms"]
    },
    "date": {
        "description": "Important dates, deadlines, milestones",
        "properties": ["event", "date", "description", "status"]
    },
    "reference": {
        "description": "External references, links, citations",
        "properties": ["title", "url", "description", "type"]
    },
    "contact": {
        "description": "People, teams, ownership information",
        "properties": ["name", "role", "team", "email", "responsibilities"]
    }
}
1064
+
1065
# Code-specific fact types for semantic code understanding. These mirror the
# fact_type values used in CODE_FACT_EXTRACTION_PROMPT (algorithm, behavior,
# validation, dependency, error_handling).
CODE_FACT_TYPES = {
    "algorithm": {
        "description": "Algorithm or pattern used in the code",
        "properties": ["name", "description", "complexity", "use_case"]
    },
    "behavior": {
        "description": "What the code does - actions, side effects, I/O",
        "properties": ["action", "description", "inputs", "outputs", "side_effects"]
    },
    "validation": {
        "description": "Input validation, checks, guards, assertions",
        "properties": ["what", "how", "error_handling"]
    },
    "dependency": {
        "description": "External service calls, API usage, library dependencies",
        "properties": ["service", "operation", "purpose", "error_handling"]
    },
    "error_handling": {
        "description": "How errors are handled - retry, fallback, logging",
        "properties": ["strategy", "description", "fallback", "logging"]
    }
}
1088
+
1089
# Prompt for extracting semantic facts from source-code files (used by
# FactExtractor's code path). Template variables: {content} (code with line
# numbers) and {file_path}. Fact types mirror CODE_FACT_TYPES. Expected
# reply: a JSON array of subject/predicate/object fact triples with line
# range and confidence, or [].
CODE_FACT_EXTRACTION_PROMPT = """Analyze the following code and extract semantic facts about what it does.

Focus on understanding the CODE'S PURPOSE AND BEHAVIOR, not its structure.
Parsers already extract structure (classes, functions, imports). You extract MEANING.

## Code content (with line numbers):
---
{content}
---

Source file: {file_path}

## Fact types to extract:
1. **algorithm** - What algorithm or design pattern is used?
- Example: "Uses binary search for O(log n) lookup"
- Example: "Implements observer pattern for event handling"

2. **behavior** - What does this code DO? (actions, I/O, side effects)
- Example: "Sends email notification when payment fails"
- Example: "Writes audit log for every database mutation"
- Example: "Caches API responses for 5 minutes"

3. **validation** - What validation or checks are performed?
- Example: "Validates email format using RFC 5322 regex"
- Example: "Checks user permissions before allowing access"

4. **dependency** - What external services or APIs are called?
- Example: "Calls Stripe API for payment processing"
- Example: "Queries PostgreSQL for user data"

5. **error_handling** - How are errors handled?
- Example: "Retries failed requests 3 times with exponential backoff"
- Example: "Falls back to cached data when API is unavailable"

## Instructions:
- Extract 1-5 most important facts about what this code DOES
- Focus on business logic and behavior, not syntax
- Include line numbers where the behavior is implemented
- Be specific and actionable

## Output Format:
Respond with ONLY a JSON array:
[
{{
"fact_type": "<algorithm|behavior|validation|dependency|error_handling>",
"subject": "<what is being described>",
"predicate": "<action or relationship>",
"object": "<target or outcome>",
"line_start": <start_line>,
"line_end": <end_line>,
"confidence": <0.0-1.0>
}}
]

If no meaningful facts can be extracted, return an empty array: []
"""
1145
+
1146
+
1147
class FactExtractor:
    """
    Extracts structured facts from documents using a lightweight LLM.

    Two extraction modes:
    - extract(): For text/docs - extracts decisions, requirements, definitions, etc.
    - extract_code(): For code - extracts algorithms, behaviors, validations, etc.

    Both modes share one retry/validation pipeline (``_extract_one``); they
    differ only in prompt, allowed fact types, context line limit, and labels.
    """

    # Extensions treated as source code; such files should go through
    # structural parsers instead of the text fact prompt.
    _CODE_EXTENSIONS = frozenset({
        '.py', '.js', '.ts', '.jsx', '.tsx', '.java', '.kt', '.cs',
        '.go', '.rs', '.swift', '.c', '.cpp', '.h', '.hpp', '.rb',
        '.php', '.scala', '.clj', '.ex', '.exs', '.erl', '.hs',
    })

    def __init__(self, llm: Any, max_retries: int = 3, retry_delay: float = 2.0):
        """
        Args:
            llm: Chat model used for extraction.
            max_retries: Attempts per document before giving up.
            retry_delay: Stored on the instance; NOTE(review): the retry loop
                currently uses a hard-coded ``10 * attempt`` second backoff and
                never reads this value - confirm intent before wiring it in.
        """
        self.llm = llm
        self.prompt = ChatPromptTemplate.from_template(FACT_EXTRACTION_PROMPT)
        self.code_prompt = ChatPromptTemplate.from_template(CODE_FACT_EXTRACTION_PROMPT)
        self.parser = JsonOutputParser()
        self.max_retries = max_retries
        self.retry_delay = retry_delay

    def extract(
        self,
        document: "Document",
        fact_types: Optional[List[str]] = None
    ) -> List[Dict[str, Any]]:
        """
        Extract facts from a single document.

        Args:
            document: Document to extract from
            fact_types: Optional filter for specific fact types

        Returns:
            List of extracted facts (empty if every attempt failed)
        """
        return self._extract_one(document, fact_types, code=False)

    def extract_code(
        self,
        document: "Document",
        fact_types: Optional[List[str]] = None
    ) -> List[Dict[str, Any]]:
        """
        Extract semantic facts from code - what the code DOES.

        Args:
            document: Code document to extract from
            fact_types: Optional filter for specific fact types

        Returns:
            List of extracted code facts (empty if every attempt failed)
        """
        return self._extract_one(document, fact_types, code=True)

    def _extract_one(
        self,
        document: "Document",
        fact_types: Optional[List[str]],
        *,
        code: bool
    ) -> List[Dict[str, Any]]:
        """
        Shared extraction pipeline for text and code modes.

        Runs prompt -> llm -> JSON parser with retries (linear 10s * attempt
        backoff), then validates and enriches the returned facts.
        Returns [] if all attempts fail.
        """
        import time

        # Mode-specific knobs; everything below them is identical.
        if code:
            prompt = self.code_prompt
            allowed_types = CODE_FACT_TYPES
            line_limit = 200       # Limit for LLM context
            id_field = 'subject'   # field used to derive a stable fact id
            source_label = 'code_fact_extractor'
            type_label = 'code '
            log_label = 'Code fact extraction'
        else:
            prompt = self.prompt
            allowed_types = CANONICAL_FACT_TYPES
            line_limit = 300
            id_field = 'title'
            source_label = 'fact_extractor'
            type_label = ''
            log_label = 'Fact extraction'

        file_path = document.metadata.get('file_path', document.metadata.get('source', 'unknown'))
        source_toolkit = document.metadata.get('source_toolkit', 'filesystem')

        for attempt in range(1, self.max_retries + 1):
            try:
                # Add line numbers for citation support.
                numbered_content = self._number_lines(document.page_content, line_limit)

                payload = {"content": numbered_content, "file_path": file_path}
                if not code:
                    # Only the text prompt templates the originating toolkit.
                    payload["source_toolkit"] = source_toolkit

                chain = prompt | self.llm | self.parser
                result = chain.invoke(payload)

                # The parser may hand back a single object; normalize to a list.
                if not isinstance(result, list):
                    result = [result] if result else []

                return self._validate_facts(
                    result, fact_types, allowed_types,
                    file_path=file_path,
                    source_toolkit=source_toolkit,
                    id_field=id_field,
                    source_label=source_label,
                    type_label=type_label,
                )

            except Exception as e:
                if attempt < self.max_retries:
                    delay = 10 * attempt  # linear backoff: 10s, 20s, ...
                    logger.warning(
                        f"{log_label} failed for '{file_path}' (attempt {attempt}/{self.max_retries}): {e}. "
                        f"Retrying in {delay}s..."
                    )
                    time.sleep(delay)
                else:
                    logger.warning(
                        f"{log_label} failed for '{file_path}' after {self.max_retries} attempts: {e}. Skipping."
                    )

        return []

    @staticmethod
    def _number_lines(content: str, limit: int) -> str:
        """Prefix each line with a 1-based number (citation support),
        truncated to *limit* lines to stay within LLM context."""
        lines = content.split('\n')
        return '\n'.join(
            f"{i + 1:4d} | {line}"
            for i, line in enumerate(lines[:limit])
        )

    def _validate_facts(
        self,
        raw_facts: List[Dict[str, Any]],
        fact_types: Optional[List[str]],
        allowed_types: Dict[str, Any],
        *,
        file_path: str,
        source_toolkit: str,
        id_field: str,
        source_label: str,
        type_label: str,
    ) -> List[Dict[str, Any]]:
        """Keep only canonical/requested fact types; enrich each fact with a
        stable id, source tracking fields, and a default confidence."""
        validated_facts = []
        for fact in raw_facts:
            fact_type = fact.get('fact_type', '').lower()

            # Skip anything the LLM invented outside the canonical taxonomy.
            if fact_type not in allowed_types:
                logger.warning(f"Skipping non-canonical {type_label}fact type: {fact_type}")
                continue

            # Filter by requested types if specified.
            if fact_types and fact_type not in fact_types:
                continue

            # Derive a stable id from path + identifying field + type.
            if 'id' not in fact:
                fact['id'] = hashlib.md5(
                    f"{file_path}:{fact.get(id_field, '')}:{fact_type}".encode()
                ).hexdigest()[:12]

            # Source tracking.
            fact['source_toolkit'] = source_toolkit
            fact['file_path'] = file_path
            fact['source'] = source_label

            # Normalize confidence.
            fact.setdefault('confidence', 0.7)

            validated_facts.append(fact)

        return validated_facts

    def extract_batch(
        self,
        documents: List["Document"],
        fact_types: Optional[List[str]] = None
    ) -> List[Dict[str, Any]]:
        """
        Extract facts from multiple documents with deduplication.

        Args:
            documents: List of documents to extract from
            fact_types: Optional filter for specific fact types

        Returns:
            List of extracted facts (deduplicated)
        """
        all_facts: List[Dict[str, Any]] = []
        for doc in documents:
            all_facts.extend(self.extract(doc, fact_types))
        return self._deduplicate_facts(all_facts)

    def extract_batch_code(
        self,
        documents: List["Document"],
        fact_types: Optional[List[str]] = None
    ) -> List[Dict[str, Any]]:
        """
        Extract facts from multiple code documents with deduplication.

        Processes code documents using the code-specific prompt and fact
        types (algorithm, behavior, validation, etc.).

        Args:
            documents: List of code documents to extract from
            fact_types: Optional filter for specific fact types

        Returns:
            List of extracted facts (deduplicated)
        """
        all_facts: List[Dict[str, Any]] = []
        for doc in documents:
            all_facts.extend(self.extract_code(doc, fact_types))
        return self._deduplicate_facts(all_facts)

    def _is_code_file(self, file_path: str) -> bool:
        """Check if file is a code file that should use parsers instead."""
        if not file_path:
            return False
        import os
        _, ext = os.path.splitext(file_path.lower())
        return ext in self._CODE_EXTENSIONS

    def _deduplicate_facts(self, facts: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """
        Deduplicate facts using title and type similarity.

        File-level deduplication: same fact type + similar (normalized) title
        means duplicate. On a duplicate the higher-confidence fact wins and
        the properties of both are merged (the winner's values take
        precedence).

        Fix: the winning fact's position in the output list is tracked
        directly. The previous implementation searched the output list using
        only lower()/strip() normalization while the lookup key also removed
        stop words, so a higher-confidence duplicate whose title differed by
        stop words was recorded in the lookup table but never replaced in the
        returned list.
        """
        index_by_key: Dict[tuple, int] = {}  # (fact_type, normalized_title) -> index in deduped
        deduped: List[Dict[str, Any]] = []

        for fact in facts:
            fact_type = fact.get('fact_type', 'unknown')
            title = fact.get('title', fact.get('id', ''))

            # Normalize title for comparison: lowercase, strip, drop common
            # stop words, collapse whitespace.
            normalized = title.lower().strip()
            for word in ('the', 'a', 'an', 'is', 'are', 'was', 'were'):
                normalized = normalized.replace(f' {word} ', ' ')
            normalized = ' '.join(normalized.split())

            key = (fact_type, normalized)
            idx = index_by_key.get(key)

            if idx is None:
                # First time we see this (type, title) pair.
                index_by_key[key] = len(deduped)
                deduped.append(fact)
            elif fact.get('confidence', 0) > deduped[idx].get('confidence', 0):
                # New fact wins; carry over any properties it lacks.
                existing = deduped[idx]
                for prop_key, prop_value in existing.get('properties', {}).items():
                    fact.setdefault('properties', {}).setdefault(prop_key, prop_value)
                deduped[idx] = fact
            else:
                # Existing fact wins; merge in any new properties.
                existing = deduped[idx]
                for prop_key, prop_value in fact.get('properties', {}).items():
                    existing.setdefault('properties', {}).setdefault(prop_key, prop_value)

        return deduped

    def get_fact_type_info(self, fact_type: str, code: bool = False) -> Optional[Dict[str, Any]]:
        """Get information about a canonical fact type."""
        types = CODE_FACT_TYPES if code else CANONICAL_FACT_TYPES
        return types.get(fact_type)

    @staticmethod
    def get_canonical_types(code: bool = False) -> List[str]:
        """Get list of all canonical fact types."""
        types = CODE_FACT_TYPES if code else CANONICAL_FACT_TYPES
        return list(types.keys())