alita-sdk 0.3.257__py3-none-any.whl → 0.3.562__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (278) hide show
  1. alita_sdk/cli/__init__.py +10 -0
  2. alita_sdk/cli/__main__.py +17 -0
  3. alita_sdk/cli/agent/__init__.py +5 -0
  4. alita_sdk/cli/agent/default.py +258 -0
  5. alita_sdk/cli/agent_executor.py +155 -0
  6. alita_sdk/cli/agent_loader.py +215 -0
  7. alita_sdk/cli/agent_ui.py +228 -0
  8. alita_sdk/cli/agents.py +3601 -0
  9. alita_sdk/cli/callbacks.py +647 -0
  10. alita_sdk/cli/cli.py +168 -0
  11. alita_sdk/cli/config.py +306 -0
  12. alita_sdk/cli/context/__init__.py +30 -0
  13. alita_sdk/cli/context/cleanup.py +198 -0
  14. alita_sdk/cli/context/manager.py +731 -0
  15. alita_sdk/cli/context/message.py +285 -0
  16. alita_sdk/cli/context/strategies.py +289 -0
  17. alita_sdk/cli/context/token_estimation.py +127 -0
  18. alita_sdk/cli/formatting.py +182 -0
  19. alita_sdk/cli/input_handler.py +419 -0
  20. alita_sdk/cli/inventory.py +1073 -0
  21. alita_sdk/cli/mcp_loader.py +315 -0
  22. alita_sdk/cli/toolkit.py +327 -0
  23. alita_sdk/cli/toolkit_loader.py +85 -0
  24. alita_sdk/cli/tools/__init__.py +43 -0
  25. alita_sdk/cli/tools/approval.py +224 -0
  26. alita_sdk/cli/tools/filesystem.py +1751 -0
  27. alita_sdk/cli/tools/planning.py +389 -0
  28. alita_sdk/cli/tools/terminal.py +414 -0
  29. alita_sdk/community/__init__.py +72 -12
  30. alita_sdk/community/inventory/__init__.py +236 -0
  31. alita_sdk/community/inventory/config.py +257 -0
  32. alita_sdk/community/inventory/enrichment.py +2137 -0
  33. alita_sdk/community/inventory/extractors.py +1469 -0
  34. alita_sdk/community/inventory/ingestion.py +3172 -0
  35. alita_sdk/community/inventory/knowledge_graph.py +1457 -0
  36. alita_sdk/community/inventory/parsers/__init__.py +218 -0
  37. alita_sdk/community/inventory/parsers/base.py +295 -0
  38. alita_sdk/community/inventory/parsers/csharp_parser.py +907 -0
  39. alita_sdk/community/inventory/parsers/go_parser.py +851 -0
  40. alita_sdk/community/inventory/parsers/html_parser.py +389 -0
  41. alita_sdk/community/inventory/parsers/java_parser.py +593 -0
  42. alita_sdk/community/inventory/parsers/javascript_parser.py +629 -0
  43. alita_sdk/community/inventory/parsers/kotlin_parser.py +768 -0
  44. alita_sdk/community/inventory/parsers/markdown_parser.py +362 -0
  45. alita_sdk/community/inventory/parsers/python_parser.py +604 -0
  46. alita_sdk/community/inventory/parsers/rust_parser.py +858 -0
  47. alita_sdk/community/inventory/parsers/swift_parser.py +832 -0
  48. alita_sdk/community/inventory/parsers/text_parser.py +322 -0
  49. alita_sdk/community/inventory/parsers/yaml_parser.py +370 -0
  50. alita_sdk/community/inventory/patterns/__init__.py +61 -0
  51. alita_sdk/community/inventory/patterns/ast_adapter.py +380 -0
  52. alita_sdk/community/inventory/patterns/loader.py +348 -0
  53. alita_sdk/community/inventory/patterns/registry.py +198 -0
  54. alita_sdk/community/inventory/presets.py +535 -0
  55. alita_sdk/community/inventory/retrieval.py +1403 -0
  56. alita_sdk/community/inventory/toolkit.py +173 -0
  57. alita_sdk/community/inventory/toolkit_utils.py +176 -0
  58. alita_sdk/community/inventory/visualize.py +1370 -0
  59. alita_sdk/configurations/__init__.py +11 -0
  60. alita_sdk/configurations/ado.py +148 -2
  61. alita_sdk/configurations/azure_search.py +1 -1
  62. alita_sdk/configurations/bigquery.py +1 -1
  63. alita_sdk/configurations/bitbucket.py +94 -2
  64. alita_sdk/configurations/browser.py +18 -0
  65. alita_sdk/configurations/carrier.py +19 -0
  66. alita_sdk/configurations/confluence.py +130 -1
  67. alita_sdk/configurations/delta_lake.py +1 -1
  68. alita_sdk/configurations/figma.py +76 -5
  69. alita_sdk/configurations/github.py +65 -1
  70. alita_sdk/configurations/gitlab.py +81 -0
  71. alita_sdk/configurations/google_places.py +17 -0
  72. alita_sdk/configurations/jira.py +103 -0
  73. alita_sdk/configurations/openapi.py +111 -0
  74. alita_sdk/configurations/postman.py +1 -1
  75. alita_sdk/configurations/qtest.py +72 -3
  76. alita_sdk/configurations/report_portal.py +115 -0
  77. alita_sdk/configurations/salesforce.py +19 -0
  78. alita_sdk/configurations/service_now.py +1 -12
  79. alita_sdk/configurations/sharepoint.py +167 -0
  80. alita_sdk/configurations/sonar.py +18 -0
  81. alita_sdk/configurations/sql.py +20 -0
  82. alita_sdk/configurations/testio.py +101 -0
  83. alita_sdk/configurations/testrail.py +88 -0
  84. alita_sdk/configurations/xray.py +94 -1
  85. alita_sdk/configurations/zephyr_enterprise.py +94 -1
  86. alita_sdk/configurations/zephyr_essential.py +95 -0
  87. alita_sdk/runtime/clients/artifact.py +21 -4
  88. alita_sdk/runtime/clients/client.py +458 -67
  89. alita_sdk/runtime/clients/mcp_discovery.py +342 -0
  90. alita_sdk/runtime/clients/mcp_manager.py +262 -0
  91. alita_sdk/runtime/clients/sandbox_client.py +352 -0
  92. alita_sdk/runtime/langchain/_constants_bkup.py +1318 -0
  93. alita_sdk/runtime/langchain/assistant.py +183 -43
  94. alita_sdk/runtime/langchain/constants.py +647 -1
  95. alita_sdk/runtime/langchain/document_loaders/AlitaDocxMammothLoader.py +315 -3
  96. alita_sdk/runtime/langchain/document_loaders/AlitaExcelLoader.py +209 -31
  97. alita_sdk/runtime/langchain/document_loaders/AlitaImageLoader.py +1 -1
  98. alita_sdk/runtime/langchain/document_loaders/AlitaJSONLinesLoader.py +77 -0
  99. alita_sdk/runtime/langchain/document_loaders/AlitaJSONLoader.py +10 -3
  100. alita_sdk/runtime/langchain/document_loaders/AlitaMarkdownLoader.py +66 -0
  101. alita_sdk/runtime/langchain/document_loaders/AlitaPDFLoader.py +79 -10
  102. alita_sdk/runtime/langchain/document_loaders/AlitaPowerPointLoader.py +52 -15
  103. alita_sdk/runtime/langchain/document_loaders/AlitaPythonLoader.py +9 -0
  104. alita_sdk/runtime/langchain/document_loaders/AlitaTableLoader.py +1 -4
  105. alita_sdk/runtime/langchain/document_loaders/AlitaTextLoader.py +15 -2
  106. alita_sdk/runtime/langchain/document_loaders/ImageParser.py +30 -0
  107. alita_sdk/runtime/langchain/document_loaders/constants.py +189 -41
  108. alita_sdk/runtime/langchain/interfaces/llm_processor.py +4 -2
  109. alita_sdk/runtime/langchain/langraph_agent.py +407 -92
  110. alita_sdk/runtime/langchain/utils.py +102 -8
  111. alita_sdk/runtime/llms/preloaded.py +2 -6
  112. alita_sdk/runtime/models/mcp_models.py +61 -0
  113. alita_sdk/runtime/skills/__init__.py +91 -0
  114. alita_sdk/runtime/skills/callbacks.py +498 -0
  115. alita_sdk/runtime/skills/discovery.py +540 -0
  116. alita_sdk/runtime/skills/executor.py +610 -0
  117. alita_sdk/runtime/skills/input_builder.py +371 -0
  118. alita_sdk/runtime/skills/models.py +330 -0
  119. alita_sdk/runtime/skills/registry.py +355 -0
  120. alita_sdk/runtime/skills/skill_runner.py +330 -0
  121. alita_sdk/runtime/toolkits/__init__.py +28 -0
  122. alita_sdk/runtime/toolkits/application.py +14 -4
  123. alita_sdk/runtime/toolkits/artifact.py +24 -9
  124. alita_sdk/runtime/toolkits/datasource.py +13 -6
  125. alita_sdk/runtime/toolkits/mcp.py +780 -0
  126. alita_sdk/runtime/toolkits/planning.py +178 -0
  127. alita_sdk/runtime/toolkits/skill_router.py +238 -0
  128. alita_sdk/runtime/toolkits/subgraph.py +11 -6
  129. alita_sdk/runtime/toolkits/tools.py +314 -70
  130. alita_sdk/runtime/toolkits/vectorstore.py +11 -5
  131. alita_sdk/runtime/tools/__init__.py +24 -0
  132. alita_sdk/runtime/tools/application.py +16 -4
  133. alita_sdk/runtime/tools/artifact.py +367 -33
  134. alita_sdk/runtime/tools/data_analysis.py +183 -0
  135. alita_sdk/runtime/tools/function.py +100 -4
  136. alita_sdk/runtime/tools/graph.py +81 -0
  137. alita_sdk/runtime/tools/image_generation.py +218 -0
  138. alita_sdk/runtime/tools/llm.py +1013 -177
  139. alita_sdk/runtime/tools/loop.py +3 -1
  140. alita_sdk/runtime/tools/loop_output.py +3 -1
  141. alita_sdk/runtime/tools/mcp_inspect_tool.py +284 -0
  142. alita_sdk/runtime/tools/mcp_remote_tool.py +181 -0
  143. alita_sdk/runtime/tools/mcp_server_tool.py +3 -1
  144. alita_sdk/runtime/tools/planning/__init__.py +36 -0
  145. alita_sdk/runtime/tools/planning/models.py +246 -0
  146. alita_sdk/runtime/tools/planning/wrapper.py +607 -0
  147. alita_sdk/runtime/tools/router.py +2 -1
  148. alita_sdk/runtime/tools/sandbox.py +375 -0
  149. alita_sdk/runtime/tools/skill_router.py +776 -0
  150. alita_sdk/runtime/tools/tool.py +3 -1
  151. alita_sdk/runtime/tools/vectorstore.py +69 -65
  152. alita_sdk/runtime/tools/vectorstore_base.py +163 -90
  153. alita_sdk/runtime/utils/AlitaCallback.py +137 -21
  154. alita_sdk/runtime/utils/mcp_client.py +492 -0
  155. alita_sdk/runtime/utils/mcp_oauth.py +361 -0
  156. alita_sdk/runtime/utils/mcp_sse_client.py +434 -0
  157. alita_sdk/runtime/utils/mcp_tools_discovery.py +124 -0
  158. alita_sdk/runtime/utils/streamlit.py +41 -14
  159. alita_sdk/runtime/utils/toolkit_utils.py +28 -9
  160. alita_sdk/runtime/utils/utils.py +48 -0
  161. alita_sdk/tools/__init__.py +135 -37
  162. alita_sdk/tools/ado/__init__.py +2 -2
  163. alita_sdk/tools/ado/repos/__init__.py +15 -19
  164. alita_sdk/tools/ado/repos/repos_wrapper.py +12 -20
  165. alita_sdk/tools/ado/test_plan/__init__.py +26 -8
  166. alita_sdk/tools/ado/test_plan/test_plan_wrapper.py +56 -28
  167. alita_sdk/tools/ado/wiki/__init__.py +27 -12
  168. alita_sdk/tools/ado/wiki/ado_wrapper.py +114 -40
  169. alita_sdk/tools/ado/work_item/__init__.py +27 -12
  170. alita_sdk/tools/ado/work_item/ado_wrapper.py +95 -11
  171. alita_sdk/tools/advanced_jira_mining/__init__.py +12 -8
  172. alita_sdk/tools/aws/delta_lake/__init__.py +14 -11
  173. alita_sdk/tools/aws/delta_lake/tool.py +5 -1
  174. alita_sdk/tools/azure_ai/search/__init__.py +13 -8
  175. alita_sdk/tools/base/tool.py +5 -1
  176. alita_sdk/tools/base_indexer_toolkit.py +454 -110
  177. alita_sdk/tools/bitbucket/__init__.py +27 -19
  178. alita_sdk/tools/bitbucket/api_wrapper.py +285 -27
  179. alita_sdk/tools/bitbucket/cloud_api_wrapper.py +5 -5
  180. alita_sdk/tools/browser/__init__.py +41 -16
  181. alita_sdk/tools/browser/crawler.py +3 -1
  182. alita_sdk/tools/browser/utils.py +15 -6
  183. alita_sdk/tools/carrier/__init__.py +18 -17
  184. alita_sdk/tools/carrier/backend_reports_tool.py +8 -4
  185. alita_sdk/tools/carrier/excel_reporter.py +8 -4
  186. alita_sdk/tools/chunkers/__init__.py +3 -1
  187. alita_sdk/tools/chunkers/code/codeparser.py +1 -1
  188. alita_sdk/tools/chunkers/sematic/json_chunker.py +2 -1
  189. alita_sdk/tools/chunkers/sematic/markdown_chunker.py +97 -6
  190. alita_sdk/tools/chunkers/sematic/proposal_chunker.py +1 -1
  191. alita_sdk/tools/chunkers/universal_chunker.py +270 -0
  192. alita_sdk/tools/cloud/aws/__init__.py +11 -7
  193. alita_sdk/tools/cloud/azure/__init__.py +11 -7
  194. alita_sdk/tools/cloud/gcp/__init__.py +11 -7
  195. alita_sdk/tools/cloud/k8s/__init__.py +11 -7
  196. alita_sdk/tools/code/linter/__init__.py +9 -8
  197. alita_sdk/tools/code/loaders/codesearcher.py +3 -2
  198. alita_sdk/tools/code/sonar/__init__.py +20 -13
  199. alita_sdk/tools/code_indexer_toolkit.py +199 -0
  200. alita_sdk/tools/confluence/__init__.py +21 -14
  201. alita_sdk/tools/confluence/api_wrapper.py +197 -58
  202. alita_sdk/tools/confluence/loader.py +14 -2
  203. alita_sdk/tools/custom_open_api/__init__.py +11 -5
  204. alita_sdk/tools/elastic/__init__.py +10 -8
  205. alita_sdk/tools/elitea_base.py +546 -64
  206. alita_sdk/tools/figma/__init__.py +11 -8
  207. alita_sdk/tools/figma/api_wrapper.py +352 -153
  208. alita_sdk/tools/github/__init__.py +17 -17
  209. alita_sdk/tools/github/api_wrapper.py +9 -26
  210. alita_sdk/tools/github/github_client.py +81 -12
  211. alita_sdk/tools/github/schemas.py +2 -1
  212. alita_sdk/tools/github/tool.py +5 -1
  213. alita_sdk/tools/gitlab/__init__.py +18 -13
  214. alita_sdk/tools/gitlab/api_wrapper.py +224 -80
  215. alita_sdk/tools/gitlab_org/__init__.py +13 -10
  216. alita_sdk/tools/google/bigquery/__init__.py +13 -13
  217. alita_sdk/tools/google/bigquery/tool.py +5 -1
  218. alita_sdk/tools/google_places/__init__.py +20 -11
  219. alita_sdk/tools/jira/__init__.py +21 -11
  220. alita_sdk/tools/jira/api_wrapper.py +315 -168
  221. alita_sdk/tools/keycloak/__init__.py +10 -8
  222. alita_sdk/tools/localgit/__init__.py +8 -3
  223. alita_sdk/tools/localgit/local_git.py +62 -54
  224. alita_sdk/tools/localgit/tool.py +5 -1
  225. alita_sdk/tools/memory/__init__.py +38 -14
  226. alita_sdk/tools/non_code_indexer_toolkit.py +7 -2
  227. alita_sdk/tools/ocr/__init__.py +10 -8
  228. alita_sdk/tools/openapi/__init__.py +281 -108
  229. alita_sdk/tools/openapi/api_wrapper.py +883 -0
  230. alita_sdk/tools/openapi/tool.py +20 -0
  231. alita_sdk/tools/pandas/__init__.py +18 -11
  232. alita_sdk/tools/pandas/api_wrapper.py +40 -45
  233. alita_sdk/tools/pandas/dataframe/generator/base.py +3 -1
  234. alita_sdk/tools/postman/__init__.py +10 -11
  235. alita_sdk/tools/postman/api_wrapper.py +19 -8
  236. alita_sdk/tools/postman/postman_analysis.py +8 -1
  237. alita_sdk/tools/pptx/__init__.py +10 -10
  238. alita_sdk/tools/qtest/__init__.py +21 -14
  239. alita_sdk/tools/qtest/api_wrapper.py +1784 -88
  240. alita_sdk/tools/rally/__init__.py +12 -10
  241. alita_sdk/tools/report_portal/__init__.py +22 -16
  242. alita_sdk/tools/salesforce/__init__.py +21 -16
  243. alita_sdk/tools/servicenow/__init__.py +20 -16
  244. alita_sdk/tools/servicenow/api_wrapper.py +1 -1
  245. alita_sdk/tools/sharepoint/__init__.py +16 -14
  246. alita_sdk/tools/sharepoint/api_wrapper.py +179 -39
  247. alita_sdk/tools/sharepoint/authorization_helper.py +191 -1
  248. alita_sdk/tools/sharepoint/utils.py +8 -2
  249. alita_sdk/tools/slack/__init__.py +11 -7
  250. alita_sdk/tools/sql/__init__.py +21 -19
  251. alita_sdk/tools/sql/api_wrapper.py +71 -23
  252. alita_sdk/tools/testio/__init__.py +20 -13
  253. alita_sdk/tools/testrail/__init__.py +12 -11
  254. alita_sdk/tools/testrail/api_wrapper.py +214 -46
  255. alita_sdk/tools/utils/__init__.py +28 -4
  256. alita_sdk/tools/utils/content_parser.py +182 -62
  257. alita_sdk/tools/utils/text_operations.py +254 -0
  258. alita_sdk/tools/vector_adapters/VectorStoreAdapter.py +83 -27
  259. alita_sdk/tools/xray/__init__.py +17 -14
  260. alita_sdk/tools/xray/api_wrapper.py +58 -113
  261. alita_sdk/tools/yagmail/__init__.py +8 -3
  262. alita_sdk/tools/zephyr/__init__.py +11 -7
  263. alita_sdk/tools/zephyr_enterprise/__init__.py +15 -9
  264. alita_sdk/tools/zephyr_enterprise/api_wrapper.py +30 -15
  265. alita_sdk/tools/zephyr_essential/__init__.py +15 -10
  266. alita_sdk/tools/zephyr_essential/api_wrapper.py +297 -54
  267. alita_sdk/tools/zephyr_essential/client.py +6 -4
  268. alita_sdk/tools/zephyr_scale/__init__.py +12 -8
  269. alita_sdk/tools/zephyr_scale/api_wrapper.py +39 -31
  270. alita_sdk/tools/zephyr_squad/__init__.py +11 -7
  271. {alita_sdk-0.3.257.dist-info → alita_sdk-0.3.562.dist-info}/METADATA +184 -37
  272. alita_sdk-0.3.562.dist-info/RECORD +450 -0
  273. alita_sdk-0.3.562.dist-info/entry_points.txt +2 -0
  274. alita_sdk/tools/bitbucket/tools.py +0 -304
  275. alita_sdk-0.3.257.dist-info/RECORD +0 -343
  276. {alita_sdk-0.3.257.dist-info → alita_sdk-0.3.562.dist-info}/WHEEL +0 -0
  277. {alita_sdk-0.3.257.dist-info → alita_sdk-0.3.562.dist-info}/licenses/LICENSE +0 -0
  278. {alita_sdk-0.3.257.dist-info → alita_sdk-0.3.562.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,270 @@
1
+ """
2
+ Universal Chunker - Routes documents to appropriate chunkers based on file type.
3
+
4
+ This module provides a universal chunking interface that automatically selects
5
+ the appropriate chunking strategy based on the file extension:
6
+
7
+ - .md, .markdown → Markdown chunker (header-based splitting)
8
+ - .py, .js, .ts, .java, etc. → TreeSitter code chunker
9
+ - .json → JSON chunker
10
+ - other → Default text chunker
11
+
12
+ Usage:
13
+ from alita_sdk.tools.chunkers.universal_chunker import universal_chunker
14
+
15
+ # Chunk documents from a loader
16
+ for chunk in universal_chunker(document_generator, config):
17
+ print(chunk.page_content)
18
+ """
19
+
20
+ import logging
21
+ import os
22
+ from typing import Generator, Dict, Any, Optional
23
+ from langchain_core.documents import Document
24
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
25
+
26
+ from .code.codeparser import parse_code_files_for_db
27
+ from .sematic.markdown_chunker import markdown_chunker
28
+ from .sematic.json_chunker import json_chunker
29
+
30
+ logger = logging.getLogger(__name__)
31
+
32
+
33
+ # File extension mappings
34
# Lookup tables used to route a file to a chunker. Entries are lower-case and
# include the leading dot, matching what get_file_extension() returns.
MARKDOWN_EXTENSIONS = {
    '.md',
    '.markdown',
    '.mdown',
    '.mkd',
    '.mdx',
}

JSON_EXTENSIONS = {
    '.json',
    '.jsonl',
    '.jsonc',
}

CODE_EXTENSIONS = {
    # Python
    '.py',
    # JavaScript / TypeScript
    '.js', '.jsx', '.mjs', '.cjs', '.ts', '.tsx',
    # JVM languages
    '.java', '.kt', '.scala',
    # Systems languages
    '.rs', '.go', '.cpp', '.c', '.cs',
    # Others
    '.hs', '.rb', '.lua',
}
41
+
42
+
43
def get_file_extension(file_path: str) -> str:
    """Return the lower-cased extension of ``file_path``, leading dot included.

    An empty string is returned when the final path component has no
    extension (for example ``Makefile`` or a dotfile such as ``.gitignore``).
    """
    _stem, extension = os.path.splitext(file_path)
    return extension.lower()
46
+
47
+
48
def get_file_type(file_path: str) -> str:
    """
    Classify a path into the chunker family that should process it.

    The decision is based solely on the lower-cased file extension, checked
    against the module's extension sets in the order markdown, JSON, code.

    Returns:
        'markdown', 'json', 'code', or 'text' (the fallback for every
        extension that is not listed in any of the sets)
    """
    extension = get_file_extension(file_path)

    # Ordered (extension set, category) pairs; the first match wins.
    categories = (
        (MARKDOWN_EXTENSIONS, 'markdown'),
        (JSON_EXTENSIONS, 'json'),
        (CODE_EXTENSIONS, 'code'),
    )
    for known_extensions, category in categories:
        if extension in known_extensions:
            return category

    return 'text'
65
+
66
+
67
def _default_text_chunker(
    documents: Generator[Document, None, None],
    config: Dict[str, Any]
) -> Generator[Document, None, None]:
    """
    Fallback chunker for documents that have no dedicated strategy.

    Every incoming document is split with RecursiveCharacterTextSplitter,
    measuring length in characters (``len``).

    Args:
        documents: Documents to split.
        config: Optional keys ``chunk_size`` (default 1000) and
            ``chunk_overlap`` (default 100).

    Yields:
        The split chunks, each tagged in its metadata with ``chunk_id``
        (1-based, restarting for every source document),
        ``chunk_type='text'`` and ``method_name='text'``.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=config.get('chunk_size', 1000),
        chunk_overlap=config.get('chunk_overlap', 100),
        length_function=len,
    )

    for document in documents:
        position = 0
        for piece in text_splitter.split_documents([document]):
            position += 1
            piece.metadata.update({
                'chunk_id': position,
                'chunk_type': 'text',
                'method_name': 'text',
            })
            yield piece
91
+
92
+
93
def _code_chunker_from_documents(
    documents: Generator[Document, None, None],
    config: Dict[str, Any]
) -> Generator[Document, None, None]:
    """
    Adapter that feeds Document objects to the code parser.

    Each document is converted into the dict shape passed to
    parse_code_files_for_db (``file_name``, ``file_content``,
    ``commit_hash``) and the chunks produced by the parser are re-yielded.

    Args:
        documents: Documents whose ``page_content`` holds source code.
        config: Currently unused by this adapter; accepted so that every
            chunker shares the same ``(documents, config)`` signature.

    Yields:
        Chunks produced by parse_code_files_for_db. When a chunk carries a
        ``filename`` but no ``file_path`` in its metadata, ``file_path`` is
        filled in from ``filename``.
    """
    def file_content_generator():
        for doc in documents:
            metadata = doc.metadata
            # Resolve the name from the same metadata keys universal_chunker
            # uses (plus the legacy 'filename'), skipping empty/None values,
            # so a document is only labelled 'unknown' when no key is set.
            file_name = (metadata.get('file_path') or
                         metadata.get('filename') or
                         metadata.get('file_name') or
                         metadata.get('source') or
                         'unknown')
            yield {
                'file_name': file_name,
                'file_content': doc.page_content,
                'commit_hash': metadata.get('commit_hash', ''),
            }

    for chunk in parse_code_files_for_db(file_content_generator()):
        # Ensure file_path is preserved for downstream consumers
        if 'file_path' not in chunk.metadata and 'filename' in chunk.metadata:
            chunk.metadata['file_path'] = chunk.metadata['filename']
        yield chunk
114
+
115
+
116
def universal_chunker(
    documents: Generator[Document, None, None],
    config: Optional[Dict[str, Any]] = None
) -> Generator[Document, None, None]:
    """
    Universal chunker that routes documents to appropriate chunkers based on file type.

    The path of each document is taken from the first non-empty metadata
    value among ``file_path``, ``file_name`` and ``source`` (falling back to
    ``'unknown'``). The resolved value is written back to
    ``doc.metadata['file_path']`` and its extension selects the chunker:

    - Markdown files → markdown_chunker (header-based splitting)
    - JSON files → json_chunker (recursive JSON splitting)
    - Code files → code parser (TreeSitter-based parsing)
    - Other files → default text chunker (recursive character splitting)

    Documents are buffered per type and handed to their chunker in batches
    of up to 10, so chunks are emitted grouped by batch and not necessarily
    in the order in which the source documents arrived.

    Args:
        documents: Generator yielding Document objects with file content
        config: Optional configuration dict with:
            - markdown_config: Config for markdown chunker
            - json_config: Config for JSON chunker
            - code_config: Config for code chunker
            - text_config: Config for default text chunker
            A provided sub-config replaces the built-in default for that
            chunker entirely; the two are not merged.

    Yields:
        Document objects with chunked content and preserved metadata
    """
    if config is None:
        config = {}

    # Default configs for each chunker type (used only when the key is absent)
    markdown_config = config.get('markdown_config', {
        'strip_header': False,
        'return_each_line': False,
        'headers_to_split_on': [
            ('#', 'Header 1'),
            ('##', 'Header 2'),
            ('###', 'Header 3'),
            ('####', 'Header 4'),
        ],
        'max_tokens': 1024,
        'token_overlap': 50,
        'min_chunk_chars': 100,  # Merge chunks smaller than this
    })

    json_config = config.get('json_config', {
        'max_tokens': 512,
    })

    code_config = config.get('code_config', {})

    text_config = config.get('text_config', {
        'chunk_size': 1000,
        'chunk_overlap': 100,
    })

    # Buffer size before flushing
    BUFFER_SIZE = 10

    # file type -> (chunker, chunker config, pending documents).
    # Insertion order doubles as the order of the final flush.
    routes = {
        'markdown': (markdown_chunker, markdown_config, []),
        'json': (json_chunker, json_config, []),
        'code': (_code_chunker_from_documents, code_config, []),
        'text': (_default_text_chunker, text_config, []),
    }

    def flush(route):
        """Run one route's chunker over its pending documents, then empty the buffer."""
        chunker, chunker_config, pending = route
        if pending:
            # Hand the chunker a real generator, as its signature expects.
            yield from chunker((d for d in pending), chunker_config)
            pending.clear()

    for doc in documents:
        # Get file path from metadata
        file_path = (doc.metadata.get('file_path') or
                     doc.metadata.get('file_name') or
                     doc.metadata.get('source') or
                     'unknown')

        # Ensure file_path is in metadata for downstream use
        doc.metadata['file_path'] = file_path

        # Any unrecognised category is treated as plain text.
        route = routes.get(get_file_type(file_path), routes['text'])
        pending = route[2]
        pending.append(doc)
        if len(pending) >= BUFFER_SIZE:
            yield from flush(route)

    # Flush remaining documents
    for route in routes.values():
        yield from flush(route)
251
+
252
+
253
def chunk_single_document(
    doc: Document,
    config: Optional[Dict[str, Any]] = None
) -> Generator[Document, None, None]:
    """
    Chunk one document by wrapping it in a one-item stream for universal_chunker.

    Args:
        doc: The single Document to chunk
        config: Optional chunker configuration, passed through unchanged

    Yields:
        The chunks universal_chunker produces for ``doc``
    """
    one_document_stream = (item for item in (doc,))
    for chunk in universal_chunker(one_document_stream, config):
        yield chunk
@@ -5,7 +5,8 @@ from langchain_core.tools import BaseToolkit, BaseTool
5
5
 
6
6
  from .api_wrapper import AWSToolConfig
7
7
  from ...base.tool import BaseAction
8
- from ...utils import clean_string, TOOLKIT_SPLITTER, get_max_toolkit_length
8
+ from ...elitea_base import filter_missconfigured_index_tools
9
+ from ...utils import clean_string, get_max_toolkit_length
9
10
 
10
11
  name = "aws"
11
12
 
@@ -21,12 +22,10 @@ def get_tools(tool):
21
22
 
22
23
  class AWSToolkit(BaseToolkit):
23
24
  tools: list[BaseTool] = []
24
- toolkit_max_length: int = 0
25
25
 
26
26
  @staticmethod
27
27
  def toolkit_config_schema() -> BaseModel:
28
28
  selected_tools = {x['name']: x['args_schema'].schema() for x in AWSToolConfig.model_construct().get_available_tools()}
29
- AWSToolkit.toolkit_max_length = get_max_toolkit_length(selected_tools)
30
29
  return create_model(
31
30
  name,
32
31
  region=(str, Field(default="", title="Region", description="AWS region")),
@@ -46,21 +45,26 @@ class AWSToolkit(BaseToolkit):
46
45
  )
47
46
 
48
47
  @classmethod
48
+ @filter_missconfigured_index_tools
49
49
  def get_toolkit(cls, selected_tools: list[str] | None = None, toolkit_name: Optional[str] = None, **kwargs):
50
50
  if selected_tools is None:
51
51
  selected_tools = []
52
52
  aws_tool_config = AWSToolConfig(**kwargs)
53
53
  available_tools = aws_tool_config.get_available_tools()
54
54
  tools = []
55
- prefix = clean_string(toolkit_name, cls.toolkit_max_length) + TOOLKIT_SPLITTER if toolkit_name else ''
56
55
  for tool in available_tools:
57
56
  if selected_tools and tool["name"] not in selected_tools:
58
57
  continue
58
+ description = tool["description"]
59
+ if toolkit_name:
60
+ description = f"Toolkit: {toolkit_name}\n{description}"
61
+ description = description[:1000]
59
62
  tools.append(BaseAction(
60
63
  api_wrapper=aws_tool_config,
61
- name=prefix + tool["name"],
62
- description=tool["description"],
63
- args_schema=tool["args_schema"]
64
+ name=tool["name"],
65
+ description=description,
66
+ args_schema=tool["args_schema"],
67
+ metadata={"toolkit_name": toolkit_name} if toolkit_name else {}
64
68
  ))
65
69
  return cls(tools=tools)
66
70
 
@@ -5,7 +5,8 @@ from pydantic import create_model, BaseModel, ConfigDict, Field, SecretStr
5
5
 
6
6
  from .api_wrapper import AzureApiWrapper
7
7
  from ...base.tool import BaseAction
8
- from ...utils import clean_string, TOOLKIT_SPLITTER, get_max_toolkit_length
8
+ from ...elitea_base import filter_missconfigured_index_tools
9
+ from ...utils import clean_string, get_max_toolkit_length
9
10
 
10
11
  name = "azure"
11
12
 
@@ -22,12 +23,10 @@ def get_tools(tool):
22
23
 
23
24
  class AzureToolkit(BaseToolkit):
24
25
  tools: list[BaseTool] = []
25
- toolkit_max_length: int = 0
26
26
 
27
27
  @staticmethod
28
28
  def toolkit_config_schema() -> BaseModel:
29
29
  selected_tools = {x['name']: x['args_schema'].schema() for x in AzureApiWrapper.model_construct().get_available_tools()}
30
- AzureToolkit.toolkit_max_length = get_max_toolkit_length(selected_tools)
31
30
  return create_model(
32
31
  name,
33
32
  subscription_id=(str, Field(default="", title="Subscription ID", description="Azure subscription ID")),
@@ -39,21 +38,26 @@ class AzureToolkit(BaseToolkit):
39
38
  )
40
39
 
41
40
  @classmethod
41
+ @filter_missconfigured_index_tools
42
42
  def get_toolkit(cls, selected_tools: list[str] | None = None, toolkit_name: Optional[str] = None, **kwargs):
43
43
  if selected_tools is None:
44
44
  selected_tools = []
45
45
  azure_api_wrapper = AzureApiWrapper(**kwargs)
46
46
  available_tools = azure_api_wrapper.get_available_tools()
47
47
  tools = []
48
- prefix = clean_string(toolkit_name, cls.toolkit_max_length) + TOOLKIT_SPLITTER if toolkit_name else ''
49
48
  for tool in available_tools:
50
49
  if selected_tools and tool["name"] not in selected_tools:
51
50
  continue
51
+ description = tool["description"]
52
+ if toolkit_name:
53
+ description = f"Toolkit: {toolkit_name}\n{description}"
54
+ description = description[:1000]
52
55
  tools.append(BaseAction(
53
56
  api_wrapper=azure_api_wrapper,
54
- name=prefix + tool["name"],
55
- description=tool["description"],
56
- args_schema=tool["args_schema"]
57
+ name=tool["name"],
58
+ description=description,
59
+ args_schema=tool["args_schema"],
60
+ metadata={"toolkit_name": toolkit_name} if toolkit_name else {}
57
61
  ))
58
62
  return cls(tools=tools)
59
63
 
@@ -5,7 +5,8 @@ from pydantic import create_model, BaseModel, ConfigDict, Field, SecretStr
5
5
 
6
6
  from .api_wrapper import GCPApiWrapper
7
7
  from ...base.tool import BaseAction
8
- from ...utils import clean_string, TOOLKIT_SPLITTER, get_max_toolkit_length
8
+ from ...elitea_base import filter_missconfigured_index_tools
9
+ from ...utils import clean_string, get_max_toolkit_length
9
10
 
10
11
  name = "gcp"
11
12
 
@@ -19,12 +20,10 @@ def get_tools(tool):
19
20
 
20
21
  class GCPToolkit(BaseToolkit):
21
22
  tools: list[BaseTool] = []
22
- toolkit_max_length: int = 0
23
23
 
24
24
  @staticmethod
25
25
  def toolkit_config_schema() -> BaseModel:
26
26
  selected_tools = {x['name']: x['args_schema'].schema() for x in GCPApiWrapper.model_construct().get_available_tools()}
27
- GCPToolkit.toolkit_max_length = get_max_toolkit_length(selected_tools)
28
27
  return create_model(
29
28
  name,
30
29
  api_key=(SecretStr, Field(default="", title="API key", description="GCP API key", json_schema_extra={'secret': True})),
@@ -33,21 +32,26 @@ class GCPToolkit(BaseToolkit):
33
32
  )
34
33
 
35
34
  @classmethod
35
+ @filter_missconfigured_index_tools
36
36
  def get_toolkit(cls, selected_tools: list[str] | None = None, toolkit_name: Optional[str] = None, **kwargs):
37
37
  if selected_tools is None:
38
38
  selected_tools = []
39
39
  gcp_api_wrapper = GCPApiWrapper(**kwargs)
40
40
  available_tools = gcp_api_wrapper.get_available_tools()
41
41
  tools = []
42
- prefix = clean_string(toolkit_name, cls.toolkit_max_length) + TOOLKIT_SPLITTER if toolkit_name else ''
43
42
  for tool in available_tools:
44
43
  if selected_tools and tool["name"] not in selected_tools:
45
44
  continue
45
+ description = tool["description"]
46
+ if toolkit_name:
47
+ description = f"Toolkit: {toolkit_name}\n{description}"
48
+ description = description[:1000]
46
49
  tools.append(BaseAction(
47
50
  api_wrapper=gcp_api_wrapper,
48
- name=prefix + tool["name"],
49
- description=tool["description"],
50
- args_schema=tool["args_schema"]
51
+ name=tool["name"],
52
+ description=description,
53
+ args_schema=tool["args_schema"],
54
+ metadata={"toolkit_name": toolkit_name} if toolkit_name else {}
51
55
  ))
52
56
  return cls(tools=tools)
53
57
 
@@ -5,7 +5,8 @@ from pydantic import create_model, BaseModel, ConfigDict, Field, SecretStr
5
5
 
6
6
  from .api_wrapper import KubernetesApiWrapper
7
7
  from ...base.tool import BaseAction
8
- from ...utils import clean_string, TOOLKIT_SPLITTER, get_max_toolkit_length
8
+ from ...elitea_base import filter_missconfigured_index_tools
9
+ from ...utils import clean_string, get_max_toolkit_length
9
10
 
10
11
  name = "kubernetes"
11
12
 
@@ -20,12 +21,10 @@ def get_tools(tool):
20
21
 
21
22
  class KubernetesToolkit(BaseToolkit):
22
23
  tools: list[BaseTool] = []
23
- toolkit_max_length: int = 0
24
24
 
25
25
  @staticmethod
26
26
  def toolkit_config_schema() -> BaseModel:
27
27
  selected_tools = {x['name']: x['args_schema'].schema() for x in KubernetesApiWrapper.model_construct().get_available_tools()}
28
- KubernetesToolkit.toolkit_max_length = get_max_toolkit_length(selected_tools)
29
28
  return create_model(
30
29
  name,
31
30
  url=(str, Field(default="", title="Cluster URL", description="The URL of the Kubernetes cluster")),
@@ -43,21 +42,26 @@ class KubernetesToolkit(BaseToolkit):
43
42
  )
44
43
 
45
44
  @classmethod
45
+ @filter_missconfigured_index_tools
46
46
  def get_toolkit(cls, selected_tools: list[str] | None = None, toolkit_name: Optional[str] = None, **kwargs):
47
47
  if selected_tools is None:
48
48
  selected_tools = []
49
49
  kubernetes_api_wrapper = KubernetesApiWrapper(**kwargs)
50
50
  available_tools = kubernetes_api_wrapper.get_available_tools()
51
51
  tools = []
52
- prefix = clean_string(toolkit_name, cls.toolkit_max_length) + TOOLKIT_SPLITTER if toolkit_name else ''
53
52
  for tool in available_tools:
54
53
  if selected_tools and tool["name"] not in selected_tools:
55
54
  continue
55
+ description = tool["description"]
56
+ if toolkit_name:
57
+ description = f"Toolkit: {toolkit_name}\n{description}"
58
+ description = description[:1000]
56
59
  tools.append(BaseAction(
57
60
  api_wrapper=kubernetes_api_wrapper,
58
- name=prefix + tool["name"],
59
- description=tool["description"],
60
- args_schema=tool["args_schema"]
61
+ name=tool["name"],
62
+ description=description,
63
+ args_schema=tool["args_schema"],
64
+ metadata={"toolkit_name": toolkit_name} if toolkit_name else {}
61
65
  ))
62
66
  return cls(tools=tools)
63
67
 
@@ -5,7 +5,7 @@ from pydantic import BaseModel, create_model, Field
5
5
 
6
6
  from .api_wrapper import PythonLinter
7
7
  from ...base.tool import BaseAction
8
- from ...utils import clean_string, TOOLKIT_SPLITTER, get_max_toolkit_length
8
+ from ...utils import clean_string, get_max_toolkit_length
9
9
 
10
10
  name = "python_linter"
11
11
 
@@ -19,11 +19,9 @@ def get_tools(tool):
19
19
 
20
20
  class PythonLinterToolkit(BaseToolkit):
21
21
  tools: list[BaseTool] = []
22
- toolkit_max_length: int = 0
23
22
 
24
23
  @staticmethod
25
24
  def toolkit_config_schema() -> BaseModel:
26
- PythonLinterToolkit.toolkit_max_length = get_max_toolkit_length([])
27
25
  return create_model(
28
26
  name,
29
27
  error_codes=(str, Field(description="Error codes to be used by the linter")),
@@ -39,16 +37,19 @@ class PythonLinterToolkit(BaseToolkit):
39
37
  python_linter = PythonLinter(**kwargs)
40
38
  available_tools = python_linter.get_available_tools()
41
39
  tools = []
42
- toolkit_max_length = get_max_toolkit_length(selected_tools)
43
- prefix = clean_string(toolkit_name, PythonLinterToolkit.toolkit_max_length) + TOOLKIT_SPLITTER if toolkit_name else ''
44
40
  for tool in available_tools:
45
41
  if selected_tools and tool["name"] not in selected_tools:
46
42
  continue
43
+ description = tool["description"]
44
+ if toolkit_name:
45
+ description = f"Toolkit: {toolkit_name}\n{description}"
46
+ description = description[:1000]
47
47
  tools.append(BaseAction(
48
48
  api_wrapper=python_linter,
49
- name=prefix + tool["name"],
50
- description=tool["description"],
51
- args_schema=tool["args_schema"]
49
+ name=tool["name"],
50
+ description=description,
51
+ args_schema=tool["args_schema"],
52
+ metadata={"toolkit_name": toolkit_name} if toolkit_name else {}
52
53
  ))
53
54
  return cls(tools=tools)
54
55
 
@@ -4,8 +4,9 @@ def search_format(items):
4
4
  results = []
5
5
  for (doc, score) in items:
6
6
  res_chunk = ''
7
- language = get_programming_language(get_file_extension(doc.metadata["filename"]))
8
- res_chunk += doc.metadata["filename"] + " -> " + doc.metadata["method_name"] + " (score: " + str(score) + ")"
7
+ language = get_programming_language(get_file_extension(doc.metadata.get("filename", "unknown")))
8
+ method_name = doc.metadata.get("method_name", "text")
9
+ res_chunk += doc.metadata.get("filename", "unknown") + " -> " + method_name + " (score: " + str(score) + ")"
9
10
  res_chunk += "\n\n```" + language.value + "\n"+ doc.page_content + "\n```\n\n"
10
11
  results.append(res_chunk)
11
12
  return results
@@ -1,36 +1,34 @@
1
1
  from typing import List, Literal, Optional
2
2
  from langchain_core.tools import BaseToolkit, BaseTool
3
- from pydantic import create_model, BaseModel, ConfigDict, Field, SecretStr
3
+ from pydantic import create_model, BaseModel, ConfigDict, Field
4
4
 
5
5
  from .api_wrapper import SonarApiWrapper
6
6
  from ...base.tool import BaseAction
7
- from ...utils import clean_string, TOOLKIT_SPLITTER, get_max_toolkit_length
7
+ from ...elitea_base import filter_missconfigured_index_tools
8
+ from ...utils import clean_string, get_max_toolkit_length
9
+ from ....configurations.sonar import SonarConfiguration
8
10
 
9
11
  name = "sonar"
10
12
 
11
13
  def get_tools(tool):
12
14
  return SonarToolkit().get_toolkit(
13
15
  selected_tools=tool['settings'].get('selected_tools', []),
14
- url=tool['settings']['url'],
15
- sonar_token=tool['settings']['sonar_token'],
16
16
  sonar_project_name=tool['settings']['sonar_project_name'],
17
+ sonar_configuration=tool['settings']['sonar_configuration'],
17
18
  toolkit_name=tool.get('toolkit_name')
18
19
  ).get_tools()
19
20
 
20
21
 
21
22
  class SonarToolkit(BaseToolkit):
22
23
  tools: list[BaseTool] = []
23
- toolkit_max_length: int = 0
24
24
 
25
25
  @staticmethod
26
26
  def toolkit_config_schema() -> BaseModel:
27
27
  selected_tools = {x['name']: x['args_schema'].schema() for x in SonarApiWrapper.model_construct().get_available_tools()}
28
- SonarToolkit.toolkit_max_length = get_max_toolkit_length(selected_tools)
29
28
  return create_model(
30
29
  name,
31
- url=(str, Field(description="SonarQube Server URL", json_schema_extra={'toolkit_name': True, 'max_toolkit_length': SonarToolkit.toolkit_max_length})),
32
- sonar_token=(SecretStr, Field(description="SonarQube user token for authentication", json_schema_extra={'secret': True})),
33
30
  sonar_project_name=(str, Field(description="Project name of the desired repository")),
31
+ sonar_configuration=(SonarConfiguration, Field(description="Sonar Configuration", json_schema_extra={'configuration_types': ['sonar']})),
34
32
  selected_tools=(List[Literal[tuple(selected_tools)]], Field(default=[], json_schema_extra={'args_schemas': selected_tools})),
35
33
  __config__=ConfigDict(json_schema_extra=
36
34
  {
@@ -44,21 +42,30 @@ class SonarToolkit(BaseToolkit):
44
42
  )
45
43
 
46
44
  @classmethod
45
+ @filter_missconfigured_index_tools
47
46
  def get_toolkit(cls, selected_tools: list[str] | None = None, toolkit_name: Optional[str] = None, **kwargs):
48
47
  if selected_tools is None:
49
48
  selected_tools = []
50
- sonar_api_wrapper = SonarApiWrapper(**kwargs)
49
+ wrapper_payload = {
50
+ **kwargs,
51
+ **kwargs.get('sonar_configuration', {}),
52
+ }
53
+ sonar_api_wrapper = SonarApiWrapper(**wrapper_payload)
51
54
  available_tools = sonar_api_wrapper.get_available_tools()
52
55
  tools = []
53
- prefix = clean_string(toolkit_name, SonarToolkit.toolkit_max_length) + TOOLKIT_SPLITTER if toolkit_name else ''
54
56
  for tool in available_tools:
55
57
  if selected_tools and tool["name"] not in selected_tools:
56
58
  continue
59
+ description = tool["description"]
60
+ if toolkit_name:
61
+ description = f"Toolkit: {toolkit_name}\n{description}"
62
+ description = description[:1000]
57
63
  tools.append(BaseAction(
58
64
  api_wrapper=sonar_api_wrapper,
59
- name=prefix + tool["name"],
60
- description=tool["description"],
61
- args_schema=tool["args_schema"]
65
+ name=tool["name"],
66
+ description=description,
67
+ args_schema=tool["args_schema"],
68
+ metadata={"toolkit_name": toolkit_name} if toolkit_name else {}
62
69
  ))
63
70
  return cls(tools=tools)
64
71