alita-sdk 0.3.462__py3-none-any.whl → 0.3.627__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (261) hide show
  1. alita_sdk/cli/agent/__init__.py +5 -0
  2. alita_sdk/cli/agent/default.py +258 -0
  3. alita_sdk/cli/agent_executor.py +15 -3
  4. alita_sdk/cli/agent_loader.py +56 -8
  5. alita_sdk/cli/agent_ui.py +93 -31
  6. alita_sdk/cli/agents.py +2274 -230
  7. alita_sdk/cli/callbacks.py +96 -25
  8. alita_sdk/cli/cli.py +10 -1
  9. alita_sdk/cli/config.py +162 -9
  10. alita_sdk/cli/context/__init__.py +30 -0
  11. alita_sdk/cli/context/cleanup.py +198 -0
  12. alita_sdk/cli/context/manager.py +731 -0
  13. alita_sdk/cli/context/message.py +285 -0
  14. alita_sdk/cli/context/strategies.py +289 -0
  15. alita_sdk/cli/context/token_estimation.py +127 -0
  16. alita_sdk/cli/input_handler.py +419 -0
  17. alita_sdk/cli/inventory.py +1073 -0
  18. alita_sdk/cli/testcases/__init__.py +94 -0
  19. alita_sdk/cli/testcases/data_generation.py +119 -0
  20. alita_sdk/cli/testcases/discovery.py +96 -0
  21. alita_sdk/cli/testcases/executor.py +84 -0
  22. alita_sdk/cli/testcases/logger.py +85 -0
  23. alita_sdk/cli/testcases/parser.py +172 -0
  24. alita_sdk/cli/testcases/prompts.py +91 -0
  25. alita_sdk/cli/testcases/reporting.py +125 -0
  26. alita_sdk/cli/testcases/setup.py +108 -0
  27. alita_sdk/cli/testcases/test_runner.py +282 -0
  28. alita_sdk/cli/testcases/utils.py +39 -0
  29. alita_sdk/cli/testcases/validation.py +90 -0
  30. alita_sdk/cli/testcases/workflow.py +196 -0
  31. alita_sdk/cli/toolkit.py +14 -17
  32. alita_sdk/cli/toolkit_loader.py +35 -5
  33. alita_sdk/cli/tools/__init__.py +36 -2
  34. alita_sdk/cli/tools/approval.py +224 -0
  35. alita_sdk/cli/tools/filesystem.py +910 -64
  36. alita_sdk/cli/tools/planning.py +389 -0
  37. alita_sdk/cli/tools/terminal.py +414 -0
  38. alita_sdk/community/__init__.py +72 -12
  39. alita_sdk/community/inventory/__init__.py +236 -0
  40. alita_sdk/community/inventory/config.py +257 -0
  41. alita_sdk/community/inventory/enrichment.py +2137 -0
  42. alita_sdk/community/inventory/extractors.py +1469 -0
  43. alita_sdk/community/inventory/ingestion.py +3172 -0
  44. alita_sdk/community/inventory/knowledge_graph.py +1457 -0
  45. alita_sdk/community/inventory/parsers/__init__.py +218 -0
  46. alita_sdk/community/inventory/parsers/base.py +295 -0
  47. alita_sdk/community/inventory/parsers/csharp_parser.py +907 -0
  48. alita_sdk/community/inventory/parsers/go_parser.py +851 -0
  49. alita_sdk/community/inventory/parsers/html_parser.py +389 -0
  50. alita_sdk/community/inventory/parsers/java_parser.py +593 -0
  51. alita_sdk/community/inventory/parsers/javascript_parser.py +629 -0
  52. alita_sdk/community/inventory/parsers/kotlin_parser.py +768 -0
  53. alita_sdk/community/inventory/parsers/markdown_parser.py +362 -0
  54. alita_sdk/community/inventory/parsers/python_parser.py +604 -0
  55. alita_sdk/community/inventory/parsers/rust_parser.py +858 -0
  56. alita_sdk/community/inventory/parsers/swift_parser.py +832 -0
  57. alita_sdk/community/inventory/parsers/text_parser.py +322 -0
  58. alita_sdk/community/inventory/parsers/yaml_parser.py +370 -0
  59. alita_sdk/community/inventory/patterns/__init__.py +61 -0
  60. alita_sdk/community/inventory/patterns/ast_adapter.py +380 -0
  61. alita_sdk/community/inventory/patterns/loader.py +348 -0
  62. alita_sdk/community/inventory/patterns/registry.py +198 -0
  63. alita_sdk/community/inventory/presets.py +535 -0
  64. alita_sdk/community/inventory/retrieval.py +1403 -0
  65. alita_sdk/community/inventory/toolkit.py +173 -0
  66. alita_sdk/community/inventory/toolkit_utils.py +176 -0
  67. alita_sdk/community/inventory/visualize.py +1370 -0
  68. alita_sdk/configurations/__init__.py +1 -1
  69. alita_sdk/configurations/ado.py +141 -20
  70. alita_sdk/configurations/bitbucket.py +0 -3
  71. alita_sdk/configurations/confluence.py +76 -42
  72. alita_sdk/configurations/figma.py +76 -0
  73. alita_sdk/configurations/gitlab.py +17 -5
  74. alita_sdk/configurations/openapi.py +329 -0
  75. alita_sdk/configurations/qtest.py +72 -1
  76. alita_sdk/configurations/report_portal.py +96 -0
  77. alita_sdk/configurations/sharepoint.py +148 -0
  78. alita_sdk/configurations/testio.py +83 -0
  79. alita_sdk/runtime/clients/artifact.py +3 -3
  80. alita_sdk/runtime/clients/client.py +353 -48
  81. alita_sdk/runtime/clients/sandbox_client.py +0 -21
  82. alita_sdk/runtime/langchain/_constants_bkup.py +1318 -0
  83. alita_sdk/runtime/langchain/assistant.py +123 -26
  84. alita_sdk/runtime/langchain/constants.py +642 -1
  85. alita_sdk/runtime/langchain/document_loaders/AlitaExcelLoader.py +103 -60
  86. alita_sdk/runtime/langchain/document_loaders/AlitaJSONLinesLoader.py +77 -0
  87. alita_sdk/runtime/langchain/document_loaders/AlitaJSONLoader.py +6 -3
  88. alita_sdk/runtime/langchain/document_loaders/AlitaPowerPointLoader.py +226 -7
  89. alita_sdk/runtime/langchain/document_loaders/AlitaTextLoader.py +5 -2
  90. alita_sdk/runtime/langchain/document_loaders/constants.py +12 -7
  91. alita_sdk/runtime/langchain/langraph_agent.py +279 -73
  92. alita_sdk/runtime/langchain/utils.py +82 -15
  93. alita_sdk/runtime/llms/preloaded.py +2 -6
  94. alita_sdk/runtime/skills/__init__.py +91 -0
  95. alita_sdk/runtime/skills/callbacks.py +498 -0
  96. alita_sdk/runtime/skills/discovery.py +540 -0
  97. alita_sdk/runtime/skills/executor.py +610 -0
  98. alita_sdk/runtime/skills/input_builder.py +371 -0
  99. alita_sdk/runtime/skills/models.py +330 -0
  100. alita_sdk/runtime/skills/registry.py +355 -0
  101. alita_sdk/runtime/skills/skill_runner.py +330 -0
  102. alita_sdk/runtime/toolkits/__init__.py +7 -0
  103. alita_sdk/runtime/toolkits/application.py +21 -9
  104. alita_sdk/runtime/toolkits/artifact.py +15 -5
  105. alita_sdk/runtime/toolkits/datasource.py +13 -6
  106. alita_sdk/runtime/toolkits/mcp.py +139 -251
  107. alita_sdk/runtime/toolkits/mcp_config.py +1048 -0
  108. alita_sdk/runtime/toolkits/planning.py +178 -0
  109. alita_sdk/runtime/toolkits/skill_router.py +238 -0
  110. alita_sdk/runtime/toolkits/subgraph.py +251 -6
  111. alita_sdk/runtime/toolkits/tools.py +238 -32
  112. alita_sdk/runtime/toolkits/vectorstore.py +11 -5
  113. alita_sdk/runtime/tools/__init__.py +3 -1
  114. alita_sdk/runtime/tools/application.py +20 -6
  115. alita_sdk/runtime/tools/artifact.py +511 -28
  116. alita_sdk/runtime/tools/data_analysis.py +183 -0
  117. alita_sdk/runtime/tools/function.py +43 -15
  118. alita_sdk/runtime/tools/image_generation.py +50 -44
  119. alita_sdk/runtime/tools/llm.py +852 -67
  120. alita_sdk/runtime/tools/loop.py +3 -1
  121. alita_sdk/runtime/tools/loop_output.py +3 -1
  122. alita_sdk/runtime/tools/mcp_remote_tool.py +25 -10
  123. alita_sdk/runtime/tools/mcp_server_tool.py +7 -6
  124. alita_sdk/runtime/tools/planning/__init__.py +36 -0
  125. alita_sdk/runtime/tools/planning/models.py +246 -0
  126. alita_sdk/runtime/tools/planning/wrapper.py +607 -0
  127. alita_sdk/runtime/tools/router.py +2 -4
  128. alita_sdk/runtime/tools/sandbox.py +9 -6
  129. alita_sdk/runtime/tools/skill_router.py +776 -0
  130. alita_sdk/runtime/tools/tool.py +3 -1
  131. alita_sdk/runtime/tools/vectorstore.py +7 -2
  132. alita_sdk/runtime/tools/vectorstore_base.py +51 -11
  133. alita_sdk/runtime/utils/AlitaCallback.py +137 -21
  134. alita_sdk/runtime/utils/constants.py +5 -1
  135. alita_sdk/runtime/utils/mcp_client.py +492 -0
  136. alita_sdk/runtime/utils/mcp_oauth.py +202 -5
  137. alita_sdk/runtime/utils/mcp_sse_client.py +36 -7
  138. alita_sdk/runtime/utils/mcp_tools_discovery.py +124 -0
  139. alita_sdk/runtime/utils/serialization.py +155 -0
  140. alita_sdk/runtime/utils/streamlit.py +6 -10
  141. alita_sdk/runtime/utils/toolkit_utils.py +16 -5
  142. alita_sdk/runtime/utils/utils.py +36 -0
  143. alita_sdk/tools/__init__.py +113 -29
  144. alita_sdk/tools/ado/repos/__init__.py +51 -33
  145. alita_sdk/tools/ado/repos/repos_wrapper.py +148 -89
  146. alita_sdk/tools/ado/test_plan/__init__.py +25 -9
  147. alita_sdk/tools/ado/test_plan/test_plan_wrapper.py +23 -1
  148. alita_sdk/tools/ado/utils.py +1 -18
  149. alita_sdk/tools/ado/wiki/__init__.py +25 -8
  150. alita_sdk/tools/ado/wiki/ado_wrapper.py +291 -22
  151. alita_sdk/tools/ado/work_item/__init__.py +26 -9
  152. alita_sdk/tools/ado/work_item/ado_wrapper.py +56 -3
  153. alita_sdk/tools/advanced_jira_mining/__init__.py +11 -8
  154. alita_sdk/tools/aws/delta_lake/__init__.py +13 -9
  155. alita_sdk/tools/aws/delta_lake/tool.py +5 -1
  156. alita_sdk/tools/azure_ai/search/__init__.py +11 -8
  157. alita_sdk/tools/azure_ai/search/api_wrapper.py +1 -1
  158. alita_sdk/tools/base/tool.py +5 -1
  159. alita_sdk/tools/base_indexer_toolkit.py +170 -45
  160. alita_sdk/tools/bitbucket/__init__.py +17 -12
  161. alita_sdk/tools/bitbucket/api_wrapper.py +59 -11
  162. alita_sdk/tools/bitbucket/cloud_api_wrapper.py +49 -35
  163. alita_sdk/tools/browser/__init__.py +5 -4
  164. alita_sdk/tools/carrier/__init__.py +5 -6
  165. alita_sdk/tools/carrier/backend_reports_tool.py +6 -6
  166. alita_sdk/tools/carrier/run_ui_test_tool.py +6 -6
  167. alita_sdk/tools/carrier/ui_reports_tool.py +5 -5
  168. alita_sdk/tools/chunkers/__init__.py +3 -1
  169. alita_sdk/tools/chunkers/code/treesitter/treesitter.py +37 -13
  170. alita_sdk/tools/chunkers/sematic/json_chunker.py +1 -0
  171. alita_sdk/tools/chunkers/sematic/markdown_chunker.py +97 -6
  172. alita_sdk/tools/chunkers/universal_chunker.py +270 -0
  173. alita_sdk/tools/cloud/aws/__init__.py +10 -7
  174. alita_sdk/tools/cloud/azure/__init__.py +10 -7
  175. alita_sdk/tools/cloud/gcp/__init__.py +10 -7
  176. alita_sdk/tools/cloud/k8s/__init__.py +10 -7
  177. alita_sdk/tools/code/linter/__init__.py +10 -8
  178. alita_sdk/tools/code/loaders/codesearcher.py +3 -2
  179. alita_sdk/tools/code/sonar/__init__.py +10 -7
  180. alita_sdk/tools/code_indexer_toolkit.py +73 -23
  181. alita_sdk/tools/confluence/__init__.py +21 -15
  182. alita_sdk/tools/confluence/api_wrapper.py +78 -23
  183. alita_sdk/tools/confluence/loader.py +4 -2
  184. alita_sdk/tools/custom_open_api/__init__.py +12 -5
  185. alita_sdk/tools/elastic/__init__.py +11 -8
  186. alita_sdk/tools/elitea_base.py +493 -30
  187. alita_sdk/tools/figma/__init__.py +58 -11
  188. alita_sdk/tools/figma/api_wrapper.py +1235 -143
  189. alita_sdk/tools/figma/figma_client.py +73 -0
  190. alita_sdk/tools/figma/toon_tools.py +2748 -0
  191. alita_sdk/tools/github/__init__.py +13 -14
  192. alita_sdk/tools/github/github_client.py +224 -100
  193. alita_sdk/tools/github/graphql_client_wrapper.py +119 -33
  194. alita_sdk/tools/github/schemas.py +14 -5
  195. alita_sdk/tools/github/tool.py +5 -1
  196. alita_sdk/tools/github/tool_prompts.py +9 -22
  197. alita_sdk/tools/gitlab/__init__.py +15 -11
  198. alita_sdk/tools/gitlab/api_wrapper.py +207 -41
  199. alita_sdk/tools/gitlab_org/__init__.py +10 -8
  200. alita_sdk/tools/gitlab_org/api_wrapper.py +63 -64
  201. alita_sdk/tools/google/bigquery/__init__.py +13 -12
  202. alita_sdk/tools/google/bigquery/tool.py +5 -1
  203. alita_sdk/tools/google_places/__init__.py +10 -8
  204. alita_sdk/tools/google_places/api_wrapper.py +1 -1
  205. alita_sdk/tools/jira/__init__.py +17 -11
  206. alita_sdk/tools/jira/api_wrapper.py +91 -40
  207. alita_sdk/tools/keycloak/__init__.py +11 -8
  208. alita_sdk/tools/localgit/__init__.py +9 -3
  209. alita_sdk/tools/localgit/local_git.py +62 -54
  210. alita_sdk/tools/localgit/tool.py +5 -1
  211. alita_sdk/tools/memory/__init__.py +11 -3
  212. alita_sdk/tools/non_code_indexer_toolkit.py +1 -0
  213. alita_sdk/tools/ocr/__init__.py +11 -8
  214. alita_sdk/tools/openapi/__init__.py +490 -114
  215. alita_sdk/tools/openapi/api_wrapper.py +1368 -0
  216. alita_sdk/tools/openapi/tool.py +20 -0
  217. alita_sdk/tools/pandas/__init__.py +20 -12
  218. alita_sdk/tools/pandas/api_wrapper.py +38 -25
  219. alita_sdk/tools/pandas/dataframe/generator/base.py +3 -1
  220. alita_sdk/tools/postman/__init__.py +11 -11
  221. alita_sdk/tools/pptx/__init__.py +10 -9
  222. alita_sdk/tools/pptx/pptx_wrapper.py +1 -1
  223. alita_sdk/tools/qtest/__init__.py +30 -10
  224. alita_sdk/tools/qtest/api_wrapper.py +430 -13
  225. alita_sdk/tools/rally/__init__.py +10 -8
  226. alita_sdk/tools/rally/api_wrapper.py +1 -1
  227. alita_sdk/tools/report_portal/__init__.py +12 -9
  228. alita_sdk/tools/salesforce/__init__.py +10 -9
  229. alita_sdk/tools/servicenow/__init__.py +17 -14
  230. alita_sdk/tools/servicenow/api_wrapper.py +1 -1
  231. alita_sdk/tools/sharepoint/__init__.py +10 -8
  232. alita_sdk/tools/sharepoint/api_wrapper.py +4 -4
  233. alita_sdk/tools/slack/__init__.py +10 -8
  234. alita_sdk/tools/slack/api_wrapper.py +2 -2
  235. alita_sdk/tools/sql/__init__.py +11 -9
  236. alita_sdk/tools/testio/__init__.py +10 -8
  237. alita_sdk/tools/testrail/__init__.py +11 -8
  238. alita_sdk/tools/testrail/api_wrapper.py +1 -1
  239. alita_sdk/tools/utils/__init__.py +9 -4
  240. alita_sdk/tools/utils/content_parser.py +77 -3
  241. alita_sdk/tools/utils/text_operations.py +410 -0
  242. alita_sdk/tools/utils/tool_prompts.py +79 -0
  243. alita_sdk/tools/vector_adapters/VectorStoreAdapter.py +17 -13
  244. alita_sdk/tools/xray/__init__.py +12 -9
  245. alita_sdk/tools/yagmail/__init__.py +9 -3
  246. alita_sdk/tools/zephyr/__init__.py +9 -7
  247. alita_sdk/tools/zephyr_enterprise/__init__.py +11 -8
  248. alita_sdk/tools/zephyr_essential/__init__.py +10 -8
  249. alita_sdk/tools/zephyr_essential/api_wrapper.py +30 -13
  250. alita_sdk/tools/zephyr_essential/client.py +2 -2
  251. alita_sdk/tools/zephyr_scale/__init__.py +11 -9
  252. alita_sdk/tools/zephyr_scale/api_wrapper.py +2 -2
  253. alita_sdk/tools/zephyr_squad/__init__.py +10 -8
  254. {alita_sdk-0.3.462.dist-info → alita_sdk-0.3.627.dist-info}/METADATA +147 -7
  255. alita_sdk-0.3.627.dist-info/RECORD +468 -0
  256. alita_sdk-0.3.627.dist-info/entry_points.txt +2 -0
  257. alita_sdk-0.3.462.dist-info/RECORD +0 -384
  258. alita_sdk-0.3.462.dist-info/entry_points.txt +0 -2
  259. {alita_sdk-0.3.462.dist-info → alita_sdk-0.3.627.dist-info}/WHEEL +0 -0
  260. {alita_sdk-0.3.462.dist-info → alita_sdk-0.3.627.dist-info}/licenses/LICENSE +0 -0
  261. {alita_sdk-0.3.462.dist-info → alita_sdk-0.3.627.dist-info}/top_level.txt +0 -0
@@ -5,8 +5,9 @@ from pydantic import create_model, BaseModel, ConfigDict, Field
5
5
  from .api_wrapper import SonarApiWrapper
6
6
  from ...base.tool import BaseAction
7
7
  from ...elitea_base import filter_missconfigured_index_tools
8
- from ...utils import clean_string, TOOLKIT_SPLITTER, get_max_toolkit_length
8
+ from ...utils import clean_string, get_max_toolkit_length
9
9
  from ....configurations.sonar import SonarConfiguration
10
+ from ....runtime.utils.constants import TOOLKIT_NAME_META, TOOL_NAME_META, TOOLKIT_TYPE_META
10
11
 
11
12
  name = "sonar"
12
13
 
@@ -21,12 +22,10 @@ def get_tools(tool):
21
22
 
22
23
  class SonarToolkit(BaseToolkit):
23
24
  tools: list[BaseTool] = []
24
- toolkit_max_length: int = 0
25
25
 
26
26
  @staticmethod
27
27
  def toolkit_config_schema() -> BaseModel:
28
28
  selected_tools = {x['name']: x['args_schema'].schema() for x in SonarApiWrapper.model_construct().get_available_tools()}
29
- SonarToolkit.toolkit_max_length = get_max_toolkit_length(selected_tools)
30
29
  return create_model(
31
30
  name,
32
31
  sonar_project_name=(str, Field(description="Project name of the desired repository")),
@@ -55,15 +54,19 @@ class SonarToolkit(BaseToolkit):
55
54
  sonar_api_wrapper = SonarApiWrapper(**wrapper_payload)
56
55
  available_tools = sonar_api_wrapper.get_available_tools()
57
56
  tools = []
58
- prefix = clean_string(toolkit_name, SonarToolkit.toolkit_max_length) + TOOLKIT_SPLITTER if toolkit_name else ''
59
57
  for tool in available_tools:
60
58
  if selected_tools and tool["name"] not in selected_tools:
61
59
  continue
60
+ description = tool["description"]
61
+ if toolkit_name:
62
+ description = f"Toolkit: {toolkit_name}\n{description}"
63
+ description = description[:1000]
62
64
  tools.append(BaseAction(
63
65
  api_wrapper=sonar_api_wrapper,
64
- name=prefix + tool["name"],
65
- description=tool["description"],
66
- args_schema=tool["args_schema"]
66
+ name=tool["name"],
67
+ description=description,
68
+ args_schema=tool["args_schema"],
69
+ metadata={TOOLKIT_NAME_META: toolkit_name, TOOLKIT_TYPE_META: name, TOOL_NAME_META: tool["name"]} if toolkit_name else {TOOL_NAME_META: tool["name"]}
67
70
  ))
68
71
  return cls(tools=tools)
69
72
 
@@ -9,13 +9,13 @@ from langchain_core.tools import ToolException
9
9
  from pydantic import Field
10
10
 
11
11
  from alita_sdk.tools.base_indexer_toolkit import BaseIndexerToolkit
12
- from .chunkers.code.codeparser import parse_code_files_for_db
13
12
 
14
13
  logger = logging.getLogger(__name__)
15
14
 
16
15
 
17
16
  class CodeIndexerToolkit(BaseIndexerToolkit):
18
17
  def _get_indexed_data(self, index_name: str):
18
+ self._ensure_vectorstore_initialized()
19
19
  if not self.vector_adapter:
20
20
  raise ToolException("Vector adapter is not initialized. "
21
21
  "Check your configuration: embedding_model and vectorstore_type.")
@@ -38,12 +38,14 @@ class CodeIndexerToolkit(BaseIndexerToolkit):
38
38
  branch: Optional[str] = None,
39
39
  whitelist: Optional[List[str]] = None,
40
40
  blacklist: Optional[List[str]] = None,
41
+ chunking_config: Optional[dict] = None,
41
42
  **kwargs) -> Generator[Document, None, None]:
42
43
  """Index repository files in the vector store using code parsing."""
43
44
  yield from self.loader(
44
45
  branch=branch,
45
46
  whitelist=whitelist,
46
- blacklist=blacklist
47
+ blacklist=blacklist,
48
+ chunking_config=chunking_config
47
49
  )
48
50
 
49
51
  def _extend_data(self, documents: Generator[Document, None, None]):
@@ -66,26 +68,55 @@ class CodeIndexerToolkit(BaseIndexerToolkit):
66
68
  def loader(self,
67
69
  branch: Optional[str] = None,
68
70
  whitelist: Optional[List[str]] = None,
69
- blacklist: Optional[List[str]] = None) -> Generator[Document, None, None]:
71
+ blacklist: Optional[List[str]] = None,
72
+ chunked: bool = True,
73
+ chunking_config: Optional[dict] = None) -> Generator[Document, None, None]:
70
74
  """
71
- Generates file content from a branch, respecting whitelist and blacklist patterns.
75
+ Generates Documents from files in a branch, respecting whitelist and blacklist patterns.
72
76
 
73
77
  Parameters:
74
78
  - branch (Optional[str]): Branch for listing files. Defaults to the current branch if None.
75
79
  - whitelist (Optional[List[str]]): File extensions or paths to include. Defaults to all files if None.
76
80
  - blacklist (Optional[List[str]]): File extensions or paths to exclude. Defaults to no exclusions if None.
81
+ - chunked (bool): If True (default), applies universal chunker based on file type.
82
+ If False, returns raw Documents without chunking.
83
+ - chunking_config (Optional[dict]): Chunking configuration by file extension
77
84
 
78
85
  Returns:
79
- - generator: Yields content from files matching the whitelist but not the blacklist.
86
+ - generator: Yields Documents from files matching the whitelist but not the blacklist.
87
+ Each document has exactly the key 'filename' in metadata, which is used as an ID
88
+ for further operations (indexing, deduplication, and retrieval).
80
89
 
81
90
  Example:
82
91
  # Use 'feature-branch', include '.py' files, exclude 'test_' files
83
- file_generator = loader(branch='feature-branch', whitelist=['*.py'], blacklist=['*test_*'])
92
+ for doc in loader(branch='feature-branch', whitelist=['*.py'], blacklist=['*test_*']):
93
+ print(doc.page_content)
84
94
 
85
95
  Notes:
86
96
  - Whitelist and blacklist use Unix shell-style wildcards.
87
97
  - Files must match the whitelist and not the blacklist to be included.
98
+ - Each document MUST have exactly the key 'filename' in metadata. This key is used as an ID
99
+ for further operations such as indexing, deduplication, and retrieval.
100
+ - When chunked=True:
101
+ - .md files → markdown chunker (header-based splitting)
102
+ - .py/.js/.ts/etc → code parser (TreeSitter-based)
103
+ - .json files → JSON chunker
104
+ - other files → default text chunker
88
105
  """
106
+ import hashlib
107
+
108
+ # Auto-include extensions from chunking_config if whitelist is specified
109
+ # This allows chunking config to work without manually adding extensions to whitelist
110
+ if chunking_config and whitelist:
111
+ for ext_pattern in chunking_config.keys():
112
+ # Normalize extension pattern (both ".cbl" and "*.cbl" should work)
113
+ normalized = ext_pattern if ext_pattern.startswith('*') else f'*{ext_pattern}'
114
+ if normalized not in whitelist:
115
+ whitelist.append(normalized)
116
+ self._log_tool_event(
117
+ message=f"Auto-included extension '{normalized}' from chunking_config",
118
+ tool_name="loader"
119
+ )
89
120
 
90
121
  _files = self.__handle_get_files("", self.__get_branch(branch))
91
122
  self._log_tool_event(message="Listing files in branch", tool_name="loader")
@@ -103,41 +134,60 @@ class CodeIndexerToolkit(BaseIndexerToolkit):
103
134
  or any(file_path.endswith(f'.{pattern}') for pattern in blacklist))
104
135
  return False
105
136
 
106
- def file_content_generator():
137
+ def raw_document_generator() -> Generator[Document, None, None]:
138
+ """Yields raw Documents without chunking."""
107
139
  self._log_tool_event(message="Reading the files", tool_name="loader")
108
- # log the progress of file reading
109
140
  total_files = len(_files)
141
+ processed = 0
142
+
110
143
  for idx, file in enumerate(_files, 1):
111
144
  if is_whitelisted(file) and not is_blacklisted(file):
112
- # read file ONLY if it matches whitelist and does not match blacklist
113
145
  try:
114
146
  file_content = self._read_file(file, self.__get_branch(branch))
115
147
  except Exception as e:
116
148
  logger.error(f"Failed to read file {file}: {e}")
117
- file_content = ""
149
+ continue
150
+
118
151
  if not file_content:
119
- # empty file, skip
120
152
  continue
121
- #
122
- # ensure file content is a string
153
+
154
+ # Ensure file content is a string
123
155
  if isinstance(file_content, bytes):
124
156
  file_content = file_content.decode("utf-8", errors="ignore")
125
157
  elif isinstance(file_content, dict) and file.endswith('.json'):
126
158
  file_content = json.dumps(file_content)
127
159
  elif not isinstance(file_content, str):
128
160
  file_content = str(file_content)
129
- #
130
- # hash the file content to ensure uniqueness
131
- import hashlib
161
+
162
+ # Hash the file content for uniqueness tracking
132
163
  file_hash = hashlib.sha256(file_content.encode("utf-8")).hexdigest()
133
- yield {"file_name": file,
134
- "file_content": file_content,
135
- "commit_hash": file_hash}
164
+ processed += 1
165
+
166
+ yield Document(
167
+ page_content=file_content,
168
+ metadata={
169
+ 'file_path': file,
170
+ 'filename': file,
171
+ 'source': file,
172
+ 'commit_hash': file_hash,
173
+ }
174
+ )
175
+
136
176
  if idx % 10 == 0 or idx == total_files:
137
- self._log_tool_event(message=f"{idx} out of {total_files} files have been read", tool_name="loader")
138
- self._log_tool_event(message=f"{len(_files)} have been read", tool_name="loader")
139
-
140
- return parse_code_files_for_db(file_content_generator())
177
+ self._log_tool_event(
178
+ message=f"{idx} out of {total_files} files checked, {processed} matched",
179
+ tool_name="loader"
180
+ )
181
+
182
+ self._log_tool_event(message=f"{processed} files loaded", tool_name="loader")
183
+
184
+ if not chunked:
185
+ # Return raw documents without chunking
186
+ return raw_document_generator()
187
+
188
+ # Apply universal chunker based on file type
189
+ from .chunkers.universal_chunker import universal_chunker
190
+ return universal_chunker(raw_document_generator())
141
191
 
142
192
  def __handle_get_files(self, path: str, branch: str):
143
193
  """
@@ -6,14 +6,15 @@ from ..base.tool import BaseAction
6
6
  from pydantic import create_model, BaseModel, ConfigDict, Field
7
7
 
8
8
  from ..elitea_base import filter_missconfigured_index_tools
9
- from ..utils import clean_string, TOOLKIT_SPLITTER, get_max_toolkit_length, parse_list, check_connection_response
9
+ from ..utils import clean_string, get_max_toolkit_length, parse_list, check_connection_response
10
10
  from ...configurations.confluence import ConfluenceConfiguration
11
11
  from ...configurations.pgvector import PgVectorConfiguration
12
12
  import requests
13
+ from ...runtime.utils.constants import TOOLKIT_NAME_META, TOOL_NAME_META, TOOLKIT_TYPE_META
13
14
 
14
15
  name = "confluence"
15
16
 
16
- def get_tools(tool):
17
+ def get_toolkit(tool):
17
18
  return ConfluenceToolkit().get_toolkit(
18
19
  selected_tools=tool['settings'].get('selected_tools', []),
19
20
  space=tool['settings'].get('space', None),
@@ -33,18 +34,19 @@ def get_tools(tool):
33
34
  doctype='doc',
34
35
  embedding_model=tool['settings'].get('embedding_model'),
35
36
  vectorstore_type="PGVector"
36
- ).get_tools()
37
+ )
38
+
39
+ def get_tools(tool):
40
+ return get_toolkit(tool).get_tools()
37
41
 
38
42
 
39
43
  class ConfluenceToolkit(BaseToolkit):
40
44
  tools: List[BaseTool] = []
41
- toolkit_max_length: int = 0
42
45
 
43
46
  @staticmethod
44
47
  def toolkit_config_schema() -> BaseModel:
45
48
  selected_tools = {x['name']: x['args_schema'].schema() for x in
46
49
  ConfluenceAPIWrapper.model_construct().get_available_tools()}
47
- ConfluenceToolkit.toolkit_max_length = get_max_toolkit_length(selected_tools)
48
50
 
49
51
  @check_connection_response
50
52
  def check_connection(self):
@@ -69,16 +71,16 @@ class ConfluenceToolkit(BaseToolkit):
69
71
  name,
70
72
  space=(str, Field(description="Space")),
71
73
  cloud=(bool, Field(description="Hosting Option", json_schema_extra={'configuration': True})),
72
- limit=(int, Field(description="Pages limit per request", default=5)),
74
+ limit=(int, Field(description="Pages limit per request", default=5, gt=0)),
73
75
  labels=(Optional[str], Field(
74
76
  description="List of comma separated labels used for labeling of agent's created or updated entities",
75
77
  default=None,
76
78
  examples="alita,elitea;another-label"
77
79
  )),
78
- max_pages=(int, Field(description="Max total pages", default=10)),
79
- number_of_retries=(int, Field(description="Number of retries", default=2)),
80
- min_retry_seconds=(int, Field(description="Min retry, sec", default=10)),
81
- max_retry_seconds=(int, Field(description="Max retry, sec", default=60)),
80
+ max_pages=(int, Field(description="Max total pages", default=10, gt=0)),
81
+ number_of_retries=(int, Field(description="Number of retries", default=2, ge=0)),
82
+ min_retry_seconds=(int, Field(description="Min retry, sec", default=10, ge=0)),
83
+ max_retry_seconds=(int, Field(description="Max retry, sec", default=60, ge=0)),
82
84
  # optional field for custom headers as dictionary
83
85
  custom_headers=(Optional[dict], Field(description="Custom headers for API requests", default={})),
84
86
  confluence_configuration=(ConfluenceConfiguration, Field(description="Confluence Configuration", json_schema_extra={'configuration_types': ['confluence']})),
@@ -94,7 +96,6 @@ class ConfluenceToolkit(BaseToolkit):
94
96
  'metadata': {
95
97
  "label": "Confluence",
96
98
  "icon_url": None,
97
- "max_length": ConfluenceToolkit.toolkit_max_length,
98
99
  "categories": ["documentation"],
99
100
  "extra_categories": ["confluence", "wiki", "knowledge base", "documentation", "atlassian"]
100
101
  }
@@ -115,18 +116,23 @@ class ConfluenceToolkit(BaseToolkit):
115
116
  **(kwargs.get('pgvector_configuration') or {}),
116
117
  }
117
118
  confluence_api_wrapper = ConfluenceAPIWrapper(**wrapper_payload)
118
- prefix = clean_string(toolkit_name, ConfluenceToolkit.toolkit_max_length) + TOOLKIT_SPLITTER if toolkit_name else ''
119
119
  available_tools = confluence_api_wrapper.get_available_tools()
120
120
  tools = []
121
121
  for tool in available_tools:
122
122
  if selected_tools:
123
123
  if tool["name"] not in selected_tools:
124
124
  continue
125
+ description = tool["description"]
126
+ if toolkit_name:
127
+ description = f"Toolkit: {toolkit_name}\n{description}"
128
+ description = f"Confluence space: {confluence_api_wrapper.space}\n{description}"
129
+ description = description[:1000]
125
130
  tools.append(BaseAction(
126
131
  api_wrapper=confluence_api_wrapper,
127
- name=prefix + tool["name"],
128
- description=f"Confluence space: {confluence_api_wrapper.space}" + tool["description"],
129
- args_schema=tool["args_schema"]
132
+ name=tool["name"],
133
+ description=description,
134
+ args_schema=tool["args_schema"],
135
+ metadata={TOOLKIT_NAME_META: toolkit_name, TOOLKIT_TYPE_META: name, TOOL_NAME_META: tool["name"]} if toolkit_name else {TOOL_NAME_META: tool["name"]}
130
136
  ))
131
137
  return cls(tools=tools)
132
138
 
@@ -480,21 +480,69 @@ class ConfluenceAPIWrapper(NonCodeIndexerToolkit):
480
480
  """Gets pages with specific label in the Confluence space."""
481
481
 
482
482
  start = 0
483
- pages_info = []
484
- for _ in range((self.max_pages + self.limit - 1) // self.limit):
485
- pages = self.client.get_all_pages_by_label(label, start=start,
486
- limit=self.limit) # , expand="body.view.value"
483
+ pages_info: List[Dict[str, Any]] = []
484
+ seen_ids: set[str] = set()
485
+
486
+ # Use a while-loop driven by unique pages collected and
487
+ # presence of additional results instead of a fixed number
488
+ # of iterations based purely on max_pages/limit.
489
+ while len(pages_info) < (self.max_pages or 0):
490
+ pages = self.client.get_all_pages_by_label(
491
+ label,
492
+ start=start,
493
+ limit=self.limit,
494
+ ) # , expand="body.view.value"
487
495
  if not pages:
488
496
  break
489
497
 
490
- pages_info += [{
491
- 'page_id': page.metadata['id'],
492
- 'page_title': page.metadata['title'],
493
- 'page_url': page.metadata['source'],
494
- 'content': page.page_content
495
- } for page in self.get_pages_by_id([page["id"] for page in pages])]
498
+ # Collect only ids we haven't processed yet to avoid
499
+ # calling get_page_by_id multiple times for the same
500
+ # Confluence page.
501
+ new_ids: List[str] = []
502
+ for p in pages:
503
+ page_id = p["id"] if isinstance(p, dict) else getattr(p, "id", None)
504
+ if page_id is None:
505
+ continue
506
+ if page_id in seen_ids:
507
+ continue
508
+ seen_ids.add(page_id)
509
+ new_ids.append(page_id)
510
+
511
+ if new_ids:
512
+ for page in self.get_pages_by_id(new_ids):
513
+ meta = getattr(page, "metadata", {}) or {}
514
+ page_id = meta.get("id")
515
+ page_title = meta.get("title")
516
+ page_url = meta.get("source")
517
+ content = getattr(page, "page_content", None)
518
+
519
+ if page_id is None:
520
+ continue
521
+
522
+ pages_info.append(
523
+ {
524
+ "page_id": page_id,
525
+ "page_title": page_title,
526
+ "page_url": page_url,
527
+ "content": content,
528
+ }
529
+ )
530
+
531
+ # Respect max_pages on unique pages collected.
532
+ if len(pages_info) >= (self.max_pages or 0):
533
+ break
534
+
535
+ # Advance the offset by the requested page size.
496
536
  start += self.limit
497
- return pages_info
537
+
538
+ # Defensive break: if the API returns fewer items than
539
+ # requested, there are likely no more pages to fetch.
540
+ if len(pages) < self.limit:
541
+ break
542
+
543
+ # Slice as an extra safety net in case of any race conditions
544
+ # around the max_pages guard in the loop above.
545
+ return pages_info[: (self.max_pages or len(pages_info))]
498
546
 
499
547
  def is_public_page(self, page: dict) -> bool:
500
548
  """Check if a page is publicly accessible."""
@@ -572,11 +620,18 @@ class ConfluenceAPIWrapper(NonCodeIndexerToolkit):
572
620
  def _process_search(self, cql, skip_images: bool = False):
573
621
  start = 0
574
622
  pages_info = []
623
+ seen_ids: set = set() # Track seen page IDs to avoid duplicates
575
624
  for _ in range((self.max_pages + self.limit - 1) // self.limit):
576
625
  pages = self.client.cql(cql, start=start, limit=self.limit).get("results", [])
577
626
  if not pages:
578
627
  break
579
- page_ids = [page['content']['id'] for page in pages]
628
+ # Deduplicate page IDs before processing
629
+ page_ids = []
630
+ for page in pages:
631
+ page_id = page['content']['id']
632
+ if page_id not in seen_ids:
633
+ seen_ids.add(page_id)
634
+ page_ids.append(page_id)
580
635
  for page in self.get_pages_by_id(page_ids, skip_images):
581
636
  page_info = {
582
637
  'content': page.page_content,
@@ -896,14 +951,14 @@ class ConfluenceAPIWrapper(NonCodeIndexerToolkit):
896
951
 
897
952
  # Re-verify extension filters
898
953
  # Check if file should be skipped based on skip_extensions
899
- if any(re.match(pattern.replace('*', '.*') + '$', title, re.IGNORECASE)
954
+ if any(re.match(re.escape(pattern).replace(r'\*', '.*') + '$', title, re.IGNORECASE)
900
955
  for pattern in self._skip_extensions):
901
956
  continue
902
957
 
903
958
  # Check if file should be included based on include_extensions
904
959
  # If include_extensions is empty, process all files (that weren't skipped)
905
960
  if self._include_extensions and not (
906
- any(re.match(pattern.replace('*', '.*') + '$', title, re.IGNORECASE)
961
+ any(re.match(re.escape(pattern).replace(r'\*', '.*') + '$', title, re.IGNORECASE)
907
962
  for pattern in self._include_extensions)):
908
963
  continue
909
964
 
@@ -914,6 +969,9 @@ class ConfluenceAPIWrapper(NonCodeIndexerToolkit):
914
969
  created_date = hist.get('createdDate', '') if hist else attachment.get('created', '')
915
970
  last_updated = hist.get('lastUpdated', {}).get('when', '') if hist else ''
916
971
 
972
+ attachment_path = attachment['_links']['download'] if attachment.get(
973
+ '_links', {}).get('download') else ''
974
+ download_url = self.client.url.rstrip('/') + attachment_path
917
975
  metadata = {
918
976
  'name': title,
919
977
  'size': attachment.get('extensions', {}).get('fileSize', None),
@@ -923,14 +981,10 @@ class ConfluenceAPIWrapper(NonCodeIndexerToolkit):
923
981
  'media_type': media_type,
924
982
  'labels': [label['name'] for label in
925
983
  attachment.get('metadata', {}).get('labels', {}).get('results', [])],
926
- 'download_url': self.base_url.rstrip('/') + attachment['_links']['download'] if attachment.get(
927
- '_links', {}).get('download') else None
984
+ 'download_url': download_url
928
985
  }
929
-
930
- download_url = self.base_url.rstrip('/') + attachment['_links']['download']
931
-
932
986
  try:
933
- resp = self.client.request(method="GET", path=download_url[len(self.base_url):], advanced_mode=True)
987
+ resp = self.client.request(method="GET", path=attachment_path, advanced_mode=True)
934
988
  if resp.status_code == 200:
935
989
  content = resp.content
936
990
  else:
@@ -1683,8 +1737,8 @@ class ConfluenceAPIWrapper(NonCodeIndexerToolkit):
1683
1737
  "page_ids": (Optional[List[str]], Field(description="List of page IDs to retrieve.", default=None)),
1684
1738
  "label": (Optional[str], Field(description="Label to filter pages.", default=None)),
1685
1739
  "cql": (Optional[str], Field(description="CQL query to filter pages.", default=None)),
1686
- "limit": (Optional[int], Field(description="Limit the number of results.", default=10)),
1687
- "max_pages": (Optional[int], Field(description="Maximum number of pages to retrieve.", default=1000)),
1740
+ "limit": (Optional[int], Field(description="Limit the number of results.", default=10, gt=0)),
1741
+ "max_pages": (Optional[int], Field(description="Maximum number of pages to retrieve.", default=1000, gt=0)),
1688
1742
  "include_restricted_content": (Optional[bool], Field(description="Include restricted content.", default=False)),
1689
1743
  "include_archived_content": (Optional[bool], Field(description="Include archived content.", default=False)),
1690
1744
  "include_attachments": (Optional[bool], Field(description="Include attachments.", default=False)),
@@ -1820,4 +1874,5 @@ class ConfluenceAPIWrapper(NonCodeIndexerToolkit):
1820
1874
  "description": self.get_page_attachments.__doc__,
1821
1875
  "args_schema": GetPageAttachmentsInput,
1822
1876
  }
1823
- ]
1877
+ ]
1878
+
@@ -48,7 +48,8 @@ class AlitaConfluenceLoader(ConfluenceLoader):
48
48
  del kwargs[key]
49
49
  except:
50
50
  pass
51
- self.base_url = kwargs.get('url')
51
+ # utilize adjusted URL from Confluence instance for base_url
52
+ self.base_url = confluence_client.url
52
53
  self.space_key = kwargs.get('space_key')
53
54
  self.page_ids = kwargs.get('page_ids')
54
55
  self.label = kwargs.get('label')
@@ -108,7 +109,8 @@ class AlitaConfluenceLoader(ConfluenceLoader):
108
109
  texts = []
109
110
  for attachment in attachments:
110
111
  media_type = attachment["metadata"]["mediaType"]
111
- absolute_url = self.base_url + attachment["_links"]["download"]
112
+ # utilize adjusted URL from Confluence instance for attachment download URL
113
+ absolute_url = self.confluence.url + attachment["_links"]["download"]
112
114
  title = attachment["title"]
113
115
  try:
114
116
  if media_type == "application/pdf":
@@ -5,7 +5,8 @@ from pydantic import create_model, BaseModel, ConfigDict, Field
5
5
 
6
6
  from .api_wrapper import OpenApiWrapper
7
7
  from ..base.tool import BaseAction
8
- from ..utils import clean_string, TOOLKIT_SPLITTER
8
+ from ..utils import clean_string
9
+ from ...runtime.utils.constants import TOOLKIT_NAME_META, TOOL_NAME_META, TOOLKIT_TYPE_META
9
10
 
10
11
  name = "openapi"
11
12
 
@@ -43,15 +44,21 @@ class OpenApiToolkit(BaseToolkit):
43
44
  openapi_api_wrapper = OpenApiWrapper(**kwargs)
44
45
  available_tools = openapi_api_wrapper.get_available_tools()
45
46
  tools = []
46
- prefix = clean_string(toolkit_name + TOOLKIT_SPLITTER) if toolkit_name else ''
47
+ # Use clean toolkit name for context (max 1000 chars in description)
48
+ toolkit_context = f" [Toolkit: {clean_string(toolkit_name)}]" if toolkit_name else ''
47
49
  for tool in available_tools:
48
50
  if selected_tools and tool["name"] not in selected_tools:
49
51
  continue
52
+ # Add toolkit context to description with character limit
53
+ description = tool["description"]
54
+ if toolkit_context and len(description + toolkit_context) <= 1000:
55
+ description = description + toolkit_context
50
56
  tools.append(BaseAction(
51
57
  api_wrapper=openapi_api_wrapper,
52
- name=prefix + tool["name"],
53
- description=tool["description"],
54
- args_schema=tool["args_schema"]
58
+ name=tool["name"],
59
+ description=description,
60
+ args_schema=tool["args_schema"],
61
+ metadata={TOOLKIT_NAME_META: toolkit_name, TOOLKIT_TYPE_META: name, TOOL_NAME_META: tool["name"]} if toolkit_name else {TOOL_NAME_META: tool["name"]}
55
62
  ))
56
63
  return cls(tools=tools)
57
64
 
@@ -5,7 +5,8 @@ from pydantic import BaseModel, ConfigDict, create_model, Field, SecretStr
5
5
 
6
6
  from .api_wrapper import ELITEAElasticApiWrapper
7
7
  from ..base.tool import BaseAction
8
- from ..utils import clean_string, TOOLKIT_SPLITTER, get_max_toolkit_length
8
+ from ..utils import clean_string, get_max_toolkit_length
9
+ from ...runtime.utils.constants import TOOLKIT_NAME_META, TOOL_NAME_META, TOOLKIT_TYPE_META
9
10
 
10
11
  name = "elastic"
11
12
 
@@ -19,15 +20,13 @@ def get_tools(tool):
19
20
 
20
21
  class ElasticToolkit(BaseToolkit):
21
22
  tools: list[BaseTool] = []
22
- toolkit_max_length: int = 0
23
23
 
24
24
  @staticmethod
25
25
  def toolkit_config_schema() -> BaseModel:
26
26
  selected_tools = {x['name']: x['args_schema'].schema() for x in ELITEAElasticApiWrapper.model_construct().get_available_tools()}
27
- ElasticToolkit.toolkit_max_length = get_max_toolkit_length(selected_tools)
28
27
  return create_model(
29
28
  name,
30
- url=(str, Field(default=None, title="Elasticsearch URL", description="Elasticsearch URL", json_schema_extra={'toolkit_name': True, 'max_toolkit_length': ElasticToolkit.toolkit_max_length})),
29
+ url=(Optional[str], Field(default=None, title="Elasticsearch URL", description="Elasticsearch URL", json_schema_extra={'toolkit_name': True})),
31
30
  api_key=(
32
31
  Optional[SecretStr],
33
32
  Field(
@@ -48,15 +47,19 @@ class ElasticToolkit(BaseToolkit):
48
47
  elastic_api_wrapper = ELITEAElasticApiWrapper(**kwargs)
49
48
  available_tools = elastic_api_wrapper.get_available_tools()
50
49
  tools = []
51
- prefix = clean_string(toolkit_name, ElasticToolkit.toolkit_max_length) + TOOLKIT_SPLITTER if toolkit_name else ''
52
50
  for tool in available_tools:
53
51
  if selected_tools and tool["name"] not in selected_tools:
54
52
  continue
53
+ description = tool["description"]
54
+ if toolkit_name:
55
+ description = f"Toolkit: {toolkit_name}\n{description}"
56
+ description = description[:1000]
55
57
  tools.append(BaseAction(
56
58
  api_wrapper=elastic_api_wrapper,
57
- name=prefix + tool["name"],
58
- description=tool["description"],
59
- args_schema=tool["args_schema"]
59
+ name=tool["name"],
60
+ description=description,
61
+ args_schema=tool["args_schema"],
62
+ metadata={TOOLKIT_NAME_META: toolkit_name, TOOLKIT_TYPE_META: name, TOOL_NAME_META: tool["name"]} if toolkit_name else {TOOL_NAME_META: tool["name"]}
60
63
  ))
61
64
  return cls(tools=tools)
62
65