alita-sdk 0.3.379__py3-none-any.whl → 0.3.627__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (278)
  1. alita_sdk/cli/__init__.py +10 -0
  2. alita_sdk/cli/__main__.py +17 -0
  3. alita_sdk/cli/agent/__init__.py +5 -0
  4. alita_sdk/cli/agent/default.py +258 -0
  5. alita_sdk/cli/agent_executor.py +156 -0
  6. alita_sdk/cli/agent_loader.py +245 -0
  7. alita_sdk/cli/agent_ui.py +228 -0
  8. alita_sdk/cli/agents.py +3113 -0
  9. alita_sdk/cli/callbacks.py +647 -0
  10. alita_sdk/cli/cli.py +168 -0
  11. alita_sdk/cli/config.py +306 -0
  12. alita_sdk/cli/context/__init__.py +30 -0
  13. alita_sdk/cli/context/cleanup.py +198 -0
  14. alita_sdk/cli/context/manager.py +731 -0
  15. alita_sdk/cli/context/message.py +285 -0
  16. alita_sdk/cli/context/strategies.py +289 -0
  17. alita_sdk/cli/context/token_estimation.py +127 -0
  18. alita_sdk/cli/formatting.py +182 -0
  19. alita_sdk/cli/input_handler.py +419 -0
  20. alita_sdk/cli/inventory.py +1073 -0
  21. alita_sdk/cli/mcp_loader.py +315 -0
  22. alita_sdk/cli/testcases/__init__.py +94 -0
  23. alita_sdk/cli/testcases/data_generation.py +119 -0
  24. alita_sdk/cli/testcases/discovery.py +96 -0
  25. alita_sdk/cli/testcases/executor.py +84 -0
  26. alita_sdk/cli/testcases/logger.py +85 -0
  27. alita_sdk/cli/testcases/parser.py +172 -0
  28. alita_sdk/cli/testcases/prompts.py +91 -0
  29. alita_sdk/cli/testcases/reporting.py +125 -0
  30. alita_sdk/cli/testcases/setup.py +108 -0
  31. alita_sdk/cli/testcases/test_runner.py +282 -0
  32. alita_sdk/cli/testcases/utils.py +39 -0
  33. alita_sdk/cli/testcases/validation.py +90 -0
  34. alita_sdk/cli/testcases/workflow.py +196 -0
  35. alita_sdk/cli/toolkit.py +327 -0
  36. alita_sdk/cli/toolkit_loader.py +85 -0
  37. alita_sdk/cli/tools/__init__.py +43 -0
  38. alita_sdk/cli/tools/approval.py +224 -0
  39. alita_sdk/cli/tools/filesystem.py +1751 -0
  40. alita_sdk/cli/tools/planning.py +389 -0
  41. alita_sdk/cli/tools/terminal.py +414 -0
  42. alita_sdk/community/__init__.py +72 -12
  43. alita_sdk/community/inventory/__init__.py +236 -0
  44. alita_sdk/community/inventory/config.py +257 -0
  45. alita_sdk/community/inventory/enrichment.py +2137 -0
  46. alita_sdk/community/inventory/extractors.py +1469 -0
  47. alita_sdk/community/inventory/ingestion.py +3172 -0
  48. alita_sdk/community/inventory/knowledge_graph.py +1457 -0
  49. alita_sdk/community/inventory/parsers/__init__.py +218 -0
  50. alita_sdk/community/inventory/parsers/base.py +295 -0
  51. alita_sdk/community/inventory/parsers/csharp_parser.py +907 -0
  52. alita_sdk/community/inventory/parsers/go_parser.py +851 -0
  53. alita_sdk/community/inventory/parsers/html_parser.py +389 -0
  54. alita_sdk/community/inventory/parsers/java_parser.py +593 -0
  55. alita_sdk/community/inventory/parsers/javascript_parser.py +629 -0
  56. alita_sdk/community/inventory/parsers/kotlin_parser.py +768 -0
  57. alita_sdk/community/inventory/parsers/markdown_parser.py +362 -0
  58. alita_sdk/community/inventory/parsers/python_parser.py +604 -0
  59. alita_sdk/community/inventory/parsers/rust_parser.py +858 -0
  60. alita_sdk/community/inventory/parsers/swift_parser.py +832 -0
  61. alita_sdk/community/inventory/parsers/text_parser.py +322 -0
  62. alita_sdk/community/inventory/parsers/yaml_parser.py +370 -0
  63. alita_sdk/community/inventory/patterns/__init__.py +61 -0
  64. alita_sdk/community/inventory/patterns/ast_adapter.py +380 -0
  65. alita_sdk/community/inventory/patterns/loader.py +348 -0
  66. alita_sdk/community/inventory/patterns/registry.py +198 -0
  67. alita_sdk/community/inventory/presets.py +535 -0
  68. alita_sdk/community/inventory/retrieval.py +1403 -0
  69. alita_sdk/community/inventory/toolkit.py +173 -0
  70. alita_sdk/community/inventory/toolkit_utils.py +176 -0
  71. alita_sdk/community/inventory/visualize.py +1370 -0
  72. alita_sdk/configurations/__init__.py +1 -1
  73. alita_sdk/configurations/ado.py +141 -20
  74. alita_sdk/configurations/bitbucket.py +94 -2
  75. alita_sdk/configurations/confluence.py +130 -1
  76. alita_sdk/configurations/figma.py +76 -0
  77. alita_sdk/configurations/gitlab.py +91 -0
  78. alita_sdk/configurations/jira.py +103 -0
  79. alita_sdk/configurations/openapi.py +329 -0
  80. alita_sdk/configurations/qtest.py +72 -1
  81. alita_sdk/configurations/report_portal.py +96 -0
  82. alita_sdk/configurations/sharepoint.py +148 -0
  83. alita_sdk/configurations/testio.py +83 -0
  84. alita_sdk/configurations/testrail.py +88 -0
  85. alita_sdk/configurations/xray.py +93 -0
  86. alita_sdk/configurations/zephyr_enterprise.py +93 -0
  87. alita_sdk/configurations/zephyr_essential.py +75 -0
  88. alita_sdk/runtime/clients/artifact.py +3 -3
  89. alita_sdk/runtime/clients/client.py +388 -46
  90. alita_sdk/runtime/clients/mcp_discovery.py +342 -0
  91. alita_sdk/runtime/clients/mcp_manager.py +262 -0
  92. alita_sdk/runtime/clients/sandbox_client.py +8 -21
  93. alita_sdk/runtime/langchain/_constants_bkup.py +1318 -0
  94. alita_sdk/runtime/langchain/assistant.py +157 -39
  95. alita_sdk/runtime/langchain/constants.py +647 -1
  96. alita_sdk/runtime/langchain/document_loaders/AlitaDocxMammothLoader.py +315 -3
  97. alita_sdk/runtime/langchain/document_loaders/AlitaExcelLoader.py +103 -60
  98. alita_sdk/runtime/langchain/document_loaders/AlitaJSONLinesLoader.py +77 -0
  99. alita_sdk/runtime/langchain/document_loaders/AlitaJSONLoader.py +10 -4
  100. alita_sdk/runtime/langchain/document_loaders/AlitaPowerPointLoader.py +226 -7
  101. alita_sdk/runtime/langchain/document_loaders/AlitaTextLoader.py +5 -2
  102. alita_sdk/runtime/langchain/document_loaders/constants.py +40 -19
  103. alita_sdk/runtime/langchain/langraph_agent.py +405 -84
  104. alita_sdk/runtime/langchain/utils.py +106 -7
  105. alita_sdk/runtime/llms/preloaded.py +2 -6
  106. alita_sdk/runtime/models/mcp_models.py +61 -0
  107. alita_sdk/runtime/skills/__init__.py +91 -0
  108. alita_sdk/runtime/skills/callbacks.py +498 -0
  109. alita_sdk/runtime/skills/discovery.py +540 -0
  110. alita_sdk/runtime/skills/executor.py +610 -0
  111. alita_sdk/runtime/skills/input_builder.py +371 -0
  112. alita_sdk/runtime/skills/models.py +330 -0
  113. alita_sdk/runtime/skills/registry.py +355 -0
  114. alita_sdk/runtime/skills/skill_runner.py +330 -0
  115. alita_sdk/runtime/toolkits/__init__.py +31 -0
  116. alita_sdk/runtime/toolkits/application.py +29 -10
  117. alita_sdk/runtime/toolkits/artifact.py +20 -11
  118. alita_sdk/runtime/toolkits/datasource.py +13 -6
  119. alita_sdk/runtime/toolkits/mcp.py +783 -0
  120. alita_sdk/runtime/toolkits/mcp_config.py +1048 -0
  121. alita_sdk/runtime/toolkits/planning.py +178 -0
  122. alita_sdk/runtime/toolkits/skill_router.py +238 -0
  123. alita_sdk/runtime/toolkits/subgraph.py +251 -6
  124. alita_sdk/runtime/toolkits/tools.py +356 -69
  125. alita_sdk/runtime/toolkits/vectorstore.py +11 -5
  126. alita_sdk/runtime/tools/__init__.py +10 -3
  127. alita_sdk/runtime/tools/application.py +27 -6
  128. alita_sdk/runtime/tools/artifact.py +511 -28
  129. alita_sdk/runtime/tools/data_analysis.py +183 -0
  130. alita_sdk/runtime/tools/function.py +67 -35
  131. alita_sdk/runtime/tools/graph.py +10 -4
  132. alita_sdk/runtime/tools/image_generation.py +148 -46
  133. alita_sdk/runtime/tools/llm.py +1003 -128
  134. alita_sdk/runtime/tools/loop.py +3 -1
  135. alita_sdk/runtime/tools/loop_output.py +3 -1
  136. alita_sdk/runtime/tools/mcp_inspect_tool.py +284 -0
  137. alita_sdk/runtime/tools/mcp_remote_tool.py +181 -0
  138. alita_sdk/runtime/tools/mcp_server_tool.py +8 -5
  139. alita_sdk/runtime/tools/planning/__init__.py +36 -0
  140. alita_sdk/runtime/tools/planning/models.py +246 -0
  141. alita_sdk/runtime/tools/planning/wrapper.py +607 -0
  142. alita_sdk/runtime/tools/router.py +2 -4
  143. alita_sdk/runtime/tools/sandbox.py +65 -48
  144. alita_sdk/runtime/tools/skill_router.py +776 -0
  145. alita_sdk/runtime/tools/tool.py +3 -1
  146. alita_sdk/runtime/tools/vectorstore.py +9 -3
  147. alita_sdk/runtime/tools/vectorstore_base.py +70 -14
  148. alita_sdk/runtime/utils/AlitaCallback.py +137 -21
  149. alita_sdk/runtime/utils/constants.py +5 -1
  150. alita_sdk/runtime/utils/mcp_client.py +492 -0
  151. alita_sdk/runtime/utils/mcp_oauth.py +361 -0
  152. alita_sdk/runtime/utils/mcp_sse_client.py +434 -0
  153. alita_sdk/runtime/utils/mcp_tools_discovery.py +124 -0
  154. alita_sdk/runtime/utils/serialization.py +155 -0
  155. alita_sdk/runtime/utils/streamlit.py +40 -13
  156. alita_sdk/runtime/utils/toolkit_utils.py +30 -9
  157. alita_sdk/runtime/utils/utils.py +36 -0
  158. alita_sdk/tools/__init__.py +134 -35
  159. alita_sdk/tools/ado/repos/__init__.py +51 -32
  160. alita_sdk/tools/ado/repos/repos_wrapper.py +148 -89
  161. alita_sdk/tools/ado/test_plan/__init__.py +25 -9
  162. alita_sdk/tools/ado/test_plan/test_plan_wrapper.py +23 -1
  163. alita_sdk/tools/ado/utils.py +1 -18
  164. alita_sdk/tools/ado/wiki/__init__.py +25 -12
  165. alita_sdk/tools/ado/wiki/ado_wrapper.py +291 -22
  166. alita_sdk/tools/ado/work_item/__init__.py +26 -13
  167. alita_sdk/tools/ado/work_item/ado_wrapper.py +73 -11
  168. alita_sdk/tools/advanced_jira_mining/__init__.py +11 -8
  169. alita_sdk/tools/aws/delta_lake/__init__.py +13 -9
  170. alita_sdk/tools/aws/delta_lake/tool.py +5 -1
  171. alita_sdk/tools/azure_ai/search/__init__.py +11 -8
  172. alita_sdk/tools/azure_ai/search/api_wrapper.py +1 -1
  173. alita_sdk/tools/base/tool.py +5 -1
  174. alita_sdk/tools/base_indexer_toolkit.py +271 -84
  175. alita_sdk/tools/bitbucket/__init__.py +17 -11
  176. alita_sdk/tools/bitbucket/api_wrapper.py +59 -11
  177. alita_sdk/tools/bitbucket/cloud_api_wrapper.py +49 -35
  178. alita_sdk/tools/browser/__init__.py +5 -4
  179. alita_sdk/tools/carrier/__init__.py +5 -6
  180. alita_sdk/tools/carrier/backend_reports_tool.py +6 -6
  181. alita_sdk/tools/carrier/run_ui_test_tool.py +6 -6
  182. alita_sdk/tools/carrier/ui_reports_tool.py +5 -5
  183. alita_sdk/tools/chunkers/__init__.py +3 -1
  184. alita_sdk/tools/chunkers/code/treesitter/treesitter.py +37 -13
  185. alita_sdk/tools/chunkers/sematic/json_chunker.py +1 -0
  186. alita_sdk/tools/chunkers/sematic/markdown_chunker.py +97 -6
  187. alita_sdk/tools/chunkers/sematic/proposal_chunker.py +1 -1
  188. alita_sdk/tools/chunkers/universal_chunker.py +270 -0
  189. alita_sdk/tools/cloud/aws/__init__.py +10 -7
  190. alita_sdk/tools/cloud/azure/__init__.py +10 -7
  191. alita_sdk/tools/cloud/gcp/__init__.py +10 -7
  192. alita_sdk/tools/cloud/k8s/__init__.py +10 -7
  193. alita_sdk/tools/code/linter/__init__.py +10 -8
  194. alita_sdk/tools/code/loaders/codesearcher.py +3 -2
  195. alita_sdk/tools/code/sonar/__init__.py +11 -8
  196. alita_sdk/tools/code_indexer_toolkit.py +82 -22
  197. alita_sdk/tools/confluence/__init__.py +22 -16
  198. alita_sdk/tools/confluence/api_wrapper.py +107 -30
  199. alita_sdk/tools/confluence/loader.py +14 -2
  200. alita_sdk/tools/custom_open_api/__init__.py +12 -5
  201. alita_sdk/tools/elastic/__init__.py +11 -8
  202. alita_sdk/tools/elitea_base.py +493 -30
  203. alita_sdk/tools/figma/__init__.py +58 -11
  204. alita_sdk/tools/figma/api_wrapper.py +1235 -143
  205. alita_sdk/tools/figma/figma_client.py +73 -0
  206. alita_sdk/tools/figma/toon_tools.py +2748 -0
  207. alita_sdk/tools/github/__init__.py +14 -15
  208. alita_sdk/tools/github/github_client.py +224 -100
  209. alita_sdk/tools/github/graphql_client_wrapper.py +119 -33
  210. alita_sdk/tools/github/schemas.py +14 -5
  211. alita_sdk/tools/github/tool.py +5 -1
  212. alita_sdk/tools/github/tool_prompts.py +9 -22
  213. alita_sdk/tools/gitlab/__init__.py +16 -11
  214. alita_sdk/tools/gitlab/api_wrapper.py +218 -48
  215. alita_sdk/tools/gitlab_org/__init__.py +10 -9
  216. alita_sdk/tools/gitlab_org/api_wrapper.py +63 -64
  217. alita_sdk/tools/google/bigquery/__init__.py +13 -12
  218. alita_sdk/tools/google/bigquery/tool.py +5 -1
  219. alita_sdk/tools/google_places/__init__.py +11 -8
  220. alita_sdk/tools/google_places/api_wrapper.py +1 -1
  221. alita_sdk/tools/jira/__init__.py +17 -10
  222. alita_sdk/tools/jira/api_wrapper.py +92 -41
  223. alita_sdk/tools/keycloak/__init__.py +11 -8
  224. alita_sdk/tools/localgit/__init__.py +9 -3
  225. alita_sdk/tools/localgit/local_git.py +62 -54
  226. alita_sdk/tools/localgit/tool.py +5 -1
  227. alita_sdk/tools/memory/__init__.py +12 -4
  228. alita_sdk/tools/non_code_indexer_toolkit.py +1 -0
  229. alita_sdk/tools/ocr/__init__.py +11 -8
  230. alita_sdk/tools/openapi/__init__.py +491 -106
  231. alita_sdk/tools/openapi/api_wrapper.py +1368 -0
  232. alita_sdk/tools/openapi/tool.py +20 -0
  233. alita_sdk/tools/pandas/__init__.py +20 -12
  234. alita_sdk/tools/pandas/api_wrapper.py +38 -25
  235. alita_sdk/tools/pandas/dataframe/generator/base.py +3 -1
  236. alita_sdk/tools/postman/__init__.py +10 -9
  237. alita_sdk/tools/pptx/__init__.py +11 -10
  238. alita_sdk/tools/pptx/pptx_wrapper.py +1 -1
  239. alita_sdk/tools/qtest/__init__.py +31 -11
  240. alita_sdk/tools/qtest/api_wrapper.py +2135 -86
  241. alita_sdk/tools/rally/__init__.py +10 -9
  242. alita_sdk/tools/rally/api_wrapper.py +1 -1
  243. alita_sdk/tools/report_portal/__init__.py +12 -8
  244. alita_sdk/tools/salesforce/__init__.py +10 -8
  245. alita_sdk/tools/servicenow/__init__.py +17 -15
  246. alita_sdk/tools/servicenow/api_wrapper.py +1 -1
  247. alita_sdk/tools/sharepoint/__init__.py +10 -7
  248. alita_sdk/tools/sharepoint/api_wrapper.py +129 -38
  249. alita_sdk/tools/sharepoint/authorization_helper.py +191 -1
  250. alita_sdk/tools/sharepoint/utils.py +8 -2
  251. alita_sdk/tools/slack/__init__.py +10 -7
  252. alita_sdk/tools/slack/api_wrapper.py +2 -2
  253. alita_sdk/tools/sql/__init__.py +12 -9
  254. alita_sdk/tools/testio/__init__.py +10 -7
  255. alita_sdk/tools/testrail/__init__.py +11 -10
  256. alita_sdk/tools/testrail/api_wrapper.py +1 -1
  257. alita_sdk/tools/utils/__init__.py +9 -4
  258. alita_sdk/tools/utils/content_parser.py +103 -18
  259. alita_sdk/tools/utils/text_operations.py +410 -0
  260. alita_sdk/tools/utils/tool_prompts.py +79 -0
  261. alita_sdk/tools/vector_adapters/VectorStoreAdapter.py +30 -13
  262. alita_sdk/tools/xray/__init__.py +13 -9
  263. alita_sdk/tools/yagmail/__init__.py +9 -3
  264. alita_sdk/tools/zephyr/__init__.py +10 -7
  265. alita_sdk/tools/zephyr_enterprise/__init__.py +11 -7
  266. alita_sdk/tools/zephyr_essential/__init__.py +10 -7
  267. alita_sdk/tools/zephyr_essential/api_wrapper.py +30 -13
  268. alita_sdk/tools/zephyr_essential/client.py +2 -2
  269. alita_sdk/tools/zephyr_scale/__init__.py +11 -8
  270. alita_sdk/tools/zephyr_scale/api_wrapper.py +2 -2
  271. alita_sdk/tools/zephyr_squad/__init__.py +10 -7
  272. {alita_sdk-0.3.379.dist-info → alita_sdk-0.3.627.dist-info}/METADATA +154 -8
  273. alita_sdk-0.3.627.dist-info/RECORD +468 -0
  274. alita_sdk-0.3.627.dist-info/entry_points.txt +2 -0
  275. alita_sdk-0.3.379.dist-info/RECORD +0 -360
  276. {alita_sdk-0.3.379.dist-info → alita_sdk-0.3.627.dist-info}/WHEEL +0 -0
  277. {alita_sdk-0.3.379.dist-info → alita_sdk-0.3.627.dist-info}/licenses/LICENSE +0 -0
  278. {alita_sdk-0.3.379.dist-info → alita_sdk-0.3.627.dist-info}/top_level.txt +0 -0
@@ -5,8 +5,9 @@ from pydantic import create_model, BaseModel, ConfigDict, Field
5
5
  from .api_wrapper import SonarApiWrapper
6
6
  from ...base.tool import BaseAction
7
7
  from ...elitea_base import filter_missconfigured_index_tools
8
- from ...utils import clean_string, TOOLKIT_SPLITTER, get_max_toolkit_length
8
+ from ...utils import clean_string, get_max_toolkit_length
9
9
  from ....configurations.sonar import SonarConfiguration
10
+ from ....runtime.utils.constants import TOOLKIT_NAME_META, TOOL_NAME_META, TOOLKIT_TYPE_META
10
11
 
11
12
  name = "sonar"
12
13
 
@@ -21,15 +22,13 @@ def get_tools(tool):
21
22
 
22
23
  class SonarToolkit(BaseToolkit):
23
24
  tools: list[BaseTool] = []
24
- toolkit_max_length: int = 0
25
25
 
26
26
  @staticmethod
27
27
  def toolkit_config_schema() -> BaseModel:
28
28
  selected_tools = {x['name']: x['args_schema'].schema() for x in SonarApiWrapper.model_construct().get_available_tools()}
29
- SonarToolkit.toolkit_max_length = get_max_toolkit_length(selected_tools)
30
29
  return create_model(
31
30
  name,
32
- sonar_project_name=(str, Field(description="Project name of the desired repository", json_schema_extra={'toolkit_name': True, 'max_toolkit_length': SonarToolkit.toolkit_max_length})),
31
+ sonar_project_name=(str, Field(description="Project name of the desired repository")),
33
32
  sonar_configuration=(SonarConfiguration, Field(description="Sonar Configuration", json_schema_extra={'configuration_types': ['sonar']})),
34
33
  selected_tools=(List[Literal[tuple(selected_tools)]], Field(default=[], json_schema_extra={'args_schemas': selected_tools})),
35
34
  __config__=ConfigDict(json_schema_extra=
@@ -55,15 +54,19 @@ class SonarToolkit(BaseToolkit):
55
54
  sonar_api_wrapper = SonarApiWrapper(**wrapper_payload)
56
55
  available_tools = sonar_api_wrapper.get_available_tools()
57
56
  tools = []
58
- prefix = clean_string(toolkit_name, SonarToolkit.toolkit_max_length) + TOOLKIT_SPLITTER if toolkit_name else ''
59
57
  for tool in available_tools:
60
58
  if selected_tools and tool["name"] not in selected_tools:
61
59
  continue
60
+ description = tool["description"]
61
+ if toolkit_name:
62
+ description = f"Toolkit: {toolkit_name}\n{description}"
63
+ description = description[:1000]
62
64
  tools.append(BaseAction(
63
65
  api_wrapper=sonar_api_wrapper,
64
- name=prefix + tool["name"],
65
- description=tool["description"],
66
- args_schema=tool["args_schema"]
66
+ name=tool["name"],
67
+ description=description,
68
+ args_schema=tool["args_schema"],
69
+ metadata={TOOLKIT_NAME_META: toolkit_name, TOOLKIT_TYPE_META: name, TOOL_NAME_META: tool["name"]} if toolkit_name else {TOOL_NAME_META: tool["name"]}
67
70
  ))
68
71
  return cls(tools=tools)
69
72
 
@@ -1,5 +1,6 @@
1
1
  import ast
2
2
  import fnmatch
3
+ import json
3
4
  import logging
4
5
  from typing import Optional, List, Generator
5
6
 
@@ -8,20 +9,20 @@ from langchain_core.tools import ToolException
8
9
  from pydantic import Field
9
10
 
10
11
  from alita_sdk.tools.base_indexer_toolkit import BaseIndexerToolkit
11
- from .chunkers.code.codeparser import parse_code_files_for_db
12
12
 
13
13
  logger = logging.getLogger(__name__)
14
14
 
15
15
 
16
16
  class CodeIndexerToolkit(BaseIndexerToolkit):
17
17
  def _get_indexed_data(self, index_name: str):
18
+ self._ensure_vectorstore_initialized()
18
19
  if not self.vector_adapter:
19
20
  raise ToolException("Vector adapter is not initialized. "
20
21
  "Check your configuration: embedding_model and vectorstore_type.")
21
22
  return self.vector_adapter.get_code_indexed_data(self, index_name)
22
23
 
23
24
  def key_fn(self, document: Document):
24
- return document.metadata.get('id')
25
+ return document.metadata.get("filename")
25
26
 
26
27
  def compare_fn(self, document: Document, idx_data):
27
28
  return (document.metadata.get('commit_hash') and
@@ -37,16 +38,18 @@ class CodeIndexerToolkit(BaseIndexerToolkit):
37
38
  branch: Optional[str] = None,
38
39
  whitelist: Optional[List[str]] = None,
39
40
  blacklist: Optional[List[str]] = None,
41
+ chunking_config: Optional[dict] = None,
40
42
  **kwargs) -> Generator[Document, None, None]:
41
43
  """Index repository files in the vector store using code parsing."""
42
44
  yield from self.loader(
43
45
  branch=branch,
44
46
  whitelist=whitelist,
45
- blacklist=blacklist
47
+ blacklist=blacklist,
48
+ chunking_config=chunking_config
46
49
  )
47
50
 
48
51
  def _extend_data(self, documents: Generator[Document, None, None]):
49
- yield from parse_code_files_for_db(documents)
52
+ yield from documents
50
53
 
51
54
  def _index_tool_params(self):
52
55
  """Return the parameters for indexing data."""
@@ -65,26 +68,55 @@ class CodeIndexerToolkit(BaseIndexerToolkit):
65
68
  def loader(self,
66
69
  branch: Optional[str] = None,
67
70
  whitelist: Optional[List[str]] = None,
68
- blacklist: Optional[List[str]] = None) -> Generator[Document, None, None]:
71
+ blacklist: Optional[List[str]] = None,
72
+ chunked: bool = True,
73
+ chunking_config: Optional[dict] = None) -> Generator[Document, None, None]:
69
74
  """
70
- Generates file content from a branch, respecting whitelist and blacklist patterns.
75
+ Generates Documents from files in a branch, respecting whitelist and blacklist patterns.
71
76
 
72
77
  Parameters:
73
78
  - branch (Optional[str]): Branch for listing files. Defaults to the current branch if None.
74
79
  - whitelist (Optional[List[str]]): File extensions or paths to include. Defaults to all files if None.
75
80
  - blacklist (Optional[List[str]]): File extensions or paths to exclude. Defaults to no exclusions if None.
81
+ - chunked (bool): If True (default), applies universal chunker based on file type.
82
+ If False, returns raw Documents without chunking.
83
+ - chunking_config (Optional[dict]): Chunking configuration by file extension
76
84
 
77
85
  Returns:
78
- - generator: Yields content from files matching the whitelist but not the blacklist.
86
+ - generator: Yields Documents from files matching the whitelist but not the blacklist.
87
+ Each document has exactly the key 'filename' in metadata, which is used as an ID
88
+ for further operations (indexing, deduplication, and retrieval).
79
89
 
80
90
  Example:
81
91
  # Use 'feature-branch', include '.py' files, exclude 'test_' files
82
- file_generator = loader(branch='feature-branch', whitelist=['*.py'], blacklist=['*test_*'])
92
+ for doc in loader(branch='feature-branch', whitelist=['*.py'], blacklist=['*test_*']):
93
+ print(doc.page_content)
83
94
 
84
95
  Notes:
85
96
  - Whitelist and blacklist use Unix shell-style wildcards.
86
97
  - Files must match the whitelist and not the blacklist to be included.
98
+ - Each document MUST have exactly the key 'filename' in metadata. This key is used as an ID
99
+ for further operations such as indexing, deduplication, and retrieval.
100
+ - When chunked=True:
101
+ - .md files → markdown chunker (header-based splitting)
102
+ - .py/.js/.ts/etc → code parser (TreeSitter-based)
103
+ - .json files → JSON chunker
104
+ - other files → default text chunker
87
105
  """
106
+ import hashlib
107
+
108
+ # Auto-include extensions from chunking_config if whitelist is specified
109
+ # This allows chunking config to work without manually adding extensions to whitelist
110
+ if chunking_config and whitelist:
111
+ for ext_pattern in chunking_config.keys():
112
+ # Normalize extension pattern (both ".cbl" and "*.cbl" should work)
113
+ normalized = ext_pattern if ext_pattern.startswith('*') else f'*{ext_pattern}'
114
+ if normalized not in whitelist:
115
+ whitelist.append(normalized)
116
+ self._log_tool_event(
117
+ message=f"Auto-included extension '{normalized}' from chunking_config",
118
+ tool_name="loader"
119
+ )
88
120
 
89
121
  _files = self.__handle_get_files("", self.__get_branch(branch))
90
122
  self._log_tool_event(message="Listing files in branch", tool_name="loader")
@@ -102,32 +134,60 @@ class CodeIndexerToolkit(BaseIndexerToolkit):
102
134
  or any(file_path.endswith(f'.{pattern}') for pattern in blacklist))
103
135
  return False
104
136
 
105
- def file_content_generator():
137
+ def raw_document_generator() -> Generator[Document, None, None]:
138
+ """Yields raw Documents without chunking."""
106
139
  self._log_tool_event(message="Reading the files", tool_name="loader")
107
- # log the progress of file reading
108
140
  total_files = len(_files)
141
+ processed = 0
142
+
109
143
  for idx, file in enumerate(_files, 1):
110
144
  if is_whitelisted(file) and not is_blacklisted(file):
111
- # read file ONLY if it matches whitelist and does not match blacklist
112
145
  try:
113
146
  file_content = self._read_file(file, self.__get_branch(branch))
114
147
  except Exception as e:
115
148
  logger.error(f"Failed to read file {file}: {e}")
116
- file_content = ""
149
+ continue
150
+
117
151
  if not file_content:
118
- # empty file, skip
119
152
  continue
120
- # hash the file content to ensure uniqueness
121
- import hashlib
153
+
154
+ # Ensure file content is a string
155
+ if isinstance(file_content, bytes):
156
+ file_content = file_content.decode("utf-8", errors="ignore")
157
+ elif isinstance(file_content, dict) and file.endswith('.json'):
158
+ file_content = json.dumps(file_content)
159
+ elif not isinstance(file_content, str):
160
+ file_content = str(file_content)
161
+
162
+ # Hash the file content for uniqueness tracking
122
163
  file_hash = hashlib.sha256(file_content.encode("utf-8")).hexdigest()
123
- yield {"file_name": file,
124
- "file_content": file_content,
125
- "commit_hash": file_hash}
164
+ processed += 1
165
+
166
+ yield Document(
167
+ page_content=file_content,
168
+ metadata={
169
+ 'file_path': file,
170
+ 'filename': file,
171
+ 'source': file,
172
+ 'commit_hash': file_hash,
173
+ }
174
+ )
175
+
126
176
  if idx % 10 == 0 or idx == total_files:
127
- self._log_tool_event(message=f"{idx} out of {total_files} files have been read", tool_name="loader")
128
- self._log_tool_event(message=f"{len(_files)} have been read", tool_name="loader")
129
-
130
- return file_content_generator()
177
+ self._log_tool_event(
178
+ message=f"{idx} out of {total_files} files checked, {processed} matched",
179
+ tool_name="loader"
180
+ )
181
+
182
+ self._log_tool_event(message=f"{processed} files loaded", tool_name="loader")
183
+
184
+ if not chunked:
185
+ # Return raw documents without chunking
186
+ return raw_document_generator()
187
+
188
+ # Apply universal chunker based on file type
189
+ from .chunkers.universal_chunker import universal_chunker
190
+ return universal_chunker(raw_document_generator())
131
191
 
132
192
  def __handle_get_files(self, path: str, branch: str):
133
193
  """
@@ -6,14 +6,15 @@ from ..base.tool import BaseAction
6
6
  from pydantic import create_model, BaseModel, ConfigDict, Field
7
7
 
8
8
  from ..elitea_base import filter_missconfigured_index_tools
9
- from ..utils import clean_string, TOOLKIT_SPLITTER, get_max_toolkit_length, parse_list, check_connection_response
9
+ from ..utils import clean_string, get_max_toolkit_length, parse_list, check_connection_response
10
10
  from ...configurations.confluence import ConfluenceConfiguration
11
11
  from ...configurations.pgvector import PgVectorConfiguration
12
12
  import requests
13
+ from ...runtime.utils.constants import TOOLKIT_NAME_META, TOOL_NAME_META, TOOLKIT_TYPE_META
13
14
 
14
15
  name = "confluence"
15
16
 
16
- def get_tools(tool):
17
+ def get_toolkit(tool):
17
18
  return ConfluenceToolkit().get_toolkit(
18
19
  selected_tools=tool['settings'].get('selected_tools', []),
19
20
  space=tool['settings'].get('space', None),
@@ -33,18 +34,19 @@ def get_tools(tool):
33
34
  doctype='doc',
34
35
  embedding_model=tool['settings'].get('embedding_model'),
35
36
  vectorstore_type="PGVector"
36
- ).get_tools()
37
+ )
38
+
39
+ def get_tools(tool):
40
+ return get_toolkit(tool).get_tools()
37
41
 
38
42
 
39
43
  class ConfluenceToolkit(BaseToolkit):
40
44
  tools: List[BaseTool] = []
41
- toolkit_max_length: int = 0
42
45
 
43
46
  @staticmethod
44
47
  def toolkit_config_schema() -> BaseModel:
45
48
  selected_tools = {x['name']: x['args_schema'].schema() for x in
46
49
  ConfluenceAPIWrapper.model_construct().get_available_tools()}
47
- ConfluenceToolkit.toolkit_max_length = get_max_toolkit_length(selected_tools)
48
50
 
49
51
  @check_connection_response
50
52
  def check_connection(self):
@@ -67,19 +69,18 @@ class ConfluenceToolkit(BaseToolkit):
67
69
 
68
70
  model = create_model(
69
71
  name,
70
- space=(str, Field(description="Space", json_schema_extra={'toolkit_name': True,
71
- 'max_toolkit_length': ConfluenceToolkit.toolkit_max_length})),
72
+ space=(str, Field(description="Space")),
72
73
  cloud=(bool, Field(description="Hosting Option", json_schema_extra={'configuration': True})),
73
- limit=(int, Field(description="Pages limit per request", default=5)),
74
+ limit=(int, Field(description="Pages limit per request", default=5, gt=0)),
74
75
  labels=(Optional[str], Field(
75
76
  description="List of comma separated labels used for labeling of agent's created or updated entities",
76
77
  default=None,
77
78
  examples="alita,elitea;another-label"
78
79
  )),
79
- max_pages=(int, Field(description="Max total pages", default=10)),
80
- number_of_retries=(int, Field(description="Number of retries", default=2)),
81
- min_retry_seconds=(int, Field(description="Min retry, sec", default=10)),
82
- max_retry_seconds=(int, Field(description="Max retry, sec", default=60)),
80
+ max_pages=(int, Field(description="Max total pages", default=10, gt=0)),
81
+ number_of_retries=(int, Field(description="Number of retries", default=2, ge=0)),
82
+ min_retry_seconds=(int, Field(description="Min retry, sec", default=10, ge=0)),
83
+ max_retry_seconds=(int, Field(description="Max retry, sec", default=60, ge=0)),
83
84
  # optional field for custom headers as dictionary
84
85
  custom_headers=(Optional[dict], Field(description="Custom headers for API requests", default={})),
85
86
  confluence_configuration=(ConfluenceConfiguration, Field(description="Confluence Configuration", json_schema_extra={'configuration_types': ['confluence']})),
@@ -115,18 +116,23 @@ class ConfluenceToolkit(BaseToolkit):
115
116
  **(kwargs.get('pgvector_configuration') or {}),
116
117
  }
117
118
  confluence_api_wrapper = ConfluenceAPIWrapper(**wrapper_payload)
118
- prefix = clean_string(toolkit_name, ConfluenceToolkit.toolkit_max_length) + TOOLKIT_SPLITTER if toolkit_name else ''
119
119
  available_tools = confluence_api_wrapper.get_available_tools()
120
120
  tools = []
121
121
  for tool in available_tools:
122
122
  if selected_tools:
123
123
  if tool["name"] not in selected_tools:
124
124
  continue
125
+ description = tool["description"]
126
+ if toolkit_name:
127
+ description = f"Toolkit: {toolkit_name}\n{description}"
128
+ description = f"Confluence space: {confluence_api_wrapper.space}\n{description}"
129
+ description = description[:1000]
125
130
  tools.append(BaseAction(
126
131
  api_wrapper=confluence_api_wrapper,
127
- name=prefix + tool["name"],
128
- description=f"Confluence space: {confluence_api_wrapper.space}" + tool["description"],
129
- args_schema=tool["args_schema"]
132
+ name=tool["name"],
133
+ description=description,
134
+ args_schema=tool["args_schema"],
135
+ metadata={TOOLKIT_NAME_META: toolkit_name, TOOLKIT_TYPE_META: name, TOOL_NAME_META: tool["name"]} if toolkit_name else {TOOL_NAME_META: tool["name"]}
130
136
  ))
131
137
  return cls(tools=tools)
132
138
 
@@ -7,12 +7,14 @@ from json import JSONDecodeError
7
7
  from typing import Optional, List, Any, Dict, Callable, Generator, Literal
8
8
 
9
9
  import requests
10
+ from atlassian.errors import ApiError
10
11
  from langchain_community.document_loaders.confluence import ContentFormat
11
12
  from langchain_core.documents import Document
12
13
  from langchain_core.messages import HumanMessage
13
14
  from langchain_core.tools import ToolException
14
15
  from markdownify import markdownify
15
16
  from pydantic import Field, PrivateAttr, model_validator, create_model, SecretStr
17
+ from requests import HTTPError
16
18
  from tenacity import retry, stop_after_attempt, wait_exponential, before_sleep_log
17
19
 
18
20
  from alita_sdk.tools.non_code_indexer_toolkit import NonCodeIndexerToolkit
@@ -194,6 +196,7 @@ class ConfluenceAPIWrapper(NonCodeIndexerToolkit):
194
196
  keep_markdown_format: Optional[bool] = True
195
197
  ocr_languages: Optional[str] = None
196
198
  keep_newlines: Optional[bool] = True
199
+ _errors: Optional[list[str]] = None
197
200
  _image_cache: ImageDescriptionCache = PrivateAttr(default_factory=ImageDescriptionCache)
198
201
 
199
202
  @model_validator(mode='before')
@@ -477,28 +480,78 @@ class ConfluenceAPIWrapper(NonCodeIndexerToolkit):
477
480
  """Gets pages with specific label in the Confluence space."""
478
481
 
479
482
  start = 0
480
- pages_info = []
481
- for _ in range((self.max_pages + self.limit - 1) // self.limit):
482
- pages = self.client.get_all_pages_by_label(label, start=start,
483
- limit=self.limit) # , expand="body.view.value"
483
+ pages_info: List[Dict[str, Any]] = []
484
+ seen_ids: set[str] = set()
485
+
486
+ # Use a while-loop driven by unique pages collected and
487
+ # presence of additional results instead of a fixed number
488
+ # of iterations based purely on max_pages/limit.
489
+ while len(pages_info) < (self.max_pages or 0):
490
+ pages = self.client.get_all_pages_by_label(
491
+ label,
492
+ start=start,
493
+ limit=self.limit,
494
+ ) # , expand="body.view.value"
484
495
  if not pages:
485
496
  break
486
497
 
487
- pages_info += [{
488
- 'page_id': page.metadata['id'],
489
- 'page_title': page.metadata['title'],
490
- 'page_url': page.metadata['source'],
491
- 'content': page.page_content
492
- } for page in self.get_pages_by_id([page["id"] for page in pages])]
498
+ # Collect only ids we haven't processed yet to avoid
499
+ # calling get_page_by_id multiple times for the same
500
+ # Confluence page.
501
+ new_ids: List[str] = []
502
+ for p in pages:
503
+ page_id = p["id"] if isinstance(p, dict) else getattr(p, "id", None)
504
+ if page_id is None:
505
+ continue
506
+ if page_id in seen_ids:
507
+ continue
508
+ seen_ids.add(page_id)
509
+ new_ids.append(page_id)
510
+
511
+ if new_ids:
512
+ for page in self.get_pages_by_id(new_ids):
513
+ meta = getattr(page, "metadata", {}) or {}
514
+ page_id = meta.get("id")
515
+ page_title = meta.get("title")
516
+ page_url = meta.get("source")
517
+ content = getattr(page, "page_content", None)
518
+
519
+ if page_id is None:
520
+ continue
521
+
522
+ pages_info.append(
523
+ {
524
+ "page_id": page_id,
525
+ "page_title": page_title,
526
+ "page_url": page_url,
527
+ "content": content,
528
+ }
529
+ )
530
+
531
+ # Respect max_pages on unique pages collected.
532
+ if len(pages_info) >= (self.max_pages or 0):
533
+ break
534
+
535
+ # Advance the offset by the requested page size.
493
536
  start += self.limit
494
- return pages_info
537
+
538
+ # Defensive break: if the API returns fewer items than
539
+ # requested, there are likely no more pages to fetch.
540
+ if len(pages) < self.limit:
541
+ break
542
+
543
+ # Slice as an extra safety net in case of any race conditions
544
+ # around the max_pages guard in the loop above.
545
+ return pages_info[: (self.max_pages or len(pages_info))]
495
546
 
496
547
  def is_public_page(self, page: dict) -> bool:
497
548
  """Check if a page is publicly accessible."""
498
549
  restrictions = self.client.get_all_restrictions_for_content(page["id"])
499
550
 
500
551
  return (
501
- page["status"] == "current"
552
+ (page["status"] == "current"
553
+ # allow user to see archived content if needed
554
+ or page["status"] == "archived")
502
555
  and not restrictions["read"]["restrictions"]["user"]["results"]
503
556
  and not restrictions["read"]["restrictions"]["group"]["results"]
504
557
  )
@@ -518,18 +571,35 @@ class ConfluenceAPIWrapper(NonCodeIndexerToolkit):
518
571
  ),
519
572
  before_sleep=before_sleep_log(logger, logging.WARNING),
520
573
  )(self.client.get_page_by_id)
521
- page = get_page(
522
- page_id=page_id, expand=f"{self.content_format.value},version"
523
- )
524
- if not self.include_restricted_content and not self.is_public_page(page):
525
- continue
574
+ try:
575
+ page = get_page(
576
+ page_id=page_id, expand=f"{self.content_format.value},version"
577
+ )
578
+ except (ApiError, HTTPError) as e:
579
+ logger.error(f"Error fetching page with ID {page_id}: {e}")
580
+ page_content_temp = f"Confluence API Error: cannot fetch the page with ID {page_id}: {e}"
581
+ # store errors
582
+ if self._errors is None:
583
+ self._errors = []
584
+ self._errors.append(page_content_temp)
585
+ return Document(page_content=page_content_temp,
586
+ metadata={})
587
+ # TODO: update on toolkit advanced settings level as a separate feature
588
+ # if not self.include_restricted_content and not self.is_public_page(page):
589
+ # continue
526
590
  yield self.process_page(page, skip_images)
527
591
 
592
+ def _log_errors(self):
593
+ """ Log errors encountered during toolkit execution. """
594
+ if self._errors:
595
+ logger.info(f"Errors encountered during toolkit execution: {self._errors}")
596
+
528
597
  def read_page_by_id(self, page_id: str, skip_images: bool = False):
529
598
  """Reads a page by its id in the Confluence space. If id is not available, but there is a title - use get_page_id first."""
530
599
  result = list(self.get_pages_by_id([page_id], skip_images))
531
600
  if not result:
532
- "Page not found"
601
+ return f"Pages not found. Errors: {self._errors}" if self._errors \
602
+ else "Pages not found or you do not have access to them."
533
603
  return result[0].page_content
534
604
  # return self._strip_base64_images(result[0].page_content) if skip_images else result[0].page_content
535
605
 
@@ -550,11 +620,18 @@ class ConfluenceAPIWrapper(NonCodeIndexerToolkit):
550
620
  def _process_search(self, cql, skip_images: bool = False):
551
621
  start = 0
552
622
  pages_info = []
623
+ seen_ids: set = set() # Track seen page IDs to avoid duplicates
553
624
  for _ in range((self.max_pages + self.limit - 1) // self.limit):
554
625
  pages = self.client.cql(cql, start=start, limit=self.limit).get("results", [])
555
626
  if not pages:
556
627
  break
557
- page_ids = [page['content']['id'] for page in pages]
628
+ # Deduplicate page IDs before processing
629
+ page_ids = []
630
+ for page in pages:
631
+ page_id = page['content']['id']
632
+ if page_id not in seen_ids:
633
+ seen_ids.add(page_id)
634
+ page_ids.append(page_id)
558
635
  for page in self.get_pages_by_id(page_ids, skip_images):
559
636
  page_info = {
560
637
  'content': page.page_content,
@@ -874,14 +951,14 @@ class ConfluenceAPIWrapper(NonCodeIndexerToolkit):
874
951
 
875
952
  # Re-verify extension filters
876
953
  # Check if file should be skipped based on skip_extensions
877
- if any(re.match(pattern.replace('*', '.*') + '$', title, re.IGNORECASE)
954
+ if any(re.match(re.escape(pattern).replace(r'\*', '.*') + '$', title, re.IGNORECASE)
878
955
  for pattern in self._skip_extensions):
879
956
  continue
880
957
 
881
958
  # Check if file should be included based on include_extensions
882
959
  # If include_extensions is empty, process all files (that weren't skipped)
883
960
  if self._include_extensions and not (
884
- any(re.match(pattern.replace('*', '.*') + '$', title, re.IGNORECASE)
961
+ any(re.match(re.escape(pattern).replace(r'\*', '.*') + '$', title, re.IGNORECASE)
885
962
  for pattern in self._include_extensions)):
886
963
  continue
887
964
 
@@ -892,6 +969,9 @@ class ConfluenceAPIWrapper(NonCodeIndexerToolkit):
892
969
  created_date = hist.get('createdDate', '') if hist else attachment.get('created', '')
893
970
  last_updated = hist.get('lastUpdated', {}).get('when', '') if hist else ''
894
971
 
972
+ attachment_path = attachment['_links']['download'] if attachment.get(
973
+ '_links', {}).get('download') else ''
974
+ download_url = self.client.url.rstrip('/') + attachment_path
895
975
  metadata = {
896
976
  'name': title,
897
977
  'size': attachment.get('extensions', {}).get('fileSize', None),
@@ -901,14 +981,10 @@ class ConfluenceAPIWrapper(NonCodeIndexerToolkit):
901
981
  'media_type': media_type,
902
982
  'labels': [label['name'] for label in
903
983
  attachment.get('metadata', {}).get('labels', {}).get('results', [])],
904
- 'download_url': self.base_url.rstrip('/') + attachment['_links']['download'] if attachment.get(
905
- '_links', {}).get('download') else None
984
+ 'download_url': download_url
906
985
  }
907
-
908
- download_url = self.base_url.rstrip('/') + attachment['_links']['download']
909
-
910
986
  try:
911
- resp = self.client.request(method="GET", path=download_url[len(self.base_url):], advanced_mode=True)
987
+ resp = self.client.request(method="GET", path=attachment_path, advanced_mode=True)
912
988
  if resp.status_code == 200:
913
989
  content = resp.content
914
990
  else:
@@ -1661,8 +1737,8 @@ class ConfluenceAPIWrapper(NonCodeIndexerToolkit):
1661
1737
  "page_ids": (Optional[List[str]], Field(description="List of page IDs to retrieve.", default=None)),
1662
1738
  "label": (Optional[str], Field(description="Label to filter pages.", default=None)),
1663
1739
  "cql": (Optional[str], Field(description="CQL query to filter pages.", default=None)),
1664
- "limit": (Optional[int], Field(description="Limit the number of results.", default=10)),
1665
- "max_pages": (Optional[int], Field(description="Maximum number of pages to retrieve.", default=1000)),
1740
+ "limit": (Optional[int], Field(description="Limit the number of results.", default=10, gt=0)),
1741
+ "max_pages": (Optional[int], Field(description="Maximum number of pages to retrieve.", default=1000, gt=0)),
1666
1742
  "include_restricted_content": (Optional[bool], Field(description="Include restricted content.", default=False)),
1667
1743
  "include_archived_content": (Optional[bool], Field(description="Include archived content.", default=False)),
1668
1744
  "include_attachments": (Optional[bool], Field(description="Include attachments.", default=False)),
@@ -1798,4 +1874,5 @@ class ConfluenceAPIWrapper(NonCodeIndexerToolkit):
1798
1874
  "description": self.get_page_attachments.__doc__,
1799
1875
  "args_schema": GetPageAttachmentsInput,
1800
1876
  }
1801
- ]
1877
+ ]
1878
+
@@ -3,6 +3,7 @@ from typing import Optional, List
3
3
  from logging import getLogger
4
4
 
5
5
  import requests
6
+ from langchain_core.documents import Document
6
7
 
7
8
  logger = getLogger(__name__)
8
9
  from PIL import Image
@@ -47,7 +48,8 @@ class AlitaConfluenceLoader(ConfluenceLoader):
47
48
  del kwargs[key]
48
49
  except:
49
50
  pass
50
- self.base_url = kwargs.get('url')
51
+ # utilize adjusted URL from Confluence instance for base_url
52
+ self.base_url = confluence_client.url
51
53
  self.space_key = kwargs.get('space_key')
52
54
  self.page_ids = kwargs.get('page_ids')
53
55
  self.label = kwargs.get('label')
@@ -107,7 +109,8 @@ class AlitaConfluenceLoader(ConfluenceLoader):
107
109
  texts = []
108
110
  for attachment in attachments:
109
111
  media_type = attachment["metadata"]["mediaType"]
110
- absolute_url = self.base_url + attachment["_links"]["download"]
112
+ # utilize adjusted URL from Confluence instance for attachment download URL
113
+ absolute_url = self.confluence.url + attachment["_links"]["download"]
111
114
  title = attachment["title"]
112
115
  try:
113
116
  if media_type == "application/pdf":
@@ -193,6 +196,15 @@ class AlitaConfluenceLoader(ConfluenceLoader):
193
196
  else:
194
197
  return super().process_image(link, ocr_languages)
195
198
 
199
+ def process_page(self, page: dict, include_attachments: bool, include_comments: bool, include_labels: bool,
200
+ content_format: ContentFormat, ocr_languages: Optional[str] = None,
201
+ keep_markdown_format: Optional[bool] = False, keep_newlines: bool = False) -> Document:
202
+ if not page.get("title"):
203
+ # if 'include_restricted_content' set to True, draft pages are loaded and can have no title
204
+ page["title"] = "Untitled"
205
+ return super().process_page(page, include_attachments, include_comments, include_labels, content_format,
206
+ ocr_languages, keep_markdown_format, keep_newlines)
207
+
196
208
  # TODO review usage
197
209
  # def process_svg(
198
210
  # self,
@@ -5,7 +5,8 @@ from pydantic import create_model, BaseModel, ConfigDict, Field
5
5
 
6
6
  from .api_wrapper import OpenApiWrapper
7
7
  from ..base.tool import BaseAction
8
- from ..utils import clean_string, TOOLKIT_SPLITTER
8
+ from ..utils import clean_string
9
+ from ...runtime.utils.constants import TOOLKIT_NAME_META, TOOL_NAME_META, TOOLKIT_TYPE_META
9
10
 
10
11
  name = "openapi"
11
12
 
@@ -43,15 +44,21 @@ class OpenApiToolkit(BaseToolkit):
43
44
  openapi_api_wrapper = OpenApiWrapper(**kwargs)
44
45
  available_tools = openapi_api_wrapper.get_available_tools()
45
46
  tools = []
46
- prefix = clean_string(toolkit_name + TOOLKIT_SPLITTER) if toolkit_name else ''
47
+ # Use clean toolkit name for context (max 1000 chars in description)
48
+ toolkit_context = f" [Toolkit: {clean_string(toolkit_name)}]" if toolkit_name else ''
47
49
  for tool in available_tools:
48
50
  if selected_tools and tool["name"] not in selected_tools:
49
51
  continue
52
+ # Add toolkit context to description with character limit
53
+ description = tool["description"]
54
+ if toolkit_context and len(description + toolkit_context) <= 1000:
55
+ description = description + toolkit_context
50
56
  tools.append(BaseAction(
51
57
  api_wrapper=openapi_api_wrapper,
52
- name=prefix + tool["name"],
53
- description=tool["description"],
54
- args_schema=tool["args_schema"]
58
+ name=tool["name"],
59
+ description=description,
60
+ args_schema=tool["args_schema"],
61
+ metadata={TOOLKIT_NAME_META: toolkit_name, TOOLKIT_TYPE_META: name, TOOL_NAME_META: tool["name"]} if toolkit_name else {TOOL_NAME_META: tool["name"]}
55
62
  ))
56
63
  return cls(tools=tools)
57
64