alita-sdk 0.3.257__py3-none-any.whl → 0.3.584__py3-none-any.whl

This diff compares two publicly released versions of the package as they appear in their public registry; it is provided for informational purposes only.

Potentially problematic release: this version of alita-sdk might be problematic.

Files changed (281)
  1. alita_sdk/cli/__init__.py +10 -0
  2. alita_sdk/cli/__main__.py +17 -0
  3. alita_sdk/cli/agent/__init__.py +5 -0
  4. alita_sdk/cli/agent/default.py +258 -0
  5. alita_sdk/cli/agent_executor.py +155 -0
  6. alita_sdk/cli/agent_loader.py +215 -0
  7. alita_sdk/cli/agent_ui.py +228 -0
  8. alita_sdk/cli/agents.py +3794 -0
  9. alita_sdk/cli/callbacks.py +647 -0
  10. alita_sdk/cli/cli.py +168 -0
  11. alita_sdk/cli/config.py +306 -0
  12. alita_sdk/cli/context/__init__.py +30 -0
  13. alita_sdk/cli/context/cleanup.py +198 -0
  14. alita_sdk/cli/context/manager.py +731 -0
  15. alita_sdk/cli/context/message.py +285 -0
  16. alita_sdk/cli/context/strategies.py +289 -0
  17. alita_sdk/cli/context/token_estimation.py +127 -0
  18. alita_sdk/cli/formatting.py +182 -0
  19. alita_sdk/cli/input_handler.py +419 -0
  20. alita_sdk/cli/inventory.py +1073 -0
  21. alita_sdk/cli/mcp_loader.py +315 -0
  22. alita_sdk/cli/toolkit.py +327 -0
  23. alita_sdk/cli/toolkit_loader.py +85 -0
  24. alita_sdk/cli/tools/__init__.py +43 -0
  25. alita_sdk/cli/tools/approval.py +224 -0
  26. alita_sdk/cli/tools/filesystem.py +1751 -0
  27. alita_sdk/cli/tools/planning.py +389 -0
  28. alita_sdk/cli/tools/terminal.py +414 -0
  29. alita_sdk/community/__init__.py +72 -12
  30. alita_sdk/community/inventory/__init__.py +236 -0
  31. alita_sdk/community/inventory/config.py +257 -0
  32. alita_sdk/community/inventory/enrichment.py +2137 -0
  33. alita_sdk/community/inventory/extractors.py +1469 -0
  34. alita_sdk/community/inventory/ingestion.py +3172 -0
  35. alita_sdk/community/inventory/knowledge_graph.py +1457 -0
  36. alita_sdk/community/inventory/parsers/__init__.py +218 -0
  37. alita_sdk/community/inventory/parsers/base.py +295 -0
  38. alita_sdk/community/inventory/parsers/csharp_parser.py +907 -0
  39. alita_sdk/community/inventory/parsers/go_parser.py +851 -0
  40. alita_sdk/community/inventory/parsers/html_parser.py +389 -0
  41. alita_sdk/community/inventory/parsers/java_parser.py +593 -0
  42. alita_sdk/community/inventory/parsers/javascript_parser.py +629 -0
  43. alita_sdk/community/inventory/parsers/kotlin_parser.py +768 -0
  44. alita_sdk/community/inventory/parsers/markdown_parser.py +362 -0
  45. alita_sdk/community/inventory/parsers/python_parser.py +604 -0
  46. alita_sdk/community/inventory/parsers/rust_parser.py +858 -0
  47. alita_sdk/community/inventory/parsers/swift_parser.py +832 -0
  48. alita_sdk/community/inventory/parsers/text_parser.py +322 -0
  49. alita_sdk/community/inventory/parsers/yaml_parser.py +370 -0
  50. alita_sdk/community/inventory/patterns/__init__.py +61 -0
  51. alita_sdk/community/inventory/patterns/ast_adapter.py +380 -0
  52. alita_sdk/community/inventory/patterns/loader.py +348 -0
  53. alita_sdk/community/inventory/patterns/registry.py +198 -0
  54. alita_sdk/community/inventory/presets.py +535 -0
  55. alita_sdk/community/inventory/retrieval.py +1403 -0
  56. alita_sdk/community/inventory/toolkit.py +173 -0
  57. alita_sdk/community/inventory/toolkit_utils.py +176 -0
  58. alita_sdk/community/inventory/visualize.py +1370 -0
  59. alita_sdk/configurations/__init__.py +11 -0
  60. alita_sdk/configurations/ado.py +148 -2
  61. alita_sdk/configurations/azure_search.py +1 -1
  62. alita_sdk/configurations/bigquery.py +1 -1
  63. alita_sdk/configurations/bitbucket.py +94 -2
  64. alita_sdk/configurations/browser.py +18 -0
  65. alita_sdk/configurations/carrier.py +19 -0
  66. alita_sdk/configurations/confluence.py +130 -1
  67. alita_sdk/configurations/delta_lake.py +1 -1
  68. alita_sdk/configurations/figma.py +76 -5
  69. alita_sdk/configurations/github.py +65 -1
  70. alita_sdk/configurations/gitlab.py +81 -0
  71. alita_sdk/configurations/google_places.py +17 -0
  72. alita_sdk/configurations/jira.py +103 -0
  73. alita_sdk/configurations/openapi.py +323 -0
  74. alita_sdk/configurations/postman.py +1 -1
  75. alita_sdk/configurations/qtest.py +72 -3
  76. alita_sdk/configurations/report_portal.py +115 -0
  77. alita_sdk/configurations/salesforce.py +19 -0
  78. alita_sdk/configurations/service_now.py +1 -12
  79. alita_sdk/configurations/sharepoint.py +167 -0
  80. alita_sdk/configurations/sonar.py +18 -0
  81. alita_sdk/configurations/sql.py +20 -0
  82. alita_sdk/configurations/testio.py +101 -0
  83. alita_sdk/configurations/testrail.py +88 -0
  84. alita_sdk/configurations/xray.py +94 -1
  85. alita_sdk/configurations/zephyr_enterprise.py +94 -1
  86. alita_sdk/configurations/zephyr_essential.py +95 -0
  87. alita_sdk/runtime/clients/artifact.py +21 -4
  88. alita_sdk/runtime/clients/client.py +458 -67
  89. alita_sdk/runtime/clients/mcp_discovery.py +342 -0
  90. alita_sdk/runtime/clients/mcp_manager.py +262 -0
  91. alita_sdk/runtime/clients/sandbox_client.py +352 -0
  92. alita_sdk/runtime/langchain/_constants_bkup.py +1318 -0
  93. alita_sdk/runtime/langchain/assistant.py +183 -43
  94. alita_sdk/runtime/langchain/constants.py +647 -1
  95. alita_sdk/runtime/langchain/document_loaders/AlitaDocxMammothLoader.py +315 -3
  96. alita_sdk/runtime/langchain/document_loaders/AlitaExcelLoader.py +209 -31
  97. alita_sdk/runtime/langchain/document_loaders/AlitaImageLoader.py +1 -1
  98. alita_sdk/runtime/langchain/document_loaders/AlitaJSONLinesLoader.py +77 -0
  99. alita_sdk/runtime/langchain/document_loaders/AlitaJSONLoader.py +10 -3
  100. alita_sdk/runtime/langchain/document_loaders/AlitaMarkdownLoader.py +66 -0
  101. alita_sdk/runtime/langchain/document_loaders/AlitaPDFLoader.py +79 -10
  102. alita_sdk/runtime/langchain/document_loaders/AlitaPowerPointLoader.py +52 -15
  103. alita_sdk/runtime/langchain/document_loaders/AlitaPythonLoader.py +9 -0
  104. alita_sdk/runtime/langchain/document_loaders/AlitaTableLoader.py +1 -4
  105. alita_sdk/runtime/langchain/document_loaders/AlitaTextLoader.py +15 -2
  106. alita_sdk/runtime/langchain/document_loaders/ImageParser.py +30 -0
  107. alita_sdk/runtime/langchain/document_loaders/constants.py +189 -41
  108. alita_sdk/runtime/langchain/interfaces/llm_processor.py +4 -2
  109. alita_sdk/runtime/langchain/langraph_agent.py +493 -105
  110. alita_sdk/runtime/langchain/utils.py +118 -8
  111. alita_sdk/runtime/llms/preloaded.py +2 -6
  112. alita_sdk/runtime/models/mcp_models.py +61 -0
  113. alita_sdk/runtime/skills/__init__.py +91 -0
  114. alita_sdk/runtime/skills/callbacks.py +498 -0
  115. alita_sdk/runtime/skills/discovery.py +540 -0
  116. alita_sdk/runtime/skills/executor.py +610 -0
  117. alita_sdk/runtime/skills/input_builder.py +371 -0
  118. alita_sdk/runtime/skills/models.py +330 -0
  119. alita_sdk/runtime/skills/registry.py +355 -0
  120. alita_sdk/runtime/skills/skill_runner.py +330 -0
  121. alita_sdk/runtime/toolkits/__init__.py +28 -0
  122. alita_sdk/runtime/toolkits/application.py +14 -4
  123. alita_sdk/runtime/toolkits/artifact.py +25 -9
  124. alita_sdk/runtime/toolkits/datasource.py +13 -6
  125. alita_sdk/runtime/toolkits/mcp.py +782 -0
  126. alita_sdk/runtime/toolkits/planning.py +178 -0
  127. alita_sdk/runtime/toolkits/skill_router.py +238 -0
  128. alita_sdk/runtime/toolkits/subgraph.py +11 -6
  129. alita_sdk/runtime/toolkits/tools.py +314 -70
  130. alita_sdk/runtime/toolkits/vectorstore.py +11 -5
  131. alita_sdk/runtime/tools/__init__.py +24 -0
  132. alita_sdk/runtime/tools/application.py +16 -4
  133. alita_sdk/runtime/tools/artifact.py +367 -33
  134. alita_sdk/runtime/tools/data_analysis.py +183 -0
  135. alita_sdk/runtime/tools/function.py +100 -4
  136. alita_sdk/runtime/tools/graph.py +81 -0
  137. alita_sdk/runtime/tools/image_generation.py +218 -0
  138. alita_sdk/runtime/tools/llm.py +1032 -177
  139. alita_sdk/runtime/tools/loop.py +3 -1
  140. alita_sdk/runtime/tools/loop_output.py +3 -1
  141. alita_sdk/runtime/tools/mcp_inspect_tool.py +284 -0
  142. alita_sdk/runtime/tools/mcp_remote_tool.py +181 -0
  143. alita_sdk/runtime/tools/mcp_server_tool.py +3 -1
  144. alita_sdk/runtime/tools/planning/__init__.py +36 -0
  145. alita_sdk/runtime/tools/planning/models.py +246 -0
  146. alita_sdk/runtime/tools/planning/wrapper.py +607 -0
  147. alita_sdk/runtime/tools/router.py +2 -1
  148. alita_sdk/runtime/tools/sandbox.py +375 -0
  149. alita_sdk/runtime/tools/skill_router.py +776 -0
  150. alita_sdk/runtime/tools/tool.py +3 -1
  151. alita_sdk/runtime/tools/vectorstore.py +69 -65
  152. alita_sdk/runtime/tools/vectorstore_base.py +163 -90
  153. alita_sdk/runtime/utils/AlitaCallback.py +137 -21
  154. alita_sdk/runtime/utils/constants.py +5 -1
  155. alita_sdk/runtime/utils/mcp_client.py +492 -0
  156. alita_sdk/runtime/utils/mcp_oauth.py +361 -0
  157. alita_sdk/runtime/utils/mcp_sse_client.py +434 -0
  158. alita_sdk/runtime/utils/mcp_tools_discovery.py +124 -0
  159. alita_sdk/runtime/utils/streamlit.py +41 -14
  160. alita_sdk/runtime/utils/toolkit_utils.py +28 -9
  161. alita_sdk/runtime/utils/utils.py +48 -0
  162. alita_sdk/tools/__init__.py +135 -37
  163. alita_sdk/tools/ado/__init__.py +2 -2
  164. alita_sdk/tools/ado/repos/__init__.py +16 -19
  165. alita_sdk/tools/ado/repos/repos_wrapper.py +12 -20
  166. alita_sdk/tools/ado/test_plan/__init__.py +27 -8
  167. alita_sdk/tools/ado/test_plan/test_plan_wrapper.py +56 -28
  168. alita_sdk/tools/ado/wiki/__init__.py +28 -12
  169. alita_sdk/tools/ado/wiki/ado_wrapper.py +114 -40
  170. alita_sdk/tools/ado/work_item/__init__.py +28 -12
  171. alita_sdk/tools/ado/work_item/ado_wrapper.py +95 -11
  172. alita_sdk/tools/advanced_jira_mining/__init__.py +13 -8
  173. alita_sdk/tools/aws/delta_lake/__init__.py +15 -11
  174. alita_sdk/tools/aws/delta_lake/tool.py +5 -1
  175. alita_sdk/tools/azure_ai/search/__init__.py +14 -8
  176. alita_sdk/tools/base/tool.py +5 -1
  177. alita_sdk/tools/base_indexer_toolkit.py +454 -110
  178. alita_sdk/tools/bitbucket/__init__.py +28 -19
  179. alita_sdk/tools/bitbucket/api_wrapper.py +285 -27
  180. alita_sdk/tools/bitbucket/cloud_api_wrapper.py +5 -5
  181. alita_sdk/tools/browser/__init__.py +41 -16
  182. alita_sdk/tools/browser/crawler.py +3 -1
  183. alita_sdk/tools/browser/utils.py +15 -6
  184. alita_sdk/tools/carrier/__init__.py +18 -17
  185. alita_sdk/tools/carrier/backend_reports_tool.py +8 -4
  186. alita_sdk/tools/carrier/excel_reporter.py +8 -4
  187. alita_sdk/tools/chunkers/__init__.py +3 -1
  188. alita_sdk/tools/chunkers/code/codeparser.py +1 -1
  189. alita_sdk/tools/chunkers/sematic/json_chunker.py +2 -1
  190. alita_sdk/tools/chunkers/sematic/markdown_chunker.py +97 -6
  191. alita_sdk/tools/chunkers/sematic/proposal_chunker.py +1 -1
  192. alita_sdk/tools/chunkers/universal_chunker.py +270 -0
  193. alita_sdk/tools/cloud/aws/__init__.py +12 -7
  194. alita_sdk/tools/cloud/azure/__init__.py +12 -7
  195. alita_sdk/tools/cloud/gcp/__init__.py +12 -7
  196. alita_sdk/tools/cloud/k8s/__init__.py +12 -7
  197. alita_sdk/tools/code/linter/__init__.py +10 -8
  198. alita_sdk/tools/code/loaders/codesearcher.py +3 -2
  199. alita_sdk/tools/code/sonar/__init__.py +21 -13
  200. alita_sdk/tools/code_indexer_toolkit.py +199 -0
  201. alita_sdk/tools/confluence/__init__.py +22 -14
  202. alita_sdk/tools/confluence/api_wrapper.py +197 -58
  203. alita_sdk/tools/confluence/loader.py +14 -2
  204. alita_sdk/tools/custom_open_api/__init__.py +12 -5
  205. alita_sdk/tools/elastic/__init__.py +11 -8
  206. alita_sdk/tools/elitea_base.py +546 -64
  207. alita_sdk/tools/figma/__init__.py +60 -11
  208. alita_sdk/tools/figma/api_wrapper.py +1400 -167
  209. alita_sdk/tools/figma/figma_client.py +73 -0
  210. alita_sdk/tools/figma/toon_tools.py +2748 -0
  211. alita_sdk/tools/github/__init__.py +18 -17
  212. alita_sdk/tools/github/api_wrapper.py +9 -26
  213. alita_sdk/tools/github/github_client.py +81 -12
  214. alita_sdk/tools/github/schemas.py +2 -1
  215. alita_sdk/tools/github/tool.py +5 -1
  216. alita_sdk/tools/gitlab/__init__.py +19 -13
  217. alita_sdk/tools/gitlab/api_wrapper.py +256 -80
  218. alita_sdk/tools/gitlab_org/__init__.py +14 -10
  219. alita_sdk/tools/google/bigquery/__init__.py +14 -13
  220. alita_sdk/tools/google/bigquery/tool.py +5 -1
  221. alita_sdk/tools/google_places/__init__.py +21 -11
  222. alita_sdk/tools/jira/__init__.py +22 -11
  223. alita_sdk/tools/jira/api_wrapper.py +315 -168
  224. alita_sdk/tools/keycloak/__init__.py +11 -8
  225. alita_sdk/tools/localgit/__init__.py +9 -3
  226. alita_sdk/tools/localgit/local_git.py +62 -54
  227. alita_sdk/tools/localgit/tool.py +5 -1
  228. alita_sdk/tools/memory/__init__.py +38 -14
  229. alita_sdk/tools/non_code_indexer_toolkit.py +7 -2
  230. alita_sdk/tools/ocr/__init__.py +11 -8
  231. alita_sdk/tools/openapi/__init__.py +491 -106
  232. alita_sdk/tools/openapi/api_wrapper.py +1357 -0
  233. alita_sdk/tools/openapi/tool.py +20 -0
  234. alita_sdk/tools/pandas/__init__.py +20 -12
  235. alita_sdk/tools/pandas/api_wrapper.py +40 -45
  236. alita_sdk/tools/pandas/dataframe/generator/base.py +3 -1
  237. alita_sdk/tools/postman/__init__.py +11 -11
  238. alita_sdk/tools/postman/api_wrapper.py +19 -8
  239. alita_sdk/tools/postman/postman_analysis.py +8 -1
  240. alita_sdk/tools/pptx/__init__.py +11 -10
  241. alita_sdk/tools/qtest/__init__.py +22 -14
  242. alita_sdk/tools/qtest/api_wrapper.py +1784 -88
  243. alita_sdk/tools/rally/__init__.py +13 -10
  244. alita_sdk/tools/report_portal/__init__.py +23 -16
  245. alita_sdk/tools/salesforce/__init__.py +22 -16
  246. alita_sdk/tools/servicenow/__init__.py +21 -16
  247. alita_sdk/tools/servicenow/api_wrapper.py +1 -1
  248. alita_sdk/tools/sharepoint/__init__.py +17 -14
  249. alita_sdk/tools/sharepoint/api_wrapper.py +179 -39
  250. alita_sdk/tools/sharepoint/authorization_helper.py +191 -1
  251. alita_sdk/tools/sharepoint/utils.py +8 -2
  252. alita_sdk/tools/slack/__init__.py +13 -8
  253. alita_sdk/tools/sql/__init__.py +22 -19
  254. alita_sdk/tools/sql/api_wrapper.py +71 -23
  255. alita_sdk/tools/testio/__init__.py +21 -13
  256. alita_sdk/tools/testrail/__init__.py +13 -11
  257. alita_sdk/tools/testrail/api_wrapper.py +214 -46
  258. alita_sdk/tools/utils/__init__.py +28 -4
  259. alita_sdk/tools/utils/content_parser.py +241 -55
  260. alita_sdk/tools/utils/text_operations.py +254 -0
  261. alita_sdk/tools/vector_adapters/VectorStoreAdapter.py +83 -27
  262. alita_sdk/tools/xray/__init__.py +18 -14
  263. alita_sdk/tools/xray/api_wrapper.py +58 -113
  264. alita_sdk/tools/yagmail/__init__.py +9 -3
  265. alita_sdk/tools/zephyr/__init__.py +12 -7
  266. alita_sdk/tools/zephyr_enterprise/__init__.py +16 -9
  267. alita_sdk/tools/zephyr_enterprise/api_wrapper.py +30 -15
  268. alita_sdk/tools/zephyr_essential/__init__.py +16 -10
  269. alita_sdk/tools/zephyr_essential/api_wrapper.py +297 -54
  270. alita_sdk/tools/zephyr_essential/client.py +6 -4
  271. alita_sdk/tools/zephyr_scale/__init__.py +13 -8
  272. alita_sdk/tools/zephyr_scale/api_wrapper.py +39 -31
  273. alita_sdk/tools/zephyr_squad/__init__.py +12 -7
  274. {alita_sdk-0.3.257.dist-info → alita_sdk-0.3.584.dist-info}/METADATA +184 -37
  275. alita_sdk-0.3.584.dist-info/RECORD +452 -0
  276. alita_sdk-0.3.584.dist-info/entry_points.txt +2 -0
  277. alita_sdk/tools/bitbucket/tools.py +0 -304
  278. alita_sdk-0.3.257.dist-info/RECORD +0 -343
  279. {alita_sdk-0.3.257.dist-info → alita_sdk-0.3.584.dist-info}/WHEEL +0 -0
  280. {alita_sdk-0.3.257.dist-info → alita_sdk-0.3.584.dist-info}/licenses/LICENSE +0 -0
  281. {alita_sdk-0.3.257.dist-info → alita_sdk-0.3.584.dist-info}/top_level.txt +0 -0
alita_sdk/tools/code_indexer_toolkit.py (new file)
@@ -0,0 +1,199 @@
+import ast
+import fnmatch
+import json
+import logging
+from typing import Optional, List, Generator
+
+from langchain_core.documents import Document
+from langchain_core.tools import ToolException
+from pydantic import Field
+
+from alita_sdk.tools.base_indexer_toolkit import BaseIndexerToolkit
+
+logger = logging.getLogger(__name__)
+
+
+class CodeIndexerToolkit(BaseIndexerToolkit):
+    def _get_indexed_data(self, index_name: str):
+        self._ensure_vectorstore_initialized()
+        if not self.vector_adapter:
+            raise ToolException("Vector adapter is not initialized. "
+                                "Check your configuration: embedding_model and vectorstore_type.")
+        return self.vector_adapter.get_code_indexed_data(self, index_name)
+
+    def key_fn(self, document: Document):
+        return document.metadata.get("filename")
+
+    def compare_fn(self, document: Document, idx_data):
+        return (document.metadata.get('commit_hash') and
+                idx_data.get('commit_hashes') and
+                document.metadata.get('commit_hash') in idx_data.get('commit_hashes')
+                )
+
+    def remove_ids_fn(self, idx_data, key: str):
+        return idx_data[key]['ids']
+
+    def _base_loader(
+            self,
+            branch: Optional[str] = None,
+            whitelist: Optional[List[str]] = None,
+            blacklist: Optional[List[str]] = None,
+            **kwargs) -> Generator[Document, None, None]:
+        """Index repository files in the vector store using code parsing."""
+        yield from self.loader(
+            branch=branch,
+            whitelist=whitelist,
+            blacklist=blacklist
+        )
+
+    def _extend_data(self, documents: Generator[Document, None, None]):
+        yield from documents
+
+    def _index_tool_params(self):
+        """Return the parameters for indexing data."""
+        return {
+            "branch": (Optional[str], Field(
+                description="Branch to index files from. Defaults to active branch if None.",
+                default=None)),
+            "whitelist": (Optional[List[str]], Field(
+                description='File extensions or paths to include. Defaults to all files if None. Example: ["*.md", "*.java"]',
+                default=None)),
+            "blacklist": (Optional[List[str]], Field(
+                description='File extensions or paths to exclude. Defaults to no exclusions if None. Example: ["*.md", "*.java"]',
+                default=None)),
+        }
+
+    def loader(self,
+               branch: Optional[str] = None,
+               whitelist: Optional[List[str]] = None,
+               blacklist: Optional[List[str]] = None,
+               chunked: bool = True) -> Generator[Document, None, None]:
+        """
+        Generates Documents from files in a branch, respecting whitelist and blacklist patterns.
+
+        Parameters:
+        - branch (Optional[str]): Branch for listing files. Defaults to the current branch if None.
+        - whitelist (Optional[List[str]]): File extensions or paths to include. Defaults to all files if None.
+        - blacklist (Optional[List[str]]): File extensions or paths to exclude. Defaults to no exclusions if None.
+        - chunked (bool): If True (default), applies the universal chunker based on file type.
+          If False, returns raw Documents without chunking.
+
+        Returns:
+        - generator: Yields Documents from files matching the whitelist but not the blacklist.
+          Each document has exactly the key 'filename' in metadata, which is used as an ID
+          for further operations (indexing, deduplication, and retrieval).
+
+        Example:
+            # Use 'feature-branch', include '.py' files, exclude 'test_' files
+            for doc in loader(branch='feature-branch', whitelist=['*.py'], blacklist=['*test_*']):
+                print(doc.page_content)
+
+        Notes:
+        - Whitelist and blacklist use Unix shell-style wildcards.
+        - Files must match the whitelist and not the blacklist to be included.
+        - Each document MUST have exactly the key 'filename' in metadata. This key is used as an ID
+          for further operations such as indexing, deduplication, and retrieval.
+        - When chunked=True:
+          - .md files → markdown chunker (header-based splitting)
+          - .py/.js/.ts/etc → code parser (TreeSitter-based)
+          - .json files → JSON chunker
+          - other files → default text chunker
+        """
+        import hashlib
+
+        _files = self.__handle_get_files("", self.__get_branch(branch))
+        self._log_tool_event(message="Listing files in branch", tool_name="loader")
+        logger.info(f"Files in branch: {_files}")
+
+        def is_whitelisted(file_path: str) -> bool:
+            if whitelist:
+                return (any(fnmatch.fnmatch(file_path, pattern) for pattern in whitelist)
+                        or any(file_path.endswith(f'.{pattern}') for pattern in whitelist))
+            return True
+
+        def is_blacklisted(file_path: str) -> bool:
+            if blacklist:
+                return (any(fnmatch.fnmatch(file_path, pattern) for pattern in blacklist)
+                        or any(file_path.endswith(f'.{pattern}') for pattern in blacklist))
+            return False
+
+        def raw_document_generator() -> Generator[Document, None, None]:
+            """Yields raw Documents without chunking."""
+            self._log_tool_event(message="Reading the files", tool_name="loader")
+            total_files = len(_files)
+            processed = 0
+
+            for idx, file in enumerate(_files, 1):
+                if is_whitelisted(file) and not is_blacklisted(file):
+                    try:
+                        file_content = self._read_file(file, self.__get_branch(branch))
+                    except Exception as e:
+                        logger.error(f"Failed to read file {file}: {e}")
+                        continue
+
+                    if not file_content:
+                        continue
+
+                    # Ensure file content is a string
+                    if isinstance(file_content, bytes):
+                        file_content = file_content.decode("utf-8", errors="ignore")
+                    elif isinstance(file_content, dict) and file.endswith('.json'):
+                        file_content = json.dumps(file_content)
+                    elif not isinstance(file_content, str):
+                        file_content = str(file_content)
+
+                    # Hash the file content for uniqueness tracking
+                    file_hash = hashlib.sha256(file_content.encode("utf-8")).hexdigest()
+                    processed += 1
+
+                    yield Document(
+                        page_content=file_content,
+                        metadata={
+                            'file_path': file,
+                            'filename': file,
+                            'source': file,
+                            'commit_hash': file_hash,
+                        }
+                    )
+
+                if idx % 10 == 0 or idx == total_files:
+                    self._log_tool_event(
+                        message=f"{idx} out of {total_files} files checked, {processed} matched",
+                        tool_name="loader"
+                    )
+
+            self._log_tool_event(message=f"{processed} files loaded", tool_name="loader")
+
+        if not chunked:
+            # Return raw documents without chunking
+            return raw_document_generator()
+
+        # Apply universal chunker based on file type
+        from .chunkers.universal_chunker import universal_chunker
+        return universal_chunker(raw_document_generator())
+
+    def __handle_get_files(self, path: str, branch: str):
+        """
+        Handles the retrieval of files from a specific path and branch.
+        This method should be implemented in subclasses to provide the actual file retrieval logic.
+        """
+        _files = self._get_files(path=path, branch=branch)
+        if isinstance(_files, str):
+            try:
+                # Attempt to convert the string to a list using ast.literal_eval
+                _files = ast.literal_eval(_files)
+                # Ensure that the result is actually a list of strings
+                if not isinstance(_files, list) or not all(isinstance(item, str) for item in _files):
+                    raise ValueError("The evaluated result is not a list of strings")
+            except (SyntaxError, ValueError):
+                # Handle the case where the string cannot be converted to a list
+                raise ValueError("Expected a list of strings, but got a string that cannot be converted")
+
+        # Ensure _files is a list of strings
+        if not isinstance(_files, list) or not all(isinstance(item, str) for item in _files):
+            raise ValueError("Expected a list of strings")
+        return _files
+
+    def __get_branch(self, branch):
+        return (branch or getattr(self, 'active_branch', None)
+                or getattr(self, '_active_branch', None) or getattr(self, 'branch', None))
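The whitelist/blacklist semantics above accept a pattern either as a Unix shell-style wildcard (matched with `fnmatch`) or as a bare extension suffix. A minimal standalone sketch of that double test (the `matches` helper is ours, for illustration only):

```python
import fnmatch

def matches(file_path: str, patterns: list[str]) -> bool:
    # A pattern hits either as a glob ("*.py", "docs/*") or as a
    # bare extension ("py" selects any path ending in ".py").
    return (any(fnmatch.fnmatch(file_path, p) for p in patterns)
            or any(file_path.endswith(f'.{p}') for p in patterns))

assert matches("src/app.py", ["*.py"])      # glob form
assert matches("src/app.py", ["py"])        # bare-extension form
assert not matches("src/app.py", ["*.md"])  # no match -> excluded
```

A file is yielded only when it matches the whitelist (or the whitelist is empty) and does not match the blacklist.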
alita_sdk/tools/confluence/__init__.py
@@ -4,17 +4,19 @@ from .api_wrapper import ConfluenceAPIWrapper
 from langchain_core.tools import BaseTool
 from ..base.tool import BaseAction
 from pydantic import create_model, BaseModel, ConfigDict, Field
-from ..utils import clean_string, TOOLKIT_SPLITTER, get_max_toolkit_length, parse_list, check_connection_response
+
+from ..elitea_base import filter_missconfigured_index_tools
+from ..utils import clean_string, get_max_toolkit_length, parse_list, check_connection_response
 from ...configurations.confluence import ConfluenceConfiguration
 from ...configurations.pgvector import PgVectorConfiguration
 import requests
+from ...runtime.utils.constants import TOOLKIT_NAME_META, TOOL_NAME_META, TOOLKIT_TYPE_META
 
 name = "confluence"
 
-def get_tools(tool):
+def get_toolkit(tool):
     return ConfluenceToolkit().get_toolkit(
         selected_tools=tool['settings'].get('selected_tools', []),
-        base_url=tool['settings']['base_url'],
         space=tool['settings'].get('space', None),
         cloud=tool['settings'].get('cloud', True),
         confluence_configuration=tool['settings']['confluence_configuration'],
@@ -32,18 +34,19 @@ def get_tools(tool):
         doctype='doc',
         embedding_model=tool['settings'].get('embedding_model'),
         vectorstore_type="PGVector"
-    ).get_tools()
+    )
+
+def get_tools(tool):
+    return get_toolkit(tool).get_tools()
 
 
 class ConfluenceToolkit(BaseToolkit):
     tools: List[BaseTool] = []
-    toolkit_max_length: int = 0
 
     @staticmethod
     def toolkit_config_schema() -> BaseModel:
         selected_tools = {x['name']: x['args_schema'].schema() for x in
                           ConfluenceAPIWrapper.model_construct().get_available_tools()}
-        ConfluenceToolkit.toolkit_max_length = get_max_toolkit_length(selected_tools)
 
         @check_connection_response
         def check_connection(self):
@@ -66,8 +69,7 @@ class ConfluenceToolkit(BaseToolkit):
 
         model = create_model(
             name,
-            space=(str, Field(description="Space", json_schema_extra={'toolkit_name': True,
-                                                                      'max_toolkit_length': ConfluenceToolkit.toolkit_max_length})),
+            space=(str, Field(description="Space")),
             cloud=(bool, Field(description="Hosting Option", json_schema_extra={'configuration': True})),
             limit=(int, Field(description="Pages limit per request", default=5)),
             labels=(Optional[str], Field(
@@ -80,8 +82,8 @@ class ConfluenceToolkit(BaseToolkit):
             min_retry_seconds=(int, Field(description="Min retry, sec", default=10)),
             max_retry_seconds=(int, Field(description="Max retry, sec", default=60)),
             # optional field for custom headers as dictionary
-            custom_headers=(Optional[dict], Field(description="Custom headers for API requests", default=None)),
-            confluence_configuration=(Optional[ConfluenceConfiguration], Field(description="Confluence Configuration", json_schema_extra={'configuration_types': ['confluence']})),
+            custom_headers=(Optional[dict], Field(description="Custom headers for API requests", default={})),
+            confluence_configuration=(ConfluenceConfiguration, Field(description="Confluence Configuration", json_schema_extra={'configuration_types': ['confluence']})),
             pgvector_configuration=(Optional[PgVectorConfiguration], Field(default = None,
                                                                            description="PgVector Configuration",
                                                                            json_schema_extra={'configuration_types': ['pgvector']})),
@@ -103,6 +105,7 @@ class ConfluenceToolkit(BaseToolkit):
         return model
 
     @classmethod
+    @filter_missconfigured_index_tools
     def get_toolkit(cls, selected_tools: list[str] | None = None, toolkit_name: Optional[str] = None, **kwargs):
         if selected_tools is None:
             selected_tools = []
@@ -113,18 +116,23 @@ class ConfluenceToolkit(BaseToolkit):
             **(kwargs.get('pgvector_configuration') or {}),
         }
         confluence_api_wrapper = ConfluenceAPIWrapper(**wrapper_payload)
-        prefix = clean_string(toolkit_name, ConfluenceToolkit.toolkit_max_length) + TOOLKIT_SPLITTER if toolkit_name else ''
         available_tools = confluence_api_wrapper.get_available_tools()
         tools = []
         for tool in available_tools:
             if selected_tools:
                 if tool["name"] not in selected_tools:
                     continue
+            description = tool["description"]
+            if toolkit_name:
+                description = f"Toolkit: {toolkit_name}\n{description}"
+            description = f"Confluence space: {confluence_api_wrapper.space}\n{description}"
+            description = description[:1000]
             tools.append(BaseAction(
                 api_wrapper=confluence_api_wrapper,
-                name=prefix + tool["name"],
-                description=f"Confluence space: {confluence_api_wrapper.space}" + tool["description"],
-                args_schema=tool["args_schema"]
+                name=tool["name"],
+                description=description,
+                args_schema=tool["args_schema"],
+                metadata={TOOLKIT_NAME_META: toolkit_name, TOOLKIT_TYPE_META: name, TOOL_NAME_META: tool["name"]} if toolkit_name else {TOOL_NAME_META: tool["name"]}
             ))
         return cls(tools=tools)
 
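The entry-point split above lets callers obtain the toolkit object without materializing its tools. A sketch of how the two functions relate, assuming a settings payload shaped like the one read in `get_toolkit` (the values below are placeholders):

```python
from alita_sdk.tools.confluence import get_toolkit, get_tools

tool = {
    "settings": {
        "selected_tools": ["create_page", "read_page_by_id"],
        "space": "DOCS",                 # placeholder space key
        "cloud": True,
        "confluence_configuration": {},  # ConfluenceConfiguration payload
    }
}

toolkit = get_toolkit(tool)  # build the ConfluenceToolkit once
tools = get_tools(tool)      # same as get_toolkit(tool).get_tools()
```

The removed flat `base_url` setting suggests the URL is now carried by the `confluence_configuration` object instead.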
alita_sdk/tools/confluence/api_wrapper.py
@@ -1,24 +1,27 @@
-import re
-import logging
-import requests
-import json
 import base64
+import json
+import logging
+import re
 import traceback
-from typing import Optional, List, Any, Dict, Callable, Generator, Literal
 from json import JSONDecodeError
+from typing import Optional, List, Any, Dict, Callable, Generator, Literal
 
-from pydantic import Field, PrivateAttr, model_validator, create_model, SecretStr
-from tenacity import retry, stop_after_attempt, wait_exponential, before_sleep_log
-
+import requests
+from atlassian.errors import ApiError
+from langchain_community.document_loaders.confluence import ContentFormat
 from langchain_core.documents import Document
-from langchain_core.tools import ToolException
 from langchain_core.messages import HumanMessage
+from langchain_core.tools import ToolException
 from markdownify import markdownify
-from langchain_community.document_loaders.confluence import ContentFormat
+from pydantic import Field, PrivateAttr, model_validator, create_model, SecretStr
+from requests import HTTPError
+from tenacity import retry, stop_after_attempt, wait_exponential, before_sleep_log
 
-from ..elitea_base import BaseVectorStoreToolApiWrapper, extend_with_vector_tools
+from alita_sdk.tools.non_code_indexer_toolkit import NonCodeIndexerToolkit
+from alita_sdk.tools.utils.available_tools_decorator import extend_with_parent_available_tools
 from ..llm.img_utils import ImageDescriptionCache
 from ..utils import is_cookie_token, parse_cookie_string
+from ...runtime.utils.utils import IndexerKeywords
 
 logger = logging.getLogger(__name__)
 
@@ -171,7 +174,7 @@ def parse_payload_params(params: Optional[str]) -> Dict[str, Any]:
     return {}
 
 
-class ConfluenceAPIWrapper(BaseVectorStoreToolApiWrapper):
+class ConfluenceAPIWrapper(NonCodeIndexerToolkit):
     # Changed from PrivateAttr to Optional field with exclude=True
     client: Optional[Any] = Field(default=None, exclude=True)
     base_url: str
@@ -193,15 +196,7 @@
     keep_markdown_format: Optional[bool] = True
     ocr_languages: Optional[str] = None
     keep_newlines: Optional[bool] = True
-    llm: Any = None
-    # indexer related
-    connection_string: Optional[SecretStr] = None
-    collection_name: Optional[str] = None
-    doctype: Optional[str] = 'doc'
-    embedding_model: Optional[str] = "HuggingFaceEmbeddings"
-    embedding_model_params: Optional[Dict[str, Any]] = {"model_name": "sentence-transformers/all-MiniLM-L6-v2"}
-    vectorstore_type: Optional[str] = "PGVector"
-
+    _errors: Optional[list[str]] = None
     _image_cache: ImageDescriptionCache = PrivateAttr(default_factory=ImageDescriptionCache)
 
     @model_validator(mode='before')
@@ -232,13 +227,13 @@
         else:
             client_instance = Confluence(url=url, username=username, password=api_key, cloud=cloud)
 
-        custom_headers = values.get('custom_headers', {})
-        logger.info(f"Jira tool: custom headers length: {len(custom_headers)}")
+        custom_headers = values.get('custom_headers') or {}
+        logger.info(f"Confluence tool: custom headers length: {len(custom_headers)}")
         for header, value in custom_headers.items():
             client_instance._update_header(header, value)
 
         values['client'] = client_instance
-        return values
+        return super().validate_toolkit(values)
 
     def __unquote_confluence_space(self) -> str | None:
         if self.space:
@@ -485,28 +480,78 @@
         """Gets pages with specific label in the Confluence space."""
 
         start = 0
-        pages_info = []
-        for _ in range((self.max_pages + self.limit - 1) // self.limit):
-            pages = self.client.get_all_pages_by_label(label, start=start,
-                                                       limit=self.limit)  # , expand="body.view.value"
+        pages_info: List[Dict[str, Any]] = []
+        seen_ids: set[str] = set()
+
+        # Use a while-loop driven by unique pages collected and
+        # presence of additional results instead of a fixed number
+        # of iterations based purely on max_pages/limit.
+        while len(pages_info) < (self.max_pages or 0):
+            pages = self.client.get_all_pages_by_label(
+                label,
+                start=start,
+                limit=self.limit,
+            )  # , expand="body.view.value"
             if not pages:
                 break
 
-            pages_info += [{
-                'page_id': page.metadata['id'],
-                'page_title': page.metadata['title'],
-                'page_url': page.metadata['source'],
-                'content': page.page_content
-            } for page in self.get_pages_by_id([page["id"] for page in pages])]
+            # Collect only ids we haven't processed yet to avoid
+            # calling get_page_by_id multiple times for the same
+            # Confluence page.
+            new_ids: List[str] = []
+            for p in pages:
+                page_id = p["id"] if isinstance(p, dict) else getattr(p, "id", None)
+                if page_id is None:
+                    continue
+                if page_id in seen_ids:
+                    continue
+                seen_ids.add(page_id)
+                new_ids.append(page_id)
+
+            if new_ids:
+                for page in self.get_pages_by_id(new_ids):
+                    meta = getattr(page, "metadata", {}) or {}
+                    page_id = meta.get("id")
+                    page_title = meta.get("title")
+                    page_url = meta.get("source")
+                    content = getattr(page, "page_content", None)
+
+                    if page_id is None:
+                        continue
+
+                    pages_info.append(
+                        {
+                            "page_id": page_id,
+                            "page_title": page_title,
+                            "page_url": page_url,
+                            "content": content,
+                        }
+                    )
+
+                    # Respect max_pages on unique pages collected.
+                    if len(pages_info) >= (self.max_pages or 0):
+                        break
+
+            # Advance the offset by the requested page size.
             start += self.limit
-        return pages_info
+
+            # Defensive break: if the API returns fewer items than
+            # requested, there are likely no more pages to fetch.
+            if len(pages) < self.limit:
+                break
+
+        # Slice as an extra safety net in case of any race conditions
+        # around the max_pages guard in the loop above.
+        return pages_info[: (self.max_pages or len(pages_info))]
 
     def is_public_page(self, page: dict) -> bool:
         """Check if a page is publicly accessible."""
         restrictions = self.client.get_all_restrictions_for_content(page["id"])
 
         return (
-            page["status"] == "current"
+            (page["status"] == "current"
+             # allow user to see archived content if needed
+             or page["status"] == "archived")
             and not restrictions["read"]["restrictions"]["user"]["results"]
             and not restrictions["read"]["restrictions"]["group"]["results"]
         )
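Stripped of the Confluence specifics, the new pagination is: advance an offset, skip ids already seen, and stop on an empty batch, a short batch, or once enough unique items are collected. A minimal generic sketch of the same pattern (`fetch_page` stands in for the Confluence client call):

```python
def paginate_unique(fetch_page, limit: int, max_items: int) -> list:
    """Offset pagination with dedup: stops on max_items, an empty
    batch, or a batch shorter than the requested limit."""
    items, seen, start = [], set(), 0
    while len(items) < max_items:
        batch = fetch_page(start=start, limit=limit)
        if not batch:
            break
        for item_id in batch:
            if item_id not in seen:
                seen.add(item_id)
                items.append(item_id)
                if len(items) >= max_items:
                    break
        start += limit
        if len(batch) < limit:  # short batch: no more results
            break
    return items[:max_items]

data = [f"id{i}" for i in range(25)]
fetch = lambda start, limit: data[start:start + limit]
assert paginate_unique(fetch, limit=10, max_items=12) == data[:12]
```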
@@ -526,18 +571,35 @@
             ),
             before_sleep=before_sleep_log(logger, logging.WARNING),
         )(self.client.get_page_by_id)
-        page = get_page(
-            page_id=page_id, expand=f"{self.content_format.value},version"
-        )
-        if not self.include_restricted_content and not self.is_public_page(page):
-            continue
+        try:
+            page = get_page(
+                page_id=page_id, expand=f"{self.content_format.value},version"
+            )
+        except (ApiError, HTTPError) as e:
+            logger.error(f"Error fetching page with ID {page_id}: {e}")
+            page_content_temp = f"Confluence API Error: cannot fetch the page with ID {page_id}: {e}"
+            # store errors
+            if self._errors is None:
+                self._errors = []
+            self._errors.append(page_content_temp)
+            return Document(page_content=page_content_temp,
+                            metadata={})
+        # TODO: update on toolkit advanced settings level as a separate feature
+        # if not self.include_restricted_content and not self.is_public_page(page):
+        #     continue
         yield self.process_page(page, skip_images)
 
+    def _log_errors(self):
+        """ Log errors encountered during toolkit execution. """
+        if self._errors:
+            logger.info(f"Errors encountered during toolkit execution: {self._errors}")
+
     def read_page_by_id(self, page_id: str, skip_images: bool = False):
         """Reads a page by its id in the Confluence space. If id is not available, but there is a title - use get_page_id first."""
         result = list(self.get_pages_by_id([page_id], skip_images))
         if not result:
-            "Page not found"
+            return f"Pages not found. Errors: {self._errors}" if self._errors \
+                else "Pages not found or you do not have access to them."
         return result[0].page_content
         # return self._strip_base64_images(result[0].page_content) if skip_images else result[0].page_content
 
@@ -547,7 +609,9 @@
         :param title: title
         :param type: type of content: page or blogpost. Defaults to page
         """
-        return self.client.get_page_id(space=self.space, title=title, type=type)
+
+        result = self.client.get_page_id(space=self.space, title=title, type=type)
+        return result if result else "Page not found. Check the title or space."
 
     def _strip_base64_images(self, content):
         base64_md_pattern = r'data:image/(png|jpeg|gif);base64,[a-zA-Z0-9+/=]+'
@@ -570,7 +634,7 @@
             }
             pages_info.append(page_info)
             start += self.limit
-        return str(pages_info)
+        return str(pages_info) if pages_info else f"Unable to find anything using query {cql}. Check space or query."
 
     def search_pages(self, query: str, skip_images: bool = False):
         """Search pages in Confluence by query text in title or page content."""
@@ -821,6 +885,10 @@
         from .loader import AlitaConfluenceLoader
         from copy import copy
         content_format = kwargs.get('content_format', 'view').lower()
+
+        self._index_include_attachments = kwargs.get('include_attachments', False)
+        self._include_extensions = kwargs.get('include_extensions', [])
+        self._skip_extensions = kwargs.get('skip_extensions', [])
         base_params = {
             'url': self.base_url,
             'space_key': self.space,
@@ -852,12 +920,81 @@
             yield document
 
     def _process_document(self, document: Document) -> Generator[Document, None, None]:
-        attachments = self.get_page_attachments(document.metadata.get('id'))
-        if isinstance(attachments, str):
-            logger.info(f" {document.metadata.get('id')}: {attachments}")
-            return
-        for attachment in attachments:
-            yield Document(page_content=attachment.get('content', '') or attachment.get('llm_analysis', ''), metadata=attachment.get('metadata', {}))
+        try:
+            if self._index_include_attachments:
+                page_id = document.metadata.get('id')
+                attachments = self.client.get_attachments_from_content(page_id)
+                if not attachments or not attachments.get('results'):
+                    return f"No attachments found for page ID {page_id}."
+
+                # Get attachment history for created/updated info
+                history_map = {}
+                for attachment in attachments['results']:
+                    try:
+                        hist = self.client.history(attachment['id'])
+                        history_map[attachment['id']] = hist
+                    except Exception as e:
+                        logger.warning(f"Failed to fetch history for attachment {attachment.get('title', '')}: {str(e)}")
+                        history_map[attachment['id']] = None
+
+                import re
+                for attachment in attachments['results']:
+                    title = attachment.get('title', '')
+                    file_ext = title.lower().split('.')[-1] if '.' in title else ''
+
+                    # Re-verify extension filters
+                    # Check if file should be skipped based on skip_extensions
+                    if any(re.match(re.escape(pattern).replace(r'\*', '.*') + '$', title, re.IGNORECASE)
+                           for pattern in self._skip_extensions):
+                        continue
+
+                    # Check if file should be included based on include_extensions
+                    # If include_extensions is empty, process all files (that weren't skipped)
+                    if self._include_extensions and not (
+                            any(re.match(re.escape(pattern).replace(r'\*', '.*') + '$', title, re.IGNORECASE)
+                                for pattern in self._include_extensions)):
+                        continue
+
+                    media_type = attachment.get('metadata', {}).get('mediaType', '')
+                    # Core metadata extraction with history
+                    hist = history_map.get(attachment['id']) or {}
+                    created_by = hist.get('createdBy', {}).get('displayName', '') if hist else attachment.get('creator', {}).get('displayName', '')
+                    created_date = hist.get('createdDate', '') if hist else attachment.get('created', '')
+                    last_updated = hist.get('lastUpdated', {}).get('when', '') if hist else ''
+
+                    attachment_path = attachment['_links']['download'] if attachment.get(
+                        '_links', {}).get('download') else ''
+                    download_url = self.client.url.rstrip('/') + attachment_path
+                    metadata = {
+                        'name': title,
+                        'size': attachment.get('extensions', {}).get('fileSize', None),
+                        'creator': created_by,
+                        'created': created_date,
+                        'updated': last_updated,
+                        'media_type': media_type,
+                        'labels': [label['name'] for label in
+                                   attachment.get('metadata', {}).get('labels', {}).get('results', [])],
+                        'download_url': download_url
+                    }
+                    try:
+                        resp = self.client.request(method="GET", path=attachment_path, advanced_mode=True)
+                        if resp.status_code == 200:
+                            content = resp.content
+                        else:
+                            content = f"[Failed to download {download_url}: HTTP status code {resp.status_code}]"
+                    except Exception as e:
+                        content = f"[Error downloading content: {str(e)}]"
+
+                    if isinstance(content, str):
+                        yield Document(page_content=content, metadata=metadata)
+                    else:
+                        yield Document(page_content="", metadata={
+                            **metadata,
+                            IndexerKeywords.CONTENT_FILE_NAME.value: f".{file_ext}",
+                            IndexerKeywords.CONTENT_IN_BYTES.value: content
+                        })
+        except Exception as e:
+            yield from ()
 
     def _download_image(self, image_url):
         """
@@ -1598,18 +1735,24 @@
             "include_restricted_content": (Optional[bool], Field(description="Include restricted content.", default=False)),
             "include_archived_content": (Optional[bool], Field(description="Include archived content.", default=False)),
             "include_attachments": (Optional[bool], Field(description="Include attachments.", default=False)),
+            'include_extensions': (Optional[List[str]], Field(
+                description="List of file extensions to include when processing attachments: i.e. ['*.png', '*.jpg']. "
+                            "If empty, all files will be processed (except skip_extensions).",
+                default=[])),
+            'skip_extensions': (Optional[List[str]], Field(
+                description="List of file extensions to skip when processing attachments: i.e. ['*.png', '*.jpg']",
+                default=[])),
             "include_comments": (Optional[bool], Field(description="Include comments.", default=False)),
-            "include_labels": (Optional[bool], Field(description="Include labels.", default=True)),
+            "include_labels": (Optional[bool], Field(description="Include labels.", default=False)),
             "ocr_languages": (Optional[str], Field(description="OCR languages for processing attachments.", default='eng')),
             "keep_markdown_format": (Optional[bool], Field(description="Keep the markdown format.", default=True)),
             "keep_newlines": (Optional[bool], Field(description="Keep newlines in the content.", default=True)),
             "bins_with_llm": (Optional[bool], Field(description="Use LLM for processing binary files.", default=False)),
         }
 
-    @extend_with_vector_tools
+    @extend_with_parent_available_tools
     def get_available_tools(self):
-        # Confluence-specific tools
-        confluence_tools = [
+        return [
             {
                 "name": "create_page",
                 "ref": self.create_page,
@@ -1726,7 +1869,3 @@
             }
         ]
 
-        # Add standardized vector search tools from base class
-        vector_search_tools = self._get_vector_search_tools()
-
-        return confluence_tools + vector_search_tools
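The removed tail shows what `@extend_with_parent_available_tools` now does implicitly: merge a subclass's tool list with the tools the parent indexer exposes. The decorator's implementation is not part of this diff; the following is only a plausible sketch of the pattern, with everything beyond the decorator's name assumed:

```python
from functools import wraps

def extend_with_parent_available_tools(func):
    """Sketch: append the parent class's available tools to the list
    returned by the decorated get_available_tools()."""
    @wraps(func)
    def wrapper(self):
        own_tools = func(self)
        # Assumes a single-inheritance chain where the immediate base
        # class defines a plain (undecorated) get_available_tools().
        parent_cls = type(self).__mro__[1]
        return own_tools + parent_cls.get_available_tools(self)
    return wrapper
```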