alita-sdk 0.3.257__py3-none-any.whl → 0.3.584__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of alita-sdk might be problematic. Click here for more details.

Files changed (281)
  1. alita_sdk/cli/__init__.py +10 -0
  2. alita_sdk/cli/__main__.py +17 -0
  3. alita_sdk/cli/agent/__init__.py +5 -0
  4. alita_sdk/cli/agent/default.py +258 -0
  5. alita_sdk/cli/agent_executor.py +155 -0
  6. alita_sdk/cli/agent_loader.py +215 -0
  7. alita_sdk/cli/agent_ui.py +228 -0
  8. alita_sdk/cli/agents.py +3794 -0
  9. alita_sdk/cli/callbacks.py +647 -0
  10. alita_sdk/cli/cli.py +168 -0
  11. alita_sdk/cli/config.py +306 -0
  12. alita_sdk/cli/context/__init__.py +30 -0
  13. alita_sdk/cli/context/cleanup.py +198 -0
  14. alita_sdk/cli/context/manager.py +731 -0
  15. alita_sdk/cli/context/message.py +285 -0
  16. alita_sdk/cli/context/strategies.py +289 -0
  17. alita_sdk/cli/context/token_estimation.py +127 -0
  18. alita_sdk/cli/formatting.py +182 -0
  19. alita_sdk/cli/input_handler.py +419 -0
  20. alita_sdk/cli/inventory.py +1073 -0
  21. alita_sdk/cli/mcp_loader.py +315 -0
  22. alita_sdk/cli/toolkit.py +327 -0
  23. alita_sdk/cli/toolkit_loader.py +85 -0
  24. alita_sdk/cli/tools/__init__.py +43 -0
  25. alita_sdk/cli/tools/approval.py +224 -0
  26. alita_sdk/cli/tools/filesystem.py +1751 -0
  27. alita_sdk/cli/tools/planning.py +389 -0
  28. alita_sdk/cli/tools/terminal.py +414 -0
  29. alita_sdk/community/__init__.py +72 -12
  30. alita_sdk/community/inventory/__init__.py +236 -0
  31. alita_sdk/community/inventory/config.py +257 -0
  32. alita_sdk/community/inventory/enrichment.py +2137 -0
  33. alita_sdk/community/inventory/extractors.py +1469 -0
  34. alita_sdk/community/inventory/ingestion.py +3172 -0
  35. alita_sdk/community/inventory/knowledge_graph.py +1457 -0
  36. alita_sdk/community/inventory/parsers/__init__.py +218 -0
  37. alita_sdk/community/inventory/parsers/base.py +295 -0
  38. alita_sdk/community/inventory/parsers/csharp_parser.py +907 -0
  39. alita_sdk/community/inventory/parsers/go_parser.py +851 -0
  40. alita_sdk/community/inventory/parsers/html_parser.py +389 -0
  41. alita_sdk/community/inventory/parsers/java_parser.py +593 -0
  42. alita_sdk/community/inventory/parsers/javascript_parser.py +629 -0
  43. alita_sdk/community/inventory/parsers/kotlin_parser.py +768 -0
  44. alita_sdk/community/inventory/parsers/markdown_parser.py +362 -0
  45. alita_sdk/community/inventory/parsers/python_parser.py +604 -0
  46. alita_sdk/community/inventory/parsers/rust_parser.py +858 -0
  47. alita_sdk/community/inventory/parsers/swift_parser.py +832 -0
  48. alita_sdk/community/inventory/parsers/text_parser.py +322 -0
  49. alita_sdk/community/inventory/parsers/yaml_parser.py +370 -0
  50. alita_sdk/community/inventory/patterns/__init__.py +61 -0
  51. alita_sdk/community/inventory/patterns/ast_adapter.py +380 -0
  52. alita_sdk/community/inventory/patterns/loader.py +348 -0
  53. alita_sdk/community/inventory/patterns/registry.py +198 -0
  54. alita_sdk/community/inventory/presets.py +535 -0
  55. alita_sdk/community/inventory/retrieval.py +1403 -0
  56. alita_sdk/community/inventory/toolkit.py +173 -0
  57. alita_sdk/community/inventory/toolkit_utils.py +176 -0
  58. alita_sdk/community/inventory/visualize.py +1370 -0
  59. alita_sdk/configurations/__init__.py +11 -0
  60. alita_sdk/configurations/ado.py +148 -2
  61. alita_sdk/configurations/azure_search.py +1 -1
  62. alita_sdk/configurations/bigquery.py +1 -1
  63. alita_sdk/configurations/bitbucket.py +94 -2
  64. alita_sdk/configurations/browser.py +18 -0
  65. alita_sdk/configurations/carrier.py +19 -0
  66. alita_sdk/configurations/confluence.py +130 -1
  67. alita_sdk/configurations/delta_lake.py +1 -1
  68. alita_sdk/configurations/figma.py +76 -5
  69. alita_sdk/configurations/github.py +65 -1
  70. alita_sdk/configurations/gitlab.py +81 -0
  71. alita_sdk/configurations/google_places.py +17 -0
  72. alita_sdk/configurations/jira.py +103 -0
  73. alita_sdk/configurations/openapi.py +323 -0
  74. alita_sdk/configurations/postman.py +1 -1
  75. alita_sdk/configurations/qtest.py +72 -3
  76. alita_sdk/configurations/report_portal.py +115 -0
  77. alita_sdk/configurations/salesforce.py +19 -0
  78. alita_sdk/configurations/service_now.py +1 -12
  79. alita_sdk/configurations/sharepoint.py +167 -0
  80. alita_sdk/configurations/sonar.py +18 -0
  81. alita_sdk/configurations/sql.py +20 -0
  82. alita_sdk/configurations/testio.py +101 -0
  83. alita_sdk/configurations/testrail.py +88 -0
  84. alita_sdk/configurations/xray.py +94 -1
  85. alita_sdk/configurations/zephyr_enterprise.py +94 -1
  86. alita_sdk/configurations/zephyr_essential.py +95 -0
  87. alita_sdk/runtime/clients/artifact.py +21 -4
  88. alita_sdk/runtime/clients/client.py +458 -67
  89. alita_sdk/runtime/clients/mcp_discovery.py +342 -0
  90. alita_sdk/runtime/clients/mcp_manager.py +262 -0
  91. alita_sdk/runtime/clients/sandbox_client.py +352 -0
  92. alita_sdk/runtime/langchain/_constants_bkup.py +1318 -0
  93. alita_sdk/runtime/langchain/assistant.py +183 -43
  94. alita_sdk/runtime/langchain/constants.py +647 -1
  95. alita_sdk/runtime/langchain/document_loaders/AlitaDocxMammothLoader.py +315 -3
  96. alita_sdk/runtime/langchain/document_loaders/AlitaExcelLoader.py +209 -31
  97. alita_sdk/runtime/langchain/document_loaders/AlitaImageLoader.py +1 -1
  98. alita_sdk/runtime/langchain/document_loaders/AlitaJSONLinesLoader.py +77 -0
  99. alita_sdk/runtime/langchain/document_loaders/AlitaJSONLoader.py +10 -3
  100. alita_sdk/runtime/langchain/document_loaders/AlitaMarkdownLoader.py +66 -0
  101. alita_sdk/runtime/langchain/document_loaders/AlitaPDFLoader.py +79 -10
  102. alita_sdk/runtime/langchain/document_loaders/AlitaPowerPointLoader.py +52 -15
  103. alita_sdk/runtime/langchain/document_loaders/AlitaPythonLoader.py +9 -0
  104. alita_sdk/runtime/langchain/document_loaders/AlitaTableLoader.py +1 -4
  105. alita_sdk/runtime/langchain/document_loaders/AlitaTextLoader.py +15 -2
  106. alita_sdk/runtime/langchain/document_loaders/ImageParser.py +30 -0
  107. alita_sdk/runtime/langchain/document_loaders/constants.py +189 -41
  108. alita_sdk/runtime/langchain/interfaces/llm_processor.py +4 -2
  109. alita_sdk/runtime/langchain/langraph_agent.py +493 -105
  110. alita_sdk/runtime/langchain/utils.py +118 -8
  111. alita_sdk/runtime/llms/preloaded.py +2 -6
  112. alita_sdk/runtime/models/mcp_models.py +61 -0
  113. alita_sdk/runtime/skills/__init__.py +91 -0
  114. alita_sdk/runtime/skills/callbacks.py +498 -0
  115. alita_sdk/runtime/skills/discovery.py +540 -0
  116. alita_sdk/runtime/skills/executor.py +610 -0
  117. alita_sdk/runtime/skills/input_builder.py +371 -0
  118. alita_sdk/runtime/skills/models.py +330 -0
  119. alita_sdk/runtime/skills/registry.py +355 -0
  120. alita_sdk/runtime/skills/skill_runner.py +330 -0
  121. alita_sdk/runtime/toolkits/__init__.py +28 -0
  122. alita_sdk/runtime/toolkits/application.py +14 -4
  123. alita_sdk/runtime/toolkits/artifact.py +25 -9
  124. alita_sdk/runtime/toolkits/datasource.py +13 -6
  125. alita_sdk/runtime/toolkits/mcp.py +782 -0
  126. alita_sdk/runtime/toolkits/planning.py +178 -0
  127. alita_sdk/runtime/toolkits/skill_router.py +238 -0
  128. alita_sdk/runtime/toolkits/subgraph.py +11 -6
  129. alita_sdk/runtime/toolkits/tools.py +314 -70
  130. alita_sdk/runtime/toolkits/vectorstore.py +11 -5
  131. alita_sdk/runtime/tools/__init__.py +24 -0
  132. alita_sdk/runtime/tools/application.py +16 -4
  133. alita_sdk/runtime/tools/artifact.py +367 -33
  134. alita_sdk/runtime/tools/data_analysis.py +183 -0
  135. alita_sdk/runtime/tools/function.py +100 -4
  136. alita_sdk/runtime/tools/graph.py +81 -0
  137. alita_sdk/runtime/tools/image_generation.py +218 -0
  138. alita_sdk/runtime/tools/llm.py +1032 -177
  139. alita_sdk/runtime/tools/loop.py +3 -1
  140. alita_sdk/runtime/tools/loop_output.py +3 -1
  141. alita_sdk/runtime/tools/mcp_inspect_tool.py +284 -0
  142. alita_sdk/runtime/tools/mcp_remote_tool.py +181 -0
  143. alita_sdk/runtime/tools/mcp_server_tool.py +3 -1
  144. alita_sdk/runtime/tools/planning/__init__.py +36 -0
  145. alita_sdk/runtime/tools/planning/models.py +246 -0
  146. alita_sdk/runtime/tools/planning/wrapper.py +607 -0
  147. alita_sdk/runtime/tools/router.py +2 -1
  148. alita_sdk/runtime/tools/sandbox.py +375 -0
  149. alita_sdk/runtime/tools/skill_router.py +776 -0
  150. alita_sdk/runtime/tools/tool.py +3 -1
  151. alita_sdk/runtime/tools/vectorstore.py +69 -65
  152. alita_sdk/runtime/tools/vectorstore_base.py +163 -90
  153. alita_sdk/runtime/utils/AlitaCallback.py +137 -21
  154. alita_sdk/runtime/utils/constants.py +5 -1
  155. alita_sdk/runtime/utils/mcp_client.py +492 -0
  156. alita_sdk/runtime/utils/mcp_oauth.py +361 -0
  157. alita_sdk/runtime/utils/mcp_sse_client.py +434 -0
  158. alita_sdk/runtime/utils/mcp_tools_discovery.py +124 -0
  159. alita_sdk/runtime/utils/streamlit.py +41 -14
  160. alita_sdk/runtime/utils/toolkit_utils.py +28 -9
  161. alita_sdk/runtime/utils/utils.py +48 -0
  162. alita_sdk/tools/__init__.py +135 -37
  163. alita_sdk/tools/ado/__init__.py +2 -2
  164. alita_sdk/tools/ado/repos/__init__.py +16 -19
  165. alita_sdk/tools/ado/repos/repos_wrapper.py +12 -20
  166. alita_sdk/tools/ado/test_plan/__init__.py +27 -8
  167. alita_sdk/tools/ado/test_plan/test_plan_wrapper.py +56 -28
  168. alita_sdk/tools/ado/wiki/__init__.py +28 -12
  169. alita_sdk/tools/ado/wiki/ado_wrapper.py +114 -40
  170. alita_sdk/tools/ado/work_item/__init__.py +28 -12
  171. alita_sdk/tools/ado/work_item/ado_wrapper.py +95 -11
  172. alita_sdk/tools/advanced_jira_mining/__init__.py +13 -8
  173. alita_sdk/tools/aws/delta_lake/__init__.py +15 -11
  174. alita_sdk/tools/aws/delta_lake/tool.py +5 -1
  175. alita_sdk/tools/azure_ai/search/__init__.py +14 -8
  176. alita_sdk/tools/base/tool.py +5 -1
  177. alita_sdk/tools/base_indexer_toolkit.py +454 -110
  178. alita_sdk/tools/bitbucket/__init__.py +28 -19
  179. alita_sdk/tools/bitbucket/api_wrapper.py +285 -27
  180. alita_sdk/tools/bitbucket/cloud_api_wrapper.py +5 -5
  181. alita_sdk/tools/browser/__init__.py +41 -16
  182. alita_sdk/tools/browser/crawler.py +3 -1
  183. alita_sdk/tools/browser/utils.py +15 -6
  184. alita_sdk/tools/carrier/__init__.py +18 -17
  185. alita_sdk/tools/carrier/backend_reports_tool.py +8 -4
  186. alita_sdk/tools/carrier/excel_reporter.py +8 -4
  187. alita_sdk/tools/chunkers/__init__.py +3 -1
  188. alita_sdk/tools/chunkers/code/codeparser.py +1 -1
  189. alita_sdk/tools/chunkers/sematic/json_chunker.py +2 -1
  190. alita_sdk/tools/chunkers/sematic/markdown_chunker.py +97 -6
  191. alita_sdk/tools/chunkers/sematic/proposal_chunker.py +1 -1
  192. alita_sdk/tools/chunkers/universal_chunker.py +270 -0
  193. alita_sdk/tools/cloud/aws/__init__.py +12 -7
  194. alita_sdk/tools/cloud/azure/__init__.py +12 -7
  195. alita_sdk/tools/cloud/gcp/__init__.py +12 -7
  196. alita_sdk/tools/cloud/k8s/__init__.py +12 -7
  197. alita_sdk/tools/code/linter/__init__.py +10 -8
  198. alita_sdk/tools/code/loaders/codesearcher.py +3 -2
  199. alita_sdk/tools/code/sonar/__init__.py +21 -13
  200. alita_sdk/tools/code_indexer_toolkit.py +199 -0
  201. alita_sdk/tools/confluence/__init__.py +22 -14
  202. alita_sdk/tools/confluence/api_wrapper.py +197 -58
  203. alita_sdk/tools/confluence/loader.py +14 -2
  204. alita_sdk/tools/custom_open_api/__init__.py +12 -5
  205. alita_sdk/tools/elastic/__init__.py +11 -8
  206. alita_sdk/tools/elitea_base.py +546 -64
  207. alita_sdk/tools/figma/__init__.py +60 -11
  208. alita_sdk/tools/figma/api_wrapper.py +1400 -167
  209. alita_sdk/tools/figma/figma_client.py +73 -0
  210. alita_sdk/tools/figma/toon_tools.py +2748 -0
  211. alita_sdk/tools/github/__init__.py +18 -17
  212. alita_sdk/tools/github/api_wrapper.py +9 -26
  213. alita_sdk/tools/github/github_client.py +81 -12
  214. alita_sdk/tools/github/schemas.py +2 -1
  215. alita_sdk/tools/github/tool.py +5 -1
  216. alita_sdk/tools/gitlab/__init__.py +19 -13
  217. alita_sdk/tools/gitlab/api_wrapper.py +256 -80
  218. alita_sdk/tools/gitlab_org/__init__.py +14 -10
  219. alita_sdk/tools/google/bigquery/__init__.py +14 -13
  220. alita_sdk/tools/google/bigquery/tool.py +5 -1
  221. alita_sdk/tools/google_places/__init__.py +21 -11
  222. alita_sdk/tools/jira/__init__.py +22 -11
  223. alita_sdk/tools/jira/api_wrapper.py +315 -168
  224. alita_sdk/tools/keycloak/__init__.py +11 -8
  225. alita_sdk/tools/localgit/__init__.py +9 -3
  226. alita_sdk/tools/localgit/local_git.py +62 -54
  227. alita_sdk/tools/localgit/tool.py +5 -1
  228. alita_sdk/tools/memory/__init__.py +38 -14
  229. alita_sdk/tools/non_code_indexer_toolkit.py +7 -2
  230. alita_sdk/tools/ocr/__init__.py +11 -8
  231. alita_sdk/tools/openapi/__init__.py +491 -106
  232. alita_sdk/tools/openapi/api_wrapper.py +1357 -0
  233. alita_sdk/tools/openapi/tool.py +20 -0
  234. alita_sdk/tools/pandas/__init__.py +20 -12
  235. alita_sdk/tools/pandas/api_wrapper.py +40 -45
  236. alita_sdk/tools/pandas/dataframe/generator/base.py +3 -1
  237. alita_sdk/tools/postman/__init__.py +11 -11
  238. alita_sdk/tools/postman/api_wrapper.py +19 -8
  239. alita_sdk/tools/postman/postman_analysis.py +8 -1
  240. alita_sdk/tools/pptx/__init__.py +11 -10
  241. alita_sdk/tools/qtest/__init__.py +22 -14
  242. alita_sdk/tools/qtest/api_wrapper.py +1784 -88
  243. alita_sdk/tools/rally/__init__.py +13 -10
  244. alita_sdk/tools/report_portal/__init__.py +23 -16
  245. alita_sdk/tools/salesforce/__init__.py +22 -16
  246. alita_sdk/tools/servicenow/__init__.py +21 -16
  247. alita_sdk/tools/servicenow/api_wrapper.py +1 -1
  248. alita_sdk/tools/sharepoint/__init__.py +17 -14
  249. alita_sdk/tools/sharepoint/api_wrapper.py +179 -39
  250. alita_sdk/tools/sharepoint/authorization_helper.py +191 -1
  251. alita_sdk/tools/sharepoint/utils.py +8 -2
  252. alita_sdk/tools/slack/__init__.py +13 -8
  253. alita_sdk/tools/sql/__init__.py +22 -19
  254. alita_sdk/tools/sql/api_wrapper.py +71 -23
  255. alita_sdk/tools/testio/__init__.py +21 -13
  256. alita_sdk/tools/testrail/__init__.py +13 -11
  257. alita_sdk/tools/testrail/api_wrapper.py +214 -46
  258. alita_sdk/tools/utils/__init__.py +28 -4
  259. alita_sdk/tools/utils/content_parser.py +241 -55
  260. alita_sdk/tools/utils/text_operations.py +254 -0
  261. alita_sdk/tools/vector_adapters/VectorStoreAdapter.py +83 -27
  262. alita_sdk/tools/xray/__init__.py +18 -14
  263. alita_sdk/tools/xray/api_wrapper.py +58 -113
  264. alita_sdk/tools/yagmail/__init__.py +9 -3
  265. alita_sdk/tools/zephyr/__init__.py +12 -7
  266. alita_sdk/tools/zephyr_enterprise/__init__.py +16 -9
  267. alita_sdk/tools/zephyr_enterprise/api_wrapper.py +30 -15
  268. alita_sdk/tools/zephyr_essential/__init__.py +16 -10
  269. alita_sdk/tools/zephyr_essential/api_wrapper.py +297 -54
  270. alita_sdk/tools/zephyr_essential/client.py +6 -4
  271. alita_sdk/tools/zephyr_scale/__init__.py +13 -8
  272. alita_sdk/tools/zephyr_scale/api_wrapper.py +39 -31
  273. alita_sdk/tools/zephyr_squad/__init__.py +12 -7
  274. {alita_sdk-0.3.257.dist-info → alita_sdk-0.3.584.dist-info}/METADATA +184 -37
  275. alita_sdk-0.3.584.dist-info/RECORD +452 -0
  276. alita_sdk-0.3.584.dist-info/entry_points.txt +2 -0
  277. alita_sdk/tools/bitbucket/tools.py +0 -304
  278. alita_sdk-0.3.257.dist-info/RECORD +0 -343
  279. {alita_sdk-0.3.257.dist-info → alita_sdk-0.3.584.dist-info}/WHEEL +0 -0
  280. {alita_sdk-0.3.257.dist-info → alita_sdk-0.3.584.dist-info}/licenses/LICENSE +0 -0
  281. {alita_sdk-0.3.257.dist-info → alita_sdk-0.3.584.dist-info}/top_level.txt +0 -0
alita_sdk/tools/utils/content_parser.py +241 -55
@@ -1,13 +1,17 @@
1
+ from copy import deepcopy
1
2
  import os
3
+ import re
2
4
  import tempfile
3
5
  from logging import getLogger
4
6
  from pathlib import Path
5
- from typing import Generator
7
+ from typing import Generator, List
6
8
 
7
9
  from langchain_core.documents import Document
8
10
  from langchain_core.tools import ToolException
9
11
 
10
- from alita_sdk.runtime.langchain.document_loaders.constants import loaders_map
12
+ from alita_sdk.runtime.langchain.document_loaders.constants import loaders_map, LoaderProperties
13
+ from ...runtime.langchain.document_loaders.AlitaTextLoader import AlitaTextLoader
14
+ from ...runtime.utils.utils import IndexerKeywords
11
15
 
12
16
  logger = getLogger(__name__)
13
17
 
@@ -51,11 +55,9 @@ Highlight any visible details that could help in understanding the image.
51
55
  Be as precise and thorough as possible in your responses. If something is unclear or illegible, state that explicitly.
52
56
  '''
53
57
 
54
- IMAGE_EXTENSIONS = ['jpg', 'jpeg', 'png', 'gif', 'bmp', 'tiff', 'webp', 'svg']
55
-
56
58
 
57
59
  def parse_file_content(file_name=None, file_content=None, is_capture_image: bool = False, page_number: int = None,
58
- sheet_name: str = None, llm=None, file_path: str = None, excel_by_sheets: bool = False) -> str | ToolException:
60
+ sheet_name: str = None, llm=None, file_path: str = None, excel_by_sheets: bool = False, prompt=None) -> str | ToolException:
59
61
  """Parse the content of a file based on its type and return the parsed content.
60
62
 
61
63
  Args:
@@ -72,18 +74,94 @@ def parse_file_content(file_name=None, file_content=None, is_capture_image: bool
72
74
  Raises:
73
75
  ToolException: If the file type is not supported or if there is an error reading the file.
74
76
  """
77
+ if not prompt:
78
+ prompt = image_processing_prompt
79
+ loader = prepare_loader(
80
+ file_name=file_name,
81
+ file_content=file_content,
82
+ is_capture_image=is_capture_image,
83
+ page_number=page_number,
84
+ sheet_name=sheet_name,
85
+ llm=llm,
86
+ file_path=file_path,
87
+ excel_by_sheets=excel_by_sheets,
88
+ prompt=prompt
89
+ )
75
90
 
76
- if (file_path and (file_name or file_content)) or (not file_path and (not file_name or file_content is None)):
77
- raise ToolException("Either (file_name and file_content) or file_path must be provided, but not both.")
78
-
79
- extension = Path(file_path if file_path else file_name).suffix
91
+ if not loader:
92
+ return ToolException(
93
+ "Not supported type of files entered. Supported types are TXT, DOCX, PDF, PPTX, XLSX and XLS only.")
80
94
 
81
- loader_object = loaders_map.get(extension)
82
- if not loader_object:
83
- logger.warning(f"No loader found for file extension: {extension}. File: {file_path if file_path else file_name}")
95
+ try:
96
+ if hasattr(loader, 'get_content'):
97
+ return loader.get_content()
98
+ else:
99
+ extension = Path(file_path if file_path else file_name).suffix
100
+ loader_kwargs = get_loader_kwargs(loaders_map.get(extension), file_name, file_content, is_capture_image, page_number, sheet_name, llm, file_path, excel_by_sheets)
101
+ if file_content:
102
+ return load_content_from_bytes(file_content=file_content,
103
+ extension=extension,
104
+ loader_extra_config=loader_kwargs,
105
+ llm=llm)
106
+ else:
107
+ return load_content(file_path=file_path,
108
+ extension=extension,
109
+ loader_extra_config=loader_kwargs,
110
+ llm=llm)
111
+ except Exception as e:
112
+ return ToolException(f"Error reading file ({file_name or file_path}) content. Make sure these types are supported: {str(e)}")
113
+
114
+ def load_file_docs(file_name=None, file_content=None, is_capture_image: bool = False, page_number: int = None,
115
+ sheet_name: str = None, llm=None, file_path: str = None, excel_by_sheets: bool = False) -> List[Document] | ToolException:
116
+ loader = prepare_loader(
117
+ file_name=file_name,
118
+ file_content=file_content,
119
+ is_capture_image=is_capture_image,
120
+ page_number=page_number,
121
+ sheet_name=sheet_name,
122
+ llm=llm,
123
+ file_path=file_path,
124
+ excel_by_sheets=excel_by_sheets
125
+ )
126
+ if not loader:
84
127
  return ToolException(
85
128
  "Not supported type of files entered. Supported types are TXT, DOCX, PDF, PPTX, XLSX and XLS only.")
86
- loader_kwargs = loader_object['kwargs']
129
+ return loader.load()
130
+
131
+ def get_loader_kwargs(loader_object, file_name=None, file_content=None, is_capture_image: bool = False, page_number: int = None,
132
+ sheet_name: str = None, llm=None, file_path: str = None, excel_by_sheets: bool = False, prompt=None):
133
+ """Build loader kwargs safely without deepcopying non-picklable objects like LLMs.
134
+
135
+ We avoid copying keys that are going to be overridden by this function anyway
136
+ (file_path, file_content, file_name, extract_images, llm, page_number,
137
+ sheet_name, excel_by_sheets, prompt, row_content, json_documents) to
138
+ prevent errors such as `cannot pickle '_thread.RLock' object` when an LLM
139
+ or client with internal locks is stored in the original kwargs.
140
+ """
141
+ if not loader_object:
142
+ raise ToolException("Loader configuration is missing.")
143
+
144
+ original_kwargs = loader_object.get("kwargs", {}) or {}
145
+
146
+ # Keys that will be overwritten below – skip them when copying
147
+ overridden_keys = {
148
+ "file_path",
149
+ "file_content",
150
+ "file_name",
151
+ "extract_images",
152
+ "llm",
153
+ "page_number",
154
+ "sheet_name",
155
+ "excel_by_sheets",
156
+ "prompt",
157
+ "row_content",
158
+ "json_documents",
159
+ }
160
+
161
+ # Build a safe shallow copy without overridden keys to avoid deepcopy
162
+ # of potentially non-picklable objects (e.g., llm with internal RLock).
163
+ loader_kwargs = {k: v for k, v in original_kwargs.items() if k not in overridden_keys}
164
+
87
165
  loader_kwargs.update({
88
166
  "file_path": file_path,
89
167
  "file_content": file_content,
@@ -93,28 +171,26 @@ def parse_file_content(file_name=None, file_content=None, is_capture_image: bool
93
171
  "page_number": page_number,
94
172
  "sheet_name": sheet_name,
95
173
  "excel_by_sheets": excel_by_sheets,
174
+ "prompt": prompt,
96
175
  "row_content": True,
97
176
  "json_documents": False
98
177
  })
99
- loader = loader_object['class'](**loader_kwargs)
178
+ return loader_kwargs
100
179
 
101
- if not loader:
102
- return ToolException(
103
- "Not supported type of files entered. Supported types are TXT, DOCX, PDF, PPTX, XLSX and XLS only.")
180
+ def prepare_loader(file_name=None, file_content=None, is_capture_image: bool = False, page_number: int = None,
181
+ sheet_name: str = None, llm=None, file_path: str = None, excel_by_sheets: bool = False,
182
+ prompt=None):
183
+ if (file_path and (file_name or file_content)) or (not file_path and (not file_name or file_content is None)):
184
+ raise ToolException("Either (file_name and file_content) or file_path must be provided, but not both.")
104
185
 
105
- if hasattr(loader, 'get_content'):
106
- return loader.get_content()
107
- else:
108
- if file_content:
109
- return load_content_from_bytes(file_content=file_content,
110
- extension=extension,
111
- loader_extra_config=loader_kwargs,
112
- llm=llm)
113
- else:
114
- return load_content(file_path=file_path,
115
- extension=extension,
116
- loader_extra_config=loader_kwargs,
117
- llm=llm)
186
+ extension = Path(file_path if file_path else file_name).suffix
187
+
188
+ loader_object = loaders_map.get(extension)
189
+ if not loader_object:
190
+ loader_object = loaders_map.get('.txt') # Default to text loader if no specific loader found
191
+ loader_kwargs = get_loader_kwargs(loader_object, file_name, file_content, is_capture_image, page_number, sheet_name, llm, file_path, excel_by_sheets, prompt)
192
+ loader = loader_object['class'](**loader_kwargs)
193
+ return loader
118
194
 
119
195
  # TODO: review usage of this function alongside with functions above
120
196
  def load_content(file_path: str, extension: str = None, loader_extra_config: dict = None, llm = None) -> str:
@@ -142,7 +218,7 @@ def load_content(file_path: str, extension: str = None, loader_extra_config: dic
142
218
  if "file_path" in loader_kwargs:
143
219
  del loader_kwargs["file_path"]
144
220
 
145
- loader = loader_cls(file_path, **loader_kwargs)
221
+ loader = loader_cls(file_path=file_path, **loader_kwargs)
146
222
  documents = loader.load()
147
223
 
148
224
  page_contents = [doc.page_content for doc in documents]
@@ -167,38 +243,129 @@ def load_content_from_bytes(file_content: bytes, extension: str = None, loader_e
167
243
  if temp_file_path and os.path.exists(temp_file_path):
168
244
  os.remove(temp_file_path)
169
245
 
170
- def process_content_by_type(document: Document, content, extension_source: str, llm = None, chunking_config={}) -> Generator[Document, None, None]:
246
+
247
+ def _load_content_from_bytes_with_prompt(file_content: bytes, extension: str = None, loader_extra_config: dict = None, llm = None, prompt: str = image_processing_prompt) -> str:
248
+ """Internal helper that behaves like load_content_from_bytes but also propagates prompt.
249
+
250
+ This keeps the public load_content_from_bytes API unchanged while allowing newer
251
+ code paths to pass an explicit prompt through to the loader.
252
+ """
171
253
  temp_file_path = None
172
254
  try:
173
- extension = "." + extension_source.split('.')[-1].lower()
174
-
175
- with tempfile.NamedTemporaryFile(mode='w+b', suffix=extension, delete=False) as temp_file:
176
- temp_file_path = temp_file.name
177
- if content is None:
178
- logger.warning("'loader_content' ie expected but not found in document metadata.")
179
- return
180
-
181
- temp_file.write(content)
255
+ with tempfile.NamedTemporaryFile(mode='w+b', delete=False, suffix=extension or '') as temp_file:
256
+ temp_file.write(file_content)
182
257
  temp_file.flush()
258
+ temp_file_path = temp_file.name
183
259
 
184
- loader_config = loaders_map.get(extension)
185
- if not loader_config:
186
- logger.warning(f"No loader found for file extension: {extension}. File: {temp_file_path}")
187
- return
188
-
189
- loader_cls = loader_config['class']
190
- loader_kwargs = loader_config['kwargs']
260
+ # Use prepare_loader so that prompt and other kwargs are handled consistently
261
+ loader = prepare_loader(
262
+ file_name=None,
263
+ file_content=None,
264
+ is_capture_image=loader_extra_config.get('extract_images') if loader_extra_config else False,
265
+ page_number=loader_extra_config.get('page_number') if loader_extra_config else None,
266
+ sheet_name=loader_extra_config.get('sheet_name') if loader_extra_config else None,
267
+ llm=llm or (loader_extra_config.get('llm') if loader_extra_config else None),
268
+ file_path=temp_file_path,
269
+ excel_by_sheets=loader_extra_config.get('excel_by_sheets') if loader_extra_config else False,
270
+ prompt=prompt or (loader_extra_config.get('prompt') if loader_extra_config else image_processing_prompt),
271
+ )
191
272
 
192
- loader = loader_cls(file_path=temp_file_path, **loader_kwargs)
193
- for chunk in loader.load():
194
- yield Document(
195
- page_content=sanitize_for_postgres(chunk.page_content),
196
- metadata={**document.metadata, **chunk.metadata}
197
- )
273
+ documents = loader.load()
274
+ page_contents = [doc.page_content for doc in documents]
275
+ return "\n".join(page_contents)
198
276
  finally:
199
277
  if temp_file_path and os.path.exists(temp_file_path):
200
278
  os.remove(temp_file_path)
201
279
 
280
+
281
+ def process_document_by_type(content, extension_source: str, document: Document = None, llm = None, chunking_config=None) \
282
+ -> Generator[Document, None, None]:
283
+ """Process the content of a file based on its type using a configured loader cosidering the origin document."""
284
+ try:
285
+ chunks = process_content_by_type(content, extension_source, llm, chunking_config)
286
+ except Exception as e:
287
+ msg = f"Error during content parsing for file {extension_source}:\n{e}"
288
+ logger.warning(msg)
289
+ yield Document(
290
+ page_content=msg,
291
+ metadata={**document.metadata, 'chunk_id': 1}
292
+ )
293
+ return
294
+ #
295
+ chunks_counter = 0
296
+ for chunk in chunks:
297
+ chunks_counter += 1
298
+ metadata = {**document.metadata, **chunk.metadata}
299
+ #
300
+ # ensure each chunk has a unique chunk_id
301
+ metadata['chunk_id'] = chunks_counter
302
+ #
303
+ yield Document(
304
+ page_content=sanitize_for_postgres(chunk.page_content),
305
+ metadata=metadata
306
+ )
307
+
308
+
309
+ def process_content_by_type(content, filename: str, llm=None, chunking_config=None, fallback_extensions=None) -> \
310
+ Generator[Document, None, None]:
311
+ """Process the content of a file based on its type using a configured loader."""
312
+ temp_file_path = None
313
+ extensions = fallback_extensions if fallback_extensions else []
314
+ match = re.search(r'\.([^.]+)$', filename)
315
+
316
+ if match:
317
+ extensions.insert(0, f".{match.group(1).lower()}")
318
+ elif not extensions:
319
+ extensions = [".txt"]
320
+
321
+ for extension in extensions:
322
+ try:
323
+ with tempfile.NamedTemporaryFile(mode='w+b', suffix=extension, delete=False) as temp_file:
324
+ temp_file_path = temp_file.name
325
+ if content is None:
326
+ logger.warning(
327
+ f"'{IndexerKeywords.CONTENT_IN_BYTES.value}' ie expected but not found in document metadata.")
328
+ return []
329
+
330
+ temp_file.write(content)
331
+ temp_file.flush()
332
+
333
+ loader_config = loaders_map.get(extension)
334
+ if not loader_config:
335
+ logger.warning(f"No loader found for file extension: {extension}. File: {temp_file_path}")
336
+ return []
337
+
338
+ loader_cls = loader_config['class']
339
+ loader_kwargs = loader_config['kwargs']
340
+ # Determine which loader configuration keys are allowed to be overridden by user input.
341
+ # If 'allowed_to_override' is specified in the loader configuration, use it; otherwise, allow all keys in loader_kwargs.
342
+ allowed_to_override = loader_config.get('allowed_to_override', loader_kwargs)
343
+ # If a chunking_config is provided and contains custom configuration for the current file extension,
344
+ # update loader_kwargs with user-supplied values, but only for keys explicitly permitted in allowed_to_override and if value differs from default.
345
+ # This ensures that only safe and intended parameters can be customized, preventing accidental or unauthorized changes
346
+ # to critical loader settings.
347
+ if chunking_config and (users_config_for_extension := chunking_config.get(extension, {})):
348
+ for key in set(users_config_for_extension.keys()) & set(allowed_to_override.keys()):
349
+ if users_config_for_extension[key] != allowed_to_override[key]:
350
+ loader_kwargs[key] = users_config_for_extension[key]
351
+ if LoaderProperties.LLM.value in loader_kwargs and loader_kwargs.pop(LoaderProperties.LLM.value):
352
+ loader_kwargs['llm'] = llm
353
+ if LoaderProperties.PROMPT_DEFAULT.value in loader_kwargs and loader_kwargs.pop(LoaderProperties.PROMPT_DEFAULT.value):
354
+ loader_kwargs[LoaderProperties.PROMPT.value] = image_processing_prompt
355
+ loader = loader_cls(file_path=temp_file_path, **loader_kwargs)
356
+ yield from loader.load()
357
+ break
358
+ except Exception as e:
359
+ if fallback_extensions:
360
+ logger.warning(f"Error loading attachment: {str(e)} for file {temp_file_path} (extension: {extension})")
361
+ logger.warning(f"Continuing with fallback extensions: {fallback_extensions}.")
362
+ continue
363
+ else:
364
+ raise e
365
+ finally:
366
+ if temp_file_path and os.path.exists(temp_file_path):
367
+ os.remove(temp_file_path)
368
+
202
369
  # FIXME copied from langchain_core/utils/strings.py of 0.3.74 version
203
370
  # https://github.com/langchain-ai/langchain/pull/32157
204
371
  # should be used from langchain_core.utils import sanitize_for_postgres once updated to newer version
@@ -218,4 +385,23 @@ def sanitize_for_postgres(text: str, replacement: str = "") -> str:
218
385
  >>> sanitize_for_postgres("Hello\\x00world", " ")
219
386
  'Hello world'
220
387
  """
221
- return text.replace("\x00", replacement)
388
+ return text.replace("\x00", replacement)
389
+
390
+
391
+ def file_extension_by_chunker(chunker_name: str) -> str | None:
392
+ if not chunker_name:
393
+ return None
394
+ name = chunker_name.lower()
395
+ if name == "markdown":
396
+ return ".md"
397
+ if name == "json":
398
+ return ".json"
399
+ if name == "text" or name == "txt":
400
+ return ".txt"
401
+ if name == "html":
402
+ return ".html"
403
+ if name == "xml":
404
+ return ".xml"
405
+ if name == "csv":
406
+ return ".csv"
407
+ return None
@@ -0,0 +1,254 @@
1
+ """
2
+ Shared text operations utilities for file manipulation across toolkits.
3
+
4
+ Provides common functionality for:
5
+ - Parsing OLD/NEW marker-based edits
6
+ - Text file validation
7
+ - Line-based slicing and partial reads
8
+ - Content searching with context
9
+ """
10
import logging
import re
from typing import Any, Dict, List, Optional, Tuple
13
+
14
+ logger = logging.getLogger(__name__)
15
+
16
# Whitelist of file suffixes (lower-case, with leading dot) that support text editing
TEXT_EDITABLE_EXTENSIONS = {
    # Documents, data and markup
    '.csv', '.html', '.json', '.md', '.txt', '.xml',
    # Configuration, logs and shell scripts
    '.conf', '.ini', '.log', '.sh', '.yaml', '.yml',
    # Source code
    '.c', '.cpp', '.cs', '.go', '.h', '.hpp', '.java',
    '.js', '.jsx', '.kt', '.m', '.php', '.py', '.r',
    '.rb', '.rs', '.scala', '.sql', '.swift', '.ts', '.tsx',
}
24
+
25
+
26
def parse_old_new_markers(file_query: str) -> List[Tuple[str, str]]:
    """
    Extract (old, new) replacement pairs from marker-delimited edit instructions.

    Sections are delimited line by line:
    - OLD <<<< ... >>>> OLD
    - NEW <<<< ... >>>> NEW

    Marker detection is a substring test on each line ("OLD <<<", ">>>> OLD",
    "NEW <<<", ">>>> NEW"); a line containing a marker is never part of the
    captured content. Each captured section is joined with newlines and
    stripped of surrounding whitespace. The i-th OLD section is paired with
    the i-th NEW section; surplus sections on either side are dropped.

    Args:
        file_query: String containing marked old and new content sections

    Returns:
        List of tuples (old_content, new_content) for each edit pair

    Example:
        >>> query = '''
        ... OLD <<<<
        ... Hello World
        ... >>>> OLD
        ... NEW <<<<
        ... Hello Mars
        ... >>>> NEW
        ... '''
        >>> parse_old_new_markers(query)
        [('Hello World', 'Hello Mars')]
    """
    # (section tag, opening marker, closing marker), tested in this order per line
    marker_table = (
        ("OLD", "OLD <<<", ">>>> OLD"),
        ("NEW", "NEW <<<", ">>>> NEW"),
    )

    finished = {"OLD": [], "NEW": []}      # completed section texts per tag
    is_open = {"OLD": False, "NEW": False}  # whether each section kind is currently open
    pending = []                            # lines gathered for the section in progress

    for text_line in file_query.split("\n"):
        for tag, opener, closer in marker_table:
            if opener in text_line:
                # Section start: begin capturing from a clean buffer
                is_open[tag] = True
                pending = []
                break
            if closer in text_line:
                # Section end: store what was gathered and reset the buffer
                is_open[tag] = False
                finished[tag].append("\n".join(pending).strip())
                pending = []
                break
        else:
            # Ordinary line: keep it only while some section is open
            if is_open["OLD"] or is_open["NEW"]:
                pending.append(text_line)

    return list(zip(finished["OLD"], finished["NEW"]))
102
+
103
+
104
def is_text_editable(filename: str) -> bool:
    """
    Tell whether a file may be edited as text, judging by its extension only.

    The final suffix of the name is lower-cased and looked up in
    TEXT_EDITABLE_EXTENSIONS; the file itself is never opened.

    Args:
        filename: Name or path of the file to check

    Returns:
        True if the lower-cased suffix is in the text-editable whitelist

    Example:
        >>> is_text_editable("config.json")
        True
        >>> is_text_editable("image.png")
        False
    """
    from pathlib import Path

    suffix = Path(filename).suffix
    normalized_suffix = suffix.lower()
    return normalized_suffix in TEXT_EDITABLE_EXTENSIONS
123
+
124
+
125
def apply_line_slice(
    content: str,
    offset: Optional[int] = None,
    limit: Optional[int] = None,
    head: Optional[int] = None,
    tail: Optional[int] = None
) -> str:
    """
    Apply line-based slicing to text content.

    Modes, checked in this order (the first one that applies wins):
    - head: Read only first N lines
    - tail: Read only last N lines (a non-positive value returns everything)
    - offset and/or limit: Read from line `offset` (1-indexed, defaults to the
      first line) for `limit` lines (defaults to the rest of the content)
    - No params: Return full content

    Lines are returned with their original line endings, so every returned
    line except possibly the last line of the content ends with its newline.

    Args:
        content: Text content to slice
        offset: Starting line number (1-indexed, inclusive); values below 1
            are treated as 1
        limit: Number of lines to read from offset; negative values are
            treated as 0. Honoured even when `offset` is omitted.
        head: Return only first N lines
        tail: Return only last N lines

    Returns:
        Sliced content as string

    Example:
        >>> text = "line1\\nline2\\nline3\\nline4\\nline5"
        >>> apply_line_slice(text, offset=2, limit=2)
        'line2\\nline3\\n'
        >>> apply_line_slice(text, limit=2)
        'line1\\nline2\\n'
        >>> apply_line_slice(text, head=2)
        'line1\\nline2\\n'
        >>> apply_line_slice(text, tail=2)
        'line4\\nline5'
    """
    if not content:
        return content

    # keepends=True preserves the original line terminators in the output
    lines = content.splitlines(keepends=True)

    # Head mode: first N lines
    if head is not None:
        return ''.join(lines[:head])

    # Tail mode: last N lines (lines[-0:] would be the whole list anyway,
    # so non-positive values are routed to the full list explicitly)
    if tail is not None:
        return ''.join(lines[-tail:] if tail > 0 else lines)

    # Offset / limit mode. A limit given without an offset starts at line 1
    # instead of being silently ignored.
    if offset is not None or limit is not None:
        start_idx = max(0, offset - 1) if offset is not None else 0
        if limit is None:
            return ''.join(lines[start_idx:])
        # Clamp so a negative limit cannot turn into a negative slice end,
        # which would count from the end of the content.
        end_idx = start_idx + max(0, limit)
        return ''.join(lines[start_idx:end_idx])

    # No slicing parameters: return full content
    return content
184
+
185
+
186
def search_in_content(
    content: str,
    pattern: str,
    is_regex: bool = True,
    context_lines: int = 2
) -> List[Dict[str, Any]]:
    """
    Search for pattern in content with context lines.

    Matching is case-insensitive and done line by line; at most one match
    (the first on the line) is reported per line.

    Args:
        content: Text content to search
        pattern: Search pattern (regex if is_regex=True, else literal string)
        is_regex: Whether to treat pattern as regex (default True)
        context_lines: Number of lines before/after match to include (default 2)

    Returns:
        List of match dictionaries with keys:
        - line_number: 1-indexed line number of match
        - line_content: The matching line
        - match_text: The actual matched text
        - context_before: List of lines before match
        - context_after: List of lines after match

        An empty list is returned for empty content, and also when
        `pattern` is not a valid regular expression (a warning is logged).

    Example:
        >>> text = "line1\\nHello World\\nline3"
        >>> matches = search_in_content(text, "Hello", is_regex=False)
        >>> matches[0]['line_number']
        2
        >>> matches[0]['match_text']
        'Hello'
    """
    if not content:
        return []

    lines = content.splitlines()
    matches = []

    # Compile regex pattern or escape for literal search
    if is_regex:
        try:
            regex = re.compile(pattern, re.IGNORECASE)
        except re.error as e:
            # A bad user-supplied pattern is reported, not raised
            logger.warning(f"Invalid regex pattern '{pattern}': {e}")
            return []
    else:
        regex = re.compile(re.escape(pattern), re.IGNORECASE)

    # Search each line
    for line_idx, line in enumerate(lines):
        match = regex.search(line)
        if not match:
            continue

        # Context window, clamped to the bounds of the content
        context_start = max(0, line_idx - context_lines)
        context_end = min(len(lines), line_idx + context_lines + 1)

        matches.append({
            'line_number': line_idx + 1,  # Convert to 1-indexed
            'line_content': line,
            'match_text': match.group(0),
            'context_before': lines[context_start:line_idx],
            'context_after': lines[line_idx + 1:context_end],
        })

    return matches