alita-sdk 0.3.257__py3-none-any.whl → 0.3.562__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (278)
  1. alita_sdk/cli/__init__.py +10 -0
  2. alita_sdk/cli/__main__.py +17 -0
  3. alita_sdk/cli/agent/__init__.py +5 -0
  4. alita_sdk/cli/agent/default.py +258 -0
  5. alita_sdk/cli/agent_executor.py +155 -0
  6. alita_sdk/cli/agent_loader.py +215 -0
  7. alita_sdk/cli/agent_ui.py +228 -0
  8. alita_sdk/cli/agents.py +3601 -0
  9. alita_sdk/cli/callbacks.py +647 -0
  10. alita_sdk/cli/cli.py +168 -0
  11. alita_sdk/cli/config.py +306 -0
  12. alita_sdk/cli/context/__init__.py +30 -0
  13. alita_sdk/cli/context/cleanup.py +198 -0
  14. alita_sdk/cli/context/manager.py +731 -0
  15. alita_sdk/cli/context/message.py +285 -0
  16. alita_sdk/cli/context/strategies.py +289 -0
  17. alita_sdk/cli/context/token_estimation.py +127 -0
  18. alita_sdk/cli/formatting.py +182 -0
  19. alita_sdk/cli/input_handler.py +419 -0
  20. alita_sdk/cli/inventory.py +1073 -0
  21. alita_sdk/cli/mcp_loader.py +315 -0
  22. alita_sdk/cli/toolkit.py +327 -0
  23. alita_sdk/cli/toolkit_loader.py +85 -0
  24. alita_sdk/cli/tools/__init__.py +43 -0
  25. alita_sdk/cli/tools/approval.py +224 -0
  26. alita_sdk/cli/tools/filesystem.py +1751 -0
  27. alita_sdk/cli/tools/planning.py +389 -0
  28. alita_sdk/cli/tools/terminal.py +414 -0
  29. alita_sdk/community/__init__.py +72 -12
  30. alita_sdk/community/inventory/__init__.py +236 -0
  31. alita_sdk/community/inventory/config.py +257 -0
  32. alita_sdk/community/inventory/enrichment.py +2137 -0
  33. alita_sdk/community/inventory/extractors.py +1469 -0
  34. alita_sdk/community/inventory/ingestion.py +3172 -0
  35. alita_sdk/community/inventory/knowledge_graph.py +1457 -0
  36. alita_sdk/community/inventory/parsers/__init__.py +218 -0
  37. alita_sdk/community/inventory/parsers/base.py +295 -0
  38. alita_sdk/community/inventory/parsers/csharp_parser.py +907 -0
  39. alita_sdk/community/inventory/parsers/go_parser.py +851 -0
  40. alita_sdk/community/inventory/parsers/html_parser.py +389 -0
  41. alita_sdk/community/inventory/parsers/java_parser.py +593 -0
  42. alita_sdk/community/inventory/parsers/javascript_parser.py +629 -0
  43. alita_sdk/community/inventory/parsers/kotlin_parser.py +768 -0
  44. alita_sdk/community/inventory/parsers/markdown_parser.py +362 -0
  45. alita_sdk/community/inventory/parsers/python_parser.py +604 -0
  46. alita_sdk/community/inventory/parsers/rust_parser.py +858 -0
  47. alita_sdk/community/inventory/parsers/swift_parser.py +832 -0
  48. alita_sdk/community/inventory/parsers/text_parser.py +322 -0
  49. alita_sdk/community/inventory/parsers/yaml_parser.py +370 -0
  50. alita_sdk/community/inventory/patterns/__init__.py +61 -0
  51. alita_sdk/community/inventory/patterns/ast_adapter.py +380 -0
  52. alita_sdk/community/inventory/patterns/loader.py +348 -0
  53. alita_sdk/community/inventory/patterns/registry.py +198 -0
  54. alita_sdk/community/inventory/presets.py +535 -0
  55. alita_sdk/community/inventory/retrieval.py +1403 -0
  56. alita_sdk/community/inventory/toolkit.py +173 -0
  57. alita_sdk/community/inventory/toolkit_utils.py +176 -0
  58. alita_sdk/community/inventory/visualize.py +1370 -0
  59. alita_sdk/configurations/__init__.py +11 -0
  60. alita_sdk/configurations/ado.py +148 -2
  61. alita_sdk/configurations/azure_search.py +1 -1
  62. alita_sdk/configurations/bigquery.py +1 -1
  63. alita_sdk/configurations/bitbucket.py +94 -2
  64. alita_sdk/configurations/browser.py +18 -0
  65. alita_sdk/configurations/carrier.py +19 -0
  66. alita_sdk/configurations/confluence.py +130 -1
  67. alita_sdk/configurations/delta_lake.py +1 -1
  68. alita_sdk/configurations/figma.py +76 -5
  69. alita_sdk/configurations/github.py +65 -1
  70. alita_sdk/configurations/gitlab.py +81 -0
  71. alita_sdk/configurations/google_places.py +17 -0
  72. alita_sdk/configurations/jira.py +103 -0
  73. alita_sdk/configurations/openapi.py +111 -0
  74. alita_sdk/configurations/postman.py +1 -1
  75. alita_sdk/configurations/qtest.py +72 -3
  76. alita_sdk/configurations/report_portal.py +115 -0
  77. alita_sdk/configurations/salesforce.py +19 -0
  78. alita_sdk/configurations/service_now.py +1 -12
  79. alita_sdk/configurations/sharepoint.py +167 -0
  80. alita_sdk/configurations/sonar.py +18 -0
  81. alita_sdk/configurations/sql.py +20 -0
  82. alita_sdk/configurations/testio.py +101 -0
  83. alita_sdk/configurations/testrail.py +88 -0
  84. alita_sdk/configurations/xray.py +94 -1
  85. alita_sdk/configurations/zephyr_enterprise.py +94 -1
  86. alita_sdk/configurations/zephyr_essential.py +95 -0
  87. alita_sdk/runtime/clients/artifact.py +21 -4
  88. alita_sdk/runtime/clients/client.py +458 -67
  89. alita_sdk/runtime/clients/mcp_discovery.py +342 -0
  90. alita_sdk/runtime/clients/mcp_manager.py +262 -0
  91. alita_sdk/runtime/clients/sandbox_client.py +352 -0
  92. alita_sdk/runtime/langchain/_constants_bkup.py +1318 -0
  93. alita_sdk/runtime/langchain/assistant.py +183 -43
  94. alita_sdk/runtime/langchain/constants.py +647 -1
  95. alita_sdk/runtime/langchain/document_loaders/AlitaDocxMammothLoader.py +315 -3
  96. alita_sdk/runtime/langchain/document_loaders/AlitaExcelLoader.py +209 -31
  97. alita_sdk/runtime/langchain/document_loaders/AlitaImageLoader.py +1 -1
  98. alita_sdk/runtime/langchain/document_loaders/AlitaJSONLinesLoader.py +77 -0
  99. alita_sdk/runtime/langchain/document_loaders/AlitaJSONLoader.py +10 -3
  100. alita_sdk/runtime/langchain/document_loaders/AlitaMarkdownLoader.py +66 -0
  101. alita_sdk/runtime/langchain/document_loaders/AlitaPDFLoader.py +79 -10
  102. alita_sdk/runtime/langchain/document_loaders/AlitaPowerPointLoader.py +52 -15
  103. alita_sdk/runtime/langchain/document_loaders/AlitaPythonLoader.py +9 -0
  104. alita_sdk/runtime/langchain/document_loaders/AlitaTableLoader.py +1 -4
  105. alita_sdk/runtime/langchain/document_loaders/AlitaTextLoader.py +15 -2
  106. alita_sdk/runtime/langchain/document_loaders/ImageParser.py +30 -0
  107. alita_sdk/runtime/langchain/document_loaders/constants.py +189 -41
  108. alita_sdk/runtime/langchain/interfaces/llm_processor.py +4 -2
  109. alita_sdk/runtime/langchain/langraph_agent.py +407 -92
  110. alita_sdk/runtime/langchain/utils.py +102 -8
  111. alita_sdk/runtime/llms/preloaded.py +2 -6
  112. alita_sdk/runtime/models/mcp_models.py +61 -0
  113. alita_sdk/runtime/skills/__init__.py +91 -0
  114. alita_sdk/runtime/skills/callbacks.py +498 -0
  115. alita_sdk/runtime/skills/discovery.py +540 -0
  116. alita_sdk/runtime/skills/executor.py +610 -0
  117. alita_sdk/runtime/skills/input_builder.py +371 -0
  118. alita_sdk/runtime/skills/models.py +330 -0
  119. alita_sdk/runtime/skills/registry.py +355 -0
  120. alita_sdk/runtime/skills/skill_runner.py +330 -0
  121. alita_sdk/runtime/toolkits/__init__.py +28 -0
  122. alita_sdk/runtime/toolkits/application.py +14 -4
  123. alita_sdk/runtime/toolkits/artifact.py +24 -9
  124. alita_sdk/runtime/toolkits/datasource.py +13 -6
  125. alita_sdk/runtime/toolkits/mcp.py +780 -0
  126. alita_sdk/runtime/toolkits/planning.py +178 -0
  127. alita_sdk/runtime/toolkits/skill_router.py +238 -0
  128. alita_sdk/runtime/toolkits/subgraph.py +11 -6
  129. alita_sdk/runtime/toolkits/tools.py +314 -70
  130. alita_sdk/runtime/toolkits/vectorstore.py +11 -5
  131. alita_sdk/runtime/tools/__init__.py +24 -0
  132. alita_sdk/runtime/tools/application.py +16 -4
  133. alita_sdk/runtime/tools/artifact.py +367 -33
  134. alita_sdk/runtime/tools/data_analysis.py +183 -0
  135. alita_sdk/runtime/tools/function.py +100 -4
  136. alita_sdk/runtime/tools/graph.py +81 -0
  137. alita_sdk/runtime/tools/image_generation.py +218 -0
  138. alita_sdk/runtime/tools/llm.py +1013 -177
  139. alita_sdk/runtime/tools/loop.py +3 -1
  140. alita_sdk/runtime/tools/loop_output.py +3 -1
  141. alita_sdk/runtime/tools/mcp_inspect_tool.py +284 -0
  142. alita_sdk/runtime/tools/mcp_remote_tool.py +181 -0
  143. alita_sdk/runtime/tools/mcp_server_tool.py +3 -1
  144. alita_sdk/runtime/tools/planning/__init__.py +36 -0
  145. alita_sdk/runtime/tools/planning/models.py +246 -0
  146. alita_sdk/runtime/tools/planning/wrapper.py +607 -0
  147. alita_sdk/runtime/tools/router.py +2 -1
  148. alita_sdk/runtime/tools/sandbox.py +375 -0
  149. alita_sdk/runtime/tools/skill_router.py +776 -0
  150. alita_sdk/runtime/tools/tool.py +3 -1
  151. alita_sdk/runtime/tools/vectorstore.py +69 -65
  152. alita_sdk/runtime/tools/vectorstore_base.py +163 -90
  153. alita_sdk/runtime/utils/AlitaCallback.py +137 -21
  154. alita_sdk/runtime/utils/mcp_client.py +492 -0
  155. alita_sdk/runtime/utils/mcp_oauth.py +361 -0
  156. alita_sdk/runtime/utils/mcp_sse_client.py +434 -0
  157. alita_sdk/runtime/utils/mcp_tools_discovery.py +124 -0
  158. alita_sdk/runtime/utils/streamlit.py +41 -14
  159. alita_sdk/runtime/utils/toolkit_utils.py +28 -9
  160. alita_sdk/runtime/utils/utils.py +48 -0
  161. alita_sdk/tools/__init__.py +135 -37
  162. alita_sdk/tools/ado/__init__.py +2 -2
  163. alita_sdk/tools/ado/repos/__init__.py +15 -19
  164. alita_sdk/tools/ado/repos/repos_wrapper.py +12 -20
  165. alita_sdk/tools/ado/test_plan/__init__.py +26 -8
  166. alita_sdk/tools/ado/test_plan/test_plan_wrapper.py +56 -28
  167. alita_sdk/tools/ado/wiki/__init__.py +27 -12
  168. alita_sdk/tools/ado/wiki/ado_wrapper.py +114 -40
  169. alita_sdk/tools/ado/work_item/__init__.py +27 -12
  170. alita_sdk/tools/ado/work_item/ado_wrapper.py +95 -11
  171. alita_sdk/tools/advanced_jira_mining/__init__.py +12 -8
  172. alita_sdk/tools/aws/delta_lake/__init__.py +14 -11
  173. alita_sdk/tools/aws/delta_lake/tool.py +5 -1
  174. alita_sdk/tools/azure_ai/search/__init__.py +13 -8
  175. alita_sdk/tools/base/tool.py +5 -1
  176. alita_sdk/tools/base_indexer_toolkit.py +454 -110
  177. alita_sdk/tools/bitbucket/__init__.py +27 -19
  178. alita_sdk/tools/bitbucket/api_wrapper.py +285 -27
  179. alita_sdk/tools/bitbucket/cloud_api_wrapper.py +5 -5
  180. alita_sdk/tools/browser/__init__.py +41 -16
  181. alita_sdk/tools/browser/crawler.py +3 -1
  182. alita_sdk/tools/browser/utils.py +15 -6
  183. alita_sdk/tools/carrier/__init__.py +18 -17
  184. alita_sdk/tools/carrier/backend_reports_tool.py +8 -4
  185. alita_sdk/tools/carrier/excel_reporter.py +8 -4
  186. alita_sdk/tools/chunkers/__init__.py +3 -1
  187. alita_sdk/tools/chunkers/code/codeparser.py +1 -1
  188. alita_sdk/tools/chunkers/sematic/json_chunker.py +2 -1
  189. alita_sdk/tools/chunkers/sematic/markdown_chunker.py +97 -6
  190. alita_sdk/tools/chunkers/sematic/proposal_chunker.py +1 -1
  191. alita_sdk/tools/chunkers/universal_chunker.py +270 -0
  192. alita_sdk/tools/cloud/aws/__init__.py +11 -7
  193. alita_sdk/tools/cloud/azure/__init__.py +11 -7
  194. alita_sdk/tools/cloud/gcp/__init__.py +11 -7
  195. alita_sdk/tools/cloud/k8s/__init__.py +11 -7
  196. alita_sdk/tools/code/linter/__init__.py +9 -8
  197. alita_sdk/tools/code/loaders/codesearcher.py +3 -2
  198. alita_sdk/tools/code/sonar/__init__.py +20 -13
  199. alita_sdk/tools/code_indexer_toolkit.py +199 -0
  200. alita_sdk/tools/confluence/__init__.py +21 -14
  201. alita_sdk/tools/confluence/api_wrapper.py +197 -58
  202. alita_sdk/tools/confluence/loader.py +14 -2
  203. alita_sdk/tools/custom_open_api/__init__.py +11 -5
  204. alita_sdk/tools/elastic/__init__.py +10 -8
  205. alita_sdk/tools/elitea_base.py +546 -64
  206. alita_sdk/tools/figma/__init__.py +11 -8
  207. alita_sdk/tools/figma/api_wrapper.py +352 -153
  208. alita_sdk/tools/github/__init__.py +17 -17
  209. alita_sdk/tools/github/api_wrapper.py +9 -26
  210. alita_sdk/tools/github/github_client.py +81 -12
  211. alita_sdk/tools/github/schemas.py +2 -1
  212. alita_sdk/tools/github/tool.py +5 -1
  213. alita_sdk/tools/gitlab/__init__.py +18 -13
  214. alita_sdk/tools/gitlab/api_wrapper.py +224 -80
  215. alita_sdk/tools/gitlab_org/__init__.py +13 -10
  216. alita_sdk/tools/google/bigquery/__init__.py +13 -13
  217. alita_sdk/tools/google/bigquery/tool.py +5 -1
  218. alita_sdk/tools/google_places/__init__.py +20 -11
  219. alita_sdk/tools/jira/__init__.py +21 -11
  220. alita_sdk/tools/jira/api_wrapper.py +315 -168
  221. alita_sdk/tools/keycloak/__init__.py +10 -8
  222. alita_sdk/tools/localgit/__init__.py +8 -3
  223. alita_sdk/tools/localgit/local_git.py +62 -54
  224. alita_sdk/tools/localgit/tool.py +5 -1
  225. alita_sdk/tools/memory/__init__.py +38 -14
  226. alita_sdk/tools/non_code_indexer_toolkit.py +7 -2
  227. alita_sdk/tools/ocr/__init__.py +10 -8
  228. alita_sdk/tools/openapi/__init__.py +281 -108
  229. alita_sdk/tools/openapi/api_wrapper.py +883 -0
  230. alita_sdk/tools/openapi/tool.py +20 -0
  231. alita_sdk/tools/pandas/__init__.py +18 -11
  232. alita_sdk/tools/pandas/api_wrapper.py +40 -45
  233. alita_sdk/tools/pandas/dataframe/generator/base.py +3 -1
  234. alita_sdk/tools/postman/__init__.py +10 -11
  235. alita_sdk/tools/postman/api_wrapper.py +19 -8
  236. alita_sdk/tools/postman/postman_analysis.py +8 -1
  237. alita_sdk/tools/pptx/__init__.py +10 -10
  238. alita_sdk/tools/qtest/__init__.py +21 -14
  239. alita_sdk/tools/qtest/api_wrapper.py +1784 -88
  240. alita_sdk/tools/rally/__init__.py +12 -10
  241. alita_sdk/tools/report_portal/__init__.py +22 -16
  242. alita_sdk/tools/salesforce/__init__.py +21 -16
  243. alita_sdk/tools/servicenow/__init__.py +20 -16
  244. alita_sdk/tools/servicenow/api_wrapper.py +1 -1
  245. alita_sdk/tools/sharepoint/__init__.py +16 -14
  246. alita_sdk/tools/sharepoint/api_wrapper.py +179 -39
  247. alita_sdk/tools/sharepoint/authorization_helper.py +191 -1
  248. alita_sdk/tools/sharepoint/utils.py +8 -2
  249. alita_sdk/tools/slack/__init__.py +11 -7
  250. alita_sdk/tools/sql/__init__.py +21 -19
  251. alita_sdk/tools/sql/api_wrapper.py +71 -23
  252. alita_sdk/tools/testio/__init__.py +20 -13
  253. alita_sdk/tools/testrail/__init__.py +12 -11
  254. alita_sdk/tools/testrail/api_wrapper.py +214 -46
  255. alita_sdk/tools/utils/__init__.py +28 -4
  256. alita_sdk/tools/utils/content_parser.py +182 -62
  257. alita_sdk/tools/utils/text_operations.py +254 -0
  258. alita_sdk/tools/vector_adapters/VectorStoreAdapter.py +83 -27
  259. alita_sdk/tools/xray/__init__.py +17 -14
  260. alita_sdk/tools/xray/api_wrapper.py +58 -113
  261. alita_sdk/tools/yagmail/__init__.py +8 -3
  262. alita_sdk/tools/zephyr/__init__.py +11 -7
  263. alita_sdk/tools/zephyr_enterprise/__init__.py +15 -9
  264. alita_sdk/tools/zephyr_enterprise/api_wrapper.py +30 -15
  265. alita_sdk/tools/zephyr_essential/__init__.py +15 -10
  266. alita_sdk/tools/zephyr_essential/api_wrapper.py +297 -54
  267. alita_sdk/tools/zephyr_essential/client.py +6 -4
  268. alita_sdk/tools/zephyr_scale/__init__.py +12 -8
  269. alita_sdk/tools/zephyr_scale/api_wrapper.py +39 -31
  270. alita_sdk/tools/zephyr_squad/__init__.py +11 -7
  271. {alita_sdk-0.3.257.dist-info → alita_sdk-0.3.562.dist-info}/METADATA +184 -37
  272. alita_sdk-0.3.562.dist-info/RECORD +450 -0
  273. alita_sdk-0.3.562.dist-info/entry_points.txt +2 -0
  274. alita_sdk/tools/bitbucket/tools.py +0 -304
  275. alita_sdk-0.3.257.dist-info/RECORD +0 -343
  276. {alita_sdk-0.3.257.dist-info → alita_sdk-0.3.562.dist-info}/WHEEL +0 -0
  277. {alita_sdk-0.3.257.dist-info → alita_sdk-0.3.562.dist-info}/licenses/LICENSE +0 -0
  278. {alita_sdk-0.3.257.dist-info → alita_sdk-0.3.562.dist-info}/top_level.txt +0 -0
@@ -1,13 +1,17 @@
1
+ from copy import deepcopy
1
2
  import os
3
+ import re
2
4
  import tempfile
3
5
  from logging import getLogger
4
6
  from pathlib import Path
5
- from typing import Generator
7
+ from typing import Generator, List
6
8
 
7
9
  from langchain_core.documents import Document
8
10
  from langchain_core.tools import ToolException
9
11
 
10
- from alita_sdk.runtime.langchain.document_loaders.constants import loaders_map
12
+ from alita_sdk.runtime.langchain.document_loaders.constants import loaders_map, LoaderProperties
13
+ from ...runtime.langchain.document_loaders.AlitaTextLoader import AlitaTextLoader
14
+ from ...runtime.utils.utils import IndexerKeywords
11
15
 
12
16
  logger = getLogger(__name__)
13
17
 
@@ -51,11 +55,9 @@ Highlight any visible details that could help in understanding the image.
51
55
  Be as precise and thorough as possible in your responses. If something is unclear or illegible, state that explicitly.
52
56
  '''
53
57
 
54
- IMAGE_EXTENSIONS = ['jpg', 'jpeg', 'png', 'gif', 'bmp', 'tiff', 'webp', 'svg']
55
-
56
58
 
57
59
  def parse_file_content(file_name=None, file_content=None, is_capture_image: bool = False, page_number: int = None,
58
- sheet_name: str = None, llm=None, file_path: str = None, excel_by_sheets: bool = False) -> str | ToolException:
60
+ sheet_name: str = None, llm=None, file_path: str = None, excel_by_sheets: bool = False, prompt=None) -> str | ToolException:
59
61
  """Parse the content of a file based on its type and return the parsed content.
60
62
 
61
63
  Args:
@@ -72,18 +74,63 @@ def parse_file_content(file_name=None, file_content=None, is_capture_image: bool
72
74
  Raises:
73
75
  ToolException: If the file type is not supported or if there is an error reading the file.
74
76
  """
77
+ if not prompt:
78
+ prompt = image_processing_prompt
79
+ loader = prepare_loader(
80
+ file_name=file_name,
81
+ file_content=file_content,
82
+ is_capture_image=is_capture_image,
83
+ page_number=page_number,
84
+ sheet_name=sheet_name,
85
+ llm=llm,
86
+ file_path=file_path,
87
+ excel_by_sheets=excel_by_sheets,
88
+ prompt=prompt
89
+ )
75
90
 
76
- if (file_path and (file_name or file_content)) or (not file_path and (not file_name or file_content is None)):
77
- raise ToolException("Either (file_name and file_content) or file_path must be provided, but not both.")
78
-
79
- extension = Path(file_path if file_path else file_name).suffix
91
+ if not loader:
92
+ return ToolException(
93
+ "Not supported type of files entered. Supported types are TXT, DOCX, PDF, PPTX, XLSX and XLS only.")
80
94
 
81
- loader_object = loaders_map.get(extension)
82
- if not loader_object:
83
- logger.warning(f"No loader found for file extension: {extension}. File: {file_path if file_path else file_name}")
95
+ try:
96
+ if hasattr(loader, 'get_content'):
97
+ return loader.get_content()
98
+ else:
99
+ extension = Path(file_path if file_path else file_name).suffix
100
+ loader_kwargs = get_loader_kwargs(loaders_map.get(extension), file_name, file_content, is_capture_image, page_number, sheet_name, llm, file_path, excel_by_sheets)
101
+ if file_content:
102
+ return load_content_from_bytes(file_content=file_content,
103
+ extension=extension,
104
+ loader_extra_config=loader_kwargs,
105
+ llm=llm)
106
+ else:
107
+ return load_content(file_path=file_path,
108
+ extension=extension,
109
+ loader_extra_config=loader_kwargs,
110
+ llm=llm)
111
+ except Exception as e:
112
+ return ToolException(f"Error reading file ({file_name or file_path}) content. Make sure these types are supported: {str(e)}")
113
+
114
+ def load_file_docs(file_name=None, file_content=None, is_capture_image: bool = False, page_number: int = None,
115
+ sheet_name: str = None, llm=None, file_path: str = None, excel_by_sheets: bool = False) -> List[Document] | ToolException:
116
+ loader = prepare_loader(
117
+ file_name=file_name,
118
+ file_content=file_content,
119
+ is_capture_image=is_capture_image,
120
+ page_number=page_number,
121
+ sheet_name=sheet_name,
122
+ llm=llm,
123
+ file_path=file_path,
124
+ excel_by_sheets=excel_by_sheets
125
+ )
126
+ if not loader:
84
127
  return ToolException(
85
128
  "Not supported type of files entered. Supported types are TXT, DOCX, PDF, PPTX, XLSX and XLS only.")
86
- loader_kwargs = loader_object['kwargs']
129
+ return loader.load()
130
+
131
+ def get_loader_kwargs(loader_object, file_name=None, file_content=None, is_capture_image: bool = False, page_number: int = None,
132
+ sheet_name: str = None, llm=None, file_path: str = None, excel_by_sheets: bool = False, prompt=None):
133
+ loader_kwargs = deepcopy(loader_object['kwargs'])
87
134
  loader_kwargs.update({
88
135
  "file_path": file_path,
89
136
  "file_content": file_content,
@@ -93,28 +140,26 @@ def parse_file_content(file_name=None, file_content=None, is_capture_image: bool
93
140
  "page_number": page_number,
94
141
  "sheet_name": sheet_name,
95
142
  "excel_by_sheets": excel_by_sheets,
143
+ "prompt": prompt,
96
144
  "row_content": True,
97
145
  "json_documents": False
98
146
  })
99
- loader = loader_object['class'](**loader_kwargs)
147
+ return loader_kwargs
100
148
 
101
- if not loader:
102
- return ToolException(
103
- "Not supported type of files entered. Supported types are TXT, DOCX, PDF, PPTX, XLSX and XLS only.")
149
+ def prepare_loader(file_name=None, file_content=None, is_capture_image: bool = False, page_number: int = None,
150
+ sheet_name: str = None, llm=None, file_path: str = None, excel_by_sheets: bool = False,
151
+ prompt=None):
152
+ if (file_path and (file_name or file_content)) or (not file_path and (not file_name or file_content is None)):
153
+ raise ToolException("Either (file_name and file_content) or file_path must be provided, but not both.")
104
154
 
105
- if hasattr(loader, 'get_content'):
106
- return loader.get_content()
107
- else:
108
- if file_content:
109
- return load_content_from_bytes(file_content=file_content,
110
- extension=extension,
111
- loader_extra_config=loader_kwargs,
112
- llm=llm)
113
- else:
114
- return load_content(file_path=file_path,
115
- extension=extension,
116
- loader_extra_config=loader_kwargs,
117
- llm=llm)
155
+ extension = Path(file_path if file_path else file_name).suffix
156
+
157
+ loader_object = loaders_map.get(extension)
158
+ if not loader_object:
159
+ loader_object = loaders_map.get('.txt') # Default to text loader if no specific loader found
160
+ loader_kwargs = get_loader_kwargs(loader_object, file_name, file_content, is_capture_image, page_number, sheet_name, llm, file_path, excel_by_sheets, prompt)
161
+ loader = loader_object['class'](**loader_kwargs)
162
+ return loader
118
163
 
119
164
  # TODO: review usage of this function alongside with functions above
120
165
  def load_content(file_path: str, extension: str = None, loader_extra_config: dict = None, llm = None) -> str:
@@ -142,7 +187,7 @@ def load_content(file_path: str, extension: str = None, loader_extra_config: dic
142
187
  if "file_path" in loader_kwargs:
143
188
  del loader_kwargs["file_path"]
144
189
 
145
- loader = loader_cls(file_path, **loader_kwargs)
190
+ loader = loader_cls(file_path=file_path, **loader_kwargs)
146
191
  documents = loader.load()
147
192
 
148
193
  page_contents = [doc.page_content for doc in documents]
@@ -167,37 +212,93 @@ def load_content_from_bytes(file_content: bytes, extension: str = None, loader_e
167
212
  if temp_file_path and os.path.exists(temp_file_path):
168
213
  os.remove(temp_file_path)
169
214
 
170
- def process_content_by_type(document: Document, content, extension_source: str, llm = None, chunking_config={}) -> Generator[Document, None, None]:
171
- temp_file_path = None
215
+ def process_document_by_type(content, extension_source: str, document: Document = None, llm = None, chunking_config=None) \
216
+ -> Generator[Document, None, None]:
217
+ """Process the content of a file based on its type using a configured loader cosidering the origin document."""
172
218
  try:
173
- extension = "." + extension_source.split('.')[-1].lower()
174
-
175
- with tempfile.NamedTemporaryFile(mode='w+b', suffix=extension, delete=False) as temp_file:
176
- temp_file_path = temp_file.name
177
- if content is None:
178
- logger.warning("'loader_content' ie expected but not found in document metadata.")
179
- return
180
-
181
- temp_file.write(content)
182
- temp_file.flush()
183
-
184
- loader_config = loaders_map.get(extension)
185
- if not loader_config:
186
- logger.warning(f"No loader found for file extension: {extension}. File: {temp_file_path}")
187
- return
188
-
189
- loader_cls = loader_config['class']
190
- loader_kwargs = loader_config['kwargs']
191
-
192
- loader = loader_cls(file_path=temp_file_path, **loader_kwargs)
193
- for chunk in loader.load():
194
- yield Document(
195
- page_content=sanitize_for_postgres(chunk.page_content),
196
- metadata={**document.metadata, **chunk.metadata}
197
- )
198
- finally:
199
- if temp_file_path and os.path.exists(temp_file_path):
200
- os.remove(temp_file_path)
219
+ chunks = process_content_by_type(content, extension_source, llm, chunking_config)
220
+ except Exception as e:
221
+ msg = f"Error during content parsing for file {extension_source}:\n{e}"
222
+ logger.warning(msg)
223
+ yield Document(
224
+ page_content=msg,
225
+ metadata={**document.metadata, 'chunk_id': 1}
226
+ )
227
+ return
228
+ #
229
+ chunks_counter = 0
230
+ for chunk in chunks:
231
+ chunks_counter += 1
232
+ metadata = {**document.metadata, **chunk.metadata}
233
+ #
234
+ # ensure each chunk has a unique chunk_id
235
+ metadata['chunk_id'] = chunks_counter
236
+ #
237
+ yield Document(
238
+ page_content=sanitize_for_postgres(chunk.page_content),
239
+ metadata=metadata
240
+ )
241
+
242
+
243
+ def process_content_by_type(content, filename: str, llm=None, chunking_config=None, fallback_extensions=None) -> \
244
+ Generator[Document, None, None]:
245
+ """Process the content of a file based on its type using a configured loader."""
246
+ temp_file_path = None
247
+ extensions = fallback_extensions if fallback_extensions else []
248
+ match = re.search(r'\.([^.]+)$', filename)
249
+
250
+ if match:
251
+ extensions.insert(0, f".{match.group(1).lower()}")
252
+ elif not extensions:
253
+ extensions = [".txt"]
254
+
255
+ for extension in extensions:
256
+ try:
257
+ with tempfile.NamedTemporaryFile(mode='w+b', suffix=extension, delete=False) as temp_file:
258
+ temp_file_path = temp_file.name
259
+ if content is None:
260
+ logger.warning(
261
+ f"'{IndexerKeywords.CONTENT_IN_BYTES.value}' ie expected but not found in document metadata.")
262
+ return []
263
+
264
+ temp_file.write(content)
265
+ temp_file.flush()
266
+
267
+ loader_config = loaders_map.get(extension)
268
+ if not loader_config:
269
+ logger.warning(f"No loader found for file extension: {extension}. File: {temp_file_path}")
270
+ return []
271
+
272
+ loader_cls = loader_config['class']
273
+ loader_kwargs = loader_config['kwargs']
274
+ # Determine which loader configuration keys are allowed to be overridden by user input.
275
+ # If 'allowed_to_override' is specified in the loader configuration, use it; otherwise, allow all keys in loader_kwargs.
276
+ allowed_to_override = loader_config.get('allowed_to_override', loader_kwargs)
277
+ # If a chunking_config is provided and contains custom configuration for the current file extension,
278
+ # update loader_kwargs with user-supplied values, but only for keys explicitly permitted in allowed_to_override and if value differs from default.
279
+ # This ensures that only safe and intended parameters can be customized, preventing accidental or unauthorized changes
280
+ # to critical loader settings.
281
+ if chunking_config and (users_config_for_extension := chunking_config.get(extension, {})):
282
+ for key in set(users_config_for_extension.keys()) & set(allowed_to_override.keys()):
283
+ if users_config_for_extension[key] != allowed_to_override[key]:
284
+ loader_kwargs[key] = users_config_for_extension[key]
285
+ if LoaderProperties.LLM.value in loader_kwargs and loader_kwargs.pop(LoaderProperties.LLM.value):
286
+ loader_kwargs['llm'] = llm
287
+ if LoaderProperties.PROMPT_DEFAULT.value in loader_kwargs and loader_kwargs.pop(LoaderProperties.PROMPT_DEFAULT.value):
288
+ loader_kwargs[LoaderProperties.PROMPT.value] = image_processing_prompt
289
+ loader = loader_cls(file_path=temp_file_path, **loader_kwargs)
290
+ yield from loader.load()
291
+ break
292
+ except Exception as e:
293
+ if fallback_extensions:
294
+ logger.warning(f"Error loading attachment: {str(e)} for file {temp_file_path} (extension: {extension})")
295
+ logger.warning(f"Continuing with fallback extensions: {fallback_extensions}.")
296
+ continue
297
+ else:
298
+ raise e
299
+ finally:
300
+ if temp_file_path and os.path.exists(temp_file_path):
301
+ os.remove(temp_file_path)
201
302
 
202
303
  # FIXME copied from langchain_core/utils/strings.py of 0.3.74 version
203
304
  # https://github.com/langchain-ai/langchain/pull/32157
@@ -218,4 +319,23 @@ def sanitize_for_postgres(text: str, replacement: str = "") -> str:
218
319
  >>> sanitize_for_postgres("Hello\\x00world", " ")
219
320
  'Hello world'
220
321
  """
221
- return text.replace("\x00", replacement)
322
+ return text.replace("\x00", replacement)
323
+
324
+
325
+ def file_extension_by_chunker(chunker_name: str) -> str | None:
326
+ if not chunker_name:
327
+ return None
328
+ name = chunker_name.lower()
329
+ if name == "markdown":
330
+ return ".md"
331
+ if name == "json":
332
+ return ".json"
333
+ if name == "text" or name == "txt":
334
+ return ".txt"
335
+ if name == "html":
336
+ return ".html"
337
+ if name == "xml":
338
+ return ".xml"
339
+ if name == "csv":
340
+ return ".csv"
341
+ return None
@@ -0,0 +1,254 @@
1
+ """
2
+ Shared text operations utilities for file manipulation across toolkits.
3
+
4
+ Provides common functionality for:
5
+ - Parsing OLD/NEW marker-based edits
6
+ - Text file validation
7
+ - Line-based slicing and partial reads
8
+ - Content searching with context
9
+ """
10
+ import re
11
+ import logging
12
+ from typing import List, Tuple, Dict, Optional
13
+
14
+ logger = logging.getLogger(__name__)
15
+
16
# Whitelist of file extensions (lower-case, leading dot included) whose files
# are treated as editable text.
TEXT_EDITABLE_EXTENSIONS = {
    # Documents and markup
    '.md', '.txt', '.html', '.xml',
    # Data, configuration and logs
    '.csv', '.json', '.yaml', '.yml', '.ini', '.conf', '.log',
    # Scripts and source code
    '.sh', '.py', '.js', '.jsx', '.ts', '.tsx', '.java', '.go', '.rb',
    '.php', '.c', '.h', '.cpp', '.hpp', '.cs', '.sql', '.r', '.m',
    '.swift', '.kt', '.rs', '.scala',
}
24
+
25
+
26
def parse_old_new_markers(file_query: str) -> List[Tuple[str, str]]:
    """
    Extract (old, new) replacement pairs from a marker-delimited edit request.

    Sections are delimited line by line:
    - OLD <<<< ... >>>> OLD
    - NEW <<<< ... >>>> NEW

    A line counts as a marker when it merely contains the marker text; opening
    markers are matched on "OLD <<<" / "NEW <<<", so the four-character form
    shown above matches too. Marker lines themselves are never captured. Each
    captured section is joined with newlines and stripped of surrounding
    whitespace.

    Args:
        file_query: Text holding the marked OLD and NEW sections

    Returns:
        List of (old_content, new_content) tuples, paired by position. If the
        number of OLD and NEW sections differs, the surplus sections are
        dropped.

    Example:
        >>> query = '''
        ... OLD <<<<
        ... Hello World
        ... >>>> OLD
        ... NEW <<<<
        ... Hello Mars
        ... >>>> NEW
        ... '''
        >>> parse_old_new_markers(query)
        [('Hello World', 'Hello Mars')]
    """
    old_chunks = []
    new_chunks = []

    # (section key, opening marker, closing marker, destination list).
    # Order matters: a line is tested against the OLD markers before the NEW ones.
    marker_table = (
        ("old", "OLD <<<", ">>>> OLD", old_chunks),
        ("new", "NEW <<<", ">>>> NEW", new_chunks),
    )

    # Sections currently open. OLD and NEW are tracked independently, so both
    # may be open at once when the input is malformed.
    open_sections = set()
    buffer = []

    for raw_line in file_query.split("\n"):
        for key, opener, closer, destination in marker_table:
            if opener in raw_line:
                # Opening marker: start capturing from a clean buffer.
                open_sections.add(key)
                buffer = []
                break
            if closer in raw_line:
                # Closing marker: flush whatever was captured so far.
                open_sections.discard(key)
                destination.append("\n".join(buffer).strip())
                buffer = []
                break
        else:
            # Not a marker line: keep it only while some section is open.
            if open_sections:
                buffer.append(raw_line)

    return list(zip(old_chunks, new_chunks))
102
+
103
+
104
def is_text_editable(filename: str) -> bool:
    """
    Tell whether a file may be edited as text, judging by its extension only.

    The final suffix of the name is lower-cased and looked up in
    TEXT_EDITABLE_EXTENSIONS; the file itself is never opened.

    Args:
        filename: File name or path to inspect

    Returns:
        True when the lower-cased extension is whitelisted, False otherwise

    Example:
        >>> is_text_editable("config.json")
        True
        >>> is_text_editable("image.png")
        False
    """
    from pathlib import Path

    suffix = Path(filename).suffix
    return suffix.lower() in TEXT_EDITABLE_EXTENSIONS
123
+
124
+
125
def apply_line_slice(
    content: str,
    offset: Optional[int] = None,
    limit: Optional[int] = None,
    head: Optional[int] = None,
    tail: Optional[int] = None
) -> str:
    """
    Apply line-based slicing to text content.

    Exactly one mode is applied, chosen in this order of precedence:
    - head: Return only the first N lines
    - tail: Return only the last N lines
    - offset and/or limit: Read from line `offset` (1-indexed) for `limit`
      lines; a missing offset means "from the first line", a missing limit
      means "to the end"
    - No params: Return full content

    Lines keep their original line terminators, so a slice that ends before
    the last line of the content ends with a newline.

    Args:
        content: Text content to slice
        offset: Starting line number (1-indexed, inclusive); values below 1
            are treated as 1
        limit: Number of lines to read; negative values are treated as 0
        head: Return only first N lines (a negative value drops that many
            lines from the end, following plain slice semantics)
        tail: Return only last N lines (a value of 0 or less returns the
            full content)

    Returns:
        Sliced content as string

    Example:
        >>> text = "line1\\nline2\\nline3\\nline4\\nline5"
        >>> apply_line_slice(text, offset=2, limit=2)
        'line2\\nline3\\n'
        >>> apply_line_slice(text, head=2)
        'line1\\nline2\\n'
        >>> apply_line_slice(text, tail=2)
        'line4\\nline5'
        >>> apply_line_slice(text, limit=1)
        'line1\\n'
    """
    if not content:
        return content

    # keepends=True so that joining a slice reproduces the source text exactly.
    lines = content.splitlines(keepends=True)

    # Head mode: first N lines
    if head is not None:
        return ''.join(lines[:head])

    # Tail mode: last N lines. lines[-0:] would be the whole list, so a
    # non-positive tail is spelled out explicitly as "everything".
    if tail is not None:
        return ''.join(lines[-tail:] if tail > 0 else lines)

    # Offset/limit mode. A limit given on its own used to be ignored; it now
    # counts from the first line.
    if offset is not None or limit is not None:
        start_idx = max(0, offset - 1) if offset is not None else 0
        if limit is None:
            return ''.join(lines[start_idx:])
        # Clamp so a negative limit cannot wrap around to an index counted
        # from the end of the list.
        end_idx = start_idx + max(0, limit)
        return ''.join(lines[start_idx:end_idx])

    # No slicing parameters: return full content
    return content
184
+
185
+
186
def search_in_content(
    content: str,
    pattern: str,
    is_regex: bool = True,
    context_lines: int = 2
) -> List[Dict[str, object]]:
    """
    Search for pattern in content, line by line, with surrounding context.

    Matching is case-insensitive in both modes. Each line is reported at most
    once: only the first match found on a line is recorded.

    Args:
        content: Text content to search
        pattern: Search pattern (regex if is_regex=True, else literal string)
        is_regex: Whether to treat pattern as regex (default True)
        context_lines: Number of lines before/after match to include (default 2)

    Returns:
        List of match dictionaries, in line order, with keys:
        - line_number: 1-indexed line number of match
        - line_content: The matching line
        - match_text: The actual matched text (first match on the line)
        - context_before: List of lines before match
        - context_after: List of lines after match
        An empty list is returned for empty content or for a regex pattern
        that fails to compile (a warning is logged in that case).

    Example:
        >>> text = "line1\\nHello World\\nline3"
        >>> matches = search_in_content(text, "Hello", is_regex=False)
        >>> matches[0]['line_number']
        2
        >>> matches[0]['match_text']
        'Hello'
    """
    if not content:
        return []

    lines = content.splitlines()
    matches = []

    # Compile regex pattern or escape for literal search
    if is_regex:
        try:
            regex = re.compile(pattern, re.IGNORECASE)
        except re.error as e:
            # A bad pattern is reported and treated as "no matches" rather
            # than raised to the caller.
            logger.warning(f"Invalid regex pattern '{pattern}': {e}")
            return []
    else:
        regex = re.compile(re.escape(pattern), re.IGNORECASE)

    total_lines = len(lines)

    # Search each line
    for line_idx, line in enumerate(lines):
        match = regex.search(line)
        if not match:
            continue

        # Clamp the context window to the bounds of the content.
        context_start = max(0, line_idx - context_lines)
        context_end = min(total_lines, line_idx + context_lines + 1)

        matches.append({
            'line_number': line_idx + 1,  # Convert to 1-indexed
            'line_content': line,
            'match_text': match.group(0),
            'context_before': lines[context_start:line_idx],
            'context_after': lines[line_idx + 1:context_end],
        })

    return matches