alita-sdk 0.3.257__py3-none-any.whl → 0.3.584__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of alita-sdk might be problematic. Click here for more details.

Files changed (281)
  1. alita_sdk/cli/__init__.py +10 -0
  2. alita_sdk/cli/__main__.py +17 -0
  3. alita_sdk/cli/agent/__init__.py +5 -0
  4. alita_sdk/cli/agent/default.py +258 -0
  5. alita_sdk/cli/agent_executor.py +155 -0
  6. alita_sdk/cli/agent_loader.py +215 -0
  7. alita_sdk/cli/agent_ui.py +228 -0
  8. alita_sdk/cli/agents.py +3794 -0
  9. alita_sdk/cli/callbacks.py +647 -0
  10. alita_sdk/cli/cli.py +168 -0
  11. alita_sdk/cli/config.py +306 -0
  12. alita_sdk/cli/context/__init__.py +30 -0
  13. alita_sdk/cli/context/cleanup.py +198 -0
  14. alita_sdk/cli/context/manager.py +731 -0
  15. alita_sdk/cli/context/message.py +285 -0
  16. alita_sdk/cli/context/strategies.py +289 -0
  17. alita_sdk/cli/context/token_estimation.py +127 -0
  18. alita_sdk/cli/formatting.py +182 -0
  19. alita_sdk/cli/input_handler.py +419 -0
  20. alita_sdk/cli/inventory.py +1073 -0
  21. alita_sdk/cli/mcp_loader.py +315 -0
  22. alita_sdk/cli/toolkit.py +327 -0
  23. alita_sdk/cli/toolkit_loader.py +85 -0
  24. alita_sdk/cli/tools/__init__.py +43 -0
  25. alita_sdk/cli/tools/approval.py +224 -0
  26. alita_sdk/cli/tools/filesystem.py +1751 -0
  27. alita_sdk/cli/tools/planning.py +389 -0
  28. alita_sdk/cli/tools/terminal.py +414 -0
  29. alita_sdk/community/__init__.py +72 -12
  30. alita_sdk/community/inventory/__init__.py +236 -0
  31. alita_sdk/community/inventory/config.py +257 -0
  32. alita_sdk/community/inventory/enrichment.py +2137 -0
  33. alita_sdk/community/inventory/extractors.py +1469 -0
  34. alita_sdk/community/inventory/ingestion.py +3172 -0
  35. alita_sdk/community/inventory/knowledge_graph.py +1457 -0
  36. alita_sdk/community/inventory/parsers/__init__.py +218 -0
  37. alita_sdk/community/inventory/parsers/base.py +295 -0
  38. alita_sdk/community/inventory/parsers/csharp_parser.py +907 -0
  39. alita_sdk/community/inventory/parsers/go_parser.py +851 -0
  40. alita_sdk/community/inventory/parsers/html_parser.py +389 -0
  41. alita_sdk/community/inventory/parsers/java_parser.py +593 -0
  42. alita_sdk/community/inventory/parsers/javascript_parser.py +629 -0
  43. alita_sdk/community/inventory/parsers/kotlin_parser.py +768 -0
  44. alita_sdk/community/inventory/parsers/markdown_parser.py +362 -0
  45. alita_sdk/community/inventory/parsers/python_parser.py +604 -0
  46. alita_sdk/community/inventory/parsers/rust_parser.py +858 -0
  47. alita_sdk/community/inventory/parsers/swift_parser.py +832 -0
  48. alita_sdk/community/inventory/parsers/text_parser.py +322 -0
  49. alita_sdk/community/inventory/parsers/yaml_parser.py +370 -0
  50. alita_sdk/community/inventory/patterns/__init__.py +61 -0
  51. alita_sdk/community/inventory/patterns/ast_adapter.py +380 -0
  52. alita_sdk/community/inventory/patterns/loader.py +348 -0
  53. alita_sdk/community/inventory/patterns/registry.py +198 -0
  54. alita_sdk/community/inventory/presets.py +535 -0
  55. alita_sdk/community/inventory/retrieval.py +1403 -0
  56. alita_sdk/community/inventory/toolkit.py +173 -0
  57. alita_sdk/community/inventory/toolkit_utils.py +176 -0
  58. alita_sdk/community/inventory/visualize.py +1370 -0
  59. alita_sdk/configurations/__init__.py +11 -0
  60. alita_sdk/configurations/ado.py +148 -2
  61. alita_sdk/configurations/azure_search.py +1 -1
  62. alita_sdk/configurations/bigquery.py +1 -1
  63. alita_sdk/configurations/bitbucket.py +94 -2
  64. alita_sdk/configurations/browser.py +18 -0
  65. alita_sdk/configurations/carrier.py +19 -0
  66. alita_sdk/configurations/confluence.py +130 -1
  67. alita_sdk/configurations/delta_lake.py +1 -1
  68. alita_sdk/configurations/figma.py +76 -5
  69. alita_sdk/configurations/github.py +65 -1
  70. alita_sdk/configurations/gitlab.py +81 -0
  71. alita_sdk/configurations/google_places.py +17 -0
  72. alita_sdk/configurations/jira.py +103 -0
  73. alita_sdk/configurations/openapi.py +323 -0
  74. alita_sdk/configurations/postman.py +1 -1
  75. alita_sdk/configurations/qtest.py +72 -3
  76. alita_sdk/configurations/report_portal.py +115 -0
  77. alita_sdk/configurations/salesforce.py +19 -0
  78. alita_sdk/configurations/service_now.py +1 -12
  79. alita_sdk/configurations/sharepoint.py +167 -0
  80. alita_sdk/configurations/sonar.py +18 -0
  81. alita_sdk/configurations/sql.py +20 -0
  82. alita_sdk/configurations/testio.py +101 -0
  83. alita_sdk/configurations/testrail.py +88 -0
  84. alita_sdk/configurations/xray.py +94 -1
  85. alita_sdk/configurations/zephyr_enterprise.py +94 -1
  86. alita_sdk/configurations/zephyr_essential.py +95 -0
  87. alita_sdk/runtime/clients/artifact.py +21 -4
  88. alita_sdk/runtime/clients/client.py +458 -67
  89. alita_sdk/runtime/clients/mcp_discovery.py +342 -0
  90. alita_sdk/runtime/clients/mcp_manager.py +262 -0
  91. alita_sdk/runtime/clients/sandbox_client.py +352 -0
  92. alita_sdk/runtime/langchain/_constants_bkup.py +1318 -0
  93. alita_sdk/runtime/langchain/assistant.py +183 -43
  94. alita_sdk/runtime/langchain/constants.py +647 -1
  95. alita_sdk/runtime/langchain/document_loaders/AlitaDocxMammothLoader.py +315 -3
  96. alita_sdk/runtime/langchain/document_loaders/AlitaExcelLoader.py +209 -31
  97. alita_sdk/runtime/langchain/document_loaders/AlitaImageLoader.py +1 -1
  98. alita_sdk/runtime/langchain/document_loaders/AlitaJSONLinesLoader.py +77 -0
  99. alita_sdk/runtime/langchain/document_loaders/AlitaJSONLoader.py +10 -3
  100. alita_sdk/runtime/langchain/document_loaders/AlitaMarkdownLoader.py +66 -0
  101. alita_sdk/runtime/langchain/document_loaders/AlitaPDFLoader.py +79 -10
  102. alita_sdk/runtime/langchain/document_loaders/AlitaPowerPointLoader.py +52 -15
  103. alita_sdk/runtime/langchain/document_loaders/AlitaPythonLoader.py +9 -0
  104. alita_sdk/runtime/langchain/document_loaders/AlitaTableLoader.py +1 -4
  105. alita_sdk/runtime/langchain/document_loaders/AlitaTextLoader.py +15 -2
  106. alita_sdk/runtime/langchain/document_loaders/ImageParser.py +30 -0
  107. alita_sdk/runtime/langchain/document_loaders/constants.py +189 -41
  108. alita_sdk/runtime/langchain/interfaces/llm_processor.py +4 -2
  109. alita_sdk/runtime/langchain/langraph_agent.py +493 -105
  110. alita_sdk/runtime/langchain/utils.py +118 -8
  111. alita_sdk/runtime/llms/preloaded.py +2 -6
  112. alita_sdk/runtime/models/mcp_models.py +61 -0
  113. alita_sdk/runtime/skills/__init__.py +91 -0
  114. alita_sdk/runtime/skills/callbacks.py +498 -0
  115. alita_sdk/runtime/skills/discovery.py +540 -0
  116. alita_sdk/runtime/skills/executor.py +610 -0
  117. alita_sdk/runtime/skills/input_builder.py +371 -0
  118. alita_sdk/runtime/skills/models.py +330 -0
  119. alita_sdk/runtime/skills/registry.py +355 -0
  120. alita_sdk/runtime/skills/skill_runner.py +330 -0
  121. alita_sdk/runtime/toolkits/__init__.py +28 -0
  122. alita_sdk/runtime/toolkits/application.py +14 -4
  123. alita_sdk/runtime/toolkits/artifact.py +25 -9
  124. alita_sdk/runtime/toolkits/datasource.py +13 -6
  125. alita_sdk/runtime/toolkits/mcp.py +782 -0
  126. alita_sdk/runtime/toolkits/planning.py +178 -0
  127. alita_sdk/runtime/toolkits/skill_router.py +238 -0
  128. alita_sdk/runtime/toolkits/subgraph.py +11 -6
  129. alita_sdk/runtime/toolkits/tools.py +314 -70
  130. alita_sdk/runtime/toolkits/vectorstore.py +11 -5
  131. alita_sdk/runtime/tools/__init__.py +24 -0
  132. alita_sdk/runtime/tools/application.py +16 -4
  133. alita_sdk/runtime/tools/artifact.py +367 -33
  134. alita_sdk/runtime/tools/data_analysis.py +183 -0
  135. alita_sdk/runtime/tools/function.py +100 -4
  136. alita_sdk/runtime/tools/graph.py +81 -0
  137. alita_sdk/runtime/tools/image_generation.py +218 -0
  138. alita_sdk/runtime/tools/llm.py +1032 -177
  139. alita_sdk/runtime/tools/loop.py +3 -1
  140. alita_sdk/runtime/tools/loop_output.py +3 -1
  141. alita_sdk/runtime/tools/mcp_inspect_tool.py +284 -0
  142. alita_sdk/runtime/tools/mcp_remote_tool.py +181 -0
  143. alita_sdk/runtime/tools/mcp_server_tool.py +3 -1
  144. alita_sdk/runtime/tools/planning/__init__.py +36 -0
  145. alita_sdk/runtime/tools/planning/models.py +246 -0
  146. alita_sdk/runtime/tools/planning/wrapper.py +607 -0
  147. alita_sdk/runtime/tools/router.py +2 -1
  148. alita_sdk/runtime/tools/sandbox.py +375 -0
  149. alita_sdk/runtime/tools/skill_router.py +776 -0
  150. alita_sdk/runtime/tools/tool.py +3 -1
  151. alita_sdk/runtime/tools/vectorstore.py +69 -65
  152. alita_sdk/runtime/tools/vectorstore_base.py +163 -90
  153. alita_sdk/runtime/utils/AlitaCallback.py +137 -21
  154. alita_sdk/runtime/utils/constants.py +5 -1
  155. alita_sdk/runtime/utils/mcp_client.py +492 -0
  156. alita_sdk/runtime/utils/mcp_oauth.py +361 -0
  157. alita_sdk/runtime/utils/mcp_sse_client.py +434 -0
  158. alita_sdk/runtime/utils/mcp_tools_discovery.py +124 -0
  159. alita_sdk/runtime/utils/streamlit.py +41 -14
  160. alita_sdk/runtime/utils/toolkit_utils.py +28 -9
  161. alita_sdk/runtime/utils/utils.py +48 -0
  162. alita_sdk/tools/__init__.py +135 -37
  163. alita_sdk/tools/ado/__init__.py +2 -2
  164. alita_sdk/tools/ado/repos/__init__.py +16 -19
  165. alita_sdk/tools/ado/repos/repos_wrapper.py +12 -20
  166. alita_sdk/tools/ado/test_plan/__init__.py +27 -8
  167. alita_sdk/tools/ado/test_plan/test_plan_wrapper.py +56 -28
  168. alita_sdk/tools/ado/wiki/__init__.py +28 -12
  169. alita_sdk/tools/ado/wiki/ado_wrapper.py +114 -40
  170. alita_sdk/tools/ado/work_item/__init__.py +28 -12
  171. alita_sdk/tools/ado/work_item/ado_wrapper.py +95 -11
  172. alita_sdk/tools/advanced_jira_mining/__init__.py +13 -8
  173. alita_sdk/tools/aws/delta_lake/__init__.py +15 -11
  174. alita_sdk/tools/aws/delta_lake/tool.py +5 -1
  175. alita_sdk/tools/azure_ai/search/__init__.py +14 -8
  176. alita_sdk/tools/base/tool.py +5 -1
  177. alita_sdk/tools/base_indexer_toolkit.py +454 -110
  178. alita_sdk/tools/bitbucket/__init__.py +28 -19
  179. alita_sdk/tools/bitbucket/api_wrapper.py +285 -27
  180. alita_sdk/tools/bitbucket/cloud_api_wrapper.py +5 -5
  181. alita_sdk/tools/browser/__init__.py +41 -16
  182. alita_sdk/tools/browser/crawler.py +3 -1
  183. alita_sdk/tools/browser/utils.py +15 -6
  184. alita_sdk/tools/carrier/__init__.py +18 -17
  185. alita_sdk/tools/carrier/backend_reports_tool.py +8 -4
  186. alita_sdk/tools/carrier/excel_reporter.py +8 -4
  187. alita_sdk/tools/chunkers/__init__.py +3 -1
  188. alita_sdk/tools/chunkers/code/codeparser.py +1 -1
  189. alita_sdk/tools/chunkers/sematic/json_chunker.py +2 -1
  190. alita_sdk/tools/chunkers/sematic/markdown_chunker.py +97 -6
  191. alita_sdk/tools/chunkers/sematic/proposal_chunker.py +1 -1
  192. alita_sdk/tools/chunkers/universal_chunker.py +270 -0
  193. alita_sdk/tools/cloud/aws/__init__.py +12 -7
  194. alita_sdk/tools/cloud/azure/__init__.py +12 -7
  195. alita_sdk/tools/cloud/gcp/__init__.py +12 -7
  196. alita_sdk/tools/cloud/k8s/__init__.py +12 -7
  197. alita_sdk/tools/code/linter/__init__.py +10 -8
  198. alita_sdk/tools/code/loaders/codesearcher.py +3 -2
  199. alita_sdk/tools/code/sonar/__init__.py +21 -13
  200. alita_sdk/tools/code_indexer_toolkit.py +199 -0
  201. alita_sdk/tools/confluence/__init__.py +22 -14
  202. alita_sdk/tools/confluence/api_wrapper.py +197 -58
  203. alita_sdk/tools/confluence/loader.py +14 -2
  204. alita_sdk/tools/custom_open_api/__init__.py +12 -5
  205. alita_sdk/tools/elastic/__init__.py +11 -8
  206. alita_sdk/tools/elitea_base.py +546 -64
  207. alita_sdk/tools/figma/__init__.py +60 -11
  208. alita_sdk/tools/figma/api_wrapper.py +1400 -167
  209. alita_sdk/tools/figma/figma_client.py +73 -0
  210. alita_sdk/tools/figma/toon_tools.py +2748 -0
  211. alita_sdk/tools/github/__init__.py +18 -17
  212. alita_sdk/tools/github/api_wrapper.py +9 -26
  213. alita_sdk/tools/github/github_client.py +81 -12
  214. alita_sdk/tools/github/schemas.py +2 -1
  215. alita_sdk/tools/github/tool.py +5 -1
  216. alita_sdk/tools/gitlab/__init__.py +19 -13
  217. alita_sdk/tools/gitlab/api_wrapper.py +256 -80
  218. alita_sdk/tools/gitlab_org/__init__.py +14 -10
  219. alita_sdk/tools/google/bigquery/__init__.py +14 -13
  220. alita_sdk/tools/google/bigquery/tool.py +5 -1
  221. alita_sdk/tools/google_places/__init__.py +21 -11
  222. alita_sdk/tools/jira/__init__.py +22 -11
  223. alita_sdk/tools/jira/api_wrapper.py +315 -168
  224. alita_sdk/tools/keycloak/__init__.py +11 -8
  225. alita_sdk/tools/localgit/__init__.py +9 -3
  226. alita_sdk/tools/localgit/local_git.py +62 -54
  227. alita_sdk/tools/localgit/tool.py +5 -1
  228. alita_sdk/tools/memory/__init__.py +38 -14
  229. alita_sdk/tools/non_code_indexer_toolkit.py +7 -2
  230. alita_sdk/tools/ocr/__init__.py +11 -8
  231. alita_sdk/tools/openapi/__init__.py +491 -106
  232. alita_sdk/tools/openapi/api_wrapper.py +1357 -0
  233. alita_sdk/tools/openapi/tool.py +20 -0
  234. alita_sdk/tools/pandas/__init__.py +20 -12
  235. alita_sdk/tools/pandas/api_wrapper.py +40 -45
  236. alita_sdk/tools/pandas/dataframe/generator/base.py +3 -1
  237. alita_sdk/tools/postman/__init__.py +11 -11
  238. alita_sdk/tools/postman/api_wrapper.py +19 -8
  239. alita_sdk/tools/postman/postman_analysis.py +8 -1
  240. alita_sdk/tools/pptx/__init__.py +11 -10
  241. alita_sdk/tools/qtest/__init__.py +22 -14
  242. alita_sdk/tools/qtest/api_wrapper.py +1784 -88
  243. alita_sdk/tools/rally/__init__.py +13 -10
  244. alita_sdk/tools/report_portal/__init__.py +23 -16
  245. alita_sdk/tools/salesforce/__init__.py +22 -16
  246. alita_sdk/tools/servicenow/__init__.py +21 -16
  247. alita_sdk/tools/servicenow/api_wrapper.py +1 -1
  248. alita_sdk/tools/sharepoint/__init__.py +17 -14
  249. alita_sdk/tools/sharepoint/api_wrapper.py +179 -39
  250. alita_sdk/tools/sharepoint/authorization_helper.py +191 -1
  251. alita_sdk/tools/sharepoint/utils.py +8 -2
  252. alita_sdk/tools/slack/__init__.py +13 -8
  253. alita_sdk/tools/sql/__init__.py +22 -19
  254. alita_sdk/tools/sql/api_wrapper.py +71 -23
  255. alita_sdk/tools/testio/__init__.py +21 -13
  256. alita_sdk/tools/testrail/__init__.py +13 -11
  257. alita_sdk/tools/testrail/api_wrapper.py +214 -46
  258. alita_sdk/tools/utils/__init__.py +28 -4
  259. alita_sdk/tools/utils/content_parser.py +241 -55
  260. alita_sdk/tools/utils/text_operations.py +254 -0
  261. alita_sdk/tools/vector_adapters/VectorStoreAdapter.py +83 -27
  262. alita_sdk/tools/xray/__init__.py +18 -14
  263. alita_sdk/tools/xray/api_wrapper.py +58 -113
  264. alita_sdk/tools/yagmail/__init__.py +9 -3
  265. alita_sdk/tools/zephyr/__init__.py +12 -7
  266. alita_sdk/tools/zephyr_enterprise/__init__.py +16 -9
  267. alita_sdk/tools/zephyr_enterprise/api_wrapper.py +30 -15
  268. alita_sdk/tools/zephyr_essential/__init__.py +16 -10
  269. alita_sdk/tools/zephyr_essential/api_wrapper.py +297 -54
  270. alita_sdk/tools/zephyr_essential/client.py +6 -4
  271. alita_sdk/tools/zephyr_scale/__init__.py +13 -8
  272. alita_sdk/tools/zephyr_scale/api_wrapper.py +39 -31
  273. alita_sdk/tools/zephyr_squad/__init__.py +12 -7
  274. {alita_sdk-0.3.257.dist-info → alita_sdk-0.3.584.dist-info}/METADATA +184 -37
  275. alita_sdk-0.3.584.dist-info/RECORD +452 -0
  276. alita_sdk-0.3.584.dist-info/entry_points.txt +2 -0
  277. alita_sdk/tools/bitbucket/tools.py +0 -304
  278. alita_sdk-0.3.257.dist-info/RECORD +0 -343
  279. {alita_sdk-0.3.257.dist-info → alita_sdk-0.3.584.dist-info}/WHEEL +0 -0
  280. {alita_sdk-0.3.257.dist-info → alita_sdk-0.3.584.dist-info}/licenses/LICENSE +0 -0
  281. {alita_sdk-0.3.257.dist-info → alita_sdk-0.3.584.dist-info}/top_level.txt +0 -0
@@ -1,4 +1,5 @@
1
1
  import re
2
+ import uuid
2
3
  from io import BytesIO
3
4
 
4
5
  import mammoth.images
@@ -8,6 +9,9 @@ from langchain_core.document_loaders import BaseLoader
8
9
  from langchain_core.documents import Document
9
10
  from mammoth import convert_to_html
10
11
  from markdownify import markdownify
12
+ from docx import Document as DocxDocument
13
+ from docx.oxml.ns import qn
14
+ from bs4 import BeautifulSoup
11
15
 
12
16
  from alita_sdk.tools.chunkers.sematic.markdown_chunker import markdown_by_headers_chunker
13
17
  from .utils import perform_llm_prediction_for_image_bytes
@@ -17,6 +21,7 @@ class AlitaDocxMammothLoader(BaseLoader):
17
21
  """
18
22
  Loader for Docx files using Mammoth to convert to HTML, with image handling,
19
23
  and then Markdownify to convert HTML to markdown.
24
+ Detects bordered paragraphs and text boxes and treats them as code blocks.
20
25
  """
21
26
  def __init__(self, **kwargs):
22
27
  """
@@ -97,6 +102,295 @@ class AlitaDocxMammothLoader(BaseLoader):
97
102
  new_md = pattern.sub(replace_placeholder, original_md)
98
103
  return new_md
99
104
 
105
+ def __has_border(self, paragraph):
106
+ """
107
+ Check if a paragraph has border formatting.
108
+
109
+ Args:
110
+ paragraph: A python-docx Paragraph object.
111
+
112
+ Returns:
113
+ bool: True if paragraph has any border, False otherwise.
114
+ """
115
+ pPr = paragraph._element.pPr
116
+ if pPr is not None:
117
+ pBdr = pPr.find(qn('w:pBdr'))
118
+ if pBdr is not None:
119
+ # Check if any border side exists (top, bottom, left, right)
120
+ for side in ['top', 'bottom', 'left', 'right']:
121
+ border = pBdr.find(qn(f'w:{side}'))
122
+ if border is not None:
123
+ # Check if border is not "none" or has a width
124
+ val = border.get(qn('w:val'))
125
+ if val and val != 'none':
126
+ return True
127
+ return False
128
+
129
+ def __find_text_boxes(self, doc):
130
+ """
131
+ Find all text boxes in document by searching OOXML structure.
132
+ Text boxes are typically in w:txbxContent elements.
133
+
134
+ Args:
135
+ doc: A python-docx Document object.
136
+
137
+ Returns:
138
+ list: List of tuples (element, paragraphs_inside_textbox).
139
+ """
140
+ text_boxes = []
141
+
142
+ # Iterate through document body XML to find text box content elements
143
+ for element in doc.element.body.iter():
144
+ # Look for text box content elements
145
+ if element.tag.endswith('txbxContent'):
146
+ # Collect all paragraphs inside this text box
147
+ txbx_paragraphs = []
148
+ for txbx_para_element in element.iter():
149
+ if txbx_para_element.tag.endswith('p'):
150
+ txbx_paragraphs.append(txbx_para_element)
151
+
152
+ if txbx_paragraphs:
153
+ text_boxes.append((element, txbx_paragraphs))
154
+
155
+ return text_boxes
156
+
157
+ def __create_marker_paragraph(self, marker_text):
158
+ """
159
+ Create a paragraph element with marker text.
160
+
161
+ Args:
162
+ marker_text (str): The marker text to insert.
163
+
164
+ Returns:
165
+ Element: An OOXML paragraph element.
166
+ """
167
+ from docx.oxml import OxmlElement
168
+
169
+ p = OxmlElement('w:p')
170
+ r = OxmlElement('w:r')
171
+ t = OxmlElement('w:t')
172
+ t.text = marker_text
173
+ r.append(t)
174
+ p.append(r)
175
+ return p
176
+
177
+ def __inject_markers_for_paragraph(self, paragraph, start_marker, end_marker):
178
+ """
179
+ Inject marker paragraphs before and after a bordered paragraph.
180
+
181
+ Args:
182
+ paragraph: A python-docx Paragraph object.
183
+ start_marker (str): The start marker text.
184
+ end_marker (str): The end marker text.
185
+ """
186
+ # Insert start marker paragraph before
187
+ marker_p_start = self.__create_marker_paragraph(start_marker)
188
+ paragraph._element.addprevious(marker_p_start)
189
+
190
+ # Insert end marker paragraph after
191
+ marker_p_end = self.__create_marker_paragraph(end_marker)
192
+ paragraph._element.addnext(marker_p_end)
193
+
194
+ def __inject_markers_for_textbox(self, textbox_element, paragraph_elements, start_marker, end_marker):
195
+ """
196
+ Inject markers around text box content.
197
+
198
+ Args:
199
+ textbox_element: The w:txbxContent element.
200
+ paragraph_elements: List of paragraph elements inside the text box.
201
+ start_marker (str): The start marker text.
202
+ end_marker (str): The end marker text.
203
+ """
204
+ if not paragraph_elements:
205
+ return
206
+
207
+ # Insert start marker before first paragraph in text box
208
+ first_para = paragraph_elements[0]
209
+ marker_p_start = self.__create_marker_paragraph(start_marker)
210
+ first_para.addprevious(marker_p_start)
211
+
212
+ # Insert end marker after last paragraph in text box
213
+ last_para = paragraph_elements[-1]
214
+ marker_p_end = self.__create_marker_paragraph(end_marker)
215
+ last_para.addnext(marker_p_end)
216
+
217
+ def __detect_and_mark_bordered_content(self, docx_stream):
218
+ """
219
+ Detects bordered paragraphs and text boxes, injects unique markers around them.
220
+ Groups consecutive bordered paragraphs into single code blocks.
221
+
222
+ Args:
223
+ docx_stream: A file-like object containing the DOCX document.
224
+
225
+ Returns:
226
+ tuple: (modified_docx_stream, start_marker, end_marker)
227
+ """
228
+ # Load document with python-docx
229
+ doc = DocxDocument(docx_stream)
230
+
231
+ # Generate unique markers to avoid conflicts with document content
232
+ unique_id = uuid.uuid4().hex[:8]
233
+ start_marker = f"<<<BORDERED_BLOCK_START_{unique_id}>>>"
234
+ end_marker = f"<<<BORDERED_BLOCK_END_{unique_id}>>>"
235
+
236
+ # Group consecutive bordered paragraphs together
237
+ bordered_groups = []
238
+ current_group = []
239
+
240
+ for para in doc.paragraphs:
241
+ if self.__has_border(para):
242
+ current_group.append(para)
243
+ else:
244
+ if current_group:
245
+ # End of a bordered group
246
+ bordered_groups.append(current_group)
247
+ current_group = []
248
+
249
+ # Don't forget the last group if document ends with bordered paragraphs
250
+ if current_group:
251
+ bordered_groups.append(current_group)
252
+
253
+ # Collect all text boxes
254
+ # text_boxes = self.__find_text_boxes(doc)
255
+
256
+ # Inject markers around each group of consecutive bordered paragraphs
257
+ for group in bordered_groups:
258
+ if group:
259
+ # Add start marker before first paragraph in group
260
+ first_para = group[0]
261
+ marker_p_start = self.__create_marker_paragraph(start_marker)
262
+ first_para._element.addprevious(marker_p_start)
263
+
264
+ # Add end marker after last paragraph in group
265
+ last_para = group[-1]
266
+ marker_p_end = self.__create_marker_paragraph(end_marker)
267
+ last_para._element.addnext(marker_p_end)
268
+
269
+ # Inject markers around text box content
270
+ # for textbox_element, para_elements in text_boxes:
271
+ # self.__inject_markers_for_textbox(textbox_element, para_elements, start_marker, end_marker)
272
+
273
+ # Save modified document to BytesIO
274
+ output = BytesIO()
275
+ doc.save(output)
276
+ output.seek(0)
277
+
278
+ return output, start_marker, end_marker
279
+
280
+ def __contains_complex_structure(self, content_html):
281
+ """
282
+ Check if HTML content contains tables, lists, or other complex structures.
283
+
284
+ Args:
285
+ content_html (str): HTML content to analyze.
286
+
287
+ Returns:
288
+ bool: True if content contains tables/lists, False otherwise.
289
+ """
290
+ content_soup = BeautifulSoup(content_html, 'html.parser')
291
+
292
+ # Check for tables
293
+ if content_soup.find('table'):
294
+ return True
295
+
296
+ # Check for lists (ul, ol)
297
+ if content_soup.find('ul') or content_soup.find('ol'):
298
+ return True
299
+
300
+ return False
301
+
302
+ def __escape_hash_symbols(self, html_content):
303
+ """
304
+ Escape hash (#) symbols at the beginning of lines in HTML to prevent
305
+ them from being treated as markdown headers.
306
+
307
+ Args:
308
+ html_content (str): HTML content.
309
+
310
+ Returns:
311
+ str: HTML with escaped hash symbols.
312
+ """
313
+ soup = BeautifulSoup(html_content, 'html.parser')
314
+
315
+ # Process all text-containing elements
316
+ for element in soup.find_all(['p', 'li', 'td', 'th', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6']):
317
+ if element.string:
318
+ text = element.string
319
+ # If line starts with #, escape it
320
+ if text.strip().startswith('#'):
321
+ element.string = text.replace('#', '\\#', 1)
322
+
323
+ return str(soup)
324
+
325
+ def __wrap_marked_sections_in_code_blocks(self, html, start_marker, end_marker):
326
+ """
327
+ Find content between markers and wrap appropriately:
328
+ - Simple text/code → <pre><code> block
329
+ - Tables/lists → Custom wrapper with preserved structure
330
+
331
+ Args:
332
+ html (str): The HTML content from Mammoth.
333
+ start_marker (str): The start marker text.
334
+ end_marker (str): The end marker text.
335
+
336
+ Returns:
337
+ str: HTML with marked sections wrapped appropriately.
338
+ """
339
+ import html as html_module
340
+
341
+ # Mammoth escapes < and > to &lt; and &gt;, so we need to escape our markers too
342
+ escaped_start = html_module.escape(start_marker)
343
+ escaped_end = html_module.escape(end_marker)
344
+
345
+ # Pattern to find content between HTML-escaped markers (including HTML tags)
346
+ # The markers will be in separate <p> tags, and content in between
347
+ pattern = re.compile(
348
+ f'<p>{re.escape(escaped_start)}</p>(.*?)<p>{re.escape(escaped_end)}</p>',
349
+ re.DOTALL
350
+ )
351
+
352
+ def replace_with_appropriate_wrapper(match):
353
+ content = match.group(1)
354
+
355
+ # Detect if content has complex structure (tables, lists)
356
+ has_complex_structure = self.__contains_complex_structure(content)
357
+
358
+ if has_complex_structure:
359
+ # Preserve structure: keep HTML as-is, escape # symbols
360
+ escaped_content = self.__escape_hash_symbols(content)
361
+ # Wrap in a div with special class for potential custom handling
362
+ return f'<div class="alita-bordered-content">{escaped_content}</div>'
363
+ else:
364
+ # Simple text/code: extract as plain text and wrap in code block
365
+ content_soup = BeautifulSoup(content, 'html.parser')
366
+
367
+ # Extract text from each paragraph separately to preserve line breaks
368
+ lines = []
369
+ for element in content_soup.find_all(['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6']):
370
+ # Replace <br /> within paragraphs with newlines
371
+ for br in element.find_all('br'):
372
+ br.replace_with('\n')
373
+ text = element.get_text()
374
+ # Preserve leading whitespace (indentation), only strip trailing
375
+ lines.append(text.rstrip())
376
+
377
+ # If no paragraphs found, just get all text
378
+ if not lines:
379
+ content = content.replace('<br />', '\n').replace('<br/>', '\n').replace('<br>', '\n')
380
+ content_text = content_soup.get_text()
381
+ lines = [line.rstrip() for line in content_text.split('\n')]
382
+
383
+ # Join lines, strip only leading/trailing empty lines
384
+ content_text = '\n'.join(lines).strip()
385
+ # Return as code block (need to HTML-escape the content)
386
+ content_escaped = html_module.escape(content_text)
387
+ return f'<pre><code>{content_escaped}</code></pre>'
388
+
389
+ # Replace all marked sections with appropriate wrappers
390
+ result_html = pattern.sub(replace_with_appropriate_wrapper, html)
391
+
392
+ return result_html
393
+
100
394
  def load(self):
101
395
  """
102
396
  Loads and converts the Docx file to markdown format.
@@ -131,6 +425,7 @@ class AlitaDocxMammothLoader(BaseLoader):
131
425
  def _convert_docx_to_markdown(self, docx_file):
132
426
  """
133
427
  Converts the content of a Docx file to markdown format.
428
+ Detects bordered content and treats it as code blocks.
134
429
 
135
430
  Args:
136
431
  docx_file (BinaryIO): The Docx file object.
@@ -138,11 +433,28 @@ class AlitaDocxMammothLoader(BaseLoader):
138
433
  Returns:
139
434
  str: The markdown content extracted from the Docx file.
140
435
  """
436
+ # Step 1: Detect and mark bordered content
437
+ # Reset stream position if needed
438
+ if hasattr(docx_file, 'seek'):
439
+ docx_file.seek(0)
440
+
441
+ marked_docx, start_marker, end_marker = self.__detect_and_mark_bordered_content(docx_file)
442
+
443
+ # Step 2: Convert marked DOCX to HTML using Mammoth
141
444
  if self.extract_images:
142
445
  # Extract images using the provided image handler
143
- result = convert_to_html(docx_file, convert_image=mammoth.images.img_element(self.__handle_image))
446
+ result = convert_to_html(marked_docx, convert_image=mammoth.images.img_element(self.__handle_image))
144
447
  else:
145
448
  # Ignore images
146
- result = convert_to_html(docx_file, convert_image=lambda image: "")
147
- content = markdownify(result.value, heading_style="ATX")
449
+ result = convert_to_html(marked_docx, convert_image=lambda image: "")
450
+
451
+ # Step 3: Wrap marked sections in <pre><code> tags
452
+ html_with_code_blocks = self.__wrap_marked_sections_in_code_blocks(
453
+ result.value, start_marker, end_marker
454
+ )
455
+
456
+ # Step 4: Convert HTML to markdown
457
+ content = markdownify(html_with_code_blocks, heading_style="ATX")
458
+
459
+ # Step 5: Post-process markdown (for image transcripts, etc.)
148
460
  return self.__postprocess_original_md(content)
@@ -12,61 +12,239 @@
12
12
  # See the License for the specific language governing permissions and
13
13
  # limitations under the License.
14
14
  import io
15
+ import os
15
16
  from typing import Iterator
16
17
  import pandas as pd
17
18
  from json import loads
18
19
 
19
- from langchain_core.tools import ToolException
20
+ from openpyxl import load_workbook
21
+ from xlrd import open_workbook
22
+ from langchain_core.documents import Document
20
23
  from .AlitaTableLoader import AlitaTableLoader
21
-
24
+ from alita_sdk.runtime.langchain.constants import LOADER_MAX_TOKENS_DEFAULT
22
25
 
23
- class AlitaExcelLoader(AlitaTableLoader):
26
+ cell_delimiter = " | "
24
27
 
25
- excel_by_sheets: bool = False
28
+ class AlitaExcelLoader(AlitaTableLoader):
26
29
  sheet_name: str = None
27
- return_type: str = 'str'
30
+ file_name: str = None
31
+ max_tokens: int = LOADER_MAX_TOKENS_DEFAULT
32
+ add_header_to_chunks: bool = False
33
+ header_row_number: int = 1
28
34
 
29
35
  def __init__(self, **kwargs):
30
36
  if not kwargs.get('file_path'):
31
37
  file_content = kwargs.get('file_content')
32
38
  if file_content:
39
+ self.file_name = kwargs.get('file_name')
33
40
  kwargs['file_path'] = io.BytesIO(file_content)
41
+ else:
42
+ self.file_name = kwargs.get('file_path')
34
43
  super().__init__(**kwargs)
35
- self.excel_by_sheets = kwargs.get('excel_by_sheets')
36
- self.return_type = kwargs.get('return_type')
37
44
  self.sheet_name = kwargs.get('sheet_name')
45
+ # Set and validate chunking parameters only once
46
+ self.max_tokens = int(kwargs.get('max_tokens', LOADER_MAX_TOKENS_DEFAULT))
47
+ self.add_header_to_chunks = bool(kwargs.get('add_header_to_chunks', False))
48
+ header_row_number = kwargs.get('header_row_number', 1)
49
+ # Validate header_row_number
50
+ try:
51
+ header_row_number = int(header_row_number)
52
+ if header_row_number > 0:
53
+ self.header_row_number = header_row_number
54
+ else:
55
+ self.header_row_number = 1
56
+ self.add_header_to_chunks = False
57
+ except (ValueError, TypeError):
58
+ self.header_row_number = 1
59
+ self.add_header_to_chunks = False
38
60
 
39
61
  def get_content(self):
40
62
  try:
41
- dfs = pd.read_excel(self.file_path, sheet_name=self.sheet_name)
42
-
43
- if self.excel_by_sheets:
44
- result = {}
45
- for sheet_name, df in dfs.items():
46
- df.fillna('', inplace=True)
47
- result[sheet_name] = self.parse_sheet(df)
48
- return result
63
+ # Determine file extension
64
+ file_extension = os.path.splitext(self.file_name)[-1].lower()
65
+
66
+ if file_extension == '.xlsx':
67
+ # Use openpyxl for .xlsx files
68
+ return self._read_xlsx()
69
+ elif file_extension == '.xls':
70
+ # Use xlrd for .xls files
71
+ return self._read_xls()
49
72
  else:
50
- result = []
51
- for sheet_name, df in dfs.items():
52
- string_content = self.parse_sheet(df)
53
- result.append(f"====== Sheet name: {sheet_name} ======\n{string_content}")
54
- return "\n\n".join(result)
73
+ raise ValueError(f"Unsupported file format: {file_extension}")
55
74
  except Exception as e:
56
- return ToolException(f"Error reading Excel file: {e}")
75
+ return f"Error reading Excel file: {e}"
57
76
 
58
- def parse_sheet(self, df):
59
- df.fillna('', inplace=True)
77
def _read_xlsx(self):
    """
    Load an .xlsx workbook with openpyxl and parse the requested sheet(s).

    Returns:
        dict: Maps sheet name to the value returned by ``self.parse_sheet``.
        When ``self.sheet_name`` is set, the dict has exactly that one key; if
        the sheet is missing, its value is a one-element list holding an
        explanatory message. Otherwise every sheet in the workbook is parsed.
    """
    # data_only=True yields stored cell values rather than formula text.
    book = load_workbook(self.file_path, data_only=True)
    available = book.sheetnames

    # No specific sheet requested: parse every sheet in the workbook.
    if not self.sheet_name:
        parsed = {}
        for title in available:
            parsed[title] = self.parse_sheet(book[title])
        return parsed

    # A specific sheet was requested but the workbook does not contain it.
    if self.sheet_name not in available:
        return {self.sheet_name: [f"Sheet '{self.sheet_name}' does not exist in the workbook."]}

    return {self.sheet_name: self.parse_sheet(book[self.sheet_name])}
60
92
 
61
- if self.return_type == 'dict':
62
- return df.to_dict(orient='records')
63
- elif self.return_type == 'csv':
64
- return df.to_csv()
93
+ def _read_xls(self):
94
+ """
95
+ Reads .xls files using xlrd.
96
+ """
97
+ workbook = open_workbook(filename=self.file_name, file_contents=self.file_content)
98
+ sheets = workbook.sheet_names()
99
+ if self.sheet_name:
100
+ if self.sheet_name in sheets:
101
+ sheet = workbook.sheet_by_name(self.sheet_name)
102
+ return {self.sheet_name: self.parse_sheet_xls(sheet)}
103
+ else:
104
+ return {self.sheet_name: [f"Sheet '{self.sheet_name}' does not exist in the workbook."]}
65
105
  else:
66
- return df.to_string(index=False)
106
+ # Dictionary comprehension for all sheets
107
+ return {name: self.parse_sheet_xls(workbook.sheet_by_name(name)) for name in sheets}
108
+
109
def parse_sheet(self, sheet):
    """
    Parses a single .xlsx sheet, extracting text and hyperlinks, and formats them.

    Every cell contributes exactly one entry to its row so columns stay aligned.
    Cells with a hyperlink target are rendered as Markdown links
    (``[text](target)``); all other cells are rendered as plain text.

    Args:
        sheet: Worksheet-like object exposing ``iter_rows()``; each cell is read
            through its ``value`` and ``hyperlink`` attributes.

    Returns:
        Whatever ``self._format_sheet_content`` returns for the rendered rows.
    """
    sheet_content = []

    for row in sheet.iter_rows():
        row_content = []
        for cell in row:
            # Only None means "empty": falsy values such as 0 or False must
            # still be rendered, for linked and plain cells alike.
            text = "" if cell.value is None else str(cell.value)
            target = cell.hyperlink.target if cell.hyperlink else None
            if target:
                row_content.append(f"[{text}]({target})")
            else:
                # No usable target: emit the plain text instead of a
                # "[text](None)" link.
                row_content.append(text)
        # Join the row content into a single line using the module-level delimiter
        sheet_content.append(cell_delimiter.join(row_content))

    # Split the rendered rows into chunks according to the loader settings
    return self._format_sheet_content(sheet_content)
131
+
132
def parse_sheet_xls(self, sheet):
    """
    Parses a single .xls sheet using xlrd, extracting text and hyperlinks, and formats them.

    Every cell contributes exactly one entry to its row, including hyperlinked
    cells whose value is empty or zero, so columns never shift.

    Args:
        sheet: Sheet-like object exposing ``nrows``, ``ncols``, ``cell(row, col)``
            and, optionally, ``hyperlink_map`` keyed by ``(row, col)`` tuples
            whose values expose ``url_or_path``.

    Returns:
        Whatever ``self._format_sheet_content`` returns for the rendered rows.
    """
    sheet_content = []

    # Extract hyperlink map (if available)
    hyperlink_map = getattr(sheet, 'hyperlink_map', {})

    for row_idx in range(sheet.nrows):
        row_content = []
        for col_idx in range(sheet.ncols):
            cell_value = sheet.cell(row_idx, col_idx).value
            text = "" if cell_value is None else str(cell_value)

            link = hyperlink_map.get((row_idx, col_idx))
            url = link.url_or_path if link is not None else None
            if url:
                # Always emit an entry for a linked cell, even when its text is
                # empty; skipping it would misalign the remaining columns.
                row_content.append(f"[{text}]({url})")
            else:
                row_content.append(text)
        # Join the row content into a single line using the module-level delimiter
        sheet_content.append(cell_delimiter.join(row_content))

    # Split the rendered rows into chunks according to the loader settings
    return self._format_sheet_content(sheet_content)
160
+
161
def _format_sheet_content(self, rows):
    """
    Formats a list of sheet rows into a list of string chunks.

    Rules:
    1. If add_header_to_chunks is True and header_row_number points at an
       existing row, that row is taken out of the body and used as the header.
       An out-of-range header_row_number simply means "no header".
    2. If max_tokens < 1, returns a single chunk (list of one string) with the
       header (if any) followed by all remaining rows, joined by '\n'.
    3. If max_tokens >= 1:
       a. Rows are accumulated into a chunk while the sum of their token counts
          (tiktoken, cl100k_base) stays within max_tokens.
       b. A non-empty header is prepended once at the top of each chunk.
       c. A single row exceeding max_tokens is placed in its own chunk without
          splitting, with the header prepended if applicable.

    Args:
        rows: List of row strings. The list is not modified.

    Returns:
        List[str]: the chunks, ready for further processing.
    """
    # Work on a copy so header extraction never mutates the caller's list.
    body_rows = list(rows)

    header = None
    if self.add_header_to_chunks and body_rows:
        header_idx = self.header_row_number - 1
        # Guard the index: popping an out-of-range header row would raise.
        if 0 <= header_idx < len(body_rows):
            header = body_rows.pop(header_idx)

    # No chunking requested: everything goes into one chunk.
    if self.max_tokens < 1:
        single = body_rows if header is None else [header] + body_rows
        return ['\n'.join(single)]

    # Imported lazily so the single-chunk path does not need tiktoken.
    import tiktoken
    encoding = tiktoken.get_encoding('cl100k_base')

    def count_tokens(text):
        """Count tokens in text using tiktoken encoding."""
        # Cell text is arbitrary user data; treat special-token literals such as
        # "<|endoftext|>" as ordinary text instead of raising.
        return len(encoding.encode(text, disallowed_special=()))

    def finalize_chunk(chunk_rows):
        """Join rows for a chunk, prepending the header if there is one."""
        if header:
            return '\n'.join([header] + chunk_rows)
        return '\n'.join(chunk_rows)

    chunks = []  # Final chunks
    current_chunk = []  # Rows accumulated for the chunk being built
    current_tokens = 0  # Token count of current_chunk
    # NOTE: only row tokens are budgeted; the header and the newline separators
    # are not counted toward max_tokens.

    for row in body_rows:
        row_tokens = count_tokens(row)
        if row_tokens > self.max_tokens:
            # Oversized row: flush what we have, then emit the row on its own.
            if current_chunk:
                chunks.append(finalize_chunk(current_chunk))
                current_chunk = []
                current_tokens = 0
            chunks.append(finalize_chunk([row]))
            continue
        if current_tokens + row_tokens > self.max_tokens:
            # Row does not fit: close the current chunk and start a new one.
            if current_chunk:
                chunks.append(finalize_chunk(current_chunk))
            current_chunk = [row]
            current_tokens = row_tokens
        else:
            current_chunk.append(row)
            current_tokens += row_tokens

    # Add any remaining rows as the last chunk
    if current_chunk:
        chunks.append(finalize_chunk(current_chunk))
    return chunks
230
+
231
def load(self) -> list:
    """
    Build one Document per content chunk of every parsed sheet.

    Returns:
        list: Documents whose ``page_content`` is a chunk string and whose
        metadata carries ``source``, ``sheet_name`` and ``file_type``.

    Raises:
        ValueError: If ``get_content()`` did not return a sheet mapping; the
            message it returned instead is used as the error text.
    """
    content_per_sheet = self.get_content()
    # get_content() reports a read failure by returning a message string rather
    # than the dict of sheet_name -> chunk list; surface that message instead of
    # failing later with an opaque AttributeError on ``.items()``.
    if not isinstance(content_per_sheet, dict):
        raise ValueError(str(content_per_sheet))

    # When the loader was built from raw content, file_path is an in-memory
    # stream whose repr is useless as a source; prefer the file name then.
    if isinstance(self.file_path, str):
        source_name = self.file_path
    else:
        source_name = self.file_name or self.file_path

    docs = []
    for sheet_name, content_chunks in content_per_sheet.items():
        metadata = {
            "source": f'{source_name}:{sheet_name}',
            "sheet_name": sheet_name,
            "file_type": "excel",
        }
        # Each chunk is a separate Document with its own metadata copy, so a
        # later change to one document's metadata cannot leak into the others.
        for chunk in content_chunks:
            docs.append(Document(page_content=chunk, metadata=dict(metadata)))
    return docs
67
245
 
68
- def read(self):
69
- df = pd.read_excel(self.file_path, sheet_name=None)
246
+ def read(self, lazy: bool = False):
247
+ df = pd.read_excel(self.file_path, sheet_name=None, engine='calamine')
70
248
  docs = []
71
249
  for key in df.keys():
72
250
  if self.raw_content:
@@ -77,7 +255,7 @@ class AlitaExcelLoader(AlitaTableLoader):
77
255
  return docs
78
256
 
79
257
  def read_lazy(self) -> Iterator[dict]:
80
- df = pd.read_excel(self.file_path, sheet_name=None)
258
+ df = pd.read_excel(self.file_path, sheet_name=None, engine='calamine')
81
259
  for key in df.keys():
82
260
  if self.raw_content:
83
261
  yield df[key].to_string()
@@ -151,5 +151,5 @@ class AlitaImageLoader(BaseLoader):
151
151
  """Load text from image using OCR or LLM if llm is provided, supports SVG."""
152
152
  text_content = self.get_content()
153
153
 
154
- metadata = {"source": str(self.file_path)} # Ensure source is always a string for metadata
154
+ metadata = {"source": str(self.file_path if hasattr(self, 'file_path') else self.file_name)} # Ensure source is always a string for metadata
155
155
  return [Document(page_content=text_content, metadata=metadata)]