alita-sdk 0.3.257__py3-none-any.whl → 0.3.562__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (278) hide show
  1. alita_sdk/cli/__init__.py +10 -0
  2. alita_sdk/cli/__main__.py +17 -0
  3. alita_sdk/cli/agent/__init__.py +5 -0
  4. alita_sdk/cli/agent/default.py +258 -0
  5. alita_sdk/cli/agent_executor.py +155 -0
  6. alita_sdk/cli/agent_loader.py +215 -0
  7. alita_sdk/cli/agent_ui.py +228 -0
  8. alita_sdk/cli/agents.py +3601 -0
  9. alita_sdk/cli/callbacks.py +647 -0
  10. alita_sdk/cli/cli.py +168 -0
  11. alita_sdk/cli/config.py +306 -0
  12. alita_sdk/cli/context/__init__.py +30 -0
  13. alita_sdk/cli/context/cleanup.py +198 -0
  14. alita_sdk/cli/context/manager.py +731 -0
  15. alita_sdk/cli/context/message.py +285 -0
  16. alita_sdk/cli/context/strategies.py +289 -0
  17. alita_sdk/cli/context/token_estimation.py +127 -0
  18. alita_sdk/cli/formatting.py +182 -0
  19. alita_sdk/cli/input_handler.py +419 -0
  20. alita_sdk/cli/inventory.py +1073 -0
  21. alita_sdk/cli/mcp_loader.py +315 -0
  22. alita_sdk/cli/toolkit.py +327 -0
  23. alita_sdk/cli/toolkit_loader.py +85 -0
  24. alita_sdk/cli/tools/__init__.py +43 -0
  25. alita_sdk/cli/tools/approval.py +224 -0
  26. alita_sdk/cli/tools/filesystem.py +1751 -0
  27. alita_sdk/cli/tools/planning.py +389 -0
  28. alita_sdk/cli/tools/terminal.py +414 -0
  29. alita_sdk/community/__init__.py +72 -12
  30. alita_sdk/community/inventory/__init__.py +236 -0
  31. alita_sdk/community/inventory/config.py +257 -0
  32. alita_sdk/community/inventory/enrichment.py +2137 -0
  33. alita_sdk/community/inventory/extractors.py +1469 -0
  34. alita_sdk/community/inventory/ingestion.py +3172 -0
  35. alita_sdk/community/inventory/knowledge_graph.py +1457 -0
  36. alita_sdk/community/inventory/parsers/__init__.py +218 -0
  37. alita_sdk/community/inventory/parsers/base.py +295 -0
  38. alita_sdk/community/inventory/parsers/csharp_parser.py +907 -0
  39. alita_sdk/community/inventory/parsers/go_parser.py +851 -0
  40. alita_sdk/community/inventory/parsers/html_parser.py +389 -0
  41. alita_sdk/community/inventory/parsers/java_parser.py +593 -0
  42. alita_sdk/community/inventory/parsers/javascript_parser.py +629 -0
  43. alita_sdk/community/inventory/parsers/kotlin_parser.py +768 -0
  44. alita_sdk/community/inventory/parsers/markdown_parser.py +362 -0
  45. alita_sdk/community/inventory/parsers/python_parser.py +604 -0
  46. alita_sdk/community/inventory/parsers/rust_parser.py +858 -0
  47. alita_sdk/community/inventory/parsers/swift_parser.py +832 -0
  48. alita_sdk/community/inventory/parsers/text_parser.py +322 -0
  49. alita_sdk/community/inventory/parsers/yaml_parser.py +370 -0
  50. alita_sdk/community/inventory/patterns/__init__.py +61 -0
  51. alita_sdk/community/inventory/patterns/ast_adapter.py +380 -0
  52. alita_sdk/community/inventory/patterns/loader.py +348 -0
  53. alita_sdk/community/inventory/patterns/registry.py +198 -0
  54. alita_sdk/community/inventory/presets.py +535 -0
  55. alita_sdk/community/inventory/retrieval.py +1403 -0
  56. alita_sdk/community/inventory/toolkit.py +173 -0
  57. alita_sdk/community/inventory/toolkit_utils.py +176 -0
  58. alita_sdk/community/inventory/visualize.py +1370 -0
  59. alita_sdk/configurations/__init__.py +11 -0
  60. alita_sdk/configurations/ado.py +148 -2
  61. alita_sdk/configurations/azure_search.py +1 -1
  62. alita_sdk/configurations/bigquery.py +1 -1
  63. alita_sdk/configurations/bitbucket.py +94 -2
  64. alita_sdk/configurations/browser.py +18 -0
  65. alita_sdk/configurations/carrier.py +19 -0
  66. alita_sdk/configurations/confluence.py +130 -1
  67. alita_sdk/configurations/delta_lake.py +1 -1
  68. alita_sdk/configurations/figma.py +76 -5
  69. alita_sdk/configurations/github.py +65 -1
  70. alita_sdk/configurations/gitlab.py +81 -0
  71. alita_sdk/configurations/google_places.py +17 -0
  72. alita_sdk/configurations/jira.py +103 -0
  73. alita_sdk/configurations/openapi.py +111 -0
  74. alita_sdk/configurations/postman.py +1 -1
  75. alita_sdk/configurations/qtest.py +72 -3
  76. alita_sdk/configurations/report_portal.py +115 -0
  77. alita_sdk/configurations/salesforce.py +19 -0
  78. alita_sdk/configurations/service_now.py +1 -12
  79. alita_sdk/configurations/sharepoint.py +167 -0
  80. alita_sdk/configurations/sonar.py +18 -0
  81. alita_sdk/configurations/sql.py +20 -0
  82. alita_sdk/configurations/testio.py +101 -0
  83. alita_sdk/configurations/testrail.py +88 -0
  84. alita_sdk/configurations/xray.py +94 -1
  85. alita_sdk/configurations/zephyr_enterprise.py +94 -1
  86. alita_sdk/configurations/zephyr_essential.py +95 -0
  87. alita_sdk/runtime/clients/artifact.py +21 -4
  88. alita_sdk/runtime/clients/client.py +458 -67
  89. alita_sdk/runtime/clients/mcp_discovery.py +342 -0
  90. alita_sdk/runtime/clients/mcp_manager.py +262 -0
  91. alita_sdk/runtime/clients/sandbox_client.py +352 -0
  92. alita_sdk/runtime/langchain/_constants_bkup.py +1318 -0
  93. alita_sdk/runtime/langchain/assistant.py +183 -43
  94. alita_sdk/runtime/langchain/constants.py +647 -1
  95. alita_sdk/runtime/langchain/document_loaders/AlitaDocxMammothLoader.py +315 -3
  96. alita_sdk/runtime/langchain/document_loaders/AlitaExcelLoader.py +209 -31
  97. alita_sdk/runtime/langchain/document_loaders/AlitaImageLoader.py +1 -1
  98. alita_sdk/runtime/langchain/document_loaders/AlitaJSONLinesLoader.py +77 -0
  99. alita_sdk/runtime/langchain/document_loaders/AlitaJSONLoader.py +10 -3
  100. alita_sdk/runtime/langchain/document_loaders/AlitaMarkdownLoader.py +66 -0
  101. alita_sdk/runtime/langchain/document_loaders/AlitaPDFLoader.py +79 -10
  102. alita_sdk/runtime/langchain/document_loaders/AlitaPowerPointLoader.py +52 -15
  103. alita_sdk/runtime/langchain/document_loaders/AlitaPythonLoader.py +9 -0
  104. alita_sdk/runtime/langchain/document_loaders/AlitaTableLoader.py +1 -4
  105. alita_sdk/runtime/langchain/document_loaders/AlitaTextLoader.py +15 -2
  106. alita_sdk/runtime/langchain/document_loaders/ImageParser.py +30 -0
  107. alita_sdk/runtime/langchain/document_loaders/constants.py +189 -41
  108. alita_sdk/runtime/langchain/interfaces/llm_processor.py +4 -2
  109. alita_sdk/runtime/langchain/langraph_agent.py +407 -92
  110. alita_sdk/runtime/langchain/utils.py +102 -8
  111. alita_sdk/runtime/llms/preloaded.py +2 -6
  112. alita_sdk/runtime/models/mcp_models.py +61 -0
  113. alita_sdk/runtime/skills/__init__.py +91 -0
  114. alita_sdk/runtime/skills/callbacks.py +498 -0
  115. alita_sdk/runtime/skills/discovery.py +540 -0
  116. alita_sdk/runtime/skills/executor.py +610 -0
  117. alita_sdk/runtime/skills/input_builder.py +371 -0
  118. alita_sdk/runtime/skills/models.py +330 -0
  119. alita_sdk/runtime/skills/registry.py +355 -0
  120. alita_sdk/runtime/skills/skill_runner.py +330 -0
  121. alita_sdk/runtime/toolkits/__init__.py +28 -0
  122. alita_sdk/runtime/toolkits/application.py +14 -4
  123. alita_sdk/runtime/toolkits/artifact.py +24 -9
  124. alita_sdk/runtime/toolkits/datasource.py +13 -6
  125. alita_sdk/runtime/toolkits/mcp.py +780 -0
  126. alita_sdk/runtime/toolkits/planning.py +178 -0
  127. alita_sdk/runtime/toolkits/skill_router.py +238 -0
  128. alita_sdk/runtime/toolkits/subgraph.py +11 -6
  129. alita_sdk/runtime/toolkits/tools.py +314 -70
  130. alita_sdk/runtime/toolkits/vectorstore.py +11 -5
  131. alita_sdk/runtime/tools/__init__.py +24 -0
  132. alita_sdk/runtime/tools/application.py +16 -4
  133. alita_sdk/runtime/tools/artifact.py +367 -33
  134. alita_sdk/runtime/tools/data_analysis.py +183 -0
  135. alita_sdk/runtime/tools/function.py +100 -4
  136. alita_sdk/runtime/tools/graph.py +81 -0
  137. alita_sdk/runtime/tools/image_generation.py +218 -0
  138. alita_sdk/runtime/tools/llm.py +1013 -177
  139. alita_sdk/runtime/tools/loop.py +3 -1
  140. alita_sdk/runtime/tools/loop_output.py +3 -1
  141. alita_sdk/runtime/tools/mcp_inspect_tool.py +284 -0
  142. alita_sdk/runtime/tools/mcp_remote_tool.py +181 -0
  143. alita_sdk/runtime/tools/mcp_server_tool.py +3 -1
  144. alita_sdk/runtime/tools/planning/__init__.py +36 -0
  145. alita_sdk/runtime/tools/planning/models.py +246 -0
  146. alita_sdk/runtime/tools/planning/wrapper.py +607 -0
  147. alita_sdk/runtime/tools/router.py +2 -1
  148. alita_sdk/runtime/tools/sandbox.py +375 -0
  149. alita_sdk/runtime/tools/skill_router.py +776 -0
  150. alita_sdk/runtime/tools/tool.py +3 -1
  151. alita_sdk/runtime/tools/vectorstore.py +69 -65
  152. alita_sdk/runtime/tools/vectorstore_base.py +163 -90
  153. alita_sdk/runtime/utils/AlitaCallback.py +137 -21
  154. alita_sdk/runtime/utils/mcp_client.py +492 -0
  155. alita_sdk/runtime/utils/mcp_oauth.py +361 -0
  156. alita_sdk/runtime/utils/mcp_sse_client.py +434 -0
  157. alita_sdk/runtime/utils/mcp_tools_discovery.py +124 -0
  158. alita_sdk/runtime/utils/streamlit.py +41 -14
  159. alita_sdk/runtime/utils/toolkit_utils.py +28 -9
  160. alita_sdk/runtime/utils/utils.py +48 -0
  161. alita_sdk/tools/__init__.py +135 -37
  162. alita_sdk/tools/ado/__init__.py +2 -2
  163. alita_sdk/tools/ado/repos/__init__.py +15 -19
  164. alita_sdk/tools/ado/repos/repos_wrapper.py +12 -20
  165. alita_sdk/tools/ado/test_plan/__init__.py +26 -8
  166. alita_sdk/tools/ado/test_plan/test_plan_wrapper.py +56 -28
  167. alita_sdk/tools/ado/wiki/__init__.py +27 -12
  168. alita_sdk/tools/ado/wiki/ado_wrapper.py +114 -40
  169. alita_sdk/tools/ado/work_item/__init__.py +27 -12
  170. alita_sdk/tools/ado/work_item/ado_wrapper.py +95 -11
  171. alita_sdk/tools/advanced_jira_mining/__init__.py +12 -8
  172. alita_sdk/tools/aws/delta_lake/__init__.py +14 -11
  173. alita_sdk/tools/aws/delta_lake/tool.py +5 -1
  174. alita_sdk/tools/azure_ai/search/__init__.py +13 -8
  175. alita_sdk/tools/base/tool.py +5 -1
  176. alita_sdk/tools/base_indexer_toolkit.py +454 -110
  177. alita_sdk/tools/bitbucket/__init__.py +27 -19
  178. alita_sdk/tools/bitbucket/api_wrapper.py +285 -27
  179. alita_sdk/tools/bitbucket/cloud_api_wrapper.py +5 -5
  180. alita_sdk/tools/browser/__init__.py +41 -16
  181. alita_sdk/tools/browser/crawler.py +3 -1
  182. alita_sdk/tools/browser/utils.py +15 -6
  183. alita_sdk/tools/carrier/__init__.py +18 -17
  184. alita_sdk/tools/carrier/backend_reports_tool.py +8 -4
  185. alita_sdk/tools/carrier/excel_reporter.py +8 -4
  186. alita_sdk/tools/chunkers/__init__.py +3 -1
  187. alita_sdk/tools/chunkers/code/codeparser.py +1 -1
  188. alita_sdk/tools/chunkers/sematic/json_chunker.py +2 -1
  189. alita_sdk/tools/chunkers/sematic/markdown_chunker.py +97 -6
  190. alita_sdk/tools/chunkers/sematic/proposal_chunker.py +1 -1
  191. alita_sdk/tools/chunkers/universal_chunker.py +270 -0
  192. alita_sdk/tools/cloud/aws/__init__.py +11 -7
  193. alita_sdk/tools/cloud/azure/__init__.py +11 -7
  194. alita_sdk/tools/cloud/gcp/__init__.py +11 -7
  195. alita_sdk/tools/cloud/k8s/__init__.py +11 -7
  196. alita_sdk/tools/code/linter/__init__.py +9 -8
  197. alita_sdk/tools/code/loaders/codesearcher.py +3 -2
  198. alita_sdk/tools/code/sonar/__init__.py +20 -13
  199. alita_sdk/tools/code_indexer_toolkit.py +199 -0
  200. alita_sdk/tools/confluence/__init__.py +21 -14
  201. alita_sdk/tools/confluence/api_wrapper.py +197 -58
  202. alita_sdk/tools/confluence/loader.py +14 -2
  203. alita_sdk/tools/custom_open_api/__init__.py +11 -5
  204. alita_sdk/tools/elastic/__init__.py +10 -8
  205. alita_sdk/tools/elitea_base.py +546 -64
  206. alita_sdk/tools/figma/__init__.py +11 -8
  207. alita_sdk/tools/figma/api_wrapper.py +352 -153
  208. alita_sdk/tools/github/__init__.py +17 -17
  209. alita_sdk/tools/github/api_wrapper.py +9 -26
  210. alita_sdk/tools/github/github_client.py +81 -12
  211. alita_sdk/tools/github/schemas.py +2 -1
  212. alita_sdk/tools/github/tool.py +5 -1
  213. alita_sdk/tools/gitlab/__init__.py +18 -13
  214. alita_sdk/tools/gitlab/api_wrapper.py +224 -80
  215. alita_sdk/tools/gitlab_org/__init__.py +13 -10
  216. alita_sdk/tools/google/bigquery/__init__.py +13 -13
  217. alita_sdk/tools/google/bigquery/tool.py +5 -1
  218. alita_sdk/tools/google_places/__init__.py +20 -11
  219. alita_sdk/tools/jira/__init__.py +21 -11
  220. alita_sdk/tools/jira/api_wrapper.py +315 -168
  221. alita_sdk/tools/keycloak/__init__.py +10 -8
  222. alita_sdk/tools/localgit/__init__.py +8 -3
  223. alita_sdk/tools/localgit/local_git.py +62 -54
  224. alita_sdk/tools/localgit/tool.py +5 -1
  225. alita_sdk/tools/memory/__init__.py +38 -14
  226. alita_sdk/tools/non_code_indexer_toolkit.py +7 -2
  227. alita_sdk/tools/ocr/__init__.py +10 -8
  228. alita_sdk/tools/openapi/__init__.py +281 -108
  229. alita_sdk/tools/openapi/api_wrapper.py +883 -0
  230. alita_sdk/tools/openapi/tool.py +20 -0
  231. alita_sdk/tools/pandas/__init__.py +18 -11
  232. alita_sdk/tools/pandas/api_wrapper.py +40 -45
  233. alita_sdk/tools/pandas/dataframe/generator/base.py +3 -1
  234. alita_sdk/tools/postman/__init__.py +10 -11
  235. alita_sdk/tools/postman/api_wrapper.py +19 -8
  236. alita_sdk/tools/postman/postman_analysis.py +8 -1
  237. alita_sdk/tools/pptx/__init__.py +10 -10
  238. alita_sdk/tools/qtest/__init__.py +21 -14
  239. alita_sdk/tools/qtest/api_wrapper.py +1784 -88
  240. alita_sdk/tools/rally/__init__.py +12 -10
  241. alita_sdk/tools/report_portal/__init__.py +22 -16
  242. alita_sdk/tools/salesforce/__init__.py +21 -16
  243. alita_sdk/tools/servicenow/__init__.py +20 -16
  244. alita_sdk/tools/servicenow/api_wrapper.py +1 -1
  245. alita_sdk/tools/sharepoint/__init__.py +16 -14
  246. alita_sdk/tools/sharepoint/api_wrapper.py +179 -39
  247. alita_sdk/tools/sharepoint/authorization_helper.py +191 -1
  248. alita_sdk/tools/sharepoint/utils.py +8 -2
  249. alita_sdk/tools/slack/__init__.py +11 -7
  250. alita_sdk/tools/sql/__init__.py +21 -19
  251. alita_sdk/tools/sql/api_wrapper.py +71 -23
  252. alita_sdk/tools/testio/__init__.py +20 -13
  253. alita_sdk/tools/testrail/__init__.py +12 -11
  254. alita_sdk/tools/testrail/api_wrapper.py +214 -46
  255. alita_sdk/tools/utils/__init__.py +28 -4
  256. alita_sdk/tools/utils/content_parser.py +182 -62
  257. alita_sdk/tools/utils/text_operations.py +254 -0
  258. alita_sdk/tools/vector_adapters/VectorStoreAdapter.py +83 -27
  259. alita_sdk/tools/xray/__init__.py +17 -14
  260. alita_sdk/tools/xray/api_wrapper.py +58 -113
  261. alita_sdk/tools/yagmail/__init__.py +8 -3
  262. alita_sdk/tools/zephyr/__init__.py +11 -7
  263. alita_sdk/tools/zephyr_enterprise/__init__.py +15 -9
  264. alita_sdk/tools/zephyr_enterprise/api_wrapper.py +30 -15
  265. alita_sdk/tools/zephyr_essential/__init__.py +15 -10
  266. alita_sdk/tools/zephyr_essential/api_wrapper.py +297 -54
  267. alita_sdk/tools/zephyr_essential/client.py +6 -4
  268. alita_sdk/tools/zephyr_scale/__init__.py +12 -8
  269. alita_sdk/tools/zephyr_scale/api_wrapper.py +39 -31
  270. alita_sdk/tools/zephyr_squad/__init__.py +11 -7
  271. {alita_sdk-0.3.257.dist-info → alita_sdk-0.3.562.dist-info}/METADATA +184 -37
  272. alita_sdk-0.3.562.dist-info/RECORD +450 -0
  273. alita_sdk-0.3.562.dist-info/entry_points.txt +2 -0
  274. alita_sdk/tools/bitbucket/tools.py +0 -304
  275. alita_sdk-0.3.257.dist-info/RECORD +0 -343
  276. {alita_sdk-0.3.257.dist-info → alita_sdk-0.3.562.dist-info}/WHEEL +0 -0
  277. {alita_sdk-0.3.257.dist-info → alita_sdk-0.3.562.dist-info}/licenses/LICENSE +0 -0
  278. {alita_sdk-0.3.257.dist-info → alita_sdk-0.3.562.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,77 @@
1
+ from .AlitaJSONLoader import AlitaJSONLoader
2
+ import json
3
+ from io import StringIO
4
+ from typing import List, Iterator
5
+
6
+ from langchain_core.documents import Document
7
+ from langchain_core.tools import ToolException
8
+
9
+
10
class AlitaJSONLinesLoader(AlitaJSONLoader):
    """Load JSONL content (one JSON document per line) using AlitaJSONLoader behavior.

    Behavior:
    - Reads lines from `file_path` when it is set, otherwise from `file_content`
      (bytes, str, or a file-like object).
    - Blank lines are skipped; every other line is handed to its own
      AlitaJSONLoader and chunked by that loader's `lazy_load`.
    - `load` returns the Documents produced for all lines as one flat list,
      in the order the lines appear in the source.

    NOTE(review): each line is chunked by a separate AlitaJSONLoader, so any
    per-loader chunk numbering restarts on every line -- confirm consumers of the
    metadata tolerate repeated values.
    """

    def __init__(self, **kwargs):
        # Argument handling is delegated entirely to AlitaJSONLoader.
        super().__init__(**kwargs)

    def _iter_lines(self) -> Iterator[str]:
        """Yield the raw text lines of the configured source.

        `file_path` takes precedence over `file_content`.

        Raises:
            ToolException: if neither `file_path` nor `file_content` is set
                (raised lazily, when iteration starts).
        """
        if getattr(self, "file_path", None):
            with open(self.file_path, "r", encoding=self.encoding) as f:
                yield from f
            return

        content = getattr(self, "file_content", None)
        if not content:
            raise ToolException("'file_path' or 'file_content' parameter should be provided.")

        if isinstance(content, (bytes, bytearray)):
            yield from StringIO(content.decode(self.encoding))
        elif isinstance(content, str):
            # A plain string has no seek(); iterate it directly.
            yield from StringIO(content)
        else:
            # File-like object: rewind, then read line by line.
            content.seek(0)
            for line in content:
                # Binary streams yield bytes; decode so callers always get str.
                if isinstance(line, (bytes, bytearray)):
                    line = line.decode(self.encoding)
                yield line

    def load(self) -> List[Document]:  # type: ignore[override]
        """Load JSONL content by delegating each non-empty line to AlitaJSONLoader.

        For each non-empty line in the underlying source:
        - Create a temporary AlitaJSONLoader with that line as `file_content`,
          reusing this loader's encoding, autodetect_encoding and max_tokens.
        - Collect every Document its `lazy_load` yields.

        Returns:
            All Documents from all lines, as a single list.

        Raises:
            ToolException: if a line cannot be processed; the message carries the
                first 100 characters of the line and the original error is chained.
        """
        docs: List[Document] = []
        # Prefer an explicit file_name; otherwise fall back to the path (or "no_name").
        source_name = getattr(self, "file_name", str(getattr(self, "file_path", "no_name")))

        for raw_line in self._iter_lines():
            line = raw_line.strip()
            if not line:
                continue
            try:
                line_loader = AlitaJSONLoader(
                    file_content=line,
                    file_name=source_name,
                    encoding=self.encoding,
                    autodetect_encoding=self.autodetect_encoding,
                    max_tokens=self.max_tokens,
                )
                docs.extend(line_loader.lazy_load())
            except Exception as e:
                raise ToolException(f"Error processing JSONL line: {line[:100]}... Error: {e}") from e

        return docs
@@ -30,7 +30,12 @@ class AlitaJSONLoader(BaseLoader):
30
30
  with open(self.file_path, encoding=self.encoding) as f:
31
31
  return json.load(f)
32
32
  elif hasattr(self, 'file_content') and self.file_content:
33
- return json.load(self.file_content)
33
+ if isinstance(self.file_content, bytes):
34
+ return json.loads(self.file_content.decode(self.encoding))
35
+ elif isinstance(self.file_content, str):
36
+ return json.loads(self.file_content)
37
+ else:
38
+ return json.load(self.file_content)
34
39
  else:
35
40
  raise ValueError("Neither file_path nor file_content is provided.")
36
41
 
@@ -42,7 +47,6 @@ class AlitaJSONLoader(BaseLoader):
42
47
  try:
43
48
  with open(self.file_path, encoding=encoding.encoding) as f:
44
49
  return f.read()
45
- break
46
50
  except UnicodeDecodeError:
47
51
  continue
48
52
  elif hasattr(self, 'file_content') and self.file_content:
@@ -68,6 +72,9 @@ class AlitaJSONLoader(BaseLoader):
68
72
  else:
69
73
  data_dict = content_json
70
74
  chunks = RecursiveJsonSplitter(max_chunk_size=self.max_tokens).split_json(json_data=data_dict)
75
+ chunk_id = 1
71
76
  for chunk in chunks:
72
- metadata = {"source": str(self.file_path) if hasattr(self, 'file_path') else self.file_name}
77
+ metadata = {"source": str(self.file_path) if hasattr(self, 'file_path') else self.file_name,
78
+ "chunk_id": chunk_id}
79
+ chunk_id+=1
73
80
  yield Document(page_content=json.dumps(chunk), metadata=metadata)
@@ -0,0 +1,66 @@
1
+ from pathlib import Path
2
+ from typing import Any, List, Union, Generator, Iterator
3
+ from langchain_core.documents import Document
4
+
5
+ from langchain_community.document_loaders.unstructured import (
6
+ UnstructuredFileLoader,
7
+ validate_unstructured_version,
8
+ )
9
+
10
class AlitaMarkdownLoader(UnstructuredFileLoader):
    """Load a Markdown file and split it into Documents with `markdown_chunker`."""

    def __init__(
        self,
        file_path: Union[str, Path],
        mode: str = "elements",
        chunker_config: Union[dict, None] = None,
        **unstructured_kwargs: Any,
    ):
        """
        Args:
            file_path: The path to the Markdown file to load.
            mode: Forwarded unchanged to UnstructuredFileLoader. Default is
                "elements". `lazy_load` in this class does not read it.
            chunker_config: Configuration dictionary for the markdown chunker.
                When omitted (or empty) these defaults are used: strip_header=False,
                return_each_line=False, headers_to_split_on=[], max_tokens=512,
                token_overlap=10.
            **unstructured_kwargs: Any kwargs to pass to UnstructuredFileLoader.
        """
        file_path = str(file_path)
        validate_unstructured_version("0.4.16")
        # A fresh default dict is built per instance, so instances never share it.
        self.chunker_config = chunker_config or {
            "strip_header": False,
            "return_each_line": False,
            "headers_to_split_on": [],
            "max_tokens": 512,
            "token_overlap": 10,
        }
        super().__init__(file_path=file_path, mode=mode, **unstructured_kwargs)

    def _file_content_generator(self) -> Generator[Document, None, None]:
        """
        Yield a single Document holding the whole Markdown file.

        The file is read as UTF-8 and the Document's "source" metadata is the file path.
        """
        with open(self.file_path, "r", encoding="utf-8") as file:
            content = file.read()
        yield Document(page_content=content, metadata={"source": self.file_path})

    def _get_elements(self) -> List[Document]:
        """
        Run the file through `markdown_chunker` with `self.chunker_config`
        and return the resulting chunks as a list.
        """
        # Imported here rather than at module top, as in the original.
        from alita_sdk.tools.chunkers.sematic.markdown_chunker import markdown_chunker

        chunks = markdown_chunker(self._file_content_generator(), config=self.chunker_config)
        return list(chunks)

    def lazy_load(self) -> Iterator[Document]:
        """Chunk the file, apply the base loader's post-processing, and yield the chunks."""
        elements = self._get_elements()
        self._post_process_elements(elements)
        yield from elements
@@ -1,5 +1,8 @@
1
1
  import pymupdf
2
- from langchain_community.document_loaders import PyPDFLoader
2
+ import fitz
3
+ from langchain_community.document_loaders import PyPDFium2Loader
4
+
5
+ from .ImageParser import ImageParser
3
6
  from .utils import perform_llm_prediction_for_image_bytes, create_temp_file
4
7
  from langchain_core.tools import ToolException
5
8
 
@@ -20,6 +23,7 @@ class AlitaPDFLoader:
20
23
  self.headers = kwargs.get('headers', None)
21
24
  self.extraction_mode = kwargs.get('extraction_mode', "plain")
22
25
  self.extraction_kwargs = kwargs.get('extraction_kwargs', None)
26
+ self.images_parser=ImageParser(llm=self.llm, prompt=self.prompt)
23
27
 
24
28
  def get_content(self):
25
29
  if hasattr(self, 'file_path'):
@@ -41,8 +45,59 @@ class AlitaPDFLoader:
41
45
  return text_content
42
46
 
43
47
  def read_pdf_page(self, report, page, index):
44
- text_content = f'Page: {index}\n'
45
- text_content += page.get_text()
48
+ # Extract text in block format (to more accurately match hyperlinks to text)
49
+ text_blocks = page.get_text("blocks") # Returns a list of text blocks
50
+ words = page.get_text("words") # Returns words with their coordinates
51
+
52
+ # Extract hyperlinks
53
+ links = page.get_links()
54
+
55
+ # Create a list to store the modified text
56
+ modified_text = []
57
+
58
+ for block in text_blocks:
59
+ block_rect = fitz.Rect(block[:4]) # Coordinates of the text block
60
+ block_text = block[4] # The actual text of the block
61
+
62
+ # Check if there are hyperlinks intersecting with this text block
63
+ for link in links:
64
+ if "uri" in link: # Ensure this is a hyperlink
65
+ link_rect = link["from"] # Coordinates of the hyperlink area
66
+ link_uri = link["uri"] # The URL of the hyperlink
67
+
68
+ # Expand the hyperlink area slightly to account for inaccuracies
69
+ link_rect = fitz.Rect(
70
+ link_rect.x0 - 1, link_rect.y0 - 1, link_rect.x1 + 1, link_rect.y1 + 1
71
+ )
72
+
73
+ # Find words that are inside the hyperlink area
74
+ link_text = []
75
+ for word in words:
76
+ word_rect = fitz.Rect(word[:4]) # Coordinates of the word
77
+ word_text = word[4]
78
+
79
+ # Check if the word rectangle is fully inside the hyperlink rectangle
80
+ if link_rect.contains(word_rect):
81
+ link_text.append(word_text)
82
+ # If the word partially intersects, check vertical alignment
83
+ elif link_rect.intersects(word_rect):
84
+ # Condition: The word must be on the same line as the hyperlink
85
+ if abs(link_rect.y0 - word_rect.y0) < 2 and abs(link_rect.y1 - word_rect.y1) < 2:
86
+ link_text.append(word_text)
87
+
88
+ # Format the hyperlink in Markdown
89
+ full_text = " ".join(link_text) if link_text else "No text"
90
+ hyperlink = f"[{full_text}]({link_uri})"
91
+
92
+ # Replace the hyperlink text in the block with the formatted hyperlink
93
+ block_text = block_text.replace(full_text, hyperlink)
94
+
95
+ # Add the processed text block to the result
96
+ modified_text.append(block_text)
97
+
98
+ # Combine all text blocks into the final text for the page
99
+ text_content = f'Page: {index}\n' + "\n".join(modified_text)
100
+
46
101
  if self.extract_images:
47
102
  images = page.get_images(full=True)
48
103
  for i, img in enumerate(images):
@@ -54,10 +109,24 @@ class AlitaPDFLoader:
54
109
 
55
110
  def load(self):
56
111
  if not hasattr(self, 'file_path'):
57
- self.file_path = create_temp_file(self.file_content)
58
- return PyPDFLoader(file_path=self.file_path,
59
- password=self.password,
60
- headers=self.headers,
61
- extract_images=self.extract_images,
62
- extraction_mode=self.extraction_mode,
63
- extraction_kwargs=self.extraction_kwargs).load()
112
+ import tempfile
113
+
114
+ with tempfile.NamedTemporaryFile(mode='w+b', delete=True, suffix=".pdf") as temp_file:
115
+ temp_file.write(self.file_content)
116
+ temp_file.flush()
117
+ self.file_path = temp_file.name
118
+ return self._load_docs()
119
+ else:
120
+ return self._load_docs()
121
+
122
+ def _load_docs(self):
123
+ docs = PyPDFium2Loader(
124
+ file_path = self.file_path,
125
+ password=self.password,
126
+ headers=self.headers,
127
+ extract_images = self.extract_images,
128
+ images_parser = ImageParser(llm=self.llm, prompt=self.prompt),
129
+ ).load()
130
+ for doc in docs:
131
+ doc.metadata['chunk_id'] = doc.metadata['page']
132
+ return docs
@@ -1,10 +1,10 @@
1
1
  import io
2
2
 
3
- from langchain_community.document_loaders import UnstructuredPowerPointLoader
4
3
  from langchain_core.tools import ToolException
5
4
  from pptx import Presentation
6
5
  from .utils import perform_llm_prediction_for_image_bytes, create_temp_file
7
6
  from pptx.enum.shapes import MSO_SHAPE_TYPE
7
+ from langchain_core.documents import Document
8
8
 
9
9
  class AlitaPowerPointLoader:
10
10
 
@@ -22,33 +22,70 @@ class AlitaPowerPointLoader:
22
22
  self.extract_images = unstructured_kwargs.get('extract_images', False)
23
23
  self.llm = unstructured_kwargs.get('llm', None)
24
24
  self.prompt = unstructured_kwargs.get('prompt', "Describe image")
25
+ self.pages_per_chunk = unstructured_kwargs.get('pages_per_chunk', 5)
25
26
 
26
27
  def get_content(self):
27
- prs = Presentation(io.BytesIO(self.file_content))
28
- text_content = ''
28
+ if hasattr(self, 'file_path'):
29
+ with open(self.file_path, 'rb') as f:
30
+ prs = Presentation(f)
31
+ elif hasattr(self, 'file_content'):
32
+ prs = Presentation(io.BytesIO(self.file_content))
33
+ pages = []
29
34
  if self.page_number is not None:
30
- text_content += self.read_pptx_slide(prs.slides[self.page_number - 1], self.page_number)
35
+ pages.append(self.read_pptx_slide(prs.slides[self.page_number - 1], self.page_number))
31
36
  else:
32
37
  for index, slide in enumerate(prs.slides, start=1):
33
- text_content += self.read_pptx_slide(slide, index)
34
- return text_content
38
+ pages.append(self.read_pptx_slide(slide, index))
39
+ if self.mode == 'single':
40
+ return "\n".join(pages)
41
+ if self.mode == 'paged':
42
+ return pages
43
+ else:
44
+ raise ToolException(f"Unknown mode value: {self.mode}. Only 'single', 'paged' values allowed.")
35
45
 
36
46
  def read_pptx_slide(self, slide, index):
37
47
  text_content = f'Slide: {index}\n'
38
48
  for shape in slide.shapes:
39
- if hasattr(shape, "text"):
40
- text_content += shape.text + "\n"
49
+ if hasattr(shape, "text_frame") and shape.text_frame is not None:
50
+ for paragraph in shape.text_frame.paragraphs:
51
+ for run in paragraph.runs:
52
+ if run.hyperlink and run.hyperlink.address:
53
+ link_text = run.text.strip() or "Link"
54
+ link_url = run.hyperlink.address
55
+ text_content += f" [{link_text}]({link_url}) "
56
+ else:
57
+ text_content += run.text
58
+ text_content += "\n"
41
59
  elif self.extract_images and shape.shape_type == MSO_SHAPE_TYPE.PICTURE:
42
60
  try:
43
- caption = perform_llm_prediction_for_image_bytes(shape.image.blob, self.llm)
61
+ caption = perform_llm_prediction_for_image_bytes(shape.image.blob, self.llm, self.prompt)
44
62
  except:
45
63
  caption = "unknown"
46
64
  text_content += "\n**Image Transcript:**\n" + caption + "\n--------------------\n"
47
- return text_content
65
+ return text_content + "\n"
48
66
 
49
67
  def load(self):
50
- if not self.file_path:
51
- self.file_path = create_temp_file(self.file_content)
52
- return UnstructuredPowerPointLoader(file_path=self.file_path,
53
- mode=self.mode,
54
- **self.unstructured_kwargs).load()
68
+ content = self.get_content()
69
+ if isinstance(content, str):
70
+ yield Document(page_content=content, metadata={})
71
+ elif isinstance(content, list):
72
+ chunk = []
73
+ chunk_count = 0
74
+ for page_number, page in enumerate(content, start=1):
75
+ chunk.append(page)
76
+ if len(chunk) == self.pages_per_chunk:
77
+ chunk_content = "\n".join(chunk)
78
+ yield Document(
79
+ page_content=chunk_content,
80
+ metadata={"chunk_number": chunk_count + 1,
81
+ "pages_in_chunk": list(range(page_number - len(chunk) + 1, page_number + 1))}
82
+ )
83
+ chunk = []
84
+ chunk_count += 1
85
+ if chunk:
86
+ chunk_content = "\n".join(chunk)
87
+ yield Document(
88
+ page_content=chunk_content,
89
+ metadata={"chunk_number": chunk_count + 1,
90
+ "pages_in_chunk": list(range(len(content) - len(chunk) + 1, len(content) + 1))}
91
+ )
@@ -0,0 +1,9 @@
1
+ from pathlib import Path
2
+ from typing import Union
3
+ from langchain_community.document_loaders.python import PythonLoader
4
+
5
class AlitaPythonLoader(PythonLoader):
    """Load `Python` files through `PythonLoader`, accepting extra keyword arguments.

    Only `file_path` reaches the base loader. Any other keyword argument,
    including an `encoding`, is accepted and discarded.

    NOTE(review): encoding handling is therefore whatever `PythonLoader` does on
    its own; confirm that is the intended behaviour for callers passing `encoding`.
    """

    def __init__(self, file_path: Union[str, Path], **kwargs):
        """
        Args:
            file_path: The path to the Python file to load.
            **kwargs: Accepted for call-site compatibility and ignored.
        """
        # kwargs is deliberately not forwarded: the base constructor is called
        # with file_path only.
        super().__init__(file_path)
@@ -17,8 +17,6 @@ from langchain_core.documents import Document
17
17
  from typing import List, Optional, Iterator
18
18
  from json import dumps
19
19
  from .utils import cleanse_data
20
- from ..tools.log import print_log
21
-
22
20
 
23
21
  class AlitaTableLoader(BaseLoader):
24
22
  def __init__(self,
@@ -65,7 +63,7 @@ class AlitaTableLoader(BaseLoader):
65
63
  "source": f'{self.file_path}:{idx+1}',
66
64
  "table_source": self.file_path,
67
65
  }
68
- if len(docs) == 0:
66
+ if len(docs) == 0 and not self.raw_content:
69
67
  header_metadata = metadata.copy()
70
68
  header_metadata["header"] = "true"
71
69
  header = "\t".join([str(value) for value in row.keys()])
@@ -74,7 +72,6 @@ class AlitaTableLoader(BaseLoader):
74
72
  docs.append(Document(page_content=row, metadata=metadata))
75
73
  continue
76
74
  if self.json_documents:
77
- # print_log(row)
78
75
  metadata['columns'] = list(row.keys())
79
76
  metadata['og_data'] = dumps(row)
80
77
  docs.append(Document(page_content=self.row_processor(row), metadata=metadata))
@@ -1,4 +1,4 @@
1
- from typing import Iterator
1
+ from typing import Iterator, Generator
2
2
 
3
3
  from langchain_core.documents import Document
4
4
 
@@ -6,6 +6,9 @@ from langchain_community.document_loaders.base import BaseLoader
6
6
  from langchain_community.document_loaders.helpers import detect_file_encodings
7
7
  from langchain_core.tools import ToolException
8
8
 
9
+ from alita_sdk.tools.chunkers import markdown_chunker
10
+
11
+
9
12
  class AlitaTextLoader(BaseLoader):
10
13
 
11
14
  def __init__(self, **kwargs):
@@ -19,6 +22,8 @@ class AlitaTextLoader(BaseLoader):
19
22
  raise ToolException("'file_path' or 'file_content' parameter should be provided.")
20
23
  self.encoding = kwargs.get('encoding', 'utf-8')
21
24
  self.autodetect_encoding = kwargs.get('autodetect_encoding', False)
25
+ self.max_tokens=kwargs.get('max_tokens', 1024)
26
+ self.token_overlap = kwargs.get('token_overlap', 10)
22
27
 
23
28
  def get_content(self):
24
29
  text = ""
@@ -59,8 +64,16 @@ class AlitaTextLoader(BaseLoader):
59
64
 
60
65
  return text
61
66
 
67
+ def generate_document(self, text, metadata) -> Generator[Document, None, None]:
68
+ yield Document(page_content=text, metadata=metadata)
69
+
62
70
  def lazy_load(self) -> Iterator[Document]:
63
71
  """Load from file path."""
64
72
  text = self.get_content()
65
73
  metadata = {"source": str(self.file_path) if hasattr(self, 'file_path') else self.file_name}
66
- yield Document(page_content=text, metadata=metadata)
74
+ chunks = markdown_chunker(file_content_generator=self.generate_document(text, metadata),
75
+ config={
76
+ "max_tokens": self.max_tokens,
77
+ "token_overlap": self.token_overlap
78
+ })
79
+ yield from chunks
@@ -0,0 +1,30 @@
1
+ from typing import Iterator
2
+
3
+ from langchain_community.document_loaders.parsers.images import BaseImageBlobParser
4
+ from langchain_core.documents import Document
5
+ from langchain_core.documents.base import Blob
6
+
7
+ from alita_sdk.runtime.langchain.document_loaders.AlitaImageLoader import AlitaImageLoader
8
+
9
class ImageParser(BaseImageBlobParser):
    """Image blob parser that describes images through AlitaImageLoader."""

    def __init__(self, **kwargs):
        """Keep the optional `llm` and `prompt` keyword arguments (None when absent)."""
        self.prompt = kwargs.get('prompt')
        self.llm = kwargs.get('llm')

    def lazy_parse(self, blob: Blob) -> Iterator[Document]:
        """Yield the base parser's Documents for `blob`.

        If the base parser raises an Exception, one placeholder Document is
        yielded (after any Documents already produced) instead of propagating it.
        """
        try:
            yield from super().lazy_parse(blob)
        except Exception:
            placeholder = Document(page_content="[Image: Unknown]")
            yield placeholder

    def _analyze_image(self, img) -> str:
        """Encode `img` as PNG and return AlitaImageLoader's content for it.

        Returns the fallback text "Image: unknown" when the loader raises.
        Errors from encoding the image itself are not caught here.
        """
        from io import BytesIO

        buffer = BytesIO()
        img.save(buffer, format='PNG')
        png_bytes = buffer.getvalue()
        try:
            loader = AlitaImageLoader(
                file_content=png_bytes,
                file_name="image.png",
                prompt=self.prompt,
                llm=self.llm,
            )
            return loader.get_content()
        except Exception:
            return "Image: unknown"