alita-sdk 0.3.263__py3-none-any.whl → 0.3.499__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (248)
  1. alita_sdk/cli/__init__.py +10 -0
  2. alita_sdk/cli/__main__.py +17 -0
  3. alita_sdk/cli/agent/__init__.py +5 -0
  4. alita_sdk/cli/agent/default.py +258 -0
  5. alita_sdk/cli/agent_executor.py +155 -0
  6. alita_sdk/cli/agent_loader.py +215 -0
  7. alita_sdk/cli/agent_ui.py +228 -0
  8. alita_sdk/cli/agents.py +3601 -0
  9. alita_sdk/cli/callbacks.py +647 -0
  10. alita_sdk/cli/cli.py +168 -0
  11. alita_sdk/cli/config.py +306 -0
  12. alita_sdk/cli/context/__init__.py +30 -0
  13. alita_sdk/cli/context/cleanup.py +198 -0
  14. alita_sdk/cli/context/manager.py +731 -0
  15. alita_sdk/cli/context/message.py +285 -0
  16. alita_sdk/cli/context/strategies.py +289 -0
  17. alita_sdk/cli/context/token_estimation.py +127 -0
  18. alita_sdk/cli/formatting.py +182 -0
  19. alita_sdk/cli/input_handler.py +419 -0
  20. alita_sdk/cli/inventory.py +1256 -0
  21. alita_sdk/cli/mcp_loader.py +315 -0
  22. alita_sdk/cli/toolkit.py +327 -0
  23. alita_sdk/cli/toolkit_loader.py +85 -0
  24. alita_sdk/cli/tools/__init__.py +43 -0
  25. alita_sdk/cli/tools/approval.py +224 -0
  26. alita_sdk/cli/tools/filesystem.py +1751 -0
  27. alita_sdk/cli/tools/planning.py +389 -0
  28. alita_sdk/cli/tools/terminal.py +414 -0
  29. alita_sdk/community/__init__.py +64 -8
  30. alita_sdk/community/inventory/__init__.py +224 -0
  31. alita_sdk/community/inventory/config.py +257 -0
  32. alita_sdk/community/inventory/enrichment.py +2137 -0
  33. alita_sdk/community/inventory/extractors.py +1469 -0
  34. alita_sdk/community/inventory/ingestion.py +3172 -0
  35. alita_sdk/community/inventory/knowledge_graph.py +1457 -0
  36. alita_sdk/community/inventory/parsers/__init__.py +218 -0
  37. alita_sdk/community/inventory/parsers/base.py +295 -0
  38. alita_sdk/community/inventory/parsers/csharp_parser.py +907 -0
  39. alita_sdk/community/inventory/parsers/go_parser.py +851 -0
  40. alita_sdk/community/inventory/parsers/html_parser.py +389 -0
  41. alita_sdk/community/inventory/parsers/java_parser.py +593 -0
  42. alita_sdk/community/inventory/parsers/javascript_parser.py +629 -0
  43. alita_sdk/community/inventory/parsers/kotlin_parser.py +768 -0
  44. alita_sdk/community/inventory/parsers/markdown_parser.py +362 -0
  45. alita_sdk/community/inventory/parsers/python_parser.py +604 -0
  46. alita_sdk/community/inventory/parsers/rust_parser.py +858 -0
  47. alita_sdk/community/inventory/parsers/swift_parser.py +832 -0
  48. alita_sdk/community/inventory/parsers/text_parser.py +322 -0
  49. alita_sdk/community/inventory/parsers/yaml_parser.py +370 -0
  50. alita_sdk/community/inventory/patterns/__init__.py +61 -0
  51. alita_sdk/community/inventory/patterns/ast_adapter.py +380 -0
  52. alita_sdk/community/inventory/patterns/loader.py +348 -0
  53. alita_sdk/community/inventory/patterns/registry.py +198 -0
  54. alita_sdk/community/inventory/presets.py +535 -0
  55. alita_sdk/community/inventory/retrieval.py +1403 -0
  56. alita_sdk/community/inventory/toolkit.py +173 -0
  57. alita_sdk/community/inventory/visualize.py +1370 -0
  58. alita_sdk/configurations/__init__.py +10 -0
  59. alita_sdk/configurations/ado.py +4 -2
  60. alita_sdk/configurations/azure_search.py +1 -1
  61. alita_sdk/configurations/bigquery.py +1 -1
  62. alita_sdk/configurations/bitbucket.py +94 -2
  63. alita_sdk/configurations/browser.py +18 -0
  64. alita_sdk/configurations/carrier.py +19 -0
  65. alita_sdk/configurations/confluence.py +96 -1
  66. alita_sdk/configurations/delta_lake.py +1 -1
  67. alita_sdk/configurations/figma.py +0 -5
  68. alita_sdk/configurations/github.py +65 -1
  69. alita_sdk/configurations/gitlab.py +79 -0
  70. alita_sdk/configurations/google_places.py +17 -0
  71. alita_sdk/configurations/jira.py +103 -0
  72. alita_sdk/configurations/postman.py +1 -1
  73. alita_sdk/configurations/qtest.py +1 -3
  74. alita_sdk/configurations/report_portal.py +19 -0
  75. alita_sdk/configurations/salesforce.py +19 -0
  76. alita_sdk/configurations/service_now.py +1 -12
  77. alita_sdk/configurations/sharepoint.py +19 -0
  78. alita_sdk/configurations/sonar.py +18 -0
  79. alita_sdk/configurations/sql.py +20 -0
  80. alita_sdk/configurations/testio.py +18 -0
  81. alita_sdk/configurations/testrail.py +88 -0
  82. alita_sdk/configurations/xray.py +94 -1
  83. alita_sdk/configurations/zephyr_enterprise.py +94 -1
  84. alita_sdk/configurations/zephyr_essential.py +95 -0
  85. alita_sdk/runtime/clients/artifact.py +12 -2
  86. alita_sdk/runtime/clients/client.py +235 -66
  87. alita_sdk/runtime/clients/mcp_discovery.py +342 -0
  88. alita_sdk/runtime/clients/mcp_manager.py +262 -0
  89. alita_sdk/runtime/clients/sandbox_client.py +373 -0
  90. alita_sdk/runtime/langchain/assistant.py +123 -17
  91. alita_sdk/runtime/langchain/constants.py +8 -1
  92. alita_sdk/runtime/langchain/document_loaders/AlitaDocxMammothLoader.py +315 -3
  93. alita_sdk/runtime/langchain/document_loaders/AlitaExcelLoader.py +209 -31
  94. alita_sdk/runtime/langchain/document_loaders/AlitaImageLoader.py +1 -1
  95. alita_sdk/runtime/langchain/document_loaders/AlitaJSONLoader.py +8 -2
  96. alita_sdk/runtime/langchain/document_loaders/AlitaMarkdownLoader.py +66 -0
  97. alita_sdk/runtime/langchain/document_loaders/AlitaPDFLoader.py +79 -10
  98. alita_sdk/runtime/langchain/document_loaders/AlitaPowerPointLoader.py +52 -15
  99. alita_sdk/runtime/langchain/document_loaders/AlitaPythonLoader.py +9 -0
  100. alita_sdk/runtime/langchain/document_loaders/AlitaTableLoader.py +1 -4
  101. alita_sdk/runtime/langchain/document_loaders/AlitaTextLoader.py +15 -2
  102. alita_sdk/runtime/langchain/document_loaders/ImageParser.py +30 -0
  103. alita_sdk/runtime/langchain/document_loaders/constants.py +187 -40
  104. alita_sdk/runtime/langchain/interfaces/llm_processor.py +4 -2
  105. alita_sdk/runtime/langchain/langraph_agent.py +406 -91
  106. alita_sdk/runtime/langchain/utils.py +51 -8
  107. alita_sdk/runtime/llms/preloaded.py +2 -6
  108. alita_sdk/runtime/models/mcp_models.py +61 -0
  109. alita_sdk/runtime/toolkits/__init__.py +26 -0
  110. alita_sdk/runtime/toolkits/application.py +9 -2
  111. alita_sdk/runtime/toolkits/artifact.py +19 -7
  112. alita_sdk/runtime/toolkits/datasource.py +13 -6
  113. alita_sdk/runtime/toolkits/mcp.py +780 -0
  114. alita_sdk/runtime/toolkits/planning.py +178 -0
  115. alita_sdk/runtime/toolkits/subgraph.py +11 -6
  116. alita_sdk/runtime/toolkits/tools.py +214 -60
  117. alita_sdk/runtime/toolkits/vectorstore.py +9 -4
  118. alita_sdk/runtime/tools/__init__.py +22 -0
  119. alita_sdk/runtime/tools/application.py +16 -4
  120. alita_sdk/runtime/tools/artifact.py +312 -19
  121. alita_sdk/runtime/tools/function.py +100 -4
  122. alita_sdk/runtime/tools/graph.py +81 -0
  123. alita_sdk/runtime/tools/image_generation.py +212 -0
  124. alita_sdk/runtime/tools/llm.py +539 -180
  125. alita_sdk/runtime/tools/mcp_inspect_tool.py +284 -0
  126. alita_sdk/runtime/tools/mcp_remote_tool.py +181 -0
  127. alita_sdk/runtime/tools/mcp_server_tool.py +3 -1
  128. alita_sdk/runtime/tools/planning/__init__.py +36 -0
  129. alita_sdk/runtime/tools/planning/models.py +246 -0
  130. alita_sdk/runtime/tools/planning/wrapper.py +607 -0
  131. alita_sdk/runtime/tools/router.py +2 -1
  132. alita_sdk/runtime/tools/sandbox.py +375 -0
  133. alita_sdk/runtime/tools/vectorstore.py +62 -63
  134. alita_sdk/runtime/tools/vectorstore_base.py +156 -85
  135. alita_sdk/runtime/utils/AlitaCallback.py +106 -20
  136. alita_sdk/runtime/utils/mcp_client.py +465 -0
  137. alita_sdk/runtime/utils/mcp_oauth.py +244 -0
  138. alita_sdk/runtime/utils/mcp_sse_client.py +405 -0
  139. alita_sdk/runtime/utils/mcp_tools_discovery.py +124 -0
  140. alita_sdk/runtime/utils/streamlit.py +41 -14
  141. alita_sdk/runtime/utils/toolkit_utils.py +28 -9
  142. alita_sdk/runtime/utils/utils.py +14 -0
  143. alita_sdk/tools/__init__.py +78 -35
  144. alita_sdk/tools/ado/__init__.py +0 -1
  145. alita_sdk/tools/ado/repos/__init__.py +10 -6
  146. alita_sdk/tools/ado/repos/repos_wrapper.py +12 -11
  147. alita_sdk/tools/ado/test_plan/__init__.py +10 -7
  148. alita_sdk/tools/ado/test_plan/test_plan_wrapper.py +56 -23
  149. alita_sdk/tools/ado/wiki/__init__.py +10 -11
  150. alita_sdk/tools/ado/wiki/ado_wrapper.py +114 -28
  151. alita_sdk/tools/ado/work_item/__init__.py +10 -11
  152. alita_sdk/tools/ado/work_item/ado_wrapper.py +63 -10
  153. alita_sdk/tools/advanced_jira_mining/__init__.py +10 -7
  154. alita_sdk/tools/aws/delta_lake/__init__.py +13 -11
  155. alita_sdk/tools/azure_ai/search/__init__.py +11 -7
  156. alita_sdk/tools/base_indexer_toolkit.py +392 -86
  157. alita_sdk/tools/bitbucket/__init__.py +18 -11
  158. alita_sdk/tools/bitbucket/api_wrapper.py +52 -9
  159. alita_sdk/tools/bitbucket/cloud_api_wrapper.py +5 -5
  160. alita_sdk/tools/browser/__init__.py +40 -16
  161. alita_sdk/tools/browser/crawler.py +3 -1
  162. alita_sdk/tools/browser/utils.py +15 -6
  163. alita_sdk/tools/carrier/__init__.py +17 -17
  164. alita_sdk/tools/carrier/backend_reports_tool.py +8 -4
  165. alita_sdk/tools/carrier/excel_reporter.py +8 -4
  166. alita_sdk/tools/chunkers/__init__.py +3 -1
  167. alita_sdk/tools/chunkers/code/codeparser.py +1 -1
  168. alita_sdk/tools/chunkers/sematic/json_chunker.py +1 -0
  169. alita_sdk/tools/chunkers/sematic/markdown_chunker.py +97 -6
  170. alita_sdk/tools/chunkers/sematic/proposal_chunker.py +1 -1
  171. alita_sdk/tools/chunkers/universal_chunker.py +270 -0
  172. alita_sdk/tools/cloud/aws/__init__.py +9 -6
  173. alita_sdk/tools/cloud/azure/__init__.py +9 -6
  174. alita_sdk/tools/cloud/gcp/__init__.py +9 -6
  175. alita_sdk/tools/cloud/k8s/__init__.py +9 -6
  176. alita_sdk/tools/code/linter/__init__.py +7 -7
  177. alita_sdk/tools/code/loaders/codesearcher.py +3 -2
  178. alita_sdk/tools/code/sonar/__init__.py +18 -12
  179. alita_sdk/tools/code_indexer_toolkit.py +199 -0
  180. alita_sdk/tools/confluence/__init__.py +14 -11
  181. alita_sdk/tools/confluence/api_wrapper.py +198 -58
  182. alita_sdk/tools/confluence/loader.py +10 -0
  183. alita_sdk/tools/custom_open_api/__init__.py +9 -4
  184. alita_sdk/tools/elastic/__init__.py +8 -7
  185. alita_sdk/tools/elitea_base.py +543 -64
  186. alita_sdk/tools/figma/__init__.py +10 -8
  187. alita_sdk/tools/figma/api_wrapper.py +352 -153
  188. alita_sdk/tools/github/__init__.py +13 -11
  189. alita_sdk/tools/github/api_wrapper.py +9 -26
  190. alita_sdk/tools/github/github_client.py +75 -12
  191. alita_sdk/tools/github/schemas.py +2 -1
  192. alita_sdk/tools/gitlab/__init__.py +11 -10
  193. alita_sdk/tools/gitlab/api_wrapper.py +135 -45
  194. alita_sdk/tools/gitlab_org/__init__.py +11 -9
  195. alita_sdk/tools/google/bigquery/__init__.py +12 -13
  196. alita_sdk/tools/google_places/__init__.py +18 -10
  197. alita_sdk/tools/jira/__init__.py +14 -8
  198. alita_sdk/tools/jira/api_wrapper.py +315 -168
  199. alita_sdk/tools/keycloak/__init__.py +8 -7
  200. alita_sdk/tools/localgit/local_git.py +56 -54
  201. alita_sdk/tools/memory/__init__.py +27 -11
  202. alita_sdk/tools/non_code_indexer_toolkit.py +7 -2
  203. alita_sdk/tools/ocr/__init__.py +8 -7
  204. alita_sdk/tools/openapi/__init__.py +10 -1
  205. alita_sdk/tools/pandas/__init__.py +8 -7
  206. alita_sdk/tools/pandas/api_wrapper.py +7 -25
  207. alita_sdk/tools/postman/__init__.py +8 -10
  208. alita_sdk/tools/postman/api_wrapper.py +19 -8
  209. alita_sdk/tools/postman/postman_analysis.py +8 -1
  210. alita_sdk/tools/pptx/__init__.py +8 -9
  211. alita_sdk/tools/qtest/__init__.py +19 -13
  212. alita_sdk/tools/qtest/api_wrapper.py +1784 -88
  213. alita_sdk/tools/rally/__init__.py +10 -9
  214. alita_sdk/tools/report_portal/__init__.py +20 -15
  215. alita_sdk/tools/salesforce/__init__.py +19 -15
  216. alita_sdk/tools/servicenow/__init__.py +14 -11
  217. alita_sdk/tools/sharepoint/__init__.py +14 -13
  218. alita_sdk/tools/sharepoint/api_wrapper.py +179 -39
  219. alita_sdk/tools/sharepoint/authorization_helper.py +191 -1
  220. alita_sdk/tools/sharepoint/utils.py +8 -2
  221. alita_sdk/tools/slack/__init__.py +10 -7
  222. alita_sdk/tools/sql/__init__.py +19 -18
  223. alita_sdk/tools/sql/api_wrapper.py +71 -23
  224. alita_sdk/tools/testio/__init__.py +18 -12
  225. alita_sdk/tools/testrail/__init__.py +10 -10
  226. alita_sdk/tools/testrail/api_wrapper.py +213 -45
  227. alita_sdk/tools/utils/__init__.py +28 -4
  228. alita_sdk/tools/utils/content_parser.py +181 -61
  229. alita_sdk/tools/utils/text_operations.py +254 -0
  230. alita_sdk/tools/vector_adapters/VectorStoreAdapter.py +83 -27
  231. alita_sdk/tools/xray/__init__.py +12 -7
  232. alita_sdk/tools/xray/api_wrapper.py +58 -113
  233. alita_sdk/tools/zephyr/__init__.py +9 -6
  234. alita_sdk/tools/zephyr_enterprise/__init__.py +13 -8
  235. alita_sdk/tools/zephyr_enterprise/api_wrapper.py +17 -7
  236. alita_sdk/tools/zephyr_essential/__init__.py +13 -9
  237. alita_sdk/tools/zephyr_essential/api_wrapper.py +289 -47
  238. alita_sdk/tools/zephyr_essential/client.py +6 -4
  239. alita_sdk/tools/zephyr_scale/__init__.py +10 -7
  240. alita_sdk/tools/zephyr_scale/api_wrapper.py +6 -2
  241. alita_sdk/tools/zephyr_squad/__init__.py +9 -6
  242. {alita_sdk-0.3.263.dist-info → alita_sdk-0.3.499.dist-info}/METADATA +180 -33
  243. alita_sdk-0.3.499.dist-info/RECORD +433 -0
  244. alita_sdk-0.3.499.dist-info/entry_points.txt +2 -0
  245. alita_sdk-0.3.263.dist-info/RECORD +0 -342
  246. {alita_sdk-0.3.263.dist-info → alita_sdk-0.3.499.dist-info}/WHEEL +0 -0
  247. {alita_sdk-0.3.263.dist-info → alita_sdk-0.3.499.dist-info}/licenses/LICENSE +0 -0
  248. {alita_sdk-0.3.263.dist-info → alita_sdk-0.3.499.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,66 @@
1
+ from pathlib import Path
2
+ from typing import Any, List, Union, Generator, Iterator
3
+ from langchain_core.documents import Document
4
+
5
+ from langchain_community.document_loaders.unstructured import (
6
+ UnstructuredFileLoader,
7
+ validate_unstructured_version,
8
+ )
9
+
10
class AlitaMarkdownLoader(UnstructuredFileLoader):
    """Markdown file loader that splits content with ``markdown_chunker``.

    The file is read as UTF-8 text, wrapped in a single ``Document`` and
    passed to ``markdown_chunker`` together with ``chunker_config``; the
    chunks it produces are what ``lazy_load`` yields.
    """

    def __init__(
        self,
        file_path: Union[str, Path],
        mode: str = "elements",
        chunker_config: Union[dict, None] = None,
        **unstructured_kwargs: Any,
    ):
        """
        Args:
            file_path: The path to the Markdown file to load.
            mode: Loading mode, forwarded unchanged to
                ``UnstructuredFileLoader``. Default is "elements". The
                ``lazy_load`` defined on this class does not branch on it.
            chunker_config: Configuration dictionary for the markdown chunker.
                When omitted (or empty), the default configuration below is
                used instead.
            **unstructured_kwargs: Any kwargs to pass to the unstructured.
        """
        file_path = str(file_path)
        validate_unstructured_version("0.4.16")
        # `or` means a falsy value (None or an empty dict) selects the defaults.
        self.chunker_config = chunker_config or {
            "strip_header": False,
            "return_each_line": False,
            "headers_to_split_on": [],
            "max_tokens": 512,
            "token_overlap": 10,
        }
        super().__init__(file_path=file_path, mode=mode, **unstructured_kwargs)

    def _file_content_generator(self) -> Generator[Document, None, None]:
        """
        Creates a generator that yields a single Document object
        representing the entire content of the Markdown file.

        The file is opened as UTF-8 text; the Document's metadata carries the
        file path under the "source" key.
        """
        with open(self.file_path, "r", encoding="utf-8") as file:
            content = file.read()
        yield Document(page_content=content, metadata={"source": self.file_path})

    def _get_elements(self) -> List[Document]:
        """
        Processes the Markdown file using the markdown_chunker and returns the chunks.

        Returns:
            The chunks produced by ``markdown_chunker``, materialized as a list.
        """
        # Imported at call time rather than at module import time.
        from alita_sdk.tools.chunkers.sematic.markdown_chunker import markdown_chunker

        # Create a generator for the file content
        file_content_generator = self._file_content_generator()

        # Use the markdown_chunker to process the content
        chunks = markdown_chunker(file_content_generator, config=self.chunker_config)

        # Convert the generator to a list of Document objects
        return list(chunks)

    def lazy_load(self) -> Iterator[Document]:
        """Load the file and yield its chunks one by one."""
        elements = self._get_elements()
        # The return value is not used; the list passed in is what gets yielded.
        self._post_process_elements(elements)
        yield from elements
@@ -1,5 +1,8 @@
1
1
  import pymupdf
2
- from langchain_community.document_loaders import PyPDFLoader
2
+ import fitz
3
+ from langchain_community.document_loaders import PyPDFium2Loader
4
+
5
+ from .ImageParser import ImageParser
3
6
  from .utils import perform_llm_prediction_for_image_bytes, create_temp_file
4
7
  from langchain_core.tools import ToolException
5
8
 
@@ -20,6 +23,7 @@ class AlitaPDFLoader:
20
23
  self.headers = kwargs.get('headers', None)
21
24
  self.extraction_mode = kwargs.get('extraction_mode', "plain")
22
25
  self.extraction_kwargs = kwargs.get('extraction_kwargs', None)
26
+ self.images_parser=ImageParser(llm=self.llm, prompt=self.prompt)
23
27
 
24
28
  def get_content(self):
25
29
  if hasattr(self, 'file_path'):
@@ -41,8 +45,59 @@ class AlitaPDFLoader:
41
45
  return text_content
42
46
 
43
47
  def read_pdf_page(self, report, page, index):
44
- text_content = f'Page: {index}\n'
45
- text_content += page.get_text()
48
+ # Extract text in block format (to more accurately match hyperlinks to text)
49
+ text_blocks = page.get_text("blocks") # Returns a list of text blocks
50
+ words = page.get_text("words") # Returns words with their coordinates
51
+
52
+ # Extract hyperlinks
53
+ links = page.get_links()
54
+
55
+ # Create a list to store the modified text
56
+ modified_text = []
57
+
58
+ for block in text_blocks:
59
+ block_rect = fitz.Rect(block[:4]) # Coordinates of the text block
60
+ block_text = block[4] # The actual text of the block
61
+
62
+ # Check if there are hyperlinks intersecting with this text block
63
+ for link in links:
64
+ if "uri" in link: # Ensure this is a hyperlink
65
+ link_rect = link["from"] # Coordinates of the hyperlink area
66
+ link_uri = link["uri"] # The URL of the hyperlink
67
+
68
+ # Expand the hyperlink area slightly to account for inaccuracies
69
+ link_rect = fitz.Rect(
70
+ link_rect.x0 - 1, link_rect.y0 - 1, link_rect.x1 + 1, link_rect.y1 + 1
71
+ )
72
+
73
+ # Find words that are inside the hyperlink area
74
+ link_text = []
75
+ for word in words:
76
+ word_rect = fitz.Rect(word[:4]) # Coordinates of the word
77
+ word_text = word[4]
78
+
79
+ # Check if the word rectangle is fully inside the hyperlink rectangle
80
+ if link_rect.contains(word_rect):
81
+ link_text.append(word_text)
82
+ # If the word partially intersects, check vertical alignment
83
+ elif link_rect.intersects(word_rect):
84
+ # Condition: The word must be on the same line as the hyperlink
85
+ if abs(link_rect.y0 - word_rect.y0) < 2 and abs(link_rect.y1 - word_rect.y1) < 2:
86
+ link_text.append(word_text)
87
+
88
+ # Format the hyperlink in Markdown
89
+ full_text = " ".join(link_text) if link_text else "No text"
90
+ hyperlink = f"[{full_text}]({link_uri})"
91
+
92
+ # Replace the hyperlink text in the block with the formatted hyperlink
93
+ block_text = block_text.replace(full_text, hyperlink)
94
+
95
+ # Add the processed text block to the result
96
+ modified_text.append(block_text)
97
+
98
+ # Combine all text blocks into the final text for the page
99
+ text_content = f'Page: {index}\n' + "\n".join(modified_text)
100
+
46
101
  if self.extract_images:
47
102
  images = page.get_images(full=True)
48
103
  for i, img in enumerate(images):
@@ -54,10 +109,24 @@ class AlitaPDFLoader:
54
109
 
55
110
  def load(self):
56
111
  if not hasattr(self, 'file_path'):
57
- self.file_path = create_temp_file(self.file_content)
58
- return PyPDFLoader(file_path=self.file_path,
59
- password=self.password,
60
- headers=self.headers,
61
- extract_images=self.extract_images,
62
- extraction_mode=self.extraction_mode,
63
- extraction_kwargs=self.extraction_kwargs).load()
112
+ import tempfile
113
+
114
+ with tempfile.NamedTemporaryFile(mode='w+b', delete=True, suffix=".pdf") as temp_file:
115
+ temp_file.write(self.file_content)
116
+ temp_file.flush()
117
+ self.file_path = temp_file.name
118
+ return self._load_docs()
119
+ else:
120
+ return self._load_docs()
121
+
122
def _load_docs(self):
    """Load the PDF at ``self.file_path`` and tag every document with a chunk id.

    A ``PyPDFium2Loader`` is built from this loader's ``file_path``,
    ``password``, ``headers`` and ``extract_images`` settings plus a fresh
    ``ImageParser`` created from ``self.llm`` and ``self.prompt``.

    Returns:
        The documents returned by ``PyPDFium2Loader.load()``, each with
        ``metadata['chunk_id']`` set to the value of its ``metadata['page']``.
    """
    pdf_loader = PyPDFium2Loader(
        file_path=self.file_path,
        password=self.password,
        headers=self.headers,
        extract_images=self.extract_images,
        images_parser=ImageParser(llm=self.llm, prompt=self.prompt),
    )
    documents = pdf_loader.load()
    for document in documents:
        # Mirror the page value under 'chunk_id'.
        document.metadata['chunk_id'] = document.metadata['page']
    return documents
@@ -1,10 +1,10 @@
1
1
  import io
2
2
 
3
- from langchain_community.document_loaders import UnstructuredPowerPointLoader
4
3
  from langchain_core.tools import ToolException
5
4
  from pptx import Presentation
6
5
  from .utils import perform_llm_prediction_for_image_bytes, create_temp_file
7
6
  from pptx.enum.shapes import MSO_SHAPE_TYPE
7
+ from langchain_core.documents import Document
8
8
 
9
9
  class AlitaPowerPointLoader:
10
10
 
@@ -22,33 +22,70 @@ class AlitaPowerPointLoader:
22
22
  self.extract_images = unstructured_kwargs.get('extract_images', False)
23
23
  self.llm = unstructured_kwargs.get('llm', None)
24
24
  self.prompt = unstructured_kwargs.get('prompt', "Describe image")
25
+ self.pages_per_chunk = unstructured_kwargs.get('pages_per_chunk', 5)
25
26
 
26
27
  def get_content(self):
27
- prs = Presentation(io.BytesIO(self.file_content))
28
- text_content = ''
28
+ if hasattr(self, 'file_path'):
29
+ with open(self.file_path, 'rb') as f:
30
+ prs = Presentation(f)
31
+ elif hasattr(self, 'file_content'):
32
+ prs = Presentation(io.BytesIO(self.file_content))
33
+ pages = []
29
34
  if self.page_number is not None:
30
- text_content += self.read_pptx_slide(prs.slides[self.page_number - 1], self.page_number)
35
+ pages.append(self.read_pptx_slide(prs.slides[self.page_number - 1], self.page_number))
31
36
  else:
32
37
  for index, slide in enumerate(prs.slides, start=1):
33
- text_content += self.read_pptx_slide(slide, index)
34
- return text_content
38
+ pages.append(self.read_pptx_slide(slide, index))
39
+ if self.mode == 'single':
40
+ return "\n".join(pages)
41
+ if self.mode == 'paged':
42
+ return pages
43
+ else:
44
+ raise ToolException(f"Unknown mode value: {self.mode}. Only 'single', 'paged' values allowed.")
35
45
 
36
46
  def read_pptx_slide(self, slide, index):
37
47
  text_content = f'Slide: {index}\n'
38
48
  for shape in slide.shapes:
39
- if hasattr(shape, "text"):
40
- text_content += shape.text + "\n"
49
+ if hasattr(shape, "text_frame") and shape.text_frame is not None:
50
+ for paragraph in shape.text_frame.paragraphs:
51
+ for run in paragraph.runs:
52
+ if run.hyperlink and run.hyperlink.address:
53
+ link_text = run.text.strip() or "Link"
54
+ link_url = run.hyperlink.address
55
+ text_content += f" [{link_text}]({link_url}) "
56
+ else:
57
+ text_content += run.text
58
+ text_content += "\n"
41
59
  elif self.extract_images and shape.shape_type == MSO_SHAPE_TYPE.PICTURE:
42
60
  try:
43
- caption = perform_llm_prediction_for_image_bytes(shape.image.blob, self.llm)
61
+ caption = perform_llm_prediction_for_image_bytes(shape.image.blob, self.llm, self.prompt)
44
62
  except:
45
63
  caption = "unknown"
46
64
  text_content += "\n**Image Transcript:**\n" + caption + "\n--------------------\n"
47
- return text_content
65
+ return text_content + "\n"
48
66
 
49
67
  def load(self):
50
- if not self.file_path:
51
- self.file_path = create_temp_file(self.file_content)
52
- return UnstructuredPowerPointLoader(file_path=self.file_path,
53
- mode=self.mode,
54
- **self.unstructured_kwargs).load()
68
+ content = self.get_content()
69
+ if isinstance(content, str):
70
+ yield Document(page_content=content, metadata={})
71
+ elif isinstance(content, list):
72
+ chunk = []
73
+ chunk_count = 0
74
+ for page_number, page in enumerate(content, start=1):
75
+ chunk.append(page)
76
+ if len(chunk) == self.pages_per_chunk:
77
+ chunk_content = "\n".join(chunk)
78
+ yield Document(
79
+ page_content=chunk_content,
80
+ metadata={"chunk_number": chunk_count + 1,
81
+ "pages_in_chunk": list(range(page_number - len(chunk) + 1, page_number + 1))}
82
+ )
83
+ chunk = []
84
+ chunk_count += 1
85
+ if chunk:
86
+ chunk_content = "\n".join(chunk)
87
+ yield Document(
88
+ page_content=chunk_content,
89
+ metadata={"chunk_number": chunk_count + 1,
90
+ "pages_in_chunk": list(range(len(content) - len(chunk) + 1, len(content) + 1))}
91
+ )
@@ -0,0 +1,9 @@
1
+ from pathlib import Path
2
+ from typing import Union
3
+ from langchain_community.document_loaders.python import PythonLoader
4
+
5
class AlitaPythonLoader(PythonLoader):
    """Thin ``PythonLoader`` subclass whose constructor tolerates extra kwargs.

    Loading behaviour, including any encoding handling, is whatever
    ``PythonLoader`` provides.
    """

    def __init__(self, file_path: Union[str, Path], **kwargs):
        """Initialize the loader for ``file_path``.

        Args:
            file_path: Path of the Python file to load.
            **kwargs: Accepted but not forwarded to ``PythonLoader``.
        """
        # Only the path reaches the parent constructor; kwargs are dropped.
        super().__init__(file_path)
@@ -17,8 +17,6 @@ from langchain_core.documents import Document
17
17
  from typing import List, Optional, Iterator
18
18
  from json import dumps
19
19
  from .utils import cleanse_data
20
- from ..tools.log import print_log
21
-
22
20
 
23
21
  class AlitaTableLoader(BaseLoader):
24
22
  def __init__(self,
@@ -65,7 +63,7 @@ class AlitaTableLoader(BaseLoader):
65
63
  "source": f'{self.file_path}:{idx+1}',
66
64
  "table_source": self.file_path,
67
65
  }
68
- if len(docs) == 0:
66
+ if len(docs) == 0 and not self.raw_content:
69
67
  header_metadata = metadata.copy()
70
68
  header_metadata["header"] = "true"
71
69
  header = "\t".join([str(value) for value in row.keys()])
@@ -74,7 +72,6 @@ class AlitaTableLoader(BaseLoader):
74
72
  docs.append(Document(page_content=row, metadata=metadata))
75
73
  continue
76
74
  if self.json_documents:
77
- # print_log(row)
78
75
  metadata['columns'] = list(row.keys())
79
76
  metadata['og_data'] = dumps(row)
80
77
  docs.append(Document(page_content=self.row_processor(row), metadata=metadata))
@@ -1,4 +1,4 @@
1
- from typing import Iterator
1
+ from typing import Iterator, Generator
2
2
 
3
3
  from langchain_core.documents import Document
4
4
 
@@ -6,6 +6,9 @@ from langchain_community.document_loaders.base import BaseLoader
6
6
  from langchain_community.document_loaders.helpers import detect_file_encodings
7
7
  from langchain_core.tools import ToolException
8
8
 
9
+ from alita_sdk.tools.chunkers import markdown_chunker
10
+
11
+
9
12
  class AlitaTextLoader(BaseLoader):
10
13
 
11
14
  def __init__(self, **kwargs):
@@ -19,6 +22,8 @@ class AlitaTextLoader(BaseLoader):
19
22
  raise ToolException("'file_path' or 'file_content' parameter should be provided.")
20
23
  self.encoding = kwargs.get('encoding', 'utf-8')
21
24
  self.autodetect_encoding = kwargs.get('autodetect_encoding', False)
25
+ self.max_tokens=kwargs.get('max_tokens', 1024)
26
+ self.token_overlap = kwargs.get('token_overlap', 10)
22
27
 
23
28
  def get_content(self):
24
29
  text = ""
@@ -59,8 +64,16 @@ class AlitaTextLoader(BaseLoader):
59
64
 
60
65
  return text
61
66
 
67
def generate_document(self, text, metadata) -> Generator[Document, None, None]:
    """Yield exactly one ``Document`` wrapping the given text and metadata.

    Args:
        text: Value used as the document's ``page_content``.
        metadata: Value used as the document's ``metadata``.
    """
    document = Document(page_content=text, metadata=metadata)
    yield document
69
+
62
70
  def lazy_load(self) -> Iterator[Document]:
63
71
  """Load from file path."""
64
72
  text = self.get_content()
65
73
  metadata = {"source": str(self.file_path) if hasattr(self, 'file_path') else self.file_name}
66
- yield Document(page_content=text, metadata=metadata)
74
+ chunks = markdown_chunker(file_content_generator=self.generate_document(text, metadata),
75
+ config={
76
+ "max_tokens": self.max_tokens,
77
+ "token_overlap": self.token_overlap
78
+ })
79
+ yield from chunks
@@ -0,0 +1,30 @@
1
+ from typing import Iterator
2
+
3
+ from langchain_community.document_loaders.parsers.images import BaseImageBlobParser
4
+ from langchain_core.documents import Document
5
+ from langchain_core.documents.base import Blob
6
+
7
+ from alita_sdk.runtime.langchain.document_loaders.AlitaImageLoader import AlitaImageLoader
8
+
9
class ImageParser(BaseImageBlobParser):
    """Image blob parser that obtains image text through ``AlitaImageLoader``.

    Failures are turned into placeholder text instead of being raised.
    """

    def __init__(self, **kwargs):
        """Store the optional ``llm`` and ``prompt`` keyword arguments.

        Either attribute is ``None`` when the keyword is not supplied.
        """
        self.prompt = kwargs.get('prompt')
        self.llm = kwargs.get('llm')

    def lazy_parse(self, blob: Blob) -> Iterator[Document]:
        """Delegate to the parent parser, yielding a placeholder on failure.

        If the parent's ``lazy_parse`` raises an ``Exception``, a single
        Document with the content "[Image: Unknown]" is yielded instead.
        """
        try:
            yield from super().lazy_parse(blob)
        except Exception:
            yield Document(page_content="[Image: Unknown]")

    def _analyze_image(self, img) -> str:
        """Return ``AlitaImageLoader``'s content for ``img``.

        The image is saved as PNG into an in-memory buffer and the resulting
        bytes are handed to ``AlitaImageLoader`` along with the stored prompt
        and llm. Any ``Exception`` from the loader yields "Image: unknown".
        Saving the image happens outside that guard.
        """
        from io import BytesIO

        buffer = BytesIO()
        img.save(buffer, format='PNG')
        png_bytes = buffer.getvalue()
        try:
            loader = AlitaImageLoader(
                file_content=png_bytes,
                file_name="image.png",
                prompt=self.prompt,
                llm=self.llm,
            )
            return loader.get_content()
        except Exception:
            return "Image: unknown"
+ return "Image: unknown"