alita-sdk 0.3.323__py3-none-any.whl → 0.3.324__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of alita-sdk might be problematic. Click here for more details.

@@ -16,10 +16,11 @@ from typing import Iterator
16
16
  import pandas as pd
17
17
  from json import loads
18
18
 
19
- from langchain_core.tools import ToolException
19
+ from openpyxl import load_workbook
20
20
  from langchain_core.documents import Document
21
21
  from .AlitaTableLoader import AlitaTableLoader
22
22
 
23
+ cell_delimeter = " | "
23
24
 
24
25
  class AlitaExcelLoader(AlitaTableLoader):
25
26
 
@@ -39,32 +40,65 @@ class AlitaExcelLoader(AlitaTableLoader):
39
40
 
40
41
  def get_content(self):
41
42
  try:
42
- dfs = pd.read_excel(self.file_path, sheet_name=self.sheet_name)
43
+ # Load the workbook
44
+ workbook = load_workbook(self.file_path, data_only=True) # `data_only=True` ensures we get cell values, not formulas
43
45
 
44
- if self.excel_by_sheets:
46
+ if self.sheet_name:
47
+ # If a specific sheet name is provided, parse only that sheet
48
+ if self.sheet_name in workbook.sheetnames:
49
+ sheet_content = self.parse_sheet(workbook[self.sheet_name])
50
+ return sheet_content
51
+ else:
52
+ raise ValueError(f"Sheet '{self.sheet_name}' does not exist in the workbook.")
53
+ elif self.excel_by_sheets:
54
+ # Parse each sheet individually and return as a dictionary
45
55
  result = {}
46
- for sheet_name, df in dfs.items():
47
- df.fillna('', inplace=True)
48
- result[sheet_name] = self.parse_sheet(df)
56
+ for sheet_name in workbook.sheetnames:
57
+ sheet_content = self.parse_sheet(workbook[sheet_name])
58
+ result[sheet_name] = sheet_content
49
59
  return result
50
60
  else:
61
+ # Combine all sheets into a single string result
51
62
  result = []
52
- for sheet_name, df in dfs.items():
53
- string_content = self.parse_sheet(df)
54
- result.append(f"====== Sheet name: {sheet_name} ======\n{string_content}")
63
+ for sheet_name in workbook.sheetnames:
64
+ sheet_content = self.parse_sheet(workbook[sheet_name])
65
+ result.append(f"====== Sheet name: {sheet_name} ======\n{sheet_content}")
55
66
  return "\n\n".join(result)
56
67
  except Exception as e:
57
- return ToolException(f"Error reading Excel file: {e}")
68
+ return f"Error reading Excel file: {e}"
58
69
 
59
- def parse_sheet(self, df):
60
- df.fillna('', inplace=True)
70
+ def parse_sheet(self, sheet):
71
+ """
72
+ Parses a single sheet, extracting text and hyperlinks, and formats them.
73
+ """
74
+ sheet_content = []
61
75
 
76
+ for row in sheet.iter_rows():
77
+ row_content = []
78
+ for cell in row:
79
+ if cell.hyperlink:
80
+ # If the cell has a hyperlink, format it as Markdown
81
+ hyperlink = cell.hyperlink.target
82
+ cell_value = cell.value or '' # Use cell value or empty string
83
+ row_content.append(f"[{cell_value}]({hyperlink})")
84
+ else:
85
+ # If no hyperlink, use the cell value (computed value if formula)
86
+ row_content.append(str(cell.value) if cell.value is not None else "")
87
+ # Join the row content into a single line using `|` as the delimiter
88
+ sheet_content.append(cell_delimeter.join(row_content))
89
+
90
+ # Format the sheet content based on the return type
62
91
  if self.return_type == 'dict':
63
- return df.to_dict(orient='records')
92
+ # Convert to a list of dictionaries (each row is a dictionary)
93
+ headers = sheet_content[0].split(cell_delimeter) if sheet_content else []
94
+ data_rows = sheet_content[1:] if len(sheet_content) > 1 else []
95
+ return [dict(zip(headers, row.split(cell_delimeter))) for row in data_rows]
64
96
  elif self.return_type == 'csv':
65
- return df.to_csv()
97
+ # Return as CSV (newline-separated rows, comma-separated values)
98
+ return "\n".join([",".join(row.split(cell_delimeter)) for row in sheet_content])
66
99
  else:
67
- return df.to_string(index=False)
100
+ # Default: Return as plain text (newline-separated rows, pipe-separated values)
101
+ return "\n".join(sheet_content)
68
102
 
69
103
  def load(self) -> list:
70
104
  docs = []
@@ -1,4 +1,5 @@
1
1
  import pymupdf
2
+ import fitz
2
3
  from langchain_community.document_loaders import PyPDFLoader
3
4
 
4
5
  from .ImageParser import ImageParser
@@ -43,8 +44,59 @@ class AlitaPDFLoader:
43
44
  return text_content
44
45
 
45
46
  def read_pdf_page(self, report, page, index):
46
- text_content = f'Page: {index}\n'
47
- text_content += page.get_text()
47
+ # Extract text in block format (to more accurately match hyperlinks to text)
48
+ text_blocks = page.get_text("blocks") # Returns a list of text blocks
49
+ words = page.get_text("words") # Returns words with their coordinates
50
+
51
+ # Extract hyperlinks
52
+ links = page.get_links()
53
+
54
+ # Create a list to store the modified text
55
+ modified_text = []
56
+
57
+ for block in text_blocks:
58
+ block_rect = fitz.Rect(block[:4]) # Coordinates of the text block
59
+ block_text = block[4] # The actual text of the block
60
+
61
+ # Check if there are hyperlinks intersecting with this text block
62
+ for link in links:
63
+ if "uri" in link: # Ensure this is a hyperlink
64
+ link_rect = link["from"] # Coordinates of the hyperlink area
65
+ link_uri = link["uri"] # The URL of the hyperlink
66
+
67
+ # Expand the hyperlink area slightly to account for inaccuracies
68
+ link_rect = fitz.Rect(
69
+ link_rect.x0 - 1, link_rect.y0 - 1, link_rect.x1 + 1, link_rect.y1 + 1
70
+ )
71
+
72
+ # Find words that are inside the hyperlink area
73
+ link_text = []
74
+ for word in words:
75
+ word_rect = fitz.Rect(word[:4]) # Coordinates of the word
76
+ word_text = word[4]
77
+
78
+ # Check if the word rectangle is fully inside the hyperlink rectangle
79
+ if link_rect.contains(word_rect):
80
+ link_text.append(word_text)
81
+ # If the word partially intersects, check vertical alignment
82
+ elif link_rect.intersects(word_rect):
83
+ # Condition: The word must be on the same line as the hyperlink
84
+ if abs(link_rect.y0 - word_rect.y0) < 2 and abs(link_rect.y1 - word_rect.y1) < 2:
85
+ link_text.append(word_text)
86
+
87
+ # Format the hyperlink in Markdown
88
+ full_text = " ".join(link_text) if link_text else "No text"
89
+ hyperlink = f"[{full_text}]({link_uri})"
90
+
91
+ # Replace the hyperlink text in the block with the formatted hyperlink
92
+ block_text = block_text.replace(full_text, hyperlink)
93
+
94
+ # Add the processed text block to the result
95
+ modified_text.append(block_text)
96
+
97
+ # Combine all text blocks into the final text for the page
98
+ text_content = f'Page: {index}\n' + "\n".join(modified_text)
99
+
48
100
  if self.extract_images:
49
101
  images = page.get_images(full=True)
50
102
  for i, img in enumerate(images):
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: alita_sdk
3
- Version: 0.3.323
3
+ Version: 0.3.324
4
4
  Summary: SDK for building langchain agents using resources from Alita
5
5
  Author-email: Artem Rozumenko <artyom.rozumenko@gmail.com>, Mikalai Biazruchka <mikalai_biazruchka@epam.com>, Roman Mitusov <roman_mitusov@epam.com>, Ivan Krakhmaliuk <lifedj27@gmail.com>, Artem Dubrovskiy <ad13box@gmail.com>
6
6
  License-Expression: Apache-2.0
@@ -56,13 +56,13 @@ alita_sdk/runtime/langchain/document_loaders/AlitaCSVLoader.py,sha256=3ne-a5qIkB
56
56
  alita_sdk/runtime/langchain/document_loaders/AlitaConfluenceLoader.py,sha256=NzpoL4C7UzyzLouTSL_xTQw70MitNt-WZz3Eyl7QkTA,8294
57
57
  alita_sdk/runtime/langchain/document_loaders/AlitaDirectoryLoader.py,sha256=fKezkgvIcLG7S2PVJp1a8sZd6C4XQKNZKAFC87DbQts,7003
58
58
  alita_sdk/runtime/langchain/document_loaders/AlitaDocxMammothLoader.py,sha256=9hi5eHgDIfa9wBWqTuwMM6D6W64czrDTfZl_htooe8Y,5943
59
- alita_sdk/runtime/langchain/document_loaders/AlitaExcelLoader.py,sha256=CKFL13TXCyqQa_fl6EmR6q9O9cT_w0tQzoQQFmfCpi8,3712
59
+ alita_sdk/runtime/langchain/document_loaders/AlitaExcelLoader.py,sha256=P17csHx94JkXiyo1a2V-CrfP2E5XCG4uZC31ulZ_Ab4,5817
60
60
  alita_sdk/runtime/langchain/document_loaders/AlitaGitRepoLoader.py,sha256=5WXGcyHraSVj3ANHj_U6X4EDikoekrIYtS0Q_QqNIng,2608
61
61
  alita_sdk/runtime/langchain/document_loaders/AlitaImageLoader.py,sha256=QwgBJE-BvOasjgT1hYHZc0MP0F_elirUjSzKixoM6fY,6610
62
62
  alita_sdk/runtime/langchain/document_loaders/AlitaJSONLoader.py,sha256=Nav2cgCQKOHQi_ZgYYn_iFdP_Os56KVlVR5nHGXecBc,3445
63
63
  alita_sdk/runtime/langchain/document_loaders/AlitaJiraLoader.py,sha256=M2q3YThkps0yAZOjfoLcyE7qycVTYKcXEGtpmp0N6C8,10950
64
64
  alita_sdk/runtime/langchain/document_loaders/AlitaMarkdownLoader.py,sha256=RGHDfleYTn7AAc3H-yFZrjm06L0Ux14ZtEJpFlVBNCA,2474
65
- alita_sdk/runtime/langchain/document_loaders/AlitaPDFLoader.py,sha256=toXdQbT9TuBCdB4t62t2cPalBY_2RZy2lqKSMU7YVhw,3386
65
+ alita_sdk/runtime/langchain/document_loaders/AlitaPDFLoader.py,sha256=usSrPnYQ3dDOJDdg6gBDTnBJnHiqjLxd_kvOBfRyVxY,5946
66
66
  alita_sdk/runtime/langchain/document_loaders/AlitaPowerPointLoader.py,sha256=SKAAPo3DfMtRPxICKrPzlXXkC5RfaeiRj7lejLXTi7o,2337
67
67
  alita_sdk/runtime/langchain/document_loaders/AlitaPythonLoader.py,sha256=m_7aq-aCFVb4vXZsJNinfN1hAuyy_S0ylRknv_ahxDc,340
68
68
  alita_sdk/runtime/langchain/document_loaders/AlitaQtestLoader.py,sha256=CUVVnisxm7b5yZWV6rn0Q3MEEaO1GWNcfnz5yWz8T0k,13283
@@ -349,8 +349,8 @@ alita_sdk/tools/zephyr_scale/api_wrapper.py,sha256=kT0TbmMvuKhDUZc0i7KO18O38JM9S
349
349
  alita_sdk/tools/zephyr_squad/__init__.py,sha256=0ne8XLJEQSLOWfzd2HdnqOYmQlUliKHbBED5kW_Vias,2895
350
350
  alita_sdk/tools/zephyr_squad/api_wrapper.py,sha256=kmw_xol8YIYFplBLWTqP_VKPRhL_1ItDD0_vXTe_UuI,14906
351
351
  alita_sdk/tools/zephyr_squad/zephyr_squad_cloud_client.py,sha256=R371waHsms4sllHCbijKYs90C-9Yu0sSR3N4SUfQOgU,5066
352
- alita_sdk-0.3.323.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
353
- alita_sdk-0.3.323.dist-info/METADATA,sha256=H6Stzos-D6Mw5ie9tVXbVrUWFhIZOHkJbE8Jz-dGHao,18897
354
- alita_sdk-0.3.323.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
355
- alita_sdk-0.3.323.dist-info/top_level.txt,sha256=0vJYy5p_jK6AwVb1aqXr7Kgqgk3WDtQ6t5C-XI9zkmg,10
356
- alita_sdk-0.3.323.dist-info/RECORD,,
352
+ alita_sdk-0.3.324.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
353
+ alita_sdk-0.3.324.dist-info/METADATA,sha256=uxEEUIMIOSP9WwGk_YaGjp2hDLTynd35eEWo4SPjHUc,18897
354
+ alita_sdk-0.3.324.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
355
+ alita_sdk-0.3.324.dist-info/top_level.txt,sha256=0vJYy5p_jK6AwVb1aqXr7Kgqgk3WDtQ6t5C-XI9zkmg,10
356
+ alita_sdk-0.3.324.dist-info/RECORD,,