PyPI - alita-sdk - Versions diffs - 0.3.323__py3-none-any.whl → 0.3.324__py3-none-any.whl - Mend

alita-sdk 0.3.323__py3-none-any.whl → 0.3.324__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of alita-sdk might be problematic. Click here for more details.

Files changed (7)

alita_sdk/runtime/langchain/document_loaders/AlitaExcelLoader.py CHANGED Viewed

@@ -16,10 +16,11 @@ from typing import Iterator
 import pandas as pd
 from json import loads
-from langchain_core.tools import ToolException
+from openpyxl import load_workbook
 from langchain_core.documents import Document
 from .AlitaTableLoader import AlitaTableLoader
+cell_delimeter = " | "
 class AlitaExcelLoader(AlitaTableLoader):
@@ -39,32 +40,65 @@ class AlitaExcelLoader(AlitaTableLoader):
     def get_content(self):
         try:
-            dfs = pd.read_excel(self.file_path, sheet_name=self.sheet_name)
+            # Load the workbook
+            workbook = load_workbook(self.file_path, data_only=True)  # `data_only=True` ensures we get cell values, not formulas
-            if self.excel_by_sheets:
+            if self.sheet_name:
+                # If a specific sheet name is provided, parse only that sheet
+                if self.sheet_name in workbook.sheetnames:
+                    sheet_content = self.parse_sheet(workbook[self.sheet_name])
+                    return sheet_content
+                else:
+                    raise ValueError(f"Sheet '{self.sheet_name}' does not exist in the workbook.")
+            elif self.excel_by_sheets:
+                # Parse each sheet individually and return as a dictionary
                 result = {}
-                for sheet_name, df in dfs.items():
-                    df.fillna('', inplace=True)
-                    result[sheet_name] = self.parse_sheet(df)
+                for sheet_name in workbook.sheetnames:
+                    sheet_content = self.parse_sheet(workbook[sheet_name])
+                    result[sheet_name] = sheet_content
                 return result
             else:
+                # Combine all sheets into a single string result
                 result = []
-                for sheet_name, df in dfs.items():
-                    string_content = self.parse_sheet(df)
-                    result.append(f"====== Sheet name: {sheet_name} ======\n{string_content}")
+                for sheet_name in workbook.sheetnames:
+                    sheet_content = self.parse_sheet(workbook[sheet_name])
+                    result.append(f"====== Sheet name: {sheet_name} ======\n{sheet_content}")
                 return "\n\n".join(result)
         except Exception as e:
-            return ToolException(f"Error reading Excel file: {e}")
+            return f"Error reading Excel file: {e}"
-    def parse_sheet(self, df):
-        df.fillna('', inplace=True)
+    def parse_sheet(self, sheet):
+        """
+        Parses a single sheet, extracting text and hyperlinks, and formats them.
+        """
+        sheet_content = []
+        for row in sheet.iter_rows():
+            row_content = []
+            for cell in row:
+                if cell.hyperlink:
+                    # If the cell has a hyperlink, format it as Markdown
+                    hyperlink = cell.hyperlink.target
+                    cell_value = cell.value or ''  # Use cell value or empty string
+                    row_content.append(f"[{cell_value}]({hyperlink})")
+                else:
+                    # If no hyperlink, use the cell value (computed value if formula)
+                    row_content.append(str(cell.value) if cell.value is not None else "")
+            # Join the row content into a single line using `|` as the delimiter
+            sheet_content.append(cell_delimeter.join(row_content))
+        # Format the sheet content based on the return type
         if self.return_type == 'dict':
-            return df.to_dict(orient='records')
+            # Convert to a list of dictionaries (each row is a dictionary)
+            headers = sheet_content[0].split(cell_delimeter) if sheet_content else []
+            data_rows = sheet_content[1:] if len(sheet_content) > 1 else []
+            return [dict(zip(headers, row.split(cell_delimeter))) for row in data_rows]
         elif self.return_type == 'csv':
-            return df.to_csv()
+            # Return as CSV (newline-separated rows, comma-separated values)
+            return "\n".join([",".join(row.split(cell_delimeter)) for row in sheet_content])
         else:
-            return df.to_string(index=False)
+            # Default: Return as plain text (newline-separated rows, pipe-separated values)
+            return "\n".join(sheet_content)
     def load(self) -> list:
         docs = []

alita_sdk/runtime/langchain/document_loaders/AlitaPDFLoader.py CHANGED Viewed

@@ -1,4 +1,5 @@
 import pymupdf
+import fitz
 from langchain_community.document_loaders import PyPDFLoader
 from .ImageParser import ImageParser
@@ -43,8 +44,59 @@ class AlitaPDFLoader:
         return text_content
     def read_pdf_page(self, report, page, index):
-        text_content = f'Page: {index}\n'
-        text_content += page.get_text()
+        # Extract text in block format (to more accurately match hyperlinks to text)
+        text_blocks = page.get_text("blocks")  # Returns a list of text blocks
+        words = page.get_text("words")  # Returns words with their coordinates
+        # Extract hyperlinks
+        links = page.get_links()
+        # Create a list to store the modified text
+        modified_text = []
+        for block in text_blocks:
+            block_rect = fitz.Rect(block[:4])  # Coordinates of the text block
+            block_text = block[4]  # The actual text of the block
+            # Check if there are hyperlinks intersecting with this text block
+            for link in links:
+                if "uri" in link:  # Ensure this is a hyperlink
+                    link_rect = link["from"]  # Coordinates of the hyperlink area
+                    link_uri = link["uri"]  # The URL of the hyperlink
+                    # Expand the hyperlink area slightly to account for inaccuracies
+                    link_rect = fitz.Rect(
+                        link_rect.x0 - 1, link_rect.y0 - 1, link_rect.x1 + 1, link_rect.y1 + 1
+                    )
+                    # Find words that are inside the hyperlink area
+                    link_text = []
+                    for word in words:
+                        word_rect = fitz.Rect(word[:4])  # Coordinates of the word
+                        word_text = word[4]
+                        # Check if the word rectangle is fully inside the hyperlink rectangle
+                        if link_rect.contains(word_rect):
+                            link_text.append(word_text)
+                        # If the word partially intersects, check vertical alignment
+                        elif link_rect.intersects(word_rect):
+                            # Condition: The word must be on the same line as the hyperlink
+                            if abs(link_rect.y0 - word_rect.y0) < 2 and abs(link_rect.y1 - word_rect.y1) < 2:
+                                link_text.append(word_text)
+                    # Format the hyperlink in Markdown
+                    full_text = " ".join(link_text) if link_text else "No text"
+                    hyperlink = f"[{full_text}]({link_uri})"
+                    # Replace the hyperlink text in the block with the formatted hyperlink
+                    block_text = block_text.replace(full_text, hyperlink)
+            # Add the processed text block to the result
+            modified_text.append(block_text)
+        # Combine all text blocks into the final text for the page
+        text_content = f'Page: {index}\n' + "\n".join(modified_text)
         if self.extract_images:
             images = page.get_images(full=True)
             for i, img in enumerate(images):

{alita_sdk-0.3.323.dist-info → alita_sdk-0.3.324.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: alita_sdk
-Version: 0.3.323
+Version: 0.3.324
 Summary: SDK for building langchain agents using resources from Alita
 Author-email: Artem Rozumenko <artyom.rozumenko@gmail.com>, Mikalai Biazruchka <mikalai_biazruchka@epam.com>, Roman Mitusov <roman_mitusov@epam.com>, Ivan Krakhmaliuk <lifedj27@gmail.com>, Artem Dubrovskiy <ad13box@gmail.com>
 License-Expression: Apache-2.0

{alita_sdk-0.3.323.dist-info → alita_sdk-0.3.324.dist-info}/RECORD RENAMED Viewed

@@ -56,13 +56,13 @@ alita_sdk/runtime/langchain/document_loaders/AlitaCSVLoader.py,sha256=3ne-a5qIkB
 alita_sdk/runtime/langchain/document_loaders/AlitaConfluenceLoader.py,sha256=NzpoL4C7UzyzLouTSL_xTQw70MitNt-WZz3Eyl7QkTA,8294
 alita_sdk/runtime/langchain/document_loaders/AlitaDirectoryLoader.py,sha256=fKezkgvIcLG7S2PVJp1a8sZd6C4XQKNZKAFC87DbQts,7003
 alita_sdk/runtime/langchain/document_loaders/AlitaDocxMammothLoader.py,sha256=9hi5eHgDIfa9wBWqTuwMM6D6W64czrDTfZl_htooe8Y,5943
-alita_sdk/runtime/langchain/document_loaders/AlitaExcelLoader.py,sha256=CKFL13TXCyqQa_fl6EmR6q9O9cT_w0tQzoQQFmfCpi8,3712
+alita_sdk/runtime/langchain/document_loaders/AlitaExcelLoader.py,sha256=P17csHx94JkXiyo1a2V-CrfP2E5XCG4uZC31ulZ_Ab4,5817
 alita_sdk/runtime/langchain/document_loaders/AlitaGitRepoLoader.py,sha256=5WXGcyHraSVj3ANHj_U6X4EDikoekrIYtS0Q_QqNIng,2608
 alita_sdk/runtime/langchain/document_loaders/AlitaImageLoader.py,sha256=QwgBJE-BvOasjgT1hYHZc0MP0F_elirUjSzKixoM6fY,6610
 alita_sdk/runtime/langchain/document_loaders/AlitaJSONLoader.py,sha256=Nav2cgCQKOHQi_ZgYYn_iFdP_Os56KVlVR5nHGXecBc,3445
 alita_sdk/runtime/langchain/document_loaders/AlitaJiraLoader.py,sha256=M2q3YThkps0yAZOjfoLcyE7qycVTYKcXEGtpmp0N6C8,10950
 alita_sdk/runtime/langchain/document_loaders/AlitaMarkdownLoader.py,sha256=RGHDfleYTn7AAc3H-yFZrjm06L0Ux14ZtEJpFlVBNCA,2474
-alita_sdk/runtime/langchain/document_loaders/AlitaPDFLoader.py,sha256=toXdQbT9TuBCdB4t62t2cPalBY_2RZy2lqKSMU7YVhw,3386
+alita_sdk/runtime/langchain/document_loaders/AlitaPDFLoader.py,sha256=usSrPnYQ3dDOJDdg6gBDTnBJnHiqjLxd_kvOBfRyVxY,5946
 alita_sdk/runtime/langchain/document_loaders/AlitaPowerPointLoader.py,sha256=SKAAPo3DfMtRPxICKrPzlXXkC5RfaeiRj7lejLXTi7o,2337
 alita_sdk/runtime/langchain/document_loaders/AlitaPythonLoader.py,sha256=m_7aq-aCFVb4vXZsJNinfN1hAuyy_S0ylRknv_ahxDc,340
 alita_sdk/runtime/langchain/document_loaders/AlitaQtestLoader.py,sha256=CUVVnisxm7b5yZWV6rn0Q3MEEaO1GWNcfnz5yWz8T0k,13283
@@ -349,8 +349,8 @@ alita_sdk/tools/zephyr_scale/api_wrapper.py,sha256=kT0TbmMvuKhDUZc0i7KO18O38JM9S
 alita_sdk/tools/zephyr_squad/__init__.py,sha256=0ne8XLJEQSLOWfzd2HdnqOYmQlUliKHbBED5kW_Vias,2895
 alita_sdk/tools/zephyr_squad/api_wrapper.py,sha256=kmw_xol8YIYFplBLWTqP_VKPRhL_1ItDD0_vXTe_UuI,14906
 alita_sdk/tools/zephyr_squad/zephyr_squad_cloud_client.py,sha256=R371waHsms4sllHCbijKYs90C-9Yu0sSR3N4SUfQOgU,5066
-alita_sdk-0.3.323.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
-alita_sdk-0.3.323.dist-info/METADATA,sha256=H6Stzos-D6Mw5ie9tVXbVrUWFhIZOHkJbE8Jz-dGHao,18897
-alita_sdk-0.3.323.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-alita_sdk-0.3.323.dist-info/top_level.txt,sha256=0vJYy5p_jK6AwVb1aqXr7Kgqgk3WDtQ6t5C-XI9zkmg,10
-alita_sdk-0.3.323.dist-info/RECORD,,
+alita_sdk-0.3.324.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+alita_sdk-0.3.324.dist-info/METADATA,sha256=uxEEUIMIOSP9WwGk_YaGjp2hDLTynd35eEWo4SPjHUc,18897
+alita_sdk-0.3.324.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+alita_sdk-0.3.324.dist-info/top_level.txt,sha256=0vJYy5p_jK6AwVb1aqXr7Kgqgk3WDtQ6t5C-XI9zkmg,10
+alita_sdk-0.3.324.dist-info/RECORD,,

{alita_sdk-0.3.323.dist-info → alita_sdk-0.3.324.dist-info}/WHEEL RENAMED Viewed

File without changes

{alita_sdk-0.3.323.dist-info → alita_sdk-0.3.324.dist-info}/licenses/LICENSE RENAMED Viewed

File without changes

{alita_sdk-0.3.323.dist-info → alita_sdk-0.3.324.dist-info}/top_level.txt RENAMED Viewed

File without changes

alita-sdk 0.3.323__py3-none-any.whl → 0.3.324__py3-none-any.whl

Potentially problematic release.

alita-sdk 0.3.323__py3-none-any.whl → 0.3.324__py3-none-any.whl