alita-sdk 0.3.211__py3-none-any.whl → 0.3.212__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- alita_sdk/runtime/clients/client.py +2 -2
- alita_sdk/runtime/langchain/document_loaders/AlitaDocxMammothLoader.py +48 -24
- alita_sdk/runtime/langchain/document_loaders/AlitaExcelLoader.py +47 -1
- alita_sdk/runtime/langchain/document_loaders/AlitaImageLoader.py +103 -49
- alita_sdk/runtime/langchain/document_loaders/AlitaPDFLoader.py +63 -0
- alita_sdk/runtime/langchain/document_loaders/AlitaPowerPointLoader.py +54 -0
- alita_sdk/runtime/langchain/document_loaders/AlitaTextLoader.py +66 -0
- alita_sdk/runtime/langchain/document_loaders/constants.py +13 -19
- alita_sdk/runtime/langchain/document_loaders/utils.py +30 -1
- alita_sdk/runtime/tools/artifact.py +2 -4
- alita_sdk/runtime/tools/vectorstore.py +2 -1
- alita_sdk/tools/ado/test_plan/test_plan_wrapper.py +13 -37
- alita_sdk/tools/ado/wiki/ado_wrapper.py +10 -39
- alita_sdk/tools/confluence/api_wrapper.py +2 -0
- alita_sdk/tools/elitea_base.py +20 -1
- alita_sdk/tools/gitlab/__init__.py +3 -2
- alita_sdk/tools/gitlab/api_wrapper.py +45 -18
- alita_sdk/tools/gitlab_org/api_wrapper.py +44 -25
- alita_sdk/tools/sharepoint/api_wrapper.py +13 -13
- alita_sdk/tools/testrail/api_wrapper.py +20 -0
- alita_sdk/tools/utils/content_parser.py +37 -162
- {alita_sdk-0.3.211.dist-info → alita_sdk-0.3.212.dist-info}/METADATA +1 -1
- {alita_sdk-0.3.211.dist-info → alita_sdk-0.3.212.dist-info}/RECORD +26 -23
- {alita_sdk-0.3.211.dist-info → alita_sdk-0.3.212.dist-info}/WHEEL +0 -0
- {alita_sdk-0.3.211.dist-info → alita_sdk-0.3.212.dist-info}/licenses/LICENSE +0 -0
- {alita_sdk-0.3.211.dist-info → alita_sdk-0.3.212.dist-info}/top_level.txt +0 -0
alita_sdk/tools/gitlab_org/api_wrapper.py

```diff
@@ -1,6 +1,7 @@
 import logging
 from datetime import datetime
 from typing import Optional, Any, List, Dict
+import fnmatch
 
 from gitlab import GitlabGetError
 from langchain_core.tools import ToolException
```
```diff
@@ -22,7 +23,9 @@ GitLabCreateBranch = create_model(
 
 GitLabListBranches = create_model(
     "GitLabListBranchesModel",
-    repository=(Optional[str], Field(description="Name of the repository", default=None))
+    repository=(Optional[str], Field(description="Name of the repository", default=None)),
+    limit=(Optional[int], Field(description="Maximum number of branches to return. If not provided, all branches will be returned.", default=20)),
+    branch_wildcard=(Optional[str], Field(description="Wildcard pattern to filter branches by name. If not provided, all branches will be returned.", default=None))
 )
 
 GitlabSetActiveBranch = create_model(
```
```diff
@@ -209,16 +212,32 @@ class GitLabWorkspaceAPIWrapper(BaseToolApiWrapper):
         self._active_branch = branch
         return f"Active branch set to {branch}"
 
-    def list_branches_in_repo(self, repository: Optional[str] = None) -> List[str]:
-        """
+    def list_branches_in_repo(self, repository: Optional[str] = None, limit: Optional[int] = 20, branch_wildcard: Optional[str] = None) -> List[str]:
+        """
+        Lists branches in the repository with optional limit and wildcard filtering.
+
+        Parameters:
+            repository (Optional[str]): Name of the repository. If None, uses the active repository.
+            limit (Optional[int]): Maximum number of branches to return
+            branch_wildcard (Optional[str]): Wildcard pattern to filter branches (e.g., '*dev')
+
+        Returns:
+            List[str]: List containing names of branches
+        """
         try:
             repo_instance = self._get_repo(repository)
-            branches = repo_instance.branches.list()
-
+            branches = repo_instance.branches.list(get_all=True)
+
+            if branch_wildcard:
+                branches = [branch for branch in branches if fnmatch.fnmatch(branch.name, branch_wildcard)]
+
+            if limit:
+                branches = branches[:limit]
+
+            branch_names = [branch.name for branch in branches]
+            return branch_names
         except Exception as e:
-            return
-
-
+            return f"Failed to list branches: {str(e)}"
 
     def create_branch(self, branch_name: str, repository: Optional[str] = None) -> str:
         """Create a new branch in the repository."""
```
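The wildcard filter relies on the standard library's `fnmatch`, so patterns are shell-style (`*`, `?`, `[seq]`) rather than regular expressions, and the limit is applied only after filtering. A minimal standalone sketch of the same semantics (the `Branch` dataclass is a hypothetical stand-in for python-gitlab's branch objects, which expose a `.name` attribute):

```python
import fnmatch
from dataclasses import dataclass
from typing import List, Optional


@dataclass
class Branch:
    """Hypothetical stand-in for a python-gitlab branch object."""
    name: str


def filter_branches(branches: List[Branch],
                    limit: Optional[int] = 20,
                    branch_wildcard: Optional[str] = None) -> List[str]:
    if branch_wildcard:
        # Shell-style match: '*dev' means "name ends with dev", not a regex.
        branches = [b for b in branches if fnmatch.fnmatch(b.name, branch_wildcard)]
    if limit:
        branches = branches[:limit]
    return [b.name for b in branches]


branches = [Branch("main"), Branch("feature-dev"), Branch("hotfix-dev"), Branch("dev-tools")]
print(filter_branches(branches, branch_wildcard="*dev"))
# ['feature-dev', 'hotfix-dev']
```

Because the guard is `if limit:`, passing `limit=0` disables truncation entirely rather than returning an empty list.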
```diff
@@ -568,104 +587,104 @@ class GitLabWorkspaceAPIWrapper(BaseToolApiWrapper):
         return [
             {
                 "name": "create_branch",
-                "description": self.create_branch.__doc__,
+                "description": self.create_branch.__doc__ or "Create a new branch in the repository.",
                 "args_schema": GitLabCreateBranch,
                 "ref": self.create_branch,
             },
             {
                 "name": "set_active_branch",
-                "description": self.set_active_branch.__doc__,
+                "description": self.set_active_branch.__doc__ or "Set the active branch for the bot.",
                 "args_schema": GitlabSetActiveBranch,
                 "ref": self.set_active_branch,
             },
             {
                 "name": "list_branches_in_repo",
-                "description": self.list_branches_in_repo.__doc__,
+                "description": self.list_branches_in_repo.__doc__ or "List branches in the repository with optional limit and wildcard filtering.",
                 "args_schema": GitLabListBranches,
                 "ref": self.list_branches_in_repo,
             },
             {
                 "name": "get_issues",
-                "description": self.get_issues.__doc__,
+                "description": self.get_issues.__doc__ or "Fetches all open issues from the repository.",
                 "args_schema": GitLabGetIssues,
                 "ref": self.get_issues,
             },
             {
                 "name": "get_issue",
-                "description": self.get_issue.__doc__,
+                "description": self.get_issue.__doc__ or "Fetches a specific issue and its first 10 comments.",
                 "args_schema": GitLabGetIssue,
                 "ref": self.get_issue,
             },
             {
                 "name": "create_pull_request",
-                "description": self.create_pull_request.__doc__,
+                "description": self.create_pull_request.__doc__ or "Creates a pull request in the repository.",
                 "args_schema": GitLabCreatePullRequest,
                 "ref": self.create_pull_request,
             },
             {
                 "name": "comment_on_issue",
-                "description": self.comment_on_issue.__doc__,
+                "description": self.comment_on_issue.__doc__ or "Adds a comment to a GitLab issue.",
                 "args_schema": GitLabCommentOnIssue,
                 "ref": self.comment_on_issue,
             },
             {
                 "name": "create_file",
-                "description": self.create_file.__doc__,
+                "description": self.create_file.__doc__ or "Creates a new file in the GitLab repository.",
                 "args_schema": GitLabCreateFile,
                 "ref": self.create_file,
             },
             {
                 "name": "read_file",
-                "description": self.read_file.__doc__,
+                "description": self.read_file.__doc__ or "Reads a file from the GitLab repository.",
                 "args_schema": GitLabReadFile,
                 "ref": self.read_file,
             },
             {
                 "name": "update_file",
-                "description": self.update_file.__doc__,
+                "description": self.update_file.__doc__ or "Updates a file in the GitLab repository.",
                 "args_schema": GitLabUpdateFile,
                 "ref": self.update_file,
             },
             {
                 "name": "delete_file",
-                "description": self.delete_file.__doc__,
+                "description": self.delete_file.__doc__ or "Deletes a file from the GitLab repository.",
                 "args_schema": GitLabDeleteFile,
                 "ref": self.delete_file,
             },
             {
                 "name": "get_pr_changes",
-                "description": self.get_pr_changes.__doc__,
+                "description": self.get_pr_changes.__doc__ or "Get pull request changes from the specified PR number and repository.",
                 "args_schema": GitLabGetPRChanges,
                 "ref": self.get_pr_changes,
             },
             {
                 "name": "create_pr_change_comment",
-                "description": self.create_pr_change_comment.__doc__,
+                "description": self.create_pr_change_comment.__doc__ or "Create a comment on a pull request change in GitLab.",
                 "args_schema": GitLabCreatePullRequestChangeCommentInput,
                 "ref": self.create_pr_change_comment,
             },
             {
                 "name": "list_files",
-                "description": self.list_files.__doc__,
+                "description": self.list_files.__doc__ or "List files by defined path.",
                 "args_schema": ListFilesModel,
                 "ref": self.list_files,
             },
             {
                 "name": "list_folders",
-                "description": self.list_folders.__doc__,
+                "description": self.list_folders.__doc__ or "List folders by defined path.",
                 "args_schema": ListFilesModel,
                 "ref": self.list_folders,
             },
             {
                 "name": "append_file",
-                "description": self.append_file.__doc__,
+                "description": self.append_file.__doc__ or "Appends new content to the end of a file.",
                 "args_schema": AppendFileInput,
                 "ref": self.append_file,
             },
             {
                 "ref": self.get_commits,
                 "name": "get_commits",
-                "description": self.get_commits.__doc__,
+                "description": self.get_commits.__doc__ or "Retrieves a list of commits from the repository.",
                 "args_schema": GetCommits,
             }
         ]
```
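The repeated `__doc__ or "..."` change guards against missing docstrings: `__doc__` is `None` when a function has no docstring (and `python -OO` strips docstrings altogether), so the old code could hand `None` to the tool schema as a description. A minimal illustration of the fallback:

```python
def documented():
    """Create a new branch in the repository."""


def undocumented():
    pass


for fn in (documented, undocumented):
    # A missing docstring yields None, so `or` falls back to the default.
    print(fn.__doc__ or "fallback description")
# Create a new branch in the repository.
# fallback description
```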
alita_sdk/tools/sharepoint/api_wrapper.py

```diff
@@ -1,6 +1,6 @@
 import json
 import logging
-from typing import Optional, List, Generator
+from typing import Optional, List, Generator, Any
 
 from langchain_core.documents import Document
 from langchain_core.tools import ToolException
```
```diff
@@ -129,7 +129,7 @@ class SharepointApiWrapper(BaseVectorStoreToolApiWrapper):
                   is_capture_image: bool = False,
                   page_number: int = None,
                   sheet_name: str = None,
-                  excel_by_sheets: bool = False):
+                  excel_by_sheets: bool = False) -> str | dict | ToolException:
         """ Reads file located at the specified server-relative path. """
         try:
             file = self._client.web.get_file_by_server_relative_path(path)
```
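The widened annotation makes explicit that `read_file` can return plain text, a per-sheet dict for Excel workbooks, or a `ToolException` (returned rather than raised, as elsewhere in this module). A hypothetical caller, not SDK code, would branch on the runtime type:

```python
from langchain_core.tools import ToolException


def handle(result: str | dict | ToolException) -> str:
    # The error case is returned, not raised, so it must be checked explicitly.
    if isinstance(result, ToolException):
        return f"error: {result}"
    if isinstance(result, dict):  # Excel content keyed by sheet name
        return "\n\n".join(f"{sheet}:\n{content}" for sheet, content in result.items())
    return result
```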
```diff
@@ -148,30 +148,30 @@ class SharepointApiWrapper(BaseVectorStoreToolApiWrapper):
                                      excel_by_sheets=excel_by_sheets,
                                      llm=self.llm)
 
-    def _base_loader(self, **kwargs) ->
+    def _base_loader(self, **kwargs) -> Generator[Document, None, None]:
         try:
             all_files = self.get_files_list()
         except Exception as e:
             raise ToolException(f"Unable to extract files: {e}")
 
-        docs: List[Document] = []
         for file in all_files:
             metadata = {
                 ("updated_on" if k == "Modified" else k): str(v)
                 for k, v in file.items()
             }
-
-        return docs
+            yield Document(page_content="", metadata=metadata)
 
     def _process_document(self, document: Document) -> Generator[Document, None, None]:
-
-
-
-
-
-
+        doc_content = self.read_file(document.metadata['Path'],
+                                     is_capture_image=True,
+                                     excel_by_sheets=True)
+        if isinstance(doc_content, dict):
+            for page, content in doc_content:
+                new_metadata = document.metadata
+                new_metadata['page'] = page
+                yield Document(page_content=str(content), metadata=new_metadata)
         else:
-            document.page_content =
+            document.page_content = str(doc_content)
             yield document
 
     @extend_with_vector_tools
```
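`_base_loader` now yields lightweight metadata-only stubs instead of building (and previously discarding) a list, and `_process_document` lazily expands each stub into one document per Excel sheet. A simplified sketch of that two-stage shape, with a hypothetical `read_file` and with `dict.items()` used to pair sheet names with their content:

```python
from typing import Generator

from langchain_core.documents import Document


def read_file(path: str) -> str | dict:
    # Hypothetical stand-in: Excel files come back as {sheet_name: content}.
    return {"Sheet1": "a,b\n1,2", "Sheet2": "c,d\n3,4"}


def base_loader(files: list[dict]) -> Generator[Document, None, None]:
    for file in files:
        # Metadata only; the content is fetched lazily in the second stage.
        yield Document(page_content="", metadata=dict(file))


def process_document(document: Document) -> Generator[Document, None, None]:
    content = read_file(document.metadata["Path"])
    if isinstance(content, dict):
        for page, page_content in content.items():
            metadata = {**document.metadata, "page": page}
            yield Document(page_content=str(page_content), metadata=metadata)
    else:
        document.page_content = str(content)
        yield document


for stub in base_loader([{"Path": "/sites/demo/report.xlsx"}]):
    for doc in process_document(stub):
        print(doc.metadata["page"], repr(doc.page_content))
```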
alita_sdk/tools/testrail/api_wrapper.py

```diff
@@ -8,6 +8,8 @@ from openai import BadRequestError
 from pydantic import SecretStr, create_model, model_validator
 from pydantic.fields import Field, PrivateAttr
 from testrail_api import StatusCodeError, TestRailAPI
+
+from ..chunkers.code.constants import get_file_extension
 from ..elitea_base import BaseVectorStoreToolApiWrapper, extend_with_vector_tools
 from langchain_core.documents import Document
 
```
```diff
@@ -537,6 +539,9 @@ class TestrailAPIWrapper(BaseVectorStoreToolApiWrapper):
             title_keyword: Optional[str] = None,
             **kwargs: Any
     ) -> Generator[Document, None, None]:
+        self._include_attachments = kwargs.get('include_attachments', False)
+        self._skip_attachment_extensions = kwargs.get('skip_attachment_extensions', [])
+
         try:
             if suite_id:
                 resp = self._client.cases.get_cases(project_id=project_id, suite_id=int(suite_id))
```
```diff
@@ -582,6 +587,11 @@ class TestrailAPIWrapper(BaseVectorStoreToolApiWrapper):
             Generator[Document, None, None]: A generator yielding processed Document objects with metadata.
         """
         try:
+            if not self._include_attachments:
+                # If attachments are not included, return the document as is
+                yield document
+                return
+
             # get base data from the document required to extract attachments and other metadata
             base_data = json.loads(document.page_content)
             case_id = base_data.get("id")
```
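The `yield document` followed by a bare `return` is the generator idiom for an early exit: the parent document is still emitted, but none of the attachment-expansion code below it runs. Stripped to its essentials:

```python
from typing import Generator


def process_document(document: str, include_attachments: bool) -> Generator[str, None, None]:
    if not include_attachments:
        # Early exit: emit the parent document and skip attachment expansion.
        yield document
        return
    yield document
    yield f"{document}::attachment"


print(list(process_document("case-1", include_attachments=False)))  # ['case-1']
print(list(process_document("case-1", include_attachments=True)))   # ['case-1', 'case-1::attachment']
```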
```diff
@@ -591,6 +601,10 @@ class TestrailAPIWrapper(BaseVectorStoreToolApiWrapper):
 
             # process each attachment to extract its content
             for attachment in attachments:
+                if get_file_extension(attachment['filename']) in self._skip_attachment_extensions:
+                    logger.info(f"Skipping attachment {attachment['filename']} with unsupported extension.")
+                    continue
+
                 attachment_id = f"attach_{attachment['id']}"
                 # add attachment id to metadata of parent
                 document.metadata.setdefault(IndexerKeywords.DEPENDENT_DOCS.value, []).append(attachment_id)
```
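The skip list is compared against whatever `get_file_extension` returns, so entries must match its normalization. Assuming the helper reduces a filename to a lowercase dotted suffix (an assumption about `chunkers/code/constants.py`, not its verified implementation), the check behaves like this sketch:

```python
from pathlib import Path


def get_file_extension(filename: str) -> str:
    # Assumed behavior: 'screenshot.PNG' -> '.png'.
    return Path(filename).suffix.lower()


skip_attachment_extensions = [".png", ".jpg"]
attachments = [
    {"id": 1, "filename": "screenshot.PNG"},
    {"id": 2, "filename": "steps.docx"},
]

for attachment in attachments:
    if get_file_extension(attachment["filename"]) in skip_attachment_extensions:
        print(f"Skipping {attachment['filename']}")
        continue
    print(f"Processing {attachment['filename']}")
```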
```diff
@@ -639,6 +653,12 @@ class TestrailAPIWrapper(BaseVectorStoreToolApiWrapper):
             'suite_id': (Optional[str],
                          Field(default=None, description="Optional TestRail suite ID to filter test cases")),
             'section_id': (Optional[int], Field(default=None, description="Optional section ID to filter test cases")),
+            'include_attachments': (Optional[bool],
+                                    Field(description="Whether to include attachment content in indexing",
+                                          default=False)),
+            'skip_attachment_extensions': (Optional[List[str]], Field(
+                description="List of file extensions to skip when processing attachments: i.e. ['.png', '.jpg']",
+                default=[])),
         }
 
     def _to_markup(self, data: List[Dict], output_format: str) -> str:
```
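These `(type, Field(...))` tuples are pydantic `create_model` field definitions, the same style used for the GitLab schemas above. A minimal sketch showing the defaults the two new fields produce:

```python
from typing import List, Optional

from pydantic import Field, create_model

IndexSchema = create_model(
    "IndexSchema",
    include_attachments=(Optional[bool], Field(
        description="Whether to include attachment content in indexing", default=False)),
    skip_attachment_extensions=(Optional[List[str]], Field(
        description="List of file extensions to skip when processing attachments: i.e. ['.png', '.jpg']",
        default=[])),
)

print(IndexSchema().model_dump())
# {'include_attachments': False, 'skip_attachment_extensions': []}
```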
alita_sdk/tools/utils/content_parser.py

```diff
@@ -1,19 +1,11 @@
-import
-
-from docx import Document
-from io import BytesIO
-import pandas as pd
-from PIL import Image
-from pptx import Presentation
-from pptx.enum.shapes import MSO_SHAPE_TYPE
-import io
-import pymupdf
+from pathlib import Path
+
 from langchain_core.tools import ToolException
-from transformers import BlipProcessor, BlipForConditionalGeneration
-from langchain_core.messages import HumanMessage
 from logging import getLogger
+from alita_sdk.runtime.langchain.document_loaders.constants import loaders_map
+from langchain_core.documents import Document
 
-from ...runtime.langchain.
+from ...runtime.langchain.document_loaders.utils import create_temp_file
 
 logger = getLogger(__name__)
 
```
```diff
@@ -61,7 +53,7 @@ IMAGE_EXTENSIONS = ['jpg', 'jpeg', 'png', 'gif', 'bmp', 'tiff', 'webp', 'svg']
 
 
 def parse_file_content(file_name=None, file_content=None, is_capture_image: bool = False, page_number: int = None,
-                       sheet_name: str = None, llm=None, file_path: str = None, excel_by_sheets: bool = False):
+                       sheet_name: str = None, llm=None, file_path: str = None, excel_by_sheets: bool = False) -> str | ToolException:
     """Parse the content of a file based on its type and return the parsed content.
 
     Args:
```
```diff
@@ -72,6 +64,7 @@ def parse_file_content(file_name=None, file_content=None, is_capture_image: bool
         sheet_name (str, optional): The specific sheet name to parse for Excel files.
         llm: The language model to use for image processing.
         file_path (str, optional): The path to the file if it needs to be read from disk.
+        return_type (str, optional): Tipe of returned result. Possible values are 'str', 'docs'.
     Returns:
         str: The parsed content of the file.
     Raises:
```
```diff
@@ -81,142 +74,39 @@ def parse_file_content(file_name=None, file_content=None, is_capture_image: bool
     if (file_path and (file_name or file_content)) or (not file_path and (not file_name or file_content is None)):
         raise ToolException("Either (file_name and file_content) or file_path must be provided, but not both.")
 
-    if file_path
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-        return __perform_llm_prediction_for_image(llm, file_content, match.group(1), image_processing_prompt)
-    else:
+    extension = Path(file_path if file_path else file_name).suffix
+
+    loader_object = loaders_map.get(extension)
+    loader_kwargs = loader_object['kwargs']
+    loader_kwargs.update({
+        "file_path": file_path,
+        "file_content": file_content,
+        "file_name": file_name,
+        "extract_images": is_capture_image,
+        "llm": llm,
+        "page_number": page_number,
+        "sheet_name": sheet_name,
+        "excel_by_sheets": excel_by_sheets
+    })
+    loader = loader_object['class'](**loader_kwargs)
+
+    if not loader:
         return ToolException(
             "Not supported type of files entered. Supported types are TXT, DOCX, PDF, PPTX, XLSX and XLS only.")
 
-
-
-            return file_content.decode('utf-8')
-        except Exception as e:
-            return ToolException(f"Error decoding file content: {e}")
-
-def parse_excel(file_content, sheet_name = None, return_by_sheets: bool = False):
-    try:
-        excel_file = io.BytesIO(file_content)
-        if sheet_name:
-            return parse_sheet(excel_file, sheet_name)
-        dfs = pd.read_excel(excel_file, sheet_name=sheet_name)
-
-        if return_by_sheets:
-            result = {}
-            for sheet_name, df in dfs.items():
-                df.fillna('', inplace=True)
-                result[sheet_name] = df.to_dict(orient='records')
-            return result
-        else:
-            result = []
-            for sheet_name, df in dfs.items():
-                df.fillna('', inplace=True)
-                string_content = df.to_string(index=False)
-                result.append(f"====== Sheet name: {sheet_name} ======\n{string_content}")
-            return "\n\n".join(result)
-    except Exception as e:
-        return ToolException(f"Error reading Excel file: {e}")
-
-def parse_sheet(excel_file, sheet_name):
-    df = pd.read_excel(excel_file, sheet_name=sheet_name)
-    df.fillna('', inplace=True)
-    return df.to_string()
-
-def parse_pdf(file_content, page_number, is_capture_image, llm):
-    with pymupdf.open(stream=file_content, filetype="pdf") as report:
-        text_content = ''
-        if page_number is not None:
-            page = report.load_page(page_number - 1)
-            text_content += read_pdf_page(report, page, page_number, is_capture_image, llm)
-        else:
-            for index, page in enumerate(report, start=1):
-                text_content += read_pdf_page(report, page, index, is_capture_image, llm)
-        return text_content
-
-def parse_pptx(file_content, page_number, is_capture_image, llm=None):
-    prs = Presentation(io.BytesIO(file_content))
-    text_content = ''
-    if page_number is not None:
-        text_content += read_pptx_slide(prs.slides[page_number - 1], page_number, is_capture_image, llm)
+    if hasattr(loader, 'get_content'):
+        return loader.get_content()
     else:
-
-
-
-
-
-
-
-
-
-
-            xref = img[0]
-            base_image = report.extract_image(xref)
-            img_bytes = base_image["image"]
-            text_content += __perform_llm_prediction_for_image(llm, img_bytes)
-    return text_content
-
-def read_docx_from_bytes(file_content):
-    """Read and return content from a .docx file using a byte stream."""
-    try:
-        doc = Document(BytesIO(file_content))
-        text = []
-        for paragraph in doc.paragraphs:
-            text.append(paragraph.text)
-        return '\n'.join(text)
-    except Exception as e:
-        print(f"Error reading .docx from bytes: {e}")
-        return ""
-
-def read_pptx_slide(slide, index, is_capture_image, llm):
-    text_content = f'Slide: {index}\n'
-    for shape in slide.shapes:
-        if hasattr(shape, "text"):
-            text_content += shape.text + "\n"
-        elif is_capture_image and shape.shape_type == MSO_SHAPE_TYPE.PICTURE:
-            try:
-                caption = __perform_llm_prediction_for_image(llm, shape.image.blob)
-            except:
-                caption = "\n[Picture: unknown]\n"
-            text_content += caption
-    return text_content
-
-def describe_image(image):
-    processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
-    model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")
-    inputs = processor(image, return_tensors="pt")
-    out = model.generate(**inputs)
-    return "\n[Picture: " + processor.decode(out[0], skip_special_tokens=True) + "]\n"
-
-def __perform_llm_prediction_for_image(llm, image: bytes, image_format='png', prompt=image_processing_prompt) -> str:
-    if not llm:
-        raise ToolException("LLM is not provided for image processing.")
-    base64_string = bytes_to_base64(image)
-    result = llm.invoke([
-        HumanMessage(
-            content=[
-                {"type": "text", "text": prompt},
-                {
-                    "type": "image_url",
-                    "image_url": {"url": f"data:image/{image_format};base64,{base64_string}"},
-                },
-            ])
-    ])
-    return f"\n[Image description: {result.content}]\n"
+        if file_content:
+            return load_content_from_bytes(file_content=file_content,
+                                           extension=extension,
+                                           loader_extra_config=loader_kwargs,
+                                           llm=llm)
+        else:
+            return load_content(file_path=file_path,
+                                extension=extension,
+                                loader_extra_config=loader_kwargs,
+                                llm=llm)
 
 # TODO: review usage of this function alongside with functions above
 def load_content(file_path: str, extension: str = None, loader_extra_config: dict = None, llm = None) -> str:
```
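The net effect of this hunk is to replace a chain of per-format `parse_*` helpers with table-driven dispatch: the file extension selects a loader class plus its default kwargs from `loaders_map`, and every loader exposes the same `get_content()` entry point. A simplified sketch of the pattern (`TextLoader` is a hypothetical stand-in for the SDK's real loader classes, and the kwargs are merged into a fresh dict so the shared defaults in the table are not mutated):

```python
from pathlib import Path


class TextLoader:
    """Hypothetical loader; the real classes live in document_loaders/constants.py."""

    def __init__(self, **kwargs):
        self.kwargs = kwargs

    def get_content(self) -> str:
        return Path(self.kwargs["file_path"]).read_text()


# Extension -> loader class plus default construction kwargs.
loaders_map = {
    ".txt": {"class": TextLoader, "kwargs": {"autodetect_encoding": True}},
}


def parse(file_path: str) -> str:
    entry = loaders_map.get(Path(file_path).suffix)
    if entry is None:
        raise ValueError(f"No loader registered for {file_path!r}")
    # Merge into a fresh dict rather than update() in place, so per-call
    # values do not leak into the shared loaders_map entry.
    loader = entry["class"](**{**entry["kwargs"], "file_path": file_path})
    return loader.get_content()
```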
```diff
@@ -254,22 +144,7 @@ def load_content(file_path: str, extension: str = None, loader_extra_config: dic
 
 def load_content_from_bytes(file_content: bytes, extension: str = None, loader_extra_config: dict = None, llm = None) -> str:
     """Loads the content of a file from bytes based on its extension using a configured loader."""
-
-    import tempfile
-
-    # Automatic cleanup with context manager
-    with tempfile.NamedTemporaryFile(mode='w+b', delete=True) as temp_file:
-        # Write data to temp file
-        temp_file.write(file_content)
-        temp_file.flush()  # Ensure data is written
-
-        # Get the file path for operations
-        temp_path = temp_file.name
-
-        # Perform your operations
-        return load_content(temp_path, extension, loader_extra_config, llm)
-
-
+    return load_content(create_temp_file(file_content), extension, loader_extra_config, llm)
 
 def file_to_bytes(filepath):
     """
```
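A plausible shape for the `create_temp_file` helper that replaces the inline block, shown purely as an assumption about its behavior (the real implementation landed in `document_loaders/utils.py`, +30 -1 in this release). Since the helper returns before `load_content` runs, it cannot use `delete=True` the way the removed context manager did:

```python
import tempfile


def create_temp_file(file_content: bytes, suffix: str = "") -> str:
    # Assumed behavior: persist the bytes and return a path the caller can
    # hand to load_content(); delete=False leaves cleanup to the caller.
    with tempfile.NamedTemporaryFile(mode="w+b", suffix=suffix, delete=False) as temp_file:
        temp_file.write(file_content)
        return temp_file.name
```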
{alita_sdk-0.3.211.dist-info → alita_sdk-0.3.212.dist-info}/METADATA

```diff
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: alita_sdk
-Version: 0.3.211
+Version: 0.3.212
 Summary: SDK for building langchain agents using resources from Alita
 Author-email: Artem Rozumenko <artyom.rozumenko@gmail.com>, Mikalai Biazruchka <mikalai_biazruchka@epam.com>, Roman Mitusov <roman_mitusov@epam.com>, Ivan Krakhmaliuk <lifedjik@gmail.com>, Artem Dubrovskiy <ad13box@gmail.com>
 License-Expression: Apache-2.0
```