alita-sdk 0.3.211__py3-none-any.whl → 0.3.213__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- alita_sdk/runtime/clients/client.py +2 -2
- alita_sdk/runtime/langchain/document_loaders/AlitaDocxMammothLoader.py +48 -24
- alita_sdk/runtime/langchain/document_loaders/AlitaExcelLoader.py +47 -1
- alita_sdk/runtime/langchain/document_loaders/AlitaImageLoader.py +103 -49
- alita_sdk/runtime/langchain/document_loaders/AlitaPDFLoader.py +63 -0
- alita_sdk/runtime/langchain/document_loaders/AlitaPowerPointLoader.py +54 -0
- alita_sdk/runtime/langchain/document_loaders/AlitaTextLoader.py +66 -0
- alita_sdk/runtime/langchain/document_loaders/constants.py +13 -19
- alita_sdk/runtime/langchain/document_loaders/utils.py +30 -1
- alita_sdk/runtime/toolkits/artifact.py +5 -0
- alita_sdk/runtime/tools/artifact.py +2 -4
- alita_sdk/runtime/tools/vectorstore.py +2 -1
- alita_sdk/tools/ado/test_plan/test_plan_wrapper.py +13 -37
- alita_sdk/tools/ado/wiki/ado_wrapper.py +10 -39
- alita_sdk/tools/confluence/api_wrapper.py +2 -0
- alita_sdk/tools/elitea_base.py +24 -3
- alita_sdk/tools/gitlab/__init__.py +3 -2
- alita_sdk/tools/gitlab/api_wrapper.py +45 -18
- alita_sdk/tools/gitlab_org/api_wrapper.py +44 -25
- alita_sdk/tools/sharepoint/api_wrapper.py +13 -13
- alita_sdk/tools/testrail/api_wrapper.py +20 -0
- alita_sdk/tools/utils/content_parser.py +37 -162
- {alita_sdk-0.3.211.dist-info → alita_sdk-0.3.213.dist-info}/METADATA +1 -1
- {alita_sdk-0.3.211.dist-info → alita_sdk-0.3.213.dist-info}/RECORD +27 -24
- {alita_sdk-0.3.211.dist-info → alita_sdk-0.3.213.dist-info}/WHEEL +0 -0
- {alita_sdk-0.3.211.dist-info → alita_sdk-0.3.213.dist-info}/licenses/LICENSE +0 -0
- {alita_sdk-0.3.211.dist-info → alita_sdk-0.3.213.dist-info}/top_level.txt +0 -0
@@ -249,9 +249,9 @@ class AlitaClient:
|
|
249
249
|
elif app_type == "llama":
|
250
250
|
app_type = "react"
|
251
251
|
elif app_type == "dial":
|
252
|
-
app_type = "
|
252
|
+
app_type = "react"
|
253
253
|
elif app_type == 'autogen':
|
254
|
-
app_type = "
|
254
|
+
app_type = "react"
|
255
255
|
if runtime == 'nonrunnable':
|
256
256
|
return LangChainAssistant(self, data, llm, chat_history, app_type,
|
257
257
|
tools=tools, memory=memory, store=store)
|
@@ -1,16 +1,15 @@
|
|
1
|
-
import base64
|
2
1
|
import re
|
2
|
+
from io import BytesIO
|
3
3
|
|
4
4
|
import mammoth.images
|
5
5
|
import pytesseract
|
6
6
|
from PIL import Image
|
7
7
|
from langchain_core.document_loaders import BaseLoader
|
8
8
|
from langchain_core.documents import Document
|
9
|
-
from langchain_core.messages import HumanMessage
|
10
9
|
from mammoth import convert_to_html
|
11
10
|
from markdownify import markdownify
|
12
11
|
|
13
|
-
from
|
12
|
+
from .utils import perform_llm_prediction_for_image_bytes
|
14
13
|
|
15
14
|
|
16
15
|
class AlitaDocxMammothLoader(BaseLoader):
|
@@ -18,7 +17,7 @@ class AlitaDocxMammothLoader(BaseLoader):
|
|
18
17
|
Loader for Docx files using Mammoth to convert to HTML, with image handling,
|
19
18
|
and then Markdownify to convert HTML to markdown.
|
20
19
|
"""
|
21
|
-
def __init__(self,
|
20
|
+
def __init__(self, **kwargs):
|
22
21
|
"""
|
23
22
|
Initializes AlitaDocxMammothLoader.
|
24
23
|
|
@@ -30,7 +29,10 @@ class AlitaDocxMammothLoader(BaseLoader):
|
|
30
29
|
Raises:
|
31
30
|
ValueError: If the 'path' parameter is not provided.
|
32
31
|
"""
|
33
|
-
self.path =
|
32
|
+
self.path = kwargs.get('file_path')
|
33
|
+
self.file_content = kwargs.get('file_content')
|
34
|
+
self.file_name = kwargs.get('file_name')
|
35
|
+
self.extract_images = kwargs.get('extract_images')
|
34
36
|
self.llm = kwargs.get("llm")
|
35
37
|
self.prompt = kwargs.get("prompt")
|
36
38
|
|
@@ -52,20 +54,7 @@ class AlitaDocxMammothLoader(BaseLoader):
|
|
52
54
|
if self.llm:
|
53
55
|
# Use LLM for image understanding
|
54
56
|
with image.open() as image_bytes:
|
55
|
-
|
56
|
-
url_path = f"data:image/{image.content_type};base64,{base64_string}"
|
57
|
-
result = self.llm.invoke([
|
58
|
-
HumanMessage(
|
59
|
-
content=[
|
60
|
-
{"type": "text",
|
61
|
-
"text": self.prompt if self.prompt is not None else DEFAULT_MULTIMODAL_PROMPT},
|
62
|
-
{
|
63
|
-
"type": "image_url",
|
64
|
-
"image_url": {"url": url_path},
|
65
|
-
},
|
66
|
-
]
|
67
|
-
)
|
68
|
-
]).content
|
57
|
+
result = perform_llm_prediction_for_image_bytes(image_bytes, self.llm, self.prompt)
|
69
58
|
output['src'] = result # LLM image transcript in src
|
70
59
|
return output
|
71
60
|
else:
|
@@ -114,9 +103,44 @@ class AlitaDocxMammothLoader(BaseLoader):
|
|
114
103
|
List[Document]: A list containing a single Document with the markdown content
|
115
104
|
and metadata including the source file path.
|
116
105
|
"""
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
|
121
|
-
|
106
|
+
result_content = self.get_content()
|
107
|
+
return [Document(page_content=result_content, metadata={'source': str(self.path)})]
|
108
|
+
|
109
|
+
def get_content(self):
|
110
|
+
"""
|
111
|
+
Extracts and converts the content of the Docx file to markdown format.
|
112
|
+
|
113
|
+
Handles both file paths and in-memory file content.
|
122
114
|
|
115
|
+
Returns:
|
116
|
+
str: The markdown content extracted from the Docx file.
|
117
|
+
"""
|
118
|
+
if self.path:
|
119
|
+
# If path is provided, read from file system
|
120
|
+
with open(self.path, 'rb') as docx_file:
|
121
|
+
return self._convert_docx_to_markdown(docx_file)
|
122
|
+
elif self.file_content and self.file_name:
|
123
|
+
# If file_content and file_name are provided, read from memory
|
124
|
+
docx_file = BytesIO(self.file_content)
|
125
|
+
return self._convert_docx_to_markdown(docx_file)
|
126
|
+
else:
|
127
|
+
raise ValueError("Either 'path' or 'file_content' and 'file_name' must be provided.")
|
128
|
+
|
129
|
+
def _convert_docx_to_markdown(self, docx_file):
|
130
|
+
"""
|
131
|
+
Converts the content of a Docx file to markdown format.
|
132
|
+
|
133
|
+
Args:
|
134
|
+
docx_file (BinaryIO): The Docx file object.
|
135
|
+
|
136
|
+
Returns:
|
137
|
+
str: The markdown content extracted from the Docx file.
|
138
|
+
"""
|
139
|
+
if self.extract_images:
|
140
|
+
# Extract images using the provided image handler
|
141
|
+
result = convert_to_html(docx_file, convert_image=mammoth.images.img_element(self.__handle_image))
|
142
|
+
else:
|
143
|
+
# Ignore images
|
144
|
+
result = convert_to_html(docx_file, convert_image=lambda image: "")
|
145
|
+
content = markdownify(result.value, heading_style="ATX")
|
146
|
+
return self.__postprocess_original_md(content)
|
@@ -11,14 +11,60 @@
|
|
11
11
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12
12
|
# See the License for the specific language governing permissions and
|
13
13
|
# limitations under the License.
|
14
|
-
|
14
|
+
import io
|
15
15
|
from typing import Iterator
|
16
16
|
import pandas as pd
|
17
17
|
from json import loads
|
18
|
+
|
19
|
+
from langchain_core.tools import ToolException
|
18
20
|
from .AlitaTableLoader import AlitaTableLoader
|
19
21
|
|
20
22
|
|
21
23
|
class AlitaExcelLoader(AlitaTableLoader):
|
24
|
+
|
25
|
+
excel_by_sheets: bool = False
|
26
|
+
sheet_name: str = None
|
27
|
+
return_type: str = 'str'
|
28
|
+
|
29
|
+
def __init__(self, **kwargs):
|
30
|
+
if not kwargs.get('file_path'):
|
31
|
+
file_content = kwargs.get('file_content')
|
32
|
+
if file_content:
|
33
|
+
kwargs['file_path'] = io.BytesIO(file_content)
|
34
|
+
super().__init__(**kwargs)
|
35
|
+
self.excel_by_sheets = kwargs.get('excel_by_sheets')
|
36
|
+
self.return_type = kwargs.get('return_type')
|
37
|
+
self.sheet_name = kwargs.get('sheet_name')
|
38
|
+
|
39
|
+
def get_content(self):
|
40
|
+
try:
|
41
|
+
dfs = pd.read_excel(self.file_path, sheet_name=self.sheet_name)
|
42
|
+
|
43
|
+
if self.excel_by_sheets:
|
44
|
+
result = {}
|
45
|
+
for sheet_name, df in dfs.items():
|
46
|
+
df.fillna('', inplace=True)
|
47
|
+
result[sheet_name] = self.parse_sheet(df)
|
48
|
+
return result
|
49
|
+
else:
|
50
|
+
result = []
|
51
|
+
for sheet_name, df in dfs.items():
|
52
|
+
string_content = self.parse_sheet(df)
|
53
|
+
result.append(f"====== Sheet name: {sheet_name} ======\n{string_content}")
|
54
|
+
return "\n\n".join(result)
|
55
|
+
except Exception as e:
|
56
|
+
return ToolException(f"Error reading Excel file: {e}")
|
57
|
+
|
58
|
+
def parse_sheet(self, df):
|
59
|
+
df.fillna('', inplace=True)
|
60
|
+
|
61
|
+
if self.return_type == 'dict':
|
62
|
+
return df.to_dict(orient='records')
|
63
|
+
elif self.return_type == 'csv':
|
64
|
+
return df.to_csv()
|
65
|
+
else:
|
66
|
+
return df.to_string(index=False)
|
67
|
+
|
22
68
|
def read(self):
|
23
69
|
df = pd.read_excel(self.file_path, sheet_name=None)
|
24
70
|
docs = []
|
@@ -6,10 +6,10 @@ import pytesseract
|
|
6
6
|
from PIL import Image
|
7
7
|
from langchain_core.document_loaders import BaseLoader
|
8
8
|
from langchain_core.documents import Document
|
9
|
-
from langchain_core.messages import HumanMessage
|
10
9
|
from reportlab.graphics import renderPM
|
11
10
|
from svglib.svglib import svg2rlg
|
12
11
|
|
12
|
+
from .utils import perform_llm_prediction_for_image_bytes
|
13
13
|
from ..constants import DEFAULT_MULTIMODAL_PROMPT
|
14
14
|
from ..tools.utils import image_to_byte_array, bytes_to_base64
|
15
15
|
|
@@ -25,6 +25,9 @@ class AlitaImageLoader(BaseLoader):
|
|
25
25
|
self.file_path = file_path
|
26
26
|
elif kwargs.get('path'):
|
27
27
|
self.file_path = kwargs['path']
|
28
|
+
elif kwargs.get('file_content'):
|
29
|
+
self.file_content = kwargs['file_content']
|
30
|
+
self.file_name = kwargs['file_name']
|
28
31
|
else:
|
29
32
|
raise ValueError(
|
30
33
|
"Path parameter is required (either as 'file_path' positional argument or 'path' keyword argument)")
|
@@ -33,22 +36,107 @@ class AlitaImageLoader(BaseLoader):
|
|
33
36
|
self.prompt = kwargs.get('prompt') if kwargs.get(
|
34
37
|
'prompt') is not None else DEFAULT_MULTIMODAL_PROMPT # Use provided prompt or default
|
35
38
|
|
39
|
+
def get_content(self):
|
40
|
+
"""
|
41
|
+
Retrieves the text content from the file or in-memory content.
|
42
|
+
|
43
|
+
Depending on the file type (SVG or raster image) and the availability of LLM,
|
44
|
+
processes the file appropriately using OCR or LLM.
|
45
|
+
|
46
|
+
Returns:
|
47
|
+
str: Extracted text content from the file.
|
48
|
+
"""
|
49
|
+
try:
|
50
|
+
if hasattr(self, 'file_path'):
|
51
|
+
# If file_path is provided
|
52
|
+
file_path = Path(self.file_path)
|
53
|
+
if not file_path.exists():
|
54
|
+
raise FileNotFoundError(f"File not found: {self.file_path}")
|
55
|
+
|
56
|
+
if file_path.suffix.lower() == '.svg':
|
57
|
+
text_content = self._process_svg(self.file_path)
|
58
|
+
else:
|
59
|
+
text_content = self._process_raster_image(self.file_path)
|
60
|
+
|
61
|
+
elif hasattr(self, 'file_content') and hasattr(self, 'file_name'):
|
62
|
+
# If file_content and file_name are provided
|
63
|
+
file_name = Path(self.file_name)
|
64
|
+
if file_name.suffix.lower() == '.svg':
|
65
|
+
text_content = self._process_svg(BytesIO(self.file_content))
|
66
|
+
else:
|
67
|
+
text_content = self._process_raster_image(BytesIO(self.file_content))
|
68
|
+
else:
|
69
|
+
raise ValueError("Either 'file_path' or 'file_content' and 'file_name' must be provided.")
|
70
|
+
|
71
|
+
except pytesseract.TesseractError as e:
|
72
|
+
raise ValueError(f"Error during OCR: {e}")
|
73
|
+
except ImportError as e:
|
74
|
+
raise ImportError(
|
75
|
+
f"Error: SVG processing dependencies not installed. Please install svglib and reportlab: {e}")
|
76
|
+
except Exception as e:
|
77
|
+
raise ValueError(f"Error opening image or processing SVG: {e}")
|
78
|
+
|
79
|
+
return text_content
|
80
|
+
|
81
|
+
def _process_svg(self, svg_source):
|
82
|
+
"""
|
83
|
+
Processes an SVG file or in-memory SVG content.
|
84
|
+
|
85
|
+
If an LLM is available, the SVG is processed using LLM. Otherwise, the SVG
|
86
|
+
is converted to PNG and processed using OCR.
|
87
|
+
|
88
|
+
Args:
|
89
|
+
svg_source (str, Path, or BytesIO): The SVG file path or in-memory content.
|
90
|
+
|
91
|
+
Returns:
|
92
|
+
str: Extracted text content from the SVG.
|
93
|
+
"""
|
94
|
+
if self.llm:
|
95
|
+
if isinstance(svg_source, (str, Path)):
|
96
|
+
with open(svg_source, 'rb') as f:
|
97
|
+
svg_content = f.read()
|
98
|
+
else:
|
99
|
+
svg_content = svg_source.read()
|
100
|
+
return self.__process_svg_with_llm(svg_content, self.llm, self.prompt)
|
101
|
+
else:
|
102
|
+
# For OCR on SVG, convert SVG to PNG and then use OCR
|
103
|
+
if isinstance(svg_source, (str, Path)):
|
104
|
+
drawing = svg2rlg(str(svg_source)) # svglib requires path as a string
|
105
|
+
else:
|
106
|
+
drawing = svg2rlg(svg_source) # svglib supports BytesIO
|
107
|
+
img_data = BytesIO()
|
108
|
+
renderPM.drawToFile(drawing, img_data, fmt="PNG")
|
109
|
+
img_data.seek(0)
|
110
|
+
image = Image.open(img_data)
|
111
|
+
return pytesseract.image_to_string(image, lang=self.ocr_language)
|
112
|
+
|
113
|
+
def _process_raster_image(self, image_source):
|
114
|
+
"""
|
115
|
+
Processes a raster image (e.g., PNG, JPG).
|
116
|
+
|
117
|
+
If an LLM is available, the image is processed using LLM. Otherwise, OCR is used
|
118
|
+
to extract text content from the image.
|
119
|
+
|
120
|
+
Args:
|
121
|
+
image_source (str, Path, or BytesIO): The image file path or in-memory content.
|
122
|
+
|
123
|
+
Returns:
|
124
|
+
str: Extracted text content from the raster image.
|
125
|
+
"""
|
126
|
+
image = Image.open(image_source)
|
127
|
+
if self.llm:
|
128
|
+
try:
|
129
|
+
return self.__perform_llm_prediction_for_image(image, self.llm, self.prompt)
|
130
|
+
except Exception as e:
|
131
|
+
print(f"Warning: Error during LLM processing of image: {e}. Falling back to OCR.")
|
132
|
+
return pytesseract.image_to_string(image, lang=self.ocr_language)
|
133
|
+
else:
|
134
|
+
return pytesseract.image_to_string(image, lang=self.ocr_language)
|
135
|
+
|
36
136
|
def __perform_llm_prediction_for_image(self, image: Image, llm, prompt: str) -> str:
|
37
137
|
"""Performs LLM prediction for image content."""
|
38
138
|
byte_array = image_to_byte_array(image)
|
39
|
-
|
40
|
-
result = llm.invoke([
|
41
|
-
HumanMessage(
|
42
|
-
content=[
|
43
|
-
{"type": "text", "text": prompt},
|
44
|
-
{
|
45
|
-
"type": "image_url",
|
46
|
-
"image_url": {"url": f"data:image/png;base64,{base64_string}"},
|
47
|
-
},
|
48
|
-
]
|
49
|
-
)
|
50
|
-
])
|
51
|
-
return result.content
|
139
|
+
return perform_llm_prediction_for_image_bytes(byte_array, llm, prompt)
|
52
140
|
|
53
141
|
def __process_svg_with_llm(self, svg_content: bytes, llm, prompt: str) -> str:
|
54
142
|
"""Processes SVG content using LLM."""
|
@@ -61,41 +149,7 @@ class AlitaImageLoader(BaseLoader):
|
|
61
149
|
|
62
150
|
def load(self) -> List[Document]:
|
63
151
|
"""Load text from image using OCR or LLM if llm is provided, supports SVG."""
|
64
|
-
|
65
|
-
try:
|
66
|
-
if file_path.suffix.lower() == '.svg':
|
67
|
-
if self.llm:
|
68
|
-
with open(self.file_path, 'rb') as f:
|
69
|
-
svg_content = f.read()
|
70
|
-
text_content = self.__process_svg_with_llm(svg_content, self.llm, self.prompt)
|
71
|
-
else:
|
72
|
-
# For OCR on SVG, we first convert SVG to PNG then use OCR
|
73
|
-
drawing = svg2rlg(str(self.file_path)) # svglib requires path as string
|
74
|
-
img_data = BytesIO()
|
75
|
-
renderPM.drawToFile(drawing, img_data, fmt="PNG")
|
76
|
-
img_data.seek(0)
|
77
|
-
image = Image.open(img_data)
|
78
|
-
text_content = pytesseract.image_to_string(image, lang=self.ocr_language)
|
79
|
-
else: # For raster images (png, jpg, etc.)
|
80
|
-
image = Image.open(self.file_path)
|
81
|
-
if self.llm:
|
82
|
-
try:
|
83
|
-
text_content = self.__perform_llm_prediction_for_image(image, self.llm, self.prompt)
|
84
|
-
except Exception as e:
|
85
|
-
print(f"Warning: Error during LLM processing of image: {e}. Falling back to OCR.")
|
86
|
-
text_content = pytesseract.image_to_string(image,
|
87
|
-
lang=self.ocr_language) # Fallback to OCR if LLM fails
|
88
|
-
else:
|
89
|
-
text_content = pytesseract.image_to_string(image, lang=self.ocr_language)
|
90
|
-
except FileNotFoundError:
|
91
|
-
raise FileNotFoundError(f"File not found: {self.file_path}")
|
92
|
-
except pytesseract.TesseractError as e:
|
93
|
-
raise ValueError(f"Error during OCR: {e}")
|
94
|
-
except ImportError as e: # svglib or reportlab missing
|
95
|
-
raise ImportError(
|
96
|
-
f"Error: SVG processing dependencies not installed. Please install svglib and reportlab: {e}")
|
97
|
-
except Exception as e:
|
98
|
-
raise ValueError(f"Error opening image or processing SVG: {e}")
|
152
|
+
text_content = self.get_content()
|
99
153
|
|
100
154
|
metadata = {"source": str(self.file_path)} # Ensure source is always a string for metadata
|
101
155
|
return [Document(page_content=text_content, metadata=metadata)]
|
@@ -0,0 +1,63 @@
|
|
1
|
+
import pymupdf
|
2
|
+
from langchain_community.document_loaders import PyPDFLoader
|
3
|
+
from .utils import perform_llm_prediction_for_image_bytes, create_temp_file
|
4
|
+
from langchain_core.tools import ToolException
|
5
|
+
|
6
|
+
class AlitaPDFLoader:
|
7
|
+
|
8
|
+
def __init__(self, **kwargs):
|
9
|
+
if kwargs.get('file_path'):
|
10
|
+
self.file_path = kwargs.get('file_path')
|
11
|
+
elif kwargs.get('file_content'):
|
12
|
+
self.file_content = kwargs.get('file_content')
|
13
|
+
else:
|
14
|
+
raise ToolException("'file_path' or 'file_content' parameter should be provided.")
|
15
|
+
self.password = kwargs.get('password', None)
|
16
|
+
self.page_number = kwargs.get('page_number', None)
|
17
|
+
self.extract_images = kwargs.get('extract_images', False)
|
18
|
+
self.llm = kwargs.get('llm', None)
|
19
|
+
self.prompt = kwargs.get('prompt', "Describe image")
|
20
|
+
self.headers = kwargs.get('headers', None)
|
21
|
+
self.extraction_mode = kwargs.get('extraction_mode', "plain")
|
22
|
+
self.extraction_kwargs = kwargs.get('extraction_kwargs', None)
|
23
|
+
|
24
|
+
def get_content(self):
|
25
|
+
if hasattr(self, 'file_path'):
|
26
|
+
with pymupdf.open(filename=self.file_path, filetype="pdf") as report:
|
27
|
+
return self.parse_report(report)
|
28
|
+
else:
|
29
|
+
with pymupdf.open(stream=self.file_content, filetype="pdf") as report:
|
30
|
+
return self.parse_report(report)
|
31
|
+
|
32
|
+
def parse_report(self, report):
|
33
|
+
text_content = ''
|
34
|
+
if self.page_number is not None:
|
35
|
+
page = report.load_page(self.page_number - 1)
|
36
|
+
text_content += self.read_pdf_page(report, page, self.page_number)
|
37
|
+
else:
|
38
|
+
for index, page in enumerate(report, start=1):
|
39
|
+
text_content += self.read_pdf_page(report, page, index)
|
40
|
+
|
41
|
+
return text_content
|
42
|
+
|
43
|
+
def read_pdf_page(self, report, page, index):
|
44
|
+
text_content = f'Page: {index}\n'
|
45
|
+
text_content += page.get_text()
|
46
|
+
if self.extract_images:
|
47
|
+
images = page.get_images(full=True)
|
48
|
+
for i, img in enumerate(images):
|
49
|
+
xref = img[0]
|
50
|
+
base_image = report.extract_image(xref)
|
51
|
+
img_bytes = base_image["image"]
|
52
|
+
text_content += "\n**Image Transcript:**\n" + perform_llm_prediction_for_image_bytes(img_bytes, self.llm, self.prompt) + "\n--------------------\n"
|
53
|
+
return text_content
|
54
|
+
|
55
|
+
def load(self):
|
56
|
+
if not hasattr(self, 'file_path'):
|
57
|
+
self.file_path = create_temp_file(self.file_content)
|
58
|
+
return PyPDFLoader(file_path=self.file_path,
|
59
|
+
password=self.password,
|
60
|
+
headers=self.headers,
|
61
|
+
extract_images=self.extract_images,
|
62
|
+
extraction_mode=self.extraction_mode,
|
63
|
+
extraction_kwargs=self.extraction_kwargs).load()
|
@@ -0,0 +1,54 @@
|
|
1
|
+
import io
|
2
|
+
|
3
|
+
from langchain_community.document_loaders import UnstructuredPowerPointLoader
|
4
|
+
from langchain_core.tools import ToolException
|
5
|
+
from pptx import Presentation
|
6
|
+
from .utils import perform_llm_prediction_for_image_bytes, create_temp_file
|
7
|
+
from pptx.enum.shapes import MSO_SHAPE_TYPE
|
8
|
+
|
9
|
+
class AlitaPowerPointLoader:
|
10
|
+
|
11
|
+
def __init__(self, file_path=None, file_content=None, mode=None, **unstructured_kwargs):
|
12
|
+
if file_path:
|
13
|
+
self.file_path = file_path
|
14
|
+
elif file_content:
|
15
|
+
self.file_content = file_content
|
16
|
+
else:
|
17
|
+
raise ToolException("'file_path' or 'file_content' parameter should be provided.")
|
18
|
+
|
19
|
+
self.mode=mode
|
20
|
+
self.unstructured_kwargs = unstructured_kwargs
|
21
|
+
self.page_number = unstructured_kwargs.get('page_number', None)
|
22
|
+
self.extract_images = unstructured_kwargs.get('extract_images', False)
|
23
|
+
self.llm = unstructured_kwargs.get('llm', None)
|
24
|
+
self.prompt = unstructured_kwargs.get('prompt', "Describe image")
|
25
|
+
|
26
|
+
def get_content(self):
|
27
|
+
prs = Presentation(io.BytesIO(self.file_content))
|
28
|
+
text_content = ''
|
29
|
+
if self.page_number is not None:
|
30
|
+
text_content += self.read_pptx_slide(prs.slides[self.page_number - 1], self.page_number)
|
31
|
+
else:
|
32
|
+
for index, slide in enumerate(prs.slides, start=1):
|
33
|
+
text_content += self.read_pptx_slide(slide, index)
|
34
|
+
return text_content
|
35
|
+
|
36
|
+
def read_pptx_slide(self, slide, index):
|
37
|
+
text_content = f'Slide: {index}\n'
|
38
|
+
for shape in slide.shapes:
|
39
|
+
if hasattr(shape, "text"):
|
40
|
+
text_content += shape.text + "\n"
|
41
|
+
elif self.extract_images and shape.shape_type == MSO_SHAPE_TYPE.PICTURE:
|
42
|
+
try:
|
43
|
+
caption = perform_llm_prediction_for_image_bytes(shape.image.blob, self.llm)
|
44
|
+
except:
|
45
|
+
caption = "unknown"
|
46
|
+
text_content += "\n**Image Transcript:**\n" + caption + "\n--------------------\n"
|
47
|
+
return text_content
|
48
|
+
|
49
|
+
def load(self):
|
50
|
+
if not self.file_path:
|
51
|
+
self.file_path = create_temp_file(self.file_content)
|
52
|
+
return UnstructuredPowerPointLoader(file_path=self.file_path,
|
53
|
+
mode=self.mode,
|
54
|
+
**self.unstructured_kwargs).load()
|
@@ -0,0 +1,66 @@
|
|
1
|
+
from typing import Iterator
|
2
|
+
|
3
|
+
from langchain_core.documents import Document
|
4
|
+
|
5
|
+
from langchain_community.document_loaders.base import BaseLoader
|
6
|
+
from langchain_community.document_loaders.helpers import detect_file_encodings
|
7
|
+
from langchain_core.tools import ToolException
|
8
|
+
|
9
|
+
class AlitaTextLoader(BaseLoader):
|
10
|
+
|
11
|
+
def __init__(self, **kwargs):
|
12
|
+
"""Initialize with file path."""
|
13
|
+
if kwargs.get('file_path'):
|
14
|
+
self.file_path = kwargs['file_path']
|
15
|
+
elif kwargs.get('file_content'):
|
16
|
+
self.file_content = kwargs['file_content']
|
17
|
+
self.file_name = kwargs['file_name']
|
18
|
+
else:
|
19
|
+
raise ToolException("'file_path' or 'file_content' parameter should be provided.")
|
20
|
+
self.encoding = kwargs.get('encoding', 'utf-8')
|
21
|
+
self.autodetect_encoding = kwargs.get('autodetect_encoding', False)
|
22
|
+
|
23
|
+
def get_content(self):
|
24
|
+
text = ""
|
25
|
+
try:
|
26
|
+
if hasattr(self, 'file_path') and self.file_path:
|
27
|
+
with open(self.file_path, encoding=self.encoding) as f:
|
28
|
+
text = f.read()
|
29
|
+
elif hasattr(self, 'file_content') and self.file_content:
|
30
|
+
text = self.file_content.decode(self.encoding)
|
31
|
+
else:
|
32
|
+
raise ValueError("Neither file_path nor file_content is provided.")
|
33
|
+
|
34
|
+
except UnicodeDecodeError as e:
|
35
|
+
if self.autodetect_encoding:
|
36
|
+
if hasattr(self, 'file_path') and self.file_path:
|
37
|
+
detected_encodings = detect_file_encodings(self.file_path)
|
38
|
+
for encoding in detected_encodings:
|
39
|
+
try:
|
40
|
+
with open(self.file_path, encoding=encoding.encoding) as f:
|
41
|
+
text = f.read()
|
42
|
+
break
|
43
|
+
except UnicodeDecodeError:
|
44
|
+
continue
|
45
|
+
elif hasattr(self, 'file_content') and self.file_content:
|
46
|
+
detected_encodings = detect_file_encodings(self.file_content)
|
47
|
+
for encoding in detected_encodings:
|
48
|
+
try:
|
49
|
+
text = self.file_content.decode(encoding.encoding)
|
50
|
+
break
|
51
|
+
except UnicodeDecodeError:
|
52
|
+
continue
|
53
|
+
else:
|
54
|
+
raise ValueError("Neither file_path nor file_content is provided for encoding detection.")
|
55
|
+
else:
|
56
|
+
raise RuntimeError(f"Error loading content with encoding {self.encoding}.") from e
|
57
|
+
except Exception as e:
|
58
|
+
raise RuntimeError(f"Error loading content.") from e
|
59
|
+
|
60
|
+
return text
|
61
|
+
|
62
|
+
def lazy_load(self) -> Iterator[Document]:
|
63
|
+
"""Load from file path."""
|
64
|
+
text = self.get_content()
|
65
|
+
metadata = {"source": str(self.file_path) if hasattr(self, 'file_path') else self.file_name}
|
66
|
+
yield Document(page_content=text, metadata=metadata)
|
@@ -12,24 +12,18 @@
|
|
12
12
|
# See the License for the specific language governing permissions and
|
13
13
|
# limitations under the License.
|
14
14
|
|
15
|
-
from langchain_community.document_loaders import (
|
15
|
+
from langchain_community.document_loaders import (
|
16
16
|
UnstructuredMarkdownLoader,
|
17
|
-
|
18
|
-
UnstructuredPDFLoader,UnstructuredWordDocumentLoader,
|
19
|
-
JSONLoader, AirbyteJSONLoader, UnstructuredHTMLLoader,
|
20
|
-
UnstructuredPowerPointLoader, PythonLoader)
|
21
|
-
|
22
|
-
from langchain_community.document_loaders import (TextLoader,
|
23
|
-
UnstructuredMarkdownLoader,
|
24
|
-
PyPDFLoader,
|
25
|
-
UnstructuredPDFLoader,UnstructuredWordDocumentLoader,
|
26
|
-
JSONLoader, AirbyteJSONLoader, UnstructuredHTMLLoader,
|
17
|
+
AirbyteJSONLoader, UnstructuredHTMLLoader,
|
27
18
|
UnstructuredPowerPointLoader, PythonLoader)
|
28
19
|
|
29
20
|
from .AlitaCSVLoader import AlitaCSVLoader
|
30
21
|
from .AlitaDocxMammothLoader import AlitaDocxMammothLoader
|
31
22
|
from .AlitaExcelLoader import AlitaExcelLoader
|
32
23
|
from .AlitaImageLoader import AlitaImageLoader
|
24
|
+
from .AlitaPDFLoader import AlitaPDFLoader
|
25
|
+
from .AlitaTextLoader import AlitaTextLoader
|
26
|
+
from .AlitaPowerPointLoader import AlitaPowerPointLoader
|
33
27
|
|
34
28
|
loaders_map = {
|
35
29
|
'.png': {
|
@@ -63,28 +57,28 @@ loaders_map = {
|
|
63
57
|
'kwargs': {}
|
64
58
|
},
|
65
59
|
'.txt': {
|
66
|
-
'class':
|
60
|
+
'class': AlitaTextLoader,
|
67
61
|
'is_multimodal_processing': False,
|
68
62
|
'kwargs': {
|
69
63
|
'autodetect_encoding': True
|
70
64
|
}
|
71
65
|
},
|
72
66
|
'.yml': {
|
73
|
-
'class':
|
67
|
+
'class': AlitaTextLoader,
|
74
68
|
'is_multimodal_processing': False,
|
75
69
|
'kwargs': {
|
76
70
|
'autodetect_encoding': True
|
77
71
|
}
|
78
72
|
},
|
79
73
|
'.yaml': {
|
80
|
-
'class':
|
74
|
+
'class': AlitaTextLoader,
|
81
75
|
'is_multimodal_processing': False,
|
82
76
|
'kwargs': {
|
83
77
|
'autodetect_encoding': True
|
84
78
|
}
|
85
79
|
},
|
86
80
|
'.groovy': {
|
87
|
-
'class':
|
81
|
+
'class': AlitaTextLoader,
|
88
82
|
'is_multimodal_processing': False,
|
89
83
|
'kwargs': {
|
90
84
|
'autodetect_encoding': True
|
@@ -121,7 +115,7 @@ loaders_map = {
|
|
121
115
|
}
|
122
116
|
},
|
123
117
|
'.pdf': {
|
124
|
-
'class':
|
118
|
+
'class': AlitaPDFLoader,
|
125
119
|
'is_multimodal_processing': False,
|
126
120
|
'kwargs': {}
|
127
121
|
},
|
@@ -131,7 +125,7 @@ loaders_map = {
|
|
131
125
|
'kwargs': {}
|
132
126
|
},
|
133
127
|
'.json': {
|
134
|
-
'class':
|
128
|
+
'class': AlitaTextLoader,
|
135
129
|
'is_multimodal_processing': False,
|
136
130
|
'kwargs': {
|
137
131
|
'autodetect_encoding': True
|
@@ -153,12 +147,12 @@ loaders_map = {
|
|
153
147
|
'kwargs': {}
|
154
148
|
},
|
155
149
|
'.ppt': {
|
156
|
-
'class':
|
150
|
+
'class': AlitaPowerPointLoader,
|
157
151
|
'is_multimodal_processing': False,
|
158
152
|
'kwargs': {}
|
159
153
|
},
|
160
154
|
'.pptx': {
|
161
|
-
'class':
|
155
|
+
'class': AlitaPowerPointLoader,
|
162
156
|
'is_multimodal_processing': False,
|
163
157
|
'kwargs': {}
|
164
158
|
},
|