alita-sdk 0.3.211__py3-none-any.whl → 0.3.212__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (26)
  1. alita_sdk/runtime/clients/client.py +2 -2
  2. alita_sdk/runtime/langchain/document_loaders/AlitaDocxMammothLoader.py +48 -24
  3. alita_sdk/runtime/langchain/document_loaders/AlitaExcelLoader.py +47 -1
  4. alita_sdk/runtime/langchain/document_loaders/AlitaImageLoader.py +103 -49
  5. alita_sdk/runtime/langchain/document_loaders/AlitaPDFLoader.py +63 -0
  6. alita_sdk/runtime/langchain/document_loaders/AlitaPowerPointLoader.py +54 -0
  7. alita_sdk/runtime/langchain/document_loaders/AlitaTextLoader.py +66 -0
  8. alita_sdk/runtime/langchain/document_loaders/constants.py +13 -19
  9. alita_sdk/runtime/langchain/document_loaders/utils.py +30 -1
  10. alita_sdk/runtime/tools/artifact.py +2 -4
  11. alita_sdk/runtime/tools/vectorstore.py +2 -1
  12. alita_sdk/tools/ado/test_plan/test_plan_wrapper.py +13 -37
  13. alita_sdk/tools/ado/wiki/ado_wrapper.py +10 -39
  14. alita_sdk/tools/confluence/api_wrapper.py +2 -0
  15. alita_sdk/tools/elitea_base.py +20 -1
  16. alita_sdk/tools/gitlab/__init__.py +3 -2
  17. alita_sdk/tools/gitlab/api_wrapper.py +45 -18
  18. alita_sdk/tools/gitlab_org/api_wrapper.py +44 -25
  19. alita_sdk/tools/sharepoint/api_wrapper.py +13 -13
  20. alita_sdk/tools/testrail/api_wrapper.py +20 -0
  21. alita_sdk/tools/utils/content_parser.py +37 -162
  22. {alita_sdk-0.3.211.dist-info → alita_sdk-0.3.212.dist-info}/METADATA +1 -1
  23. {alita_sdk-0.3.211.dist-info → alita_sdk-0.3.212.dist-info}/RECORD +26 -23
  24. {alita_sdk-0.3.211.dist-info → alita_sdk-0.3.212.dist-info}/WHEEL +0 -0
  25. {alita_sdk-0.3.211.dist-info → alita_sdk-0.3.212.dist-info}/licenses/LICENSE +0 -0
  26. {alita_sdk-0.3.211.dist-info → alita_sdk-0.3.212.dist-info}/top_level.txt +0 -0
@@ -249,9 +249,9 @@ class AlitaClient:
249
249
  elif app_type == "llama":
250
250
  app_type = "react"
251
251
  elif app_type == "dial":
252
- app_type = "openai"
252
+ app_type = "react"
253
253
  elif app_type == 'autogen':
254
- app_type = "openai"
254
+ app_type = "react"
255
255
  if runtime == 'nonrunnable':
256
256
  return LangChainAssistant(self, data, llm, chat_history, app_type,
257
257
  tools=tools, memory=memory, store=store)
@@ -1,16 +1,15 @@
1
- import base64
2
1
  import re
2
+ from io import BytesIO
3
3
 
4
4
  import mammoth.images
5
5
  import pytesseract
6
6
  from PIL import Image
7
7
  from langchain_core.document_loaders import BaseLoader
8
8
  from langchain_core.documents import Document
9
- from langchain_core.messages import HumanMessage
10
9
  from mammoth import convert_to_html
11
10
  from markdownify import markdownify
12
11
 
13
- from ..constants import DEFAULT_MULTIMODAL_PROMPT
12
+ from .utils import perform_llm_prediction_for_image_bytes
14
13
 
15
14
 
16
15
  class AlitaDocxMammothLoader(BaseLoader):
@@ -18,7 +17,7 @@ class AlitaDocxMammothLoader(BaseLoader):
18
17
  Loader for Docx files using Mammoth to convert to HTML, with image handling,
19
18
  and then Markdownify to convert HTML to markdown.
20
19
  """
21
- def __init__(self, file_path: str, **kwargs):
20
+ def __init__(self, **kwargs):
22
21
  """
23
22
  Initializes AlitaDocxMammothLoader.
24
23
 
@@ -30,7 +29,10 @@ class AlitaDocxMammothLoader(BaseLoader):
30
29
  Raises:
31
30
  ValueError: If the 'path' parameter is not provided.
32
31
  """
33
- self.path = file_path
32
+ self.path = kwargs.get('file_path')
33
+ self.file_content = kwargs.get('file_content')
34
+ self.file_name = kwargs.get('file_name')
35
+ self.extract_images = kwargs.get('extract_images')
34
36
  self.llm = kwargs.get("llm")
35
37
  self.prompt = kwargs.get("prompt")
36
38
 
@@ -52,20 +54,7 @@ class AlitaDocxMammothLoader(BaseLoader):
52
54
  if self.llm:
53
55
  # Use LLM for image understanding
54
56
  with image.open() as image_bytes:
55
- base64_string = base64.b64encode(image_bytes.read()).decode()
56
- url_path = f"data:image/{image.content_type};base64,{base64_string}"
57
- result = self.llm.invoke([
58
- HumanMessage(
59
- content=[
60
- {"type": "text",
61
- "text": self.prompt if self.prompt is not None else DEFAULT_MULTIMODAL_PROMPT},
62
- {
63
- "type": "image_url",
64
- "image_url": {"url": url_path},
65
- },
66
- ]
67
- )
68
- ]).content
57
+ result = perform_llm_prediction_for_image_bytes(image_bytes, self.llm, self.prompt)
69
58
  output['src'] = result # LLM image transcript in src
70
59
  return output
71
60
  else:
@@ -114,9 +103,44 @@ class AlitaDocxMammothLoader(BaseLoader):
114
103
  List[Document]: A list containing a single Document with the markdown content
115
104
  and metadata including the source file path.
116
105
  """
117
- with open(self.path, 'rb') as docx_file:
118
- result = convert_to_html(docx_file, convert_image=mammoth.images.img_element(self.__handle_image))
119
- content = markdownify(result.value, heading_style="ATX")
120
- result_content = self.__postprocess_original_md(content)
121
- return [Document(page_content=result_content, metadata={'source': str(self.path)})]
106
+ result_content = self.get_content()
107
+ return [Document(page_content=result_content, metadata={'source': str(self.path)})]
108
+
109
+ def get_content(self):
110
+ """
111
+ Extracts and converts the content of the Docx file to markdown format.
112
+
113
+ Handles both file paths and in-memory file content.
122
114
 
115
+ Returns:
116
+ str: The markdown content extracted from the Docx file.
117
+ """
118
+ if self.path:
119
+ # If path is provided, read from file system
120
+ with open(self.path, 'rb') as docx_file:
121
+ return self._convert_docx_to_markdown(docx_file)
122
+ elif self.file_content and self.file_name:
123
+ # If file_content and file_name are provided, read from memory
124
+ docx_file = BytesIO(self.file_content)
125
+ return self._convert_docx_to_markdown(docx_file)
126
+ else:
127
+ raise ValueError("Either 'path' or 'file_content' and 'file_name' must be provided.")
128
+
129
+ def _convert_docx_to_markdown(self, docx_file):
130
+ """
131
+ Converts the content of a Docx file to markdown format.
132
+
133
+ Args:
134
+ docx_file (BinaryIO): The Docx file object.
135
+
136
+ Returns:
137
+ str: The markdown content extracted from the Docx file.
138
+ """
139
+ if self.extract_images:
140
+ # Extract images using the provided image handler
141
+ result = convert_to_html(docx_file, convert_image=mammoth.images.img_element(self.__handle_image))
142
+ else:
143
+ # Ignore images
144
+ result = convert_to_html(docx_file, convert_image=lambda image: "")
145
+ content = markdownify(result.value, heading_style="ATX")
146
+ return self.__postprocess_original_md(content)
@@ -11,14 +11,60 @@
11
11
  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
12
  # See the License for the specific language governing permissions and
13
13
  # limitations under the License.
14
-
14
+ import io
15
15
  from typing import Iterator
16
16
  import pandas as pd
17
17
  from json import loads
18
+
19
+ from langchain_core.tools import ToolException
18
20
  from .AlitaTableLoader import AlitaTableLoader
19
21
 
20
22
 
21
23
  class AlitaExcelLoader(AlitaTableLoader):
24
+
25
+ excel_by_sheets: bool = False
26
+ sheet_name: str = None
27
+ return_type: str = 'str'
28
+
29
+ def __init__(self, **kwargs):
30
+ if not kwargs.get('file_path'):
31
+ file_content = kwargs.get('file_content')
32
+ if file_content:
33
+ kwargs['file_path'] = io.BytesIO(file_content)
34
+ super().__init__(**kwargs)
35
+ self.excel_by_sheets = kwargs.get('excel_by_sheets')
36
+ self.return_type = kwargs.get('return_type')
37
+ self.sheet_name = kwargs.get('sheet_name')
38
+
39
+ def get_content(self):
40
+ try:
41
+ dfs = pd.read_excel(self.file_path, sheet_name=self.sheet_name)
42
+
43
+ if self.excel_by_sheets:
44
+ result = {}
45
+ for sheet_name, df in dfs.items():
46
+ df.fillna('', inplace=True)
47
+ result[sheet_name] = self.parse_sheet(df)
48
+ return result
49
+ else:
50
+ result = []
51
+ for sheet_name, df in dfs.items():
52
+ string_content = self.parse_sheet(df)
53
+ result.append(f"====== Sheet name: {sheet_name} ======\n{string_content}")
54
+ return "\n\n".join(result)
55
+ except Exception as e:
56
+ return ToolException(f"Error reading Excel file: {e}")
57
+
58
+ def parse_sheet(self, df):
59
+ df.fillna('', inplace=True)
60
+
61
+ if self.return_type == 'dict':
62
+ return df.to_dict(orient='records')
63
+ elif self.return_type == 'csv':
64
+ return df.to_csv()
65
+ else:
66
+ return df.to_string(index=False)
67
+
22
68
  def read(self):
23
69
  df = pd.read_excel(self.file_path, sheet_name=None)
24
70
  docs = []
@@ -6,10 +6,10 @@ import pytesseract
6
6
  from PIL import Image
7
7
  from langchain_core.document_loaders import BaseLoader
8
8
  from langchain_core.documents import Document
9
- from langchain_core.messages import HumanMessage
10
9
  from reportlab.graphics import renderPM
11
10
  from svglib.svglib import svg2rlg
12
11
 
12
+ from .utils import perform_llm_prediction_for_image_bytes
13
13
  from ..constants import DEFAULT_MULTIMODAL_PROMPT
14
14
  from ..tools.utils import image_to_byte_array, bytes_to_base64
15
15
 
@@ -25,6 +25,9 @@ class AlitaImageLoader(BaseLoader):
25
25
  self.file_path = file_path
26
26
  elif kwargs.get('path'):
27
27
  self.file_path = kwargs['path']
28
+ elif kwargs.get('file_content'):
29
+ self.file_content = kwargs['file_content']
30
+ self.file_name = kwargs['file_name']
28
31
  else:
29
32
  raise ValueError(
30
33
  "Path parameter is required (either as 'file_path' positional argument or 'path' keyword argument)")
@@ -33,22 +36,107 @@ class AlitaImageLoader(BaseLoader):
33
36
  self.prompt = kwargs.get('prompt') if kwargs.get(
34
37
  'prompt') is not None else DEFAULT_MULTIMODAL_PROMPT # Use provided prompt or default
35
38
 
39
+ def get_content(self):
40
+ """
41
+ Retrieves the text content from the file or in-memory content.
42
+
43
+ Depending on the file type (SVG or raster image) and the availability of LLM,
44
+ processes the file appropriately using OCR or LLM.
45
+
46
+ Returns:
47
+ str: Extracted text content from the file.
48
+ """
49
+ try:
50
+ if hasattr(self, 'file_path'):
51
+ # If file_path is provided
52
+ file_path = Path(self.file_path)
53
+ if not file_path.exists():
54
+ raise FileNotFoundError(f"File not found: {self.file_path}")
55
+
56
+ if file_path.suffix.lower() == '.svg':
57
+ text_content = self._process_svg(self.file_path)
58
+ else:
59
+ text_content = self._process_raster_image(self.file_path)
60
+
61
+ elif hasattr(self, 'file_content') and hasattr(self, 'file_name'):
62
+ # If file_content and file_name are provided
63
+ file_name = Path(self.file_name)
64
+ if file_name.suffix.lower() == '.svg':
65
+ text_content = self._process_svg(BytesIO(self.file_content))
66
+ else:
67
+ text_content = self._process_raster_image(BytesIO(self.file_content))
68
+ else:
69
+ raise ValueError("Either 'file_path' or 'file_content' and 'file_name' must be provided.")
70
+
71
+ except pytesseract.TesseractError as e:
72
+ raise ValueError(f"Error during OCR: {e}")
73
+ except ImportError as e:
74
+ raise ImportError(
75
+ f"Error: SVG processing dependencies not installed. Please install svglib and reportlab: {e}")
76
+ except Exception as e:
77
+ raise ValueError(f"Error opening image or processing SVG: {e}")
78
+
79
+ return text_content
80
+
81
+ def _process_svg(self, svg_source):
82
+ """
83
+ Processes an SVG file or in-memory SVG content.
84
+
85
+ If an LLM is available, the SVG is processed using LLM. Otherwise, the SVG
86
+ is converted to PNG and processed using OCR.
87
+
88
+ Args:
89
+ svg_source (str, Path, or BytesIO): The SVG file path or in-memory content.
90
+
91
+ Returns:
92
+ str: Extracted text content from the SVG.
93
+ """
94
+ if self.llm:
95
+ if isinstance(svg_source, (str, Path)):
96
+ with open(svg_source, 'rb') as f:
97
+ svg_content = f.read()
98
+ else:
99
+ svg_content = svg_source.read()
100
+ return self.__process_svg_with_llm(svg_content, self.llm, self.prompt)
101
+ else:
102
+ # For OCR on SVG, convert SVG to PNG and then use OCR
103
+ if isinstance(svg_source, (str, Path)):
104
+ drawing = svg2rlg(str(svg_source)) # svglib requires path as a string
105
+ else:
106
+ drawing = svg2rlg(svg_source) # svglib supports BytesIO
107
+ img_data = BytesIO()
108
+ renderPM.drawToFile(drawing, img_data, fmt="PNG")
109
+ img_data.seek(0)
110
+ image = Image.open(img_data)
111
+ return pytesseract.image_to_string(image, lang=self.ocr_language)
112
+
113
+ def _process_raster_image(self, image_source):
114
+ """
115
+ Processes a raster image (e.g., PNG, JPG).
116
+
117
+ If an LLM is available, the image is processed using LLM. Otherwise, OCR is used
118
+ to extract text content from the image.
119
+
120
+ Args:
121
+ image_source (str, Path, or BytesIO): The image file path or in-memory content.
122
+
123
+ Returns:
124
+ str: Extracted text content from the raster image.
125
+ """
126
+ image = Image.open(image_source)
127
+ if self.llm:
128
+ try:
129
+ return self.__perform_llm_prediction_for_image(image, self.llm, self.prompt)
130
+ except Exception as e:
131
+ print(f"Warning: Error during LLM processing of image: {e}. Falling back to OCR.")
132
+ return pytesseract.image_to_string(image, lang=self.ocr_language)
133
+ else:
134
+ return pytesseract.image_to_string(image, lang=self.ocr_language)
135
+
36
136
  def __perform_llm_prediction_for_image(self, image: Image, llm, prompt: str) -> str:
37
137
  """Performs LLM prediction for image content."""
38
138
  byte_array = image_to_byte_array(image)
39
- base64_string = bytes_to_base64(byte_array)
40
- result = llm.invoke([
41
- HumanMessage(
42
- content=[
43
- {"type": "text", "text": prompt},
44
- {
45
- "type": "image_url",
46
- "image_url": {"url": f"data:image/png;base64,{base64_string}"},
47
- },
48
- ]
49
- )
50
- ])
51
- return result.content
139
+ return perform_llm_prediction_for_image_bytes(byte_array, llm, prompt)
52
140
 
53
141
  def __process_svg_with_llm(self, svg_content: bytes, llm, prompt: str) -> str:
54
142
  """Processes SVG content using LLM."""
@@ -61,41 +149,7 @@ class AlitaImageLoader(BaseLoader):
61
149
 
62
150
  def load(self) -> List[Document]:
63
151
  """Load text from image using OCR or LLM if llm is provided, supports SVG."""
64
- file_path = Path(self.file_path)
65
- try:
66
- if file_path.suffix.lower() == '.svg':
67
- if self.llm:
68
- with open(self.file_path, 'rb') as f:
69
- svg_content = f.read()
70
- text_content = self.__process_svg_with_llm(svg_content, self.llm, self.prompt)
71
- else:
72
- # For OCR on SVG, we first convert SVG to PNG then use OCR
73
- drawing = svg2rlg(str(self.file_path)) # svglib requires path as string
74
- img_data = BytesIO()
75
- renderPM.drawToFile(drawing, img_data, fmt="PNG")
76
- img_data.seek(0)
77
- image = Image.open(img_data)
78
- text_content = pytesseract.image_to_string(image, lang=self.ocr_language)
79
- else: # For raster images (png, jpg, etc.)
80
- image = Image.open(self.file_path)
81
- if self.llm:
82
- try:
83
- text_content = self.__perform_llm_prediction_for_image(image, self.llm, self.prompt)
84
- except Exception as e:
85
- print(f"Warning: Error during LLM processing of image: {e}. Falling back to OCR.")
86
- text_content = pytesseract.image_to_string(image,
87
- lang=self.ocr_language) # Fallback to OCR if LLM fails
88
- else:
89
- text_content = pytesseract.image_to_string(image, lang=self.ocr_language)
90
- except FileNotFoundError:
91
- raise FileNotFoundError(f"File not found: {self.file_path}")
92
- except pytesseract.TesseractError as e:
93
- raise ValueError(f"Error during OCR: {e}")
94
- except ImportError as e: # svglib or reportlab missing
95
- raise ImportError(
96
- f"Error: SVG processing dependencies not installed. Please install svglib and reportlab: {e}")
97
- except Exception as e:
98
- raise ValueError(f"Error opening image or processing SVG: {e}")
152
+ text_content = self.get_content()
99
153
 
100
154
  metadata = {"source": str(self.file_path)} # Ensure source is always a string for metadata
101
155
  return [Document(page_content=text_content, metadata=metadata)]
@@ -0,0 +1,63 @@
1
+ import pymupdf
2
+ from langchain_community.document_loaders import PyPDFLoader
3
+ from .utils import perform_llm_prediction_for_image_bytes, create_temp_file
4
+ from langchain_core.tools import ToolException
5
+
6
+ class AlitaPDFLoader:
7
+
8
+ def __init__(self, **kwargs):
9
+ if kwargs.get('file_path'):
10
+ self.file_path = kwargs.get('file_path')
11
+ elif kwargs.get('file_content'):
12
+ self.file_content = kwargs.get('file_content')
13
+ else:
14
+ raise ToolException("'file_path' or 'file_content' parameter should be provided.")
15
+ self.password = kwargs.get('password', None)
16
+ self.page_number = kwargs.get('page_number', None)
17
+ self.extract_images = kwargs.get('extract_images', False)
18
+ self.llm = kwargs.get('llm', None)
19
+ self.prompt = kwargs.get('prompt', "Describe image")
20
+ self.headers = kwargs.get('headers', None)
21
+ self.extraction_mode = kwargs.get('extraction_mode', "plain")
22
+ self.extraction_kwargs = kwargs.get('extraction_kwargs', None)
23
+
24
+ def get_content(self):
25
+ if hasattr(self, 'file_path'):
26
+ with pymupdf.open(filename=self.file_path, filetype="pdf") as report:
27
+ return self.parse_report(report)
28
+ else:
29
+ with pymupdf.open(stream=self.file_content, filetype="pdf") as report:
30
+ return self.parse_report(report)
31
+
32
+ def parse_report(self, report):
33
+ text_content = ''
34
+ if self.page_number is not None:
35
+ page = report.load_page(self.page_number - 1)
36
+ text_content += self.read_pdf_page(report, page, self.page_number)
37
+ else:
38
+ for index, page in enumerate(report, start=1):
39
+ text_content += self.read_pdf_page(report, page, index)
40
+
41
+ return text_content
42
+
43
+ def read_pdf_page(self, report, page, index):
44
+ text_content = f'Page: {index}\n'
45
+ text_content += page.get_text()
46
+ if self.extract_images:
47
+ images = page.get_images(full=True)
48
+ for i, img in enumerate(images):
49
+ xref = img[0]
50
+ base_image = report.extract_image(xref)
51
+ img_bytes = base_image["image"]
52
+ text_content += "\n**Image Transcript:**\n" + perform_llm_prediction_for_image_bytes(img_bytes, self.llm, self.prompt) + "\n--------------------\n"
53
+ return text_content
54
+
55
+ def load(self):
56
+ if not hasattr(self, 'file_path'):
57
+ self.file_path = create_temp_file(self.file_content)
58
+ return PyPDFLoader(file_path=self.file_path,
59
+ password=self.password,
60
+ headers=self.headers,
61
+ extract_images=self.extract_images,
62
+ extraction_mode=self.extraction_mode,
63
+ extraction_kwargs=self.extraction_kwargs).load()
@@ -0,0 +1,54 @@
1
+ import io
2
+
3
+ from langchain_community.document_loaders import UnstructuredPowerPointLoader
4
+ from langchain_core.tools import ToolException
5
+ from pptx import Presentation
6
+ from .utils import perform_llm_prediction_for_image_bytes, create_temp_file
7
+ from pptx.enum.shapes import MSO_SHAPE_TYPE
8
+
9
+ class AlitaPowerPointLoader:
10
+
11
+ def __init__(self, file_path=None, file_content=None, mode=None, **unstructured_kwargs):
12
+ if file_path:
13
+ self.file_path = file_path
14
+ elif file_content:
15
+ self.file_content = file_content
16
+ else:
17
+ raise ToolException("'file_path' or 'file_content' parameter should be provided.")
18
+
19
+ self.mode=mode
20
+ self.unstructured_kwargs = unstructured_kwargs
21
+ self.page_number = unstructured_kwargs.get('page_number', None)
22
+ self.extract_images = unstructured_kwargs.get('extract_images', False)
23
+ self.llm = unstructured_kwargs.get('llm', None)
24
+ self.prompt = unstructured_kwargs.get('prompt', "Describe image")
25
+
26
+ def get_content(self):
27
+ prs = Presentation(io.BytesIO(self.file_content))
28
+ text_content = ''
29
+ if self.page_number is not None:
30
+ text_content += self.read_pptx_slide(prs.slides[self.page_number - 1], self.page_number)
31
+ else:
32
+ for index, slide in enumerate(prs.slides, start=1):
33
+ text_content += self.read_pptx_slide(slide, index)
34
+ return text_content
35
+
36
+ def read_pptx_slide(self, slide, index):
37
+ text_content = f'Slide: {index}\n'
38
+ for shape in slide.shapes:
39
+ if hasattr(shape, "text"):
40
+ text_content += shape.text + "\n"
41
+ elif self.extract_images and shape.shape_type == MSO_SHAPE_TYPE.PICTURE:
42
+ try:
43
+ caption = perform_llm_prediction_for_image_bytes(shape.image.blob, self.llm)
44
+ except:
45
+ caption = "unknown"
46
+ text_content += "\n**Image Transcript:**\n" + caption + "\n--------------------\n"
47
+ return text_content
48
+
49
+ def load(self):
50
+ if not self.file_path:
51
+ self.file_path = create_temp_file(self.file_content)
52
+ return UnstructuredPowerPointLoader(file_path=self.file_path,
53
+ mode=self.mode,
54
+ **self.unstructured_kwargs).load()
@@ -0,0 +1,66 @@
1
+ from typing import Iterator
2
+
3
+ from langchain_core.documents import Document
4
+
5
+ from langchain_community.document_loaders.base import BaseLoader
6
+ from langchain_community.document_loaders.helpers import detect_file_encodings
7
+ from langchain_core.tools import ToolException
8
+
9
+ class AlitaTextLoader(BaseLoader):
10
+
11
+ def __init__(self, **kwargs):
12
+ """Initialize with file path."""
13
+ if kwargs.get('file_path'):
14
+ self.file_path = kwargs['file_path']
15
+ elif kwargs.get('file_content'):
16
+ self.file_content = kwargs['file_content']
17
+ self.file_name = kwargs['file_name']
18
+ else:
19
+ raise ToolException("'file_path' or 'file_content' parameter should be provided.")
20
+ self.encoding = kwargs.get('encoding', 'utf-8')
21
+ self.autodetect_encoding = kwargs.get('autodetect_encoding', False)
22
+
23
+ def get_content(self):
24
+ text = ""
25
+ try:
26
+ if hasattr(self, 'file_path') and self.file_path:
27
+ with open(self.file_path, encoding=self.encoding) as f:
28
+ text = f.read()
29
+ elif hasattr(self, 'file_content') and self.file_content:
30
+ text = self.file_content.decode(self.encoding)
31
+ else:
32
+ raise ValueError("Neither file_path nor file_content is provided.")
33
+
34
+ except UnicodeDecodeError as e:
35
+ if self.autodetect_encoding:
36
+ if hasattr(self, 'file_path') and self.file_path:
37
+ detected_encodings = detect_file_encodings(self.file_path)
38
+ for encoding in detected_encodings:
39
+ try:
40
+ with open(self.file_path, encoding=encoding.encoding) as f:
41
+ text = f.read()
42
+ break
43
+ except UnicodeDecodeError:
44
+ continue
45
+ elif hasattr(self, 'file_content') and self.file_content:
46
+ detected_encodings = detect_file_encodings(self.file_content)
47
+ for encoding in detected_encodings:
48
+ try:
49
+ text = self.file_content.decode(encoding.encoding)
50
+ break
51
+ except UnicodeDecodeError:
52
+ continue
53
+ else:
54
+ raise ValueError("Neither file_path nor file_content is provided for encoding detection.")
55
+ else:
56
+ raise RuntimeError(f"Error loading content with encoding {self.encoding}.") from e
57
+ except Exception as e:
58
+ raise RuntimeError(f"Error loading content.") from e
59
+
60
+ return text
61
+
62
+ def lazy_load(self) -> Iterator[Document]:
63
+ """Load from file path."""
64
+ text = self.get_content()
65
+ metadata = {"source": str(self.file_path) if hasattr(self, 'file_path') else self.file_name}
66
+ yield Document(page_content=text, metadata=metadata)
@@ -12,24 +12,18 @@
12
12
  # See the License for the specific language governing permissions and
13
13
  # limitations under the License.
14
14
 
15
- from langchain_community.document_loaders import (TextLoader,
15
+ from langchain_community.document_loaders import (
16
16
  UnstructuredMarkdownLoader,
17
- PyPDFLoader,
18
- UnstructuredPDFLoader,UnstructuredWordDocumentLoader,
19
- JSONLoader, AirbyteJSONLoader, UnstructuredHTMLLoader,
20
- UnstructuredPowerPointLoader, PythonLoader)
21
-
22
- from langchain_community.document_loaders import (TextLoader,
23
- UnstructuredMarkdownLoader,
24
- PyPDFLoader,
25
- UnstructuredPDFLoader,UnstructuredWordDocumentLoader,
26
- JSONLoader, AirbyteJSONLoader, UnstructuredHTMLLoader,
17
+ AirbyteJSONLoader, UnstructuredHTMLLoader,
27
18
  UnstructuredPowerPointLoader, PythonLoader)
28
19
 
29
20
  from .AlitaCSVLoader import AlitaCSVLoader
30
21
  from .AlitaDocxMammothLoader import AlitaDocxMammothLoader
31
22
  from .AlitaExcelLoader import AlitaExcelLoader
32
23
  from .AlitaImageLoader import AlitaImageLoader
24
+ from .AlitaPDFLoader import AlitaPDFLoader
25
+ from .AlitaTextLoader import AlitaTextLoader
26
+ from .AlitaPowerPointLoader import AlitaPowerPointLoader
33
27
 
34
28
  loaders_map = {
35
29
  '.png': {
@@ -63,28 +57,28 @@ loaders_map = {
63
57
  'kwargs': {}
64
58
  },
65
59
  '.txt': {
66
- 'class': TextLoader,
60
+ 'class': AlitaTextLoader,
67
61
  'is_multimodal_processing': False,
68
62
  'kwargs': {
69
63
  'autodetect_encoding': True
70
64
  }
71
65
  },
72
66
  '.yml': {
73
- 'class': TextLoader,
67
+ 'class': AlitaTextLoader,
74
68
  'is_multimodal_processing': False,
75
69
  'kwargs': {
76
70
  'autodetect_encoding': True
77
71
  }
78
72
  },
79
73
  '.yaml': {
80
- 'class': TextLoader,
74
+ 'class': AlitaTextLoader,
81
75
  'is_multimodal_processing': False,
82
76
  'kwargs': {
83
77
  'autodetect_encoding': True
84
78
  }
85
79
  },
86
80
  '.groovy': {
87
- 'class': TextLoader,
81
+ 'class': AlitaTextLoader,
88
82
  'is_multimodal_processing': False,
89
83
  'kwargs': {
90
84
  'autodetect_encoding': True
@@ -121,7 +115,7 @@ loaders_map = {
121
115
  }
122
116
  },
123
117
  '.pdf': {
124
- 'class': PyPDFLoader,
118
+ 'class': AlitaPDFLoader,
125
119
  'is_multimodal_processing': False,
126
120
  'kwargs': {}
127
121
  },
@@ -131,7 +125,7 @@ loaders_map = {
131
125
  'kwargs': {}
132
126
  },
133
127
  '.json': {
134
- 'class': TextLoader,
128
+ 'class': AlitaTextLoader,
135
129
  'is_multimodal_processing': False,
136
130
  'kwargs': {
137
131
  'autodetect_encoding': True
@@ -153,12 +147,12 @@ loaders_map = {
153
147
  'kwargs': {}
154
148
  },
155
149
  '.ppt': {
156
- 'class': UnstructuredPowerPointLoader,
150
+ 'class': AlitaPowerPointLoader,
157
151
  'is_multimodal_processing': False,
158
152
  'kwargs': {}
159
153
  },
160
154
  '.pptx': {
161
- 'class': UnstructuredPowerPointLoader,
155
+ 'class': AlitaPowerPointLoader,
162
156
  'is_multimodal_processing': False,
163
157
  'kwargs': {}
164
158
  },