alita-sdk 0.3.462__py3-none-any.whl → 0.3.627__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- alita_sdk/cli/agent/__init__.py +5 -0
- alita_sdk/cli/agent/default.py +258 -0
- alita_sdk/cli/agent_executor.py +15 -3
- alita_sdk/cli/agent_loader.py +56 -8
- alita_sdk/cli/agent_ui.py +93 -31
- alita_sdk/cli/agents.py +2274 -230
- alita_sdk/cli/callbacks.py +96 -25
- alita_sdk/cli/cli.py +10 -1
- alita_sdk/cli/config.py +162 -9
- alita_sdk/cli/context/__init__.py +30 -0
- alita_sdk/cli/context/cleanup.py +198 -0
- alita_sdk/cli/context/manager.py +731 -0
- alita_sdk/cli/context/message.py +285 -0
- alita_sdk/cli/context/strategies.py +289 -0
- alita_sdk/cli/context/token_estimation.py +127 -0
- alita_sdk/cli/input_handler.py +419 -0
- alita_sdk/cli/inventory.py +1073 -0
- alita_sdk/cli/testcases/__init__.py +94 -0
- alita_sdk/cli/testcases/data_generation.py +119 -0
- alita_sdk/cli/testcases/discovery.py +96 -0
- alita_sdk/cli/testcases/executor.py +84 -0
- alita_sdk/cli/testcases/logger.py +85 -0
- alita_sdk/cli/testcases/parser.py +172 -0
- alita_sdk/cli/testcases/prompts.py +91 -0
- alita_sdk/cli/testcases/reporting.py +125 -0
- alita_sdk/cli/testcases/setup.py +108 -0
- alita_sdk/cli/testcases/test_runner.py +282 -0
- alita_sdk/cli/testcases/utils.py +39 -0
- alita_sdk/cli/testcases/validation.py +90 -0
- alita_sdk/cli/testcases/workflow.py +196 -0
- alita_sdk/cli/toolkit.py +14 -17
- alita_sdk/cli/toolkit_loader.py +35 -5
- alita_sdk/cli/tools/__init__.py +36 -2
- alita_sdk/cli/tools/approval.py +224 -0
- alita_sdk/cli/tools/filesystem.py +910 -64
- alita_sdk/cli/tools/planning.py +389 -0
- alita_sdk/cli/tools/terminal.py +414 -0
- alita_sdk/community/__init__.py +72 -12
- alita_sdk/community/inventory/__init__.py +236 -0
- alita_sdk/community/inventory/config.py +257 -0
- alita_sdk/community/inventory/enrichment.py +2137 -0
- alita_sdk/community/inventory/extractors.py +1469 -0
- alita_sdk/community/inventory/ingestion.py +3172 -0
- alita_sdk/community/inventory/knowledge_graph.py +1457 -0
- alita_sdk/community/inventory/parsers/__init__.py +218 -0
- alita_sdk/community/inventory/parsers/base.py +295 -0
- alita_sdk/community/inventory/parsers/csharp_parser.py +907 -0
- alita_sdk/community/inventory/parsers/go_parser.py +851 -0
- alita_sdk/community/inventory/parsers/html_parser.py +389 -0
- alita_sdk/community/inventory/parsers/java_parser.py +593 -0
- alita_sdk/community/inventory/parsers/javascript_parser.py +629 -0
- alita_sdk/community/inventory/parsers/kotlin_parser.py +768 -0
- alita_sdk/community/inventory/parsers/markdown_parser.py +362 -0
- alita_sdk/community/inventory/parsers/python_parser.py +604 -0
- alita_sdk/community/inventory/parsers/rust_parser.py +858 -0
- alita_sdk/community/inventory/parsers/swift_parser.py +832 -0
- alita_sdk/community/inventory/parsers/text_parser.py +322 -0
- alita_sdk/community/inventory/parsers/yaml_parser.py +370 -0
- alita_sdk/community/inventory/patterns/__init__.py +61 -0
- alita_sdk/community/inventory/patterns/ast_adapter.py +380 -0
- alita_sdk/community/inventory/patterns/loader.py +348 -0
- alita_sdk/community/inventory/patterns/registry.py +198 -0
- alita_sdk/community/inventory/presets.py +535 -0
- alita_sdk/community/inventory/retrieval.py +1403 -0
- alita_sdk/community/inventory/toolkit.py +173 -0
- alita_sdk/community/inventory/toolkit_utils.py +176 -0
- alita_sdk/community/inventory/visualize.py +1370 -0
- alita_sdk/configurations/__init__.py +1 -1
- alita_sdk/configurations/ado.py +141 -20
- alita_sdk/configurations/bitbucket.py +0 -3
- alita_sdk/configurations/confluence.py +76 -42
- alita_sdk/configurations/figma.py +76 -0
- alita_sdk/configurations/gitlab.py +17 -5
- alita_sdk/configurations/openapi.py +329 -0
- alita_sdk/configurations/qtest.py +72 -1
- alita_sdk/configurations/report_portal.py +96 -0
- alita_sdk/configurations/sharepoint.py +148 -0
- alita_sdk/configurations/testio.py +83 -0
- alita_sdk/runtime/clients/artifact.py +3 -3
- alita_sdk/runtime/clients/client.py +353 -48
- alita_sdk/runtime/clients/sandbox_client.py +0 -21
- alita_sdk/runtime/langchain/_constants_bkup.py +1318 -0
- alita_sdk/runtime/langchain/assistant.py +123 -26
- alita_sdk/runtime/langchain/constants.py +642 -1
- alita_sdk/runtime/langchain/document_loaders/AlitaExcelLoader.py +103 -60
- alita_sdk/runtime/langchain/document_loaders/AlitaJSONLinesLoader.py +77 -0
- alita_sdk/runtime/langchain/document_loaders/AlitaJSONLoader.py +6 -3
- alita_sdk/runtime/langchain/document_loaders/AlitaPowerPointLoader.py +226 -7
- alita_sdk/runtime/langchain/document_loaders/AlitaTextLoader.py +5 -2
- alita_sdk/runtime/langchain/document_loaders/constants.py +12 -7
- alita_sdk/runtime/langchain/langraph_agent.py +279 -73
- alita_sdk/runtime/langchain/utils.py +82 -15
- alita_sdk/runtime/llms/preloaded.py +2 -6
- alita_sdk/runtime/skills/__init__.py +91 -0
- alita_sdk/runtime/skills/callbacks.py +498 -0
- alita_sdk/runtime/skills/discovery.py +540 -0
- alita_sdk/runtime/skills/executor.py +610 -0
- alita_sdk/runtime/skills/input_builder.py +371 -0
- alita_sdk/runtime/skills/models.py +330 -0
- alita_sdk/runtime/skills/registry.py +355 -0
- alita_sdk/runtime/skills/skill_runner.py +330 -0
- alita_sdk/runtime/toolkits/__init__.py +7 -0
- alita_sdk/runtime/toolkits/application.py +21 -9
- alita_sdk/runtime/toolkits/artifact.py +15 -5
- alita_sdk/runtime/toolkits/datasource.py +13 -6
- alita_sdk/runtime/toolkits/mcp.py +139 -251
- alita_sdk/runtime/toolkits/mcp_config.py +1048 -0
- alita_sdk/runtime/toolkits/planning.py +178 -0
- alita_sdk/runtime/toolkits/skill_router.py +238 -0
- alita_sdk/runtime/toolkits/subgraph.py +251 -6
- alita_sdk/runtime/toolkits/tools.py +238 -32
- alita_sdk/runtime/toolkits/vectorstore.py +11 -5
- alita_sdk/runtime/tools/__init__.py +3 -1
- alita_sdk/runtime/tools/application.py +20 -6
- alita_sdk/runtime/tools/artifact.py +511 -28
- alita_sdk/runtime/tools/data_analysis.py +183 -0
- alita_sdk/runtime/tools/function.py +43 -15
- alita_sdk/runtime/tools/image_generation.py +50 -44
- alita_sdk/runtime/tools/llm.py +852 -67
- alita_sdk/runtime/tools/loop.py +3 -1
- alita_sdk/runtime/tools/loop_output.py +3 -1
- alita_sdk/runtime/tools/mcp_remote_tool.py +25 -10
- alita_sdk/runtime/tools/mcp_server_tool.py +7 -6
- alita_sdk/runtime/tools/planning/__init__.py +36 -0
- alita_sdk/runtime/tools/planning/models.py +246 -0
- alita_sdk/runtime/tools/planning/wrapper.py +607 -0
- alita_sdk/runtime/tools/router.py +2 -4
- alita_sdk/runtime/tools/sandbox.py +9 -6
- alita_sdk/runtime/tools/skill_router.py +776 -0
- alita_sdk/runtime/tools/tool.py +3 -1
- alita_sdk/runtime/tools/vectorstore.py +7 -2
- alita_sdk/runtime/tools/vectorstore_base.py +51 -11
- alita_sdk/runtime/utils/AlitaCallback.py +137 -21
- alita_sdk/runtime/utils/constants.py +5 -1
- alita_sdk/runtime/utils/mcp_client.py +492 -0
- alita_sdk/runtime/utils/mcp_oauth.py +202 -5
- alita_sdk/runtime/utils/mcp_sse_client.py +36 -7
- alita_sdk/runtime/utils/mcp_tools_discovery.py +124 -0
- alita_sdk/runtime/utils/serialization.py +155 -0
- alita_sdk/runtime/utils/streamlit.py +6 -10
- alita_sdk/runtime/utils/toolkit_utils.py +16 -5
- alita_sdk/runtime/utils/utils.py +36 -0
- alita_sdk/tools/__init__.py +113 -29
- alita_sdk/tools/ado/repos/__init__.py +51 -33
- alita_sdk/tools/ado/repos/repos_wrapper.py +148 -89
- alita_sdk/tools/ado/test_plan/__init__.py +25 -9
- alita_sdk/tools/ado/test_plan/test_plan_wrapper.py +23 -1
- alita_sdk/tools/ado/utils.py +1 -18
- alita_sdk/tools/ado/wiki/__init__.py +25 -8
- alita_sdk/tools/ado/wiki/ado_wrapper.py +291 -22
- alita_sdk/tools/ado/work_item/__init__.py +26 -9
- alita_sdk/tools/ado/work_item/ado_wrapper.py +56 -3
- alita_sdk/tools/advanced_jira_mining/__init__.py +11 -8
- alita_sdk/tools/aws/delta_lake/__init__.py +13 -9
- alita_sdk/tools/aws/delta_lake/tool.py +5 -1
- alita_sdk/tools/azure_ai/search/__init__.py +11 -8
- alita_sdk/tools/azure_ai/search/api_wrapper.py +1 -1
- alita_sdk/tools/base/tool.py +5 -1
- alita_sdk/tools/base_indexer_toolkit.py +170 -45
- alita_sdk/tools/bitbucket/__init__.py +17 -12
- alita_sdk/tools/bitbucket/api_wrapper.py +59 -11
- alita_sdk/tools/bitbucket/cloud_api_wrapper.py +49 -35
- alita_sdk/tools/browser/__init__.py +5 -4
- alita_sdk/tools/carrier/__init__.py +5 -6
- alita_sdk/tools/carrier/backend_reports_tool.py +6 -6
- alita_sdk/tools/carrier/run_ui_test_tool.py +6 -6
- alita_sdk/tools/carrier/ui_reports_tool.py +5 -5
- alita_sdk/tools/chunkers/__init__.py +3 -1
- alita_sdk/tools/chunkers/code/treesitter/treesitter.py +37 -13
- alita_sdk/tools/chunkers/sematic/json_chunker.py +1 -0
- alita_sdk/tools/chunkers/sematic/markdown_chunker.py +97 -6
- alita_sdk/tools/chunkers/universal_chunker.py +270 -0
- alita_sdk/tools/cloud/aws/__init__.py +10 -7
- alita_sdk/tools/cloud/azure/__init__.py +10 -7
- alita_sdk/tools/cloud/gcp/__init__.py +10 -7
- alita_sdk/tools/cloud/k8s/__init__.py +10 -7
- alita_sdk/tools/code/linter/__init__.py +10 -8
- alita_sdk/tools/code/loaders/codesearcher.py +3 -2
- alita_sdk/tools/code/sonar/__init__.py +10 -7
- alita_sdk/tools/code_indexer_toolkit.py +73 -23
- alita_sdk/tools/confluence/__init__.py +21 -15
- alita_sdk/tools/confluence/api_wrapper.py +78 -23
- alita_sdk/tools/confluence/loader.py +4 -2
- alita_sdk/tools/custom_open_api/__init__.py +12 -5
- alita_sdk/tools/elastic/__init__.py +11 -8
- alita_sdk/tools/elitea_base.py +493 -30
- alita_sdk/tools/figma/__init__.py +58 -11
- alita_sdk/tools/figma/api_wrapper.py +1235 -143
- alita_sdk/tools/figma/figma_client.py +73 -0
- alita_sdk/tools/figma/toon_tools.py +2748 -0
- alita_sdk/tools/github/__init__.py +13 -14
- alita_sdk/tools/github/github_client.py +224 -100
- alita_sdk/tools/github/graphql_client_wrapper.py +119 -33
- alita_sdk/tools/github/schemas.py +14 -5
- alita_sdk/tools/github/tool.py +5 -1
- alita_sdk/tools/github/tool_prompts.py +9 -22
- alita_sdk/tools/gitlab/__init__.py +15 -11
- alita_sdk/tools/gitlab/api_wrapper.py +207 -41
- alita_sdk/tools/gitlab_org/__init__.py +10 -8
- alita_sdk/tools/gitlab_org/api_wrapper.py +63 -64
- alita_sdk/tools/google/bigquery/__init__.py +13 -12
- alita_sdk/tools/google/bigquery/tool.py +5 -1
- alita_sdk/tools/google_places/__init__.py +10 -8
- alita_sdk/tools/google_places/api_wrapper.py +1 -1
- alita_sdk/tools/jira/__init__.py +17 -11
- alita_sdk/tools/jira/api_wrapper.py +91 -40
- alita_sdk/tools/keycloak/__init__.py +11 -8
- alita_sdk/tools/localgit/__init__.py +9 -3
- alita_sdk/tools/localgit/local_git.py +62 -54
- alita_sdk/tools/localgit/tool.py +5 -1
- alita_sdk/tools/memory/__init__.py +11 -3
- alita_sdk/tools/non_code_indexer_toolkit.py +1 -0
- alita_sdk/tools/ocr/__init__.py +11 -8
- alita_sdk/tools/openapi/__init__.py +490 -114
- alita_sdk/tools/openapi/api_wrapper.py +1368 -0
- alita_sdk/tools/openapi/tool.py +20 -0
- alita_sdk/tools/pandas/__init__.py +20 -12
- alita_sdk/tools/pandas/api_wrapper.py +38 -25
- alita_sdk/tools/pandas/dataframe/generator/base.py +3 -1
- alita_sdk/tools/postman/__init__.py +11 -11
- alita_sdk/tools/pptx/__init__.py +10 -9
- alita_sdk/tools/pptx/pptx_wrapper.py +1 -1
- alita_sdk/tools/qtest/__init__.py +30 -10
- alita_sdk/tools/qtest/api_wrapper.py +430 -13
- alita_sdk/tools/rally/__init__.py +10 -8
- alita_sdk/tools/rally/api_wrapper.py +1 -1
- alita_sdk/tools/report_portal/__init__.py +12 -9
- alita_sdk/tools/salesforce/__init__.py +10 -9
- alita_sdk/tools/servicenow/__init__.py +17 -14
- alita_sdk/tools/servicenow/api_wrapper.py +1 -1
- alita_sdk/tools/sharepoint/__init__.py +10 -8
- alita_sdk/tools/sharepoint/api_wrapper.py +4 -4
- alita_sdk/tools/slack/__init__.py +10 -8
- alita_sdk/tools/slack/api_wrapper.py +2 -2
- alita_sdk/tools/sql/__init__.py +11 -9
- alita_sdk/tools/testio/__init__.py +10 -8
- alita_sdk/tools/testrail/__init__.py +11 -8
- alita_sdk/tools/testrail/api_wrapper.py +1 -1
- alita_sdk/tools/utils/__init__.py +9 -4
- alita_sdk/tools/utils/content_parser.py +77 -3
- alita_sdk/tools/utils/text_operations.py +410 -0
- alita_sdk/tools/utils/tool_prompts.py +79 -0
- alita_sdk/tools/vector_adapters/VectorStoreAdapter.py +17 -13
- alita_sdk/tools/xray/__init__.py +12 -9
- alita_sdk/tools/yagmail/__init__.py +9 -3
- alita_sdk/tools/zephyr/__init__.py +9 -7
- alita_sdk/tools/zephyr_enterprise/__init__.py +11 -8
- alita_sdk/tools/zephyr_essential/__init__.py +10 -8
- alita_sdk/tools/zephyr_essential/api_wrapper.py +30 -13
- alita_sdk/tools/zephyr_essential/client.py +2 -2
- alita_sdk/tools/zephyr_scale/__init__.py +11 -9
- alita_sdk/tools/zephyr_scale/api_wrapper.py +2 -2
- alita_sdk/tools/zephyr_squad/__init__.py +10 -8
- {alita_sdk-0.3.462.dist-info → alita_sdk-0.3.627.dist-info}/METADATA +147 -7
- alita_sdk-0.3.627.dist-info/RECORD +468 -0
- alita_sdk-0.3.627.dist-info/entry_points.txt +2 -0
- alita_sdk-0.3.462.dist-info/RECORD +0 -384
- alita_sdk-0.3.462.dist-info/entry_points.txt +0 -2
- {alita_sdk-0.3.462.dist-info → alita_sdk-0.3.627.dist-info}/WHEEL +0 -0
- {alita_sdk-0.3.462.dist-info → alita_sdk-0.3.627.dist-info}/licenses/LICENSE +0 -0
- {alita_sdk-0.3.462.dist-info → alita_sdk-0.3.627.dist-info}/top_level.txt +0 -0
|
@@ -21,14 +21,16 @@ from openpyxl import load_workbook
|
|
|
21
21
|
from xlrd import open_workbook
|
|
22
22
|
from langchain_core.documents import Document
|
|
23
23
|
from .AlitaTableLoader import AlitaTableLoader
|
|
24
|
+
from alita_sdk.runtime.langchain.constants import LOADER_MAX_TOKENS_DEFAULT
|
|
24
25
|
|
|
25
26
|
cell_delimiter = " | "
|
|
26
27
|
|
|
27
28
|
class AlitaExcelLoader(AlitaTableLoader):
|
|
28
|
-
excel_by_sheets: bool = False
|
|
29
29
|
sheet_name: str = None
|
|
30
|
-
return_type: str = 'str'
|
|
31
30
|
file_name: str = None
|
|
31
|
+
max_tokens: int = LOADER_MAX_TOKENS_DEFAULT
|
|
32
|
+
add_header_to_chunks: bool = False
|
|
33
|
+
header_row_number: int = 1
|
|
32
34
|
|
|
33
35
|
def __init__(self, **kwargs):
|
|
34
36
|
if not kwargs.get('file_path'):
|
|
@@ -39,9 +41,22 @@ class AlitaExcelLoader(AlitaTableLoader):
|
|
|
39
41
|
else:
|
|
40
42
|
self.file_name = kwargs.get('file_path')
|
|
41
43
|
super().__init__(**kwargs)
|
|
42
|
-
self.excel_by_sheets = kwargs.get('excel_by_sheets')
|
|
43
|
-
self.return_type = kwargs.get('return_type')
|
|
44
44
|
self.sheet_name = kwargs.get('sheet_name')
|
|
45
|
+
# Set and validate chunking parameters only once
|
|
46
|
+
self.max_tokens = int(kwargs.get('max_tokens', LOADER_MAX_TOKENS_DEFAULT))
|
|
47
|
+
self.add_header_to_chunks = bool(kwargs.get('add_header_to_chunks', False))
|
|
48
|
+
header_row_number = kwargs.get('header_row_number', 1)
|
|
49
|
+
# Validate header_row_number
|
|
50
|
+
try:
|
|
51
|
+
header_row_number = int(header_row_number)
|
|
52
|
+
if header_row_number > 0:
|
|
53
|
+
self.header_row_number = header_row_number
|
|
54
|
+
else:
|
|
55
|
+
self.header_row_number = 1
|
|
56
|
+
self.add_header_to_chunks = False
|
|
57
|
+
except (ValueError, TypeError):
|
|
58
|
+
self.header_row_number = 1
|
|
59
|
+
self.add_header_to_chunks = False
|
|
45
60
|
|
|
46
61
|
def get_content(self):
|
|
47
62
|
try:
|
|
@@ -64,59 +79,32 @@ class AlitaExcelLoader(AlitaTableLoader):
|
|
|
64
79
|
Reads .xlsx files using openpyxl.
|
|
65
80
|
"""
|
|
66
81
|
workbook = load_workbook(self.file_path, data_only=True) # `data_only=True` ensures we get cell values, not formulas
|
|
67
|
-
|
|
82
|
+
sheets = workbook.sheetnames
|
|
68
83
|
if self.sheet_name:
|
|
69
|
-
|
|
70
|
-
if self.sheet_name in workbook.sheetnames:
|
|
84
|
+
if self.sheet_name in sheets:
|
|
71
85
|
sheet_content = self.parse_sheet(workbook[self.sheet_name])
|
|
72
|
-
return sheet_content
|
|
73
86
|
else:
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
# Parse each sheet individually and return as a dictionary
|
|
77
|
-
result = {}
|
|
78
|
-
for sheet_name in workbook.sheetnames:
|
|
79
|
-
sheet_content = self.parse_sheet(workbook[sheet_name])
|
|
80
|
-
result[sheet_name] = sheet_content
|
|
81
|
-
return result
|
|
87
|
+
sheet_content = [f"Sheet '{self.sheet_name}' does not exist in the workbook."]
|
|
88
|
+
return {self.sheet_name: sheet_content}
|
|
82
89
|
else:
|
|
83
|
-
#
|
|
84
|
-
|
|
85
|
-
for sheet_name in workbook.sheetnames:
|
|
86
|
-
sheet_content = self.parse_sheet(workbook[sheet_name])
|
|
87
|
-
result.append(f"====== Sheet name: {sheet_name} ======\n{sheet_content}")
|
|
88
|
-
return "\n\n".join(result)
|
|
90
|
+
# Dictionary comprehension for all sheets
|
|
91
|
+
return {name: self.parse_sheet(workbook[name]) for name in sheets}
|
|
89
92
|
|
|
90
93
|
def _read_xls(self):
|
|
91
94
|
"""
|
|
92
95
|
Reads .xls files using xlrd.
|
|
93
96
|
"""
|
|
94
97
|
workbook = open_workbook(filename=self.file_name, file_contents=self.file_content)
|
|
95
|
-
|
|
98
|
+
sheets = workbook.sheet_names()
|
|
96
99
|
if self.sheet_name:
|
|
97
|
-
|
|
98
|
-
if self.sheet_name in workbook.sheet_names():
|
|
100
|
+
if self.sheet_name in sheets:
|
|
99
101
|
sheet = workbook.sheet_by_name(self.sheet_name)
|
|
100
|
-
|
|
101
|
-
return sheet_content
|
|
102
|
+
return {self.sheet_name: self.parse_sheet_xls(sheet)}
|
|
102
103
|
else:
|
|
103
|
-
|
|
104
|
-
elif self.excel_by_sheets:
|
|
105
|
-
# Parse each sheet individually and return as a dictionary
|
|
106
|
-
result = {}
|
|
107
|
-
for sheet_name in workbook.sheet_names():
|
|
108
|
-
sheet = workbook.sheet_by_name(sheet_name)
|
|
109
|
-
sheet_content = self.parse_sheet_xls(sheet)
|
|
110
|
-
result[sheet_name] = sheet_content
|
|
111
|
-
return result
|
|
104
|
+
return {self.sheet_name: [f"Sheet '{self.sheet_name}' does not exist in the workbook."]}
|
|
112
105
|
else:
|
|
113
|
-
#
|
|
114
|
-
|
|
115
|
-
for sheet_name in workbook.sheet_names():
|
|
116
|
-
sheet = workbook.sheet_by_name(sheet_name)
|
|
117
|
-
sheet_content = self.parse_sheet_xls(sheet)
|
|
118
|
-
result.append(f"====== Sheet name: {sheet_name} ======\n{sheet_content}")
|
|
119
|
-
return "\n\n".join(result)
|
|
106
|
+
# Dictionary comprehension for all sheets
|
|
107
|
+
return {name: self.parse_sheet_xls(workbook.sheet_by_name(name)) for name in sheets}
|
|
120
108
|
|
|
121
109
|
def parse_sheet(self, sheet):
|
|
122
110
|
"""
|
|
@@ -170,34 +158,89 @@ class AlitaExcelLoader(AlitaTableLoader):
|
|
|
170
158
|
# Format the sheet content based on the return type
|
|
171
159
|
return self._format_sheet_content(sheet_content)
|
|
172
160
|
|
|
173
|
-
def _format_sheet_content(self,
|
|
161
|
+
def _format_sheet_content(self, rows):
|
|
174
162
|
"""
|
|
175
|
-
|
|
163
|
+
Specification:
|
|
164
|
+
Formats a list of sheet rows into a list of string chunks according to the following rules:
|
|
165
|
+
1. If max_tokens < 1, returns a single chunk (list of one string) with all rows joined by a newline ('\n').
|
|
166
|
+
- If add_header_to_chunks is True and header_row_number is valid, the specified header row is prepended as the first line.
|
|
167
|
+
2. If max_tokens >= 1:
|
|
168
|
+
a. Each chunk is a string containing one or more rows, separated by newlines ('\n'), such that the total token count (as measured by tiktoken) does not exceed max_tokens.
|
|
169
|
+
b. If add_header_to_chunks is True and header_row_number is valid, the specified header row is prepended once at the top of each chunk (not before every row).
|
|
170
|
+
c. If a single row exceeds max_tokens, it is placed in its own chunk without splitting, with the header prepended if applicable.
|
|
171
|
+
3. Returns: List[str], where each string is a chunk ready for further processing.
|
|
176
172
|
"""
|
|
177
|
-
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
|
|
183
|
-
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
|
|
187
|
-
|
|
173
|
+
import tiktoken
|
|
174
|
+
encoding = tiktoken.get_encoding('cl100k_base')
|
|
175
|
+
|
|
176
|
+
# --- Inner functions ---
|
|
177
|
+
def count_tokens(text):
|
|
178
|
+
"""Count tokens in text using tiktoken encoding."""
|
|
179
|
+
return len(encoding.encode(text))
|
|
180
|
+
|
|
181
|
+
def finalize_chunk(chunk_rows):
|
|
182
|
+
"""Join rows for a chunk, prepending header if needed."""
|
|
183
|
+
if self.add_header_to_chunks and header:
|
|
184
|
+
return '\n'.join([header] + chunk_rows)
|
|
185
|
+
else:
|
|
186
|
+
return '\n'.join(chunk_rows)
|
|
187
|
+
# --- End inner functions ---
|
|
188
|
+
|
|
189
|
+
# If max_tokens < 1, return all rows as a single chunk
|
|
190
|
+
if self.max_tokens < 1:
|
|
191
|
+
return ['\n'.join(rows)]
|
|
192
|
+
|
|
193
|
+
# Extract header if needed
|
|
194
|
+
header = None
|
|
195
|
+
if self.add_header_to_chunks and rows:
|
|
196
|
+
header_idx = self.header_row_number - 1
|
|
197
|
+
header = rows.pop(header_idx)
|
|
198
|
+
|
|
199
|
+
chunks = [] # List to store final chunks
|
|
200
|
+
current_chunk = [] # Accumulate rows for the current chunk
|
|
201
|
+
current_tokens = 0 # Token count for the current chunk
|
|
202
|
+
|
|
203
|
+
for row in rows:
|
|
204
|
+
row_tokens = count_tokens(row)
|
|
205
|
+
# If row itself exceeds max_tokens, flush current chunk and add row as its own chunk (with header if needed)
|
|
206
|
+
if row_tokens > self.max_tokens:
|
|
207
|
+
if current_chunk:
|
|
208
|
+
chunks.append(finalize_chunk(current_chunk))
|
|
209
|
+
current_chunk = []
|
|
210
|
+
current_tokens = 0
|
|
211
|
+
# Add the large row as its own chunk, with header if needed
|
|
212
|
+
if self.add_header_to_chunks and header:
|
|
213
|
+
chunks.append(finalize_chunk([row]))
|
|
214
|
+
else:
|
|
215
|
+
chunks.append(row)
|
|
216
|
+
continue
|
|
217
|
+
# If adding row would exceed max_tokens, flush current chunk and start new
|
|
218
|
+
if current_tokens + row_tokens > self.max_tokens:
|
|
219
|
+
if current_chunk:
|
|
220
|
+
chunks.append(finalize_chunk(current_chunk))
|
|
221
|
+
current_chunk = [row]
|
|
222
|
+
current_tokens = row_tokens
|
|
223
|
+
else:
|
|
224
|
+
current_chunk.append(row)
|
|
225
|
+
current_tokens += row_tokens
|
|
226
|
+
# Add any remaining rows as the last chunk
|
|
227
|
+
if current_chunk:
|
|
228
|
+
chunks.append(finalize_chunk(current_chunk))
|
|
229
|
+
return chunks
|
|
188
230
|
|
|
189
231
|
def load(self) -> list:
|
|
190
232
|
docs = []
|
|
191
233
|
content_per_sheet = self.get_content()
|
|
192
|
-
|
|
234
|
+
# content_per_sheet is a dict of sheet_name: list of chunk strings
|
|
235
|
+
for sheet_name, content_chunks in content_per_sheet.items():
|
|
193
236
|
metadata = {
|
|
194
237
|
"source": f'{self.file_path}:{sheet_name}',
|
|
195
238
|
"sheet_name": sheet_name,
|
|
196
239
|
"file_type": "excel",
|
|
197
|
-
"excel_by_sheets": self.excel_by_sheets,
|
|
198
|
-
"return_type": self.return_type,
|
|
199
240
|
}
|
|
200
|
-
|
|
241
|
+
# Each chunk is a separate Document
|
|
242
|
+
for chunk in content_chunks:
|
|
243
|
+
docs.append(Document(page_content=chunk, metadata=metadata))
|
|
201
244
|
return docs
|
|
202
245
|
|
|
203
246
|
def read(self, lazy: bool = False):
|
|
@@ -0,0 +1,77 @@
|
|
|
1
|
+
from .AlitaJSONLoader import AlitaJSONLoader
|
|
2
|
+
import json
|
|
3
|
+
from io import StringIO
|
|
4
|
+
from typing import List, Iterator
|
|
5
|
+
|
|
6
|
+
from langchain_core.documents import Document
|
|
7
|
+
from langchain_core.tools import ToolException
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class AlitaJSONLinesLoader(AlitaJSONLoader):
|
|
11
|
+
"""Load local JSONL files (one JSON object per line) using AlitaJSONLoader behavior.
|
|
12
|
+
|
|
13
|
+
Behavior:
|
|
14
|
+
- Supports both `file_path` and `file_content` (bytes or file-like object), same as AlitaJSONLoader.
|
|
15
|
+
- Treats each non-empty line as an independent JSON object.
|
|
16
|
+
- Aggregates all parsed JSON objects into a list and feeds them through the same
|
|
17
|
+
RecursiveJsonSplitter-based chunking used by AlitaJSONLoader.lazy_load.
|
|
18
|
+
- Returns a list of Documents with chunked JSON content.
|
|
19
|
+
"""
|
|
20
|
+
|
|
21
|
+
def __init__(self, **kwargs):
|
|
22
|
+
# Reuse AlitaJSONLoader initialization logic (file_path / file_content handling, encoding, etc.)
|
|
23
|
+
super().__init__(**kwargs)
|
|
24
|
+
|
|
25
|
+
def _iter_lines(self) -> Iterator[str]:
|
|
26
|
+
"""Yield lines from file_path or file_content, mirroring AlitaJSONLoader sources."""
|
|
27
|
+
# Prefer file_path if available
|
|
28
|
+
if hasattr(self, "file_path") and self.file_path:
|
|
29
|
+
with open(self.file_path, "r", encoding=self.encoding) as f:
|
|
30
|
+
for line in f:
|
|
31
|
+
yield line
|
|
32
|
+
# Fallback to file_content if available
|
|
33
|
+
elif hasattr(self, "file_content") and self.file_content:
|
|
34
|
+
# file_content may be bytes or a file-like object
|
|
35
|
+
if isinstance(self.file_content, (bytes, bytearray)):
|
|
36
|
+
text = self.file_content.decode(self.encoding)
|
|
37
|
+
for line in StringIO(text):
|
|
38
|
+
yield line
|
|
39
|
+
else:
|
|
40
|
+
# Assume it's a text file-like object positioned at the beginning
|
|
41
|
+
self.file_content.seek(0)
|
|
42
|
+
for line in self.file_content:
|
|
43
|
+
yield line
|
|
44
|
+
else:
|
|
45
|
+
raise ToolException("'file_path' or 'file_content' parameter should be provided.")
|
|
46
|
+
|
|
47
|
+
def load(self) -> List[Document]: # type: ignore[override]
|
|
48
|
+
"""Load JSONL content by delegating each non-empty line to AlitaJSONLoader.
|
|
49
|
+
|
|
50
|
+
For each non-empty line in the underlying source (file_path or file_content):
|
|
51
|
+
- Create a temporary AlitaJSONLoader instance with that line as file_content.
|
|
52
|
+
- Call lazy_load() on that instance to apply the same RecursiveJsonSplitter logic
|
|
53
|
+
as for a normal JSON file.
|
|
54
|
+
- Accumulate all Documents from all lines and return them as a single list.
|
|
55
|
+
"""
|
|
56
|
+
docs: List[Document] = []
|
|
57
|
+
|
|
58
|
+
for raw_line in self._iter_lines():
|
|
59
|
+
line = raw_line.strip()
|
|
60
|
+
if not line:
|
|
61
|
+
continue
|
|
62
|
+
try:
|
|
63
|
+
# Instantiate a per-line AlitaJSONLoader using the same configuration
|
|
64
|
+
line_loader = AlitaJSONLoader(
|
|
65
|
+
file_content=line,
|
|
66
|
+
file_name=getattr(self, "file_name", str(getattr(self, "file_path", "no_name"))),
|
|
67
|
+
encoding=self.encoding,
|
|
68
|
+
autodetect_encoding=self.autodetect_encoding,
|
|
69
|
+
max_tokens=self.max_tokens,
|
|
70
|
+
)
|
|
71
|
+
|
|
72
|
+
for doc in line_loader.lazy_load():
|
|
73
|
+
docs.append(doc)
|
|
74
|
+
except Exception as e:
|
|
75
|
+
raise ToolException(f"Error processing JSONL line: {line[:100]}... Error: {e}") from e
|
|
76
|
+
|
|
77
|
+
return docs
|
|
@@ -32,6 +32,8 @@ class AlitaJSONLoader(BaseLoader):
|
|
|
32
32
|
elif hasattr(self, 'file_content') and self.file_content:
|
|
33
33
|
if isinstance(self.file_content, bytes):
|
|
34
34
|
return json.loads(self.file_content.decode(self.encoding))
|
|
35
|
+
elif isinstance(self.file_content, str):
|
|
36
|
+
return json.loads(self.file_content)
|
|
35
37
|
else:
|
|
36
38
|
return json.load(self.file_content)
|
|
37
39
|
else:
|
|
@@ -45,7 +47,6 @@ class AlitaJSONLoader(BaseLoader):
|
|
|
45
47
|
try:
|
|
46
48
|
with open(self.file_path, encoding=encoding.encoding) as f:
|
|
47
49
|
return f.read()
|
|
48
|
-
break
|
|
49
50
|
except UnicodeDecodeError:
|
|
50
51
|
continue
|
|
51
52
|
elif hasattr(self, 'file_content') and self.file_content:
|
|
@@ -58,9 +59,11 @@ class AlitaJSONLoader(BaseLoader):
|
|
|
58
59
|
else:
|
|
59
60
|
raise ValueError("Neither file_path nor file_content is provided for encoding detection.")
|
|
60
61
|
else:
|
|
61
|
-
raise RuntimeError(f"Error loading content with encoding {self.encoding}
|
|
62
|
+
raise RuntimeError(f"Error loading content with encoding {self.encoding}: {e}") from e
|
|
62
63
|
except Exception as e:
|
|
63
|
-
|
|
64
|
+
# Preserve original error details so callers (e.g., parse_file_content)
|
|
65
|
+
# can expose the real root cause instead of a generic message.
|
|
66
|
+
raise RuntimeError(f"Error loading content: {e}") from e
|
|
64
67
|
|
|
65
68
|
def lazy_load(self) -> Iterator[Document]:
|
|
66
69
|
"""Load from file path."""
|
|
@@ -6,6 +6,7 @@ from .utils import perform_llm_prediction_for_image_bytes, create_temp_file
|
|
|
6
6
|
from pptx.enum.shapes import MSO_SHAPE_TYPE
|
|
7
7
|
from langchain_core.documents import Document
|
|
8
8
|
|
|
9
|
+
|
|
9
10
|
class AlitaPowerPointLoader:
|
|
10
11
|
|
|
11
12
|
def __init__(self, file_path=None, file_content=None, mode=None, **unstructured_kwargs):
|
|
@@ -43,10 +44,203 @@ class AlitaPowerPointLoader:
|
|
|
43
44
|
else:
|
|
44
45
|
raise ToolException(f"Unknown mode value: {self.mode}. Only 'single', 'paged' values allowed.")
|
|
45
46
|
|
|
47
|
+
def _extract_table_as_markdown(self, table) -> str:
|
|
48
|
+
"""Convert PPTX table to markdown format."""
|
|
49
|
+
if not table.rows:
|
|
50
|
+
return ""
|
|
51
|
+
|
|
52
|
+
rows = []
|
|
53
|
+
for row in table.rows:
|
|
54
|
+
cells = []
|
|
55
|
+
for cell in row.cells:
|
|
56
|
+
cell_text = cell.text.strip().replace("|", "\\|").replace("\n", " ")
|
|
57
|
+
cells.append(cell_text)
|
|
58
|
+
rows.append("| " + " | ".join(cells) + " |")
|
|
59
|
+
|
|
60
|
+
if len(rows) > 0:
|
|
61
|
+
# Add header separator after first row
|
|
62
|
+
num_cols = len(table.rows[0].cells)
|
|
63
|
+
header_sep = "| " + " | ".join(["---"] * num_cols) + " |"
|
|
64
|
+
rows.insert(1, header_sep)
|
|
65
|
+
|
|
66
|
+
return "\n**Table:**\n" + "\n".join(rows) + "\n"
|
|
67
|
+
|
|
68
|
+
def _extract_chart_info(self, chart) -> str:
|
|
69
|
+
"""Extract data and labels from PPTX chart."""
|
|
70
|
+
result = []
|
|
71
|
+
|
|
72
|
+
# Extract chart title
|
|
73
|
+
try:
|
|
74
|
+
if chart.has_title and chart.chart_title.has_text_frame:
|
|
75
|
+
title_text = chart.chart_title.text_frame.text.strip()
|
|
76
|
+
if title_text:
|
|
77
|
+
result.append(f"Chart Title: {title_text}")
|
|
78
|
+
except Exception:
|
|
79
|
+
pass
|
|
80
|
+
|
|
81
|
+
# Try to extract series data directly from chart.series (works for some chart types)
|
|
82
|
+
try:
|
|
83
|
+
if hasattr(chart, 'series') and chart.series:
|
|
84
|
+
for series in chart.series:
|
|
85
|
+
series_name = series.name if series.name else "Unnamed Series"
|
|
86
|
+
values = []
|
|
87
|
+
categories = []
|
|
88
|
+
|
|
89
|
+
# Try to get values
|
|
90
|
+
try:
|
|
91
|
+
if hasattr(series, 'values') and series.values:
|
|
92
|
+
values = list(series.values)
|
|
93
|
+
except Exception:
|
|
94
|
+
pass
|
|
95
|
+
|
|
96
|
+
# Try to get categories from series
|
|
97
|
+
try:
|
|
98
|
+
if hasattr(series, 'categories') and series.categories:
|
|
99
|
+
categories = list(series.categories)
|
|
100
|
+
except Exception:
|
|
101
|
+
pass
|
|
102
|
+
|
|
103
|
+
# Build output
|
|
104
|
+
if categories and values and len(categories) == len(values):
|
|
105
|
+
data_pairs = [f"{cat}: {val}" for cat, val in zip(categories, values)]
|
|
106
|
+
result.append(f"Series '{series_name}': {', '.join(data_pairs)}")
|
|
107
|
+
elif values:
|
|
108
|
+
result.append(f"Series '{series_name}': {', '.join(str(v) for v in values)}")
|
|
109
|
+
elif categories:
|
|
110
|
+
result.append(f"Series '{series_name}' categories: {', '.join(str(c) for c in categories)}")
|
|
111
|
+
except Exception:
|
|
112
|
+
pass
|
|
113
|
+
|
|
114
|
+
# Fallback: try plots API for bar/line charts
|
|
115
|
+
if not result or (len(result) == 1 and "Chart Title" in result[0]):
|
|
116
|
+
try:
|
|
117
|
+
if hasattr(chart, 'plots') and chart.plots and len(chart.plots) > 0:
|
|
118
|
+
plot = chart.plots[0]
|
|
119
|
+
categories = []
|
|
120
|
+
if hasattr(plot, 'categories') and plot.categories:
|
|
121
|
+
categories = list(plot.categories)
|
|
122
|
+
if categories:
|
|
123
|
+
result.append(f"Categories: {', '.join(str(c) for c in categories)}")
|
|
124
|
+
|
|
125
|
+
# Extract series data from plot
|
|
126
|
+
for series in plot.series:
|
|
127
|
+
series_name = series.name if series.name else "Unnamed Series"
|
|
128
|
+
values = list(series.values) if series.values else []
|
|
129
|
+
|
|
130
|
+
if categories and len(categories) == len(values):
|
|
131
|
+
data_pairs = [f"{cat}: {val}" for cat, val in zip(categories, values)]
|
|
132
|
+
result.append(f"Series '{series_name}': {', '.join(data_pairs)}")
|
|
133
|
+
elif values:
|
|
134
|
+
result.append(f"Series '{series_name}': {', '.join(str(v) for v in values)}")
|
|
135
|
+
except Exception:
|
|
136
|
+
pass
|
|
137
|
+
|
|
138
|
+
# Final fallback: parse XML directly for unsupported chart types (e.g., pie3DChart)
|
|
139
|
+
if not result or (len(result) == 1 and "Chart Title" in result[0]):
|
|
140
|
+
try:
|
|
141
|
+
result.extend(self._extract_chart_from_xml(chart))
|
|
142
|
+
except Exception:
|
|
143
|
+
pass
|
|
144
|
+
|
|
145
|
+
# If we still have no data, add a note
|
|
146
|
+
if not result:
|
|
147
|
+
result.append("(Chart detected - there is no parsed data from this type of chart)")
|
|
148
|
+
|
|
149
|
+
return "\n**Chart:**\n" + "\n".join(result) + "\n"
|
|
150
|
+
|
|
151
|
+
def _extract_chart_from_xml(self, chart) -> list:
|
|
152
|
+
"""Extract chart data by parsing the underlying XML directly."""
|
|
153
|
+
result = []
|
|
154
|
+
|
|
155
|
+
# Get the chart part XML
|
|
156
|
+
chart_part = chart.part
|
|
157
|
+
chart_element = chart_part.element
|
|
158
|
+
|
|
159
|
+
# Define namespaces used in chart XML
|
|
160
|
+
namespaces = {
|
|
161
|
+
'c': 'http://schemas.openxmlformats.org/drawingml/2006/chart',
|
|
162
|
+
'a': 'http://schemas.openxmlformats.org/drawingml/2006/main',
|
|
163
|
+
}
|
|
164
|
+
|
|
165
|
+
# Find all series (ser) elements
|
|
166
|
+
series_elements = chart_element.findall('.//c:ser', namespaces)
|
|
167
|
+
|
|
168
|
+
for ser in series_elements:
|
|
169
|
+
series_name = "Unnamed Series"
|
|
170
|
+
categories = []
|
|
171
|
+
values = []
|
|
172
|
+
|
|
173
|
+
# Extract series name from tx/v or tx/strRef
|
|
174
|
+
tx = ser.find('.//c:tx', namespaces)
|
|
175
|
+
if tx is not None:
|
|
176
|
+
v = tx.find('.//c:v', namespaces)
|
|
177
|
+
if v is not None and v.text:
|
|
178
|
+
series_name = v.text
|
|
179
|
+
|
|
180
|
+
# Extract category labels from c:cat
|
|
181
|
+
cat = ser.find('.//c:cat', namespaces)
|
|
182
|
+
if cat is not None:
|
|
183
|
+
# Try strRef first (string references)
|
|
184
|
+
str_cache = cat.find('.//c:strCache', namespaces)
|
|
185
|
+
if str_cache is not None:
|
|
186
|
+
for pt in str_cache.findall('.//c:pt', namespaces):
|
|
187
|
+
v = pt.find('c:v', namespaces)
|
|
188
|
+
if v is not None and v.text:
|
|
189
|
+
categories.append(v.text)
|
|
190
|
+
|
|
191
|
+
# Try numRef (numeric references used as categories)
|
|
192
|
+
if not categories:
|
|
193
|
+
num_cache = cat.find('.//c:numCache', namespaces)
|
|
194
|
+
if num_cache is not None:
|
|
195
|
+
for pt in num_cache.findall('.//c:pt', namespaces):
|
|
196
|
+
v = pt.find('c:v', namespaces)
|
|
197
|
+
if v is not None and v.text:
|
|
198
|
+
categories.append(v.text)
|
|
199
|
+
|
|
200
|
+
# Extract values from c:val
|
|
201
|
+
val = ser.find('.//c:val', namespaces)
|
|
202
|
+
if val is not None:
|
|
203
|
+
num_cache = val.find('.//c:numCache', namespaces)
|
|
204
|
+
if num_cache is not None:
|
|
205
|
+
for pt in num_cache.findall('.//c:pt', namespaces):
|
|
206
|
+
v = pt.find('c:v', namespaces)
|
|
207
|
+
if v is not None and v.text:
|
|
208
|
+
try:
|
|
209
|
+
values.append(float(v.text))
|
|
210
|
+
except ValueError:
|
|
211
|
+
values.append(v.text)
|
|
212
|
+
|
|
213
|
+
# Build output
|
|
214
|
+
if categories and values and len(categories) == len(values):
|
|
215
|
+
data_pairs = [f"{cat}: {val}" for cat, val in zip(categories, values)]
|
|
216
|
+
result.append(f"Series '{series_name}': {', '.join(data_pairs)}")
|
|
217
|
+
elif values:
|
|
218
|
+
result.append(f"Series '{series_name}': {', '.join(str(v) for v in values)}")
|
|
219
|
+
elif categories:
|
|
220
|
+
result.append(f"Series '{series_name}' categories: {', '.join(str(c) for c in categories)}")
|
|
221
|
+
|
|
222
|
+
return result
|
|
223
|
+
|
|
46
224
|
def read_pptx_slide(self, slide, index):
|
|
47
225
|
text_content = f'Slide: {index}\n'
|
|
48
226
|
for shape in slide.shapes:
|
|
49
|
-
|
|
227
|
+
# Handle tables
|
|
228
|
+
if shape.has_table:
|
|
229
|
+
text_content += self._extract_table_as_markdown(shape.table)
|
|
230
|
+
# Handle charts
|
|
231
|
+
elif shape.has_chart:
|
|
232
|
+
text_content += self._extract_chart_info(shape.chart)
|
|
233
|
+
# Handle images - check multiple ways images can be embedded
|
|
234
|
+
elif self.extract_images and self._is_image_shape(shape):
|
|
235
|
+
try:
|
|
236
|
+
image_blob = self._get_image_blob(shape)
|
|
237
|
+
if image_blob:
|
|
238
|
+
caption = perform_llm_prediction_for_image_bytes(image_blob, self.llm, self.prompt)
|
|
239
|
+
text_content += "\n**Image Transcript:**\n" + caption + "\n--------------------\n"
|
|
240
|
+
except Exception:
|
|
241
|
+
pass
|
|
242
|
+
# Handle text frames with hyperlinks
|
|
243
|
+
elif hasattr(shape, "text_frame") and shape.text_frame is not None:
|
|
50
244
|
for paragraph in shape.text_frame.paragraphs:
|
|
51
245
|
for run in paragraph.runs:
|
|
52
246
|
if run.hyperlink and run.hyperlink.address:
|
|
@@ -56,14 +250,39 @@ class AlitaPowerPointLoader:
|
|
|
56
250
|
else:
|
|
57
251
|
text_content += run.text
|
|
58
252
|
text_content += "\n"
|
|
59
|
-
elif self.extract_images and shape.shape_type == MSO_SHAPE_TYPE.PICTURE:
|
|
60
|
-
try:
|
|
61
|
-
caption = perform_llm_prediction_for_image_bytes(shape.image.blob, self.llm, self.prompt)
|
|
62
|
-
except:
|
|
63
|
-
caption = "unknown"
|
|
64
|
-
text_content += "\n**Image Transcript:**\n" + caption + "\n--------------------\n"
|
|
65
253
|
return text_content + "\n"
|
|
66
254
|
|
|
255
|
+
def _is_image_shape(self, shape) -> bool:
    """Check if shape contains an image using multiple detection methods.

    Returns True when any of the probes below indicates an embedded image;
    False otherwise. Probing is deliberately defensive: accessing
    ``shape.image`` on a shape without an image part may raise (python-pptx
    presumably raises ValueError rather than AttributeError here — verify
    against the library), and ``hasattr`` only suppresses AttributeError,
    so each probe is fully wrapped in its own try/except.
    """
    # Method 1: explicit picture shape type.
    if shape.shape_type == MSO_SHAPE_TYPE.PICTURE:
        return True
    # Method 2: shape exposes an image attribute with a non-empty blob.
    # The attribute access itself sits inside the try, because hasattr()
    # would let non-AttributeError exceptions from the property escape.
    try:
        if hasattr(shape, 'image') and shape.image is not None:
            if shape.image.blob:
                return True
    except Exception:
        pass
    # Method 3: placeholder shape that carries an image.
    try:
        if hasattr(shape, 'placeholder_format') and shape.placeholder_format is not None:
            if hasattr(shape, 'image') and shape.image is not None:
                return True
    except Exception:
        pass
    return False
|
|
275
|
+
|
|
276
|
+
def _get_image_blob(self, shape) -> bytes:
|
|
277
|
+
"""Extract image blob from shape using available methods."""
|
|
278
|
+
# Try direct image access
|
|
279
|
+
if hasattr(shape, 'image') and shape.image is not None:
|
|
280
|
+
try:
|
|
281
|
+
return shape.image.blob
|
|
282
|
+
except Exception:
|
|
283
|
+
pass
|
|
284
|
+
return None
|
|
285
|
+
|
|
67
286
|
def load(self):
|
|
68
287
|
content = self.get_content()
|
|
69
288
|
if isinstance(content, str):
|
|
@@ -58,9 +58,12 @@ class AlitaTextLoader(BaseLoader):
|
|
|
58
58
|
else:
|
|
59
59
|
raise ValueError("Neither file_path nor file_content is provided for encoding detection.")
|
|
60
60
|
else:
|
|
61
|
-
|
|
61
|
+
# Preserve original error details for callers
|
|
62
|
+
raise RuntimeError(f"Error loading content with encoding {self.encoding}: {e}") from e
|
|
62
63
|
except Exception as e:
|
|
63
|
-
|
|
64
|
+
# Preserve original error details so higher-level code (e.g., parse_file_content)
|
|
65
|
+
# can expose the real root cause instead of a generic message.
|
|
66
|
+
raise RuntimeError(f"Error loading content: {e}") from e
|
|
64
67
|
|
|
65
68
|
return text
|
|
66
69
|
|
|
@@ -21,12 +21,14 @@ from .AlitaDocxMammothLoader import AlitaDocxMammothLoader
|
|
|
21
21
|
from .AlitaExcelLoader import AlitaExcelLoader
|
|
22
22
|
from .AlitaImageLoader import AlitaImageLoader
|
|
23
23
|
from .AlitaJSONLoader import AlitaJSONLoader
|
|
24
|
+
from .AlitaJSONLinesLoader import AlitaJSONLinesLoader
|
|
24
25
|
from .AlitaPDFLoader import AlitaPDFLoader
|
|
25
26
|
from .AlitaPowerPointLoader import AlitaPowerPointLoader
|
|
26
27
|
from .AlitaTextLoader import AlitaTextLoader
|
|
27
28
|
from .AlitaMarkdownLoader import AlitaMarkdownLoader
|
|
28
29
|
from .AlitaPythonLoader import AlitaPythonLoader
|
|
29
30
|
from enum import Enum
|
|
31
|
+
from alita_sdk.runtime.langchain.constants import LOADER_MAX_TOKENS_DEFAULT
|
|
30
32
|
|
|
31
33
|
|
|
32
34
|
class LoaderProperties(Enum):
|
|
@@ -34,7 +36,7 @@ class LoaderProperties(Enum):
|
|
|
34
36
|
PROMPT_DEFAULT = 'use_default_prompt'
|
|
35
37
|
PROMPT = 'prompt'
|
|
36
38
|
|
|
37
|
-
DEFAULT_ALLOWED_BASE = {'max_tokens':
|
|
39
|
+
DEFAULT_ALLOWED_BASE = {'max_tokens': LOADER_MAX_TOKENS_DEFAULT}
|
|
38
40
|
|
|
39
41
|
DEFAULT_ALLOWED_WITH_LLM = {
|
|
40
42
|
**DEFAULT_ALLOWED_BASE,
|
|
@@ -43,6 +45,8 @@ DEFAULT_ALLOWED_WITH_LLM = {
|
|
|
43
45
|
LoaderProperties.PROMPT.value: "",
|
|
44
46
|
}
|
|
45
47
|
|
|
48
|
+
DEFAULT_ALLOWED_EXCEL = {**DEFAULT_ALLOWED_WITH_LLM, 'add_header_to_chunks': False, 'header_row_number': 1, 'max_tokens': -1, 'sheet_name': ''}
|
|
49
|
+
|
|
46
50
|
# Image file loaders mapping - directly supported by LLM with image_url
|
|
47
51
|
image_loaders_map = {
|
|
48
52
|
'.png': {
|
|
@@ -162,11 +166,12 @@ document_loaders_map = {
|
|
|
162
166
|
'spreadsheetml.sheet'),
|
|
163
167
|
'is_multimodal_processing': False,
|
|
164
168
|
'kwargs': {
|
|
165
|
-
'
|
|
166
|
-
'
|
|
167
|
-
'
|
|
169
|
+
'add_header_to_chunks': False,
|
|
170
|
+
'header_row_number': 1,
|
|
171
|
+
'max_tokens': -1,
|
|
172
|
+
'sheet_name': ''
|
|
168
173
|
},
|
|
169
|
-
'allowed_to_override':
|
|
174
|
+
'allowed_to_override': DEFAULT_ALLOWED_EXCEL
|
|
170
175
|
},
|
|
171
176
|
'.xls': {
|
|
172
177
|
'class': AlitaExcelLoader,
|
|
@@ -177,7 +182,7 @@ document_loaders_map = {
|
|
|
177
182
|
'raw_content': True,
|
|
178
183
|
'cleanse': False
|
|
179
184
|
},
|
|
180
|
-
'allowed_to_override':
|
|
185
|
+
'allowed_to_override': DEFAULT_ALLOWED_EXCEL
|
|
181
186
|
},
|
|
182
187
|
'.pdf': {
|
|
183
188
|
'class': AlitaPDFLoader,
|
|
@@ -204,7 +209,7 @@ document_loaders_map = {
|
|
|
204
209
|
'allowed_to_override': DEFAULT_ALLOWED_BASE
|
|
205
210
|
},
|
|
206
211
|
'.jsonl': {
|
|
207
|
-
'class':
|
|
212
|
+
'class': AlitaJSONLinesLoader,
|
|
208
213
|
'mime_type': 'application/jsonl',
|
|
209
214
|
'is_multimodal_processing': False,
|
|
210
215
|
'kwargs': {},
|