alita-sdk 0.3.257__py3-none-any.whl → 0.3.562__py3-none-any.whl
This diff shows the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their public registries.
- alita_sdk/cli/__init__.py +10 -0
- alita_sdk/cli/__main__.py +17 -0
- alita_sdk/cli/agent/__init__.py +5 -0
- alita_sdk/cli/agent/default.py +258 -0
- alita_sdk/cli/agent_executor.py +155 -0
- alita_sdk/cli/agent_loader.py +215 -0
- alita_sdk/cli/agent_ui.py +228 -0
- alita_sdk/cli/agents.py +3601 -0
- alita_sdk/cli/callbacks.py +647 -0
- alita_sdk/cli/cli.py +168 -0
- alita_sdk/cli/config.py +306 -0
- alita_sdk/cli/context/__init__.py +30 -0
- alita_sdk/cli/context/cleanup.py +198 -0
- alita_sdk/cli/context/manager.py +731 -0
- alita_sdk/cli/context/message.py +285 -0
- alita_sdk/cli/context/strategies.py +289 -0
- alita_sdk/cli/context/token_estimation.py +127 -0
- alita_sdk/cli/formatting.py +182 -0
- alita_sdk/cli/input_handler.py +419 -0
- alita_sdk/cli/inventory.py +1073 -0
- alita_sdk/cli/mcp_loader.py +315 -0
- alita_sdk/cli/toolkit.py +327 -0
- alita_sdk/cli/toolkit_loader.py +85 -0
- alita_sdk/cli/tools/__init__.py +43 -0
- alita_sdk/cli/tools/approval.py +224 -0
- alita_sdk/cli/tools/filesystem.py +1751 -0
- alita_sdk/cli/tools/planning.py +389 -0
- alita_sdk/cli/tools/terminal.py +414 -0
- alita_sdk/community/__init__.py +72 -12
- alita_sdk/community/inventory/__init__.py +236 -0
- alita_sdk/community/inventory/config.py +257 -0
- alita_sdk/community/inventory/enrichment.py +2137 -0
- alita_sdk/community/inventory/extractors.py +1469 -0
- alita_sdk/community/inventory/ingestion.py +3172 -0
- alita_sdk/community/inventory/knowledge_graph.py +1457 -0
- alita_sdk/community/inventory/parsers/__init__.py +218 -0
- alita_sdk/community/inventory/parsers/base.py +295 -0
- alita_sdk/community/inventory/parsers/csharp_parser.py +907 -0
- alita_sdk/community/inventory/parsers/go_parser.py +851 -0
- alita_sdk/community/inventory/parsers/html_parser.py +389 -0
- alita_sdk/community/inventory/parsers/java_parser.py +593 -0
- alita_sdk/community/inventory/parsers/javascript_parser.py +629 -0
- alita_sdk/community/inventory/parsers/kotlin_parser.py +768 -0
- alita_sdk/community/inventory/parsers/markdown_parser.py +362 -0
- alita_sdk/community/inventory/parsers/python_parser.py +604 -0
- alita_sdk/community/inventory/parsers/rust_parser.py +858 -0
- alita_sdk/community/inventory/parsers/swift_parser.py +832 -0
- alita_sdk/community/inventory/parsers/text_parser.py +322 -0
- alita_sdk/community/inventory/parsers/yaml_parser.py +370 -0
- alita_sdk/community/inventory/patterns/__init__.py +61 -0
- alita_sdk/community/inventory/patterns/ast_adapter.py +380 -0
- alita_sdk/community/inventory/patterns/loader.py +348 -0
- alita_sdk/community/inventory/patterns/registry.py +198 -0
- alita_sdk/community/inventory/presets.py +535 -0
- alita_sdk/community/inventory/retrieval.py +1403 -0
- alita_sdk/community/inventory/toolkit.py +173 -0
- alita_sdk/community/inventory/toolkit_utils.py +176 -0
- alita_sdk/community/inventory/visualize.py +1370 -0
- alita_sdk/configurations/__init__.py +11 -0
- alita_sdk/configurations/ado.py +148 -2
- alita_sdk/configurations/azure_search.py +1 -1
- alita_sdk/configurations/bigquery.py +1 -1
- alita_sdk/configurations/bitbucket.py +94 -2
- alita_sdk/configurations/browser.py +18 -0
- alita_sdk/configurations/carrier.py +19 -0
- alita_sdk/configurations/confluence.py +130 -1
- alita_sdk/configurations/delta_lake.py +1 -1
- alita_sdk/configurations/figma.py +76 -5
- alita_sdk/configurations/github.py +65 -1
- alita_sdk/configurations/gitlab.py +81 -0
- alita_sdk/configurations/google_places.py +17 -0
- alita_sdk/configurations/jira.py +103 -0
- alita_sdk/configurations/openapi.py +111 -0
- alita_sdk/configurations/postman.py +1 -1
- alita_sdk/configurations/qtest.py +72 -3
- alita_sdk/configurations/report_portal.py +115 -0
- alita_sdk/configurations/salesforce.py +19 -0
- alita_sdk/configurations/service_now.py +1 -12
- alita_sdk/configurations/sharepoint.py +167 -0
- alita_sdk/configurations/sonar.py +18 -0
- alita_sdk/configurations/sql.py +20 -0
- alita_sdk/configurations/testio.py +101 -0
- alita_sdk/configurations/testrail.py +88 -0
- alita_sdk/configurations/xray.py +94 -1
- alita_sdk/configurations/zephyr_enterprise.py +94 -1
- alita_sdk/configurations/zephyr_essential.py +95 -0
- alita_sdk/runtime/clients/artifact.py +21 -4
- alita_sdk/runtime/clients/client.py +458 -67
- alita_sdk/runtime/clients/mcp_discovery.py +342 -0
- alita_sdk/runtime/clients/mcp_manager.py +262 -0
- alita_sdk/runtime/clients/sandbox_client.py +352 -0
- alita_sdk/runtime/langchain/_constants_bkup.py +1318 -0
- alita_sdk/runtime/langchain/assistant.py +183 -43
- alita_sdk/runtime/langchain/constants.py +647 -1
- alita_sdk/runtime/langchain/document_loaders/AlitaDocxMammothLoader.py +315 -3
- alita_sdk/runtime/langchain/document_loaders/AlitaExcelLoader.py +209 -31
- alita_sdk/runtime/langchain/document_loaders/AlitaImageLoader.py +1 -1
- alita_sdk/runtime/langchain/document_loaders/AlitaJSONLinesLoader.py +77 -0
- alita_sdk/runtime/langchain/document_loaders/AlitaJSONLoader.py +10 -3
- alita_sdk/runtime/langchain/document_loaders/AlitaMarkdownLoader.py +66 -0
- alita_sdk/runtime/langchain/document_loaders/AlitaPDFLoader.py +79 -10
- alita_sdk/runtime/langchain/document_loaders/AlitaPowerPointLoader.py +52 -15
- alita_sdk/runtime/langchain/document_loaders/AlitaPythonLoader.py +9 -0
- alita_sdk/runtime/langchain/document_loaders/AlitaTableLoader.py +1 -4
- alita_sdk/runtime/langchain/document_loaders/AlitaTextLoader.py +15 -2
- alita_sdk/runtime/langchain/document_loaders/ImageParser.py +30 -0
- alita_sdk/runtime/langchain/document_loaders/constants.py +189 -41
- alita_sdk/runtime/langchain/interfaces/llm_processor.py +4 -2
- alita_sdk/runtime/langchain/langraph_agent.py +407 -92
- alita_sdk/runtime/langchain/utils.py +102 -8
- alita_sdk/runtime/llms/preloaded.py +2 -6
- alita_sdk/runtime/models/mcp_models.py +61 -0
- alita_sdk/runtime/skills/__init__.py +91 -0
- alita_sdk/runtime/skills/callbacks.py +498 -0
- alita_sdk/runtime/skills/discovery.py +540 -0
- alita_sdk/runtime/skills/executor.py +610 -0
- alita_sdk/runtime/skills/input_builder.py +371 -0
- alita_sdk/runtime/skills/models.py +330 -0
- alita_sdk/runtime/skills/registry.py +355 -0
- alita_sdk/runtime/skills/skill_runner.py +330 -0
- alita_sdk/runtime/toolkits/__init__.py +28 -0
- alita_sdk/runtime/toolkits/application.py +14 -4
- alita_sdk/runtime/toolkits/artifact.py +24 -9
- alita_sdk/runtime/toolkits/datasource.py +13 -6
- alita_sdk/runtime/toolkits/mcp.py +780 -0
- alita_sdk/runtime/toolkits/planning.py +178 -0
- alita_sdk/runtime/toolkits/skill_router.py +238 -0
- alita_sdk/runtime/toolkits/subgraph.py +11 -6
- alita_sdk/runtime/toolkits/tools.py +314 -70
- alita_sdk/runtime/toolkits/vectorstore.py +11 -5
- alita_sdk/runtime/tools/__init__.py +24 -0
- alita_sdk/runtime/tools/application.py +16 -4
- alita_sdk/runtime/tools/artifact.py +367 -33
- alita_sdk/runtime/tools/data_analysis.py +183 -0
- alita_sdk/runtime/tools/function.py +100 -4
- alita_sdk/runtime/tools/graph.py +81 -0
- alita_sdk/runtime/tools/image_generation.py +218 -0
- alita_sdk/runtime/tools/llm.py +1013 -177
- alita_sdk/runtime/tools/loop.py +3 -1
- alita_sdk/runtime/tools/loop_output.py +3 -1
- alita_sdk/runtime/tools/mcp_inspect_tool.py +284 -0
- alita_sdk/runtime/tools/mcp_remote_tool.py +181 -0
- alita_sdk/runtime/tools/mcp_server_tool.py +3 -1
- alita_sdk/runtime/tools/planning/__init__.py +36 -0
- alita_sdk/runtime/tools/planning/models.py +246 -0
- alita_sdk/runtime/tools/planning/wrapper.py +607 -0
- alita_sdk/runtime/tools/router.py +2 -1
- alita_sdk/runtime/tools/sandbox.py +375 -0
- alita_sdk/runtime/tools/skill_router.py +776 -0
- alita_sdk/runtime/tools/tool.py +3 -1
- alita_sdk/runtime/tools/vectorstore.py +69 -65
- alita_sdk/runtime/tools/vectorstore_base.py +163 -90
- alita_sdk/runtime/utils/AlitaCallback.py +137 -21
- alita_sdk/runtime/utils/mcp_client.py +492 -0
- alita_sdk/runtime/utils/mcp_oauth.py +361 -0
- alita_sdk/runtime/utils/mcp_sse_client.py +434 -0
- alita_sdk/runtime/utils/mcp_tools_discovery.py +124 -0
- alita_sdk/runtime/utils/streamlit.py +41 -14
- alita_sdk/runtime/utils/toolkit_utils.py +28 -9
- alita_sdk/runtime/utils/utils.py +48 -0
- alita_sdk/tools/__init__.py +135 -37
- alita_sdk/tools/ado/__init__.py +2 -2
- alita_sdk/tools/ado/repos/__init__.py +15 -19
- alita_sdk/tools/ado/repos/repos_wrapper.py +12 -20
- alita_sdk/tools/ado/test_plan/__init__.py +26 -8
- alita_sdk/tools/ado/test_plan/test_plan_wrapper.py +56 -28
- alita_sdk/tools/ado/wiki/__init__.py +27 -12
- alita_sdk/tools/ado/wiki/ado_wrapper.py +114 -40
- alita_sdk/tools/ado/work_item/__init__.py +27 -12
- alita_sdk/tools/ado/work_item/ado_wrapper.py +95 -11
- alita_sdk/tools/advanced_jira_mining/__init__.py +12 -8
- alita_sdk/tools/aws/delta_lake/__init__.py +14 -11
- alita_sdk/tools/aws/delta_lake/tool.py +5 -1
- alita_sdk/tools/azure_ai/search/__init__.py +13 -8
- alita_sdk/tools/base/tool.py +5 -1
- alita_sdk/tools/base_indexer_toolkit.py +454 -110
- alita_sdk/tools/bitbucket/__init__.py +27 -19
- alita_sdk/tools/bitbucket/api_wrapper.py +285 -27
- alita_sdk/tools/bitbucket/cloud_api_wrapper.py +5 -5
- alita_sdk/tools/browser/__init__.py +41 -16
- alita_sdk/tools/browser/crawler.py +3 -1
- alita_sdk/tools/browser/utils.py +15 -6
- alita_sdk/tools/carrier/__init__.py +18 -17
- alita_sdk/tools/carrier/backend_reports_tool.py +8 -4
- alita_sdk/tools/carrier/excel_reporter.py +8 -4
- alita_sdk/tools/chunkers/__init__.py +3 -1
- alita_sdk/tools/chunkers/code/codeparser.py +1 -1
- alita_sdk/tools/chunkers/sematic/json_chunker.py +2 -1
- alita_sdk/tools/chunkers/sematic/markdown_chunker.py +97 -6
- alita_sdk/tools/chunkers/sematic/proposal_chunker.py +1 -1
- alita_sdk/tools/chunkers/universal_chunker.py +270 -0
- alita_sdk/tools/cloud/aws/__init__.py +11 -7
- alita_sdk/tools/cloud/azure/__init__.py +11 -7
- alita_sdk/tools/cloud/gcp/__init__.py +11 -7
- alita_sdk/tools/cloud/k8s/__init__.py +11 -7
- alita_sdk/tools/code/linter/__init__.py +9 -8
- alita_sdk/tools/code/loaders/codesearcher.py +3 -2
- alita_sdk/tools/code/sonar/__init__.py +20 -13
- alita_sdk/tools/code_indexer_toolkit.py +199 -0
- alita_sdk/tools/confluence/__init__.py +21 -14
- alita_sdk/tools/confluence/api_wrapper.py +197 -58
- alita_sdk/tools/confluence/loader.py +14 -2
- alita_sdk/tools/custom_open_api/__init__.py +11 -5
- alita_sdk/tools/elastic/__init__.py +10 -8
- alita_sdk/tools/elitea_base.py +546 -64
- alita_sdk/tools/figma/__init__.py +11 -8
- alita_sdk/tools/figma/api_wrapper.py +352 -153
- alita_sdk/tools/github/__init__.py +17 -17
- alita_sdk/tools/github/api_wrapper.py +9 -26
- alita_sdk/tools/github/github_client.py +81 -12
- alita_sdk/tools/github/schemas.py +2 -1
- alita_sdk/tools/github/tool.py +5 -1
- alita_sdk/tools/gitlab/__init__.py +18 -13
- alita_sdk/tools/gitlab/api_wrapper.py +224 -80
- alita_sdk/tools/gitlab_org/__init__.py +13 -10
- alita_sdk/tools/google/bigquery/__init__.py +13 -13
- alita_sdk/tools/google/bigquery/tool.py +5 -1
- alita_sdk/tools/google_places/__init__.py +20 -11
- alita_sdk/tools/jira/__init__.py +21 -11
- alita_sdk/tools/jira/api_wrapper.py +315 -168
- alita_sdk/tools/keycloak/__init__.py +10 -8
- alita_sdk/tools/localgit/__init__.py +8 -3
- alita_sdk/tools/localgit/local_git.py +62 -54
- alita_sdk/tools/localgit/tool.py +5 -1
- alita_sdk/tools/memory/__init__.py +38 -14
- alita_sdk/tools/non_code_indexer_toolkit.py +7 -2
- alita_sdk/tools/ocr/__init__.py +10 -8
- alita_sdk/tools/openapi/__init__.py +281 -108
- alita_sdk/tools/openapi/api_wrapper.py +883 -0
- alita_sdk/tools/openapi/tool.py +20 -0
- alita_sdk/tools/pandas/__init__.py +18 -11
- alita_sdk/tools/pandas/api_wrapper.py +40 -45
- alita_sdk/tools/pandas/dataframe/generator/base.py +3 -1
- alita_sdk/tools/postman/__init__.py +10 -11
- alita_sdk/tools/postman/api_wrapper.py +19 -8
- alita_sdk/tools/postman/postman_analysis.py +8 -1
- alita_sdk/tools/pptx/__init__.py +10 -10
- alita_sdk/tools/qtest/__init__.py +21 -14
- alita_sdk/tools/qtest/api_wrapper.py +1784 -88
- alita_sdk/tools/rally/__init__.py +12 -10
- alita_sdk/tools/report_portal/__init__.py +22 -16
- alita_sdk/tools/salesforce/__init__.py +21 -16
- alita_sdk/tools/servicenow/__init__.py +20 -16
- alita_sdk/tools/servicenow/api_wrapper.py +1 -1
- alita_sdk/tools/sharepoint/__init__.py +16 -14
- alita_sdk/tools/sharepoint/api_wrapper.py +179 -39
- alita_sdk/tools/sharepoint/authorization_helper.py +191 -1
- alita_sdk/tools/sharepoint/utils.py +8 -2
- alita_sdk/tools/slack/__init__.py +11 -7
- alita_sdk/tools/sql/__init__.py +21 -19
- alita_sdk/tools/sql/api_wrapper.py +71 -23
- alita_sdk/tools/testio/__init__.py +20 -13
- alita_sdk/tools/testrail/__init__.py +12 -11
- alita_sdk/tools/testrail/api_wrapper.py +214 -46
- alita_sdk/tools/utils/__init__.py +28 -4
- alita_sdk/tools/utils/content_parser.py +182 -62
- alita_sdk/tools/utils/text_operations.py +254 -0
- alita_sdk/tools/vector_adapters/VectorStoreAdapter.py +83 -27
- alita_sdk/tools/xray/__init__.py +17 -14
- alita_sdk/tools/xray/api_wrapper.py +58 -113
- alita_sdk/tools/yagmail/__init__.py +8 -3
- alita_sdk/tools/zephyr/__init__.py +11 -7
- alita_sdk/tools/zephyr_enterprise/__init__.py +15 -9
- alita_sdk/tools/zephyr_enterprise/api_wrapper.py +30 -15
- alita_sdk/tools/zephyr_essential/__init__.py +15 -10
- alita_sdk/tools/zephyr_essential/api_wrapper.py +297 -54
- alita_sdk/tools/zephyr_essential/client.py +6 -4
- alita_sdk/tools/zephyr_scale/__init__.py +12 -8
- alita_sdk/tools/zephyr_scale/api_wrapper.py +39 -31
- alita_sdk/tools/zephyr_squad/__init__.py +11 -7
- {alita_sdk-0.3.257.dist-info → alita_sdk-0.3.562.dist-info}/METADATA +184 -37
- alita_sdk-0.3.562.dist-info/RECORD +450 -0
- alita_sdk-0.3.562.dist-info/entry_points.txt +2 -0
- alita_sdk/tools/bitbucket/tools.py +0 -304
- alita_sdk-0.3.257.dist-info/RECORD +0 -343
- {alita_sdk-0.3.257.dist-info → alita_sdk-0.3.562.dist-info}/WHEEL +0 -0
- {alita_sdk-0.3.257.dist-info → alita_sdk-0.3.562.dist-info}/licenses/LICENSE +0 -0
- {alita_sdk-0.3.257.dist-info → alita_sdk-0.3.562.dist-info}/top_level.txt +0 -0
alita_sdk/tools/utils/content_parser.py (+182 -62)

```diff
@@ -1,13 +1,17 @@
+from copy import deepcopy
 import os
+import re
 import tempfile
 from logging import getLogger
 from pathlib import Path
-from typing import Generator
+from typing import Generator, List

 from langchain_core.documents import Document
 from langchain_core.tools import ToolException

-from alita_sdk.runtime.langchain.document_loaders.constants import loaders_map
+from alita_sdk.runtime.langchain.document_loaders.constants import loaders_map, LoaderProperties
+from ...runtime.langchain.document_loaders.AlitaTextLoader import AlitaTextLoader
+from ...runtime.utils.utils import IndexerKeywords

 logger = getLogger(__name__)

```
```diff
@@ -51,11 +55,9 @@ Highlight any visible details that could help in understanding the image.
 Be as precise and thorough as possible in your responses. If something is unclear or illegible, state that explicitly.
 '''

-IMAGE_EXTENSIONS = ['jpg', 'jpeg', 'png', 'gif', 'bmp', 'tiff', 'webp', 'svg']
-

 def parse_file_content(file_name=None, file_content=None, is_capture_image: bool = False, page_number: int = None,
-                       sheet_name: str = None, llm=None, file_path: str = None, excel_by_sheets: bool = False) -> str | ToolException:
+                       sheet_name: str = None, llm=None, file_path: str = None, excel_by_sheets: bool = False, prompt=None) -> str | ToolException:
     """Parse the content of a file based on its type and return the parsed content.

     Args:
```
```diff
@@ -72,18 +74,63 @@ def parse_file_content(file_name=None, file_content=None, is_capture_image: bool
     Raises:
         ToolException: If the file type is not supported or if there is an error reading the file.
     """
+    if not prompt:
+        prompt = image_processing_prompt
+    loader = prepare_loader(
+        file_name=file_name,
+        file_content=file_content,
+        is_capture_image=is_capture_image,
+        page_number=page_number,
+        sheet_name=sheet_name,
+        llm=llm,
+        file_path=file_path,
+        excel_by_sheets=excel_by_sheets,
+        prompt=prompt
+    )

-    if
-
-
-    extension = Path(file_path if file_path else file_name).suffix
+    if not loader:
+        return ToolException(
+            "Not supported type of files entered. Supported types are TXT, DOCX, PDF, PPTX, XLSX and XLS only.")

-
-
-
+    try:
+        if hasattr(loader, 'get_content'):
+            return loader.get_content()
+        else:
+            extension = Path(file_path if file_path else file_name).suffix
+            loader_kwargs = get_loader_kwargs(loaders_map.get(extension), file_name, file_content, is_capture_image, page_number, sheet_name, llm, file_path, excel_by_sheets)
+            if file_content:
+                return load_content_from_bytes(file_content=file_content,
+                                               extension=extension,
+                                               loader_extra_config=loader_kwargs,
+                                               llm=llm)
+            else:
+                return load_content(file_path=file_path,
+                                    extension=extension,
+                                    loader_extra_config=loader_kwargs,
+                                    llm=llm)
+    except Exception as e:
+        return ToolException(f"Error reading file ({file_name or file_path}) content. Make sure these types are supported: {str(e)}")
+
+def load_file_docs(file_name=None, file_content=None, is_capture_image: bool = False, page_number: int = None,
+                   sheet_name: str = None, llm=None, file_path: str = None, excel_by_sheets: bool = False) -> List[Document] | ToolException:
+    loader = prepare_loader(
+        file_name=file_name,
+        file_content=file_content,
+        is_capture_image=is_capture_image,
+        page_number=page_number,
+        sheet_name=sheet_name,
+        llm=llm,
+        file_path=file_path,
+        excel_by_sheets=excel_by_sheets
+    )
+    if not loader:
         return ToolException(
             "Not supported type of files entered. Supported types are TXT, DOCX, PDF, PPTX, XLSX and XLS only.")
-
+    return loader.load()
+
+def get_loader_kwargs(loader_object, file_name=None, file_content=None, is_capture_image: bool = False, page_number: int = None,
+                      sheet_name: str = None, llm=None, file_path: str = None, excel_by_sheets: bool = False, prompt=None):
+    loader_kwargs = deepcopy(loader_object['kwargs'])
     loader_kwargs.update({
         "file_path": file_path,
         "file_content": file_content,
```
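The hunk above reworks `parse_file_content` around a shared `prepare_loader` helper and adds `load_file_docs` for callers that want `Document` objects instead of a flattened string. A minimal usage sketch, assuming the module path from the file list above and a placeholder LLM client (the file names below are hypothetical):

```python
# Sketch only: exercises the two input shapes the new signatures accept,
# file_path alone, or file_name + file_content (raw bytes), never both.
from alita_sdk.tools.utils.content_parser import parse_file_content, load_file_docs

my_llm = None  # placeholder: pass a real LLM client when is_capture_image=True

# 1) Read from disk; the new optional `prompt` replaces the default image_processing_prompt.
text = parse_file_content(
    file_path="reports/summary.docx",
    is_capture_image=True,
    llm=my_llm,
    prompt="Describe any embedded charts in detail.",
)

# 2) Parse in-memory bytes and get langchain Documents back instead of a string.
raw = b"...bytes fetched from an API..."  # placeholder payload
docs = load_file_docs(file_name="summary.docx", file_content=raw, llm=my_llm)

# parse_file_content returns a ToolException value (rather than raising) on read
# errors, so callers should check the result type before using it.
```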
```diff
@@ -93,28 +140,26 @@ def parse_file_content(file_name=None, file_content=None, is_capture_image: bool
         "page_number": page_number,
         "sheet_name": sheet_name,
         "excel_by_sheets": excel_by_sheets,
+        "prompt": prompt,
         "row_content": True,
         "json_documents": False
     })
-
+    return loader_kwargs

-
-
-
+def prepare_loader(file_name=None, file_content=None, is_capture_image: bool = False, page_number: int = None,
+                   sheet_name: str = None, llm=None, file_path: str = None, excel_by_sheets: bool = False,
+                   prompt=None):
+    if (file_path and (file_name or file_content)) or (not file_path and (not file_name or file_content is None)):
+        raise ToolException("Either (file_name and file_content) or file_path must be provided, but not both.")

-
-
-
-    if
-
-
-
-
-    else:
-        return load_content(file_path=file_path,
-                            extension=extension,
-                            loader_extra_config=loader_kwargs,
-                            llm=llm)
+    extension = Path(file_path if file_path else file_name).suffix
+
+    loader_object = loaders_map.get(extension)
+    if not loader_object:
+        loader_object = loaders_map.get('.txt') # Default to text loader if no specific loader found
+    loader_kwargs = get_loader_kwargs(loader_object, file_name, file_content, is_capture_image, page_number, sheet_name, llm, file_path, excel_by_sheets, prompt)
+    loader = loader_object['class'](**loader_kwargs)
+    return loader

 # TODO: review usage of this function alongside with functions above
 def load_content(file_path: str, extension: str = None, loader_extra_config: dict = None, llm = None) -> str:
```
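`prepare_loader` accepts exactly two argument shapes and raises a `ToolException` otherwise; the check below mirrors the condition added in the hunk above (the values are hypothetical and illustrative only):

```python
# Accepted: file_path alone (read from disk), or file_name + file_content (in-memory bytes).
# Rejected: mixing file_path with the other two, or an incomplete name/content pair.
cases = [
    dict(file_path="a.pdf", file_name=None, file_content=None),        # accepted
    dict(file_path=None, file_name="a.pdf", file_content=b"%PDF..."),  # accepted
    dict(file_path="a.pdf", file_name="a.pdf", file_content=None),     # rejected
    dict(file_path=None, file_name="a.pdf", file_content=None),        # rejected
]
for c in cases:
    invalid = (c["file_path"] and (c["file_name"] or c["file_content"])) or \
              (not c["file_path"] and (not c["file_name"] or c["file_content"] is None))
    print(c["file_path"], c["file_name"], "-> rejected" if invalid else "-> accepted")
```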
```diff
@@ -142,7 +187,7 @@ def load_content(file_path: str, extension: str = None, loader_extra_config: dic
     if "file_path" in loader_kwargs:
         del loader_kwargs["file_path"]

-    loader = loader_cls(file_path, **loader_kwargs)
+    loader = loader_cls(file_path=file_path, **loader_kwargs)
     documents = loader.load()

     page_contents = [doc.page_content for doc in documents]
```
```diff
@@ -167,37 +212,93 @@ def load_content_from_bytes(file_content: bytes, extension: str = None, loader_e
         if temp_file_path and os.path.exists(temp_file_path):
             os.remove(temp_file_path)

-def
-
+def process_document_by_type(content, extension_source: str, document: Document = None, llm = None, chunking_config=None) \
+        -> Generator[Document, None, None]:
+    """Process the content of a file based on its type using a configured loader cosidering the origin document."""
     try:
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        chunks = process_content_by_type(content, extension_source, llm, chunking_config)
+    except Exception as e:
+        msg = f"Error during content parsing for file {extension_source}:\n{e}"
+        logger.warning(msg)
+        yield Document(
+            page_content=msg,
+            metadata={**document.metadata, 'chunk_id': 1}
+        )
+        return
+    #
+    chunks_counter = 0
+    for chunk in chunks:
+        chunks_counter += 1
+        metadata = {**document.metadata, **chunk.metadata}
+        #
+        # ensure each chunk has a unique chunk_id
+        metadata['chunk_id'] = chunks_counter
+        #
+        yield Document(
+            page_content=sanitize_for_postgres(chunk.page_content),
+            metadata=metadata
+        )
+
+
+def process_content_by_type(content, filename: str, llm=None, chunking_config=None, fallback_extensions=None) -> \
+        Generator[Document, None, None]:
+    """Process the content of a file based on its type using a configured loader."""
+    temp_file_path = None
+    extensions = fallback_extensions if fallback_extensions else []
+    match = re.search(r'\.([^.]+)$', filename)
+
+    if match:
+        extensions.insert(0, f".{match.group(1).lower()}")
+    elif not extensions:
+        extensions = [".txt"]
+
+    for extension in extensions:
+        try:
+            with tempfile.NamedTemporaryFile(mode='w+b', suffix=extension, delete=False) as temp_file:
+                temp_file_path = temp_file.name
+                if content is None:
+                    logger.warning(
+                        f"'{IndexerKeywords.CONTENT_IN_BYTES.value}' ie expected but not found in document metadata.")
+                    return []
+
+                temp_file.write(content)
+                temp_file.flush()
+
+                loader_config = loaders_map.get(extension)
+                if not loader_config:
+                    logger.warning(f"No loader found for file extension: {extension}. File: {temp_file_path}")
+                    return []
+
+                loader_cls = loader_config['class']
+                loader_kwargs = loader_config['kwargs']
+                # Determine which loader configuration keys are allowed to be overridden by user input.
+                # If 'allowed_to_override' is specified in the loader configuration, use it; otherwise, allow all keys in loader_kwargs.
+                allowed_to_override = loader_config.get('allowed_to_override', loader_kwargs)
+                # If a chunking_config is provided and contains custom configuration for the current file extension,
+                # update loader_kwargs with user-supplied values, but only for keys explicitly permitted in allowed_to_override and if value differs from default.
+                # This ensures that only safe and intended parameters can be customized, preventing accidental or unauthorized changes
+                # to critical loader settings.
+                if chunking_config and (users_config_for_extension := chunking_config.get(extension, {})):
+                    for key in set(users_config_for_extension.keys()) & set(allowed_to_override.keys()):
+                        if users_config_for_extension[key] != allowed_to_override[key]:
+                            loader_kwargs[key] = users_config_for_extension[key]
+                if LoaderProperties.LLM.value in loader_kwargs and loader_kwargs.pop(LoaderProperties.LLM.value):
+                    loader_kwargs['llm'] = llm
+                if LoaderProperties.PROMPT_DEFAULT.value in loader_kwargs and loader_kwargs.pop(LoaderProperties.PROMPT_DEFAULT.value):
+                    loader_kwargs[LoaderProperties.PROMPT.value] = image_processing_prompt
+                loader = loader_cls(file_path=temp_file_path, **loader_kwargs)
+                yield from loader.load()
+            break
+        except Exception as e:
+            if fallback_extensions:
+                logger.warning(f"Error loading attachment: {str(e)} for file {temp_file_path} (extension: {extension})")
+                logger.warning(f"Continuing with fallback extensions: {fallback_extensions}.")
+                continue
+            else:
+                raise e
+        finally:
+            if temp_file_path and os.path.exists(temp_file_path):
+                os.remove(temp_file_path)

 # FIXME copied from langchain_core/utils/strings.py of 0.3.74 version
 # https://github.com/langchain-ai/langchain/pull/32157
```
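`process_document_by_type` wraps `process_content_by_type`, stamping each emitted chunk with the origin document's metadata and a sequential `chunk_id`. A hedged sketch of a call (the `.md` override key and the metadata values are assumptions, not package defaults; only keys whitelisted via `allowed_to_override` actually take effect):

```python
from langchain_core.documents import Document

origin = Document(page_content="", metadata={"source": "PROJ-1/design-notes.md"})  # hypothetical origin

for chunk in process_document_by_type(
    content=b"# Design notes\n\nSome body text.",   # raw attachment bytes
    extension_source="design-notes.md",
    document=origin,
    llm=None,                                        # pass a real LLM client if the loader requests one
    chunking_config={".md": {"max_tokens": 512}},    # hypothetical key; ignored unless whitelisted
):
    print(chunk.metadata["chunk_id"], len(chunk.page_content))
```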
```diff
@@ -218,4 +319,23 @@ def sanitize_for_postgres(text: str, replacement: str = "") -> str:
     >>> sanitize_for_postgres("Hello\\x00world", " ")
     'Hello world'
     """
-    return text.replace("\x00", replacement)
+    return text.replace("\x00", replacement)
+
+
+def file_extension_by_chunker(chunker_name: str) -> str | None:
+    if not chunker_name:
+        return None
+    name = chunker_name.lower()
+    if name == "markdown":
+        return ".md"
+    if name == "json":
+        return ".json"
+    if name == "text" or name == "txt":
+        return ".txt"
+    if name == "html":
+        return ".html"
+    if name == "xml":
+        return ".xml"
+    if name == "csv":
+        return ".csv"
+    return None
```
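`file_extension_by_chunker` maps a configured chunker name to the temp-file suffix that `process_content_by_type` uses to pick a loader, which helps when an attachment name carries no extension. A short sketch (the attachment name and content are illustrative):

```python
ext = file_extension_by_chunker("markdown")      # -> ".md"; unrecognized names return None

docs = list(process_content_by_type(
    content=b"# Title\n\nBody text.",
    filename="attachment",                       # no extension to sniff from the name itself
    fallback_extensions=[ext] if ext else None,  # so the chunker-derived suffix is used instead
))
```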
alita_sdk/tools/utils/text_operations.py (new file, +254)

```diff
@@ -0,0 +1,254 @@
+"""
+Shared text operations utilities for file manipulation across toolkits.
+
+Provides common functionality for:
+- Parsing OLD/NEW marker-based edits
+- Text file validation
+- Line-based slicing and partial reads
+- Content searching with context
+"""
+import re
+import logging
+from typing import List, Tuple, Dict, Optional
+
+logger = logging.getLogger(__name__)
+
+# Text file extensions that support editing
+TEXT_EDITABLE_EXTENSIONS = {
+    '.md', '.txt', '.csv', '.json', '.xml', '.html',
+    '.yaml', '.yml', '.ini', '.conf', '.log', '.sh',
+    '.py', '.js', '.ts', '.jsx', '.tsx', '.java', '.go',
+    '.rb', '.php', '.c', '.cpp', '.h', '.hpp', '.cs',
+    '.sql', '.r', '.m', '.swift', '.kt', '.rs', '.scala'
+}
+
+
+def parse_old_new_markers(file_query: str) -> List[Tuple[str, str]]:
+    """
+    Parse OLD/NEW marker-based edit instructions.
+
+    Extracts pairs of old and new content from a file query using markers:
+    - OLD <<<< ... >>>> OLD
+    - NEW <<<< ... >>>> NEW
+
+    Args:
+        file_query: String containing marked old and new content sections
+
+    Returns:
+        List of tuples (old_content, new_content) for each edit pair
+
+    Example:
+        >>> query = '''
+        ... OLD <<<<
+        ... Hello World
+        ... >>>> OLD
+        ... NEW <<<<
+        ... Hello Mars
+        ... >>>> NEW
+        ... '''
+        >>> parse_old_new_markers(query)
+        [('Hello World', 'Hello Mars')]
+    """
+    # Split the file content by lines
+    code_lines = file_query.split("\n")
+
+    # Initialize lists to hold the contents of OLD and NEW sections
+    old_contents = []
+    new_contents = []
+
+    # Initialize variables to track whether the current line is within an OLD or NEW section
+    in_old_section = False
+    in_new_section = False
+
+    # Temporary storage for the current section's content
+    current_section_content = []
+
+    # Iterate through each line in the file content
+    for line in code_lines:
+        # Check for OLD section start
+        if "OLD <<<" in line:
+            in_old_section = True
+            current_section_content = []  # Reset current section content
+            continue  # Skip the line with the marker
+
+        # Check for OLD section end
+        if ">>>> OLD" in line:
+            in_old_section = False
+            old_contents.append("\n".join(current_section_content).strip())  # Add the captured content
+            current_section_content = []  # Reset current section content
+            continue  # Skip the line with the marker
+
+        # Check for NEW section start
+        if "NEW <<<" in line:
+            in_new_section = True
+            current_section_content = []  # Reset current section content
+            continue  # Skip the line with the marker
+
+        # Check for NEW section end
+        if ">>>> NEW" in line:
+            in_new_section = False
+            new_contents.append("\n".join(current_section_content).strip())  # Add the captured content
+            current_section_content = []  # Reset current section content
+            continue  # Skip the line with the marker
+
+        # If currently in an OLD or NEW section, add the line to the current section content
+        if in_old_section or in_new_section:
+            current_section_content.append(line)
+
+    # Pair the OLD and NEW contents
+    paired_contents = list(zip(old_contents, new_contents))
+
+    return paired_contents
+
+
+def is_text_editable(filename: str) -> bool:
+    """
+    Check if a file is editable as text based on its extension.
+
+    Args:
+        filename: Name or path of the file to check
+
+    Returns:
+        True if file extension is in the text-editable whitelist
+
+    Example:
+        >>> is_text_editable("config.json")
+        True
+        >>> is_text_editable("image.png")
+        False
+    """
+    from pathlib import Path
+    ext = Path(filename).suffix.lower()
+    return ext in TEXT_EDITABLE_EXTENSIONS
+
+
+def apply_line_slice(
+    content: str,
+    offset: Optional[int] = None,
+    limit: Optional[int] = None,
+    head: Optional[int] = None,
+    tail: Optional[int] = None
+) -> str:
+    """
+    Apply line-based slicing to text content.
+
+    Supports multiple modes:
+    - offset + limit: Read from line `offset` for `limit` lines (1-indexed)
+    - head: Read only first N lines
+    - tail: Read only last N lines
+    - No params: Return full content
+
+    Args:
+        content: Text content to slice
+        offset: Starting line number (1-indexed, inclusive)
+        limit: Number of lines to read from offset
+        head: Return only first N lines
+        tail: Return only last N lines
+
+    Returns:
+        Sliced content as string
+
+    Example:
+        >>> text = "line1\\nline2\\nline3\\nline4\\nline5"
+        >>> apply_line_slice(text, offset=2, limit=2)
+        'line2\\nline3'
+        >>> apply_line_slice(text, head=2)
+        'line1\\nline2'
+        >>> apply_line_slice(text, tail=2)
+        'line4\\nline5'
+    """
+    if not content:
+        return content
+
+    lines = content.splitlines(keepends=True)
+
+    # Head mode: first N lines
+    if head is not None:
+        return ''.join(lines[:head])
+
+    # Tail mode: last N lines
+    if tail is not None:
+        return ''.join(lines[-tail:] if tail > 0 else lines)
+
+    # Offset + limit mode: slice from offset for limit lines
+    if offset is not None:
+        start_idx = max(0, offset - 1)  # Convert 1-indexed to 0-indexed
+        if limit is not None:
+            end_idx = start_idx + limit
+            return ''.join(lines[start_idx:end_idx])
+        else:
+            return ''.join(lines[start_idx:])
+
+    # No slicing parameters: return full content
+    return content
+
+
+def search_in_content(
+    content: str,
+    pattern: str,
+    is_regex: bool = True,
+    context_lines: int = 2
+) -> List[Dict[str, any]]:
+    """
+    Search for pattern in content with context lines.
+
+    Args:
+        content: Text content to search
+        pattern: Search pattern (regex if is_regex=True, else literal string)
+        is_regex: Whether to treat pattern as regex (default True)
+        context_lines: Number of lines before/after match to include (default 2)
+
+    Returns:
+        List of match dictionaries with keys:
+        - line_number: 1-indexed line number of match
+        - line_content: The matching line
+        - match_text: The actual matched text
+        - context_before: List of lines before match
+        - context_after: List of lines after match
+
+    Example:
+        >>> text = "line1\\nHello World\\nline3"
+        >>> matches = search_in_content(text, "Hello", is_regex=False)
+        >>> matches[0]['line_number']
+        2
+        >>> matches[0]['match_text']
+        'Hello'
+    """
+    if not content:
+        return []
+
+    lines = content.splitlines()
+    matches = []
+
+    # Compile regex pattern or escape for literal search
+    if is_regex:
+        try:
+            regex = re.compile(pattern, re.IGNORECASE)
+        except re.error as e:
+            logger.warning(f"Invalid regex pattern '{pattern}': {e}")
+            return []
+    else:
+        regex = re.compile(re.escape(pattern), re.IGNORECASE)
+
+    # Search each line
+    for line_idx, line in enumerate(lines):
+        match = regex.search(line)
+        if match:
+            line_number = line_idx + 1  # Convert to 1-indexed
+
+            # Get context lines
+            context_start = max(0, line_idx - context_lines)
+            context_end = min(len(lines), line_idx + context_lines + 1)
+
+            context_before = lines[context_start:line_idx]
+            context_after = lines[line_idx + 1:context_end]
+
+            matches.append({
+                'line_number': line_number,
+                'line_content': line,
+                'match_text': match.group(0),
+                'context_before': context_before,
+                'context_after': context_after,
+            })
+
+    return matches
```