alita-sdk 0.3.257__py3-none-any.whl → 0.3.584__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of alita-sdk might be problematic. Click here for more details.
- alita_sdk/cli/__init__.py +10 -0
- alita_sdk/cli/__main__.py +17 -0
- alita_sdk/cli/agent/__init__.py +5 -0
- alita_sdk/cli/agent/default.py +258 -0
- alita_sdk/cli/agent_executor.py +155 -0
- alita_sdk/cli/agent_loader.py +215 -0
- alita_sdk/cli/agent_ui.py +228 -0
- alita_sdk/cli/agents.py +3794 -0
- alita_sdk/cli/callbacks.py +647 -0
- alita_sdk/cli/cli.py +168 -0
- alita_sdk/cli/config.py +306 -0
- alita_sdk/cli/context/__init__.py +30 -0
- alita_sdk/cli/context/cleanup.py +198 -0
- alita_sdk/cli/context/manager.py +731 -0
- alita_sdk/cli/context/message.py +285 -0
- alita_sdk/cli/context/strategies.py +289 -0
- alita_sdk/cli/context/token_estimation.py +127 -0
- alita_sdk/cli/formatting.py +182 -0
- alita_sdk/cli/input_handler.py +419 -0
- alita_sdk/cli/inventory.py +1073 -0
- alita_sdk/cli/mcp_loader.py +315 -0
- alita_sdk/cli/toolkit.py +327 -0
- alita_sdk/cli/toolkit_loader.py +85 -0
- alita_sdk/cli/tools/__init__.py +43 -0
- alita_sdk/cli/tools/approval.py +224 -0
- alita_sdk/cli/tools/filesystem.py +1751 -0
- alita_sdk/cli/tools/planning.py +389 -0
- alita_sdk/cli/tools/terminal.py +414 -0
- alita_sdk/community/__init__.py +72 -12
- alita_sdk/community/inventory/__init__.py +236 -0
- alita_sdk/community/inventory/config.py +257 -0
- alita_sdk/community/inventory/enrichment.py +2137 -0
- alita_sdk/community/inventory/extractors.py +1469 -0
- alita_sdk/community/inventory/ingestion.py +3172 -0
- alita_sdk/community/inventory/knowledge_graph.py +1457 -0
- alita_sdk/community/inventory/parsers/__init__.py +218 -0
- alita_sdk/community/inventory/parsers/base.py +295 -0
- alita_sdk/community/inventory/parsers/csharp_parser.py +907 -0
- alita_sdk/community/inventory/parsers/go_parser.py +851 -0
- alita_sdk/community/inventory/parsers/html_parser.py +389 -0
- alita_sdk/community/inventory/parsers/java_parser.py +593 -0
- alita_sdk/community/inventory/parsers/javascript_parser.py +629 -0
- alita_sdk/community/inventory/parsers/kotlin_parser.py +768 -0
- alita_sdk/community/inventory/parsers/markdown_parser.py +362 -0
- alita_sdk/community/inventory/parsers/python_parser.py +604 -0
- alita_sdk/community/inventory/parsers/rust_parser.py +858 -0
- alita_sdk/community/inventory/parsers/swift_parser.py +832 -0
- alita_sdk/community/inventory/parsers/text_parser.py +322 -0
- alita_sdk/community/inventory/parsers/yaml_parser.py +370 -0
- alita_sdk/community/inventory/patterns/__init__.py +61 -0
- alita_sdk/community/inventory/patterns/ast_adapter.py +380 -0
- alita_sdk/community/inventory/patterns/loader.py +348 -0
- alita_sdk/community/inventory/patterns/registry.py +198 -0
- alita_sdk/community/inventory/presets.py +535 -0
- alita_sdk/community/inventory/retrieval.py +1403 -0
- alita_sdk/community/inventory/toolkit.py +173 -0
- alita_sdk/community/inventory/toolkit_utils.py +176 -0
- alita_sdk/community/inventory/visualize.py +1370 -0
- alita_sdk/configurations/__init__.py +11 -0
- alita_sdk/configurations/ado.py +148 -2
- alita_sdk/configurations/azure_search.py +1 -1
- alita_sdk/configurations/bigquery.py +1 -1
- alita_sdk/configurations/bitbucket.py +94 -2
- alita_sdk/configurations/browser.py +18 -0
- alita_sdk/configurations/carrier.py +19 -0
- alita_sdk/configurations/confluence.py +130 -1
- alita_sdk/configurations/delta_lake.py +1 -1
- alita_sdk/configurations/figma.py +76 -5
- alita_sdk/configurations/github.py +65 -1
- alita_sdk/configurations/gitlab.py +81 -0
- alita_sdk/configurations/google_places.py +17 -0
- alita_sdk/configurations/jira.py +103 -0
- alita_sdk/configurations/openapi.py +323 -0
- alita_sdk/configurations/postman.py +1 -1
- alita_sdk/configurations/qtest.py +72 -3
- alita_sdk/configurations/report_portal.py +115 -0
- alita_sdk/configurations/salesforce.py +19 -0
- alita_sdk/configurations/service_now.py +1 -12
- alita_sdk/configurations/sharepoint.py +167 -0
- alita_sdk/configurations/sonar.py +18 -0
- alita_sdk/configurations/sql.py +20 -0
- alita_sdk/configurations/testio.py +101 -0
- alita_sdk/configurations/testrail.py +88 -0
- alita_sdk/configurations/xray.py +94 -1
- alita_sdk/configurations/zephyr_enterprise.py +94 -1
- alita_sdk/configurations/zephyr_essential.py +95 -0
- alita_sdk/runtime/clients/artifact.py +21 -4
- alita_sdk/runtime/clients/client.py +458 -67
- alita_sdk/runtime/clients/mcp_discovery.py +342 -0
- alita_sdk/runtime/clients/mcp_manager.py +262 -0
- alita_sdk/runtime/clients/sandbox_client.py +352 -0
- alita_sdk/runtime/langchain/_constants_bkup.py +1318 -0
- alita_sdk/runtime/langchain/assistant.py +183 -43
- alita_sdk/runtime/langchain/constants.py +647 -1
- alita_sdk/runtime/langchain/document_loaders/AlitaDocxMammothLoader.py +315 -3
- alita_sdk/runtime/langchain/document_loaders/AlitaExcelLoader.py +209 -31
- alita_sdk/runtime/langchain/document_loaders/AlitaImageLoader.py +1 -1
- alita_sdk/runtime/langchain/document_loaders/AlitaJSONLinesLoader.py +77 -0
- alita_sdk/runtime/langchain/document_loaders/AlitaJSONLoader.py +10 -3
- alita_sdk/runtime/langchain/document_loaders/AlitaMarkdownLoader.py +66 -0
- alita_sdk/runtime/langchain/document_loaders/AlitaPDFLoader.py +79 -10
- alita_sdk/runtime/langchain/document_loaders/AlitaPowerPointLoader.py +52 -15
- alita_sdk/runtime/langchain/document_loaders/AlitaPythonLoader.py +9 -0
- alita_sdk/runtime/langchain/document_loaders/AlitaTableLoader.py +1 -4
- alita_sdk/runtime/langchain/document_loaders/AlitaTextLoader.py +15 -2
- alita_sdk/runtime/langchain/document_loaders/ImageParser.py +30 -0
- alita_sdk/runtime/langchain/document_loaders/constants.py +189 -41
- alita_sdk/runtime/langchain/interfaces/llm_processor.py +4 -2
- alita_sdk/runtime/langchain/langraph_agent.py +493 -105
- alita_sdk/runtime/langchain/utils.py +118 -8
- alita_sdk/runtime/llms/preloaded.py +2 -6
- alita_sdk/runtime/models/mcp_models.py +61 -0
- alita_sdk/runtime/skills/__init__.py +91 -0
- alita_sdk/runtime/skills/callbacks.py +498 -0
- alita_sdk/runtime/skills/discovery.py +540 -0
- alita_sdk/runtime/skills/executor.py +610 -0
- alita_sdk/runtime/skills/input_builder.py +371 -0
- alita_sdk/runtime/skills/models.py +330 -0
- alita_sdk/runtime/skills/registry.py +355 -0
- alita_sdk/runtime/skills/skill_runner.py +330 -0
- alita_sdk/runtime/toolkits/__init__.py +28 -0
- alita_sdk/runtime/toolkits/application.py +14 -4
- alita_sdk/runtime/toolkits/artifact.py +25 -9
- alita_sdk/runtime/toolkits/datasource.py +13 -6
- alita_sdk/runtime/toolkits/mcp.py +782 -0
- alita_sdk/runtime/toolkits/planning.py +178 -0
- alita_sdk/runtime/toolkits/skill_router.py +238 -0
- alita_sdk/runtime/toolkits/subgraph.py +11 -6
- alita_sdk/runtime/toolkits/tools.py +314 -70
- alita_sdk/runtime/toolkits/vectorstore.py +11 -5
- alita_sdk/runtime/tools/__init__.py +24 -0
- alita_sdk/runtime/tools/application.py +16 -4
- alita_sdk/runtime/tools/artifact.py +367 -33
- alita_sdk/runtime/tools/data_analysis.py +183 -0
- alita_sdk/runtime/tools/function.py +100 -4
- alita_sdk/runtime/tools/graph.py +81 -0
- alita_sdk/runtime/tools/image_generation.py +218 -0
- alita_sdk/runtime/tools/llm.py +1032 -177
- alita_sdk/runtime/tools/loop.py +3 -1
- alita_sdk/runtime/tools/loop_output.py +3 -1
- alita_sdk/runtime/tools/mcp_inspect_tool.py +284 -0
- alita_sdk/runtime/tools/mcp_remote_tool.py +181 -0
- alita_sdk/runtime/tools/mcp_server_tool.py +3 -1
- alita_sdk/runtime/tools/planning/__init__.py +36 -0
- alita_sdk/runtime/tools/planning/models.py +246 -0
- alita_sdk/runtime/tools/planning/wrapper.py +607 -0
- alita_sdk/runtime/tools/router.py +2 -1
- alita_sdk/runtime/tools/sandbox.py +375 -0
- alita_sdk/runtime/tools/skill_router.py +776 -0
- alita_sdk/runtime/tools/tool.py +3 -1
- alita_sdk/runtime/tools/vectorstore.py +69 -65
- alita_sdk/runtime/tools/vectorstore_base.py +163 -90
- alita_sdk/runtime/utils/AlitaCallback.py +137 -21
- alita_sdk/runtime/utils/constants.py +5 -1
- alita_sdk/runtime/utils/mcp_client.py +492 -0
- alita_sdk/runtime/utils/mcp_oauth.py +361 -0
- alita_sdk/runtime/utils/mcp_sse_client.py +434 -0
- alita_sdk/runtime/utils/mcp_tools_discovery.py +124 -0
- alita_sdk/runtime/utils/streamlit.py +41 -14
- alita_sdk/runtime/utils/toolkit_utils.py +28 -9
- alita_sdk/runtime/utils/utils.py +48 -0
- alita_sdk/tools/__init__.py +135 -37
- alita_sdk/tools/ado/__init__.py +2 -2
- alita_sdk/tools/ado/repos/__init__.py +16 -19
- alita_sdk/tools/ado/repos/repos_wrapper.py +12 -20
- alita_sdk/tools/ado/test_plan/__init__.py +27 -8
- alita_sdk/tools/ado/test_plan/test_plan_wrapper.py +56 -28
- alita_sdk/tools/ado/wiki/__init__.py +28 -12
- alita_sdk/tools/ado/wiki/ado_wrapper.py +114 -40
- alita_sdk/tools/ado/work_item/__init__.py +28 -12
- alita_sdk/tools/ado/work_item/ado_wrapper.py +95 -11
- alita_sdk/tools/advanced_jira_mining/__init__.py +13 -8
- alita_sdk/tools/aws/delta_lake/__init__.py +15 -11
- alita_sdk/tools/aws/delta_lake/tool.py +5 -1
- alita_sdk/tools/azure_ai/search/__init__.py +14 -8
- alita_sdk/tools/base/tool.py +5 -1
- alita_sdk/tools/base_indexer_toolkit.py +454 -110
- alita_sdk/tools/bitbucket/__init__.py +28 -19
- alita_sdk/tools/bitbucket/api_wrapper.py +285 -27
- alita_sdk/tools/bitbucket/cloud_api_wrapper.py +5 -5
- alita_sdk/tools/browser/__init__.py +41 -16
- alita_sdk/tools/browser/crawler.py +3 -1
- alita_sdk/tools/browser/utils.py +15 -6
- alita_sdk/tools/carrier/__init__.py +18 -17
- alita_sdk/tools/carrier/backend_reports_tool.py +8 -4
- alita_sdk/tools/carrier/excel_reporter.py +8 -4
- alita_sdk/tools/chunkers/__init__.py +3 -1
- alita_sdk/tools/chunkers/code/codeparser.py +1 -1
- alita_sdk/tools/chunkers/sematic/json_chunker.py +2 -1
- alita_sdk/tools/chunkers/sematic/markdown_chunker.py +97 -6
- alita_sdk/tools/chunkers/sematic/proposal_chunker.py +1 -1
- alita_sdk/tools/chunkers/universal_chunker.py +270 -0
- alita_sdk/tools/cloud/aws/__init__.py +12 -7
- alita_sdk/tools/cloud/azure/__init__.py +12 -7
- alita_sdk/tools/cloud/gcp/__init__.py +12 -7
- alita_sdk/tools/cloud/k8s/__init__.py +12 -7
- alita_sdk/tools/code/linter/__init__.py +10 -8
- alita_sdk/tools/code/loaders/codesearcher.py +3 -2
- alita_sdk/tools/code/sonar/__init__.py +21 -13
- alita_sdk/tools/code_indexer_toolkit.py +199 -0
- alita_sdk/tools/confluence/__init__.py +22 -14
- alita_sdk/tools/confluence/api_wrapper.py +197 -58
- alita_sdk/tools/confluence/loader.py +14 -2
- alita_sdk/tools/custom_open_api/__init__.py +12 -5
- alita_sdk/tools/elastic/__init__.py +11 -8
- alita_sdk/tools/elitea_base.py +546 -64
- alita_sdk/tools/figma/__init__.py +60 -11
- alita_sdk/tools/figma/api_wrapper.py +1400 -167
- alita_sdk/tools/figma/figma_client.py +73 -0
- alita_sdk/tools/figma/toon_tools.py +2748 -0
- alita_sdk/tools/github/__init__.py +18 -17
- alita_sdk/tools/github/api_wrapper.py +9 -26
- alita_sdk/tools/github/github_client.py +81 -12
- alita_sdk/tools/github/schemas.py +2 -1
- alita_sdk/tools/github/tool.py +5 -1
- alita_sdk/tools/gitlab/__init__.py +19 -13
- alita_sdk/tools/gitlab/api_wrapper.py +256 -80
- alita_sdk/tools/gitlab_org/__init__.py +14 -10
- alita_sdk/tools/google/bigquery/__init__.py +14 -13
- alita_sdk/tools/google/bigquery/tool.py +5 -1
- alita_sdk/tools/google_places/__init__.py +21 -11
- alita_sdk/tools/jira/__init__.py +22 -11
- alita_sdk/tools/jira/api_wrapper.py +315 -168
- alita_sdk/tools/keycloak/__init__.py +11 -8
- alita_sdk/tools/localgit/__init__.py +9 -3
- alita_sdk/tools/localgit/local_git.py +62 -54
- alita_sdk/tools/localgit/tool.py +5 -1
- alita_sdk/tools/memory/__init__.py +38 -14
- alita_sdk/tools/non_code_indexer_toolkit.py +7 -2
- alita_sdk/tools/ocr/__init__.py +11 -8
- alita_sdk/tools/openapi/__init__.py +491 -106
- alita_sdk/tools/openapi/api_wrapper.py +1357 -0
- alita_sdk/tools/openapi/tool.py +20 -0
- alita_sdk/tools/pandas/__init__.py +20 -12
- alita_sdk/tools/pandas/api_wrapper.py +40 -45
- alita_sdk/tools/pandas/dataframe/generator/base.py +3 -1
- alita_sdk/tools/postman/__init__.py +11 -11
- alita_sdk/tools/postman/api_wrapper.py +19 -8
- alita_sdk/tools/postman/postman_analysis.py +8 -1
- alita_sdk/tools/pptx/__init__.py +11 -10
- alita_sdk/tools/qtest/__init__.py +22 -14
- alita_sdk/tools/qtest/api_wrapper.py +1784 -88
- alita_sdk/tools/rally/__init__.py +13 -10
- alita_sdk/tools/report_portal/__init__.py +23 -16
- alita_sdk/tools/salesforce/__init__.py +22 -16
- alita_sdk/tools/servicenow/__init__.py +21 -16
- alita_sdk/tools/servicenow/api_wrapper.py +1 -1
- alita_sdk/tools/sharepoint/__init__.py +17 -14
- alita_sdk/tools/sharepoint/api_wrapper.py +179 -39
- alita_sdk/tools/sharepoint/authorization_helper.py +191 -1
- alita_sdk/tools/sharepoint/utils.py +8 -2
- alita_sdk/tools/slack/__init__.py +13 -8
- alita_sdk/tools/sql/__init__.py +22 -19
- alita_sdk/tools/sql/api_wrapper.py +71 -23
- alita_sdk/tools/testio/__init__.py +21 -13
- alita_sdk/tools/testrail/__init__.py +13 -11
- alita_sdk/tools/testrail/api_wrapper.py +214 -46
- alita_sdk/tools/utils/__init__.py +28 -4
- alita_sdk/tools/utils/content_parser.py +241 -55
- alita_sdk/tools/utils/text_operations.py +254 -0
- alita_sdk/tools/vector_adapters/VectorStoreAdapter.py +83 -27
- alita_sdk/tools/xray/__init__.py +18 -14
- alita_sdk/tools/xray/api_wrapper.py +58 -113
- alita_sdk/tools/yagmail/__init__.py +9 -3
- alita_sdk/tools/zephyr/__init__.py +12 -7
- alita_sdk/tools/zephyr_enterprise/__init__.py +16 -9
- alita_sdk/tools/zephyr_enterprise/api_wrapper.py +30 -15
- alita_sdk/tools/zephyr_essential/__init__.py +16 -10
- alita_sdk/tools/zephyr_essential/api_wrapper.py +297 -54
- alita_sdk/tools/zephyr_essential/client.py +6 -4
- alita_sdk/tools/zephyr_scale/__init__.py +13 -8
- alita_sdk/tools/zephyr_scale/api_wrapper.py +39 -31
- alita_sdk/tools/zephyr_squad/__init__.py +12 -7
- {alita_sdk-0.3.257.dist-info → alita_sdk-0.3.584.dist-info}/METADATA +184 -37
- alita_sdk-0.3.584.dist-info/RECORD +452 -0
- alita_sdk-0.3.584.dist-info/entry_points.txt +2 -0
- alita_sdk/tools/bitbucket/tools.py +0 -304
- alita_sdk-0.3.257.dist-info/RECORD +0 -343
- {alita_sdk-0.3.257.dist-info → alita_sdk-0.3.584.dist-info}/WHEEL +0 -0
- {alita_sdk-0.3.257.dist-info → alita_sdk-0.3.584.dist-info}/licenses/LICENSE +0 -0
- {alita_sdk-0.3.257.dist-info → alita_sdk-0.3.584.dist-info}/top_level.txt +0 -0
|
@@ -1,13 +1,17 @@
|
|
|
1
|
+
from copy import deepcopy
|
|
1
2
|
import os
|
|
3
|
+
import re
|
|
2
4
|
import tempfile
|
|
3
5
|
from logging import getLogger
|
|
4
6
|
from pathlib import Path
|
|
5
|
-
from typing import Generator
|
|
7
|
+
from typing import Generator, List
|
|
6
8
|
|
|
7
9
|
from langchain_core.documents import Document
|
|
8
10
|
from langchain_core.tools import ToolException
|
|
9
11
|
|
|
10
|
-
from alita_sdk.runtime.langchain.document_loaders.constants import loaders_map
|
|
12
|
+
from alita_sdk.runtime.langchain.document_loaders.constants import loaders_map, LoaderProperties
|
|
13
|
+
from ...runtime.langchain.document_loaders.AlitaTextLoader import AlitaTextLoader
|
|
14
|
+
from ...runtime.utils.utils import IndexerKeywords
|
|
11
15
|
|
|
12
16
|
logger = getLogger(__name__)
|
|
13
17
|
|
|
@@ -51,11 +55,9 @@ Highlight any visible details that could help in understanding the image.
|
|
|
51
55
|
Be as precise and thorough as possible in your responses. If something is unclear or illegible, state that explicitly.
|
|
52
56
|
'''
|
|
53
57
|
|
|
54
|
-
IMAGE_EXTENSIONS = ['jpg', 'jpeg', 'png', 'gif', 'bmp', 'tiff', 'webp', 'svg']
|
|
55
|
-
|
|
56
58
|
|
|
57
59
|
def parse_file_content(file_name=None, file_content=None, is_capture_image: bool = False, page_number: int = None,
|
|
58
|
-
sheet_name: str = None, llm=None, file_path: str = None, excel_by_sheets: bool = False) -> str | ToolException:
|
|
60
|
+
sheet_name: str = None, llm=None, file_path: str = None, excel_by_sheets: bool = False, prompt=None) -> str | ToolException:
|
|
59
61
|
"""Parse the content of a file based on its type and return the parsed content.
|
|
60
62
|
|
|
61
63
|
Args:
|
|
@@ -72,18 +74,94 @@ def parse_file_content(file_name=None, file_content=None, is_capture_image: bool
|
|
|
72
74
|
Raises:
|
|
73
75
|
ToolException: If the file type is not supported or if there is an error reading the file.
|
|
74
76
|
"""
|
|
77
|
+
if not prompt:
|
|
78
|
+
prompt = image_processing_prompt
|
|
79
|
+
loader = prepare_loader(
|
|
80
|
+
file_name=file_name,
|
|
81
|
+
file_content=file_content,
|
|
82
|
+
is_capture_image=is_capture_image,
|
|
83
|
+
page_number=page_number,
|
|
84
|
+
sheet_name=sheet_name,
|
|
85
|
+
llm=llm,
|
|
86
|
+
file_path=file_path,
|
|
87
|
+
excel_by_sheets=excel_by_sheets,
|
|
88
|
+
prompt=prompt
|
|
89
|
+
)
|
|
75
90
|
|
|
76
|
-
if
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
extension = Path(file_path if file_path else file_name).suffix
|
|
91
|
+
if not loader:
|
|
92
|
+
return ToolException(
|
|
93
|
+
"Not supported type of files entered. Supported types are TXT, DOCX, PDF, PPTX, XLSX and XLS only.")
|
|
80
94
|
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
95
|
+
try:
|
|
96
|
+
if hasattr(loader, 'get_content'):
|
|
97
|
+
return loader.get_content()
|
|
98
|
+
else:
|
|
99
|
+
extension = Path(file_path if file_path else file_name).suffix
|
|
100
|
+
loader_kwargs = get_loader_kwargs(loaders_map.get(extension), file_name, file_content, is_capture_image, page_number, sheet_name, llm, file_path, excel_by_sheets)
|
|
101
|
+
if file_content:
|
|
102
|
+
return load_content_from_bytes(file_content=file_content,
|
|
103
|
+
extension=extension,
|
|
104
|
+
loader_extra_config=loader_kwargs,
|
|
105
|
+
llm=llm)
|
|
106
|
+
else:
|
|
107
|
+
return load_content(file_path=file_path,
|
|
108
|
+
extension=extension,
|
|
109
|
+
loader_extra_config=loader_kwargs,
|
|
110
|
+
llm=llm)
|
|
111
|
+
except Exception as e:
|
|
112
|
+
return ToolException(f"Error reading file ({file_name or file_path}) content. Make sure these types are supported: {str(e)}")
|
|
113
|
+
|
|
114
|
+
def load_file_docs(file_name=None, file_content=None, is_capture_image: bool = False, page_number: int = None,
|
|
115
|
+
sheet_name: str = None, llm=None, file_path: str = None, excel_by_sheets: bool = False) -> List[Document] | ToolException:
|
|
116
|
+
loader = prepare_loader(
|
|
117
|
+
file_name=file_name,
|
|
118
|
+
file_content=file_content,
|
|
119
|
+
is_capture_image=is_capture_image,
|
|
120
|
+
page_number=page_number,
|
|
121
|
+
sheet_name=sheet_name,
|
|
122
|
+
llm=llm,
|
|
123
|
+
file_path=file_path,
|
|
124
|
+
excel_by_sheets=excel_by_sheets
|
|
125
|
+
)
|
|
126
|
+
if not loader:
|
|
84
127
|
return ToolException(
|
|
85
128
|
"Not supported type of files entered. Supported types are TXT, DOCX, PDF, PPTX, XLSX and XLS only.")
|
|
86
|
-
|
|
129
|
+
return loader.load()
|
|
130
|
+
|
|
131
|
+
def get_loader_kwargs(loader_object, file_name=None, file_content=None, is_capture_image: bool = False, page_number: int = None,
|
|
132
|
+
sheet_name: str = None, llm=None, file_path: str = None, excel_by_sheets: bool = False, prompt=None):
|
|
133
|
+
"""Build loader kwargs safely without deepcopying non-picklable objects like LLMs.
|
|
134
|
+
|
|
135
|
+
We avoid copying keys that are going to be overridden by this function anyway
|
|
136
|
+
(file_path, file_content, file_name, extract_images, llm, page_number,
|
|
137
|
+
sheet_name, excel_by_sheets, prompt, row_content, json_documents) to
|
|
138
|
+
prevent errors such as `cannot pickle '_thread.RLock' object` when an LLM
|
|
139
|
+
or client with internal locks is stored in the original kwargs.
|
|
140
|
+
"""
|
|
141
|
+
if not loader_object:
|
|
142
|
+
raise ToolException("Loader configuration is missing.")
|
|
143
|
+
|
|
144
|
+
original_kwargs = loader_object.get("kwargs", {}) or {}
|
|
145
|
+
|
|
146
|
+
# Keys that will be overwritten below – skip them when copying
|
|
147
|
+
overridden_keys = {
|
|
148
|
+
"file_path",
|
|
149
|
+
"file_content",
|
|
150
|
+
"file_name",
|
|
151
|
+
"extract_images",
|
|
152
|
+
"llm",
|
|
153
|
+
"page_number",
|
|
154
|
+
"sheet_name",
|
|
155
|
+
"excel_by_sheets",
|
|
156
|
+
"prompt",
|
|
157
|
+
"row_content",
|
|
158
|
+
"json_documents",
|
|
159
|
+
}
|
|
160
|
+
|
|
161
|
+
# Build a safe shallow copy without overridden keys to avoid deepcopy
|
|
162
|
+
# of potentially non-picklable objects (e.g., llm with internal RLock).
|
|
163
|
+
loader_kwargs = {k: v for k, v in original_kwargs.items() if k not in overridden_keys}
|
|
164
|
+
|
|
87
165
|
loader_kwargs.update({
|
|
88
166
|
"file_path": file_path,
|
|
89
167
|
"file_content": file_content,
|
|
@@ -93,28 +171,26 @@ def parse_file_content(file_name=None, file_content=None, is_capture_image: bool
|
|
|
93
171
|
"page_number": page_number,
|
|
94
172
|
"sheet_name": sheet_name,
|
|
95
173
|
"excel_by_sheets": excel_by_sheets,
|
|
174
|
+
"prompt": prompt,
|
|
96
175
|
"row_content": True,
|
|
97
176
|
"json_documents": False
|
|
98
177
|
})
|
|
99
|
-
|
|
178
|
+
return loader_kwargs
|
|
100
179
|
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
180
|
+
def prepare_loader(file_name=None, file_content=None, is_capture_image: bool = False, page_number: int = None,
|
|
181
|
+
sheet_name: str = None, llm=None, file_path: str = None, excel_by_sheets: bool = False,
|
|
182
|
+
prompt=None):
|
|
183
|
+
if (file_path and (file_name or file_content)) or (not file_path and (not file_name or file_content is None)):
|
|
184
|
+
raise ToolException("Either (file_name and file_content) or file_path must be provided, but not both.")
|
|
104
185
|
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
if
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
else:
|
|
114
|
-
return load_content(file_path=file_path,
|
|
115
|
-
extension=extension,
|
|
116
|
-
loader_extra_config=loader_kwargs,
|
|
117
|
-
llm=llm)
|
|
186
|
+
extension = Path(file_path if file_path else file_name).suffix
|
|
187
|
+
|
|
188
|
+
loader_object = loaders_map.get(extension)
|
|
189
|
+
if not loader_object:
|
|
190
|
+
loader_object = loaders_map.get('.txt') # Default to text loader if no specific loader found
|
|
191
|
+
loader_kwargs = get_loader_kwargs(loader_object, file_name, file_content, is_capture_image, page_number, sheet_name, llm, file_path, excel_by_sheets, prompt)
|
|
192
|
+
loader = loader_object['class'](**loader_kwargs)
|
|
193
|
+
return loader
|
|
118
194
|
|
|
119
195
|
# TODO: review usage of this function alongside with functions above
|
|
120
196
|
def load_content(file_path: str, extension: str = None, loader_extra_config: dict = None, llm = None) -> str:
|
|
@@ -142,7 +218,7 @@ def load_content(file_path: str, extension: str = None, loader_extra_config: dic
|
|
|
142
218
|
if "file_path" in loader_kwargs:
|
|
143
219
|
del loader_kwargs["file_path"]
|
|
144
220
|
|
|
145
|
-
loader = loader_cls(file_path, **loader_kwargs)
|
|
221
|
+
loader = loader_cls(file_path=file_path, **loader_kwargs)
|
|
146
222
|
documents = loader.load()
|
|
147
223
|
|
|
148
224
|
page_contents = [doc.page_content for doc in documents]
|
|
@@ -167,38 +243,129 @@ def load_content_from_bytes(file_content: bytes, extension: str = None, loader_e
|
|
|
167
243
|
if temp_file_path and os.path.exists(temp_file_path):
|
|
168
244
|
os.remove(temp_file_path)
|
|
169
245
|
|
|
170
|
-
|
|
246
|
+
|
|
247
|
+
def _load_content_from_bytes_with_prompt(file_content: bytes, extension: str = None, loader_extra_config: dict = None, llm = None, prompt: str = image_processing_prompt) -> str:
|
|
248
|
+
"""Internal helper that behaves like load_content_from_bytes but also propagates prompt.
|
|
249
|
+
|
|
250
|
+
This keeps the public load_content_from_bytes API unchanged while allowing newer
|
|
251
|
+
code paths to pass an explicit prompt through to the loader.
|
|
252
|
+
"""
|
|
171
253
|
temp_file_path = None
|
|
172
254
|
try:
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
with tempfile.NamedTemporaryFile(mode='w+b', suffix=extension, delete=False) as temp_file:
|
|
176
|
-
temp_file_path = temp_file.name
|
|
177
|
-
if content is None:
|
|
178
|
-
logger.warning("'loader_content' ie expected but not found in document metadata.")
|
|
179
|
-
return
|
|
180
|
-
|
|
181
|
-
temp_file.write(content)
|
|
255
|
+
with tempfile.NamedTemporaryFile(mode='w+b', delete=False, suffix=extension or '') as temp_file:
|
|
256
|
+
temp_file.write(file_content)
|
|
182
257
|
temp_file.flush()
|
|
258
|
+
temp_file_path = temp_file.name
|
|
183
259
|
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
|
|
187
|
-
|
|
188
|
-
|
|
189
|
-
|
|
190
|
-
|
|
260
|
+
# Use prepare_loader so that prompt and other kwargs are handled consistently
|
|
261
|
+
loader = prepare_loader(
|
|
262
|
+
file_name=None,
|
|
263
|
+
file_content=None,
|
|
264
|
+
is_capture_image=loader_extra_config.get('extract_images') if loader_extra_config else False,
|
|
265
|
+
page_number=loader_extra_config.get('page_number') if loader_extra_config else None,
|
|
266
|
+
sheet_name=loader_extra_config.get('sheet_name') if loader_extra_config else None,
|
|
267
|
+
llm=llm or (loader_extra_config.get('llm') if loader_extra_config else None),
|
|
268
|
+
file_path=temp_file_path,
|
|
269
|
+
excel_by_sheets=loader_extra_config.get('excel_by_sheets') if loader_extra_config else False,
|
|
270
|
+
prompt=prompt or (loader_extra_config.get('prompt') if loader_extra_config else image_processing_prompt),
|
|
271
|
+
)
|
|
191
272
|
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
|
|
195
|
-
page_content=sanitize_for_postgres(chunk.page_content),
|
|
196
|
-
metadata={**document.metadata, **chunk.metadata}
|
|
197
|
-
)
|
|
273
|
+
documents = loader.load()
|
|
274
|
+
page_contents = [doc.page_content for doc in documents]
|
|
275
|
+
return "\n".join(page_contents)
|
|
198
276
|
finally:
|
|
199
277
|
if temp_file_path and os.path.exists(temp_file_path):
|
|
200
278
|
os.remove(temp_file_path)
|
|
201
279
|
|
|
280
|
+
|
|
281
|
+
def process_document_by_type(content, extension_source: str, document: Document = None, llm = None, chunking_config=None) \
|
|
282
|
+
-> Generator[Document, None, None]:
|
|
283
|
+
"""Process the content of a file based on its type using a configured loader cosidering the origin document."""
|
|
284
|
+
try:
|
|
285
|
+
chunks = process_content_by_type(content, extension_source, llm, chunking_config)
|
|
286
|
+
except Exception as e:
|
|
287
|
+
msg = f"Error during content parsing for file {extension_source}:\n{e}"
|
|
288
|
+
logger.warning(msg)
|
|
289
|
+
yield Document(
|
|
290
|
+
page_content=msg,
|
|
291
|
+
metadata={**document.metadata, 'chunk_id': 1}
|
|
292
|
+
)
|
|
293
|
+
return
|
|
294
|
+
#
|
|
295
|
+
chunks_counter = 0
|
|
296
|
+
for chunk in chunks:
|
|
297
|
+
chunks_counter += 1
|
|
298
|
+
metadata = {**document.metadata, **chunk.metadata}
|
|
299
|
+
#
|
|
300
|
+
# ensure each chunk has a unique chunk_id
|
|
301
|
+
metadata['chunk_id'] = chunks_counter
|
|
302
|
+
#
|
|
303
|
+
yield Document(
|
|
304
|
+
page_content=sanitize_for_postgres(chunk.page_content),
|
|
305
|
+
metadata=metadata
|
|
306
|
+
)
|
|
307
|
+
|
|
308
|
+
|
|
309
|
+
def process_content_by_type(content, filename: str, llm=None, chunking_config=None, fallback_extensions=None) -> \
|
|
310
|
+
Generator[Document, None, None]:
|
|
311
|
+
"""Process the content of a file based on its type using a configured loader."""
|
|
312
|
+
temp_file_path = None
|
|
313
|
+
extensions = fallback_extensions if fallback_extensions else []
|
|
314
|
+
match = re.search(r'\.([^.]+)$', filename)
|
|
315
|
+
|
|
316
|
+
if match:
|
|
317
|
+
extensions.insert(0, f".{match.group(1).lower()}")
|
|
318
|
+
elif not extensions:
|
|
319
|
+
extensions = [".txt"]
|
|
320
|
+
|
|
321
|
+
for extension in extensions:
|
|
322
|
+
try:
|
|
323
|
+
with tempfile.NamedTemporaryFile(mode='w+b', suffix=extension, delete=False) as temp_file:
|
|
324
|
+
temp_file_path = temp_file.name
|
|
325
|
+
if content is None:
|
|
326
|
+
logger.warning(
|
|
327
|
+
f"'{IndexerKeywords.CONTENT_IN_BYTES.value}' ie expected but not found in document metadata.")
|
|
328
|
+
return []
|
|
329
|
+
|
|
330
|
+
temp_file.write(content)
|
|
331
|
+
temp_file.flush()
|
|
332
|
+
|
|
333
|
+
loader_config = loaders_map.get(extension)
|
|
334
|
+
if not loader_config:
|
|
335
|
+
logger.warning(f"No loader found for file extension: {extension}. File: {temp_file_path}")
|
|
336
|
+
return []
|
|
337
|
+
|
|
338
|
+
loader_cls = loader_config['class']
|
|
339
|
+
loader_kwargs = loader_config['kwargs']
|
|
340
|
+
# Determine which loader configuration keys are allowed to be overridden by user input.
|
|
341
|
+
# If 'allowed_to_override' is specified in the loader configuration, use it; otherwise, allow all keys in loader_kwargs.
|
|
342
|
+
allowed_to_override = loader_config.get('allowed_to_override', loader_kwargs)
|
|
343
|
+
# If a chunking_config is provided and contains custom configuration for the current file extension,
|
|
344
|
+
# update loader_kwargs with user-supplied values, but only for keys explicitly permitted in allowed_to_override and if value differs from default.
|
|
345
|
+
# This ensures that only safe and intended parameters can be customized, preventing accidental or unauthorized changes
|
|
346
|
+
# to critical loader settings.
|
|
347
|
+
if chunking_config and (users_config_for_extension := chunking_config.get(extension, {})):
|
|
348
|
+
for key in set(users_config_for_extension.keys()) & set(allowed_to_override.keys()):
|
|
349
|
+
if users_config_for_extension[key] != allowed_to_override[key]:
|
|
350
|
+
loader_kwargs[key] = users_config_for_extension[key]
|
|
351
|
+
if LoaderProperties.LLM.value in loader_kwargs and loader_kwargs.pop(LoaderProperties.LLM.value):
|
|
352
|
+
loader_kwargs['llm'] = llm
|
|
353
|
+
if LoaderProperties.PROMPT_DEFAULT.value in loader_kwargs and loader_kwargs.pop(LoaderProperties.PROMPT_DEFAULT.value):
|
|
354
|
+
loader_kwargs[LoaderProperties.PROMPT.value] = image_processing_prompt
|
|
355
|
+
loader = loader_cls(file_path=temp_file_path, **loader_kwargs)
|
|
356
|
+
yield from loader.load()
|
|
357
|
+
break
|
|
358
|
+
except Exception as e:
|
|
359
|
+
if fallback_extensions:
|
|
360
|
+
logger.warning(f"Error loading attachment: {str(e)} for file {temp_file_path} (extension: {extension})")
|
|
361
|
+
logger.warning(f"Continuing with fallback extensions: {fallback_extensions}.")
|
|
362
|
+
continue
|
|
363
|
+
else:
|
|
364
|
+
raise e
|
|
365
|
+
finally:
|
|
366
|
+
if temp_file_path and os.path.exists(temp_file_path):
|
|
367
|
+
os.remove(temp_file_path)
|
|
368
|
+
|
|
202
369
|
# FIXME copied from langchain_core/utils/strings.py of 0.3.74 version
|
|
203
370
|
# https://github.com/langchain-ai/langchain/pull/32157
|
|
204
371
|
# should be used from langchain_core.utils import sanitize_for_postgres once updated to newer version
|
|
@@ -218,4 +385,23 @@ def sanitize_for_postgres(text: str, replacement: str = "") -> str:
|
|
|
218
385
|
>>> sanitize_for_postgres("Hello\\x00world", " ")
|
|
219
386
|
'Hello world'
|
|
220
387
|
"""
|
|
221
|
-
return text.replace("\x00", replacement)
|
|
388
|
+
return text.replace("\x00", replacement)
|
|
389
|
+
|
|
390
|
+
|
|
391
|
+
def file_extension_by_chunker(chunker_name: str) -> str | None:
|
|
392
|
+
if not chunker_name:
|
|
393
|
+
return None
|
|
394
|
+
name = chunker_name.lower()
|
|
395
|
+
if name == "markdown":
|
|
396
|
+
return ".md"
|
|
397
|
+
if name == "json":
|
|
398
|
+
return ".json"
|
|
399
|
+
if name == "text" or name == "txt":
|
|
400
|
+
return ".txt"
|
|
401
|
+
if name == "html":
|
|
402
|
+
return ".html"
|
|
403
|
+
if name == "xml":
|
|
404
|
+
return ".xml"
|
|
405
|
+
if name == "csv":
|
|
406
|
+
return ".csv"
|
|
407
|
+
return None
|
|
@@ -0,0 +1,254 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Shared text operations utilities for file manipulation across toolkits.
|
|
3
|
+
|
|
4
|
+
Provides common functionality for:
|
|
5
|
+
- Parsing OLD/NEW marker-based edits
|
|
6
|
+
- Text file validation
|
|
7
|
+
- Line-based slicing and partial reads
|
|
8
|
+
- Content searching with context
|
|
9
|
+
"""
|
|
10
|
+
import logging
import re
from typing import Any, Dict, List, Optional, Tuple
|
|
13
|
+
|
|
14
|
+
logger = logging.getLogger(__name__)
|
|
15
|
+
|
|
16
|
+
# Text file extensions that support editing
|
|
17
|
+
TEXT_EDITABLE_EXTENSIONS = {
|
|
18
|
+
'.md', '.txt', '.csv', '.json', '.xml', '.html',
|
|
19
|
+
'.yaml', '.yml', '.ini', '.conf', '.log', '.sh',
|
|
20
|
+
'.py', '.js', '.ts', '.jsx', '.tsx', '.java', '.go',
|
|
21
|
+
'.rb', '.php', '.c', '.cpp', '.h', '.hpp', '.cs',
|
|
22
|
+
'.sql', '.r', '.m', '.swift', '.kt', '.rs', '.scala'
|
|
23
|
+
}
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def parse_old_new_markers(file_query: str) -> List[Tuple[str, str]]:
    """
    Parse OLD/NEW marker-based edit instructions.

    Extracts pairs of old and new content from a file query using markers:
    - OLD <<<< ... >>>> OLD
    - NEW <<<< ... >>>> NEW

    Args:
        file_query: String containing marked old and new content sections

    Returns:
        List of tuples (old_content, new_content) for each edit pair

    Example:
        >>> query = '''
        ... OLD <<<<
        ... Hello World
        ... >>>> OLD
        ... NEW <<<<
        ... Hello Mars
        ... >>>> NEW
        ... '''
        >>> parse_old_new_markers(query)
        [('Hello World', 'Hello Mars')]
    """
    old_contents: List[str] = []
    new_contents: List[str] = []

    # Track which section (if any) we are currently inside.
    in_old_section = False
    in_new_section = False
    # Accumulates the lines of the section being read.
    current_section_content: List[str] = []

    for line in file_query.split("\n"):
        # Start markers open a fresh buffer, but only when we are not already
        # inside a section -- a marker-looking line within a section is content.
        if "OLD <<<" in line and not in_old_section and not in_new_section:
            in_old_section = True
            current_section_content = []
            continue
        if "NEW <<<" in line and not in_old_section and not in_new_section:
            in_new_section = True
            current_section_content = []
            continue

        # End markers are only honored inside their matching section.  This
        # prevents a stray ">>>> OLD" outside any section (or ">>>> NEW"
        # inside an OLD block) from producing spurious or misplaced entries.
        if ">>>> OLD" in line and in_old_section:
            in_old_section = False
            old_contents.append("\n".join(current_section_content).strip())
            current_section_content = []
            continue
        if ">>>> NEW" in line and in_new_section:
            in_new_section = False
            new_contents.append("\n".join(current_section_content).strip())
            current_section_content = []
            continue

        # Inside a section: capture the line verbatim.
        if in_old_section or in_new_section:
            current_section_content.append(line)

    # Pair OLD and NEW sections positionally; an unmatched trailing section
    # is dropped by zip, matching the original behavior.
    return list(zip(old_contents, new_contents))
|
|
102
|
+
|
|
103
|
+
|
|
104
|
+
def is_text_editable(filename: str) -> bool:
    """
    Check whether a file can be edited as text, judged by its extension.

    Args:
        filename: Name or path of the file to check

    Returns:
        True if the file's extension appears in the text-editable whitelist

    Example:
        >>> is_text_editable("config.json")
        True
        >>> is_text_editable("image.png")
        False
    """
    from pathlib import Path

    suffix = Path(filename).suffix
    return suffix.lower() in TEXT_EDITABLE_EXTENSIONS
|
|
123
|
+
|
|
124
|
+
|
|
125
|
+
def apply_line_slice(
    content: str,
    offset: Optional[int] = None,
    limit: Optional[int] = None,
    head: Optional[int] = None,
    tail: Optional[int] = None
) -> str:
    """
    Return a line-based slice of ``content``.

    Modes (checked in this order):
    - head: only the first N lines
    - tail: only the last N lines
    - offset (+ optional limit): from line ``offset`` (1-indexed) for
      ``limit`` lines; without ``limit``, to the end
    - no parameters: the content unchanged

    Lines keep their trailing newlines (``splitlines(keepends=True)``), so a
    slice that does not reach the last line ends with a newline.

    Args:
        content: Text content to slice
        offset: Starting line number (1-indexed, inclusive)
        limit: Number of lines to read from offset
        head: Return only first N lines
        tail: Return only last N lines

    Returns:
        Sliced content as string

    Example:
        >>> text = "line1\\nline2\\nline3\\nline4\\nline5"
        >>> apply_line_slice(text, offset=2, limit=2)
        'line2\\nline3\\n'
        >>> apply_line_slice(text, head=2)
        'line1\\nline2\\n'
        >>> apply_line_slice(text, tail=2)
        'line4\\nline5'
    """
    if not content:
        return content

    lines = content.splitlines(keepends=True)

    # Head mode: first N lines.
    if head is not None:
        selected = lines[:head]
    # Tail mode: last N lines.  tail <= 0 falls back to the whole content
    # (lines[-0:] would too, but the explicit branch keeps the intent clear).
    elif tail is not None:
        selected = lines[-tail:] if tail > 0 else lines
    # Offset (+ optional limit) mode.
    elif offset is not None:
        begin = max(0, offset - 1)  # convert 1-indexed to 0-indexed
        end = begin + limit if limit is not None else None
        selected = lines[begin:end]
    # No slicing parameters: return full content.
    else:
        return content

    return ''.join(selected)
|
|
184
|
+
|
|
185
|
+
|
|
186
|
+
def search_in_content(
    content: str,
    pattern: str,
    is_regex: bool = True,
    context_lines: int = 2
) -> List[Dict[str, Any]]:
    """
    Search for pattern in content with context lines.

    Matching is case-insensitive in both regex and literal modes.

    Args:
        content: Text content to search
        pattern: Search pattern (regex if is_regex=True, else literal string)
        is_regex: Whether to treat pattern as regex (default True)
        context_lines: Number of lines before/after match to include (default 2)

    Returns:
        List of match dictionaries with keys:
        - line_number: 1-indexed line number of match
        - line_content: The matching line
        - match_text: The actual matched text
        - context_before: List of lines before match
        - context_after: List of lines after match

    Example:
        >>> text = "line1\\nHello World\\nline3"
        >>> matches = search_in_content(text, "Hello", is_regex=False)
        >>> matches[0]['line_number']
        2
        >>> matches[0]['match_text']
        'Hello'
    """
    if not content:
        return []

    # Literal patterns are escaped so they match verbatim; escaped patterns
    # cannot fail to compile, so a single try covers both modes.
    source_pattern = pattern if is_regex else re.escape(pattern)
    try:
        regex = re.compile(source_pattern, re.IGNORECASE)
    except re.error as e:
        # An invalid user-supplied regex yields no matches rather than raising.
        logger.warning(f"Invalid regex pattern '{pattern}': {e}")
        return []

    lines = content.splitlines()
    matches: List[Dict[str, Any]] = []

    # Scan line by line; only the first match per line is reported.
    for line_idx, line in enumerate(lines):
        match = regex.search(line)
        if not match:
            continue

        # Clamp the context window to the content boundaries.
        context_start = max(0, line_idx - context_lines)
        context_end = min(len(lines), line_idx + context_lines + 1)

        matches.append({
            'line_number': line_idx + 1,  # 1-indexed
            'line_content': line,
            'match_text': match.group(0),
            'context_before': lines[context_start:line_idx],
            'context_after': lines[line_idx + 1:context_end],
        })

    return matches