alita-sdk 0.3.263__py3-none-any.whl → 0.3.499__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- alita_sdk/cli/__init__.py +10 -0
- alita_sdk/cli/__main__.py +17 -0
- alita_sdk/cli/agent/__init__.py +5 -0
- alita_sdk/cli/agent/default.py +258 -0
- alita_sdk/cli/agent_executor.py +155 -0
- alita_sdk/cli/agent_loader.py +215 -0
- alita_sdk/cli/agent_ui.py +228 -0
- alita_sdk/cli/agents.py +3601 -0
- alita_sdk/cli/callbacks.py +647 -0
- alita_sdk/cli/cli.py +168 -0
- alita_sdk/cli/config.py +306 -0
- alita_sdk/cli/context/__init__.py +30 -0
- alita_sdk/cli/context/cleanup.py +198 -0
- alita_sdk/cli/context/manager.py +731 -0
- alita_sdk/cli/context/message.py +285 -0
- alita_sdk/cli/context/strategies.py +289 -0
- alita_sdk/cli/context/token_estimation.py +127 -0
- alita_sdk/cli/formatting.py +182 -0
- alita_sdk/cli/input_handler.py +419 -0
- alita_sdk/cli/inventory.py +1256 -0
- alita_sdk/cli/mcp_loader.py +315 -0
- alita_sdk/cli/toolkit.py +327 -0
- alita_sdk/cli/toolkit_loader.py +85 -0
- alita_sdk/cli/tools/__init__.py +43 -0
- alita_sdk/cli/tools/approval.py +224 -0
- alita_sdk/cli/tools/filesystem.py +1751 -0
- alita_sdk/cli/tools/planning.py +389 -0
- alita_sdk/cli/tools/terminal.py +414 -0
- alita_sdk/community/__init__.py +64 -8
- alita_sdk/community/inventory/__init__.py +224 -0
- alita_sdk/community/inventory/config.py +257 -0
- alita_sdk/community/inventory/enrichment.py +2137 -0
- alita_sdk/community/inventory/extractors.py +1469 -0
- alita_sdk/community/inventory/ingestion.py +3172 -0
- alita_sdk/community/inventory/knowledge_graph.py +1457 -0
- alita_sdk/community/inventory/parsers/__init__.py +218 -0
- alita_sdk/community/inventory/parsers/base.py +295 -0
- alita_sdk/community/inventory/parsers/csharp_parser.py +907 -0
- alita_sdk/community/inventory/parsers/go_parser.py +851 -0
- alita_sdk/community/inventory/parsers/html_parser.py +389 -0
- alita_sdk/community/inventory/parsers/java_parser.py +593 -0
- alita_sdk/community/inventory/parsers/javascript_parser.py +629 -0
- alita_sdk/community/inventory/parsers/kotlin_parser.py +768 -0
- alita_sdk/community/inventory/parsers/markdown_parser.py +362 -0
- alita_sdk/community/inventory/parsers/python_parser.py +604 -0
- alita_sdk/community/inventory/parsers/rust_parser.py +858 -0
- alita_sdk/community/inventory/parsers/swift_parser.py +832 -0
- alita_sdk/community/inventory/parsers/text_parser.py +322 -0
- alita_sdk/community/inventory/parsers/yaml_parser.py +370 -0
- alita_sdk/community/inventory/patterns/__init__.py +61 -0
- alita_sdk/community/inventory/patterns/ast_adapter.py +380 -0
- alita_sdk/community/inventory/patterns/loader.py +348 -0
- alita_sdk/community/inventory/patterns/registry.py +198 -0
- alita_sdk/community/inventory/presets.py +535 -0
- alita_sdk/community/inventory/retrieval.py +1403 -0
- alita_sdk/community/inventory/toolkit.py +173 -0
- alita_sdk/community/inventory/visualize.py +1370 -0
- alita_sdk/configurations/__init__.py +10 -0
- alita_sdk/configurations/ado.py +4 -2
- alita_sdk/configurations/azure_search.py +1 -1
- alita_sdk/configurations/bigquery.py +1 -1
- alita_sdk/configurations/bitbucket.py +94 -2
- alita_sdk/configurations/browser.py +18 -0
- alita_sdk/configurations/carrier.py +19 -0
- alita_sdk/configurations/confluence.py +96 -1
- alita_sdk/configurations/delta_lake.py +1 -1
- alita_sdk/configurations/figma.py +0 -5
- alita_sdk/configurations/github.py +65 -1
- alita_sdk/configurations/gitlab.py +79 -0
- alita_sdk/configurations/google_places.py +17 -0
- alita_sdk/configurations/jira.py +103 -0
- alita_sdk/configurations/postman.py +1 -1
- alita_sdk/configurations/qtest.py +1 -3
- alita_sdk/configurations/report_portal.py +19 -0
- alita_sdk/configurations/salesforce.py +19 -0
- alita_sdk/configurations/service_now.py +1 -12
- alita_sdk/configurations/sharepoint.py +19 -0
- alita_sdk/configurations/sonar.py +18 -0
- alita_sdk/configurations/sql.py +20 -0
- alita_sdk/configurations/testio.py +18 -0
- alita_sdk/configurations/testrail.py +88 -0
- alita_sdk/configurations/xray.py +94 -1
- alita_sdk/configurations/zephyr_enterprise.py +94 -1
- alita_sdk/configurations/zephyr_essential.py +95 -0
- alita_sdk/runtime/clients/artifact.py +12 -2
- alita_sdk/runtime/clients/client.py +235 -66
- alita_sdk/runtime/clients/mcp_discovery.py +342 -0
- alita_sdk/runtime/clients/mcp_manager.py +262 -0
- alita_sdk/runtime/clients/sandbox_client.py +373 -0
- alita_sdk/runtime/langchain/assistant.py +123 -17
- alita_sdk/runtime/langchain/constants.py +8 -1
- alita_sdk/runtime/langchain/document_loaders/AlitaDocxMammothLoader.py +315 -3
- alita_sdk/runtime/langchain/document_loaders/AlitaExcelLoader.py +209 -31
- alita_sdk/runtime/langchain/document_loaders/AlitaImageLoader.py +1 -1
- alita_sdk/runtime/langchain/document_loaders/AlitaJSONLoader.py +8 -2
- alita_sdk/runtime/langchain/document_loaders/AlitaMarkdownLoader.py +66 -0
- alita_sdk/runtime/langchain/document_loaders/AlitaPDFLoader.py +79 -10
- alita_sdk/runtime/langchain/document_loaders/AlitaPowerPointLoader.py +52 -15
- alita_sdk/runtime/langchain/document_loaders/AlitaPythonLoader.py +9 -0
- alita_sdk/runtime/langchain/document_loaders/AlitaTableLoader.py +1 -4
- alita_sdk/runtime/langchain/document_loaders/AlitaTextLoader.py +15 -2
- alita_sdk/runtime/langchain/document_loaders/ImageParser.py +30 -0
- alita_sdk/runtime/langchain/document_loaders/constants.py +187 -40
- alita_sdk/runtime/langchain/interfaces/llm_processor.py +4 -2
- alita_sdk/runtime/langchain/langraph_agent.py +406 -91
- alita_sdk/runtime/langchain/utils.py +51 -8
- alita_sdk/runtime/llms/preloaded.py +2 -6
- alita_sdk/runtime/models/mcp_models.py +61 -0
- alita_sdk/runtime/toolkits/__init__.py +26 -0
- alita_sdk/runtime/toolkits/application.py +9 -2
- alita_sdk/runtime/toolkits/artifact.py +19 -7
- alita_sdk/runtime/toolkits/datasource.py +13 -6
- alita_sdk/runtime/toolkits/mcp.py +780 -0
- alita_sdk/runtime/toolkits/planning.py +178 -0
- alita_sdk/runtime/toolkits/subgraph.py +11 -6
- alita_sdk/runtime/toolkits/tools.py +214 -60
- alita_sdk/runtime/toolkits/vectorstore.py +9 -4
- alita_sdk/runtime/tools/__init__.py +22 -0
- alita_sdk/runtime/tools/application.py +16 -4
- alita_sdk/runtime/tools/artifact.py +312 -19
- alita_sdk/runtime/tools/function.py +100 -4
- alita_sdk/runtime/tools/graph.py +81 -0
- alita_sdk/runtime/tools/image_generation.py +212 -0
- alita_sdk/runtime/tools/llm.py +539 -180
- alita_sdk/runtime/tools/mcp_inspect_tool.py +284 -0
- alita_sdk/runtime/tools/mcp_remote_tool.py +181 -0
- alita_sdk/runtime/tools/mcp_server_tool.py +3 -1
- alita_sdk/runtime/tools/planning/__init__.py +36 -0
- alita_sdk/runtime/tools/planning/models.py +246 -0
- alita_sdk/runtime/tools/planning/wrapper.py +607 -0
- alita_sdk/runtime/tools/router.py +2 -1
- alita_sdk/runtime/tools/sandbox.py +375 -0
- alita_sdk/runtime/tools/vectorstore.py +62 -63
- alita_sdk/runtime/tools/vectorstore_base.py +156 -85
- alita_sdk/runtime/utils/AlitaCallback.py +106 -20
- alita_sdk/runtime/utils/mcp_client.py +465 -0
- alita_sdk/runtime/utils/mcp_oauth.py +244 -0
- alita_sdk/runtime/utils/mcp_sse_client.py +405 -0
- alita_sdk/runtime/utils/mcp_tools_discovery.py +124 -0
- alita_sdk/runtime/utils/streamlit.py +41 -14
- alita_sdk/runtime/utils/toolkit_utils.py +28 -9
- alita_sdk/runtime/utils/utils.py +14 -0
- alita_sdk/tools/__init__.py +78 -35
- alita_sdk/tools/ado/__init__.py +0 -1
- alita_sdk/tools/ado/repos/__init__.py +10 -6
- alita_sdk/tools/ado/repos/repos_wrapper.py +12 -11
- alita_sdk/tools/ado/test_plan/__init__.py +10 -7
- alita_sdk/tools/ado/test_plan/test_plan_wrapper.py +56 -23
- alita_sdk/tools/ado/wiki/__init__.py +10 -11
- alita_sdk/tools/ado/wiki/ado_wrapper.py +114 -28
- alita_sdk/tools/ado/work_item/__init__.py +10 -11
- alita_sdk/tools/ado/work_item/ado_wrapper.py +63 -10
- alita_sdk/tools/advanced_jira_mining/__init__.py +10 -7
- alita_sdk/tools/aws/delta_lake/__init__.py +13 -11
- alita_sdk/tools/azure_ai/search/__init__.py +11 -7
- alita_sdk/tools/base_indexer_toolkit.py +392 -86
- alita_sdk/tools/bitbucket/__init__.py +18 -11
- alita_sdk/tools/bitbucket/api_wrapper.py +52 -9
- alita_sdk/tools/bitbucket/cloud_api_wrapper.py +5 -5
- alita_sdk/tools/browser/__init__.py +40 -16
- alita_sdk/tools/browser/crawler.py +3 -1
- alita_sdk/tools/browser/utils.py +15 -6
- alita_sdk/tools/carrier/__init__.py +17 -17
- alita_sdk/tools/carrier/backend_reports_tool.py +8 -4
- alita_sdk/tools/carrier/excel_reporter.py +8 -4
- alita_sdk/tools/chunkers/__init__.py +3 -1
- alita_sdk/tools/chunkers/code/codeparser.py +1 -1
- alita_sdk/tools/chunkers/sematic/json_chunker.py +1 -0
- alita_sdk/tools/chunkers/sematic/markdown_chunker.py +97 -6
- alita_sdk/tools/chunkers/sematic/proposal_chunker.py +1 -1
- alita_sdk/tools/chunkers/universal_chunker.py +270 -0
- alita_sdk/tools/cloud/aws/__init__.py +9 -6
- alita_sdk/tools/cloud/azure/__init__.py +9 -6
- alita_sdk/tools/cloud/gcp/__init__.py +9 -6
- alita_sdk/tools/cloud/k8s/__init__.py +9 -6
- alita_sdk/tools/code/linter/__init__.py +7 -7
- alita_sdk/tools/code/loaders/codesearcher.py +3 -2
- alita_sdk/tools/code/sonar/__init__.py +18 -12
- alita_sdk/tools/code_indexer_toolkit.py +199 -0
- alita_sdk/tools/confluence/__init__.py +14 -11
- alita_sdk/tools/confluence/api_wrapper.py +198 -58
- alita_sdk/tools/confluence/loader.py +10 -0
- alita_sdk/tools/custom_open_api/__init__.py +9 -4
- alita_sdk/tools/elastic/__init__.py +8 -7
- alita_sdk/tools/elitea_base.py +543 -64
- alita_sdk/tools/figma/__init__.py +10 -8
- alita_sdk/tools/figma/api_wrapper.py +352 -153
- alita_sdk/tools/github/__init__.py +13 -11
- alita_sdk/tools/github/api_wrapper.py +9 -26
- alita_sdk/tools/github/github_client.py +75 -12
- alita_sdk/tools/github/schemas.py +2 -1
- alita_sdk/tools/gitlab/__init__.py +11 -10
- alita_sdk/tools/gitlab/api_wrapper.py +135 -45
- alita_sdk/tools/gitlab_org/__init__.py +11 -9
- alita_sdk/tools/google/bigquery/__init__.py +12 -13
- alita_sdk/tools/google_places/__init__.py +18 -10
- alita_sdk/tools/jira/__init__.py +14 -8
- alita_sdk/tools/jira/api_wrapper.py +315 -168
- alita_sdk/tools/keycloak/__init__.py +8 -7
- alita_sdk/tools/localgit/local_git.py +56 -54
- alita_sdk/tools/memory/__init__.py +27 -11
- alita_sdk/tools/non_code_indexer_toolkit.py +7 -2
- alita_sdk/tools/ocr/__init__.py +8 -7
- alita_sdk/tools/openapi/__init__.py +10 -1
- alita_sdk/tools/pandas/__init__.py +8 -7
- alita_sdk/tools/pandas/api_wrapper.py +7 -25
- alita_sdk/tools/postman/__init__.py +8 -10
- alita_sdk/tools/postman/api_wrapper.py +19 -8
- alita_sdk/tools/postman/postman_analysis.py +8 -1
- alita_sdk/tools/pptx/__init__.py +8 -9
- alita_sdk/tools/qtest/__init__.py +19 -13
- alita_sdk/tools/qtest/api_wrapper.py +1784 -88
- alita_sdk/tools/rally/__init__.py +10 -9
- alita_sdk/tools/report_portal/__init__.py +20 -15
- alita_sdk/tools/salesforce/__init__.py +19 -15
- alita_sdk/tools/servicenow/__init__.py +14 -11
- alita_sdk/tools/sharepoint/__init__.py +14 -13
- alita_sdk/tools/sharepoint/api_wrapper.py +179 -39
- alita_sdk/tools/sharepoint/authorization_helper.py +191 -1
- alita_sdk/tools/sharepoint/utils.py +8 -2
- alita_sdk/tools/slack/__init__.py +10 -7
- alita_sdk/tools/sql/__init__.py +19 -18
- alita_sdk/tools/sql/api_wrapper.py +71 -23
- alita_sdk/tools/testio/__init__.py +18 -12
- alita_sdk/tools/testrail/__init__.py +10 -10
- alita_sdk/tools/testrail/api_wrapper.py +213 -45
- alita_sdk/tools/utils/__init__.py +28 -4
- alita_sdk/tools/utils/content_parser.py +181 -61
- alita_sdk/tools/utils/text_operations.py +254 -0
- alita_sdk/tools/vector_adapters/VectorStoreAdapter.py +83 -27
- alita_sdk/tools/xray/__init__.py +12 -7
- alita_sdk/tools/xray/api_wrapper.py +58 -113
- alita_sdk/tools/zephyr/__init__.py +9 -6
- alita_sdk/tools/zephyr_enterprise/__init__.py +13 -8
- alita_sdk/tools/zephyr_enterprise/api_wrapper.py +17 -7
- alita_sdk/tools/zephyr_essential/__init__.py +13 -9
- alita_sdk/tools/zephyr_essential/api_wrapper.py +289 -47
- alita_sdk/tools/zephyr_essential/client.py +6 -4
- alita_sdk/tools/zephyr_scale/__init__.py +10 -7
- alita_sdk/tools/zephyr_scale/api_wrapper.py +6 -2
- alita_sdk/tools/zephyr_squad/__init__.py +9 -6
- {alita_sdk-0.3.263.dist-info → alita_sdk-0.3.499.dist-info}/METADATA +180 -33
- alita_sdk-0.3.499.dist-info/RECORD +433 -0
- alita_sdk-0.3.499.dist-info/entry_points.txt +2 -0
- alita_sdk-0.3.263.dist-info/RECORD +0 -342
- {alita_sdk-0.3.263.dist-info → alita_sdk-0.3.499.dist-info}/WHEEL +0 -0
- {alita_sdk-0.3.263.dist-info → alita_sdk-0.3.499.dist-info}/licenses/LICENSE +0 -0
- {alita_sdk-0.3.263.dist-info → alita_sdk-0.3.499.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,375 @@
|
|
|
1
|
+
import asyncio
|
|
2
|
+
import logging
|
|
3
|
+
import subprocess
|
|
4
|
+
import os
|
|
5
|
+
from typing import Any, Type, Optional, Dict, List, Literal, Union
|
|
6
|
+
from copy import deepcopy
|
|
7
|
+
from pathlib import Path
|
|
8
|
+
|
|
9
|
+
from langchain_core.tools import BaseTool, BaseToolkit
|
|
10
|
+
from langchain_core.messages import ToolCall
|
|
11
|
+
from pydantic import BaseModel, create_model, ConfigDict, Field
|
|
12
|
+
from pydantic.fields import FieldInfo
|
|
13
|
+
|
|
14
|
+
logger = logging.getLogger(__name__)
|
|
15
|
+
|
|
16
|
+
name = "pyodide"
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def get_tools(tools_list: list, alita_client=None, llm=None, memory_store=None):
    """
    Get sandbox tools for the provided tool configurations.

    Args:
        tools_list: List of tool configurations (dicts; entries whose
            'type' or 'toolkit_name' equals 'sandbox' are handled here)
        alita_client: Alita client instance for sandbox tools
        llm: LLM client instance (unused for sandbox)
        memory_store: Optional memory store instance (unused for sandbox)

    Returns:
        List of sandbox tools (empty when no sandbox entries are present)

    Raises:
        Exception: re-raises whatever SandboxToolkit.get_toolkit raised,
            after logging the offending tool configuration.
    """
    all_tools = []

    for tool in tools_list:
        if tool.get('type') == 'sandbox' or tool.get('toolkit_name') == 'sandbox':
            # Tolerate a missing/None 'settings' key: fall back to defaults
            # instead of failing with a bare KeyError before the toolkit
            # can produce a meaningful error.
            settings = tool.get('settings') or {}
            try:
                toolkit_instance = SandboxToolkit.get_toolkit(
                    stateful=settings.get('stateful', False),
                    allow_net=settings.get('allow_net', True),
                    alita_client=alita_client,
                    toolkit_name=tool.get('toolkit_name', '')
                )
                all_tools.extend(toolkit_instance.get_tools())
            except Exception as e:
                logger.error(f"Error in sandbox toolkit get_tools: {e}")
                logger.error(f"Tool config: {tool}")
                raise

    return all_tools
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
def _is_deno_available() -> bool:
|
|
53
|
+
"""Check if Deno is available in the PATH"""
|
|
54
|
+
try:
|
|
55
|
+
result = subprocess.run(
|
|
56
|
+
["deno", "--version"],
|
|
57
|
+
capture_output=True,
|
|
58
|
+
text=True,
|
|
59
|
+
timeout=10
|
|
60
|
+
)
|
|
61
|
+
return result.returncode == 0
|
|
62
|
+
except (subprocess.TimeoutExpired, subprocess.CalledProcessError, FileNotFoundError):
|
|
63
|
+
return False
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
def _setup_pyodide_cache_env() -> None:
    """Setup Pyodide caching environment variables for performance optimization [NO-OP]"""
    # Currently only logs the relevant environment variables; kept as a hook
    # for future cache configuration.
    try:
        for env_name in ("SANDBOX_BASE", "DENO_DIR"):
            logger.info("Sandbox env: %s -> %s", env_name, os.environ.get(env_name, "n/a"))
    except Exception as e:
        logger.warning(f"Could not setup Pyodide cache environment: {e}")
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
# Input schema shared by the sandbox tools: a single required `code` string
# containing the Python source to run inside Pyodide.
sandbox_tool_input = create_model(
    "SandboxToolInput",
    code=(str, FieldInfo(description="Python code to execute in the sandbox environment"))
)
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
class PyodideSandboxTool(BaseTool):
    """
    A tool that provides secure Python code execution using Pyodide (Python compiled to WebAssembly).
    This tool leverages langchain-sandbox to provide a safe environment for running untrusted Python code.
    Optimized for performance with caching and stateless execution by default.
    """

    name: str = "pyodide_sandbox"
    description: str = """Execute Python code in a secure sandbox environment using Pyodide.

This tool allows safe execution of Python code without access to the host system.
Use this tool when you need to:
- Execute Python code snippets
- Perform calculations or data analysis
- Test Python algorithms
- Run code that requires isolation from the host system

The sandbox supports most Python standard library modules and can install additional packages.
Note: File access and some system operations are restricted for security.
Optimized for performance with local caching (stateless by default for faster execution).
"""
    args_schema: Type[BaseModel] = sandbox_tool_input
    # Execution mode: stateless by default for better performance.
    stateful: bool = False
    # Whether sandboxed code may reach the network (needed for pip installs).
    allow_net: bool = True
    # Serialized session state, only populated when stateful=True.
    session_bytes: Optional[bytes] = None
    session_metadata: Optional[Dict] = None
    # Optional Alita client whose credentials are injected into sandbox code.
    alita_client: Optional[Any] = None

    def __init__(self, **kwargs: Any) -> None:
        """Create the tool and eagerly initialize the underlying sandbox.

        Raises RuntimeError when Deno is missing and ImportError when
        langchain-sandbox is not installed (see _initialize_sandbox).
        """
        super().__init__(**kwargs)
        self._sandbox = None
        # Setup caching environment for optimal performance
        _setup_pyodide_cache_env()
        self._initialize_sandbox()

    def _prepare_pyodide_input(self, code: str) -> str:
        """Prepare input for PyodideSandboxTool by injecting state and alita_client into the code block.

        When an alita_client is configured, the source of sandbox_client.py is
        prepended to the user code along with a pre-built `alita_client`
        SandboxClient instance, so sandboxed code can call back into Alita.
        """
        pyodide_predata = ""

        # Add alita_client if available
        if self.alita_client:
            try:
                # Get the directory of the current file and construct the path to sandbox_client.py
                current_dir = Path(__file__).parent
                sandbox_client_path = current_dir.parent / 'clients' / 'sandbox_client.py'

                with open(sandbox_client_path, 'r') as f:
                    sandbox_client_code = f.read()
                pyodide_predata += f"{sandbox_client_code}\n"
                # NOTE(review): this embeds the raw auth token into the code
                # string executed inside the sandbox — the token is visible to
                # any untrusted code run here. Confirm this exposure is intended.
                pyodide_predata += (f"alita_client = SandboxClient(base_url='{self.alita_client.base_url}',"
                                    f"project_id={self.alita_client.project_id},"
                                    f"auth_token='{self.alita_client.auth_token}')\n")
            except FileNotFoundError:
                # Best-effort: run the user code without the client helper.
                logger.error(f"sandbox_client.py not found. Ensure the file exists.")

        return f"#elitea simplified client\n{pyodide_predata}{code}"

    def _initialize_sandbox(self) -> None:
        """Initialize the PyodideSandbox instance with optimized settings.

        Raises:
            RuntimeError: if the Deno runtime is not available in PATH.
            ImportError: if langchain-sandbox is not installed.
        """
        try:
            # Check if Deno is available
            if not _is_deno_available():
                error_msg = (
                    "Deno is required for PyodideSandbox but is not installed. "
                    "Please run the bootstrap.sh script or install Deno manually."
                )
                logger.error(error_msg)
                raise RuntimeError(error_msg)

            # Imported lazily so the module can be loaded without the optional
            # langchain-sandbox dependency installed.
            from langchain_sandbox import PyodideSandbox

            # Air-gapped settings: cache locations are overridable via env vars.
            sandbox_base = os.environ.get("SANDBOX_BASE", os.path.expanduser('~/.cache/pyodide'))
            sandbox_tmp = os.path.join(sandbox_base, "tmp")
            deno_cache = os.environ.get("DENO_DIR", os.path.expanduser('~/.cache/deno'))

            # Configure sandbox with performance optimizations
            self._sandbox = PyodideSandbox(
                stateful=self.stateful,
                # Environment exposure limited to the sandbox base dir variable.
                allow_env=["SANDBOX_BASE"],
                # Filesystem permissions: read the caches, write only tmp/cache.
                allow_read=[sandbox_base, sandbox_tmp, deno_cache],
                allow_write=[sandbox_tmp, deno_cache],
                # Network access controlled by the tool's allow_net flag.
                allow_net=self.allow_net,
                # Use auto node_modules_dir for better caching
                node_modules_dir="auto"
            )
            logger.info(f"PyodideSandbox initialized successfully (stateful={self.stateful})")
        except ImportError as e:
            if "langchain_sandbox" in str(e):
                error_msg = (
                    "langchain-sandbox is required for the PyodideSandboxTool. "
                    "Please install it with: pip install langchain-sandbox"
                )
                logger.error(error_msg)
                raise ImportError(error_msg) from e
            else:
                logger.error(f"Failed to import required module: {e}")
                raise
        except Exception as e:
            logger.error(f"Failed to initialize PyodideSandbox: {e}")
            raise

    def _run(self, code: str) -> Union[str, Dict[str, Any]]:
        """
        Synchronous version - runs the async method in a new event loop.

        Returns the result dict from _arun on success, or a human-readable
        error string when initialization/execution fails.
        """
        try:
            # Check if sandbox is initialized, if not try to initialize
            if self._sandbox is None:
                self._initialize_sandbox()

            # Prepare code with state and client injection
            prepared_code = self._prepare_pyodide_input(code)

            # Check if we're already in an async context
            try:
                loop = asyncio.get_running_loop()
                # We're in an async context, but _run is supposed to be sync.
                # Run the coroutine on a separate thread with its own loop so
                # we don't block or re-enter the caller's loop.
                import concurrent.futures
                with concurrent.futures.ThreadPoolExecutor() as executor:
                    future = executor.submit(asyncio.run, self._arun(prepared_code))
                    return future.result()
            except RuntimeError:
                # No running loop, safe to use asyncio.run
                return asyncio.run(self._arun(prepared_code))
        except (ImportError, RuntimeError) as e:
            # Handle specific dependency errors gracefully
            error_msg = str(e)
            if "langchain-sandbox" in error_msg:
                return "❌ PyodideSandboxTool requires langchain-sandbox. Install with: pip install langchain-sandbox"
            elif "Deno" in error_msg:
                return "❌ PyodideSandboxTool requires Deno. Install from: https://docs.deno.com/runtime/getting_started/installation/"
            else:
                return f"❌ PyodideSandboxTool initialization failed: {error_msg}"
        except Exception as e:
            logger.error(f"Error executing code in sandbox: {e}")
            return f"Error executing code: {str(e)}"

    async def _arun(self, code: str) -> Dict[str, Any]:
        """
        Execute Python code in the Pyodide sandbox.

        Returns a dict that may contain: 'result', 'output' (stdout),
        'error' (stderr or exception text), 'status', and 'execution_info'.
        """
        try:
            if self._sandbox is None:
                self._initialize_sandbox()

            # Execute the code with session state if available
            result = await self._sandbox.execute(
                code,
                session_bytes=self.session_bytes,
                session_metadata=self.session_metadata
            )

            # Update session state for stateful execution
            if self.stateful:
                self.session_bytes = result.session_bytes
                self.session_metadata = result.session_metadata

            result_dict = {}

            if result.result is not None:
                result_dict["result"] = result.result

            if result.stdout:
                result_dict["output"] = result.stdout

            if result.stderr:
                result_dict["error"] = result.stderr

            if result.status == 'error':
                result_dict["status"] = "Execution failed"

            execution_info = f"Execution time: {result.execution_time:.2f}s"
            if result.session_metadata and 'packages' in result.session_metadata:
                packages = result.session_metadata.get('packages', [])
                if packages:
                    execution_info += f", Packages: {', '.join(packages)}"

            result_dict["execution_info"] = execution_info
            return result_dict

        except Exception as e:
            logger.error(f"Error executing code in sandbox: {e}")
            return {"error": f"Error executing code: {str(e)}"}
|
|
268
|
+
|
|
269
|
+
|
|
270
|
+
class StatefulPyodideSandboxTool(PyodideSandboxTool):
    """
    A stateful version of the PyodideSandboxTool that maintains state between executions.
    This version preserves variables, imports, and function definitions across multiple tool calls.
    """

    name: str = "stateful_pyodide_sandbox"
    description: str = """Execute Python code in a stateful sandbox environment using Pyodide.

This tool maintains state between executions, preserving variables, imports, and function definitions.
Use this tool when you need to:
- Build upon previous code executions
- Maintain variables across multiple calls
- Develop complex programs step by step
- Preserve imported libraries and defined functions

The sandbox supports most Python standard library modules and can install additional packages.
Note: File access and some system operations are restricted for security.
"""

    def __init__(self, **kwargs: Any) -> None:
        # Stateful mode is always enabled, overriding any caller-supplied value.
        super().__init__(**{**kwargs, 'stateful': True})
|
|
292
|
+
|
|
293
|
+
|
|
294
|
+
# Factory function for creating sandbox tools
def create_sandbox_tool(stateful: bool = False, allow_net: bool = True, alita_client: Optional[Any] = None) -> BaseTool:
    """
    Factory function to create sandbox tools with specified configuration.

    Note: This tool requires Deno to be installed and available in PATH.
    For installation and optimization, run the bootstrap.sh script.

    Args:
        stateful: Whether to maintain state between executions
            (default: False for better performance)
        allow_net: Whether to allow network access (for package installation)
        alita_client: Optional Alita client injected into sandboxed code

    Returns:
        Configured sandbox tool instance

    Raises:
        ImportError: If langchain-sandbox is not installed
        RuntimeError: If Deno is not found in PATH

    Performance Notes:
        - Stateless mode (default) is faster and avoids session state overhead
        - Run bootstrap.sh script to enable local caching and reduce
          initialization time
        - Cached wheels reduce package download time from ~4.76s to
          near-instant
    """
    # Guard clause: stateful requests get the dedicated subclass.
    if stateful:
        return StatefulPyodideSandboxTool(allow_net=allow_net, alita_client=alita_client)
    return PyodideSandboxTool(stateful=False, allow_net=allow_net, alita_client=alita_client)
|
|
322
|
+
|
|
323
|
+
|
|
324
|
+
class SandboxToolkit(BaseToolkit):
    """Toolkit exposing the Pyodide sandbox tools (stateless and stateful)."""

    tools: List[BaseTool] = []

    @staticmethod
    def toolkit_config_schema() -> Type[BaseModel]:
        """Return the pydantic configuration schema for this toolkit.

        The schema is built from class-level metadata rather than by
        instantiating the tools: tool construction runs _initialize_sandbox,
        which requires Deno and langchain-sandbox, and producing a config
        schema must not depend on those runtime prerequisites.
        """
        # Both tools share the same args schema (sandbox_tool_input); read the
        # tool names from the pydantic field defaults without instantiation.
        selected_tools = {
            tool_cls.model_fields['name'].default: sandbox_tool_input.model_json_schema()
            for tool_cls in (PyodideSandboxTool, StatefulPyodideSandboxTool)
        }

        return create_model(
            'sandbox',
            stateful=(bool, Field(default=False, description="Whether to maintain state between executions")),
            allow_net=(bool, Field(default=True, description="Whether to allow network access for package installation")),
            selected_tools=(List[Literal[tuple(selected_tools)]],
                            Field(default=[], json_schema_extra={'args_schemas': selected_tools})),

            __config__=ConfigDict(json_schema_extra={
                'metadata': {
                    "label": "Python Sandbox",
                    "icon_url": "sandbox.svg",
                    "hidden": False,
                    "categories": ["code", "execution", "internal_tool"],
                    "extra_categories": ["python", "pyodide", "sandbox", "code execution"],
                }
            })
        )

    @classmethod
    def get_toolkit(cls, stateful: bool = False, allow_net: bool = True, alita_client=None, **kwargs):
        """
        Get toolkit with sandbox tools.

        Args:
            stateful: Whether to maintain state between executions
            allow_net: Whether to allow network access
            alita_client: Alita client instance for sandbox tools
            **kwargs: Additional arguments (ignored)

        Returns:
            SandboxToolkit holding a single configured sandbox tool.
        """
        tools = []

        if stateful:
            tools.append(StatefulPyodideSandboxTool(allow_net=allow_net, alita_client=alita_client))
        else:
            tools.append(PyodideSandboxTool(stateful=False, allow_net=allow_net, alita_client=alita_client))

        return cls(tools=tools)

    def get_tools(self):
        """Return the tools held by this toolkit instance."""
        return self.tools
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
import json
|
|
2
2
|
import math
|
|
3
3
|
import types
|
|
4
|
-
from typing import Any, Optional, List, Dict, Callable, Generator
|
|
4
|
+
from typing import Any, Optional, List, Dict, Callable, Generator, OrderedDict
|
|
5
5
|
|
|
6
6
|
from langchain_core.documents import Document
|
|
7
7
|
from pydantic import BaseModel, model_validator, Field
|
|
@@ -12,7 +12,6 @@ from alita_sdk.tools.vector_adapters.VectorStoreAdapter import VectorStoreAdapte
|
|
|
12
12
|
from logging import getLogger
|
|
13
13
|
|
|
14
14
|
from ..utils.logging import dispatch_custom_event
|
|
15
|
-
from ..utils.utils import IndexerKeywords
|
|
16
15
|
|
|
17
16
|
logger = getLogger(__name__)
|
|
18
17
|
|
|
@@ -73,6 +72,10 @@ class StepBackSearchDocumentsModel(BaseModel):
|
|
|
73
72
|
}""",
|
|
74
73
|
default=None
|
|
75
74
|
)
|
|
75
|
+
extended_search: Optional[List[str]] = Field(
|
|
76
|
+
description="List of chunk types to search for (title, summary, propositions, keywords, documents)",
|
|
77
|
+
default=None
|
|
78
|
+
)
|
|
76
79
|
reranking_config: Optional[Dict[str, Dict[str, Any]]] = Field(
|
|
77
80
|
description="""Reranking configuration. Example:
|
|
78
81
|
{
|
|
@@ -87,10 +90,6 @@ class StepBackSearchDocumentsModel(BaseModel):
|
|
|
87
90
|
}""",
|
|
88
91
|
default=None
|
|
89
92
|
)
|
|
90
|
-
extended_search: Optional[List[str]] = Field(
|
|
91
|
-
description="List of chunk types to search for (title, summary, propositions, keywords, documents)",
|
|
92
|
-
default=None
|
|
93
|
-
)
|
|
94
93
|
|
|
95
94
|
STEPBACK_PROMPT = """Your task is to convert provided question into a more generic question that will be used for similarity search.
|
|
96
95
|
Remove all not important words, question words, but save all names, dates and acronym as in original question.
|
|
@@ -138,7 +137,7 @@ class VectorStoreWrapper(BaseToolApiWrapper):
|
|
|
138
137
|
embedding_model_params: dict
|
|
139
138
|
vectorstore_type: str
|
|
140
139
|
vectorstore_params: dict
|
|
141
|
-
max_docs_per_add: int =
|
|
140
|
+
max_docs_per_add: int = 20
|
|
142
141
|
dataset: str = None
|
|
143
142
|
embedding: Any = None
|
|
144
143
|
vectorstore: Any = None
|
|
@@ -208,16 +207,33 @@ class VectorStoreWrapper(BaseToolApiWrapper):
|
|
|
208
207
|
tool_name="_remove_collection"
|
|
209
208
|
)
|
|
210
209
|
|
|
211
|
-
def _get_indexed_ids(self,
|
|
210
|
+
def _get_indexed_ids(self, index_name: Optional[str] = '') -> List[str]:
|
|
212
211
|
"""Get all indexed document IDs from vectorstore"""
|
|
213
|
-
return self.vector_adapter.get_indexed_ids(self,
|
|
214
|
-
|
|
215
|
-
def list_collections(self) ->
|
|
216
|
-
"""List all collections in the vectorstore.
|
|
217
|
-
|
|
218
|
-
|
|
212
|
+
return self.vector_adapter.get_indexed_ids(self, index_name)
|
|
213
|
+
|
|
214
|
+
def list_collections(self) -> Any:
|
|
215
|
+
"""List all collections in the vectorstore.
|
|
216
|
+
Returns a list of collection names, or if no collections exist,
|
|
217
|
+
returns a dict with an empty list and a message."""
|
|
218
|
+
raw = self.vector_adapter.list_collections(self)
|
|
219
|
+
# Normalize raw result to a list of names
|
|
220
|
+
if not raw:
|
|
221
|
+
# No collections found
|
|
222
|
+
return {"collections": [], "message": "No indexed collections"}
|
|
223
|
+
if isinstance(raw, str):
|
|
224
|
+
# e.g., Chroma adapter returns comma-separated string
|
|
225
|
+
cols = [c for c in raw.split(',') if c]
|
|
226
|
+
else:
|
|
227
|
+
try:
|
|
228
|
+
cols = list(raw)
|
|
229
|
+
except Exception:
|
|
230
|
+
# Unexpected type, return raw directly
|
|
231
|
+
return raw
|
|
232
|
+
if not cols:
|
|
233
|
+
return {"collections": [], "message": "No indexed collections"}
|
|
234
|
+
return cols
|
|
219
235
|
|
|
220
|
-
def _clean_collection(self,
|
|
236
|
+
def _clean_collection(self, index_name: str = ''):
|
|
221
237
|
"""
|
|
222
238
|
Clean the vectorstore collection by deleting all indexed data.
|
|
223
239
|
"""
|
|
@@ -225,19 +241,15 @@ class VectorStoreWrapper(BaseToolApiWrapper):
|
|
|
225
241
|
f"Cleaning collection '{self.dataset}'",
|
|
226
242
|
tool_name="_clean_collection"
|
|
227
243
|
)
|
|
228
|
-
self.vector_adapter.clean_collection(self,
|
|
244
|
+
self.vector_adapter.clean_collection(self, index_name)
|
|
229
245
|
self._log_data(
|
|
230
246
|
f"Collection '{self.dataset}' has been cleaned. ",
|
|
231
247
|
tool_name="_clean_collection"
|
|
232
248
|
)
|
|
233
249
|
|
|
234
|
-
def
|
|
235
|
-
""" Get all indexed data from vectorstore for non-code content """
|
|
236
|
-
return self.vector_adapter.get_indexed_data(self, collection_name)
|
|
237
|
-
|
|
238
|
-
def _get_code_indexed_data(self, collection_suffix: str) -> Dict[str, Dict[str, Any]]:
|
|
250
|
+
def _get_code_indexed_data(self, index_name: str) -> Dict[str, Dict[str, Any]]:
|
|
239
251
|
""" Get all indexed data from vectorstore for code content """
|
|
240
|
-
return self.vector_adapter.get_code_indexed_data(self,
|
|
252
|
+
return self.vector_adapter.get_code_indexed_data(self, index_name)
|
|
241
253
|
|
|
242
254
|
def _add_to_collection(self, entry_id, new_collection_value):
|
|
243
255
|
"""Add a new collection name to the `collection` key in the `metadata` column."""
|
|
@@ -246,7 +258,7 @@ class VectorStoreWrapper(BaseToolApiWrapper):
|
|
|
246
258
|
def _reduce_duplicates(
|
|
247
259
|
self,
|
|
248
260
|
documents: Generator[Any, None, None],
|
|
249
|
-
|
|
261
|
+
index_name: str,
|
|
250
262
|
get_indexed_data: Callable,
|
|
251
263
|
key_fn: Callable,
|
|
252
264
|
compare_fn: Callable,
|
|
@@ -255,7 +267,7 @@ class VectorStoreWrapper(BaseToolApiWrapper):
|
|
|
255
267
|
) -> List[Any]:
|
|
256
268
|
"""Generic duplicate reduction logic for documents."""
|
|
257
269
|
self._log_data(log_msg, tool_name="index_documents")
|
|
258
|
-
indexed_data = get_indexed_data(
|
|
270
|
+
indexed_data = get_indexed_data(index_name)
|
|
259
271
|
indexed_keys = set(indexed_data.keys())
|
|
260
272
|
if not indexed_keys:
|
|
261
273
|
self._log_data("Vectorstore is empty, indexing all incoming documents", tool_name="index_documents")
|
|
@@ -266,14 +278,15 @@ class VectorStoreWrapper(BaseToolApiWrapper):
|
|
|
266
278
|
|
|
267
279
|
for document in documents:
|
|
268
280
|
key = key_fn(document)
|
|
269
|
-
|
|
281
|
+
key = key if isinstance(key, str) else str(key)
|
|
282
|
+
if key in indexed_keys and index_name == indexed_data[key]['metadata'].get('collection'):
|
|
270
283
|
if compare_fn(document, indexed_data[key]):
|
|
271
284
|
# Disabled addition of new collection to already indexed documents
|
|
272
285
|
# # check metadata.collection and update if needed
|
|
273
286
|
# for update_collection_id in remove_ids_fn(indexed_data, key):
|
|
274
287
|
# self._add_to_collection(
|
|
275
288
|
# update_collection_id,
|
|
276
|
-
#
|
|
289
|
+
# index_name
|
|
277
290
|
# )
|
|
278
291
|
continue
|
|
279
292
|
final_docs.append(document)
|
|
@@ -290,30 +303,10 @@ class VectorStoreWrapper(BaseToolApiWrapper):
|
|
|
290
303
|
|
|
291
304
|
return final_docs
|
|
292
305
|
|
|
293
|
-
def
|
|
294
|
-
return self._reduce_duplicates(
|
|
295
|
-
documents,
|
|
296
|
-
collection_suffix,
|
|
297
|
-
self._get_indexed_data,
|
|
298
|
-
lambda doc: doc.metadata.get('id'),
|
|
299
|
-
lambda doc, idx: (
|
|
300
|
-
doc.metadata.get('updated_on') and
|
|
301
|
-
idx['metadata'].get('updated_on') and
|
|
302
|
-
doc.metadata.get('updated_on') == idx['metadata'].get('updated_on')
|
|
303
|
-
),
|
|
304
|
-
lambda idx_data, key: (
|
|
305
|
-
idx_data[key]['all_chunks'] +
|
|
306
|
-
[idx_data[dep_id]['id'] for dep_id in idx_data[key][IndexerKeywords.DEPENDENT_DOCS.value]] +
|
|
307
|
-
[chunk_db_id for dep_id in idx_data[key][IndexerKeywords.DEPENDENT_DOCS.value]
|
|
308
|
-
for chunk_db_id in idx_data[dep_id]['all_chunks']]
|
|
309
|
-
),
|
|
310
|
-
log_msg="Verification of documents to index started"
|
|
311
|
-
)
|
|
312
|
-
|
|
313
|
-
def _reduce_code_duplicates(self, documents: Generator[Any, None, None], collection_suffix: str) -> List[Any]:
|
|
306
|
+
def _reduce_code_duplicates(self, documents: Generator[Any, None, None], index_name: str) -> List[Any]:
|
|
314
307
|
return self._reduce_duplicates(
|
|
315
308
|
documents,
|
|
316
|
-
|
|
309
|
+
index_name,
|
|
317
310
|
self._get_code_indexed_data,
|
|
318
311
|
lambda doc: doc.metadata.get('filename'),
|
|
319
312
|
lambda doc, idx: (
|
|
@@ -325,7 +318,7 @@ class VectorStoreWrapper(BaseToolApiWrapper):
|
|
|
325
318
|
log_msg="Verification of code documents to index started"
|
|
326
319
|
)
|
|
327
320
|
|
|
328
|
-
def index_documents(self, documents: Generator[Document, None, None],
|
|
321
|
+
def index_documents(self, documents: Generator[Document, None, None], index_name: str, progress_step: int = 20, clean_index: bool = True, is_code: bool = True):
|
|
329
322
|
""" Index documents in the vectorstore.
|
|
330
323
|
|
|
331
324
|
Args:
|
|
@@ -336,13 +329,13 @@ class VectorStoreWrapper(BaseToolApiWrapper):
|
|
|
336
329
|
|
|
337
330
|
from ..langchain.interfaces.llm_processor import add_documents
|
|
338
331
|
|
|
339
|
-
self._log_tool_event(message=f"Starting the indexing... Parameters: {
|
|
332
|
+
self._log_tool_event(message=f"Starting the indexing... Parameters: {index_name=}, {clean_index=}, {is_code}", tool_name="index_documents")
|
|
340
333
|
# pre-process documents if needed (find duplicates, etc.)
|
|
341
334
|
if clean_index:
|
|
342
335
|
logger.info("Cleaning index before re-indexing all documents.")
|
|
343
336
|
self._log_data("Cleaning index before re-indexing all documents. Previous index will be removed", tool_name="index_documents")
|
|
344
337
|
try:
|
|
345
|
-
self._clean_collection(
|
|
338
|
+
self._clean_collection(index_name)
|
|
346
339
|
self.vectoradapter.persist()
|
|
347
340
|
self.vectoradapter.vacuum()
|
|
348
341
|
self._log_data("Previous index has been removed",
|
|
@@ -356,8 +349,7 @@ class VectorStoreWrapper(BaseToolApiWrapper):
|
|
|
356
349
|
message="Filter for duplicates",
|
|
357
350
|
tool_name="index_documents")
|
|
358
351
|
# remove duplicates based on metadata 'id' and 'updated_on' or 'commit_hash' fields
|
|
359
|
-
documents = self._reduce_code_duplicates(documents,
|
|
360
|
-
else self._reduce_non_code_duplicates(documents, collection_suffix)
|
|
352
|
+
documents = self._reduce_code_duplicates(documents, index_name)
|
|
361
353
|
self._log_tool_event(
|
|
362
354
|
message="All the duplicates were filtered out. Proceeding with indexing.",
|
|
363
355
|
tool_name="index_documents")
|
|
@@ -385,13 +377,13 @@ class VectorStoreWrapper(BaseToolApiWrapper):
|
|
|
385
377
|
self._log_tool_event(message=f"Documents for indexing were processed. Total documents: {len(documents)}",
|
|
386
378
|
tool_name="index_documents")
|
|
387
379
|
|
|
388
|
-
# if
|
|
389
|
-
if
|
|
380
|
+
# if index_name is provided, add it to metadata of each document
|
|
381
|
+
if index_name:
|
|
390
382
|
for doc in documents:
|
|
391
383
|
if not doc.metadata.get('collection'):
|
|
392
|
-
doc.metadata['collection'] =
|
|
384
|
+
doc.metadata['collection'] = index_name
|
|
393
385
|
else:
|
|
394
|
-
doc.metadata['collection'] += f";{
|
|
386
|
+
doc.metadata['collection'] += f";{index_name}"
|
|
395
387
|
|
|
396
388
|
total_docs = len(documents)
|
|
397
389
|
documents_count = 0
|
|
@@ -422,7 +414,8 @@ class VectorStoreWrapper(BaseToolApiWrapper):
|
|
|
422
414
|
return {"status": "error", "message": f"Error: {format_exc()}"}
|
|
423
415
|
if _documents:
|
|
424
416
|
add_documents(vectorstore=self.vectorstore, documents=_documents)
|
|
425
|
-
return {"status": "ok", "message": f"successfully indexed {documents_count} documents"
|
|
417
|
+
return {"status": "ok", "message": f"successfully indexed {documents_count} documents" if documents_count > 0
|
|
418
|
+
else "No new documents to index."}
|
|
426
419
|
|
|
427
420
|
def search_documents(self, query:str, doctype: str = 'code',
|
|
428
421
|
filter:dict|str={}, cut_off: float=0.5,
|
|
@@ -542,11 +535,18 @@ class VectorStoreWrapper(BaseToolApiWrapper):
|
|
|
542
535
|
|
|
543
536
|
# Initialize document map for tracking by ID
|
|
544
537
|
doc_map = {
|
|
545
|
-
|
|
546
|
-
|
|
547
|
-
|
|
538
|
+
(
|
|
539
|
+
f"{doc.metadata.get('id', f'idx_{i}')}_{doc.metadata['chunk_id']}"
|
|
540
|
+
if 'chunk_id' in doc.metadata
|
|
541
|
+
else doc.metadata.get('id', f"idx_{i}")
|
|
542
|
+
): (doc, 1 - score)
|
|
548
543
|
for i, (doc, score) in enumerate(vector_items)
|
|
549
544
|
}
|
|
545
|
+
|
|
546
|
+
# Sort the items by the new score in descending order
|
|
547
|
+
doc_map = OrderedDict(
|
|
548
|
+
sorted(doc_map.items(), key=lambda x: x[1][1], reverse=True)
|
|
549
|
+
)
|
|
550
550
|
|
|
551
551
|
# Process full-text search if configured
|
|
552
552
|
if full_text_search and full_text_search.get('enabled') and full_text_search.get('fields'):
|
|
@@ -597,7 +597,7 @@ class VectorStoreWrapper(BaseToolApiWrapper):
|
|
|
597
597
|
# Apply cutoff filter
|
|
598
598
|
if cut_off:
|
|
599
599
|
# Filter out items above the cutoff score (since the lower the score, the better)
|
|
600
|
-
combined_items = [item for item in combined_items if abs(item[1])
|
|
600
|
+
combined_items = [item for item in combined_items if abs(item[1]) >= cut_off]
|
|
601
601
|
|
|
602
602
|
# Sort by score and limit results
|
|
603
603
|
# DISABLED: for chroma we want ascending order (lower score is better), for others descending
|
|
@@ -758,4 +758,3 @@ class VectorStoreWrapper(BaseToolApiWrapper):
|
|
|
758
758
|
"args_schema": StepBackSearchDocumentsModel
|
|
759
759
|
}
|
|
760
760
|
]
|
|
761
|
-
|