alita-sdk 0.3.465__py3-none-any.whl → 0.3.497__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- alita_sdk/cli/agent/__init__.py +5 -0
- alita_sdk/cli/agent/default.py +83 -1
- alita_sdk/cli/agent_loader.py +22 -4
- alita_sdk/cli/agent_ui.py +13 -3
- alita_sdk/cli/agents.py +1876 -186
- alita_sdk/cli/callbacks.py +96 -25
- alita_sdk/cli/cli.py +10 -1
- alita_sdk/cli/config.py +151 -9
- alita_sdk/cli/context/__init__.py +30 -0
- alita_sdk/cli/context/cleanup.py +198 -0
- alita_sdk/cli/context/manager.py +731 -0
- alita_sdk/cli/context/message.py +285 -0
- alita_sdk/cli/context/strategies.py +289 -0
- alita_sdk/cli/context/token_estimation.py +127 -0
- alita_sdk/cli/input_handler.py +167 -4
- alita_sdk/cli/inventory.py +1256 -0
- alita_sdk/cli/toolkit.py +14 -17
- alita_sdk/cli/toolkit_loader.py +35 -5
- alita_sdk/cli/tools/__init__.py +8 -1
- alita_sdk/cli/tools/filesystem.py +910 -64
- alita_sdk/cli/tools/planning.py +143 -157
- alita_sdk/cli/tools/terminal.py +154 -20
- alita_sdk/community/__init__.py +64 -8
- alita_sdk/community/inventory/__init__.py +224 -0
- alita_sdk/community/inventory/config.py +257 -0
- alita_sdk/community/inventory/enrichment.py +2137 -0
- alita_sdk/community/inventory/extractors.py +1469 -0
- alita_sdk/community/inventory/ingestion.py +3172 -0
- alita_sdk/community/inventory/knowledge_graph.py +1457 -0
- alita_sdk/community/inventory/parsers/__init__.py +218 -0
- alita_sdk/community/inventory/parsers/base.py +295 -0
- alita_sdk/community/inventory/parsers/csharp_parser.py +907 -0
- alita_sdk/community/inventory/parsers/go_parser.py +851 -0
- alita_sdk/community/inventory/parsers/html_parser.py +389 -0
- alita_sdk/community/inventory/parsers/java_parser.py +593 -0
- alita_sdk/community/inventory/parsers/javascript_parser.py +629 -0
- alita_sdk/community/inventory/parsers/kotlin_parser.py +768 -0
- alita_sdk/community/inventory/parsers/markdown_parser.py +362 -0
- alita_sdk/community/inventory/parsers/python_parser.py +604 -0
- alita_sdk/community/inventory/parsers/rust_parser.py +858 -0
- alita_sdk/community/inventory/parsers/swift_parser.py +832 -0
- alita_sdk/community/inventory/parsers/text_parser.py +322 -0
- alita_sdk/community/inventory/parsers/yaml_parser.py +370 -0
- alita_sdk/community/inventory/patterns/__init__.py +61 -0
- alita_sdk/community/inventory/patterns/ast_adapter.py +380 -0
- alita_sdk/community/inventory/patterns/loader.py +348 -0
- alita_sdk/community/inventory/patterns/registry.py +198 -0
- alita_sdk/community/inventory/presets.py +535 -0
- alita_sdk/community/inventory/retrieval.py +1403 -0
- alita_sdk/community/inventory/toolkit.py +169 -0
- alita_sdk/community/inventory/visualize.py +1370 -0
- alita_sdk/configurations/bitbucket.py +0 -3
- alita_sdk/runtime/clients/client.py +108 -31
- alita_sdk/runtime/langchain/assistant.py +4 -2
- alita_sdk/runtime/langchain/constants.py +3 -1
- alita_sdk/runtime/langchain/document_loaders/AlitaExcelLoader.py +103 -60
- alita_sdk/runtime/langchain/document_loaders/constants.py +10 -6
- alita_sdk/runtime/langchain/langraph_agent.py +123 -31
- alita_sdk/runtime/llms/preloaded.py +2 -6
- alita_sdk/runtime/toolkits/__init__.py +2 -0
- alita_sdk/runtime/toolkits/application.py +1 -1
- alita_sdk/runtime/toolkits/mcp.py +107 -91
- alita_sdk/runtime/toolkits/planning.py +173 -0
- alita_sdk/runtime/toolkits/tools.py +59 -7
- alita_sdk/runtime/tools/artifact.py +46 -17
- alita_sdk/runtime/tools/function.py +2 -1
- alita_sdk/runtime/tools/llm.py +320 -32
- alita_sdk/runtime/tools/mcp_remote_tool.py +23 -7
- alita_sdk/runtime/tools/planning/__init__.py +36 -0
- alita_sdk/runtime/tools/planning/models.py +246 -0
- alita_sdk/runtime/tools/planning/wrapper.py +607 -0
- alita_sdk/runtime/tools/vectorstore_base.py +44 -9
- alita_sdk/runtime/utils/AlitaCallback.py +106 -20
- alita_sdk/runtime/utils/mcp_client.py +465 -0
- alita_sdk/runtime/utils/mcp_oauth.py +80 -0
- alita_sdk/runtime/utils/mcp_tools_discovery.py +124 -0
- alita_sdk/runtime/utils/streamlit.py +6 -10
- alita_sdk/runtime/utils/toolkit_utils.py +14 -5
- alita_sdk/tools/__init__.py +54 -27
- alita_sdk/tools/ado/repos/repos_wrapper.py +1 -2
- alita_sdk/tools/base_indexer_toolkit.py +99 -20
- alita_sdk/tools/bitbucket/__init__.py +2 -2
- alita_sdk/tools/chunkers/__init__.py +3 -1
- alita_sdk/tools/chunkers/sematic/json_chunker.py +1 -0
- alita_sdk/tools/chunkers/sematic/markdown_chunker.py +97 -6
- alita_sdk/tools/chunkers/universal_chunker.py +270 -0
- alita_sdk/tools/code/loaders/codesearcher.py +3 -2
- alita_sdk/tools/code_indexer_toolkit.py +55 -22
- alita_sdk/tools/confluence/api_wrapper.py +63 -14
- alita_sdk/tools/elitea_base.py +86 -21
- alita_sdk/tools/jira/__init__.py +1 -1
- alita_sdk/tools/jira/api_wrapper.py +91 -40
- alita_sdk/tools/non_code_indexer_toolkit.py +1 -0
- alita_sdk/tools/qtest/__init__.py +1 -1
- alita_sdk/tools/sharepoint/api_wrapper.py +2 -2
- alita_sdk/tools/vector_adapters/VectorStoreAdapter.py +17 -13
- alita_sdk/tools/zephyr_essential/api_wrapper.py +12 -13
- {alita_sdk-0.3.465.dist-info → alita_sdk-0.3.497.dist-info}/METADATA +2 -1
- {alita_sdk-0.3.465.dist-info → alita_sdk-0.3.497.dist-info}/RECORD +103 -61
- {alita_sdk-0.3.465.dist-info → alita_sdk-0.3.497.dist-info}/WHEEL +0 -0
- {alita_sdk-0.3.465.dist-info → alita_sdk-0.3.497.dist-info}/entry_points.txt +0 -0
- {alita_sdk-0.3.465.dist-info → alita_sdk-0.3.497.dist-info}/licenses/LICENSE +0 -0
- {alita_sdk-0.3.465.dist-info → alita_sdk-0.3.497.dist-info}/top_level.txt +0 -0
@@ -287,7 +287,6 @@ def run_streamlit(st, ai_icon=None, user_icon=None):
         model_config={
             "temperature": 0.1,
             "max_tokens": 1000,
-            "top_p": 1.0
         }
     )
 except Exception as e:
@@ -1256,7 +1255,6 @@ def run_streamlit(st, ai_icon=None, user_icon=None):
         model_config={
             "temperature": 0.1,
             "max_tokens": 1000,
-            "top_p": 1.0
         }
     )
 except Exception as e:
@@ -1387,20 +1385,18 @@ def run_streamlit(st, ai_icon=None, user_icon=None):
             help="Maximum number of tokens in the AI response"
         )
 
-
-            "
-
-
-
-            step=0.1,
-            help="Controls diversity via nucleus sampling"
+        reasoning_effort = st.selectbox(
+            "Reasoning effort:",
+            options=['null', 'low', 'medium', 'high'],
+            index=0,
+            help="Higher effort better reasoning, slower response"
         )
 
         # Create LLM config
         llm_config = {
             'max_tokens': max_tokens,
             'temperature': temperature,
-            '
+            'reasoning_effort': reasoning_effort
         }
 
         col1, col2 = st.columns([3, 1])
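The three hunks above (apparently from `alita_sdk/runtime/utils/streamlit.py`, given the `run_streamlit` context) drop the `top_p` knob in favor of a `reasoning_effort` selectbox. A minimal sketch of the resulting settings block, isolated so it runs on its own; the `max_tokens` and `temperature` widgets are assumed stand-ins, and only the selectbox and the `llm_config` keys mirror the diff:

```python
# Hedged sketch of the new settings panel shape; not the SDK's full UI.
import streamlit as st

max_tokens = st.number_input(
    "Max tokens:", min_value=1, value=1000,
    help="Maximum number of tokens in the AI response",
)
temperature = st.slider("Temperature:", 0.0, 1.0, 0.1, step=0.1)

# New in 0.3.497: reasoning effort replaces the removed top_p slider.
reasoning_effort = st.selectbox(
    "Reasoning effort:",
    options=['null', 'low', 'medium', 'high'],
    index=0,
    help="Higher effort better reasoning, slower response",
)

llm_config = {
    'max_tokens': max_tokens,
    'temperature': temperature,
    'reasoning_effort': reasoning_effort,
}
st.json(llm_config)  # echo the config the agent will receive
```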
@@ -12,7 +12,9 @@ logger = logging.getLogger(__name__)
 
 def instantiate_toolkit_with_client(toolkit_config: Dict[str, Any],
                                     llm_client: Any,
-                                    alita_client: Optional[Any] = None
+                                    alita_client: Optional[Any] = None,
+                                    mcp_tokens: Optional[Dict[str, Any]] = None,
+                                    use_prefix: bool = False) -> List[Any]:
     """
     Instantiate a toolkit with LLM client support.
 
@@ -22,7 +24,11 @@ def instantiate_toolkit_with_client(toolkit_config: Dict[str, Any],
     Args:
         toolkit_config: Configuration dictionary for the toolkit
         llm_client: LLM client instance for tools that need LLM capabilities
-
+        alita_client: Optional additional client instance
+        mcp_tokens: Optional dictionary of MCP OAuth tokens by server URL
+        use_prefix: If True, tools get prefixed with toolkit_name to prevent collisions
+            (for agent use). If False, tools use base names only (for testing interface).
+            Default False for backward compatibility with testing.
 
     Returns:
         List of instantiated tools from the toolkit
@@ -52,16 +58,17 @@ def instantiate_toolkit_with_client(toolkit_config: Dict[str, Any],
     toolkit_type = toolkit_config.get('type', toolkit_name.lower())
 
     # Create a tool configuration dict with required fields
+    # Note: MCP toolkit always requires toolkit_name, other toolkits respect use_prefix flag
     tool_config = {
         'id': toolkit_config.get('id', random.randint(1, 1000000)),
         'type': toolkit_config.get('type', toolkit_type),
         'settings': settings,
-        'toolkit_name': toolkit_name
+        'toolkit_name': toolkit_name if (use_prefix or toolkit_type == 'mcp') else None
     }
 
     # Get tools using the toolkit configuration with clients
-    # Parameter order: get_tools(tools_list, alita_client, llm, memory_store)
-    tools = get_tools([tool_config], alita_client, llm_client)
+    # Parameter order: get_tools(tools_list, alita_client, llm, memory_store, debug_mode, mcp_tokens)
+    tools = get_tools([tool_config], alita_client, llm_client, mcp_tokens=mcp_tokens)
 
     if not tools:
         logger.warning(f"No tools returned for toolkit {toolkit_name}")
@@ -73,9 +80,11 @@ def instantiate_toolkit_with_client(toolkit_config: Dict[str, Any],
     except Exception as e:
         # Re-raise McpAuthorizationRequired without logging as error
         from ..utils.mcp_oauth import McpAuthorizationRequired
+
         if isinstance(e, McpAuthorizationRequired):
             logger.info(f"Toolkit {toolkit_name} requires MCP OAuth authorization")
             raise
+
         # Log and re-raise other errors
         logger.error(f"Error instantiating toolkit {toolkit_name} with client: {str(e)}")
         raise
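Taken together, these hunks (apparently `alita_sdk/runtime/utils/toolkit_utils.py`, judging by the `..utils.mcp_oauth` relative import and the file list) extend `instantiate_toolkit_with_client` with MCP OAuth token pass-through and opt-in name prefixing. A hedged caller sketch under those assumptions; the client objects, settings payload, and token-dict shape are placeholders, not the SDK contract:

```python
# Hedged caller sketch: the import path is inferred, and the client objects,
# settings payload, and token shape below are assumptions for illustration.
from alita_sdk.runtime.utils.toolkit_utils import instantiate_toolkit_with_client  # assumed path

toolkit_config = {
    "id": 42,
    "type": "github",
    "toolkit_name": "my_github",
    "settings": {"configuration": {}},   # placeholder settings payload
}

tools = instantiate_toolkit_with_client(
    toolkit_config,
    llm_client=llm,        # assumed: an already-initialized LLM client
    alita_client=alita,    # assumed: an already-initialized Alita client
    mcp_tokens={"https://mcp.example.com/": {"access_token": "..."}},  # hypothetical shape
    use_prefix=True,       # prefix tool names with toolkit_name to avoid collisions
)
```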
alita_sdk/tools/__init__.py
CHANGED
@@ -13,6 +13,30 @@ AVAILABLE_TOOLS = {}
 AVAILABLE_TOOLKITS = {}
 FAILED_IMPORTS = {}
 
+
+def _inject_toolkit_id(tool_conf: dict, toolkit_tools) -> None:
+    """Inject `toolkit_id` into tools that expose `api_wrapper.toolkit_id`.
+
+    This reads 'id' from the tool configuration and, if it is an integer,
+    assigns it to the 'toolkit_id' attribute of the 'api_wrapper' for each
+    tool in 'toolkit_tools' that supports it.
+
+    Args:
+        tool_conf: Raw tool configuration item from 'tools_list'.
+        toolkit_tools: List of instantiated tools produced by a toolkit.
+    """
+    toolkit_id = tool_conf.get('id')
+    if isinstance(toolkit_id, int):
+        for t in toolkit_tools:
+            if hasattr(t, 'api_wrapper') and hasattr(t.api_wrapper, 'toolkit_id'):
+                t.api_wrapper.toolkit_id = toolkit_id
+    else:
+        logger.error(
+            f"Toolkit ID is missing or not an integer for tool "
+            f"`{tool_conf.get('type', '')}` with name `{tool_conf.get('name', '')}`"
+        )
+
+
 def _safe_import_tool(tool_name, module_path, get_tools_name=None, toolkit_class_name=None):
     """Safely import a tool module and register available functions/classes."""
     try:
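A quick self-contained illustration of the new helper's behavior, using `SimpleNamespace` stand-ins for tools (assumes `_inject_toolkit_id` from the hunk above is in scope):

```python
# Stand-in objects: one tool exposes api_wrapper.toolkit_id, one does not.
from types import SimpleNamespace

wrapper = SimpleNamespace(toolkit_id=None)
tool_with_wrapper = SimpleNamespace(api_wrapper=wrapper)
tool_without_wrapper = SimpleNamespace()        # no api_wrapper: silently skipped

tool_conf = {"id": 42, "type": "github", "name": "my_github"}
_inject_toolkit_id(tool_conf, [tool_with_wrapper, tool_without_wrapper])

assert tool_with_wrapper.api_wrapper.toolkit_id == 42  # integer id propagated
# A non-integer or missing 'id' would instead hit the logger.error branch.
```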
@@ -34,6 +58,7 @@ def _safe_import_tool(tool_name, module_path, get_tools_name=None, toolkit_class
         FAILED_IMPORTS[tool_name] = str(e)
         logger.debug(f"Failed to import {tool_name}: {e}")
 
+
 # Safe imports for all tools
 _safe_import_tool('github', 'github', 'get_tools', 'AlitaGitHubToolkit')
 _safe_import_tool('openapi', 'openapi', 'get_tools')
@@ -90,11 +115,19 @@ available_count = len(AVAILABLE_TOOLS)
 total_attempted = len(AVAILABLE_TOOLS) + len(FAILED_IMPORTS)
 logger.info(f"Tool imports completed: {available_count}/{total_attempted} successful")
 
+# Import community module to trigger community toolkit registration
+try:
+    from alita_sdk import community  # noqa: F401
+    logger.debug("Community toolkits registered successfully")
+except ImportError as e:
+    logger.debug(f"Community module not available: {e}")
+
 
 def get_tools(tools_list, alita, llm, store: Optional[BaseStore] = None, *args, **kwargs):
     tools = []
 
     for tool in tools_list:
+        toolkit_tools = []
         settings = tool.get('settings')
 
         # Skip tools without settings early
@@ -116,53 +149,47 @@ def get_tools(tools_list, alita, llm, store: Optional[BaseStore] = None, *args,
 
         # Set pgvector collection schema if present
         if settings.get('pgvector_configuration'):
-
+            # Use tool id if available, otherwise use toolkit_name or type as fallback
+            collection_id = tool.get('id') or tool.get('toolkit_name') or tool_type
+            settings['pgvector_configuration']['collection_schema'] = str(collection_id)
 
         # Handle ADO special cases
         if tool_type in ['ado_boards', 'ado_wiki', 'ado_plans']:
-
-
-
-        # Handle ADO repos aliases
-        if tool_type in ['ado_repos', 'azure_devops_repos'] and 'ado_repos' in AVAILABLE_TOOLS:
+            toolkit_tools.extend(AVAILABLE_TOOLS['ado']['get_tools'](tool_type, tool))
+        elif tool_type in ['ado_repos', 'azure_devops_repos'] and 'ado_repos' in AVAILABLE_TOOLS:
             try:
-
+                toolkit_tools.extend(AVAILABLE_TOOLS['ado_repos']['get_tools'](tool))
             except Exception as e:
                 logger.error(f"Error getting ADO repos tools: {e}")
-
-
-        # Skip MCP toolkit - it's handled by runtime/toolkits/tools.py to avoid duplicate loading
-        if tool_type == 'mcp':
+        elif tool_type == 'mcp':
             logger.debug(f"Skipping MCP toolkit '{tool.get('toolkit_name')}' - handled by runtime toolkit system")
-
-
-
-        if tool_type in AVAILABLE_TOOLS and 'get_tools' in AVAILABLE_TOOLS[tool_type]:
+        elif tool_type == 'planning':
+            logger.debug(f"Skipping planning toolkit '{tool.get('toolkit_name')}' - handled by runtime toolkit system")
+        elif tool_type in AVAILABLE_TOOLS and 'get_tools' in AVAILABLE_TOOLS[tool_type]:
             try:
-
+                toolkit_tools.extend(AVAILABLE_TOOLS[tool_type]['get_tools'](tool))
             except Exception as e:
                 logger.error(f"Error getting tools for {tool_type}: {e}")
                 raise ToolException(f"Error getting tools for {tool_type}: {e}")
-
-
-        # Handle custom modules
-        if settings.get("module"):
+        elif settings.get("module"):
             try:
                 mod = import_module(settings.pop("module"))
                 tkitclass = getattr(mod, settings.pop("class"))
                 get_toolkit_params = settings.copy()
                 get_toolkit_params["name"] = tool.get("name")
                 toolkit = tkitclass.get_toolkit(**get_toolkit_params)
-
+                toolkit_tools.extend(toolkit.get_tools())
             except Exception as e:
                 logger.error(f"Error in getting custom toolkit: {e}")
-                continue
-
-        # Tool not available
-        if tool_type in FAILED_IMPORTS:
-            logger.warning(f"Tool '{tool_type}' is not available: {FAILED_IMPORTS[tool_type]}")
         else:
-
+            if tool_type in FAILED_IMPORTS:
+                logger.warning(f"Tool '{tool_type}' is not available: {FAILED_IMPORTS[tool_type]}")
+            else:
+                logger.warning(f"Unknown tool type: {tool_type}")
+        #
+        # Always inject toolkit_id to each tool
+        _inject_toolkit_id(tool, toolkit_tools)
+        tools.extend(toolkit_tools)
 
     return tools
 
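The rewrite above converts a series of independent `if` blocks into a single `elif` chain that accumulates into a per-toolkit `toolkit_tools` list, so every branch funnels through the shared `_inject_toolkit_id`/`tools.extend` tail. A toy reduction of the dispatch shape; the registry contents are invented so it runs standalone:

```python
# Toy registries standing in for the module-level AVAILABLE_TOOLS / FAILED_IMPORTS.
AVAILABLE_TOOLS = {"github": {"get_tools": lambda tool: [f"github_tool_for_{tool['name']}"]}}
FAILED_IMPORTS = {"jira": "missing dependency"}

def dispatch(tool):
    toolkit_tools = []
    tool_type = tool["type"]
    if tool_type in ("mcp", "planning"):
        pass                                   # handled by the runtime toolkit system
    elif tool_type in AVAILABLE_TOOLS:
        toolkit_tools.extend(AVAILABLE_TOOLS[tool_type]["get_tools"](tool))
    elif tool_type in FAILED_IMPORTS:
        print(f"Tool '{tool_type}' is not available: {FAILED_IMPORTS[tool_type]}")
    else:
        print(f"Unknown tool type: {tool_type}")
    return toolkit_tools                       # caller injects toolkit_id, then extends tools

print(dispatch({"type": "github", "name": "gh"}))   # ['github_tool_for_gh']
print(dispatch({"type": "jira", "name": "j"}))      # warning, []
```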
@@ -2,6 +2,7 @@ import copy
 import json
 import logging
 import time
+from enum import Enum
 from typing import Any, Optional, List, Dict, Generator
 
 from langchain_core.callbacks import dispatch_custom_event
@@ -16,7 +17,17 @@ from ..runtime.utils.utils import IndexerKeywords
 
 logger = logging.getLogger(__name__)
 
-DEFAULT_CUT_OFF = 0.
+DEFAULT_CUT_OFF = 0.1
+INDEX_META_UPDATE_INTERVAL = 600.0
+
+class IndexTools(str, Enum):
+    """Enum for index-related tool names."""
+    INDEX_DATA = "index_data"
+    SEARCH_INDEX = "search_index"
+    STEPBACK_SEARCH_INDEX = "stepback_search_index"
+    STEPBACK_SUMMARY_INDEX = "stepback_summary_index"
+    REMOVE_INDEX = "remove_index"
+    LIST_COLLECTIONS = "list_collections"
 
 # Base Vector Store Schema Models
 BaseIndexParams = create_model(
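Because `IndexTools` mixes in `str`, members behave as the raw strings they replace, which keeps existing string comparisons and serialization working. A runnable check:

```python
# Why `class IndexTools(str, Enum)`: members are also plain strings.
from enum import Enum

class IndexTools(str, Enum):
    INDEX_DATA = "index_data"
    SEARCH_INDEX = "search_index"

assert IndexTools.SEARCH_INDEX == "search_index"       # str mixin: equal to raw value
assert IndexTools.SEARCH_INDEX.value == "search_index" # explicit .value also works
assert "index" in IndexTools.INDEX_DATA                # substring checks still work
```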
@@ -157,6 +168,16 @@ class BaseIndexerToolkit(VectorStoreWrapperBase):
         clean_index = kwargs.get("clean_index")
         chunking_tool = kwargs.get("chunking_tool")
         chunking_config = kwargs.get("chunking_config")
+
+        # Store the interval in a private dict to avoid Pydantic field errors
+        if not hasattr(self, "_index_meta_config"):
+            self._index_meta_config: Dict[str, Any] = {}
+
+        self._index_meta_config["update_interval"] = kwargs.get(
+            "meta_update_interval",
+            INDEX_META_UPDATE_INTERVAL,
+        )
+
         result = {"count": 0}
         #
         try:
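The `hasattr` guard plus private dict is the workaround the diff's own comment names: runtime state is created lazily on first use rather than declared as a Pydantic model field. A plain-Python sketch of the same pattern:

```python
# Lazy-init pattern: create the private config dict on first use.
from typing import Any, Dict

INDEX_META_UPDATE_INTERVAL = 600.0

class Indexer:
    def configure(self, **kwargs):
        if not hasattr(self, "_index_meta_config"):      # first call: create the dict
            self._index_meta_config: Dict[str, Any] = {}
        self._index_meta_config["update_interval"] = kwargs.get(
            "meta_update_interval", INDEX_META_UPDATE_INTERVAL
        )

idx = Indexer()
idx.configure()                                # falls back to the 600 s default
assert idx._index_meta_config["update_interval"] == 600.0
idx.configure(meta_update_interval=30.0)       # caller override
assert idx._index_meta_config["update_interval"] == 30.0
```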
@@ -164,6 +185,7 @@ class BaseIndexerToolkit(VectorStoreWrapperBase):
             self._clean_index(index_name)
         #
         self.index_meta_init(index_name, kwargs)
+        self._emit_index_event(index_name)
         #
         self._log_tool_event(f"Indexing data into collection with suffix '{index_name}'. It can take some time...")
         self._log_tool_event(f"Loading the documents to index...{kwargs}")
@@ -179,18 +201,26 @@ class BaseIndexerToolkit(VectorStoreWrapperBase):
             self._save_index_generator(documents, documents_count, chunking_tool, chunking_config, index_name=index_name, result=result)
             #
             results_count = result["count"]
-
+            # Final update should always be forced
+            self.index_meta_update(index_name, IndexerKeywords.INDEX_META_COMPLETED.value, results_count, update_force=True)
             self._emit_index_event(index_name)
             #
             return {"status": "ok", "message": f"successfully indexed {results_count} documents" if results_count > 0
                     else "no new documents to index"}
         except Exception as e:
-
-
+            # Do maximum effort at least send custom event for supposed changed status
+            msg = str(e)
+            try:
+                # Error update should also be forced
+                self.index_meta_update(index_name, IndexerKeywords.INDEX_META_FAILED.value, result["count"], update_force=True)
+            except Exception as ie:
+                logger.error(f"Failed to update index meta status to FAILED for index '{index_name}': {ie}")
+                msg = f"{msg}; additionally failed to update index meta status to FAILED: {ie}"
+            self._emit_index_event(index_name, error=msg)
             raise e
-
 
     def _save_index_generator(self, base_documents: Generator[Document, None, None], base_total: int, chunking_tool, chunking_config, result, index_name: Optional[str] = None):
+        self._ensure_vectorstore_initialized()
         self._log_tool_event(f"Base documents are ready for indexing. {base_total} base documents in total to index.")
         from ..runtime.langchain.interfaces.llm_processor import add_documents
         #
@@ -243,6 +273,11 @@ class BaseIndexerToolkit(VectorStoreWrapperBase):
                 logger.debug(msg)
                 self._log_tool_event(msg)
                 result["count"] += dependent_docs_counter
+                # After each base document, try a non-forced meta update; throttling handled inside index_meta_update
+                try:
+                    self.index_meta_update(index_name, IndexerKeywords.INDEX_META_IN_PROGRESS.value, result["count"], update_force=False)
+                except Exception as exc:  # best-effort, do not break indexing
+                    logger.warning(f"Failed to update index meta during indexing process for index '{index_name}': {exc}")
             if pg_vector_add_docs_chunk:
                 add_documents(vectorstore=self.vectorstore, documents=pg_vector_add_docs_chunk)
 
@@ -308,6 +343,7 @@ class BaseIndexerToolkit(VectorStoreWrapperBase):
         log_msg: str = "Verification of documents to index started"
     ) -> Generator[Document, None, None]:
         """Generic duplicate reduction logic for documents."""
+        self._ensure_vectorstore_initialized()
         self._log_tool_event(log_msg, tool_name="index_documents")
         indexed_data = self._get_indexed_data(index_name)
         indexed_keys = set(indexed_data.keys())
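`_ensure_vectorstore_initialized` is called from several methods in this diff, but its body never appears here; presumably it is a lazy-connection guard. A generic sketch under that assumption, with hypothetical names:

```python
# Hypothetical lazy-connection guard; the real helper's body is not in this diff.
class VectorStoreWrapper:
    def __init__(self, connection_string: str):
        self._connection_string = connection_string
        self.vectorstore = None                      # connection deferred until needed

    def _connect(self):
        print(f"connecting to {self._connection_string}")
        return object()                              # stand-in for a real vectorstore

    def _ensure_vectorstore_initialized(self):
        if self.vectorstore is None:                 # only the first caller pays the cost
            self.vectorstore = self._connect()

    def search(self, query: str):
        self._ensure_vectorstore_initialized()       # every entry point guards itself
        return f"results for {query!r}"

w = VectorStoreWrapper("postgresql://example")
w.search("a"); w.search("b")                         # connects once, reuses afterwards
```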
@@ -350,7 +386,7 @@ class BaseIndexerToolkit(VectorStoreWrapperBase):
 
     def remove_index(self, index_name: str = ""):
         """Cleans the indexed data in the collection."""
-        super()._clean_collection(index_name=index_name)
+        super()._clean_collection(index_name=index_name, including_index_meta=True)
         return (f"Collection '{index_name}' has been removed from the vector store.\n"
                 f"Available collections: {self.list_collections()}") if index_name \
             else "All collections have been removed from the vector store."
@@ -463,6 +499,7 @@ class BaseIndexerToolkit(VectorStoreWrapperBase):
     )
 
     def index_meta_init(self, index_name: str, index_configuration: dict[str, Any]):
+        self._ensure_vectorstore_initialized()
         index_meta = super().get_index_meta(index_name)
         if not index_meta:
             self._log_tool_event(
@@ -482,12 +519,53 @@ class BaseIndexerToolkit(VectorStoreWrapperBase):
             "updated_on": created_on,
             "task_id": None,
             "conversation_id": None,
+            "toolkit_id": self.toolkit_id,
         }
         metadata["history"] = json.dumps([metadata])
         index_meta_doc = Document(page_content=f"{IndexerKeywords.INDEX_META_TYPE.value}_{index_name}", metadata=metadata)
         add_documents(vectorstore=self.vectorstore, documents=[index_meta_doc])
 
-    def index_meta_update(self, index_name: str, state: str, result: int):
+    def index_meta_update(self, index_name: str, state: str, result: int, update_force: bool = True, interval: Optional[float] = None):
+        """Update `index_meta` document with optional time-based throttling.
+
+        Args:
+            index_name: Index name to update meta for.
+            state: New state value for the `index_meta` record.
+            result: Number of processed documents to store in the `updated` field.
+            update_force: If `True`, perform the update unconditionally, ignoring throttling.
+                If `False`, perform the update only when the effective time interval has passed.
+            interval: Optional custom interval (in seconds) for this call when `update_force` is `False`.
+                If `None`, falls back to the value stored in `self._index_meta_config["update_interval"]`
+                if present, otherwise uses `INDEX_META_UPDATE_INTERVAL`.
+        """
+        self._ensure_vectorstore_initialized()
+        if not hasattr(self, "_index_meta_last_update_time"):
+            self._index_meta_last_update_time: Dict[str, float] = {}
+
+        if not update_force:
+            # Resolve effective interval:
+            # 1) explicit arg
+            # 2) value from `_index_meta_config`
+            # 3) default constant
+            cfg_interval = None
+            if hasattr(self, "_index_meta_config"):
+                cfg_interval = self._index_meta_config.get("update_interval")
+
+            eff_interval = (
+                interval
+                if interval is not None
+                else (cfg_interval if cfg_interval is not None else INDEX_META_UPDATE_INTERVAL)
+            )
+
+            last_time = self._index_meta_last_update_time.get(index_name)
+            now = time.time()
+            if last_time is not None and (now - last_time) < eff_interval:
+                return
+            self._index_meta_last_update_time[index_name] = now
+        else:
+            # For forced updates, always refresh last update time
+            self._index_meta_last_update_time[index_name] = time.time()
+
         index_meta_raw = super().get_index_meta(index_name)
         from ..runtime.langchain.interfaces.llm_processor import add_documents
         #
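The throttling rule above, reduced to a standalone function so the force/interval interaction can be exercised directly (intervals shortened for the demo):

```python
# Standalone demonstration: non-forced updates are dropped until the interval
# has elapsed for that index; forced updates always go through and reset the clock.
import time
from typing import Dict, Optional

INDEX_META_UPDATE_INTERVAL = 600.0
_last: Dict[str, float] = {}

def should_update(index_name: str, update_force: bool = True,
                  interval: Optional[float] = None) -> bool:
    eff = interval if interval is not None else INDEX_META_UPDATE_INTERVAL
    now = time.time()
    if not update_force:
        last = _last.get(index_name)
        if last is not None and (now - last) < eff:
            return False                      # too soon: skip this update
    _last[index_name] = now                   # forced or due: record and proceed
    return True

assert should_update("docs", update_force=False, interval=0.2) is True   # first call
assert should_update("docs", update_force=False, interval=0.2) is False  # throttled
assert should_update("docs", update_force=True) is True                  # force bypasses
time.sleep(0.25)
assert should_update("docs", update_force=False, interval=0.2) is True   # interval passed
```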
@@ -545,11 +623,12 @@ class BaseIndexerToolkit(VectorStoreWrapperBase):
         event_data = {
             "id": index_meta.get("id"),
             "index_name": index_name,
-            "state": metadata.get("state"),
+            "state": "failed" if error is not None else metadata.get("state"),
             "error": error,
             "reindex": is_reindex,
             "indexed": metadata.get("indexed", 0),
             "updated": metadata.get("updated", 0),
+            "toolkit_id": metadata.get("toolkit_id"),
         }
 
         # Emit the event
@@ -572,8 +651,8 @@ class BaseIndexerToolkit(VectorStoreWrapperBase):
         """
         return [
             {
-                "name":
-                "mode":
+                "name": IndexTools.INDEX_DATA.value,
+                "mode": IndexTools.INDEX_DATA.value,
                 "ref": self.index_data,
                 "description": "Loads data to index.",
                 "args_schema": create_model(
@@ -583,36 +662,36 @@ class BaseIndexerToolkit(VectorStoreWrapperBase):
                 )
             },
             {
-                "name":
-                "mode":
+                "name": IndexTools.SEARCH_INDEX.value,
+                "mode": IndexTools.SEARCH_INDEX.value,
                 "ref": self.search_index,
                 "description": self.search_index.__doc__,
                 "args_schema": BaseSearchParams
             },
             {
-                "name":
-                "mode":
+                "name": IndexTools.STEPBACK_SEARCH_INDEX.value,
+                "mode": IndexTools.STEPBACK_SEARCH_INDEX.value,
                 "ref": self.stepback_search_index,
                 "description": self.stepback_search_index.__doc__,
                 "args_schema": BaseStepbackSearchParams
             },
             {
-                "name":
-                "mode":
+                "name": IndexTools.STEPBACK_SUMMARY_INDEX.value,
+                "mode": IndexTools.STEPBACK_SUMMARY_INDEX.value,
                 "ref": self.stepback_summary_index,
                 "description": self.stepback_summary_index.__doc__,
                 "args_schema": BaseStepbackSearchParams
             },
             {
-                "name":
-                "mode":
+                "name": IndexTools.REMOVE_INDEX.value,
+                "mode": IndexTools.REMOVE_INDEX.value,
                 "ref": self.remove_index,
                 "description": self.remove_index.__doc__,
                 "args_schema": RemoveIndexParams
             },
             {
-                "name":
-                "mode":
+                "name": IndexTools.LIST_COLLECTIONS.value,
+                "mode": IndexTools.LIST_COLLECTIONS.value,
                 "ref": self.list_collections,
                 "description": self.list_collections.__doc__,
                 # No parameters
@@ -47,8 +47,8 @@ class AlitaBitbucketToolkit(BaseToolkit):
         AlitaBitbucketToolkit.toolkit_max_length = get_max_toolkit_length(selected_tools)
         m = create_model(
             name,
-            project=(str, Field(description="Project/Workspace"
-            repository=(str, Field(description="Repository"
+            project=(str, Field(description="Project/Workspace")),
+            repository=(str, Field(description="Repository")),
             branch=(str, Field(description="Main branch", default="main")),
             cloud=(Optional[bool], Field(description="Hosting Option", default=None)),
             bitbucket_configuration=(BitbucketConfiguration, Field(description="Bitbucket Configuration", json_schema_extra={'configuration_types': ['bitbucket']})),
@@ -3,6 +3,7 @@ from .sematic.statistical_chunker import statistical_chunker
 from .sematic.markdown_chunker import markdown_chunker
 from .sematic.proposal_chunker import proposal_chunker
 from .sematic.json_chunker import json_chunker
+from .universal_chunker import universal_chunker, chunk_single_document, get_file_type
 from .models import StatisticalChunkerConfig, MarkdownChunkerConfig, ProposalChunkerConfig
 
 __all__ = {
@@ -10,7 +11,8 @@ __all__ = {
     'statistical': statistical_chunker,
     'markdown': markdown_chunker,
     'proposal': proposal_chunker,
-    'json': json_chunker
+    'json': json_chunker,
+    'universal': universal_chunker,
 }
 
 __confluence_chunkers__ = {
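With the new entry registered, a chunker can be selected by name from the package's `__all__` dict; `'universal'` now joins the semantic chunkers. Illustrative usage only, since the config shape is not specified in this diff:

```python
# The chunkers package maps names to chunker callables via its __all__ dict.
from alita_sdk.tools.chunkers import __all__ as chunkers

chunker = chunkers['universal']   # new in 0.3.497

# Each chunker is a generator transformer: Documents in, chunk Documents out.
# The config keys are not shown in this diff, so the call is left sketched:
# for chunk in chunker(document_generator, config={...}):
#     process(chunk)
```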
@@ -17,6 +17,7 @@ def json_chunker(file_content_generator: Generator[Document, None, None], config
         for chunk in chunks:
             metadata = doc.metadata.copy()
             metadata['chunk_id'] = chunk_id
+            metadata['method_name'] = 'json'
             chunk_id += 1
             yield Document(page_content=json.dumps(chunk), metadata=metadata)
     except Exception as e:
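The one-line change stamps every chunk with the method that produced it, alongside the `chunk_id` bookkeeping already present, so downstream consumers can tell chunking strategies apart. A self-contained rendering of the pattern with a stand-in `Document` class (the real one comes from langchain):

```python
# Stand-in Document; the SDK uses langchain's Document with the same two fields.
import json
from dataclasses import dataclass, field

@dataclass
class Document:
    page_content: str
    metadata: dict = field(default_factory=dict)

def tag_chunks(doc: Document, chunks, method_name: str = 'json'):
    for chunk_id, chunk in enumerate(chunks):
        metadata = doc.metadata.copy()          # never mutate the source metadata
        metadata['chunk_id'] = chunk_id
        metadata['method_name'] = method_name   # new in 0.3.497 for the json chunker
        yield Document(page_content=json.dumps(chunk), metadata=metadata)

doc = Document('{"a": 1, "b": 2}', {"source": "example.json"})
for c in tag_chunks(doc, [{"a": 1}, {"b": 2}]):
    print(c.metadata)   # {'source': ..., 'chunk_id': ..., 'method_name': 'json'}
```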