alita-sdk 0.3.365__py3-none-any.whl → 0.3.462__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of alita-sdk might be problematic.
- alita_sdk/cli/__init__.py +10 -0
- alita_sdk/cli/__main__.py +17 -0
- alita_sdk/cli/agent_executor.py +144 -0
- alita_sdk/cli/agent_loader.py +197 -0
- alita_sdk/cli/agent_ui.py +166 -0
- alita_sdk/cli/agents.py +1069 -0
- alita_sdk/cli/callbacks.py +576 -0
- alita_sdk/cli/cli.py +159 -0
- alita_sdk/cli/config.py +153 -0
- alita_sdk/cli/formatting.py +182 -0
- alita_sdk/cli/mcp_loader.py +315 -0
- alita_sdk/cli/toolkit.py +330 -0
- alita_sdk/cli/toolkit_loader.py +55 -0
- alita_sdk/cli/tools/__init__.py +9 -0
- alita_sdk/cli/tools/filesystem.py +905 -0
- alita_sdk/configurations/bitbucket.py +95 -0
- alita_sdk/configurations/confluence.py +96 -1
- alita_sdk/configurations/gitlab.py +79 -0
- alita_sdk/configurations/jira.py +103 -0
- alita_sdk/configurations/testrail.py +88 -0
- alita_sdk/configurations/xray.py +93 -0
- alita_sdk/configurations/zephyr_enterprise.py +93 -0
- alita_sdk/configurations/zephyr_essential.py +75 -0
- alita_sdk/runtime/clients/artifact.py +1 -1
- alita_sdk/runtime/clients/client.py +47 -10
- alita_sdk/runtime/clients/mcp_discovery.py +342 -0
- alita_sdk/runtime/clients/mcp_manager.py +262 -0
- alita_sdk/runtime/clients/sandbox_client.py +373 -0
- alita_sdk/runtime/langchain/assistant.py +70 -41
- alita_sdk/runtime/langchain/constants.py +6 -1
- alita_sdk/runtime/langchain/document_loaders/AlitaDocxMammothLoader.py +315 -3
- alita_sdk/runtime/langchain/document_loaders/AlitaJSONLoader.py +4 -1
- alita_sdk/runtime/langchain/document_loaders/constants.py +73 -100
- alita_sdk/runtime/langchain/langraph_agent.py +164 -38
- alita_sdk/runtime/langchain/utils.py +43 -7
- alita_sdk/runtime/models/mcp_models.py +61 -0
- alita_sdk/runtime/toolkits/__init__.py +24 -0
- alita_sdk/runtime/toolkits/application.py +8 -1
- alita_sdk/runtime/toolkits/artifact.py +5 -6
- alita_sdk/runtime/toolkits/mcp.py +895 -0
- alita_sdk/runtime/toolkits/tools.py +140 -50
- alita_sdk/runtime/tools/__init__.py +7 -2
- alita_sdk/runtime/tools/application.py +7 -0
- alita_sdk/runtime/tools/function.py +94 -5
- alita_sdk/runtime/tools/graph.py +10 -4
- alita_sdk/runtime/tools/image_generation.py +104 -8
- alita_sdk/runtime/tools/llm.py +204 -114
- alita_sdk/runtime/tools/mcp_inspect_tool.py +284 -0
- alita_sdk/runtime/tools/mcp_remote_tool.py +166 -0
- alita_sdk/runtime/tools/mcp_server_tool.py +3 -1
- alita_sdk/runtime/tools/sandbox.py +180 -79
- alita_sdk/runtime/tools/vectorstore.py +22 -21
- alita_sdk/runtime/tools/vectorstore_base.py +79 -26
- alita_sdk/runtime/utils/mcp_oauth.py +164 -0
- alita_sdk/runtime/utils/mcp_sse_client.py +405 -0
- alita_sdk/runtime/utils/streamlit.py +34 -3
- alita_sdk/runtime/utils/toolkit_utils.py +14 -4
- alita_sdk/runtime/utils/utils.py +1 -0
- alita_sdk/tools/__init__.py +48 -31
- alita_sdk/tools/ado/repos/__init__.py +1 -0
- alita_sdk/tools/ado/test_plan/__init__.py +1 -1
- alita_sdk/tools/ado/wiki/__init__.py +1 -5
- alita_sdk/tools/ado/work_item/__init__.py +1 -5
- alita_sdk/tools/ado/work_item/ado_wrapper.py +17 -8
- alita_sdk/tools/base_indexer_toolkit.py +194 -112
- alita_sdk/tools/bitbucket/__init__.py +1 -0
- alita_sdk/tools/chunkers/sematic/proposal_chunker.py +1 -1
- alita_sdk/tools/code/sonar/__init__.py +1 -1
- alita_sdk/tools/code_indexer_toolkit.py +15 -5
- alita_sdk/tools/confluence/__init__.py +2 -2
- alita_sdk/tools/confluence/api_wrapper.py +110 -63
- alita_sdk/tools/confluence/loader.py +10 -0
- alita_sdk/tools/elitea_base.py +22 -22
- alita_sdk/tools/github/__init__.py +2 -2
- alita_sdk/tools/gitlab/__init__.py +2 -1
- alita_sdk/tools/gitlab/api_wrapper.py +11 -7
- alita_sdk/tools/gitlab_org/__init__.py +1 -2
- alita_sdk/tools/google_places/__init__.py +2 -1
- alita_sdk/tools/jira/__init__.py +1 -0
- alita_sdk/tools/jira/api_wrapper.py +1 -1
- alita_sdk/tools/memory/__init__.py +1 -1
- alita_sdk/tools/non_code_indexer_toolkit.py +2 -2
- alita_sdk/tools/openapi/__init__.py +10 -1
- alita_sdk/tools/pandas/__init__.py +1 -1
- alita_sdk/tools/postman/__init__.py +2 -1
- alita_sdk/tools/postman/api_wrapper.py +18 -8
- alita_sdk/tools/postman/postman_analysis.py +8 -1
- alita_sdk/tools/pptx/__init__.py +2 -2
- alita_sdk/tools/qtest/__init__.py +3 -3
- alita_sdk/tools/qtest/api_wrapper.py +1708 -76
- alita_sdk/tools/rally/__init__.py +1 -2
- alita_sdk/tools/report_portal/__init__.py +1 -0
- alita_sdk/tools/salesforce/__init__.py +1 -0
- alita_sdk/tools/servicenow/__init__.py +2 -3
- alita_sdk/tools/sharepoint/__init__.py +1 -0
- alita_sdk/tools/sharepoint/api_wrapper.py +125 -34
- alita_sdk/tools/sharepoint/authorization_helper.py +191 -1
- alita_sdk/tools/sharepoint/utils.py +8 -2
- alita_sdk/tools/slack/__init__.py +1 -0
- alita_sdk/tools/sql/__init__.py +2 -1
- alita_sdk/tools/sql/api_wrapper.py +71 -23
- alita_sdk/tools/testio/__init__.py +1 -0
- alita_sdk/tools/testrail/__init__.py +1 -3
- alita_sdk/tools/utils/__init__.py +17 -0
- alita_sdk/tools/utils/content_parser.py +35 -24
- alita_sdk/tools/vector_adapters/VectorStoreAdapter.py +67 -21
- alita_sdk/tools/xray/__init__.py +2 -1
- alita_sdk/tools/zephyr/__init__.py +2 -1
- alita_sdk/tools/zephyr_enterprise/__init__.py +1 -0
- alita_sdk/tools/zephyr_essential/__init__.py +1 -0
- alita_sdk/tools/zephyr_scale/__init__.py +1 -0
- alita_sdk/tools/zephyr_squad/__init__.py +1 -0
- {alita_sdk-0.3.365.dist-info → alita_sdk-0.3.462.dist-info}/METADATA +8 -2
- {alita_sdk-0.3.365.dist-info → alita_sdk-0.3.462.dist-info}/RECORD +118 -93
- alita_sdk-0.3.462.dist-info/entry_points.txt +2 -0
- {alita_sdk-0.3.365.dist-info → alita_sdk-0.3.462.dist-info}/WHEEL +0 -0
- {alita_sdk-0.3.365.dist-info → alita_sdk-0.3.462.dist-info}/licenses/LICENSE +0 -0
- {alita_sdk-0.3.365.dist-info → alita_sdk-0.3.462.dist-info}/top_level.txt +0 -0
```diff
@@ -24,11 +24,6 @@ class AzureDevOpsWikiToolkit(BaseToolkit):
         AzureDevOpsWikiToolkit.toolkit_max_length = get_max_toolkit_length(selected_tools)
         m = create_model(
             name_alias,
-            name=(str, Field(description="Toolkit name",
-                             json_schema_extra={
-                                 'toolkit_name': True,
-                                 'max_toolkit_length': AzureDevOpsWikiToolkit.toolkit_max_length})
-                 ),
             ado_configuration=(AdoConfiguration, Field(description="Ado configuration", json_schema_extra={'configuration_types': ['ado']})),
             # indexer settings
             pgvector_configuration=(Optional[PgVectorConfiguration], Field(default=None,
@@ -42,6 +37,7 @@ class AzureDevOpsWikiToolkit(BaseToolkit):
                 'metadata': {
                     "label": "ADO wiki",
                     "icon_url": "ado-wiki-icon.svg",
+                    "max_length": AzureDevOpsWikiToolkit.toolkit_max_length,
                     "categories": ["documentation"],
                     "extra_categories": ["knowledge base", "documentation management", "wiki"],
                     "sections": {
@@ -23,11 +23,6 @@ class AzureDevOpsWorkItemsToolkit(BaseToolkit):
         AzureDevOpsWorkItemsToolkit.toolkit_max_length = get_max_toolkit_length(selected_tools)
         m = create_model(
             name,
-            name=(str, Field(description="Toolkit name",
-                             json_schema_extra={
-                                 'toolkit_name': True,
-                                 'max_toolkit_length': AzureDevOpsWorkItemsToolkit.toolkit_max_length})
-                 ),
             ado_configuration=(AdoConfiguration, Field(description="Ado Work Item configuration", json_schema_extra={'configuration_types': ['ado']})),
             limit=(Optional[int], Field(description="ADO plans limit used for limitation of the list with results", default=5)),
             selected_tools=(List[Literal[tuple(selected_tools)]], Field(default=[], json_schema_extra={'args_schemas': selected_tools})),
@@ -42,6 +37,7 @@ class AzureDevOpsWorkItemsToolkit(BaseToolkit):
                 'metadata': {
                     "label": "ADO boards",
                     "icon_url": "ado-boards-icon.svg",
+                    "max_length": AzureDevOpsWorkItemsToolkit.toolkit_max_length,
                     "categories": ["project management"],
                     "extra_categories": ["work item management", "issue tracking", "agile boards"],
                     "sections": {
```
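Both ADO toolkit schemas get the same treatment: the explicit `name` field (which carried `toolkit_name` and `max_toolkit_length` through `json_schema_extra`) is dropped from the generated config model, and the computed length limit is instead published as `max_length` inside the toolkit's `metadata`. A minimal sketch of that pattern with pydantic's `create_model`, assuming made-up tool names and a 64-character limit (not the SDK's actual values):

```python
# Illustrative sketch only, not the SDK's code: build a toolkit config model with
# create_model and publish the name-length limit via json_schema_extra metadata.
from typing import List, Literal
from pydantic import ConfigDict, Field, create_model

selected_tools = ["get_wiki_page", "modify_wiki_page"]  # assumed tool names
toolkit_max_length = 64                                  # assumed limit

ToolkitConfig = create_model(
    "AzureDevOpsWikiConfig",
    selected_tools=(List[Literal[tuple(selected_tools)]], Field(default=[])),
    __config__=ConfigDict(json_schema_extra={
        "metadata": {
            "label": "ADO wiki",
            "max_length": toolkit_max_length,  # read by schema consumers, not enforced
        }
    }),
)

schema = ToolkitConfig.model_json_schema()
print(schema["metadata"]["label"], schema["metadata"]["max_length"])
```

Because the limit now lives in the schema's metadata rather than in a validated field, it is advisory information for schema consumers instead of a validation constraint.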
```diff
@@ -329,11 +329,14 @@ class AzureDevOpsApiWrapper(NonCodeIndexerToolkit):
         parsed_item.update(fields_data)
 
         # extract relations if any
-        relations_data =
+        relations_data = None
+        if expand and str(expand).lower() in ("relations", "all"):
+            try:
+                relations_data = getattr(work_item, 'relations', None)
+            except KeyError:
+                relations_data = None
         if relations_data:
-            parsed_item['relations'] = []
-            for relation in relations_data:
-                parsed_item['relations'].append(relation.as_dict())
+            parsed_item['relations'] = [relation.as_dict() for relation in relations_data]
 
         if parse_attachments:
             # describe images in work item fields if present
@@ -344,13 +347,19 @@ class AzureDevOpsApiWrapper(NonCodeIndexerToolkit):
                 for img in images:
                     src = img.get('src')
                     if src:
-                        description = self.parse_attachment_by_url(src, image_description_prompt)
+                        description = self.parse_attachment_by_url(src, image_description_prompt=image_description_prompt)
                         img['image-description'] = description
                 parsed_item[field_name] = str(soup)
             # parse attached documents if present
-
-
-
+            for relation in parsed_item.get('relations', []):
+                # Only process actual file attachments
+                if relation.get('rel') == 'AttachedFile':
+                    file_name = relation.get('attributes', {}).get('name')
+                    if file_name:
+                        try:
+                            relation['content'] = self.parse_attachment_by_url(relation['url'], file_name, image_description_prompt=image_description_prompt)
+                        except Exception as att_e:
+                            logger.warning(f"Failed to parse attachment {file_name}: {att_e}")
 
 
         return parsed_item
```
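The added block above enriches only `AttachedFile` relations and logs per-attachment failures instead of aborting the whole work-item parse. A self-contained sketch of that pattern, with assumed data shapes and a stubbed `parse_attachment_by_url`:

```python
# Minimal sketch of the attachment-handling pattern; the parser is a stand-in.
import logging
from typing import Optional

logger = logging.getLogger(__name__)

def parse_attachment_by_url(url: str, file_name: str, image_description_prompt: Optional[str] = None) -> str:
    # stand-in for the wrapper's real downloader/parser
    return f"parsed {file_name} from {url}"

def attach_contents(parsed_item: dict, image_description_prompt: Optional[str] = None) -> dict:
    """Enrich AttachedFile relations with parsed content; skip failures instead of raising."""
    for relation in parsed_item.get("relations", []):
        if relation.get("rel") != "AttachedFile":
            continue  # ignore links, parent/child relations, etc.
        file_name = relation.get("attributes", {}).get("name")
        if not file_name:
            continue
        try:
            relation["content"] = parse_attachment_by_url(
                relation["url"], file_name, image_description_prompt=image_description_prompt
            )
        except Exception as err:
            logger.warning("Failed to parse attachment %s: %s", file_name, err)
    return parsed_item

item = {"relations": [{"rel": "AttachedFile", "url": "https://dev.azure.com/att/1",
                       "attributes": {"name": "spec.docx"}}]}
print(attach_contents(item)["relations"][0]["content"])
```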
```diff
@@ -1,41 +1,46 @@
+import copy
 import json
 import logging
 import time
 from typing import Any, Optional, List, Dict, Generator
 
+from langchain_core.callbacks import dispatch_custom_event
 from langchain_core.documents import Document
 from pydantic import create_model, Field, SecretStr
 
 from .utils.content_parser import file_extension_by_chunker, process_document_by_type
 from .vector_adapters.VectorStoreAdapter import VectorStoreAdapterFactory
+from ..runtime.langchain.document_loaders.constants import loaders_allowed_to_override
 from ..runtime.tools.vectorstore_base import VectorStoreWrapperBase
 from ..runtime.utils.utils import IndexerKeywords
 
 logger = logging.getLogger(__name__)
 
+DEFAULT_CUT_OFF = 0.2
+
 # Base Vector Store Schema Models
 BaseIndexParams = create_model(
     "BaseIndexParams",
-
+    index_name=(str, Field(description="Index name (max 7 characters)", min_length=1, max_length=7)),
 )
 
 RemoveIndexParams = create_model(
     "RemoveIndexParams",
-
+    index_name=(Optional[str], Field(description="Optional index name (max 7 characters)", default="", max_length=7)),
 )
 
 BaseSearchParams = create_model(
     "BaseSearchParams",
     query=(str, Field(description="Query text to search in the index")),
-
-        description="Optional
+    index_name=(Optional[str], Field(
+        description="Optional index name (max 7 characters). Leave empty to search across all datasets",
         default="", max_length=7)),
     filter=(Optional[dict | str], Field(
         description="Filter to apply to the search results. Can be a dictionary or a JSON string.",
         default={},
         examples=["{\"key\": \"value\"}", "{\"status\": \"active\"}"]
     )),
-    cut_off=(Optional[float], Field(description="Cut-off score for search results", default=
+    cut_off=(Optional[float], Field(description="Cut-off score for search results", default=DEFAULT_CUT_OFF, ge=0, le=1)),
     search_top=(Optional[int], Field(description="Number of top results to return", default=10)),
     full_text_search=(Optional[Dict[str, Any]], Field(
         description="Full text search parameters. Can be a dictionary with search options.",
@@ -58,14 +63,14 @@ BaseSearchParams = create_model(
 BaseStepbackSearchParams = create_model(
     "BaseStepbackSearchParams",
     query=(str, Field(description="Query text to search in the index")),
-
+    index_name=(Optional[str], Field(description="Optional index name (max 7 characters)", default="", max_length=7)),
     messages=(Optional[List], Field(description="Chat messages for stepback search context", default=[])),
     filter=(Optional[dict | str], Field(
         description="Filter to apply to the search results. Can be a dictionary or a JSON string.",
         default={},
         examples=["{\"key\": \"value\"}", "{\"status\": \"active\"}"]
     )),
-    cut_off=(Optional[float], Field(description="Cut-off score for search results", default=
+    cut_off=(Optional[float], Field(description="Cut-off score for search results", default=DEFAULT_CUT_OFF, ge=0, le=1)),
     search_top=(Optional[int], Field(description="Number of top results to return", default=10)),
     full_text_search=(Optional[Dict[str, Any]], Field(
         description="Full text search parameters. Can be a dictionary with search options.",
@@ -92,7 +97,7 @@ BaseIndexDataParams = create_model(
                                  description="Optional flag to enforce clean existing index before indexing new data")),
     progress_step=(Optional[int], Field(default=10, ge=0, le=100,
                                  description="Optional step size for progress reporting during indexing")),
-    chunking_config=(Optional[dict], Field(description="Chunking tool configuration",
+    chunking_config=(Optional[dict], Field(description="Chunking tool configuration", default=loaders_allowed_to_override)),
 )
 
 
@@ -108,7 +113,7 @@ class BaseIndexerToolkit(VectorStoreWrapperBase):
     def __init__(self, **kwargs):
         conn = kwargs.get('connection_string', None)
         connection_string = conn.get_secret_value() if isinstance(conn, SecretStr) else conn
-        collection_name = kwargs.get('
+        collection_name = kwargs.get('collection_schema')
 
         if 'vectorstore_type' not in kwargs:
             kwargs['vectorstore_type'] = 'PGVector'
```
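The new `index_name` parameters are ordinary pydantic fields, so the 7-character limit is enforced at validation time rather than by convention. A small sketch of that behaviour, with the field set reduced to `index_name` and an arbitrary model name:

```python
# Sketch of how a params model like BaseIndexParams validates index_name (pydantic v2).
from pydantic import Field, ValidationError, create_model

IndexParams = create_model(
    "IndexParams",
    index_name=(str, Field(description="Index name (max 7 characters)", min_length=1, max_length=7)),
)

print(IndexParams(index_name="docs").index_name)   # ok: 4 characters
try:
    IndexParams(index_name="too-long-name")          # 13 characters exceeds max_length=7
except ValidationError as err:
    print(err.errors()[0]["type"])                   # 'string_too_long'
```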
```diff
@@ -148,55 +153,48 @@ class BaseIndexerToolkit(VectorStoreWrapperBase):
         yield from ()
 
     def index_data(self, **kwargs):
-
-        collection_suffix = kwargs.get("collection_suffix")
-        progress_step = kwargs.get("progress_step")
+        index_name = kwargs.get("index_name")
         clean_index = kwargs.get("clean_index")
         chunking_tool = kwargs.get("chunking_tool")
         chunking_config = kwargs.get("chunking_config")
+        result = {"count": 0}
         #
-
-
-
-
-
-
-        "
-        "
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-        index_meta_doc.metadata["state"] = IndexerKeywords.INDEX_META_COMPLETED.value
-        index_meta_doc.metadata["updated_on"] = time.time()
-        add_documents(vectorstore=self.vectorstore, documents=[index_meta_doc], ids=index_meta_ids)
-        #
-        return {"status": "ok", "message": f"successfully indexed {result} documents"}
+        try:
+            if clean_index:
+                self._clean_index(index_name)
+            #
+            self.index_meta_init(index_name, kwargs)
+            #
+            self._log_tool_event(f"Indexing data into collection with suffix '{index_name}'. It can take some time...")
+            self._log_tool_event(f"Loading the documents to index...{kwargs}")
+            documents = self._base_loader(**kwargs)
+            documents = list(documents)  # consume/exhaust generator to count items
+            documents_count = len(documents)
+            documents = (doc for doc in documents)
+            self._log_tool_event(f"Base documents were pre-loaded. "
+                                 f"Search for possible document duplicates and remove them from the indexing list...")
+            documents = self._reduce_duplicates(documents, index_name)
+            self._log_tool_event(f"Duplicates were removed. "
+                                 f"Processing documents to collect dependencies and prepare them for indexing...")
+            self._save_index_generator(documents, documents_count, chunking_tool, chunking_config, index_name=index_name, result=result)
+            #
+            results_count = result["count"]
+            self.index_meta_update(index_name, IndexerKeywords.INDEX_META_COMPLETED.value, results_count)
+            self._emit_index_event(index_name)
+            #
+            return {"status": "ok", "message": f"successfully indexed {results_count} documents" if results_count > 0
+                    else "no new documents to index"}
+        except Exception as e:
+            self.index_meta_update(index_name, IndexerKeywords.INDEX_META_FAILED.value, result["count"])
+            self._emit_index_event(index_name, error=str(e))
+            raise e
+
 
-    def _save_index_generator(self, base_documents: Generator[Document, None, None], base_total: int, chunking_tool, chunking_config,
+    def _save_index_generator(self, base_documents: Generator[Document, None, None], base_total: int, chunking_tool, chunking_config, result, index_name: Optional[str] = None):
         self._log_tool_event(f"Base documents are ready for indexing. {base_total} base documents in total to index.")
         from ..runtime.langchain.interfaces.llm_processor import add_documents
         #
         base_doc_counter = 0
-        total_counter = 0
         pg_vector_add_docs_chunk = []
         for base_doc in base_documents:
             base_doc_counter += 1
@@ -223,12 +221,12 @@ class BaseIndexerToolkit(VectorStoreWrapperBase):
             if 'id' not in doc.metadata or 'updated_on' not in doc.metadata:
                 logger.warning(f"Document is missing required metadata field 'id' or 'updated_on': {doc.metadata}")
             #
-            # if
-            if
+            # if index_name is provided, add it to metadata of each document
+            if index_name:
                 if not doc.metadata.get('collection'):
-                    doc.metadata['collection'] =
+                    doc.metadata['collection'] = index_name
                 else:
-                    doc.metadata['collection'] += f";{
+                    doc.metadata['collection'] += f";{index_name}"
             #
             try:
                 pg_vector_add_docs_chunk.append(doc)
```
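The rewritten `index_data` wraps the whole pipeline in a try/except: initialize the index-meta record, load and deduplicate documents, save them while a shared `result` counter is mutated by the saver, then mark the record completed (or failed) and emit a status event. A condensed, illustrative skeleton of that flow (method bodies are stubs, and state names are assumed strings):

```python
# Condensed sketch of the index_data control flow; not the SDK's implementation.
class IndexerSketch:
    def __init__(self):
        self.events = []

    # --- stubs standing in for the real helpers ---
    def index_meta_init(self, index_name, cfg): ...
    def index_meta_update(self, index_name, state, count): ...
    def _emit_index_event(self, index_name, error=None):
        self.events.append((index_name, error))
    def _base_loader(self, **kwargs):
        yield from ({"id": i} for i in range(3))
    def _reduce_duplicates(self, docs, index_name):
        yield from docs
    def _save_index_generator(self, docs, total, result):
        result["count"] = sum(1 for _ in docs)

    def index_data(self, index_name, **kwargs):
        result = {"count": 0}                              # shared counter mutated by the saver
        try:
            self.index_meta_init(index_name, kwargs)
            documents = list(self._base_loader(**kwargs))  # materialize to count items
            documents = self._reduce_duplicates(iter(documents), index_name)
            self._save_index_generator(documents, 0, result)
            self.index_meta_update(index_name, "completed", result["count"])
            self._emit_index_event(index_name)
            return {"status": "ok", "count": result["count"]}
        except Exception as exc:
            self.index_meta_update(index_name, "failed", result["count"])
            self._emit_index_event(index_name, error=str(exc))
            raise

print(IndexerSketch().index_data("docs"))   # {'status': 'ok', 'count': 3}
```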
```diff
@@ -244,10 +242,9 @@ class BaseIndexerToolkit(VectorStoreWrapperBase):
                 msg = f"Indexed base document #{base_doc_counter} out of {base_total} (with {dependent_docs_counter} dependencies)."
                 logger.debug(msg)
                 self._log_tool_event(msg)
-
+                result["count"] += dependent_docs_counter
         if pg_vector_add_docs_chunk:
             add_documents(vectorstore=self.vectorstore, documents=pg_vector_add_docs_chunk)
-        return total_counter
 
     def _apply_loaders_chunkers(self, documents: Generator[Document, None, None], chunking_tool: str=None, chunking_config=None) -> Generator[Document, None, None]:
         from ..tools.chunkers import __all__ as chunkers
@@ -307,12 +304,12 @@ class BaseIndexerToolkit(VectorStoreWrapperBase):
     def _reduce_duplicates(
             self,
             documents: Generator[Any, None, None],
-
+            index_name: str,
             log_msg: str = "Verification of documents to index started"
     ) -> Generator[Document, None, None]:
         """Generic duplicate reduction logic for documents."""
         self._log_tool_event(log_msg, tool_name="index_documents")
-        indexed_data = self._get_indexed_data(
+        indexed_data = self._get_indexed_data(index_name)
         indexed_keys = set(indexed_data.keys())
         if not indexed_keys:
             self._log_tool_event("Vectorstore is empty, indexing all incoming documents", tool_name="index_documents")
@@ -324,7 +321,7 @@ class BaseIndexerToolkit(VectorStoreWrapperBase):
         for document in documents:
             key = self.key_fn(document)
             key = key if isinstance(key, str) else str(key)
-            if key in indexed_keys and
+            if key in indexed_keys and index_name == indexed_data[key]['metadata'].get('collection'):
                 if self.compare_fn(document, indexed_data[key]):
                     continue
             yield document
@@ -339,7 +336,7 @@ class BaseIndexerToolkit(VectorStoreWrapperBase):
             )
             self.vectorstore.delete(ids=list(docs_to_remove))
 
-    def _get_indexed_data(self,
+    def _get_indexed_data(self, index_name: str):
         raise NotImplementedError("Subclasses must implement this method")
 
     def key_fn(self, document: Document):
```
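The duplicate reduction above keys each incoming document, skips it only when the same key is already indexed in the same collection and the stored copy compares equal, and yields everything else. A standalone sketch with simplified data shapes:

```python
# Simplified sketch of the _reduce_duplicates idea; data shapes are assumptions.
from typing import Any, Dict, Iterable, Iterator

def reduce_duplicates(
    documents: Iterable[Dict[str, Any]],
    indexed_data: Dict[str, Dict[str, Any]],
    index_name: str,
) -> Iterator[Dict[str, Any]]:
    for doc in documents:
        key = str(doc["id"])
        already_indexed = (
            key in indexed_data
            and indexed_data[key]["metadata"].get("collection") == index_name
        )
        if already_indexed and indexed_data[key]["metadata"].get("updated_on") == doc.get("updated_on"):
            continue          # unchanged duplicate: nothing to re-index
        yield doc             # new or updated document

indexed = {"1": {"metadata": {"collection": "docs", "updated_on": 100}}}
incoming = [{"id": 1, "updated_on": 100}, {"id": 2, "updated_on": 200}]
print([d["id"] for d in reduce_duplicates(incoming, indexed, "docs")])   # [2]
```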
```diff
@@ -351,73 +348,57 @@ class BaseIndexerToolkit(VectorStoreWrapperBase):
     def remove_ids_fn(self, idx_data, key: str):
         raise NotImplementedError("Subclasses must implement this method")
 
-    def remove_index(self,
+    def remove_index(self, index_name: str = ""):
         """Cleans the indexed data in the collection."""
-        super()._clean_collection(
-        return (f"Collection '{
-                f"Available collections: {self.list_collections()}") if
+        super()._clean_collection(index_name=index_name)
+        return (f"Collection '{index_name}' has been removed from the vector store.\n"
+                f"Available collections: {self.list_collections()}") if index_name \
             else "All collections have been removed from the vector store."
 
-    def _build_collection_filter(self, filter: dict | str,
+    def _build_collection_filter(self, filter: dict | str, index_name: str = "") -> dict:
         """Builds a filter for the collection based on the provided suffix."""
 
         filter = filter if isinstance(filter, dict) else json.loads(filter)
-        if
+        if index_name:
             filter.update({"collection": {
-                "$eq":
+                "$eq": index_name.strip()
             }})
-        filter = {
-            "$and": [
-                filter,
-                {"$or": [
-                    {"type": {"$exists": False}},
-                    {"type": {"$ne": IndexerKeywords.INDEX_META_TYPE.value}}
-                ]},
-            ]
-        }
-        return filter
 
-
-
-
-
-
-
-
-
-
-                store.EmbeddingStore.cmetadata
-            ).filter(
-                func.jsonb_extract_path_text(store.EmbeddingStore.cmetadata, 'type') == IndexerKeywords.INDEX_META_TYPE.value
-            ).all()
-            return [
-                {"id": id_, "metadata": cmetadata}
-                for id_, cmetadata in meta
+        if filter:
+            # Exclude index meta documents from search results
+            filter = {
+                "$and": [
+                    filter,
+                    {"$or": [
+                        {"type": {"$exists": False}},
+                        {"type": {"$ne": IndexerKeywords.INDEX_META_TYPE.value}}
+                    ]},
                 ]
-
-
-
-
-
-
+            }
+        else:
+            filter = {"$or": [
+                {"type": {"$exists": False}},
+                {"type": {"$ne": IndexerKeywords.INDEX_META_TYPE.value}}
+            ]}
+        return filter
 
     def search_index(self,
                      query: str,
-
-                     filter: dict | str = {}, cut_off: float =
+                     index_name: str = "",
+                     filter: dict | str = {}, cut_off: float = DEFAULT_CUT_OFF,
                      search_top: int = 10, reranker: dict = {},
                      full_text_search: Optional[Dict[str, Any]] = None,
                      reranking_config: Optional[Dict[str, Dict[str, Any]]] = None,
                      extended_search: Optional[List[str]] = None,
                      **kwargs):
         """ Searches indexed documents in the vector store."""
-        # build filter on top of
+        # build filter on top of index_name
 
         available_collections = super().list_collections()
-        if
-            return f"Collection '{
+        if index_name and index_name not in available_collections:
+            return f"Collection '{index_name}' not found. Available collections: {available_collections}"
 
-        filter = self._build_collection_filter(filter,
+        filter = self._build_collection_filter(filter, index_name)
         found_docs = super().search_documents(
             query,
             doctype=self.doctype,
@@ -434,15 +415,15 @@ class BaseIndexerToolkit(VectorStoreWrapperBase):
     def stepback_search_index(self,
                               query: str,
                               messages: List[Dict[str, Any]] = [],
-
-                              filter: dict | str = {}, cut_off: float =
+                              index_name: str = "",
+                              filter: dict | str = {}, cut_off: float = DEFAULT_CUT_OFF,
                               search_top: int = 10, reranker: dict = {},
                               full_text_search: Optional[Dict[str, Any]] = None,
                               reranking_config: Optional[Dict[str, Dict[str, Any]]] = None,
                               extended_search: Optional[List[str]] = None,
                               **kwargs):
         """ Searches indexed documents in the vector store."""
-        filter = self._build_collection_filter(filter,
+        filter = self._build_collection_filter(filter, index_name)
         found_docs = super().stepback_search(
             query,
             messages,
@@ -459,8 +440,8 @@ class BaseIndexerToolkit(VectorStoreWrapperBase):
     def stepback_summary_index(self,
                                query: str,
                                messages: List[Dict[str, Any]] = [],
-
-                               filter: dict | str = {}, cut_off: float =
+                               index_name: str = "",
+                               filter: dict | str = {}, cut_off: float = DEFAULT_CUT_OFF,
                                search_top: int = 10, reranker: dict = {},
                                full_text_search: Optional[Dict[str, Any]] = None,
                                reranking_config: Optional[Dict[str, Dict[str, Any]]] = None,
@@ -468,7 +449,7 @@ class BaseIndexerToolkit(VectorStoreWrapperBase):
                                **kwargs):
         """ Generates a summary of indexed documents using stepback technique."""
 
-        filter = self._build_collection_filter(filter,
+        filter = self._build_collection_filter(filter, index_name)
         return super().stepback_summary(
             query,
             messages,
```
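`_build_collection_filter` now always appends a clause that excludes the internal index-meta documents, and wraps the user filter in `$and` only when one was supplied. A standalone sketch of that rule; the `type` marker string stands in for `IndexerKeywords.INDEX_META_TYPE.value`:

```python
# Sketch of the filter-building rule; INDEX_META_TYPE is an assumed stand-in value.
import json

INDEX_META_TYPE = "index_meta"

def build_collection_filter(filter_: dict | str, index_name: str = "") -> dict:
    filter_ = filter_ if isinstance(filter_, dict) else json.loads(filter_)
    if index_name:
        filter_.update({"collection": {"$eq": index_name.strip()}})
    exclude_meta = {"$or": [{"type": {"$exists": False}},
                            {"type": {"$ne": INDEX_META_TYPE}}]}
    return {"$and": [filter_, exclude_meta]} if filter_ else exclude_meta

print(build_collection_filter('{"status": "active"}', "docs"))
print(build_collection_filter({}))   # empty filter -> just the meta exclusion
```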
```diff
@@ -480,6 +461,106 @@ class BaseIndexerToolkit(VectorStoreWrapperBase):
             reranking_config=reranking_config,
             extended_search=extended_search
         )
+
+    def index_meta_init(self, index_name: str, index_configuration: dict[str, Any]):
+        index_meta = super().get_index_meta(index_name)
+        if not index_meta:
+            self._log_tool_event(
+                f"There is no existing index_meta for collection '{index_name}'. Initializing it.",
+                tool_name="index_data"
+            )
+            from ..runtime.langchain.interfaces.llm_processor import add_documents
+            created_on = time.time()
+            metadata = {
+                "collection": index_name,
+                "type": IndexerKeywords.INDEX_META_TYPE.value,
+                "indexed": 0,
+                "updated": 0,
+                "state": IndexerKeywords.INDEX_META_IN_PROGRESS.value,
+                "index_configuration": index_configuration,
+                "created_on": created_on,
+                "updated_on": created_on,
+                "task_id": None,
+                "conversation_id": None,
+            }
+            metadata["history"] = json.dumps([metadata])
+            index_meta_doc = Document(page_content=f"{IndexerKeywords.INDEX_META_TYPE.value}_{index_name}", metadata=metadata)
+            add_documents(vectorstore=self.vectorstore, documents=[index_meta_doc])
+
+    def index_meta_update(self, index_name: str, state: str, result: int):
+        index_meta_raw = super().get_index_meta(index_name)
+        from ..runtime.langchain.interfaces.llm_processor import add_documents
+        #
+        if index_meta_raw:
+            metadata = copy.deepcopy(index_meta_raw.get("metadata", {}))
+            metadata["indexed"] = self.get_indexed_count(index_name)
+            metadata["updated"] = result
+            metadata["state"] = state
+            metadata["updated_on"] = time.time()
+            #
+            history_raw = metadata.pop("history", "[]")
+            try:
+                history = json.loads(history_raw) if history_raw.strip() else []
+                # replace the last history item with updated metadata
+                if history and isinstance(history, list):
+                    history[-1] = metadata
+                else:
+                    history = [metadata]
+            except (json.JSONDecodeError, TypeError):
+                logger.warning(f"Failed to load index history: {history_raw}. Create new with only current item.")
+                history = [metadata]
+            #
+            metadata["history"] = json.dumps(history)
+            index_meta_doc = Document(page_content=index_meta_raw.get("content", ""), metadata=metadata)
+            add_documents(vectorstore=self.vectorstore, documents=[index_meta_doc], ids=[index_meta_raw.get("id")])
+
+    def _emit_index_event(self, index_name: str, error: Optional[str] = None):
+        """
+        Emit custom event for index data operation.
+
+        Args:
+            index_name: The name of the index
+            error: Error message if the operation failed, None otherwise
+        """
+        index_meta = super().get_index_meta(index_name)
+
+        if not index_meta:
+            logger.warning(
+                f"No index_meta found for index '{index_name}'. "
+                "Cannot emit index event."
+            )
+            return
+
+        metadata = index_meta.get("metadata", {})
+
+        # Determine if this is a reindex operation
+        history_raw = metadata.get("history", "[]")
+        try:
+            history = json.loads(history_raw) if history_raw.strip() else []
+            is_reindex = len(history) > 1
+        except (json.JSONDecodeError, TypeError):
+            is_reindex = False
+
+        # Build event message
+        event_data = {
+            "id": index_meta.get("id"),
+            "index_name": index_name,
+            "state": metadata.get("state"),
+            "error": error,
+            "reindex": is_reindex,
+            "indexed": metadata.get("indexed", 0),
+            "updated": metadata.get("updated", 0),
+        }
+
+        # Emit the event
+        try:
+            dispatch_custom_event("index_data_status", event_data)
+            logger.debug(
+                f"Emitted index_data_status event for index "
+                f"'{index_name}': {event_data}"
+            )
+        except Exception as e:
+            logger.warning(f"Failed to emit index_data_status event: {e}")
 
     def get_available_tools(self):
         """
@@ -534,6 +615,7 @@ class BaseIndexerToolkit(VectorStoreWrapperBase):
                 "mode": "list_collections",
                 "ref": self.list_collections,
                 "description": self.list_collections.__doc__,
-
+                # No parameters
+                "args_schema": create_model("ListCollectionsParams")
             },
-        ]
+        ]
```
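The index-meta record keeps its `history` as a JSON string inside the document metadata: every update replaces the most recent entry, and `_emit_index_event` later flags the run as a reindex once more than one entry exists. A reduced sketch of that bookkeeping (field set trimmed; the real methods also persist the record back to the vector store with `add_documents`):

```python
# Reduced sketch of the index-meta history round-trip; not the SDK's exact code.
import json
import time

def update_index_meta(metadata: dict, state: str, updated: int) -> dict:
    meta = {**metadata, "state": state, "updated": updated, "updated_on": time.time()}
    history_raw = meta.pop("history", "[]")
    try:
        history = json.loads(history_raw) if history_raw.strip() else []
    except (json.JSONDecodeError, TypeError):
        history = []
    if history:
        history[-1] = meta.copy()      # overwrite the in-progress entry
    else:
        history = [meta.copy()]
    meta["history"] = json.dumps(history)
    return meta

meta = update_index_meta({"collection": "docs", "history": "[]"}, "completed", 3)
is_reindex = len(json.loads(meta["history"])) > 1
print(meta["state"], is_reindex)       # completed False (first indexing run)
```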
```diff
@@ -61,6 +61,7 @@ class AlitaBitbucketToolkit(BaseToolkit):
             'metadata':
                 {
                     "label": "Bitbucket", "icon_url": "bitbucket-icon.svg",
+                    "max_length": AlitaBitbucketToolkit.toolkit_max_length,
                     "categories": ["code repositories"],
                     "extra_categories": ["bitbucket", "git", "repository", "code", "version control"],
                 }
@@ -6,7 +6,7 @@ from langchain_core.prompts import ChatPromptTemplate
 from langchain.text_splitter import TokenTextSplitter
 
 from typing import Optional, List
-from
+from pydantic import BaseModel
 from ..utils import tiktoken_length
 
 logger = getLogger(__name__)
@@ -29,7 +29,7 @@ class SonarToolkit(BaseToolkit):
         SonarToolkit.toolkit_max_length = get_max_toolkit_length(selected_tools)
         return create_model(
             name,
-            sonar_project_name=(str, Field(description="Project name of the desired repository"
+            sonar_project_name=(str, Field(description="Project name of the desired repository")),
             sonar_configuration=(SonarConfiguration, Field(description="Sonar Configuration", json_schema_extra={'configuration_types': ['sonar']})),
             selected_tools=(List[Literal[tuple(selected_tools)]], Field(default=[], json_schema_extra={'args_schemas': selected_tools})),
             __config__=ConfigDict(json_schema_extra=
```