alita-sdk 0.3.374__py3-none-any.whl → 0.3.423__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of alita-sdk has been flagged as potentially problematic by the registry.
- alita_sdk/configurations/bitbucket.py +95 -0
- alita_sdk/configurations/confluence.py +96 -1
- alita_sdk/configurations/gitlab.py +79 -0
- alita_sdk/configurations/jira.py +103 -0
- alita_sdk/configurations/testrail.py +88 -0
- alita_sdk/configurations/xray.py +93 -0
- alita_sdk/configurations/zephyr_enterprise.py +93 -0
- alita_sdk/configurations/zephyr_essential.py +75 -0
- alita_sdk/runtime/clients/client.py +3 -2
- alita_sdk/runtime/clients/sandbox_client.py +8 -0
- alita_sdk/runtime/langchain/assistant.py +56 -40
- alita_sdk/runtime/langchain/constants.py +4 -0
- alita_sdk/runtime/langchain/document_loaders/AlitaDocxMammothLoader.py +315 -3
- alita_sdk/runtime/langchain/document_loaders/AlitaJSONLoader.py +4 -1
- alita_sdk/runtime/langchain/document_loaders/constants.py +28 -12
- alita_sdk/runtime/langchain/langraph_agent.py +92 -28
- alita_sdk/runtime/langchain/utils.py +24 -4
- alita_sdk/runtime/toolkits/application.py +8 -1
- alita_sdk/runtime/toolkits/tools.py +80 -49
- alita_sdk/runtime/tools/__init__.py +7 -2
- alita_sdk/runtime/tools/application.py +7 -0
- alita_sdk/runtime/tools/function.py +28 -23
- alita_sdk/runtime/tools/graph.py +10 -4
- alita_sdk/runtime/tools/image_generation.py +104 -8
- alita_sdk/runtime/tools/llm.py +146 -114
- alita_sdk/runtime/tools/sandbox.py +166 -63
- alita_sdk/runtime/tools/vectorstore.py +22 -21
- alita_sdk/runtime/tools/vectorstore_base.py +16 -15
- alita_sdk/runtime/utils/utils.py +1 -0
- alita_sdk/tools/__init__.py +43 -31
- alita_sdk/tools/ado/work_item/ado_wrapper.py +17 -8
- alita_sdk/tools/base_indexer_toolkit.py +102 -93
- alita_sdk/tools/code_indexer_toolkit.py +15 -5
- alita_sdk/tools/confluence/api_wrapper.py +30 -8
- alita_sdk/tools/confluence/loader.py +10 -0
- alita_sdk/tools/elitea_base.py +22 -22
- alita_sdk/tools/gitlab/api_wrapper.py +8 -9
- alita_sdk/tools/jira/api_wrapper.py +1 -1
- alita_sdk/tools/non_code_indexer_toolkit.py +2 -2
- alita_sdk/tools/openapi/__init__.py +10 -1
- alita_sdk/tools/qtest/api_wrapper.py +298 -51
- alita_sdk/tools/sharepoint/api_wrapper.py +104 -33
- alita_sdk/tools/sharepoint/authorization_helper.py +175 -1
- alita_sdk/tools/sharepoint/utils.py +8 -2
- alita_sdk/tools/utils/content_parser.py +27 -16
- alita_sdk/tools/vector_adapters/VectorStoreAdapter.py +38 -25
- {alita_sdk-0.3.374.dist-info → alita_sdk-0.3.423.dist-info}/METADATA +1 -1
- {alita_sdk-0.3.374.dist-info → alita_sdk-0.3.423.dist-info}/RECORD +51 -51
- {alita_sdk-0.3.374.dist-info → alita_sdk-0.3.423.dist-info}/WHEEL +0 -0
- {alita_sdk-0.3.374.dist-info → alita_sdk-0.3.423.dist-info}/licenses/LICENSE +0 -0
- {alita_sdk-0.3.374.dist-info → alita_sdk-0.3.423.dist-info}/top_level.txt +0 -0
alita_sdk/tools/__init__.py
CHANGED
@@ -90,62 +90,74 @@ available_count = len(AVAILABLE_TOOLS)
 total_attempted = len(AVAILABLE_TOOLS) + len(FAILED_IMPORTS)
 logger.info(f"Tool imports completed: {available_count}/{total_attempted} successful")
 
+
 def get_tools(tools_list, alita, llm, store: Optional[BaseStore] = None, *args, **kwargs):
     tools = []
+
     for tool in tools_list:
-
-
-
-
-
-
-
-        tool
+        settings = tool.get('settings')
+
+        # Skip tools without settings early
+        if not settings:
+            logger.warning(f"Tool '{tool.get('type', '')}' has no settings, skipping...")
+            continue
+
+        # Validate tool names once
+        selected_tools = settings.get('selected_tools', [])
+        invalid_tools = [name for name in selected_tools if isinstance(name, str) and name.startswith('_')]
+        if invalid_tools:
+            raise ValueError(f"Tool names {invalid_tools} from toolkit '{tool.get('type', '')}' cannot start with '_'")
+
+        # Cache tool type and add common settings
         tool_type = tool['type']
+        settings['alita'] = alita
+        settings['llm'] = llm
+        settings['store'] = store
+
+        # Set pgvector collection schema if present
+        if settings.get('pgvector_configuration'):
+            settings['pgvector_configuration']['collection_schema'] = str(tool['id'])
 
-        # Handle special cases
+        # Handle ADO special cases
         if tool_type in ['ado_boards', 'ado_wiki', 'ado_plans']:
             tools.extend(AVAILABLE_TOOLS['ado']['get_tools'](tool_type, tool))
+            continue
 
-        #
-
+        # Handle ADO repos aliases
+        if tool_type in ['ado_repos', 'azure_devops_repos'] and 'ado_repos' in AVAILABLE_TOOLS:
             try:
-
-                tools.extend(get_tools_func(tool))
-
+                tools.extend(AVAILABLE_TOOLS['ado_repos']['get_tools'](tool))
             except Exception as e:
-                logger.error(f"Error getting
-
+                logger.error(f"Error getting ADO repos tools: {e}")
+            continue
 
-        # Handle
-
+        # Handle standard tools
+        if tool_type in AVAILABLE_TOOLS and 'get_tools' in AVAILABLE_TOOLS[tool_type]:
             try:
-
-                tools.extend(get_tools_func(tool))
+                tools.extend(AVAILABLE_TOOLS[tool_type]['get_tools'](tool))
             except Exception as e:
-                logger.error(f"Error getting
+                logger.error(f"Error getting tools for {tool_type}: {e}")
+                raise ToolException(f"Error getting tools for {tool_type}: {e}")
+            continue
 
         # Handle custom modules
-
+        if settings.get("module"):
             try:
-                settings = tool.get("settings", {})
                 mod = import_module(settings.pop("module"))
                 tkitclass = getattr(mod, settings.pop("class"))
-
-                get_toolkit_params = tool["settings"].copy()
+                get_toolkit_params = settings.copy()
                 get_toolkit_params["name"] = tool.get("name")
-                #
                 toolkit = tkitclass.get_toolkit(**get_toolkit_params)
                 tools.extend(toolkit.get_tools())
             except Exception as e:
                 logger.error(f"Error in getting custom toolkit: {e}")
+            continue
 
+        # Tool not available
+        if tool_type in FAILED_IMPORTS:
+            logger.warning(f"Tool '{tool_type}' is not available: {FAILED_IMPORTS[tool_type]}")
         else:
-
-            if tool_type in FAILED_IMPORTS:
-                logger.warning(f"Tool '{tool_type}' is not available: {FAILED_IMPORTS[tool_type]}")
-            else:
-                logger.warning(f"Unknown tool type: {tool_type}")
+            logger.warning(f"Unknown tool type: {tool_type}")
 
     return tools
 
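For reference, the refactored get_tools expects each entry in tools_list to carry its configuration under a settings key; the toolkit type, selected tool names, and an id (used for the pgvector collection schema) all come from that entry. A minimal sketch of such an entry, with illustrative values that are not taken from the release:

    # Hypothetical toolkit entry, shaped the way the refactored get_tools() reads it.
    # The concrete keys inside "settings" depend on the toolkit being configured.
    example_tool = {
        "id": 42,                      # copied into pgvector_configuration['collection_schema']
        "type": "confluence",          # looked up in AVAILABLE_TOOLS
        "name": "my_confluence",
        "settings": {
            "selected_tools": ["search_pages"],   # names must not start with "_"
            "pgvector_configuration": {"connection_string": "postgresql://..."},
        },
    }

    # tools = get_tools([example_tool], alita=alita_client, llm=llm, store=None)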
alita_sdk/tools/ado/work_item/ado_wrapper.py
CHANGED

@@ -329,11 +329,14 @@ class AzureDevOpsApiWrapper(NonCodeIndexerToolkit):
         parsed_item.update(fields_data)
 
         # extract relations if any
-        relations_data =
+        relations_data = None
+        if expand and str(expand).lower() in ("relations", "all"):
+            try:
+                relations_data = getattr(work_item, 'relations', None)
+            except KeyError:
+                relations_data = None
         if relations_data:
-            parsed_item['relations'] = []
-            for relation in relations_data:
-                parsed_item['relations'].append(relation.as_dict())
+            parsed_item['relations'] = [relation.as_dict() for relation in relations_data]
 
         if parse_attachments:
             # describe images in work item fields if present
@@ -344,13 +347,19 @@ class AzureDevOpsApiWrapper(NonCodeIndexerToolkit):
                 for img in images:
                     src = img.get('src')
                     if src:
-                        description = self.parse_attachment_by_url(src, image_description_prompt)
+                        description = self.parse_attachment_by_url(src, image_description_prompt=image_description_prompt)
                         img['image-description'] = description
                 parsed_item[field_name] = str(soup)
             # parse attached documents if present
-
-
-
+            for relation in parsed_item.get('relations', []):
+                # Only process actual file attachments
+                if relation.get('rel') == 'AttachedFile':
+                    file_name = relation.get('attributes', {}).get('name')
+                    if file_name:
+                        try:
+                            relation['content'] = self.parse_attachment_by_url(relation['url'], file_name, image_description_prompt=image_description_prompt)
+                        except Exception as att_e:
+                            logger.warning(f"Failed to parse attachment {file_name}: {att_e}")
 
 
         return parsed_item
alita_sdk/tools/base_indexer_toolkit.py
CHANGED

@@ -7,7 +7,6 @@ from typing import Any, Optional, List, Dict, Generator
 from langchain_core.documents import Document
 from pydantic import create_model, Field, SecretStr
 
-from .utils import make_json_serializable
 from .utils.content_parser import file_extension_by_chunker, process_document_by_type
 from .vector_adapters.VectorStoreAdapter import VectorStoreAdapterFactory
 from ..runtime.langchain.document_loaders.constants import loaders_allowed_to_override
@@ -19,19 +18,19 @@ logger = logging.getLogger(__name__)
 # Base Vector Store Schema Models
 BaseIndexParams = create_model(
     "BaseIndexParams",
-
+    index_name=(str, Field(description="Index name (max 7 characters)", min_length=1, max_length=7)),
 )
 
 RemoveIndexParams = create_model(
     "RemoveIndexParams",
-
+    index_name=(Optional[str], Field(description="Optional index name (max 7 characters)", default="", max_length=7)),
 )
 
 BaseSearchParams = create_model(
     "BaseSearchParams",
     query=(str, Field(description="Query text to search in the index")),
-
-        description="Optional
+    index_name=(Optional[str], Field(
+        description="Optional index name (max 7 characters). Leave empty to search across all datasets",
         default="", max_length=7)),
     filter=(Optional[dict | str], Field(
         description="Filter to apply to the search results. Can be a dictionary or a JSON string.",
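The new index_name field is an ordinary pydantic constraint, so the 7-character limit is enforced at validation time. A standalone sketch of the same pattern, using pydantic only and not the SDK itself:

    from pydantic import Field, ValidationError, create_model

    # Same construction as BaseIndexParams above: a dynamic model with a length-limited field.
    DemoIndexParams = create_model(
        "DemoIndexParams",
        index_name=(str, Field(description="Index name (max 7 characters)", min_length=1, max_length=7)),
    )

    DemoIndexParams(index_name="docs1")            # accepted
    try:
        DemoIndexParams(index_name="too-long-name")
    except ValidationError as e:
        print(e)                                   # rejected: longer than 7 characters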
@@ -61,7 +60,7 @@ BaseSearchParams = create_model(
 BaseStepbackSearchParams = create_model(
     "BaseStepbackSearchParams",
     query=(str, Field(description="Query text to search in the index")),
-
+    index_name=(Optional[str], Field(description="Optional index name (max 7 characters)", default="", max_length=7)),
     messages=(Optional[List], Field(description="Chat messages for stepback search context", default=[])),
     filter=(Optional[dict | str], Field(
         description="Filter to apply to the search results. Can be a dictionary or a JSON string.",
@@ -111,7 +110,7 @@ class BaseIndexerToolkit(VectorStoreWrapperBase):
     def __init__(self, **kwargs):
         conn = kwargs.get('connection_string', None)
         connection_string = conn.get_secret_value() if isinstance(conn, SecretStr) else conn
-        collection_name = kwargs.get('
+        collection_name = kwargs.get('collection_schema')
 
         if 'vectorstore_type' not in kwargs:
             kwargs['vectorstore_type'] = 'PGVector'
@@ -151,40 +150,46 @@ class BaseIndexerToolkit(VectorStoreWrapperBase):
         yield from ()
 
     def index_data(self, **kwargs):
-
-        progress_step = kwargs.get("progress_step")
+        index_name = kwargs.get("index_name")
         clean_index = kwargs.get("clean_index")
         chunking_tool = kwargs.get("chunking_tool")
         chunking_config = kwargs.get("chunking_config")
+        result = {"count": 0}
         #
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        try:
+            if clean_index:
+                self._clean_index(index_name)
+            #
+            self.index_meta_init(index_name, kwargs)
+            #
+            self._log_tool_event(f"Indexing data into collection with suffix '{index_name}'. It can take some time...")
+            self._log_tool_event(f"Loading the documents to index...{kwargs}")
+            documents = self._base_loader(**kwargs)
+            documents = list(documents)  # consume/exhaust generator to count items
+            documents_count = len(documents)
+            documents = (doc for doc in documents)
+            self._log_tool_event(f"Base documents were pre-loaded. "
+                                 f"Search for possible document duplicates and remove them from the indexing list...")
+            documents = self._reduce_duplicates(documents, index_name)
+            self._log_tool_event(f"Duplicates were removed. "
+                                 f"Processing documents to collect dependencies and prepare them for indexing...")
+            self._save_index_generator(documents, documents_count, chunking_tool, chunking_config, index_name=index_name, result=result)
+            #
+            results_count = result["count"]
+            self.index_meta_update(index_name, IndexerKeywords.INDEX_META_COMPLETED.value, results_count)
+            #
+            return {"status": "ok", "message": f"successfully indexed {results_count} documents" if results_count > 0
+                    else "no new documents to index"}
+        except Exception as e:
+            self.index_meta_update(index_name, IndexerKeywords.INDEX_META_FAILED.value, result["count"])
+            raise e
+
+
+    def _save_index_generator(self, base_documents: Generator[Document, None, None], base_total: int, chunking_tool, chunking_config, result, index_name: Optional[str] = None):
         self._log_tool_event(f"Base documents are ready for indexing. {base_total} base documents in total to index.")
         from ..runtime.langchain.interfaces.llm_processor import add_documents
         #
         base_doc_counter = 0
-        total_counter = 0
         pg_vector_add_docs_chunk = []
         for base_doc in base_documents:
             base_doc_counter += 1
@@ -211,12 +216,12 @@ class BaseIndexerToolkit(VectorStoreWrapperBase):
                 if 'id' not in doc.metadata or 'updated_on' not in doc.metadata:
                     logger.warning(f"Document is missing required metadata field 'id' or 'updated_on': {doc.metadata}")
                 #
-                # if
-                if
+                # if index_name is provided, add it to metadata of each document
+                if index_name:
                     if not doc.metadata.get('collection'):
-                        doc.metadata['collection'] =
+                        doc.metadata['collection'] = index_name
                     else:
-                        doc.metadata['collection'] += f";{
+                        doc.metadata['collection'] += f";{index_name}"
                 #
                 try:
                     pg_vector_add_docs_chunk.append(doc)
@@ -232,10 +237,9 @@ class BaseIndexerToolkit(VectorStoreWrapperBase):
             msg = f"Indexed base document #{base_doc_counter} out of {base_total} (with {dependent_docs_counter} dependencies)."
             logger.debug(msg)
             self._log_tool_event(msg)
-
+            result["count"] += dependent_docs_counter
         if pg_vector_add_docs_chunk:
             add_documents(vectorstore=self.vectorstore, documents=pg_vector_add_docs_chunk)
-        return total_counter
 
     def _apply_loaders_chunkers(self, documents: Generator[Document, None, None], chunking_tool: str=None, chunking_config=None) -> Generator[Document, None, None]:
         from ..tools.chunkers import __all__ as chunkers
@@ -295,12 +299,12 @@ class BaseIndexerToolkit(VectorStoreWrapperBase):
     def _reduce_duplicates(
             self,
             documents: Generator[Any, None, None],
-
+            index_name: str,
             log_msg: str = "Verification of documents to index started"
     ) -> Generator[Document, None, None]:
         """Generic duplicate reduction logic for documents."""
         self._log_tool_event(log_msg, tool_name="index_documents")
-        indexed_data = self._get_indexed_data(
+        indexed_data = self._get_indexed_data(index_name)
         indexed_keys = set(indexed_data.keys())
         if not indexed_keys:
             self._log_tool_event("Vectorstore is empty, indexing all incoming documents", tool_name="index_documents")
@@ -312,7 +316,7 @@ class BaseIndexerToolkit(VectorStoreWrapperBase):
         for document in documents:
             key = self.key_fn(document)
             key = key if isinstance(key, str) else str(key)
-            if key in indexed_keys and
+            if key in indexed_keys and index_name == indexed_data[key]['metadata'].get('collection'):
                 if self.compare_fn(document, indexed_data[key]):
                     continue
             yield document
@@ -327,7 +331,7 @@ class BaseIndexerToolkit(VectorStoreWrapperBase):
             )
             self.vectorstore.delete(ids=list(docs_to_remove))
 
-    def _get_indexed_data(self,
+    def _get_indexed_data(self, index_name: str):
         raise NotImplementedError("Subclasses must implement this method")
 
     def key_fn(self, document: Document):
@@ -339,20 +343,20 @@ class BaseIndexerToolkit(VectorStoreWrapperBase):
     def remove_ids_fn(self, idx_data, key: str):
         raise NotImplementedError("Subclasses must implement this method")
 
-    def remove_index(self,
+    def remove_index(self, index_name: str = ""):
         """Cleans the indexed data in the collection."""
-        super()._clean_collection(
-        return (f"Collection '{
-                f"Available collections: {self.list_collections()}") if
+        super()._clean_collection(index_name=index_name)
+        return (f"Collection '{index_name}' has been removed from the vector store.\n"
+                f"Available collections: {self.list_collections()}") if index_name \
             else "All collections have been removed from the vector store."
 
-    def _build_collection_filter(self, filter: dict | str,
+    def _build_collection_filter(self, filter: dict | str, index_name: str = "") -> dict:
         """Builds a filter for the collection based on the provided suffix."""
 
         filter = filter if isinstance(filter, dict) else json.loads(filter)
-        if
+        if index_name:
             filter.update({"collection": {
-                "$eq":
+                "$eq": index_name.strip()
             }})
 
         if filter:
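The filter built by _build_collection_filter is an ordinary dictionary merge: when an index_name is given it adds a {"collection": {"$eq": ...}} clause on top of whatever filter the caller passed, whether a dict or a JSON string. A small stdlib-only sketch of that behaviour:

    import json

    def build_collection_filter(filter, index_name=""):
        # Mirrors the logic shown in the diff: accept dict or JSON string, then
        # constrain the search to one collection when index_name is provided.
        filter = filter if isinstance(filter, dict) else json.loads(filter)
        if index_name:
            filter.update({"collection": {"$eq": index_name.strip()}})
        return filter

    print(build_collection_filter('{"author": "alice"}', index_name="docs1"))
    # {'author': 'alice', 'collection': {'$eq': 'docs1'}}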
@@ -375,7 +379,7 @@ class BaseIndexerToolkit(VectorStoreWrapperBase):
 
     def search_index(self,
                      query: str,
-
+                     index_name: str = "",
                      filter: dict | str = {}, cut_off: float = 0.5,
                      search_top: int = 10, reranker: dict = {},
                      full_text_search: Optional[Dict[str, Any]] = None,
@@ -383,13 +387,13 @@ class BaseIndexerToolkit(VectorStoreWrapperBase):
                      extended_search: Optional[List[str]] = None,
                      **kwargs):
         """ Searches indexed documents in the vector store."""
-        # build filter on top of
+        # build filter on top of index_name
 
         available_collections = super().list_collections()
-        if
-            return f"Collection '{
+        if index_name and index_name not in available_collections:
+            return f"Collection '{index_name}' not found. Available collections: {available_collections}"
 
-        filter = self._build_collection_filter(filter,
+        filter = self._build_collection_filter(filter, index_name)
         found_docs = super().search_documents(
             query,
             doctype=self.doctype,
@@ -406,7 +410,7 @@ class BaseIndexerToolkit(VectorStoreWrapperBase):
     def stepback_search_index(self,
                               query: str,
                               messages: List[Dict[str, Any]] = [],
-
+                              index_name: str = "",
                               filter: dict | str = {}, cut_off: float = 0.5,
                               search_top: int = 10, reranker: dict = {},
                               full_text_search: Optional[Dict[str, Any]] = None,
@@ -414,7 +418,7 @@ class BaseIndexerToolkit(VectorStoreWrapperBase):
                               extended_search: Optional[List[str]] = None,
                               **kwargs):
         """ Searches indexed documents in the vector store."""
-        filter = self._build_collection_filter(filter,
+        filter = self._build_collection_filter(filter, index_name)
         found_docs = super().stepback_search(
             query,
             messages,
@@ -431,7 +435,7 @@ class BaseIndexerToolkit(VectorStoreWrapperBase):
     def stepback_summary_index(self,
                                query: str,
                                messages: List[Dict[str, Any]] = [],
-
+                               index_name: str = "",
                                filter: dict | str = {}, cut_off: float = 0.5,
                                search_top: int = 10, reranker: dict = {},
                                full_text_search: Optional[Dict[str, Any]] = None,
@@ -440,7 +444,7 @@ class BaseIndexerToolkit(VectorStoreWrapperBase):
                                **kwargs):
         """ Generates a summary of indexed documents using stepback technique."""
 
-        filter = self._build_collection_filter(filter,
+        filter = self._build_collection_filter(filter, index_name)
         return super().stepback_summary(
             query,
             messages,
@@ -453,41 +457,32 @@ class BaseIndexerToolkit(VectorStoreWrapperBase):
             extended_search=extended_search
         )
 
-    def index_meta_init(self,
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-        else:
-            history = []
-        new_history_item = {k: v for k, v in index_meta_raw.get("metadata", {}).items() if k != "history"}
-        history.append(new_history_item)
-        metadata["history"] = json.dumps(history)
-        index_meta_ids = [index_meta_raw.get("id")]
-        #
-        index_meta_doc = Document(page_content=f"{IndexerKeywords.INDEX_META_TYPE.value}_{collection_suffix}", metadata=metadata)
-        add_documents(vectorstore=self.vectorstore, documents=[index_meta_doc], ids=index_meta_ids)
+    def index_meta_init(self, index_name: str, index_configuration: dict[str, Any]):
+        index_meta = super().get_index_meta(index_name)
+        if not index_meta:
+            self._log_tool_event(
+                f"There is no existing index_meta for collection '{index_name}'. Initializing it.",
+                tool_name="index_data"
+            )
+            from ..runtime.langchain.interfaces.llm_processor import add_documents
+            created_on = time.time()
+            metadata = {
+                "collection": index_name,
+                "type": IndexerKeywords.INDEX_META_TYPE.value,
+                "indexed": 0,
+                "state": IndexerKeywords.INDEX_META_IN_PROGRESS.value,
+                "index_configuration": index_configuration,
+                "created_on": created_on,
+                "updated_on": created_on,
+                "task_id": None,
+                "conversation_id": None,
+            }
+            metadata["history"] = json.dumps([metadata])
+            index_meta_doc = Document(page_content=f"{IndexerKeywords.INDEX_META_TYPE.value}_{index_name}", metadata=metadata)
+            add_documents(vectorstore=self.vectorstore, documents=[index_meta_doc])
 
-    def index_meta_update(self,
-        index_meta_raw = super().get_index_meta(
+    def index_meta_update(self, index_name: str, state: str, result: int):
+        index_meta_raw = super().get_index_meta(index_name)
         from ..runtime.langchain.interfaces.llm_processor import add_documents
         #
         if index_meta_raw:
@@ -495,6 +490,20 @@ class BaseIndexerToolkit(VectorStoreWrapperBase):
             metadata["indexed"] = result
             metadata["state"] = state
             metadata["updated_on"] = time.time()
+            #
+            history_raw = metadata.pop("history", "[]")
+            try:
+                history = json.loads(history_raw) if history_raw.strip() else []
+                # replace the last history item with updated metadata
+                if history and isinstance(history, list):
+                    history[-1] = metadata
+                else:
+                    history = [metadata]
+            except (json.JSONDecodeError, TypeError):
+                logger.warning(f"Failed to load index history: {history_raw}. Create new with only current item.")
+                history = [metadata]
+            #
+            metadata["history"] = json.dumps(history)
             index_meta_doc = Document(page_content=index_meta_raw.get("content", ""), metadata=metadata)
             add_documents(vectorstore=self.vectorstore, documents=[index_meta_doc], ids=[index_meta_raw.get("id")])
 
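The index_meta history is kept as a JSON string inside the document metadata, so each update re-parses it, rewrites the last entry, and serialises it back. A stdlib-only sketch of that round trip:

    import json
    import time

    # Initial snapshot, as in index_meta_init: the metadata also stores its own history.
    metadata = {"collection": "docs1", "state": "in_progress", "indexed": 0}
    metadata["history"] = json.dumps([dict(metadata)])

    # Later, as in index_meta_update: refresh counters and rewrite the last history item.
    metadata.update({"state": "completed", "indexed": 12, "updated_on": time.time()})
    history_raw = metadata.pop("history", "[]")
    history = json.loads(history_raw) if history_raw.strip() else []
    if history and isinstance(history, list):
        history[-1] = metadata
    else:
        history = [metadata]
    metadata["history"] = json.dumps(history)
    print(metadata["history"])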
alita_sdk/tools/code_indexer_toolkit.py
CHANGED

@@ -1,5 +1,6 @@
 import ast
 import fnmatch
+import json
 import logging
 from typing import Optional, List, Generator
 
@@ -14,14 +15,14 @@ logger = logging.getLogger(__name__)
 
 
 class CodeIndexerToolkit(BaseIndexerToolkit):
-    def _get_indexed_data(self,
+    def _get_indexed_data(self, index_name: str):
         if not self.vector_adapter:
             raise ToolException("Vector adapter is not initialized. "
                                 "Check your configuration: embedding_model and vectorstore_type.")
-        return self.vector_adapter.get_code_indexed_data(self,
+        return self.vector_adapter.get_code_indexed_data(self, index_name)
 
     def key_fn(self, document: Document):
-        return document.metadata.get(
+        return document.metadata.get("filename")
 
     def compare_fn(self, document: Document, idx_data):
         return (document.metadata.get('commit_hash') and
@@ -46,7 +47,7 @@ class CodeIndexerToolkit(BaseIndexerToolkit):
         )
 
     def _extend_data(self, documents: Generator[Document, None, None]):
-        yield from
+        yield from documents
 
     def _index_tool_params(self):
         """Return the parameters for indexing data."""
@@ -117,6 +118,15 @@
                 if not file_content:
                     # empty file, skip
                     continue
+                #
+                # ensure file content is a string
+                if isinstance(file_content, bytes):
+                    file_content = file_content.decode("utf-8", errors="ignore")
+                elif isinstance(file_content, dict) and file.endswith('.json'):
+                    file_content = json.dumps(file_content)
+                elif not isinstance(file_content, str):
+                    file_content = str(file_content)
+                #
                 # hash the file content to ensure uniqueness
                 import hashlib
                 file_hash = hashlib.sha256(file_content.encode("utf-8")).hexdigest()
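The new normalisation step guarantees that whatever the loader returned (bytes, parsed JSON, or another object) is turned into a string before it is hashed and chunked. A standalone sketch of the same idea:

    import hashlib
    import json

    def normalize_content(file_content, file_name):
        # Mirror of the added block: always hand a str to the hashing/chunking code.
        if isinstance(file_content, bytes):
            return file_content.decode("utf-8", errors="ignore")
        if isinstance(file_content, dict) and file_name.endswith(".json"):
            return json.dumps(file_content)
        if not isinstance(file_content, str):
            return str(file_content)
        return file_content

    content = normalize_content({"a": 1}, "config.json")
    print(hashlib.sha256(content.encode("utf-8")).hexdigest())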
@@ -127,7 +137,7 @@
             self._log_tool_event(message=f"{idx} out of {total_files} files have been read", tool_name="loader")
             self._log_tool_event(message=f"{len(_files)} have been read", tool_name="loader")
 
-            return file_content_generator()
+            return parse_code_files_for_db(file_content_generator())
 
         def __handle_get_files(self, path: str, branch: str):
             """