alita-sdk 0.3.374__py3-none-any.whl → 0.3.423__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of alita-sdk might be problematic.

Files changed (51)
  1. alita_sdk/configurations/bitbucket.py +95 -0
  2. alita_sdk/configurations/confluence.py +96 -1
  3. alita_sdk/configurations/gitlab.py +79 -0
  4. alita_sdk/configurations/jira.py +103 -0
  5. alita_sdk/configurations/testrail.py +88 -0
  6. alita_sdk/configurations/xray.py +93 -0
  7. alita_sdk/configurations/zephyr_enterprise.py +93 -0
  8. alita_sdk/configurations/zephyr_essential.py +75 -0
  9. alita_sdk/runtime/clients/client.py +3 -2
  10. alita_sdk/runtime/clients/sandbox_client.py +8 -0
  11. alita_sdk/runtime/langchain/assistant.py +56 -40
  12. alita_sdk/runtime/langchain/constants.py +4 -0
  13. alita_sdk/runtime/langchain/document_loaders/AlitaDocxMammothLoader.py +315 -3
  14. alita_sdk/runtime/langchain/document_loaders/AlitaJSONLoader.py +4 -1
  15. alita_sdk/runtime/langchain/document_loaders/constants.py +28 -12
  16. alita_sdk/runtime/langchain/langraph_agent.py +92 -28
  17. alita_sdk/runtime/langchain/utils.py +24 -4
  18. alita_sdk/runtime/toolkits/application.py +8 -1
  19. alita_sdk/runtime/toolkits/tools.py +80 -49
  20. alita_sdk/runtime/tools/__init__.py +7 -2
  21. alita_sdk/runtime/tools/application.py +7 -0
  22. alita_sdk/runtime/tools/function.py +28 -23
  23. alita_sdk/runtime/tools/graph.py +10 -4
  24. alita_sdk/runtime/tools/image_generation.py +104 -8
  25. alita_sdk/runtime/tools/llm.py +146 -114
  26. alita_sdk/runtime/tools/sandbox.py +166 -63
  27. alita_sdk/runtime/tools/vectorstore.py +22 -21
  28. alita_sdk/runtime/tools/vectorstore_base.py +16 -15
  29. alita_sdk/runtime/utils/utils.py +1 -0
  30. alita_sdk/tools/__init__.py +43 -31
  31. alita_sdk/tools/ado/work_item/ado_wrapper.py +17 -8
  32. alita_sdk/tools/base_indexer_toolkit.py +102 -93
  33. alita_sdk/tools/code_indexer_toolkit.py +15 -5
  34. alita_sdk/tools/confluence/api_wrapper.py +30 -8
  35. alita_sdk/tools/confluence/loader.py +10 -0
  36. alita_sdk/tools/elitea_base.py +22 -22
  37. alita_sdk/tools/gitlab/api_wrapper.py +8 -9
  38. alita_sdk/tools/jira/api_wrapper.py +1 -1
  39. alita_sdk/tools/non_code_indexer_toolkit.py +2 -2
  40. alita_sdk/tools/openapi/__init__.py +10 -1
  41. alita_sdk/tools/qtest/api_wrapper.py +298 -51
  42. alita_sdk/tools/sharepoint/api_wrapper.py +104 -33
  43. alita_sdk/tools/sharepoint/authorization_helper.py +175 -1
  44. alita_sdk/tools/sharepoint/utils.py +8 -2
  45. alita_sdk/tools/utils/content_parser.py +27 -16
  46. alita_sdk/tools/vector_adapters/VectorStoreAdapter.py +38 -25
  47. {alita_sdk-0.3.374.dist-info → alita_sdk-0.3.423.dist-info}/METADATA +1 -1
  48. {alita_sdk-0.3.374.dist-info → alita_sdk-0.3.423.dist-info}/RECORD +51 -51
  49. {alita_sdk-0.3.374.dist-info → alita_sdk-0.3.423.dist-info}/WHEEL +0 -0
  50. {alita_sdk-0.3.374.dist-info → alita_sdk-0.3.423.dist-info}/licenses/LICENSE +0 -0
  51. {alita_sdk-0.3.374.dist-info → alita_sdk-0.3.423.dist-info}/top_level.txt +0 -0

alita_sdk/tools/__init__.py
@@ -90,62 +90,74 @@ available_count = len(AVAILABLE_TOOLS)
 total_attempted = len(AVAILABLE_TOOLS) + len(FAILED_IMPORTS)
 logger.info(f"Tool imports completed: {available_count}/{total_attempted} successful")
 
+
 def get_tools(tools_list, alita, llm, store: Optional[BaseStore] = None, *args, **kwargs):
     tools = []
+
     for tool in tools_list:
-        # validate tool name syntax - it cannot be started with _
-        for tool_name in tool.get('settings', {}).get('selected_tools', []):
-            if isinstance(tool_name, str) and tool_name.startswith('_'):
-                raise ValueError(f"Tool name '{tool_name}' from toolkit '{tool.get('type', '')}' cannot start with '_'")
-
-        tool['settings']['alita'] = alita
-        tool['settings']['llm'] = llm
-        tool['settings']['store'] = store
+        settings = tool.get('settings')
+
+        # Skip tools without settings early
+        if not settings:
+            logger.warning(f"Tool '{tool.get('type', '')}' has no settings, skipping...")
+            continue
+
+        # Validate tool names once
+        selected_tools = settings.get('selected_tools', [])
+        invalid_tools = [name for name in selected_tools if isinstance(name, str) and name.startswith('_')]
+        if invalid_tools:
+            raise ValueError(f"Tool names {invalid_tools} from toolkit '{tool.get('type', '')}' cannot start with '_'")
+
+        # Cache tool type and add common settings
         tool_type = tool['type']
+        settings['alita'] = alita
+        settings['llm'] = llm
+        settings['store'] = store
+
+        # Set pgvector collection schema if present
+        if settings.get('pgvector_configuration'):
+            settings['pgvector_configuration']['collection_schema'] = str(tool['id'])
 
-        # Handle special cases for ADO tools
+        # Handle ADO special cases
         if tool_type in ['ado_boards', 'ado_wiki', 'ado_plans']:
             tools.extend(AVAILABLE_TOOLS['ado']['get_tools'](tool_type, tool))
+            continue
 
-        # Check if tool is available and has get_tools function
-        elif tool_type in AVAILABLE_TOOLS and 'get_tools' in AVAILABLE_TOOLS[tool_type]:
+        # Handle ADO repos aliases
+        if tool_type in ['ado_repos', 'azure_devops_repos'] and 'ado_repos' in AVAILABLE_TOOLS:
             try:
-                get_tools_func = AVAILABLE_TOOLS[tool_type]['get_tools']
-                tools.extend(get_tools_func(tool))
-
+                tools.extend(AVAILABLE_TOOLS['ado_repos']['get_tools'](tool))
             except Exception as e:
-                logger.error(f"Error getting tools for {tool_type}: {e}")
-                raise ToolException(f"Error getting tools for {tool_type}: {e}")
+                logger.error(f"Error getting ADO repos tools: {e}")
+            continue
 
-        # Handle ADO repos special case (it might be requested as azure_devops_repos)
-        elif tool_type in ['ado_repos', 'azure_devops_repos'] and 'ado_repos' in AVAILABLE_TOOLS:
+        # Handle standard tools
+        if tool_type in AVAILABLE_TOOLS and 'get_tools' in AVAILABLE_TOOLS[tool_type]:
             try:
-                get_tools_func = AVAILABLE_TOOLS['ado_repos']['get_tools']
-                tools.extend(get_tools_func(tool))
+                tools.extend(AVAILABLE_TOOLS[tool_type]['get_tools'](tool))
             except Exception as e:
-                logger.error(f"Error getting ADO repos tools: {e}")
+                logger.error(f"Error getting tools for {tool_type}: {e}")
+                raise ToolException(f"Error getting tools for {tool_type}: {e}")
+            continue
 
         # Handle custom modules
-        elif tool.get("settings", {}).get("module"):
+        if settings.get("module"):
             try:
-                settings = tool.get("settings", {})
                 mod = import_module(settings.pop("module"))
                 tkitclass = getattr(mod, settings.pop("class"))
-                #
-                get_toolkit_params = tool["settings"].copy()
+                get_toolkit_params = settings.copy()
                 get_toolkit_params["name"] = tool.get("name")
-                #
                 toolkit = tkitclass.get_toolkit(**get_toolkit_params)
                 tools.extend(toolkit.get_tools())
             except Exception as e:
                 logger.error(f"Error in getting custom toolkit: {e}")
+            continue
 
+        # Tool not available
+        if tool_type in FAILED_IMPORTS:
+            logger.warning(f"Tool '{tool_type}' is not available: {FAILED_IMPORTS[tool_type]}")
         else:
-            # Tool not available or not found
-            if tool_type in FAILED_IMPORTS:
-                logger.warning(f"Tool '{tool_type}' is not available: {FAILED_IMPORTS[tool_type]}")
-            else:
-                logger.warning(f"Unknown tool type: {tool_type}")
+            logger.warning(f"Unknown tool type: {tool_type}")
 
     return tools
 
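The reworked get_tools loop expects each toolkit entry to carry a settings dict and now skips entries without one instead of failing later. A minimal sketch of the entry shape it consumes and the up-front name validation; the toolkit type, id, and tool names below are hypothetical examples, not values taken from this release:

    # Hypothetical toolkit entry of the shape consumed by get_tools()
    tool_entry = {
        "type": "confluence",                     # looked up in AVAILABLE_TOOLS
        "id": 42,                                 # str(id) becomes the pgvector collection_schema
        "name": "my_confluence",                  # forwarded to custom toolkits as "name"
        "settings": {
            "selected_tools": ["search_pages"],   # names must not start with "_"
            "pgvector_configuration": {},
        },
    }

    # Same validation the new loop performs before wiring in alita/llm/store
    selected = tool_entry["settings"].get("selected_tools", [])
    invalid = [n for n in selected if isinstance(n, str) and n.startswith("_")]
    if invalid:
        raise ValueError(f"Tool names {invalid} cannot start with '_'")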

alita_sdk/tools/ado/work_item/ado_wrapper.py
@@ -329,11 +329,14 @@ class AzureDevOpsApiWrapper(NonCodeIndexerToolkit):
         parsed_item.update(fields_data)
 
         # extract relations if any
-        relations_data = work_item.relations
+        relations_data = None
+        if expand and str(expand).lower() in ("relations", "all"):
+            try:
+                relations_data = getattr(work_item, 'relations', None)
+            except KeyError:
+                relations_data = None
         if relations_data:
-            parsed_item['relations'] = []
-            for relation in relations_data:
-                parsed_item['relations'].append(relation.as_dict())
+            parsed_item['relations'] = [relation.as_dict() for relation in relations_data]
 
         if parse_attachments:
             # describe images in work item fields if present
@@ -344,13 +347,19 @@ class AzureDevOpsApiWrapper(NonCodeIndexerToolkit):
                 for img in images:
                     src = img.get('src')
                     if src:
-                        description = self.parse_attachment_by_url(src, image_description_prompt)
+                        description = self.parse_attachment_by_url(src, image_description_prompt=image_description_prompt)
                         img['image-description'] = description
                 parsed_item[field_name] = str(soup)
             # parse attached documents if present
-            if parsed_item['relations']:
-                for attachment in parsed_item['relations']:
-                    attachment['content'] = self.parse_attachment_by_url(attachment['url'], attachment['attributes']['name'], image_description_prompt)
+            for relation in parsed_item.get('relations', []):
+                # Only process actual file attachments
+                if relation.get('rel') == 'AttachedFile':
+                    file_name = relation.get('attributes', {}).get('name')
+                    if file_name:
+                        try:
+                            relation['content'] = self.parse_attachment_by_url(relation['url'], file_name, image_description_prompt=image_description_prompt)
+                        except Exception as att_e:
+                            logger.warning(f"Failed to parse attachment {file_name}: {att_e}")
 
 
         return parsed_item
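Relations are now only read when the caller asked for them via expand, and attachment parsing is limited to relations of type AttachedFile. A rough standalone illustration of the new guard, using a hypothetical stub in place of the Azure DevOps SDK work item object:

    # WorkItemStub is a hypothetical stand-in for the Azure DevOps SDK object;
    # the SDK may raise KeyError when relations were not expanded server-side.
    class WorkItemStub:
        relations = None

    def extract_relations(work_item, expand):
        relations_data = None
        if expand and str(expand).lower() in ("relations", "all"):
            try:
                relations_data = getattr(work_item, "relations", None)
            except KeyError:
                relations_data = None
        return [r.as_dict() for r in relations_data] if relations_data else []

    print(extract_relations(WorkItemStub(), expand="relations"))  # -> []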

alita_sdk/tools/base_indexer_toolkit.py
@@ -7,7 +7,6 @@ from typing import Any, Optional, List, Dict, Generator
 from langchain_core.documents import Document
 from pydantic import create_model, Field, SecretStr
 
-from .utils import make_json_serializable
 from .utils.content_parser import file_extension_by_chunker, process_document_by_type
 from .vector_adapters.VectorStoreAdapter import VectorStoreAdapterFactory
 from ..runtime.langchain.document_loaders.constants import loaders_allowed_to_override
@@ -19,19 +18,19 @@ logger = logging.getLogger(__name__)
 # Base Vector Store Schema Models
 BaseIndexParams = create_model(
     "BaseIndexParams",
-    collection_suffix=(str, Field(description="Suffix for collection name (max 7 characters) used to separate datasets", min_length=1, max_length=7)),
+    index_name=(str, Field(description="Index name (max 7 characters)", min_length=1, max_length=7)),
 )
 
 RemoveIndexParams = create_model(
     "RemoveIndexParams",
-    collection_suffix=(Optional[str], Field(description="Optional suffix for collection name (max 7 characters)", default="", max_length=7)),
+    index_name=(Optional[str], Field(description="Optional index name (max 7 characters)", default="", max_length=7)),
 )
 
 BaseSearchParams = create_model(
     "BaseSearchParams",
     query=(str, Field(description="Query text to search in the index")),
-    collection_suffix=(Optional[str], Field(
-        description="Optional suffix for collection name (max 7 characters). Leave empty to search across all datasets",
+    index_name=(Optional[str], Field(
+        description="Optional index name (max 7 characters). Leave empty to search across all datasets",
         default="", max_length=7)),
     filter=(Optional[dict | str], Field(
         description="Filter to apply to the search results. Can be a dictionary or a JSON string.",
@@ -61,7 +60,7 @@ BaseSearchParams = create_model(
 BaseStepbackSearchParams = create_model(
     "BaseStepbackSearchParams",
     query=(str, Field(description="Query text to search in the index")),
-    collection_suffix=(Optional[str], Field(description="Optional suffix for collection name (max 7 characters)", default="", max_length=7)),
+    index_name=(Optional[str], Field(description="Optional index name (max 7 characters)", default="", max_length=7)),
     messages=(Optional[List], Field(description="Chat messages for stepback search context", default=[])),
     filter=(Optional[dict | str], Field(
         description="Filter to apply to the search results. Can be a dictionary or a JSON string.",
@@ -111,7 +110,7 @@ class BaseIndexerToolkit(VectorStoreWrapperBase):
     def __init__(self, **kwargs):
         conn = kwargs.get('connection_string', None)
         connection_string = conn.get_secret_value() if isinstance(conn, SecretStr) else conn
-        collection_name = kwargs.get('collection_name')
+        collection_name = kwargs.get('collection_schema')
 
         if 'vectorstore_type' not in kwargs:
             kwargs['vectorstore_type'] = 'PGVector'
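The argument schemas now expose index_name instead of collection_suffix while keeping the same 1-7 character constraint. A small usage sketch of the regenerated model, assuming pydantic v2 semantics as used elsewhere in the package:

    from pydantic import Field, ValidationError, create_model

    # Re-created here only for illustration; the real model lives in
    # alita_sdk/tools/base_indexer_toolkit.py
    BaseIndexParams = create_model(
        "BaseIndexParams",
        index_name=(str, Field(description="Index name (max 7 characters)",
                               min_length=1, max_length=7)),
    )

    BaseIndexParams(index_name="docs")            # accepted
    try:
        BaseIndexParams(index_name="waytoolong")  # 10 characters, rejected
    except ValidationError as e:
        print(e.errors()[0]["type"])              # string_too_long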
@@ -151,40 +150,46 @@ class BaseIndexerToolkit(VectorStoreWrapperBase):
         yield from ()
 
     def index_data(self, **kwargs):
-        collection_suffix = kwargs.get("collection_suffix")
-        progress_step = kwargs.get("progress_step")
+        index_name = kwargs.get("index_name")
         clean_index = kwargs.get("clean_index")
         chunking_tool = kwargs.get("chunking_tool")
         chunking_config = kwargs.get("chunking_config")
+        result = {"count": 0}
         #
-        if clean_index:
-            self._clean_index(collection_suffix)
-        #
-        self.index_meta_init(collection_suffix, kwargs)
-        #
-        self._log_tool_event(f"Indexing data into collection with suffix '{collection_suffix}'. It can take some time...")
-        self._log_tool_event(f"Loading the documents to index...{kwargs}")
-        documents = self._base_loader(**kwargs)
-        documents = list(documents) # consume/exhaust generator to count items
-        documents_count = len(documents)
-        documents = (doc for doc in documents)
-        self._log_tool_event(f"Base documents were pre-loaded. "
-                             f"Search for possible document duplicates and remove them from the indexing list...")
-        documents = self._reduce_duplicates(documents, collection_suffix)
-        self._log_tool_event(f"Duplicates were removed. "
-                             f"Processing documents to collect dependencies and prepare them for indexing...")
-        result = self._save_index_generator(documents, documents_count, chunking_tool, chunking_config, collection_suffix=collection_suffix, progress_step=progress_step)
-        #
-        self.index_meta_update(collection_suffix, IndexerKeywords.INDEX_META_COMPLETED.value, result)
-        #
-        return {"status": "ok", "message": f"successfully indexed {result} documents"}
-
-    def _save_index_generator(self, base_documents: Generator[Document, None, None], base_total: int, chunking_tool, chunking_config, collection_suffix: Optional[str] = None, progress_step: int = 20):
+        try:
+            if clean_index:
+                self._clean_index(index_name)
+            #
+            self.index_meta_init(index_name, kwargs)
+            #
+            self._log_tool_event(f"Indexing data into collection with suffix '{index_name}'. It can take some time...")
+            self._log_tool_event(f"Loading the documents to index...{kwargs}")
+            documents = self._base_loader(**kwargs)
+            documents = list(documents) # consume/exhaust generator to count items
+            documents_count = len(documents)
+            documents = (doc for doc in documents)
+            self._log_tool_event(f"Base documents were pre-loaded. "
+                                 f"Search for possible document duplicates and remove them from the indexing list...")
+            documents = self._reduce_duplicates(documents, index_name)
+            self._log_tool_event(f"Duplicates were removed. "
+                                 f"Processing documents to collect dependencies and prepare them for indexing...")
+            self._save_index_generator(documents, documents_count, chunking_tool, chunking_config, index_name=index_name, result=result)
+            #
+            results_count = result["count"]
+            self.index_meta_update(index_name, IndexerKeywords.INDEX_META_COMPLETED.value, results_count)
+            #
+            return {"status": "ok", "message": f"successfully indexed {results_count} documents" if results_count > 0
+                    else "no new documents to index"}
+        except Exception as e:
+            self.index_meta_update(index_name, IndexerKeywords.INDEX_META_FAILED.value, result["count"])
+            raise e
+
+
+    def _save_index_generator(self, base_documents: Generator[Document, None, None], base_total: int, chunking_tool, chunking_config, result, index_name: Optional[str] = None):
         self._log_tool_event(f"Base documents are ready for indexing. {base_total} base documents in total to index.")
         from ..runtime.langchain.interfaces.llm_processor import add_documents
         #
         base_doc_counter = 0
-        total_counter = 0
         pg_vector_add_docs_chunk = []
         for base_doc in base_documents:
             base_doc_counter += 1
@@ -211,12 +216,12 @@ class BaseIndexerToolkit(VectorStoreWrapperBase):
             if 'id' not in doc.metadata or 'updated_on' not in doc.metadata:
                 logger.warning(f"Document is missing required metadata field 'id' or 'updated_on': {doc.metadata}")
             #
-            # if collection_suffix is provided, add it to metadata of each document
-            if collection_suffix:
+            # if index_name is provided, add it to metadata of each document
+            if index_name:
                 if not doc.metadata.get('collection'):
-                    doc.metadata['collection'] = collection_suffix
+                    doc.metadata['collection'] = index_name
                 else:
-                    doc.metadata['collection'] += f";{collection_suffix}"
+                    doc.metadata['collection'] += f";{index_name}"
             #
             try:
                 pg_vector_add_docs_chunk.append(doc)
@@ -232,10 +237,9 @@ class BaseIndexerToolkit(VectorStoreWrapperBase):
             msg = f"Indexed base document #{base_doc_counter} out of {base_total} (with {dependent_docs_counter} dependencies)."
             logger.debug(msg)
             self._log_tool_event(msg)
-            total_counter += dependent_docs_counter
+            result["count"] += dependent_docs_counter
         if pg_vector_add_docs_chunk:
             add_documents(vectorstore=self.vectorstore, documents=pg_vector_add_docs_chunk)
-        return total_counter
 
     def _apply_loaders_chunkers(self, documents: Generator[Document, None, None], chunking_tool: str=None, chunking_config=None) -> Generator[Document, None, None]:
         from ..tools.chunkers import __all__ as chunkers
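index_data now threads a mutable result dict into _save_index_generator instead of relying on its return value, so the running count is still available when an exception aborts indexing partway and the failure state can record how much was written. A tiny sketch of that pattern in isolation:

    from typing import Dict, Iterator

    def save_all(docs: Iterator[str], result: Dict[str, int]) -> None:
        for doc in docs:
            if doc == "boom":                 # simulated mid-stream failure
                raise RuntimeError("indexing failed")
            result["count"] += 1

    result = {"count": 0}
    try:
        save_all(iter(["a", "b", "boom"]), result)
    except RuntimeError:
        pass
    print(result["count"])                    # 2 documents made it in before the failure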
@@ -295,12 +299,12 @@ class BaseIndexerToolkit(VectorStoreWrapperBase):
     def _reduce_duplicates(
             self,
             documents: Generator[Any, None, None],
-            collection_suffix: str,
+            index_name: str,
             log_msg: str = "Verification of documents to index started"
     ) -> Generator[Document, None, None]:
         """Generic duplicate reduction logic for documents."""
         self._log_tool_event(log_msg, tool_name="index_documents")
-        indexed_data = self._get_indexed_data(collection_suffix)
+        indexed_data = self._get_indexed_data(index_name)
         indexed_keys = set(indexed_data.keys())
         if not indexed_keys:
             self._log_tool_event("Vectorstore is empty, indexing all incoming documents", tool_name="index_documents")
@@ -312,7 +316,7 @@ class BaseIndexerToolkit(VectorStoreWrapperBase):
         for document in documents:
             key = self.key_fn(document)
             key = key if isinstance(key, str) else str(key)
-            if key in indexed_keys and collection_suffix == indexed_data[key]['metadata'].get('collection'):
+            if key in indexed_keys and index_name == indexed_data[key]['metadata'].get('collection'):
                 if self.compare_fn(document, indexed_data[key]):
                     continue
             yield document
@@ -327,7 +331,7 @@ class BaseIndexerToolkit(VectorStoreWrapperBase):
         )
         self.vectorstore.delete(ids=list(docs_to_remove))
 
-    def _get_indexed_data(self, collection_suffix: str):
+    def _get_indexed_data(self, index_name: str):
         raise NotImplementedError("Subclasses must implement this method")
 
     def key_fn(self, document: Document):
@@ -339,20 +343,20 @@ class BaseIndexerToolkit(VectorStoreWrapperBase):
     def remove_ids_fn(self, idx_data, key: str):
         raise NotImplementedError("Subclasses must implement this method")
 
-    def remove_index(self, collection_suffix: str = ""):
+    def remove_index(self, index_name: str = ""):
         """Cleans the indexed data in the collection."""
-        super()._clean_collection(collection_suffix=collection_suffix)
-        return (f"Collection '{collection_suffix}' has been removed from the vector store.\n"
-                f"Available collections: {self.list_collections()}") if collection_suffix \
+        super()._clean_collection(index_name=index_name)
+        return (f"Collection '{index_name}' has been removed from the vector store.\n"
+                f"Available collections: {self.list_collections()}") if index_name \
             else "All collections have been removed from the vector store."
 
-    def _build_collection_filter(self, filter: dict | str, collection_suffix: str = "") -> dict:
+    def _build_collection_filter(self, filter: dict | str, index_name: str = "") -> dict:
         """Builds a filter for the collection based on the provided suffix."""
 
         filter = filter if isinstance(filter, dict) else json.loads(filter)
-        if collection_suffix:
+        if index_name:
             filter.update({"collection": {
-                "$eq": collection_suffix.strip()
+                "$eq": index_name.strip()
             }})
 
         if filter:
@@ -375,7 +379,7 @@ class BaseIndexerToolkit(VectorStoreWrapperBase):
 
     def search_index(self,
                      query: str,
-                     collection_suffix: str = "",
+                     index_name: str = "",
                      filter: dict | str = {}, cut_off: float = 0.5,
                      search_top: int = 10, reranker: dict = {},
                      full_text_search: Optional[Dict[str, Any]] = None,
@@ -383,13 +387,13 @@ class BaseIndexerToolkit(VectorStoreWrapperBase):
                      extended_search: Optional[List[str]] = None,
                      **kwargs):
         """ Searches indexed documents in the vector store."""
-        # build filter on top of collection_suffix
+        # build filter on top of index_name
 
         available_collections = super().list_collections()
-        if collection_suffix and collection_suffix not in available_collections:
-            return f"Collection '{collection_suffix}' not found. Available collections: {available_collections}"
+        if index_name and index_name not in available_collections:
+            return f"Collection '{index_name}' not found. Available collections: {available_collections}"
 
-        filter = self._build_collection_filter(filter, collection_suffix)
+        filter = self._build_collection_filter(filter, index_name)
         found_docs = super().search_documents(
             query,
             doctype=self.doctype,
@@ -406,7 +410,7 @@ class BaseIndexerToolkit(VectorStoreWrapperBase):
     def stepback_search_index(self,
                               query: str,
                               messages: List[Dict[str, Any]] = [],
-                              collection_suffix: str = "",
+                              index_name: str = "",
                              filter: dict | str = {}, cut_off: float = 0.5,
                              search_top: int = 10, reranker: dict = {},
                              full_text_search: Optional[Dict[str, Any]] = None,
@@ -414,7 +418,7 @@ class BaseIndexerToolkit(VectorStoreWrapperBase):
                              extended_search: Optional[List[str]] = None,
                              **kwargs):
         """ Searches indexed documents in the vector store."""
-        filter = self._build_collection_filter(filter, collection_suffix)
+        filter = self._build_collection_filter(filter, index_name)
         found_docs = super().stepback_search(
             query,
             messages,
@@ -431,7 +435,7 @@ class BaseIndexerToolkit(VectorStoreWrapperBase):
     def stepback_summary_index(self,
                                query: str,
                                messages: List[Dict[str, Any]] = [],
-                               collection_suffix: str = "",
+                               index_name: str = "",
                                filter: dict | str = {}, cut_off: float = 0.5,
                                search_top: int = 10, reranker: dict = {},
                                full_text_search: Optional[Dict[str, Any]] = None,
@@ -440,7 +444,7 @@ class BaseIndexerToolkit(VectorStoreWrapperBase):
                                **kwargs):
         """ Generates a summary of indexed documents using stepback technique."""
 
-        filter = self._build_collection_filter(filter, collection_suffix)
+        filter = self._build_collection_filter(filter, index_name)
         return super().stepback_summary(
             query,
             messages,
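Across search_index, stepback_search_index, and stepback_summary_index the renamed index_name is folded into the metadata filter rather than selecting a different collection. A standalone restatement of what _build_collection_filter does after the rename, reimplemented here only for illustration:

    import json

    def build_collection_filter(filter_, index_name=""):
        # accept either a dict or a JSON string, then pin the collection
        filter_ = filter_ if isinstance(filter_, dict) else json.loads(filter_)
        if index_name:
            filter_.update({"collection": {"$eq": index_name.strip()}})
        return filter_

    print(build_collection_filter('{"author": "jdoe"}', index_name="docs"))
    # {'author': 'jdoe', 'collection': {'$eq': 'docs'}}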
@@ -453,41 +457,32 @@ class BaseIndexerToolkit(VectorStoreWrapperBase):
             extended_search=extended_search
         )
 
-    def index_meta_init(self, collection_suffix: str, index_configuration: dict[str, Any]):
-        index_meta_raw = super().get_index_meta(collection_suffix)
-        from ..runtime.langchain.interfaces.llm_processor import add_documents
-        created_on = time.time()
-        metadata = {
-            "collection": collection_suffix,
-            "type": IndexerKeywords.INDEX_META_TYPE.value,
-            "indexed": 0,
-            "state": IndexerKeywords.INDEX_META_IN_PROGRESS.value,
-            "index_configuration": index_configuration,
-            "created_on": created_on,
-            "updated_on": created_on,
-            "history": "[]",
-        }
-        index_meta_ids = None
-        #
-        if index_meta_raw:
-            history_raw = index_meta_raw.get("metadata", {}).get("history", "[]")
-            if isinstance(history_raw, str) and history_raw.strip():
-                try:
-                    history = json.loads(history_raw)
-                except (json.JSONDecodeError, TypeError):
-                    history = []
-            else:
-                history = []
-            new_history_item = {k: v for k, v in index_meta_raw.get("metadata", {}).items() if k != "history"}
-            history.append(new_history_item)
-            metadata["history"] = json.dumps(history)
-            index_meta_ids = [index_meta_raw.get("id")]
-        #
-        index_meta_doc = Document(page_content=f"{IndexerKeywords.INDEX_META_TYPE.value}_{collection_suffix}", metadata=metadata)
-        add_documents(vectorstore=self.vectorstore, documents=[index_meta_doc], ids=index_meta_ids)
+    def index_meta_init(self, index_name: str, index_configuration: dict[str, Any]):
+        index_meta = super().get_index_meta(index_name)
+        if not index_meta:
+            self._log_tool_event(
+                f"There is no existing index_meta for collection '{index_name}'. Initializing it.",
+                tool_name="index_data"
+            )
+            from ..runtime.langchain.interfaces.llm_processor import add_documents
+            created_on = time.time()
+            metadata = {
+                "collection": index_name,
+                "type": IndexerKeywords.INDEX_META_TYPE.value,
+                "indexed": 0,
+                "state": IndexerKeywords.INDEX_META_IN_PROGRESS.value,
+                "index_configuration": index_configuration,
+                "created_on": created_on,
+                "updated_on": created_on,
+                "task_id": None,
+                "conversation_id": None,
+            }
+            metadata["history"] = json.dumps([metadata])
+            index_meta_doc = Document(page_content=f"{IndexerKeywords.INDEX_META_TYPE.value}_{index_name}", metadata=metadata)
+            add_documents(vectorstore=self.vectorstore, documents=[index_meta_doc])
 
-    def index_meta_update(self, collection_suffix: str, state: str, result: int):
-        index_meta_raw = super().get_index_meta(collection_suffix)
+    def index_meta_update(self, index_name: str, state: str, result: int):
+        index_meta_raw = super().get_index_meta(index_name)
         from ..runtime.langchain.interfaces.llm_processor import add_documents
         #
         if index_meta_raw:
@@ -495,6 +490,20 @@ class BaseIndexerToolkit(VectorStoreWrapperBase):
             metadata["indexed"] = result
             metadata["state"] = state
             metadata["updated_on"] = time.time()
+            #
+            history_raw = metadata.pop("history", "[]")
+            try:
+                history = json.loads(history_raw) if history_raw.strip() else []
+                # replace the last history item with updated metadata
+                if history and isinstance(history, list):
+                    history[-1] = metadata
+                else:
+                    history = [metadata]
+            except (json.JSONDecodeError, TypeError):
+                logger.warning(f"Failed to load index history: {history_raw}. Create new with only current item.")
+                history = [metadata]
+            #
+            metadata["history"] = json.dumps(history)
             index_meta_doc = Document(page_content=index_meta_raw.get("content", ""), metadata=metadata)
             add_documents(vectorstore=self.vectorstore, documents=[index_meta_doc], ids=[index_meta_raw.get("id")])
 
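Index metadata history is still stored as a JSON string inside the document metadata, but the update path now replaces the last history entry with the final state instead of appending a new snapshot. A short sketch of that round-trip, assuming JSON-serializable metadata values:

    import json
    import time

    # initial snapshot written by index_meta_init-style code
    metadata = {"collection": "docs", "indexed": 0, "state": "in_progress",
                "created_on": time.time(), "updated_on": time.time()}
    metadata["history"] = json.dumps([dict(metadata)])

    # on completion (or failure) the last history item is replaced in place
    metadata.update(indexed=42, state="completed", updated_on=time.time())
    history = json.loads(metadata.pop("history") or "[]")
    history[-1] = dict(metadata)
    metadata["history"] = json.dumps(history)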

alita_sdk/tools/code_indexer_toolkit.py
@@ -1,5 +1,6 @@
 import ast
 import fnmatch
+import json
 import logging
 from typing import Optional, List, Generator
 
@@ -14,14 +15,14 @@ logger = logging.getLogger(__name__)
 
 
 class CodeIndexerToolkit(BaseIndexerToolkit):
-    def _get_indexed_data(self, collection_suffix: str):
+    def _get_indexed_data(self, index_name: str):
         if not self.vector_adapter:
             raise ToolException("Vector adapter is not initialized. "
                                 "Check your configuration: embedding_model and vectorstore_type.")
-        return self.vector_adapter.get_code_indexed_data(self, collection_suffix)
+        return self.vector_adapter.get_code_indexed_data(self, index_name)
 
     def key_fn(self, document: Document):
-        return document.metadata.get('id')
+        return document.metadata.get("filename")
 
     def compare_fn(self, document: Document, idx_data):
         return (document.metadata.get('commit_hash') and
@@ -46,7 +47,7 @@ class CodeIndexerToolkit(BaseIndexerToolkit):
         )
 
     def _extend_data(self, documents: Generator[Document, None, None]):
-        yield from parse_code_files_for_db(documents)
+        yield from documents
 
     def _index_tool_params(self):
         """Return the parameters for indexing data."""
@@ -117,6 +118,15 @@ class CodeIndexerToolkit(BaseIndexerToolkit):
                 if not file_content:
                     # empty file, skip
                     continue
+                #
+                # ensure file content is a string
+                if isinstance(file_content, bytes):
+                    file_content = file_content.decode("utf-8", errors="ignore")
+                elif isinstance(file_content, dict) and file.endswith('.json'):
+                    file_content = json.dumps(file_content)
+                elif not isinstance(file_content, str):
+                    file_content = str(file_content)
+                #
                 # hash the file content to ensure uniqueness
                 import hashlib
                 file_hash = hashlib.sha256(file_content.encode("utf-8")).hexdigest()
@@ -127,7 +137,7 @@ class CodeIndexerToolkit(BaseIndexerToolkit):
                 self._log_tool_event(message=f"{idx} out of {total_files} files have been read", tool_name="loader")
             self._log_tool_event(message=f"{len(_files)} have been read", tool_name="loader")
 
-        return file_content_generator()
+        return parse_code_files_for_db(file_content_generator())
 
     def __handle_get_files(self, path: str, branch: str):
         """