alita-sdk 0.3.230__py3-none-any.whl → 0.3.231__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

--- alita_sdk/runtime/tools/vectorstore.py
+++ alita_sdk/runtime/tools/vectorstore.py
@@ -210,38 +210,48 @@ class VectorStoreWrapper(BaseToolApiWrapper):
             tool_name="_remove_collection"
         )
 
-    def _get_indexed_ids(self, store):
+    def _get_indexed_ids(self, collection_suffix: Optional[str] = '') -> List[str]:
         """Get all indexed document IDs from vectorstore"""
 
         # Check if this is a PGVector store
-        if hasattr(store, 'session_maker') and hasattr(store, 'EmbeddingStore'):
-            return self._get_pgvector_indexed_ids(store)
+        if self._is_pgvector():
+            return self._get_pgvector_indexed_ids(collection_suffix)
         else:
             # Fall back to Chroma implementation
-            return self._get_chroma_indexed_ids(store)
+            # TODO: update filter by collection_suffix for Chroma
+            return self._get_chroma_indexed_ids(collection_suffix)
 
-    def _get_pgvector_indexed_ids(self, store):
+    def _get_pgvector_indexed_ids(self, collection_suffix: Optional[str] = ''):
         """Get all indexed document IDs from PGVector"""
         from sqlalchemy.orm import Session
+        from sqlalchemy import func
 
+        store = self.vectorstore
         try:
             with Session(store.session_maker.bind) as session:
-                ids = session.query(store.EmbeddingStore.id).all()
-                return [str(id_tuple[0]) for id_tuple in ids]
+                # Start building the query
+                query = session.query(store.EmbeddingStore.id)
+                # Apply filter only if collection_suffix is provided
+                if collection_suffix:
+                    query = query.filter(
+                        func.jsonb_extract_path_text(store.EmbeddingStore.cmetadata, 'collection') == collection_suffix
+                    )
+                ids = query.all()
+                return [str(id_tuple[0]) for id_tuple in ids]
         except Exception as e:
             logger.error(f"Failed to get indexed IDs from PGVector: {str(e)}")
             return []
 
-    def _get_chroma_indexed_ids(self, store):
+    def _get_chroma_indexed_ids(self, collection_suffix: Optional[str] = ''):
         """Get all indexed document IDs from Chroma"""
         try:
-            data = store.get(include=[])  # Only get IDs, no metadata
+            data = self.vectorstore.get(include=[])  # Only get IDs, no metadata
             return data.get('ids', [])
         except Exception as e:
             logger.error(f"Failed to get indexed IDs from Chroma: {str(e)}")
             return []
 
-    def _clean_collection(self):
+    def _clean_collection(self, collection_suffix: str = ''):
         """
         Clean the vectorstore collection by deleting all indexed data.
         """
@@ -251,33 +261,37 @@ class VectorStoreWrapper(BaseToolApiWrapper):
         )
         # This logic deletes all data from the vectorstore collection without removal of collection.
         # Collection itself remains available for future indexing.
-        self.vectoradapter.vectorstore.delete(ids=self._get_indexed_ids(self.vectoradapter.vectorstore))
+        self.vectorstore.delete(ids=self._get_indexed_ids(collection_suffix))
 
         self._log_data(
             f"Collection '{self.dataset}' has been cleaned. ",
             tool_name="_clean_collection"
         )
 
+    def _is_pgvector(self) -> bool:
+        """Check if the vectorstore is a PGVector store."""
+        return hasattr(self.vectorstore, 'session_maker') and hasattr(self.vectorstore, 'EmbeddingStore')
+
     # TODO: refactor to use common method for different vectorstores in a separate vectorstore wrappers
-    def _get_indexed_data(self, store):
+    def _get_indexed_data(self):
         """ Get all indexed data from vectorstore for non-code content """
 
         # Check if this is a PGVector store
-        if hasattr(store, 'session_maker') and hasattr(store, 'EmbeddingStore'):
-            return self._get_pgvector_indexed_data(store)
+        if self._is_pgvector():
+            return self._get_pgvector_indexed_data()
         else:
             # Fall back to original Chroma implementation
-            return self._get_chroma_indexed_data(store)
+            return self._get_chroma_indexed_data(self.vectorstore)
 
-    def _get_pgvector_indexed_data(self, store):
+    def _get_pgvector_indexed_data(self):
         """ Get all indexed data from PGVector for non-code content """
         from sqlalchemy.orm import Session
 
         result = {}
         try:
             self._log_data("Retrieving already indexed data from PGVector vectorstore",
-                           tool_name="index_documents")
-
+                           tool_name="get_indexed_data")
+            store = self.vectorstore
             with Session(store.session_maker.bind) as session:
                 docs = session.query(
                     store.EmbeddingStore.id,
@@ -320,7 +334,7 @@ class VectorStoreWrapper(BaseToolApiWrapper):
         result = {}
         try:
             self._log_data("Retrieving already indexed data from Chroma vectorstore",
-                           tool_name="index_documents")
+                           tool_name="get_indexed_data")
             data = store.get(include=['metadatas'])
 
             # Re-structure data to be more usable
@@ -349,21 +363,63 @@ class VectorStoreWrapper(BaseToolApiWrapper):
 
         return result
 
-    def _get_code_indexed_data(self, store) -> Dict[str, Dict[str, Any]]:
+    def _get_code_indexed_data(self) -> Dict[str, Dict[str, Any]]:
         """ Get all indexed data from vectorstore for code content """
 
         # get already indexed data
+        if self._is_pgvector():
+            result = self._get_pgvector_code_indexed_data()
+        else:
+            result = self._get_chroma_code_indexed_data()
+        return result
+
+    def _get_chroma_code_indexed_data(self) -> Dict[str, Dict[str, Any]]:
+        """Get all indexed code data from Chroma."""
         result = {}
         try:
-            self._log_data("Retrieving already indexed code data from vectorstore",
-                           tool_name="index_documents")
-            data = store.get(include=['metadatas'])
-            # re-structure data to be more usable
+            self._log_data("Retrieving already indexed code data from Chroma vectorstore",
+                           tool_name="index_code_data")
+            data = self.vectorstore.get(include=['metadatas'])
             for meta, db_id in zip(data['metadatas'], data['ids']):
-                filename = meta['filename']
+                filename = meta.get('filename')
+                commit_hash = meta.get('commit_hash')
+                if not filename:
+                    continue
+                if filename not in result:
+                    result[filename] = {
+                        'commit_hashes': [],
+                        'ids': []
+                    }
+                if commit_hash is not None:
+                    result[filename]['commit_hashes'].append(commit_hash)
+                result[filename]['ids'].append(db_id)
+        except Exception as e:
+            logger.error(f"Failed to get indexed code data from Chroma: {str(e)}. Continuing with empty index.")
+        return result
+
+    def _get_pgvector_code_indexed_data(self) -> Dict[str, Dict[str, Any]]:
+        """Get all indexed code data from PGVector."""
+        from sqlalchemy.orm import Session
+
+        result = {}
+        try:
+            self._log_data("Retrieving already indexed code data from PGVector vectorstore",
+                           tool_name="index_code_data")
+            store = self.vectorstore
+            with Session(store.session_maker.bind) as session:
+                docs = session.query(
+                    store.EmbeddingStore.id,
+                    store.EmbeddingStore.cmetadata
+                ).all()
+
+                for db_id, meta in docs:
+                    filename = meta.get('filename')
                     commit_hash = meta.get('commit_hash')
+                    if not filename:
+                        continue
                     if filename not in result:
                         result[filename] = {
+                            'metadata': meta,
                             'commit_hashes': [],
                             'ids': []
                         }
@@ -371,13 +427,64 @@ class VectorStoreWrapper(BaseToolApiWrapper):
                         result[filename]['commit_hashes'].append(commit_hash)
                     result[filename]['ids'].append(db_id)
         except Exception as e:
-            logger.error(f"Failed to get indexed code data from vectorstore: {str(e)}. Continuing with empty index.")
+            logger.error(f"Failed to get indexed code data from PGVector: {str(e)}. Continuing with empty index.")
         return result
 
+
+    def _add_to_collection(self, entry_id, new_collection_value):
+        """Add a new collection name to the `collection` key in the `metadata` column."""
+
+        from sqlalchemy import func
+        from sqlalchemy.orm import Session
+
+        store = self.vectorstore
+        try:
+            with Session(store.session_maker.bind) as session:
+                # Query the current value of the `collection` key
+                current_collection_query = session.query(
+                    func.jsonb_extract_path_text(store.EmbeddingStore.cmetadata, 'collection')
+                ).filter(store.EmbeddingStore.id == entry_id).scalar()
+
+                # If the `collection` key is NULL or doesn't contain the new value, update it
+                if current_collection_query is None:
+                    # If `collection` is NULL, initialize it with the new value
+                    session.query(store.EmbeddingStore).filter(
+                        store.EmbeddingStore.id == entry_id
+                    ).update(
+                        {
+                            store.EmbeddingStore.cmetadata: func.jsonb_set(
+                                func.coalesce(store.EmbeddingStore.cmetadata, '{}'),
+                                '{collection}',  # Path to the `collection` key
+                                f'"{new_collection_value}"',  # New value for the `collection` key
+                                True  # Create the key if it doesn't exist
+                            )
+                        }
+                    )
+                elif new_collection_value not in current_collection_query.split(";"):
+                    # If `collection` exists but doesn't contain the new value, append it
+                    updated_collection_value = f"{current_collection_query};{new_collection_value}"
+                    session.query(store.EmbeddingStore).filter(
+                        store.EmbeddingStore.id == entry_id
+                    ).update(
+                        {
+                            store.EmbeddingStore.cmetadata: func.jsonb_set(
+                                store.EmbeddingStore.cmetadata,
+                                '{collection}',  # Path to the `collection` key
+                                f'"{updated_collection_value}"',  # Concatenated value as a valid JSON string
+                                True  # Create the key if it doesn't exist
+                            )
+                        }
+                    )
+
+                session.commit()
+                logger.info(f"Successfully updated collection for entry ID {entry_id}.")
+        except Exception as e:
+            logger.error(f"Failed to update collection for entry ID {entry_id}: {str(e)}")
+
     def _reduce_duplicates(
         self,
         documents: Generator[Any, None, None],
-        store,
+        collection_suffix: str,
         get_indexed_data: Callable,
         key_fn: Callable,
         compare_fn: Callable,
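
Note: `_add_to_collection` stores several collection names under one JSONB key as a `;`-separated string. A small pure-Python sketch of that merge rule (no database involved; the helper name is illustrative):

    def merge_collection(current, new):
        # Mirrors the two jsonb_set branches above: initialize when the key
        # is NULL, append ';'-separated when the name is not yet present.
        if current is None:
            return new
        if new in current.split(";"):
            return current
        return f"{current};{new}"

    assert merge_collection(None, "docs1") == "docs1"
    assert merge_collection("docs1", "code2") == "docs1;code2"
    assert merge_collection("docs1;code2", "docs1") == "docs1;code2"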
@@ -386,7 +493,7 @@ class VectorStoreWrapper(BaseToolApiWrapper):
     ) -> List[Any]:
         """Generic duplicate reduction logic for documents."""
         self._log_data(log_msg, tool_name="index_documents")
-        indexed_data = get_indexed_data(store)
+        indexed_data = get_indexed_data()
         indexed_keys = set(indexed_data.keys())
         if not indexed_keys:
             self._log_data("Vectorstore is empty, indexing all incoming documents", tool_name="index_documents")
@@ -397,8 +504,15 @@ class VectorStoreWrapper(BaseToolApiWrapper):
 
         for document in documents:
             key = key_fn(document)
-            if key in indexed_keys:
+            if key in indexed_keys and collection_suffix == indexed_data[key]['metadata'].get('collection'):
                 if compare_fn(document, indexed_data[key]):
+                    # Disabled addition of new collection to already indexed documents
+                    # # check metadata.collection and update if needed
+                    # for update_collection_id in remove_ids_fn(indexed_data, key):
+                    #     self._add_to_collection(
+                    #         update_collection_id,
+                    #         collection_suffix
+                    #     )
                     continue
             final_docs.append(document)
             docs_to_remove.update(remove_ids_fn(indexed_data, key))
@@ -410,14 +524,14 @@ class VectorStoreWrapper(BaseToolApiWrapper):
                 f"Removing {len(docs_to_remove)} documents from vectorstore that are already indexed with different updated_on.",
                 tool_name="index_documents"
             )
-            store.delete(ids=list(docs_to_remove))
+            self.vectorstore.delete(ids=list(docs_to_remove))
 
         return final_docs
 
-    def _reduce_non_code_duplicates(self, documents: Generator[Any, None, None], store) -> List[Any]:
+    def _reduce_non_code_duplicates(self, documents: Generator[Any, None, None], collection_suffix: str) -> List[Any]:
         return self._reduce_duplicates(
             documents,
-            store,
+            collection_suffix,
             self._get_indexed_data,
             lambda doc: doc.metadata.get('id'),
             lambda doc, idx: (
@@ -434,10 +548,10 @@ class VectorStoreWrapper(BaseToolApiWrapper):
             log_msg="Verification of documents to index started"
         )
 
-    def _reduce_code_duplicates(self, documents: Generator[Any, None, None], store) -> List[Any]:
+    def _reduce_code_duplicates(self, documents: Generator[Any, None, None], collection_suffix: str) -> List[Any]:
         return self._reduce_duplicates(
             documents,
-            store,
+            collection_suffix,
             self._get_code_indexed_data,
             lambda doc: doc.metadata.get('filename'),
             lambda doc, idx: (
@@ -449,7 +563,7 @@ class VectorStoreWrapper(BaseToolApiWrapper):
             log_msg="Verification of code documents to index started"
         )
 
-    def index_documents(self, documents: Generator[Document, None, None], progress_step: int = 20, clean_index: bool = True, is_code: bool = False):
+    def index_documents(self, documents: Generator[Document, None, None], collection_suffix: str, progress_step: int = 20, clean_index: bool = True, is_code: bool = False):
         """ Index documents in the vectorstore.
 
         Args:
@@ -465,7 +579,7 @@ class VectorStoreWrapper(BaseToolApiWrapper):
             logger.info("Cleaning index before re-indexing all documents.")
             self._log_data("Cleaning index before re-indexing all documents. Previous index will be removed", tool_name="index_documents")
             try:
-                self._clean_collection()
+                self._clean_collection(collection_suffix)
                 self.vectoradapter.persist()
                 self.vectoradapter.vacuum()
                 self._log_data("Previous index has been removed",
@@ -476,8 +590,8 @@ class VectorStoreWrapper(BaseToolApiWrapper):
             documents = list(documents)
         else:
             # remove duplicates based on metadata 'id' and 'updated_on' or 'commit_hash' fields
-            documents = self._reduce_code_duplicates(documents, self.vectoradapter.vectorstore) if is_code \
-                else self._reduce_non_code_duplicates(documents, self.vectoradapter.vectorstore)
+            documents = self._reduce_code_duplicates(documents, collection_suffix) if is_code \
+                else self._reduce_non_code_duplicates(documents, collection_suffix)
 
         if not documents or len(documents) == 0:
             logger.info("No new documents to index after duplicate check.")
@@ -498,6 +612,15 @@ class VectorStoreWrapper(BaseToolApiWrapper):
         logger.debug(self.vectoradapter)
 
         documents = documents + list(dependent_docs_generator)
+
+        # if collection_suffix is provided, add it to metadata of each document
+        if collection_suffix:
+            for doc in documents:
+                if not doc.metadata.get('collection'):
+                    doc.metadata['collection'] = collection_suffix
+                else:
+                    doc.metadata['collection'] += f";{collection_suffix}"
+
         total_docs = len(documents)
         documents_count = 0
         _documents = []
@@ -511,8 +634,7 @@ class VectorStoreWrapper(BaseToolApiWrapper):
             try:
                 _documents.append(document)
                 if len(_documents) >= self.max_docs_per_add:
-                    add_documents(vectorstore=self.vectoradapter.vectorstore, documents=_documents)
-                    self.vectoradapter.persist()
+                    add_documents(vectorstore=self.vectorstore, documents=_documents)
                     _documents = []
 
                 percent = math.floor((documents_count / total_docs) * 100)
@@ -526,8 +648,7 @@ class VectorStoreWrapper(BaseToolApiWrapper):
                 logger.error(f"Error: {format_exc()}")
                 return {"status": "error", "message": f"Error: {format_exc()}"}
         if _documents:
-            add_documents(vectorstore=self.vectoradapter.vectorstore, documents=_documents)
-            self.vectoradapter.persist()
+            add_documents(vectorstore=self.vectorstore, documents=_documents)
         return {"status": "ok", "message": f"successfully indexed {documents_count} documents"}
 
     def search_documents(self, query:str, doctype: str = 'code',
@@ -562,7 +683,7 @@ class VectorStoreWrapper(BaseToolApiWrapper):
             }
 
         try:
-            document_items = self.vectoradapter.vectorstore.similarity_search_with_score(
+            document_items = self.vectorstore.similarity_search_with_score(
                 query, filter=document_filter, k=search_top
             )
             # Add document results to unique docs
@@ -595,7 +716,7 @@ class VectorStoreWrapper(BaseToolApiWrapper):
             }
 
         try:
-            chunk_items = self.vectoradapter.vectorstore.similarity_search_with_score(
+            chunk_items = self.vectorstore.similarity_search_with_score(
                 query, filter=chunk_filter, k=search_top
             )
 
@@ -628,7 +749,7 @@ class VectorStoreWrapper(BaseToolApiWrapper):
             }
 
         try:
-            fetch_items = self.vectoradapter.vectorstore.similarity_search_with_score(
+            fetch_items = self.vectorstore.similarity_search_with_score(
                 query, filter=doc_filter, k=1
             )
             if fetch_items:
@@ -642,7 +763,7 @@ class VectorStoreWrapper(BaseToolApiWrapper):
         else:
             # Default search behavior (unchanged)
             max_search_results = 30 if search_top * 3 > 30 else search_top * 3
-            vector_items = self.vectoradapter.vectorstore.similarity_search_with_score(
+            vector_items = self.vectorstore.similarity_search_with_score(
                 query, filter=filter, k=max_search_results
             )
 

--- alita_sdk/tools/elitea_base.py
+++ alita_sdk/tools/elitea_base.py
@@ -30,13 +30,13 @@ LoaderSchema = create_model(
 # Base Vector Store Schema Models
 BaseIndexParams = create_model(
     "BaseIndexParams",
-    collection_suffix=(Optional[str], Field(description="Optional suffix for collection name (max 7 characters)", default="", max_length=7)),
+    collection_suffix=(str, Field(description="Suffix for collection name (max 7 characters) used to separate datasets", min_length=1, max_length=7)),
     vectorstore_type=(Optional[str], Field(description="Vectorstore type (Chroma, PGVector, Elastic, etc.)", default="PGVector")),
 )
 
 BaseCodeIndexParams = create_model(
     "BaseCodeIndexParams",
-    collection_suffix=(Optional[str], Field(description="Optional suffix for collection name (max 7 characters)", default="", max_length=7)),
+    collection_suffix=(str, Field(description="Suffix for collection name (max 7 characters) used to separate datasets", min_length=1, max_length=7)),
     vectorstore_type=(Optional[str], Field(description="Vectorstore type (Chroma, PGVector, Elastic, etc.)", default="PGVector")),
     branch=(Optional[str], Field(description="Branch to index files from. Defaults to active branch if None.", default=None)),
     whitelist=(Optional[List[str]], Field(description="File extensions or paths to include. Defaults to all files if None.", default=None)),
@@ -51,7 +51,9 @@ RemoveIndexParams = create_model(
 BaseSearchParams = create_model(
     "BaseSearchParams",
     query=(str, Field(description="Query text to search in the index")),
-    collection_suffix=(Optional[str], Field(description="Optional suffix for collection name (max 7 characters)", default="", max_length=7)),
+    collection_suffix=(Optional[str], Field(
+        description="Optional suffix for collection name (max 7 characters). Leave empty to search across all datasets",
+        default="", max_length=7)),
     vectorstore_type=(Optional[str], Field(description="Vectorstore type (Chroma, PGVector, Elastic, etc.)", default="PGVector")),
     filter=(Optional[dict | str], Field(
         description="Filter to apply to the search results. Can be a dictionary or a JSON string.",
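
Note: `collection_suffix` changes from optional to required (1-7 characters) on the index schemas above. A quick sketch of how those Field constraints behave under pydantic v2 (the model name and values are illustrative):

    from pydantic import Field, ValidationError, create_model

    Params = create_model(
        "Params",
        collection_suffix=(str, Field(min_length=1, max_length=7)),
    )

    Params(collection_suffix="docs1")  # accepted
    try:
        Params(collection_suffix="toolongname")  # 11 characters > 7
    except ValidationError as err:
        print(err.errors()[0]["type"])  # string_too_long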
@@ -219,6 +221,7 @@ class BaseVectorStoreToolApiWrapper(BaseToolApiWrapper):
     embedding_model: Optional[str] = "HuggingFaceEmbeddings"
     embedding_model_params: Optional[Dict[str, Any]] = {"model_name": "sentence-transformers/all-MiniLM-L6-v2"}
     vectorstore_type: Optional[str] = "PGVector"
+    _vector_store: Optional[Any] = None
 
     def __init__(self, **kwargs):
         super().__init__(**kwargs)
@@ -297,9 +300,9 @@ class BaseVectorStoreToolApiWrapper(BaseToolApiWrapper):
         collection_suffix = kwargs.get("collection_suffix")
         progress_step = kwargs.get("progress_step")
         clean_index = kwargs.get("clean_index")
-        vs = self._init_vector_store(collection_suffix, embeddings=embedding)
+        vs = self._init_vector_store(embeddings=embedding)
         #
-        return vs.index_documents(docs, progress_step=progress_step, clean_index=clean_index)
+        return vs.index_documents(docs, collection_suffix=collection_suffix, progress_step=progress_step, clean_index=clean_index)
 
     def _process_documents(self, documents: List[Document]) -> Generator[Document, None, None]:
         """
@@ -333,42 +336,31 @@ class BaseVectorStoreToolApiWrapper(BaseToolApiWrapper):
 
 
     # TODO: init store once and re-use the instance
-    def _init_vector_store(self, collection_suffix: str = "", embeddings: Optional[Any] = None):
+    def _init_vector_store(self, embeddings: Optional[Any] = None):
         """Initializes the vector store wrapper with the provided parameters."""
         try:
             from alita_sdk.runtime.tools.vectorstore import VectorStoreWrapper
         except ImportError:
             from alita_sdk.runtime.tools.vectorstore import VectorStoreWrapper
 
-        # Validate collection_suffix length
-        if collection_suffix and len(collection_suffix.strip()) > 7:
-            raise ToolException("collection_suffix must be 7 characters or less")
-
-        # Create collection name with suffix if provided
-        collection_name = str(self.collection_name)
-        if collection_suffix and collection_suffix.strip():
-            collection_name = f"{self.collection_name}_{collection_suffix.strip()}"
-
-        # Get database-specific parameters using adapter
-        connection_string = self.connection_string.get_secret_value() if self.connection_string else None
-        vectorstore_params = self._adapter.get_vectorstore_params(collection_name, connection_string)
-
-        return VectorStoreWrapper(
-            llm=self.llm,
-            vectorstore_type=self.vectorstore_type,
-            embedding_model=self.embedding_model,
-            embedding_model_params=self.embedding_model_params,
-            vectorstore_params=vectorstore_params,
-            embeddings=embeddings,
-            process_document_func=self._process_documents,
-        )
+        if not self._vector_store:
+            connection_string = self.connection_string.get_secret_value() if self.connection_string else None
+            vectorstore_params = self._adapter.get_vectorstore_params(self.collection_name, connection_string)
+            self._vector_store = VectorStoreWrapper(
+                llm=self.llm,
+                vectorstore_type=self.vectorstore_type,
+                embedding_model=self.embedding_model,
+                embedding_model_params=self.embedding_model_params,
+                vectorstore_params=vectorstore_params,
+                embeddings=embeddings,
+                process_document_func=self._process_documents,
+            )
+        return self._vector_store
 
     def remove_index(self, collection_suffix: str = ""):
         """Cleans the indexed data in the collection."""
-        vectorstore_wrapper = self._init_vector_store(collection_suffix)
-        collection_name = f"{self.collection_name}_{collection_suffix}" if collection_suffix else str(self.collection_name)
-        self._adapter.remove_collection(vectorstore_wrapper, collection_name)
-        return (f"Collection '{collection_name}' has been removed from the vector store.\n"
+        self._init_vector_store()._clean_collection(collection_suffix=collection_suffix)
+        return (f"Collection '{collection_suffix}' has been removed from the vector store.\n"
                 f"Available collections: {self.list_collections()}")
 
     def list_collections(self):
@@ -386,7 +378,14 @@ class BaseVectorStoreToolApiWrapper(BaseToolApiWrapper):
                    extended_search: Optional[List[str]] = None,
                    **kwargs):
         """ Searches indexed documents in the vector store."""
-        vectorstore = self._init_vector_store(collection_suffix)
+        vectorstore = self._init_vector_store()
+        # build filter on top of collection_suffix
+        filter = filter if isinstance(filter, dict) else json.loads(filter)
+        if collection_suffix:
+            filter.update({"collection": {
+                "$eq": collection_suffix.strip()
+            }})
+
         found_docs = vectorstore.search_documents(
             query,
             doctype=self.doctype,
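
Note: search no longer selects a per-suffix collection; it narrows results with a metadata filter instead. A short sketch of the filter that gets built (values are illustrative):

    import json

    collection_suffix = "docs1"
    filter = '{"source": {"$eq": "confluence"}}'  # may arrive as a JSON string

    filter = filter if isinstance(filter, dict) else json.loads(filter)
    if collection_suffix:
        filter.update({"collection": {"$eq": collection_suffix.strip()}})

    print(filter)
    # {'source': {'$eq': 'confluence'}, 'collection': {'$eq': 'docs1'}}

Since `$eq` compares the stored string exactly, a document whose `collection` metadata has grown to `docs1;code2` via the `;`-append path would not, on its own, match the suffix `docs1`.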
@@ -579,22 +578,20 @@ class BaseCodeToolApiWrapper(BaseVectorStoreToolApiWrapper):
         return parse_code_files_for_db(file_content_generator())
 
     def index_data(self,
+                   collection_suffix: str,
                    branch: Optional[str] = None,
                    whitelist: Optional[List[str]] = None,
                    blacklist: Optional[List[str]] = None,
-                   collection_suffix: str = "",
                    **kwargs) -> str:
         """Index repository files in the vector store using code parsing."""
 
-
-
         documents = self.loader(
             branch=branch,
             whitelist=whitelist,
             blacklist=blacklist
         )
-        vectorstore = self._init_vector_store(collection_suffix)
-        return vectorstore.index_documents(documents, clean_index=False, is_code=True)
+        vectorstore = self._init_vector_store()
+        return vectorstore.index_documents(documents, collection_suffix=collection_suffix, clean_index=False, is_code=True)
 
     def _get_vector_search_tools(self):
         """

--- alita_sdk/tools/vector_adapters/VectorStoreAdapter.py
+++ alita_sdk/tools/vector_adapters/VectorStoreAdapter.py
@@ -1,5 +1,8 @@
 from abc import ABC, abstractmethod
 from typing import Any, Dict, Optional
+from logging import getLogger
+
+logger = getLogger(__name__)
 
 
 class VectorStoreAdapter(ABC):
@@ -36,19 +39,23 @@ class PGVectorAdapter(VectorStoreAdapter):
         }
 
     def list_collections(self, vectorstore_wrapper, collection_name) -> str:
-        from sqlalchemy import text
+        from sqlalchemy import func
         from sqlalchemy.orm import Session
 
-        with Session(vectorstore_wrapper.vectorstore.session_maker.bind) as session:
-            get_collections = text(f"""
-                SELECT table_schema
-                FROM information_schema.columns
-                WHERE udt_name = 'vector'
-                AND table_schema LIKE '%{collection_name}%';
-            """)
-            result = session.execute(get_collections)
-            docs = result.fetchall()
-            return str(docs)
+        store = vectorstore_wrapper.vectorstore
+        try:
+            with Session(store.session_maker.bind) as session:
+                collections = (
+                    session.query(
+                        func.distinct(func.jsonb_extract_path_text(store.EmbeddingStore.cmetadata, 'collection'))
+                    )
+                    .filter(store.EmbeddingStore.cmetadata.isnot(None))
+                    .all()
+                )
+                return [collection[0] for collection in collections if collection[0] is not None]
+        except Exception as e:
+            logger.error(f"Failed to get unique collections from PGVector: {str(e)}")
+            return []
 
     def remove_collection(self, vectorstore_wrapper, collection_name: str):
         vectorstore_wrapper._remove_collection()

--- alita_sdk-0.3.230.dist-info/METADATA
+++ alita_sdk-0.3.231.dist-info/METADATA
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: alita_sdk
-Version: 0.3.230
+Version: 0.3.231
 Summary: SDK for building langchain agents using resources from Alita
 Author-email: Artem Rozumenko <artyom.rozumenko@gmail.com>, Mikalai Biazruchka <mikalai_biazruchka@epam.com>, Roman Mitusov <roman_mitusov@epam.com>, Ivan Krakhmaliuk <lifedjik@gmail.com>, Artem Dubrovskiy <ad13box@gmail.com>
 License-Expression: Apache-2.0

--- alita_sdk-0.3.230.dist-info/RECORD
+++ alita_sdk-0.3.231.dist-info/RECORD
@@ -100,7 +100,7 @@ alita_sdk/runtime/tools/pgvector_search.py,sha256=NN2BGAnq4SsDHIhUcFZ8d_dbEOM8Qw
 alita_sdk/runtime/tools/prompt.py,sha256=nJafb_e5aOM1Rr3qGFCR-SKziU9uCsiP2okIMs9PppM,741
 alita_sdk/runtime/tools/router.py,sha256=wCvZjVkdXK9dMMeEerrgKf5M790RudH68pDortnHSz0,1517
 alita_sdk/runtime/tools/tool.py,sha256=lE1hGi6qOAXG7qxtqxarD_XMQqTghdywf261DZawwno,5631
-alita_sdk/runtime/tools/vectorstore.py,sha256=ItOkyorjusvoyZQcszs72FzDgo2ri9Xh3yWFhA4wdKM,38549
+alita_sdk/runtime/tools/vectorstore.py,sha256=R6M6emjP7VUkXwufI_tfTicx4EKn-lZwxQ16-WzIwMA,44557
 alita_sdk/runtime/utils/AlitaCallback.py,sha256=E4LlSBuCHWiUq6W7IZExERHZY0qcmdjzc_rJlF2iQIw,7356
 alita_sdk/runtime/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 alita_sdk/runtime/utils/constants.py,sha256=Xntx1b_uxUzT4clwqHA_U6K8y5bBqf_4lSQwXdcWrp4,13586
@@ -112,7 +112,7 @@ alita_sdk/runtime/utils/toolkit_runtime.py,sha256=MU63Fpxj0b5_r1IUUc0Q3-PN9VwL7r
 alita_sdk/runtime/utils/toolkit_utils.py,sha256=I9QFqnaqfVgN26LUr6s3XlBlG6y0CoHURnCzG7XcwVs,5311
 alita_sdk/runtime/utils/utils.py,sha256=CpEl3LCeLbhzQySz08lkKPm7Auac6IiLF7WB8wmArMI,589
 alita_sdk/tools/__init__.py,sha256=1AHqP2xyLjn92xVm70l9XIke6FkfHkLo5OoQVe4BuP8,10421
-alita_sdk/tools/elitea_base.py,sha256=22P97EmUlO-eC8DoGPiSNjHNjTv8ru0Q05qnR3U9WWs,30572
+alita_sdk/tools/elitea_base.py,sha256=Wg9HqeWlsJ_R5--_Xfg7bm8rqKd68aqHm4a1sKAvahI,30365
 alita_sdk/tools/ado/__init__.py,sha256=j4lt6MLWlpkIIVkHmAyVG3i_qQeQ3ZmL_g8BfMhVhVI,1289
 alita_sdk/tools/ado/utils.py,sha256=PTCludvaQmPLakF2EbCGy66Mro4-rjDtavVP-xcB2Wc,1252
 alita_sdk/tools/ado/repos/__init__.py,sha256=kc4ZJI3B9CDUp4q3jRSj7JZNc3fJwwMTsV40CiKO7Po,6111
@@ -304,7 +304,7 @@ alita_sdk/tools/testrail/__init__.py,sha256=577XVaOAoXG3mDkojCsy5XCUlxCsdJf_2-_5
 alita_sdk/tools/testrail/api_wrapper.py,sha256=Aax0jspgidXYNxLIw6qTWu3dO2JOIS0ALIqsCzQuFbQ,32087
 alita_sdk/tools/utils/__init__.py,sha256=155xepXPr4OEzs2Mz5YnjXcBpxSv1X2eznRUVoPtyK0,3268
 alita_sdk/tools/utils/content_parser.py,sha256=yi1IDLreqfM41w-PnoFEvVLtSV50qpNvKshJwbDTgqs,7172
-alita_sdk/tools/vector_adapters/VectorStoreAdapter.py,sha256=wZ_MhVWPEgoPhKRvbPB6Qs34EPyWx0IW7ydY_GVBZFQ,3344
+alita_sdk/tools/vector_adapters/VectorStoreAdapter.py,sha256=KhxojgddWlEQ4TZA7jEL5ZEp86PcXfmfgRXixsjj7-M,3634
 alita_sdk/tools/vector_adapters/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 alita_sdk/tools/xray/__init__.py,sha256=OYa1wveTm-lAhsJaGXMnwOrDQWl6ch--NjNLBeR63eM,4331
 alita_sdk/tools/xray/api_wrapper.py,sha256=A8PJmY2k7TowaD_vk6ZxkMnSUoZUt9A6g4TJrZfNTAw,32225
@@ -325,8 +325,8 @@ alita_sdk/tools/zephyr_scale/api_wrapper.py,sha256=JAeWf-RXohsxheUpT0iMDClc_izj-
 alita_sdk/tools/zephyr_squad/__init__.py,sha256=0AI_j27xVO5Gk5HQMFrqPTd4uvuVTpiZUicBrdfEpKg,2796
 alita_sdk/tools/zephyr_squad/api_wrapper.py,sha256=kmw_xol8YIYFplBLWTqP_VKPRhL_1ItDD0_vXTe_UuI,14906
 alita_sdk/tools/zephyr_squad/zephyr_squad_cloud_client.py,sha256=R371waHsms4sllHCbijKYs90C-9Yu0sSR3N4SUfQOgU,5066
-alita_sdk-0.3.230.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
-alita_sdk-0.3.230.dist-info/METADATA,sha256=jFOYe8oP7YbfYBFrKSoV_BtsZYIcsH2RPMmiQ-X2-tw,18896
-alita_sdk-0.3.230.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-alita_sdk-0.3.230.dist-info/top_level.txt,sha256=0vJYy5p_jK6AwVb1aqXr7Kgqgk3WDtQ6t5C-XI9zkmg,10
-alita_sdk-0.3.230.dist-info/RECORD,,
+alita_sdk-0.3.231.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+alita_sdk-0.3.231.dist-info/METADATA,sha256=8JHRCwh-SnyBtnD3wL4yJDUL1ov5fLXL1WUyfSIynf0,18896
+alita_sdk-0.3.231.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+alita_sdk-0.3.231.dist-info/top_level.txt,sha256=0vJYy5p_jK6AwVb1aqXr7Kgqgk3WDtQ6t5C-XI9zkmg,10
+alita_sdk-0.3.231.dist-info/RECORD,,