alita-sdk 0.3.231__py3-none-any.whl → 0.3.232__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
--- a/alita_sdk/runtime/tools/vectorstore.py
+++ b/alita_sdk/runtime/tools/vectorstore.py
@@ -8,6 +8,7 @@ from pydantic import BaseModel, model_validator, Field
 from ..langchain.tools.vector import VectorAdapter
 from langchain_core.messages import HumanMessage
 from alita_sdk.tools.elitea_base import BaseToolApiWrapper
+from alita_sdk.tools.vector_adapters.VectorStoreAdapter import VectorStoreAdapterFactory
 from logging import getLogger
 
 from ..utils.logging import dispatch_custom_event
@@ -141,11 +142,14 @@ class VectorStoreWrapper(BaseToolApiWrapper):
     dataset: str = None
     embedding: Any = None
     vectorstore: Any = None
+    # Review usage of old adapter
     vectoradapter: Any = None
     pg_helper: Any = None
     embeddings: Any = None
     process_document_func: Optional[Callable] = None
-
+    # New adapter for vector database operations
+    vector_adapter: Any = None
+
     @model_validator(mode='before')
     @classmethod
     def validate_toolkit(cls, values):
@@ -170,6 +174,8 @@ class VectorStoreWrapper(BaseToolApiWrapper):
             embeddings=values['embeddings'],
             quota_params=None,
         )
+        # Initialize the new vector adapter
+        values['vector_adapter'] = VectorStoreAdapterFactory.create_adapter(values['vectorstore_type'])
         logger.debug(f"Vectorstore wrapper initialized: {values}")
         return values
 
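The create_adapter call above is a plain factory dispatch on the vectorstore_type string. A minimal sketch of the pattern, assuming a simple class map (the real VectorStoreAdapterFactory lives in alita_sdk/tools/vector_adapters/VectorStoreAdapter.py, whose changes appear later in this diff; its registration details may differ):

    # Hedged sketch only: the mapping keys are assumptions inferred from the
    # PGVectorAdapter and ChromaAdapter classes shown later in this diff.
    class VectorStoreAdapterFactory:
        _adapters = {
            "PGVector": PGVectorAdapter,
            "Chroma": ChromaAdapter,
        }

        @classmethod
        def create_adapter(cls, vectorstore_type: str) -> VectorStoreAdapter:
            try:
                return cls._adapters[vectorstore_type]()
            except KeyError:
                raise ValueError(f"Unsupported vectorstore type: {vectorstore_type}")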
@@ -196,15 +202,7 @@ class VectorStoreWrapper(BaseToolApiWrapper):
             f"Remove collection '{self.dataset}'",
             tool_name="_remove_collection"
         )
-        from sqlalchemy import text
-        from sqlalchemy.orm import Session
-
-        schema_name = self.vectorstore.collection_name
-        with Session(self.vectorstore.session_maker.bind) as session:
-            drop_schema_query = text(f"DROP SCHEMA IF EXISTS {schema_name} CASCADE;")
-            session.execute(drop_schema_query)
-            session.commit()
-        logger.info(f"Schema '{schema_name}' has been dropped.")
+        self.vector_adapter.remove_collection(self, self.dataset)
         self._log_data(
             f"Collection '{self.dataset}' has been removed. ",
             tool_name="_remove_collection"
@@ -212,44 +210,12 @@ class VectorStoreWrapper(BaseToolApiWrapper):
 
     def _get_indexed_ids(self, collection_suffix: Optional[str] = '') -> List[str]:
         """Get all indexed document IDs from vectorstore"""
+        return self.vector_adapter.get_indexed_ids(self, collection_suffix)
 
-        # Check if this is a PGVector store
-        if self._is_pgvector():
-            return self._get_pgvector_indexed_ids(collection_suffix)
-        else:
-            # Fall back to Chroma implementation
-            # TODO: update filter by collection_suffix for Chroma
-            return self._get_chroma_indexed_ids(collection_suffix)
+    def list_collections(self) -> List[str]:
+        """List all collections in the vectorstore."""
 
-    def _get_pgvector_indexed_ids(self, collection_suffix: Optional[str] = ''):
-        """Get all indexed document IDs from PGVector"""
-        from sqlalchemy.orm import Session
-        from sqlalchemy import func
-
-        store = self.vectorstore
-        try:
-            with Session(store.session_maker.bind) as session:
-                # Start building the query
-                query = session.query(store.EmbeddingStore.id)
-                # Apply filter only if collection_suffix is provided
-                if collection_suffix:
-                    query = query.filter(
-                        func.jsonb_extract_path_text(store.EmbeddingStore.cmetadata, 'collection') == collection_suffix
-                    )
-                ids = query.all()
-                return [str(id_tuple[0]) for id_tuple in ids]
-        except Exception as e:
-            logger.error(f"Failed to get indexed IDs from PGVector: {str(e)}")
-            return []
-
-    def _get_chroma_indexed_ids(self, collection_suffix: Optional[str] = ''):
-        """Get all indexed document IDs from Chroma"""
-        try:
-            data = self.vectorstore.get(include=[])  # Only get IDs, no metadata
-            return data.get('ids', [])
-        except Exception as e:
-            logger.error(f"Failed to get indexed IDs from Chroma: {str(e)}")
-            return []
+        return self.vector_adapter.list_collections(self)
 
     def _clean_collection(self, collection_suffix: str = ''):
         """
@@ -259,227 +225,23 @@ class VectorStoreWrapper(BaseToolApiWrapper):
             f"Cleaning collection '{self.dataset}'",
             tool_name="_clean_collection"
         )
-        # This logic deletes all data from the vectorstore collection without removal of collection.
-        # Collection itself remains available for future indexing.
-        self.vectorstore.delete(ids=self._get_indexed_ids(collection_suffix))
-
+        self.vector_adapter.clean_collection(self, collection_suffix)
         self._log_data(
             f"Collection '{self.dataset}' has been cleaned. ",
             tool_name="_clean_collection"
         )
 
-    def _is_pgvector(self) -> bool:
-        """Check if the vectorstore is a PGVector store."""
-        return hasattr(self.vectorstore, 'session_maker') and hasattr(self.vectorstore, 'EmbeddingStore')
-
-    # TODO: refactor to use common method for different vectorstores in a separate vectorstore wrappers
     def _get_indexed_data(self):
         """ Get all indexed data from vectorstore for non-code content """
-
-        # Check if this is a PGVector store
-        if self._is_pgvector():
-            return self._get_pgvector_indexed_data()
-        else:
-            # Fall back to original Chroma implementation
-            return self._get_chroma_indexed_data(self.vectorstore)
-
-    def _get_pgvector_indexed_data(self):
-        """ Get all indexed data from PGVector for non-code content """
-        from sqlalchemy.orm import Session
-
-        result = {}
-        try:
-            self._log_data("Retrieving already indexed data from PGVector vectorstore",
-                           tool_name="get_indexed_data")
-            store = self.vectorstore
-            with Session(store.session_maker.bind) as session:
-                docs = session.query(
-                    store.EmbeddingStore.id,
-                    store.EmbeddingStore.document,
-                    store.EmbeddingStore.cmetadata
-                ).all()
-
-            # Process the retrieved data
-            for doc in docs:
-                db_id = doc.id
-                meta = doc.cmetadata or {}
-
-                # Get document id from metadata
-                doc_id = str(meta.get('id', db_id))
-                dependent_docs = meta.get(IndexerKeywords.DEPENDENT_DOCS.value, [])
-                if dependent_docs:
-                    dependent_docs = [d.strip() for d in dependent_docs.split(';') if d.strip()]
-                parent_id = meta.get(IndexerKeywords.PARENT.value, -1)
-
-                chunk_id = meta.get('chunk_id')
-                if doc_id in result and chunk_id:
-                    # If document with the same id already saved, add db_id for current one as chunk
-                    result[doc_id]['all_chunks'].append(db_id)
-                else:
-                    result[doc_id] = {
-                        'metadata': meta,
-                        'id': db_id,
-                        'all_chunks': [db_id],
-                        IndexerKeywords.DEPENDENT_DOCS.value: dependent_docs,
-                        IndexerKeywords.PARENT.value: parent_id
-                    }
-
-        except Exception as e:
-            logger.error(f"Failed to get indexed data from PGVector: {str(e)}. Continuing with empty index.")
-
-        return result
-
-    def _get_chroma_indexed_data(self, store):
-        """ Get all indexed data from Chroma for non-code content """
-        result = {}
-        try:
-            self._log_data("Retrieving already indexed data from Chroma vectorstore",
-                           tool_name="get_indexed_data")
-            data = store.get(include=['metadatas'])
-
-            # Re-structure data to be more usable
-            for meta, db_id in zip(data['metadatas'], data['ids']):
-                # Get document id from metadata
-                doc_id = str(meta['id'])
-                dependent_docs = meta.get(IndexerKeywords.DEPENDENT_DOCS.value, [])
-                if dependent_docs:
-                    dependent_docs = [d.strip() for d in dependent_docs.split(';') if d.strip()]
-                parent_id = meta.get(IndexerKeywords.PARENT.value, -1)
-
-                chunk_id = meta.get('chunk_id')
-                if doc_id in result and chunk_id:
-                    # If document with the same id already saved, add db_id for current one as chunk
-                    result[doc_id]['all_chunks'].append(db_id)
-                else:
-                    result[doc_id] = {
-                        'metadata': meta,
-                        'id': db_id,
-                        'all_chunks': [db_id],
-                        IndexerKeywords.DEPENDENT_DOCS.value: dependent_docs,
-                        IndexerKeywords.PARENT.value: parent_id
-                    }
-        except Exception as e:
-            logger.error(f"Failed to get indexed data from Chroma: {str(e)}. Continuing with empty index.")
-
-        return result
+        return self.vector_adapter.get_indexed_data(self)
 
     def _get_code_indexed_data(self) -> Dict[str, Dict[str, Any]]:
         """ Get all indexed data from vectorstore for code content """
-
-        # get already indexed data
-        if self._is_pgvector():
-            result = self._get_pgvector_code_indexed_data()
-        else:
-            result = self._get_chroma_code_indexed_data()
-        return result
-
-    def _get_chroma_code_indexed_data(self) -> Dict[str, Dict[str, Any]]:
-        """Get all indexed code data from Chroma."""
-        result = {}
-        try:
-            self._log_data("Retrieving already indexed code data from Chroma vectorstore",
-                           tool_name="index_code_data")
-            data = self.vectorstore.get(include=['metadatas'])
-            for meta, db_id in zip(data['metadatas'], data['ids']):
-                filename = meta.get('filename')
-                commit_hash = meta.get('commit_hash')
-                if not filename:
-                    continue
-                if filename not in result:
-                    result[filename] = {
-                        'commit_hashes': [],
-                        'ids': []
-                    }
-                if commit_hash is not None:
-                    result[filename]['commit_hashes'].append(commit_hash)
-                result[filename]['ids'].append(db_id)
-        except Exception as e:
-            logger.error(f"Failed to get indexed code data from Chroma: {str(e)}. Continuing with empty index.")
-        return result
-
-    def _get_pgvector_code_indexed_data(self) -> Dict[str, Dict[str, Any]]:
-        """Get all indexed code data from PGVector."""
-        from sqlalchemy.orm import Session
-
-        result = {}
-        try:
-            self._log_data("Retrieving already indexed code data from PGVector vectorstore",
-                           tool_name="index_code_data")
-            store = self.vectorstore
-            with Session(store.session_maker.bind) as session:
-                docs = session.query(
-                    store.EmbeddingStore.id,
-                    store.EmbeddingStore.cmetadata
-                ).all()
-
-            for db_id, meta in docs:
-                filename = meta.get('filename')
-                commit_hash = meta.get('commit_hash')
-                if not filename:
-                    continue
-                if filename not in result:
-                    result[filename] = {
-                        'metadata': meta,
-                        'commit_hashes': [],
-                        'ids': []
-                    }
-                if commit_hash is not None:
-                    result[filename]['commit_hashes'].append(commit_hash)
-                result[filename]['ids'].append(db_id)
-        except Exception as e:
-            logger.error(f"Failed to get indexed code data from PGVector: {str(e)}. Continuing with empty index.")
-        return result
-
+        return self.vector_adapter.get_code_indexed_data(self)
 
     def _add_to_collection(self, entry_id, new_collection_value):
         """Add a new collection name to the `collection` key in the `metadata` column."""
-
-        from sqlalchemy import func
-        from sqlalchemy.orm import Session
-
-        store = self.vectorstore
-        try:
-            with Session(store.session_maker.bind) as session:
-                # Query the current value of the `collection` key
-                current_collection_query = session.query(
-                    func.jsonb_extract_path_text(store.EmbeddingStore.cmetadata, 'collection')
-                ).filter(store.EmbeddingStore.id == entry_id).scalar()
-
-                # If the `collection` key is NULL or doesn't contain the new value, update it
-                if current_collection_query is None:
-                    # If `collection` is NULL, initialize it with the new value
-                    session.query(store.EmbeddingStore).filter(
-                        store.EmbeddingStore.id == entry_id
-                    ).update(
-                        {
-                            store.EmbeddingStore.cmetadata: func.jsonb_set(
-                                func.coalesce(store.EmbeddingStore.cmetadata, '{}'),
-                                '{collection}',  # Path to the `collection` key
-                                f'"{new_collection_value}"',  # New value for the `collection` key
-                                True  # Create the key if it doesn't exist
-                            )
-                        }
-                    )
-                elif new_collection_value not in current_collection_query.split(";"):
-                    # If `collection` exists but doesn't contain the new value, append it
-                    updated_collection_value = f"{current_collection_query};{new_collection_value}"
-                    session.query(store.EmbeddingStore).filter(
-                        store.EmbeddingStore.id == entry_id
-                    ).update(
-                        {
-                            store.EmbeddingStore.cmetadata: func.jsonb_set(
-                                store.EmbeddingStore.cmetadata,
-                                '{collection}',  # Path to the `collection` key
-                                f'"{updated_collection_value}"',  # Concatenated value as a valid JSON string
-                                True  # Create the key if it doesn't exist
-                            )
-                        }
-                    )
-
-                session.commit()
-                logger.info(f"Successfully updated collection for entry ID {entry_id}.")
-        except Exception as e:
-            logger.error(f"Failed to update collection for entry ID {entry_id}: {str(e)}")
+        self.vector_adapter.add_to_collection(self, entry_id, new_collection_value)
 
     def _reduce_duplicates(
         self,
@@ -983,4 +745,5 @@ class VectorStoreWrapper(BaseToolApiWrapper):
             "description": "Get summary of search results using stepback technique",
             "args_schema": StepBackSearchDocumentsModel
         }
-    ]
+    ]
+
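Taken together, the hunks above strip every PGVector/Chroma branch out of VectorStoreWrapper and replace it with one-line delegation to a backend-specific adapter object (a strategy pattern): the wrapper keeps its public surface while the adapter owns the storage details. An illustrative call flow, assuming a minimal configuration (the wrapper's validator requires more fields than shown here):

    # Illustrative only: constructor arguments are placeholders, not a working config.
    wrapper = VectorStoreWrapper(
        vectorstore_type="PGVector",   # mapped to PGVectorAdapter by the factory
        dataset="my_collection",
        embeddings=embeddings,         # assumed to be prepared elsewhere
    )
    ids = wrapper._get_indexed_ids()    # forwards to vector_adapter.get_indexed_ids(wrapper, '')
    wrapper._clean_collection()         # forwards to vector_adapter.clean_collection(wrapper, '')
    names = wrapper.list_collections()  # forwards to vector_adapter.list_collections(wrapper)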
--- a/alita_sdk/tools/elitea_base.py
+++ b/alita_sdk/tools/elitea_base.py
@@ -366,7 +366,7 @@ class BaseVectorStoreToolApiWrapper(BaseToolApiWrapper):
     def list_collections(self):
         """Lists all collections in the vector store."""
         vectorstore_wrapper = self._init_vector_store()
-        return self._adapter.list_collections(vectorstore_wrapper, self.collection_name or "")
+        return vectorstore_wrapper.list_collections()
 
     def search_index(self,
                      query: str,
@@ -498,6 +498,8 @@ class BaseVectorStoreToolApiWrapper(BaseToolApiWrapper):
 
 class BaseCodeToolApiWrapper(BaseVectorStoreToolApiWrapper):
 
+    doctype: Optional[str] = 'code'
+
     def _get_files(self):
         raise NotImplementedError("Subclasses should implement this method")
 
--- a/alita_sdk/tools/github/api_wrapper.py
+++ b/alita_sdk/tools/github/api_wrapper.py
@@ -37,9 +37,9 @@ class AlitaGitHubAPIWrapper(BaseCodeToolApiWrapper):
     Wrapper for GitHub API that integrates both REST and GraphQL functionality.
     """
     # Authentication config
-    github_access_token: Optional[str] = None
-    github_username: Optional[str] = None
-    github_password: Optional[str] = None
+    github_access_token: Optional[SecretStr] = None
+    github_username: Optional[SecretStr] = None
+    github_password: Optional[SecretStr] = None
     github_app_id: Optional[str] = None
     github_app_private_key: Optional[str] = None
     github_base_url: Optional[str] = None
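Typing the token, username, and password as SecretStr keeps them out of logs: pydantic masks SecretStr values in repr() and serialized output, and the raw value is only reachable via get_secret_value(). A standalone illustration of that behavior (not code from this package):

    from typing import Optional
    from pydantic import BaseModel, SecretStr

    class Creds(BaseModel):
        github_access_token: Optional[SecretStr] = None

    c = Creds(github_access_token="ghp_example")
    print(c)                                         # github_access_token=SecretStr('**********')
    print(c.github_access_token.get_secret_value())  # ghp_example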
@@ -49,19 +49,9 @@ class AlitaGitHubAPIWrapper(BaseCodeToolApiWrapper):
     active_branch: Optional[str] = None
     github_base_branch: Optional[str] = None
 
-    # Add LLM instance
-    llm: Optional[Any] = None
     # Alita instance
     alita: Optional[Any] = None
 
-    # Vector store configuration
-    connection_string: Optional[SecretStr] = None
-    collection_name: Optional[str] = None
-    doctype: Optional[str] = 'code'  # GitHub uses 'code' doctype
-    embedding_model: Optional[str] = "HuggingFaceEmbeddings"
-    embedding_model_params: Optional[Dict[str, Any]] = {"model_name": "sentence-transformers/all-MiniLM-L6-v2"}
-    vectorstore_type: Optional[str] = "PGVector"
-
     # Client instances - renamed without leading underscores and marked as exclude=True
     github_client_instance: Optional[GitHubClient] = Field(default=None, exclude=True)
     graphql_client_instance: Optional[GraphQLClientWrapper] = Field(default=None, exclude=True)
@@ -84,12 +74,12 @@ class AlitaGitHubAPIWrapper(BaseCodeToolApiWrapper):
         from langchain.utils import get_from_dict_or_env
 
         # Get all authentication values
-        github_access_token = get_from_dict_or_env(values, "github_access_token", "GITHUB_ACCESS_TOKEN", default='')
-        github_username = get_from_dict_or_env(values, "github_username", "GITHUB_USERNAME", default='')
-        github_password = get_from_dict_or_env(values, "github_password", "GITHUB_PASSWORD", default='')
-        github_app_id = get_from_dict_or_env(values, "github_app_id", "GITHUB_APP_ID", default='')
-        github_app_private_key = get_from_dict_or_env(values, "github_app_private_key", "GITHUB_APP_PRIVATE_KEY", default='')
-        github_base_url = get_from_dict_or_env(values, "github_base_url", "GITHUB_BASE_URL", default='https://api.github.com')
+        github_access_token = get_from_dict_or_env(values, ["access_token", "github_access_token"], "GITHUB_ACCESS_TOKEN", default='')
+        github_username = get_from_dict_or_env(values, ["username", "github_username"], "GITHUB_USERNAME", default='')
+        github_password = get_from_dict_or_env(values, ["password", "github_password"], "GITHUB_PASSWORD", default='')
+        github_app_id = get_from_dict_or_env(values, ["app_id", "github_app_id"], "GITHUB_APP_ID", default='')
+        github_app_private_key = get_from_dict_or_env(values, ["app_private_key", "github_app_private_key"], "GITHUB_APP_PRIVATE_KEY", default='')
+        github_base_url = get_from_dict_or_env(values, ["base_url", "github_base_url"], "GITHUB_BASE_URL", default='https://api.github.com')
 
         auth_config = GitHubAuthConfig(
             github_access_token=github_access_token,
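Passing a list of keys to get_from_dict_or_env lets the validator accept both the short config names (access_token, username, and so on) and the older github_-prefixed ones: the dict keys are tried in order before falling back to the environment variable and then the default. A small illustration (values are placeholders):

    from langchain.utils import get_from_dict_or_env

    values = {"access_token": "ghp_example"}  # short key, as a toolkit config might pass it
    token = get_from_dict_or_env(
        values,
        ["access_token", "github_access_token"],  # tried in order
        "GITHUB_ACCESS_TOKEN",                    # environment fallback
        default="",
    )
    assert token == "ghp_example"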
--- a/alita_sdk/tools/vector_adapters/VectorStoreAdapter.py
+++ b/alita_sdk/tools/vector_adapters/VectorStoreAdapter.py
@@ -1,5 +1,5 @@
 from abc import ABC, abstractmethod
-from typing import Any, Dict, Optional
+from typing import Any, Dict, Optional, List
 from logging import getLogger
 
 logger = getLogger(__name__)
@@ -23,6 +23,31 @@ class VectorStoreAdapter(ABC):
         """Remove a collection from the vector store."""
         pass
 
+    @abstractmethod
+    def get_indexed_ids(self, vectorstore_wrapper, collection_suffix: Optional[str] = '') -> List[str]:
+        """Get all indexed document IDs from vectorstore"""
+        pass
+
+    @abstractmethod
+    def clean_collection(self, vectorstore_wrapper, collection_suffix: str = ''):
+        """Clean the vectorstore collection by deleting all indexed data."""
+        pass
+
+    @abstractmethod
+    def get_indexed_data(self, vectorstore_wrapper):
+        """Get all indexed data from vectorstore for non-code content"""
+        pass
+
+    @abstractmethod
+    def get_code_indexed_data(self, vectorstore_wrapper) -> Dict[str, Dict[str, Any]]:
+        """Get all indexed data from vectorstore for code content"""
+        pass
+
+    @abstractmethod
+    def add_to_collection(self, vectorstore_wrapper, entry_id, new_collection_value):
+        """Add a new collection name to the metadata"""
+        pass
+
 
 class PGVectorAdapter(VectorStoreAdapter):
     """Adapter for PGVector database operations."""
@@ -38,7 +63,7 @@ class PGVectorAdapter(VectorStoreAdapter):
             "connection_string": connection_string
         }
 
-    def list_collections(self, vectorstore_wrapper, collection_name) -> str:
+    def list_collections(self, vectorstore_wrapper) -> str:
         from sqlalchemy import func
         from sqlalchemy.orm import Session
 
@@ -58,7 +83,175 @@ class PGVectorAdapter(VectorStoreAdapter):
             return []
 
     def remove_collection(self, vectorstore_wrapper, collection_name: str):
-        vectorstore_wrapper._remove_collection()
+        from sqlalchemy import text
+        from sqlalchemy.orm import Session
+
+        schema_name = vectorstore_wrapper.vectorstore.collection_name
+        with Session(vectorstore_wrapper.vectorstore.session_maker.bind) as session:
+            drop_schema_query = text(f"DROP SCHEMA IF EXISTS {schema_name} CASCADE;")
+            session.execute(drop_schema_query)
+            session.commit()
+        logger.info(f"Schema '{schema_name}' has been dropped.")
+
+    def get_indexed_ids(self, vectorstore_wrapper, collection_suffix: Optional[str] = '') -> List[str]:
+        """Get all indexed document IDs from PGVector"""
+        from sqlalchemy.orm import Session
+        from sqlalchemy import func
+
+        store = vectorstore_wrapper.vectorstore
+        try:
+            with Session(store.session_maker.bind) as session:
+                # Start building the query
+                query = session.query(store.EmbeddingStore.id)
+                # Apply filter only if collection_suffix is provided
+                if collection_suffix:
+                    query = query.filter(
+                        func.jsonb_extract_path_text(store.EmbeddingStore.cmetadata, 'collection') == collection_suffix
+                    )
+                ids = query.all()
+                return [str(id_tuple[0]) for id_tuple in ids]
+        except Exception as e:
+            logger.error(f"Failed to get indexed IDs from PGVector: {str(e)}")
+            return []
+
+    def clean_collection(self, vectorstore_wrapper, collection_suffix: str = ''):
+        """Clean the vectorstore collection by deleting all indexed data."""
+        # This logic deletes all data from the vectorstore collection without removal of collection.
+        # Collection itself remains available for future indexing.
+        vectorstore_wrapper.vectorstore.delete(ids=self.get_indexed_ids(vectorstore_wrapper, collection_suffix))
+
+    def is_vectorstore_type(self, vectorstore) -> bool:
+        """Check if the vectorstore is a PGVector store."""
+        return hasattr(vectorstore, 'session_maker') and hasattr(vectorstore, 'EmbeddingStore')
+
+    def get_indexed_data(self, vectorstore_wrapper):
+        """Get all indexed data from PGVector for non-code content"""
+        from sqlalchemy.orm import Session
+        from ...runtime.utils.utils import IndexerKeywords
+
+        result = {}
+        try:
+            vectorstore_wrapper._log_data("Retrieving already indexed data from PGVector vectorstore",
+                                          tool_name="get_indexed_data")
+            store = vectorstore_wrapper.vectorstore
+            with Session(store.session_maker.bind) as session:
+                docs = session.query(
+                    store.EmbeddingStore.id,
+                    store.EmbeddingStore.document,
+                    store.EmbeddingStore.cmetadata
+                ).all()
+
+            # Process the retrieved data
+            for doc in docs:
+                db_id = doc.id
+                meta = doc.cmetadata or {}
+
+                # Get document id from metadata
+                doc_id = str(meta.get('id', db_id))
+                dependent_docs = meta.get(IndexerKeywords.DEPENDENT_DOCS.value, [])
+                if dependent_docs:
+                    dependent_docs = [d.strip() for d in dependent_docs.split(';') if d.strip()]
+                parent_id = meta.get(IndexerKeywords.PARENT.value, -1)
+
+                chunk_id = meta.get('chunk_id')
+                if doc_id in result and chunk_id:
+                    # If document with the same id already saved, add db_id for current one as chunk
+                    result[doc_id]['all_chunks'].append(db_id)
+                else:
+                    result[doc_id] = {
+                        'metadata': meta,
+                        'id': db_id,
+                        'all_chunks': [db_id],
+                        IndexerKeywords.DEPENDENT_DOCS.value: dependent_docs,
+                        IndexerKeywords.PARENT.value: parent_id
+                    }
+
+        except Exception as e:
+            logger.error(f"Failed to get indexed data from PGVector: {str(e)}. Continuing with empty index.")
+
+        return result
+
+    def get_code_indexed_data(self, vectorstore_wrapper) -> Dict[str, Dict[str, Any]]:
+        """Get all indexed code data from PGVector."""
+        from sqlalchemy.orm import Session
+
+        result = {}
+        try:
+            vectorstore_wrapper._log_data("Retrieving already indexed code data from PGVector vectorstore",
+                                          tool_name="index_code_data")
+            store = vectorstore_wrapper.vectorstore
+            with Session(store.session_maker.bind) as session:
+                docs = session.query(
+                    store.EmbeddingStore.id,
+                    store.EmbeddingStore.cmetadata
+                ).all()
+
+            for db_id, meta in docs:
+                filename = meta.get('filename')
+                commit_hash = meta.get('commit_hash')
+                if not filename:
+                    continue
+                if filename not in result:
+                    result[filename] = {
+                        'metadata': meta,
+                        'commit_hashes': [],
+                        'ids': []
+                    }
+                if commit_hash is not None:
+                    result[filename]['commit_hashes'].append(commit_hash)
+                result[filename]['ids'].append(db_id)
+        except Exception as e:
+            logger.error(f"Failed to get indexed code data from PGVector: {str(e)}. Continuing with empty index.")
+        return result
+
+    def add_to_collection(self, vectorstore_wrapper, entry_id, new_collection_value):
+        """Add a new collection name to the `collection` key in the `metadata` column."""
+        from sqlalchemy import func
+        from sqlalchemy.orm import Session
+
+        store = vectorstore_wrapper.vectorstore
+        try:
+            with Session(store.session_maker.bind) as session:
+                # Query the current value of the `collection` key
+                current_collection_query = session.query(
+                    func.jsonb_extract_path_text(store.EmbeddingStore.cmetadata, 'collection')
+                ).filter(store.EmbeddingStore.id == entry_id).scalar()
+
+                # If the `collection` key is NULL or doesn't contain the new value, update it
+                if current_collection_query is None:
+                    # If `collection` is NULL, initialize it with the new value
+                    session.query(store.EmbeddingStore).filter(
+                        store.EmbeddingStore.id == entry_id
+                    ).update(
+                        {
+                            store.EmbeddingStore.cmetadata: func.jsonb_set(
+                                func.coalesce(store.EmbeddingStore.cmetadata, '{}'),
+                                '{collection}',  # Path to the `collection` key
+                                f'"{new_collection_value}"',  # New value for the `collection` key
+                                True  # Create the key if it doesn't exist
+                            )
+                        }
+                    )
+                elif new_collection_value not in current_collection_query.split(";"):
+                    # If `collection` exists but doesn't contain the new value, append it
+                    updated_collection_value = f"{current_collection_query};{new_collection_value}"
+                    session.query(store.EmbeddingStore).filter(
+                        store.EmbeddingStore.id == entry_id
+                    ).update(
+                        {
+                            store.EmbeddingStore.cmetadata: func.jsonb_set(
+                                store.EmbeddingStore.cmetadata,
+                                '{collection}',  # Path to the `collection` key
+                                f'"{updated_collection_value}"',  # Concatenated value as a valid JSON string
+                                True  # Create the key if it doesn't exist
+                            )
+                        }
+                    )
+
+                session.commit()
+                logger.info(f"Successfully updated collection for entry ID {entry_id}.")
+        except Exception as e:
+            logger.error(f"Failed to update collection for entry ID {entry_id}: {str(e)}")
 
 
 class ChromaAdapter(VectorStoreAdapter):
@@ -71,11 +264,90 @@ class ChromaAdapter(VectorStoreAdapter):
         }
 
     def list_collections(self, vectorstore_wrapper) -> str:
-        vector_client = vectorstore_wrapper.vectoradapter.vectorstore._client
+        vector_client = vectorstore_wrapper.vectorstore._client
         return ','.join([collection.name for collection in vector_client.list_collections()])
 
     def remove_collection(self, vectorstore_wrapper, collection_name: str):
-        vectorstore_wrapper.vectoradapter.vectorstore.delete_collection()
+        vectorstore_wrapper.vectorstore.delete_collection()
+
+    def get_indexed_ids(self, vectorstore_wrapper, collection_suffix: Optional[str] = '') -> List[str]:
+        """Get all indexed document IDs from Chroma"""
+        try:
+            data = vectorstore_wrapper.vectorstore.get(include=[])  # Only get IDs, no metadata
+            return data.get('ids', [])
+        except Exception as e:
+            logger.error(f"Failed to get indexed IDs from Chroma: {str(e)}")
+            return []
+
+    def clean_collection(self, vectorstore_wrapper, collection_suffix: str = ''):
+        """Clean the vectorstore collection by deleting all indexed data."""
+        vectorstore_wrapper.vectorstore.delete(ids=self.get_indexed_ids(vectorstore_wrapper, collection_suffix))
+
+    def get_indexed_data(self, vectorstore_wrapper):
+        """Get all indexed data from Chroma for non-code content"""
+        from ...runtime.utils.utils import IndexerKeywords
+
+        result = {}
+        try:
+            vectorstore_wrapper._log_data("Retrieving already indexed data from Chroma vectorstore",
+                                          tool_name="get_indexed_data")
+            data = vectorstore_wrapper.vectorstore.get(include=['metadatas'])
+
+            # Re-structure data to be more usable
+            for meta, db_id in zip(data['metadatas'], data['ids']):
+                # Get document id from metadata
+                doc_id = str(meta['id'])
+                dependent_docs = meta.get(IndexerKeywords.DEPENDENT_DOCS.value, [])
+                if dependent_docs:
+                    dependent_docs = [d.strip() for d in dependent_docs.split(';') if d.strip()]
+                parent_id = meta.get(IndexerKeywords.PARENT.value, -1)
+
+                chunk_id = meta.get('chunk_id')
+                if doc_id in result and chunk_id:
+                    # If document with the same id already saved, add db_id for current one as chunk
+                    result[doc_id]['all_chunks'].append(db_id)
+                else:
+                    result[doc_id] = {
+                        'metadata': meta,
+                        'id': db_id,
+                        'all_chunks': [db_id],
+                        IndexerKeywords.DEPENDENT_DOCS.value: dependent_docs,
+                        IndexerKeywords.PARENT.value: parent_id
+                    }
+        except Exception as e:
+            logger.error(f"Failed to get indexed data from Chroma: {str(e)}. Continuing with empty index.")
+
+        return result
+
+    def get_code_indexed_data(self, vectorstore_wrapper) -> Dict[str, Dict[str, Any]]:
+        """Get all indexed code data from Chroma."""
+        result = {}
+        try:
+            vectorstore_wrapper._log_data("Retrieving already indexed code data from Chroma vectorstore",
+                                          tool_name="index_code_data")
+            data = vectorstore_wrapper.vectorstore.get(include=['metadatas'])
+            for meta, db_id in zip(data['metadatas'], data['ids']):
+                filename = meta.get('filename')
+                commit_hash = meta.get('commit_hash')
+                if not filename:
+                    continue
+                if filename not in result:
+                    result[filename] = {
+                        'commit_hashes': [],
+                        'ids': []
+                    }
+                if commit_hash is not None:
+                    result[filename]['commit_hashes'].append(commit_hash)
+                result[filename]['ids'].append(db_id)
+        except Exception as e:
+            logger.error(f"Failed to get indexed code data from Chroma: {str(e)}. Continuing with empty index.")
+        return result
+
+    def add_to_collection(self, vectorstore_wrapper, entry_id, new_collection_value):
+        """Add a new collection name to the metadata - Chroma implementation"""
+        # For Chroma, we would need to update the metadata through vectorstore operations
+        # This is a simplified implementation - in practice, you might need more complex logic
+        logger.warning("add_to_collection for Chroma is not fully implemented yet")
 
 
 class VectorStoreAdapterFactory:
--- a/alita_sdk-0.3.231.dist-info/METADATA
+++ b/alita_sdk-0.3.232.dist-info/METADATA
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: alita_sdk
-Version: 0.3.231
+Version: 0.3.232
 Summary: SDK for building langchain agents using resources from Alita
 Author-email: Artem Rozumenko <artyom.rozumenko@gmail.com>, Mikalai Biazruchka <mikalai_biazruchka@epam.com>, Roman Mitusov <roman_mitusov@epam.com>, Ivan Krakhmaliuk <lifedjik@gmail.com>, Artem Dubrovskiy <ad13box@gmail.com>
 License-Expression: Apache-2.0
--- a/alita_sdk-0.3.231.dist-info/RECORD
+++ b/alita_sdk-0.3.232.dist-info/RECORD
@@ -100,7 +100,7 @@ alita_sdk/runtime/tools/pgvector_search.py,sha256=NN2BGAnq4SsDHIhUcFZ8d_dbEOM8Qw
 alita_sdk/runtime/tools/prompt.py,sha256=nJafb_e5aOM1Rr3qGFCR-SKziU9uCsiP2okIMs9PppM,741
 alita_sdk/runtime/tools/router.py,sha256=wCvZjVkdXK9dMMeEerrgKf5M790RudH68pDortnHSz0,1517
 alita_sdk/runtime/tools/tool.py,sha256=lE1hGi6qOAXG7qxtqxarD_XMQqTghdywf261DZawwno,5631
-alita_sdk/runtime/tools/vectorstore.py,sha256=R6M6emjP7VUkXwufI_tfTicx4EKn-lZwxQ16-WzIwMA,44557
+alita_sdk/runtime/tools/vectorstore.py,sha256=0VWmYRWgFvzGViFlhYbUk2fjkofrLlVQQg6Vnx6nxhs,33659
 alita_sdk/runtime/utils/AlitaCallback.py,sha256=E4LlSBuCHWiUq6W7IZExERHZY0qcmdjzc_rJlF2iQIw,7356
 alita_sdk/runtime/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 alita_sdk/runtime/utils/constants.py,sha256=Xntx1b_uxUzT4clwqHA_U6K8y5bBqf_4lSQwXdcWrp4,13586
@@ -112,7 +112,7 @@ alita_sdk/runtime/utils/toolkit_runtime.py,sha256=MU63Fpxj0b5_r1IUUc0Q3-PN9VwL7r
 alita_sdk/runtime/utils/toolkit_utils.py,sha256=I9QFqnaqfVgN26LUr6s3XlBlG6y0CoHURnCzG7XcwVs,5311
 alita_sdk/runtime/utils/utils.py,sha256=CpEl3LCeLbhzQySz08lkKPm7Auac6IiLF7WB8wmArMI,589
 alita_sdk/tools/__init__.py,sha256=1AHqP2xyLjn92xVm70l9XIke6FkfHkLo5OoQVe4BuP8,10421
-alita_sdk/tools/elitea_base.py,sha256=Wg9HqeWlsJ_R5--_Xfg7bm8rqKd68aqHm4a1sKAvahI,30365
+alita_sdk/tools/elitea_base.py,sha256=pxcUj_z4xDy5EQDbEkBuneDBh8QdUzevDcHkCKR35v4,30361
 alita_sdk/tools/ado/__init__.py,sha256=j4lt6MLWlpkIIVkHmAyVG3i_qQeQ3ZmL_g8BfMhVhVI,1289
 alita_sdk/tools/ado/utils.py,sha256=PTCludvaQmPLakF2EbCGy66Mro4-rjDtavVP-xcB2Wc,1252
 alita_sdk/tools/ado/repos/__init__.py,sha256=kc4ZJI3B9CDUp4q3jRSj7JZNc3fJwwMTsV40CiKO7Po,6111
@@ -214,7 +214,7 @@ alita_sdk/tools/elastic/api_wrapper.py,sha256=pl8CqQxteJAGwyOhMcld-ZgtOTFwwbv42O
 alita_sdk/tools/figma/__init__.py,sha256=281OU_aw4Y87Do09HhDSi5zL5ne9YlrsRLZQo8s1U8Q,5316
 alita_sdk/tools/figma/api_wrapper.py,sha256=Rtgt9FvR8VD0oPdYhlgvVyXLVqLTjtiOPTlwNeaV80w,20560
 alita_sdk/tools/github/__init__.py,sha256=CtU52t6-jd6JErWe3M2HF5XXWzFj9CqGmG7HBjUet6E,5348
-alita_sdk/tools/github/api_wrapper.py,sha256=JRhn7Cgg2j6uEwlvuQCMeISNYvRV2Yahx-v-p8HspUQ,8767
+alita_sdk/tools/github/api_wrapper.py,sha256=uDwYckdnpYRJtb0uZnDkaz2udvdDLVxuCh1tSwspsiU,8411
 alita_sdk/tools/github/github_client.py,sha256=nxnSXsDul2PPbWvYZS8TmAFFmR-5ALyakNoV5LN2D4U,86617
 alita_sdk/tools/github/graphql_client_wrapper.py,sha256=d3AGjzLGH_hdQV2V8HeAX92dJ4dlnE5OXqUlCO_PBr0,71539
 alita_sdk/tools/github/schemas.py,sha256=yFsqivfjCPRk9GxFJrL8sTz6nnjFCZ0j5DIfPtGSsvA,13852
@@ -304,7 +304,7 @@ alita_sdk/tools/testrail/__init__.py,sha256=577XVaOAoXG3mDkojCsy5XCUlxCsdJf_2-_5
 alita_sdk/tools/testrail/api_wrapper.py,sha256=Aax0jspgidXYNxLIw6qTWu3dO2JOIS0ALIqsCzQuFbQ,32087
 alita_sdk/tools/utils/__init__.py,sha256=155xepXPr4OEzs2Mz5YnjXcBpxSv1X2eznRUVoPtyK0,3268
 alita_sdk/tools/utils/content_parser.py,sha256=yi1IDLreqfM41w-PnoFEvVLtSV50qpNvKshJwbDTgqs,7172
-alita_sdk/tools/vector_adapters/VectorStoreAdapter.py,sha256=KhxojgddWlEQ4TZA7jEL5ZEp86PcXfmfgRXixsjj7-M,3634
+alita_sdk/tools/vector_adapters/VectorStoreAdapter.py,sha256=kB6KYN4IRisyNc3U4SYJ4PdOoPKH1wrRvRwvdrjZ0OQ,16850
 alita_sdk/tools/vector_adapters/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 alita_sdk/tools/xray/__init__.py,sha256=OYa1wveTm-lAhsJaGXMnwOrDQWl6ch--NjNLBeR63eM,4331
 alita_sdk/tools/xray/api_wrapper.py,sha256=A8PJmY2k7TowaD_vk6ZxkMnSUoZUt9A6g4TJrZfNTAw,32225
@@ -325,8 +325,8 @@ alita_sdk/tools/zephyr_scale/api_wrapper.py,sha256=JAeWf-RXohsxheUpT0iMDClc_izj-
 alita_sdk/tools/zephyr_squad/__init__.py,sha256=0AI_j27xVO5Gk5HQMFrqPTd4uvuVTpiZUicBrdfEpKg,2796
 alita_sdk/tools/zephyr_squad/api_wrapper.py,sha256=kmw_xol8YIYFplBLWTqP_VKPRhL_1ItDD0_vXTe_UuI,14906
 alita_sdk/tools/zephyr_squad/zephyr_squad_cloud_client.py,sha256=R371waHsms4sllHCbijKYs90C-9Yu0sSR3N4SUfQOgU,5066
-alita_sdk-0.3.231.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
-alita_sdk-0.3.231.dist-info/METADATA,sha256=8JHRCwh-SnyBtnD3wL4yJDUL1ov5fLXL1WUyfSIynf0,18896
-alita_sdk-0.3.231.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-alita_sdk-0.3.231.dist-info/top_level.txt,sha256=0vJYy5p_jK6AwVb1aqXr7Kgqgk3WDtQ6t5C-XI9zkmg,10
-alita_sdk-0.3.231.dist-info/RECORD,,
+alita_sdk-0.3.232.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+alita_sdk-0.3.232.dist-info/METADATA,sha256=HMHMoJWO6wQ3h3u5c-p_27RlppcpFUaw9BDyOL7Y9_c,18896
+alita_sdk-0.3.232.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+alita_sdk-0.3.232.dist-info/top_level.txt,sha256=0vJYy5p_jK6AwVb1aqXr7Kgqgk3WDtQ6t5C-XI9zkmg,10
+alita_sdk-0.3.232.dist-info/RECORD,,