alita-sdk 0.3.209__py3-none-any.whl → 0.3.210__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47)
  1. alita_sdk/runtime/clients/artifact.py +18 -4
  2. alita_sdk/runtime/langchain/document_loaders/AlitaCSVLoader.py +2 -1
  3. alita_sdk/runtime/langchain/document_loaders/AlitaDocxMammothLoader.py +3 -3
  4. alita_sdk/runtime/langchain/document_loaders/AlitaImageLoader.py +8 -4
  5. alita_sdk/runtime/langchain/document_loaders/AlitaTableLoader.py +1 -1
  6. alita_sdk/runtime/langchain/langraph_agent.py +1 -1
  7. alita_sdk/runtime/toolkits/artifact.py +7 -3
  8. alita_sdk/runtime/toolkits/tools.py +8 -1
  9. alita_sdk/runtime/tools/application.py +2 -0
  10. alita_sdk/runtime/tools/artifact.py +65 -8
  11. alita_sdk/runtime/tools/vectorstore.py +125 -41
  12. alita_sdk/runtime/utils/utils.py +3 -0
  13. alita_sdk/tools/ado/__init__.py +8 -0
  14. alita_sdk/tools/ado/repos/repos_wrapper.py +37 -0
  15. alita_sdk/tools/ado/test_plan/test_plan_wrapper.py +0 -7
  16. alita_sdk/tools/ado/work_item/__init__.py +4 -0
  17. alita_sdk/tools/ado/work_item/ado_wrapper.py +37 -4
  18. alita_sdk/tools/aws/delta_lake/__init__.py +1 -1
  19. alita_sdk/tools/bitbucket/__init__.py +13 -1
  20. alita_sdk/tools/bitbucket/api_wrapper.py +31 -4
  21. alita_sdk/tools/bitbucket/cloud_api_wrapper.py +31 -0
  22. alita_sdk/tools/chunkers/code/codeparser.py +18 -10
  23. alita_sdk/tools/confluence/api_wrapper.py +35 -134
  24. alita_sdk/tools/confluence/loader.py +30 -28
  25. alita_sdk/tools/elitea_base.py +112 -11
  26. alita_sdk/tools/figma/__init__.py +13 -1
  27. alita_sdk/tools/figma/api_wrapper.py +47 -3
  28. alita_sdk/tools/github/api_wrapper.py +8 -0
  29. alita_sdk/tools/github/github_client.py +18 -0
  30. alita_sdk/tools/gitlab/__init__.py +4 -0
  31. alita_sdk/tools/gitlab/api_wrapper.py +10 -0
  32. alita_sdk/tools/google/bigquery/__init__.py +1 -1
  33. alita_sdk/tools/jira/__init__.py +21 -13
  34. alita_sdk/tools/jira/api_wrapper.py +285 -5
  35. alita_sdk/tools/sharepoint/__init__.py +11 -1
  36. alita_sdk/tools/sharepoint/api_wrapper.py +23 -53
  37. alita_sdk/tools/testrail/__init__.py +4 -0
  38. alita_sdk/tools/testrail/api_wrapper.py +21 -54
  39. alita_sdk/tools/utils/content_parser.py +72 -8
  40. alita_sdk/tools/xray/__init__.py +8 -1
  41. alita_sdk/tools/xray/api_wrapper.py +505 -14
  42. alita_sdk/tools/zephyr_scale/api_wrapper.py +5 -5
  43. {alita_sdk-0.3.209.dist-info → alita_sdk-0.3.210.dist-info}/METADATA +1 -1
  44. {alita_sdk-0.3.209.dist-info → alita_sdk-0.3.210.dist-info}/RECORD +47 -47
  45. {alita_sdk-0.3.209.dist-info → alita_sdk-0.3.210.dist-info}/WHEEL +0 -0
  46. {alita_sdk-0.3.209.dist-info → alita_sdk-0.3.210.dist-info}/licenses/LICENSE +0 -0
  47. {alita_sdk-0.3.209.dist-info → alita_sdk-0.3.210.dist-info}/top_level.txt +0 -0
alita_sdk/runtime/clients/artifact.py
@@ -24,7 +24,14 @@ class Artifact:
             logger.error(f"Error: {e}")
             return f"Error: {e}"
 
-    def get(self, artifact_name: str, bucket_name: str = None, is_capture_image: bool = False, page_number: int = None, sheet_name: str = None):
+    def get(self,
+            artifact_name: str,
+            bucket_name: str = None,
+            is_capture_image: bool = False,
+            page_number: int = None,
+            sheet_name: str = None,
+            excel_by_sheets: bool = False,
+            llm = None):
         if not bucket_name:
             bucket_name = self.bucket_name
         data = self.client.download_artifact(bucket_name, artifact_name)
@@ -37,17 +44,24 @@ class Artifact:
         if detected['encoding'] is not None:
             return data.decode(detected['encoding'])
         else:
-            return parse_file_content(artifact_name, data, is_capture_image, page_number, sheet_name)
+            return parse_file_content(file_name=artifact_name,
+                                      file_content=data,
+                                      is_capture_image=is_capture_image,
+                                      page_number=page_number,
+                                      sheet_name=sheet_name,
+                                      excel_by_sheets=excel_by_sheets,
+                                      llm=llm)
 
     def delete(self, artifact_name: str, bucket_name = None):
         if not bucket_name:
             bucket_name = self.bucket_name
         self.client.delete_artifact(bucket_name, artifact_name)
 
-    def list(self, bucket_name: str = None) -> str:
+    def list(self, bucket_name: str = None, return_as_string = True) -> str|dict:
         if not bucket_name:
             bucket_name = self.bucket_name
-        return str(self.client.list_artifacts(bucket_name))
+        artifacts = self.client.list_artifacts(bucket_name)
+        return str(artifacts) if return_as_string else artifacts
 
     def append(self, artifact_name: str, additional_data: Any, bucket_name: str = None):
         if not bucket_name:
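
For orientation, a minimal usage sketch of the new get and list signatures (assuming an already configured Artifact instance named artifact; the file name and the llm object are illustrative):

    # Read an .xlsx artifact sheet-by-sheet, letting an LLM describe embedded images.
    content = artifact.get(
        artifact_name="report.xlsx",   # illustrative file name
        is_capture_image=True,
        excel_by_sheets=True,          # per-sheet parsing, passed through to parse_file_content
        llm=llm,                       # assumed LLM client, forwarded to parse_file_content
    )

    # list() can now return the raw listing instead of its str() representation.
    files_as_text = artifact.list()                        # str, as before
    files_as_data = artifact.list(return_as_string=False)  # whatever list_artifacts() returns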
alita_sdk/runtime/langchain/document_loaders/AlitaCSVLoader.py
@@ -26,7 +26,8 @@ class AlitaCSVLoader(AlitaTableLoader):
                  json_documents: bool = True,
                  raw_content: bool = False,
                  columns: Optional[List[str]] = None,
-                 cleanse: bool = True):
+                 cleanse: bool = True,
+                 **kwargs):
         super().__init__(file_path=file_path, json_documents=json_documents, columns=columns, raw_content=raw_content, cleanse=cleanse)
         self.encoding = encoding
         self.autodetect_encoding = autodetect_encoding
alita_sdk/runtime/langchain/document_loaders/AlitaDocxMammothLoader.py
@@ -18,19 +18,19 @@ class AlitaDocxMammothLoader(BaseLoader):
     Loader for Docx files using Mammoth to convert to HTML, with image handling,
     and then Markdownify to convert HTML to markdown.
     """
-    def __init__(self, path: str, **kwargs):
+    def __init__(self, file_path: str, **kwargs):
         """
         Initializes AlitaDocxMammothLoader.
 
         Args:
             **kwargs: Keyword arguments, including:
-                path (str): Path to the Docx file. Required.
+                file_path (str): Path to the Docx file. Required.
                 llm (LLM, optional): Language model for processing images.
                 prompt (str, optional): Prompt for the language model.
         Raises:
             ValueError: If the 'path' parameter is not provided.
         """
-        self.path = path
+        self.path = file_path
         self.llm = kwargs.get("llm")
         self.prompt = kwargs.get("prompt")
 
alita_sdk/runtime/langchain/document_loaders/AlitaImageLoader.py
@@ -19,11 +19,15 @@ Image.MAX_IMAGE_PIXELS = 300_000_000
 class AlitaImageLoader(BaseLoader):
     """Loads image files using pytesseract for OCR or optionally LLM for advanced analysis, including SVG support."""
 
-    def __init__(self, **kwargs):
-        if not kwargs.get('path'):
-            raise ValueError("Path parameter 'path' is required")
-        else:
+    def __init__(self, file_path=None, **kwargs):
+        # Handle both positional and keyword arguments for file_path
+        if file_path is not None:
+            self.file_path = file_path
+        elif kwargs.get('path'):
             self.file_path = kwargs['path']
+        else:
+            raise ValueError(
+                "Path parameter is required (either as 'file_path' positional argument or 'path' keyword argument)")
         self.llm = kwargs.get('llm', None)
         self.ocr_language = kwargs.get('ocr_language', None)
         self.prompt = kwargs.get('prompt') if kwargs.get(
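
A short sketch of the two call styles the reworked constructor now accepts (the file name is illustrative):

    # Both forms resolve to the same self.file_path; omitting both raises ValueError.
    loader_new = AlitaImageLoader("diagram.png")       # positional/keyword file_path
    loader_old = AlitaImageLoader(path="diagram.png")  # legacy 'path' keyword still works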
alita_sdk/runtime/langchain/document_loaders/AlitaTableLoader.py
@@ -26,7 +26,7 @@ class AlitaTableLoader(BaseLoader):
                  json_documents: bool = True,
                  raw_content: bool = False,
                  columns: Optional[List[str]] = None,
-                 cleanse: bool = True):
+                 cleanse: bool = True, **kwargs):
 
         self.raw_content = raw_content
         self.file_path = file_path
alita_sdk/runtime/langchain/langraph_agent.py
@@ -505,7 +505,7 @@ def create_graph(
         if isinstance(connected_tools, dict):
             for toolkit, selected_tools in connected_tools.items():
                 for tool in selected_tools:
-                    tool_names.append(f"{toolkit}___{tool}")
+                    tool_names.append(f"{toolkit}{TOOLKIT_SPLITTER}{tool}")
         elif isinstance(connected_tools, list):
             # for cases when tools are provided as a list of names with already bound toolkit_name
             tool_names = connected_tools
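
This change only swaps the hard-coded "___" for the shared TOOLKIT_SPLITTER constant (defined as "___", visible as context in the utils.py hunk further down), so the generated names are unchanged. A tiny illustration with made-up names:

    toolkit, tool = "github", "create_issue"      # illustrative names
    name = f"{toolkit}{TOOLKIT_SPLITTER}{tool}"   # -> "github___create_issue"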
alita_sdk/runtime/toolkits/artifact.py
@@ -3,7 +3,7 @@ from typing import List, Any, Literal, Optional
 from alita_sdk.tools.utils import clean_string, TOOLKIT_SPLITTER, get_max_toolkit_length
 from langchain_community.agent_toolkits.base import BaseToolkit
 from langchain_core.tools import BaseTool
-from pydantic import create_model, BaseModel, ConfigDict, Field
+from pydantic import create_model, BaseModel, ConfigDict, Field, SecretStr
 from pydantic.fields import FieldInfo
 from ..tools.artifact import ArtifactWrapper
 from alita_sdk.tools.base.tool import BaseAction
@@ -22,15 +22,19 @@ class ArtifactToolkit(BaseToolkit):
             # client = (Any, FieldInfo(description="Client object", required=True, autopopulate=True)),
             bucket = (str, FieldInfo(description="Bucket name", json_schema_extra={'toolkit_name': True, 'max_toolkit_length': ArtifactToolkit.toolkit_max_length})),
             selected_tools=(List[Literal[tuple(selected_tools)]], Field(default=[], json_schema_extra={'args_schemas': selected_tools})),
+            # indexer settings
+            connection_string = (Optional[SecretStr], Field(description="Connection string for vectorstore",
+                                                            default=None,
+                                                            json_schema_extra={'secret': True})),
             __config__=ConfigDict(json_schema_extra={'metadata': {"label": "Artifact", "icon_url": None}})
         )
 
     @classmethod
-    def get_toolkit(cls, client: Any, bucket: str, toolkit_name: Optional[str] = None, selected_tools: list[str] = []):
+    def get_toolkit(cls, client: Any, bucket: str, toolkit_name: Optional[str] = None, selected_tools: list[str] = [], **kwargs):
         if selected_tools is None:
             selected_tools = []
         tools = []
-        artifact_wrapper = ArtifactWrapper(client=client, bucket=bucket)
+        artifact_wrapper = ArtifactWrapper(client=client, bucket=bucket, **kwargs)
         prefix = clean_string(toolkit_name, cls.toolkit_max_length) + TOOLKIT_SPLITTER if toolkit_name else ''
         available_tools = artifact_wrapper.get_available_tools()
         for tool in available_tools:
alita_sdk/runtime/toolkits/tools.py
@@ -71,7 +71,14 @@ def get_tools(tools_list: list, alita_client, llm, memory_store: BaseStore = Non
                 client=alita_client,
                 bucket=tool['settings']['bucket'],
                 toolkit_name=tool.get('toolkit_name', ''),
-                selected_tools=tool['settings'].get('selected_tools', [])
+                selected_tools=tool['settings'].get('selected_tools', []),
+                llm=tool['settings'].get('llm'),
+                # indexer settings
+                connection_string=tool['settings'].get('connection_string', None),
+                collection_name=f"{tool.get('toolkit_name')}_{str(tool['id'])}",
+                embedding_model="HuggingFaceEmbeddings",
+                embedding_model_params={"model_name": "sentence-transformers/all-MiniLM-L6-v2"},
+                vectorstore_type="PGVector"
             ).get_tools())
         elif tool['type'] == 'vectorstore':
             tools.extend(VectorStoreToolkit.get_toolkit(
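
For context, a hedged sketch of the tool entry shape this branch consumes; only keys actually read in the hunk above are shown, and all values are illustrative:

    tool = {
        "id": 42,                         # used to build the collection name
        "toolkit_name": "my_artifacts",
        "settings": {
            "bucket": "my-bucket",
            "selected_tools": [],
            "llm": None,                  # optional LLM forwarded to the wrapper
            "connection_string": "postgresql+psycopg://...",  # enables the PGVector-backed index tools
        },
    }
    # collection_name becomes f"{tool.get('toolkit_name')}_{str(tool['id'])}", i.e. "my_artifacts_42"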
alita_sdk/runtime/tools/application.py
@@ -56,6 +56,8 @@ class Application(BaseTool):
         schema_values = self.args_schema(**input).model_dump() if self.args_schema else {}
         extras = {k: v for k, v in input.items() if k not in schema_values}
         all_kwargs = {**kwargs, **extras, **schema_values}
+        if config is None:
+            config = {}
         return self._run(*config, **all_kwargs)
 
     def _run(self, *args, **kwargs):
alita_sdk/runtime/tools/artifact.py
@@ -1,8 +1,19 @@
-from alita_sdk.tools.elitea_base import BaseToolApiWrapper
-from typing import Any, Optional
+import hashlib
+import json
+from typing import Any, Optional, Generator, List
+
+from langchain_core.documents import Document
+from langchain_core.tools import ToolException
 from pydantic import create_model, Field, model_validator
 
-class ArtifactWrapper(BaseToolApiWrapper):
+from alita_sdk.tools.elitea_base import BaseVectorStoreToolApiWrapper, extend_with_vector_tools
+
+try:
+    from alita_sdk.runtime.langchain.interfaces.llm_processor import get_embeddings
+except ImportError:
+    from alita_sdk.langchain.interfaces.llm_processor import get_embeddings
+
+class ArtifactWrapper(BaseVectorStoreToolApiWrapper):
     client: Any
     bucket: str
     artifact: Optional[Any] = None
@@ -17,14 +28,26 @@ class ArtifactWrapper(BaseToolApiWrapper):
         values["artifact"] = values['client'].artifact(values['bucket'])
         return values
 
-    def list_files(self, bucket_name = None):
-        return self.artifact.list(bucket_name)
+    def list_files(self, bucket_name = None, return_as_string = True):
+        return self.artifact.list(bucket_name, return_as_string)
 
     def create_file(self, filename: str, filedata: str, bucket_name = None):
         return self.artifact.create(filename, filedata, bucket_name)
 
-    def read_file(self, filename: str, bucket_name = None, is_capture_image: bool = False, page_number: int = None, sheet_name: str = None):
-        return self.artifact.get(filename, bucket_name, is_capture_image, page_number, sheet_name)
+    def read_file(self,
+                  filename: str,
+                  bucket_name = None,
+                  is_capture_image: bool = False,
+                  page_number: int = None,
+                  sheet_name: str = None,
+                  excel_by_sheets: bool = False):
+        return self.artifact.get(artifact_name=filename,
+                                 bucket_name=bucket_name,
+                                 is_capture_image=is_capture_image,
+                                 page_number=page_number,
+                                 sheet_name=sheet_name,
+                                 excel_by_sheets=excel_by_sheets,
+                                 llm=self.llm)
 
     def delete_file(self, filename: str, bucket_name = None):
         return self.artifact.delete(filename, bucket_name)
@@ -38,6 +61,40 @@ class ArtifactWrapper(BaseToolApiWrapper):
     def create_new_bucket(self, bucket_name: str, expiration_measure = "weeks", expiration_value = 1):
         return self.artifact.client.create_bucket(bucket_name, expiration_measure, expiration_value)
 
+    def _base_loader(self, **kwargs) -> List[Document]:
+        try:
+            all_files = self.list_files(self.bucket, False)
+        except Exception as e:
+            raise ToolException(f"Unable to extract files: {e}")
+
+        docs: List[Document] = []
+        for file in all_files['rows']:
+            metadata = {
+                ("updated_on" if k == "modified" else k): str(v)
+                for k, v in file.items()
+            }
+            metadata['id'] = self.get_hash_from_bucket_and_file_name(self.bucket, file['name'])
+            docs.append(Document(page_content="", metadata=metadata))
+        return docs
+
+    def get_hash_from_bucket_and_file_name(self, bucket, file_name):
+        hasher = hashlib.sha256()
+        hasher.update(bucket.encode('utf-8'))
+        hasher.update(file_name.encode('utf-8'))
+        return hasher.hexdigest()
+
+    def _process_document(self, document: Document) -> Generator[Document, None, None]:
+        page_content = self.read_file(document.metadata['name'], is_capture_image=True, excel_by_sheets=True)
+        if isinstance(page_content, dict):
+            for key, value in page_content.items():
+                metadata = document.metadata
+                metadata['page'] = key
+                yield Document(page_content=str(value), metadata=metadata)
+        else:
+            document.page_content = json.dumps(str(page_content))
+            yield document
+
+    @extend_with_vector_tools
     def get_available_tools(self):
         bucket_name = (Optional[str], Field(description="Name of the bucket to work with."
                                                         "If bucket is not specified by user directly, the name should be taken from chat history."
@@ -125,5 +182,5 @@ class ArtifactWrapper(BaseToolApiWrapper):
                     default="weeks")),
                 expiration_value=(Optional[int], Field(description="Expiration time values.", default=1))
             )
-        },
+        }
     ]
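
A minimal sketch of how the new indexing helpers are expected to fit together (assuming a configured ArtifactWrapper named wrapper; values are illustrative):

    # Stage 1: one empty Document per bucket file, with a stable id derived from bucket + file name.
    base_docs = wrapper._base_loader()

    # The id is simply sha256(bucket_bytes + file_name_bytes), so re-runs yield the same id per file:
    import hashlib
    doc_id = hashlib.sha256("my-bucket".encode() + "report.xlsx".encode()).hexdigest()  # illustrative

    # Stage 2: each base document is expanded into content-bearing documents; dict results
    # (e.g. an Excel file parsed sheet-by-sheet) become one Document per sheet via the 'page' key.
    for doc in base_docs:
        for chunk in wrapper._process_document(doc):
            print(chunk.metadata.get("page"), len(chunk.page_content))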
alita_sdk/runtime/tools/vectorstore.py
@@ -188,62 +188,108 @@ class VectorStoreWrapper(BaseToolApiWrapper):
         except Exception as e:
             logger.error(f"Failed to initialize PGVectorSearch: {str(e)}")
 
+    def _clean_collection(self):
+        """
+        Clean the vectorstore collection by deleting all indexed data.
+        """
+        self._log_data(
+            f"Cleaning collection '{self.dataset}'",
+            tool_name="_clean_collection"
+        )
+        data = self.vectoradapter.vectorstore.get(include=['metadatas'])
+        self.vectoradapter.vectorstore.delete(ids=data['ids'])
+        self._log_data(
+            f"Collection '{self.dataset}' has been cleaned. ",
+            tool_name="_clean_collection"
+        )
+
     def _get_indexed_data(self, store):
-        """ Get all indexed data from vectorstore """
+        """ Get all indexed data from vectorstore for non-code content """
 
         # get already indexed data
         result = {}
         try:
             self._log_data("Retrieving already indexed data from vectorstore",
                            tool_name="index_documents")
-            data = store.get(include=['documents', 'metadatas'])
+            data = store.get(include=['metadatas'])
             # re-structure data to be more usable
-            for doc_str, meta, db_id in zip(data['documents'], data['metadatas'], data['ids']):
+            for meta, db_id in zip(data['metadatas'], data['ids']):
+                # get document id from metadata
                 doc_id = str(meta['id'])
                 dependent_docs = meta.get(IndexerKeywords.DEPENDENT_DOCS.value, [])
+                if dependent_docs:
+                    dependent_docs = [d.strip() for d in dependent_docs.split(';') if d.strip()]
                 parent_id = meta.get(IndexerKeywords.PARENT.value, -1)
-                result[doc_id] = {
-                    'metadata': meta,
-                    'document': doc_str,
-                    'id': db_id,
-                    IndexerKeywords.DEPENDENT_DOCS.value: dependent_docs,
-                    IndexerKeywords.PARENT.value: parent_id
-                }
+                #
+                chunk_id = meta.get('chunk_id')
+                if doc_id in result and chunk_id:
+                    # if document with the same id already saved, add db_id fof current one as chunk
+                    result[doc_id]['all_chunks'].append(db_id)
+                else:
+                    result[doc_id] = {
+                        'metadata': meta,
+                        'id': db_id,
+                        'all_chunks': [db_id],
+                        IndexerKeywords.DEPENDENT_DOCS.value: dependent_docs,
+                        IndexerKeywords.PARENT.value: parent_id
+                    }
         except Exception as e:
             logger.error(f"Failed to get indexed data from vectorstore: {str(e)}. Continuing with empty index.")
         return result
 
-    def _reduce_duplicates(self, documents: Generator[Document, None, None], store) -> List[Any]:
-        """Remove documents already indexed in the vectorstore based on metadata 'id' and 'updated_on' fields."""
+    def _get_code_indexed_data(self, store) -> Dict[str, Dict[str, Any]]:
+        """ Get all indexed data from vectorstore for code content """
 
-        self._log_data("Verification of documents to index started", tool_name="index_documents")
+        # get already indexed data
+        result = {}
+        try:
+            self._log_data("Retrieving already indexed code data from vectorstore",
+                           tool_name="index_documents")
+            data = store.get(include=['metadatas'])
+            # re-structure data to be more usable
+            for meta, db_id in zip(data['metadatas'], data['ids']):
+                filename = meta['filename']
+                commit_hash = meta.get('commit_hash')
+                if filename not in result:
+                    result[filename] = {
+                        'commit_hashes': [],
+                        'ids': []
+                    }
+                if commit_hash is not None:
+                    result[filename]['commit_hashes'].append(commit_hash)
+                result[filename]['ids'].append(db_id)
+        except Exception as e:
+            logger.error(f"Failed to get indexed code data from vectorstore: {str(e)}. Continuing with empty index.")
+        return result
 
-        indexed_data = self._get_indexed_data(store)
-        indexed_ids = set(indexed_data.keys())
-        if not indexed_ids:
+    def _reduce_duplicates(
+            self,
+            documents: Generator[Any, None, None],
+            store,
+            get_indexed_data: Callable,
+            key_fn: Callable,
+            compare_fn: Callable,
+            remove_ids_fn: Callable,
+            log_msg: str = "Verification of documents to index started"
+    ) -> List[Any]:
+        """Generic duplicate reduction logic for documents."""
+        self._log_data(log_msg, tool_name="index_documents")
+        indexed_data = get_indexed_data(store)
+        indexed_keys = set(indexed_data.keys())
+        if not indexed_keys:
             self._log_data("Vectorstore is empty, indexing all incoming documents", tool_name="index_documents")
             return list(documents)
 
         final_docs = []
-        docs_to_remove = []
+        docs_to_remove = set()
 
         for document in documents:
-            doc_id = document.metadata.get('id')
-            # get document's metadata and id and check if already indexed
-            if doc_id in indexed_ids:
-                # document has been indexed already, then verify `updated_on`
-                to_index_updated_on = document.metadata.get('updated_on')
-                indexed_meta = indexed_data[doc_id]['metadata']
-                indexed_updated_on = indexed_meta.get('updated_on')
-                if to_index_updated_on and indexed_updated_on and to_index_updated_on == indexed_updated_on:
-                    # same updated_on, skip indexing
+            key = key_fn(document)
+            if key in indexed_keys:
+                if compare_fn(document, indexed_data[key]):
                     continue
-                # if updated_on is missing or different, we will re-index the document and remove old one
-                # parent doc removal
-                docs_to_remove.append(indexed_data[doc_id]['id'])
-                # mark dependent docs for removal
-                for dependent_doc_id in indexed_data[doc_id][IndexerKeywords.DEPENDENT_DOCS.value]:
-                    docs_to_remove.append(indexed_data[dependent_doc_id]['id'])
+                final_docs.append(document)
+                docs_to_remove.update(remove_ids_fn(indexed_data, key))
             else:
                 final_docs.append(document)
 
@@ -252,16 +298,50 @@ class VectorStoreWrapper(BaseToolApiWrapper):
                 f"Removing {len(docs_to_remove)} documents from vectorstore that are already indexed with different updated_on.",
                 tool_name="index_documents"
             )
-            store.delete(ids=docs_to_remove)
+            store.delete(ids=list(docs_to_remove))
 
         return final_docs
 
-    def index_documents(self, documents: Generator[Document, None, None], progress_step: int = 20, clean_index: bool = True):
+    def _reduce_non_code_duplicates(self, documents: Generator[Any, None, None], store) -> List[Any]:
+        return self._reduce_duplicates(
+            documents,
+            store,
+            self._get_indexed_data,
+            lambda doc: doc.metadata.get('id'),
+            lambda doc, idx: (
+                doc.metadata.get('updated_on') and
+                idx['metadata'].get('updated_on') and
+                doc.metadata.get('updated_on') == idx['metadata'].get('updated_on')
+            ),
+            lambda idx_data, key: (
+                idx_data[key]['all_chunks'] +
+                [idx_data[dep_id]['id'] for dep_id in idx_data[key][IndexerKeywords.DEPENDENT_DOCS.value]] +
+                [chunk_db_id for dep_id in idx_data[key][IndexerKeywords.DEPENDENT_DOCS.value]
+                 for chunk_db_id in idx_data[dep_id]['all_chunks']]
+            ),
+            log_msg="Verification of documents to index started"
+        )
+
+    def _reduce_code_duplicates(self, documents: Generator[Any, None, None], store) -> List[Any]:
+        return self._reduce_duplicates(
+            documents,
+            store,
+            self._get_code_indexed_data,
+            lambda doc: doc.metadata.get('filename'),
+            lambda doc, idx: (
+                doc.metadata.get('commit_hash') and
+                idx.get('commit_hashes') and
+                doc.metadata.get('commit_hash') in idx.get('commit_hashes')
+            ),
+            lambda idx_data, key: idx_data[key]['ids'],
+            log_msg="Verification of code documents to index started"
+        )
+
+    def index_documents(self, documents: Generator[Document, None, None], progress_step: int = 20, clean_index: bool = True, is_code: bool = False):
         """ Index documents in the vectorstore.
 
         Args:
             documents (Any): Generator or list of documents to index.
-            document_processing_func (Optional[Callable]): Function to process documents after duplicates removal and before indexing.
             progress_step (int): Step for progress reporting, default is 20.
             clean_index (bool): If True, clean the index before re-indexing all documents.
         """
@@ -273,7 +353,7 @@ class VectorStoreWrapper(BaseToolApiWrapper):
             logger.info("Cleaning index before re-indexing all documents.")
             self._log_data("Cleaning index before re-indexing all documents. Previous index will be removed", tool_name="index_documents")
             try:
-                self.vectoradapter.delete_dataset(self.dataset)
+                self._clean_collection()
                 self.vectoradapter.persist()
                 self.vectoradapter.vacuum()
                 self._log_data("Previous index has been removed",
@@ -283,9 +363,9 @@ class VectorStoreWrapper(BaseToolApiWrapper):
             if isinstance(documents, types.GeneratorType):
                 documents = list(documents)
         else:
-            # remove duplicates based on metadata 'id' and 'updated_on' fields
-            documents = self._reduce_duplicates(documents, self.vectoradapter.vectorstore)
-
+            # remove duplicates based on metadata 'id' and 'updated_on' or 'commit_hash' fields
+            documents = self._reduce_code_duplicates(documents, self.vectoradapter.vectorstore) if is_code \
+                else self._reduce_non_code_duplicates(documents, self.vectoradapter.vectorstore)
 
         if not documents or len(documents) == 0:
             logger.info("No new documents to index after duplicate check.")
@@ -455,8 +535,12 @@ class VectorStoreWrapper(BaseToolApiWrapper):
         )
 
         # Initialize document map for tracking by ID
-        doc_map = {doc.metadata.get('id', f"idx_{i}"): (doc, score)
-                   for i, (doc, score) in enumerate(vector_items)}
+        doc_map = {
+            f"{doc.metadata.get('id', f'idx_{i}')}_{doc.metadata['chunk_id']}"
+            if 'chunk_id' in doc.metadata
+            else doc.metadata.get('id', f"idx_{i}"): (doc, score)
+            for i, (doc, score) in enumerate(vector_items)
+        }
 
         # Process full-text search if configured
         if full_text_search and full_text_search.get('enabled') and full_text_search.get('fields'):
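
To make the refactor easier to follow, here is a standalone, simplified rendition of the duplicate-reduction template and its callback contract; it is not the SDK method itself (which also handles generators, logging, and chunk bookkeeping), just the same technique applied to plain dictionaries:

    from typing import Any, Callable, Dict, List

    def reduce_duplicates(
        documents: List[Dict[str, Any]],
        indexed: Dict[str, Dict[str, Any]],
        key_fn: Callable[[Dict[str, Any]], str],
        compare_fn: Callable[[Dict[str, Any], Dict[str, Any]], bool],
        remove_ids_fn: Callable[[Dict[str, Dict[str, Any]], str], List[str]],
    ):
        """Keep new or changed documents; collect db ids of stale entries to purge."""
        final_docs, docs_to_remove = [], set()
        for doc in documents:
            key = key_fn(doc)
            if key in indexed:
                if compare_fn(doc, indexed[key]):
                    continue                              # unchanged -> skip re-indexing
                final_docs.append(doc)                    # changed -> re-index
                docs_to_remove.update(remove_ids_fn(indexed, key))
            else:
                final_docs.append(doc)                    # brand new -> index
        return final_docs, docs_to_remove

    # Code-style usage: identity is the filename, "still current" means the commit hash was seen before.
    docs = [{"filename": "a.py", "commit_hash": "abc"}, {"filename": "b.py", "commit_hash": "new"}]
    indexed = {"b.py": {"commit_hashes": ["old"], "ids": ["db-7", "db-8"]}}
    keep, purge = reduce_duplicates(
        docs, indexed,
        key_fn=lambda d: d["filename"],
        compare_fn=lambda d, idx: d["commit_hash"] in idx["commit_hashes"],
        remove_ids_fn=lambda idx, key: idx[key]["ids"],
    )
    # keep == both documents (a.py is new, b.py changed); purge == {"db-7", "db-8"}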
alita_sdk/runtime/utils/utils.py
@@ -4,8 +4,11 @@ from enum import Enum
 TOOLKIT_SPLITTER = "___"
 
 class IndexerKeywords(Enum):
+    # TODO: remove these fields when the indexer is updated
     DEPENDENT_DOCS = 'dependent_docs'
     PARENT = 'parent_id'
+    # DEPENDENCY_ID = 'dependency_id'
+    UPDATED_ON = 'updated_on'
 
 # This pattern matches characters that are NOT alphanumeric, underscores, or hyphens
 clean_string_pattern = re.compile(r'[^a-zA-Z0-9_.-]')
alita_sdk/tools/ado/__init__.py
@@ -14,6 +14,14 @@ def get_tools(tool_type, tool):
         "token": tool['settings'].get('token', None),
         "limit": tool['settings'].get('limit', 5),
         "toolkit_name": tool.get('toolkit_name', ''),
+        # indexer settings
+        "llm":tool['settings'].get('llm', None),
+        "connection_string":tool['settings'].get('connection_string', None),
+        "collection_name":str(tool['id']),
+        "doctype":'doc',
+        "embedding_model":"HuggingFaceEmbeddings",
+        "embedding_model_params":{"model_name": "sentence-transformers/all-MiniLM-L6-v2"},
+        "vectorstore_type":"PGVector"
     }
     if tool_type == 'ado_plans':
         return AzureDevOpsPlansToolkit().get_toolkit(**config_dict).get_tools()
alita_sdk/tools/ado/repos/repos_wrapper.py
@@ -302,6 +302,43 @@ class ReposApiWrapper(BaseCodeToolApiWrapper):
 
         return values
 
+    def _get_commits(self, file_path: str, branch: str, top: int = None) -> List[GitCommitRef]:
+        """
+        Get commits for a specific file in a specific branch.
+
+        Args:
+            file_path (str): Path to the file in the repository.
+            branch (str): Branch name to get commits from.
+            top (int, optional): Maximum number of commits to return. Defaults to None.
+
+        Returns:
+            List[GitCommitRef]: List of commit references.
+        """
+        try:
+            version_descriptor = GitVersionDescriptor(
+                version=branch, version_type="branch"
+            )
+            commits = self._client.get_commits(
+                repository_id=self.repository_id,
+                project=self.project,
+                search_criteria=GitQueryCommitsCriteria(item_path=file_path,
+                                                        item_version=version_descriptor, top=top if top else 100),
+            )
+            return commits
+        except Exception as e:
+            msg = f"Failed to get commits for file '{file_path}' on branch '{branch}': {str(e)}"
+            logger.error(msg)
+            return ToolException(msg)
+
+    def _file_commit_hash(self, file_path: str, branch: str) -> str:
+        """Get the commit hash of the last commit that modified a file in a specific branch."""
+
+        commits = self._get_commits(file_path, branch, top=1)
+        if commits:
+            return commits[0].commit_id
+        else:
+            return None
+
     def _get_files(
         self,
         path: str = "",
alita_sdk/tools/ado/test_plan/test_plan_wrapper.py
@@ -184,13 +184,6 @@ class TestPlanApiWrapper(BaseVectorStoreToolApiWrapper):
     limit: Optional[int] = 5
     _client: Optional[TestPlanClient] = PrivateAttr()
 
-    llm: Any = None
-    connection_string: Optional[SecretStr] = None
-    collection_name: Optional[str] = None
-    embedding_model: Optional[str] = "HuggingFaceEmbeddings"
-    embedding_model_params: Optional[Dict[str, Any]] = {"model_name": "sentence-transformers/all-MiniLM-L6-v2"}
-    vectorstore_type: Optional[str] = "PGVector"
-
     class Config:
         arbitrary_types_allowed = True
 
alita_sdk/tools/ado/work_item/__init__.py
@@ -37,6 +37,10 @@ class AzureDevOpsWorkItemsToolkit(BaseToolkit):
             token=(SecretStr, Field(description="ADO token", json_schema_extra={'secret': True, 'configuration': True})),
             limit=(Optional[int], Field(description="ADO plans limit used for limitation of the list with results", default=5)),
             selected_tools=(List[Literal[tuple(selected_tools)]], Field(default=[], json_schema_extra={'args_schemas': selected_tools})),
+            # indexer settings
+            connection_string = (Optional[SecretStr], Field(description="Connection string for vectorstore",
+                                                            default=None,
+                                                            json_schema_extra={'secret': True})),
             __config__={
                 'json_schema_extra': {
                     'metadata': {