alita-sdk 0.3.248__py3-none-any.whl → 0.3.250__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -3,7 +3,6 @@ import logging
  import urllib.parse
  from typing import Dict, List, Generator, Optional

- from alita_sdk.tools.elitea_base import BaseVectorStoreToolApiWrapper, extend_with_vector_tools
  from azure.devops.connection import Connection
  from azure.devops.v7_1.core import CoreClient
  from azure.devops.v7_1.wiki import WikiClient
@@ -15,6 +14,8 @@ from pydantic import create_model, PrivateAttr, SecretStr
  from pydantic import model_validator
  from pydantic.fields import Field

+ from alita_sdk.tools.non_code_indexer_toolkit import NonCodeIndexerToolkit
+
  try:
  from alita_sdk.runtime.langchain.interfaces.llm_processor import get_embeddings
  except ImportError:
@@ -94,7 +95,7 @@ ADOUnlinkWorkItemsFromWikiPage = create_model(
  page_name=(str, Field(description="Wiki page path to unlink the work items from", examples=["/TargetPage"]))
  )

- class AzureDevOpsApiWrapper(BaseVectorStoreToolApiWrapper):
+ class AzureDevOpsApiWrapper(NonCodeIndexerToolkit):
  # TODO use ado_configuration instead of organization_url, project and token
  organization_url: str
  project: str
@@ -125,7 +126,7 @@ class AzureDevOpsApiWrapper(BaseVectorStoreToolApiWrapper):
  except Exception as e:
  return ImportError(f"Failed to connect to Azure DevOps: {e}")

- return values
+ return super().validate_toolkit(values)

  def _parse_work_items(self, work_items, fields=None):
  """Parse work items dynamically based on the fields requested."""
@@ -522,14 +523,14 @@ class AzureDevOpsApiWrapper(BaseVectorStoreToolApiWrapper):
  'reason': wi.fields.get('System.Reason', ''),
  'iteration': wi.fields.get('System.IterationPath', ''),
  'updated_on': wi.fields.get('System.ChangedDate', ''),
- 'attachment_ids': [rel.url.split('/')[-1] for rel in wi.relations or [] if rel.rel == 'AttachedFile']
+ 'attachment_ids': {rel.url.split('/')[-1]:rel.attributes.get('name', '') for rel in wi.relations or [] if rel.rel == 'AttachedFile'}
  })

  def _process_document(self, document: Document) -> Generator[Document, None, None]:
- for attachment_id in document.metadata.get('attachment_ids', []):
+ for attachment_id, file_name in document.metadata.get('attachment_ids', {}).items():
  content_generator = self._client.get_attachment_content(id=attachment_id, download=True)
- content = ''.join(str(item) for item in content_generator)
- yield Document(page_content=content, metadata={'id': attachment_id})
+ content = b"".join(x for x in content_generator)
+ yield Document(page_content="", metadata={'id': attachment_id, 'loader_content_type': file_name, 'loader_content': content})

  def _index_tool_params(self):
  """Return the parameters for indexing data."""
@@ -537,10 +538,9 @@ class AzureDevOpsApiWrapper(BaseVectorStoreToolApiWrapper):
  "wiql": (str, Field(description="WIQL (Work Item Query Language) query string to select and filter Azure DevOps work items."))
  }

- @extend_with_vector_tools
  def get_available_tools(self):
  """Return a list of available tools."""
- return [
+ return super().get_available_tools() + [
  {
  "name": "search_work_items",
  "description": self.search_work_items.__doc__,
@@ -0,0 +1,426 @@ (new file: defines BaseIndexerToolkit, imported elsewhere in this diff as alita_sdk.tools.base_indexer_toolkit)
+ import json
+ import logging
+ from typing import Any, Optional, List, Literal, Dict, Generator
+
+ from langchain_core.documents import Document
+ from pydantic import create_model, Field, SecretStr
+
+ # from alita_sdk.runtime.langchain.interfaces.llm_processor import get_embeddings
+ from .chunkers import markdown_chunker
+ from .utils.content_parser import process_content_by_type
+ from .vector_adapters.VectorStoreAdapter import VectorStoreAdapterFactory
+ from ..runtime.tools.vectorstore_base import VectorStoreWrapperBase
+ from ..runtime.utils.utils import IndexerKeywords
+
+ logger = logging.getLogger(__name__)
+
+ # Base Vector Store Schema Models
+ BaseIndexParams = create_model(
+ "BaseIndexParams",
+ collection_suffix=(str, Field(description="Suffix for collection name (max 7 characters) used to separate datasets", min_length=1, max_length=7)),
+ vectorstore_type=(Optional[str], Field(description="Vectorstore type (Chroma, PGVector, Elastic, etc.)", default="PGVector")),
+ )
+
+ RemoveIndexParams = create_model(
+ "RemoveIndexParams",
+ collection_suffix=(Optional[str], Field(description="Optional suffix for collection name (max 7 characters)", default="", max_length=7)),
+ )
+
+ BaseSearchParams = create_model(
+ "BaseSearchParams",
+ query=(str, Field(description="Query text to search in the index")),
+ collection_suffix=(Optional[str], Field(
+ description="Optional suffix for collection name (max 7 characters). Leave empty to search across all datasets",
+ default="", max_length=7)),
+ vectorstore_type=(Optional[str], Field(description="Vectorstore type (Chroma, PGVector, Elastic, etc.)", default="PGVector")),
+ filter=(Optional[dict | str], Field(
+ description="Filter to apply to the search results. Can be a dictionary or a JSON string.",
+ default={},
+ examples=["{\"key\": \"value\"}", "{\"status\": \"active\"}"]
+ )),
+ cut_off=(Optional[float], Field(description="Cut-off score for search results", default=0.5)),
+ search_top=(Optional[int], Field(description="Number of top results to return", default=10)),
+ full_text_search=(Optional[Dict[str, Any]], Field(
+ description="Full text search parameters. Can be a dictionary with search options.",
+ default=None
+ )),
+ extended_search=(Optional[List[str]], Field(
+ description="List of additional fields to include in the search results.",
+ default=None
+ )),
+ reranker=(Optional[dict], Field(
+ description="Reranker configuration. Can be a dictionary with reranking parameters.",
+ default={}
+ )),
+ reranking_config=(Optional[Dict[str, Dict[str, Any]]], Field(
+ description="Reranking configuration. Can be a dictionary with reranking settings.",
+ default=None
+ )),
+ )
+
+ BaseStepbackSearchParams = create_model(
+ "BaseStepbackSearchParams",
+ query=(str, Field(description="Query text to search in the index")),
+ collection_suffix=(Optional[str], Field(description="Optional suffix for collection name (max 7 characters)", default="", max_length=7)),
+ vectorstore_type=(Optional[str], Field(description="Vectorstore type (Chroma, PGVector, Elastic, etc.)", default="PGVector")),
+ messages=(Optional[List], Field(description="Chat messages for stepback search context", default=[])),
+ filter=(Optional[dict | str], Field(
+ description="Filter to apply to the search results. Can be a dictionary or a JSON string.",
+ default={},
+ examples=["{\"key\": \"value\"}", "{\"status\": \"active\"}"]
+ )),
+ cut_off=(Optional[float], Field(description="Cut-off score for search results", default=0.5)),
+ search_top=(Optional[int], Field(description="Number of top results to return", default=10)),
+ reranker=(Optional[dict], Field(
+ description="Reranker configuration. Can be a dictionary with reranking parameters.",
+ default={}
+ )),
+ full_text_search=(Optional[Dict[str, Any]], Field(
+ description="Full text search parameters. Can be a dictionary with search options.",
+ default=None
+ )),
+ reranking_config=(Optional[Dict[str, Dict[str, Any]]], Field(
+ description="Reranking configuration. Can be a dictionary with reranking settings.",
+ default=None
+ )),
+ extended_search=(Optional[List[str]], Field(
+ description="List of additional fields to include in the search results.",
+ default=None
+ )),
+ )
+
+ BaseIndexDataParams = create_model(
+ "indexData",
+ __base__=BaseIndexParams,
+ progress_step=(Optional[int], Field(default=10, ge=0, le=100,
+ description="Optional step size for progress reporting during indexing")),
+ clean_index=(Optional[bool], Field(default=False,
+ description="Optional flag to enforce clean existing index before indexing new data")),
+ chunking_tool=(Literal[None,'markdown', 'statistical', 'proposal'], Field(description="Name of chunking tool", default=None)),
+ chunking_config=(Optional[dict], Field(description="Chunking tool configuration", default_factory=dict)),
+ )
+
+
+ class BaseIndexerToolkit(VectorStoreWrapperBase):
+ """Base class for tool API wrappers that support vector store functionality."""
+
+ doctype: str = "document"
+
+ llm: Any = None
+ connection_string: Optional[SecretStr] = None
+ collection_name: Optional[str] = None
+ embedding_model: Optional[str] = "HuggingFaceEmbeddings"
+ embedding_model_params: Optional[Dict[str, Any]] = {"model_name": "sentence-transformers/all-MiniLM-L6-v2"}
+ vectorstore_type: Optional[str] = "PGVector"
+ _embedding: Optional[Any] = None
+ alita: Any = None # Elitea client, if available
+
+ def __init__(self, **kwargs):
+ conn = kwargs.get('connection_string', None)
+ connection_string = conn.get_secret_value() if isinstance(conn, SecretStr) else conn
+ collection_name = kwargs.get('collection_name')
+
+ # if 'embedding_model' not in kwargs:
+ kwargs['embedding_model'] = 'HuggingFaceEmbeddings'
+ if 'embedding_model_params' not in kwargs:
+ kwargs['embedding_model_params'] = {"model_name": "sentence-transformers/all-MiniLM-L6-v2"}
+ if 'vectorstore_type' not in kwargs:
+ kwargs['vectorstore_type'] = 'PGVector'
+ vectorstore_type = kwargs.get('vectorstore_type')
+ kwargs['vectorstore_params'] = VectorStoreAdapterFactory.create_adapter(vectorstore_type).get_vectorstore_params(collection_name, connection_string)
+ kwargs['_embedding'] = kwargs.get('alita').get_embeddings(kwargs.get('embedding_model'))
+ super().__init__(**kwargs)
+
+ def _index_tool_params(self, **kwargs) -> dict[str, tuple[type, Field]]:
+ """
+ Returns a list of fields for index_data args schema.
+ NOTE: override this method in subclasses to provide specific parameters for certain toolkit.
+ """
+ return {}
+
+ def _base_loader(self, **kwargs) -> Generator[Document, None, None]:
+ """ Loads documents from a source, processes them,
+ and returns a list of Document objects with base metadata: id and created_on."""
+ pass
+
+ def _process_document(self, base_document: Document) -> Generator[Document, None, None]:
+ """ Process an existing base document to extract relevant metadata for full document preparation.
+ Used for late processing of documents after we ensure that the document has to be indexed to avoid
+ time-consuming operations for documents which might be useless.
+
+ Args:
+ document (Document): The base document to process.
+
+ Returns:
+ Document: The processed document with metadata."""
+ pass
+
+ def index_data(self, **kwargs):
+ collection_suffix = kwargs.get("collection_suffix")
+ progress_step = kwargs.get("progress_step")
+ clean_index = kwargs.get("clean_index")
+ chunking_tool = kwargs.get("chunking_tool")
+ chunking_config = kwargs.get("chunking_config")
+ #
+ if clean_index:
+ self._clean_index()
+ #
+ documents = self._base_loader(**kwargs)
+ documents = self._reduce_duplicates(documents, collection_suffix)
+ documents = self._extend_data(documents) # update content of not-reduced base document if needed (for sharepoint and similar)
+ documents = self._collect_dependencies(documents) # collect dependencies for base documents
+ documents = self._apply_loaders_chunkers(documents, chunking_tool, chunking_config)
+ #
+ return self._save_index(list(documents), collection_suffix=collection_suffix, progress_step=progress_step)
+
+ def _apply_loaders_chunkers(self, documents: Generator[Document, None, None], chunking_tool: str=None, chunking_config=None) -> Generator[Document, None, None]:
+ from alita_sdk.tools.chunkers import __confluence_chunkers__ as chunkers, __confluence_models__ as models
+
+ if chunking_config is None:
+ chunking_config = {}
+ chunking_config['embedding'] = self._embedding
+ chunking_config['llm'] = self.llm
+
+ for document in documents:
+ if content_type := document.metadata.get('loader_content_type', None):
+ # apply parsing based on content type and chunk if chunker was applied to parent doc
+ yield from process_content_by_type(
+ document=document,
+ extension_source=content_type, llm=self.llm, chunking_config=chunking_config)
+ elif chunking_tool:
+ # apply default chunker from toolkit config. No parsing.
+ chunker = chunkers.get(chunking_tool)
+ yield from chunker(file_content_generator=iter([document]), config=chunking_config)
+ else:
+ # return as is if neither chunker or content typa are specified
+ yield document
+
+ def _extend_data(self, documents: Generator[Document, None, None]):
+ yield from documents
+
+ def _collect_dependencies(self, documents: Generator[Document, None, None]):
+ for document in documents:
+ dependencies = self._process_document(document)
+ yield document
+ for dep in dependencies:
+ dep.metadata[IndexerKeywords.PARENT.value] = document.metadata.get('id', None)
+ yield dep
+
+ def _content_loader(self):
+ pass
+
+ def _reduce_duplicates(
+ self,
+ documents: Generator[Any, None, None],
+ collection_suffix: str,
+ log_msg: str = "Verification of documents to index started"
+ ) -> Generator[Document, None, None]:
+ """Generic duplicate reduction logic for documents."""
+ self._log_data(log_msg, tool_name="index_documents")
+ indexed_data = self._get_indexed_data(collection_suffix)
+ indexed_keys = set(indexed_data.keys())
+ if not indexed_keys:
+ self._log_data("Vectorstore is empty, indexing all incoming documents", tool_name="index_documents")
+ yield from documents
+ return
+
+ docs_to_remove = set()
+
+ for document in documents:
+ key = self.key_fn(document)
+ if key in indexed_keys and collection_suffix == indexed_data[key]['metadata'].get('collection'):
+ if self.compare_fn(document, indexed_data[key]):
+ continue
+ yield document
+ docs_to_remove.update(self.remove_ids_fn(indexed_data, key))
+ else:
+ yield document
+
+ if docs_to_remove:
+ self._log_data(
+ f"Removing {len(docs_to_remove)} documents from vectorstore that are already indexed with different updated_on.",
+ tool_name="index_documents"
+ )
+ self.vectorstore.delete(ids=list(docs_to_remove))
+
+ def _get_indexed_data(self, collection_suffix: str):
+ raise NotImplementedError("Subclasses must implement this method")
+
+ def key_fn(self, document: Document):
+ raise NotImplementedError("Subclasses must implement this method")
+
+ def compare_fn(self, document: Document, idx):
+ raise NotImplementedError("Subclasses must implement this method")
+
+ def remove_ids_fn(self, idx_data, key: str):
+ raise NotImplementedError("Subclasses must implement this method")
+
+ def _process_documents(self, documents: List[Document]) -> Generator[Document, None, None]:
+ """
+ Process a list of base documents to extract relevant metadata for full document preparation.
+ Used for late processing of documents after we ensure that the documents have to be indexed to avoid
+ time-consuming operations for documents which might be useless.
+ This function passed to index_documents method of vector store and called after _reduce_duplicates method.
+
+ Args:
+ documents (List[Document]): The base documents to process.
+
+ Returns:
+ Generator[Document, None, None]: A generator yielding processed documents with metadata.
+ """
+ for doc in documents:
+ # Filter documents to process only those that either:
+ # - do not have a 'chunk_id' in their metadata, or
+ # - have 'chunk_id' explicitly set to 1.
+ # This prevents processing of irrelevant or duplicate chunks, improving efficiency.
+ chunk_id = doc.metadata.get("chunk_id")
+ if chunk_id is None or chunk_id == 1:
+ processed_docs = self._process_document(doc)
+ if processed_docs: # Only proceed if the list is not empty
+ for processed_doc in processed_docs:
+ # map processed document (child) to the original document (parent)
+ processed_doc.metadata[IndexerKeywords.PARENT.value] = doc.metadata.get('id', None)
+ if chunker:=self._get_dependencies_chunker(processed_doc):
+ yield from chunker(file_content_generator=iter([processed_doc]), config=self._get_dependencies_chunker_config())
+ else:
+ yield processed_doc
+
+ def remove_index(self, collection_suffix: str = ""):
+ """Cleans the indexed data in the collection."""
+ super()._clean_collection(collection_suffix=collection_suffix)
+ return (f"Collection '{collection_suffix}' has been removed from the vector store.\n"
+ f"Available collections: {self.list_collections()}")
+
+ def search_index(self,
+ query: str,
+ collection_suffix: str = "",
+ filter: dict | str = {}, cut_off: float = 0.5,
+ search_top: int = 10, reranker: dict = {},
+ full_text_search: Optional[Dict[str, Any]] = None,
+ reranking_config: Optional[Dict[str, Dict[str, Any]]] = None,
+ extended_search: Optional[List[str]] = None,
+ **kwargs):
+ """ Searches indexed documents in the vector store."""
+ # build filter on top of collection_suffix
+ filter = filter if isinstance(filter, dict) else json.loads(filter)
+ if collection_suffix:
+ filter.update({"collection": {
+ "$eq": collection_suffix.strip()
+ }})
+
+ found_docs = super().search_documents(
+ query,
+ doctype=self.doctype,
+ filter=filter,
+ cut_off=cut_off,
+ search_top=search_top,
+ reranker=reranker,
+ full_text_search=full_text_search,
+ reranking_config=reranking_config,
+ extended_search=extended_search
+ )
+ return found_docs if found_docs else f"No documents found by query '{query}' and filter '{filter}'"
+
+ def stepback_search_index(self,
+ query: str,
+ messages: List[Dict[str, Any]] = [],
+ collection_suffix: str = "",
+ filter: dict | str = {}, cut_off: float = 0.5,
+ search_top: int = 10, reranker: dict = {},
+ full_text_search: Optional[Dict[str, Any]] = None,
+ reranking_config: Optional[Dict[str, Dict[str, Any]]] = None,
+ extended_search: Optional[List[str]] = None,
+ **kwargs):
+ """ Searches indexed documents in the vector store."""
+ found_docs = super().stepback_search(
+ query,
+ messages,
+ self.doctype,
+ filter=filter,
+ cut_off=cut_off,
+ search_top=search_top,
+ full_text_search=full_text_search,
+ reranking_config=reranking_config,
+ extended_search=extended_search
+ )
+ return f"Found {len(found_docs)} documents matching the query\n{json.dumps(found_docs, indent=4)}" if found_docs else "No documents found matching the query."
+
+ def stepback_summary_index(self,
+ query: str,
+ messages: List[Dict[str, Any]] = [],
+ collection_suffix: str = "",
+ filter: dict | str = {}, cut_off: float = 0.5,
+ search_top: int = 10, reranker: dict = {},
+ full_text_search: Optional[Dict[str, Any]] = None,
+ reranking_config: Optional[Dict[str, Dict[str, Any]]] = None,
+ extended_search: Optional[List[str]] = None,
+ **kwargs):
+ """ Generates a summary of indexed documents using stepback technique."""
+ return super().stepback_summary(
+ query,
+ messages,
+ self.doctype,
+ filter=filter,
+ cut_off=cut_off,
+ search_top=search_top,
+ full_text_search=full_text_search,
+ reranking_config=reranking_config,
+ extended_search=extended_search
+ )
+
+ def get_available_tools(self):
+ """
+ Returns the standardized vector search tools (search operations only).
+ Index operations are toolkit-specific and should be added manually to each toolkit.
+
+ Returns:
+ List of tool dictionaries with name, ref, description, and args_schema
+ """
+ return [
+ {
+ "name": "index_data",
+ "mode": "index_data",
+ "ref": self.index_data,
+ "description": "Loads data to index.",
+ "args_schema": create_model(
+ "IndexData",
+ __base__=BaseIndexDataParams,
+ **self._index_tool_params() if self._index_tool_params() else {}
+ )
+ },
+ {
+ "name": "search_index",
+ "mode": "search_index",
+ "ref": self.search_index,
+ "description": self.search_index.__doc__,
+ "args_schema": BaseSearchParams
+ },
+ {
+ "name": "stepback_search_index",
+ "mode": "stepback_search_index",
+ "ref": self.stepback_search_index,
+ "description": self.stepback_search_index.__doc__,
+ "args_schema": BaseStepbackSearchParams
+ },
+ {
+ "name": "stepback_summary_index",
+ "mode": "stepback_summary_index",
+ "ref": self.stepback_summary_index,
+ "description": self.stepback_summary_index.__doc__,
+ "args_schema": BaseStepbackSearchParams
+ },
+ {
+ "name": "remove_index",
+ "mode": "remove_index",
+ "ref": self.remove_index,
+ "description": self.remove_index.__doc__,
+ "args_schema": RemoveIndexParams
+ },
+ {
+ "name": "list_collections",
+ "mode": "list_collections",
+ "ref": self.list_collections,
+ "description": self.list_collections.__doc__,
+ "args_schema": create_model("ListCollectionsParams") # No parameters
+ },
+ ]
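
The class above leaves `_base_loader`, `_process_document`, and the duplicate-detection hooks (`key_fn`, `compare_fn`, `remove_ids_fn`) to subclasses; `NonCodeIndexerToolkit`, added at the end of this diff, supplies the hook implementations. A minimal sketch of a concrete toolkit built on top of it follows; the class name, extra field, and yielded values are hypothetical:

from typing import Generator
from langchain_core.documents import Document
from pydantic import Field

from alita_sdk.tools.non_code_indexer_toolkit import NonCodeIndexerToolkit


class ExampleIndexerToolkit(NonCodeIndexerToolkit):  # illustrative subclass
    def _index_tool_params(self):
        # extra fields merged into the generated index_data args schema
        return {"query": (str, Field(description="Query used to select items to index"))}

    def _base_loader(self, **kwargs) -> Generator[Document, None, None]:
        # yield lightweight base documents carrying 'id' and 'updated_on' metadata;
        # _reduce_duplicates consults these via key_fn/compare_fn before heavy processing
        yield Document(page_content="item body", metadata={"id": "1", "updated_on": "2024-01-01"})

    def _process_document(self, document: Document) -> Generator[Document, None, None]:
        # yield dependent documents (attachments, comments, ...); each is tagged with a PARENT marker
        yield from ()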
@@ -31,13 +31,11 @@ LoaderSchema = create_model(
  BaseIndexParams = create_model(
  "BaseIndexParams",
  collection_suffix=(str, Field(description="Suffix for collection name (max 7 characters) used to separate datasets", min_length=1, max_length=7)),
- vectorstore_type=(Optional[str], Field(description="Vectorstore type (Chroma, PGVector, Elastic, etc.)", default="PGVector")),
  )

  BaseCodeIndexParams = create_model(
  "BaseCodeIndexParams",
  collection_suffix=(str, Field(description="Suffix for collection name (max 7 characters) used to separate datasets", min_length=1, max_length=7)),
- vectorstore_type=(Optional[str], Field(description="Vectorstore type (Chroma, PGVector, Elastic, etc.)", default="PGVector")),
  branch=(Optional[str], Field(description="Branch to index files from. Defaults to active branch if None.", default=None)),
  whitelist=(Optional[List[str]], Field(description="File extensions or paths to include. Defaults to all files if None.", default=None)),
  blacklist=(Optional[List[str]], Field(description="File extensions or paths to exclude. Defaults to no exclusions if None.", default=None)),
@@ -54,7 +52,6 @@ BaseSearchParams = create_model(
  collection_suffix=(Optional[str], Field(
  description="Optional suffix for collection name (max 7 characters). Leave empty to search across all datasets",
  default="", max_length=7)),
- vectorstore_type=(Optional[str], Field(description="Vectorstore type (Chroma, PGVector, Elastic, etc.)", default="PGVector")),
  filter=(Optional[dict | str], Field(
  description="Filter to apply to the search results. Can be a dictionary or a JSON string.",
  default={},
@@ -84,7 +81,6 @@ BaseStepbackSearchParams = create_model(
  "BaseStepbackSearchParams",
  query=(str, Field(description="Query text to search in the index")),
  collection_suffix=(Optional[str], Field(description="Optional suffix for collection name (max 7 characters)", default="", max_length=7)),
- vectorstore_type=(Optional[str], Field(description="Vectorstore type (Chroma, PGVector, Elastic, etc.)", default="PGVector")),
  messages=(Optional[List], Field(description="Chat messages for stepback search context", default=[])),
  filter=(Optional[dict | str], Field(
  description="Filter to apply to the search results. Can be a dictionary or a JSON string.",
@@ -369,6 +365,16 @@ class BaseVectorStoreToolApiWrapper(BaseToolApiWrapper):
  vectorstore_wrapper = self._init_vector_store()
  return vectorstore_wrapper.list_collections()

+ def _build_collection_filter(self, filter: dict | str, collection_suffix: str = "") -> dict:
+ """Builds a filter for the collection based on the provided suffix."""
+
+ filter = filter if isinstance(filter, dict) else json.loads(filter)
+ if collection_suffix:
+ filter.update({"collection": {
+ "$eq": collection_suffix.strip()
+ }})
+ return filter
+

  def search_index(self,
  query: str,
@@ -380,13 +386,7 @@ class BaseVectorStoreToolApiWrapper(BaseToolApiWrapper):
  **kwargs):
  """ Searches indexed documents in the vector store."""
  vectorstore = self._init_vector_store()
- # build filter on top of collection_suffix
- filter = filter if isinstance(filter, dict) else json.loads(filter)
- if collection_suffix:
- filter.update({"collection": {
- "$eq": collection_suffix.strip()
- }})
-
+ filter = self._build_collection_filter(filter, collection_suffix)
  found_docs = vectorstore.search_documents(
  query,
  doctype=self.doctype,
@@ -411,6 +411,8 @@ class BaseVectorStoreToolApiWrapper(BaseToolApiWrapper):
  extended_search: Optional[List[str]] = None,
  **kwargs):
  """ Searches indexed documents in the vector store."""
+
+ filter = self._build_collection_filter(filter, collection_suffix)
  vectorstore = self._init_vector_store()
  found_docs = vectorstore.stepback_search(
  query,
@@ -423,7 +425,7 @@ class BaseVectorStoreToolApiWrapper(BaseToolApiWrapper):
  reranking_config=reranking_config,
  extended_search=extended_search
  )
- return f"Found {len(found_docs)} documents matching the query\n{json.dumps(found_docs, indent=4)}" if found_docs else "No documents found matching the query."
+ return found_docs if found_docs else f"No documents found by query '{query}' and filter '{filter}'"

  def stepback_summary_index(self,
  query: str,
@@ -437,17 +439,20 @@ class BaseVectorStoreToolApiWrapper(BaseToolApiWrapper):
  **kwargs):
  """ Generates a summary of indexed documents using stepback technique."""
  vectorstore = self._init_vector_store()
- return vectorstore.stepback_summary(
- query,
- messages,
- self.doctype,
- filter=filter,
- cut_off=cut_off,
+ filter = self._build_collection_filter(filter, collection_suffix)
+
+ found_docs = vectorstore.stepback_summary(
+ query,
+ messages,
+ self.doctype,
+ filter=filter,
+ cut_off=cut_off,
  search_top=search_top,
- full_text_search=full_text_search,
- reranking_config=reranking_config,
+ full_text_search=full_text_search,
+ reranking_config=reranking_config,
  extended_search=extended_search
  )
+ return found_docs if found_docs else f"No documents found by query '{query}' and filter '{filter}'"

  def _get_vector_search_tools(self):
  """
@@ -563,12 +568,14 @@ class BaseCodeToolApiWrapper(BaseVectorStoreToolApiWrapper):

  def is_whitelisted(file_path: str) -> bool:
  if whitelist:
- return any(fnmatch.fnmatch(file_path, pattern) for pattern in whitelist)
+ return (any(fnmatch.fnmatch(file_path, pattern) for pattern in whitelist)
+ or any(file_path.endswith(f'.{pattern}') for pattern in whitelist))
  return True

  def is_blacklisted(file_path: str) -> bool:
  if blacklist:
- return any(fnmatch.fnmatch(file_path, pattern) for pattern in blacklist)
+ return (any(fnmatch.fnmatch(file_path, pattern) for pattern in blacklist)
+ or any(file_path.endswith(f'.{pattern}') for pattern in blacklist))
  return False

  def file_content_generator():
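
The relaxed whitelist/blacklist matching above accepts bare file extensions in addition to glob patterns. Illustrative patterns and path (not taken from the package):

import fnmatch

whitelist = ["*.py", "md"]          # "md" is a bare extension, not a glob
file_path = "docs/readme.md"

fnmatch.fnmatch(file_path, "md")    # False -> previously this file was skipped
file_path.endswith(".md")           # True  -> now matched by the added endswith() check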
@@ -0,0 +1,23 @@ (new file: defines NonCodeIndexerToolkit, imported above as alita_sdk.tools.non_code_indexer_toolkit)
+ from langchain_core.documents import Document
+
+ from alita_sdk.runtime.utils.utils import IndexerKeywords
+ from alita_sdk.tools.base_indexer_toolkit import BaseIndexerToolkit
+
+
+ class NonCodeIndexerToolkit(BaseIndexerToolkit):
+ def _get_indexed_data(self, collection_suffix: str):
+ return self.vector_adapter.get_indexed_data(self, collection_suffix)
+
+ def key_fn(self, document: Document):
+ return document.metadata.get('id')
+
+ def compare_fn(self, document: Document, idx_data):
+ return (document.metadata.get('updated_on')
+ and idx_data['metadata'].get('updated_on')
+ and document.metadata.get('updated_on') == idx_data['metadata'].get('updated_on'))
+
+ def remove_ids_fn(self, idx_data, key: str):
+ return (idx_data[key]['all_chunks'] +
+ [idx_data[dep_id]['id'] for dep_id in idx_data[key][IndexerKeywords.DEPENDENT_DOCS.value]] +
+ [chunk_db_id for dep_id in idx_data[key][IndexerKeywords.DEPENDENT_DOCS.value] for chunk_db_id in
+ idx_data[dep_id]['all_chunks']])
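
The three hooks above operate on the mapping returned by `_get_indexed_data`. A sketch of a plausible shape for that mapping; the keys, ids, and the string value assumed for `IndexerKeywords.DEPENDENT_DOCS.value` are hypothetical:

# Hypothetical indexed-data mapping, keyed by key_fn(document) == metadata['id']
idx_data = {
    "1234": {
        "id": "db-row-1",
        "metadata": {"updated_on": "2024-01-01", "collection": "ado"},
        "all_chunks": ["chunk-1", "chunk-2"],
        "dependent_docs": ["a1b2c3d4"],  # assumed value of IndexerKeywords.DEPENDENT_DOCS.value
    },
    "a1b2c3d4": {
        "id": "db-row-2",
        "metadata": {"updated_on": ""},
        "all_chunks": ["chunk-3"],
        "dependent_docs": [],
    },
}

# When compare_fn sees a changed updated_on for "1234", remove_ids_fn collects the
# work item's own chunks, its dependents' row ids, and the dependents' chunks:
# ["chunk-1", "chunk-2"] + ["db-row-2"] + ["chunk-3"]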