alita-sdk 0.3.249__py3-none-any.whl → 0.3.250__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,624 @@
1
+ import json
2
+ import math
3
+ import types
4
+ from typing import Any, Optional, List, Dict, Callable, Generator
5
+
6
+ from langchain_core.documents import Document
7
+ from pydantic import BaseModel, model_validator, Field
8
+ from ..langchain.tools.vector import VectorAdapter
9
+ from langchain_core.messages import HumanMessage
10
+ from alita_sdk.tools.elitea_base import BaseToolApiWrapper
11
+ from alita_sdk.tools.vector_adapters.VectorStoreAdapter import VectorStoreAdapterFactory
12
+ from logging import getLogger
13
+
14
+ from ..utils.logging import dispatch_custom_event
15
+ from ..utils.utils import IndexerKeywords
16
+
17
+ logger = getLogger(__name__)
18
+
19
+ class IndexDocumentsModel(BaseModel):
20
+ documents: Any = Field(description="Generator of documents to index")
21
+
22
+ class SearchDocumentsModel(BaseModel):
23
+ query: str = Field(description="Search query")
24
+ doctype: str = Field(description="Document type")
25
+ filter: Optional[dict | str] = Field(
26
+ description='Filter for metadata of documents. Use JSON format for complex filters.',
27
+ default=None)
28
+ search_top: Optional[int] = Field(description="Number of search results", default=10)
29
+ cut_off: Optional[float] = Field(description="Cut off value for search results", default=0.5)
30
+ full_text_search: Optional[Dict[str, Any]] = Field(
31
+ description="""Full text search configuration. Example:
32
+ {
33
+ "enabled": true,
34
+ "weight": 0.3,
35
+ "fields": ["content", "title"],
36
+ "language": "english"
37
+ }""",
38
+ default=None
39
+ )
40
+ reranking_config: Optional[Dict[str, Dict[str, Any]]] = Field(
41
+ description="""Reranking configuration. Example:
42
+ {
43
+ "field_name": {
44
+ "weight": 1.0,
45
+ "rules": {
46
+ "contains": "keyword",
47
+ "priority": "value",
48
+ "sort": "desc"
49
+ }
50
+ }
51
+ }""",
52
+ default=None
53
+ )
54
+ extended_search: Optional[List[str]] = Field(
55
+ description="List of chunk types to search for (title, summary, propositions, keywords, documents)",
56
+ default=None
57
+ )
58
+
59
+ class StepBackSearchDocumentsModel(BaseModel):
60
+ query: str = Field(description="Search query")
61
+ doctype: str = Field(description="Document type")
62
+ messages: Optional[list] = Field(description="Conversation history", default=[])
63
+ filter: Optional[dict] = Field(description='Filter for metadata of documents. Use JSON format for complex filters.', default=None)
64
+ search_top: Optional[int] = Field(description="Number of search results", default=10)
65
+ cut_off: Optional[float] = Field(description="Cut off value for search results", default=0.5)
66
+ full_text_search: Optional[Dict[str, Any]] = Field(
67
+ description="""Full text search configuration. Example:
68
+ {
69
+ "enabled": true,
70
+ "weight": 0.3,
71
+ "fields": ["content", "title"],
72
+ "language": "english"
73
+ }""",
74
+ default=None
75
+ )
76
+ reranking_config: Optional[Dict[str, Dict[str, Any]]] = Field(
77
+ description="""Reranking configuration. Example:
78
+ {
79
+ "field_name": {
80
+ "weight": 1.0,
81
+ "rules": {
82
+ "contains": "keyword",
83
+ "priority": "value",
84
+ "sort": "desc"
85
+ }
86
+ }
87
+ }""",
88
+ default=None
89
+ )
90
+ extended_search: Optional[List[str]] = Field(
91
+ description="List of chunk types to search for (title, summary, propositions, keywords, documents)",
92
+ default=None
93
+ )
94
+
95
+ STEPBACK_PROMPT = """Your task is to convert provided question into a more generic question that will be used for similarity search.
96
+ Remove all unimportant words and question words, but keep all names, dates and acronyms as in the original question.
97
+
98
+ <input>
99
+ {input}
100
+ </input>
101
+
102
+ Output:
103
+ """
104
+
105
+ GET_ANSWER_PROMPT = """<search_results>
106
+ {search_results}
107
+ </search_results>
108
+
109
+ <conversation_history>
110
+ {messages}
111
+ </conversation_history>
112
+
113
+ Please answer the question based on the provided search results.
114
+ The provided information has already been processed and is available in the context as a list of possibly relevant pieces of the documents.
115
+ Use only the provided information. Do not make up an answer.
116
+ If you have no answer and cannot derive it from the context, reply "I have no answer".
117
+ <question>
118
+ {input}
119
+ </question>
120
+ ## Answer
121
+ Add <ANSWER> here
122
+
123
+ ## Score
124
+ Score the answer from 0 to 100, where 0 is not relevant and 100 is very relevant.
125
+
126
+ ## Citations
127
+ - source (score)
128
+ - source (score)
129
+ Make sure to provide unique source for each citation.
130
+
131
+ ## Explanation
132
+ How did you come up with the answer?
133
+ """
134
+
135
+ class VectorStoreWrapperBase(BaseToolApiWrapper):
136
+ llm: Any
137
+ embedding_model: str
138
+ embedding_model_params: dict
139
+ vectorstore_type: str
140
+ vectorstore_params: dict
141
+ max_docs_per_add: int = 100
142
+ dataset: str = None
143
+ embedding: Any = None
144
+ vectorstore: Any = None
145
+ # Review usage of old adapter
146
+ vectoradapter: Any = None
147
+ pg_helper: Any = None
148
+ embeddings: Any = None
149
+ # New adapter for vector database operations
150
+ vector_adapter: Any = None
151
+
152
+ @model_validator(mode='before')
153
+ @classmethod
154
+ def validate_toolkit(cls, values):
155
+ from ..langchain.interfaces.llm_processor import get_embeddings, get_vectorstore
156
+ logger.debug(f"Validating toolkit: {values}")
157
+ if not values.get('vectorstore_type'):
158
+ raise ValueError("Vectorstore type is required.")
159
+ if not values.get('embedding_model'):
160
+ raise ValueError("Embedding model is required.")
161
+ if not values.get('vectorstore_params'):
162
+ raise ValueError("Vectorstore parameters are required.")
163
+ if not values.get('embedding_model_params'):
164
+ raise ValueError("Embedding model parameters are required.")
165
+ values["dataset"] = values.get('vectorstore_params').get('collection_name')
166
+ if not values["dataset"]:
167
+ raise ValueError("Collection name is required.")
168
+ if not values.get('embeddings'):
169
+ values['embeddings'] = get_embeddings(values['embedding_model'], values['embedding_model_params'])
170
+ values['vectorstore'] = get_vectorstore(values['vectorstore_type'], values['vectorstore_params'], embedding_func=values['embeddings'])
171
+ values['vectoradapter'] = VectorAdapter(
172
+ vectorstore=values['vectorstore'],
173
+ embeddings=values['embeddings'],
174
+ quota_params=None,
175
+ )
176
+ # Initialize the new vector adapter
177
+ values['vector_adapter'] = VectorStoreAdapterFactory.create_adapter(values['vectorstore_type'])
178
+ logger.debug(f"Vectorstore wrapper initialized: {values}")
179
+ return values
180
+
181
+ def _init_pg_helper(self, language='english'):
182
+ """Initialize PGVector helper if needed and not already initialized"""
183
+ if self.pg_helper is None and hasattr(self.vectorstore, 'connection_string') and hasattr(self.vectorstore, 'collection_name'):
184
+ try:
185
+ from .pgvector_search import PGVectorSearch
186
+ self.pg_helper = PGVectorSearch(
187
+ self.vectorstore.connection_string,
188
+ self.vectorstore.collection_name,
189
+ language=language
190
+ )
191
+ except ImportError:
192
+ logger.warning("PGVectorSearch not available - full-text search will be limited")
193
+ except Exception as e:
194
+ logger.error(f"Failed to initialize PGVectorSearch: {str(e)}")
195
+
196
+ def list_collections(self) -> List[str]:
197
+ """List all collections in the vectorstore."""
198
+
199
+ return self.vector_adapter.list_collections(self)
200
+
201
+ def _clean_collection(self, collection_suffix: str = ''):
202
+ """
203
+ Clean the vectorstore collection by deleting all indexed data.
204
+ """
205
+ self._log_data(
206
+ f"Cleaning collection '{self.dataset}'",
207
+ tool_name="_clean_collection"
208
+ )
209
+ self.vector_adapter.clean_collection(self, collection_suffix)
210
+ self._log_data(
211
+ f"Collection '{self.dataset}' has been cleaned. ",
212
+ tool_name="_clean_collection"
213
+ )
214
+
215
+ def _add_to_collection(self, entry_id, new_collection_value):
216
+ """Add a new collection name to the `collection` key in the `metadata` column."""
217
+ self.vector_adapter.add_to_collection(self, entry_id, new_collection_value)
218
+
219
+ def index_documents(self, documents: Generator[Document, None, None], collection_suffix: str, progress_step: int = 20, clean_index: bool = True):
220
+ """ Index documents in the vectorstore.
221
+
222
+ Args:
223
+ documents (Any): Generator or list of documents to index.
224
+ progress_step (int): Step for progress reporting, default is 20.
225
+ clean_index (bool): If True, clean the index before re-indexing all documents.
226
+ """
227
+ if clean_index:
228
+ self._clean_index(collection_suffix)
229
+
230
+ return self._save_index(list(documents), collection_suffix, progress_step)
231
+
232
+ def _clean_index(self, collection_suffix: str):
233
+ logger.info("Cleaning index before re-indexing all documents.")
234
+ self._log_data("Cleaning index before re-indexing all documents. Previous index will be removed", tool_name="index_documents")
235
+ try:
236
+ self._clean_collection(collection_suffix)
237
+ self.vectoradapter.persist()
238
+ self.vectoradapter.vacuum()
239
+ self._log_data("Previous index has been removed",
240
+ tool_name="index_documents")
241
+ except Exception as e:
242
+ logger.warning(f"Failed to clean index: {str(e)}. Continuing with re-indexing.")
243
+
244
+ def _save_index(self, documents: list[Document], collection_suffix: Optional[str] = None, progress_step: int = 20):
245
+ from ..langchain.interfaces.llm_processor import add_documents
246
+ #
247
+ for doc in documents:
248
+ if 'id' not in doc.metadata or 'updated_on' not in doc.metadata:
249
+ logger.warning(f"Document is missing required metadata field 'id' or 'updated_on': {doc.metadata}")
250
+
251
+ logger.debug(f"Indexing documents: {documents}")
252
+ logger.debug(self.vectoradapter)
253
+
254
+ # if collection_suffix is provided, add it to metadata of each document
255
+ if collection_suffix:
256
+ for doc in documents:
257
+ if not doc.metadata.get('collection'):
258
+ doc.metadata['collection'] = collection_suffix
259
+ else:
260
+ doc.metadata['collection'] += f";{collection_suffix}"
261
+
262
+ total_docs = len(documents)
263
+ documents_count = 0
264
+ _documents = []
265
+
266
+ # set default progress step to 20 if out of 0...100 or None
267
+ progress_step = 20 if progress_step not in range(0, 100) else progress_step
268
+ next_progress_point = progress_step
269
+ for document in documents:
270
+ documents_count += 1
271
+ # logger.debug(f"Indexing document: {document}")
272
+ try:
273
+ _documents.append(document)
274
+ if len(_documents) >= self.max_docs_per_add:
275
+ add_documents(vectorstore=self.vectorstore, documents=_documents)
276
+ _documents = []
277
+
278
+ percent = math.floor((documents_count / total_docs) * 100)
279
+ if percent >= next_progress_point:
280
+ msg = f"Indexing progress: {percent}%. Processed {documents_count} of {total_docs} documents."
281
+ logger.debug(msg)
282
+ self._log_data(msg)
283
+ next_progress_point += progress_step
284
+ except Exception:
285
+ from traceback import format_exc
286
+ logger.error(f"Error: {format_exc()}")
287
+ return {"status": "error", "message": f"Error: {format_exc()}"}
288
+ if _documents:
289
+ add_documents(vectorstore=self.vectorstore, documents=_documents)
290
+ return {"status": "ok", "message": f"successfully indexed {documents_count} documents"}
291
+
292
+ def search_documents(self, query:str, doctype: str = 'code',
293
+ filter:dict|str={}, cut_off: float=0.5,
294
+ search_top:int=10, full_text_search: Optional[Dict[str, Any]] = None,
295
+ extended_search: Optional[List[str]] = None,
296
+ reranker: dict = {}, reranking_config: Optional[Dict[str, Dict[str, Any]]] = None
297
+ ):
298
+ """Enhanced search documents method using JSON configurations for full-text search and reranking"""
299
+ from alita_sdk.tools.code.loaders.codesearcher import search_format as code_format
300
+
301
+ if not filter:
302
+ filter = None
303
+ else:
304
+ if isinstance(filter, str):
305
+ filter = json.loads(filter)
306
+
307
+ # Extended search implementation
308
+ if extended_search:
309
+ # Track unique documents by source and chunk_id
310
+ unique_docs = {}
311
+ chunk_type_scores = {} # Store scores by document identifier
312
+ # Create initial set of results from documents
313
+ if filter is None:
314
+ document_filter = {"chunk_type": {"$eq": "document"}}
315
+ else:
316
+ document_filter = {
317
+ "$and": [
318
+ filter,
319
+ {"chunk_type": {"$eq": "document"}}
320
+ ]
321
+ }
322
+
323
+ try:
324
+ document_items = self.vectorstore.similarity_search_with_score(
325
+ query, filter=document_filter, k=search_top
326
+ )
327
+ # Add document results to unique docs
328
+ vector_items = document_items
329
+ for doc, score in document_items:
330
+ source = doc.metadata.get('source')
331
+ chunk_id = doc.metadata.get('chunk_id')
332
+ doc_id = f"{source}_{chunk_id}" if source and chunk_id else str(doc.metadata.get('id', id(doc)))
333
+
334
+ if doc_id not in unique_docs or score > chunk_type_scores.get(doc_id, 0):
335
+ unique_docs[doc_id] = doc
336
+ chunk_type_scores[doc_id] = score
337
+ except Exception as e:
338
+ logger.warning(f"Error searching for document chunks: {str(e)}")
339
+
340
+ # First search for specified chunk types (title, summary, propositions, keywords)
341
+ valid_chunk_types = ["title", "summary", "propositions", "keywords"]
342
+ chunk_types_to_search = [ct for ct in extended_search if ct in valid_chunk_types]
343
+
344
+ # Search for each chunk type separately
345
+ for chunk_type in chunk_types_to_search:
346
+ if filter is None:
347
+ chunk_filter = {"chunk_type": {"$eq": chunk_type}}
348
+ else:
349
+ chunk_filter = {
350
+ "$and": [
351
+ filter,
352
+ {"chunk_type": {"$eq": chunk_type}}
353
+ ]
354
+ }
355
+
356
+ try:
357
+ chunk_items = self.vectorstore.similarity_search_with_score(
358
+ query, filter=chunk_filter, k=search_top
359
+ )
360
+
361
+ logger.debug(f"Chunk items for {chunk_type}: {chunk_items[0]}")
362
+
363
+ for doc, score in chunk_items:
364
+ # Create unique identifier for document
365
+ source = doc.metadata.get('source')
366
+ chunk_id = doc.metadata.get('chunk_id')
367
+ doc_id = f"{source}_{chunk_id}" if source and chunk_id else str(doc.metadata.get('id', id(doc)))
368
+
369
+ # Store document and its score
370
+ if doc_id not in unique_docs:
371
+ unique_docs[doc_id] = doc
372
+ chunk_type_scores[doc_id] = score
373
+ # Create a filter with proper operators
374
+ doc_filter_parts = [
375
+ {"source": {"$eq": source}},
376
+ {"chunk_id": {"$eq": chunk_id}},
377
+ {"chunk_type": {"$eq": "document"}}
378
+ ]
379
+
380
+ if filter is not None:
381
+ doc_filter = {
382
+ "$and": [filter] + doc_filter_parts
383
+ }
384
+ else:
385
+ doc_filter = {
386
+ "$and": doc_filter_parts
387
+ }
388
+
389
+ try:
390
+ fetch_items = self.vectorstore.similarity_search_with_score(
391
+ query, filter=doc_filter, k=1
392
+ )
393
+ if fetch_items:
394
+ vector_items.append(fetch_items[0])
395
+
396
+ except Exception as e:
397
+ logger.warning(f"Error retrieving document chunk for {source}_{chunk_id}: {str(e)}")
398
+ except Exception as e:
399
+ logger.warning(f"Error searching for chunk type {chunk_type}: {str(e)}")
400
+
401
+ else:
402
+ # Default search behavior (unchanged)
403
+ max_search_results = 30 if search_top * 3 > 30 else search_top * 3
404
+ vector_items = self.vectorstore.similarity_search_with_score(
405
+ query, filter=filter, k=max_search_results
406
+ )
407
+
408
+ # Initialize document map for tracking by ID
409
+ doc_map = {
410
+ f"{doc.metadata.get('id', f'idx_{i}')}_{doc.metadata['chunk_id']}"
411
+ if 'chunk_id' in doc.metadata
412
+ else doc.metadata.get('id', f"idx_{i}"): (doc, score)
413
+ for i, (doc, score) in enumerate(vector_items)
414
+ }
415
+
416
+ # Process full-text search if configured
417
+ if full_text_search and full_text_search.get('enabled') and full_text_search.get('fields'):
418
+ language = full_text_search.get('language', 'english')
419
+ self._init_pg_helper(language)
420
+ if self.pg_helper:
421
+ vector_weight = 1.0 # Default vector weight
422
+ text_weight = full_text_search.get('weight', 0.3)
423
+
424
+ # Query each specified field
425
+ for field_name in full_text_search.get('fields', []):
426
+ try:
427
+ text_results = self.pg_helper.full_text_search(field_name, query)
428
+
429
+ # Combine text search results with vector results
430
+ for result in text_results:
431
+ doc_id = result['id']
432
+ text_score = result['text_score']
433
+
434
+ if doc_id in doc_map:
435
+ # Document exists in vector results, combine scores
436
+ doc, vector_score = doc_map[doc_id]
437
+ combined_score = (vector_score * vector_weight) + (text_score * text_weight)
438
+ doc_map[doc_id] = (doc, combined_score)
439
+ else:
440
+ # Document is new from text search, fetch and add if possible
441
+ doc_data = self.pg_helper.get_documents_by_ids([doc_id]).get(doc_id)
442
+ if doc_data:
443
+ from langchain_core.documents import Document
444
+ doc = Document(
445
+ page_content=doc_data.get('document', ''),
446
+ metadata=doc_data.get('cmetadata', {})
447
+ )
448
+ # Use weighted text score for new documents
449
+ doc_map[doc_id] = (doc, text_score * text_weight)
450
+ except Exception as e:
451
+ logger.error(f"Full-text search error on field {field_name}: {str(e)}")
452
+
453
+ # Convert the document map back to a list
454
+ combined_items = list(doc_map.values())
455
+
456
+ # Apply reranking rules
457
+ if reranking_config:
458
+ combined_items = self._apply_reranking(combined_items, reranking_config)
459
+ elif reranker: # Fallback to legacy reranker parameter
460
+ combined_items = self._apply_reranking(combined_items, reranker)
461
+
462
+ # Apply cutoff filter
463
+ if cut_off:
464
+ combined_items = [item for item in combined_items if abs(item[1]) >= cut_off]
465
+
466
+ # Sort by score and limit results
467
+ # DISABLED: for chroma we want ascending order (lower score is better), for others descending
468
+ # combined_items.sort(key=lambda x: x[1], reverse= self.vectorstore_type.lower() != 'chroma')
469
+ combined_items = combined_items[:search_top]
470
+
471
+ # Format output based on doctype
472
+ if doctype == 'code':
473
+ return code_format(combined_items)
474
+ else:
475
+ response = []
476
+ for doc, score in combined_items:
477
+ response.append({
478
+ 'page_content': doc.page_content,
479
+ 'metadata': doc.metadata,
480
+ 'score': score
481
+ })
482
+ return response
483
+
484
+ def _apply_reranking(self, items, reranker):
485
+ """Apply reranking rules to search results"""
486
+ if not items:
487
+ return items
488
+
489
+ # Create a copy of items with mutable scores for reranking
490
+ reranked_items = [(doc, score) for doc, score in items]
491
+
492
+ for field_name, config in reranker.items():
493
+ weight = config.get("weight", 1.0)
494
+ rules = config.get("rules", {})
495
+
496
+ for i, (doc, score) in enumerate(reranked_items):
497
+ metadata = doc.metadata
498
+ field_value = metadata.get(field_name)
499
+
500
+ if field_value is not None:
501
+ # Apply rules-based reranking
502
+ for rule_type, rule_value in rules.items():
503
+ if rule_type == "contains" and isinstance(rule_value, str) and isinstance(field_value, str):
504
+ if rule_value.lower() in field_value.lower():
505
+ # Boost score if field contains the rule value
506
+ reranked_items[i] = (doc, score * (1 + weight))
507
+
508
+ elif rule_type == "priority":
509
+ # Apply priority rule based on exact match
510
+ if str(field_value).lower() == str(rule_value).lower():
511
+ reranked_items[i] = (doc, score * (1 + weight))
512
+
513
+ # Handle sort rules after individual score adjustments
514
+ for field_name, config in reranker.items():
515
+ rules = config.get("rules", {})
516
+ if "sort" in rules:
517
+ sort_direction = rules["sort"]
518
+ # Assuming sort can be "asc" or "desc"
519
+ reverse_sort = sort_direction.lower() == "desc"
520
+
521
+ # Sort based on the specified field
522
+ reranked_items.sort(
523
+ key=lambda x: (x[0].metadata.get(field_name, None) is not None,
524
+ x[0].metadata.get(field_name, ""),
525
+ x[1]),
526
+ reverse=reverse_sort
527
+ )
528
+
529
+ # Re-sort by score if no sort rules were applied
530
+ if not any("sort" in config.get("rules", {}) for config in reranker.values()):
531
+ reranked_items.sort(key=lambda x: x[1], reverse=True)
532
+
533
+ return reranked_items
534
+
535
+ def stepback_search(self, query:str, messages: list, doctype: str = 'code',
536
+ filter:dict={}, cut_off: float=0.5, search_top:int=10,
537
+ full_text_search: Optional[Dict[str, Any]] = None,
538
+ reranking_config: Optional[Dict[str, Dict[str, Any]]] = None,
539
+ extended_search: Optional[List[str]] = None):
540
+ """Enhanced stepback search using JSON configs for full-text search and reranking"""
541
+ result = self.llm.invoke([
542
+ HumanMessage(
543
+ content=[
544
+ {
545
+ "type": "text",
546
+ "text": STEPBACK_PROMPT.format(input=query, messages=messages)
547
+ }
548
+ ]
549
+ )
550
+ ])
551
+ search_results = self.search_documents(
552
+ result.content, doctype, filter, cut_off, search_top,
553
+ full_text_search=full_text_search,
554
+ reranking_config=reranking_config,
555
+ extended_search=extended_search
556
+ )
557
+ return search_results
558
+
559
+ def stepback_summary(self, query:str, messages: list, doctype: str = 'code',
560
+ filter:dict={}, cut_off: float=0.5, search_top:int=10,
561
+ full_text_search: Optional[Dict[str, Any]] = None,
562
+ reranking_config: Optional[Dict[str, Dict[str, Any]]] = None,
563
+ extended_search: Optional[List[str]] = None):
564
+ """Enhanced stepback summary using JSON configs for full-text search and reranking"""
565
+ search_results = self.stepback_search(
566
+ query, messages, doctype, filter, cut_off, search_top,
567
+ full_text_search=full_text_search,
568
+ reranking_config=reranking_config,
569
+ extended_search=extended_search
570
+ )
571
+ result = self.llm.invoke([
572
+ HumanMessage(
573
+ content=[
574
+ {
575
+ "type": "text",
576
+ "text": GET_ANSWER_PROMPT.format(input=query, search_results=search_results, messages=messages)
577
+ }
578
+ ]
579
+ )
580
+ ])
581
+ return result.content
582
+
583
+ def _log_data(self, message: str, tool_name: str = "index_data"):
584
+ """Log data and dispatch custom event for indexing progress"""
585
+
586
+ try:
587
+ dispatch_custom_event(
588
+ name="thinking_step",
589
+ data={
590
+ "message": message,
591
+ "tool_name": tool_name,
592
+ "toolkit": "vectorstore",
593
+ },
594
+ )
595
+ except Exception as e:
596
+ logger.warning(f"Failed to dispatch progress event: {str(e)}")
597
+
598
+ def get_available_tools(self):
599
+ return [
600
+ {
601
+ "ref": self.index_documents,
602
+ "name": "indexDocuments",
603
+ "description": "Index documents in the vectorstore",
604
+ "args_schema": IndexDocumentsModel
605
+ },
606
+ {
607
+ "ref": self.search_documents,
608
+ "name": "searchDocuments",
609
+ "description": "Search documents in the vectorstore",
610
+ "args_schema": SearchDocumentsModel
611
+ },
612
+ {
613
+ "ref": self.stepback_search,
614
+ "name": "stepbackSearch",
615
+ "description": "Search in the vectorstore using stepback technique",
616
+ "args_schema": StepBackSearchDocumentsModel
617
+ },
618
+ {
619
+ "ref": self.stepback_summary,
620
+ "name": "stepbackSummary",
621
+ "description": "Get summary of search results using stepback technique",
622
+ "args_schema": StepBackSearchDocumentsModel
623
+ }
624
+ ]
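
For orientation, a minimal usage sketch of the new VectorStoreWrapperBase (illustrative only, not part of the package): the llm value, the PGVector connection string and the filter values are assumptions, and constructing the wrapper requires a reachable vector store.

    from alita_sdk.runtime.tools.vectorstore_base import VectorStoreWrapperBase

    wrapper = VectorStoreWrapperBase(
        llm=None,  # supply a chat model to use the stepback tools (assumed LangChain-compatible)
        embedding_model="HuggingFaceEmbeddings",
        embedding_model_params={"model_name": "sentence-transformers/all-MiniLM-L6-v2"},
        vectorstore_type="PGVector",
        vectorstore_params={
            "collection_name": "my_dataset",  # required; becomes `dataset` in the validator
            "connection_string": "postgresql+psycopg://user:pass@host:5432/db",  # illustrative key and value
        },
    )

    # plain similarity search plus the extended chunk-type pass added in this module
    hits = wrapper.search_documents(
        query="how are attachments indexed?",
        doctype="document",
        filter={"collection": {"$eq": "docs"}},
        search_top=5,
        extended_search=["title", "summary"],
    )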
@@ -3,7 +3,6 @@ import logging
3
3
  import urllib.parse
4
4
  from typing import Dict, List, Generator, Optional
5
5
 
6
- from alita_sdk.tools.elitea_base import BaseVectorStoreToolApiWrapper, extend_with_vector_tools
7
6
  from azure.devops.connection import Connection
8
7
  from azure.devops.v7_1.core import CoreClient
9
8
  from azure.devops.v7_1.wiki import WikiClient
@@ -15,6 +14,8 @@ from pydantic import create_model, PrivateAttr, SecretStr
15
14
  from pydantic import model_validator
16
15
  from pydantic.fields import Field
17
16
 
17
+ from alita_sdk.tools.non_code_indexer_toolkit import NonCodeIndexerToolkit
18
+
18
19
  try:
19
20
  from alita_sdk.runtime.langchain.interfaces.llm_processor import get_embeddings
20
21
  except ImportError:
@@ -94,7 +95,7 @@ ADOUnlinkWorkItemsFromWikiPage = create_model(
94
95
  page_name=(str, Field(description="Wiki page path to unlink the work items from", examples=["/TargetPage"]))
95
96
  )
96
97
 
97
- class AzureDevOpsApiWrapper(BaseVectorStoreToolApiWrapper):
98
+ class AzureDevOpsApiWrapper(NonCodeIndexerToolkit):
98
99
  # TODO use ado_configuration instead of organization_url, project and token
99
100
  organization_url: str
100
101
  project: str
@@ -125,7 +126,7 @@ class AzureDevOpsApiWrapper(BaseVectorStoreToolApiWrapper):
125
126
  except Exception as e:
126
127
  return ImportError(f"Failed to connect to Azure DevOps: {e}")
127
128
 
128
- return values
129
+ return super().validate_toolkit(values)
129
130
 
130
131
  def _parse_work_items(self, work_items, fields=None):
131
132
  """Parse work items dynamically based on the fields requested."""
@@ -522,14 +523,14 @@ class AzureDevOpsApiWrapper(BaseVectorStoreToolApiWrapper):
522
523
  'reason': wi.fields.get('System.Reason', ''),
523
524
  'iteration': wi.fields.get('System.IterationPath', ''),
524
525
  'updated_on': wi.fields.get('System.ChangedDate', ''),
525
- 'attachment_ids': [rel.url.split('/')[-1] for rel in wi.relations or [] if rel.rel == 'AttachedFile']
526
+ 'attachment_ids': {rel.url.split('/')[-1]:rel.attributes.get('name', '') for rel in wi.relations or [] if rel.rel == 'AttachedFile'}
526
527
  })
527
528
 
528
529
  def _process_document(self, document: Document) -> Generator[Document, None, None]:
529
- for attachment_id in document.metadata.get('attachment_ids', []):
530
+ for attachment_id, file_name in document.metadata.get('attachment_ids', {}).items():
530
531
  content_generator = self._client.get_attachment_content(id=attachment_id, download=True)
531
- content = ''.join(str(item) for item in content_generator)
532
- yield Document(page_content=content, metadata={'id': attachment_id})
532
+ content = b"".join(x for x in content_generator)
533
+ yield Document(page_content="", metadata={'id': attachment_id, 'loader_content_type': file_name, 'loader_content': content})
533
534
 
534
535
  def _index_tool_params(self):
535
536
  """Return the parameters for indexing data."""
@@ -537,10 +538,9 @@ class AzureDevOpsApiWrapper(BaseVectorStoreToolApiWrapper):
537
538
  "wiql": (str, Field(description="WIQL (Work Item Query Language) query string to select and filter Azure DevOps work items."))
538
539
  }
539
540
 
540
- @extend_with_vector_tools
541
541
  def get_available_tools(self):
542
542
  """Return a list of available tools."""
543
- return [
543
+ return super().get_available_tools() + [
544
544
  {
545
545
  "name": "search_work_items",
546
546
  "description": self.search_work_items.__doc__,
@@ -0,0 +1,426 @@
1
+ import json
2
+ import logging
3
+ from typing import Any, Optional, List, Literal, Dict, Generator
4
+
5
+ from langchain_core.documents import Document
6
+ from pydantic import create_model, Field, SecretStr
7
+
8
+ # from alita_sdk.runtime.langchain.interfaces.llm_processor import get_embeddings
9
+ from .chunkers import markdown_chunker
10
+ from .utils.content_parser import process_content_by_type
11
+ from .vector_adapters.VectorStoreAdapter import VectorStoreAdapterFactory
12
+ from ..runtime.tools.vectorstore_base import VectorStoreWrapperBase
13
+ from ..runtime.utils.utils import IndexerKeywords
14
+
15
+ logger = logging.getLogger(__name__)
16
+
17
+ # Base Vector Store Schema Models
18
+ BaseIndexParams = create_model(
19
+ "BaseIndexParams",
20
+ collection_suffix=(str, Field(description="Suffix for collection name (max 7 characters) used to separate datasets", min_length=1, max_length=7)),
21
+ vectorstore_type=(Optional[str], Field(description="Vectorstore type (Chroma, PGVector, Elastic, etc.)", default="PGVector")),
22
+ )
23
+
24
+ RemoveIndexParams = create_model(
25
+ "RemoveIndexParams",
26
+ collection_suffix=(Optional[str], Field(description="Optional suffix for collection name (max 7 characters)", default="", max_length=7)),
27
+ )
28
+
29
+ BaseSearchParams = create_model(
30
+ "BaseSearchParams",
31
+ query=(str, Field(description="Query text to search in the index")),
32
+ collection_suffix=(Optional[str], Field(
33
+ description="Optional suffix for collection name (max 7 characters). Leave empty to search across all datasets",
34
+ default="", max_length=7)),
35
+ vectorstore_type=(Optional[str], Field(description="Vectorstore type (Chroma, PGVector, Elastic, etc.)", default="PGVector")),
36
+ filter=(Optional[dict | str], Field(
37
+ description="Filter to apply to the search results. Can be a dictionary or a JSON string.",
38
+ default={},
39
+ examples=["{\"key\": \"value\"}", "{\"status\": \"active\"}"]
40
+ )),
41
+ cut_off=(Optional[float], Field(description="Cut-off score for search results", default=0.5)),
42
+ search_top=(Optional[int], Field(description="Number of top results to return", default=10)),
43
+ full_text_search=(Optional[Dict[str, Any]], Field(
44
+ description="Full text search parameters. Can be a dictionary with search options.",
45
+ default=None
46
+ )),
47
+ extended_search=(Optional[List[str]], Field(
48
+ description="List of additional fields to include in the search results.",
49
+ default=None
50
+ )),
51
+ reranker=(Optional[dict], Field(
52
+ description="Reranker configuration. Can be a dictionary with reranking parameters.",
53
+ default={}
54
+ )),
55
+ reranking_config=(Optional[Dict[str, Dict[str, Any]]], Field(
56
+ description="Reranking configuration. Can be a dictionary with reranking settings.",
57
+ default=None
58
+ )),
59
+ )
60
+
61
+ BaseStepbackSearchParams = create_model(
62
+ "BaseStepbackSearchParams",
63
+ query=(str, Field(description="Query text to search in the index")),
64
+ collection_suffix=(Optional[str], Field(description="Optional suffix for collection name (max 7 characters)", default="", max_length=7)),
65
+ vectorstore_type=(Optional[str], Field(description="Vectorstore type (Chroma, PGVector, Elastic, etc.)", default="PGVector")),
66
+ messages=(Optional[List], Field(description="Chat messages for stepback search context", default=[])),
67
+ filter=(Optional[dict | str], Field(
68
+ description="Filter to apply to the search results. Can be a dictionary or a JSON string.",
69
+ default={},
70
+ examples=["{\"key\": \"value\"}", "{\"status\": \"active\"}"]
71
+ )),
72
+ cut_off=(Optional[float], Field(description="Cut-off score for search results", default=0.5)),
73
+ search_top=(Optional[int], Field(description="Number of top results to return", default=10)),
74
+ reranker=(Optional[dict], Field(
75
+ description="Reranker configuration. Can be a dictionary with reranking parameters.",
76
+ default={}
77
+ )),
78
+ full_text_search=(Optional[Dict[str, Any]], Field(
79
+ description="Full text search parameters. Can be a dictionary with search options.",
80
+ default=None
81
+ )),
82
+ reranking_config=(Optional[Dict[str, Dict[str, Any]]], Field(
83
+ description="Reranking configuration. Can be a dictionary with reranking settings.",
84
+ default=None
85
+ )),
86
+ extended_search=(Optional[List[str]], Field(
87
+ description="List of additional fields to include in the search results.",
88
+ default=None
89
+ )),
90
+ )
91
+
92
+ BaseIndexDataParams = create_model(
93
+ "indexData",
94
+ __base__=BaseIndexParams,
95
+ progress_step=(Optional[int], Field(default=10, ge=0, le=100,
96
+ description="Optional step size for progress reporting during indexing")),
97
+ clean_index=(Optional[bool], Field(default=False,
98
+ description="Optional flag to enforce clean existing index before indexing new data")),
99
+ chunking_tool=(Literal[None,'markdown', 'statistical', 'proposal'], Field(description="Name of chunking tool", default=None)),
100
+ chunking_config=(Optional[dict], Field(description="Chunking tool configuration", default_factory=dict)),
101
+ )
102
+
103
+
104
+ class BaseIndexerToolkit(VectorStoreWrapperBase):
105
+ """Base class for tool API wrappers that support vector store functionality."""
106
+
107
+ doctype: str = "document"
108
+
109
+ llm: Any = None
110
+ connection_string: Optional[SecretStr] = None
111
+ collection_name: Optional[str] = None
112
+ embedding_model: Optional[str] = "HuggingFaceEmbeddings"
113
+ embedding_model_params: Optional[Dict[str, Any]] = {"model_name": "sentence-transformers/all-MiniLM-L6-v2"}
114
+ vectorstore_type: Optional[str] = "PGVector"
115
+ _embedding: Optional[Any] = None
116
+ alita: Any = None # Elitea client, if available
117
+
118
+ def __init__(self, **kwargs):
119
+ conn = kwargs.get('connection_string', None)
120
+ connection_string = conn.get_secret_value() if isinstance(conn, SecretStr) else conn
121
+ collection_name = kwargs.get('collection_name')
122
+
123
+ # if 'embedding_model' not in kwargs:
124
+ kwargs['embedding_model'] = 'HuggingFaceEmbeddings'
125
+ if 'embedding_model_params' not in kwargs:
126
+ kwargs['embedding_model_params'] = {"model_name": "sentence-transformers/all-MiniLM-L6-v2"}
127
+ if 'vectorstore_type' not in kwargs:
128
+ kwargs['vectorstore_type'] = 'PGVector'
129
+ vectorstore_type = kwargs.get('vectorstore_type')
130
+ kwargs['vectorstore_params'] = VectorStoreAdapterFactory.create_adapter(vectorstore_type).get_vectorstore_params(collection_name, connection_string)
131
+ kwargs['_embedding'] = kwargs.get('alita').get_embeddings(kwargs.get('embedding_model'))
132
+ super().__init__(**kwargs)
133
+
134
+ def _index_tool_params(self, **kwargs) -> dict[str, tuple[type, Field]]:
135
+ """
136
+ Returns a list of fields for index_data args schema.
137
+ NOTE: override this method in subclasses to provide specific parameters for certain toolkit.
138
+ """
139
+ return {}
140
+
141
+ def _base_loader(self, **kwargs) -> Generator[Document, None, None]:
142
+ """ Loads documents from a source, processes them,
143
+ and yields Document objects with base metadata: id and created_on."""
144
+ pass
145
+
146
+ def _process_document(self, base_document: Document) -> Generator[Document, None, None]:
147
+ """ Process an existing base document to extract relevant metadata for full document preparation.
148
+ Used for late processing of documents after we ensure that the document has to be indexed to avoid
149
+ time-consuming operations for documents which might be useless.
150
+
151
+ Args:
152
+ base_document (Document): The base document to process.
153
+
154
+ Returns:
155
+ Generator[Document, None, None]: Documents derived from the base document, with metadata."""
156
+ pass
157
+
158
+ def index_data(self, **kwargs):
159
+ collection_suffix = kwargs.get("collection_suffix")
160
+ progress_step = kwargs.get("progress_step")
161
+ clean_index = kwargs.get("clean_index")
162
+ chunking_tool = kwargs.get("chunking_tool")
163
+ chunking_config = kwargs.get("chunking_config")
164
+ #
165
+ if clean_index:
166
+ self._clean_index()
167
+ #
168
+ documents = self._base_loader(**kwargs)
169
+ documents = self._reduce_duplicates(documents, collection_suffix)
170
+ documents = self._extend_data(documents) # update content of not-reduced base document if needed (for sharepoint and similar)
171
+ documents = self._collect_dependencies(documents) # collect dependencies for base documents
172
+ documents = self._apply_loaders_chunkers(documents, chunking_tool, chunking_config)
173
+ #
174
+ return self._save_index(list(documents), collection_suffix=collection_suffix, progress_step=progress_step)
175
+
176
+ def _apply_loaders_chunkers(self, documents: Generator[Document, None, None], chunking_tool: str=None, chunking_config=None) -> Generator[Document, None, None]:
177
+ from alita_sdk.tools.chunkers import __confluence_chunkers__ as chunkers, __confluence_models__ as models
178
+
179
+ if chunking_config is None:
180
+ chunking_config = {}
181
+ chunking_config['embedding'] = self._embedding
182
+ chunking_config['llm'] = self.llm
183
+
184
+ for document in documents:
185
+ if content_type := document.metadata.get('loader_content_type', None):
186
+ # apply parsing based on content type and chunk if chunker was applied to parent doc
187
+ yield from process_content_by_type(
188
+ document=document,
189
+ extension_source=content_type, llm=self.llm, chunking_config=chunking_config)
190
+ elif chunking_tool:
191
+ # apply default chunker from toolkit config. No parsing.
192
+ chunker = chunkers.get(chunking_tool)
193
+ yield from chunker(file_content_generator=iter([document]), config=chunking_config)
194
+ else:
195
+ # return as is if neither chunker or content typa are specified
196
+ yield document
197
+
198
+ def _extend_data(self, documents: Generator[Document, None, None]):
199
+ yield from documents
200
+
201
+ def _collect_dependencies(self, documents: Generator[Document, None, None]):
202
+ for document in documents:
203
+ dependencies = self._process_document(document)
204
+ yield document
205
+ for dep in dependencies:
206
+ dep.metadata[IndexerKeywords.PARENT.value] = document.metadata.get('id', None)
207
+ yield dep
208
+
209
+ def _content_loader(self):
210
+ pass
211
+
212
+ def _reduce_duplicates(
213
+ self,
214
+ documents: Generator[Any, None, None],
215
+ collection_suffix: str,
216
+ log_msg: str = "Verification of documents to index started"
217
+ ) -> Generator[Document, None, None]:
218
+ """Generic duplicate reduction logic for documents."""
219
+ self._log_data(log_msg, tool_name="index_documents")
220
+ indexed_data = self._get_indexed_data(collection_suffix)
221
+ indexed_keys = set(indexed_data.keys())
222
+ if not indexed_keys:
223
+ self._log_data("Vectorstore is empty, indexing all incoming documents", tool_name="index_documents")
224
+ yield from documents
225
+ return
226
+
227
+ docs_to_remove = set()
228
+
229
+ for document in documents:
230
+ key = self.key_fn(document)
231
+ if key in indexed_keys and collection_suffix == indexed_data[key]['metadata'].get('collection'):
232
+ if self.compare_fn(document, indexed_data[key]):
233
+ continue
234
+ yield document
235
+ docs_to_remove.update(self.remove_ids_fn(indexed_data, key))
236
+ else:
237
+ yield document
238
+
239
+ if docs_to_remove:
240
+ self._log_data(
241
+ f"Removing {len(docs_to_remove)} documents from vectorstore that are already indexed with different updated_on.",
242
+ tool_name="index_documents"
243
+ )
244
+ self.vectorstore.delete(ids=list(docs_to_remove))
245
+
246
+ def _get_indexed_data(self, collection_suffix: str):
247
+ raise NotImplementedError("Subclasses must implement this method")
248
+
249
+ def key_fn(self, document: Document):
250
+ raise NotImplementedError("Subclasses must implement this method")
251
+
252
+ def compare_fn(self, document: Document, idx):
253
+ raise NotImplementedError("Subclasses must implement this method")
254
+
255
+ def remove_ids_fn(self, idx_data, key: str):
256
+ raise NotImplementedError("Subclasses must implement this method")
257
+
258
+ def _process_documents(self, documents: List[Document]) -> Generator[Document, None, None]:
259
+ """
260
+ Process a list of base documents to extract relevant metadata for full document preparation.
261
+ Used for late processing of documents after we ensure that the documents have to be indexed to avoid
262
+ time-consuming operations for documents which might be useless.
263
+ This function is passed to the index_documents method of the vector store and is called after the _reduce_duplicates method.
264
+
265
+ Args:
266
+ documents (List[Document]): The base documents to process.
267
+
268
+ Returns:
269
+ Generator[Document, None, None]: A generator yielding processed documents with metadata.
270
+ """
271
+ for doc in documents:
272
+ # Filter documents to process only those that either:
273
+ # - do not have a 'chunk_id' in their metadata, or
274
+ # - have 'chunk_id' explicitly set to 1.
275
+ # This prevents processing of irrelevant or duplicate chunks, improving efficiency.
276
+ chunk_id = doc.metadata.get("chunk_id")
277
+ if chunk_id is None or chunk_id == 1:
278
+ processed_docs = self._process_document(doc)
279
+ if processed_docs: # Only proceed if the list is not empty
280
+ for processed_doc in processed_docs:
281
+ # map processed document (child) to the original document (parent)
282
+ processed_doc.metadata[IndexerKeywords.PARENT.value] = doc.metadata.get('id', None)
283
+ if chunker:=self._get_dependencies_chunker(processed_doc):
284
+ yield from chunker(file_content_generator=iter([processed_doc]), config=self._get_dependencies_chunker_config())
285
+ else:
286
+ yield processed_doc
287
+
288
+ def remove_index(self, collection_suffix: str = ""):
289
+ """Cleans the indexed data in the collection."""
290
+ super()._clean_collection(collection_suffix=collection_suffix)
291
+ return (f"Collection '{collection_suffix}' has been removed from the vector store.\n"
292
+ f"Available collections: {self.list_collections()}")
293
+
294
+ def search_index(self,
295
+ query: str,
296
+ collection_suffix: str = "",
297
+ filter: dict | str = {}, cut_off: float = 0.5,
298
+ search_top: int = 10, reranker: dict = {},
299
+ full_text_search: Optional[Dict[str, Any]] = None,
300
+ reranking_config: Optional[Dict[str, Dict[str, Any]]] = None,
301
+ extended_search: Optional[List[str]] = None,
302
+ **kwargs):
303
+ """ Searches indexed documents in the vector store."""
304
+ # build filter on top of collection_suffix
305
+ filter = filter if isinstance(filter, dict) else json.loads(filter)
306
+ if collection_suffix:
307
+ filter.update({"collection": {
308
+ "$eq": collection_suffix.strip()
309
+ }})
310
+
311
+ found_docs = super().search_documents(
312
+ query,
313
+ doctype=self.doctype,
314
+ filter=filter,
315
+ cut_off=cut_off,
316
+ search_top=search_top,
317
+ reranker=reranker,
318
+ full_text_search=full_text_search,
319
+ reranking_config=reranking_config,
320
+ extended_search=extended_search
321
+ )
322
+ return found_docs if found_docs else f"No documents found by query '{query}' and filter '{filter}'"
323
+
324
+ def stepback_search_index(self,
325
+ query: str,
326
+ messages: List[Dict[str, Any]] = [],
327
+ collection_suffix: str = "",
328
+ filter: dict | str = {}, cut_off: float = 0.5,
329
+ search_top: int = 10, reranker: dict = {},
330
+ full_text_search: Optional[Dict[str, Any]] = None,
331
+ reranking_config: Optional[Dict[str, Dict[str, Any]]] = None,
332
+ extended_search: Optional[List[str]] = None,
333
+ **kwargs):
334
+ """ Searches indexed documents in the vector store."""
335
+ found_docs = super().stepback_search(
336
+ query,
337
+ messages,
338
+ self.doctype,
339
+ filter=filter,
340
+ cut_off=cut_off,
341
+ search_top=search_top,
342
+ full_text_search=full_text_search,
343
+ reranking_config=reranking_config,
344
+ extended_search=extended_search
345
+ )
346
+ return f"Found {len(found_docs)} documents matching the query\n{json.dumps(found_docs, indent=4)}" if found_docs else "No documents found matching the query."
347
+
348
+ def stepback_summary_index(self,
349
+ query: str,
350
+ messages: List[Dict[str, Any]] = [],
351
+ collection_suffix: str = "",
352
+ filter: dict | str = {}, cut_off: float = 0.5,
353
+ search_top: int = 10, reranker: dict = {},
354
+ full_text_search: Optional[Dict[str, Any]] = None,
355
+ reranking_config: Optional[Dict[str, Dict[str, Any]]] = None,
356
+ extended_search: Optional[List[str]] = None,
357
+ **kwargs):
358
+ """ Generates a summary of indexed documents using stepback technique."""
359
+ return super().stepback_summary(
360
+ query,
361
+ messages,
362
+ self.doctype,
363
+ filter=filter,
364
+ cut_off=cut_off,
365
+ search_top=search_top,
366
+ full_text_search=full_text_search,
367
+ reranking_config=reranking_config,
368
+ extended_search=extended_search
369
+ )
370
+
371
+ def get_available_tools(self):
372
+ """
373
+ Returns the standardized vector search tools (search operations only).
374
+ Index operations are toolkit-specific and should be added manually to each toolkit.
375
+
376
+ Returns:
377
+ List of tool dictionaries with name, ref, description, and args_schema
378
+ """
379
+ return [
380
+ {
381
+ "name": "index_data",
382
+ "mode": "index_data",
383
+ "ref": self.index_data,
384
+ "description": "Loads data to index.",
385
+ "args_schema": create_model(
386
+ "IndexData",
387
+ __base__=BaseIndexDataParams,
388
+ **self._index_tool_params() if self._index_tool_params() else {}
389
+ )
390
+ },
391
+ {
392
+ "name": "search_index",
393
+ "mode": "search_index",
394
+ "ref": self.search_index,
395
+ "description": self.search_index.__doc__,
396
+ "args_schema": BaseSearchParams
397
+ },
398
+ {
399
+ "name": "stepback_search_index",
400
+ "mode": "stepback_search_index",
401
+ "ref": self.stepback_search_index,
402
+ "description": self.stepback_search_index.__doc__,
403
+ "args_schema": BaseStepbackSearchParams
404
+ },
405
+ {
406
+ "name": "stepback_summary_index",
407
+ "mode": "stepback_summary_index",
408
+ "ref": self.stepback_summary_index,
409
+ "description": self.stepback_summary_index.__doc__,
410
+ "args_schema": BaseStepbackSearchParams
411
+ },
412
+ {
413
+ "name": "remove_index",
414
+ "mode": "remove_index",
415
+ "ref": self.remove_index,
416
+ "description": self.remove_index.__doc__,
417
+ "args_schema": RemoveIndexParams
418
+ },
419
+ {
420
+ "name": "list_collections",
421
+ "mode": "list_collections",
422
+ "ref": self.list_collections,
423
+ "description": self.list_collections.__doc__,
424
+ "args_schema": create_model("ListCollectionsParams") # No parameters
425
+ },
426
+ ]
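
A minimal subclass sketch showing the hooks a concrete toolkit is expected to fill in; the class name, the `space` parameter and the method bodies are invented for illustration and are not part of the package:

    from typing import Generator
    from langchain_core.documents import Document
    from pydantic import Field
    from alita_sdk.tools.non_code_indexer_toolkit import NonCodeIndexerToolkit

    class MyWikiIndexerToolkit(NonCodeIndexerToolkit):
        """Hypothetical toolkit reusing the duplicate detection from NonCodeIndexerToolkit."""

        def _index_tool_params(self):
            # extra arguments exposed on the generated index_data schema
            return {"space": (str, Field(description="Wiki space to index"))}

        def _base_loader(self, space: str = "", **kwargs) -> Generator[Document, None, None]:
            # yield lightweight documents carrying 'id' and 'updated_on' metadata;
            # heavy content is resolved later in _process_document
            yield Document(page_content="", metadata={"id": "page-1", "updated_on": "2024-01-01"})

        def _process_document(self, base_document: Document) -> Generator[Document, None, None]:
            # yield dependent documents (attachments, comments, ...) for the base document
            yield from ()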
@@ -31,13 +31,11 @@ LoaderSchema = create_model(
31
31
  BaseIndexParams = create_model(
32
32
  "BaseIndexParams",
33
33
  collection_suffix=(str, Field(description="Suffix for collection name (max 7 characters) used to separate datasets", min_length=1, max_length=7)),
34
- vectorstore_type=(Optional[str], Field(description="Vectorstore type (Chroma, PGVector, Elastic, etc.)", default="PGVector")),
35
34
  )
36
35
 
37
36
  BaseCodeIndexParams = create_model(
38
37
  "BaseCodeIndexParams",
39
38
  collection_suffix=(str, Field(description="Suffix for collection name (max 7 characters) used to separate datasets", min_length=1, max_length=7)),
40
- vectorstore_type=(Optional[str], Field(description="Vectorstore type (Chroma, PGVector, Elastic, etc.)", default="PGVector")),
41
39
  branch=(Optional[str], Field(description="Branch to index files from. Defaults to active branch if None.", default=None)),
42
40
  whitelist=(Optional[List[str]], Field(description="File extensions or paths to include. Defaults to all files if None.", default=None)),
43
41
  blacklist=(Optional[List[str]], Field(description="File extensions or paths to exclude. Defaults to no exclusions if None.", default=None)),
@@ -54,7 +52,6 @@ BaseSearchParams = create_model(
54
52
  collection_suffix=(Optional[str], Field(
55
53
  description="Optional suffix for collection name (max 7 characters). Leave empty to search across all datasets",
56
54
  default="", max_length=7)),
57
- vectorstore_type=(Optional[str], Field(description="Vectorstore type (Chroma, PGVector, Elastic, etc.)", default="PGVector")),
58
55
  filter=(Optional[dict | str], Field(
59
56
  description="Filter to apply to the search results. Can be a dictionary or a JSON string.",
60
57
  default={},
@@ -84,7 +81,6 @@ BaseStepbackSearchParams = create_model(
84
81
  "BaseStepbackSearchParams",
85
82
  query=(str, Field(description="Query text to search in the index")),
86
83
  collection_suffix=(Optional[str], Field(description="Optional suffix for collection name (max 7 characters)", default="", max_length=7)),
87
- vectorstore_type=(Optional[str], Field(description="Vectorstore type (Chroma, PGVector, Elastic, etc.)", default="PGVector")),
88
84
  messages=(Optional[List], Field(description="Chat messages for stepback search context", default=[])),
89
85
  filter=(Optional[dict | str], Field(
90
86
  description="Filter to apply to the search results. Can be a dictionary or a JSON string.",
@@ -572,12 +568,14 @@ class BaseCodeToolApiWrapper(BaseVectorStoreToolApiWrapper):
572
568
 
573
569
  def is_whitelisted(file_path: str) -> bool:
574
570
  if whitelist:
575
- return any(fnmatch.fnmatch(file_path, pattern) for pattern in whitelist)
571
+ return (any(fnmatch.fnmatch(file_path, pattern) for pattern in whitelist)
572
+ or any(file_path.endswith(f'.{pattern}') for pattern in whitelist))
576
573
  return True
577
574
 
578
575
  def is_blacklisted(file_path: str) -> bool:
579
576
  if blacklist:
580
- return any(fnmatch.fnmatch(file_path, pattern) for pattern in blacklist)
577
+ return (any(fnmatch.fnmatch(file_path, pattern) for pattern in blacklist)
578
+ or any(file_path.endswith(f'.{pattern}') for pattern in blacklist))
581
579
  return False
582
580
 
583
581
  def file_content_generator():
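
With this change a whitelist or blacklist entry can be either a glob pattern or a bare extension. A small standalone illustration that mirrors the updated check:

    import fnmatch

    def matches(file_path: str, patterns: list[str]) -> bool:
        # mirrors the updated is_whitelisted/is_blacklisted logic
        return (any(fnmatch.fnmatch(file_path, pattern) for pattern in patterns)
                or any(file_path.endswith(f'.{pattern}') for pattern in patterns))

    assert matches("src/app/main.py", ["*.py"])  # glob pattern, as before
    assert matches("src/app/main.py", ["py"])    # bare extension now matches too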
@@ -0,0 +1,23 @@
1
+ from langchain_core.documents import Document
2
+
3
+ from alita_sdk.runtime.utils.utils import IndexerKeywords
4
+ from alita_sdk.tools.base_indexer_toolkit import BaseIndexerToolkit
5
+
6
+
7
+ class NonCodeIndexerToolkit(BaseIndexerToolkit):
8
+ def _get_indexed_data(self, collection_suffix: str):
9
+ return self.vector_adapter.get_indexed_data(self, collection_suffix)
10
+
11
+ def key_fn(self, document: Document):
12
+ return document.metadata.get('id')
13
+
14
+ def compare_fn(self, document: Document, idx_data):
15
+ return (document.metadata.get('updated_on')
16
+ and idx_data['metadata'].get('updated_on')
17
+ and document.metadata.get('updated_on') == idx_data['metadata'].get('updated_on'))
18
+
19
+ def remove_ids_fn(self, idx_data, key: str):
20
+ return (idx_data[key]['all_chunks'] +
21
+ [idx_data[dep_id]['id'] for dep_id in idx_data[key][IndexerKeywords.DEPENDENT_DOCS.value]] +
22
+ [chunk_db_id for dep_id in idx_data[key][IndexerKeywords.DEPENDENT_DOCS.value] for chunk_db_id in
23
+ idx_data[dep_id]['all_chunks']])
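
For reference, the per-key structure these hooks assume in the data returned by vector_adapter.get_indexed_data, inferred from compare_fn and remove_ids_fn; the field values below are invented, and the exact shape is defined by the adapter, which is not shown in this diff:

    from alita_sdk.runtime.utils.utils import IndexerKeywords

    # illustrative entry keyed by source document id
    indexed_data = {
        "page-1": {
            "id": "row-17",                                   # vectorstore id of the parent chunk
            "metadata": {"updated_on": "2024-01-01", "collection": "wiki"},
            "all_chunks": ["row-17", "row-18"],               # db ids of all chunks of this document
            IndexerKeywords.DEPENDENT_DOCS.value: ["att-9"],  # ids of dependent documents
        },
        "att-9": {"id": "row-30", "metadata": {}, "all_chunks": ["row-30"],
                  IndexerKeywords.DEPENDENT_DOCS.value: []},
    }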
@@ -1,11 +1,16 @@
1
1
  import os
2
2
  import tempfile
3
+ from copy import deepcopy as copy
3
4
  from logging import getLogger
4
5
  from pathlib import Path
6
+ from typing import Generator
5
7
 
8
+ from langchain_core.documents import Document
6
9
  from langchain_core.tools import ToolException
10
+ from langchain_text_splitters import TokenTextSplitter
7
11
 
8
12
  from alita_sdk.runtime.langchain.document_loaders.constants import loaders_map
13
+ from alita_sdk.tools.chunkers.utils import tiktoken_length
9
14
 
10
15
  logger = getLogger(__name__)
11
16
 
@@ -161,6 +166,61 @@ def load_content_from_bytes(file_content: bytes, extension: str = None, loader_e
161
166
  # Now the file is closed and can be read
162
167
  result = load_content(temp_file_path, extension, loader_extra_config, llm)
163
168
  return result
169
+ finally:
170
+ if temp_file_path and os.path.exists(temp_file_path):
171
+ os.remove(temp_file_path)
172
+
173
+ def process_content_by_type(document: Document, extension_source: str, llm = None, chunking_config={}) -> Generator[Document, None, None]:
174
+ temp_file_path = None
175
+ try:
176
+ extension = "." + extension_source.split('.')[-1].lower()
177
+
178
+ with tempfile.NamedTemporaryFile(mode='w+b', suffix=extension, delete=False) as temp_file:
179
+ temp_file_path = temp_file.name
180
+ content = document.metadata.pop('loader_content')
181
+ temp_file.write(content)
182
+ temp_file.flush()
183
+
184
+ loader_config = loaders_map.get(extension)
185
+ if not loader_config:
186
+ logger.warning(f"No loader found for file extension: {extension}. File: {temp_file_path}")
187
+ return
188
+
189
+ loader_cls = loader_config['class']
190
+ loader_kwargs = loader_config['kwargs']
191
+
192
+ loader = loader_cls(file_path=temp_file_path, **loader_kwargs)
193
+ docs_iterator = loader.load()
194
+ max_tokens = chunking_config.get('max_tokens', 512)
195
+ tokens_overlapping = chunking_config.get('tokens_overlapping', 10)
196
+ chunk_id = 0
197
+ for chunk in docs_iterator:
198
+ if tiktoken_length(chunk.page_content) > max_tokens:
199
+ for subchunk in TokenTextSplitter(encoding_name="cl100k_base",
200
+ chunk_size=max_tokens,
201
+ chunk_overlap=tokens_overlapping
202
+ ).split_text(chunk.page_content):
203
+ chunk_id += 1
204
+ headers_meta = list(chunk.metadata.values())
205
+ docmeta = copy(document.metadata)
206
+ docmeta.update({"headers": "; ".join(str(headers_meta))})
207
+ docmeta['chunk_id'] = chunk_id
208
+ docmeta['chunk_type'] = "document"
209
+ yield Document(
210
+ page_content=subchunk,
211
+ metadata=docmeta
212
+ )
213
+ else:
214
+ chunk_id += 1
215
+ headers_meta = list(chunk.metadata.values())
216
+ docmeta = copy(document.metadata)
217
+ docmeta.update({"headers": "; ".join(str(headers_meta))})
218
+ docmeta['chunk_id'] = chunk_id
219
+ docmeta['chunk_type'] = "document"
220
+ yield Document(
221
+ page_content=chunk.page_content,
222
+ metadata=docmeta
223
+ )
164
224
  finally:
165
225
  if temp_file_path and os.path.exists(temp_file_path):
166
226
  os.remove(temp_file_path)
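
A hedged example of how the new process_content_by_type helper is fed: a parent Document whose metadata carries the raw bytes and a file-name hint (values invented; llm is omitted for brevity, so loaders that need one would have to receive it):

    from langchain_core.documents import Document
    from alita_sdk.tools.utils.content_parser import process_content_by_type

    parent = Document(
        page_content="",
        metadata={
            "id": "att-9",
            "loader_content_type": "design.docx",  # extension selects the loader from loaders_map
            "loader_content": b"...raw file bytes...",
        },
    )

    # yields chunked Documents whose metadata is copied from the parent and extended
    # with chunk_id/chunk_type; max_tokens and tokens_overlapping default to 512 and 10
    chunks = list(process_content_by_type(
        document=parent,
        extension_source=parent.metadata["loader_content_type"],
        llm=None,
        chunking_config={"max_tokens": 512},
    ))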
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: alita_sdk
3
- Version: 0.3.249
3
+ Version: 0.3.250
4
4
  Summary: SDK for building langchain agents using resources from Alita
5
5
  Author-email: Artem Rozumenko <artyom.rozumenko@gmail.com>, Mikalai Biazruchka <mikalai_biazruchka@epam.com>, Roman Mitusov <roman_mitusov@epam.com>, Ivan Krakhmaliuk <lifedjik@gmail.com>, Artem Dubrovskiy <ad13box@gmail.com>
6
6
  License-Expression: Apache-2.0
@@ -106,6 +106,7 @@ alita_sdk/runtime/tools/prompt.py,sha256=nJafb_e5aOM1Rr3qGFCR-SKziU9uCsiP2okIMs9
106
106
  alita_sdk/runtime/tools/router.py,sha256=wCvZjVkdXK9dMMeEerrgKf5M790RudH68pDortnHSz0,1517
107
107
  alita_sdk/runtime/tools/tool.py,sha256=lE1hGi6qOAXG7qxtqxarD_XMQqTghdywf261DZawwno,5631
108
108
  alita_sdk/runtime/tools/vectorstore.py,sha256=l5wfovwMNvS_RgW-ZHXCh8Cm8gauunRzP0NPkzmshcQ,33852
109
+ alita_sdk/runtime/tools/vectorstore_base.py,sha256=OdJIJkjTmQ0BC-AzAOMP2phAcNATJ8gI5JoBWSSdpNU,27892
109
110
  alita_sdk/runtime/utils/AlitaCallback.py,sha256=E4LlSBuCHWiUq6W7IZExERHZY0qcmdjzc_rJlF2iQIw,7356
110
111
  alita_sdk/runtime/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
111
112
  alita_sdk/runtime/utils/constants.py,sha256=Xntx1b_uxUzT4clwqHA_U6K8y5bBqf_4lSQwXdcWrp4,13586
@@ -117,7 +118,9 @@ alita_sdk/runtime/utils/toolkit_runtime.py,sha256=MU63Fpxj0b5_r1IUUc0Q3-PN9VwL7r
117
118
  alita_sdk/runtime/utils/toolkit_utils.py,sha256=I9QFqnaqfVgN26LUr6s3XlBlG6y0CoHURnCzG7XcwVs,5311
118
119
  alita_sdk/runtime/utils/utils.py,sha256=CpEl3LCeLbhzQySz08lkKPm7Auac6IiLF7WB8wmArMI,589
119
120
  alita_sdk/tools/__init__.py,sha256=ko5TToGYZFmBrho26DRAVvrkHWxQ2sfs8gVAASinYp8,10611
120
- alita_sdk/tools/elitea_base.py,sha256=A88HXZUMdoE_YtYVY3JwhgpKWaA_LIFTu54Ms2pj_Fs,30688
121
+ alita_sdk/tools/base_indexer_toolkit.py,sha256=qQfMHzsQ2BfusKMV_DNiHOtZVheiQ4gBfy5JXjYi0UY,20231
122
+ alita_sdk/tools/elitea_base.py,sha256=kmfVA965-IkcDBnGQzfv3E14sD6RXLDzJFcj7CJ3fDc,30350
123
+ alita_sdk/tools/non_code_indexer_toolkit.py,sha256=v9uq1POE1fQKCd152mbqDtF-HSe0qoDj83k4E5LAkMI,1080
121
124
  alita_sdk/tools/ado/__init__.py,sha256=bArTObt5cqG1SkijKevWGbsIILHBA3aCStg8Q1jd69k,1243
122
125
  alita_sdk/tools/ado/utils.py,sha256=PTCludvaQmPLakF2EbCGy66Mro4-rjDtavVP-xcB2Wc,1252
123
126
  alita_sdk/tools/ado/repos/__init__.py,sha256=_vjU3yHRXmLg6BDNmJsLiM9qDYRE_JmX5kXI_irMmQQ,5789
@@ -127,7 +130,7 @@ alita_sdk/tools/ado/test_plan/test_plan_wrapper.py,sha256=jQt8kFmdAzsopjByLTMiSn
127
130
  alita_sdk/tools/ado/wiki/__init__.py,sha256=uBKo_Meu2ZxMxcxGsMmvCXyplRE2um1_PIRvdYd37rM,5171
128
131
  alita_sdk/tools/ado/wiki/ado_wrapper.py,sha256=zg6wMRar1DTp-ZRlYaQifBEnpYmTrHXskTNPdrLdy8s,14759
129
132
  alita_sdk/tools/ado/work_item/__init__.py,sha256=coDedNL0pSPLjZ6VVK1UcqWo00zxe2T4XfVXt8bMho8,5383
130
- alita_sdk/tools/ado/work_item/ado_wrapper.py,sha256=ubeF2m8J6CGZF_gnkTEbmW_eh6YWsk7bD2clu9FmZpY,28313
133
+ alita_sdk/tools/ado/work_item/ado_wrapper.py,sha256=gEywCL_kS0k1jWcDhsmYUybpIP08tH8go6CixLJGwT4,28409
131
134
  alita_sdk/tools/advanced_jira_mining/__init__.py,sha256=pUTzECqGvYaR5qWY3JPUhrImrZgc7pCXuqSe5eWIE80,4604
132
135
  alita_sdk/tools/advanced_jira_mining/data_mining_wrapper.py,sha256=nZPtuwVWp8VeHw1B8q9kdwf-6ZvHnlXTOGdcIMDkKpw,44211
133
136
  alita_sdk/tools/aws/__init__.py,sha256=tB6GCOg4XGSpR6qgbgAF4MUQ5-YmQCbWurWgrVKEKQ8,181
@@ -308,7 +311,7 @@ alita_sdk/tools/testio/api_wrapper.py,sha256=BvmL5h634BzG6p7ajnQLmj-uoAw1gjWnd4F
308
311
  alita_sdk/tools/testrail/__init__.py,sha256=0kETjWKLU7R6mugBWsjwEUsh10pipbAeNSGJAO0FBh0,4634
309
312
  alita_sdk/tools/testrail/api_wrapper.py,sha256=K-Gc42RH2z-fK4cXi8zQq3s9A4v_pCJkRB3XKLAhypc,32056
310
313
  alita_sdk/tools/utils/__init__.py,sha256=155xepXPr4OEzs2Mz5YnjXcBpxSv1X2eznRUVoPtyK0,3268
311
- alita_sdk/tools/utils/content_parser.py,sha256=uqJoaJzl9w4Nf9yFy40sm1-qm88bvj7Y1S-1OxHkTks,7410
314
+ alita_sdk/tools/utils/content_parser.py,sha256=0HKQqGTdXHKlcz72GHEwXqLXJsRYXm35F-P1KZz0sNc,10351
312
315
  alita_sdk/tools/vector_adapters/VectorStoreAdapter.py,sha256=a6FAsiix_EvATIKUf5YT6vHh5LDyJ5uSP3LJqoxFo04,17367
313
316
  alita_sdk/tools/vector_adapters/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
314
317
  alita_sdk/tools/xray/__init__.py,sha256=GGpbiBdDQ9kMFqJEHYi7XwKpkuMMHi-ZF-IM8yFIgUM,4380
@@ -330,8 +333,8 @@ alita_sdk/tools/zephyr_scale/api_wrapper.py,sha256=JAeWf-RXohsxheUpT0iMDClc_izj-
330
333
  alita_sdk/tools/zephyr_squad/__init__.py,sha256=0AI_j27xVO5Gk5HQMFrqPTd4uvuVTpiZUicBrdfEpKg,2796
331
334
  alita_sdk/tools/zephyr_squad/api_wrapper.py,sha256=kmw_xol8YIYFplBLWTqP_VKPRhL_1ItDD0_vXTe_UuI,14906
332
335
  alita_sdk/tools/zephyr_squad/zephyr_squad_cloud_client.py,sha256=R371waHsms4sllHCbijKYs90C-9Yu0sSR3N4SUfQOgU,5066
333
- alita_sdk-0.3.249.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
334
- alita_sdk-0.3.249.dist-info/METADATA,sha256=dDaElnaGjY8jmdfIQ3k8wXejBDlZUVR7A1U_kA6rwEs,18897
335
- alita_sdk-0.3.249.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
336
- alita_sdk-0.3.249.dist-info/top_level.txt,sha256=0vJYy5p_jK6AwVb1aqXr7Kgqgk3WDtQ6t5C-XI9zkmg,10
337
- alita_sdk-0.3.249.dist-info/RECORD,,
336
+ alita_sdk-0.3.250.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
337
+ alita_sdk-0.3.250.dist-info/METADATA,sha256=vKpZbYSYxt7ruy33ugylQXj66_5YWm3Cuw3Wz-psGSA,18897
338
+ alita_sdk-0.3.250.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
339
+ alita_sdk-0.3.250.dist-info/top_level.txt,sha256=0vJYy5p_jK6AwVb1aqXr7Kgqgk3WDtQ6t5C-XI9zkmg,10
340
+ alita_sdk-0.3.250.dist-info/RECORD,,