alita-sdk 0.3.351__py3-none-any.whl → 0.3.499__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (206)
  1. alita_sdk/cli/__init__.py +10 -0
  2. alita_sdk/cli/__main__.py +17 -0
  3. alita_sdk/cli/agent/__init__.py +5 -0
  4. alita_sdk/cli/agent/default.py +258 -0
  5. alita_sdk/cli/agent_executor.py +155 -0
  6. alita_sdk/cli/agent_loader.py +215 -0
  7. alita_sdk/cli/agent_ui.py +228 -0
  8. alita_sdk/cli/agents.py +3601 -0
  9. alita_sdk/cli/callbacks.py +647 -0
  10. alita_sdk/cli/cli.py +168 -0
  11. alita_sdk/cli/config.py +306 -0
  12. alita_sdk/cli/context/__init__.py +30 -0
  13. alita_sdk/cli/context/cleanup.py +198 -0
  14. alita_sdk/cli/context/manager.py +731 -0
  15. alita_sdk/cli/context/message.py +285 -0
  16. alita_sdk/cli/context/strategies.py +289 -0
  17. alita_sdk/cli/context/token_estimation.py +127 -0
  18. alita_sdk/cli/formatting.py +182 -0
  19. alita_sdk/cli/input_handler.py +419 -0
  20. alita_sdk/cli/inventory.py +1256 -0
  21. alita_sdk/cli/mcp_loader.py +315 -0
  22. alita_sdk/cli/toolkit.py +327 -0
  23. alita_sdk/cli/toolkit_loader.py +85 -0
  24. alita_sdk/cli/tools/__init__.py +43 -0
  25. alita_sdk/cli/tools/approval.py +224 -0
  26. alita_sdk/cli/tools/filesystem.py +1751 -0
  27. alita_sdk/cli/tools/planning.py +389 -0
  28. alita_sdk/cli/tools/terminal.py +414 -0
  29. alita_sdk/community/__init__.py +64 -8
  30. alita_sdk/community/inventory/__init__.py +224 -0
  31. alita_sdk/community/inventory/config.py +257 -0
  32. alita_sdk/community/inventory/enrichment.py +2137 -0
  33. alita_sdk/community/inventory/extractors.py +1469 -0
  34. alita_sdk/community/inventory/ingestion.py +3172 -0
  35. alita_sdk/community/inventory/knowledge_graph.py +1457 -0
  36. alita_sdk/community/inventory/parsers/__init__.py +218 -0
  37. alita_sdk/community/inventory/parsers/base.py +295 -0
  38. alita_sdk/community/inventory/parsers/csharp_parser.py +907 -0
  39. alita_sdk/community/inventory/parsers/go_parser.py +851 -0
  40. alita_sdk/community/inventory/parsers/html_parser.py +389 -0
  41. alita_sdk/community/inventory/parsers/java_parser.py +593 -0
  42. alita_sdk/community/inventory/parsers/javascript_parser.py +629 -0
  43. alita_sdk/community/inventory/parsers/kotlin_parser.py +768 -0
  44. alita_sdk/community/inventory/parsers/markdown_parser.py +362 -0
  45. alita_sdk/community/inventory/parsers/python_parser.py +604 -0
  46. alita_sdk/community/inventory/parsers/rust_parser.py +858 -0
  47. alita_sdk/community/inventory/parsers/swift_parser.py +832 -0
  48. alita_sdk/community/inventory/parsers/text_parser.py +322 -0
  49. alita_sdk/community/inventory/parsers/yaml_parser.py +370 -0
  50. alita_sdk/community/inventory/patterns/__init__.py +61 -0
  51. alita_sdk/community/inventory/patterns/ast_adapter.py +380 -0
  52. alita_sdk/community/inventory/patterns/loader.py +348 -0
  53. alita_sdk/community/inventory/patterns/registry.py +198 -0
  54. alita_sdk/community/inventory/presets.py +535 -0
  55. alita_sdk/community/inventory/retrieval.py +1403 -0
  56. alita_sdk/community/inventory/toolkit.py +173 -0
  57. alita_sdk/community/inventory/visualize.py +1370 -0
  58. alita_sdk/configurations/bitbucket.py +94 -2
  59. alita_sdk/configurations/confluence.py +96 -1
  60. alita_sdk/configurations/gitlab.py +79 -0
  61. alita_sdk/configurations/jira.py +103 -0
  62. alita_sdk/configurations/testrail.py +88 -0
  63. alita_sdk/configurations/xray.py +93 -0
  64. alita_sdk/configurations/zephyr_enterprise.py +93 -0
  65. alita_sdk/configurations/zephyr_essential.py +75 -0
  66. alita_sdk/runtime/clients/artifact.py +1 -1
  67. alita_sdk/runtime/clients/client.py +214 -42
  68. alita_sdk/runtime/clients/mcp_discovery.py +342 -0
  69. alita_sdk/runtime/clients/mcp_manager.py +262 -0
  70. alita_sdk/runtime/clients/sandbox_client.py +373 -0
  71. alita_sdk/runtime/langchain/assistant.py +118 -30
  72. alita_sdk/runtime/langchain/constants.py +8 -1
  73. alita_sdk/runtime/langchain/document_loaders/AlitaDocxMammothLoader.py +315 -3
  74. alita_sdk/runtime/langchain/document_loaders/AlitaExcelLoader.py +103 -60
  75. alita_sdk/runtime/langchain/document_loaders/AlitaJSONLoader.py +4 -1
  76. alita_sdk/runtime/langchain/document_loaders/AlitaPowerPointLoader.py +41 -12
  77. alita_sdk/runtime/langchain/document_loaders/AlitaTableLoader.py +1 -1
  78. alita_sdk/runtime/langchain/document_loaders/constants.py +116 -99
  79. alita_sdk/runtime/langchain/interfaces/llm_processor.py +2 -2
  80. alita_sdk/runtime/langchain/langraph_agent.py +307 -71
  81. alita_sdk/runtime/langchain/utils.py +48 -8
  82. alita_sdk/runtime/llms/preloaded.py +2 -6
  83. alita_sdk/runtime/models/mcp_models.py +61 -0
  84. alita_sdk/runtime/toolkits/__init__.py +26 -0
  85. alita_sdk/runtime/toolkits/application.py +9 -2
  86. alita_sdk/runtime/toolkits/artifact.py +18 -6
  87. alita_sdk/runtime/toolkits/datasource.py +13 -6
  88. alita_sdk/runtime/toolkits/mcp.py +780 -0
  89. alita_sdk/runtime/toolkits/planning.py +178 -0
  90. alita_sdk/runtime/toolkits/tools.py +205 -55
  91. alita_sdk/runtime/toolkits/vectorstore.py +9 -4
  92. alita_sdk/runtime/tools/__init__.py +11 -3
  93. alita_sdk/runtime/tools/application.py +7 -0
  94. alita_sdk/runtime/tools/artifact.py +225 -12
  95. alita_sdk/runtime/tools/function.py +95 -5
  96. alita_sdk/runtime/tools/graph.py +10 -4
  97. alita_sdk/runtime/tools/image_generation.py +212 -0
  98. alita_sdk/runtime/tools/llm.py +494 -102
  99. alita_sdk/runtime/tools/mcp_inspect_tool.py +284 -0
  100. alita_sdk/runtime/tools/mcp_remote_tool.py +181 -0
  101. alita_sdk/runtime/tools/mcp_server_tool.py +4 -4
  102. alita_sdk/runtime/tools/planning/__init__.py +36 -0
  103. alita_sdk/runtime/tools/planning/models.py +246 -0
  104. alita_sdk/runtime/tools/planning/wrapper.py +607 -0
  105. alita_sdk/runtime/tools/router.py +2 -1
  106. alita_sdk/runtime/tools/sandbox.py +180 -79
  107. alita_sdk/runtime/tools/vectorstore.py +22 -21
  108. alita_sdk/runtime/tools/vectorstore_base.py +125 -52
  109. alita_sdk/runtime/utils/AlitaCallback.py +106 -20
  110. alita_sdk/runtime/utils/mcp_client.py +465 -0
  111. alita_sdk/runtime/utils/mcp_oauth.py +244 -0
  112. alita_sdk/runtime/utils/mcp_sse_client.py +405 -0
  113. alita_sdk/runtime/utils/mcp_tools_discovery.py +124 -0
  114. alita_sdk/runtime/utils/streamlit.py +40 -13
  115. alita_sdk/runtime/utils/toolkit_utils.py +28 -9
  116. alita_sdk/runtime/utils/utils.py +12 -0
  117. alita_sdk/tools/__init__.py +77 -33
  118. alita_sdk/tools/ado/repos/__init__.py +7 -6
  119. alita_sdk/tools/ado/repos/repos_wrapper.py +11 -11
  120. alita_sdk/tools/ado/test_plan/__init__.py +7 -7
  121. alita_sdk/tools/ado/wiki/__init__.py +7 -11
  122. alita_sdk/tools/ado/wiki/ado_wrapper.py +89 -15
  123. alita_sdk/tools/ado/work_item/__init__.py +7 -11
  124. alita_sdk/tools/ado/work_item/ado_wrapper.py +17 -8
  125. alita_sdk/tools/advanced_jira_mining/__init__.py +8 -7
  126. alita_sdk/tools/aws/delta_lake/__init__.py +11 -9
  127. alita_sdk/tools/azure_ai/search/__init__.py +7 -6
  128. alita_sdk/tools/base_indexer_toolkit.py +345 -70
  129. alita_sdk/tools/bitbucket/__init__.py +9 -8
  130. alita_sdk/tools/bitbucket/api_wrapper.py +50 -6
  131. alita_sdk/tools/browser/__init__.py +4 -4
  132. alita_sdk/tools/carrier/__init__.py +4 -6
  133. alita_sdk/tools/chunkers/__init__.py +3 -1
  134. alita_sdk/tools/chunkers/sematic/json_chunker.py +1 -0
  135. alita_sdk/tools/chunkers/sematic/markdown_chunker.py +97 -6
  136. alita_sdk/tools/chunkers/sematic/proposal_chunker.py +1 -1
  137. alita_sdk/tools/chunkers/universal_chunker.py +270 -0
  138. alita_sdk/tools/cloud/aws/__init__.py +7 -6
  139. alita_sdk/tools/cloud/azure/__init__.py +7 -6
  140. alita_sdk/tools/cloud/gcp/__init__.py +7 -6
  141. alita_sdk/tools/cloud/k8s/__init__.py +7 -6
  142. alita_sdk/tools/code/linter/__init__.py +7 -7
  143. alita_sdk/tools/code/loaders/codesearcher.py +3 -2
  144. alita_sdk/tools/code/sonar/__init__.py +8 -7
  145. alita_sdk/tools/code_indexer_toolkit.py +199 -0
  146. alita_sdk/tools/confluence/__init__.py +9 -8
  147. alita_sdk/tools/confluence/api_wrapper.py +171 -75
  148. alita_sdk/tools/confluence/loader.py +10 -0
  149. alita_sdk/tools/custom_open_api/__init__.py +9 -4
  150. alita_sdk/tools/elastic/__init__.py +8 -7
  151. alita_sdk/tools/elitea_base.py +492 -52
  152. alita_sdk/tools/figma/__init__.py +7 -7
  153. alita_sdk/tools/figma/api_wrapper.py +2 -1
  154. alita_sdk/tools/github/__init__.py +9 -9
  155. alita_sdk/tools/github/api_wrapper.py +9 -26
  156. alita_sdk/tools/github/github_client.py +62 -2
  157. alita_sdk/tools/gitlab/__init__.py +8 -8
  158. alita_sdk/tools/gitlab/api_wrapper.py +135 -33
  159. alita_sdk/tools/gitlab_org/__init__.py +7 -8
  160. alita_sdk/tools/google/bigquery/__init__.py +11 -12
  161. alita_sdk/tools/google_places/__init__.py +8 -7
  162. alita_sdk/tools/jira/__init__.py +9 -7
  163. alita_sdk/tools/jira/api_wrapper.py +100 -52
  164. alita_sdk/tools/keycloak/__init__.py +8 -7
  165. alita_sdk/tools/localgit/local_git.py +56 -54
  166. alita_sdk/tools/memory/__init__.py +1 -1
  167. alita_sdk/tools/non_code_indexer_toolkit.py +3 -2
  168. alita_sdk/tools/ocr/__init__.py +8 -7
  169. alita_sdk/tools/openapi/__init__.py +10 -1
  170. alita_sdk/tools/pandas/__init__.py +8 -7
  171. alita_sdk/tools/postman/__init__.py +7 -8
  172. alita_sdk/tools/postman/api_wrapper.py +19 -8
  173. alita_sdk/tools/postman/postman_analysis.py +8 -1
  174. alita_sdk/tools/pptx/__init__.py +8 -9
  175. alita_sdk/tools/qtest/__init__.py +16 -11
  176. alita_sdk/tools/qtest/api_wrapper.py +1784 -88
  177. alita_sdk/tools/rally/__init__.py +7 -8
  178. alita_sdk/tools/report_portal/__init__.py +9 -7
  179. alita_sdk/tools/salesforce/__init__.py +7 -7
  180. alita_sdk/tools/servicenow/__init__.py +10 -10
  181. alita_sdk/tools/sharepoint/__init__.py +7 -6
  182. alita_sdk/tools/sharepoint/api_wrapper.py +127 -36
  183. alita_sdk/tools/sharepoint/authorization_helper.py +191 -1
  184. alita_sdk/tools/sharepoint/utils.py +8 -2
  185. alita_sdk/tools/slack/__init__.py +7 -6
  186. alita_sdk/tools/sql/__init__.py +8 -7
  187. alita_sdk/tools/sql/api_wrapper.py +71 -23
  188. alita_sdk/tools/testio/__init__.py +7 -6
  189. alita_sdk/tools/testrail/__init__.py +8 -9
  190. alita_sdk/tools/utils/__init__.py +26 -4
  191. alita_sdk/tools/utils/content_parser.py +88 -60
  192. alita_sdk/tools/utils/text_operations.py +254 -0
  193. alita_sdk/tools/vector_adapters/VectorStoreAdapter.py +76 -26
  194. alita_sdk/tools/xray/__init__.py +9 -7
  195. alita_sdk/tools/zephyr/__init__.py +7 -6
  196. alita_sdk/tools/zephyr_enterprise/__init__.py +8 -6
  197. alita_sdk/tools/zephyr_essential/__init__.py +7 -6
  198. alita_sdk/tools/zephyr_essential/api_wrapper.py +12 -13
  199. alita_sdk/tools/zephyr_scale/__init__.py +7 -6
  200. alita_sdk/tools/zephyr_squad/__init__.py +7 -6
  201. {alita_sdk-0.3.351.dist-info → alita_sdk-0.3.499.dist-info}/METADATA +147 -2
  202. {alita_sdk-0.3.351.dist-info → alita_sdk-0.3.499.dist-info}/RECORD +206 -130
  203. alita_sdk-0.3.499.dist-info/entry_points.txt +2 -0
  204. {alita_sdk-0.3.351.dist-info → alita_sdk-0.3.499.dist-info}/WHEEL +0 -0
  205. {alita_sdk-0.3.351.dist-info → alita_sdk-0.3.499.dist-info}/licenses/LICENSE +0 -0
  206. {alita_sdk-0.3.351.dist-info → alita_sdk-0.3.499.dist-info}/top_level.txt +0 -0
alita_sdk/tools/base_indexer_toolkit.py
@@ -1,40 +1,57 @@
+import copy
 import json
 import logging
-from typing import Any, Optional, List, Literal, Dict, Generator
+import time
+from enum import Enum
+from typing import Any, Optional, List, Dict, Generator
 
+from langchain_core.callbacks import dispatch_custom_event
 from langchain_core.documents import Document
 from pydantic import create_model, Field, SecretStr
 
 from .utils.content_parser import file_extension_by_chunker, process_document_by_type
 from .vector_adapters.VectorStoreAdapter import VectorStoreAdapterFactory
+from ..runtime.langchain.document_loaders.constants import loaders_allowed_to_override
 from ..runtime.tools.vectorstore_base import VectorStoreWrapperBase
 from ..runtime.utils.utils import IndexerKeywords
 
 logger = logging.getLogger(__name__)
 
+DEFAULT_CUT_OFF = 0.1
+INDEX_META_UPDATE_INTERVAL = 600.0
+
+class IndexTools(str, Enum):
+    """Enum for index-related tool names."""
+    INDEX_DATA = "index_data"
+    SEARCH_INDEX = "search_index"
+    STEPBACK_SEARCH_INDEX = "stepback_search_index"
+    STEPBACK_SUMMARY_INDEX = "stepback_summary_index"
+    REMOVE_INDEX = "remove_index"
+    LIST_COLLECTIONS = "list_collections"
+
 # Base Vector Store Schema Models
 BaseIndexParams = create_model(
     "BaseIndexParams",
-    collection_suffix=(str, Field(description="Suffix for collection name (max 7 characters) used to separate datasets", min_length=1, max_length=7)),
+    index_name=(str, Field(description="Index name (max 7 characters)", min_length=1, max_length=7)),
 )
 
 RemoveIndexParams = create_model(
     "RemoveIndexParams",
-    collection_suffix=(Optional[str], Field(description="Optional suffix for collection name (max 7 characters)", default="", max_length=7)),
+    index_name=(Optional[str], Field(description="Optional index name (max 7 characters)", default="", max_length=7)),
 )
 
 BaseSearchParams = create_model(
     "BaseSearchParams",
     query=(str, Field(description="Query text to search in the index")),
-    collection_suffix=(Optional[str], Field(
-        description="Optional suffix for collection name (max 7 characters). Leave empty to search across all datasets",
+    index_name=(Optional[str], Field(
+        description="Optional index name (max 7 characters). Leave empty to search across all datasets",
         default="", max_length=7)),
     filter=(Optional[dict | str], Field(
         description="Filter to apply to the search results. Can be a dictionary or a JSON string.",
         default={},
         examples=["{\"key\": \"value\"}", "{\"status\": \"active\"}"]
     )),
-    cut_off=(Optional[float], Field(description="Cut-off score for search results", default=0.5, ge=0, le=1)),
+    cut_off=(Optional[float], Field(description="Cut-off score for search results", default=DEFAULT_CUT_OFF, ge=0, le=1)),
    search_top=(Optional[int], Field(description="Number of top results to return", default=10)),
     full_text_search=(Optional[Dict[str, Any]], Field(
         description="Full text search parameters. Can be a dictionary with search options.",
@@ -57,14 +74,14 @@ BaseSearchParams = create_model(
 BaseStepbackSearchParams = create_model(
     "BaseStepbackSearchParams",
     query=(str, Field(description="Query text to search in the index")),
-    collection_suffix=(Optional[str], Field(description="Optional suffix for collection name (max 7 characters)", default="", max_length=7)),
+    index_name=(Optional[str], Field(description="Optional index name (max 7 characters)", default="", max_length=7)),
     messages=(Optional[List], Field(description="Chat messages for stepback search context", default=[])),
     filter=(Optional[dict | str], Field(
         description="Filter to apply to the search results. Can be a dictionary or a JSON string.",
         default={},
         examples=["{\"key\": \"value\"}", "{\"status\": \"active\"}"]
     )),
-    cut_off=(Optional[float], Field(description="Cut-off score for search results", default=0.5, ge=0, le=1)),
+    cut_off=(Optional[float], Field(description="Cut-off score for search results", default=DEFAULT_CUT_OFF, ge=0, le=1)),
     search_top=(Optional[int], Field(description="Number of top results to return", default=10)),
     full_text_search=(Optional[Dict[str, Any]], Field(
         description="Full text search parameters. Can be a dictionary with search options.",
@@ -91,7 +108,7 @@ BaseIndexDataParams = create_model(
         description="Optional flag to enforce clean existing index before indexing new data")),
     progress_step=(Optional[int], Field(default=10, ge=0, le=100,
         description="Optional step size for progress reporting during indexing")),
-    chunking_config=(Optional[dict], Field(description="Chunking tool configuration", default_factory=dict)),
+    chunking_config=(Optional[dict], Field(description="Chunking tool configuration", default=loaders_allowed_to_override)),
 )
 
 
@@ -107,7 +124,7 @@ class BaseIndexerToolkit(VectorStoreWrapperBase):
     def __init__(self, **kwargs):
         conn = kwargs.get('connection_string', None)
         connection_string = conn.get_secret_value() if isinstance(conn, SecretStr) else conn
-        collection_name = kwargs.get('collection_name')
+        collection_name = kwargs.get('collection_schema')
 
         if 'vectorstore_type' not in kwargs:
             kwargs['vectorstore_type'] = 'PGVector'
@@ -147,32 +164,122 @@ class BaseIndexerToolkit(VectorStoreWrapperBase):
         yield from ()
 
     def index_data(self, **kwargs):
-        collection_suffix = kwargs.get("collection_suffix")
-        progress_step = kwargs.get("progress_step")
+        index_name = kwargs.get("index_name")
         clean_index = kwargs.get("clean_index")
         chunking_tool = kwargs.get("chunking_tool")
         chunking_config = kwargs.get("chunking_config")
+
+        # Store the interval in a private dict to avoid Pydantic field errors
+        if not hasattr(self, "_index_meta_config"):
+            self._index_meta_config: Dict[str, Any] = {}
+
+        self._index_meta_config["update_interval"] = kwargs.get(
+            "meta_update_interval",
+            INDEX_META_UPDATE_INTERVAL,
+        )
+
+        result = {"count": 0}
         #
-        if clean_index:
-            self._clean_index(collection_suffix)
+        try:
+            if clean_index:
+                self._clean_index(index_name)
+            #
+            self.index_meta_init(index_name, kwargs)
+            self._emit_index_event(index_name)
+            #
+            self._log_tool_event(f"Indexing data into collection with suffix '{index_name}'. It can take some time...")
+            self._log_tool_event(f"Loading the documents to index...{kwargs}")
+            documents = self._base_loader(**kwargs)
+            documents = list(documents)  # consume/exhaust generator to count items
+            documents_count = len(documents)
+            documents = (doc for doc in documents)
+            self._log_tool_event(f"Base documents were pre-loaded. "
+                                 f"Search for possible document duplicates and remove them from the indexing list...")
+            documents = self._reduce_duplicates(documents, index_name)
+            self._log_tool_event(f"Duplicates were removed. "
+                                 f"Processing documents to collect dependencies and prepare them for indexing...")
+            self._save_index_generator(documents, documents_count, chunking_tool, chunking_config, index_name=index_name, result=result)
+            #
+            results_count = result["count"]
+            # Final update should always be forced
+            self.index_meta_update(index_name, IndexerKeywords.INDEX_META_COMPLETED.value, results_count, update_force=True)
+            self._emit_index_event(index_name)
+            #
+            return {"status": "ok", "message": f"successfully indexed {results_count} documents" if results_count > 0
+                    else "no new documents to index"}
+        except Exception as e:
+            # Do maximum effort at least send custom event for supposed changed status
+            msg = str(e)
+            try:
+                # Error update should also be forced
+                self.index_meta_update(index_name, IndexerKeywords.INDEX_META_FAILED.value, result["count"], update_force=True)
+            except Exception as ie:
+                logger.error(f"Failed to update index meta status to FAILED for index '{index_name}': {ie}")
+                msg = f"{msg}; additionally failed to update index meta status to FAILED: {ie}"
+            self._emit_index_event(index_name, error=msg)
+            raise e
+
+    def _save_index_generator(self, base_documents: Generator[Document, None, None], base_total: int, chunking_tool, chunking_config, result, index_name: Optional[str] = None):
+        self._ensure_vectorstore_initialized()
+        self._log_tool_event(f"Base documents are ready for indexing. {base_total} base documents in total to index.")
+        from ..runtime.langchain.interfaces.llm_processor import add_documents
         #
-        self._log_tool_event(f"Indexing data into collection with suffix '{collection_suffix}'. It can take some time...")
-        self._log_tool_event(f"Loading the documents to index...{kwargs}")
-        documents = self._base_loader(**kwargs)
-        self._log_tool_event(f"Base documents were pre-loaded. "
-                             f"Search for possible document duplicates and remove them from the indexing list...")
-        documents = self._reduce_duplicates(documents, collection_suffix)
-        self._log_tool_event(f"Duplicates were removed. "
-                             f"Processing documents to collect dependencies and prepare them for indexing...")
-        documents = self._extend_data(documents)  # update content of not-reduced base document if needed (for sharepoint and similar)
-        documents = self._collect_dependencies(documents)  # collect dependencies for base documents
-        self._log_tool_event(f"Documents were processed. "
-                             f"Applying chunking tool '{chunking_tool}' if specified and preparing documents for indexing...")
-        documents = self._apply_loaders_chunkers(documents, chunking_tool, chunking_config)
-        list_documents = list(documents)
-        self._clean_metadata(list_documents)
-        self._log_tool_event(f"Documents are ready for indexing. Total documents to index: {len(list_documents)}")
-        return self._save_index(list_documents, collection_suffix=collection_suffix, progress_step=progress_step)
+        base_doc_counter = 0
+        pg_vector_add_docs_chunk = []
+        for base_doc in base_documents:
+            base_doc_counter += 1
+            self._log_tool_event(f"Processing dependent documents for base documents #{base_doc_counter}.")
+
+            # (base_doc for _ in range(1)) - wrap single base_doc to Generator in order to reuse existing code
+            documents = self._extend_data((base_doc for _ in range(1)))  # update content of not-reduced base document if needed (for sharepoint and similar)
+            documents = self._collect_dependencies(documents)  # collect dependencies for base documents
+            self._log_tool_event(f"Dependent documents were processed. "
+                                 f"Applying chunking tool '{chunking_tool}' if specified and preparing documents for indexing...")
+            documents = self._apply_loaders_chunkers(documents, chunking_tool, chunking_config)
+            self._clean_metadata(documents)
+
+            logger.debug(f"Indexing base document #{base_doc_counter}: {base_doc} and all dependent documents: {documents}")
+
+            dependent_docs_counter = 0
+            #
+            for doc in documents:
+                if not doc.page_content:
+                    # To avoid case when all documents have empty content
+                    # See llm_processor.add_documents which exclude metadata of docs with empty content
+                    continue
+                #
+                if 'id' not in doc.metadata or 'updated_on' not in doc.metadata:
+                    logger.warning(f"Document is missing required metadata field 'id' or 'updated_on': {doc.metadata}")
+                #
+                # if index_name is provided, add it to metadata of each document
+                if index_name:
+                    if not doc.metadata.get('collection'):
+                        doc.metadata['collection'] = index_name
+                    else:
+                        doc.metadata['collection'] += f";{index_name}"
+                #
+                try:
+                    pg_vector_add_docs_chunk.append(doc)
+                    dependent_docs_counter += 1
+                    if len(pg_vector_add_docs_chunk) >= self.max_docs_per_add:
+                        add_documents(vectorstore=self.vectorstore, documents=pg_vector_add_docs_chunk)
+                        self._log_tool_event(f"{len(pg_vector_add_docs_chunk)} documents have been indexed. Continuing...")
+                        pg_vector_add_docs_chunk = []
+                except Exception:
+                    from traceback import format_exc
+                    logger.error(f"Error: {format_exc()}")
+                    return {"status": "error", "message": f"Error: {format_exc()}"}
+            msg = f"Indexed base document #{base_doc_counter} out of {base_total} (with {dependent_docs_counter} dependencies)."
+            logger.debug(msg)
+            self._log_tool_event(msg)
+            result["count"] += dependent_docs_counter
+            # After each base document, try a non-forced meta update; throttling handled inside index_meta_update
+            try:
+                self.index_meta_update(index_name, IndexerKeywords.INDEX_META_IN_PROGRESS.value, result["count"], update_force=False)
+            except Exception as exc:  # best-effort, do not break indexing
+                logger.warning(f"Failed to update index meta during indexing process for index '{index_name}': {exc}")
+        if pg_vector_add_docs_chunk:
+            add_documents(vectorstore=self.vectorstore, documents=pg_vector_add_docs_chunk)
 
     def _apply_loaders_chunkers(self, documents: Generator[Document, None, None], chunking_tool: str=None, chunking_config=None) -> Generator[Document, None, None]:
         from ..tools.chunkers import __all__ as chunkers
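The rewritten `index_data`/`_save_index_generator` pair streams documents per base document and flushes to the vector store whenever the buffer reaches `self.max_docs_per_add`, instead of materializing the entire corpus first. A self-contained sketch of that batching pattern, with a `flush` callback standing in for `add_documents` (names here are illustrative, not SDK API):

    from typing import Callable, Iterable, List

    def add_in_batches(docs: Iterable[str], max_per_add: int,
                       flush: Callable[[List[str]], None]) -> int:
        """Buffer items and flush whenever the batch reaches max_per_add."""
        batch: List[str] = []
        count = 0
        for doc in docs:
            batch.append(doc)
            count += 1
            if len(batch) >= max_per_add:
                flush(batch)  # stands in for add_documents(vectorstore=..., documents=batch)
                batch = []
        if batch:  # trailing partial batch, mirroring the final flush in the diff
            flush(batch)
        return count

    total = add_in_batches((f"doc-{i}" for i in range(7)), 3,
                           lambda b: print(f"flushed {len(b)} docs"))
    # prints "flushed 3 docs" twice, then "flushed 1 docs"; total == 7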
@@ -222,24 +329,26 @@ class BaseIndexerToolkit(VectorStoreWrapperBase):
             dep.metadata[IndexerKeywords.PARENT.value] = document.metadata.get('id', None)
             yield dep
 
-    def _clean_metadata(self, documents: list[Document]):
+    def _clean_metadata(self, documents: Generator[Document, None, None]):
         for document in documents:
             remove_keys = self._remove_metadata_keys()
             for key in remove_keys:
                 document.metadata.pop(key, None)
+            yield document
 
     def _reduce_duplicates(
             self,
             documents: Generator[Any, None, None],
-            collection_suffix: str,
+            index_name: str,
             log_msg: str = "Verification of documents to index started"
     ) -> Generator[Document, None, None]:
         """Generic duplicate reduction logic for documents."""
-        self._log_data(log_msg, tool_name="index_documents")
-        indexed_data = self._get_indexed_data(collection_suffix)
+        self._ensure_vectorstore_initialized()
+        self._log_tool_event(log_msg, tool_name="index_documents")
+        indexed_data = self._get_indexed_data(index_name)
         indexed_keys = set(indexed_data.keys())
         if not indexed_keys:
-            self._log_data("Vectorstore is empty, indexing all incoming documents", tool_name="index_documents")
+            self._log_tool_event("Vectorstore is empty, indexing all incoming documents", tool_name="index_documents")
             yield from documents
             return
 
@@ -248,7 +357,7 @@ class BaseIndexerToolkit(VectorStoreWrapperBase):
         for document in documents:
             key = self.key_fn(document)
             key = key if isinstance(key, str) else str(key)
-            if key in indexed_keys and collection_suffix == indexed_data[key]['metadata'].get('collection'):
+            if key in indexed_keys and index_name == indexed_data[key]['metadata'].get('collection'):
                 if self.compare_fn(document, indexed_data[key]):
                     continue
                 yield document
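Note that `_clean_metadata` is now a generator: the metadata keys are only stripped when the returned iterator is consumed, so callers must iterate (or re-bind) its result rather than rely on side effects. A minimal sketch of that lazy behavior, with illustrative names:

    from typing import Dict, Generator, List

    def clean_metadata(docs: List[Dict]) -> Generator[Dict, None, None]:
        for doc in docs:
            doc.pop("temp_key", None)  # strip an unwanted metadata key
            yield doc

    docs = [{"id": 1, "temp_key": "x"}]
    clean_metadata(docs)  # generator created but never consumed: nothing is removed
    assert "temp_key" in docs[0]
    cleaned = list(clean_metadata(docs))  # consuming the generator applies the cleanup
    assert "temp_key" not in cleaned[0]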
@@ -257,13 +366,13 @@ class BaseIndexerToolkit(VectorStoreWrapperBase):
                 yield document
 
         if docs_to_remove:
-            self._log_data(
+            self._log_tool_event(
                 f"Removing {len(docs_to_remove)} documents from vectorstore that are already indexed with different updated_on.",
                 tool_name="index_documents"
             )
             self.vectorstore.delete(ids=list(docs_to_remove))
 
-    def _get_indexed_data(self, collection_suffix: str):
+    def _get_indexed_data(self, index_name: str):
         raise NotImplementedError("Subclasses must implement this method")
 
     def key_fn(self, document: Document):
@@ -275,35 +384,57 @@ class BaseIndexerToolkit(VectorStoreWrapperBase):
     def remove_ids_fn(self, idx_data, key: str):
         raise NotImplementedError("Subclasses must implement this method")
 
-    def remove_index(self, collection_suffix: str = ""):
+    def remove_index(self, index_name: str = ""):
         """Cleans the indexed data in the collection."""
-        super()._clean_collection(collection_suffix=collection_suffix)
-        return (f"Collection '{collection_suffix}' has been removed from the vector store.\n"
-                f"Available collections: {self.list_collections()}") if collection_suffix \
+        super()._clean_collection(index_name=index_name, including_index_meta=True)
+        return (f"Collection '{index_name}' has been removed from the vector store.\n"
+                f"Available collections: {self.list_collections()}") if index_name \
             else "All collections have been removed from the vector store."
 
-    def _build_collection_filter(self, filter: dict | str, collection_suffix: str = "") -> dict:
+    def _build_collection_filter(self, filter: dict | str, index_name: str = "") -> dict:
         """Builds a filter for the collection based on the provided suffix."""
 
         filter = filter if isinstance(filter, dict) else json.loads(filter)
-        if collection_suffix:
+        if index_name:
             filter.update({"collection": {
-                "$eq": collection_suffix.strip()
+                "$eq": index_name.strip()
             }})
+
+        if filter:
+            # Exclude index meta documents from search results
+            filter = {
+                "$and": [
+                    filter,
+                    {"$or": [
+                        {"type": {"$exists": False}},
+                        {"type": {"$ne": IndexerKeywords.INDEX_META_TYPE.value}}
+                    ]},
+                ]
+            }
+        else:
+            filter = {"$or": [
+                {"type": {"$exists": False}},
+                {"type": {"$ne": IndexerKeywords.INDEX_META_TYPE.value}}
+            ]}
         return filter
 
     def search_index(self,
                      query: str,
-                     collection_suffix: str = "",
-                     filter: dict | str = {}, cut_off: float = 0.5,
+                     index_name: str = "",
+                     filter: dict | str = {}, cut_off: float = DEFAULT_CUT_OFF,
                      search_top: int = 10, reranker: dict = {},
                      full_text_search: Optional[Dict[str, Any]] = None,
                      reranking_config: Optional[Dict[str, Dict[str, Any]]] = None,
                      extended_search: Optional[List[str]] = None,
                      **kwargs):
         """ Searches indexed documents in the vector store."""
-        # build filter on top of collection_suffix
-        filter = self._build_collection_filter(filter, collection_suffix)
+        # build filter on top of index_name
+
+        available_collections = super().list_collections()
+        if index_name and index_name not in available_collections:
+            return f"Collection '{index_name}' not found. Available collections: {available_collections}"
+
+        filter = self._build_collection_filter(filter, index_name)
         found_docs = super().search_documents(
             query,
             doctype=self.doctype,
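The filter composition above always wraps the caller's filter so that `index_meta` bookkeeping documents are excluded from search results. A sketch of the resulting shape; `META` is a placeholder for `IndexerKeywords.INDEX_META_TYPE.value`, whose concrete string is not shown in this diff:

    # Hypothetical call: _build_collection_filter({"status": "active"}, "docs")
    META = "..."  # placeholder for IndexerKeywords.INDEX_META_TYPE.value
    expected = {
        "$and": [
            {"status": "active", "collection": {"$eq": "docs"}},
            {"$or": [
                {"type": {"$exists": False}},  # documents without a type field
                {"type": {"$ne": META}},       # or any type other than index meta
            ]},
        ]
    }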
@@ -320,15 +451,15 @@ class BaseIndexerToolkit(VectorStoreWrapperBase):
     def stepback_search_index(self,
                               query: str,
                               messages: List[Dict[str, Any]] = [],
-                              collection_suffix: str = "",
-                              filter: dict | str = {}, cut_off: float = 0.5,
+                              index_name: str = "",
+                              filter: dict | str = {}, cut_off: float = DEFAULT_CUT_OFF,
                               search_top: int = 10, reranker: dict = {},
                               full_text_search: Optional[Dict[str, Any]] = None,
                               reranking_config: Optional[Dict[str, Dict[str, Any]]] = None,
                               extended_search: Optional[List[str]] = None,
                               **kwargs):
         """ Searches indexed documents in the vector store."""
-        filter = self._build_collection_filter(filter, collection_suffix)
+        filter = self._build_collection_filter(filter, index_name)
         found_docs = super().stepback_search(
             query,
             messages,
@@ -345,8 +476,8 @@ class BaseIndexerToolkit(VectorStoreWrapperBase):
     def stepback_summary_index(self,
                                query: str,
                                messages: List[Dict[str, Any]] = [],
-                               collection_suffix: str = "",
-                               filter: dict | str = {}, cut_off: float = 0.5,
+                               index_name: str = "",
+                               filter: dict | str = {}, cut_off: float = DEFAULT_CUT_OFF,
                                search_top: int = 10, reranker: dict = {},
                                full_text_search: Optional[Dict[str, Any]] = None,
                                reranking_config: Optional[Dict[str, Dict[str, Any]]] = None,
@@ -354,7 +485,7 @@ class BaseIndexerToolkit(VectorStoreWrapperBase):
                                **kwargs):
         """ Generates a summary of indexed documents using stepback technique."""
 
-        filter = self._build_collection_filter(filter, collection_suffix)
+        filter = self._build_collection_filter(filter, index_name)
         return super().stepback_summary(
             query,
             messages,
@@ -366,6 +497,149 @@ class BaseIndexerToolkit(VectorStoreWrapperBase):
             reranking_config=reranking_config,
             extended_search=extended_search
         )
+
+    def index_meta_init(self, index_name: str, index_configuration: dict[str, Any]):
+        self._ensure_vectorstore_initialized()
+        index_meta = super().get_index_meta(index_name)
+        if not index_meta:
+            self._log_tool_event(
+                f"There is no existing index_meta for collection '{index_name}'. Initializing it.",
+                tool_name="index_data"
+            )
+            from ..runtime.langchain.interfaces.llm_processor import add_documents
+            created_on = time.time()
+            metadata = {
+                "collection": index_name,
+                "type": IndexerKeywords.INDEX_META_TYPE.value,
+                "indexed": 0,
+                "updated": 0,
+                "state": IndexerKeywords.INDEX_META_IN_PROGRESS.value,
+                "index_configuration": index_configuration,
+                "created_on": created_on,
+                "updated_on": created_on,
+                "task_id": None,
+                "conversation_id": None,
+                "toolkit_id": self.toolkit_id,
+            }
+            metadata["history"] = json.dumps([metadata])
+            index_meta_doc = Document(page_content=f"{IndexerKeywords.INDEX_META_TYPE.value}_{index_name}", metadata=metadata)
+            add_documents(vectorstore=self.vectorstore, documents=[index_meta_doc])
+
+    def index_meta_update(self, index_name: str, state: str, result: int, update_force: bool = True, interval: Optional[float] = None):
+        """Update `index_meta` document with optional time-based throttling.
+
+        Args:
+            index_name: Index name to update meta for.
+            state: New state value for the `index_meta` record.
+            result: Number of processed documents to store in the `updated` field.
+            update_force: If `True`, perform the update unconditionally, ignoring throttling.
+                If `False`, perform the update only when the effective time interval has passed.
+            interval: Optional custom interval (in seconds) for this call when `update_force` is `False`.
+                If `None`, falls back to the value stored in `self._index_meta_config["update_interval"]`
+                if present, otherwise uses `INDEX_META_UPDATE_INTERVAL`.
+        """
+        self._ensure_vectorstore_initialized()
+        if not hasattr(self, "_index_meta_last_update_time"):
+            self._index_meta_last_update_time: Dict[str, float] = {}
+
+        if not update_force:
+            # Resolve effective interval:
+            # 1) explicit arg
+            # 2) value from `_index_meta_config`
+            # 3) default constant
+            cfg_interval = None
+            if hasattr(self, "_index_meta_config"):
+                cfg_interval = self._index_meta_config.get("update_interval")
+
+            eff_interval = (
+                interval
+                if interval is not None
+                else (cfg_interval if cfg_interval is not None else INDEX_META_UPDATE_INTERVAL)
+            )
+
+            last_time = self._index_meta_last_update_time.get(index_name)
+            now = time.time()
+            if last_time is not None and (now - last_time) < eff_interval:
+                return
+            self._index_meta_last_update_time[index_name] = now
+        else:
+            # For forced updates, always refresh last update time
+            self._index_meta_last_update_time[index_name] = time.time()
+
+        index_meta_raw = super().get_index_meta(index_name)
+        from ..runtime.langchain.interfaces.llm_processor import add_documents
+        #
+        if index_meta_raw:
+            metadata = copy.deepcopy(index_meta_raw.get("metadata", {}))
+            metadata["indexed"] = self.get_indexed_count(index_name)
+            metadata["updated"] = result
+            metadata["state"] = state
+            metadata["updated_on"] = time.time()
+            #
+            history_raw = metadata.pop("history", "[]")
+            try:
+                history = json.loads(history_raw) if history_raw.strip() else []
+                # replace the last history item with updated metadata
+                if history and isinstance(history, list):
+                    history[-1] = metadata
+                else:
+                    history = [metadata]
+            except (json.JSONDecodeError, TypeError):
+                logger.warning(f"Failed to load index history: {history_raw}. Create new with only current item.")
+                history = [metadata]
+            #
+            metadata["history"] = json.dumps(history)
+            index_meta_doc = Document(page_content=index_meta_raw.get("content", ""), metadata=metadata)
+            add_documents(vectorstore=self.vectorstore, documents=[index_meta_doc], ids=[index_meta_raw.get("id")])
+
+    def _emit_index_event(self, index_name: str, error: Optional[str] = None):
+        """
+        Emit custom event for index data operation.
+
+        Args:
+            index_name: The name of the index
+            error: Error message if the operation failed, None otherwise
+        """
+        index_meta = super().get_index_meta(index_name)
+
+        if not index_meta:
+            logger.warning(
+                f"No index_meta found for index '{index_name}'. "
+                "Cannot emit index event."
+            )
+            return
+
+        metadata = index_meta.get("metadata", {})
+
+        # Determine if this is a reindex operation
+        history_raw = metadata.get("history", "[]")
+        try:
+            history = json.loads(history_raw) if history_raw.strip() else []
+            is_reindex = len(history) > 1
+        except (json.JSONDecodeError, TypeError):
+            is_reindex = False
+
+        # Build event message
+        event_data = {
+            "id": index_meta.get("id"),
+            "index_name": index_name,
+            "state": "failed" if error is not None else metadata.get("state"),
+            "error": error,
+            "reindex": is_reindex,
+            "indexed": metadata.get("indexed", 0),
+            "updated": metadata.get("updated", 0),
+            "toolkit_id": metadata.get("toolkit_id"),
+        }
+
+        # Emit the event
+        try:
+            dispatch_custom_event("index_data_status", event_data)
+            logger.debug(
+                f"Emitted index_data_status event for index "
+                f"'{index_name}': {event_data}"
+            )
+        except Exception as e:
+            logger.warning(f"Failed to emit index_data_status event: {e}")
 
     def get_available_tools(self):
         """
@@ -377,8 +651,8 @@ class BaseIndexerToolkit(VectorStoreWrapperBase):
         """
         return [
             {
-                "name": "index_data",
-                "mode": "index_data",
+                "name": IndexTools.INDEX_DATA.value,
+                "mode": IndexTools.INDEX_DATA.value,
                 "ref": self.index_data,
                 "description": "Loads data to index.",
                 "args_schema": create_model(
@@ -388,38 +662,39 @@ class BaseIndexerToolkit(VectorStoreWrapperBase):
                 )
             },
             {
-                "name": "search_index",
-                "mode": "search_index",
+                "name": IndexTools.SEARCH_INDEX.value,
+                "mode": IndexTools.SEARCH_INDEX.value,
                 "ref": self.search_index,
                 "description": self.search_index.__doc__,
                 "args_schema": BaseSearchParams
             },
             {
-                "name": "stepback_search_index",
-                "mode": "stepback_search_index",
+                "name": IndexTools.STEPBACK_SEARCH_INDEX.value,
+                "mode": IndexTools.STEPBACK_SEARCH_INDEX.value,
                 "ref": self.stepback_search_index,
                 "description": self.stepback_search_index.__doc__,
                 "args_schema": BaseStepbackSearchParams
             },
             {
-                "name": "stepback_summary_index",
-                "mode": "stepback_summary_index",
+                "name": IndexTools.STEPBACK_SUMMARY_INDEX.value,
+                "mode": IndexTools.STEPBACK_SUMMARY_INDEX.value,
                 "ref": self.stepback_summary_index,
                 "description": self.stepback_summary_index.__doc__,
                 "args_schema": BaseStepbackSearchParams
             },
             {
-                "name": "remove_index",
-                "mode": "remove_index",
+                "name": IndexTools.REMOVE_INDEX.value,
+                "mode": IndexTools.REMOVE_INDEX.value,
                 "ref": self.remove_index,
                 "description": self.remove_index.__doc__,
                 "args_schema": RemoveIndexParams
             },
             {
-                "name": "list_collections",
-                "mode": "list_collections",
+                "name": IndexTools.LIST_COLLECTIONS.value,
+                "mode": IndexTools.LIST_COLLECTIONS.value,
                 "ref": self.list_collections,
                 "description": self.list_collections.__doc__,
-                "args_schema": create_model("ListCollectionsParams") # No parameters
+                # No parameters
+                "args_schema": create_model("ListCollectionsParams")
             },
-        ]
+        ]
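With the tool names centralized in the `IndexTools` enum, a caller can resolve a tool entry without hard-coded strings. A minimal usage sketch, assuming `tk` is an already-constructed `BaseIndexerToolkit` subclass instance (hypothetical, not shown in this diff):

    tools = {t["name"]: t for t in tk.get_available_tools()}
    search = tools[IndexTools.SEARCH_INDEX.value]  # same string as "search_index"
    result = search["ref"](query="release notes", index_name="docs")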