alita-sdk 0.3.263__py3-none-any.whl → 0.3.499__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (248)
  1. alita_sdk/cli/__init__.py +10 -0
  2. alita_sdk/cli/__main__.py +17 -0
  3. alita_sdk/cli/agent/__init__.py +5 -0
  4. alita_sdk/cli/agent/default.py +258 -0
  5. alita_sdk/cli/agent_executor.py +155 -0
  6. alita_sdk/cli/agent_loader.py +215 -0
  7. alita_sdk/cli/agent_ui.py +228 -0
  8. alita_sdk/cli/agents.py +3601 -0
  9. alita_sdk/cli/callbacks.py +647 -0
  10. alita_sdk/cli/cli.py +168 -0
  11. alita_sdk/cli/config.py +306 -0
  12. alita_sdk/cli/context/__init__.py +30 -0
  13. alita_sdk/cli/context/cleanup.py +198 -0
  14. alita_sdk/cli/context/manager.py +731 -0
  15. alita_sdk/cli/context/message.py +285 -0
  16. alita_sdk/cli/context/strategies.py +289 -0
  17. alita_sdk/cli/context/token_estimation.py +127 -0
  18. alita_sdk/cli/formatting.py +182 -0
  19. alita_sdk/cli/input_handler.py +419 -0
  20. alita_sdk/cli/inventory.py +1256 -0
  21. alita_sdk/cli/mcp_loader.py +315 -0
  22. alita_sdk/cli/toolkit.py +327 -0
  23. alita_sdk/cli/toolkit_loader.py +85 -0
  24. alita_sdk/cli/tools/__init__.py +43 -0
  25. alita_sdk/cli/tools/approval.py +224 -0
  26. alita_sdk/cli/tools/filesystem.py +1751 -0
  27. alita_sdk/cli/tools/planning.py +389 -0
  28. alita_sdk/cli/tools/terminal.py +414 -0
  29. alita_sdk/community/__init__.py +64 -8
  30. alita_sdk/community/inventory/__init__.py +224 -0
  31. alita_sdk/community/inventory/config.py +257 -0
  32. alita_sdk/community/inventory/enrichment.py +2137 -0
  33. alita_sdk/community/inventory/extractors.py +1469 -0
  34. alita_sdk/community/inventory/ingestion.py +3172 -0
  35. alita_sdk/community/inventory/knowledge_graph.py +1457 -0
  36. alita_sdk/community/inventory/parsers/__init__.py +218 -0
  37. alita_sdk/community/inventory/parsers/base.py +295 -0
  38. alita_sdk/community/inventory/parsers/csharp_parser.py +907 -0
  39. alita_sdk/community/inventory/parsers/go_parser.py +851 -0
  40. alita_sdk/community/inventory/parsers/html_parser.py +389 -0
  41. alita_sdk/community/inventory/parsers/java_parser.py +593 -0
  42. alita_sdk/community/inventory/parsers/javascript_parser.py +629 -0
  43. alita_sdk/community/inventory/parsers/kotlin_parser.py +768 -0
  44. alita_sdk/community/inventory/parsers/markdown_parser.py +362 -0
  45. alita_sdk/community/inventory/parsers/python_parser.py +604 -0
  46. alita_sdk/community/inventory/parsers/rust_parser.py +858 -0
  47. alita_sdk/community/inventory/parsers/swift_parser.py +832 -0
  48. alita_sdk/community/inventory/parsers/text_parser.py +322 -0
  49. alita_sdk/community/inventory/parsers/yaml_parser.py +370 -0
  50. alita_sdk/community/inventory/patterns/__init__.py +61 -0
  51. alita_sdk/community/inventory/patterns/ast_adapter.py +380 -0
  52. alita_sdk/community/inventory/patterns/loader.py +348 -0
  53. alita_sdk/community/inventory/patterns/registry.py +198 -0
  54. alita_sdk/community/inventory/presets.py +535 -0
  55. alita_sdk/community/inventory/retrieval.py +1403 -0
  56. alita_sdk/community/inventory/toolkit.py +173 -0
  57. alita_sdk/community/inventory/visualize.py +1370 -0
  58. alita_sdk/configurations/__init__.py +10 -0
  59. alita_sdk/configurations/ado.py +4 -2
  60. alita_sdk/configurations/azure_search.py +1 -1
  61. alita_sdk/configurations/bigquery.py +1 -1
  62. alita_sdk/configurations/bitbucket.py +94 -2
  63. alita_sdk/configurations/browser.py +18 -0
  64. alita_sdk/configurations/carrier.py +19 -0
  65. alita_sdk/configurations/confluence.py +96 -1
  66. alita_sdk/configurations/delta_lake.py +1 -1
  67. alita_sdk/configurations/figma.py +0 -5
  68. alita_sdk/configurations/github.py +65 -1
  69. alita_sdk/configurations/gitlab.py +79 -0
  70. alita_sdk/configurations/google_places.py +17 -0
  71. alita_sdk/configurations/jira.py +103 -0
  72. alita_sdk/configurations/postman.py +1 -1
  73. alita_sdk/configurations/qtest.py +1 -3
  74. alita_sdk/configurations/report_portal.py +19 -0
  75. alita_sdk/configurations/salesforce.py +19 -0
  76. alita_sdk/configurations/service_now.py +1 -12
  77. alita_sdk/configurations/sharepoint.py +19 -0
  78. alita_sdk/configurations/sonar.py +18 -0
  79. alita_sdk/configurations/sql.py +20 -0
  80. alita_sdk/configurations/testio.py +18 -0
  81. alita_sdk/configurations/testrail.py +88 -0
  82. alita_sdk/configurations/xray.py +94 -1
  83. alita_sdk/configurations/zephyr_enterprise.py +94 -1
  84. alita_sdk/configurations/zephyr_essential.py +95 -0
  85. alita_sdk/runtime/clients/artifact.py +12 -2
  86. alita_sdk/runtime/clients/client.py +235 -66
  87. alita_sdk/runtime/clients/mcp_discovery.py +342 -0
  88. alita_sdk/runtime/clients/mcp_manager.py +262 -0
  89. alita_sdk/runtime/clients/sandbox_client.py +373 -0
  90. alita_sdk/runtime/langchain/assistant.py +123 -17
  91. alita_sdk/runtime/langchain/constants.py +8 -1
  92. alita_sdk/runtime/langchain/document_loaders/AlitaDocxMammothLoader.py +315 -3
  93. alita_sdk/runtime/langchain/document_loaders/AlitaExcelLoader.py +209 -31
  94. alita_sdk/runtime/langchain/document_loaders/AlitaImageLoader.py +1 -1
  95. alita_sdk/runtime/langchain/document_loaders/AlitaJSONLoader.py +8 -2
  96. alita_sdk/runtime/langchain/document_loaders/AlitaMarkdownLoader.py +66 -0
  97. alita_sdk/runtime/langchain/document_loaders/AlitaPDFLoader.py +79 -10
  98. alita_sdk/runtime/langchain/document_loaders/AlitaPowerPointLoader.py +52 -15
  99. alita_sdk/runtime/langchain/document_loaders/AlitaPythonLoader.py +9 -0
  100. alita_sdk/runtime/langchain/document_loaders/AlitaTableLoader.py +1 -4
  101. alita_sdk/runtime/langchain/document_loaders/AlitaTextLoader.py +15 -2
  102. alita_sdk/runtime/langchain/document_loaders/ImageParser.py +30 -0
  103. alita_sdk/runtime/langchain/document_loaders/constants.py +187 -40
  104. alita_sdk/runtime/langchain/interfaces/llm_processor.py +4 -2
  105. alita_sdk/runtime/langchain/langraph_agent.py +406 -91
  106. alita_sdk/runtime/langchain/utils.py +51 -8
  107. alita_sdk/runtime/llms/preloaded.py +2 -6
  108. alita_sdk/runtime/models/mcp_models.py +61 -0
  109. alita_sdk/runtime/toolkits/__init__.py +26 -0
  110. alita_sdk/runtime/toolkits/application.py +9 -2
  111. alita_sdk/runtime/toolkits/artifact.py +19 -7
  112. alita_sdk/runtime/toolkits/datasource.py +13 -6
  113. alita_sdk/runtime/toolkits/mcp.py +780 -0
  114. alita_sdk/runtime/toolkits/planning.py +178 -0
  115. alita_sdk/runtime/toolkits/subgraph.py +11 -6
  116. alita_sdk/runtime/toolkits/tools.py +214 -60
  117. alita_sdk/runtime/toolkits/vectorstore.py +9 -4
  118. alita_sdk/runtime/tools/__init__.py +22 -0
  119. alita_sdk/runtime/tools/application.py +16 -4
  120. alita_sdk/runtime/tools/artifact.py +312 -19
  121. alita_sdk/runtime/tools/function.py +100 -4
  122. alita_sdk/runtime/tools/graph.py +81 -0
  123. alita_sdk/runtime/tools/image_generation.py +212 -0
  124. alita_sdk/runtime/tools/llm.py +539 -180
  125. alita_sdk/runtime/tools/mcp_inspect_tool.py +284 -0
  126. alita_sdk/runtime/tools/mcp_remote_tool.py +181 -0
  127. alita_sdk/runtime/tools/mcp_server_tool.py +3 -1
  128. alita_sdk/runtime/tools/planning/__init__.py +36 -0
  129. alita_sdk/runtime/tools/planning/models.py +246 -0
  130. alita_sdk/runtime/tools/planning/wrapper.py +607 -0
  131. alita_sdk/runtime/tools/router.py +2 -1
  132. alita_sdk/runtime/tools/sandbox.py +375 -0
  133. alita_sdk/runtime/tools/vectorstore.py +62 -63
  134. alita_sdk/runtime/tools/vectorstore_base.py +156 -85
  135. alita_sdk/runtime/utils/AlitaCallback.py +106 -20
  136. alita_sdk/runtime/utils/mcp_client.py +465 -0
  137. alita_sdk/runtime/utils/mcp_oauth.py +244 -0
  138. alita_sdk/runtime/utils/mcp_sse_client.py +405 -0
  139. alita_sdk/runtime/utils/mcp_tools_discovery.py +124 -0
  140. alita_sdk/runtime/utils/streamlit.py +41 -14
  141. alita_sdk/runtime/utils/toolkit_utils.py +28 -9
  142. alita_sdk/runtime/utils/utils.py +14 -0
  143. alita_sdk/tools/__init__.py +78 -35
  144. alita_sdk/tools/ado/__init__.py +0 -1
  145. alita_sdk/tools/ado/repos/__init__.py +10 -6
  146. alita_sdk/tools/ado/repos/repos_wrapper.py +12 -11
  147. alita_sdk/tools/ado/test_plan/__init__.py +10 -7
  148. alita_sdk/tools/ado/test_plan/test_plan_wrapper.py +56 -23
  149. alita_sdk/tools/ado/wiki/__init__.py +10 -11
  150. alita_sdk/tools/ado/wiki/ado_wrapper.py +114 -28
  151. alita_sdk/tools/ado/work_item/__init__.py +10 -11
  152. alita_sdk/tools/ado/work_item/ado_wrapper.py +63 -10
  153. alita_sdk/tools/advanced_jira_mining/__init__.py +10 -7
  154. alita_sdk/tools/aws/delta_lake/__init__.py +13 -11
  155. alita_sdk/tools/azure_ai/search/__init__.py +11 -7
  156. alita_sdk/tools/base_indexer_toolkit.py +392 -86
  157. alita_sdk/tools/bitbucket/__init__.py +18 -11
  158. alita_sdk/tools/bitbucket/api_wrapper.py +52 -9
  159. alita_sdk/tools/bitbucket/cloud_api_wrapper.py +5 -5
  160. alita_sdk/tools/browser/__init__.py +40 -16
  161. alita_sdk/tools/browser/crawler.py +3 -1
  162. alita_sdk/tools/browser/utils.py +15 -6
  163. alita_sdk/tools/carrier/__init__.py +17 -17
  164. alita_sdk/tools/carrier/backend_reports_tool.py +8 -4
  165. alita_sdk/tools/carrier/excel_reporter.py +8 -4
  166. alita_sdk/tools/chunkers/__init__.py +3 -1
  167. alita_sdk/tools/chunkers/code/codeparser.py +1 -1
  168. alita_sdk/tools/chunkers/sematic/json_chunker.py +1 -0
  169. alita_sdk/tools/chunkers/sematic/markdown_chunker.py +97 -6
  170. alita_sdk/tools/chunkers/sematic/proposal_chunker.py +1 -1
  171. alita_sdk/tools/chunkers/universal_chunker.py +270 -0
  172. alita_sdk/tools/cloud/aws/__init__.py +9 -6
  173. alita_sdk/tools/cloud/azure/__init__.py +9 -6
  174. alita_sdk/tools/cloud/gcp/__init__.py +9 -6
  175. alita_sdk/tools/cloud/k8s/__init__.py +9 -6
  176. alita_sdk/tools/code/linter/__init__.py +7 -7
  177. alita_sdk/tools/code/loaders/codesearcher.py +3 -2
  178. alita_sdk/tools/code/sonar/__init__.py +18 -12
  179. alita_sdk/tools/code_indexer_toolkit.py +199 -0
  180. alita_sdk/tools/confluence/__init__.py +14 -11
  181. alita_sdk/tools/confluence/api_wrapper.py +198 -58
  182. alita_sdk/tools/confluence/loader.py +10 -0
  183. alita_sdk/tools/custom_open_api/__init__.py +9 -4
  184. alita_sdk/tools/elastic/__init__.py +8 -7
  185. alita_sdk/tools/elitea_base.py +543 -64
  186. alita_sdk/tools/figma/__init__.py +10 -8
  187. alita_sdk/tools/figma/api_wrapper.py +352 -153
  188. alita_sdk/tools/github/__init__.py +13 -11
  189. alita_sdk/tools/github/api_wrapper.py +9 -26
  190. alita_sdk/tools/github/github_client.py +75 -12
  191. alita_sdk/tools/github/schemas.py +2 -1
  192. alita_sdk/tools/gitlab/__init__.py +11 -10
  193. alita_sdk/tools/gitlab/api_wrapper.py +135 -45
  194. alita_sdk/tools/gitlab_org/__init__.py +11 -9
  195. alita_sdk/tools/google/bigquery/__init__.py +12 -13
  196. alita_sdk/tools/google_places/__init__.py +18 -10
  197. alita_sdk/tools/jira/__init__.py +14 -8
  198. alita_sdk/tools/jira/api_wrapper.py +315 -168
  199. alita_sdk/tools/keycloak/__init__.py +8 -7
  200. alita_sdk/tools/localgit/local_git.py +56 -54
  201. alita_sdk/tools/memory/__init__.py +27 -11
  202. alita_sdk/tools/non_code_indexer_toolkit.py +7 -2
  203. alita_sdk/tools/ocr/__init__.py +8 -7
  204. alita_sdk/tools/openapi/__init__.py +10 -1
  205. alita_sdk/tools/pandas/__init__.py +8 -7
  206. alita_sdk/tools/pandas/api_wrapper.py +7 -25
  207. alita_sdk/tools/postman/__init__.py +8 -10
  208. alita_sdk/tools/postman/api_wrapper.py +19 -8
  209. alita_sdk/tools/postman/postman_analysis.py +8 -1
  210. alita_sdk/tools/pptx/__init__.py +8 -9
  211. alita_sdk/tools/qtest/__init__.py +19 -13
  212. alita_sdk/tools/qtest/api_wrapper.py +1784 -88
  213. alita_sdk/tools/rally/__init__.py +10 -9
  214. alita_sdk/tools/report_portal/__init__.py +20 -15
  215. alita_sdk/tools/salesforce/__init__.py +19 -15
  216. alita_sdk/tools/servicenow/__init__.py +14 -11
  217. alita_sdk/tools/sharepoint/__init__.py +14 -13
  218. alita_sdk/tools/sharepoint/api_wrapper.py +179 -39
  219. alita_sdk/tools/sharepoint/authorization_helper.py +191 -1
  220. alita_sdk/tools/sharepoint/utils.py +8 -2
  221. alita_sdk/tools/slack/__init__.py +10 -7
  222. alita_sdk/tools/sql/__init__.py +19 -18
  223. alita_sdk/tools/sql/api_wrapper.py +71 -23
  224. alita_sdk/tools/testio/__init__.py +18 -12
  225. alita_sdk/tools/testrail/__init__.py +10 -10
  226. alita_sdk/tools/testrail/api_wrapper.py +213 -45
  227. alita_sdk/tools/utils/__init__.py +28 -4
  228. alita_sdk/tools/utils/content_parser.py +181 -61
  229. alita_sdk/tools/utils/text_operations.py +254 -0
  230. alita_sdk/tools/vector_adapters/VectorStoreAdapter.py +83 -27
  231. alita_sdk/tools/xray/__init__.py +12 -7
  232. alita_sdk/tools/xray/api_wrapper.py +58 -113
  233. alita_sdk/tools/zephyr/__init__.py +9 -6
  234. alita_sdk/tools/zephyr_enterprise/__init__.py +13 -8
  235. alita_sdk/tools/zephyr_enterprise/api_wrapper.py +17 -7
  236. alita_sdk/tools/zephyr_essential/__init__.py +13 -9
  237. alita_sdk/tools/zephyr_essential/api_wrapper.py +289 -47
  238. alita_sdk/tools/zephyr_essential/client.py +6 -4
  239. alita_sdk/tools/zephyr_scale/__init__.py +10 -7
  240. alita_sdk/tools/zephyr_scale/api_wrapper.py +6 -2
  241. alita_sdk/tools/zephyr_squad/__init__.py +9 -6
  242. {alita_sdk-0.3.263.dist-info → alita_sdk-0.3.499.dist-info}/METADATA +180 -33
  243. alita_sdk-0.3.499.dist-info/RECORD +433 -0
  244. alita_sdk-0.3.499.dist-info/entry_points.txt +2 -0
  245. alita_sdk-0.3.263.dist-info/RECORD +0 -342
  246. {alita_sdk-0.3.263.dist-info → alita_sdk-0.3.499.dist-info}/WHEEL +0 -0
  247. {alita_sdk-0.3.263.dist-info → alita_sdk-0.3.499.dist-info}/licenses/LICENSE +0 -0
  248. {alita_sdk-0.3.263.dist-info → alita_sdk-0.3.499.dist-info}/top_level.txt +0 -0
alita_sdk/tools/base_indexer_toolkit.py
@@ -1,40 +1,57 @@
+import copy
 import json
 import logging
-from typing import Any, Optional, List, Literal, Dict, Generator
+import time
+from enum import Enum
+from typing import Any, Optional, List, Dict, Generator
 
+from langchain_core.callbacks import dispatch_custom_event
 from langchain_core.documents import Document
 from pydantic import create_model, Field, SecretStr
 
-from .utils.content_parser import process_content_by_type
+from .utils.content_parser import file_extension_by_chunker, process_document_by_type
 from .vector_adapters.VectorStoreAdapter import VectorStoreAdapterFactory
+from ..runtime.langchain.document_loaders.constants import loaders_allowed_to_override
 from ..runtime.tools.vectorstore_base import VectorStoreWrapperBase
 from ..runtime.utils.utils import IndexerKeywords
 
 logger = logging.getLogger(__name__)
 
+DEFAULT_CUT_OFF = 0.1
+INDEX_META_UPDATE_INTERVAL = 600.0
+
+class IndexTools(str, Enum):
+    """Enum for index-related tool names."""
+    INDEX_DATA = "index_data"
+    SEARCH_INDEX = "search_index"
+    STEPBACK_SEARCH_INDEX = "stepback_search_index"
+    STEPBACK_SUMMARY_INDEX = "stepback_summary_index"
+    REMOVE_INDEX = "remove_index"
+    LIST_COLLECTIONS = "list_collections"
+
 # Base Vector Store Schema Models
 BaseIndexParams = create_model(
     "BaseIndexParams",
-    collection_suffix=(str, Field(description="Suffix for collection name (max 7 characters) used to separate datasets", min_length=1, max_length=7)),
+    index_name=(str, Field(description="Index name (max 7 characters)", min_length=1, max_length=7)),
 )
 
 RemoveIndexParams = create_model(
     "RemoveIndexParams",
-    collection_suffix=(Optional[str], Field(description="Optional suffix for collection name (max 7 characters)", default="", max_length=7)),
+    index_name=(Optional[str], Field(description="Optional index name (max 7 characters)", default="", max_length=7)),
 )
 
 BaseSearchParams = create_model(
     "BaseSearchParams",
     query=(str, Field(description="Query text to search in the index")),
-    collection_suffix=(Optional[str], Field(
-        description="Optional suffix for collection name (max 7 characters). Leave empty to search across all datasets",
+    index_name=(Optional[str], Field(
+        description="Optional index name (max 7 characters). Leave empty to search across all datasets",
         default="", max_length=7)),
     filter=(Optional[dict | str], Field(
         description="Filter to apply to the search results. Can be a dictionary or a JSON string.",
         default={},
         examples=["{\"key\": \"value\"}", "{\"status\": \"active\"}"]
     )),
-    cut_off=(Optional[float], Field(description="Cut-off score for search results", default=0.5)),
+    cut_off=(Optional[float], Field(description="Cut-off score for search results", default=DEFAULT_CUT_OFF, ge=0, le=1)),
     search_top=(Optional[int], Field(description="Number of top results to return", default=10)),
     full_text_search=(Optional[Dict[str, Any]], Field(
         description="Full text search parameters. Can be a dictionary with search options.",
@@ -57,41 +74,41 @@ BaseSearchParams = create_model(
 BaseStepbackSearchParams = create_model(
     "BaseStepbackSearchParams",
     query=(str, Field(description="Query text to search in the index")),
-    collection_suffix=(Optional[str], Field(description="Optional suffix for collection name (max 7 characters)", default="", max_length=7)),
+    index_name=(Optional[str], Field(description="Optional index name (max 7 characters)", default="", max_length=7)),
     messages=(Optional[List], Field(description="Chat messages for stepback search context", default=[])),
     filter=(Optional[dict | str], Field(
         description="Filter to apply to the search results. Can be a dictionary or a JSON string.",
         default={},
         examples=["{\"key\": \"value\"}", "{\"status\": \"active\"}"]
     )),
-    cut_off=(Optional[float], Field(description="Cut-off score for search results", default=0.5)),
+    cut_off=(Optional[float], Field(description="Cut-off score for search results", default=DEFAULT_CUT_OFF, ge=0, le=1)),
     search_top=(Optional[int], Field(description="Number of top results to return", default=10)),
-    reranker=(Optional[dict], Field(
-        description="Reranker configuration. Can be a dictionary with reranking parameters.",
-        default={}
-    )),
     full_text_search=(Optional[Dict[str, Any]], Field(
         description="Full text search parameters. Can be a dictionary with search options.",
         default=None
     )),
-    reranking_config=(Optional[Dict[str, Dict[str, Any]]], Field(
-        description="Reranking configuration. Can be a dictionary with reranking settings.",
-        default=None
-    )),
     extended_search=(Optional[List[str]], Field(
         description="List of additional fields to include in the search results.",
         default=None
     )),
+    reranker=(Optional[dict], Field(
+        description="Reranker configuration. Can be a dictionary with reranking parameters.",
+        default={}
+    )),
+    reranking_config=(Optional[Dict[str, Dict[str, Any]]], Field(
+        description="Reranking configuration. Can be a dictionary with reranking settings.",
+        default=None
+    )),
 )
 
 BaseIndexDataParams = create_model(
     "indexData",
     __base__=BaseIndexParams,
-    progress_step=(Optional[int], Field(default=10, ge=0, le=100,
-                                        description="Optional step size for progress reporting during indexing")),
     clean_index=(Optional[bool], Field(default=False,
                                        description="Optional flag to enforce clean existing index before indexing new data")),
-    chunking_config=(Optional[dict], Field(description="Chunking tool configuration", default_factory=dict)),
+    progress_step=(Optional[int], Field(default=10, ge=0, le=100,
+                                        description="Optional step size for progress reporting during indexing")),
+    chunking_config=(Optional[dict], Field(description="Chunking tool configuration", default=loaders_allowed_to_override)),
 )
 
 
@@ -100,26 +117,21 @@ class BaseIndexerToolkit(VectorStoreWrapperBase):
 
     doctype: str = "document"
 
-    llm: Any = None
     connection_string: Optional[SecretStr] = None
     collection_name: Optional[str] = None
-    embedding_model: Optional[str] = "HuggingFaceEmbeddings"
-    vectorstore_type: Optional[str] = "PGVector"
-    _embedding: Optional[Any] = None
     alita: Any = None # Elitea client, if available
 
     def __init__(self, **kwargs):
         conn = kwargs.get('connection_string', None)
         connection_string = conn.get_secret_value() if isinstance(conn, SecretStr) else conn
-        collection_name = kwargs.get('collection_name')
+        collection_name = kwargs.get('collection_schema')
 
-        if 'embedding_model' not in kwargs:
-            kwargs['embedding_model'] = 'HuggingFaceEmbeddings'
         if 'vectorstore_type' not in kwargs:
             kwargs['vectorstore_type'] = 'PGVector'
         vectorstore_type = kwargs.get('vectorstore_type')
-        kwargs['vectorstore_params'] = VectorStoreAdapterFactory.create_adapter(vectorstore_type).get_vectorstore_params(collection_name, connection_string)
-        kwargs['_embedding'] = kwargs.get('alita').get_embeddings(kwargs.get('embedding_model'))
+        if connection_string:
+            # Initialize vectorstore params only if connection string is provided
+            kwargs['vectorstore_params'] = VectorStoreAdapterFactory.create_adapter(vectorstore_type).get_vectorstore_params(collection_name, connection_string)
         super().__init__(**kwargs)
 
     def _index_tool_params(self, **kwargs) -> dict[str, tuple[type, Field]]:
@@ -129,6 +141,11 @@ class BaseIndexerToolkit(VectorStoreWrapperBase):
         """
         return {}
 
+    def _remove_metadata_keys(self) -> List[str]:
+        """ Returns a list of metadata keys to be removed from documents before indexing.
+        Override this method in subclasses to provide specific keys to remove."""
+        return [IndexerKeywords.CONTENT_IN_BYTES.value, IndexerKeywords.CONTENT_FILE_NAME.value]
+
     def _base_loader(self, **kwargs) -> Generator[Document, None, None]:
         """ Loads documents from a source, processes them,
         and returns a list of Document objects with base metadata: id and created_on."""
@@ -147,45 +164,156 @@ class BaseIndexerToolkit(VectorStoreWrapperBase):
         yield from ()
 
     def index_data(self, **kwargs):
-        collection_suffix = kwargs.get("collection_suffix")
-        progress_step = kwargs.get("progress_step")
+        index_name = kwargs.get("index_name")
         clean_index = kwargs.get("clean_index")
         chunking_tool = kwargs.get("chunking_tool")
         chunking_config = kwargs.get("chunking_config")
+
+        # Store the interval in a private dict to avoid Pydantic field errors
+        if not hasattr(self, "_index_meta_config"):
+            self._index_meta_config: Dict[str, Any] = {}
+
+        self._index_meta_config["update_interval"] = kwargs.get(
+            "meta_update_interval",
+            INDEX_META_UPDATE_INTERVAL,
+        )
+
+        result = {"count": 0}
         #
-        if clean_index:
-            self._clean_index(collection_suffix)
-        #
-        documents = self._base_loader(**kwargs)
-        documents = self._reduce_duplicates(documents, collection_suffix)
-        documents = self._extend_data(documents) # update content of not-reduced base document if needed (for sharepoint and similar)
-        documents = self._collect_dependencies(documents) # collect dependencies for base documents
-        documents = self._apply_loaders_chunkers(documents, chunking_tool, chunking_config)
+        try:
+            if clean_index:
+                self._clean_index(index_name)
+            #
+            self.index_meta_init(index_name, kwargs)
+            self._emit_index_event(index_name)
+            #
+            self._log_tool_event(f"Indexing data into collection with suffix '{index_name}'. It can take some time...")
+            self._log_tool_event(f"Loading the documents to index...{kwargs}")
+            documents = self._base_loader(**kwargs)
+            documents = list(documents) # consume/exhaust generator to count items
+            documents_count = len(documents)
+            documents = (doc for doc in documents)
+            self._log_tool_event(f"Base documents were pre-loaded. "
+                                 f"Search for possible document duplicates and remove them from the indexing list...")
+            documents = self._reduce_duplicates(documents, index_name)
+            self._log_tool_event(f"Duplicates were removed. "
+                                 f"Processing documents to collect dependencies and prepare them for indexing...")
+            self._save_index_generator(documents, documents_count, chunking_tool, chunking_config, index_name=index_name, result=result)
+            #
+            results_count = result["count"]
+            # Final update should always be forced
+            self.index_meta_update(index_name, IndexerKeywords.INDEX_META_COMPLETED.value, results_count, update_force=True)
+            self._emit_index_event(index_name)
+            #
+            return {"status": "ok", "message": f"successfully indexed {results_count} documents" if results_count > 0
+                    else "no new documents to index"}
+        except Exception as e:
+            # Do maximum effort at least send custom event for supposed changed status
+            msg = str(e)
+            try:
+                # Error update should also be forced
+                self.index_meta_update(index_name, IndexerKeywords.INDEX_META_FAILED.value, result["count"], update_force=True)
+            except Exception as ie:
+                logger.error(f"Failed to update index meta status to FAILED for index '{index_name}': {ie}")
+                msg = f"{msg}; additionally failed to update index meta status to FAILED: {ie}"
+            self._emit_index_event(index_name, error=msg)
+            raise e
+
+    def _save_index_generator(self, base_documents: Generator[Document, None, None], base_total: int, chunking_tool, chunking_config, result, index_name: Optional[str] = None):
+        self._ensure_vectorstore_initialized()
+        self._log_tool_event(f"Base documents are ready for indexing. {base_total} base documents in total to index.")
+        from ..runtime.langchain.interfaces.llm_processor import add_documents
         #
-        return self._save_index(list(documents), collection_suffix=collection_suffix, progress_step=progress_step)
-
+        base_doc_counter = 0
+        pg_vector_add_docs_chunk = []
+        for base_doc in base_documents:
+            base_doc_counter += 1
+            self._log_tool_event(f"Processing dependent documents for base documents #{base_doc_counter}.")
+
+            # (base_doc for _ in range(1)) - wrap single base_doc to Generator in order to reuse existing code
+            documents = self._extend_data((base_doc for _ in range(1))) # update content of not-reduced base document if needed (for sharepoint and similar)
+            documents = self._collect_dependencies(documents) # collect dependencies for base documents
+            self._log_tool_event(f"Dependent documents were processed. "
+                                 f"Applying chunking tool '{chunking_tool}' if specified and preparing documents for indexing...")
+            documents = self._apply_loaders_chunkers(documents, chunking_tool, chunking_config)
+            self._clean_metadata(documents)
+
+            logger.debug(f"Indexing base document #{base_doc_counter}: {base_doc} and all dependent documents: {documents}")
+
+            dependent_docs_counter = 0
+            #
+            for doc in documents:
+                if not doc.page_content:
+                    # To avoid case when all documents have empty content
+                    # See llm_processor.add_documents which exclude metadata of docs with empty content
+                    continue
+                #
+                if 'id' not in doc.metadata or 'updated_on' not in doc.metadata:
+                    logger.warning(f"Document is missing required metadata field 'id' or 'updated_on': {doc.metadata}")
+                #
+                # if index_name is provided, add it to metadata of each document
+                if index_name:
+                    if not doc.metadata.get('collection'):
+                        doc.metadata['collection'] = index_name
+                    else:
+                        doc.metadata['collection'] += f";{index_name}"
+                #
+                try:
+                    pg_vector_add_docs_chunk.append(doc)
+                    dependent_docs_counter += 1
+                    if len(pg_vector_add_docs_chunk) >= self.max_docs_per_add:
+                        add_documents(vectorstore=self.vectorstore, documents=pg_vector_add_docs_chunk)
+                        self._log_tool_event(f"{len(pg_vector_add_docs_chunk)} documents have been indexed. Continuing...")
+                        pg_vector_add_docs_chunk = []
+                except Exception:
+                    from traceback import format_exc
+                    logger.error(f"Error: {format_exc()}")
+                    return {"status": "error", "message": f"Error: {format_exc()}"}
+            msg = f"Indexed base document #{base_doc_counter} out of {base_total} (with {dependent_docs_counter} dependencies)."
+            logger.debug(msg)
+            self._log_tool_event(msg)
+            result["count"] += dependent_docs_counter
+            # After each base document, try a non-forced meta update; throttling handled inside index_meta_update
+            try:
+                self.index_meta_update(index_name, IndexerKeywords.INDEX_META_IN_PROGRESS.value, result["count"], update_force=False)
+            except Exception as exc: # best-effort, do not break indexing
+                logger.warning(f"Failed to update index meta during indexing process for index '{index_name}': {exc}")
+        if pg_vector_add_docs_chunk:
+            add_documents(vectorstore=self.vectorstore, documents=pg_vector_add_docs_chunk)
+
     def _apply_loaders_chunkers(self, documents: Generator[Document, None, None], chunking_tool: str=None, chunking_config=None) -> Generator[Document, None, None]:
-        from alita_sdk.tools.chunkers import __all__ as chunkers
+        from ..tools.chunkers import __all__ as chunkers
 
         if chunking_config is None:
             chunking_config = {}
-        chunking_config['embedding'] = self._embedding
+        chunking_config['embedding'] = self.embeddings
         chunking_config['llm'] = self.llm
-
+
         for document in documents:
-            if content_type := document.metadata.get('loader_content_type', None):
+            if content_type := document.metadata.get(IndexerKeywords.CONTENT_FILE_NAME.value, None):
                 # apply parsing based on content type and chunk if chunker was applied to parent doc
-                content = document.metadata.pop('loader_content', None)
-                yield from process_content_by_type(
+                content = document.metadata.pop(IndexerKeywords.CONTENT_IN_BYTES.value, None)
+                yield from process_document_by_type(
                     document=document,
                     content=content,
                     extension_source=content_type, llm=self.llm, chunking_config=chunking_config)
+            elif chunking_tool and (content_in_bytes := document.metadata.pop(IndexerKeywords.CONTENT_IN_BYTES.value, None)) is not None:
+                if not content_in_bytes:
+                    # content is empty, yield as is
+                    yield document
+                    continue
+                # apply parsing based on content type resolved from chunking_tool
+                content_type = file_extension_by_chunker(chunking_tool)
+                yield from process_document_by_type(
+                    document=document,
+                    content=content_in_bytes,
+                    extension_source=content_type, llm=self.llm, chunking_config=chunking_config)
             elif chunking_tool:
                 # apply default chunker from toolkit config. No parsing.
                 chunker = chunkers.get(chunking_tool)
                 yield from chunker(file_content_generator=iter([document]), config=chunking_config)
             else:
-                # return as is if neither chunker or content typa are specified
+                # return as is if neither chunker nor content type are specified
                 yield document
 
     def _extend_data(self, documents: Generator[Document, None, None]):
@@ -193,24 +321,34 @@ class BaseIndexerToolkit(VectorStoreWrapperBase):
 
     def _collect_dependencies(self, documents: Generator[Document, None, None]):
         for document in documents:
+            self._log_tool_event(message=f"Collecting the dependencies for document ID "
+                                         f"'{document.metadata.get('id', 'N/A')}' to collect dependencies if any...")
            dependencies = self._process_document(document)
            yield document
            for dep in dependencies:
                dep.metadata[IndexerKeywords.PARENT.value] = document.metadata.get('id', None)
                yield dep
 
+    def _clean_metadata(self, documents: Generator[Document, None, None]):
+        for document in documents:
+            remove_keys = self._remove_metadata_keys()
+            for key in remove_keys:
+                document.metadata.pop(key, None)
+            yield document
+
     def _reduce_duplicates(
             self,
             documents: Generator[Any, None, None],
-            collection_suffix: str,
+            index_name: str,
             log_msg: str = "Verification of documents to index started"
     ) -> Generator[Document, None, None]:
         """Generic duplicate reduction logic for documents."""
-        self._log_data(log_msg, tool_name="index_documents")
-        indexed_data = self._get_indexed_data(collection_suffix)
+        self._ensure_vectorstore_initialized()
+        self._log_tool_event(log_msg, tool_name="index_documents")
+        indexed_data = self._get_indexed_data(index_name)
         indexed_keys = set(indexed_data.keys())
         if not indexed_keys:
-            self._log_data("Vectorstore is empty, indexing all incoming documents", tool_name="index_documents")
+            self._log_tool_event("Vectorstore is empty, indexing all incoming documents", tool_name="index_documents")
            yield from documents
            return
 
@@ -218,7 +356,8 @@ class BaseIndexerToolkit(VectorStoreWrapperBase):
 
        for document in documents:
            key = self.key_fn(document)
-            if key in indexed_keys and collection_suffix == indexed_data[key]['metadata'].get('collection'):
+            key = key if isinstance(key, str) else str(key)
+            if key in indexed_keys and index_name == indexed_data[key]['metadata'].get('collection'):
                if self.compare_fn(document, indexed_data[key]):
                    continue
                yield document
@@ -227,13 +366,13 @@ class BaseIndexerToolkit(VectorStoreWrapperBase):
                yield document
 
        if docs_to_remove:
-            self._log_data(
+            self._log_tool_event(
                f"Removing {len(docs_to_remove)} documents from vectorstore that are already indexed with different updated_on.",
                tool_name="index_documents"
            )
            self.vectorstore.delete(ids=list(docs_to_remove))
 
-    def _get_indexed_data(self, collection_suffix: str):
+    def _get_indexed_data(self, index_name: str):
        raise NotImplementedError("Subclasses must implement this method")
 
     def key_fn(self, document: Document):
@@ -245,34 +384,57 @@ class BaseIndexerToolkit(VectorStoreWrapperBase):
     def remove_ids_fn(self, idx_data, key: str):
        raise NotImplementedError("Subclasses must implement this method")
 
-    def remove_index(self, collection_suffix: str = ""):
+    def remove_index(self, index_name: str = ""):
        """Cleans the indexed data in the collection."""
-        super()._clean_collection(collection_suffix=collection_suffix)
-        return (f"Collection '{collection_suffix}' has been removed from the vector store.\n"
-                f"Available collections: {self.list_collections()}")
+        super()._clean_collection(index_name=index_name, including_index_meta=True)
+        return (f"Collection '{index_name}' has been removed from the vector store.\n"
+                f"Available collections: {self.list_collections()}") if index_name \
+            else "All collections have been removed from the vector store."
 
-    def _build_collection_filter(self, filter: dict | str, collection_suffix: str = "") -> dict:
+    def _build_collection_filter(self, filter: dict | str, index_name: str = "") -> dict:
        """Builds a filter for the collection based on the provided suffix."""
 
        filter = filter if isinstance(filter, dict) else json.loads(filter)
-        if collection_suffix:
+        if index_name:
            filter.update({"collection": {
-                "$eq": collection_suffix.strip()
+                "$eq": index_name.strip()
            }})
+
+        if filter:
+            # Exclude index meta documents from search results
+            filter = {
+                "$and": [
+                    filter,
+                    {"$or": [
+                        {"type": {"$exists": False}},
+                        {"type": {"$ne": IndexerKeywords.INDEX_META_TYPE.value}}
+                    ]},
+                ]
+            }
+        else:
+            filter = {"$or": [
+                {"type": {"$exists": False}},
+                {"type": {"$ne": IndexerKeywords.INDEX_META_TYPE.value}}
+            ]}
        return filter
 
     def search_index(self,
                     query: str,
-                     collection_suffix: str = "",
-                     filter: dict | str = {}, cut_off: float = 0.5,
+                     index_name: str = "",
+                     filter: dict | str = {}, cut_off: float = DEFAULT_CUT_OFF,
                     search_top: int = 10, reranker: dict = {},
                     full_text_search: Optional[Dict[str, Any]] = None,
                     reranking_config: Optional[Dict[str, Dict[str, Any]]] = None,
                     extended_search: Optional[List[str]] = None,
                     **kwargs):
        """ Searches indexed documents in the vector store."""
-        # build filter on top of collection_suffix
-        filter = self._build_collection_filter(filter, collection_suffix)
+        # build filter on top of index_name
+
+        available_collections = super().list_collections()
+        if index_name and index_name not in available_collections:
+            return f"Collection '{index_name}' not found. Available collections: {available_collections}"
+
+        filter = self._build_collection_filter(filter, index_name)
        found_docs = super().search_documents(
            query,
            doctype=self.doctype,
@@ -289,15 +451,15 @@ class BaseIndexerToolkit(VectorStoreWrapperBase):
     def stepback_search_index(self,
                              query: str,
                              messages: List[Dict[str, Any]] = [],
-                              collection_suffix: str = "",
-                              filter: dict | str = {}, cut_off: float = 0.5,
+                              index_name: str = "",
+                              filter: dict | str = {}, cut_off: float = DEFAULT_CUT_OFF,
                              search_top: int = 10, reranker: dict = {},
                              full_text_search: Optional[Dict[str, Any]] = None,
                              reranking_config: Optional[Dict[str, Dict[str, Any]]] = None,
                              extended_search: Optional[List[str]] = None,
                              **kwargs):
        """ Searches indexed documents in the vector store."""
-        filter = self._build_collection_filter(filter, collection_suffix)
+        filter = self._build_collection_filter(filter, index_name)
        found_docs = super().stepback_search(
            query,
            messages,
314
476
  def stepback_summary_index(self,
315
477
  query: str,
316
478
  messages: List[Dict[str, Any]] = [],
317
- collection_suffix: str = "",
318
- filter: dict | str = {}, cut_off: float = 0.5,
479
+ index_name: str = "",
480
+ filter: dict | str = {}, cut_off: float = DEFAULT_CUT_OFF,
319
481
  search_top: int = 10, reranker: dict = {},
320
482
  full_text_search: Optional[Dict[str, Any]] = None,
321
483
  reranking_config: Optional[Dict[str, Dict[str, Any]]] = None,
@@ -323,7 +485,7 @@ class BaseIndexerToolkit(VectorStoreWrapperBase):
                               **kwargs):
        """ Generates a summary of indexed documents using stepback technique."""
 
-        filter = self._build_collection_filter(filter, collection_suffix)
+        filter = self._build_collection_filter(filter, index_name)
        return super().stepback_summary(
            query,
            messages,
@@ -335,6 +497,149 @@ class BaseIndexerToolkit(VectorStoreWrapperBase):
            reranking_config=reranking_config,
            extended_search=extended_search
        )
+
+    def index_meta_init(self, index_name: str, index_configuration: dict[str, Any]):
+        self._ensure_vectorstore_initialized()
+        index_meta = super().get_index_meta(index_name)
+        if not index_meta:
+            self._log_tool_event(
+                f"There is no existing index_meta for collection '{index_name}'. Initializing it.",
+                tool_name="index_data"
+            )
+            from ..runtime.langchain.interfaces.llm_processor import add_documents
+            created_on = time.time()
+            metadata = {
+                "collection": index_name,
+                "type": IndexerKeywords.INDEX_META_TYPE.value,
+                "indexed": 0,
+                "updated": 0,
+                "state": IndexerKeywords.INDEX_META_IN_PROGRESS.value,
+                "index_configuration": index_configuration,
+                "created_on": created_on,
+                "updated_on": created_on,
+                "task_id": None,
+                "conversation_id": None,
+                "toolkit_id": self.toolkit_id,
+            }
+            metadata["history"] = json.dumps([metadata])
+            index_meta_doc = Document(page_content=f"{IndexerKeywords.INDEX_META_TYPE.value}_{index_name}", metadata=metadata)
+            add_documents(vectorstore=self.vectorstore, documents=[index_meta_doc])
+
+    def index_meta_update(self, index_name: str, state: str, result: int, update_force: bool = True, interval: Optional[float] = None):
+        """Update `index_meta` document with optional time-based throttling.
+
+        Args:
+            index_name: Index name to update meta for.
+            state: New state value for the `index_meta` record.
+            result: Number of processed documents to store in the `updated` field.
+            update_force: If `True`, perform the update unconditionally, ignoring throttling.
+                If `False`, perform the update only when the effective time interval has passed.
+            interval: Optional custom interval (in seconds) for this call when `update_force` is `False`.
+                If `None`, falls back to the value stored in `self._index_meta_config["update_interval"]`
+                if present, otherwise uses `INDEX_META_UPDATE_INTERVAL`.
+        """
+        self._ensure_vectorstore_initialized()
+        if not hasattr(self, "_index_meta_last_update_time"):
+            self._index_meta_last_update_time: Dict[str, float] = {}
+
+        if not update_force:
+            # Resolve effective interval:
+            # 1) explicit arg
+            # 2) value from `_index_meta_config`
+            # 3) default constant
+            cfg_interval = None
+            if hasattr(self, "_index_meta_config"):
+                cfg_interval = self._index_meta_config.get("update_interval")
+
+            eff_interval = (
+                interval
+                if interval is not None
+                else (cfg_interval if cfg_interval is not None else INDEX_META_UPDATE_INTERVAL)
+            )
+
+            last_time = self._index_meta_last_update_time.get(index_name)
+            now = time.time()
+            if last_time is not None and (now - last_time) < eff_interval:
+                return
+            self._index_meta_last_update_time[index_name] = now
+        else:
+            # For forced updates, always refresh last update time
+            self._index_meta_last_update_time[index_name] = time.time()
+
+        index_meta_raw = super().get_index_meta(index_name)
+        from ..runtime.langchain.interfaces.llm_processor import add_documents
+        #
+        if index_meta_raw:
+            metadata = copy.deepcopy(index_meta_raw.get("metadata", {}))
+            metadata["indexed"] = self.get_indexed_count(index_name)
+            metadata["updated"] = result
+            metadata["state"] = state
+            metadata["updated_on"] = time.time()
+            #
+            history_raw = metadata.pop("history", "[]")
+            try:
+                history = json.loads(history_raw) if history_raw.strip() else []
+                # replace the last history item with updated metadata
+                if history and isinstance(history, list):
+                    history[-1] = metadata
+                else:
+                    history = [metadata]
+            except (json.JSONDecodeError, TypeError):
+                logger.warning(f"Failed to load index history: {history_raw}. Create new with only current item.")
+                history = [metadata]
+            #
+            metadata["history"] = json.dumps(history)
+            index_meta_doc = Document(page_content=index_meta_raw.get("content", ""), metadata=metadata)
+            add_documents(vectorstore=self.vectorstore, documents=[index_meta_doc], ids=[index_meta_raw.get("id")])
+
+    def _emit_index_event(self, index_name: str, error: Optional[str] = None):
+        """
+        Emit custom event for index data operation.
+
+        Args:
+            index_name: The name of the index
+            error: Error message if the operation failed, None otherwise
+        """
+        index_meta = super().get_index_meta(index_name)
+
+        if not index_meta:
+            logger.warning(
+                f"No index_meta found for index '{index_name}'. "
+                "Cannot emit index event."
+            )
+            return
+
+        metadata = index_meta.get("metadata", {})
+
+        # Determine if this is a reindex operation
+        history_raw = metadata.get("history", "[]")
+        try:
+            history = json.loads(history_raw) if history_raw.strip() else []
+            is_reindex = len(history) > 1
+        except (json.JSONDecodeError, TypeError):
+            is_reindex = False
+
+        # Build event message
+        event_data = {
+            "id": index_meta.get("id"),
+            "index_name": index_name,
+            "state": "failed" if error is not None else metadata.get("state"),
+            "error": error,
+            "reindex": is_reindex,
+            "indexed": metadata.get("indexed", 0),
+            "updated": metadata.get("updated", 0),
+            "toolkit_id": metadata.get("toolkit_id"),
+        }
+
+        # Emit the event
+        try:
+            dispatch_custom_event("index_data_status", event_data)
+            logger.debug(
+                f"Emitted index_data_status event for index "
+                f"'{index_name}': {event_data}"
+            )
+        except Exception as e:
+            logger.warning(f"Failed to emit index_data_status event: {e}")
 
     def get_available_tools(self):
        """
@@ -346,8 +651,8 @@ class BaseIndexerToolkit(VectorStoreWrapperBase):
        """
        return [
            {
-                "name": "index_data",
-                "mode": "index_data",
+                "name": IndexTools.INDEX_DATA.value,
+                "mode": IndexTools.INDEX_DATA.value,
                "ref": self.index_data,
                "description": "Loads data to index.",
                "args_schema": create_model(
@@ -357,38 +662,39 @@ class BaseIndexerToolkit(VectorStoreWrapperBase):
                )
            },
            {
-                "name": "search_index",
-                "mode": "search_index",
+                "name": IndexTools.SEARCH_INDEX.value,
+                "mode": IndexTools.SEARCH_INDEX.value,
                "ref": self.search_index,
                "description": self.search_index.__doc__,
                "args_schema": BaseSearchParams
            },
            {
-                "name": "stepback_search_index",
-                "mode": "stepback_search_index",
+                "name": IndexTools.STEPBACK_SEARCH_INDEX.value,
+                "mode": IndexTools.STEPBACK_SEARCH_INDEX.value,
                "ref": self.stepback_search_index,
                "description": self.stepback_search_index.__doc__,
                "args_schema": BaseStepbackSearchParams
            },
            {
-                "name": "stepback_summary_index",
-                "mode": "stepback_summary_index",
+                "name": IndexTools.STEPBACK_SUMMARY_INDEX.value,
+                "mode": IndexTools.STEPBACK_SUMMARY_INDEX.value,
                "ref": self.stepback_summary_index,
                "description": self.stepback_summary_index.__doc__,
                "args_schema": BaseStepbackSearchParams
            },
            {
-                "name": "remove_index",
-                "mode": "remove_index",
+                "name": IndexTools.REMOVE_INDEX.value,
+                "mode": IndexTools.REMOVE_INDEX.value,
                "ref": self.remove_index,
                "description": self.remove_index.__doc__,
                "args_schema": RemoveIndexParams
            },
            {
-                "name": "list_collections",
-                "mode": "list_collections",
+                "name": IndexTools.LIST_COLLECTIONS.value,
+                "mode": IndexTools.LIST_COLLECTIONS.value,
                "ref": self.list_collections,
                "description": self.list_collections.__doc__,
-                "args_schema": create_model("ListCollectionsParams") # No parameters
+                # No parameters
+                "args_schema": create_model("ListCollectionsParams")
            },
-        ]
+        ]
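
The hunks above replace the former collection_suffix parameter with index_name across the indexing and search tools and add index_meta bookkeeping. As a rough illustration of the resulting call surface, a minimal sketch follows; the MyIndexerToolkit subclass and its constructor values are hypothetical placeholders, while the method names and keyword arguments are taken from this diff.

# Hypothetical subclass of BaseIndexerToolkit; connection values are placeholders.
toolkit = MyIndexerToolkit(
    connection_string="postgresql+psycopg://user:pass@localhost:5432/vectors",
    collection_schema="demo",      # read via kwargs.get('collection_schema') in __init__
    vectorstore_type="PGVector",   # default when not provided
)

# Index into a named index (formerly 'collection_suffix'; max 7 characters).
toolkit.index_data(index_name="docs", clean_index=False, progress_step=10)

# Search the same index; cut_off now defaults to DEFAULT_CUT_OFF (0.1) and is bounded to [0, 1].
hits = toolkit.search_index(
    query="throttled index_meta updates",
    index_name="docs",
    cut_off=0.1,
    search_top=5,
)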