alita-sdk 0.3.257__py3-none-any.whl → 0.3.562__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (278)
  1. alita_sdk/cli/__init__.py +10 -0
  2. alita_sdk/cli/__main__.py +17 -0
  3. alita_sdk/cli/agent/__init__.py +5 -0
  4. alita_sdk/cli/agent/default.py +258 -0
  5. alita_sdk/cli/agent_executor.py +155 -0
  6. alita_sdk/cli/agent_loader.py +215 -0
  7. alita_sdk/cli/agent_ui.py +228 -0
  8. alita_sdk/cli/agents.py +3601 -0
  9. alita_sdk/cli/callbacks.py +647 -0
  10. alita_sdk/cli/cli.py +168 -0
  11. alita_sdk/cli/config.py +306 -0
  12. alita_sdk/cli/context/__init__.py +30 -0
  13. alita_sdk/cli/context/cleanup.py +198 -0
  14. alita_sdk/cli/context/manager.py +731 -0
  15. alita_sdk/cli/context/message.py +285 -0
  16. alita_sdk/cli/context/strategies.py +289 -0
  17. alita_sdk/cli/context/token_estimation.py +127 -0
  18. alita_sdk/cli/formatting.py +182 -0
  19. alita_sdk/cli/input_handler.py +419 -0
  20. alita_sdk/cli/inventory.py +1073 -0
  21. alita_sdk/cli/mcp_loader.py +315 -0
  22. alita_sdk/cli/toolkit.py +327 -0
  23. alita_sdk/cli/toolkit_loader.py +85 -0
  24. alita_sdk/cli/tools/__init__.py +43 -0
  25. alita_sdk/cli/tools/approval.py +224 -0
  26. alita_sdk/cli/tools/filesystem.py +1751 -0
  27. alita_sdk/cli/tools/planning.py +389 -0
  28. alita_sdk/cli/tools/terminal.py +414 -0
  29. alita_sdk/community/__init__.py +72 -12
  30. alita_sdk/community/inventory/__init__.py +236 -0
  31. alita_sdk/community/inventory/config.py +257 -0
  32. alita_sdk/community/inventory/enrichment.py +2137 -0
  33. alita_sdk/community/inventory/extractors.py +1469 -0
  34. alita_sdk/community/inventory/ingestion.py +3172 -0
  35. alita_sdk/community/inventory/knowledge_graph.py +1457 -0
  36. alita_sdk/community/inventory/parsers/__init__.py +218 -0
  37. alita_sdk/community/inventory/parsers/base.py +295 -0
  38. alita_sdk/community/inventory/parsers/csharp_parser.py +907 -0
  39. alita_sdk/community/inventory/parsers/go_parser.py +851 -0
  40. alita_sdk/community/inventory/parsers/html_parser.py +389 -0
  41. alita_sdk/community/inventory/parsers/java_parser.py +593 -0
  42. alita_sdk/community/inventory/parsers/javascript_parser.py +629 -0
  43. alita_sdk/community/inventory/parsers/kotlin_parser.py +768 -0
  44. alita_sdk/community/inventory/parsers/markdown_parser.py +362 -0
  45. alita_sdk/community/inventory/parsers/python_parser.py +604 -0
  46. alita_sdk/community/inventory/parsers/rust_parser.py +858 -0
  47. alita_sdk/community/inventory/parsers/swift_parser.py +832 -0
  48. alita_sdk/community/inventory/parsers/text_parser.py +322 -0
  49. alita_sdk/community/inventory/parsers/yaml_parser.py +370 -0
  50. alita_sdk/community/inventory/patterns/__init__.py +61 -0
  51. alita_sdk/community/inventory/patterns/ast_adapter.py +380 -0
  52. alita_sdk/community/inventory/patterns/loader.py +348 -0
  53. alita_sdk/community/inventory/patterns/registry.py +198 -0
  54. alita_sdk/community/inventory/presets.py +535 -0
  55. alita_sdk/community/inventory/retrieval.py +1403 -0
  56. alita_sdk/community/inventory/toolkit.py +173 -0
  57. alita_sdk/community/inventory/toolkit_utils.py +176 -0
  58. alita_sdk/community/inventory/visualize.py +1370 -0
  59. alita_sdk/configurations/__init__.py +11 -0
  60. alita_sdk/configurations/ado.py +148 -2
  61. alita_sdk/configurations/azure_search.py +1 -1
  62. alita_sdk/configurations/bigquery.py +1 -1
  63. alita_sdk/configurations/bitbucket.py +94 -2
  64. alita_sdk/configurations/browser.py +18 -0
  65. alita_sdk/configurations/carrier.py +19 -0
  66. alita_sdk/configurations/confluence.py +130 -1
  67. alita_sdk/configurations/delta_lake.py +1 -1
  68. alita_sdk/configurations/figma.py +76 -5
  69. alita_sdk/configurations/github.py +65 -1
  70. alita_sdk/configurations/gitlab.py +81 -0
  71. alita_sdk/configurations/google_places.py +17 -0
  72. alita_sdk/configurations/jira.py +103 -0
  73. alita_sdk/configurations/openapi.py +111 -0
  74. alita_sdk/configurations/postman.py +1 -1
  75. alita_sdk/configurations/qtest.py +72 -3
  76. alita_sdk/configurations/report_portal.py +115 -0
  77. alita_sdk/configurations/salesforce.py +19 -0
  78. alita_sdk/configurations/service_now.py +1 -12
  79. alita_sdk/configurations/sharepoint.py +167 -0
  80. alita_sdk/configurations/sonar.py +18 -0
  81. alita_sdk/configurations/sql.py +20 -0
  82. alita_sdk/configurations/testio.py +101 -0
  83. alita_sdk/configurations/testrail.py +88 -0
  84. alita_sdk/configurations/xray.py +94 -1
  85. alita_sdk/configurations/zephyr_enterprise.py +94 -1
  86. alita_sdk/configurations/zephyr_essential.py +95 -0
  87. alita_sdk/runtime/clients/artifact.py +21 -4
  88. alita_sdk/runtime/clients/client.py +458 -67
  89. alita_sdk/runtime/clients/mcp_discovery.py +342 -0
  90. alita_sdk/runtime/clients/mcp_manager.py +262 -0
  91. alita_sdk/runtime/clients/sandbox_client.py +352 -0
  92. alita_sdk/runtime/langchain/_constants_bkup.py +1318 -0
  93. alita_sdk/runtime/langchain/assistant.py +183 -43
  94. alita_sdk/runtime/langchain/constants.py +647 -1
  95. alita_sdk/runtime/langchain/document_loaders/AlitaDocxMammothLoader.py +315 -3
  96. alita_sdk/runtime/langchain/document_loaders/AlitaExcelLoader.py +209 -31
  97. alita_sdk/runtime/langchain/document_loaders/AlitaImageLoader.py +1 -1
  98. alita_sdk/runtime/langchain/document_loaders/AlitaJSONLinesLoader.py +77 -0
  99. alita_sdk/runtime/langchain/document_loaders/AlitaJSONLoader.py +10 -3
  100. alita_sdk/runtime/langchain/document_loaders/AlitaMarkdownLoader.py +66 -0
  101. alita_sdk/runtime/langchain/document_loaders/AlitaPDFLoader.py +79 -10
  102. alita_sdk/runtime/langchain/document_loaders/AlitaPowerPointLoader.py +52 -15
  103. alita_sdk/runtime/langchain/document_loaders/AlitaPythonLoader.py +9 -0
  104. alita_sdk/runtime/langchain/document_loaders/AlitaTableLoader.py +1 -4
  105. alita_sdk/runtime/langchain/document_loaders/AlitaTextLoader.py +15 -2
  106. alita_sdk/runtime/langchain/document_loaders/ImageParser.py +30 -0
  107. alita_sdk/runtime/langchain/document_loaders/constants.py +189 -41
  108. alita_sdk/runtime/langchain/interfaces/llm_processor.py +4 -2
  109. alita_sdk/runtime/langchain/langraph_agent.py +407 -92
  110. alita_sdk/runtime/langchain/utils.py +102 -8
  111. alita_sdk/runtime/llms/preloaded.py +2 -6
  112. alita_sdk/runtime/models/mcp_models.py +61 -0
  113. alita_sdk/runtime/skills/__init__.py +91 -0
  114. alita_sdk/runtime/skills/callbacks.py +498 -0
  115. alita_sdk/runtime/skills/discovery.py +540 -0
  116. alita_sdk/runtime/skills/executor.py +610 -0
  117. alita_sdk/runtime/skills/input_builder.py +371 -0
  118. alita_sdk/runtime/skills/models.py +330 -0
  119. alita_sdk/runtime/skills/registry.py +355 -0
  120. alita_sdk/runtime/skills/skill_runner.py +330 -0
  121. alita_sdk/runtime/toolkits/__init__.py +28 -0
  122. alita_sdk/runtime/toolkits/application.py +14 -4
  123. alita_sdk/runtime/toolkits/artifact.py +24 -9
  124. alita_sdk/runtime/toolkits/datasource.py +13 -6
  125. alita_sdk/runtime/toolkits/mcp.py +780 -0
  126. alita_sdk/runtime/toolkits/planning.py +178 -0
  127. alita_sdk/runtime/toolkits/skill_router.py +238 -0
  128. alita_sdk/runtime/toolkits/subgraph.py +11 -6
  129. alita_sdk/runtime/toolkits/tools.py +314 -70
  130. alita_sdk/runtime/toolkits/vectorstore.py +11 -5
  131. alita_sdk/runtime/tools/__init__.py +24 -0
  132. alita_sdk/runtime/tools/application.py +16 -4
  133. alita_sdk/runtime/tools/artifact.py +367 -33
  134. alita_sdk/runtime/tools/data_analysis.py +183 -0
  135. alita_sdk/runtime/tools/function.py +100 -4
  136. alita_sdk/runtime/tools/graph.py +81 -0
  137. alita_sdk/runtime/tools/image_generation.py +218 -0
  138. alita_sdk/runtime/tools/llm.py +1013 -177
  139. alita_sdk/runtime/tools/loop.py +3 -1
  140. alita_sdk/runtime/tools/loop_output.py +3 -1
  141. alita_sdk/runtime/tools/mcp_inspect_tool.py +284 -0
  142. alita_sdk/runtime/tools/mcp_remote_tool.py +181 -0
  143. alita_sdk/runtime/tools/mcp_server_tool.py +3 -1
  144. alita_sdk/runtime/tools/planning/__init__.py +36 -0
  145. alita_sdk/runtime/tools/planning/models.py +246 -0
  146. alita_sdk/runtime/tools/planning/wrapper.py +607 -0
  147. alita_sdk/runtime/tools/router.py +2 -1
  148. alita_sdk/runtime/tools/sandbox.py +375 -0
  149. alita_sdk/runtime/tools/skill_router.py +776 -0
  150. alita_sdk/runtime/tools/tool.py +3 -1
  151. alita_sdk/runtime/tools/vectorstore.py +69 -65
  152. alita_sdk/runtime/tools/vectorstore_base.py +163 -90
  153. alita_sdk/runtime/utils/AlitaCallback.py +137 -21
  154. alita_sdk/runtime/utils/mcp_client.py +492 -0
  155. alita_sdk/runtime/utils/mcp_oauth.py +361 -0
  156. alita_sdk/runtime/utils/mcp_sse_client.py +434 -0
  157. alita_sdk/runtime/utils/mcp_tools_discovery.py +124 -0
  158. alita_sdk/runtime/utils/streamlit.py +41 -14
  159. alita_sdk/runtime/utils/toolkit_utils.py +28 -9
  160. alita_sdk/runtime/utils/utils.py +48 -0
  161. alita_sdk/tools/__init__.py +135 -37
  162. alita_sdk/tools/ado/__init__.py +2 -2
  163. alita_sdk/tools/ado/repos/__init__.py +15 -19
  164. alita_sdk/tools/ado/repos/repos_wrapper.py +12 -20
  165. alita_sdk/tools/ado/test_plan/__init__.py +26 -8
  166. alita_sdk/tools/ado/test_plan/test_plan_wrapper.py +56 -28
  167. alita_sdk/tools/ado/wiki/__init__.py +27 -12
  168. alita_sdk/tools/ado/wiki/ado_wrapper.py +114 -40
  169. alita_sdk/tools/ado/work_item/__init__.py +27 -12
  170. alita_sdk/tools/ado/work_item/ado_wrapper.py +95 -11
  171. alita_sdk/tools/advanced_jira_mining/__init__.py +12 -8
  172. alita_sdk/tools/aws/delta_lake/__init__.py +14 -11
  173. alita_sdk/tools/aws/delta_lake/tool.py +5 -1
  174. alita_sdk/tools/azure_ai/search/__init__.py +13 -8
  175. alita_sdk/tools/base/tool.py +5 -1
  176. alita_sdk/tools/base_indexer_toolkit.py +454 -110
  177. alita_sdk/tools/bitbucket/__init__.py +27 -19
  178. alita_sdk/tools/bitbucket/api_wrapper.py +285 -27
  179. alita_sdk/tools/bitbucket/cloud_api_wrapper.py +5 -5
  180. alita_sdk/tools/browser/__init__.py +41 -16
  181. alita_sdk/tools/browser/crawler.py +3 -1
  182. alita_sdk/tools/browser/utils.py +15 -6
  183. alita_sdk/tools/carrier/__init__.py +18 -17
  184. alita_sdk/tools/carrier/backend_reports_tool.py +8 -4
  185. alita_sdk/tools/carrier/excel_reporter.py +8 -4
  186. alita_sdk/tools/chunkers/__init__.py +3 -1
  187. alita_sdk/tools/chunkers/code/codeparser.py +1 -1
  188. alita_sdk/tools/chunkers/sematic/json_chunker.py +2 -1
  189. alita_sdk/tools/chunkers/sematic/markdown_chunker.py +97 -6
  190. alita_sdk/tools/chunkers/sematic/proposal_chunker.py +1 -1
  191. alita_sdk/tools/chunkers/universal_chunker.py +270 -0
  192. alita_sdk/tools/cloud/aws/__init__.py +11 -7
  193. alita_sdk/tools/cloud/azure/__init__.py +11 -7
  194. alita_sdk/tools/cloud/gcp/__init__.py +11 -7
  195. alita_sdk/tools/cloud/k8s/__init__.py +11 -7
  196. alita_sdk/tools/code/linter/__init__.py +9 -8
  197. alita_sdk/tools/code/loaders/codesearcher.py +3 -2
  198. alita_sdk/tools/code/sonar/__init__.py +20 -13
  199. alita_sdk/tools/code_indexer_toolkit.py +199 -0
  200. alita_sdk/tools/confluence/__init__.py +21 -14
  201. alita_sdk/tools/confluence/api_wrapper.py +197 -58
  202. alita_sdk/tools/confluence/loader.py +14 -2
  203. alita_sdk/tools/custom_open_api/__init__.py +11 -5
  204. alita_sdk/tools/elastic/__init__.py +10 -8
  205. alita_sdk/tools/elitea_base.py +546 -64
  206. alita_sdk/tools/figma/__init__.py +11 -8
  207. alita_sdk/tools/figma/api_wrapper.py +352 -153
  208. alita_sdk/tools/github/__init__.py +17 -17
  209. alita_sdk/tools/github/api_wrapper.py +9 -26
  210. alita_sdk/tools/github/github_client.py +81 -12
  211. alita_sdk/tools/github/schemas.py +2 -1
  212. alita_sdk/tools/github/tool.py +5 -1
  213. alita_sdk/tools/gitlab/__init__.py +18 -13
  214. alita_sdk/tools/gitlab/api_wrapper.py +224 -80
  215. alita_sdk/tools/gitlab_org/__init__.py +13 -10
  216. alita_sdk/tools/google/bigquery/__init__.py +13 -13
  217. alita_sdk/tools/google/bigquery/tool.py +5 -1
  218. alita_sdk/tools/google_places/__init__.py +20 -11
  219. alita_sdk/tools/jira/__init__.py +21 -11
  220. alita_sdk/tools/jira/api_wrapper.py +315 -168
  221. alita_sdk/tools/keycloak/__init__.py +10 -8
  222. alita_sdk/tools/localgit/__init__.py +8 -3
  223. alita_sdk/tools/localgit/local_git.py +62 -54
  224. alita_sdk/tools/localgit/tool.py +5 -1
  225. alita_sdk/tools/memory/__init__.py +38 -14
  226. alita_sdk/tools/non_code_indexer_toolkit.py +7 -2
  227. alita_sdk/tools/ocr/__init__.py +10 -8
  228. alita_sdk/tools/openapi/__init__.py +281 -108
  229. alita_sdk/tools/openapi/api_wrapper.py +883 -0
  230. alita_sdk/tools/openapi/tool.py +20 -0
  231. alita_sdk/tools/pandas/__init__.py +18 -11
  232. alita_sdk/tools/pandas/api_wrapper.py +40 -45
  233. alita_sdk/tools/pandas/dataframe/generator/base.py +3 -1
  234. alita_sdk/tools/postman/__init__.py +10 -11
  235. alita_sdk/tools/postman/api_wrapper.py +19 -8
  236. alita_sdk/tools/postman/postman_analysis.py +8 -1
  237. alita_sdk/tools/pptx/__init__.py +10 -10
  238. alita_sdk/tools/qtest/__init__.py +21 -14
  239. alita_sdk/tools/qtest/api_wrapper.py +1784 -88
  240. alita_sdk/tools/rally/__init__.py +12 -10
  241. alita_sdk/tools/report_portal/__init__.py +22 -16
  242. alita_sdk/tools/salesforce/__init__.py +21 -16
  243. alita_sdk/tools/servicenow/__init__.py +20 -16
  244. alita_sdk/tools/servicenow/api_wrapper.py +1 -1
  245. alita_sdk/tools/sharepoint/__init__.py +16 -14
  246. alita_sdk/tools/sharepoint/api_wrapper.py +179 -39
  247. alita_sdk/tools/sharepoint/authorization_helper.py +191 -1
  248. alita_sdk/tools/sharepoint/utils.py +8 -2
  249. alita_sdk/tools/slack/__init__.py +11 -7
  250. alita_sdk/tools/sql/__init__.py +21 -19
  251. alita_sdk/tools/sql/api_wrapper.py +71 -23
  252. alita_sdk/tools/testio/__init__.py +20 -13
  253. alita_sdk/tools/testrail/__init__.py +12 -11
  254. alita_sdk/tools/testrail/api_wrapper.py +214 -46
  255. alita_sdk/tools/utils/__init__.py +28 -4
  256. alita_sdk/tools/utils/content_parser.py +182 -62
  257. alita_sdk/tools/utils/text_operations.py +254 -0
  258. alita_sdk/tools/vector_adapters/VectorStoreAdapter.py +83 -27
  259. alita_sdk/tools/xray/__init__.py +17 -14
  260. alita_sdk/tools/xray/api_wrapper.py +58 -113
  261. alita_sdk/tools/yagmail/__init__.py +8 -3
  262. alita_sdk/tools/zephyr/__init__.py +11 -7
  263. alita_sdk/tools/zephyr_enterprise/__init__.py +15 -9
  264. alita_sdk/tools/zephyr_enterprise/api_wrapper.py +30 -15
  265. alita_sdk/tools/zephyr_essential/__init__.py +15 -10
  266. alita_sdk/tools/zephyr_essential/api_wrapper.py +297 -54
  267. alita_sdk/tools/zephyr_essential/client.py +6 -4
  268. alita_sdk/tools/zephyr_scale/__init__.py +12 -8
  269. alita_sdk/tools/zephyr_scale/api_wrapper.py +39 -31
  270. alita_sdk/tools/zephyr_squad/__init__.py +11 -7
  271. {alita_sdk-0.3.257.dist-info → alita_sdk-0.3.562.dist-info}/METADATA +184 -37
  272. alita_sdk-0.3.562.dist-info/RECORD +450 -0
  273. alita_sdk-0.3.562.dist-info/entry_points.txt +2 -0
  274. alita_sdk/tools/bitbucket/tools.py +0 -304
  275. alita_sdk-0.3.257.dist-info/RECORD +0 -343
  276. {alita_sdk-0.3.257.dist-info → alita_sdk-0.3.562.dist-info}/WHEEL +0 -0
  277. {alita_sdk-0.3.257.dist-info → alita_sdk-0.3.562.dist-info}/licenses/LICENSE +0 -0
  278. {alita_sdk-0.3.257.dist-info → alita_sdk-0.3.562.dist-info}/top_level.txt +0 -0
alita_sdk/tools/base_indexer_toolkit.py
@@ -1,43 +1,51 @@
+import copy
 import json
 import logging
-from typing import Any, Optional, List, Literal, Dict, Generator
+import time
+from enum import Enum
+from typing import Any, Optional, List, Dict, Generator
 
+from langchain_core.callbacks import dispatch_custom_event
 from langchain_core.documents import Document
 from pydantic import create_model, Field, SecretStr
 
-# from alita_sdk.runtime.langchain.interfaces.llm_processor import get_embeddings
-from .utils.content_parser import process_content_by_type
+from .utils.content_parser import file_extension_by_chunker, process_document_by_type
 from .vector_adapters.VectorStoreAdapter import VectorStoreAdapterFactory
+from ..runtime.langchain.document_loaders.constants import loaders_allowed_to_override
 from ..runtime.tools.vectorstore_base import VectorStoreWrapperBase
 from ..runtime.utils.utils import IndexerKeywords
 
 logger = logging.getLogger(__name__)
 
-# Base Vector Store Schema Models
-BaseIndexParams = create_model(
-    "BaseIndexParams",
-    collection_suffix=(str, Field(description="Suffix for collection name (max 7 characters) used to separate datasets", min_length=1, max_length=7)),
-    vectorstore_type=(Optional[str], Field(description="Vectorstore type (Chroma, PGVector, Elastic, etc.)", default="PGVector")),
-)
+DEFAULT_CUT_OFF = 0.1
+INDEX_META_UPDATE_INTERVAL = 600.0
+
+class IndexTools(str, Enum):
+    """Enum for index-related tool names."""
+    INDEX_DATA = "index_data"
+    SEARCH_INDEX = "search_index"
+    STEPBACK_SEARCH_INDEX = "stepback_search_index"
+    STEPBACK_SUMMARY_INDEX = "stepback_summary_index"
+    REMOVE_INDEX = "remove_index"
+    LIST_COLLECTIONS = "list_collections"
 
 RemoveIndexParams = create_model(
     "RemoveIndexParams",
-    collection_suffix=(Optional[str], Field(description="Optional suffix for collection name (max 7 characters)", default="", max_length=7)),
+    index_name=(Optional[str], Field(description="Optional index name (max 7 characters)", default="", max_length=7)),
 )
 
 BaseSearchParams = create_model(
     "BaseSearchParams",
     query=(str, Field(description="Query text to search in the index")),
-    collection_suffix=(Optional[str], Field(
-        description="Optional suffix for collection name (max 7 characters). Leave empty to search across all datasets",
+    index_name=(Optional[str], Field(
+        description="Optional index name (max 7 characters). Leave empty to search across all datasets",
         default="", max_length=7)),
-    vectorstore_type=(Optional[str], Field(description="Vectorstore type (Chroma, PGVector, Elastic, etc.)", default="PGVector")),
     filter=(Optional[dict | str], Field(
         description="Filter to apply to the search results. Can be a dictionary or a JSON string.",
        default={},
         examples=["{\"key\": \"value\"}", "{\"status\": \"active\"}"]
     )),
-    cut_off=(Optional[float], Field(description="Cut-off score for search results", default=0.5)),
+    cut_off=(Optional[float], Field(description="Cut-off score for search results", default=DEFAULT_CUT_OFF, ge=0, le=1)),
     search_top=(Optional[int], Field(description="Number of top results to return", default=10)),
     full_text_search=(Optional[Dict[str, Any]], Field(
         description="Full text search parameters. Can be a dictionary with search options.",
@@ -60,42 +68,31 @@ BaseSearchParams = create_model(
 BaseStepbackSearchParams = create_model(
     "BaseStepbackSearchParams",
     query=(str, Field(description="Query text to search in the index")),
-    collection_suffix=(Optional[str], Field(description="Optional suffix for collection name (max 7 characters)", default="", max_length=7)),
-    vectorstore_type=(Optional[str], Field(description="Vectorstore type (Chroma, PGVector, Elastic, etc.)", default="PGVector")),
+    index_name=(Optional[str], Field(description="Optional index name (max 7 characters)", default="", max_length=7)),
     messages=(Optional[List], Field(description="Chat messages for stepback search context", default=[])),
     filter=(Optional[dict | str], Field(
         description="Filter to apply to the search results. Can be a dictionary or a JSON string.",
         default={},
         examples=["{\"key\": \"value\"}", "{\"status\": \"active\"}"]
     )),
-    cut_off=(Optional[float], Field(description="Cut-off score for search results", default=0.5)),
+    cut_off=(Optional[float], Field(description="Cut-off score for search results", default=DEFAULT_CUT_OFF, ge=0, le=1)),
     search_top=(Optional[int], Field(description="Number of top results to return", default=10)),
-    reranker=(Optional[dict], Field(
-        description="Reranker configuration. Can be a dictionary with reranking parameters.",
-        default={}
-    )),
     full_text_search=(Optional[Dict[str, Any]], Field(
         description="Full text search parameters. Can be a dictionary with search options.",
         default=None
     )),
-    reranking_config=(Optional[Dict[str, Dict[str, Any]]], Field(
-        description="Reranking configuration. Can be a dictionary with reranking settings.",
-        default=None
-    )),
     extended_search=(Optional[List[str]], Field(
         description="List of additional fields to include in the search results.",
         default=None
     )),
-)
-
-BaseIndexDataParams = create_model(
-    "indexData",
-    __base__=BaseIndexParams,
-    progress_step=(Optional[int], Field(default=10, ge=0, le=100,
-                                        description="Optional step size for progress reporting during indexing")),
-    clean_index=(Optional[bool], Field(default=False,
-                                       description="Optional flag to enforce clean existing index before indexing new data")),
-    chunking_config=(Optional[dict], Field(description="Chunking tool configuration", default_factory=dict)),
+    reranker=(Optional[dict], Field(
+        description="Reranker configuration. Can be a dictionary with reranking parameters.",
+        default={}
+    )),
+    reranking_config=(Optional[Dict[str, Dict[str, Any]]], Field(
+        description="Reranking configuration. Can be a dictionary with reranking settings.",
+        default=None
+    )),
 )
 
 
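Both search schemas also change the `cut_off` field: the default drops from 0.5 to `DEFAULT_CUT_OFF` (0.1), and new `ge`/`le` bounds reject out-of-range values at validation time. A small sketch (assumptions: plain `pydantic`, model reduced to the one field):

```python
from typing import Optional
from pydantic import Field, ValidationError, create_model

DEFAULT_CUT_OFF = 0.1

Params = create_model(
    "Params",
    cut_off=(Optional[float], Field(default=DEFAULT_CUT_OFF, ge=0, le=1)),
)

print(Params().cut_off)  # 0.1, the new default (was 0.5)
try:
    Params(cut_off=1.5)  # outside [0, 1], rejected by le=1
except ValidationError:
    print("cut_off must be within [0, 1]")
```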
@@ -104,29 +101,21 @@ class BaseIndexerToolkit(VectorStoreWrapperBase):
 
     doctype: str = "document"
 
-    llm: Any = None
     connection_string: Optional[SecretStr] = None
     collection_name: Optional[str] = None
-    embedding_model: Optional[str] = "HuggingFaceEmbeddings"
-    embedding_model_params: Optional[Dict[str, Any]] = {"model_name": "sentence-transformers/all-MiniLM-L6-v2"}
-    vectorstore_type: Optional[str] = "PGVector"
-    _embedding: Optional[Any] = None
     alita: Any = None # Elitea client, if available
 
     def __init__(self, **kwargs):
         conn = kwargs.get('connection_string', None)
         connection_string = conn.get_secret_value() if isinstance(conn, SecretStr) else conn
-        collection_name = kwargs.get('collection_name')
+        collection_name = kwargs.get('collection_schema')
 
-        # if 'embedding_model' not in kwargs:
-        kwargs['embedding_model'] = 'HuggingFaceEmbeddings'
-        if 'embedding_model_params' not in kwargs:
-            kwargs['embedding_model_params'] = {"model_name": "sentence-transformers/all-MiniLM-L6-v2"}
         if 'vectorstore_type' not in kwargs:
             kwargs['vectorstore_type'] = 'PGVector'
         vectorstore_type = kwargs.get('vectorstore_type')
-        kwargs['vectorstore_params'] = VectorStoreAdapterFactory.create_adapter(vectorstore_type).get_vectorstore_params(collection_name, connection_string)
-        kwargs['_embedding'] = kwargs.get('alita').get_embeddings(kwargs.get('embedding_model'))
+        if connection_string:
+            # Initialize vectorstore params only if connection string is provided
+            kwargs['vectorstore_params'] = VectorStoreAdapterFactory.create_adapter(vectorstore_type).get_vectorstore_params(collection_name, connection_string)
         super().__init__(**kwargs)
 
     def _index_tool_params(self, **kwargs) -> dict[str, tuple[type, Field]]:
@@ -136,6 +125,11 @@ class BaseIndexerToolkit(VectorStoreWrapperBase):
         """
         return {}
 
+    def _remove_metadata_keys(self) -> List[str]:
+        """ Returns a list of metadata keys to be removed from documents before indexing.
+        Override this method in subclasses to provide specific keys to remove."""
+        return [IndexerKeywords.CONTENT_IN_BYTES.value, IndexerKeywords.CONTENT_FILE_NAME.value]
+
     def _base_loader(self, **kwargs) -> Generator[Document, None, None]:
         """ Loads documents from a source, processes them,
         and returns a list of Document objects with base metadata: id and created_on."""
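The new `_remove_metadata_keys` hook above lets subclasses declare which transient metadata keys (raw bytes, temporary file names) are stripped before indexing. A hypothetical subclass sketch, assuming the class from this diff; `extra_temp_key` is an invented key for illustration:

```python
from typing import List

from alita_sdk.tools.base_indexer_toolkit import BaseIndexerToolkit


class MyIndexerToolkit(BaseIndexerToolkit):
    def _remove_metadata_keys(self) -> List[str]:
        # Keep the base defaults (content bytes and file name keys)
        # and drop one toolkit-specific key as well.
        return super()._remove_metadata_keys() + ["extra_temp_key"]
```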
@@ -154,45 +148,156 @@ class BaseIndexerToolkit(VectorStoreWrapperBase):
         yield from ()
 
     def index_data(self, **kwargs):
-        collection_suffix = kwargs.get("collection_suffix")
-        progress_step = kwargs.get("progress_step")
+        index_name = kwargs.get("index_name")
         clean_index = kwargs.get("clean_index")
         chunking_tool = kwargs.get("chunking_tool")
         chunking_config = kwargs.get("chunking_config")
+
+        # Store the interval in a private dict to avoid Pydantic field errors
+        if not hasattr(self, "_index_meta_config"):
+            self._index_meta_config: Dict[str, Any] = {}
+
+        self._index_meta_config["update_interval"] = kwargs.get(
+            "meta_update_interval",
+            INDEX_META_UPDATE_INTERVAL,
+        )
+
+        result = {"count": 0}
         #
-        if clean_index:
-            self._clean_index(collection_suffix)
-        #
-        documents = self._base_loader(**kwargs)
-        documents = self._reduce_duplicates(documents, collection_suffix)
-        documents = self._extend_data(documents) # update content of not-reduced base document if needed (for sharepoint and similar)
-        documents = self._collect_dependencies(documents) # collect dependencies for base documents
-        documents = self._apply_loaders_chunkers(documents, chunking_tool, chunking_config)
+        try:
+            if clean_index:
+                self._clean_index(index_name)
+            #
+            self.index_meta_init(index_name, kwargs)
+            self._emit_index_event(index_name)
+            #
+            self._log_tool_event(f"Indexing data into collection with suffix '{index_name}'. It can take some time...")
+            self._log_tool_event(f"Loading the documents to index...{kwargs}")
+            documents = self._base_loader(**kwargs)
+            documents = list(documents) # consume/exhaust generator to count items
+            documents_count = len(documents)
+            documents = (doc for doc in documents)
+            self._log_tool_event(f"Base documents were pre-loaded. "
+                                 f"Search for possible document duplicates and remove them from the indexing list...")
+            documents = self._reduce_duplicates(documents, index_name)
+            self._log_tool_event(f"Duplicates were removed. "
+                                 f"Processing documents to collect dependencies and prepare them for indexing...")
+            self._save_index_generator(documents, documents_count, chunking_tool, chunking_config, index_name=index_name, result=result)
+            #
+            results_count = result["count"]
+            # Final update should always be forced
+            self.index_meta_update(index_name, IndexerKeywords.INDEX_META_COMPLETED.value, results_count, update_force=True)
+            self._emit_index_event(index_name)
+            #
+            return {"status": "ok", "message": f"successfully indexed {results_count} documents" if results_count > 0
+                    else "no new documents to index"}
+        except Exception as e:
+            # Do maximum effort at least send custom event for supposed changed status
+            msg = str(e)
+            try:
+                # Error update should also be forced
+                self.index_meta_update(index_name, IndexerKeywords.INDEX_META_FAILED.value, result["count"], update_force=True)
+            except Exception as ie:
+                logger.error(f"Failed to update index meta status to FAILED for index '{index_name}': {ie}")
+                msg = f"{msg}; additionally failed to update index meta status to FAILED: {ie}"
+            self._emit_index_event(index_name, error=msg)
+            raise e
+
+    def _save_index_generator(self, base_documents: Generator[Document, None, None], base_total: int, chunking_tool, chunking_config, result, index_name: Optional[str] = None):
+        self._ensure_vectorstore_initialized()
+        self._log_tool_event(f"Base documents are ready for indexing. {base_total} base documents in total to index.")
+        from ..runtime.langchain.interfaces.llm_processor import add_documents
         #
-        return self._save_index(list(documents), collection_suffix=collection_suffix, progress_step=progress_step)
-
+        base_doc_counter = 0
+        pg_vector_add_docs_chunk = []
+        for base_doc in base_documents:
+            base_doc_counter += 1
+            self._log_tool_event(f"Processing dependent documents for base documents #{base_doc_counter}.")
+
+            # (base_doc for _ in range(1)) - wrap single base_doc to Generator in order to reuse existing code
+            documents = self._extend_data((base_doc for _ in range(1))) # update content of not-reduced base document if needed (for sharepoint and similar)
+            documents = self._collect_dependencies(documents) # collect dependencies for base documents
+            self._log_tool_event(f"Dependent documents were processed. "
+                                 f"Applying chunking tool '{chunking_tool}' if specified and preparing documents for indexing...")
+            documents = self._apply_loaders_chunkers(documents, chunking_tool, chunking_config)
+            documents = self._clean_metadata(documents)
+
+            logger.debug(f"Indexing base document #{base_doc_counter}: {base_doc} and all dependent documents: {documents}")
+
+            dependent_docs_counter = 0
+            #
+            for doc in documents:
+                if not doc.page_content:
+                    # To avoid case when all documents have empty content
+                    # See llm_processor.add_documents which exclude metadata of docs with empty content
+                    continue
+                #
+                if 'id' not in doc.metadata or 'updated_on' not in doc.metadata:
+                    logger.warning(f"Document is missing required metadata field 'id' or 'updated_on': {doc.metadata}")
+                #
+                # if index_name is provided, add it to metadata of each document
+                if index_name:
+                    if not doc.metadata.get('collection'):
+                        doc.metadata['collection'] = index_name
+                    else:
+                        doc.metadata['collection'] += f";{index_name}"
+                #
+                try:
+                    pg_vector_add_docs_chunk.append(doc)
+                    dependent_docs_counter += 1
+                    if len(pg_vector_add_docs_chunk) >= self.max_docs_per_add:
+                        add_documents(vectorstore=self.vectorstore, documents=pg_vector_add_docs_chunk)
+                        self._log_tool_event(f"{len(pg_vector_add_docs_chunk)} documents have been indexed. Continuing...")
+                        pg_vector_add_docs_chunk = []
+                except Exception:
+                    from traceback import format_exc
+                    logger.error(f"Error: {format_exc()}")
+                    return {"status": "error", "message": f"Error: {format_exc()}"}
+            msg = f"Indexed base document #{base_doc_counter} out of {base_total} (with {dependent_docs_counter} dependencies)."
+            logger.debug(msg)
+            self._log_tool_event(msg)
+            result["count"] += dependent_docs_counter
+            # After each base document, try a non-forced meta update; throttling handled inside index_meta_update
+            try:
+                self.index_meta_update(index_name, IndexerKeywords.INDEX_META_IN_PROGRESS.value, result["count"], update_force=False)
+            except Exception as exc: # best-effort, do not break indexing
+                logger.warning(f"Failed to update index meta during indexing process for index '{index_name}': {exc}")
+        if pg_vector_add_docs_chunk:
+            add_documents(vectorstore=self.vectorstore, documents=pg_vector_add_docs_chunk)
+
     def _apply_loaders_chunkers(self, documents: Generator[Document, None, None], chunking_tool: str=None, chunking_config=None) -> Generator[Document, None, None]:
-        from alita_sdk.tools.chunkers import __all__ as chunkers
+        from ..tools.chunkers import __all__ as chunkers
 
         if chunking_config is None:
             chunking_config = {}
-        chunking_config['embedding'] = self._embedding
+        chunking_config['embedding'] = self.embeddings
         chunking_config['llm'] = self.llm
-
+
         for document in documents:
-            if content_type := document.metadata.get('loader_content_type', None):
+            if content_type := document.metadata.get(IndexerKeywords.CONTENT_FILE_NAME.value, None):
                 # apply parsing based on content type and chunk if chunker was applied to parent doc
-                content = document.metadata.pop('loader_content', None)
-                yield from process_content_by_type(
+                content = document.metadata.pop(IndexerKeywords.CONTENT_IN_BYTES.value, None)
+                yield from process_document_by_type(
                     document=document,
                     content=content,
                     extension_source=content_type, llm=self.llm, chunking_config=chunking_config)
+            elif chunking_tool and (content_in_bytes := document.metadata.pop(IndexerKeywords.CONTENT_IN_BYTES.value, None)) is not None:
+                if not content_in_bytes:
+                    # content is empty, yield as is
+                    yield document
+                    continue
+                # apply parsing based on content type resolved from chunking_tool
+                content_type = file_extension_by_chunker(chunking_tool)
+                yield from process_document_by_type(
+                    document=document,
+                    content=content_in_bytes,
+                    extension_source=content_type, llm=self.llm, chunking_config=chunking_config)
             elif chunking_tool:
                 # apply default chunker from toolkit config. No parsing.
                 chunker = chunkers.get(chunking_tool)
                 yield from chunker(file_content_generator=iter([document]), config=chunking_config)
             else:
-                # return as is if neither chunker or content typa are specified
+                # return as is if neither chunker nor content type are specified
                 yield document
 
     def _extend_data(self, documents: Generator[Document, None, None]):
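The new `_save_index_generator` above replaces the old "materialize everything, then `_save_index`" flow with per-base-document streaming: documents accumulate in a chunk that is flushed to the vector store whenever it reaches `max_docs_per_add`, with a final flush for the tail. A standalone sketch of that batching pattern (not runnable against a real store; `fake_add_documents` stands in for `llm_processor.add_documents`):

```python
def fake_add_documents(documents):
    # Stand-in for add_documents(vectorstore=..., documents=...).
    print(f"flushed {len(documents)} docs")

def save_in_chunks(docs, max_docs_per_add=3):
    chunk = []
    for doc in docs:
        chunk.append(doc)
        if len(chunk) >= max_docs_per_add:
            fake_add_documents(chunk)  # flush a full batch
            chunk = []
    if chunk:
        fake_add_documents(chunk)  # flush the remaining tail

save_in_chunks(range(7))  # flushes batches of 3, 3, then 1
```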
@@ -200,24 +305,34 @@ class BaseIndexerToolkit(VectorStoreWrapperBase):
 
     def _collect_dependencies(self, documents: Generator[Document, None, None]):
         for document in documents:
+            self._log_tool_event(message=f"Collecting the dependencies for document ID "
+                                         f"'{document.metadata.get('id', 'N/A')}' to collect dependencies if any...")
             dependencies = self._process_document(document)
             yield document
             for dep in dependencies:
                 dep.metadata[IndexerKeywords.PARENT.value] = document.metadata.get('id', None)
                 yield dep
 
+    def _clean_metadata(self, documents: Generator[Document, None, None]):
+        for document in documents:
+            remove_keys = self._remove_metadata_keys()
+            for key in remove_keys:
+                document.metadata.pop(key, None)
+            yield document
+
     def _reduce_duplicates(
             self,
             documents: Generator[Any, None, None],
-            collection_suffix: str,
+            index_name: str,
             log_msg: str = "Verification of documents to index started"
     ) -> Generator[Document, None, None]:
         """Generic duplicate reduction logic for documents."""
-        self._log_data(log_msg, tool_name="index_documents")
-        indexed_data = self._get_indexed_data(collection_suffix)
+        self._ensure_vectorstore_initialized()
+        self._log_tool_event(log_msg, tool_name="index_documents")
+        indexed_data = self._get_indexed_data(index_name)
         indexed_keys = set(indexed_data.keys())
         if not indexed_keys:
-            self._log_data("Vectorstore is empty, indexing all incoming documents", tool_name="index_documents")
+            self._log_tool_event("Vectorstore is empty, indexing all incoming documents", tool_name="index_documents")
             yield from documents
             return
 
@@ -225,7 +340,8 @@ class BaseIndexerToolkit(VectorStoreWrapperBase):
 
         for document in documents:
             key = self.key_fn(document)
-            if key in indexed_keys and collection_suffix == indexed_data[key]['metadata'].get('collection'):
+            key = key if isinstance(key, str) else str(key)
+            if key in indexed_keys and index_name == indexed_data[key]['metadata'].get('collection'):
                 if self.compare_fn(document, indexed_data[key]):
                     continue
                 yield document
@@ -234,13 +350,13 @@ class BaseIndexerToolkit(VectorStoreWrapperBase):
                 yield document
 
         if docs_to_remove:
-            self._log_data(
+            self._log_tool_event(
                 f"Removing {len(docs_to_remove)} documents from vectorstore that are already indexed with different updated_on.",
                 tool_name="index_documents"
             )
             self.vectorstore.delete(ids=list(docs_to_remove))
 
-    def _get_indexed_data(self, collection_suffix: str):
+    def _get_indexed_data(self, index_name: str):
         raise NotImplementedError("Subclasses must implement this method")
 
     def key_fn(self, document: Document):
@@ -252,29 +368,58 @@ class BaseIndexerToolkit(VectorStoreWrapperBase):
 
     def remove_ids_fn(self, idx_data, key: str):
         raise NotImplementedError("Subclasses must implement this method")
 
-    def remove_index(self, collection_suffix: str = ""):
+    def remove_index(self, index_name: str = ""):
         """Cleans the indexed data in the collection."""
-        super()._clean_collection(collection_suffix=collection_suffix)
-        return (f"Collection '{collection_suffix}' has been removed from the vector store.\n"
-                f"Available collections: {self.list_collections()}")
+        super()._clean_collection(index_name=index_name, including_index_meta=True)
+        self._emit_index_data_removed_event(index_name)
+        return (f"Collection '{index_name}' has been removed from the vector store.\n"
+                f"Available collections: {self.list_collections()}") if index_name \
+            else "All collections have been removed from the vector store."
+
+    def _build_collection_filter(self, filter: dict | str, index_name: str = "") -> dict:
+        """Builds a filter for the collection based on the provided suffix."""
+
+        filter = filter if isinstance(filter, dict) else json.loads(filter)
+        if index_name:
+            filter.update({"collection": {
+                "$eq": index_name.strip()
+            }})
+
+        if filter:
+            # Exclude index meta documents from search results
+            filter = {
+                "$and": [
+                    filter,
+                    {"$or": [
+                        {"type": {"$exists": False}},
+                        {"type": {"$ne": IndexerKeywords.INDEX_META_TYPE.value}}
+                    ]},
+                ]
+            }
+        else:
+            filter = {"$or": [
+                {"type": {"$exists": False}},
+                {"type": {"$ne": IndexerKeywords.INDEX_META_TYPE.value}}
+            ]}
+        return filter
 
     def search_index(self,
                      query: str,
-                     collection_suffix: str = "",
-                     filter: dict | str = {}, cut_off: float = 0.5,
+                     index_name: str = "",
+                     filter: dict | str = {}, cut_off: float = DEFAULT_CUT_OFF,
                      search_top: int = 10, reranker: dict = {},
                      full_text_search: Optional[Dict[str, Any]] = None,
                      reranking_config: Optional[Dict[str, Dict[str, Any]]] = None,
                      extended_search: Optional[List[str]] = None,
                      **kwargs):
         """ Searches indexed documents in the vector store."""
-        # build filter on top of collection_suffix
-        filter = filter if isinstance(filter, dict) else json.loads(filter)
-        if collection_suffix:
-            filter.update({"collection": {
-                "$eq": collection_suffix.strip()
-            }})
+        # build filter on top of index_name
 
+        available_collections = super().list_collections()
+        if index_name and index_name not in available_collections:
+            return f"Collection '{index_name}' not found. Available collections: {available_collections}"
+
+        filter = self._build_collection_filter(filter, index_name)
         found_docs = super().search_documents(
             query,
             doctype=self.doctype,
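The new `_build_collection_filter` above centralizes filter construction: the user filter is AND-ed with a clause that hides `index_meta` bookkeeping documents from search results. A runnable standalone replica with a worked example; the literal enum value is not shown in this diff, so `"index_meta"` below is an assumption:

```python
import json

INDEX_META_TYPE = "index_meta"  # assumed literal for IndexerKeywords.INDEX_META_TYPE.value

def build_collection_filter(filter, index_name=""):
    # Standalone replica of _build_collection_filter from the hunk above.
    filter = filter if isinstance(filter, dict) else json.loads(filter)
    if index_name:
        filter.update({"collection": {"$eq": index_name.strip()}})
    meta_exclusion = {"$or": [{"type": {"$exists": False}},
                              {"type": {"$ne": INDEX_META_TYPE}}]}
    return {"$and": [filter, meta_exclusion]} if filter else meta_exclusion

print(build_collection_filter({"status": "active"}, "docs"))
# {'$and': [{'status': 'active', 'collection': {'$eq': 'docs'}},
#           {'$or': [{'type': {'$exists': False}}, {'type': {'$ne': 'index_meta'}}]}]}
```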
@@ -291,14 +436,15 @@ class BaseIndexerToolkit(VectorStoreWrapperBase):
     def stepback_search_index(self,
                               query: str,
                               messages: List[Dict[str, Any]] = [],
-                              collection_suffix: str = "",
-                              filter: dict | str = {}, cut_off: float = 0.5,
+                              index_name: str = "",
+                              filter: dict | str = {}, cut_off: float = DEFAULT_CUT_OFF,
                               search_top: int = 10, reranker: dict = {},
                               full_text_search: Optional[Dict[str, Any]] = None,
                               reranking_config: Optional[Dict[str, Dict[str, Any]]] = None,
                               extended_search: Optional[List[str]] = None,
                               **kwargs):
         """ Searches indexed documents in the vector store."""
+        filter = self._build_collection_filter(filter, index_name)
         found_docs = super().stepback_search(
             query,
             messages,
@@ -315,14 +461,16 @@ class BaseIndexerToolkit(VectorStoreWrapperBase):
     def stepback_summary_index(self,
                                query: str,
                                messages: List[Dict[str, Any]] = [],
-                               collection_suffix: str = "",
-                               filter: dict | str = {}, cut_off: float = 0.5,
+                               index_name: str = "",
+                               filter: dict | str = {}, cut_off: float = DEFAULT_CUT_OFF,
                                search_top: int = 10, reranker: dict = {},
                                full_text_search: Optional[Dict[str, Any]] = None,
                                reranking_config: Optional[Dict[str, Dict[str, Any]]] = None,
                                extended_search: Optional[List[str]] = None,
                                **kwargs):
         """ Generates a summary of indexed documents using stepback technique."""
+
+        filter = self._build_collection_filter(filter, index_name)
         return super().stepback_summary(
             query,
             messages,
@@ -334,60 +482,256 @@ class BaseIndexerToolkit(VectorStoreWrapperBase):
             reranking_config=reranking_config,
             extended_search=extended_search
         )
+
+    def index_meta_init(self, index_name: str, index_configuration: dict[str, Any]):
+        self._ensure_vectorstore_initialized()
+        index_meta = super().get_index_meta(index_name)
+        if not index_meta:
+            self._log_tool_event(
+                f"There is no existing index_meta for collection '{index_name}'. Initializing it.",
+                tool_name="index_data"
+            )
+            from ..runtime.langchain.interfaces.llm_processor import add_documents
+            created_on = time.time()
+            metadata = {
+                "collection": index_name,
+                "type": IndexerKeywords.INDEX_META_TYPE.value,
+                "indexed": 0,
+                "updated": 0,
+                "state": IndexerKeywords.INDEX_META_IN_PROGRESS.value,
+                "index_configuration": index_configuration,
+                "created_on": created_on,
+                "updated_on": created_on,
+                "task_id": None,
+                "conversation_id": None,
+                "toolkit_id": self.toolkit_id,
+            }
+            metadata["history"] = json.dumps([metadata])
+            index_meta_doc = Document(page_content=f"{IndexerKeywords.INDEX_META_TYPE.value}_{index_name}", metadata=metadata)
+            add_documents(vectorstore=self.vectorstore, documents=[index_meta_doc])
+
+    def index_meta_update(self, index_name: str, state: str, result: int, update_force: bool = True, interval: Optional[float] = None):
+        """Update `index_meta` document with optional time-based throttling.
+
+        Args:
+            index_name: Index name to update meta for.
+            state: New state value for the `index_meta` record.
+            result: Number of processed documents to store in the `updated` field.
+            update_force: If `True`, perform the update unconditionally, ignoring throttling.
+                If `False`, perform the update only when the effective time interval has passed.
+            interval: Optional custom interval (in seconds) for this call when `update_force` is `False`.
+                If `None`, falls back to the value stored in `self._index_meta_config["update_interval"]`
+                if present, otherwise uses `INDEX_META_UPDATE_INTERVAL`.
+        """
+        self._ensure_vectorstore_initialized()
+        if not hasattr(self, "_index_meta_last_update_time"):
+            self._index_meta_last_update_time: Dict[str, float] = {}
+
+        if not update_force:
+            # Resolve effective interval:
+            # 1) explicit arg
+            # 2) value from `_index_meta_config`
+            # 3) default constant
+            cfg_interval = None
+            if hasattr(self, "_index_meta_config"):
+                cfg_interval = self._index_meta_config.get("update_interval")
+
+            eff_interval = (
+                interval
+                if interval is not None
+                else (cfg_interval if cfg_interval is not None else INDEX_META_UPDATE_INTERVAL)
+            )
+
+            last_time = self._index_meta_last_update_time.get(index_name)
+            now = time.time()
+            if last_time is not None and (now - last_time) < eff_interval:
+                return
+            self._index_meta_last_update_time[index_name] = now
+        else:
+            # For forced updates, always refresh last update time
+            self._index_meta_last_update_time[index_name] = time.time()
+
+        index_meta_raw = super().get_index_meta(index_name)
+        from ..runtime.langchain.interfaces.llm_processor import add_documents
+        #
+        if index_meta_raw:
+            metadata = copy.deepcopy(index_meta_raw.get("metadata", {}))
+            metadata["indexed"] = self.get_indexed_count(index_name)
+            metadata["updated"] = result
+            metadata["state"] = state
+            metadata["updated_on"] = time.time()
+            #
+            history_raw = metadata.pop("history", "[]")
+            try:
+                history = json.loads(history_raw) if history_raw.strip() else []
+                # replace the last history item with updated metadata
+                if history and isinstance(history, list):
+                    history[-1] = metadata
+                else:
+                    history = [metadata]
+            except (json.JSONDecodeError, TypeError):
+                logger.warning(f"Failed to load index history: {history_raw}. Create new with only current item.")
+                history = [metadata]
+            #
+            metadata["history"] = json.dumps(history)
+            index_meta_doc = Document(page_content=index_meta_raw.get("content", ""), metadata=metadata)
+            add_documents(vectorstore=self.vectorstore, documents=[index_meta_doc], ids=[index_meta_raw.get("id")])
+
+    def _emit_index_event(self, index_name: str, error: Optional[str] = None):
+        """
+        Emit custom event for index data operation.
+
+        Args:
+            index_name: The name of the index
+            error: Error message if the operation failed, None otherwise
+        """
+        index_meta = super().get_index_meta(index_name)
+
+        if not index_meta:
+            logger.warning(
+                f"No index_meta found for index '{index_name}'. "
+                "Cannot emit index event."
+            )
+            return
+
+        metadata = index_meta.get("metadata", {})
+
+        # Determine if this is a reindex operation
+        history_raw = metadata.get("history", "[]")
+        try:
+            history = json.loads(history_raw) if history_raw.strip() else []
+            is_reindex = len(history) > 1
+        except (json.JSONDecodeError, TypeError):
+            is_reindex = False
+
+        # Build event message
+        event_data = {
+            "id": index_meta.get("id"),
+            "index_name": index_name,
+            "state": "failed" if error is not None else metadata.get("state"),
+            "error": error,
+            "reindex": is_reindex,
+            "indexed": metadata.get("indexed", 0),
+            "updated": metadata.get("updated", 0),
+            "toolkit_id": metadata.get("toolkit_id"),
+        }
+
+        # Emit the event
+        try:
+            dispatch_custom_event("index_data_status", event_data)
+            logger.debug(
+                f"Emitted index_data_status event for index "
+                f"'{index_name}': {event_data}"
+            )
+        except Exception as e:
+            logger.warning(f"Failed to emit index_data_status event: {e}")
+
+    def _emit_index_data_removed_event(self, index_name: str):
+        """
+        Emit custom event for index data removing.
+
+        Args:
+            index_name: The name of the index
+            toolkit_id: The toolkit identifier
+        """
+        # Build event message
+        event_data = {
+            "index_name": index_name,
+            "toolkit_id": self.toolkit_id,
+            "project_id": self.alita.project_id,
+        }
+        # Emit the event
+        try:
+            dispatch_custom_event("index_data_removed", event_data)
+            logger.debug(
+                f"Emitted index_data_removed event for index "
+                f"'{index_name}': {event_data}"
+            )
+        except Exception as e:
+            logger.warning(f"Failed to emit index_data_removed event: {e}")
 
     def get_available_tools(self):
         """
         Returns the standardized vector search tools (search operations only).
         Index operations are toolkit-specific and should be added manually to each toolkit.
-
+
+        This method constructs the argument schemas for each tool, merging base parameters with any extra parameters
+        defined in the subclass. It also handles the special case for chunking tools and their configuration.
+
         Returns:
-            List of tool dictionaries with name, ref, description, and args_schema
+            list: List of tool dictionaries with name, ref, description, and args_schema.
         """
+        index_params = {
+            "index_name": (
+                str,
+                Field(description="Index name (max 7 characters)", min_length=1, max_length=7)
+            ),
+            "clean_index": (
+                Optional[bool],
+                Field(default=False, description="Optional flag to enforce clean existing index before indexing new data")
+            ),
+            "progress_step": (
+                Optional[int],
+                Field(default=10, ge=0, le=100, description="Optional step size for progress reporting during indexing")
+            ),
+        }
+        chunking_config = (
+            Optional[dict],
+            Field(description="Chunking tool configuration", default=loaders_allowed_to_override)
+        )
+
+        index_extra_params = self._index_tool_params() or {}
+        chunking_tool = index_extra_params.pop("chunking_tool", None)
+        if chunking_tool:
+            index_params = {
+                **index_params,
+                "chunking_tool": chunking_tool,
+            }
+            index_params["chunking_config"] = chunking_config
+        index_args_schema = create_model("IndexData", **index_params, **index_extra_params)
+
         return [
             {
-                "name": "index_data",
-                "mode": "index_data",
+                "name": IndexTools.INDEX_DATA.value,
+                "mode": IndexTools.INDEX_DATA.value,
                 "ref": self.index_data,
                 "description": "Loads data to index.",
-                "args_schema": create_model(
-                    "IndexData",
-                    __base__=BaseIndexDataParams,
-                    **self._index_tool_params() if self._index_tool_params() else {}
-                )
+                "args_schema": index_args_schema,
             },
             {
-                "name": "search_index",
-                "mode": "search_index",
+                "name": IndexTools.SEARCH_INDEX.value,
+                "mode": IndexTools.SEARCH_INDEX.value,
                 "ref": self.search_index,
                 "description": self.search_index.__doc__,
                 "args_schema": BaseSearchParams
             },
             {
-                "name": "stepback_search_index",
-                "mode": "stepback_search_index",
+                "name": IndexTools.STEPBACK_SEARCH_INDEX.value,
+                "mode": IndexTools.STEPBACK_SEARCH_INDEX.value,
                 "ref": self.stepback_search_index,
                 "description": self.stepback_search_index.__doc__,
                 "args_schema": BaseStepbackSearchParams
             },
             {
-                "name": "stepback_summary_index",
-                "mode": "stepback_summary_index",
+                "name": IndexTools.STEPBACK_SUMMARY_INDEX.value,
+                "mode": IndexTools.STEPBACK_SUMMARY_INDEX.value,
                 "ref": self.stepback_summary_index,
                 "description": self.stepback_summary_index.__doc__,
                 "args_schema": BaseStepbackSearchParams
             },
             {
-                "name": "remove_index",
-                "mode": "remove_index",
+                "name": IndexTools.REMOVE_INDEX.value,
+                "mode": IndexTools.REMOVE_INDEX.value,
                 "ref": self.remove_index,
                 "description": self.remove_index.__doc__,
                 "args_schema": RemoveIndexParams
             },
             {
-                "name": "list_collections",
-                "mode": "list_collections",
+                "name": IndexTools.LIST_COLLECTIONS.value,
+                "mode": IndexTools.LIST_COLLECTIONS.value,
                 "ref": self.list_collections,
                 "description": self.list_collections.__doc__,
-                "args_schema": create_model("ListCollectionsParams") # No parameters
+                # No parameters
+                "args_schema": create_model("ListCollectionsParams")
             },
-        ]
+        ]
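The new `index_meta_update` above throttles per-index metadata writes: a non-forced update is skipped unless the effective interval has elapsed since the last update for that index, while forced updates (completion or failure) always go through. A minimal standalone sketch of that rule, under the assumption that the surrounding class state is reduced to a module-level dict:

```python
import time
from typing import Dict, Optional

INDEX_META_UPDATE_INTERVAL = 600.0
_last_update: Dict[str, float] = {}

def should_update(index_name: str, update_force: bool, interval: Optional[float] = None) -> bool:
    # Mirror of the throttling rule in index_meta_update.
    eff_interval = interval if interval is not None else INDEX_META_UPDATE_INTERVAL
    if not update_force:
        last = _last_update.get(index_name)
        now = time.time()
        if last is not None and (now - last) < eff_interval:
            return False  # throttled: too soon since the last meta update
        _last_update[index_name] = now
        return True
    _last_update[index_name] = time.time()  # forced: always refresh timestamp
    return True

print(should_update("docs", update_force=False))  # True  (first call)
print(should_update("docs", update_force=False))  # False (throttled for 600 s)
print(should_update("docs", update_force=True))   # True  (forced)
```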