alita-sdk 0.3.257__py3-none-any.whl → 0.3.584__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of alita-sdk might be problematic.
- alita_sdk/cli/__init__.py +10 -0
- alita_sdk/cli/__main__.py +17 -0
- alita_sdk/cli/agent/__init__.py +5 -0
- alita_sdk/cli/agent/default.py +258 -0
- alita_sdk/cli/agent_executor.py +155 -0
- alita_sdk/cli/agent_loader.py +215 -0
- alita_sdk/cli/agent_ui.py +228 -0
- alita_sdk/cli/agents.py +3794 -0
- alita_sdk/cli/callbacks.py +647 -0
- alita_sdk/cli/cli.py +168 -0
- alita_sdk/cli/config.py +306 -0
- alita_sdk/cli/context/__init__.py +30 -0
- alita_sdk/cli/context/cleanup.py +198 -0
- alita_sdk/cli/context/manager.py +731 -0
- alita_sdk/cli/context/message.py +285 -0
- alita_sdk/cli/context/strategies.py +289 -0
- alita_sdk/cli/context/token_estimation.py +127 -0
- alita_sdk/cli/formatting.py +182 -0
- alita_sdk/cli/input_handler.py +419 -0
- alita_sdk/cli/inventory.py +1073 -0
- alita_sdk/cli/mcp_loader.py +315 -0
- alita_sdk/cli/toolkit.py +327 -0
- alita_sdk/cli/toolkit_loader.py +85 -0
- alita_sdk/cli/tools/__init__.py +43 -0
- alita_sdk/cli/tools/approval.py +224 -0
- alita_sdk/cli/tools/filesystem.py +1751 -0
- alita_sdk/cli/tools/planning.py +389 -0
- alita_sdk/cli/tools/terminal.py +414 -0
- alita_sdk/community/__init__.py +72 -12
- alita_sdk/community/inventory/__init__.py +236 -0
- alita_sdk/community/inventory/config.py +257 -0
- alita_sdk/community/inventory/enrichment.py +2137 -0
- alita_sdk/community/inventory/extractors.py +1469 -0
- alita_sdk/community/inventory/ingestion.py +3172 -0
- alita_sdk/community/inventory/knowledge_graph.py +1457 -0
- alita_sdk/community/inventory/parsers/__init__.py +218 -0
- alita_sdk/community/inventory/parsers/base.py +295 -0
- alita_sdk/community/inventory/parsers/csharp_parser.py +907 -0
- alita_sdk/community/inventory/parsers/go_parser.py +851 -0
- alita_sdk/community/inventory/parsers/html_parser.py +389 -0
- alita_sdk/community/inventory/parsers/java_parser.py +593 -0
- alita_sdk/community/inventory/parsers/javascript_parser.py +629 -0
- alita_sdk/community/inventory/parsers/kotlin_parser.py +768 -0
- alita_sdk/community/inventory/parsers/markdown_parser.py +362 -0
- alita_sdk/community/inventory/parsers/python_parser.py +604 -0
- alita_sdk/community/inventory/parsers/rust_parser.py +858 -0
- alita_sdk/community/inventory/parsers/swift_parser.py +832 -0
- alita_sdk/community/inventory/parsers/text_parser.py +322 -0
- alita_sdk/community/inventory/parsers/yaml_parser.py +370 -0
- alita_sdk/community/inventory/patterns/__init__.py +61 -0
- alita_sdk/community/inventory/patterns/ast_adapter.py +380 -0
- alita_sdk/community/inventory/patterns/loader.py +348 -0
- alita_sdk/community/inventory/patterns/registry.py +198 -0
- alita_sdk/community/inventory/presets.py +535 -0
- alita_sdk/community/inventory/retrieval.py +1403 -0
- alita_sdk/community/inventory/toolkit.py +173 -0
- alita_sdk/community/inventory/toolkit_utils.py +176 -0
- alita_sdk/community/inventory/visualize.py +1370 -0
- alita_sdk/configurations/__init__.py +11 -0
- alita_sdk/configurations/ado.py +148 -2
- alita_sdk/configurations/azure_search.py +1 -1
- alita_sdk/configurations/bigquery.py +1 -1
- alita_sdk/configurations/bitbucket.py +94 -2
- alita_sdk/configurations/browser.py +18 -0
- alita_sdk/configurations/carrier.py +19 -0
- alita_sdk/configurations/confluence.py +130 -1
- alita_sdk/configurations/delta_lake.py +1 -1
- alita_sdk/configurations/figma.py +76 -5
- alita_sdk/configurations/github.py +65 -1
- alita_sdk/configurations/gitlab.py +81 -0
- alita_sdk/configurations/google_places.py +17 -0
- alita_sdk/configurations/jira.py +103 -0
- alita_sdk/configurations/openapi.py +323 -0
- alita_sdk/configurations/postman.py +1 -1
- alita_sdk/configurations/qtest.py +72 -3
- alita_sdk/configurations/report_portal.py +115 -0
- alita_sdk/configurations/salesforce.py +19 -0
- alita_sdk/configurations/service_now.py +1 -12
- alita_sdk/configurations/sharepoint.py +167 -0
- alita_sdk/configurations/sonar.py +18 -0
- alita_sdk/configurations/sql.py +20 -0
- alita_sdk/configurations/testio.py +101 -0
- alita_sdk/configurations/testrail.py +88 -0
- alita_sdk/configurations/xray.py +94 -1
- alita_sdk/configurations/zephyr_enterprise.py +94 -1
- alita_sdk/configurations/zephyr_essential.py +95 -0
- alita_sdk/runtime/clients/artifact.py +21 -4
- alita_sdk/runtime/clients/client.py +458 -67
- alita_sdk/runtime/clients/mcp_discovery.py +342 -0
- alita_sdk/runtime/clients/mcp_manager.py +262 -0
- alita_sdk/runtime/clients/sandbox_client.py +352 -0
- alita_sdk/runtime/langchain/_constants_bkup.py +1318 -0
- alita_sdk/runtime/langchain/assistant.py +183 -43
- alita_sdk/runtime/langchain/constants.py +647 -1
- alita_sdk/runtime/langchain/document_loaders/AlitaDocxMammothLoader.py +315 -3
- alita_sdk/runtime/langchain/document_loaders/AlitaExcelLoader.py +209 -31
- alita_sdk/runtime/langchain/document_loaders/AlitaImageLoader.py +1 -1
- alita_sdk/runtime/langchain/document_loaders/AlitaJSONLinesLoader.py +77 -0
- alita_sdk/runtime/langchain/document_loaders/AlitaJSONLoader.py +10 -3
- alita_sdk/runtime/langchain/document_loaders/AlitaMarkdownLoader.py +66 -0
- alita_sdk/runtime/langchain/document_loaders/AlitaPDFLoader.py +79 -10
- alita_sdk/runtime/langchain/document_loaders/AlitaPowerPointLoader.py +52 -15
- alita_sdk/runtime/langchain/document_loaders/AlitaPythonLoader.py +9 -0
- alita_sdk/runtime/langchain/document_loaders/AlitaTableLoader.py +1 -4
- alita_sdk/runtime/langchain/document_loaders/AlitaTextLoader.py +15 -2
- alita_sdk/runtime/langchain/document_loaders/ImageParser.py +30 -0
- alita_sdk/runtime/langchain/document_loaders/constants.py +189 -41
- alita_sdk/runtime/langchain/interfaces/llm_processor.py +4 -2
- alita_sdk/runtime/langchain/langraph_agent.py +493 -105
- alita_sdk/runtime/langchain/utils.py +118 -8
- alita_sdk/runtime/llms/preloaded.py +2 -6
- alita_sdk/runtime/models/mcp_models.py +61 -0
- alita_sdk/runtime/skills/__init__.py +91 -0
- alita_sdk/runtime/skills/callbacks.py +498 -0
- alita_sdk/runtime/skills/discovery.py +540 -0
- alita_sdk/runtime/skills/executor.py +610 -0
- alita_sdk/runtime/skills/input_builder.py +371 -0
- alita_sdk/runtime/skills/models.py +330 -0
- alita_sdk/runtime/skills/registry.py +355 -0
- alita_sdk/runtime/skills/skill_runner.py +330 -0
- alita_sdk/runtime/toolkits/__init__.py +28 -0
- alita_sdk/runtime/toolkits/application.py +14 -4
- alita_sdk/runtime/toolkits/artifact.py +25 -9
- alita_sdk/runtime/toolkits/datasource.py +13 -6
- alita_sdk/runtime/toolkits/mcp.py +782 -0
- alita_sdk/runtime/toolkits/planning.py +178 -0
- alita_sdk/runtime/toolkits/skill_router.py +238 -0
- alita_sdk/runtime/toolkits/subgraph.py +11 -6
- alita_sdk/runtime/toolkits/tools.py +314 -70
- alita_sdk/runtime/toolkits/vectorstore.py +11 -5
- alita_sdk/runtime/tools/__init__.py +24 -0
- alita_sdk/runtime/tools/application.py +16 -4
- alita_sdk/runtime/tools/artifact.py +367 -33
- alita_sdk/runtime/tools/data_analysis.py +183 -0
- alita_sdk/runtime/tools/function.py +100 -4
- alita_sdk/runtime/tools/graph.py +81 -0
- alita_sdk/runtime/tools/image_generation.py +218 -0
- alita_sdk/runtime/tools/llm.py +1032 -177
- alita_sdk/runtime/tools/loop.py +3 -1
- alita_sdk/runtime/tools/loop_output.py +3 -1
- alita_sdk/runtime/tools/mcp_inspect_tool.py +284 -0
- alita_sdk/runtime/tools/mcp_remote_tool.py +181 -0
- alita_sdk/runtime/tools/mcp_server_tool.py +3 -1
- alita_sdk/runtime/tools/planning/__init__.py +36 -0
- alita_sdk/runtime/tools/planning/models.py +246 -0
- alita_sdk/runtime/tools/planning/wrapper.py +607 -0
- alita_sdk/runtime/tools/router.py +2 -1
- alita_sdk/runtime/tools/sandbox.py +375 -0
- alita_sdk/runtime/tools/skill_router.py +776 -0
- alita_sdk/runtime/tools/tool.py +3 -1
- alita_sdk/runtime/tools/vectorstore.py +69 -65
- alita_sdk/runtime/tools/vectorstore_base.py +163 -90
- alita_sdk/runtime/utils/AlitaCallback.py +137 -21
- alita_sdk/runtime/utils/constants.py +5 -1
- alita_sdk/runtime/utils/mcp_client.py +492 -0
- alita_sdk/runtime/utils/mcp_oauth.py +361 -0
- alita_sdk/runtime/utils/mcp_sse_client.py +434 -0
- alita_sdk/runtime/utils/mcp_tools_discovery.py +124 -0
- alita_sdk/runtime/utils/streamlit.py +41 -14
- alita_sdk/runtime/utils/toolkit_utils.py +28 -9
- alita_sdk/runtime/utils/utils.py +48 -0
- alita_sdk/tools/__init__.py +135 -37
- alita_sdk/tools/ado/__init__.py +2 -2
- alita_sdk/tools/ado/repos/__init__.py +16 -19
- alita_sdk/tools/ado/repos/repos_wrapper.py +12 -20
- alita_sdk/tools/ado/test_plan/__init__.py +27 -8
- alita_sdk/tools/ado/test_plan/test_plan_wrapper.py +56 -28
- alita_sdk/tools/ado/wiki/__init__.py +28 -12
- alita_sdk/tools/ado/wiki/ado_wrapper.py +114 -40
- alita_sdk/tools/ado/work_item/__init__.py +28 -12
- alita_sdk/tools/ado/work_item/ado_wrapper.py +95 -11
- alita_sdk/tools/advanced_jira_mining/__init__.py +13 -8
- alita_sdk/tools/aws/delta_lake/__init__.py +15 -11
- alita_sdk/tools/aws/delta_lake/tool.py +5 -1
- alita_sdk/tools/azure_ai/search/__init__.py +14 -8
- alita_sdk/tools/base/tool.py +5 -1
- alita_sdk/tools/base_indexer_toolkit.py +454 -110
- alita_sdk/tools/bitbucket/__init__.py +28 -19
- alita_sdk/tools/bitbucket/api_wrapper.py +285 -27
- alita_sdk/tools/bitbucket/cloud_api_wrapper.py +5 -5
- alita_sdk/tools/browser/__init__.py +41 -16
- alita_sdk/tools/browser/crawler.py +3 -1
- alita_sdk/tools/browser/utils.py +15 -6
- alita_sdk/tools/carrier/__init__.py +18 -17
- alita_sdk/tools/carrier/backend_reports_tool.py +8 -4
- alita_sdk/tools/carrier/excel_reporter.py +8 -4
- alita_sdk/tools/chunkers/__init__.py +3 -1
- alita_sdk/tools/chunkers/code/codeparser.py +1 -1
- alita_sdk/tools/chunkers/sematic/json_chunker.py +2 -1
- alita_sdk/tools/chunkers/sematic/markdown_chunker.py +97 -6
- alita_sdk/tools/chunkers/sematic/proposal_chunker.py +1 -1
- alita_sdk/tools/chunkers/universal_chunker.py +270 -0
- alita_sdk/tools/cloud/aws/__init__.py +12 -7
- alita_sdk/tools/cloud/azure/__init__.py +12 -7
- alita_sdk/tools/cloud/gcp/__init__.py +12 -7
- alita_sdk/tools/cloud/k8s/__init__.py +12 -7
- alita_sdk/tools/code/linter/__init__.py +10 -8
- alita_sdk/tools/code/loaders/codesearcher.py +3 -2
- alita_sdk/tools/code/sonar/__init__.py +21 -13
- alita_sdk/tools/code_indexer_toolkit.py +199 -0
- alita_sdk/tools/confluence/__init__.py +22 -14
- alita_sdk/tools/confluence/api_wrapper.py +197 -58
- alita_sdk/tools/confluence/loader.py +14 -2
- alita_sdk/tools/custom_open_api/__init__.py +12 -5
- alita_sdk/tools/elastic/__init__.py +11 -8
- alita_sdk/tools/elitea_base.py +546 -64
- alita_sdk/tools/figma/__init__.py +60 -11
- alita_sdk/tools/figma/api_wrapper.py +1400 -167
- alita_sdk/tools/figma/figma_client.py +73 -0
- alita_sdk/tools/figma/toon_tools.py +2748 -0
- alita_sdk/tools/github/__init__.py +18 -17
- alita_sdk/tools/github/api_wrapper.py +9 -26
- alita_sdk/tools/github/github_client.py +81 -12
- alita_sdk/tools/github/schemas.py +2 -1
- alita_sdk/tools/github/tool.py +5 -1
- alita_sdk/tools/gitlab/__init__.py +19 -13
- alita_sdk/tools/gitlab/api_wrapper.py +256 -80
- alita_sdk/tools/gitlab_org/__init__.py +14 -10
- alita_sdk/tools/google/bigquery/__init__.py +14 -13
- alita_sdk/tools/google/bigquery/tool.py +5 -1
- alita_sdk/tools/google_places/__init__.py +21 -11
- alita_sdk/tools/jira/__init__.py +22 -11
- alita_sdk/tools/jira/api_wrapper.py +315 -168
- alita_sdk/tools/keycloak/__init__.py +11 -8
- alita_sdk/tools/localgit/__init__.py +9 -3
- alita_sdk/tools/localgit/local_git.py +62 -54
- alita_sdk/tools/localgit/tool.py +5 -1
- alita_sdk/tools/memory/__init__.py +38 -14
- alita_sdk/tools/non_code_indexer_toolkit.py +7 -2
- alita_sdk/tools/ocr/__init__.py +11 -8
- alita_sdk/tools/openapi/__init__.py +491 -106
- alita_sdk/tools/openapi/api_wrapper.py +1357 -0
- alita_sdk/tools/openapi/tool.py +20 -0
- alita_sdk/tools/pandas/__init__.py +20 -12
- alita_sdk/tools/pandas/api_wrapper.py +40 -45
- alita_sdk/tools/pandas/dataframe/generator/base.py +3 -1
- alita_sdk/tools/postman/__init__.py +11 -11
- alita_sdk/tools/postman/api_wrapper.py +19 -8
- alita_sdk/tools/postman/postman_analysis.py +8 -1
- alita_sdk/tools/pptx/__init__.py +11 -10
- alita_sdk/tools/qtest/__init__.py +22 -14
- alita_sdk/tools/qtest/api_wrapper.py +1784 -88
- alita_sdk/tools/rally/__init__.py +13 -10
- alita_sdk/tools/report_portal/__init__.py +23 -16
- alita_sdk/tools/salesforce/__init__.py +22 -16
- alita_sdk/tools/servicenow/__init__.py +21 -16
- alita_sdk/tools/servicenow/api_wrapper.py +1 -1
- alita_sdk/tools/sharepoint/__init__.py +17 -14
- alita_sdk/tools/sharepoint/api_wrapper.py +179 -39
- alita_sdk/tools/sharepoint/authorization_helper.py +191 -1
- alita_sdk/tools/sharepoint/utils.py +8 -2
- alita_sdk/tools/slack/__init__.py +13 -8
- alita_sdk/tools/sql/__init__.py +22 -19
- alita_sdk/tools/sql/api_wrapper.py +71 -23
- alita_sdk/tools/testio/__init__.py +21 -13
- alita_sdk/tools/testrail/__init__.py +13 -11
- alita_sdk/tools/testrail/api_wrapper.py +214 -46
- alita_sdk/tools/utils/__init__.py +28 -4
- alita_sdk/tools/utils/content_parser.py +241 -55
- alita_sdk/tools/utils/text_operations.py +254 -0
- alita_sdk/tools/vector_adapters/VectorStoreAdapter.py +83 -27
- alita_sdk/tools/xray/__init__.py +18 -14
- alita_sdk/tools/xray/api_wrapper.py +58 -113
- alita_sdk/tools/yagmail/__init__.py +9 -3
- alita_sdk/tools/zephyr/__init__.py +12 -7
- alita_sdk/tools/zephyr_enterprise/__init__.py +16 -9
- alita_sdk/tools/zephyr_enterprise/api_wrapper.py +30 -15
- alita_sdk/tools/zephyr_essential/__init__.py +16 -10
- alita_sdk/tools/zephyr_essential/api_wrapper.py +297 -54
- alita_sdk/tools/zephyr_essential/client.py +6 -4
- alita_sdk/tools/zephyr_scale/__init__.py +13 -8
- alita_sdk/tools/zephyr_scale/api_wrapper.py +39 -31
- alita_sdk/tools/zephyr_squad/__init__.py +12 -7
- {alita_sdk-0.3.257.dist-info → alita_sdk-0.3.584.dist-info}/METADATA +184 -37
- alita_sdk-0.3.584.dist-info/RECORD +452 -0
- alita_sdk-0.3.584.dist-info/entry_points.txt +2 -0
- alita_sdk/tools/bitbucket/tools.py +0 -304
- alita_sdk-0.3.257.dist-info/RECORD +0 -343
- {alita_sdk-0.3.257.dist-info → alita_sdk-0.3.584.dist-info}/WHEEL +0 -0
- {alita_sdk-0.3.257.dist-info → alita_sdk-0.3.584.dist-info}/licenses/LICENSE +0 -0
- {alita_sdk-0.3.257.dist-info → alita_sdk-0.3.584.dist-info}/top_level.txt +0 -0
The detailed diff below covers `alita_sdk/tools/base_indexer_toolkit.py` (+454 -110).

```diff
@@ -1,43 +1,51 @@
+import copy
 import json
 import logging
-
+import time
+from enum import Enum
+from typing import Any, Optional, List, Dict, Generator
 
+from langchain_core.callbacks import dispatch_custom_event
 from langchain_core.documents import Document
 from pydantic import create_model, Field, SecretStr
 
-
-from .utils.content_parser import process_content_by_type
+from .utils.content_parser import file_extension_by_chunker, process_document_by_type
 from .vector_adapters.VectorStoreAdapter import VectorStoreAdapterFactory
+from ..runtime.langchain.document_loaders.constants import loaders_allowed_to_override
 from ..runtime.tools.vectorstore_base import VectorStoreWrapperBase
 from ..runtime.utils.utils import IndexerKeywords
 
 logger = logging.getLogger(__name__)
 
-
-
-
-
-
-
+DEFAULT_CUT_OFF = 0.1
+INDEX_META_UPDATE_INTERVAL = 600.0
+
+class IndexTools(str, Enum):
+    """Enum for index-related tool names."""
+    INDEX_DATA = "index_data"
+    SEARCH_INDEX = "search_index"
+    STEPBACK_SEARCH_INDEX = "stepback_search_index"
+    STEPBACK_SUMMARY_INDEX = "stepback_summary_index"
+    REMOVE_INDEX = "remove_index"
+    LIST_COLLECTIONS = "list_collections"
 
 RemoveIndexParams = create_model(
     "RemoveIndexParams",
-
+    index_name=(Optional[str], Field(description="Optional index name (max 7 characters)", default="", max_length=7)),
 )
 
 BaseSearchParams = create_model(
     "BaseSearchParams",
     query=(str, Field(description="Query text to search in the index")),
-
-    description="Optional
+    index_name=(Optional[str], Field(
+        description="Optional index name (max 7 characters). Leave empty to search across all datasets",
         default="", max_length=7)),
-    vectorstore_type=(Optional[str], Field(description="Vectorstore type (Chroma, PGVector, Elastic, etc.)", default="PGVector")),
     filter=(Optional[dict | str], Field(
         description="Filter to apply to the search results. Can be a dictionary or a JSON string.",
         default={},
         examples=["{\"key\": \"value\"}", "{\"status\": \"active\"}"]
     )),
-    cut_off=(Optional[float], Field(description="Cut-off score for search results", default=0
+    cut_off=(Optional[float], Field(description="Cut-off score for search results", default=DEFAULT_CUT_OFF, ge=0, le=1)),
    search_top=(Optional[int], Field(description="Number of top results to return", default=10)),
     full_text_search=(Optional[Dict[str, Any]], Field(
         description="Full text search parameters. Can be a dictionary with search options.",
```
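The schema rework above drops the per-call `vectorstore_type` argument, standardizes on a short `index_name`, and bounds `cut_off` to `[0, 1]` around the new `DEFAULT_CUT_OFF` constant. A minimal sketch of the `create_model` pattern, assuming pydantic v2 (only `query`, `index_name`, and `cut_off` are reproduced from the diff):

```python
# Minimal sketch of the create_model pattern used for the tool schemas above.
from typing import Optional
from pydantic import Field, ValidationError, create_model

DEFAULT_CUT_OFF = 0.1

BaseSearchParams = create_model(
    "BaseSearchParams",
    query=(str, Field(description="Query text to search in the index")),
    index_name=(Optional[str], Field(default="", max_length=7)),
    cut_off=(Optional[float], Field(default=DEFAULT_CUT_OFF, ge=0, le=1)),
)

print(BaseSearchParams(query="login flow", index_name="docs").cut_off)  # 0.1

try:
    BaseSearchParams(query="x", cut_off=1.5)  # rejected by the le=1 bound
except ValidationError as e:
    print(e.error_count(), "validation error")
```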
```diff
@@ -60,42 +68,31 @@ BaseSearchParams = create_model(
 BaseStepbackSearchParams = create_model(
     "BaseStepbackSearchParams",
     query=(str, Field(description="Query text to search in the index")),
-
-    vectorstore_type=(Optional[str], Field(description="Vectorstore type (Chroma, PGVector, Elastic, etc.)", default="PGVector")),
+    index_name=(Optional[str], Field(description="Optional index name (max 7 characters)", default="", max_length=7)),
     messages=(Optional[List], Field(description="Chat messages for stepback search context", default=[])),
     filter=(Optional[dict | str], Field(
         description="Filter to apply to the search results. Can be a dictionary or a JSON string.",
         default={},
         examples=["{\"key\": \"value\"}", "{\"status\": \"active\"}"]
     )),
-    cut_off=(Optional[float], Field(description="Cut-off score for search results", default=0
+    cut_off=(Optional[float], Field(description="Cut-off score for search results", default=DEFAULT_CUT_OFF, ge=0, le=1)),
     search_top=(Optional[int], Field(description="Number of top results to return", default=10)),
-    reranker=(Optional[dict], Field(
-        description="Reranker configuration. Can be a dictionary with reranking parameters.",
-        default={}
-    )),
     full_text_search=(Optional[Dict[str, Any]], Field(
         description="Full text search parameters. Can be a dictionary with search options.",
         default=None
     )),
-    reranking_config=(Optional[Dict[str, Dict[str, Any]]], Field(
-        description="Reranking configuration. Can be a dictionary with reranking settings.",
-        default=None
-    )),
     extended_search=(Optional[List[str]], Field(
         description="List of additional fields to include in the search results.",
         default=None
     )),
-
-
-
-
-
-
-
-
-        description="Optional flag to enforce clean existing index before indexing new data")),
-    chunking_config=(Optional[dict], Field(description="Chunking tool configuration", default_factory=dict)),
+    reranker=(Optional[dict], Field(
+        description="Reranker configuration. Can be a dictionary with reranking parameters.",
+        default={}
+    )),
+    reranking_config=(Optional[Dict[str, Dict[str, Any]]], Field(
+        description="Reranking configuration. Can be a dictionary with reranking settings.",
+        default=None
+    )),
 )
 
 
@@ -104,29 +101,21 @@ class BaseIndexerToolkit(VectorStoreWrapperBase):
 
     doctype: str = "document"
 
-    llm: Any = None
     connection_string: Optional[SecretStr] = None
     collection_name: Optional[str] = None
-    embedding_model: Optional[str] = "HuggingFaceEmbeddings"
-    embedding_model_params: Optional[Dict[str, Any]] = {"model_name": "sentence-transformers/all-MiniLM-L6-v2"}
-    vectorstore_type: Optional[str] = "PGVector"
-    _embedding: Optional[Any] = None
     alita: Any = None # Elitea client, if available
 
     def __init__(self, **kwargs):
         conn = kwargs.get('connection_string', None)
         connection_string = conn.get_secret_value() if isinstance(conn, SecretStr) else conn
-        collection_name = kwargs.get('
+        collection_name = kwargs.get('collection_schema')
 
-        # if 'embedding_model' not in kwargs:
-        kwargs['embedding_model'] = 'HuggingFaceEmbeddings'
-        if 'embedding_model_params' not in kwargs:
-            kwargs['embedding_model_params'] = {"model_name": "sentence-transformers/all-MiniLM-L6-v2"}
         if 'vectorstore_type' not in kwargs:
             kwargs['vectorstore_type'] = 'PGVector'
         vectorstore_type = kwargs.get('vectorstore_type')
-
-
+        if connection_string:
+            # Initialize vectorstore params only if connection string is provided
+            kwargs['vectorstore_params'] = VectorStoreAdapterFactory.create_adapter(vectorstore_type).get_vectorstore_params(collection_name, connection_string)
         super().__init__(**kwargs)
 
     def _index_tool_params(self, **kwargs) -> dict[str, tuple[type, Field]]:
@@ -136,6 +125,11 @@ class BaseIndexerToolkit(VectorStoreWrapperBase):
         """
         return {}
 
+    def _remove_metadata_keys(self) -> List[str]:
+        """ Returns a list of metadata keys to be removed from documents before indexing.
+        Override this method in subclasses to provide specific keys to remove."""
+        return [IndexerKeywords.CONTENT_IN_BYTES.value, IndexerKeywords.CONTENT_FILE_NAME.value]
+
     def _base_loader(self, **kwargs) -> Generator[Document, None, None]:
         """ Loads documents from a source, processes them,
         and returns a list of Document objects with base metadata: id and created_on."""
```
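The new `_remove_metadata_keys` hook strips transient content fields from metadata before documents reach the vector store, and subclasses can extend the list. A hypothetical override sketch (`_Base` stands in for `BaseIndexerToolkit`, and both the default key strings and `raw_payload` are invented for illustration):

```python
# Hypothetical sketch of the _remove_metadata_keys override pattern.
from typing import List


class _Base:
    def _remove_metadata_keys(self) -> List[str]:
        # the real base returns the IndexerKeywords.CONTENT_IN_BYTES /
        # CONTENT_FILE_NAME values; these strings are assumptions
        return ["content_in_bytes", "content_file_name"]


class MyIndexerToolkit(_Base):
    def _remove_metadata_keys(self) -> List[str]:
        # keep the defaults and strip one extra toolkit-specific key
        return super()._remove_metadata_keys() + ["raw_payload"]


print(MyIndexerToolkit()._remove_metadata_keys())
# ['content_in_bytes', 'content_file_name', 'raw_payload']
```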
```diff
@@ -154,45 +148,156 @@ class BaseIndexerToolkit(VectorStoreWrapperBase):
         yield from ()
 
     def index_data(self, **kwargs):
-
-        progress_step = kwargs.get("progress_step")
+        index_name = kwargs.get("index_name")
         clean_index = kwargs.get("clean_index")
         chunking_tool = kwargs.get("chunking_tool")
         chunking_config = kwargs.get("chunking_config")
+
+        # Store the interval in a private dict to avoid Pydantic field errors
+        if not hasattr(self, "_index_meta_config"):
+            self._index_meta_config: Dict[str, Any] = {}
+
+        self._index_meta_config["update_interval"] = kwargs.get(
+            "meta_update_interval",
+            INDEX_META_UPDATE_INTERVAL,
+        )
+
+        result = {"count": 0}
         #
-
-
-
-
-
-
-
-
+        try:
+            if clean_index:
+                self._clean_index(index_name)
+            #
+            self.index_meta_init(index_name, kwargs)
+            self._emit_index_event(index_name)
+            #
+            self._log_tool_event(f"Indexing data into collection with suffix '{index_name}'. It can take some time...")
+            self._log_tool_event(f"Loading the documents to index...{kwargs}")
+            documents = self._base_loader(**kwargs)
+            documents = list(documents) # consume/exhaust generator to count items
+            documents_count = len(documents)
+            documents = (doc for doc in documents)
+            self._log_tool_event(f"Base documents were pre-loaded. "
+                                 f"Search for possible document duplicates and remove them from the indexing list...")
+            documents = self._reduce_duplicates(documents, index_name)
+            self._log_tool_event(f"Duplicates were removed. "
+                                 f"Processing documents to collect dependencies and prepare them for indexing...")
+            self._save_index_generator(documents, documents_count, chunking_tool, chunking_config, index_name=index_name, result=result)
+            #
+            results_count = result["count"]
+            # Final update should always be forced
+            self.index_meta_update(index_name, IndexerKeywords.INDEX_META_COMPLETED.value, results_count, update_force=True)
+            self._emit_index_event(index_name)
+            #
+            return {"status": "ok", "message": f"successfully indexed {results_count} documents" if results_count > 0
+                    else "no new documents to index"}
+        except Exception as e:
+            # Do maximum effort at least send custom event for supposed changed status
+            msg = str(e)
+            try:
+                # Error update should also be forced
+                self.index_meta_update(index_name, IndexerKeywords.INDEX_META_FAILED.value, result["count"], update_force=True)
+            except Exception as ie:
+                logger.error(f"Failed to update index meta status to FAILED for index '{index_name}': {ie}")
+                msg = f"{msg}; additionally failed to update index meta status to FAILED: {ie}"
+            self._emit_index_event(index_name, error=msg)
+            raise e
+
+    def _save_index_generator(self, base_documents: Generator[Document, None, None], base_total: int, chunking_tool, chunking_config, result, index_name: Optional[str] = None):
+        self._ensure_vectorstore_initialized()
+        self._log_tool_event(f"Base documents are ready for indexing. {base_total} base documents in total to index.")
+        from ..runtime.langchain.interfaces.llm_processor import add_documents
         #
-
-
+        base_doc_counter = 0
+        pg_vector_add_docs_chunk = []
+        for base_doc in base_documents:
+            base_doc_counter += 1
+            self._log_tool_event(f"Processing dependent documents for base documents #{base_doc_counter}.")
+
+            # (base_doc for _ in range(1)) - wrap single base_doc to Generator in order to reuse existing code
+            documents = self._extend_data((base_doc for _ in range(1))) # update content of not-reduced base document if needed (for sharepoint and similar)
+            documents = self._collect_dependencies(documents) # collect dependencies for base documents
+            self._log_tool_event(f"Dependent documents were processed. "
+                                 f"Applying chunking tool '{chunking_tool}' if specified and preparing documents for indexing...")
+            documents = self._apply_loaders_chunkers(documents, chunking_tool, chunking_config)
+            documents = self._clean_metadata(documents)
+
+            logger.debug(f"Indexing base document #{base_doc_counter}: {base_doc} and all dependent documents: {documents}")
+
+            dependent_docs_counter = 0
+            #
+            for doc in documents:
+                if not doc.page_content:
+                    # To avoid case when all documents have empty content
+                    # See llm_processor.add_documents which exclude metadata of docs with empty content
+                    continue
+                #
+                if 'id' not in doc.metadata or 'updated_on' not in doc.metadata:
+                    logger.warning(f"Document is missing required metadata field 'id' or 'updated_on': {doc.metadata}")
+                #
+                # if index_name is provided, add it to metadata of each document
+                if index_name:
+                    if not doc.metadata.get('collection'):
+                        doc.metadata['collection'] = index_name
+                    else:
+                        doc.metadata['collection'] += f";{index_name}"
+                #
+                try:
+                    pg_vector_add_docs_chunk.append(doc)
+                    dependent_docs_counter += 1
+                    if len(pg_vector_add_docs_chunk) >= self.max_docs_per_add:
+                        add_documents(vectorstore=self.vectorstore, documents=pg_vector_add_docs_chunk)
+                        self._log_tool_event(f"{len(pg_vector_add_docs_chunk)} documents have been indexed. Continuing...")
+                        pg_vector_add_docs_chunk = []
+                except Exception:
+                    from traceback import format_exc
+                    logger.error(f"Error: {format_exc()}")
+                    return {"status": "error", "message": f"Error: {format_exc()}"}
+            msg = f"Indexed base document #{base_doc_counter} out of {base_total} (with {dependent_docs_counter} dependencies)."
+            logger.debug(msg)
+            self._log_tool_event(msg)
+            result["count"] += dependent_docs_counter
+            # After each base document, try a non-forced meta update; throttling handled inside index_meta_update
+            try:
+                self.index_meta_update(index_name, IndexerKeywords.INDEX_META_IN_PROGRESS.value, result["count"], update_force=False)
+            except Exception as exc: # best-effort, do not break indexing
+                logger.warning(f"Failed to update index meta during indexing process for index '{index_name}': {exc}")
+        if pg_vector_add_docs_chunk:
+            add_documents(vectorstore=self.vectorstore, documents=pg_vector_add_docs_chunk)
 
     def _apply_loaders_chunkers(self, documents: Generator[Document, None, None], chunking_tool: str=None, chunking_config=None) -> Generator[Document, None, None]:
-        from
+        from ..tools.chunkers import __all__ as chunkers
 
         if chunking_config is None:
             chunking_config = {}
-        chunking_config['embedding'] = self.
+        chunking_config['embedding'] = self.embeddings
         chunking_config['llm'] = self.llm
-
+
         for document in documents:
-            if content_type := document.metadata.get(
+            if content_type := document.metadata.get(IndexerKeywords.CONTENT_FILE_NAME.value, None):
                 # apply parsing based on content type and chunk if chunker was applied to parent doc
-                content = document.metadata.pop(
-                yield from
+                content = document.metadata.pop(IndexerKeywords.CONTENT_IN_BYTES.value, None)
+                yield from process_document_by_type(
                     document=document,
                     content=content,
                     extension_source=content_type, llm=self.llm, chunking_config=chunking_config)
+            elif chunking_tool and (content_in_bytes := document.metadata.pop(IndexerKeywords.CONTENT_IN_BYTES.value, None)) is not None:
+                if not content_in_bytes:
+                    # content is empty, yield as is
+                    yield document
+                    continue
+                # apply parsing based on content type resolved from chunking_tool
+                content_type = file_extension_by_chunker(chunking_tool)
+                yield from process_document_by_type(
+                    document=document,
+                    content=content_in_bytes,
+                    extension_source=content_type, llm=self.llm, chunking_config=chunking_config)
             elif chunking_tool:
                 # apply default chunker from toolkit config. No parsing.
                 chunker = chunkers.get(chunking_tool)
                 yield from chunker(file_content_generator=iter([document]), config=chunking_config)
             else:
-                # return as is if neither chunker
+                # return as is if neither chunker nor content type are specified
                 yield document
 
     def _extend_data(self, documents: Generator[Document, None, None]):
```
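`_save_index_generator` above streams documents into the store in batches of `self.max_docs_per_add`, flushing any remainder after the loop. The batching core, reduced to a standalone sketch (`flush` stands in for `llm_processor.add_documents`):

```python
# Standalone sketch of the batch-and-flush loop in _save_index_generator.
from typing import Callable, Iterable, List


def save_in_batches(docs: Iterable[str], max_docs_per_add: int,
                    flush: Callable[[List[str]], None]) -> None:
    batch: List[str] = []
    for doc in docs:
        batch.append(doc)
        if len(batch) >= max_docs_per_add:
            flush(batch)   # full batch goes to the vector store
            batch = []
    if batch:
        flush(batch)       # final partial batch


save_in_batches([f"doc{i}" for i in range(7)], 3, lambda b: print("flush", b))
# flush ['doc0', 'doc1', 'doc2'] / flush ['doc3', 'doc4', 'doc5'] / flush ['doc6']
```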
```diff
@@ -200,24 +305,34 @@ class BaseIndexerToolkit(VectorStoreWrapperBase):
 
     def _collect_dependencies(self, documents: Generator[Document, None, None]):
         for document in documents:
+            self._log_tool_event(message=f"Collecting the dependencies for document ID "
+                                         f"'{document.metadata.get('id', 'N/A')}' to collect dependencies if any...")
             dependencies = self._process_document(document)
             yield document
             for dep in dependencies:
                 dep.metadata[IndexerKeywords.PARENT.value] = document.metadata.get('id', None)
                 yield dep
 
+    def _clean_metadata(self, documents: Generator[Document, None, None]):
+        for document in documents:
+            remove_keys = self._remove_metadata_keys()
+            for key in remove_keys:
+                document.metadata.pop(key, None)
+            yield document
+
     def _reduce_duplicates(
         self,
         documents: Generator[Any, None, None],
-
+        index_name: str,
         log_msg: str = "Verification of documents to index started"
     ) -> Generator[Document, None, None]:
         """Generic duplicate reduction logic for documents."""
-        self.
-
+        self._ensure_vectorstore_initialized()
+        self._log_tool_event(log_msg, tool_name="index_documents")
+        indexed_data = self._get_indexed_data(index_name)
         indexed_keys = set(indexed_data.keys())
         if not indexed_keys:
-            self.
+            self._log_tool_event("Vectorstore is empty, indexing all incoming documents", tool_name="index_documents")
             yield from documents
             return
 
@@ -225,7 +340,8 @@ class BaseIndexerToolkit(VectorStoreWrapperBase):
 
         for document in documents:
             key = self.key_fn(document)
-
+            key = key if isinstance(key, str) else str(key)
+            if key in indexed_keys and index_name == indexed_data[key]['metadata'].get('collection'):
                 if self.compare_fn(document, indexed_data[key]):
                     continue
                 yield document
```
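Duplicate reduction now normalizes the `key_fn` result to a string and skips a document only when it is already indexed in the same collection and `compare_fn` reports it unchanged. A self-contained sketch of that decision, where plain dicts stand in for the vector store and `compare_fn` is modeled as an `updated_on` match (an assumption for illustration):

```python
# Sketch of the per-document skip decision in _reduce_duplicates.
def reduce_duplicates(documents, indexed_data, index_name):
    indexed_keys = set(indexed_data)
    for doc in documents:
        key = str(doc["id"])  # key_fn result, normalized to str as in the diff
        if key in indexed_keys and index_name == indexed_data[key]["metadata"].get("collection"):
            if doc["updated_on"] == indexed_data[key]["metadata"].get("updated_on"):
                continue  # unchanged duplicate in the same collection: skip
        yield doc


indexed = {"1": {"metadata": {"collection": "docs", "updated_on": 100}}}
docs = [{"id": 1, "updated_on": 100}, {"id": 2, "updated_on": 50}]
print([d["id"] for d in reduce_duplicates(docs, indexed, "docs")])  # [2]
```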
```diff
@@ -234,13 +350,13 @@ class BaseIndexerToolkit(VectorStoreWrapperBase):
             yield document
 
         if docs_to_remove:
-            self.
+            self._log_tool_event(
                 f"Removing {len(docs_to_remove)} documents from vectorstore that are already indexed with different updated_on.",
                 tool_name="index_documents"
             )
             self.vectorstore.delete(ids=list(docs_to_remove))
 
-    def _get_indexed_data(self,
+    def _get_indexed_data(self, index_name: str):
         raise NotImplementedError("Subclasses must implement this method")
 
     def key_fn(self, document: Document):
@@ -252,29 +368,58 @@ class BaseIndexerToolkit(VectorStoreWrapperBase):
     def remove_ids_fn(self, idx_data, key: str):
         raise NotImplementedError("Subclasses must implement this method")
 
-    def remove_index(self,
+    def remove_index(self, index_name: str = ""):
         """Cleans the indexed data in the collection."""
-        super()._clean_collection(
-
-
+        super()._clean_collection(index_name=index_name, including_index_meta=True)
+        self._emit_index_data_removed_event(index_name)
+        return (f"Collection '{index_name}' has been removed from the vector store.\n"
+                f"Available collections: {self.list_collections()}") if index_name \
+            else "All collections have been removed from the vector store."
+
+    def _build_collection_filter(self, filter: dict | str, index_name: str = "") -> dict:
+        """Builds a filter for the collection based on the provided suffix."""
+
+        filter = filter if isinstance(filter, dict) else json.loads(filter)
+        if index_name:
+            filter.update({"collection": {
+                "$eq": index_name.strip()
+            }})
+
+        if filter:
+            # Exclude index meta documents from search results
+            filter = {
+                "$and": [
+                    filter,
+                    {"$or": [
+                        {"type": {"$exists": False}},
+                        {"type": {"$ne": IndexerKeywords.INDEX_META_TYPE.value}}
+                    ]},
+                ]
+            }
+        else:
+            filter = {"$or": [
+                {"type": {"$exists": False}},
+                {"type": {"$ne": IndexerKeywords.INDEX_META_TYPE.value}}
+            ]}
+        return filter
 
     def search_index(self,
                      query: str,
-
-                     filter: dict | str = {}, cut_off: float =
+                     index_name: str = "",
+                     filter: dict | str = {}, cut_off: float = DEFAULT_CUT_OFF,
                      search_top: int = 10, reranker: dict = {},
                      full_text_search: Optional[Dict[str, Any]] = None,
                      reranking_config: Optional[Dict[str, Dict[str, Any]]] = None,
                      extended_search: Optional[List[str]] = None,
                      **kwargs):
         """ Searches indexed documents in the vector store."""
-        # build filter on top of
-        filter = filter if isinstance(filter, dict) else json.loads(filter)
-        if collection_suffix:
-            filter.update({"collection": {
-                "$eq": collection_suffix.strip()
-            }})
+        # build filter on top of index_name
 
+        available_collections = super().list_collections()
+        if index_name and index_name not in available_collections:
+            return f"Collection '{index_name}' not found. Available collections: {available_collections}"
+
+        filter = self._build_collection_filter(filter, index_name)
         found_docs = super().search_documents(
             query,
             doctype=self.doctype,
```
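`_build_collection_filter` wraps whatever filter the caller supplies in an `$and` with a guard that keeps the new `index_meta` bookkeeping documents out of search results. A standalone sketch of the resulting filter shape (the `index_meta` string is an assumed value for `IndexerKeywords.INDEX_META_TYPE`):

```python
# Sketch of the filter shape produced by _build_collection_filter.
import json

INDEX_META_TYPE = "index_meta"  # assumed value of IndexerKeywords.INDEX_META_TYPE


def build_collection_filter(flt, index_name: str = "") -> dict:
    flt = flt if isinstance(flt, dict) else json.loads(flt)
    if index_name:
        flt["collection"] = {"$eq": index_name.strip()}
    # guard that excludes index_meta documents from results
    meta_guard = {"$or": [{"type": {"$exists": False}},
                          {"type": {"$ne": INDEX_META_TYPE}}]}
    return {"$and": [flt, meta_guard]} if flt else meta_guard


print(build_collection_filter('{"status": "active"}', "docs"))
# {'$and': [{'status': 'active', 'collection': {'$eq': 'docs'}}, {'$or': ...}]}
```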
```diff
@@ -291,14 +436,15 @@ class BaseIndexerToolkit(VectorStoreWrapperBase):
     def stepback_search_index(self,
                               query: str,
                               messages: List[Dict[str, Any]] = [],
-
-                              filter: dict | str = {}, cut_off: float =
+                              index_name: str = "",
+                              filter: dict | str = {}, cut_off: float = DEFAULT_CUT_OFF,
                               search_top: int = 10, reranker: dict = {},
                               full_text_search: Optional[Dict[str, Any]] = None,
                               reranking_config: Optional[Dict[str, Dict[str, Any]]] = None,
                               extended_search: Optional[List[str]] = None,
                               **kwargs):
         """ Searches indexed documents in the vector store."""
+        filter = self._build_collection_filter(filter, index_name)
         found_docs = super().stepback_search(
             query,
             messages,
@@ -315,14 +461,16 @@ class BaseIndexerToolkit(VectorStoreWrapperBase):
     def stepback_summary_index(self,
                                query: str,
                                messages: List[Dict[str, Any]] = [],
-
-                               filter: dict | str = {}, cut_off: float =
+                               index_name: str = "",
+                               filter: dict | str = {}, cut_off: float = DEFAULT_CUT_OFF,
                                search_top: int = 10, reranker: dict = {},
                                full_text_search: Optional[Dict[str, Any]] = None,
                                reranking_config: Optional[Dict[str, Dict[str, Any]]] = None,
                                extended_search: Optional[List[str]] = None,
                                **kwargs):
         """ Generates a summary of indexed documents using stepback technique."""
+
+        filter = self._build_collection_filter(filter, index_name)
         return super().stepback_summary(
             query,
             messages,
@@ -334,60 +482,256 @@ class BaseIndexerToolkit(VectorStoreWrapperBase):
             reranking_config=reranking_config,
             extended_search=extended_search
         )
+
+    def index_meta_init(self, index_name: str, index_configuration: dict[str, Any]):
+        self._ensure_vectorstore_initialized()
+        index_meta = super().get_index_meta(index_name)
+        if not index_meta:
+            self._log_tool_event(
+                f"There is no existing index_meta for collection '{index_name}'. Initializing it.",
+                tool_name="index_data"
+            )
+            from ..runtime.langchain.interfaces.llm_processor import add_documents
+            created_on = time.time()
+            metadata = {
+                "collection": index_name,
+                "type": IndexerKeywords.INDEX_META_TYPE.value,
+                "indexed": 0,
+                "updated": 0,
+                "state": IndexerKeywords.INDEX_META_IN_PROGRESS.value,
+                "index_configuration": index_configuration,
+                "created_on": created_on,
+                "updated_on": created_on,
+                "task_id": None,
+                "conversation_id": None,
+                "toolkit_id": self.toolkit_id,
+            }
+            metadata["history"] = json.dumps([metadata])
+            index_meta_doc = Document(page_content=f"{IndexerKeywords.INDEX_META_TYPE.value}_{index_name}", metadata=metadata)
+            add_documents(vectorstore=self.vectorstore, documents=[index_meta_doc])
+
+    def index_meta_update(self, index_name: str, state: str, result: int, update_force: bool = True, interval: Optional[float] = None):
+        """Update `index_meta` document with optional time-based throttling.
+
+        Args:
+            index_name: Index name to update meta for.
+            state: New state value for the `index_meta` record.
+            result: Number of processed documents to store in the `updated` field.
+            update_force: If `True`, perform the update unconditionally, ignoring throttling.
+                If `False`, perform the update only when the effective time interval has passed.
+            interval: Optional custom interval (in seconds) for this call when `update_force` is `False`.
+                If `None`, falls back to the value stored in `self._index_meta_config["update_interval"]`
+                if present, otherwise uses `INDEX_META_UPDATE_INTERVAL`.
+        """
+        self._ensure_vectorstore_initialized()
+        if not hasattr(self, "_index_meta_last_update_time"):
+            self._index_meta_last_update_time: Dict[str, float] = {}
+
+        if not update_force:
+            # Resolve effective interval:
+            # 1) explicit arg
+            # 2) value from `_index_meta_config`
+            # 3) default constant
+            cfg_interval = None
+            if hasattr(self, "_index_meta_config"):
+                cfg_interval = self._index_meta_config.get("update_interval")
+
+            eff_interval = (
+                interval
+                if interval is not None
+                else (cfg_interval if cfg_interval is not None else INDEX_META_UPDATE_INTERVAL)
+            )
+
+            last_time = self._index_meta_last_update_time.get(index_name)
+            now = time.time()
+            if last_time is not None and (now - last_time) < eff_interval:
+                return
+            self._index_meta_last_update_time[index_name] = now
+        else:
+            # For forced updates, always refresh last update time
+            self._index_meta_last_update_time[index_name] = time.time()
+
+        index_meta_raw = super().get_index_meta(index_name)
+        from ..runtime.langchain.interfaces.llm_processor import add_documents
+        #
+        if index_meta_raw:
+            metadata = copy.deepcopy(index_meta_raw.get("metadata", {}))
+            metadata["indexed"] = self.get_indexed_count(index_name)
+            metadata["updated"] = result
+            metadata["state"] = state
+            metadata["updated_on"] = time.time()
+            #
+            history_raw = metadata.pop("history", "[]")
+            try:
+                history = json.loads(history_raw) if history_raw.strip() else []
+                # replace the last history item with updated metadata
+                if history and isinstance(history, list):
+                    history[-1] = metadata
+                else:
+                    history = [metadata]
+            except (json.JSONDecodeError, TypeError):
+                logger.warning(f"Failed to load index history: {history_raw}. Create new with only current item.")
+                history = [metadata]
+            #
+            metadata["history"] = json.dumps(history)
+            index_meta_doc = Document(page_content=index_meta_raw.get("content", ""), metadata=metadata)
+            add_documents(vectorstore=self.vectorstore, documents=[index_meta_doc], ids=[index_meta_raw.get("id")])
+
+    def _emit_index_event(self, index_name: str, error: Optional[str] = None):
+        """
+        Emit custom event for index data operation.
+
+        Args:
+            index_name: The name of the index
+            error: Error message if the operation failed, None otherwise
+        """
+        index_meta = super().get_index_meta(index_name)
+
+        if not index_meta:
+            logger.warning(
+                f"No index_meta found for index '{index_name}'. "
+                "Cannot emit index event."
+            )
+            return
+
+        metadata = index_meta.get("metadata", {})
+
+        # Determine if this is a reindex operation
+        history_raw = metadata.get("history", "[]")
+        try:
+            history = json.loads(history_raw) if history_raw.strip() else []
+            is_reindex = len(history) > 1
+        except (json.JSONDecodeError, TypeError):
+            is_reindex = False
+
+        # Build event message
+        event_data = {
+            "id": index_meta.get("id"),
+            "index_name": index_name,
+            "state": "failed" if error is not None else metadata.get("state"),
+            "error": error,
+            "reindex": is_reindex,
+            "indexed": metadata.get("indexed", 0),
+            "updated": metadata.get("updated", 0),
+            "toolkit_id": metadata.get("toolkit_id"),
+        }
+
+        # Emit the event
+        try:
+            dispatch_custom_event("index_data_status", event_data)
+            logger.debug(
+                f"Emitted index_data_status event for index "
+                f"'{index_name}': {event_data}"
+            )
+        except Exception as e:
+            logger.warning(f"Failed to emit index_data_status event: {e}")
+
+    def _emit_index_data_removed_event(self, index_name: str):
+        """
+        Emit custom event for index data removing.
+
+        Args:
+            index_name: The name of the index
+            toolkit_id: The toolkit identifier
+        """
+        # Build event message
+        event_data = {
+            "index_name": index_name,
+            "toolkit_id": self.toolkit_id,
+            "project_id": self.alita.project_id,
+        }
+        # Emit the event
+        try:
+            dispatch_custom_event("index_data_removed", event_data)
+            logger.debug(
+                f"Emitted index_data_removed event for index "
+                f"'{index_name}': {event_data}"
+            )
+        except Exception as e:
+            logger.warning(f"Failed to emit index_data_removed event: {e}")
 
     def get_available_tools(self):
         """
         Returns the standardized vector search tools (search operations only).
         Index operations are toolkit-specific and should be added manually to each toolkit.
-
+
+        This method constructs the argument schemas for each tool, merging base parameters with any extra parameters
+        defined in the subclass. It also handles the special case for chunking tools and their configuration.
+
         Returns:
-            List of tool dictionaries with name, ref, description, and args_schema
+            list: List of tool dictionaries with name, ref, description, and args_schema.
         """
+        index_params = {
+            "index_name": (
+                str,
+                Field(description="Index name (max 7 characters)", min_length=1, max_length=7)
+            ),
+            "clean_index": (
+                Optional[bool],
+                Field(default=False, description="Optional flag to enforce clean existing index before indexing new data")
+            ),
+            "progress_step": (
+                Optional[int],
+                Field(default=10, ge=0, le=100, description="Optional step size for progress reporting during indexing")
+            ),
+        }
+        chunking_config = (
+            Optional[dict],
+            Field(description="Chunking tool configuration", default=loaders_allowed_to_override)
+        )
+
+        index_extra_params = self._index_tool_params() or {}
+        chunking_tool = index_extra_params.pop("chunking_tool", None)
+        if chunking_tool:
+            index_params = {
+                **index_params,
+                "chunking_tool": chunking_tool,
+            }
+        index_params["chunking_config"] = chunking_config
+        index_args_schema = create_model("IndexData", **index_params, **index_extra_params)
+
         return [
             {
-                "name":
-                "mode":
+                "name": IndexTools.INDEX_DATA.value,
+                "mode": IndexTools.INDEX_DATA.value,
                 "ref": self.index_data,
                 "description": "Loads data to index.",
-                "args_schema":
-                    "IndexData",
-                    __base__=BaseIndexDataParams,
-                    **self._index_tool_params() if self._index_tool_params() else {}
-                )
+                "args_schema": index_args_schema,
             },
             {
-                "name":
-                "mode":
+                "name": IndexTools.SEARCH_INDEX.value,
+                "mode": IndexTools.SEARCH_INDEX.value,
                 "ref": self.search_index,
                 "description": self.search_index.__doc__,
                 "args_schema": BaseSearchParams
             },
             {
-                "name":
-                "mode":
+                "name": IndexTools.STEPBACK_SEARCH_INDEX.value,
+                "mode": IndexTools.STEPBACK_SEARCH_INDEX.value,
                 "ref": self.stepback_search_index,
                 "description": self.stepback_search_index.__doc__,
                 "args_schema": BaseStepbackSearchParams
             },
             {
-                "name":
-                "mode":
+                "name": IndexTools.STEPBACK_SUMMARY_INDEX.value,
+                "mode": IndexTools.STEPBACK_SUMMARY_INDEX.value,
                 "ref": self.stepback_summary_index,
                 "description": self.stepback_summary_index.__doc__,
                 "args_schema": BaseStepbackSearchParams
             },
             {
-                "name":
-                "mode":
+                "name": IndexTools.REMOVE_INDEX.value,
+                "mode": IndexTools.REMOVE_INDEX.value,
                 "ref": self.remove_index,
                 "description": self.remove_index.__doc__,
                 "args_schema": RemoveIndexParams
             },
             {
-                "name":
-                "mode":
+                "name": IndexTools.LIST_COLLECTIONS.value,
+                "mode": IndexTools.LIST_COLLECTIONS.value,
                 "ref": self.list_collections,
                 "description": self.list_collections.__doc__,
-
+                # No parameters
+                "args_schema": create_model("ListCollectionsParams")
             },
-        ]
+        ]
```
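The throttling inside `index_meta_update` is what keeps long indexing runs from rewriting the meta document for every base document: non-forced progress updates are dropped until the per-index interval (600 s by default, via `INDEX_META_UPDATE_INTERVAL`) has elapsed, while forced completion and failure updates always go through and refresh the timestamp. The gate, reduced to a standalone sketch:

```python
# Minimal sketch of the time-based throttle in index_meta_update.
import time


class MetaThrottle:
    def __init__(self, interval: float = 600.0):
        self.interval = interval
        self.last = {}  # per-index timestamp of the last accepted update

    def should_update(self, index_name: str, force: bool = False) -> bool:
        now = time.time()
        if not force:
            last = self.last.get(index_name)
            if last is not None and (now - last) < self.interval:
                return False  # throttled: interval has not elapsed yet
        self.last[index_name] = now  # forced or accepted: refresh timestamp
        return True


t = MetaThrottle(interval=600.0)
print(t.should_update("docs"))              # True  (first call)
print(t.should_update("docs"))              # False (throttled)
print(t.should_update("docs", force=True))  # True  (forced)
```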