alita-sdk 0.3.263__py3-none-any.whl → 0.3.499__py3-none-any.whl
This diff compares two publicly released versions of the package as published to a supported registry. It is provided for informational purposes only and reflects the changes between package versions exactly as they appear in their public registries.
- alita_sdk/cli/__init__.py +10 -0
- alita_sdk/cli/__main__.py +17 -0
- alita_sdk/cli/agent/__init__.py +5 -0
- alita_sdk/cli/agent/default.py +258 -0
- alita_sdk/cli/agent_executor.py +155 -0
- alita_sdk/cli/agent_loader.py +215 -0
- alita_sdk/cli/agent_ui.py +228 -0
- alita_sdk/cli/agents.py +3601 -0
- alita_sdk/cli/callbacks.py +647 -0
- alita_sdk/cli/cli.py +168 -0
- alita_sdk/cli/config.py +306 -0
- alita_sdk/cli/context/__init__.py +30 -0
- alita_sdk/cli/context/cleanup.py +198 -0
- alita_sdk/cli/context/manager.py +731 -0
- alita_sdk/cli/context/message.py +285 -0
- alita_sdk/cli/context/strategies.py +289 -0
- alita_sdk/cli/context/token_estimation.py +127 -0
- alita_sdk/cli/formatting.py +182 -0
- alita_sdk/cli/input_handler.py +419 -0
- alita_sdk/cli/inventory.py +1256 -0
- alita_sdk/cli/mcp_loader.py +315 -0
- alita_sdk/cli/toolkit.py +327 -0
- alita_sdk/cli/toolkit_loader.py +85 -0
- alita_sdk/cli/tools/__init__.py +43 -0
- alita_sdk/cli/tools/approval.py +224 -0
- alita_sdk/cli/tools/filesystem.py +1751 -0
- alita_sdk/cli/tools/planning.py +389 -0
- alita_sdk/cli/tools/terminal.py +414 -0
- alita_sdk/community/__init__.py +64 -8
- alita_sdk/community/inventory/__init__.py +224 -0
- alita_sdk/community/inventory/config.py +257 -0
- alita_sdk/community/inventory/enrichment.py +2137 -0
- alita_sdk/community/inventory/extractors.py +1469 -0
- alita_sdk/community/inventory/ingestion.py +3172 -0
- alita_sdk/community/inventory/knowledge_graph.py +1457 -0
- alita_sdk/community/inventory/parsers/__init__.py +218 -0
- alita_sdk/community/inventory/parsers/base.py +295 -0
- alita_sdk/community/inventory/parsers/csharp_parser.py +907 -0
- alita_sdk/community/inventory/parsers/go_parser.py +851 -0
- alita_sdk/community/inventory/parsers/html_parser.py +389 -0
- alita_sdk/community/inventory/parsers/java_parser.py +593 -0
- alita_sdk/community/inventory/parsers/javascript_parser.py +629 -0
- alita_sdk/community/inventory/parsers/kotlin_parser.py +768 -0
- alita_sdk/community/inventory/parsers/markdown_parser.py +362 -0
- alita_sdk/community/inventory/parsers/python_parser.py +604 -0
- alita_sdk/community/inventory/parsers/rust_parser.py +858 -0
- alita_sdk/community/inventory/parsers/swift_parser.py +832 -0
- alita_sdk/community/inventory/parsers/text_parser.py +322 -0
- alita_sdk/community/inventory/parsers/yaml_parser.py +370 -0
- alita_sdk/community/inventory/patterns/__init__.py +61 -0
- alita_sdk/community/inventory/patterns/ast_adapter.py +380 -0
- alita_sdk/community/inventory/patterns/loader.py +348 -0
- alita_sdk/community/inventory/patterns/registry.py +198 -0
- alita_sdk/community/inventory/presets.py +535 -0
- alita_sdk/community/inventory/retrieval.py +1403 -0
- alita_sdk/community/inventory/toolkit.py +173 -0
- alita_sdk/community/inventory/visualize.py +1370 -0
- alita_sdk/configurations/__init__.py +10 -0
- alita_sdk/configurations/ado.py +4 -2
- alita_sdk/configurations/azure_search.py +1 -1
- alita_sdk/configurations/bigquery.py +1 -1
- alita_sdk/configurations/bitbucket.py +94 -2
- alita_sdk/configurations/browser.py +18 -0
- alita_sdk/configurations/carrier.py +19 -0
- alita_sdk/configurations/confluence.py +96 -1
- alita_sdk/configurations/delta_lake.py +1 -1
- alita_sdk/configurations/figma.py +0 -5
- alita_sdk/configurations/github.py +65 -1
- alita_sdk/configurations/gitlab.py +79 -0
- alita_sdk/configurations/google_places.py +17 -0
- alita_sdk/configurations/jira.py +103 -0
- alita_sdk/configurations/postman.py +1 -1
- alita_sdk/configurations/qtest.py +1 -3
- alita_sdk/configurations/report_portal.py +19 -0
- alita_sdk/configurations/salesforce.py +19 -0
- alita_sdk/configurations/service_now.py +1 -12
- alita_sdk/configurations/sharepoint.py +19 -0
- alita_sdk/configurations/sonar.py +18 -0
- alita_sdk/configurations/sql.py +20 -0
- alita_sdk/configurations/testio.py +18 -0
- alita_sdk/configurations/testrail.py +88 -0
- alita_sdk/configurations/xray.py +94 -1
- alita_sdk/configurations/zephyr_enterprise.py +94 -1
- alita_sdk/configurations/zephyr_essential.py +95 -0
- alita_sdk/runtime/clients/artifact.py +12 -2
- alita_sdk/runtime/clients/client.py +235 -66
- alita_sdk/runtime/clients/mcp_discovery.py +342 -0
- alita_sdk/runtime/clients/mcp_manager.py +262 -0
- alita_sdk/runtime/clients/sandbox_client.py +373 -0
- alita_sdk/runtime/langchain/assistant.py +123 -17
- alita_sdk/runtime/langchain/constants.py +8 -1
- alita_sdk/runtime/langchain/document_loaders/AlitaDocxMammothLoader.py +315 -3
- alita_sdk/runtime/langchain/document_loaders/AlitaExcelLoader.py +209 -31
- alita_sdk/runtime/langchain/document_loaders/AlitaImageLoader.py +1 -1
- alita_sdk/runtime/langchain/document_loaders/AlitaJSONLoader.py +8 -2
- alita_sdk/runtime/langchain/document_loaders/AlitaMarkdownLoader.py +66 -0
- alita_sdk/runtime/langchain/document_loaders/AlitaPDFLoader.py +79 -10
- alita_sdk/runtime/langchain/document_loaders/AlitaPowerPointLoader.py +52 -15
- alita_sdk/runtime/langchain/document_loaders/AlitaPythonLoader.py +9 -0
- alita_sdk/runtime/langchain/document_loaders/AlitaTableLoader.py +1 -4
- alita_sdk/runtime/langchain/document_loaders/AlitaTextLoader.py +15 -2
- alita_sdk/runtime/langchain/document_loaders/ImageParser.py +30 -0
- alita_sdk/runtime/langchain/document_loaders/constants.py +187 -40
- alita_sdk/runtime/langchain/interfaces/llm_processor.py +4 -2
- alita_sdk/runtime/langchain/langraph_agent.py +406 -91
- alita_sdk/runtime/langchain/utils.py +51 -8
- alita_sdk/runtime/llms/preloaded.py +2 -6
- alita_sdk/runtime/models/mcp_models.py +61 -0
- alita_sdk/runtime/toolkits/__init__.py +26 -0
- alita_sdk/runtime/toolkits/application.py +9 -2
- alita_sdk/runtime/toolkits/artifact.py +19 -7
- alita_sdk/runtime/toolkits/datasource.py +13 -6
- alita_sdk/runtime/toolkits/mcp.py +780 -0
- alita_sdk/runtime/toolkits/planning.py +178 -0
- alita_sdk/runtime/toolkits/subgraph.py +11 -6
- alita_sdk/runtime/toolkits/tools.py +214 -60
- alita_sdk/runtime/toolkits/vectorstore.py +9 -4
- alita_sdk/runtime/tools/__init__.py +22 -0
- alita_sdk/runtime/tools/application.py +16 -4
- alita_sdk/runtime/tools/artifact.py +312 -19
- alita_sdk/runtime/tools/function.py +100 -4
- alita_sdk/runtime/tools/graph.py +81 -0
- alita_sdk/runtime/tools/image_generation.py +212 -0
- alita_sdk/runtime/tools/llm.py +539 -180
- alita_sdk/runtime/tools/mcp_inspect_tool.py +284 -0
- alita_sdk/runtime/tools/mcp_remote_tool.py +181 -0
- alita_sdk/runtime/tools/mcp_server_tool.py +3 -1
- alita_sdk/runtime/tools/planning/__init__.py +36 -0
- alita_sdk/runtime/tools/planning/models.py +246 -0
- alita_sdk/runtime/tools/planning/wrapper.py +607 -0
- alita_sdk/runtime/tools/router.py +2 -1
- alita_sdk/runtime/tools/sandbox.py +375 -0
- alita_sdk/runtime/tools/vectorstore.py +62 -63
- alita_sdk/runtime/tools/vectorstore_base.py +156 -85
- alita_sdk/runtime/utils/AlitaCallback.py +106 -20
- alita_sdk/runtime/utils/mcp_client.py +465 -0
- alita_sdk/runtime/utils/mcp_oauth.py +244 -0
- alita_sdk/runtime/utils/mcp_sse_client.py +405 -0
- alita_sdk/runtime/utils/mcp_tools_discovery.py +124 -0
- alita_sdk/runtime/utils/streamlit.py +41 -14
- alita_sdk/runtime/utils/toolkit_utils.py +28 -9
- alita_sdk/runtime/utils/utils.py +14 -0
- alita_sdk/tools/__init__.py +78 -35
- alita_sdk/tools/ado/__init__.py +0 -1
- alita_sdk/tools/ado/repos/__init__.py +10 -6
- alita_sdk/tools/ado/repos/repos_wrapper.py +12 -11
- alita_sdk/tools/ado/test_plan/__init__.py +10 -7
- alita_sdk/tools/ado/test_plan/test_plan_wrapper.py +56 -23
- alita_sdk/tools/ado/wiki/__init__.py +10 -11
- alita_sdk/tools/ado/wiki/ado_wrapper.py +114 -28
- alita_sdk/tools/ado/work_item/__init__.py +10 -11
- alita_sdk/tools/ado/work_item/ado_wrapper.py +63 -10
- alita_sdk/tools/advanced_jira_mining/__init__.py +10 -7
- alita_sdk/tools/aws/delta_lake/__init__.py +13 -11
- alita_sdk/tools/azure_ai/search/__init__.py +11 -7
- alita_sdk/tools/base_indexer_toolkit.py +392 -86
- alita_sdk/tools/bitbucket/__init__.py +18 -11
- alita_sdk/tools/bitbucket/api_wrapper.py +52 -9
- alita_sdk/tools/bitbucket/cloud_api_wrapper.py +5 -5
- alita_sdk/tools/browser/__init__.py +40 -16
- alita_sdk/tools/browser/crawler.py +3 -1
- alita_sdk/tools/browser/utils.py +15 -6
- alita_sdk/tools/carrier/__init__.py +17 -17
- alita_sdk/tools/carrier/backend_reports_tool.py +8 -4
- alita_sdk/tools/carrier/excel_reporter.py +8 -4
- alita_sdk/tools/chunkers/__init__.py +3 -1
- alita_sdk/tools/chunkers/code/codeparser.py +1 -1
- alita_sdk/tools/chunkers/sematic/json_chunker.py +1 -0
- alita_sdk/tools/chunkers/sematic/markdown_chunker.py +97 -6
- alita_sdk/tools/chunkers/sematic/proposal_chunker.py +1 -1
- alita_sdk/tools/chunkers/universal_chunker.py +270 -0
- alita_sdk/tools/cloud/aws/__init__.py +9 -6
- alita_sdk/tools/cloud/azure/__init__.py +9 -6
- alita_sdk/tools/cloud/gcp/__init__.py +9 -6
- alita_sdk/tools/cloud/k8s/__init__.py +9 -6
- alita_sdk/tools/code/linter/__init__.py +7 -7
- alita_sdk/tools/code/loaders/codesearcher.py +3 -2
- alita_sdk/tools/code/sonar/__init__.py +18 -12
- alita_sdk/tools/code_indexer_toolkit.py +199 -0
- alita_sdk/tools/confluence/__init__.py +14 -11
- alita_sdk/tools/confluence/api_wrapper.py +198 -58
- alita_sdk/tools/confluence/loader.py +10 -0
- alita_sdk/tools/custom_open_api/__init__.py +9 -4
- alita_sdk/tools/elastic/__init__.py +8 -7
- alita_sdk/tools/elitea_base.py +543 -64
- alita_sdk/tools/figma/__init__.py +10 -8
- alita_sdk/tools/figma/api_wrapper.py +352 -153
- alita_sdk/tools/github/__init__.py +13 -11
- alita_sdk/tools/github/api_wrapper.py +9 -26
- alita_sdk/tools/github/github_client.py +75 -12
- alita_sdk/tools/github/schemas.py +2 -1
- alita_sdk/tools/gitlab/__init__.py +11 -10
- alita_sdk/tools/gitlab/api_wrapper.py +135 -45
- alita_sdk/tools/gitlab_org/__init__.py +11 -9
- alita_sdk/tools/google/bigquery/__init__.py +12 -13
- alita_sdk/tools/google_places/__init__.py +18 -10
- alita_sdk/tools/jira/__init__.py +14 -8
- alita_sdk/tools/jira/api_wrapper.py +315 -168
- alita_sdk/tools/keycloak/__init__.py +8 -7
- alita_sdk/tools/localgit/local_git.py +56 -54
- alita_sdk/tools/memory/__init__.py +27 -11
- alita_sdk/tools/non_code_indexer_toolkit.py +7 -2
- alita_sdk/tools/ocr/__init__.py +8 -7
- alita_sdk/tools/openapi/__init__.py +10 -1
- alita_sdk/tools/pandas/__init__.py +8 -7
- alita_sdk/tools/pandas/api_wrapper.py +7 -25
- alita_sdk/tools/postman/__init__.py +8 -10
- alita_sdk/tools/postman/api_wrapper.py +19 -8
- alita_sdk/tools/postman/postman_analysis.py +8 -1
- alita_sdk/tools/pptx/__init__.py +8 -9
- alita_sdk/tools/qtest/__init__.py +19 -13
- alita_sdk/tools/qtest/api_wrapper.py +1784 -88
- alita_sdk/tools/rally/__init__.py +10 -9
- alita_sdk/tools/report_portal/__init__.py +20 -15
- alita_sdk/tools/salesforce/__init__.py +19 -15
- alita_sdk/tools/servicenow/__init__.py +14 -11
- alita_sdk/tools/sharepoint/__init__.py +14 -13
- alita_sdk/tools/sharepoint/api_wrapper.py +179 -39
- alita_sdk/tools/sharepoint/authorization_helper.py +191 -1
- alita_sdk/tools/sharepoint/utils.py +8 -2
- alita_sdk/tools/slack/__init__.py +10 -7
- alita_sdk/tools/sql/__init__.py +19 -18
- alita_sdk/tools/sql/api_wrapper.py +71 -23
- alita_sdk/tools/testio/__init__.py +18 -12
- alita_sdk/tools/testrail/__init__.py +10 -10
- alita_sdk/tools/testrail/api_wrapper.py +213 -45
- alita_sdk/tools/utils/__init__.py +28 -4
- alita_sdk/tools/utils/content_parser.py +181 -61
- alita_sdk/tools/utils/text_operations.py +254 -0
- alita_sdk/tools/vector_adapters/VectorStoreAdapter.py +83 -27
- alita_sdk/tools/xray/__init__.py +12 -7
- alita_sdk/tools/xray/api_wrapper.py +58 -113
- alita_sdk/tools/zephyr/__init__.py +9 -6
- alita_sdk/tools/zephyr_enterprise/__init__.py +13 -8
- alita_sdk/tools/zephyr_enterprise/api_wrapper.py +17 -7
- alita_sdk/tools/zephyr_essential/__init__.py +13 -9
- alita_sdk/tools/zephyr_essential/api_wrapper.py +289 -47
- alita_sdk/tools/zephyr_essential/client.py +6 -4
- alita_sdk/tools/zephyr_scale/__init__.py +10 -7
- alita_sdk/tools/zephyr_scale/api_wrapper.py +6 -2
- alita_sdk/tools/zephyr_squad/__init__.py +9 -6
- {alita_sdk-0.3.263.dist-info → alita_sdk-0.3.499.dist-info}/METADATA +180 -33
- alita_sdk-0.3.499.dist-info/RECORD +433 -0
- alita_sdk-0.3.499.dist-info/entry_points.txt +2 -0
- alita_sdk-0.3.263.dist-info/RECORD +0 -342
- {alita_sdk-0.3.263.dist-info → alita_sdk-0.3.499.dist-info}/WHEEL +0 -0
- {alita_sdk-0.3.263.dist-info → alita_sdk-0.3.499.dist-info}/licenses/LICENSE +0 -0
- {alita_sdk-0.3.263.dist-info → alita_sdk-0.3.499.dist-info}/top_level.txt +0 -0

alita_sdk/tools/base_indexer_toolkit.py (+392 -86):

```diff
@@ -1,40 +1,57 @@
+import copy
 import json
 import logging
-
+import time
+from enum import Enum
+from typing import Any, Optional, List, Dict, Generator
 
+from langchain_core.callbacks import dispatch_custom_event
 from langchain_core.documents import Document
 from pydantic import create_model, Field, SecretStr
 
-from .utils.content_parser import
+from .utils.content_parser import file_extension_by_chunker, process_document_by_type
 from .vector_adapters.VectorStoreAdapter import VectorStoreAdapterFactory
+from ..runtime.langchain.document_loaders.constants import loaders_allowed_to_override
 from ..runtime.tools.vectorstore_base import VectorStoreWrapperBase
 from ..runtime.utils.utils import IndexerKeywords
 
 logger = logging.getLogger(__name__)
 
+DEFAULT_CUT_OFF = 0.1
+INDEX_META_UPDATE_INTERVAL = 600.0
+
+class IndexTools(str, Enum):
+    """Enum for index-related tool names."""
+    INDEX_DATA = "index_data"
+    SEARCH_INDEX = "search_index"
+    STEPBACK_SEARCH_INDEX = "stepback_search_index"
+    STEPBACK_SUMMARY_INDEX = "stepback_summary_index"
+    REMOVE_INDEX = "remove_index"
+    LIST_COLLECTIONS = "list_collections"
+
 # Base Vector Store Schema Models
 BaseIndexParams = create_model(
     "BaseIndexParams",
-
+    index_name=(str, Field(description="Index name (max 7 characters)", min_length=1, max_length=7)),
 )
 
 RemoveIndexParams = create_model(
     "RemoveIndexParams",
-
+    index_name=(Optional[str], Field(description="Optional index name (max 7 characters)", default="", max_length=7)),
 )
 
 BaseSearchParams = create_model(
     "BaseSearchParams",
     query=(str, Field(description="Query text to search in the index")),
-
-        description="Optional
+    index_name=(Optional[str], Field(
+        description="Optional index name (max 7 characters). Leave empty to search across all datasets",
         default="", max_length=7)),
     filter=(Optional[dict | str], Field(
         description="Filter to apply to the search results. Can be a dictionary or a JSON string.",
         default={},
         examples=["{\"key\": \"value\"}", "{\"status\": \"active\"}"]
     )),
-    cut_off=(Optional[float], Field(description="Cut-off score for search results", default=0
+    cut_off=(Optional[float], Field(description="Cut-off score for search results", default=DEFAULT_CUT_OFF, ge=0, le=1)),
     search_top=(Optional[int], Field(description="Number of top results to return", default=10)),
     full_text_search=(Optional[Dict[str, Any]], Field(
         description="Full text search parameters. Can be a dictionary with search options.",
@@ -57,41 +74,41 @@ BaseSearchParams = create_model(
 BaseStepbackSearchParams = create_model(
     "BaseStepbackSearchParams",
     query=(str, Field(description="Query text to search in the index")),
-
+    index_name=(Optional[str], Field(description="Optional index name (max 7 characters)", default="", max_length=7)),
     messages=(Optional[List], Field(description="Chat messages for stepback search context", default=[])),
     filter=(Optional[dict | str], Field(
         description="Filter to apply to the search results. Can be a dictionary or a JSON string.",
         default={},
         examples=["{\"key\": \"value\"}", "{\"status\": \"active\"}"]
     )),
-    cut_off=(Optional[float], Field(description="Cut-off score for search results", default=0
+    cut_off=(Optional[float], Field(description="Cut-off score for search results", default=DEFAULT_CUT_OFF, ge=0, le=1)),
    search_top=(Optional[int], Field(description="Number of top results to return", default=10)),
-    reranker=(Optional[dict], Field(
-        description="Reranker configuration. Can be a dictionary with reranking parameters.",
-        default={}
-    )),
     full_text_search=(Optional[Dict[str, Any]], Field(
         description="Full text search parameters. Can be a dictionary with search options.",
         default=None
     )),
-    reranking_config=(Optional[Dict[str, Dict[str, Any]]], Field(
-        description="Reranking configuration. Can be a dictionary with reranking settings.",
-        default=None
-    )),
     extended_search=(Optional[List[str]], Field(
         description="List of additional fields to include in the search results.",
         default=None
     )),
+    reranker=(Optional[dict], Field(
+        description="Reranker configuration. Can be a dictionary with reranking parameters.",
+        default={}
+    )),
+    reranking_config=(Optional[Dict[str, Dict[str, Any]]], Field(
+        description="Reranking configuration. Can be a dictionary with reranking settings.",
+        default=None
+    )),
 )
 
 BaseIndexDataParams = create_model(
     "indexData",
     __base__=BaseIndexParams,
-    progress_step=(Optional[int], Field(default=10, ge=0, le=100,
-        description="Optional step size for progress reporting during indexing")),
     clean_index=(Optional[bool], Field(default=False,
         description="Optional flag to enforce clean existing index before indexing new data")),
-
+    progress_step=(Optional[int], Field(default=10, ge=0, le=100,
+        description="Optional step size for progress reporting during indexing")),
+    chunking_config=(Optional[dict], Field(description="Chunking tool configuration", default=loaders_allowed_to_override)),
 )
 
 
```
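
The reworked schema models above are plain pydantic `create_model` definitions, so the new constraints (`index_name` capped at 7 characters, `cut_off` bounded to `[0, 1]`) are enforced at validation time. A minimal sketch of that behavior, with the field set abbreviated; `SearchParams` below is a stand-in, not the full `BaseSearchParams`:

```python
from typing import Optional
from pydantic import Field, ValidationError, create_model

DEFAULT_CUT_OFF = 0.1  # same default the new version introduces

# Abbreviated stand-in for BaseSearchParams: index_name is optional but
# capped at 7 characters; cut_off must fall within [0, 1].
SearchParams = create_model(
    "SearchParams",
    query=(str, Field(description="Query text to search in the index")),
    index_name=(Optional[str], Field(default="", max_length=7)),
    cut_off=(Optional[float], Field(default=DEFAULT_CUT_OFF, ge=0, le=1)),
)

print(SearchParams(query="auth flow", index_name="docs").cut_off)  # 0.1
try:
    SearchParams(query="auth flow", index_name="too-long-name")
except ValidationError as e:
    print(f"rejected: {e.errors()[0]['loc']}")  # ('index_name',)
```

With `ge=0, le=1` on `cut_off`, out-of-range scores are rejected at the schema level instead of reaching the vector store. The diff continues with the constructor and the reworked indexing pipeline:
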
```diff
@@ -100,26 +117,21 @@ class BaseIndexerToolkit(VectorStoreWrapperBase):
 
     doctype: str = "document"
 
-    llm: Any = None
     connection_string: Optional[SecretStr] = None
     collection_name: Optional[str] = None
-    embedding_model: Optional[str] = "HuggingFaceEmbeddings"
-    vectorstore_type: Optional[str] = "PGVector"
-    _embedding: Optional[Any] = None
     alita: Any = None  # Elitea client, if available
 
     def __init__(self, **kwargs):
         conn = kwargs.get('connection_string', None)
         connection_string = conn.get_secret_value() if isinstance(conn, SecretStr) else conn
-        collection_name = kwargs.get('
+        collection_name = kwargs.get('collection_schema')
 
-        if 'embedding_model' not in kwargs:
-            kwargs['embedding_model'] = 'HuggingFaceEmbeddings'
         if 'vectorstore_type' not in kwargs:
             kwargs['vectorstore_type'] = 'PGVector'
         vectorstore_type = kwargs.get('vectorstore_type')
-
-
+        if connection_string:
+            # Initialize vectorstore params only if connection string is provided
+            kwargs['vectorstore_params'] = VectorStoreAdapterFactory.create_adapter(vectorstore_type).get_vectorstore_params(collection_name, connection_string)
         super().__init__(**kwargs)
 
     def _index_tool_params(self, **kwargs) -> dict[str, tuple[type, Field]]:
@@ -129,6 +141,11 @@ class BaseIndexerToolkit(VectorStoreWrapperBase):
         """
         return {}
 
+    def _remove_metadata_keys(self) -> List[str]:
+        """ Returns a list of metadata keys to be removed from documents before indexing.
+        Override this method in subclasses to provide specific keys to remove."""
+        return [IndexerKeywords.CONTENT_IN_BYTES.value, IndexerKeywords.CONTENT_FILE_NAME.value]
+
     def _base_loader(self, **kwargs) -> Generator[Document, None, None]:
         """ Loads documents from a source, processes them,
         and returns a list of Document objects with base metadata: id and created_on."""
@@ -147,45 +164,156 @@ class BaseIndexerToolkit(VectorStoreWrapperBase):
         yield from ()
 
     def index_data(self, **kwargs):
-
-        progress_step = kwargs.get("progress_step")
+        index_name = kwargs.get("index_name")
         clean_index = kwargs.get("clean_index")
         chunking_tool = kwargs.get("chunking_tool")
         chunking_config = kwargs.get("chunking_config")
+
+        # Store the interval in a private dict to avoid Pydantic field errors
+        if not hasattr(self, "_index_meta_config"):
+            self._index_meta_config: Dict[str, Any] = {}
+
+        self._index_meta_config["update_interval"] = kwargs.get(
+            "meta_update_interval",
+            INDEX_META_UPDATE_INTERVAL,
+        )
+
+        result = {"count": 0}
         #
-
-
-
-
-
-
-
-
+        try:
+            if clean_index:
+                self._clean_index(index_name)
+            #
+            self.index_meta_init(index_name, kwargs)
+            self._emit_index_event(index_name)
+            #
+            self._log_tool_event(f"Indexing data into collection with suffix '{index_name}'. It can take some time...")
+            self._log_tool_event(f"Loading the documents to index...{kwargs}")
+            documents = self._base_loader(**kwargs)
+            documents = list(documents)  # consume/exhaust generator to count items
+            documents_count = len(documents)
+            documents = (doc for doc in documents)
+            self._log_tool_event(f"Base documents were pre-loaded. "
+                                 f"Search for possible document duplicates and remove them from the indexing list...")
+            documents = self._reduce_duplicates(documents, index_name)
+            self._log_tool_event(f"Duplicates were removed. "
+                                 f"Processing documents to collect dependencies and prepare them for indexing...")
+            self._save_index_generator(documents, documents_count, chunking_tool, chunking_config, index_name=index_name, result=result)
+            #
+            results_count = result["count"]
+            # Final update should always be forced
+            self.index_meta_update(index_name, IndexerKeywords.INDEX_META_COMPLETED.value, results_count, update_force=True)
+            self._emit_index_event(index_name)
+            #
+            return {"status": "ok", "message": f"successfully indexed {results_count} documents" if results_count > 0
+                    else "no new documents to index"}
+        except Exception as e:
+            # Do maximum effort at least send custom event for supposed changed status
+            msg = str(e)
+            try:
+                # Error update should also be forced
+                self.index_meta_update(index_name, IndexerKeywords.INDEX_META_FAILED.value, result["count"], update_force=True)
+            except Exception as ie:
+                logger.error(f"Failed to update index meta status to FAILED for index '{index_name}': {ie}")
+                msg = f"{msg}; additionally failed to update index meta status to FAILED: {ie}"
+            self._emit_index_event(index_name, error=msg)
+            raise e
+
+    def _save_index_generator(self, base_documents: Generator[Document, None, None], base_total: int, chunking_tool, chunking_config, result, index_name: Optional[str] = None):
+        self._ensure_vectorstore_initialized()
+        self._log_tool_event(f"Base documents are ready for indexing. {base_total} base documents in total to index.")
+        from ..runtime.langchain.interfaces.llm_processor import add_documents
         #
-
-
+        base_doc_counter = 0
+        pg_vector_add_docs_chunk = []
+        for base_doc in base_documents:
+            base_doc_counter += 1
+            self._log_tool_event(f"Processing dependent documents for base documents #{base_doc_counter}.")
+
+            # (base_doc for _ in range(1)) - wrap single base_doc to Generator in order to reuse existing code
+            documents = self._extend_data((base_doc for _ in range(1)))  # update content of not-reduced base document if needed (for sharepoint and similar)
+            documents = self._collect_dependencies(documents)  # collect dependencies for base documents
+            self._log_tool_event(f"Dependent documents were processed. "
+                                 f"Applying chunking tool '{chunking_tool}' if specified and preparing documents for indexing...")
+            documents = self._apply_loaders_chunkers(documents, chunking_tool, chunking_config)
+            self._clean_metadata(documents)
+
+            logger.debug(f"Indexing base document #{base_doc_counter}: {base_doc} and all dependent documents: {documents}")
+
+            dependent_docs_counter = 0
+            #
+            for doc in documents:
+                if not doc.page_content:
+                    # To avoid case when all documents have empty content
+                    # See llm_processor.add_documents which exclude metadata of docs with empty content
+                    continue
+                #
+                if 'id' not in doc.metadata or 'updated_on' not in doc.metadata:
+                    logger.warning(f"Document is missing required metadata field 'id' or 'updated_on': {doc.metadata}")
+                #
+                # if index_name is provided, add it to metadata of each document
+                if index_name:
+                    if not doc.metadata.get('collection'):
+                        doc.metadata['collection'] = index_name
+                    else:
+                        doc.metadata['collection'] += f";{index_name}"
+                #
+                try:
+                    pg_vector_add_docs_chunk.append(doc)
+                    dependent_docs_counter += 1
+                    if len(pg_vector_add_docs_chunk) >= self.max_docs_per_add:
+                        add_documents(vectorstore=self.vectorstore, documents=pg_vector_add_docs_chunk)
+                        self._log_tool_event(f"{len(pg_vector_add_docs_chunk)} documents have been indexed. Continuing...")
+                        pg_vector_add_docs_chunk = []
+                except Exception:
+                    from traceback import format_exc
+                    logger.error(f"Error: {format_exc()}")
+                    return {"status": "error", "message": f"Error: {format_exc()}"}
+            msg = f"Indexed base document #{base_doc_counter} out of {base_total} (with {dependent_docs_counter} dependencies)."
+            logger.debug(msg)
+            self._log_tool_event(msg)
+            result["count"] += dependent_docs_counter
+            # After each base document, try a non-forced meta update; throttling handled inside index_meta_update
+            try:
+                self.index_meta_update(index_name, IndexerKeywords.INDEX_META_IN_PROGRESS.value, result["count"], update_force=False)
+            except Exception as exc:  # best-effort, do not break indexing
+                logger.warning(f"Failed to update index meta during indexing process for index '{index_name}': {exc}")
+        if pg_vector_add_docs_chunk:
+            add_documents(vectorstore=self.vectorstore, documents=pg_vector_add_docs_chunk)
+
     def _apply_loaders_chunkers(self, documents: Generator[Document, None, None], chunking_tool: str=None, chunking_config=None) -> Generator[Document, None, None]:
-        from
+        from ..tools.chunkers import __all__ as chunkers
 
         if chunking_config is None:
             chunking_config = {}
-        chunking_config['embedding'] = self.
+        chunking_config['embedding'] = self.embeddings
         chunking_config['llm'] = self.llm
-
+
         for document in documents:
-            if content_type := document.metadata.get(
+            if content_type := document.metadata.get(IndexerKeywords.CONTENT_FILE_NAME.value, None):
                 # apply parsing based on content type and chunk if chunker was applied to parent doc
-                content = document.metadata.pop(
-                yield from
+                content = document.metadata.pop(IndexerKeywords.CONTENT_IN_BYTES.value, None)
+                yield from process_document_by_type(
                     document=document,
                     content=content,
                     extension_source=content_type, llm=self.llm, chunking_config=chunking_config)
+            elif chunking_tool and (content_in_bytes := document.metadata.pop(IndexerKeywords.CONTENT_IN_BYTES.value, None)) is not None:
+                if not content_in_bytes:
+                    # content is empty, yield as is
+                    yield document
+                    continue
+                # apply parsing based on content type resolved from chunking_tool
+                content_type = file_extension_by_chunker(chunking_tool)
+                yield from process_document_by_type(
+                    document=document,
+                    content=content_in_bytes,
+                    extension_source=content_type, llm=self.llm, chunking_config=chunking_config)
             elif chunking_tool:
                 # apply default chunker from toolkit config. No parsing.
                 chunker = chunkers.get(chunking_tool)
                 yield from chunker(file_content_generator=iter([document]), config=chunking_config)
             else:
-                # return as is if neither chunker
+                # return as is if neither chunker nor content type are specified
                 yield document
 
     def _extend_data(self, documents: Generator[Document, None, None]):
```
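
In `_save_index_generator` above, base documents stream through one at a time, but vector-store writes are buffered until `max_docs_per_add` documents accumulate, with a trailing flush for the final partial batch. A self-contained sketch of that batching pattern, where the hypothetical `flush` callback stands in for the real `add_documents` call:

```python
from typing import Callable, Iterable, List

def add_in_batches(docs: Iterable[str], flush: Callable[[List[str]], None],
                   max_docs_per_add: int = 100) -> int:
    """Buffer items and flush fixed-size batches, mirroring how
    _save_index_generator groups documents before each write."""
    batch: List[str] = []
    count = 0
    for doc in docs:
        batch.append(doc)
        count += 1
        if len(batch) >= max_docs_per_add:
            flush(batch)  # one vector-store write per full batch
            batch = []
    if batch:  # trailing flush, as in the diff's final add_documents call
        flush(batch)
    return count

# 250 documents with a batch size of 100 -> exactly 3 writes.
writes: List[List[str]] = []
total = add_in_batches((f"doc-{i}" for i in range(250)), writes.append, 100)
print(total, len(writes))  # 250 3
```

The next hunks rework duplicate reduction and collection filtering:
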
```diff
@@ -193,24 +321,34 @@ class BaseIndexerToolkit(VectorStoreWrapperBase):
 
     def _collect_dependencies(self, documents: Generator[Document, None, None]):
         for document in documents:
+            self._log_tool_event(message=f"Collecting the dependencies for document ID "
+                                         f"'{document.metadata.get('id', 'N/A')}' to collect dependencies if any...")
             dependencies = self._process_document(document)
             yield document
             for dep in dependencies:
                 dep.metadata[IndexerKeywords.PARENT.value] = document.metadata.get('id', None)
                 yield dep
 
+    def _clean_metadata(self, documents: Generator[Document, None, None]):
+        for document in documents:
+            remove_keys = self._remove_metadata_keys()
+            for key in remove_keys:
+                document.metadata.pop(key, None)
+            yield document
+
     def _reduce_duplicates(
             self,
             documents: Generator[Any, None, None],
-
+            index_name: str,
             log_msg: str = "Verification of documents to index started"
     ) -> Generator[Document, None, None]:
         """Generic duplicate reduction logic for documents."""
-        self.
-
+        self._ensure_vectorstore_initialized()
+        self._log_tool_event(log_msg, tool_name="index_documents")
+        indexed_data = self._get_indexed_data(index_name)
         indexed_keys = set(indexed_data.keys())
         if not indexed_keys:
-            self.
+            self._log_tool_event("Vectorstore is empty, indexing all incoming documents", tool_name="index_documents")
             yield from documents
             return
 
@@ -218,7 +356,8 @@ class BaseIndexerToolkit(VectorStoreWrapperBase):
 
         for document in documents:
             key = self.key_fn(document)
-
+            key = key if isinstance(key, str) else str(key)
+            if key in indexed_keys and index_name == indexed_data[key]['metadata'].get('collection'):
                 if self.compare_fn(document, indexed_data[key]):
                     continue
                 yield document
@@ -227,13 +366,13 @@ class BaseIndexerToolkit(VectorStoreWrapperBase):
             yield document
 
         if docs_to_remove:
-            self.
+            self._log_tool_event(
                 f"Removing {len(docs_to_remove)} documents from vectorstore that are already indexed with different updated_on.",
                 tool_name="index_documents"
             )
             self.vectorstore.delete(ids=list(docs_to_remove))
 
-    def _get_indexed_data(self,
+    def _get_indexed_data(self, index_name: str):
         raise NotImplementedError("Subclasses must implement this method")
 
     def key_fn(self, document: Document):
@@ -245,34 +384,57 @@ class BaseIndexerToolkit(VectorStoreWrapperBase):
     def remove_ids_fn(self, idx_data, key: str):
         raise NotImplementedError("Subclasses must implement this method")
 
-    def remove_index(self,
+    def remove_index(self, index_name: str = ""):
         """Cleans the indexed data in the collection."""
-        super()._clean_collection(
-        return (f"Collection '{
-                f"Available collections: {self.list_collections()}")
+        super()._clean_collection(index_name=index_name, including_index_meta=True)
+        return (f"Collection '{index_name}' has been removed from the vector store.\n"
+                f"Available collections: {self.list_collections()}") if index_name \
+            else "All collections have been removed from the vector store."
 
-    def _build_collection_filter(self, filter: dict | str,
+    def _build_collection_filter(self, filter: dict | str, index_name: str = "") -> dict:
         """Builds a filter for the collection based on the provided suffix."""
 
         filter = filter if isinstance(filter, dict) else json.loads(filter)
-        if
+        if index_name:
             filter.update({"collection": {
-                "$eq":
+                "$eq": index_name.strip()
             }})
+
+        if filter:
+            # Exclude index meta documents from search results
+            filter = {
+                "$and": [
+                    filter,
+                    {"$or": [
+                        {"type": {"$exists": False}},
+                        {"type": {"$ne": IndexerKeywords.INDEX_META_TYPE.value}}
+                    ]},
+                ]
+            }
+        else:
+            filter = {"$or": [
+                {"type": {"$exists": False}},
+                {"type": {"$ne": IndexerKeywords.INDEX_META_TYPE.value}}
+            ]}
         return filter
 
     def search_index(self,
                      query: str,
-
-                     filter: dict | str = {}, cut_off: float =
+                     index_name: str = "",
+                     filter: dict | str = {}, cut_off: float = DEFAULT_CUT_OFF,
                      search_top: int = 10, reranker: dict = {},
                      full_text_search: Optional[Dict[str, Any]] = None,
                     reranking_config: Optional[Dict[str, Dict[str, Any]]] = None,
                     extended_search: Optional[List[str]] = None,
                     **kwargs):
         """ Searches indexed documents in the vector store."""
-        # build filter on top of
-
+        # build filter on top of index_name
+
+        available_collections = super().list_collections()
+        if index_name and index_name not in available_collections:
+            return f"Collection '{index_name}' not found. Available collections: {available_collections}"
+
+        filter = self._build_collection_filter(filter, index_name)
         found_docs = super().search_documents(
             query,
             doctype=self.doctype,
@@ -289,15 +451,15 @@ class BaseIndexerToolkit(VectorStoreWrapperBase):
     def stepback_search_index(self,
                               query: str,
                               messages: List[Dict[str, Any]] = [],
-
-                              filter: dict | str = {}, cut_off: float =
+                              index_name: str = "",
+                              filter: dict | str = {}, cut_off: float = DEFAULT_CUT_OFF,
                               search_top: int = 10, reranker: dict = {},
                               full_text_search: Optional[Dict[str, Any]] = None,
                               reranking_config: Optional[Dict[str, Dict[str, Any]]] = None,
                               extended_search: Optional[List[str]] = None,
                               **kwargs):
         """ Searches indexed documents in the vector store."""
-        filter = self._build_collection_filter(filter,
+        filter = self._build_collection_filter(filter, index_name)
         found_docs = super().stepback_search(
             query,
             messages,
@@ -314,8 +476,8 @@ class BaseIndexerToolkit(VectorStoreWrapperBase):
     def stepback_summary_index(self,
                                query: str,
                                messages: List[Dict[str, Any]] = [],
-
-                               filter: dict | str = {}, cut_off: float =
+                               index_name: str = "",
+                               filter: dict | str = {}, cut_off: float = DEFAULT_CUT_OFF,
                                search_top: int = 10, reranker: dict = {},
                                full_text_search: Optional[Dict[str, Any]] = None,
                                reranking_config: Optional[Dict[str, Dict[str, Any]]] = None,
@@ -323,7 +485,7 @@ class BaseIndexerToolkit(VectorStoreWrapperBase):
                                **kwargs):
         """ Generates a summary of indexed documents using stepback technique."""
 
-        filter = self._build_collection_filter(filter,
+        filter = self._build_collection_filter(filter, index_name)
         return super().stepback_summary(
             query,
             messages,
```
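
`_build_collection_filter` above now wraps every search filter with a clause that keeps the service `index_meta` documents out of search results. A standalone sketch of the resulting filter shape; the `"index_meta"` string is an assumption for `IndexerKeywords.INDEX_META_TYPE.value`, which lives in `alita_sdk/runtime/utils/utils.py`:

```python
import json

INDEX_META_TYPE = "index_meta"  # assumed value of IndexerKeywords.INDEX_META_TYPE

def build_collection_filter(filter_: dict | str, index_name: str = "") -> dict:
    """Standalone sketch of _build_collection_filter: scope the query to one
    collection, then exclude the service index_meta documents."""
    filter_ = filter_ if isinstance(filter_, dict) else json.loads(filter_)
    if index_name:
        filter_.update({"collection": {"$eq": index_name.strip()}})
    meta_exclusion = {"$or": [
        {"type": {"$exists": False}},
        {"type": {"$ne": INDEX_META_TYPE}},
    ]}
    return {"$and": [filter_, meta_exclusion]} if filter_ else meta_exclusion

print(build_collection_filter('{"status": "active"}', "docs"))
# {'$and': [{'status': 'active', 'collection': {'$eq': 'docs'}},
#           {'$or': [{'type': {'$exists': False}},
#                    {'type': {'$ne': 'index_meta'}}]}]}
```

The final hunks add the index-meta lifecycle helpers and move the tool registry onto the `IndexTools` enum:
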
```diff
@@ -335,6 +497,149 @@ class BaseIndexerToolkit(VectorStoreWrapperBase):
             reranking_config=reranking_config,
             extended_search=extended_search
         )
+
+    def index_meta_init(self, index_name: str, index_configuration: dict[str, Any]):
+        self._ensure_vectorstore_initialized()
+        index_meta = super().get_index_meta(index_name)
+        if not index_meta:
+            self._log_tool_event(
+                f"There is no existing index_meta for collection '{index_name}'. Initializing it.",
+                tool_name="index_data"
+            )
+            from ..runtime.langchain.interfaces.llm_processor import add_documents
+            created_on = time.time()
+            metadata = {
+                "collection": index_name,
+                "type": IndexerKeywords.INDEX_META_TYPE.value,
+                "indexed": 0,
+                "updated": 0,
+                "state": IndexerKeywords.INDEX_META_IN_PROGRESS.value,
+                "index_configuration": index_configuration,
+                "created_on": created_on,
+                "updated_on": created_on,
+                "task_id": None,
+                "conversation_id": None,
+                "toolkit_id": self.toolkit_id,
+            }
+            metadata["history"] = json.dumps([metadata])
+            index_meta_doc = Document(page_content=f"{IndexerKeywords.INDEX_META_TYPE.value}_{index_name}", metadata=metadata)
+            add_documents(vectorstore=self.vectorstore, documents=[index_meta_doc])
+
+    def index_meta_update(self, index_name: str, state: str, result: int, update_force: bool = True, interval: Optional[float] = None):
+        """Update `index_meta` document with optional time-based throttling.
+
+        Args:
+            index_name: Index name to update meta for.
+            state: New state value for the `index_meta` record.
+            result: Number of processed documents to store in the `updated` field.
+            update_force: If `True`, perform the update unconditionally, ignoring throttling.
+                If `False`, perform the update only when the effective time interval has passed.
+            interval: Optional custom interval (in seconds) for this call when `update_force` is `False`.
+                If `None`, falls back to the value stored in `self._index_meta_config["update_interval"]`
+                if present, otherwise uses `INDEX_META_UPDATE_INTERVAL`.
+        """
+        self._ensure_vectorstore_initialized()
+        if not hasattr(self, "_index_meta_last_update_time"):
+            self._index_meta_last_update_time: Dict[str, float] = {}
+
+        if not update_force:
+            # Resolve effective interval:
+            # 1) explicit arg
+            # 2) value from `_index_meta_config`
+            # 3) default constant
+            cfg_interval = None
+            if hasattr(self, "_index_meta_config"):
+                cfg_interval = self._index_meta_config.get("update_interval")
+
+            eff_interval = (
+                interval
+                if interval is not None
+                else (cfg_interval if cfg_interval is not None else INDEX_META_UPDATE_INTERVAL)
+            )
+
+            last_time = self._index_meta_last_update_time.get(index_name)
+            now = time.time()
+            if last_time is not None and (now - last_time) < eff_interval:
+                return
+            self._index_meta_last_update_time[index_name] = now
+        else:
+            # For forced updates, always refresh last update time
+            self._index_meta_last_update_time[index_name] = time.time()
+
+        index_meta_raw = super().get_index_meta(index_name)
+        from ..runtime.langchain.interfaces.llm_processor import add_documents
+        #
+        if index_meta_raw:
+            metadata = copy.deepcopy(index_meta_raw.get("metadata", {}))
+            metadata["indexed"] = self.get_indexed_count(index_name)
+            metadata["updated"] = result
+            metadata["state"] = state
+            metadata["updated_on"] = time.time()
+            #
+            history_raw = metadata.pop("history", "[]")
+            try:
+                history = json.loads(history_raw) if history_raw.strip() else []
+                # replace the last history item with updated metadata
+                if history and isinstance(history, list):
+                    history[-1] = metadata
+                else:
+                    history = [metadata]
+            except (json.JSONDecodeError, TypeError):
+                logger.warning(f"Failed to load index history: {history_raw}. Create new with only current item.")
+                history = [metadata]
+            #
+            metadata["history"] = json.dumps(history)
+            index_meta_doc = Document(page_content=index_meta_raw.get("content", ""), metadata=metadata)
+            add_documents(vectorstore=self.vectorstore, documents=[index_meta_doc], ids=[index_meta_raw.get("id")])
+
+    def _emit_index_event(self, index_name: str, error: Optional[str] = None):
+        """
+        Emit custom event for index data operation.
+
+        Args:
+            index_name: The name of the index
+            error: Error message if the operation failed, None otherwise
+        """
+        index_meta = super().get_index_meta(index_name)
+
+        if not index_meta:
+            logger.warning(
+                f"No index_meta found for index '{index_name}'. "
+                "Cannot emit index event."
+            )
+            return
+
+        metadata = index_meta.get("metadata", {})
+
+        # Determine if this is a reindex operation
+        history_raw = metadata.get("history", "[]")
+        try:
+            history = json.loads(history_raw) if history_raw.strip() else []
+            is_reindex = len(history) > 1
+        except (json.JSONDecodeError, TypeError):
+            is_reindex = False
+
+        # Build event message
+        event_data = {
+            "id": index_meta.get("id"),
+            "index_name": index_name,
+            "state": "failed" if error is not None else metadata.get("state"),
+            "error": error,
+            "reindex": is_reindex,
+            "indexed": metadata.get("indexed", 0),
+            "updated": metadata.get("updated", 0),
+            "toolkit_id": metadata.get("toolkit_id"),
+        }
+
+        # Emit the event
+        try:
+            dispatch_custom_event("index_data_status", event_data)
+            logger.debug(
+                f"Emitted index_data_status event for index "
+                f"'{index_name}': {event_data}"
+            )
+        except Exception as e:
+            logger.warning(f"Failed to emit index_data_status event: {e}")
 
     def get_available_tools(self):
         """
@@ -346,8 +651,8 @@ class BaseIndexerToolkit(VectorStoreWrapperBase):
         """
         return [
             {
-                "name":
-                "mode":
+                "name": IndexTools.INDEX_DATA.value,
+                "mode": IndexTools.INDEX_DATA.value,
                 "ref": self.index_data,
                 "description": "Loads data to index.",
                 "args_schema": create_model(
@@ -357,38 +662,39 @@ class BaseIndexerToolkit(VectorStoreWrapperBase):
                 )
             },
             {
-                "name":
-                "mode":
+                "name": IndexTools.SEARCH_INDEX.value,
+                "mode": IndexTools.SEARCH_INDEX.value,
                 "ref": self.search_index,
                 "description": self.search_index.__doc__,
                 "args_schema": BaseSearchParams
             },
             {
-                "name":
-                "mode":
+                "name": IndexTools.STEPBACK_SEARCH_INDEX.value,
+                "mode": IndexTools.STEPBACK_SEARCH_INDEX.value,
                 "ref": self.stepback_search_index,
                 "description": self.stepback_search_index.__doc__,
                 "args_schema": BaseStepbackSearchParams
             },
             {
-                "name":
-                "mode":
+                "name": IndexTools.STEPBACK_SUMMARY_INDEX.value,
+                "mode": IndexTools.STEPBACK_SUMMARY_INDEX.value,
                 "ref": self.stepback_summary_index,
                 "description": self.stepback_summary_index.__doc__,
                 "args_schema": BaseStepbackSearchParams
             },
             {
-                "name":
-                "mode":
+                "name": IndexTools.REMOVE_INDEX.value,
+                "mode": IndexTools.REMOVE_INDEX.value,
                 "ref": self.remove_index,
                 "description": self.remove_index.__doc__,
                 "args_schema": RemoveIndexParams
             },
             {
-                "name":
-                "mode":
+                "name": IndexTools.LIST_COLLECTIONS.value,
+                "mode": IndexTools.LIST_COLLECTIONS.value,
                 "ref": self.list_collections,
                 "description": self.list_collections.__doc__,
-
+                # No parameters
+                "args_schema": create_model("ListCollectionsParams")
             },
-        ]
+        ]
```
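
The throttling rule in `index_meta_update` keeps progress writes cheap: non-forced updates are skipped until the effective interval has elapsed for that index, while forced final and error updates always go through and refresh the timestamp. A minimal sketch of that rule in isolation:

```python
import time
from typing import Dict, Optional

INDEX_META_UPDATE_INTERVAL = 600.0  # default interval from the diff, in seconds

class MetaUpdateThrottle:
    """Sketch of index_meta_update's throttling: at most one non-forced
    write per index within the effective interval."""

    def __init__(self) -> None:
        self._last: Dict[str, float] = {}  # last write time per index name

    def should_update(self, index_name: str, update_force: bool = True,
                      interval: Optional[float] = None) -> bool:
        if update_force:
            # Forced (final/error) updates always proceed and refresh the clock.
            self._last[index_name] = time.time()
            return True
        eff = interval if interval is not None else INDEX_META_UPDATE_INTERVAL
        now = time.time()
        last = self._last.get(index_name)
        if last is not None and (now - last) < eff:
            return False  # throttled: too soon since the last write
        self._last[index_name] = now
        return True

throttle = MetaUpdateThrottle()
print(throttle.should_update("docs", update_force=False))  # True (first write)
print(throttle.should_update("docs", update_force=False))  # False (throttled)
print(throttle.should_update("docs", update_force=True))   # True (forced)
```

Tracking the timestamps in a plain dict keyed by index name, rather than a model field, matches the "avoid Pydantic field errors" note in `index_data` above.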