alita-sdk 0.3.374__py3-none-any.whl → 0.3.423__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of alita-sdk might be problematic.
- alita_sdk/configurations/bitbucket.py +95 -0
- alita_sdk/configurations/confluence.py +96 -1
- alita_sdk/configurations/gitlab.py +79 -0
- alita_sdk/configurations/jira.py +103 -0
- alita_sdk/configurations/testrail.py +88 -0
- alita_sdk/configurations/xray.py +93 -0
- alita_sdk/configurations/zephyr_enterprise.py +93 -0
- alita_sdk/configurations/zephyr_essential.py +75 -0
- alita_sdk/runtime/clients/client.py +3 -2
- alita_sdk/runtime/clients/sandbox_client.py +8 -0
- alita_sdk/runtime/langchain/assistant.py +56 -40
- alita_sdk/runtime/langchain/constants.py +4 -0
- alita_sdk/runtime/langchain/document_loaders/AlitaDocxMammothLoader.py +315 -3
- alita_sdk/runtime/langchain/document_loaders/AlitaJSONLoader.py +4 -1
- alita_sdk/runtime/langchain/document_loaders/constants.py +28 -12
- alita_sdk/runtime/langchain/langraph_agent.py +92 -28
- alita_sdk/runtime/langchain/utils.py +24 -4
- alita_sdk/runtime/toolkits/application.py +8 -1
- alita_sdk/runtime/toolkits/tools.py +80 -49
- alita_sdk/runtime/tools/__init__.py +7 -2
- alita_sdk/runtime/tools/application.py +7 -0
- alita_sdk/runtime/tools/function.py +28 -23
- alita_sdk/runtime/tools/graph.py +10 -4
- alita_sdk/runtime/tools/image_generation.py +104 -8
- alita_sdk/runtime/tools/llm.py +146 -114
- alita_sdk/runtime/tools/sandbox.py +166 -63
- alita_sdk/runtime/tools/vectorstore.py +22 -21
- alita_sdk/runtime/tools/vectorstore_base.py +16 -15
- alita_sdk/runtime/utils/utils.py +1 -0
- alita_sdk/tools/__init__.py +43 -31
- alita_sdk/tools/ado/work_item/ado_wrapper.py +17 -8
- alita_sdk/tools/base_indexer_toolkit.py +102 -93
- alita_sdk/tools/code_indexer_toolkit.py +15 -5
- alita_sdk/tools/confluence/api_wrapper.py +30 -8
- alita_sdk/tools/confluence/loader.py +10 -0
- alita_sdk/tools/elitea_base.py +22 -22
- alita_sdk/tools/gitlab/api_wrapper.py +8 -9
- alita_sdk/tools/jira/api_wrapper.py +1 -1
- alita_sdk/tools/non_code_indexer_toolkit.py +2 -2
- alita_sdk/tools/openapi/__init__.py +10 -1
- alita_sdk/tools/qtest/api_wrapper.py +298 -51
- alita_sdk/tools/sharepoint/api_wrapper.py +104 -33
- alita_sdk/tools/sharepoint/authorization_helper.py +175 -1
- alita_sdk/tools/sharepoint/utils.py +8 -2
- alita_sdk/tools/utils/content_parser.py +27 -16
- alita_sdk/tools/vector_adapters/VectorStoreAdapter.py +38 -25
- {alita_sdk-0.3.374.dist-info → alita_sdk-0.3.423.dist-info}/METADATA +1 -1
- {alita_sdk-0.3.374.dist-info → alita_sdk-0.3.423.dist-info}/RECORD +51 -51
- {alita_sdk-0.3.374.dist-info → alita_sdk-0.3.423.dist-info}/WHEEL +0 -0
- {alita_sdk-0.3.374.dist-info → alita_sdk-0.3.423.dist-info}/licenses/LICENSE +0 -0
- {alita_sdk-0.3.374.dist-info → alita_sdk-0.3.423.dist-info}/top_level.txt +0 -0
alita_sdk/tools/sharepoint/authorization_helper.py

@@ -1,7 +1,10 @@
 from datetime import datetime, timezone
+from urllib.parse import unquote, urlparse, quote

 import jwt
 import requests
+from botocore.response import get_response
+

 class SharepointAuthorizationHelper:

@@ -54,4 +57,175 @@ class SharepointAuthorizationHelper:
         except jwt.ExpiredSignatureError:
             return False
         except jwt.InvalidTokenError:
-            return False
+            return False
+
+    def _validate_response(self, response, required_field, error_prefix=None):
+        if response.status_code != 200:
+            raise RuntimeError(f"{error_prefix or 'Request'} failed: {response.status_code} {response.text}")
+        json_data = response.json()
+        if required_field not in json_data:
+            raise KeyError(f"'{required_field}' missing in response")
+        return json_data[required_field]
+
+    def generate_token_and_site_id(self, site_url: str) -> tuple[str, str]:
+        try:
+            parsed = urlparse(site_url)
+            domain = parsed.hostname
+            site_path = parsed.path.strip('/')
+            if not domain or not site_path:
+                raise ValueError(f"site_url missing domain or site path: {site_url}")
+            app_name = domain.split('.')[0]
+            openid_config_url = f"https://login.microsoftonline.com/{app_name}.onmicrosoft.com/v2.0/.well-known/openid-configuration"
+            response = requests.get(openid_config_url)
+            token_url = self._validate_response(response, required_field="token_endpoint", error_prefix="OpenID config")
+            token_data = {
+                "grant_type": "client_credentials",
+                "client_id": self.client_id,
+                "client_secret": self.client_secret,
+                "scope": "https://graph.microsoft.com/.default"
+            }
+            token_response = requests.post(token_url, data=token_data)
+            access_token = self._validate_response(token_response, required_field="access_token", error_prefix="Token request")
+            graph_site_url = f"https://graph.microsoft.com/v1.0/sites/{domain}:/{site_path}"
+            headers = {"Authorization": f"Bearer {access_token}"}
+            site_response = requests.get(graph_site_url, headers=headers)
+            site_id = self._validate_response(site_response, required_field="id", error_prefix="Site info")
+            return access_token, site_id
+        except Exception as e:
+            raise RuntimeError(f"Error while obtaining access_token and site_id: {e}")
+
+    def get_files_list(self, site_url: str, folder_name: str = None, limit_files: int = 100):
+        if not site_url or not site_url.startswith("https://"):
+            raise ValueError(f"Invalid site_url format: {site_url}")
+        if limit_files is not None and (not isinstance(limit_files, int) or limit_files <= 0):
+            raise ValueError(f"limit_files must be a positive integer, got: {limit_files}")
+        try:
+            access_token, site_id = self.generate_token_and_site_id(site_url)
+            headers = {"Authorization": f"Bearer {access_token}"}
+            drives_url = f"https://graph.microsoft.com/v1.0/sites/{site_id}/drives"
+            drives_response = requests.get(drives_url, headers=headers)
+            drives = self._validate_response(drives_response, required_field="value", error_prefix="Drives request")
+            result = []
+            def _recurse_drive(drive_id, drive_path, parent_folder, limit_files):
+                # Escape folder_name for URL safety if present
+                if parent_folder:
+                    safe_folder_name = quote(parent_folder.strip('/'), safe="/")
+                    url = f"https://graph.microsoft.com/v1.0/sites/{site_id}/drives/{drive_id}/root:/{safe_folder_name}:/children?$top={limit_files}"
+                else:
+                    url = f"https://graph.microsoft.com/v1.0/sites/{site_id}/drives/{drive_id}/root/children?$top={limit_files}"
+                response = requests.get(url, headers=headers)
+                if response.status_code != 200:
+                    return []
+                files_json = response.json()
+                if "value" not in files_json:
+                    return []
+                files = []
+                for file in files_json["value"]:
+                    file_name = file.get('name', '')
+                    # Build full path reflecting nested folders
+                    if parent_folder:
+                        full_path = '/' + '/'.join([drive_path.strip('/'), parent_folder.strip('/'), file_name.strip('/')])
+                    else:
+                        full_path = '/' + '/'.join([drive_path.strip('/'), file_name.strip('/')])
+                    temp_props = {
+                        'Name': file_name,
+                        'Path': full_path,
+                        'Created': file.get('createdDateTime'),
+                        'Modified': file.get('lastModifiedDateTime'),
+                        'Link': file.get('webUrl'),
+                        'id': file.get('id')
+                    }
+                    if not all([temp_props['Name'], temp_props['Path'], temp_props['id']]):
+                        continue  # skip files with missing required fields
+                    if 'folder' in file:
+                        # Recursively extract files from this folder
+                        inner_folder = parent_folder + '/' + file_name if parent_folder else file_name
+                        inner_files = _recurse_drive(drive_id, drive_path, inner_folder, limit_files)
+                        files.extend(inner_files)
+                    else:
+                        files.append(temp_props)
+                    if limit_files is not None and len(result) + len(files) >= limit_files:
+                        return files[:limit_files - len(result)]
+                return files
+            for drive in drives:
+                drive_id = drive.get("id")
+                drive_path = unquote(urlparse(drive.get("webUrl")).path) if drive.get("webUrl") else ""
+                if not drive_id:
+                    continue  # skip drives without id
+                files = _recurse_drive(drive_id, drive_path, folder_name, limit_files)
+                result.extend(files)
+                if limit_files is not None and len(result) >= limit_files:
+                    return result[:limit_files]
+            return result
+        except Exception as e:
+            raise RuntimeError(f"Error in get_files_list: {e}")
+
+    def get_file_content(self, site_url: str, path: str):
+        try:
+            access_token, site_id = self.generate_token_and_site_id(site_url)
+            headers = {"Authorization": f"Bearer {access_token}"}
+            drives_url = f"https://graph.microsoft.com/v1.0/sites/{site_id}/drives"
+            drives_response = requests.get(drives_url, headers=headers)
+            drives = self._validate_response(drives_response, required_field="value", error_prefix="Drives request")
+            path = path.strip('/')
+            #
+            for drive in drives:
+                drive_path = unquote(urlparse(drive.get("webUrl")).path).strip('/')
+                if not drive_path or not path.startswith(drive_path):
+                    continue
+                drive_id = drive.get("id")
+                if not drive_id:
+                    continue
+                path = path.replace(drive_path, '').strip('/')
+                safe_path = quote(path, safe="")
+                url = f"https://graph.microsoft.com/v1.0/drives/{drive_id}/root:/{safe_path}:/content"
+                response = requests.get(url, headers=headers)
+                if response.status_code == 200:
+                    return response.content
+            raise RuntimeError(f"File '{path}' not found in any private or shared documents.")
+        except Exception as e:
+            raise RuntimeError(f"Error in get_file_content: {e}")
+
+    def get_list_items(self, site_url: str, list_title: str, limit: int = 1000):
+        """Fallback Graph API method to read SharePoint list items by list title.
+
+        Returns a list of dictionaries representing list item fields.
+        """
+        if not site_url or not site_url.startswith("https://"):
+            raise ValueError(f"Invalid site_url format: {site_url}")
+        try:
+            access_token, site_id = self.generate_token_and_site_id(site_url)
+            headers = {"Authorization": f"Bearer {access_token}"}
+            lists_url = f"https://graph.microsoft.com/v1.0/sites/{site_id}/lists"
+            response = requests.get(lists_url, headers=headers)
+            if response.status_code != 200:
+                raise RuntimeError(f"Lists request failed: {response.status_code} {response.text}")
+            lists_json = response.json()
+            lists = lists_json.get("value", [])
+            target_list = None
+            normalized_title = list_title.strip().lower()
+            for lst in lists:
+                # displayName is the user-visible title. name can differ (internal name)
+                display_name = (lst.get("displayName") or lst.get("name") or '').strip().lower()
+                if display_name == normalized_title:
+                    target_list = lst
+                    break
+            if not target_list:
+                raise RuntimeError(f"List '{list_title}' not found via Graph API.")
+            list_id = target_list.get('id')
+            if not list_id:
+                raise RuntimeError(f"List '{list_title}' missing id field.")
+            items_url = f"https://graph.microsoft.com/v1.0/sites/{site_id}/lists/{list_id}/items?expand=fields&$top={limit}"
+            items_response = requests.get(items_url, headers=headers)
+            if items_response.status_code != 200:
+                raise RuntimeError(f"List items request failed: {items_response.status_code} {items_response.text}")
+            items_json = items_response.json()
+            values = items_json.get('value', [])
+            result = []
+            for item in values:
+                fields = item.get('fields', {})
+                if fields:
+                    result.append(fields)
+            return result
+        except Exception as e:
+            raise RuntimeError(f"Error in get_list_items: {e}")
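The new Graph API helpers above chain together: `generate_token_and_site_id` resolves an app-only access token plus the Graph site id, and the other methods reuse that pair per call. A minimal usage sketch (the constructor arguments and the site, folder, and list names here are illustrative, not from the diff):

```python
# Hypothetical usage of the methods added above; assumes the helper is
# constructed with an Entra ID app registration whose client_id/client_secret
# carry application-level Microsoft Graph permissions.
helper = SharepointAuthorizationHelper(...)  # constructor args unchanged by this diff

site = "https://contoso.sharepoint.com/sites/engineering"  # illustrative URL

# Resolve an app-only token and the Graph site id in one call.
token, site_id = helper.generate_token_and_site_id(site)

# Recursively list up to 50 files across the site's document libraries.
files = helper.get_files_list(site, folder_name="Shared Documents", limit_files=50)

# Download one of the returned files as raw bytes via its 'Path' property.
content = helper.get_file_content(site, files[0]["Path"])

# Read the field dictionaries of a list, matched by display title.
items = helper.get_list_items(site, list_title="Team Tasks", limit=100)
```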
alita_sdk/tools/sharepoint/utils.py

@@ -1,5 +1,7 @@
-
+import re
 from io import BytesIO
+from docx import Document
+

 def read_docx_from_bytes(file_content):
     """Read and return content from a .docx file using a byte stream."""
@@ -11,4 +13,8 @@ def read_docx_from_bytes(file_content):
         return '\n'.join(text)
     except Exception as e:
         print(f"Error reading .docx from bytes: {e}")
-        return ""
+        return ""
+
+
+def decode_sharepoint_string(s):
+    return re.sub(r'_x([0-9A-Fa-f]{4})_', lambda m: chr(int(m.group(1), 16)), s)
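The new `decode_sharepoint_string` helper reverses SharePoint's `_xHHHH_` escaping, where special characters in internal names are encoded as their four-digit hex code point. For example:

```python
# _x0020_ decodes to a space (U+0020), _x0024_ to '$' (U+0024).
decode_sharepoint_string("Project_x0020_Name")  # -> 'Project Name'
decode_sharepoint_string("Cost_x0024_USD")      # -> 'Cost$USD'
```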
alita_sdk/tools/utils/content_parser.py

@@ -92,21 +92,24 @@ def parse_file_content(file_name=None, file_content=None, is_capture_image: bool
         return ToolException(
             "Not supported type of files entered. Supported types are TXT, DOCX, PDF, PPTX, XLSX and XLS only.")

-
-
-
-    extension = Path(file_path if file_path else file_name).suffix
-    loader_kwargs = get_loader_kwargs(loaders_map.get(extension), file_name, file_content, is_capture_image, page_number, sheet_name, llm, file_path, excel_by_sheets)
-    if file_content:
-        return load_content_from_bytes(file_content=file_content,
-                                       extension=extension,
-                                       loader_extra_config=loader_kwargs,
-                                       llm=llm)
+    try:
+        if hasattr(loader, 'get_content'):
+            return loader.get_content()
         else:
-        return load_content(file_path=file_path,
-                            extension=extension,
-                            loader_extra_config=loader_kwargs,
-                            llm=llm)
+            extension = Path(file_path if file_path else file_name).suffix
+            loader_kwargs = get_loader_kwargs(loaders_map.get(extension), file_name, file_content, is_capture_image, page_number, sheet_name, llm, file_path, excel_by_sheets)
+            if file_content:
+                return load_content_from_bytes(file_content=file_content,
+                                               extension=extension,
+                                               loader_extra_config=loader_kwargs,
+                                               llm=llm)
+            else:
+                return load_content(file_path=file_path,
+                                    extension=extension,
+                                    loader_extra_config=loader_kwargs,
+                                    llm=llm)
+    except Exception as e:
+        return ToolException(f"Error reading file ({file_name or file_path}) content. Make sure these types are supported: {str(e)}")

 def load_file_docs(file_name=None, file_content=None, is_capture_image: bool = False, page_number: int = None,
                    sheet_name: str = None, llm=None, file_path: str = None, excel_by_sheets: bool = False) -> List[Document] | ToolException:
@@ -153,7 +156,7 @@ def prepare_loader(file_name=None, file_content=None, is_capture_image: bool = F

     loader_object = loaders_map.get(extension)
     if not loader_object:
-
+        loader_object = loaders_map.get('.txt')  # Default to text loader if no specific loader found
     loader_kwargs = get_loader_kwargs(loader_object, file_name, file_content, is_capture_image, page_number, sheet_name, llm, file_path, excel_by_sheets, prompt)
     loader = loader_object['class'](**loader_kwargs)
     return loader
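With this change, `prepare_loader` no longer fails on unknown extensions; it falls back to the `.txt` loader entry. The effective lookup is roughly this sketch:

```python
# Sketch of the new fallback: an unknown extension resolves to the
# plain-text loader entry instead of leaving loader_object unset.
loader_object = loaders_map.get(extension) or loaders_map.get('.txt')
```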
@@ -222,10 +225,18 @@ def process_document_by_type(content, extension_source: str, document: Document
             metadata={**document.metadata, 'chunk_id': 1}
         )
         return
+    #
+    chunks_counter = 0
     for chunk in chunks:
+        chunks_counter += 1
+        metadata = {**document.metadata, **chunk.metadata}
+        #
+        # ensure each chunk has a unique chunk_id
+        metadata['chunk_id'] = chunks_counter
+        #
         yield Document(
             page_content=sanitize_for_postgres(chunk.page_content),
-            metadata=
+            metadata=metadata
         )

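The chunking change above numbers every yielded chunk with a 1-based `chunk_id` after merging the parent document's metadata with the chunk's own. A standalone sketch of the same logic (illustrative data, not from the SDK):

```python
# Minimal sketch of the per-chunk numbering introduced above: merge parent
# and chunk metadata, then overwrite chunk_id with a 1-based counter.
def number_chunks(parent_meta, chunk_metas):
    for counter, chunk_meta in enumerate(chunk_metas, start=1):
        merged = {**parent_meta, **chunk_meta}
        merged['chunk_id'] = counter  # unique per chunk
        yield merged

list(number_chunks({'source': 'report.docx'}, [{}, {}]))
# [{'source': 'report.docx', 'chunk_id': 1}, {'source': 'report.docx', 'chunk_id': 2}]
```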
alita_sdk/tools/vector_adapters/VectorStoreAdapter.py

@@ -26,12 +26,12 @@ class VectorStoreAdapter(ABC):
         pass

     @abstractmethod
-    def get_indexed_ids(self, vectorstore_wrapper,
+    def get_indexed_ids(self, vectorstore_wrapper, index_name: Optional[str] = '') -> List[str]:
         """Get all indexed document IDs from vectorstore"""
         pass

     @abstractmethod
-    def clean_collection(self, vectorstore_wrapper,
+    def clean_collection(self, vectorstore_wrapper, index_name: str = ''):
         """Clean the vectorstore collection by deleting all indexed data."""
         pass

@@ -41,7 +41,7 @@ class VectorStoreAdapter(ABC):
         pass

     @abstractmethod
-    def get_code_indexed_data(self, vectorstore_wrapper,
+    def get_code_indexed_data(self, vectorstore_wrapper, index_name) -> Dict[str, Dict[str, Any]]:
         """Get all indexed data from vectorstore for code content"""
         pass

@@ -51,7 +51,7 @@ class VectorStoreAdapter(ABC):
         pass

     @abstractmethod
-    def get_index_meta(self, vectorstore_wrapper,
+    def get_index_meta(self, vectorstore_wrapper, index_name: str) -> List[Dict[str, Any]]:
         """Get all index_meta entries from the vector store."""
         pass

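The abstract interface now threads an `index_name` through `get_indexed_ids`, `clean_collection`, `get_code_indexed_data`, and `get_index_meta`, so any third-party adapter has to adopt the new signatures. A sketch of a conforming subclass (only the method names and parameters come from the diff; the bodies are placeholders):

```python
from typing import Any, Dict, List, Optional

class MyAdapter(VectorStoreAdapter):
    # Placeholder implementations illustrating the updated signatures only;
    # a real adapter must also implement the remaining abstract methods.
    def get_indexed_ids(self, vectorstore_wrapper, index_name: Optional[str] = '') -> List[str]:
        return []  # would filter stored ids by index_name when provided

    def clean_collection(self, vectorstore_wrapper, index_name: str = ''):
        pass  # would delete only rows whose metadata 'collection' equals index_name
```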
@@ -106,20 +106,25 @@ class PGVectorAdapter(VectorStoreAdapter):
             session.commit()
         logger.info(f"Schema '{schema_name}' has been dropped.")

-    def get_indexed_ids(self, vectorstore_wrapper,
+    def get_indexed_ids(self, vectorstore_wrapper, index_name: Optional[str] = '') -> List[str]:
         """Get all indexed document IDs from PGVector"""
         from sqlalchemy.orm import Session
-        from sqlalchemy import func
+        from sqlalchemy import func, or_

         store = vectorstore_wrapper.vectorstore
         try:
             with Session(store.session_maker.bind) as session:
                 # Start building the query
                 query = session.query(store.EmbeddingStore.id)
-                # Apply filter only if
-                if
+                # Apply filter only if index_name is provided
+                if index_name:
                     query = query.filter(
-                        func.jsonb_extract_path_text(store.EmbeddingStore.cmetadata, 'collection') ==
+                        func.jsonb_extract_path_text(store.EmbeddingStore.cmetadata, 'collection') == index_name,
+                        or_(
+                            func.jsonb_extract_path_text(store.EmbeddingStore.cmetadata, 'type').is_(None),
+                            func.jsonb_extract_path_text(store.EmbeddingStore.cmetadata,
+                                                         'type') != IndexerKeywords.INDEX_META_TYPE.value
+                        )
                     )
                 ids = query.all()
                 return [str(id_tuple[0]) for id_tuple in ids]
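The new filter keeps only embeddings whose metadata `collection` equals `index_name` while excluding `index_meta` service records. The `or_()` branch is needed because in SQL `NULL != value` is not true, so rows without a `type` key would otherwise be dropped. In plain Python the predicate is roughly this sketch, with `INDEX_META_TYPE` standing for `IndexerKeywords.INDEX_META_TYPE.value`:

```python
# Plain-Python equivalent of the SQLAlchemy filter above (sketch).
def row_matches(cmetadata: dict, index_name: str, INDEX_META_TYPE: str) -> bool:
    # In Python, a missing 'type' (None) already passes the != check;
    # SQL needs the explicit IS NULL branch because NULL != x is not TRUE.
    return (
        cmetadata.get('collection') == index_name
        and cmetadata.get('type') != INDEX_META_TYPE
    )
```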
@@ -127,25 +132,33 @@ class PGVectorAdapter(VectorStoreAdapter):
             logger.error(f"Failed to get indexed IDs from PGVector: {str(e)}")
             return []

-    def clean_collection(self, vectorstore_wrapper,
+    def clean_collection(self, vectorstore_wrapper, index_name: str = ''):
         """Clean the vectorstore collection by deleting all indexed data."""
         # This logic deletes all data from the vectorstore collection without removal of collection.
         # Collection itself remains available for future indexing.
-
+        from sqlalchemy.orm import Session
+        from sqlalchemy import func
+
+        store = vectorstore_wrapper.vectorstore
+        with Session(store.session_maker.bind) as session:
+            session.query(store.EmbeddingStore).filter(
+                func.jsonb_extract_path_text(store.EmbeddingStore.cmetadata, 'collection') == index_name
+            ).delete(synchronize_session=False)
+            session.commit()

     def is_vectorstore_type(self, vectorstore) -> bool:
         """Check if the vectorstore is a PGVector store."""
         return hasattr(vectorstore, 'session_maker') and hasattr(vectorstore, 'EmbeddingStore')

-    def get_indexed_data(self, vectorstore_wrapper,
-        """Get all indexed data from PGVector for non-code content per
+    def get_indexed_data(self, vectorstore_wrapper, index_name: str)-> Dict[str, Dict[str, Any]]:
+        """Get all indexed data from PGVector for non-code content per index_name."""
         from sqlalchemy.orm import Session
         from sqlalchemy import func
         from ...runtime.utils.utils import IndexerKeywords

         result = {}
         try:
-            vectorstore_wrapper.
+            vectorstore_wrapper._log_tool_event("Retrieving already indexed data from PGVector vectorstore",
                                                 tool_name="get_indexed_data")
             store = vectorstore_wrapper.vectorstore
             with Session(store.session_maker.bind) as session:
@@ -154,7 +167,7 @@ class PGVectorAdapter(VectorStoreAdapter):
                     store.EmbeddingStore.document,
                     store.EmbeddingStore.cmetadata
                 ).filter(
-                    func.jsonb_extract_path_text(store.EmbeddingStore.cmetadata, 'collection') ==
+                    func.jsonb_extract_path_text(store.EmbeddingStore.cmetadata, 'collection') == index_name
                 ).all()

                 # Process the retrieved data
@@ -187,14 +200,14 @@ class PGVectorAdapter(VectorStoreAdapter):

         return result

-    def get_code_indexed_data(self, vectorstore_wrapper,
+    def get_code_indexed_data(self, vectorstore_wrapper, index_name: str) -> Dict[str, Dict[str, Any]]:
         """Get all indexed code data from PGVector per collection suffix."""
         from sqlalchemy.orm import Session
         from sqlalchemy import func

         result = {}
         try:
-            vectorstore_wrapper.
+            vectorstore_wrapper._log_tool_event(message="Retrieving already indexed code data from PGVector vectorstore",
                                                 tool_name="index_code_data")
             store = vectorstore_wrapper.vectorstore
             with (Session(store.session_maker.bind) as session):
@@ -202,7 +215,7 @@ class PGVectorAdapter(VectorStoreAdapter):
                     store.EmbeddingStore.id,
                     store.EmbeddingStore.cmetadata
                 ).filter(
-                    func.jsonb_extract_path_text(store.EmbeddingStore.cmetadata, 'collection') ==
+                    func.jsonb_extract_path_text(store.EmbeddingStore.cmetadata, 'collection') == index_name
                 ).all()

                 for db_id, meta in docs:
@@ -272,7 +285,7 @@ class PGVectorAdapter(VectorStoreAdapter):
         except Exception as e:
             logger.error(f"Failed to update collection for entry ID {entry_id}: {str(e)}")

-    def get_index_meta(self, vectorstore_wrapper,
+    def get_index_meta(self, vectorstore_wrapper, index_name: str) -> List[Dict[str, Any]]:
         from sqlalchemy.orm import Session
         from sqlalchemy import func

@@ -285,7 +298,7 @@ class PGVectorAdapter(VectorStoreAdapter):
                     store.EmbeddingStore.cmetadata
                 ).filter(
                     store.EmbeddingStore.cmetadata['type'].astext == IndexerKeywords.INDEX_META_TYPE.value,
-                    func.jsonb_extract_path_text(store.EmbeddingStore.cmetadata, 'collection') ==
+                    func.jsonb_extract_path_text(store.EmbeddingStore.cmetadata, 'collection') == index_name
                 ).all()
                 result = []
                 for id, document, cmetadata in meta:
@@ -312,7 +325,7 @@ class ChromaAdapter(VectorStoreAdapter):
     def remove_collection(self, vectorstore_wrapper, collection_name: str):
         vectorstore_wrapper.vectorstore.delete_collection()

-    def get_indexed_ids(self, vectorstore_wrapper,
+    def get_indexed_ids(self, vectorstore_wrapper, index_name: Optional[str] = '') -> List[str]:
         """Get all indexed document IDs from Chroma"""
         try:
             data = vectorstore_wrapper.vectorstore.get(include=[])  # Only get IDs, no metadata
@@ -321,9 +334,9 @@ class ChromaAdapter(VectorStoreAdapter):
             logger.error(f"Failed to get indexed IDs from Chroma: {str(e)}")
             return []

-    def clean_collection(self, vectorstore_wrapper,
+    def clean_collection(self, vectorstore_wrapper, index_name: str = ''):
         """Clean the vectorstore collection by deleting all indexed data."""
-        vectorstore_wrapper.vectorstore.delete(ids=self.get_indexed_ids(vectorstore_wrapper,
+        vectorstore_wrapper.vectorstore.delete(ids=self.get_indexed_ids(vectorstore_wrapper, index_name))

     def get_indexed_data(self, vectorstore_wrapper):
         """Get all indexed data from Chroma for non-code content"""
@@ -361,7 +374,7 @@ class ChromaAdapter(VectorStoreAdapter):

         return result

-    def get_code_indexed_data(self, vectorstore_wrapper,
+    def get_code_indexed_data(self, vectorstore_wrapper, index_name) -> Dict[str, Dict[str, Any]]:
         """Get all indexed code data from Chroma."""
         result = {}
         try:
@@ -391,7 +404,7 @@ class ChromaAdapter(VectorStoreAdapter):
         # This is a simplified implementation - in practice, you might need more complex logic
         logger.warning("add_to_collection for Chroma is not fully implemented yet")

-    def get_index_meta(self, vectorstore_wrapper,
+    def get_index_meta(self, vectorstore_wrapper, index_name: str) -> List[Dict[str, Any]]:
         logger.warning("get_index_meta for Chroma is not implemented yet")

alita_sdk-0.3.423.dist-info/METADATA

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: alita_sdk
-Version: 0.3.374
+Version: 0.3.423
 Summary: SDK for building langchain agents using resources from Alita
 Author-email: Artem Rozumenko <artyom.rozumenko@gmail.com>, Mikalai Biazruchka <mikalai_biazruchka@epam.com>, Roman Mitusov <roman_mitusov@epam.com>, Ivan Krakhmaliuk <lifedj27@gmail.com>, Artem Dubrovskiy <ad13box@gmail.com>
 License-Expression: Apache-2.0