veadk-python 0.2.7__py3-none-any.whl → 0.2.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of veadk-python might be problematic.
- veadk/agent.py +3 -2
- veadk/auth/veauth/opensearch_veauth.py +75 -0
- veadk/auth/veauth/postgresql_veauth.py +75 -0
- veadk/cli/cli.py +3 -1
- veadk/cli/cli_eval.py +160 -0
- veadk/cli/cli_prompt.py +9 -2
- veadk/cli/cli_web.py +6 -1
- veadk/configs/database_configs.py +43 -0
- veadk/configs/model_configs.py +32 -0
- veadk/consts.py +11 -4
- veadk/evaluation/adk_evaluator/adk_evaluator.py +5 -2
- veadk/evaluation/base_evaluator.py +95 -68
- veadk/evaluation/deepeval_evaluator/deepeval_evaluator.py +23 -15
- veadk/evaluation/eval_set_recorder.py +2 -2
- veadk/integrations/ve_prompt_pilot/ve_prompt_pilot.py +9 -3
- veadk/integrations/ve_tls/utils.py +1 -2
- veadk/integrations/ve_tls/ve_tls.py +9 -5
- veadk/integrations/ve_tos/ve_tos.py +542 -68
- veadk/knowledgebase/backends/base_backend.py +59 -0
- veadk/knowledgebase/backends/in_memory_backend.py +82 -0
- veadk/knowledgebase/backends/opensearch_backend.py +136 -0
- veadk/knowledgebase/backends/redis_backend.py +144 -0
- veadk/knowledgebase/backends/utils.py +91 -0
- veadk/knowledgebase/backends/vikingdb_knowledge_backend.py +524 -0
- veadk/{database/__init__.py → knowledgebase/entry.py} +10 -2
- veadk/knowledgebase/knowledgebase.py +120 -139
- veadk/memory/__init__.py +22 -0
- veadk/memory/long_term_memory.py +124 -41
- veadk/{database/base_database.py → memory/long_term_memory_backends/base_backend.py} +10 -22
- veadk/memory/long_term_memory_backends/in_memory_backend.py +65 -0
- veadk/memory/long_term_memory_backends/mem0_backend.py +129 -0
- veadk/memory/long_term_memory_backends/opensearch_backend.py +120 -0
- veadk/memory/long_term_memory_backends/redis_backend.py +127 -0
- veadk/memory/long_term_memory_backends/vikingdb_memory_backend.py +148 -0
- veadk/memory/short_term_memory.py +80 -72
- veadk/memory/short_term_memory_backends/base_backend.py +31 -0
- veadk/memory/short_term_memory_backends/mysql_backend.py +41 -0
- veadk/memory/short_term_memory_backends/postgresql_backend.py +41 -0
- veadk/memory/short_term_memory_backends/sqlite_backend.py +48 -0
- veadk/runner.py +12 -19
- veadk/tools/builtin_tools/generate_image.py +355 -0
- veadk/tools/builtin_tools/image_edit.py +56 -16
- veadk/tools/builtin_tools/image_generate.py +51 -15
- veadk/tools/builtin_tools/video_generate.py +41 -41
- veadk/tools/builtin_tools/web_scraper.py +1 -1
- veadk/tools/builtin_tools/web_search.py +7 -7
- veadk/tools/load_knowledgebase_tool.py +2 -8
- veadk/tracing/telemetry/attributes/extractors/llm_attributes_extractors.py +21 -3
- veadk/tracing/telemetry/exporters/apmplus_exporter.py +24 -6
- veadk/tracing/telemetry/exporters/cozeloop_exporter.py +2 -0
- veadk/tracing/telemetry/exporters/inmemory_exporter.py +22 -8
- veadk/tracing/telemetry/exporters/tls_exporter.py +2 -0
- veadk/tracing/telemetry/opentelemetry_tracer.py +13 -10
- veadk/tracing/telemetry/telemetry.py +66 -63
- veadk/utils/misc.py +15 -0
- veadk/version.py +1 -1
- {veadk_python-0.2.7.dist-info → veadk_python-0.2.9.dist-info}/METADATA +28 -5
- {veadk_python-0.2.7.dist-info → veadk_python-0.2.9.dist-info}/RECORD +65 -56
- veadk/database/database_adapter.py +0 -533
- veadk/database/database_factory.py +0 -80
- veadk/database/kv/redis_database.py +0 -159
- veadk/database/local_database.py +0 -62
- veadk/database/relational/mysql_database.py +0 -173
- veadk/database/vector/opensearch_vector_database.py +0 -263
- veadk/database/vector/type.py +0 -50
- veadk/database/viking/__init__.py +0 -13
- veadk/database/viking/viking_database.py +0 -638
- veadk/database/viking/viking_memory_db.py +0 -525
- /veadk/{database/kv → knowledgebase/backends}/__init__.py +0 -0
- /veadk/{database/relational → memory/long_term_memory_backends}/__init__.py +0 -0
- /veadk/{database/vector → memory/short_term_memory_backends}/__init__.py +0 -0
- {veadk_python-0.2.7.dist-info → veadk_python-0.2.9.dist-info}/WHEEL +0 -0
- {veadk_python-0.2.7.dist-info → veadk_python-0.2.9.dist-info}/entry_points.txt +0 -0
- {veadk_python-0.2.7.dist-info → veadk_python-0.2.9.dist-info}/licenses/LICENSE +0 -0
- {veadk_python-0.2.7.dist-info → veadk_python-0.2.9.dist-info}/top_level.txt +0 -0
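The deletions and renames above show the monolithic veadk/database package being split into feature-scoped backends (knowledgebase/backends, memory/long_term_memory_backends, memory/short_term_memory_backends). A hedged sketch of the resulting 0.2.9 import surface, using only module paths taken from the file list; nothing beyond the module names is verified here:

# Hypothetical import layout inferred from the file list above (module paths
# come from the diff; any symbols inside these modules are not verified).
from veadk.knowledgebase.backends import opensearch_backend, redis_backend
from veadk.memory.long_term_memory_backends import mem0_backend, vikingdb_memory_backend
from veadk.memory.short_term_memory_backends import sqlite_backend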
veadk/database/vector/type.py
DELETED
@@ -1,50 +0,0 @@
-# Copyright (c) 2025 Beijing Volcano Engine Technology Co., Ltd. and/or its affiliates.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import requests
-
-from veadk.config import getenv
-
-
-class Embeddings:
-    def __init__(
-        self,
-        model: str = getenv("MODEL_EMBEDDING_NAME"),
-        api_base: str = getenv("MODEL_EMBEDDING_API_BASE"),
-        api_key: str = getenv("MODEL_EMBEDDING_API_KEY"),
-        dim: int = int(getenv("MODEL_EMBEDDING_DIM")),
-    ):
-        self.model = model
-        self.url = api_base
-        self.api_key = api_key
-        self.dim = dim
-
-        self.headers = {
-            "Content-Type": "application/json",
-            "Authorization": f"Bearer {self.api_key}",
-        }
-
-    def embed_documents(self, texts: list[str]) -> list[list[float]]:
-        MAX_CHARS = 4000
-        data = {"model": self.model, "input": [text[:MAX_CHARS] for text in texts]}
-        response = requests.post(self.url, headers=self.headers, json=data)
-        response.raise_for_status()
-        result = response.json()
-        return [item["embedding"] for item in result["data"]]
-
-    def embed_query(self, text: str) -> list[float]:
-        return self.embed_documents([text])[0]
-
-    def get_embedding_dim(self) -> int:
-        return self.dim
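For context, the deleted Embeddings helper above wrapped an OpenAI-compatible embeddings endpoint via requests. A minimal usage sketch, with placeholder endpoint, key, model name, and dimension (in 0.2.7 the defaults came from the MODEL_EMBEDDING_* environment variables):

# Hypothetical usage of the removed Embeddings class; all values below are
# placeholders, not real service endpoints or credentials.
emb = Embeddings(
    model="my-embedding-model",
    api_base="https://example.com/v1/embeddings",
    api_key="YOUR_API_KEY",
    dim=1024,
)
vectors = emb.embed_documents(["first text", "second text"])  # list[list[float]]
query_vector = emb.embed_query("first text")                  # list[float]
dim = emb.get_embedding_dim()  # returns the configured dim, not a measured one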
veadk/database/viking/__init__.py
DELETED
@@ -1,13 +0,0 @@
-# Copyright (c) 2025 Beijing Volcano Engine Technology Co., Ltd. and/or its affiliates.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
veadk/database/viking/viking_database.py
DELETED
@@ -1,638 +0,0 @@
-# Copyright (c) 2025 Beijing Volcano Engine Technology Co., Ltd. and/or its affiliates.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import io
-import json
-import os
-import uuid
-from typing import Any, BinaryIO, Literal, TextIO
-
-import requests
-import tos
-from pydantic import BaseModel, Field
-from volcengine.auth.SignerV4 import SignerV4
-from volcengine.base.Request import Request
-from volcengine.Credentials import Credentials
-
-from veadk.config import getenv
-from veadk.database.base_database import BaseDatabase
-from veadk.utils.logger import get_logger
-
-logger = get_logger(__name__)
-
-# knowledge base domain
-g_knowledge_base_domain = "api-knowledgebase.mlp.cn-beijing.volces.com"
-# paths
-create_collection_path = "/api/knowledge/collection/create"
-search_knowledge_path = "/api/knowledge/collection/search_knowledge"
-list_collections_path = "/api/knowledge/collection/list"
-get_collections_path = "/api/knowledge/collection/info"
-doc_del_path = "/api/knowledge/collection/delete"
-doc_add_path = "/api/knowledge/doc/add"
-doc_info_path = "/api/knowledge/doc/info"
-list_point_path = "/api/knowledge/point/list"
-list_docs_path = "/api/knowledge/doc/list"
-delete_docs_path = "/api/knowledge/doc/delete"
-
-
-class VolcengineTOSConfig(BaseModel):
-    endpoint: str = Field(
-        default_factory=lambda: getenv(
-            "DATABASE_TOS_ENDPOINT", "tos-cn-beijing.volces.com"
-        ),
-        description="VikingDB TOS endpoint",
-    )
-    region: str = Field(
-        default_factory=lambda: getenv("DATABASE_TOS_REGION", "cn-beijing"),
-        description="VikingDB TOS region",
-    )
-    bucket: str = Field(
-        default_factory=lambda: getenv("DATABASE_TOS_BUCKET"),
-        description="VikingDB TOS bucket",
-    )
-    base_key: str = Field(
-        default="veadk",
-        description="VikingDB TOS base key",
-    )
-
-
-class VikingDatabaseConfig(BaseModel):
-    volcengine_ak: str = Field(
-        default_factory=lambda: getenv("VOLCENGINE_ACCESS_KEY"),
-        description="VikingDB access key",
-    )
-    volcengine_sk: str = Field(
-        default_factory=lambda: getenv("VOLCENGINE_SECRET_KEY"),
-        description="VikingDB secret key",
-    )
-    project: str = Field(
-        default_factory=lambda: getenv("DATABASE_VIKING_PROJECT"),
-        description="VikingDB project name",
-    )
-    region: str = Field(
-        default_factory=lambda: getenv("DATABASE_VIKING_REGION"),
-        description="VikingDB region",
-    )
-    tos: VolcengineTOSConfig = Field(
-        default_factory=VolcengineTOSConfig,
-        description="VikingDB TOS configuration",
-    )
-
-
-def prepare_request(
-    method, path, config: VikingDatabaseConfig, params=None, data=None, doseq=0
-):
-    ak = config.volcengine_ak
-    sk = config.volcengine_sk
-
-    if params:
-        for key in params:
-            if (
-                type(params[key]) is int
-                or type(params[key]) is float
-                or type(params[key]) is bool
-            ):
-                params[key] = str(params[key])
-            elif type(params[key]) is list:
-                if not doseq:
-                    params[key] = ",".join(params[key])
-    r = Request()
-    r.set_shema("https")
-    r.set_method(method)
-    r.set_connection_timeout(10)
-    r.set_socket_timeout(10)
-    mheaders = {
-        "Accept": "application/json",
-        "Content-Type": "application/json",
-    }
-    r.set_headers(mheaders)
-    if params:
-        r.set_query(params)
-    r.set_path(path)
-    if data is not None:
-        r.set_body(json.dumps(data))
-    credentials = Credentials(ak, sk, "air", config.region)
-    SignerV4.sign(r, credentials)
-    return r
-
-
-class VikingDatabase(BaseModel, BaseDatabase):
-    config: VikingDatabaseConfig = Field(
-        default_factory=VikingDatabaseConfig,
-        description="VikingDB configuration",
-    )
-
-    def _upload_to_tos(
-        self,
-        data: str | list[str] | TextIO | BinaryIO | bytes,
-        **kwargs: Any,
-    ) -> tuple[int, str]:
-        """
-        Upload data to TOS (Tinder Object Storage).
-
-        Args:
-            data: The data to be uploaded. Can be one of the following types:
-                - str: File path or string data
-                - list[str]: List of strings
-                - TextIO: File object (text)
-                - BinaryIO: File object (binary)
-                - bytes: Binary data
-            **kwargs: Additional keyword arguments.
-                - file_name (str): The file name (including suffix).
-
-        Returns:
-            tuple: A tuple containing the status code and TOS URL.
-                - status_code (int): HTTP status code
-                - tos_url (str): The URL of the uploaded file in TOS
-        """
-        ak = self.config.volcengine_ak
-        sk = self.config.volcengine_sk
-
-        tos_bucket = self.config.tos.bucket
-        tos_endpoint = self.config.tos.endpoint
-        tos_region = self.config.tos.region
-        tos_key = self.config.tos.base_key
-
-        client = tos.TosClientV2(ak, sk, tos_endpoint, tos_region, max_connections=1024)
-
-        # Extract file_name from kwargs - this is now required and includes the extension
-        file_names = kwargs.get("file_name")
-
-        if isinstance(data, str) and os.path.isfile(data):  # Process file path
-            # Use provided file_name which includes the extension
-            new_key = f"{tos_key}/{file_names}"
-            with open(data, "rb") as f:
-                upload_data = f.read()
-
-        elif (
-            isinstance(data, list)
-            and all(isinstance(item, str) for item in data)
-            and all(os.path.isfile(item) for item in data)
-        ):
-            # Process list of file paths - this should be handled at a higher level
-            raise ValueError(
-                "Uploading multiple files through a list of file paths is not supported in _upload_to_tos directly. Please call this function for each file individually."
-            )
-
-        elif isinstance(
-            data,
-            (io.TextIOWrapper, io.BufferedReader),  # file type: TextIO | BinaryIO
-        ):  # Process file stream
-            # Use provided file_name which includes the extension
-            new_key = f"{tos_key}/{file_names}"
-            if isinstance(data, TextIO):
-                # Encode the text stream content into bytes
-                upload_data = data.read().encode("utf-8")
-            else:
-                # Read the content of the binary stream
-                upload_data = data.read()
-
-        elif isinstance(data, str):  # Process ordinary strings
-            # Use provided file_name which includes the extension
-            new_key = f"{tos_key}/{file_names}"
-            upload_data = data.encode("utf-8")  # Encode as byte type
-
-        elif isinstance(data, list):  # Process list of strings
-            # Use provided file_name which includes the extension
-            new_key = f"{tos_key}/{file_names}"
-            # Join the strings in the list with newlines and encode as byte type
-            upload_data = "\n".join(data).encode("utf-8")
-
-        elif isinstance(data, bytes):  # Process bytes data
-            # Use provided file_name which includes the extension
-            new_key = f"{tos_key}/{file_names}"
-            upload_data = data
-
-        else:
-            raise ValueError(f"Unsupported data type: {type(data)}")
-
-        resp = client.put_object(tos_bucket, new_key, content=upload_data)
-        tos_url = f"{tos_bucket}/{new_key}"
-
-        return resp.resp.status, tos_url
-
-    def _add_doc(self, collection_name: str, tos_url: str, doc_id: str, **kwargs: Any):
-        request_params = {
-            "collection_name": collection_name,
-            "project": self.config.project,
-            "add_type": "tos",
-            "doc_id": doc_id,
-            "tos_path": tos_url,
-        }
-
-        doc_add_req = prepare_request(
-            method="POST", path=doc_add_path, config=self.config, data=request_params
-        )
-        rsp = requests.request(
-            method=doc_add_req.method,
-            url="https://{}{}".format(g_knowledge_base_domain, doc_add_req.path),
-            headers=doc_add_req.headers,
-            data=doc_add_req.body,
-        )
-
-        result = rsp.json()
-        if result["code"] != 0:
-            logger.error(f"Error in add_doc: {result['message']}")
-            return {"error": result["message"]}
-
-        doc_add_data = result["data"]
-        if not doc_add_data:
-            raise ValueError(f"doc {doc_id} has no data.")
-
-        return doc_id
-
-    def add(
-        self,
-        data: str | list[str] | TextIO | BinaryIO | bytes,
-        collection_name: str,
-        **kwargs,
-    ):
-        """
-        Add documents to the Viking database.
-        Args:
-            data: The data to be added. Can be one of the following types:
-                - str: File path or string data
-                - list[str]: List of file paths or list of strings
-                - TextIO: File object (text)
-                - BinaryIO: File object (binary)
-                - bytes: Binary data
-            collection_name: The name of the collection to add documents to.
-            **kwargs: Additional keyword arguments.
-                - file_name (str | list[str]): The file name or a list of file names (including suffix).
-                - doc_id (str): The document ID. If not provided, a UUID will be generated.
-        Returns:
-            dict or list: A dictionary containing the TOS URL and document ID, or a list of such dictionaries for multiple file uploads.
-            Format: {
-                "tos_url": "tos://<bucket>/<key>",
-                "doc_id": "<doc_id>",
-            }
-        """
-        # Handle list of file paths (multiple file upload)
-        if (
-            isinstance(data, list)
-            and all(isinstance(item, str) for item in data)
-            and all(os.path.isfile(item) for item in data)
-        ):
-            # Handle multiple file upload
-            file_names = kwargs.get("file_name")
-            if (
-                not file_names
-                or not isinstance(file_names, list)
-                or len(file_names) != len(data)
-            ):
-                raise ValueError(
-                    "For multiple file upload, file_name must be provided as a list with the same length as data"
-                )
-
-            results = []
-            for i, file_path in enumerate(data):
-                # Create kwargs for this specific file
-                single_kwargs = kwargs.copy()
-                single_kwargs["file_name"] = file_names[i]
-
-                # Generate or use provided doc_id for this file
-                doc_id = single_kwargs.get("doc_id")
-                if not doc_id:
-                    doc_id = str(uuid.uuid4())
-                single_kwargs["doc_id"] = doc_id
-
-                status, tos_url = self._upload_to_tos(data=file_path, **single_kwargs)
-                if status != 200:
-                    raise ValueError(
-                        f"Error in upload_to_tos for file {file_path}: {status}"
-                    )
-
-                doc_id = self._add_doc(
-                    collection_name=collection_name,
-                    tos_url=tos_url,
-                    doc_id=doc_id,
-                )
-
-                results.append(
-                    {
-                        "tos_url": f"tos://{tos_url}",
-                        "doc_id": doc_id,
-                    }
-                )
-
-            return results
-
-        # Handle list of strings (multiple string upload)
-        elif isinstance(data, list) and all(isinstance(item, str) for item in data):
-            # Handle multiple string upload
-            file_names = kwargs.get("file_name")
-            if (
-                not file_names
-                or not isinstance(file_names, list)
-                or len(file_names) != len(data)
-            ):
-                raise ValueError(
-                    "For multiple string upload, file_name must be provided as a list with the same length as data"
-                )
-
-            results = []
-            for i, content in enumerate(data):
-                # Create kwargs for this specific string
-                single_kwargs = kwargs.copy()
-                single_kwargs["file_name"] = file_names[i]
-
-                # Generate or use provided doc_id for this string
-                doc_id = single_kwargs.get("doc_id")
-                if not doc_id:
-                    doc_id = str(uuid.uuid4())
-                single_kwargs["doc_id"] = doc_id
-
-                status, tos_url = self._upload_to_tos(data=content, **single_kwargs)
-                if status != 200:
-                    raise ValueError(f"Error in upload_to_tos for string {i}: {status}")
-
-                doc_id = self._add_doc(
-                    collection_name=collection_name,
-                    tos_url=tos_url,
-                    doc_id=doc_id,
-                )
-
-                results.append(
-                    {
-                        "tos_url": f"tos://{tos_url}",
-                        "doc_id": doc_id,
-                    }
-                )
-
-            return results
-
-        # Handle single file upload or other data types
-        else:
-            # Handle doc_id from kwargs or generate a new one
-            doc_id = kwargs.get("doc_id", str(uuid.uuid4()))
-
-            status, tos_url = self._upload_to_tos(data=data, **kwargs)
-            if status != 200:
-                raise ValueError(f"Error in upload_to_tos: {status}")
-            doc_id = self._add_doc(
-                collection_name=collection_name,
-                tos_url=tos_url,
-                doc_id=doc_id,
-            )
-            return {
-                "tos_url": f"tos://{tos_url}",
-                "doc_id": doc_id,
-            }
-
-    def delete(self, **kwargs: Any):
-        name = kwargs.get("name")
-        project = kwargs.get("project", self.config.project)
-        request_param = {"name": name, "project": project}
-        doc_del_req = prepare_request(
-            method="POST", path=doc_del_path, config=self.config, data=request_param
-        )
-        rsp = requests.request(
-            method=doc_del_req.method,
-            url="http://{}{}".format(g_knowledge_base_domain, doc_del_req.path),
-            headers=doc_del_req.headers,
-            data=doc_del_req.body,
-        )
-        result = rsp.json()
-        if result["code"] != 0:
-            logger.error(f"Error in add_doc: {result['message']}")
-            return False
-        return True
-
-    def query(self, query: str, **kwargs: Any) -> list[str]:
-        """
-        Args:
-            query: query text
-            **kwargs: collection_name(required), top_k(optional, default 5)
-
-        Returns: list of str, the search result
-        """
-        collection_name = kwargs.get("collection_name")
-        assert collection_name is not None, "collection_name is required"
-        request_params = {
-            "query": query,
-            "limit": int(kwargs.get("top_k", 5)),
-            "name": collection_name,
-            "project": self.config.project,
-        }
-        search_req = prepare_request(
-            method="POST",
-            path=search_knowledge_path,
-            config=self.config,
-            data=request_params,
-        )
-        resp = requests.request(
-            method=search_req.method,
-            url="https://{}{}".format(g_knowledge_base_domain, search_req.path),
-            headers=search_req.headers,
-            data=search_req.body,
-        )
-
-        result = resp.json()
-        if result["code"] != 0:
-            logger.error(f"Error in search_knowledge: {result['message']}")
-            raise ValueError(f"Error in search_knowledge: {result['message']}")
-
-        if not result["data"]["result_list"]:
-            raise ValueError(f"No results found for collection {collection_name}")
-
-        chunks = result["data"]["result_list"]
-
-        search_result = []
-
-        for chunk in chunks:
-            search_result.append(chunk["content"])
-
-        return search_result
-
-    def create_collection(
-        self,
-        collection_name: str,
-        description: str = "",
-        version: Literal[2, 4] = 4,
-        data_type: Literal[
-            "unstructured_data", "structured_data"
-        ] = "unstructured_data",
-        chunking_strategy: Literal["custom_balance", "custom"] = "custom_balance",
-        chunk_length: int = 500,
-        merge_small_chunks: bool = True,
-    ):
-        request_params = {
-            "name": collection_name,
-            "project": self.config.project,
-            "description": description,
-            "version": version,
-            "data_type": data_type,
-            "preprocessing": {
-                "chunking_strategy": chunking_strategy,
-                "chunk_length": chunk_length,
-                "merge_small_chunks": merge_small_chunks,
-            },
-        }
-
-        create_collection_req = prepare_request(
-            method="POST",
-            path=create_collection_path,
-            config=self.config,
-            data=request_params,
-        )
-        resp = requests.request(
-            method=create_collection_req.method,
-            url="https://{}{}".format(
-                g_knowledge_base_domain, create_collection_req.path
-            ),
-            headers=create_collection_req.headers,
-            data=create_collection_req.body,
-        )
-
-        result = resp.json()
-        if result["code"] != 0:
-            logger.error(f"Error in create_collection: {result['message']}")
-            raise ValueError(f"Error in create_collection: {result['message']}")
-        return result
-
-    def collection_exists(self, collection_name: str) -> bool:
-        request_params = {
-            "project": self.config.project,
-        }
-        list_collections_req = prepare_request(
-            method="POST",
-            path=list_collections_path,
-            config=self.config,
-            data=request_params,
-        )
-        resp = requests.request(
-            method=list_collections_req.method,
-            url="https://{}{}".format(
-                g_knowledge_base_domain, list_collections_req.path
-            ),
-            headers=list_collections_req.headers,
-            data=list_collections_req.body,
-        )
-
-        result = resp.json()
-        if result["code"] != 0:
-            logger.error(f"Error in list_collections: {result['message']}")
-            raise ValueError(f"Error in list_collections: {result['message']}")
-
-        collections = result["data"].get("collection_list", [])
-        if len(collections) == 0:
-            return False
-
-        collection_list = set()
-
-        for collection in collections:
-            collection_list.add(collection["collection_name"])
-        # check the collection exist or not
-        if collection_name in collection_list:
-            return True
-        else:
-            return False
-
-    def list_chunks(
-        self, collection_name: str, offset: int = 0, limit: int = -1
-    ) -> list[dict]:
-        request_params = {
-            "collection_name": collection_name,
-            "project": self.config.project,
-            "offset": offset,
-            "limit": limit,
-        }
-
-        list_doc_req = prepare_request(
-            method="POST",
-            path=list_point_path,
-            config=self.config,
-            data=request_params,
-        )
-        resp = requests.request(
-            method=list_doc_req.method,
-            url="https://{}{}".format(g_knowledge_base_domain, list_doc_req.path),
-            headers=list_doc_req.headers,
-            data=list_doc_req.body,
-        )
-
-        result = resp.json()
-        if result["code"] != 0:
-            logger.error(f"Error in list_docs: {result['message']}")
-            raise ValueError(f"Error in list_docs: {result['message']}")
-
-        if not result["data"].get("point_list", []):
-            return []
-
-        data = [
-            {
-                "id": res["point_id"],
-                "content": res["content"],
-                "metadata": res["doc_info"],
-            }
-            for res in result["data"]["point_list"]
-        ]
-        return data
-
-    def list_docs(
-        self, collection_name: str, offset: int = 0, limit: int = -1
-    ) -> list[dict]:
-        request_params = {
-            "collection_name": collection_name,
-            "project": self.config.project,
-            "offset": offset,
-            "limit": limit,
-        }
-
-        list_doc_req = prepare_request(
-            method="POST",
-            path=list_docs_path,
-            config=self.config,
-            data=request_params,
-        )
-        resp = requests.request(
-            method=list_doc_req.method,
-            url="https://{}{}".format(g_knowledge_base_domain, list_doc_req.path),
-            headers=list_doc_req.headers,
-            data=list_doc_req.body,
-        )
-
-        result = resp.json()
-        if result["code"] != 0:
-            logger.error(f"Error in list_docs: {result['message']}")
-            raise ValueError(f"Error in list_docs: {result['message']}")
-
-        if not result["data"].get("doc_list", []):
-            return []
-        return result["data"]["doc_list"]
-
-    def delete_by_id(self, collection_name: str, id: str) -> bool:
-        request_params = {
-            "collection_name": collection_name,
-            "project": self.config.project,
-            "doc_id": id,
-        }
-
-        delete_by_id_req = prepare_request(
-            method="POST",
-            path=delete_docs_path,
-            config=self.config,
-            data=request_params,
-        )
-        resp = requests.request(
-            method=delete_by_id_req.method,
-            url="https://{}{}".format(g_knowledge_base_domain, delete_by_id_req.path),
-            headers=delete_by_id_req.headers,
-            data=delete_by_id_req.body,
-        )
-
-        result = resp.json()
-        if result["code"] != 0:
-            return False
-        return True