waldiez-0.1.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of waldiez might be problematic.
- waldiez/__init__.py +15 -0
- waldiez/__main__.py +6 -0
- waldiez/_version.py +3 -0
- waldiez/cli.py +162 -0
- waldiez/exporter.py +293 -0
- waldiez/exporting/__init__.py +14 -0
- waldiez/exporting/agents/__init__.py +5 -0
- waldiez/exporting/agents/agent.py +229 -0
- waldiez/exporting/agents/agent_skills.py +67 -0
- waldiez/exporting/agents/code_execution.py +67 -0
- waldiez/exporting/agents/group_manager.py +209 -0
- waldiez/exporting/agents/llm_config.py +53 -0
- waldiez/exporting/agents/rag_user/__init__.py +5 -0
- waldiez/exporting/agents/rag_user/chroma_utils.py +134 -0
- waldiez/exporting/agents/rag_user/mongo_utils.py +83 -0
- waldiez/exporting/agents/rag_user/pgvector_utils.py +93 -0
- waldiez/exporting/agents/rag_user/qdrant_utils.py +112 -0
- waldiez/exporting/agents/rag_user/rag_user.py +165 -0
- waldiez/exporting/agents/rag_user/vector_db.py +119 -0
- waldiez/exporting/agents/teachability.py +37 -0
- waldiez/exporting/agents/termination_message.py +45 -0
- waldiez/exporting/chats/__init__.py +14 -0
- waldiez/exporting/chats/chats.py +46 -0
- waldiez/exporting/chats/helpers.py +395 -0
- waldiez/exporting/chats/nested.py +264 -0
- waldiez/exporting/flow/__init__.py +5 -0
- waldiez/exporting/flow/def_main.py +37 -0
- waldiez/exporting/flow/flow.py +185 -0
- waldiez/exporting/models/__init__.py +193 -0
- waldiez/exporting/skills/__init__.py +128 -0
- waldiez/exporting/utils/__init__.py +34 -0
- waldiez/exporting/utils/comments.py +136 -0
- waldiez/exporting/utils/importing.py +267 -0
- waldiez/exporting/utils/logging_utils.py +203 -0
- waldiez/exporting/utils/method_utils.py +35 -0
- waldiez/exporting/utils/naming.py +127 -0
- waldiez/exporting/utils/object_string.py +81 -0
- waldiez/io_stream.py +181 -0
- waldiez/models/__init__.py +107 -0
- waldiez/models/agents/__init__.py +65 -0
- waldiez/models/agents/agent/__init__.py +21 -0
- waldiez/models/agents/agent/agent.py +190 -0
- waldiez/models/agents/agent/agent_data.py +162 -0
- waldiez/models/agents/agent/code_execution.py +71 -0
- waldiez/models/agents/agent/linked_skill.py +30 -0
- waldiez/models/agents/agent/nested_chat.py +73 -0
- waldiez/models/agents/agent/teachability.py +68 -0
- waldiez/models/agents/agent/termination_message.py +167 -0
- waldiez/models/agents/agents.py +129 -0
- waldiez/models/agents/assistant/__init__.py +6 -0
- waldiez/models/agents/assistant/assistant.py +41 -0
- waldiez/models/agents/assistant/assistant_data.py +29 -0
- waldiez/models/agents/group_manager/__init__.py +19 -0
- waldiez/models/agents/group_manager/group_manager.py +87 -0
- waldiez/models/agents/group_manager/group_manager_data.py +91 -0
- waldiez/models/agents/group_manager/speakers.py +211 -0
- waldiez/models/agents/rag_user/__init__.py +26 -0
- waldiez/models/agents/rag_user/rag_user.py +58 -0
- waldiez/models/agents/rag_user/rag_user_data.py +32 -0
- waldiez/models/agents/rag_user/retrieve_config.py +592 -0
- waldiez/models/agents/rag_user/vector_db_config.py +162 -0
- waldiez/models/agents/user_proxy/__init__.py +6 -0
- waldiez/models/agents/user_proxy/user_proxy.py +41 -0
- waldiez/models/agents/user_proxy/user_proxy_data.py +30 -0
- waldiez/models/chat/__init__.py +22 -0
- waldiez/models/chat/chat.py +129 -0
- waldiez/models/chat/chat_data.py +326 -0
- waldiez/models/chat/chat_message.py +304 -0
- waldiez/models/chat/chat_nested.py +160 -0
- waldiez/models/chat/chat_summary.py +110 -0
- waldiez/models/common/__init__.py +38 -0
- waldiez/models/common/base.py +63 -0
- waldiez/models/common/method_utils.py +165 -0
- waldiez/models/flow/__init__.py +9 -0
- waldiez/models/flow/flow.py +302 -0
- waldiez/models/flow/flow_data.py +87 -0
- waldiez/models/model/__init__.py +11 -0
- waldiez/models/model/model.py +169 -0
- waldiez/models/model/model_data.py +86 -0
- waldiez/models/skill/__init__.py +9 -0
- waldiez/models/skill/skill.py +129 -0
- waldiez/models/skill/skill_data.py +37 -0
- waldiez/models/waldiez.py +301 -0
- waldiez/py.typed +0 -0
- waldiez/runner.py +304 -0
- waldiez/stream/__init__.py +7 -0
- waldiez/stream/consumer.py +139 -0
- waldiez/stream/provider.py +339 -0
- waldiez/stream/server.py +412 -0
- waldiez-0.1.0.dist-info/METADATA +181 -0
- waldiez-0.1.0.dist-info/RECORD +94 -0
- waldiez-0.1.0.dist-info/WHEEL +4 -0
- waldiez-0.1.0.dist-info/entry_points.txt +2 -0
- waldiez-0.1.0.dist-info/licenses/LICENSE +21 -0
waldiez/models/agents/rag_user/retrieve_config.py
@@ -0,0 +1,592 @@

"""RAG user agent retrieve config."""

from typing import Dict, List, Optional, Union

from pydantic import ConfigDict, Field, model_validator
from pydantic.alias_generators import to_camel
from typing_extensions import Annotated, Literal, Self

from ...common import WaldiezBase, WaldiezMethodName, check_function
from .vector_db_config import WaldiezRagUserVectorDbConfig

WaldiezRagUserTask = Literal["code", "qa", "default"]
WaldiezRagUserVectorDb = Literal["chroma", "pgvector", "mongodb", "qdrant"]
WaldiezRagUserChunkMode = Literal["multi_lines", "one_line"]
WaldiezRagUserModels: Dict[WaldiezRagUserVectorDb, str] = {
    "chroma": "all-MiniLM-L6-v2",
    "mongodb": "all-MiniLM-L6-v2",
    "pgvector": "all-MiniLM-L6-v2",
    "qdrant": "BAAI/bge-small-en-v1.5",
}


class WaldiezRagUserRetrieveConfig(WaldiezBase):
    """RAG user agent retrieve config.

    Attributes
    ----------
    task : Literal["code", "qa", "default"]
        The task of the retrieve chat.
        Possible values are 'code', 'qa' and 'default'.
        The system prompt will differ depending on the task.
        The default value is 'default', which supports both code and qa,
        and provides source information at the end of the response.
    vector_db : Literal["chroma", "pgvector", "mongodb", "qdrant"]
        The vector db for the retrieve chat.
    db_config : WaldiezRagUserVectorDbConfig
        The config for the selected vector db.
    docs_path : Optional[Union[str, List[str]]]
        The path to the docs directory. It can also be the path to a single
        file, the url to a single file or a list of directories, files and
        urls. Default is None, which works only if the collection is already
        created.
    new_docs : bool
        When True, only adds new documents to the collection; when False,
        updates existing documents and adds new ones. Default is True.
        The document id is used to determine whether a document is new or
        existing. By default, the id is the hash value of the content.
    model : Optional[str]
        The model to use for the retrieve chat. If not provided, the models
        linked to the agent are checked.
    chunk_token_size : Optional[int]
        The chunk token size for the retrieve chat. If not provided, a
        default size of max_tokens * 0.4 will be used.
    context_max_tokens : Optional[int]
        The context max token size for the retrieve chat. If not provided,
        a default size of max_tokens * 0.8 will be used.
    chunk_mode : Literal["multi_lines", "one_line"]
        The chunk mode for the retrieve chat. Possible values are
        'multi_lines' and 'one_line'. If not provided, the default mode
        'multi_lines' will be used.
    must_break_at_empty_line : bool
        If True, chunks will only break at empty lines. Default is True.
        If chunk_mode is 'one_line', this parameter is ignored.
    use_custom_embedding : bool
        Whether to use a custom embedding for the retrieve chat. Default is
        False. If True, the embedding_function should be provided.
    embedding_function : Optional[str]
        The embedding function for creating the vector db. Default is None;
        a SentenceTransformer with the given embedding model will be used.
        If you want to use OpenAI, Cohere, HuggingFace or other embedding
        functions, you can pass one here, following the examples in
        https://docs.trychroma.com/guides/embeddings.
    customized_prompt : Optional[str]
        The customized prompt for the retrieve chat. Default is None.
    customized_answer_prefix : Optional[str]
        The customized answer prefix for the retrieve chat. Default is ''.
        If not '' and the customized_answer_prefix is not in the answer,
        Update Context will be triggered.
    update_context : bool
        If False, Update Context will not be applied for interactive
        retrieval. Default is True.
    collection_name : str
        The name of the collection. If not provided, the default name
        'autogen-docs' will be used.
    get_or_create : bool
        Whether to get the collection if it exists. Default is False.
    overwrite : bool
        Whether to overwrite the collection if it exists. Default is False.
        Case 1: if the collection does not exist, create the collection.
        Case 2: the collection exists; if overwrite is True, it will
        overwrite the collection. Case 3: the collection exists and
        overwrite is False; if get_or_create is True, it will get the
        collection, otherwise it raises a ValueError.
    use_custom_token_count : bool
        Whether to use a custom token count function for the retrieve chat.
        Default is False. If True, the custom_token_count_function should
        be provided.
    custom_token_count_function : Optional[str]
        A custom function to count the number of tokens in a string. The
        function should take (text: str, model: str) as input and return
        the token count (int). The retrieve_config['model'] will be passed
        to the function. Default is autogen.token_count_utils.count_token,
        which uses tiktoken and may not be accurate for non-OpenAI models.
    use_custom_text_split : bool
        Whether to use a custom text split function for the retrieve chat.
        Default is False. If True, the custom_text_split_function should be
        provided.
    custom_text_split_function : Optional[str]
        A custom function to split a string into a list of strings. Default
        is None, which uses the default function
        autogen.retrieve_utils.split_text_to_chunks.
    custom_text_types : Optional[List[str]]
        A list of file types to be processed. Default is
        autogen.retrieve_utils.TEXT_FORMATS. This only applies to files
        under the directories in docs_path. Explicitly included files and
        urls will be chunked regardless of their types.
    recursive : bool
        Whether to search documents recursively in the docs_path. Default
        is True.
    distance_threshold : float
        The threshold for the distance score; only results with a distance
        smaller than this will be returned. Ignored if < 0. Default is -1.
    embedding_function_string : Optional[str]
        The embedding function string (if use_custom_embedding is True).
    token_count_function_string : Optional[str]
        The token count function string (if use_custom_token_count is True).
    text_split_function_string : Optional[str]
        The text split function string (if use_custom_text_split is True).
    n_results : Optional[int]
        The number of results to return. Default is None, which returns all
        results.

    Functions
    ---------
    validate_custom_embedding_function
        Validate the custom embedding function.
    validate_custom_token_count_function
        Validate the custom token count function.
    validate_custom_text_split_function
        Validate the custom text split function.
    validate_rag_user_data
        Validate the RAG user data.
    """

    model_config = ConfigDict(
        extra="forbid",
        alias_generator=to_camel,
        populate_by_name=True,
        frozen=False,
    )

    task: Annotated[
        WaldiezRagUserTask,
        Field(
            "default",
            title="Task",
            description=(
                "The task of the retrieve chat. "
                "Possible values are 'code', 'qa' and 'default'. "
                "The system prompt will differ depending on the task. "
                "The default value is 'default', which supports both code "
                "and qa, and provides source information at the end of "
                "the response."
            ),
        ),
    ]
    vector_db: Annotated[
        WaldiezRagUserVectorDb,
        Field(
            "chroma",
            title="Vector DB",
            description="The vector db for the retrieve chat.",
        ),
    ]
    db_config: Annotated[
        WaldiezRagUserVectorDbConfig,
        Field(
            title="DB Config",
            description="The config for the selected vector db.",
            default_factory=WaldiezRagUserVectorDbConfig,
        ),
    ]
    docs_path: Annotated[
        Optional[Union[str, List[str]]],
        Field(
            default=None,
            title="Docs Path",
            description=(
                "The path to the docs directory. It can also be the path to "
                "a single file, the url to a single file or a list of "
                "directories, files and urls. Default is None, which works "
                "only if the collection is already created."
            ),
        ),
    ]
    new_docs: Annotated[
        bool,
        Field(
            default=True,
            title="New Docs",
            description=(
                "When True, only adds new documents to the collection; "
                "when False, updates existing documents and adds new ones. "
                "Default is True. The document id is used to determine "
                "whether a document is new or existing. By default, the id "
                "is the hash value of the content."
            ),
        ),
    ]
    model: Annotated[
        Optional[str],
        Field(
            default=None,
            title="Model",
            description=(
                "The model to use for the retrieve chat. If not provided, "
                "we check for models linked to the agent."
            ),
        ),
    ]
    chunk_token_size: Annotated[
        Optional[int],
        Field(
            default=None,
            title="Chunk Token Size",
            description=(
                "The chunk token size for the retrieve chat. "
                "If not provided, a default size of max_tokens * 0.4 "
                "will be used."
            ),
        ),
    ]
    context_max_tokens: Annotated[
        Optional[int],
        Field(
            default=None,
            title="Context Max Tokens",
            description=(
                "The context max token size for the retrieve chat. "
                "If not provided, a default size of max_tokens * 0.8 "
                "will be used."
            ),
        ),
    ]
    chunk_mode: Annotated[
        WaldiezRagUserChunkMode,
        Field(
            default="multi_lines",
            title="Chunk Mode",
            description=(
                "The chunk mode for the retrieve chat. Possible values are "
                "'multi_lines' and 'one_line'. If not provided, the default "
                "mode 'multi_lines' will be used."
            ),
        ),
    ]
    must_break_at_empty_line: Annotated[
        bool,
        Field(
            default=True,
            title="Must Break at Empty Line",
            description=(
                "If True, chunks will only break at empty lines. Default is "
                "True. If chunk_mode is 'one_line', this parameter is "
                "ignored."
            ),
        ),
    ]
    use_custom_embedding: Annotated[
        bool,
        Field(
            default=False,
            title="Use Custom Embedding",
            description=(
                "Whether to use a custom embedding for the retrieve chat. "
                "Default is False. If True, the embedding_function should "
                "be provided."
            ),
        ),
    ]
    embedding_function: Annotated[
        Optional[str],
        Field(
            default=None,
            title="Embedding Function",
            description=(
                "The embedding function for creating the vector db. "
                "Default is None; a SentenceTransformer with the given "
                "embedding model will be used. If you want to use OpenAI, "
                "Cohere, HuggingFace or other embedding functions, "
                "you can pass one here, following the examples in "
                "https://docs.trychroma.com/guides/embeddings."
            ),
        ),
    ]
    customized_prompt: Annotated[
        Optional[str],
        Field(
            default=None,
            title="Customized Prompt",
            description=(
                "The customized prompt for the retrieve chat. Default is None."
            ),
        ),
    ]
    customized_answer_prefix: Annotated[
        Optional[str],
        Field(
            default="",
            title="Customized Answer Prefix",
            description=(
                "The customized answer prefix for the retrieve chat. "
                "Default is ''. If not '' and the customized_answer_prefix "
                "is not in the answer, Update Context will be triggered."
            ),
        ),
    ]
    update_context: Annotated[
        bool,
        Field(
            default=True,
            title="Update Context",
            description=(
                "If False, Update Context will not be applied for "
                "interactive retrieval. Default is True."
            ),
        ),
    ]
    collection_name: Annotated[
        str,
        Field(
            default="autogen-docs",
            title="Collection Name",
            description=(
                "The name of the collection. If not provided, "
                "the default name 'autogen-docs' will be used."
            ),
        ),
    ]
    get_or_create: Annotated[
        bool,
        Field(
            default=False,
            title="Get or Create",
            description=(
                "Whether to get the collection if it exists. Default is False."
            ),
        ),
    ]
    overwrite: Annotated[
        bool,
        Field(
            default=False,
            title="Overwrite",
            description=(
                "Whether to overwrite the collection if it exists. "
                "Default is False. "
                "Case 1: if the collection does not exist, "
                "create the collection. "
                "Case 2: the collection exists; if overwrite is True, "
                "it will overwrite the collection. "
                "Case 3: the collection exists and overwrite is False; if "
                "get_or_create is True, it will get the collection, "
                "otherwise it raises a ValueError."
            ),
        ),
    ]
    use_custom_token_count: Annotated[
        bool,
        Field(
            default=False,
            title="Use Custom Token Count",
            description=(
                "Whether to use a custom token count function for the "
                "retrieve chat. Default is False. If True, the "
                "custom_token_count_function should be provided."
            ),
        ),
    ]
    custom_token_count_function: Annotated[
        Optional[str],
        Field(
            default=None,
            title="Custom Token Count Function",
            description=(
                "A custom function to count the number of tokens in a string. "
                "The function should take (text: str, model: str) as input "
                "and return the token count (int). The "
                "retrieve_config['model'] will be passed to the function. "
                "Default is autogen.token_count_utils.count_token, which "
                "uses tiktoken and may not be accurate for non-OpenAI models."
            ),
        ),
    ]
    use_custom_text_split: Annotated[
        bool,
        Field(
            default=False,
            title="Use Custom Text Split",
            description=(
                "Whether to use a custom text split function for the "
                "retrieve chat. Default is False. If True, the "
                "custom_text_split_function should be provided."
            ),
        ),
    ]
    custom_text_split_function: Annotated[
        Optional[str],
        Field(
            default=None,
            title="Custom Text Split Function",
            description=(
                "A custom function to split a string into a list of strings. "
                "Default is None, which uses the default function "
                "autogen.retrieve_utils.split_text_to_chunks."
            ),
        ),
    ]
    custom_text_types: Annotated[
        Optional[List[str]],
        Field(
            default=None,
            title="Custom Text Types",
            description=(
                "A list of file types to be processed. "
                "Default is autogen.retrieve_utils.TEXT_FORMATS. "
                "This only applies to files under the directories in "
                "docs_path. Explicitly included files and urls will be "
                "chunked regardless of their types."
            ),
        ),
    ]
    recursive: Annotated[
        bool,
        Field(
            default=True,
            title="Recursive",
            description=(
                "Whether to search documents recursively in the docs_path. "
                "Default is True."
            ),
        ),
    ]
    distance_threshold: Annotated[
        float,
        Field(
            default=-1,
            title="Distance Threshold",
            description=(
                "The threshold for the distance score; only results with a "
                "distance smaller than this will be returned. "
                "Ignored if < 0. Default is -1."
            ),
        ),
    ]
    n_results: Annotated[
        Optional[int],
        Field(
            default=None,
            title="Number of Results",
            description=(
                "The number of results to return. Default is None, "
                "which returns all results. "
                "Use None or a value < 1 to return all results."
            ),
        ),
    ]

    _embedding_function_string: Optional[str] = None
    _token_count_function_string: Optional[str] = None
    _text_split_function_string: Optional[str] = None

    @property
    def embedding_function_string(self) -> Optional[str]:
        """Get the embedding function string.

        Returns
        -------
        Optional[str]
            The embedding function string.
        """
        return self._embedding_function_string

    @property
    def token_count_function_string(self) -> Optional[str]:
        """Get the token count function string.

        Returns
        -------
        Optional[str]
            The token count function string.
        """
        return self._token_count_function_string

    @property
    def text_split_function_string(self) -> Optional[str]:
        """Get the text split function string.

        Returns
        -------
        Optional[str]
            The text split function string.
        """
        return self._text_split_function_string

    def validate_custom_embedding_function(self) -> None:
        """Validate the custom embedding function.

        Raises
        ------
        ValueError
            If the validation fails.
        """
        if self.use_custom_embedding:
            if not self.embedding_function:
                raise ValueError(
                    "The embedding_function is required "
                    "if use_custom_embedding is True."
                )
            function_name: WaldiezMethodName = "custom_embedding_function"
            valid, error_or_content = check_function(
                self.embedding_function, function_name
            )
            if not valid:
                raise ValueError(error_or_content)
            self._embedding_function_string = error_or_content

    def validate_custom_token_count_function(self) -> None:
        """Validate the custom token count function.

        Raises
        ------
        ValueError
            If the validation fails.
        """
        if self.use_custom_token_count:
            if not self.custom_token_count_function:
                raise ValueError(
                    "The custom_token_count_function is required "
                    "if use_custom_token_count is True."
                )
            function_name: WaldiezMethodName = "custom_token_count_function"
            valid, error_or_content = check_function(
                self.custom_token_count_function, function_name
            )
            if not valid:
                raise ValueError(error_or_content)
            self._token_count_function_string = error_or_content

    def validate_custom_text_split_function(self) -> None:
        """Validate the custom text split function.

        Raises
        ------
        ValueError
            If the validation fails.
        """
        if self.use_custom_text_split:
            if not self.custom_text_split_function:
                raise ValueError(
                    "The custom_text_split_function is required "
                    "if use_custom_text_split is True."
                )
            function_name: WaldiezMethodName = "custom_text_split_function"
            valid, error_or_content = check_function(
                self.custom_text_split_function, function_name
            )
            if not valid:
                raise ValueError(error_or_content)
            self._text_split_function_string = error_or_content

    @model_validator(mode="after")
    def validate_rag_user_data(self) -> Self:
        """Validate the RAG user data.

        Raises
        ------
        ValueError
            If the validation fails.

        Returns
        -------
        WaldiezRagUserRetrieveConfig
            The validated RAG user retrieve config.
        """
        self.validate_custom_embedding_function()
        self.validate_custom_token_count_function()
        self.validate_custom_text_split_function()
        if not self.db_config.model:
            self.db_config.model = WaldiezRagUserModels[self.vector_db]
        if isinstance(self.n_results, int) and self.n_results < 1:
            self.n_results = None
        return self
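
For context, here is a minimal usage sketch of the model above. It assumes the package is installed, that the import path mirrors the file layout in this wheel, and that `WaldiezRagUserVectorDbConfig.model` starts out empty so the after-validator fills in the per-database default embedding model; the docs paths are hypothetical.

# A minimal sketch of using WaldiezRagUserRetrieveConfig; the import path
# below is assumed from the file layout listed in this diff.
from waldiez.models.agents.rag_user.retrieve_config import (
    WaldiezRagUserRetrieveConfig,
)

# camelCase aliases are accepted (alias_generator=to_camel); snake_case
# field names also work (populate_by_name=True).
config = WaldiezRagUserRetrieveConfig(
    task="qa",
    vectorDb="qdrant",
    docsPath=["docs/", "https://example.com/readme.md"],  # hypothetical
    nResults=0,  # values < 1 are normalized to None ("return all")
)
assert config.n_results is None
# Assuming db_config.model defaulted to empty, the after-validator filled
# in the default embedding model for the chosen vector db.
assert config.db_config.model == "BAAI/bge-small-en-v1.5"

# Declaring a custom embedding without providing the function fails fast;
# pydantic surfaces the validator's ValueError as a ValidationError
# (itself a ValueError subclass).
try:
    WaldiezRagUserRetrieveConfig(use_custom_embedding=True)
except ValueError as error:
    print(error)

Since `extra="forbid"` is set, unknown keys are rejected as well, so a typo in a camelCase alias surfaces as a validation error instead of being silently ignored.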