crewplus 0.2.7__tar.gz → 0.2.9__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of crewplus might be problematic. Click here for more details.

@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: crewplus
3
- Version: 0.2.7
3
+ Version: 0.2.9
4
4
  Summary: Base services for CrewPlus AI applications
5
5
  Author-Email: Tim Liu <tim@opsmateai.com>
6
6
  License: MIT
@@ -9,6 +9,30 @@ from crewplus.utils.schema_document_updater import SchemaDocumentUpdater
9
9
  from crewplus.utils.schema_action import Action
10
10
  from .milvus_schema_manager import MilvusSchemaManager
11
11
 
12
+ DEFAULT_SCHEMA = """
13
+ {
14
+ "node_types": {
15
+ "Document": {
16
+ "properties": {
17
+ "pk": {
18
+ "type": "INT64",
19
+ "is_primary": true,
20
+ "auto_id": true
21
+ },
22
+ "vector": {
23
+ "type": "FLOAT_VECTOR",
24
+ "dim": 1536
25
+ },
26
+ "text": {
27
+ "type": "VARCHAR",
28
+ "max_length": 65535,
29
+ "description": "The core text of the memory. This could be a user query, a documented fact, a procedural step, or a log of an event."
30
+ }
31
+ }
32
+ }
33
+ }
34
+ }
35
+ """
12
36
 
13
37
  class SchemaMilvus(Milvus):
14
38
  """
@@ -1,394 +1,427 @@
1
- # -*- coding: utf-8 -*-
2
- # @Author: Cursor
3
- # @Date: 2025-02-12
4
- # @Last Modified by: Gemini
5
- # @Last Modified time: 2025-07-01
6
-
7
- import logging
8
- from typing import List, Dict, Union, Optional
9
- from langchain_milvus import Zilliz
10
- from langchain_core.embeddings import Embeddings
11
- from langchain_openai import AzureOpenAIEmbeddings
12
- from pymilvus import MilvusClient
13
-
14
- from crewplus.services.init_services import get_model_balancer
15
- from crewplus.vectorstores.milvus.schema_milvus import SchemaMilvus
16
-
17
- class VDBService(object):
18
- """
19
- A service to manage connections to Milvus/Zilliz vector databases and embedding models.
20
-
21
- This service centralizes the configuration and instantiation of the Milvus client
22
- and provides helper methods to get embedding functions and vector store instances.
23
-
24
- Args:
25
- settings (dict, optional): A dictionary containing configuration for the vector store
26
- and embedding models.
27
- endpoint (str, optional): The URI for the Zilliz cluster. Can be used for simple
28
- initialization instead of `settings`.
29
- token (str, optional): The token for authenticating with Zilliz. Must be provided
30
- with `endpoint`.
31
- schema (str, optional): The schema definition for a collection. Defaults to None.
32
- logger (logging.Logger, optional): An optional logger instance. Defaults to None.
33
-
34
- Raises:
35
- ValueError: If required configurations are missing.
36
- NotImplementedError: If an unsupported provider is specified.
37
- RuntimeError: If the MilvusClient fails to initialize after a retry.
38
-
39
- Example:
40
- >>> # Initialize with a full settings dictionary
41
- >>> settings = {
42
- ... "embedder": {
43
- ... "provider": "azure-openai",
44
- ... "config": {
45
- ... "model": "text-embedding-3-small",
46
- ... "api_version": "2023-05-15",
47
- ... "api_key": "YOUR_AZURE_OPENAI_KEY",
48
- ... "openai_base_url": "YOUR_AZURE_OPENAI_ENDPOINT",
49
- ... "embedding_dims": 1536
50
- ... }
51
- ... },
52
- ... "vector_store": {
53
- ... "provider": "milvus",
54
- ... "config": {
55
- ... "host": "localhost",
56
- ... "port": 19530,
57
- ... "user": "root",
58
- ... "password": "password",
59
- ... "db_name": "default"
60
- ... }
61
- ... },
62
- ... "index_params": {
63
- ... "metric_type": "L2",
64
- ... "index_type": "AUTOINDEX",
65
- ... "params": {}
66
- ... }
67
- ... }
68
- >>> vdb_service = VDBService(settings=settings)
69
- >>>
70
- >>> # Alternatively, initialize with an endpoint and token for Zilliz
71
- >>> # vdb_service_zilliz = VDBService(endpoint="YOUR_ZILLIZ_ENDPOINT", token="YOUR_ZILLIZ_TOKEN")
72
- >>>
73
- >>> # Get the raw Milvus client
74
- >>> client = vdb_service.get_vector_client()
75
- >>> print(client.list_collections())
76
- >>> # Get an embedding function
77
- >>> embeddings = vdb_service.get_embeddings()
78
- >>> print(embeddings)
79
- >>> # Get a LangChain vector store instance (will be cached)
80
- >>> vector_store = vdb_service.get_vector_store(collection_name="my_collection")
81
- >>> print(vector_store)
82
- >>> same_vector_store = vdb_service.get_vector_store(collection_name="my_collection")
83
- >>> assert vector_store is same_vector_store
84
- """
85
- _client: MilvusClient
86
- _instances: Dict[str, Zilliz] = {}
87
-
88
- schema: str
89
- embedding_function: Embeddings
90
- index_params: dict
91
- connection_args: dict
92
- settings: dict
93
-
94
- def __init__(self, settings: dict = None, endpoint: str = None, token: str = None, schema: str = None, logger: logging.Logger = None):
95
- """
96
- Initializes the VDBService.
97
-
98
- Can be initialized in two ways:
99
- 1. By providing a full `settings` dictionary for complex configurations.
100
- 2. By providing `endpoint` and `token` for a direct Zilliz connection.
101
- Note: When using this method, an `embedder` configuration is not created.
102
- You must either use the `ModelLoadBalancer` or pass an `Embeddings` object
103
- directly to methods like `get_vector_store`.
104
-
105
- Args:
106
- settings (dict, optional): Configuration dictionary for the service. Defaults to None.
107
- endpoint (str, optional): The URI for the Zilliz cluster. Used if `settings` is not provided.
108
- token (str, optional): The token for authenticating with the Zilliz cluster.
109
- schema (str, optional): Default schema for new collections. Defaults to None.
110
- logger (logging.Logger, optional): Logger instance. Defaults to None.
111
- """
112
- self.logger = logger or logging.getLogger(__name__)
113
-
114
- if settings:
115
- self.settings = settings
116
- elif endpoint and token:
117
- self.logger.info("Initializing VDBService with endpoint and token for a Zilliz connection.")
118
- self.settings = {
119
- "vector_store": {
120
- "provider": "zilliz",
121
- "config": {
122
- "uri": endpoint,
123
- "token": token
124
- }
125
- }
126
- }
127
- else:
128
- raise ValueError("VDBService must be initialized with either a 'settings' dictionary or both 'endpoint' and 'token'.")
129
-
130
- vector_store_settings = self.settings.get("vector_store")
131
- if not vector_store_settings:
132
- msg = "'vector_store' not found in settings"
133
- self.logger.error(msg)
134
- raise ValueError(msg)
135
-
136
- provider = vector_store_settings.get("provider")
137
- self.connection_args = vector_store_settings.get("config")
138
-
139
- if not provider or not self.connection_args:
140
- msg = "'provider' or 'config' not found in 'vector_store' settings"
141
- self.logger.error(msg)
142
- raise ValueError(msg)
143
-
144
- self._client = self._initialize_milvus_client(provider)
145
-
146
- self.schema = schema
147
- self.index_params = self.settings.get("index_params")
148
-
149
- self.logger.info("VDBService initialized successfully")
150
-
151
- def _initialize_milvus_client(self, provider: str) -> MilvusClient:
152
- """
153
- Initializes and returns a MilvusClient with a retry mechanism.
154
- """
155
- client_args = {}
156
- if provider == "milvus":
157
- host = self.connection_args.get("host", "localhost")
158
- port = self.connection_args.get("port", 19530)
159
-
160
- # Use https for remote hosts, and http for local connections.
161
- scheme = "https" if host not in ["localhost", "127.0.0.1"] else "http"
162
- uri = f"{scheme}://{host}:{port}"
163
-
164
- client_args = {
165
- "uri": uri,
166
- "user": self.connection_args.get("user"),
167
- "password": self.connection_args.get("password"),
168
- "db_name": self.connection_args.get("db_name")
169
- }
170
- # Filter out None values to use client defaults
171
- client_args = {k: v for k, v in client_args.items() if v is not None}
172
-
173
- elif provider == "zilliz":
174
- client_args = self.connection_args
175
- else:
176
- self.logger.error(f"Unsupported vector store provider: {provider}")
177
- raise NotImplementedError(f"Vector store provider '{provider}' is not supported.")
178
-
179
- try:
180
- # First attempt to connect
181
- return MilvusClient(**client_args)
182
- except Exception as e:
183
- self.logger.error(f"Failed to initialize MilvusClient, trying again. Error: {e}")
184
- # Second attempt after failure
185
- try:
186
- return MilvusClient(**client_args)
187
- except Exception as e_retry:
188
- self.logger.error(f"Failed to initialize MilvusClient on retry. Final error: {e_retry}")
189
- raise RuntimeError(f"Could not initialize MilvusClient after retry: {e_retry}")
190
-
191
- def get_vector_client(self) -> MilvusClient:
192
- """
193
- Returns the active MilvusClient instance.
194
-
195
- Returns:
196
- MilvusClient: The initialized client for interacting with the vector database.
197
- """
198
- return self._client
199
-
200
- def get_embeddings(self, from_model_balancer: bool = False, provider: Optional[str] = "azure-openai", model_type: Optional[str] = "embedding-large") -> Embeddings:
201
- """
202
- Gets an embedding function, either from the model balancer or directly from settings.
203
-
204
- Args:
205
- from_model_balancer (bool): If True, uses the central model balancer service.
206
- If False, creates a new instance based on 'embedder' settings.
207
- model_type (str, optional): The type of model to get from the balancer. Defaults to "embedding-large".
208
-
209
- Returns:
210
- Embeddings: An instance of a LangChain embedding model.
211
- """
212
- if from_model_balancer:
213
- model_balancer = get_model_balancer()
214
- return model_balancer.get_model(provider=provider, model_type=model_type)
215
-
216
- embedder_config = self.settings.get("embedder")
217
- if not embedder_config:
218
- self.logger.error("'embedder' configuration not found in settings.")
219
- raise ValueError("'embedder' configuration not found in settings.")
220
-
221
- provider = embedder_config.get("provider")
222
- config = embedder_config.get("config")
223
-
224
- if not provider or not config:
225
- self.logger.error("Embedder 'provider' or 'config' not found in settings.")
226
- raise ValueError("Embedder 'provider' or 'config' not found in settings.")
227
-
228
- if provider == "azure-openai":
229
- # Map the settings config to AzureOpenAIEmbeddings parameters.
230
- azure_config = {
231
- "azure_deployment": config.get("model"),
232
- "openai_api_version": config.get("api_version"),
233
- "api_key": config.get("api_key"),
234
- "azure_endpoint": config.get("openai_base_url"),
235
- "dimensions": config.get("embedding_dims"),
236
- "chunk_size": config.get("chunk_size", 16),
237
- "request_timeout": config.get("request_timeout", 60),
238
- "max_retries": config.get("max_retries", 2)
239
- }
240
- # Filter out None values to use client defaults.
241
- azure_config = {k: v for k, v in azure_config.items() if v is not None}
242
-
243
- return AzureOpenAIEmbeddings(**azure_config)
244
- else:
245
- self.logger.error(f"Unsupported embedding provider: {provider}")
246
- raise NotImplementedError(f"Embedding provider '{provider}' is not supported yet.")
247
-
248
- def get_vector_store(self, collection_name: str, embeddings: Embeddings = None, metric_type: str = "L2") -> Zilliz:
249
- """
250
- Gets a vector store instance, creating it if it doesn't exist for the collection.
251
- This method validates both the embedding function and the vector store connection
252
- before caching the instance to prevent faulty instances from being reused.
253
-
254
- Args:
255
- collection_name (str): The name of the collection in the vector database.
256
- embeddings (Embeddings, optional): An embedding model instance. If None, one is created.
257
- metric_type (str): The distance metric for the index. Defaults to "L2".
258
-
259
- Returns:
260
- Zilliz: LangChain Zilliz instance, which is compatible with both Zilliz and Milvus.
261
- """
262
- if not collection_name:
263
- self.logger.error("get_vector_store called with no collection_name.")
264
- raise ValueError("collection_name must be provided.")
265
-
266
- # Return the cached instance if it already exists.
267
- if collection_name in self._instances:
268
- self.logger.info(f"Returning existing vector store instance for collection: {collection_name}")
269
- return self._instances[collection_name]
270
-
271
- self.logger.info(f"Creating new vector store instance for collection: {collection_name}")
272
- if embeddings is None:
273
- embeddings = self.get_embeddings()
274
-
275
- # 1. Validate the embedding function before proceeding.
276
- try:
277
- self.logger.debug(f"Testing embedding function for collection '{collection_name}'...")
278
- embeddings.embed_query("validation_test_string")
279
- self.logger.debug("Embedding function is valid.")
280
- except Exception as e:
281
- self.logger.error(
282
- f"The provided embedding function is invalid and failed with error: {e}. "
283
- f"Cannot create a vector store for collection '{collection_name}'."
284
- )
285
- raise RuntimeError(f"Invalid embedding function provided.") from e
286
-
287
- # If embeddings are valid, proceed to create the Zilliz instance.
288
- index_params = self.index_params or {
289
- "metric_type": metric_type,
290
- "index_type": "AUTOINDEX",
291
- "params": {}
292
- }
293
-
294
- vdb = Zilliz(
295
- embedding_function=embeddings,
296
- collection_name=collection_name,
297
- connection_args=self.connection_args,
298
- index_params=index_params
299
- )
300
-
301
- # Cache the newly created instance.
302
- self._instances[collection_name] = vdb
303
-
304
- return vdb
305
-
306
- def delete_old_indexes(self, url: str = None, vdb: Zilliz = None) -> (bool | None):
307
- """ Delete old indexes of the same source_url
308
-
309
- Args:
310
- url (str): source url
311
- vdb (Zilliz): Zilliz instance
312
- """
313
- self.logger.info(f"Delete old indexes of the same source_url:{url}")
314
-
315
- if url is None or vdb is None:
316
- return None
317
-
318
- # Delete indexes of the same source_url
319
- expr = f'source_url == "{url}" or source == "{url}"'
320
- pks = vdb.get_pks(expr)
321
-
322
- # Delete entities by pks
323
- if pks is not None and len(pks) > 0 :
324
- res = vdb.delete(pks)
325
- self.logger.info("Deleted old indexes result: " + str(res))
326
- return res
327
-
328
- def delete_old_indexes_by_id(self, source_id: str = None, vdb: Zilliz = None) -> (bool | None):
329
- """ Delete old indexes of the same source_id
330
-
331
- Args:
332
- source_id (str): source id
333
- """
334
- self.logger.info(f"Delete old indexes of the same source_id:{source_id}")
335
-
336
- if source_id is None or vdb is None:
337
- return None
338
-
339
- # Delete indexes of the same source_id
340
- expr = f'source_id == "{source_id}"'
341
- pks = vdb.get_pks(expr)
342
-
343
- # Delete entities by pks
344
- if pks is not None and len(pks) > 0 :
345
- res = vdb.delete(pks)
346
- self.logger.info("Deleted old indexes result: " + str(res))
347
- return res
348
-
349
- def drop_collection(self, collection_name: str) -> None:
350
- """
351
- Deletes a collection from the vector database and removes it from the cache.
352
-
353
- Args:
354
- collection_name (str): The name of the collection to drop.
355
-
356
- Raises:
357
- ValueError: If collection_name is not provided.
358
- RuntimeError: If the operation fails on the database side.
359
- """
360
- if not collection_name:
361
- self.logger.error("drop_collection called without a collection_name.")
362
- raise ValueError("collection_name must be provided.")
363
-
364
- self.logger.info(f"Attempting to drop collection: {collection_name}")
365
-
366
- try:
367
- client = self.get_vector_client()
368
- client.drop_collection(collection_name=collection_name)
369
- self.logger.info(f"Successfully dropped collection: {collection_name}")
370
- except Exception as e:
371
- self.logger.error(f"Failed to drop collection '{collection_name}': {e}")
372
- raise RuntimeError(f"An error occurred while dropping collection '{collection_name}'.") from e
373
- finally:
374
- # Whether successful or not, remove the stale instance from the cache.
375
- if collection_name in self._instances:
376
- del self._instances[collection_name]
377
- self.logger.info(f"Removed '{collection_name}' from instance cache.")
378
-
379
- def delete_data_by_filter(self, collection_name: str = None, filter: str = None) -> None:
380
- """ Delete data by filter
381
-
382
- Args:
383
- collection_name (str): collection_name
384
- filter (str): filter
385
- """
386
- self.logger.info(f"Delete data by filter:{filter}")
387
-
388
- try:
389
- client=self.get_vector_client()
390
- if collection_name is None or client is None or filter is None:
391
- return RuntimeError(f"collection_name must be not null or check out your client to link milvus")
392
- client.delete(collection_name=collection_name, filter=filter)
393
- except Exception as e:
1
+ # -*- coding: utf-8 -*-
2
+ # @Author: Cursor
3
+ # @Date: 2025-02-12
4
+ # @Last Modified by: Gemini
5
+ # @Last Modified time: 2025-07-04
6
+
7
+ import logging
8
+ from typing import List, Dict, Union, Optional
9
+ from langchain_milvus import Zilliz
10
+ from langchain_core.embeddings import Embeddings
11
+ from langchain_openai import AzureOpenAIEmbeddings
12
+ from pymilvus import MilvusClient
13
+
14
+ from ...services.init_services import get_model_balancer
15
+ from .schema_milvus import SchemaMilvus, DEFAULT_SCHEMA
16
+
17
+ class VDBService(object):
18
+ """
19
+ A service to manage connections to Milvus/Zilliz vector databases and embedding models.
20
+
21
+ This service centralizes the configuration and instantiation of the Milvus client
22
+ and provides helper methods to get embedding functions and vector store instances.
23
+
24
+ Args:
25
+ settings (dict, optional): A dictionary containing configuration for the vector store
26
+ and embedding models.
27
+ endpoint (str, optional): The URI for the Zilliz cluster. Can be used for simple
28
+ initialization instead of `settings`.
29
+ token (str, optional): The token for authenticating with Zilliz. Must be provided
30
+ with `endpoint`.
31
+ schema (str, optional): The schema definition for a collection. Defaults to None.
32
+ logger (logging.Logger, optional): An optional logger instance. Defaults to None.
33
+
34
+ Raises:
35
+ ValueError: If required configurations are missing.
36
+ NotImplementedError: If an unsupported provider is specified.
37
+ RuntimeError: If the MilvusClient fails to initialize after a retry.
38
+
39
+ Example:
40
+ >>> # Initialize with a full settings dictionary
41
+ >>> settings = {
42
+ ... "embedder": {
43
+ ... "provider": "azure-openai",
44
+ ... "config": {
45
+ ... "model": "text-embedding-3-small",
46
+ ... "api_version": "2023-05-15",
47
+ ... "api_key": "YOUR_AZURE_OPENAI_KEY",
48
+ ... "openai_base_url": "YOUR_AZURE_OPENAI_ENDPOINT",
49
+ ... "embedding_dims": 1536
50
+ ... }
51
+ ... },
52
+ ... "vector_store": {
53
+ ... "provider": "milvus",
54
+ ... "config": {
55
+ ... "host": "localhost",
56
+ ... "port": 19530,
57
+ ... "user": "root",
58
+ ... "password": "password",
59
+ ... "db_name": "default"
60
+ ... }
61
+ ... },
62
+ ... "index_params": {
63
+ ... "metric_type": "L2",
64
+ ... "index_type": "AUTOINDEX",
65
+ ... "params": {}
66
+ ... }
67
+ ... }
68
+ >>> vdb_service = VDBService(settings=settings)
69
+ >>>
70
+ >>> # Alternatively, initialize with an endpoint and token for Zilliz
71
+ >>> # vdb_service_zilliz = VDBService(endpoint="YOUR_ZILLIZ_ENDPOINT", token="YOUR_ZILLIZ_TOKEN")
72
+ >>>
73
+ >>> # Get the raw Milvus client
74
+ >>> client = vdb_service.get_vector_client()
75
+ >>> print(client.list_collections())
76
+ >>> # Get an embedding function
77
+ >>> embeddings = vdb_service.get_embeddings()
78
+ >>> print(embeddings)
79
+ >>> # Get a LangChain vector store instance (will be cached)
80
+ >>> vector_store = vdb_service.get_vector_store(collection_name="my_collection")
81
+ >>> print(vector_store)
82
+ >>> same_vector_store = vdb_service.get_vector_store(collection_name="my_collection")
83
+ >>> assert vector_store is same_vector_store
84
+ """
85
+ _client: MilvusClient
86
+ _instances: Dict[str, Zilliz] = {}
87
+
88
+ schema: str
89
+ embedding_function: Embeddings
90
+ index_params: dict
91
+ connection_args: dict
92
+ settings: dict
93
+
94
+ def __init__(self, settings: dict = None, endpoint: str = None, token: str = None, schema: str = None, logger: logging.Logger = None):
95
+ """
96
+ Initializes the VDBService.
97
+
98
+ Can be initialized in two ways:
99
+ 1. By providing a full `settings` dictionary for complex configurations.
100
+ 2. By providing `endpoint` and `token` for a direct Zilliz connection.
101
+ Note: When using this method, an `embedder` configuration is not created.
102
+ You must either use the `ModelLoadBalancer` or pass an `Embeddings` object
103
+ directly to methods like `get_vector_store`.
104
+
105
+ Args:
106
+ settings (dict, optional): Configuration dictionary for the service. Defaults to None.
107
+ endpoint (str, optional): The URI for the Zilliz cluster. Used if `settings` is not provided.
108
+ token (str, optional): The token for authenticating with the Zilliz cluster.
109
+ schema (str, optional): Default schema for new collections. Defaults to None.
110
+ logger (logging.Logger, optional): Logger instance. Defaults to None.
111
+ """
112
+ self.logger = logger or logging.getLogger(__name__)
113
+
114
+ if settings:
115
+ self.settings = settings
116
+ elif endpoint and token:
117
+ self.logger.info("Initializing VDBService with endpoint and token for a Zilliz connection.")
118
+ self.settings = {
119
+ "vector_store": {
120
+ "provider": "zilliz",
121
+ "config": {
122
+ "uri": endpoint,
123
+ "token": token
124
+ }
125
+ }
126
+ }
127
+ else:
128
+ raise ValueError("VDBService must be initialized with either a 'settings' dictionary or both 'endpoint' and 'token'.")
129
+
130
+ vector_store_settings = self.settings.get("vector_store")
131
+ if not vector_store_settings:
132
+ msg = "'vector_store' not found in settings"
133
+ self.logger.error(msg)
134
+ raise ValueError(msg)
135
+
136
+ provider = vector_store_settings.get("provider")
137
+ self.connection_args = vector_store_settings.get("config")
138
+
139
+ if not provider or not self.connection_args:
140
+ msg = "'provider' or 'config' not found in 'vector_store' settings"
141
+ self.logger.error(msg)
142
+ raise ValueError(msg)
143
+
144
+ self._client = self._initialize_milvus_client(provider)
145
+
146
+ self.schema = schema
147
+ self.index_params = self.settings.get("index_params")
148
+
149
+ self.logger.info("VDBService initialized successfully")
150
+
151
+ def _initialize_milvus_client(self, provider: str) -> MilvusClient:
152
+ """
153
+ Initializes and returns a MilvusClient with a retry mechanism.
154
+ """
155
+ client_args = {}
156
+ if provider == "milvus":
157
+ host = self.connection_args.get("host", "localhost")
158
+ port = self.connection_args.get("port", 19530)
159
+
160
+ # Use https for remote hosts, and http for local connections.
161
+ scheme = "https" if host not in ["localhost", "127.0.0.1"] else "http"
162
+ uri = f"{scheme}://{host}:{port}"
163
+
164
+ client_args = {
165
+ "uri": uri,
166
+ "user": self.connection_args.get("user"),
167
+ "password": self.connection_args.get("password"),
168
+ "db_name": self.connection_args.get("db_name")
169
+ }
170
+ # Filter out None values to use client defaults
171
+ client_args = {k: v for k, v in client_args.items() if v is not None}
172
+
173
+ elif provider == "zilliz":
174
+ client_args = self.connection_args
175
+ else:
176
+ self.logger.error(f"Unsupported vector store provider: {provider}")
177
+ raise NotImplementedError(f"Vector store provider '{provider}' is not supported.")
178
+
179
+ try:
180
+ # First attempt to connect
181
+ return MilvusClient(**client_args)
182
+ except Exception as e:
183
+ self.logger.error(f"Failed to initialize MilvusClient, trying again. Error: {e}")
184
+ # Second attempt after failure
185
+ try:
186
+ return MilvusClient(**client_args)
187
+ except Exception as e_retry:
188
+ self.logger.error(f"Failed to initialize MilvusClient on retry. Final error: {e_retry}")
189
+ raise RuntimeError(f"Could not initialize MilvusClient after retry: {e_retry}")
190
+
191
+ def get_vector_client(self) -> MilvusClient:
192
+ """
193
+ Returns the active MilvusClient instance.
194
+
195
+ Returns:
196
+ MilvusClient: The initialized client for interacting with the vector database.
197
+ """
198
+ return self._client
199
+
200
+ def get_embeddings(self, from_model_balancer: bool = False, provider: Optional[str] = "azure-openai", model_type: Optional[str] = "embedding-large") -> Embeddings:
201
+ """
202
+ Gets an embedding function, either from the model balancer or directly from settings.
203
+
204
+ Args:
205
+ from_model_balancer (bool): If True, uses the central model balancer service.
206
+ If False, creates a new instance based on 'embedder' settings.
207
+ model_type (str, optional): The type of model to get from the balancer. Defaults to "embedding-large".
208
+
209
+ Returns:
210
+ Embeddings: An instance of a LangChain embedding model.
211
+ """
212
+ if from_model_balancer:
213
+ model_balancer = get_model_balancer()
214
+ return model_balancer.get_model(provider=provider, model_type=model_type)
215
+
216
+ embedder_config = self.settings.get("embedder")
217
+ if not embedder_config:
218
+ self.logger.error("'embedder' configuration not found in settings.")
219
+ raise ValueError("'embedder' configuration not found in settings.")
220
+
221
+ provider = embedder_config.get("provider")
222
+ config = embedder_config.get("config")
223
+
224
+ if not provider or not config:
225
+ self.logger.error("Embedder 'provider' or 'config' not found in settings.")
226
+ raise ValueError("Embedder 'provider' or 'config' not found in settings.")
227
+
228
+ if provider == "azure-openai":
229
+ # Map the settings config to AzureOpenAIEmbeddings parameters.
230
+ azure_config = {
231
+ "azure_deployment": config.get("model"),
232
+ "openai_api_version": config.get("api_version"),
233
+ "api_key": config.get("api_key"),
234
+ "azure_endpoint": config.get("openai_base_url"),
235
+ "dimensions": config.get("embedding_dims"),
236
+ "chunk_size": config.get("chunk_size", 16),
237
+ "request_timeout": config.get("request_timeout", 60),
238
+ "max_retries": config.get("max_retries", 2)
239
+ }
240
+ # Filter out None values to use client defaults.
241
+ azure_config = {k: v for k, v in azure_config.items() if v is not None}
242
+
243
+ return AzureOpenAIEmbeddings(**azure_config)
244
+ else:
245
+ self.logger.error(f"Unsupported embedding provider: {provider}")
246
+ raise NotImplementedError(f"Embedding provider '{provider}' is not supported yet.")
247
+
248
+ def _ensure_collection_exists(self, collection_name: str, embeddings: Embeddings):
249
+ """
250
+ Checks if a collection exists and creates it if it doesn't.
251
+ This operation is wrapped in a try-except block to handle potential failures
252
+ during collection creation.
253
+ """
254
+ try:
255
+ client = self.get_vector_client()
256
+ if not client.has_collection(collection_name):
257
+ self.logger.info(f"Collection '{collection_name}' does not exist. Creating it.")
258
+
259
+ schema_milvus = SchemaMilvus(
260
+ embedding_function=embeddings,
261
+ collection_name=collection_name,
262
+ connection_args=self.connection_args,
263
+ index_params=self.index_params
264
+ )
265
+
266
+ schema_to_use = self.schema or DEFAULT_SCHEMA
267
+ if not self.schema:
268
+ self.logger.warning(f"No schema provided for VDBService. Using DEFAULT_SCHEMA for collection '{collection_name}'.")
269
+
270
+ schema_milvus.set_schema(schema_to_use)
271
+
272
+ if not schema_milvus.create_collection():
273
+ raise RuntimeError(f"SchemaMilvus failed to create collection '{collection_name}'.")
274
+ except Exception as e:
275
+ self.logger.error(f"An error occurred while ensuring collection '{collection_name}' exists: {e}")
276
+ raise RuntimeError(f"Failed to ensure collection '{collection_name}' exists.") from e
277
+
278
+ def get_vector_store(self, collection_name: str, embeddings: Embeddings = None, metric_type: str = "L2") -> Zilliz:
279
+ """
280
+ Gets a vector store instance, creating it if it doesn't exist for the collection.
281
+ This method validates both the embedding function and the vector store connection
282
+ before caching the instance to prevent faulty instances from being reused.
283
+
284
+ Args:
285
+ collection_name (str): The name of the collection in the vector database.
286
+ embeddings (Embeddings, optional): An embedding model instance. If None, one is created.
287
+ metric_type (str): The distance metric for the index. Defaults to "L2".
288
+
289
+ Returns:
290
+ Zilliz: LangChain Zilliz instance, which is compatible with both Zilliz and Milvus.
291
+ """
292
+ if not collection_name:
293
+ self.logger.error("get_vector_store called with no collection_name.")
294
+ raise ValueError("collection_name must be provided.")
295
+
296
+ # Return the cached instance if it already exists.
297
+ if collection_name in self._instances:
298
+ self.logger.info(f"Returning existing vector store instance for collection: {collection_name}")
299
+ return self._instances[collection_name]
300
+
301
+ self.logger.info(f"Creating new vector store instance for collection: {collection_name}")
302
+ if embeddings is None:
303
+ embeddings = self.get_embeddings()
304
+
305
+ # Ensure the collection exists before proceeding.
306
+ self._ensure_collection_exists(collection_name, embeddings)
307
+
308
+ # 1. Validate the embedding function before proceeding.
309
+ try:
310
+ self.logger.debug(f"Testing embedding function for collection '{collection_name}'...")
311
+ embeddings.embed_query("validation_test_string")
312
+ self.logger.debug("Embedding function is valid.")
313
+ except Exception as e:
314
+ self.logger.error(
315
+ f"The provided embedding function is invalid and failed with error: {e}. "
316
+ f"Cannot create a vector store for collection '{collection_name}'."
317
+ )
318
+ raise RuntimeError(f"Invalid embedding function provided.") from e
319
+
320
+ # If embeddings are valid, proceed to create the Zilliz instance.
321
+ index_params = self.index_params or {
322
+ "metric_type": metric_type,
323
+ "index_type": "AUTOINDEX",
324
+ "params": {}
325
+ }
326
+
327
+ vdb = Zilliz(
328
+ embedding_function=embeddings,
329
+ collection_name=collection_name,
330
+ connection_args=self.connection_args,
331
+ index_params=index_params
332
+ )
333
+
334
+ # Cache the newly created instance.
335
+ self._instances[collection_name] = vdb
336
+
337
+ return vdb
338
+
339
+ def delete_old_indexes(self, url: str = None, vdb: Zilliz = None) -> (bool | None):
340
+ """ Delete old indexes of the same source_url
341
+
342
+ Args:
343
+ url (str): source url
344
+ vdb (Zilliz): Zilliz instance
345
+ """
346
+ self.logger.info(f"Delete old indexes of the same source_url:{url}")
347
+
348
+ if url is None or vdb is None:
349
+ return None
350
+
351
+ # Delete indexes of the same source_url
352
+ expr = f'source_url == "{url}" or source == "{url}"'
353
+ pks = vdb.get_pks(expr)
354
+
355
+ # Delete entities by pks
356
+ if pks is not None and len(pks) > 0 :
357
+ res = vdb.delete(pks)
358
+ self.logger.info("Deleted old indexes result: " + str(res))
359
+ return res
360
+
361
+ def delete_old_indexes_by_id(self, source_id: str = None, vdb: Zilliz = None) -> (bool | None):
362
+ """ Delete old indexes of the same source_id
363
+
364
+ Args:
365
+ source_id (str): source id
366
+ """
367
+ self.logger.info(f"Delete old indexes of the same source_id:{source_id}")
368
+
369
+ if source_id is None or vdb is None:
370
+ return None
371
+
372
+ # Delete indexes of the same source_id
373
+ expr = f'source_id == "{source_id}"'
374
+ pks = vdb.get_pks(expr)
375
+
376
+ # Delete entities by pks
377
+ if pks is not None and len(pks) > 0 :
378
+ res = vdb.delete(pks)
379
+ self.logger.info("Deleted old indexes result: " + str(res))
380
+ return res
381
+
382
+ def drop_collection(self, collection_name: str) -> None:
383
+ """
384
+ Deletes a collection from the vector database and removes it from the cache.
385
+
386
+ Args:
387
+ collection_name (str): The name of the collection to drop.
388
+
389
+ Raises:
390
+ ValueError: If collection_name is not provided.
391
+ RuntimeError: If the operation fails on the database side.
392
+ """
393
+ if not collection_name:
394
+ self.logger.error("drop_collection called without a collection_name.")
395
+ raise ValueError("collection_name must be provided.")
396
+
397
+ self.logger.info(f"Attempting to drop collection: {collection_name}")
398
+
399
+ try:
400
+ client = self.get_vector_client()
401
+ client.drop_collection(collection_name=collection_name)
402
+ self.logger.info(f"Successfully dropped collection: {collection_name}")
403
+ except Exception as e:
404
+ self.logger.error(f"Failed to drop collection '{collection_name}': {e}")
405
+ raise RuntimeError(f"An error occurred while dropping collection '{collection_name}'.") from e
406
+ finally:
407
+ # Whether successful or not, remove the stale instance from the cache.
408
+ if collection_name in self._instances:
409
+ del self._instances[collection_name]
410
+ self.logger.info(f"Removed '{collection_name}' from instance cache.")
411
+
412
+ def delete_data_by_filter(self, collection_name: str = None, filter: str = None) -> None:
413
+ """ Delete data by filter
414
+
415
+ Args:
416
+ collection_name (str): collection_name
417
+ filter (str): filter
418
+ """
419
+ self.logger.info(f"Delete data by filter:{filter}")
420
+
421
+ try:
422
+ client=self.get_vector_client()
423
+ if collection_name is None or client is None or filter is None:
424
+ return RuntimeError(f"collection_name must be not null or check out your client to link milvus")
425
+ client.delete(collection_name=collection_name, filter=filter)
426
+ except Exception as e:
394
427
  raise RuntimeError(f"delete collection data failed: {str(e)}")
@@ -6,7 +6,7 @@ build-backend = "pdm.backend"
6
6
 
7
7
  [project]
8
8
  name = "crewplus"
9
- version = "0.2.7"
9
+ version = "0.2.9"
10
10
  description = "Base services for CrewPlus AI applications"
11
11
  authors = [
12
12
  { name = "Tim Liu", email = "tim@opsmateai.com" },
File without changes
File without changes
File without changes
File without changes
File without changes