crewplus 0.1.6__tar.gz → 0.2.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of crewplus might be problematic.

@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: crewplus
- Version: 0.1.6
+ Version: 0.2.1
  Summary: Base services for CrewPlus AI applications
  Author-Email: Tim Liu <tim@opsmateai.com>
  License: MIT
@@ -15,6 +15,7 @@ Requires-Dist: google-genai==1.21.1
  Requires-Dist: mkdocs<2.0.0,>=1.6.1
  Requires-Dist: mkdocs-material<10.0.0,>=9.6.14
  Requires-Dist: mkdocstrings-python<2.0.0,>=1.16.12
+ Requires-Dist: langchain-milvus<0.3.0,>=0.2.1
  Description-Content-Type: text/markdown

  # CrewPlus
@@ -37,6 +38,7 @@ CrewPlus is designed as a modular and extensible ecosystem of packages. This all
  - **`crewplus` (This package):** The core package containing foundational services for chat, model load balancing, and vector stores.
  - **`crewplus-agents`:** An extension for creating and managing autonomous AI agents.
  - **`crewplus-ingestion`:** Provides robust pipelines for knowledge ingestion and data processing.
+ - **`crewplus-memory`:** Provides agent memory services for CrewPlus AI agents.
  - **`crewplus-integrations`:** A collection of third-party integrations to connect CrewPlus with other services and platforms.

  ## Features
@@ -94,6 +96,10 @@ crewplus-base/ # GitHub repo name
  │ └── gemini_chat_model.py
  │ └── model_load_balancer.py
  │ └── ...
+ │ └── vectorstores/milvus
+ │ └── __init__.py
+ │ └── schema_milvus.py
+ │ └── vdb_service.py
  │ └── core/
  │ └── __init__.py
  │ └── config.py
@@ -18,6 +18,7 @@ CrewPlus is designed as a modular and extensible ecosystem of packages. This all
  - **`crewplus` (This package):** The core package containing foundational services for chat, model load balancing, and vector stores.
  - **`crewplus-agents`:** An extension for creating and managing autonomous AI agents.
  - **`crewplus-ingestion`:** Provides robust pipelines for knowledge ingestion and data processing.
+ - **`crewplus-memory`:** Provides agent memory services for CrewPlus AI agents.
  - **`crewplus-integrations`:** A collection of third-party integrations to connect CrewPlus with other services and platforms.

  ## Features
@@ -75,6 +76,10 @@ crewplus-base/ # GitHub repo name
  │ └── gemini_chat_model.py
  │ └── model_load_balancer.py
  │ └── ...
+ │ └── vectorstores/milvus
+ │ └── __init__.py
+ │ └── schema_milvus.py
+ │ └── vdb_service.py
  │ └── core/
  │ └── __init__.py
  │ └── config.py
@@ -0,0 +1,16 @@
+ import os
+ from crewplus.services.model_load_balancer import ModelLoadBalancer
+
+ model_balancer = None
+
+ def init_load_balancer():
+     global model_balancer
+     if model_balancer is None:
+         config_path = os.getenv("MODEL_CONFIG_PATH", "config/models_config.json")
+         model_balancer = ModelLoadBalancer(config_path)
+         model_balancer.load_config()  # Load initial configuration synchronously
+
+ def get_model_balancer() -> ModelLoadBalancer:
+     if model_balancer is None:
+         raise RuntimeError("ModelLoadBalancer not initialized")
+     return model_balancer
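For reference, a minimal usage sketch of these new initialization helpers; the config path and the `model_type` value are illustrative, and `get_model` is the balancer method documented in the next hunk:

```python
import os

from crewplus.services.init_services import init_load_balancer, get_model_balancer

# Point the balancer at a models config file before initializing (illustrative path).
os.environ["MODEL_CONFIG_PATH"] = "config/models_config.json"

init_load_balancer()              # idempotent: only creates the module-level singleton once
balancer = get_model_balancer()   # raises RuntimeError if init_load_balancer() was never called

# Fetch a model by type, matching the model_type values documented below.
llm = balancer.get_model(model_type="inference")
```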
@@ -81,7 +81,7 @@ class ModelLoadBalancer:

          Args:
              provider: The model provider (e.g., 'azure-openai', 'google-genai').
-             model_type: The type of model (e.g., 'inference', 'embedding').
+             model_type: The type of model (e.g., 'inference', 'embedding', 'embedding-large').
              deployment_name: The unique name for the model deployment.

          Returns:
@@ -0,0 +1,7 @@
+ from enum import Enum
+
+ class Action(Enum):
+     UPSERT = "upsert"  # Update existing fields; if a match is found, it updates, otherwise it inserts. Does not delete unmatched existing fields.
+     DELETE = "delete"  # Clear data from fields in the schema.
+     UPDATE = "update"  # Update only the matching original fields.
+     INSERT = "insert"  # Insert data, clearing the original fields before inserting new values.
@@ -0,0 +1,173 @@
+ from typing import List
+ from langchain_core.documents import Document
+ import random
+
+ class SchemaDocumentUpdater:
+     """A utility class for updating and creating LangChain Documents with specific metadata schemas."""
+
+     @staticmethod
+     def update_document_metadata(document: Document, metadata: dict) -> Document:
+         """
+         Updates the metadata of a LangChain Document.
+
+         Args:
+             document (Document): The document to update.
+             metadata (dict): A dictionary containing the metadata to add or update.
+
+         Returns:
+             Document: The updated document with the new metadata.
+         """
+         metadata_updates = document.metadata
+
+         for key, value in metadata.items():
+             metadata_updates[key] = value
+
+         return Document(
+             page_content=document.page_content,
+             metadata=metadata_updates
+         )
+
+     @staticmethod
+     def delete_document_metadata(document: Document, keys_to_delete: List[str]) -> Document:
+         """
+         Deletes specified keys from the metadata of a LangChain Document.
+
+         Args:
+             document (Document): The document to update.
+             keys_to_delete (List[str]): A list of keys to delete from the metadata.
+
+         Returns:
+             Document: The updated document with the specified metadata keys removed.
+         """
+         metadata = document.metadata
+
+         for key in keys_to_delete:
+             if key in metadata:
+                 del metadata[key]
+
+         return Document(
+             page_content=document.page_content,
+             metadata=metadata
+         )
+
+     @staticmethod
+     def add_sample_metadata(document: Document, type: str) -> Document:
+         """
+         Adds sample metadata to a document based on a specified type.
+
+         The metadata schema is tailored for either "Reg Wheel" or "Robot" types.
+
+         Args:
+             document (Document): The document to which sample metadata will be added.
+             type (str): The type of sample metadata to add ("Reg Wheel" or "Robot").
+
+         Returns:
+             Document: The document with added sample metadata.
+         """
+         if type == "Reg Wheel":
+             meta = {
+                 "keywords": "Reg Wheel",
+                 "plant_metadata": {
+                     "entity_id": "EQUIP_123",
+                     "entity_type": "Machine",
+                     "hierarchy_path": "/EnterpriseA/SITE_A/LINE_003/",
+                     "entity_tags": ["nickname_for_EQUIP_123", "PB3"],
+                     "parent_entity": None,
+                     "linked_entities": []
+                 },
+                 "version_metadata": {
+                     "version_id": "V2.0",
+                     "version_tags": ["global"],
+                     "version_date": "2024/05/23"
+                 },
+                 "other_metadata": {}
+             }
+         else:  # Robot
+             meta = {
+                 "keywords": "Robot",
+                 "plant_metadata": {
+                     "entity_id": "EQUIP_124",
+                     "entity_type": "Robot",
+                     "hierarchy_path": "/EnterpriseA/SITE_A/LINE_002/",
+                     "entity_tags": ["nickname_for_EQUIP_124", "RB2"],
+                     "parent_entity": None,
+                     "linked_entities": []
+                 },
+                 "version_metadata": {
+                     "version_id": "R1.0",
+                     "version_tags": ["prototype"],
+                     "version_date": "2024/05/23"
+                 },
+                 "other_metadata": {}
+             }
+
+         updated_document = SchemaDocumentUpdater.update_document_metadata(document, meta)
+         return updated_document
+
+     @staticmethod
+     def create_test_document(index: int, type: str) -> Document:
+         """
+         Creates a test document with sample content and metadata.
+
+         The content and metadata are generated based on the specified type ("Reg Wheel" or "Robot").
+
+         Args:
+             index (int): An index number to make the document unique.
+             type (str): The type of test document to create ("Reg Wheel" or "Robot").
+
+         Returns:
+             A new test document.
+         """
+         meta = {
+             "title": f"{type} Maintenance Record {index}",
+             "source_url": f"http://example.com/{type.lower()}_maintenance_{index}",
+             "file_type": "xlsx",
+             "page": index
+         }
+
+         if type == "Reg Wheel":
+             page_content = ["| Date | Maintenance Performed | Technician | Notes |",
+                             "|------------|-----------------------|------------|----------------------------|"]
+             for _ in range(random.randint(10, 20)):
+                 day = random.randint(1, 28)
+                 maintenance_performed = random.choice(["Oil Change", "Belt Replacement", "Alignment Check", "General Inspection"])
+                 technician = random.choice(["John Doe", "Jane Smith", "Jim Brown"])
+                 notes = random.choice(["Changed oil and filter", "Replaced worn-out belt", "Checked and adjusted align", "No issues found"])
+                 page_content.append(f"| 2023-05-{day:02} | {maintenance_performed} | {technician} | {notes} |")
+             page_content = "\n".join(page_content)
+         else:  # Robot
+             technicians = ["Bob", "Tim", "Alice"]
+             page_content = ["| Date | Maintenance Performed | Technician | Notes |",
+                             "|------------|-----------------------|------------|-------------------------------------|"]
+             for _ in range(random.randint(10, 20)):
+                 day = random.randint(1, 28)
+                 maintenance_performed = random.choice(["Sensor Calibration", "Actuator Testing", "Software Update", "Battery Replacement"])
+                 technician = random.choice(technicians)
+                 notes = random.choice(["Calibrated all sensors", "Tested and replaced faulty actuators", "Updated robot software to v2.1", "Replaced old battery with new one"])
+                 page_content.append(f"| 2023-05-{day:02} | {maintenance_performed} | {technician} | {notes} |")
+             page_content = "\n".join(page_content)
+
+         document = Document(page_content=page_content, metadata=meta)
+         return SchemaDocumentUpdater.add_sample_metadata(document, type)
+
+     @staticmethod
+     def create_test_documents(doc_num: int) -> List[Document]:
+         """
+         Creates a list of test documents.
+
+         It generates a mix of "Reg Wheel" and "Robot" documents.
+
+         Args:
+             doc_num (int): The total number of documents to create.
+
+         Returns:
+             List[Document]: A list of created test documents.
+         """
+         reg_wheel_docs_num = doc_num * 2 // 3
+         robot_docs_num = doc_num - reg_wheel_docs_num
+
+         documents = [SchemaDocumentUpdater.create_test_document(i + 1, "Reg Wheel") for i in range(reg_wheel_docs_num)]
+         documents += [SchemaDocumentUpdater.create_test_document(i + 1 + reg_wheel_docs_num, "Robot") for i in range(robot_docs_num)]
+
+         return documents
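A short sketch of how these test-data helpers compose; all values follow from the generators above (note that `update_document_metadata` replaces top-level keys wholesale rather than merging nested dictionaries):

```python
from crewplus.utils.schema_document_updater import SchemaDocumentUpdater

# Ten synthetic maintenance records: the first 6 are "Reg Wheel", the remaining 4 "Robot".
docs = SchemaDocumentUpdater.create_test_documents(10)

# Replace the top-level version_metadata block of the first document, then drop its keywords.
doc = SchemaDocumentUpdater.update_document_metadata(
    docs[0],
    {"version_metadata": {"version_id": "V2.1", "version_tags": ["global"], "version_date": "2024/06/01"}},
)
doc = SchemaDocumentUpdater.delete_document_metadata(doc, ["keywords"])
print(doc.metadata["plant_metadata"]["entity_id"])  # "EQUIP_123"
```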
@@ -0,0 +1,221 @@
+ from pymilvus import DataType, MilvusClient
+ import json
+ import logging
+ from typing import Any
+
+ class MilvusSchemaManager:
+     """
+     Manages Milvus/Zilliz collection schemas.
+
+     This class provides functionalities to create and validate collection schemas
+     and index parameters based on a JSON definition. It interacts with a
+     MilvusClient instance to perform these operations.
+     """
+     def __init__(self, client: MilvusClient, logger=None):
+         """
+         Initializes the MilvusSchemaManager.
+
+         Args:
+             client (MilvusClient): An instance of the Milvus client.
+             logger (logging.Logger, optional): A logger instance. If not provided,
+                 a default logger will be created. Defaults to None.
+         """
+         self.client = client
+         self.logger = logger or logging.getLogger(__name__)
+
+     def bind_client(self, client: MilvusClient):
+         """
+         Binds a new MilvusClient instance to the manager.
+
+         Args:
+             client (MilvusClient): The Milvus client instance to use.
+         """
+         self.client = client
+
+     def _add_array_field(self, schema, field_name, field_info):
+         """
+         Adds an ARRAY field to the schema based on field information.
+
+         This is a helper method to handle the specific logic for creating ARRAY fields.
+
+         Args:
+             schema: The Milvus schema object to add the field to.
+             field_name (str): The name of the field.
+             field_info (dict): A dictionary containing information about the field,
+                 such as element type and max capacity.
+
+         Raises:
+             ValueError: If required information like 'element' or 'max_capacity'
+                 is missing from field_info, or if an unsupported element
+                 type is specified.
+         """
+         element_type_str = field_info.get("element")
+         if not element_type_str:
+             raise ValueError(f"Array field '{field_name}' must have 'element' type specified.")
+
+         element_type = None
+         if element_type_str in ["STRING", "VARCHAR", "TEXT"]:
+             element_type = DataType.VARCHAR
+         elif element_type_str == "INT64":
+             element_type = DataType.INT64
+         else:
+             raise ValueError(f"Unsupported element type '{element_type_str}' for ARRAY field '{field_name}'.")
+
+         max_capacity = field_info.get("max_capacity")
+         if max_capacity is None:
+             raise ValueError(f"Array field '{field_name}' must have 'max_capacity' specified.")
+
+         nullable = field_info.get('nullable', True)
+
+         field_args = {
+             "field_name": field_name,
+             "datatype": DataType.ARRAY,
+             "element_type": element_type,
+             "max_capacity": int(max_capacity),
+             "nullable": nullable,
+         }
+
+         if element_type == DataType.VARCHAR:
+             max_length = field_info.get('max_length', 65535)
+             field_args["max_length"] = int(max_length)
+
+         schema.add_field(**field_args)
+
+     def create_collection_schema(self, json_schema: str):
+         """
+         Creates a Milvus collection schema from a JSON string.
+
+         Args:
+             json_schema (str): A JSON string defining the schema.
+
+         Returns:
+             A Milvus schema object.
+
+         Raises:
+             ValueError: If an unknown field type is encountered in the schema.
+         """
+         schema_data = json.loads(json_schema)
+         fields = schema_data['node_types']['Document']['properties']
+
+         schema = self.client.create_schema(auto_id=False, enable_dynamic_fields=True)
+         for field_name, field_info in fields.items():
+             field_type = field_info['type']
+             if field_type == "STRING" or field_type == "VARCHAR" or field_type == "TEXT":
+                 max_length = field_info.get('max_length', 256)  # Default max_length if not provided
+                 nullable = field_info.get('nullable', False)  # Default nullable if not provided
+                 schema.add_field(field_name=field_name, datatype=DataType.VARCHAR, max_length=max_length, nullable=nullable)
+             elif field_type == "JSON":
+                 nullable = field_info.get('nullable', True)
+                 schema.add_field(field_name=field_name, datatype=DataType.JSON, nullable=nullable)
+             elif field_type == "INT64":
+                 is_primary = field_info.get('is_primary', False)
+                 auto_id = field_info.get('auto_id', False)
+                 nullable = field_info.get('nullable', False)
+                 schema.add_field(field_name=field_name, datatype=DataType.INT64, is_primary=is_primary, auto_id=auto_id, nullable=nullable)
+             elif field_type == "ARRAY":
+                 self._add_array_field(schema, field_name, field_info)
+             elif field_type == "FLOAT_VECTOR":
+                 dim = field_info.get('dim', 1536)  # Default dimension if not provided
+                 schema.add_field(field_name=field_name, datatype=DataType.FLOAT_VECTOR, dim=dim)
+             else:
+                 raise ValueError(f"Unknown field type: {field_type}")
+
+         return schema
+
+     def create_index_params(self, json_schema: str):
+         """
+         Creates index parameters from a JSON schema string.
+
+         This method defines indexes based on the 'indexes' section of the schema
+         and automatically creates an 'AUTOINDEX' for any FLOAT_VECTOR fields.
+
+         Args:
+             json_schema (str): A JSON string defining the schema and indexes.
+
+         Returns:
+             Milvus index parameters object.
+         """
+         schema_data = json.loads(json_schema)
+         fields = schema_data['node_types']['Document']['properties']
+
+         index_params = self.client.prepare_index_params()
+
+         # Check if 'indexes' key exists
+         if 'indexes' in schema_data['node_types']['Document']:
+             indexes = schema_data['node_types']['Document']['indexes']
+             for index_name, index_details in indexes.items():
+                 field_name = index_details['fieldname']
+                 index_type = index_details['type']
+                 params = index_details['params']
+                 index_params.add_index(
+                     field_name=field_name,
+                     index_type=index_type,
+                     index_name=index_name,
+                     params=params
+                 )
+
+         # Automatic indexing for FLOAT_VECTOR fields
+         for field_name, field_info in fields.items():
+             if field_info['type'] == "FLOAT_VECTOR":
+                 index_params.add_index(
+                     field_name=field_name,
+                     index_name="vector",
+                     index_type="AUTOINDEX",
+                     metric_type="L2"
+                 )
+
+         return index_params
+
+     def create_collection(self, collection_name: str, json_schema: str):
+         """
+         Creates a new collection in Milvus.
+
+         This method orchestrates the creation of the schema and index parameters
+         before creating the collection itself.
+
+         Args:
+             collection_name (str): The name for the new collection.
+             json_schema (str): The JSON string defining the collection's schema
+                 and indexes.
+         """
+         schema = self.create_collection_schema(json_schema)
+         index_params = self.create_index_params(json_schema)
+
+         self.client.create_collection(
+             collection_name=collection_name,
+             schema=schema,
+             index_params=index_params,
+             enable_dynamic_fields=True  # we need to enable dynamic fields for schema updates
+         )
+
+     def validate_schema(self, json_schema: str) -> bool:
+         """
+         Validates the given schema by attempting to create a collection schema and index params.
+
+         Args:
+             json_schema (str): The schema JSON string to validate.
+
+         Returns:
+             bool: True if the schema is valid, False if any exceptions are caught.
+         """
+         try:
+             self.create_collection_schema(json_schema)
+             self.create_index_params(json_schema)
+             return True
+         except Exception as e:
+             self.logger.error(f"Schema validation failed: {e}")
+             return False
+
+
+ class ZillizSchemaManager(MilvusSchemaManager):
+     def __init__(self, *args: Any, **kwargs: Any) -> None:
+         import warnings
+
+         warnings.warn(
+             "The ZillizSchemaManager class will be deprecated in the future. "
+             "Please use the MilvusSchemaManager class instead.",
+             DeprecationWarning,
+             stacklevel=2,
+         )
+         super().__init__(*args, **kwargs)
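To make the JSON layout this manager parses concrete, here is a hedged sketch of a schema document and collection creation. The field names, index type, and connection URI are illustrative, but the `node_types.Document.properties` / `indexes` structure mirrors what `create_collection_schema` and `create_index_params` read above:

```python
import json

from pymilvus import MilvusClient
from crewplus.vectorstores.milvus.milvus_schema_manager import MilvusSchemaManager

# Illustrative schema: one primary key, a text field, a vector, and a few metadata fields.
schema_json = json.dumps({
    "node_types": {
        "Document": {
            "properties": {
                "pk": {"type": "INT64", "is_primary": True, "auto_id": True},
                "text": {"type": "VARCHAR", "max_length": 65535},
                "vector": {"type": "FLOAT_VECTOR", "dim": 1536},
                "keywords": {"type": "VARCHAR", "max_length": 256, "nullable": True},
                "plant_metadata": {"type": "JSON"},
                "entity_tags": {"type": "ARRAY", "element": "VARCHAR", "max_capacity": 16},
            },
            "indexes": {
                # Optional scalar index; index name/type here are assumptions.
                "keywords_idx": {"fieldname": "keywords", "type": "INVERTED", "params": {}},
            },
        }
    }
})

client = MilvusClient(uri="http://localhost:19530")  # placeholder connection
manager = MilvusSchemaManager(client)
if manager.validate_schema(schema_json):
    manager.create_collection("maintenance_records", schema_json)
```

The FLOAT_VECTOR field does not need an entry under `indexes`; `create_index_params` adds an AUTOINDEX with L2 metric for it automatically.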
@@ -0,0 +1,253 @@
+ from typing import List, Optional
+ import logging
+ import json
+
+ from pymilvus import DataType
+ from langchain_milvus import Milvus
+ from langchain_core.documents import Document
+ from crewplus.utils.schema_document_updater import SchemaDocumentUpdater
+ from crewplus.utils.schema_action import Action
+ from .milvus_schema_manager import MilvusSchemaManager
+
+
+ class SchemaMilvus(Milvus):
+     """
+     SchemaMilvus is a subclass of the Milvus class from langchain_milvus. This class is responsible for updating metadata of documents in a Milvus vector store.
+
+     Attributes:
+         embedding_function: Embedding function used by the Milvus vector store.
+         collection_name: Name of the collection in the Milvus vector store.
+         connection_args: Connection arguments for the Milvus vector store.
+         index_params: Index parameters for the Milvus vector store.
+         auto_id: Flag to specify if auto ID generation is enabled.
+         primary_field: The primary field of the collection.
+         vector_field: The vector field of the collection.
+         consistency_level: The consistency level for the Milvus vector store.
+         collection_schema: Schema JSON string associated with the existing Milvus collection.
+     """
+     def __init__(
+         self,
+         embedding_function,
+         collection_name,
+         connection_args,
+         index_params=None,
+         auto_id=True,
+         primary_field="pk",
+         text_field: str = "text",
+         vector_field=["vector"],
+         consistency_level="Session",
+         logger: Optional[logging.Logger] = None
+     ):
+         """
+         Initializes the SchemaMilvus class with the provided parameters.
+
+         Args:
+             embedding_function: Embedding function used by the Milvus vector store.
+             collection_name: Name of the collection in the Milvus vector store.
+             connection_args: Connection arguments for the Milvus vector store.
+             index_params: Index parameters for the Milvus vector store.
+             auto_id: Flag to specify if auto ID generation is enabled.
+             primary_field: The primary field of the collection.
+             text_field: The text field of the collection.
+             vector_field: The vector field of the collection.
+             consistency_level: The consistency level for the Milvus vector store.
+             logger: Optional logger instance. If not provided, a default logger is created.
+         """
+         super().__init__(
+             embedding_function=embedding_function,
+             collection_name=collection_name,
+             connection_args=connection_args,
+             index_params=index_params,
+             auto_id=auto_id,
+             primary_field=primary_field,
+             text_field=text_field,
+             vector_field=vector_field,
+             consistency_level=consistency_level
+         )
+         self.logger = logger or logging.getLogger(__name__)
+         self.collection_schema = None
+         self.schema_manager = MilvusSchemaManager(client=self.client)
+
+     def set_schema(self, schema: str):
+         """
+         Sets the collection schema.
+
+         Args:
+             schema: The schema JSON string.
+         """
+         self.collection_schema = schema
+
+     def get_fields(self, collection_name: Optional[str] = None) -> Optional[List[str]]:
+         """
+         Retrieves and returns the fields from the collection schema.
+
+         Args:
+             collection_name: The name of the collection to describe. If None, use self.collection_name.
+
+         Returns:
+             List[str] | None: The list of field names from the collection schema (excluding vector fields), or None if collection_name is not provided or an error occurs.
+         """
+         if collection_name is None:
+             collection_name = self.collection_name
+         if collection_name is None:
+             return None
+
+         try:
+             schema = self.client.describe_collection(collection_name)
+             fields = [field["name"] for field in schema["fields"] if field["type"] != DataType.FLOAT_VECTOR]
+             return fields
+         except Exception as e:
+             self.logger.warning(f"Failed to retrieve schema fields: {e}")
+             return None
+
+     def create_collection(self) -> bool:
+         """
+         Validates the schema and creates the collection using the MilvusSchemaManager.
+
+         Returns:
+             bool: True if the collection is successfully created, False otherwise.
+         """
+         if self.collection_schema is None:
+             self.logger.error("Collection schema is not set. Please set a schema using set_schema().")
+             return False
+
+         self.schema_manager.bind_client(self.client)
+         if not self.schema_manager.validate_schema(self.collection_schema):
+             self.logger.error("Failed to validate schema")
+             return False
+         try:
+             self.schema_manager.create_collection(self.collection_name, self.collection_schema)
+             self.logger.info(f"Collection {self.collection_name} created successfully")
+
+             return True
+         except Exception as e:
+             self.logger.error(f"Failed to create collection: {e}")
+             return False
+
+     def drop_collection(self, collection_name: Optional[str] = None) -> bool:
+         """
+         Drops the collection using the Milvus client.
+
+         Args:
+             collection_name: The name of the collection to drop. If None, use self.collection_name.
+
+         Returns:
+             bool: True if the collection is successfully dropped, False otherwise.
+         """
+         if collection_name is None:
+             collection_name = self.collection_name
+
+         try:
+             self.client.drop_collection(collection_name)
+             self.logger.info(f"Collection {collection_name} dropped successfully")
+             return True
+         except Exception as e:
+             self.logger.error(f"Failed to drop collection {collection_name}: {e}")
+             return False
+
+     def _handle_upsert(self, doc: Document, metadata_dict: dict) -> Document:
+         """
+         Handles the UPSERT action for a single document by merging metadata.
+         """
+         existing_metadata = doc.metadata
+         for key, value in metadata_dict.items():
+             # Skip primary key and text fields to prevent modification.
+             if key in [self.primary_field, self.text_field]:
+                 continue
+
+             if isinstance(value, dict):
+                 # If the new value is a dictionary, handle nested updates.
+                 if key not in existing_metadata or not isinstance(existing_metadata.get(key), dict):
+                     # If the key doesn't exist or its value is not a dict, replace it.
+                     existing_metadata[key] = value
+                 else:
+                     # If both are dictionaries, recursively update the nested fields.
+                     for sub_key, sub_value in value.items():
+                         if isinstance(sub_value, dict) and sub_key in existing_metadata[key] and isinstance(existing_metadata[key].get(sub_key), dict):
+                             existing_metadata[key][sub_key].update(sub_value)
+                         else:
+                             existing_metadata[key][sub_key] = sub_value
+             else:
+                 # For non-dictionary values, simply update or add the field.
+                 existing_metadata[key] = value
+
+         doc.metadata = existing_metadata
+         return doc
+
+     def update_documents_metadata(self, expr: str, metadata: str, action: Action = Action.UPSERT) -> List[Document]:
+         """
+         Updates the metadata of documents in the Milvus vector store based on the provided expression.
+
+         Args:
+             expr: Expression to filter the target documents.
+             metadata: New metadata to update the documents with, as a JSON string.
+             action: The metadata action to apply (UPSERT, DELETE, UPDATE, or INSERT). Defaults to Action.UPSERT.
+
+         Returns:
+             List of updated documents.
+         """
+         try:
+             metadata_dict = json.loads(metadata)
+         except json.JSONDecodeError:
+             raise ValueError("Invalid JSON string for metadata")
+
+         # Retrieve documents that match the filter expression.
+         fields = self.get_fields()
+         documents = self.search_by_metadata(expr, fields=fields, limit=5000)
+
+         updated_documents = []
+         for doc in documents:
+             # Preserve the original primary key and text values.
+             pk_value = doc.metadata.get(self.primary_field)  # defaults to "pk"
+             text_value = doc.metadata.get(self.text_field)
+
+             # Apply the specified action to update the document's metadata.
+             if action == Action.UPSERT:
+                 doc = self._handle_upsert(doc, metadata_dict)
+             elif action == Action.DELETE:
+                 keys_to_delete = metadata_dict.keys()
+                 doc = SchemaDocumentUpdater.delete_document_metadata(doc, list(keys_to_delete))
+             elif action == Action.UPDATE:
+                 existing_metadata = doc.metadata
+                 update_dict = {}
+                 for key, value in metadata_dict.items():
+                     if key in existing_metadata:
+                         if isinstance(value, dict) and isinstance(existing_metadata[key], dict):
+                             merged = existing_metadata[key].copy()
+                             for sub_key, sub_value in value.items():
+                                 if sub_key in merged:
+                                     merged[sub_key] = sub_value
+                             update_dict[key] = merged
+                         else:
+                             update_dict[key] = value
+                 doc = SchemaDocumentUpdater.update_document_metadata(doc, update_dict)
+             elif action == Action.INSERT:
+                 existing_metadata = doc.metadata
+                 for key, value in metadata_dict.items():
+                     if key in ['pk', 'text']:
+                         continue
+
+                     if isinstance(value, dict) and key in existing_metadata and isinstance(existing_metadata[key], dict):
+                         existing_metadata[key] = {}
+                         existing_metadata[key] = value
+                     else:
+                         existing_metadata[key] = value
+                 doc.metadata = existing_metadata
+
+             # Restore the primary key and text values to ensure they are not lost.
+             if pk_value is not None:
+                 doc.metadata[self.primary_field] = pk_value
+             if text_value is not None:
+                 doc.metadata[self.text_field] = text_value
+
+             updated_documents.append(doc)
+
+         # Extract the primary keys for the upsert operation.
+         updated_ids = [doc.metadata[self.primary_field] for doc in updated_documents]
+
+         # Remove primary key and text from metadata before upserting,
+         # as they are handled separately by the vector store.
+         for doc in updated_documents:
+             doc.metadata.pop(self.primary_field, None)
+             doc.metadata.pop(self.text_field, None)
+
+         # Perform the upsert operation to update the documents in the collection.
+         self.upsert(ids=updated_ids, documents=updated_documents)
+
+         return updated_documents
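A hedged end-to-end sketch of the new metadata-update path; the connection details, filter expression, and embedding deployment are placeholders:

```python
import json

from langchain_openai import AzureOpenAIEmbeddings
from crewplus.utils.schema_action import Action
from crewplus.vectorstores.milvus.schema_milvus import SchemaMilvus

embeddings = AzureOpenAIEmbeddings(
    azure_deployment="text-embedding-3-small",       # placeholder deployment
    openai_api_version="2023-05-15",
    api_key="YOUR_AZURE_OPENAI_KEY",
    azure_endpoint="https://YOUR-RESOURCE.openai.azure.com",
)

store = SchemaMilvus(
    embedding_function=embeddings,
    collection_name="maintenance_records",
    connection_args={"uri": "http://localhost:19530"},  # placeholder connection
)

# Merge a nested version bump into every document from one source URL. UPSERT keeps
# unmatched existing fields; see the Action enum above for the other semantics.
updated = store.update_documents_metadata(
    expr='source_url == "http://example.com/robot_maintenance_1"',
    metadata=json.dumps({"version_metadata": {"version_id": "V2.1"}}),
    action=Action.UPSERT,
)
print(f"Updated {len(updated)} documents")
```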
@@ -0,0 +1,342 @@
+ # -*- coding: utf-8 -*-
+ # @Author: Cursor
+ # @Date: 2025-02-12
+ # @Last Modified by: Gemini
+ # @Last Modified time: 2025-07-01
+
+ import logging
+ from typing import List, Dict, Union, Optional
+ from langchain_milvus import Zilliz
+ from langchain_core.embeddings import Embeddings
+ from langchain_openai import AzureOpenAIEmbeddings
+ from pymilvus import MilvusClient
+
+ from crewplus.services.init_services import get_model_balancer
+ from crewplus.vectorstores.milvus.schema_milvus import SchemaMilvus
+
+ class VDBService(object):
+     """
+     A service to manage connections to Milvus/Zilliz vector databases and embedding models.
+
+     This service centralizes the configuration and instantiation of the Milvus client
+     and provides helper methods to get embedding functions and vector store instances.
+
+     Args:
+         settings (dict): A dictionary containing configuration for the vector store
+             and embedding models.
+         schema (str, optional): The schema definition for a collection. Defaults to None.
+         logger (logging.Logger, optional): An optional logger instance. Defaults to None.
+
+     Raises:
+         ValueError: If required configurations are missing from the settings dictionary.
+         NotImplementedError: If an unsupported provider is specified.
+         RuntimeError: If the MilvusClient fails to initialize after a retry.
+
+     Example:
+         >>> settings = {
+         ...     "embedder": {
+         ...         "provider": "azure-openai",
+         ...         "config": {
+         ...             "model": "text-embedding-3-small",
+         ...             "api_version": "2023-05-15",
+         ...             "api_key": "YOUR_AZURE_OPENAI_KEY",
+         ...             "openai_base_url": "YOUR_AZURE_OPENAI_ENDPOINT",
+         ...             "embedding_dims": 1536
+         ...         }
+         ...     },
+         ...     "vector_store": {
+         ...         "provider": "milvus",
+         ...         "config": {
+         ...             "host": "localhost",
+         ...             "port": 19530,
+         ...             "user": "root",
+         ...             "password": "password",
+         ...             "db_name": "default"
+         ...         }
+         ...     },
+         ...     "index_params": {
+         ...         "metric_type": "L2",
+         ...         "index_type": "AUTOINDEX",
+         ...         "params": {}
+         ...     }
+         ... }
+         >>> vdb_service = VDBService(settings=settings)
+         >>> # Get the raw Milvus client
+         >>> client = vdb_service.get_vector_client()
+         >>> print(client.list_collections())
+         >>> # Get an embedding function
+         >>> embeddings = vdb_service.get_embeddings()
+         >>> print(embeddings)
+         >>> # Get a LangChain vector store instance (will be cached)
+         >>> vector_store = vdb_service.get_vector_store(collection_name="my_collection")
+         >>> print(vector_store)
+         >>> same_vector_store = vdb_service.get_vector_store(collection_name="my_collection")
+         >>> assert vector_store is same_vector_store
+     """
+     _client: MilvusClient
+     _instances: Dict[str, Zilliz] = {}
+
+     schema: str
+     embedding_function: Embeddings
+     index_params: dict
+     connection_args: dict
+     settings: dict
+
+     def __init__(self, settings: dict, schema: str = None, logger: logging.Logger = None):
+         """
+         Initializes the VDBService.
+
+         Args:
+             settings (dict): Configuration dictionary for the service.
+             schema (str, optional): Default schema for new collections. Defaults to None.
+             logger (logging.Logger, optional): Logger instance. Defaults to None.
+         """
+         self.logger = logger or logging.getLogger(__name__)
+         self.settings = settings
+
+         vector_store_settings = self.settings.get("vector_store")
+         if not vector_store_settings:
+             msg = "'vector_store' not found in settings"
+             self.logger.error(msg)
+             raise ValueError(msg)
+
+         provider = vector_store_settings.get("provider")
+         self.connection_args = vector_store_settings.get("config")
+
+         if not provider or not self.connection_args:
+             msg = "'provider' or 'config' not found in 'vector_store' settings"
+             self.logger.error(msg)
+             raise ValueError(msg)
+
+         self._client = self._initialize_milvus_client(provider)
+
+         self.schema = schema
+         self.index_params = self.settings.get("index_params")
+
+         self.logger.info("VDBService initialized successfully")
+
+     def _initialize_milvus_client(self, provider: str) -> MilvusClient:
+         """
+         Initializes and returns a MilvusClient with a retry mechanism.
+         """
+         client_args = {}
+         if provider == "milvus":
+             host = self.connection_args.get("host", "localhost")
+             port = self.connection_args.get("port", 19530)
+
+             # Use https for remote hosts, and http for local connections.
+             scheme = "https" if host not in ["localhost", "127.0.0.1"] else "http"
+             uri = f"{scheme}://{host}:{port}"
+
+             client_args = {
+                 "uri": uri,
+                 "user": self.connection_args.get("user"),
+                 "password": self.connection_args.get("password"),
+                 "db_name": self.connection_args.get("db_name")
+             }
+             # Filter out None values to use client defaults
+             client_args = {k: v for k, v in client_args.items() if v is not None}
+
+         elif provider == "zilliz":
+             client_args = self.connection_args
+         else:
+             self.logger.error(f"Unsupported vector store provider: {provider}")
+             raise NotImplementedError(f"Vector store provider '{provider}' is not supported.")
+
+         try:
+             # First attempt to connect
+             return MilvusClient(**client_args)
+         except Exception as e:
+             self.logger.error(f"Failed to initialize MilvusClient, trying again. Error: {e}")
+             # Second attempt after failure
+             try:
+                 return MilvusClient(**client_args)
+             except Exception as e_retry:
+                 self.logger.error(f"Failed to initialize MilvusClient on retry. Final error: {e_retry}")
+                 raise RuntimeError(f"Could not initialize MilvusClient after retry: {e_retry}")
+
+     def get_vector_client(self) -> MilvusClient:
+         """
+         Returns the active MilvusClient instance.
+
+         Returns:
+             MilvusClient: The initialized client for interacting with the vector database.
+         """
+         return self._client
+
+     def get_embeddings(self, from_model_balancer: bool = False, model_type: Optional[str] = "embedding-large") -> Embeddings:
+         """
+         Gets an embedding function, either from the model balancer or directly from settings.
+
+         Args:
+             from_model_balancer (bool): If True, uses the central model balancer service.
+                 If False, creates a new instance based on 'embedder' settings.
+             model_type (str, optional): The type of model to get from the balancer. Defaults to "embedding-large".
+
+         Returns:
+             Embeddings: An instance of a LangChain embedding model.
+         """
+         if from_model_balancer:
+             model_balancer = get_model_balancer()
+             return model_balancer.get_model(model_type=model_type)
+
+         embedder_config = self.settings.get("embedder")
+         if not embedder_config:
+             self.logger.error("'embedder' configuration not found in settings.")
+             raise ValueError("'embedder' configuration not found in settings.")
+
+         provider = embedder_config.get("provider")
+         config = embedder_config.get("config")
+
+         if not provider or not config:
+             self.logger.error("Embedder 'provider' or 'config' not found in settings.")
+             raise ValueError("Embedder 'provider' or 'config' not found in settings.")
+
+         if provider == "azure-openai":
+             # Map the settings config to AzureOpenAIEmbeddings parameters.
+             azure_config = {
+                 "azure_deployment": config.get("model"),
+                 "openai_api_version": config.get("api_version"),
+                 "api_key": config.get("api_key"),
+                 "azure_endpoint": config.get("openai_base_url"),
+                 "dimensions": config.get("embedding_dims"),
+                 "chunk_size": config.get("chunk_size", 16),
+                 "request_timeout": config.get("request_timeout", 60),
+                 "max_retries": config.get("max_retries", 2)
+             }
+             # Filter out None values to use client defaults.
+             azure_config = {k: v for k, v in azure_config.items() if v is not None}
+
+             return AzureOpenAIEmbeddings(**azure_config)
+         else:
+             self.logger.error(f"Unsupported embedding provider: {provider}")
+             raise NotImplementedError(f"Embedding provider '{provider}' is not supported yet.")
+
+     def get_vector_store(self, collection_name: str, embeddings: Embeddings = None, metric_type: str = "L2") -> Zilliz:
+         """
+         Gets a vector store instance, creating it if it doesn't exist for the collection.
+
+         This method caches instances by collection name to avoid re-instantiation.
+
+         Args:
+             collection_name (str): The name of the collection in the vector database.
+             embeddings (Embeddings, optional): An embedding model instance. If None, one is created.
+             metric_type (str): The distance metric for the index. Defaults to "L2".
+
+         Returns:
+             Zilliz: A LangChain Zilliz instance, which is compatible with both Zilliz and Milvus.
+         """
+         if not collection_name:
+             self.logger.error("get_vector_store called with no collection_name.")
+             raise ValueError("collection_name must be provided.")
+
+         # Return the cached instance if it already exists.
+         if collection_name in self._instances:
+             self.logger.info(f"Returning existing vector store instance for collection: {collection_name}")
+             return self._instances[collection_name]
+
+         self.logger.info(f"Creating new vector store instance for collection: {collection_name}")
+         if embeddings is None:
+             embeddings = self.get_embeddings()
+
+         index_params = self.index_params or {
+             "metric_type": metric_type,
+             "index_type": "AUTOINDEX",
+             "params": {}
+         }
+
+         vdb = Zilliz(
+             embedding_function=embeddings,
+             collection_name=collection_name,
+             connection_args=self.connection_args,
+             index_params=index_params
+         )
+
+         # Cache the newly created instance.
+         self._instances[collection_name] = vdb
+
+         return vdb
+
+     def delete_old_indexes(self, url: str = None, vdb: Zilliz = None) -> None:
+         """Delete old indexes with the same source_url.
+
+         Args:
+             url (str): The source URL whose existing entries should be removed.
+             vdb (Zilliz): The vector store instance to delete from.
+         """
+         if url is None or vdb is None:
+             return
+
+         # Delete indexes of the same source_url
+         expr = "source in [\"" + url + "\"]"
+         pks = vdb.get_pks(expr)
+
+         # Delete entities by pks
+         if pks is not None and len(pks) > 0:
+             old_items = vdb.delete(pks)
+             self.logger.info("ingesting document -- delete old indexes -- " + str(old_items))
+
+     def delete_old_indexes_by_id(self, id: str = None, vdb: Zilliz = None) -> None:
+         """Delete old indexes with the same source_id.
+
+         Args:
+             id (str): The source ID whose existing entries should be removed.
+             vdb (Zilliz): The vector store instance to delete from.
+         """
+         self.logger.info(f"Delete old indexes of the same source_id: {id}")
+
+         if id is None or vdb is None:
+             return
+
+         # Delete indexes of the same source_id
+         expr = "source_id in [\"" + id + "\"]"
+         pks = vdb.get_pks(expr)
+
+         # Delete entities by pks
+         if pks is not None and len(pks) > 0:
+             old_items = vdb.delete(pks)
+             self.logger.info("ingesting document -- delete old indexes -- " + str(old_items))
+
+     def drop_collection(self, collection_name: str) -> None:
+         """
+         Deletes a collection from the vector database and removes it from the cache.
+
+         Args:
+             collection_name (str): The name of the collection to drop.
+
+         Raises:
+             ValueError: If collection_name is not provided.
+             RuntimeError: If the operation fails on the database side.
+         """
+         if not collection_name:
+             self.logger.error("drop_collection called without a collection_name.")
+             raise ValueError("collection_name must be provided.")
+
+         self.logger.info(f"Attempting to drop collection: {collection_name}")
+
+         try:
+             client = self.get_vector_client()
+             client.drop_collection(collection_name=collection_name)
+             self.logger.info(f"Successfully dropped collection: {collection_name}")
+         except Exception as e:
+             self.logger.error(f"Failed to drop collection '{collection_name}': {e}")
+             raise RuntimeError(f"An error occurred while dropping collection '{collection_name}'.") from e
+         finally:
+             # Whether successful or not, remove the stale instance from the cache.
+             if collection_name in self._instances:
+                 del self._instances[collection_name]
+                 self.logger.info(f"Removed '{collection_name}' from instance cache.")
+
+     def delete_data_by_filter(self, collection_name: str = None, filter: str = None) -> None:
+         """Delete data from a collection that matches a filter expression.
+
+         Args:
+             collection_name (str): The name of the target collection.
+             filter (str): The Milvus filter expression selecting the entities to delete.
+         """
+         self.logger.info(f"Deleting data by filter from collection: {collection_name}")
+
+         if collection_name is None or filter is None:
+             raise ValueError("Both collection_name and filter must be provided.")
+
+         try:
+             client = self.get_vector_client()
+             client.delete(collection_name=collection_name, filter=filter)
+         except Exception as e:
+             raise RuntimeError(f"Failed to delete data from collection '{collection_name}': {str(e)}") from e
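And a short sketch of the cleanup helpers, reusing the `settings` dictionary from the class docstring above; the collection name, URL, and ID values are placeholders:

```python
from crewplus.vectorstores.milvus.vdb_service import VDBService

vdb_service = VDBService(settings=settings)  # settings as in the docstring example

# Before re-ingesting a document, drop any stale vectors for the same source.
store = vdb_service.get_vector_store(collection_name="my_collection")
vdb_service.delete_old_indexes(url="http://example.com/doc-1", vdb=store)
vdb_service.delete_old_indexes_by_id(id="doc-1", vdb=store)

# Or remove entities directly with a Milvus filter expression.
vdb_service.delete_data_by_filter(collection_name="my_collection", filter='source_id in ["doc-1"]')
```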
@@ -6,7 +6,7 @@ build-backend = "pdm.backend"

  [project]
  name = "crewplus"
- version = "0.1.6"
+ version = "0.2.1"
  description = "Base services for CrewPlus AI applications"
  authors = [
      { name = "Tim Liu", email = "tim@opsmateai.com" },
@@ -20,6 +20,7 @@ dependencies = [
      "mkdocs (>=1.6.1,<2.0.0)",
      "mkdocs-material (>=9.6.14,<10.0.0)",
      "mkdocstrings-python (>=1.16.12,<2.0.0)",
+     "langchain-milvus (>=0.2.1,<0.3.0)",
  ]

  [project.license]
@@ -1,77 +0,0 @@
- {
-     "models": [
-         {
-             "id": 1,
-             "provider": "azure-openai",
-             "type": "inference",
-             "deployment_name": "gpt-o3mini-eastus2-RPM25",
-             "api_version": "2024-12-01-preview",
-             "api_base": "https://crewplus-eastus2.openai.azure.com",
-             "api_key": "c67cb5d0d8ae4aef81d7f42aeae274b6"
-         },
-         {
-             "id": 2,
-             "provider": "azure-openai",
-             "type": "ingestion",
-             "deployment_name": "gpt-4o",
-             "api_version": "2025-01-01-preview",
-             "api_base": "https://crewplus-eastus2.openai.azure.com",
-             "api_key": "c67cb5d0d8ae4aef81d7f42aeae274b6",
-             "temperature": 0.0
-         },
-         {
-             "id": 3,
-             "provider": "azure-openai",
-             "type": "inference",
-             "deployment_name": "gpt-4.1",
-             "api_version": "2025-01-01-preview",
-             "api_base": "https://crewplus-eastus2.openai.azure.com",
-             "api_key": "c67cb5d0d8ae4aef81d7f42aeae274b6",
-             "temperature": 0.0
-         },
-         {
-             "id": 4,
-             "provider": "azure-openai",
-             "type": "ingestion",
-             "deployment_name": "cpai-gpt4o-westus",
-             "api_version": "2025-01-01-preview",
-             "api_base": "https://crewplus-westus.openai.azure.com",
-             "api_key": "b93bc4d2ef8e4298bd8390002922d084",
-             "temperature": 0.0
-         },
-         {
-             "id": 5,
-             "provider": "azure-openai-embeddings",
-             "type": "embedding",
-             "deployment_name": "cpai-text-embedding-ada-002-westus",
-             "api_version": "2024-02-01",
-             "api_base": "https://crewplus-westus.openai.azure.com",
-             "api_key": "b93bc4d2ef8e4298bd8390002922d084"
-         },
-         {
-             "id": 6,
-             "provider": "azure-openai-embeddings",
-             "type": "embedding",
-             "deployment_name": "cpai-text-embedding-3-large-eastus2",
-             "api_version": "1",
-             "api_base": "https://crewplus-eastus2.openai.azure.com",
-             "api_key": "c67cb5d0d8ae4aef81d7f42aeae274b6"
-         },
-         {
-             "id": 7,
-             "provider": "google-genai",
-             "type": "inference",
-             "deployment_name": "gemini-2.5-flash",
-             "api_key": "AIzaSyDkZbcGcV7SB6OyN4XkK_sF2mzO2E-nKQk",
-             "temperature": 0.0
-         },
-         {
-             "id": 8,
-             "provider": "google-genai",
-             "type": "ingestion",
-             "deployment_name": "gemini-2.5-pro",
-             "api_key": "AIzaSyDkZbcGcV7SB6OyN4XkK_sF2mzO2E-nKQk",
-             "temperature": 0.0
-         }
-     ]
- }