crewplus 0.1.6__py3-none-any.whl → 0.2.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- crewplus/services/init_services.py +16 -0
- crewplus/services/model_load_balancer.py +1 -1
- crewplus/utils/schema_action.py +7 -0
- crewplus/utils/schema_document_updater.py +173 -0
- crewplus/vectorstores/milvus/__init__.py +0 -0
- crewplus/vectorstores/milvus/milvus_schema_manager.py +221 -0
- crewplus/vectorstores/milvus/schema_milvus.py +253 -0
- crewplus/vectorstores/milvus/vdb_service.py +342 -0
- {crewplus-0.1.6.dist-info → crewplus-0.2.1.dist-info}/METADATA +7 -1
- crewplus-0.2.1.dist-info/RECORD +16 -0
- crewplus-0.1.6.dist-info/RECORD +0 -9
- {crewplus-0.1.6.dist-info → crewplus-0.2.1.dist-info}/WHEEL +0 -0
- {crewplus-0.1.6.dist-info → crewplus-0.2.1.dist-info}/entry_points.txt +0 -0
- {crewplus-0.1.6.dist-info → crewplus-0.2.1.dist-info}/licenses/LICENSE +0 -0
crewplus/services/init_services.py
@@ -0,0 +1,16 @@
+import os
+from crewplus.services.model_load_balancer import ModelLoadBalancer
+
+model_balancer = None
+
+def init_load_balancer():
+    global model_balancer
+    if model_balancer is None:
+        config_path = os.getenv("MODEL_CONFIG_PATH", "config/models_config.json")
+        model_balancer = ModelLoadBalancer(config_path)
+        model_balancer.load_config()  # Load initial configuration synchronously
+
+def get_model_balancer() -> ModelLoadBalancer:
+    if model_balancer is None:
+        raise RuntimeError("ModelLoadBalancer not initialized")
+    return model_balancer
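The new init_services module is a minimal module-level singleton for the load balancer. A usage sketch under stated assumptions (the env-var name and fallback path come from the code above; the call site itself is invented):

    from crewplus.services.init_services import init_load_balancer, get_model_balancer

    init_load_balancer()             # reads MODEL_CONFIG_PATH, falls back to config/models_config.json
    balancer = get_model_balancer()  # raises RuntimeError if init_load_balancer() was never called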
crewplus/services/model_load_balancer.py
@@ -81,7 +81,7 @@ class ModelLoadBalancer:
 
         Args:
             provider: The model provider (e.g., 'azure-openai', 'google-genai').
-            model_type: The type of model (e.g., 'inference', 'embedding').
+            model_type: The type of model (e.g., 'inference', 'embedding', 'embedding-large').
             deployment_name: The unique name for the model deployment.
 
         Returns:
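The only change to model_load_balancer.py is documentation: model_type now lists 'embedding-large' as a valid value. Judging from the get_model(model_type=...) call visible in vdb_service.py later in this diff, requesting the new type would presumably look like this (hypothetical deployment configuration assumed):

    balancer = get_model_balancer()
    embeddings = balancer.get_model(model_type="embedding-large")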
crewplus/utils/schema_action.py
@@ -0,0 +1,7 @@
+from enum import Enum
+
+class Action(Enum):
+    UPSERT = "upsert"  # Update existing fields; if a match is found, it updates, otherwise it inserts. Does not delete unmatched existing fields.
+    DELETE = "delete"  # Clear data from fields in the schema.
+    UPDATE = "update"  # Update only the matching original fields.
+    INSERT = "insert"  # Insert data, clearing the original fields before inserting new values.
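As the comments state, the four actions differ in how they treat fields that already exist. A tiny sketch of selecting one (Action is consumed by SchemaMilvus.update_documents_metadata, shown later in this diff):

    from crewplus.utils.schema_action import Action

    action = Action.UPSERT          # merge new keys in, keep unmatched existing fields
    assert action.value == "upsert"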
crewplus/utils/schema_document_updater.py
@@ -0,0 +1,173 @@
+from typing import List
+from langchain_core.documents import Document
+import random
+
+class SchemaDocumentUpdater:
+    """A utility class for updating and creating LangChain Documents with specific metadata schemas."""
+
+    @staticmethod
+    def update_document_metadata(document: Document, metadata: dict) -> Document:
+        """
+        Updates the metadata of a LangChain Document.
+
+        Args:
+            document (Document): The document to update.
+            metadata (dict): A dictionary containing the metadata to add or update.
+
+        Returns:
+            Document: The updated document with the new metadata.
+        """
+        metadata_updates = document.metadata
+
+        for key, value in metadata.items():
+            metadata_updates[key] = value
+
+        return Document(
+            page_content=document.page_content,
+            metadata=metadata_updates
+        )
+
+    @staticmethod
+    def delete_document_metadata(document: Document, keys_to_delete: List[str]) -> Document:
+        """
+        Deletes specified keys from the metadata of a LangChain Document.
+
+        Args:
+            document (Document): The document to update.
+            keys_to_delete (List[str]): A list of keys to delete from the metadata.
+
+        Returns:
+            Document: The updated document with the specified metadata keys removed.
+        """
+        metadata = document.metadata
+
+        for key in keys_to_delete:
+            if key in metadata:
+                del metadata[key]
+
+        return Document(
+            page_content=document.page_content,
+            metadata=metadata
+        )
+
+    @staticmethod
+    def add_sample_metadata(document: Document, type: str) -> Document:
+        """
+        Adds sample metadata to a document based on a specified type.
+
+        The metadata schema is tailored for either "Reg Wheel" or "Robot" types.
+
+        Args:
+            document (Document): The document to which sample metadata will be added.
+            type (str): The type of sample metadata to add ("Reg Wheel" or "Robot").
+
+        Returns:
+            Document: The document with added sample metadata.
+        """
+        if type == "Reg Wheel":
+            meta = {
+                "keywords": "Reg Wheel",
+                "plant_metadata": {
+                    "entity_id": "EQUIP_123",
+                    "entity_type": "Machine",
+                    "hierarchy_path": "/EnterpriseA/SITE_A/LINE_003/",
+                    "entity_tags": ["nickname_for_EQUIP_123", "PB3"],
+                    "parent_entity": None,
+                    "linked_entities": []
+                },
+                "version_metadata": {
+                    "version_id": "V2.0",
+                    "version_tags": ["global"],
+                    "version_date": "2024/05/23"
+                },
+                "other_metadata": {}
+            }
+        else:  # Robot
+            meta = {
+                "keywords": "Robot",
+                "plant_metadata": {
+                    "entity_id": "EQUIP_124",
+                    "entity_type": "Robot",
+                    "hierarchy_path": "/EnterpriseA/SITE_A/LINE_002/",
+                    "entity_tags": ["nickname_for_EQUIP_124", "RB2"],
+                    "parent_entity": None,
+                    "linked_entities": []
+                },
+                "version_metadata": {
+                    "version_id": "R1.0",
+                    "version_tags": ["prototype"],
+                    "version_date": "2024/05/23"
+                },
+                "other_metadata": {}
+            }
+
+        updated_document = SchemaDocumentUpdater.update_document_metadata(document, meta)
+        return updated_document
+
+    @staticmethod
+    def create_test_document(index: int, type: str) -> Document:
+        """
+        Creates a test document with sample content and metadata.
+
+        The content and metadata are generated based on the specified type ("Reg Wheel" or "Robot").
+
+        Args:
+            index (int): An index number to make the document unique.
+            type (str): The type of test document to create ("Reg Wheel" or "Robot").
+
+        Returns:
+            A new test document.
+        """
+        meta = {
+            "title": f"{type} Maintenance Record {index}",
+            "source_url": f"http://example.com/{type.lower()}_maintenance_{index}",
+            "file_type": "xlsx",
+            "page": index
+        }
+
+        if type == "Reg Wheel":
+            page_content = ["| Date | Maintenance Performed | Technician | Notes |",
+                            "|------------|-----------------------|------------|----------------------------|"]
+            for _ in range(random.randint(10, 20)):
+                day = random.randint(1, 28)
+                maintenance_performed = random.choice(["Oil Change", "Belt Replacement", "Alignment Check", "General Inspection"])
+                technician = random.choice(["John Doe", "Jane Smith", "Jim Brown"])
+                notes = random.choice(["Changed oil and filter", "Replaced worn-out belt", "Checked and adjusted align", "No issues found"])
+                page_content.append(f"| 2023-05-{day:02} | {maintenance_performed} | {technician} | {notes} |")
+            page_content = "\n".join(page_content)
+        else:  # Robot
+            technicians = ["Bob", "Tim", "Alice"]
+            page_content = ["| Date | Maintenance Performed | Technician | Notes |",
+                            "|------------|-----------------------|------------|-------------------------------------|"]
+            for _ in range(random.randint(10, 20)):
+                day = random.randint(1, 28)
+                maintenance_performed = random.choice(["Sensor Calibration", "Actuator Testing", "Software Update", "Battery Replacement"])
+                technician = random.choice(technicians)
+                notes = random.choice(["Calibrated all sensors", "Tested and replaced faulty actuators", "Updated robot software to v2.1", "Replaced old battery with new one"])
+                page_content.append(f"| 2023-05-{day:02} | {maintenance_performed} | {technician} | {notes} |")
+            page_content = "\n".join(page_content)
+
+        document = Document(page_content=page_content, metadata=meta)
+        return SchemaDocumentUpdater.add_sample_metadata(document, type)
+
+    @staticmethod
+    def create_test_documents(doc_num: int) -> List[Document]:
+        """
+        Creates a list of test documents.
+
+        It generates a mix of "Reg Wheel" and "Robot" documents.
+
+        Args:
+            doc_num (int): The total number of documents to create.
+
+        Returns:
+            List[Document]: A list of created test documents.
+        """
+
+        reg_wheel_docs_num = doc_num * 2 // 3
+        robot_docs_num = doc_num - reg_wheel_docs_num
+
+        documents = [SchemaDocumentUpdater.create_test_document(i+1, "Reg Wheel") for i in range(reg_wheel_docs_num)]
+        documents += [SchemaDocumentUpdater.create_test_document(i+1 + reg_wheel_docs_num, "Robot") for i in range(robot_docs_num)]
+
+        return documents
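A short usage sketch for the updater; the document content here is invented, and the 4/2 split follows the two-thirds ratio in create_test_documents above:

    from langchain_core.documents import Document
    from crewplus.utils.schema_document_updater import SchemaDocumentUpdater

    doc = Document(page_content="pump log", metadata={"page": 1})
    doc = SchemaDocumentUpdater.update_document_metadata(doc, {"keywords": "Reg Wheel"})
    doc = SchemaDocumentUpdater.delete_document_metadata(doc, ["page"])

    docs = SchemaDocumentUpdater.create_test_documents(6)  # 4 "Reg Wheel" docs, then 2 "Robot" docs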
crewplus/vectorstores/milvus/__init__.py
File without changes
crewplus/vectorstores/milvus/milvus_schema_manager.py
@@ -0,0 +1,221 @@
+from pymilvus import DataType, MilvusClient
+import json
+import logging
+from typing import Any
+
+class MilvusSchemaManager:
+    """
+    Manages Milvus/Zilliz collection schemas.
+
+    This class provides functionalities to create and validate collection schemas
+    and index parameters based on a JSON definition. It interacts with a
+    MilvusClient instance to perform these operations.
+    """
+    def __init__(self, client: MilvusClient, logger=None):
+        """
+        Initializes the MilvusSchemaManager.
+
+        Args:
+            client (MilvusClient): An instance of the Milvus client.
+            logger (logging.Logger, optional): A logger instance. If not provided,
+                                               a default logger will be created.
+                                               Defaults to None.
+        """
+        self.client = client
+        self.logger = logger or logging.getLogger(__name__)
+
+    def bind_client(self, client: MilvusClient):
+        """
+        Binds a new MilvusClient instance to the manager.
+
+        Args:
+            client (MilvusClient): The Milvus client instance to use.
+        """
+        self.client = client
+
+    def _add_array_field(self, schema, field_name, field_info):
+        """
+        Adds an ARRAY field to the schema based on field information.
+
+        This is a helper method to handle the specific logic for creating ARRAY fields.
+
+        Args:
+            schema: The Milvus schema object to add the field to.
+            field_name (str): The name of the field.
+            field_info (dict): A dictionary containing information about the field,
+                               such as element type and max capacity.
+
+        Raises:
+            ValueError: If required information like 'element' or 'max_capacity'
+                        is missing from field_info, or if an unsupported element
+                        type is specified.
+        """
+        element_type_str = field_info.get("element")
+        if not element_type_str:
+            raise ValueError(f"Array field '{field_name}' must have 'element' type specified.")
+
+        element_type = None
+        if element_type_str in ["STRING", "VARCHAR", "TEXT"]:
+            element_type = DataType.VARCHAR
+        elif element_type_str == "INT64":
+            element_type = DataType.INT64
+        else:
+            raise ValueError(f"Unsupported element type '{element_type_str}' for ARRAY field '{field_name}'.")
+
+        max_capacity = field_info.get("max_capacity")
+        if max_capacity is None:
+            raise ValueError(f"Array field '{field_name}' must have 'max_capacity' specified.")
+
+        nullable = field_info.get('nullable', True)
+
+        field_args = {
+            "field_name": field_name,
+            "datatype": DataType.ARRAY,
+            "element_type": element_type,
+            "max_capacity": int(max_capacity),
+            "nullable": nullable,
+        }
+
+        if element_type == DataType.VARCHAR:
+            max_length = field_info.get('max_length', 65535)
+            field_args["max_length"] = int(max_length)
+
+        schema.add_field(**field_args)
+
+    def create_collection_schema(self, json_schema: str):
+        """
+        Creates a Milvus collection schema from a JSON string.
+
+        Args:
+            json_schema (str): A JSON string defining the schema.
+
+        Returns:
+            A Milvus schema object.
+
+        Raises:
+            ValueError: If an unknown field type is encountered in the schema.
+        """
+        schema_data = json.loads(json_schema)
+        fields = schema_data['node_types']['Document']['properties']
+
+        schema = self.client.create_schema(auto_id=False, enable_dynamic_fields=True)
+        for field_name, field_info in fields.items():
+            field_type = field_info['type']
+            if field_type == "STRING" or field_type == "VARCHAR" or field_type == "TEXT":
+                max_length = field_info.get('max_length', 256)  # Default max_length if not provided
+                nullable = field_info.get('nullable', False)  # Default nullable if not provided
+                schema.add_field(field_name=field_name, datatype=DataType.VARCHAR, max_length=max_length, nullable=nullable)
+            elif field_type == "JSON":
+                nullable = field_info.get('nullable', True)
+                schema.add_field(field_name=field_name, datatype=DataType.JSON, nullable=nullable)
+            elif field_type == "INT64":
+                is_primary = field_info.get('is_primary', False)
+                auto_id = field_info.get('auto_id', False)
+                nullable = field_info.get('nullable', False)
+                schema.add_field(field_name=field_name, datatype=DataType.INT64, is_primary=is_primary, auto_id=auto_id, nullable=nullable)
+            elif field_type == "ARRAY":
+                self._add_array_field(schema, field_name, field_info)
+            elif field_type == "FLOAT_VECTOR":
+                dim = field_info.get('dim', 1536)  # Default dimension if not provided
+                schema.add_field(field_name=field_name, datatype=DataType.FLOAT_VECTOR, dim=dim)
+            else:
+                raise ValueError(f"Unknown field type: {field_type}")
+
+        return schema
+
+    def create_index_params(self, json_schema: str):
+        """
+        Creates index parameters from a JSON schema string.
+
+        This method defines indexes based on the 'indexes' section of the schema
+        and automatically creates an 'AUTOINDEX' for any FLOAT_VECTOR fields.
+
+        Args:
+            json_schema (str): A JSON string defining the schema and indexes.
+
+        Returns:
+            Milvus index parameters object.
+        """
+        schema_data = json.loads(json_schema)
+        fields = schema_data['node_types']['Document']['properties']
+
+        index_params = self.client.prepare_index_params()
+
+        # Check if 'indexes' key exists
+        if 'indexes' in schema_data['node_types']['Document']:
+            indexes = schema_data['node_types']['Document']['indexes']
+            for index_name, index_details in indexes.items():
+                field_name = index_details['fieldname']
+                index_type = index_details['type']
+                params = index_details['params']
+                index_params.add_index(
+                    field_name=field_name,
+                    index_type=index_type,
+                    index_name=index_name,
+                    params=params
+                )
+
+        # Automatic indexing for FLOAT_VECTOR fields
+        for field_name, field_info in fields.items():
+            if field_info['type'] == "FLOAT_VECTOR":
+                index_params.add_index(
+                    field_name=field_name,
+                    index_name="vector",
+                    index_type="AUTOINDEX",
+                    metric_type="L2"
+                )
+
+        return index_params
+
+    def create_collection(self, collection_name: str, json_schema: str):
+        """
+        Creates a new collection in Milvus.
+
+        This method orchestrates the creation of the schema and index parameters
+        before creating the collection itself.
+
+        Args:
+            collection_name (str): The name for the new collection.
+            json_schema (str): The JSON string defining the collection's schema
+                               and indexes.
+        """
+        schema = self.create_collection_schema(json_schema)
+        index_params = self.create_index_params(json_schema)
+
+        self.client.create_collection(
+            collection_name=collection_name,
+            schema=schema,
+            index_params=index_params,
+            enable_dynamic_fields=True  # we need to enable dynamic fields for schema updates
+        )
+
+    def validate_schema(self, json_schema: str) -> bool:
+        """
+        Validates the given schema by attempting to create a collection schema and index params.
+
+        Args:
+            json_schema (str): The schema JSON string to validate.
+
+        Returns:
+            bool: True if the schema is valid, False if any exceptions are caught.
+        """
+        try:
+            self.create_collection_schema(json_schema)
+            self.create_index_params(json_schema)
+            return True
+        except Exception as e:
+            self.logger.error(f"Schema validation failed: {e}")
+            return False
+
+
+class ZillizSchemaManager(MilvusSchemaManager):
+    def __init__(self, *args: Any, **kwargs: Any) -> None:
+        import warnings
+
+        warnings.warn(
+            "The ZillizSchemaManager class will be deprecated in the future. "
+            "Please use the MilvusSchemaManager class instead.",
+            DeprecationWarning,
+            stacklevel=2,
+        )
+        super().__init__(*args, **kwargs)
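The JSON layout the manager expects is implied by the parsing code above: fields live under node_types -> Document -> properties, with an optional sibling indexes block. A minimal schema that should pass validate_schema, with invented field names and a hypothetical client:

    import json

    example_schema = json.dumps({
        "node_types": {
            "Document": {
                "properties": {
                    "pk": {"type": "INT64", "is_primary": True, "auto_id": False},
                    "text": {"type": "VARCHAR", "max_length": 4096},
                    "entity_tags": {"type": "ARRAY", "element": "VARCHAR", "max_capacity": 16},
                    "plant_metadata": {"type": "JSON"},
                    "vector": {"type": "FLOAT_VECTOR", "dim": 1536}
                }
            }
        }
    })

    manager = MilvusSchemaManager(client=my_milvus_client)  # my_milvus_client is assumed
    assert manager.validate_schema(example_schema)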
crewplus/vectorstores/milvus/schema_milvus.py
@@ -0,0 +1,253 @@
+from typing import List, Optional
+import logging
+import json
+
+from pymilvus import DataType
+from langchain_milvus import Milvus
+from langchain_core.documents import Document
+from crewplus.utils.schema_document_updater import SchemaDocumentUpdater
+from crewplus.utils.schema_action import Action
+from .milvus_schema_manager import MilvusSchemaManager
+
+
+class SchemaMilvus(Milvus):
+    """
+    SchemaMilvus is a subclass of the Milvus class from langchain_milvus. This class is responsible for updating metadata of documents in a Milvus vector store.
+
+    Attributes:
+        embedding_function: Embedding function used by the Milvus vector store.
+        collection_name: Name of the collection in the Milvus vector store.
+        connection_args: Connection arguments for the Milvus vector store.
+        index_params: Index parameters for the Milvus vector store.
+        auto_id: Flag to specify if auto ID generation is enabled.
+        primary_field: The primary field of the collection.
+        vector_field: The vector field of the collection.
+        consistency_level: The consistency level for the Milvus vector store.
+        collection_schema: Schema JSON string associated with the Milvus existing collection name.
+    """
+    def __init__(
+        self,
+        embedding_function,
+        collection_name,
+        connection_args,
+        index_params=None,
+        auto_id=True,
+        primary_field="pk",
+        text_field: str = "text",
+        vector_field=["vector"],
+        consistency_level="Session",
+        logger: Optional[logging.Logger] = None
+    ):
+        """
+        Initializes the SchemaMilvus class with the provided parameters.
+
+        Args:
+            embedding_function: Embedding function used by the Milvus vector store.
+            collection_name: Name of the collection in the Milvus vector store.
+            connection_args: Connection arguments for the Milvus vector store.
+            index_params: Index parameters for the Milvus vector store.
+            auto_id: Flag to specify if auto ID generation is enabled.
+            primary_field: The primary field of the collection.
+            text_field: The text field of the collection.
+            vector_field: The vector field of the collection.
+            consistency_level: The consistency level for the Milvus vector store.
+            logger: Optional logger instance. If not provided, a default logger is created.
+        """
+        super().__init__(
+            embedding_function=embedding_function,
+            collection_name=collection_name,
+            connection_args=connection_args,
+            index_params=index_params,
+            auto_id=auto_id,
+            primary_field=primary_field,
+            text_field=text_field,
+            vector_field=vector_field,
+            consistency_level=consistency_level
+        )
+        self.logger = logger or logging.getLogger(__name__)
+        self.collection_schema = None
+        self.schema_manager = MilvusSchemaManager(client=self.client)
+
+    def set_schema(self, schema: str):
+        """
+        Sets the collection schema.
+
+        Args:
+            schema: The schema JSON string.
+        """
+        self.collection_schema = schema
+
+    def get_fields(self, collection_name: Optional[str] = None) -> Optional[List[str]]:
+        """
+        Retrieves and returns the fields from the collection schema.
+
+        Args:
+            collection_name: The name of the collection to describe. If None, use self.collection_name.
+
+        Returns:
+            List[str] | None: The list of field names from the collection schema (excluding vector and text fields), or None if collection_name is not provided or an error occurs.
+        """
+        if collection_name is None:
+            collection_name = self.collection_name
+        if collection_name is None:
+            return None
+
+        try:
+            schema = self.client.describe_collection(collection_name)
+            fields = [field["name"] for field in schema["fields"] if field["type"] != DataType.FLOAT_VECTOR]
+            return fields
+        except Exception as e:
+            self.logger.warning(f"Failed to retrieve schema fields: {e}")
+            return None
+
+    def create_collection(self) -> bool:
+        """
+        Validates the schema and creates the collection using the MilvusSchemaManager.
+
+        Returns:
+            bool: True if the collection is successfully created, False otherwise.
+        """
+        if self.collection_schema is None:
+            self.logger.error("Collection schema is not set. Please set a schema using set_schema().")
+            return False
+
+        self.schema_manager.bind_client(self.client)
+        if not self.schema_manager.validate_schema(self.collection_schema):
+            self.logger.error("Failed to validate schema")
+            return False
+        try:
+            self.schema_manager.create_collection(self.collection_name, self.collection_schema)
+            self.logger.info(f"Collection {self.collection_name} created successfully")
+
+            return True
+        except Exception as e:
+            self.logger.error(f"Failed to create collection: {e}")
+            return False
+
+    def drop_collection(self, collection_name: Optional[str] = None) -> bool:
+        """
+        Drops the collection using the Milvus client.
+
+        Returns:
+            bool: True if the collection is successfully dropped, False otherwise.
+        """
+        if collection_name is None:
+            collection_name = self.collection_name
+
+        try:
+            self.client.drop_collection(collection_name)
+            self.logger.info(f"Collection {collection_name} dropped successfully")
+            return True
+        except Exception as e:
+            self.logger.error(f"Failed to drop collection {self.collection_name}: {e}")
+            return False
+
+    def _handle_upsert(self, doc: Document, metadata_dict: dict) -> Document:
+        """
+        Handles the UPSERT action for a single document by merging metadata.
+        """
+        existing_metadata = doc.metadata
+        for key, value in metadata_dict.items():
+            # Skip primary key and text fields to prevent modification.
+            if key in [self.primary_field, self.text_field]:
+                continue
+
+            if isinstance(value, dict):
+                # If the new value is a dictionary, handle nested updates.
+                if key not in existing_metadata or not isinstance(existing_metadata.get(key), dict):
+                    # If the key doesn't exist or its value is not a dict, replace it.
+                    existing_metadata[key] = value
+                else:
+                    # If both are dictionaries, recursively update the nested fields.
+                    for sub_key, sub_value in value.items():
+                        if isinstance(sub_value, dict) and sub_key in existing_metadata[key] and isinstance(existing_metadata[key].get(sub_key), dict):
+                            existing_metadata[key][sub_key].update(sub_value)
+                        else:
+                            existing_metadata[key][sub_key] = sub_value
+            else:
+                # For non-dictionary values, simply update or add the field.
+                existing_metadata[key] = value
+
+        doc.metadata = existing_metadata
+        return doc
+
+    def update_documents_metadata(self, expr: str, metadata: str, action: Action = Action.UPSERT) -> List[Document]:
+        """
+        Updates the metadata of documents in the Milvus vector store based on the provided expression.
+
+        Args:
+            expr: Expression to filter the target documents.
+            metadata: New metadata (as a JSON string) to update the documents with.
+            action: The update strategy to apply. Defaults to Action.UPSERT.
+
+        Returns:
+            List of updated documents.
+        """
+        try:
+            metadata_dict = json.loads(metadata)
+        except json.JSONDecodeError:
+            raise ValueError("Invalid JSON string for metadata")
+
+        # Retrieve documents that match the filter expression.
+        fields = self.get_fields()
+        documents = self.search_by_metadata(expr, fields=fields, limit=5000)
+
+        updated_documents = []
+        for doc in documents:
+            # Preserve the original primary key and text values.
+            pk_value = doc.metadata.get(self.primary_field)  # default to pk
+            text_value = doc.metadata.get(self.text_field)
+
+            # Apply the specified action to update the document's metadata.
+            if action == Action.UPSERT:
+                doc = self._handle_upsert(doc, metadata_dict)
+            elif action == Action.DELETE:
+                keys_to_delete = metadata_dict.keys()
+                doc = SchemaDocumentUpdater.delete_document_metadata(doc, list(keys_to_delete))
+            elif action == Action.UPDATE:
+                existing_metadata = doc.metadata
+                update_dict = {}
+                for key, value in metadata_dict.items():
+                    if key in existing_metadata:
+                        if isinstance(value, dict) and isinstance(existing_metadata[key], dict):
+                            merged = existing_metadata[key].copy()
+                            for sub_key, sub_value in value.items():
+                                if sub_key in merged:
+                                    merged[sub_key] = sub_value
+                            update_dict[key] = merged
+                        else:
+                            update_dict[key] = value
+                doc = SchemaDocumentUpdater.update_document_metadata(doc, update_dict)
+            elif action == Action.INSERT:
+                existing_metadata = doc.metadata
+                for key, value in metadata_dict.items():
+                    if key in ['pk', 'text']:
+                        continue
+
+                    if isinstance(value, dict) and key in existing_metadata and isinstance(existing_metadata[key], dict):
+                        existing_metadata[key] = {}
+                        existing_metadata[key] = value
+                    else:
+                        existing_metadata[key] = value
+                doc.metadata = existing_metadata
+
+            # Restore the primary key and text values to ensure they are not lost.
+            if pk_value is not None:
+                doc.metadata[self.primary_field] = pk_value
+            if text_value is not None:
+                doc.metadata[self.text_field] = text_value
+
+            updated_documents.append(doc)
+
+        # Extract the primary keys for the upsert operation.
+        updated_ids = [doc.metadata[self.primary_field] for doc in updated_documents]
+
+        # Remove primary key and text from metadata before upserting,
+        # as they are handled separately by the vector store.
+        for doc in updated_documents:
+            doc.metadata.pop(self.primary_field, None)
+            doc.metadata.pop(self.text_field, None)
+
+        # Perform the upsert operation to update the documents in the collection.
+        self.upsert(ids=updated_ids, documents=updated_documents)
+
+        return updated_documents
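A hedged sketch of the metadata-update flow; the filter expression and payload are invented, while search_by_metadata and upsert come from the code above and its langchain_milvus parent:

    import json
    from crewplus.utils.schema_action import Action

    # vector_store is assumed to be an initialized SchemaMilvus instance.
    updated = vector_store.update_documents_metadata(
        expr='keywords == "Reg Wheel"',
        metadata=json.dumps({"version_metadata": {"version_id": "V2.1"}}),
        action=Action.UPSERT,
    )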
crewplus/vectorstores/milvus/vdb_service.py
@@ -0,0 +1,342 @@
+# -*- coding: utf-8 -*-
+# @Author: Cursor
+# @Date: 2025-02-12
+# @Last Modified by: Gemini
+# @Last Modified time: 2025-07-01
+
+import logging
+from typing import List, Dict, Union, Optional
+from langchain_milvus import Zilliz
+from langchain_core.embeddings import Embeddings
+from langchain_openai import AzureOpenAIEmbeddings
+from pymilvus import MilvusClient
+
+from crewplus.services.init_services import get_model_balancer
+from crewplus.vectorstores.milvus.schema_milvus import SchemaMilvus
+
+class VDBService(object):
+    """
+    A service to manage connections to Milvus/Zilliz vector databases and embedding models.
+
+    This service centralizes the configuration and instantiation of the Milvus client
+    and provides helper methods to get embedding functions and vector store instances.
+
+    Args:
+        settings (dict): A dictionary containing configuration for the vector store
+                         and embedding models.
+        schema (str, optional): The schema definition for a collection. Defaults to None.
+        logger (logging.Logger, optional): An optional logger instance. Defaults to None.
+
+    Raises:
+        ValueError: If required configurations are missing from the settings dictionary.
+        NotImplementedError: If an unsupported provider is specified.
+        RuntimeError: If the MilvusClient fails to initialize after a retry.
+
+    Example:
+        >>> settings = {
+        ...     "embedder": {
+        ...         "provider": "azure-openai",
+        ...         "config": {
+        ...             "model": "text-embedding-3-small",
+        ...             "api_version": "2023-05-15",
+        ...             "api_key": "YOUR_AZURE_OPENAI_KEY",
+        ...             "openai_base_url": "YOUR_AZURE_OPENAI_ENDPOINT",
+        ...             "embedding_dims": 1536
+        ...         }
+        ...     },
+        ...     "vector_store": {
+        ...         "provider": "milvus",
+        ...         "config": {
+        ...             "host": "localhost",
+        ...             "port": 19530,
+        ...             "user": "root",
+        ...             "password": "password",
+        ...             "db_name": "default"
+        ...         }
+        ...     },
+        ...     "index_params": {
+        ...         "metric_type": "L2",
+        ...         "index_type": "AUTOINDEX",
+        ...         "params": {}
+        ...     }
+        ... }
+        >>> vdb_service = VDBService(settings=settings)
+        >>> # Get the raw Milvus client
+        >>> client = vdb_service.get_vector_client()
+        >>> print(client.list_collections())
+        >>> # Get an embedding function
+        >>> embeddings = vdb_service.get_embeddings()
+        >>> print(embeddings)
+        >>> # Get a LangChain vector store instance (will be cached)
+        >>> vector_store = vdb_service.get_vector_store(collection_name="my_collection")
+        >>> print(vector_store)
+        >>> same_vector_store = vdb_service.get_vector_store(collection_name="my_collection")
+        >>> assert vector_store is same_vector_store
+    """
+    _client: MilvusClient
+    _instances: Dict[str, Zilliz] = {}
+
+    schema: str
+    embedding_function: Embeddings
+    index_params: dict
+    connection_args: dict
+    settings: dict
+
+    def __init__(self, settings: dict, schema: str = None, logger: logging.Logger = None):
+        """
+        Initializes the VDBService.
+
+        Args:
+            settings (dict): Configuration dictionary for the service.
+            schema (str, optional): Default schema for new collections. Defaults to None.
+            logger (logging.Logger, optional): Logger instance. Defaults to None.
+        """
+        self.logger = logger or logging.getLogger(__name__)
+        self.settings = settings
+
+        vector_store_settings = self.settings.get("vector_store")
+        if not vector_store_settings:
+            msg = "'vector_store' not found in settings"
+            self.logger.error(msg)
+            raise ValueError(msg)
+
+        provider = vector_store_settings.get("provider")
+        self.connection_args = vector_store_settings.get("config")
+
+        if not provider or not self.connection_args:
+            msg = "'provider' or 'config' not found in 'vector_store' settings"
+            self.logger.error(msg)
+            raise ValueError(msg)
+
+        self._client = self._initialize_milvus_client(provider)
+
+        self.schema = schema
+        self.index_params = self.settings.get("index_params")
+
+        self.logger.info("VDBService initialized successfully")
+
+    def _initialize_milvus_client(self, provider: str) -> MilvusClient:
+        """
+        Initializes and returns a MilvusClient with a retry mechanism.
+        """
+        client_args = {}
+        if provider == "milvus":
+            host = self.connection_args.get("host", "localhost")
+            port = self.connection_args.get("port", 19530)
+
+            # Use https for remote hosts, and http for local connections.
+            scheme = "https" if host not in ["localhost", "127.0.0.1"] else "http"
+            uri = f"{scheme}://{host}:{port}"
+
+            client_args = {
+                "uri": uri,
+                "user": self.connection_args.get("user"),
+                "password": self.connection_args.get("password"),
+                "db_name": self.connection_args.get("db_name")
+            }
+            # Filter out None values to use client defaults
+            client_args = {k: v for k, v in client_args.items() if v is not None}
+
+        elif provider == "zilliz":
+            client_args = self.connection_args
+        else:
+            self.logger.error(f"Unsupported vector store provider: {provider}")
+            raise NotImplementedError(f"Vector store provider '{provider}' is not supported.")
+
+        try:
+            # First attempt to connect
+            return MilvusClient(**client_args)
+        except Exception as e:
+            self.logger.error(f"Failed to initialize MilvusClient, trying again. Error: {e}")
+            # Second attempt after failure
+            try:
+                return MilvusClient(**client_args)
+            except Exception as e_retry:
+                self.logger.error(f"Failed to initialize MilvusClient on retry. Final error: {e_retry}")
+                raise RuntimeError(f"Could not initialize MilvusClient after retry: {e_retry}")
+
+    def get_vector_client(self) -> MilvusClient:
+        """
+        Returns the active MilvusClient instance.
+
+        Returns:
+            MilvusClient: The initialized client for interacting with the vector database.
+        """
+        return self._client
+
+    def get_embeddings(self, from_model_balancer: bool = False, model_type: Optional[str] = "embedding-large") -> Embeddings:
+        """
+        Gets an embedding function, either from the model balancer or directly from settings.
+
+        Args:
+            from_model_balancer (bool): If True, uses the central model balancer service.
+                                        If False, creates a new instance based on 'embedder' settings.
+            model_type (str, optional): The type of model to get from the balancer. Defaults to "embedding-large".
+
+        Returns:
+            Embeddings: An instance of a LangChain embedding model.
+        """
+        if from_model_balancer:
+            model_balancer = get_model_balancer()
+            return model_balancer.get_model(model_type=model_type)
+
+        embedder_config = self.settings.get("embedder")
+        if not embedder_config:
+            self.logger.error("'embedder' configuration not found in settings.")
+            raise ValueError("'embedder' configuration not found in settings.")
+
+        provider = embedder_config.get("provider")
+        config = embedder_config.get("config")
+
+        if not provider or not config:
+            self.logger.error("Embedder 'provider' or 'config' not found in settings.")
+            raise ValueError("Embedder 'provider' or 'config' not found in settings.")
+
+        if provider == "azure-openai":
+            # Map the settings config to AzureOpenAIEmbeddings parameters.
+            azure_config = {
+                "azure_deployment": config.get("model"),
+                "openai_api_version": config.get("api_version"),
+                "api_key": config.get("api_key"),
+                "azure_endpoint": config.get("openai_base_url"),
+                "dimensions": config.get("embedding_dims"),
+                "chunk_size": config.get("chunk_size", 16),
+                "request_timeout": config.get("request_timeout", 60),
+                "max_retries": config.get("max_retries", 2)
+            }
+            # Filter out None values to use client defaults.
+            azure_config = {k: v for k, v in azure_config.items() if v is not None}
+
+            return AzureOpenAIEmbeddings(**azure_config)
+        else:
+            self.logger.error(f"Unsupported embedding provider: {provider}")
+            raise NotImplementedError(f"Embedding provider '{provider}' is not supported yet.")
+
+    def get_vector_store(self, collection_name: str, embeddings: Embeddings = None, metric_type: str = "L2") -> Zilliz:
+        """
+        Gets a vector store instance, creating it if it doesn't exist for the collection.
+
+        This method caches instances by collection name to avoid re-instantiation.
+
+        Args:
+            collection_name (str): The name of the collection in the vector database.
+            embeddings (Embeddings, optional): An embedding model instance. If None, one is created.
+            metric_type (str): The distance metric for the index. Defaults to "L2".
+
+        Returns:
+            Zilliz: LangChain Zilliz instance, which is compatible with both Zilliz and Milvus.
+        """
+        if not collection_name:
+            self.logger.error("get_vector_store called with no collection_name.")
+            raise ValueError("collection_name must be provided.")
+
+        # Return the cached instance if it already exists.
+        if collection_name in self._instances:
+            self.logger.info(f"Returning existing vector store instance for collection: {collection_name}")
+            return self._instances[collection_name]
+
+        self.logger.info(f"Creating new vector store instance for collection: {collection_name}")
+        if embeddings is None:
+            embeddings = self.get_embeddings()
+
+        index_params = self.index_params or {
+            "metric_type": metric_type,
+            "index_type": "AUTOINDEX",
+            "params": {}
+        }
+
+        vdb = Zilliz(
+            embedding_function=embeddings,
+            collection_name=collection_name,
+            connection_args=self.connection_args,
+            index_params=index_params
+        )
+
+        # Cache the newly created instance.
+        self._instances[collection_name] = vdb
+
+        return vdb
+
+    def delete_old_indexes(self, url: str = None, vdb: Zilliz = None) -> None:
+        """Delete old indexes of the same source_url.
+
+        Args:
+            url (str): source url
+            vdb (Zilliz): vector store instance to delete from
+        """
+        if url is None or vdb is None:
+            return
+
+        # Delete indexes of the same source_url
+        expr = "source in [\"" + url + "\"]"
+        pks = vdb.get_pks(expr)
+
+        # Delete entities by pks
+        if pks is not None and len(pks) > 0:
+            old_items = vdb.delete(pks)
+            self.logger.info("ingesting document -- delete old indexes -- " + str(old_items))
+
+    def delete_old_indexes_by_id(self, id: str = None, vdb: Zilliz = None) -> None:
+        """Delete old indexes of the same source_id.
+
+        Args:
+            id (str): source id
+            vdb (Zilliz): vector store instance to delete from
+        """
+        self.logger.info(f"Delete old indexes of the same source_id: {id}")
+
+        if id is None or vdb is None:
+            return
+
+        # Delete indexes of the same source_id
+        expr = "source_id in [\"" + id + "\"]"
+        pks = vdb.get_pks(expr)
+
+        # Delete entities by pks
+        if pks is not None and len(pks) > 0:
+            old_items = vdb.delete(pks)
+            self.logger.info("ingesting document -- delete old indexes -- " + str(old_items))
+
+    def drop_collection(self, collection_name: str) -> None:
+        """
+        Deletes a collection from the vector database and removes it from the cache.
+
+        Args:
+            collection_name (str): The name of the collection to drop.
+
+        Raises:
+            ValueError: If collection_name is not provided.
+            RuntimeError: If the operation fails on the database side.
+        """
+        if not collection_name:
+            self.logger.error("drop_collection called without a collection_name.")
+            raise ValueError("collection_name must be provided.")
+
+        self.logger.info(f"Attempting to drop collection: {collection_name}")
+
+        try:
+            client = self.get_vector_client()
+            client.drop_collection(collection_name=collection_name)
+            self.logger.info(f"Successfully dropped collection: {collection_name}")
+        except Exception as e:
+            self.logger.error(f"Failed to drop collection '{collection_name}': {e}")
+            raise RuntimeError(f"An error occurred while dropping collection '{collection_name}'.") from e
+        finally:
+            # Whether successful or not, remove the stale instance from the cache.
+            if collection_name in self._instances:
+                del self._instances[collection_name]
+                self.logger.info(f"Removed '{collection_name}' from instance cache.")
+
+    def delete_data_by_filter(self, collection_name: str = None, filter: str = None) -> None:
+        """Delete data from a collection that matches a filter expression.
+
+        Args:
+            collection_name (str): collection_name
+            filter (str): Milvus filter expression selecting the rows to delete
+        """
+        self.logger.info(f"delete data by filter from collection: {collection_name}")
+
+        client = self.get_vector_client()
+        if collection_name is None or client is None or filter is None:
+            raise RuntimeError("collection_name and filter must not be null; check that your client is linked to Milvus")
+
+        try:
+            client.delete(collection_name=collection_name, filter=filter)
+        except Exception as e:
+            raise RuntimeError(f"delete collection data failed: {str(e)}")
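The class docstring above already walks through construction and caching; one flow it does not show is clearing stale vectors before re-ingesting a source, which the delete helpers cover. A sketch with invented names:

    vdb = vdb_service.get_vector_store(collection_name="my_collection")
    vdb_service.delete_old_indexes(url="http://example.com/doc.pdf", vdb=vdb)  # by source url
    vdb_service.delete_old_indexes_by_id(id="doc-42", vdb=vdb)                 # by source id
    vdb_service.delete_data_by_filter(collection_name="my_collection", filter='page == 1')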
{crewplus-0.1.6.dist-info → crewplus-0.2.1.dist-info}/METADATA
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: crewplus
-Version: 0.1.6
+Version: 0.2.1
 Summary: Base services for CrewPlus AI applications
 Author-Email: Tim Liu <tim@opsmateai.com>
 License: MIT
@@ -15,6 +15,7 @@ Requires-Dist: google-genai==1.21.1
 Requires-Dist: mkdocs<2.0.0,>=1.6.1
 Requires-Dist: mkdocs-material<10.0.0,>=9.6.14
 Requires-Dist: mkdocstrings-python<2.0.0,>=1.16.12
+Requires-Dist: langchain-milvus<0.3.0,>=0.2.1
 Description-Content-Type: text/markdown
 
 # CrewPlus
@@ -37,6 +38,7 @@ CrewPlus is designed as a modular and extensible ecosystem of packages. This all
 - **`crewplus` (This package):** The core package containing foundational services for chat, model load balancing, and vector stores.
 - **`crewplus-agents`:** An extension for creating and managing autonomous AI agents.
 - **`crewplus-ingestion`:** Provides robust pipelines for knowledge ingestion and data processing.
+- **`crewplus-memory`:** Provides agent memory services for CrewPlus AI Agents.
 - **`crewplus-integrations`:** A collection of third-party integrations to connect CrewPlus with other services and platforms.
 
 ## Features
@@ -94,6 +96,10 @@ crewplus-base/ # GitHub repo name
 │   └── gemini_chat_model.py
 │   └── model_load_balancer.py
 │   └── ...
+│   └── vectorstores/milvus
+│       └── __init__.py
+│       └── schema_milvus.py
+│       └── vdb_service.py
 │   └── core/
 │       └── __init__.py
 │       └── config.py
crewplus-0.2.1.dist-info/RECORD
@@ -0,0 +1,16 @@
+crewplus-0.2.1.dist-info/METADATA,sha256=IKVh7qW2mV9NfYXs66enxgcBOk_GTP9xeU3kwI6lysc,4881
+crewplus-0.2.1.dist-info/WHEEL,sha256=tSfRZzRHthuv7vxpI4aehrdN9scLjk-dCJkPLzkHxGg,90
+crewplus-0.2.1.dist-info/entry_points.txt,sha256=6OYgBcLyFCUgeqLgnvMyOJxPCWzgy7se4rLPKtNonMs,34
+crewplus-0.2.1.dist-info/licenses/LICENSE,sha256=2_NHSHRTKB_cTcT_GXgcenOCtIZku8j343mOgAguTfc,1087
+crewplus/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+crewplus/services/__init__.py,sha256=MmH2v3N0ZMsuqFNAupkXENjUqvgf5ehQ99H6EzPqLZU,48
+crewplus/services/gemini_chat_model.py,sha256=i9p5KvSJYaHSUBLPKM_bpyGVLWCDQoNeah_WjQVJRXs,26227
+crewplus/services/init_services.py,sha256=5MlvZbyGPNgVA_2bPuxhhbw9TgD5rrvC0e_62YJh340,589
+crewplus/services/model_load_balancer.py,sha256=a2BfcWV-OEItVf7_tTBYY0HiXxFzA5Uk7d-6AG-jLNU,8692
+crewplus/utils/schema_action.py,sha256=GDaBoVFQD1rXqrLVSMTfXYW1xcUu7eDcHsn57XBSnIg,422
+crewplus/utils/schema_document_updater.py,sha256=frvffxn2vbi71fHFPoGb9hq7gH2azmmdq17p-Fumnvg,7322
+crewplus/vectorstores/milvus/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+crewplus/vectorstores/milvus/milvus_schema_manager.py,sha256=qHMVIM0NS3rLfACb8d3-tQS9hJo6_7_YP8AxVx4t1Cc,9019
+crewplus/vectorstores/milvus/schema_milvus.py,sha256=GhHTtCH5HsIJc3RHa25RXl3aZdkS3Rba5KeuUk_Hi0k,11425
+crewplus/vectorstores/milvus/vdb_service.py,sha256=KiGuHWU9oz2QCCaxGECtN-F69m8ZOwjzPO0umk6ZjzA,14592
+crewplus-0.2.1.dist-info/RECORD,,
crewplus-0.1.6.dist-info/RECORD (DELETED)
@@ -1,9 +0,0 @@
-crewplus-0.1.6.dist-info/METADATA,sha256=lqdkw41PrxeXf9IJRMEkCZrxw1B-3jLyEmBwlU2iT-k,4609
-crewplus-0.1.6.dist-info/WHEEL,sha256=tSfRZzRHthuv7vxpI4aehrdN9scLjk-dCJkPLzkHxGg,90
-crewplus-0.1.6.dist-info/entry_points.txt,sha256=6OYgBcLyFCUgeqLgnvMyOJxPCWzgy7se4rLPKtNonMs,34
-crewplus-0.1.6.dist-info/licenses/LICENSE,sha256=2_NHSHRTKB_cTcT_GXgcenOCtIZku8j343mOgAguTfc,1087
-crewplus/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-crewplus/services/__init__.py,sha256=MmH2v3N0ZMsuqFNAupkXENjUqvgf5ehQ99H6EzPqLZU,48
-crewplus/services/gemini_chat_model.py,sha256=i9p5KvSJYaHSUBLPKM_bpyGVLWCDQoNeah_WjQVJRXs,26227
-crewplus/services/model_load_balancer.py,sha256=bJpSgCGPWWT1yD_nYshIPngr8Xmdq1gfq8lJ1hOEGbM,8673
-crewplus-0.1.6.dist-info/RECORD,,
{crewplus-0.1.6.dist-info → crewplus-0.2.1.dist-info}/WHEEL
File without changes

{crewplus-0.1.6.dist-info → crewplus-0.2.1.dist-info}/entry_points.txt
File without changes

{crewplus-0.1.6.dist-info → crewplus-0.2.1.dist-info}/licenses/LICENSE
File without changes