crewplus 0.2.38__tar.gz → 0.2.40__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of crewplus might be problematic. Click here for more details.

Files changed (23) hide show
  1. {crewplus-0.2.38 → crewplus-0.2.40}/PKG-INFO +1 -1
  2. crewplus-0.2.40/crewplus/vectorstores/milvus/schema_milvus.py +563 -0
  3. {crewplus-0.2.38 → crewplus-0.2.40}/pyproject.toml +1 -1
  4. crewplus-0.2.38/crewplus/vectorstores/milvus/schema_milvus.py +0 -277
  5. {crewplus-0.2.38 → crewplus-0.2.40}/LICENSE +0 -0
  6. {crewplus-0.2.38 → crewplus-0.2.40}/README.md +0 -0
  7. {crewplus-0.2.38 → crewplus-0.2.40}/crewplus/__init__.py +0 -0
  8. {crewplus-0.2.38 → crewplus-0.2.40}/crewplus/services/__init__.py +0 -0
  9. {crewplus-0.2.38 → crewplus-0.2.40}/crewplus/services/azure_chat_model.py +0 -0
  10. {crewplus-0.2.38 → crewplus-0.2.40}/crewplus/services/gemini_chat_model.py +0 -0
  11. {crewplus-0.2.38 → crewplus-0.2.40}/crewplus/services/init_services.py +0 -0
  12. {crewplus-0.2.38 → crewplus-0.2.40}/crewplus/services/model_load_balancer.py +0 -0
  13. {crewplus-0.2.38 → crewplus-0.2.40}/crewplus/services/tracing_manager.py +0 -0
  14. {crewplus-0.2.38 → crewplus-0.2.40}/crewplus/utils/__init__.py +0 -0
  15. {crewplus-0.2.38 → crewplus-0.2.40}/crewplus/utils/schema_action.py +0 -0
  16. {crewplus-0.2.38 → crewplus-0.2.40}/crewplus/utils/schema_document_updater.py +0 -0
  17. {crewplus-0.2.38 → crewplus-0.2.40}/crewplus/vectorstores/milvus/__init__.py +0 -0
  18. {crewplus-0.2.38 → crewplus-0.2.40}/crewplus/vectorstores/milvus/milvus_schema_manager.py +0 -0
  19. {crewplus-0.2.38 → crewplus-0.2.40}/crewplus/vectorstores/milvus/vdb_service.py +0 -0
  20. {crewplus-0.2.38 → crewplus-0.2.40}/docs/GeminiChatModel.md +0 -0
  21. {crewplus-0.2.38 → crewplus-0.2.40}/docs/ModelLoadBalancer.md +0 -0
  22. {crewplus-0.2.38 → crewplus-0.2.40}/docs/VDBService.md +0 -0
  23. {crewplus-0.2.38 → crewplus-0.2.40}/docs/index.md +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: crewplus
3
- Version: 0.2.38
3
+ Version: 0.2.40
4
4
  Summary: Base services for CrewPlus AI applications
5
5
  Author-Email: Tim Liu <tim@opsmateai.com>
6
6
  License: MIT
@@ -0,0 +1,563 @@
1
+ from typing import List, Optional
2
+ import logging
3
+ import json
4
+ import asyncio
5
+
6
+ from pymilvus import DataType
7
+ from langchain_milvus import Milvus
8
+ from langchain_core.documents import Document
9
+ from ...utils.schema_document_updater import SchemaDocumentUpdater
10
+ from ...utils.schema_action import Action
11
+ from .milvus_schema_manager import MilvusSchemaManager
12
+
13
+ DEFAULT_SCHEMA = """
14
+ {
15
+ "node_types": {
16
+ "Document": {
17
+ "properties": {
18
+ "pk": {
19
+ "type": "INT64",
20
+ "is_primary": true,
21
+ "auto_id": true
22
+ },
23
+ "vector": {
24
+ "type": "FLOAT_VECTOR",
25
+ "dim": 1536
26
+ },
27
+ "text": {
28
+ "type": "VARCHAR",
29
+ "max_length": 65535,
30
+ "description": "The core text of the memory. This could be a user query, a documented fact, a procedural step, or a log of an event."
31
+ }
32
+ }
33
+ }
34
+ }
35
+ }
36
+ """
37
+
38
class SchemaMilvus(Milvus):
    """
    Milvus vector store with schema management and in-place metadata updates.

    SchemaMilvus is a subclass of the Milvus class from langchain_milvus. It can
    create/drop a collection from a JSON schema and update the metadata of
    already-stored documents through a direct client upsert, so vectors are
    written back as stored instead of being re-embedded.

    Attributes:
        logger: Logger used for all diagnostics emitted by this class.
        collection_schema: Schema JSON string set through set_schema(), or None.
        schema_manager: MilvusSchemaManager bound to this store's client.
    """

    # Maximum number of documents fetched by the non-iterator update methods.
    _METADATA_QUERY_LIMIT = 5000
    # Page size used by the query_iterator based update methods.
    _ITERATOR_BATCH_SIZE = 1000

    def __init__(
        self,
        embedding_function,
        collection_name,
        connection_args,
        index_params=None,
        auto_id=True,
        primary_field="pk",
        text_field: str = "text",
        vector_field=["vector"],
        consistency_level="Session",
        logger: Optional[logging.Logger] = None
    ):
        """
        Initializes the SchemaMilvus class with the provided parameters.

        Args:
            embedding_function: Embedding function used by the Milvus vector store.
            collection_name: Name of the collection in the Milvus vector store.
            connection_args: Connection arguments for the Milvus vector store.
            index_params: Index parameters for the Milvus vector store.
            auto_id: Flag to specify if auto ID generation is enabled.
            primary_field: The primary field of the collection.
            text_field: The text field of the collection.
            vector_field: The vector field (name or list of names) of the collection.
            consistency_level: The consistency level for the Milvus vector store.
            logger: Optional logger instance. Defaults to logging.getLogger(__name__).
        """
        if isinstance(vector_field, list):
            # The default list object is shared by every call; hand the parent
            # a private copy so no instance can alter it for the others.
            vector_field = list(vector_field)

        super().__init__(
            embedding_function=embedding_function,
            collection_name=collection_name,
            connection_args=connection_args,
            index_params=index_params,
            auto_id=auto_id,
            primary_field=primary_field,
            text_field=text_field,
            vector_field=vector_field,
            consistency_level=consistency_level
        )
        self.logger = logger or logging.getLogger(__name__)
        self.collection_schema = None
        self.schema_manager = MilvusSchemaManager(client=self.client)

    def set_schema(self, schema: str):
        """
        Sets the collection schema.

        Args:
            schema: The schema JSON string.
        """
        self.collection_schema = schema

    def get_fields(self, collection_name: Optional[str] = None) -> Optional[List[str]]:
        """
        Returns the names of the collection's fields, excluding FLOAT_VECTOR fields.

        Args:
            collection_name: The name of the collection to describe. If None, use self.collection_name.

        Returns:
            List[str] | None: Field names whose type is not DataType.FLOAT_VECTOR
            (the primary key and text fields ARE included), or None if no
            collection name is available or describing the collection fails.
        """
        if collection_name is None:
            collection_name = self.collection_name
        if collection_name is None:
            return None

        try:
            schema = self.client.describe_collection(collection_name)
            return [
                field["name"]
                for field in schema["fields"]
                if field["type"] != DataType.FLOAT_VECTOR
            ]
        except Exception as e:
            # Best effort: callers treat None as "schema unavailable".
            self.logger.warning(f"Failed to retrieve schema fields: {e}")
            return None

    def create_collection(self) -> bool:
        """
        Validates the schema and creates the collection using the MilvusSchemaManager.

        Returns:
            bool: True if the collection is successfully created, False if no
            schema is set, validation fails, or creation raises.
        """
        if self.collection_schema is None:
            self.logger.error("Collection schema is not set. Please set a schema using set_schema().")
            return False

        self.schema_manager.bind_client(self.client)
        if not self.schema_manager.validate_schema(self.collection_schema):
            self.logger.error("Failed to validate schema")
            return False

        try:
            self.schema_manager.create_collection(self.collection_name, self.collection_schema)
        except Exception as e:
            self.logger.error(f"Failed to create collection: {e}")
            return False

        self.logger.info(f"Collection {self.collection_name} created successfully")
        return True

    def drop_collection(self, collection_name: Optional[str] = None) -> bool:
        """
        Drops the collection using the Milvus client.

        Args:
            collection_name: Collection to drop. If None, use self.collection_name.

        Returns:
            bool: True if the collection is successfully dropped, False otherwise.
        """
        if collection_name is None:
            collection_name = self.collection_name

        try:
            self.client.drop_collection(collection_name)
        except Exception as e:
            # Report the collection that was actually targeted, which may
            # differ from self.collection_name.
            self.logger.error(f"Failed to drop collection {collection_name}: {e}")
            return False

        self.logger.info(f"Collection {collection_name} dropped successfully")
        return True

    @staticmethod
    def _parse_metadata(metadata: str) -> dict:
        """
        Decodes the metadata JSON string into a dictionary.

        Raises:
            ValueError: If the string is not valid JSON or does not decode to
                a JSON object.
        """
        try:
            parsed = json.loads(metadata)
        except json.JSONDecodeError as err:
            raise ValueError("Invalid JSON string for metadata") from err
        if not isinstance(parsed, dict):
            # Every update action iterates key/value pairs of the payload.
            raise ValueError("Metadata JSON must decode to an object")
        return parsed

    def _fields_for_update(self) -> List[str]:
        """
        Builds the output-field list used when reading documents for an update.

        The rows read back are passed unchanged to client.upsert, so the list
        contains the schema fields plus the primary key, text and vector
        fields, each at most once.
        """
        fields = list(self.get_fields() or [])
        required = [
            getattr(self, "primary_field", "pk"),
            getattr(self, "_text_field", "text"),
        ]
        vector_field = self._vector_field
        required.extend(vector_field if isinstance(vector_field, list) else [vector_field])
        for name in required:
            if name not in fields:
                fields.append(name)
        return fields

    def _handle_upsert(self, doc: Document, metadata_dict: dict) -> Document:
        """
        Handles the UPSERT action for a single document by merging metadata.

        Dictionary values are merged into existing dictionary fields (two
        levels deep); any other value replaces the stored one. The primary key
        and text fields are never modified. An existing string value is parsed
        as JSON before a dictionary is merged into it.

        Args:
            doc: The document whose metadata is updated in place.
            metadata_dict: The new metadata.

        Returns:
            The same Document object with merged metadata.
        """
        existing_metadata = doc.metadata
        protected = (self.primary_field, self.text_field)

        for key, value in metadata_dict.items():
            # Skip primary key and text fields to prevent modification.
            if key in protected:
                continue

            if not isinstance(value, dict):
                # Regular field: update or add the value.
                existing_metadata[key] = value
                continue

            # JSON object field: the stored value may be a JSON string.
            if isinstance(existing_metadata.get(key), str):
                try:
                    existing_metadata[key] = json.loads(existing_metadata[key])
                except json.JSONDecodeError:
                    self.logger.warning(f"Field '{key}' could not be parsed as JSON. Overwriting as a new dict.")
                    existing_metadata[key] = {}

            current = existing_metadata.get(key)
            if not isinstance(current, dict):
                # Missing, or present but not a dictionary: take the new dict.
                existing_metadata[key] = value
                continue

            for sub_key, sub_value in value.items():
                nested = current.get(sub_key)
                if isinstance(sub_value, dict) and isinstance(nested, dict):
                    nested.update(sub_value)
                else:
                    # Also covers an existing non-dict sub-value, which
                    # cannot be merged into and is replaced instead.
                    current[sub_key] = sub_value

        doc.metadata = existing_metadata
        return doc

    def _process_document_update(self, doc: Document, metadata_dict: dict, action: Action) -> Document:
        """
        Applies the specified update operation to a single document.

        Args:
            doc: The Document object to be updated.
            metadata_dict: A dictionary containing the new data.
            action: The type of operation to perform (UPSERT, DELETE, UPDATE, INSERT).

        Returns:
            The updated Document object, with its original primary key and
            text values restored when they were present before the update.
        """
        pk_value = doc.metadata.get(self.primary_field)
        text_value = doc.metadata.get(self.text_field)

        if action == Action.UPSERT:
            doc = self._handle_upsert(doc, metadata_dict)
        elif action == Action.DELETE:
            doc = SchemaDocumentUpdater.delete_document_metadata(doc, list(metadata_dict))
        elif action == Action.UPDATE:
            # Only keys (and nested keys) that already exist are touched.
            existing_metadata = doc.metadata
            update_dict = {}
            for key, value in metadata_dict.items():
                if key not in existing_metadata:
                    continue
                current = existing_metadata[key]
                if isinstance(value, dict) and isinstance(current, dict):
                    merged = current.copy()
                    merged.update({k: v for k, v in value.items() if k in current})
                    update_dict[key] = merged
                else:
                    update_dict[key] = value
            doc = SchemaDocumentUpdater.update_document_metadata(doc, update_dict)
        elif action == Action.INSERT:
            # Every supplied value replaces the stored one, except the
            # configured primary key and text fields.
            protected = (self.primary_field, self.text_field)
            for key, value in metadata_dict.items():
                if key not in protected:
                    doc.metadata[key] = value

        if pk_value is not None:
            doc.metadata[self.primary_field] = pk_value
        if text_value is not None:
            doc.metadata[self.text_field] = text_value

        return doc

    def _prepare_metadata_update(self, expr: str, metadata_dict: dict, action: Action) -> List[Document]:
        """
        Fetches the documents matching expr and applies action to each one.

        Performs blocking client calls; the async entry point runs it in a
        worker thread.
        """
        limit = self._METADATA_QUERY_LIMIT
        documents = self.search_by_metadata(expr, fields=self._fields_for_update(), limit=limit)
        if len(documents) >= limit:
            self.logger.warning(
                f"Query returned {len(documents)} documents, the maximum of {limit}; "
                "additional matches may exist and are not updated."
            )
        # NOTE(review): the upsert payload is doc.metadata; if search_by_metadata
        # moves the text field into page_content, the payload has no text value.
        # Confirm against the langchain_milvus version in use.
        return [self._process_document_update(doc, metadata_dict, action) for doc in documents]

    def update_documents_metadata(self, expr: str, metadata: str, action: Action = Action.UPSERT) -> List[Document]:
        """
        Updates the metadata of documents in the Milvus vector store based on the provided expression.
        This method uses a direct client upsert to avoid re-embedding vectors.

        Args:
            expr: Expression to filter the target documents.
            metadata: New metadata (JSON object string) to update the documents with.
            action: The action to perform on the document metadata.

        Returns:
            List of updated documents (at most 5000 per call).

        Raises:
            ValueError: If metadata is not a valid JSON object string.
        """
        metadata_dict = self._parse_metadata(metadata)
        updated_documents = self._prepare_metadata_update(expr, metadata_dict, action)

        if updated_documents:
            self.logger.debug(f"Upserting {len(updated_documents)} documents using direct client upsert.")
            self.client.upsert(
                collection_name=self.collection_name,
                data=[doc.metadata for doc in updated_documents]
            )

        return updated_documents

    async def aupdate_documents_metadata(self, expr: str, metadata: str, action: Action = Action.UPSERT) -> List[Document]:
        """
        Asynchronously updates the metadata of documents in the Milvus vector store.
        This method uses a direct client upsert to avoid re-embedding vectors.

        Both the query and the upsert are blocking client calls, so each runs
        in a worker thread instead of on the event loop.

        Args:
            expr: Expression to filter the target documents.
            metadata: New metadata (JSON object string) to update the documents with.
            action: The action to perform on the document metadata.

        Returns:
            List of updated documents (at most 5000 per call).

        Raises:
            ValueError: If metadata is not a valid JSON object string.
        """
        metadata_dict = self._parse_metadata(metadata)
        updated_documents = await asyncio.to_thread(
            self._prepare_metadata_update, expr, metadata_dict, action
        )

        if updated_documents:
            self.logger.debug(f"Upserting {len(updated_documents)} documents using direct client upsert.")
            await asyncio.to_thread(
                self.client.upsert,
                collection_name=self.collection_name,
                data=[doc.metadata for doc in updated_documents]
            )

        return updated_documents

    def update_documents_metadata_by_iterator(self, expr: str, metadata: str, action:Action=Action.UPSERT) -> List[Document]:
        """
        Updates document metadata, reading the documents with Collection.query_iterator.

        The update logic (UPSERT/DELETE/UPDATE/INSERT) is the same as in
        update_documents_metadata; only the way documents are fetched differs:
        they are read and upserted in batches, so the result is not capped by
        a query limit.

        Args:
            expr: Expression to filter the target documents.
            metadata: New metadata (JSON object string) to update the documents with.
            action: The action to perform on the document metadata.

        Returns:
            List of all updated documents across batches.

        Raises:
            ValueError: If metadata is not a valid JSON object string.
        """
        metadata_dict = self._parse_metadata(metadata)

        # client.upsert is given complete rows, so the primary key, text and
        # vector fields are fetched together with the other fields.
        fields = self._fields_for_update()
        text_field = getattr(self, "_text_field", "text")

        total_updated_documents = []
        batch_size = self._ITERATOR_BATCH_SIZE

        self.logger.info(f"Starting metadata update using 'collection.query_iterator' with batch size {batch_size}.")

        iterator = self.col.query_iterator(
            batch_size=batch_size,
            expr=expr,
            output_fields=fields
        )

        batch_i = 0
        try:
            while True:
                batch_results = iterator.next()
                if not batch_results:
                    break  # iteration finished

                batch_i += 1
                self.logger.info(f"Processing batch {batch_i} of {len(batch_results)} documents.")

                # Wrap the row dicts returned by Milvus as Document objects.
                documents = [
                    Document(page_content=result.get(text_field, ""), metadata=result)
                    for result in batch_results
                ]
                updated_documents_in_batch = [
                    self._process_document_update(doc, metadata_dict, action) for doc in documents
                ]

                self.logger.debug(f"Upserting batch of {len(updated_documents_in_batch)} documents using direct client upsert.")
                self.client.upsert(
                    collection_name=self.collection_name,
                    data=[doc.metadata for doc in updated_documents_in_batch]
                )
                total_updated_documents.extend(updated_documents_in_batch)
        finally:
            # Always close the iterator, even when a batch fails.
            self.logger.info("Closing iterator.")
            iterator.close()

        self.logger.info(f"Iterator processing complete. Total batches processed: {batch_i}.")
        return total_updated_documents

    @staticmethod
    def _rebase_folder_path(current: str, old_prefix: str, new_base: str) -> str:
        """
        Replaces the leading old_prefix of current with new_base, keeping the sub-path.

        Exactly one "/" separates the new base from a non-empty sub-path. A
        path equal to old_prefix maps to new_base unchanged; a path that is
        old_prefix plus only slashes keeps a single trailing slash.
        """
        remainder = current[len(old_prefix):]
        tail = remainder.lstrip("/")
        if tail:
            return f"{new_base.rstrip('/')}/{tail}"
        if remainder:
            return f"{new_base.rstrip('/')}/"
        return new_base

    def _with_rebased_folder_path(self, doc: Document, metadata_dict: dict, old_prefix: str) -> dict:
        """
        Returns the metadata to upsert for doc, with folder_path rebased when applicable.

        If metadata_dict carries version_metadata.folder_path and the
        document's stored folder_path starts with old_prefix, a copy of
        metadata_dict is returned whose folder_path is the stored path moved
        under the new base. Otherwise metadata_dict is returned unchanged, so
        the new base is written as-is. metadata_dict itself is never mutated.
        """
        version_update = metadata_dict.get("version_metadata")
        if not isinstance(version_update, dict) or "folder_path" not in version_update:
            return metadata_dict
        new_base = version_update["folder_path"]

        stored = doc.metadata.get("version_metadata")
        if isinstance(stored, str):
            # The JSON field may come back as a string.
            try:
                stored = json.loads(stored)
            except json.JSONDecodeError:
                stored = None
        current = stored.get("folder_path") if isinstance(stored, dict) else None

        if not (isinstance(current, str) and isinstance(new_base, str) and current.startswith(old_prefix)):
            return metadata_dict

        # NOTE(review): this is a plain string-prefix match, so with
        # old_prefix "/a/b" a path such as "/a/bc/d" is rewritten as well.
        # Confirm whether matching should stop at a "/" boundary.
        new_full_path = self._rebase_folder_path(current, old_prefix, new_base)
        self.logger.debug(f"Rewrote folder path from '{current}' to '{new_full_path}'")

        adjusted = dict(metadata_dict)
        adjusted["version_metadata"] = {**version_update, "folder_path": new_full_path}
        return adjusted

    def update_documents_metadata_folder_path(self, old_expr: str, metadata: str, action:Action=Action.UPSERT) -> List[Document]:
        """
        Updates version_metadata.folder_path with "move directory" semantics.

        1. Selects every document whose version_metadata["folder_path"] starts
           with old_expr.
        2. Reads the new base path from metadata (version_metadata.folder_path).
        3. Replaces the old_expr prefix of each stored folder_path with the new
           base path, keeping the remaining sub-path. All other keys in
           metadata are merged with the standard UPSERT rules.

        Only Action.UPSERT modifies data. For any other action the matching
        documents are returned unmodified and nothing is written.

        Args:
            old_expr: The folder path prefix being moved (a path, not a filter expression).
            metadata: New metadata (JSON object string) carrying the new base path.
            action: The action to perform; only UPSERT is implemented here.

        Returns:
            List of all matched documents, updated when action is UPSERT.

        Raises:
            ValueError: If metadata is not a valid JSON object string.
        """
        # Build a "starts with" filter. The prefix is embedded in a
        # double-quoted string literal, so backslashes and double quotes must
        # be escaped or the expression is malformed.
        # NOTE(review): '%' and '_' inside old_expr still act as LIKE
        # wildcards - confirm whether they need escaping for your Milvus version.
        escaped_prefix = old_expr.replace("\\", "\\\\").replace('"', '\\"')
        expr = f"version_metadata[\"folder_path\"] like \"{escaped_prefix}%\""

        metadata_dict = self._parse_metadata(metadata)

        # Fetch complete rows (primary key, text and vectors included) because
        # they are passed back to client.upsert.
        fields = self._fields_for_update()
        text_field = getattr(self, "_text_field", "text")

        total_updated_documents = []
        batch_size = self._ITERATOR_BATCH_SIZE
        is_upsert = action == Action.UPSERT

        self.logger.info(f"Starting folder path update using 'collection.query_iterator' with expr: {expr}")
        if not is_upsert:
            self.logger.warning(
                f"Action {action} is not supported for folder path updates; "
                "matching documents are returned unmodified."
            )

        iterator = self.col.query_iterator(
            batch_size=batch_size,
            expr=expr,
            output_fields=fields
        )

        batch_i = 0
        try:
            while True:
                batch_results = iterator.next()
                if not batch_results:
                    break

                batch_i += 1
                self.logger.info(f"Processing batch {batch_i} of {len(batch_results)} documents for folder path update.")

                documents = [
                    Document(page_content=result.get(text_field, ""), metadata=result)
                    for result in batch_results
                ]

                if is_upsert:
                    # Standard UPSERT merge, with folder_path rebased per document.
                    documents = [
                        self._handle_upsert(doc, self._with_rebased_folder_path(doc, metadata_dict, old_expr))
                        for doc in documents
                    ]
                    self.logger.debug(f"Upserting batch of {len(documents)} documents with updated folder paths.")
                    self.client.upsert(
                        collection_name=self.collection_name,
                        data=[d.metadata for d in documents]
                    )

                total_updated_documents.extend(documents)
        finally:
            self.logger.info("Closing folder path update iterator.")
            iterator.close()

        self.logger.info(f"Folder path update complete. Total batches processed: {batch_i}.")
        return total_updated_documents
@@ -6,7 +6,7 @@ build-backend = "pdm.backend"
6
6
 
7
7
  [project]
8
8
  name = "crewplus"
9
- version = "0.2.38"
9
+ version = "0.2.40"
10
10
  description = "Base services for CrewPlus AI applications"
11
11
  authors = [
12
12
  { name = "Tim Liu", email = "tim@opsmateai.com" },
@@ -1,277 +0,0 @@
1
- from typing import List, Optional
2
- import logging
3
- import json
4
-
5
- from pymilvus import DataType
6
- from langchain_milvus import Milvus
7
- from langchain_core.documents import Document
8
- from ...utils.schema_document_updater import SchemaDocumentUpdater
9
- from ...utils.schema_action import Action
10
- from .milvus_schema_manager import MilvusSchemaManager
11
-
12
- DEFAULT_SCHEMA = """
13
- {
14
- "node_types": {
15
- "Document": {
16
- "properties": {
17
- "pk": {
18
- "type": "INT64",
19
- "is_primary": true,
20
- "auto_id": true
21
- },
22
- "vector": {
23
- "type": "FLOAT_VECTOR",
24
- "dim": 1536
25
- },
26
- "text": {
27
- "type": "VARCHAR",
28
- "max_length": 65535,
29
- "description": "The core text of the memory. This could be a user query, a documented fact, a procedural step, or a log of an event."
30
- }
31
- }
32
- }
33
- }
34
- }
35
- """
36
-
37
- class SchemaMilvus(Milvus):
38
- """
39
- SchemaMilvus is a subclass of the Milvus class from langchain_milvus. This class is responsible for updating metadata of documents in a Milvus vector store.
40
-
41
- Attributes:
42
- embedding_function: Embedding function used by the Milvus vector store.
43
- collection_name: Name of the collection in the Milvus vector store.
44
- connection_args: Connection arguments for the Milvus vector store.
45
- index_params: Index parameters for the Milvus vector store.
46
- auto_id: Flag to specify if auto ID generation is enabled.
47
- primary_field: The primary field of the collection.
48
- vector_field: The vector field of the collection.
49
- consistency_level: The consistency level for the Milvus vector store.
50
- collection_schema: Schema JSON string associated with the Milvus existing collection name.
51
- """
52
- def __init__(
53
- self,
54
- embedding_function,
55
- collection_name,
56
- connection_args,
57
- index_params=None,
58
- auto_id=True,
59
- primary_field="pk",
60
- text_field: str = "text",
61
- vector_field=["vector"],
62
- consistency_level="Session",
63
- logger: Optional[logging.Logger] = None
64
- ):
65
- """
66
- Initializes the SchemaMilvus class with the provided parameters.
67
-
68
- Args:
69
- embedding_function: Embedding function used by the Milvus vector store.
70
- collection_name: Name of the collection in the Milvus vector store.
71
- connection_args: Connection arguments for the Milvus vector store.
72
- index_params: Index parameters for the Milvus vector store.
73
- auto_id: Flag to specify if auto ID generation is enabled.
74
- primary_field: The primary field of the collection.
75
- text_field: The text field of the collection.
76
- vector_field: The vector field of the collection.
77
- consistency_level: The consistency level for the Milvus vector store.
78
- logger: Optional logger instance. If not provided, a default logger is created.
79
- """
80
- super().__init__(
81
- embedding_function=embedding_function,
82
- collection_name=collection_name,
83
- connection_args=connection_args,
84
- index_params=index_params,
85
- auto_id=auto_id,
86
- primary_field=primary_field,
87
- text_field=text_field,
88
- vector_field=vector_field,
89
- consistency_level=consistency_level
90
- )
91
- self.logger = logger or logging.getLogger(__name__)
92
- self.collection_schema = None
93
- self.schema_manager = MilvusSchemaManager(client=self.client)
94
-
95
- def set_schema(self, schema: str):
96
- """
97
- Sets the collection schema.
98
-
99
- Args:
100
- schema: The schema JSON string.
101
- """
102
- self.collection_schema = schema
103
-
104
- def get_fields(self, collection_name: Optional[str] = None) -> Optional[List[str]]:
105
- """
106
- Retrieves and returns the fields from the collection schema.
107
-
108
- Args:
109
- collection_name: The name of the collection to describe. If None, use self.collection_name.
110
-
111
- Returns:
112
- List[str] | None: The list of field names from the collection schema (excluding vector and text fields), or None if collection_name is not provided or an error occurs.
113
- """
114
- if collection_name is None:
115
- collection_name = self.collection_name
116
- if collection_name is None:
117
- return None
118
-
119
- try:
120
- schema = self.client.describe_collection(collection_name)
121
- fields = [field["name"] for field in schema["fields"] if field["type"] != DataType.FLOAT_VECTOR ]
122
- return fields
123
- except Exception as e:
124
- self.logger.warning(f"Failed to retrieve schema fields: {e}")
125
- return None
126
-
127
- def create_collection(self) -> bool:
128
- """
129
- Validates the schema and creates the collection using the MilvusSchemaManager.
130
-
131
- Returns:
132
- bool: True if the collection is successfully created, False otherwise.
133
- """
134
- if self.collection_schema is None:
135
- self.logger.error("Collection schema is not set. Please set a schema using set_schema().")
136
- return False
137
-
138
- self.schema_manager.bind_client(self.client)
139
- if not self.schema_manager.validate_schema(self.collection_schema):
140
- self.logger.error("Failed to validate schema")
141
- return False
142
- try:
143
- self.schema_manager.create_collection(self.collection_name, self.collection_schema)
144
- self.logger.info(f"Collection {self.collection_name} created successfully")
145
-
146
- return True
147
- except Exception as e:
148
- self.logger.error(f"Failed to create collection: {e}")
149
- return False
150
-
151
- def drop_collection(self, collection_name: Optional[str] = None) -> bool:
152
- """
153
- Drops the collection using the Milvus client.
154
-
155
- Returns:
156
- bool: True if the collection is successfully dropped, False otherwise.
157
- """
158
- if collection_name is None:
159
- collection_name = self.collection_name
160
-
161
- try:
162
- self.client.drop_collection(collection_name)
163
- self.logger.info(f"Collection {collection_name} dropped successfully")
164
- return True
165
- except Exception as e:
166
- self.logger.error(f"Failed to drop collection {self.collection_name}: {e}")
167
- return False
168
-
169
- def _handle_upsert(self, doc: Document, metadata_dict: dict) -> Document:
170
- """
171
- Handles the UPSERT action for a single document by merging metadata.
172
- """
173
- existing_metadata = doc.metadata
174
- for key, value in metadata_dict.items():
175
- # Skip primary key and text fields to prevent modification.
176
- if key in [self.primary_field, self.text_field]:
177
- continue
178
-
179
- if isinstance(value, dict):
180
- # If the new value is a dictionary, handle nested updates.
181
- if key not in existing_metadata or not isinstance(existing_metadata.get(key), dict):
182
- # If the key doesn't exist or its value is not a dict, replace it.
183
- existing_metadata[key] = value
184
- else:
185
- # If both are dictionaries, recursively update the nested fields.
186
- for sub_key, sub_value in value.items():
187
- if isinstance(sub_value, dict) and sub_key in existing_metadata[key] and isinstance(existing_metadata[key].get(sub_key), dict):
188
- existing_metadata[key][sub_key].update(sub_value)
189
- else:
190
- existing_metadata[key][sub_key] = sub_value
191
- else:
192
- # For non-dictionary values, simply update or add the field.
193
- existing_metadata[key] = value
194
-
195
- doc.metadata = existing_metadata
196
- return doc
197
-
198
- def update_documents_metadata(self, expr: str, metadata: str,action:Action=Action.UPSERT) -> List[Document]:
199
- """
200
- Updates the metadata of documents in the Milvus vector store based on the provided expression.
201
-
202
- Args:
203
- expr: Expression to filter the target documents.
204
- metadata: New metadata to update the documents with.
205
-
206
- Returns:
207
- List of updated documents.
208
- """
209
- try:
210
- metadata_dict = json.loads(metadata)
211
- except json.JSONDecodeError:
212
- raise ValueError("Invalid JSON string for metadata")
213
-
214
- # Retrieve documents that match the filter expression.
215
- fields = self.get_fields()
216
- documents = self.search_by_metadata(expr, fields=fields, limit=5000)
217
-
218
- updated_documents = []
219
- for doc in documents:
220
- # Preserve the original primary key and text values.
221
- pk_value = doc.metadata.get(self.primary_field) # default to pk
222
- text_value = doc.metadata.get(self.text_field)
223
-
224
- # Apply the specified action to update the document's metadata.
225
- if action == Action.UPSERT:
226
- doc = self._handle_upsert(doc, metadata_dict)
227
- elif action == Action.DELETE:
228
- keys_to_delete = metadata_dict.keys()
229
- doc = SchemaDocumentUpdater.delete_document_metadata(doc, list(keys_to_delete))
230
- elif action == Action.UPDATE:
231
- existing_metadata = doc.metadata
232
- update_dict = {}
233
- for key, value in metadata_dict.items():
234
- if key in existing_metadata:
235
- if isinstance(value, dict) and isinstance(existing_metadata[key], dict):
236
- merged = existing_metadata[key].copy()
237
- for sub_key, sub_value in value.items():
238
- if sub_key in merged:
239
- merged[sub_key] = sub_value
240
- update_dict[key] = merged
241
- else:
242
- update_dict[key] = value
243
- doc = SchemaDocumentUpdater.update_document_metadata(doc, update_dict)
244
- elif action == Action.INSERT:
245
- existing_metadata = doc.metadata
246
- for key, value in metadata_dict.items():
247
- if key in ['pk', 'text']:
248
- continue
249
-
250
- if isinstance(value, dict) and key in existing_metadata and isinstance(existing_metadata[key], dict):
251
- existing_metadata[key] = {}
252
- existing_metadata[key] = value
253
- else:
254
- existing_metadata[key] = value
255
- doc.metadata = existing_metadata
256
-
257
- # Restore the primary key and text values to ensure they are not lost.
258
- if pk_value is not None:
259
- doc.metadata[self.primary_field] = pk_value
260
- if text_value is not None:
261
- doc.metadata[self.text_field] = text_value
262
-
263
- updated_documents.append(doc)
264
-
265
- # Extract the primary keys for the upsert operation.
266
- updated_ids = [doc.metadata[self.primary_field] for doc in updated_documents]
267
-
268
- # Remove primary key and text from metadata before upserting,
269
- # as they are handled separately by the vector store.
270
- for doc in updated_documents:
271
- doc.metadata.pop(self.primary_field, None)
272
- doc.metadata.pop(self.text_field, None)
273
-
274
- # Perform the upsert operation to update the documents in the collection.
275
- self.upsert(ids=updated_ids, documents=updated_documents)
276
-
277
- return updated_documents
File without changes
File without changes
File without changes
File without changes