crewplus 0.2.39__py3-none-any.whl → 0.2.41__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of crewplus might be problematic.

@@ -0,0 +1 @@
+ # This file makes the 'callbacks' directory a Python package.
@@ -0,0 +1,95 @@
+ # File: crewplus/callbacks/async_langfuse_handler.py
+ import asyncio
+ import contextvars
+ from contextlib import contextmanager
+ from typing import Any, Dict, List, Union
+
+ try:
+     from langfuse.langchain import CallbackHandler as LangfuseCallbackHandler
+     from langchain_core.callbacks import AsyncCallbackHandler
+     from langchain_core.outputs import LLMResult
+     LANGFUSE_AVAILABLE = True
+ except ImportError:
+     LANGFUSE_AVAILABLE = False
+     LangfuseCallbackHandler = None
+     AsyncCallbackHandler = object
+
+ # This token is a simple flag to indicate that we are in an async context.
+ # We use a context variable to make it available only within the async task.
+ _ASYNC_CONTEXT_TOKEN = "in_async_context"
+ in_async_context = contextvars.ContextVar(_ASYNC_CONTEXT_TOKEN, default=False)
+
+ @contextmanager
+ def async_context():
+     """A context manager to signal that we are in an async execution context."""
+     token = in_async_context.set(True)
+     try:
+         yield
+     finally:
+         in_async_context.reset(token)
+
+ class AsyncLangfuseCallbackHandler(AsyncCallbackHandler):
+     """
+     Wraps the synchronous LangfuseCallbackHandler to make it compatible with
+     LangChain's async methods.
+
+     This works by running the synchronous handler's methods in a separate thread
+     using `asyncio.to_thread`, which copies the caller's `contextvars` into the
+     worker thread. This solves the OpenTelemetry error
+     `ValueError: <Token ...> was created in a different Context`.
+     """
+     def __init__(self, *args: Any, **kwargs: Any):
+         if not LANGFUSE_AVAILABLE:
+             raise ImportError("Langfuse is not available. Please install it with 'pip install langfuse'")
+         self.sync_handler = LangfuseCallbackHandler(*args, **kwargs)
+
+     def __getattr__(self, name: str) -> Any:
+         # Delegate any other attribute access to the sync handler.
+         return getattr(self.sync_handler, name)
+
+     async def on_llm_start(
+         self, serialized: Dict[str, Any], prompts: List[str], **kwargs: Any
+     ) -> None:
+         await asyncio.to_thread(
+             self.sync_handler.on_llm_start, serialized, prompts, **kwargs
+         )
+
+     async def on_llm_end(self, response: LLMResult, **kwargs: Any) -> None:
+         await asyncio.to_thread(
+             self.sync_handler.on_llm_end, response, **kwargs
+         )
+
+     async def on_llm_error(self, error: Union[Exception, KeyboardInterrupt], **kwargs: Any) -> None:
+         await asyncio.to_thread(
+             self.sync_handler.on_llm_error, error, **kwargs
+         )
+
+     async def on_tool_start(self, serialized: Dict[str, Any], input_str: str, **kwargs: Any) -> Any:
+         await asyncio.to_thread(
+             self.sync_handler.on_tool_start, serialized, input_str, **kwargs
+         )
+
+     async def on_tool_end(self, output: str, **kwargs: Any) -> Any:
+         await asyncio.to_thread(
+             self.sync_handler.on_tool_end, output, **kwargs
+         )
+
+     async def on_tool_error(self, error: Union[Exception, KeyboardInterrupt], **kwargs: Any) -> Any:
+         await asyncio.to_thread(
+             self.sync_handler.on_tool_error, error, **kwargs
+         )
+
+     async def on_chain_start(self, serialized: Dict[str, Any], inputs: Dict[str, Any], **kwargs: Any) -> Any:
+         await asyncio.to_thread(
+             self.sync_handler.on_chain_start, serialized, inputs, **kwargs
+         )
+
+     async def on_chain_end(self, outputs: Dict[str, Any], **kwargs: Any) -> Any:
+         await asyncio.to_thread(
+             self.sync_handler.on_chain_end, outputs, **kwargs
+         )
+
+     async def on_chain_error(self, error: Union[Exception, KeyboardInterrupt], **kwargs: Any) -> Any:
+         await asyncio.to_thread(
+             self.sync_handler.on_chain_error, error, **kwargs
+         )
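The docstring above leans on the fact that `asyncio.to_thread` copies the caller's `contextvars` into the worker thread. A minimal, self-contained sketch of that mechanism (independent of Langfuse, names chosen for illustration) shows why the wrapper sidesteps the OpenTelemetry token error:

```python
# Demonstrates the propagation that AsyncLangfuseCallbackHandler relies on:
# a contextvar set inside an async task is visible to sync code moved onto a
# worker thread via asyncio.to_thread.
import asyncio
import contextvars

flag = contextvars.ContextVar("flag", default=False)

def sync_callback() -> bool:
    # Runs on a worker thread, but still sees the caller's context snapshot.
    return flag.get()

async def main() -> None:
    flag.set(True)
    seen = await asyncio.to_thread(sync_callback)
    print(f"worker thread saw flag={seen}")  # -> True

asyncio.run(main())
```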
@@ -5,6 +5,7 @@ from typing import Any, Optional
  from langchain_openai.chat_models.azure import AzureChatOpenAI
  from pydantic import Field
  from .tracing_manager import TracingManager, TracingContext
+ from ..callbacks.async_langfuse_handler import async_context
 
  class TracedAzureChatOpenAI(AzureChatOpenAI):
      """
@@ -106,8 +107,9 @@ class TracedAzureChatOpenAI(AzureChatOpenAI):
          return super().invoke(input, config=config, **kwargs)
 
      async def ainvoke(self, input, config=None, **kwargs):
-         config = self._tracing_manager.add_callbacks_to_config(config)
-         return await super().ainvoke(input, config=config, **kwargs)
+         with async_context():
+             config = self._tracing_manager.add_callbacks_to_config(config)
+             return await super().ainvoke(input, config=config, **kwargs)
 
      def stream(self, input, config=None, **kwargs):
          # Add stream_options to get usage data for Langfuse
@@ -124,6 +126,7 @@ class TracedAzureChatOpenAI(AzureChatOpenAI):
              stream_options["include_usage"] = True
          kwargs["stream_options"] = stream_options
 
-         config = self._tracing_manager.add_callbacks_to_config(config)
-         async for chunk in super().astream(input, config=config, **kwargs):
-             yield chunk
+         with async_context():
+             config = self._tracing_manager.add_callbacks_to_config(config)
+             async for chunk in super().astream(input, config=config, **kwargs):
+                 yield chunk
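For reference, a hedged sketch of how the patched async path might be exercised. The deployment name, endpoint, and API version below are placeholders, not values from this diff, and Azure credentials are assumed to come from the environment (e.g. `AZURE_OPENAI_API_KEY`):

```python
# Hypothetical caller: ainvoke/astream now enter async_context() internally,
# so TracingManager can hand out the async-safe Langfuse handler automatically.
import asyncio
from crewplus.services.azure_chat_model import TracedAzureChatOpenAI

async def main() -> None:
    llm = TracedAzureChatOpenAI(
        azure_deployment="gpt-4o",                          # placeholder
        azure_endpoint="https://example.openai.azure.com",  # placeholder
        api_version="2024-02-01",                           # placeholder
    )
    reply = await llm.ainvoke("Say hello.")
    print(reply.content)

asyncio.run(main())
```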
@@ -8,10 +8,13 @@ import logging
  # even if the langfuse library is not installed.
  try:
      from langfuse.langchain import CallbackHandler as LangfuseCallbackHandler
+     from ..callbacks.async_langfuse_handler import AsyncLangfuseCallbackHandler, in_async_context
      LANGFUSE_AVAILABLE = True
  except ImportError:
      LANGFUSE_AVAILABLE = False
      LangfuseCallbackHandler = None
+     AsyncLangfuseCallbackHandler = None
+     in_async_context = None
 
  class TracingContext(Protocol):
      """
@@ -65,7 +68,8 @@ class TracingManager:
          to the TracingContext protocol.
          """
          self.context = context
-         self._handlers: List[Any] = []
+         self._sync_handlers: List[Any] = []
+         self._async_handlers: List[Any] = []
          self._initialize_handlers()
 
      def _initialize_handlers(self):
@@ -73,7 +77,8 @@ class TracingManager:
          Initializes all supported tracing handlers. This is the central point
          for adding new observability tools.
          """
-         self._handlers = []
+         self._sync_handlers = []
+         self._async_handlers = []
          self._initialize_langfuse()
          # To add a new handler (e.g., Helicone), you would add a call to
          # self._initialize_helicone() here.
@@ -94,8 +99,14 @@ class TracingManager:
 
          if enable_langfuse:
              try:
-                 handler = LangfuseCallbackHandler()
-                 self._handlers.append(handler)
+                 # Create both sync and async handlers. We'll pick one at runtime.
+                 sync_handler = LangfuseCallbackHandler()
+                 self._sync_handlers.append(sync_handler)
+
+                 if AsyncLangfuseCallbackHandler:
+                     async_handler = AsyncLangfuseCallbackHandler()
+                     self._async_handlers.append(async_handler)
+
                  self.context.logger.info(f"Langfuse tracing enabled for {self.context.get_model_identifier()}")
              except Exception as e:
                  self.context.logger.warning(f"Failed to initialize Langfuse: {e}")
@@ -118,15 +129,19 @@ class TracingManager:
          if config is None:
              config = {}
 
+         # Decide which handlers to use based on the async context flag.
+         is_async = in_async_context.get() if in_async_context else False
+         handlers = self._async_handlers if is_async else self._sync_handlers
+
          # Respect a global disable flag for this specific call.
-         if not self._handlers or config.get("metadata", {}).get("tracing_disabled"):
+         if not handlers or config.get("metadata", {}).get("tracing_disabled"):
              return config
 
          callbacks = config.get("callbacks")
 
          # Case 1: The 'callbacks' key holds a CallbackManager instance
          if hasattr(callbacks, 'add_handler') and hasattr(callbacks, 'handlers'):
-             for handler in self._handlers:
+             for handler in handlers:
                  if not any(isinstance(cb, type(handler)) for cb in callbacks.handlers):
                      callbacks.add_handler(handler, inherit=True)
              return config  # Return the original, now-mutated config
@@ -135,7 +150,7 @@ class TracingManager:
          current_callbacks = callbacks or []
          new_callbacks = list(current_callbacks)
 
-         for handler in self._handlers:
+         for handler in handlers:
              if not any(isinstance(cb, type(handler)) for cb in new_callbacks):
                  new_callbacks.append(handler)
 
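The selection rule added to `add_callbacks_to_config` can be sketched in isolation. `MiniTracingManager` below is a stand-in written for illustration only; it is not part of the package:

```python
# Stand-in for the runtime selection rule introduced above: one contextvar flag
# chooses between the sync and async handler lists.
import contextvars

in_async_context = contextvars.ContextVar("in_async_context", default=False)

class MiniTracingManager:
    def __init__(self, sync_handlers: list, async_handlers: list) -> None:
        self._sync_handlers = list(sync_handlers)
        self._async_handlers = list(async_handlers)

    def pick_handlers(self) -> list:
        # Async callers set the flag via async_context(); sync callers never do.
        return self._async_handlers if in_async_context.get() else self._sync_handlers

mgr = MiniTracingManager(["sync-handler"], ["async-handler"])
print(mgr.pick_handlers())   # ['sync-handler']
in_async_context.set(True)
print(mgr.pick_handlers())   # ['async-handler']
```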
@@ -1,6 +1,7 @@
  from typing import List, Optional
  import logging
  import json
+ import asyncio
 
  from pymilvus import DataType
  from langchain_milvus import Milvus
@@ -177,119 +178,142 @@ class SchemaMilvus(Milvus):
                  continue
 
              if isinstance(value, dict):
-                 # If the new value is a dictionary, handle nested updates.
-                 if key not in existing_metadata or not isinstance(existing_metadata.get(key), dict):
-                     # If the key doesn't exist or its value is not a dict, replace it.
+                 # If it's a JSON object field (e.g., plant_metadata),
+                 # check if the existing value is a string, and if so, try to parse it as a dictionary.
+                 if key in existing_metadata and isinstance(existing_metadata[key], str):
+                     try:
+                         existing_metadata[key] = json.loads(existing_metadata[key])
+                     except json.JSONDecodeError:
+                         # If the parsing fails, it may not be a valid JSON string; treat it as a regular string.
+                         self.logger.warning(f"Field '{key}' could not be parsed as JSON. Overwriting as a new dict.")
+                         existing_metadata[key] = {}
+
+                 if key not in existing_metadata:
+                     # If the field does not exist, add it.
                      existing_metadata[key] = value
-                 else:
-                     # If both are dictionaries, recursively update the nested fields.
+                 elif isinstance(existing_metadata[key], dict):
+                     # If the field exists and is a dictionary, recursively update the sub-fields.
                      for sub_key, sub_value in value.items():
-                         if isinstance(sub_value, dict) and sub_key in existing_metadata[key] and isinstance(existing_metadata[key].get(sub_key), dict):
-                             existing_metadata[key][sub_key].update(sub_value)
+                         if isinstance(sub_value, dict):
+                             # If the sub-field is also a dictionary, recursively process it.
+                             if sub_key not in existing_metadata[key]:
+                                 existing_metadata[key][sub_key] = sub_value
+                             else:
+                                 existing_metadata[key][sub_key].update(sub_value)
                          else:
+                             # If the sub-field is a regular value, update it.
                              existing_metadata[key][sub_key] = sub_value
+                 else:
+                     # If the field exists but is not a dictionary (e.g., a number or string), overwrite it with the new dictionary.
+                     existing_metadata[key] = value
              else:
-                 # For non-dictionary values, simply update or add the field.
+                 # If it's a regular field, update the value.
                  existing_metadata[key] = value
 
+         # Update the document's metadata.
          doc.metadata = existing_metadata
-         return doc
 
-     def _prepare_documents_for_update(self, expr: str, metadata: str, action: Action = Action.UPSERT) -> tuple[List[Document], List]:
-         try:
-             metadata_dict = json.loads(metadata)
-         except json.JSONDecodeError:
-             raise ValueError("Invalid JSON string for metadata")
+         return doc
 
-         # Retrieve documents that match the filter expression.
-         fields = self.get_fields()
-         documents = self.search_by_metadata(expr, fields=fields, limit=5000)
+     def _process_document_update(self, doc: Document, metadata_dict: dict, action: Action) -> Document:
+         """
+         Applies the specified update operation to a single document.
+
+         Args:
+             doc: The Document object to be updated.
+             metadata_dict: A dictionary containing the new data.
+             action: The type of operation to perform (UPSERT, DELETE, UPDATE, INSERT).
 
-         updated_documents = []
-         for doc in documents:
-             # Preserve the original primary key and text values.
-             pk_value = doc.metadata.get(self.primary_field)  # default to pk
-             text_value = doc.metadata.get(self.text_field)
-
-             # Apply the specified action to update the document's metadata.
-             if action == Action.UPSERT:
-                 doc = self._handle_upsert(doc, metadata_dict)
-             elif action == Action.DELETE:
-                 keys_to_delete = metadata_dict.keys()
-                 doc = SchemaDocumentUpdater.delete_document_metadata(doc, list(keys_to_delete))
-             elif action == Action.UPDATE:
-                 existing_metadata = doc.metadata
-                 update_dict = {}
-                 for key, value in metadata_dict.items():
-                     if key in existing_metadata:
-                         if isinstance(value, dict) and isinstance(existing_metadata.get(key), dict):
-                             merged = existing_metadata[key].copy()
-                             for sub_key, sub_value in value.items():
-                                 if sub_key in merged:
-                                     merged[sub_key] = sub_value
-                             update_dict[key] = merged
-                         else:
-                             update_dict[key] = value
-                 doc = SchemaDocumentUpdater.update_document_metadata(doc, update_dict)
-             elif action == Action.INSERT:
-                 existing_metadata = doc.metadata
-                 for key, value in metadata_dict.items():
-                     if key in ['pk', 'text']:
-                         continue
-
-                     if isinstance(value, dict) and key in existing_metadata and isinstance(existing_metadata.get(key), dict):
-                         existing_metadata[key] = {}
-                         existing_metadata[key] = value
+         Returns:
+             The updated Document object.
+         """
+         pk_value = doc.metadata.get(self.primary_field)
+         text_value = doc.metadata.get(self.text_field)
+
+         if action == Action.UPSERT:
+             doc = self._handle_upsert(doc, metadata_dict)
+         elif action == Action.DELETE:
+             keys_to_delete = metadata_dict.keys()
+             doc = SchemaDocumentUpdater.delete_document_metadata(doc, list(keys_to_delete))
+         elif action == Action.UPDATE:
+             existing_metadata = doc.metadata
+             update_dict = {}
+             for key, value in metadata_dict.items():
+                 if key in existing_metadata:
+                     if isinstance(value, dict) and isinstance(existing_metadata.get(key), dict):
+                         merged = existing_metadata[key].copy()
+                         for sub_key, sub_value in value.items():
+                             if sub_key in merged:
+                                 merged[sub_key] = sub_value
+                         update_dict[key] = merged
                      else:
-                         existing_metadata[key] = value
-                 doc.metadata = existing_metadata
-
-             # Restore the primary key and text values to ensure they are not lost.
-             if pk_value is not None:
-                 doc.metadata[self.primary_field] = pk_value
-             if text_value is not None:
-                 doc.metadata[self.text_field] = text_value
-
-             updated_documents.append(doc)
-
-         if not updated_documents:
-             return [], []
-
-         # Extract the primary keys for the upsert operation.
-         updated_ids = [doc.metadata[self.primary_field] for doc in updated_documents]
-
-         # Remove primary key and text from metadata before upserting,
-         # as they are handled separately by the vector store.
-         for doc in updated_documents:
-             doc.metadata.pop(self.primary_field, None)
-             doc.metadata.pop(self.text_field, None)
+                         update_dict[key] = value
+             doc = SchemaDocumentUpdater.update_document_metadata(doc, update_dict)
+         elif action == Action.INSERT:
+             existing_metadata = doc.metadata
+             for key, value in metadata_dict.items():
+                 if key in ['pk', 'text']:
+                     continue
+
+                 if isinstance(value, dict) and key in existing_metadata and isinstance(existing_metadata.get(key), dict):
+                     existing_metadata[key] = {}
+                     existing_metadata[key] = value
+                 else:
+                     existing_metadata[key] = value
+             doc.metadata = existing_metadata
 
-         return updated_documents, updated_ids
+         if pk_value is not None:
+             doc.metadata[self.primary_field] = pk_value
+         if text_value is not None:
+             doc.metadata[self.text_field] = text_value
+
+         return doc
 
      def update_documents_metadata(self, expr: str, metadata: str, action: Action = Action.UPSERT) -> List[Document]:
          """
          Updates the metadata of documents in the Milvus vector store based on the provided expression.
+         This method uses a direct client upsert to avoid re-embedding vectors.
 
          Args:
              expr: Expression to filter the target documents.
              metadata: New metadata to update the documents with.
+             action: The action to perform on the document metadata.
 
          Returns:
              List of updated documents.
          """
-         documents_to_upsert, updated_ids = self._prepare_documents_for_update(expr, metadata, action)
-
-         if not documents_to_upsert:
-             return []
+         try:
+             metadata_dict = json.loads(metadata)
+         except json.JSONDecodeError:
+             raise ValueError("Invalid JSON string for metadata")
+
+         fields = self.get_fields()
+         if not fields:
+             fields = []
 
-         # Perform the upsert operation to update the documents in the collection.
-         self.upsert(ids=updated_ids, documents=documents_to_upsert)
+         if isinstance(self._vector_field, list):
+             fields.extend(self._vector_field)
+         else:
+             fields.append(self._vector_field)
 
-         return documents_to_upsert
+         documents = self.search_by_metadata(expr, fields=fields, limit=5000)
+
+         updated_documents = [self._process_document_update(doc, metadata_dict, action) for doc in documents]
+
+         if updated_documents:
+             self.logger.debug(f"Upserting {len(updated_documents)} documents using direct client upsert.")
+             upsert_data = [doc.metadata for doc in updated_documents]
+             self.client.upsert(
+                 collection_name=self.collection_name,
+                 data=upsert_data
+             )
+
+         return updated_documents
 
      async def aupdate_documents_metadata(self, expr: str, metadata: str, action: Action = Action.UPSERT) -> List[Document]:
          """
          Asynchronously updates the metadata of documents in the Milvus vector store.
+         This method uses a direct client upsert to avoid re-embedding vectors.
 
          Args:
              expr: Expression to filter the target documents.
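Both update methods now push rows through the raw pymilvus client instead of the vector store's own `upsert`, so the stored embedding is passed back untouched rather than recomputed. A hedged sketch of that idea; the endpoint, collection name, and field layout are illustrative assumptions, not taken from this package:

```python
# Direct client upsert: each row carries every non-nullable field, including
# the original vector fetched earlier, so Milvus replaces the row in place
# and nothing gets re-embedded.
from pymilvus import MilvusClient

client = MilvusClient(uri="http://localhost:19530")  # placeholder endpoint

rows = [
    {
        "pk": 42,                       # primary key of the existing row
        "text": "original chunk text",  # stored text field
        "vector": [0.1, 0.2, 0.3],      # the existing embedding, fetched earlier
        "version_metadata": {"folder_path": "/docs/new"},
    }
]
client.upsert(collection_name="my_collection", data=rows)
```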
@@ -299,12 +323,241 @@ class SchemaMilvus(Milvus):
          Returns:
              List of updated documents.
          """
-         documents_to_upsert, updated_ids = self._prepare_documents_for_update(expr, metadata, action)
-
-         if not documents_to_upsert:
-             return []
-
-         # Perform the asynchronous upsert operation.
-         await self.aupsert(ids=updated_ids, documents=documents_to_upsert)
-
-         return documents_to_upsert
+         try:
+             metadata_dict = json.loads(metadata)
+         except json.JSONDecodeError:
+             raise ValueError("Invalid JSON string for metadata")
+
+         fields = self.get_fields()
+         if not fields:
+             fields = []
+
+         if isinstance(self._vector_field, list):
+             fields.extend(self._vector_field)
+         else:
+             fields.append(self._vector_field)
+
+         documents = self.search_by_metadata(expr, fields=fields, limit=5000)
+
+         updated_documents = [self._process_document_update(doc, metadata_dict, action) for doc in documents]
+
+         if updated_documents:
+             self.logger.debug(f"Upserting {len(updated_documents)} documents using direct client upsert.")
+             upsert_data = [doc.metadata for doc in updated_documents]
+
+             await asyncio.to_thread(
+                 self.client.upsert,
+                 collection_name=self.collection_name,
+                 data=upsert_data
+             )
+
+         return updated_documents
+
+     def update_documents_metadata_by_iterator(self, expr: str, metadata: str, action: Action = Action.UPSERT) -> List[Document]:
+         """
+         [Officially recommended version]
+         Updates metadata using the officially recommended iteration pattern,
+         pymilvus.Collection.query_iterator.
+         The business logic (UPSERT/DELETE, etc.) is identical to the
+         update_documents_metadata method; only the data retrieval follows the
+         official iterator pattern.
+         """
+         try:
+             metadata_dict = json.loads(metadata)
+         except json.JSONDecodeError:
+             raise ValueError("Invalid JSON string for metadata")
+
+         fields = self.get_fields() or []
+         # Make sure the primary key and text fields are among the output fields.
+         if 'pk' not in fields:
+             fields.append('pk')
+         text_field = getattr(self, "_text_field", "text")
+         if text_field not in fields:
+             fields.append(text_field)
+
+         # [Key fix]: also fetch the vector field during the query.
+         # self.client.upsert requires all non-nullable fields, including the vector,
+         # so we must fetch it while iterating in order to pass it back on update.
+         vector_fields_to_add = self._vector_field if isinstance(self._vector_field, list) else [self._vector_field]
+         for vf in vector_fields_to_add:
+             if vf not in fields:
+                 fields.append(vf)
+
+         total_updated_documents = []
+         batch_size = 1000  # Adjust the batch size as needed.
+
+         self.logger.info(f"Starting metadata update using 'collection.query_iterator' with batch size {batch_size}.")
+
+         # # 1. [Safety net]: explicitly ensure the collection is loaded before querying.
+         # logger.info(f"Ensuring collection '{self.collection_name}' is loaded before querying.")
+         # self.col.load()
+
+         # 2. [Official usage]: create the officially recommended iterator.
+         iterator = self.col.query_iterator(
+             batch_size=batch_size,
+             expr=expr,
+             output_fields=fields
+         )
+
+         batch_i = 0
+         try:
+             while True:
+                 # 3. [Official usage]: fetch the next batch.
+                 batch_results = iterator.next()
+                 if not batch_results:
+                     break  # Iteration finished; exit normally.
+
+                 batch_i += 1
+                 self.logger.info(f"Processing batch {batch_i} of {len(batch_results)} documents.")
+
+                 # 4. Convert the list of dicts returned by Milvus into LangChain Document objects.
+                 documents = [
+                     Document(page_content=result.get(text_field, ""), metadata=result)
+                     for result in batch_results
+                 ]
+
+                 # 5. [Core business logic]: process each document in the batch with the shared helper.
+                 updated_documents_in_batch = [self._process_document_update(doc, metadata_dict, action) for doc in documents]
+
+                 # 6. Upsert logic:
+                 if updated_documents_in_batch:
+                     self.logger.debug(f"Upserting batch of {len(updated_documents_in_batch)} documents using direct client upsert.")
+                     # Extract the list of metadata dicts from the updated Document objects.
+                     upsert_data = [doc.metadata for doc in updated_documents_in_batch]
+                     self.client.upsert(
+                         collection_name=self.collection_name,
+                         data=upsert_data
+                     )
+                     total_updated_documents.extend(updated_documents_in_batch)
+
+         finally:
+             # 7. [Official usage]: make sure the iterator is closed.
+             self.logger.info("Closing iterator.")
+             iterator.close()
+
+         self.logger.info(f"Iterator processing complete. Total batches processed: {batch_i}.")
+         return total_updated_documents
+
+
+     def update_documents_metadata_folder_path(self, old_expr: str, metadata: str, action: Action = Action.UPSERT) -> List[Document]:
+         """
+         A method dedicated to updating the version_metadata.folder_path field.
+
+         It implements a "directory move":
+         1. Use old_expr to find all documents whose path matches.
+         2. Take the new base path from metadata.
+         3. Replace the old_expr prefix of each document's folder_path with the
+            new base path, keeping the trailing sub-path.
+         """
+         # 1. Build a "starts with" query from old_expr.
+         # The Milvus JSON 'like' operator requires escaping inner double quotes,
+         # but since old_expr is a variable here, inserting it via f-string is safe.
+         expr = f"version_metadata[\"folder_path\"] like \"{old_expr}%\""
+
+         try:
+             metadata_dict = json.loads(metadata)
+         except json.JSONDecodeError:
+             raise ValueError("Invalid JSON string for metadata")
+
+         fields = self.get_fields() or []
+         # Make sure all of the key fields are queried.
+         required_fields = ['pk', getattr(self, "_text_field", "text")]
+         vector_fields = self._vector_field if isinstance(self._vector_field, list) else [self._vector_field]
+         required_fields.extend(vector_fields)
+
+         for f in required_fields:
+             if f not in fields:
+                 fields.append(f)
+
+         total_updated_documents = []
+         batch_size = 1000
+
+         self.logger.info(f"Starting folder path update using 'collection.query_iterator' with expr: {expr}")
+
+         # self.col.load()  # Ensure the collection is loaded.
+
+         iterator = self.col.query_iterator(
+             batch_size=batch_size,
+             expr=expr,
+             output_fields=fields
+         )
+
+         batch_i = 0
+         try:
+             while True:
+                 batch_results = iterator.next()
+                 if not batch_results:
+                     break
+
+                 batch_i += 1
+                 self.logger.info(f"Processing batch {batch_i} of {len(batch_results)} documents for folder path update.")
+
+                 documents = [
+                     Document(page_content=result.get(getattr(self, "_text_field", "text"), ""), metadata=result)
+                     for result in batch_results
+                 ]
+
+                 updated_documents_in_batch = []
+                 for doc in documents:
+                     # Follow the standard UPSERT logic, but apply a special rule when handling folder_path.
+                     if action == Action.UPSERT:
+                         existing_metadata = doc.metadata
+                         for key, value in metadata_dict.items():
+                             # ... (the standard deep-merge logic is omitted here; it matches the existing update_documents_metadata method)
+                             # Only the folder_path-specific handling is shown.
+                             if isinstance(value, dict):
+                                 # ... (handle JSON that may come back from the database as a string)
+                                 if key in existing_metadata and isinstance(existing_metadata[key], str):
+                                     try:
+                                         existing_metadata[key] = json.loads(existing_metadata[key])
+                                     except json.JSONDecodeError:
+                                         existing_metadata[key] = {}
+
+                                 if key not in existing_metadata or not isinstance(existing_metadata[key], dict):
+                                     existing_metadata[key] = value
+                                 else:
+                                     # Recursively update, injecting the special logic here.
+                                     for sub_key, sub_value in value.items():
+                                         # [Core special logic]
+                                         if key == 'version_metadata' and sub_key == 'folder_path':
+                                             new_folder_path_base = sub_value
+                                             current_folder_path = existing_metadata.get(key, {}).get(sub_key)
+
+                                             if current_folder_path and current_folder_path.startswith(old_expr):
+                                                 # Strip the old prefix and keep the sub-path.
+                                                 sub_path = current_folder_path[len(old_expr):]
+                                                 # Join the new path (making sure the slashes are correct).
+                                                 new_full_path = f"{new_folder_path_base.rstrip('/')}/{sub_path.lstrip('/')}"
+                                                 existing_metadata[key][sub_key] = new_full_path
+                                                 self.logger.debug(f"Rewrote folder path from '{current_folder_path}' to '{new_full_path}'")
+                                             else:
+                                                 # If it doesn't match, simply overwrite as usual.
+                                                 existing_metadata[key][sub_key] = new_folder_path_base
+
+                                         # All other fields are updated recursively, as before.
+                                         elif isinstance(sub_value, dict):
+                                             if sub_key not in existing_metadata[key]:
+                                                 existing_metadata[key][sub_key] = sub_value
+                                             else:
+                                                 existing_metadata[key][sub_key].update(sub_value)
+                                         else:
+                                             existing_metadata[key][sub_key] = sub_value
+                             else:
+                                 existing_metadata[key] = value
+                         doc.metadata = existing_metadata
+
+                     # (Handling for DELETE, UPDATE, and INSERT can be added here if needed.)
+
+                     updated_documents_in_batch.append(doc)
+
+                 if updated_documents_in_batch:
+                     self.logger.debug(f"Upserting batch of {len(updated_documents_in_batch)} documents with updated folder paths.")
+                     upsert_data = [d.metadata for d in updated_documents_in_batch]
+                     self.client.upsert(
+                         collection_name=self.collection_name,
+                         data=upsert_data
+                     )
+                     total_updated_documents.extend(updated_documents_in_batch)
+         finally:
+             self.logger.info("Closing folder path update iterator.")
+             iterator.close()
+
+         self.logger.info(f"Folder path update complete. Total batches processed: {batch_i}.")
+         return total_updated_documents
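The prefix-rewrite rule at the heart of `update_documents_metadata_folder_path` reads naturally as a small pure function. This standalone helper is illustrative only; the name does not exist in the package:

```python
# Illustrative extraction of the "directory move" rule used above.
def rewrite_folder_path(current: str, old_prefix: str, new_base: str) -> str:
    """Move `current` from under `old_prefix` to under `new_base`, keeping the sub-path."""
    if not current.startswith(old_prefix):
        return new_base  # no match: plain overwrite, mirroring the fallback branch
    sub_path = current[len(old_prefix):]
    return f"{new_base.rstrip('/')}/{sub_path.lstrip('/')}"

assert rewrite_folder_path("/docs/a/b.md", "/docs/a", "/archive/a") == "/archive/a/b.md"
```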
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: crewplus
- Version: 0.2.39
+ Version: 0.2.41
  Summary: Base services for CrewPlus AI applications
  Author-Email: Tim Liu <tim@opsmateai.com>
  License: MIT
@@ -1,23 +1,25 @@
- crewplus-0.2.39.dist-info/METADATA,sha256=dkBYPdWsw_cQqQ21iDPqSDZI9aLIUJERIYEdQ3umuzc,5362
- crewplus-0.2.39.dist-info/WHEEL,sha256=9P2ygRxDrTJz3gsagc0Z96ukrxjr-LFBGOgv3AuKlCA,90
- crewplus-0.2.39.dist-info/entry_points.txt,sha256=6OYgBcLyFCUgeqLgnvMyOJxPCWzgy7se4rLPKtNonMs,34
- crewplus-0.2.39.dist-info/licenses/LICENSE,sha256=2_NHSHRTKB_cTcT_GXgcenOCtIZku8j343mOgAguTfc,1087
+ crewplus-0.2.41.dist-info/METADATA,sha256=JKgTyNze2KdlVY4GP_oInL_nc3m8wcxJtOOqf-sYvAI,5362
+ crewplus-0.2.41.dist-info/WHEEL,sha256=9P2ygRxDrTJz3gsagc0Z96ukrxjr-LFBGOgv3AuKlCA,90
+ crewplus-0.2.41.dist-info/entry_points.txt,sha256=6OYgBcLyFCUgeqLgnvMyOJxPCWzgy7se4rLPKtNonMs,34
+ crewplus-0.2.41.dist-info/licenses/LICENSE,sha256=2_NHSHRTKB_cTcT_GXgcenOCtIZku8j343mOgAguTfc,1087
  crewplus/__init__.py,sha256=m46HkZL1Y4toD619NL47Sn2Qe084WFFSFD7e6VoYKZc,284
+ crewplus/callbacks/__init__.py,sha256=YG7ieeb91qEjp1zF0-inEN7mjZ7yT_D2yzdWFT8Z1Ws,63
+ crewplus/callbacks/async_langfuse_handler.py,sha256=QBY2xP7G4LnyuW4HVbNuZwZ5dJtWZl97VX3gkqJA_tc,3887
  crewplus/services/__init__.py,sha256=V1CG8b2NOmRzNgQH7BPl4KVxWSYJH5vfEsW1wVErKNE,375
- crewplus/services/azure_chat_model.py,sha256=WMSf4BDO8UcP7ZASNGRJxdTEnuWBmCRSY_4yx_VMbok,5499
+ crewplus/services/azure_chat_model.py,sha256=LXTd1g6OBV-YEaGokVNMddd1P5BU8nfV4k5tG_GcH04,5643
  crewplus/services/gemini_chat_model.py,sha256=VsOB_st1qRmDkwLXzo-gCShhUsZHpk0V-G-ulQXGN3g,40081
  crewplus/services/init_services.py,sha256=7oZ1GmesK32EDB_DYnTzW17MEpXjXK41_U_1pmqu_m4,2183
  crewplus/services/model_load_balancer.py,sha256=Q9Gx3GrbKworU-Ytxeqp0ggHSgZ1Q6brtTk-nCl4sak,12095
- crewplus/services/tracing_manager.py,sha256=aCU9N4Jvh8pDD3h8kWX4O-Ax8xwdLHnQ4wJ3sf-vLwA,6289
+ crewplus/services/tracing_manager.py,sha256=vT-7zerq6v0x-cwEBWAsB9NHdul4mPDnI60azcngTr8,7058
  crewplus/utils/__init__.py,sha256=2Gk1n5srFJQnFfBuYTxktdtKOVZyNrFcNaZKhXk35Pw,142
  crewplus/utils/schema_action.py,sha256=GDaBoVFQD1rXqrLVSMTfXYW1xcUu7eDcHsn57XBSnIg,422
  crewplus/utils/schema_document_updater.py,sha256=frvffxn2vbi71fHFPoGb9hq7gH2azmmdq17p-Fumnvg,7322
  crewplus/vectorstores/milvus/__init__.py,sha256=OeYv2rdyG7tcREIjBJPyt2TbE54NvyeRoWMe7LwopRE,245
  crewplus/vectorstores/milvus/milvus_schema_manager.py,sha256=2IZT61LVui21Pt5Z3y8YYS2dYcwzkgUKxMq2NA0-lQE,9222
- crewplus/vectorstores/milvus/schema_milvus.py,sha256=1GWYPV6e1DgbN1F5AvgjR3zoNCBNdUtfg6f7WlGQefw,13356
+ crewplus/vectorstores/milvus/schema_milvus.py,sha256=DtHP8jHRSpLqt9ixAnJE5R4CId9NLYXxOVqRxPCEyv4,26131
  crewplus/vectorstores/milvus/vdb_service.py,sha256=CaUMLIMeOCm2R4t5EKtAupIddFXQu0NSb8RpTkInGd4,22498
  docs/GeminiChatModel.md,sha256=zZYyl6RmjZTUsKxxMiC9O4yV70MC4TD-IGUmWhIDBKA,8677
  docs/ModelLoadBalancer.md,sha256=aGHES1dcXPz4c7Y8kB5-vsCNJjriH2SWmjBkSGoYKiI,4398
  docs/VDBService.md,sha256=Dw286Rrf_fsi13jyD3Bo4Sy7nZ_G7tYm7d8MZ2j9hxk,9375
  docs/index.md,sha256=3tlc15uR8lzFNM5WjdoZLw0Y9o1P1gwgbEnOdIBspqc,1643
- crewplus-0.2.39.dist-info/RECORD,,
+ crewplus-0.2.41.dist-info/RECORD,,