isage-middleware 0.2.4.3__cp311-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (94) hide show
  1. isage_middleware-0.2.4.3.dist-info/METADATA +266 -0
  2. isage_middleware-0.2.4.3.dist-info/RECORD +94 -0
  3. isage_middleware-0.2.4.3.dist-info/WHEEL +5 -0
  4. isage_middleware-0.2.4.3.dist-info/top_level.txt +1 -0
  5. sage/middleware/__init__.py +59 -0
  6. sage/middleware/_version.py +6 -0
  7. sage/middleware/components/__init__.py +30 -0
  8. sage/middleware/components/extensions_compat.py +141 -0
  9. sage/middleware/components/sage_db/__init__.py +116 -0
  10. sage/middleware/components/sage_db/backend.py +136 -0
  11. sage/middleware/components/sage_db/service.py +15 -0
  12. sage/middleware/components/sage_flow/__init__.py +76 -0
  13. sage/middleware/components/sage_flow/python/__init__.py +14 -0
  14. sage/middleware/components/sage_flow/python/micro_service/__init__.py +4 -0
  15. sage/middleware/components/sage_flow/python/micro_service/sage_flow_service.py +88 -0
  16. sage/middleware/components/sage_flow/python/sage_flow.py +30 -0
  17. sage/middleware/components/sage_flow/service.py +14 -0
  18. sage/middleware/components/sage_mem/__init__.py +83 -0
  19. sage/middleware/components/sage_sias/__init__.py +59 -0
  20. sage/middleware/components/sage_sias/continual_learner.py +184 -0
  21. sage/middleware/components/sage_sias/coreset_selector.py +302 -0
  22. sage/middleware/components/sage_sias/types.py +94 -0
  23. sage/middleware/components/sage_tsdb/__init__.py +81 -0
  24. sage/middleware/components/sage_tsdb/python/__init__.py +21 -0
  25. sage/middleware/components/sage_tsdb/python/_sage_tsdb.pyi +17 -0
  26. sage/middleware/components/sage_tsdb/python/algorithms/__init__.py +17 -0
  27. sage/middleware/components/sage_tsdb/python/algorithms/base.py +51 -0
  28. sage/middleware/components/sage_tsdb/python/algorithms/out_of_order_join.py +248 -0
  29. sage/middleware/components/sage_tsdb/python/algorithms/window_aggregator.py +296 -0
  30. sage/middleware/components/sage_tsdb/python/micro_service/__init__.py +7 -0
  31. sage/middleware/components/sage_tsdb/python/micro_service/sage_tsdb_service.py +365 -0
  32. sage/middleware/components/sage_tsdb/python/sage_tsdb.py +523 -0
  33. sage/middleware/components/sage_tsdb/service.py +17 -0
  34. sage/middleware/components/vector_stores/__init__.py +25 -0
  35. sage/middleware/components/vector_stores/chroma.py +483 -0
  36. sage/middleware/components/vector_stores/chroma_adapter.py +185 -0
  37. sage/middleware/components/vector_stores/milvus.py +677 -0
  38. sage/middleware/operators/__init__.py +56 -0
  39. sage/middleware/operators/agent/__init__.py +24 -0
  40. sage/middleware/operators/agent/planning/__init__.py +5 -0
  41. sage/middleware/operators/agent/planning/llm_adapter.py +41 -0
  42. sage/middleware/operators/agent/planning/planner_adapter.py +98 -0
  43. sage/middleware/operators/agent/planning/router.py +107 -0
  44. sage/middleware/operators/agent/runtime.py +296 -0
  45. sage/middleware/operators/agentic/__init__.py +41 -0
  46. sage/middleware/operators/agentic/config.py +254 -0
  47. sage/middleware/operators/agentic/planning_operator.py +125 -0
  48. sage/middleware/operators/agentic/refined_searcher.py +132 -0
  49. sage/middleware/operators/agentic/runtime.py +241 -0
  50. sage/middleware/operators/agentic/timing_operator.py +125 -0
  51. sage/middleware/operators/agentic/tool_selection_operator.py +127 -0
  52. sage/middleware/operators/context/__init__.py +17 -0
  53. sage/middleware/operators/context/critic_evaluation.py +16 -0
  54. sage/middleware/operators/context/model_context.py +565 -0
  55. sage/middleware/operators/context/quality_label.py +12 -0
  56. sage/middleware/operators/context/search_query_results.py +61 -0
  57. sage/middleware/operators/context/search_result.py +42 -0
  58. sage/middleware/operators/context/search_session.py +79 -0
  59. sage/middleware/operators/filters/__init__.py +26 -0
  60. sage/middleware/operators/filters/context_sink.py +387 -0
  61. sage/middleware/operators/filters/context_source.py +376 -0
  62. sage/middleware/operators/filters/evaluate_filter.py +83 -0
  63. sage/middleware/operators/filters/tool_filter.py +74 -0
  64. sage/middleware/operators/llm/__init__.py +18 -0
  65. sage/middleware/operators/llm/sagellm_generator.py +432 -0
  66. sage/middleware/operators/rag/__init__.py +147 -0
  67. sage/middleware/operators/rag/arxiv.py +331 -0
  68. sage/middleware/operators/rag/chunk.py +13 -0
  69. sage/middleware/operators/rag/document_loaders.py +23 -0
  70. sage/middleware/operators/rag/evaluate.py +658 -0
  71. sage/middleware/operators/rag/generator.py +340 -0
  72. sage/middleware/operators/rag/index_builder/__init__.py +48 -0
  73. sage/middleware/operators/rag/index_builder/builder.py +363 -0
  74. sage/middleware/operators/rag/index_builder/manifest.py +101 -0
  75. sage/middleware/operators/rag/index_builder/storage.py +131 -0
  76. sage/middleware/operators/rag/pipeline.py +46 -0
  77. sage/middleware/operators/rag/profiler.py +59 -0
  78. sage/middleware/operators/rag/promptor.py +400 -0
  79. sage/middleware/operators/rag/refiner.py +231 -0
  80. sage/middleware/operators/rag/reranker.py +364 -0
  81. sage/middleware/operators/rag/retriever.py +1308 -0
  82. sage/middleware/operators/rag/searcher.py +37 -0
  83. sage/middleware/operators/rag/types.py +28 -0
  84. sage/middleware/operators/rag/writer.py +80 -0
  85. sage/middleware/operators/tools/__init__.py +71 -0
  86. sage/middleware/operators/tools/arxiv_paper_searcher.py +175 -0
  87. sage/middleware/operators/tools/arxiv_searcher.py +102 -0
  88. sage/middleware/operators/tools/duckduckgo_searcher.py +105 -0
  89. sage/middleware/operators/tools/image_captioner.py +104 -0
  90. sage/middleware/operators/tools/nature_news_fetcher.py +224 -0
  91. sage/middleware/operators/tools/searcher_tool.py +514 -0
  92. sage/middleware/operators/tools/text_detector.py +185 -0
  93. sage/middleware/operators/tools/url_text_extractor.py +104 -0
  94. sage/middleware/py.typed +2 -0
@@ -0,0 +1,677 @@
1
+ """
2
+ Milvus 后端管理工具
3
+ 提供 Milvus / Milvus Lite 的初始化、文档管理和检索功能
4
+ """
5
+
6
+ import json
7
+ import logging
8
+ import os
9
+ import time
10
+ from typing import TYPE_CHECKING, Any
11
+
12
+ import numpy as np
13
+
14
+ if TYPE_CHECKING:
15
+ from pymilvus import MilvusClient # noqa: F401
16
+
17
+
18
+ class MilvusBackend:
19
+ """Milvus 后端管理器(支持本地 Milvus Lite 与远程 Milvus)"""
20
+
21
+ def __init__(self, config: dict[str, Any], logger: logging.Logger | Any = None):
22
+ """
23
+ 初始化 Milvus 后端
24
+
25
+ Args:
26
+ config: Milvus 配置字典
27
+ logger: 日志记录器
28
+ """
29
+ self.config = config
30
+ self.logger = logger or logging.getLogger(__name__)
31
+
32
+ # 连接与集合配置
33
+ self.host: str = self.config.get("host", "localhost")
34
+ self.port: int = int(self.config.get("port", 19530))
35
+ self.persistence_path: str | None = self.config.get(
36
+ "persistence_path", "./milvus_db"
37
+ ) # 可选,优先级高于 host/port
38
+
39
+ self.collection_name: str = self.config.get("collection_name", "retriever_collection")
40
+ self.dim: int | None = self.config.get("dim", 1024) # 稠密向量维度
41
+ raw_metric_type = self.config.get("metric_type")
42
+ if not raw_metric_type:
43
+ # 未提供则默认 COSINE,不做校验
44
+ metric_type_value: str = "COSINE"
45
+ else:
46
+ # 提供了则严格校验,仅支持 IP/COSINE/L2(大小写不敏感,统一为大写)
47
+ allowed_metric_types = {"IP", "COSINE", "L2"}
48
+ metric_upper = str(raw_metric_type).upper()
49
+ if metric_upper not in allowed_metric_types:
50
+ raise ValueError(
51
+ f"Invalid metric_type: {raw_metric_type}. Allowed: {sorted(allowed_metric_types)}"
52
+ )
53
+ metric_type_value = metric_upper
54
+ self.metric_type: str = metric_type_value
55
+ self.drop_ratio_search = self.config.get(
56
+ "drop_ratio_search", 0.2
57
+ ) # 稀疏向量搜索时,drop 比例
58
+ self.search_type = self.config.get("search_type", "sparse") # 搜索类型,sparse 或 dense
59
+ self.dense_insert_batch_size = self.config.get(
60
+ "dense_insert_batch_size", 128
61
+ ) # 稠密向量插入批次大小
62
+
63
+ # 客户端
64
+ self.client: Any = None # Will be initialized by _init_client
65
+ self._init_client()
66
+ self._init_collection()
67
+
68
+ def _init_client(self):
69
+ """初始化 Milvus 客户端,支持 Milvus Lite(本地 .db 文件)与远程服务"""
70
+ try:
71
+ from pymilvus import MilvusClient
72
+
73
+ # 判断使用本地还是远程模式
74
+ if self.host in ["localhost", "127.0.0.1"] and not self.config.get("force_http", False):
75
+ self.client = MilvusClient(self.persistence_path or "./milvus.db")
76
+ self.logger.info(
77
+ f"Initialized Milvus persistent client at: {self.persistence_path}"
78
+ )
79
+ else:
80
+ # 远程服务器模式
81
+ full_host = (
82
+ f"http://{self.host}:{self.port}"
83
+ if not self.host.startswith("http")
84
+ else self.host
85
+ )
86
+
87
+ self.client = MilvusClient(full_host)
88
+ self.logger.info(
89
+ f"Initialized Milvus HTTP client at: http://{self.host}:{self.port}"
90
+ )
91
+
92
+ except ImportError as e:
93
+ self.logger.error(f"Failed to import pymilvus: {e}")
94
+ raise ImportError(
95
+ "Milvus dependencies not available. Install with: pip install pymilvus"
96
+ )
97
+
98
+ except Exception as e:
99
+ self.logger.error(f"Failed to initialize Milvus client: {e}")
100
+ raise
101
+
102
+ def _ensure_client(self):
103
+ """Ensure client is initialized"""
104
+ if self.client is None:
105
+ raise RuntimeError("Milvus client is not initialized")
106
+
107
+ def _init_collection(self):
108
+ """初始化或获取 Milvus 集合,必要时创建索引"""
109
+ self._ensure_client()
110
+ try:
111
+ # 尝试直接获取已存在的集合
112
+ try:
113
+ # 通过检查集合是否能正常查询来验证集合存在
114
+ self.client.load_collection(collection_name=self.collection_name) # type: ignore
115
+ self.logger.info(f"Retrieved existing Milvus collection: {self.collection_name}")
116
+ return
117
+ except Exception:
118
+ # 集合不存在,需要创建新集合
119
+ self.logger.debug(
120
+ f"Collection '{self.collection_name}' does not exist, creating new one"
121
+ )
122
+ # 创建集合 - 导入必要的数据类型
123
+ try:
124
+ from pymilvus import DataType
125
+ except Exception as e:
126
+ self.logger.error(f"Failed to import PyMilvus schema classes: {e}")
127
+ raise
128
+
129
+ try:
130
+ schema = self.client.create_schema(auto_id=False)
131
+ index_params = self.client.prepare_index_params()
132
+ schema.add_field(
133
+ "id",
134
+ DataType.VARCHAR,
135
+ max_length=2000,
136
+ is_primary=True,
137
+ auto_id=False,
138
+ )
139
+ schema.add_field("text", DataType.VARCHAR, max_length=2000) # 文本字段
140
+ self.logger.info(self.search_type + "=" * 60)
141
+ if self.search_type == "sparse":
142
+ schema.add_field("sparse", DataType.SPARSE_FLOAT_VECTOR) # 稀疏向量字段
143
+
144
+ index_params.add_index(
145
+ field_name="sparse",
146
+ index_type="SPARSE_INVERTED_INDEX",
147
+ metric_type="IP",
148
+ )
149
+
150
+ if self.search_type == "dense":
151
+ schema.add_field("dense", DataType.FLOAT_VECTOR, dim=self.dim) # 稠密向量
152
+
153
+ index_params.add_index(
154
+ field_name="dense",
155
+ index_type="AUTOINDEX",
156
+ metric_type=self.metric_type,
157
+ )
158
+
159
+ self.client.create_collection(
160
+ collection_name=self.collection_name,
161
+ schema=schema,
162
+ index_params=index_params,
163
+ )
164
+
165
+ # 创建集合后,立即加载以确保可以使用
166
+ self.client.load_collection(collection_name=self.collection_name)
167
+
168
+ self.logger.info(
169
+ f"Created and loaded new Milvus collection {self.collection_name} successfully!"
170
+ )
171
+ except Exception as e:
172
+ self.logger.error(f"Failed to create Milvus collection: {e}")
173
+ raise
174
+ except Exception as e:
175
+ self.logger.error(f"Failed to initialize Milvus collection: {e}")
176
+ raise
177
+
178
+ def add_dense_documents(
179
+ self,
180
+ documents: list[str],
181
+ dense_embeddings: list[np.ndarray],
182
+ doc_ids: list[str],
183
+ ) -> list[str]:
184
+ """
185
+ 添加稠密向量文档,防止内存溢出,分批插入
186
+
187
+ Args:
188
+ documents: 文本列表
189
+ embeddings: 向量列表(list[float] 或可转 list)
190
+ doc_ids: 文档 ID 列表
191
+ Returns:
192
+ 成功插入的文档 ID 列表
193
+ """
194
+ try:
195
+ # 转换 embedding 格式(milvus 需要 list 格式)
196
+ dense_embeddings_list = [embedding.tolist() for embedding in dense_embeddings]
197
+ docs = []
198
+ # 生成文档ID
199
+ doc_ids = [f"doc_{int(time.time() * 1000)}_{i}" for i in range(len(documents))]
200
+ for i in range(len(documents)):
201
+ docs.append(
202
+ {
203
+ "id": doc_ids[i],
204
+ "text": documents[i],
205
+ "dense": dense_embeddings_list[i],
206
+ }
207
+ )
208
+
209
+ if len(docs) > self.dense_insert_batch_size:
210
+ for i in range(0, len(docs), self.dense_insert_batch_size):
211
+ self.client.insert(
212
+ collection_name=self.collection_name,
213
+ data=docs[i : i + self.dense_insert_batch_size],
214
+ )
215
+ else:
216
+ self.client.insert(collection_name=self.collection_name, data=docs)
217
+
218
+ self.logger.info(
219
+ f"Added {len(docs)} documents to Milvus collection {self.collection_name}"
220
+ )
221
+ return doc_ids
222
+ except Exception as e:
223
+ self.logger.error(f"Error adding dense documents to Milvus: {e}")
224
+ return []
225
+
226
+ def add_sparse_documents(
227
+ self, documents: list[str], sparse_embeddings, doc_ids: list[str]
228
+ ) -> list[str]:
229
+ """
230
+ 添加稀疏向量文档
231
+
232
+ Args:
233
+ documents: 文本列表
234
+ sparse_embeddings: 稀疏向量列表(来自BGEM3EmbeddingFunction)
235
+ doc_ids: 文档 ID 列表
236
+ Returns:
237
+ 成功插入的文档 ID 列表
238
+ """
239
+ try:
240
+ docs = []
241
+ for i in range(len(documents)):
242
+ # 处理稀疏向量格式
243
+ sparse_vector = sparse_embeddings[i]
244
+
245
+ # 如果是scipy稀疏矩阵(csr_array/csr_matrix),转换为字典格式
246
+ if hasattr(sparse_vector, "tocoo"):
247
+ # scipy sparse matrix to dict
248
+ coo = sparse_vector.tocoo()
249
+ sparse_dict = {
250
+ int(idx): float(val) for idx, val in zip(coo.col, coo.data, strict=False)
251
+ }
252
+ elif hasattr(sparse_vector, "indices") and hasattr(sparse_vector, "data"):
253
+ # 处理 csr_array 格式
254
+ sparse_dict = {
255
+ int(idx): float(val)
256
+ for idx, val in zip(sparse_vector.indices, sparse_vector.data, strict=False)
257
+ }
258
+ elif isinstance(sparse_vector, dict):
259
+ # 已经是字典格式
260
+ sparse_dict = sparse_vector
261
+ else:
262
+ # 尝试转换为字典
263
+ self.logger.warning(f"Unknown sparse vector format: {type(sparse_vector)}")
264
+ sparse_dict = dict(sparse_vector) if hasattr(sparse_vector, "__iter__") else {}
265
+
266
+ docs.append({"id": doc_ids[i], "text": documents[i], "sparse": sparse_dict})
267
+
268
+ # 插入数据
269
+ self.client.insert(collection_name=self.collection_name, data=docs)
270
+ self.logger.info(
271
+ f"Added {len(docs)} documents to Milvus collection {self.collection_name}"
272
+ )
273
+
274
+ return doc_ids
275
+ except Exception as e:
276
+ self.logger.error(f"Error adding sparse documents to Milvus: {e}")
277
+ return []
278
+
279
+ def sparse_search(self, query_text: str, top_k: int) -> list[str]:
280
+ """
281
+ 在 Milvus 中执行稀疏向量搜索
282
+
283
+ Args:
284
+ query_text: 查询文本
285
+ top_k: 返回的文档数量
286
+
287
+ Returns:
288
+ 文本结果列表
289
+ """
290
+ try:
291
+ # 使用 BGEM3EmbeddingFunction 生成查询向量
292
+ try:
293
+ from pymilvus.model.hybrid import (
294
+ BGEM3EmbeddingFunction, # type: ignore[import-not-found]
295
+ )
296
+ except ImportError:
297
+ try:
298
+ from pymilvus.model import (
299
+ BGEM3EmbeddingFunction, # type: ignore[import-not-found]
300
+ )
301
+ except ImportError:
302
+ self.logger.error(
303
+ "Please install: pip install 'pymilvus[model]' or pip install pymilvus.model"
304
+ )
305
+ return []
306
+
307
+ embedding_model = BGEM3EmbeddingFunction(use_fp16=False, device="cpu")
308
+ query_embeddings = embedding_model.encode_queries([query_text])
309
+
310
+ # 提取稀疏向量
311
+ if isinstance(query_embeddings, dict) and "sparse" in query_embeddings:
312
+ sparse_vector = query_embeddings["sparse"][0]
313
+ else:
314
+ sparse_vector = query_embeddings[0]
315
+
316
+ # 处理稀疏向量格式转换为字典
317
+ if hasattr(sparse_vector, "tocoo"):
318
+ # scipy sparse matrix to dict
319
+ coo = sparse_vector.tocoo()
320
+ query_vector = {
321
+ int(idx): float(val) for idx, val in zip(coo.col, coo.data, strict=False)
322
+ }
323
+ elif hasattr(sparse_vector, "indices") and hasattr(sparse_vector, "data"):
324
+ # 处理 csr_array 格式
325
+ query_vector = {
326
+ int(idx): float(val)
327
+ for idx, val in zip(sparse_vector.indices, sparse_vector.data, strict=False)
328
+ }
329
+ elif isinstance(sparse_vector, dict):
330
+ # 已经是字典格式
331
+ query_vector = sparse_vector
332
+ else:
333
+ # 尝试转换为字典
334
+ self.logger.warning(f"Unknown sparse vector format: {type(sparse_vector)}")
335
+ query_vector = dict(sparse_vector) if hasattr(sparse_vector, "__iter__") else {}
336
+
337
+ # 执行搜索
338
+ hits = self.client.search(
339
+ collection_name=self.collection_name,
340
+ data=[query_vector],
341
+ anns_field="sparse",
342
+ search_params={"metric_type": "IP", "params": {}},
343
+ limit=top_k,
344
+ output_fields=["text"],
345
+ )
346
+
347
+ results = hits[0]
348
+ sparse_results = []
349
+
350
+ if results and len(results) > 0:
351
+ for r in results:
352
+ sparse_results.append(r.entity.get("text")) # type: ignore[union-attr]
353
+ return sparse_results
354
+ except Exception as e:
355
+ self.logger.error(f"Error executing Milvus sparse search: {e}")
356
+ return []
357
+
358
+ def dense_search(self, query_vector: np.ndarray, top_k: int) -> list[str]:
359
+ """
360
+ 在 Milvus 中执行稠密向量搜索
361
+
362
+ Args:
363
+ query_vector: 查询向量
364
+ top_k: 返回的文档数量
365
+
366
+ Returns:
367
+ 文本结果列表
368
+ """
369
+ try:
370
+ print(f"MilvusBackend.search: using top_k = {top_k}")
371
+
372
+ hits = self.client.search(
373
+ collection_name=self.collection_name,
374
+ data=[query_vector],
375
+ anns_field="dense",
376
+ search_params={"metric_type": self.metric_type, "params": {}},
377
+ limit=top_k,
378
+ output_fields=["text"],
379
+ )
380
+
381
+ results = hits[0]
382
+ dense_results = []
383
+ if results and len(results) > 0:
384
+ for r in results:
385
+ dense_results.append(r.entity.get("text")) # type: ignore[union-attr]
386
+ return dense_results
387
+ except Exception as e:
388
+ self.logger.error(f"Error executing Milvus search: {e}")
389
+ return []
390
+
391
+ def delete_collection(self, collection_name: str) -> bool:
392
+ """删除当前集合"""
393
+ try:
394
+ self.client.drop_collection(collection_name)
395
+ self.logger.info(f"Deleted Milvus collection: {collection_name}")
396
+ return True
397
+ except Exception as e:
398
+ self.logger.error(f"Error deleting Milvus collection: {e}")
399
+ return False
400
+
401
+ def get_collection_info(self) -> dict[str, Any]:
402
+ """
403
+ 获取集合信息
404
+
405
+ Returns:
406
+ 包含集合信息的字典
407
+ """
408
+ try:
409
+ count = None
410
+ try:
411
+ stats = self.client.get_collection_stats(self.collection_name)
412
+ count = stats.get("row_count") if isinstance(stats, dict) else None
413
+ except Exception:
414
+ pass
415
+ return {
416
+ "backend": "milvus",
417
+ "collection_name": self.collection_name,
418
+ "document_count": count,
419
+ "persistence_path": (
420
+ self.persistence_path if hasattr(self, "persistence_path") else None
421
+ ),
422
+ }
423
+ except Exception as e:
424
+ self.logger.error(f"Failed to get Milvus collection info: {e}")
425
+ return {"backend": "milvus", "error": str(e)}
426
+
427
+ def save_config(self, save_path: str) -> bool:
428
+ """
429
+ 保存 milvus 配置信息
430
+
431
+ Args:
432
+ save_path: 保存路径
433
+
434
+ Returns:
435
+ 是否保存成功
436
+ """
437
+ try:
438
+ os.makedirs(save_path, exist_ok=True)
439
+ config_path = os.path.join(save_path, "milvus_config.json")
440
+ stats = self.client.get_collection_stats(self.collection_name)
441
+ count = stats.get("row_count") if isinstance(stats, dict) else None
442
+ config_info = {
443
+ "collection_name": self.collection_name,
444
+ "collection_count": count,
445
+ "backend_type": "milvus",
446
+ "milvus_config": self.config,
447
+ "saved_time": time.time(),
448
+ }
449
+ with open(config_path, "w", encoding="utf-8") as f:
450
+ json.dump(config_info, f, ensure_ascii=False, indent=2)
451
+ self.logger.info(f"Successfully saved Milvus config to: {save_path}")
452
+ self.logger.info(
453
+ f"Milvus collection '{self.collection_name}' contains {config_info['collection_count']} documents"
454
+ )
455
+ return True
456
+ except Exception as e:
457
+ self.logger.error(f"Failed to save Milvus config: {e}")
458
+ return False
459
+
460
+ def load_config(self, load_path: str) -> bool:
461
+ """
462
+ 从配置文件重新连接到 Milvus 集合
463
+
464
+ Args:
465
+ load_path: 配置文件路径
466
+
467
+ Returns:
468
+ 是否加载成功
469
+ """
470
+ try:
471
+ config_path = os.path.join(load_path, "milvus_config.json")
472
+ if os.path.exists(config_path):
473
+ with open(config_path, encoding="utf-8") as f:
474
+ config_info = json.load(f)
475
+ collection_name = config_info.get("collection_name")
476
+ if collection_name:
477
+ self.collection_name = collection_name
478
+ self.client.load_collection(collection_name=self.collection_name)
479
+ stats = self.client.get_collection_stats(self.collection_name)
480
+ count = stats.get("row_count") if isinstance(stats, dict) else None
481
+ self.logger.info(
482
+ f"Reloaded Milvus collection name from config: {self.collection_name}"
483
+ )
484
+ self.logger.info(f"Collection contains {count} documents")
485
+ return True
486
+ else:
487
+ self.logger.error("No collection name found in Milvus config")
488
+ return False
489
+ else:
490
+ self.logger.error(f"Milvus config not found at: {config_path}")
491
+ return False
492
+ except Exception as e:
493
+ self.logger.error(f"Failed to load Milvus config: {e}")
494
+ return False
495
+
496
+ def load_knowledge_from_file_dense(self, file_path: str, embedding_model) -> bool:
497
+ """
498
+ 从文件加载知识库到 Milvus
499
+
500
+ Args:
501
+ file_path: 知识库文件路径
502
+ embedding_model: 嵌入模型实例
503
+
504
+ Returns:
505
+ 是否加载成功
506
+ """
507
+ try:
508
+ self.logger.info(f"Loading knowledge from file: {file_path}")
509
+ with open(file_path, encoding="utf-8") as f:
510
+ content = f.read()
511
+
512
+ # 将知识库按段落分割
513
+ documents = [doc.strip() for doc in content.split("\n\n") if doc.strip()]
514
+
515
+ if documents:
516
+ # 生成文档ID
517
+ doc_ids = [f"doc_{int(time.time() * 1000)}_{i}" for i in range(len(documents))]
518
+
519
+ # 生成 embedding
520
+ embeddings = []
521
+ for doc in documents:
522
+ embedding = embedding_model.embed(doc)
523
+ embeddings.append(np.array(embedding, dtype=np.float32))
524
+
525
+ # dense 向量添加到 Milvus
526
+ added_dense_ids = self.add_dense_documents(documents, embeddings, doc_ids)
527
+
528
+ if added_dense_ids:
529
+ self.logger.info(
530
+ f"Loaded {len(added_dense_ids)} dense documents from {file_path}"
531
+ )
532
+ return True
533
+ else:
534
+ self.logger.error(f"Failed to add documents from {file_path}")
535
+ return False
536
+ else:
537
+ self.logger.warning(f"No valid documents found in {file_path}")
538
+ return False
539
+
540
+ except Exception as e:
541
+ self.logger.error(f"Failed to load knowledge from file {file_path}: {e}")
542
+ return False
543
+
544
+ def load_knowledge_from_file_sparse(self, file_path: str) -> bool:
545
+ """
546
+ 从文件加载知识库到 Milvus
547
+
548
+ Args:
549
+ file_path: 知识库文件路径
550
+
551
+ Returns:
552
+ 是否加载成功
553
+ """
554
+ try:
555
+ self.logger.info(f"Loading knowledge from file: {file_path}")
556
+ with open(file_path, encoding="utf-8") as f:
557
+ content = f.read()
558
+
559
+ # 将知识库按段落分割
560
+ documents = [doc.strip() for doc in content.split("\n\n") if doc.strip()]
561
+
562
+ if documents:
563
+ # 生成文档ID
564
+ doc_ids = [f"doc_{int(time.time() * 1000)}_{i}" for i in range(len(documents))]
565
+
566
+ try:
567
+ from pymilvus.model.hybrid import BGEM3EmbeddingFunction
568
+
569
+ embedding_model = BGEM3EmbeddingFunction(use_fp16=False, device="cpu")
570
+
571
+ # 生成 sparse embedding
572
+ sparse_embeddings = embedding_model.encode_documents(documents)
573
+
574
+ # 提取稀疏向量部分
575
+ if isinstance(sparse_embeddings, dict) and "sparse" in sparse_embeddings:
576
+ embeddings = sparse_embeddings["sparse"]
577
+ else:
578
+ # 如果返回格式不同,直接使用
579
+ embeddings = sparse_embeddings
580
+
581
+ except Exception as e:
582
+ self.logger.error(f"Failed to import or use BGEM3EmbeddingFunction: {e}")
583
+ raise
584
+
585
+ # sparse 向量添加到 Milvus
586
+ added_sparse_ids = self.add_sparse_documents(documents, embeddings, doc_ids)
587
+
588
+ if added_sparse_ids:
589
+ self.logger.info(
590
+ f"Loaded {len(added_sparse_ids)} sparse documents from {file_path}"
591
+ )
592
+ return True
593
+ else:
594
+ self.logger.error(f"Failed to add documents from {file_path}")
595
+ return False
596
+ else:
597
+ self.logger.warning(f"No valid documents found in {file_path}")
598
+ return False
599
+
600
+ except Exception as e:
601
+ self.logger.error(f"Failed to load knowledge from file {file_path}: {e}")
602
+ return False
603
+
604
+ def clear_collection(self) -> bool:
605
+ """清空集合中的所有文档,保留集合结构与索引"""
606
+ try:
607
+ # 通过过滤条件删除全部实体(匹配所有非空字符串id)
608
+ self.client.delete(collection_name=self.collection_name, filter='id != ""')
609
+ self.logger.info(f"Cleared documents in Milvus collection '{self.collection_name}'")
610
+ return True
611
+ except Exception as e:
612
+ self.logger.error(f"Failed to clear Milvus collection: {e}")
613
+ return False
614
+
615
+ def update_document(self, doc_id: str, new_content: str, new_embedding: np.ndarray) -> bool:
616
+ """
617
+ 更新指定文档
618
+ """
619
+ try:
620
+ self.client.upsert( # type: ignore[attr-defined]
621
+ collection_name=self.collection_name,
622
+ data=[{"id": doc_id, "text": new_content, "dense": new_embedding.tolist()}],
623
+ )
624
+ self.logger.info(
625
+ f"Updated document {doc_id} in Milvus collection '{self.collection_name}'"
626
+ )
627
+ return True
628
+ except Exception as e:
629
+ self.logger.error(
630
+ f"Failed to update document {doc_id} in Milvus collection '{self.collection_name}': {e}"
631
+ )
632
+ return False
633
+
634
+ def delete_document(self, doc_id: str) -> bool:
635
+ """
636
+ 删除指定文档
637
+ """
638
+ try:
639
+ self.client.delete(collection_name=self.collection_name, filter=f'id == "{doc_id}"')
640
+ self.logger.info(
641
+ f"Deleted document {doc_id} in Milvus collection '{self.collection_name}'"
642
+ )
643
+ return True
644
+ except Exception as e:
645
+ self.logger.error(
646
+ f"Failed to delete document {doc_id} in Milvus collection '{self.collection_name}': {e}"
647
+ )
648
+ return False
649
+
650
+
651
class MilvusUtils:
    """Stateless helper routines related to Milvus."""

    @staticmethod
    def check_milvus_available() -> bool:
        """Report whether the ``pymilvus`` package can be imported."""
        try:
            import pymilvus  # noqa: F401
        except ImportError:
            return False
        return True

    @staticmethod
    def validate_milvus_config(config: dict[str, Any]) -> bool:
        """Return ``True`` when ``config`` contains every mandatory key.

        The only mandatory key is ``collection_name``.
        """
        mandatory_keys = ("collection_name",)
        return all(name in config for name in mandatory_keys)