isage-middleware 0.2.4.3__cp311-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- isage_middleware-0.2.4.3.dist-info/METADATA +266 -0
- isage_middleware-0.2.4.3.dist-info/RECORD +94 -0
- isage_middleware-0.2.4.3.dist-info/WHEEL +5 -0
- isage_middleware-0.2.4.3.dist-info/top_level.txt +1 -0
- sage/middleware/__init__.py +59 -0
- sage/middleware/_version.py +6 -0
- sage/middleware/components/__init__.py +30 -0
- sage/middleware/components/extensions_compat.py +141 -0
- sage/middleware/components/sage_db/__init__.py +116 -0
- sage/middleware/components/sage_db/backend.py +136 -0
- sage/middleware/components/sage_db/service.py +15 -0
- sage/middleware/components/sage_flow/__init__.py +76 -0
- sage/middleware/components/sage_flow/python/__init__.py +14 -0
- sage/middleware/components/sage_flow/python/micro_service/__init__.py +4 -0
- sage/middleware/components/sage_flow/python/micro_service/sage_flow_service.py +88 -0
- sage/middleware/components/sage_flow/python/sage_flow.py +30 -0
- sage/middleware/components/sage_flow/service.py +14 -0
- sage/middleware/components/sage_mem/__init__.py +83 -0
- sage/middleware/components/sage_sias/__init__.py +59 -0
- sage/middleware/components/sage_sias/continual_learner.py +184 -0
- sage/middleware/components/sage_sias/coreset_selector.py +302 -0
- sage/middleware/components/sage_sias/types.py +94 -0
- sage/middleware/components/sage_tsdb/__init__.py +81 -0
- sage/middleware/components/sage_tsdb/python/__init__.py +21 -0
- sage/middleware/components/sage_tsdb/python/_sage_tsdb.pyi +17 -0
- sage/middleware/components/sage_tsdb/python/algorithms/__init__.py +17 -0
- sage/middleware/components/sage_tsdb/python/algorithms/base.py +51 -0
- sage/middleware/components/sage_tsdb/python/algorithms/out_of_order_join.py +248 -0
- sage/middleware/components/sage_tsdb/python/algorithms/window_aggregator.py +296 -0
- sage/middleware/components/sage_tsdb/python/micro_service/__init__.py +7 -0
- sage/middleware/components/sage_tsdb/python/micro_service/sage_tsdb_service.py +365 -0
- sage/middleware/components/sage_tsdb/python/sage_tsdb.py +523 -0
- sage/middleware/components/sage_tsdb/service.py +17 -0
- sage/middleware/components/vector_stores/__init__.py +25 -0
- sage/middleware/components/vector_stores/chroma.py +483 -0
- sage/middleware/components/vector_stores/chroma_adapter.py +185 -0
- sage/middleware/components/vector_stores/milvus.py +677 -0
- sage/middleware/operators/__init__.py +56 -0
- sage/middleware/operators/agent/__init__.py +24 -0
- sage/middleware/operators/agent/planning/__init__.py +5 -0
- sage/middleware/operators/agent/planning/llm_adapter.py +41 -0
- sage/middleware/operators/agent/planning/planner_adapter.py +98 -0
- sage/middleware/operators/agent/planning/router.py +107 -0
- sage/middleware/operators/agent/runtime.py +296 -0
- sage/middleware/operators/agentic/__init__.py +41 -0
- sage/middleware/operators/agentic/config.py +254 -0
- sage/middleware/operators/agentic/planning_operator.py +125 -0
- sage/middleware/operators/agentic/refined_searcher.py +132 -0
- sage/middleware/operators/agentic/runtime.py +241 -0
- sage/middleware/operators/agentic/timing_operator.py +125 -0
- sage/middleware/operators/agentic/tool_selection_operator.py +127 -0
- sage/middleware/operators/context/__init__.py +17 -0
- sage/middleware/operators/context/critic_evaluation.py +16 -0
- sage/middleware/operators/context/model_context.py +565 -0
- sage/middleware/operators/context/quality_label.py +12 -0
- sage/middleware/operators/context/search_query_results.py +61 -0
- sage/middleware/operators/context/search_result.py +42 -0
- sage/middleware/operators/context/search_session.py +79 -0
- sage/middleware/operators/filters/__init__.py +26 -0
- sage/middleware/operators/filters/context_sink.py +387 -0
- sage/middleware/operators/filters/context_source.py +376 -0
- sage/middleware/operators/filters/evaluate_filter.py +83 -0
- sage/middleware/operators/filters/tool_filter.py +74 -0
- sage/middleware/operators/llm/__init__.py +18 -0
- sage/middleware/operators/llm/sagellm_generator.py +432 -0
- sage/middleware/operators/rag/__init__.py +147 -0
- sage/middleware/operators/rag/arxiv.py +331 -0
- sage/middleware/operators/rag/chunk.py +13 -0
- sage/middleware/operators/rag/document_loaders.py +23 -0
- sage/middleware/operators/rag/evaluate.py +658 -0
- sage/middleware/operators/rag/generator.py +340 -0
- sage/middleware/operators/rag/index_builder/__init__.py +48 -0
- sage/middleware/operators/rag/index_builder/builder.py +363 -0
- sage/middleware/operators/rag/index_builder/manifest.py +101 -0
- sage/middleware/operators/rag/index_builder/storage.py +131 -0
- sage/middleware/operators/rag/pipeline.py +46 -0
- sage/middleware/operators/rag/profiler.py +59 -0
- sage/middleware/operators/rag/promptor.py +400 -0
- sage/middleware/operators/rag/refiner.py +231 -0
- sage/middleware/operators/rag/reranker.py +364 -0
- sage/middleware/operators/rag/retriever.py +1308 -0
- sage/middleware/operators/rag/searcher.py +37 -0
- sage/middleware/operators/rag/types.py +28 -0
- sage/middleware/operators/rag/writer.py +80 -0
- sage/middleware/operators/tools/__init__.py +71 -0
- sage/middleware/operators/tools/arxiv_paper_searcher.py +175 -0
- sage/middleware/operators/tools/arxiv_searcher.py +102 -0
- sage/middleware/operators/tools/duckduckgo_searcher.py +105 -0
- sage/middleware/operators/tools/image_captioner.py +104 -0
- sage/middleware/operators/tools/nature_news_fetcher.py +224 -0
- sage/middleware/operators/tools/searcher_tool.py +514 -0
- sage/middleware/operators/tools/text_detector.py +185 -0
- sage/middleware/operators/tools/url_text_extractor.py +104 -0
- sage/middleware/py.typed +2 -0
|
@@ -0,0 +1,677 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Milvus 后端管理工具
|
|
3
|
+
提供 Milvus / Milvus Lite 的初始化、文档管理和检索功能
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
import json
|
|
7
|
+
import logging
|
|
8
|
+
import os
|
|
9
|
+
import time
|
|
10
|
+
from typing import TYPE_CHECKING, Any
|
|
11
|
+
|
|
12
|
+
import numpy as np
|
|
13
|
+
|
|
14
|
+
if TYPE_CHECKING:
|
|
15
|
+
from pymilvus import MilvusClient # noqa: F401
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class MilvusBackend:
|
|
19
|
+
"""Milvus 后端管理器(支持本地 Milvus Lite 与远程 Milvus)"""
|
|
20
|
+
|
|
21
|
+
def __init__(self, config: dict[str, Any], logger: logging.Logger | Any = None):
|
|
22
|
+
"""
|
|
23
|
+
初始化 Milvus 后端
|
|
24
|
+
|
|
25
|
+
Args:
|
|
26
|
+
config: Milvus 配置字典
|
|
27
|
+
logger: 日志记录器
|
|
28
|
+
"""
|
|
29
|
+
self.config = config
|
|
30
|
+
self.logger = logger or logging.getLogger(__name__)
|
|
31
|
+
|
|
32
|
+
# 连接与集合配置
|
|
33
|
+
self.host: str = self.config.get("host", "localhost")
|
|
34
|
+
self.port: int = int(self.config.get("port", 19530))
|
|
35
|
+
self.persistence_path: str | None = self.config.get(
|
|
36
|
+
"persistence_path", "./milvus_db"
|
|
37
|
+
) # 可选,优先级高于 host/port
|
|
38
|
+
|
|
39
|
+
self.collection_name: str = self.config.get("collection_name", "retriever_collection")
|
|
40
|
+
self.dim: int | None = self.config.get("dim", 1024) # 稠密向量维度
|
|
41
|
+
raw_metric_type = self.config.get("metric_type")
|
|
42
|
+
if not raw_metric_type:
|
|
43
|
+
# 未提供则默认 COSINE,不做校验
|
|
44
|
+
metric_type_value: str = "COSINE"
|
|
45
|
+
else:
|
|
46
|
+
# 提供了则严格校验,仅支持 IP/COSINE/L2(大小写不敏感,统一为大写)
|
|
47
|
+
allowed_metric_types = {"IP", "COSINE", "L2"}
|
|
48
|
+
metric_upper = str(raw_metric_type).upper()
|
|
49
|
+
if metric_upper not in allowed_metric_types:
|
|
50
|
+
raise ValueError(
|
|
51
|
+
f"Invalid metric_type: {raw_metric_type}. Allowed: {sorted(allowed_metric_types)}"
|
|
52
|
+
)
|
|
53
|
+
metric_type_value = metric_upper
|
|
54
|
+
self.metric_type: str = metric_type_value
|
|
55
|
+
self.drop_ratio_search = self.config.get(
|
|
56
|
+
"drop_ratio_search", 0.2
|
|
57
|
+
) # 稀疏向量搜索时,drop 比例
|
|
58
|
+
self.search_type = self.config.get("search_type", "sparse") # 搜索类型,sparse 或 dense
|
|
59
|
+
self.dense_insert_batch_size = self.config.get(
|
|
60
|
+
"dense_insert_batch_size", 128
|
|
61
|
+
) # 稠密向量插入批次大小
|
|
62
|
+
|
|
63
|
+
# 客户端
|
|
64
|
+
self.client: Any = None # Will be initialized by _init_client
|
|
65
|
+
self._init_client()
|
|
66
|
+
self._init_collection()
|
|
67
|
+
|
|
68
|
+
def _init_client(self):
|
|
69
|
+
"""初始化 Milvus 客户端,支持 Milvus Lite(本地 .db 文件)与远程服务"""
|
|
70
|
+
try:
|
|
71
|
+
from pymilvus import MilvusClient
|
|
72
|
+
|
|
73
|
+
# 判断使用本地还是远程模式
|
|
74
|
+
if self.host in ["localhost", "127.0.0.1"] and not self.config.get("force_http", False):
|
|
75
|
+
self.client = MilvusClient(self.persistence_path or "./milvus.db")
|
|
76
|
+
self.logger.info(
|
|
77
|
+
f"Initialized Milvus persistent client at: {self.persistence_path}"
|
|
78
|
+
)
|
|
79
|
+
else:
|
|
80
|
+
# 远程服务器模式
|
|
81
|
+
full_host = (
|
|
82
|
+
f"http://{self.host}:{self.port}"
|
|
83
|
+
if not self.host.startswith("http")
|
|
84
|
+
else self.host
|
|
85
|
+
)
|
|
86
|
+
|
|
87
|
+
self.client = MilvusClient(full_host)
|
|
88
|
+
self.logger.info(
|
|
89
|
+
f"Initialized Milvus HTTP client at: http://{self.host}:{self.port}"
|
|
90
|
+
)
|
|
91
|
+
|
|
92
|
+
except ImportError as e:
|
|
93
|
+
self.logger.error(f"Failed to import pymilvus: {e}")
|
|
94
|
+
raise ImportError(
|
|
95
|
+
"Milvus dependencies not available. Install with: pip install pymilvus"
|
|
96
|
+
)
|
|
97
|
+
|
|
98
|
+
except Exception as e:
|
|
99
|
+
self.logger.error(f"Failed to initialize Milvus client: {e}")
|
|
100
|
+
raise
|
|
101
|
+
|
|
102
|
+
def _ensure_client(self):
|
|
103
|
+
"""Ensure client is initialized"""
|
|
104
|
+
if self.client is None:
|
|
105
|
+
raise RuntimeError("Milvus client is not initialized")
|
|
106
|
+
|
|
107
|
+
def _init_collection(self):
|
|
108
|
+
"""初始化或获取 Milvus 集合,必要时创建索引"""
|
|
109
|
+
self._ensure_client()
|
|
110
|
+
try:
|
|
111
|
+
# 尝试直接获取已存在的集合
|
|
112
|
+
try:
|
|
113
|
+
# 通过检查集合是否能正常查询来验证集合存在
|
|
114
|
+
self.client.load_collection(collection_name=self.collection_name) # type: ignore
|
|
115
|
+
self.logger.info(f"Retrieved existing Milvus collection: {self.collection_name}")
|
|
116
|
+
return
|
|
117
|
+
except Exception:
|
|
118
|
+
# 集合不存在,需要创建新集合
|
|
119
|
+
self.logger.debug(
|
|
120
|
+
f"Collection '{self.collection_name}' does not exist, creating new one"
|
|
121
|
+
)
|
|
122
|
+
# 创建集合 - 导入必要的数据类型
|
|
123
|
+
try:
|
|
124
|
+
from pymilvus import DataType
|
|
125
|
+
except Exception as e:
|
|
126
|
+
self.logger.error(f"Failed to import PyMilvus schema classes: {e}")
|
|
127
|
+
raise
|
|
128
|
+
|
|
129
|
+
try:
|
|
130
|
+
schema = self.client.create_schema(auto_id=False)
|
|
131
|
+
index_params = self.client.prepare_index_params()
|
|
132
|
+
schema.add_field(
|
|
133
|
+
"id",
|
|
134
|
+
DataType.VARCHAR,
|
|
135
|
+
max_length=2000,
|
|
136
|
+
is_primary=True,
|
|
137
|
+
auto_id=False,
|
|
138
|
+
)
|
|
139
|
+
schema.add_field("text", DataType.VARCHAR, max_length=2000) # 文本字段
|
|
140
|
+
self.logger.info(self.search_type + "=" * 60)
|
|
141
|
+
if self.search_type == "sparse":
|
|
142
|
+
schema.add_field("sparse", DataType.SPARSE_FLOAT_VECTOR) # 稀疏向量字段
|
|
143
|
+
|
|
144
|
+
index_params.add_index(
|
|
145
|
+
field_name="sparse",
|
|
146
|
+
index_type="SPARSE_INVERTED_INDEX",
|
|
147
|
+
metric_type="IP",
|
|
148
|
+
)
|
|
149
|
+
|
|
150
|
+
if self.search_type == "dense":
|
|
151
|
+
schema.add_field("dense", DataType.FLOAT_VECTOR, dim=self.dim) # 稠密向量
|
|
152
|
+
|
|
153
|
+
index_params.add_index(
|
|
154
|
+
field_name="dense",
|
|
155
|
+
index_type="AUTOINDEX",
|
|
156
|
+
metric_type=self.metric_type,
|
|
157
|
+
)
|
|
158
|
+
|
|
159
|
+
self.client.create_collection(
|
|
160
|
+
collection_name=self.collection_name,
|
|
161
|
+
schema=schema,
|
|
162
|
+
index_params=index_params,
|
|
163
|
+
)
|
|
164
|
+
|
|
165
|
+
# 创建集合后,立即加载以确保可以使用
|
|
166
|
+
self.client.load_collection(collection_name=self.collection_name)
|
|
167
|
+
|
|
168
|
+
self.logger.info(
|
|
169
|
+
f"Created and loaded new Milvus collection {self.collection_name} successfully!"
|
|
170
|
+
)
|
|
171
|
+
except Exception as e:
|
|
172
|
+
self.logger.error(f"Failed to create Milvus collection: {e}")
|
|
173
|
+
raise
|
|
174
|
+
except Exception as e:
|
|
175
|
+
self.logger.error(f"Failed to initialize Milvus collection: {e}")
|
|
176
|
+
raise
|
|
177
|
+
|
|
178
|
+
def add_dense_documents(
|
|
179
|
+
self,
|
|
180
|
+
documents: list[str],
|
|
181
|
+
dense_embeddings: list[np.ndarray],
|
|
182
|
+
doc_ids: list[str],
|
|
183
|
+
) -> list[str]:
|
|
184
|
+
"""
|
|
185
|
+
添加稠密向量文档,防止内存溢出,分批插入
|
|
186
|
+
|
|
187
|
+
Args:
|
|
188
|
+
documents: 文本列表
|
|
189
|
+
embeddings: 向量列表(list[float] 或可转 list)
|
|
190
|
+
doc_ids: 文档 ID 列表
|
|
191
|
+
Returns:
|
|
192
|
+
成功插入的文档 ID 列表
|
|
193
|
+
"""
|
|
194
|
+
try:
|
|
195
|
+
# 转换 embedding 格式(milvus 需要 list 格式)
|
|
196
|
+
dense_embeddings_list = [embedding.tolist() for embedding in dense_embeddings]
|
|
197
|
+
docs = []
|
|
198
|
+
# 生成文档ID
|
|
199
|
+
doc_ids = [f"doc_{int(time.time() * 1000)}_{i}" for i in range(len(documents))]
|
|
200
|
+
for i in range(len(documents)):
|
|
201
|
+
docs.append(
|
|
202
|
+
{
|
|
203
|
+
"id": doc_ids[i],
|
|
204
|
+
"text": documents[i],
|
|
205
|
+
"dense": dense_embeddings_list[i],
|
|
206
|
+
}
|
|
207
|
+
)
|
|
208
|
+
|
|
209
|
+
if len(docs) > self.dense_insert_batch_size:
|
|
210
|
+
for i in range(0, len(docs), self.dense_insert_batch_size):
|
|
211
|
+
self.client.insert(
|
|
212
|
+
collection_name=self.collection_name,
|
|
213
|
+
data=docs[i : i + self.dense_insert_batch_size],
|
|
214
|
+
)
|
|
215
|
+
else:
|
|
216
|
+
self.client.insert(collection_name=self.collection_name, data=docs)
|
|
217
|
+
|
|
218
|
+
self.logger.info(
|
|
219
|
+
f"Added {len(docs)} documents to Milvus collection {self.collection_name}"
|
|
220
|
+
)
|
|
221
|
+
return doc_ids
|
|
222
|
+
except Exception as e:
|
|
223
|
+
self.logger.error(f"Error adding dense documents to Milvus: {e}")
|
|
224
|
+
return []
|
|
225
|
+
|
|
226
|
+
def add_sparse_documents(
|
|
227
|
+
self, documents: list[str], sparse_embeddings, doc_ids: list[str]
|
|
228
|
+
) -> list[str]:
|
|
229
|
+
"""
|
|
230
|
+
添加稀疏向量文档
|
|
231
|
+
|
|
232
|
+
Args:
|
|
233
|
+
documents: 文本列表
|
|
234
|
+
sparse_embeddings: 稀疏向量列表(来自BGEM3EmbeddingFunction)
|
|
235
|
+
doc_ids: 文档 ID 列表
|
|
236
|
+
Returns:
|
|
237
|
+
成功插入的文档 ID 列表
|
|
238
|
+
"""
|
|
239
|
+
try:
|
|
240
|
+
docs = []
|
|
241
|
+
for i in range(len(documents)):
|
|
242
|
+
# 处理稀疏向量格式
|
|
243
|
+
sparse_vector = sparse_embeddings[i]
|
|
244
|
+
|
|
245
|
+
# 如果是scipy稀疏矩阵(csr_array/csr_matrix),转换为字典格式
|
|
246
|
+
if hasattr(sparse_vector, "tocoo"):
|
|
247
|
+
# scipy sparse matrix to dict
|
|
248
|
+
coo = sparse_vector.tocoo()
|
|
249
|
+
sparse_dict = {
|
|
250
|
+
int(idx): float(val) for idx, val in zip(coo.col, coo.data, strict=False)
|
|
251
|
+
}
|
|
252
|
+
elif hasattr(sparse_vector, "indices") and hasattr(sparse_vector, "data"):
|
|
253
|
+
# 处理 csr_array 格式
|
|
254
|
+
sparse_dict = {
|
|
255
|
+
int(idx): float(val)
|
|
256
|
+
for idx, val in zip(sparse_vector.indices, sparse_vector.data, strict=False)
|
|
257
|
+
}
|
|
258
|
+
elif isinstance(sparse_vector, dict):
|
|
259
|
+
# 已经是字典格式
|
|
260
|
+
sparse_dict = sparse_vector
|
|
261
|
+
else:
|
|
262
|
+
# 尝试转换为字典
|
|
263
|
+
self.logger.warning(f"Unknown sparse vector format: {type(sparse_vector)}")
|
|
264
|
+
sparse_dict = dict(sparse_vector) if hasattr(sparse_vector, "__iter__") else {}
|
|
265
|
+
|
|
266
|
+
docs.append({"id": doc_ids[i], "text": documents[i], "sparse": sparse_dict})
|
|
267
|
+
|
|
268
|
+
# 插入数据
|
|
269
|
+
self.client.insert(collection_name=self.collection_name, data=docs)
|
|
270
|
+
self.logger.info(
|
|
271
|
+
f"Added {len(docs)} documents to Milvus collection {self.collection_name}"
|
|
272
|
+
)
|
|
273
|
+
|
|
274
|
+
return doc_ids
|
|
275
|
+
except Exception as e:
|
|
276
|
+
self.logger.error(f"Error adding sparse documents to Milvus: {e}")
|
|
277
|
+
return []
|
|
278
|
+
|
|
279
|
+
def sparse_search(self, query_text: str, top_k: int) -> list[str]:
|
|
280
|
+
"""
|
|
281
|
+
在 Milvus 中执行稀疏向量搜索
|
|
282
|
+
|
|
283
|
+
Args:
|
|
284
|
+
query_text: 查询文本
|
|
285
|
+
top_k: 返回的文档数量
|
|
286
|
+
|
|
287
|
+
Returns:
|
|
288
|
+
文本结果列表
|
|
289
|
+
"""
|
|
290
|
+
try:
|
|
291
|
+
# 使用 BGEM3EmbeddingFunction 生成查询向量
|
|
292
|
+
try:
|
|
293
|
+
from pymilvus.model.hybrid import (
|
|
294
|
+
BGEM3EmbeddingFunction, # type: ignore[import-not-found]
|
|
295
|
+
)
|
|
296
|
+
except ImportError:
|
|
297
|
+
try:
|
|
298
|
+
from pymilvus.model import (
|
|
299
|
+
BGEM3EmbeddingFunction, # type: ignore[import-not-found]
|
|
300
|
+
)
|
|
301
|
+
except ImportError:
|
|
302
|
+
self.logger.error(
|
|
303
|
+
"Please install: pip install 'pymilvus[model]' or pip install pymilvus.model"
|
|
304
|
+
)
|
|
305
|
+
return []
|
|
306
|
+
|
|
307
|
+
embedding_model = BGEM3EmbeddingFunction(use_fp16=False, device="cpu")
|
|
308
|
+
query_embeddings = embedding_model.encode_queries([query_text])
|
|
309
|
+
|
|
310
|
+
# 提取稀疏向量
|
|
311
|
+
if isinstance(query_embeddings, dict) and "sparse" in query_embeddings:
|
|
312
|
+
sparse_vector = query_embeddings["sparse"][0]
|
|
313
|
+
else:
|
|
314
|
+
sparse_vector = query_embeddings[0]
|
|
315
|
+
|
|
316
|
+
# 处理稀疏向量格式转换为字典
|
|
317
|
+
if hasattr(sparse_vector, "tocoo"):
|
|
318
|
+
# scipy sparse matrix to dict
|
|
319
|
+
coo = sparse_vector.tocoo()
|
|
320
|
+
query_vector = {
|
|
321
|
+
int(idx): float(val) for idx, val in zip(coo.col, coo.data, strict=False)
|
|
322
|
+
}
|
|
323
|
+
elif hasattr(sparse_vector, "indices") and hasattr(sparse_vector, "data"):
|
|
324
|
+
# 处理 csr_array 格式
|
|
325
|
+
query_vector = {
|
|
326
|
+
int(idx): float(val)
|
|
327
|
+
for idx, val in zip(sparse_vector.indices, sparse_vector.data, strict=False)
|
|
328
|
+
}
|
|
329
|
+
elif isinstance(sparse_vector, dict):
|
|
330
|
+
# 已经是字典格式
|
|
331
|
+
query_vector = sparse_vector
|
|
332
|
+
else:
|
|
333
|
+
# 尝试转换为字典
|
|
334
|
+
self.logger.warning(f"Unknown sparse vector format: {type(sparse_vector)}")
|
|
335
|
+
query_vector = dict(sparse_vector) if hasattr(sparse_vector, "__iter__") else {}
|
|
336
|
+
|
|
337
|
+
# 执行搜索
|
|
338
|
+
hits = self.client.search(
|
|
339
|
+
collection_name=self.collection_name,
|
|
340
|
+
data=[query_vector],
|
|
341
|
+
anns_field="sparse",
|
|
342
|
+
search_params={"metric_type": "IP", "params": {}},
|
|
343
|
+
limit=top_k,
|
|
344
|
+
output_fields=["text"],
|
|
345
|
+
)
|
|
346
|
+
|
|
347
|
+
results = hits[0]
|
|
348
|
+
sparse_results = []
|
|
349
|
+
|
|
350
|
+
if results and len(results) > 0:
|
|
351
|
+
for r in results:
|
|
352
|
+
sparse_results.append(r.entity.get("text")) # type: ignore[union-attr]
|
|
353
|
+
return sparse_results
|
|
354
|
+
except Exception as e:
|
|
355
|
+
self.logger.error(f"Error executing Milvus sparse search: {e}")
|
|
356
|
+
return []
|
|
357
|
+
|
|
358
|
+
def dense_search(self, query_vector: np.ndarray, top_k: int) -> list[str]:
|
|
359
|
+
"""
|
|
360
|
+
在 Milvus 中执行稠密向量搜索
|
|
361
|
+
|
|
362
|
+
Args:
|
|
363
|
+
query_vector: 查询向量
|
|
364
|
+
top_k: 返回的文档数量
|
|
365
|
+
|
|
366
|
+
Returns:
|
|
367
|
+
文本结果列表
|
|
368
|
+
"""
|
|
369
|
+
try:
|
|
370
|
+
print(f"MilvusBackend.search: using top_k = {top_k}")
|
|
371
|
+
|
|
372
|
+
hits = self.client.search(
|
|
373
|
+
collection_name=self.collection_name,
|
|
374
|
+
data=[query_vector],
|
|
375
|
+
anns_field="dense",
|
|
376
|
+
search_params={"metric_type": self.metric_type, "params": {}},
|
|
377
|
+
limit=top_k,
|
|
378
|
+
output_fields=["text"],
|
|
379
|
+
)
|
|
380
|
+
|
|
381
|
+
results = hits[0]
|
|
382
|
+
dense_results = []
|
|
383
|
+
if results and len(results) > 0:
|
|
384
|
+
for r in results:
|
|
385
|
+
dense_results.append(r.entity.get("text")) # type: ignore[union-attr]
|
|
386
|
+
return dense_results
|
|
387
|
+
except Exception as e:
|
|
388
|
+
self.logger.error(f"Error executing Milvus search: {e}")
|
|
389
|
+
return []
|
|
390
|
+
|
|
391
|
+
def delete_collection(self, collection_name: str) -> bool:
|
|
392
|
+
"""删除当前集合"""
|
|
393
|
+
try:
|
|
394
|
+
self.client.drop_collection(collection_name)
|
|
395
|
+
self.logger.info(f"Deleted Milvus collection: {collection_name}")
|
|
396
|
+
return True
|
|
397
|
+
except Exception as e:
|
|
398
|
+
self.logger.error(f"Error deleting Milvus collection: {e}")
|
|
399
|
+
return False
|
|
400
|
+
|
|
401
|
+
def get_collection_info(self) -> dict[str, Any]:
|
|
402
|
+
"""
|
|
403
|
+
获取集合信息
|
|
404
|
+
|
|
405
|
+
Returns:
|
|
406
|
+
包含集合信息的字典
|
|
407
|
+
"""
|
|
408
|
+
try:
|
|
409
|
+
count = None
|
|
410
|
+
try:
|
|
411
|
+
stats = self.client.get_collection_stats(self.collection_name)
|
|
412
|
+
count = stats.get("row_count") if isinstance(stats, dict) else None
|
|
413
|
+
except Exception:
|
|
414
|
+
pass
|
|
415
|
+
return {
|
|
416
|
+
"backend": "milvus",
|
|
417
|
+
"collection_name": self.collection_name,
|
|
418
|
+
"document_count": count,
|
|
419
|
+
"persistence_path": (
|
|
420
|
+
self.persistence_path if hasattr(self, "persistence_path") else None
|
|
421
|
+
),
|
|
422
|
+
}
|
|
423
|
+
except Exception as e:
|
|
424
|
+
self.logger.error(f"Failed to get Milvus collection info: {e}")
|
|
425
|
+
return {"backend": "milvus", "error": str(e)}
|
|
426
|
+
|
|
427
|
+
def save_config(self, save_path: str) -> bool:
|
|
428
|
+
"""
|
|
429
|
+
保存 milvus 配置信息
|
|
430
|
+
|
|
431
|
+
Args:
|
|
432
|
+
save_path: 保存路径
|
|
433
|
+
|
|
434
|
+
Returns:
|
|
435
|
+
是否保存成功
|
|
436
|
+
"""
|
|
437
|
+
try:
|
|
438
|
+
os.makedirs(save_path, exist_ok=True)
|
|
439
|
+
config_path = os.path.join(save_path, "milvus_config.json")
|
|
440
|
+
stats = self.client.get_collection_stats(self.collection_name)
|
|
441
|
+
count = stats.get("row_count") if isinstance(stats, dict) else None
|
|
442
|
+
config_info = {
|
|
443
|
+
"collection_name": self.collection_name,
|
|
444
|
+
"collection_count": count,
|
|
445
|
+
"backend_type": "milvus",
|
|
446
|
+
"milvus_config": self.config,
|
|
447
|
+
"saved_time": time.time(),
|
|
448
|
+
}
|
|
449
|
+
with open(config_path, "w", encoding="utf-8") as f:
|
|
450
|
+
json.dump(config_info, f, ensure_ascii=False, indent=2)
|
|
451
|
+
self.logger.info(f"Successfully saved Milvus config to: {save_path}")
|
|
452
|
+
self.logger.info(
|
|
453
|
+
f"Milvus collection '{self.collection_name}' contains {config_info['collection_count']} documents"
|
|
454
|
+
)
|
|
455
|
+
return True
|
|
456
|
+
except Exception as e:
|
|
457
|
+
self.logger.error(f"Failed to save Milvus config: {e}")
|
|
458
|
+
return False
|
|
459
|
+
|
|
460
|
+
def load_config(self, load_path: str) -> bool:
|
|
461
|
+
"""
|
|
462
|
+
从配置文件重新连接到 Milvus 集合
|
|
463
|
+
|
|
464
|
+
Args:
|
|
465
|
+
load_path: 配置文件路径
|
|
466
|
+
|
|
467
|
+
Returns:
|
|
468
|
+
是否加载成功
|
|
469
|
+
"""
|
|
470
|
+
try:
|
|
471
|
+
config_path = os.path.join(load_path, "milvus_config.json")
|
|
472
|
+
if os.path.exists(config_path):
|
|
473
|
+
with open(config_path, encoding="utf-8") as f:
|
|
474
|
+
config_info = json.load(f)
|
|
475
|
+
collection_name = config_info.get("collection_name")
|
|
476
|
+
if collection_name:
|
|
477
|
+
self.collection_name = collection_name
|
|
478
|
+
self.client.load_collection(collection_name=self.collection_name)
|
|
479
|
+
stats = self.client.get_collection_stats(self.collection_name)
|
|
480
|
+
count = stats.get("row_count") if isinstance(stats, dict) else None
|
|
481
|
+
self.logger.info(
|
|
482
|
+
f"Reloaded Milvus collection name from config: {self.collection_name}"
|
|
483
|
+
)
|
|
484
|
+
self.logger.info(f"Collection contains {count} documents")
|
|
485
|
+
return True
|
|
486
|
+
else:
|
|
487
|
+
self.logger.error("No collection name found in Milvus config")
|
|
488
|
+
return False
|
|
489
|
+
else:
|
|
490
|
+
self.logger.error(f"Milvus config not found at: {config_path}")
|
|
491
|
+
return False
|
|
492
|
+
except Exception as e:
|
|
493
|
+
self.logger.error(f"Failed to load Milvus config: {e}")
|
|
494
|
+
return False
|
|
495
|
+
|
|
496
|
+
def load_knowledge_from_file_dense(self, file_path: str, embedding_model) -> bool:
|
|
497
|
+
"""
|
|
498
|
+
从文件加载知识库到 Milvus
|
|
499
|
+
|
|
500
|
+
Args:
|
|
501
|
+
file_path: 知识库文件路径
|
|
502
|
+
embedding_model: 嵌入模型实例
|
|
503
|
+
|
|
504
|
+
Returns:
|
|
505
|
+
是否加载成功
|
|
506
|
+
"""
|
|
507
|
+
try:
|
|
508
|
+
self.logger.info(f"Loading knowledge from file: {file_path}")
|
|
509
|
+
with open(file_path, encoding="utf-8") as f:
|
|
510
|
+
content = f.read()
|
|
511
|
+
|
|
512
|
+
# 将知识库按段落分割
|
|
513
|
+
documents = [doc.strip() for doc in content.split("\n\n") if doc.strip()]
|
|
514
|
+
|
|
515
|
+
if documents:
|
|
516
|
+
# 生成文档ID
|
|
517
|
+
doc_ids = [f"doc_{int(time.time() * 1000)}_{i}" for i in range(len(documents))]
|
|
518
|
+
|
|
519
|
+
# 生成 embedding
|
|
520
|
+
embeddings = []
|
|
521
|
+
for doc in documents:
|
|
522
|
+
embedding = embedding_model.embed(doc)
|
|
523
|
+
embeddings.append(np.array(embedding, dtype=np.float32))
|
|
524
|
+
|
|
525
|
+
# dense 向量添加到 Milvus
|
|
526
|
+
added_dense_ids = self.add_dense_documents(documents, embeddings, doc_ids)
|
|
527
|
+
|
|
528
|
+
if added_dense_ids:
|
|
529
|
+
self.logger.info(
|
|
530
|
+
f"Loaded {len(added_dense_ids)} dense documents from {file_path}"
|
|
531
|
+
)
|
|
532
|
+
return True
|
|
533
|
+
else:
|
|
534
|
+
self.logger.error(f"Failed to add documents from {file_path}")
|
|
535
|
+
return False
|
|
536
|
+
else:
|
|
537
|
+
self.logger.warning(f"No valid documents found in {file_path}")
|
|
538
|
+
return False
|
|
539
|
+
|
|
540
|
+
except Exception as e:
|
|
541
|
+
self.logger.error(f"Failed to load knowledge from file {file_path}: {e}")
|
|
542
|
+
return False
|
|
543
|
+
|
|
544
|
+
def load_knowledge_from_file_sparse(self, file_path: str) -> bool:
|
|
545
|
+
"""
|
|
546
|
+
从文件加载知识库到 Milvus
|
|
547
|
+
|
|
548
|
+
Args:
|
|
549
|
+
file_path: 知识库文件路径
|
|
550
|
+
|
|
551
|
+
Returns:
|
|
552
|
+
是否加载成功
|
|
553
|
+
"""
|
|
554
|
+
try:
|
|
555
|
+
self.logger.info(f"Loading knowledge from file: {file_path}")
|
|
556
|
+
with open(file_path, encoding="utf-8") as f:
|
|
557
|
+
content = f.read()
|
|
558
|
+
|
|
559
|
+
# 将知识库按段落分割
|
|
560
|
+
documents = [doc.strip() for doc in content.split("\n\n") if doc.strip()]
|
|
561
|
+
|
|
562
|
+
if documents:
|
|
563
|
+
# 生成文档ID
|
|
564
|
+
doc_ids = [f"doc_{int(time.time() * 1000)}_{i}" for i in range(len(documents))]
|
|
565
|
+
|
|
566
|
+
try:
|
|
567
|
+
from pymilvus.model.hybrid import BGEM3EmbeddingFunction
|
|
568
|
+
|
|
569
|
+
embedding_model = BGEM3EmbeddingFunction(use_fp16=False, device="cpu")
|
|
570
|
+
|
|
571
|
+
# 生成 sparse embedding
|
|
572
|
+
sparse_embeddings = embedding_model.encode_documents(documents)
|
|
573
|
+
|
|
574
|
+
# 提取稀疏向量部分
|
|
575
|
+
if isinstance(sparse_embeddings, dict) and "sparse" in sparse_embeddings:
|
|
576
|
+
embeddings = sparse_embeddings["sparse"]
|
|
577
|
+
else:
|
|
578
|
+
# 如果返回格式不同,直接使用
|
|
579
|
+
embeddings = sparse_embeddings
|
|
580
|
+
|
|
581
|
+
except Exception as e:
|
|
582
|
+
self.logger.error(f"Failed to import or use BGEM3EmbeddingFunction: {e}")
|
|
583
|
+
raise
|
|
584
|
+
|
|
585
|
+
# sparse 向量添加到 Milvus
|
|
586
|
+
added_sparse_ids = self.add_sparse_documents(documents, embeddings, doc_ids)
|
|
587
|
+
|
|
588
|
+
if added_sparse_ids:
|
|
589
|
+
self.logger.info(
|
|
590
|
+
f"Loaded {len(added_sparse_ids)} sparse documents from {file_path}"
|
|
591
|
+
)
|
|
592
|
+
return True
|
|
593
|
+
else:
|
|
594
|
+
self.logger.error(f"Failed to add documents from {file_path}")
|
|
595
|
+
return False
|
|
596
|
+
else:
|
|
597
|
+
self.logger.warning(f"No valid documents found in {file_path}")
|
|
598
|
+
return False
|
|
599
|
+
|
|
600
|
+
except Exception as e:
|
|
601
|
+
self.logger.error(f"Failed to load knowledge from file {file_path}: {e}")
|
|
602
|
+
return False
|
|
603
|
+
|
|
604
|
+
def clear_collection(self) -> bool:
|
|
605
|
+
"""清空集合中的所有文档,保留集合结构与索引"""
|
|
606
|
+
try:
|
|
607
|
+
# 通过过滤条件删除全部实体(匹配所有非空字符串id)
|
|
608
|
+
self.client.delete(collection_name=self.collection_name, filter='id != ""')
|
|
609
|
+
self.logger.info(f"Cleared documents in Milvus collection '{self.collection_name}'")
|
|
610
|
+
return True
|
|
611
|
+
except Exception as e:
|
|
612
|
+
self.logger.error(f"Failed to clear Milvus collection: {e}")
|
|
613
|
+
return False
|
|
614
|
+
|
|
615
|
+
def update_document(self, doc_id: str, new_content: str, new_embedding: np.ndarray) -> bool:
|
|
616
|
+
"""
|
|
617
|
+
更新指定文档
|
|
618
|
+
"""
|
|
619
|
+
try:
|
|
620
|
+
self.client.upsert( # type: ignore[attr-defined]
|
|
621
|
+
collection_name=self.collection_name,
|
|
622
|
+
data=[{"id": doc_id, "text": new_content, "dense": new_embedding.tolist()}],
|
|
623
|
+
)
|
|
624
|
+
self.logger.info(
|
|
625
|
+
f"Updated document {doc_id} in Milvus collection '{self.collection_name}'"
|
|
626
|
+
)
|
|
627
|
+
return True
|
|
628
|
+
except Exception as e:
|
|
629
|
+
self.logger.error(
|
|
630
|
+
f"Failed to update document {doc_id} in Milvus collection '{self.collection_name}': {e}"
|
|
631
|
+
)
|
|
632
|
+
return False
|
|
633
|
+
|
|
634
|
+
def delete_document(self, doc_id: str) -> bool:
|
|
635
|
+
"""
|
|
636
|
+
删除指定文档
|
|
637
|
+
"""
|
|
638
|
+
try:
|
|
639
|
+
self.client.delete(collection_name=self.collection_name, filter=f'id == "{doc_id}"')
|
|
640
|
+
self.logger.info(
|
|
641
|
+
f"Deleted document {doc_id} in Milvus collection '{self.collection_name}'"
|
|
642
|
+
)
|
|
643
|
+
return True
|
|
644
|
+
except Exception as e:
|
|
645
|
+
self.logger.error(
|
|
646
|
+
f"Failed to delete document {doc_id} in Milvus collection '{self.collection_name}': {e}"
|
|
647
|
+
)
|
|
648
|
+
return False
|
|
649
|
+
|
|
650
|
+
|
|
651
|
+
class MilvusUtils:
    """Stateless helpers for working with Milvus."""

    @staticmethod
    def check_milvus_available() -> bool:
        """Return ``True`` when the ``pymilvus`` package can be imported."""
        try:
            import pymilvus  # noqa: F401
        except ImportError:
            return False
        return True

    @staticmethod
    def validate_milvus_config(config: dict[str, Any]) -> bool:
        """Return ``True`` when every required key is present in ``config``.

        Only key presence is checked; values are not inspected.
        """
        mandatory = ("collection_name",)
        return all(name in config for name in mandatory)
|