isage-middleware 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of isage-middleware might be problematic. Click here for more details.
- isage_middleware-0.1.0.dist-info/METADATA +424 -0
- isage_middleware-0.1.0.dist-info/RECORD +191 -0
- isage_middleware-0.1.0.dist-info/WHEEL +5 -0
- isage_middleware-0.1.0.dist-info/top_level.txt +1 -0
- sage/__init__.py +2 -0
- sage/__pycache__/__init__.cpython-311.opt-2.pyc +0 -0
- sage/__pycache__/__init__.cpython-311.pyc +0 -0
- sage/middleware/__init__.py +83 -0
- sage/middleware/__pycache__/__init__.cpython-311.opt-2.pyc +0 -0
- sage/middleware/__pycache__/__init__.cpython-311.pyc +0 -0
- sage/middleware/api/__init__.py +22 -0
- sage/middleware/api/__pycache__/__init__.cpython-311.opt-2.pyc +0 -0
- sage/middleware/api/__pycache__/__init__.cpython-311.pyc +0 -0
- sage/middleware/api/__pycache__/graph_api.cpython-311.opt-2.pyc +0 -0
- sage/middleware/api/__pycache__/graph_api.cpython-311.pyc +0 -0
- sage/middleware/api/__pycache__/kv_api.cpython-311.opt-2.pyc +0 -0
- sage/middleware/api/__pycache__/kv_api.cpython-311.pyc +0 -0
- sage/middleware/api/__pycache__/memory_api.cpython-311.opt-2.pyc +0 -0
- sage/middleware/api/__pycache__/memory_api.cpython-311.pyc +0 -0
- sage/middleware/api/__pycache__/vdb_api.cpython-311.opt-2.pyc +0 -0
- sage/middleware/api/__pycache__/vdb_api.cpython-311.pyc +0 -0
- sage/middleware/api/graph_api.py +74 -0
- sage/middleware/api/kv_api.py +45 -0
- sage/middleware/api/memory_api.py +64 -0
- sage/middleware/api/vdb_api.py +60 -0
- sage/middleware/enterprise/__init__.py +75 -0
- sage/middleware/enterprise/__pycache__/__init__.cpython-311.opt-2.pyc +0 -0
- sage/middleware/enterprise/__pycache__/__init__.cpython-311.pyc +0 -0
- sage/middleware/enterprise/sage_db/__init__.py +132 -0
- sage/middleware/enterprise/sage_db/__pycache__/__init__.cpython-311.opt-2.pyc +0 -0
- sage/middleware/enterprise/sage_db/__pycache__/__init__.cpython-311.pyc +0 -0
- sage/middleware/enterprise/sage_db/__pycache__/sage_db.cpython-311.opt-2.pyc +0 -0
- sage/middleware/enterprise/sage_db/__pycache__/sage_db.cpython-311.pyc +0 -0
- sage/middleware/enterprise/sage_db/python/__init__.py +7 -0
- sage/middleware/enterprise/sage_db/python/__pycache__/__init__.cpython-311.opt-2.pyc +0 -0
- sage/middleware/enterprise/sage_db/python/__pycache__/__init__.cpython-311.pyc +0 -0
- sage/middleware/enterprise/sage_db/python/__pycache__/sage_db.cpython-311.opt-2.pyc +0 -0
- sage/middleware/enterprise/sage_db/python/__pycache__/sage_db.cpython-311.pyc +0 -0
- sage/middleware/enterprise/sage_db/python/sage_db.py +44 -0
- sage/middleware/enterprise/sage_db/sage_db.py +395 -0
- sage/middleware/enterprise/sage_db/tests/__pycache__/test_python.cpython-311.opt-2.pyc +0 -0
- sage/middleware/enterprise/sage_db/tests/__pycache__/test_python.cpython-311.pyc +0 -0
- sage/middleware/enterprise/sage_db/tests/test_python.py +144 -0
- sage/middleware/examples/__pycache__/api_usage_tutorial.cpython-311.opt-2.pyc +0 -0
- sage/middleware/examples/__pycache__/api_usage_tutorial.cpython-311.pyc +0 -0
- sage/middleware/examples/__pycache__/dag_microservices_demo.cpython-311.opt-2.pyc +0 -0
- sage/middleware/examples/__pycache__/dag_microservices_demo.cpython-311.pyc +0 -0
- sage/middleware/examples/__pycache__/microservices_demo.cpython-311.opt-2.pyc +0 -0
- sage/middleware/examples/__pycache__/microservices_demo.cpython-311.pyc +0 -0
- sage/middleware/examples/__pycache__/microservices_integration_demo.cpython-311.opt-2.pyc +0 -0
- sage/middleware/examples/__pycache__/microservices_integration_demo.cpython-311.pyc +0 -0
- sage/middleware/examples/__pycache__/microservices_registration_demo.cpython-311.opt-2.pyc +0 -0
- sage/middleware/examples/__pycache__/microservices_registration_demo.cpython-311.pyc +0 -0
- sage/middleware/examples/api_usage_tutorial.py +339 -0
- sage/middleware/examples/dag_microservices_demo.py +220 -0
- sage/middleware/examples/microservices_demo.py +0 -0
- sage/middleware/examples/microservices_integration_demo.py +373 -0
- sage/middleware/examples/microservices_registration_demo.py +144 -0
- sage/middleware/py.typed +2 -0
- sage/middleware/services/graph/__init__.py +8 -0
- sage/middleware/services/graph/__pycache__/__init__.cpython-311.opt-2.pyc +0 -0
- sage/middleware/services/graph/__pycache__/__init__.cpython-311.pyc +0 -0
- sage/middleware/services/graph/__pycache__/graph_index.cpython-311.opt-2.pyc +0 -0
- sage/middleware/services/graph/__pycache__/graph_index.cpython-311.pyc +0 -0
- sage/middleware/services/graph/__pycache__/graph_service.cpython-311.opt-2.pyc +0 -0
- sage/middleware/services/graph/__pycache__/graph_service.cpython-311.pyc +0 -0
- sage/middleware/services/graph/examples/__pycache__/graph_demo.cpython-311.opt-2.pyc +0 -0
- sage/middleware/services/graph/examples/__pycache__/graph_demo.cpython-311.pyc +0 -0
- sage/middleware/services/graph/examples/graph_demo.py +177 -0
- sage/middleware/services/graph/graph_index.py +194 -0
- sage/middleware/services/graph/graph_service.py +541 -0
- sage/middleware/services/graph/search_engine/__init__.py +0 -0
- sage/middleware/services/graph/search_engine/__pycache__/__init__.cpython-311.opt-2.pyc +0 -0
- sage/middleware/services/graph/search_engine/__pycache__/__init__.cpython-311.pyc +0 -0
- sage/middleware/services/graph/search_engine/__pycache__/base_graph_index.cpython-311.opt-2.pyc +0 -0
- sage/middleware/services/graph/search_engine/__pycache__/base_graph_index.cpython-311.pyc +0 -0
- sage/middleware/services/graph/search_engine/base_graph_index.py +0 -0
- sage/middleware/services/kv/__init__.py +8 -0
- sage/middleware/services/kv/__pycache__/__init__.cpython-311.opt-2.pyc +0 -0
- sage/middleware/services/kv/__pycache__/__init__.cpython-311.pyc +0 -0
- sage/middleware/services/kv/__pycache__/kv_service.cpython-311.opt-2.pyc +0 -0
- sage/middleware/services/kv/__pycache__/kv_service.cpython-311.pyc +0 -0
- sage/middleware/services/kv/examples/__pycache__/kv_demo.cpython-311.opt-2.pyc +0 -0
- sage/middleware/services/kv/examples/__pycache__/kv_demo.cpython-311.pyc +0 -0
- sage/middleware/services/kv/examples/kv_demo.py +213 -0
- sage/middleware/services/kv/kv_service.py +306 -0
- sage/middleware/services/kv/search_engine/__init__.py +0 -0
- sage/middleware/services/kv/search_engine/__pycache__/__init__.cpython-311.opt-2.pyc +0 -0
- sage/middleware/services/kv/search_engine/__pycache__/__init__.cpython-311.pyc +0 -0
- sage/middleware/services/kv/search_engine/__pycache__/base_kv_index.cpython-311.opt-2.pyc +0 -0
- sage/middleware/services/kv/search_engine/__pycache__/base_kv_index.cpython-311.pyc +0 -0
- sage/middleware/services/kv/search_engine/__pycache__/bm25s_index.cpython-311.opt-2.pyc +0 -0
- sage/middleware/services/kv/search_engine/__pycache__/bm25s_index.cpython-311.pyc +0 -0
- sage/middleware/services/kv/search_engine/base_kv_index.py +75 -0
- sage/middleware/services/kv/search_engine/bm25s_index.py +238 -0
- sage/middleware/services/memory/__init__.py +12 -0
- sage/middleware/services/memory/__pycache__/__init__.cpython-311.opt-2.pyc +0 -0
- sage/middleware/services/memory/__pycache__/__init__.cpython-311.pyc +0 -0
- sage/middleware/services/memory/__pycache__/memory_service.cpython-311.opt-2.pyc +0 -0
- sage/middleware/services/memory/__pycache__/memory_service.cpython-311.pyc +0 -0
- sage/middleware/services/memory/examples/__pycache__/dag_microservices_demo.cpython-311.opt-2.pyc +0 -0
- sage/middleware/services/memory/examples/__pycache__/dag_microservices_demo.cpython-311.pyc +0 -0
- sage/middleware/services/memory/examples/__pycache__/memory_demo.cpython-311.opt-2.pyc +0 -0
- sage/middleware/services/memory/examples/__pycache__/memory_demo.cpython-311.pyc +0 -0
- sage/middleware/services/memory/examples/dag_microservices_demo.py +220 -0
- sage/middleware/services/memory/examples/memory_demo.py +490 -0
- sage/middleware/services/memory/memory_collection/__pycache__/base_collection.cpython-311.opt-2.pyc +0 -0
- sage/middleware/services/memory/memory_collection/__pycache__/base_collection.cpython-311.pyc +0 -0
- sage/middleware/services/memory/memory_collection/__pycache__/graph_collection.cpython-311.opt-2.pyc +0 -0
- sage/middleware/services/memory/memory_collection/__pycache__/graph_collection.cpython-311.pyc +0 -0
- sage/middleware/services/memory/memory_collection/__pycache__/kv_collection.cpython-311.opt-2.pyc +0 -0
- sage/middleware/services/memory/memory_collection/__pycache__/kv_collection.cpython-311.pyc +0 -0
- sage/middleware/services/memory/memory_collection/__pycache__/vdb_collection.cpython-311.opt-2.pyc +0 -0
- sage/middleware/services/memory/memory_collection/__pycache__/vdb_collection.cpython-311.pyc +0 -0
- sage/middleware/services/memory/memory_collection/base_collection.py +0 -0
- sage/middleware/services/memory/memory_collection/graph_collection.py +0 -0
- sage/middleware/services/memory/memory_collection/kv_collection.py +0 -0
- sage/middleware/services/memory/memory_collection/vdb_collection.py +0 -0
- sage/middleware/services/memory/memory_service.py +474 -0
- sage/middleware/services/memory/utils/__init__.py +0 -0
- sage/middleware/services/memory/utils/__pycache__/__init__.cpython-311.opt-2.pyc +0 -0
- sage/middleware/services/memory/utils/__pycache__/__init__.cpython-311.pyc +0 -0
- sage/middleware/services/memory/utils/__pycache__/path_utils.cpython-311.opt-2.pyc +0 -0
- sage/middleware/services/memory/utils/__pycache__/path_utils.cpython-311.pyc +0 -0
- sage/middleware/services/memory/utils/path_utils.py +0 -0
- sage/middleware/services/vdb/__init__.py +8 -0
- sage/middleware/services/vdb/__pycache__/__init__.cpython-311.opt-2.pyc +0 -0
- sage/middleware/services/vdb/__pycache__/__init__.cpython-311.pyc +0 -0
- sage/middleware/services/vdb/__pycache__/vdb_service.cpython-311.opt-2.pyc +0 -0
- sage/middleware/services/vdb/__pycache__/vdb_service.cpython-311.pyc +0 -0
- sage/middleware/services/vdb/examples/__pycache__/vdb_demo.cpython-311.opt-2.pyc +0 -0
- sage/middleware/services/vdb/examples/__pycache__/vdb_demo.cpython-311.pyc +0 -0
- sage/middleware/services/vdb/examples/vdb_demo.py +447 -0
- sage/middleware/services/vdb/search_engine/__init__.py +0 -0
- sage/middleware/services/vdb/search_engine/__pycache__/__init__.cpython-311.opt-2.pyc +0 -0
- sage/middleware/services/vdb/search_engine/__pycache__/__init__.cpython-311.pyc +0 -0
- sage/middleware/services/vdb/search_engine/__pycache__/base_vdb_index.cpython-311.opt-2.pyc +0 -0
- sage/middleware/services/vdb/search_engine/__pycache__/base_vdb_index.cpython-311.pyc +0 -0
- sage/middleware/services/vdb/search_engine/__pycache__/faiss_index.cpython-311.opt-2.pyc +0 -0
- sage/middleware/services/vdb/search_engine/__pycache__/faiss_index.cpython-311.pyc +0 -0
- sage/middleware/services/vdb/search_engine/base_vdb_index.py +58 -0
- sage/middleware/services/vdb/search_engine/faiss_index.py +461 -0
- sage/middleware/services/vdb/vdb_service.py +433 -0
- sage/middleware/utils/__init__.py +5 -0
- sage/middleware/utils/__pycache__/__init__.cpython-311.opt-2.pyc +0 -0
- sage/middleware/utils/__pycache__/__init__.cpython-311.pyc +0 -0
- sage/middleware/utils/embedding/__init__.py +35 -0
- sage/middleware/utils/embedding/__pycache__/__init__.cpython-311.opt-2.pyc +0 -0
- sage/middleware/utils/embedding/__pycache__/__init__.cpython-311.pyc +0 -0
- sage/middleware/utils/embedding/__pycache__/_cohere.cpython-311.opt-2.pyc +0 -0
- sage/middleware/utils/embedding/__pycache__/_cohere.cpython-311.pyc +0 -0
- sage/middleware/utils/embedding/__pycache__/bedrock.cpython-311.opt-2.pyc +0 -0
- sage/middleware/utils/embedding/__pycache__/bedrock.cpython-311.pyc +0 -0
- sage/middleware/utils/embedding/__pycache__/embedding_api.cpython-311.opt-2.pyc +0 -0
- sage/middleware/utils/embedding/__pycache__/embedding_api.cpython-311.pyc +0 -0
- sage/middleware/utils/embedding/__pycache__/embedding_model.cpython-311.opt-2.pyc +0 -0
- sage/middleware/utils/embedding/__pycache__/embedding_model.cpython-311.pyc +0 -0
- sage/middleware/utils/embedding/__pycache__/hf.cpython-311.opt-2.pyc +0 -0
- sage/middleware/utils/embedding/__pycache__/hf.cpython-311.pyc +0 -0
- sage/middleware/utils/embedding/__pycache__/instructor.cpython-311.opt-2.pyc +0 -0
- sage/middleware/utils/embedding/__pycache__/instructor.cpython-311.pyc +0 -0
- sage/middleware/utils/embedding/__pycache__/jina.cpython-311.opt-2.pyc +0 -0
- sage/middleware/utils/embedding/__pycache__/jina.cpython-311.pyc +0 -0
- sage/middleware/utils/embedding/__pycache__/lollms.cpython-311.opt-2.pyc +0 -0
- sage/middleware/utils/embedding/__pycache__/lollms.cpython-311.pyc +0 -0
- sage/middleware/utils/embedding/__pycache__/mockembedder.cpython-311.opt-2.pyc +0 -0
- sage/middleware/utils/embedding/__pycache__/mockembedder.cpython-311.pyc +0 -0
- sage/middleware/utils/embedding/__pycache__/nvidia_openai.cpython-311.opt-2.pyc +0 -0
- sage/middleware/utils/embedding/__pycache__/nvidia_openai.cpython-311.pyc +0 -0
- sage/middleware/utils/embedding/__pycache__/ollama.cpython-311.opt-2.pyc +0 -0
- sage/middleware/utils/embedding/__pycache__/ollama.cpython-311.pyc +0 -0
- sage/middleware/utils/embedding/__pycache__/openai.cpython-311.opt-2.pyc +0 -0
- sage/middleware/utils/embedding/__pycache__/openai.cpython-311.pyc +0 -0
- sage/middleware/utils/embedding/__pycache__/siliconcloud.cpython-311.opt-2.pyc +0 -0
- sage/middleware/utils/embedding/__pycache__/siliconcloud.cpython-311.pyc +0 -0
- sage/middleware/utils/embedding/__pycache__/zhipu.cpython-311.opt-2.pyc +0 -0
- sage/middleware/utils/embedding/__pycache__/zhipu.cpython-311.pyc +0 -0
- sage/middleware/utils/embedding/_cohere.py +68 -0
- sage/middleware/utils/embedding/bedrock.py +174 -0
- sage/middleware/utils/embedding/embedding_api.py +12 -0
- sage/middleware/utils/embedding/embedding_model.py +150 -0
- sage/middleware/utils/embedding/hf.py +90 -0
- sage/middleware/utils/embedding/instructor.py +10 -0
- sage/middleware/utils/embedding/jina.py +115 -0
- sage/middleware/utils/embedding/lollms.py +100 -0
- sage/middleware/utils/embedding/mockembedder.py +46 -0
- sage/middleware/utils/embedding/nvidia_openai.py +97 -0
- sage/middleware/utils/embedding/ollama.py +97 -0
- sage/middleware/utils/embedding/openai.py +112 -0
- sage/middleware/utils/embedding/siliconcloud.py +133 -0
- sage/middleware/utils/embedding/zhipu.py +85 -0
|
@@ -0,0 +1,238 @@
|
|
|
1
|
+
# file sage/middleware/services/kv/search_engine/bm25s_index.py
|
|
2
|
+
# python -m sage.middleware.services.kv.search_engine.bm25s_index
|
|
3
|
+
|
|
4
|
+
import os
|
|
5
|
+
import bm25s
|
|
6
|
+
import shutil
|
|
7
|
+
import Stemmer
|
|
8
|
+
from typing import List, Optional, Dict, Any
|
|
9
|
+
|
|
10
|
+
from sage.middleware.services.kv.search_engine.base_kv_index import BaseKVIndex
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class BM25sIndex(BaseKVIndex):
    """Keyword index over parallel ``ids`` / ``texts`` lists, scored with ``bm25s``.

    The tokenizer, token lists, and BM25 model are rebuilt from scratch after
    every insert, delete, or update.
    """

    def __init__(
        self,
        name: str,
        texts: Optional[List[str]] = None,
        ids: Optional[List[str]] = None,
        load_path: Optional[str] = None,
    ):
        """
        Initialize BM25sIndex.

        Supports two initialization modes: create a new index from ``texts``
        and ``ids``, or load an existing index from the directory ``load_path``.
        ``load_path`` takes precedence when both are supplied.

        Raises:
            ValueError: if ``texts`` and ``ids`` differ in length, or if neither
                ``texts``+``ids`` nor ``load_path`` is provided.
        """
        self.name = name
        self.ids: List[str] = []
        self.texts: List[str] = []
        self.tokens: List[List[str]] = []
        self.tokenizer = None
        self.bm25 = None

        if load_path is not None:
            self._load(load_path)
        elif texts is not None and ids is not None:
            # Raise instead of assert: asserts are stripped under ``python -O``.
            if len(texts) != len(ids):
                raise ValueError("texts and ids must have the same length.")
            self.ids = list(ids)
            self.texts = list(texts)
            self._rebuild()
        else:
            raise ValueError("Must provide either texts+ids or load_path.")

    def _get_tokenizer(self, texts: List[str]):
        """
        Select a tokenizer (Chinese or English) for ``texts``.

        Only the first text is inspected; an empty list falls back to the
        English tokenizer (stop words plus an English stemmer).
        """
        zh_flag = bool(texts) and self._is_chinese(texts[0])
        if zh_flag:
            return bm25s.tokenization.Tokenizer(stopwords='zh')
        stemmer = Stemmer.Stemmer("english")
        return bm25s.tokenization.Tokenizer(stopwords='en', stemmer=stemmer)

    def _rebuild(self):
        """
        Rebuild the tokenizer, tokens, and BM25 index from ``self.texts``.

        With no texts left there is nothing to index, so the model is dropped
        and ``search`` returns an empty list until a document is inserted.
        """
        if not self.texts:
            self.tokenizer = None
            self.tokens = []
            self.bm25 = None
            return
        self.tokenizer = self._get_tokenizer(self.texts)
        self.tokens = self.tokenizer.tokenize(self.texts)  # type: ignore
        self.bm25 = bm25s.BM25(corpus=self.texts, backend="numba")
        self.bm25.index(self.tokens)

    def _is_chinese(self, text: str) -> bool:
        """
        Return True if ``text`` contains a character in U+4E00..U+9FFF.
        """
        return any('\u4e00' <= ch <= '\u9fff' for ch in text)

    def insert(self, text: str, doc_id: str) -> None:
        """
        Append a new text and its doc_id, then rebuild the index.
        """
        self.texts.append(text)
        self.ids.append(doc_id)
        self._rebuild()

    def delete(self, id: str) -> None:
        """
        Delete the text stored under ``id`` and rebuild the index.

        Unknown ids are ignored. Deleting the last document leaves an empty
        index rather than raising.
        """
        try:
            idx = self.ids.index(id)
        except ValueError:
            return
        self.ids.pop(idx)
        self.texts.pop(idx)
        self._rebuild()

    def update(self, id: str, new_text: str) -> None:
        """
        Replace the text stored under ``id`` and rebuild the index.

        Unknown ids are ignored.
        """
        try:
            idx = self.ids.index(id)
        except ValueError:
            return
        self.texts[idx] = new_text
        self._rebuild()

    def search(self, text: str, topk: int = 5) -> List[str]:
        """
        Return up to ``topk`` ids ordered by descending BM25 score for ``text``.

        Scores are not thresholded, so documents that do not match the query
        can still be returned when fewer than ``topk`` documents match.
        An empty index or a non-positive ``topk`` yields an empty list.
        """
        if self.bm25 is None or self.tokenizer is None or not self.ids or topk <= 0:
            return []
        query_token = self.tokenizer.tokenize([text])[0]  # type: ignore
        scores = self.bm25.get_scores(query_token)  # type: ignore
        topk_idx = sorted(range(len(scores)), key=lambda i: -scores[i])[:topk]
        return [self.ids[i] for i in topk_idx]

    def store(self, dir_path: str) -> Dict[str, Any]:
        """
        Store the index in ``dir_path``: the bm25 model, the tokenizer vocab and
        stop words, plus ``ids.txt`` and ``texts.txt`` (one entry per line).

        Newlines inside a text are replaced by spaces so each text stays on
        one line.

        Returns:
            ``{"index_path": dir_path}``

        Raises:
            ValueError: if the index is empty (there is no model to save).
        """
        if self.bm25 is None or self.tokenizer is None:
            raise ValueError("Cannot store an empty index.")

        os.makedirs(dir_path, exist_ok=True)

        # The model is saved with string vocabulary keys (presumably because
        # bm25s serializes the vocabulary as JSON -- confirm). Restore the
        # original mapping afterwards so the in-memory index keeps the keys
        # it was built with and remains searchable after a save.
        live_vocab = self.bm25.vocab_dict  # type: ignore
        self.bm25.vocab_dict = {str(k): v for k, v in live_vocab.items()}  # type: ignore
        try:
            self.bm25.save(dir_path, corpus=None)  # type: ignore
        finally:
            self.bm25.vocab_dict = live_vocab  # type: ignore

        self.tokenizer.save_vocab(dir_path)  # type: ignore
        self.tokenizer.save_stopwords(dir_path)  # type: ignore
        with open(os.path.join(dir_path, "ids.txt"), "w", encoding="utf-8") as f:
            f.writelines(f"{i}\n" for i in self.ids)
        with open(os.path.join(dir_path, "texts.txt"), "w", encoding="utf-8") as f:
            f.writelines(t.replace("\n", " ") + "\n" for t in self.texts)
        return {"index_path": dir_path}

    def _load(self, dir_path: str):
        """
        Load the bm25 model, tokenizer data, ids, and texts from ``dir_path``,
        then re-tokenize the texts and re-index them.

        Raises:
            ValueError: if ``ids.txt`` and ``texts.txt`` have different line counts.
        """
        self.bm25 = bm25s.BM25.load(dir_path)

        # Strip only the line terminator so ids/texts with leading or
        # trailing spaces round-trip unchanged.
        with open(os.path.join(dir_path, "ids.txt"), "r", encoding="utf-8") as f:
            self.ids = [line.rstrip("\n") for line in f]
        with open(os.path.join(dir_path, "texts.txt"), "r", encoding="utf-8") as f:
            self.texts = [line.rstrip("\n") for line in f]
        if len(self.ids) != len(self.texts):
            raise ValueError(
                f"Corrupt index in {dir_path}: {len(self.ids)} ids but {len(self.texts)} texts."
            )

        # Build the tokenizer the same way a fresh index does (including the
        # English stemmer) so a reloaded index tokenizes like the saved one,
        # then overlay the persisted vocab and stop words.
        self.tokenizer = self._get_tokenizer(self.texts)
        self.tokenizer.load_vocab(dir_path)
        self.tokenizer.load_stopwords(dir_path)

        self.tokens = [self.tokenizer.tokenize([t], return_as="tuple")[0][0] for t in self.texts]  # type: ignore
        self.bm25.index(self.tokens)

    @classmethod
    def load(cls, name: str, dir_path: str) -> "BM25sIndex":
        """
        Create a BM25sIndex called ``name`` from the data stored in ``dir_path``.
        """
        return cls(name=name, load_path=dir_path)

    @staticmethod
    def clear(dir_path: str):
        """
        Remove the directory ``dir_path`` and everything in it.

        Best-effort: a missing directory or any other failure is printed
        rather than raised.
        """
        try:
            shutil.rmtree(dir_path)
            print(f"Cleared: {dir_path}")
        except FileNotFoundError:
            print(f"Directory does not exist, nothing to clear: {dir_path}")
        except Exception as e:
            print(f"Failed to clear: {e}")
|
|
178
|
+
|
|
179
|
+
|
|
180
|
+
if __name__ == "__main__":
    # Manual smoke test: build, mutate, persist, reload, and clean up an index.
    ids = ["a", "b", "c"]
    texts = [
        "The quick brown fox jumps over the lazy dog.",
        "Hello world! This is a operator_test document.",
        "Python is a great programming language."
    ]
    root_path = "./tmp_bm25_test"  # scratch directory so clear() cannot remove real data
    index_dir = root_path
    index_name = "demo"

    # 1. Build the index and run the first queries
    print("\n== 初始化并首检 ==")
    index = BM25sIndex(name=index_name, texts=texts, ids=ids)
    for query in ("Python", "hello", "fox"):
        print(f"初始检索 '{query}':", index.search(query))

    # 2. Insert a document, then query again
    print("\n== 插入新文档 ==")
    index.insert("This document mentions python and fox together.", "d")
    for query in ("python", "fox"):
        print(f"插入后检索 '{query}':", index.search(query))

    # 3. Delete a document, then query again
    print("\n== 删除文档 ==")
    index.delete("b")
    for query in ("hello", "operator_test", "python"):
        print(f"删除 'b' 后检索 '{query}':", index.search(query))

    # 4. Update a document, then query again
    print("\n== 更新文档 ==")
    index.update("c", "Hello world! Now c document talks about foxes.")
    for query in ("fox", "python"):
        print(f"更新 'c' 后检索 '{query}':", index.search(query))

    # 5. Persist the index to disk
    print("\n== 保存索引到磁盘 ==")
    store_info = index.store(index_dir)
    print("索引保存路径:", store_info["index_path"])

    # 6. Reload and query only after the user confirms with 'yes'
    print("\n== 测试持久化(请手动输入 yes 继续)==")
    user_input = input("输入 'yes' 以继续测试 load 并检索:")
    confirmed = user_input.strip().lower() == "yes"
    if not confirmed:
        print("用户未输入 'yes',测试提前结束。")
    else:
        index_loaded = BM25sIndex.load(name=index_name, dir_path=index_dir)
        for query in ("fox", "python", "hello"):
            print(f"持久化load后检索 '{query}':", index_loaded.search(query))
        print("ids序列:", index_loaded.ids)

    # 7. Remove the scratch directory
    print("\n== 清理测试目录 ==")
    BM25sIndex.clear(index_dir)
|
|
238
|
+
|
|
Binary file
|
sage/middleware/services/memory/examples/__pycache__/dag_microservices_demo.cpython-311.opt-2.pyc
ADDED
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
@@ -0,0 +1,220 @@
|
|
|
1
|
+
"""
|
|
2
|
+
SAGE 微服务架构使用示例
|
|
3
|
+
展示如何在应用程序中注册和使用KV、VDB、Memory服务
|
|
4
|
+
"""
|
|
5
|
+
import asyncio
|
|
6
|
+
import time
|
|
7
|
+
from typing import List
|
|
8
|
+
import numpy as np
|
|
9
|
+
|
|
10
|
+
# 导入SAGE环境和服务
|
|
11
|
+
from sage.core.api.local_environment import LocalEnvironment
|
|
12
|
+
from sage.middleware.services import (
|
|
13
|
+
MemoryService,
|
|
14
|
+
create_kv_service_factory,
|
|
15
|
+
create_vdb_service_factory,
|
|
16
|
+
create_memory_service_factory
|
|
17
|
+
)
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
class SampleApplication:
    """Sample application showing how services are registered and used."""

    def __init__(self):
        # Environment that the services and the demo stream are attached to.
        self.env = LocalEnvironment("microservices_demo", {})

    def setup_services(self):
        """Register the KV, VDB, and Memory services with the environment."""
        print("🔧 注册微服务到SAGE环境...")

        def _register(service_name, factory):
            # Every service is registered as (name, service class, factory).
            self.env.register_service(service_name, factory.service_class, factory)

        # KV service: in-memory backend, entries expire after one hour.
        kv_svc_factory = create_kv_service_factory(
            service_name="kv_service",
            backend_type="memory",
            max_size=1000,
            ttl_seconds=3600,
        )
        _register("kv_service", kv_svc_factory)

        # VDB service: 384-dimensional vectors persisted under ./demo_vectors.
        vdb_svc_factory = create_vdb_service_factory(
            service_name="vdb_service",
            collection_name="demo_vectors",
            dimension=384,
            persist_directory="./demo_vectors",
        )
        _register("vdb_service", vdb_svc_factory)

        # Memory service: refers to the two services above by name.
        memory_svc_factory = create_memory_service_factory(
            service_name="memory_service",
            kv_service_name="kv_service",
            vdb_service_name="vdb_service",
        )
        _register("memory_service", memory_svc_factory)

        print("✅ 所有服务已注册")

    def run_demo(self):
        """Register the services, push sample records through a map step, and print the results."""
        print("🚀 启动微服务演示")
        print("=" * 50)

        self.setup_services()

        # Sample conversation records used as the in-memory source.
        conversations = [
            {"id": 1, "content": "用户询问了关于Python的问题", "session": "session_1"},
            {"id": 2, "content": "AI助手回答了Python基础知识", "session": "session_1"},
            {"id": 3, "content": "用户请求更多代码示例", "session": "session_1"},
            {"id": 4, "content": "讨论了机器学习算法", "session": "session_2"},
            {"id": 5, "content": "解释了神经网络原理", "session": "session_2"},
        ]
        data_stream = self.env.from_memory_source(conversations)

        def process_conversation(data):
            """Handle one record; service calls are mocked in this demo."""
            snippet = data['content'][:30]
            print(f"处理对话: {snippet}...")

            # Stand-in for an embedding call: a random 384-dimensional vector.
            content_vector = np.random.random(384).tolist()

            # Inside a real SAGE function the record would be written to the
            # KV service with self.call_service["kv_service"].put(...) and
            # stored with self.call_service["memory_service"].store_memory(...);
            # here only a mock memory id is returned.
            outcome = {
                "processed": True,
                "memory_id": f"mock_memory_{data['id']}",
                "vector_dim": len(content_vector),
            }
            return outcome

        processed_stream = data_stream.map(process_conversation)

        # Execute the stream and collect the per-record results.
        print("\n📊 处理结果:")
        results = processed_stream.collect()

        for position, item in enumerate(results, 1):
            print(f" {position}. ✅ 已处理 - Memory ID: {item['memory_id']}")

        print(f"\n🎯 总共处理了 {len(results)} 条对话记录")

        self.show_service_usage_concept()

    def show_service_usage_concept(self):
        """Print example code and the key concepts for calling services."""
        divider = "=" * 50
        print("\n" + divider)
        print("💡 在SAGE函数中使用服务的示例代码:")
        print(divider)

        example_code = '''
# 在SAGE Function中使用微服务的示例

class ConversationProcessor(BaseFunction):
    """对话处理函数"""

    def process(self, conversation_data):
        session_id = conversation_data['session_id']
        content = conversation_data['content']

        # 1. 调用KV服务存储原始数据
        kv_success = self.call_service["kv_service"].put(
            f"raw:{session_id}",
            conversation_data
        )

        # 2. 生成向量表示(假设有embedding服务)
        vector = self.call_service["embedding_service"].encode(content)

        # 3. 调用Memory服务存储记忆
        memory_id = self.call_service["memory_service"].store_memory(
            session_id=session_id,
            content=content,
            vector=vector,
            memory_type="conversation"
        )

        # 4. 搜索相关历史记忆
        related_memories = self.call_service["memory_service"].search_memories(
            query_vector=vector,
            session_id=session_id,
            limit=5
        )

        return {
            "memory_id": memory_id,
            "related_count": len(related_memories),
            "kv_stored": kv_success
        }

# 在DAG中注册和使用
def create_conversation_dag():
    env = LocalEnvironment("conversation_app", {})

    # 注册微服务
    env.register_service("kv_service", KVService, create_kv_service_factory())
    env.register_service("vdb_service", VDBService, create_vdb_service_factory())
    env.register_service("memory_service", MemoryService, create_memory_service_factory())

    # 创建数据流
    stream = env.from_kafka_source(...)

    # 应用处理函数(自动访问服务)
    processed = stream.map(ConversationProcessor())

    return processed
'''

        print(example_code)
        print("\n" + divider)
        print("🔍 关键概念:")
        key_concepts = (
            "1. 服务作为Service Tasks在DAG中运行",
            "2. 函数通过 self.call_service[service_name] 调用服务",
            "3. 服务可以是本地任务或Ray分布式任务",
            "4. 应用程序控制服务的生命周期",
            "5. 服务间通过SAGE的队列机制通信",
        )
        for concept in key_concepts:
            print(concept)
|
|
198
|
+
|
|
199
|
+
|
|
200
|
+
def main():
    """Run the sample application and print how the demo ended."""
    demo_app = SampleApplication()

    try:
        demo_app.run_demo()
        closing_lines = (
            "\n✅ 演示完成!",
            "\n📖 查看更多信息:",
            " - 微服务代码: packages/sage-middleware/src/sage/service/",
            " - 使用指南: packages/sage-middleware/MICROSERVICES_GUIDE.md",
        )
        for closing_line in closing_lines:
            print(closing_line)

    except KeyboardInterrupt:
        # Ctrl-C ends the demo without a traceback.
        print("\n\n👋 演示被中断")
    except Exception as exc:
        # Top-level boundary: report the failure and print the full traceback.
        print(f"\n❌ 演示出错: {exc}")
        import traceback
        traceback.print_exc()
|
|
217
|
+
|
|
218
|
+
|
|
219
|
+
# Run the demo only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()
|