flowllm-0.1.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- flowllm-0.1.0.dist-info/METADATA +597 -0
- flowllm-0.1.0.dist-info/RECORD +66 -0
- flowllm-0.1.0.dist-info/WHEEL +5 -0
- flowllm-0.1.0.dist-info/entry_points.txt +3 -0
- flowllm-0.1.0.dist-info/licenses/LICENSE +201 -0
- flowllm-0.1.0.dist-info/top_level.txt +1 -0
- llmflow/__init__.py +0 -0
- llmflow/app.py +53 -0
- llmflow/config/__init__.py +0 -0
- llmflow/config/config_parser.py +80 -0
- llmflow/config/mock_config.yaml +58 -0
- llmflow/embedding_model/__init__.py +5 -0
- llmflow/embedding_model/base_embedding_model.py +104 -0
- llmflow/embedding_model/openai_compatible_embedding_model.py +95 -0
- llmflow/enumeration/__init__.py +0 -0
- llmflow/enumeration/agent_state.py +8 -0
- llmflow/enumeration/chunk_enum.py +9 -0
- llmflow/enumeration/http_enum.py +9 -0
- llmflow/enumeration/role.py +8 -0
- llmflow/llm/__init__.py +5 -0
- llmflow/llm/base_llm.py +138 -0
- llmflow/llm/openai_compatible_llm.py +283 -0
- llmflow/mcp_server.py +110 -0
- llmflow/op/__init__.py +10 -0
- llmflow/op/base_op.py +125 -0
- llmflow/op/mock_op.py +40 -0
- llmflow/op/prompt_mixin.py +74 -0
- llmflow/op/react/__init__.py +0 -0
- llmflow/op/react/react_v1_op.py +88 -0
- llmflow/op/react/react_v1_prompt.yaml +28 -0
- llmflow/op/vector_store/__init__.py +13 -0
- llmflow/op/vector_store/recall_vector_store_op.py +48 -0
- llmflow/op/vector_store/update_vector_store_op.py +28 -0
- llmflow/op/vector_store/vector_store_action_op.py +46 -0
- llmflow/pipeline/__init__.py +0 -0
- llmflow/pipeline/pipeline.py +94 -0
- llmflow/pipeline/pipeline_context.py +37 -0
- llmflow/schema/__init__.py +0 -0
- llmflow/schema/app_config.py +69 -0
- llmflow/schema/experience.py +144 -0
- llmflow/schema/message.py +68 -0
- llmflow/schema/request.py +32 -0
- llmflow/schema/response.py +29 -0
- llmflow/schema/vector_node.py +11 -0
- llmflow/service/__init__.py +0 -0
- llmflow/service/llmflow_service.py +96 -0
- llmflow/tool/__init__.py +9 -0
- llmflow/tool/base_tool.py +80 -0
- llmflow/tool/code_tool.py +43 -0
- llmflow/tool/dashscope_search_tool.py +162 -0
- llmflow/tool/mcp_tool.py +77 -0
- llmflow/tool/tavily_search_tool.py +109 -0
- llmflow/tool/terminate_tool.py +23 -0
- llmflow/utils/__init__.py +0 -0
- llmflow/utils/common_utils.py +17 -0
- llmflow/utils/file_handler.py +25 -0
- llmflow/utils/http_client.py +156 -0
- llmflow/utils/op_utils.py +102 -0
- llmflow/utils/registry.py +33 -0
- llmflow/utils/singleton.py +9 -0
- llmflow/utils/timer.py +53 -0
- llmflow/vector_store/__init__.py +7 -0
- llmflow/vector_store/base_vector_store.py +136 -0
- llmflow/vector_store/chroma_vector_store.py +188 -0
- llmflow/vector_store/es_vector_store.py +227 -0
- llmflow/vector_store/file_vector_store.py +163 -0
`llmflow/vector_store/base_vector_store.py` (new file, +136 lines):

```python
import fcntl
import json
from abc import ABC
from pathlib import Path
from typing import List, Iterable

from loguru import logger
from pydantic import BaseModel, Field
from tqdm import tqdm

from llmflow.embedding_model.base_embedding_model import BaseEmbeddingModel
from llmflow.schema.vector_node import VectorNode


class BaseVectorStore(BaseModel, ABC):
    embedding_model: BaseEmbeddingModel | None = Field(default=None)
    batch_size: int = Field(default=1024)

    @staticmethod
    def _load_from_path(workspace_id: str, path: str | Path, callback_fn=None, **kwargs) -> Iterable[VectorNode]:
        """Lazily yield nodes from <path>/<workspace_id>.jsonl."""
        workspace_path = Path(path) / f"{workspace_id}.jsonl"
        if not workspace_path.exists():
            logger.warning(f"workspace_path={workspace_path} does not exist!")
            return

        with workspace_path.open() as f:
            fcntl.flock(f, fcntl.LOCK_SH)  # shared lock: concurrent readers allowed
            try:
                for line in tqdm(f, desc="load from path"):
                    if line.strip():
                        node_dict = json.loads(line.strip())
                        if callback_fn:
                            node = callback_fn(node_dict)
                        else:
                            node = VectorNode(**node_dict, **kwargs)
                        node.workspace_id = workspace_id
                        yield node
            finally:
                fcntl.flock(f, fcntl.LOCK_UN)

    @staticmethod
    def _dump_to_path(nodes: Iterable[VectorNode], workspace_id: str, path: str | Path = "", callback_fn=None,
                      ensure_ascii: bool = False, **kwargs):
        """Write nodes to <path>/<workspace_id>.jsonl, one JSON object per line."""
        dump_path: Path = Path(path)
        dump_path.mkdir(parents=True, exist_ok=True)
        dump_file = dump_path / f"{workspace_id}.jsonl"

        count = 0
        with dump_file.open("w") as f:
            fcntl.flock(f, fcntl.LOCK_EX)  # exclusive lock: single writer
            try:
                for node in tqdm(nodes, desc="dump to path"):
                    node.workspace_id = workspace_id
                    if callback_fn:
                        node_dict = callback_fn(node)
                    else:
                        node_dict = node.model_dump()
                    assert isinstance(node_dict, dict)
                    f.write(json.dumps(node_dict, ensure_ascii=ensure_ascii, **kwargs))
                    f.write("\n")
                    count += 1

                return {"size": count}
            finally:
                fcntl.flock(f, fcntl.LOCK_UN)

    def exist_workspace(self, workspace_id: str, **kwargs) -> bool:
        raise NotImplementedError

    def delete_workspace(self, workspace_id: str, **kwargs):
        raise NotImplementedError

    def create_workspace(self, workspace_id: str, **kwargs):
        raise NotImplementedError

    def _iter_workspace_nodes(self, workspace_id: str, **kwargs) -> Iterable[VectorNode]:
        raise NotImplementedError

    def dump_workspace(self, workspace_id: str, path: str | Path = "", callback_fn=None, **kwargs):
        if not self.exist_workspace(workspace_id=workspace_id, **kwargs):
            logger.warning(f"workspace_id={workspace_id} does not exist!")
            return {}

        return self._dump_to_path(nodes=self._iter_workspace_nodes(workspace_id=workspace_id, **kwargs),
                                  workspace_id=workspace_id,
                                  path=path,
                                  callback_fn=callback_fn,
                                  **kwargs)

    def load_workspace(self, workspace_id: str, path: str | Path = "", nodes: List[VectorNode] = None,
                       callback_fn=None, **kwargs):
        # Loading replaces the workspace: drop it if it exists, then recreate it.
        if self.exist_workspace(workspace_id, **kwargs):
            self.delete_workspace(workspace_id=workspace_id, **kwargs)
            logger.info(f"delete workspace_id={workspace_id}")

        self.create_workspace(workspace_id=workspace_id, **kwargs)

        all_nodes: List[VectorNode] = []
        if nodes:
            all_nodes.extend(nodes)
        for node in self._load_from_path(path=path, workspace_id=workspace_id, callback_fn=callback_fn, **kwargs):
            all_nodes.append(node)
        self.insert(nodes=all_nodes, workspace_id=workspace_id, **kwargs)
        return {"size": len(all_nodes)}

    def copy_workspace(self, src_workspace_id: str, dest_workspace_id: str, **kwargs):
        if not self.exist_workspace(workspace_id=src_workspace_id, **kwargs):
            logger.warning(f"src_workspace_id={src_workspace_id} does not exist!")
            return {}

        if not self.exist_workspace(dest_workspace_id, **kwargs):
            self.create_workspace(workspace_id=dest_workspace_id, **kwargs)

        # Stream nodes across in batches of batch_size to bound memory use.
        nodes = []
        node_size = 0
        for node in self._iter_workspace_nodes(workspace_id=src_workspace_id, **kwargs):
            nodes.append(node)
            node_size += 1
            if len(nodes) >= self.batch_size:
                self.insert(nodes=nodes, workspace_id=dest_workspace_id, **kwargs)
                nodes.clear()

        if nodes:
            self.insert(nodes=nodes, workspace_id=dest_workspace_id, **kwargs)
        return {"size": node_size}

    def search(self, query: str, workspace_id: str, top_k: int = 1, **kwargs) -> List[VectorNode]:
        raise NotImplementedError

    def insert(self, nodes: VectorNode | List[VectorNode], workspace_id: str, **kwargs):
        raise NotImplementedError

    def delete(self, node_ids: str | List[str], workspace_id: str, **kwargs):
        raise NotImplementedError
```
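The base class follows the template-method pattern: `dump_workspace`, `load_workspace`, and `copy_workspace` are written once against the abstract hooks (`exist_workspace`, `create_workspace`, `delete_workspace`, `_iter_workspace_nodes`, `insert`), so a new backend only fills in those hooks. A minimal in-memory sketch, hypothetical and not part of the package, showing what a conforming subclass looks like:

```python
from typing import Iterable, List

from pydantic import Field

from llmflow.schema.vector_node import VectorNode
from llmflow.vector_store.base_vector_store import BaseVectorStore


class InMemoryVectorStore(BaseVectorStore):
    """Hypothetical toy backend: {workspace_id: {unique_id: node}}."""
    workspaces: dict = Field(default_factory=dict)

    def exist_workspace(self, workspace_id: str, **kwargs) -> bool:
        return workspace_id in self.workspaces

    def create_workspace(self, workspace_id: str, **kwargs):
        self.workspaces.setdefault(workspace_id, {})

    def delete_workspace(self, workspace_id: str, **kwargs):
        self.workspaces.pop(workspace_id, None)

    def _iter_workspace_nodes(self, workspace_id: str, **kwargs) -> Iterable[VectorNode]:
        yield from self.workspaces.get(workspace_id, {}).values()

    def insert(self, nodes: VectorNode | List[VectorNode], workspace_id: str, **kwargs):
        if isinstance(nodes, VectorNode):
            nodes = [nodes]
        bucket = self.workspaces.setdefault(workspace_id, {})
        for node in nodes:
            bucket[node.unique_id] = node


# With only those hooks implemented, the inherited JSONL round-trip works:
#   store = InMemoryVectorStore()
#   store.load_workspace("ws1", path="./backup")    # reads ./backup/ws1.jsonl
#   store.dump_workspace("ws1", path="./backup2")   # writes ./backup2/ws1.jsonl
```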
`llmflow/vector_store/chroma_vector_store.py` (new file, +188 lines):

```python
from typing import List, Iterable

import chromadb
from chromadb import Collection
from chromadb.config import Settings
from loguru import logger
from pydantic import Field, PrivateAttr, model_validator

from llmflow.embedding_model.openai_compatible_embedding_model import OpenAICompatibleEmbeddingModel
from llmflow.schema.vector_node import VectorNode
from llmflow.vector_store import VECTOR_STORE_REGISTRY
from llmflow.vector_store.base_vector_store import BaseVectorStore


@VECTOR_STORE_REGISTRY.register("chroma")
class ChromaVectorStore(BaseVectorStore):
    store_dir: str = Field(default="./chroma_vector_store")
    collections: dict = Field(default_factory=dict)  # cache: workspace_id -> Collection
    _client: chromadb.Client = PrivateAttr()

    @model_validator(mode="after")
    def init_client(self):
        self._client = chromadb.Client(Settings(persist_directory=self.store_dir))
        return self

    def _get_collection(self, workspace_id: str) -> Collection:
        if workspace_id not in self.collections:
            self.collections[workspace_id] = self._client.get_or_create_collection(workspace_id)
        return self.collections[workspace_id]

    def exist_workspace(self, workspace_id: str, **kwargs) -> bool:
        return workspace_id in [c.name for c in self._client.list_collections()]

    def delete_workspace(self, workspace_id: str, **kwargs):
        self._client.delete_collection(workspace_id)
        if workspace_id in self.collections:
            del self.collections[workspace_id]

    def create_workspace(self, workspace_id: str, **kwargs):
        self.collections[workspace_id] = self._client.get_or_create_collection(workspace_id)

    def _iter_workspace_nodes(self, workspace_id: str, **kwargs) -> Iterable[VectorNode]:
        collection: Collection = self._get_collection(workspace_id)
        results = collection.get()
        for i in range(len(results["ids"])):
            yield VectorNode(workspace_id=workspace_id,
                             unique_id=results["ids"][i],
                             content=results["documents"][i],
                             metadata=results["metadatas"][i])

    def search(self, query: str, workspace_id: str, top_k: int = 1, **kwargs) -> List[VectorNode]:
        if not self.exist_workspace(workspace_id=workspace_id):
            logger.warning(f"workspace_id={workspace_id} does not exist!")
            return []

        collection: Collection = self._get_collection(workspace_id)
        query_vector = self.embedding_model.get_embeddings(query)
        results = collection.query(query_embeddings=[query_vector], n_results=top_k)
        nodes = []
        # query() returns one result list per query embedding; we sent one.
        for i in range(len(results["ids"][0])):
            nodes.append(VectorNode(workspace_id=workspace_id,
                                    unique_id=results["ids"][0][i],
                                    content=results["documents"][0][i],
                                    metadata=results["metadatas"][0][i]))
        return nodes

    def insert(self, nodes: VectorNode | List[VectorNode], workspace_id: str, **kwargs):
        if not self.exist_workspace(workspace_id=workspace_id):
            self.create_workspace(workspace_id=workspace_id)

        if isinstance(nodes, VectorNode):
            nodes = [nodes]

        # Only embed nodes that do not already carry a vector.
        embedded_nodes = [node for node in nodes if node.vector]
        not_embedded_nodes = [node for node in nodes if not node.vector]
        now_embedded_nodes = self.embedding_model.get_node_embeddings(not_embedded_nodes)
        all_nodes = embedded_nodes + now_embedded_nodes

        collection: Collection = self._get_collection(workspace_id)
        collection.add(ids=[n.unique_id for n in all_nodes],
                       embeddings=[n.vector for n in all_nodes],
                       documents=[n.content for n in all_nodes],
                       metadatas=[n.metadata for n in all_nodes])

    def delete(self, node_ids: str | List[str], workspace_id: str, **kwargs):
        if not self.exist_workspace(workspace_id=workspace_id):
            logger.warning(f"workspace_id={workspace_id} does not exist!")
            return

        if isinstance(node_ids, str):
            node_ids = [node_ids]

        collection: Collection = self._get_collection(workspace_id)
        collection.delete(ids=node_ids)


def main():
    from dotenv import load_dotenv
    load_dotenv()

    embedding_model = OpenAICompatibleEmbeddingModel(dimensions=64, model_name="text-embedding-v4")
    workspace_id = "chroma_test_index"

    chroma_store = ChromaVectorStore(
        embedding_model=embedding_model,
        store_dir="./chroma_test_db"
    )

    if chroma_store.exist_workspace(workspace_id):
        chroma_store.delete_workspace(workspace_id)
    chroma_store.create_workspace(workspace_id)

    sample_nodes = [
        VectorNode(
            unique_id="node1",
            workspace_id=workspace_id,
            content="Artificial intelligence is a technology that simulates human intelligence.",
            metadata={"node_type": "n1", "category": "tech"}
        ),
        VectorNode(
            unique_id="node2",
            workspace_id=workspace_id,
            content="AI is the future of mankind.",
            metadata={"node_type": "n1", "category": "tech"}
        ),
        VectorNode(
            unique_id="node3",
            workspace_id=workspace_id,
            content="I want to eat fish!",
            metadata={"node_type": "n2", "category": "food"}
        ),
        VectorNode(
            unique_id="node4",
            workspace_id=workspace_id,
            content="The bigger the storm, the more expensive the fish.",
            metadata={"node_type": "n1", "category": "food"}
        ),
    ]

    chroma_store.insert(sample_nodes, workspace_id=workspace_id)

    logger.info("=" * 20)
    results = chroma_store.search("What is AI?", top_k=5, workspace_id=workspace_id)
    for r in results:
        logger.info(r.model_dump(exclude={"vector"}))
    logger.info("=" * 20)

    # Update node2 by deleting and re-inserting it.
    node2_update = VectorNode(
        unique_id="node2",
        workspace_id=workspace_id,
        content="AI is the future of humanity and technology.",
        metadata={"node_type": "n1", "category": "tech", "updated": True}
    )
    chroma_store.delete(node2_update.unique_id, workspace_id=workspace_id)
    chroma_store.insert(node2_update, workspace_id=workspace_id)

    logger.info("Updated Result:")
    results = chroma_store.search("fish?", top_k=10, workspace_id=workspace_id)
    for r in results:
        logger.info(r.model_dump(exclude={"vector"}))
    logger.info("=" * 20)

    chroma_store.dump_workspace(workspace_id=workspace_id)

    chroma_store.delete_workspace(workspace_id=workspace_id)


if __name__ == "__main__":
    main()
    # launch with: python -m llmflow.vector_store.chroma_vector_store
```
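The `@VECTOR_STORE_REGISTRY.register("chroma")` / `register("elasticsearch")` decorators point at a small name-to-class registry (`llmflow/utils/registry.py`, 33 lines in the file list) that lets a backend be selected from configuration by its string name. That module's body is not shown in this diff; a plausible minimal shape, under that assumption:

```python
class Registry:
    """Hypothetical sketch of a name -> class registry."""

    def __init__(self):
        self._items: dict[str, type] = {}

    def register(self, name: str):
        def decorator(cls: type) -> type:
            self._items[name] = cls  # e.g. "chroma" -> ChromaVectorStore
            return cls
        return decorator

    def __getitem__(self, name: str) -> type:
        return self._items[name]


VECTOR_STORE_REGISTRY = Registry()

# Illustrative config-driven construction:
#   store_cls = VECTOR_STORE_REGISTRY["chroma"]
#   store = store_cls(embedding_model=embedding_model, store_dir="./db")
```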
`llmflow/vector_store/es_vector_store.py` (new file, +227 lines):

```python
import os
from typing import List, Tuple, Iterable

from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk
from loguru import logger
from pydantic import Field, PrivateAttr, model_validator

from llmflow.embedding_model.openai_compatible_embedding_model import OpenAICompatibleEmbeddingModel
from llmflow.schema.vector_node import VectorNode
from llmflow.vector_store import VECTOR_STORE_REGISTRY
from llmflow.vector_store.base_vector_store import BaseVectorStore


@VECTOR_STORE_REGISTRY.register("elasticsearch")
class EsVectorStore(BaseVectorStore):
    hosts: str | List[str] = Field(default_factory=lambda: os.getenv("ES_HOSTS", "http://localhost:9200"))
    basic_auth: str | Tuple[str, str] | None = Field(default=None)
    retrieve_filters: List[dict] = Field(default_factory=list)
    _client: Elasticsearch = PrivateAttr()

    @model_validator(mode="after")
    def init_client(self):
        if isinstance(self.hosts, str):
            self.hosts = [self.hosts]
        self._client = Elasticsearch(hosts=self.hosts, basic_auth=self.basic_auth)
        return self

    def exist_workspace(self, workspace_id: str, **kwargs) -> bool:
        return self._client.indices.exists(index=workspace_id)

    def delete_workspace(self, workspace_id: str, **kwargs):
        return self._client.indices.delete(index=workspace_id, **kwargs)

    def create_workspace(self, workspace_id: str, **kwargs):
        body = {
            "mappings": {
                "properties": {
                    "workspace_id": {"type": "keyword"},
                    "content": {"type": "text"},
                    "metadata": {"type": "object"},
                    "vector": {
                        "type": "dense_vector",
                        "dims": self.embedding_model.dimensions
                    }
                }
            }
        }
        return self._client.indices.create(index=workspace_id, body=body)

    def _iter_workspace_nodes(self, workspace_id: str, max_size: int = 10000, **kwargs) -> Iterable[VectorNode]:
        response = self._client.search(index=workspace_id, body={"query": {"match_all": {}}, "size": max_size})
        for doc in response["hits"]["hits"]:
            yield self.doc2node(doc, workspace_id)

    def refresh(self, workspace_id: str):
        self._client.indices.refresh(index=workspace_id)

    @staticmethod
    def doc2node(doc, workspace_id: str) -> VectorNode:
        node = VectorNode(**doc["_source"])
        node.workspace_id = workspace_id
        node.unique_id = doc["_id"]
        if "_score" in doc:
            node.metadata["_score"] = doc["_score"] - 1  # undo the +1.0 offset added in search()
        return node

    def add_term_filter(self, key: str, value):
        if key:
            self.retrieve_filters.append({"term": {key: value}})
        return self

    def add_range_filter(self, key: str, gte=None, lte=None):
        if key:
            if gte is not None and lte is not None:
                self.retrieve_filters.append({"range": {key: {"gte": gte, "lte": lte}}})
            elif gte is not None:
                self.retrieve_filters.append({"range": {key: {"gte": gte}}})
            elif lte is not None:
                self.retrieve_filters.append({"range": {key: {"lte": lte}}})
        return self

    def clear_filter(self):
        self.retrieve_filters.clear()
        return self

    def search(self, query: str, workspace_id: str, top_k: int = 1, **kwargs) -> List[VectorNode]:
        if not self.exist_workspace(workspace_id=workspace_id):
            logger.warning(f"workspace_id={workspace_id} does not exist!")
            return []

        query_vector = self.embedding_model.get_embeddings(query)
        body = {
            "query": {
                "script_score": {
                    "query": {"bool": {"must": self.retrieve_filters}},
                    "script": {
                        # Elasticsearch rejects negative scores, so shift cosine
                        # similarity from [-1, 1] into [0, 2]; doc2node shifts it back.
                        "source": "cosineSimilarity(params.query_vector, 'vector') + 1.0",
                        "params": {"query_vector": query_vector},
                    }
                }
            },
            "size": top_k
        }
        response = self._client.search(index=workspace_id, body=body, **kwargs)

        nodes: List[VectorNode] = []
        for doc in response["hits"]["hits"]:
            nodes.append(self.doc2node(doc, workspace_id))

        # Accumulated filters are consumed by a single search.
        self.retrieve_filters.clear()
        return nodes

    def insert(self, nodes: VectorNode | List[VectorNode], workspace_id: str, refresh: bool = False, **kwargs):
        if not self.exist_workspace(workspace_id=workspace_id):
            self.create_workspace(workspace_id=workspace_id)

        if isinstance(nodes, VectorNode):
            nodes = [nodes]

        # Only embed nodes that do not already carry a vector.
        embedded_nodes = [node for node in nodes if node.vector]
        not_embedded_nodes = [node for node in nodes if not node.vector]
        now_embedded_nodes = self.embedding_model.get_node_embeddings(not_embedded_nodes)

        docs = [
            {
                "_op_type": "index",
                "_index": workspace_id,
                "_id": node.unique_id,
                "_source": {
                    "workspace_id": workspace_id,
                    "content": node.content,
                    "metadata": node.metadata,
                    "vector": node.vector
                }
            } for node in embedded_nodes + now_embedded_nodes]
        status, error = bulk(self._client, docs, chunk_size=self.batch_size, **kwargs)
        logger.info(f"insert docs.size={len(docs)} status={status} error={error}")

        if refresh:
            self.refresh(workspace_id=workspace_id)

    def delete(self, node_ids: str | List[str], workspace_id: str, refresh: bool = False, **kwargs):
        if not self.exist_workspace(workspace_id=workspace_id):
            logger.warning(f"workspace_id={workspace_id} does not exist!")
            return

        if isinstance(node_ids, str):
            node_ids = [node_ids]

        actions = [
            {
                "_op_type": "delete",
                "_index": workspace_id,
                "_id": node_id
            } for node_id in node_ids]
        status, error = bulk(self._client, actions, chunk_size=self.batch_size, **kwargs)
        logger.info(f"delete actions.size={len(actions)} status={status} error={error}")

        if refresh:
            self.refresh(workspace_id=workspace_id)


def main():
    from dotenv import load_dotenv
    load_dotenv()

    embedding_model = OpenAICompatibleEmbeddingModel(dimensions=64, model_name="text-embedding-v4")
    workspace_id = "rag_nodes_index"
    hosts = "http://11.160.132.46:8200"
    es = EsVectorStore(hosts=hosts, embedding_model=embedding_model)
    if es.exist_workspace(workspace_id=workspace_id):
        es.delete_workspace(workspace_id=workspace_id)
    es.create_workspace(workspace_id=workspace_id)

    sample_nodes = [
        VectorNode(
            workspace_id=workspace_id,
            content="Artificial intelligence is a technology that simulates human intelligence.",
            metadata={"node_type": "n1"}
        ),
        VectorNode(
            workspace_id=workspace_id,
            content="AI is the future of mankind.",
            metadata={"node_type": "n1"}
        ),
        VectorNode(
            workspace_id=workspace_id,
            content="I want to eat fish!",
            metadata={"node_type": "n2"}
        ),
        VectorNode(
            workspace_id=workspace_id,
            content="The bigger the storm, the more expensive the fish.",
            metadata={"node_type": "n1"}
        ),
    ]

    es.insert(sample_nodes, workspace_id=workspace_id, refresh=True)

    logger.info("=" * 20)
    results = es.add_term_filter(key="metadata.node_type", value="n1") \
        .search("What is AI?", top_k=5, workspace_id=workspace_id)
    for r in results:
        logger.info(r.model_dump(exclude={"vector"}))
    logger.info("=" * 20)

    logger.info("=" * 20)
    results = es.search("What is AI?", top_k=5, workspace_id=workspace_id)
    for r in results:
        logger.info(r.model_dump(exclude={"vector"}))
    logger.info("=" * 20)
    es.dump_workspace(workspace_id=workspace_id)
    es.delete_workspace(workspace_id=workspace_id)


if __name__ == "__main__":
    main()
    # launch with: python -m llmflow.vector_store.es_vector_store
```
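Two Elasticsearch-specific behaviors in this file are easy to miss: `insert`/`delete` only make writes immediately searchable when called with `refresh=True` (the demo does this before searching), and the fluent `add_term_filter`/`add_range_filter` calls accumulate in `retrieve_filters`, which a single `search` consumes and then clears. A short usage sketch under those semantics (the `metadata.year` field is hypothetical, and a reachable cluster at `ES_HOSTS` is assumed):

```python
from llmflow.embedding_model.openai_compatible_embedding_model import OpenAICompatibleEmbeddingModel
from llmflow.vector_store.es_vector_store import EsVectorStore

store = EsVectorStore(embedding_model=OpenAICompatibleEmbeddingModel(dimensions=64,
                                                                     model_name="text-embedding-v4"))

# Both filters apply to exactly this one search; retrieve_filters is empty afterwards.
results = (store
           .add_term_filter(key="metadata.node_type", value="n1")
           .add_range_filter(key="metadata.year", gte=2020)  # hypothetical metadata field
           .search("What is AI?", top_k=5, workspace_id="rag_nodes_index"))
```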