pycityagent 2.0.0a22__py3-none-any.whl → 2.0.0a25__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pycityagent/__init__.py +2 -1
- pycityagent/agent.py +13 -2
- pycityagent/environment/simulator.py +5 -5
- pycityagent/llm/__init__.py +7 -2
- pycityagent/llm/embeddings.py +231 -0
- pycityagent/memory/__init__.py +2 -0
- pycityagent/memory/faiss_query.py +302 -0
- pycityagent/memory/memory.py +131 -137
- pycityagent/simulation/agentgroup.py +42 -25
- pycityagent/simulation/simulation.py +9 -1
- {pycityagent-2.0.0a22.dist-info → pycityagent-2.0.0a25.dist-info}/METADATA +5 -1
- {pycityagent-2.0.0a22.dist-info → pycityagent-2.0.0a25.dist-info}/RECORD +13 -12
- pycityagent/llm/embedding.py +0 -136
- {pycityagent-2.0.0a22.dist-info → pycityagent-2.0.0a25.dist-info}/WHEEL +0 -0
pycityagent/__init__.py
CHANGED
@@ -5,6 +5,7 @@ Pycityagent: 城市智能体构建框架
|
|
5
5
|
from .agent import Agent, CitizenAgent, InstitutionAgent
|
6
6
|
from .environment import Simulator
|
7
7
|
import logging
|
8
|
+
from .llm import SentenceEmbedding
|
8
9
|
|
9
10
|
# 创建一个 pycityagent 记录器
|
10
11
|
logger = logging.getLogger("pycityagent")
|
@@ -19,4 +20,4 @@ if not logger.hasHandlers():
|
|
19
20
|
handler.setFormatter(formatter)
|
20
21
|
logger.addHandler(handler)
|
21
22
|
|
22
|
-
__all__ = ["Agent", "Simulator", "CitizenAgent", "InstitutionAgent"]
|
23
|
+
__all__ = ["Agent", "Simulator", "CitizenAgent", "InstitutionAgent","SentenceEmbedding",]
|
pycityagent/agent.py
CHANGED
@@ -236,7 +236,15 @@ class Agent(ABC):
|
|
236
236
|
|
237
237
|
# 添加记忆上下文
|
238
238
|
if self._memory:
|
239
|
-
relevant_memories = await self.
|
239
|
+
relevant_memories = await self.memory.search(survey_prompt)
|
240
|
+
|
241
|
+
formatted_results = []
|
242
|
+
# for result in top_results:
|
243
|
+
# formatted_results.append(
|
244
|
+
# f"- [{result['type']}] {result['content']} "
|
245
|
+
# f"(相关度: {result['similarity']:.2f})"
|
246
|
+
# )
|
247
|
+
|
240
248
|
if relevant_memories:
|
241
249
|
dialog.append(
|
242
250
|
{
|
@@ -458,7 +466,9 @@ class Agent(ABC):
|
|
458
466
|
topic = f"exps/{self._exp_id}/agents/{to_agent_uuid}/{sub_topic}"
|
459
467
|
await self._messager.send_message(topic, payload)
|
460
468
|
|
461
|
-
async def send_message_to_agent(
|
469
|
+
async def send_message_to_agent(
|
470
|
+
self, to_agent_uuid: str, content: str, type: str = "social"
|
471
|
+
):
|
462
472
|
"""通过 Messager 发送消息"""
|
463
473
|
if self._messager is None:
|
464
474
|
raise RuntimeError("Messager is not set")
|
@@ -598,6 +608,7 @@ class CitizenAgent(Agent):
|
|
598
608
|
# 防止模拟器还没有到prepare阶段导致get_person出错
|
599
609
|
self._has_bound_to_simulator = True
|
600
610
|
self._agent_id = person_id
|
611
|
+
self.memory.set_agent_id(person_id)
|
601
612
|
|
602
613
|
async def _bind_to_economy(self):
|
603
614
|
if self._economy_client is None:
|
@@ -3,7 +3,6 @@
|
|
3
3
|
import asyncio
|
4
4
|
import logging
|
5
5
|
import os
|
6
|
-
from collections.abc import Sequence
|
7
6
|
from datetime import datetime, timedelta
|
8
7
|
from typing import Any, Optional, Union, cast
|
9
8
|
|
@@ -22,13 +21,14 @@ from .utils.const import *
|
|
22
21
|
|
23
22
|
logger = logging.getLogger("pycityagent")
|
24
23
|
|
24
|
+
|
25
25
|
class Simulator:
|
26
26
|
"""
|
27
27
|
- 模拟器主类
|
28
28
|
- Simulator Class
|
29
29
|
"""
|
30
30
|
|
31
|
-
def __init__(self, config:dict, secure: bool = False) -> None:
|
31
|
+
def __init__(self, config: dict, secure: bool = False) -> None:
|
32
32
|
self.config = config
|
33
33
|
"""
|
34
34
|
- 模拟器配置
|
@@ -193,7 +193,7 @@ class Simulator:
|
|
193
193
|
else:
|
194
194
|
# BUG: 返回的time是float类型
|
195
195
|
return t_sec["t"]
|
196
|
-
|
196
|
+
|
197
197
|
async def get_simulator_day(self) -> int:
|
198
198
|
"""
|
199
199
|
获取模拟器到第几日
|
@@ -202,7 +202,7 @@ class Simulator:
|
|
202
202
|
t_sec = cast(dict[str, int], t_sec)
|
203
203
|
day = t_sec["t"] // 86400
|
204
204
|
return day
|
205
|
-
|
205
|
+
|
206
206
|
async def get_simulator_second_from_start_of_day(self) -> int:
|
207
207
|
"""
|
208
208
|
获取模拟器从00:00:00到当前的秒数
|
@@ -316,7 +316,7 @@ class Simulator:
|
|
316
316
|
radius: float,
|
317
317
|
poi_type: Union[str, list[str]],
|
318
318
|
):
|
319
|
-
if
|
319
|
+
if isinstance(poi_type, str):
|
320
320
|
poi_type = [poi_type]
|
321
321
|
transformed_poi_type = []
|
322
322
|
for t in poi_type:
|
pycityagent/llm/__init__.py
CHANGED
@@ -1,6 +1,11 @@
|
|
1
1
|
"""LLM相关模块"""
|
2
2
|
|
3
|
+
from .embeddings import SentenceEmbedding, SimpleEmbedding
|
3
4
|
from .llm import LLM, LLMConfig
|
4
|
-
from .embedding import SimpleEmbedding
|
5
5
|
|
6
|
-
__all__ = [
|
6
|
+
__all__ = [
|
7
|
+
"LLM",
|
8
|
+
"LLMConfig",
|
9
|
+
"SentenceEmbedding",
|
10
|
+
"SimpleEmbedding",
|
11
|
+
]
|
@@ -0,0 +1,231 @@
|
|
1
|
+
import hashlib
|
2
|
+
import json
|
3
|
+
import os
|
4
|
+
from typing import Optional, Union
|
5
|
+
|
6
|
+
import numpy as np
|
7
|
+
import torch
|
8
|
+
from langchain_core.embeddings import Embeddings
|
9
|
+
from transformers import AutoModel, AutoTokenizer
|
10
|
+
|
11
|
+
__all__ = [
|
12
|
+
"SentenceEmbedding",
|
13
|
+
"SimpleEmbedding",
|
14
|
+
]
|
15
|
+
|
16
|
+
|
17
|
+
class SentenceEmbedding(Embeddings):
|
18
|
+
def __init__(
|
19
|
+
self,
|
20
|
+
pretrained_model_name_or_path: Union[str, os.PathLike] = "BAAI/bge-m3",
|
21
|
+
max_seq_len: int = 8192,
|
22
|
+
auto_cuda: bool = False,
|
23
|
+
local_files_only: bool = False,
|
24
|
+
cache_dir: str = "./cache",
|
25
|
+
proxies: Optional[dict] = None,
|
26
|
+
):
|
27
|
+
os.makedirs(cache_dir, exist_ok=True)
|
28
|
+
self.tokenizer = AutoTokenizer.from_pretrained(
|
29
|
+
pretrained_model_name_or_path,
|
30
|
+
proxies=proxies,
|
31
|
+
cache_dir=cache_dir,
|
32
|
+
local_files_only=local_files_only,
|
33
|
+
)
|
34
|
+
self.model = AutoModel.from_pretrained(
|
35
|
+
pretrained_model_name_or_path,
|
36
|
+
proxies=proxies,
|
37
|
+
cache_dir=cache_dir,
|
38
|
+
local_files_only=local_files_only,
|
39
|
+
)
|
40
|
+
self._cuda = auto_cuda and torch.cuda.is_available()
|
41
|
+
|
42
|
+
if self._cuda:
|
43
|
+
self.model = self.model.cuda()
|
44
|
+
|
45
|
+
self.model.eval()
|
46
|
+
self.max_seq_len = max_seq_len
|
47
|
+
|
48
|
+
def _embed(self, texts: list[str]) -> list[list[float]]:
|
49
|
+
# Tokenize sentences
|
50
|
+
encoded_input = self.tokenizer(
|
51
|
+
texts, padding=True, truncation=True, return_tensors="pt"
|
52
|
+
)
|
53
|
+
# for s2p(short query to long passage) retrieval task, add an instruction to query (not add instruction for passages)
|
54
|
+
# encoded_input = tokenizer([instruction + q for q in queries], padding=True, truncation=True, return_tensors='pt')
|
55
|
+
|
56
|
+
# check length of input
|
57
|
+
# assert seq_len <= 8192
|
58
|
+
assert encoded_input["input_ids"].shape[1] <= self.max_seq_len # type: ignore
|
59
|
+
|
60
|
+
if self._cuda:
|
61
|
+
encoded_input = {k: v.cuda() for k, v in encoded_input.items()}
|
62
|
+
# Compute token embeddings
|
63
|
+
with torch.no_grad():
|
64
|
+
model_output = self.model(**encoded_input)
|
65
|
+
# Perform pooling. In this case, cls pooling.
|
66
|
+
sentence_embeddings = model_output[0][:, 0]
|
67
|
+
# normalize embeddings
|
68
|
+
sentence_embeddings = torch.nn.functional.normalize(
|
69
|
+
sentence_embeddings, p=2, dim=1
|
70
|
+
)
|
71
|
+
if self._cuda:
|
72
|
+
sentence_embeddings = sentence_embeddings.cpu()
|
73
|
+
return sentence_embeddings.tolist()
|
74
|
+
|
75
|
+
def embed_documents(self, texts: list[str]) -> list[list[float]]:
|
76
|
+
"""Embed documents."""
|
77
|
+
return self._embed(texts)
|
78
|
+
|
79
|
+
def embed_query(self, text: str) -> list[float]:
|
80
|
+
"""Embed query text."""
|
81
|
+
return self._embed([text])[0]
|
82
|
+
|
83
|
+
|
84
|
+
class SimpleEmbedding(Embeddings):
|
85
|
+
"""简单的基于内存的embedding实现
|
86
|
+
|
87
|
+
使用简单的词袋模型(Bag of Words)和TF-IDF来生成文本的向量表示。
|
88
|
+
所有向量都保存在内存中,适用于小规模应用。
|
89
|
+
"""
|
90
|
+
|
91
|
+
def __init__(self, vector_dim: int = 128, cache_size: int = 1000):
|
92
|
+
"""初始化
|
93
|
+
|
94
|
+
Args:
|
95
|
+
vector_dim: 向量维度
|
96
|
+
cache_size: 缓存大小,超过此大小将清除最早的缓存
|
97
|
+
"""
|
98
|
+
self.vector_dim = vector_dim
|
99
|
+
self.cache_size = cache_size
|
100
|
+
self._cache: dict[str, list[float]] = {}
|
101
|
+
self._vocab: dict[str, int] = {} # 词汇表
|
102
|
+
self._idf: dict[str, float] = {} # 逆文档频率
|
103
|
+
self._doc_count = 0 # 文档总数
|
104
|
+
|
105
|
+
def _text_to_hash(self, text: str) -> str:
|
106
|
+
"""将文本转换为hash值"""
|
107
|
+
return hashlib.md5(text.encode()).hexdigest()
|
108
|
+
|
109
|
+
def _tokenize(self, text: str) -> list[str]:
|
110
|
+
"""简单的分词"""
|
111
|
+
# 这里使用简单的空格分词,实际应用中可以使用更复杂的分词方法
|
112
|
+
return text.lower().split()
|
113
|
+
|
114
|
+
def _update_vocab(self, tokens: list[str]):
|
115
|
+
"""更新词汇表"""
|
116
|
+
for token in set(tokens): # 使用set去重
|
117
|
+
if token not in self._vocab:
|
118
|
+
self._vocab[token] = len(self._vocab)
|
119
|
+
|
120
|
+
def _update_idf(self, tokens: list[str]):
|
121
|
+
"""更新IDF值"""
|
122
|
+
self._doc_count += 1
|
123
|
+
unique_tokens = set(tokens)
|
124
|
+
for token in unique_tokens:
|
125
|
+
self._idf[token] = self._idf.get(token, 0) + 1
|
126
|
+
|
127
|
+
def _calculate_tf(self, tokens: list[str]) -> dict[str, float]:
|
128
|
+
"""计算词频(TF)"""
|
129
|
+
tf = {}
|
130
|
+
total_tokens = len(tokens)
|
131
|
+
for token in tokens:
|
132
|
+
tf[token] = tf.get(token, 0) + 1
|
133
|
+
# 归一化
|
134
|
+
for token in tf:
|
135
|
+
tf[token] /= total_tokens
|
136
|
+
return tf
|
137
|
+
|
138
|
+
def _calculate_tfidf(self, tokens: list[str]) -> list[float]:
|
139
|
+
"""计算TF-IDF向量"""
|
140
|
+
vector = np.zeros(self.vector_dim)
|
141
|
+
tf = self._calculate_tf(tokens)
|
142
|
+
|
143
|
+
for token, tf_value in tf.items():
|
144
|
+
if token in self._idf:
|
145
|
+
idf = np.log(self._doc_count / self._idf[token])
|
146
|
+
idx = self._vocab[token] % self.vector_dim # 使用取模运算来控制向量维度
|
147
|
+
vector[idx] += tf_value * idf
|
148
|
+
|
149
|
+
# L2归一化
|
150
|
+
norm = np.linalg.norm(vector)
|
151
|
+
if norm > 0:
|
152
|
+
vector /= norm
|
153
|
+
|
154
|
+
return list(vector)
|
155
|
+
|
156
|
+
def _embed(self, text: str) -> list[float]:
|
157
|
+
"""生成文本的向量表示
|
158
|
+
|
159
|
+
Args:
|
160
|
+
text: 输入文本
|
161
|
+
|
162
|
+
Returns:
|
163
|
+
np.ndarray: 文本的向量表示
|
164
|
+
"""
|
165
|
+
# 检查缓存
|
166
|
+
text_hash = self._text_to_hash(text)
|
167
|
+
if text_hash in self._cache:
|
168
|
+
return self._cache[text_hash]
|
169
|
+
|
170
|
+
# 分词
|
171
|
+
tokens = self._tokenize(text)
|
172
|
+
if not tokens:
|
173
|
+
return list(np.zeros(self.vector_dim))
|
174
|
+
|
175
|
+
# 更新词汇表和IDF
|
176
|
+
self._update_vocab(tokens)
|
177
|
+
self._update_idf(tokens)
|
178
|
+
|
179
|
+
# 计算向量
|
180
|
+
vector = self._calculate_tfidf(tokens)
|
181
|
+
|
182
|
+
# 更新缓存
|
183
|
+
if len(self._cache) >= self.cache_size:
|
184
|
+
# 删除最早的缓存
|
185
|
+
oldest_key = next(iter(self._cache))
|
186
|
+
del self._cache[oldest_key]
|
187
|
+
self._cache[text_hash] = vector
|
188
|
+
|
189
|
+
return list(vector)
|
190
|
+
|
191
|
+
def embed_documents(self, texts: list[str]) -> list[list[float]]:
|
192
|
+
"""Embed documents."""
|
193
|
+
return [self._embed(text) for text in texts]
|
194
|
+
|
195
|
+
def embed_query(self, text: str) -> list[float]:
|
196
|
+
"""Embed query text."""
|
197
|
+
return self._embed(text)
|
198
|
+
|
199
|
+
# def save(self, file_path: str):
|
200
|
+
# """保存模型"""
|
201
|
+
# state = {
|
202
|
+
# "vector_dim": self.vector_dim,
|
203
|
+
# "cache_size": self.cache_size,
|
204
|
+
# "vocab": self._vocab,
|
205
|
+
# "idf": self._idf,
|
206
|
+
# "doc_count": self._doc_count,
|
207
|
+
# }
|
208
|
+
# with open(file_path, "w") as f:
|
209
|
+
# json.dump(state, f)
|
210
|
+
|
211
|
+
# def load(self, file_path: str):
|
212
|
+
# """加载模型"""
|
213
|
+
# with open(file_path, "r") as f:
|
214
|
+
# state = json.load(f)
|
215
|
+
# self.vector_dim = state["vector_dim"]
|
216
|
+
# self.cache_size = state["cache_size"]
|
217
|
+
# self._vocab = state["vocab"]
|
218
|
+
# self._idf = state["idf"]
|
219
|
+
# self._doc_count = state["doc_count"]
|
220
|
+
# self._cache = {} # 清空缓存
|
221
|
+
|
222
|
+
|
223
|
+
if __name__ == "__main__":
|
224
|
+
# se = SentenceEmbedding(
|
225
|
+
# pretrained_model_name_or_path="ignore/BAAI--bge-m3", cache_dir="ignore"
|
226
|
+
# )
|
227
|
+
se = SimpleEmbedding()
|
228
|
+
print(se.embed_query("hello world"))
|
229
|
+
print(se.embed_query("hello world"))
|
230
|
+
print(se.embed_query("hello world"))
|
231
|
+
print(se.embed_query("hello world"))
|
pycityagent/memory/__init__.py
CHANGED
@@ -1,5 +1,6 @@
|
|
1
1
|
"""Memory."""
|
2
2
|
|
3
|
+
from .faiss_query import FaissQuery
|
3
4
|
from .memory import Memory
|
4
5
|
from .memory_base import MemoryBase, MemoryUnit
|
5
6
|
from .profile import ProfileMemory, ProfileMemoryUnit
|
@@ -8,4 +9,5 @@ from .state import StateMemory
|
|
8
9
|
|
9
10
|
__all__ = [
|
10
11
|
"Memory",
|
12
|
+
"FaissQuery",
|
11
13
|
]
|
@@ -0,0 +1,302 @@
|
|
1
|
+
import asyncio
|
2
|
+
from collections.abc import Sequence
|
3
|
+
from typing import Any, Literal, Optional, Union
|
4
|
+
|
5
|
+
import faiss
|
6
|
+
import numpy as np
|
7
|
+
from langchain_community.docstore.in_memory import InMemoryDocstore
|
8
|
+
from langchain_community.vectorstores import FAISS
|
9
|
+
from langchain_core.documents import Document
|
10
|
+
from langchain_core.embeddings import Embeddings
|
11
|
+
|
12
|
+
from ..utils.decorators import lock_decorator
|
13
|
+
|
14
|
+
|
15
|
+
class FaissQuery:
|
16
|
+
def __init__(
|
17
|
+
self,
|
18
|
+
embeddings: Optional[Embeddings] = None,
|
19
|
+
index_type: Any = faiss.IndexFlatL2,
|
20
|
+
dimension: Optional[int] = None,
|
21
|
+
) -> None:
|
22
|
+
self._embeddings = embeddings
|
23
|
+
self._lock = asyncio.Lock()
|
24
|
+
if embeddings is None:
|
25
|
+
self._index = None
|
26
|
+
self._vectors_store = None
|
27
|
+
else:
|
28
|
+
if dimension is None:
|
29
|
+
dimension = len(embeddings.embed_query("hello world"))
|
30
|
+
self._index = index_type(dimension)
|
31
|
+
self._vectors_store = FAISS(
|
32
|
+
embedding_function=embeddings,
|
33
|
+
index=self._index,
|
34
|
+
docstore=InMemoryDocstore(),
|
35
|
+
index_to_docstore_id={},
|
36
|
+
)
|
37
|
+
|
38
|
+
@property
|
39
|
+
def embeddings(
|
40
|
+
self,
|
41
|
+
) -> Embeddings:
|
42
|
+
if self._embeddings is None:
|
43
|
+
raise RuntimeError(f"No embedding set, please `set_embeddings` first!")
|
44
|
+
return self._embeddings
|
45
|
+
|
46
|
+
@property
|
47
|
+
def vectors_store(
|
48
|
+
self,
|
49
|
+
) -> FAISS:
|
50
|
+
if self._vectors_store is None:
|
51
|
+
raise RuntimeError(f"No embedding set, thus no vector stores initialized!")
|
52
|
+
return self._vectors_store
|
53
|
+
|
54
|
+
@lock_decorator
|
55
|
+
async def add_documents(
|
56
|
+
self,
|
57
|
+
agent_id: int,
|
58
|
+
documents: Union[str, Sequence[str]],
|
59
|
+
extra_tags: Optional[dict] = None,
|
60
|
+
) -> list[str]:
|
61
|
+
if isinstance(documents, str):
|
62
|
+
documents = [documents]
|
63
|
+
_metadata = {"_id": agent_id}
|
64
|
+
if extra_tags is not None:
|
65
|
+
_metadata.update(extra_tags)
|
66
|
+
to_add_documents = [
|
67
|
+
Document(page_content=doc, metadata=_metadata) for doc in documents
|
68
|
+
]
|
69
|
+
return await self.vectors_store.aadd_documents(
|
70
|
+
documents=to_add_documents,
|
71
|
+
)
|
72
|
+
|
73
|
+
@lock_decorator
|
74
|
+
async def delete_documents(
|
75
|
+
self,
|
76
|
+
to_delete_ids: list[str],
|
77
|
+
):
|
78
|
+
await self.vectors_store.adelete(
|
79
|
+
ids=to_delete_ids,
|
80
|
+
)
|
81
|
+
|
82
|
+
@lock_decorator
|
83
|
+
async def similarity_search(
|
84
|
+
self,
|
85
|
+
query: str,
|
86
|
+
agent_id: int,
|
87
|
+
k: int = 4,
|
88
|
+
fetch_k: int = 20,
|
89
|
+
return_score_type: Union[
|
90
|
+
Literal["none"], Literal["similarity_score"], Literal["L2-distance"]
|
91
|
+
] = "none",
|
92
|
+
filter: Optional[dict] = None,
|
93
|
+
) -> Union[list[tuple[str, dict]], list[tuple[str, float, dict]]]:
|
94
|
+
"""
|
95
|
+
Return content most similar to the given query.
|
96
|
+
|
97
|
+
Args:
|
98
|
+
query (str): The text to look up documents similar to.
|
99
|
+
agent_id (int): The identifier of the agent to filter specific documents. Only documents associated with this agent will be considered.
|
100
|
+
k (int, optional): The number of top similar contents to return. Defaults to 4.
|
101
|
+
fetch_k (int, optional): The number of documents to fetch before applying any filters. Defaults to 20.
|
102
|
+
return_score_type (Union[Literal["none"], Literal["similarity_score"], Literal["L2-distance"]], optional):
|
103
|
+
Specifies whether and how to return similarity scores with the results:
|
104
|
+
- "none": Do not return scores; only return the contents (default).
|
105
|
+
- "similarity_score": Return a tuple of content and its similarity score.
|
106
|
+
- "L2-distance": Return a tuple of content and its L2 distance from the query.
|
107
|
+
filter (dict, optional): The filter dict for metadata.
|
108
|
+
|
109
|
+
Returns:
|
110
|
+
Union[list[tuple[str,dict]], list[tuple[str, float,dict]]]:
|
111
|
+
Depending on the `return_score_type` parameter, returns either a list of strings representing the top-k similar contents,
|
112
|
+
or a list of tuples where each tuple contains a string and a floating-point score.
|
113
|
+
"""
|
114
|
+
_filter = {
|
115
|
+
"_id": agent_id,
|
116
|
+
}
|
117
|
+
if filter is not None:
|
118
|
+
_filter.update(filter)
|
119
|
+
if return_score_type == "L2-distance":
|
120
|
+
_result = await self.vectors_store.asimilarity_search_with_score(
|
121
|
+
query=query,
|
122
|
+
k=k,
|
123
|
+
filter=_filter,
|
124
|
+
fetch_k=fetch_k,
|
125
|
+
)
|
126
|
+
return [(r.page_content, s, r.metadata) for r, s in _result]
|
127
|
+
elif return_score_type == "none":
|
128
|
+
_result = await self.vectors_store.asimilarity_search(
|
129
|
+
query=query,
|
130
|
+
k=k,
|
131
|
+
filter=_filter,
|
132
|
+
fetch_k=fetch_k,
|
133
|
+
)
|
134
|
+
return [(r.page_content, r.metadata) for r in _result]
|
135
|
+
elif return_score_type == "similarity_score":
|
136
|
+
_result = await self.vectors_store.asimilarity_search_with_relevance_scores(
|
137
|
+
query=query,
|
138
|
+
k=k,
|
139
|
+
filter=_filter,
|
140
|
+
fetch_k=fetch_k,
|
141
|
+
)
|
142
|
+
return [(r.page_content, s, r.metadata) for r, s in _result]
|
143
|
+
else:
|
144
|
+
raise ValueError(f"Invalid `return_score_type` {return_score_type}!")
|
145
|
+
|
146
|
+
@lock_decorator
|
147
|
+
async def similarity_search_by_embedding(
|
148
|
+
self,
|
149
|
+
embedding: list[float],
|
150
|
+
agent_id: int,
|
151
|
+
k: int = 4,
|
152
|
+
fetch_k: int = 20,
|
153
|
+
return_score_type: Union[Literal["none"], Literal["L2-distance"]] = "none",
|
154
|
+
filter: Optional[dict] = None,
|
155
|
+
) -> Union[list[tuple[str, dict]], list[tuple[str, float, dict]]]:
|
156
|
+
"""
|
157
|
+
Return content most similar to the given query.
|
158
|
+
|
159
|
+
Args:
|
160
|
+
embedding (list[float]): The vector to look up documents similar to.
|
161
|
+
agent_id (int): The identifier of the agent to filter specific documents. Only documents associated with this agent will be considered.
|
162
|
+
k (int, optional): The number of top similar contents to return. Defaults to 4.
|
163
|
+
fetch_k (int, optional): The number of documents to fetch before applying any filters. Defaults to 20.
|
164
|
+
return_score_type (Union[Literal["none"], Literal["similarity_score"], Literal["L2-distance"]], optional):
|
165
|
+
Specifies whether and how to return similarity scores with the results:
|
166
|
+
- "none": Do not return scores; only return the contents (default).
|
167
|
+
- "L2-distance": Return a tuple of content and its L2 distance from the query.
|
168
|
+
filter (dict, optional): The filter dict for metadata.
|
169
|
+
|
170
|
+
Returns:
|
171
|
+
Union[list[tuple[str,dict]], list[tuple[str, float,dict]]]:
|
172
|
+
Depending on the `return_score_type` parameter, returns either a list of strings representing the top-k similar contents,
|
173
|
+
or a list of tuples where each tuple contains a string and a floating-point score.
|
174
|
+
"""
|
175
|
+
_filter = {
|
176
|
+
"_id": agent_id,
|
177
|
+
}
|
178
|
+
if filter is not None:
|
179
|
+
_filter.update(filter)
|
180
|
+
if return_score_type == "L2-distance":
|
181
|
+
_result = await self.vectors_store.asimilarity_search_with_score_by_vector(
|
182
|
+
embedding=embedding,
|
183
|
+
k=k,
|
184
|
+
filter=_filter,
|
185
|
+
fetch_k=fetch_k,
|
186
|
+
)
|
187
|
+
return [(r.page_content, s, r.metadata) for r, s in _result]
|
188
|
+
elif return_score_type == "none":
|
189
|
+
_result = await self.vectors_store.asimilarity_search_by_vector(
|
190
|
+
embedding=embedding,
|
191
|
+
k=k,
|
192
|
+
filter=_filter,
|
193
|
+
fetch_k=fetch_k,
|
194
|
+
)
|
195
|
+
return [(r.page_content, r.metadata) for r in _result]
|
196
|
+
else:
|
197
|
+
raise ValueError(f"Invalid `return_score_type` {return_score_type}!")
|
198
|
+
|
199
|
+
@lock_decorator
|
200
|
+
async def marginal_relevance_search(
|
201
|
+
self,
|
202
|
+
query: str,
|
203
|
+
agent_id: int,
|
204
|
+
k: int = 4,
|
205
|
+
fetch_k: int = 20,
|
206
|
+
lambda_mult: float = 0.5,
|
207
|
+
return_score_type: Literal["none"] = "none",
|
208
|
+
filter: Optional[dict] = None,
|
209
|
+
) -> list[tuple[str, dict]]:
|
210
|
+
"""
|
211
|
+
Return contents selected using the maximal marginal relevance asynchronously.
|
212
|
+
|
213
|
+
Args:
|
214
|
+
query (str): The text to look up documents similar to.
|
215
|
+
agent_id (int): The identifier of the agent to filter specific documents. Only documents associated with this agent will be considered.
|
216
|
+
k (int, optional): The number of top similar contents to return. Defaults to 4.
|
217
|
+
fetch_k (int, optional): The number of documents to fetch before applying any filters. Defaults to 20.
|
218
|
+
lambda_mult (float): Number between 0 and 1 that determines the degree of diversity among the results with 0 corresponding to maximum diversity and 1 to minimum diversity. Defaults to 0.5.
|
219
|
+
return_score_type (Literal["none"].,optional):
|
220
|
+
Specifies whether and how to return similarity scores with the results:
|
221
|
+
- "none": Do not return scores; only return the contents (default).
|
222
|
+
filter (dict, optional): The filter dict for metadata.
|
223
|
+
|
224
|
+
Returns:
|
225
|
+
list[tuple[str,dict]]: the result contents.
|
226
|
+
"""
|
227
|
+
_filter = {
|
228
|
+
"_id": agent_id,
|
229
|
+
}
|
230
|
+
if filter is not None:
|
231
|
+
_filter.update(filter)
|
232
|
+
|
233
|
+
if return_score_type == "none":
|
234
|
+
_result = await self.vectors_store.amax_marginal_relevance_search(
|
235
|
+
query=query,
|
236
|
+
k=k,
|
237
|
+
filter=_filter,
|
238
|
+
fetch_k=fetch_k,
|
239
|
+
lambda_mult=lambda_mult,
|
240
|
+
)
|
241
|
+
return [(r.page_content, r.metadata) for r in _result]
|
242
|
+
else:
|
243
|
+
raise ValueError(f"Invalid `return_score_type` {return_score_type}!")
|
244
|
+
|
245
|
+
@lock_decorator
|
246
|
+
async def marginal_relevance_search_by_embedding(
|
247
|
+
self,
|
248
|
+
embedding: list[float],
|
249
|
+
agent_id: int,
|
250
|
+
k: int = 4,
|
251
|
+
fetch_k: int = 20,
|
252
|
+
lambda_mult: float = 0.5,
|
253
|
+
return_score_type: Union[Literal["none"], Literal["similarity_score"]] = "none",
|
254
|
+
filter: Optional[dict] = None,
|
255
|
+
) -> Union[list[tuple[str, dict]], list[tuple[str, float, dict]]]:
|
256
|
+
"""
|
257
|
+
Return contents selected using the maximal marginal relevance asynchronously.
|
258
|
+
|
259
|
+
Args:
|
260
|
+
embedding (list[float]): The vector to look up documents similar to.
|
261
|
+
agent_id (int): The identifier of the agent to filter specific documents. Only documents associated with this agent will be considered.
|
262
|
+
k (int, optional): The number of top similar contents to return. Defaults to 4.
|
263
|
+
fetch_k (int, optional): The number of documents to fetch before applying any filters. Defaults to 20.
|
264
|
+
lambda_mult (float): Number between 0 and 1 that determines the degree of diversity among the results with 0 corresponding to maximum diversity and 1 to minimum diversity. Defaults to 0.5.
|
265
|
+
return_score_type (Union[Literal["none"], Literal["similarity_score"]], optional):
|
266
|
+
Specifies whether and how to return similarity scores with the results:
|
267
|
+
- "none": Do not return scores; only return the contents (default).
|
268
|
+
- "similarity_score": Return a tuple of content and its similarity score.
|
269
|
+
filter (dict, optional): The filter dict for metadata.
|
270
|
+
|
271
|
+
Returns:
|
272
|
+
Union[list[tuple[str,dict]], list[tuple[str, float,dict]]]:
|
273
|
+
Depending on the `return_score_type` parameter, returns either a list of strings representing the top-k similar contents,
|
274
|
+
or a list of tuples where each tuple contains a string and a floating-point score.
|
275
|
+
"""
|
276
|
+
|
277
|
+
_filter = {
|
278
|
+
"_id": agent_id,
|
279
|
+
}
|
280
|
+
if filter is not None:
|
281
|
+
_filter.update(filter)
|
282
|
+
if return_score_type == "none":
|
283
|
+
_result = await self.vectors_store.amax_marginal_relevance_search_by_vector(
|
284
|
+
embedding=embedding,
|
285
|
+
k=k,
|
286
|
+
filter=_filter,
|
287
|
+
fetch_k=fetch_k,
|
288
|
+
lambda_mult=lambda_mult,
|
289
|
+
)
|
290
|
+
return [(r.page_content, r.metadata) for r in _result]
|
291
|
+
elif return_score_type == "similarity_score":
|
292
|
+
_result = await self.vectors_store.amax_marginal_relevance_search_with_score_by_vector(
|
293
|
+
embedding=embedding,
|
294
|
+
k=k,
|
295
|
+
filter=_filter,
|
296
|
+
fetch_k=fetch_k,
|
297
|
+
lambda_mult=lambda_mult,
|
298
|
+
)
|
299
|
+
return [(r.page_content, s, r.metadata) for r, s in _result]
|
300
|
+
|
301
|
+
else:
|
302
|
+
raise ValueError(f"Invalid `return_score_type` {return_score_type}!")
|