pycityagent 2.0.0a22__py3-none-any.whl → 2.0.0a24__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pycityagent/__init__.py +2 -1
- pycityagent/agent.py +13 -2
- pycityagent/llm/__init__.py +7 -2
- pycityagent/llm/embeddings.py +231 -0
- pycityagent/memory/__init__.py +2 -0
- pycityagent/memory/faiss_query.py +302 -0
- pycityagent/memory/memory.py +131 -137
- pycityagent/simulation/agentgroup.py +42 -25
- pycityagent/simulation/simulation.py +9 -1
- {pycityagent-2.0.0a22.dist-info → pycityagent-2.0.0a24.dist-info}/METADATA +5 -1
- {pycityagent-2.0.0a22.dist-info → pycityagent-2.0.0a24.dist-info}/RECORD +12 -11
- pycityagent/llm/embedding.py +0 -136
- {pycityagent-2.0.0a22.dist-info → pycityagent-2.0.0a24.dist-info}/WHEEL +0 -0
pycityagent/__init__.py
CHANGED
@@ -5,6 +5,7 @@ Pycityagent: 城市智能体构建框架
|
|
5
5
|
from .agent import Agent, CitizenAgent, InstitutionAgent
|
6
6
|
from .environment import Simulator
|
7
7
|
import logging
|
8
|
+
from .llm import SentenceEmbedding
|
8
9
|
|
9
10
|
# 创建一个 pycityagent 记录器
|
10
11
|
logger = logging.getLogger("pycityagent")
|
@@ -19,4 +20,4 @@ if not logger.hasHandlers():
|
|
19
20
|
handler.setFormatter(formatter)
|
20
21
|
logger.addHandler(handler)
|
21
22
|
|
22
|
-
__all__ = ["Agent", "Simulator", "CitizenAgent", "InstitutionAgent"]
|
23
|
+
__all__ = ["Agent", "Simulator", "CitizenAgent", "InstitutionAgent","SentenceEmbedding",]
|
pycityagent/agent.py
CHANGED
@@ -236,7 +236,15 @@ class Agent(ABC):
|
|
236
236
|
|
237
237
|
# 添加记忆上下文
|
238
238
|
if self._memory:
|
239
|
-
relevant_memories = await self.
|
239
|
+
relevant_memories = await self.memory.search(survey_prompt)
|
240
|
+
|
241
|
+
formatted_results = []
|
242
|
+
# for result in top_results:
|
243
|
+
# formatted_results.append(
|
244
|
+
# f"- [{result['type']}] {result['content']} "
|
245
|
+
# f"(相关度: {result['similarity']:.2f})"
|
246
|
+
# )
|
247
|
+
|
240
248
|
if relevant_memories:
|
241
249
|
dialog.append(
|
242
250
|
{
|
@@ -458,7 +466,9 @@ class Agent(ABC):
|
|
458
466
|
topic = f"exps/{self._exp_id}/agents/{to_agent_uuid}/{sub_topic}"
|
459
467
|
await self._messager.send_message(topic, payload)
|
460
468
|
|
461
|
-
async def send_message_to_agent(
|
469
|
+
async def send_message_to_agent(
|
470
|
+
self, to_agent_uuid: str, content: str, type: str = "social"
|
471
|
+
):
|
462
472
|
"""通过 Messager 发送消息"""
|
463
473
|
if self._messager is None:
|
464
474
|
raise RuntimeError("Messager is not set")
|
@@ -598,6 +608,7 @@ class CitizenAgent(Agent):
|
|
598
608
|
# 防止模拟器还没有到prepare阶段导致get_person出错
|
599
609
|
self._has_bound_to_simulator = True
|
600
610
|
self._agent_id = person_id
|
611
|
+
self.memory.set_agent_id(person_id)
|
601
612
|
|
602
613
|
async def _bind_to_economy(self):
|
603
614
|
if self._economy_client is None:
|
pycityagent/llm/__init__.py
CHANGED
@@ -1,6 +1,11 @@
|
|
1
1
|
"""LLM相关模块"""
|
2
2
|
|
3
|
+
from .embeddings import SentenceEmbedding, SimpleEmbedding
|
3
4
|
from .llm import LLM, LLMConfig
|
4
|
-
from .embedding import SimpleEmbedding
|
5
5
|
|
6
|
-
__all__ = [
|
6
|
+
__all__ = [
|
7
|
+
"LLM",
|
8
|
+
"LLMConfig",
|
9
|
+
"SentenceEmbedding",
|
10
|
+
"SimpleEmbedding",
|
11
|
+
]
|
@@ -0,0 +1,231 @@
|
|
1
|
+
import hashlib
|
2
|
+
import json
|
3
|
+
import os
|
4
|
+
from typing import Optional, Union
|
5
|
+
|
6
|
+
import numpy as np
|
7
|
+
import torch
|
8
|
+
from langchain_core.embeddings import Embeddings
|
9
|
+
from transformers import AutoModel, AutoTokenizer
|
10
|
+
|
11
|
+
__all__ = [
|
12
|
+
"SentenceEmbedding",
|
13
|
+
"SimpleEmbedding",
|
14
|
+
]
|
15
|
+
|
16
|
+
|
17
|
+
class SentenceEmbedding(Embeddings):
    """Dense sentence embeddings computed with a Hugging Face transformer.

    Each text is tokenized, passed through the encoder, reduced to a single
    vector by CLS pooling (the hidden state of the first token) and then
    L2-normalized.
    """

    def __init__(
        self,
        pretrained_model_name_or_path: Union[str, os.PathLike] = "BAAI/bge-m3",
        max_seq_len: int = 8192,
        auto_cuda: bool = False,
        local_files_only: bool = False,
        cache_dir: str = "./cache",
        proxies: Optional[dict] = None,
    ):
        """Load the tokenizer and the model.

        Args:
            pretrained_model_name_or_path: Model identifier or local path,
                forwarded to ``from_pretrained``.
            max_seq_len: Maximum number of tokens per text; longer inputs
                are truncated to this length.
            auto_cuda: Move the model to the GPU when CUDA is available.
            local_files_only: Forwarded to ``from_pretrained``.
            cache_dir: Cache directory forwarded to ``from_pretrained``;
                created if it does not exist.
            proxies: Forwarded to ``from_pretrained``.
        """
        os.makedirs(cache_dir, exist_ok=True)
        self.tokenizer = AutoTokenizer.from_pretrained(
            pretrained_model_name_or_path,
            proxies=proxies,
            cache_dir=cache_dir,
            local_files_only=local_files_only,
        )
        self.model = AutoModel.from_pretrained(
            pretrained_model_name_or_path,
            proxies=proxies,
            cache_dir=cache_dir,
            local_files_only=local_files_only,
        )
        self._cuda = auto_cuda and torch.cuda.is_available()

        if self._cuda:
            self.model = self.model.cuda()

        # Inference only: switch off dropout and similar training behavior.
        self.model.eval()
        self.max_seq_len = max_seq_len

    def _embed(self, texts: list[str]) -> list[list[float]]:
        """Embed a batch of texts.

        Args:
            texts: The texts to embed.

        Returns:
            One L2-normalized vector per input text, in input order. An
            empty input yields an empty list.
        """
        if not texts:
            # Nothing to tokenize; avoid handing an empty batch to the model.
            return []

        # Truncate to the configured limit, but never ask for more tokens
        # than the tokenizer itself advertises. This replaces the former
        # `assert` on the sequence length, which was skipped under `-O` and
        # failed whenever `max_seq_len` was below the tokenizer's own limit.
        max_length = self.max_seq_len
        model_limit = getattr(self.tokenizer, "model_max_length", None)
        if model_limit is not None:
            max_length = min(max_length, int(model_limit))

        # NOTE: for short-query-to-long-passage retrieval an instruction may
        # be prepended to queries (never to passages) before tokenizing.
        encoded_input = self.tokenizer(
            texts,
            padding=True,
            truncation=True,
            max_length=max_length,
            return_tensors="pt",
        )

        if self._cuda:
            encoded_input = {k: v.cuda() for k, v in encoded_input.items()}

        # Compute token embeddings without tracking gradients.
        with torch.no_grad():
            model_output = self.model(**encoded_input)
            # CLS pooling: keep the first token's hidden state per sequence.
            sentence_embeddings = model_output[0][:, 0]

        # Normalize each embedding to unit L2 norm.
        sentence_embeddings = torch.nn.functional.normalize(
            sentence_embeddings, p=2, dim=1
        )
        if self._cuda:
            sentence_embeddings = sentence_embeddings.cpu()
        return sentence_embeddings.tolist()

    def embed_documents(self, texts: list[str]) -> list[list[float]]:
        """Embed documents."""
        return self._embed(texts)

    def embed_query(self, text: str) -> list[float]:
        """Embed query text."""
        return self._embed([text])[0]
|
82
|
+
|
83
|
+
|
84
|
+
class SimpleEmbedding(Embeddings):
    """Simple in-memory embedding implementation.

    Builds text vectors from a bag-of-words model weighted by TF-IDF. The
    vocabulary and document frequencies are learned incrementally from the
    texts that are embedded, and everything is kept in memory, so this is
    only suitable for small-scale use.
    """

    def __init__(self, vector_dim: int = 128, cache_size: int = 1000):
        """Initialize the embedding.

        Args:
            vector_dim: Dimension of the produced vectors.
            cache_size: Maximum number of cached vectors; once reached, the
                oldest entry is evicted. A value of 0 or less disables
                caching.
        """
        self.vector_dim = vector_dim
        self.cache_size = cache_size
        self._cache: dict[str, list[float]] = {}
        self._vocab: dict[str, int] = {}  # token -> vocabulary index
        self._idf: dict[str, float] = {}  # token -> number of documents containing it
        self._doc_count = 0  # number of documents seen so far

    def _text_to_hash(self, text: str) -> str:
        """Return the MD5 hex digest of the text (used as cache key)."""
        return hashlib.md5(text.encode()).hexdigest()

    def _tokenize(self, text: str) -> list[str]:
        """Split the lower-cased text on whitespace."""
        # Plain whitespace tokenization; a real tokenizer could be plugged in.
        return text.lower().split()

    def _update_vocab(self, tokens: list[str]):
        """Assign the next free index to every token not yet in the vocabulary."""
        for token in set(tokens):  # de-duplicate first
            if token not in self._vocab:
                self._vocab[token] = len(self._vocab)

    def _update_idf(self, tokens: list[str]):
        """Count one more document and bump the document frequency of its tokens."""
        self._doc_count += 1
        for token in set(tokens):
            self._idf[token] = self._idf.get(token, 0) + 1

    def _calculate_tf(self, tokens: list[str]) -> dict[str, float]:
        """Return the term frequency of each token, normalized by token count."""
        tf: dict[str, float] = {}
        total_tokens = len(tokens)
        for token in tokens:
            tf[token] = tf.get(token, 0) + 1
        for token in tf:
            tf[token] /= total_tokens
        return tf

    def _calculate_tfidf(self, tokens: list[str]) -> list[float]:
        """Return the L2-normalized TF-IDF vector of the tokens."""
        vector = np.zeros(self.vector_dim)
        tf = self._calculate_tf(tokens)

        for token, tf_value in tf.items():
            if token in self._idf:
                # Smoothed IDF. The plain log(N / df) is 0 for any token that
                # occurs in every document seen so far, which made the very
                # first embedded text (and any text of ubiquitous tokens) an
                # all-zero vector.
                idf = np.log((1 + self._doc_count) / (1 + self._idf[token])) + 1
                # Fold the vocabulary index into the fixed vector dimension.
                idx = self._vocab[token] % self.vector_dim
                vector[idx] += tf_value * idf

        # L2 normalization.
        norm = np.linalg.norm(vector)
        if norm > 0:
            vector /= norm

        # tolist() yields native Python floats rather than numpy scalars.
        return vector.tolist()

    def _embed(self, text: str) -> list[float]:
        """Produce the vector representation of a text.

        Embedding a text that is not cached also updates the vocabulary and
        the document-frequency statistics.

        Args:
            text: Input text.

        Returns:
            The vector for the text; a zero vector when the text contains no
            tokens. The returned list is a copy the caller may mutate.
        """
        # Check the cache first.
        text_hash = self._text_to_hash(text)
        cached = self._cache.get(text_hash)
        if cached is not None:
            # Hand out a copy so callers cannot corrupt the cached vector.
            return list(cached)

        tokens = self._tokenize(text)
        if not tokens:
            return np.zeros(self.vector_dim).tolist()

        # Update vocabulary and document frequencies, then vectorize.
        self._update_vocab(tokens)
        self._update_idf(tokens)
        vector = self._calculate_tfidf(tokens)

        if self.cache_size > 0:
            if len(self._cache) >= self.cache_size:
                # Evict the oldest entry (dicts preserve insertion order).
                del self._cache[next(iter(self._cache))]
            self._cache[text_hash] = vector

        return list(vector)

    def embed_documents(self, texts: list[str]) -> list[list[float]]:
        """Embed documents."""
        return [self._embed(text) for text in texts]

    def embed_query(self, text: str) -> list[float]:
        """Embed query text."""
        return self._embed(text)
|
198
|
+
|
199
|
+
# def save(self, file_path: str):
|
200
|
+
# """保存模型"""
|
201
|
+
# state = {
|
202
|
+
# "vector_dim": self.vector_dim,
|
203
|
+
# "cache_size": self.cache_size,
|
204
|
+
# "vocab": self._vocab,
|
205
|
+
# "idf": self._idf,
|
206
|
+
# "doc_count": self._doc_count,
|
207
|
+
# }
|
208
|
+
# with open(file_path, "w") as f:
|
209
|
+
# json.dump(state, f)
|
210
|
+
|
211
|
+
# def load(self, file_path: str):
|
212
|
+
# """加载模型"""
|
213
|
+
# with open(file_path, "r") as f:
|
214
|
+
# state = json.load(f)
|
215
|
+
# self.vector_dim = state["vector_dim"]
|
216
|
+
# self.cache_size = state["cache_size"]
|
217
|
+
# self._vocab = state["vocab"]
|
218
|
+
# self._idf = state["idf"]
|
219
|
+
# self._doc_count = state["doc_count"]
|
220
|
+
# self._cache = {} # 清空缓存
|
221
|
+
|
222
|
+
|
223
|
+
if __name__ == "__main__":
    # Manual smoke test: embed the same text several times and print each
    # result. To exercise the transformer backend instead, use:
    # se = SentenceEmbedding(
    #     pretrained_model_name_or_path="ignore/BAAI--bge-m3", cache_dir="ignore"
    # )
    se = SimpleEmbedding()
    for _ in range(4):
        print(se.embed_query("hello world"))
|
pycityagent/memory/__init__.py
CHANGED
@@ -1,5 +1,6 @@
|
|
1
1
|
"""Memory."""
|
2
2
|
|
3
|
+
from .faiss_query import FaissQuery
|
3
4
|
from .memory import Memory
|
4
5
|
from .memory_base import MemoryBase, MemoryUnit
|
5
6
|
from .profile import ProfileMemory, ProfileMemoryUnit
|
@@ -8,4 +9,5 @@ from .state import StateMemory
|
|
8
9
|
|
9
10
|
__all__ = [
|
10
11
|
"Memory",
|
12
|
+
"FaissQuery",
|
11
13
|
]
|
@@ -0,0 +1,302 @@
|
|
1
|
+
import asyncio
|
2
|
+
from collections.abc import Sequence
|
3
|
+
from typing import Any, Literal, Optional, Union
|
4
|
+
|
5
|
+
import faiss
|
6
|
+
import numpy as np
|
7
|
+
from langchain_community.docstore.in_memory import InMemoryDocstore
|
8
|
+
from langchain_community.vectorstores import FAISS
|
9
|
+
from langchain_core.documents import Document
|
10
|
+
from langchain_core.embeddings import Embeddings
|
11
|
+
|
12
|
+
from ..utils.decorators import lock_decorator
|
13
|
+
|
14
|
+
|
15
|
+
class FaissQuery:
    """Async, per-agent wrapper around a LangChain FAISS vector store.

    Every stored document carries the owning agent's id under the ``"_id"``
    metadata key, and every search filters on that key, so agents sharing
    one index only see their own documents. The coroutine methods are
    wrapped with ``lock_decorator`` (defined in ``..utils.decorators``;
    presumably it serializes calls on ``self._lock`` -- confirm there).
    """

    def __init__(
        self,
        embeddings: Optional[Embeddings] = None,
        index_type: Any = faiss.IndexFlatL2,
        dimension: Optional[int] = None,
    ) -> None:
        """Create the index and the vector store.

        Args:
            embeddings: Embedding model. When None, no index or vector store
                is created and the corresponding properties raise.
            index_type: Callable that builds a FAISS index from a dimension.
            dimension: Vector dimension. When None it is inferred by
                embedding a probe text with ``embeddings``.
        """
        self._embeddings = embeddings
        self._lock = asyncio.Lock()
        if embeddings is None:
            self._index = None
            self._vectors_store = None
        else:
            if dimension is None:
                # Probe the model once to learn its output dimension.
                dimension = len(embeddings.embed_query("hello world"))
            self._index = index_type(dimension)
            self._vectors_store = FAISS(
                embedding_function=embeddings,
                index=self._index,
                docstore=InMemoryDocstore(),
                index_to_docstore_id={},
            )

    @staticmethod
    def _agent_filter(agent_id: int, filter: Optional[dict]) -> dict:
        """Merge a caller-supplied metadata filter with the agent-id restriction.

        The agent id is written last so a caller-supplied ``"_id"`` entry
        cannot widen the search to another agent's documents.
        """
        scoped = dict(filter) if filter is not None else {}
        scoped["_id"] = agent_id
        return scoped

    @property
    def embeddings(
        self,
    ) -> Embeddings:
        """The embedding model.

        Raises:
            RuntimeError: If no embedding model was provided.
        """
        if self._embeddings is None:
            # NOTE(review): the message refers to `set_embeddings`, which this
            # class does not define; embeddings can only be given at construction.
            raise RuntimeError("No embedding set, please `set_embeddings` first!")
        return self._embeddings

    @property
    def vectors_store(
        self,
    ) -> FAISS:
        """The underlying FAISS vector store.

        Raises:
            RuntimeError: If no embedding model was provided, so no store exists.
        """
        if self._vectors_store is None:
            raise RuntimeError("No embedding set, thus no vector stores initialized!")
        return self._vectors_store

    @lock_decorator
    async def add_documents(
        self,
        agent_id: int,
        documents: Union[str, Sequence[str]],
        extra_tags: Optional[dict] = None,
    ) -> list[str]:
        """Add one or more documents owned by an agent.

        Args:
            agent_id (int): The identifier of the owning agent, stored under
                the ``"_id"`` metadata key.
            documents (Union[str, Sequence[str]]): A single text or a
                sequence of texts.
            extra_tags (dict, optional): Additional metadata stored with
                every document. An ``"_id"`` entry here is overridden by
                ``agent_id``.

        Returns:
            list[str]: The ids of the added documents; empty when no
            document was given.
        """
        if isinstance(documents, str):
            documents = [documents]
        if not documents:
            # Do not hand an empty batch to the vector store.
            return []
        _metadata = dict(extra_tags) if extra_tags is not None else {}
        _metadata["_id"] = agent_id
        to_add_documents = [
            # Each document gets its own metadata dict; sharing one object
            # would let a mutation on one document leak into all the others.
            Document(page_content=doc, metadata=dict(_metadata))
            for doc in documents
        ]
        return await self.vectors_store.aadd_documents(
            documents=to_add_documents,
        )

    @lock_decorator
    async def delete_documents(
        self,
        to_delete_ids: list[str],
    ):
        """Delete the documents with the given ids from the vector store.

        Args:
            to_delete_ids (list[str]): Ids as returned by `add_documents`.
        """
        await self.vectors_store.adelete(
            ids=to_delete_ids,
        )

    @lock_decorator
    async def similarity_search(
        self,
        query: str,
        agent_id: int,
        k: int = 4,
        fetch_k: int = 20,
        return_score_type: Union[
            Literal["none"], Literal["similarity_score"], Literal["L2-distance"]
        ] = "none",
        filter: Optional[dict] = None,
    ) -> Union[list[tuple[str, dict]], list[tuple[str, float, dict]]]:
        """
        Return content most similar to the given query.

        Args:
            query (str): The text to look up documents similar to.
            agent_id (int): The identifier of the agent to filter specific documents. Only documents associated with this agent will be considered.
            k (int, optional): The number of top similar contents to return. Defaults to 4.
            fetch_k (int, optional): The number of documents to fetch before applying any filters. Defaults to 20.
            return_score_type (Union[Literal["none"], Literal["similarity_score"], Literal["L2-distance"]], optional):
                Specifies whether and how to return similarity scores with the results:
                - "none": Do not return scores (default).
                - "similarity_score": Also return the relevance score.
                - "L2-distance": Also return the L2 distance from the query.
            filter (dict, optional): Additional metadata filter; it cannot override the agent restriction.

        Returns:
            Union[list[tuple[str, dict]], list[tuple[str, float, dict]]]:
                ``(content, metadata)`` tuples for "none", otherwise
                ``(content, score, metadata)`` tuples.

        Raises:
            ValueError: If `return_score_type` is not one of the supported values.
        """
        _filter = self._agent_filter(agent_id, filter)
        if return_score_type == "L2-distance":
            _result = await self.vectors_store.asimilarity_search_with_score(
                query=query,
                k=k,
                filter=_filter,
                fetch_k=fetch_k,
            )
            return [(r.page_content, s, r.metadata) for r, s in _result]
        elif return_score_type == "none":
            _result = await self.vectors_store.asimilarity_search(
                query=query,
                k=k,
                filter=_filter,
                fetch_k=fetch_k,
            )
            return [(r.page_content, r.metadata) for r in _result]
        elif return_score_type == "similarity_score":
            _result = await self.vectors_store.asimilarity_search_with_relevance_scores(
                query=query,
                k=k,
                filter=_filter,
                fetch_k=fetch_k,
            )
            return [(r.page_content, s, r.metadata) for r, s in _result]
        else:
            raise ValueError(f"Invalid `return_score_type` {return_score_type}!")

    @lock_decorator
    async def similarity_search_by_embedding(
        self,
        embedding: list[float],
        agent_id: int,
        k: int = 4,
        fetch_k: int = 20,
        return_score_type: Union[Literal["none"], Literal["L2-distance"]] = "none",
        filter: Optional[dict] = None,
    ) -> Union[list[tuple[str, dict]], list[tuple[str, float, dict]]]:
        """
        Return content most similar to the given embedding vector.

        Args:
            embedding (list[float]): The vector to look up documents similar to.
            agent_id (int): The identifier of the agent to filter specific documents. Only documents associated with this agent will be considered.
            k (int, optional): The number of top similar contents to return. Defaults to 4.
            fetch_k (int, optional): The number of documents to fetch before applying any filters. Defaults to 20.
            return_score_type (Union[Literal["none"], Literal["L2-distance"]], optional):
                Specifies whether and how to return similarity scores with the results:
                - "none": Do not return scores (default).
                - "L2-distance": Also return the L2 distance from the query vector.
            filter (dict, optional): Additional metadata filter; it cannot override the agent restriction.

        Returns:
            Union[list[tuple[str, dict]], list[tuple[str, float, dict]]]:
                ``(content, metadata)`` tuples for "none", otherwise
                ``(content, distance, metadata)`` tuples.

        Raises:
            ValueError: If `return_score_type` is not one of the supported values.
        """
        _filter = self._agent_filter(agent_id, filter)
        if return_score_type == "L2-distance":
            _result = await self.vectors_store.asimilarity_search_with_score_by_vector(
                embedding=embedding,
                k=k,
                filter=_filter,
                fetch_k=fetch_k,
            )
            return [(r.page_content, s, r.metadata) for r, s in _result]
        elif return_score_type == "none":
            _result = await self.vectors_store.asimilarity_search_by_vector(
                embedding=embedding,
                k=k,
                filter=_filter,
                fetch_k=fetch_k,
            )
            return [(r.page_content, r.metadata) for r in _result]
        else:
            raise ValueError(f"Invalid `return_score_type` {return_score_type}!")

    @lock_decorator
    async def marginal_relevance_search(
        self,
        query: str,
        agent_id: int,
        k: int = 4,
        fetch_k: int = 20,
        lambda_mult: float = 0.5,
        return_score_type: Literal["none"] = "none",
        filter: Optional[dict] = None,
    ) -> list[tuple[str, dict]]:
        """
        Return contents selected using the maximal marginal relevance asynchronously.

        Args:
            query (str): The text to look up documents similar to.
            agent_id (int): The identifier of the agent to filter specific documents. Only documents associated with this agent will be considered.
            k (int, optional): The number of top similar contents to return. Defaults to 4.
            fetch_k (int, optional): The number of documents to fetch before applying any filters. Defaults to 20.
            lambda_mult (float): Number between 0 and 1 that determines the degree of diversity among the results with 0 corresponding to maximum diversity and 1 to minimum diversity. Defaults to 0.5.
            return_score_type (Literal["none"], optional):
                Specifies whether and how to return similarity scores with the results:
                - "none": Do not return scores (default, and the only supported value).
            filter (dict, optional): Additional metadata filter; it cannot override the agent restriction.

        Returns:
            list[tuple[str, dict]]: ``(content, metadata)`` tuples.

        Raises:
            ValueError: If `return_score_type` is not "none".
        """
        _filter = self._agent_filter(agent_id, filter)

        if return_score_type == "none":
            _result = await self.vectors_store.amax_marginal_relevance_search(
                query=query,
                k=k,
                filter=_filter,
                fetch_k=fetch_k,
                lambda_mult=lambda_mult,
            )
            return [(r.page_content, r.metadata) for r in _result]
        else:
            raise ValueError(f"Invalid `return_score_type` {return_score_type}!")

    @lock_decorator
    async def marginal_relevance_search_by_embedding(
        self,
        embedding: list[float],
        agent_id: int,
        k: int = 4,
        fetch_k: int = 20,
        lambda_mult: float = 0.5,
        return_score_type: Union[Literal["none"], Literal["similarity_score"]] = "none",
        filter: Optional[dict] = None,
    ) -> Union[list[tuple[str, dict]], list[tuple[str, float, dict]]]:
        """
        Return contents selected using the maximal marginal relevance asynchronously.

        Args:
            embedding (list[float]): The vector to look up documents similar to.
            agent_id (int): The identifier of the agent to filter specific documents. Only documents associated with this agent will be considered.
            k (int, optional): The number of top similar contents to return. Defaults to 4.
            fetch_k (int, optional): The number of documents to fetch before applying any filters. Defaults to 20.
            lambda_mult (float): Number between 0 and 1 that determines the degree of diversity among the results with 0 corresponding to maximum diversity and 1 to minimum diversity. Defaults to 0.5.
            return_score_type (Union[Literal["none"], Literal["similarity_score"]], optional):
                Specifies whether and how to return similarity scores with the results:
                - "none": Do not return scores (default).
                - "similarity_score": Also return the score reported by the vector store.
            filter (dict, optional): Additional metadata filter; it cannot override the agent restriction.

        Returns:
            Union[list[tuple[str, dict]], list[tuple[str, float, dict]]]:
                ``(content, metadata)`` tuples for "none", otherwise
                ``(content, score, metadata)`` tuples.

        Raises:
            ValueError: If `return_score_type` is not one of the supported values.
        """
        _filter = self._agent_filter(agent_id, filter)
        if return_score_type == "none":
            _result = await self.vectors_store.amax_marginal_relevance_search_by_vector(
                embedding=embedding,
                k=k,
                filter=_filter,
                fetch_k=fetch_k,
                lambda_mult=lambda_mult,
            )
            return [(r.page_content, r.metadata) for r in _result]
        elif return_score_type == "similarity_score":
            _result = await self.vectors_store.amax_marginal_relevance_search_with_score_by_vector(
                embedding=embedding,
                k=k,
                filter=_filter,
                fetch_k=fetch_k,
                lambda_mult=lambda_mult,
            )
            return [(r.page_content, s, r.metadata) for r, s in _result]
        else:
            raise ValueError(f"Invalid `return_score_type` {return_score_type}!")
|
pycityagent/memory/memory.py
CHANGED
@@ -1,21 +1,25 @@
|
|
1
1
|
import asyncio
|
2
2
|
import logging
|
3
|
+
from collections import defaultdict
|
4
|
+
from collections.abc import Callable, Sequence
|
3
5
|
from copy import deepcopy
|
4
6
|
from datetime import datetime
|
5
|
-
from typing import Any, Literal, Optional,
|
6
|
-
from collections.abc import Sequence,Callable
|
7
|
+
from typing import Any, Literal, Optional, Union
|
7
8
|
|
8
9
|
import numpy as np
|
10
|
+
from langchain_core.embeddings import Embeddings
|
9
11
|
from pyparsing import deque
|
10
12
|
|
11
13
|
from ..utils.decorators import lock_decorator
|
12
14
|
from .const import *
|
15
|
+
from .faiss_query import FaissQuery
|
13
16
|
from .profile import ProfileMemory
|
14
17
|
from .self_define import DynamicMemory
|
15
18
|
from .state import StateMemory
|
16
19
|
|
17
20
|
logger = logging.getLogger("pycityagent")
|
18
21
|
|
22
|
+
|
19
23
|
class Memory:
|
20
24
|
"""
|
21
25
|
A class to manage different types of memory (state, profile, dynamic).
|
@@ -33,7 +37,8 @@ class Memory:
|
|
33
37
|
base: Optional[dict[Any, Any]] = None,
|
34
38
|
motion: Optional[dict[Any, Any]] = None,
|
35
39
|
activate_timestamp: bool = False,
|
36
|
-
embedding_model:
|
40
|
+
embedding_model: Optional[Embeddings] = None,
|
41
|
+
faiss_query: Optional[FaissQuery] = None,
|
37
42
|
) -> None:
|
38
43
|
"""
|
39
44
|
Initializes the Memory with optional configuration.
|
@@ -51,20 +56,21 @@ class Memory:
|
|
51
56
|
base (Optional[dict[Any, Any]], optional): base attribute dict from City Simulator.
|
52
57
|
motion (Optional[dict[Any, Any]], optional): motion attribute dict from City Simulator.
|
53
58
|
activate_timestamp (bool): Whether activate timestamp storage in MemoryUnit
|
54
|
-
embedding_model (
|
59
|
+
embedding_model (Embeddings): The embedding model for memory search.
|
60
|
+
faiss_query (FaissQuery): The faiss_query of the agent. Defaults to None.
|
55
61
|
"""
|
56
62
|
self.watchers: dict[str, list[Callable]] = {}
|
57
63
|
self._lock = asyncio.Lock()
|
58
|
-
self.
|
59
|
-
|
60
|
-
# 初始化embedding存储
|
61
|
-
self._embeddings = {"state": {}, "profile": {}, "dynamic": {}}
|
64
|
+
self._agent_id: int = -1
|
65
|
+
self._embedding_model = embedding_model
|
62
66
|
|
63
67
|
_dynamic_config: dict[Any, Any] = {}
|
64
68
|
_state_config: dict[Any, Any] = {}
|
65
69
|
_profile_config: dict[Any, Any] = {}
|
66
70
|
# 记录哪些字段需要embedding
|
67
71
|
self._embedding_fields: dict[str, bool] = {}
|
72
|
+
self._embedding_field_to_doc_id: dict[Any, str] = defaultdict(str)
|
73
|
+
self._faiss_query = faiss_query
|
68
74
|
|
69
75
|
if config is not None:
|
70
76
|
for k, v in config.items():
|
@@ -135,8 +141,55 @@ class Memory:
|
|
135
141
|
self._profile = ProfileMemory(
|
136
142
|
msg=_profile_config, activate_timestamp=activate_timestamp
|
137
143
|
)
|
138
|
-
self.memories = [] # 存储记忆内容
|
139
|
-
self.embeddings = [] # 存储记忆的向量表示
|
144
|
+
# self.memories = [] # 存储记忆内容
|
145
|
+
# self.embeddings = [] # 存储记忆的向量表示
|
146
|
+
|
147
|
+
def set_embedding_model(self, embedding_model: Embeddings):
    """
    Set the embedding model used by this memory.
    """
    self._embedding_model = embedding_model
|
152
|
+
|
153
|
+
@property
def embedding_model(self):
    """
    The embedding model; raises RuntimeError while none has been set.
    """
    model = self._embedding_model
    if model is not None:
        return model
    raise RuntimeError(
        "embedding_model before assignment, please `set_embedding_model` first!"
    )
|
162
|
+
|
163
|
+
def set_faiss_query(self, faiss_query: FaissQuery):
    """
    Attach the FaissQuery instance this memory stores its vectors in.
    """
    self._faiss_query = faiss_query
|
168
|
+
|
169
|
+
@property
def agent_id(self):
    """
    The agent id; raises RuntimeError while it is still negative (unset).
    """
    current = self._agent_id
    if current >= 0:
        return current
    raise RuntimeError("agent_id before assignment, please `set_agent_id` first!")
|
178
|
+
|
179
|
+
def set_agent_id(self, agent_id: int):
    """
    Set the id of the agent that owns this memory.

    The `agent_id` property raises until a non-negative id has been set.
    """
    self._agent_id = agent_id
|
184
|
+
|
185
|
+
@property
def faiss_query(self) -> FaissQuery:
    """The attached FaissQuery; raises RuntimeError while none has been set."""
    query = self._faiss_query
    if query is not None:
        return query
    raise RuntimeError(
        "FaissQuery access before assignment, please `set_faiss_query` first!"
    )
|
140
193
|
|
141
194
|
@lock_decorator
|
142
195
|
async def get(
|
@@ -192,11 +245,23 @@ class Memory:
|
|
192
245
|
if mode == "replace":
|
193
246
|
await _mem.update(key, value, store_snapshot)
|
194
247
|
# 如果字段需要embedding,则更新embedding
|
195
|
-
if self.
|
248
|
+
if self._embedding_fields.get(key, False) and self.embedding_model:
|
196
249
|
memory_type = self._get_memory_type(_mem)
|
197
|
-
|
198
|
-
|
250
|
+
# 覆盖更新删除原vector
|
251
|
+
orig_doc_id = self._embedding_field_to_doc_id[key]
|
252
|
+
if orig_doc_id:
|
253
|
+
await self.faiss_query.delete_documents(
|
254
|
+
to_delete_ids=[orig_doc_id],
|
255
|
+
)
|
256
|
+
doc_ids: list[str] = await self.faiss_query.add_documents(
|
257
|
+
agent_id=self.agent_id,
|
258
|
+
documents=f"{key}: {str(value)}",
|
259
|
+
extra_tags={
|
260
|
+
"type": memory_type,
|
261
|
+
"key": key,
|
262
|
+
},
|
199
263
|
)
|
264
|
+
self._embedding_field_to_doc_id[key] = doc_ids[0]
|
200
265
|
if key in self.watchers:
|
201
266
|
for callback in self.watchers[key]:
|
202
267
|
asyncio.create_task(callback())
|
@@ -214,13 +279,17 @@ class Memory:
|
|
214
279
|
f"Type of {type(original_value)} does not support mode `merge`, using `replace` instead!"
|
215
280
|
)
|
216
281
|
await _mem.update(key, value, store_snapshot)
|
217
|
-
if self.
|
282
|
+
if self._embedding_fields.get(key, False) and self.embedding_model:
|
218
283
|
memory_type = self._get_memory_type(_mem)
|
219
|
-
|
220
|
-
|
221
|
-
|
222
|
-
|
284
|
+
doc_ids = await self.faiss_query.add_documents(
|
285
|
+
agent_id=self.agent_id,
|
286
|
+
documents=f"{key}: {str(original_value)}",
|
287
|
+
extra_tags={
|
288
|
+
"type": memory_type,
|
289
|
+
"key": key,
|
290
|
+
},
|
223
291
|
)
|
292
|
+
self._embedding_field_to_doc_id[key] = doc_ids[0]
|
224
293
|
if key in self.watchers:
|
225
294
|
for callback in self.watchers[key]:
|
226
295
|
asyncio.create_task(callback())
|
@@ -240,68 +309,6 @@ class Memory:
|
|
240
309
|
else:
|
241
310
|
return "dynamic"
|
242
311
|
|
243
|
-
async def _generate_embedding(self, text: str) -> np.ndarray:
|
244
|
-
"""生成文本的向量表示
|
245
|
-
|
246
|
-
Args:
|
247
|
-
text: 输入文本
|
248
|
-
|
249
|
-
Returns:
|
250
|
-
np.ndarray: 文本的向量表示
|
251
|
-
|
252
|
-
Raises:
|
253
|
-
ValueError: 如果embedding_model未初始化
|
254
|
-
"""
|
255
|
-
if not self.embedding_model:
|
256
|
-
raise RuntimeError("Embedding model not initialized")
|
257
|
-
|
258
|
-
return await self.embedding_model.embed(text)
|
259
|
-
|
260
|
-
async def search(self, query: str, top_k: int = 3) -> str:
|
261
|
-
"""搜索相关记忆
|
262
|
-
|
263
|
-
Args:
|
264
|
-
query: 查询文本
|
265
|
-
top_k: 返回最相关的记忆数量
|
266
|
-
|
267
|
-
Returns:
|
268
|
-
str: 格式化的相关记忆文本
|
269
|
-
"""
|
270
|
-
if not self.embedding_model:
|
271
|
-
return "Embedding model not initialized"
|
272
|
-
|
273
|
-
query_embedding = await self._generate_embedding(query)
|
274
|
-
all_results = []
|
275
|
-
|
276
|
-
# 搜索所有记忆类型中启用了embedding的字段
|
277
|
-
for memory_type, embeddings in self._embeddings.items():
|
278
|
-
for key, embedding in embeddings.items():
|
279
|
-
similarity = self._cosine_similarity(query_embedding, embedding)
|
280
|
-
value = await self.get(key)
|
281
|
-
|
282
|
-
all_results.append(
|
283
|
-
{
|
284
|
-
"type": memory_type,
|
285
|
-
"key": key,
|
286
|
-
"content": f"{key}: {str(value)}",
|
287
|
-
"similarity": similarity,
|
288
|
-
}
|
289
|
-
)
|
290
|
-
|
291
|
-
# 按相似度排序
|
292
|
-
all_results.sort(key=lambda x: x["similarity"], reverse=True)
|
293
|
-
top_results = all_results[:top_k]
|
294
|
-
|
295
|
-
# 格式化输出
|
296
|
-
formatted_results = []
|
297
|
-
for result in top_results:
|
298
|
-
formatted_results.append(
|
299
|
-
f"- [{result['type']}] {result['content']} "
|
300
|
-
f"(相关度: {result['similarity']:.2f})"
|
301
|
-
)
|
302
|
-
|
303
|
-
return "\n".join(formatted_results)
|
304
|
-
|
305
312
|
async def update_batch(
|
306
313
|
self,
|
307
314
|
content: Union[dict, Sequence[tuple[Any, Any]]],
|
@@ -388,67 +395,54 @@ class Memory:
|
|
388
395
|
if _snapshot:
|
389
396
|
await _mem.load(snapshots=_snapshot, reset_memory=reset_memory)
|
390
397
|
|
398
|
+
# async def add(self, content: str, metadata: Optional[dict] = None) -> None:
|
399
|
+
# """添加新的记忆
|
400
|
+
|
401
|
+
# Args:
|
402
|
+
# content: 记忆内容
|
403
|
+
# metadata: 相关元数据,如时间、地点等
|
404
|
+
# """
|
405
|
+
# embedding = await self.embedding_model.aembed_query(content)
|
406
|
+
# self.memories.append(
|
407
|
+
# {
|
408
|
+
# "content": content,
|
409
|
+
# "metadata": metadata or {},
|
410
|
+
# "timestamp": datetime.now(),
|
411
|
+
# "embedding": embedding,
|
412
|
+
# }
|
413
|
+
# )
|
414
|
+
# self.embeddings.append(embedding)
|
415
|
+
|
391
416
|
@lock_decorator
|
392
|
-
async def
|
393
|
-
self,
|
394
|
-
|
395
|
-
|
396
|
-
top_k: Optional[int] = None,
|
397
|
-
mode: Union[Literal["read only"], Literal["read and write"]] = "read only",
|
398
|
-
preserve_order: bool = True,
|
399
|
-
) -> Any:
|
400
|
-
"""
|
401
|
-
Retrieves the top-k items from the memory based on the given key and metric.
|
417
|
+
async def search(
|
418
|
+
self, query: str, top_k: int = 3, filter: Optional[dict] = None
|
419
|
+
) -> str:
|
420
|
+
"""搜索相关记忆
|
402
421
|
|
403
422
|
Args:
|
404
|
-
|
405
|
-
|
406
|
-
|
407
|
-
mode (Union[Literal["read only"], Literal["read and write"]], optional): Access mode for the item. Defaults to "read only".
|
408
|
-
preserve_order (bool): Whether preserve original order in output values.
|
423
|
+
query: 查询文本
|
424
|
+
top_k: 返回最相关的记忆数量
|
425
|
+
filter (dict, optional): 记忆的筛选条件,如 {"type":"dynamic", "key":"self_define_1",},默认为空
|
409
426
|
|
410
427
|
Returns:
|
411
|
-
|
412
|
-
|
413
|
-
Raises:
|
414
|
-
ValueError: If an invalid mode is provided.
|
415
|
-
KeyError: If the key is not found in any of the memory sections.
|
416
|
-
"""
|
417
|
-
if mode == "read only":
|
418
|
-
process_func = deepcopy
|
419
|
-
elif mode == "read and write":
|
420
|
-
process_func = lambda x: x
|
421
|
-
else:
|
422
|
-
raise ValueError(f"Invalid get mode `{mode}`!")
|
423
|
-
for _mem in [self._state, self._profile, self._dynamic]:
|
424
|
-
try:
|
425
|
-
value = await _mem.get_top_k(key, metric, top_k, preserve_order)
|
426
|
-
return process_func(value)
|
427
|
-
except KeyError as e:
|
428
|
-
continue
|
429
|
-
raise KeyError(f"No attribute `{key}` in memories!")
|
430
|
-
|
431
|
-
async def add(self, content: str, metadata: Optional[dict] = None) -> None:
|
432
|
-
"""添加新的记忆
|
433
|
-
|
434
|
-
Args:
|
435
|
-
content: 记忆内容
|
436
|
-
metadata: 相关元数据,如时间、地点等
|
428
|
+
str: 格式化的相关记忆文本
|
437
429
|
"""
|
438
|
-
|
439
|
-
|
440
|
-
|
441
|
-
|
442
|
-
|
443
|
-
|
444
|
-
|
445
|
-
|
430
|
+
if not self._embedding_model:
|
431
|
+
return "Embedding model not initialized"
|
432
|
+
top_results: list[tuple[str, float, dict]] = (
|
433
|
+
await self.faiss_query.similarity_search( # type:ignore
|
434
|
+
query=query,
|
435
|
+
agent_id=self.agent_id,
|
436
|
+
k=top_k,
|
437
|
+
return_score_type="similarity_score",
|
438
|
+
filter=filter,
|
439
|
+
)
|
446
440
|
)
|
447
|
-
|
448
|
-
|
449
|
-
|
450
|
-
|
451
|
-
|
452
|
-
|
453
|
-
|
454
|
-
return
|
441
|
+
# 格式化输出
|
442
|
+
formatted_results = []
|
443
|
+
for content, score, metadata in top_results:
|
444
|
+
formatted_results.append(
|
445
|
+
f"- [{metadata['type']}] {content} " f"(相关度: {score:.2f})"
|
446
|
+
)
|
447
|
+
|
448
|
+
return "\n".join(formatted_results)
|
@@ -10,12 +10,14 @@ from uuid import UUID
|
|
10
10
|
|
11
11
|
import fastavro
|
12
12
|
import ray
|
13
|
+
from langchain_core.embeddings import Embeddings
|
13
14
|
|
14
15
|
from ..agent import Agent, CitizenAgent, InstitutionAgent
|
15
16
|
from ..economy.econ_client import EconomyClient
|
16
17
|
from ..environment.simulator import Simulator
|
17
18
|
from ..llm.llm import LLM
|
18
19
|
from ..llm.llmconfig import LLMConfig
|
20
|
+
from ..memory import FaissQuery
|
19
21
|
from ..message import Messager
|
20
22
|
from ..metrics import MlflowClient
|
21
23
|
from ..utils import (DIALOG_SCHEMA, INSTITUTION_STATUS_SCHEMA, PROFILE_SCHEMA,
|
@@ -37,6 +39,7 @@ class AgentGroup:
|
|
37
39
|
enable_pgsql: bool,
|
38
40
|
pgsql_writer: ray.ObjectRef,
|
39
41
|
mlflow_run_id: str,
|
42
|
+
embedding_model: Embeddings,
|
40
43
|
logging_level: int,
|
41
44
|
):
|
42
45
|
logger.setLevel(logging_level)
|
@@ -46,6 +49,7 @@ class AgentGroup:
|
|
46
49
|
self.exp_id = exp_id
|
47
50
|
self.enable_avro = enable_avro
|
48
51
|
self.enable_pgsql = enable_pgsql
|
52
|
+
self.embedding_model = embedding_model
|
49
53
|
if enable_avro:
|
50
54
|
self.avro_path = avro_path / f"{self._uuid}"
|
51
55
|
self.avro_path.mkdir(parents=True, exist_ok=True)
|
@@ -99,6 +103,13 @@ class AgentGroup:
|
|
99
103
|
else:
|
100
104
|
self.mlflow_client = None
|
101
105
|
|
106
|
+
# set FaissQuery
|
107
|
+
if self.embedding_model is not None:
|
108
|
+
self.faiss_query = FaissQuery(
|
109
|
+
embeddings=self.embedding_model,
|
110
|
+
)
|
111
|
+
else:
|
112
|
+
self.faiss_query = None
|
102
113
|
for agent in self.agents:
|
103
114
|
agent.set_exp_id(self.exp_id) # type: ignore
|
104
115
|
agent.set_llm_client(self.llm)
|
@@ -112,6 +123,12 @@ class AgentGroup:
|
|
112
123
|
agent.set_avro_file(self.avro_file) # type: ignore
|
113
124
|
if self.enable_pgsql:
|
114
125
|
agent.set_pgsql_writer(self._pgsql_writer)
|
126
|
+
# set memory.faiss_query
|
127
|
+
if self.faiss_query is not None:
|
128
|
+
agent.memory.set_faiss_query(self.faiss_query)
|
129
|
+
# set memory.embedding model
|
130
|
+
if self.embedding_model is not None:
|
131
|
+
agent.memory.set_embedding_model(self.embedding_model)
|
115
132
|
|
116
133
|
async def init_agents(self):
|
117
134
|
logger.debug(f"-----Initializing Agents in AgentGroup {self._uuid} ...")
|
@@ -376,32 +393,32 @@ class AgentGroup:
|
|
376
393
|
"created_at": _date_time,
|
377
394
|
}
|
378
395
|
_statuses_time_list.append((_status_dict, _date_time))
|
379
|
-
|
380
|
-
|
381
|
-
|
382
|
-
|
383
|
-
|
384
|
-
|
385
|
-
|
386
|
-
|
387
|
-
|
388
|
-
|
389
|
-
|
390
|
-
|
391
|
-
|
392
|
-
|
393
|
-
|
394
|
-
|
395
|
-
|
396
|
-
|
397
|
-
|
398
|
-
|
399
|
-
|
400
|
-
|
401
|
-
|
402
|
-
|
396
|
+
to_update_statues: list[tuple] = []
|
397
|
+
for _status_dict, _ in _statuses_time_list:
|
398
|
+
BASIC_KEYS = [
|
399
|
+
"id",
|
400
|
+
"day",
|
401
|
+
"t",
|
402
|
+
"lng",
|
403
|
+
"lat",
|
404
|
+
"parent_id",
|
405
|
+
"action",
|
406
|
+
"created_at",
|
407
|
+
]
|
408
|
+
_data = [_status_dict[k] for k in BASIC_KEYS if k != "created_at"]
|
409
|
+
_other_dict = json.dumps(
|
410
|
+
{k: v for k, v in _status_dict.items() if k not in BASIC_KEYS}
|
411
|
+
)
|
412
|
+
_data.append(_other_dict)
|
413
|
+
_data.append(_status_dict["created_at"])
|
414
|
+
to_update_statues.append(tuple(_data))
|
415
|
+
if self._last_asyncio_pg_task is not None:
|
416
|
+
await self._last_asyncio_pg_task
|
417
|
+
self._last_asyncio_pg_task = (
|
418
|
+
self._pgsql_writer.async_write_status.remote( # type:ignore
|
419
|
+
to_update_statues
|
420
|
+
)
|
403
421
|
)
|
404
|
-
)
|
405
422
|
|
406
423
|
async def step(self):
|
407
424
|
if not self.initialized:
|
@@ -14,11 +14,13 @@ from typing import Any, Optional, Union
|
|
14
14
|
import pycityproto.city.economy.v2.economy_pb2 as economyv2
|
15
15
|
import ray
|
16
16
|
import yaml
|
17
|
+
from langchain_core.embeddings import Embeddings
|
17
18
|
from mosstool.map._map_util.const import AOI_START_ID
|
18
19
|
|
19
20
|
from ..agent import Agent, InstitutionAgent
|
20
21
|
from ..environment.simulator import Simulator
|
21
|
-
from ..
|
22
|
+
from ..llm import SimpleEmbedding
|
23
|
+
from ..memory import FaissQuery, Memory
|
22
24
|
from ..message.messager import Messager
|
23
25
|
from ..metrics import init_mlflow_connection
|
24
26
|
from ..survey import Survey
|
@@ -76,6 +78,8 @@ class AgentSimulation:
|
|
76
78
|
|
77
79
|
# storage
|
78
80
|
_storage_config: dict[str, Any] = config.get("storage", {})
|
81
|
+
if _storage_config is None:
|
82
|
+
_storage_config = {}
|
79
83
|
# avro
|
80
84
|
_avro_config: dict[str, Any] = _storage_config.get("avro", {})
|
81
85
|
self._enable_avro = _avro_config.get("enabled", False)
|
@@ -164,6 +168,7 @@ class AgentSimulation:
|
|
164
168
|
enable_pgsql: bool,
|
165
169
|
pgsql_writer: ray.ObjectRef,
|
166
170
|
mlflow_run_id: str = None, # type: ignore
|
171
|
+
embedding_model: Embeddings = None, # type: ignore
|
167
172
|
logging_level: int = logging.WARNING,
|
168
173
|
):
|
169
174
|
"""创建远程组"""
|
@@ -177,6 +182,7 @@ class AgentSimulation:
|
|
177
182
|
enable_pgsql,
|
178
183
|
pgsql_writer,
|
179
184
|
mlflow_run_id,
|
185
|
+
embedding_model,
|
180
186
|
logging_level,
|
181
187
|
)
|
182
188
|
return group_name, group, agents
|
@@ -186,6 +192,7 @@ class AgentSimulation:
|
|
186
192
|
agent_count: Union[int, list[int]],
|
187
193
|
group_size: int = 1000,
|
188
194
|
pg_sql_writers: int = 32,
|
195
|
+
embedding_model: Embeddings = SimpleEmbedding(),
|
189
196
|
memory_config_func: Optional[Union[Callable, list[Callable]]] = None,
|
190
197
|
) -> None:
|
191
198
|
"""初始化智能体
|
@@ -305,6 +312,7 @@ class AgentSimulation:
|
|
305
312
|
self.enable_pgsql,
|
306
313
|
_workers[i % _num_workers], # type:ignore
|
307
314
|
mlflow_run_id, # type:ignore
|
315
|
+
embedding_model,
|
308
316
|
self.logging_level,
|
309
317
|
)
|
310
318
|
creation_tasks.append((group_name, group, agents))
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: pycityagent
|
3
|
-
Version: 2.0.0a22
|
3
|
+
Version: 2.0.0a24
|
4
4
|
Summary: LLM-based城市环境agent构建库
|
5
5
|
License: MIT
|
6
6
|
Author: Yuwei Yan
|
@@ -20,10 +20,12 @@ Requires-Dist: aiohttp (==3.10.10)
|
|
20
20
|
Requires-Dist: aiomqtt (>=2.3.0,<3.0.0)
|
21
21
|
Requires-Dist: citystreetview (==1.2.4)
|
22
22
|
Requires-Dist: dashscope (==1.14.1)
|
23
|
+
Requires-Dist: faiss-cpu (>=1.9.0.post1,<2.0.0)
|
23
24
|
Requires-Dist: fastavro (>=1.10.0,<2.0.0)
|
24
25
|
Requires-Dist: geojson (==3.1.0)
|
25
26
|
Requires-Dist: gradio (>=5.7.1,<6.0.0)
|
26
27
|
Requires-Dist: grpcio (==1.67.1)
|
28
|
+
Requires-Dist: langchain-community (>=0.3.13,<0.4.0)
|
27
29
|
Requires-Dist: langchain-core (>=0.3.28,<0.4.0)
|
28
30
|
Requires-Dist: matplotlib (==3.8.3)
|
29
31
|
Requires-Dist: mlflow (>=2.19.0,<3.0.0)
|
@@ -40,6 +42,8 @@ Requires-Dist: pycityproto (>=2.1.5,<3.0.0)
|
|
40
42
|
Requires-Dist: pyyaml (>=6.0.2,<7.0.0)
|
41
43
|
Requires-Dist: ray (>=2.40.0,<3.0.0)
|
42
44
|
Requires-Dist: sidecar (==0.7.0)
|
45
|
+
Requires-Dist: torch (>=2.5.1,<3.0.0)
|
46
|
+
Requires-Dist: transformers (>=4.47.1,<5.0.0)
|
43
47
|
Requires-Dist: zhipuai (>=2.1.5.20230904,<3.0.0.0)
|
44
48
|
Description-Content-Type: text/markdown
|
45
49
|
|
@@ -1,5 +1,5 @@
|
|
1
|
-
pycityagent/__init__.py,sha256=
|
2
|
-
pycityagent/agent.py,sha256=
|
1
|
+
pycityagent/__init__.py,sha256=fv0mzNGbHBF6m550yYqnuUpB8iQPWS-7EatYRK7DO4s,693
|
2
|
+
pycityagent/agent.py,sha256=l8Oa95_K5JBWKzvZmbQe_QM_E_vaG-YstuuR55kgC6Y,29005
|
3
3
|
pycityagent/economy/__init__.py,sha256=aonY4WHnx-6EGJ4WKrx4S-2jAkYNLtqUA04jp6q8B7w,75
|
4
4
|
pycityagent/economy/econ_client.py,sha256=GuHK9ZBnhqW3Z7F8ViDJn_iN73yOBbbwFyJv1wLEBDk,12211
|
5
5
|
pycityagent/environment/__init__.py,sha256=awHxlOud-btWbk0FCS4RmGJ13W84oVCkbGfcrhKqihA,240
|
@@ -30,14 +30,15 @@ pycityagent/environment/utils/grpc.py,sha256=6EJwKXXktIWb1NcUiJzIRmfrY0S03QAXXGc
|
|
30
30
|
pycityagent/environment/utils/map_utils.py,sha256=lYOEoCFFK6-e9N5txLMMq4HUlxMqc8Uw1YrGW5oJmgg,5749
|
31
31
|
pycityagent/environment/utils/port.py,sha256=3OM6kSUt3PxvDUOlgyiendBtETaWU8Mzk_8H0TzTmYg,295
|
32
32
|
pycityagent/environment/utils/protobuf.py,sha256=0BsM_G7x2B_6DMIBHe9bjVuQDOXUytNRQ03g9e05F3c,1170
|
33
|
-
pycityagent/llm/__init__.py,sha256=
|
34
|
-
pycityagent/llm/
|
33
|
+
pycityagent/llm/__init__.py,sha256=iWs6FLgrbRVIiqOf4ILS89gkVCTvS7HFC3vG-MWuyko,205
|
34
|
+
pycityagent/llm/embeddings.py,sha256=Nhf_tUIlaYJAZ93wW2QTCtS1wq7e8fUgdn2JketEAuQ,7600
|
35
35
|
pycityagent/llm/llm.py,sha256=vJaaGqVuyV-GlBxrnvGKZnMDlxeTT_sGUTdxz5tYwEE,15141
|
36
36
|
pycityagent/llm/llmconfig.py,sha256=4Ylf4OFSBEFy8jrOneeX0HvPhWEaF5jGvy1HkXK08Ro,436
|
37
37
|
pycityagent/llm/utils.py,sha256=hoNPhvomb1u6lhFX0GctFipw74hVKb7bvUBDqwBzBYw,160
|
38
|
-
pycityagent/memory/__init__.py,sha256=
|
38
|
+
pycityagent/memory/__init__.py,sha256=_Vfdo1HcLWsuuz34_i8e91nnLVYADpMlHHSVaB3xgIk,297
|
39
39
|
pycityagent/memory/const.py,sha256=6zpJPJXWoH9-yf4RARYYff586agCoud9BRn7sPERB1g,932
|
40
|
-
pycityagent/memory/
|
40
|
+
pycityagent/memory/faiss_query.py,sha256=Z0JS4udyPYCIzHMq464QtHscnswu35gh9fQptikAwkQ,12976
|
41
|
+
pycityagent/memory/memory.py,sha256=UBh4yANNHDzYZwrsvyX4ZMSHXINbu1U6g0HLNCOOCk8,17883
|
41
42
|
pycityagent/memory/memory_base.py,sha256=QG_j3BxZvkadFEeE3uBR_kjl_xcXD1aHUVs8GEF3d6w,5654
|
42
43
|
pycityagent/memory/profile.py,sha256=q8ZS9IBmHCg_X1GONUvXK85P6tCepTKQgXKuvuXYNXw,5203
|
43
44
|
pycityagent/memory/self_define.py,sha256=vpZ6CIxR2grNXEIOScdpsSc59FBg0mOKelwQuTElbtQ,5200
|
@@ -49,8 +50,8 @@ pycityagent/metrics/__init__.py,sha256=X08PaBbGVAd7_PRGLREXWxaqm7nS82WBQpD1zvQzc
|
|
49
50
|
pycityagent/metrics/mlflow_client.py,sha256=g_tHxWkWTDijtbGL74-HmiYzWVKb1y8-w12QrY9jL30,4449
|
50
51
|
pycityagent/metrics/utils/const.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
51
52
|
pycityagent/simulation/__init__.py,sha256=P5czbcg2d8S0nbbnsQXFIhwzO4CennAhZM8OmKvAeYw,194
|
52
|
-
pycityagent/simulation/agentgroup.py,sha256=
|
53
|
-
pycityagent/simulation/simulation.py,sha256=
|
53
|
+
pycityagent/simulation/agentgroup.py,sha256=r8arCAQkKMhv3yr35XsYJL-MfG6o6rWwHItBmxfDtA4,20589
|
54
|
+
pycityagent/simulation/simulation.py,sha256=9kkdgXSEOAN8wiewVFyORksti4IdVNU0opObV6ZYa9k,23344
|
54
55
|
pycityagent/simulation/storage/pg.py,sha256=Ws04mUgRcbbvWi_eQm3PXYa6w7AQUbDPWhSU7HFtsD8,6026
|
55
56
|
pycityagent/survey/__init__.py,sha256=rxwou8U9KeFSP7rMzXtmtp2fVFZxK4Trzi-psx9LPIs,153
|
56
57
|
pycityagent/survey/manager.py,sha256=S5IkwTdelsdtZETChRcfCEczzwSrry_Fly9MY4s3rbk,1681
|
@@ -69,6 +70,6 @@ pycityagent/workflow/block.py,sha256=l-z9iJo9_USZQRyj4TLMfihK0-tnNDG0a6jVk9WhG0o
|
|
69
70
|
pycityagent/workflow/prompt.py,sha256=6jI0Rq54JLv3-IXqZLYug62vse10wTI83xvf4ZX42nk,2929
|
70
71
|
pycityagent/workflow/tool.py,sha256=xADxhNgVsjNiMxlhdwn3xGUstFOkLEG8P67ez8VmwSI,8555
|
71
72
|
pycityagent/workflow/trigger.py,sha256=Df-MOBEDWBbM-v0dFLQLXteLsipymT4n8vqexmK2GiQ,5643
|
72
|
-
pycityagent-2.0.
|
73
|
-
pycityagent-2.0.
|
74
|
-
pycityagent-2.0.
|
73
|
+
pycityagent-2.0.0a24.dist-info/METADATA,sha256=cHowSJH9VJmum92fAEfRvQYtWmbCJRnVgOmI2JZDlqw,8033
|
74
|
+
pycityagent-2.0.0a24.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
|
75
|
+
pycityagent-2.0.0a24.dist-info/RECORD,,
|
pycityagent/llm/embedding.py
DELETED
@@ -1,136 +0,0 @@
|
|
1
|
-
"""简单的基于内存的embedding实现"""
|
2
|
-
|
3
|
-
import numpy as np
|
4
|
-
import hashlib
|
5
|
-
import json
|
6
|
-
|
7
|
-
|
8
|
-
class SimpleEmbedding:
|
9
|
-
"""简单的基于内存的embedding实现
|
10
|
-
|
11
|
-
使用简单的词袋模型(Bag of Words)和TF-IDF来生成文本的向量表示。
|
12
|
-
所有向量都保存在内存中,适用于小规模应用。
|
13
|
-
"""
|
14
|
-
|
15
|
-
def __init__(self, vector_dim: int = 128, cache_size: int = 1000):
|
16
|
-
"""初始化
|
17
|
-
|
18
|
-
Args:
|
19
|
-
vector_dim: 向量维度
|
20
|
-
cache_size: 缓存大小,超过此大小将清除最早的缓存
|
21
|
-
"""
|
22
|
-
self.vector_dim = vector_dim
|
23
|
-
self.cache_size = cache_size
|
24
|
-
self._cache: dict[str, np.ndarray] = {}
|
25
|
-
self._vocab: dict[str, int] = {} # 词汇表
|
26
|
-
self._idf: dict[str, float] = {} # 逆文档频率
|
27
|
-
self._doc_count = 0 # 文档总数
|
28
|
-
|
29
|
-
def _text_to_hash(self, text: str) -> str:
|
30
|
-
"""将文本转换为hash值"""
|
31
|
-
return hashlib.md5(text.encode()).hexdigest()
|
32
|
-
|
33
|
-
def _tokenize(self, text: str) -> list[str]:
|
34
|
-
"""简单的分词"""
|
35
|
-
# 这里使用简单的空格分词,实际应用中可以使用更复杂的分词方法
|
36
|
-
return text.lower().split()
|
37
|
-
|
38
|
-
def _update_vocab(self, tokens: list[str]):
|
39
|
-
"""更新词汇表"""
|
40
|
-
for token in set(tokens): # 使用set去重
|
41
|
-
if token not in self._vocab:
|
42
|
-
self._vocab[token] = len(self._vocab)
|
43
|
-
|
44
|
-
def _update_idf(self, tokens: list[str]):
|
45
|
-
"""更新IDF值"""
|
46
|
-
self._doc_count += 1
|
47
|
-
unique_tokens = set(tokens)
|
48
|
-
for token in unique_tokens:
|
49
|
-
self._idf[token] = self._idf.get(token, 0) + 1
|
50
|
-
|
51
|
-
def _calculate_tf(self, tokens: list[str]) -> dict[str, float]:
|
52
|
-
"""计算词频(TF)"""
|
53
|
-
tf = {}
|
54
|
-
total_tokens = len(tokens)
|
55
|
-
for token in tokens:
|
56
|
-
tf[token] = tf.get(token, 0) + 1
|
57
|
-
# 归一化
|
58
|
-
for token in tf:
|
59
|
-
tf[token] /= total_tokens
|
60
|
-
return tf
|
61
|
-
|
62
|
-
def _calculate_tfidf(self, tokens: list[str]) -> np.ndarray:
|
63
|
-
"""计算TF-IDF向量"""
|
64
|
-
vector = np.zeros(self.vector_dim)
|
65
|
-
tf = self._calculate_tf(tokens)
|
66
|
-
|
67
|
-
for token, tf_value in tf.items():
|
68
|
-
if token in self._idf:
|
69
|
-
idf = np.log(self._doc_count / self._idf[token])
|
70
|
-
idx = self._vocab[token] % self.vector_dim # 使用取模运算来控制向量维度
|
71
|
-
vector[idx] += tf_value * idf
|
72
|
-
|
73
|
-
# L2归一化
|
74
|
-
norm = np.linalg.norm(vector)
|
75
|
-
if norm > 0:
|
76
|
-
vector /= norm
|
77
|
-
|
78
|
-
return vector
|
79
|
-
|
80
|
-
async def embed(self, text: str) -> np.ndarray:
|
81
|
-
"""生成文本的向量表示
|
82
|
-
|
83
|
-
Args:
|
84
|
-
text: 输入文本
|
85
|
-
|
86
|
-
Returns:
|
87
|
-
np.ndarray: 文本的向量表示
|
88
|
-
"""
|
89
|
-
# 检查缓存
|
90
|
-
text_hash = self._text_to_hash(text)
|
91
|
-
if text_hash in self._cache:
|
92
|
-
return self._cache[text_hash]
|
93
|
-
|
94
|
-
# 分词
|
95
|
-
tokens = self._tokenize(text)
|
96
|
-
if not tokens:
|
97
|
-
return np.zeros(self.vector_dim)
|
98
|
-
|
99
|
-
# 更新词汇表和IDF
|
100
|
-
self._update_vocab(tokens)
|
101
|
-
self._update_idf(tokens)
|
102
|
-
|
103
|
-
# 计算向量
|
104
|
-
vector = self._calculate_tfidf(tokens)
|
105
|
-
|
106
|
-
# 更新缓存
|
107
|
-
if len(self._cache) >= self.cache_size:
|
108
|
-
# 删除最早的缓存
|
109
|
-
oldest_key = next(iter(self._cache))
|
110
|
-
del self._cache[oldest_key]
|
111
|
-
self._cache[text_hash] = vector
|
112
|
-
|
113
|
-
return vector
|
114
|
-
|
115
|
-
def save(self, file_path: str):
|
116
|
-
"""保存模型"""
|
117
|
-
state = {
|
118
|
-
"vector_dim": self.vector_dim,
|
119
|
-
"cache_size": self.cache_size,
|
120
|
-
"vocab": self._vocab,
|
121
|
-
"idf": self._idf,
|
122
|
-
"doc_count": self._doc_count,
|
123
|
-
}
|
124
|
-
with open(file_path, "w") as f:
|
125
|
-
json.dump(state, f)
|
126
|
-
|
127
|
-
def load(self, file_path: str):
|
128
|
-
"""加载模型"""
|
129
|
-
with open(file_path, "r") as f:
|
130
|
-
state = json.load(f)
|
131
|
-
self.vector_dim = state["vector_dim"]
|
132
|
-
self.cache_size = state["cache_size"]
|
133
|
-
self._vocab = state["vocab"]
|
134
|
-
self._idf = state["idf"]
|
135
|
-
self._doc_count = state["doc_count"]
|
136
|
-
self._cache = {} # 清空缓存
|
File without changes
|