pycityagent 2.0.0a22__py3-none-any.whl → 2.0.0a24__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
pycityagent/__init__.py CHANGED
@@ -5,6 +5,7 @@ Pycityagent: 城市智能体构建框架
5
5
  from .agent import Agent, CitizenAgent, InstitutionAgent
6
6
  from .environment import Simulator
7
7
  import logging
8
+ from .llm import SentenceEmbedding
8
9
 
9
10
  # 创建一个 pycityagent 记录器
10
11
  logger = logging.getLogger("pycityagent")
@@ -19,4 +20,4 @@ if not logger.hasHandlers():
19
20
  handler.setFormatter(formatter)
20
21
  logger.addHandler(handler)
21
22
 
22
- __all__ = ["Agent", "Simulator", "CitizenAgent", "InstitutionAgent"]
23
+ __all__ = ["Agent", "Simulator", "CitizenAgent", "InstitutionAgent","SentenceEmbedding",]
pycityagent/agent.py CHANGED
@@ -236,7 +236,15 @@ class Agent(ABC):
236
236
 
237
237
  # 添加记忆上下文
238
238
  if self._memory:
239
- relevant_memories = await self._memory.search(survey_prompt)
239
+ relevant_memories = await self.memory.search(survey_prompt)
240
+
241
+ formatted_results = []
242
+ # for result in top_results:
243
+ # formatted_results.append(
244
+ # f"- [{result['type']}] {result['content']} "
245
+ # f"(相关度: {result['similarity']:.2f})"
246
+ # )
247
+
240
248
  if relevant_memories:
241
249
  dialog.append(
242
250
  {
@@ -458,7 +466,9 @@ class Agent(ABC):
458
466
  topic = f"exps/{self._exp_id}/agents/{to_agent_uuid}/{sub_topic}"
459
467
  await self._messager.send_message(topic, payload)
460
468
 
461
- async def send_message_to_agent(self, to_agent_uuid: str, content: str, type: str = "social"):
469
+ async def send_message_to_agent(
470
+ self, to_agent_uuid: str, content: str, type: str = "social"
471
+ ):
462
472
  """通过 Messager 发送消息"""
463
473
  if self._messager is None:
464
474
  raise RuntimeError("Messager is not set")
@@ -598,6 +608,7 @@ class CitizenAgent(Agent):
598
608
  # 防止模拟器还没有到prepare阶段导致get_person出错
599
609
  self._has_bound_to_simulator = True
600
610
  self._agent_id = person_id
611
+ self.memory.set_agent_id(person_id)
601
612
 
602
613
  async def _bind_to_economy(self):
603
614
  if self._economy_client is None:
@@ -1,6 +1,11 @@
1
1
  """LLM相关模块"""
2
2
 
3
+ from .embeddings import SentenceEmbedding, SimpleEmbedding
3
4
  from .llm import LLM, LLMConfig
4
- from .embedding import SimpleEmbedding
5
5
 
6
- __all__ = ["LLM", "LLMConfig", "SimpleEmbedding"]
6
+ __all__ = [
7
+ "LLM",
8
+ "LLMConfig",
9
+ "SentenceEmbedding",
10
+ "SimpleEmbedding",
11
+ ]
@@ -0,0 +1,231 @@
1
+ import hashlib
2
+ import json
3
+ import os
4
+ from typing import Optional, Union
5
+
6
+ import numpy as np
7
+ import torch
8
+ from langchain_core.embeddings import Embeddings
9
+ from transformers import AutoModel, AutoTokenizer
10
+
11
+ __all__ = [
12
+ "SentenceEmbedding",
13
+ "SimpleEmbedding",
14
+ ]
15
+
16
+
17
class SentenceEmbedding(Embeddings):
    """Sentence embeddings computed with a Hugging Face ``transformers`` model.

    Texts are tokenized, run through the model without gradient tracking,
    pooled by taking the hidden state of the first token of every sequence,
    and L2-normalized.
    """

    def __init__(
        self,
        pretrained_model_name_or_path: Union[str, os.PathLike] = "BAAI/bge-m3",
        max_seq_len: int = 8192,
        auto_cuda: bool = False,
        local_files_only: bool = False,
        cache_dir: str = "./cache",
        proxies: Optional[dict] = None,
    ):
        """Load the tokenizer and the model.

        Args:
            pretrained_model_name_or_path: Model identifier or local path
                forwarded to ``from_pretrained``.
            max_seq_len: Upper bound on the tokenized sequence length; longer
                inputs are truncated to it.
            auto_cuda: Move the model to CUDA when
                ``torch.cuda.is_available()`` reports a device.
            local_files_only: Forwarded to ``from_pretrained``.
            cache_dir: Directory (created if missing) forwarded to
                ``from_pretrained``.
            proxies: Forwarded to ``from_pretrained``.
        """
        os.makedirs(cache_dir, exist_ok=True)
        self.tokenizer = AutoTokenizer.from_pretrained(
            pretrained_model_name_or_path,
            proxies=proxies,
            cache_dir=cache_dir,
            local_files_only=local_files_only,
        )
        self.model = AutoModel.from_pretrained(
            pretrained_model_name_or_path,
            proxies=proxies,
            cache_dir=cache_dir,
            local_files_only=local_files_only,
        )
        self._cuda = auto_cuda and torch.cuda.is_available()

        if self._cuda:
            self.model = self.model.cuda()

        self.model.eval()
        self.max_seq_len = max_seq_len

    def _embed(self, texts: list[str]) -> list[list[float]]:
        """Embed a batch of texts.

        Args:
            texts: The texts to embed.

        Returns:
            One L2-normalized vector per input text, in input order. An empty
            input yields an empty list.
        """
        if not texts:
            # Nothing to tokenize; avoid handing an empty batch to the tokenizer.
            return []

        # Truncate to the configured limit, but never beyond what the
        # tokenizer reports the model can take. Without an explicit
        # ``max_length`` the configured ``max_seq_len`` would not take part in
        # truncation at all.
        max_length = min(self.max_seq_len, self.tokenizer.model_max_length)
        encoded_input = self.tokenizer(
            texts,
            padding=True,
            truncation=True,
            max_length=max_length,
            return_tensors="pt",
        )
        # For s2p (short query to long passage) retrieval an instruction can be
        # prepended to the queries (not to the passages) before tokenizing.

        if self._cuda:
            encoded_input = {k: v.cuda() for k, v in encoded_input.items()}

        # Compute token embeddings.
        with torch.no_grad():
            model_output = self.model(**encoded_input)
            # CLS pooling: keep the first token's hidden state per sequence.
            sentence_embeddings = model_output[0][:, 0]

        # Normalize embeddings to unit L2 norm.
        sentence_embeddings = torch.nn.functional.normalize(
            sentence_embeddings, p=2, dim=1
        )
        if self._cuda:
            sentence_embeddings = sentence_embeddings.cpu()
        return sentence_embeddings.tolist()

    def embed_documents(self, texts: list[str]) -> list[list[float]]:
        """Embed documents."""
        return self._embed(texts)

    def embed_query(self, text: str) -> list[float]:
        """Embed query text."""
        return self._embed([text])[0]
82
+
83
+
84
class SimpleEmbedding(Embeddings):
    """Simple in-memory embedding.

    Builds vectors from a bag-of-words model weighted by TF-IDF. The
    vocabulary, the document frequencies and a result cache all live in
    memory, which makes this suitable for small-scale use only.
    """

    def __init__(self, vector_dim: int = 128, cache_size: int = 1000):
        """Initialize the embedding.

        Args:
            vector_dim: Dimension of the produced vectors.
            cache_size: Maximum number of cached vectors; once reached, the
                oldest entry is dropped. A value <= 0 disables caching.
        """
        self.vector_dim = vector_dim
        self.cache_size = cache_size
        self._cache: dict[str, list[float]] = {}
        self._vocab: dict[str, int] = {}  # token -> index, in first-seen order
        self._idf: dict[str, float] = {}  # token -> number of documents containing it
        self._doc_count = 0  # total number of documents seen

    def _text_to_hash(self, text: str) -> str:
        """Return the MD5 hex digest of ``text`` (used as the cache key)."""
        return hashlib.md5(text.encode()).hexdigest()

    def _tokenize(self, text: str) -> list[str]:
        """Split lower-cased ``text`` on whitespace."""
        # Deliberately simple; a real tokenizer could be substituted here.
        return text.lower().split()

    def _update_vocab(self, tokens: list[str]) -> None:
        """Assign the next free index to every token not yet in the vocabulary."""
        for token in set(tokens):  # de-duplicate first
            if token not in self._vocab:
                self._vocab[token] = len(self._vocab)

    def _update_idf(self, tokens: list[str]) -> None:
        """Count one more document and bump the document frequency of its tokens."""
        self._doc_count += 1
        for token in set(tokens):
            self._idf[token] = self._idf.get(token, 0) + 1

    def _calculate_tf(self, tokens: list[str]) -> dict[str, float]:
        """Return the term frequency of each token, normalized by ``len(tokens)``.

        ``tokens`` must be non-empty.
        """
        counts: dict[str, int] = {}
        for token in tokens:
            counts[token] = counts.get(token, 0) + 1
        total_tokens = len(tokens)
        return {token: count / total_tokens for token, count in counts.items()}

    def _calculate_tfidf(self, tokens: list[str]) -> list[float]:
        """Return the L2-normalized TF-IDF vector of ``tokens``."""
        vector = np.zeros(self.vector_dim)

        for token, tf_value in self._calculate_tf(tokens).items():
            doc_freq = self._idf.get(token)
            if not doc_freq:
                continue
            # Smoothed IDF. The plain log(N / df) is 0 for any token that
            # occurs in every document seen so far, which in particular made
            # the very first embedded text an all-zero vector.
            idf = np.log((1 + self._doc_count) / (1 + doc_freq)) + 1.0
            # Fold the vocabulary index into the fixed vector size.
            idx = self._vocab[token] % self.vector_dim
            vector[idx] += tf_value * idf

        # L2 normalization.
        norm = np.linalg.norm(vector)
        if norm > 0:
            vector /= norm

        return vector.tolist()

    def _embed(self, text: str) -> list[float]:
        """Return the vector for ``text``, computing and caching it if needed.

        Args:
            text: Input text.

        Returns:
            list[float]: A vector of length ``vector_dim``; all zeros when the
            text contains no tokens. The list is a fresh copy, so callers may
            modify it freely.
        """
        # Cache lookup.
        text_hash = self._text_to_hash(text)
        cached = self._cache.get(text_hash)
        if cached is not None:
            # Hand out a copy so callers cannot corrupt the cached entry.
            return list(cached)

        # Tokenize.
        tokens = self._tokenize(text)
        if not tokens:
            return np.zeros(self.vector_dim).tolist()

        # Update the vocabulary and document frequencies.
        self._update_vocab(tokens)
        self._update_idf(tokens)

        # Compute the vector.
        vector = self._calculate_tfidf(tokens)

        # Update the cache.
        if self.cache_size > 0:
            if len(self._cache) >= self.cache_size:
                # Dicts keep insertion order, so the first key is the oldest.
                oldest_key = next(iter(self._cache))
                del self._cache[oldest_key]
            self._cache[text_hash] = vector

        return list(vector)

    def embed_documents(self, texts: list[str]) -> list[list[float]]:
        """Embed documents."""
        return [self._embed(text) for text in texts]

    def embed_query(self, text: str) -> list[float]:
        """Embed query text."""
        return self._embed(text)
198
+
199
+ # def save(self, file_path: str):
200
+ # """保存模型"""
201
+ # state = {
202
+ # "vector_dim": self.vector_dim,
203
+ # "cache_size": self.cache_size,
204
+ # "vocab": self._vocab,
205
+ # "idf": self._idf,
206
+ # "doc_count": self._doc_count,
207
+ # }
208
+ # with open(file_path, "w") as f:
209
+ # json.dump(state, f)
210
+
211
+ # def load(self, file_path: str):
212
+ # """加载模型"""
213
+ # with open(file_path, "r") as f:
214
+ # state = json.load(f)
215
+ # self.vector_dim = state["vector_dim"]
216
+ # self.cache_size = state["cache_size"]
217
+ # self._vocab = state["vocab"]
218
+ # self._idf = state["idf"]
219
+ # self._doc_count = state["doc_count"]
220
+ # self._cache = {} # 清空缓存
221
+
222
+
223
if __name__ == "__main__":
    # Manual smoke test: embed the same query four times with the lightweight
    # in-memory model and print each result.
    # To exercise the transformer-backed model instead, construct
    # SentenceEmbedding(
    #     pretrained_model_name_or_path="ignore/BAAI--bge-m3", cache_dir="ignore"
    # )
    embedder = SimpleEmbedding()
    for _ in range(4):
        print(embedder.embed_query("hello world"))
@@ -1,5 +1,6 @@
1
1
  """Memory."""
2
2
 
3
+ from .faiss_query import FaissQuery
3
4
  from .memory import Memory
4
5
  from .memory_base import MemoryBase, MemoryUnit
5
6
  from .profile import ProfileMemory, ProfileMemoryUnit
@@ -8,4 +9,5 @@ from .state import StateMemory
8
9
 
9
10
  __all__ = [
10
11
  "Memory",
12
+ "FaissQuery",
11
13
  ]
@@ -0,0 +1,302 @@
1
+ import asyncio
2
+ from collections.abc import Sequence
3
+ from typing import Any, Literal, Optional, Union
4
+
5
+ import faiss
6
+ import numpy as np
7
+ from langchain_community.docstore.in_memory import InMemoryDocstore
8
+ from langchain_community.vectorstores import FAISS
9
+ from langchain_core.documents import Document
10
+ from langchain_core.embeddings import Embeddings
11
+
12
+ from ..utils.decorators import lock_decorator
13
+
14
+
15
class FaissQuery:
    """Async helper around a LangChain ``FAISS`` vector store.

    Every stored document carries an ``_id`` metadata entry holding the id of
    the agent it belongs to, and every search filters on that entry, so one
    store can hold documents of several agents.

    All public coroutines are wrapped with ``lock_decorator``.
    NOTE(review): presumably this serializes them through ``self._lock`` —
    confirm against ``utils.decorators``.
    """

    def __init__(
        self,
        embeddings: Optional[Embeddings] = None,
        index_type: Any = faiss.IndexFlatL2,
        dimension: Optional[int] = None,
    ) -> None:
        """Create the index and the vector store.

        Args:
            embeddings: Embedding model. When ``None`` no index or vector
                store is created and the ``embeddings`` / ``vectors_store``
                properties raise ``RuntimeError``.
            index_type: Callable taking the dimension and returning a faiss
                index. Defaults to ``faiss.IndexFlatL2``.
            dimension: Embedding dimension. When ``None`` it is taken from the
                length of a probe embedding.
        """
        self._embeddings = embeddings
        self._lock = asyncio.Lock()
        if embeddings is None:
            self._index = None
            self._vectors_store = None
            return
        if dimension is None:
            # Probe the model once to discover its output dimension.
            dimension = len(embeddings.embed_query("hello world"))
        self._index = index_type(dimension)
        self._vectors_store = FAISS(
            embedding_function=embeddings,
            index=self._index,
            docstore=InMemoryDocstore(),
            index_to_docstore_id={},
        )

    @staticmethod
    def _build_filter(agent_id: int, filter: Optional[dict]) -> dict:
        """Return the metadata filter for ``agent_id`` merged with ``filter``."""
        merged: dict = {"_id": agent_id}
        if filter is not None:
            merged.update(filter)
        return merged

    @property
    def embeddings(
        self,
    ) -> Embeddings:
        """The embedding model.

        Raises:
            RuntimeError: If no embedding model was given.
        """
        if self._embeddings is None:
            raise RuntimeError("No embedding set, please `set_embeddings` first!")
        return self._embeddings

    @property
    def vectors_store(
        self,
    ) -> FAISS:
        """The underlying ``FAISS`` vector store.

        Raises:
            RuntimeError: If no embedding model was given, in which case no
                store was created.
        """
        if self._vectors_store is None:
            raise RuntimeError("No embedding set, thus no vector stores initialized!")
        return self._vectors_store

    @lock_decorator
    async def add_documents(
        self,
        agent_id: int,
        documents: Union[str, Sequence[str]],
        extra_tags: Optional[dict] = None,
    ) -> list[str]:
        """Add one or more texts for an agent.

        Args:
            agent_id (int): Stored as the ``_id`` metadata entry of every document.
            documents (Union[str, Sequence[str]]): A single text or a sequence of texts.
            extra_tags (dict, optional): Extra metadata merged into every document's metadata.

        Returns:
            list[str]: The ids returned by the vector store for the added documents.
        """
        if isinstance(documents, str):
            documents = [documents]
        base_metadata: dict = {"_id": agent_id}
        if extra_tags is not None:
            base_metadata.update(extra_tags)
        to_add_documents = [
            # Give every document its own metadata dict so the documents
            # never share a single mutable metadata object.
            Document(page_content=doc, metadata=dict(base_metadata))
            for doc in documents
        ]
        return await self.vectors_store.aadd_documents(
            documents=to_add_documents,
        )

    @lock_decorator
    async def delete_documents(
        self,
        to_delete_ids: list[str],
    ):
        """Delete the documents with the given ids from the vector store.

        Args:
            to_delete_ids (list[str]): Ids as returned by ``add_documents``.
        """
        await self.vectors_store.adelete(
            ids=to_delete_ids,
        )

    @lock_decorator
    async def similarity_search(
        self,
        query: str,
        agent_id: int,
        k: int = 4,
        fetch_k: int = 20,
        return_score_type: Union[
            Literal["none"], Literal["similarity_score"], Literal["L2-distance"]
        ] = "none",
        filter: Optional[dict] = None,
    ) -> Union[list[tuple[str, dict]], list[tuple[str, float, dict]]]:
        """
        Return content most similar to the given query.

        Args:
            query (str): The text to look up documents similar to.
            agent_id (int): The identifier of the agent to filter specific documents. Only documents associated with this agent will be considered.
            k (int, optional): The number of top similar contents to return. Defaults to 4.
            fetch_k (int, optional): The number of documents to fetch before applying any filters. Defaults to 20.
            return_score_type (Union[Literal["none"], Literal["similarity_score"], Literal["L2-distance"]], optional):
                Specifies whether and how to return similarity scores with the results:
                - "none": Do not return scores; only return the contents (default).
                - "similarity_score": Return the content together with its similarity score.
                - "L2-distance": Return the content together with its L2 distance from the query.
            filter (dict, optional): The filter dict for metadata.

        Returns:
            Union[list[tuple[str,dict]], list[tuple[str, float,dict]]]:
                For "none", a list of ``(content, metadata)`` tuples; otherwise a list of
                ``(content, score, metadata)`` tuples.

        Raises:
            ValueError: If ``return_score_type`` is not one of the supported values.
        """
        _filter = self._build_filter(agent_id, filter)
        if return_score_type == "L2-distance":
            _result = await self.vectors_store.asimilarity_search_with_score(
                query=query,
                k=k,
                filter=_filter,
                fetch_k=fetch_k,
            )
            return [(r.page_content, s, r.metadata) for r, s in _result]
        elif return_score_type == "none":
            _result = await self.vectors_store.asimilarity_search(
                query=query,
                k=k,
                filter=_filter,
                fetch_k=fetch_k,
            )
            return [(r.page_content, r.metadata) for r in _result]
        elif return_score_type == "similarity_score":
            _result = await self.vectors_store.asimilarity_search_with_relevance_scores(
                query=query,
                k=k,
                filter=_filter,
                fetch_k=fetch_k,
            )
            return [(r.page_content, s, r.metadata) for r, s in _result]
        else:
            raise ValueError(f"Invalid `return_score_type` {return_score_type}!")

    @lock_decorator
    async def similarity_search_by_embedding(
        self,
        embedding: list[float],
        agent_id: int,
        k: int = 4,
        fetch_k: int = 20,
        return_score_type: Union[Literal["none"], Literal["L2-distance"]] = "none",
        filter: Optional[dict] = None,
    ) -> Union[list[tuple[str, dict]], list[tuple[str, float, dict]]]:
        """
        Return content most similar to the given embedding vector.

        Args:
            embedding (list[float]): The vector to look up documents similar to.
            agent_id (int): The identifier of the agent to filter specific documents. Only documents associated with this agent will be considered.
            k (int, optional): The number of top similar contents to return. Defaults to 4.
            fetch_k (int, optional): The number of documents to fetch before applying any filters. Defaults to 20.
            return_score_type (Union[Literal["none"], Literal["L2-distance"]], optional):
                Specifies whether and how to return similarity scores with the results:
                - "none": Do not return scores; only return the contents (default).
                - "L2-distance": Return the content together with its L2 distance from the query.
            filter (dict, optional): The filter dict for metadata.

        Returns:
            Union[list[tuple[str,dict]], list[tuple[str, float,dict]]]:
                For "none", a list of ``(content, metadata)`` tuples; for "L2-distance", a list of
                ``(content, score, metadata)`` tuples.

        Raises:
            ValueError: If ``return_score_type`` is not one of the supported values.
        """
        _filter = self._build_filter(agent_id, filter)
        if return_score_type == "L2-distance":
            _result = await self.vectors_store.asimilarity_search_with_score_by_vector(
                embedding=embedding,
                k=k,
                filter=_filter,
                fetch_k=fetch_k,
            )
            return [(r.page_content, s, r.metadata) for r, s in _result]
        elif return_score_type == "none":
            _result = await self.vectors_store.asimilarity_search_by_vector(
                embedding=embedding,
                k=k,
                filter=_filter,
                fetch_k=fetch_k,
            )
            return [(r.page_content, r.metadata) for r in _result]
        else:
            raise ValueError(f"Invalid `return_score_type` {return_score_type}!")

    @lock_decorator
    async def marginal_relevance_search(
        self,
        query: str,
        agent_id: int,
        k: int = 4,
        fetch_k: int = 20,
        lambda_mult: float = 0.5,
        return_score_type: Literal["none"] = "none",
        filter: Optional[dict] = None,
    ) -> list[tuple[str, dict]]:
        """
        Return contents selected using the maximal marginal relevance asynchronously.

        Args:
            query (str): The text to look up documents similar to.
            agent_id (int): The identifier of the agent to filter specific documents. Only documents associated with this agent will be considered.
            k (int, optional): The number of top similar contents to return. Defaults to 4.
            fetch_k (int, optional): The number of documents to fetch before applying any filters. Defaults to 20.
            lambda_mult (float): Number between 0 and 1 that determines the degree of diversity among the results with 0 corresponding to maximum diversity and 1 to minimum diversity. Defaults to 0.5.
            return_score_type (Literal["none"], optional):
                Specifies whether and how to return similarity scores with the results:
                - "none": Do not return scores; only return the contents (default).
            filter (dict, optional): The filter dict for metadata.

        Returns:
            list[tuple[str,dict]]: A list of ``(content, metadata)`` tuples.

        Raises:
            ValueError: If ``return_score_type`` is not "none".
        """
        _filter = self._build_filter(agent_id, filter)

        if return_score_type == "none":
            _result = await self.vectors_store.amax_marginal_relevance_search(
                query=query,
                k=k,
                filter=_filter,
                fetch_k=fetch_k,
                lambda_mult=lambda_mult,
            )
            return [(r.page_content, r.metadata) for r in _result]
        else:
            raise ValueError(f"Invalid `return_score_type` {return_score_type}!")

    @lock_decorator
    async def marginal_relevance_search_by_embedding(
        self,
        embedding: list[float],
        agent_id: int,
        k: int = 4,
        fetch_k: int = 20,
        lambda_mult: float = 0.5,
        return_score_type: Union[Literal["none"], Literal["similarity_score"]] = "none",
        filter: Optional[dict] = None,
    ) -> Union[list[tuple[str, dict]], list[tuple[str, float, dict]]]:
        """
        Return contents selected using the maximal marginal relevance asynchronously.

        Args:
            embedding (list[float]): The vector to look up documents similar to.
            agent_id (int): The identifier of the agent to filter specific documents. Only documents associated with this agent will be considered.
            k (int, optional): The number of top similar contents to return. Defaults to 4.
            fetch_k (int, optional): The number of documents to fetch before applying any filters. Defaults to 20.
            lambda_mult (float): Number between 0 and 1 that determines the degree of diversity among the results with 0 corresponding to maximum diversity and 1 to minimum diversity. Defaults to 0.5.
            return_score_type (Union[Literal["none"], Literal["similarity_score"]], optional):
                Specifies whether and how to return similarity scores with the results:
                - "none": Do not return scores; only return the contents (default).
                - "similarity_score": Return the content together with its similarity score.
            filter (dict, optional): The filter dict for metadata.

        Returns:
            Union[list[tuple[str,dict]], list[tuple[str, float,dict]]]:
                For "none", a list of ``(content, metadata)`` tuples; for "similarity_score", a list of
                ``(content, score, metadata)`` tuples.

        Raises:
            ValueError: If ``return_score_type`` is not one of the supported values.
        """
        _filter = self._build_filter(agent_id, filter)
        if return_score_type == "none":
            _result = await self.vectors_store.amax_marginal_relevance_search_by_vector(
                embedding=embedding,
                k=k,
                filter=_filter,
                fetch_k=fetch_k,
                lambda_mult=lambda_mult,
            )
            return [(r.page_content, r.metadata) for r in _result]
        elif return_score_type == "similarity_score":
            _result = await self.vectors_store.amax_marginal_relevance_search_with_score_by_vector(
                embedding=embedding,
                k=k,
                filter=_filter,
                fetch_k=fetch_k,
                lambda_mult=lambda_mult,
            )
            return [(r.page_content, s, r.metadata) for r, s in _result]
        else:
            raise ValueError(f"Invalid `return_score_type` {return_score_type}!")
@@ -1,21 +1,25 @@
1
1
  import asyncio
2
2
  import logging
3
+ from collections import defaultdict
4
+ from collections.abc import Callable, Sequence
3
5
  from copy import deepcopy
4
6
  from datetime import datetime
5
- from typing import Any, Literal, Optional, Union
6
- from collections.abc import Sequence,Callable
7
+ from typing import Any, Literal, Optional, Union
7
8
 
8
9
  import numpy as np
10
+ from langchain_core.embeddings import Embeddings
9
11
  from pyparsing import deque
10
12
 
11
13
  from ..utils.decorators import lock_decorator
12
14
  from .const import *
15
+ from .faiss_query import FaissQuery
13
16
  from .profile import ProfileMemory
14
17
  from .self_define import DynamicMemory
15
18
  from .state import StateMemory
16
19
 
17
20
  logger = logging.getLogger("pycityagent")
18
21
 
22
+
19
23
  class Memory:
20
24
  """
21
25
  A class to manage different types of memory (state, profile, dynamic).
@@ -33,7 +37,8 @@ class Memory:
33
37
  base: Optional[dict[Any, Any]] = None,
34
38
  motion: Optional[dict[Any, Any]] = None,
35
39
  activate_timestamp: bool = False,
36
- embedding_model: Any = None,
40
+ embedding_model: Optional[Embeddings] = None,
41
+ faiss_query: Optional[FaissQuery] = None,
37
42
  ) -> None:
38
43
  """
39
44
  Initializes the Memory with optional configuration.
@@ -51,20 +56,21 @@ class Memory:
51
56
  base (Optional[dict[Any, Any]], optional): base attribute dict from City Simulator.
52
57
  motion (Optional[dict[Any, Any]], optional): motion attribute dict from City Simulator.
53
58
  activate_timestamp (bool): Whether activate timestamp storage in MemoryUnit
54
- embedding_model (Any): The embedding model for memory search.
59
+ embedding_model (Embeddings): The embedding model for memory search.
60
+ faiss_query (FaissQuery): The faiss_query of the agent. Defaults to None.
55
61
  """
56
62
  self.watchers: dict[str, list[Callable]] = {}
57
63
  self._lock = asyncio.Lock()
58
- self.embedding_model = embedding_model
59
-
60
- # 初始化embedding存储
61
- self._embeddings = {"state": {}, "profile": {}, "dynamic": {}}
64
+ self._agent_id: int = -1
65
+ self._embedding_model = embedding_model
62
66
 
63
67
  _dynamic_config: dict[Any, Any] = {}
64
68
  _state_config: dict[Any, Any] = {}
65
69
  _profile_config: dict[Any, Any] = {}
66
70
  # 记录哪些字段需要embedding
67
71
  self._embedding_fields: dict[str, bool] = {}
72
+ self._embedding_field_to_doc_id: dict[Any, str] = defaultdict(str)
73
+ self._faiss_query = faiss_query
68
74
 
69
75
  if config is not None:
70
76
  for k, v in config.items():
@@ -135,8 +141,55 @@ class Memory:
135
141
  self._profile = ProfileMemory(
136
142
  msg=_profile_config, activate_timestamp=activate_timestamp
137
143
  )
138
- self.memories = [] # 存储记忆内容
139
- self.embeddings = [] # 存储记忆的向量表示
144
+ # self.memories = [] # 存储记忆内容
145
+ # self.embeddings = [] # 存储记忆的向量表示
146
+
147
+ def set_embedding_model(
148
+ self,
149
+ embedding_model: Embeddings,
150
+ ):
151
+ self._embedding_model = embedding_model
152
+
153
+ @property
154
+ def embedding_model(
155
+ self,
156
+ ):
157
+ if self._embedding_model is None:
158
+ raise RuntimeError(
159
+ f"embedding_model before assignment, please `set_embedding_model` first!"
160
+ )
161
+ return self._embedding_model
162
+
163
+ def set_faiss_query(self, faiss_query: FaissQuery):
164
+ """
165
+ Set the FaissQuery of the agent.
166
+ """
167
+ self._faiss_query = faiss_query
168
+
169
+ @property
170
+ def agent_id(
171
+ self,
172
+ ):
173
+ if self._agent_id < 0:
174
+ raise RuntimeError(
175
+ f"agent_id before assignment, please `set_agent_id` first!"
176
+ )
177
+ return self._agent_id
178
+
179
+ def set_agent_id(self, agent_id: int):
180
+ """
181
+ Set the FaissQuery of the agent.
182
+ """
183
+ self._agent_id = agent_id
184
+
185
+ @property
186
+ def faiss_query(self) -> FaissQuery:
187
+ """FaissQuery"""
188
+ if self._faiss_query is None:
189
+ raise RuntimeError(
190
+ f"FaissQuery access before assignment, please `set_faiss_query` first!"
191
+ )
192
+ return self._faiss_query
140
193
 
141
194
  @lock_decorator
142
195
  async def get(
@@ -192,11 +245,23 @@ class Memory:
192
245
  if mode == "replace":
193
246
  await _mem.update(key, value, store_snapshot)
194
247
  # 如果字段需要embedding,则更新embedding
195
- if self.embedding_model and self._embedding_fields.get(key, False):
248
+ if self._embedding_fields.get(key, False) and self.embedding_model:
196
249
  memory_type = self._get_memory_type(_mem)
197
- self._embeddings[memory_type][key] = (
198
- await self._generate_embedding(f"{key}: {str(value)}")
250
+ # 覆盖更新删除原vector
251
+ orig_doc_id = self._embedding_field_to_doc_id[key]
252
+ if orig_doc_id:
253
+ await self.faiss_query.delete_documents(
254
+ to_delete_ids=[orig_doc_id],
255
+ )
256
+ doc_ids: list[str] = await self.faiss_query.add_documents(
257
+ agent_id=self.agent_id,
258
+ documents=f"{key}: {str(value)}",
259
+ extra_tags={
260
+ "type": memory_type,
261
+ "key": key,
262
+ },
199
263
  )
264
+ self._embedding_field_to_doc_id[key] = doc_ids[0]
200
265
  if key in self.watchers:
201
266
  for callback in self.watchers[key]:
202
267
  asyncio.create_task(callback())
@@ -214,13 +279,17 @@ class Memory:
214
279
  f"Type of {type(original_value)} does not support mode `merge`, using `replace` instead!"
215
280
  )
216
281
  await _mem.update(key, value, store_snapshot)
217
- if self.embedding_model and self._embedding_fields.get(key, False):
282
+ if self._embedding_fields.get(key, False) and self.embedding_model:
218
283
  memory_type = self._get_memory_type(_mem)
219
- self._embeddings[memory_type][key] = (
220
- await self._generate_embedding(
221
- f"{key}: {str(original_value)}"
222
- )
284
+ doc_ids = await self.faiss_query.add_documents(
285
+ agent_id=self.agent_id,
286
+ documents=f"{key}: {str(original_value)}",
287
+ extra_tags={
288
+ "type": memory_type,
289
+ "key": key,
290
+ },
223
291
  )
292
+ self._embedding_field_to_doc_id[key] = doc_ids[0]
224
293
  if key in self.watchers:
225
294
  for callback in self.watchers[key]:
226
295
  asyncio.create_task(callback())
@@ -240,68 +309,6 @@ class Memory:
240
309
  else:
241
310
  return "dynamic"
242
311
 
243
- async def _generate_embedding(self, text: str) -> np.ndarray:
244
- """生成文本的向量表示
245
-
246
- Args:
247
- text: 输入文本
248
-
249
- Returns:
250
- np.ndarray: 文本的向量表示
251
-
252
- Raises:
253
- ValueError: 如果embedding_model未初始化
254
- """
255
- if not self.embedding_model:
256
- raise RuntimeError("Embedding model not initialized")
257
-
258
- return await self.embedding_model.embed(text)
259
-
260
- async def search(self, query: str, top_k: int = 3) -> str:
261
- """搜索相关记忆
262
-
263
- Args:
264
- query: 查询文本
265
- top_k: 返回最相关的记忆数量
266
-
267
- Returns:
268
- str: 格式化的相关记忆文本
269
- """
270
- if not self.embedding_model:
271
- return "Embedding model not initialized"
272
-
273
- query_embedding = await self._generate_embedding(query)
274
- all_results = []
275
-
276
- # 搜索所有记忆类型中启用了embedding的字段
277
- for memory_type, embeddings in self._embeddings.items():
278
- for key, embedding in embeddings.items():
279
- similarity = self._cosine_similarity(query_embedding, embedding)
280
- value = await self.get(key)
281
-
282
- all_results.append(
283
- {
284
- "type": memory_type,
285
- "key": key,
286
- "content": f"{key}: {str(value)}",
287
- "similarity": similarity,
288
- }
289
- )
290
-
291
- # 按相似度排序
292
- all_results.sort(key=lambda x: x["similarity"], reverse=True)
293
- top_results = all_results[:top_k]
294
-
295
- # 格式化输出
296
- formatted_results = []
297
- for result in top_results:
298
- formatted_results.append(
299
- f"- [{result['type']}] {result['content']} "
300
- f"(相关度: {result['similarity']:.2f})"
301
- )
302
-
303
- return "\n".join(formatted_results)
304
-
305
312
  async def update_batch(
306
313
  self,
307
314
  content: Union[dict, Sequence[tuple[Any, Any]]],
@@ -388,67 +395,54 @@ class Memory:
388
395
  if _snapshot:
389
396
  await _mem.load(snapshots=_snapshot, reset_memory=reset_memory)
390
397
 
398
+ # async def add(self, content: str, metadata: Optional[dict] = None) -> None:
399
+ # """添加新的记忆
400
+
401
+ # Args:
402
+ # content: 记忆内容
403
+ # metadata: 相关元数据,如时间、地点等
404
+ # """
405
+ # embedding = await self.embedding_model.aembed_query(content)
406
+ # self.memories.append(
407
+ # {
408
+ # "content": content,
409
+ # "metadata": metadata or {},
410
+ # "timestamp": datetime.now(),
411
+ # "embedding": embedding,
412
+ # }
413
+ # )
414
+ # self.embeddings.append(embedding)
415
+
391
416
  @lock_decorator
392
- async def get_top_k(
393
- self,
394
- key: Any,
395
- metric: Callable[[Any], Any],
396
- top_k: Optional[int] = None,
397
- mode: Union[Literal["read only"], Literal["read and write"]] = "read only",
398
- preserve_order: bool = True,
399
- ) -> Any:
400
- """
401
- Retrieves the top-k items from the memory based on the given key and metric.
417
+ async def search(
418
+ self, query: str, top_k: int = 3, filter: Optional[dict] = None
419
+ ) -> str:
420
+ """搜索相关记忆
402
421
 
403
422
  Args:
404
- key (Any): The key of the item to retrieve.
405
- metric (Callable[[Any], Any]): A callable function that defines the metric for ranking the items.
406
- top_k (Optional[int], optional): The number of top items to retrieve. Defaults to None (all items).
407
- mode (Union[Literal["read only"], Literal["read and write"]], optional): Access mode for the item. Defaults to "read only".
408
- preserve_order (bool): Whether preserve original order in output values.
423
+ query: 查询文本
424
+ top_k: 返回最相关的记忆数量
425
+ filter (dict, optional): 记忆的筛选条件,如 {"type":"dynamic", "key":"self_define_1",},默认为空
409
426
 
410
427
  Returns:
411
- Any: The top-k items based on the specified metric.
412
-
413
- Raises:
414
- ValueError: If an invalid mode is provided.
415
- KeyError: If the key is not found in any of the memory sections.
416
- """
417
- if mode == "read only":
418
- process_func = deepcopy
419
- elif mode == "read and write":
420
- process_func = lambda x: x
421
- else:
422
- raise ValueError(f"Invalid get mode `{mode}`!")
423
- for _mem in [self._state, self._profile, self._dynamic]:
424
- try:
425
- value = await _mem.get_top_k(key, metric, top_k, preserve_order)
426
- return process_func(value)
427
- except KeyError as e:
428
- continue
429
- raise KeyError(f"No attribute `{key}` in memories!")
430
-
431
- async def add(self, content: str, metadata: Optional[dict] = None) -> None:
432
- """添加新的记忆
433
-
434
- Args:
435
- content: 记忆内容
436
- metadata: 相关元数据,如时间、地点等
428
+ str: 格式化的相关记忆文本
437
429
  """
438
- embedding = await self.embedding_model.embed(content)
439
- self.memories.append(
440
- {
441
- "content": content,
442
- "metadata": metadata or {},
443
- "timestamp": datetime.now(),
444
- "embedding": embedding,
445
- }
430
+ if not self._embedding_model:
431
+ return "Embedding model not initialized"
432
+ top_results: list[tuple[str, float, dict]] = (
433
+ await self.faiss_query.similarity_search( # type:ignore
434
+ query=query,
435
+ agent_id=self.agent_id,
436
+ k=top_k,
437
+ return_score_type="similarity_score",
438
+ filter=filter,
439
+ )
446
440
  )
447
- self.embeddings.append(embedding)
448
-
449
- def _cosine_similarity(self, v1: np.ndarray, v2: np.ndarray) -> float:
450
- """计算余弦相似度"""
451
- dot_product = np.dot(v1, v2)
452
- norm_v1 = np.linalg.norm(v1)
453
- norm_v2 = np.linalg.norm(v2)
454
- return dot_product / (norm_v1 * norm_v2)
441
+ # 格式化输出
442
+ formatted_results = []
443
+ for content, score, metadata in top_results:
444
+ formatted_results.append(
445
+ f"- [{metadata['type']}] {content} " f"(相关度: {score:.2f})"
446
+ )
447
+
448
+ return "\n".join(formatted_results)
@@ -10,12 +10,14 @@ from uuid import UUID
10
10
 
11
11
  import fastavro
12
12
  import ray
13
+ from langchain_core.embeddings import Embeddings
13
14
 
14
15
  from ..agent import Agent, CitizenAgent, InstitutionAgent
15
16
  from ..economy.econ_client import EconomyClient
16
17
  from ..environment.simulator import Simulator
17
18
  from ..llm.llm import LLM
18
19
  from ..llm.llmconfig import LLMConfig
20
+ from ..memory import FaissQuery
19
21
  from ..message import Messager
20
22
  from ..metrics import MlflowClient
21
23
  from ..utils import (DIALOG_SCHEMA, INSTITUTION_STATUS_SCHEMA, PROFILE_SCHEMA,
@@ -37,6 +39,7 @@ class AgentGroup:
37
39
  enable_pgsql: bool,
38
40
  pgsql_writer: ray.ObjectRef,
39
41
  mlflow_run_id: str,
42
+ embedding_model: Embeddings,
40
43
  logging_level: int,
41
44
  ):
42
45
  logger.setLevel(logging_level)
@@ -46,6 +49,7 @@ class AgentGroup:
46
49
  self.exp_id = exp_id
47
50
  self.enable_avro = enable_avro
48
51
  self.enable_pgsql = enable_pgsql
52
+ self.embedding_model = embedding_model
49
53
  if enable_avro:
50
54
  self.avro_path = avro_path / f"{self._uuid}"
51
55
  self.avro_path.mkdir(parents=True, exist_ok=True)
@@ -99,6 +103,13 @@ class AgentGroup:
99
103
  else:
100
104
  self.mlflow_client = None
101
105
 
106
+ # set FaissQuery
107
+ if self.embedding_model is not None:
108
+ self.faiss_query = FaissQuery(
109
+ embeddings=self.embedding_model,
110
+ )
111
+ else:
112
+ self.faiss_query = None
102
113
  for agent in self.agents:
103
114
  agent.set_exp_id(self.exp_id) # type: ignore
104
115
  agent.set_llm_client(self.llm)
@@ -112,6 +123,12 @@ class AgentGroup:
112
123
  agent.set_avro_file(self.avro_file) # type: ignore
113
124
  if self.enable_pgsql:
114
125
  agent.set_pgsql_writer(self._pgsql_writer)
126
+ # set memory.faiss_query
127
+ if self.faiss_query is not None:
128
+ agent.memory.set_faiss_query(self.faiss_query)
129
+ # set memory.embedding model
130
+ if self.embedding_model is not None:
131
+ agent.memory.set_embedding_model(self.embedding_model)
115
132
 
116
133
  async def init_agents(self):
117
134
  logger.debug(f"-----Initializing Agents in AgentGroup {self._uuid} ...")
@@ -376,32 +393,32 @@ class AgentGroup:
376
393
  "created_at": _date_time,
377
394
  }
378
395
  _statuses_time_list.append((_status_dict, _date_time))
379
- to_update_statues: list[tuple] = []
380
- for _status_dict, _ in _statuses_time_list:
381
- BASIC_KEYS = [
382
- "id",
383
- "day",
384
- "t",
385
- "lng",
386
- "lat",
387
- "parent_id",
388
- "action",
389
- "created_at",
390
- ]
391
- _data = [_status_dict[k] for k in BASIC_KEYS if k != "created_at"]
392
- _other_dict = json.dumps(
393
- {k: v for k, v in _status_dict.items() if k not in BASIC_KEYS}
394
- )
395
- _data.append(_other_dict)
396
- _data.append(_status_dict["created_at"])
397
- to_update_statues.append(tuple(_data))
398
- if self._last_asyncio_pg_task is not None:
399
- await self._last_asyncio_pg_task
400
- self._last_asyncio_pg_task = (
401
- self._pgsql_writer.async_write_status.remote( # type:ignore
402
- to_update_statues
396
+ to_update_statues: list[tuple] = []
397
+ for _status_dict, _ in _statuses_time_list:
398
+ BASIC_KEYS = [
399
+ "id",
400
+ "day",
401
+ "t",
402
+ "lng",
403
+ "lat",
404
+ "parent_id",
405
+ "action",
406
+ "created_at",
407
+ ]
408
+ _data = [_status_dict[k] for k in BASIC_KEYS if k != "created_at"]
409
+ _other_dict = json.dumps(
410
+ {k: v for k, v in _status_dict.items() if k not in BASIC_KEYS}
411
+ )
412
+ _data.append(_other_dict)
413
+ _data.append(_status_dict["created_at"])
414
+ to_update_statues.append(tuple(_data))
415
+ if self._last_asyncio_pg_task is not None:
416
+ await self._last_asyncio_pg_task
417
+ self._last_asyncio_pg_task = (
418
+ self._pgsql_writer.async_write_status.remote( # type:ignore
419
+ to_update_statues
420
+ )
403
421
  )
404
- )
405
422
 
406
423
  async def step(self):
407
424
  if not self.initialized:
@@ -14,11 +14,13 @@ from typing import Any, Optional, Union
14
14
  import pycityproto.city.economy.v2.economy_pb2 as economyv2
15
15
  import ray
16
16
  import yaml
17
+ from langchain_core.embeddings import Embeddings
17
18
  from mosstool.map._map_util.const import AOI_START_ID
18
19
 
19
20
  from ..agent import Agent, InstitutionAgent
20
21
  from ..environment.simulator import Simulator
21
- from ..memory.memory import Memory
22
+ from ..llm import SimpleEmbedding
23
+ from ..memory import FaissQuery, Memory
22
24
  from ..message.messager import Messager
23
25
  from ..metrics import init_mlflow_connection
24
26
  from ..survey import Survey
@@ -76,6 +78,8 @@ class AgentSimulation:
76
78
 
77
79
  # storage
78
80
  _storage_config: dict[str, Any] = config.get("storage", {})
81
+ if _storage_config is None:
82
+ _storage_config = {}
79
83
  # avro
80
84
  _avro_config: dict[str, Any] = _storage_config.get("avro", {})
81
85
  self._enable_avro = _avro_config.get("enabled", False)
@@ -164,6 +168,7 @@ class AgentSimulation:
164
168
  enable_pgsql: bool,
165
169
  pgsql_writer: ray.ObjectRef,
166
170
  mlflow_run_id: str = None, # type: ignore
171
+ embedding_model: Embeddings = None, # type: ignore
167
172
  logging_level: int = logging.WARNING,
168
173
  ):
169
174
  """创建远程组"""
@@ -177,6 +182,7 @@ class AgentSimulation:
177
182
  enable_pgsql,
178
183
  pgsql_writer,
179
184
  mlflow_run_id,
185
+ embedding_model,
180
186
  logging_level,
181
187
  )
182
188
  return group_name, group, agents
@@ -186,6 +192,7 @@ class AgentSimulation:
186
192
  agent_count: Union[int, list[int]],
187
193
  group_size: int = 1000,
188
194
  pg_sql_writers: int = 32,
195
+ embedding_model: Embeddings = SimpleEmbedding(),
189
196
  memory_config_func: Optional[Union[Callable, list[Callable]]] = None,
190
197
  ) -> None:
191
198
  """初始化智能体
@@ -305,6 +312,7 @@ class AgentSimulation:
305
312
  self.enable_pgsql,
306
313
  _workers[i % _num_workers], # type:ignore
307
314
  mlflow_run_id, # type:ignore
315
+ embedding_model,
308
316
  self.logging_level,
309
317
  )
310
318
  creation_tasks.append((group_name, group, agents))
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: pycityagent
3
- Version: 2.0.0a22
3
+ Version: 2.0.0a24
4
4
  Summary: LLM-based城市环境agent构建库
5
5
  License: MIT
6
6
  Author: Yuwei Yan
@@ -20,10 +20,12 @@ Requires-Dist: aiohttp (==3.10.10)
20
20
  Requires-Dist: aiomqtt (>=2.3.0,<3.0.0)
21
21
  Requires-Dist: citystreetview (==1.2.4)
22
22
  Requires-Dist: dashscope (==1.14.1)
23
+ Requires-Dist: faiss-cpu (>=1.9.0.post1,<2.0.0)
23
24
  Requires-Dist: fastavro (>=1.10.0,<2.0.0)
24
25
  Requires-Dist: geojson (==3.1.0)
25
26
  Requires-Dist: gradio (>=5.7.1,<6.0.0)
26
27
  Requires-Dist: grpcio (==1.67.1)
28
+ Requires-Dist: langchain-community (>=0.3.13,<0.4.0)
27
29
  Requires-Dist: langchain-core (>=0.3.28,<0.4.0)
28
30
  Requires-Dist: matplotlib (==3.8.3)
29
31
  Requires-Dist: mlflow (>=2.19.0,<3.0.0)
@@ -40,6 +42,8 @@ Requires-Dist: pycityproto (>=2.1.5,<3.0.0)
40
42
  Requires-Dist: pyyaml (>=6.0.2,<7.0.0)
41
43
  Requires-Dist: ray (>=2.40.0,<3.0.0)
42
44
  Requires-Dist: sidecar (==0.7.0)
45
+ Requires-Dist: torch (>=2.5.1,<3.0.0)
46
+ Requires-Dist: transformers (>=4.47.1,<5.0.0)
43
47
  Requires-Dist: zhipuai (>=2.1.5.20230904,<3.0.0.0)
44
48
  Description-Content-Type: text/markdown
45
49
 
@@ -1,5 +1,5 @@
1
- pycityagent/__init__.py,sha256=EDxt3Su3lH1IMh9suNw7GeGL7UrXeWiZTw5KWNznDzc,637
2
- pycityagent/agent.py,sha256=TGW4vyaYBnNxYkr22FhGPwex3dLIeq3F-2rnELidNPA,28670
1
+ pycityagent/__init__.py,sha256=fv0mzNGbHBF6m550yYqnuUpB8iQPWS-7EatYRK7DO4s,693
2
+ pycityagent/agent.py,sha256=l8Oa95_K5JBWKzvZmbQe_QM_E_vaG-YstuuR55kgC6Y,29005
3
3
  pycityagent/economy/__init__.py,sha256=aonY4WHnx-6EGJ4WKrx4S-2jAkYNLtqUA04jp6q8B7w,75
4
4
  pycityagent/economy/econ_client.py,sha256=GuHK9ZBnhqW3Z7F8ViDJn_iN73yOBbbwFyJv1wLEBDk,12211
5
5
  pycityagent/environment/__init__.py,sha256=awHxlOud-btWbk0FCS4RmGJ13W84oVCkbGfcrhKqihA,240
@@ -30,14 +30,15 @@ pycityagent/environment/utils/grpc.py,sha256=6EJwKXXktIWb1NcUiJzIRmfrY0S03QAXXGc
30
30
  pycityagent/environment/utils/map_utils.py,sha256=lYOEoCFFK6-e9N5txLMMq4HUlxMqc8Uw1YrGW5oJmgg,5749
31
31
  pycityagent/environment/utils/port.py,sha256=3OM6kSUt3PxvDUOlgyiendBtETaWU8Mzk_8H0TzTmYg,295
32
32
  pycityagent/environment/utils/protobuf.py,sha256=0BsM_G7x2B_6DMIBHe9bjVuQDOXUytNRQ03g9e05F3c,1170
33
- pycityagent/llm/__init__.py,sha256=7klKEmCcDWJIu-F4DoAukSuKfDbLhdczrSIhpwow-sY,145
34
- pycityagent/llm/embedding.py,sha256=2psX_EK67oPlYe77g43EYUYams4M9AiJqxpHTFHG0n8,4253
33
+ pycityagent/llm/__init__.py,sha256=iWs6FLgrbRVIiqOf4ILS89gkVCTvS7HFC3vG-MWuyko,205
34
+ pycityagent/llm/embeddings.py,sha256=Nhf_tUIlaYJAZ93wW2QTCtS1wq7e8fUgdn2JketEAuQ,7600
35
35
  pycityagent/llm/llm.py,sha256=vJaaGqVuyV-GlBxrnvGKZnMDlxeTT_sGUTdxz5tYwEE,15141
36
36
  pycityagent/llm/llmconfig.py,sha256=4Ylf4OFSBEFy8jrOneeX0HvPhWEaF5jGvy1HkXK08Ro,436
37
37
  pycityagent/llm/utils.py,sha256=hoNPhvomb1u6lhFX0GctFipw74hVKb7bvUBDqwBzBYw,160
38
- pycityagent/memory/__init__.py,sha256=Hs2NhYpIG-lvpwPWwj4DydB1sxtjz7cuA4iDAzCXnjI,243
38
+ pycityagent/memory/__init__.py,sha256=_Vfdo1HcLWsuuz34_i8e91nnLVYADpMlHHSVaB3xgIk,297
39
39
  pycityagent/memory/const.py,sha256=6zpJPJXWoH9-yf4RARYYff586agCoud9BRn7sPERB1g,932
40
- pycityagent/memory/memory.py,sha256=vJxHOI74aJDGZPFu2LbBr02ASfOYpig66fto6Gjr-6Q,18191
40
+ pycityagent/memory/faiss_query.py,sha256=Z0JS4udyPYCIzHMq464QtHscnswu35gh9fQptikAwkQ,12976
41
+ pycityagent/memory/memory.py,sha256=UBh4yANNHDzYZwrsvyX4ZMSHXINbu1U6g0HLNCOOCk8,17883
41
42
  pycityagent/memory/memory_base.py,sha256=QG_j3BxZvkadFEeE3uBR_kjl_xcXD1aHUVs8GEF3d6w,5654
42
43
  pycityagent/memory/profile.py,sha256=q8ZS9IBmHCg_X1GONUvXK85P6tCepTKQgXKuvuXYNXw,5203
43
44
  pycityagent/memory/self_define.py,sha256=vpZ6CIxR2grNXEIOScdpsSc59FBg0mOKelwQuTElbtQ,5200
@@ -49,8 +50,8 @@ pycityagent/metrics/__init__.py,sha256=X08PaBbGVAd7_PRGLREXWxaqm7nS82WBQpD1zvQzc
49
50
  pycityagent/metrics/mlflow_client.py,sha256=g_tHxWkWTDijtbGL74-HmiYzWVKb1y8-w12QrY9jL30,4449
50
51
  pycityagent/metrics/utils/const.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
51
52
  pycityagent/simulation/__init__.py,sha256=P5czbcg2d8S0nbbnsQXFIhwzO4CennAhZM8OmKvAeYw,194
52
- pycityagent/simulation/agentgroup.py,sha256=5p68wNoEaog4nDym3xsCTporBWmxNiQ1crN3mbOHFsE,19788
53
- pycityagent/simulation/simulation.py,sha256=7Go_RkpkC_DuBWW21JPqlV2yXY754RqSkqzM0vTdteU,23008
53
+ pycityagent/simulation/agentgroup.py,sha256=r8arCAQkKMhv3yr35XsYJL-MfG6o6rWwHItBmxfDtA4,20589
54
+ pycityagent/simulation/simulation.py,sha256=9kkdgXSEOAN8wiewVFyORksti4IdVNU0opObV6ZYa9k,23344
54
55
  pycityagent/simulation/storage/pg.py,sha256=Ws04mUgRcbbvWi_eQm3PXYa6w7AQUbDPWhSU7HFtsD8,6026
55
56
  pycityagent/survey/__init__.py,sha256=rxwou8U9KeFSP7rMzXtmtp2fVFZxK4Trzi-psx9LPIs,153
56
57
  pycityagent/survey/manager.py,sha256=S5IkwTdelsdtZETChRcfCEczzwSrry_Fly9MY4s3rbk,1681
@@ -69,6 +70,6 @@ pycityagent/workflow/block.py,sha256=l-z9iJo9_USZQRyj4TLMfihK0-tnNDG0a6jVk9WhG0o
69
70
  pycityagent/workflow/prompt.py,sha256=6jI0Rq54JLv3-IXqZLYug62vse10wTI83xvf4ZX42nk,2929
70
71
  pycityagent/workflow/tool.py,sha256=xADxhNgVsjNiMxlhdwn3xGUstFOkLEG8P67ez8VmwSI,8555
71
72
  pycityagent/workflow/trigger.py,sha256=Df-MOBEDWBbM-v0dFLQLXteLsipymT4n8vqexmK2GiQ,5643
72
- pycityagent-2.0.0a22.dist-info/METADATA,sha256=s_gC55n1d1ZUyt1kRcYhl7h9Ymp8BQQKXZHrg93V8sg,7848
73
- pycityagent-2.0.0a22.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
74
- pycityagent-2.0.0a22.dist-info/RECORD,,
73
+ pycityagent-2.0.0a24.dist-info/METADATA,sha256=cHowSJH9VJmum92fAEfRvQYtWmbCJRnVgOmI2JZDlqw,8033
74
+ pycityagent-2.0.0a24.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
75
+ pycityagent-2.0.0a24.dist-info/RECORD,,
@@ -1,136 +0,0 @@
1
- """简单的基于内存的embedding实现"""
2
-
3
- import numpy as np
4
- import hashlib
5
- import json
6
-
7
-
8
- class SimpleEmbedding:
9
- """简单的基于内存的embedding实现
10
-
11
- 使用简单的词袋模型(Bag of Words)和TF-IDF来生成文本的向量表示。
12
- 所有向量都保存在内存中,适用于小规模应用。
13
- """
14
-
15
- def __init__(self, vector_dim: int = 128, cache_size: int = 1000):
16
- """初始化
17
-
18
- Args:
19
- vector_dim: 向量维度
20
- cache_size: 缓存大小,超过此大小将清除最早的缓存
21
- """
22
- self.vector_dim = vector_dim
23
- self.cache_size = cache_size
24
- self._cache: dict[str, np.ndarray] = {}
25
- self._vocab: dict[str, int] = {} # 词汇表
26
- self._idf: dict[str, float] = {} # 逆文档频率
27
- self._doc_count = 0 # 文档总数
28
-
29
- def _text_to_hash(self, text: str) -> str:
30
- """将文本转换为hash值"""
31
- return hashlib.md5(text.encode()).hexdigest()
32
-
33
- def _tokenize(self, text: str) -> list[str]:
34
- """简单的分词"""
35
- # 这里使用简单的空格分词,实际应用中可以使用更复杂的分词方法
36
- return text.lower().split()
37
-
38
- def _update_vocab(self, tokens: list[str]):
39
- """更新词汇表"""
40
- for token in set(tokens): # 使用set去重
41
- if token not in self._vocab:
42
- self._vocab[token] = len(self._vocab)
43
-
44
- def _update_idf(self, tokens: list[str]):
45
- """更新IDF值"""
46
- self._doc_count += 1
47
- unique_tokens = set(tokens)
48
- for token in unique_tokens:
49
- self._idf[token] = self._idf.get(token, 0) + 1
50
-
51
- def _calculate_tf(self, tokens: list[str]) -> dict[str, float]:
52
- """计算词频(TF)"""
53
- tf = {}
54
- total_tokens = len(tokens)
55
- for token in tokens:
56
- tf[token] = tf.get(token, 0) + 1
57
- # 归一化
58
- for token in tf:
59
- tf[token] /= total_tokens
60
- return tf
61
-
62
- def _calculate_tfidf(self, tokens: list[str]) -> np.ndarray:
63
- """计算TF-IDF向量"""
64
- vector = np.zeros(self.vector_dim)
65
- tf = self._calculate_tf(tokens)
66
-
67
- for token, tf_value in tf.items():
68
- if token in self._idf:
69
- idf = np.log(self._doc_count / self._idf[token])
70
- idx = self._vocab[token] % self.vector_dim # 使用取模运算来控制向量维度
71
- vector[idx] += tf_value * idf
72
-
73
- # L2归一化
74
- norm = np.linalg.norm(vector)
75
- if norm > 0:
76
- vector /= norm
77
-
78
- return vector
79
-
80
- async def embed(self, text: str) -> np.ndarray:
81
- """生成文本的向量表示
82
-
83
- Args:
84
- text: 输入文本
85
-
86
- Returns:
87
- np.ndarray: 文本的向量表示
88
- """
89
- # 检查缓存
90
- text_hash = self._text_to_hash(text)
91
- if text_hash in self._cache:
92
- return self._cache[text_hash]
93
-
94
- # 分词
95
- tokens = self._tokenize(text)
96
- if not tokens:
97
- return np.zeros(self.vector_dim)
98
-
99
- # 更新词汇表和IDF
100
- self._update_vocab(tokens)
101
- self._update_idf(tokens)
102
-
103
- # 计算向量
104
- vector = self._calculate_tfidf(tokens)
105
-
106
- # 更新缓存
107
- if len(self._cache) >= self.cache_size:
108
- # 删除最早的缓存
109
- oldest_key = next(iter(self._cache))
110
- del self._cache[oldest_key]
111
- self._cache[text_hash] = vector
112
-
113
- return vector
114
-
115
- def save(self, file_path: str):
116
- """保存模型"""
117
- state = {
118
- "vector_dim": self.vector_dim,
119
- "cache_size": self.cache_size,
120
- "vocab": self._vocab,
121
- "idf": self._idf,
122
- "doc_count": self._doc_count,
123
- }
124
- with open(file_path, "w") as f:
125
- json.dump(state, f)
126
-
127
- def load(self, file_path: str):
128
- """加载模型"""
129
- with open(file_path, "r") as f:
130
- state = json.load(f)
131
- self.vector_dim = state["vector_dim"]
132
- self.cache_size = state["cache_size"]
133
- self._vocab = state["vocab"]
134
- self._idf = state["idf"]
135
- self._doc_count = state["doc_count"]
136
- self._cache = {} # 清空缓存