PyPI - pycityagent - Versions diffs - 2.0.0a22__py3-none-any.whl → 2.0.0a24__py3-none-any.whl - Mend

pycityagent 2.0.0a22-py3-none-any.whl → 2.0.0a24-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (13)

pycityagent/__init__.py +2 -1
pycityagent/agent.py +13 -2
pycityagent/llm/__init__.py +7 -2
pycityagent/llm/embeddings.py +231 -0
pycityagent/memory/__init__.py +2 -0
pycityagent/memory/faiss_query.py +302 -0
pycityagent/memory/memory.py +131 -137
pycityagent/simulation/agentgroup.py +42 -25
pycityagent/simulation/simulation.py +9 -1
{pycityagent-2.0.0a22.dist-info → pycityagent-2.0.0a24.dist-info}/METADATA +5 -1
{pycityagent-2.0.0a22.dist-info → pycityagent-2.0.0a24.dist-info}/RECORD +12 -11
pycityagent/llm/embedding.py +0 -136
{pycityagent-2.0.0a22.dist-info → pycityagent-2.0.0a24.dist-info}/WHEEL +0 -0

pycityagent/__init__.py CHANGED Viewed

@@ -5,6 +5,7 @@ Pycityagent: 城市智能体构建框架
 from .agent import Agent, CitizenAgent, InstitutionAgent
 from .environment import Simulator
 import logging
+from .llm import SentenceEmbedding
 # 创建一个 pycityagent 记录器
 logger = logging.getLogger("pycityagent")
@@ -19,4 +20,4 @@ if not logger.hasHandlers():
     handler.setFormatter(formatter)
     logger.addHandler(handler)
-__all__ = ["Agent", "Simulator", "CitizenAgent", "InstitutionAgent"]
+__all__ = ["Agent", "Simulator", "CitizenAgent", "InstitutionAgent","SentenceEmbedding",]

pycityagent/agent.py CHANGED Viewed

@@ -236,7 +236,15 @@ class Agent(ABC):
         # 添加记忆上下文
         if self._memory:
-            relevant_memories = await self._memory.search(survey_prompt)
+            relevant_memories = await self.memory.search(survey_prompt)
+            formatted_results = []
+            # for result in top_results:
+            #     formatted_results.append(
+            #         f"- [{result['type']}] {result['content']} "
+            #         f"(相关度: {result['similarity']:.2f})"
+            #     )
             if relevant_memories:
                 dialog.append(
                     {
@@ -458,7 +466,9 @@ class Agent(ABC):
         topic = f"exps/{self._exp_id}/agents/{to_agent_uuid}/{sub_topic}"
         await self._messager.send_message(topic, payload)
-    async def send_message_to_agent(self, to_agent_uuid: str, content: str, type: str = "social"):
+    async def send_message_to_agent(
+        self, to_agent_uuid: str, content: str, type: str = "social"
+    ):
         """通过 Messager 发送消息"""
         if self._messager is None:
             raise RuntimeError("Messager is not set")
@@ -598,6 +608,7 @@ class CitizenAgent(Agent):
                 # 防止模拟器还没有到prepare阶段导致get_person出错
             self._has_bound_to_simulator = True
             self._agent_id = person_id
+            self.memory.set_agent_id(person_id)
     async def _bind_to_economy(self):
         if self._economy_client is None:

pycityagent/llm/__init__.py CHANGED Viewed

@@ -1,6 +1,11 @@
 """LLM相关模块"""
+from .embeddings import SentenceEmbedding, SimpleEmbedding
 from .llm import LLM, LLMConfig
-from .embedding import SimpleEmbedding
-__all__ = ["LLM", "LLMConfig", "SimpleEmbedding"]
+__all__ = [
+    "LLM",
+    "LLMConfig",
+    "SentenceEmbedding",
+    "SimpleEmbedding",
+]

pycityagent/llm/embeddings.py ADDED Viewed

@@ -0,0 +1,231 @@
+import hashlib
+import json
+import os
+from typing import Optional, Union
+import numpy as np
+import torch
+from langchain_core.embeddings import Embeddings
+from transformers import AutoModel, AutoTokenizer
+__all__ = [
+    "SentenceEmbedding",
+    "SimpleEmbedding",
+]
+class SentenceEmbedding(Embeddings):
+    def __init__(
+        self,
+        pretrained_model_name_or_path: Union[str, os.PathLike] = "BAAI/bge-m3",
+        max_seq_len: int = 8192,
+        auto_cuda: bool = False,
+        local_files_only: bool = False,
+        cache_dir: str = "./cache",
+        proxies: Optional[dict] = None,
+    ):
+        os.makedirs(cache_dir, exist_ok=True)
+        self.tokenizer = AutoTokenizer.from_pretrained(
+            pretrained_model_name_or_path,
+            proxies=proxies,
+            cache_dir=cache_dir,
+            local_files_only=local_files_only,
+        )
+        self.model = AutoModel.from_pretrained(
+            pretrained_model_name_or_path,
+            proxies=proxies,
+            cache_dir=cache_dir,
+            local_files_only=local_files_only,
+        )
+        self._cuda = auto_cuda and torch.cuda.is_available()
+        if self._cuda:
+            self.model = self.model.cuda()
+        self.model.eval()
+        self.max_seq_len = max_seq_len
+    def _embed(self, texts: list[str]) -> list[list[float]]:
+        # Tokenize sentences
+        encoded_input = self.tokenizer(
+            texts, padding=True, truncation=True, return_tensors="pt"
+        )
+        # for s2p(short query to long passage) retrieval task, add an instruction to query (not add instruction for passages)
+        # encoded_input = tokenizer([instruction + q for q in queries], padding=True, truncation=True, return_tensors='pt')
+        # check length of input
+        # assert seq_len <= 8192
+        assert encoded_input["input_ids"].shape[1] <= self.max_seq_len  # type: ignore
+        if self._cuda:
+            encoded_input = {k: v.cuda() for k, v in encoded_input.items()}
+        # Compute token embeddings
+        with torch.no_grad():
+            model_output = self.model(**encoded_input)
+            # Perform pooling. In this case, cls pooling.
+            sentence_embeddings = model_output[0][:, 0]
+        # normalize embeddings
+        sentence_embeddings = torch.nn.functional.normalize(
+            sentence_embeddings, p=2, dim=1
+        )
+        if self._cuda:
+            sentence_embeddings = sentence_embeddings.cpu()
+        return sentence_embeddings.tolist()
+    def embed_documents(self, texts: list[str]) -> list[list[float]]:
+        """Embed documents."""
+        return self._embed(texts)
+    def embed_query(self, text: str) -> list[float]:
+        """Embed query text."""
+        return self._embed([text])[0]
+class SimpleEmbedding(Embeddings):
+    """简单的基于内存的embedding实现
+    使用简单的词袋模型(Bag of Words)和TF-IDF来生成文本的向量表示。
+    所有向量都保存在内存中，适用于小规模应用。
+    """
+    def __init__(self, vector_dim: int = 128, cache_size: int = 1000):
+        """初始化
+        Args:
+            vector_dim: 向量维度
+            cache_size: 缓存大小，超过此大小将清除最早的缓存
+        """
+        self.vector_dim = vector_dim
+        self.cache_size = cache_size
+        self._cache: dict[str, list[float]] = {}
+        self._vocab: dict[str, int] = {}  # 词汇表
+        self._idf: dict[str, float] = {}  # 逆文档频率
+        self._doc_count = 0  # 文档总数
+    def _text_to_hash(self, text: str) -> str:
+        """将文本转换为hash值"""
+        return hashlib.md5(text.encode()).hexdigest()
+    def _tokenize(self, text: str) -> list[str]:
+        """简单的分词"""
+        # 这里使用简单的空格分词，实际应用中可以使用更复杂的分词方法
+        return text.lower().split()
+    def _update_vocab(self, tokens: list[str]):
+        """更新词汇表"""
+        for token in set(tokens):  # 使用set去重
+            if token not in self._vocab:
+                self._vocab[token] = len(self._vocab)
+    def _update_idf(self, tokens: list[str]):
+        """更新IDF值"""
+        self._doc_count += 1
+        unique_tokens = set(tokens)
+        for token in unique_tokens:
+            self._idf[token] = self._idf.get(token, 0) + 1
+    def _calculate_tf(self, tokens: list[str]) -> dict[str, float]:
+        """计算词频(TF)"""
+        tf = {}
+        total_tokens = len(tokens)
+        for token in tokens:
+            tf[token] = tf.get(token, 0) + 1
+        # 归一化
+        for token in tf:
+            tf[token] /= total_tokens
+        return tf
+    def _calculate_tfidf(self, tokens: list[str]) -> list[float]:
+        """计算TF-IDF向量"""
+        vector = np.zeros(self.vector_dim)
+        tf = self._calculate_tf(tokens)
+        for token, tf_value in tf.items():
+            if token in self._idf:
+                idf = np.log(self._doc_count / self._idf[token])
+                idx = self._vocab[token] % self.vector_dim  # 使用取模运算来控制向量维度
+                vector[idx] += tf_value * idf
+        # L2归一化
+        norm = np.linalg.norm(vector)
+        if norm > 0:
+            vector /= norm
+        return list(vector)
+    def _embed(self, text: str) -> list[float]:
+        """生成文本的向量表示
+        Args:
+            text: 输入文本
+        Returns:
+            np.ndarray: 文本的向量表示
+        """
+        # 检查缓存
+        text_hash = self._text_to_hash(text)
+        if text_hash in self._cache:
+            return self._cache[text_hash]
+        # 分词
+        tokens = self._tokenize(text)
+        if not tokens:
+            return list(np.zeros(self.vector_dim))
+        # 更新词汇表和IDF
+        self._update_vocab(tokens)
+        self._update_idf(tokens)
+        # 计算向量
+        vector = self._calculate_tfidf(tokens)
+        # 更新缓存
+        if len(self._cache) >= self.cache_size:
+            # 删除最早的缓存
+            oldest_key = next(iter(self._cache))
+            del self._cache[oldest_key]
+        self._cache[text_hash] = vector
+        return list(vector)
+    def embed_documents(self, texts: list[str]) -> list[list[float]]:
+        """Embed documents."""
+        return [self._embed(text) for text in texts]
+    def embed_query(self, text: str) -> list[float]:
+        """Embed query text."""
+        return self._embed(text)
+    # def save(self, file_path: str):
+    #     """保存模型"""
+    #     state = {
+    #         "vector_dim": self.vector_dim,
+    #         "cache_size": self.cache_size,
+    #         "vocab": self._vocab,
+    #         "idf": self._idf,
+    #         "doc_count": self._doc_count,
+    #     }
+    #     with open(file_path, "w") as f:
+    #         json.dump(state, f)
+    # def load(self, file_path: str):
+    #     """加载模型"""
+    #     with open(file_path, "r") as f:
+    #         state = json.load(f)
+    #     self.vector_dim = state["vector_dim"]
+    #     self.cache_size = state["cache_size"]
+    #     self._vocab = state["vocab"]
+    #     self._idf = state["idf"]
+    #     self._doc_count = state["doc_count"]
+    #     self._cache = {}  # 清空缓存
+if __name__ == "__main__":
+    # se = SentenceEmbedding(
+    #     pretrained_model_name_or_path="ignore/BAAI--bge-m3", cache_dir="ignore"
+    # )
+    se = SimpleEmbedding()
+    print(se.embed_query("hello world"))
+    print(se.embed_query("hello world"))
+    print(se.embed_query("hello world"))
+    print(se.embed_query("hello world"))

pycityagent/memory/__init__.py CHANGED Viewed

@@ -1,5 +1,6 @@
 """Memory."""
+from .faiss_query import FaissQuery
 from .memory import Memory
 from .memory_base import MemoryBase, MemoryUnit
 from .profile import ProfileMemory, ProfileMemoryUnit
@@ -8,4 +9,5 @@ from .state import StateMemory
 __all__ = [
     "Memory",
+    "FaissQuery",
 ]

pycityagent/memory/faiss_query.py ADDED Viewed

@@ -0,0 +1,302 @@
+import asyncio
+from collections.abc import Sequence
+from typing import Any, Literal, Optional, Union
+import faiss
+import numpy as np
+from langchain_community.docstore.in_memory import InMemoryDocstore
+from langchain_community.vectorstores import FAISS
+from langchain_core.documents import Document
+from langchain_core.embeddings import Embeddings
+from ..utils.decorators import lock_decorator
+class FaissQuery:
+    def __init__(
+        self,
+        embeddings: Optional[Embeddings] = None,
+        index_type: Any = faiss.IndexFlatL2,
+        dimension: Optional[int] = None,
+    ) -> None:
+        self._embeddings = embeddings
+        self._lock = asyncio.Lock()
+        if embeddings is None:
+            self._index = None
+            self._vectors_store = None
+        else:
+            if dimension is None:
+                dimension = len(embeddings.embed_query("hello world"))
+            self._index = index_type(dimension)
+            self._vectors_store = FAISS(
+                embedding_function=embeddings,
+                index=self._index,
+                docstore=InMemoryDocstore(),
+                index_to_docstore_id={},
+            )
+    @property
+    def embeddings(
+        self,
+    ) -> Embeddings:
+        if self._embeddings is None:
+            raise RuntimeError(f"No embedding set, please `set_embeddings` first!")
+        return self._embeddings
+    @property
+    def vectors_store(
+        self,
+    ) -> FAISS:
+        if self._vectors_store is None:
+            raise RuntimeError(f"No embedding set, thus no vector stores initialized!")
+        return self._vectors_store
+    @lock_decorator
+    async def add_documents(
+        self,
+        agent_id: int,
+        documents: Union[str, Sequence[str]],
+        extra_tags: Optional[dict] = None,
+    ) -> list[str]:
+        if isinstance(documents, str):
+            documents = [documents]
+        _metadata = {"_id": agent_id}
+        if extra_tags is not None:
+            _metadata.update(extra_tags)
+        to_add_documents = [
+            Document(page_content=doc, metadata=_metadata) for doc in documents
+        ]
+        return await self.vectors_store.aadd_documents(
+            documents=to_add_documents,
+        )
+    @lock_decorator
+    async def delete_documents(
+        self,
+        to_delete_ids: list[str],
+    ):
+        await self.vectors_store.adelete(
+            ids=to_delete_ids,
+        )
+    @lock_decorator
+    async def similarity_search(
+        self,
+        query: str,
+        agent_id: int,
+        k: int = 4,
+        fetch_k: int = 20,
+        return_score_type: Union[
+            Literal["none"], Literal["similarity_score"], Literal["L2-distance"]
+        ] = "none",
+        filter: Optional[dict] = None,
+    ) -> Union[list[tuple[str, dict]], list[tuple[str, float, dict]]]:
+        """
+        Return content most similar to the given query.
+        Args:
+            query (str): The text to look up documents similar to.
+            agent_id (int): The identifier of the agent to filter specific documents. Only documents associated with this agent will be considered.
+            k (int, optional): The number of top similar contents to return. Defaults to 4.
+            fetch_k (int, optional): The number of documents to fetch before applying any filters. Defaults to 20.
+            return_score_type (Union[Literal["none"], Literal["similarity_score"], Literal["L2-distance"]], optional):
+                Specifies whether and how to return similarity scores with the results:
+                    - "none": Do not return scores; only return the contents (default).
+                    - "similarity_score": Return a tuple of content and its similarity score.
+                    - "L2-distance": Return a tuple of content and its L2 distance from the query.
+            filter (dict, optional): The filter dict for metadata.
+        Returns:
+            Union[list[tuple[str,dict]], list[tuple[str, float,dict]]]:
+                Depending on the `return_score_type` parameter, returns either a list of strings representing the top-k similar contents,
+                or a list of tuples where each tuple contains a string and a floating-point score.
+        """
+        _filter = {
+            "_id": agent_id,
+        }
+        if filter is not None:
+            _filter.update(filter)
+        if return_score_type == "L2-distance":
+            _result = await self.vectors_store.asimilarity_search_with_score(
+                query=query,
+                k=k,
+                filter=_filter,
+                fetch_k=fetch_k,
+            )
+            return [(r.page_content, s, r.metadata) for r, s in _result]
+        elif return_score_type == "none":
+            _result = await self.vectors_store.asimilarity_search(
+                query=query,
+                k=k,
+                filter=_filter,
+                fetch_k=fetch_k,
+            )
+            return [(r.page_content, r.metadata) for r in _result]
+        elif return_score_type == "similarity_score":
+            _result = await self.vectors_store.asimilarity_search_with_relevance_scores(
+                query=query,
+                k=k,
+                filter=_filter,
+                fetch_k=fetch_k,
+            )
+            return [(r.page_content, s, r.metadata) for r, s in _result]
+        else:
+            raise ValueError(f"Invalid `return_score_type` {return_score_type}!")
+    @lock_decorator
+    async def similarity_search_by_embedding(
+        self,
+        embedding: list[float],
+        agent_id: int,
+        k: int = 4,
+        fetch_k: int = 20,
+        return_score_type: Union[Literal["none"], Literal["L2-distance"]] = "none",
+        filter: Optional[dict] = None,
+    ) -> Union[list[tuple[str, dict]], list[tuple[str, float, dict]]]:
+        """
+        Return content most similar to the given query.
+        Args:
+            embedding (list[float]): The vector to look up documents similar to.
+            agent_id (int): The identifier of the agent to filter specific documents. Only documents associated with this agent will be considered.
+            k (int, optional): The number of top similar contents to return. Defaults to 4.
+            fetch_k (int, optional): The number of documents to fetch before applying any filters. Defaults to 20.
+            return_score_type (Union[Literal["none"], Literal["similarity_score"], Literal["L2-distance"]], optional):
+                Specifies whether and how to return similarity scores with the results:
+                    - "none": Do not return scores; only return the contents (default).
+                    - "L2-distance": Return a tuple of content and its L2 distance from the query.
+            filter (dict, optional): The filter dict for metadata.
+        Returns:
+            Union[list[tuple[str,dict]], list[tuple[str, float,dict]]]:
+                Depending on the `return_score_type` parameter, returns either a list of strings representing the top-k similar contents,
+                or a list of tuples where each tuple contains a string and a floating-point score.
+        """
+        _filter = {
+            "_id": agent_id,
+        }
+        if filter is not None:
+            _filter.update(filter)
+        if return_score_type == "L2-distance":
+            _result = await self.vectors_store.asimilarity_search_with_score_by_vector(
+                embedding=embedding,
+                k=k,
+                filter=_filter,
+                fetch_k=fetch_k,
+            )
+            return [(r.page_content, s, r.metadata) for r, s in _result]
+        elif return_score_type == "none":
+            _result = await self.vectors_store.asimilarity_search_by_vector(
+                embedding=embedding,
+                k=k,
+                filter=_filter,
+                fetch_k=fetch_k,
+            )
+            return [(r.page_content, r.metadata) for r in _result]
+        else:
+            raise ValueError(f"Invalid `return_score_type` {return_score_type}!")
+    @lock_decorator
+    async def marginal_relevance_search(
+        self,
+        query: str,
+        agent_id: int,
+        k: int = 4,
+        fetch_k: int = 20,
+        lambda_mult: float = 0.5,
+        return_score_type: Literal["none"] = "none",
+        filter: Optional[dict] = None,
+    ) -> list[tuple[str, dict]]:
+        """
+        Return contents selected using the maximal marginal relevance asynchronously.
+        Args:
+            query (str): The text to look up documents similar to.
+            agent_id (int): The identifier of the agent to filter specific documents. Only documents associated with this agent will be considered.
+            k (int, optional): The number of top similar contents to return. Defaults to 4.
+            fetch_k (int, optional): The number of documents to fetch before applying any filters. Defaults to 20.
+            lambda_mult (float): Number between 0 and 1 that determines the degree of diversity among the results with 0 corresponding to maximum diversity and 1 to minimum diversity. Defaults to 0.5.
+            return_score_type (Literal["none"].,optional):
+                Specifies whether and how to return similarity scores with the results:
+                    - "none": Do not return scores; only return the contents (default).
+            filter (dict, optional): The filter dict for metadata.
+        Returns:
+            list[tuple[str,dict]]: the result contents.
+        """
+        _filter = {
+            "_id": agent_id,
+        }
+        if filter is not None:
+            _filter.update(filter)
+        if return_score_type == "none":
+            _result = await self.vectors_store.amax_marginal_relevance_search(
+                query=query,
+                k=k,
+                filter=_filter,
+                fetch_k=fetch_k,
+                lambda_mult=lambda_mult,
+            )
+            return [(r.page_content, r.metadata) for r in _result]
+        else:
+            raise ValueError(f"Invalid `return_score_type` {return_score_type}!")
+    @lock_decorator
+    async def marginal_relevance_search_by_embedding(
+        self,
+        embedding: list[float],
+        agent_id: int,
+        k: int = 4,
+        fetch_k: int = 20,
+        lambda_mult: float = 0.5,
+        return_score_type: Union[Literal["none"], Literal["similarity_score"]] = "none",
+        filter: Optional[dict] = None,
+    ) -> Union[list[tuple[str, dict]], list[tuple[str, float, dict]]]:
+        """
+        Return contents selected using the maximal marginal relevance asynchronously.
+        Args:
+            embedding (list[float]): The vector to look up documents similar to.
+            agent_id (int): The identifier of the agent to filter specific documents. Only documents associated with this agent will be considered.
+            k (int, optional): The number of top similar contents to return. Defaults to 4.
+            fetch_k (int, optional): The number of documents to fetch before applying any filters. Defaults to 20.
+            lambda_mult (float): Number between 0 and 1 that determines the degree of diversity among the results with 0 corresponding to maximum diversity and 1 to minimum diversity. Defaults to 0.5.
+            return_score_type (Union[Literal["none"], Literal["similarity_score"]], optional):
+                Specifies whether and how to return similarity scores with the results:
+                    - "none": Do not return scores; only return the contents (default).
+                    - "similarity_score": Return a tuple of content and its similarity score.
+            filter (dict, optional): The filter dict for metadata.
+        Returns:
+            Union[list[tuple[str,dict]], list[tuple[str, float,dict]]]:
+                Depending on the `return_score_type` parameter, returns either a list of strings representing the top-k similar contents,
+                or a list of tuples where each tuple contains a string and a floating-point score.
+        """
+        _filter = {
+            "_id": agent_id,
+        }
+        if filter is not None:
+            _filter.update(filter)
+        if return_score_type == "none":
+            _result = await self.vectors_store.amax_marginal_relevance_search_by_vector(
+                embedding=embedding,
+                k=k,
+                filter=_filter,
+                fetch_k=fetch_k,
+                lambda_mult=lambda_mult,
+            )
+            return [(r.page_content, r.metadata) for r in _result]
+        elif return_score_type == "similarity_score":
+            _result = await self.vectors_store.amax_marginal_relevance_search_with_score_by_vector(
+                embedding=embedding,
+                k=k,
+                filter=_filter,
+                fetch_k=fetch_k,
+                lambda_mult=lambda_mult,
+            )
+            return [(r.page_content, s, r.metadata) for r, s in _result]
+        else:
+            raise ValueError(f"Invalid `return_score_type` {return_score_type}!")

pycityagent/memory/memory.py CHANGED Viewed

@@ -1,21 +1,25 @@
 import asyncio
 import logging
+from collections import defaultdict
+from collections.abc import Callable, Sequence
 from copy import deepcopy
 from datetime import datetime
-from typing import Any, Literal, Optional,  Union
-from collections.abc import Sequence,Callable
+from typing import Any, Literal, Optional, Union
 import numpy as np
+from langchain_core.embeddings import Embeddings
 from pyparsing import deque
 from ..utils.decorators import lock_decorator
 from .const import *
+from .faiss_query import FaissQuery
 from .profile import ProfileMemory
 from .self_define import DynamicMemory
 from .state import StateMemory
 logger = logging.getLogger("pycityagent")
 class Memory:
     """
     A class to manage different types of memory (state, profile, dynamic).
@@ -33,7 +37,8 @@ class Memory:
         base: Optional[dict[Any, Any]] = None,
         motion: Optional[dict[Any, Any]] = None,
         activate_timestamp: bool = False,
-        embedding_model: Any = None,
+        embedding_model: Optional[Embeddings] = None,
+        faiss_query: Optional[FaissQuery] = None,
     ) -> None:
         """
         Initializes the Memory with optional configuration.
@@ -51,20 +56,21 @@ class Memory:
             base (Optional[dict[Any, Any]], optional): base attribute dict from City Simulator.
             motion (Optional[dict[Any, Any]], optional): motion attribute dict from City Simulator.
             activate_timestamp (bool): Whether activate timestamp storage in MemoryUnit
-            embedding_model (Any): The embedding model for memory search.
+            embedding_model (Embeddings): The embedding model for memory search.
+            faiss_query (FaissQuery): The faiss_query of the agent. Defaults to None.
         """
         self.watchers: dict[str, list[Callable]] = {}
         self._lock = asyncio.Lock()
-        self.embedding_model = embedding_model
-        # 初始化embedding存储
-        self._embeddings = {"state": {}, "profile": {}, "dynamic": {}}
+        self._agent_id: int = -1
+        self._embedding_model = embedding_model
         _dynamic_config: dict[Any, Any] = {}
         _state_config: dict[Any, Any] = {}
         _profile_config: dict[Any, Any] = {}
         # 记录哪些字段需要embedding
         self._embedding_fields: dict[str, bool] = {}
+        self._embedding_field_to_doc_id: dict[Any, str] = defaultdict(str)
+        self._faiss_query = faiss_query
         if config is not None:
             for k, v in config.items():
@@ -135,8 +141,55 @@ class Memory:
         self._profile = ProfileMemory(
             msg=_profile_config, activate_timestamp=activate_timestamp
         )
-        self.memories = []  # 存储记忆内容
-        self.embeddings = []  # 存储记忆的向量表示
+        # self.memories = []  # 存储记忆内容
+        # self.embeddings = []  # 存储记忆的向量表示
+    def set_embedding_model(
+        self,
+        embedding_model: Embeddings,
+    ):
+        self._embedding_model = embedding_model
+    @property
+    def embedding_model(
+        self,
+    ):
+        if self._embedding_model is None:
+            raise RuntimeError(
+                f"embedding_model before assignment, please `set_embedding_model` first!"
+            )
+        return self._embedding_model
+    def set_faiss_query(self, faiss_query: FaissQuery):
+        """
+        Set the FaissQuery of the agent.
+        """
+        self._faiss_query = faiss_query
+    @property
+    def agent_id(
+        self,
+    ):
+        if self._agent_id < 0:
+            raise RuntimeError(
+                f"agent_id before assignment, please `set_agent_id` first!"
+            )
+        return self._agent_id
+    def set_agent_id(self, agent_id: int):
+        """
+        Set the FaissQuery of the agent.
+        """
+        self._agent_id = agent_id
+    @property
+    def faiss_query(self) -> FaissQuery:
+        """FaissQuery"""
+        if self._faiss_query is None:
+            raise RuntimeError(
+                f"FaissQuery access before assignment, please `set_faiss_query` first!"
+            )
+        return self._faiss_query
     @lock_decorator
     async def get(
@@ -192,11 +245,23 @@ class Memory:
                 if mode == "replace":
                     await _mem.update(key, value, store_snapshot)
                     # 如果字段需要embedding，则更新embedding
-                    if self.embedding_model and self._embedding_fields.get(key, False):
+                    if self._embedding_fields.get(key, False) and self.embedding_model:
                         memory_type = self._get_memory_type(_mem)
-                        self._embeddings[memory_type][key] = (
-                            await self._generate_embedding(f"{key}: {str(value)}")
+                        # 覆盖更新删除原vector
+                        orig_doc_id = self._embedding_field_to_doc_id[key]
+                        if orig_doc_id:
+                            await self.faiss_query.delete_documents(
+                                to_delete_ids=[orig_doc_id],
+                            )
+                        doc_ids: list[str] = await self.faiss_query.add_documents(
+                            agent_id=self.agent_id,
+                            documents=f"{key}: {str(value)}",
+                            extra_tags={
+                                "type": memory_type,
+                                "key": key,
+                            },
                         )
+                        self._embedding_field_to_doc_id[key] = doc_ids[0]
                     if key in self.watchers:
                         for callback in self.watchers[key]:
                             asyncio.create_task(callback())
@@ -214,13 +279,17 @@ class Memory:
                             f"Type of {type(original_value)} does not support mode `merge`, using `replace` instead!"
                         )
                         await _mem.update(key, value, store_snapshot)
-                    if self.embedding_model and self._embedding_fields.get(key, False):
+                    if self._embedding_fields.get(key, False) and self.embedding_model:
                         memory_type = self._get_memory_type(_mem)
-                        self._embeddings[memory_type][key] = (
-                            await self._generate_embedding(
-                                f"{key}: {str(original_value)}"
-                            )
+                        doc_ids = await self.faiss_query.add_documents(
+                            agent_id=self.agent_id,
+                            documents=f"{key}: {str(original_value)}",
+                            extra_tags={
+                                "type": memory_type,
+                                "key": key,
+                            },
                         )
+                        self._embedding_field_to_doc_id[key] = doc_ids[0]
                     if key in self.watchers:
                         for callback in self.watchers[key]:
                             asyncio.create_task(callback())
@@ -240,68 +309,6 @@ class Memory:
         else:
             return "dynamic"
-    async def _generate_embedding(self, text: str) -> np.ndarray:
-        """生成文本的向量表示
-        Args:
-            text: 输入文本
-        Returns:
-            np.ndarray: 文本的向量表示
-        Raises:
-            ValueError: 如果embedding_model未初始化
-        """
-        if not self.embedding_model:
-            raise RuntimeError("Embedding model not initialized")
-        return await self.embedding_model.embed(text)
-    async def search(self, query: str, top_k: int = 3) -> str:
-        """搜索相关记忆
-        Args:
-            query: 查询文本
-            top_k: 返回最相关的记忆数量
-        Returns:
-            str: 格式化的相关记忆文本
-        """
-        if not self.embedding_model:
-            return "Embedding model not initialized"
-        query_embedding = await self._generate_embedding(query)
-        all_results = []
-        # 搜索所有记忆类型中启用了embedding的字段
-        for memory_type, embeddings in self._embeddings.items():
-            for key, embedding in embeddings.items():
-                similarity = self._cosine_similarity(query_embedding, embedding)
-                value = await self.get(key)
-                all_results.append(
-                    {
-                        "type": memory_type,
-                        "key": key,
-                        "content": f"{key}: {str(value)}",
-                        "similarity": similarity,
-                    }
-                )
-        # 按相似度排序
-        all_results.sort(key=lambda x: x["similarity"], reverse=True)
-        top_results = all_results[:top_k]
-        # 格式化输出
-        formatted_results = []
-        for result in top_results:
-            formatted_results.append(
-                f"- [{result['type']}] {result['content']} "
-                f"(相关度: {result['similarity']:.2f})"
-            )
-        return "\n".join(formatted_results)
     async def update_batch(
         self,
         content: Union[dict, Sequence[tuple[Any, Any]]],
@@ -388,67 +395,54 @@ class Memory:
             if _snapshot:
                 await _mem.load(snapshots=_snapshot, reset_memory=reset_memory)
+    # async def add(self, content: str, metadata: Optional[dict] = None) -> None:
+    #     """添加新的记忆
+    #     Args:
+    #         content: 记忆内容
+    #         metadata: 相关元数据，如时间、地点等
+    #     """
+    #     embedding = await self.embedding_model.aembed_query(content)
+    #     self.memories.append(
+    #         {
+    #             "content": content,
+    #             "metadata": metadata or {},
+    #             "timestamp": datetime.now(),
+    #             "embedding": embedding,
+    #         }
+    #     )
+    #     self.embeddings.append(embedding)
     @lock_decorator
-    async def get_top_k(
-        self,
-        key: Any,
-        metric: Callable[[Any], Any],
-        top_k: Optional[int] = None,
-        mode: Union[Literal["read only"], Literal["read and write"]] = "read only",
-        preserve_order: bool = True,
-    ) -> Any:
-        """
-        Retrieves the top-k items from the memory based on the given key and metric.
+    async def search(
+        self, query: str, top_k: int = 3, filter: Optional[dict] = None
+    ) -> str:
+        """搜索相关记忆
         Args:
-            key (Any): The key of the item to retrieve.
-            metric (Callable[[Any], Any]): A callable function that defines the metric for ranking the items.
-            top_k (Optional[int], optional): The number of top items to retrieve. Defaults to None (all items).
-            mode (Union[Literal["read only"], Literal["read and write"]], optional): Access mode for the item. Defaults to "read only".
-            preserve_order (bool): Whether preserve original order in output values.
+            query: 查询文本
+            top_k: 返回最相关的记忆数量
+            filter (dict, optional): 记忆的筛选条件，如 {"type":"dynamic", "key":"self_define_1",}，默认为空
         Returns:
-            Any: The top-k items based on the specified metric.
-        Raises:
-            ValueError: If an invalid mode is provided.
-            KeyError: If the key is not found in any of the memory sections.
-        """
-        if mode == "read only":
-            process_func = deepcopy
-        elif mode == "read and write":
-            process_func = lambda x: x
-        else:
-            raise ValueError(f"Invalid get mode `{mode}`!")
-        for _mem in [self._state, self._profile, self._dynamic]:
-            try:
-                value = await _mem.get_top_k(key, metric, top_k, preserve_order)
-                return process_func(value)
-            except KeyError as e:
-                continue
-        raise KeyError(f"No attribute `{key}` in memories!")
-    async def add(self, content: str, metadata: Optional[dict] = None) -> None:
-        """添加新的记忆
-        Args:
-            content: 记忆内容
-            metadata: 相关元数据，如时间、地点等
+            str: 格式化的相关记忆文本
         """
-        embedding = await self.embedding_model.embed(content)
-        self.memories.append(
-            {
-                "content": content,
-                "metadata": metadata or {},
-                "timestamp": datetime.now(),
-                "embedding": embedding,
-            }
+        if not self._embedding_model:
+            return "Embedding model not initialized"
+        top_results: list[tuple[str, float, dict]] = (
+            await self.faiss_query.similarity_search(  # type:ignore
+                query=query,
+                agent_id=self.agent_id,
+                k=top_k,
+                return_score_type="similarity_score",
+                filter=filter,
+            )
         )
-        self.embeddings.append(embedding)
-    def _cosine_similarity(self, v1: np.ndarray, v2: np.ndarray) -> float:
-        """计算余弦相似度"""
-        dot_product = np.dot(v1, v2)
-        norm_v1 = np.linalg.norm(v1)
-        norm_v2 = np.linalg.norm(v2)
-        return dot_product / (norm_v1 * norm_v2)
+        # 格式化输出
+        formatted_results = []
+        for content, score, metadata in top_results:
+            formatted_results.append(
+                f"- [{metadata['type']}] {content} " f"(相关度: {score:.2f})"
+            )
+        return "\n".join(formatted_results)

pycityagent/simulation/agentgroup.py CHANGED Viewed

@@ -10,12 +10,14 @@ from uuid import UUID
 import fastavro
 import ray
+from langchain_core.embeddings import Embeddings
 from ..agent import Agent, CitizenAgent, InstitutionAgent
 from ..economy.econ_client import EconomyClient
 from ..environment.simulator import Simulator
 from ..llm.llm import LLM
 from ..llm.llmconfig import LLMConfig
+from ..memory import FaissQuery
 from ..message import Messager
 from ..metrics import MlflowClient
 from ..utils import (DIALOG_SCHEMA, INSTITUTION_STATUS_SCHEMA, PROFILE_SCHEMA,
@@ -37,6 +39,7 @@ class AgentGroup:
         enable_pgsql: bool,
         pgsql_writer: ray.ObjectRef,
         mlflow_run_id: str,
+        embedding_model: Embeddings,
         logging_level: int,
     ):
         logger.setLevel(logging_level)
@@ -46,6 +49,7 @@ class AgentGroup:
         self.exp_id = exp_id
         self.enable_avro = enable_avro
         self.enable_pgsql = enable_pgsql
+        self.embedding_model = embedding_model
         if enable_avro:
             self.avro_path = avro_path / f"{self._uuid}"
             self.avro_path.mkdir(parents=True, exist_ok=True)
@@ -99,6 +103,13 @@ class AgentGroup:
         else:
             self.mlflow_client = None
+        # set FaissQuery
+        if self.embedding_model is not None:
+            self.faiss_query = FaissQuery(
+                embeddings=self.embedding_model,
+            )
+        else:
+            self.faiss_query = None
         for agent in self.agents:
             agent.set_exp_id(self.exp_id)  # type: ignore
             agent.set_llm_client(self.llm)
@@ -112,6 +123,12 @@ class AgentGroup:
                 agent.set_avro_file(self.avro_file)  # type: ignore
             if self.enable_pgsql:
                 agent.set_pgsql_writer(self._pgsql_writer)
+            # set memory.faiss_query
+            if self.faiss_query is not None:
+                agent.memory.set_faiss_query(self.faiss_query)
+            # set memory.embedding model
+            if self.embedding_model is not None:
+                agent.memory.set_embedding_model(self.embedding_model)
     async def init_agents(self):
         logger.debug(f"-----Initializing Agents in AgentGroup {self._uuid} ...")
@@ -376,32 +393,32 @@ class AgentGroup:
                             "created_at": _date_time,
                         }
                         _statuses_time_list.append((_status_dict, _date_time))
-        to_update_statues: list[tuple] = []
-        for _status_dict, _ in _statuses_time_list:
-            BASIC_KEYS = [
-                "id",
-                "day",
-                "t",
-                "lng",
-                "lat",
-                "parent_id",
-                "action",
-                "created_at",
-            ]
-            _data = [_status_dict[k] for k in BASIC_KEYS if k != "created_at"]
-            _other_dict = json.dumps(
-                {k: v for k, v in _status_dict.items() if k not in BASIC_KEYS}
-            )
-            _data.append(_other_dict)
-            _data.append(_status_dict["created_at"])
-            to_update_statues.append(tuple(_data))
-        if self._last_asyncio_pg_task is not None:
-            await self._last_asyncio_pg_task
-        self._last_asyncio_pg_task = (
-            self._pgsql_writer.async_write_status.remote(  # type:ignore
-                to_update_statues
+            to_update_statues: list[tuple] = []
+            for _status_dict, _ in _statuses_time_list:
+                BASIC_KEYS = [
+                    "id",
+                    "day",
+                    "t",
+                    "lng",
+                    "lat",
+                    "parent_id",
+                    "action",
+                    "created_at",
+                ]
+                _data = [_status_dict[k] for k in BASIC_KEYS if k != "created_at"]
+                _other_dict = json.dumps(
+                    {k: v for k, v in _status_dict.items() if k not in BASIC_KEYS}
+                )
+                _data.append(_other_dict)
+                _data.append(_status_dict["created_at"])
+                to_update_statues.append(tuple(_data))
+            if self._last_asyncio_pg_task is not None:
+                await self._last_asyncio_pg_task
+            self._last_asyncio_pg_task = (
+                self._pgsql_writer.async_write_status.remote(  # type:ignore
+                    to_update_statues
+                )
             )
-        )
     async def step(self):
         if not self.initialized:

pycityagent/simulation/simulation.py CHANGED Viewed

@@ -14,11 +14,13 @@ from typing import Any, Optional, Union
 import pycityproto.city.economy.v2.economy_pb2 as economyv2
 import ray
 import yaml
+from langchain_core.embeddings import Embeddings
 from mosstool.map._map_util.const import AOI_START_ID
 from ..agent import Agent, InstitutionAgent
 from ..environment.simulator import Simulator
-from ..memory.memory import Memory
+from ..llm import SimpleEmbedding
+from ..memory import FaissQuery, Memory
 from ..message.messager import Messager
 from ..metrics import init_mlflow_connection
 from ..survey import Survey
@@ -76,6 +78,8 @@ class AgentSimulation:
         # storage
         _storage_config: dict[str, Any] = config.get("storage", {})
+        if _storage_config is None:
+            _storage_config = {}
         # avro
         _avro_config: dict[str, Any] = _storage_config.get("avro", {})
         self._enable_avro = _avro_config.get("enabled", False)
@@ -164,6 +168,7 @@ class AgentSimulation:
         enable_pgsql: bool,
         pgsql_writer: ray.ObjectRef,
         mlflow_run_id: str = None,  # type: ignore
+        embedding_model: Embeddings = None,  # type: ignore
         logging_level: int = logging.WARNING,
     ):
         """创建远程组"""
@@ -177,6 +182,7 @@ class AgentSimulation:
             enable_pgsql,
             pgsql_writer,
             mlflow_run_id,
+            embedding_model,
             logging_level,
         )
         return group_name, group, agents
@@ -186,6 +192,7 @@ class AgentSimulation:
         agent_count: Union[int, list[int]],
         group_size: int = 1000,
         pg_sql_writers: int = 32,
+        embedding_model: Embeddings = SimpleEmbedding(),
         memory_config_func: Optional[Union[Callable, list[Callable]]] = None,
     ) -> None:
         """初始化智能体
@@ -305,6 +312,7 @@ class AgentSimulation:
                 self.enable_pgsql,
                 _workers[i % _num_workers],  # type:ignore
                 mlflow_run_id,  # type:ignore
+                embedding_model,
                 self.logging_level,
             )
             creation_tasks.append((group_name, group, agents))

{pycityagent-2.0.0a22.dist-info → pycityagent-2.0.0a24.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: pycityagent
-Version: 2.0.0a22
+Version: 2.0.0a24
 Summary: LLM-based城市环境agent构建库
 License: MIT
 Author: Yuwei Yan
@@ -20,10 +20,12 @@ Requires-Dist: aiohttp (==3.10.10)
 Requires-Dist: aiomqtt (>=2.3.0,<3.0.0)
 Requires-Dist: citystreetview (==1.2.4)
 Requires-Dist: dashscope (==1.14.1)
+Requires-Dist: faiss-cpu (>=1.9.0.post1,<2.0.0)
 Requires-Dist: fastavro (>=1.10.0,<2.0.0)
 Requires-Dist: geojson (==3.1.0)
 Requires-Dist: gradio (>=5.7.1,<6.0.0)
 Requires-Dist: grpcio (==1.67.1)
+Requires-Dist: langchain-community (>=0.3.13,<0.4.0)
 Requires-Dist: langchain-core (>=0.3.28,<0.4.0)
 Requires-Dist: matplotlib (==3.8.3)
 Requires-Dist: mlflow (>=2.19.0,<3.0.0)
@@ -40,6 +42,8 @@ Requires-Dist: pycityproto (>=2.1.5,<3.0.0)
 Requires-Dist: pyyaml (>=6.0.2,<7.0.0)
 Requires-Dist: ray (>=2.40.0,<3.0.0)
 Requires-Dist: sidecar (==0.7.0)
+Requires-Dist: torch (>=2.5.1,<3.0.0)
+Requires-Dist: transformers (>=4.47.1,<5.0.0)
 Requires-Dist: zhipuai (>=2.1.5.20230904,<3.0.0.0)
 Description-Content-Type: text/markdown

{pycityagent-2.0.0a22.dist-info → pycityagent-2.0.0a24.dist-info}/RECORD RENAMED Viewed

@@ -1,5 +1,5 @@
-pycityagent/__init__.py,sha256=EDxt3Su3lH1IMh9suNw7GeGL7UrXeWiZTw5KWNznDzc,637
-pycityagent/agent.py,sha256=TGW4vyaYBnNxYkr22FhGPwex3dLIeq3F-2rnELidNPA,28670
+pycityagent/__init__.py,sha256=fv0mzNGbHBF6m550yYqnuUpB8iQPWS-7EatYRK7DO4s,693
+pycityagent/agent.py,sha256=l8Oa95_K5JBWKzvZmbQe_QM_E_vaG-YstuuR55kgC6Y,29005
 pycityagent/economy/__init__.py,sha256=aonY4WHnx-6EGJ4WKrx4S-2jAkYNLtqUA04jp6q8B7w,75
 pycityagent/economy/econ_client.py,sha256=GuHK9ZBnhqW3Z7F8ViDJn_iN73yOBbbwFyJv1wLEBDk,12211
 pycityagent/environment/__init__.py,sha256=awHxlOud-btWbk0FCS4RmGJ13W84oVCkbGfcrhKqihA,240
@@ -30,14 +30,15 @@ pycityagent/environment/utils/grpc.py,sha256=6EJwKXXktIWb1NcUiJzIRmfrY0S03QAXXGc
 pycityagent/environment/utils/map_utils.py,sha256=lYOEoCFFK6-e9N5txLMMq4HUlxMqc8Uw1YrGW5oJmgg,5749
 pycityagent/environment/utils/port.py,sha256=3OM6kSUt3PxvDUOlgyiendBtETaWU8Mzk_8H0TzTmYg,295
 pycityagent/environment/utils/protobuf.py,sha256=0BsM_G7x2B_6DMIBHe9bjVuQDOXUytNRQ03g9e05F3c,1170
-pycityagent/llm/__init__.py,sha256=7klKEmCcDWJIu-F4DoAukSuKfDbLhdczrSIhpwow-sY,145
-pycityagent/llm/embedding.py,sha256=2psX_EK67oPlYe77g43EYUYams4M9AiJqxpHTFHG0n8,4253
+pycityagent/llm/__init__.py,sha256=iWs6FLgrbRVIiqOf4ILS89gkVCTvS7HFC3vG-MWuyko,205
+pycityagent/llm/embeddings.py,sha256=Nhf_tUIlaYJAZ93wW2QTCtS1wq7e8fUgdn2JketEAuQ,7600
 pycityagent/llm/llm.py,sha256=vJaaGqVuyV-GlBxrnvGKZnMDlxeTT_sGUTdxz5tYwEE,15141
 pycityagent/llm/llmconfig.py,sha256=4Ylf4OFSBEFy8jrOneeX0HvPhWEaF5jGvy1HkXK08Ro,436
 pycityagent/llm/utils.py,sha256=hoNPhvomb1u6lhFX0GctFipw74hVKb7bvUBDqwBzBYw,160
-pycityagent/memory/__init__.py,sha256=Hs2NhYpIG-lvpwPWwj4DydB1sxtjz7cuA4iDAzCXnjI,243
+pycityagent/memory/__init__.py,sha256=_Vfdo1HcLWsuuz34_i8e91nnLVYADpMlHHSVaB3xgIk,297
 pycityagent/memory/const.py,sha256=6zpJPJXWoH9-yf4RARYYff586agCoud9BRn7sPERB1g,932
-pycityagent/memory/memory.py,sha256=vJxHOI74aJDGZPFu2LbBr02ASfOYpig66fto6Gjr-6Q,18191
+pycityagent/memory/faiss_query.py,sha256=Z0JS4udyPYCIzHMq464QtHscnswu35gh9fQptikAwkQ,12976
+pycityagent/memory/memory.py,sha256=UBh4yANNHDzYZwrsvyX4ZMSHXINbu1U6g0HLNCOOCk8,17883
 pycityagent/memory/memory_base.py,sha256=QG_j3BxZvkadFEeE3uBR_kjl_xcXD1aHUVs8GEF3d6w,5654
 pycityagent/memory/profile.py,sha256=q8ZS9IBmHCg_X1GONUvXK85P6tCepTKQgXKuvuXYNXw,5203
 pycityagent/memory/self_define.py,sha256=vpZ6CIxR2grNXEIOScdpsSc59FBg0mOKelwQuTElbtQ,5200
@@ -49,8 +50,8 @@ pycityagent/metrics/__init__.py,sha256=X08PaBbGVAd7_PRGLREXWxaqm7nS82WBQpD1zvQzc
 pycityagent/metrics/mlflow_client.py,sha256=g_tHxWkWTDijtbGL74-HmiYzWVKb1y8-w12QrY9jL30,4449
 pycityagent/metrics/utils/const.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 pycityagent/simulation/__init__.py,sha256=P5czbcg2d8S0nbbnsQXFIhwzO4CennAhZM8OmKvAeYw,194
-pycityagent/simulation/agentgroup.py,sha256=5p68wNoEaog4nDym3xsCTporBWmxNiQ1crN3mbOHFsE,19788
-pycityagent/simulation/simulation.py,sha256=7Go_RkpkC_DuBWW21JPqlV2yXY754RqSkqzM0vTdteU,23008
+pycityagent/simulation/agentgroup.py,sha256=r8arCAQkKMhv3yr35XsYJL-MfG6o6rWwHItBmxfDtA4,20589
+pycityagent/simulation/simulation.py,sha256=9kkdgXSEOAN8wiewVFyORksti4IdVNU0opObV6ZYa9k,23344
 pycityagent/simulation/storage/pg.py,sha256=Ws04mUgRcbbvWi_eQm3PXYa6w7AQUbDPWhSU7HFtsD8,6026
 pycityagent/survey/__init__.py,sha256=rxwou8U9KeFSP7rMzXtmtp2fVFZxK4Trzi-psx9LPIs,153
 pycityagent/survey/manager.py,sha256=S5IkwTdelsdtZETChRcfCEczzwSrry_Fly9MY4s3rbk,1681
@@ -69,6 +70,6 @@ pycityagent/workflow/block.py,sha256=l-z9iJo9_USZQRyj4TLMfihK0-tnNDG0a6jVk9WhG0o
 pycityagent/workflow/prompt.py,sha256=6jI0Rq54JLv3-IXqZLYug62vse10wTI83xvf4ZX42nk,2929
 pycityagent/workflow/tool.py,sha256=xADxhNgVsjNiMxlhdwn3xGUstFOkLEG8P67ez8VmwSI,8555
 pycityagent/workflow/trigger.py,sha256=Df-MOBEDWBbM-v0dFLQLXteLsipymT4n8vqexmK2GiQ,5643
-pycityagent-2.0.0a22.dist-info/METADATA,sha256=s_gC55n1d1ZUyt1kRcYhl7h9Ymp8BQQKXZHrg93V8sg,7848
-pycityagent-2.0.0a22.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
-pycityagent-2.0.0a22.dist-info/RECORD,,
+pycityagent-2.0.0a24.dist-info/METADATA,sha256=cHowSJH9VJmum92fAEfRvQYtWmbCJRnVgOmI2JZDlqw,8033
+pycityagent-2.0.0a24.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
+pycityagent-2.0.0a24.dist-info/RECORD,,

pycityagent/llm/embedding.py DELETED Viewed

@@ -1,136 +0,0 @@
-"""简单的基于内存的embedding实现"""
-import numpy as np
-import hashlib
-import json
-class SimpleEmbedding:
-    """简单的基于内存的embedding实现
-    使用简单的词袋模型(Bag of Words)和TF-IDF来生成文本的向量表示。
-    所有向量都保存在内存中，适用于小规模应用。
-    """
-    def __init__(self, vector_dim: int = 128, cache_size: int = 1000):
-        """初始化
-        Args:
-            vector_dim: 向量维度
-            cache_size: 缓存大小，超过此大小将清除最早的缓存
-        """
-        self.vector_dim = vector_dim
-        self.cache_size = cache_size
-        self._cache: dict[str, np.ndarray] = {}
-        self._vocab: dict[str, int] = {}  # 词汇表
-        self._idf: dict[str, float] = {}  # 逆文档频率
-        self._doc_count = 0  # 文档总数
-    def _text_to_hash(self, text: str) -> str:
-        """将文本转换为hash值"""
-        return hashlib.md5(text.encode()).hexdigest()
-    def _tokenize(self, text: str) -> list[str]:
-        """简单的分词"""
-        # 这里使用简单的空格分词，实际应用中可以使用更复杂的分词方法
-        return text.lower().split()
-    def _update_vocab(self, tokens: list[str]):
-        """更新词汇表"""
-        for token in set(tokens):  # 使用set去重
-            if token not in self._vocab:
-                self._vocab[token] = len(self._vocab)
-    def _update_idf(self, tokens: list[str]):
-        """更新IDF值"""
-        self._doc_count += 1
-        unique_tokens = set(tokens)
-        for token in unique_tokens:
-            self._idf[token] = self._idf.get(token, 0) + 1
-    def _calculate_tf(self, tokens: list[str]) -> dict[str, float]:
-        """计算词频(TF)"""
-        tf = {}
-        total_tokens = len(tokens)
-        for token in tokens:
-            tf[token] = tf.get(token, 0) + 1
-        # 归一化
-        for token in tf:
-            tf[token] /= total_tokens
-        return tf
-    def _calculate_tfidf(self, tokens: list[str]) -> np.ndarray:
-        """计算TF-IDF向量"""
-        vector = np.zeros(self.vector_dim)
-        tf = self._calculate_tf(tokens)
-        for token, tf_value in tf.items():
-            if token in self._idf:
-                idf = np.log(self._doc_count / self._idf[token])
-                idx = self._vocab[token] % self.vector_dim  # 使用取模运算来控制向量维度
-                vector[idx] += tf_value * idf
-        # L2归一化
-        norm = np.linalg.norm(vector)
-        if norm > 0:
-            vector /= norm
-        return vector
-    async def embed(self, text: str) -> np.ndarray:
-        """生成文本的向量表示
-        Args:
-            text: 输入文本
-        Returns:
-            np.ndarray: 文本的向量表示
-        """
-        # 检查缓存
-        text_hash = self._text_to_hash(text)
-        if text_hash in self._cache:
-            return self._cache[text_hash]
-        # 分词
-        tokens = self._tokenize(text)
-        if not tokens:
-            return np.zeros(self.vector_dim)
-        # 更新词汇表和IDF
-        self._update_vocab(tokens)
-        self._update_idf(tokens)
-        # 计算向量
-        vector = self._calculate_tfidf(tokens)
-        # 更新缓存
-        if len(self._cache) >= self.cache_size:
-            # 删除最早的缓存
-            oldest_key = next(iter(self._cache))
-            del self._cache[oldest_key]
-        self._cache[text_hash] = vector
-        return vector
-    def save(self, file_path: str):
-        """保存模型"""
-        state = {
-            "vector_dim": self.vector_dim,
-            "cache_size": self.cache_size,
-            "vocab": self._vocab,
-            "idf": self._idf,
-            "doc_count": self._doc_count,
-        }
-        with open(file_path, "w") as f:
-            json.dump(state, f)
-    def load(self, file_path: str):
-        """加载模型"""
-        with open(file_path, "r") as f:
-            state = json.load(f)
-        self.vector_dim = state["vector_dim"]
-        self.cache_size = state["cache_size"]
-        self._vocab = state["vocab"]
-        self._idf = state["idf"]
-        self._doc_count = state["doc_count"]
-        self._cache = {}  # 清空缓存

{pycityagent-2.0.0a22.dist-info → pycityagent-2.0.0a24.dist-info}/WHEEL RENAMED Viewed

File without changes

pycityagent 2.0.0a22__py3-none-any.whl → 2.0.0a24__py3-none-any.whl

pycityagent 2.0.0a22__py3-none-any.whl → 2.0.0a24__py3-none-any.whl