pycityagent 2.0.0a21__py3-none-any.whl → 2.0.0a24__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
pycityagent/__init__.py CHANGED
@@ -5,6 +5,7 @@ Pycityagent: 城市智能体构建框架
5
5
  from .agent import Agent, CitizenAgent, InstitutionAgent
6
6
  from .environment import Simulator
7
7
  import logging
8
+ from .llm import SentenceEmbedding
8
9
 
9
10
  # 创建一个 pycityagent 记录器
10
11
  logger = logging.getLogger("pycityagent")
@@ -19,4 +20,4 @@ if not logger.hasHandlers():
19
20
  handler.setFormatter(formatter)
20
21
  logger.addHandler(handler)
21
22
 
22
- __all__ = ["Agent", "Simulator", "CitizenAgent", "InstitutionAgent"]
23
+ __all__ = ["Agent", "Simulator", "CitizenAgent", "InstitutionAgent","SentenceEmbedding",]
pycityagent/agent.py CHANGED
@@ -236,7 +236,15 @@ class Agent(ABC):
236
236
 
237
237
  # 添加记忆上下文
238
238
  if self._memory:
239
- relevant_memories = await self._memory.search(survey_prompt)
239
+ relevant_memories = await self.memory.search(survey_prompt)
240
+
241
+ formatted_results = []
242
+ # for result in top_results:
243
+ # formatted_results.append(
244
+ # f"- [{result['type']}] {result['content']} "
245
+ # f"(相关度: {result['similarity']:.2f})"
246
+ # )
247
+
240
248
  if relevant_memories:
241
249
  dialog.append(
242
250
  {
@@ -458,13 +466,18 @@ class Agent(ABC):
458
466
  topic = f"exps/{self._exp_id}/agents/{to_agent_uuid}/{sub_topic}"
459
467
  await self._messager.send_message(topic, payload)
460
468
 
461
- async def send_message_to_agent(self, to_agent_uuid: str, content: str):
469
+ async def send_message_to_agent(
470
+ self, to_agent_uuid: str, content: str, type: str = "social"
471
+ ):
462
472
  """通过 Messager 发送消息"""
463
473
  if self._messager is None:
464
474
  raise RuntimeError("Messager is not set")
475
+ if type not in ["social", "economy"]:
476
+ logger.warning(f"Invalid message type: {type}, sent from {self._uuid}")
465
477
  payload = {
466
478
  "from": self._uuid,
467
479
  "content": content,
480
+ "type": type,
468
481
  "timestamp": int(datetime.now().timestamp() * 1000),
469
482
  "day": await self.simulator.get_simulator_day(),
470
483
  "t": await self.simulator.get_simulator_second_from_start_of_day(),
@@ -485,11 +498,11 @@ class Agent(ABC):
485
498
  auros.append(_message_dict)
486
499
  pg_list.append((_message_dict, _date_time))
487
500
  # Avro
488
- if self._avro_file is not None:
501
+ if self._avro_file is not None and type == "social":
489
502
  with open(self._avro_file["dialog"], "a+b") as f:
490
503
  fastavro.writer(f, DIALOG_SCHEMA, auros, codec="snappy")
491
504
  # Pg
492
- if self._pgsql_writer is not None:
505
+ if self._pgsql_writer is not None and type == "social":
493
506
  if self._last_asyncio_pg_task is not None:
494
507
  await self._last_asyncio_pg_task
495
508
  _keys = ["id", "day", "t", "type", "speaker", "content", "created_at"]
@@ -595,6 +608,7 @@ class CitizenAgent(Agent):
595
608
  # 防止模拟器还没有到prepare阶段导致get_person出错
596
609
  self._has_bound_to_simulator = True
597
610
  self._agent_id = person_id
611
+ self.memory.set_agent_id(person_id)
598
612
 
599
613
  async def _bind_to_economy(self):
600
614
  if self._economy_client is None:
@@ -1,4 +1,5 @@
1
- from typing import Any, Awaitable, Coroutine, cast, Union, Dict
1
+ from collections.abc import Awaitable, Coroutine
2
+ from typing import Any, Dict, Union, cast
2
3
 
3
4
  import grpc
4
5
  from google.protobuf.json_format import ParseDict
@@ -1,4 +1,5 @@
1
- from typing import Any, Awaitable, Coroutine, cast, Union, Dict
1
+ from collections.abc import Awaitable, Coroutine
2
+ from typing import Any, Dict, Union, cast
2
3
 
3
4
  import grpc
4
5
  from google.protobuf.json_format import ParseDict
@@ -1,4 +1,5 @@
1
- from typing import Any, Awaitable, Coroutine, cast, Union, Dict
1
+ from typing import Any, cast, Union
2
+ from collections.abc import Awaitable, Coroutine
2
3
 
3
4
  import grpc
4
5
  from google.protobuf.json_format import ParseDict
@@ -25,7 +26,7 @@ class EconomyPersonService:
25
26
  self,
26
27
  req: Union[person_service.GetPersonRequest, dict],
27
28
  dict_return: bool = True,
28
- ) -> Coroutine[Any, Any, Union[Dict[str, Any], person_service.GetPersonResponse]]:
29
+ ) -> Coroutine[Any, Any, Union[dict[str, Any], person_service.GetPersonResponse]]:
29
30
  """
30
31
  批量查询人的经济情况(资金、雇佣关系)
31
32
  Query person’s economic situation (funds, employment relationship) in batches
@@ -48,7 +49,7 @@ class EconomyPersonService:
48
49
  req: Union[person_service.UpdatePersonMoneyRequest, dict],
49
50
  dict_return: bool = True,
50
51
  ) -> Coroutine[
51
- Any, Any, Union[Dict[str, Any], person_service.UpdatePersonMoneyResponse]
52
+ Any, Any, Union[dict[str, Any], person_service.UpdatePersonMoneyResponse]
52
53
  ]:
53
54
  """
54
55
  批量修改人的资金
@@ -80,7 +81,7 @@ class EconomyOrgService:
80
81
 
81
82
  def GetOrg(
82
83
  self, req: Union[org_service.GetOrgRequest, dict], dict_return: bool = True
83
- ) -> Coroutine[Any, Any, Union[Dict[str, Any], org_service.GetOrgResponse]]:
84
+ ) -> Coroutine[Any, Any, Union[dict[str, Any], org_service.GetOrgResponse]]:
84
85
  """
85
86
  批量查询组织的经济情况(员工、岗位、资金、货物)
86
87
  Query the economic status of the organization (employees, positions, funds, goods) in batches
@@ -100,7 +101,7 @@ class EconomyOrgService:
100
101
  self,
101
102
  req: Union[org_service.UpdateOrgMoneyRequest, dict],
102
103
  dict_return: bool = True,
103
- ) -> Coroutine[Any, Any, Union[Dict[str, Any], org_service.UpdateOrgMoneyResponse]]:
104
+ ) -> Coroutine[Any, Any, Union[dict[str, Any], org_service.UpdateOrgMoneyResponse]]:
104
105
  """
105
106
  批量修改组织的资金
106
107
  Modify organization’s money in batches
@@ -123,7 +124,7 @@ class EconomyOrgService:
123
124
  self,
124
125
  req: Union[org_service.UpdateOrgGoodsRequest, dict],
125
126
  dict_return: bool = True,
126
- ) -> Coroutine[Any, Any, Union[Dict[str, Any], org_service.UpdateOrgGoodsResponse]]:
127
+ ) -> Coroutine[Any, Any, Union[dict[str, Any], org_service.UpdateOrgGoodsResponse]]:
127
128
  """
128
129
  批量修改组织的货物
129
130
  Modify organization’s goods in batches
@@ -147,7 +148,7 @@ class EconomyOrgService:
147
148
  req: Union[org_service.UpdateOrgEmployeeRequest, dict],
148
149
  dict_return: bool = True,
149
150
  ) -> Coroutine[
150
- Any, Any, Union[Dict[str, Any], org_service.UpdateOrgEmployeeResponse]
151
+ Any, Any, Union[dict[str, Any], org_service.UpdateOrgEmployeeResponse]
151
152
  ]:
152
153
  """
153
154
  批量修改组织的员工
@@ -171,7 +172,7 @@ class EconomyOrgService:
171
172
  self,
172
173
  req: Union[org_service.UpdateOrgJobRequest, dict],
173
174
  dict_return: bool = True,
174
- ) -> Coroutine[Any, Any, Union[Dict[str, Any], org_service.UpdateOrgJobResponse]]:
175
+ ) -> Coroutine[Any, Any, Union[dict[str, Any], org_service.UpdateOrgJobResponse]]:
175
176
  """
176
177
  批量修改组织的岗位
177
178
  Modify organization’s jobs in batches
@@ -1,4 +1,5 @@
1
- from typing import Any, Awaitable, Coroutine, cast, Union, Dict
1
+ from typing import Any,cast, Union
2
+ from collections.abc import Awaitable, Coroutine
2
3
 
3
4
  import grpc
4
5
  from google.protobuf.json_format import ParseDict
@@ -21,7 +22,7 @@ class LaneService:
21
22
 
22
23
  def GetLane(
23
24
  self, req: Union[lane_service.GetLaneRequest, dict], dict_return: bool = True
24
- ) -> Coroutine[Any, Any, Union[Dict[str, Any], lane_service.GetLaneResponse]]:
25
+ ) -> Coroutine[Any, Any, Union[dict[str, Any], lane_service.GetLaneResponse]]:
25
26
  """
26
27
  获取Lane的信息
27
28
  Get Lane's information
@@ -41,7 +42,7 @@ class LaneService:
41
42
  self,
42
43
  req: Union[lane_service.SetLaneMaxVRequest, dict],
43
44
  dict_return: bool = True,
44
- ) -> Coroutine[Any, Any, Union[Dict[str, Any], lane_service.SetLaneMaxVResponse]]:
45
+ ) -> Coroutine[Any, Any, Union[dict[str, Any], lane_service.SetLaneMaxVResponse]]:
45
46
  """
46
47
  设置Lane的最大速度(限速)
47
48
  Set the maximum speed of Lane (speed limit)
@@ -64,7 +65,7 @@ class LaneService:
64
65
  req: Union[lane_service.SetLaneRestrictionRequest, dict],
65
66
  dict_return: bool = True,
66
67
  ) -> Coroutine[
67
- Any, Any, Union[Dict[str, Any], lane_service.SetLaneRestrictionResponse]
68
+ Any, Any, Union[dict[str, Any], lane_service.SetLaneRestrictionResponse]
68
69
  ]:
69
70
  """
70
71
  设置Lane的限制
@@ -89,7 +90,7 @@ class LaneService:
89
90
  req: Union[lane_service.GetLaneByLongLatBBoxRequest, dict],
90
91
  dict_return: bool = True,
91
92
  ) -> Coroutine[
92
- Any, Any, Union[Dict[str, Any], lane_service.GetLaneByLongLatBBoxResponse]
93
+ Any, Any, Union[dict[str, Any], lane_service.GetLaneByLongLatBBoxResponse]
93
94
  ]:
94
95
  """
95
96
  获取特定区域内的Lane的信息
@@ -1,9 +1,11 @@
1
- from typing import Any, Awaitable, Coroutine, cast, Union, Dict
1
+ from collections.abc import Awaitable, Coroutine
2
+ from typing import Any, Union, cast
2
3
 
3
4
  import grpc
4
5
  from google.protobuf.json_format import ParseDict
5
6
  from pycityproto.city.map.v2 import traffic_light_service_pb2 as light_service
6
- from pycityproto.city.map.v2 import traffic_light_service_pb2_grpc as light_grpc
7
+ from pycityproto.city.map.v2 import \
8
+ traffic_light_service_pb2_grpc as light_grpc
7
9
 
8
10
  from ..utils.protobuf import async_parse
9
11
 
@@ -21,10 +23,10 @@ class LightService:
21
23
 
22
24
  def GetTrafficLight(
23
25
  self,
24
- req: Union[light_service.GetTrafficLightRequest, Dict[str, Any]],
26
+ req: Union[light_service.GetTrafficLightRequest, dict[str, Any]],
25
27
  dict_return: bool = True,
26
28
  ) -> Coroutine[
27
- Any, Any, Union[Dict[str, Any], light_service.GetTrafficLightResponse]
29
+ Any, Any, Union[dict[str, Any], light_service.GetTrafficLightResponse]
28
30
  ]:
29
31
  """
30
32
  获取路口的红绿灯信息
@@ -46,10 +48,10 @@ class LightService:
46
48
 
47
49
  def SetTrafficLight(
48
50
  self,
49
- req: Union[light_service.SetTrafficLightRequest, Dict[str, Any]],
51
+ req: Union[light_service.SetTrafficLightRequest, dict[str, Any]],
50
52
  dict_return: bool = True,
51
53
  ) -> Coroutine[
52
- Any, Any, Union[Dict[str, Any], light_service.SetTrafficLightResponse]
54
+ Any, Any, Union[dict[str, Any], light_service.SetTrafficLightResponse]
53
55
  ]:
54
56
  """
55
57
  设置路口的红绿灯信息
@@ -74,7 +76,7 @@ class LightService:
74
76
  req: Union[light_service.SetTrafficLightPhaseRequest, dict],
75
77
  dict_return: bool = True,
76
78
  ) -> Coroutine[
77
- Any, Any, Union[Dict[str, Any], light_service.SetTrafficLightPhaseResponse]
79
+ Any, Any, Union[dict[str, Any], light_service.SetTrafficLightPhaseResponse]
78
80
  ]:
79
81
  """
80
82
  设置路口的红绿灯相位
@@ -99,7 +101,7 @@ class LightService:
99
101
  req: Union[light_service.SetTrafficLightStatusRequest, dict],
100
102
  dict_return: bool = True,
101
103
  ) -> Coroutine[
102
- Any, Any, Union[Dict[str, Any], light_service.SetTrafficLightStatusResponse]
104
+ Any, Any, Union[dict[str, Any], light_service.SetTrafficLightStatusResponse]
103
105
  ]:
104
106
  """
105
107
  设置路口的红绿灯状态
@@ -1,5 +1,6 @@
1
1
  import warnings
2
- from typing import Any, Awaitable, Coroutine, Dict, Union, cast
2
+ from collections.abc import Awaitable, Coroutine
3
+ from typing import Any, Union, cast
3
4
 
4
5
  import grpc
5
6
  from google.protobuf.json_format import ParseDict
@@ -51,7 +52,7 @@ class PersonService:
51
52
  self,
52
53
  req: Union[person_service.GetPersonRequest, dict],
53
54
  dict_return: bool = True,
54
- ) -> Coroutine[Any, Any, Union[Dict[str, Any], person_service.GetPersonResponse]]:
55
+ ) -> Coroutine[Any, Any, Union[dict[str, Any], person_service.GetPersonResponse]]:
55
56
  """
56
57
  获取person信息
57
58
  Get person information
@@ -73,7 +74,7 @@ class PersonService:
73
74
  self,
74
75
  req: Union[person_service.AddPersonRequest, dict],
75
76
  dict_return: bool = True,
76
- ) -> Coroutine[Any, Any, Union[Dict[str, Any], person_service.AddPersonResponse]]:
77
+ ) -> Coroutine[Any, Any, Union[dict[str, Any], person_service.AddPersonResponse]]:
77
78
  """
78
79
  新增person
79
80
  Add a new person
@@ -95,7 +96,7 @@ class PersonService:
95
96
  self,
96
97
  req: Union[person_service.SetScheduleRequest, dict],
97
98
  dict_return: bool = True,
98
- ) -> Coroutine[Any, Any, Union[Dict[str, Any], person_service.SetScheduleResponse]]:
99
+ ) -> Coroutine[Any, Any, Union[dict[str, Any], person_service.SetScheduleResponse]]:
99
100
  """
100
101
  修改person的schedule
101
102
  set person's schedule
@@ -118,7 +119,7 @@ class PersonService:
118
119
  self,
119
120
  req: Union[person_service.GetPersonsRequest, dict],
120
121
  dict_return: bool = True,
121
- ) -> Coroutine[Any, Any, Union[Dict[str, Any], person_service.GetPersonsResponse]]:
122
+ ) -> Coroutine[Any, Any, Union[dict[str, Any], person_service.GetPersonsResponse]]:
122
123
  """
123
124
  获取多个person信息
124
125
  Get information of multiple persons
@@ -142,7 +143,7 @@ class PersonService:
142
143
  req: Union[person_service.GetPersonByLongLatBBoxRequest, dict],
143
144
  dict_return: bool = True,
144
145
  ) -> Coroutine[
145
- Any, Any, Union[Dict[str, Any], person_service.GetPersonByLongLatBBoxResponse]
146
+ Any, Any, Union[dict[str, Any], person_service.GetPersonByLongLatBBoxResponse]
146
147
  ]:
147
148
  """
148
149
  获取特定区域内的person
@@ -167,7 +168,7 @@ class PersonService:
167
168
  req: Union[person_service.GetAllVehiclesRequest, dict],
168
169
  dict_return: bool = True,
169
170
  ) -> Coroutine[
170
- Any, Any, Union[Dict[str, Any], person_service.GetAllVehiclesResponse]
171
+ Any, Any, Union[dict[str, Any], person_service.GetAllVehiclesResponse]
171
172
  ]:
172
173
  """
173
174
  获取所有车辆
@@ -192,7 +193,7 @@ class PersonService:
192
193
  req: Union[person_service.ResetPersonPositionRequest, dict],
193
194
  dict_return: bool = True,
194
195
  ) -> Coroutine[
195
- Any, Any, Union[Dict[str, Any], person_service.ResetPersonPositionResponse]
196
+ Any, Any, Union[dict[str, Any], person_service.ResetPersonPositionResponse]
196
197
  ]:
197
198
  """
198
199
  重置人的位置(将停止当前正在进行的出行,转为sleep状态)
@@ -219,7 +220,7 @@ class PersonService:
219
220
  req: Union[person_service.SetControlledVehicleIDsRequest, dict],
220
221
  dict_return: bool = True,
221
222
  ) -> Coroutine[
222
- Any, Any, Union[Dict[str, Any], person_service.SetControlledVehicleIDsResponse]
223
+ Any, Any, Union[dict[str, Any], person_service.SetControlledVehicleIDsResponse]
223
224
  ]:
224
225
  """
225
226
  设置由外部控制行为的vehicle
@@ -246,7 +247,7 @@ class PersonService:
246
247
  ) -> Coroutine[
247
248
  Any,
248
249
  Any,
249
- Union[Dict[str, Any], person_service.FetchControlledVehicleEnvsResponse],
250
+ Union[dict[str, Any], person_service.FetchControlledVehicleEnvsResponse],
250
251
  ]:
251
252
  """
252
253
  获取由外部控制行为的vehicle的环境信息
@@ -273,7 +274,7 @@ class PersonService:
273
274
  ) -> Coroutine[
274
275
  Any,
275
276
  Any,
276
- Union[Dict[str, Any], person_service.SetControlledVehicleActionsResponse],
277
+ Union[dict[str, Any], person_service.SetControlledVehicleActionsResponse],
277
278
  ]:
278
279
  """
279
280
  设置由外部控制行为的vehicle的行为
@@ -1,4 +1,5 @@
1
- from typing import Any, Awaitable, Coroutine, cast, Union, Dict
1
+ from collections.abc import Awaitable, Coroutine
2
+ from typing import Any, Union, cast
2
3
 
3
4
  import grpc
4
5
  from google.protobuf.json_format import ParseDict
@@ -21,7 +22,7 @@ class RoadService:
21
22
 
22
23
  def GetRoad(
23
24
  self, req: Union[road_service.GetRoadRequest, dict], dict_return: bool = True
24
- ) -> Coroutine[Any, Any, Union[Dict[str, Any], road_service.GetRoadResponse]]:
25
+ ) -> Coroutine[Any, Any, Union[dict[str, Any], road_service.GetRoadResponse]]:
25
26
  """
26
27
  查询道路信息
27
28
  Query road information
@@ -1,4 +1,5 @@
1
- from typing import Any, Awaitable, Coroutine, cast, Union, Dict
1
+ from collections.abc import Awaitable, Coroutine
2
+ from typing import Any, Union, cast
2
3
 
3
4
  import grpc
4
5
  from google.protobuf.json_format import ParseDict
@@ -21,7 +22,7 @@ class SocialService:
21
22
 
22
23
  def Send(
23
24
  self, req: Union[social_service.SendRequest, dict], dict_return: bool = True
24
- ) -> Coroutine[Any, Any, Union[Dict[str, Any], social_service.SendResponse]]:
25
+ ) -> Coroutine[Any, Any, Union[dict[str, Any], social_service.SendResponse]]:
25
26
  """
26
27
  发送消息
27
28
  Send message
@@ -39,7 +40,7 @@ class SocialService:
39
40
 
40
41
  def Receive(
41
42
  self, req: Union[social_service.ReceiveRequest, dict], dict_return: bool = True
42
- ) -> Coroutine[Any, Any, Union[Dict[str, Any], social_service.ReceiveResponse]]:
43
+ ) -> Coroutine[Any, Any, Union[dict[str, Any], social_service.ReceiveResponse]]:
43
44
  """
44
45
  接收消息
45
46
  Receive message
@@ -1,13 +1,15 @@
1
- from typing import Any, Awaitable, TypeVar, Union, Dict
2
- from google.protobuf.message import Message
1
+ from collections.abc import Awaitable
2
+ from typing import Any, TypeVar, Union
3
+
3
4
  from google.protobuf.json_format import MessageToDict
5
+ from google.protobuf.message import Message
4
6
 
5
7
  __all__ = ["parse", "async_parse"]
6
8
 
7
9
  T = TypeVar("T", bound=Message)
8
10
 
9
11
 
10
- def parse(res: T, dict_return: bool) -> Union[Dict[str, Any], T]:
12
+ def parse(res: T, dict_return: bool) -> Union[dict[str, Any], T]:
11
13
  """
12
14
  将Protobuf返回值转换为dict或者原始值
13
15
  Convert Protobuf return value to dict or original value
@@ -23,7 +25,7 @@ def parse(res: T, dict_return: bool) -> Union[Dict[str, Any], T]:
23
25
  return res
24
26
 
25
27
 
26
- async def async_parse(res: Awaitable[T], dict_return: bool) -> Union[Dict[str, Any], T]:
28
+ async def async_parse(res: Awaitable[T], dict_return: bool) -> Union[dict[str, Any], T]:
27
29
  """
28
30
  将Protobuf await返回值转换为dict或者原始值
29
31
  Convert Protobuf await return value to dict or original value
@@ -1,6 +1,11 @@
1
1
  """LLM相关模块"""
2
2
 
3
+ from .embeddings import SentenceEmbedding, SimpleEmbedding
3
4
  from .llm import LLM, LLMConfig
4
- from .embedding import SimpleEmbedding
5
5
 
6
- __all__ = ["LLM", "LLMConfig", "SimpleEmbedding"]
6
+ __all__ = [
7
+ "LLM",
8
+ "LLMConfig",
9
+ "SentenceEmbedding",
10
+ "SimpleEmbedding",
11
+ ]
@@ -0,0 +1,231 @@
1
+ import hashlib
2
+ import json
3
+ import os
4
+ from typing import Optional, Union
5
+
6
+ import numpy as np
7
+ import torch
8
+ from langchain_core.embeddings import Embeddings
9
+ from transformers import AutoModel, AutoTokenizer
10
+
11
+ __all__ = [
12
+ "SentenceEmbedding",
13
+ "SimpleEmbedding",
14
+ ]
15
+
16
+
17
class SentenceEmbedding(Embeddings):
    """Dense sentence embeddings computed with a Hugging Face transformer.

    Each text is tokenized, passed through the model, and represented by the
    L2-normalized hidden state of its first token (CLS pooling).
    """

    def __init__(
        self,
        pretrained_model_name_or_path: Union[str, os.PathLike] = "BAAI/bge-m3",
        max_seq_len: int = 8192,
        auto_cuda: bool = False,
        local_files_only: bool = False,
        cache_dir: str = "./cache",
        proxies: Optional[dict] = None,
    ):
        """Load the tokenizer and model and put the model in eval mode.

        Args:
            pretrained_model_name_or_path: Model identifier or local path,
                forwarded to ``from_pretrained``.
            max_seq_len: Largest tokenized sequence length ``_embed`` accepts.
            auto_cuda: Move the model to CUDA when ``torch.cuda.is_available()``.
            local_files_only: Forwarded to ``from_pretrained``.
            cache_dir: Directory (created if missing) forwarded to
                ``from_pretrained`` as ``cache_dir``.
            proxies: Forwarded to ``from_pretrained``.
        """
        os.makedirs(cache_dir, exist_ok=True)
        self.tokenizer = AutoTokenizer.from_pretrained(
            pretrained_model_name_or_path,
            proxies=proxies,
            cache_dir=cache_dir,
            local_files_only=local_files_only,
        )
        self.model = AutoModel.from_pretrained(
            pretrained_model_name_or_path,
            proxies=proxies,
            cache_dir=cache_dir,
            local_files_only=local_files_only,
        )
        # CUDA is used only when explicitly requested AND actually available.
        self._cuda = auto_cuda and torch.cuda.is_available()

        if self._cuda:
            self.model = self.model.cuda()

        self.model.eval()
        self.max_seq_len = max_seq_len

    def _embed(self, texts: list[str]) -> list[list[float]]:
        """Embed a batch of texts.

        Args:
            texts: Texts to embed.

        Returns:
            One L2-normalized embedding (list of floats) per input text; an
            empty list when ``texts`` is empty.

        Raises:
            ValueError: If the padded, tokenized batch is longer than
                ``max_seq_len``.
        """
        # Nothing to tokenize: return early instead of handing the tokenizer
        # an empty batch.
        if not texts:
            return []

        # Tokenize sentences.
        # NOTE: for s2p (short query to long passage) retrieval, an
        # instruction may be prepended to queries (not to passages).
        encoded_input = self.tokenizer(
            texts, padding=True, truncation=True, return_tensors="pt"
        )

        # Validate explicitly rather than with ``assert``, which is removed
        # when Python runs with -O.
        seq_len = encoded_input["input_ids"].shape[1]  # type: ignore
        if seq_len > self.max_seq_len:
            raise ValueError(
                f"Tokenized input length {seq_len} exceeds "
                f"max_seq_len {self.max_seq_len}"
            )

        if self._cuda:
            encoded_input = {k: v.cuda() for k, v in encoded_input.items()}

        # Compute token embeddings without tracking gradients.
        with torch.no_grad():
            model_output = self.model(**encoded_input)
            # CLS pooling: take the first token's hidden state.
            sentence_embeddings = model_output[0][:, 0]

        # Normalize embeddings to unit L2 norm.
        sentence_embeddings = torch.nn.functional.normalize(
            sentence_embeddings, p=2, dim=1
        )
        if self._cuda:
            sentence_embeddings = sentence_embeddings.cpu()
        return sentence_embeddings.tolist()

    def embed_documents(self, texts: list[str]) -> list[list[float]]:
        """Embed documents."""
        return self._embed(texts)

    def embed_query(self, text: str) -> list[float]:
        """Embed query text."""
        return self._embed([text])[0]
82
+
83
+
84
class SimpleEmbedding(Embeddings):
    """Simple in-memory embedding implementation.

    Builds text vectors from a bag-of-words model weighted by TF-IDF.
    All state (vocabulary, document frequencies, cache) lives in memory, so
    this is intended for small-scale use.
    """

    def __init__(self, vector_dim: int = 128, cache_size: int = 1000):
        """Initialize the embedder.

        Args:
            vector_dim: Dimension of the produced vectors; must be positive.
            cache_size: Maximum number of cached embeddings. When the cache is
                full the oldest entry is evicted. A value <= 0 disables
                caching.

        Raises:
            ValueError: If ``vector_dim`` is not positive.
        """
        # Token indices are folded into the vector with ``% vector_dim``, so a
        # non-positive dimension can never work; fail at construction time.
        if vector_dim <= 0:
            raise ValueError("vector_dim must be a positive integer")
        self.vector_dim = vector_dim
        self.cache_size = cache_size
        self._cache: dict[str, list[float]] = {}
        self._vocab: dict[str, int] = {}  # token -> vocabulary index
        self._idf: dict[str, float] = {}  # token -> number of texts containing it
        self._doc_count = 0  # number of texts seen so far

    def _text_to_hash(self, text: str) -> str:
        """Return the hex MD5 digest of ``text``, used as the cache key."""
        # The digest is only a cache key, not a security measure; saying so
        # keeps md5 usable on FIPS-restricted Python builds.
        return hashlib.md5(text.encode(), usedforsecurity=False).hexdigest()

    def _tokenize(self, text: str) -> list[str]:
        """Lower-case ``text`` and split it on whitespace."""
        # Plain whitespace tokenization; a real application may want a
        # more sophisticated tokenizer.
        return text.lower().split()

    def _update_vocab(self, tokens: list[str]):
        """Assign the next free vocabulary index to every unseen token."""
        for token in set(tokens):  # de-duplicate first
            if token not in self._vocab:
                self._vocab[token] = len(self._vocab)

    def _update_idf(self, tokens: list[str]):
        """Count one more text and bump the document frequency of its tokens."""
        self._doc_count += 1
        for token in set(tokens):
            self._idf[token] = self._idf.get(token, 0) + 1

    def _calculate_tf(self, tokens: list[str]) -> dict[str, float]:
        """Return each token's relative frequency within ``tokens``."""
        tf: dict[str, float] = {}
        total_tokens = len(tokens)
        for token in tokens:
            tf[token] = tf.get(token, 0) + 1
        # Normalize counts to frequencies.
        for token in tf:
            tf[token] /= total_tokens
        return tf

    def _calculate_tfidf(self, tokens: list[str]) -> list[float]:
        """Return the L2-normalized TF-IDF vector for ``tokens``."""
        vector = np.zeros(self.vector_dim)

        for token, tf_value in self._calculate_tf(tokens).items():
            if token in self._idf:
                # Smoothed IDF. The plain log(N / df) is 0 for the very first
                # text and for any token present in every text, which made
                # those embeddings all-zero; the smoothed form is always > 0.
                idf = (
                    np.log((1 + self._doc_count) / (1 + self._idf[token])) + 1.0
                )
                # Fold the vocabulary index into the fixed vector dimension.
                idx = self._vocab[token] % self.vector_dim
                vector[idx] += tf_value * idf

        # L2 normalization.
        norm = np.linalg.norm(vector)
        if norm > 0:
            vector /= norm

        # ``tolist`` yields native Python floats, matching the annotation.
        return vector.tolist()

    def _embed(self, text: str) -> list[float]:
        """Generate the vector representation of a text.

        Embedding a non-empty, uncached text also updates the vocabulary and
        document-frequency statistics.

        Args:
            text: Input text.

        Returns:
            The embedding as a list of ``vector_dim`` floats; all zeros when
            the text contains no tokens.
        """
        # Check the cache; hand out a copy so callers cannot corrupt it.
        text_hash = self._text_to_hash(text)
        cached = self._cache.get(text_hash)
        if cached is not None:
            return list(cached)

        # Tokenize.
        tokens = self._tokenize(text)
        if not tokens:
            return [0.0] * self.vector_dim

        # Update vocabulary and document frequencies.
        self._update_vocab(tokens)
        self._update_idf(tokens)

        # Compute the vector.
        vector = self._calculate_tfidf(tokens)

        # Update the cache, evicting the oldest entries (dicts preserve
        # insertion order) when full. A non-positive size disables caching.
        if self.cache_size > 0:
            while len(self._cache) >= self.cache_size:
                del self._cache[next(iter(self._cache))]
            self._cache[text_hash] = vector

        return list(vector)

    def embed_documents(self, texts: list[str]) -> list[list[float]]:
        """Embed documents."""
        return [self._embed(text) for text in texts]

    def embed_query(self, text: str) -> list[float]:
        """Embed query text."""
        return self._embed(text)
198
+
199
+ # def save(self, file_path: str):
200
+ # """保存模型"""
201
+ # state = {
202
+ # "vector_dim": self.vector_dim,
203
+ # "cache_size": self.cache_size,
204
+ # "vocab": self._vocab,
205
+ # "idf": self._idf,
206
+ # "doc_count": self._doc_count,
207
+ # }
208
+ # with open(file_path, "w") as f:
209
+ # json.dump(state, f)
210
+
211
+ # def load(self, file_path: str):
212
+ # """加载模型"""
213
+ # with open(file_path, "r") as f:
214
+ # state = json.load(f)
215
+ # self.vector_dim = state["vector_dim"]
216
+ # self.cache_size = state["cache_size"]
217
+ # self._vocab = state["vocab"]
218
+ # self._idf = state["idf"]
219
+ # self._doc_count = state["doc_count"]
220
+ # self._cache = {} # 清空缓存
221
+
222
+
223
if __name__ == "__main__":
    # Manual smoke check: embed the same query four times and print each
    # result. To try the transformer-backed embedder instead, construct
    #   SentenceEmbedding(
    #       pretrained_model_name_or_path="ignore/BAAI--bge-m3", cache_dir="ignore"
    #   )
    se = SimpleEmbedding()
    for _ in range(4):
        print(se.embed_query("hello world"))