pycityagent 2.0.0a21__py3-none-any.whl → 2.0.0a24__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
- pycityagent/__init__.py +2 -1
- pycityagent/agent.py +18 -4
- pycityagent/environment/sim/aoi_service.py +2 -1
- pycityagent/environment/sim/clock_service.py +2 -1
- pycityagent/environment/sim/economy_services.py +9 -8
- pycityagent/environment/sim/lane_service.py +6 -5
- pycityagent/environment/sim/light_service.py +10 -8
- pycityagent/environment/sim/person_service.py +12 -11
- pycityagent/environment/sim/road_service.py +3 -2
- pycityagent/environment/sim/social_service.py +4 -3
- pycityagent/environment/utils/protobuf.py +6 -4
- pycityagent/llm/__init__.py +7 -2
- pycityagent/llm/embeddings.py +231 -0
- pycityagent/memory/__init__.py +2 -0
- pycityagent/memory/faiss_query.py +302 -0
- pycityagent/memory/memory.py +131 -137
- pycityagent/memory/memory_base.py +7 -6
- pycityagent/memory/profile.py +7 -6
- pycityagent/memory/self_define.py +8 -7
- pycityagent/memory/state.py +7 -6
- pycityagent/memory/utils.py +2 -1
- pycityagent/simulation/agentgroup.py +42 -25
- pycityagent/simulation/simulation.py +9 -1
- pycityagent/utils/parsers/json_parser.py +3 -3
- pycityagent/workflow/block.py +2 -1
- {pycityagent-2.0.0a21.dist-info → pycityagent-2.0.0a24.dist-info}/METADATA +5 -1
- {pycityagent-2.0.0a21.dist-info → pycityagent-2.0.0a24.dist-info}/RECORD +28 -27
- pycityagent/llm/embedding.py +0 -136
- {pycityagent-2.0.0a21.dist-info → pycityagent-2.0.0a24.dist-info}/WHEEL +0 -0
pycityagent/__init__.py
CHANGED
@@ -5,6 +5,7 @@ Pycityagent: 城市智能体构建框架
 from .agent import Agent, CitizenAgent, InstitutionAgent
 from .environment import Simulator
 import logging
+from .llm import SentenceEmbedding
 
 # 创建一个 pycityagent 记录器
 logger = logging.getLogger("pycityagent")
@@ -19,4 +20,4 @@ if not logger.hasHandlers():
     handler.setFormatter(formatter)
     logger.addHandler(handler)
 
-__all__ = ["Agent", "Simulator", "CitizenAgent", "InstitutionAgent"]
+__all__ = ["Agent", "Simulator", "CitizenAgent", "InstitutionAgent","SentenceEmbedding",]
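`SentenceEmbedding` is now re-exported from the package root. A minimal usage sketch, assuming the 2.0.0a24 wheel plus its `torch`/`transformers` dependencies are installed (the first call downloads the default BAAI/bge-m3 model unless a local cache is supplied):

from pycityagent import SentenceEmbedding

# Hedged example; constructor arguments mirror the new pycityagent/llm/embeddings.py shown later in this diff.
embedder = SentenceEmbedding(auto_cuda=True, cache_dir="./cache")
vector = embedder.embed_query("hello world")
print(len(vector))  # dimensionality of the sentence embedding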
pycityagent/agent.py
CHANGED
@@ -236,7 +236,15 @@ class Agent(ABC):
 
         # 添加记忆上下文
         if self._memory:
-            relevant_memories = await self.
+            relevant_memories = await self.memory.search(survey_prompt)
+
+            formatted_results = []
+            # for result in top_results:
+            #     formatted_results.append(
+            #         f"- [{result['type']}] {result['content']} "
+            #         f"(相关度: {result['similarity']:.2f})"
+            #     )
+
             if relevant_memories:
                 dialog.append(
                     {
@@ -458,13 +466,18 @@ class Agent(ABC):
         topic = f"exps/{self._exp_id}/agents/{to_agent_uuid}/{sub_topic}"
         await self._messager.send_message(topic, payload)
 
-    async def send_message_to_agent(
+    async def send_message_to_agent(
+        self, to_agent_uuid: str, content: str, type: str = "social"
+    ):
         """通过 Messager 发送消息"""
         if self._messager is None:
             raise RuntimeError("Messager is not set")
+        if type not in ["social", "economy"]:
+            logger.warning(f"Invalid message type: {type}, sent from {self._uuid}")
         payload = {
             "from": self._uuid,
             "content": content,
+            "type": type,
             "timestamp": int(datetime.now().timestamp() * 1000),
             "day": await self.simulator.get_simulator_day(),
             "t": await self.simulator.get_simulator_second_from_start_of_day(),
@@ -485,11 +498,11 @@ class Agent(ABC):
             auros.append(_message_dict)
             pg_list.append((_message_dict, _date_time))
         # Avro
-        if self._avro_file is not None:
+        if self._avro_file is not None and type == "social":
             with open(self._avro_file["dialog"], "a+b") as f:
                 fastavro.writer(f, DIALOG_SCHEMA, auros, codec="snappy")
         # Pg
-        if self._pgsql_writer is not None:
+        if self._pgsql_writer is not None and type == "social":
             if self._last_asyncio_pg_task is not None:
                 await self._last_asyncio_pg_task
             _keys = ["id", "day", "t", "type", "speaker", "content", "created_at"]
@@ -595,6 +608,7 @@ class CitizenAgent(Agent):
         # 防止模拟器还没有到prepare阶段导致get_person出错
         self._has_bound_to_simulator = True
         self._agent_id = person_id
+        self.memory.set_agent_id(person_id)
 
     async def _bind_to_economy(self):
         if self._economy_client is None:
pycityagent/environment/sim/economy_services.py
CHANGED
@@ -1,4 +1,5 @@
-from typing import Any,
+from typing import Any, cast, Union
+from collections.abc import Awaitable, Coroutine
 
 import grpc
 from google.protobuf.json_format import ParseDict
@@ -25,7 +26,7 @@ class EconomyPersonService:
         self,
         req: Union[person_service.GetPersonRequest, dict],
         dict_return: bool = True,
-    ) -> Coroutine[Any, Any, Union[
+    ) -> Coroutine[Any, Any, Union[dict[str, Any], person_service.GetPersonResponse]]:
         """
         批量查询人的经济情况(资金、雇佣关系)
         Query person’s economic situation (funds, employment relationship) in batches
@@ -48,7 +49,7 @@ class EconomyPersonService:
         req: Union[person_service.UpdatePersonMoneyRequest, dict],
         dict_return: bool = True,
     ) -> Coroutine[
-        Any, Any, Union[
+        Any, Any, Union[dict[str, Any], person_service.UpdatePersonMoneyResponse]
     ]:
         """
         批量修改人的资金
@@ -80,7 +81,7 @@ class EconomyOrgService:
 
     def GetOrg(
         self, req: Union[org_service.GetOrgRequest, dict], dict_return: bool = True
-    ) -> Coroutine[Any, Any, Union[
+    ) -> Coroutine[Any, Any, Union[dict[str, Any], org_service.GetOrgResponse]]:
         """
         批量查询组织的经济情况(员工、岗位、资金、货物)
         Query the economic status of the organization (employees, positions, funds, goods) in batches
@@ -100,7 +101,7 @@ class EconomyOrgService:
         self,
         req: Union[org_service.UpdateOrgMoneyRequest, dict],
         dict_return: bool = True,
-    ) -> Coroutine[Any, Any, Union[
+    ) -> Coroutine[Any, Any, Union[dict[str, Any], org_service.UpdateOrgMoneyResponse]]:
         """
         批量修改组织的资金
         Modify organization’s money in batches
@@ -123,7 +124,7 @@ class EconomyOrgService:
         self,
         req: Union[org_service.UpdateOrgGoodsRequest, dict],
         dict_return: bool = True,
-    ) -> Coroutine[Any, Any, Union[
+    ) -> Coroutine[Any, Any, Union[dict[str, Any], org_service.UpdateOrgGoodsResponse]]:
         """
         批量修改组织的货物
         Modify organization’s goods in batches
@@ -147,7 +148,7 @@ class EconomyOrgService:
         req: Union[org_service.UpdateOrgEmployeeRequest, dict],
         dict_return: bool = True,
     ) -> Coroutine[
-        Any, Any, Union[
+        Any, Any, Union[dict[str, Any], org_service.UpdateOrgEmployeeResponse]
     ]:
         """
         批量修改组织的员工
@@ -171,7 +172,7 @@ class EconomyOrgService:
         self,
         req: Union[org_service.UpdateOrgJobRequest, dict],
         dict_return: bool = True,
-    ) -> Coroutine[Any, Any, Union[
+    ) -> Coroutine[Any, Any, Union[dict[str, Any], org_service.UpdateOrgJobResponse]]:
         """
         批量修改组织的岗位
         Modify organization’s jobs in batches
pycityagent/environment/sim/lane_service.py
CHANGED
@@ -1,4 +1,5 @@
-from typing import Any,
+from typing import Any,cast, Union
+from collections.abc import Awaitable, Coroutine
 
 import grpc
 from google.protobuf.json_format import ParseDict
@@ -21,7 +22,7 @@ class LaneService:
 
     def GetLane(
         self, req: Union[lane_service.GetLaneRequest, dict], dict_return: bool = True
-    ) -> Coroutine[Any, Any, Union[
+    ) -> Coroutine[Any, Any, Union[dict[str, Any], lane_service.GetLaneResponse]]:
         """
         获取Lane的信息
         Get Lane's information
@@ -41,7 +42,7 @@ class LaneService:
         self,
         req: Union[lane_service.SetLaneMaxVRequest, dict],
         dict_return: bool = True,
-    ) -> Coroutine[Any, Any, Union[
+    ) -> Coroutine[Any, Any, Union[dict[str, Any], lane_service.SetLaneMaxVResponse]]:
         """
         设置Lane的最大速度(限速)
         Set the maximum speed of Lane (speed limit)
@@ -64,7 +65,7 @@ class LaneService:
         req: Union[lane_service.SetLaneRestrictionRequest, dict],
         dict_return: bool = True,
     ) -> Coroutine[
-        Any, Any, Union[
+        Any, Any, Union[dict[str, Any], lane_service.SetLaneRestrictionResponse]
     ]:
         """
         设置Lane的限制
@@ -89,7 +90,7 @@ class LaneService:
         req: Union[lane_service.GetLaneByLongLatBBoxRequest, dict],
         dict_return: bool = True,
     ) -> Coroutine[
-        Any, Any, Union[
+        Any, Any, Union[dict[str, Any], lane_service.GetLaneByLongLatBBoxResponse]
     ]:
         """
         获取特定区域内的Lane的信息
pycityagent/environment/sim/light_service.py
CHANGED
@@ -1,9 +1,11 @@
-from
+from collections.abc import Awaitable, Coroutine
+from typing import Any, Union, cast
 
 import grpc
 from google.protobuf.json_format import ParseDict
 from pycityproto.city.map.v2 import traffic_light_service_pb2 as light_service
-from pycityproto.city.map.v2 import
+from pycityproto.city.map.v2 import \
+    traffic_light_service_pb2_grpc as light_grpc
 
 from ..utils.protobuf import async_parse
 
@@ -21,10 +23,10 @@ class LightService:
 
     def GetTrafficLight(
         self,
-        req: Union[light_service.GetTrafficLightRequest,
+        req: Union[light_service.GetTrafficLightRequest, dict[str, Any]],
         dict_return: bool = True,
     ) -> Coroutine[
-        Any, Any, Union[
+        Any, Any, Union[dict[str, Any], light_service.GetTrafficLightResponse]
     ]:
         """
         获取路口的红绿灯信息
@@ -46,10 +48,10 @@ class LightService:
 
     def SetTrafficLight(
         self,
-        req: Union[light_service.SetTrafficLightRequest,
+        req: Union[light_service.SetTrafficLightRequest, dict[str, Any]],
         dict_return: bool = True,
     ) -> Coroutine[
-        Any, Any, Union[
+        Any, Any, Union[dict[str, Any], light_service.SetTrafficLightResponse]
     ]:
         """
         设置路口的红绿灯信息
@@ -74,7 +76,7 @@ class LightService:
         req: Union[light_service.SetTrafficLightPhaseRequest, dict],
         dict_return: bool = True,
     ) -> Coroutine[
-        Any, Any, Union[
+        Any, Any, Union[dict[str, Any], light_service.SetTrafficLightPhaseResponse]
     ]:
         """
         设置路口的红绿灯相位
@@ -99,7 +101,7 @@ class LightService:
         req: Union[light_service.SetTrafficLightStatusRequest, dict],
         dict_return: bool = True,
     ) -> Coroutine[
-        Any, Any, Union[
+        Any, Any, Union[dict[str, Any], light_service.SetTrafficLightStatusResponse]
     ]:
         """
         设置路口的红绿灯状态
pycityagent/environment/sim/person_service.py
CHANGED
@@ -1,5 +1,6 @@
 import warnings
-from
+from collections.abc import Awaitable, Coroutine
+from typing import Any, Union, cast
 
 import grpc
 from google.protobuf.json_format import ParseDict
@@ -51,7 +52,7 @@ class PersonService:
         self,
         req: Union[person_service.GetPersonRequest, dict],
         dict_return: bool = True,
-    ) -> Coroutine[Any, Any, Union[
+    ) -> Coroutine[Any, Any, Union[dict[str, Any], person_service.GetPersonResponse]]:
         """
         获取person信息
         Get person information
@@ -73,7 +74,7 @@ class PersonService:
         self,
         req: Union[person_service.AddPersonRequest, dict],
         dict_return: bool = True,
-    ) -> Coroutine[Any, Any, Union[
+    ) -> Coroutine[Any, Any, Union[dict[str, Any], person_service.AddPersonResponse]]:
         """
         新增person
         Add a new person
@@ -95,7 +96,7 @@ class PersonService:
         self,
         req: Union[person_service.SetScheduleRequest, dict],
         dict_return: bool = True,
-    ) -> Coroutine[Any, Any, Union[
+    ) -> Coroutine[Any, Any, Union[dict[str, Any], person_service.SetScheduleResponse]]:
         """
         修改person的schedule
         set person's schedule
@@ -118,7 +119,7 @@ class PersonService:
         self,
         req: Union[person_service.GetPersonsRequest, dict],
         dict_return: bool = True,
-    ) -> Coroutine[Any, Any, Union[
+    ) -> Coroutine[Any, Any, Union[dict[str, Any], person_service.GetPersonsResponse]]:
         """
         获取多个person信息
         Get information of multiple persons
@@ -142,7 +143,7 @@ class PersonService:
         req: Union[person_service.GetPersonByLongLatBBoxRequest, dict],
         dict_return: bool = True,
     ) -> Coroutine[
-        Any, Any, Union[
+        Any, Any, Union[dict[str, Any], person_service.GetPersonByLongLatBBoxResponse]
     ]:
         """
         获取特定区域内的person
@@ -167,7 +168,7 @@ class PersonService:
         req: Union[person_service.GetAllVehiclesRequest, dict],
         dict_return: bool = True,
     ) -> Coroutine[
-        Any, Any, Union[
+        Any, Any, Union[dict[str, Any], person_service.GetAllVehiclesResponse]
     ]:
         """
         获取所有车辆
@@ -192,7 +193,7 @@ class PersonService:
         req: Union[person_service.ResetPersonPositionRequest, dict],
         dict_return: bool = True,
     ) -> Coroutine[
-        Any, Any, Union[
+        Any, Any, Union[dict[str, Any], person_service.ResetPersonPositionResponse]
     ]:
         """
         重置人的位置(将停止当前正在进行的出行,转为sleep状态)
@@ -219,7 +220,7 @@ class PersonService:
         req: Union[person_service.SetControlledVehicleIDsRequest, dict],
         dict_return: bool = True,
     ) -> Coroutine[
-        Any, Any, Union[
+        Any, Any, Union[dict[str, Any], person_service.SetControlledVehicleIDsResponse]
     ]:
         """
         设置由外部控制行为的vehicle
@@ -246,7 +247,7 @@ class PersonService:
     ) -> Coroutine[
         Any,
         Any,
-        Union[
+        Union[dict[str, Any], person_service.FetchControlledVehicleEnvsResponse],
     ]:
         """
         获取由外部控制行为的vehicle的环境信息
@@ -273,7 +274,7 @@ class PersonService:
     ) -> Coroutine[
         Any,
         Any,
-        Union[
+        Union[dict[str, Any], person_service.SetControlledVehicleActionsResponse],
     ]:
         """
         设置由外部控制行为的vehicle的行为
pycityagent/environment/sim/road_service.py
CHANGED
@@ -1,4 +1,5 @@
-from
+from collections.abc import Awaitable, Coroutine
+from typing import Any, Union, cast
 
 import grpc
 from google.protobuf.json_format import ParseDict
@@ -21,7 +22,7 @@ class RoadService:
 
     def GetRoad(
         self, req: Union[road_service.GetRoadRequest, dict], dict_return: bool = True
-    ) -> Coroutine[Any, Any, Union[
+    ) -> Coroutine[Any, Any, Union[dict[str, Any], road_service.GetRoadResponse]]:
         """
         查询道路信息
         Query road information
pycityagent/environment/sim/social_service.py
CHANGED
@@ -1,4 +1,5 @@
-from
+from collections.abc import Awaitable, Coroutine
+from typing import Any, Union, cast
 
 import grpc
 from google.protobuf.json_format import ParseDict
@@ -21,7 +22,7 @@ class SocialService:
 
     def Send(
         self, req: Union[social_service.SendRequest, dict], dict_return: bool = True
-    ) -> Coroutine[Any, Any, Union[
+    ) -> Coroutine[Any, Any, Union[dict[str, Any], social_service.SendResponse]]:
         """
         发送消息
         Send message
@@ -39,7 +40,7 @@ class SocialService:
 
     def Receive(
         self, req: Union[social_service.ReceiveRequest, dict], dict_return: bool = True
-    ) -> Coroutine[Any, Any, Union[
+    ) -> Coroutine[Any, Any, Union[dict[str, Any], social_service.ReceiveResponse]]:
         """
         接收消息
         Receive message
pycityagent/environment/utils/protobuf.py
CHANGED
@@ -1,13 +1,15 @@
-from
-from
+from collections.abc import Awaitable
+from typing import Any, TypeVar, Union
+
 from google.protobuf.json_format import MessageToDict
+from google.protobuf.message import Message
 
 __all__ = ["parse", "async_parse"]
 
 T = TypeVar("T", bound=Message)
 
 
-def parse(res: T, dict_return: bool) -> Union[
+def parse(res: T, dict_return: bool) -> Union[dict[str, Any], T]:
     """
     将Protobuf返回值转换为dict或者原始值
     Convert Protobuf return value to dict or original value
@@ -23,7 +25,7 @@ def parse(res: T, dict_return: bool) -> Union[Dict[str, Any], T]:
         return res
 
 
-async def async_parse(res: Awaitable[T], dict_return: bool) -> Union[
+async def async_parse(res: Awaitable[T], dict_return: bool) -> Union[dict[str, Any], T]:
     """
     将Protobuf await返回值转换为dict或者原始值
     Convert Protobuf await return value to dict or original value
pycityagent/llm/__init__.py
CHANGED
@@ -1,6 +1,11 @@
 """LLM相关模块"""
 
+from .embeddings import SentenceEmbedding, SimpleEmbedding
 from .llm import LLM, LLMConfig
-from .embedding import SimpleEmbedding
 
-__all__ = [
+__all__ = [
+    "LLM",
+    "LLMConfig",
+    "SentenceEmbedding",
+    "SimpleEmbedding",
+]
pycityagent/llm/embeddings.py
ADDED
@@ -0,0 +1,231 @@
+import hashlib
+import json
+import os
+from typing import Optional, Union
+
+import numpy as np
+import torch
+from langchain_core.embeddings import Embeddings
+from transformers import AutoModel, AutoTokenizer
+
+__all__ = [
+    "SentenceEmbedding",
+    "SimpleEmbedding",
+]
+
+
+class SentenceEmbedding(Embeddings):
+    def __init__(
+        self,
+        pretrained_model_name_or_path: Union[str, os.PathLike] = "BAAI/bge-m3",
+        max_seq_len: int = 8192,
+        auto_cuda: bool = False,
+        local_files_only: bool = False,
+        cache_dir: str = "./cache",
+        proxies: Optional[dict] = None,
+    ):
+        os.makedirs(cache_dir, exist_ok=True)
+        self.tokenizer = AutoTokenizer.from_pretrained(
+            pretrained_model_name_or_path,
+            proxies=proxies,
+            cache_dir=cache_dir,
+            local_files_only=local_files_only,
+        )
+        self.model = AutoModel.from_pretrained(
+            pretrained_model_name_or_path,
+            proxies=proxies,
+            cache_dir=cache_dir,
+            local_files_only=local_files_only,
+        )
+        self._cuda = auto_cuda and torch.cuda.is_available()
+
+        if self._cuda:
+            self.model = self.model.cuda()
+
+        self.model.eval()
+        self.max_seq_len = max_seq_len
+
+    def _embed(self, texts: list[str]) -> list[list[float]]:
+        # Tokenize sentences
+        encoded_input = self.tokenizer(
+            texts, padding=True, truncation=True, return_tensors="pt"
+        )
+        # for s2p(short query to long passage) retrieval task, add an instruction to query (not add instruction for passages)
+        # encoded_input = tokenizer([instruction + q for q in queries], padding=True, truncation=True, return_tensors='pt')
+
+        # check length of input
+        # assert seq_len <= 8192
+        assert encoded_input["input_ids"].shape[1] <= self.max_seq_len  # type: ignore
+
+        if self._cuda:
+            encoded_input = {k: v.cuda() for k, v in encoded_input.items()}
+        # Compute token embeddings
+        with torch.no_grad():
+            model_output = self.model(**encoded_input)
+            # Perform pooling. In this case, cls pooling.
+            sentence_embeddings = model_output[0][:, 0]
+        # normalize embeddings
+        sentence_embeddings = torch.nn.functional.normalize(
+            sentence_embeddings, p=2, dim=1
+        )
+        if self._cuda:
+            sentence_embeddings = sentence_embeddings.cpu()
+        return sentence_embeddings.tolist()
+
+    def embed_documents(self, texts: list[str]) -> list[list[float]]:
+        """Embed documents."""
+        return self._embed(texts)
+
+    def embed_query(self, text: str) -> list[float]:
+        """Embed query text."""
+        return self._embed([text])[0]
+
+
+class SimpleEmbedding(Embeddings):
+    """简单的基于内存的embedding实现
+
+    使用简单的词袋模型(Bag of Words)和TF-IDF来生成文本的向量表示。
+    所有向量都保存在内存中,适用于小规模应用。
+    """
+
+    def __init__(self, vector_dim: int = 128, cache_size: int = 1000):
+        """初始化
+
+        Args:
+            vector_dim: 向量维度
+            cache_size: 缓存大小,超过此大小将清除最早的缓存
+        """
+        self.vector_dim = vector_dim
+        self.cache_size = cache_size
+        self._cache: dict[str, list[float]] = {}
+        self._vocab: dict[str, int] = {}  # 词汇表
+        self._idf: dict[str, float] = {}  # 逆文档频率
+        self._doc_count = 0  # 文档总数
+
+    def _text_to_hash(self, text: str) -> str:
+        """将文本转换为hash值"""
+        return hashlib.md5(text.encode()).hexdigest()
+
+    def _tokenize(self, text: str) -> list[str]:
+        """简单的分词"""
+        # 这里使用简单的空格分词,实际应用中可以使用更复杂的分词方法
+        return text.lower().split()
+
+    def _update_vocab(self, tokens: list[str]):
+        """更新词汇表"""
+        for token in set(tokens):  # 使用set去重
+            if token not in self._vocab:
+                self._vocab[token] = len(self._vocab)
+
+    def _update_idf(self, tokens: list[str]):
+        """更新IDF值"""
+        self._doc_count += 1
+        unique_tokens = set(tokens)
+        for token in unique_tokens:
+            self._idf[token] = self._idf.get(token, 0) + 1
+
+    def _calculate_tf(self, tokens: list[str]) -> dict[str, float]:
+        """计算词频(TF)"""
+        tf = {}
+        total_tokens = len(tokens)
+        for token in tokens:
+            tf[token] = tf.get(token, 0) + 1
+        # 归一化
+        for token in tf:
+            tf[token] /= total_tokens
+        return tf
+
+    def _calculate_tfidf(self, tokens: list[str]) -> list[float]:
+        """计算TF-IDF向量"""
+        vector = np.zeros(self.vector_dim)
+        tf = self._calculate_tf(tokens)
+
+        for token, tf_value in tf.items():
+            if token in self._idf:
+                idf = np.log(self._doc_count / self._idf[token])
+                idx = self._vocab[token] % self.vector_dim  # 使用取模运算来控制向量维度
+                vector[idx] += tf_value * idf
+
+        # L2归一化
+        norm = np.linalg.norm(vector)
+        if norm > 0:
+            vector /= norm
+
+        return list(vector)
+
+    def _embed(self, text: str) -> list[float]:
+        """生成文本的向量表示
+
+        Args:
+            text: 输入文本
+
+        Returns:
+            np.ndarray: 文本的向量表示
+        """
+        # 检查缓存
+        text_hash = self._text_to_hash(text)
+        if text_hash in self._cache:
+            return self._cache[text_hash]
+
+        # 分词
+        tokens = self._tokenize(text)
+        if not tokens:
+            return list(np.zeros(self.vector_dim))
+
+        # 更新词汇表和IDF
+        self._update_vocab(tokens)
+        self._update_idf(tokens)
+
+        # 计算向量
+        vector = self._calculate_tfidf(tokens)
+
+        # 更新缓存
+        if len(self._cache) >= self.cache_size:
+            # 删除最早的缓存
+            oldest_key = next(iter(self._cache))
+            del self._cache[oldest_key]
+        self._cache[text_hash] = vector
+
+        return list(vector)
+
+    def embed_documents(self, texts: list[str]) -> list[list[float]]:
+        """Embed documents."""
+        return [self._embed(text) for text in texts]
+
+    def embed_query(self, text: str) -> list[float]:
+        """Embed query text."""
+        return self._embed(text)
+
+    # def save(self, file_path: str):
+    #     """保存模型"""
+    #     state = {
+    #         "vector_dim": self.vector_dim,
+    #         "cache_size": self.cache_size,
+    #         "vocab": self._vocab,
+    #         "idf": self._idf,
+    #         "doc_count": self._doc_count,
+    #     }
+    #     with open(file_path, "w") as f:
+    #         json.dump(state, f)
+
+    # def load(self, file_path: str):
+    #     """加载模型"""
+    #     with open(file_path, "r") as f:
+    #         state = json.load(f)
+    #         self.vector_dim = state["vector_dim"]
+    #         self.cache_size = state["cache_size"]
+    #         self._vocab = state["vocab"]
+    #         self._idf = state["idf"]
+    #         self._doc_count = state["doc_count"]
+    #         self._cache = {}  # 清空缓存
+
+
+if __name__ == "__main__":
+    # se = SentenceEmbedding(
+    #     pretrained_model_name_or_path="ignore/BAAI--bge-m3", cache_dir="ignore"
+    # )
+    se = SimpleEmbedding()
+    print(se.embed_query("hello world"))
+    print(se.embed_query("hello world"))
+    print(se.embed_query("hello world"))
+    print(se.embed_query("hello world"))