libinephany 1.1.1__py3-none-any.whl → 1.1.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- libinephany/pydantic_models/schemas/inner_task_profile.py +39 -55
- libinephany/web_apps/error_logger.py +34 -2
- {libinephany-1.1.1.dist-info → libinephany-1.1.3.dist-info}/METADATA +1 -1
- {libinephany-1.1.1.dist-info → libinephany-1.1.3.dist-info}/RECORD +7 -7
- {libinephany-1.1.1.dist-info → libinephany-1.1.3.dist-info}/WHEEL +0 -0
- {libinephany-1.1.1.dist-info → libinephany-1.1.3.dist-info}/licenses/LICENSE +0 -0
- {libinephany-1.1.1.dist-info → libinephany-1.1.3.dist-info}/top_level.txt +0 -0
@@ -7,8 +7,15 @@
|
|
7
7
|
import math
|
8
8
|
from typing import Any, Callable
|
9
9
|
|
10
|
-
from pydantic import BaseModel, field_validator
|
11
|
-
|
10
|
+
from pydantic import BaseModel
|
11
|
+
|
12
|
+
# ======================================================================================================================
|
13
|
+
#
|
14
|
+
# CONSTANTS
|
15
|
+
#
|
16
|
+
# ======================================================================================================================
|
17
|
+
|
18
|
+
VRAM_USAGES_KEY = "vram_usages"
|
12
19
|
|
13
20
|
# ======================================================================================================================
|
14
21
|
#
|
@@ -20,26 +27,40 @@ from pydantic import BaseModel, field_validator
|
|
20
27
|
class InnerTaskProfile(BaseModel):
|
21
28
|
|
22
29
|
inner_task_name: str
|
30
|
+
|
23
31
|
number_of_agents: int
|
24
32
|
number_of_layers: int
|
33
|
+
number_of_parameters: int
|
34
|
+
|
25
35
|
observation_space_sizes: dict[str, int]
|
26
36
|
action_space_sizes: dict[str, int]
|
27
|
-
number_of_parameters: int
|
28
|
-
vram_usage: float
|
29
|
-
idle_vram_usage: float
|
30
|
-
hparam_overrides: dict[str, dict[str, Any]] | None = None
|
31
37
|
|
32
|
-
|
33
|
-
|
38
|
+
vram_usages: dict[int, tuple[float, float]] | None = None
|
39
|
+
expected_vram_usage: float | None = None
|
40
|
+
expected_idle_vram_usage: float | None = None
|
41
|
+
max_batch_size_override: int | None = None
|
42
|
+
|
43
|
+
@property
|
44
|
+
def vram_usage(self) -> float:
|
45
|
+
"""
|
46
|
+
:return: VRAM usage at the max batch size.
|
47
|
+
"""
|
48
|
+
|
49
|
+
if self.expected_vram_usage is None:
|
50
|
+
return float("nan")
|
51
|
+
|
52
|
+
return self.expected_vram_usage
|
53
|
+
|
54
|
+
@property
|
55
|
+
def idle_vram_usage(self) -> float:
|
34
56
|
"""
|
35
|
-
:
|
36
|
-
:return: Either the given float value or NaN.
|
57
|
+
:return: Idle VRAM usage at the max batch size.
|
37
58
|
"""
|
38
59
|
|
39
|
-
if
|
60
|
+
if self.expected_idle_vram_usage is None:
|
40
61
|
return float("nan")
|
41
62
|
|
42
|
-
return
|
63
|
+
return self.expected_idle_vram_usage
|
43
64
|
|
44
65
|
@property
|
45
66
|
def failed_to_profile(self) -> bool:
|
@@ -49,19 +70,18 @@ class InnerTaskProfile(BaseModel):
|
|
49
70
|
|
50
71
|
return math.isnan(self.vram_usage)
|
51
72
|
|
52
|
-
def
|
73
|
+
def model_dump(self, **kwargs) -> dict[str, Any]:
|
53
74
|
"""
|
54
75
|
:param kwargs: Standard Pydantic model dump kwargs.
|
55
76
|
:return: Dump result of the superclass' method.
|
56
77
|
"""
|
57
78
|
|
58
|
-
|
59
|
-
f"Inner task {self.inner_task_name} consumed {self.vram_usage:.3f} MB of VRAM while training and "
|
60
|
-
f"{self.idle_vram_usage:.3f} MB of VRAM while idle. It has {self.number_of_agents} agents across "
|
61
|
-
f"{self.number_of_layers} inner model layers."
|
62
|
-
)
|
79
|
+
super_dump = super().model_dump(**kwargs)
|
63
80
|
|
64
|
-
|
81
|
+
if self.vram_usages is not None:
|
82
|
+
super_dump[VRAM_USAGES_KEY] = {k: list(v) for k, v in self.vram_usages.items()}
|
83
|
+
|
84
|
+
return super_dump
|
65
85
|
|
66
86
|
|
67
87
|
class InnerTaskProfiles(BaseModel):
|
@@ -235,42 +255,6 @@ class InnerTaskProfiles(BaseModel):
|
|
235
255
|
|
236
256
|
return inner_task_name in self.profiles
|
237
257
|
|
238
|
-
def add_profile(
|
239
|
-
self,
|
240
|
-
inner_task_name: str,
|
241
|
-
number_of_agents: int,
|
242
|
-
number_of_layers: int,
|
243
|
-
observation_space_sizes: dict[str, int],
|
244
|
-
action_space_sizes: dict[str, int],
|
245
|
-
number_of_parameters: int,
|
246
|
-
vram_usage: float,
|
247
|
-
idle_vram_usage: float,
|
248
|
-
hparam_overrides: dict[str, dict[str, Any]] | None = None,
|
249
|
-
) -> None:
|
250
|
-
"""
|
251
|
-
:param inner_task_name: Name of the inner task to add a profile for.
|
252
|
-
:param number_of_agents: Number of agents active in the inner task's environment.
|
253
|
-
:param number_of_layers: Number of layers in the inner model.
|
254
|
-
:param observation_space_sizes: Dictionary mapping agent IDs to their observation space sizes.
|
255
|
-
:param action_space_sizes: Dictionary mapping agent IDs to their action space sizes.
|
256
|
-
:param vram_usage: VRAM required to perform the inner task. Can be NaN if an OOM was encountered.
|
257
|
-
:param idle_vram_usage: VRAM required for the inner task to sit loaded but not actively being trained. Can be
|
258
|
-
NaN if an OOM was encountered.
|
259
|
-
:param hparam_overrides: Hyperparameter overrides for the inner task.
|
260
|
-
"""
|
261
|
-
|
262
|
-
self.profiles[inner_task_name] = InnerTaskProfile(
|
263
|
-
inner_task_name=inner_task_name,
|
264
|
-
number_of_agents=number_of_agents,
|
265
|
-
number_of_layers=number_of_layers,
|
266
|
-
observation_space_sizes=observation_space_sizes,
|
267
|
-
action_space_sizes=action_space_sizes,
|
268
|
-
number_of_parameters=number_of_parameters,
|
269
|
-
vram_usage=vram_usage,
|
270
|
-
idle_vram_usage=idle_vram_usage,
|
271
|
-
hparam_overrides=hparam_overrides,
|
272
|
-
)
|
273
|
-
|
274
258
|
def validate_task_profiles(self, policy_mapping_function: Callable[[str, Any, Any], str]) -> None:
|
275
259
|
"""
|
276
260
|
:param policy_mapping_function: Function which maps agent IDs to policy IDs.
|
@@ -58,6 +58,8 @@ class ErrorLogger(AsyncioWorker):
|
|
58
58
|
def __init__(
|
59
59
|
self,
|
60
60
|
service_name: str,
|
61
|
+
member_ids_to_mention: list[str] | None = None,
|
62
|
+
custom_error_message: str | None = None,
|
61
63
|
minimum_severity: ErrorSeverities = ErrorSeverities.WARNING,
|
62
64
|
error_classifier: ExceptionSeverityClassifier | None = None,
|
63
65
|
max_time_accumulating_errors: float = 10.0,
|
@@ -66,6 +68,8 @@ class ErrorLogger(AsyncioWorker):
|
|
66
68
|
"""
|
67
69
|
:param service_name: Name of the service deployed on the cloud errors sent to this error logger should be sent
|
68
70
|
to.
|
71
|
+
:param member_ids_to_mention: List of member IDs to mention in the Slack message. If None, no members will be mentioned.
|
72
|
+
:param custom_error_message: Custom error message to include in the Slack message. If None, no custom message will be included.
|
69
73
|
:param minimum_severity: Minimum error severity that can be sent to Slack.
|
70
74
|
:param error_classifier: Optional callable used to alter how errors severities are classified. If this is None
|
71
75
|
all errors are given the 'WARNING' severity.
|
@@ -77,6 +81,8 @@ class ErrorLogger(AsyncioWorker):
|
|
77
81
|
|
78
82
|
self._slack_client, self._channel_id = self._get_slack_client()
|
79
83
|
|
84
|
+
self.member_mentions = self._form_member_mentions(member_ids_to_mention=member_ids_to_mention)
|
85
|
+
self.custom_error_message = custom_error_message
|
80
86
|
self.service_name = service_name
|
81
87
|
self.error_classifier = error_classifier if error_classifier is not None else default_error_classifier
|
82
88
|
self.max_time_accumulating_errors = max_time_accumulating_errors
|
@@ -144,6 +150,7 @@ class ErrorLogger(AsyncioWorker):
|
|
144
150
|
exception: Exception,
|
145
151
|
frequency: int,
|
146
152
|
frequency_time_window: float,
|
153
|
+
custom_error_message: str | None,
|
147
154
|
) -> str:
|
148
155
|
"""
|
149
156
|
:param mention: Mention tag of the entire channel or a particular user ID.
|
@@ -152,19 +159,40 @@ class ErrorLogger(AsyncioWorker):
|
|
152
159
|
:param exception: Exception that occurred.
|
153
160
|
:param frequency: How frequently the exception occurred in the given time window.
|
154
161
|
:param frequency_time_window: Time between the first and last instance of the exception.
|
162
|
+
:param custom_error_message: Custom error message to include in the Slack message. If None, no custom message will be included.
|
155
163
|
:return: Formatted Slack message string.
|
156
164
|
"""
|
157
165
|
|
158
166
|
frequency_line = f"{frequency} in {frequency_time_window}s" if frequency > 1 else f"{frequency}"
|
159
167
|
|
168
|
+
header = f"{notification_emoji} *{service_name} Error* {notification_emoji}\n"
|
169
|
+
|
170
|
+
if mention:
|
171
|
+
header += f"• *Alerting*: {mention}\n"
|
172
|
+
|
173
|
+
if custom_error_message is not None:
|
174
|
+
header += f"• *Custom Message*: {custom_error_message}\n"
|
175
|
+
|
160
176
|
return (
|
161
|
-
f"{
|
177
|
+
f"{header}"
|
162
178
|
f"• *Service*: {service_name}\n"
|
163
179
|
f"• *Error Type*: {exception.__class__.__name__}\n"
|
164
180
|
f"• *Frequency*: {frequency_line}\n"
|
165
181
|
f"• *Traceback*: "
|
166
182
|
)
|
167
183
|
|
184
|
+
@staticmethod
|
185
|
+
def _form_member_mentions(member_ids_to_mention: list[str] | None) -> str:
|
186
|
+
"""
|
187
|
+
:param member_ids_to_mention: List of member IDs to mention in the Slack message.
|
188
|
+
:return: String of member mentions.
|
189
|
+
"""
|
190
|
+
|
191
|
+
if not member_ids_to_mention:
|
192
|
+
return ""
|
193
|
+
|
194
|
+
return " ".join([f"<@{member_id}>" for member_id in member_ids_to_mention])
|
195
|
+
|
168
196
|
def _get_traceback_file_name(self, exception: Exception, exception_timestamp: str) -> str:
|
169
197
|
"""
|
170
198
|
:param exception: Exception being sent to Slack.
|
@@ -275,7 +303,10 @@ class ErrorLogger(AsyncioWorker):
|
|
275
303
|
traceback_filename = self._get_traceback_file_name(exception=exception, exception_timestamp=formatted_timestamp)
|
276
304
|
|
277
305
|
notification_emoji = SEVERITY_EMOJIS[severity]
|
278
|
-
|
306
|
+
# Temporary since the bot is used in training and the API.
|
307
|
+
mention = (
|
308
|
+
self.member_mentions if not ErrorSeverities.should_mention(severity=severity) else self.CHANNEL_MENTION
|
309
|
+
)
|
279
310
|
|
280
311
|
self._log_error_to_terminal(exception=exception)
|
281
312
|
|
@@ -291,6 +322,7 @@ class ErrorLogger(AsyncioWorker):
|
|
291
322
|
exception=exception,
|
292
323
|
frequency=frequency,
|
293
324
|
frequency_time_window=frequency_time_window,
|
325
|
+
custom_error_message=self.custom_error_message,
|
294
326
|
),
|
295
327
|
)
|
296
328
|
except SlackApiError as e:
|
@@ -28,7 +28,7 @@ libinephany/pydantic_models/configs/observer_config.py,sha256=v_ChzaVXC_rlZ7eDZP
|
|
28
28
|
libinephany/pydantic_models/configs/outer_model_config.py,sha256=GQ0QBSC2Xht8x8X_TEMfYM2GF_x1kErLuFrA_H6Jhs0,1209
|
29
29
|
libinephany/pydantic_models/schemas/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
30
30
|
libinephany/pydantic_models/schemas/agent_info.py,sha256=me5gDxvZjP9TNK588mpUvxiiJrPDqy3Z7ZHRzryAYTs,2628
|
31
|
-
libinephany/pydantic_models/schemas/inner_task_profile.py,sha256=
|
31
|
+
libinephany/pydantic_models/schemas/inner_task_profile.py,sha256=1Q3cDyyW01NOgalmAWGLc-AaLoLum9nBtcpLsfxo_pw,10628
|
32
32
|
libinephany/pydantic_models/schemas/observation_models.py,sha256=MLhxqDet9Yol1D5mkQGQsQT23sm37AStRLnPc4sgcZc,2110
|
33
33
|
libinephany/pydantic_models/schemas/request_schemas.py,sha256=VED8eAUvBofxeAx9gWU8DyCZOTVD3QsHRq-TO7kyOqk,1260
|
34
34
|
libinephany/pydantic_models/schemas/response_schemas.py,sha256=SKFuasdjX5aH_I0vT3SwnpwhyMf9cNPB1ZpDeAGgoO8,2158
|
@@ -55,10 +55,10 @@ libinephany/utils/torch_utils.py,sha256=o5TsqrXe6Id04P6SqB_avGBRZutbu6IBB61llAHQ
|
|
55
55
|
libinephany/utils/transforms.py,sha256=Ca4pbCs_FbCpXb8M8oPxrP5QOqOAwGSdGpKzy5YUubc,3503
|
56
56
|
libinephany/utils/typing.py,sha256=rGbaPO3MaUndsWiC_wHzReD_TOLYqb43i01pKN-j7Xs,624
|
57
57
|
libinephany/web_apps/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
58
|
-
libinephany/web_apps/error_logger.py,sha256=
|
58
|
+
libinephany/web_apps/error_logger.py,sha256=QpspO726Uoyyr6lBEEb3Q9XqhVOXUM4AaYE7vbnk31c,18153
|
59
59
|
libinephany/web_apps/web_app_utils.py,sha256=qiq_lasPipgN1RgRudPJc342kYci8O_4RqppxmIX8NY,4095
|
60
|
-
libinephany-1.1.
|
61
|
-
libinephany-1.1.
|
62
|
-
libinephany-1.1.
|
63
|
-
libinephany-1.1.
|
64
|
-
libinephany-1.1.
|
60
|
+
libinephany-1.1.3.dist-info/licenses/LICENSE,sha256=pogfDoMBP07ehIOvWymuWIar8pg2YLUhqOHsJQU3wdc,9250
|
61
|
+
libinephany-1.1.3.dist-info/METADATA,sha256=Z99V_6BvYynyRZkZ2D5Ih6Dk-GWGeNZH0cTiMFEPSvg,8389
|
62
|
+
libinephany-1.1.3.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
63
|
+
libinephany-1.1.3.dist-info/top_level.txt,sha256=bYAOXQdJgIoLkO2Ui0kxe7pSYegS_e38u0dMscd7COQ,12
|
64
|
+
libinephany-1.1.3.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|