naeural-core 7.7.239__py3-none-any.whl → 7.7.242__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- naeural_core/comm/communication_manager.py +9 -4
- naeural_core/comm/mixins/default_comm_mixin.py +0 -1
- naeural_core/core_logging/logger_mixins/gpu_mixin.py +28 -14
- naeural_core/main/orchestrator.py +28 -1
- naeural_core/main/ver.py +1 -1
- {naeural_core-7.7.239.dist-info → naeural_core-7.7.242.dist-info}/METADATA +1 -1
- {naeural_core-7.7.239.dist-info → naeural_core-7.7.242.dist-info}/RECORD +9 -9
- {naeural_core-7.7.239.dist-info → naeural_core-7.7.242.dist-info}/WHEEL +0 -0
- {naeural_core-7.7.239.dist-info → naeural_core-7.7.242.dist-info}/licenses/LICENSE +0 -0
|
@@ -74,13 +74,18 @@ class CommunicationManager(Manager, _ConfigHandlerMixin):
|
|
|
74
74
|
_id = self.log.config_data.get(ct.CONFIG_STARTUP_v2.K_EE_ID, '')[:ct.EE_ALIAS_MAX_SIZE]
|
|
75
75
|
return _id
|
|
76
76
|
|
|
77
|
+
def _has_failed_comms(self):
|
|
78
|
+
for comm in self._dct_comm_plugins.values():
|
|
79
|
+
if comm.comm_failed_after_retries:
|
|
80
|
+
return comm
|
|
81
|
+
return None
|
|
77
82
|
|
|
78
83
|
@property
|
|
79
84
|
def has_failed_comms(self):
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
85
|
+
comm = self._has_failed_comms()
|
|
86
|
+
if comm is not None:
|
|
87
|
+
self.P("Detected total communication failure on comm {}. This may generate shutdown/restart.".format(comm.__class__.__name__), color='error')
|
|
88
|
+
return True
|
|
84
89
|
return False
|
|
85
90
|
|
|
86
91
|
|
|
@@ -136,6 +136,28 @@ class _GPUMixin(object):
|
|
|
136
136
|
|
|
137
137
|
return processes_by_uuid
|
|
138
138
|
|
|
139
|
+
def _get_uuid_by_index(self, timeout=1.5):
|
|
140
|
+
import subprocess, shutil
|
|
141
|
+
smi = shutil.which("nvidia-smi")
|
|
142
|
+
if not smi:
|
|
143
|
+
return {}
|
|
144
|
+
|
|
145
|
+
out = subprocess.run(
|
|
146
|
+
[smi, "--query-gpu=index,uuid", "--format=csv,noheader,nounits"],
|
|
147
|
+
capture_output=True, text=True, timeout=timeout
|
|
148
|
+
)
|
|
149
|
+
if out.returncode != 0:
|
|
150
|
+
return {}
|
|
151
|
+
|
|
152
|
+
d = {}
|
|
153
|
+
for line in out.stdout.splitlines():
|
|
154
|
+
line = line.strip()
|
|
155
|
+
if not line:
|
|
156
|
+
continue
|
|
157
|
+
idx_s, uuid = [p.strip() for p in line.split(",", 1)]
|
|
158
|
+
d[int(idx_s)] = uuid
|
|
159
|
+
return d
|
|
160
|
+
|
|
139
161
|
def gpu_info(self, show=False, mb=False, current_pid=False):
|
|
140
162
|
"""
|
|
141
163
|
Collects GPU info. Must have torch installed & non-mandatory nvidia-smi
|
|
@@ -215,20 +237,12 @@ class _GPUMixin(object):
|
|
|
215
237
|
fan_speed, fan_speed_unit = -1, "N/A"
|
|
216
238
|
if pynvml_avail:
|
|
217
239
|
# --- get an NVML handle that matches torch's CUDA device ordering when possible ---
|
|
218
|
-
|
|
219
|
-
|
|
220
|
-
|
|
221
|
-
|
|
222
|
-
|
|
223
|
-
|
|
224
|
-
if hasattr(pynvml, "nvmlDeviceGetHandleByPciBusId_v2"):
|
|
225
|
-
handle = pynvml.nvmlDeviceGetHandleByPciBusId_v2(pci_bus_id)
|
|
226
|
-
elif hasattr(pynvml, "nvmlDeviceGetHandleByPciBusId"):
|
|
227
|
-
handle = pynvml.nvmlDeviceGetHandleByPciBusId(pci_bus_id)
|
|
228
|
-
except Exception:
|
|
229
|
-
handle = None
|
|
230
|
-
|
|
231
|
-
if handle is None:
|
|
240
|
+
uuid_by_index = self._get_uuid_by_index()
|
|
241
|
+
# inside your for device_id in range(n_gpus):
|
|
242
|
+
uuid = uuid_by_index.get(device_id)
|
|
243
|
+
if uuid:
|
|
244
|
+
handle = pynvml.nvmlDeviceGetHandleByUUID(uuid)
|
|
245
|
+
else:
|
|
232
246
|
handle = pynvml.nvmlDeviceGetHandleByIndex(device_id)
|
|
233
247
|
|
|
234
248
|
# --- memory (NVML returns bytes) ---
|
|
@@ -121,6 +121,7 @@ class Orchestrator(DecentrAIObject,
|
|
|
121
121
|
self._current_dct_config_streams = {}
|
|
122
122
|
self._should_send_initial_log = False
|
|
123
123
|
self._initial_log_sent = False
|
|
124
|
+
self._last_offline_log = 0
|
|
124
125
|
self.loop_timings = deque(maxlen=3600)
|
|
125
126
|
self._reset_timers = False
|
|
126
127
|
self.__is_mlstop_dangerous = False
|
|
@@ -709,6 +710,10 @@ class Orchestrator(DecentrAIObject,
|
|
|
709
710
|
def cfg_main_loop_resolution(self):
|
|
710
711
|
return self.config_data.get('MAIN_LOOP_RESOLUTION', 20)
|
|
711
712
|
|
|
713
|
+
@property
|
|
714
|
+
def cfg_work_offline(self):
|
|
715
|
+
return self.config_data.get('WORK_OFFLINE', False)
|
|
716
|
+
|
|
712
717
|
@property
|
|
713
718
|
def cfg_sequential_streams(self):
|
|
714
719
|
"""
|
|
@@ -1613,6 +1618,27 @@ class Orchestrator(DecentrAIObject,
|
|
|
1613
1618
|
self._comm_manager.maybe_show_info()
|
|
1614
1619
|
return
|
|
1615
1620
|
|
|
1621
|
+
def _maybe_log_offline_status(self):
|
|
1622
|
+
if not self.cfg_work_offline or self._comm_manager is None or not self._comm_manager._has_failed_comms():
|
|
1623
|
+
return
|
|
1624
|
+
now = time()
|
|
1625
|
+
if (now - self._last_offline_log) < ct.COMMS.COMM_SECS_SHOW_INFO:
|
|
1626
|
+
return
|
|
1627
|
+
self._last_offline_log = now
|
|
1628
|
+
|
|
1629
|
+
comm_attempts = [
|
|
1630
|
+
"{}:try={} fails={}".format(
|
|
1631
|
+
name,
|
|
1632
|
+
getattr(comm, "_nr_conn_retry_iters", None),
|
|
1633
|
+
getattr(comm, "_total_conn_fails", None),
|
|
1634
|
+
)
|
|
1635
|
+
for name, comm in self._comm_manager._dct_comm_plugins.items()
|
|
1636
|
+
if comm is not None
|
|
1637
|
+
]
|
|
1638
|
+
attempts_str = "; ".join(comm_attempts) if len(comm_attempts) > 0 else "no comm plugins"
|
|
1639
|
+
self.P(f"WORK_OFFLINE enabled; reconnect attempts: {attempts_str}", color='r')
|
|
1640
|
+
return
|
|
1641
|
+
|
|
1616
1642
|
def _save_exception_main_loop_state(self, txt, **save_kwargs):
|
|
1617
1643
|
fn = '{}_main_loop_exception'.format(self.log.now_str())
|
|
1618
1644
|
self.log.save_pickle_to_output(data=save_kwargs, fn=fn + '.pickle', subfolder_path='main_loop_exceptions')
|
|
@@ -1829,6 +1855,7 @@ class Orchestrator(DecentrAIObject,
|
|
|
1829
1855
|
#9. Comm info, timers, ... - later we gonna check for total comm failures
|
|
1830
1856
|
self.__loop_stage = '9.logs'
|
|
1831
1857
|
self.comm_manager_show_info()
|
|
1858
|
+
self._maybe_log_offline_status()
|
|
1832
1859
|
|
|
1833
1860
|
|
|
1834
1861
|
self.log.stop_timer(self._main_loop_timer_name)
|
|
@@ -1844,7 +1871,7 @@ class Orchestrator(DecentrAIObject,
|
|
|
1844
1871
|
return_code = self._return_code
|
|
1845
1872
|
|
|
1846
1873
|
self.__loop_stage = '10.checks'
|
|
1847
|
-
if self.comm_manager.has_failed_comms:
|
|
1874
|
+
if (not self.cfg_work_offline) and self.comm_manager.has_failed_comms:
|
|
1848
1875
|
self.P("Shutdown initiated due to multiple failure in communication!", color='r')
|
|
1849
1876
|
return_code = ct.CODE_EXCEPTION
|
|
1850
1877
|
|
naeural_core/main/ver.py
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: naeural_core
|
|
3
|
-
Version: 7.7.239
|
|
3
|
+
Version: 7.7.242
|
|
4
4
|
Summary: Ratio1 Core is the backbone of the Ratio1 Edge Protocol.
|
|
5
5
|
Project-URL: Homepage, https://github.com/Ratio1/naeural_core
|
|
6
6
|
Project-URL: Bug Tracker, https://github.com/Ratio1/naeural_core/issues
|
|
@@ -102,7 +102,7 @@ naeural_core/business/training/minio_download_dataset.py,sha256=7uoo5CFLynWbLkBh
|
|
|
102
102
|
naeural_core/business/training/minio_upload_dataset.py,sha256=5stm_E_L3SLwcjd2znUVMIC2PWO07f6QfBdcrKRiHCo,2162
|
|
103
103
|
naeural_core/business/training/second_stage_training_process.py,sha256=z8LG9xx2G6s5AqeSD-t5rBegIctkFMEf4ZqEJVXZcz0,3152
|
|
104
104
|
naeural_core/comm/__init__.py,sha256=SpAWJIyYdOouZSImzVrEF_M4-nrCrY9p3cVYwvmbt20,105
|
|
105
|
-
naeural_core/comm/communication_manager.py,sha256=
|
|
105
|
+
naeural_core/comm/communication_manager.py,sha256=swMxO3DPTnT0SeSsv6PWz0gP0WsWOzA_BlYawoWHSB4,30471
|
|
106
106
|
naeural_core/comm/base/__init__.py,sha256=rDzAtPwcMOsW3aCp0t07GpJz5qweLiJgafTngHwEMOo,44
|
|
107
107
|
naeural_core/comm/base/base_comm_thread.py,sha256=DVPzLN9UfKxjiVf0FtjOr5001-YzkjPMmC4RJFhoGG0,27034
|
|
108
108
|
naeural_core/comm/default/amqp.py,sha256=-6_qGlOIjOUz42nkwnfMQZEVfdeUKGTNgjcTuo49v4E,4145
|
|
@@ -110,7 +110,7 @@ naeural_core/comm/default/mqtt.py,sha256=F2VOOmjXkz7diFC-PVdxZgBI7O9aCwul5KfA6r4
|
|
|
110
110
|
naeural_core/comm/default/readme.md,sha256=hNY9V5HU8yW0JjyseiPWMkV8l7YU0ZEBw_iq_lpW-Uk,162
|
|
111
111
|
naeural_core/comm/mixins/__init__.py,sha256=d8o2tKAkQ-P9voRB6REnEmObVyi4AiQgNVZuAKKObKo,290
|
|
112
112
|
naeural_core/comm/mixins/commandcontrol_comm_mixin.py,sha256=VhAGzR23-x8INn8VGBd7z2YUtT4GIkeTNwNK0CmAjio,4781
|
|
113
|
-
naeural_core/comm/mixins/default_comm_mixin.py,sha256=
|
|
113
|
+
naeural_core/comm/mixins/default_comm_mixin.py,sha256=CMnaYjBk5jazIygEyGXb4j4DSCPij7LgmDoCsGz6W3E,2949
|
|
114
114
|
naeural_core/comm/mixins/heartbeats_comm_mixin.py,sha256=_PqCqn4R3KmzE-2aRnMRHSMelc0v52QRms2HoZgTvFE,3307
|
|
115
115
|
naeural_core/comm/mixins/notifications_comm_mixin.py,sha256=Otcb7vUkNOu2xAiEMm0ukndBqocsr2dYdvyRv-5YLWA,1929
|
|
116
116
|
naeural_core/comm/mixins/telemetry_mixin.py,sha256=mIox-rJe1XTy3_vgi3qT6gb0SiupUksyn7CPWACZCsY,4025
|
|
@@ -137,7 +137,7 @@ naeural_core/core_logging/logger_mixins/confusion_matrix_mixin.py,sha256=fLJOeyp
|
|
|
137
137
|
naeural_core/core_logging/logger_mixins/dataframe_mixin.py,sha256=hkOtoTzoBDacpagdFYp2kawsw7rzbgLw2-_pzXLBU6Q,11491
|
|
138
138
|
naeural_core/core_logging/logger_mixins/deploy_models_in_production_mixin.py,sha256=J2j1tnt0Cd2qD31rL8Sov0sz9_T5-h6Ukd-4sl1ITcQ,5986
|
|
139
139
|
naeural_core/core_logging/logger_mixins/fit_debug_tfkeras_mixin.py,sha256=6efE5W59a3VWWR1UbPd9iNfQl0nuse7SW3IV0RFpHOc,9344
|
|
140
|
-
naeural_core/core_logging/logger_mixins/gpu_mixin.py,sha256=
|
|
140
|
+
naeural_core/core_logging/logger_mixins/gpu_mixin.py,sha256=1gh83RB8ziZJG-UdF1BnOPJWWFAq4xbbhGuQvUPUn1E,14193
|
|
141
141
|
naeural_core/core_logging/logger_mixins/grid_search_mixin.py,sha256=lo3bTyEmcsk03ttTLs0lC_N0beAC1eTiGI9kZX2ib-A,5961
|
|
142
142
|
naeural_core/core_logging/logger_mixins/histogram_mixin.py,sha256=ro5q99VXdcLSBi6XMZK4fJIXNX8n4wBzay-0SXuj7qc,6413
|
|
143
143
|
naeural_core/core_logging/logger_mixins/keras_callbacks_mixin.py,sha256=ELlTb8TycdFnuO2dMDAherlzRd1rfHWogIDq-svnZ7w,3940
|
|
@@ -344,8 +344,8 @@ naeural_core/main/epochs_manager.py,sha256=lH01Pv9E_uz5fdvh_W2dZ29hZLM0CL2NZfuYJ
|
|
|
344
344
|
naeural_core/main/geoloc.py,sha256=TEqyuNzpVqZSBCo0OOrpHYncIsHSClvRt28hgvxJ35o,24909
|
|
345
345
|
naeural_core/main/main_loop_data_handler.py,sha256=hABB65OUBhtur3rd2mYsEhdAc54jVILzybrvxml5h0s,13815
|
|
346
346
|
naeural_core/main/net_mon.py,sha256=qlyo1fqTeQy_M9VfJOxon_PBbQat0QO9Zbu_93FMbLc,88144
|
|
347
|
-
naeural_core/main/orchestrator.py,sha256=
|
|
348
|
-
naeural_core/main/ver.py,sha256=
|
|
347
|
+
naeural_core/main/orchestrator.py,sha256=0FhxLgpJN9Wjw1L7Ojx4Mi0P82u8dNDRTfUdqGh0Tqs,70626
|
|
348
|
+
naeural_core/main/ver.py,sha256=Ga9K1mXR6U7fDBbh2_lozzdOHGPe7E40abF2oYh_SWI,335
|
|
349
349
|
naeural_core/main/orchestrator_mixins/__init__.py,sha256=MNleg48vdlqsyAR8Vamjl4ahG2jwCH5kLbQN5CfU57E,149
|
|
350
350
|
naeural_core/main/orchestrator_mixins/managers_init.py,sha256=sQVqpr99a5WP9HCloYCyaWDW5J3IypEImlf703bqTF4,6692
|
|
351
351
|
naeural_core/main/orchestrator_mixins/utils.py,sha256=jMa0uStVNLQmp0VhNMRvfBDjo387ORLlUVLthRNBKqc,1866
|
|
@@ -555,7 +555,7 @@ naeural_core/utils/tracing/onnx/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm
|
|
|
555
555
|
naeural_core/utils/tracing/onnx/base_trt_scripter.py,sha256=1FelEBo7JGsc8hbJ3sevzxnM-J61nvBHz6L1VLpZrVc,2043
|
|
556
556
|
naeural_core/utils/tracing/onnx/utils.py,sha256=IKmqUWakrMWn34uJvbRjNLacdszD8jkkQBFPUhgJtOQ,5618
|
|
557
557
|
naeural_core/utils/web_app/favicon.ico,sha256=zU6-Jxx4ol1A9FJvcQELYV9DiqwqyvjPS89xQybZE74,15406
|
|
558
|
-
naeural_core-7.7.
|
|
559
|
-
naeural_core-7.7.
|
|
560
|
-
naeural_core-7.7.
|
|
561
|
-
naeural_core-7.7.
|
|
558
|
+
naeural_core-7.7.242.dist-info/METADATA,sha256=f7kZ1SrW7zJuC9edJQDUX9cBCV_bOTTxYVcpC4UnBkU,6522
|
|
559
|
+
naeural_core-7.7.242.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
|
|
560
|
+
naeural_core-7.7.242.dist-info/licenses/LICENSE,sha256=SPHPWjOdAUUUUI020nI5VNCtFjmTOlJpi1cZxyB3gKo,11339
|
|
561
|
+
naeural_core-7.7.242.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|