naeural-core 7.7.239__py3-none-any.whl → 7.7.242__py3-none-any.whl

This diff shows the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in the public registry.
naeural_core/comm/communication_manager.py CHANGED
@@ -74,13 +74,18 @@ class CommunicationManager(Manager, _ConfigHandlerMixin):
     _id = self.log.config_data.get(ct.CONFIG_STARTUP_v2.K_EE_ID, '')[:ct.EE_ALIAS_MAX_SIZE]
     return _id

+  def _has_failed_comms(self):
+    for comm in self._dct_comm_plugins.values():
+      if comm.comm_failed_after_retries:
+        return comm
+    return None

   @property
   def has_failed_comms(self):
-    for comm in self._dct_comm_plugins.values():
-      if comm.comm_failed_after_retries:
-        self.P("Detected total communication failure on comm {}. This may generate shutdown/restart.".format(comm.__class__.__name__), color='error')
-        return True
+    comm = self._has_failed_comms()
+    if comm is not None:
+      self.P("Detected total communication failure on comm {}. This may generate shutdown/restart.".format(comm.__class__.__name__), color='error')
+      return True
     return False

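For context, this refactor separates detection from logging: _has_failed_comms() silently returns the first failed comm plugin (or None), while the has_failed_comms property keeps the error log and the boolean contract. A minimal sketch of the two call patterns, assuming an already-initialized CommunicationManager instance named comm_manager (the variable name is illustrative):

  # silent probe: returns the failed comm plugin instance, or None
  failed = comm_manager._has_failed_comms()
  if failed is not None:
    print("failed comm plugin:", failed.__class__.__name__)

  # logging probe: unchanged external behaviour, returns a bool and
  # emits the "total communication failure" error line when True
  if comm_manager.has_failed_comms:
    pass  # caller may decide to shut down or restart

The orchestrator hunks further below use the silent variant to report offline status without triggering the failure log.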
naeural_core/comm/mixins/default_comm_mixin.py CHANGED
@@ -80,4 +80,3 @@ class _DefaultCommMixin(object):
     self.P('`run_thread` finished')
     self._thread_stopped = True
     return
-
naeural_core/core_logging/logger_mixins/gpu_mixin.py CHANGED
@@ -136,6 +136,28 @@ class _GPUMixin(object):

     return processes_by_uuid

+  def _get_uuid_by_index(self, timeout=1.5):
+    import subprocess, shutil
+    smi = shutil.which("nvidia-smi")
+    if not smi:
+      return {}
+
+    out = subprocess.run(
+      [smi, "--query-gpu=index,uuid", "--format=csv,noheader,nounits"],
+      capture_output=True, text=True, timeout=timeout
+    )
+    if out.returncode != 0:
+      return {}
+
+    d = {}
+    for line in out.stdout.splitlines():
+      line = line.strip()
+      if not line:
+        continue
+      idx_s, uuid = [p.strip() for p in line.split(",", 1)]
+      d[int(idx_s)] = uuid
+    return d
+
   def gpu_info(self, show=False, mb=False, current_pid=False):
     """
     Collects GPU info. Must have torch installed & non-mandatory nvidia-smi
@@ -215,20 +237,12 @@ class _GPUMixin(object):
       fan_speed, fan_speed_unit = -1, "N/A"
       if pynvml_avail:
         # --- get an NVML handle that matches torch's CUDA device ordering when possible ---
-        handle = None
-        try:
-          # This helps when CUDA_VISIBLE_DEVICES remaps indices:
-          # torch device 0 may not be NVML index 0.
-          pci_bus_id = getattr(device_props, "pci_bus_id", None)
-          if pci_bus_id:
-            if hasattr(pynvml, "nvmlDeviceGetHandleByPciBusId_v2"):
-              handle = pynvml.nvmlDeviceGetHandleByPciBusId_v2(pci_bus_id)
-            elif hasattr(pynvml, "nvmlDeviceGetHandleByPciBusId"):
-              handle = pynvml.nvmlDeviceGetHandleByPciBusId(pci_bus_id)
-        except Exception:
-          handle = None
-
-        if handle is None:
+        uuid_by_index = self._get_uuid_by_index()
+        # inside your for device_id in range(n_gpus):
+        uuid = uuid_by_index.get(device_id)
+        if uuid:
+          handle = pynvml.nvmlDeviceGetHandleByUUID(uuid)
+        else:
           handle = pynvml.nvmlDeviceGetHandleByIndex(device_id)

         # --- memory (NVML returns bytes) ---
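The new _get_uuid_by_index helper shells out to nvidia-smi to build an index-to-UUID map; gpu_info then prefers nvmlDeviceGetHandleByUUID over nvmlDeviceGetHandleByIndex whenever a UUID is known for the device index, and falls back to the index-based handle otherwise. A minimal standalone sketch of the same pattern outside the mixin, assuming pynvml (nvidia-ml-py) and nvidia-smi are installed; error handling is kept to a minimum and, depending on the pynvml version, the UUID may need to be passed as bytes:

  import shutil
  import subprocess
  import pynvml

  def uuid_by_index(timeout=1.5):
    # map nvidia-smi device index -> GPU UUID, mirroring _get_uuid_by_index above
    smi = shutil.which("nvidia-smi")
    if not smi:
      return {}
    out = subprocess.run(
      [smi, "--query-gpu=index,uuid", "--format=csv,noheader,nounits"],
      capture_output=True, text=True, timeout=timeout
    )
    if out.returncode != 0:
      return {}
    result = {}
    for line in out.stdout.splitlines():
      if line.strip():
        idx, uuid = [p.strip() for p in line.split(",", 1)]
        result[int(idx)] = uuid
    return result

  pynvml.nvmlInit()
  mapping = uuid_by_index()
  for device_id in range(pynvml.nvmlDeviceGetCount()):
    uuid = mapping.get(device_id)
    # prefer the UUID handle, fall back to the plain index-based handle
    if uuid:
      handle = pynvml.nvmlDeviceGetHandleByUUID(uuid)
    else:
      handle = pynvml.nvmlDeviceGetHandleByIndex(device_id)
    mem = pynvml.nvmlDeviceGetMemoryInfo(handle)
    print(device_id, uuid, mem.used, mem.total)
  pynvml.nvmlShutdown()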
naeural_core/main/orchestrator.py CHANGED
@@ -121,6 +121,7 @@ class Orchestrator(DecentrAIObject,
     self._current_dct_config_streams = {}
     self._should_send_initial_log = False
     self._initial_log_sent = False
+    self._last_offline_log = 0
     self.loop_timings = deque(maxlen=3600)
     self._reset_timers = False
     self.__is_mlstop_dangerous = False
@@ -709,6 +710,10 @@ class Orchestrator(DecentrAIObject,
   def cfg_main_loop_resolution(self):
     return self.config_data.get('MAIN_LOOP_RESOLUTION', 20)

+  @property
+  def cfg_work_offline(self):
+    return self.config_data.get('WORK_OFFLINE', False)
+
   @property
   def cfg_sequential_streams(self):
     """
@@ -1613,6 +1618,27 @@ class Orchestrator(DecentrAIObject,
     self._comm_manager.maybe_show_info()
     return

+  def _maybe_log_offline_status(self):
+    if not self.cfg_work_offline or self._comm_manager is None or not self._comm_manager._has_failed_comms():
+      return
+    now = time()
+    if (now - self._last_offline_log) < ct.COMMS.COMM_SECS_SHOW_INFO:
+      return
+    self._last_offline_log = now
+
+    comm_attempts = [
+      "{}:try={} fails={}".format(
+        name,
+        getattr(comm, "_nr_conn_retry_iters", None),
+        getattr(comm, "_total_conn_fails", None),
+      )
+      for name, comm in self._comm_manager._dct_comm_plugins.items()
+      if comm is not None
+    ]
+    attempts_str = "; ".join(comm_attempts) if len(comm_attempts) > 0 else "no comm plugins"
+    self.P(f"WORK_OFFLINE enabled; reconnect attempts: {attempts_str}", color='r')
+    return
+
   def _save_exception_main_loop_state(self, txt, **save_kwargs):
     fn = '{}_main_loop_exception'.format(self.log.now_str())
     self.log.save_pickle_to_output(data=save_kwargs, fn=fn + '.pickle', subfolder_path='main_loop_exceptions')
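_maybe_log_offline_status throttles its output: it returns early unless at least ct.COMMS.COMM_SECS_SHOW_INFO seconds have passed since the last offline line. A minimal self-contained sketch of that throttling pattern, with a hypothetical 30-second interval standing in for the real constant:

  from time import time

  COMM_SECS_SHOW_INFO = 30   # assumed stand-in for ct.COMMS.COMM_SECS_SHOW_INFO
  _last_offline_log = 0

  def maybe_log_offline_status(message):
    # emit at most one offline status line per COMM_SECS_SHOW_INFO seconds
    global _last_offline_log
    now = time()
    if (now - _last_offline_log) < COMM_SECS_SHOW_INFO:
      return False   # too soon, skip this loop iteration
    _last_offline_log = now
    print(message)
    return True

  maybe_log_offline_status("WORK_OFFLINE enabled; reconnect attempts: ...")  # printed
  maybe_log_offline_status("WORK_OFFLINE enabled; reconnect attempts: ...")  # throttled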
@@ -1829,6 +1855,7 @@ class Orchestrator(DecentrAIObject,
       #9. Comm info, timers, ... - later we gonna check for total comm failures
       self.__loop_stage = '9.logs'
       self.comm_manager_show_info()
+      self._maybe_log_offline_status()


      self.log.stop_timer(self._main_loop_timer_name)
@@ -1844,7 +1871,7 @@ class Orchestrator(DecentrAIObject,
       return_code = self._return_code

       self.__loop_stage = '10.checks'
-      if self.comm_manager.has_failed_comms:
+      if (not self.cfg_work_offline) and self.comm_manager.has_failed_comms:
        self.P("Shutdown initiated due to multiple failure in communication!", color='r')
        return_code = ct.CODE_EXCEPTION

naeural_core/main/ver.py CHANGED
@@ -1,4 +1,4 @@
-__VER__ = '7.7.239'
+__VER__ = '7.7.242'



naeural_core-7.7.242.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: naeural_core
-Version: 7.7.239
+Version: 7.7.242
 Summary: Ratio1 Core is the backbone of the Ratio1 Edge Protocol.
 Project-URL: Homepage, https://github.com/Ratio1/naeural_core
 Project-URL: Bug Tracker, https://github.com/Ratio1/naeural_core/issues
naeural_core-7.7.242.dist-info/RECORD CHANGED
@@ -102,7 +102,7 @@ naeural_core/business/training/minio_download_dataset.py,sha256=7uoo5CFLynWbLkBh
 naeural_core/business/training/minio_upload_dataset.py,sha256=5stm_E_L3SLwcjd2znUVMIC2PWO07f6QfBdcrKRiHCo,2162
 naeural_core/business/training/second_stage_training_process.py,sha256=z8LG9xx2G6s5AqeSD-t5rBegIctkFMEf4ZqEJVXZcz0,3152
 naeural_core/comm/__init__.py,sha256=SpAWJIyYdOouZSImzVrEF_M4-nrCrY9p3cVYwvmbt20,105
-naeural_core/comm/communication_manager.py,sha256=n70Khv464kQbseo6ch33cTel_JZkC1eItGyKah27sHM,30347
+naeural_core/comm/communication_manager.py,sha256=swMxO3DPTnT0SeSsv6PWz0gP0WsWOzA_BlYawoWHSB4,30471
 naeural_core/comm/base/__init__.py,sha256=rDzAtPwcMOsW3aCp0t07GpJz5qweLiJgafTngHwEMOo,44
 naeural_core/comm/base/base_comm_thread.py,sha256=DVPzLN9UfKxjiVf0FtjOr5001-YzkjPMmC4RJFhoGG0,27034
 naeural_core/comm/default/amqp.py,sha256=-6_qGlOIjOUz42nkwnfMQZEVfdeUKGTNgjcTuo49v4E,4145
@@ -110,7 +110,7 @@ naeural_core/comm/default/mqtt.py,sha256=F2VOOmjXkz7diFC-PVdxZgBI7O9aCwul5KfA6r4
 naeural_core/comm/default/readme.md,sha256=hNY9V5HU8yW0JjyseiPWMkV8l7YU0ZEBw_iq_lpW-Uk,162
 naeural_core/comm/mixins/__init__.py,sha256=d8o2tKAkQ-P9voRB6REnEmObVyi4AiQgNVZuAKKObKo,290
 naeural_core/comm/mixins/commandcontrol_comm_mixin.py,sha256=VhAGzR23-x8INn8VGBd7z2YUtT4GIkeTNwNK0CmAjio,4781
-naeural_core/comm/mixins/default_comm_mixin.py,sha256=aAqFPCkFcDIY6CvtmW1OkfNJtsJSobCD412euG3_j_M,2950
+naeural_core/comm/mixins/default_comm_mixin.py,sha256=CMnaYjBk5jazIygEyGXb4j4DSCPij7LgmDoCsGz6W3E,2949
 naeural_core/comm/mixins/heartbeats_comm_mixin.py,sha256=_PqCqn4R3KmzE-2aRnMRHSMelc0v52QRms2HoZgTvFE,3307
 naeural_core/comm/mixins/notifications_comm_mixin.py,sha256=Otcb7vUkNOu2xAiEMm0ukndBqocsr2dYdvyRv-5YLWA,1929
 naeural_core/comm/mixins/telemetry_mixin.py,sha256=mIox-rJe1XTy3_vgi3qT6gb0SiupUksyn7CPWACZCsY,4025
@@ -137,7 +137,7 @@ naeural_core/core_logging/logger_mixins/confusion_matrix_mixin.py,sha256=fLJOeyp
 naeural_core/core_logging/logger_mixins/dataframe_mixin.py,sha256=hkOtoTzoBDacpagdFYp2kawsw7rzbgLw2-_pzXLBU6Q,11491
 naeural_core/core_logging/logger_mixins/deploy_models_in_production_mixin.py,sha256=J2j1tnt0Cd2qD31rL8Sov0sz9_T5-h6Ukd-4sl1ITcQ,5986
 naeural_core/core_logging/logger_mixins/fit_debug_tfkeras_mixin.py,sha256=6efE5W59a3VWWR1UbPd9iNfQl0nuse7SW3IV0RFpHOc,9344
-naeural_core/core_logging/logger_mixins/gpu_mixin.py,sha256=sgtEfuROphSL03Id8n4Wd8SbxGoSuBMVx_v-N_cKvXc,14023
+naeural_core/core_logging/logger_mixins/gpu_mixin.py,sha256=1gh83RB8ziZJG-UdF1BnOPJWWFAq4xbbhGuQvUPUn1E,14193
 naeural_core/core_logging/logger_mixins/grid_search_mixin.py,sha256=lo3bTyEmcsk03ttTLs0lC_N0beAC1eTiGI9kZX2ib-A,5961
 naeural_core/core_logging/logger_mixins/histogram_mixin.py,sha256=ro5q99VXdcLSBi6XMZK4fJIXNX8n4wBzay-0SXuj7qc,6413
 naeural_core/core_logging/logger_mixins/keras_callbacks_mixin.py,sha256=ELlTb8TycdFnuO2dMDAherlzRd1rfHWogIDq-svnZ7w,3940
@@ -344,8 +344,8 @@ naeural_core/main/epochs_manager.py,sha256=lH01Pv9E_uz5fdvh_W2dZ29hZLM0CL2NZfuYJ
 naeural_core/main/geoloc.py,sha256=TEqyuNzpVqZSBCo0OOrpHYncIsHSClvRt28hgvxJ35o,24909
 naeural_core/main/main_loop_data_handler.py,sha256=hABB65OUBhtur3rd2mYsEhdAc54jVILzybrvxml5h0s,13815
 naeural_core/main/net_mon.py,sha256=qlyo1fqTeQy_M9VfJOxon_PBbQat0QO9Zbu_93FMbLc,88144
-naeural_core/main/orchestrator.py,sha256=SKnW5jWksBm2-fZcfeT5dddDmFSTSyA-CcODKX664KI,69657
-naeural_core/main/ver.py,sha256=NCXuXm5fEgpFjFfI3Rlr_co4S3oap5g9XZOHDtGhUPM,335
+naeural_core/main/orchestrator.py,sha256=0FhxLgpJN9Wjw1L7Ojx4Mi0P82u8dNDRTfUdqGh0Tqs,70626
+naeural_core/main/ver.py,sha256=Ga9K1mXR6U7fDBbh2_lozzdOHGPe7E40abF2oYh_SWI,335
 naeural_core/main/orchestrator_mixins/__init__.py,sha256=MNleg48vdlqsyAR8Vamjl4ahG2jwCH5kLbQN5CfU57E,149
 naeural_core/main/orchestrator_mixins/managers_init.py,sha256=sQVqpr99a5WP9HCloYCyaWDW5J3IypEImlf703bqTF4,6692
 naeural_core/main/orchestrator_mixins/utils.py,sha256=jMa0uStVNLQmp0VhNMRvfBDjo387ORLlUVLthRNBKqc,1866
@@ -555,7 +555,7 @@ naeural_core/utils/tracing/onnx/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm
 naeural_core/utils/tracing/onnx/base_trt_scripter.py,sha256=1FelEBo7JGsc8hbJ3sevzxnM-J61nvBHz6L1VLpZrVc,2043
 naeural_core/utils/tracing/onnx/utils.py,sha256=IKmqUWakrMWn34uJvbRjNLacdszD8jkkQBFPUhgJtOQ,5618
 naeural_core/utils/web_app/favicon.ico,sha256=zU6-Jxx4ol1A9FJvcQELYV9DiqwqyvjPS89xQybZE74,15406
-naeural_core-7.7.239.dist-info/METADATA,sha256=jWc9x_Znx1X-FNdp69ujFdvVHKIH5uiEqsS55f673E0,6522
-naeural_core-7.7.239.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
-naeural_core-7.7.239.dist-info/licenses/LICENSE,sha256=SPHPWjOdAUUUUI020nI5VNCtFjmTOlJpi1cZxyB3gKo,11339
-naeural_core-7.7.239.dist-info/RECORD,,
+naeural_core-7.7.242.dist-info/METADATA,sha256=f7kZ1SrW7zJuC9edJQDUX9cBCV_bOTTxYVcpC4UnBkU,6522
+naeural_core-7.7.242.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
+naeural_core-7.7.242.dist-info/licenses/LICENSE,sha256=SPHPWjOdAUUUUI020nI5VNCtFjmTOlJpi1cZxyB3gKo,11339
+naeural_core-7.7.242.dist-info/RECORD,,