learning-loop-node: 0.10.14 (py3-none-any.whl) → 0.10.15rc1 (py3-none-any.whl)
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of learning-loop-node might be problematic. Click here for more details.
- learning_loop_node/loop_communication.py +7 -4
- learning_loop_node/node.py +11 -3
- learning_loop_node/trainer/exceptions.py +11 -1
- learning_loop_node/trainer/trainer_logic_generic.py +4 -1
- {learning_loop_node-0.10.14.dist-info → learning_loop_node-0.10.15rc1.dist-info}/METADATA +1 -1
- {learning_loop_node-0.10.14.dist-info → learning_loop_node-0.10.15rc1.dist-info}/RECORD +7 -7
- {learning_loop_node-0.10.14.dist-info → learning_loop_node-0.10.15rc1.dist-info}/WHEEL +0 -0
learning_loop_node/loop_communication.py
CHANGED
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
import asyncio
|
|
2
2
|
import logging
|
|
3
|
+
import time
|
|
3
4
|
from typing import Awaitable, Callable, List, Optional
|
|
4
5
|
|
|
5
6
|
import httpx
|
|
@@ -33,7 +34,6 @@ class LoopCommunicator():
|
|
|
33
34
|
base_url=self.base_url, timeout=Timeout(60.0), verify=self.ssl_cert_path)
|
|
34
35
|
else:
|
|
35
36
|
self.async_client = httpx.AsyncClient(base_url=self.base_url, timeout=Timeout(60.0))
|
|
36
|
-
self.async_client.cookies.clear()
|
|
37
37
|
|
|
38
38
|
logging.info(f'Loop interface initialized with base_url: {self.base_url} / user: {self.username}')
|
|
39
39
|
|
|
@@ -68,16 +68,19 @@ class LoopCommunicator():
|
|
|
68
68
|
if self.async_client is not None and not self.async_client.is_closed:
|
|
69
69
|
await self.async_client.aclose()
|
|
70
70
|
|
|
71
|
-
async def backend_ready(self) -> bool:
|
|
71
|
+
async def backend_ready(self, timeout: Optional[int] = None) -> bool:
|
|
72
72
|
"""Wait until the backend is ready"""
|
|
73
|
+
start_time = time.time()
|
|
73
74
|
while True:
|
|
74
75
|
try:
|
|
75
76
|
logging.info('Checking if backend is ready')
|
|
76
77
|
response = await self.get('/status', requires_login=False)
|
|
77
78
|
if response.status_code == 200:
|
|
78
79
|
return True
|
|
79
|
-
except Exception
|
|
80
|
-
logging.info(
|
|
80
|
+
except Exception:
|
|
81
|
+
logging.info('backend not ready yet.')
|
|
82
|
+
if timeout is not None and time.time() + 10 - start_time > timeout:
|
|
83
|
+
raise TimeoutError('Backend not ready within timeout')
|
|
81
84
|
await asyncio.sleep(10)
|
|
82
85
|
|
|
83
86
|
async def retry_on_401(self, func: Callable[..., Awaitable[httpx.Response]], *args, **kwargs) -> httpx.Response:
|
learning_loop_node/node.py
CHANGED
|
@@ -126,6 +126,8 @@ class Node(FastAPI):
|
|
|
126
126
|
await self.on_repeat()
|
|
127
127
|
except asyncio.CancelledError:
|
|
128
128
|
return
|
|
129
|
+
except TimeoutError:
|
|
130
|
+
self.log.debug('Backend not ready within timeout, skipping repeat loop')
|
|
129
131
|
except Exception:
|
|
130
132
|
self.log.exception('error in repeat loop')
|
|
131
133
|
|
|
@@ -140,6 +142,7 @@ class Node(FastAPI):
|
|
|
140
142
|
async def reconnect_to_loop(self):
|
|
141
143
|
"""Initialize the loop communicator, log in if needed and reconnect to the loop via socket.io."""
|
|
142
144
|
self.init_loop_communicator()
|
|
145
|
+
await self.loop_communicator.backend_ready(timeout=5)
|
|
143
146
|
if self.needs_login:
|
|
144
147
|
await self.loop_communicator.ensure_login(relogin=True)
|
|
145
148
|
try:
|
|
@@ -162,7 +165,8 @@ class Node(FastAPI):
|
|
|
162
165
|
The current client is disconnected and deleted if it already exists."""
|
|
163
166
|
|
|
164
167
|
self.log.debug('-------------- Connecting to loop via socket.io -------------------')
|
|
165
|
-
|
|
168
|
+
cookies = self.loop_communicator.get_cookies()
|
|
169
|
+
self.log.debug('HTTP Cookies: %s\n', cookies)
|
|
166
170
|
|
|
167
171
|
if self._sio_client is not None:
|
|
168
172
|
try:
|
|
@@ -185,8 +189,12 @@ class Node(FastAPI):
|
|
|
185
189
|
ssl_context.verify_mode = ssl.CERT_REQUIRED
|
|
186
190
|
connector = TCPConnector(ssl=ssl_context)
|
|
187
191
|
|
|
188
|
-
self.
|
|
189
|
-
|
|
192
|
+
if self.needs_login:
|
|
193
|
+
self._sio_client = AsyncClient(request_timeout=20, http_session=aiohttp.ClientSession(
|
|
194
|
+
cookies=cookies, connector=connector))
|
|
195
|
+
else:
|
|
196
|
+
self._sio_client = AsyncClient(request_timeout=20, http_session=aiohttp.ClientSession(
|
|
197
|
+
connector=connector))
|
|
190
198
|
|
|
191
199
|
# pylint: disable=protected-access
|
|
192
200
|
self._sio_client._trigger_event = ensure_socket_response(self._sio_client._trigger_event)
|
learning_loop_node/trainer/exceptions.py
CHANGED
|
@@ -1,2 +1,12 @@
|
|
|
1
1
|
class CriticalError(Exception):
|
|
2
|
-
|
|
2
|
+
'''
|
|
3
|
+
CriticalError is raised when the training cannot be continued.
|
|
4
|
+
In this case the trainer jumps to the TrainerState.ReadyForCleanup and tries to upload the latest model.
|
|
5
|
+
'''
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class NodeNeedsRestartError(Exception):
|
|
9
|
+
'''
|
|
10
|
+
NodeNeedsRestartError is raised when the node needs to be restarted.
|
|
11
|
+
This is e.g. the case when the GPU is not available anymore.
|
|
12
|
+
'''
|
learning_loop_node/trainer/trainer_logic_generic.py
CHANGED
|
@@ -14,7 +14,7 @@ from ..data_classes import (Context, Errors, Hyperparameter, PretrainedModel, Tr
|
|
|
14
14
|
TrainingOut, TrainingStateData)
|
|
15
15
|
from ..helpers.misc import create_project_folder, delete_all_training_folders, generate_training, is_valid_uuid4
|
|
16
16
|
from .downloader import TrainingsDownloader
|
|
17
|
-
from .exceptions import CriticalError
|
|
17
|
+
from .exceptions import CriticalError, NodeNeedsRestartError
|
|
18
18
|
from .io_helpers import ActiveTrainingIO, EnvironmentVars, LastTrainingIO
|
|
19
19
|
|
|
20
20
|
if TYPE_CHECKING:
|
|
@@ -294,6 +294,9 @@ class TrainerLogicGeneric(ABC):
|
|
|
294
294
|
logger.error('CriticalError in %s - Exception: %s', state_during, e)
|
|
295
295
|
self.errors.set(error_key, str(e))
|
|
296
296
|
self.training.training_state = TrainerState.ReadyForCleanup
|
|
297
|
+
except NodeNeedsRestartError:
|
|
298
|
+
logger.error('Node Restart Requested')
|
|
299
|
+
sys.exit(0)
|
|
297
300
|
except Exception as e:
|
|
298
301
|
self.errors.set(error_key, str(e))
|
|
299
302
|
logger.exception('Error in %s - Exception: %s', state_during, e)
|
{learning_loop_node-0.10.14.dist-info → learning_loop_node-0.10.15rc1.dist-info}/RECORD
CHANGED
|
@@ -31,8 +31,8 @@ learning_loop_node/helpers/environment_reader.py,sha256=OtCTDc0KT9r-SMygkZB_Mw-Z
|
|
|
31
31
|
learning_loop_node/helpers/gdrive_downloader.py,sha256=zeYJciTAJVRpu_eFjwgYLCpIa6hU1d71anqEBb564Rk,1145
|
|
32
32
|
learning_loop_node/helpers/log_conf.py,sha256=z_0PHh7U7DkJbSbKoSPyUfS7NhBHtRxXHdNcj67Hpbc,951
|
|
33
33
|
learning_loop_node/helpers/misc.py,sha256=j4is8Rv0ttnCqF-R-wP3xwEi67OI6IBJav5Woo5lyDk,7701
|
|
34
|
-
learning_loop_node/loop_communication.py,sha256=
|
|
35
|
-
learning_loop_node/node.py,sha256=
|
|
34
|
+
learning_loop_node/loop_communication.py,sha256=xkoZtHRgxq1arusHQtC_lEBculFBLeCijyfVSYIEchY,6755
|
|
35
|
+
learning_loop_node/node.py,sha256=vbMR_6QsruB2IYYKUWx4--9Ywjf_vuBQb4jyzLRqpRQ,10300
|
|
36
36
|
learning_loop_node/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
37
37
|
learning_loop_node/rest.py,sha256=o1dl4Mtznd5duyEQtCYSGlK04l1Y-p_YRjG40Q4l31c,1491
|
|
38
38
|
learning_loop_node/tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
@@ -81,15 +81,15 @@ learning_loop_node/tests/trainer/test_trainer_states.py,sha256=djYCs5ieajQHRjk8Q
|
|
|
81
81
|
learning_loop_node/tests/trainer/testing_trainer_logic.py,sha256=KslqDJDntkgH4Yd_z-guiVPvzi5Q-l-Bqc3fUjT5N7U,3883
|
|
82
82
|
learning_loop_node/trainer/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
83
83
|
learning_loop_node/trainer/downloader.py,sha256=Qk-oBcrGCVuWTVs3hvAJzQSqCIHPGZ7NXLJ_fAqvCoY,1469
|
|
84
|
-
learning_loop_node/trainer/exceptions.py,sha256=
|
|
84
|
+
learning_loop_node/trainer/exceptions.py,sha256=vbuoE6kssLQuA8zd3LiDHmZglP6E2IJJwEi5AZtWXxY,420
|
|
85
85
|
learning_loop_node/trainer/executor.py,sha256=-0BxDqmAI1NCiISi7Rw8McJQfgxxVy1gSa1epYuL3U0,3942
|
|
86
86
|
learning_loop_node/trainer/io_helpers.py,sha256=hGEtNAQBSBbVB56U1ndwfP8qK5K4YIwMQrjCDcaMy9I,7218
|
|
87
87
|
learning_loop_node/trainer/rest/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
88
88
|
learning_loop_node/trainer/rest/backdoor_controls.py,sha256=-pU4iHheBWf0SW2QzBVBsLiCMZBRz9CDdVZv6414Ts8,5134
|
|
89
89
|
learning_loop_node/trainer/test_executor.py,sha256=6BVGDN_6f5GEMMEvDLSG1yzMybSvgXaP5uYpSfsVPP0,2224
|
|
90
90
|
learning_loop_node/trainer/trainer_logic.py,sha256=PlYExIskU9pWJO0e9m_0KJnUdOI10GtW0oDOevYmg1o,8461
|
|
91
|
-
learning_loop_node/trainer/trainer_logic_generic.py,sha256=
|
|
91
|
+
learning_loop_node/trainer/trainer_logic_generic.py,sha256=ERfuGhHGNvIPRyd_QOGavylPDXTCC8qCOO1eJXAwEO8,25957
|
|
92
92
|
learning_loop_node/trainer/trainer_node.py,sha256=8ANS9iy-swdTLvt9wEFixE6YlmqvqBl17A-R4tVYD-I,5384
|
|
93
|
-
learning_loop_node-0.10.
|
|
94
|
-
learning_loop_node-0.10.
|
|
95
|
-
learning_loop_node-0.10.
|
|
93
|
+
learning_loop_node-0.10.15rc1.dist-info/METADATA,sha256=uPPK1CQexq-mVs_1oXnzd3CrGTvieHyApPYipMsOud8,11910
|
|
94
|
+
learning_loop_node-0.10.15rc1.dist-info/WHEEL,sha256=WGfLGfLX43Ei_YORXSnT54hxFygu34kMpcQdmgmEwCQ,88
|
|
95
|
+
learning_loop_node-0.10.15rc1.dist-info/RECORD,,
|
|
File without changes
|