learning-loop-node 0.11.0__py3-none-any.whl → 0.12.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of learning-loop-node might be problematic.
- learning_loop_node/data_classes/__init__.py +3 -3
- learning_loop_node/data_classes/general.py +1 -1
- learning_loop_node/data_classes/image_metadata.py +1 -1
- learning_loop_node/data_classes/training.py +62 -67
- learning_loop_node/data_exchanger.py +11 -9
- learning_loop_node/detector/detector_node.py +3 -2
- learning_loop_node/detector/outbox.py +8 -5
- learning_loop_node/helpers/environment_reader.py +2 -2
- learning_loop_node/helpers/log_conf.py +4 -1
- learning_loop_node/helpers/misc.py +7 -17
- learning_loop_node/loop_communication.py +7 -11
- learning_loop_node/node.py +10 -4
- learning_loop_node/rest.py +4 -2
- learning_loop_node/tests/detector/conftest.py +17 -21
- learning_loop_node/tests/trainer/conftest.py +18 -12
- learning_loop_node/tests/trainer/states/test_state_download_train_model.py +7 -3
- learning_loop_node/tests/trainer/states/test_state_prepare.py +0 -1
- learning_loop_node/tests/trainer/states/test_state_sync_confusion_matrix.py +2 -1
- learning_loop_node/tests/trainer/states/test_state_train.py +0 -2
- learning_loop_node/tests/trainer/test_trainer_states.py +6 -1
- learning_loop_node/tests/trainer/testing_trainer_logic.py +3 -3
- learning_loop_node/trainer/downloader.py +1 -1
- learning_loop_node/trainer/executor.py +2 -2
- learning_loop_node/trainer/rest/backdoor_controls.py +6 -6
- learning_loop_node/trainer/trainer_logic.py +7 -3
- learning_loop_node/trainer/trainer_logic_generic.py +59 -41
- learning_loop_node/trainer/trainer_node.py +18 -35
- {learning_loop_node-0.11.0.dist-info → learning_loop_node-0.12.0.dist-info}/METADATA +1 -1
- {learning_loop_node-0.11.0.dist-info → learning_loop_node-0.12.0.dist-info}/RECORD +30 -30
- {learning_loop_node-0.11.0.dist-info → learning_loop_node-0.12.0.dist-info}/WHEEL +0 -0
learning_loop_node/tests/trainer/conftest.py

@@ -30,12 +30,15 @@ async def test_initialized_trainer_node():
     node = TrainerNode(name='test', trainer_logic=trainer, uuid='NOD30000-0000-0000-0000-000000000000')
     trainer._node = node
     trainer._init_new_training(context=Context(organization='zauberzeug', project='demo'),
-
-
-
-
-
-
+                               training_config={'categories': [],
+                                                'id': '00000000-0000-0000-0000-000000000012',  # version 1.2 of demo project
+                                                'training_number': 0,
+                                                'model_variant': '',
+                                                'hyperparameters': {
+                                                    'resolution': 800,
+                                                    'flip_rl': False,
+                                                    'flip_ud': False}
+                                                })
     await node._on_startup()
     yield node
     await node._on_shutdown()

@@ -50,12 +53,15 @@ async def test_initialized_trainer():
     await node._on_startup()
     trainer._node = node
     trainer._init_new_training(context=Context(organization='zauberzeug', project='demo'),
-
-
-
-
-
-
+                               training_config={'categories': [],
+                                                'id': '00000000-0000-0000-0000-000000000012',  # version 1.2 of demo project
+                                                'training_number': 0,
+                                                'model_variant': '',
+                                                'hyperparameters': {
+                                                    'resolution': 800,
+                                                    'flip_rl': False,
+                                                    'flip_ud': False}
+                                                })
     yield trainer
     try:
         await node._on_shutdown()
learning_loop_node/tests/trainer/states/test_state_download_train_model.py

@@ -3,6 +3,7 @@ import asyncio
 import os
 
 from ....data_classes import TrainerState
+from ... import test_helper
 from ..state_helper import assert_training_state, create_active_training_file
 from ..testing_trainer_logic import TestingTrainerLogic
 

@@ -11,9 +12,12 @@ from ..testing_trainer_logic import TestingTrainerLogic
 
 async def test_downloading_is_successful(test_initialized_trainer: TestingTrainerLogic):
     trainer = test_initialized_trainer
-    create_active_training_file(trainer, training_state=TrainerState.DataDownloaded)
 
-
+    model_id = await test_helper.get_latest_model_id(project='demo')
+    create_active_training_file(trainer,
+                                base_model_uuid=model_id,
+                                training_state=TrainerState.DataDownloaded)
+
     trainer._init_from_last_training()
 
     asyncio.get_running_loop().create_task(

@@ -50,7 +54,7 @@ async def test_abort_download_model(test_initialized_trainer: TestingTrainerLogic):
 async def test_downloading_failed(test_initialized_trainer: TestingTrainerLogic):
     trainer = test_initialized_trainer
     create_active_training_file(trainer, training_state=TrainerState.DataDownloaded,
-
+                                base_model_uuid='00000000-0000-0000-0000-000000000000')  # bad model id)
     trainer._init_from_last_training()
 
     trainer._begin_training_task()
learning_loop_node/tests/trainer/states/test_state_prepare.py

@@ -20,7 +20,6 @@ async def test_preparing_is_successful(test_initialized_trainer: TestingTrainerLogic):
     await trainer._perform_state('prepare', TrainerState.DataDownloading, TrainerState.DataDownloaded, trainer._prepare)
     assert trainer_has_prepare_error(trainer) is False
     assert trainer.training.training_state == TrainerState.DataDownloaded
-    assert trainer.training.data is not None
     assert trainer.node.last_training_io.load() == trainer.training
 
 
learning_loop_node/tests/trainer/states/test_state_sync_confusion_matrix.py

@@ -19,7 +19,7 @@ def trainer_has_sync_confusion_matrix_error(trainer: TrainerLogic):
 async def test_nothing_to_sync(test_initialized_trainer: TestingTrainerLogic):
     trainer = test_initialized_trainer
 
-    #
+    # NOTE: this requires trainer to have _training
     # trainer.load_active_training()
     create_active_training_file(trainer, training_state=TrainerState.TrainingFinished)
     trainer._init_from_last_training()

@@ -40,6 +40,7 @@ async def test_unsynced_model_available__sync_successful(test_initialized_trainer: TestingTrainerLogic):
     create_active_training_file(trainer, training_state=TrainerState.TrainingFinished)
 
     trainer._init_from_last_training()
+    trainer.training.image_data = []
     trainer.has_new_model = True
 
     trainer._begin_training_task()
learning_loop_node/tests/trainer/test_trainer_states.py

@@ -14,7 +14,12 @@ def create_training() -> Training:
         context=context,
         project_folder='',
         images_folder='',
-        training_folder=''
+        training_folder='',
+        categories=[],
+        hyperparameters={},
+        model_variant='',
+        training_number=0,
+        training_state=TrainerState.Preparing)
     return training
 
 
learning_loop_node/tests/trainer/testing_trainer_logic.py

@@ -30,13 +30,13 @@ class TestingTrainerLogic(TrainerLogic):
                             PretrainedModel(name='large', label='Large', description='a large model')]
 
     # pylint: disable=unused-argument
-    async def _start_training_from_base_model(self
+    async def _start_training_from_base_model(self) -> None:
         assert self._executor is not None
         await self._executor.start('/bin/bash -c "while true; do sleep 1; done"')
 
     async def _start_training_from_scratch(self) -> None:
-        assert self.
-        await self.
+        assert self._executor is not None
+        await self._executor.start('/bin/bash -c "while true; do sleep 1; done"')
 
     def _get_new_best_training_state(self) -> Optional[TrainingStateData]:
         if self.has_new_model:
learning_loop_node/trainer/downloader.py

@@ -27,5 +27,5 @@ class TrainingsDownloader():
                 valid_image_data.append(i)
             else:
                 skipped_image_count += 1
-        logging.info(
+        logging.info('Done downloading image data for %s images.', len(image_data))
         return (valid_image_data, skipped_image_count)
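The old call bodies are truncated in this view, so the exact 0.11.0 text is unknown; what this and several later hunks consistently use is %-style placeholder arguments for log calls. A minimal, self-contained sketch of that pattern: the placeholder form hands the arguments to the logging framework, which only builds the final string if the record is actually emitted, whereas an f-string is always formatted.

import logging

logging.basicConfig(level=logging.INFO)
image_data = ['one.jpg', 'two.jpg']

# Placeholder style: formatting is deferred to the logging framework and only
# happens if the record is actually emitted.
logging.info('Done downloading image data for %s images.', len(image_data))

# Eager alternative: the message is built even when the log level filters it out.
logging.info(f'Done downloading image data for {len(image_data)} images.')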
learning_loop_node/trainer/executor.py

@@ -3,7 +3,7 @@ import logging
 import os
 import shlex
 from io import BufferedWriter
-from typing import List, Optional
+from typing import Dict, List, Optional
 
 
 class Executor:

@@ -33,7 +33,7 @@ class Executor:
         if env is not None:
             full_env.update(env)
 
-        logging.info(
+        logging.info('Starting executor with command: %s in %s - logging to %s', cmd, self.path, self.log_file_path)
         self.log_file = open(self.log_file_path, 'ab')
 
         self._process = await asyncio.create_subprocess_exec(
learning_loop_node/trainer/rest/backdoor_controls.py

@@ -29,7 +29,7 @@ async def provide_new_model(request: Request):
     if value == 'on':
         trainer_node.trainer_logic.provide_new_model = True  # type: ignore
 
-    logging.debug(
+    logging.debug('turning automatically provide_new_model %s', value)
 
 
 @router.post("/reset")

@@ -64,7 +64,7 @@ def set_error_configuration(msg: Dict, request: Request):
         get_new_model=msg.get('get_new_model', None),
         save_model=msg.get('save_model', None), )
 
-    logging.info(
+    logging.info('setting error configuration to: %s', asdict(error_configuration))
     trainer_logic = request.app.trainer_logic
 
     # NOTE: trainer_logic is MockTrainerLogic which has a property error_configuration

@@ -82,23 +82,23 @@ async def add_steps(request: Request):
 
     if not trainer_logic._executor or not trainer_logic._executor.is_running():  # pylint: disable=protected-access
         training = trainer_logic._training  # pylint: disable=protected-access
-        logging.error(
+        logging.error('cannot add steps when not running, state: %s', training.training_state if training else 'None')
         raise HTTPException(status_code=409, detail="trainer is not running")
 
     steps = int(str(await request.body(), 'utf-8'))
 
     previous_state = trainer_logic.provide_new_model  # type: ignore
     trainer_logic.provide_new_model = True  # type: ignore
-    logging.warning(
+    logging.warning('simulating newly completed models by moving %s forward', steps)
 
     for _ in range(steps):
         try:
             logging.warning('calling sync_confusion_matrix')
-            await trainer_logic.
+            await trainer_logic._sync_training()  # pylint: disable=protected-access
         except Exception:
            pass  # Tests can force synchroniation to fail, error state is reported to backend
     trainer_logic.provide_new_model = previous_state  # type: ignore
-    logging.warning(
+    logging.warning('progress increased to %s', trainer_logic.current_iteration)  # type: ignore
     await trainer_node.send_status()
 
 
learning_loop_node/trainer/trainer_logic.py

@@ -62,7 +62,7 @@ class TrainerLogic(TrainerLogicGeneric):
                 break
             self.errors.reset(error_key)
             try:
-                await self.
+                await self._sync_training()
             except asyncio.CancelledError:
                 logging.warning('CancelledError in run_training')
                 raise

@@ -130,8 +130,12 @@ class TrainerLogic(TrainerLogicGeneric):
         if self._can_resume():
             self.start_training_task = self._resume()
         else:
-
-
+            base_model_uuid_is_none = self.training.base_model_uuid is None
+            base_model_uuid_is_valid = is_valid_uuid4(self.training.base_model_uuid)
+            if not base_model_uuid_is_none and not base_model_uuid_is_valid:
+                logging.warning('base_model_uuid is not a valid uuid4: %s\n Starting training from scratch.',
+                                self.training.base_model_uuid)
+            if not base_model_uuid_is_valid:
                 self.start_training_task = self._start_training_from_scratch()
             else:
                 self.start_training_task = self._start_training_from_base_model()
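The new branch relies on is_valid_uuid4 from learning_loop_node/helpers/misc.py (imported in the trainer_logic_generic.py hunks below); its body is not part of this diff. A minimal sketch of what such a check typically looks like, under the assumption that it only validates the UUID string format, not the package's actual implementation:

import uuid


def is_valid_uuid4(value) -> bool:
    # NOTE: uuid.UUID(..., version=4) accepts any well-formed UUID string and simply
    # normalises the version bits, so this effectively checks the general UUID format.
    if value is None:
        return False
    try:
        uuid.UUID(str(value), version=4)
        return True
    except ValueError:
        return False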
learning_loop_node/trainer/trainer_logic_generic.py

@@ -10,9 +10,9 @@ from typing import TYPE_CHECKING, Callable, Coroutine, Dict, List, Optional
 
 from fastapi.encoders import jsonable_encoder
 
-from ..data_classes import (Context, Errors,
-
-from ..helpers.misc import create_project_folder, delete_all_training_folders,
+from ..data_classes import (Context, Errors, PretrainedModel, TrainerState, Training, TrainingOut, TrainingStateData,
+                            TrainingStatus)
+from ..helpers.misc import create_project_folder, delete_all_training_folders, is_valid_uuid4
 from .downloader import TrainingsDownloader
 from .exceptions import CriticalError, NodeNeedsRestartError
 from .io_helpers import ActiveTrainingIO, EnvironmentVars, LastTrainingIO

@@ -66,19 +66,12 @@ class TrainerLogicGeneric(ABC):
         return self._training
 
     @property
-    def
-        assert self.
-
-        return self.training_data.hyperparameter
+    def hyperparameters(self) -> dict:
+        assert self._training is not None, 'Training should have data'
+        return self._training.hyperparameters
 
     # ---------------------------------------- PROPERTIES ----------------------------------------
 
-    @property
-    def training_data(self) -> Optional[TrainingData]:
-        if self.training_active and self.training.data:
-            return self.training.data
-        return None
-
     @property
     def training_context(self) -> Optional[Context]:
         if self.training_active:

@@ -111,12 +104,8 @@ class TrainerLogicGeneric(ABC):
     def hyperparameters_for_state_sync(self) -> Optional[Dict]:
         """Used in sync_confusion_matrix and send_status to provide information about the training configuration.
         """
-        if self._training
-
-            information['resolution'] = self._training.data.hyperparameter.resolution
-            information['flipRl'] = self._training.data.hyperparameter.flip_rl
-            information['flipUd'] = self._training.data.hyperparameter.flip_ud
-            return information
+        if self._training:
+            return self._training.hyperparameters
         return None
 
     @property

@@ -173,6 +162,24 @@ class TrainerLogicGeneric(ABC):
     # Initializing a new training object will create the folder structure for the training.
     # The training loop will then run through the states of the training.
 
+    def generate_status_for_loop(self, trainer_uuid: str, trainer_name: str) -> TrainingStatus:
+
+        status = TrainingStatus(id=trainer_uuid,
+                                name=trainer_name,
+                                state=self.state,
+                                errors={},
+                                uptime=self.training_uptime,
+                                progress=self.general_progress)
+
+        status.pretrained_models = self.provided_pretrained_models
+        status.architecture = self.model_architecture
+
+        if self._training:
+            status.errors = self.errors.errors
+            status.context = self.training_context
+
+        return status
+
     async def try_continue_run_if_incomplete(self) -> bool:
         """Tries to continue a training if the last training was not finished.
         """

@@ -188,29 +195,30 @@ class TrainerLogicGeneric(ABC):
         """
         self._training = self.last_training_io.load()
         assert self._training is not None and self._training.training_folder is not None, 'could not restore training folder'
+        logger.info('restored training: \n%s', self._training)
         self._active_training_io = ActiveTrainingIO(
             self._training.training_folder, self.node.loop_communicator, self._training.context)
 
-    async def begin_training(self, organization: str, project: str,
+    async def begin_training(self, organization: str, project: str, training_config: Dict) -> None:
         """Called on `begin_training` event from the Learning Loop.
         """
-        self._init_new_training(Context(organization=organization, project=project),
+        self._init_new_training(Context(organization=organization, project=project), training_config)
         self._begin_training_task()
 
     def _begin_training_task(self) -> None:
         # NOTE: Task object is used to potentially cancel the task
         self.training_task = asyncio.get_event_loop().create_task(self._run())
 
-    def _init_new_training(self, context: Context,
+    def _init_new_training(self, context: Context, training_config: Dict) -> None:
         """Called on `begin_training` event from the Learning Loop.
-        Note that
+        Note that training_config needs the entries 'categories', 'model_variant' and 'training_number',
        but also the hyperparameter entries.
+        'base_model_uuid' is optional if the training is continued from a previous training.
        """
        project_folder = create_project_folder(context)
        if not self._environment_vars.keep_old_trainings:
            delete_all_training_folders(project_folder)
-        self._training = generate_training(project_folder, context)
-        self._training.set_values_from_data(details)
+        self._training = Training.generate_training(project_folder, context, training_config)
 
        self._active_training_io = ActiveTrainingIO(
            self._training.training_folder, self.node.loop_communicator, context)
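Per the docstring above, the new flat training_config dict must carry 'categories', 'model_variant', 'training_number' and the hyperparameter entries, with 'base_model_uuid' optional. A minimal caller sketch, mirroring the values from the test fixtures at the top of this diff; the exact set of accepted keys is decided by Training.generate_training, which is not shown here, and the surrounding setup is hypothetical:

async def start_demo_training(trainer) -> None:
    # `trainer` is assumed to be an instance of a TrainerLogicGeneric subclass.
    training_config = {
        'categories': [],
        'model_variant': '',
        'training_number': 0,
        'hyperparameters': {'resolution': 800, 'flip_rl': False, 'flip_ud': False},
        # 'base_model_uuid': '...',  # optional: continue from an existing model
    }
    await trainer.begin_training(organization='zauberzeug', project='demo',
                                 training_config=training_config)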
@@ -254,7 +262,7 @@ class TrainerLogicGeneric(ABC):
         elif tstate == TrainerState.TrainModelDownloaded:  # -> TrainingRunning -> TrainingFinished
             await self._perform_state('run_training', TrainerState.TrainingRunning, TrainerState.TrainingFinished, self._train)
         elif tstate == TrainerState.TrainingFinished:  # -> ConfusionMatrixSyncing -> ConfusionMatrixSynced
-            await self._perform_state('sync_confusion_matrix', TrainerState.ConfusionMatrixSyncing, TrainerState.ConfusionMatrixSynced, self.
+            await self._perform_state('sync_confusion_matrix', TrainerState.ConfusionMatrixSyncing, TrainerState.ConfusionMatrixSynced, self._sync_training)
         elif tstate == TrainerState.ConfusionMatrixSynced:  # -> TrainModelUploading -> TrainModelUploaded
             await self._perform_state('upload_model', TrainerState.TrainModelUploading, TrainerState.TrainModelUploaded, self._upload_model)
         elif tstate == TrainerState.TrainModelUploaded:  # -> Detecting -> Detected

@@ -298,6 +306,7 @@ class TrainerLogicGeneric(ABC):
                 logger.error('Node Restart Requested')
                 sys.exit(0)
             except Exception as e:
+                print('Error in %s - Exception: %s', state_during, e, flush=True)
                 self.errors.set(error_key, str(e))
                 logger.exception('Error in %s - Exception: %s', state_during, e)
                 self.training.training_state = previous_state

@@ -316,19 +325,25 @@ class TrainerLogicGeneric(ABC):
         self.node.data_exchanger.set_context(self.training.context)
         downloader = TrainingsDownloader(self.node.data_exchanger)
         image_data, skipped_image_count = await downloader.download_training_data(self.training.images_folder)
-
-        self.training.
-        self.training.
+
+        self.training.image_data = image_data
+        self.training.skipped_image_count = skipped_image_count
 
     async def _download_model(self) -> None:
         """If training is continued, the model is downloaded from the Learning Loop to the training_folder.
         The downloaded model.json file is renamed to base_model.json because a new model.json will be created during training.
         """
-        base_model_uuid = self.training.
+        base_model_uuid = self.training.base_model_uuid
+        base_model_uuid_is_none = base_model_uuid is None
+        base_model_uuid_is_valid = is_valid_uuid4(base_model_uuid)
+
+        if not base_model_uuid_is_none and not base_model_uuid_is_valid:
+            logger.warning(
+                'base model uuid was provided but was not valid (base_model_uuid: %s).\nSkipping download and starting training from scratch.', base_model_uuid)
+            return
 
-
-
-            logger.info('skipping model download. No base model provided (in form of uuid): %s', base_model_uuid)
+        if base_model_uuid_is_none:
+            logger.info('No base model provided (base_model_uuid: %s).\nStarting training from scratch.', base_model_uuid)
             return
 
         logger.info('loading model from Learning Loop')

@@ -337,19 +352,21 @@ class TrainerLogicGeneric(ABC):
         shutil.move(f'{self.training.training_folder}/model.json',
                     f'{self.training.training_folder}/base_model.json')
 
-    async def
-        """Syncronizes the
+    async def _sync_training(self) -> None:
+        """Syncronizes the training with the Learning Loop via the update_training endpoint.
         NOTE: This stage sets the errors explicitly because it may be used inside the training stage.
         """
         error_key = 'sync_confusion_matrix'
         try:
             new_best_model = self._get_new_best_training_state()
-            if new_best_model
+            if new_best_model:
                 new_training = TrainingOut(trainer_id=self.node.uuid,
+                                           trainer_name=self.node.name,
                                            confusion_matrix=new_best_model.confusion_matrix,
-                                           train_image_count=self.training.
-                                           test_image_count=self.training.
-                                           hyperparameters=self.hyperparameters_for_state_sync
+                                           train_image_count=self.training.train_image_count(),
+                                           test_image_count=self.training.test_image_count(),
+                                           hyperparameters=self.hyperparameters_for_state_sync,
+                                           best_epoch=new_best_model.epoch)
                 await asyncio.sleep(0.1)  # NOTE needed for tests.
 
                 result = await self.node.sio_client.call('update_training', (

@@ -411,7 +428,7 @@ class TrainerLogicGeneric(ABC):
     def _dump_categories_to_json(self) -> str:
         """Dumps the categories to a json file and returns the path to the file.
         """
-        content = {'categories': [asdict(c) for c in self.
+        content = {'categories': [asdict(c) for c in self._training.categories], } if self._training else None
         json_path = '/tmp/model.json'
         with open(json_path, 'w') as f:
             json.dump(content, f)

@@ -481,12 +498,13 @@ class TrainerLogicGeneric(ABC):
 
     @abstractmethod
     def _get_new_best_training_state(self) -> Optional[TrainingStateData]:
-        """Is called frequently by `
+        """Is called frequently by `_sync_training` during training to check if a new "best" model is availabe.
        Returns None if no new model could be found. Otherwise TrainingStateData(confusion_matrix, meta_information).
        `confusion_matrix` contains a dict of all classes:
            - The classes must be identified by their uuid, not their name.
            - For each class a dict with tp, fp, fn is provided (true positives, false positives, false negatives).
        `meta_information` can hold any data which is helpful for self._on_metrics_published to store weight file etc for later upload via self.get_model_files
+        `epoch` is the epoch number of the best model.
        """
        raise NotImplementedError
 
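A trainer could satisfy the abstract contract above roughly as in the sketch below: a per-class confusion matrix keyed by the category uuid with tp/fp/fn counts, optional meta_information, and the newly documented epoch. The keyword names follow the docstring and the way _sync_training reads the result; whether they are all constructor arguments of the TrainingStateData dataclass is an assumption, and the class and flag are hypothetical.

from typing import Dict, Optional

from learning_loop_node.data_classes import TrainingStateData


class MyTrainerLogic:
    """Sketch of the relevant method only; a real trainer derives from TrainerLogicGeneric."""

    def __init__(self) -> None:
        self.has_new_model = False  # hypothetical flag set by the training process

    def _get_new_best_training_state(self) -> Optional[TrainingStateData]:
        if not self.has_new_model:
            return None
        confusion_matrix: Dict = {
            # one entry per class, keyed by the category uuid (not its name)
            '11111111-2222-3333-4444-555555555555': {'tp': 42, 'fp': 3, 'fn': 5},
        }
        return TrainingStateData(confusion_matrix=confusion_matrix,
                                 meta_information={'weightfile': 'best.pt'},
                                 epoch=7)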
learning_loop_node/trainer/trainer_node.py

@@ -7,7 +7,6 @@ from typing import Dict, Optional
 from fastapi.encoders import jsonable_encoder
 from socketio import AsyncClient, exceptions
 
-from ..data_classes import TrainingStatus
 from ..node import Node
 from .io_helpers import LastTrainingIO
 from .rest import backdoor_controls

@@ -23,14 +22,15 @@ class TrainerNode(Node):
         self.last_training_io = LastTrainingIO(self.uuid)
         self.trainer_logic._last_training_io = self.last_training_io
 
-        self.
+        self._first_idle_time: float | None = None
         if os.environ.get('TRAINER_IDLE_TIMEOUT_SEC', 0.0):
-            self.
+            self._idle_timeout = float(os.environ.get('TRAINER_IDLE_TIMEOUT_SEC', 0.0))
         else:
-            self.
-        if self.
+            self._idle_timeout = 0.0
+        if self._idle_timeout:
             self.log.info(
-
+                'Trainer started with an idle_timeout of %s seconds. Note that shutdown does not work if docker container has the restart policy set to always',
+                self._idle_timeout)
 
         if use_backdoor_controls or os.environ.get('USE_BACKDOOR_CONTROLS', '0').lower() in ('1', 'true'):
             self.include_router(backdoor_controls.router, tags=["controls"])

@@ -53,8 +53,8 @@ class TrainerNode(Node):
         except exceptions.TimeoutError:
             self.log.warning('timeout when sending status to learning loop, reconnecting sio_client')
             await self.sio_client.disconnect()  # NOTE: reconnect happens in node._on_repeat
-        except Exception
-            self.log.exception(
+        except Exception:
+            self.log.exception('could not send status. Exception:')
 
     # ---------------------------------------------- NODE METHODS ---------------------------------------------------
 

@@ -68,7 +68,7 @@ class TrainerNode(Node):
 
         @sio_client.event
         async def stop_training():
-            self.log.info(
+            self.log.info('stop_training received. Current state : %s', self.trainer_logic.state)
             try:
                 await self.trainer_logic.stop()
             except Exception:

@@ -80,24 +80,7 @@ class TrainerNode(Node):
             self.log.debug('cannot send status - not connected to the Learning Loop')
             return
 
-        status =
-                                name=self.name,
-                                state=self.trainer_logic.state,
-                                errors={},
-                                uptime=self.trainer_logic.training_uptime,
-                                progress=self.trainer_logic.general_progress)
-
-        status.pretrained_models = self.trainer_logic.provided_pretrained_models
-        status.architecture = self.trainer_logic.model_architecture
-
-        if data := self.trainer_logic.training_data:
-            status.train_image_count = data.train_image_count()
-            status.test_image_count = data.test_image_count()
-            status.skipped_image_count = data.skipped_image_count
-            status.hyperparameters = self.trainer_logic.hyperparameters_for_state_sync
-            status.errors = self.trainer_logic.errors.errors
-            status.context = self.trainer_logic.training_context
-
+        status = self.trainer_logic.generate_status_for_loop(self.uuid, self.name)
         self.log.debug('sending status: %s', status.short_str())
         result = await self.sio_client.call('update_trainer', jsonable_encoder(asdict(status)), timeout=30)
         if isinstance(result, Dict) and not result['success']:

@@ -105,17 +88,17 @@ class TrainerNode(Node):
             self.log.error('Error when sending status update: Response from loop was:\n %s', result)
 
     def check_idle_timeout(self):
-        if not self.
+        if not self._idle_timeout:
             return
 
         if self.trainer_logic.state == 'idle':
-            if self.
-                self.
-            idle_time = time.time() - self.
-            if idle_time > self.
+            if self._first_idle_time is None:
+                self._first_idle_time = time.time()
+            idle_time = time.time() - self._first_idle_time
+            if idle_time > self._idle_timeout:
                 self.log.info('Trainer has been idle for %.2f s (with timeout %.2f s). Shutting down.',
-
+                              idle_time, self._idle_timeout)
                 sys.exit(0)
-            self.log.debug('idle time: %.2f s / %.2f s', idle_time, self.
+            self.log.debug('idle time: %.2f s / %.2f s', idle_time, self._idle_timeout)
         else:
-            self.
+            self._first_idle_time = None
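The renamed private attributes are driven by the TRAINER_IDLE_TIMEOUT_SEC environment variable read in the constructor hunk above; an unset or empty value leaves _idle_timeout at 0.0, which makes check_idle_timeout a no-op. A small self-contained sketch that mirrors that resolution logic (not the package's own code):

import os


def resolve_idle_timeout() -> float:
    # Mirrors the TrainerNode constructor above: unset or empty means 0.0, i.e. the idle check is disabled.
    raw = os.environ.get('TRAINER_IDLE_TIMEOUT_SEC', 0.0)
    return float(raw) if raw else 0.0


os.environ['TRAINER_IDLE_TIMEOUT_SEC'] = '3600'  # e.g. shut the trainer down after one idle hour
assert resolve_idle_timeout() == 3600.0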