learning-loop-node 0.11.0__py3-none-any.whl → 0.12.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

Files changed (30)
  1. learning_loop_node/data_classes/__init__.py +3 -3
  2. learning_loop_node/data_classes/general.py +1 -1
  3. learning_loop_node/data_classes/image_metadata.py +1 -1
  4. learning_loop_node/data_classes/training.py +62 -67
  5. learning_loop_node/data_exchanger.py +11 -9
  6. learning_loop_node/detector/detector_node.py +3 -2
  7. learning_loop_node/detector/outbox.py +8 -5
  8. learning_loop_node/helpers/environment_reader.py +2 -2
  9. learning_loop_node/helpers/log_conf.py +4 -1
  10. learning_loop_node/helpers/misc.py +7 -17
  11. learning_loop_node/loop_communication.py +7 -11
  12. learning_loop_node/node.py +10 -4
  13. learning_loop_node/rest.py +4 -2
  14. learning_loop_node/tests/detector/conftest.py +17 -21
  15. learning_loop_node/tests/trainer/conftest.py +18 -12
  16. learning_loop_node/tests/trainer/states/test_state_download_train_model.py +7 -3
  17. learning_loop_node/tests/trainer/states/test_state_prepare.py +0 -1
  18. learning_loop_node/tests/trainer/states/test_state_sync_confusion_matrix.py +2 -1
  19. learning_loop_node/tests/trainer/states/test_state_train.py +0 -2
  20. learning_loop_node/tests/trainer/test_trainer_states.py +6 -1
  21. learning_loop_node/tests/trainer/testing_trainer_logic.py +3 -3
  22. learning_loop_node/trainer/downloader.py +1 -1
  23. learning_loop_node/trainer/executor.py +2 -2
  24. learning_loop_node/trainer/rest/backdoor_controls.py +6 -6
  25. learning_loop_node/trainer/trainer_logic.py +7 -3
  26. learning_loop_node/trainer/trainer_logic_generic.py +59 -41
  27. learning_loop_node/trainer/trainer_node.py +18 -35
  28. {learning_loop_node-0.11.0.dist-info → learning_loop_node-0.12.0.dist-info}/METADATA +1 -1
  29. {learning_loop_node-0.11.0.dist-info → learning_loop_node-0.12.0.dist-info}/RECORD +30 -30
  30. {learning_loop_node-0.11.0.dist-info → learning_loop_node-0.12.0.dist-info}/WHEEL +0 -0
@@ -30,12 +30,15 @@ async def test_initialized_trainer_node():
  node = TrainerNode(name='test', trainer_logic=trainer, uuid='NOD30000-0000-0000-0000-000000000000')
  trainer._node = node
  trainer._init_new_training(context=Context(organization='zauberzeug', project='demo'),
- details={'categories': [],
- 'id': '00000000-0000-0000-0000-000000000012', # version 1.2 of demo project
- 'training_number': 0,
- 'resolution': 800,
- 'flip_rl': False,
- 'flip_ud': False})
+ training_config={'categories': [],
+ 'id': '00000000-0000-0000-0000-000000000012', # version 1.2 of demo project
+ 'training_number': 0,
+ 'model_variant': '',
+ 'hyperparameters': {
+ 'resolution': 800,
+ 'flip_rl': False,
+ 'flip_ud': False}
+ })
  await node._on_startup()
  yield node
  await node._on_shutdown()
@@ -50,12 +53,15 @@ async def test_initialized_trainer():
  await node._on_startup()
  trainer._node = node
  trainer._init_new_training(context=Context(organization='zauberzeug', project='demo'),
- details={'categories': [],
- 'id': '00000000-0000-0000-0000-000000000012', # version 1.2 of demo project
- 'training_number': 0,
- 'resolution': 800,
- 'flip_rl': False,
- 'flip_ud': False})
+ training_config={'categories': [],
+ 'id': '00000000-0000-0000-0000-000000000012', # version 1.2 of demo project
+ 'training_number': 0,
+ 'model_variant': '',
+ 'hyperparameters': {
+ 'resolution': 800,
+ 'flip_rl': False,
+ 'flip_ud': False}
+ })
  yield trainer
  try:
  await node._on_shutdown()
@@ -3,6 +3,7 @@ import asyncio
  import os

  from ....data_classes import TrainerState
+ from ... import test_helper
  from ..state_helper import assert_training_state, create_active_training_file
  from ..testing_trainer_logic import TestingTrainerLogic

@@ -11,9 +12,12 @@ from ..testing_trainer_logic import TestingTrainerLogic

  async def test_downloading_is_successful(test_initialized_trainer: TestingTrainerLogic):
  trainer = test_initialized_trainer
- create_active_training_file(trainer, training_state=TrainerState.DataDownloaded)

- trainer.model_format = 'mocked'
+ model_id = await test_helper.get_latest_model_id(project='demo')
+ create_active_training_file(trainer,
+ base_model_uuid=model_id,
+ training_state=TrainerState.DataDownloaded)
+
  trainer._init_from_last_training()

  asyncio.get_running_loop().create_task(
@@ -50,7 +54,7 @@ async def test_abort_download_model(test_initialized_trainer: TestingTrainerLogi
  async def test_downloading_failed(test_initialized_trainer: TestingTrainerLogic):
  trainer = test_initialized_trainer
  create_active_training_file(trainer, training_state=TrainerState.DataDownloaded,
- base_model_uuid_or_name='00000000-0000-0000-0000-000000000000') # bad model id)
+ base_model_uuid='00000000-0000-0000-0000-000000000000') # bad model id)
  trainer._init_from_last_training()

  trainer._begin_training_task()
@@ -20,7 +20,6 @@ async def test_preparing_is_successful(test_initialized_trainer: TestingTrainerL
  await trainer._perform_state('prepare', TrainerState.DataDownloading, TrainerState.DataDownloaded, trainer._prepare)
  assert trainer_has_prepare_error(trainer) is False
  assert trainer.training.training_state == TrainerState.DataDownloaded
- assert trainer.training.data is not None
  assert trainer.node.last_training_io.load() == trainer.training


@@ -19,7 +19,7 @@ def trainer_has_sync_confusion_matrix_error(trainer: TrainerLogic):
  async def test_nothing_to_sync(test_initialized_trainer: TestingTrainerLogic):
  trainer = test_initialized_trainer

- # TODO this requires trainer to have _training
+ # NOTE: this requires trainer to have _training
  # trainer.load_active_training()
  create_active_training_file(trainer, training_state=TrainerState.TrainingFinished)
  trainer._init_from_last_training()
@@ -40,6 +40,7 @@ async def test_unsynced_model_available__sync_successful(test_initialized_traine
  create_active_training_file(trainer, training_state=TrainerState.TrainingFinished)

  trainer._init_from_last_training()
+ trainer.training.image_data = []
  trainer.has_new_model = True

  trainer._begin_training_task()
@@ -1,5 +1,3 @@
- import asyncio
-
  from ....data_classes import TrainerState
  from ...test_helper import condition
  from ..state_helper import assert_training_state, create_active_training_file
@@ -14,7 +14,12 @@ def create_training() -> Training:
  context=context,
  project_folder='',
  images_folder='',
- training_folder='')
+ training_folder='',
+ categories=[],
+ hyperparameters={},
+ model_variant='',
+ training_number=0,
+ training_state=TrainerState.Preparing)
  return training


@@ -30,13 +30,13 @@ class TestingTrainerLogic(TrainerLogic):
  PretrainedModel(name='large', label='Large', description='a large model')]

  # pylint: disable=unused-argument
- async def _start_training_from_base_model(self, model: str = 'model.model') -> None:
+ async def _start_training_from_base_model(self) -> None:
  assert self._executor is not None
  await self._executor.start('/bin/bash -c "while true; do sleep 1; done"')

  async def _start_training_from_scratch(self) -> None:
- assert self.training.base_model_uuid_or_name is not None, 'base_model_uuid_or_name must be set'
- await self._start_training_from_base_model(model=f'model_{self.training.base_model_uuid_or_name}.pt')
+ assert self._executor is not None
+ await self._executor.start('/bin/bash -c "while true; do sleep 1; done"')

  def _get_new_best_training_state(self) -> Optional[TrainingStateData]:
  if self.has_new_model:
@@ -27,5 +27,5 @@ class TrainingsDownloader():
  valid_image_data.append(i)
  else:
  skipped_image_count += 1
- logging.info(f'Done downloading image data for {len(image_data)} images.')
+ logging.info('Done downloading image data for %s images.', len(image_data))
  return (valid_image_data, skipped_image_count)
@@ -3,7 +3,7 @@ import logging
  import os
  import shlex
  from io import BufferedWriter
- from typing import List, Optional, Dict
+ from typing import Dict, List, Optional


  class Executor:
@@ -33,7 +33,7 @@ class Executor:
  if env is not None:
  full_env.update(env)

- logging.info(f'Starting executor with command: {cmd} in {self.path} - logging to {self.log_file_path}')
+ logging.info('Starting executor with command: %s in %s - logging to %s', cmd, self.path, self.log_file_path)
  self.log_file = open(self.log_file_path, 'ab')

  self._process = await asyncio.create_subprocess_exec(
@@ -29,7 +29,7 @@ async def provide_new_model(request: Request):
  if value == 'on':
  trainer_node.trainer_logic.provide_new_model = True # type: ignore

- logging.debug(f'turning automatically provide_new_model {value}')
+ logging.debug('turning automatically provide_new_model %s', value)


  @router.post("/reset")
@@ -64,7 +64,7 @@ def set_error_configuration(msg: Dict, request: Request):
  get_new_model=msg.get('get_new_model', None),
  save_model=msg.get('save_model', None), )

- logging.info(f'setting error configuration to: {asdict(error_configuration)}')
+ logging.info('setting error configuration to: %s', asdict(error_configuration))
  trainer_logic = request.app.trainer_logic

  # NOTE: trainer_logic is MockTrainerLogic which has a property error_configuration
@@ -82,23 +82,23 @@ async def add_steps(request: Request):

  if not trainer_logic._executor or not trainer_logic._executor.is_running(): # pylint: disable=protected-access
  training = trainer_logic._training # pylint: disable=protected-access
- logging.error(f'cannot add steps when not running, state: {training.training_state if training else "None"}')
+ logging.error('cannot add steps when not running, state: %s', training.training_state if training else 'None')
  raise HTTPException(status_code=409, detail="trainer is not running")

  steps = int(str(await request.body(), 'utf-8'))

  previous_state = trainer_logic.provide_new_model # type: ignore
  trainer_logic.provide_new_model = True # type: ignore
- logging.warning(f'simulating newly completed models by moving {steps} forward')
+ logging.warning('simulating newly completed models by moving %s forward', steps)

  for _ in range(steps):
  try:
  logging.warning('calling sync_confusion_matrix')
- await trainer_logic._sync_confusion_matrix() # pylint: disable=protected-access
+ await trainer_logic._sync_training() # pylint: disable=protected-access
  except Exception:
  pass # Tests can force synchroniation to fail, error state is reported to backend
  trainer_logic.provide_new_model = previous_state # type: ignore
- logging.warning(f'progress increased to {trainer_logic.current_iteration}') # type: ignore
+ logging.warning('progress increased to %s', trainer_logic.current_iteration) # type: ignore
  await trainer_node.send_status()


@@ -62,7 +62,7 @@ class TrainerLogic(TrainerLogicGeneric):
  break
  self.errors.reset(error_key)
  try:
- await self._sync_confusion_matrix()
+ await self._sync_training()
  except asyncio.CancelledError:
  logging.warning('CancelledError in run_training')
  raise
@@ -130,8 +130,12 @@ class TrainerLogic(TrainerLogicGeneric):
  if self._can_resume():
  self.start_training_task = self._resume()
  else:
- base_model_uuid_or_name = self.training.base_model_uuid_or_name
- if not is_valid_uuid4(base_model_uuid_or_name):
+ base_model_uuid_is_none = self.training.base_model_uuid is None
+ base_model_uuid_is_valid = is_valid_uuid4(self.training.base_model_uuid)
+ if not base_model_uuid_is_none and not base_model_uuid_is_valid:
+ logging.warning('base_model_uuid is not a valid uuid4: %s\n Starting training from scratch.',
+ self.training.base_model_uuid)
+ if not base_model_uuid_is_valid:
  self.start_training_task = self._start_training_from_scratch()
  else:
  self.start_training_task = self._start_training_from_base_model()
@@ -10,9 +10,9 @@ from typing import TYPE_CHECKING, Callable, Coroutine, Dict, List, Optional

  from fastapi.encoders import jsonable_encoder

- from ..data_classes import (Context, Errors, Hyperparameter, PretrainedModel, TrainerState, Training, TrainingData,
- TrainingOut, TrainingStateData)
- from ..helpers.misc import create_project_folder, delete_all_training_folders, generate_training, is_valid_uuid4
+ from ..data_classes import (Context, Errors, PretrainedModel, TrainerState, Training, TrainingOut, TrainingStateData,
+ TrainingStatus)
+ from ..helpers.misc import create_project_folder, delete_all_training_folders, is_valid_uuid4
  from .downloader import TrainingsDownloader
  from .exceptions import CriticalError, NodeNeedsRestartError
  from .io_helpers import ActiveTrainingIO, EnvironmentVars, LastTrainingIO
@@ -66,19 +66,12 @@ class TrainerLogicGeneric(ABC):
  return self._training

  @property
- def hyperparameter(self) -> Hyperparameter:
- assert self.training_data is not None, 'Training should have data'
- assert self.training_data.hyperparameter is not None, 'Training.data should have hyperparameter'
- return self.training_data.hyperparameter
+ def hyperparameters(self) -> dict:
+ assert self._training is not None, 'Training should have data'
+ return self._training.hyperparameters

  # ---------------------------------------- PROPERTIES ----------------------------------------

- @property
- def training_data(self) -> Optional[TrainingData]:
- if self.training_active and self.training.data:
- return self.training.data
- return None
-
  @property
  def training_context(self) -> Optional[Context]:
  if self.training_active:
@@ -111,12 +104,8 @@ class TrainerLogicGeneric(ABC):
  def hyperparameters_for_state_sync(self) -> Optional[Dict]:
  """Used in sync_confusion_matrix and send_status to provide information about the training configuration.
  """
- if self._training and self._training.data and self._training.data.hyperparameter:
- information = {}
- information['resolution'] = self._training.data.hyperparameter.resolution
- information['flipRl'] = self._training.data.hyperparameter.flip_rl
- information['flipUd'] = self._training.data.hyperparameter.flip_ud
- return information
+ if self._training:
+ return self._training.hyperparameters
  return None

  @property
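Note on this hunk: the loop now receives the training's `hyperparameters` dict as stored in the training config instead of a hand-built subset, so the key names sent to the loop follow the config (for example `flip_rl`/`flip_ud` rather than the previously remapped `flipRl`/`flipUd`). A rough sketch of the two payloads, using the hyperparameter values from the test fixtures in this diff (illustration only, not part of the package):

    # illustration only; values taken from the conftest fixtures in this diff
    payload_0_11 = {'resolution': 800, 'flipRl': False, 'flipUd': False}    # 0.11.0: hand-picked, remapped keys
    payload_0_12 = {'resolution': 800, 'flip_rl': False, 'flip_ud': False}  # 0.12.0: training.hyperparameters passed through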
@@ -173,6 +162,24 @@ class TrainerLogicGeneric(ABC):
  # Initializing a new training object will create the folder structure for the training.
  # The training loop will then run through the states of the training.

+ def generate_status_for_loop(self, trainer_uuid: str, trainer_name: str) -> TrainingStatus:
+
+ status = TrainingStatus(id=trainer_uuid,
+ name=trainer_name,
+ state=self.state,
+ errors={},
+ uptime=self.training_uptime,
+ progress=self.general_progress)
+
+ status.pretrained_models = self.provided_pretrained_models
+ status.architecture = self.model_architecture
+
+ if self._training:
+ status.errors = self.errors.errors
+ status.context = self.training_context
+
+ return status
+

  async def try_continue_run_if_incomplete(self) -> bool:
  """Tries to continue a training if the last training was not finished.
@@ -188,29 +195,30 @@ class TrainerLogicGeneric(ABC):
  """
  self._training = self.last_training_io.load()
  assert self._training is not None and self._training.training_folder is not None, 'could not restore training folder'
+ logger.info('restored training: \n%s', self._training)
  self._active_training_io = ActiveTrainingIO(
  self._training.training_folder, self.node.loop_communicator, self._training.context)

- async def begin_training(self, organization: str, project: str, details: Dict) -> None:
+ async def begin_training(self, organization: str, project: str, training_config: Dict) -> None:
  """Called on `begin_training` event from the Learning Loop.
  """
- self._init_new_training(Context(organization=organization, project=project), details)
+ self._init_new_training(Context(organization=organization, project=project), training_config)
  self._begin_training_task()

  def _begin_training_task(self) -> None:
  # NOTE: Task object is used to potentially cancel the task
  self.training_task = asyncio.get_event_loop().create_task(self._run())

- def _init_new_training(self, context: Context, details: Dict) -> None:
+ def _init_new_training(self, context: Context, training_config: Dict) -> None:
  """Called on `begin_training` event from the Learning Loop.
- Note that details needs the entries 'categories' and 'training_number',
+ Note that training_config needs the entries 'categories', 'model_variant' and 'training_number',
  but also the hyperparameter entries.
+ 'base_model_uuid' is optional if the training is continued from a previous training.
  """
  project_folder = create_project_folder(context)
  if not self._environment_vars.keep_old_trainings:
  delete_all_training_folders(project_folder)
- self._training = generate_training(project_folder, context)
- self._training.set_values_from_data(details)
+ self._training = Training.generate_training(project_folder, context, training_config)

  self._active_training_io = ActiveTrainingIO(
  self._training.training_folder, self.node.loop_communicator, context)
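For reference, a `training_config` matching what `_init_new_training` and `begin_training` now expect can be assembled from the test fixtures in this diff; the sketch below reuses those fixture values and is purely illustrative (the optional `base_model_uuid` entry is commented out):

    # sketch based on the conftest.py fixtures shown above; values are test data, not defaults
    training_config = {
        'categories': [],
        'id': '00000000-0000-0000-0000-000000000012',  # id taken from the test fixture
        'training_number': 0,
        'model_variant': '',
        # 'base_model_uuid': '...',  # optional: set to continue from a previous training
        'hyperparameters': {'resolution': 800, 'flip_rl': False, 'flip_ud': False},
    }
    trainer._init_new_training(context=Context(organization='zauberzeug', project='demo'),
                               training_config=training_config)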
@@ -254,7 +262,7 @@ class TrainerLogicGeneric(ABC):
  elif tstate == TrainerState.TrainModelDownloaded: # -> TrainingRunning -> TrainingFinished
  await self._perform_state('run_training', TrainerState.TrainingRunning, TrainerState.TrainingFinished, self._train)
  elif tstate == TrainerState.TrainingFinished: # -> ConfusionMatrixSyncing -> ConfusionMatrixSynced
- await self._perform_state('sync_confusion_matrix', TrainerState.ConfusionMatrixSyncing, TrainerState.ConfusionMatrixSynced, self._sync_confusion_matrix)
+ await self._perform_state('sync_confusion_matrix', TrainerState.ConfusionMatrixSyncing, TrainerState.ConfusionMatrixSynced, self._sync_training)
  elif tstate == TrainerState.ConfusionMatrixSynced: # -> TrainModelUploading -> TrainModelUploaded
  await self._perform_state('upload_model', TrainerState.TrainModelUploading, TrainerState.TrainModelUploaded, self._upload_model)
  elif tstate == TrainerState.TrainModelUploaded: # -> Detecting -> Detected
@@ -298,6 +306,7 @@ class TrainerLogicGeneric(ABC):
  logger.error('Node Restart Requested')
  sys.exit(0)
  except Exception as e:
+ print('Error in %s - Exception: %s', state_during, e, flush=True)
  self.errors.set(error_key, str(e))
  logger.exception('Error in %s - Exception: %s', state_during, e)
  self.training.training_state = previous_state
@@ -316,19 +325,25 @@ class TrainerLogicGeneric(ABC):
  self.node.data_exchanger.set_context(self.training.context)
  downloader = TrainingsDownloader(self.node.data_exchanger)
  image_data, skipped_image_count = await downloader.download_training_data(self.training.images_folder)
- assert self.training.data is not None, 'training.data must be set'
- self.training.data.image_data = image_data
- self.training.data.skipped_image_count = skipped_image_count
+
+ self.training.image_data = image_data
+ self.training.skipped_image_count = skipped_image_count

  async def _download_model(self) -> None:
  """If training is continued, the model is downloaded from the Learning Loop to the training_folder.
  The downloaded model.json file is renamed to base_model.json because a new model.json will be created during training.
  """
- base_model_uuid = self.training.base_model_uuid_or_name
+ base_model_uuid = self.training.base_model_uuid
+ base_model_uuid_is_none = base_model_uuid is None
+ base_model_uuid_is_valid = is_valid_uuid4(base_model_uuid)
+
+ if not base_model_uuid_is_none and not base_model_uuid_is_valid:
+ logger.warning(
+ 'base model uuid was provided but was not valid (base_model_uuid: %s).\nSkipping download and starting training from scratch.', base_model_uuid)
+ return

- # TODO this checks if we continue a training -> make more explicit
- if not base_model_uuid or not is_valid_uuid4(base_model_uuid):
- logger.info('skipping model download. No base model provided (in form of uuid): %s', base_model_uuid)
+ if base_model_uuid_is_none:
+ logger.info('No base model provided (base_model_uuid: %s).\nStarting training from scratch.', base_model_uuid)
  return
 
  logger.info('loading model from Learning Loop')
@@ -337,19 +352,21 @@ class TrainerLogicGeneric(ABC):
  shutil.move(f'{self.training.training_folder}/model.json',
  f'{self.training.training_folder}/base_model.json')

- async def _sync_confusion_matrix(self) -> None:
- """Syncronizes the confusion matrix with the Learning Loop via the update_training endpoint.
+ async def _sync_training(self) -> None:
+ """Syncronizes the training with the Learning Loop via the update_training endpoint.
  NOTE: This stage sets the errors explicitly because it may be used inside the training stage.
  """
  error_key = 'sync_confusion_matrix'
  try:
  new_best_model = self._get_new_best_training_state()
- if new_best_model and self.training.data:
+ if new_best_model:
  new_training = TrainingOut(trainer_id=self.node.uuid,
+ trainer_name=self.node.name,
  confusion_matrix=new_best_model.confusion_matrix,
- train_image_count=self.training.data.train_image_count(),
- test_image_count=self.training.data.test_image_count(),
- hyperparameters=self.hyperparameters_for_state_sync)
+ train_image_count=self.training.train_image_count(),
+ test_image_count=self.training.test_image_count(),
+ hyperparameters=self.hyperparameters_for_state_sync,
+ best_epoch=new_best_model.epoch)
  await asyncio.sleep(0.1) # NOTE needed for tests.

  result = await self.node.sio_client.call('update_training', (
@@ -411,7 +428,7 @@ class TrainerLogicGeneric(ABC):
  def _dump_categories_to_json(self) -> str:
  """Dumps the categories to a json file and returns the path to the file.
  """
- content = {'categories': [asdict(c) for c in self.training_data.categories], } if self.training_data else None
+ content = {'categories': [asdict(c) for c in self._training.categories], } if self._training else None
  json_path = '/tmp/model.json'
  with open(json_path, 'w') as f:
  json.dump(content, f)
@@ -481,12 +498,13 @@ class TrainerLogicGeneric(ABC):

  @abstractmethod
  def _get_new_best_training_state(self) -> Optional[TrainingStateData]:
- """Is called frequently by `_sync_confusion_matrix` to check if a new "best" model is availabe.
+ """Is called frequently by `_sync_training` during training to check if a new "best" model is availabe.
  Returns None if no new model could be found. Otherwise TrainingStateData(confusion_matrix, meta_information).
  `confusion_matrix` contains a dict of all classes:
  - The classes must be identified by their uuid, not their name.
  - For each class a dict with tp, fp, fn is provided (true positives, false positives, false negatives).
  `meta_information` can hold any data which is helpful for self._on_metrics_published to store weight file etc for later upload via self.get_model_files
+ `epoch` is the epoch number of the best model.
  """
  raise NotImplementedError

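As a rough illustration of the contract described in this docstring, a return value could look like the sketch below; the class uuids and numbers are placeholders, and the keyword-style construction is an assumption rather than the exact `TrainingStateData` signature:

    # placeholder values; field names follow the docstring above
    TrainingStateData(
        confusion_matrix={
            'c1a2b3d4-placeholder-uuid-of-class-1': {'tp': 42, 'fp': 3, 'fn': 5},
            'e5f6a7b8-placeholder-uuid-of-class-2': {'tp': 17, 'fp': 1, 'fn': 2},
        },
        meta_information={'weightfile': '/tmp/best.pt'},  # anything _on_metrics_published may need later
        epoch=12,                                         # epoch number of the best model (new in 0.12.0)
    )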
@@ -7,7 +7,6 @@ from typing import Dict, Optional
  from fastapi.encoders import jsonable_encoder
  from socketio import AsyncClient, exceptions

- from ..data_classes import TrainingStatus
  from ..node import Node
  from .io_helpers import LastTrainingIO
  from .rest import backdoor_controls
@@ -23,14 +22,15 @@ class TrainerNode(Node):
  self.last_training_io = LastTrainingIO(self.uuid)
  self.trainer_logic._last_training_io = self.last_training_io

- self.first_idle_time: float | None = None
+ self._first_idle_time: float | None = None
  if os.environ.get('TRAINER_IDLE_TIMEOUT_SEC', 0.0):
- self.idle_timeout = float(os.environ.get('TRAINER_IDLE_TIMEOUT_SEC', 0.0))
+ self._idle_timeout = float(os.environ.get('TRAINER_IDLE_TIMEOUT_SEC', 0.0))
  else:
- self.idle_timeout = 0.0
- if self.idle_timeout:
+ self._idle_timeout = 0.0
+ if self._idle_timeout:
  self.log.info(
- f'Trainer started with an idle_timeout of {self.idle_timeout} seconds. Note that shutdown does not work if docker container has the restart policy set to always')
+ 'Trainer started with an idle_timeout of %s seconds. Note that shutdown does not work if docker container has the restart policy set to always',
+ self._idle_timeout)

  if use_backdoor_controls or os.environ.get('USE_BACKDOOR_CONTROLS', '0').lower() in ('1', 'true'):
  self.include_router(backdoor_controls.router, tags=["controls"])
@@ -53,8 +53,8 @@ class TrainerNode(Node):
  except exceptions.TimeoutError:
  self.log.warning('timeout when sending status to learning loop, reconnecting sio_client')
  await self.sio_client.disconnect() # NOTE: reconnect happens in node._on_repeat
- except Exception as e:
- self.log.exception(f'could not send status state: {e}')
+ except Exception:
+ self.log.exception('could not send status. Exception:')

  # ---------------------------------------------- NODE METHODS ---------------------------------------------------

@@ -68,7 +68,7 @@ class TrainerNode(Node):

  @sio_client.event
  async def stop_training():
- self.log.info(f'stop_training received. Current state : {self.status.state}')
+ self.log.info('stop_training received. Current state : %s', self.trainer_logic.state)
  try:
  await self.trainer_logic.stop()
  except Exception:
@@ -80,24 +80,7 @@ class TrainerNode(Node):
  self.log.debug('cannot send status - not connected to the Learning Loop')
  return

- status = TrainingStatus(id=self.uuid,
- name=self.name,
- state=self.trainer_logic.state,
- errors={},
- uptime=self.trainer_logic.training_uptime,
- progress=self.trainer_logic.general_progress)
-
- status.pretrained_models = self.trainer_logic.provided_pretrained_models
- status.architecture = self.trainer_logic.model_architecture
-
- if data := self.trainer_logic.training_data:
- status.train_image_count = data.train_image_count()
- status.test_image_count = data.test_image_count()
- status.skipped_image_count = data.skipped_image_count
- status.hyperparameters = self.trainer_logic.hyperparameters_for_state_sync
- status.errors = self.trainer_logic.errors.errors
- status.context = self.trainer_logic.training_context
-
+ status = self.trainer_logic.generate_status_for_loop(self.uuid, self.name)
  self.log.debug('sending status: %s', status.short_str())
  result = await self.sio_client.call('update_trainer', jsonable_encoder(asdict(status)), timeout=30)
  if isinstance(result, Dict) and not result['success']:
@@ -105,17 +88,17 @@ class TrainerNode(Node):
  self.log.error('Error when sending status update: Response from loop was:\n %s', result)

  def check_idle_timeout(self):
- if not self.idle_timeout:
+ if not self._idle_timeout:
  return

  if self.trainer_logic.state == 'idle':
- if self.first_idle_time is None:
- self.first_idle_time = time.time()
- idle_time = time.time() - self.first_idle_time
- if idle_time > self.idle_timeout:
+ if self._first_idle_time is None:
+ self._first_idle_time = time.time()
+ idle_time = time.time() - self._first_idle_time
+ if idle_time > self._idle_timeout:
  self.log.info('Trainer has been idle for %.2f s (with timeout %.2f s). Shutting down.',
- idle_time, self.idle_timeout)
+ idle_time, self._idle_timeout)
  sys.exit(0)
- self.log.debug('idle time: %.2f s / %.2f s', idle_time, self.idle_timeout)
+ self.log.debug('idle time: %.2f s / %.2f s', idle_time, self._idle_timeout)
  else:
- self.first_idle_time = None
+ self._first_idle_time = None
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: learning-loop-node
- Version: 0.11.0
+ Version: 0.12.0
  Summary: Python Library for Nodes which connect to the Zauberzeug Learning Loop
  Home-page: https://github.com/zauberzeug/learning_loop_node
  License: MIT