learning-loop-node 0.10.6__py3-none-any.whl → 0.10.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

This version of learning-loop-node might be problematic.

@@ -14,6 +14,7 @@ import aiofiles # type: ignore
  from .data_classes import Context
  from .helpers.misc import create_resource_paths, create_task, is_valid_image
  from .loop_communication import LoopCommunicator
+ from .trainer.exceptions import CriticalError


  class DownloadError(Exception):
@@ -159,13 +160,17 @@ class DataExchanger():
  logging.info(f'Downloaded model {model_uuid}({model_format}) to {target_folder}.')
  return created_files

- async def upload_model_get_uuid(self, context: Context, files: List[str], training_number: Optional[int], mformat: str) -> Optional[str]:
- """Used by the trainers. Function returns the new model uuid to use for detection."""
+ async def upload_model_get_uuid(self, context: Context, files: List[str], training_number: Optional[int], mformat: str) -> str:
+ """Used by the trainers. Function returns the new model uuid to use for detection.
+
+ :return: The new model uuid.
+ :raise CriticalError: If the upload does not return status code 200.
+ """
  response = await self.loop_communicator.put(f'/{context.organization}/projects/{context.project}/trainings/{training_number}/models/latest/{mformat}/file', files=files)
  if response.status_code != 200:
  logging.error(f'Could not upload model for training {training_number}, format {mformat}: {response.text}')
- response.raise_for_status()
- return None
+ raise CriticalError(
+ f'Could not upload model for training {training_number}, format {mformat}: {response.text}')

  uploaded_model = response.json()
  logging.info(f'Uploaded model for training {training_number}, format {mformat}. Response is: {uploaded_model}')
@@ -27,6 +27,7 @@ from .rest import about as rest_about
  from .rest import backdoor_controls
  from .rest import detect as rest_detect
  from .rest import operation_mode as rest_mode
+ from .rest import outbox_mode as rest_outbox_mode
  from .rest import upload as rest_upload
  from .rest.operation_mode import OperationMode

@@ -57,6 +58,7 @@ class DetectorNode(Node):
  self.include_router(rest_upload.router, prefix="")
  self.include_router(rest_mode.router, tags=["operation_mode"])
  self.include_router(rest_about.router, tags=["about"])
+ self.include_router(rest_outbox_mode.router, tags=["outbox_mode"])

  if use_backdoor_controls:
  self.include_router(backdoor_controls.router)
@@ -89,7 +91,7 @@ class DetectorNode(Node):

  async def on_startup(self) -> None:
  try:
- self.outbox.start_continuous_upload()
+ self.outbox.ensure_continuous_upload()
  self.detector_logic.load_model()
  except Exception:
  self.log.exception("error during 'startup'")
@@ -97,7 +99,7 @@ class DetectorNode(Node):

  async def on_shutdown(self) -> None:
  try:
- self.outbox.stop_continuous_upload()
+ self.outbox.ensure_continuous_upload_stopped()
  for sid in self.connected_clients:
  # pylint: disable=no-member
  await self.sio.disconnect(sid) # type:ignore
@@ -5,7 +5,9 @@ import shutil
  import time
  from dataclasses import asdict
  from datetime import datetime
+ from enum import Enum
  from glob import glob
+ from io import BufferedReader, TextIOWrapper
  from multiprocessing import Event
  from multiprocessing.synchronize import Event as SyncEvent
  from threading import Thread
@@ -19,13 +21,18 @@ from ..globals import GLOBALS
  from ..helpers import environment_reader


- class Outbox():
+ class OutboxMode(Enum):
+ CONTINUOUS_UPLOAD = 'continuous_upload'
+ STOPPED = 'stopped'
+

+ class Outbox():
  def __init__(self) -> None:
  self.log = logging.getLogger()
  self.path = f'{GLOBALS.data_folder}/outbox'
  os.makedirs(self.path, exist_ok=True)

+ self.log = logging.getLogger()
  host = environment_reader.host()
  o = environment_reader.organization()
  p = environment_reader.project()
@@ -34,9 +41,12 @@ class Outbox():
  base_url = f'http{"s" if "learning-loop.ai" in host else ""}://{host}/api'
  base: str = base_url
  self.target_uri = f'{base}/{o}/projects/{p}/images'
- self.log.info(f'Outbox initialized with target_uri: {self.target_uri}')
+ self.log.info('Outbox initialized with target_uri: %s', self.target_uri)
+
+ self.BATCH_SIZE = 20
+ self.UPLOAD_TIMEOUT_S = 30

- self.shutdown_event: Optional[SyncEvent] = None
+ self.shutdown_event: SyncEvent = Event()
  self.upload_process: Optional[Thread] = None

  def save(self, image: bytes, detections: Optional[Detections] = None, tags: Optional[List[str]] = None) -> None:
@@ -59,59 +69,117 @@ class Outbox():
  if os.path.exists(tmp):
  os.rename(tmp, self.path + '/' + identifier) # NOTE rename is atomic so upload can run in parallel
  else:
- self.log.error(f'Could not rename {tmp} to {self.path}/{identifier}')
+ self.log.error('Could not rename %s to %s', tmp, self.path + '/' + identifier)

  def get_data_files(self):
  return glob(f'{self.path}/*')

- def start_continuous_upload(self):
- self.shutdown_event = Event()
- self.upload_process = Thread(target=self._continuous_upload)
+ def ensure_continuous_upload(self):
+ self.log.debug('start_continuous_upload')
+ if self._upload_process_alive():
+ self.log.debug('Upload thread already running')
+ return
+
+ self.shutdown_event.clear()
+ self.upload_process = Thread(target=self._continuous_upload, name='OutboxUpload')
  self.upload_process.start()

  def _continuous_upload(self):
- self.log.info('start continuous upload')
+ self.log.info('continuous upload started')
  assert self.shutdown_event is not None
  while not self.shutdown_event.is_set():
  self.upload()
- time.sleep(1)
- self.log.info('stop continuous upload')
+ time.sleep(5)
+ self.log.info('continuous upload ended')

  def upload(self):
  items = self.get_data_files()
  if items:
- self.log.info(f'Found {len(items)} images to upload')
- for item in items:
- if self.shutdown_event and self.shutdown_event.is_set():
- break
- try:
- data = [('files', open(f'{item}/image.json', 'r')),
- ('files', open(f'{item}/image.jpg', 'rb'))]
-
- response = requests.post(self.target_uri, files=data, timeout=30)
- if response.status_code == 200:
- shutil.rmtree(item)
- self.log.info(f'uploaded {item} successfully')
- elif response.status_code == 422:
- self.log.error(f'Broken content in {item}: dropping this data')
- shutil.rmtree(item)
- else:
- self.log.error(f'Could not upload {item}: {response.status_code}')
- except Exception:
- self.log.exception('could not upload files')
-
- def stop_continuous_upload(self, timeout=5):
+ self.log.info('Found %s images to upload', len(items))
+ for i in range(0, len(items), self.BATCH_SIZE):
+ batch_items = items[i:i+self.BATCH_SIZE]
+ if self.shutdown_event.is_set():
+ break
+ try:
+ self._upload_batch(batch_items)
+ except Exception:
+ self.log.exception('Could not upload files')
+ else:
+ self.log.info('No images found to upload')
+
+ def _upload_batch(self, items: List[str]):
+ data: List[tuple[str, TextIOWrapper | BufferedReader]] = []
+ data = [('files', open(f'{item}/image.json', 'r')) for item in items]
+ data += [('files', open(f'{item}/image.jpg', 'rb')) for item in items]
+
+ response = requests.post(self.target_uri, files=data, timeout=self.UPLOAD_TIMEOUT_S)
+ if response.status_code == 200:
+ for item in items:
+ shutil.rmtree(item, ignore_errors=True)
+ self.log.info('Uploaded %s images successfully', len(items))
+ elif response.status_code == 422:
+ if len(items) == 1:
+ self.log.error('Broken content in image: %s\n Skipping.', items[0])
+ shutil.rmtree(items[0], ignore_errors=True)
+ return
+
+ self.log.exception('Broken content in batch. Splitting and retrying')
+ self._upload_batch(items[:len(items)//2])
+ self._upload_batch(items[len(items)//2:])
+ else:
+ self.log.error('Could not upload images: %s', response.content)
+
+ def ensure_continuous_upload_stopped(self) -> bool:
+ self.log.debug('Outbox: Ensuring continuous upload')
+ if not self._upload_process_alive():
+ self.log.debug('Upload thread already stopped')
+ return True
  proc = self.upload_process
  if not proc:
- return
+ return True

  try:
  assert self.shutdown_event is not None
  self.shutdown_event.set()
  assert proc is not None
- proc.join(timeout)
+ proc.join(self.UPLOAD_TIMEOUT_S + 1)
  except Exception:
- logging.exception('error while shutting down upload thread')
+ self.log.exception('Error while shutting down upload thread: ')

  if proc.is_alive():
- self.log.error('upload thread did not terminate')
+ self.log.error('Upload thread did not terminate')
+ return False
+
+ self.log.info('Upload thread terminated')
+ return True
+
+ def _upload_process_alive(self) -> bool:
+ return bool(self.upload_process and self.upload_process.is_alive())
+
+ def get_mode(self) -> OutboxMode:
+ ''':return: current mode ('continuous_upload' or 'stopped')'''
+ if self.upload_process and self.upload_process.is_alive():
+ current_mode = OutboxMode.CONTINUOUS_UPLOAD
+ else:
+ current_mode = OutboxMode.STOPPED
+
+ self.log.debug('Outbox: Current mode is %s', current_mode)
+ return current_mode
+
+ def set_mode(self, mode: OutboxMode | str):
+ ''':param mode: 'continuous_upload' or 'stopped'
+ :raises ValueError: if mode is not a valid OutboxMode
+ :raises TimeoutError: if the upload thread does not terminate within 31 seconds with mode='stopped'
+ '''
+ if isinstance(mode, str):
+ mode = OutboxMode(mode)
+
+ if mode == OutboxMode.CONTINUOUS_UPLOAD:
+ self.ensure_continuous_upload()
+ elif mode == OutboxMode.STOPPED:
+ try:
+ self.ensure_continuous_upload_stopped()
+ except TimeoutError as e:
+ raise TimeoutError(f'Upload thread did not terminate within {self.UPLOAD_TIMEOUT_S} seconds.') from e
+
+ self.log.debug('set outbox mode to %s', mode)
@@ -0,0 +1,35 @@
+ from fastapi import APIRouter, HTTPException, Request
+ from fastapi.responses import PlainTextResponse
+
+ from ..outbox import Outbox
+
+ router = APIRouter()
+
+
+ @router.get("/outbox_mode")
+ async def get_outbox_mode(request: Request):
+ '''
+ Example Usage
+ curl http://localhost/outbox_mode
+ '''
+ outbox: Outbox = request.app.outbox
+ return PlainTextResponse(outbox.get_mode().value)
+
+
+ @router.put("/outbox_mode")
+ async def put_outbox_mode(request: Request):
+ '''
+ Example Usage
+ curl -X PUT -d "continuous_upload" http://localhost/outbox_mode
+ curl -X PUT -d "stopped" http://localhost/outbox_mode
+ '''
+ outbox: Outbox = request.app.outbox
+ content = str(await request.body(), 'utf-8')
+ try:
+ outbox.set_mode(content)
+ except TimeoutError as e:
+ raise HTTPException(202, 'Setting has not completed, yet: ' + str(e)) from e
+ except ValueError as e:
+ raise HTTPException(422, 'Could not set outbox mode: ' + str(e)) from e
+
+ return "OK"
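The new router above exposes the outbox mode as a plain-text resource. As a minimal sketch of how a client might drive it from Python (not code from the package; the host and port are assumptions, use whatever address your detector node is reachable at):

```python
import requests

BASE_URL = 'http://localhost:8004'  # assumed address of a running detector node

# Query the current outbox mode: 'continuous_upload' or 'stopped'.
current = requests.get(f'{BASE_URL}/outbox_mode', timeout=30)
print(current.text)

# Ask the node to stop the background upload thread.
response = requests.put(f'{BASE_URL}/outbox_mode', data='stopped', timeout=30)
response.raise_for_status()
```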
@@ -1,7 +1,10 @@
- from typing import List
+ from typing import TYPE_CHECKING, List

  from fastapi import APIRouter, File, Request, UploadFile

+ if TYPE_CHECKING:
+ from ..detector_node import DetectorNode
+
  router = APIRouter()


@@ -13,5 +16,6 @@ async def upload_image(request: Request, files: List[UploadFile] = File(...)):
  curl -X POST -F 'files=@test.jpg' "http://localhost:/upload"
  """
  raw_files = [await file.read() for file in files]
- await request.app.upload_images(raw_files)
+ node: DetectorNode = request.app
+ await node.upload_images(raw_files)
  return 200, "OK"
@@ -102,3 +102,19 @@ async def test_about_endpoint(test_detector_node: DetectorNode):
  assert response_dict['state'] == 'online'
  assert response_dict['target_model'] == '1.1'
  assert any(c.name == 'purple point' for c in model_information.categories)
+
+
+ async def test_rest_outbox_mode(test_detector_node: DetectorNode):
+ await asyncio.sleep(3)
+
+ def check_switch_to_mode(mode: str):
+ response = requests.put(f'http://localhost:{GLOBALS.detector_port}/outbox_mode',
+ data=mode, timeout=30)
+ assert response.status_code == 200
+ response = requests.get(f'http://localhost:{GLOBALS.detector_port}/outbox_mode', timeout=30)
+ assert response.status_code == 200
+ assert response.content == mode.encode()
+
+ check_switch_to_mode('stopped')
+ check_switch_to_mode('continuous_upload')
+ check_switch_to_mode('stopped')
@@ -1,5 +1,6 @@
  import os
  import shutil
+ from time import sleep

  import numpy as np
  import pytest
@@ -21,6 +22,7 @@ def test_outbox():
  os.mkdir(test_outbox.path)

  yield test_outbox
+ test_outbox.set_mode('stopped')
  shutil.rmtree(test_outbox.path, ignore_errors=True)


@@ -52,11 +54,7 @@ def test_saving_opencv_image(test_outbox: Outbox):

  def test_saving_binary(test_outbox: Outbox):
  assert len(test_outbox.get_data_files()) == 0
- img = Image.new('RGB', (60, 30), color=(73, 109, 137))
- img.save('/tmp/image.jpg')
- with open('/tmp/image.jpg', 'rb') as f:
- data = f.read()
- test_outbox.save(data)
+ save_test_image_to_outbox(test_outbox)
  assert len(test_outbox.get_data_files()) == 1


@@ -66,3 +64,23 @@ async def test_files_are_automatically_uploaded(test_detector_node: DetectorNode
  assert len(test_detector_node.outbox.get_data_files()) == 1

  assert len(test_detector_node.outbox.get_data_files()) == 1
+
+
+ def test_set_outbox_mode(test_outbox: Outbox):
+ test_outbox.set_mode('stopped')
+ save_test_image_to_outbox(outbox=test_outbox)
+ sleep(6)
+ assert len(test_outbox.get_data_files()) == 1, 'File was cleared even though outbox should be stopped'
+ test_outbox.set_mode('continuous_upload')
+ sleep(6)
+ assert len(test_outbox.get_data_files()) == 0, 'File was not cleared even though outbox should be in continuous_upload'
+
+ ### Helper functions ###
+
+
+ def save_test_image_to_outbox(outbox: Outbox):
+ img = Image.new('RGB', (60, 30), color=(73, 109, 137))
+ img.save('/tmp/image.jpg')
+ with open('/tmp/image.jpg', 'rb') as f:
+ data = f.read()
+ outbox.save(data)
@@ -0,0 +1,2 @@
+ class CriticalError(Exception):
+ pass
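This new exception is what lets lower-level helpers abort a training state without further retries. As a rough illustration of the pattern (the helper below is hypothetical, not code from the package):

```python
from learning_loop_node.trainer.exceptions import CriticalError


def parse_model_uuid(response_json: dict) -> str:
    """Hypothetical helper: give up immediately if the server response is unusable."""
    uuid = response_json.get('uuid')
    if not uuid:
        # Retrying with the same broken response is pointless, so escalate.
        raise CriticalError(f'Response contained no model uuid: {response_json}')
    return uuid


try:
    model_uuid = parse_model_uuid({'detail': 'upload failed'})
except CriticalError as e:
    print(f'Aborting training state: {e}')
```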
@@ -20,7 +20,7 @@ async def test_successful_detecting(test_initialized_trainer: TestingTrainerLogi
  model_uuid_for_detecting='00000000-0000-0000-0000-000000000011') # NOTE: this is the hard coded model uuid for zauberzeug/demo (model version 1.1)

  _ = asyncio.get_running_loop().create_task(
- trainer._perform_state('do_detections', TrainerState.Detecting, TrainerState.Detected, trainer._do_detections))
+ trainer._perform_state('detecting', TrainerState.Detecting, TrainerState.Detected, trainer._do_detections))

  await assert_training_state(trainer.training, TrainerState.Detecting, timeout=1, interval=0.001)
  await assert_training_state(trainer.training, TrainerState.Detected, timeout=10, interval=0.001)
@@ -54,7 +54,7 @@ async def test_abort_upload_model(test_initialized_trainer: TestingTrainerLogic)
  async def test_bad_server_response_content(test_initialized_trainer: TestingTrainerLogic):
  """Set the training state to confusion_matrix_synced and try to upload the model.
  This should fail because the server response is not a valid model id.
- The training should be aborted and the training state should be set to confusion_matrix_synced."""
+ The training should be aborted and the training state should be set to ready_for_cleanup."""
  trainer = test_initialized_trainer

  create_active_training_file(trainer, training_state=TrainerState.ConfusionMatrixSynced)
@@ -64,10 +64,10 @@ async def test_bad_server_response_content(test_initialized_trainer: TestingTrai

  await assert_training_state(trainer.training, TrainerState.TrainModelUploading, timeout=1, interval=0.001)
  # TODO goes to finished because of the error
- await assert_training_state(trainer.training, TrainerState.ConfusionMatrixSynced, timeout=2, interval=0.001)
+ await assert_training_state(trainer.training, TrainerState.ReadyForCleanup, timeout=2, interval=0.001)

  assert trainer_has_error(trainer)
- assert trainer.training.training_state == TrainerState.ConfusionMatrixSynced
+ assert trainer.training.training_state == TrainerState.ReadyForCleanup
  assert trainer.training.model_uuid_for_detecting is None
  assert trainer.node.last_training_io.load() == trainer.training

@@ -81,8 +81,7 @@ async def test_mock_loop_response_example(mocker: MockerFixture, test_initialize
  trainer._init_from_last_training()

  # pylint: disable=protected-access
- result = await trainer._upload_model_return_new_model_uuid(Context(organization='zauberzeug', project='demo'))
- assert result is not None
+ await trainer._upload_model_return_new_model_uuid(Context(organization='zauberzeug', project='demo'))


  def mock_upload_model_for_training(mocker, return_value):
@@ -59,7 +59,7 @@ class TestingTrainerLogic(TrainerLogic):
  await super()._upload_model()
  await asyncio.sleep(0.1) # give tests a bit time to to check for the state

- async def _upload_model_return_new_model_uuid(self, context: Context) -> Optional[str]:
+ async def _upload_model_return_new_model_uuid(self, context: Context) -> str:
  await asyncio.sleep(0.1) # give tests a bit time to to check for the state
  result = await super()._upload_model_return_new_model_uuid(context)
  await asyncio.sleep(0.1) # give tests a bit time to to check for the state
@@ -14,11 +14,14 @@ from ..data_classes import (Context, Errors, Hyperparameter, PretrainedModel, Tr
  TrainingOut, TrainingStateData)
  from ..helpers.misc import create_project_folder, delete_all_training_folders, generate_training, is_valid_uuid4
  from .downloader import TrainingsDownloader
+ from .exceptions import CriticalError
  from .io_helpers import ActiveTrainingIO, EnvironmentVars, LastTrainingIO

  if TYPE_CHECKING:
  from .trainer_node import TrainerNode

+ logger = logging.getLogger('learning_loop_node.trainer_logic_generic')
+

  class TrainerLogicGeneric(ABC):

@@ -175,7 +178,7 @@ class TrainerLogicGeneric(ABC):
  """
  if not self.training_active and self.last_training_io.exists():
  self._init_from_last_training()
- logging.info('found incomplete training, continuing now.')
+ logger.info('found incomplete training, continuing now.')
  asyncio.get_event_loop().create_task(self._run())
  return True
  return False
@@ -207,7 +210,7 @@ class TrainerLogicGeneric(ABC):

  self._active_training_io = ActiveTrainingIO(
  self._training.training_folder, self.node.loop_communicator, context)
- logging.info(f'new training initialized: {self._training}')
+ logger.info(f'new training initialized: {self._training}')

  async def _run(self) -> None:
  """Called on `begin_training` event from the Learning Loop.
@@ -219,18 +222,21 @@ class TrainerLogicGeneric(ABC):
  await self.training_task # NOTE: Task object is used to potentially cancel the task
  except asyncio.CancelledError:
  if not self.shutdown_event.is_set():
- logging.info('training task was cancelled but not by shutdown event')
+ logger.info('CancelledError in _run - training task was cancelled but not by shutdown event')
  self.training.training_state = TrainerState.ReadyForCleanup
  self.last_training_io.save(self.training)
  await self._clear_training()
+ self._may_restart()
+ else:
+ logger.info('CancelledError in _run - shutting down')
  except Exception as e:
- logging.exception(f'Error in train: {e}')
+ logger.exception(f'Error in train: {e}')

  # ---------------------------------------- TRAINING STATES ----------------------------------------

  async def _training_loop(self) -> None:
  """Cycle through the training states until the training is finished or
- an asyncio.CancelledError is raised.
+ a critical error occurs (asyncio.CancelledError or CriticalError).
  """
  assert self.training_active

@@ -252,13 +258,20 @@ class TrainerLogicGeneric(ABC):
  await self._perform_state('detecting', TrainerState.Detecting, TrainerState.Detected, self._do_detections)
  elif tstate == TrainerState.Detected: # -> DetectionUploading -> ReadyForCleanup
  await self._perform_state('upload_detections', TrainerState.DetectionUploading, TrainerState.ReadyForCleanup, self.active_training_io.upload_detetions)
- elif tstate == TrainerState.ReadyForCleanup: # -> RESTART or TrainingFinished
+ elif tstate == TrainerState.ReadyForCleanup: # -> Idle (RESTART or _training = None)
  await self._clear_training()
  self._may_restart()

  async def _perform_state(self, error_key: str, state_during: TrainerState, state_after: TrainerState, action: Callable[[], Coroutine], reset_early=False):
+ '''
+ Perform a training state and handle errors.
+ - If the loop sends a StopTraining event, this will raise a CancelledError.
+ - States can raise a CriticalError indicating that there is no point in retrying the state.
+ - If any other error occurs, the error is stored in the errors object and the state is reset to the previous state.
+ '''
+
  await asyncio.sleep(0.1)
- logging.info(f'Performing state: {state_during}')
+ logger.info(f'Performing state: {state_during}')
  previous_state = self.training.training_state
  self.training.training_state = state_during
  await asyncio.sleep(0.1)
@@ -266,21 +279,30 @@ class TrainerLogicGeneric(ABC):
  self.errors.reset(error_key)

  try:
- if await action():
- logging.error('Something went really bad.. cleaning up')
- state_after = TrainerState.ReadyForCleanup
+ await action()
+
  except asyncio.CancelledError:
- logging.warning(f'CancelledError in {state_during}')
- raise
+ if self.shutdown_event.is_set():
+ logger.info(f'CancelledError in {state_during} - shutdown event set')
+ raise
+ logger.info(f'CancelledError in {state_during} - cleaning up')
+ self.training.training_state = TrainerState.ReadyForCleanup
+ except CriticalError as e:
+ logger.error(f'CriticalError in {state_during} - Exception: {e}')
+ self.errors.set(error_key, str(e))
+ self.training.training_state = TrainerState.ReadyForCleanup
  except Exception as e:
  self.errors.set(error_key, str(e))
- logging.exception(f'Error in {state_during} - Exception:')
+ logger.exception('Error in %s - Exception: %s', state_during, e)
  self.training.training_state = previous_state
+ return
  else:
+ logger.info(f'Successfully finished state: {state_during}')
  if not reset_early:
  self.errors.reset(error_key)
  self.training.training_state = state_after
- self.last_training_io.save(self.training)
+
+ self.last_training_io.save(self.training)

  async def _prepare(self) -> None:
  """Downloads images to the images_folder and saves annotations to training.data.image_data.
@@ -300,11 +322,11 @@ class TrainerLogicGeneric(ABC):

  # TODO this checks if we continue a training -> make more explicit
  if not base_model_uuid or not is_valid_uuid4(base_model_uuid):
- logging.info(f'skipping model download. No base model provided (in form of uuid): {base_model_uuid}')
+ logger.info(f'skipping model download. No base model provided (in form of uuid): {base_model_uuid}')
  return

- logging.info('loading model from Learning Loop')
- logging.info(f'downloading model {base_model_uuid} as {self.model_format}')
+ logger.info('loading model from Learning Loop')
+ logger.info(f'downloading model {base_model_uuid} as {self.model_format}')
  await self.node.data_exchanger.download_model(self.training.training_folder, self.training.context, base_model_uuid, self.model_format)
  shutil.move(f'{self.training.training_folder}/model.json',
  f'{self.training.training_folder}/base_model.json')
@@ -327,12 +349,12 @@ class TrainerLogicGeneric(ABC):
  result = await self.node.sio_client.call('update_training', (
  self.training.context.organization, self.training.context.project, jsonable_encoder(new_training)))
  if isinstance(result, dict) and result['success']:
- logging.info(f'successfully updated training {asdict(new_training)}')
+ logger.info(f'successfully updated training {asdict(new_training)}')
  self._on_metrics_published(new_best_model)
  else:
  raise Exception(f'Error for update_training: Response from loop was : {result}')
  except Exception as e:
- logging.exception('Error during confusion matrix syncronization')
+ logger.exception('Error during confusion matrix syncronization')
  self.errors.set(error_key, str(e))
  raise
  self.errors.reset(error_key)
@@ -341,21 +363,22 @@ class TrainerLogicGeneric(ABC):
  """Uploads the latest model to the Learning Loop.
  """
  new_model_uuid = await self._upload_model_return_new_model_uuid(self.training.context)
- if new_model_uuid is None:
- self.training.training_state = TrainerState.ReadyForCleanup
- logging.error('could not upload model - maybe training failed.. cleaning up')
- logging.info(f'Successfully uploaded model and received new model id: {new_model_uuid}')
+ logger.info(f'Successfully uploaded model and received new model id: {new_model_uuid}')
  self.training.model_uuid_for_detecting = new_model_uuid

- async def _upload_model_return_new_model_uuid(self, context: Context) -> Optional[str]:
+ async def _upload_model_return_new_model_uuid(self, context: Context) -> str:
  """Upload model files, usually pytorch model (.pt) hyp.yaml and the converted .wts file.
  Note that with the latest trainers the conversion to (.wts) is done by the trainer.
  The conversion from .wts to .engine is done by the detector (needs to be done on target hardware).
- Note that trainer may train with different classes, which is why we send an initial model.json file."""
+ Note that trainer may train with different classes, which is why we send an initial model.json file.
+
+ :return: The new model UUID.
+ :raise CriticalError: If the latest model files cannot be obtained.
+ """

  files = await self._get_latest_model_files()
  if files is None:
- return None
+ raise CriticalError('Could not get latest model files. Training might have failed.')

  if isinstance(files, List):
  files = {self.model_format: files}
@@ -369,8 +392,6 @@ class TrainerLogicGeneric(ABC):
  assert len([f for f in _files if 'model.json' in f]) == 1, "model.json must be included exactly once"

  model_uuid = await self.node.data_exchanger.upload_model_get_uuid(context, _files, self.training.training_number, file_format)
- if model_uuid is None:
- return None

  already_uploaded_formats.append(file_format)
  self.active_training_io.save_model_upload_progress(already_uploaded_formats)
@@ -411,23 +432,23 @@ class TrainerLogicGeneric(ABC):
  if not self.training_active:
  return
  if self.training_task:
- logging.info('cancelling training task')
+ logger.info('cancelling training task')
  if self.training_task.cancel():
  try:
  await self.training_task
  except asyncio.CancelledError:
  pass
- logging.info('cancelled training task')
+ logger.info('cancelled training task')
  self._may_restart()

  def _may_restart(self) -> None:
  """If the environment variable RESTART_AFTER_TRAINING is set, the trainer will restart after a training.
  """
  if self._environment_vars.restart_after_training:
- logging.info('restarting')
+ logger.info('restarting')
  sys.exit(0)
  else:
- logging.info('not restarting')
+ logger.info('not restarting')
  # ---------------------------------------- ABSTRACT METHODS ----------------------------------------

  @abstractmethod
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: learning-loop-node
- Version: 0.10.6
+ Version: 0.10.7
  Summary: Python Library for Nodes which connect to the Zauberzeug Learning Loop
  Home-page: https://github.com/zauberzeug/learning_loop_node
  License: MIT
@@ -81,6 +81,8 @@ from learning_loop_node/learning_loop_node

  Detector Nodes are normally deployed on edge devices like robots or machinery but can also run in the cloud to provide backend services for an app or similar. These nodes register themself at the Learning Loop. They provide REST and Socket.io APIs to run inference on images. The processed images can automatically be used for active learning: e.g. uncertain predictions will be send to the Learning Loop.

+ ### Running Inference
+
  Images can be send to the detector node via socketio or rest.
  The later approach can be used via curl,

@@ -102,6 +104,26 @@ The detector also has a sio **upload endpoint** that can be used to upload image

  The endpoint returns None if the upload was successful and an error message otherwise.

+ ### Changing the outbox mode
+
+ If the autoupload is set to `all` or `filtered` (selected) images and the corresponding detections are saved on HDD (the outbox). A background thread will upload the images and detections to the Learning Loop. The outbox is located in the `outbox` folder in the root directory of the node. The outbox can be cleared by deleting the files in the folder.
+
+ The continuous upload can be stopped/started via a REST enpoint:
+
+ Example Usage:
+
+ - Enable upload: `curl -X PUT -d "continuous_upload" http://localhost/outbox_mode`
+ - Disable upload: `curl -X PUT -d "stopped" http://localhost/outbox_mode`
+
+ The current state can be queried via a GET request:
+ `curl http://localhost/outbox_mode`
+
+ ### Explicit upload
+
+ The detector has a REST endpoint to upload images (and detections) to the Learning Loop. The endpoint takes a POST request with the image and optionally the detections. The image is expected to be in jpg format. The detections are expected to be a json dictionary. Example:
+
+ `curl -X POST -F 'files=@test.jpg' "http://localhost:/upload"`
+
  ## Trainer Node

  Trainers fetch the images and anntoations from the Learning Loop to train new models.
@@ -10,28 +10,29 @@ learning_loop_node/data_classes/detections.py,sha256=1BcU5PNzIbryWcj2xJ6ysLBTBwG
  learning_loop_node/data_classes/general.py,sha256=44GJrJvGfPwDUmRsS7If9uSlE6KPP50LGUX91VzesLw,4664
  learning_loop_node/data_classes/socket_response.py,sha256=tIdt-oYf6ULoJIDYQCecNM9OtWR6_wJ9tL0Ksu83Vko,655
  learning_loop_node/data_classes/training.py,sha256=hnMHZMk-WNRERyo7U97qL09v1tIdhnzPfTH-JgifLwU,6164
- learning_loop_node/data_exchanger.py,sha256=hxF0zANA35f5EV8tkQ4yjelrKuvafMaKUya0CCjVrK0,8221
+ learning_loop_node/data_exchanger.py,sha256=BTrXwjNkG9KgtUxil_ijMggql8sZDKXQm26xdKQr8_0,8459
  learning_loop_node/detector/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  learning_loop_node/detector/detector_logic.py,sha256=se0jRFbV7BfTvCuCI3gcUllSYIZ5dxTkvdISe6pPTRg,1660
- learning_loop_node/detector/detector_node.py,sha256=BStenBbtRuvuDxg6lscxvfz_lV_Am5MAAr0bRbTP50o,16553
+ learning_loop_node/detector/detector_node.py,sha256=ggnjv0lbSpgIAda-omUYiMAUAlktaWzGClH_iZ4Axd4,16689
  learning_loop_node/detector/inbox_filter/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  learning_loop_node/detector/inbox_filter/cam_observation_history.py,sha256=TD346I9ymtIP0_CJXCIKMRuiXbfVVanXNu_iHAwDd7Q,3318
  learning_loop_node/detector/inbox_filter/relevance_filter.py,sha256=s2FuwZ-tD_5obkSutstjc8pE_hLGbrv9WjrEO9t8rJ8,1011
  learning_loop_node/detector/inbox_filter/tests/test_observation.py,sha256=ORN08yjprqmgmtU25RsVysniyrWX-qGvqFN8ZkkYxow,1385
  learning_loop_node/detector/inbox_filter/tests/test_relevance_group.py,sha256=RUgsk1CnKSOCRZBzNjE7AZTqk06-yelgUqvHFRLH7_I,7865
  learning_loop_node/detector/inbox_filter/tests/test_unexpected_observations_count.py,sha256=y_dFUV21h6uZc90Q43s0u4oivJfuCNWlk5iXAWiXGgc,1804
- learning_loop_node/detector/outbox.py,sha256=kxOzIhffTbrBCvZqGQsrwDS68zWJe00mN12O_qNwwVI,4176
+ learning_loop_node/detector/outbox.py,sha256=o0Tq_iZg8dLIL-1o1tzNNDZks9Sovx4lBQKJvTnN09M,6884
  learning_loop_node/detector/rest/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  learning_loop_node/detector/rest/about.py,sha256=-PNqlQI_tzRvoSI_UR9rX8-5GeiENNpRDQ4Ylw3wYVs,607
  learning_loop_node/detector/rest/backdoor_controls.py,sha256=38axRG66Z3_Q6bYKa7Hw-ldChEAu-dJcBM_Sl_17Ozo,1725
  learning_loop_node/detector/rest/detect.py,sha256=8Rl1swANKgHc42P1z75t_PErQxpCKKPdAsKqDIZgdNU,1873
  learning_loop_node/detector/rest/operation_mode.py,sha256=eIo6_56qyZECftf4AEN8wJMABIojC0TRazvWeg0Uj_s,1664
- learning_loop_node/detector/rest/upload.py,sha256=MifNhban7GeaCjwa39lDhTQWyRuVyvyGFGscoszplH0,435
+ learning_loop_node/detector/rest/outbox_mode.py,sha256=N62NOb9cJZCkZTVC6iJVr6hX9slLR7Aym0WwTYinl7A,1012
+ learning_loop_node/detector/rest/upload.py,sha256=IPzxJPayD7_Gx5uYC1lVJwWxdnQgM8MYGa5NugXVosY,544
  learning_loop_node/detector/tests/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  learning_loop_node/detector/tests/conftest.py,sha256=v08N5_jvQyeqQkOzkTaanCwwVd75Rf-tc1M2-fgiv54,3254
  learning_loop_node/detector/tests/test.jpg,sha256=msA-vHPmvPiro_D102Qmn1fn4vNfooqYYEXPxZUmYpk,161390
- learning_loop_node/detector/tests/test_client_communication.py,sha256=QjoES3qMqcsfZgNHI825_bfvjdxxX8NNncCRKIeV9Wo,4603
- learning_loop_node/detector/tests/test_outbox.py,sha256=bXvbxBVSF2O0M3V9gSDEqjMQ1g12_hwckq7rziG3i1c,2051
+ learning_loop_node/detector/tests/test_client_communication.py,sha256=pgmRvn6FunXQgijgYNkJAz7X3Hv8f52YbK4XF9EfVKk,5240
+ learning_loop_node/detector/tests/test_outbox.py,sha256=_d-RdBBAadAW56O7oP5b2mYQ_D3u36T5QjzkcmFx24c,2661
  learning_loop_node/detector/tests/test_relevance_filter.py,sha256=FzeOU6k17VIQvAHR8fjHbcPeAE7D7C-2Yxol0lDrMEM,1981
  learning_loop_node/detector/tests/testing_detector.py,sha256=2DSwIYJDOG4ixOGU8OxjsZQgaOdVU7_d3ASKsSkf8qc,564
  learning_loop_node/examples/novelty_score_updater.py,sha256=1DRgM9lxjFV-q2JvGDDsNLz_ic_rhEZ9wc6ZdjcxwPE,2038
@@ -57,6 +58,7 @@ learning_loop_node/tests/test_helper.py,sha256=AjOrTu3dHIlJLYI0mxcNx8MCmFF6IjLhH
  learning_loop_node/tests/test_learning_loop_node.py,sha256=4qWi1ovBzebUAbvw8ecSa-TBGKYuJvlKe2AMnMZ-Qs8,701
  learning_loop_node/trainer/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  learning_loop_node/trainer/downloader.py,sha256=qzx7zzObcFEvRVQFe8gi8KJNIapASi1_XssbspXD1Rw,1469
+ learning_loop_node/trainer/exceptions.py,sha256=hLLDGncC6PLZjKg4lZBpu-QA8itQIxiuxExz1uptgnw,40
  learning_loop_node/trainer/executor.py,sha256=-0BxDqmAI1NCiISi7Rw8McJQfgxxVy1gSa1epYuL3U0,3942
  learning_loop_node/trainer/io_helpers.py,sha256=Ylxz8HAId0Jlz95So5kXdJEp1yKQuwroDKIhbTUscF4,7257
  learning_loop_node/trainer/rest/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -67,19 +69,19 @@ learning_loop_node/trainer/tests/conftest.py,sha256=qUmcHPme19AD6K6sQektX63iZecd
  learning_loop_node/trainer/tests/state_helper.py,sha256=igoGqTBqcqqFcDng2i7ctC67bYR1hLPDl4G_mNRG6r8,934
  learning_loop_node/trainer/tests/states/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  learning_loop_node/trainer/tests/states/test_state_cleanup.py,sha256=tiL31hSjg1Bl2obzg2ufAVpil5qW0YkpDCpSHPqXQrk,1312
- learning_loop_node/trainer/tests/states/test_state_detecting.py,sha256=heJS9KO9GzIgFiKMvrNWPrboujY6hcR-KjztZ6fivxk,3833
+ learning_loop_node/trainer/tests/states/test_state_detecting.py,sha256=8DSCurMPNuCq1zJ3rC-UA5-IoEcvWGdivHGmXCiakdo,3829
  learning_loop_node/trainer/tests/states/test_state_download_train_model.py,sha256=rNRXIyqyHzHz4fXY1Lsf7WKg8FFVFYfFPIevMCBBcCY,2940
  learning_loop_node/trainer/tests/states/test_state_prepare.py,sha256=fx9_bgPTaR5ANVB8n_hW8dXcaJIh_iKEnInmhzamZ9E,2432
  learning_loop_node/trainer/tests/states/test_state_sync_confusion_matrix.py,sha256=zfNbHB3GFSJXXoEkW-8PYtmX62md3feWp4oisyzs8A4,4773
  learning_loop_node/trainer/tests/states/test_state_train.py,sha256=j1vedjH2EwLTgHhon6eR9ttp-Sw9ozR9-9QgAKlFO-M,3248
  learning_loop_node/trainer/tests/states/test_state_upload_detections.py,sha256=u31gC0-Z2EVTnia1dyY2yNGDGAeyIXPfObBgrEWHhVQ,7674
- learning_loop_node/trainer/tests/states/test_state_upload_model.py,sha256=gAXJJcoxj2EVdFJcPAeqxGAjeL1a4ofLtytnM4joi-Q,3808
+ learning_loop_node/trainer/tests/states/test_state_upload_model.py,sha256=lWjIqjWBpWppjgX5U9yw_EZw8Zl2BD0cjuZYoMk4ccQ,3751
  learning_loop_node/trainer/tests/test_errors.py,sha256=8H-kjs9kEBoHWcQVJIZvW5zcwCs1VQI5Tf5I0VSbCUA,2245
  learning_loop_node/trainer/tests/test_trainer_states.py,sha256=OBrClH6srAM2hqqel2xTtfHCeTKYZlG_S4KO2G2GrS4,1147
- learning_loop_node/trainer/tests/testing_trainer_logic.py,sha256=7sQ6okiOhM4IhvRRo4XvLPjxnBrqFu9SPbuDX2LLwRs,3925
+ learning_loop_node/trainer/tests/testing_trainer_logic.py,sha256=eKvCRznWNzHctBEmgiSVemTcLDfCvy80IuhmxkFpvvI,3915
  learning_loop_node/trainer/trainer_logic.py,sha256=PJxiO1chPdvpq8UTtzv_nVam9CouCswX9b1FnRwT2Tw,8411
- learning_loop_node/trainer/trainer_logic_generic.py,sha256=KFDuxgzrGITHQaJoGvhjHxWzhbb4Q7HBxSpks4CeGBg,24801
+ learning_loop_node/trainer/trainer_logic_generic.py,sha256=AzllMMiUPP_CMkjIVqse8wY50Cg5RDnk5y5ERVUjtZg,25801
  learning_loop_node/trainer/trainer_node.py,sha256=bcyOMeLXrLuLgsPqS8lwEOSZ6vCjGLgT0pLXgaylI1Q,4155
- learning_loop_node-0.10.6.dist-info/METADATA,sha256=fxypCq0VtpapGOz_5Ao8NhcAtN-Xp-1aQb-qGWWXM7M,9287
- learning_loop_node-0.10.6.dist-info/WHEEL,sha256=WGfLGfLX43Ei_YORXSnT54hxFygu34kMpcQdmgmEwCQ,88
- learning_loop_node-0.10.6.dist-info/RECORD,,
+ learning_loop_node-0.10.7.dist-info/METADATA,sha256=87Mgc03RgU_2S_OQvjaSt06MAjqcgsR1s3vXYu_bzJ8,10383
+ learning_loop_node-0.10.7.dist-info/WHEEL,sha256=WGfLGfLX43Ei_YORXSnT54hxFygu34kMpcQdmgmEwCQ,88
+ learning_loop_node-0.10.7.dist-info/RECORD,,