learning-loop-node 0.10.6__tar.gz → 0.10.7__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of learning-loop-node might be problematic.
- {learning_loop_node-0.10.6 → learning_loop_node-0.10.7}/PKG-INFO +23 -1
- {learning_loop_node-0.10.6 → learning_loop_node-0.10.7}/README.md +22 -0
- {learning_loop_node-0.10.6 → learning_loop_node-0.10.7}/learning_loop_node/data_exchanger.py +9 -4
- {learning_loop_node-0.10.6 → learning_loop_node-0.10.7}/learning_loop_node/detector/detector_node.py +4 -2
- learning_loop_node-0.10.7/learning_loop_node/detector/outbox.py +185 -0
- learning_loop_node-0.10.7/learning_loop_node/detector/rest/outbox_mode.py +35 -0
- {learning_loop_node-0.10.6 → learning_loop_node-0.10.7}/learning_loop_node/detector/rest/upload.py +6 -2
- {learning_loop_node-0.10.6 → learning_loop_node-0.10.7}/learning_loop_node/detector/tests/test_client_communication.py +16 -0
- {learning_loop_node-0.10.6 → learning_loop_node-0.10.7}/learning_loop_node/detector/tests/test_outbox.py +23 -5
- learning_loop_node-0.10.7/learning_loop_node/trainer/exceptions.py +2 -0
- {learning_loop_node-0.10.6 → learning_loop_node-0.10.7}/learning_loop_node/trainer/tests/states/test_state_detecting.py +1 -1
- {learning_loop_node-0.10.6 → learning_loop_node-0.10.7}/learning_loop_node/trainer/tests/states/test_state_upload_model.py +4 -5
- {learning_loop_node-0.10.6 → learning_loop_node-0.10.7}/learning_loop_node/trainer/tests/testing_trainer_logic.py +1 -1
- {learning_loop_node-0.10.6 → learning_loop_node-0.10.7}/learning_loop_node/trainer/trainer_logic_generic.py +53 -32
- {learning_loop_node-0.10.6 → learning_loop_node-0.10.7}/pyproject.toml +1 -1
- learning_loop_node-0.10.6/learning_loop_node/detector/outbox.py +0 -117
- {learning_loop_node-0.10.6 → learning_loop_node-0.10.7}/learning_loop_node/__init__.py +0 -0
- {learning_loop_node-0.10.6 → learning_loop_node-0.10.7}/learning_loop_node/annotation/__init__.py +0 -0
- {learning_loop_node-0.10.6 → learning_loop_node-0.10.7}/learning_loop_node/annotation/annotator_logic.py +0 -0
- {learning_loop_node-0.10.6 → learning_loop_node-0.10.7}/learning_loop_node/annotation/annotator_node.py +0 -0
- {learning_loop_node-0.10.6 → learning_loop_node-0.10.7}/learning_loop_node/annotation/tests/test_annotator_node.py +0 -0
- {learning_loop_node-0.10.6 → learning_loop_node-0.10.7}/learning_loop_node/conftest.py +0 -0
- {learning_loop_node-0.10.6 → learning_loop_node-0.10.7}/learning_loop_node/data_classes/__init__.py +0 -0
- {learning_loop_node-0.10.6 → learning_loop_node-0.10.7}/learning_loop_node/data_classes/annotations.py +0 -0
- {learning_loop_node-0.10.6 → learning_loop_node-0.10.7}/learning_loop_node/data_classes/detections.py +0 -0
- {learning_loop_node-0.10.6 → learning_loop_node-0.10.7}/learning_loop_node/data_classes/general.py +0 -0
- {learning_loop_node-0.10.6 → learning_loop_node-0.10.7}/learning_loop_node/data_classes/socket_response.py +0 -0
- {learning_loop_node-0.10.6 → learning_loop_node-0.10.7}/learning_loop_node/data_classes/training.py +0 -0
- {learning_loop_node-0.10.6 → learning_loop_node-0.10.7}/learning_loop_node/detector/__init__.py +0 -0
- {learning_loop_node-0.10.6 → learning_loop_node-0.10.7}/learning_loop_node/detector/detector_logic.py +0 -0
- {learning_loop_node-0.10.6 → learning_loop_node-0.10.7}/learning_loop_node/detector/inbox_filter/__init__.py +0 -0
- {learning_loop_node-0.10.6 → learning_loop_node-0.10.7}/learning_loop_node/detector/inbox_filter/cam_observation_history.py +0 -0
- {learning_loop_node-0.10.6 → learning_loop_node-0.10.7}/learning_loop_node/detector/inbox_filter/relevance_filter.py +0 -0
- {learning_loop_node-0.10.6 → learning_loop_node-0.10.7}/learning_loop_node/detector/inbox_filter/tests/test_observation.py +0 -0
- {learning_loop_node-0.10.6 → learning_loop_node-0.10.7}/learning_loop_node/detector/inbox_filter/tests/test_relevance_group.py +0 -0
- {learning_loop_node-0.10.6 → learning_loop_node-0.10.7}/learning_loop_node/detector/inbox_filter/tests/test_unexpected_observations_count.py +0 -0
- {learning_loop_node-0.10.6 → learning_loop_node-0.10.7}/learning_loop_node/detector/rest/__init__.py +0 -0
- {learning_loop_node-0.10.6 → learning_loop_node-0.10.7}/learning_loop_node/detector/rest/about.py +0 -0
- {learning_loop_node-0.10.6 → learning_loop_node-0.10.7}/learning_loop_node/detector/rest/backdoor_controls.py +0 -0
- {learning_loop_node-0.10.6 → learning_loop_node-0.10.7}/learning_loop_node/detector/rest/detect.py +0 -0
- {learning_loop_node-0.10.6 → learning_loop_node-0.10.7}/learning_loop_node/detector/rest/operation_mode.py +0 -0
- {learning_loop_node-0.10.6 → learning_loop_node-0.10.7}/learning_loop_node/detector/tests/__init__.py +0 -0
- {learning_loop_node-0.10.6 → learning_loop_node-0.10.7}/learning_loop_node/detector/tests/conftest.py +0 -0
- {learning_loop_node-0.10.6 → learning_loop_node-0.10.7}/learning_loop_node/detector/tests/test.jpg +0 -0
- {learning_loop_node-0.10.6 → learning_loop_node-0.10.7}/learning_loop_node/detector/tests/test_relevance_filter.py +0 -0
- {learning_loop_node-0.10.6 → learning_loop_node-0.10.7}/learning_loop_node/detector/tests/testing_detector.py +0 -0
- {learning_loop_node-0.10.6 → learning_loop_node-0.10.7}/learning_loop_node/examples/novelty_score_updater.py +0 -0
- {learning_loop_node-0.10.6 → learning_loop_node-0.10.7}/learning_loop_node/globals.py +0 -0
- {learning_loop_node-0.10.6 → learning_loop_node-0.10.7}/learning_loop_node/helpers/__init__.py +0 -0
- {learning_loop_node-0.10.6 → learning_loop_node-0.10.7}/learning_loop_node/helpers/environment_reader.py +0 -0
- {learning_loop_node-0.10.6 → learning_loop_node-0.10.7}/learning_loop_node/helpers/gdrive_downloader.py +0 -0
- {learning_loop_node-0.10.6 → learning_loop_node-0.10.7}/learning_loop_node/helpers/log_conf.py +0 -0
- {learning_loop_node-0.10.6 → learning_loop_node-0.10.7}/learning_loop_node/helpers/misc.py +0 -0
- {learning_loop_node-0.10.6 → learning_loop_node-0.10.7}/learning_loop_node/loop_communication.py +0 -0
- {learning_loop_node-0.10.6 → learning_loop_node-0.10.7}/learning_loop_node/node.py +0 -0
- {learning_loop_node-0.10.6 → learning_loop_node-0.10.7}/learning_loop_node/py.typed +0 -0
- {learning_loop_node-0.10.6 → learning_loop_node-0.10.7}/learning_loop_node/pytest.ini +0 -0
- {learning_loop_node-0.10.6 → learning_loop_node-0.10.7}/learning_loop_node/tests/__init__.py +0 -0
- {learning_loop_node-0.10.6 → learning_loop_node-0.10.7}/learning_loop_node/tests/conftest.py +0 -0
- {learning_loop_node-0.10.6 → learning_loop_node-0.10.7}/learning_loop_node/tests/test_data/file_1.txt +0 -0
- {learning_loop_node-0.10.6 → learning_loop_node-0.10.7}/learning_loop_node/tests/test_data/file_2.txt +0 -0
- {learning_loop_node-0.10.6 → learning_loop_node-0.10.7}/learning_loop_node/tests/test_data/model.json +0 -0
- {learning_loop_node-0.10.6 → learning_loop_node-0.10.7}/learning_loop_node/tests/test_data_classes.py +0 -0
- {learning_loop_node-0.10.6 → learning_loop_node-0.10.7}/learning_loop_node/tests/test_downloader.py +0 -0
- {learning_loop_node-0.10.6 → learning_loop_node-0.10.7}/learning_loop_node/tests/test_executor.py +0 -0
- {learning_loop_node-0.10.6 → learning_loop_node-0.10.7}/learning_loop_node/tests/test_helper.py +0 -0
- {learning_loop_node-0.10.6 → learning_loop_node-0.10.7}/learning_loop_node/tests/test_learning_loop_node.py +0 -0
- {learning_loop_node-0.10.6 → learning_loop_node-0.10.7}/learning_loop_node/trainer/__init__.py +0 -0
- {learning_loop_node-0.10.6 → learning_loop_node-0.10.7}/learning_loop_node/trainer/downloader.py +0 -0
- {learning_loop_node-0.10.6 → learning_loop_node-0.10.7}/learning_loop_node/trainer/executor.py +0 -0
- {learning_loop_node-0.10.6 → learning_loop_node-0.10.7}/learning_loop_node/trainer/io_helpers.py +0 -0
- {learning_loop_node-0.10.6 → learning_loop_node-0.10.7}/learning_loop_node/trainer/rest/__init__.py +0 -0
- {learning_loop_node-0.10.6 → learning_loop_node-0.10.7}/learning_loop_node/trainer/rest/backdoor_controls.py +0 -0
- {learning_loop_node-0.10.6 → learning_loop_node-0.10.7}/learning_loop_node/trainer/rest/controls.py +0 -0
- {learning_loop_node-0.10.6 → learning_loop_node-0.10.7}/learning_loop_node/trainer/tests/__init__.py +0 -0
- {learning_loop_node-0.10.6 → learning_loop_node-0.10.7}/learning_loop_node/trainer/tests/conftest.py +0 -0
- {learning_loop_node-0.10.6 → learning_loop_node-0.10.7}/learning_loop_node/trainer/tests/state_helper.py +0 -0
- {learning_loop_node-0.10.6 → learning_loop_node-0.10.7}/learning_loop_node/trainer/tests/states/__init__.py +0 -0
- {learning_loop_node-0.10.6 → learning_loop_node-0.10.7}/learning_loop_node/trainer/tests/states/test_state_cleanup.py +0 -0
- {learning_loop_node-0.10.6 → learning_loop_node-0.10.7}/learning_loop_node/trainer/tests/states/test_state_download_train_model.py +0 -0
- {learning_loop_node-0.10.6 → learning_loop_node-0.10.7}/learning_loop_node/trainer/tests/states/test_state_prepare.py +0 -0
- {learning_loop_node-0.10.6 → learning_loop_node-0.10.7}/learning_loop_node/trainer/tests/states/test_state_sync_confusion_matrix.py +0 -0
- {learning_loop_node-0.10.6 → learning_loop_node-0.10.7}/learning_loop_node/trainer/tests/states/test_state_train.py +0 -0
- {learning_loop_node-0.10.6 → learning_loop_node-0.10.7}/learning_loop_node/trainer/tests/states/test_state_upload_detections.py +0 -0
- {learning_loop_node-0.10.6 → learning_loop_node-0.10.7}/learning_loop_node/trainer/tests/test_errors.py +0 -0
- {learning_loop_node-0.10.6 → learning_loop_node-0.10.7}/learning_loop_node/trainer/tests/test_trainer_states.py +0 -0
- {learning_loop_node-0.10.6 → learning_loop_node-0.10.7}/learning_loop_node/trainer/trainer_logic.py +0 -0
- {learning_loop_node-0.10.6 → learning_loop_node-0.10.7}/learning_loop_node/trainer/trainer_node.py +0 -0
{learning_loop_node-0.10.6 → learning_loop_node-0.10.7}/PKG-INFO
RENAMED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: learning-loop-node
-Version: 0.10.6
+Version: 0.10.7
 Summary: Python Library for Nodes which connect to the Zauberzeug Learning Loop
 Home-page: https://github.com/zauberzeug/learning_loop_node
 License: MIT
@@ -81,6 +81,8 @@ from learning_loop_node/learning_loop_node

 Detector Nodes are normally deployed on edge devices like robots or machinery but can also run in the cloud to provide backend services for an app or similar. These nodes register themself at the Learning Loop. They provide REST and Socket.io APIs to run inference on images. The processed images can automatically be used for active learning: e.g. uncertain predictions will be send to the Learning Loop.

+### Running Inference
+
 Images can be send to the detector node via socketio or rest.
 The later approach can be used via curl,

@@ -102,6 +104,26 @@ The detector also has a sio **upload endpoint** that can be used to upload image

 The endpoint returns None if the upload was successful and an error message otherwise.

+### Changing the outbox mode
+
+If the autoupload is set to `all` or `filtered` (selected) images and the corresponding detections are saved on HDD (the outbox). A background thread will upload the images and detections to the Learning Loop. The outbox is located in the `outbox` folder in the root directory of the node. The outbox can be cleared by deleting the files in the folder.
+
+The continuous upload can be stopped/started via a REST enpoint:
+
+Example Usage:
+
+- Enable upload: `curl -X PUT -d "continuous_upload" http://localhost/outbox_mode`
+- Disable upload: `curl -X PUT -d "stopped" http://localhost/outbox_mode`
+
+The current state can be queried via a GET request:
+`curl http://localhost/outbox_mode`
+
+### Explicit upload
+
+The detector has a REST endpoint to upload images (and detections) to the Learning Loop. The endpoint takes a POST request with the image and optionally the detections. The image is expected to be in jpg format. The detections are expected to be a json dictionary. Example:
+
+`curl -X POST -F 'files=@test.jpg' "http://localhost:/upload"`
+
 ## Trainer Node

 Trainers fetch the images and anntoations from the Learning Loop to train new models.
{learning_loop_node-0.10.6 → learning_loop_node-0.10.7}/README.md
RENAMED
@@ -41,6 +41,8 @@ from learning_loop_node/learning_loop_node

 Detector Nodes are normally deployed on edge devices like robots or machinery but can also run in the cloud to provide backend services for an app or similar. These nodes register themself at the Learning Loop. They provide REST and Socket.io APIs to run inference on images. The processed images can automatically be used for active learning: e.g. uncertain predictions will be send to the Learning Loop.

+### Running Inference
+
 Images can be send to the detector node via socketio or rest.
 The later approach can be used via curl,

@@ -62,6 +64,26 @@ The detector also has a sio **upload endpoint** that can be used to upload image

 The endpoint returns None if the upload was successful and an error message otherwise.

+### Changing the outbox mode
+
+If the autoupload is set to `all` or `filtered` (selected) images and the corresponding detections are saved on HDD (the outbox). A background thread will upload the images and detections to the Learning Loop. The outbox is located in the `outbox` folder in the root directory of the node. The outbox can be cleared by deleting the files in the folder.
+
+The continuous upload can be stopped/started via a REST enpoint:
+
+Example Usage:
+
+- Enable upload: `curl -X PUT -d "continuous_upload" http://localhost/outbox_mode`
+- Disable upload: `curl -X PUT -d "stopped" http://localhost/outbox_mode`
+
+The current state can be queried via a GET request:
+`curl http://localhost/outbox_mode`
+
+### Explicit upload
+
+The detector has a REST endpoint to upload images (and detections) to the Learning Loop. The endpoint takes a POST request with the image and optionally the detections. The image is expected to be in jpg format. The detections are expected to be a json dictionary. Example:
+
+`curl -X POST -F 'files=@test.jpg' "http://localhost:/upload"`
+
 ## Trainer Node

 Trainers fetch the images and anntoations from the Learning Loop to train new models.
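For readers of this diff, a minimal client-side sketch of the endpoints documented above (not part of the package; host, port and file name are placeholders to adapt to your detector node):

import requests

DETECTOR = 'http://localhost'  # adjust to the address of your detector node

# pause and resume the background upload of the outbox
requests.put(f'{DETECTOR}/outbox_mode', data='stopped', timeout=10)
requests.put(f'{DETECTOR}/outbox_mode', data='continuous_upload', timeout=10)

# query the current outbox mode ('continuous_upload' or 'stopped')
print(requests.get(f'{DETECTOR}/outbox_mode', timeout=10).text)

# explicitly upload an image so the node forwards it to the Learning Loop
with open('test.jpg', 'rb') as f:
    requests.post(f'{DETECTOR}/upload', files={'files': f}, timeout=30)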
{learning_loop_node-0.10.6 → learning_loop_node-0.10.7}/learning_loop_node/data_exchanger.py
RENAMED
@@ -14,6 +14,7 @@ import aiofiles  # type: ignore
 from .data_classes import Context
 from .helpers.misc import create_resource_paths, create_task, is_valid_image
 from .loop_communication import LoopCommunicator
+from .trainer.exceptions import CriticalError


 class DownloadError(Exception):
@@ -159,13 +160,17 @@ class DataExchanger():
         logging.info(f'Downloaded model {model_uuid}({model_format}) to {target_folder}.')
         return created_files

-    async def upload_model_get_uuid(self, context: Context, files: List[str], training_number: Optional[int], mformat: str) ->
-        """Used by the trainers. Function returns the new model uuid to use for detection.
+    async def upload_model_get_uuid(self, context: Context, files: List[str], training_number: Optional[int], mformat: str) -> str:
+        """Used by the trainers. Function returns the new model uuid to use for detection.
+
+        :return: The new model uuid.
+        :raise CriticalError: If the upload does not return status code 200.
+        """
         response = await self.loop_communicator.put(f'/{context.organization}/projects/{context.project}/trainings/{training_number}/models/latest/{mformat}/file', files=files)
         if response.status_code != 200:
             logging.error(f'Could not upload model for training {training_number}, format {mformat}: {response.text}')
-
-
+            raise CriticalError(
+                f'Could not upload model for training {training_number}, format {mformat}: {response.text}')

         uploaded_model = response.json()
         logging.info(f'Uploaded model for training {training_number}, format {mformat}. Response is: {uploaded_model}')
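The behavioural change here: a failed upload no longer falls through but raises CriticalError (imported from the new learning_loop_node/trainer/exceptions.py). A hedged sketch of how calling code might react; the wrapper below is illustrative and not part of the package:

from learning_loop_node.trainer.exceptions import CriticalError

async def upload_or_none(data_exchanger, context, files, training_number, mformat):
    """Illustrative wrapper: translate the new CriticalError back into an old-style None result."""
    try:
        return await data_exchanger.upload_model_get_uuid(context, files, training_number, mformat)
    except CriticalError:
        # the trainer treats this as non-retryable and moves the training to ReadyForCleanup
        return None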
{learning_loop_node-0.10.6 → learning_loop_node-0.10.7}/learning_loop_node/detector/detector_node.py
RENAMED
@@ -27,6 +27,7 @@ from .rest import about as rest_about
 from .rest import backdoor_controls
 from .rest import detect as rest_detect
 from .rest import operation_mode as rest_mode
+from .rest import outbox_mode as rest_outbox_mode
 from .rest import upload as rest_upload
 from .rest.operation_mode import OperationMode

@@ -57,6 +58,7 @@ class DetectorNode(Node):
         self.include_router(rest_upload.router, prefix="")
         self.include_router(rest_mode.router, tags=["operation_mode"])
         self.include_router(rest_about.router, tags=["about"])
+        self.include_router(rest_outbox_mode.router, tags=["outbox_mode"])

         if use_backdoor_controls:
             self.include_router(backdoor_controls.router)
@@ -89,7 +91,7 @@

     async def on_startup(self) -> None:
         try:
-            self.outbox.start_continuous_upload()
+            self.outbox.ensure_continuous_upload()
             self.detector_logic.load_model()
         except Exception:
             self.log.exception("error during 'startup'")
@@ -97,7 +99,7 @@

     async def on_shutdown(self) -> None:
         try:
-            self.outbox.stop_continuous_upload()
+            self.outbox.ensure_continuous_upload_stopped()
             for sid in self.connected_clients:
                 # pylint: disable=no-member
                 await self.sio.disconnect(sid)  # type:ignore
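The startup/shutdown hooks now call the idempotent ensure_* methods of the new Outbox. A generic, self-contained sketch of that start/stop pattern (illustrative names, not the package API):

import threading
import time


class BackgroundWorker:
    """Idempotent start/stop around a polling thread, similar in spirit to the outbox upload thread."""

    def __init__(self) -> None:
        self.stop_event = threading.Event()
        self.thread: threading.Thread | None = None

    def ensure_started(self, work, interval: float = 5.0) -> None:
        if self.thread and self.thread.is_alive():
            return  # already running; calling this twice is safe
        self.stop_event.clear()
        self.thread = threading.Thread(target=self._loop, args=(work, interval), daemon=True)
        self.thread.start()

    def _loop(self, work, interval: float) -> None:
        while not self.stop_event.is_set():
            work()
            time.sleep(interval)

    def ensure_stopped(self, timeout: float = 31.0) -> bool:
        if not (self.thread and self.thread.is_alive()):
            return True  # already stopped
        self.stop_event.set()
        self.thread.join(timeout)
        return not self.thread.is_alive()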
learning_loop_node-0.10.7/learning_loop_node/detector/outbox.py
@@ -0,0 +1,185 @@
+import json
+import logging
+import os
+import shutil
+import time
+from dataclasses import asdict
+from datetime import datetime
+from enum import Enum
+from glob import glob
+from io import BufferedReader, TextIOWrapper
+from multiprocessing import Event
+from multiprocessing.synchronize import Event as SyncEvent
+from threading import Thread
+from typing import List, Optional
+
+import requests
+from fastapi.encoders import jsonable_encoder
+
+from ..data_classes import Detections
+from ..globals import GLOBALS
+from ..helpers import environment_reader
+
+
+class OutboxMode(Enum):
+    CONTINUOUS_UPLOAD = 'continuous_upload'
+    STOPPED = 'stopped'
+
+
+class Outbox():
+    def __init__(self) -> None:
+        self.log = logging.getLogger()
+        self.path = f'{GLOBALS.data_folder}/outbox'
+        os.makedirs(self.path, exist_ok=True)
+
+        self.log = logging.getLogger()
+        host = environment_reader.host()
+        o = environment_reader.organization()
+        p = environment_reader.project()
+
+        assert o and p, 'Outbox needs an organization and a project '
+        base_url = f'http{"s" if "learning-loop.ai" in host else ""}://{host}/api'
+        base: str = base_url
+        self.target_uri = f'{base}/{o}/projects/{p}/images'
+        self.log.info('Outbox initialized with target_uri: %s', self.target_uri)
+
+        self.BATCH_SIZE = 20
+        self.UPLOAD_TIMEOUT_S = 30
+
+        self.shutdown_event: SyncEvent = Event()
+        self.upload_process: Optional[Thread] = None
+
+    def save(self, image: bytes, detections: Optional[Detections] = None, tags: Optional[List[str]] = None) -> None:
+        if detections is None:
+            detections = Detections()
+        if not tags:
+            tags = []
+        identifier = datetime.now().isoformat(sep='_', timespec='milliseconds')
+        tmp = f'{GLOBALS.data_folder}/tmp/{identifier}'
+        detections.tags = tags
+        detections.date = identifier
+        os.makedirs(tmp, exist_ok=True)
+
+        with open(tmp + '/image.json', 'w') as f:
+            json.dump(jsonable_encoder(asdict(detections)), f)
+
+        with open(tmp + '/image.jpg', 'wb') as f:
+            f.write(image)
+
+        if os.path.exists(tmp):
+            os.rename(tmp, self.path + '/' + identifier)  # NOTE rename is atomic so upload can run in parallel
+        else:
+            self.log.error('Could not rename %s to %s', tmp, self.path + '/' + identifier)
+
+    def get_data_files(self):
+        return glob(f'{self.path}/*')
+
+    def ensure_continuous_upload(self):
+        self.log.debug('start_continuous_upload')
+        if self._upload_process_alive():
+            self.log.debug('Upload thread already running')
+            return
+
+        self.shutdown_event.clear()
+        self.upload_process = Thread(target=self._continuous_upload, name='OutboxUpload')
+        self.upload_process.start()
+
+    def _continuous_upload(self):
+        self.log.info('continuous upload started')
+        assert self.shutdown_event is not None
+        while not self.shutdown_event.is_set():
+            self.upload()
+            time.sleep(5)
+        self.log.info('continuous upload ended')
+
+    def upload(self):
+        items = self.get_data_files()
+        if items:
+            self.log.info('Found %s images to upload', len(items))
+            for i in range(0, len(items), self.BATCH_SIZE):
+                batch_items = items[i:i+self.BATCH_SIZE]
+                if self.shutdown_event.is_set():
+                    break
+                try:
+                    self._upload_batch(batch_items)
+                except Exception:
+                    self.log.exception('Could not upload files')
+        else:
+            self.log.info('No images found to upload')
+
+    def _upload_batch(self, items: List[str]):
+        data: List[tuple[str, TextIOWrapper | BufferedReader]] = []
+        data = [('files', open(f'{item}/image.json', 'r')) for item in items]
+        data += [('files', open(f'{item}/image.jpg', 'rb')) for item in items]
+
+        response = requests.post(self.target_uri, files=data, timeout=self.UPLOAD_TIMEOUT_S)
+        if response.status_code == 200:
+            for item in items:
+                shutil.rmtree(item, ignore_errors=True)
+            self.log.info('Uploaded %s images successfully', len(items))
+        elif response.status_code == 422:
+            if len(items) == 1:
+                self.log.error('Broken content in image: %s\n Skipping.', items[0])
+                shutil.rmtree(items[0], ignore_errors=True)
+                return
+
+            self.log.exception('Broken content in batch. Splitting and retrying')
+            self._upload_batch(items[:len(items)//2])
+            self._upload_batch(items[len(items)//2:])
+        else:
+            self.log.error('Could not upload images: %s', response.content)
+
+    def ensure_continuous_upload_stopped(self) -> bool:
+        self.log.debug('Outbox: Ensuring continuous upload')
+        if not self._upload_process_alive():
+            self.log.debug('Upload thread already stopped')
+            return True
+        proc = self.upload_process
+        if not proc:
+            return True
+
+        try:
+            assert self.shutdown_event is not None
+            self.shutdown_event.set()
+            assert proc is not None
+            proc.join(self.UPLOAD_TIMEOUT_S + 1)
+        except Exception:
+            self.log.exception('Error while shutting down upload thread: ')
+
+        if proc.is_alive():
+            self.log.error('Upload thread did not terminate')
+            return False
+
+        self.log.info('Upload thread terminated')
+        return True
+
+    def _upload_process_alive(self) -> bool:
+        return bool(self.upload_process and self.upload_process.is_alive())
+
+    def get_mode(self) -> OutboxMode:
+        ''':return: current mode ('continuous_upload' or 'stopped')'''
+        if self.upload_process and self.upload_process.is_alive():
+            current_mode = OutboxMode.CONTINUOUS_UPLOAD
+        else:
+            current_mode = OutboxMode.STOPPED
+
+        self.log.debug('Outbox: Current mode is %s', current_mode)
+        return current_mode
+
+    def set_mode(self, mode: OutboxMode | str):
+        ''':param mode: 'continuous_upload' or 'stopped'
+        :raises ValueError: if mode is not a valid OutboxMode
+        :raises TimeoutError: if the upload thread does not terminate within 31 seconds with mode='stopped'
+        '''
+        if isinstance(mode, str):
+            mode = OutboxMode(mode)
+
+        if mode == OutboxMode.CONTINUOUS_UPLOAD:
+            self.ensure_continuous_upload()
+        elif mode == OutboxMode.STOPPED:
+            try:
+                self.ensure_continuous_upload_stopped()
+            except TimeoutError as e:
+                raise TimeoutError(f'Upload thread did not terminate within {self.UPLOAD_TIMEOUT_S} seconds.') from e
+
+        self.log.debug('set outbox mode to %s', mode)
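A standalone illustration of the 422 handling in _upload_batch above: a rejected batch is bisected until the single broken item is isolated and dropped, so one bad image cannot block the whole outbox. The post_batch callable is a stand-in for the HTTP call; this sketch is not part of the package:

from typing import Callable, List


def upload_with_bisection(items: List[str], post_batch: Callable[[List[str]], int]) -> None:
    """Recursively split a rejected batch; drop only the single item the server cannot process."""
    if not items:
        return
    status = post_batch(items)
    if status == 200:
        return  # whole batch accepted
    if status == 422:
        if len(items) == 1:
            print(f'dropping broken item {items[0]}')
            return
        mid = len(items) // 2
        upload_with_bisection(items[:mid], post_batch)
        upload_with_bisection(items[mid:], post_batch)
    else:
        print(f'upload failed with status {status}; keeping items for a later retry')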
learning_loop_node-0.10.7/learning_loop_node/detector/rest/outbox_mode.py
@@ -0,0 +1,35 @@
+from fastapi import APIRouter, HTTPException, Request
+from fastapi.responses import PlainTextResponse
+
+from ..outbox import Outbox
+
+router = APIRouter()
+
+
+@router.get("/outbox_mode")
+async def get_outbox_mode(request: Request):
+    '''
+    Example Usage
+        curl http://localhost/outbox_mode
+    '''
+    outbox: Outbox = request.app.outbox
+    return PlainTextResponse(outbox.get_mode().value)
+
+
+@router.put("/outbox_mode")
+async def put_outbox_mode(request: Request):
+    '''
+    Example Usage
+        curl -X PUT -d "continuous_upload" http://localhost/outbox_mode
+        curl -X PUT -d "stopped" http://localhost/outbox_mode
+    '''
+    outbox: Outbox = request.app.outbox
+    content = str(await request.body(), 'utf-8')
+    try:
+        outbox.set_mode(content)
+    except TimeoutError as e:
+        raise HTTPException(202, 'Setting has not completed, yet: ' + str(e)) from e
+    except ValueError as e:
+        raise HTTPException(422, 'Could not set outbox mode: ' + str(e)) from e
+
+    return "OK"
{learning_loop_node-0.10.6 → learning_loop_node-0.10.7}/learning_loop_node/detector/rest/upload.py
RENAMED
@@ -1,7 +1,10 @@
-from typing import List
+from typing import TYPE_CHECKING, List

 from fastapi import APIRouter, File, Request, UploadFile

+if TYPE_CHECKING:
+    from ..detector_node import DetectorNode
+
 router = APIRouter()


@@ -13,5 +16,6 @@ async def upload_image(request: Request, files: List[UploadFile] = File(...)):
     curl -X POST -F 'files=@test.jpg' "http://localhost:/upload"
     """
     raw_files = [await file.read() for file in files]
-
+    node: DetectorNode = request.app
+    await node.upload_images(raw_files)
     return 200, "OK"
{learning_loop_node-0.10.6 → learning_loop_node-0.10.7}/learning_loop_node/detector/tests/test_client_communication.py
RENAMED
@@ -102,3 +102,19 @@ async def test_about_endpoint(test_detector_node: DetectorNode):
     assert response_dict['state'] == 'online'
     assert response_dict['target_model'] == '1.1'
     assert any(c.name == 'purple point' for c in model_information.categories)
+
+
+async def test_rest_outbox_mode(test_detector_node: DetectorNode):
+    await asyncio.sleep(3)
+
+    def check_switch_to_mode(mode: str):
+        response = requests.put(f'http://localhost:{GLOBALS.detector_port}/outbox_mode',
+                                data=mode, timeout=30)
+        assert response.status_code == 200
+        response = requests.get(f'http://localhost:{GLOBALS.detector_port}/outbox_mode', timeout=30)
+        assert response.status_code == 200
+        assert response.content == mode.encode()
+
+    check_switch_to_mode('stopped')
+    check_switch_to_mode('continuous_upload')
+    check_switch_to_mode('stopped')
{learning_loop_node-0.10.6 → learning_loop_node-0.10.7}/learning_loop_node/detector/tests/test_outbox.py
RENAMED
@@ -1,5 +1,6 @@
 import os
 import shutil
+from time import sleep

 import numpy as np
 import pytest
@@ -21,6 +22,7 @@ def test_outbox():
     os.mkdir(test_outbox.path)

     yield test_outbox
+    test_outbox.set_mode('stopped')
     shutil.rmtree(test_outbox.path, ignore_errors=True)


@@ -52,11 +54,7 @@ def test_saving_opencv_image(test_outbox: Outbox):

 def test_saving_binary(test_outbox: Outbox):
     assert len(test_outbox.get_data_files()) == 0
-
-    img.save('/tmp/image.jpg')
-    with open('/tmp/image.jpg', 'rb') as f:
-        data = f.read()
-    test_outbox.save(data)
+    save_test_image_to_outbox(test_outbox)
     assert len(test_outbox.get_data_files()) == 1


@@ -66,3 +64,23 @@ async def test_files_are_automatically_uploaded(test_detector_node: DetectorNode
     assert len(test_detector_node.outbox.get_data_files()) == 1

     assert len(test_detector_node.outbox.get_data_files()) == 1
+
+
+def test_set_outbox_mode(test_outbox: Outbox):
+    test_outbox.set_mode('stopped')
+    save_test_image_to_outbox(outbox=test_outbox)
+    sleep(6)
+    assert len(test_outbox.get_data_files()) == 1, 'File was cleared even though outbox should be stopped'
+    test_outbox.set_mode('continuous_upload')
+    sleep(6)
+    assert len(test_outbox.get_data_files()) == 0, 'File was not cleared even though outbox should be in continuous_upload'
+
+### Helper functions ###
+
+
+def save_test_image_to_outbox(outbox: Outbox):
+    img = Image.new('RGB', (60, 30), color=(73, 109, 137))
+    img.save('/tmp/image.jpg')
+    with open('/tmp/image.jpg', 'rb') as f:
+        data = f.read()
+    outbox.save(data)
{learning_loop_node-0.10.6 → learning_loop_node-0.10.7}/learning_loop_node/trainer/tests/states/test_state_detecting.py
RENAMED
@@ -20,7 +20,7 @@ async def test_successful_detecting(test_initialized_trainer: TestingTrainerLogic):
         model_uuid_for_detecting='00000000-0000-0000-0000-000000000011')  # NOTE: this is the hard coded model uuid for zauberzeug/demo (model version 1.1)

     _ = asyncio.get_running_loop().create_task(
-        trainer._perform_state('
+        trainer._perform_state('detecting', TrainerState.Detecting, TrainerState.Detected, trainer._do_detections))

     await assert_training_state(trainer.training, TrainerState.Detecting, timeout=1, interval=0.001)
     await assert_training_state(trainer.training, TrainerState.Detected, timeout=10, interval=0.001)
|
|
|
54
54
|
async def test_bad_server_response_content(test_initialized_trainer: TestingTrainerLogic):
|
|
55
55
|
"""Set the training state to confusion_matrix_synced and try to upload the model.
|
|
56
56
|
This should fail because the server response is not a valid model id.
|
|
57
|
-
The training should be aborted and the training state should be set to
|
|
57
|
+
The training should be aborted and the training state should be set to ready_for_cleanup."""
|
|
58
58
|
trainer = test_initialized_trainer
|
|
59
59
|
|
|
60
60
|
create_active_training_file(trainer, training_state=TrainerState.ConfusionMatrixSynced)
|
|
@@ -64,10 +64,10 @@ async def test_bad_server_response_content(test_initialized_trainer: TestingTrai
|
|
|
64
64
|
|
|
65
65
|
await assert_training_state(trainer.training, TrainerState.TrainModelUploading, timeout=1, interval=0.001)
|
|
66
66
|
# TODO goes to finished because of the error
|
|
67
|
-
await assert_training_state(trainer.training, TrainerState.
|
|
67
|
+
await assert_training_state(trainer.training, TrainerState.ReadyForCleanup, timeout=2, interval=0.001)
|
|
68
68
|
|
|
69
69
|
assert trainer_has_error(trainer)
|
|
70
|
-
assert trainer.training.training_state == TrainerState.
|
|
70
|
+
assert trainer.training.training_state == TrainerState.ReadyForCleanup
|
|
71
71
|
assert trainer.training.model_uuid_for_detecting is None
|
|
72
72
|
assert trainer.node.last_training_io.load() == trainer.training
|
|
73
73
|
|
|
@@ -81,8 +81,7 @@ async def test_mock_loop_response_example(mocker: MockerFixture, test_initialize
|
|
|
81
81
|
trainer._init_from_last_training()
|
|
82
82
|
|
|
83
83
|
# pylint: disable=protected-access
|
|
84
|
-
|
|
85
|
-
assert result is not None
|
|
84
|
+
await trainer._upload_model_return_new_model_uuid(Context(organization='zauberzeug', project='demo'))
|
|
86
85
|
|
|
87
86
|
|
|
88
87
|
def mock_upload_model_for_training(mocker, return_value):
|
|
{learning_loop_node-0.10.6 → learning_loop_node-0.10.7}/learning_loop_node/trainer/tests/testing_trainer_logic.py
RENAMED
@@ -59,7 +59,7 @@ class TestingTrainerLogic(TrainerLogic):
         await super()._upload_model()
         await asyncio.sleep(0.1)  # give tests a bit time to to check for the state

-    async def _upload_model_return_new_model_uuid(self, context: Context) ->
+    async def _upload_model_return_new_model_uuid(self, context: Context) -> str:
         await asyncio.sleep(0.1)  # give tests a bit time to to check for the state
         result = await super()._upload_model_return_new_model_uuid(context)
         await asyncio.sleep(0.1)  # give tests a bit time to to check for the state
{learning_loop_node-0.10.6 → learning_loop_node-0.10.7}/learning_loop_node/trainer/trainer_logic_generic.py
RENAMED
@@ -14,11 +14,14 @@ from ..data_classes import (Context, Errors, Hyperparameter, PretrainedModel, TrainerState,
                             TrainingOut, TrainingStateData)
 from ..helpers.misc import create_project_folder, delete_all_training_folders, generate_training, is_valid_uuid4
 from .downloader import TrainingsDownloader
+from .exceptions import CriticalError
 from .io_helpers import ActiveTrainingIO, EnvironmentVars, LastTrainingIO

 if TYPE_CHECKING:
     from .trainer_node import TrainerNode

+logger = logging.getLogger('learning_loop_node.trainer_logic_generic')
+

 class TrainerLogicGeneric(ABC):

@@ -175,7 +178,7 @@ class TrainerLogicGeneric(ABC):
         """
         if not self.training_active and self.last_training_io.exists():
             self._init_from_last_training()
-
+            logger.info('found incomplete training, continuing now.')
             asyncio.get_event_loop().create_task(self._run())
             return True
         return False
@@ -207,7 +210,7 @@ class TrainerLogicGeneric(ABC):

         self._active_training_io = ActiveTrainingIO(
             self._training.training_folder, self.node.loop_communicator, context)
-
+        logger.info(f'new training initialized: {self._training}')

     async def _run(self) -> None:
         """Called on `begin_training` event from the Learning Loop.
@@ -219,18 +222,21 @@ class TrainerLogicGeneric(ABC):
             await self.training_task  # NOTE: Task object is used to potentially cancel the task
         except asyncio.CancelledError:
             if not self.shutdown_event.is_set():
-
+                logger.info('CancelledError in _run - training task was cancelled but not by shutdown event')
                 self.training.training_state = TrainerState.ReadyForCleanup
                 self.last_training_io.save(self.training)
                 await self._clear_training()
+                self._may_restart()
+            else:
+                logger.info('CancelledError in _run - shutting down')
         except Exception as e:
-
+            logger.exception(f'Error in train: {e}')

     # ---------------------------------------- TRAINING STATES ----------------------------------------

     async def _training_loop(self) -> None:
         """Cycle through the training states until the training is finished or
-
+        a critical error occurs (asyncio.CancelledError or CriticalError).
         """
         assert self.training_active

@@ -252,13 +258,20 @@ class TrainerLogicGeneric(ABC):
             await self._perform_state('detecting', TrainerState.Detecting, TrainerState.Detected, self._do_detections)
         elif tstate == TrainerState.Detected:  # -> DetectionUploading -> ReadyForCleanup
             await self._perform_state('upload_detections', TrainerState.DetectionUploading, TrainerState.ReadyForCleanup, self.active_training_io.upload_detetions)
-        elif tstate == TrainerState.ReadyForCleanup:  # -> RESTART or
+        elif tstate == TrainerState.ReadyForCleanup:  # -> Idle (RESTART or _training = None)
             await self._clear_training()
             self._may_restart()

     async def _perform_state(self, error_key: str, state_during: TrainerState, state_after: TrainerState, action: Callable[[], Coroutine], reset_early=False):
+        '''
+        Perform a training state and handle errors.
+        - If the loop sends a StopTraining event, this will raise a CancelledError.
+        - States can raise a CriticalError indicating that there is no point in retrying the state.
+        - If any other error occurs, the error is stored in the errors object and the state is reset to the previous state.
+        '''
+
         await asyncio.sleep(0.1)
-
+        logger.info(f'Performing state: {state_during}')
         previous_state = self.training.training_state
         self.training.training_state = state_during
         await asyncio.sleep(0.1)
@@ -266,21 +279,30 @@ class TrainerLogicGeneric(ABC):
         self.errors.reset(error_key)

         try:
-
-
-            state_after = TrainerState.ReadyForCleanup
+            await action()
+
         except asyncio.CancelledError:
-
-
+            if self.shutdown_event.is_set():
+                logger.info(f'CancelledError in {state_during} - shutdown event set')
+                raise
+            logger.info(f'CancelledError in {state_during} - cleaning up')
+            self.training.training_state = TrainerState.ReadyForCleanup
+        except CriticalError as e:
+            logger.error(f'CriticalError in {state_during} - Exception: {e}')
+            self.errors.set(error_key, str(e))
+            self.training.training_state = TrainerState.ReadyForCleanup
         except Exception as e:
             self.errors.set(error_key, str(e))
-
+            logger.exception('Error in %s - Exception: %s', state_during, e)
             self.training.training_state = previous_state
+            return
         else:
+            logger.info(f'Successfully finished state: {state_during}')
             if not reset_early:
                 self.errors.reset(error_key)
             self.training.training_state = state_after
-
+
+        self.last_training_io.save(self.training)

     async def _prepare(self) -> None:
         """Downloads images to the images_folder and saves annotations to training.data.image_data.
@@ -300,11 +322,11 @@ class TrainerLogicGeneric(ABC):

         # TODO this checks if we continue a training -> make more explicit
         if not base_model_uuid or not is_valid_uuid4(base_model_uuid):
-
+            logger.info(f'skipping model download. No base model provided (in form of uuid): {base_model_uuid}')
             return

-
-
+        logger.info('loading model from Learning Loop')
+        logger.info(f'downloading model {base_model_uuid} as {self.model_format}')
         await self.node.data_exchanger.download_model(self.training.training_folder, self.training.context, base_model_uuid, self.model_format)
         shutil.move(f'{self.training.training_folder}/model.json',
                     f'{self.training.training_folder}/base_model.json')
@@ -327,12 +349,12 @@ class TrainerLogicGeneric(ABC):
             result = await self.node.sio_client.call('update_training', (
                 self.training.context.organization, self.training.context.project, jsonable_encoder(new_training)))
             if isinstance(result, dict) and result['success']:
-
+                logger.info(f'successfully updated training {asdict(new_training)}')
                 self._on_metrics_published(new_best_model)
             else:
                 raise Exception(f'Error for update_training: Response from loop was : {result}')
         except Exception as e:
-
+            logger.exception('Error during confusion matrix syncronization')
             self.errors.set(error_key, str(e))
             raise
         self.errors.reset(error_key)
@@ -341,21 +363,22 @@ class TrainerLogicGeneric(ABC):
         """Uploads the latest model to the Learning Loop.
         """
         new_model_uuid = await self._upload_model_return_new_model_uuid(self.training.context)
-
-            self.training.training_state = TrainerState.ReadyForCleanup
-            logging.error('could not upload model - maybe training failed.. cleaning up')
-        logging.info(f'Successfully uploaded model and received new model id: {new_model_uuid}')
+        logger.info(f'Successfully uploaded model and received new model id: {new_model_uuid}')
         self.training.model_uuid_for_detecting = new_model_uuid

-    async def _upload_model_return_new_model_uuid(self, context: Context) ->
+    async def _upload_model_return_new_model_uuid(self, context: Context) -> str:
         """Upload model files, usually pytorch model (.pt) hyp.yaml and the converted .wts file.
         Note that with the latest trainers the conversion to (.wts) is done by the trainer.
         The conversion from .wts to .engine is done by the detector (needs to be done on target hardware).
-        Note that trainer may train with different classes, which is why we send an initial model.json file.
+        Note that trainer may train with different classes, which is why we send an initial model.json file.
+
+        :return: The new model UUID.
+        :raise CriticalError: If the latest model files cannot be obtained.
+        """

         files = await self._get_latest_model_files()
         if files is None:
-
+            raise CriticalError('Could not get latest model files. Training might have failed.')

         if isinstance(files, List):
             files = {self.model_format: files}
@@ -369,8 +392,6 @@ class TrainerLogicGeneric(ABC):
         assert len([f for f in _files if 'model.json' in f]) == 1, "model.json must be included exactly once"

         model_uuid = await self.node.data_exchanger.upload_model_get_uuid(context, _files, self.training.training_number, file_format)
-        if model_uuid is None:
-            return None

         already_uploaded_formats.append(file_format)
         self.active_training_io.save_model_upload_progress(already_uploaded_formats)
@@ -411,23 +432,23 @@ class TrainerLogicGeneric(ABC):
         if not self.training_active:
             return
         if self.training_task:
-
+            logger.info('cancelling training task')
            if self.training_task.cancel():
                try:
                    await self.training_task
                except asyncio.CancelledError:
                    pass
-
+                logger.info('cancelled training task')
                self._may_restart()

     def _may_restart(self) -> None:
         """If the environment variable RESTART_AFTER_TRAINING is set, the trainer will restart after a training.
         """
         if self._environment_vars.restart_after_training:
-
+            logger.info('restarting')
             sys.exit(0)
         else:
-
+            logger.info('not restarting')
     # ---------------------------------------- ABSTRACT METHODS ----------------------------------------

     @abstractmethod
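A simplified, generic sketch of the error handling documented in the _perform_state docstring above (names, the plain string states and the dict of errors are illustrative, not the package implementation):

import asyncio


class CriticalError(Exception):
    """Raised by a state when retrying makes no sense."""


async def perform_state(training, errors: dict, key: str, state_during: str, state_after: str, action, shutting_down) -> None:
    previous_state = training.state
    training.state = state_during
    try:
        await action()
    except asyncio.CancelledError:
        if shutting_down():
            raise                              # node is going down: propagate the cancellation
        training.state = 'ready_for_cleanup'   # stop was requested: clean up instead of retrying
    except CriticalError as e:
        errors[key] = str(e)
        training.state = 'ready_for_cleanup'   # unrecoverable: skip straight to cleanup
    except Exception as e:
        errors[key] = str(e)
        training.state = previous_state        # transient error: roll back so the state is retried
    else:
        errors.pop(key, None)
        training.state = state_after           # success: advance to the next state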
learning_loop_node-0.10.6/learning_loop_node/detector/outbox.py
@@ -1,117 +0,0 @@
-import json
-import logging
-import os
-import shutil
-import time
-from dataclasses import asdict
-from datetime import datetime
-from glob import glob
-from multiprocessing import Event
-from multiprocessing.synchronize import Event as SyncEvent
-from threading import Thread
-from typing import List, Optional
-
-import requests
-from fastapi.encoders import jsonable_encoder
-
-from ..data_classes import Detections
-from ..globals import GLOBALS
-from ..helpers import environment_reader
-
-
-class Outbox():
-
-    def __init__(self) -> None:
-        self.log = logging.getLogger()
-        self.path = f'{GLOBALS.data_folder}/outbox'
-        os.makedirs(self.path, exist_ok=True)
-
-        host = environment_reader.host()
-        o = environment_reader.organization()
-        p = environment_reader.project()
-
-        assert o and p, 'Outbox needs an organization and a project '
-        base_url = f'http{"s" if "learning-loop.ai" in host else ""}://{host}/api'
-        base: str = base_url
-        self.target_uri = f'{base}/{o}/projects/{p}/images'
-        self.log.info(f'Outbox initialized with target_uri: {self.target_uri}')
-
-        self.shutdown_event: Optional[SyncEvent] = None
-        self.upload_process: Optional[Thread] = None
-
-    def save(self, image: bytes, detections: Optional[Detections] = None, tags: Optional[List[str]] = None) -> None:
-        if detections is None:
-            detections = Detections()
-        if not tags:
-            tags = []
-        identifier = datetime.now().isoformat(sep='_', timespec='milliseconds')
-        tmp = f'{GLOBALS.data_folder}/tmp/{identifier}'
-        detections.tags = tags
-        detections.date = identifier
-        os.makedirs(tmp, exist_ok=True)
-
-        with open(tmp + '/image.json', 'w') as f:
-            json.dump(jsonable_encoder(asdict(detections)), f)
-
-        with open(tmp + '/image.jpg', 'wb') as f:
-            f.write(image)
-
-        if os.path.exists(tmp):
-            os.rename(tmp, self.path + '/' + identifier)  # NOTE rename is atomic so upload can run in parallel
-        else:
-            self.log.error(f'Could not rename {tmp} to {self.path}/{identifier}')
-
-    def get_data_files(self):
-        return glob(f'{self.path}/*')
-
-    def start_continuous_upload(self):
-        self.shutdown_event = Event()
-        self.upload_process = Thread(target=self._continuous_upload)
-        self.upload_process.start()
-
-    def _continuous_upload(self):
-        self.log.info('start continuous upload')
-        assert self.shutdown_event is not None
-        while not self.shutdown_event.is_set():
-            self.upload()
-            time.sleep(1)
-        self.log.info('stop continuous upload')
-
-    def upload(self):
-        items = self.get_data_files()
-        if items:
-            self.log.info(f'Found {len(items)} images to upload')
-        for item in items:
-            if self.shutdown_event and self.shutdown_event.is_set():
-                break
-            try:
-                data = [('files', open(f'{item}/image.json', 'r')),
-                        ('files', open(f'{item}/image.jpg', 'rb'))]
-
-                response = requests.post(self.target_uri, files=data, timeout=30)
-                if response.status_code == 200:
-                    shutil.rmtree(item)
-                    self.log.info(f'uploaded {item} successfully')
-                elif response.status_code == 422:
-                    self.log.error(f'Broken content in {item}: dropping this data')
-                    shutil.rmtree(item)
-                else:
-                    self.log.error(f'Could not upload {item}: {response.status_code}')
-            except Exception:
-                self.log.exception('could not upload files')
-
-    def stop_continuous_upload(self, timeout=5):
-        proc = self.upload_process
-        if not proc:
-            return
-
-        try:
-            assert self.shutdown_event is not None
-            self.shutdown_event.set()
-            assert proc is not None
-            proc.join(timeout)
-        except Exception:
-            logging.exception('error while shutting down upload thread')
-
-        if proc.is_alive():
-            self.log.error('upload thread did not terminate')
All remaining files are renamed from learning_loop_node-0.10.6/… to learning_loop_node-0.10.7/… without content changes (see the file list above, entries with +0 -0).