flwr 1.13.0__py3-none-any.whl → 1.14.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- flwr/cli/app.py +5 -0
- flwr/cli/build.py +1 -37
- flwr/cli/cli_user_auth_interceptor.py +86 -0
- flwr/cli/config_utils.py +19 -2
- flwr/cli/example.py +1 -0
- flwr/cli/install.py +2 -19
- flwr/cli/log.py +18 -36
- flwr/cli/login/__init__.py +22 -0
- flwr/cli/login/login.py +81 -0
- flwr/cli/ls.py +205 -106
- flwr/cli/new/__init__.py +1 -0
- flwr/cli/new/new.py +25 -14
- flwr/cli/new/templates/app/.gitignore.tpl +3 -0
- flwr/cli/new/templates/app/pyproject.baseline.toml.tpl +1 -1
- flwr/cli/new/templates/app/pyproject.flowertune.toml.tpl +3 -3
- flwr/cli/new/templates/app/pyproject.huggingface.toml.tpl +1 -1
- flwr/cli/new/templates/app/pyproject.jax.toml.tpl +1 -1
- flwr/cli/new/templates/app/pyproject.mlx.toml.tpl +2 -3
- flwr/cli/new/templates/app/pyproject.numpy.toml.tpl +1 -1
- flwr/cli/new/templates/app/pyproject.pytorch.toml.tpl +1 -1
- flwr/cli/new/templates/app/pyproject.sklearn.toml.tpl +1 -1
- flwr/cli/new/templates/app/pyproject.tensorflow.toml.tpl +1 -1
- flwr/cli/run/__init__.py +1 -0
- flwr/cli/run/run.py +89 -39
- flwr/cli/stop.py +130 -0
- flwr/cli/utils.py +172 -8
- flwr/client/app.py +14 -3
- flwr/client/client.py +1 -32
- flwr/client/clientapp/app.py +4 -8
- flwr/client/clientapp/utils.py +1 -0
- flwr/client/grpc_adapter_client/connection.py +1 -1
- flwr/client/grpc_client/connection.py +1 -1
- flwr/client/grpc_rere_client/connection.py +13 -7
- flwr/client/message_handler/message_handler.py +1 -2
- flwr/client/mod/comms_mods.py +1 -0
- flwr/client/mod/localdp_mod.py +1 -1
- flwr/client/nodestate/__init__.py +1 -0
- flwr/client/nodestate/nodestate.py +1 -0
- flwr/client/nodestate/nodestate_factory.py +1 -0
- flwr/client/numpy_client.py +0 -44
- flwr/client/rest_client/connection.py +3 -3
- flwr/client/supernode/app.py +2 -2
- flwr/common/address.py +1 -0
- flwr/common/args.py +1 -0
- flwr/common/auth_plugin/__init__.py +24 -0
- flwr/common/auth_plugin/auth_plugin.py +111 -0
- flwr/common/config.py +3 -1
- flwr/common/constant.py +17 -1
- flwr/common/logger.py +40 -0
- flwr/common/message.py +1 -0
- flwr/common/object_ref.py +57 -54
- flwr/common/pyproject.py +1 -0
- flwr/common/record/__init__.py +1 -0
- flwr/common/record/parametersrecord.py +1 -0
- flwr/common/retry_invoker.py +77 -0
- flwr/common/secure_aggregation/secaggplus_utils.py +2 -2
- flwr/common/telemetry.py +15 -4
- flwr/common/typing.py +12 -0
- flwr/common/version.py +1 -0
- flwr/proto/exec_pb2.py +38 -14
- flwr/proto/exec_pb2.pyi +107 -2
- flwr/proto/exec_pb2_grpc.py +102 -0
- flwr/proto/exec_pb2_grpc.pyi +39 -0
- flwr/proto/fab_pb2.py +4 -4
- flwr/proto/fab_pb2.pyi +4 -1
- flwr/proto/serverappio_pb2.py +18 -18
- flwr/proto/serverappio_pb2.pyi +8 -2
- flwr/proto/serverappio_pb2_grpc.py +34 -0
- flwr/proto/serverappio_pb2_grpc.pyi +13 -0
- flwr/proto/simulationio_pb2.py +2 -2
- flwr/proto/simulationio_pb2_grpc.py +34 -0
- flwr/proto/simulationio_pb2_grpc.pyi +13 -0
- flwr/server/app.py +62 -7
- flwr/server/compat/app_utils.py +7 -1
- flwr/server/driver/grpc_driver.py +11 -63
- flwr/server/driver/inmemory_driver.py +5 -1
- flwr/server/run_serverapp.py +8 -9
- flwr/server/serverapp/app.py +25 -10
- flwr/server/strategy/dpfedavg_fixed.py +1 -0
- flwr/server/superlink/driver/serverappio_grpc.py +1 -0
- flwr/server/superlink/driver/serverappio_servicer.py +82 -23
- flwr/server/superlink/ffs/disk_ffs.py +1 -0
- flwr/server/superlink/fleet/grpc_adapter/grpc_adapter_servicer.py +1 -0
- flwr/server/superlink/fleet/grpc_bidi/flower_service_servicer.py +1 -0
- flwr/server/superlink/fleet/grpc_rere/fleet_servicer.py +32 -12
- flwr/server/superlink/fleet/grpc_rere/server_interceptor.py +12 -11
- flwr/server/superlink/fleet/message_handler/message_handler.py +32 -5
- flwr/server/superlink/fleet/rest_rere/rest_api.py +4 -1
- flwr/server/superlink/fleet/vce/__init__.py +1 -0
- flwr/server/superlink/fleet/vce/backend/__init__.py +1 -0
- flwr/server/superlink/fleet/vce/backend/raybackend.py +1 -0
- flwr/server/superlink/linkstate/in_memory_linkstate.py +21 -30
- flwr/server/superlink/linkstate/linkstate.py +17 -2
- flwr/server/superlink/linkstate/sqlite_linkstate.py +30 -49
- flwr/server/superlink/simulation/simulationio_servicer.py +33 -0
- flwr/server/superlink/utils.py +65 -0
- flwr/simulation/app.py +59 -52
- flwr/simulation/ray_transport/ray_actor.py +1 -0
- flwr/simulation/ray_transport/utils.py +1 -0
- flwr/simulation/run_simulation.py +36 -22
- flwr/simulation/simulationio_connection.py +3 -0
- flwr/superexec/app.py +1 -0
- flwr/superexec/deployment.py +1 -0
- flwr/superexec/exec_grpc.py +19 -1
- flwr/superexec/exec_servicer.py +76 -2
- flwr/superexec/exec_user_auth_interceptor.py +101 -0
- flwr/superexec/executor.py +1 -0
- {flwr-1.13.0.dist-info → flwr-1.14.0.dist-info}/METADATA +8 -8
- {flwr-1.13.0.dist-info → flwr-1.14.0.dist-info}/RECORD +112 -112
- flwr/proto/common_pb2.py +0 -36
- flwr/proto/common_pb2.pyi +0 -121
- flwr/proto/common_pb2_grpc.py +0 -4
- flwr/proto/common_pb2_grpc.pyi +0 -4
- flwr/proto/control_pb2.py +0 -27
- flwr/proto/control_pb2.pyi +0 -7
- flwr/proto/control_pb2_grpc.py +0 -135
- flwr/proto/control_pb2_grpc.pyi +0 -53
- {flwr-1.13.0.dist-info → flwr-1.14.0.dist-info}/LICENSE +0 -0
- {flwr-1.13.0.dist-info → flwr-1.14.0.dist-info}/WHEEL +0 -0
- {flwr-1.13.0.dist-info → flwr-1.14.0.dist-info}/entry_points.txt +0 -0
|
@@ -0,0 +1,65 @@
|
|
|
1
|
+
# Copyright 2024 Flower Labs GmbH. All Rights Reserved.
|
|
2
|
+
#
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
#
|
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
+
#
|
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
|
+
# See the License for the specific language governing permissions and
|
|
13
|
+
# limitations under the License.
|
|
14
|
+
# ==============================================================================
|
|
15
|
+
"""SuperLink utilities."""
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
from typing import Union
|
|
19
|
+
|
|
20
|
+
import grpc
|
|
21
|
+
|
|
22
|
+
from flwr.common.constant import Status, SubStatus
|
|
23
|
+
from flwr.common.typing import RunStatus
|
|
24
|
+
from flwr.server.superlink.linkstate import LinkState
|
|
25
|
+
|
|
26
|
+
_STATUS_TO_MSG = {
|
|
27
|
+
Status.PENDING: "Run is pending.",
|
|
28
|
+
Status.STARTING: "Run is starting.",
|
|
29
|
+
Status.RUNNING: "Run is running.",
|
|
30
|
+
Status.FINISHED: "Run is finished.",
|
|
31
|
+
}
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def check_abort(
|
|
35
|
+
run_id: int,
|
|
36
|
+
abort_status_list: list[str],
|
|
37
|
+
state: LinkState,
|
|
38
|
+
) -> Union[str, None]:
|
|
39
|
+
"""Check if the status of the provided `run_id` is in `abort_status_list`."""
|
|
40
|
+
run_status: RunStatus = state.get_run_status({run_id})[run_id]
|
|
41
|
+
|
|
42
|
+
if run_status.status in abort_status_list:
|
|
43
|
+
msg = _STATUS_TO_MSG[run_status.status]
|
|
44
|
+
if run_status.sub_status == SubStatus.STOPPED:
|
|
45
|
+
msg += " Stopped by user."
|
|
46
|
+
return msg
|
|
47
|
+
|
|
48
|
+
return None
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
def abort_grpc_context(msg: Union[str, None], context: grpc.ServicerContext) -> None:
|
|
52
|
+
"""Abort context with statuscode PERMISSION_DENIED if `msg` is not None."""
|
|
53
|
+
if msg is not None:
|
|
54
|
+
context.abort(grpc.StatusCode.PERMISSION_DENIED, msg)
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
def abort_if(
|
|
58
|
+
run_id: int,
|
|
59
|
+
abort_status_list: list[str],
|
|
60
|
+
state: LinkState,
|
|
61
|
+
context: grpc.ServicerContext,
|
|
62
|
+
) -> None:
|
|
63
|
+
"""Abort context if status of the provided `run_id` is in `abort_status_list`."""
|
|
64
|
+
msg = check_abort(run_id, abort_status_list, state)
|
|
65
|
+
abort_grpc_context(msg, context)
|
flwr/simulation/app.py
CHANGED
|
@@ -16,6 +16,7 @@
|
|
|
16
16
|
|
|
17
17
|
|
|
18
18
|
import argparse
|
|
19
|
+
import sys
|
|
19
20
|
from logging import DEBUG, ERROR, INFO
|
|
20
21
|
from queue import Queue
|
|
21
22
|
from time import sleep
|
|
@@ -23,8 +24,9 @@ from typing import Optional
|
|
|
23
24
|
|
|
24
25
|
from flwr.cli.config_utils import get_fab_metadata
|
|
25
26
|
from flwr.cli.install import install_from_fab
|
|
26
|
-
from flwr.
|
|
27
|
-
from flwr.common
|
|
27
|
+
from flwr.cli.utils import get_sha256_hash
|
|
28
|
+
from flwr.common import EventType, event
|
|
29
|
+
from flwr.common.args import add_args_flwr_app_common
|
|
28
30
|
from flwr.common.config import (
|
|
29
31
|
get_flwr_dir,
|
|
30
32
|
get_fused_config_from_dir,
|
|
@@ -32,7 +34,11 @@ from flwr.common.config import (
|
|
|
32
34
|
get_project_dir,
|
|
33
35
|
unflatten_dict,
|
|
34
36
|
)
|
|
35
|
-
from flwr.common.constant import
|
|
37
|
+
from flwr.common.constant import (
|
|
38
|
+
SIMULATIONIO_API_DEFAULT_CLIENT_ADDRESS,
|
|
39
|
+
Status,
|
|
40
|
+
SubStatus,
|
|
41
|
+
)
|
|
36
42
|
from flwr.common.logger import (
|
|
37
43
|
log,
|
|
38
44
|
mirror_output_to_queue,
|
|
@@ -43,6 +49,7 @@ from flwr.common.logger import (
|
|
|
43
49
|
from flwr.common.serde import (
|
|
44
50
|
configs_record_from_proto,
|
|
45
51
|
context_from_proto,
|
|
52
|
+
context_to_proto,
|
|
46
53
|
fab_from_proto,
|
|
47
54
|
run_from_proto,
|
|
48
55
|
run_status_to_proto,
|
|
@@ -69,61 +76,30 @@ def flwr_simulation() -> None:
|
|
|
69
76
|
log_queue: Queue[Optional[str]] = Queue()
|
|
70
77
|
mirror_output_to_queue(log_queue)
|
|
71
78
|
|
|
72
|
-
|
|
73
|
-
description="Run a Flower Simulation",
|
|
74
|
-
)
|
|
75
|
-
parser.add_argument(
|
|
76
|
-
"--superlink",
|
|
77
|
-
type=str,
|
|
78
|
-
help="Address of SuperLink's SimulationIO API",
|
|
79
|
-
)
|
|
80
|
-
parser.add_argument(
|
|
81
|
-
"--run-once",
|
|
82
|
-
action="store_true",
|
|
83
|
-
help="When set, this process will start a single simulation "
|
|
84
|
-
"for a pending Run. If no pending run the process will exit. ",
|
|
85
|
-
)
|
|
86
|
-
parser.add_argument(
|
|
87
|
-
"--flwr-dir",
|
|
88
|
-
default=None,
|
|
89
|
-
help="""The path containing installed Flower Apps.
|
|
90
|
-
By default, this value is equal to:
|
|
91
|
-
|
|
92
|
-
- `$FLWR_HOME/` if `$FLWR_HOME` is defined
|
|
93
|
-
- `$XDG_DATA_HOME/.flwr/` if `$XDG_DATA_HOME` is defined
|
|
94
|
-
- `$HOME/.flwr/` in all other cases
|
|
95
|
-
""",
|
|
96
|
-
)
|
|
97
|
-
parser.add_argument(
|
|
98
|
-
"--insecure",
|
|
99
|
-
action="store_true",
|
|
100
|
-
help="Run the server without HTTPS, regardless of whether certificate "
|
|
101
|
-
"paths are provided. By default, the server runs with HTTPS enabled. "
|
|
102
|
-
"Use this flag only if you understand the risks.",
|
|
103
|
-
)
|
|
104
|
-
parser.add_argument(
|
|
105
|
-
"--root-certificates",
|
|
106
|
-
metavar="ROOT_CERT",
|
|
107
|
-
type=str,
|
|
108
|
-
help="Specifies the path to the PEM-encoded root certificate file for "
|
|
109
|
-
"establishing secure HTTPS connections.",
|
|
110
|
-
)
|
|
111
|
-
args = parser.parse_args()
|
|
79
|
+
args = _parse_args_run_flwr_simulation().parse_args()
|
|
112
80
|
|
|
113
81
|
log(INFO, "Starting Flower Simulation")
|
|
114
|
-
|
|
82
|
+
|
|
83
|
+
if not args.insecure:
|
|
84
|
+
log(
|
|
85
|
+
ERROR,
|
|
86
|
+
"`flwr-simulation` does not support TLS yet. "
|
|
87
|
+
"Please use the '--insecure' flag.",
|
|
88
|
+
)
|
|
89
|
+
sys.exit(1)
|
|
115
90
|
|
|
116
91
|
log(
|
|
117
92
|
DEBUG,
|
|
118
|
-
"
|
|
119
|
-
|
|
93
|
+
"Starting isolated `Simulation` connected to SuperLink SimulationAppIo API "
|
|
94
|
+
"at %s",
|
|
95
|
+
args.simulationio_api_address,
|
|
120
96
|
)
|
|
121
97
|
run_simulation_process(
|
|
122
|
-
simulationio_api_address=args.
|
|
98
|
+
simulationio_api_address=args.simulationio_api_address,
|
|
123
99
|
log_queue=log_queue,
|
|
124
100
|
run_once=args.run_once,
|
|
125
101
|
flwr_dir_=args.flwr_dir,
|
|
126
|
-
certificates=
|
|
102
|
+
certificates=None,
|
|
127
103
|
)
|
|
128
104
|
|
|
129
105
|
# Restore stdout/stderr
|
|
@@ -225,10 +201,19 @@ def run_simulation_process( # pylint: disable=R0914, disable=W0212, disable=R09
|
|
|
225
201
|
)
|
|
226
202
|
backend_config: BackendConfig = fed_opt.get("backend", {})
|
|
227
203
|
verbose: bool = fed_opt.get("verbose", False)
|
|
228
|
-
enable_tf_gpu_growth: bool = fed_opt.get("enable_tf_gpu_growth",
|
|
204
|
+
enable_tf_gpu_growth: bool = fed_opt.get("enable_tf_gpu_growth", False)
|
|
205
|
+
|
|
206
|
+
event(
|
|
207
|
+
EventType.FLWR_SIMULATION_RUN_ENTER,
|
|
208
|
+
event_details={
|
|
209
|
+
"backend": "ray",
|
|
210
|
+
"num-supernodes": num_supernodes,
|
|
211
|
+
"run-id-hash": get_sha256_hash(run.run_id),
|
|
212
|
+
},
|
|
213
|
+
)
|
|
229
214
|
|
|
230
215
|
# Launch the simulation
|
|
231
|
-
_run_simulation(
|
|
216
|
+
updated_context = _run_simulation(
|
|
232
217
|
server_app_attr=server_app_attr,
|
|
233
218
|
client_app_attr=client_app_attr,
|
|
234
219
|
num_supernodes=num_supernodes,
|
|
@@ -239,11 +224,11 @@ def run_simulation_process( # pylint: disable=R0914, disable=W0212, disable=R09
|
|
|
239
224
|
verbose_logging=verbose,
|
|
240
225
|
server_app_run_config=fused_config,
|
|
241
226
|
is_app=True,
|
|
242
|
-
exit_event=EventType.
|
|
227
|
+
exit_event=EventType.FLWR_SIMULATION_RUN_LEAVE,
|
|
243
228
|
)
|
|
244
229
|
|
|
245
230
|
# Send resulting context
|
|
246
|
-
context_proto =
|
|
231
|
+
context_proto = context_to_proto(updated_context)
|
|
247
232
|
out_req = PushSimulationOutputsRequest(
|
|
248
233
|
run_id=run.run_id, context=context_proto
|
|
249
234
|
)
|
|
@@ -274,3 +259,25 @@ def run_simulation_process( # pylint: disable=R0914, disable=W0212, disable=R09
|
|
|
274
259
|
# Stop the loop if `flwr-simulation` is expected to process a single run
|
|
275
260
|
if run_once:
|
|
276
261
|
break
|
|
262
|
+
|
|
263
|
+
|
|
264
|
+
def _parse_args_run_flwr_simulation() -> argparse.ArgumentParser:
|
|
265
|
+
"""Parse flwr-simulation command line arguments."""
|
|
266
|
+
parser = argparse.ArgumentParser(
|
|
267
|
+
description="Run a Flower Simulation",
|
|
268
|
+
)
|
|
269
|
+
parser.add_argument(
|
|
270
|
+
"--simulationio-api-address",
|
|
271
|
+
default=SIMULATIONIO_API_DEFAULT_CLIENT_ADDRESS,
|
|
272
|
+
type=str,
|
|
273
|
+
help="Address of SuperLink's SimulationIO API (IPv4, IPv6, or a domain name)."
|
|
274
|
+
f"By default, it is set to {SIMULATIONIO_API_DEFAULT_CLIENT_ADDRESS}.",
|
|
275
|
+
)
|
|
276
|
+
parser.add_argument(
|
|
277
|
+
"--run-once",
|
|
278
|
+
action="store_true",
|
|
279
|
+
help="When set, this process will start a single simulation "
|
|
280
|
+
"for a pending Run. If no pending run the process will exit. ",
|
|
281
|
+
)
|
|
282
|
+
add_args_flwr_app_common(parser=parser)
|
|
283
|
+
return parser
|
|
@@ -14,6 +14,7 @@
|
|
|
14
14
|
# ==============================================================================
|
|
15
15
|
"""Flower Simulation."""
|
|
16
16
|
|
|
17
|
+
|
|
17
18
|
import argparse
|
|
18
19
|
import asyncio
|
|
19
20
|
import json
|
|
@@ -23,10 +24,11 @@ import threading
|
|
|
23
24
|
import traceback
|
|
24
25
|
from logging import DEBUG, ERROR, INFO, WARNING
|
|
25
26
|
from pathlib import Path
|
|
26
|
-
from
|
|
27
|
+
from queue import Empty, Queue
|
|
27
28
|
from typing import Any, Optional
|
|
28
29
|
|
|
29
30
|
from flwr.cli.config_utils import load_and_validate
|
|
31
|
+
from flwr.cli.utils import get_sha256_hash
|
|
30
32
|
from flwr.client import ClientApp
|
|
31
33
|
from flwr.common import Context, EventType, RecordSet, event, log, now
|
|
32
34
|
from flwr.common.config import get_fused_config_from_dir, parse_config_args
|
|
@@ -126,7 +128,7 @@ def run_simulation_from_cli() -> None:
|
|
|
126
128
|
run = Run.create_empty(run_id)
|
|
127
129
|
run.override_config = override_config
|
|
128
130
|
|
|
129
|
-
_run_simulation(
|
|
131
|
+
_ = _run_simulation(
|
|
130
132
|
server_app_attr=server_app_attr,
|
|
131
133
|
client_app_attr=client_app_attr,
|
|
132
134
|
num_supernodes=args.num_supernodes,
|
|
@@ -135,7 +137,6 @@ def run_simulation_from_cli() -> None:
|
|
|
135
137
|
app_dir=args.app,
|
|
136
138
|
run=run,
|
|
137
139
|
enable_tf_gpu_growth=args.enable_tf_gpu_growth,
|
|
138
|
-
delay_start=args.delay_start,
|
|
139
140
|
verbose_logging=args.verbose,
|
|
140
141
|
server_app_run_config=fused_config,
|
|
141
142
|
is_app=True,
|
|
@@ -207,7 +208,7 @@ def run_simulation(
|
|
|
207
208
|
"\n\tflwr.simulation.run_simulationt(...)",
|
|
208
209
|
)
|
|
209
210
|
|
|
210
|
-
_run_simulation(
|
|
211
|
+
_ = _run_simulation(
|
|
211
212
|
num_supernodes=num_supernodes,
|
|
212
213
|
client_app=client_app,
|
|
213
214
|
server_app=server_app,
|
|
@@ -230,6 +231,7 @@ def run_serverapp_th(
|
|
|
230
231
|
has_exception: threading.Event,
|
|
231
232
|
enable_tf_gpu_growth: bool,
|
|
232
233
|
run_id: int,
|
|
234
|
+
ctx_queue: "Queue[Context]",
|
|
233
235
|
) -> threading.Thread:
|
|
234
236
|
"""Run SeverApp in a thread."""
|
|
235
237
|
|
|
@@ -242,6 +244,7 @@ def run_serverapp_th(
|
|
|
242
244
|
_server_app_run_config: UserConfig,
|
|
243
245
|
_server_app_attr: Optional[str],
|
|
244
246
|
_server_app: Optional[ServerApp],
|
|
247
|
+
_ctx_queue: "Queue[Context]",
|
|
245
248
|
) -> None:
|
|
246
249
|
"""Run SeverApp, after check if GPU memory growth has to be set.
|
|
247
250
|
|
|
@@ -262,13 +265,14 @@ def run_serverapp_th(
|
|
|
262
265
|
)
|
|
263
266
|
|
|
264
267
|
# Run ServerApp
|
|
265
|
-
_run(
|
|
268
|
+
updated_context = _run(
|
|
266
269
|
driver=_driver,
|
|
267
270
|
context=context,
|
|
268
271
|
server_app_dir=_server_app_dir,
|
|
269
272
|
server_app_attr=_server_app_attr,
|
|
270
273
|
loaded_server_app=_server_app,
|
|
271
274
|
)
|
|
275
|
+
_ctx_queue.put(updated_context)
|
|
272
276
|
except Exception as ex: # pylint: disable=broad-exception-caught
|
|
273
277
|
log(ERROR, "ServerApp thread raised an exception: %s", ex)
|
|
274
278
|
log(ERROR, traceback.format_exc())
|
|
@@ -292,6 +296,7 @@ def run_serverapp_th(
|
|
|
292
296
|
server_app_run_config,
|
|
293
297
|
server_app_attr,
|
|
294
298
|
server_app,
|
|
299
|
+
ctx_queue,
|
|
295
300
|
),
|
|
296
301
|
)
|
|
297
302
|
serverapp_th.start()
|
|
@@ -308,14 +313,13 @@ def _main_loop(
|
|
|
308
313
|
enable_tf_gpu_growth: bool,
|
|
309
314
|
run: Run,
|
|
310
315
|
exit_event: EventType,
|
|
311
|
-
delay_start: int,
|
|
312
316
|
flwr_dir: Optional[str] = None,
|
|
313
317
|
client_app: Optional[ClientApp] = None,
|
|
314
318
|
client_app_attr: Optional[str] = None,
|
|
315
319
|
server_app: Optional[ServerApp] = None,
|
|
316
320
|
server_app_attr: Optional[str] = None,
|
|
317
321
|
server_app_run_config: Optional[UserConfig] = None,
|
|
318
|
-
) ->
|
|
322
|
+
) -> Context:
|
|
319
323
|
"""Start ServerApp on a separate thread, then launch Simulation Engine."""
|
|
320
324
|
# Initialize StateFactory
|
|
321
325
|
state_factory = LinkStateFactory(":flwr-in-memory-state:")
|
|
@@ -325,6 +329,13 @@ def _main_loop(
|
|
|
325
329
|
server_app_thread_has_exception = threading.Event()
|
|
326
330
|
serverapp_th = None
|
|
327
331
|
success = True
|
|
332
|
+
updated_context = Context(
|
|
333
|
+
run_id=run.run_id,
|
|
334
|
+
node_id=0,
|
|
335
|
+
node_config=UserConfig(),
|
|
336
|
+
state=RecordSet(),
|
|
337
|
+
run_config=UserConfig(),
|
|
338
|
+
)
|
|
328
339
|
try:
|
|
329
340
|
# Register run
|
|
330
341
|
log(DEBUG, "Pre-registering run with id %s", run.run_id)
|
|
@@ -339,6 +350,7 @@ def _main_loop(
|
|
|
339
350
|
# Initialize Driver
|
|
340
351
|
driver = InMemoryDriver(state_factory=state_factory)
|
|
341
352
|
driver.set_run(run_id=run.run_id)
|
|
353
|
+
output_context_queue: "Queue[Context]" = Queue()
|
|
342
354
|
|
|
343
355
|
# Get and run ServerApp thread
|
|
344
356
|
serverapp_th = run_serverapp_th(
|
|
@@ -351,11 +363,9 @@ def _main_loop(
|
|
|
351
363
|
has_exception=server_app_thread_has_exception,
|
|
352
364
|
enable_tf_gpu_growth=enable_tf_gpu_growth,
|
|
353
365
|
run_id=run.run_id,
|
|
366
|
+
ctx_queue=output_context_queue,
|
|
354
367
|
)
|
|
355
368
|
|
|
356
|
-
# Buffer time so the `ServerApp` in separate thread is ready
|
|
357
|
-
log(DEBUG, "Buffer time delay: %ds", delay_start)
|
|
358
|
-
sleep(delay_start)
|
|
359
369
|
# Start Simulation Engine
|
|
360
370
|
vce.start_vce(
|
|
361
371
|
num_supernodes=num_supernodes,
|
|
@@ -371,6 +381,11 @@ def _main_loop(
|
|
|
371
381
|
flwr_dir=flwr_dir,
|
|
372
382
|
)
|
|
373
383
|
|
|
384
|
+
updated_context = output_context_queue.get(timeout=3)
|
|
385
|
+
|
|
386
|
+
except Empty:
|
|
387
|
+
log(DEBUG, "Queue timeout. No context received.")
|
|
388
|
+
|
|
374
389
|
except Exception as ex:
|
|
375
390
|
log(ERROR, "An exception occurred !! %s", ex)
|
|
376
391
|
log(ERROR, traceback.format_exc())
|
|
@@ -380,13 +395,20 @@ def _main_loop(
|
|
|
380
395
|
finally:
|
|
381
396
|
# Trigger stop event
|
|
382
397
|
f_stop.set()
|
|
383
|
-
event(
|
|
398
|
+
event(
|
|
399
|
+
exit_event,
|
|
400
|
+
event_details={
|
|
401
|
+
"run-id-hash": get_sha256_hash(run.run_id),
|
|
402
|
+
"success": success,
|
|
403
|
+
},
|
|
404
|
+
)
|
|
384
405
|
if serverapp_th:
|
|
385
406
|
serverapp_th.join()
|
|
386
407
|
if server_app_thread_has_exception.is_set():
|
|
387
408
|
raise RuntimeError("Exception in ServerApp thread")
|
|
388
409
|
|
|
389
410
|
log(DEBUG, "Stopping Simulation Engine now.")
|
|
411
|
+
return updated_context
|
|
390
412
|
|
|
391
413
|
|
|
392
414
|
# pylint: disable=too-many-arguments,too-many-locals,too-many-positional-arguments
|
|
@@ -404,10 +426,9 @@ def _run_simulation(
|
|
|
404
426
|
flwr_dir: Optional[str] = None,
|
|
405
427
|
run: Optional[Run] = None,
|
|
406
428
|
enable_tf_gpu_growth: bool = False,
|
|
407
|
-
delay_start: int = 5,
|
|
408
429
|
verbose_logging: bool = False,
|
|
409
430
|
is_app: bool = False,
|
|
410
|
-
) ->
|
|
431
|
+
) -> Context:
|
|
411
432
|
"""Launch the Simulation Engine."""
|
|
412
433
|
if backend_config is None:
|
|
413
434
|
backend_config = {}
|
|
@@ -459,7 +480,6 @@ def _run_simulation(
|
|
|
459
480
|
enable_tf_gpu_growth,
|
|
460
481
|
run,
|
|
461
482
|
exit_event,
|
|
462
|
-
delay_start,
|
|
463
483
|
flwr_dir,
|
|
464
484
|
client_app,
|
|
465
485
|
client_app_attr,
|
|
@@ -487,7 +507,8 @@ def _run_simulation(
|
|
|
487
507
|
# Set logger propagation to False to prevent duplicated log output in Colab.
|
|
488
508
|
logger = set_logger_propagation(logger, False)
|
|
489
509
|
|
|
490
|
-
_main_loop(*args)
|
|
510
|
+
updated_context = _main_loop(*args)
|
|
511
|
+
return updated_context
|
|
491
512
|
|
|
492
513
|
|
|
493
514
|
def _parse_args_run_simulation() -> argparse.ArgumentParser:
|
|
@@ -537,13 +558,6 @@ def _parse_args_run_simulation() -> argparse.ArgumentParser:
|
|
|
537
558
|
"Read more about how `tf.config.experimental.set_memory_growth()` works in "
|
|
538
559
|
"the TensorFlow documentation: https://www.tensorflow.org/api/stable.",
|
|
539
560
|
)
|
|
540
|
-
parser.add_argument(
|
|
541
|
-
"--delay-start",
|
|
542
|
-
type=int,
|
|
543
|
-
default=3,
|
|
544
|
-
help="Buffer time (in seconds) to delay the start the simulation engine after "
|
|
545
|
-
"the `ServerApp`, which runs in a separate thread, has been launched.",
|
|
546
|
-
)
|
|
547
561
|
parser.add_argument(
|
|
548
562
|
"--verbose",
|
|
549
563
|
action="store_true",
|
|
@@ -23,6 +23,7 @@ import grpc
|
|
|
23
23
|
from flwr.common.constant import SIMULATIONIO_API_DEFAULT_CLIENT_ADDRESS
|
|
24
24
|
from flwr.common.grpc import create_channel
|
|
25
25
|
from flwr.common.logger import log
|
|
26
|
+
from flwr.common.retry_invoker import _make_simple_grpc_retry_invoker, _wrap_stub
|
|
26
27
|
from flwr.proto.simulationio_pb2_grpc import SimulationIoStub # pylint: disable=E0611
|
|
27
28
|
|
|
28
29
|
|
|
@@ -48,6 +49,7 @@ class SimulationIoConnection:
|
|
|
48
49
|
self._cert = root_certificates
|
|
49
50
|
self._grpc_stub: Optional[SimulationIoStub] = None
|
|
50
51
|
self._channel: Optional[grpc.Channel] = None
|
|
52
|
+
self._retry_invoker = _make_simple_grpc_retry_invoker()
|
|
51
53
|
|
|
52
54
|
@property
|
|
53
55
|
def _is_connected(self) -> bool:
|
|
@@ -72,6 +74,7 @@ class SimulationIoConnection:
|
|
|
72
74
|
root_certificates=self._cert,
|
|
73
75
|
)
|
|
74
76
|
self._grpc_stub = SimulationIoStub(self._channel)
|
|
77
|
+
_wrap_stub(self._grpc_stub, self._retry_invoker)
|
|
75
78
|
log(DEBUG, "[SimulationIO] Connected to %s", self._addr)
|
|
76
79
|
|
|
77
80
|
def _disconnect(self) -> None:
|
flwr/superexec/app.py
CHANGED
flwr/superexec/deployment.py
CHANGED
flwr/superexec/exec_grpc.py
CHANGED
|
@@ -14,18 +14,22 @@
|
|
|
14
14
|
# ==============================================================================
|
|
15
15
|
"""SuperExec gRPC API."""
|
|
16
16
|
|
|
17
|
+
|
|
18
|
+
from collections.abc import Sequence
|
|
17
19
|
from logging import INFO
|
|
18
20
|
from typing import Optional
|
|
19
21
|
|
|
20
22
|
import grpc
|
|
21
23
|
|
|
22
24
|
from flwr.common import GRPC_MAX_MESSAGE_LENGTH
|
|
25
|
+
from flwr.common.auth_plugin import ExecAuthPlugin
|
|
23
26
|
from flwr.common.logger import log
|
|
24
27
|
from flwr.common.typing import UserConfig
|
|
25
28
|
from flwr.proto.exec_pb2_grpc import add_ExecServicer_to_server
|
|
26
29
|
from flwr.server.superlink.ffs.ffs_factory import FfsFactory
|
|
27
30
|
from flwr.server.superlink.fleet.grpc_bidi.grpc_server import generic_create_grpc_server
|
|
28
31
|
from flwr.server.superlink.linkstate import LinkStateFactory
|
|
32
|
+
from flwr.superexec.exec_user_auth_interceptor import ExecUserAuthInterceptor
|
|
29
33
|
|
|
30
34
|
from .exec_servicer import ExecServicer
|
|
31
35
|
from .executor import Executor
|
|
@@ -39,6 +43,7 @@ def run_exec_api_grpc(
|
|
|
39
43
|
ffs_factory: FfsFactory,
|
|
40
44
|
certificates: Optional[tuple[bytes, bytes, bytes]],
|
|
41
45
|
config: UserConfig,
|
|
46
|
+
auth_plugin: Optional[ExecAuthPlugin] = None,
|
|
42
47
|
) -> grpc.Server:
|
|
43
48
|
"""Run Exec API (gRPC, request-response)."""
|
|
44
49
|
executor.set_config(config)
|
|
@@ -47,16 +52,29 @@ def run_exec_api_grpc(
|
|
|
47
52
|
linkstate_factory=state_factory,
|
|
48
53
|
ffs_factory=ffs_factory,
|
|
49
54
|
executor=executor,
|
|
55
|
+
auth_plugin=auth_plugin,
|
|
50
56
|
)
|
|
57
|
+
interceptors: Optional[Sequence[grpc.ServerInterceptor]] = None
|
|
58
|
+
if auth_plugin is not None:
|
|
59
|
+
interceptors = [ExecUserAuthInterceptor(auth_plugin)]
|
|
51
60
|
exec_add_servicer_to_server_fn = add_ExecServicer_to_server
|
|
52
61
|
exec_grpc_server = generic_create_grpc_server(
|
|
53
62
|
servicer_and_add_fn=(exec_servicer, exec_add_servicer_to_server_fn),
|
|
54
63
|
server_address=address,
|
|
55
64
|
max_message_length=GRPC_MAX_MESSAGE_LENGTH,
|
|
56
65
|
certificates=certificates,
|
|
66
|
+
interceptors=interceptors,
|
|
57
67
|
)
|
|
58
68
|
|
|
59
|
-
|
|
69
|
+
if auth_plugin is None:
|
|
70
|
+
log(INFO, "Flower Deployment Engine: Starting Exec API on %s", address)
|
|
71
|
+
else:
|
|
72
|
+
log(
|
|
73
|
+
INFO,
|
|
74
|
+
"Flower Deployment Engine: Starting Exec API with user "
|
|
75
|
+
"authentication on %s",
|
|
76
|
+
address,
|
|
77
|
+
)
|
|
60
78
|
exec_grpc_server.start()
|
|
61
79
|
|
|
62
80
|
return exec_grpc_server
|
flwr/superexec/exec_servicer.py
CHANGED
|
@@ -18,24 +18,33 @@
|
|
|
18
18
|
import time
|
|
19
19
|
from collections.abc import Generator
|
|
20
20
|
from logging import ERROR, INFO
|
|
21
|
-
from typing import Any
|
|
21
|
+
from typing import Any, Optional
|
|
22
|
+
from uuid import UUID
|
|
22
23
|
|
|
23
24
|
import grpc
|
|
24
25
|
|
|
25
26
|
from flwr.common import now
|
|
26
|
-
from flwr.common.
|
|
27
|
+
from flwr.common.auth_plugin import ExecAuthPlugin
|
|
28
|
+
from flwr.common.constant import LOG_STREAM_INTERVAL, Status, SubStatus
|
|
27
29
|
from flwr.common.logger import log
|
|
28
30
|
from flwr.common.serde import (
|
|
29
31
|
configs_record_from_proto,
|
|
30
32
|
run_to_proto,
|
|
31
33
|
user_config_from_proto,
|
|
32
34
|
)
|
|
35
|
+
from flwr.common.typing import RunStatus
|
|
33
36
|
from flwr.proto import exec_pb2_grpc # pylint: disable=E0611
|
|
34
37
|
from flwr.proto.exec_pb2 import ( # pylint: disable=E0611
|
|
38
|
+
GetAuthTokensRequest,
|
|
39
|
+
GetAuthTokensResponse,
|
|
40
|
+
GetLoginDetailsRequest,
|
|
41
|
+
GetLoginDetailsResponse,
|
|
35
42
|
ListRunsRequest,
|
|
36
43
|
ListRunsResponse,
|
|
37
44
|
StartRunRequest,
|
|
38
45
|
StartRunResponse,
|
|
46
|
+
StopRunRequest,
|
|
47
|
+
StopRunResponse,
|
|
39
48
|
StreamLogsRequest,
|
|
40
49
|
StreamLogsResponse,
|
|
41
50
|
)
|
|
@@ -53,11 +62,13 @@ class ExecServicer(exec_pb2_grpc.ExecServicer):
|
|
|
53
62
|
linkstate_factory: LinkStateFactory,
|
|
54
63
|
ffs_factory: FfsFactory,
|
|
55
64
|
executor: Executor,
|
|
65
|
+
auth_plugin: Optional[ExecAuthPlugin] = None,
|
|
56
66
|
) -> None:
|
|
57
67
|
self.linkstate_factory = linkstate_factory
|
|
58
68
|
self.ffs_factory = ffs_factory
|
|
59
69
|
self.executor = executor
|
|
60
70
|
self.executor.initialize(linkstate_factory, ffs_factory)
|
|
71
|
+
self.auth_plugin = auth_plugin
|
|
61
72
|
|
|
62
73
|
def StartRun(
|
|
63
74
|
self, request: StartRunRequest, context: grpc.ServicerContext
|
|
@@ -126,6 +137,69 @@ class ExecServicer(exec_pb2_grpc.ExecServicer):
|
|
|
126
137
|
# Handle `flwr ls --run-id <run_id>`
|
|
127
138
|
return _create_list_runs_response({request.run_id}, state)
|
|
128
139
|
|
|
140
|
+
def StopRun(
|
|
141
|
+
self, request: StopRunRequest, context: grpc.ServicerContext
|
|
142
|
+
) -> StopRunResponse:
|
|
143
|
+
"""Stop a given run ID."""
|
|
144
|
+
log(INFO, "ExecServicer.StopRun")
|
|
145
|
+
state = self.linkstate_factory.state()
|
|
146
|
+
|
|
147
|
+
# Exit if `run_id` not found
|
|
148
|
+
if not state.get_run(request.run_id):
|
|
149
|
+
context.abort(
|
|
150
|
+
grpc.StatusCode.NOT_FOUND, f"Run ID {request.run_id} not found"
|
|
151
|
+
)
|
|
152
|
+
|
|
153
|
+
run_status = state.get_run_status({request.run_id})[request.run_id]
|
|
154
|
+
if run_status.status == Status.FINISHED:
|
|
155
|
+
context.abort(
|
|
156
|
+
grpc.StatusCode.FAILED_PRECONDITION,
|
|
157
|
+
f"Run ID {request.run_id} is already finished",
|
|
158
|
+
)
|
|
159
|
+
|
|
160
|
+
update_success = state.update_run_status(
|
|
161
|
+
run_id=request.run_id,
|
|
162
|
+
new_status=RunStatus(Status.FINISHED, SubStatus.STOPPED, ""),
|
|
163
|
+
)
|
|
164
|
+
|
|
165
|
+
if update_success:
|
|
166
|
+
task_ids: set[UUID] = state.get_task_ids_from_run_id(request.run_id)
|
|
167
|
+
|
|
168
|
+
# Delete TaskIns and TaskRes for the `run_id`
|
|
169
|
+
state.delete_tasks(task_ids)
|
|
170
|
+
|
|
171
|
+
return StopRunResponse(success=update_success)
|
|
172
|
+
|
|
173
|
+
def GetLoginDetails(
|
|
174
|
+
self, request: GetLoginDetailsRequest, context: grpc.ServicerContext
|
|
175
|
+
) -> GetLoginDetailsResponse:
|
|
176
|
+
"""Start login."""
|
|
177
|
+
log(INFO, "ExecServicer.GetLoginDetails")
|
|
178
|
+
if self.auth_plugin is None:
|
|
179
|
+
context.abort(
|
|
180
|
+
grpc.StatusCode.UNIMPLEMENTED,
|
|
181
|
+
"ExecServicer initialized without user authentication",
|
|
182
|
+
)
|
|
183
|
+
raise grpc.RpcError() # This line is unreachable
|
|
184
|
+
return GetLoginDetailsResponse(
|
|
185
|
+
login_details=self.auth_plugin.get_login_details()
|
|
186
|
+
)
|
|
187
|
+
|
|
188
|
+
def GetAuthTokens(
|
|
189
|
+
self, request: GetAuthTokensRequest, context: grpc.ServicerContext
|
|
190
|
+
) -> GetAuthTokensResponse:
|
|
191
|
+
"""Get auth token."""
|
|
192
|
+
log(INFO, "ExecServicer.GetAuthTokens")
|
|
193
|
+
if self.auth_plugin is None:
|
|
194
|
+
context.abort(
|
|
195
|
+
grpc.StatusCode.UNIMPLEMENTED,
|
|
196
|
+
"ExecServicer initialized without user authentication",
|
|
197
|
+
)
|
|
198
|
+
raise grpc.RpcError() # This line is unreachable
|
|
199
|
+
return GetAuthTokensResponse(
|
|
200
|
+
auth_tokens=self.auth_plugin.get_auth_tokens(dict(request.auth_details))
|
|
201
|
+
)
|
|
202
|
+
|
|
129
203
|
|
|
130
204
|
def _create_list_runs_response(run_ids: set[int], state: LinkState) -> ListRunsResponse:
|
|
131
205
|
"""Create response for `flwr ls --runs` and `flwr ls --run-id <run_id>`."""
|