flwr-nightly 1.13.0.dev20241106__py3-none-any.whl → 1.13.0.dev20241117__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of flwr-nightly might be problematic. Click here for more details.
- flwr/cli/app.py +2 -0
- flwr/cli/build.py +37 -0
- flwr/cli/install.py +5 -3
- flwr/cli/ls.py +228 -0
- flwr/cli/run/run.py +16 -5
- flwr/client/app.py +68 -19
- flwr/client/clientapp/app.py +51 -35
- flwr/client/grpc_rere_client/connection.py +2 -12
- flwr/client/nodestate/__init__.py +25 -0
- flwr/client/nodestate/in_memory_nodestate.py +38 -0
- flwr/client/nodestate/nodestate.py +30 -0
- flwr/client/nodestate/nodestate_factory.py +37 -0
- flwr/client/rest_client/connection.py +4 -14
- flwr/client/supernode/app.py +57 -53
- flwr/common/args.py +148 -0
- flwr/common/config.py +10 -0
- flwr/common/constant.py +21 -7
- flwr/common/date.py +18 -0
- flwr/common/logger.py +6 -2
- flwr/common/object_ref.py +47 -16
- flwr/common/serde.py +10 -0
- flwr/common/typing.py +32 -11
- flwr/proto/exec_pb2.py +23 -17
- flwr/proto/exec_pb2.pyi +50 -20
- flwr/proto/exec_pb2_grpc.py +34 -0
- flwr/proto/exec_pb2_grpc.pyi +13 -0
- flwr/proto/run_pb2.py +32 -27
- flwr/proto/run_pb2.pyi +44 -1
- flwr/proto/simulationio_pb2.py +2 -2
- flwr/proto/simulationio_pb2_grpc.py +34 -0
- flwr/proto/simulationio_pb2_grpc.pyi +13 -0
- flwr/server/app.py +83 -87
- flwr/server/driver/driver.py +1 -1
- flwr/server/driver/grpc_driver.py +6 -20
- flwr/server/driver/inmemory_driver.py +1 -3
- flwr/server/run_serverapp.py +8 -238
- flwr/server/serverapp/app.py +44 -89
- flwr/server/strategy/aggregate.py +4 -4
- flwr/server/superlink/fleet/rest_rere/rest_api.py +10 -9
- flwr/server/superlink/linkstate/in_memory_linkstate.py +76 -62
- flwr/server/superlink/linkstate/linkstate.py +24 -9
- flwr/server/superlink/linkstate/sqlite_linkstate.py +87 -128
- flwr/server/superlink/linkstate/utils.py +191 -32
- flwr/server/superlink/simulation/simulationio_servicer.py +22 -1
- flwr/simulation/__init__.py +3 -1
- flwr/simulation/app.py +245 -352
- flwr/simulation/legacy_app.py +402 -0
- flwr/simulation/run_simulation.py +8 -19
- flwr/simulation/simulationio_connection.py +2 -2
- flwr/superexec/deployment.py +13 -7
- flwr/superexec/exec_servicer.py +32 -3
- flwr/superexec/executor.py +4 -3
- flwr/superexec/simulation.py +52 -145
- {flwr_nightly-1.13.0.dev20241106.dist-info → flwr_nightly-1.13.0.dev20241117.dist-info}/METADATA +10 -7
- {flwr_nightly-1.13.0.dev20241106.dist-info → flwr_nightly-1.13.0.dev20241117.dist-info}/RECORD +58 -51
- {flwr_nightly-1.13.0.dev20241106.dist-info → flwr_nightly-1.13.0.dev20241117.dist-info}/entry_points.txt +1 -0
- {flwr_nightly-1.13.0.dev20241106.dist-info → flwr_nightly-1.13.0.dev20241117.dist-info}/LICENSE +0 -0
- {flwr_nightly-1.13.0.dev20241106.dist-info → flwr_nightly-1.13.0.dev20241117.dist-info}/WHEEL +0 -0
|
@@ -0,0 +1,402 @@
|
|
|
1
|
+
# Copyright 2024 Flower Labs GmbH. All Rights Reserved.
|
|
2
|
+
#
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
#
|
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
+
#
|
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
|
+
# See the License for the specific language governing permissions and
|
|
13
|
+
# limitations under the License.
|
|
14
|
+
# ==============================================================================
|
|
15
|
+
"""Flower simulation app."""
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
import asyncio
|
|
19
|
+
import logging
|
|
20
|
+
import sys
|
|
21
|
+
import threading
|
|
22
|
+
import traceback
|
|
23
|
+
import warnings
|
|
24
|
+
from logging import ERROR, INFO
|
|
25
|
+
from typing import Any, Optional, Union
|
|
26
|
+
|
|
27
|
+
import ray
|
|
28
|
+
from ray.util.scheduling_strategies import NodeAffinitySchedulingStrategy
|
|
29
|
+
|
|
30
|
+
from flwr.client import ClientFnExt
|
|
31
|
+
from flwr.common import EventType, event
|
|
32
|
+
from flwr.common.constant import NODE_ID_NUM_BYTES
|
|
33
|
+
from flwr.common.logger import (
|
|
34
|
+
log,
|
|
35
|
+
set_logger_propagation,
|
|
36
|
+
warn_deprecated_feature,
|
|
37
|
+
warn_unsupported_feature,
|
|
38
|
+
)
|
|
39
|
+
from flwr.server.client_manager import ClientManager
|
|
40
|
+
from flwr.server.history import History
|
|
41
|
+
from flwr.server.server import Server, init_defaults, run_fl
|
|
42
|
+
from flwr.server.server_config import ServerConfig
|
|
43
|
+
from flwr.server.strategy import Strategy
|
|
44
|
+
from flwr.server.superlink.linkstate.utils import generate_rand_int_from_bytes
|
|
45
|
+
from flwr.simulation.ray_transport.ray_actor import (
|
|
46
|
+
ClientAppActor,
|
|
47
|
+
VirtualClientEngineActor,
|
|
48
|
+
VirtualClientEngineActorPool,
|
|
49
|
+
pool_size_from_resources,
|
|
50
|
+
)
|
|
51
|
+
from flwr.simulation.ray_transport.ray_client_proxy import RayActorClientProxy
|
|
52
|
+
|
|
53
|
+
INVALID_ARGUMENTS_START_SIMULATION = """
|
|
54
|
+
INVALID ARGUMENTS ERROR
|
|
55
|
+
|
|
56
|
+
Invalid Arguments in method:
|
|
57
|
+
|
|
58
|
+
`start_simulation(
|
|
59
|
+
*,
|
|
60
|
+
client_fn: ClientFn,
|
|
61
|
+
num_clients: int,
|
|
62
|
+
clients_ids: Optional[List[str]] = None,
|
|
63
|
+
client_resources: Optional[Dict[str, float]] = None,
|
|
64
|
+
server: Optional[Server] = None,
|
|
65
|
+
config: ServerConfig = None,
|
|
66
|
+
strategy: Optional[Strategy] = None,
|
|
67
|
+
client_manager: Optional[ClientManager] = None,
|
|
68
|
+
ray_init_args: Optional[Dict[str, Any]] = None,
|
|
69
|
+
) -> None:`
|
|
70
|
+
|
|
71
|
+
REASON:
|
|
72
|
+
Method requires:
|
|
73
|
+
- Either `num_clients`[int] or `clients_ids`[List[str]]
|
|
74
|
+
to be set exclusively.
|
|
75
|
+
OR
|
|
76
|
+
- `len(clients_ids)` == `num_clients`
|
|
77
|
+
|
|
78
|
+
"""
|
|
79
|
+
|
|
80
|
+
NodeToPartitionMapping = dict[int, int]
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
def _create_node_id_to_partition_mapping(
    num_clients: int,
) -> NodeToPartitionMapping:
    """Build a mapping from randomly drawn node IDs to partition IDs.

    Partition IDs are the integers ``0 .. num_clients - 1``. Each one is paired
    with a node ID obtained from ``generate_rand_int_from_bytes``; a draw that
    collides with an already assigned node ID is discarded and repeated.
    """
    assigned: NodeToPartitionMapping = {}  # {node-id: partition-id}
    for partition_id in range(num_clients):
        candidate = generate_rand_int_from_bytes(NODE_ID_NUM_BYTES)
        # Redraw until the node ID is not already a key of the mapping
        while candidate in assigned:
            candidate = generate_rand_int_from_bytes(NODE_ID_NUM_BYTES)
        assigned[candidate] = partition_id
    return assigned
|
|
95
|
+
|
|
96
|
+
|
|
97
|
+
# pylint: disable=too-many-arguments,too-many-statements,too-many-branches
|
|
98
|
+
def start_simulation(
    *,
    client_fn: ClientFnExt,
    num_clients: int,
    clients_ids: Optional[list[str]] = None,  # UNSUPPORTED, WILL BE REMOVED
    client_resources: Optional[dict[str, float]] = None,
    server: Optional[Server] = None,
    config: Optional[ServerConfig] = None,
    strategy: Optional[Strategy] = None,
    client_manager: Optional[ClientManager] = None,
    ray_init_args: Optional[dict[str, Any]] = None,
    keep_initialised: Optional[bool] = False,
    actor_type: type[VirtualClientEngineActor] = ClientAppActor,
    actor_kwargs: Optional[dict[str, Any]] = None,
    actor_scheduling: Union[str, NodeAffinitySchedulingStrategy] = "DEFAULT",
) -> History:
    """Start a Ray-based Flower simulation server.

    Warning
    -------
    This function is deprecated since 1.13.0. Use :code:`flwr run` to start a Flower
    simulation.

    Parameters
    ----------
    client_fn : ClientFnExt
        A function creating `Client` instances. The function must have the signature
        `client_fn(context: Context)`. It should return
        a single client instance of type `Client`. Note that the created client
        instances are ephemeral and will often be destroyed after a single method
        invocation. Since client instances are not long-lived, they should not attempt
        to carry state over method invocations. Any state required by the instance
        (model, dataset, hyperparameters, ...) should be (re-)created in either the
        call to `client_fn` or the call to any of the client methods (e.g., load
        evaluation data in the `evaluate` method itself).
    num_clients : int
        The total number of clients in this simulation.
    clients_ids : Optional[List[str]]
        UNSUPPORTED, WILL BE REMOVED. USE `num_clients` INSTEAD.
        Passing any value other than `None` logs an error and terminates the
        process via `sys.exit(1)`.
    client_resources : Optional[Dict[str, float]] (default: `{"num_cpus": 1, "num_gpus": 0.0}`)
        CPU and GPU resources for a single client. Supported keys
        are `num_cpus` and `num_gpus`. To understand the GPU utilization caused by
        `num_gpus`, as well as using custom resources, please consult the Ray
        documentation. If `num_cpus` is missing, `num_cpus=1` is used; the
        dictionary passed by the caller is not modified.
    server : Optional[flwr.server.Server] (default: None).
        An implementation of the abstract base class `flwr.server.Server`. If no
        instance is provided, then `start_server` will create one.
    config: ServerConfig (default: None).
        Currently supported values are `num_rounds` (int, default: 1) and
        `round_timeout` in seconds (float, default: None).
    strategy : Optional[flwr.server.Strategy] (default: None)
        An implementation of the abstract base class `flwr.server.Strategy`. If
        no strategy is provided, then `start_server` will use
        `flwr.server.strategy.FedAvg`.
    client_manager : Optional[flwr.server.ClientManager] (default: None)
        An implementation of the abstract base class `flwr.server.ClientManager`.
        If no implementation is provided, then `start_simulation` will use
        `flwr.server.client_manager.SimpleClientManager`.
    ray_init_args : Optional[Dict[str, Any]] (default: None)
        Optional dictionary containing arguments for the call to `ray.init`.
        If ray_init_args is None (the default), Ray will be initialized with
        the following default args:

        { "ignore_reinit_error": True, "include_dashboard": False }

        An empty dictionary can be used (ray_init_args={}) to prevent any
        arguments from being passed to ray.init.
    keep_initialised: Optional[bool] (default: False)
        Set to True to prevent `ray.shutdown()` in case `ray.is_initialized()=True`.

    actor_type: VirtualClientEngineActor (default: ClientAppActor)
        Optionally specify the type of actor to use. The actor object, which
        persists throughout the simulation, will be the process in charge of
        executing a ClientApp wrapping input argument `client_fn`.

    actor_kwargs: Optional[Dict[str, Any]] (default: None)
        If you want to create your own Actor classes, you might need to pass
        some input argument. You can use this dictionary for such purpose.

    actor_scheduling: Optional[Union[str, NodeAffinitySchedulingStrategy]]
        (default: "DEFAULT")
        Optional string ("DEFAULT" or "SPREAD") for the VCE to choose in which
        node the actor is placed. If you are an advanced user needing more control
        you can use lower-level scheduling strategies to pin actors to specific
        compute nodes (e.g. via NodeAffinitySchedulingStrategy). Please note this
        is an advanced feature. For all details, please refer to the Ray documentation:
        https://docs.ray.io/en/latest/ray-core/scheduling/index.html

    Returns
    -------
    hist : flwr.server.history.History
        Object containing metrics from training.

    Raises
    ------
    RuntimeError
        If the call to `run_fl` raises; the original exception is chained as
        the cause.
    """  # noqa: E501
    # pylint: disable-msg=too-many-locals
    msg = (
        "flwr.simulation.start_simulation() is deprecated."
        "\n\tInstead, use the `flwr run` CLI command to start a local simulation "
        "in your Flower app, as shown for example below:"
        "\n\n\t\t$ flwr new # Create a new Flower app from a template"
        "\n\n\t\t$ flwr run # Run the Flower app in Simulation Mode"
        "\n\n\tUsing `start_simulation()` is deprecated."
    )
    warn_deprecated_feature(name=msg)

    event(
        EventType.START_SIMULATION_ENTER,
        {"num_clients": len(clients_ids) if clients_ids is not None else num_clients},
    )

    if clients_ids is not None:
        warn_unsupported_feature(
            "Passing `clients_ids` to `start_simulation` is deprecated and not longer "
            "used by `start_simulation`. Use `num_clients` exclusively instead."
        )
        log(ERROR, "`clients_ids` argument used.")
        # This is an error path: exit with a non-zero status so that calling
        # scripts and CI do not mistake it for a successful run.
        sys.exit(1)

    # Set logger propagation
    loop: Optional[asyncio.AbstractEventLoop] = None
    try:
        loop = asyncio.get_running_loop()
    except RuntimeError:
        # No event loop is running in this thread
        loop = None
    if loop and loop.is_running():
        # Set logger propagation to False to prevent duplicated log output in Colab.
        logger = logging.getLogger("flwr")
        _ = set_logger_propagation(logger, False)

    # Initialize server and server config
    initialized_server, initialized_config = init_defaults(
        server=server,
        config=config,
        strategy=strategy,
        client_manager=client_manager,
    )

    log(
        INFO,
        "Starting Flower simulation, config: %s",
        initialized_config,
    )

    # Create node-id to partition-id mapping
    nodes_mapping = _create_node_id_to_partition_mapping(num_clients)

    # Default arguments for Ray initialization. Only `None` selects the
    # defaults: an explicit empty dict must reach `ray.init` untouched, as
    # documented above.
    if ray_init_args is None:
        ray_init_args = {
            "ignore_reinit_error": True,
            "include_dashboard": False,
        }

    # Shut down Ray if it has already been initialized (unless asked not to)
    if ray.is_initialized() and not keep_initialised:
        ray.shutdown()

    # Initialize Ray
    ray.init(**ray_init_args)
    cluster_resources = ray.cluster_resources()
    log(
        INFO,
        "Flower VCE: Ray initialized with resources: %s",
        cluster_resources,
    )

    log(
        INFO,
        "Optimize your simulation with Flower VCE: "
        "https://flower.ai/docs/framework/how-to-run-simulations.html",
    )

    # Log the resources that a single client will be able to use
    if client_resources is None:
        log(
            INFO,
            "No `client_resources` specified. Using minimal resources for clients.",
        )
        client_resources = {"num_cpus": 1, "num_gpus": 0.0}

    # Each client needs at the very least one CPU
    if "num_cpus" not in client_resources:
        warnings.warn(
            "No `num_cpus` specified in `client_resources`. "
            "Using `num_cpus=1` for each client.",
            stacklevel=2,
        )
        # Work on a copy so the dictionary owned by the caller is not mutated
        client_resources = {**client_resources, "num_cpus": 1}

    log(
        INFO,
        "Flower VCE: Resources for each Virtual Client: %s",
        client_resources,
    )

    actor_args = {} if actor_kwargs is None else actor_kwargs

    # An actor factory. This is called N times to add N actors
    # to the pool. If at some point the pool can accommodate more actors
    # this will be called again.
    def create_actor_fn() -> type[VirtualClientEngineActor]:
        return actor_type.options(  # type: ignore
            **client_resources,
            scheduling_strategy=actor_scheduling,
        ).remote(**actor_args)

    # Instantiate ActorPool
    pool = VirtualClientEngineActorPool(
        create_actor_fn=create_actor_fn,
        client_resources=client_resources,
    )

    f_stop = threading.Event()

    # Periodically, check if the cluster has grown (i.e. a new
    # node has been added). If this happens, we likely want to grow
    # the actor pool by adding more Actors to it.
    def update_resources(f_stop: threading.Event) -> None:
        """Periodically check if more actors can be added to the pool.

        If so, extend the pool.
        """
        if not f_stop.is_set():
            num_max_actors = pool_size_from_resources(client_resources)
            if num_max_actors > pool.num_actors:
                num_new = num_max_actors - pool.num_actors
                log(
                    INFO, "The cluster expanded. Adding %s actors to the pool.", num_new
                )
                pool.add_actors_to_pool(num_actors=num_new)

            timer = threading.Timer(10, update_resources, [f_stop])
            # Daemon timer: a pending check must never keep the interpreter
            # alive, neither for the last 10 seconds after the simulation ends
            # nor forever if setup fails before the stop flag gets set.
            timer.daemon = True
            timer.start()

    update_resources(f_stop)

    log(
        INFO,
        "Flower VCE: Creating %s with %s actors",
        pool.__class__.__name__,
        pool.num_actors,
    )

    # Register one RayClientProxy object for each client with the ClientManager
    for node_id, partition_id in nodes_mapping.items():
        client_proxy = RayActorClientProxy(
            client_fn=client_fn,
            node_id=node_id,
            partition_id=partition_id,
            num_partitions=num_clients,
            actor_pool=pool,
        )
        initialized_server.client_manager().register(client=client_proxy)

    hist = History()
    # pylint: disable=broad-except
    try:
        # Start training
        hist = run_fl(
            server=initialized_server,
            config=initialized_config,
        )
    except Exception as ex:
        log(ERROR, ex)
        log(ERROR, traceback.format_exc())
        log(
            ERROR,
            "Your simulation crashed :(. This could be because of several reasons. "
            "The most common are: "
            "\n\t > Sometimes, issues in the simulation code itself can cause crashes. "
            "It's always a good idea to double-check your code for any potential bugs "
            "or inconsistencies that might be contributing to the problem. "
            "For example: "
            "\n\t\t - You might be using a class attribute in your clients that "
            "hasn't been defined."
            "\n\t\t - There could be an incorrect method call to a 3rd party library "
            "(e.g., PyTorch)."
            "\n\t\t - The return types of methods in your clients/strategies might be "
            "incorrect."
            "\n\t > Your system couldn't fit a single VirtualClient: try lowering "
            "`client_resources`."
            "\n\t > All the actors in your pool crashed. This could be because: "
            "\n\t\t - You clients hit an out-of-memory (OOM) error and actors couldn't "
            "recover from it. Try launching your simulation with more generous "
            "`client_resources` setting (i.e. it seems %s is "
            "not enough for your run). Use fewer concurrent actors. "
            "\n\t\t - You were running a multi-node simulation and all worker nodes "
            "disconnected. The head node might still be alive but cannot accommodate "
            "any actor with resources: %s."
            "\nTake a look at the Flower simulation examples for guidance "
            "<https://flower.ai/docs/framework/how-to-run-simulations.html>.",
            client_resources,
            client_resources,
        )
        raise RuntimeError("Simulation crashed.") from ex

    finally:
        # Stop time monitoring resources in cluster
        f_stop.set()
        event(EventType.START_SIMULATION_LEAVE)

    return hist
|
|
@@ -123,13 +123,8 @@ def run_simulation_from_cli() -> None:
|
|
|
123
123
|
fused_config = get_fused_config_from_dir(app_path, override_config)
|
|
124
124
|
|
|
125
125
|
# Create run
|
|
126
|
-
run = Run(
|
|
127
|
-
|
|
128
|
-
fab_id="",
|
|
129
|
-
fab_version="",
|
|
130
|
-
fab_hash="",
|
|
131
|
-
override_config=override_config,
|
|
132
|
-
)
|
|
126
|
+
run = Run.create_empty(run_id)
|
|
127
|
+
run.override_config = override_config
|
|
133
128
|
|
|
134
129
|
_run_simulation(
|
|
135
130
|
server_app_attr=server_app_attr,
|
|
@@ -333,21 +328,17 @@ def _main_loop(
|
|
|
333
328
|
try:
|
|
334
329
|
# Register run
|
|
335
330
|
log(DEBUG, "Pre-registering run with id %s", run.run_id)
|
|
336
|
-
|
|
337
|
-
|
|
338
|
-
|
|
339
|
-
|
|
340
|
-
starting_at=now().isoformat(),
|
|
341
|
-
running_at=now().isoformat(),
|
|
342
|
-
finished_at="",
|
|
343
|
-
)
|
|
331
|
+
run.status = RunStatus(Status.RUNNING, "", "")
|
|
332
|
+
run.starting_at = now().isoformat()
|
|
333
|
+
run.running_at = run.starting_at
|
|
334
|
+
state_factory.state().run_ids[run.run_id] = RunRecord(run=run) # type: ignore
|
|
344
335
|
|
|
345
336
|
if server_app_run_config is None:
|
|
346
337
|
server_app_run_config = {}
|
|
347
338
|
|
|
348
339
|
# Initialize Driver
|
|
349
340
|
driver = InMemoryDriver(state_factory=state_factory)
|
|
350
|
-
driver.
|
|
341
|
+
driver.set_run(run_id=run.run_id)
|
|
351
342
|
|
|
352
343
|
# Get and run ServerApp thread
|
|
353
344
|
serverapp_th = run_serverapp_th(
|
|
@@ -457,9 +448,7 @@ def _run_simulation(
|
|
|
457
448
|
# If no `Run` object is set, create one
|
|
458
449
|
if run is None:
|
|
459
450
|
run_id = generate_rand_int_from_bytes(RUN_ID_NUM_BYTES)
|
|
460
|
-
run = Run(
|
|
461
|
-
run_id=run_id, fab_id="", fab_version="", fab_hash="", override_config={}
|
|
462
|
-
)
|
|
451
|
+
run = Run.create_empty(run_id=run_id)
|
|
463
452
|
|
|
464
453
|
args = (
|
|
465
454
|
num_supernodes,
|
|
@@ -20,7 +20,7 @@ from typing import Optional, cast
|
|
|
20
20
|
|
|
21
21
|
import grpc
|
|
22
22
|
|
|
23
|
-
from flwr.common.constant import
|
|
23
|
+
from flwr.common.constant import SIMULATIONIO_API_DEFAULT_CLIENT_ADDRESS
|
|
24
24
|
from flwr.common.grpc import create_channel
|
|
25
25
|
from flwr.common.logger import log
|
|
26
26
|
from flwr.proto.simulationio_pb2_grpc import SimulationIoStub # pylint: disable=E0611
|
|
@@ -41,7 +41,7 @@ class SimulationIoConnection:
|
|
|
41
41
|
|
|
42
42
|
def __init__( # pylint: disable=too-many-arguments
|
|
43
43
|
self,
|
|
44
|
-
simulationio_service_address: str =
|
|
44
|
+
simulationio_service_address: str = SIMULATIONIO_API_DEFAULT_CLIENT_ADDRESS,
|
|
45
45
|
root_certificates: Optional[bytes] = None,
|
|
46
46
|
) -> None:
|
|
47
47
|
self._addr = simulationio_service_address
|
flwr/superexec/deployment.py
CHANGED
|
@@ -21,8 +21,13 @@ from typing import Optional
|
|
|
21
21
|
|
|
22
22
|
from typing_extensions import override
|
|
23
23
|
|
|
24
|
+
from flwr.cli.config_utils import get_fab_metadata
|
|
24
25
|
from flwr.common import ConfigsRecord, Context, RecordSet
|
|
25
|
-
from flwr.common.constant import
|
|
26
|
+
from flwr.common.constant import (
|
|
27
|
+
SERVERAPPIO_API_DEFAULT_CLIENT_ADDRESS,
|
|
28
|
+
Status,
|
|
29
|
+
SubStatus,
|
|
30
|
+
)
|
|
26
31
|
from flwr.common.logger import log
|
|
27
32
|
from flwr.common.typing import Fab, RunStatus, UserConfig
|
|
28
33
|
from flwr.server.superlink.ffs import Ffs
|
|
@@ -37,7 +42,7 @@ class DeploymentEngine(Executor):
|
|
|
37
42
|
|
|
38
43
|
Parameters
|
|
39
44
|
----------
|
|
40
|
-
|
|
45
|
+
serverappio_api_address: str (default: "127.0.0.1:9091")
|
|
41
46
|
Address of the SuperLink to connect to.
|
|
42
47
|
root_certificates: Optional[str] (default: None)
|
|
43
48
|
Specifies the path to the PEM-encoded root certificate file for
|
|
@@ -48,11 +53,11 @@ class DeploymentEngine(Executor):
|
|
|
48
53
|
|
|
49
54
|
def __init__(
|
|
50
55
|
self,
|
|
51
|
-
|
|
56
|
+
serverappio_api_address: str = SERVERAPPIO_API_DEFAULT_CLIENT_ADDRESS,
|
|
52
57
|
root_certificates: Optional[str] = None,
|
|
53
58
|
flwr_dir: Optional[str] = None,
|
|
54
59
|
) -> None:
|
|
55
|
-
self.
|
|
60
|
+
self.serverappio_api_address = serverappio_api_address
|
|
56
61
|
if root_certificates is None:
|
|
57
62
|
self.root_certificates = None
|
|
58
63
|
self.root_certificates_bytes = None
|
|
@@ -109,7 +114,7 @@ class DeploymentEngine(Executor):
|
|
|
109
114
|
if superlink_address := config.get("superlink"):
|
|
110
115
|
if not isinstance(superlink_address, str):
|
|
111
116
|
raise ValueError("The `superlink` value should be of type `str`.")
|
|
112
|
-
self.
|
|
117
|
+
self.serverappio_api_address = superlink_address
|
|
113
118
|
if root_certificates := config.get("root-certificates"):
|
|
114
119
|
if not isinstance(root_certificates, str):
|
|
115
120
|
raise ValueError(
|
|
@@ -132,9 +137,10 @@ class DeploymentEngine(Executor):
|
|
|
132
137
|
raise RuntimeError(
|
|
133
138
|
f"FAB ({fab.hash_str}) hash from request doesn't match contents"
|
|
134
139
|
)
|
|
140
|
+
fab_id, fab_version = get_fab_metadata(fab.content)
|
|
135
141
|
|
|
136
142
|
run_id = self.linkstate.create_run(
|
|
137
|
-
|
|
143
|
+
fab_id, fab_version, fab_hash, override_config, ConfigsRecord()
|
|
138
144
|
)
|
|
139
145
|
return run_id
|
|
140
146
|
|
|
@@ -153,7 +159,7 @@ class DeploymentEngine(Executor):
|
|
|
153
159
|
self,
|
|
154
160
|
fab_file: bytes,
|
|
155
161
|
override_config: UserConfig,
|
|
156
|
-
|
|
162
|
+
federation_options: ConfigsRecord,
|
|
157
163
|
) -> Optional[int]:
|
|
158
164
|
"""Start run using the Flower Deployment Engine."""
|
|
159
165
|
run_id = None
|
flwr/superexec/exec_servicer.py
CHANGED
|
@@ -22,18 +22,25 @@ from typing import Any
|
|
|
22
22
|
|
|
23
23
|
import grpc
|
|
24
24
|
|
|
25
|
+
from flwr.common import now
|
|
25
26
|
from flwr.common.constant import LOG_STREAM_INTERVAL, Status
|
|
26
27
|
from flwr.common.logger import log
|
|
27
|
-
from flwr.common.serde import
|
|
28
|
+
from flwr.common.serde import (
|
|
29
|
+
configs_record_from_proto,
|
|
30
|
+
run_to_proto,
|
|
31
|
+
user_config_from_proto,
|
|
32
|
+
)
|
|
28
33
|
from flwr.proto import exec_pb2_grpc # pylint: disable=E0611
|
|
29
34
|
from flwr.proto.exec_pb2 import ( # pylint: disable=E0611
|
|
35
|
+
ListRunsRequest,
|
|
36
|
+
ListRunsResponse,
|
|
30
37
|
StartRunRequest,
|
|
31
38
|
StartRunResponse,
|
|
32
39
|
StreamLogsRequest,
|
|
33
40
|
StreamLogsResponse,
|
|
34
41
|
)
|
|
35
42
|
from flwr.server.superlink.ffs.ffs_factory import FfsFactory
|
|
36
|
-
from flwr.server.superlink.linkstate import LinkStateFactory
|
|
43
|
+
from flwr.server.superlink.linkstate import LinkState, LinkStateFactory
|
|
37
44
|
|
|
38
45
|
from .executor import Executor
|
|
39
46
|
|
|
@@ -61,7 +68,7 @@ class ExecServicer(exec_pb2_grpc.ExecServicer):
|
|
|
61
68
|
run_id = self.executor.start_run(
|
|
62
69
|
request.fab.content,
|
|
63
70
|
user_config_from_proto(request.override_config),
|
|
64
|
-
|
|
71
|
+
configs_record_from_proto(request.federation_options),
|
|
65
72
|
)
|
|
66
73
|
|
|
67
74
|
if run_id is None:
|
|
@@ -105,3 +112,25 @@ class ExecServicer(exec_pb2_grpc.ExecServicer):
|
|
|
105
112
|
context.cancel()
|
|
106
113
|
|
|
107
114
|
time.sleep(LOG_STREAM_INTERVAL) # Sleep briefly to avoid busy waiting
|
|
115
|
+
|
|
116
|
+
def ListRuns(
    self, request: ListRunsRequest, context: grpc.ServicerContext
) -> ListRunsResponse:
    """Serve the `flwr ls` command.

    When the request carries no `run_id` field (`flwr ls --runs`), every run
    ID reported by the link state is listed. Otherwise
    (`flwr ls --run-id <run_id>`) only the requested run is looked up.
    """
    log(INFO, "ExecServicer.List")
    link_state = self.linkstate_factory.state()

    if request.HasField("run_id"):
        # Handle `flwr ls --run-id <run_id>`
        selected_ids = {request.run_id}
    else:
        # Handle `flwr ls --runs`
        selected_ids = link_state.get_run_ids()
    return _create_list_runs_response(selected_ids, link_state)
|
|
128
|
+
|
|
129
|
+
|
|
130
|
+
def _create_list_runs_response(run_ids: set[int], state: LinkState) -> ListRunsResponse:
    """Assemble the reply for `flwr ls --runs` and `flwr ls --run-id <run_id>`.

    Every ID in `run_ids` is looked up with `state.get_run`. IDs whose lookup
    yields a falsy result are left out of the response. The response also
    carries the current time as an ISO-formatted string.
    """
    # Fetch every run first, then serialize, mirroring the lookup order
    fetched = {run_id: state.get_run(run_id) for run_id in run_ids}

    serialized = {}
    for run_id, run in fetched.items():
        if run:
            serialized[run_id] = run_to_proto(run)

    timestamp = now().isoformat()
    return ListRunsResponse(run_dict=serialized, now=timestamp)
|
flwr/superexec/executor.py
CHANGED
|
@@ -19,6 +19,7 @@ from dataclasses import dataclass, field
|
|
|
19
19
|
from subprocess import Popen
|
|
20
20
|
from typing import Optional
|
|
21
21
|
|
|
22
|
+
from flwr.common import ConfigsRecord
|
|
22
23
|
from flwr.common.typing import UserConfig
|
|
23
24
|
from flwr.server.superlink.ffs.ffs_factory import FfsFactory
|
|
24
25
|
from flwr.server.superlink.linkstate import LinkStateFactory
|
|
@@ -71,7 +72,7 @@ class Executor(ABC):
|
|
|
71
72
|
self,
|
|
72
73
|
fab_file: bytes,
|
|
73
74
|
override_config: UserConfig,
|
|
74
|
-
|
|
75
|
+
federation_options: ConfigsRecord,
|
|
75
76
|
) -> Optional[int]:
|
|
76
77
|
"""Start a run using the given Flower FAB ID and version.
|
|
77
78
|
|
|
@@ -84,8 +85,8 @@ class Executor(ABC):
|
|
|
84
85
|
The Flower App Bundle file bytes.
|
|
85
86
|
override_config: UserConfig
|
|
86
87
|
The config overrides dict sent by the user (using `flwr run`).
|
|
87
|
-
|
|
88
|
-
The federation options
|
|
88
|
+
federation_options: ConfigsRecord
|
|
89
|
+
The federation options sent by the user (using `flwr run`).
|
|
89
90
|
|
|
90
91
|
Returns
|
|
91
92
|
-------
|