flwr 1.12.0__py3-none-any.whl → 1.13.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- flwr/cli/app.py +2 -0
- flwr/cli/build.py +2 -2
- flwr/cli/config_utils.py +97 -0
- flwr/cli/install.py +0 -16
- flwr/cli/log.py +63 -97
- flwr/cli/ls.py +228 -0
- flwr/cli/new/new.py +23 -13
- flwr/cli/new/templates/app/README.md.tpl +11 -0
- flwr/cli/new/templates/app/code/flwr_tune/dataset.py.tpl +1 -1
- flwr/cli/new/templates/app/pyproject.baseline.toml.tpl +1 -1
- flwr/cli/new/templates/app/pyproject.flowertune.toml.tpl +2 -1
- flwr/cli/new/templates/app/pyproject.huggingface.toml.tpl +1 -1
- flwr/cli/new/templates/app/pyproject.jax.toml.tpl +1 -1
- flwr/cli/new/templates/app/pyproject.mlx.toml.tpl +1 -1
- flwr/cli/new/templates/app/pyproject.numpy.toml.tpl +1 -1
- flwr/cli/new/templates/app/pyproject.pytorch.toml.tpl +1 -1
- flwr/cli/new/templates/app/pyproject.sklearn.toml.tpl +1 -1
- flwr/cli/new/templates/app/pyproject.tensorflow.toml.tpl +2 -2
- flwr/cli/run/run.py +37 -89
- flwr/client/app.py +73 -34
- flwr/client/clientapp/app.py +58 -37
- flwr/client/grpc_rere_client/connection.py +7 -12
- flwr/client/nodestate/__init__.py +25 -0
- flwr/client/nodestate/in_memory_nodestate.py +38 -0
- flwr/client/nodestate/nodestate.py +30 -0
- flwr/client/nodestate/nodestate_factory.py +37 -0
- flwr/client/rest_client/connection.py +4 -14
- flwr/client/{node_state.py → run_info_store.py} +4 -3
- flwr/client/supernode/app.py +34 -58
- flwr/common/args.py +152 -0
- flwr/common/config.py +10 -0
- flwr/common/constant.py +59 -7
- flwr/common/context.py +9 -4
- flwr/common/date.py +21 -3
- flwr/common/grpc.py +4 -1
- flwr/common/logger.py +108 -1
- flwr/common/object_ref.py +47 -16
- flwr/common/serde.py +34 -0
- flwr/common/telemetry.py +0 -6
- flwr/common/typing.py +32 -2
- flwr/proto/exec_pb2.py +23 -17
- flwr/proto/exec_pb2.pyi +58 -22
- flwr/proto/exec_pb2_grpc.py +34 -0
- flwr/proto/exec_pb2_grpc.pyi +13 -0
- flwr/proto/log_pb2.py +29 -0
- flwr/proto/log_pb2.pyi +39 -0
- flwr/proto/log_pb2_grpc.py +4 -0
- flwr/proto/log_pb2_grpc.pyi +4 -0
- flwr/proto/message_pb2.py +8 -8
- flwr/proto/message_pb2.pyi +4 -1
- flwr/proto/run_pb2.py +32 -27
- flwr/proto/run_pb2.pyi +44 -1
- flwr/proto/serverappio_pb2.py +52 -0
- flwr/proto/{driver_pb2.pyi → serverappio_pb2.pyi} +54 -0
- flwr/proto/serverappio_pb2_grpc.py +376 -0
- flwr/proto/serverappio_pb2_grpc.pyi +147 -0
- flwr/proto/simulationio_pb2.py +38 -0
- flwr/proto/simulationio_pb2.pyi +65 -0
- flwr/proto/simulationio_pb2_grpc.py +205 -0
- flwr/proto/simulationio_pb2_grpc.pyi +81 -0
- flwr/server/app.py +297 -162
- flwr/server/driver/driver.py +15 -1
- flwr/server/driver/grpc_driver.py +89 -50
- flwr/server/driver/inmemory_driver.py +6 -16
- flwr/server/run_serverapp.py +11 -235
- flwr/server/{superlink/state → serverapp}/__init__.py +3 -9
- flwr/server/serverapp/app.py +234 -0
- flwr/server/strategy/aggregate.py +4 -4
- flwr/server/strategy/fedadam.py +11 -1
- flwr/server/superlink/driver/__init__.py +1 -1
- flwr/server/superlink/driver/{driver_grpc.py → serverappio_grpc.py} +19 -16
- flwr/server/superlink/driver/{driver_servicer.py → serverappio_servicer.py} +125 -39
- flwr/server/superlink/fleet/grpc_adapter/grpc_adapter_servicer.py +4 -2
- flwr/server/superlink/fleet/grpc_bidi/grpc_server.py +2 -2
- flwr/server/superlink/fleet/grpc_rere/fleet_servicer.py +4 -2
- flwr/server/superlink/fleet/grpc_rere/server_interceptor.py +2 -2
- flwr/server/superlink/fleet/message_handler/message_handler.py +7 -7
- flwr/server/superlink/fleet/rest_rere/rest_api.py +10 -9
- flwr/server/superlink/fleet/vce/vce_api.py +23 -23
- flwr/server/superlink/linkstate/__init__.py +28 -0
- flwr/server/superlink/{state/in_memory_state.py → linkstate/in_memory_linkstate.py} +237 -64
- flwr/server/superlink/{state/state.py → linkstate/linkstate.py} +166 -22
- flwr/server/superlink/{state/state_factory.py → linkstate/linkstate_factory.py} +9 -9
- flwr/server/superlink/{state/sqlite_state.py → linkstate/sqlite_linkstate.py} +383 -174
- flwr/server/superlink/linkstate/utils.py +389 -0
- flwr/server/superlink/simulation/__init__.py +15 -0
- flwr/server/superlink/simulation/simulationio_grpc.py +65 -0
- flwr/server/superlink/simulation/simulationio_servicer.py +153 -0
- flwr/simulation/__init__.py +5 -1
- flwr/simulation/app.py +236 -347
- flwr/simulation/legacy_app.py +402 -0
- flwr/simulation/ray_transport/ray_client_proxy.py +2 -2
- flwr/simulation/run_simulation.py +56 -141
- flwr/simulation/simulationio_connection.py +86 -0
- flwr/superexec/app.py +6 -134
- flwr/superexec/deployment.py +70 -69
- flwr/superexec/exec_grpc.py +15 -8
- flwr/superexec/exec_servicer.py +65 -65
- flwr/superexec/executor.py +26 -7
- flwr/superexec/simulation.py +62 -150
- {flwr-1.12.0.dist-info → flwr-1.13.1.dist-info}/METADATA +9 -7
- {flwr-1.12.0.dist-info → flwr-1.13.1.dist-info}/RECORD +105 -85
- {flwr-1.12.0.dist-info → flwr-1.13.1.dist-info}/entry_points.txt +2 -0
- flwr/client/node_state_tests.py +0 -66
- flwr/proto/driver_pb2.py +0 -42
- flwr/proto/driver_pb2_grpc.py +0 -239
- flwr/proto/driver_pb2_grpc.pyi +0 -94
- flwr/server/superlink/state/utils.py +0 -148
- {flwr-1.12.0.dist-info → flwr-1.13.1.dist-info}/LICENSE +0 -0
- {flwr-1.12.0.dist-info → flwr-1.13.1.dist-info}/WHEEL +0 -0
flwr/simulation/app.py
CHANGED
|
@@ -12,371 +12,260 @@
|
|
|
12
12
|
# See the License for the specific language governing permissions and
|
|
13
13
|
# limitations under the License.
|
|
14
14
|
# ==============================================================================
|
|
15
|
-
"""Flower
|
|
15
|
+
"""Flower Simulation process."""
|
|
16
16
|
|
|
17
|
-
|
|
18
|
-
import asyncio
|
|
19
|
-
import logging
|
|
17
|
+
import argparse
|
|
20
18
|
import sys
|
|
21
|
-
import
|
|
22
|
-
import
|
|
23
|
-
import
|
|
24
|
-
from
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
import
|
|
28
|
-
from
|
|
29
|
-
|
|
30
|
-
from flwr.
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
from flwr.server.server import Server, init_defaults, run_fl
|
|
37
|
-
from flwr.server.server_config import ServerConfig
|
|
38
|
-
from flwr.server.strategy import Strategy
|
|
39
|
-
from flwr.server.superlink.state.utils import generate_rand_int_from_bytes
|
|
40
|
-
from flwr.simulation.ray_transport.ray_actor import (
|
|
41
|
-
ClientAppActor,
|
|
42
|
-
VirtualClientEngineActor,
|
|
43
|
-
VirtualClientEngineActorPool,
|
|
44
|
-
pool_size_from_resources,
|
|
19
|
+
from logging import DEBUG, ERROR, INFO
|
|
20
|
+
from queue import Queue
|
|
21
|
+
from time import sleep
|
|
22
|
+
from typing import Optional
|
|
23
|
+
|
|
24
|
+
from flwr.cli.config_utils import get_fab_metadata
|
|
25
|
+
from flwr.cli.install import install_from_fab
|
|
26
|
+
from flwr.common import EventType
|
|
27
|
+
from flwr.common.args import add_args_flwr_app_common
|
|
28
|
+
from flwr.common.config import (
|
|
29
|
+
get_flwr_dir,
|
|
30
|
+
get_fused_config_from_dir,
|
|
31
|
+
get_project_config,
|
|
32
|
+
get_project_dir,
|
|
33
|
+
unflatten_dict,
|
|
45
34
|
)
|
|
46
|
-
from flwr.
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
)
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
num_clients: int,
|
|
80
|
-
) -> NodeToPartitionMapping:
|
|
81
|
-
"""Generate a node_id:partition_id mapping."""
|
|
82
|
-
nodes_mapping: NodeToPartitionMapping = {} # {node-id; partition-id}
|
|
83
|
-
for i in range(num_clients):
|
|
84
|
-
while True:
|
|
85
|
-
node_id = generate_rand_int_from_bytes(NODE_ID_NUM_BYTES)
|
|
86
|
-
if node_id not in nodes_mapping:
|
|
87
|
-
break
|
|
88
|
-
nodes_mapping[node_id] = i
|
|
89
|
-
return nodes_mapping
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
# pylint: disable=too-many-arguments,too-many-statements,too-many-branches
|
|
93
|
-
def start_simulation(
|
|
94
|
-
*,
|
|
95
|
-
client_fn: ClientFnExt,
|
|
96
|
-
num_clients: int,
|
|
97
|
-
clients_ids: Optional[list[str]] = None, # UNSUPPORTED, WILL BE REMOVED
|
|
98
|
-
client_resources: Optional[dict[str, float]] = None,
|
|
99
|
-
server: Optional[Server] = None,
|
|
100
|
-
config: Optional[ServerConfig] = None,
|
|
101
|
-
strategy: Optional[Strategy] = None,
|
|
102
|
-
client_manager: Optional[ClientManager] = None,
|
|
103
|
-
ray_init_args: Optional[dict[str, Any]] = None,
|
|
104
|
-
keep_initialised: Optional[bool] = False,
|
|
105
|
-
actor_type: type[VirtualClientEngineActor] = ClientAppActor,
|
|
106
|
-
actor_kwargs: Optional[dict[str, Any]] = None,
|
|
107
|
-
actor_scheduling: Union[str, NodeAffinitySchedulingStrategy] = "DEFAULT",
|
|
108
|
-
) -> History:
|
|
109
|
-
"""Start a Ray-based Flower simulation server.
|
|
110
|
-
|
|
111
|
-
Parameters
|
|
112
|
-
----------
|
|
113
|
-
client_fn : ClientFnExt
|
|
114
|
-
A function creating `Client` instances. The function must have the signature
|
|
115
|
-
`client_fn(context: Context). It should return
|
|
116
|
-
a single client instance of type `Client`. Note that the created client
|
|
117
|
-
instances are ephemeral and will often be destroyed after a single method
|
|
118
|
-
invocation. Since client instances are not long-lived, they should not attempt
|
|
119
|
-
to carry state over method invocations. Any state required by the instance
|
|
120
|
-
(model, dataset, hyperparameters, ...) should be (re-)created in either the
|
|
121
|
-
call to `client_fn` or the call to any of the client methods (e.g., load
|
|
122
|
-
evaluation data in the `evaluate` method itself).
|
|
123
|
-
num_clients : int
|
|
124
|
-
The total number of clients in this simulation.
|
|
125
|
-
clients_ids : Optional[List[str]]
|
|
126
|
-
UNSUPPORTED, WILL BE REMOVED. USE `num_clients` INSTEAD.
|
|
127
|
-
List `client_id`s for each client. This is only required if
|
|
128
|
-
`num_clients` is not set. Setting both `num_clients` and `clients_ids`
|
|
129
|
-
with `len(clients_ids)` not equal to `num_clients` generates an error.
|
|
130
|
-
Using this argument will raise an error.
|
|
131
|
-
client_resources : Optional[Dict[str, float]] (default: `{"num_cpus": 1, "num_gpus": 0.0}`)
|
|
132
|
-
CPU and GPU resources for a single client. Supported keys
|
|
133
|
-
are `num_cpus` and `num_gpus`. To understand the GPU utilization caused by
|
|
134
|
-
`num_gpus`, as well as using custom resources, please consult the Ray
|
|
135
|
-
documentation.
|
|
136
|
-
server : Optional[flwr.server.Server] (default: None).
|
|
137
|
-
An implementation of the abstract base class `flwr.server.Server`. If no
|
|
138
|
-
instance is provided, then `start_server` will create one.
|
|
139
|
-
config: ServerConfig (default: None).
|
|
140
|
-
Currently supported values are `num_rounds` (int, default: 1) and
|
|
141
|
-
`round_timeout` in seconds (float, default: None).
|
|
142
|
-
strategy : Optional[flwr.server.Strategy] (default: None)
|
|
143
|
-
An implementation of the abstract base class `flwr.server.Strategy`. If
|
|
144
|
-
no strategy is provided, then `start_server` will use
|
|
145
|
-
`flwr.server.strategy.FedAvg`.
|
|
146
|
-
client_manager : Optional[flwr.server.ClientManager] (default: None)
|
|
147
|
-
An implementation of the abstract base class `flwr.server.ClientManager`.
|
|
148
|
-
If no implementation is provided, then `start_simulation` will use
|
|
149
|
-
`flwr.server.client_manager.SimpleClientManager`.
|
|
150
|
-
ray_init_args : Optional[Dict[str, Any]] (default: None)
|
|
151
|
-
Optional dictionary containing arguments for the call to `ray.init`.
|
|
152
|
-
If ray_init_args is None (the default), Ray will be initialized with
|
|
153
|
-
the following default args:
|
|
154
|
-
|
|
155
|
-
{ "ignore_reinit_error": True, "include_dashboard": False }
|
|
156
|
-
|
|
157
|
-
An empty dictionary can be used (ray_init_args={}) to prevent any
|
|
158
|
-
arguments from being passed to ray.init.
|
|
159
|
-
keep_initialised: Optional[bool] (default: False)
|
|
160
|
-
Set to True to prevent `ray.shutdown()` in case `ray.is_initialized()=True`.
|
|
161
|
-
|
|
162
|
-
actor_type: VirtualClientEngineActor (default: ClientAppActor)
|
|
163
|
-
Optionally specify the type of actor to use. The actor object, which
|
|
164
|
-
persists throughout the simulation, will be the process in charge of
|
|
165
|
-
executing a ClientApp wrapping input argument `client_fn`.
|
|
166
|
-
|
|
167
|
-
actor_kwargs: Optional[Dict[str, Any]] (default: None)
|
|
168
|
-
If you want to create your own Actor classes, you might need to pass
|
|
169
|
-
some input argument. You can use this dictionary for such purpose.
|
|
170
|
-
|
|
171
|
-
actor_scheduling: Optional[Union[str, NodeAffinitySchedulingStrategy]]
|
|
172
|
-
(default: "DEFAULT")
|
|
173
|
-
Optional string ("DEFAULT" or "SPREAD") for the VCE to choose in which
|
|
174
|
-
node the actor is placed. If you are an advanced user needed more control
|
|
175
|
-
you can use lower-level scheduling strategies to pin actors to specific
|
|
176
|
-
compute nodes (e.g. via NodeAffinitySchedulingStrategy). Please note this
|
|
177
|
-
is an advanced feature. For all details, please refer to the Ray documentation:
|
|
178
|
-
https://docs.ray.io/en/latest/ray-core/scheduling/index.html
|
|
179
|
-
|
|
180
|
-
Returns
|
|
181
|
-
-------
|
|
182
|
-
hist : flwr.server.history.History
|
|
183
|
-
Object containing metrics from training.
|
|
184
|
-
""" # noqa: E501
|
|
185
|
-
# pylint: disable-msg=too-many-locals
|
|
186
|
-
event(
|
|
187
|
-
EventType.START_SIMULATION_ENTER,
|
|
188
|
-
{"num_clients": len(clients_ids) if clients_ids is not None else num_clients},
|
|
189
|
-
)
|
|
190
|
-
|
|
191
|
-
if clients_ids is not None:
|
|
192
|
-
warn_unsupported_feature(
|
|
193
|
-
"Passing `clients_ids` to `start_simulation` is deprecated and not longer "
|
|
194
|
-
"used by `start_simulation`. Use `num_clients` exclusively instead."
|
|
195
|
-
)
|
|
196
|
-
log(ERROR, "`clients_ids` argument used.")
|
|
197
|
-
sys.exit()
|
|
198
|
-
|
|
199
|
-
# Set logger propagation
|
|
200
|
-
loop: Optional[asyncio.AbstractEventLoop] = None
|
|
201
|
-
try:
|
|
202
|
-
loop = asyncio.get_running_loop()
|
|
203
|
-
except RuntimeError:
|
|
204
|
-
loop = None
|
|
205
|
-
finally:
|
|
206
|
-
if loop and loop.is_running():
|
|
207
|
-
# Set logger propagation to False to prevent duplicated log output in Colab.
|
|
208
|
-
logger = logging.getLogger("flwr")
|
|
209
|
-
_ = set_logger_propagation(logger, False)
|
|
210
|
-
|
|
211
|
-
# Initialize server and server config
|
|
212
|
-
initialized_server, initialized_config = init_defaults(
|
|
213
|
-
server=server,
|
|
214
|
-
config=config,
|
|
215
|
-
strategy=strategy,
|
|
216
|
-
client_manager=client_manager,
|
|
217
|
-
)
|
|
218
|
-
|
|
219
|
-
log(
|
|
220
|
-
INFO,
|
|
221
|
-
"Starting Flower simulation, config: %s",
|
|
222
|
-
initialized_config,
|
|
223
|
-
)
|
|
35
|
+
from flwr.common.constant import (
|
|
36
|
+
SIMULATIONIO_API_DEFAULT_CLIENT_ADDRESS,
|
|
37
|
+
Status,
|
|
38
|
+
SubStatus,
|
|
39
|
+
)
|
|
40
|
+
from flwr.common.logger import (
|
|
41
|
+
log,
|
|
42
|
+
mirror_output_to_queue,
|
|
43
|
+
restore_output,
|
|
44
|
+
start_log_uploader,
|
|
45
|
+
stop_log_uploader,
|
|
46
|
+
)
|
|
47
|
+
from flwr.common.serde import (
|
|
48
|
+
configs_record_from_proto,
|
|
49
|
+
context_from_proto,
|
|
50
|
+
fab_from_proto,
|
|
51
|
+
run_from_proto,
|
|
52
|
+
run_status_to_proto,
|
|
53
|
+
)
|
|
54
|
+
from flwr.common.typing import RunStatus
|
|
55
|
+
from flwr.proto.run_pb2 import ( # pylint: disable=E0611
|
|
56
|
+
GetFederationOptionsRequest,
|
|
57
|
+
GetFederationOptionsResponse,
|
|
58
|
+
UpdateRunStatusRequest,
|
|
59
|
+
)
|
|
60
|
+
from flwr.proto.simulationio_pb2 import ( # pylint: disable=E0611
|
|
61
|
+
PullSimulationInputsRequest,
|
|
62
|
+
PullSimulationInputsResponse,
|
|
63
|
+
PushSimulationOutputsRequest,
|
|
64
|
+
)
|
|
65
|
+
from flwr.server.superlink.fleet.vce.backend.backend import BackendConfig
|
|
66
|
+
from flwr.simulation.run_simulation import _run_simulation
|
|
67
|
+
from flwr.simulation.simulationio_connection import SimulationIoConnection
|
|
224
68
|
|
|
225
|
-
# Create node-id to partition-id mapping
|
|
226
|
-
nodes_mapping = _create_node_id_to_partition_mapping(num_clients)
|
|
227
69
|
|
|
228
|
-
|
|
229
|
-
|
|
230
|
-
|
|
231
|
-
|
|
232
|
-
|
|
233
|
-
}
|
|
70
|
+
def flwr_simulation() -> None:
|
|
71
|
+
"""Run process-isolated Flower Simulation."""
|
|
72
|
+
# Capture stdout/stderr
|
|
73
|
+
log_queue: Queue[Optional[str]] = Queue()
|
|
74
|
+
mirror_output_to_queue(log_queue)
|
|
234
75
|
|
|
235
|
-
|
|
236
|
-
if ray.is_initialized() and not keep_initialised:
|
|
237
|
-
ray.shutdown()
|
|
76
|
+
args = _parse_args_run_flwr_simulation().parse_args()
|
|
238
77
|
|
|
239
|
-
|
|
240
|
-
ray.init(**ray_init_args)
|
|
241
|
-
cluster_resources = ray.cluster_resources()
|
|
242
|
-
log(
|
|
243
|
-
INFO,
|
|
244
|
-
"Flower VCE: Ray initialized with resources: %s",
|
|
245
|
-
cluster_resources,
|
|
246
|
-
)
|
|
78
|
+
log(INFO, "Starting Flower Simulation")
|
|
247
79
|
|
|
248
|
-
|
|
249
|
-
INFO,
|
|
250
|
-
"Optimize your simulation with Flower VCE: "
|
|
251
|
-
"https://flower.ai/docs/framework/how-to-run-simulations.html",
|
|
252
|
-
)
|
|
253
|
-
|
|
254
|
-
# Log the resources that a single client will be able to use
|
|
255
|
-
if client_resources is None:
|
|
80
|
+
if not args.insecure:
|
|
256
81
|
log(
|
|
257
|
-
|
|
258
|
-
"
|
|
259
|
-
|
|
260
|
-
client_resources = {"num_cpus": 1, "num_gpus": 0.0}
|
|
261
|
-
|
|
262
|
-
# Each client needs at the very least one CPU
|
|
263
|
-
if "num_cpus" not in client_resources:
|
|
264
|
-
warnings.warn(
|
|
265
|
-
"No `num_cpus` specified in `client_resources`. "
|
|
266
|
-
"Using `num_cpus=1` for each client.",
|
|
267
|
-
stacklevel=2,
|
|
82
|
+
ERROR,
|
|
83
|
+
"`flwr-simulation` does not support TLS yet. "
|
|
84
|
+
"Please use the '--insecure' flag.",
|
|
268
85
|
)
|
|
269
|
-
|
|
86
|
+
sys.exit(1)
|
|
270
87
|
|
|
271
88
|
log(
|
|
272
|
-
|
|
273
|
-
"
|
|
274
|
-
|
|
89
|
+
DEBUG,
|
|
90
|
+
"Starting isolated `Simulation` connected to SuperLink SimulationAppIo API "
|
|
91
|
+
"at %s",
|
|
92
|
+
args.simulationio_api_address,
|
|
93
|
+
)
|
|
94
|
+
run_simulation_process(
|
|
95
|
+
simulationio_api_address=args.simulationio_api_address,
|
|
96
|
+
log_queue=log_queue,
|
|
97
|
+
run_once=args.run_once,
|
|
98
|
+
flwr_dir_=args.flwr_dir,
|
|
99
|
+
certificates=None,
|
|
275
100
|
)
|
|
276
101
|
|
|
277
|
-
|
|
278
|
-
|
|
279
|
-
|
|
280
|
-
|
|
281
|
-
|
|
282
|
-
|
|
283
|
-
|
|
284
|
-
|
|
285
|
-
|
|
286
|
-
|
|
287
|
-
|
|
288
|
-
|
|
289
|
-
|
|
290
|
-
|
|
291
|
-
|
|
102
|
+
# Restore stdout/stderr
|
|
103
|
+
restore_output()
|
|
104
|
+
|
|
105
|
+
|
|
106
|
+
def run_simulation_process( # pylint: disable=R0914, disable=W0212, disable=R0915
|
|
107
|
+
simulationio_api_address: str,
|
|
108
|
+
log_queue: Queue[Optional[str]],
|
|
109
|
+
run_once: bool,
|
|
110
|
+
flwr_dir_: Optional[str] = None,
|
|
111
|
+
certificates: Optional[bytes] = None,
|
|
112
|
+
) -> None:
|
|
113
|
+
"""Run Flower Simulation process."""
|
|
114
|
+
conn = SimulationIoConnection(
|
|
115
|
+
simulationio_service_address=simulationio_api_address,
|
|
116
|
+
root_certificates=certificates,
|
|
292
117
|
)
|
|
293
118
|
|
|
294
|
-
|
|
295
|
-
|
|
296
|
-
|
|
297
|
-
|
|
298
|
-
|
|
299
|
-
|
|
300
|
-
|
|
301
|
-
|
|
302
|
-
|
|
303
|
-
|
|
304
|
-
|
|
305
|
-
|
|
306
|
-
|
|
307
|
-
|
|
308
|
-
|
|
309
|
-
|
|
119
|
+
# Resolve directory where FABs are installed
|
|
120
|
+
flwr_dir = get_flwr_dir(flwr_dir_)
|
|
121
|
+
log_uploader = None
|
|
122
|
+
|
|
123
|
+
while True:
|
|
124
|
+
|
|
125
|
+
try:
|
|
126
|
+
# Pull SimulationInputs from LinkState
|
|
127
|
+
req = PullSimulationInputsRequest()
|
|
128
|
+
res: PullSimulationInputsResponse = conn._stub.PullSimulationInputs(req)
|
|
129
|
+
if not res.HasField("run"):
|
|
130
|
+
sleep(3)
|
|
131
|
+
run_status = None
|
|
132
|
+
continue
|
|
133
|
+
|
|
134
|
+
context = context_from_proto(res.context)
|
|
135
|
+
run = run_from_proto(res.run)
|
|
136
|
+
fab = fab_from_proto(res.fab)
|
|
137
|
+
|
|
138
|
+
# Start log uploader for this run
|
|
139
|
+
log_uploader = start_log_uploader(
|
|
140
|
+
log_queue=log_queue,
|
|
141
|
+
node_id=context.node_id,
|
|
142
|
+
run_id=run.run_id,
|
|
143
|
+
stub=conn._stub,
|
|
144
|
+
)
|
|
145
|
+
|
|
146
|
+
log(DEBUG, "Simulation process starts FAB installation.")
|
|
147
|
+
install_from_fab(fab.content, flwr_dir=flwr_dir, skip_prompt=True)
|
|
148
|
+
|
|
149
|
+
fab_id, fab_version = get_fab_metadata(fab.content)
|
|
150
|
+
|
|
151
|
+
app_path = get_project_dir(fab_id, fab_version, fab.hash_str, flwr_dir)
|
|
152
|
+
config = get_project_config(app_path)
|
|
153
|
+
|
|
154
|
+
# Get ClientApp and SeverApp components
|
|
155
|
+
app_components = config["tool"]["flwr"]["app"]["components"]
|
|
156
|
+
client_app_attr = app_components["clientapp"]
|
|
157
|
+
server_app_attr = app_components["serverapp"]
|
|
158
|
+
fused_config = get_fused_config_from_dir(app_path, run.override_config)
|
|
159
|
+
|
|
160
|
+
# Update run_config in context
|
|
161
|
+
context.run_config = fused_config
|
|
162
|
+
|
|
163
|
+
log(
|
|
164
|
+
DEBUG,
|
|
165
|
+
"Flower will load ServerApp `%s` in %s",
|
|
166
|
+
server_app_attr,
|
|
167
|
+
app_path,
|
|
168
|
+
)
|
|
169
|
+
log(
|
|
170
|
+
DEBUG,
|
|
171
|
+
"Flower will load ClientApp `%s` in %s",
|
|
172
|
+
client_app_attr,
|
|
173
|
+
app_path,
|
|
174
|
+
)
|
|
175
|
+
|
|
176
|
+
# Change status to Running
|
|
177
|
+
run_status_proto = run_status_to_proto(RunStatus(Status.RUNNING, "", ""))
|
|
178
|
+
conn._stub.UpdateRunStatus(
|
|
179
|
+
UpdateRunStatusRequest(run_id=run.run_id, run_status=run_status_proto)
|
|
180
|
+
)
|
|
181
|
+
|
|
182
|
+
# Pull Federation Options
|
|
183
|
+
fed_opt_res: GetFederationOptionsResponse = conn._stub.GetFederationOptions(
|
|
184
|
+
GetFederationOptionsRequest(run_id=run.run_id)
|
|
185
|
+
)
|
|
186
|
+
federation_options = configs_record_from_proto(
|
|
187
|
+
fed_opt_res.federation_options
|
|
188
|
+
)
|
|
189
|
+
|
|
190
|
+
# Unflatten underlying dict
|
|
191
|
+
fed_opt = unflatten_dict({**federation_options})
|
|
192
|
+
|
|
193
|
+
# Extract configs values of interest
|
|
194
|
+
num_supernodes = fed_opt.get("num-supernodes")
|
|
195
|
+
if num_supernodes is None:
|
|
196
|
+
raise ValueError(
|
|
197
|
+
"Federation options expects `num-supernodes` to be set."
|
|
198
|
+
)
|
|
199
|
+
backend_config: BackendConfig = fed_opt.get("backend", {})
|
|
200
|
+
verbose: bool = fed_opt.get("verbose", False)
|
|
201
|
+
enable_tf_gpu_growth: bool = fed_opt.get("enable_tf_gpu_growth", False)
|
|
202
|
+
|
|
203
|
+
# Launch the simulation
|
|
204
|
+
_run_simulation(
|
|
205
|
+
server_app_attr=server_app_attr,
|
|
206
|
+
client_app_attr=client_app_attr,
|
|
207
|
+
num_supernodes=num_supernodes,
|
|
208
|
+
backend_config=backend_config,
|
|
209
|
+
app_dir=str(app_path),
|
|
210
|
+
run=run,
|
|
211
|
+
enable_tf_gpu_growth=enable_tf_gpu_growth,
|
|
212
|
+
verbose_logging=verbose,
|
|
213
|
+
server_app_run_config=fused_config,
|
|
214
|
+
is_app=True,
|
|
215
|
+
exit_event=EventType.CLI_FLOWER_SIMULATION_LEAVE,
|
|
216
|
+
)
|
|
217
|
+
|
|
218
|
+
# Send resulting context
|
|
219
|
+
context_proto = None # context_to_proto(updated_context)
|
|
220
|
+
out_req = PushSimulationOutputsRequest(
|
|
221
|
+
run_id=run.run_id, context=context_proto
|
|
222
|
+
)
|
|
223
|
+
_ = conn._stub.PushSimulationOutputs(out_req)
|
|
224
|
+
|
|
225
|
+
run_status = RunStatus(Status.FINISHED, SubStatus.COMPLETED, "")
|
|
226
|
+
|
|
227
|
+
except Exception as ex: # pylint: disable=broad-exception-caught
|
|
228
|
+
exc_entity = "Simulation"
|
|
229
|
+
log(ERROR, "%s raised an exception", exc_entity, exc_info=ex)
|
|
230
|
+
run_status = RunStatus(Status.FINISHED, SubStatus.FAILED, str(ex))
|
|
231
|
+
|
|
232
|
+
finally:
|
|
233
|
+
# Stop log uploader for this run and upload final logs
|
|
234
|
+
if log_uploader:
|
|
235
|
+
stop_log_uploader(log_queue, log_uploader)
|
|
236
|
+
log_uploader = None
|
|
237
|
+
|
|
238
|
+
# Update run status
|
|
239
|
+
if run_status:
|
|
240
|
+
run_status_proto = run_status_to_proto(run_status)
|
|
241
|
+
conn._stub.UpdateRunStatus(
|
|
242
|
+
UpdateRunStatusRequest(
|
|
243
|
+
run_id=run.run_id, run_status=run_status_proto
|
|
244
|
+
)
|
|
310
245
|
)
|
|
311
|
-
pool.add_actors_to_pool(num_actors=num_new)
|
|
312
246
|
|
|
313
|
-
|
|
247
|
+
# Stop the loop if `flwr-simulation` is expected to process a single run
|
|
248
|
+
if run_once:
|
|
249
|
+
break
|
|
314
250
|
|
|
315
|
-
update_resources(f_stop)
|
|
316
251
|
|
|
317
|
-
|
|
318
|
-
|
|
319
|
-
|
|
320
|
-
|
|
321
|
-
pool.num_actors,
|
|
252
|
+
def _parse_args_run_flwr_simulation() -> argparse.ArgumentParser:
|
|
253
|
+
"""Parse flwr-simulation command line arguments."""
|
|
254
|
+
parser = argparse.ArgumentParser(
|
|
255
|
+
description="Run a Flower Simulation",
|
|
322
256
|
)
|
|
323
|
-
|
|
324
|
-
|
|
325
|
-
|
|
326
|
-
|
|
327
|
-
|
|
328
|
-
|
|
329
|
-
|
|
330
|
-
|
|
331
|
-
|
|
332
|
-
|
|
333
|
-
|
|
334
|
-
|
|
335
|
-
|
|
336
|
-
|
|
337
|
-
|
|
338
|
-
# Start training
|
|
339
|
-
hist = run_fl(
|
|
340
|
-
server=initialized_server,
|
|
341
|
-
config=initialized_config,
|
|
342
|
-
)
|
|
343
|
-
except Exception as ex:
|
|
344
|
-
log(ERROR, ex)
|
|
345
|
-
log(ERROR, traceback.format_exc())
|
|
346
|
-
log(
|
|
347
|
-
ERROR,
|
|
348
|
-
"Your simulation crashed :(. This could be because of several reasons. "
|
|
349
|
-
"The most common are: "
|
|
350
|
-
"\n\t > Sometimes, issues in the simulation code itself can cause crashes. "
|
|
351
|
-
"It's always a good idea to double-check your code for any potential bugs "
|
|
352
|
-
"or inconsistencies that might be contributing to the problem. "
|
|
353
|
-
"For example: "
|
|
354
|
-
"\n\t\t - You might be using a class attribute in your clients that "
|
|
355
|
-
"hasn't been defined."
|
|
356
|
-
"\n\t\t - There could be an incorrect method call to a 3rd party library "
|
|
357
|
-
"(e.g., PyTorch)."
|
|
358
|
-
"\n\t\t - The return types of methods in your clients/strategies might be "
|
|
359
|
-
"incorrect."
|
|
360
|
-
"\n\t > Your system couldn't fit a single VirtualClient: try lowering "
|
|
361
|
-
"`client_resources`."
|
|
362
|
-
"\n\t > All the actors in your pool crashed. This could be because: "
|
|
363
|
-
"\n\t\t - You clients hit an out-of-memory (OOM) error and actors couldn't "
|
|
364
|
-
"recover from it. Try launching your simulation with more generous "
|
|
365
|
-
"`client_resources` setting (i.e. it seems %s is "
|
|
366
|
-
"not enough for your run). Use fewer concurrent actors. "
|
|
367
|
-
"\n\t\t - You were running a multi-node simulation and all worker nodes "
|
|
368
|
-
"disconnected. The head node might still be alive but cannot accommodate "
|
|
369
|
-
"any actor with resources: %s."
|
|
370
|
-
"\nTake a look at the Flower simulation examples for guidance "
|
|
371
|
-
"<https://flower.ai/docs/framework/how-to-run-simulations.html>.",
|
|
372
|
-
client_resources,
|
|
373
|
-
client_resources,
|
|
374
|
-
)
|
|
375
|
-
raise RuntimeError("Simulation crashed.") from ex
|
|
376
|
-
|
|
377
|
-
finally:
|
|
378
|
-
# Stop time monitoring resources in cluster
|
|
379
|
-
f_stop.set()
|
|
380
|
-
event(EventType.START_SIMULATION_LEAVE)
|
|
381
|
-
|
|
382
|
-
return hist
|
|
257
|
+
parser.add_argument(
|
|
258
|
+
"--simulationio-api-address",
|
|
259
|
+
default=SIMULATIONIO_API_DEFAULT_CLIENT_ADDRESS,
|
|
260
|
+
type=str,
|
|
261
|
+
help="Address of SuperLink's SimulationIO API (IPv4, IPv6, or a domain name)."
|
|
262
|
+
f"By default, it is set to {SIMULATIONIO_API_DEFAULT_CLIENT_ADDRESS}.",
|
|
263
|
+
)
|
|
264
|
+
parser.add_argument(
|
|
265
|
+
"--run-once",
|
|
266
|
+
action="store_true",
|
|
267
|
+
help="When set, this process will start a single simulation "
|
|
268
|
+
"for a pending Run. If no pending run the process will exit. ",
|
|
269
|
+
)
|
|
270
|
+
add_args_flwr_app_common(parser=parser)
|
|
271
|
+
return parser
|