flwr 1.12.0__py3-none-any.whl → 1.13.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (110)
  1. flwr/cli/app.py +2 -0
  2. flwr/cli/build.py +2 -2
  3. flwr/cli/config_utils.py +97 -0
  4. flwr/cli/install.py +0 -16
  5. flwr/cli/log.py +63 -97
  6. flwr/cli/ls.py +228 -0
  7. flwr/cli/new/new.py +23 -13
  8. flwr/cli/new/templates/app/README.md.tpl +11 -0
  9. flwr/cli/new/templates/app/code/flwr_tune/dataset.py.tpl +1 -1
  10. flwr/cli/new/templates/app/pyproject.baseline.toml.tpl +1 -1
  11. flwr/cli/new/templates/app/pyproject.flowertune.toml.tpl +2 -1
  12. flwr/cli/new/templates/app/pyproject.huggingface.toml.tpl +1 -1
  13. flwr/cli/new/templates/app/pyproject.jax.toml.tpl +1 -1
  14. flwr/cli/new/templates/app/pyproject.mlx.toml.tpl +1 -1
  15. flwr/cli/new/templates/app/pyproject.numpy.toml.tpl +1 -1
  16. flwr/cli/new/templates/app/pyproject.pytorch.toml.tpl +1 -1
  17. flwr/cli/new/templates/app/pyproject.sklearn.toml.tpl +1 -1
  18. flwr/cli/new/templates/app/pyproject.tensorflow.toml.tpl +2 -2
  19. flwr/cli/run/run.py +37 -89
  20. flwr/client/app.py +73 -34
  21. flwr/client/clientapp/app.py +58 -37
  22. flwr/client/grpc_rere_client/connection.py +7 -12
  23. flwr/client/nodestate/__init__.py +25 -0
  24. flwr/client/nodestate/in_memory_nodestate.py +38 -0
  25. flwr/client/nodestate/nodestate.py +30 -0
  26. flwr/client/nodestate/nodestate_factory.py +37 -0
  27. flwr/client/rest_client/connection.py +4 -14
  28. flwr/client/{node_state.py → run_info_store.py} +4 -3
  29. flwr/client/supernode/app.py +34 -58
  30. flwr/common/args.py +152 -0
  31. flwr/common/config.py +10 -0
  32. flwr/common/constant.py +59 -7
  33. flwr/common/context.py +9 -4
  34. flwr/common/date.py +21 -3
  35. flwr/common/grpc.py +4 -1
  36. flwr/common/logger.py +108 -1
  37. flwr/common/object_ref.py +47 -16
  38. flwr/common/serde.py +34 -0
  39. flwr/common/telemetry.py +0 -6
  40. flwr/common/typing.py +32 -2
  41. flwr/proto/exec_pb2.py +23 -17
  42. flwr/proto/exec_pb2.pyi +58 -22
  43. flwr/proto/exec_pb2_grpc.py +34 -0
  44. flwr/proto/exec_pb2_grpc.pyi +13 -0
  45. flwr/proto/log_pb2.py +29 -0
  46. flwr/proto/log_pb2.pyi +39 -0
  47. flwr/proto/log_pb2_grpc.py +4 -0
  48. flwr/proto/log_pb2_grpc.pyi +4 -0
  49. flwr/proto/message_pb2.py +8 -8
  50. flwr/proto/message_pb2.pyi +4 -1
  51. flwr/proto/run_pb2.py +32 -27
  52. flwr/proto/run_pb2.pyi +44 -1
  53. flwr/proto/serverappio_pb2.py +52 -0
  54. flwr/proto/{driver_pb2.pyi → serverappio_pb2.pyi} +54 -0
  55. flwr/proto/serverappio_pb2_grpc.py +376 -0
  56. flwr/proto/serverappio_pb2_grpc.pyi +147 -0
  57. flwr/proto/simulationio_pb2.py +38 -0
  58. flwr/proto/simulationio_pb2.pyi +65 -0
  59. flwr/proto/simulationio_pb2_grpc.py +205 -0
  60. flwr/proto/simulationio_pb2_grpc.pyi +81 -0
  61. flwr/server/app.py +297 -162
  62. flwr/server/driver/driver.py +15 -1
  63. flwr/server/driver/grpc_driver.py +89 -50
  64. flwr/server/driver/inmemory_driver.py +6 -16
  65. flwr/server/run_serverapp.py +11 -235
  66. flwr/server/{superlink/state → serverapp}/__init__.py +3 -9
  67. flwr/server/serverapp/app.py +234 -0
  68. flwr/server/strategy/aggregate.py +4 -4
  69. flwr/server/strategy/fedadam.py +11 -1
  70. flwr/server/superlink/driver/__init__.py +1 -1
  71. flwr/server/superlink/driver/{driver_grpc.py → serverappio_grpc.py} +19 -16
  72. flwr/server/superlink/driver/{driver_servicer.py → serverappio_servicer.py} +125 -39
  73. flwr/server/superlink/fleet/grpc_adapter/grpc_adapter_servicer.py +4 -2
  74. flwr/server/superlink/fleet/grpc_bidi/grpc_server.py +2 -2
  75. flwr/server/superlink/fleet/grpc_rere/fleet_servicer.py +4 -2
  76. flwr/server/superlink/fleet/grpc_rere/server_interceptor.py +2 -2
  77. flwr/server/superlink/fleet/message_handler/message_handler.py +7 -7
  78. flwr/server/superlink/fleet/rest_rere/rest_api.py +10 -9
  79. flwr/server/superlink/fleet/vce/vce_api.py +23 -23
  80. flwr/server/superlink/linkstate/__init__.py +28 -0
  81. flwr/server/superlink/{state/in_memory_state.py → linkstate/in_memory_linkstate.py} +237 -64
  82. flwr/server/superlink/{state/state.py → linkstate/linkstate.py} +166 -22
  83. flwr/server/superlink/{state/state_factory.py → linkstate/linkstate_factory.py} +9 -9
  84. flwr/server/superlink/{state/sqlite_state.py → linkstate/sqlite_linkstate.py} +383 -174
  85. flwr/server/superlink/linkstate/utils.py +389 -0
  86. flwr/server/superlink/simulation/__init__.py +15 -0
  87. flwr/server/superlink/simulation/simulationio_grpc.py +65 -0
  88. flwr/server/superlink/simulation/simulationio_servicer.py +153 -0
  89. flwr/simulation/__init__.py +5 -1
  90. flwr/simulation/app.py +236 -347
  91. flwr/simulation/legacy_app.py +402 -0
  92. flwr/simulation/ray_transport/ray_client_proxy.py +2 -2
  93. flwr/simulation/run_simulation.py +56 -141
  94. flwr/simulation/simulationio_connection.py +86 -0
  95. flwr/superexec/app.py +6 -134
  96. flwr/superexec/deployment.py +70 -69
  97. flwr/superexec/exec_grpc.py +15 -8
  98. flwr/superexec/exec_servicer.py +65 -65
  99. flwr/superexec/executor.py +26 -7
  100. flwr/superexec/simulation.py +62 -150
  101. {flwr-1.12.0.dist-info → flwr-1.13.1.dist-info}/METADATA +9 -7
  102. {flwr-1.12.0.dist-info → flwr-1.13.1.dist-info}/RECORD +105 -85
  103. {flwr-1.12.0.dist-info → flwr-1.13.1.dist-info}/entry_points.txt +2 -0
  104. flwr/client/node_state_tests.py +0 -66
  105. flwr/proto/driver_pb2.py +0 -42
  106. flwr/proto/driver_pb2_grpc.py +0 -239
  107. flwr/proto/driver_pb2_grpc.pyi +0 -94
  108. flwr/server/superlink/state/utils.py +0 -148
  109. {flwr-1.12.0.dist-info → flwr-1.13.1.dist-info}/LICENSE +0 -0
  110. {flwr-1.12.0.dist-info → flwr-1.13.1.dist-info}/WHEEL +0 -0
flwr/simulation/app.py CHANGED
@@ -12,371 +12,260 @@
12
12
  # See the License for the specific language governing permissions and
13
13
  # limitations under the License.
14
14
  # ==============================================================================
15
- """Flower simulation app."""
15
+ """Flower Simulation process."""
16
16
 
17
-
18
- import asyncio
19
- import logging
17
+ import argparse
20
18
  import sys
21
- import threading
22
- import traceback
23
- import warnings
24
- from logging import ERROR, INFO
25
- from typing import Any, Optional, Union
26
-
27
- import ray
28
- from ray.util.scheduling_strategies import NodeAffinitySchedulingStrategy
29
-
30
- from flwr.client import ClientFnExt
31
- from flwr.common import EventType, event
32
- from flwr.common.constant import NODE_ID_NUM_BYTES
33
- from flwr.common.logger import log, set_logger_propagation, warn_unsupported_feature
34
- from flwr.server.client_manager import ClientManager
35
- from flwr.server.history import History
36
- from flwr.server.server import Server, init_defaults, run_fl
37
- from flwr.server.server_config import ServerConfig
38
- from flwr.server.strategy import Strategy
39
- from flwr.server.superlink.state.utils import generate_rand_int_from_bytes
40
- from flwr.simulation.ray_transport.ray_actor import (
41
- ClientAppActor,
42
- VirtualClientEngineActor,
43
- VirtualClientEngineActorPool,
44
- pool_size_from_resources,
19
+ from logging import DEBUG, ERROR, INFO
20
+ from queue import Queue
21
+ from time import sleep
22
+ from typing import Optional
23
+
24
+ from flwr.cli.config_utils import get_fab_metadata
25
+ from flwr.cli.install import install_from_fab
26
+ from flwr.common import EventType
27
+ from flwr.common.args import add_args_flwr_app_common
28
+ from flwr.common.config import (
29
+ get_flwr_dir,
30
+ get_fused_config_from_dir,
31
+ get_project_config,
32
+ get_project_dir,
33
+ unflatten_dict,
45
34
  )
46
- from flwr.simulation.ray_transport.ray_client_proxy import RayActorClientProxy
47
-
48
- INVALID_ARGUMENTS_START_SIMULATION = """
49
- INVALID ARGUMENTS ERROR
50
-
51
- Invalid Arguments in method:
52
-
53
- `start_simulation(
54
- *,
55
- client_fn: ClientFn,
56
- num_clients: int,
57
- clients_ids: Optional[List[str]] = None,
58
- client_resources: Optional[Dict[str, float]] = None,
59
- server: Optional[Server] = None,
60
- config: ServerConfig = None,
61
- strategy: Optional[Strategy] = None,
62
- client_manager: Optional[ClientManager] = None,
63
- ray_init_args: Optional[Dict[str, Any]] = None,
64
- ) -> None:`
65
-
66
- REASON:
67
- Method requires:
68
- - Either `num_clients`[int] or `clients_ids`[List[str]]
69
- to be set exclusively.
70
- OR
71
- - `len(clients_ids)` == `num_clients`
72
-
73
- """
74
-
75
- NodeToPartitionMapping = dict[int, int]
76
-
77
-
78
- def _create_node_id_to_partition_mapping(
79
- num_clients: int,
80
- ) -> NodeToPartitionMapping:
81
- """Generate a node_id:partition_id mapping."""
82
- nodes_mapping: NodeToPartitionMapping = {} # {node-id; partition-id}
83
- for i in range(num_clients):
84
- while True:
85
- node_id = generate_rand_int_from_bytes(NODE_ID_NUM_BYTES)
86
- if node_id not in nodes_mapping:
87
- break
88
- nodes_mapping[node_id] = i
89
- return nodes_mapping
90
-
91
-
92
- # pylint: disable=too-many-arguments,too-many-statements,too-many-branches
93
- def start_simulation(
94
- *,
95
- client_fn: ClientFnExt,
96
- num_clients: int,
97
- clients_ids: Optional[list[str]] = None, # UNSUPPORTED, WILL BE REMOVED
98
- client_resources: Optional[dict[str, float]] = None,
99
- server: Optional[Server] = None,
100
- config: Optional[ServerConfig] = None,
101
- strategy: Optional[Strategy] = None,
102
- client_manager: Optional[ClientManager] = None,
103
- ray_init_args: Optional[dict[str, Any]] = None,
104
- keep_initialised: Optional[bool] = False,
105
- actor_type: type[VirtualClientEngineActor] = ClientAppActor,
106
- actor_kwargs: Optional[dict[str, Any]] = None,
107
- actor_scheduling: Union[str, NodeAffinitySchedulingStrategy] = "DEFAULT",
108
- ) -> History:
109
- """Start a Ray-based Flower simulation server.
110
-
111
- Parameters
112
- ----------
113
- client_fn : ClientFnExt
114
- A function creating `Client` instances. The function must have the signature
115
- `client_fn(context: Context). It should return
116
- a single client instance of type `Client`. Note that the created client
117
- instances are ephemeral and will often be destroyed after a single method
118
- invocation. Since client instances are not long-lived, they should not attempt
119
- to carry state over method invocations. Any state required by the instance
120
- (model, dataset, hyperparameters, ...) should be (re-)created in either the
121
- call to `client_fn` or the call to any of the client methods (e.g., load
122
- evaluation data in the `evaluate` method itself).
123
- num_clients : int
124
- The total number of clients in this simulation.
125
- clients_ids : Optional[List[str]]
126
- UNSUPPORTED, WILL BE REMOVED. USE `num_clients` INSTEAD.
127
- List `client_id`s for each client. This is only required if
128
- `num_clients` is not set. Setting both `num_clients` and `clients_ids`
129
- with `len(clients_ids)` not equal to `num_clients` generates an error.
130
- Using this argument will raise an error.
131
- client_resources : Optional[Dict[str, float]] (default: `{"num_cpus": 1, "num_gpus": 0.0}`)
132
- CPU and GPU resources for a single client. Supported keys
133
- are `num_cpus` and `num_gpus`. To understand the GPU utilization caused by
134
- `num_gpus`, as well as using custom resources, please consult the Ray
135
- documentation.
136
- server : Optional[flwr.server.Server] (default: None).
137
- An implementation of the abstract base class `flwr.server.Server`. If no
138
- instance is provided, then `start_server` will create one.
139
- config: ServerConfig (default: None).
140
- Currently supported values are `num_rounds` (int, default: 1) and
141
- `round_timeout` in seconds (float, default: None).
142
- strategy : Optional[flwr.server.Strategy] (default: None)
143
- An implementation of the abstract base class `flwr.server.Strategy`. If
144
- no strategy is provided, then `start_server` will use
145
- `flwr.server.strategy.FedAvg`.
146
- client_manager : Optional[flwr.server.ClientManager] (default: None)
147
- An implementation of the abstract base class `flwr.server.ClientManager`.
148
- If no implementation is provided, then `start_simulation` will use
149
- `flwr.server.client_manager.SimpleClientManager`.
150
- ray_init_args : Optional[Dict[str, Any]] (default: None)
151
- Optional dictionary containing arguments for the call to `ray.init`.
152
- If ray_init_args is None (the default), Ray will be initialized with
153
- the following default args:
154
-
155
- { "ignore_reinit_error": True, "include_dashboard": False }
156
-
157
- An empty dictionary can be used (ray_init_args={}) to prevent any
158
- arguments from being passed to ray.init.
159
- keep_initialised: Optional[bool] (default: False)
160
- Set to True to prevent `ray.shutdown()` in case `ray.is_initialized()=True`.
161
-
162
- actor_type: VirtualClientEngineActor (default: ClientAppActor)
163
- Optionally specify the type of actor to use. The actor object, which
164
- persists throughout the simulation, will be the process in charge of
165
- executing a ClientApp wrapping input argument `client_fn`.
166
-
167
- actor_kwargs: Optional[Dict[str, Any]] (default: None)
168
- If you want to create your own Actor classes, you might need to pass
169
- some input argument. You can use this dictionary for such purpose.
170
-
171
- actor_scheduling: Optional[Union[str, NodeAffinitySchedulingStrategy]]
172
- (default: "DEFAULT")
173
- Optional string ("DEFAULT" or "SPREAD") for the VCE to choose in which
174
- node the actor is placed. If you are an advanced user needed more control
175
- you can use lower-level scheduling strategies to pin actors to specific
176
- compute nodes (e.g. via NodeAffinitySchedulingStrategy). Please note this
177
- is an advanced feature. For all details, please refer to the Ray documentation:
178
- https://docs.ray.io/en/latest/ray-core/scheduling/index.html
179
-
180
- Returns
181
- -------
182
- hist : flwr.server.history.History
183
- Object containing metrics from training.
184
- """ # noqa: E501
185
- # pylint: disable-msg=too-many-locals
186
- event(
187
- EventType.START_SIMULATION_ENTER,
188
- {"num_clients": len(clients_ids) if clients_ids is not None else num_clients},
189
- )
190
-
191
- if clients_ids is not None:
192
- warn_unsupported_feature(
193
- "Passing `clients_ids` to `start_simulation` is deprecated and not longer "
194
- "used by `start_simulation`. Use `num_clients` exclusively instead."
195
- )
196
- log(ERROR, "`clients_ids` argument used.")
197
- sys.exit()
198
-
199
- # Set logger propagation
200
- loop: Optional[asyncio.AbstractEventLoop] = None
201
- try:
202
- loop = asyncio.get_running_loop()
203
- except RuntimeError:
204
- loop = None
205
- finally:
206
- if loop and loop.is_running():
207
- # Set logger propagation to False to prevent duplicated log output in Colab.
208
- logger = logging.getLogger("flwr")
209
- _ = set_logger_propagation(logger, False)
210
-
211
- # Initialize server and server config
212
- initialized_server, initialized_config = init_defaults(
213
- server=server,
214
- config=config,
215
- strategy=strategy,
216
- client_manager=client_manager,
217
- )
218
-
219
- log(
220
- INFO,
221
- "Starting Flower simulation, config: %s",
222
- initialized_config,
223
- )
35
+ from flwr.common.constant import (
36
+ SIMULATIONIO_API_DEFAULT_CLIENT_ADDRESS,
37
+ Status,
38
+ SubStatus,
39
+ )
40
+ from flwr.common.logger import (
41
+ log,
42
+ mirror_output_to_queue,
43
+ restore_output,
44
+ start_log_uploader,
45
+ stop_log_uploader,
46
+ )
47
+ from flwr.common.serde import (
48
+ configs_record_from_proto,
49
+ context_from_proto,
50
+ fab_from_proto,
51
+ run_from_proto,
52
+ run_status_to_proto,
53
+ )
54
+ from flwr.common.typing import RunStatus
55
+ from flwr.proto.run_pb2 import ( # pylint: disable=E0611
56
+ GetFederationOptionsRequest,
57
+ GetFederationOptionsResponse,
58
+ UpdateRunStatusRequest,
59
+ )
60
+ from flwr.proto.simulationio_pb2 import ( # pylint: disable=E0611
61
+ PullSimulationInputsRequest,
62
+ PullSimulationInputsResponse,
63
+ PushSimulationOutputsRequest,
64
+ )
65
+ from flwr.server.superlink.fleet.vce.backend.backend import BackendConfig
66
+ from flwr.simulation.run_simulation import _run_simulation
67
+ from flwr.simulation.simulationio_connection import SimulationIoConnection
224
68
 
225
- # Create node-id to partition-id mapping
226
- nodes_mapping = _create_node_id_to_partition_mapping(num_clients)
227
69
 
228
- # Default arguments for Ray initialization
229
- if not ray_init_args:
230
- ray_init_args = {
231
- "ignore_reinit_error": True,
232
- "include_dashboard": False,
233
- }
70
+ def flwr_simulation() -> None:
71
+ """Run process-isolated Flower Simulation."""
72
+ # Capture stdout/stderr
73
+ log_queue: Queue[Optional[str]] = Queue()
74
+ mirror_output_to_queue(log_queue)
234
75
 
235
- # Shut down Ray if it has already been initialized (unless asked not to)
236
- if ray.is_initialized() and not keep_initialised:
237
- ray.shutdown()
76
+ args = _parse_args_run_flwr_simulation().parse_args()
238
77
 
239
- # Initialize Ray
240
- ray.init(**ray_init_args)
241
- cluster_resources = ray.cluster_resources()
242
- log(
243
- INFO,
244
- "Flower VCE: Ray initialized with resources: %s",
245
- cluster_resources,
246
- )
78
+ log(INFO, "Starting Flower Simulation")
247
79
 
248
- log(
249
- INFO,
250
- "Optimize your simulation with Flower VCE: "
251
- "https://flower.ai/docs/framework/how-to-run-simulations.html",
252
- )
253
-
254
- # Log the resources that a single client will be able to use
255
- if client_resources is None:
80
+ if not args.insecure:
256
81
  log(
257
- INFO,
258
- "No `client_resources` specified. Using minimal resources for clients.",
259
- )
260
- client_resources = {"num_cpus": 1, "num_gpus": 0.0}
261
-
262
- # Each client needs at the very least one CPU
263
- if "num_cpus" not in client_resources:
264
- warnings.warn(
265
- "No `num_cpus` specified in `client_resources`. "
266
- "Using `num_cpus=1` for each client.",
267
- stacklevel=2,
82
+ ERROR,
83
+ "`flwr-simulation` does not support TLS yet. "
84
+ "Please use the '--insecure' flag.",
268
85
  )
269
- client_resources["num_cpus"] = 1
86
+ sys.exit(1)
270
87
 
271
88
  log(
272
- INFO,
273
- "Flower VCE: Resources for each Virtual Client: %s",
274
- client_resources,
89
+ DEBUG,
90
+ "Starting isolated `Simulation` connected to SuperLink SimulationAppIo API "
91
+ "at %s",
92
+ args.simulationio_api_address,
93
+ )
94
+ run_simulation_process(
95
+ simulationio_api_address=args.simulationio_api_address,
96
+ log_queue=log_queue,
97
+ run_once=args.run_once,
98
+ flwr_dir_=args.flwr_dir,
99
+ certificates=None,
275
100
  )
276
101
 
277
- actor_args = {} if actor_kwargs is None else actor_kwargs
278
-
279
- # An actor factory. This is called N times to add N actors
280
- # to the pool. If at some point the pool can accommodate more actors
281
- # this will be called again.
282
- def create_actor_fn() -> type[VirtualClientEngineActor]:
283
- return actor_type.options( # type: ignore
284
- **client_resources,
285
- scheduling_strategy=actor_scheduling,
286
- ).remote(**actor_args)
287
-
288
- # Instantiate ActorPool
289
- pool = VirtualClientEngineActorPool(
290
- create_actor_fn=create_actor_fn,
291
- client_resources=client_resources,
102
+ # Restore stdout/stderr
103
+ restore_output()
104
+
105
+
106
+ def run_simulation_process( # pylint: disable=R0914, disable=W0212, disable=R0915
107
+ simulationio_api_address: str,
108
+ log_queue: Queue[Optional[str]],
109
+ run_once: bool,
110
+ flwr_dir_: Optional[str] = None,
111
+ certificates: Optional[bytes] = None,
112
+ ) -> None:
113
+ """Run Flower Simulation process."""
114
+ conn = SimulationIoConnection(
115
+ simulationio_service_address=simulationio_api_address,
116
+ root_certificates=certificates,
292
117
  )
293
118
 
294
- f_stop = threading.Event()
295
-
296
- # Periodically, check if the cluster has grown (i.e. a new
297
- # node has been added). If this happens, we likely want to grow
298
- # the actor pool by adding more Actors to it.
299
- def update_resources(f_stop: threading.Event) -> None:
300
- """Periodically check if more actors can be added to the pool.
301
-
302
- If so, extend the pool.
303
- """
304
- if not f_stop.is_set():
305
- num_max_actors = pool_size_from_resources(client_resources)
306
- if num_max_actors > pool.num_actors:
307
- num_new = num_max_actors - pool.num_actors
308
- log(
309
- INFO, "The cluster expanded. Adding %s actors to the pool.", num_new
119
+ # Resolve directory where FABs are installed
120
+ flwr_dir = get_flwr_dir(flwr_dir_)
121
+ log_uploader = None
122
+
123
+ while True:
124
+
125
+ try:
126
+ # Pull SimulationInputs from LinkState
127
+ req = PullSimulationInputsRequest()
128
+ res: PullSimulationInputsResponse = conn._stub.PullSimulationInputs(req)
129
+ if not res.HasField("run"):
130
+ sleep(3)
131
+ run_status = None
132
+ continue
133
+
134
+ context = context_from_proto(res.context)
135
+ run = run_from_proto(res.run)
136
+ fab = fab_from_proto(res.fab)
137
+
138
+ # Start log uploader for this run
139
+ log_uploader = start_log_uploader(
140
+ log_queue=log_queue,
141
+ node_id=context.node_id,
142
+ run_id=run.run_id,
143
+ stub=conn._stub,
144
+ )
145
+
146
+ log(DEBUG, "Simulation process starts FAB installation.")
147
+ install_from_fab(fab.content, flwr_dir=flwr_dir, skip_prompt=True)
148
+
149
+ fab_id, fab_version = get_fab_metadata(fab.content)
150
+
151
+ app_path = get_project_dir(fab_id, fab_version, fab.hash_str, flwr_dir)
152
+ config = get_project_config(app_path)
153
+
154
+ # Get ClientApp and ServerApp components
155
+ app_components = config["tool"]["flwr"]["app"]["components"]
156
+ client_app_attr = app_components["clientapp"]
157
+ server_app_attr = app_components["serverapp"]
158
+ fused_config = get_fused_config_from_dir(app_path, run.override_config)
159
+
160
+ # Update run_config in context
161
+ context.run_config = fused_config
162
+
163
+ log(
164
+ DEBUG,
165
+ "Flower will load ServerApp `%s` in %s",
166
+ server_app_attr,
167
+ app_path,
168
+ )
169
+ log(
170
+ DEBUG,
171
+ "Flower will load ClientApp `%s` in %s",
172
+ client_app_attr,
173
+ app_path,
174
+ )
175
+
176
+ # Change status to Running
177
+ run_status_proto = run_status_to_proto(RunStatus(Status.RUNNING, "", ""))
178
+ conn._stub.UpdateRunStatus(
179
+ UpdateRunStatusRequest(run_id=run.run_id, run_status=run_status_proto)
180
+ )
181
+
182
+ # Pull Federation Options
183
+ fed_opt_res: GetFederationOptionsResponse = conn._stub.GetFederationOptions(
184
+ GetFederationOptionsRequest(run_id=run.run_id)
185
+ )
186
+ federation_options = configs_record_from_proto(
187
+ fed_opt_res.federation_options
188
+ )
189
+
190
+ # Unflatten underlying dict
191
+ fed_opt = unflatten_dict({**federation_options})
192
+
193
+ # Extract configs values of interest
194
+ num_supernodes = fed_opt.get("num-supernodes")
195
+ if num_supernodes is None:
196
+ raise ValueError(
197
+ "Federation options expects `num-supernodes` to be set."
198
+ )
199
+ backend_config: BackendConfig = fed_opt.get("backend", {})
200
+ verbose: bool = fed_opt.get("verbose", False)
201
+ enable_tf_gpu_growth: bool = fed_opt.get("enable_tf_gpu_growth", False)
202
+
203
+ # Launch the simulation
204
+ _run_simulation(
205
+ server_app_attr=server_app_attr,
206
+ client_app_attr=client_app_attr,
207
+ num_supernodes=num_supernodes,
208
+ backend_config=backend_config,
209
+ app_dir=str(app_path),
210
+ run=run,
211
+ enable_tf_gpu_growth=enable_tf_gpu_growth,
212
+ verbose_logging=verbose,
213
+ server_app_run_config=fused_config,
214
+ is_app=True,
215
+ exit_event=EventType.CLI_FLOWER_SIMULATION_LEAVE,
216
+ )
217
+
218
+ # Send resulting context
219
+ context_proto = None # context_to_proto(updated_context)
220
+ out_req = PushSimulationOutputsRequest(
221
+ run_id=run.run_id, context=context_proto
222
+ )
223
+ _ = conn._stub.PushSimulationOutputs(out_req)
224
+
225
+ run_status = RunStatus(Status.FINISHED, SubStatus.COMPLETED, "")
226
+
227
+ except Exception as ex: # pylint: disable=broad-exception-caught
228
+ exc_entity = "Simulation"
229
+ log(ERROR, "%s raised an exception", exc_entity, exc_info=ex)
230
+ run_status = RunStatus(Status.FINISHED, SubStatus.FAILED, str(ex))
231
+
232
+ finally:
233
+ # Stop log uploader for this run and upload final logs
234
+ if log_uploader:
235
+ stop_log_uploader(log_queue, log_uploader)
236
+ log_uploader = None
237
+
238
+ # Update run status
239
+ if run_status:
240
+ run_status_proto = run_status_to_proto(run_status)
241
+ conn._stub.UpdateRunStatus(
242
+ UpdateRunStatusRequest(
243
+ run_id=run.run_id, run_status=run_status_proto
244
+ )
310
245
  )
311
- pool.add_actors_to_pool(num_actors=num_new)
312
246
 
313
- threading.Timer(10, update_resources, [f_stop]).start()
247
+ # Stop the loop if `flwr-simulation` is expected to process a single run
248
+ if run_once:
249
+ break
314
250
 
315
- update_resources(f_stop)
316
251
 
317
- log(
318
- INFO,
319
- "Flower VCE: Creating %s with %s actors",
320
- pool.__class__.__name__,
321
- pool.num_actors,
252
+ def _parse_args_run_flwr_simulation() -> argparse.ArgumentParser:
253
+ """Parse flwr-simulation command line arguments."""
254
+ parser = argparse.ArgumentParser(
255
+ description="Run a Flower Simulation",
322
256
  )
323
-
324
- # Register one RayClientProxy object for each client with the ClientManager
325
- for node_id, partition_id in nodes_mapping.items():
326
- client_proxy = RayActorClientProxy(
327
- client_fn=client_fn,
328
- node_id=node_id,
329
- partition_id=partition_id,
330
- num_partitions=num_clients,
331
- actor_pool=pool,
332
- )
333
- initialized_server.client_manager().register(client=client_proxy)
334
-
335
- hist = History()
336
- # pylint: disable=broad-except
337
- try:
338
- # Start training
339
- hist = run_fl(
340
- server=initialized_server,
341
- config=initialized_config,
342
- )
343
- except Exception as ex:
344
- log(ERROR, ex)
345
- log(ERROR, traceback.format_exc())
346
- log(
347
- ERROR,
348
- "Your simulation crashed :(. This could be because of several reasons. "
349
- "The most common are: "
350
- "\n\t > Sometimes, issues in the simulation code itself can cause crashes. "
351
- "It's always a good idea to double-check your code for any potential bugs "
352
- "or inconsistencies that might be contributing to the problem. "
353
- "For example: "
354
- "\n\t\t - You might be using a class attribute in your clients that "
355
- "hasn't been defined."
356
- "\n\t\t - There could be an incorrect method call to a 3rd party library "
357
- "(e.g., PyTorch)."
358
- "\n\t\t - The return types of methods in your clients/strategies might be "
359
- "incorrect."
360
- "\n\t > Your system couldn't fit a single VirtualClient: try lowering "
361
- "`client_resources`."
362
- "\n\t > All the actors in your pool crashed. This could be because: "
363
- "\n\t\t - You clients hit an out-of-memory (OOM) error and actors couldn't "
364
- "recover from it. Try launching your simulation with more generous "
365
- "`client_resources` setting (i.e. it seems %s is "
366
- "not enough for your run). Use fewer concurrent actors. "
367
- "\n\t\t - You were running a multi-node simulation and all worker nodes "
368
- "disconnected. The head node might still be alive but cannot accommodate "
369
- "any actor with resources: %s."
370
- "\nTake a look at the Flower simulation examples for guidance "
371
- "<https://flower.ai/docs/framework/how-to-run-simulations.html>.",
372
- client_resources,
373
- client_resources,
374
- )
375
- raise RuntimeError("Simulation crashed.") from ex
376
-
377
- finally:
378
- # Stop time monitoring resources in cluster
379
- f_stop.set()
380
- event(EventType.START_SIMULATION_LEAVE)
381
-
382
- return hist
257
+ parser.add_argument(
258
+ "--simulationio-api-address",
259
+ default=SIMULATIONIO_API_DEFAULT_CLIENT_ADDRESS,
260
+ type=str,
261
+ help="Address of SuperLink's SimulationIO API (IPv4, IPv6, or a domain name)."
262
+ f"By default, it is set to {SIMULATIONIO_API_DEFAULT_CLIENT_ADDRESS}.",
263
+ )
264
+ parser.add_argument(
265
+ "--run-once",
266
+ action="store_true",
267
+ help="When set, this process will start a single simulation "
268
+ "for a pending Run. If no pending run the process will exit. ",
269
+ )
270
+ add_args_flwr_app_common(parser=parser)
271
+ return parser