flwr-nightly 1.13.0.dev20241111__py3-none-any.whl → 1.14.0.dev20241126__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of flwr-nightly might be problematic. Click here for more details.
- flwr/cli/app.py +2 -0
- flwr/cli/install.py +0 -16
- flwr/cli/ls.py +228 -0
- flwr/cli/new/new.py +23 -13
- flwr/cli/new/templates/app/README.md.tpl +11 -0
- flwr/cli/new/templates/app/pyproject.baseline.toml.tpl +1 -1
- flwr/cli/new/templates/app/pyproject.flowertune.toml.tpl +1 -1
- flwr/cli/new/templates/app/pyproject.huggingface.toml.tpl +1 -1
- flwr/cli/new/templates/app/pyproject.jax.toml.tpl +1 -1
- flwr/cli/new/templates/app/pyproject.mlx.toml.tpl +1 -1
- flwr/cli/new/templates/app/pyproject.numpy.toml.tpl +1 -1
- flwr/cli/new/templates/app/pyproject.pytorch.toml.tpl +1 -1
- flwr/cli/new/templates/app/pyproject.sklearn.toml.tpl +1 -1
- flwr/cli/new/templates/app/pyproject.tensorflow.toml.tpl +1 -1
- flwr/cli/run/run.py +4 -2
- flwr/client/app.py +50 -14
- flwr/client/clientapp/app.py +40 -23
- flwr/client/grpc_rere_client/connection.py +7 -12
- flwr/client/rest_client/connection.py +4 -14
- flwr/client/supernode/app.py +31 -53
- flwr/common/args.py +85 -16
- flwr/common/constant.py +24 -6
- flwr/common/date.py +18 -0
- flwr/common/grpc.py +4 -1
- flwr/common/serde.py +10 -0
- flwr/common/typing.py +31 -10
- flwr/proto/exec_pb2.py +22 -13
- flwr/proto/exec_pb2.pyi +44 -0
- flwr/proto/exec_pb2_grpc.py +34 -0
- flwr/proto/exec_pb2_grpc.pyi +13 -0
- flwr/proto/run_pb2.py +30 -30
- flwr/proto/run_pb2.pyi +18 -1
- flwr/server/app.py +47 -77
- flwr/server/driver/grpc_driver.py +66 -16
- flwr/server/run_serverapp.py +8 -238
- flwr/server/serverapp/app.py +49 -29
- flwr/server/superlink/fleet/rest_rere/rest_api.py +10 -9
- flwr/server/superlink/linkstate/in_memory_linkstate.py +71 -46
- flwr/server/superlink/linkstate/linkstate.py +19 -5
- flwr/server/superlink/linkstate/sqlite_linkstate.py +81 -113
- flwr/server/superlink/linkstate/utils.py +193 -3
- flwr/simulation/app.py +52 -91
- flwr/simulation/legacy_app.py +21 -1
- flwr/simulation/run_simulation.py +7 -18
- flwr/simulation/simulationio_connection.py +2 -2
- flwr/superexec/deployment.py +12 -6
- flwr/superexec/exec_servicer.py +31 -2
- flwr/superexec/simulation.py +11 -46
- {flwr_nightly-1.13.0.dev20241111.dist-info → flwr_nightly-1.14.0.dev20241126.dist-info}/METADATA +5 -4
- {flwr_nightly-1.13.0.dev20241111.dist-info → flwr_nightly-1.14.0.dev20241126.dist-info}/RECORD +53 -52
- {flwr_nightly-1.13.0.dev20241111.dist-info → flwr_nightly-1.14.0.dev20241126.dist-info}/LICENSE +0 -0
- {flwr_nightly-1.13.0.dev20241111.dist-info → flwr_nightly-1.14.0.dev20241126.dist-info}/WHEEL +0 -0
- {flwr_nightly-1.13.0.dev20241111.dist-info → flwr_nightly-1.14.0.dev20241126.dist-info}/entry_points.txt +0 -0
|
@@ -15,15 +15,23 @@
|
|
|
15
15
|
"""Utility functions for State."""
|
|
16
16
|
|
|
17
17
|
|
|
18
|
+
from logging import ERROR
|
|
18
19
|
from os import urandom
|
|
20
|
+
from typing import Optional, Union
|
|
21
|
+
from uuid import UUID, uuid4
|
|
19
22
|
|
|
20
|
-
from flwr.common import ConfigsRecord, Context, serde
|
|
21
|
-
from flwr.common.constant import Status, SubStatus
|
|
23
|
+
from flwr.common import ConfigsRecord, Context, log, now, serde
|
|
24
|
+
from flwr.common.constant import ErrorCode, Status, SubStatus
|
|
22
25
|
from flwr.common.typing import RunStatus
|
|
23
|
-
from flwr.proto.message_pb2 import Context as ProtoContext # pylint: disable=E0611
|
|
24
26
|
|
|
25
27
|
# pylint: disable=E0611
|
|
28
|
+
from flwr.proto.error_pb2 import Error
|
|
29
|
+
from flwr.proto.message_pb2 import Context as ProtoContext
|
|
30
|
+
from flwr.proto.node_pb2 import Node
|
|
26
31
|
from flwr.proto.recordset_pb2 import ConfigsRecord as ProtoConfigsRecord
|
|
32
|
+
from flwr.proto.task_pb2 import Task, TaskIns, TaskRes
|
|
33
|
+
|
|
34
|
+
# pylint: enable=E0611
|
|
27
35
|
|
|
28
36
|
NODE_UNAVAILABLE_ERROR_REASON = (
|
|
29
37
|
"Error: Node Unavailable - The destination node is currently unavailable. "
|
|
@@ -34,12 +42,22 @@ VALID_RUN_STATUS_TRANSITIONS = {
|
|
|
34
42
|
(Status.PENDING, Status.STARTING),
|
|
35
43
|
(Status.STARTING, Status.RUNNING),
|
|
36
44
|
(Status.RUNNING, Status.FINISHED),
|
|
45
|
+
# Any non-FINISHED status can transition to FINISHED
|
|
46
|
+
(Status.PENDING, Status.FINISHED),
|
|
47
|
+
(Status.STARTING, Status.FINISHED),
|
|
37
48
|
}
|
|
38
49
|
VALID_RUN_SUB_STATUSES = {
|
|
39
50
|
SubStatus.COMPLETED,
|
|
40
51
|
SubStatus.FAILED,
|
|
41
52
|
SubStatus.STOPPED,
|
|
42
53
|
}
|
|
54
|
+
MESSAGE_UNAVAILABLE_ERROR_REASON = (
|
|
55
|
+
"Error: Message Unavailable - The requested message could not be found in the "
|
|
56
|
+
"database. It may have expired due to its TTL or never existed."
|
|
57
|
+
)
|
|
58
|
+
REPLY_MESSAGE_UNAVAILABLE_ERROR_REASON = (
|
|
59
|
+
"Error: Reply Message Unavailable - The reply message has expired."
|
|
60
|
+
)
|
|
43
61
|
|
|
44
62
|
|
|
45
63
|
def generate_rand_int_from_bytes(num_bytes: int) -> int:
|
|
@@ -170,6 +188,14 @@ def is_valid_transition(current_status: RunStatus, new_status: RunStatus) -> boo
|
|
|
170
188
|
bool
|
|
171
189
|
True if the transition is valid, False otherwise.
|
|
172
190
|
"""
|
|
191
|
+
# Transition to FINISHED from a non-RUNNING status is only allowed
|
|
192
|
+
# if the sub-status is not COMPLETED
|
|
193
|
+
if (
|
|
194
|
+
current_status.status in [Status.PENDING, Status.STARTING]
|
|
195
|
+
and new_status.status == Status.FINISHED
|
|
196
|
+
):
|
|
197
|
+
return new_status.sub_status != SubStatus.COMPLETED
|
|
198
|
+
|
|
173
199
|
return (
|
|
174
200
|
current_status.status,
|
|
175
201
|
new_status.status,
|
|
@@ -197,3 +223,167 @@ def has_valid_sub_status(status: RunStatus) -> bool:
|
|
|
197
223
|
if status.status == Status.FINISHED:
|
|
198
224
|
return status.sub_status in VALID_RUN_SUB_STATUSES
|
|
199
225
|
return status.sub_status == ""
|
|
226
|
+
|
|
227
|
+
|
|
228
|
+
def create_taskres_for_unavailable_taskins(taskins_id: Union[str, UUID]) -> TaskRes:
    """Build an error TaskRes reporting that a TaskIns is unavailable.

    Parameters
    ----------
    taskins_id : Union[str, UUID]
        The ID of the TaskIns that could not be served.

    Returns
    -------
    TaskRes
        A TaskRes whose error code is MESSAGE_UNAVAILABLE, signalling that the
        inquired TaskIns ID cannot be found (it never existed or has expired).
    """
    created_at = now().timestamp()
    new_task_id = str(uuid4())

    # The group, run and message type of the missing TaskIns are unknown, so the
    # reply uses neutral placeholder values ("" / 0) and a TTL of zero.
    error_task = Task(
        # This function is only called by SuperLink, and thus it's the producer.
        producer=Node(node_id=0, anonymous=False),
        consumer=Node(node_id=0, anonymous=False),
        created_at=created_at,
        ttl=0,
        # Link the reply to the TaskIns that was asked for
        ancestry=[str(taskins_id)],
        task_type="",
        error=Error(
            code=ErrorCode.MESSAGE_UNAVAILABLE,
            reason=MESSAGE_UNAVAILABLE_ERROR_REASON,
        ),
    )
    return TaskRes(
        task_id=new_task_id,
        group_id="",
        run_id=0,
        task=error_task,
    )
|
|
261
|
+
|
|
262
|
+
|
|
263
|
+
def create_taskres_for_unavailable_taskres(ref_taskins: TaskIns) -> TaskRes:
    """Build an error TaskRes stating that the reply to a TaskIns is unavailable.

    Parameters
    ----------
    ref_taskins : TaskIns
        The TaskIns whose reply can no longer be delivered.

    Returns
    -------
    TaskRes
        A TaskRes with the error code REPLY_MESSAGE_UNAVAILABLE, indicating
        that the original TaskRes has expired. Its group ID, run ID and task
        type are copied from `ref_taskins`.
    """
    ref_task = ref_taskins.task
    created_at = now().timestamp()

    # The reply may only live for whatever is left of the TaskIns' own TTL
    elapsed = created_at - ref_task.created_at
    remaining_ttl = ref_task.ttl - elapsed
    if remaining_ttl < 0:
        log(ERROR, "Creating TaskRes for TaskIns that exceeds its TTL.")
        remaining_ttl = 0

    new_task_id = str(uuid4())
    error_task = Task(
        # This function is only called by SuperLink, and thus it's the producer.
        producer=Node(node_id=0, anonymous=False),
        consumer=Node(node_id=0, anonymous=False),
        created_at=created_at,
        ttl=remaining_ttl,
        # Link the reply to the TaskIns it answers
        ancestry=[ref_taskins.task_id],
        task_type=ref_task.task_type,
        error=Error(
            code=ErrorCode.REPLY_MESSAGE_UNAVAILABLE,
            reason=REPLY_MESSAGE_UNAVAILABLE_ERROR_REASON,
        ),
    )
    return TaskRes(
        task_id=new_task_id,
        group_id=ref_taskins.group_id,
        run_id=ref_taskins.run_id,
        task=error_task,
    )
|
|
300
|
+
|
|
301
|
+
|
|
302
|
+
def has_expired(task_ins_or_res: Union[TaskIns, TaskRes], current_time: float) -> bool:
    """Return True if the TaskIns/TaskRes has outlived its TTL at `current_time`."""
    task = task_ins_or_res.task
    # A message expires strictly after `created_at + ttl`; the boundary itself
    # still counts as alive.
    expires_at = task.ttl + task.created_at
    return expires_at < current_time
|
|
305
|
+
|
|
306
|
+
|
|
307
|
+
def verify_taskins_ids(
    inquired_taskins_ids: set[UUID],
    found_taskins_dict: dict[UUID, TaskIns],
    current_time: Optional[float] = None,
    update_set: bool = True,
) -> dict[UUID, TaskRes]:
    """Verify found TaskIns and generate error TaskRes for invalid ones.

    A TaskIns ID is invalid when it has no entry in `found_taskins_dict` or
    when the corresponding TaskIns has expired at `current_time`.

    Parameters
    ----------
    inquired_taskins_ids : set[UUID]
        Set of TaskIns IDs for which to generate error TaskRes if invalid.
    found_taskins_dict : dict[UUID, TaskIns]
        Dictionary containing all found TaskIns indexed by their IDs.
    current_time : Optional[float] (default: None)
        The current time to check for expiration. If set to `None`, the current time
        will automatically be set to the current timestamp using `now().timestamp()`.
        Any explicit value, including `0.0`, is used as given.
    update_set : bool (default: True)
        If True, `inquired_taskins_ids` is updated in place to remove the
        invalid IDs.

    Returns
    -------
    dict[UUID, TaskRes]
        A dictionary of error TaskRes indexed by the corresponding TaskIns ID.
    """
    ret_dict: dict[UUID, TaskRes] = {}
    # Compare against None explicitly: a truthiness check would silently replace
    # a caller-supplied timestamp of 0.0 with the wall-clock time.
    current = now().timestamp() if current_time is None else current_time
    # Iterate over a snapshot because the set may be mutated inside the loop
    for taskins_id in list(inquired_taskins_ids):
        # Generate error TaskRes if the task_ins doesn't exist or has expired
        taskins = found_taskins_dict.get(taskins_id)
        if taskins is None or has_expired(taskins, current):
            if update_set:
                inquired_taskins_ids.remove(taskins_id)
            taskres = create_taskres_for_unavailable_taskins(taskins_id)
            ret_dict[taskins_id] = taskres
    return ret_dict
|
|
344
|
+
|
|
345
|
+
|
|
346
|
+
def verify_found_taskres(
    inquired_taskins_ids: set[UUID],
    found_taskins_dict: dict[UUID, TaskIns],
    found_taskres_list: list[TaskRes],
    current_time: Optional[float] = None,
    update_set: bool = True,
) -> dict[UUID, TaskRes]:
    """Verify found TaskRes and generate error TaskRes for invalid ones.

    Each TaskRes is matched to its TaskIns through the first entry of its
    `ancestry`. An expired TaskRes is replaced by an error TaskRes built from
    the matching TaskIns. Every returned TaskRes has `delivered_at` set to the
    current time; TaskRes that have not expired are modified in place.

    Parameters
    ----------
    inquired_taskins_ids : set[UUID]
        Set of TaskIns IDs for which to generate error TaskRes if invalid.
    found_taskins_dict : dict[UUID, TaskIns]
        Dictionary containing all found TaskIns indexed by their IDs.
    found_taskres_list : list[TaskRes]
        List of found TaskRes to be verified.
    current_time : Optional[float] (default: None)
        The current time to check for expiration. If set to `None`, the current time
        will automatically be set to the current timestamp using `now().timestamp()`.
        Any explicit value, including `0.0`, is used as given.
    update_set : bool (default: True)
        If True, `inquired_taskins_ids` is updated in place to remove the IDs
        that have a TaskRes.

    Returns
    -------
    dict[UUID, TaskRes]
        A dictionary of TaskRes indexed by the corresponding TaskIns ID.

    Raises
    ------
    KeyError
        If `update_set` is True and a TaskRes refers to a TaskIns ID that is not
        in `inquired_taskins_ids`, or if an expired TaskRes refers to a TaskIns
        ID that is missing from `found_taskins_dict`.
    """
    ret_dict: dict[UUID, TaskRes] = {}
    # Compare against None explicitly: a truthiness check would silently replace
    # a caller-supplied timestamp of 0.0 with the wall-clock time.
    current = now().timestamp() if current_time is None else current_time
    for taskres in found_taskres_list:
        # The first ancestry entry holds the ID of the TaskIns being answered
        taskins_id = UUID(taskres.task.ancestry[0])
        if update_set:
            inquired_taskins_ids.remove(taskins_id)
        # Check if the TaskRes has expired
        if has_expired(taskres, current):
            # No need to insert the error TaskRes
            taskres = create_taskres_for_unavailable_taskres(
                found_taskins_dict[taskins_id]
            )
        taskres.task.delivered_at = now().isoformat()
        ret_dict[taskins_id] = taskres
    return ret_dict
|
flwr/simulation/app.py
CHANGED
|
@@ -14,12 +14,9 @@
|
|
|
14
14
|
# ==============================================================================
|
|
15
15
|
"""Flower Simulation process."""
|
|
16
16
|
|
|
17
|
-
|
|
18
17
|
import argparse
|
|
19
18
|
import sys
|
|
20
|
-
from logging import DEBUG, ERROR, INFO
|
|
21
|
-
from os.path import isfile
|
|
22
|
-
from pathlib import Path
|
|
19
|
+
from logging import DEBUG, ERROR, INFO
|
|
23
20
|
from queue import Queue
|
|
24
21
|
from time import sleep
|
|
25
22
|
from typing import Optional
|
|
@@ -27,6 +24,7 @@ from typing import Optional
|
|
|
27
24
|
from flwr.cli.config_utils import get_fab_metadata
|
|
28
25
|
from flwr.cli.install import install_from_fab
|
|
29
26
|
from flwr.common import EventType
|
|
27
|
+
from flwr.common.args import add_args_flwr_app_common
|
|
30
28
|
from flwr.common.config import (
|
|
31
29
|
get_flwr_dir,
|
|
32
30
|
get_fused_config_from_dir,
|
|
@@ -34,7 +32,11 @@ from flwr.common.config import (
|
|
|
34
32
|
get_project_dir,
|
|
35
33
|
unflatten_dict,
|
|
36
34
|
)
|
|
37
|
-
from flwr.common.constant import
|
|
35
|
+
from flwr.common.constant import (
|
|
36
|
+
SIMULATIONIO_API_DEFAULT_CLIENT_ADDRESS,
|
|
37
|
+
Status,
|
|
38
|
+
SubStatus,
|
|
39
|
+
)
|
|
38
40
|
from flwr.common.logger import (
|
|
39
41
|
log,
|
|
40
42
|
mirror_output_to_queue,
|
|
@@ -71,102 +73,38 @@ def flwr_simulation() -> None:
|
|
|
71
73
|
log_queue: Queue[Optional[str]] = Queue()
|
|
72
74
|
mirror_output_to_queue(log_queue)
|
|
73
75
|
|
|
74
|
-
|
|
75
|
-
description="Run a Flower Simulation",
|
|
76
|
-
)
|
|
77
|
-
parser.add_argument(
|
|
78
|
-
"--superlink",
|
|
79
|
-
type=str,
|
|
80
|
-
help="Address of SuperLink's SimulationIO API",
|
|
81
|
-
)
|
|
82
|
-
parser.add_argument(
|
|
83
|
-
"--run-once",
|
|
84
|
-
action="store_true",
|
|
85
|
-
help="When set, this process will start a single simulation "
|
|
86
|
-
"for a pending Run. If no pending run the process will exit. ",
|
|
87
|
-
)
|
|
88
|
-
parser.add_argument(
|
|
89
|
-
"--flwr-dir",
|
|
90
|
-
default=None,
|
|
91
|
-
help="""The path containing installed Flower Apps.
|
|
92
|
-
By default, this value is equal to:
|
|
93
|
-
|
|
94
|
-
- `$FLWR_HOME/` if `$FLWR_HOME` is defined
|
|
95
|
-
- `$XDG_DATA_HOME/.flwr/` if `$XDG_DATA_HOME` is defined
|
|
96
|
-
- `$HOME/.flwr/` in all other cases
|
|
97
|
-
""",
|
|
98
|
-
)
|
|
99
|
-
parser.add_argument(
|
|
100
|
-
"--insecure",
|
|
101
|
-
action="store_true",
|
|
102
|
-
help="Run the server without HTTPS, regardless of whether certificate "
|
|
103
|
-
"paths are provided. By default, the server runs with HTTPS enabled. "
|
|
104
|
-
"Use this flag only if you understand the risks.",
|
|
105
|
-
)
|
|
106
|
-
parser.add_argument(
|
|
107
|
-
"--root-certificates",
|
|
108
|
-
metavar="ROOT_CERT",
|
|
109
|
-
type=str,
|
|
110
|
-
help="Specifies the path to the PEM-encoded root certificate file for "
|
|
111
|
-
"establishing secure HTTPS connections.",
|
|
112
|
-
)
|
|
113
|
-
args = parser.parse_args()
|
|
76
|
+
args = _parse_args_run_flwr_simulation().parse_args()
|
|
114
77
|
|
|
115
78
|
log(INFO, "Starting Flower Simulation")
|
|
116
|
-
|
|
79
|
+
|
|
80
|
+
if not args.insecure:
|
|
81
|
+
log(
|
|
82
|
+
ERROR,
|
|
83
|
+
"`flwr-simulation` does not support TLS yet. "
|
|
84
|
+
"Please use the '--insecure' flag.",
|
|
85
|
+
)
|
|
86
|
+
sys.exit(1)
|
|
117
87
|
|
|
118
88
|
log(
|
|
119
89
|
DEBUG,
|
|
120
|
-
"
|
|
121
|
-
|
|
90
|
+
"Starting isolated `Simulation` connected to SuperLink SimulationAppIo API "
|
|
91
|
+
"at %s",
|
|
92
|
+
args.simulationio_api_address,
|
|
122
93
|
)
|
|
123
94
|
run_simulation_process(
|
|
124
|
-
|
|
95
|
+
simulationio_api_address=args.simulationio_api_address,
|
|
125
96
|
log_queue=log_queue,
|
|
126
97
|
run_once=args.run_once,
|
|
127
98
|
flwr_dir_=args.flwr_dir,
|
|
128
|
-
certificates=
|
|
99
|
+
certificates=None,
|
|
129
100
|
)
|
|
130
101
|
|
|
131
102
|
# Restore stdout/stderr
|
|
132
103
|
restore_output()
|
|
133
104
|
|
|
134
105
|
|
|
135
|
-
def _try_obtain_certificates(
|
|
136
|
-
args: argparse.Namespace,
|
|
137
|
-
) -> Optional[bytes]:
|
|
138
|
-
|
|
139
|
-
if args.insecure:
|
|
140
|
-
if args.root_certificates is not None:
|
|
141
|
-
sys.exit(
|
|
142
|
-
"Conflicting options: The '--insecure' flag disables HTTPS, "
|
|
143
|
-
"but '--root-certificates' was also specified. Please remove "
|
|
144
|
-
"the '--root-certificates' option when running in insecure mode, "
|
|
145
|
-
"or omit '--insecure' to use HTTPS."
|
|
146
|
-
)
|
|
147
|
-
log(
|
|
148
|
-
WARN,
|
|
149
|
-
"Option `--insecure` was set. Starting insecure HTTP channel to %s.",
|
|
150
|
-
args.superlink,
|
|
151
|
-
)
|
|
152
|
-
root_certificates = None
|
|
153
|
-
else:
|
|
154
|
-
# Load the certificates if provided, or load the system certificates
|
|
155
|
-
if not isfile(args.root_certificates):
|
|
156
|
-
sys.exit("Path argument `--root-certificates` does not point to a file.")
|
|
157
|
-
root_certificates = Path(args.root_certificates).read_bytes()
|
|
158
|
-
log(
|
|
159
|
-
DEBUG,
|
|
160
|
-
"Starting secure HTTPS channel to %s "
|
|
161
|
-
"with the following certificates: %s.",
|
|
162
|
-
args.superlink,
|
|
163
|
-
args.root_certificates,
|
|
164
|
-
)
|
|
165
|
-
return root_certificates
|
|
166
|
-
|
|
167
|
-
|
|
168
106
|
def run_simulation_process( # pylint: disable=R0914, disable=W0212, disable=R0915
|
|
169
|
-
|
|
107
|
+
simulationio_api_address: str,
|
|
170
108
|
log_queue: Queue[Optional[str]],
|
|
171
109
|
run_once: bool,
|
|
172
110
|
flwr_dir_: Optional[str] = None,
|
|
@@ -174,7 +112,7 @@ def run_simulation_process( # pylint: disable=R0914, disable=W0212, disable=R09
|
|
|
174
112
|
) -> None:
|
|
175
113
|
"""Run Flower Simulation process."""
|
|
176
114
|
conn = SimulationIoConnection(
|
|
177
|
-
simulationio_service_address=
|
|
115
|
+
simulationio_service_address=simulationio_api_address,
|
|
178
116
|
root_certificates=certificates,
|
|
179
117
|
)
|
|
180
118
|
|
|
@@ -260,7 +198,7 @@ def run_simulation_process( # pylint: disable=R0914, disable=W0212, disable=R09
|
|
|
260
198
|
)
|
|
261
199
|
backend_config: BackendConfig = fed_opt.get("backend", {})
|
|
262
200
|
verbose: bool = fed_opt.get("verbose", False)
|
|
263
|
-
enable_tf_gpu_growth: bool = fed_opt.get("enable_tf_gpu_growth",
|
|
201
|
+
enable_tf_gpu_growth: bool = fed_opt.get("enable_tf_gpu_growth", False)
|
|
264
202
|
|
|
265
203
|
# Launch the simulation
|
|
266
204
|
_run_simulation(
|
|
@@ -292,6 +230,12 @@ def run_simulation_process( # pylint: disable=R0914, disable=W0212, disable=R09
|
|
|
292
230
|
run_status = RunStatus(Status.FINISHED, SubStatus.FAILED, str(ex))
|
|
293
231
|
|
|
294
232
|
finally:
|
|
233
|
+
# Stop log uploader for this run and upload final logs
|
|
234
|
+
if log_uploader:
|
|
235
|
+
stop_log_uploader(log_queue, log_uploader)
|
|
236
|
+
log_uploader = None
|
|
237
|
+
|
|
238
|
+
# Update run status
|
|
295
239
|
if run_status:
|
|
296
240
|
run_status_proto = run_status_to_proto(run_status)
|
|
297
241
|
conn._stub.UpdateRunStatus(
|
|
@@ -300,11 +244,28 @@ def run_simulation_process( # pylint: disable=R0914, disable=W0212, disable=R09
|
|
|
300
244
|
)
|
|
301
245
|
)
|
|
302
246
|
|
|
303
|
-
# Stop log uploader for this run
|
|
304
|
-
if log_uploader:
|
|
305
|
-
stop_log_uploader(log_queue, log_uploader)
|
|
306
|
-
log_uploader = None
|
|
307
|
-
|
|
308
247
|
# Stop the loop if `flwr-simulation` is expected to process a single run
|
|
309
248
|
if run_once:
|
|
310
249
|
break
|
|
250
|
+
|
|
251
|
+
|
|
252
|
+
def _parse_args_run_flwr_simulation() -> argparse.ArgumentParser:
    """Build the command line argument parser for `flwr-simulation`.

    Returns
    -------
    argparse.ArgumentParser
        A parser that accepts `--simulationio-api-address` and `--run-once`,
        plus whatever options `add_args_flwr_app_common` registers. The caller
        is responsible for invoking `parse_args()` on it.
    """
    parser = argparse.ArgumentParser(
        description="Run a Flower Simulation",
    )
    parser.add_argument(
        "--simulationio-api-address",
        default=SIMULATIONIO_API_DEFAULT_CLIENT_ADDRESS,
        type=str,
        # Keep the trailing space: the two literals are concatenated and would
        # otherwise render as "...name).By default..." in `--help`.
        help="Address of SuperLink's SimulationIO API (IPv4, IPv6, or a domain name). "
        f"By default, it is set to {SIMULATIONIO_API_DEFAULT_CLIENT_ADDRESS}.",
    )
    parser.add_argument(
        "--run-once",
        action="store_true",
        help="When set, this process will start a single simulation "
        "for a pending Run. If no pending run the process will exit. ",
    )
    # Options shared with other `flwr-*` app commands
    # NOTE(review): presumably this adds `--flwr-dir` and `--insecure`, which the
    # caller reads from the parsed namespace; confirm in flwr.common.args.
    add_args_flwr_app_common(parser=parser)
    return parser
|
flwr/simulation/legacy_app.py
CHANGED
|
@@ -30,7 +30,12 @@ from ray.util.scheduling_strategies import NodeAffinitySchedulingStrategy
|
|
|
30
30
|
from flwr.client import ClientFnExt
|
|
31
31
|
from flwr.common import EventType, event
|
|
32
32
|
from flwr.common.constant import NODE_ID_NUM_BYTES
|
|
33
|
-
from flwr.common.logger import
|
|
33
|
+
from flwr.common.logger import (
|
|
34
|
+
log,
|
|
35
|
+
set_logger_propagation,
|
|
36
|
+
warn_deprecated_feature,
|
|
37
|
+
warn_unsupported_feature,
|
|
38
|
+
)
|
|
34
39
|
from flwr.server.client_manager import ClientManager
|
|
35
40
|
from flwr.server.history import History
|
|
36
41
|
from flwr.server.server import Server, init_defaults, run_fl
|
|
@@ -108,6 +113,11 @@ def start_simulation(
|
|
|
108
113
|
) -> History:
|
|
109
114
|
"""Start a Ray-based Flower simulation server.
|
|
110
115
|
|
|
116
|
+
Warning
|
|
117
|
+
-------
|
|
118
|
+
This function is deprecated since 1.13.0. Use :code: `flwr run` to start a Flower
|
|
119
|
+
simulation.
|
|
120
|
+
|
|
111
121
|
Parameters
|
|
112
122
|
----------
|
|
113
123
|
client_fn : ClientFnExt
|
|
@@ -183,6 +193,16 @@ def start_simulation(
|
|
|
183
193
|
Object containing metrics from training.
|
|
184
194
|
""" # noqa: E501
|
|
185
195
|
# pylint: disable-msg=too-many-locals
|
|
196
|
+
msg = (
|
|
197
|
+
"flwr.simulation.start_simulation() is deprecated."
|
|
198
|
+
"\n\tInstead, use the `flwr run` CLI command to start a local simulation "
|
|
199
|
+
"in your Flower app, as shown for example below:"
|
|
200
|
+
"\n\n\t\t$ flwr new # Create a new Flower app from a template"
|
|
201
|
+
"\n\n\t\t$ flwr run # Run the Flower app in Simulation Mode"
|
|
202
|
+
"\n\n\tUsing `start_simulation()` is deprecated."
|
|
203
|
+
)
|
|
204
|
+
warn_deprecated_feature(name=msg)
|
|
205
|
+
|
|
186
206
|
event(
|
|
187
207
|
EventType.START_SIMULATION_ENTER,
|
|
188
208
|
{"num_clients": len(clients_ids) if clients_ids is not None else num_clients},
|
|
@@ -123,13 +123,8 @@ def run_simulation_from_cli() -> None:
|
|
|
123
123
|
fused_config = get_fused_config_from_dir(app_path, override_config)
|
|
124
124
|
|
|
125
125
|
# Create run
|
|
126
|
-
run = Run(
|
|
127
|
-
|
|
128
|
-
fab_id="",
|
|
129
|
-
fab_version="",
|
|
130
|
-
fab_hash="",
|
|
131
|
-
override_config=override_config,
|
|
132
|
-
)
|
|
126
|
+
run = Run.create_empty(run_id)
|
|
127
|
+
run.override_config = override_config
|
|
133
128
|
|
|
134
129
|
_run_simulation(
|
|
135
130
|
server_app_attr=server_app_attr,
|
|
@@ -333,14 +328,10 @@ def _main_loop(
|
|
|
333
328
|
try:
|
|
334
329
|
# Register run
|
|
335
330
|
log(DEBUG, "Pre-registering run with id %s", run.run_id)
|
|
336
|
-
|
|
337
|
-
|
|
338
|
-
|
|
339
|
-
|
|
340
|
-
starting_at=now().isoformat(),
|
|
341
|
-
running_at=now().isoformat(),
|
|
342
|
-
finished_at="",
|
|
343
|
-
)
|
|
331
|
+
run.status = RunStatus(Status.RUNNING, "", "")
|
|
332
|
+
run.starting_at = now().isoformat()
|
|
333
|
+
run.running_at = run.starting_at
|
|
334
|
+
state_factory.state().run_ids[run.run_id] = RunRecord(run=run) # type: ignore
|
|
344
335
|
|
|
345
336
|
if server_app_run_config is None:
|
|
346
337
|
server_app_run_config = {}
|
|
@@ -457,9 +448,7 @@ def _run_simulation(
|
|
|
457
448
|
# If no `Run` object is set, create one
|
|
458
449
|
if run is None:
|
|
459
450
|
run_id = generate_rand_int_from_bytes(RUN_ID_NUM_BYTES)
|
|
460
|
-
run = Run(
|
|
461
|
-
run_id=run_id, fab_id="", fab_version="", fab_hash="", override_config={}
|
|
462
|
-
)
|
|
451
|
+
run = Run.create_empty(run_id=run_id)
|
|
463
452
|
|
|
464
453
|
args = (
|
|
465
454
|
num_supernodes,
|
|
@@ -20,7 +20,7 @@ from typing import Optional, cast
|
|
|
20
20
|
|
|
21
21
|
import grpc
|
|
22
22
|
|
|
23
|
-
from flwr.common.constant import
|
|
23
|
+
from flwr.common.constant import SIMULATIONIO_API_DEFAULT_CLIENT_ADDRESS
|
|
24
24
|
from flwr.common.grpc import create_channel
|
|
25
25
|
from flwr.common.logger import log
|
|
26
26
|
from flwr.proto.simulationio_pb2_grpc import SimulationIoStub # pylint: disable=E0611
|
|
@@ -41,7 +41,7 @@ class SimulationIoConnection:
|
|
|
41
41
|
|
|
42
42
|
def __init__( # pylint: disable=too-many-arguments
|
|
43
43
|
self,
|
|
44
|
-
simulationio_service_address: str =
|
|
44
|
+
simulationio_service_address: str = SIMULATIONIO_API_DEFAULT_CLIENT_ADDRESS,
|
|
45
45
|
root_certificates: Optional[bytes] = None,
|
|
46
46
|
) -> None:
|
|
47
47
|
self._addr = simulationio_service_address
|
flwr/superexec/deployment.py
CHANGED
|
@@ -21,8 +21,13 @@ from typing import Optional
|
|
|
21
21
|
|
|
22
22
|
from typing_extensions import override
|
|
23
23
|
|
|
24
|
+
from flwr.cli.config_utils import get_fab_metadata
|
|
24
25
|
from flwr.common import ConfigsRecord, Context, RecordSet
|
|
25
|
-
from flwr.common.constant import
|
|
26
|
+
from flwr.common.constant import (
|
|
27
|
+
SERVERAPPIO_API_DEFAULT_CLIENT_ADDRESS,
|
|
28
|
+
Status,
|
|
29
|
+
SubStatus,
|
|
30
|
+
)
|
|
26
31
|
from flwr.common.logger import log
|
|
27
32
|
from flwr.common.typing import Fab, RunStatus, UserConfig
|
|
28
33
|
from flwr.server.superlink.ffs import Ffs
|
|
@@ -37,7 +42,7 @@ class DeploymentEngine(Executor):
|
|
|
37
42
|
|
|
38
43
|
Parameters
|
|
39
44
|
----------
|
|
40
|
-
|
|
45
|
+
serverappio_api_address: str (default: "127.0.0.1:9091")
|
|
41
46
|
Address of the SuperLink to connect to.
|
|
42
47
|
root_certificates: Optional[str] (default: None)
|
|
43
48
|
Specifies the path to the PEM-encoded root certificate file for
|
|
@@ -48,11 +53,11 @@ class DeploymentEngine(Executor):
|
|
|
48
53
|
|
|
49
54
|
def __init__(
|
|
50
55
|
self,
|
|
51
|
-
|
|
56
|
+
serverappio_api_address: str = SERVERAPPIO_API_DEFAULT_CLIENT_ADDRESS,
|
|
52
57
|
root_certificates: Optional[str] = None,
|
|
53
58
|
flwr_dir: Optional[str] = None,
|
|
54
59
|
) -> None:
|
|
55
|
-
self.
|
|
60
|
+
self.serverappio_api_address = serverappio_api_address
|
|
56
61
|
if root_certificates is None:
|
|
57
62
|
self.root_certificates = None
|
|
58
63
|
self.root_certificates_bytes = None
|
|
@@ -109,7 +114,7 @@ class DeploymentEngine(Executor):
|
|
|
109
114
|
if superlink_address := config.get("superlink"):
|
|
110
115
|
if not isinstance(superlink_address, str):
|
|
111
116
|
raise ValueError("The `superlink` value should be of type `str`.")
|
|
112
|
-
self.
|
|
117
|
+
self.serverappio_api_address = superlink_address
|
|
113
118
|
if root_certificates := config.get("root-certificates"):
|
|
114
119
|
if not isinstance(root_certificates, str):
|
|
115
120
|
raise ValueError(
|
|
@@ -132,9 +137,10 @@ class DeploymentEngine(Executor):
|
|
|
132
137
|
raise RuntimeError(
|
|
133
138
|
f"FAB ({fab.hash_str}) hash from request doesn't match contents"
|
|
134
139
|
)
|
|
140
|
+
fab_id, fab_version = get_fab_metadata(fab.content)
|
|
135
141
|
|
|
136
142
|
run_id = self.linkstate.create_run(
|
|
137
|
-
|
|
143
|
+
fab_id, fab_version, fab_hash, override_config, ConfigsRecord()
|
|
138
144
|
)
|
|
139
145
|
return run_id
|
|
140
146
|
|
flwr/superexec/exec_servicer.py
CHANGED
|
@@ -22,18 +22,25 @@ from typing import Any
|
|
|
22
22
|
|
|
23
23
|
import grpc
|
|
24
24
|
|
|
25
|
+
from flwr.common import now
|
|
25
26
|
from flwr.common.constant import LOG_STREAM_INTERVAL, Status
|
|
26
27
|
from flwr.common.logger import log
|
|
27
|
-
from flwr.common.serde import
|
|
28
|
+
from flwr.common.serde import (
|
|
29
|
+
configs_record_from_proto,
|
|
30
|
+
run_to_proto,
|
|
31
|
+
user_config_from_proto,
|
|
32
|
+
)
|
|
28
33
|
from flwr.proto import exec_pb2_grpc # pylint: disable=E0611
|
|
29
34
|
from flwr.proto.exec_pb2 import ( # pylint: disable=E0611
|
|
35
|
+
ListRunsRequest,
|
|
36
|
+
ListRunsResponse,
|
|
30
37
|
StartRunRequest,
|
|
31
38
|
StartRunResponse,
|
|
32
39
|
StreamLogsRequest,
|
|
33
40
|
StreamLogsResponse,
|
|
34
41
|
)
|
|
35
42
|
from flwr.server.superlink.ffs.ffs_factory import FfsFactory
|
|
36
|
-
from flwr.server.superlink.linkstate import LinkStateFactory
|
|
43
|
+
from flwr.server.superlink.linkstate import LinkState, LinkStateFactory
|
|
37
44
|
|
|
38
45
|
from .executor import Executor
|
|
39
46
|
|
|
@@ -105,3 +112,25 @@ class ExecServicer(exec_pb2_grpc.ExecServicer):
|
|
|
105
112
|
context.cancel()
|
|
106
113
|
|
|
107
114
|
time.sleep(LOG_STREAM_INTERVAL) # Sleep briefly to avoid busy waiting
|
|
115
|
+
|
|
116
|
+
def ListRuns(
    self, request: ListRunsRequest, context: grpc.ServicerContext
) -> ListRunsResponse:
    """Handle `flwr ls` command.

    When the request carries a `run_id`, only that run is listed
    (`flwr ls --run-id <run_id>`); otherwise every run ID known to the
    link state is listed (`flwr ls --runs`).
    """
    log(INFO, "ExecServicer.List")
    state = self.linkstate_factory.state()

    if request.HasField("run_id"):
        # Handle `flwr ls --run-id <run_id>`
        selected_run_ids = {request.run_id}
    else:
        # Handle `flwr ls --runs`
        selected_run_ids = state.get_run_ids()
    return _create_list_runs_response(selected_run_ids, state)
|
|
128
|
+
|
|
129
|
+
|
|
130
|
+
def _create_list_runs_response(run_ids: set[int], state: LinkState) -> ListRunsResponse:
    """Create response for `flwr ls --runs` and `flwr ls --run-id <run_id>`.

    Every ID in `run_ids` is looked up in `state`. IDs for which the lookup
    yields a falsy result are left out of the response. The response also
    carries the current time in ISO format.
    """
    # Look up every run first, then serialize only the ones that were found
    fetched_runs = [(run_id, state.get_run(run_id)) for run_id in run_ids]

    proto_runs = {}
    for run_id, run in fetched_runs:
        if run:
            proto_runs[run_id] = run_to_proto(run)

    timestamp = now().isoformat()
    return ListRunsResponse(run_dict=proto_runs, now=timestamp)
|