flwr 1.20.0__py3-none-any.whl → 1.22.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- flwr/__init__.py +4 -1
- flwr/app/__init__.py +28 -0
- flwr/app/exception.py +31 -0
- flwr/cli/app.py +2 -0
- flwr/cli/auth_plugin/oidc_cli_plugin.py +4 -4
- flwr/cli/cli_user_auth_interceptor.py +1 -1
- flwr/cli/config_utils.py +3 -3
- flwr/cli/constant.py +25 -8
- flwr/cli/log.py +9 -9
- flwr/cli/login/login.py +3 -3
- flwr/cli/ls.py +5 -5
- flwr/cli/new/new.py +15 -2
- flwr/cli/new/templates/app/README.flowertune.md.tpl +1 -1
- flwr/cli/new/templates/app/code/__init__.pytorch_legacy_api.py.tpl +1 -0
- flwr/cli/new/templates/app/code/client.baseline.py.tpl +64 -47
- flwr/cli/new/templates/app/code/client.huggingface.py.tpl +68 -30
- flwr/cli/new/templates/app/code/client.jax.py.tpl +63 -42
- flwr/cli/new/templates/app/code/client.mlx.py.tpl +80 -51
- flwr/cli/new/templates/app/code/client.numpy.py.tpl +36 -13
- flwr/cli/new/templates/app/code/client.pytorch.py.tpl +71 -46
- flwr/cli/new/templates/app/code/client.pytorch_legacy_api.py.tpl +55 -0
- flwr/cli/new/templates/app/code/client.sklearn.py.tpl +75 -30
- flwr/cli/new/templates/app/code/client.tensorflow.py.tpl +69 -44
- flwr/cli/new/templates/app/code/client.xgboost.py.tpl +110 -0
- flwr/cli/new/templates/app/code/flwr_tune/client_app.py.tpl +56 -90
- flwr/cli/new/templates/app/code/flwr_tune/models.py.tpl +1 -23
- flwr/cli/new/templates/app/code/flwr_tune/server_app.py.tpl +37 -58
- flwr/cli/new/templates/app/code/flwr_tune/strategy.py.tpl +39 -44
- flwr/cli/new/templates/app/code/model.baseline.py.tpl +0 -14
- flwr/cli/new/templates/app/code/server.baseline.py.tpl +27 -29
- flwr/cli/new/templates/app/code/server.huggingface.py.tpl +23 -19
- flwr/cli/new/templates/app/code/server.jax.py.tpl +27 -14
- flwr/cli/new/templates/app/code/server.mlx.py.tpl +29 -19
- flwr/cli/new/templates/app/code/server.numpy.py.tpl +30 -17
- flwr/cli/new/templates/app/code/server.pytorch.py.tpl +36 -26
- flwr/cli/new/templates/app/code/server.pytorch_legacy_api.py.tpl +31 -0
- flwr/cli/new/templates/app/code/server.sklearn.py.tpl +29 -21
- flwr/cli/new/templates/app/code/server.tensorflow.py.tpl +28 -19
- flwr/cli/new/templates/app/code/server.xgboost.py.tpl +56 -0
- flwr/cli/new/templates/app/code/task.huggingface.py.tpl +16 -20
- flwr/cli/new/templates/app/code/task.jax.py.tpl +1 -1
- flwr/cli/new/templates/app/code/task.numpy.py.tpl +1 -1
- flwr/cli/new/templates/app/code/task.pytorch.py.tpl +14 -27
- flwr/cli/new/templates/app/code/task.pytorch_legacy_api.py.tpl +111 -0
- flwr/cli/new/templates/app/code/task.tensorflow.py.tpl +1 -2
- flwr/cli/new/templates/app/code/task.xgboost.py.tpl +67 -0
- flwr/cli/new/templates/app/pyproject.baseline.toml.tpl +4 -4
- flwr/cli/new/templates/app/pyproject.flowertune.toml.tpl +2 -2
- flwr/cli/new/templates/app/pyproject.huggingface.toml.tpl +4 -4
- flwr/cli/new/templates/app/pyproject.jax.toml.tpl +1 -1
- flwr/cli/new/templates/app/pyproject.mlx.toml.tpl +2 -2
- flwr/cli/new/templates/app/pyproject.numpy.toml.tpl +1 -1
- flwr/cli/new/templates/app/pyproject.pytorch.toml.tpl +3 -3
- flwr/cli/new/templates/app/pyproject.pytorch_legacy_api.toml.tpl +53 -0
- flwr/cli/new/templates/app/pyproject.sklearn.toml.tpl +1 -1
- flwr/cli/new/templates/app/pyproject.tensorflow.toml.tpl +1 -1
- flwr/cli/new/templates/app/pyproject.xgboost.toml.tpl +61 -0
- flwr/cli/pull.py +100 -0
- flwr/cli/run/run.py +9 -13
- flwr/cli/stop.py +7 -4
- flwr/cli/utils.py +36 -8
- flwr/client/grpc_rere_client/connection.py +1 -12
- flwr/client/rest_client/connection.py +3 -0
- flwr/clientapp/__init__.py +10 -0
- flwr/clientapp/mod/__init__.py +29 -0
- flwr/clientapp/mod/centraldp_mods.py +248 -0
- flwr/clientapp/mod/localdp_mod.py +169 -0
- flwr/clientapp/typing.py +22 -0
- flwr/common/args.py +20 -6
- flwr/common/auth_plugin/__init__.py +4 -4
- flwr/common/auth_plugin/auth_plugin.py +7 -7
- flwr/common/constant.py +26 -4
- flwr/common/event_log_plugin/event_log_plugin.py +1 -1
- flwr/common/exit/__init__.py +4 -0
- flwr/common/exit/exit.py +8 -1
- flwr/common/exit/exit_code.py +30 -7
- flwr/common/exit/exit_handler.py +62 -0
- flwr/common/{exit_handlers.py → exit/signal_handler.py} +20 -37
- flwr/common/grpc.py +0 -11
- flwr/common/inflatable_utils.py +1 -1
- flwr/common/logger.py +1 -1
- flwr/common/record/typeddict.py +12 -0
- flwr/common/retry_invoker.py +30 -11
- flwr/common/telemetry.py +4 -0
- flwr/compat/server/app.py +2 -2
- flwr/proto/appio_pb2.py +25 -17
- flwr/proto/appio_pb2.pyi +46 -2
- flwr/proto/clientappio_pb2.py +3 -11
- flwr/proto/clientappio_pb2.pyi +0 -47
- flwr/proto/clientappio_pb2_grpc.py +19 -20
- flwr/proto/clientappio_pb2_grpc.pyi +10 -11
- flwr/proto/control_pb2.py +66 -0
- flwr/proto/{exec_pb2.pyi → control_pb2.pyi} +24 -0
- flwr/proto/{exec_pb2_grpc.py → control_pb2_grpc.py} +88 -54
- flwr/proto/control_pb2_grpc.pyi +106 -0
- flwr/proto/serverappio_pb2.py +2 -2
- flwr/proto/serverappio_pb2_grpc.py +68 -0
- flwr/proto/serverappio_pb2_grpc.pyi +26 -0
- flwr/proto/simulationio_pb2.py +4 -11
- flwr/proto/simulationio_pb2.pyi +0 -58
- flwr/proto/simulationio_pb2_grpc.py +129 -27
- flwr/proto/simulationio_pb2_grpc.pyi +52 -13
- flwr/server/app.py +142 -152
- flwr/server/grid/grpc_grid.py +3 -0
- flwr/server/grid/inmemory_grid.py +1 -0
- flwr/server/serverapp/app.py +157 -146
- flwr/server/superlink/fleet/vce/backend/raybackend.py +3 -1
- flwr/server/superlink/fleet/vce/vce_api.py +6 -6
- flwr/server/superlink/linkstate/in_memory_linkstate.py +34 -0
- flwr/server/superlink/linkstate/linkstate.py +2 -1
- flwr/server/superlink/linkstate/sqlite_linkstate.py +45 -0
- flwr/server/superlink/serverappio/serverappio_grpc.py +1 -1
- flwr/server/superlink/serverappio/serverappio_servicer.py +61 -6
- flwr/server/superlink/simulation/simulationio_servicer.py +97 -21
- flwr/serverapp/__init__.py +12 -0
- flwr/serverapp/exception.py +38 -0
- flwr/serverapp/strategy/__init__.py +64 -0
- flwr/serverapp/strategy/bulyan.py +238 -0
- flwr/serverapp/strategy/dp_adaptive_clipping.py +335 -0
- flwr/serverapp/strategy/dp_fixed_clipping.py +374 -0
- flwr/serverapp/strategy/fedadagrad.py +159 -0
- flwr/serverapp/strategy/fedadam.py +178 -0
- flwr/serverapp/strategy/fedavg.py +320 -0
- flwr/serverapp/strategy/fedavgm.py +198 -0
- flwr/serverapp/strategy/fedmedian.py +105 -0
- flwr/serverapp/strategy/fedopt.py +218 -0
- flwr/serverapp/strategy/fedprox.py +174 -0
- flwr/serverapp/strategy/fedtrimmedavg.py +176 -0
- flwr/serverapp/strategy/fedxgb_bagging.py +117 -0
- flwr/serverapp/strategy/fedxgb_cyclic.py +220 -0
- flwr/serverapp/strategy/fedyogi.py +170 -0
- flwr/serverapp/strategy/krum.py +112 -0
- flwr/serverapp/strategy/multikrum.py +247 -0
- flwr/serverapp/strategy/qfedavg.py +252 -0
- flwr/serverapp/strategy/result.py +105 -0
- flwr/serverapp/strategy/strategy.py +285 -0
- flwr/serverapp/strategy/strategy_utils.py +299 -0
- flwr/simulation/app.py +161 -164
- flwr/simulation/run_simulation.py +25 -30
- flwr/supercore/app_utils.py +58 -0
- flwr/{supernode/scheduler → supercore/cli}/__init__.py +3 -3
- flwr/supercore/cli/flower_superexec.py +166 -0
- flwr/supercore/constant.py +19 -0
- flwr/supercore/{scheduler → corestate}/__init__.py +3 -3
- flwr/supercore/corestate/corestate.py +81 -0
- flwr/supercore/grpc_health/__init__.py +3 -0
- flwr/supercore/grpc_health/health_server.py +53 -0
- flwr/supercore/grpc_health/simple_health_servicer.py +2 -2
- flwr/{superexec → supercore/superexec}/__init__.py +1 -1
- flwr/supercore/superexec/plugin/__init__.py +28 -0
- flwr/{supernode/scheduler/simple_clientapp_scheduler_plugin.py → supercore/superexec/plugin/base_exec_plugin.py} +10 -6
- flwr/supercore/superexec/plugin/clientapp_exec_plugin.py +28 -0
- flwr/supercore/{scheduler/plugin.py → superexec/plugin/exec_plugin.py} +15 -5
- flwr/supercore/superexec/plugin/serverapp_exec_plugin.py +28 -0
- flwr/supercore/superexec/plugin/simulation_exec_plugin.py +28 -0
- flwr/supercore/superexec/run_superexec.py +199 -0
- flwr/superlink/artifact_provider/__init__.py +22 -0
- flwr/superlink/artifact_provider/artifact_provider.py +37 -0
- flwr/superlink/servicer/__init__.py +15 -0
- flwr/superlink/servicer/control/__init__.py +22 -0
- flwr/{superexec/exec_event_log_interceptor.py → superlink/servicer/control/control_event_log_interceptor.py} +7 -7
- flwr/{superexec/exec_grpc.py → superlink/servicer/control/control_grpc.py} +27 -29
- flwr/{superexec/exec_license_interceptor.py → superlink/servicer/control/control_license_interceptor.py} +6 -6
- flwr/{superexec/exec_servicer.py → superlink/servicer/control/control_servicer.py} +127 -31
- flwr/{superexec/exec_user_auth_interceptor.py → superlink/servicer/control/control_user_auth_interceptor.py} +10 -10
- flwr/supernode/cli/flower_supernode.py +3 -0
- flwr/supernode/cli/flwr_clientapp.py +18 -21
- flwr/supernode/nodestate/in_memory_nodestate.py +2 -2
- flwr/supernode/nodestate/nodestate.py +3 -59
- flwr/supernode/runtime/run_clientapp.py +39 -102
- flwr/supernode/servicer/clientappio/clientappio_servicer.py +10 -17
- flwr/supernode/start_client_internal.py +35 -76
- {flwr-1.20.0.dist-info → flwr-1.22.0.dist-info}/METADATA +9 -18
- {flwr-1.20.0.dist-info → flwr-1.22.0.dist-info}/RECORD +176 -128
- {flwr-1.20.0.dist-info → flwr-1.22.0.dist-info}/entry_points.txt +1 -0
- flwr/proto/exec_pb2.py +0 -62
- flwr/proto/exec_pb2_grpc.pyi +0 -93
- flwr/superexec/app.py +0 -45
- flwr/superexec/deployment.py +0 -191
- flwr/superexec/executor.py +0 -100
- flwr/superexec/simulation.py +0 -129
- {flwr-1.20.0.dist-info → flwr-1.22.0.dist-info}/WHEEL +0 -0
flwr/simulation/app.py
CHANGED
|
@@ -16,10 +16,8 @@
|
|
|
16
16
|
|
|
17
17
|
|
|
18
18
|
import argparse
|
|
19
|
-
import gc
|
|
20
19
|
from logging import DEBUG, ERROR, INFO
|
|
21
20
|
from queue import Queue
|
|
22
|
-
from time import sleep
|
|
23
21
|
from typing import Optional
|
|
24
22
|
|
|
25
23
|
from flwr.cli.config_utils import get_fab_metadata
|
|
@@ -36,6 +34,7 @@ from flwr.common.config import (
|
|
|
36
34
|
)
|
|
37
35
|
from flwr.common.constant import (
|
|
38
36
|
SIMULATIONIO_API_DEFAULT_CLIENT_ADDRESS,
|
|
37
|
+
ExecPluginType,
|
|
39
38
|
Status,
|
|
40
39
|
SubStatus,
|
|
41
40
|
)
|
|
@@ -57,19 +56,23 @@ from flwr.common.serde import (
|
|
|
57
56
|
run_status_to_proto,
|
|
58
57
|
)
|
|
59
58
|
from flwr.common.typing import RunStatus
|
|
59
|
+
from flwr.proto.appio_pb2 import ( # pylint: disable=E0611
|
|
60
|
+
PullAppInputsRequest,
|
|
61
|
+
PullAppInputsResponse,
|
|
62
|
+
PushAppOutputsRequest,
|
|
63
|
+
)
|
|
60
64
|
from flwr.proto.run_pb2 import ( # pylint: disable=E0611
|
|
61
65
|
GetFederationOptionsRequest,
|
|
62
66
|
GetFederationOptionsResponse,
|
|
63
67
|
UpdateRunStatusRequest,
|
|
64
68
|
)
|
|
65
|
-
from flwr.proto.
|
|
66
|
-
PullSimulationInputsRequest,
|
|
67
|
-
PullSimulationInputsResponse,
|
|
68
|
-
PushSimulationOutputsRequest,
|
|
69
|
-
)
|
|
69
|
+
from flwr.proto.simulationio_pb2_grpc import SimulationIoStub
|
|
70
70
|
from flwr.server.superlink.fleet.vce.backend.backend import BackendConfig
|
|
71
71
|
from flwr.simulation.run_simulation import _run_simulation
|
|
72
72
|
from flwr.simulation.simulationio_connection import SimulationIoConnection
|
|
73
|
+
from flwr.supercore.app_utils import start_parent_process_monitor
|
|
74
|
+
from flwr.supercore.superexec.plugin import SimulationExecPlugin
|
|
75
|
+
from flwr.supercore.superexec.run_superexec import run_with_deprecation_warning
|
|
73
76
|
|
|
74
77
|
|
|
75
78
|
def flwr_simulation() -> None:
|
|
@@ -80,14 +83,27 @@ def flwr_simulation() -> None:
|
|
|
80
83
|
|
|
81
84
|
args = _parse_args_run_flwr_simulation().parse_args()
|
|
82
85
|
|
|
83
|
-
log(INFO, "Starting Flower Simulation")
|
|
84
|
-
|
|
85
86
|
if not args.insecure:
|
|
86
87
|
flwr_exit(
|
|
87
88
|
ExitCode.COMMON_TLS_NOT_SUPPORTED,
|
|
88
|
-
"`flwr-simulation` does not support TLS yet.
|
|
89
|
+
"`flwr-simulation` does not support TLS yet.",
|
|
90
|
+
)
|
|
91
|
+
|
|
92
|
+
# Disallow long-running `flwr-simulation` processes
|
|
93
|
+
if args.token is None:
|
|
94
|
+
run_with_deprecation_warning(
|
|
95
|
+
cmd="flwr-simulation",
|
|
96
|
+
plugin_type=ExecPluginType.SIMULATION,
|
|
97
|
+
plugin_class=SimulationExecPlugin,
|
|
98
|
+
stub_class=SimulationIoStub,
|
|
99
|
+
appio_api_address=args.simulationio_api_address,
|
|
100
|
+
flwr_dir=args.flwr_dir,
|
|
101
|
+
parent_pid=args.parent_pid,
|
|
102
|
+
warn_run_once=args.run_once,
|
|
89
103
|
)
|
|
104
|
+
return
|
|
90
105
|
|
|
106
|
+
log(INFO, "Starting Flower Simulation")
|
|
91
107
|
log(
|
|
92
108
|
DEBUG,
|
|
93
109
|
"Starting isolated `Simulation` connected to SuperLink SimulationAppIo API "
|
|
@@ -97,23 +113,29 @@ def flwr_simulation() -> None:
|
|
|
97
113
|
run_simulation_process(
|
|
98
114
|
simulationio_api_address=args.simulationio_api_address,
|
|
99
115
|
log_queue=log_queue,
|
|
100
|
-
|
|
116
|
+
token=args.token,
|
|
101
117
|
flwr_dir_=args.flwr_dir,
|
|
102
118
|
certificates=None,
|
|
119
|
+
parent_pid=args.parent_pid,
|
|
103
120
|
)
|
|
104
121
|
|
|
105
122
|
# Restore stdout/stderr
|
|
106
123
|
restore_output()
|
|
107
124
|
|
|
108
125
|
|
|
109
|
-
def run_simulation_process( # pylint: disable=R0914,
|
|
126
|
+
def run_simulation_process( # pylint: disable=R0913, R0914, R0915, R0917, W0212
|
|
110
127
|
simulationio_api_address: str,
|
|
111
128
|
log_queue: Queue[Optional[str]],
|
|
112
|
-
|
|
129
|
+
token: str,
|
|
113
130
|
flwr_dir_: Optional[str] = None,
|
|
114
131
|
certificates: Optional[bytes] = None,
|
|
132
|
+
parent_pid: Optional[int] = None,
|
|
115
133
|
) -> None:
|
|
116
134
|
"""Run Flower Simulation process."""
|
|
135
|
+
# Start monitoring the parent process if a PID is provided
|
|
136
|
+
if parent_pid is not None:
|
|
137
|
+
start_parent_process_monitor(parent_pid)
|
|
138
|
+
|
|
117
139
|
conn = SimulationIoConnection(
|
|
118
140
|
simulationio_service_address=simulationio_api_address,
|
|
119
141
|
root_certificates=certificates,
|
|
@@ -123,165 +145,146 @@ def run_simulation_process( # pylint: disable=R0914, disable=W0212, disable=R09
|
|
|
123
145
|
flwr_dir = get_flwr_dir(flwr_dir_)
|
|
124
146
|
log_uploader = None
|
|
125
147
|
heartbeat_sender = None
|
|
148
|
+
run_status = None
|
|
149
|
+
|
|
150
|
+
try:
|
|
151
|
+
# Pull SimulationInputs from LinkState
|
|
152
|
+
req = PullAppInputsRequest(token=token)
|
|
153
|
+
res: PullAppInputsResponse = conn._stub.PullAppInputs(req)
|
|
154
|
+
context = context_from_proto(res.context)
|
|
155
|
+
run = run_from_proto(res.run)
|
|
156
|
+
fab = fab_from_proto(res.fab)
|
|
157
|
+
|
|
158
|
+
# Start log uploader for this run
|
|
159
|
+
log_uploader = start_log_uploader(
|
|
160
|
+
log_queue=log_queue,
|
|
161
|
+
node_id=context.node_id,
|
|
162
|
+
run_id=run.run_id,
|
|
163
|
+
stub=conn._stub,
|
|
164
|
+
)
|
|
126
165
|
|
|
127
|
-
|
|
166
|
+
log(DEBUG, "Simulation process starts FAB installation.")
|
|
167
|
+
install_from_fab(fab.content, flwr_dir=flwr_dir, skip_prompt=True)
|
|
128
168
|
|
|
129
|
-
|
|
130
|
-
# Pull SimulationInputs from LinkState
|
|
131
|
-
req = PullSimulationInputsRequest()
|
|
132
|
-
res: PullSimulationInputsResponse = conn._stub.PullSimulationInputs(req)
|
|
133
|
-
if not res.HasField("run"):
|
|
134
|
-
sleep(3)
|
|
135
|
-
run_status = None
|
|
136
|
-
continue
|
|
137
|
-
|
|
138
|
-
context = context_from_proto(res.context)
|
|
139
|
-
run = run_from_proto(res.run)
|
|
140
|
-
fab = fab_from_proto(res.fab)
|
|
141
|
-
|
|
142
|
-
# Start log uploader for this run
|
|
143
|
-
log_uploader = start_log_uploader(
|
|
144
|
-
log_queue=log_queue,
|
|
145
|
-
node_id=context.node_id,
|
|
146
|
-
run_id=run.run_id,
|
|
147
|
-
stub=conn._stub,
|
|
148
|
-
)
|
|
169
|
+
fab_id, fab_version = get_fab_metadata(fab.content)
|
|
149
170
|
|
|
150
|
-
|
|
151
|
-
|
|
171
|
+
app_path = get_project_dir(fab_id, fab_version, fab.hash_str, flwr_dir)
|
|
172
|
+
config = get_project_config(app_path)
|
|
152
173
|
|
|
153
|
-
|
|
174
|
+
# Get ClientApp and SeverApp components
|
|
175
|
+
app_components = config["tool"]["flwr"]["app"]["components"]
|
|
176
|
+
client_app_attr = app_components["clientapp"]
|
|
177
|
+
server_app_attr = app_components["serverapp"]
|
|
178
|
+
fused_config = get_fused_config_from_dir(app_path, run.override_config)
|
|
154
179
|
|
|
155
|
-
|
|
156
|
-
|
|
180
|
+
# Update run_config in context
|
|
181
|
+
context.run_config = fused_config
|
|
157
182
|
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
server_app_attr
|
|
162
|
-
|
|
183
|
+
log(
|
|
184
|
+
DEBUG,
|
|
185
|
+
"Flower will load ServerApp `%s` in %s",
|
|
186
|
+
server_app_attr,
|
|
187
|
+
app_path,
|
|
188
|
+
)
|
|
189
|
+
log(
|
|
190
|
+
DEBUG,
|
|
191
|
+
"Flower will load ClientApp `%s` in %s",
|
|
192
|
+
client_app_attr,
|
|
193
|
+
app_path,
|
|
194
|
+
)
|
|
163
195
|
|
|
164
|
-
|
|
165
|
-
|
|
196
|
+
# Change status to Running
|
|
197
|
+
run_status_proto = run_status_to_proto(RunStatus(Status.RUNNING, "", ""))
|
|
198
|
+
conn._stub.UpdateRunStatus(
|
|
199
|
+
UpdateRunStatusRequest(run_id=run.run_id, run_status=run_status_proto)
|
|
200
|
+
)
|
|
166
201
|
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
|
|
202
|
+
# Pull Federation Options
|
|
203
|
+
fed_opt_res: GetFederationOptionsResponse = conn._stub.GetFederationOptions(
|
|
204
|
+
GetFederationOptionsRequest(run_id=run.run_id)
|
|
205
|
+
)
|
|
206
|
+
federation_options = config_record_from_proto(fed_opt_res.federation_options)
|
|
207
|
+
|
|
208
|
+
# Unflatten underlying dict
|
|
209
|
+
fed_opt = unflatten_dict({**federation_options})
|
|
210
|
+
|
|
211
|
+
# Extract configs values of interest
|
|
212
|
+
num_supernodes = fed_opt.get("num-supernodes")
|
|
213
|
+
if num_supernodes is None:
|
|
214
|
+
raise ValueError("Federation options expects `num-supernodes` to be set.")
|
|
215
|
+
backend_config: BackendConfig = fed_opt.get("backend", {})
|
|
216
|
+
verbose: bool = fed_opt.get("verbose", False)
|
|
217
|
+
enable_tf_gpu_growth: bool = fed_opt.get("enable_tf_gpu_growth", False)
|
|
218
|
+
|
|
219
|
+
event(
|
|
220
|
+
EventType.FLWR_SIMULATION_RUN_ENTER,
|
|
221
|
+
event_details={
|
|
222
|
+
"backend": "ray",
|
|
223
|
+
"num-supernodes": num_supernodes,
|
|
224
|
+
"run-id-hash": get_sha256_hash(run.run_id),
|
|
225
|
+
},
|
|
226
|
+
)
|
|
179
227
|
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
conn._stub
|
|
183
|
-
|
|
184
|
-
|
|
228
|
+
# Set up heartbeat sender
|
|
229
|
+
heartbeat_fn = get_grpc_app_heartbeat_fn(
|
|
230
|
+
conn._stub,
|
|
231
|
+
run.run_id,
|
|
232
|
+
failure_message="Heartbeat failed unexpectedly. The SuperLink could "
|
|
233
|
+
"not find the provided run ID, or the run status is invalid.",
|
|
234
|
+
)
|
|
235
|
+
heartbeat_sender = HeartbeatSender(heartbeat_fn)
|
|
236
|
+
heartbeat_sender.start()
|
|
237
|
+
|
|
238
|
+
# Launch the simulation
|
|
239
|
+
updated_context = _run_simulation(
|
|
240
|
+
server_app_attr=server_app_attr,
|
|
241
|
+
client_app_attr=client_app_attr,
|
|
242
|
+
num_supernodes=num_supernodes,
|
|
243
|
+
backend_config=backend_config,
|
|
244
|
+
app_dir=str(app_path),
|
|
245
|
+
run=run,
|
|
246
|
+
enable_tf_gpu_growth=enable_tf_gpu_growth,
|
|
247
|
+
verbose_logging=verbose,
|
|
248
|
+
server_app_context=context,
|
|
249
|
+
is_app=True,
|
|
250
|
+
exit_event=EventType.FLWR_SIMULATION_RUN_LEAVE,
|
|
251
|
+
)
|
|
185
252
|
|
|
186
|
-
|
|
187
|
-
|
|
188
|
-
|
|
189
|
-
|
|
190
|
-
|
|
191
|
-
|
|
192
|
-
)
|
|
253
|
+
# Send resulting context
|
|
254
|
+
context_proto = context_to_proto(updated_context)
|
|
255
|
+
out_req = PushAppOutputsRequest(
|
|
256
|
+
token=token, run_id=run.run_id, context=context_proto
|
|
257
|
+
)
|
|
258
|
+
_ = conn._stub.PushAppOutputs(out_req)
|
|
193
259
|
|
|
194
|
-
|
|
195
|
-
fed_opt = unflatten_dict({**federation_options})
|
|
196
|
-
|
|
197
|
-
# Extract configs values of interest
|
|
198
|
-
num_supernodes = fed_opt.get("num-supernodes")
|
|
199
|
-
if num_supernodes is None:
|
|
200
|
-
raise ValueError(
|
|
201
|
-
"Federation options expects `num-supernodes` to be set."
|
|
202
|
-
)
|
|
203
|
-
backend_config: BackendConfig = fed_opt.get("backend", {})
|
|
204
|
-
verbose: bool = fed_opt.get("verbose", False)
|
|
205
|
-
enable_tf_gpu_growth: bool = fed_opt.get("enable_tf_gpu_growth", False)
|
|
206
|
-
|
|
207
|
-
event(
|
|
208
|
-
EventType.FLWR_SIMULATION_RUN_ENTER,
|
|
209
|
-
event_details={
|
|
210
|
-
"backend": "ray",
|
|
211
|
-
"num-supernodes": num_supernodes,
|
|
212
|
-
"run-id-hash": get_sha256_hash(run.run_id),
|
|
213
|
-
},
|
|
214
|
-
)
|
|
260
|
+
run_status = RunStatus(Status.FINISHED, SubStatus.COMPLETED, "")
|
|
215
261
|
|
|
216
|
-
|
|
217
|
-
|
|
218
|
-
|
|
219
|
-
|
|
220
|
-
|
|
221
|
-
|
|
222
|
-
|
|
223
|
-
|
|
224
|
-
heartbeat_sender.
|
|
225
|
-
|
|
226
|
-
|
|
227
|
-
|
|
228
|
-
|
|
229
|
-
client_app_attr=client_app_attr,
|
|
230
|
-
num_supernodes=num_supernodes,
|
|
231
|
-
backend_config=backend_config,
|
|
232
|
-
app_dir=str(app_path),
|
|
233
|
-
run=run,
|
|
234
|
-
enable_tf_gpu_growth=enable_tf_gpu_growth,
|
|
235
|
-
verbose_logging=verbose,
|
|
236
|
-
server_app_run_config=fused_config,
|
|
237
|
-
is_app=True,
|
|
238
|
-
exit_event=EventType.FLWR_SIMULATION_RUN_LEAVE,
|
|
239
|
-
)
|
|
262
|
+
except Exception as ex: # pylint: disable=broad-exception-caught
|
|
263
|
+
exc_entity = "Simulation"
|
|
264
|
+
log(ERROR, "%s raised an exception", exc_entity, exc_info=ex)
|
|
265
|
+
run_status = RunStatus(Status.FINISHED, SubStatus.FAILED, str(ex))
|
|
266
|
+
|
|
267
|
+
finally:
|
|
268
|
+
# Stop heartbeat sender
|
|
269
|
+
if heartbeat_sender:
|
|
270
|
+
heartbeat_sender.stop()
|
|
271
|
+
|
|
272
|
+
# Stop log uploader for this run and upload final logs
|
|
273
|
+
if log_uploader:
|
|
274
|
+
stop_log_uploader(log_queue, log_uploader)
|
|
240
275
|
|
|
241
|
-
|
|
242
|
-
|
|
243
|
-
|
|
244
|
-
|
|
276
|
+
# Update run status
|
|
277
|
+
if run_status:
|
|
278
|
+
run_status_proto = run_status_to_proto(run_status)
|
|
279
|
+
conn._stub.UpdateRunStatus(
|
|
280
|
+
UpdateRunStatusRequest(run_id=run.run_id, run_status=run_status_proto)
|
|
245
281
|
)
|
|
246
|
-
|
|
247
|
-
|
|
248
|
-
|
|
249
|
-
|
|
250
|
-
except
|
|
251
|
-
|
|
252
|
-
log(ERROR, "%s raised an exception", exc_entity, exc_info=ex)
|
|
253
|
-
run_status = RunStatus(Status.FINISHED, SubStatus.FAILED, str(ex))
|
|
254
|
-
|
|
255
|
-
finally:
|
|
256
|
-
# Stop heartbeat sender
|
|
257
|
-
if heartbeat_sender:
|
|
258
|
-
heartbeat_sender.stop()
|
|
259
|
-
heartbeat_sender = None
|
|
260
|
-
|
|
261
|
-
# Stop log uploader for this run and upload final logs
|
|
262
|
-
if log_uploader:
|
|
263
|
-
stop_log_uploader(log_queue, log_uploader)
|
|
264
|
-
log_uploader = None
|
|
265
|
-
|
|
266
|
-
# Update run status
|
|
267
|
-
if run_status:
|
|
268
|
-
run_status_proto = run_status_to_proto(run_status)
|
|
269
|
-
conn._stub.UpdateRunStatus(
|
|
270
|
-
UpdateRunStatusRequest(
|
|
271
|
-
run_id=run.run_id, run_status=run_status_proto
|
|
272
|
-
)
|
|
273
|
-
)
|
|
274
|
-
|
|
275
|
-
# Clean up the Context if it exists
|
|
276
|
-
try:
|
|
277
|
-
del updated_context
|
|
278
|
-
except NameError:
|
|
279
|
-
pass
|
|
280
|
-
gc.collect()
|
|
281
|
-
|
|
282
|
-
# Stop the loop if `flwr-simulation` is expected to process a single run
|
|
283
|
-
if run_once:
|
|
284
|
-
break
|
|
282
|
+
|
|
283
|
+
# Clean up the Context if it exists
|
|
284
|
+
try:
|
|
285
|
+
del updated_context
|
|
286
|
+
except NameError:
|
|
287
|
+
pass
|
|
285
288
|
|
|
286
289
|
|
|
287
290
|
def _parse_args_run_flwr_simulation() -> argparse.ArgumentParser:
|
|
@@ -296,11 +299,5 @@ def _parse_args_run_flwr_simulation() -> argparse.ArgumentParser:
|
|
|
296
299
|
help="Address of SuperLink's SimulationIO API (IPv4, IPv6, or a domain name)."
|
|
297
300
|
f"By default, it is set to {SIMULATIONIO_API_DEFAULT_CLIENT_ADDRESS}.",
|
|
298
301
|
)
|
|
299
|
-
parser.add_argument(
|
|
300
|
-
"--run-once",
|
|
301
|
-
action="store_true",
|
|
302
|
-
help="When set, this process will start a single simulation "
|
|
303
|
-
"for a pending Run. If no pending run the process will exit. ",
|
|
304
|
-
)
|
|
305
302
|
add_args_flwr_app_common(parser=parser)
|
|
306
303
|
return parser
|
|
@@ -143,6 +143,15 @@ def run_simulation_from_cli() -> None:
|
|
|
143
143
|
run = Run.create_empty(run_id)
|
|
144
144
|
run.override_config = override_config
|
|
145
145
|
|
|
146
|
+
# Create Context
|
|
147
|
+
server_app_context = Context(
|
|
148
|
+
run_id=run_id,
|
|
149
|
+
node_id=0,
|
|
150
|
+
node_config=UserConfig(),
|
|
151
|
+
state=RecordDict(),
|
|
152
|
+
run_config=fused_config,
|
|
153
|
+
)
|
|
154
|
+
|
|
146
155
|
_ = _run_simulation(
|
|
147
156
|
server_app_attr=server_app_attr,
|
|
148
157
|
client_app_attr=client_app_attr,
|
|
@@ -153,7 +162,7 @@ def run_simulation_from_cli() -> None:
|
|
|
153
162
|
run=run,
|
|
154
163
|
enable_tf_gpu_growth=args.enable_tf_gpu_growth,
|
|
155
164
|
verbose_logging=args.verbose,
|
|
156
|
-
|
|
165
|
+
server_app_context=server_app_context,
|
|
157
166
|
is_app=True,
|
|
158
167
|
exit_event=EventType.CLI_FLOWER_SIMULATION_LEAVE,
|
|
159
168
|
)
|
|
@@ -241,13 +250,12 @@ def run_simulation(
|
|
|
241
250
|
def run_serverapp_th(
|
|
242
251
|
server_app_attr: Optional[str],
|
|
243
252
|
server_app: Optional[ServerApp],
|
|
244
|
-
|
|
253
|
+
server_app_context: Context,
|
|
245
254
|
grid: Grid,
|
|
246
255
|
app_dir: str,
|
|
247
256
|
f_stop: threading.Event,
|
|
248
257
|
has_exception: threading.Event,
|
|
249
258
|
enable_tf_gpu_growth: bool,
|
|
250
|
-
run_id: int,
|
|
251
259
|
ctx_queue: "Queue[Context]",
|
|
252
260
|
) -> threading.Thread:
|
|
253
261
|
"""Run SeverApp in a thread."""
|
|
@@ -258,7 +266,6 @@ def run_serverapp_th(
|
|
|
258
266
|
exception_event: threading.Event,
|
|
259
267
|
_grid: Grid,
|
|
260
268
|
_server_app_dir: str,
|
|
261
|
-
_server_app_run_config: UserConfig,
|
|
262
269
|
_server_app_attr: Optional[str],
|
|
263
270
|
_server_app: Optional[ServerApp],
|
|
264
271
|
_ctx_queue: "Queue[Context]",
|
|
@@ -272,19 +279,10 @@ def run_serverapp_th(
|
|
|
272
279
|
log(INFO, "Enabling GPU growth for Tensorflow on the server thread.")
|
|
273
280
|
enable_gpu_growth()
|
|
274
281
|
|
|
275
|
-
# Initialize Context
|
|
276
|
-
context = Context(
|
|
277
|
-
run_id=run_id,
|
|
278
|
-
node_id=0,
|
|
279
|
-
node_config={},
|
|
280
|
-
state=RecordDict(),
|
|
281
|
-
run_config=_server_app_run_config,
|
|
282
|
-
)
|
|
283
|
-
|
|
284
282
|
# Run ServerApp
|
|
285
283
|
updated_context = _run(
|
|
286
284
|
grid=_grid,
|
|
287
|
-
context=
|
|
285
|
+
context=server_app_context,
|
|
288
286
|
server_app_dir=_server_app_dir,
|
|
289
287
|
server_app_attr=_server_app_attr,
|
|
290
288
|
loaded_server_app=_server_app,
|
|
@@ -310,7 +308,6 @@ def run_serverapp_th(
|
|
|
310
308
|
has_exception,
|
|
311
309
|
grid,
|
|
312
310
|
app_dir,
|
|
313
|
-
server_app_run_config,
|
|
314
311
|
server_app_attr,
|
|
315
312
|
server_app,
|
|
316
313
|
ctx_queue,
|
|
@@ -335,7 +332,7 @@ def _main_loop(
|
|
|
335
332
|
client_app_attr: Optional[str] = None,
|
|
336
333
|
server_app: Optional[ServerApp] = None,
|
|
337
334
|
server_app_attr: Optional[str] = None,
|
|
338
|
-
|
|
335
|
+
server_app_context: Optional[Context] = None,
|
|
339
336
|
) -> Context:
|
|
340
337
|
"""Start ServerApp on a separate thread, then launch Simulation Engine."""
|
|
341
338
|
# Initialize StateFactory
|
|
@@ -346,13 +343,15 @@ def _main_loop(
|
|
|
346
343
|
server_app_thread_has_exception = threading.Event()
|
|
347
344
|
serverapp_th = None
|
|
348
345
|
success = True
|
|
349
|
-
|
|
350
|
-
|
|
351
|
-
|
|
352
|
-
|
|
353
|
-
|
|
354
|
-
|
|
355
|
-
|
|
346
|
+
if server_app_context is None:
|
|
347
|
+
server_app_context = Context(
|
|
348
|
+
run_id=run.run_id,
|
|
349
|
+
node_id=0,
|
|
350
|
+
node_config=UserConfig(),
|
|
351
|
+
state=RecordDict(),
|
|
352
|
+
run_config=UserConfig(),
|
|
353
|
+
)
|
|
354
|
+
updated_context = server_app_context
|
|
356
355
|
try:
|
|
357
356
|
# Register run
|
|
358
357
|
log(DEBUG, "Pre-registering run with id %s", run.run_id)
|
|
@@ -361,9 +360,6 @@ def _main_loop(
|
|
|
361
360
|
run.running_at = run.starting_at
|
|
362
361
|
state_factory.state().run_ids[run.run_id] = RunRecord(run=run) # type: ignore
|
|
363
362
|
|
|
364
|
-
if server_app_run_config is None:
|
|
365
|
-
server_app_run_config = {}
|
|
366
|
-
|
|
367
363
|
# Initialize Grid
|
|
368
364
|
grid = InMemoryGrid(state_factory=state_factory)
|
|
369
365
|
grid.set_run(run_id=run.run_id)
|
|
@@ -373,13 +369,12 @@ def _main_loop(
|
|
|
373
369
|
serverapp_th = run_serverapp_th(
|
|
374
370
|
server_app_attr=server_app_attr,
|
|
375
371
|
server_app=server_app,
|
|
376
|
-
|
|
372
|
+
server_app_context=server_app_context,
|
|
377
373
|
grid=grid,
|
|
378
374
|
app_dir=app_dir,
|
|
379
375
|
f_stop=f_stop,
|
|
380
376
|
has_exception=server_app_thread_has_exception,
|
|
381
377
|
enable_tf_gpu_growth=enable_tf_gpu_growth,
|
|
382
|
-
run_id=run.run_id,
|
|
383
378
|
ctx_queue=output_context_queue,
|
|
384
379
|
)
|
|
385
380
|
|
|
@@ -438,7 +433,7 @@ def _run_simulation(
|
|
|
438
433
|
backend_config: Optional[BackendConfig] = None,
|
|
439
434
|
client_app_attr: Optional[str] = None,
|
|
440
435
|
server_app_attr: Optional[str] = None,
|
|
441
|
-
|
|
436
|
+
server_app_context: Optional[Context] = None,
|
|
442
437
|
app_dir: str = "",
|
|
443
438
|
flwr_dir: Optional[str] = None,
|
|
444
439
|
run: Optional[Run] = None,
|
|
@@ -502,7 +497,7 @@ def _run_simulation(
|
|
|
502
497
|
client_app_attr,
|
|
503
498
|
server_app,
|
|
504
499
|
server_app_attr,
|
|
505
|
-
|
|
500
|
+
server_app_context,
|
|
506
501
|
)
|
|
507
502
|
# Detect if there is an Asyncio event loop already running.
|
|
508
503
|
# If yes, disable logger propagation. In environmnets
|
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
# Copyright 2025 Flower Labs GmbH. All Rights Reserved.
|
|
2
|
+
#
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
#
|
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
+
#
|
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
|
+
# See the License for the specific language governing permissions and
|
|
13
|
+
# limitations under the License.
|
|
14
|
+
# ==============================================================================
|
|
15
|
+
"""Utility functions for app processes."""
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
import os
|
|
19
|
+
import signal
|
|
20
|
+
import threading
|
|
21
|
+
import time
|
|
22
|
+
|
|
23
|
+
if os.name == "nt":
|
|
24
|
+
from ctypes import windll # type: ignore
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def _pid_exists(pid: int) -> bool:
|
|
28
|
+
"""Check if a process with the given PID exists.
|
|
29
|
+
|
|
30
|
+
This works on Unix-like systems and Windows.
|
|
31
|
+
"""
|
|
32
|
+
# Use `ctypes` to check if the process exists on Windows
|
|
33
|
+
if os.name == "nt":
|
|
34
|
+
handle = windll.kernel32.OpenProcess(0x1000, False, pid)
|
|
35
|
+
if handle:
|
|
36
|
+
windll.kernel32.CloseHandle(handle)
|
|
37
|
+
return True
|
|
38
|
+
return False
|
|
39
|
+
# Use `os.kill` on Unix-like systems
|
|
40
|
+
try:
|
|
41
|
+
os.kill(pid, 0)
|
|
42
|
+
except OSError:
|
|
43
|
+
return False
|
|
44
|
+
return True
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
def start_parent_process_monitor(
|
|
48
|
+
parent_pid: int,
|
|
49
|
+
) -> None:
|
|
50
|
+
"""Monitor the parent process and exit if it terminates."""
|
|
51
|
+
|
|
52
|
+
def monitor() -> None:
|
|
53
|
+
while True:
|
|
54
|
+
time.sleep(0.2)
|
|
55
|
+
if not _pid_exists(parent_pid):
|
|
56
|
+
os.kill(os.getpid(), signal.SIGKILL)
|
|
57
|
+
|
|
58
|
+
threading.Thread(target=monitor, daemon=True).start()
|
|
@@ -12,11 +12,11 @@
|
|
|
12
12
|
# See the License for the specific language governing permissions and
|
|
13
13
|
# limitations under the License.
|
|
14
14
|
# ==============================================================================
|
|
15
|
-
"""Flower
|
|
15
|
+
"""Flower command line interface for shared infrastructure components."""
|
|
16
16
|
|
|
17
17
|
|
|
18
|
-
from .
|
|
18
|
+
from .flower_superexec import flower_superexec
|
|
19
19
|
|
|
20
20
|
__all__ = [
|
|
21
|
-
"
|
|
21
|
+
"flower_superexec",
|
|
22
22
|
]
|