flwr 1.20.0__py3-none-any.whl → 1.21.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- flwr/__init__.py +4 -1
- flwr/app/__init__.py +28 -0
- flwr/app/exception.py +31 -0
- flwr/cli/auth_plugin/oidc_cli_plugin.py +4 -4
- flwr/cli/cli_user_auth_interceptor.py +1 -1
- flwr/cli/config_utils.py +3 -3
- flwr/cli/constant.py +25 -8
- flwr/cli/log.py +9 -9
- flwr/cli/login/login.py +3 -3
- flwr/cli/ls.py +5 -5
- flwr/cli/new/new.py +11 -0
- flwr/cli/new/templates/app/code/__init__.pytorch_msg_api.py.tpl +1 -0
- flwr/cli/new/templates/app/code/client.pytorch_msg_api.py.tpl +80 -0
- flwr/cli/new/templates/app/code/server.pytorch_msg_api.py.tpl +41 -0
- flwr/cli/new/templates/app/code/task.pytorch_msg_api.py.tpl +98 -0
- flwr/cli/new/templates/app/pyproject.baseline.toml.tpl +1 -1
- flwr/cli/new/templates/app/pyproject.flowertune.toml.tpl +1 -1
- flwr/cli/new/templates/app/pyproject.huggingface.toml.tpl +1 -1
- flwr/cli/new/templates/app/pyproject.jax.toml.tpl +1 -1
- flwr/cli/new/templates/app/pyproject.mlx.toml.tpl +1 -1
- flwr/cli/new/templates/app/pyproject.numpy.toml.tpl +1 -1
- flwr/cli/new/templates/app/pyproject.pytorch.toml.tpl +1 -1
- flwr/cli/new/templates/app/pyproject.pytorch_msg_api.toml.tpl +53 -0
- flwr/cli/new/templates/app/pyproject.sklearn.toml.tpl +1 -1
- flwr/cli/new/templates/app/pyproject.tensorflow.toml.tpl +1 -1
- flwr/cli/run/run.py +9 -13
- flwr/cli/stop.py +7 -4
- flwr/cli/utils.py +19 -8
- flwr/client/grpc_rere_client/connection.py +1 -12
- flwr/client/rest_client/connection.py +3 -0
- flwr/clientapp/__init__.py +10 -0
- flwr/clientapp/mod/__init__.py +26 -0
- flwr/clientapp/mod/centraldp_mods.py +132 -0
- flwr/common/args.py +20 -6
- flwr/common/auth_plugin/__init__.py +4 -4
- flwr/common/auth_plugin/auth_plugin.py +7 -7
- flwr/common/constant.py +23 -4
- flwr/common/event_log_plugin/event_log_plugin.py +1 -1
- flwr/common/exit/__init__.py +4 -0
- flwr/common/exit/exit.py +8 -1
- flwr/common/exit/exit_code.py +26 -7
- flwr/common/exit/exit_handler.py +62 -0
- flwr/common/{exit_handlers.py → exit/signal_handler.py} +20 -37
- flwr/common/grpc.py +0 -11
- flwr/common/inflatable_utils.py +1 -1
- flwr/common/logger.py +1 -1
- flwr/common/retry_invoker.py +30 -11
- flwr/common/telemetry.py +4 -0
- flwr/compat/server/app.py +2 -2
- flwr/proto/appio_pb2.py +25 -17
- flwr/proto/appio_pb2.pyi +46 -2
- flwr/proto/clientappio_pb2.py +3 -11
- flwr/proto/clientappio_pb2.pyi +0 -47
- flwr/proto/clientappio_pb2_grpc.py +19 -20
- flwr/proto/clientappio_pb2_grpc.pyi +10 -11
- flwr/proto/control_pb2.py +62 -0
- flwr/proto/{exec_pb2_grpc.py → control_pb2_grpc.py} +54 -54
- flwr/proto/{exec_pb2_grpc.pyi → control_pb2_grpc.pyi} +28 -28
- flwr/proto/serverappio_pb2.py +2 -2
- flwr/proto/serverappio_pb2_grpc.py +68 -0
- flwr/proto/serverappio_pb2_grpc.pyi +26 -0
- flwr/proto/simulationio_pb2.py +4 -11
- flwr/proto/simulationio_pb2.pyi +0 -58
- flwr/proto/simulationio_pb2_grpc.py +129 -27
- flwr/proto/simulationio_pb2_grpc.pyi +52 -13
- flwr/server/app.py +129 -152
- flwr/server/grid/grpc_grid.py +3 -0
- flwr/server/grid/inmemory_grid.py +1 -0
- flwr/server/serverapp/app.py +157 -146
- flwr/server/superlink/fleet/vce/backend/raybackend.py +3 -1
- flwr/server/superlink/fleet/vce/vce_api.py +6 -6
- flwr/server/superlink/linkstate/in_memory_linkstate.py +34 -0
- flwr/server/superlink/linkstate/linkstate.py +2 -1
- flwr/server/superlink/linkstate/sqlite_linkstate.py +45 -0
- flwr/server/superlink/serverappio/serverappio_grpc.py +1 -1
- flwr/server/superlink/serverappio/serverappio_servicer.py +61 -6
- flwr/server/superlink/simulation/simulationio_servicer.py +97 -21
- flwr/serverapp/__init__.py +12 -0
- flwr/serverapp/dp_fixed_clipping.py +352 -0
- flwr/serverapp/exception.py +38 -0
- flwr/serverapp/strategy/__init__.py +38 -0
- flwr/serverapp/strategy/dp_fixed_clipping.py +352 -0
- flwr/serverapp/strategy/fedadagrad.py +162 -0
- flwr/serverapp/strategy/fedadam.py +181 -0
- flwr/serverapp/strategy/fedavg.py +295 -0
- flwr/serverapp/strategy/fedopt.py +218 -0
- flwr/serverapp/strategy/fedyogi.py +173 -0
- flwr/serverapp/strategy/result.py +105 -0
- flwr/serverapp/strategy/strategy.py +285 -0
- flwr/serverapp/strategy/strategy_utils.py +251 -0
- flwr/serverapp/strategy/strategy_utils_tests.py +304 -0
- flwr/simulation/app.py +161 -164
- flwr/supercore/app_utils.py +58 -0
- flwr/{supernode/scheduler → supercore/cli}/__init__.py +3 -3
- flwr/supercore/cli/flower_superexec.py +141 -0
- flwr/supercore/{scheduler → corestate}/__init__.py +3 -3
- flwr/supercore/corestate/corestate.py +81 -0
- flwr/supercore/grpc_health/__init__.py +3 -0
- flwr/supercore/grpc_health/health_server.py +53 -0
- flwr/supercore/grpc_health/simple_health_servicer.py +2 -2
- flwr/{superexec → supercore/superexec}/__init__.py +1 -1
- flwr/supercore/superexec/plugin/__init__.py +28 -0
- flwr/{supernode/scheduler/simple_clientapp_scheduler_plugin.py → supercore/superexec/plugin/base_exec_plugin.py} +10 -6
- flwr/supercore/superexec/plugin/clientapp_exec_plugin.py +28 -0
- flwr/supercore/{scheduler/plugin.py → superexec/plugin/exec_plugin.py} +4 -4
- flwr/supercore/superexec/plugin/serverapp_exec_plugin.py +28 -0
- flwr/supercore/superexec/plugin/simulation_exec_plugin.py +28 -0
- flwr/supercore/superexec/run_superexec.py +185 -0
- flwr/superlink/servicer/__init__.py +15 -0
- flwr/superlink/servicer/control/__init__.py +22 -0
- flwr/{superexec/exec_event_log_interceptor.py → superlink/servicer/control/control_event_log_interceptor.py} +7 -7
- flwr/{superexec/exec_grpc.py → superlink/servicer/control/control_grpc.py} +24 -29
- flwr/{superexec/exec_license_interceptor.py → superlink/servicer/control/control_license_interceptor.py} +6 -6
- flwr/{superexec/exec_servicer.py → superlink/servicer/control/control_servicer.py} +69 -30
- flwr/{superexec/exec_user_auth_interceptor.py → superlink/servicer/control/control_user_auth_interceptor.py} +10 -10
- flwr/supernode/cli/flower_supernode.py +3 -0
- flwr/supernode/cli/flwr_clientapp.py +18 -21
- flwr/supernode/nodestate/in_memory_nodestate.py +2 -2
- flwr/supernode/nodestate/nodestate.py +3 -59
- flwr/supernode/runtime/run_clientapp.py +39 -102
- flwr/supernode/servicer/clientappio/clientappio_servicer.py +10 -17
- flwr/supernode/start_client_internal.py +35 -76
- {flwr-1.20.0.dist-info → flwr-1.21.0.dist-info}/METADATA +4 -3
- {flwr-1.20.0.dist-info → flwr-1.21.0.dist-info}/RECORD +127 -98
- {flwr-1.20.0.dist-info → flwr-1.21.0.dist-info}/entry_points.txt +1 -0
- flwr/proto/exec_pb2.py +0 -62
- flwr/superexec/app.py +0 -45
- flwr/superexec/deployment.py +0 -191
- flwr/superexec/executor.py +0 -100
- flwr/superexec/simulation.py +0 -129
- /flwr/proto/{exec_pb2.pyi → control_pb2.pyi} +0 -0
- {flwr-1.20.0.dist-info → flwr-1.21.0.dist-info}/WHEEL +0 -0
flwr/simulation/app.py
CHANGED
|
@@ -16,10 +16,8 @@
|
|
|
16
16
|
|
|
17
17
|
|
|
18
18
|
import argparse
|
|
19
|
-
import gc
|
|
20
19
|
from logging import DEBUG, ERROR, INFO
|
|
21
20
|
from queue import Queue
|
|
22
|
-
from time import sleep
|
|
23
21
|
from typing import Optional
|
|
24
22
|
|
|
25
23
|
from flwr.cli.config_utils import get_fab_metadata
|
|
@@ -36,6 +34,7 @@ from flwr.common.config import (
|
|
|
36
34
|
)
|
|
37
35
|
from flwr.common.constant import (
|
|
38
36
|
SIMULATIONIO_API_DEFAULT_CLIENT_ADDRESS,
|
|
37
|
+
ExecPluginType,
|
|
39
38
|
Status,
|
|
40
39
|
SubStatus,
|
|
41
40
|
)
|
|
@@ -57,19 +56,23 @@ from flwr.common.serde import (
|
|
|
57
56
|
run_status_to_proto,
|
|
58
57
|
)
|
|
59
58
|
from flwr.common.typing import RunStatus
|
|
59
|
+
from flwr.proto.appio_pb2 import ( # pylint: disable=E0611
|
|
60
|
+
PullAppInputsRequest,
|
|
61
|
+
PullAppInputsResponse,
|
|
62
|
+
PushAppOutputsRequest,
|
|
63
|
+
)
|
|
60
64
|
from flwr.proto.run_pb2 import ( # pylint: disable=E0611
|
|
61
65
|
GetFederationOptionsRequest,
|
|
62
66
|
GetFederationOptionsResponse,
|
|
63
67
|
UpdateRunStatusRequest,
|
|
64
68
|
)
|
|
65
|
-
from flwr.proto.
|
|
66
|
-
PullSimulationInputsRequest,
|
|
67
|
-
PullSimulationInputsResponse,
|
|
68
|
-
PushSimulationOutputsRequest,
|
|
69
|
-
)
|
|
69
|
+
from flwr.proto.simulationio_pb2_grpc import SimulationIoStub
|
|
70
70
|
from flwr.server.superlink.fleet.vce.backend.backend import BackendConfig
|
|
71
71
|
from flwr.simulation.run_simulation import _run_simulation
|
|
72
72
|
from flwr.simulation.simulationio_connection import SimulationIoConnection
|
|
73
|
+
from flwr.supercore.app_utils import start_parent_process_monitor
|
|
74
|
+
from flwr.supercore.superexec.plugin import SimulationExecPlugin
|
|
75
|
+
from flwr.supercore.superexec.run_superexec import run_with_deprecation_warning
|
|
73
76
|
|
|
74
77
|
|
|
75
78
|
def flwr_simulation() -> None:
|
|
@@ -80,14 +83,27 @@ def flwr_simulation() -> None:
|
|
|
80
83
|
|
|
81
84
|
args = _parse_args_run_flwr_simulation().parse_args()
|
|
82
85
|
|
|
83
|
-
log(INFO, "Starting Flower Simulation")
|
|
84
|
-
|
|
85
86
|
if not args.insecure:
|
|
86
87
|
flwr_exit(
|
|
87
88
|
ExitCode.COMMON_TLS_NOT_SUPPORTED,
|
|
88
|
-
"`flwr-simulation` does not support TLS yet.
|
|
89
|
+
"`flwr-simulation` does not support TLS yet.",
|
|
90
|
+
)
|
|
91
|
+
|
|
92
|
+
# Disallow long-running `flwr-simulation` processes
|
|
93
|
+
if args.token is None:
|
|
94
|
+
run_with_deprecation_warning(
|
|
95
|
+
cmd="flwr-simulation",
|
|
96
|
+
plugin_type=ExecPluginType.SIMULATION,
|
|
97
|
+
plugin_class=SimulationExecPlugin,
|
|
98
|
+
stub_class=SimulationIoStub,
|
|
99
|
+
appio_api_address=args.simulationio_api_address,
|
|
100
|
+
flwr_dir=args.flwr_dir,
|
|
101
|
+
parent_pid=args.parent_pid,
|
|
102
|
+
warn_run_once=args.run_once,
|
|
89
103
|
)
|
|
104
|
+
return
|
|
90
105
|
|
|
106
|
+
log(INFO, "Starting Flower Simulation")
|
|
91
107
|
log(
|
|
92
108
|
DEBUG,
|
|
93
109
|
"Starting isolated `Simulation` connected to SuperLink SimulationAppIo API "
|
|
@@ -97,23 +113,29 @@ def flwr_simulation() -> None:
|
|
|
97
113
|
run_simulation_process(
|
|
98
114
|
simulationio_api_address=args.simulationio_api_address,
|
|
99
115
|
log_queue=log_queue,
|
|
100
|
-
|
|
116
|
+
token=args.token,
|
|
101
117
|
flwr_dir_=args.flwr_dir,
|
|
102
118
|
certificates=None,
|
|
119
|
+
parent_pid=args.parent_pid,
|
|
103
120
|
)
|
|
104
121
|
|
|
105
122
|
# Restore stdout/stderr
|
|
106
123
|
restore_output()
|
|
107
124
|
|
|
108
125
|
|
|
109
|
-
def run_simulation_process( # pylint: disable=R0914,
|
|
126
|
+
def run_simulation_process( # pylint: disable=R0913, R0914, R0915, R0917, W0212
|
|
110
127
|
simulationio_api_address: str,
|
|
111
128
|
log_queue: Queue[Optional[str]],
|
|
112
|
-
|
|
129
|
+
token: str,
|
|
113
130
|
flwr_dir_: Optional[str] = None,
|
|
114
131
|
certificates: Optional[bytes] = None,
|
|
132
|
+
parent_pid: Optional[int] = None,
|
|
115
133
|
) -> None:
|
|
116
134
|
"""Run Flower Simulation process."""
|
|
135
|
+
# Start monitoring the parent process if a PID is provided
|
|
136
|
+
if parent_pid is not None:
|
|
137
|
+
start_parent_process_monitor(parent_pid)
|
|
138
|
+
|
|
117
139
|
conn = SimulationIoConnection(
|
|
118
140
|
simulationio_service_address=simulationio_api_address,
|
|
119
141
|
root_certificates=certificates,
|
|
@@ -123,165 +145,146 @@ def run_simulation_process( # pylint: disable=R0914, disable=W0212, disable=R09
|
|
|
123
145
|
flwr_dir = get_flwr_dir(flwr_dir_)
|
|
124
146
|
log_uploader = None
|
|
125
147
|
heartbeat_sender = None
|
|
148
|
+
run_status = None
|
|
149
|
+
|
|
150
|
+
try:
|
|
151
|
+
# Pull SimulationInputs from LinkState
|
|
152
|
+
req = PullAppInputsRequest(token=token)
|
|
153
|
+
res: PullAppInputsResponse = conn._stub.PullAppInputs(req)
|
|
154
|
+
context = context_from_proto(res.context)
|
|
155
|
+
run = run_from_proto(res.run)
|
|
156
|
+
fab = fab_from_proto(res.fab)
|
|
157
|
+
|
|
158
|
+
# Start log uploader for this run
|
|
159
|
+
log_uploader = start_log_uploader(
|
|
160
|
+
log_queue=log_queue,
|
|
161
|
+
node_id=context.node_id,
|
|
162
|
+
run_id=run.run_id,
|
|
163
|
+
stub=conn._stub,
|
|
164
|
+
)
|
|
126
165
|
|
|
127
|
-
|
|
166
|
+
log(DEBUG, "Simulation process starts FAB installation.")
|
|
167
|
+
install_from_fab(fab.content, flwr_dir=flwr_dir, skip_prompt=True)
|
|
128
168
|
|
|
129
|
-
|
|
130
|
-
# Pull SimulationInputs from LinkState
|
|
131
|
-
req = PullSimulationInputsRequest()
|
|
132
|
-
res: PullSimulationInputsResponse = conn._stub.PullSimulationInputs(req)
|
|
133
|
-
if not res.HasField("run"):
|
|
134
|
-
sleep(3)
|
|
135
|
-
run_status = None
|
|
136
|
-
continue
|
|
137
|
-
|
|
138
|
-
context = context_from_proto(res.context)
|
|
139
|
-
run = run_from_proto(res.run)
|
|
140
|
-
fab = fab_from_proto(res.fab)
|
|
141
|
-
|
|
142
|
-
# Start log uploader for this run
|
|
143
|
-
log_uploader = start_log_uploader(
|
|
144
|
-
log_queue=log_queue,
|
|
145
|
-
node_id=context.node_id,
|
|
146
|
-
run_id=run.run_id,
|
|
147
|
-
stub=conn._stub,
|
|
148
|
-
)
|
|
169
|
+
fab_id, fab_version = get_fab_metadata(fab.content)
|
|
149
170
|
|
|
150
|
-
|
|
151
|
-
|
|
171
|
+
app_path = get_project_dir(fab_id, fab_version, fab.hash_str, flwr_dir)
|
|
172
|
+
config = get_project_config(app_path)
|
|
152
173
|
|
|
153
|
-
|
|
174
|
+
# Get ClientApp and ServerApp components
|
|
175
|
+
app_components = config["tool"]["flwr"]["app"]["components"]
|
|
176
|
+
client_app_attr = app_components["clientapp"]
|
|
177
|
+
server_app_attr = app_components["serverapp"]
|
|
178
|
+
fused_config = get_fused_config_from_dir(app_path, run.override_config)
|
|
154
179
|
|
|
155
|
-
|
|
156
|
-
|
|
180
|
+
# Update run_config in context
|
|
181
|
+
context.run_config = fused_config
|
|
157
182
|
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
server_app_attr
|
|
162
|
-
|
|
183
|
+
log(
|
|
184
|
+
DEBUG,
|
|
185
|
+
"Flower will load ServerApp `%s` in %s",
|
|
186
|
+
server_app_attr,
|
|
187
|
+
app_path,
|
|
188
|
+
)
|
|
189
|
+
log(
|
|
190
|
+
DEBUG,
|
|
191
|
+
"Flower will load ClientApp `%s` in %s",
|
|
192
|
+
client_app_attr,
|
|
193
|
+
app_path,
|
|
194
|
+
)
|
|
163
195
|
|
|
164
|
-
|
|
165
|
-
|
|
196
|
+
# Change status to Running
|
|
197
|
+
run_status_proto = run_status_to_proto(RunStatus(Status.RUNNING, "", ""))
|
|
198
|
+
conn._stub.UpdateRunStatus(
|
|
199
|
+
UpdateRunStatusRequest(run_id=run.run_id, run_status=run_status_proto)
|
|
200
|
+
)
|
|
166
201
|
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
|
|
202
|
+
# Pull Federation Options
|
|
203
|
+
fed_opt_res: GetFederationOptionsResponse = conn._stub.GetFederationOptions(
|
|
204
|
+
GetFederationOptionsRequest(run_id=run.run_id)
|
|
205
|
+
)
|
|
206
|
+
federation_options = config_record_from_proto(fed_opt_res.federation_options)
|
|
207
|
+
|
|
208
|
+
# Unflatten underlying dict
|
|
209
|
+
fed_opt = unflatten_dict({**federation_options})
|
|
210
|
+
|
|
211
|
+
# Extract configs values of interest
|
|
212
|
+
num_supernodes = fed_opt.get("num-supernodes")
|
|
213
|
+
if num_supernodes is None:
|
|
214
|
+
raise ValueError("Federation options expects `num-supernodes` to be set.")
|
|
215
|
+
backend_config: BackendConfig = fed_opt.get("backend", {})
|
|
216
|
+
verbose: bool = fed_opt.get("verbose", False)
|
|
217
|
+
enable_tf_gpu_growth: bool = fed_opt.get("enable_tf_gpu_growth", False)
|
|
218
|
+
|
|
219
|
+
event(
|
|
220
|
+
EventType.FLWR_SIMULATION_RUN_ENTER,
|
|
221
|
+
event_details={
|
|
222
|
+
"backend": "ray",
|
|
223
|
+
"num-supernodes": num_supernodes,
|
|
224
|
+
"run-id-hash": get_sha256_hash(run.run_id),
|
|
225
|
+
},
|
|
226
|
+
)
|
|
179
227
|
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
conn._stub
|
|
183
|
-
|
|
184
|
-
|
|
228
|
+
# Set up heartbeat sender
|
|
229
|
+
heartbeat_fn = get_grpc_app_heartbeat_fn(
|
|
230
|
+
conn._stub,
|
|
231
|
+
run.run_id,
|
|
232
|
+
failure_message="Heartbeat failed unexpectedly. The SuperLink could "
|
|
233
|
+
"not find the provided run ID, or the run status is invalid.",
|
|
234
|
+
)
|
|
235
|
+
heartbeat_sender = HeartbeatSender(heartbeat_fn)
|
|
236
|
+
heartbeat_sender.start()
|
|
237
|
+
|
|
238
|
+
# Launch the simulation
|
|
239
|
+
updated_context = _run_simulation(
|
|
240
|
+
server_app_attr=server_app_attr,
|
|
241
|
+
client_app_attr=client_app_attr,
|
|
242
|
+
num_supernodes=num_supernodes,
|
|
243
|
+
backend_config=backend_config,
|
|
244
|
+
app_dir=str(app_path),
|
|
245
|
+
run=run,
|
|
246
|
+
enable_tf_gpu_growth=enable_tf_gpu_growth,
|
|
247
|
+
verbose_logging=verbose,
|
|
248
|
+
server_app_run_config=fused_config,
|
|
249
|
+
is_app=True,
|
|
250
|
+
exit_event=EventType.FLWR_SIMULATION_RUN_LEAVE,
|
|
251
|
+
)
|
|
185
252
|
|
|
186
|
-
|
|
187
|
-
|
|
188
|
-
|
|
189
|
-
|
|
190
|
-
|
|
191
|
-
|
|
192
|
-
)
|
|
253
|
+
# Send resulting context
|
|
254
|
+
context_proto = context_to_proto(updated_context)
|
|
255
|
+
out_req = PushAppOutputsRequest(
|
|
256
|
+
token=token, run_id=run.run_id, context=context_proto
|
|
257
|
+
)
|
|
258
|
+
_ = conn._stub.PushAppOutputs(out_req)
|
|
193
259
|
|
|
194
|
-
|
|
195
|
-
fed_opt = unflatten_dict({**federation_options})
|
|
196
|
-
|
|
197
|
-
# Extract configs values of interest
|
|
198
|
-
num_supernodes = fed_opt.get("num-supernodes")
|
|
199
|
-
if num_supernodes is None:
|
|
200
|
-
raise ValueError(
|
|
201
|
-
"Federation options expects `num-supernodes` to be set."
|
|
202
|
-
)
|
|
203
|
-
backend_config: BackendConfig = fed_opt.get("backend", {})
|
|
204
|
-
verbose: bool = fed_opt.get("verbose", False)
|
|
205
|
-
enable_tf_gpu_growth: bool = fed_opt.get("enable_tf_gpu_growth", False)
|
|
206
|
-
|
|
207
|
-
event(
|
|
208
|
-
EventType.FLWR_SIMULATION_RUN_ENTER,
|
|
209
|
-
event_details={
|
|
210
|
-
"backend": "ray",
|
|
211
|
-
"num-supernodes": num_supernodes,
|
|
212
|
-
"run-id-hash": get_sha256_hash(run.run_id),
|
|
213
|
-
},
|
|
214
|
-
)
|
|
260
|
+
run_status = RunStatus(Status.FINISHED, SubStatus.COMPLETED, "")
|
|
215
261
|
|
|
216
|
-
|
|
217
|
-
|
|
218
|
-
|
|
219
|
-
|
|
220
|
-
|
|
221
|
-
|
|
222
|
-
|
|
223
|
-
|
|
224
|
-
heartbeat_sender.
|
|
225
|
-
|
|
226
|
-
|
|
227
|
-
|
|
228
|
-
|
|
229
|
-
client_app_attr=client_app_attr,
|
|
230
|
-
num_supernodes=num_supernodes,
|
|
231
|
-
backend_config=backend_config,
|
|
232
|
-
app_dir=str(app_path),
|
|
233
|
-
run=run,
|
|
234
|
-
enable_tf_gpu_growth=enable_tf_gpu_growth,
|
|
235
|
-
verbose_logging=verbose,
|
|
236
|
-
server_app_run_config=fused_config,
|
|
237
|
-
is_app=True,
|
|
238
|
-
exit_event=EventType.FLWR_SIMULATION_RUN_LEAVE,
|
|
239
|
-
)
|
|
262
|
+
except Exception as ex: # pylint: disable=broad-exception-caught
|
|
263
|
+
exc_entity = "Simulation"
|
|
264
|
+
log(ERROR, "%s raised an exception", exc_entity, exc_info=ex)
|
|
265
|
+
run_status = RunStatus(Status.FINISHED, SubStatus.FAILED, str(ex))
|
|
266
|
+
|
|
267
|
+
finally:
|
|
268
|
+
# Stop heartbeat sender
|
|
269
|
+
if heartbeat_sender:
|
|
270
|
+
heartbeat_sender.stop()
|
|
271
|
+
|
|
272
|
+
# Stop log uploader for this run and upload final logs
|
|
273
|
+
if log_uploader:
|
|
274
|
+
stop_log_uploader(log_queue, log_uploader)
|
|
240
275
|
|
|
241
|
-
|
|
242
|
-
|
|
243
|
-
|
|
244
|
-
|
|
276
|
+
# Update run status
|
|
277
|
+
if run_status:
|
|
278
|
+
run_status_proto = run_status_to_proto(run_status)
|
|
279
|
+
conn._stub.UpdateRunStatus(
|
|
280
|
+
UpdateRunStatusRequest(run_id=run.run_id, run_status=run_status_proto)
|
|
245
281
|
)
|
|
246
|
-
|
|
247
|
-
|
|
248
|
-
|
|
249
|
-
|
|
250
|
-
except
|
|
251
|
-
|
|
252
|
-
log(ERROR, "%s raised an exception", exc_entity, exc_info=ex)
|
|
253
|
-
run_status = RunStatus(Status.FINISHED, SubStatus.FAILED, str(ex))
|
|
254
|
-
|
|
255
|
-
finally:
|
|
256
|
-
# Stop heartbeat sender
|
|
257
|
-
if heartbeat_sender:
|
|
258
|
-
heartbeat_sender.stop()
|
|
259
|
-
heartbeat_sender = None
|
|
260
|
-
|
|
261
|
-
# Stop log uploader for this run and upload final logs
|
|
262
|
-
if log_uploader:
|
|
263
|
-
stop_log_uploader(log_queue, log_uploader)
|
|
264
|
-
log_uploader = None
|
|
265
|
-
|
|
266
|
-
# Update run status
|
|
267
|
-
if run_status:
|
|
268
|
-
run_status_proto = run_status_to_proto(run_status)
|
|
269
|
-
conn._stub.UpdateRunStatus(
|
|
270
|
-
UpdateRunStatusRequest(
|
|
271
|
-
run_id=run.run_id, run_status=run_status_proto
|
|
272
|
-
)
|
|
273
|
-
)
|
|
274
|
-
|
|
275
|
-
# Clean up the Context if it exists
|
|
276
|
-
try:
|
|
277
|
-
del updated_context
|
|
278
|
-
except NameError:
|
|
279
|
-
pass
|
|
280
|
-
gc.collect()
|
|
281
|
-
|
|
282
|
-
# Stop the loop if `flwr-simulation` is expected to process a single run
|
|
283
|
-
if run_once:
|
|
284
|
-
break
|
|
282
|
+
|
|
283
|
+
# Clean up the Context if it exists
|
|
284
|
+
try:
|
|
285
|
+
del updated_context
|
|
286
|
+
except NameError:
|
|
287
|
+
pass
|
|
285
288
|
|
|
286
289
|
|
|
287
290
|
def _parse_args_run_flwr_simulation() -> argparse.ArgumentParser:
|
|
@@ -296,11 +299,5 @@ def _parse_args_run_flwr_simulation() -> argparse.ArgumentParser:
|
|
|
296
299
|
help="Address of SuperLink's SimulationIO API (IPv4, IPv6, or a domain name)."
|
|
297
300
|
f"By default, it is set to {SIMULATIONIO_API_DEFAULT_CLIENT_ADDRESS}.",
|
|
298
301
|
)
|
|
299
|
-
parser.add_argument(
|
|
300
|
-
"--run-once",
|
|
301
|
-
action="store_true",
|
|
302
|
-
help="When set, this process will start a single simulation "
|
|
303
|
-
"for a pending Run. If no pending run the process will exit. ",
|
|
304
|
-
)
|
|
305
302
|
add_args_flwr_app_common(parser=parser)
|
|
306
303
|
return parser
|
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
# Copyright 2025 Flower Labs GmbH. All Rights Reserved.
|
|
2
|
+
#
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
#
|
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
+
#
|
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
|
+
# See the License for the specific language governing permissions and
|
|
13
|
+
# limitations under the License.
|
|
14
|
+
# ==============================================================================
|
|
15
|
+
"""Utility functions for app processes."""
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
import os
|
|
19
|
+
import signal
|
|
20
|
+
import threading
|
|
21
|
+
import time
|
|
22
|
+
|
|
23
|
+
if os.name == "nt":
|
|
24
|
+
from ctypes import windll # type: ignore
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def _pid_exists(pid: int) -> bool:
    """Check if a process with the given PID exists.

    This works on Unix-like systems and Windows.

    Parameters
    ----------
    pid : int
        The process ID to probe.

    Returns
    -------
    bool
        True if a process with this PID currently exists, False otherwise.
    """
    # Use `ctypes` to check if the process exists on Windows. `windll` is
    # imported at module level, and only when `os.name == "nt"`.
    if os.name == "nt":
        # 0x1000 is PROCESS_QUERY_LIMITED_INFORMATION in the Windows API
        handle = windll.kernel32.OpenProcess(0x1000, False, pid)
        if handle:
            # The handle was only needed for the existence check
            windll.kernel32.CloseHandle(handle)
            return True
        return False
    # Use `os.kill` on Unix-like systems. Signal 0 performs the existence and
    # permission checks without delivering a signal.
    try:
        os.kill(pid, 0)
    except PermissionError:
        # The process exists but belongs to another user. Reporting it as
        # missing would make callers believe a live process has terminated.
        return True
    except OSError:
        return False
    return True
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
def start_parent_process_monitor(
    parent_pid: int,
) -> None:
    """Monitor the parent process and exit if it terminates.

    A daemon thread checks every 0.2 seconds whether `parent_pid` still exists
    and kills the current process as soon as it does not.

    Parameters
    ----------
    parent_pid : int
        The PID of the process to watch.
    """
    # `signal.SIGKILL` is not defined on Windows. There, `os.kill` terminates
    # the process unconditionally for any signal other than CTRL_C_EVENT and
    # CTRL_BREAK_EVENT, so SIGTERM is used as the fallback.
    kill_signal = getattr(signal, "SIGKILL", signal.SIGTERM)

    def monitor() -> None:
        while True:
            time.sleep(0.2)
            if not _pid_exists(parent_pid):
                os.kill(os.getpid(), kill_signal)

    # Daemon thread: it must never keep the interpreter alive on its own
    threading.Thread(target=monitor, daemon=True).start()
|
|
@@ -12,11 +12,11 @@
|
|
|
12
12
|
# See the License for the specific language governing permissions and
|
|
13
13
|
# limitations under the License.
|
|
14
14
|
# ==============================================================================
|
|
15
|
-
"""Flower
|
|
15
|
+
"""Flower command line interface for shared infrastructure components."""
|
|
16
16
|
|
|
17
17
|
|
|
18
|
-
from .
|
|
18
|
+
from .flower_superexec import flower_superexec
|
|
19
19
|
|
|
20
20
|
__all__ = [
|
|
21
|
-
"
|
|
21
|
+
"flower_superexec",
|
|
22
22
|
]
|
|
@@ -0,0 +1,141 @@
|
|
|
1
|
+
# Copyright 2025 Flower Labs GmbH. All Rights Reserved.
|
|
2
|
+
#
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
#
|
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
+
#
|
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
|
+
# See the License for the specific language governing permissions and
|
|
13
|
+
# limitations under the License.
|
|
14
|
+
# ==============================================================================
|
|
15
|
+
"""`flower-superexec` command."""
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
import argparse
|
|
19
|
+
from logging import INFO
|
|
20
|
+
from typing import Optional
|
|
21
|
+
|
|
22
|
+
from flwr.common import EventType, event
|
|
23
|
+
from flwr.common.constant import ExecPluginType
|
|
24
|
+
from flwr.common.exit import ExitCode, flwr_exit
|
|
25
|
+
from flwr.common.logger import log
|
|
26
|
+
from flwr.proto.clientappio_pb2_grpc import ClientAppIoStub
|
|
27
|
+
from flwr.proto.serverappio_pb2_grpc import ServerAppIoStub
|
|
28
|
+
from flwr.proto.simulationio_pb2_grpc import SimulationIoStub
|
|
29
|
+
from flwr.supercore.grpc_health import add_args_health
|
|
30
|
+
from flwr.supercore.superexec.plugin import (
|
|
31
|
+
ClientAppExecPlugin,
|
|
32
|
+
ExecPlugin,
|
|
33
|
+
ServerAppExecPlugin,
|
|
34
|
+
SimulationExecPlugin,
|
|
35
|
+
)
|
|
36
|
+
from flwr.supercore.superexec.run_superexec import run_superexec
|
|
37
|
+
|
|
38
|
+
try:
    from flwr.ee.constant import ExecEePluginType
    from flwr.ee.exec_plugin import get_ee_plugin_and_stub_class
except ImportError:
    # `flwr.ee` could not be imported: define stand-ins that expose the same
    # two names but contribute no plugin types and never resolve a plugin.

    class ExecEePluginType:  # type: ignore[no-redef]
        """Fallback for the SuperExec EE plugin types (none available)."""

        @staticmethod
        def all() -> list[str]:
            """Return the list of SuperExec EE plugin types, which is empty."""
            return []

    def get_ee_plugin_and_stub_class(  # pylint: disable=unused-argument
        plugin_type: str,
    ) -> Optional[tuple[type[ExecPlugin], type[object]]]:
        """Look up the EE plugin and stub class for `plugin_type`.

        This fallback ignores its argument and always returns None.
        """
        return None
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
def flower_superexec() -> None:
    """Entry point of the `flower-superexec` command.

    Parses the command line, calls `flwr_exit` with
    `ExitCode.COMMON_TLS_NOT_SUPPORTED` when `--insecure` is not set, emits the
    `RUN_SUPEREXEC_ENTER` telemetry event, and then calls `run_superexec` with
    the plugin and stub classes selected by `--plugin-type`.
    """
    cli_args = _parse_args().parse_args()

    # Only insecure connections are supported for now
    if not cli_args.insecure:
        flwr_exit(
            ExitCode.COMMON_TLS_NOT_SUPPORTED,
            "SuperExec does not support TLS yet.",
        )

    # The first log message comes after argument parsing so that `--help`
    # is handled before anything is logged
    log(INFO, "Starting Flower SuperExec")

    selected_plugin_type = cli_args.plugin_type

    # Trigger telemetry event
    event(EventType.RUN_SUPEREXEC_ENTER, {"plugin_type": selected_plugin_type})

    # Resolve the plugin type into its plugin class and stub class
    plugin_class, stub_class = _get_plugin_and_stub_class(selected_plugin_type)
    run_superexec(
        plugin_class=plugin_class,
        stub_class=stub_class,  # type: ignore
        appio_api_address=cli_args.appio_api_address,
        flwr_dir=cli_args.flwr_dir,
        parent_pid=cli_args.parent_pid,
        health_server_address=cli_args.health_server_address,
    )
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
def _parse_args() -> argparse.ArgumentParser:
    """Build the argument parser for the `flower-superexec` command.

    Returns
    -------
    argparse.ArgumentParser
        A parser with `--appio-api-address`, `--plugin-type`, `--insecure`,
        `--flwr-dir` and `--parent-pid`, plus whatever `add_args_health` adds.
    """
    arg_parser = argparse.ArgumentParser(description="Run Flower SuperExec.")

    # Required options
    arg_parser.add_argument(
        "--appio-api-address",
        type=str,
        required=True,
        help="Address of the AppIO API",
    )
    # Accepted values are the built-in plugin types followed by the EE ones
    plugin_choices = ExecPluginType.all() + ExecEePluginType.all()
    arg_parser.add_argument(
        "--plugin-type",
        type=str,
        choices=plugin_choices,
        required=True,
        help="The type of plugin to use.",
    )

    # Optional flags
    arg_parser.add_argument(
        "--insecure",
        action="store_true",
        help="Connect to the AppIO API without TLS. "
        "Data transmitted between the client and server is not encrypted. "
        "Use this flag only if you understand the risks.",
    )
    # NOTE(review): the whitespace inside this help string is assumed; confirm
    # it against the packaged file.
    arg_parser.add_argument(
        "--flwr-dir",
        default=None,
        help="""The path containing installed Flower Apps.
    By default, this value is equal to:

        - `$FLWR_HOME/` if `$FLWR_HOME` is defined
        - `$XDG_DATA_HOME/.flwr/` if `$XDG_DATA_HOME` is defined
        - `$HOME/.flwr/` in all other cases
    """,
    )
    arg_parser.add_argument(
        "--parent-pid",
        type=int,
        default=None,
        help="The PID of the parent process. When set, the process will terminate "
        "when the parent process exits.",
    )

    # Options contributed by the gRPC health module
    add_args_health(arg_parser)
    return arg_parser
|
|
127
|
+
|
|
128
|
+
|
|
129
|
+
def _get_plugin_and_stub_class(
    plugin_type: str,
) -> tuple[type[ExecPlugin], type[object]]:
    """Resolve a plugin type to its (plugin class, stub class) pair.

    The built-in plugin types are checked first, in the order client app,
    server app, simulation. Any other value is passed to
    `get_ee_plugin_and_stub_class`.

    Parameters
    ----------
    plugin_type : str
        The plugin type to resolve.

    Returns
    -------
    tuple[type[ExecPlugin], type[object]]
        The plugin class and the matching stub class.

    Raises
    ------
    ValueError
        If neither a built-in nor an EE plugin matches `plugin_type`.
    """
    builtin_plugins = (
        (ExecPluginType.CLIENT_APP, ClientAppExecPlugin, ClientAppIoStub),
        (ExecPluginType.SERVER_APP, ServerAppExecPlugin, ServerAppIoStub),
        (ExecPluginType.SIMULATION, SimulationExecPlugin, SimulationIoStub),
    )
    for known_type, plugin_cls, stub_cls in builtin_plugins:
        if plugin_type == known_type:
            return plugin_cls, stub_cls

    # Not a built-in type: use the EE lookup if it returns something truthy
    ee_match = get_ee_plugin_and_stub_class(plugin_type)
    if ee_match:
        return ee_match  # type: ignore[no-any-return]
    raise ValueError(f"Unknown plugin type: {plugin_type}")
|
|
@@ -12,11 +12,11 @@
|
|
|
12
12
|
# See the License for the specific language governing permissions and
|
|
13
13
|
# limitations under the License.
|
|
14
14
|
# ==============================================================================
|
|
15
|
-
"""Flower
|
|
15
|
+
"""Flower CoreState."""
|
|
16
16
|
|
|
17
17
|
|
|
18
|
-
from .
|
|
18
|
+
from .corestate import CoreState
|
|
19
19
|
|
|
20
20
|
__all__ = [
|
|
21
|
-
"
|
|
21
|
+
"CoreState",
|
|
22
22
|
]
|