flwr 1.20.0__py3-none-any.whl → 1.22.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (182) hide show
  1. flwr/__init__.py +4 -1
  2. flwr/app/__init__.py +28 -0
  3. flwr/app/exception.py +31 -0
  4. flwr/cli/app.py +2 -0
  5. flwr/cli/auth_plugin/oidc_cli_plugin.py +4 -4
  6. flwr/cli/cli_user_auth_interceptor.py +1 -1
  7. flwr/cli/config_utils.py +3 -3
  8. flwr/cli/constant.py +25 -8
  9. flwr/cli/log.py +9 -9
  10. flwr/cli/login/login.py +3 -3
  11. flwr/cli/ls.py +5 -5
  12. flwr/cli/new/new.py +15 -2
  13. flwr/cli/new/templates/app/README.flowertune.md.tpl +1 -1
  14. flwr/cli/new/templates/app/code/__init__.pytorch_legacy_api.py.tpl +1 -0
  15. flwr/cli/new/templates/app/code/client.baseline.py.tpl +64 -47
  16. flwr/cli/new/templates/app/code/client.huggingface.py.tpl +68 -30
  17. flwr/cli/new/templates/app/code/client.jax.py.tpl +63 -42
  18. flwr/cli/new/templates/app/code/client.mlx.py.tpl +80 -51
  19. flwr/cli/new/templates/app/code/client.numpy.py.tpl +36 -13
  20. flwr/cli/new/templates/app/code/client.pytorch.py.tpl +71 -46
  21. flwr/cli/new/templates/app/code/client.pytorch_legacy_api.py.tpl +55 -0
  22. flwr/cli/new/templates/app/code/client.sklearn.py.tpl +75 -30
  23. flwr/cli/new/templates/app/code/client.tensorflow.py.tpl +69 -44
  24. flwr/cli/new/templates/app/code/client.xgboost.py.tpl +110 -0
  25. flwr/cli/new/templates/app/code/flwr_tune/client_app.py.tpl +56 -90
  26. flwr/cli/new/templates/app/code/flwr_tune/models.py.tpl +1 -23
  27. flwr/cli/new/templates/app/code/flwr_tune/server_app.py.tpl +37 -58
  28. flwr/cli/new/templates/app/code/flwr_tune/strategy.py.tpl +39 -44
  29. flwr/cli/new/templates/app/code/model.baseline.py.tpl +0 -14
  30. flwr/cli/new/templates/app/code/server.baseline.py.tpl +27 -29
  31. flwr/cli/new/templates/app/code/server.huggingface.py.tpl +23 -19
  32. flwr/cli/new/templates/app/code/server.jax.py.tpl +27 -14
  33. flwr/cli/new/templates/app/code/server.mlx.py.tpl +29 -19
  34. flwr/cli/new/templates/app/code/server.numpy.py.tpl +30 -17
  35. flwr/cli/new/templates/app/code/server.pytorch.py.tpl +36 -26
  36. flwr/cli/new/templates/app/code/server.pytorch_legacy_api.py.tpl +31 -0
  37. flwr/cli/new/templates/app/code/server.sklearn.py.tpl +29 -21
  38. flwr/cli/new/templates/app/code/server.tensorflow.py.tpl +28 -19
  39. flwr/cli/new/templates/app/code/server.xgboost.py.tpl +56 -0
  40. flwr/cli/new/templates/app/code/task.huggingface.py.tpl +16 -20
  41. flwr/cli/new/templates/app/code/task.jax.py.tpl +1 -1
  42. flwr/cli/new/templates/app/code/task.numpy.py.tpl +1 -1
  43. flwr/cli/new/templates/app/code/task.pytorch.py.tpl +14 -27
  44. flwr/cli/new/templates/app/code/task.pytorch_legacy_api.py.tpl +111 -0
  45. flwr/cli/new/templates/app/code/task.tensorflow.py.tpl +1 -2
  46. flwr/cli/new/templates/app/code/task.xgboost.py.tpl +67 -0
  47. flwr/cli/new/templates/app/pyproject.baseline.toml.tpl +4 -4
  48. flwr/cli/new/templates/app/pyproject.flowertune.toml.tpl +2 -2
  49. flwr/cli/new/templates/app/pyproject.huggingface.toml.tpl +4 -4
  50. flwr/cli/new/templates/app/pyproject.jax.toml.tpl +1 -1
  51. flwr/cli/new/templates/app/pyproject.mlx.toml.tpl +2 -2
  52. flwr/cli/new/templates/app/pyproject.numpy.toml.tpl +1 -1
  53. flwr/cli/new/templates/app/pyproject.pytorch.toml.tpl +3 -3
  54. flwr/cli/new/templates/app/pyproject.pytorch_legacy_api.toml.tpl +53 -0
  55. flwr/cli/new/templates/app/pyproject.sklearn.toml.tpl +1 -1
  56. flwr/cli/new/templates/app/pyproject.tensorflow.toml.tpl +1 -1
  57. flwr/cli/new/templates/app/pyproject.xgboost.toml.tpl +61 -0
  58. flwr/cli/pull.py +100 -0
  59. flwr/cli/run/run.py +9 -13
  60. flwr/cli/stop.py +7 -4
  61. flwr/cli/utils.py +36 -8
  62. flwr/client/grpc_rere_client/connection.py +1 -12
  63. flwr/client/rest_client/connection.py +3 -0
  64. flwr/clientapp/__init__.py +10 -0
  65. flwr/clientapp/mod/__init__.py +29 -0
  66. flwr/clientapp/mod/centraldp_mods.py +248 -0
  67. flwr/clientapp/mod/localdp_mod.py +169 -0
  68. flwr/clientapp/typing.py +22 -0
  69. flwr/common/args.py +20 -6
  70. flwr/common/auth_plugin/__init__.py +4 -4
  71. flwr/common/auth_plugin/auth_plugin.py +7 -7
  72. flwr/common/constant.py +26 -4
  73. flwr/common/event_log_plugin/event_log_plugin.py +1 -1
  74. flwr/common/exit/__init__.py +4 -0
  75. flwr/common/exit/exit.py +8 -1
  76. flwr/common/exit/exit_code.py +30 -7
  77. flwr/common/exit/exit_handler.py +62 -0
  78. flwr/common/{exit_handlers.py → exit/signal_handler.py} +20 -37
  79. flwr/common/grpc.py +0 -11
  80. flwr/common/inflatable_utils.py +1 -1
  81. flwr/common/logger.py +1 -1
  82. flwr/common/record/typeddict.py +12 -0
  83. flwr/common/retry_invoker.py +30 -11
  84. flwr/common/telemetry.py +4 -0
  85. flwr/compat/server/app.py +2 -2
  86. flwr/proto/appio_pb2.py +25 -17
  87. flwr/proto/appio_pb2.pyi +46 -2
  88. flwr/proto/clientappio_pb2.py +3 -11
  89. flwr/proto/clientappio_pb2.pyi +0 -47
  90. flwr/proto/clientappio_pb2_grpc.py +19 -20
  91. flwr/proto/clientappio_pb2_grpc.pyi +10 -11
  92. flwr/proto/control_pb2.py +66 -0
  93. flwr/proto/{exec_pb2.pyi → control_pb2.pyi} +24 -0
  94. flwr/proto/{exec_pb2_grpc.py → control_pb2_grpc.py} +88 -54
  95. flwr/proto/control_pb2_grpc.pyi +106 -0
  96. flwr/proto/serverappio_pb2.py +2 -2
  97. flwr/proto/serverappio_pb2_grpc.py +68 -0
  98. flwr/proto/serverappio_pb2_grpc.pyi +26 -0
  99. flwr/proto/simulationio_pb2.py +4 -11
  100. flwr/proto/simulationio_pb2.pyi +0 -58
  101. flwr/proto/simulationio_pb2_grpc.py +129 -27
  102. flwr/proto/simulationio_pb2_grpc.pyi +52 -13
  103. flwr/server/app.py +142 -152
  104. flwr/server/grid/grpc_grid.py +3 -0
  105. flwr/server/grid/inmemory_grid.py +1 -0
  106. flwr/server/serverapp/app.py +157 -146
  107. flwr/server/superlink/fleet/vce/backend/raybackend.py +3 -1
  108. flwr/server/superlink/fleet/vce/vce_api.py +6 -6
  109. flwr/server/superlink/linkstate/in_memory_linkstate.py +34 -0
  110. flwr/server/superlink/linkstate/linkstate.py +2 -1
  111. flwr/server/superlink/linkstate/sqlite_linkstate.py +45 -0
  112. flwr/server/superlink/serverappio/serverappio_grpc.py +1 -1
  113. flwr/server/superlink/serverappio/serverappio_servicer.py +61 -6
  114. flwr/server/superlink/simulation/simulationio_servicer.py +97 -21
  115. flwr/serverapp/__init__.py +12 -0
  116. flwr/serverapp/exception.py +38 -0
  117. flwr/serverapp/strategy/__init__.py +64 -0
  118. flwr/serverapp/strategy/bulyan.py +238 -0
  119. flwr/serverapp/strategy/dp_adaptive_clipping.py +335 -0
  120. flwr/serverapp/strategy/dp_fixed_clipping.py +374 -0
  121. flwr/serverapp/strategy/fedadagrad.py +159 -0
  122. flwr/serverapp/strategy/fedadam.py +178 -0
  123. flwr/serverapp/strategy/fedavg.py +320 -0
  124. flwr/serverapp/strategy/fedavgm.py +198 -0
  125. flwr/serverapp/strategy/fedmedian.py +105 -0
  126. flwr/serverapp/strategy/fedopt.py +218 -0
  127. flwr/serverapp/strategy/fedprox.py +174 -0
  128. flwr/serverapp/strategy/fedtrimmedavg.py +176 -0
  129. flwr/serverapp/strategy/fedxgb_bagging.py +117 -0
  130. flwr/serverapp/strategy/fedxgb_cyclic.py +220 -0
  131. flwr/serverapp/strategy/fedyogi.py +170 -0
  132. flwr/serverapp/strategy/krum.py +112 -0
  133. flwr/serverapp/strategy/multikrum.py +247 -0
  134. flwr/serverapp/strategy/qfedavg.py +252 -0
  135. flwr/serverapp/strategy/result.py +105 -0
  136. flwr/serverapp/strategy/strategy.py +285 -0
  137. flwr/serverapp/strategy/strategy_utils.py +299 -0
  138. flwr/simulation/app.py +161 -164
  139. flwr/simulation/run_simulation.py +25 -30
  140. flwr/supercore/app_utils.py +58 -0
  141. flwr/{supernode/scheduler → supercore/cli}/__init__.py +3 -3
  142. flwr/supercore/cli/flower_superexec.py +166 -0
  143. flwr/supercore/constant.py +19 -0
  144. flwr/supercore/{scheduler → corestate}/__init__.py +3 -3
  145. flwr/supercore/corestate/corestate.py +81 -0
  146. flwr/supercore/grpc_health/__init__.py +3 -0
  147. flwr/supercore/grpc_health/health_server.py +53 -0
  148. flwr/supercore/grpc_health/simple_health_servicer.py +2 -2
  149. flwr/{superexec → supercore/superexec}/__init__.py +1 -1
  150. flwr/supercore/superexec/plugin/__init__.py +28 -0
  151. flwr/{supernode/scheduler/simple_clientapp_scheduler_plugin.py → supercore/superexec/plugin/base_exec_plugin.py} +10 -6
  152. flwr/supercore/superexec/plugin/clientapp_exec_plugin.py +28 -0
  153. flwr/supercore/{scheduler/plugin.py → superexec/plugin/exec_plugin.py} +15 -5
  154. flwr/supercore/superexec/plugin/serverapp_exec_plugin.py +28 -0
  155. flwr/supercore/superexec/plugin/simulation_exec_plugin.py +28 -0
  156. flwr/supercore/superexec/run_superexec.py +199 -0
  157. flwr/superlink/artifact_provider/__init__.py +22 -0
  158. flwr/superlink/artifact_provider/artifact_provider.py +37 -0
  159. flwr/superlink/servicer/__init__.py +15 -0
  160. flwr/superlink/servicer/control/__init__.py +22 -0
  161. flwr/{superexec/exec_event_log_interceptor.py → superlink/servicer/control/control_event_log_interceptor.py} +7 -7
  162. flwr/{superexec/exec_grpc.py → superlink/servicer/control/control_grpc.py} +27 -29
  163. flwr/{superexec/exec_license_interceptor.py → superlink/servicer/control/control_license_interceptor.py} +6 -6
  164. flwr/{superexec/exec_servicer.py → superlink/servicer/control/control_servicer.py} +127 -31
  165. flwr/{superexec/exec_user_auth_interceptor.py → superlink/servicer/control/control_user_auth_interceptor.py} +10 -10
  166. flwr/supernode/cli/flower_supernode.py +3 -0
  167. flwr/supernode/cli/flwr_clientapp.py +18 -21
  168. flwr/supernode/nodestate/in_memory_nodestate.py +2 -2
  169. flwr/supernode/nodestate/nodestate.py +3 -59
  170. flwr/supernode/runtime/run_clientapp.py +39 -102
  171. flwr/supernode/servicer/clientappio/clientappio_servicer.py +10 -17
  172. flwr/supernode/start_client_internal.py +35 -76
  173. {flwr-1.20.0.dist-info → flwr-1.22.0.dist-info}/METADATA +9 -18
  174. {flwr-1.20.0.dist-info → flwr-1.22.0.dist-info}/RECORD +176 -128
  175. {flwr-1.20.0.dist-info → flwr-1.22.0.dist-info}/entry_points.txt +1 -0
  176. flwr/proto/exec_pb2.py +0 -62
  177. flwr/proto/exec_pb2_grpc.pyi +0 -93
  178. flwr/superexec/app.py +0 -45
  179. flwr/superexec/deployment.py +0 -191
  180. flwr/superexec/executor.py +0 -100
  181. flwr/superexec/simulation.py +0 -129
  182. {flwr-1.20.0.dist-info → flwr-1.22.0.dist-info}/WHEEL +0 -0
flwr/simulation/app.py CHANGED
@@ -16,10 +16,8 @@
16
16
 
17
17
 
18
18
  import argparse
19
- import gc
20
19
  from logging import DEBUG, ERROR, INFO
21
20
  from queue import Queue
22
- from time import sleep
23
21
  from typing import Optional
24
22
 
25
23
  from flwr.cli.config_utils import get_fab_metadata
@@ -36,6 +34,7 @@ from flwr.common.config import (
36
34
  )
37
35
  from flwr.common.constant import (
38
36
  SIMULATIONIO_API_DEFAULT_CLIENT_ADDRESS,
37
+ ExecPluginType,
39
38
  Status,
40
39
  SubStatus,
41
40
  )
@@ -57,19 +56,23 @@ from flwr.common.serde import (
57
56
  run_status_to_proto,
58
57
  )
59
58
  from flwr.common.typing import RunStatus
59
+ from flwr.proto.appio_pb2 import ( # pylint: disable=E0611
60
+ PullAppInputsRequest,
61
+ PullAppInputsResponse,
62
+ PushAppOutputsRequest,
63
+ )
60
64
  from flwr.proto.run_pb2 import ( # pylint: disable=E0611
61
65
  GetFederationOptionsRequest,
62
66
  GetFederationOptionsResponse,
63
67
  UpdateRunStatusRequest,
64
68
  )
65
- from flwr.proto.simulationio_pb2 import ( # pylint: disable=E0611
66
- PullSimulationInputsRequest,
67
- PullSimulationInputsResponse,
68
- PushSimulationOutputsRequest,
69
- )
69
+ from flwr.proto.simulationio_pb2_grpc import SimulationIoStub
70
70
  from flwr.server.superlink.fleet.vce.backend.backend import BackendConfig
71
71
  from flwr.simulation.run_simulation import _run_simulation
72
72
  from flwr.simulation.simulationio_connection import SimulationIoConnection
73
+ from flwr.supercore.app_utils import start_parent_process_monitor
74
+ from flwr.supercore.superexec.plugin import SimulationExecPlugin
75
+ from flwr.supercore.superexec.run_superexec import run_with_deprecation_warning
73
76
 
74
77
 
75
78
  def flwr_simulation() -> None:
@@ -80,14 +83,27 @@ def flwr_simulation() -> None:
80
83
 
81
84
  args = _parse_args_run_flwr_simulation().parse_args()
82
85
 
83
- log(INFO, "Starting Flower Simulation")
84
-
85
86
  if not args.insecure:
86
87
  flwr_exit(
87
88
  ExitCode.COMMON_TLS_NOT_SUPPORTED,
88
- "`flwr-simulation` does not support TLS yet. ",
89
+ "`flwr-simulation` does not support TLS yet.",
90
+ )
91
+
92
+ # Disallow long-running `flwr-simulation` processes
93
+ if args.token is None:
94
+ run_with_deprecation_warning(
95
+ cmd="flwr-simulation",
96
+ plugin_type=ExecPluginType.SIMULATION,
97
+ plugin_class=SimulationExecPlugin,
98
+ stub_class=SimulationIoStub,
99
+ appio_api_address=args.simulationio_api_address,
100
+ flwr_dir=args.flwr_dir,
101
+ parent_pid=args.parent_pid,
102
+ warn_run_once=args.run_once,
89
103
  )
104
+ return
90
105
 
106
+ log(INFO, "Starting Flower Simulation")
91
107
  log(
92
108
  DEBUG,
93
109
  "Starting isolated `Simulation` connected to SuperLink SimulationAppIo API "
@@ -97,23 +113,29 @@ def flwr_simulation() -> None:
97
113
  run_simulation_process(
98
114
  simulationio_api_address=args.simulationio_api_address,
99
115
  log_queue=log_queue,
100
- run_once=args.run_once,
116
+ token=args.token,
101
117
  flwr_dir_=args.flwr_dir,
102
118
  certificates=None,
119
+ parent_pid=args.parent_pid,
103
120
  )
104
121
 
105
122
  # Restore stdout/stderr
106
123
  restore_output()
107
124
 
108
125
 
109
- def run_simulation_process( # pylint: disable=R0914, disable=W0212, disable=R0915
126
+ def run_simulation_process( # pylint: disable=R0913, R0914, R0915, R0917, W0212
110
127
  simulationio_api_address: str,
111
128
  log_queue: Queue[Optional[str]],
112
- run_once: bool,
129
+ token: str,
113
130
  flwr_dir_: Optional[str] = None,
114
131
  certificates: Optional[bytes] = None,
132
+ parent_pid: Optional[int] = None,
115
133
  ) -> None:
116
134
  """Run Flower Simulation process."""
135
+ # Start monitoring the parent process if a PID is provided
136
+ if parent_pid is not None:
137
+ start_parent_process_monitor(parent_pid)
138
+
117
139
  conn = SimulationIoConnection(
118
140
  simulationio_service_address=simulationio_api_address,
119
141
  root_certificates=certificates,
@@ -123,165 +145,146 @@ def run_simulation_process( # pylint: disable=R0914, disable=W0212, disable=R09
123
145
  flwr_dir = get_flwr_dir(flwr_dir_)
124
146
  log_uploader = None
125
147
  heartbeat_sender = None
148
+ run_status = None
149
+
150
+ try:
151
+ # Pull SimulationInputs from LinkState
152
+ req = PullAppInputsRequest(token=token)
153
+ res: PullAppInputsResponse = conn._stub.PullAppInputs(req)
154
+ context = context_from_proto(res.context)
155
+ run = run_from_proto(res.run)
156
+ fab = fab_from_proto(res.fab)
157
+
158
+ # Start log uploader for this run
159
+ log_uploader = start_log_uploader(
160
+ log_queue=log_queue,
161
+ node_id=context.node_id,
162
+ run_id=run.run_id,
163
+ stub=conn._stub,
164
+ )
126
165
 
127
- while True:
166
+ log(DEBUG, "Simulation process starts FAB installation.")
167
+ install_from_fab(fab.content, flwr_dir=flwr_dir, skip_prompt=True)
128
168
 
129
- try:
130
- # Pull SimulationInputs from LinkState
131
- req = PullSimulationInputsRequest()
132
- res: PullSimulationInputsResponse = conn._stub.PullSimulationInputs(req)
133
- if not res.HasField("run"):
134
- sleep(3)
135
- run_status = None
136
- continue
137
-
138
- context = context_from_proto(res.context)
139
- run = run_from_proto(res.run)
140
- fab = fab_from_proto(res.fab)
141
-
142
- # Start log uploader for this run
143
- log_uploader = start_log_uploader(
144
- log_queue=log_queue,
145
- node_id=context.node_id,
146
- run_id=run.run_id,
147
- stub=conn._stub,
148
- )
169
+ fab_id, fab_version = get_fab_metadata(fab.content)
149
170
 
150
- log(DEBUG, "Simulation process starts FAB installation.")
151
- install_from_fab(fab.content, flwr_dir=flwr_dir, skip_prompt=True)
171
+ app_path = get_project_dir(fab_id, fab_version, fab.hash_str, flwr_dir)
172
+ config = get_project_config(app_path)
152
173
 
153
- fab_id, fab_version = get_fab_metadata(fab.content)
174
+ # Get ClientApp and SeverApp components
175
+ app_components = config["tool"]["flwr"]["app"]["components"]
176
+ client_app_attr = app_components["clientapp"]
177
+ server_app_attr = app_components["serverapp"]
178
+ fused_config = get_fused_config_from_dir(app_path, run.override_config)
154
179
 
155
- app_path = get_project_dir(fab_id, fab_version, fab.hash_str, flwr_dir)
156
- config = get_project_config(app_path)
180
+ # Update run_config in context
181
+ context.run_config = fused_config
157
182
 
158
- # Get ClientApp and SeverApp components
159
- app_components = config["tool"]["flwr"]["app"]["components"]
160
- client_app_attr = app_components["clientapp"]
161
- server_app_attr = app_components["serverapp"]
162
- fused_config = get_fused_config_from_dir(app_path, run.override_config)
183
+ log(
184
+ DEBUG,
185
+ "Flower will load ServerApp `%s` in %s",
186
+ server_app_attr,
187
+ app_path,
188
+ )
189
+ log(
190
+ DEBUG,
191
+ "Flower will load ClientApp `%s` in %s",
192
+ client_app_attr,
193
+ app_path,
194
+ )
163
195
 
164
- # Update run_config in context
165
- context.run_config = fused_config
196
+ # Change status to Running
197
+ run_status_proto = run_status_to_proto(RunStatus(Status.RUNNING, "", ""))
198
+ conn._stub.UpdateRunStatus(
199
+ UpdateRunStatusRequest(run_id=run.run_id, run_status=run_status_proto)
200
+ )
166
201
 
167
- log(
168
- DEBUG,
169
- "Flower will load ServerApp `%s` in %s",
170
- server_app_attr,
171
- app_path,
172
- )
173
- log(
174
- DEBUG,
175
- "Flower will load ClientApp `%s` in %s",
176
- client_app_attr,
177
- app_path,
178
- )
202
+ # Pull Federation Options
203
+ fed_opt_res: GetFederationOptionsResponse = conn._stub.GetFederationOptions(
204
+ GetFederationOptionsRequest(run_id=run.run_id)
205
+ )
206
+ federation_options = config_record_from_proto(fed_opt_res.federation_options)
207
+
208
+ # Unflatten underlying dict
209
+ fed_opt = unflatten_dict({**federation_options})
210
+
211
+ # Extract configs values of interest
212
+ num_supernodes = fed_opt.get("num-supernodes")
213
+ if num_supernodes is None:
214
+ raise ValueError("Federation options expects `num-supernodes` to be set.")
215
+ backend_config: BackendConfig = fed_opt.get("backend", {})
216
+ verbose: bool = fed_opt.get("verbose", False)
217
+ enable_tf_gpu_growth: bool = fed_opt.get("enable_tf_gpu_growth", False)
218
+
219
+ event(
220
+ EventType.FLWR_SIMULATION_RUN_ENTER,
221
+ event_details={
222
+ "backend": "ray",
223
+ "num-supernodes": num_supernodes,
224
+ "run-id-hash": get_sha256_hash(run.run_id),
225
+ },
226
+ )
179
227
 
180
- # Change status to Running
181
- run_status_proto = run_status_to_proto(RunStatus(Status.RUNNING, "", ""))
182
- conn._stub.UpdateRunStatus(
183
- UpdateRunStatusRequest(run_id=run.run_id, run_status=run_status_proto)
184
- )
228
+ # Set up heartbeat sender
229
+ heartbeat_fn = get_grpc_app_heartbeat_fn(
230
+ conn._stub,
231
+ run.run_id,
232
+ failure_message="Heartbeat failed unexpectedly. The SuperLink could "
233
+ "not find the provided run ID, or the run status is invalid.",
234
+ )
235
+ heartbeat_sender = HeartbeatSender(heartbeat_fn)
236
+ heartbeat_sender.start()
237
+
238
+ # Launch the simulation
239
+ updated_context = _run_simulation(
240
+ server_app_attr=server_app_attr,
241
+ client_app_attr=client_app_attr,
242
+ num_supernodes=num_supernodes,
243
+ backend_config=backend_config,
244
+ app_dir=str(app_path),
245
+ run=run,
246
+ enable_tf_gpu_growth=enable_tf_gpu_growth,
247
+ verbose_logging=verbose,
248
+ server_app_context=context,
249
+ is_app=True,
250
+ exit_event=EventType.FLWR_SIMULATION_RUN_LEAVE,
251
+ )
185
252
 
186
- # Pull Federation Options
187
- fed_opt_res: GetFederationOptionsResponse = conn._stub.GetFederationOptions(
188
- GetFederationOptionsRequest(run_id=run.run_id)
189
- )
190
- federation_options = config_record_from_proto(
191
- fed_opt_res.federation_options
192
- )
253
+ # Send resulting context
254
+ context_proto = context_to_proto(updated_context)
255
+ out_req = PushAppOutputsRequest(
256
+ token=token, run_id=run.run_id, context=context_proto
257
+ )
258
+ _ = conn._stub.PushAppOutputs(out_req)
193
259
 
194
- # Unflatten underlying dict
195
- fed_opt = unflatten_dict({**federation_options})
196
-
197
- # Extract configs values of interest
198
- num_supernodes = fed_opt.get("num-supernodes")
199
- if num_supernodes is None:
200
- raise ValueError(
201
- "Federation options expects `num-supernodes` to be set."
202
- )
203
- backend_config: BackendConfig = fed_opt.get("backend", {})
204
- verbose: bool = fed_opt.get("verbose", False)
205
- enable_tf_gpu_growth: bool = fed_opt.get("enable_tf_gpu_growth", False)
206
-
207
- event(
208
- EventType.FLWR_SIMULATION_RUN_ENTER,
209
- event_details={
210
- "backend": "ray",
211
- "num-supernodes": num_supernodes,
212
- "run-id-hash": get_sha256_hash(run.run_id),
213
- },
214
- )
260
+ run_status = RunStatus(Status.FINISHED, SubStatus.COMPLETED, "")
215
261
 
216
- # Set up heartbeat sender
217
- heartbeat_fn = get_grpc_app_heartbeat_fn(
218
- conn._stub,
219
- run.run_id,
220
- failure_message="Heartbeat failed unexpectedly. The SuperLink could "
221
- "not find the provided run ID, or the run status is invalid.",
222
- )
223
- heartbeat_sender = HeartbeatSender(heartbeat_fn)
224
- heartbeat_sender.start()
225
-
226
- # Launch the simulation
227
- updated_context = _run_simulation(
228
- server_app_attr=server_app_attr,
229
- client_app_attr=client_app_attr,
230
- num_supernodes=num_supernodes,
231
- backend_config=backend_config,
232
- app_dir=str(app_path),
233
- run=run,
234
- enable_tf_gpu_growth=enable_tf_gpu_growth,
235
- verbose_logging=verbose,
236
- server_app_run_config=fused_config,
237
- is_app=True,
238
- exit_event=EventType.FLWR_SIMULATION_RUN_LEAVE,
239
- )
262
+ except Exception as ex: # pylint: disable=broad-exception-caught
263
+ exc_entity = "Simulation"
264
+ log(ERROR, "%s raised an exception", exc_entity, exc_info=ex)
265
+ run_status = RunStatus(Status.FINISHED, SubStatus.FAILED, str(ex))
266
+
267
+ finally:
268
+ # Stop heartbeat sender
269
+ if heartbeat_sender:
270
+ heartbeat_sender.stop()
271
+
272
+ # Stop log uploader for this run and upload final logs
273
+ if log_uploader:
274
+ stop_log_uploader(log_queue, log_uploader)
240
275
 
241
- # Send resulting context
242
- context_proto = context_to_proto(updated_context)
243
- out_req = PushSimulationOutputsRequest(
244
- run_id=run.run_id, context=context_proto
276
+ # Update run status
277
+ if run_status:
278
+ run_status_proto = run_status_to_proto(run_status)
279
+ conn._stub.UpdateRunStatus(
280
+ UpdateRunStatusRequest(run_id=run.run_id, run_status=run_status_proto)
245
281
  )
246
- _ = conn._stub.PushSimulationOutputs(out_req)
247
-
248
- run_status = RunStatus(Status.FINISHED, SubStatus.COMPLETED, "")
249
-
250
- except Exception as ex: # pylint: disable=broad-exception-caught
251
- exc_entity = "Simulation"
252
- log(ERROR, "%s raised an exception", exc_entity, exc_info=ex)
253
- run_status = RunStatus(Status.FINISHED, SubStatus.FAILED, str(ex))
254
-
255
- finally:
256
- # Stop heartbeat sender
257
- if heartbeat_sender:
258
- heartbeat_sender.stop()
259
- heartbeat_sender = None
260
-
261
- # Stop log uploader for this run and upload final logs
262
- if log_uploader:
263
- stop_log_uploader(log_queue, log_uploader)
264
- log_uploader = None
265
-
266
- # Update run status
267
- if run_status:
268
- run_status_proto = run_status_to_proto(run_status)
269
- conn._stub.UpdateRunStatus(
270
- UpdateRunStatusRequest(
271
- run_id=run.run_id, run_status=run_status_proto
272
- )
273
- )
274
-
275
- # Clean up the Context if it exists
276
- try:
277
- del updated_context
278
- except NameError:
279
- pass
280
- gc.collect()
281
-
282
- # Stop the loop if `flwr-simulation` is expected to process a single run
283
- if run_once:
284
- break
282
+
283
+ # Clean up the Context if it exists
284
+ try:
285
+ del updated_context
286
+ except NameError:
287
+ pass
285
288
 
286
289
 
287
290
  def _parse_args_run_flwr_simulation() -> argparse.ArgumentParser:
@@ -296,11 +299,5 @@ def _parse_args_run_flwr_simulation() -> argparse.ArgumentParser:
296
299
  help="Address of SuperLink's SimulationIO API (IPv4, IPv6, or a domain name)."
297
300
  f"By default, it is set to {SIMULATIONIO_API_DEFAULT_CLIENT_ADDRESS}.",
298
301
  )
299
- parser.add_argument(
300
- "--run-once",
301
- action="store_true",
302
- help="When set, this process will start a single simulation "
303
- "for a pending Run. If no pending run the process will exit. ",
304
- )
305
302
  add_args_flwr_app_common(parser=parser)
306
303
  return parser
@@ -143,6 +143,15 @@ def run_simulation_from_cli() -> None:
143
143
  run = Run.create_empty(run_id)
144
144
  run.override_config = override_config
145
145
 
146
+ # Create Context
147
+ server_app_context = Context(
148
+ run_id=run_id,
149
+ node_id=0,
150
+ node_config=UserConfig(),
151
+ state=RecordDict(),
152
+ run_config=fused_config,
153
+ )
154
+
146
155
  _ = _run_simulation(
147
156
  server_app_attr=server_app_attr,
148
157
  client_app_attr=client_app_attr,
@@ -153,7 +162,7 @@ def run_simulation_from_cli() -> None:
153
162
  run=run,
154
163
  enable_tf_gpu_growth=args.enable_tf_gpu_growth,
155
164
  verbose_logging=args.verbose,
156
- server_app_run_config=fused_config,
165
+ server_app_context=server_app_context,
157
166
  is_app=True,
158
167
  exit_event=EventType.CLI_FLOWER_SIMULATION_LEAVE,
159
168
  )
@@ -241,13 +250,12 @@ def run_simulation(
241
250
  def run_serverapp_th(
242
251
  server_app_attr: Optional[str],
243
252
  server_app: Optional[ServerApp],
244
- server_app_run_config: UserConfig,
253
+ server_app_context: Context,
245
254
  grid: Grid,
246
255
  app_dir: str,
247
256
  f_stop: threading.Event,
248
257
  has_exception: threading.Event,
249
258
  enable_tf_gpu_growth: bool,
250
- run_id: int,
251
259
  ctx_queue: "Queue[Context]",
252
260
  ) -> threading.Thread:
253
261
  """Run SeverApp in a thread."""
@@ -258,7 +266,6 @@ def run_serverapp_th(
258
266
  exception_event: threading.Event,
259
267
  _grid: Grid,
260
268
  _server_app_dir: str,
261
- _server_app_run_config: UserConfig,
262
269
  _server_app_attr: Optional[str],
263
270
  _server_app: Optional[ServerApp],
264
271
  _ctx_queue: "Queue[Context]",
@@ -272,19 +279,10 @@ def run_serverapp_th(
272
279
  log(INFO, "Enabling GPU growth for Tensorflow on the server thread.")
273
280
  enable_gpu_growth()
274
281
 
275
- # Initialize Context
276
- context = Context(
277
- run_id=run_id,
278
- node_id=0,
279
- node_config={},
280
- state=RecordDict(),
281
- run_config=_server_app_run_config,
282
- )
283
-
284
282
  # Run ServerApp
285
283
  updated_context = _run(
286
284
  grid=_grid,
287
- context=context,
285
+ context=server_app_context,
288
286
  server_app_dir=_server_app_dir,
289
287
  server_app_attr=_server_app_attr,
290
288
  loaded_server_app=_server_app,
@@ -310,7 +308,6 @@ def run_serverapp_th(
310
308
  has_exception,
311
309
  grid,
312
310
  app_dir,
313
- server_app_run_config,
314
311
  server_app_attr,
315
312
  server_app,
316
313
  ctx_queue,
@@ -335,7 +332,7 @@ def _main_loop(
335
332
  client_app_attr: Optional[str] = None,
336
333
  server_app: Optional[ServerApp] = None,
337
334
  server_app_attr: Optional[str] = None,
338
- server_app_run_config: Optional[UserConfig] = None,
335
+ server_app_context: Optional[Context] = None,
339
336
  ) -> Context:
340
337
  """Start ServerApp on a separate thread, then launch Simulation Engine."""
341
338
  # Initialize StateFactory
@@ -346,13 +343,15 @@ def _main_loop(
346
343
  server_app_thread_has_exception = threading.Event()
347
344
  serverapp_th = None
348
345
  success = True
349
- updated_context = Context(
350
- run_id=run.run_id,
351
- node_id=0,
352
- node_config=UserConfig(),
353
- state=RecordDict(),
354
- run_config=UserConfig(),
355
- )
346
+ if server_app_context is None:
347
+ server_app_context = Context(
348
+ run_id=run.run_id,
349
+ node_id=0,
350
+ node_config=UserConfig(),
351
+ state=RecordDict(),
352
+ run_config=UserConfig(),
353
+ )
354
+ updated_context = server_app_context
356
355
  try:
357
356
  # Register run
358
357
  log(DEBUG, "Pre-registering run with id %s", run.run_id)
@@ -361,9 +360,6 @@ def _main_loop(
361
360
  run.running_at = run.starting_at
362
361
  state_factory.state().run_ids[run.run_id] = RunRecord(run=run) # type: ignore
363
362
 
364
- if server_app_run_config is None:
365
- server_app_run_config = {}
366
-
367
363
  # Initialize Grid
368
364
  grid = InMemoryGrid(state_factory=state_factory)
369
365
  grid.set_run(run_id=run.run_id)
@@ -373,13 +369,12 @@ def _main_loop(
373
369
  serverapp_th = run_serverapp_th(
374
370
  server_app_attr=server_app_attr,
375
371
  server_app=server_app,
376
- server_app_run_config=server_app_run_config,
372
+ server_app_context=server_app_context,
377
373
  grid=grid,
378
374
  app_dir=app_dir,
379
375
  f_stop=f_stop,
380
376
  has_exception=server_app_thread_has_exception,
381
377
  enable_tf_gpu_growth=enable_tf_gpu_growth,
382
- run_id=run.run_id,
383
378
  ctx_queue=output_context_queue,
384
379
  )
385
380
 
@@ -438,7 +433,7 @@ def _run_simulation(
438
433
  backend_config: Optional[BackendConfig] = None,
439
434
  client_app_attr: Optional[str] = None,
440
435
  server_app_attr: Optional[str] = None,
441
- server_app_run_config: Optional[UserConfig] = None,
436
+ server_app_context: Optional[Context] = None,
442
437
  app_dir: str = "",
443
438
  flwr_dir: Optional[str] = None,
444
439
  run: Optional[Run] = None,
@@ -502,7 +497,7 @@ def _run_simulation(
502
497
  client_app_attr,
503
498
  server_app,
504
499
  server_app_attr,
505
- server_app_run_config,
500
+ server_app_context,
506
501
  )
507
502
  # Detect if there is an Asyncio event loop already running.
508
503
  # If yes, disable logger propagation. In environmnets
@@ -0,0 +1,58 @@
1
+ # Copyright 2025 Flower Labs GmbH. All Rights Reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ # ==============================================================================
15
+ """Utility functions for app processes."""
16
+
17
+
18
+ import os
19
+ import signal
20
+ import threading
21
+ import time
22
+
23
+ if os.name == "nt":
24
+ from ctypes import windll # type: ignore
25
+
26
+
27
def _pid_exists(pid: int) -> bool:
    """Check if a process with the given PID exists.

    This works on Unix-like systems and Windows.

    Parameters
    ----------
    pid : int
        The process ID to look up.

    Returns
    -------
    bool
        `True` if a process with this PID could be found, `False` otherwise.
    """
    # Use `ctypes` to check if the process exists on Windows
    if os.name == "nt":
        # 0x1000 is PROCESS_QUERY_LIMITED_INFORMATION: the handle is only
        # used as an existence probe and is closed again immediately.
        handle = windll.kernel32.OpenProcess(0x1000, False, pid)
        if handle:
            windll.kernel32.CloseHandle(handle)
            return True
        return False
    # Use `os.kill` on Unix-like systems; signal 0 performs the existence and
    # permission checks without delivering a signal.
    try:
        os.kill(pid, 0)
    except PermissionError:
        # EPERM: the process exists, but we are not allowed to signal it
        # (e.g. it belongs to another user). It must count as "exists".
        return True
    except OSError:
        # ESRCH (no such process) or any other failure: treat as not running
        return False
    return True
45
+
46
+
47
def start_parent_process_monitor(
    parent_pid: int,
) -> None:
    """Monitor the parent process and exit if it terminates.

    A daemon thread polls every 0.2 seconds whether `parent_pid` still exists
    and forcibly kills the current process as soon as it does not.

    Parameters
    ----------
    parent_pid : int
        PID of the process whose lifetime this process is tied to.
    """
    # `signal.SIGKILL` is only defined on Unix. On Windows, `os.kill` with
    # any signal other than CTRL_C_EVENT/CTRL_BREAK_EVENT terminates the
    # process unconditionally, so SIGTERM gives the same hard kill there.
    kill_signal = getattr(signal, "SIGKILL", signal.SIGTERM)

    def monitor() -> None:
        while True:
            time.sleep(0.2)
            if not _pid_exists(parent_pid):
                # Kill ourselves without running cleanup handlers
                os.kill(os.getpid(), kill_signal)

    # Daemon thread: it must never keep the interpreter alive on its own
    threading.Thread(target=monitor, daemon=True).start()
@@ -12,11 +12,11 @@
12
12
  # See the License for the specific language governing permissions and
13
13
  # limitations under the License.
14
14
  # ==============================================================================
15
- """Flower ClientApp Scheduler."""
15
+ """Flower command line interface for shared infrastructure components."""
16
16
 
17
17
 
18
- from .simple_clientapp_scheduler_plugin import SimpleClientAppSchedulerPlugin
18
+ from .flower_superexec import flower_superexec
19
19
 
20
20
  __all__ = [
21
- "SimpleClientAppSchedulerPlugin",
21
+ "flower_superexec",
22
22
  ]