flwr-1.18.0-py3-none-any.whl → flwr-1.20.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (174)
  1. flwr/app/__init__.py +15 -0
  2. flwr/app/error.py +68 -0
  3. flwr/app/metadata.py +223 -0
  4. flwr/cli/build.py +94 -59
  5. flwr/cli/log.py +3 -3
  6. flwr/cli/login/login.py +3 -7
  7. flwr/cli/ls.py +15 -36
  8. flwr/cli/new/new.py +12 -4
  9. flwr/cli/new/templates/app/README.flowertune.md.tpl +2 -0
  10. flwr/cli/new/templates/app/README.md.tpl +5 -0
  11. flwr/cli/new/templates/app/code/client.baseline.py.tpl +1 -1
  12. flwr/cli/new/templates/app/code/model.baseline.py.tpl +1 -1
  13. flwr/cli/new/templates/app/code/server.baseline.py.tpl +2 -3
  14. flwr/cli/new/templates/app/pyproject.baseline.toml.tpl +25 -17
  15. flwr/cli/new/templates/app/pyproject.flowertune.toml.tpl +13 -1
  16. flwr/cli/new/templates/app/pyproject.huggingface.toml.tpl +21 -2
  17. flwr/cli/new/templates/app/pyproject.jax.toml.tpl +18 -1
  18. flwr/cli/new/templates/app/pyproject.mlx.toml.tpl +19 -2
  19. flwr/cli/new/templates/app/pyproject.numpy.toml.tpl +18 -1
  20. flwr/cli/new/templates/app/pyproject.pytorch.toml.tpl +20 -3
  21. flwr/cli/new/templates/app/pyproject.sklearn.toml.tpl +18 -1
  22. flwr/cli/new/templates/app/pyproject.tensorflow.toml.tpl +18 -1
  23. flwr/cli/run/run.py +48 -49
  24. flwr/cli/stop.py +2 -2
  25. flwr/cli/utils.py +38 -5
  26. flwr/client/__init__.py +2 -2
  27. flwr/client/client_app.py +1 -1
  28. flwr/client/clientapp/__init__.py +0 -7
  29. flwr/client/grpc_adapter_client/connection.py +15 -8
  30. flwr/client/grpc_rere_client/connection.py +142 -97
  31. flwr/client/grpc_rere_client/grpc_adapter.py +34 -6
  32. flwr/client/message_handler/message_handler.py +1 -1
  33. flwr/client/mod/comms_mods.py +36 -17
  34. flwr/client/rest_client/connection.py +176 -103
  35. flwr/clientapp/__init__.py +15 -0
  36. flwr/common/__init__.py +2 -2
  37. flwr/common/auth_plugin/__init__.py +2 -0
  38. flwr/common/auth_plugin/auth_plugin.py +29 -3
  39. flwr/common/constant.py +39 -8
  40. flwr/common/event_log_plugin/event_log_plugin.py +3 -3
  41. flwr/common/exit/exit_code.py +16 -1
  42. flwr/common/exit_handlers.py +30 -0
  43. flwr/common/grpc.py +12 -1
  44. flwr/common/heartbeat.py +165 -0
  45. flwr/common/inflatable.py +290 -0
  46. flwr/common/inflatable_protobuf_utils.py +141 -0
  47. flwr/common/inflatable_utils.py +508 -0
  48. flwr/common/message.py +110 -242
  49. flwr/common/record/__init__.py +2 -1
  50. flwr/common/record/array.py +402 -0
  51. flwr/common/record/arraychunk.py +59 -0
  52. flwr/common/record/arrayrecord.py +103 -225
  53. flwr/common/record/configrecord.py +59 -4
  54. flwr/common/record/conversion_utils.py +1 -1
  55. flwr/common/record/metricrecord.py +55 -4
  56. flwr/common/record/recorddict.py +69 -1
  57. flwr/common/recorddict_compat.py +2 -2
  58. flwr/common/retry_invoker.py +5 -1
  59. flwr/common/serde.py +59 -211
  60. flwr/common/serde_utils.py +175 -0
  61. flwr/common/typing.py +5 -3
  62. flwr/compat/__init__.py +15 -0
  63. flwr/compat/client/__init__.py +15 -0
  64. flwr/{client → compat/client}/app.py +28 -185
  65. flwr/compat/common/__init__.py +15 -0
  66. flwr/compat/server/__init__.py +15 -0
  67. flwr/compat/server/app.py +174 -0
  68. flwr/compat/simulation/__init__.py +15 -0
  69. flwr/proto/appio_pb2.py +43 -0
  70. flwr/proto/appio_pb2.pyi +151 -0
  71. flwr/proto/appio_pb2_grpc.py +4 -0
  72. flwr/proto/appio_pb2_grpc.pyi +4 -0
  73. flwr/proto/clientappio_pb2.py +12 -19
  74. flwr/proto/clientappio_pb2.pyi +23 -101
  75. flwr/proto/clientappio_pb2_grpc.py +269 -28
  76. flwr/proto/clientappio_pb2_grpc.pyi +114 -20
  77. flwr/proto/fleet_pb2.py +24 -27
  78. flwr/proto/fleet_pb2.pyi +19 -35
  79. flwr/proto/fleet_pb2_grpc.py +117 -13
  80. flwr/proto/fleet_pb2_grpc.pyi +47 -6
  81. flwr/proto/heartbeat_pb2.py +33 -0
  82. flwr/proto/heartbeat_pb2.pyi +66 -0
  83. flwr/proto/heartbeat_pb2_grpc.py +4 -0
  84. flwr/proto/heartbeat_pb2_grpc.pyi +4 -0
  85. flwr/proto/message_pb2.py +28 -11
  86. flwr/proto/message_pb2.pyi +125 -0
  87. flwr/proto/recorddict_pb2.py +16 -28
  88. flwr/proto/recorddict_pb2.pyi +46 -64
  89. flwr/proto/run_pb2.py +24 -32
  90. flwr/proto/run_pb2.pyi +4 -52
  91. flwr/proto/serverappio_pb2.py +9 -23
  92. flwr/proto/serverappio_pb2.pyi +0 -110
  93. flwr/proto/serverappio_pb2_grpc.py +177 -72
  94. flwr/proto/serverappio_pb2_grpc.pyi +75 -33
  95. flwr/proto/simulationio_pb2.py +12 -11
  96. flwr/proto/simulationio_pb2_grpc.py +35 -0
  97. flwr/proto/simulationio_pb2_grpc.pyi +14 -0
  98. flwr/server/__init__.py +1 -1
  99. flwr/server/app.py +69 -187
  100. flwr/server/compat/app_utils.py +50 -28
  101. flwr/server/fleet_event_log_interceptor.py +6 -2
  102. flwr/server/grid/grpc_grid.py +148 -41
  103. flwr/server/grid/inmemory_grid.py +5 -4
  104. flwr/server/serverapp/app.py +45 -17
  105. flwr/server/superlink/fleet/grpc_adapter/grpc_adapter_servicer.py +21 -3
  106. flwr/server/superlink/fleet/grpc_rere/fleet_servicer.py +102 -8
  107. flwr/server/superlink/fleet/grpc_rere/server_interceptor.py +2 -5
  108. flwr/server/superlink/fleet/message_handler/message_handler.py +130 -19
  109. flwr/server/superlink/fleet/rest_rere/rest_api.py +73 -13
  110. flwr/server/superlink/fleet/vce/vce_api.py +6 -3
  111. flwr/server/superlink/linkstate/in_memory_linkstate.py +138 -43
  112. flwr/server/superlink/linkstate/linkstate.py +53 -20
  113. flwr/server/superlink/linkstate/sqlite_linkstate.py +149 -55
  114. flwr/server/superlink/linkstate/utils.py +33 -29
  115. flwr/server/superlink/serverappio/serverappio_grpc.py +4 -1
  116. flwr/server/superlink/serverappio/serverappio_servicer.py +230 -84
  117. flwr/server/superlink/simulation/simulationio_grpc.py +1 -1
  118. flwr/server/superlink/simulation/simulationio_servicer.py +26 -2
  119. flwr/server/superlink/utils.py +9 -2
  120. flwr/server/utils/validator.py +2 -2
  121. flwr/serverapp/__init__.py +15 -0
  122. flwr/simulation/app.py +25 -0
  123. flwr/simulation/run_simulation.py +17 -0
  124. flwr/supercore/__init__.py +15 -0
  125. flwr/{server/superlink → supercore}/ffs/__init__.py +2 -0
  126. flwr/{server/superlink → supercore}/ffs/disk_ffs.py +1 -1
  127. flwr/supercore/grpc_health/__init__.py +22 -0
  128. flwr/supercore/grpc_health/simple_health_servicer.py +38 -0
  129. flwr/supercore/license_plugin/__init__.py +22 -0
  130. flwr/supercore/license_plugin/license_plugin.py +26 -0
  131. flwr/supercore/object_store/__init__.py +24 -0
  132. flwr/supercore/object_store/in_memory_object_store.py +229 -0
  133. flwr/supercore/object_store/object_store.py +170 -0
  134. flwr/supercore/object_store/object_store_factory.py +44 -0
  135. flwr/supercore/object_store/utils.py +43 -0
  136. flwr/supercore/scheduler/__init__.py +22 -0
  137. flwr/supercore/scheduler/plugin.py +71 -0
  138. flwr/{client/nodestate/nodestate.py → supercore/utils.py} +14 -13
  139. flwr/superexec/deployment.py +7 -4
  140. flwr/superexec/exec_event_log_interceptor.py +8 -4
  141. flwr/superexec/exec_grpc.py +25 -5
  142. flwr/superexec/exec_license_interceptor.py +82 -0
  143. flwr/superexec/exec_servicer.py +135 -24
  144. flwr/superexec/exec_user_auth_interceptor.py +45 -8
  145. flwr/superexec/executor.py +5 -1
  146. flwr/superexec/simulation.py +8 -3
  147. flwr/superlink/__init__.py +15 -0
  148. flwr/{client/supernode → supernode}/__init__.py +0 -7
  149. flwr/supernode/cli/__init__.py +24 -0
  150. flwr/{client/supernode/app.py → supernode/cli/flower_supernode.py} +3 -19
  151. flwr/supernode/cli/flwr_clientapp.py +88 -0
  152. flwr/supernode/nodestate/in_memory_nodestate.py +199 -0
  153. flwr/supernode/nodestate/nodestate.py +227 -0
  154. flwr/supernode/runtime/__init__.py +15 -0
  155. flwr/{client/clientapp/app.py → supernode/runtime/run_clientapp.py} +135 -89
  156. flwr/supernode/scheduler/__init__.py +22 -0
  157. flwr/supernode/scheduler/simple_clientapp_scheduler_plugin.py +49 -0
  158. flwr/supernode/servicer/__init__.py +15 -0
  159. flwr/supernode/servicer/clientappio/__init__.py +22 -0
  160. flwr/supernode/servicer/clientappio/clientappio_servicer.py +303 -0
  161. flwr/supernode/start_client_internal.py +589 -0
  162. {flwr-1.18.0.dist-info → flwr-1.20.0.dist-info}/METADATA +6 -4
  163. {flwr-1.18.0.dist-info → flwr-1.20.0.dist-info}/RECORD +171 -123
  164. {flwr-1.18.0.dist-info → flwr-1.20.0.dist-info}/WHEEL +1 -1
  165. {flwr-1.18.0.dist-info → flwr-1.20.0.dist-info}/entry_points.txt +2 -2
  166. flwr/client/clientapp/clientappio_servicer.py +0 -244
  167. flwr/client/heartbeat.py +0 -74
  168. flwr/client/nodestate/in_memory_nodestate.py +0 -38
  169. flwr/{client → compat/client}/grpc_client/__init__.py +0 -0
  170. flwr/{client → compat/client}/grpc_client/connection.py +0 -0
  171. flwr/{server/superlink → supercore}/ffs/ffs.py +0 -0
  172. flwr/{server/superlink → supercore}/ffs/ffs_factory.py +0 -0
  173. flwr/{client → supernode}/nodestate/__init__.py +0 -0
  174. flwr/{client → supernode}/nodestate/nodestate_factory.py +0 -0
flwr/common/constant.py CHANGED
@@ -55,13 +55,14 @@ EXEC_API_DEFAULT_SERVER_ADDRESS = f"{SERVER_OCTET}:{EXEC_API_PORT}"
55
55
  SIMULATIONIO_API_DEFAULT_SERVER_ADDRESS = f"{SERVER_OCTET}:{SIMULATIONIO_PORT}"
56
56
  SIMULATIONIO_API_DEFAULT_CLIENT_ADDRESS = f"{CLIENT_OCTET}:{SIMULATIONIO_PORT}"
57
57
 
58
- # Constants for ping
59
- PING_DEFAULT_INTERVAL = 30
60
- PING_CALL_TIMEOUT = 5
61
- PING_BASE_MULTIPLIER = 0.8
62
- PING_RANDOM_RANGE = (-0.1, 0.1)
63
- PING_MAX_INTERVAL = 1e300
64
- PING_PATIENCE = 2
58
+ # Constants for heartbeat
59
+ HEARTBEAT_DEFAULT_INTERVAL = 30
60
+ HEARTBEAT_CALL_TIMEOUT = 5
61
+ HEARTBEAT_BASE_MULTIPLIER = 0.8
62
+ HEARTBEAT_RANDOM_RANGE = (-0.1, 0.1)
63
+ HEARTBEAT_MAX_INTERVAL = 1e300
64
+ HEARTBEAT_PATIENCE = 2
65
+ RUN_FAILURE_DETAILS_NO_HEARTBEAT = "No heartbeat received from the run."
65
66
 
66
67
  # IDs
67
68
  RUN_ID_NUM_BYTES = 8
@@ -73,6 +74,7 @@ FAB_ALLOWED_EXTENSIONS = {".py", ".toml", ".md"}
73
74
  FAB_CONFIG_FILE = "pyproject.toml"
74
75
  FAB_DATE = (2024, 10, 1, 0, 0, 0)
75
76
  FAB_HASH_TRUNCATION = 8
77
+ FAB_MAX_SIZE = 10 * 1024 * 1024 # 10 MB
76
78
  FLWR_DIR = ".flwr" # The default Flower directory: ~/.flwr/
77
79
  FLWR_HOME = "FLWR_HOME" # If set, override the default Flower directory
78
80
 
@@ -114,16 +116,45 @@ AUTH_TYPE_YAML_KEY = "auth_type" # For key name in YAML file
114
116
  ACCESS_TOKEN_KEY = "flwr-oidc-access-token"
115
117
  REFRESH_TOKEN_KEY = "flwr-oidc-refresh-token"
116
118
 
119
+ # Constants for user authorization
120
+ AUTHZ_TYPE_YAML_KEY = "authz_type" # For key name in YAML file
121
+
117
122
  # Constants for node authentication
118
123
  PUBLIC_KEY_HEADER = "flwr-public-key-bin" # Must end with "-bin" for binary data
119
124
  SIGNATURE_HEADER = "flwr-signature-bin" # Must end with "-bin" for binary data
120
125
  TIMESTAMP_HEADER = "flwr-timestamp"
121
- TIMESTAMP_TOLERANCE = 10 # General tolerance for timestamp verification
126
+ TIMESTAMP_TOLERANCE = 300 # General tolerance for timestamp verification
122
127
  SYSTEM_TIME_TOLERANCE = 5 # Allowance for system time drift
123
128
 
129
+ # Constants for grpc retry
130
+ GRPC_RETRY_MAX_DELAY = 20 # Maximum delay duration between two consecutive retries.
131
+
124
132
  # Constants for ArrayRecord
125
133
  GC_THRESHOLD = 200_000_000 # 200 MB
126
134
 
135
+ # Constants for Inflatable
136
+ HEAD_BODY_DIVIDER = b"\x00"
137
+ HEAD_VALUE_DIVIDER = " "
138
+ MAX_ARRAY_CHUNK_SIZE = 20_971_520 # 20 MB
139
+
140
+ # Constants for serialization
141
+ INT64_MAX_VALUE = 9223372036854775807 # (1 << 63) - 1
142
+
143
+ # Constants for `flwr-serverapp` and `flwr-clientapp` CLI commands
144
+ FLWR_APP_TOKEN_LENGTH = 128 # Length of the token used
145
+
146
+ # Constants for object pushing and pulling
147
+ MAX_CONCURRENT_PUSHES = 8 # Default maximum number of concurrent pushes
148
+ MAX_CONCURRENT_PULLS = 8 # Default maximum number of concurrent pulls
149
+ PULL_MAX_TIME = 7200 # Default maximum time to wait for pulling objects
150
+ PULL_MAX_TRIES_PER_OBJECT = 500 # Default maximum number of tries to pull an object
151
+ PULL_INITIAL_BACKOFF = 1 # Initial backoff time for pulling objects
152
+ PULL_BACKOFF_CAP = 10 # Maximum backoff time for pulling objects
153
+
154
+
155
+ # ExecServicer constants
156
+ RUN_ID_NOT_FOUND_MESSAGE = "Run ID not found"
157
+
127
158
 
128
159
  class MessageType:
129
160
  """Message type."""
flwr/common/event_log_plugin/event_log_plugin.py CHANGED
@@ -21,7 +21,7 @@ from typing import Optional, Union
21
21
  import grpc
22
22
  from google.protobuf.message import Message as GrpcMessage
23
23
 
24
- from flwr.common.typing import LogEntry, UserInfo
24
+ from flwr.common.typing import AccountInfo, LogEntry
25
25
 
26
26
 
27
27
  class EventLogWriterPlugin(ABC):
@@ -36,7 +36,7 @@ class EventLogWriterPlugin(ABC):
36
36
  self,
37
37
  request: GrpcMessage,
38
38
  context: grpc.ServicerContext,
39
- user_info: Optional[UserInfo],
39
+ account_info: Optional[AccountInfo],
40
40
  method_name: str,
41
41
  ) -> LogEntry:
42
42
  """Compose pre-event log entry from the provided request and context."""
@@ -46,7 +46,7 @@ class EventLogWriterPlugin(ABC):
46
46
  self,
47
47
  request: GrpcMessage,
48
48
  context: grpc.ServicerContext,
49
- user_info: Optional[UserInfo],
49
+ account_info: Optional[AccountInfo],
50
50
  method_name: str,
51
51
  response: Optional[Union[GrpcMessage, BaseException]],
52
52
  ) -> LogEntry:
flwr/common/exit/exit_code.py CHANGED
@@ -29,6 +29,9 @@ class ExitCode:
29
29
 
30
30
  # SuperLink-specific exit codes (100-199)
31
31
  SUPERLINK_THREAD_CRASH = 100
32
+ SUPERLINK_LICENSE_INVALID = 101
33
+ SUPERLINK_LICENSE_MISSING = 102
34
+ SUPERLINK_LICENSE_URL_INVALID = 103
32
35
 
33
36
  # ServerApp-specific exit codes (200-299)
34
37
 
@@ -60,6 +63,18 @@ EXIT_CODE_HELP = {
60
63
  ExitCode.GRACEFUL_EXIT_SIGTERM: "",
61
64
  # SuperLink-specific exit codes (100-199)
62
65
  ExitCode.SUPERLINK_THREAD_CRASH: "An important background thread has crashed.",
66
+ ExitCode.SUPERLINK_LICENSE_INVALID: (
67
+ "The license is invalid or has expired. "
68
+ "Please contact `hello@flower.ai` for assistance."
69
+ ),
70
+ ExitCode.SUPERLINK_LICENSE_MISSING: (
71
+ "The license is missing. Please specify the license key by setting the "
72
+ "environment variable `FLWR_LICENSE_KEY`."
73
+ ),
74
+ ExitCode.SUPERLINK_LICENSE_URL_INVALID: (
75
+ "The license URL is invalid. Please ensure that the `FLWR_LICENSE_URL` "
76
+ "environment variable is set to a valid URL."
77
+ ),
63
78
  # ServerApp-specific exit codes (200-299)
64
79
  # SuperNode-specific exit codes (300-399)
65
80
  ExitCode.SUPERNODE_REST_ADDRESS_INVALID: (
@@ -72,7 +87,7 @@ EXIT_CODE_HELP = {
72
87
  "to be provided (providing only one of them is not sufficient)."
73
88
  ),
74
89
  ExitCode.SUPERNODE_NODE_AUTH_KEYS_INVALID: (
75
- "Node uthentication requires elliptic curve private and public key pair. "
90
+ "Node authentication requires elliptic curve private and public key pair. "
76
91
  "Please ensure that the file path points to a valid private/public key "
77
92
  "file and try again."
78
93
  ),
flwr/common/exit_handlers.py CHANGED
@@ -30,6 +30,7 @@ SIGNAL_TO_EXIT_CODE: dict[int, int] = {
30
30
  signal.SIGINT: ExitCode.GRACEFUL_EXIT_SIGINT,
31
31
  signal.SIGTERM: ExitCode.GRACEFUL_EXIT_SIGTERM,
32
32
  }
33
+ registered_exit_handlers: list[Callable[[], None]] = []
33
34
 
34
35
  # SIGQUIT is not available on Windows
35
36
  if hasattr(signal, "SIGQUIT"):
@@ -41,6 +42,7 @@ def register_exit_handlers(
41
42
  exit_message: Optional[str] = None,
42
43
  grpc_servers: Optional[list[Server]] = None,
43
44
  bckg_threads: Optional[list[Thread]] = None,
45
+ exit_handlers: Optional[list[Callable[[], None]]] = None,
44
46
  ) -> None:
45
47
  """Register exit handlers for `SIGINT`, `SIGTERM` and `SIGQUIT` signals.
46
48
 
@@ -56,8 +58,12 @@ def register_exit_handlers(
56
58
  bckg_threads: Optional[List[Thread]] (default: None)
57
59
  An optional list of threads that need to be gracefully
58
60
  terminated before exiting.
61
+ exit_handlers: Optional[List[Callable[[], None]]] (default: None)
62
+ An optional list of exit handlers to be called before exiting.
63
+ Additional exit handlers can be added using `add_exit_handler`.
59
64
  """
60
65
  default_handlers: dict[int, Callable[[int, FrameType], None]] = {}
66
+ registered_exit_handlers.extend(exit_handlers or [])
61
67
 
62
68
  def graceful_exit_handler(signalnum: int, _frame: FrameType) -> None:
63
69
  """Exit handler to be registered with `signal.signal`.
@@ -68,6 +74,9 @@ def register_exit_handlers(
68
74
  # Reset to default handler
69
75
  signal.signal(signalnum, default_handlers[signalnum]) # type: ignore
70
76
 
77
+ for handler in registered_exit_handlers:
78
+ handler()
79
+
71
80
  if grpc_servers is not None:
72
81
  for grpc_server in grpc_servers:
73
82
  grpc_server.stop(grace=1)
@@ -87,3 +96,24 @@ def register_exit_handlers(
87
96
  for sig in SIGNAL_TO_EXIT_CODE:
88
97
  default_handler = signal.signal(sig, graceful_exit_handler) # type: ignore
89
98
  default_handlers[sig] = default_handler # type: ignore
99
+
100
+
101
+ def add_exit_handler(exit_handler: Callable[[], None]) -> None:
102
+ """Add an exit handler to be called on graceful exit.
103
+
104
+ This function allows you to register additional exit handlers
105
+ that will be executed when the application exits gracefully,
106
+ if `register_exit_handlers` was called.
107
+
108
+ Parameters
109
+ ----------
110
+ exit_handler : Callable[[], None]
111
+ A callable that takes no arguments and performs cleanup or
112
+ other actions before the application exits.
113
+
114
+ Notes
115
+ -----
116
+ This method is not thread-safe, and it allows you to add the
117
+ same exit handler multiple times.
118
+ """
119
+ registered_exit_handlers.append(exit_handler)
flwr/common/grpc.py CHANGED
@@ -23,6 +23,9 @@ from logging import DEBUG, ERROR
23
23
  from typing import Any, Callable, Optional
24
24
 
25
25
  import grpc
26
+ from grpc_health.v1.health_pb2_grpc import add_HealthServicer_to_server
27
+
28
+ from flwr.supercore.grpc_health import SimpleHealthServicer
26
29
 
27
30
  from .address import is_port_in_use
28
31
  from .logger import log
@@ -98,7 +101,7 @@ def valid_certificates(certificates: tuple[bytes, bytes, bytes]) -> bool:
98
101
  return is_valid
99
102
 
100
103
 
101
- def generic_create_grpc_server( # pylint: disable=too-many-arguments,R0917
104
+ def generic_create_grpc_server( # pylint: disable=too-many-arguments, R0914, R0917
102
105
  servicer_and_add_fn: tuple[Any, AddServicerToServerFn],
103
106
  server_address: str,
104
107
  max_concurrent_workers: int = 1000,
@@ -106,6 +109,7 @@ def generic_create_grpc_server( # pylint: disable=too-many-arguments,R0917
106
109
  keepalive_time_ms: int = 210000,
107
110
  certificates: Optional[tuple[bytes, bytes, bytes]] = None,
108
111
  interceptors: Optional[Sequence[grpc.ServerInterceptor]] = None,
112
+ health_servicer: Optional[Any] = None,
109
113
  ) -> grpc.Server:
110
114
  """Create a gRPC server with a single servicer.
111
115
 
@@ -153,6 +157,10 @@ def generic_create_grpc_server( # pylint: disable=too-many-arguments,R0917
153
157
  * server private key.
154
158
  interceptors : Optional[Sequence[grpc.ServerInterceptor]] (default: None)
155
159
  A list of gRPC interceptors.
160
+ health_servicer : Optional[Any] (default: None)
161
+ An optional health servicer to add to the server. If provided, it should be an
162
+ instance of a class that inherits the `HealthServicer` class.
163
+ If None is provided, `SimpleHealthServicer` will be used by default.
156
164
 
157
165
  Returns
158
166
  -------
@@ -203,6 +211,9 @@ def generic_create_grpc_server( # pylint: disable=too-many-arguments,R0917
203
211
  )
204
212
  add_servicer_to_server_fn(servicer, server)
205
213
 
214
+ # Enable health service
215
+ add_HealthServicer_to_server(health_servicer or SimpleHealthServicer(), server)
216
+
206
217
  if certificates is not None:
207
218
  if not valid_certificates(certificates):
208
219
  sys.exit(1)
flwr/common/heartbeat.py ADDED
@@ -0,0 +1,165 @@
1
+ # Copyright 2025 Flower Labs GmbH. All Rights Reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ # ==============================================================================
15
+ """Heartbeat sender."""
16
+
17
+
18
+ import random
19
+ import threading
20
+ from typing import Callable, Union
21
+
22
+ import grpc
23
+
24
+ # pylint: disable=E0611
25
+ from flwr.proto.heartbeat_pb2 import SendAppHeartbeatRequest
26
+ from flwr.proto.serverappio_pb2_grpc import ServerAppIoStub
27
+ from flwr.proto.simulationio_pb2_grpc import SimulationIoStub
28
+
29
+ # pylint: enable=E0611
30
+ from .constant import (
31
+ HEARTBEAT_BASE_MULTIPLIER,
32
+ HEARTBEAT_CALL_TIMEOUT,
33
+ HEARTBEAT_DEFAULT_INTERVAL,
34
+ HEARTBEAT_RANDOM_RANGE,
35
+ )
36
+ from .retry_invoker import RetryInvoker, exponential
37
+
38
+
39
+ class HeartbeatFailure(Exception):
40
+ """Exception raised when a heartbeat fails."""
41
+
42
+
43
+ class HeartbeatSender:
44
+ """Periodically send heartbeat signals to a server in a background thread.
45
+
46
+ This class uses the provided `heartbeat_fn` to send heartbeats. If a heartbeat
47
+ attempt fails, it will be retried using an exponential backoff strategy.
48
+
49
+ Parameters
50
+ ----------
51
+ heartbeat_fn : Callable[[], bool]
52
+ Function used to send a heartbeat signal. It should return True if the heartbeat
53
+ succeeds, or False if it fails. Any internal exceptions (e.g., gRPC errors)
54
+ should be handled within this function to ensure boolean return values.
55
+ """
56
+
57
+ def __init__(
58
+ self,
59
+ heartbeat_fn: Callable[[], bool],
60
+ ) -> None:
61
+ self.heartbeat_fn = heartbeat_fn
62
+ self._stop_event = threading.Event()
63
+ self._thread = threading.Thread(target=self._run, daemon=True)
64
+ self._retry_invoker = RetryInvoker(
65
+ lambda: exponential(max_delay=20),
66
+ HeartbeatFailure, # The only exception we want to retry on
67
+ max_tries=None,
68
+ max_time=None,
69
+ # Allow the stop event to interrupt the wait
70
+ wait_function=self._stop_event.wait, # type: ignore
71
+ )
72
+
73
+ def start(self) -> None:
74
+ """Start the heartbeat sender."""
75
+ if self._thread.is_alive():
76
+ raise RuntimeError("Heartbeat sender is already running.")
77
+ if self._stop_event.is_set():
78
+ raise RuntimeError("Cannot start a stopped heartbeat sender.")
79
+ self._thread.start()
80
+
81
+ def stop(self) -> None:
82
+ """Stop the heartbeat sender."""
83
+ if not self._thread.is_alive():
84
+ raise RuntimeError("Heartbeat sender is not running.")
85
+ self._stop_event.set()
86
+ self._thread.join()
87
+
88
+ @property
89
+ def is_running(self) -> bool:
90
+ """Return True if the heartbeat sender is running, False otherwise."""
91
+ return self._thread.is_alive() and not self._stop_event.is_set()
92
+
93
+ def _run(self) -> None:
94
+ """Periodically send heartbeats until stopped."""
95
+ while not self._stop_event.is_set():
96
+ # Attempt to send a heartbeat with retry on failure
97
+ self._retry_invoker.invoke(self._heartbeat)
98
+
99
+ # Calculate the interval for the next heartbeat
100
+ # Formula: next_interval = (interval - timeout) * random.uniform(0.7, 0.9)
101
+ rd = random.uniform(*HEARTBEAT_RANDOM_RANGE)
102
+ next_interval: float = HEARTBEAT_DEFAULT_INTERVAL - HEARTBEAT_CALL_TIMEOUT
103
+ next_interval *= HEARTBEAT_BASE_MULTIPLIER + rd
104
+
105
+ # Wait for the calculated interval or exit early if stopped
106
+ self._stop_event.wait(next_interval)
107
+
108
+ def _heartbeat(self) -> None:
109
+ """Send a single heartbeat and raise an exception if it fails.
110
+
111
+ Call the provided `heartbeat_fn`. If the function returns False,
112
+ a `HeartbeatFailure` exception is raised to trigger the retry mechanism.
113
+ """
114
+ if not self._stop_event.is_set():
115
+ if not self.heartbeat_fn():
116
+ raise HeartbeatFailure
117
+
118
+
119
+ def get_grpc_app_heartbeat_fn(
120
+ stub: Union[ServerAppIoStub, SimulationIoStub],
121
+ run_id: int,
122
+ *,
123
+ failure_message: str,
124
+ ) -> Callable[[], bool]:
125
+ """Get the function to send a heartbeat to gRPC endpoint.
126
+
127
+ This function is for app heartbeats only. It is not used for node heartbeats.
128
+
129
+ Parameters
130
+ ----------
131
+ stub : Union[ServerAppIoStub, SimulationIoStub]
132
+ gRPC stub to send the heartbeat.
133
+ run_id : int
134
+ The run ID to use in the heartbeat request.
135
+ failure_message : str
136
+ Error message to raise if the heartbeat fails.
137
+
138
+ Returns
139
+ -------
140
+ Callable[[], bool]
141
+ Function that sends a heartbeat to the gRPC endpoint.
142
+ """
143
+ # Construct the heartbeat request
144
+ req = SendAppHeartbeatRequest(
145
+ run_id=run_id, heartbeat_interval=HEARTBEAT_DEFAULT_INTERVAL
146
+ )
147
+
148
+ def fn() -> bool:
149
+ # Call ServerAppIo API
150
+ try:
151
+ res = stub.SendAppHeartbeat(req)
152
+ except grpc.RpcError as e:
153
+ status_code = e.code()
154
+ if status_code == grpc.StatusCode.UNAVAILABLE:
155
+ return False
156
+ if status_code == grpc.StatusCode.DEADLINE_EXCEEDED:
157
+ return False
158
+ raise
159
+
160
+ # Check if not successful
161
+ if not res.success:
162
+ raise RuntimeError(failure_message)
163
+ return True
164
+
165
+ return fn