flwr 1.18.0__py3-none-any.whl → 1.20.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- flwr/app/__init__.py +15 -0
- flwr/app/error.py +68 -0
- flwr/app/metadata.py +223 -0
- flwr/cli/build.py +94 -59
- flwr/cli/log.py +3 -3
- flwr/cli/login/login.py +3 -7
- flwr/cli/ls.py +15 -36
- flwr/cli/new/new.py +12 -4
- flwr/cli/new/templates/app/README.flowertune.md.tpl +2 -0
- flwr/cli/new/templates/app/README.md.tpl +5 -0
- flwr/cli/new/templates/app/code/client.baseline.py.tpl +1 -1
- flwr/cli/new/templates/app/code/model.baseline.py.tpl +1 -1
- flwr/cli/new/templates/app/code/server.baseline.py.tpl +2 -3
- flwr/cli/new/templates/app/pyproject.baseline.toml.tpl +25 -17
- flwr/cli/new/templates/app/pyproject.flowertune.toml.tpl +13 -1
- flwr/cli/new/templates/app/pyproject.huggingface.toml.tpl +21 -2
- flwr/cli/new/templates/app/pyproject.jax.toml.tpl +18 -1
- flwr/cli/new/templates/app/pyproject.mlx.toml.tpl +19 -2
- flwr/cli/new/templates/app/pyproject.numpy.toml.tpl +18 -1
- flwr/cli/new/templates/app/pyproject.pytorch.toml.tpl +20 -3
- flwr/cli/new/templates/app/pyproject.sklearn.toml.tpl +18 -1
- flwr/cli/new/templates/app/pyproject.tensorflow.toml.tpl +18 -1
- flwr/cli/run/run.py +48 -49
- flwr/cli/stop.py +2 -2
- flwr/cli/utils.py +38 -5
- flwr/client/__init__.py +2 -2
- flwr/client/client_app.py +1 -1
- flwr/client/clientapp/__init__.py +0 -7
- flwr/client/grpc_adapter_client/connection.py +15 -8
- flwr/client/grpc_rere_client/connection.py +142 -97
- flwr/client/grpc_rere_client/grpc_adapter.py +34 -6
- flwr/client/message_handler/message_handler.py +1 -1
- flwr/client/mod/comms_mods.py +36 -17
- flwr/client/rest_client/connection.py +176 -103
- flwr/clientapp/__init__.py +15 -0
- flwr/common/__init__.py +2 -2
- flwr/common/auth_plugin/__init__.py +2 -0
- flwr/common/auth_plugin/auth_plugin.py +29 -3
- flwr/common/constant.py +39 -8
- flwr/common/event_log_plugin/event_log_plugin.py +3 -3
- flwr/common/exit/exit_code.py +16 -1
- flwr/common/exit_handlers.py +30 -0
- flwr/common/grpc.py +12 -1
- flwr/common/heartbeat.py +165 -0
- flwr/common/inflatable.py +290 -0
- flwr/common/inflatable_protobuf_utils.py +141 -0
- flwr/common/inflatable_utils.py +508 -0
- flwr/common/message.py +110 -242
- flwr/common/record/__init__.py +2 -1
- flwr/common/record/array.py +402 -0
- flwr/common/record/arraychunk.py +59 -0
- flwr/common/record/arrayrecord.py +103 -225
- flwr/common/record/configrecord.py +59 -4
- flwr/common/record/conversion_utils.py +1 -1
- flwr/common/record/metricrecord.py +55 -4
- flwr/common/record/recorddict.py +69 -1
- flwr/common/recorddict_compat.py +2 -2
- flwr/common/retry_invoker.py +5 -1
- flwr/common/serde.py +59 -211
- flwr/common/serde_utils.py +175 -0
- flwr/common/typing.py +5 -3
- flwr/compat/__init__.py +15 -0
- flwr/compat/client/__init__.py +15 -0
- flwr/{client → compat/client}/app.py +28 -185
- flwr/compat/common/__init__.py +15 -0
- flwr/compat/server/__init__.py +15 -0
- flwr/compat/server/app.py +174 -0
- flwr/compat/simulation/__init__.py +15 -0
- flwr/proto/appio_pb2.py +43 -0
- flwr/proto/appio_pb2.pyi +151 -0
- flwr/proto/appio_pb2_grpc.py +4 -0
- flwr/proto/appio_pb2_grpc.pyi +4 -0
- flwr/proto/clientappio_pb2.py +12 -19
- flwr/proto/clientappio_pb2.pyi +23 -101
- flwr/proto/clientappio_pb2_grpc.py +269 -28
- flwr/proto/clientappio_pb2_grpc.pyi +114 -20
- flwr/proto/fleet_pb2.py +24 -27
- flwr/proto/fleet_pb2.pyi +19 -35
- flwr/proto/fleet_pb2_grpc.py +117 -13
- flwr/proto/fleet_pb2_grpc.pyi +47 -6
- flwr/proto/heartbeat_pb2.py +33 -0
- flwr/proto/heartbeat_pb2.pyi +66 -0
- flwr/proto/heartbeat_pb2_grpc.py +4 -0
- flwr/proto/heartbeat_pb2_grpc.pyi +4 -0
- flwr/proto/message_pb2.py +28 -11
- flwr/proto/message_pb2.pyi +125 -0
- flwr/proto/recorddict_pb2.py +16 -28
- flwr/proto/recorddict_pb2.pyi +46 -64
- flwr/proto/run_pb2.py +24 -32
- flwr/proto/run_pb2.pyi +4 -52
- flwr/proto/serverappio_pb2.py +9 -23
- flwr/proto/serverappio_pb2.pyi +0 -110
- flwr/proto/serverappio_pb2_grpc.py +177 -72
- flwr/proto/serverappio_pb2_grpc.pyi +75 -33
- flwr/proto/simulationio_pb2.py +12 -11
- flwr/proto/simulationio_pb2_grpc.py +35 -0
- flwr/proto/simulationio_pb2_grpc.pyi +14 -0
- flwr/server/__init__.py +1 -1
- flwr/server/app.py +69 -187
- flwr/server/compat/app_utils.py +50 -28
- flwr/server/fleet_event_log_interceptor.py +6 -2
- flwr/server/grid/grpc_grid.py +148 -41
- flwr/server/grid/inmemory_grid.py +5 -4
- flwr/server/serverapp/app.py +45 -17
- flwr/server/superlink/fleet/grpc_adapter/grpc_adapter_servicer.py +21 -3
- flwr/server/superlink/fleet/grpc_rere/fleet_servicer.py +102 -8
- flwr/server/superlink/fleet/grpc_rere/server_interceptor.py +2 -5
- flwr/server/superlink/fleet/message_handler/message_handler.py +130 -19
- flwr/server/superlink/fleet/rest_rere/rest_api.py +73 -13
- flwr/server/superlink/fleet/vce/vce_api.py +6 -3
- flwr/server/superlink/linkstate/in_memory_linkstate.py +138 -43
- flwr/server/superlink/linkstate/linkstate.py +53 -20
- flwr/server/superlink/linkstate/sqlite_linkstate.py +149 -55
- flwr/server/superlink/linkstate/utils.py +33 -29
- flwr/server/superlink/serverappio/serverappio_grpc.py +4 -1
- flwr/server/superlink/serverappio/serverappio_servicer.py +230 -84
- flwr/server/superlink/simulation/simulationio_grpc.py +1 -1
- flwr/server/superlink/simulation/simulationio_servicer.py +26 -2
- flwr/server/superlink/utils.py +9 -2
- flwr/server/utils/validator.py +2 -2
- flwr/serverapp/__init__.py +15 -0
- flwr/simulation/app.py +25 -0
- flwr/simulation/run_simulation.py +17 -0
- flwr/supercore/__init__.py +15 -0
- flwr/{server/superlink → supercore}/ffs/__init__.py +2 -0
- flwr/{server/superlink → supercore}/ffs/disk_ffs.py +1 -1
- flwr/supercore/grpc_health/__init__.py +22 -0
- flwr/supercore/grpc_health/simple_health_servicer.py +38 -0
- flwr/supercore/license_plugin/__init__.py +22 -0
- flwr/supercore/license_plugin/license_plugin.py +26 -0
- flwr/supercore/object_store/__init__.py +24 -0
- flwr/supercore/object_store/in_memory_object_store.py +229 -0
- flwr/supercore/object_store/object_store.py +170 -0
- flwr/supercore/object_store/object_store_factory.py +44 -0
- flwr/supercore/object_store/utils.py +43 -0
- flwr/supercore/scheduler/__init__.py +22 -0
- flwr/supercore/scheduler/plugin.py +71 -0
- flwr/{client/nodestate/nodestate.py → supercore/utils.py} +14 -13
- flwr/superexec/deployment.py +7 -4
- flwr/superexec/exec_event_log_interceptor.py +8 -4
- flwr/superexec/exec_grpc.py +25 -5
- flwr/superexec/exec_license_interceptor.py +82 -0
- flwr/superexec/exec_servicer.py +135 -24
- flwr/superexec/exec_user_auth_interceptor.py +45 -8
- flwr/superexec/executor.py +5 -1
- flwr/superexec/simulation.py +8 -3
- flwr/superlink/__init__.py +15 -0
- flwr/{client/supernode → supernode}/__init__.py +0 -7
- flwr/supernode/cli/__init__.py +24 -0
- flwr/{client/supernode/app.py → supernode/cli/flower_supernode.py} +3 -19
- flwr/supernode/cli/flwr_clientapp.py +88 -0
- flwr/supernode/nodestate/in_memory_nodestate.py +199 -0
- flwr/supernode/nodestate/nodestate.py +227 -0
- flwr/supernode/runtime/__init__.py +15 -0
- flwr/{client/clientapp/app.py → supernode/runtime/run_clientapp.py} +135 -89
- flwr/supernode/scheduler/__init__.py +22 -0
- flwr/supernode/scheduler/simple_clientapp_scheduler_plugin.py +49 -0
- flwr/supernode/servicer/__init__.py +15 -0
- flwr/supernode/servicer/clientappio/__init__.py +22 -0
- flwr/supernode/servicer/clientappio/clientappio_servicer.py +303 -0
- flwr/supernode/start_client_internal.py +589 -0
- {flwr-1.18.0.dist-info → flwr-1.20.0.dist-info}/METADATA +6 -4
- {flwr-1.18.0.dist-info → flwr-1.20.0.dist-info}/RECORD +171 -123
- {flwr-1.18.0.dist-info → flwr-1.20.0.dist-info}/WHEEL +1 -1
- {flwr-1.18.0.dist-info → flwr-1.20.0.dist-info}/entry_points.txt +2 -2
- flwr/client/clientapp/clientappio_servicer.py +0 -244
- flwr/client/heartbeat.py +0 -74
- flwr/client/nodestate/in_memory_nodestate.py +0 -38
- /flwr/{client → compat/client}/grpc_client/__init__.py +0 -0
- /flwr/{client → compat/client}/grpc_client/connection.py +0 -0
- /flwr/{server/superlink → supercore}/ffs/ffs.py +0 -0
- /flwr/{server/superlink → supercore}/ffs/ffs_factory.py +0 -0
- /flwr/{client → supernode}/nodestate/__init__.py +0 -0
- /flwr/{client → supernode}/nodestate/nodestate_factory.py +0 -0
flwr/common/constant.py
CHANGED
|
@@ -55,13 +55,14 @@ EXEC_API_DEFAULT_SERVER_ADDRESS = f"{SERVER_OCTET}:{EXEC_API_PORT}"
|
|
|
55
55
|
SIMULATIONIO_API_DEFAULT_SERVER_ADDRESS = f"{SERVER_OCTET}:{SIMULATIONIO_PORT}"
|
|
56
56
|
SIMULATIONIO_API_DEFAULT_CLIENT_ADDRESS = f"{CLIENT_OCTET}:{SIMULATIONIO_PORT}"
|
|
57
57
|
|
|
58
|
-
# Constants for
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
58
|
+
# Constants for heartbeat
|
|
59
|
+
HEARTBEAT_DEFAULT_INTERVAL = 30
|
|
60
|
+
HEARTBEAT_CALL_TIMEOUT = 5
|
|
61
|
+
HEARTBEAT_BASE_MULTIPLIER = 0.8
|
|
62
|
+
HEARTBEAT_RANDOM_RANGE = (-0.1, 0.1)
|
|
63
|
+
HEARTBEAT_MAX_INTERVAL = 1e300
|
|
64
|
+
HEARTBEAT_PATIENCE = 2
|
|
65
|
+
RUN_FAILURE_DETAILS_NO_HEARTBEAT = "No heartbeat received from the run."
|
|
65
66
|
|
|
66
67
|
# IDs
|
|
67
68
|
RUN_ID_NUM_BYTES = 8
|
|
@@ -73,6 +74,7 @@ FAB_ALLOWED_EXTENSIONS = {".py", ".toml", ".md"}
|
|
|
73
74
|
FAB_CONFIG_FILE = "pyproject.toml"
|
|
74
75
|
FAB_DATE = (2024, 10, 1, 0, 0, 0)
|
|
75
76
|
FAB_HASH_TRUNCATION = 8
|
|
77
|
+
FAB_MAX_SIZE = 10 * 1024 * 1024 # 10 MB
|
|
76
78
|
FLWR_DIR = ".flwr" # The default Flower directory: ~/.flwr/
|
|
77
79
|
FLWR_HOME = "FLWR_HOME" # If set, override the default Flower directory
|
|
78
80
|
|
|
@@ -114,16 +116,45 @@ AUTH_TYPE_YAML_KEY = "auth_type" # For key name in YAML file
|
|
|
114
116
|
ACCESS_TOKEN_KEY = "flwr-oidc-access-token"
|
|
115
117
|
REFRESH_TOKEN_KEY = "flwr-oidc-refresh-token"
|
|
116
118
|
|
|
119
|
+
# Constants for user authorization
|
|
120
|
+
AUTHZ_TYPE_YAML_KEY = "authz_type" # For key name in YAML file
|
|
121
|
+
|
|
117
122
|
# Constants for node authentication
|
|
118
123
|
PUBLIC_KEY_HEADER = "flwr-public-key-bin" # Must end with "-bin" for binary data
|
|
119
124
|
SIGNATURE_HEADER = "flwr-signature-bin" # Must end with "-bin" for binary data
|
|
120
125
|
TIMESTAMP_HEADER = "flwr-timestamp"
|
|
121
|
-
TIMESTAMP_TOLERANCE =
|
|
126
|
+
TIMESTAMP_TOLERANCE = 300 # General tolerance for timestamp verification
|
|
122
127
|
SYSTEM_TIME_TOLERANCE = 5 # Allowance for system time drift
|
|
123
128
|
|
|
129
|
+
# Constants for grpc retry
|
|
130
|
+
GRPC_RETRY_MAX_DELAY = 20 # Maximum delay duration between two consecutive retries.
|
|
131
|
+
|
|
124
132
|
# Constants for ArrayRecord
|
|
125
133
|
GC_THRESHOLD = 200_000_000 # 200 MB
|
|
126
134
|
|
|
135
|
+
# Constants for Inflatable
|
|
136
|
+
HEAD_BODY_DIVIDER = b"\x00"
|
|
137
|
+
HEAD_VALUE_DIVIDER = " "
|
|
138
|
+
MAX_ARRAY_CHUNK_SIZE = 20_971_520 # 20 MB
|
|
139
|
+
|
|
140
|
+
# Constants for serialization
|
|
141
|
+
INT64_MAX_VALUE = 9223372036854775807 # (1 << 63) - 1
|
|
142
|
+
|
|
143
|
+
# Constants for `flwr-serverapp` and `flwr-clientapp` CLI commands
|
|
144
|
+
FLWR_APP_TOKEN_LENGTH = 128 # Length of the token used
|
|
145
|
+
|
|
146
|
+
# Constants for object pushing and pulling
|
|
147
|
+
MAX_CONCURRENT_PUSHES = 8 # Default maximum number of concurrent pushes
|
|
148
|
+
MAX_CONCURRENT_PULLS = 8 # Default maximum number of concurrent pulls
|
|
149
|
+
PULL_MAX_TIME = 7200 # Default maximum time to wait for pulling objects
|
|
150
|
+
PULL_MAX_TRIES_PER_OBJECT = 500 # Default maximum number of tries to pull an object
|
|
151
|
+
PULL_INITIAL_BACKOFF = 1 # Initial backoff time for pulling objects
|
|
152
|
+
PULL_BACKOFF_CAP = 10 # Maximum backoff time for pulling objects
|
|
153
|
+
|
|
154
|
+
|
|
155
|
+
# ExecServicer constants
|
|
156
|
+
RUN_ID_NOT_FOUND_MESSAGE = "Run ID not found"
|
|
157
|
+
|
|
127
158
|
|
|
128
159
|
class MessageType:
|
|
129
160
|
"""Message type."""
|
flwr/common/event_log_plugin/event_log_plugin.py
CHANGED
|
@@ -21,7 +21,7 @@ from typing import Optional, Union
|
|
|
21
21
|
import grpc
|
|
22
22
|
from google.protobuf.message import Message as GrpcMessage
|
|
23
23
|
|
|
24
|
-
from flwr.common.typing import
|
|
24
|
+
from flwr.common.typing import AccountInfo, LogEntry
|
|
25
25
|
|
|
26
26
|
|
|
27
27
|
class EventLogWriterPlugin(ABC):
|
|
@@ -36,7 +36,7 @@ class EventLogWriterPlugin(ABC):
|
|
|
36
36
|
self,
|
|
37
37
|
request: GrpcMessage,
|
|
38
38
|
context: grpc.ServicerContext,
|
|
39
|
-
|
|
39
|
+
account_info: Optional[AccountInfo],
|
|
40
40
|
method_name: str,
|
|
41
41
|
) -> LogEntry:
|
|
42
42
|
"""Compose pre-event log entry from the provided request and context."""
|
|
@@ -46,7 +46,7 @@ class EventLogWriterPlugin(ABC):
|
|
|
46
46
|
self,
|
|
47
47
|
request: GrpcMessage,
|
|
48
48
|
context: grpc.ServicerContext,
|
|
49
|
-
|
|
49
|
+
account_info: Optional[AccountInfo],
|
|
50
50
|
method_name: str,
|
|
51
51
|
response: Optional[Union[GrpcMessage, BaseException]],
|
|
52
52
|
) -> LogEntry:
|
flwr/common/exit/exit_code.py
CHANGED
|
@@ -29,6 +29,9 @@ class ExitCode:
|
|
|
29
29
|
|
|
30
30
|
# SuperLink-specific exit codes (100-199)
|
|
31
31
|
SUPERLINK_THREAD_CRASH = 100
|
|
32
|
+
SUPERLINK_LICENSE_INVALID = 101
|
|
33
|
+
SUPERLINK_LICENSE_MISSING = 102
|
|
34
|
+
SUPERLINK_LICENSE_URL_INVALID = 103
|
|
32
35
|
|
|
33
36
|
# ServerApp-specific exit codes (200-299)
|
|
34
37
|
|
|
@@ -60,6 +63,18 @@ EXIT_CODE_HELP = {
|
|
|
60
63
|
ExitCode.GRACEFUL_EXIT_SIGTERM: "",
|
|
61
64
|
# SuperLink-specific exit codes (100-199)
|
|
62
65
|
ExitCode.SUPERLINK_THREAD_CRASH: "An important background thread has crashed.",
|
|
66
|
+
ExitCode.SUPERLINK_LICENSE_INVALID: (
|
|
67
|
+
"The license is invalid or has expired. "
|
|
68
|
+
"Please contact `hello@flower.ai` for assistance."
|
|
69
|
+
),
|
|
70
|
+
ExitCode.SUPERLINK_LICENSE_MISSING: (
|
|
71
|
+
"The license is missing. Please specify the license key by setting the "
|
|
72
|
+
"environment variable `FLWR_LICENSE_KEY`."
|
|
73
|
+
),
|
|
74
|
+
ExitCode.SUPERLINK_LICENSE_URL_INVALID: (
|
|
75
|
+
"The license URL is invalid. Please ensure that the `FLWR_LICENSE_URL` "
|
|
76
|
+
"environment variable is set to a valid URL."
|
|
77
|
+
),
|
|
63
78
|
# ServerApp-specific exit codes (200-299)
|
|
64
79
|
# SuperNode-specific exit codes (300-399)
|
|
65
80
|
ExitCode.SUPERNODE_REST_ADDRESS_INVALID: (
|
|
@@ -72,7 +87,7 @@ EXIT_CODE_HELP = {
|
|
|
72
87
|
"to be provided (providing only one of them is not sufficient)."
|
|
73
88
|
),
|
|
74
89
|
ExitCode.SUPERNODE_NODE_AUTH_KEYS_INVALID: (
|
|
75
|
-
"Node
|
|
90
|
+
"Node authentication requires elliptic curve private and public key pair. "
|
|
76
91
|
"Please ensure that the file path points to a valid private/public key "
|
|
77
92
|
"file and try again."
|
|
78
93
|
),
|
flwr/common/exit_handlers.py
CHANGED
|
@@ -30,6 +30,7 @@ SIGNAL_TO_EXIT_CODE: dict[int, int] = {
|
|
|
30
30
|
signal.SIGINT: ExitCode.GRACEFUL_EXIT_SIGINT,
|
|
31
31
|
signal.SIGTERM: ExitCode.GRACEFUL_EXIT_SIGTERM,
|
|
32
32
|
}
|
|
33
|
+
registered_exit_handlers: list[Callable[[], None]] = []
|
|
33
34
|
|
|
34
35
|
# SIGQUIT is not available on Windows
|
|
35
36
|
if hasattr(signal, "SIGQUIT"):
|
|
@@ -41,6 +42,7 @@ def register_exit_handlers(
|
|
|
41
42
|
exit_message: Optional[str] = None,
|
|
42
43
|
grpc_servers: Optional[list[Server]] = None,
|
|
43
44
|
bckg_threads: Optional[list[Thread]] = None,
|
|
45
|
+
exit_handlers: Optional[list[Callable[[], None]]] = None,
|
|
44
46
|
) -> None:
|
|
45
47
|
"""Register exit handlers for `SIGINT`, `SIGTERM` and `SIGQUIT` signals.
|
|
46
48
|
|
|
@@ -56,8 +58,12 @@ def register_exit_handlers(
|
|
|
56
58
|
bckg_threads: Optional[List[Thread]] (default: None)
|
|
57
59
|
An optional list of threads that need to be gracefully
|
|
58
60
|
terminated before exiting.
|
|
61
|
+
exit_handlers: Optional[List[Callable[[], None]]] (default: None)
|
|
62
|
+
An optional list of exit handlers to be called before exiting.
|
|
63
|
+
Additional exit handlers can be added using `add_exit_handler`.
|
|
59
64
|
"""
|
|
60
65
|
default_handlers: dict[int, Callable[[int, FrameType], None]] = {}
|
|
66
|
+
registered_exit_handlers.extend(exit_handlers or [])
|
|
61
67
|
|
|
62
68
|
def graceful_exit_handler(signalnum: int, _frame: FrameType) -> None:
|
|
63
69
|
"""Exit handler to be registered with `signal.signal`.
|
|
@@ -68,6 +74,9 @@ def register_exit_handlers(
|
|
|
68
74
|
# Reset to default handler
|
|
69
75
|
signal.signal(signalnum, default_handlers[signalnum]) # type: ignore
|
|
70
76
|
|
|
77
|
+
for handler in registered_exit_handlers:
|
|
78
|
+
handler()
|
|
79
|
+
|
|
71
80
|
if grpc_servers is not None:
|
|
72
81
|
for grpc_server in grpc_servers:
|
|
73
82
|
grpc_server.stop(grace=1)
|
|
@@ -87,3 +96,24 @@ def register_exit_handlers(
|
|
|
87
96
|
for sig in SIGNAL_TO_EXIT_CODE:
|
|
88
97
|
default_handler = signal.signal(sig, graceful_exit_handler) # type: ignore
|
|
89
98
|
default_handlers[sig] = default_handler # type: ignore
|
|
99
|
+
|
|
100
|
+
|
|
101
|
+
def add_exit_handler(exit_handler: Callable[[], None]) -> None:
|
|
102
|
+
"""Add an exit handler to be called on graceful exit.
|
|
103
|
+
|
|
104
|
+
This function allows you to register additional exit handlers
|
|
105
|
+
that will be executed when the application exits gracefully,
|
|
106
|
+
if `register_exit_handlers` was called.
|
|
107
|
+
|
|
108
|
+
Parameters
|
|
109
|
+
----------
|
|
110
|
+
exit_handler : Callable[[], None]
|
|
111
|
+
A callable that takes no arguments and performs cleanup or
|
|
112
|
+
other actions before the application exits.
|
|
113
|
+
|
|
114
|
+
Notes
|
|
115
|
+
-----
|
|
116
|
+
This method is not thread-safe, and it allows you to add the
|
|
117
|
+
same exit handler multiple times.
|
|
118
|
+
"""
|
|
119
|
+
registered_exit_handlers.append(exit_handler)
|
flwr/common/grpc.py
CHANGED
|
@@ -23,6 +23,9 @@ from logging import DEBUG, ERROR
|
|
|
23
23
|
from typing import Any, Callable, Optional
|
|
24
24
|
|
|
25
25
|
import grpc
|
|
26
|
+
from grpc_health.v1.health_pb2_grpc import add_HealthServicer_to_server
|
|
27
|
+
|
|
28
|
+
from flwr.supercore.grpc_health import SimpleHealthServicer
|
|
26
29
|
|
|
27
30
|
from .address import is_port_in_use
|
|
28
31
|
from .logger import log
|
|
@@ -98,7 +101,7 @@ def valid_certificates(certificates: tuple[bytes, bytes, bytes]) -> bool:
|
|
|
98
101
|
return is_valid
|
|
99
102
|
|
|
100
103
|
|
|
101
|
-
def generic_create_grpc_server( # pylint: disable=too-many-arguments,R0917
|
|
104
|
+
def generic_create_grpc_server( # pylint: disable=too-many-arguments, R0914, R0917
|
|
102
105
|
servicer_and_add_fn: tuple[Any, AddServicerToServerFn],
|
|
103
106
|
server_address: str,
|
|
104
107
|
max_concurrent_workers: int = 1000,
|
|
@@ -106,6 +109,7 @@ def generic_create_grpc_server( # pylint: disable=too-many-arguments,R0917
|
|
|
106
109
|
keepalive_time_ms: int = 210000,
|
|
107
110
|
certificates: Optional[tuple[bytes, bytes, bytes]] = None,
|
|
108
111
|
interceptors: Optional[Sequence[grpc.ServerInterceptor]] = None,
|
|
112
|
+
health_servicer: Optional[Any] = None,
|
|
109
113
|
) -> grpc.Server:
|
|
110
114
|
"""Create a gRPC server with a single servicer.
|
|
111
115
|
|
|
@@ -153,6 +157,10 @@ def generic_create_grpc_server( # pylint: disable=too-many-arguments,R0917
|
|
|
153
157
|
* server private key.
|
|
154
158
|
interceptors : Optional[Sequence[grpc.ServerInterceptor]] (default: None)
|
|
155
159
|
A list of gRPC interceptors.
|
|
160
|
+
health_servicer : Optional[Any] (default: None)
|
|
161
|
+
An optional health servicer to add to the server. If provided, it should be an
|
|
162
|
+
instance of a class that inherits the `HealthServicer` class.
|
|
163
|
+
If None is provided, `SimpleHealthServicer` will be used by default.
|
|
156
164
|
|
|
157
165
|
Returns
|
|
158
166
|
-------
|
|
@@ -203,6 +211,9 @@ def generic_create_grpc_server( # pylint: disable=too-many-arguments,R0917
|
|
|
203
211
|
)
|
|
204
212
|
add_servicer_to_server_fn(servicer, server)
|
|
205
213
|
|
|
214
|
+
# Enable health service
|
|
215
|
+
add_HealthServicer_to_server(health_servicer or SimpleHealthServicer(), server)
|
|
216
|
+
|
|
206
217
|
if certificates is not None:
|
|
207
218
|
if not valid_certificates(certificates):
|
|
208
219
|
sys.exit(1)
|
flwr/common/heartbeat.py
ADDED
|
@@ -0,0 +1,165 @@
|
|
|
1
|
+
# Copyright 2025 Flower Labs GmbH. All Rights Reserved.
|
|
2
|
+
#
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
#
|
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
+
#
|
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
|
+
# See the License for the specific language governing permissions and
|
|
13
|
+
# limitations under the License.
|
|
14
|
+
# ==============================================================================
|
|
15
|
+
"""Heartbeat sender."""
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
import random
|
|
19
|
+
import threading
|
|
20
|
+
from typing import Callable, Union
|
|
21
|
+
|
|
22
|
+
import grpc
|
|
23
|
+
|
|
24
|
+
# pylint: disable=E0611
|
|
25
|
+
from flwr.proto.heartbeat_pb2 import SendAppHeartbeatRequest
|
|
26
|
+
from flwr.proto.serverappio_pb2_grpc import ServerAppIoStub
|
|
27
|
+
from flwr.proto.simulationio_pb2_grpc import SimulationIoStub
|
|
28
|
+
|
|
29
|
+
# pylint: enable=E0611
|
|
30
|
+
from .constant import (
|
|
31
|
+
HEARTBEAT_BASE_MULTIPLIER,
|
|
32
|
+
HEARTBEAT_CALL_TIMEOUT,
|
|
33
|
+
HEARTBEAT_DEFAULT_INTERVAL,
|
|
34
|
+
HEARTBEAT_RANDOM_RANGE,
|
|
35
|
+
)
|
|
36
|
+
from .retry_invoker import RetryInvoker, exponential
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
class HeartbeatFailure(Exception):
|
|
40
|
+
"""Exception raised when a heartbeat fails."""
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
class HeartbeatSender:
|
|
44
|
+
"""Periodically send heartbeat signals to a server in a background thread.
|
|
45
|
+
|
|
46
|
+
This class uses the provided `heartbeat_fn` to send heartbeats. If a heartbeat
|
|
47
|
+
attempt fails, it will be retried using an exponential backoff strategy.
|
|
48
|
+
|
|
49
|
+
Parameters
|
|
50
|
+
----------
|
|
51
|
+
heartbeat_fn : Callable[[], bool]
|
|
52
|
+
Function used to send a heartbeat signal. It should return True if the heartbeat
|
|
53
|
+
succeeds, or False if it fails. Any internal exceptions (e.g., gRPC errors)
|
|
54
|
+
should be handled within this function to ensure boolean return values.
|
|
55
|
+
"""
|
|
56
|
+
|
|
57
|
+
def __init__(
|
|
58
|
+
self,
|
|
59
|
+
heartbeat_fn: Callable[[], bool],
|
|
60
|
+
) -> None:
|
|
61
|
+
self.heartbeat_fn = heartbeat_fn
|
|
62
|
+
self._stop_event = threading.Event()
|
|
63
|
+
self._thread = threading.Thread(target=self._run, daemon=True)
|
|
64
|
+
self._retry_invoker = RetryInvoker(
|
|
65
|
+
lambda: exponential(max_delay=20),
|
|
66
|
+
HeartbeatFailure, # The only exception we want to retry on
|
|
67
|
+
max_tries=None,
|
|
68
|
+
max_time=None,
|
|
69
|
+
# Allow the stop event to interrupt the wait
|
|
70
|
+
wait_function=self._stop_event.wait, # type: ignore
|
|
71
|
+
)
|
|
72
|
+
|
|
73
|
+
def start(self) -> None:
|
|
74
|
+
"""Start the heartbeat sender."""
|
|
75
|
+
if self._thread.is_alive():
|
|
76
|
+
raise RuntimeError("Heartbeat sender is already running.")
|
|
77
|
+
if self._stop_event.is_set():
|
|
78
|
+
raise RuntimeError("Cannot start a stopped heartbeat sender.")
|
|
79
|
+
self._thread.start()
|
|
80
|
+
|
|
81
|
+
def stop(self) -> None:
|
|
82
|
+
"""Stop the heartbeat sender."""
|
|
83
|
+
if not self._thread.is_alive():
|
|
84
|
+
raise RuntimeError("Heartbeat sender is not running.")
|
|
85
|
+
self._stop_event.set()
|
|
86
|
+
self._thread.join()
|
|
87
|
+
|
|
88
|
+
@property
|
|
89
|
+
def is_running(self) -> bool:
|
|
90
|
+
"""Return True if the heartbeat sender is running, False otherwise."""
|
|
91
|
+
return self._thread.is_alive() and not self._stop_event.is_set()
|
|
92
|
+
|
|
93
|
+
def _run(self) -> None:
|
|
94
|
+
"""Periodically send heartbeats until stopped."""
|
|
95
|
+
while not self._stop_event.is_set():
|
|
96
|
+
# Attempt to send a heartbeat with retry on failure
|
|
97
|
+
self._retry_invoker.invoke(self._heartbeat)
|
|
98
|
+
|
|
99
|
+
# Calculate the interval for the next heartbeat
|
|
100
|
+
# Formula: next_interval = (interval - timeout) * random.uniform(0.7, 0.9)
|
|
101
|
+
rd = random.uniform(*HEARTBEAT_RANDOM_RANGE)
|
|
102
|
+
next_interval: float = HEARTBEAT_DEFAULT_INTERVAL - HEARTBEAT_CALL_TIMEOUT
|
|
103
|
+
next_interval *= HEARTBEAT_BASE_MULTIPLIER + rd
|
|
104
|
+
|
|
105
|
+
# Wait for the calculated interval or exit early if stopped
|
|
106
|
+
self._stop_event.wait(next_interval)
|
|
107
|
+
|
|
108
|
+
def _heartbeat(self) -> None:
|
|
109
|
+
"""Send a single heartbeat and raise an exception if it fails.
|
|
110
|
+
|
|
111
|
+
Call the provided `heartbeat_fn`. If the function returns False,
|
|
112
|
+
a `HeartbeatFailure` exception is raised to trigger the retry mechanism.
|
|
113
|
+
"""
|
|
114
|
+
if not self._stop_event.is_set():
|
|
115
|
+
if not self.heartbeat_fn():
|
|
116
|
+
raise HeartbeatFailure
|
|
117
|
+
|
|
118
|
+
|
|
119
|
+
def get_grpc_app_heartbeat_fn(
|
|
120
|
+
stub: Union[ServerAppIoStub, SimulationIoStub],
|
|
121
|
+
run_id: int,
|
|
122
|
+
*,
|
|
123
|
+
failure_message: str,
|
|
124
|
+
) -> Callable[[], bool]:
|
|
125
|
+
"""Get the function to send a heartbeat to gRPC endpoint.
|
|
126
|
+
|
|
127
|
+
This function is for app heartbeats only. It is not used for node heartbeats.
|
|
128
|
+
|
|
129
|
+
Parameters
|
|
130
|
+
----------
|
|
131
|
+
stub : Union[ServerAppIoStub, SimulationIoStub]
|
|
132
|
+
gRPC stub to send the heartbeat.
|
|
133
|
+
run_id : int
|
|
134
|
+
The run ID to use in the heartbeat request.
|
|
135
|
+
failure_message : str
|
|
136
|
+
Error message to raise if the heartbeat fails.
|
|
137
|
+
|
|
138
|
+
Returns
|
|
139
|
+
-------
|
|
140
|
+
Callable[[], bool]
|
|
141
|
+
Function that sends a heartbeat to the gRPC endpoint.
|
|
142
|
+
"""
|
|
143
|
+
# Construct the heartbeat request
|
|
144
|
+
req = SendAppHeartbeatRequest(
|
|
145
|
+
run_id=run_id, heartbeat_interval=HEARTBEAT_DEFAULT_INTERVAL
|
|
146
|
+
)
|
|
147
|
+
|
|
148
|
+
def fn() -> bool:
|
|
149
|
+
# Call ServerAppIo API
|
|
150
|
+
try:
|
|
151
|
+
res = stub.SendAppHeartbeat(req)
|
|
152
|
+
except grpc.RpcError as e:
|
|
153
|
+
status_code = e.code()
|
|
154
|
+
if status_code == grpc.StatusCode.UNAVAILABLE:
|
|
155
|
+
return False
|
|
156
|
+
if status_code == grpc.StatusCode.DEADLINE_EXCEEDED:
|
|
157
|
+
return False
|
|
158
|
+
raise
|
|
159
|
+
|
|
160
|
+
# Check if not successful
|
|
161
|
+
if not res.success:
|
|
162
|
+
raise RuntimeError(failure_message)
|
|
163
|
+
return True
|
|
164
|
+
|
|
165
|
+
return fn
|