flwr-nightly 1.19.0.dev20250511__py3-none-any.whl → 1.19.0.dev20250513__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- flwr/common/constant.py +1 -0
- flwr/common/heartbeat.py +5 -0
- flwr/common/inflatable.py +14 -1
- flwr/common/record/array.py +10 -1
- flwr/common/record/arrayrecord.py +65 -1
- flwr/common/record/configrecord.py +10 -1
- flwr/common/record/metricrecord.py +10 -1
- flwr/common/record/recorddict.py +69 -1
- flwr/server/superlink/linkstate/in_memory_linkstate.py +84 -4
- flwr/server/superlink/linkstate/linkstate.py +23 -0
- flwr/server/superlink/linkstate/sqlite_linkstate.py +98 -8
- {flwr_nightly-1.19.0.dev20250511.dist-info → flwr_nightly-1.19.0.dev20250513.dist-info}/METADATA +1 -1
- {flwr_nightly-1.19.0.dev20250511.dist-info → flwr_nightly-1.19.0.dev20250513.dist-info}/RECORD +15 -15
- {flwr_nightly-1.19.0.dev20250511.dist-info → flwr_nightly-1.19.0.dev20250513.dist-info}/WHEEL +0 -0
- {flwr_nightly-1.19.0.dev20250511.dist-info → flwr_nightly-1.19.0.dev20250513.dist-info}/entry_points.txt +0 -0
flwr/common/constant.py
CHANGED
flwr/common/heartbeat.py
CHANGED
@@ -77,6 +77,11 @@ class HeartbeatSender:
|
|
77
77
|
self._stop_event.set()
|
78
78
|
self._thread.join()
|
79
79
|
|
80
|
+
@property
|
81
|
+
def is_running(self) -> bool:
|
82
|
+
"""Return True if the heartbeat sender is running, False otherwise."""
|
83
|
+
return self._thread.is_alive() and not self._stop_event.is_set()
|
84
|
+
|
80
85
|
def _run(self) -> None:
|
81
86
|
"""Periodically send heartbeats until stopped."""
|
82
87
|
while not self._stop_event.is_set():
|
flwr/common/inflatable.py
CHANGED
@@ -15,6 +15,8 @@
|
|
15
15
|
"""InflatableObject base class."""
|
16
16
|
|
17
17
|
|
18
|
+
from __future__ import annotations
|
19
|
+
|
18
20
|
import hashlib
|
19
21
|
from typing import TypeVar
|
20
22
|
|
@@ -31,7 +33,9 @@ class InflatableObject:
|
|
31
33
|
raise NotImplementedError()
|
32
34
|
|
33
35
|
@classmethod
|
34
|
-
def inflate(
|
36
|
+
def inflate(
|
37
|
+
cls, object_content: bytes, children: dict[str, InflatableObject] | None = None
|
38
|
+
) -> InflatableObject:
|
35
39
|
"""Inflate the object from bytes.
|
36
40
|
|
37
41
|
Parameters
|
@@ -39,6 +43,10 @@ class InflatableObject:
|
|
39
43
|
object_content : bytes
|
40
44
|
The deflated object content.
|
41
45
|
|
46
|
+
children : Optional[dict[str, InflatableObject]] (default: None)
|
47
|
+
Dictionary of children InflatableObjects mapped to their object IDs. These
|
48
|
+
children enable the full inflation of the parent InflatableObject.
|
49
|
+
|
42
50
|
Returns
|
43
51
|
-------
|
44
52
|
InflatableObject
|
@@ -51,6 +59,11 @@ class InflatableObject:
|
|
51
59
|
"""Get object_id."""
|
52
60
|
return get_object_id(self.deflate())
|
53
61
|
|
62
|
+
@property
|
63
|
+
def children(self) -> dict[str, InflatableObject] | None:
|
64
|
+
"""Get all child objects as a dictionary or None if there are no children."""
|
65
|
+
return None
|
66
|
+
|
54
67
|
|
55
68
|
def get_object_id(object_content: bytes) -> str:
|
56
69
|
"""Return a SHA-256 hash of the (deflated) object content."""
|
flwr/common/record/array.py
CHANGED
@@ -265,7 +265,9 @@ class Array(InflatableObject):
|
|
265
265
|
return add_header_to_object_body(object_body=obj_body, cls=self)
|
266
266
|
|
267
267
|
@classmethod
|
268
|
-
def inflate(
|
268
|
+
def inflate(
|
269
|
+
cls, object_content: bytes, children: dict[str, InflatableObject] | None = None
|
270
|
+
) -> Array:
|
269
271
|
"""Inflate an Array from bytes.
|
270
272
|
|
271
273
|
Parameters
|
@@ -273,11 +275,18 @@ class Array(InflatableObject):
|
|
273
275
|
object_content : bytes
|
274
276
|
The deflated object content of the Array.
|
275
277
|
|
278
|
+
children : Optional[dict[str, InflatableObject]] (default: None)
|
279
|
+
Must be ``None``. ``Array`` does not support child objects.
|
280
|
+
Providing any children will raise a ``ValueError``.
|
281
|
+
|
276
282
|
Returns
|
277
283
|
-------
|
278
284
|
Array
|
279
285
|
The inflated Array.
|
280
286
|
"""
|
287
|
+
if children is not None:
|
288
|
+
raise ValueError("`Array` objects do not have children.")
|
289
|
+
|
281
290
|
obj_body = get_object_body(object_content, cls)
|
282
291
|
proto_array = ArrayProto.FromString(obj_body)
|
283
292
|
return cls(
|
@@ -18,6 +18,7 @@
|
|
18
18
|
from __future__ import annotations
|
19
19
|
|
20
20
|
import gc
|
21
|
+
import json
|
21
22
|
import sys
|
22
23
|
from collections import OrderedDict
|
23
24
|
from logging import WARN
|
@@ -26,6 +27,7 @@ from typing import TYPE_CHECKING, Any, cast, overload
|
|
26
27
|
import numpy as np
|
27
28
|
|
28
29
|
from ..constant import GC_THRESHOLD
|
30
|
+
from ..inflatable import InflatableObject, add_header_to_object_body, get_object_body
|
29
31
|
from ..logger import log
|
30
32
|
from ..typing import NDArray
|
31
33
|
from .array import Array
|
@@ -56,7 +58,7 @@ def _check_value(value: Array) -> None:
|
|
56
58
|
)
|
57
59
|
|
58
60
|
|
59
|
-
class ArrayRecord(TypedDict[str, Array]):
|
61
|
+
class ArrayRecord(TypedDict[str, Array], InflatableObject):
|
60
62
|
"""Array record.
|
61
63
|
|
62
64
|
A typed dictionary (``str`` to :class:`Array`) that can store named arrays,
|
@@ -365,6 +367,68 @@ class ArrayRecord(TypedDict[str, Array]):
|
|
365
367
|
|
366
368
|
return num_bytes
|
367
369
|
|
370
|
+
@property
|
371
|
+
def children(self) -> dict[str, InflatableObject]:
|
372
|
+
"""Return a dictionary of Arrays with their Object IDs as keys."""
|
373
|
+
return {arr.object_id: arr for arr in self.values()}
|
374
|
+
|
375
|
+
def deflate(self) -> bytes:
|
376
|
+
"""Deflate the ArrayRecord."""
|
377
|
+
# array_name: array_object_id mapping
|
378
|
+
array_refs: dict[str, str] = {}
|
379
|
+
|
380
|
+
for array_name, array in self.items():
|
381
|
+
array_refs[array_name] = array.object_id
|
382
|
+
|
383
|
+
# Serialize references dict
|
384
|
+
object_body = json.dumps(array_refs).encode("utf-8")
|
385
|
+
return add_header_to_object_body(object_body=object_body, cls=self)
|
386
|
+
|
387
|
+
@classmethod
|
388
|
+
def inflate(
|
389
|
+
cls, object_content: bytes, children: dict[str, InflatableObject] | None = None
|
390
|
+
) -> ArrayRecord:
|
391
|
+
"""Inflate an ArrayRecord from bytes.
|
392
|
+
|
393
|
+
Parameters
|
394
|
+
----------
|
395
|
+
object_content : bytes
|
396
|
+
The deflated object content of the ArrayRecord.
|
397
|
+
children : Optional[dict[str, InflatableObject]] (default: None)
|
398
|
+
Dictionary of children InflatableObjects mapped to their Object IDs.
|
399
|
+
These children enable the full inflation of the ArrayRecord.
|
400
|
+
|
401
|
+
Returns
|
402
|
+
-------
|
403
|
+
ArrayRecord
|
404
|
+
The inflated ArrayRecord.
|
405
|
+
"""
|
406
|
+
if children is None:
|
407
|
+
children = {}
|
408
|
+
|
409
|
+
# Inflate mapping of array_names (keys in the ArrayRecord) to Arrays' object IDs
|
410
|
+
obj_body = get_object_body(object_content, cls)
|
411
|
+
array_refs: dict[str, str] = json.loads(obj_body.decode(encoding="utf-8"))
|
412
|
+
|
413
|
+
unique_arrays = set(array_refs.values())
|
414
|
+
children_obj_ids = set(children.keys())
|
415
|
+
if unique_arrays != children_obj_ids:
|
416
|
+
raise ValueError(
|
417
|
+
"Unexpected set of `children`. "
|
418
|
+
f"Expected {unique_arrays} but got {children_obj_ids}."
|
419
|
+
)
|
420
|
+
|
421
|
+
# Ensure children are of type Array
|
422
|
+
if not all(isinstance(arr, Array) for arr in children.values()):
|
423
|
+
raise ValueError("`Children` are expected to be of type `Array`.")
|
424
|
+
|
425
|
+
# Instantiate new ArrayRecord
|
426
|
+
return ArrayRecord(
|
427
|
+
OrderedDict(
|
428
|
+
{name: children[object_id] for name, object_id in array_refs.items()}
|
429
|
+
)
|
430
|
+
)
|
431
|
+
|
368
432
|
|
369
433
|
class ParametersRecord(ArrayRecord):
|
370
434
|
"""Deprecated class ``ParametersRecord``, use ``ArrayRecord`` instead.
|
@@ -185,7 +185,9 @@ class ConfigRecord(TypedDict[str, ConfigRecordValues], InflatableObject):
|
|
185
185
|
return add_header_to_object_body(object_body=obj_body, cls=self)
|
186
186
|
|
187
187
|
@classmethod
|
188
|
-
def inflate(
|
188
|
+
def inflate(
|
189
|
+
cls, object_content: bytes, children: dict[str, InflatableObject] | None = None
|
190
|
+
) -> ConfigRecord:
|
189
191
|
"""Inflate a ConfigRecord from bytes.
|
190
192
|
|
191
193
|
Parameters
|
@@ -193,11 +195,18 @@ class ConfigRecord(TypedDict[str, ConfigRecordValues], InflatableObject):
|
|
193
195
|
object_content : bytes
|
194
196
|
The deflated object content of the ConfigRecord.
|
195
197
|
|
198
|
+
children : Optional[dict[str, InflatableObject]] (default: None)
|
199
|
+
Must be ``None``. ``ConfigRecord`` does not support child objects.
|
200
|
+
Providing any children will raise a ``ValueError``.
|
201
|
+
|
196
202
|
Returns
|
197
203
|
-------
|
198
204
|
ConfigRecord
|
199
205
|
The inflated ConfigRecord.
|
200
206
|
"""
|
207
|
+
if children is not None:
|
208
|
+
raise ValueError("`ConfigRecord` objects do not have children.")
|
209
|
+
|
201
210
|
obj_body = get_object_body(object_content, cls)
|
202
211
|
config_record_proto = ProtoConfigRecord.FromString(obj_body)
|
203
212
|
|
@@ -160,7 +160,9 @@ class MetricRecord(TypedDict[str, MetricRecordValues], InflatableObject):
|
|
160
160
|
return add_header_to_object_body(object_body=obj_body, cls=self)
|
161
161
|
|
162
162
|
@classmethod
|
163
|
-
def inflate(
|
163
|
+
def inflate(
|
164
|
+
cls, object_content: bytes, children: dict[str, InflatableObject] | None = None
|
165
|
+
) -> MetricRecord:
|
164
166
|
"""Inflate a MetricRecord from bytes.
|
165
167
|
|
166
168
|
Parameters
|
@@ -168,11 +170,18 @@ class MetricRecord(TypedDict[str, MetricRecordValues], InflatableObject):
|
|
168
170
|
object_content : bytes
|
169
171
|
The deflated object content of the MetricRecord.
|
170
172
|
|
173
|
+
children : Optional[dict[str, InflatableObject]] (default: None)
|
174
|
+
Must be ``None``. ``MetricRecord`` does not support child objects.
|
175
|
+
Providing any children will raise a ``ValueError``.
|
176
|
+
|
171
177
|
Returns
|
172
178
|
-------
|
173
179
|
MetricRecord
|
174
180
|
The inflated MetricRecord.
|
175
181
|
"""
|
182
|
+
if children is not None:
|
183
|
+
raise ValueError("`MetricRecord` objects do not have children.")
|
184
|
+
|
176
185
|
obj_body = get_object_body(object_content, cls)
|
177
186
|
metric_record_proto = ProtoMetricRecord.FromString(obj_body)
|
178
187
|
|
flwr/common/record/recorddict.py
CHANGED
@@ -17,10 +17,12 @@
|
|
17
17
|
|
18
18
|
from __future__ import annotations
|
19
19
|
|
20
|
+
import json
|
20
21
|
from logging import WARN
|
21
22
|
from textwrap import indent
|
22
23
|
from typing import TypeVar, Union, cast
|
23
24
|
|
25
|
+
from ..inflatable import InflatableObject, add_header_to_object_body, get_object_body
|
24
26
|
from ..logger import log
|
25
27
|
from .arrayrecord import ArrayRecord
|
26
28
|
from .configrecord import ConfigRecord
|
@@ -97,7 +99,7 @@ class _SyncedDict(TypedDict[str, T]):
|
|
97
99
|
)
|
98
100
|
|
99
101
|
|
100
|
-
class RecordDict(TypedDict[str, RecordType]):
|
102
|
+
class RecordDict(TypedDict[str, RecordType], InflatableObject):
|
101
103
|
"""RecordDict stores groups of arrays, metrics and configs.
|
102
104
|
|
103
105
|
A :class:`RecordDict` is the unified mechanism by which arrays,
|
@@ -286,6 +288,72 @@ class RecordDict(TypedDict[str, RecordType]):
|
|
286
288
|
)
|
287
289
|
return self.config_records
|
288
290
|
|
291
|
+
@property
|
292
|
+
def children(self) -> dict[str, InflatableObject]:
|
293
|
+
"""Return a dictionary of records with their Object IDs as keys."""
|
294
|
+
return {record.object_id: record for record in self.values()}
|
295
|
+
|
296
|
+
def deflate(self) -> bytes:
|
297
|
+
"""Deflate the RecordDict."""
|
298
|
+
# record_name: record_object_id mapping
|
299
|
+
record_refs: dict[str, str] = {}
|
300
|
+
|
301
|
+
for record_name, record in self.items():
|
302
|
+
record_refs[record_name] = record.object_id
|
303
|
+
|
304
|
+
# Serialize references dict
|
305
|
+
object_body = json.dumps(record_refs).encode("utf-8")
|
306
|
+
return add_header_to_object_body(object_body=object_body, cls=self)
|
307
|
+
|
308
|
+
@classmethod
|
309
|
+
def inflate(
|
310
|
+
cls, object_content: bytes, children: dict[str, InflatableObject] | None = None
|
311
|
+
) -> RecordDict:
|
312
|
+
"""Inflate an RecordDict from bytes.
|
313
|
+
|
314
|
+
Parameters
|
315
|
+
----------
|
316
|
+
object_content : bytes
|
317
|
+
The deflated object content of the RecordDict.
|
318
|
+
children : Optional[dict[str, InflatableObject]] (default: None)
|
319
|
+
Dictionary of children InflatableObjects mapped to their Object IDs.
|
320
|
+
These children enable the full inflation of the RecordDict. Default is None.
|
321
|
+
|
322
|
+
Returns
|
323
|
+
-------
|
324
|
+
RecordDict
|
325
|
+
The inflated RecordDict.
|
326
|
+
"""
|
327
|
+
if children is None:
|
328
|
+
children = {}
|
329
|
+
|
330
|
+
# Inflate mapping of record_names (keys in the RecordDict) to Records' object IDs
|
331
|
+
obj_body = get_object_body(object_content, cls)
|
332
|
+
record_refs: dict[str, str] = json.loads(obj_body.decode(encoding="utf-8"))
|
333
|
+
|
334
|
+
unique_records = set(record_refs.values())
|
335
|
+
children_obj_ids = set(children.keys())
|
336
|
+
if unique_records != children_obj_ids:
|
337
|
+
raise ValueError(
|
338
|
+
"Unexpected set of `children`. "
|
339
|
+
f"Expected {unique_records} but got {children_obj_ids}."
|
340
|
+
)
|
341
|
+
|
342
|
+
# Ensure children are one of the *Record objects expected in a RecordDict
|
343
|
+
if not all(
|
344
|
+
isinstance(ch, (ArrayRecord, ConfigRecord, MetricRecord))
|
345
|
+
for ch in children.values()
|
346
|
+
):
|
347
|
+
raise ValueError(
|
348
|
+
"`Children` are expected to be of type `ArrayRecord`, "
|
349
|
+
"`ConfigRecord` or `MetricRecord`."
|
350
|
+
)
|
351
|
+
|
352
|
+
# Instantiate new RecordDict
|
353
|
+
return RecordDict(
|
354
|
+
{name: children[object_id] for name, object_id in record_refs.items()} # type: ignore
|
355
|
+
)
|
356
|
+
|
289
357
|
|
290
358
|
class RecordSet(RecordDict):
|
291
359
|
"""Deprecated class ``RecordSet``, use ``RecordDict`` instead.
|
@@ -25,12 +25,15 @@ from uuid import UUID, uuid4
|
|
25
25
|
|
26
26
|
from flwr.common import Context, Message, log, now
|
27
27
|
from flwr.common.constant import (
|
28
|
+
HEARTBEAT_MAX_INTERVAL,
|
28
29
|
HEARTBEAT_PATIENCE,
|
29
30
|
MESSAGE_TTL_TOLERANCE,
|
30
31
|
NODE_ID_NUM_BYTES,
|
32
|
+
RUN_FAILURE_DETAILS_NO_HEARTBEAT,
|
31
33
|
RUN_ID_NUM_BYTES,
|
32
34
|
SUPERLINK_NODE_ID,
|
33
35
|
Status,
|
36
|
+
SubStatus,
|
34
37
|
)
|
35
38
|
from flwr.common.record import ConfigRecord
|
36
39
|
from flwr.common.typing import Run, RunStatus, UserConfig
|
@@ -52,8 +55,11 @@ class RunRecord: # pylint: disable=R0902
|
|
52
55
|
"""The record of a specific run, including its status and timestamps."""
|
53
56
|
|
54
57
|
run: Run
|
58
|
+
active_until: float = 0.0
|
59
|
+
heartbeat_interval: float = 0.0
|
55
60
|
logs: list[tuple[float, str]] = field(default_factory=list)
|
56
61
|
log_lock: threading.Lock = field(default_factory=threading.Lock)
|
62
|
+
lock: threading.RLock = field(default_factory=threading.RLock)
|
57
63
|
|
58
64
|
|
59
65
|
class InMemoryLinkState(LinkState): # pylint: disable=R0902,R0904
|
@@ -461,8 +467,29 @@ class InMemoryLinkState(LinkState): # pylint: disable=R0902,R0904
|
|
461
467
|
with self.lock:
|
462
468
|
return set(self.run_ids.keys())
|
463
469
|
|
470
|
+
def _check_and_tag_inactive_run(self, run_ids: set[int]) -> None:
|
471
|
+
"""Check if any runs are no longer active.
|
472
|
+
|
473
|
+
Marks runs with status 'starting' or 'running' as failed
|
474
|
+
if they have not sent a heartbeat before `active_until`.
|
475
|
+
"""
|
476
|
+
current = now()
|
477
|
+
for record in [self.run_ids[run_id] for run_id in run_ids]:
|
478
|
+
with record.lock:
|
479
|
+
if record.run.status.status in (Status.STARTING, Status.RUNNING):
|
480
|
+
if record.active_until < current.timestamp():
|
481
|
+
record.run.status = RunStatus(
|
482
|
+
status=Status.FINISHED,
|
483
|
+
sub_status=SubStatus.FAILED,
|
484
|
+
details=RUN_FAILURE_DETAILS_NO_HEARTBEAT,
|
485
|
+
)
|
486
|
+
record.run.finished_at = now().isoformat()
|
487
|
+
|
464
488
|
def get_run(self, run_id: int) -> Optional[Run]:
|
465
489
|
"""Retrieve information about the run with the specified `run_id`."""
|
490
|
+
# Check if runs are still active
|
491
|
+
self._check_and_tag_inactive_run(run_ids={run_id})
|
492
|
+
|
466
493
|
with self.lock:
|
467
494
|
if run_id not in self.run_ids:
|
468
495
|
log(ERROR, "`run_id` is invalid")
|
@@ -471,6 +498,9 @@ class InMemoryLinkState(LinkState): # pylint: disable=R0902,R0904
|
|
471
498
|
|
472
499
|
def get_run_status(self, run_ids: set[int]) -> dict[int, RunStatus]:
|
473
500
|
"""Retrieve the statuses for the specified runs."""
|
501
|
+
# Check if runs are still active
|
502
|
+
self._check_and_tag_inactive_run(run_ids=run_ids)
|
503
|
+
|
474
504
|
with self.lock:
|
475
505
|
return {
|
476
506
|
run_id: self.run_ids[run_id].run.status
|
@@ -480,12 +510,16 @@ class InMemoryLinkState(LinkState): # pylint: disable=R0902,R0904
|
|
480
510
|
|
481
511
|
def update_run_status(self, run_id: int, new_status: RunStatus) -> bool:
|
482
512
|
"""Update the status of the run with the specified `run_id`."""
|
513
|
+
# Check if runs are still active
|
514
|
+
self._check_and_tag_inactive_run(run_ids={run_id})
|
515
|
+
|
483
516
|
with self.lock:
|
484
517
|
# Check if the run_id exists
|
485
518
|
if run_id not in self.run_ids:
|
486
519
|
log(ERROR, "`run_id` is invalid")
|
487
520
|
return False
|
488
521
|
|
522
|
+
with self.run_ids[run_id].lock:
|
489
523
|
# Check if the status transition is valid
|
490
524
|
current_status = self.run_ids[run_id].run.status
|
491
525
|
if not is_valid_transition(current_status, new_status):
|
@@ -507,14 +541,23 @@ class InMemoryLinkState(LinkState): # pylint: disable=R0902,R0904
|
|
507
541
|
)
|
508
542
|
return False
|
509
543
|
|
510
|
-
#
|
544
|
+
# Initialize heartbeat_interval and active_until
|
545
|
+
# when switching to starting or running
|
546
|
+
current = now()
|
511
547
|
run_record = self.run_ids[run_id]
|
548
|
+
if new_status.status in (Status.STARTING, Status.RUNNING):
|
549
|
+
run_record.heartbeat_interval = HEARTBEAT_MAX_INTERVAL
|
550
|
+
run_record.active_until = (
|
551
|
+
current.timestamp() + run_record.heartbeat_interval
|
552
|
+
)
|
553
|
+
|
554
|
+
# Update the run status
|
512
555
|
if new_status.status == Status.STARTING:
|
513
|
-
run_record.run.starting_at =
|
556
|
+
run_record.run.starting_at = current.isoformat()
|
514
557
|
elif new_status.status == Status.RUNNING:
|
515
|
-
run_record.run.running_at =
|
558
|
+
run_record.run.running_at = current.isoformat()
|
516
559
|
elif new_status.status == Status.FINISHED:
|
517
|
-
run_record.run.finished_at =
|
560
|
+
run_record.run.finished_at = current.isoformat()
|
518
561
|
run_record.run.status = new_status
|
519
562
|
return True
|
520
563
|
|
@@ -558,6 +601,43 @@ class InMemoryLinkState(LinkState): # pylint: disable=R0902,R0904
|
|
558
601
|
return True
|
559
602
|
return False
|
560
603
|
|
604
|
+
def acknowledge_app_heartbeat(self, run_id: int, heartbeat_interval: float) -> bool:
|
605
|
+
"""Acknowledge a heartbeat received from a ServerApp for a given run.
|
606
|
+
|
607
|
+
A run with status `"running"` is considered alive as long as it sends heartbeats
|
608
|
+
within the tolerated interval: HEARTBEAT_PATIENCE × heartbeat_interval.
|
609
|
+
HEARTBEAT_PATIENCE = N allows for N-1 missed heartbeats before the run is
|
610
|
+
marked as `"finished:failed"`.
|
611
|
+
"""
|
612
|
+
with self.lock:
|
613
|
+
# Search for the run
|
614
|
+
record = self.run_ids.get(run_id)
|
615
|
+
|
616
|
+
# Check if the run_id exists
|
617
|
+
if record is None:
|
618
|
+
log(ERROR, "`run_id` is invalid")
|
619
|
+
return False
|
620
|
+
|
621
|
+
with record.lock:
|
622
|
+
# Check if runs are still active
|
623
|
+
self._check_and_tag_inactive_run(run_ids={run_id})
|
624
|
+
|
625
|
+
# Check if the run is of status "running"/"starting"
|
626
|
+
current_status = record.run.status
|
627
|
+
if current_status.status not in (Status.RUNNING, Status.STARTING):
|
628
|
+
log(
|
629
|
+
ERROR,
|
630
|
+
'Cannot acknowledge heartbeat for run with status "%s"',
|
631
|
+
current_status.status,
|
632
|
+
)
|
633
|
+
return False
|
634
|
+
|
635
|
+
# Update the `active_until` and `heartbeat_interval` for the given run
|
636
|
+
current = now().timestamp()
|
637
|
+
record.active_until = current + HEARTBEAT_PATIENCE * heartbeat_interval
|
638
|
+
record.heartbeat_interval = heartbeat_interval
|
639
|
+
return True
|
640
|
+
|
561
641
|
def get_serverapp_context(self, run_id: int) -> Optional[Context]:
|
562
642
|
"""Get the context for the specified `run_id`."""
|
563
643
|
return self.contexts.get(run_id)
|
@@ -292,6 +292,29 @@ class LinkState(abc.ABC): # pylint: disable=R0904
|
|
292
292
|
True if the heartbeat is successfully acknowledged; otherwise, False.
|
293
293
|
"""
|
294
294
|
|
295
|
+
@abc.abstractmethod
|
296
|
+
def acknowledge_app_heartbeat(self, run_id: int, heartbeat_interval: float) -> bool:
|
297
|
+
"""Acknowledge a heartbeat received from a ServerApp for a given run.
|
298
|
+
|
299
|
+
A run with status `"running"` is considered alive as long as it sends heartbeats
|
300
|
+
within the tolerated interval: HEARTBEAT_PATIENCE × heartbeat_interval.
|
301
|
+
HEARTBEAT_PATIENCE = N allows for N-1 missed heartbeats before the run is
|
302
|
+
marked as `"finished:failed"`.
|
303
|
+
|
304
|
+
Parameters
|
305
|
+
----------
|
306
|
+
run_id : int
|
307
|
+
The `run_id` from which the heartbeat was received.
|
308
|
+
heartbeat_interval : float
|
309
|
+
The interval (in seconds) from the current timestamp within which the next
|
310
|
+
heartbeat from the ServerApp for this run must be received.
|
311
|
+
|
312
|
+
Returns
|
313
|
+
-------
|
314
|
+
is_acknowledged : bool
|
315
|
+
True if the heartbeat is successfully acknowledged; otherwise, False.
|
316
|
+
"""
|
317
|
+
|
295
318
|
@abc.abstractmethod
|
296
319
|
def get_serverapp_context(self, run_id: int) -> Optional[Context]:
|
297
320
|
"""Get the context for the specified `run_id`.
|
@@ -28,12 +28,15 @@ from uuid import UUID, uuid4
|
|
28
28
|
|
29
29
|
from flwr.common import Context, Message, Metadata, log, now
|
30
30
|
from flwr.common.constant import (
|
31
|
+
HEARTBEAT_MAX_INTERVAL,
|
31
32
|
HEARTBEAT_PATIENCE,
|
32
33
|
MESSAGE_TTL_TOLERANCE,
|
33
34
|
NODE_ID_NUM_BYTES,
|
35
|
+
RUN_FAILURE_DETAILS_NO_HEARTBEAT,
|
34
36
|
RUN_ID_NUM_BYTES,
|
35
37
|
SUPERLINK_NODE_ID,
|
36
38
|
Status,
|
39
|
+
SubStatus,
|
37
40
|
)
|
38
41
|
from flwr.common.message import make_message
|
39
42
|
from flwr.common.record import ConfigRecord
|
@@ -92,6 +95,8 @@ CREATE INDEX IF NOT EXISTS idx_online_until ON node (online_until);
|
|
92
95
|
SQL_CREATE_TABLE_RUN = """
|
93
96
|
CREATE TABLE IF NOT EXISTS run(
|
94
97
|
run_id INTEGER UNIQUE,
|
98
|
+
active_until REAL,
|
99
|
+
heartbeat_interval REAL,
|
95
100
|
fab_id TEXT,
|
96
101
|
fab_version TEXT,
|
97
102
|
fab_hash TEXT,
|
@@ -742,20 +747,21 @@ class SqliteLinkState(LinkState): # pylint: disable=R0904
|
|
742
747
|
if self.query(query, (sint64_run_id,))[0]["COUNT(*)"] == 0:
|
743
748
|
query = (
|
744
749
|
"INSERT INTO run "
|
745
|
-
"(run_id,
|
746
|
-
"
|
747
|
-
"sub_status, details)
|
750
|
+
"(run_id, active_until, heartbeat_interval, fab_id, fab_version, "
|
751
|
+
"fab_hash, override_config, federation_options, pending_at, "
|
752
|
+
"starting_at, running_at, finished_at, sub_status, details) "
|
753
|
+
"VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?);"
|
748
754
|
)
|
749
755
|
override_config_json = json.dumps(override_config)
|
750
756
|
data = [
|
751
757
|
sint64_run_id,
|
758
|
+
0, # The `active_until` is not used until the run is started
|
759
|
+
0, # This `heartbeat_interval` is not used until the run is started
|
752
760
|
fab_id,
|
753
761
|
fab_version,
|
754
762
|
fab_hash,
|
755
763
|
override_config_json,
|
756
764
|
configrecord_to_bytes(federation_options),
|
757
|
-
]
|
758
|
-
data += [
|
759
765
|
now().isoformat(),
|
760
766
|
"",
|
761
767
|
"",
|
@@ -796,8 +802,33 @@ class SqliteLinkState(LinkState): # pylint: disable=R0904
|
|
796
802
|
rows = self.query(query)
|
797
803
|
return {convert_sint64_to_uint64(row["run_id"]) for row in rows}
|
798
804
|
|
805
|
+
def _check_and_tag_inactive_run(self, run_ids: set[int]) -> None:
|
806
|
+
"""Check if any runs are no longer active.
|
807
|
+
|
808
|
+
Marks runs with status 'starting' or 'running' as failed
|
809
|
+
if they have not sent a heartbeat before `active_until`.
|
810
|
+
"""
|
811
|
+
sint_run_ids = [convert_uint64_to_sint64(run_id) for run_id in run_ids]
|
812
|
+
query = "UPDATE run SET finished_at = ?, sub_status = ?, details = ? "
|
813
|
+
query += "WHERE starting_at != '' AND finished_at = '' AND active_until < ?"
|
814
|
+
query += f" AND run_id IN ({','.join(['?'] * len(run_ids))});"
|
815
|
+
current = now()
|
816
|
+
self.query(
|
817
|
+
query,
|
818
|
+
(
|
819
|
+
current.isoformat(),
|
820
|
+
SubStatus.FAILED,
|
821
|
+
RUN_FAILURE_DETAILS_NO_HEARTBEAT,
|
822
|
+
current.timestamp(),
|
823
|
+
*sint_run_ids,
|
824
|
+
),
|
825
|
+
)
|
826
|
+
|
799
827
|
def get_run(self, run_id: int) -> Optional[Run]:
|
800
828
|
"""Retrieve information about the run with the specified `run_id`."""
|
829
|
+
# Check if runs are still active
|
830
|
+
self._check_and_tag_inactive_run(run_ids={run_id})
|
831
|
+
|
801
832
|
# Convert the uint64 value to sint64 for SQLite
|
802
833
|
sint64_run_id = convert_uint64_to_sint64(run_id)
|
803
834
|
query = "SELECT * FROM run WHERE run_id = ?;"
|
@@ -825,6 +856,9 @@ class SqliteLinkState(LinkState): # pylint: disable=R0904
|
|
825
856
|
|
826
857
|
def get_run_status(self, run_ids: set[int]) -> dict[int, RunStatus]:
|
827
858
|
"""Retrieve the statuses for the specified runs."""
|
859
|
+
# Check if runs are still active
|
860
|
+
self._check_and_tag_inactive_run(run_ids=run_ids)
|
861
|
+
|
828
862
|
# Convert the uint64 value to sint64 for SQLite
|
829
863
|
sint64_run_ids = (convert_uint64_to_sint64(run_id) for run_id in set(run_ids))
|
830
864
|
query = f"SELECT * FROM run WHERE run_id IN ({','.join(['?'] * len(run_ids))});"
|
@@ -842,6 +876,9 @@ class SqliteLinkState(LinkState): # pylint: disable=R0904
|
|
842
876
|
|
843
877
|
def update_run_status(self, run_id: int, new_status: RunStatus) -> bool:
|
844
878
|
"""Update the status of the run with the specified `run_id`."""
|
879
|
+
# Check if runs are still active
|
880
|
+
self._check_and_tag_inactive_run(run_ids={run_id})
|
881
|
+
|
845
882
|
# Convert the uint64 value to sint64 for SQLite
|
846
883
|
sint64_run_id = convert_uint64_to_sint64(run_id)
|
847
884
|
query = "SELECT * FROM run WHERE run_id = ?;"
|
@@ -879,9 +916,22 @@ class SqliteLinkState(LinkState): # pylint: disable=R0904
|
|
879
916
|
return False
|
880
917
|
|
881
918
|
# Update the status
|
882
|
-
query = "UPDATE run SET %s= ?, sub_status = ?, details =
|
919
|
+
query = "UPDATE run SET %s= ?, sub_status = ?, details = ?, "
|
920
|
+
query += "active_until = ?, heartbeat_interval = ? "
|
883
921
|
query += "WHERE run_id = ?;"
|
884
922
|
|
923
|
+
# Prepare data for query
|
924
|
+
# Initialize heartbeat_interval and active_until
|
925
|
+
# when switching to starting or running
|
926
|
+
current = now()
|
927
|
+
if new_status.status in (Status.STARTING, Status.RUNNING):
|
928
|
+
heartbeat_interval = HEARTBEAT_MAX_INTERVAL
|
929
|
+
active_until = current.timestamp() + heartbeat_interval
|
930
|
+
else:
|
931
|
+
heartbeat_interval = 0
|
932
|
+
active_until = 0
|
933
|
+
|
934
|
+
# Determine the timestamp field based on the new status
|
885
935
|
timestamp_fld = ""
|
886
936
|
if new_status.status == Status.STARTING:
|
887
937
|
timestamp_fld = "starting_at"
|
@@ -891,10 +941,12 @@ class SqliteLinkState(LinkState): # pylint: disable=R0904
|
|
891
941
|
timestamp_fld = "finished_at"
|
892
942
|
|
893
943
|
data = (
|
894
|
-
|
944
|
+
current.isoformat(),
|
895
945
|
new_status.sub_status,
|
896
946
|
new_status.details,
|
897
|
-
|
947
|
+
active_until,
|
948
|
+
heartbeat_interval,
|
949
|
+
convert_uint64_to_sint64(run_id),
|
898
950
|
)
|
899
951
|
self.query(query % timestamp_fld, data)
|
900
952
|
return True
|
@@ -957,6 +1009,44 @@ class SqliteLinkState(LinkState): # pylint: disable=R0904
|
|
957
1009
|
)
|
958
1010
|
return True
|
959
1011
|
|
1012
|
+
def acknowledge_app_heartbeat(self, run_id: int, heartbeat_interval: float) -> bool:
|
1013
|
+
"""Acknowledge a heartbeat received from a ServerApp for a given run.
|
1014
|
+
|
1015
|
+
A run with status `"running"` is considered alive as long as it sends heartbeats
|
1016
|
+
within the tolerated interval: HEARTBEAT_PATIENCE × heartbeat_interval.
|
1017
|
+
HEARTBEAT_PATIENCE = N allows for N-1 missed heartbeats before the run is
|
1018
|
+
marked as `"finished:failed"`.
|
1019
|
+
"""
|
1020
|
+
# Check if runs are still active
|
1021
|
+
self._check_and_tag_inactive_run(run_ids={run_id})
|
1022
|
+
|
1023
|
+
# Search for the run
|
1024
|
+
sint_run_id = convert_uint64_to_sint64(run_id)
|
1025
|
+
query = "SELECT * FROM run WHERE run_id = ?;"
|
1026
|
+
rows = self.query(query, (sint_run_id,))
|
1027
|
+
|
1028
|
+
if not rows:
|
1029
|
+
log(ERROR, "`run_id` is invalid")
|
1030
|
+
return False
|
1031
|
+
|
1032
|
+
# Check if the run is of status "running"/"starting"
|
1033
|
+
row = rows[0]
|
1034
|
+
status = determine_run_status(row)
|
1035
|
+
if status not in (Status.RUNNING, Status.STARTING):
|
1036
|
+
log(
|
1037
|
+
ERROR,
|
1038
|
+
'Cannot acknowledge heartbeat for run with status "%s"',
|
1039
|
+
status,
|
1040
|
+
)
|
1041
|
+
return False
|
1042
|
+
|
1043
|
+
# Update the `active_until` and `heartbeat_interval` for the given run
|
1044
|
+
active_until = now().timestamp() + HEARTBEAT_PATIENCE * heartbeat_interval
|
1045
|
+
query = "UPDATE run SET active_until = ?, heartbeat_interval = ? "
|
1046
|
+
query += "WHERE run_id = ?"
|
1047
|
+
self.query(query, (active_until, heartbeat_interval, sint_run_id))
|
1048
|
+
return True
|
1049
|
+
|
960
1050
|
def get_serverapp_context(self, run_id: int) -> Optional[Context]:
|
961
1051
|
"""Get the context for the specified `run_id`."""
|
962
1052
|
# Retrieve context if any
|
{flwr_nightly-1.19.0.dev20250511.dist-info → flwr_nightly-1.19.0.dev20250513.dist-info}/METADATA
RENAMED
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.3
|
2
2
|
Name: flwr-nightly
|
3
|
-
Version: 1.19.0.
|
3
|
+
Version: 1.19.0.dev20250513
|
4
4
|
Summary: Flower: A Friendly Federated AI Framework
|
5
5
|
License: Apache-2.0
|
6
6
|
Keywords: Artificial Intelligence,Federated AI,Federated Analytics,Federated Evaluation,Federated Learning,Flower,Machine Learning
|
{flwr_nightly-1.19.0.dev20250511.dist-info → flwr_nightly-1.19.0.dev20250513.dist-info}/RECORD
RENAMED
@@ -115,7 +115,7 @@ flwr/common/args.py,sha256=-aX_jVnSaDrJR2KZ8Wq0Y3dQHII4R4MJtJOIXzVUA0c,5417
|
|
115
115
|
flwr/common/auth_plugin/__init__.py,sha256=m271m9YjK2QfKDOuIIhcTvGmv1GWh1PL97QB05NTSHs,887
|
116
116
|
flwr/common/auth_plugin/auth_plugin.py,sha256=GaXw4IiU2DkVNkp5S9ue821sbkU9zWSu6HSVZetEdjs,3938
|
117
117
|
flwr/common/config.py,sha256=glcZDjco-amw1YfQcYTFJ4S1pt9APoexT-mf1QscuHs,13960
|
118
|
-
flwr/common/constant.py,sha256=
|
118
|
+
flwr/common/constant.py,sha256=Q8N-up1TvL_vllV_QA8mQlKjqVJ6Kdoze3iem6nSF9E,7375
|
119
119
|
flwr/common/context.py,sha256=Be8obQR_OvEDy1OmshuUKxGRQ7Qx89mf5F4xlhkR10s,2407
|
120
120
|
flwr/common/date.py,sha256=1ZT2cRSpC2DJqprOVTLXYCR_O2_OZR0zXO_brJ3LqWc,1554
|
121
121
|
flwr/common/differential_privacy.py,sha256=FdlpdpPl_H_2HJa8CQM1iCUGBBQ5Dc8CzxmHERM-EoE,6148
|
@@ -128,20 +128,20 @@ flwr/common/exit/exit.py,sha256=mJgbqMlVlwAgYtq-Vedj53wO4VxcDcy_P-GzqGK-1GQ,3452
|
|
128
128
|
flwr/common/exit/exit_code.py,sha256=PNEnCrZfOILjfDAFu5m-2YWEJBrk97xglq4zCUlqV7E,3470
|
129
129
|
flwr/common/exit_handlers.py,sha256=MEk5_savTLphn-6lW57UQlos-XrFA39XEBn-OF1vXXg,3174
|
130
130
|
flwr/common/grpc.py,sha256=manTaHaPiyYngUq1ErZvvV2B2GxlXUUUGRy3jc3TBIQ,9798
|
131
|
-
flwr/common/heartbeat.py,sha256=
|
132
|
-
flwr/common/inflatable.py,sha256=
|
131
|
+
flwr/common/heartbeat.py,sha256=nU0SNlL0A6t736ku7D7z7UUB1vXaX9QIn2fsJJPWeBU,4125
|
132
|
+
flwr/common/inflatable.py,sha256=o3PQejN8GNm_xlHKJLB1h2wQzOXKkDfaiy6z5o142pA,3681
|
133
133
|
flwr/common/logger.py,sha256=JbRf6E2vQxXzpDBq1T8IDUJo_usu3gjWEBPQ6uKcmdg,13049
|
134
134
|
flwr/common/message.py,sha256=znr205Erq2hkxwFbvNNCsQTRS2UKv_Qsyu0sFNEhEAw,23721
|
135
135
|
flwr/common/object_ref.py,sha256=p3SfTeqo3Aj16SkB-vsnNn01zswOPdGNBitcbRnqmUk,9134
|
136
136
|
flwr/common/parameter.py,sha256=UVw6sOgehEFhFs4uUCMl2kfVq1PD6ncmWgPLMsZPKPE,2095
|
137
137
|
flwr/common/pyproject.py,sha256=2SU6yJW7059SbMXgzjOdK1GZRWO6AixDH7BmdxbMvHI,1386
|
138
138
|
flwr/common/record/__init__.py,sha256=cNGccdDoxttqgnUgyKRIqLWULjW-NaSmOufVxtXq-sw,1197
|
139
|
-
flwr/common/record/array.py,sha256=
|
140
|
-
flwr/common/record/arrayrecord.py,sha256
|
141
|
-
flwr/common/record/configrecord.py,sha256=
|
139
|
+
flwr/common/record/array.py,sha256=7c5peTvVsnhXQVvYOIFqm2UowGO-Uu0_44EeqybU0HY,10850
|
140
|
+
flwr/common/record/arrayrecord.py,sha256=-9iIfBKtCS-fGmh5NLTDS7GTHqfHA3o1MguPyyybAyI,16787
|
141
|
+
flwr/common/record/configrecord.py,sha256=2BqauemtQBsZDWLtkTZp51_4J7L9-8i5LrM9RV8nmNQ,9664
|
142
142
|
flwr/common/record/conversion_utils.py,sha256=wbNCzy7oAqaA3-arhls_EqRZYXRC4YrWIoE-Gy82fJ0,1191
|
143
|
-
flwr/common/record/metricrecord.py,sha256=
|
144
|
-
flwr/common/record/recorddict.py,sha256=
|
143
|
+
flwr/common/record/metricrecord.py,sha256=PGeYISykqf_NSyij-qXdf85Pfnder7Rw9CagIolBe30,8839
|
144
|
+
flwr/common/record/recorddict.py,sha256=qnRET_r6_7o7JSQQO2PnMAKezA4yZEy0frsfy8CKxb8,14986
|
145
145
|
flwr/common/record/typeddict.py,sha256=dDKgUThs2BscYUNcgP82KP8-qfAYXYftDrf2LszAC_o,3599
|
146
146
|
flwr/common/recorddict_compat.py,sha256=Znn1xRGiqLpPPgviVqyb-GPTM-pCK6tpnEmhWSXafy8,14119
|
147
147
|
flwr/common/retry_invoker.py,sha256=T6puUH3nCxdRzQHeanyr-0nTxhRiS1TH07rmef9vuLQ,14482
|
@@ -293,10 +293,10 @@ flwr/server/superlink/fleet/vce/backend/backend.py,sha256=-wDHjgAy5mrfEgXj0GxkJI
|
|
293
293
|
flwr/server/superlink/fleet/vce/backend/raybackend.py,sha256=Hx9hxL7lju1_VJoAwkhBOGerZ3628u0P1zgkPhGWRPY,7154
|
294
294
|
flwr/server/superlink/fleet/vce/vce_api.py,sha256=m7WUiHRl-jTqzjH3cqNCj3RXe3ohT6V6I0JIR6zWZj8,12780
|
295
295
|
flwr/server/superlink/linkstate/__init__.py,sha256=OtsgvDTnZLU3k0sUbkHbqoVwW6ql2FDmb6uT6DbNkZo,1064
|
296
|
-
flwr/server/superlink/linkstate/in_memory_linkstate.py,sha256=
|
297
|
-
flwr/server/superlink/linkstate/linkstate.py,sha256=
|
296
|
+
flwr/server/superlink/linkstate/in_memory_linkstate.py,sha256=vvoOWjYlmOlbakH7AzpMh0jB70Qxx7UTlAGqjcA8ctM,25926
|
297
|
+
flwr/server/superlink/linkstate/linkstate.py,sha256=j6nW351t07VrBhFqjO34z8tf2PuKOE9aCX9SqpW96pQ,13100
|
298
298
|
flwr/server/superlink/linkstate/linkstate_factory.py,sha256=8RlosqSpKOoD_vhUUQPY0jtE3A84GeF96Z7sWNkRRcA,2069
|
299
|
-
flwr/server/superlink/linkstate/sqlite_linkstate.py,sha256=
|
299
|
+
flwr/server/superlink/linkstate/sqlite_linkstate.py,sha256=E43YO88vdnG9GW6Rwh9Fb7oWGgEABS9RXDRg3OR3T4Q,43573
|
300
300
|
flwr/server/superlink/linkstate/utils.py,sha256=AJs9jTAEK7JnjF2AODXnOfy0pKAKpe6oUWPCanAP57s,15382
|
301
301
|
flwr/server/superlink/serverappio/__init__.py,sha256=Fy4zJuoccZe5mZSEIpOmQvU6YeXFBa1M4eZuXXmJcn8,717
|
302
302
|
flwr/server/superlink/serverappio/serverappio_grpc.py,sha256=opJ6SYwIAbu4NWEo3K-VxFO-tMSFmE4H3i2HwHIVRzw,2173
|
@@ -333,7 +333,7 @@ flwr/superexec/exec_servicer.py,sha256=Z0YYfs6eNPhqn8rY0x_R04XgR2mKFpggt07IH0EhU
|
|
333
333
|
flwr/superexec/exec_user_auth_interceptor.py,sha256=iqygALkOMBUu_s_R9G0mFThZA7HTUzuXCLgxLCefiwI,4440
|
334
334
|
flwr/superexec/executor.py,sha256=M5ucqSE53jfRtuCNf59WFLqQvA1Mln4741TySeZE7qQ,3112
|
335
335
|
flwr/superexec/simulation.py,sha256=j6YwUvBN7EQ09ID7MYOCVZ70PGbuyBy8f9bXU0EszEM,4088
|
336
|
-
flwr_nightly-1.19.0.
|
337
|
-
flwr_nightly-1.19.0.
|
338
|
-
flwr_nightly-1.19.0.
|
339
|
-
flwr_nightly-1.19.0.
|
336
|
+
flwr_nightly-1.19.0.dev20250513.dist-info/METADATA,sha256=4dlTZbixbfPDUcDbK3OBZ3JwaIFV553qWtAcY4d6zpQ,15910
|
337
|
+
flwr_nightly-1.19.0.dev20250513.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
|
338
|
+
flwr_nightly-1.19.0.dev20250513.dist-info/entry_points.txt,sha256=2-1L-GNKhwGw2_7_RoH55vHw2SIHjdAQy3HAVAWl9PY,374
|
339
|
+
flwr_nightly-1.19.0.dev20250513.dist-info/RECORD,,
|
{flwr_nightly-1.19.0.dev20250511.dist-info → flwr_nightly-1.19.0.dev20250513.dist-info}/WHEEL
RENAMED
File without changes
|
File without changes
|