flwr-nightly 1.19.0.dev20250511__py3-none-any.whl → 1.19.0.dev20250513__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
flwr/common/constant.py CHANGED
@@ -62,6 +62,7 @@ HEARTBEAT_BASE_MULTIPLIER = 0.8
62
62
  HEARTBEAT_RANDOM_RANGE = (-0.1, 0.1)
63
63
  HEARTBEAT_MAX_INTERVAL = 1e300
64
64
  HEARTBEAT_PATIENCE = 2
65
+ RUN_FAILURE_DETAILS_NO_HEARTBEAT = "No heartbeat received from the run."
65
66
 
66
67
  # IDs
67
68
  RUN_ID_NUM_BYTES = 8
flwr/common/heartbeat.py CHANGED
@@ -77,6 +77,11 @@ class HeartbeatSender:
77
77
  self._stop_event.set()
78
78
  self._thread.join()
79
79
 
80
+ @property
81
+ def is_running(self) -> bool:
82
+ """Return True if the heartbeat sender is running, False otherwise."""
83
+ return self._thread.is_alive() and not self._stop_event.is_set()
84
+
80
85
  def _run(self) -> None:
81
86
  """Periodically send heartbeats until stopped."""
82
87
  while not self._stop_event.is_set():
flwr/common/inflatable.py CHANGED
@@ -15,6 +15,8 @@
15
15
  """InflatableObject base class."""
16
16
 
17
17
 
18
+ from __future__ import annotations
19
+
18
20
  import hashlib
19
21
  from typing import TypeVar
20
22
 
@@ -31,7 +33,9 @@ class InflatableObject:
31
33
  raise NotImplementedError()
32
34
 
33
35
  @classmethod
34
- def inflate(cls, object_content: bytes) -> "InflatableObject":
36
+ def inflate(
37
+ cls, object_content: bytes, children: dict[str, InflatableObject] | None = None
38
+ ) -> InflatableObject:
35
39
  """Inflate the object from bytes.
36
40
 
37
41
  Parameters
@@ -39,6 +43,10 @@ class InflatableObject:
39
43
  object_content : bytes
40
44
  The deflated object content.
41
45
 
46
+ children : Optional[dict[str, InflatableObject]] (default: None)
47
+ Dictionary of children InflatableObjects mapped to their object IDs. These
48
+ children enable the full inflation of the parent InflatableObject.
49
+
42
50
  Returns
43
51
  -------
44
52
  InflatableObject
@@ -51,6 +59,11 @@ class InflatableObject:
51
59
  """Get object_id."""
52
60
  return get_object_id(self.deflate())
53
61
 
62
+ @property
63
+ def children(self) -> dict[str, InflatableObject] | None:
64
+ """Get all child objects as a dictionary or None if there are no children."""
65
+ return None
66
+
54
67
 
55
68
  def get_object_id(object_content: bytes) -> str:
56
69
  """Return a SHA-256 hash of the (deflated) object content."""
@@ -265,7 +265,9 @@ class Array(InflatableObject):
265
265
  return add_header_to_object_body(object_body=obj_body, cls=self)
266
266
 
267
267
  @classmethod
268
- def inflate(cls, object_content: bytes) -> Array:
268
+ def inflate(
269
+ cls, object_content: bytes, children: dict[str, InflatableObject] | None = None
270
+ ) -> Array:
269
271
  """Inflate an Array from bytes.
270
272
 
271
273
  Parameters
@@ -273,11 +275,18 @@ class Array(InflatableObject):
273
275
  object_content : bytes
274
276
  The deflated object content of the Array.
275
277
 
278
+ children : Optional[dict[str, InflatableObject]] (default: None)
279
+ Must be ``None``. ``Array`` does not support child objects.
280
+ Providing any children will raise a ``ValueError``.
281
+
276
282
  Returns
277
283
  -------
278
284
  Array
279
285
  The inflated Array.
280
286
  """
287
+ if children is not None:
288
+ raise ValueError("`Array` objects do not have children.")
289
+
281
290
  obj_body = get_object_body(object_content, cls)
282
291
  proto_array = ArrayProto.FromString(obj_body)
283
292
  return cls(
@@ -18,6 +18,7 @@
18
18
  from __future__ import annotations
19
19
 
20
20
  import gc
21
+ import json
21
22
  import sys
22
23
  from collections import OrderedDict
23
24
  from logging import WARN
@@ -26,6 +27,7 @@ from typing import TYPE_CHECKING, Any, cast, overload
26
27
  import numpy as np
27
28
 
28
29
  from ..constant import GC_THRESHOLD
30
+ from ..inflatable import InflatableObject, add_header_to_object_body, get_object_body
29
31
  from ..logger import log
30
32
  from ..typing import NDArray
31
33
  from .array import Array
@@ -56,7 +58,7 @@ def _check_value(value: Array) -> None:
56
58
  )
57
59
 
58
60
 
59
- class ArrayRecord(TypedDict[str, Array]):
61
+ class ArrayRecord(TypedDict[str, Array], InflatableObject):
60
62
  """Array record.
61
63
 
62
64
  A typed dictionary (``str`` to :class:`Array`) that can store named arrays,
@@ -365,6 +367,68 @@ class ArrayRecord(TypedDict[str, Array]):
365
367
 
366
368
  return num_bytes
367
369
 
370
+ @property
371
+ def children(self) -> dict[str, InflatableObject]:
372
+ """Return a dictionary of Arrays with their Object IDs as keys."""
373
+ return {arr.object_id: arr for arr in self.values()}
374
+
375
+ def deflate(self) -> bytes:
376
+ """Deflate the ArrayRecord."""
377
+ # array_name: array_object_id mapping
378
+ array_refs: dict[str, str] = {}
379
+
380
+ for array_name, array in self.items():
381
+ array_refs[array_name] = array.object_id
382
+
383
+ # Serialize references dict
384
+ object_body = json.dumps(array_refs).encode("utf-8")
385
+ return add_header_to_object_body(object_body=object_body, cls=self)
386
+
387
+ @classmethod
388
+ def inflate(
389
+ cls, object_content: bytes, children: dict[str, InflatableObject] | None = None
390
+ ) -> ArrayRecord:
391
+ """Inflate an ArrayRecord from bytes.
392
+
393
+ Parameters
394
+ ----------
395
+ object_content : bytes
396
+ The deflated object content of the ArrayRecord.
397
+ children : Optional[dict[str, InflatableObject]] (default: None)
398
+ Dictionary of children InflatableObjects mapped to their Object IDs.
399
+ These children enable the full inflation of the ArrayRecord.
400
+
401
+ Returns
402
+ -------
403
+ ArrayRecord
404
+ The inflated ArrayRecord.
405
+ """
406
+ if children is None:
407
+ children = {}
408
+
409
+ # Inflate mapping of array_names (keys in the ArrayRecord) to Arrays' object IDs
410
+ obj_body = get_object_body(object_content, cls)
411
+ array_refs: dict[str, str] = json.loads(obj_body.decode(encoding="utf-8"))
412
+
413
+ unique_arrays = set(array_refs.values())
414
+ children_obj_ids = set(children.keys())
415
+ if unique_arrays != children_obj_ids:
416
+ raise ValueError(
417
+ "Unexpected set of `children`. "
418
+ f"Expected {unique_arrays} but got {children_obj_ids}."
419
+ )
420
+
421
+ # Ensure children are of type Array
422
+ if not all(isinstance(arr, Array) for arr in children.values()):
423
+ raise ValueError("`Children` are expected to be of type `Array`.")
424
+
425
+ # Instantiate new ArrayRecord
426
+ return ArrayRecord(
427
+ OrderedDict(
428
+ {name: children[object_id] for name, object_id in array_refs.items()}
429
+ )
430
+ )
431
+
368
432
 
369
433
  class ParametersRecord(ArrayRecord):
370
434
  """Deprecated class ``ParametersRecord``, use ``ArrayRecord`` instead.
@@ -185,7 +185,9 @@ class ConfigRecord(TypedDict[str, ConfigRecordValues], InflatableObject):
185
185
  return add_header_to_object_body(object_body=obj_body, cls=self)
186
186
 
187
187
  @classmethod
188
- def inflate(cls, object_content: bytes) -> ConfigRecord:
188
+ def inflate(
189
+ cls, object_content: bytes, children: dict[str, InflatableObject] | None = None
190
+ ) -> ConfigRecord:
189
191
  """Inflate a ConfigRecord from bytes.
190
192
 
191
193
  Parameters
@@ -193,11 +195,18 @@ class ConfigRecord(TypedDict[str, ConfigRecordValues], InflatableObject):
193
195
  object_content : bytes
194
196
  The deflated object content of the ConfigRecord.
195
197
 
198
+ children : Optional[dict[str, InflatableObject]] (default: None)
199
+ Must be ``None``. ``ConfigRecord`` does not support child objects.
200
+ Providing any children will raise a ``ValueError``.
201
+
196
202
  Returns
197
203
  -------
198
204
  ConfigRecord
199
205
  The inflated ConfigRecord.
200
206
  """
207
+ if children is not None:
208
+ raise ValueError("`ConfigRecord` objects do not have children.")
209
+
201
210
  obj_body = get_object_body(object_content, cls)
202
211
  config_record_proto = ProtoConfigRecord.FromString(obj_body)
203
212
 
@@ -160,7 +160,9 @@ class MetricRecord(TypedDict[str, MetricRecordValues], InflatableObject):
160
160
  return add_header_to_object_body(object_body=obj_body, cls=self)
161
161
 
162
162
  @classmethod
163
- def inflate(cls, object_content: bytes) -> MetricRecord:
163
+ def inflate(
164
+ cls, object_content: bytes, children: dict[str, InflatableObject] | None = None
165
+ ) -> MetricRecord:
164
166
  """Inflate a MetricRecord from bytes.
165
167
 
166
168
  Parameters
@@ -168,11 +170,18 @@ class MetricRecord(TypedDict[str, MetricRecordValues], InflatableObject):
168
170
  object_content : bytes
169
171
  The deflated object content of the MetricRecord.
170
172
 
173
+ children : Optional[dict[str, InflatableObject]] (default: None)
174
+ Must be ``None``. ``MetricRecord`` does not support child objects.
175
+ Providing any children will raise a ``ValueError``.
176
+
171
177
  Returns
172
178
  -------
173
179
  MetricRecord
174
180
  The inflated MetricRecord.
175
181
  """
182
+ if children is not None:
183
+ raise ValueError("`MetricRecord` objects do not have children.")
184
+
176
185
  obj_body = get_object_body(object_content, cls)
177
186
  metric_record_proto = ProtoMetricRecord.FromString(obj_body)
178
187
 
@@ -17,10 +17,12 @@
17
17
 
18
18
  from __future__ import annotations
19
19
 
20
+ import json
20
21
  from logging import WARN
21
22
  from textwrap import indent
22
23
  from typing import TypeVar, Union, cast
23
24
 
25
+ from ..inflatable import InflatableObject, add_header_to_object_body, get_object_body
24
26
  from ..logger import log
25
27
  from .arrayrecord import ArrayRecord
26
28
  from .configrecord import ConfigRecord
@@ -97,7 +99,7 @@ class _SyncedDict(TypedDict[str, T]):
97
99
  )
98
100
 
99
101
 
100
- class RecordDict(TypedDict[str, RecordType]):
102
+ class RecordDict(TypedDict[str, RecordType], InflatableObject):
101
103
  """RecordDict stores groups of arrays, metrics and configs.
102
104
 
103
105
  A :class:`RecordDict` is the unified mechanism by which arrays,
@@ -286,6 +288,72 @@ class RecordDict(TypedDict[str, RecordType]):
286
288
  )
287
289
  return self.config_records
288
290
 
291
+ @property
292
+ def children(self) -> dict[str, InflatableObject]:
293
+ """Return a dictionary of records with their Object IDs as keys."""
294
+ return {record.object_id: record for record in self.values()}
295
+
296
+ def deflate(self) -> bytes:
297
+ """Deflate the RecordDict."""
298
+ # record_name: record_object_id mapping
299
+ record_refs: dict[str, str] = {}
300
+
301
+ for record_name, record in self.items():
302
+ record_refs[record_name] = record.object_id
303
+
304
+ # Serialize references dict
305
+ object_body = json.dumps(record_refs).encode("utf-8")
306
+ return add_header_to_object_body(object_body=object_body, cls=self)
307
+
308
+ @classmethod
309
+ def inflate(
310
+ cls, object_content: bytes, children: dict[str, InflatableObject] | None = None
311
+ ) -> RecordDict:
312
+ """Inflate a RecordDict from bytes.
313
+
314
+ Parameters
315
+ ----------
316
+ object_content : bytes
317
+ The deflated object content of the RecordDict.
318
+ children : Optional[dict[str, InflatableObject]] (default: None)
319
+ Dictionary of children InflatableObjects mapped to their Object IDs.
320
+ These children enable the full inflation of the RecordDict. Default is None.
321
+
322
+ Returns
323
+ -------
324
+ RecordDict
325
+ The inflated RecordDict.
326
+ """
327
+ if children is None:
328
+ children = {}
329
+
330
+ # Inflate mapping of record_names (keys in the RecordDict) to Records' object IDs
331
+ obj_body = get_object_body(object_content, cls)
332
+ record_refs: dict[str, str] = json.loads(obj_body.decode(encoding="utf-8"))
333
+
334
+ unique_records = set(record_refs.values())
335
+ children_obj_ids = set(children.keys())
336
+ if unique_records != children_obj_ids:
337
+ raise ValueError(
338
+ "Unexpected set of `children`. "
339
+ f"Expected {unique_records} but got {children_obj_ids}."
340
+ )
341
+
342
+ # Ensure children are one of the *Record objects expected in a RecordDict
343
+ if not all(
344
+ isinstance(ch, (ArrayRecord, ConfigRecord, MetricRecord))
345
+ for ch in children.values()
346
+ ):
347
+ raise ValueError(
348
+ "`Children` are expected to be of type `ArrayRecord`, "
349
+ "`ConfigRecord` or `MetricRecord`."
350
+ )
351
+
352
+ # Instantiate new RecordDict
353
+ return RecordDict(
354
+ {name: children[object_id] for name, object_id in record_refs.items()} # type: ignore
355
+ )
356
+
289
357
 
290
358
  class RecordSet(RecordDict):
291
359
  """Deprecated class ``RecordSet``, use ``RecordDict`` instead.
@@ -25,12 +25,15 @@ from uuid import UUID, uuid4
25
25
 
26
26
  from flwr.common import Context, Message, log, now
27
27
  from flwr.common.constant import (
28
+ HEARTBEAT_MAX_INTERVAL,
28
29
  HEARTBEAT_PATIENCE,
29
30
  MESSAGE_TTL_TOLERANCE,
30
31
  NODE_ID_NUM_BYTES,
32
+ RUN_FAILURE_DETAILS_NO_HEARTBEAT,
31
33
  RUN_ID_NUM_BYTES,
32
34
  SUPERLINK_NODE_ID,
33
35
  Status,
36
+ SubStatus,
34
37
  )
35
38
  from flwr.common.record import ConfigRecord
36
39
  from flwr.common.typing import Run, RunStatus, UserConfig
@@ -52,8 +55,11 @@ class RunRecord: # pylint: disable=R0902
52
55
  """The record of a specific run, including its status and timestamps."""
53
56
 
54
57
  run: Run
58
+ active_until: float = 0.0
59
+ heartbeat_interval: float = 0.0
55
60
  logs: list[tuple[float, str]] = field(default_factory=list)
56
61
  log_lock: threading.Lock = field(default_factory=threading.Lock)
62
+ lock: threading.RLock = field(default_factory=threading.RLock)
57
63
 
58
64
 
59
65
  class InMemoryLinkState(LinkState): # pylint: disable=R0902,R0904
@@ -461,8 +467,29 @@ class InMemoryLinkState(LinkState): # pylint: disable=R0902,R0904
461
467
  with self.lock:
462
468
  return set(self.run_ids.keys())
463
469
 
470
+ def _check_and_tag_inactive_run(self, run_ids: set[int]) -> None:
471
+ """Check if any runs are no longer active.
472
+
473
+ Marks runs with status 'starting' or 'running' as failed
474
+ if they have not sent a heartbeat before `active_until`.
475
+ """
476
+ current = now()
477
+ for record in [self.run_ids[run_id] for run_id in run_ids]:
478
+ with record.lock:
479
+ if record.run.status.status in (Status.STARTING, Status.RUNNING):
480
+ if record.active_until < current.timestamp():
481
+ record.run.status = RunStatus(
482
+ status=Status.FINISHED,
483
+ sub_status=SubStatus.FAILED,
484
+ details=RUN_FAILURE_DETAILS_NO_HEARTBEAT,
485
+ )
486
+ record.run.finished_at = now().isoformat()
487
+
464
488
  def get_run(self, run_id: int) -> Optional[Run]:
465
489
  """Retrieve information about the run with the specified `run_id`."""
490
+ # Check if runs are still active
491
+ self._check_and_tag_inactive_run(run_ids={run_id})
492
+
466
493
  with self.lock:
467
494
  if run_id not in self.run_ids:
468
495
  log(ERROR, "`run_id` is invalid")
@@ -471,6 +498,9 @@ class InMemoryLinkState(LinkState): # pylint: disable=R0902,R0904
471
498
 
472
499
  def get_run_status(self, run_ids: set[int]) -> dict[int, RunStatus]:
473
500
  """Retrieve the statuses for the specified runs."""
501
+ # Check if runs are still active
502
+ self._check_and_tag_inactive_run(run_ids=run_ids)
503
+
474
504
  with self.lock:
475
505
  return {
476
506
  run_id: self.run_ids[run_id].run.status
@@ -480,12 +510,16 @@ class InMemoryLinkState(LinkState): # pylint: disable=R0902,R0904
480
510
 
481
511
  def update_run_status(self, run_id: int, new_status: RunStatus) -> bool:
482
512
  """Update the status of the run with the specified `run_id`."""
513
+ # Check if runs are still active
514
+ self._check_and_tag_inactive_run(run_ids={run_id})
515
+
483
516
  with self.lock:
484
517
  # Check if the run_id exists
485
518
  if run_id not in self.run_ids:
486
519
  log(ERROR, "`run_id` is invalid")
487
520
  return False
488
521
 
522
+ with self.run_ids[run_id].lock:
489
523
  # Check if the status transition is valid
490
524
  current_status = self.run_ids[run_id].run.status
491
525
  if not is_valid_transition(current_status, new_status):
@@ -507,14 +541,23 @@ class InMemoryLinkState(LinkState): # pylint: disable=R0902,R0904
507
541
  )
508
542
  return False
509
543
 
510
- # Update the status
544
+ # Initialize heartbeat_interval and active_until
545
+ # when switching to starting or running
546
+ current = now()
511
547
  run_record = self.run_ids[run_id]
548
+ if new_status.status in (Status.STARTING, Status.RUNNING):
549
+ run_record.heartbeat_interval = HEARTBEAT_MAX_INTERVAL
550
+ run_record.active_until = (
551
+ current.timestamp() + run_record.heartbeat_interval
552
+ )
553
+
554
+ # Update the run status
512
555
  if new_status.status == Status.STARTING:
513
- run_record.run.starting_at = now().isoformat()
556
+ run_record.run.starting_at = current.isoformat()
514
557
  elif new_status.status == Status.RUNNING:
515
- run_record.run.running_at = now().isoformat()
558
+ run_record.run.running_at = current.isoformat()
516
559
  elif new_status.status == Status.FINISHED:
517
- run_record.run.finished_at = now().isoformat()
560
+ run_record.run.finished_at = current.isoformat()
518
561
  run_record.run.status = new_status
519
562
  return True
520
563
 
@@ -558,6 +601,43 @@ class InMemoryLinkState(LinkState): # pylint: disable=R0902,R0904
558
601
  return True
559
602
  return False
560
603
 
604
+ def acknowledge_app_heartbeat(self, run_id: int, heartbeat_interval: float) -> bool:
605
+ """Acknowledge a heartbeat received from a ServerApp for a given run.
606
+
607
+ A run with status `"running"` is considered alive as long as it sends heartbeats
608
+ within the tolerated interval: HEARTBEAT_PATIENCE × heartbeat_interval.
609
+ HEARTBEAT_PATIENCE = N allows for N-1 missed heartbeats before the run is
610
+ marked as `"finished:failed"`.
611
+ """
612
+ with self.lock:
613
+ # Search for the run
614
+ record = self.run_ids.get(run_id)
615
+
616
+ # Check if the run_id exists
617
+ if record is None:
618
+ log(ERROR, "`run_id` is invalid")
619
+ return False
620
+
621
+ with record.lock:
622
+ # Check if runs are still active
623
+ self._check_and_tag_inactive_run(run_ids={run_id})
624
+
625
+ # Check if the run is of status "running"/"starting"
626
+ current_status = record.run.status
627
+ if current_status.status not in (Status.RUNNING, Status.STARTING):
628
+ log(
629
+ ERROR,
630
+ 'Cannot acknowledge heartbeat for run with status "%s"',
631
+ current_status.status,
632
+ )
633
+ return False
634
+
635
+ # Update the `active_until` and `heartbeat_interval` for the given run
636
+ current = now().timestamp()
637
+ record.active_until = current + HEARTBEAT_PATIENCE * heartbeat_interval
638
+ record.heartbeat_interval = heartbeat_interval
639
+ return True
640
+
561
641
  def get_serverapp_context(self, run_id: int) -> Optional[Context]:
562
642
  """Get the context for the specified `run_id`."""
563
643
  return self.contexts.get(run_id)
@@ -292,6 +292,29 @@ class LinkState(abc.ABC): # pylint: disable=R0904
292
292
  True if the heartbeat is successfully acknowledged; otherwise, False.
293
293
  """
294
294
 
295
+ @abc.abstractmethod
296
+ def acknowledge_app_heartbeat(self, run_id: int, heartbeat_interval: float) -> bool:
297
+ """Acknowledge a heartbeat received from a ServerApp for a given run.
298
+
299
+ A run with status `"running"` is considered alive as long as it sends heartbeats
300
+ within the tolerated interval: HEARTBEAT_PATIENCE × heartbeat_interval.
301
+ HEARTBEAT_PATIENCE = N allows for N-1 missed heartbeats before the run is
302
+ marked as `"finished:failed"`.
303
+
304
+ Parameters
305
+ ----------
306
+ run_id : int
307
+ The `run_id` from which the heartbeat was received.
308
+ heartbeat_interval : float
309
+ The interval (in seconds) from the current timestamp within which the next
310
+ heartbeat from the ServerApp for this run must be received.
311
+
312
+ Returns
313
+ -------
314
+ is_acknowledged : bool
315
+ True if the heartbeat is successfully acknowledged; otherwise, False.
316
+ """
317
+
295
318
  @abc.abstractmethod
296
319
  def get_serverapp_context(self, run_id: int) -> Optional[Context]:
297
320
  """Get the context for the specified `run_id`.
@@ -28,12 +28,15 @@ from uuid import UUID, uuid4
28
28
 
29
29
  from flwr.common import Context, Message, Metadata, log, now
30
30
  from flwr.common.constant import (
31
+ HEARTBEAT_MAX_INTERVAL,
31
32
  HEARTBEAT_PATIENCE,
32
33
  MESSAGE_TTL_TOLERANCE,
33
34
  NODE_ID_NUM_BYTES,
35
+ RUN_FAILURE_DETAILS_NO_HEARTBEAT,
34
36
  RUN_ID_NUM_BYTES,
35
37
  SUPERLINK_NODE_ID,
36
38
  Status,
39
+ SubStatus,
37
40
  )
38
41
  from flwr.common.message import make_message
39
42
  from flwr.common.record import ConfigRecord
@@ -92,6 +95,8 @@ CREATE INDEX IF NOT EXISTS idx_online_until ON node (online_until);
92
95
  SQL_CREATE_TABLE_RUN = """
93
96
  CREATE TABLE IF NOT EXISTS run(
94
97
  run_id INTEGER UNIQUE,
98
+ active_until REAL,
99
+ heartbeat_interval REAL,
95
100
  fab_id TEXT,
96
101
  fab_version TEXT,
97
102
  fab_hash TEXT,
@@ -742,20 +747,21 @@ class SqliteLinkState(LinkState): # pylint: disable=R0904
742
747
  if self.query(query, (sint64_run_id,))[0]["COUNT(*)"] == 0:
743
748
  query = (
744
749
  "INSERT INTO run "
745
- "(run_id, fab_id, fab_version, fab_hash, override_config, "
746
- "federation_options, pending_at, starting_at, running_at, finished_at, "
747
- "sub_status, details) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?);"
750
+ "(run_id, active_until, heartbeat_interval, fab_id, fab_version, "
751
+ "fab_hash, override_config, federation_options, pending_at, "
752
+ "starting_at, running_at, finished_at, sub_status, details) "
753
+ "VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?);"
748
754
  )
749
755
  override_config_json = json.dumps(override_config)
750
756
  data = [
751
757
  sint64_run_id,
758
+ 0, # The `active_until` is not used until the run is started
759
+ 0, # This `heartbeat_interval` is not used until the run is started
752
760
  fab_id,
753
761
  fab_version,
754
762
  fab_hash,
755
763
  override_config_json,
756
764
  configrecord_to_bytes(federation_options),
757
- ]
758
- data += [
759
765
  now().isoformat(),
760
766
  "",
761
767
  "",
@@ -796,8 +802,33 @@ class SqliteLinkState(LinkState): # pylint: disable=R0904
796
802
  rows = self.query(query)
797
803
  return {convert_sint64_to_uint64(row["run_id"]) for row in rows}
798
804
 
805
+ def _check_and_tag_inactive_run(self, run_ids: set[int]) -> None:
806
+ """Check if any runs are no longer active.
807
+
808
+ Marks runs with status 'starting' or 'running' as failed
809
+ if they have not sent a heartbeat before `active_until`.
810
+ """
811
+ sint_run_ids = [convert_uint64_to_sint64(run_id) for run_id in run_ids]
812
+ query = "UPDATE run SET finished_at = ?, sub_status = ?, details = ? "
813
+ query += "WHERE starting_at != '' AND finished_at = '' AND active_until < ?"
814
+ query += f" AND run_id IN ({','.join(['?'] * len(run_ids))});"
815
+ current = now()
816
+ self.query(
817
+ query,
818
+ (
819
+ current.isoformat(),
820
+ SubStatus.FAILED,
821
+ RUN_FAILURE_DETAILS_NO_HEARTBEAT,
822
+ current.timestamp(),
823
+ *sint_run_ids,
824
+ ),
825
+ )
826
+
799
827
  def get_run(self, run_id: int) -> Optional[Run]:
800
828
  """Retrieve information about the run with the specified `run_id`."""
829
+ # Check if runs are still active
830
+ self._check_and_tag_inactive_run(run_ids={run_id})
831
+
801
832
  # Convert the uint64 value to sint64 for SQLite
802
833
  sint64_run_id = convert_uint64_to_sint64(run_id)
803
834
  query = "SELECT * FROM run WHERE run_id = ?;"
@@ -825,6 +856,9 @@ class SqliteLinkState(LinkState): # pylint: disable=R0904
825
856
 
826
857
  def get_run_status(self, run_ids: set[int]) -> dict[int, RunStatus]:
827
858
  """Retrieve the statuses for the specified runs."""
859
+ # Check if runs are still active
860
+ self._check_and_tag_inactive_run(run_ids=run_ids)
861
+
828
862
  # Convert the uint64 value to sint64 for SQLite
829
863
  sint64_run_ids = (convert_uint64_to_sint64(run_id) for run_id in set(run_ids))
830
864
  query = f"SELECT * FROM run WHERE run_id IN ({','.join(['?'] * len(run_ids))});"
@@ -842,6 +876,9 @@ class SqliteLinkState(LinkState): # pylint: disable=R0904
842
876
 
843
877
  def update_run_status(self, run_id: int, new_status: RunStatus) -> bool:
844
878
  """Update the status of the run with the specified `run_id`."""
879
+ # Check if runs are still active
880
+ self._check_and_tag_inactive_run(run_ids={run_id})
881
+
845
882
  # Convert the uint64 value to sint64 for SQLite
846
883
  sint64_run_id = convert_uint64_to_sint64(run_id)
847
884
  query = "SELECT * FROM run WHERE run_id = ?;"
@@ -879,9 +916,22 @@ class SqliteLinkState(LinkState): # pylint: disable=R0904
879
916
  return False
880
917
 
881
918
  # Update the status
882
- query = "UPDATE run SET %s= ?, sub_status = ?, details = ? "
919
+ query = "UPDATE run SET %s= ?, sub_status = ?, details = ?, "
920
+ query += "active_until = ?, heartbeat_interval = ? "
883
921
  query += "WHERE run_id = ?;"
884
922
 
923
+ # Prepare data for query
924
+ # Initialize heartbeat_interval and active_until
925
+ # when switching to starting or running
926
+ current = now()
927
+ if new_status.status in (Status.STARTING, Status.RUNNING):
928
+ heartbeat_interval = HEARTBEAT_MAX_INTERVAL
929
+ active_until = current.timestamp() + heartbeat_interval
930
+ else:
931
+ heartbeat_interval = 0
932
+ active_until = 0
933
+
934
+ # Determine the timestamp field based on the new status
885
935
  timestamp_fld = ""
886
936
  if new_status.status == Status.STARTING:
887
937
  timestamp_fld = "starting_at"
@@ -891,10 +941,12 @@ class SqliteLinkState(LinkState): # pylint: disable=R0904
891
941
  timestamp_fld = "finished_at"
892
942
 
893
943
  data = (
894
- now().isoformat(),
944
+ current.isoformat(),
895
945
  new_status.sub_status,
896
946
  new_status.details,
897
- sint64_run_id,
947
+ active_until,
948
+ heartbeat_interval,
949
+ convert_uint64_to_sint64(run_id),
898
950
  )
899
951
  self.query(query % timestamp_fld, data)
900
952
  return True
@@ -957,6 +1009,44 @@ class SqliteLinkState(LinkState): # pylint: disable=R0904
957
1009
  )
958
1010
  return True
959
1011
 
1012
+ def acknowledge_app_heartbeat(self, run_id: int, heartbeat_interval: float) -> bool:
1013
+ """Acknowledge a heartbeat received from a ServerApp for a given run.
1014
+
1015
+ A run with status `"running"` is considered alive as long as it sends heartbeats
1016
+ within the tolerated interval: HEARTBEAT_PATIENCE × heartbeat_interval.
1017
+ HEARTBEAT_PATIENCE = N allows for N-1 missed heartbeats before the run is
1018
+ marked as `"finished:failed"`.
1019
+ """
1020
+ # Check if runs are still active
1021
+ self._check_and_tag_inactive_run(run_ids={run_id})
1022
+
1023
+ # Search for the run
1024
+ sint_run_id = convert_uint64_to_sint64(run_id)
1025
+ query = "SELECT * FROM run WHERE run_id = ?;"
1026
+ rows = self.query(query, (sint_run_id,))
1027
+
1028
+ if not rows:
1029
+ log(ERROR, "`run_id` is invalid")
1030
+ return False
1031
+
1032
+ # Check if the run is of status "running"/"starting"
1033
+ row = rows[0]
1034
+ status = determine_run_status(row)
1035
+ if status not in (Status.RUNNING, Status.STARTING):
1036
+ log(
1037
+ ERROR,
1038
+ 'Cannot acknowledge heartbeat for run with status "%s"',
1039
+ status,
1040
+ )
1041
+ return False
1042
+
1043
+ # Update the `active_until` and `heartbeat_interval` for the given run
1044
+ active_until = now().timestamp() + HEARTBEAT_PATIENCE * heartbeat_interval
1045
+ query = "UPDATE run SET active_until = ?, heartbeat_interval = ? "
1046
+ query += "WHERE run_id = ?"
1047
+ self.query(query, (active_until, heartbeat_interval, sint_run_id))
1048
+ return True
1049
+
960
1050
  def get_serverapp_context(self, run_id: int) -> Optional[Context]:
961
1051
  """Get the context for the specified `run_id`."""
962
1052
  # Retrieve context if any
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: flwr-nightly
3
- Version: 1.19.0.dev20250511
3
+ Version: 1.19.0.dev20250513
4
4
  Summary: Flower: A Friendly Federated AI Framework
5
5
  License: Apache-2.0
6
6
  Keywords: Artificial Intelligence,Federated AI,Federated Analytics,Federated Evaluation,Federated Learning,Flower,Machine Learning
@@ -115,7 +115,7 @@ flwr/common/args.py,sha256=-aX_jVnSaDrJR2KZ8Wq0Y3dQHII4R4MJtJOIXzVUA0c,5417
115
115
  flwr/common/auth_plugin/__init__.py,sha256=m271m9YjK2QfKDOuIIhcTvGmv1GWh1PL97QB05NTSHs,887
116
116
  flwr/common/auth_plugin/auth_plugin.py,sha256=GaXw4IiU2DkVNkp5S9ue821sbkU9zWSu6HSVZetEdjs,3938
117
117
  flwr/common/config.py,sha256=glcZDjco-amw1YfQcYTFJ4S1pt9APoexT-mf1QscuHs,13960
118
- flwr/common/constant.py,sha256=RmVW2YLGosdBzyePgj2EMdZnHrT1PKEtNScaNK_FHZ0,7302
118
+ flwr/common/constant.py,sha256=Q8N-up1TvL_vllV_QA8mQlKjqVJ6Kdoze3iem6nSF9E,7375
119
119
  flwr/common/context.py,sha256=Be8obQR_OvEDy1OmshuUKxGRQ7Qx89mf5F4xlhkR10s,2407
120
120
  flwr/common/date.py,sha256=1ZT2cRSpC2DJqprOVTLXYCR_O2_OZR0zXO_brJ3LqWc,1554
121
121
  flwr/common/differential_privacy.py,sha256=FdlpdpPl_H_2HJa8CQM1iCUGBBQ5Dc8CzxmHERM-EoE,6148
@@ -128,20 +128,20 @@ flwr/common/exit/exit.py,sha256=mJgbqMlVlwAgYtq-Vedj53wO4VxcDcy_P-GzqGK-1GQ,3452
128
128
  flwr/common/exit/exit_code.py,sha256=PNEnCrZfOILjfDAFu5m-2YWEJBrk97xglq4zCUlqV7E,3470
129
129
  flwr/common/exit_handlers.py,sha256=MEk5_savTLphn-6lW57UQlos-XrFA39XEBn-OF1vXXg,3174
130
130
  flwr/common/grpc.py,sha256=manTaHaPiyYngUq1ErZvvV2B2GxlXUUUGRy3jc3TBIQ,9798
131
- flwr/common/heartbeat.py,sha256=yzi-gWH5wswdg0hfQwxwGkjI5twxIHBBVW45MD5QITI,3924
132
- flwr/common/inflatable.py,sha256=yCfnRYj4xeUqV2m-K5hcQPeVhL7gdSGw7CewPYKnjnE,3156
131
+ flwr/common/heartbeat.py,sha256=nU0SNlL0A6t736ku7D7z7UUB1vXaX9QIn2fsJJPWeBU,4125
132
+ flwr/common/inflatable.py,sha256=o3PQejN8GNm_xlHKJLB1h2wQzOXKkDfaiy6z5o142pA,3681
133
133
  flwr/common/logger.py,sha256=JbRf6E2vQxXzpDBq1T8IDUJo_usu3gjWEBPQ6uKcmdg,13049
134
134
  flwr/common/message.py,sha256=znr205Erq2hkxwFbvNNCsQTRS2UKv_Qsyu0sFNEhEAw,23721
135
135
  flwr/common/object_ref.py,sha256=p3SfTeqo3Aj16SkB-vsnNn01zswOPdGNBitcbRnqmUk,9134
136
136
  flwr/common/parameter.py,sha256=UVw6sOgehEFhFs4uUCMl2kfVq1PD6ncmWgPLMsZPKPE,2095
137
137
  flwr/common/pyproject.py,sha256=2SU6yJW7059SbMXgzjOdK1GZRWO6AixDH7BmdxbMvHI,1386
138
138
  flwr/common/record/__init__.py,sha256=cNGccdDoxttqgnUgyKRIqLWULjW-NaSmOufVxtXq-sw,1197
139
- flwr/common/record/array.py,sha256=tPTT6cw7B1Fo626LOVaA_sfj2_EtkxdnvSkRTyPrVRY,10469
140
- flwr/common/record/arrayrecord.py,sha256=KbehV2yXJ_6ZWcHPPrC-MNkE00DRCObxgyrVLwBQ5OY,14389
141
- flwr/common/record/configrecord.py,sha256=lXVGjNfQD3lqvQTstGPFfQjeEHl29alfoL9trCKKlY4,9269
139
+ flwr/common/record/array.py,sha256=7c5peTvVsnhXQVvYOIFqm2UowGO-Uu0_44EeqybU0HY,10850
140
+ flwr/common/record/arrayrecord.py,sha256=-9iIfBKtCS-fGmh5NLTDS7GTHqfHA3o1MguPyyybAyI,16787
141
+ flwr/common/record/configrecord.py,sha256=2BqauemtQBsZDWLtkTZp51_4J7L9-8i5LrM9RV8nmNQ,9664
142
142
  flwr/common/record/conversion_utils.py,sha256=wbNCzy7oAqaA3-arhls_EqRZYXRC4YrWIoE-Gy82fJ0,1191
143
- flwr/common/record/metricrecord.py,sha256=MRMv0fSmJvHlg0HtX_s4IBqxHAh8QHgFN75CmR6fBOU,8444
144
- flwr/common/record/recorddict.py,sha256=zo7TiVZCH_LB9gwUP7-Jo-jLpFLrvxYSryovwZANQiw,12386
143
+ flwr/common/record/metricrecord.py,sha256=PGeYISykqf_NSyij-qXdf85Pfnder7Rw9CagIolBe30,8839
144
+ flwr/common/record/recorddict.py,sha256=qnRET_r6_7o7JSQQO2PnMAKezA4yZEy0frsfy8CKxb8,14986
145
145
  flwr/common/record/typeddict.py,sha256=dDKgUThs2BscYUNcgP82KP8-qfAYXYftDrf2LszAC_o,3599
146
146
  flwr/common/recorddict_compat.py,sha256=Znn1xRGiqLpPPgviVqyb-GPTM-pCK6tpnEmhWSXafy8,14119
147
147
  flwr/common/retry_invoker.py,sha256=T6puUH3nCxdRzQHeanyr-0nTxhRiS1TH07rmef9vuLQ,14482
@@ -293,10 +293,10 @@ flwr/server/superlink/fleet/vce/backend/backend.py,sha256=-wDHjgAy5mrfEgXj0GxkJI
293
293
  flwr/server/superlink/fleet/vce/backend/raybackend.py,sha256=Hx9hxL7lju1_VJoAwkhBOGerZ3628u0P1zgkPhGWRPY,7154
294
294
  flwr/server/superlink/fleet/vce/vce_api.py,sha256=m7WUiHRl-jTqzjH3cqNCj3RXe3ohT6V6I0JIR6zWZj8,12780
295
295
  flwr/server/superlink/linkstate/__init__.py,sha256=OtsgvDTnZLU3k0sUbkHbqoVwW6ql2FDmb6uT6DbNkZo,1064
296
- flwr/server/superlink/linkstate/in_memory_linkstate.py,sha256=vu36ntb9N1qeDWvt6YsnPfvVppZPJi04icAgolui9no,22520
297
- flwr/server/superlink/linkstate/linkstate.py,sha256=zrVR9UpeuCfgqK5MDIqudcXtFjvsCRjD2ylv4UZu8MU,12140
296
+ flwr/server/superlink/linkstate/in_memory_linkstate.py,sha256=vvoOWjYlmOlbakH7AzpMh0jB70Qxx7UTlAGqjcA8ctM,25926
297
+ flwr/server/superlink/linkstate/linkstate.py,sha256=j6nW351t07VrBhFqjO34z8tf2PuKOE9aCX9SqpW96pQ,13100
298
298
  flwr/server/superlink/linkstate/linkstate_factory.py,sha256=8RlosqSpKOoD_vhUUQPY0jtE3A84GeF96Z7sWNkRRcA,2069
299
- flwr/server/superlink/linkstate/sqlite_linkstate.py,sha256=c-6iopBJ3Y09Jh0qYka02tdIUF3Aq9X0nrBK5eAuwOk,39847
299
+ flwr/server/superlink/linkstate/sqlite_linkstate.py,sha256=E43YO88vdnG9GW6Rwh9Fb7oWGgEABS9RXDRg3OR3T4Q,43573
300
300
  flwr/server/superlink/linkstate/utils.py,sha256=AJs9jTAEK7JnjF2AODXnOfy0pKAKpe6oUWPCanAP57s,15382
301
301
  flwr/server/superlink/serverappio/__init__.py,sha256=Fy4zJuoccZe5mZSEIpOmQvU6YeXFBa1M4eZuXXmJcn8,717
302
302
  flwr/server/superlink/serverappio/serverappio_grpc.py,sha256=opJ6SYwIAbu4NWEo3K-VxFO-tMSFmE4H3i2HwHIVRzw,2173
@@ -333,7 +333,7 @@ flwr/superexec/exec_servicer.py,sha256=Z0YYfs6eNPhqn8rY0x_R04XgR2mKFpggt07IH0EhU
333
333
  flwr/superexec/exec_user_auth_interceptor.py,sha256=iqygALkOMBUu_s_R9G0mFThZA7HTUzuXCLgxLCefiwI,4440
334
334
  flwr/superexec/executor.py,sha256=M5ucqSE53jfRtuCNf59WFLqQvA1Mln4741TySeZE7qQ,3112
335
335
  flwr/superexec/simulation.py,sha256=j6YwUvBN7EQ09ID7MYOCVZ70PGbuyBy8f9bXU0EszEM,4088
336
- flwr_nightly-1.19.0.dev20250511.dist-info/METADATA,sha256=NL5XgF9L-oJRThfCtBe-l84Z_6WJkNNNhPghf3SMFQg,15910
337
- flwr_nightly-1.19.0.dev20250511.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
338
- flwr_nightly-1.19.0.dev20250511.dist-info/entry_points.txt,sha256=2-1L-GNKhwGw2_7_RoH55vHw2SIHjdAQy3HAVAWl9PY,374
339
- flwr_nightly-1.19.0.dev20250511.dist-info/RECORD,,
336
+ flwr_nightly-1.19.0.dev20250513.dist-info/METADATA,sha256=4dlTZbixbfPDUcDbK3OBZ3JwaIFV553qWtAcY4d6zpQ,15910
337
+ flwr_nightly-1.19.0.dev20250513.dist-info/WHEEL,sha256=b4K_helf-jlQoXBBETfwnf4B04YC67LOev0jo4fX5m8,88
338
+ flwr_nightly-1.19.0.dev20250513.dist-info/entry_points.txt,sha256=2-1L-GNKhwGw2_7_RoH55vHw2SIHjdAQy3HAVAWl9PY,374
339
+ flwr_nightly-1.19.0.dev20250513.dist-info/RECORD,,