flwr-nightly 1.10.0.dev20240707__py3-none-any.whl → 1.10.0.dev20240722__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release. This version of flwr-nightly might be problematic.
- flwr/cli/build.py +16 -2
- flwr/cli/config_utils.py +36 -14
- flwr/cli/install.py +17 -1
- flwr/cli/new/new.py +31 -20
- flwr/cli/new/templates/app/code/client.hf.py.tpl +11 -3
- flwr/cli/new/templates/app/code/client.jax.py.tpl +2 -1
- flwr/cli/new/templates/app/code/client.mlx.py.tpl +15 -10
- flwr/cli/new/templates/app/code/client.numpy.py.tpl +2 -1
- flwr/cli/new/templates/app/code/client.pytorch.py.tpl +12 -3
- flwr/cli/new/templates/app/code/client.sklearn.py.tpl +6 -3
- flwr/cli/new/templates/app/code/client.tensorflow.py.tpl +13 -3
- flwr/cli/new/templates/app/code/flwr_tune/app.py.tpl +2 -2
- flwr/cli/new/templates/app/code/flwr_tune/server.py.tpl +1 -1
- flwr/cli/new/templates/app/code/server.hf.py.tpl +16 -11
- flwr/cli/new/templates/app/code/server.jax.py.tpl +15 -8
- flwr/cli/new/templates/app/code/server.mlx.py.tpl +11 -7
- flwr/cli/new/templates/app/code/server.numpy.py.tpl +15 -8
- flwr/cli/new/templates/app/code/server.pytorch.py.tpl +15 -13
- flwr/cli/new/templates/app/code/server.sklearn.py.tpl +16 -10
- flwr/cli/new/templates/app/code/server.tensorflow.py.tpl +16 -13
- flwr/cli/new/templates/app/code/task.hf.py.tpl +2 -2
- flwr/cli/new/templates/app/code/task.mlx.py.tpl +2 -2
- flwr/cli/new/templates/app/code/task.pytorch.py.tpl +1 -1
- flwr/cli/new/templates/app/pyproject.flowertune.toml.tpl +9 -12
- flwr/cli/new/templates/app/pyproject.hf.toml.tpl +17 -16
- flwr/cli/new/templates/app/pyproject.jax.toml.tpl +17 -11
- flwr/cli/new/templates/app/pyproject.mlx.toml.tpl +17 -12
- flwr/cli/new/templates/app/pyproject.numpy.toml.tpl +12 -12
- flwr/cli/new/templates/app/pyproject.pytorch.toml.tpl +13 -12
- flwr/cli/new/templates/app/pyproject.sklearn.toml.tpl +12 -12
- flwr/cli/new/templates/app/pyproject.tensorflow.toml.tpl +15 -12
- flwr/cli/run/run.py +128 -53
- flwr/client/app.py +56 -24
- flwr/client/client_app.py +28 -8
- flwr/client/grpc_adapter_client/connection.py +3 -2
- flwr/client/grpc_client/connection.py +3 -2
- flwr/client/grpc_rere_client/connection.py +17 -6
- flwr/client/message_handler/message_handler.py +1 -1
- flwr/client/node_state.py +59 -12
- flwr/client/node_state_tests.py +4 -3
- flwr/client/rest_client/connection.py +19 -8
- flwr/client/supernode/app.py +55 -24
- flwr/client/typing.py +2 -2
- flwr/common/config.py +87 -2
- flwr/common/constant.py +3 -0
- flwr/common/context.py +24 -9
- flwr/common/logger.py +25 -0
- flwr/common/serde.py +45 -0
- flwr/common/telemetry.py +17 -0
- flwr/common/typing.py +5 -0
- flwr/proto/common_pb2.py +36 -0
- flwr/proto/common_pb2.pyi +121 -0
- flwr/proto/common_pb2_grpc.py +4 -0
- flwr/proto/common_pb2_grpc.pyi +4 -0
- flwr/proto/driver_pb2.py +24 -19
- flwr/proto/driver_pb2.pyi +21 -1
- flwr/proto/exec_pb2.py +16 -11
- flwr/proto/exec_pb2.pyi +22 -1
- flwr/proto/run_pb2.py +12 -7
- flwr/proto/run_pb2.pyi +22 -1
- flwr/proto/task_pb2.py +7 -8
- flwr/server/__init__.py +2 -0
- flwr/server/compat/legacy_context.py +5 -4
- flwr/server/driver/grpc_driver.py +82 -140
- flwr/server/run_serverapp.py +40 -15
- flwr/server/server_app.py +56 -10
- flwr/server/serverapp_components.py +52 -0
- flwr/server/superlink/driver/driver_servicer.py +18 -3
- flwr/server/superlink/fleet/message_handler/message_handler.py +13 -2
- flwr/server/superlink/fleet/vce/backend/backend.py +4 -4
- flwr/server/superlink/fleet/vce/backend/raybackend.py +10 -10
- flwr/server/superlink/fleet/vce/vce_api.py +149 -117
- flwr/server/superlink/state/in_memory_state.py +11 -3
- flwr/server/superlink/state/sqlite_state.py +23 -8
- flwr/server/superlink/state/state.py +7 -2
- flwr/server/typing.py +2 -0
- flwr/server/workflow/secure_aggregation/secaggplus_workflow.py +18 -2
- flwr/simulation/app.py +4 -3
- flwr/simulation/ray_transport/ray_actor.py +15 -19
- flwr/simulation/ray_transport/ray_client_proxy.py +22 -9
- flwr/simulation/run_simulation.py +237 -66
- flwr/superexec/app.py +14 -7
- flwr/superexec/deployment.py +110 -33
- flwr/superexec/exec_grpc.py +5 -1
- flwr/superexec/exec_servicer.py +4 -1
- flwr/superexec/executor.py +18 -0
- flwr/superexec/simulation.py +151 -0
- {flwr_nightly-1.10.0.dev20240707.dist-info → flwr_nightly-1.10.0.dev20240722.dist-info}/METADATA +3 -2
- {flwr_nightly-1.10.0.dev20240707.dist-info → flwr_nightly-1.10.0.dev20240722.dist-info}/RECORD +92 -86
- {flwr_nightly-1.10.0.dev20240707.dist-info → flwr_nightly-1.10.0.dev20240722.dist-info}/LICENSE +0 -0
- {flwr_nightly-1.10.0.dev20240707.dist-info → flwr_nightly-1.10.0.dev20240722.dist-info}/WHEEL +0 -0
- {flwr_nightly-1.10.0.dev20240707.dist-info → flwr_nightly-1.10.0.dev20240722.dist-info}/entry_points.txt +0 -0
flwr/server/superlink/fleet/vce/backend/backend.py
CHANGED

@@ -33,8 +33,8 @@ class Backend(ABC):
         """Construct a backend."""
 
     @abstractmethod
-    async def build(self) -> None:
-        """Build backend
+    def build(self) -> None:
+        """Build backend.
 
         Different components need to be in place before workers in a backend are ready
         to accept jobs. When this method finishes executing, the backend should be fully
@@ -54,11 +54,11 @@ class Backend(ABC):
         """Report whether a backend worker is idle and can therefore run a ClientApp."""
 
     @abstractmethod
-    async def terminate(self) -> None:
+    def terminate(self) -> None:
         """Terminate backend."""
 
     @abstractmethod
-    async def process_message(
+    def process_message(
         self,
         app: Callable[[], ClientApp],
         message: Message,
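Note: both hunks drop `async` from the Backend ABC, so build, terminate, and process_message are now plain synchronous methods. A minimal sketch of the resulting contract, using a toy stand-in for the ABC and a hypothetical InProcessBackend (neither is flwr code):

from abc import ABC, abstractmethod
from typing import Any, Callable, Tuple


class Backend(ABC):  # toy stand-in mirroring the synchronous interface above
    @abstractmethod
    def build(self) -> None: ...

    @abstractmethod
    def process_message(
        self, app: Callable[[], Any], message: Any, context: Any
    ) -> Tuple[Any, Any]: ...

    @abstractmethod
    def terminate(self) -> None: ...


class InProcessBackend(Backend):  # hypothetical example implementation
    def build(self) -> None:
        pass  # nothing to set up in this toy

    def process_message(self, app, message, context):
        # Plain call; callers no longer need `await backend.process_message(...)`
        return app()(message, context), context

    def terminate(self) -> None:
        pass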
flwr/server/superlink/fleet/vce/backend/raybackend.py
CHANGED

@@ -21,6 +21,7 @@ from typing import Callable, Dict, List, Tuple, Union
 import ray
 
 from flwr.client.client_app import ClientApp
+from flwr.common.constant import PARTITION_ID_KEY
 from flwr.common.context import Context
 from flwr.common.logger import log
 from flwr.common.message import Message
@@ -153,12 +154,12 @@ class RayBackend(Backend):
         """Report whether the pool has idle actors."""
         return self.pool.is_actor_available()
 
-    async def build(self) -> None:
+    def build(self) -> None:
         """Build pool of Ray actors that this backend will submit jobs to."""
-        await self.pool.add_actors_to_pool(self.pool.actors_capacity)
+        self.pool.add_actors_to_pool(self.pool.actors_capacity)
         log(DEBUG, "Constructed ActorPool with: %i actors", self.pool.num_actors)
 
-    async def process_message(
+    def process_message(
         self,
         app: Callable[[], ClientApp],
         message: Message,
@@ -168,21 +169,20 @@ class RayBackend(Backend):
 
         Return output message and updated context.
         """
-        partition_id = context.partition_id
+        partition_id = context.node_config[PARTITION_ID_KEY]
 
         try:
             # Submit a task to the pool
-            future = await self.pool.submit(
+            future = self.pool.submit(
                 lambda a, a_fn, mssg, cid, state: a.run.remote(a_fn, mssg, cid, state),
                 (app, message, str(partition_id), context),
             )
 
-            await future
             # Fetch result
             (
                 out_mssg,
                 updated_context,
-            ) = await self.pool.fetch_result_and_return_actor_to_pool(future)
+            ) = self.pool.fetch_result_and_return_actor_to_pool(future)
 
             return out_mssg, updated_context
 
@@ -193,11 +193,11 @@ class RayBackend(Backend):
                 self.__class__.__name__,
             )
             # add actor back into pool
-            await self.pool.add_actor_back_to_pool(future)
+            self.pool.add_actor_back_to_pool(future)
             raise ex
 
-    async def terminate(self) -> None:
+    def terminate(self) -> None:
         """Terminate all actors in actor pool."""
-        await self.pool.terminate_all_actors()
+        self.pool.terminate_all_actors()
         ray.shutdown()
         log(DEBUG, "Terminated %s", self.__class__.__name__)
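Note: RayBackend now takes the partition id from `context.node_config[PARTITION_ID_KEY]` instead of a dedicated Context attribute; the keys are written as strings by `_register_node_states` in vce_api.py below. A hedged sketch of reading them on the ClientApp side (the `client_fn` shown is illustrative, not a template from this release):

from flwr.common.constant import NUM_PARTITIONS_KEY, PARTITION_ID_KEY
from flwr.common.context import Context


def client_fn(context: Context) -> None:  # illustrative only
    # node_config values are registered as strings, hence the int() casts
    partition_id = int(context.node_config[PARTITION_ID_KEY])
    num_partitions = int(context.node_config[NUM_PARTITIONS_KEY])
    print(f"ClientApp running on partition {partition_id} of {num_partitions}")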
flwr/server/superlink/fleet/vce/vce_api.py
CHANGED

@@ -14,24 +14,33 @@
 # ==============================================================================
 """Fleet Simulation Engine API."""
 
-import asyncio
+
 import json
-import sys
+import threading
 import time
 import traceback
+from concurrent.futures import ThreadPoolExecutor
 from logging import DEBUG, ERROR, INFO, WARN
 from pathlib import Path
-from typing import Callable, Dict, List, Optional
+from queue import Empty, Queue
+from time import sleep
+from typing import Callable, Dict, Optional
 
 from flwr.client.client_app import ClientApp, ClientAppException, LoadClientAppError
 from flwr.client.node_state import NodeState
-from flwr.common.constant import PING_MAX_INTERVAL, ErrorCode
+from flwr.client.supernode.app import _get_load_client_app_fn
+from flwr.common.constant import (
+    NUM_PARTITIONS_KEY,
+    PARTITION_ID_KEY,
+    PING_MAX_INTERVAL,
+    ErrorCode,
+)
 from flwr.common.logger import log
 from flwr.common.message import Error
-from flwr.common.object_ref import load_app
 from flwr.common.serde import message_from_taskins, message_to_taskres
-from flwr.proto.task_pb2 import TaskIns  # pylint: disable=E0611
-from flwr.server.superlink.state import StateFactory
+from flwr.common.typing import Run
+from flwr.proto.task_pb2 import TaskIns, TaskRes  # pylint: disable=E0611
+from flwr.server.superlink.state import State, StateFactory
 
 from .backend import Backend, error_messages_backends, supported_backends
 
@@ -51,31 +60,57 @@ def _register_nodes(
     return nodes_mapping
 
 
+def _register_node_states(
+    nodes_mapping: NodeToPartitionMapping,
+    run: Run,
+    app_dir: Optional[str] = None,
+) -> Dict[int, NodeState]:
+    """Create NodeState objects and pre-register the context for the run."""
+    node_states: Dict[int, NodeState] = {}
+    num_partitions = len(set(nodes_mapping.values()))
+    for node_id, partition_id in nodes_mapping.items():
+        node_states[node_id] = NodeState(
+            node_id=node_id,
+            node_config={
+                PARTITION_ID_KEY: str(partition_id),
+                NUM_PARTITIONS_KEY: str(num_partitions),
+            },
+        )
+
+        # Pre-register Context objects
+        node_states[node_id].register_context(
+            run_id=run.run_id, run=run, app_dir=app_dir
+        )
+
+    return node_states
+
+
 # pylint: disable=too-many-arguments,too-many-locals
-async def worker(
+def worker(
     app_fn: Callable[[], ClientApp],
-    queue: "asyncio.Queue[TaskIns]",
+    taskins_queue: "Queue[TaskIns]",
+    taskres_queue: "Queue[TaskRes]",
     node_states: Dict[int, NodeState],
-    state_factory: StateFactory,
     backend: Backend,
+    f_stop: threading.Event,
 ) -> None:
     """Get TaskIns from queue and pass it to an actor in the pool to execute it."""
-    state = state_factory.state()
-    while True:
+    while not f_stop.is_set():
         out_mssg = None
         try:
-            task_ins: TaskIns = await queue.get()
+            # Fetch from queue with timeout. We use a timeout so
+            # the stopping event can be evaluated even when the queue is empty.
+            task_ins: TaskIns = taskins_queue.get(timeout=1.0)
             node_id = task_ins.task.consumer.node_id
 
-            # Register and retrieve context
-            node_states[node_id].register_context(run_id=task_ins.run_id)
+            # Retrieve context
             context = node_states[node_id].retrieve_context(run_id=task_ins.run_id)
 
             # Convert TaskIns to Message
             message = message_from_taskins(task_ins)
 
             # Let backend process message
-            out_mssg, updated_context = await backend.process_message(
+            out_mssg, updated_context = backend.process_message(
                 app_fn, message, context
             )
 
@@ -83,11 +118,9 @@ async def worker(
             node_states[node_id].update_context(
                 task_ins.run_id, context=updated_context
             )
-
-        except asyncio.CancelledError as e:
-            log(DEBUG, "Terminating async worker: %s", e)
-            break
-
+        except Empty:
+            # An exception raised if queue.get times out
+            pass
         # Exceptions aren't raised but reported as an error message
         except Exception as ex:  # pylint: disable=broad-exception-caught
             log(ERROR, ex)
@@ -111,67 +144,48 @@ async def worker(
             task_res = message_to_taskres(out_mssg)
             # Store TaskRes in state
             task_res.task.pushed_at = time.time()
-            state.store_task_res(task_res)
+            taskres_queue.put(task_res)
 
 
-async def add_taskins_to_queue(
-    queue: "asyncio.Queue[TaskIns]",
-    state_factory: StateFactory,
+def add_taskins_to_queue(
+    state: State,
+    queue: "Queue[TaskIns]",
     nodes_mapping: NodeToPartitionMapping,
-    backend: Backend,
-    consumers: List["asyncio.Task[None]"],
-    f_stop: asyncio.Event,
+    f_stop: threading.Event,
 ) -> None:
-    """Retrieve TaskIns and add it to the queue."""
-    state = state_factory.state()
-    num_initial_consumers = len(consumers)
+    """Put TaskIns in a queue from State."""
     while not f_stop.is_set():
         for node_id in nodes_mapping.keys():
-            task_ins = state.get_task_ins(node_id=node_id, limit=1)
-            if task_ins:
-                await queue.put(task_ins[0])
-
-            # Count consumers that are running
-            num_active = sum(not (cc.done()) for cc in consumers)
-
-            # Alert if number of consumers decreased by half
-            if num_active < num_initial_consumers // 2:
-                log(
-                    WARN,
-                    "Number of active workers has more than halved: (%i/%i active)",
-                    num_active,
-                    num_initial_consumers,
-                )
+            task_ins_list = state.get_task_ins(node_id=node_id, limit=1)
+            for task_ins in task_ins_list:
+                queue.put(task_ins)
+        sleep(0.1)
 
-            # Break if consumers died
-            if num_active == 0:
-                raise RuntimeError("All workers have died. Ending Simulation.")
 
-
-
-
-
-
-
-
-
-
-            queue
-
-        await asyncio.sleep(1.0)
-    log(DEBUG, "Async producer: Stopped pulling from StateFactory.")
+def put_taskres_into_state(
+    state: State, queue: "Queue[TaskRes]", f_stop: threading.Event
+) -> None:
+    """Put TaskRes into State from a queue."""
+    while not f_stop.is_set():
+        try:
+            taskres = queue.get(timeout=1.0)
+            state.store_task_res(taskres)
+        except Empty:
+            # queue is empty when timeout was triggered
+            pass
 
 
-async def run(
+def run_api(
    app_fn: Callable[[], ClientApp],
     backend_fn: Callable[[], Backend],
     nodes_mapping: NodeToPartitionMapping,
     state_factory: StateFactory,
     node_states: Dict[int, NodeState],
-    f_stop: asyncio.Event,
+    f_stop: threading.Event,
 ) -> None:
-    """Run the VCE async."""
-
+    """Run the VCE."""
+    taskins_queue: "Queue[TaskIns]" = Queue()
+    taskres_queue: "Queue[TaskRes]" = Queue()
 
     try:
 
@@ -179,27 +193,48 @@ async def run(
         backend = backend_fn()
 
         # Build backend
-        await backend.build()
+        backend.build()
 
         # Add workers (they submit Messages to Backend)
-
-
-
-
-
-
-
-
-
-
-            )
+        state = state_factory.state()
+
+        extractor_th = threading.Thread(
+            target=add_taskins_to_queue,
+            args=(
+                state,
+                taskins_queue,
+                nodes_mapping,
+                f_stop,
+            ),
         )
+        extractor_th.start()
 
-
-
-
-
-
+        injector_th = threading.Thread(
+            target=put_taskres_into_state,
+            args=(
+                state,
+                taskres_queue,
+                f_stop,
+            ),
+        )
+        injector_th.start()
+
+        with ThreadPoolExecutor() as executor:
+            _ = [
+                executor.submit(
+                    worker,
+                    app_fn,
+                    taskins_queue,
+                    taskres_queue,
+                    node_states,
+                    backend,
+                    f_stop,
+                )
+                for _ in range(backend.num_workers)
+            ]
+
+        extractor_th.join()
+        injector_th.join()
 
     except Exception as ex:
 
@@ -214,18 +249,9 @@ async def run(
         raise RuntimeError("Simulation Engine crashed.") from ex
 
     finally:
-        # Produced task terminated, now cancel worker tasks
-        for w_t in worker_tasks:
-            _ = w_t.cancel()
-
-        while not all(w_t.done() for w_t in worker_tasks):
-            log(DEBUG, "Terminating async workers...")
-            await asyncio.sleep(0.5)
-
-        await asyncio.gather(*[w_t for w_t in worker_tasks if not w_t.done()])
 
         # Terminate backend
-        await backend.terminate()
+        backend.terminate()
 
 
 # pylint: disable=too-many-arguments,unused-argument,too-many-locals,too-many-branches
@@ -234,7 +260,10 @@ def start_vce(
     backend_name: str,
     backend_config_json_stream: str,
     app_dir: str,
-    f_stop: asyncio.Event,
+    is_app: bool,
+    f_stop: threading.Event,
+    run: Run,
+    flwr_dir: Optional[str] = None,
     client_app: Optional[ClientApp] = None,
     client_app_attr: Optional[str] = None,
     num_supernodes: Optional[int] = None,
@@ -285,9 +314,9 @@ def start_vce(
     )
 
     # Construct mapping of NodeStates
-    node_states: Dict[int, NodeState] = {}
-    for node_id, partition_id in nodes_mapping.items():
-        node_states[node_id] = NodeState(partition_id=partition_id)
+    node_states = _register_node_states(
+        nodes_mapping=nodes_mapping, run=run, app_dir=app_dir if is_app else None
+    )
 
     # Load backend config
     log(DEBUG, "Supported backends: %s", list(supported_backends.keys()))
@@ -316,16 +345,12 @@ def start_vce(
     def _load() -> ClientApp:
 
         if client_app_attr:
-
-
-
-
-
-
-            if not isinstance(app, ClientApp):
-                raise LoadClientAppError(
-                    f"Attribute {client_app_attr} is not of type {ClientApp}",
-                ) from None
+            app = _get_load_client_app_fn(
+                default_app_ref=client_app_attr,
+                dir_arg=app_dir,
+                flwr_dir_arg=flwr_dir,
+                multi_app=True,
+            )(run.fab_id, run.fab_version)
 
         if client_app:
             app = client_app
@@ -335,18 +360,25 @@ def start_vce(
 
     try:
         # Test if ClientApp can be loaded
-
+        client_app = app_fn()
+
+        # Cache `ClientApp`
+        if client_app_attr:
+            # Now wrap the loaded ClientApp in a dummy function
+            # this prevent unnecesary low-level loading of ClientApp
+            def _load_client_app() -> ClientApp:
+                return client_app
+
+            app_fn = _load_client_app
 
         # Run main simulation loop
-        asyncio.run(
-            run(
-                app_fn,
-                backend_fn,
-                nodes_mapping,
-                state_factory,
-                node_states,
-                f_stop,
-            )
+        run_api(
+            app_fn,
+            backend_fn,
+            nodes_mapping,
+            state_factory,
+            node_states,
+            f_stop,
         )
     except LoadClientAppError as loadapp_ex:
         f_stop_delay = 10
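Note: this file carries the core of the release change: the simulation engine's asyncio event loop is replaced by queues, worker threads, and a stop event. A self-contained sketch of the same producer/worker pattern with toy payloads (no flwr imports; all names here are illustrative):

import threading
import time
from concurrent.futures import ThreadPoolExecutor
from queue import Empty, Queue

f_stop = threading.Event()
work_q: "Queue[int]" = Queue()
out_q: "Queue[int]" = Queue()


def worker() -> None:
    # Same shape as the new vce_api worker: the timeout on get() lets the
    # thread re-check the stop event instead of blocking forever.
    while not f_stop.is_set():
        try:
            item = work_q.get(timeout=0.1)
        except Empty:
            continue
        out_q.put(item * item)


for i in range(8):  # stand-in for the TaskIns extractor thread
    work_q.put(i)

with ThreadPoolExecutor(max_workers=2) as pool:
    for _ in range(2):
        pool.submit(worker)
    while out_q.qsize() < 8:  # wait until all work has drained
        time.sleep(0.05)
    f_stop.set()  # replaces asyncio task cancellation

print(sorted(out_q.get() for _ in range(8)))  # [0, 1, 4, 9, 16, 25, 36, 49]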
flwr/server/superlink/state/in_memory_state.py
CHANGED

@@ -23,7 +23,7 @@ from uuid import UUID, uuid4
 
 from flwr.common import log, now
 from flwr.common.constant import NODE_ID_NUM_BYTES, RUN_ID_NUM_BYTES
-from flwr.common.typing import Run
+from flwr.common.typing import Run, UserConfig
 from flwr.proto.task_pb2 import TaskIns, TaskRes  # pylint: disable=E0611
 from flwr.server.superlink.state.state import State
 from flwr.server.utils import validate_task_ins_or_res
@@ -275,7 +275,12 @@ class InMemoryState(State):  # pylint: disable=R0902,R0904
         """Retrieve stored `node_id` filtered by `client_public_keys`."""
         return self.public_key_to_node_id.get(client_public_key)
 
-    def create_run(self, fab_id: str, fab_version: str) -> int:
+    def create_run(
+        self,
+        fab_id: str,
+        fab_version: str,
+        override_config: UserConfig,
+    ) -> int:
         """Create a new run for the specified `fab_id` and `fab_version`."""
         # Sample a random int64 as run_id
         with self.lock:
@@ -283,7 +288,10 @@ class InMemoryState(State):  # pylint: disable=R0902,R0904
 
             if run_id not in self.run_ids:
                 self.run_ids[run_id] = Run(
-                    run_id=run_id, fab_id=fab_id, fab_version=fab_version
+                    run_id=run_id,
+                    fab_id=fab_id,
+                    fab_version=fab_version,
+                    override_config=override_config,
                 )
                 return run_id
         log(ERROR, "Unexpected run creation failure.")
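Note: `create_run` gains an `override_config` argument that is stored on the Run record. A toy mirror of the updated record, assuming UserConfig is the str-to-scalar mapping from flwr.common.typing (values below are illustrative):

from dataclasses import dataclass, field
from typing import Dict, Union

UserConfig = Dict[str, Union[bool, float, int, str]]  # assumed alias


@dataclass
class Run:  # toy mirror of flwr.common.typing.Run after this change
    run_id: int
    fab_id: str
    fab_version: str
    override_config: UserConfig = field(default_factory=dict)


run = Run(
    run_id=1,
    fab_id="flwrlabs/quickstart",  # illustrative values
    fab_version="1.0.0",
    override_config={"num-server-rounds": 3, "lr": 0.01},
)
assert run.override_config["lr"] == 0.01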
flwr/server/superlink/state/sqlite_state.py
CHANGED

@@ -15,6 +15,7 @@
 """SQLite based implemenation of server state."""
 
 
+import json
 import re
 import sqlite3
 import time
@@ -24,7 +25,7 @@ from uuid import UUID, uuid4
 
 from flwr.common import log, now
 from flwr.common.constant import NODE_ID_NUM_BYTES, RUN_ID_NUM_BYTES
-from flwr.common.typing import Run
+from flwr.common.typing import Run, UserConfig
 from flwr.proto.node_pb2 import Node  # pylint: disable=E0611
 from flwr.proto.recordset_pb2 import RecordSet  # pylint: disable=E0611
 from flwr.proto.task_pb2 import Task, TaskIns, TaskRes  # pylint: disable=E0611
@@ -61,9 +62,10 @@ CREATE INDEX IF NOT EXISTS idx_online_until ON node (online_until);
 
 SQL_CREATE_TABLE_RUN = """
 CREATE TABLE IF NOT EXISTS run(
-    run_id
-    fab_id
-    fab_version
+    run_id INTEGER UNIQUE,
+    fab_id TEXT,
+    fab_version TEXT,
+    override_config TEXT
 );
 """
 
@@ -613,7 +615,12 @@ class SqliteState(State):  # pylint: disable=R0904
             return node_id
         return None
 
-    def create_run(self, fab_id: str, fab_version: str) -> int:
+    def create_run(
+        self,
+        fab_id: str,
+        fab_version: str,
+        override_config: UserConfig,
+    ) -> int:
         """Create a new run for the specified `fab_id` and `fab_version`."""
         # Sample a random int64 as run_id
         run_id = generate_rand_int_from_bytes(RUN_ID_NUM_BYTES)
@@ -622,8 +629,13 @@ class SqliteState(State):  # pylint: disable=R0904
         query = "SELECT COUNT(*) FROM run WHERE run_id = ?;"
         # If run_id does not exist
         if self.query(query, (run_id,))[0]["COUNT(*)"] == 0:
-            query = "INSERT INTO run (run_id, fab_id, fab_version) VALUES (?, ?, ?);"
-            self.query(query, (run_id, fab_id, fab_version))
+            query = (
+                "INSERT INTO run (run_id, fab_id, fab_version, override_config)"
+                "VALUES (?, ?, ?, ?);"
+            )
+            self.query(
+                query, (run_id, fab_id, fab_version, json.dumps(override_config))
+            )
             return run_id
         log(ERROR, "Unexpected run creation failure.")
         return 0
@@ -687,7 +699,10 @@ class SqliteState(State):  # pylint: disable=R0904
         try:
             row = self.query(query, (run_id,))[0]
             return Run(
-                run_id=run_id, fab_id=row["fab_id"], fab_version=row["fab_version"]
+                run_id=run_id,
+                fab_id=row["fab_id"],
+                fab_version=row["fab_version"],
+                override_config=json.loads(row["override_config"]),
             )
         except sqlite3.IntegrityError:
             log(ERROR, "`run_id` does not exist.")
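Note: unlike the in-memory state, SQLite needs the config serialized: the hunks above store it via json.dumps in a TEXT column and rebuild it with json.loads. A self-contained sketch of that round trip using plain sqlite3 (illustrative values):

import json
import sqlite3

conn = sqlite3.connect(":memory:")
conn.execute(
    "CREATE TABLE IF NOT EXISTS run("
    "run_id INTEGER UNIQUE, fab_id TEXT, fab_version TEXT, override_config TEXT);"
)
cfg = {"num-server-rounds": 3, "lr": 0.01}
conn.execute(
    "INSERT INTO run (run_id, fab_id, fab_version, override_config)"
    "VALUES (?, ?, ?, ?);",
    (1, "flwrlabs/quickstart", "1.0.0", json.dumps(cfg)),
)
row = conn.execute("SELECT override_config FROM run WHERE run_id = 1;").fetchone()
assert json.loads(row[0]) == cfg  # lossless for JSON-compatible scalar values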
flwr/server/superlink/state/state.py
CHANGED

@@ -19,7 +19,7 @@ import abc
 from typing import List, Optional, Set
 from uuid import UUID
 
-from flwr.common.typing import Run
+from flwr.common.typing import Run, UserConfig
 from flwr.proto.task_pb2 import TaskIns, TaskRes  # pylint: disable=E0611
 
 
@@ -157,7 +157,12 @@ class State(abc.ABC):  # pylint: disable=R0904
         """Retrieve stored `node_id` filtered by `client_public_keys`."""
 
     @abc.abstractmethod
-    def create_run(self, fab_id: str, fab_version: str) -> int:
+    def create_run(
+        self,
+        fab_id: str,
+        fab_version: str,
+        override_config: UserConfig,
+    ) -> int:
         """Create a new run for the specified `fab_id` and `fab_version`."""
 
     @abc.abstractmethod
flwr/server/typing.py
CHANGED

@@ -20,6 +20,8 @@ from typing import Callable
 from flwr.common import Context
 
 from .driver import Driver
+from .serverapp_components import ServerAppComponents
 
 ServerAppCallable = Callable[[Driver, Context], None]
 Workflow = Callable[[Driver, Context], None]
+ServerFn = Callable[[Context], ServerAppComponents]
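Note: `ServerFn` types the new `server_fn` entry point that a ServerApp can be constructed from (see server_app.py and the new serverapp_components.py in the file list). A hedged sketch of its intended use; it assumes ServerAppComponents is re-exported from flwr.server (the `__init__.py +2` entry above) and accepts strategy/config keywords, as in later stable releases:

from flwr.common import Context
from flwr.server import ServerApp, ServerAppComponents, ServerConfig
from flwr.server.strategy import FedAvg


def server_fn(context: Context) -> ServerAppComponents:
    # Components are created per run; run_config carries any overrides
    num_rounds = int(context.run_config.get("num-server-rounds", 3))
    return ServerAppComponents(
        strategy=FedAvg(), config=ServerConfig(num_rounds=num_rounds)
    )


app = ServerApp(server_fn=server_fn)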
flwr/server/workflow/secure_aggregation/secaggplus_workflow.py
CHANGED

@@ -81,6 +81,7 @@ class WorkflowState:  # pylint: disable=R0902
     forward_ciphertexts: Dict[int, List[bytes]] = field(default_factory=dict)
     aggregate_ndarrays: NDArrays = field(default_factory=list)
     legacy_results: List[Tuple[ClientProxy, FitRes]] = field(default_factory=list)
+    failures: List[Exception] = field(default_factory=list)
 
 
 class SecAggPlusWorkflow:
@@ -394,6 +395,7 @@ class SecAggPlusWorkflow:
 
         for msg in msgs:
             if msg.has_error():
+                state.failures.append(Exception(msg.error))
                 continue
             key_dict = msg.content.configs_records[RECORD_KEY_CONFIGS]
             node_id = msg.metadata.src_node_id
@@ -451,6 +453,9 @@ class SecAggPlusWorkflow:
             nid: [] for nid in state.active_node_ids
         }  # dest node ID -> list of src node IDs
         for msg in msgs:
+            if msg.has_error():
+                state.failures.append(Exception(msg.error))
+                continue
             node_id = msg.metadata.src_node_id
             res_dict = msg.content.configs_records[RECORD_KEY_CONFIGS]
             dst_lst = cast(List[int], res_dict[Key.DESTINATION_LIST])
@@ -515,6 +520,9 @@ class SecAggPlusWorkflow:
         # Sum collected masked vectors and compute active/dead node IDs
         masked_vector = None
         for msg in msgs:
+            if msg.has_error():
+                state.failures.append(Exception(msg.error))
+                continue
             res_dict = msg.content.configs_records[RECORD_KEY_CONFIGS]
             bytes_list = cast(List[bytes], res_dict[Key.MASKED_PARAMETERS])
             client_masked_vec = [bytes_to_ndarray(b) for b in bytes_list]
@@ -528,6 +536,9 @@ class SecAggPlusWorkflow:
 
         # Backward compatibility with Strategy
         for msg in msgs:
+            if msg.has_error():
+                state.failures.append(Exception(msg.error))
+                continue
             fitres = compat.recordset_to_fitres(msg.content, True)
             proxy = state.nid_to_proxies[msg.metadata.src_node_id]
             state.legacy_results.append((proxy, fitres))
@@ -584,6 +595,9 @@ class SecAggPlusWorkflow:
         for nid in state.sampled_node_ids:
             collected_shares_dict[nid] = []
         for msg in msgs:
+            if msg.has_error():
+                state.failures.append(Exception(msg.error))
+                continue
             res_dict = msg.content.configs_records[RECORD_KEY_CONFIGS]
             nids = cast(List[int], res_dict[Key.NODE_ID_LIST])
             shares = cast(List[bytes], res_dict[Key.SHARE_LIST])
@@ -652,9 +666,11 @@ class SecAggPlusWorkflow:
             INFO,
             "aggregate_fit: received %s results and %s failures",
             len(results),
-
+            len(state.failures),
+        )
+        aggregated_result = context.strategy.aggregate_fit(
+            current_round, results, state.failures  # type: ignore
         )
-        aggregated_result = context.strategy.aggregate_fit(current_round, results, [])
         parameters_aggregated, metrics_aggregated = aggregated_result
 
         # Update the parameters and write history
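Note: with these hunks, message-level errors are recorded in `state.failures` at every collection step and finally forwarded to `Strategy.aggregate_fit`, which previously always received an empty list. A hedged sketch of a strategy that inspects them (FedAvgWithFailureLog is illustrative; the aggregate_fit signature matches flwr's Strategy API):

from logging import WARNING

from flwr.common.logger import log
from flwr.server.strategy import FedAvg


class FedAvgWithFailureLog(FedAvg):  # illustrative subclass, not flwr code
    def aggregate_fit(self, server_round, results, failures):
        for failure in failures:
            log(WARNING, "SecAgg+ round %s failure: %s", server_round, failure)
        # FedAvg tolerates failures by default (accept_failures=True) and
        # returns (None, {}) when accept_failures=False and failures exist.
        return super().aggregate_fit(server_round, results, failures)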
|