PyPI - flwr-nightly - Versions diffs - 1.10.0.dev20240624__py3-none-any.whl → 1.10.0.dev20240722__py3-none-any.whl - Mend

flwr-nightly 1.10.0.dev20240624__py3-none-any.whl → 1.10.0.dev20240722__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of flwr-nightly might be problematic. Click here for more details.

Files changed (95)

flwr/cli/build.py +18 -4
flwr/cli/config_utils.py +36 -14
flwr/cli/install.py +17 -1
flwr/cli/new/new.py +31 -20
flwr/cli/new/templates/app/code/client.hf.py.tpl +11 -3
flwr/cli/new/templates/app/code/client.jax.py.tpl +2 -1
flwr/cli/new/templates/app/code/client.mlx.py.tpl +15 -10
flwr/cli/new/templates/app/code/client.numpy.py.tpl +2 -1
flwr/cli/new/templates/app/code/client.pytorch.py.tpl +12 -3
flwr/cli/new/templates/app/code/client.sklearn.py.tpl +6 -3
flwr/cli/new/templates/app/code/client.tensorflow.py.tpl +13 -3
flwr/cli/new/templates/app/code/flwr_tune/app.py.tpl +2 -2
flwr/cli/new/templates/app/code/flwr_tune/server.py.tpl +1 -1
flwr/cli/new/templates/app/code/server.hf.py.tpl +16 -11
flwr/cli/new/templates/app/code/server.jax.py.tpl +15 -8
flwr/cli/new/templates/app/code/server.mlx.py.tpl +11 -7
flwr/cli/new/templates/app/code/server.numpy.py.tpl +15 -8
flwr/cli/new/templates/app/code/server.pytorch.py.tpl +15 -13
flwr/cli/new/templates/app/code/server.sklearn.py.tpl +16 -10
flwr/cli/new/templates/app/code/server.tensorflow.py.tpl +16 -13
flwr/cli/new/templates/app/code/task.hf.py.tpl +2 -2
flwr/cli/new/templates/app/code/task.mlx.py.tpl +2 -2
flwr/cli/new/templates/app/code/task.pytorch.py.tpl +1 -1
flwr/cli/new/templates/app/pyproject.flowertune.toml.tpl +9 -12
flwr/cli/new/templates/app/pyproject.hf.toml.tpl +17 -16
flwr/cli/new/templates/app/pyproject.jax.toml.tpl +17 -11
flwr/cli/new/templates/app/pyproject.mlx.toml.tpl +17 -12
flwr/cli/new/templates/app/pyproject.numpy.toml.tpl +12 -12
flwr/cli/new/templates/app/pyproject.pytorch.toml.tpl +13 -12
flwr/cli/new/templates/app/pyproject.sklearn.toml.tpl +12 -12
flwr/cli/new/templates/app/pyproject.tensorflow.toml.tpl +15 -12
flwr/cli/run/run.py +135 -51
flwr/client/__init__.py +2 -0
flwr/client/app.py +63 -26
flwr/client/client_app.py +49 -4
flwr/client/grpc_adapter_client/connection.py +3 -2
flwr/client/grpc_client/connection.py +3 -2
flwr/client/grpc_rere_client/connection.py +17 -6
flwr/client/message_handler/message_handler.py +3 -4
flwr/client/node_state.py +60 -10
flwr/client/node_state_tests.py +4 -3
flwr/client/rest_client/connection.py +19 -8
flwr/client/supernode/app.py +60 -21
flwr/client/typing.py +1 -0
flwr/common/config.py +87 -2
flwr/common/constant.py +6 -0
flwr/common/context.py +26 -1
flwr/common/logger.py +38 -0
flwr/common/message.py +0 -17
flwr/common/serde.py +45 -0
flwr/common/telemetry.py +17 -0
flwr/common/typing.py +5 -0
flwr/proto/common_pb2.py +36 -0
flwr/proto/common_pb2.pyi +121 -0
flwr/proto/common_pb2_grpc.py +4 -0
flwr/proto/common_pb2_grpc.pyi +4 -0
flwr/proto/driver_pb2.py +24 -19
flwr/proto/driver_pb2.pyi +21 -1
flwr/proto/exec_pb2.py +16 -11
flwr/proto/exec_pb2.pyi +22 -1
flwr/proto/run_pb2.py +12 -7
flwr/proto/run_pb2.pyi +22 -1
flwr/proto/task_pb2.py +7 -8
flwr/server/__init__.py +2 -0
flwr/server/compat/legacy_context.py +5 -4
flwr/server/driver/grpc_driver.py +82 -140
flwr/server/run_serverapp.py +40 -15
flwr/server/server_app.py +56 -10
flwr/server/serverapp_components.py +52 -0
flwr/server/superlink/driver/driver_servicer.py +18 -3
flwr/server/superlink/fleet/message_handler/message_handler.py +13 -2
flwr/server/superlink/fleet/vce/backend/backend.py +4 -4
flwr/server/superlink/fleet/vce/backend/raybackend.py +10 -10
flwr/server/superlink/fleet/vce/vce_api.py +149 -122
flwr/server/superlink/state/in_memory_state.py +15 -7
flwr/server/superlink/state/sqlite_state.py +27 -12
flwr/server/superlink/state/state.py +7 -2
flwr/server/superlink/state/utils.py +6 -0
flwr/server/typing.py +2 -0
flwr/server/workflow/secure_aggregation/secaggplus_workflow.py +18 -2
flwr/simulation/app.py +52 -36
flwr/simulation/ray_transport/ray_actor.py +15 -19
flwr/simulation/ray_transport/ray_client_proxy.py +33 -13
flwr/simulation/run_simulation.py +237 -66
flwr/superexec/app.py +14 -7
flwr/superexec/deployment.py +186 -0
flwr/superexec/exec_grpc.py +5 -1
flwr/superexec/exec_servicer.py +4 -1
flwr/superexec/executor.py +18 -0
flwr/superexec/simulation.py +151 -0
{flwr_nightly-1.10.0.dev20240624.dist-info → flwr_nightly-1.10.0.dev20240722.dist-info}/METADATA +3 -2
{flwr_nightly-1.10.0.dev20240624.dist-info → flwr_nightly-1.10.0.dev20240722.dist-info}/RECORD +95 -88
{flwr_nightly-1.10.0.dev20240624.dist-info → flwr_nightly-1.10.0.dev20240722.dist-info}/LICENSE +0 -0
{flwr_nightly-1.10.0.dev20240624.dist-info → flwr_nightly-1.10.0.dev20240722.dist-info}/WHEEL +0 -0
{flwr_nightly-1.10.0.dev20240624.dist-info → flwr_nightly-1.10.0.dev20240722.dist-info}/entry_points.txt +0 -0

flwr/server/typing.py CHANGED Viewed

@@ -20,6 +20,8 @@ from typing import Callable
 from flwr.common import Context
 from .driver import Driver
+from .serverapp_components import ServerAppComponents
 ServerAppCallable = Callable[[Driver, Context], None]
 Workflow = Callable[[Driver, Context], None]
+ServerFn = Callable[[Context], ServerAppComponents]

flwr/server/workflow/secure_aggregation/secaggplus_workflow.py CHANGED Viewed

@@ -81,6 +81,7 @@ class WorkflowState:  # pylint: disable=R0902
     forward_ciphertexts: Dict[int, List[bytes]] = field(default_factory=dict)
     aggregate_ndarrays: NDArrays = field(default_factory=list)
     legacy_results: List[Tuple[ClientProxy, FitRes]] = field(default_factory=list)
+    failures: List[Exception] = field(default_factory=list)
 class SecAggPlusWorkflow:
@@ -394,6 +395,7 @@ class SecAggPlusWorkflow:
         for msg in msgs:
             if msg.has_error():
+                state.failures.append(Exception(msg.error))
                 continue
             key_dict = msg.content.configs_records[RECORD_KEY_CONFIGS]
             node_id = msg.metadata.src_node_id
@@ -451,6 +453,9 @@ class SecAggPlusWorkflow:
             nid: [] for nid in state.active_node_ids
         }  # dest node ID -> list of src node IDs
         for msg in msgs:
+            if msg.has_error():
+                state.failures.append(Exception(msg.error))
+                continue
             node_id = msg.metadata.src_node_id
             res_dict = msg.content.configs_records[RECORD_KEY_CONFIGS]
             dst_lst = cast(List[int], res_dict[Key.DESTINATION_LIST])
@@ -515,6 +520,9 @@ class SecAggPlusWorkflow:
         # Sum collected masked vectors and compute active/dead node IDs
         masked_vector = None
         for msg in msgs:
+            if msg.has_error():
+                state.failures.append(Exception(msg.error))
+                continue
             res_dict = msg.content.configs_records[RECORD_KEY_CONFIGS]
             bytes_list = cast(List[bytes], res_dict[Key.MASKED_PARAMETERS])
             client_masked_vec = [bytes_to_ndarray(b) for b in bytes_list]
@@ -528,6 +536,9 @@ class SecAggPlusWorkflow:
         # Backward compatibility with Strategy
         for msg in msgs:
+            if msg.has_error():
+                state.failures.append(Exception(msg.error))
+                continue
             fitres = compat.recordset_to_fitres(msg.content, True)
             proxy = state.nid_to_proxies[msg.metadata.src_node_id]
             state.legacy_results.append((proxy, fitres))
@@ -584,6 +595,9 @@ class SecAggPlusWorkflow:
         for nid in state.sampled_node_ids:
             collected_shares_dict[nid] = []
         for msg in msgs:
+            if msg.has_error():
+                state.failures.append(Exception(msg.error))
+                continue
             res_dict = msg.content.configs_records[RECORD_KEY_CONFIGS]
             nids = cast(List[int], res_dict[Key.NODE_ID_LIST])
             shares = cast(List[bytes], res_dict[Key.SHARE_LIST])
@@ -652,9 +666,11 @@ class SecAggPlusWorkflow:
             INFO,
             "aggregate_fit: received %s results and %s failures",
             len(results),
-            0,
+            len(state.failures),
+        )
+        aggregated_result = context.strategy.aggregate_fit(
+            current_round, results, state.failures  # type: ignore
         )
-        aggregated_result = context.strategy.aggregate_fit(current_round, results, [])
         parameters_aggregated, metrics_aggregated = aggregated_result
         # Update the parameters and write history

flwr/simulation/app.py CHANGED Viewed

@@ -27,14 +27,16 @@ from typing import Any, Dict, List, Optional, Type, Union
 import ray
 from ray.util.scheduling_strategies import NodeAffinitySchedulingStrategy
-from flwr.client import ClientFn
+from flwr.client import ClientFnExt
 from flwr.common import EventType, event
-from flwr.common.logger import log, set_logger_propagation
+from flwr.common.constant import NODE_ID_NUM_BYTES
+from flwr.common.logger import log, set_logger_propagation, warn_unsupported_feature
 from flwr.server.client_manager import ClientManager
 from flwr.server.history import History
 from flwr.server.server import Server, init_defaults, run_fl
 from flwr.server.server_config import ServerConfig
 from flwr.server.strategy import Strategy
+from flwr.server.superlink.state.utils import generate_rand_int_from_bytes
 from flwr.simulation.ray_transport.ray_actor import (
     ClientAppActor,
     VirtualClientEngineActor,
@@ -51,7 +53,7 @@ Invalid Arguments in method:
 `start_simulation(
     *,
     client_fn: ClientFn,
-    num_clients: Optional[int] = None,
+    num_clients: int,
     clients_ids: Optional[List[str]] = None,
     client_resources: Optional[Dict[str, float]] = None,
     server: Optional[Server] = None,
@@ -70,13 +72,29 @@ REASON:
 """
+NodeToPartitionMapping = Dict[int, int]
+def _create_node_id_to_partition_mapping(
+    num_clients: int,
+) -> NodeToPartitionMapping:
+    """Generate a node_id:partition_id mapping."""
+    nodes_mapping: NodeToPartitionMapping = {}  # {node-id; partition-id}
+    for i in range(num_clients):
+        while True:
+            node_id = generate_rand_int_from_bytes(NODE_ID_NUM_BYTES)
+            if node_id not in nodes_mapping:
+                break
+        nodes_mapping[node_id] = i
+    return nodes_mapping
 # pylint: disable=too-many-arguments,too-many-statements,too-many-branches
 def start_simulation(
     *,
-    client_fn: ClientFn,
-    num_clients: Optional[int] = None,
-    clients_ids: Optional[List[str]] = None,
+    client_fn: ClientFnExt,
+    num_clients: int,
+    clients_ids: Optional[List[str]] = None,  # UNSUPPORTED, WILL BE REMOVED
     client_resources: Optional[Dict[str, float]] = None,
     server: Optional[Server] = None,
     config: Optional[ServerConfig] = None,
@@ -92,23 +110,24 @@ def start_simulation(
     Parameters
     ----------
-    client_fn : ClientFn
-        A function creating client instances. The function must take a single
-        `str` argument called `cid`. It should return a single client instance
-        of type Client. Note that the created client instances are ephemeral
-        and will often be destroyed after a single method invocation. Since client
-        instances are not long-lived, they should not attempt to carry state over
-        method invocations. Any state required by the instance (model, dataset,
-        hyperparameters, ...) should be (re-)created in either the call to `client_fn`
-        or the call to any of the client methods (e.g., load evaluation data in the
-        `evaluate` method itself).
-    num_clients : Optional[int]
-        The total number of clients in this simulation. This must be set if
-        `clients_ids` is not set and vice-versa.
+    client_fn : ClientFnExt
+        A function creating `Client` instances. The function must have the signature
+        `client_fn(context: Context). It should return
+        a single client instance of type `Client`. Note that the created client
+        instances are ephemeral and will often be destroyed after a single method
+        invocation. Since client instances are not long-lived, they should not attempt
+        to carry state over method invocations. Any state required by the instance
+        (model, dataset, hyperparameters, ...) should be (re-)created in either the
+        call to `client_fn` or the call to any of the client methods (e.g., load
+        evaluation data in the `evaluate` method itself).
+    num_clients : int
+        The total number of clients in this simulation.
     clients_ids : Optional[List[str]]
+        UNSUPPORTED, WILL BE REMOVED. USE `num_clients` INSTEAD.
         List `client_id`s for each client. This is only required if
         `num_clients` is not set. Setting both `num_clients` and `clients_ids`
         with `len(clients_ids)` not equal to `num_clients` generates an error.
+        Using this argument will raise an error.
     client_resources : Optional[Dict[str, float]] (default: `{"num_cpus": 1, "num_gpus": 0.0}`)
         CPU and GPU resources for a single client. Supported keys
         are `num_cpus` and `num_gpus`. To understand the GPU utilization caused by
@@ -158,7 +177,6 @@ def start_simulation(
         is an advanced feature. For all details, please refer to the Ray documentation:
         https://docs.ray.io/en/latest/ray-core/scheduling/index.html
     Returns
     -------
     hist : flwr.server.history.History
@@ -170,6 +188,14 @@ def start_simulation(
         {"num_clients": len(clients_ids) if clients_ids is not None else num_clients},
     )
+    if clients_ids is not None:
+        warn_unsupported_feature(
+            "Passing `clients_ids` to `start_simulation` is deprecated and not longer "
+            "used by `start_simulation`. Use `num_clients` exclusively instead."
+        )
+        log(ERROR, "`clients_ids` argument used.")
+        sys.exit()
     # Set logger propagation
     loop: Optional[asyncio.AbstractEventLoop] = None
     try:
@@ -196,20 +222,8 @@ def start_simulation(
         initialized_config,
     )
-    # clients_ids takes precedence
-    cids: List[str]
-    if clients_ids is not None:
-        if (num_clients is not None) and (len(clients_ids) != num_clients):
-            log(ERROR, INVALID_ARGUMENTS_START_SIMULATION)
-            sys.exit()
-        else:
-            cids = clients_ids
-    else:
-        if num_clients is None:
-            log(ERROR, INVALID_ARGUMENTS_START_SIMULATION)
-            sys.exit()
-        else:
-            cids = [str(x) for x in range(num_clients)]
+    # Create node-id to partition-id mapping
+    nodes_mapping = _create_node_id_to_partition_mapping(num_clients)
     # Default arguments for Ray initialization
     if not ray_init_args:
@@ -308,10 +322,12 @@ def start_simulation(
     )
     # Register one RayClientProxy object for each client with the ClientManager
-    for cid in cids:
+    for node_id, partition_id in nodes_mapping.items():
         client_proxy = RayActorClientProxy(
             client_fn=client_fn,
-            cid=cid,
+            node_id=node_id,
+            partition_id=partition_id,
+            num_partitions=num_clients,
             actor_pool=pool,
         )
         initialized_server.client_manager().register(client=client_proxy)

flwr/simulation/ray_transport/ray_actor.py CHANGED Viewed

@@ -14,7 +14,6 @@
 # ==============================================================================
 """Ray-based Flower Actor and ActorPool implementation."""
-import asyncio
 import threading
 from abc import ABC
 from logging import DEBUG, ERROR, WARNING
@@ -411,9 +410,7 @@ class BasicActorPool:
         self.client_resources = client_resources
         # Queue of idle actors
-        self.pool: "asyncio.Queue[Type[VirtualClientEngineActor]]" = asyncio.Queue(
-            maxsize=1024
-        )
+        self.pool: List[VirtualClientEngineActor] = []
         self.num_actors = 0
         # Resolve arguments to pass during actor init
@@ -427,38 +424,37 @@ class BasicActorPool:
         # Figure out how many actors can be created given the cluster resources
         # and the resources the user indicates each VirtualClient will need
         self.actors_capacity = pool_size_from_resources(client_resources)
-        self._future_to_actor: Dict[Any, Type[VirtualClientEngineActor]] = {}
+        self._future_to_actor: Dict[Any, VirtualClientEngineActor] = {}
     def is_actor_available(self) -> bool:
         """Return true if there is an idle actor."""
-        return self.pool.qsize() > 0
+        return len(self.pool) > 0
-    async def add_actors_to_pool(self, num_actors: int) -> None:
+    def add_actors_to_pool(self, num_actors: int) -> None:
         """Add actors to the pool.
         This method may be executed also if new resources are added to your Ray cluster
         (e.g. you add a new node).
         """
         for _ in range(num_actors):
-            await self.pool.put(self.create_actor_fn())  # type: ignore
+            self.pool.append(self.create_actor_fn())  # type: ignore
         self.num_actors += num_actors
-    async def terminate_all_actors(self) -> None:
+    def terminate_all_actors(self) -> None:
         """Terminate actors in pool."""
         num_terminated = 0
-        while self.pool.qsize():
-            actor = await self.pool.get()
+        for actor in self.pool:
             actor.terminate.remote()  # type: ignore
             num_terminated += 1
         log(DEBUG, "Terminated %i actors", num_terminated)
-    async def submit(
+    def submit(
         self, actor_fn: Any, job: Tuple[ClientAppFn, Message, str, Context]
     ) -> Any:
         """On idle actor, submit job and return future."""
         # Remove idle actor from pool
-        actor = await self.pool.get()
+        actor = self.pool.pop()
         # Submit job to actor
         app_fn, mssg, cid, context = job
         future = actor_fn(actor, app_fn, mssg, cid, context)
@@ -467,18 +463,18 @@ class BasicActorPool:
         self._future_to_actor[future] = actor
         return future
-    async def add_actor_back_to_pool(self, future: Any) -> None:
+    def add_actor_back_to_pool(self, future: Any) -> None:
         """Ad actor assigned to run future back into the pool."""
         actor = self._future_to_actor.pop(future)
-        await self.pool.put(actor)
+        self.pool.append(actor)
-    async def fetch_result_and_return_actor_to_pool(
+    def fetch_result_and_return_actor_to_pool(
         self, future: Any
     ) -> Tuple[Message, Context]:
         """Pull result given a future and add actor back to pool."""
-        # Get actor that ran job
-        await self.add_actor_back_to_pool(future)
         # Retrieve result for object store
         # Instead of doing ray.get(future) we await it
-        _, out_mssg, updated_context = await future
+        _, out_mssg, updated_context = ray.get(future)
+        # Get actor that ran job
+        self.add_actor_back_to_pool(future)
         return out_mssg, updated_context

flwr/simulation/ray_transport/ray_client_proxy.py CHANGED Viewed

@@ -20,11 +20,16 @@ from logging import ERROR
 from typing import Optional
 from flwr import common
-from flwr.client import ClientFn
+from flwr.client import ClientFnExt
 from flwr.client.client_app import ClientApp
 from flwr.client.node_state import NodeState
 from flwr.common import DEFAULT_TTL, Message, Metadata, RecordSet
-from flwr.common.constant import MessageType, MessageTypeLegacy
+from flwr.common.constant import (
+    NUM_PARTITIONS_KEY,
+    PARTITION_ID_KEY,
+    MessageType,
+    MessageTypeLegacy,
+)
 from flwr.common.logger import log
 from flwr.common.recordset_compat import (
     evaluateins_to_recordset,
@@ -43,17 +48,30 @@ from flwr.simulation.ray_transport.ray_actor import VirtualClientEngineActorPool
 class RayActorClientProxy(ClientProxy):
     """Flower client proxy which delegates work using Ray."""
-    def __init__(
-        self, client_fn: ClientFn, cid: str, actor_pool: VirtualClientEngineActorPool
+    def __init__(  # pylint: disable=too-many-arguments
+        self,
+        client_fn: ClientFnExt,
+        node_id: int,
+        partition_id: int,
+        num_partitions: int,
+        actor_pool: VirtualClientEngineActorPool,
     ):
-        super().__init__(cid)
+        super().__init__(cid=str(node_id))
+        self.node_id = node_id
+        self.partition_id = partition_id
         def _load_app() -> ClientApp:
             return ClientApp(client_fn=client_fn)
         self.app_fn = _load_app
         self.actor_pool = actor_pool
-        self.proxy_state = NodeState()
+        self.proxy_state = NodeState(
+            node_id=node_id,
+            node_config={
+                PARTITION_ID_KEY: str(partition_id),
+                NUM_PARTITIONS_KEY: str(num_partitions),
+            },
+        )
     def _submit_job(self, message: Message, timeout: Optional[float]) -> Message:
         """Sumbit a message to the ActorPool."""
@@ -62,16 +80,19 @@ class RayActorClientProxy(ClientProxy):
         # Register state
         self.proxy_state.register_context(run_id=run_id)
-        # Retrieve state
-        state = self.proxy_state.retrieve_context(run_id=run_id)
+        # Retrieve context
+        context = self.proxy_state.retrieve_context(run_id=run_id)
+        partition_id_str = str(context.node_config[PARTITION_ID_KEY])
         try:
             self.actor_pool.submit_client_job(
-                lambda a, a_fn, mssg, cid, state: a.run.remote(a_fn, mssg, cid, state),
-                (self.app_fn, message, self.cid, state),
+                lambda a, a_fn, mssg, partition_id, context: a.run.remote(
+                    a_fn, mssg, partition_id, context
+                ),
+                (self.app_fn, message, partition_id_str, context),
             )
             out_mssg, updated_context = self.actor_pool.get_client_result(
-                self.cid, timeout
+                partition_id_str, timeout
             )
             # Update state
@@ -103,11 +124,10 @@ class RayActorClientProxy(ClientProxy):
                 message_id="",
                 group_id=str(group_id) if group_id is not None else "",
                 src_node_id=0,
-                dst_node_id=int(self.cid),
+                dst_node_id=self.node_id,
                 reply_to_message="",
                 ttl=timeout if timeout else DEFAULT_TTL,
                 message_type=message_type,
-                partition_id=int(self.cid),
             ),
         )

flwr-nightly 1.10.0.dev20240624__py3-none-any.whl → 1.10.0.dev20240722__py3-none-any.whl

Potentially problematic release.

flwr-nightly 1.10.0.dev20240624__py3-none-any.whl → 1.10.0.dev20240722__py3-none-any.whl