PyPI - flwr-nightly - Versions diffs - 1.10.0.dev20240709__py3-none-any.whl → 1.10.0.dev20240711__py3-none-any.whl - Mend

flwr-nightly 1.10.0.dev20240709__py3-none-any.whl → 1.10.0.dev20240711__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of flwr-nightly might be problematic. Click here for more details.

Files changed (22) hide show

flwr/cli/config_utils.py +10 -0
flwr/cli/new/new.py +3 -1
flwr/cli/run/run.py +25 -8
flwr/client/app.py +12 -6
flwr/client/node_state.py +36 -8
flwr/client/node_state_tests.py +3 -2
flwr/client/supernode/app.py +20 -6
flwr/common/logger.py +25 -0
flwr/server/__init__.py +2 -0
flwr/server/server_app.py +56 -10
flwr/server/serverapp_components.py +52 -0
flwr/server/superlink/fleet/vce/backend/backend.py +4 -4
flwr/server/superlink/fleet/vce/backend/raybackend.py +8 -9
flwr/server/superlink/fleet/vce/vce_api.py +93 -98
flwr/server/typing.py +2 -0
flwr/simulation/ray_transport/ray_actor.py +15 -19
flwr/simulation/run_simulation.py +49 -33
{flwr_nightly-1.10.0.dev20240709.dist-info → flwr_nightly-1.10.0.dev20240711.dist-info}/METADATA +2 -2
{flwr_nightly-1.10.0.dev20240709.dist-info → flwr_nightly-1.10.0.dev20240711.dist-info}/RECORD +22 -21
{flwr_nightly-1.10.0.dev20240709.dist-info → flwr_nightly-1.10.0.dev20240711.dist-info}/LICENSE +0 -0
{flwr_nightly-1.10.0.dev20240709.dist-info → flwr_nightly-1.10.0.dev20240711.dist-info}/WHEEL +0 -0
{flwr_nightly-1.10.0.dev20240709.dist-info → flwr_nightly-1.10.0.dev20240711.dist-info}/entry_points.txt +0 -0

flwr/server/superlink/fleet/vce/vce_api.py CHANGED Viewed

@@ -14,14 +14,18 @@
 # ==============================================================================
 """Fleet Simulation Engine API."""
-import asyncio
 import json
 import sys
+import threading
 import time
 import traceback
+from concurrent.futures import ThreadPoolExecutor
 from logging import DEBUG, ERROR, INFO, WARN
 from pathlib import Path
-from typing import Callable, Dict, List, Optional
+from queue import Empty, Queue
+from time import sleep
+from typing import Callable, Dict, Optional
 from flwr.client.client_app import ClientApp, ClientAppException, LoadClientAppError
 from flwr.client.node_state import NodeState
@@ -30,8 +34,8 @@ from flwr.common.logger import log
 from flwr.common.message import Error
 from flwr.common.object_ref import load_app
 from flwr.common.serde import message_from_taskins, message_to_taskres
-from flwr.proto.task_pb2 import TaskIns  # pylint: disable=E0611
-from flwr.server.superlink.state import StateFactory
+from flwr.proto.task_pb2 import TaskIns, TaskRes  # pylint: disable=E0611
+from flwr.server.superlink.state import State, StateFactory
 from .backend import Backend, error_messages_backends, supported_backends
@@ -52,19 +56,21 @@ def _register_nodes(
 # pylint: disable=too-many-arguments,too-many-locals
-async def worker(
+def worker(
     app_fn: Callable[[], ClientApp],
-    queue: "asyncio.Queue[TaskIns]",
+    taskins_queue: "Queue[TaskIns]",
+    taskres_queue: "Queue[TaskRes]",
     node_states: Dict[int, NodeState],
-    state_factory: StateFactory,
     backend: Backend,
+    f_stop: threading.Event,
 ) -> None:
     """Get TaskIns from queue and pass it to an actor in the pool to execute it."""
-    state = state_factory.state()
-    while True:
+    while not f_stop.is_set():
         out_mssg = None
         try:
-            task_ins: TaskIns = await queue.get()
+            # Fetch from queue with timeout. We use a timeout so
+            # the stopping event can be evaluated even when the queue is empty.
+            task_ins: TaskIns = taskins_queue.get(timeout=1.0)
             node_id = task_ins.task.consumer.node_id
             # Register and retrieve runstate
@@ -75,7 +81,7 @@ async def worker(
             message = message_from_taskins(task_ins)
             # Let backend process message
-            out_mssg, updated_context = await backend.process_message(
+            out_mssg, updated_context = backend.process_message(
                 app_fn, message, context
             )
@@ -83,11 +89,9 @@ async def worker(
             node_states[node_id].update_context(
                 task_ins.run_id, context=updated_context
             )
-        except asyncio.CancelledError as e:
-            log(DEBUG, "Terminating async worker: %s", e)
-            break
+        except Empty:
+            # An exception raised if queue.get times out
+            pass
         # Exceptions aren't raised but reported as an error message
         except Exception as ex:  # pylint: disable=broad-exception-caught
             log(ERROR, ex)
@@ -111,67 +115,48 @@ async def worker(
                 task_res = message_to_taskres(out_mssg)
                 # Store TaskRes in state
                 task_res.task.pushed_at = time.time()
-                state.store_task_res(task_res)
+                taskres_queue.put(task_res)
-async def add_taskins_to_queue(
-    queue: "asyncio.Queue[TaskIns]",
-    state_factory: StateFactory,
+def add_taskins_to_queue(
+    state: State,
+    queue: "Queue[TaskIns]",
     nodes_mapping: NodeToPartitionMapping,
-    backend: Backend,
-    consumers: List["asyncio.Task[None]"],
-    f_stop: asyncio.Event,
+    f_stop: threading.Event,
 ) -> None:
-    """Retrieve TaskIns and add it to the queue."""
-    state = state_factory.state()
-    num_initial_consumers = len(consumers)
+    """Put TaskIns in a queue from State."""
     while not f_stop.is_set():
         for node_id in nodes_mapping.keys():
-            task_ins = state.get_task_ins(node_id=node_id, limit=1)
-            if task_ins:
-                await queue.put(task_ins[0])
-        # Count consumers that are running
-        num_active = sum(not (cc.done()) for cc in consumers)
-        # Alert if number of consumers decreased by half
-        if num_active < num_initial_consumers // 2:
-            log(
-                WARN,
-                "Number of active workers has more than halved: (%i/%i active)",
-                num_active,
-                num_initial_consumers,
-            )
+            task_ins_list = state.get_task_ins(node_id=node_id, limit=1)
+            for task_ins in task_ins_list:
+                queue.put(task_ins)
+        sleep(0.1)
-        # Break if consumers died
-        if num_active == 0:
-            raise RuntimeError("All workers have died. Ending Simulation.")
-        # Log some stats
-        log(
-            DEBUG,
-            "Simulation Engine stats: "
-            "Active workers: (%i/%i) | %s (%i workers) | Tasks in queue: %i)",
-            num_active,
-            num_initial_consumers,
-            backend.__class__.__name__,
-            backend.num_workers,
-            queue.qsize(),
-        )
-        await asyncio.sleep(1.0)
-    log(DEBUG, "Async producer: Stopped pulling from StateFactory.")
+def put_taskres_into_state(
+    state: State, queue: "Queue[TaskRes]", f_stop: threading.Event
+) -> None:
+    """Put TaskRes into State from a queue."""
+    while not f_stop.is_set():
+        try:
+            taskres = queue.get(timeout=1.0)
+            state.store_task_res(taskres)
+        except Empty:
+            # queue is empty when timeout was triggered
+            pass
-async def run(
+def run(
     app_fn: Callable[[], ClientApp],
     backend_fn: Callable[[], Backend],
     nodes_mapping: NodeToPartitionMapping,
     state_factory: StateFactory,
     node_states: Dict[int, NodeState],
-    f_stop: asyncio.Event,
+    f_stop: threading.Event,
 ) -> None:
-    """Run the VCE async."""
-    queue: "asyncio.Queue[TaskIns]" = asyncio.Queue(128)
+    """Run the VCE."""
+    taskins_queue: "Queue[TaskIns]" = Queue()
+    taskres_queue: "Queue[TaskRes]" = Queue()
     try:
@@ -179,27 +164,48 @@ async def run(
         backend = backend_fn()
         # Build backend
-        await backend.build()
+        backend.build()
         # Add workers (they submit Messages to Backend)
-        worker_tasks = [
-            asyncio.create_task(
-                worker(app_fn, queue, node_states, state_factory, backend)
-            )
-            for _ in range(backend.num_workers)
-        ]
-        # Create producer (adds TaskIns into Queue)
-        producer = asyncio.create_task(
-            add_taskins_to_queue(
-                queue, state_factory, nodes_mapping, backend, worker_tasks, f_stop
-            )
+        state = state_factory.state()
+        extractor_th = threading.Thread(
+            target=add_taskins_to_queue,
+            args=(
+                state,
+                taskins_queue,
+                nodes_mapping,
+                f_stop,
+            ),
         )
+        extractor_th.start()
-        # Wait for producer to finish
-        # The producer runs forever until f_stop is set or until
-        # all worker (consumer) coroutines are completed. Workers
-        # also run forever and only end if an exception is raised.
-        await asyncio.gather(producer)
+        injector_th = threading.Thread(
+            target=put_taskres_into_state,
+            args=(
+                state,
+                taskres_queue,
+                f_stop,
+            ),
+        )
+        injector_th.start()
+        with ThreadPoolExecutor() as executor:
+            _ = [
+                executor.submit(
+                    worker,
+                    app_fn,
+                    taskins_queue,
+                    taskres_queue,
+                    node_states,
+                    backend,
+                    f_stop,
+                )
+                for _ in range(backend.num_workers)
+            ]
+        extractor_th.join()
+        injector_th.join()
     except Exception as ex:
@@ -214,18 +220,9 @@ async def run(
         raise RuntimeError("Simulation Engine crashed.") from ex
     finally:
-        # Produced task terminated, now cancel worker tasks
-        for w_t in worker_tasks:
-            _ = w_t.cancel()
-        while not all(w_t.done() for w_t in worker_tasks):
-            log(DEBUG, "Terminating async workers...")
-            await asyncio.sleep(0.5)
-        await asyncio.gather(*[w_t for w_t in worker_tasks if not w_t.done()])
         # Terminate backend
-        await backend.terminate()
+        backend.terminate()
 # pylint: disable=too-many-arguments,unused-argument,too-many-locals,too-many-branches
@@ -234,7 +231,7 @@ def start_vce(
     backend_name: str,
     backend_config_json_stream: str,
     app_dir: str,
-    f_stop: asyncio.Event,
+    f_stop: threading.Event,
     client_app: Optional[ClientApp] = None,
     client_app_attr: Optional[str] = None,
     num_supernodes: Optional[int] = None,
@@ -338,15 +335,13 @@ def start_vce(
         _ = app_fn()
         # Run main simulation loop
-        asyncio.run(
-            run(
-                app_fn,
-                backend_fn,
-                nodes_mapping,
-                state_factory,
-                node_states,
-                f_stop,
-            )
+        run(
+            app_fn,
+            backend_fn,
+            nodes_mapping,
+            state_factory,
+            node_states,
+            f_stop,
         )
     except LoadClientAppError as loadapp_ex:
         f_stop_delay = 10

flwr/server/typing.py CHANGED Viewed

@@ -20,6 +20,8 @@ from typing import Callable
 from flwr.common import Context
 from .driver import Driver
+from .serverapp_components import ServerAppComponents
 ServerAppCallable = Callable[[Driver, Context], None]
 Workflow = Callable[[Driver, Context], None]
+ServerFn = Callable[[Context], ServerAppComponents]

flwr/simulation/ray_transport/ray_actor.py CHANGED Viewed

@@ -14,7 +14,6 @@
 # ==============================================================================
 """Ray-based Flower Actor and ActorPool implementation."""
-import asyncio
 import threading
 from abc import ABC
 from logging import DEBUG, ERROR, WARNING
@@ -411,9 +410,7 @@ class BasicActorPool:
         self.client_resources = client_resources
         # Queue of idle actors
-        self.pool: "asyncio.Queue[Type[VirtualClientEngineActor]]" = asyncio.Queue(
-            maxsize=1024
-        )
+        self.pool: List[VirtualClientEngineActor] = []
         self.num_actors = 0
         # Resolve arguments to pass during actor init
@@ -427,38 +424,37 @@ class BasicActorPool:
         # Figure out how many actors can be created given the cluster resources
         # and the resources the user indicates each VirtualClient will need
         self.actors_capacity = pool_size_from_resources(client_resources)
-        self._future_to_actor: Dict[Any, Type[VirtualClientEngineActor]] = {}
+        self._future_to_actor: Dict[Any, VirtualClientEngineActor] = {}
     def is_actor_available(self) -> bool:
         """Return true if there is an idle actor."""
-        return self.pool.qsize() > 0
+        return len(self.pool) > 0
-    async def add_actors_to_pool(self, num_actors: int) -> None:
+    def add_actors_to_pool(self, num_actors: int) -> None:
         """Add actors to the pool.
         This method may be executed also if new resources are added to your Ray cluster
         (e.g. you add a new node).
         """
         for _ in range(num_actors):
-            await self.pool.put(self.create_actor_fn())  # type: ignore
+            self.pool.append(self.create_actor_fn())  # type: ignore
         self.num_actors += num_actors
-    async def terminate_all_actors(self) -> None:
+    def terminate_all_actors(self) -> None:
         """Terminate actors in pool."""
         num_terminated = 0
-        while self.pool.qsize():
-            actor = await self.pool.get()
+        for actor in self.pool:
             actor.terminate.remote()  # type: ignore
             num_terminated += 1
         log(DEBUG, "Terminated %i actors", num_terminated)
-    async def submit(
+    def submit(
         self, actor_fn: Any, job: Tuple[ClientAppFn, Message, str, Context]
     ) -> Any:
         """On idle actor, submit job and return future."""
         # Remove idle actor from pool
-        actor = await self.pool.get()
+        actor = self.pool.pop()
         # Submit job to actor
         app_fn, mssg, cid, context = job
         future = actor_fn(actor, app_fn, mssg, cid, context)
@@ -467,18 +463,18 @@ class BasicActorPool:
         self._future_to_actor[future] = actor
         return future
-    async def add_actor_back_to_pool(self, future: Any) -> None:
+    def add_actor_back_to_pool(self, future: Any) -> None:
         """Ad actor assigned to run future back into the pool."""
         actor = self._future_to_actor.pop(future)
-        await self.pool.put(actor)
+        self.pool.append(actor)
-    async def fetch_result_and_return_actor_to_pool(
+    def fetch_result_and_return_actor_to_pool(
         self, future: Any
     ) -> Tuple[Message, Context]:
         """Pull result given a future and add actor back to pool."""
-        # Get actor that ran job
-        await self.add_actor_back_to_pool(future)
         # Retrieve result for object store
         # Instead of doing ray.get(future) we await it
-        _, out_mssg, updated_context = await future
+        _, out_mssg, updated_context = ray.get(future)
+        # Get actor that ran job
+        self.add_actor_back_to_pool(future)
         return out_mssg, updated_context

flwr/simulation/run_simulation.py CHANGED Viewed

@@ -22,7 +22,7 @@ import threading
 import traceback
 from logging import DEBUG, ERROR, INFO, WARNING
 from time import sleep
-from typing import Optional
+from typing import Dict, Optional
 from flwr.client import ClientApp
 from flwr.common import EventType, event, log
@@ -126,16 +126,25 @@ def run_simulation(
 def run_serverapp_th(
     server_app_attr: Optional[str],
     server_app: Optional[ServerApp],
+    server_app_run_config: Dict[str, str],
     driver: Driver,
     app_dir: str,
-    f_stop: asyncio.Event,
+    f_stop: threading.Event,
+    has_exception: threading.Event,
     enable_tf_gpu_growth: bool,
     delay_launch: int = 3,
 ) -> threading.Thread:
     """Run SeverApp in a thread."""
-    def server_th_with_start_checks(  # type: ignore
-        tf_gpu_growth: bool, stop_event: asyncio.Event, **kwargs
+    def server_th_with_start_checks(
+        tf_gpu_growth: bool,
+        stop_event: threading.Event,
+        exception_event: threading.Event,
+        _driver: Driver,
+        _server_app_dir: str,
+        _server_app_run_config: Dict[str, str],
+        _server_app_attr: Optional[str],
+        _server_app: Optional[ServerApp],
     ) -> None:
         """Run SeverApp, after check if GPU memory growth has to be set.
@@ -147,10 +156,18 @@ def run_serverapp_th(
                 enable_gpu_growth()
             # Run ServerApp
-            run(**kwargs)
+            run(
+                driver=_driver,
+                server_app_dir=_server_app_dir,
+                server_app_run_config=_server_app_run_config,
+                server_app_attr=_server_app_attr,
+                loaded_server_app=_server_app,
+            )
         except Exception as ex:  # pylint: disable=broad-exception-caught
             log(ERROR, "ServerApp thread raised an exception: %s", ex)
             log(ERROR, traceback.format_exc())
+            exception_event.set()
+            raise
         finally:
             log(DEBUG, "ServerApp finished running.")
             # Upon completion, trigger stop event if one was passed
@@ -160,13 +177,16 @@ def run_serverapp_th(
     serverapp_th = threading.Thread(
         target=server_th_with_start_checks,
-        args=(enable_tf_gpu_growth, f_stop),
-        kwargs={
-            "server_app_attr": server_app_attr,
-            "loaded_server_app": server_app,
-            "driver": driver,
-            "server_app_dir": app_dir,
-        },
+        args=(
+            enable_tf_gpu_growth,
+            f_stop,
+            has_exception,
+            driver,
+            app_dir,
+            server_app_run_config,
+            server_app_attr,
+            server_app,
+        ),
     )
     sleep(delay_launch)
     serverapp_th.start()
@@ -196,20 +216,18 @@ def _main_loop(
     server_app: Optional[ServerApp] = None,
     server_app_attr: Optional[str] = None,
 ) -> None:
-    """Launch SuperLink with Simulation Engine, then ServerApp on a separate thread.
-    Everything runs on the main thread or a separate one, depending on whether the main
-    thread already contains a running Asyncio event loop. This is the case if running
-    the Simulation Engine on a Jupyter/Colab notebook.
-    """
+    """Launch SuperLink with Simulation Engine, then ServerApp on a separate thread."""
     # Initialize StateFactory
     state_factory = StateFactory(":flwr-in-memory-state:")
-    f_stop = asyncio.Event()
+    f_stop = threading.Event()
+    # A Threading event to indicate if an exception was raised in the ServerApp thread
+    server_app_thread_has_exception = threading.Event()
     serverapp_th = None
     try:
         # Create run (with empty fab_id and fab_version)
         run_id_ = state_factory.state().create_run("", "", {})
+        server_app_run_config: Dict[str, str] = {}
         if run_id:
             _override_run_id(state_factory, run_id_to_replace=run_id_, run_id=run_id)
@@ -222,9 +240,11 @@ def _main_loop(
         serverapp_th = run_serverapp_th(
             server_app_attr=server_app_attr,
             server_app=server_app,
+            server_app_run_config=server_app_run_config,
             driver=driver,
             app_dir=app_dir,
             f_stop=f_stop,
+            has_exception=server_app_thread_has_exception,
             enable_tf_gpu_growth=enable_tf_gpu_growth,
         )
@@ -253,6 +273,8 @@ def _main_loop(
         event(EventType.RUN_SUPERLINK_LEAVE)
         if serverapp_th:
             serverapp_th.join()
+            if server_app_thread_has_exception.is_set():
+                raise RuntimeError("Exception in ServerApp thread")
     log(DEBUG, "Stopping Simulation Engine now.")
@@ -349,7 +371,6 @@ def _run_simulation(
     # Convert config to original JSON-stream format
     backend_config_stream = json.dumps(backend_config)
-    simulation_engine_th = None
     args = (
         num_supernodes,
         backend_name,
@@ -363,31 +384,26 @@ def _run_simulation(
         server_app_attr,
     )
     # Detect if there is an Asyncio event loop already running.
-    # If yes, run everything on a separate thread. In environments
-    # like Jupyter/Colab notebooks, there is an event loop present.
-    run_in_thread = False
+    # If yes, disable logger propagation. In environmnets
+    # like Jupyter/Colab notebooks, it's often better to do this.
+    asyncio_loop_running = False
     try:
         _ = (
             asyncio.get_running_loop()
         )  # Raises RuntimeError if no event loop is present
         log(DEBUG, "Asyncio event loop already running.")
-        run_in_thread = True
+        asyncio_loop_running = True
     except RuntimeError:
-        log(DEBUG, "No asyncio event loop running")
+        pass
     finally:
-        if run_in_thread:
+        if asyncio_loop_running:
             # Set logger propagation to False to prevent duplicated log output in Colab.
             logger = set_logger_propagation(logger, False)
-            log(DEBUG, "Starting Simulation Engine on a new thread.")
-            simulation_engine_th = threading.Thread(target=_main_loop, args=args)
-            simulation_engine_th.start()
-            simulation_engine_th.join()
-        else:
-            log(DEBUG, "Starting Simulation Engine on the main thread.")
-            _main_loop(*args)
+        _main_loop(*args)
 def _parse_args_run_simulation() -> argparse.ArgumentParser:

{flwr_nightly-1.10.0.dev20240709.dist-info → flwr_nightly-1.10.0.dev20240711.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: flwr-nightly
-Version: 1.10.0.dev20240709
+Version: 1.10.0.dev20240711
 Summary: Flower: A Friendly Federated Learning Framework
 Home-page: https://flower.ai
 License: Apache-2.0
@@ -33,7 +33,7 @@ Classifier: Typing :: Typed
 Provides-Extra: rest
 Provides-Extra: simulation
 Requires-Dist: cryptography (>=42.0.4,<43.0.0)
-Requires-Dist: grpcio (>=1.60.0,<2.0.0)
+Requires-Dist: grpcio (>=1.60.0,<2.0.0,!=1.64.2,!=1.65.0)
 Requires-Dist: iterators (>=0.0.2,<0.0.3)
 Requires-Dist: numpy (>=1.21.0,<2.0.0)
 Requires-Dist: pathspec (>=0.12.1,<0.13.0)

flwr-nightly 1.10.0.dev20240709__py3-none-any.whl → 1.10.0.dev20240711__py3-none-any.whl

Potentially problematic release.

flwr-nightly 1.10.0.dev20240709__py3-none-any.whl → 1.10.0.dev20240711__py3-none-any.whl