PyPI - blazefl - Versions diffs - 2.0.0.dev3__tar.gz → 2.0.0.dev4__tar.gz - Mend

blazefl 2.0.0.dev3__tar.gz → 2.0.0.dev4__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (116) hide show

{blazefl-2.0.0.dev3 → blazefl-2.0.0.dev4}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: blazefl
-Version: 2.0.0.dev3
+Version: 2.0.0.dev4
 Summary: A blazing-fast and lightweight simulation framework for Federated Learning.
 Author-email: kitsuyaazuma <kitsuyaazuma@gmail.com>
 License-File: LICENSE

{blazefl-2.0.0.dev3 → blazefl-2.0.0.dev4}/examples/quickstart-fedavg/config/config.yaml RENAMED Viewed

@@ -14,4 +14,5 @@ dataset_root_dir: /tmp/quickstart-fedavg/dataset
 dataset_split_dir: /tmp/quickstart-fedavg/split
 share_dir: /tmp/quickstart-fedavg/share
 state_dir: /tmp/quickstart-fedavg/state
-serial: false
+parallel: true
+ipc_mode: storage

{blazefl-2.0.0.dev3 → blazefl-2.0.0.dev4}/examples/quickstart-fedavg/main.py RENAMED Viewed

@@ -98,31 +98,32 @@ def main(cfg: DictConfig):
         batch_size=cfg.batch_size,
     )
     trainer: FedAvgBaseClientTrainer | FedAvgProcessPoolClientTrainer | None = None
-    if cfg.serial:
-        trainer = FedAvgBaseClientTrainer(
+    if cfg.parallel:
+        trainer = FedAvgProcessPoolClientTrainer(
             model_selector=model_selector,
             model_name=cfg.model_name,
             dataset=dataset,
+            share_dir=share_dir,
+            state_dir=state_dir,
+            seed=cfg.seed,
             device=device,
             num_clients=cfg.num_clients,
             epochs=cfg.epochs,
             lr=cfg.lr,
             batch_size=cfg.batch_size,
+            num_parallels=cfg.num_parallels,
+            ipc_mode=cfg.ipc_mode,
         )
     else:
-        trainer = FedAvgProcessPoolClientTrainer(
+        trainer = FedAvgBaseClientTrainer(
             model_selector=model_selector,
             model_name=cfg.model_name,
             dataset=dataset,
-            share_dir=share_dir,
-            state_dir=state_dir,
-            seed=cfg.seed,
             device=device,
             num_clients=cfg.num_clients,
             epochs=cfg.epochs,
             lr=cfg.lr,
             batch_size=cfg.batch_size,
-            num_parallels=cfg.num_parallels,
         )
     pipeline = FedAvgPipeline(handler=handler, trainer=trainer, writer=writer)
     try:

{blazefl-2.0.0.dev3 → blazefl-2.0.0.dev4}/examples/step-by-step-dsfl/algorithm/dsfl.py RENAMED Viewed

@@ -233,7 +233,7 @@ class DSFLBaseServerHandler(BaseServerHandler[DSFLUplinkPackage, DSFLDownlinkPac
 @dataclass
-class DSFLDiskSharedData:
+class DSFLClientConfig:
     model_selector: DSFLModelSelector
     model_name: str
     dataset: DSFLPartitionedDataset
@@ -245,7 +245,6 @@ class DSFLDiskSharedData:
     kd_lr: float
     cid: int
     seed: int
-    payload: DSFLDownlinkPackage
     state_path: Path
@@ -258,7 +257,7 @@ class DSFLClientState:
 class DSFLProcessPoolClientTrainer(
-    ProcessPoolClientTrainer[DSFLUplinkPackage, DSFLDownlinkPackage, DSFLDiskSharedData]
+    ProcessPoolClientTrainer[DSFLUplinkPackage, DSFLDownlinkPackage, DSFLClientConfig]
 ):
     def __init__(
         self,
@@ -300,68 +299,76 @@ class DSFLProcessPoolClientTrainer(
         self.device = device
         self.num_clients = num_clients
         self.seed = seed
+        self.ipc_mode = "storage"
         if self.device == "cuda":
             self.device_count = torch.cuda.device_count()
     @staticmethod
-    def process_client(path: Path, device: str) -> Path:
-        data = torch.load(path, weights_only=False)
-        assert isinstance(data, DSFLDiskSharedData)
-        model = data.model_selector.select_model(data.model_name)
-        optimizer = torch.optim.SGD(model.parameters(), lr=data.lr)
+    def worker(
+        config: DSFLClientConfig | Path,
+        payload: DSFLDownlinkPackage | Path,
+        device: str,
+    ) -> Path:
+        assert isinstance(config, Path) and isinstance(payload, Path)
+        config_path, payload_path = config, payload
+        c = torch.load(config_path, weights_only=False)
+        p = torch.load(payload_path, weights_only=False)
+        assert isinstance(c, DSFLClientConfig) and isinstance(p, DSFLDownlinkPackage)
+        model = c.model_selector.select_model(c.model_name)
+        optimizer = torch.optim.SGD(model.parameters(), lr=c.lr)
         kd_optimizer: torch.optim.SGD | None = None
         state: DSFLClientState | None = None
-        if data.state_path.exists():
-            state = torch.load(data.state_path, weights_only=False)
+        if c.state_path.exists():
+            state = torch.load(c.state_path, weights_only=False)
             assert isinstance(state, DSFLClientState)
             RandomState.set_random_state(state.random)
             model.load_state_dict(state.model)
             optimizer.load_state_dict(state.optimizer)
             if state.kd_optimizer is not None:
-                kd_optimizer = torch.optim.SGD(model.parameters(), lr=data.kd_lr)
+                kd_optimizer = torch.optim.SGD(model.parameters(), lr=c.kd_lr)
                 kd_optimizer.load_state_dict(state.kd_optimizer)
         else:
-            seed_everything(data.seed, device=device)
+            seed_everything(c.seed, device=device)
         # Distill
-        open_dataset = data.dataset.get_dataset(type_="open", cid=None)
-        if data.payload.indices is not None and data.payload.soft_labels is not None:
-            global_soft_labels = list(torch.unbind(data.payload.soft_labels, dim=0))
-            global_indices = data.payload.indices.tolist()
+        open_dataset = c.dataset.get_dataset(type_="open", cid=None)
+        if p.indices is not None and p.soft_labels is not None:
+            global_soft_labels = list(torch.unbind(p.soft_labels, dim=0))
+            global_indices = p.indices.tolist()
             if kd_optimizer is None:
-                kd_optimizer = torch.optim.SGD(model.parameters(), lr=data.kd_lr)
+                kd_optimizer = torch.optim.SGD(model.parameters(), lr=c.kd_lr)
             DSFLBaseServerHandler.distill(
                 model=model,
                 optimizer=kd_optimizer,
-                dataset=data.dataset,
+                dataset=c.dataset,
                 global_soft_labels=global_soft_labels,
                 global_indices=global_indices,
-                kd_epochs=data.kd_epochs,
-                kd_batch_size=data.kd_batch_size,
+                kd_epochs=c.kd_epochs,
+                kd_batch_size=c.kd_batch_size,
                 device=device,
             )
         # Train
-        train_loader = data.dataset.get_dataloader(
+        train_loader = c.dataset.get_dataloader(
             type_="train",
-            cid=data.cid,
-            batch_size=data.batch_size,
+            cid=c.cid,
+            batch_size=c.batch_size,
         )
         DSFLProcessPoolClientTrainer.train(
             model=model,
             optimizer=optimizer,
             train_loader=train_loader,
             device=device,
-            epochs=data.epochs,
+            epochs=c.epochs,
         )
         # Predict
         open_loader = DataLoader(
-            Subset(open_dataset, data.payload.next_indices.tolist()),
-            batch_size=data.batch_size,
+            Subset(open_dataset, p.next_indices.tolist()),
+            batch_size=c.batch_size,
         )
         soft_labels = DSFLProcessPoolClientTrainer.predict(
             model=model,
@@ -370,10 +377,10 @@ class DSFLProcessPoolClientTrainer(
         )
         # Evaluate
-        test_loader = data.dataset.get_dataloader(
+        test_loader = c.dataset.get_dataloader(
             type_="test",
-            cid=data.cid,
-            batch_size=data.batch_size,
+            cid=c.cid,
+            batch_size=c.batch_size,
         )
         loss, acc = DSFLBaseServerHandler.evaulate(
             model=model,
@@ -383,19 +390,19 @@ class DSFLProcessPoolClientTrainer(
         package = DSFLUplinkPackage(
             soft_labels=soft_labels,
-            indices=data.payload.next_indices,
+            indices=p.next_indices,
             metadata={"loss": loss, "acc": acc},
         )
-        torch.save(package, path)
+        torch.save(package, config_path)
         state = DSFLClientState(
             random=RandomState.get_random_state(device=device),
             model=model.state_dict(),
             optimizer=optimizer.state_dict(),
             kd_optimizer=kd_optimizer.state_dict() if kd_optimizer else None,
         )
-        torch.save(state, data.state_path)
-        return path
+        torch.save(state, c.state_path)
+        return config_path
     @staticmethod
     def train(
@@ -442,10 +449,8 @@ class DSFLProcessPoolClientTrainer(
         soft_labels = torch.cat(soft_labels_list, dim=0)
         return soft_labels.cpu()
-    def get_shared_data(
-        self, cid: int, payload: DSFLDownlinkPackage
-    ) -> DSFLDiskSharedData:
-        data = DSFLDiskSharedData(
+    def get_client_config(self, cid: int) -> DSFLClientConfig:
+        data = DSFLClientConfig(
             model_selector=self.model_selector,
             model_name=self.model_name,
             dataset=self.dataset,
@@ -457,7 +462,6 @@ class DSFLProcessPoolClientTrainer(
             kd_lr=self.kd_lr,
             cid=cid,
             seed=self.seed,
-            payload=payload,
             state_path=self.state_dir.joinpath(f"{cid}.pt"),
         )
         return data

{blazefl-2.0.0.dev3 → blazefl-2.0.0.dev4}/pyproject.toml RENAMED Viewed

@@ -1,6 +1,6 @@
 [project]
 name = "blazefl"
-version = "2.0.0.dev3"
+version = "2.0.0.dev4"
 description = "A blazing-fast and lightweight simulation framework for Federated Learning."
 readme = "README.md"
 authors = [

{blazefl-2.0.0.dev3 → blazefl-2.0.0.dev4}/src/blazefl/contrib/fedavg.py RENAMED Viewed

@@ -2,6 +2,7 @@ import random
 from copy import deepcopy
 from dataclasses import dataclass
 from pathlib import Path
+from typing import Literal
 import torch
 from torch.utils.data import DataLoader
@@ -431,7 +432,7 @@ class FedAvgBaseClientTrainer(
 @dataclass
-class FedAvgDiskSharedData:
+class FedAvgClientConfig:
     """
     Data structure representing shared data for parallel client training
     in the Federated Averaging (FedAvg) algorithm.
@@ -448,7 +449,6 @@ class FedAvgDiskSharedData:
         lr (float): Learning rate for the optimizer.
         cid (int): Client ID.
         seed (int): Seed for reproducibility.
-        payload (FedAvgDownlinkPackage): Downlink package with global model parameters.
         state_path (Path): Path to save the client's random state.
     """
@@ -460,13 +460,12 @@ class FedAvgDiskSharedData:
     lr: float
     cid: int
     seed: int
-    payload: FedAvgDownlinkPackage
     state_path: Path
 class FedAvgProcessPoolClientTrainer(
     ProcessPoolClientTrainer[
-        FedAvgUplinkPackage, FedAvgDownlinkPackage, FedAvgDiskSharedData
+        FedAvgUplinkPackage, FedAvgDownlinkPackage, FedAvgClientConfig
     ]
 ):
     """
@@ -488,6 +487,8 @@ class FedAvgProcessPoolClientTrainer(
         lr (float): Learning rate for the optimizer.
         seed (int): Seed for reproducibility.
         num_parallels (int): Number of parallel processes for training.
+        ipc_mode (Literal["storage", "shared_memory"]):
+            Inter-process communication mode.
         device_count (int | None): Number of CUDA devices available (if using GPU).
     """
@@ -505,6 +506,7 @@ class FedAvgProcessPoolClientTrainer(
         lr: float,
         seed: int,
         num_parallels: int,
+        ipc_mode: Literal["storage", "shared_memory"],
     ) -> None:
         """
         Initialize the FedAvgParalleClientTrainer.
@@ -542,50 +544,93 @@ class FedAvgProcessPoolClientTrainer(
         self.device = device
         self.num_clients = num_clients
         self.seed = seed
+        self.ipc_mode = ipc_mode
     @staticmethod
-    def process_client(path: Path, device: str) -> Path:
+    def worker(
+        config: FedAvgClientConfig | Path,
+        payload: FedAvgDownlinkPackage | Path,
+        device: str,
+    ) -> FedAvgUplinkPackage | Path:
         """
         Process a single client's local training and evaluation.
-        This method is executed by a parallel process and handles data loading,
-        training, evaluation, and saving results to a shared file.
+        This method is executed by a worker process and handles loading client
+        configuration and payload, performing the client-specific training,
+        and returning the result.
         Args:
-            path (Path): Path to the shared data file containing client-specific
-            information.
-            device (str): Device to use for processing.
+            config (FedAvgClientConfig | Path):
+                The client's configuration data, or a path to a file containing
+                the configuration if `ipc_mode` is "storage".
+            payload (FedAvgDownlinkPackage | Path):
+                The downlink payload from the server, or a path to a file
+                containing the payload if `ipc_mode` is "storage".
+            device (str): Device to use for processing (e.g., "cpu", "cuda:0").
         Returns:
-            Path: Path to the file with the processed results.
-        """
-        data = torch.load(path, weights_only=False)
-        assert isinstance(data, FedAvgDiskSharedData)
-        if data.state_path.exists():
-            state = torch.load(data.state_path, weights_only=False)
-            assert isinstance(state, RandomState)
-            RandomState.set_random_state(state)
+            FedAvgUplinkPackage | Path:
+                The uplink package containing the client's results, or a path to
+                a file containing the package if `ipc_mode` is "storage".
+        """
+        def _storage_worker(
+            config_path: Path,
+            payload_path: Path,
+            device: str,
+        ) -> Path:
+            config = torch.load(config_path, weights_only=False)
+            assert isinstance(config, FedAvgClientConfig)
+            payload = torch.load(payload_path, weights_only=False)
+            assert isinstance(payload, FedAvgDownlinkPackage)
+            package = _shared_memory_worker(
+                config=config,
+                payload=payload,
+                device=device,
+            )
+            torch.save(package, config_path)
+            return config_path
+        def _shared_memory_worker(
+            config: FedAvgClientConfig,
+            payload: FedAvgDownlinkPackage,
+            device: str,
+        ) -> FedAvgUplinkPackage:
+            if config.state_path.exists():
+                state = torch.load(config.state_path, weights_only=False)
+                assert isinstance(state, RandomState)
+                RandomState.set_random_state(state)
+            else:
+                seed_everything(config.seed, device=device)
+            model = config.model_selector.select_model(config.model_name)
+            train_loader = config.dataset.get_dataloader(
+                type_="train",
+                cid=config.cid,
+                batch_size=config.batch_size,
+            )
+            package = FedAvgProcessPoolClientTrainer.train(
+                model=model,
+                model_parameters=payload.model_parameters,
+                train_loader=train_loader,
+                device=device,
+                epochs=config.epochs,
+                lr=config.lr,
+            )
+            torch.save(RandomState.get_random_state(device=device), config.state_path)
+            return package
+        if isinstance(config, Path) and isinstance(payload, Path):
+            return _storage_worker(config, payload, device)
+        elif isinstance(config, FedAvgClientConfig) and isinstance(
+            payload, FedAvgDownlinkPackage
+        ):
+            return _shared_memory_worker(config, payload, device)
         else:
-            seed_everything(data.seed, device=device)
-        model = data.model_selector.select_model(data.model_name)
-        train_loader = data.dataset.get_dataloader(
-            type_="train",
-            cid=data.cid,
-            batch_size=data.batch_size,
-        )
-        package = FedAvgProcessPoolClientTrainer.train(
-            model=model,
-            model_parameters=data.payload.model_parameters,
-            train_loader=train_loader,
-            device=device,
-            epochs=data.epochs,
-            lr=data.lr,
-        )
-        torch.save(package, path)
-        torch.save(RandomState.get_random_state(device=device), data.state_path)
-        return path
+            raise TypeError(
+                "Invalid types for config and payload."
+                " Expected FedAvgClientConfig and FedAvgDownlinkPackage or Path."
+            )
     @staticmethod
     def train(
@@ -636,21 +681,17 @@ class FedAvgProcessPoolClientTrainer(
         return FedAvgUplinkPackage(model_parameters, data_size)
-    def get_shared_data(
-        self, cid: int, payload: FedAvgDownlinkPackage
-    ) -> FedAvgDiskSharedData:
+    def get_client_config(self, cid: int) -> FedAvgClientConfig:
         """
-        Generate the shared data for a specific client.
+        Generate the client configuration for a specific client.
         Args:
             cid (int): Client ID.
-            payload (FedAvgDownlinkPackage): Downlink package with global model
-            parameters.
         Returns:
-            FedAvgDiskSharedData: Shared data structure for the client.
+            FedAvgClientConfig: Client configuration data structure.
         """
-        data = FedAvgDiskSharedData(
+        data = FedAvgClientConfig(
             model_selector=self.model_selector,
             model_name=self.model_name,
             dataset=self.dataset,
@@ -659,7 +700,6 @@ class FedAvgProcessPoolClientTrainer(
             lr=self.lr,
             cid=cid,
             seed=self.seed,
-            payload=payload,
             state_path=self.state_dir.joinpath(f"{cid}.pt"),
         )
         return data

{blazefl-2.0.0.dev3 → blazefl-2.0.0.dev4}/src/blazefl/core/client_trainer.py RENAMED Viewed

@@ -3,11 +3,13 @@ import signal
 from concurrent.futures import ThreadPoolExecutor, as_completed
 from multiprocessing.pool import ApplyResult
 from pathlib import Path
-from typing import Protocol, TypeVar
+from typing import Literal, Protocol, TypeVar
 import torch
 from tqdm import tqdm
+from blazefl.utils import move_tensor_to_shared_memory
 UplinkPackage = TypeVar("UplinkPackage")
 DownlinkPackage = TypeVar("DownlinkPackage", contravariant=True)
@@ -47,12 +49,12 @@ class BaseClientTrainer(Protocol[UplinkPackage, DownlinkPackage]):
         ...
-DiskSharedData = TypeVar("DiskSharedData", covariant=True)
+ClientConfig = TypeVar("ClientConfig")
 class ProcessPoolClientTrainer(
     BaseClientTrainer[UplinkPackage, DownlinkPackage],
-    Protocol[UplinkPackage, DownlinkPackage, DiskSharedData],
+    Protocol[UplinkPackage, DownlinkPackage, ClientConfig],
 ):
     """
     Abstract base class for parallel client training in federated learning.
@@ -63,7 +65,12 @@ class ProcessPoolClientTrainer(
     Attributes:
         num_parallels (int): Number of parallel processes to use for client training.
         share_dir (Path): Directory path for sharing data between processes.
+        device (str): The primary device to use for computation (e.g., "cpu", "cuda").
+        device_count (int): The number of available CUDA devices, if `device` is "cuda".
         cache (list[UplinkPackage]): Cache to store uplink packages from clients.
+        ipc_mode (Literal["storage", "shared_memory"]): Inter-process communication
+            mode. "storage" uses disk for data exchange, "shared_memory" uses
+            shared memory for tensor data. Defaults to "storage".
     Raises:
         NotImplementedError: If the abstract methods are not implemented in a subclass.
@@ -74,17 +81,17 @@ class ProcessPoolClientTrainer(
     device: str
     device_count: int
     cache: list[UplinkPackage]
+    ipc_mode: Literal["storage", "shared_memory"] = "storage"
-    def get_shared_data(self, cid: int, payload: DownlinkPackage) -> DiskSharedData:
+    def get_client_config(self, cid: int) -> ClientConfig:
         """
-        Retrieve shared data for a given client ID and payload.
+        Retrieve the configuration for a given client ID.
         Args:
             cid (int): Client ID.
-            payload (DownlinkPackage): The data package received from the server.
         Returns:
-            DiskSharedData: The shared data associated with the client ID and payload.
+            ClientConfig: The configuration for the specified client.
         """
         ...
@@ -103,16 +110,29 @@ class ProcessPoolClientTrainer(
         return self.device
     @staticmethod
-    def process_client(path: Path, device: str) -> Path:
+    def worker(
+        config: ClientConfig | Path, payload: DownlinkPackage | Path, device: str
+    ) -> UplinkPackage | Path:
         """
-        Process a single client based on the provided path.
+        Process a single client's training task.
+        This method is executed by each worker process in the pool.
+        It handles loading client configuration and payload, performing
+        the client-specific operations, and returning the result.
         Args:
-            path (Path): Path to the client's data file.
-            device (str): Device to use for processing.
+            config (ClientConfig | Path):
+                The client's configuration data, or a path to a file containing
+                the configuration if `ipc_mode` is "storage".
+            payload (DownlinkPackage | Path):
+                The downlink payload from the server, or a path to a file
+                containing the payload if `ipc_mode` is "storage".
+            device (str): Device to use for processing (e.g., "cpu", "cuda:0").
         Returns:
-            Path: Path to the processed client's data file.
+            UplinkPackage | Path:
+                The uplink package containing the client's results, or a path
+                to a file containing the package if `ipc_mode` is "storage".
         """
         ...
@@ -130,6 +150,13 @@ class ProcessPoolClientTrainer(
         Returns:
             None
         """
+        payload_path = Path()
+        if self.ipc_mode == "storage":
+            payload_path = self.share_dir.joinpath("payload.pkl")
+            torch.save(payload, payload_path)
+        else:  # shared_memory
+            move_tensor_to_shared_memory(payload)
         with mp.Pool(
             processes=self.num_parallels,
             initializer=signal.signal,
@@ -137,16 +164,28 @@ class ProcessPoolClientTrainer(
         ) as pool:
             jobs: list[ApplyResult] = []
             for cid in cid_list:
-                path = self.share_dir.joinpath(f"{cid}.pkl")
-                data = self.get_shared_data(cid, payload)
+                config = self.get_client_config(cid)
                 device = self.get_client_device(cid)
-                torch.save(data, path)
-                jobs.append(pool.apply_async(self.process_client, (path, device)))
+                if self.ipc_mode == "storage":
+                    config_path = self.share_dir.joinpath(f"{cid}.pkl")
+                    torch.save(config, config_path)
+                    jobs.append(
+                        pool.apply_async(
+                            self.worker, (config_path, payload_path, device)
+                        )
+                    )
+                else:  # shared_memory
+                    jobs.append(
+                        pool.apply_async(self.worker, (config, payload, device))
+                    )
             for job in tqdm(jobs, desc="Client", leave=False):
-                path = job.get()
-                assert isinstance(path, Path)
-                package = torch.load(path, weights_only=False)
+                result = job.get()
+                if self.ipc_mode == "storage":
+                    assert isinstance(result, Path)
+                    package = torch.load(result, weights_only=False)
+                else:  # shared_memory
+                    package = result
                 self.cache.append(package)
@@ -159,12 +198,24 @@ class ThreadPoolClientTrainer(
     device_count: int
     cache: list[UplinkPackage]
-    def process_client(
+    def worker(
         self,
         cid: int,
         device: str,
         payload: DownlinkPackage,
-    ) -> UplinkPackage: ...
+    ) -> UplinkPackage:
+        """
+        Process a single client's training task in a thread.
+        Args:
+            cid (int): The client ID.
+            device (str): The device to use for processing this client.
+            payload (DownlinkPackage): The data package received from the server.
+        Returns:
+            UplinkPackage: The uplink package containing the client's results.
+        """
+        ...
     def get_client_device(self, cid: int) -> str:
         if self.device == "cuda":
@@ -177,7 +228,7 @@ class ThreadPoolClientTrainer(
             for cid in cid_list:
                 device = self.get_client_device(cid)
                 future = executor.submit(
-                    self.process_client,
+                    self.worker,
                     cid,
                     device,
                     payload,

blazefl 2.0.0.dev3__tar.gz → 2.0.0.dev4__tar.gz

blazefl 2.0.0.dev3__tar.gz → 2.0.0.dev4__tar.gz