PyPI - TransferQueue - Versions diffs - 0.1.1.dev0__py3-none-any.whl - Mend

TransferQueue 0.1.1.dev0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (41) hide show

recipe/simple_use_case/async_demo.py +331 -0
recipe/simple_use_case/sync_demo.py +220 -0
tests/test_async_simple_storage_manager.py +339 -0
tests/test_client.py +423 -0
tests/test_controller.py +274 -0
tests/test_controller_data_partitions.py +513 -0
tests/test_kv_storage_manager.py +92 -0
tests/test_put.py +327 -0
tests/test_samplers.py +492 -0
tests/test_serial_utils_on_cpu.py +202 -0
tests/test_simple_storage_unit.py +443 -0
tests/test_storage_client_factory.py +45 -0
transfer_queue/__init__.py +48 -0
transfer_queue/client.py +611 -0
transfer_queue/controller.py +1187 -0
transfer_queue/metadata.py +460 -0
transfer_queue/sampler/__init__.py +19 -0
transfer_queue/sampler/base.py +74 -0
transfer_queue/sampler/grpo_group_n_sampler.py +157 -0
transfer_queue/sampler/sequential_sampler.py +75 -0
transfer_queue/storage/__init__.py +25 -0
transfer_queue/storage/clients/__init__.py +24 -0
transfer_queue/storage/clients/base.py +22 -0
transfer_queue/storage/clients/factory.py +55 -0
transfer_queue/storage/clients/yuanrong_client.py +118 -0
transfer_queue/storage/managers/__init__.py +23 -0
transfer_queue/storage/managers/base.py +460 -0
transfer_queue/storage/managers/factory.py +43 -0
transfer_queue/storage/managers/simple_backend_manager.py +611 -0
transfer_queue/storage/managers/yuanrong_manager.py +18 -0
transfer_queue/storage/simple_backend.py +451 -0
transfer_queue/utils/__init__.py +13 -0
transfer_queue/utils/serial_utils.py +240 -0
transfer_queue/utils/utils.py +132 -0
transfer_queue/utils/zmq_utils.py +170 -0
transfer_queue/version/version +1 -0
transferqueue-0.1.1.dev0.dist-info/METADATA +327 -0
transferqueue-0.1.1.dev0.dist-info/RECORD +41 -0
transferqueue-0.1.1.dev0.dist-info/WHEEL +5 -0
transferqueue-0.1.1.dev0.dist-info/licenses/LICENSE +202 -0
transferqueue-0.1.1.dev0.dist-info/top_level.txt +4 -0

transfer_queue/sampler/grpo_group_n_sampler.py ADDED Viewed

@@ -0,0 +1,157 @@
+# Copyright 2025 The TransferQueue Team
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import Any
+from transfer_queue.sampler import BaseSampler
+class GRPOGroupNSampler(BaseSampler):
+    """Group-based sampler for reinforcement learning and multi-sample generation workflows.
+    This sampler implements grouped sampling without replacement, specifically designed
+    for scenarios where multiple samples need to be generated from the same input prompt
+    or where grouped sampling is required. It ensures that all samples belonging to the
+    same prompt are either selected together or not at all, maintaining the integrity
+    of prompt groups throughout the training process.
+    The sampler is commonly used in GRPO (Group Relative Policy Optimization)
+    training scenarios where you need to generate multiple responses from the same
+    prompt and train the policy on all of them together.
+    The sampler is configured through TransferQueueController and receives parameters
+    via the sampling_config in get_meta calls:
+    ```python
+    # Initialize controller with GRPO sampler
+    from transfer_queue import TransferQueueController, GRPOGroupNSampler, AsyncTransferQueueClient
+    controller = TransferQueueController.remote(sampler=GRPOGroupNSampler)
+    controller_info = process_zmq_server_info(controller)
+    client = AsyncTransferQueueClient(
+        client_id="rl_client",
+        controller_info=controller_info,
+    )
+    # Get metadata with grouped sampling configuration
+    meta = await client.async_get_meta(
+        data_fields=["input_ids", "attention_mask", "generated_text", "reward"],
+        batch_size=16,  # Total samples requested
+        partition_id="train_0",
+        task_name="rl_training",
+        sampling_config={"n_samples_per_prompt": 4}  # 4 samples per prompt
+    )
+    # This will return 16 samples organized as 4 groups of 4 samples each
+    ```
+    Data Organization:
+    This sampler assumes the user puts the prompts in consecutive orders, such as
+    [prompt1_sample1, prompt1_sample2, prompt2_sample1, prompt2_sample2, ...]
+    belong to the same prompt group:
+    ```
+    ready_indexes = [prompt1_sample1, prompt1_sample2, prompt1_sample3, prompt1_sample4,
+                    prompt2_sample1, prompt2_sample2, prompt2_sample3, prompt2_sample4, ...]
+    ```
+    """
+    def __init__(
+        self,
+    ):
+        """Initialize the GRPOGroupNSampler.
+        The sampler maintains minimal internal state and relies on runtime
+        configuration through the sampling_config parameter.
+        """
+        super().__init__()
+    def sample(
+        self,
+        ready_indexes: list[int],
+        batch_size: int,
+        n_samples_per_prompt: int,
+        *args: Any,
+        **kwargs: Any,
+    ) -> tuple[list[int], list[int]]:
+        """Sample groups of indices from the ready indices.
+        This method implements group completeness validation and ensures that only complete
+        groups are sampled. It returns empty lists if insufficient complete groups are available.
+        Args:
+            ready_indexes: List of global indices for which all required fields have been
+                produced and samples are not labeled as consumed. These should be organized
+                such that consecutive indices belong to the same prompt group.
+            batch_size: Total number of samples to select. Must be divisible by n_samples_per_prompt.
+            n_samples_per_prompt: Number of samples per prompt group. Must be > 0.
+            *args: Additional positional arguments (ignored in current implementation)
+            **kwargs: Additional keyword arguments (ignored in current implementation)
+        Returns:
+            Tuple of (sampled_indexes, consumed_indexes):
+            - sampled_indexes: List of selected global indices, length = batch_size or empty
+            - consumed_indexes: List of indices to mark as consumed, identical to sampled_indexes
+              (without replacement semantics)
+        Examples:
+            >>> sampler = GRPOGroupNSampler()
+            >>> ready_indexes = [0, 1, 3, 4, 6, 7]  # No complete groups after sorting
+            >>> sampled, consumed = sampler.sample(ready_indexes, 6, n_samples_per_prompt=3)
+            >>> sampled
+            []
+            >>> consumed
+            []
+            >>> ready_indexes = [0, 1, 3, 4, 5, 6, 7, 9, 10, 11]  # Has complete groups after sorting
+            >>> sampled, consumed = sampler.sample(ready_indexes, 6, n_samples_per_prompt=3)
+            >>> sampled
+            [3, 4, 5, 9, 10, 11]
+            >>> consumed
+            [3, 4, 5, 9, 10, 11]
+        """
+        # Basic validation
+        if n_samples_per_prompt <= 0:
+            raise ValueError(f"n_samples_per_prompt must be positive, got {n_samples_per_prompt}")
+        if batch_size % n_samples_per_prompt != 0:
+            raise ValueError(
+                f"batch_size ({batch_size}) must be a multiple of n_samples_per_prompt ({n_samples_per_prompt})"
+            )
+        required_groups = batch_size // n_samples_per_prompt
+        sorted_ready_indexes = sorted(ready_indexes)
+        complete_group_indices = []
+        found_groups = 0
+        i = 0
+        while i <= len(sorted_ready_indexes) - n_samples_per_prompt and found_groups < required_groups:
+            potential_group = sorted_ready_indexes[i : i + n_samples_per_prompt]
+            # Check if this forms a complete group (consecutive indices)
+            is_consecutive = all(
+                potential_group[j + 1] - potential_group[j] == 1 for j in range(len(potential_group) - 1)
+            )
+            if is_consecutive:
+                complete_group_indices.extend(potential_group)
+                found_groups += 1
+                i += n_samples_per_prompt
+            else:
+                i += 1
+        if found_groups < required_groups:
+            return [], []
+        sampled_indexes = complete_group_indices
+        consumed_indexes = sampled_indexes.copy()
+        return sampled_indexes, consumed_indexes

transfer_queue/sampler/sequential_sampler.py ADDED Viewed

@@ -0,0 +1,75 @@
+# Copyright 2025 The TransferQueue Team
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import Any
+from transfer_queue.sampler import BaseSampler
+class SequentialSampler(BaseSampler):
+    """Sequential sampler for basic data consumption patterns.
+    This sampler implements sequential sampling without replacement, selecting samples
+    from the beginning of the ready_indexes list in order. It's the default sampling
+    strategy for TransferQueueController and provides simple, deterministic data consumption
+    with minimal overhead.
+    The sampler is ideal for standard supervised learning scenarios, data preprocessing
+    pipelines, and any use case where ordered, predictable data consumption is preferred.
+    It ensures each sample is consumed exactly once, maintaining a clean progression through
+    the available data.
+    This sampler is typically used as the default sampler in TransferQueueController:
+    ```python
+    # Default usage (SequentialSampler is the default)
+    controller = TransferQueueController.remote()
+    # or explicitly:
+    controller = TransferQueueController.remote(sampler=SequentialSampler)
+    ```
+    """
+    def __init__(
+        self,
+    ):
+        """Initialize the SequentialSampler.
+        SequentialSampler requires no initialization parameters and maintains
+        minimal internal state for optimal performance.
+        """
+        super().__init__()
+    def sample(
+        self,
+        ready_indexes: list[int],
+        batch_size: int,
+        *args: Any,
+        **kwargs: Any,
+    ) -> tuple[list[int], list[int]]:
+        """Select first batch_size elements from ready_indexes.
+        Args:
+            ready_indexes: Available sample indices.
+            batch_size: Number of samples to select. If larger than available ready samples,
+                all available samples will be returned.
+            *args: Additional positional arguments (ignored).
+            **kwargs: Additional keyword arguments (ignored).
+        Returns:
+            Tuple of (sampled_indexes, consumed_indexes), where consumed_indexes = sampled_indexes.
+        """
+        sampled_indexes = ready_indexes[:batch_size]
+        consumed_indexes = sampled_indexes
+        return sampled_indexes, consumed_indexes

transfer_queue/storage/__init__.py ADDED Viewed

@@ -0,0 +1,25 @@
+# Copyright 2025 The TransferQueue Team
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from .managers import AsyncSimpleStorageManager, TransferQueueStorageManager, TransferQueueStorageManagerFactory
+from .simple_backend import SimpleStorageUnit, StorageMetaGroup, StorageUnitData
+# Explicit public API of the storage package: the names re-exported from the
+# managers sub-package and from simple_backend by the imports above.
+__all__ = [
+    "SimpleStorageUnit",
+    "StorageUnitData",
+    "StorageMetaGroup",
+    "TransferQueueStorageManager",
+    "TransferQueueStorageManagerFactory",
+    "AsyncSimpleStorageManager",
+]

transfer_queue/storage/clients/__init__.py ADDED Viewed

@@ -0,0 +1,24 @@
+# Copyright 2025 The TransferQueue Team
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# Public entry points of the storage client package: the abstract client base class,
+# the client factory, and the Yuanrong client implementation.
+from .base import TransferQueueStorageKVClient
+from .factory import StorageClientFactory
+from .yuanrong_client import YRStorageClient
+# Names exposed by a wildcard import of this package.
+__all__ = [
+    "TransferQueueStorageKVClient",
+    "StorageClientFactory",
+    "YRStorageClient",
+]

transfer_queue/storage/clients/base.py ADDED Viewed

@@ -0,0 +1,22 @@
+from abc import ABC, abstractmethod
+from torch import Tensor
+class TransferQueueStorageKVClient(ABC):
+    """
+    Abstract base class for storage client.
+    Subclasses must implement the core methods: put, get, and clear.
+    The class cannot be instantiated until all three are overridden.
+    """
+    @abstractmethod
+    def put(self, keys: list[str], values: list[Tensor]) -> None:
+        """Store ``values`` under ``keys``.
+        Args:
+            keys: String keys to write.
+            values: Tensors to store; presumably values[i] belongs to keys[i]
+                (confirm against the concrete implementation).
+        """
+        raise NotImplementedError("Subclasses must implement put")
+    @abstractmethod
+    def get(self, keys: list[str], shapes=None, dtypes=None) -> list[Tensor]:
+        """Fetch the tensors stored under ``keys``.
+        Args:
+            keys: String keys to read.
+            shapes: Optional shape information for the returned tensors; whether it
+                is required is decided by the concrete implementation.
+            dtypes: Optional dtype information for the returned tensors; whether it
+                is required is decided by the concrete implementation.
+        Returns:
+            The retrieved tensors as a list.
+        """
+        raise NotImplementedError("Subclasses must implement get")
+    @abstractmethod
+    def clear(self, keys: list[str]) -> None:
+        """Remove the entries identified by ``keys``.
+        Args:
+            keys: String keys to delete.
+        """
+        raise NotImplementedError("Subclasses must implement clear")

transfer_queue/storage/clients/factory.py ADDED Viewed

@@ -0,0 +1,55 @@
+# Copyright 2025 The TransferQueue Team
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from transfer_queue.storage.clients.base import TransferQueueStorageKVClient
+class StorageClientFactory:
+    """
+    Factory class for creating storage client instances.
+    Uses a decorator-based registration mechanism to map client names to classes.
+    """
+    # Class variable: maps client names to their corresponding classes
+    _registry: dict[str, TransferQueueStorageKVClient] = {}
+    @classmethod
+    def register(cls, client_type: str):
+        """
+        Decorator to register a concrete client class with the factory.
+        Args:
+            client_type (str): The name used to identify the client
+        Returns:
+            Callable: The decorator function that returns the original class
+        """
+        def decorator(client_class: TransferQueueStorageKVClient) -> TransferQueueStorageKVClient:
+            cls._registry[client_type] = client_class
+            return client_class
+        return decorator
+    @classmethod
+    def create(cls, client_type: str, config: dict) -> TransferQueueStorageKVClient:
+        """
+        Create and return an instance of the storage client by name.
+        Args:
+            client_type (str): The registered name of the client
+        Returns:
+            StorageClientFactory: An instance of the requested client
+        Raises:
+            ValueError: If no client is registered with the given name
+        """
+        if client_type not in cls._registry:
+            raise ValueError(f"Unknown StorageClient: {client_type}")
+        return cls._registry[client_type](config)

transfer_queue/storage/clients/yuanrong_client.py ADDED Viewed

@@ -0,0 +1,118 @@
+from typing import Any
+import torch
+from torch import Tensor
+from transfer_queue.storage.clients.base import TransferQueueStorageKVClient
+from transfer_queue.storage.clients.factory import StorageClientFactory
+# Availability flags for the two optional dependencies. Each flag starts as True and
+# is switched to False when the matching import fails, so importing this module never
+# raises; YRStorageClient.__init__ checks the flags and raises ImportError there.
+YUANRONG_DATASYSTEM_IMPORTED: bool = True
+TORCH_NPU_IMPORTED: bool = True
+try:
+    import datasystem
+except ImportError:
+    # YuanRong DataSystem is not installed
+    YUANRONG_DATASYSTEM_IMPORTED = False
+try:
+    import torch_npu
+except ImportError:
+    # torch_npu is not installed
+    TORCH_NPU_IMPORTED = False
+# TODO: DSTensorClient.dev_mget has wrong behavior: it may require stricter environment to execute
+@StorageClientFactory.register("Yuanrong")
+class YRStorageClient(TransferQueueStorageKVClient):
+    """
+    Storage client for YuanRong DataSystem.
+    Communicates with the remote tensor storage service via DsTensorClient.
+    All tensors must reside on NPU device.
+    """
+    def __init__(self, config: dict[str, Any]):
+        if not YUANRONG_DATASYSTEM_IMPORTED:
+            raise ImportError("YuanRong DataSystem not installed.")
+        if not TORCH_NPU_IMPORTED:
+            raise ImportError("Torch_npu not installed.")
+        self.host = config.get("host")
+        self.port = config.get("port")
+        self.device_id = config.get("device_id")
+        torch_npu.npu.set_device(f"npu:{self.device_id}")  # set npu_device
+        self._ds_client = datasystem.DsTensorClient(self.host, self.port, self.device_id)
+        self._ds_client.init()
+    def _create_empty_tensorlist(self, shapes, dtypes):
+        """
+        Create a list of empty NPU tensors with given shapes and dtypes.
+        Args:
+            shapes (list): List of tensor shapes (e.g., [(3,), (2, 4)])
+            dtypes (list): List of torch dtypes (e.g., [torch.float32, torch.int64])
+        Returns:
+            list: List of uninitialized NPU tensors
+        """
+        if len(dtypes) != len(shapes):
+            raise ValueError("Length of dtypes must equal length of shapes")
+        tensors: list[Tensor] = []
+        for dtype, shape in zip(dtypes, shapes, strict=False):
+            tensor = torch.empty(shape, dtype=dtype).to(f"npu:{self.device_id}")
+            tensors.append(tensor)
+        return tensors
+    def put(self, keys: list[str], values: list[Tensor]):
+        """
+        Store tensors to remote storage.
+        Args:
+            keys (list): List of string keys
+            values (list): List of torch.Tensor on NPU
+        """
+        if not isinstance(keys, list) or not isinstance(values, list):
+            raise ValueError("keys and values must be lists")
+        if len(keys) != len(values):
+            raise ValueError("Number of keys must match number of values")
+        # TODO: Support the situation when the number of keys is greater than 10000
+        if len(keys) > 10000:
+            raise NotImplementedError("We will support the number of keys greater than 10000 int the future")
+        for value in values:
+            if not isinstance(value, torch.Tensor):
+                raise ValueError(f"Expected torch.Tensor, got {type(value)}")
+            if value.device.type != "npu":
+                raise ValueError(f"Tensor is on {value.device}, not on NPU")
+        self._ds_client.dev_mset(keys, values)
+    def get(self, keys: list[str], shapes=None, dtypes=None) -> list[Tensor]:
+        """
+        Retrieve tensors from remote storage.
+        Args:
+            keys (list): List of keys to fetch
+            shapes (list): Expected shapes of returned tensors
+            dtypes (list): Expected dtypes of returned tensors
+        Returns:
+            list: List of retrieved NPU tensors
+        """
+        if shapes is None:
+            raise ValueError("Yuanrong storage client needs Expected shapes of returned tensors")
+        if dtypes is None:
+            raise ValueError("Yuanrong storage client needs Expected dtypes of returned tensors")
+        if len(dtypes) != len(shapes):
+            raise ValueError("Length of dtypes must equal length of shapes")
+        values: list[Tensor] = self._create_empty_tensorlist(shapes=shapes, dtypes=dtypes)
+        # TODO: Support the situation when the number of keys is greater than 10000
+        if len(keys) > 10000:
+            raise NotImplementedError("We will support the number of keys greater than 10000 int the future")
+        # Timeout set to 2000ms
+        self._ds_client.dev_mget(keys, values, 2000)
+        return values
+    def clear(self, keys: list[str]):
+        """
+        Delete entries from storage by keys.
+        Args:
+            keys (list): List of keys to delete
+        """
+        self._ds_client.dev_delete(keys)

transfer_queue/storage/managers/__init__.py ADDED Viewed

@@ -0,0 +1,23 @@
+# Copyright 2025 The TransferQueue Team
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from .base import TransferQueueStorageManager
+from .factory import TransferQueueStorageManagerFactory
+from .simple_backend_manager import AsyncSimpleStorageManager
+# Explicit public API of the managers package: the names re-exported by the
+# imports above.
+__all__ = [
+    "TransferQueueStorageManager",
+    "TransferQueueStorageManagerFactory",
+    "AsyncSimpleStorageManager",
+]