TransferQueue 0.0.1.dev0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- recipe/simple_use_case/async_demo.py +307 -0
- recipe/simple_use_case/sync_demo.py +223 -0
- tests/test_client.py +390 -0
- tests/test_controller.py +268 -0
- tests/test_serial_utils_on_cpu.py +202 -0
- tests/test_simple_storage_unit.py +479 -0
- transfer_queue/__init__.py +42 -0
- transfer_queue/client.py +663 -0
- transfer_queue/controller.py +772 -0
- transfer_queue/metadata.py +603 -0
- transfer_queue/storage.py +515 -0
- transfer_queue/utils/__init__.py +13 -0
- transfer_queue/utils/serial_utils.py +240 -0
- transfer_queue/utils/utils.py +98 -0
- transfer_queue/utils/zmq_utils.py +175 -0
- transfer_queue/version/version +1 -0
- transferqueue-0.0.1.dev0.dist-info/METADATA +15 -0
- transferqueue-0.0.1.dev0.dist-info/RECORD +21 -0
- transferqueue-0.0.1.dev0.dist-info/WHEEL +5 -0
- transferqueue-0.0.1.dev0.dist-info/licenses/LICENSE +202 -0
- transferqueue-0.0.1.dev0.dist-info/top_level.txt +4 -0
tests/test_client.py
ADDED
|
@@ -0,0 +1,390 @@
|
|
|
1
|
+
# Copyright 2025 The TransferQueue Team
|
|
2
|
+
#
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
#
|
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
+
#
|
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
|
+
# See the License for the specific language governing permissions and
|
|
13
|
+
# limitations under the License.
|
|
14
|
+
|
|
15
|
+
import sys
|
|
16
|
+
import time
|
|
17
|
+
from pathlib import Path
|
|
18
|
+
from threading import Thread
|
|
19
|
+
|
|
20
|
+
import pytest
|
|
21
|
+
import torch
|
|
22
|
+
import zmq
|
|
23
|
+
from tensordict import NonTensorStack, TensorDict
|
|
24
|
+
|
|
25
|
+
# Import your classes here
|
|
26
|
+
parent_dir = Path(__file__).resolve().parent.parent
|
|
27
|
+
sys.path.append(str(parent_dir))
|
|
28
|
+
|
|
29
|
+
from transfer_queue import TransferQueueClient # noqa: E402
|
|
30
|
+
from transfer_queue.metadata import ( # noqa: E402
|
|
31
|
+
BatchMeta,
|
|
32
|
+
FieldMeta,
|
|
33
|
+
SampleMeta,
|
|
34
|
+
)
|
|
35
|
+
from transfer_queue.utils.zmq_utils import ( # noqa: E402
|
|
36
|
+
ZMQMessage,
|
|
37
|
+
ZMQRequestType,
|
|
38
|
+
ZMQServerInfo,
|
|
39
|
+
)
|
|
40
|
+
|
|
41
|
+
# Shared fixture payload covering the three data kinds the client must
# round-trip: dense tensors, ragged (nested) tensors, and plain strings.
TEST_DATA = TensorDict(
    {
        "log_probs": [
            torch.tensor([1.0, 2.0, 3.0]),
            torch.tensor([4.0, 5.0, 6.0]),
            torch.tensor([7.0, 8.0, 9.0]),
        ],
        "variable_length_sequences": torch.nested.as_nested_tensor(
            [
                torch.tensor([-0.5, -1.2, -0.8]),
                torch.tensor([-0.3, -1.5, -2.1, -0.9]),
                torch.tensor([-1.1, -0.7]),
            ]
        ),
        "prompt_text": ["Hello world!", "This is a longer sentence for testing", "Test case"],
    },
    batch_size=[3],
)
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
# Mock Controller for Client Unit Testing
class MockController:
    """In-process stand-in for TransferQueueController.

    Binds a ZMQ ROUTER socket on a random local port and answers metadata
    requests (GET_META / GET_CLEAR_META / CLEAR_META) from a daemon thread,
    so TransferQueueClient can be unit-tested without a real controller.
    """

    def __init__(self, controller_id="controller_0"):
        self.controller_id = controller_id
        self.context = zmq.Context()

        # Socket for data requests
        self.request_socket = self.context.socket(zmq.ROUTER)
        self.request_port = self._bind_to_random_port(self.request_socket)

        self.zmq_server_info = ZMQServerInfo.create(
            role="TransferQueueController",
            id=controller_id,
            ip="127.0.0.1",
            ports={
                "request_handle_socket": self.request_port,
            },
        )

        self.running = True
        self.request_thread = Thread(target=self._handle_requests, daemon=True)
        self.request_thread.start()

    def _bind_to_random_port(self, socket):
        """Bind *socket* to an OS-chosen free local port and return the port."""
        port = socket.bind_to_random_port("tcp://127.0.0.1")
        return port

    def _handle_requests(self):
        """Serve controller requests until stop() flips self.running."""
        poller = zmq.Poller()
        poller.register(self.request_socket, zmq.POLLIN)

        while self.running:
            try:
                socks = dict(poller.poll(100))  # 100ms timeout
                if self.request_socket in socks:
                    identity, serialized_msg = self.request_socket.recv_multipart()
                    request_msg = ZMQMessage.deserialize(serialized_msg)

                    # Determine response based on request type
                    if request_msg.request_type == ZMQRequestType.GET_META:
                        response_body = self._mock_batch_meta(request_msg.body)
                        response_type = ZMQRequestType.GET_META_RESPONSE
                    elif request_msg.request_type == ZMQRequestType.GET_CLEAR_META:
                        response_body = self._mock_batch_meta(request_msg.body)
                        response_type = ZMQRequestType.GET_CLEAR_META_RESPONSE
                    elif request_msg.request_type == ZMQRequestType.CLEAR_META:
                        response_body = {"message": "clear ok"}
                        response_type = ZMQRequestType.CLEAR_META_RESPONSE
                    else:
                        # FIX: originally an unrecognized request type fell
                        # through with response_body/response_type unbound,
                        # raising UnboundLocalError below.
                        print(f"MockController: unhandled request type {request_msg.request_type}")
                        continue

                    # Send response
                    response_msg = ZMQMessage.create(
                        request_type=response_type,
                        sender_id=self.controller_id,
                        receiver_id=request_msg.sender_id,
                        body=response_body,
                    )
                    self.request_socket.send_multipart([identity, response_msg.serialize()])
            except zmq.Again:
                continue
            except Exception as e:
                # FIX: originally checked the non-existent attribute
                # `self.is_running`, which itself raised AttributeError.
                if self.running:
                    print(f"MockController running exception: {e}")
                else:
                    print(f"MockController ERROR: {e}")
                    raise

    def _mock_batch_meta(self, request_body):
        """Build a BatchMeta of `batch_size` samples for the requested fields."""
        batch_size = request_body.get("batch_size", 1)
        data_fields = request_body.get("data_fields", [])

        samples = []
        for i in range(batch_size):
            fields = []
            for field_name in data_fields:
                field_meta = FieldMeta(
                    name=field_name,
                    dtype=None,
                    shape=None,
                    production_status=0,
                )
                fields.append(field_meta)
            sample = SampleMeta(
                global_step=0,
                global_index=i,
                storage_id="storage_0",
                local_index=i,
                fields={field.name: field for field in fields},
            )
            samples.append(sample)
        metadata = BatchMeta(samples=samples)

        return {"metadata": metadata}

    def stop(self):
        """Stop the serving thread and release ZMQ resources."""
        self.running = False
        time.sleep(0.2)  # Give thread time to stop
        self.request_socket.close()
        self.context.term()
|
|
155
|
+
|
|
156
|
+
|
|
157
|
+
# Mock Storage for Client Unit Testing
class MockStorage:
    """In-process stand-in for a TransferQueue storage unit.

    Binds a ZMQ ROUTER socket on a random local port and serves
    PUT_DATA / GET_DATA / CLEAR_DATA requests from a daemon thread,
    answering GET_DATA with slices of the module-level TEST_DATA.
    """

    def __init__(self, storage_id="storage_0"):
        self.storage_id = storage_id
        self.context = zmq.Context()

        # Socket for data operations
        self.data_socket = self.context.socket(zmq.ROUTER)
        self.data_port = self._bind_to_random_port(self.data_socket)

        self.zmq_server_info = ZMQServerInfo.create(
            role="TransferQueueStorage",
            id=storage_id,
            ip="127.0.0.1",
            ports={
                "put_get_socket": self.data_port,
            },
        )

        self.running = True
        self.data_thread = Thread(target=self._handle_data_requests, daemon=True)
        self.data_thread.start()

    def _bind_to_random_port(self, socket):
        """Bind *socket* to an OS-chosen free local port and return the port."""
        port = socket.bind_to_random_port("tcp://127.0.0.1")
        return port

    def _handle_data_requests(self):
        """Serve storage requests until stop() flips self.running."""
        poller = zmq.Poller()
        poller.register(self.data_socket, zmq.POLLIN)

        while self.running:
            try:
                socks = dict(poller.poll(100))  # 100ms timeout
                if self.data_socket in socks:
                    identity, msg_bytes = self.data_socket.recv_multipart()
                    msg = ZMQMessage.deserialize(msg_bytes)

                    # Handle different request types
                    if msg.request_type == ZMQRequestType.PUT_DATA:
                        response_body = {"message": "Data stored successfully"}
                        response_type = ZMQRequestType.PUT_DATA_RESPONSE
                    elif msg.request_type == ZMQRequestType.GET_DATA:
                        response_body = self._handle_get_data(msg.body)
                        response_type = ZMQRequestType.GET_DATA_RESPONSE
                    elif msg.request_type == ZMQRequestType.CLEAR_DATA:
                        response_body = {"message": "Data cleared successfully"}
                        response_type = ZMQRequestType.CLEAR_DATA_RESPONSE
                    else:
                        # FIX: originally an unrecognized request type fell
                        # through with response_body/response_type unbound,
                        # raising UnboundLocalError below.
                        print(f"MockStorage: unhandled request type {msg.request_type}")
                        continue

                    # Send response
                    response_msg = ZMQMessage.create(
                        request_type=response_type,
                        sender_id=self.storage_id,
                        receiver_id=msg.sender_id,
                        body=response_body,
                    )
                    self.data_socket.send_multipart([identity, response_msg.serialize()])
            except zmq.Again:
                continue
            except Exception as e:
                # FIX: originally checked the non-existent attribute
                # `self.is_running`, which itself raised AttributeError.
                if self.running:
                    print(f"MockStorage running exception: {e}")
                else:
                    print(f"MockStorage ERROR: {e}")
                    raise

    def _handle_get_data(self, request_body):
        """Handle GET_DATA request by retrieving stored data"""
        local_indexes = request_body.get("local_indexes", [])
        fields = request_body.get("fields", [])

        result: dict[str, list] = {}
        for field in fields:
            gathered_items = [TEST_DATA[field][i] for i in local_indexes]

            if gathered_items:
                # Tensor fields become a nested tensor (lengths may vary);
                # anything else is wrapped as a NonTensorStack.
                all_tensors = all(isinstance(x, torch.Tensor) for x in gathered_items)
                if all_tensors:
                    result[field] = torch.nested.as_nested_tensor(gathered_items)
                else:
                    result[field] = NonTensorStack(*gathered_items)

        return {"data": TensorDict(result)}

    def stop(self):
        """Stop the serving thread and release ZMQ resources."""
        self.running = False
        time.sleep(0.2)  # Give thread time to stop
        self.data_socket.close()
        self.context.term()
|
|
246
|
+
|
|
247
|
+
|
|
248
|
+
# Test Fixtures
@pytest.fixture
def mock_controller():
    """Provide a running MockController, stopping it during teardown."""
    ctrl = MockController()
    yield ctrl
    ctrl.stop()
|
|
254
|
+
|
|
255
|
+
|
|
256
|
+
@pytest.fixture
def mock_storage():
    """Provide a running MockStorage, stopping it during teardown."""
    unit = MockStorage()
    yield unit
    unit.stop()
|
|
261
|
+
|
|
262
|
+
|
|
263
|
+
@pytest.fixture
|
|
264
|
+
def client_setup(mock_controller, mock_storage):
|
|
265
|
+
# Create client with mock controller and storage
|
|
266
|
+
client_id = "client_0"
|
|
267
|
+
|
|
268
|
+
client = TransferQueueClient(
|
|
269
|
+
client_id=client_id,
|
|
270
|
+
controller_infos={mock_controller.controller_id: mock_controller.zmq_server_info},
|
|
271
|
+
storage_infos={mock_storage.storage_id: mock_storage.zmq_server_info},
|
|
272
|
+
)
|
|
273
|
+
|
|
274
|
+
# Give some time for connections to establish
|
|
275
|
+
time.sleep(0.5)
|
|
276
|
+
|
|
277
|
+
yield client, mock_controller, mock_storage
|
|
278
|
+
|
|
279
|
+
|
|
280
|
+
# Test basic functionality
def test_client_initialization(client_setup):
    """The client should register both mock endpoints on construction."""
    client, controller, storage = client_setup

    assert client.client_id is not None
    assert controller.controller_id in client._controllers
    assert storage.storage_id in client._storages
|
|
288
|
+
|
|
289
|
+
|
|
290
|
+
def test_put_and_get_data(client_setup):
    """Round-trip: put TEST_DATA, fetch metadata, get the data back, verify it."""
    client, _, _ = client_setup

    # Store the fixture batch.
    client.put(data=TEST_DATA, global_step=0)

    # Ask the controller for metadata describing two samples of each field.
    requested_fields = ["log_probs", "variable_length_sequences", "prompt_text"]
    metadata = client.get_meta(data_fields=requested_fields, batch_size=2, global_step=0)

    # Retrieve the described data from storage.
    result = client.get_data(metadata)

    # Every requested field must be present in the result.
    for field in requested_fields:
        assert field in result

    torch.testing.assert_close(result["log_probs"][0], torch.tensor([1.0, 2.0, 3.0]))
    torch.testing.assert_close(result["log_probs"][1], torch.tensor([4.0, 5.0, 6.0]))
    torch.testing.assert_close(result["variable_length_sequences"][0], torch.tensor([-0.5, -1.2, -0.8]))
    torch.testing.assert_close(result["variable_length_sequences"][1], torch.tensor([-0.3, -1.5, -2.1, -0.9]))
    assert result["prompt_text"][0] == "Hello world!"
    assert result["prompt_text"][1] == "This is a longer sentence for testing"
|
|
316
|
+
|
|
317
|
+
|
|
318
|
+
def test_get_meta(client_setup):
    """get_meta should return a batch-metadata object of the requested size."""
    client, _, _ = client_setup

    metadata = client.get_meta(data_fields=["tokens", "labels"], batch_size=10, global_step=0)

    # The metadata object must expose the expected attributes.
    for attr in ("storage_meta_groups", "global_indexes", "field_names", "size"):
        assert hasattr(metadata, attr)
    assert len(metadata.global_indexes) == 10
|
|
331
|
+
|
|
332
|
+
|
|
333
|
+
def test_clear_operation(client_setup):
    """clear() against the mock controller should complete without error."""
    client, _, _ = client_setup

    client.clear(global_step=0)
|
|
339
|
+
|
|
340
|
+
|
|
341
|
+
# Test with multiple controllers and storage units
def test_multiple_servers():
    """Client should connect to several controllers and storage units at once."""
    controllers = [MockController(f"controller_{i}") for i in range(2)]
    storages = [MockStorage(f"storage_{i}") for i in range(3)]

    try:
        client = TransferQueueClient(
            client_id="client_test_multiple_servers",
            controller_infos={c.controller_id: c.zmq_server_info for c in controllers},
            storage_infos={s.storage_id: s.zmq_server_info for s in storages},
        )

        # Give the sockets time to connect before exercising the client.
        time.sleep(1.0)

        # Every server should be registered.
        assert len(client._controllers) == 2
        assert len(client._storages) == 3

        # A basic put should succeed against the multi-server setup.
        payload = TensorDict({"tokens": torch.randint(0, 100, (5, 128))}, batch_size=5)
        client.put(data=payload, global_step=0)

    finally:
        # Clean up all mock servers regardless of test outcome.
        for server in controllers + storages:
            server.stop()
|
|
378
|
+
|
|
379
|
+
|
|
380
|
+
# Test error handling
def test_put_without_required_params(client_setup):
    """put() without a global_step must raise an AssertionError."""
    client, _, _ = client_setup

    payload = TensorDict({"tokens": torch.randint(0, 100, (5, 128))}, batch_size=5)

    with pytest.raises(AssertionError):
        client.put(data=payload)
|
tests/test_controller.py
ADDED
|
@@ -0,0 +1,268 @@
|
|
|
1
|
+
# Copyright 2025 The TransferQueue Team
|
|
2
|
+
#
|
|
3
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
4
|
+
# you may not use this file except in compliance with the License.
|
|
5
|
+
# You may obtain a copy of the License at
|
|
6
|
+
#
|
|
7
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
8
|
+
#
|
|
9
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
10
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
11
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
|
+
# See the License for the specific language governing permissions and
|
|
13
|
+
# limitations under the License.
|
|
14
|
+
|
|
15
|
+
import logging
|
|
16
|
+
import math
|
|
17
|
+
import sys
|
|
18
|
+
from pathlib import Path
|
|
19
|
+
|
|
20
|
+
import numpy as np
|
|
21
|
+
import pytest
|
|
22
|
+
import ray
|
|
23
|
+
import torch
|
|
24
|
+
|
|
25
|
+
parent_dir = Path(__file__).resolve().parent.parent
|
|
26
|
+
sys.path.append(str(parent_dir))
|
|
27
|
+
|
|
28
|
+
from transfer_queue.controller import TQ_INIT_FIELD_NUM, TransferQueueController # noqa: E402
|
|
29
|
+
from transfer_queue.storage import TransferQueueStorageSimpleUnit # noqa: E402
|
|
30
|
+
|
|
31
|
+
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
|
|
32
|
+
logger = logging.getLogger(__name__)
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
@pytest.fixture(scope="function")
def ray_setup():
    """Start a fresh Ray instance per test and shut it down afterwards."""
    # Make sure no stale Ray session leaks into this test.
    if ray.is_initialized():
        ray.shutdown()
    ray.init(
        ignore_reinit_error=True,
        runtime_env={"env_vars": {"RAY_DEBUG": "1", "RAY_DEDUP_LOGS": "0"}},
        log_to_driver=True,
    )
    yield
    if ray.is_initialized():
        ray.shutdown()
        logger.info("Ray has been shut down completely after test")
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
@pytest.fixture(scope="function")
def setup_teardown_transfer_queue_controller(ray_setup):
    """Create a remote TransferQueueController actor; clear step 0 on teardown."""
    # global_batch_size is used as the offset for the global index to
    # distinguish which global step the data corresponds to.
    params = dict(
        num_storage_units=2,
        global_batch_size=8,
        num_global_batch=2,
        num_n_samples=2,
    )

    tq_controller = TransferQueueController.remote(**params)
    yield (
        tq_controller,
        params["global_batch_size"],
        params["num_global_batch"],
        params["num_n_samples"],
    )
    ray.get(tq_controller.clear.remote(0))
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
@pytest.fixture(scope="function")
def setup_teardown_register_controller_info(setup_teardown_transfer_queue_controller):
    """Spin up storage units and register the controller's ZMQ info with them."""
    tq_controller, global_batch_size, num_global_batch, num_n_samples = setup_teardown_transfer_queue_controller
    total_storage_size = global_batch_size * num_global_batch * num_n_samples
    num_data_storage_units = 2
    per_unit_size = math.ceil(total_storage_size / num_data_storage_units)

    # Create the storage unit actors, splitting capacity evenly.
    data_system_storage_units = {}
    for rank in range(num_data_storage_units):
        data_system_storage_units[rank] = TransferQueueStorageSimpleUnit.remote(storage_size=per_unit_size)
        logger.info(f"TransferQueueStorageSimpleUnit #{rank} has been created.")

    # Register controller info with every storage unit.
    zmq_server_info = ray.get(tq_controller.get_zmq_server_info.remote())
    controller_infos = {zmq_server_info.id: zmq_server_info}

    ray.get(
        [
            unit.register_controller_info.remote(controller_infos)
            for unit in data_system_storage_units.values()
        ]
    )

    yield tq_controller, global_batch_size, num_n_samples, data_system_storage_units
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
class TestTransferQueueController:
    """Unit tests for TransferQueueController index bookkeeping and metadata."""

    @pytest.mark.parametrize("num_n_samples", [1, 2])
    @pytest.mark.parametrize("num_global_batch", [1, 2])
    def test_build_index_storage_mapping(self, num_n_samples, num_global_batch, ray_setup):
        # global_batch_size is used as the offset for the global index to
        # distinguish which global step the data corresponds to.
        global_batch_size = 8
        num_data_storage_units = 2

        self.tq_controller = TransferQueueController.remote(
            num_storage_units=num_data_storage_units,
            global_batch_size=global_batch_size,
            num_global_batch=num_global_batch,
            num_n_samples=num_n_samples,
        )

        storage_mapping, local_mapping = ray.get(self.tq_controller.get_global_index_mapping.remote())

        # Expected layouts keyed by (num_global_batch, num_n_samples).
        # The data of a single GBS is distributed across the storage units,
        # and num_n_samples > 1 widens each sample's slot count.
        expected = {
            (1, 1): ([0] * 4 + [1] * 4, [0, 1, 2, 3] * 2),
            (2, 1): (([0] * 4 + [1] * 4) * 2, [0, 1, 2, 3] * 2 + [4, 5, 6, 7] * 2),
            (1, 2): ([0] * 8 + [1] * 8, list(range(8)) * 2),
            (2, 2): (([0] * 8 + [1] * 8) * 2, list(range(8)) * 2 + list(range(8, 16)) * 2),
        }
        expected_storage, expected_local = expected[(num_global_batch, num_n_samples)]
        assert np.array_equal(storage_mapping, np.array(expected_storage))
        assert np.array_equal(local_mapping, np.array(expected_local))

    def test_update_production_status(self, setup_teardown_transfer_queue_controller):
        tq_controller, global_batch_size, num_global_batch, num_n_samples = setup_teardown_transfer_queue_controller

        total_storage_size = global_batch_size * num_global_batch * num_n_samples
        # Production status starts all-zero and the field-name mapping empty.
        expected_initial = torch.zeros(total_storage_size, TQ_INIT_FIELD_NUM, dtype=torch.int8)
        assert torch.equal(ray.get(tq_controller.get_data_production_status.remote()), expected_initial)
        assert ray.get(tq_controller.get_field_name_mapping.remote()) == {}

        columns_list = ["test_prompts"]
        global_indexes = list(range(global_batch_size * num_n_samples))

        # Mark those rows as produced for the new field.
        tq_controller._update_production_status.remote(global_indexes, columns_list)
        field_mapping = ray.get(tq_controller.get_field_name_mapping.remote())
        assert field_mapping["test_prompts"] == 0

        status = ray.get(tq_controller.get_data_production_status.remote())
        assert status[:, 0][: len(global_indexes)].sum() == len(global_indexes)

    def test_data_consumption_status(self, setup_teardown_transfer_queue_controller):
        tq_controller, global_batch_size, num_global_batch, num_n_samples = setup_teardown_transfer_queue_controller
        total_storage_size = global_batch_size * num_global_batch * num_n_samples

        # No task has consumed anything yet.
        assert ray.get(tq_controller.get_data_consumption_status.remote()) == {}

        # Touching a task's consumption status should materialize a zero vector.
        task_name = "test_task1"
        ray.get(tq_controller._get_consumption_status.remote(task_name))
        consumption = ray.get(tq_controller.get_data_consumption_status.remote())
        assert torch.equal(consumption[task_name], torch.zeros(total_storage_size, dtype=torch.int8))

    def test_get_prompt_metadata(self, setup_teardown_register_controller_info):
        tq_controller, global_batch_size, n_samples, _ = setup_teardown_register_controller_info

        metadata = ray.get(
            tq_controller._get_metadata.remote(
                data_fields=["test_prompts"],
                batch_size=global_batch_size * n_samples,
                global_step=5,
                mode="insert",
            )
        )
        # Reverse the batch and check the index bookkeeping follows suit.
        metadata.reorder(list(range(15, -1, -1)))
        assert metadata.global_indexes == list(range(31, 15, -1))
        assert metadata.local_indexes == list(range(15, 7, -1)) * 2
        storage_ids = metadata.storage_ids
        # The first half of the reversed batch should live in one storage unit.
        assert len(set(storage_ids[: len(storage_ids) // 2])) == 1

    # TODO: Test case where multiple clients concurrently read datameta from a
    # single controller, and each client receives the correct response
|