PyPI - TransferQueue - Versions diffs - 0.1.4.dev0__py3-none-any.whl → 0.1.4.dev1__py3-none-any.whl - Mend

TransferQueue 0.1.4.dev0__py3-none-any.whl → 0.1.4.dev1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (16)

performance_test.py +5 -5
tests/test_metadata.py +147 -0
tests/test_serial_utils_on_cpu.py +53 -0
transfer_queue/controller.py +115 -100
transfer_queue/metadata.py +30 -1
transfer_queue/storage/managers/simple_backend_manager.py +6 -1
transfer_queue/storage/simple_backend.py +28 -21
transfer_queue/utils/perf_utils.py +104 -0
transfer_queue/utils/zmq_utils.py +6 -5
transfer_queue/version/version +1 -1
{transferqueue-0.1.4.dev0.dist-info → transferqueue-0.1.4.dev1.dist-info}/METADATA +1 -1
{transferqueue-0.1.4.dev0.dist-info → transferqueue-0.1.4.dev1.dist-info}/RECORD +16 -14
{transferqueue-0.1.4.dev0.dist-info → transferqueue-0.1.4.dev1.dist-info}/top_level.txt +1 -0
verify_fix.py +109 -0
{transferqueue-0.1.4.dev0.dist-info → transferqueue-0.1.4.dev1.dist-info}/WHEEL +0 -0
{transferqueue-0.1.4.dev0.dist-info → transferqueue-0.1.4.dev1.dist-info}/licenses/LICENSE +0 -0

performance_test.py CHANGED Viewed

@@ -137,8 +137,8 @@ class RayBandwidthTester:
             RemoteDataStore = RemoteDataStoreRemote
         self.remote_store = RemoteDataStore.options(
-            num_cpus=0.01,
-            resources={f"node:{WORKER_NODE_IP}": 0.001}
+            num_cpus=1,
+            resources={f"node:{WORKER_NODE_IP}": 1}
         ).remote()
         logger.info(f"Remote data store created on worker node {WORKER_NODE_IP}")
@@ -232,15 +232,15 @@ class TQBandwidthTester:
             # 限制在远程节点
             for storage_unit_rank in range(self.config.num_data_storage_units):
                 storage_node = SimpleStorageUnit.options(
-                    num_cpus=0.01,
-                    resources={f"node:{WORKER_NODE_IP}": 0.001},
+                    num_cpus=1,
+                    resources={f"node:{WORKER_NODE_IP}": 1},
                     runtime_env={"env_vars": {"OMP_NUM_THREADS": "2"}},
                 ).remote(
                     storage_unit_size=math.ceil(total_storage_size / self.config.num_data_storage_units)
                 )
                 self.data_system_storage_units[storage_unit_rank] = storage_node
         else:
-            storage_placement_group = get_placement_group(self.config.num_data_storage_units, num_cpus_per_actor=0.01)
+            storage_placement_group = get_placement_group(self.config.num_data_storage_units, num_cpus_per_actor=1)
             for storage_unit_rank in range(self.config.num_data_storage_units):
                 storage_node = SimpleStorageUnit.options(
                     placement_group=storage_placement_group,

tests/test_metadata.py CHANGED Viewed

@@ -535,6 +535,153 @@ class TestBatchMeta:
         assert selected_field.production_status == ProductionStatus.READY_FOR_CONSUME
         assert selected_field.name == "field1"
+    def test_batch_meta_select_samples(self):
+        """Example: Select specific samples from a batch."""
+        fields = {
+            "field1": FieldMeta(name="field1", dtype=torch.float32, shape=(2,)),
+            "field2": FieldMeta(name="field2", dtype=torch.int64, shape=(3,)),
+        }
+        samples = [
+            SampleMeta(partition_id="partition_0", global_index=0, fields=fields),
+            SampleMeta(partition_id="partition_0", global_index=1, fields=fields),
+            SampleMeta(partition_id="partition_0", global_index=2, fields=fields),
+            SampleMeta(partition_id="partition_0", global_index=3, fields=fields),
+        ]
+        batch = BatchMeta(samples=samples, extra_info={"test_key": "test_value"})
+        # Select samples at indices [0, 2]
+        selected_batch = batch.select_samples([0, 2])
+        # Check number of samples
+        assert len(selected_batch) == 2
+        # Check global indexes
+        assert selected_batch.global_indexes == [0, 2]
+        # Check fields are preserved
+        for sample in selected_batch.samples:
+            assert "field1" in sample.fields
+            assert "field2" in sample.fields
+        # Original batch is unchanged
+        assert len(batch) == 4
+        # Extra info is preserved
+        assert selected_batch.extra_info["test_key"] == "test_value"
+    def test_batch_meta_select_samples_all_indices(self):
+        """Example: Select all samples using complete index list."""
+        fields = {
+            "test_field": FieldMeta(
+                name="test_field", dtype=torch.float32, shape=(2,), production_status=ProductionStatus.READY_FOR_CONSUME
+            )
+        }
+        samples = [
+            SampleMeta(partition_id="partition_0", global_index=0, fields=fields),
+            SampleMeta(partition_id="partition_0", global_index=1, fields=fields),
+            SampleMeta(partition_id="partition_0", global_index=2, fields=fields),
+        ]
+        batch = BatchMeta(samples=samples, extra_info={"test_key": "test_value"})
+        # Select all samples
+        selected_batch = batch.select_samples([0, 1, 2])
+        # All samples are selected
+        assert len(selected_batch) == 3
+        assert selected_batch.global_indexes == [0, 1, 2]
+        # Extra info is preserved
+        assert selected_batch.extra_info["test_key"] == "test_value"
+    def test_batch_meta_select_samples_single_sample(self):
+        """Example: Select a single sample from batch."""
+        fields = {
+            "test_field": FieldMeta(
+                name="test_field", dtype=torch.float32, shape=(2,), production_status=ProductionStatus.READY_FOR_CONSUME
+            )
+        }
+        samples = [
+            SampleMeta(partition_id="partition_0", global_index=0, fields=fields),
+            SampleMeta(partition_id="partition_0", global_index=1, fields=fields),
+            SampleMeta(partition_id="partition_0", global_index=2, fields=fields),
+        ]
+        batch = BatchMeta(samples=samples)
+        # Select only the middle sample
+        selected_batch = batch.select_samples([1])
+        assert len(selected_batch) == 1
+        assert selected_batch.global_indexes == [1]
+        assert selected_batch.samples[0].batch_index == 0  # New batch index
+    def test_batch_meta_select_samples_empty_list(self):
+        """Example: Select with empty list returns empty batch."""
+        fields = {
+            "test_field": FieldMeta(
+                name="test_field", dtype=torch.float32, shape=(2,), production_status=ProductionStatus.READY_FOR_CONSUME
+            )
+        }
+        samples = [
+            SampleMeta(partition_id="partition_0", global_index=0, fields=fields),
+            SampleMeta(partition_id="partition_0", global_index=1, fields=fields),
+        ]
+        batch = BatchMeta(samples=samples, extra_info={"test_key": "test_value"})
+        # Select with empty list
+        selected_batch = batch.select_samples([])
+        assert len(selected_batch) == 0
+        assert selected_batch.global_indexes == []
+        # Extra info is still preserved
+        assert selected_batch.extra_info["test_key"] == "test_value"
+    def test_batch_meta_select_samples_reverse_order(self):
+        """Example: Select samples in reverse order."""
+        fields = {
+            "test_field": FieldMeta(
+                name="test_field", dtype=torch.float32, shape=(2,), production_status=ProductionStatus.READY_FOR_CONSUME
+            )
+        }
+        samples = [
+            SampleMeta(partition_id="partition_0", global_index=0, fields=fields),
+            SampleMeta(partition_id="partition_0", global_index=1, fields=fields),
+            SampleMeta(partition_id="partition_0", global_index=2, fields=fields),
+        ]
+        batch = BatchMeta(samples=samples)
+        # Select samples in reverse order
+        selected_batch = batch.select_samples([2, 1, 0])
+        assert len(selected_batch) == 3
+        assert selected_batch.global_indexes == [2, 1, 0]
+        # Batch indexes are re-assigned
+        assert selected_batch.samples[0].global_index == 2
+        assert selected_batch.samples[1].global_index == 1
+        assert selected_batch.samples[2].global_index == 0
+    def test_batch_meta_select_samples_with_extra_info(self):
+        """Example: Select samples preserves all extra info types."""
+        fields = {
+            "test_field": FieldMeta(
+                name="test_field", dtype=torch.float32, shape=(2,), production_status=ProductionStatus.READY_FOR_CONSUME
+            )
+        }
+        samples = [
+            SampleMeta(partition_id="partition_0", global_index=0, fields=fields),
+            SampleMeta(partition_id="partition_0", global_index=1, fields=fields),
+        ]
+        batch = BatchMeta(samples=samples)
+        # Add various extra info types
+        batch.extra_info["tensor"] = torch.randn(3, 4)
+        batch.extra_info["string"] = "test_string"
+        batch.extra_info["number"] = 42
+        batch.extra_info["list"] = [1, 2, 3]
+        # Select one sample
+        selected_batch = batch.select_samples([0])
+        # All extra info is preserved
+        assert "tensor" in selected_batch.extra_info
+        assert selected_batch.extra_info["string"] == "test_string"
+        assert selected_batch.extra_info["number"] == 42
+        assert selected_batch.extra_info["list"] == [1, 2, 3]
     def test_batch_meta_extra_info_operations(self):
         """Example: Extra info management operations."""
         fields = {

tests/test_serial_utils_on_cpu.py CHANGED Viewed

@@ -541,3 +541,56 @@ def test_nested_jagged_tensor_serialization(enable_zero_copy):
         # Verify individual components
         for i in range(len(outer_td["nested_jagged1"].unbind())):
             assert torch.allclose(decoded_msg.body["data"]["nested_jagged1"][i], outer_td["nested_jagged1"][i])
+@pytest.mark.parametrize("enable_zero_copy", [True, False])
+def test_single_nested_tensor_serialization(enable_zero_copy):
+    """Test serialization of nested tensor with only one element (edge case for zero-copy)."""
+    with patch("transfer_queue.utils.zmq_utils.TQ_ZERO_COPY_SERIALIZATION", enable_zero_copy):
+        from transfer_queue.utils.zmq_utils import ZMQMessage, ZMQRequestType
+        # Create nested tensor with only one element
+        # This is the critical edge case where a nested tensor with 1 element
+        # must be distinguished from a regular tensor during deserialization
+        single_nested = torch.nested.as_nested_tensor([torch.randn(4, 3)], layout=torch.strided)
+        # For normal tensor, expand to batch_size=1 to match the nested tensor's batch dimension
+        normal_tensor = torch.randn(1, 4, 3)
+        # Create TensorDict with both types
+        td = TensorDict(
+            {
+                "single_nested_tensor": single_nested,
+                "normal_tensor": normal_tensor,
+            },
+            batch_size=1,
+        )
+        msg = ZMQMessage(
+            request_type=ZMQRequestType.PUT_DATA,
+            sender_id="test",
+            receiver_id="test",
+            request_id="test",
+            timestamp=0.0,
+            body={"data": td},
+        )
+        encoded_msg = msg.serialize()
+        decoded_msg = ZMQMessage.deserialize(encoded_msg)
+        # Verify batch sizes
+        assert decoded_msg.body["data"].batch_size == td.batch_size
+        # Verify normal tensor
+        assert torch.allclose(decoded_msg.body["data"]["normal_tensor"], td["normal_tensor"])
+        assert decoded_msg.body["data"]["normal_tensor"].shape == td["normal_tensor"].shape
+        # Verify single nested tensor is properly reconstructed as nested
+        assert decoded_msg.body["data"]["single_nested_tensor"].is_nested
+        assert decoded_msg.body["data"]["single_nested_tensor"].layout == torch.strided
+        assert len(decoded_msg.body["data"]["single_nested_tensor"].unbind()) == 1
+        assert torch.allclose(decoded_msg.body["data"]["single_nested_tensor"][0], td["single_nested_tensor"][0])
+        # Ensure the nested tensor with single element is correctly distinguished from regular tensor
+        # Both should have the same data but different types
+        assert not decoded_msg.body["data"]["normal_tensor"].is_nested
+        assert decoded_msg.body["data"]["single_nested_tensor"].is_nested

transfer_queue/controller.py CHANGED Viewed

@@ -32,6 +32,7 @@ from transfer_queue.metadata import (
     SampleMeta,
 )
 from transfer_queue.sampler import BaseSampler, SequentialSampler
+from transfer_queue.utils.perf_utils import IntervalPerfMonitor
 from transfer_queue.utils.utils import (
     ProductionStatus,
     TransferQueueRole,
@@ -584,7 +585,7 @@ class TransferQueueController:
         self.partitions[partition_id] = DataPartitionStatus(partition_id=partition_id)
-        logger.info(f"Created partition {partition_id} with dynamic capacity")
+        logger.info(f"Created partition {partition_id}")
         return True
     def get_partition(self, partition_id: str) -> Optional[DataPartitionStatus]:
@@ -1008,7 +1009,7 @@ class TransferQueueController:
         poller = zmq.Poller()
         poller.register(self.handshake_socket, zmq.POLLIN)
-        logger.info(f"Dynamic Controller {self.controller_id} started waiting for storage connections...")
+        logger.info(f"Controller {self.controller_id} started waiting for storage connections...")
         while True:
             socks = dict(poller.poll(TQ_CONTROLLER_CONNECTION_CHECK_INTERVAL * 1000))
@@ -1036,23 +1037,23 @@ class TransferQueueController:
                             self._connected_storage_managers.add(storage_manager_id)
                             storage_manager_type = request_msg.body.get("storage_manager_type", "Unknown")
                             logger.info(
-                                f"Dynamic Controller {self.controller_id} received handshake from "
+                                f"Controller {self.controller_id} received handshake from "
                                 f"storage manager {storage_manager_id} (type: {storage_manager_type}). "
                                 f"Total connected: {len(self._connected_storage_managers)}"
                             )
                         else:
                             logger.debug(
-                                f"Dynamic Controller {self.controller_id} received duplicate handshake from "
+                                f"Controller {self.controller_id} received duplicate handshake from "
                                 f"storage manager {storage_manager_id}. Resending ACK."
                             )
                 except Exception as e:
-                    logger.error(f"Dynamic Controller {self.controller_id} error processing handshake: {e}")
+                    logger.error(f"Controller {self.controller_id} error processing handshake: {e}")
     def _start_process_handshake(self):
         """Start the handshake process thread."""
         self.wait_connection_thread = Thread(
-            target=self._wait_connection, name="DynamicTransferQueueControllerWaitConnectionThread", daemon=True
+            target=self._wait_connection, name="TransferQueueControllerWaitConnectionThread", daemon=True
         )
         self.wait_connection_thread.start()
@@ -1060,7 +1061,7 @@ class TransferQueueController:
         """Start the data status update processing thread."""
         self.process_update_data_status_thread = Thread(
             target=self._update_data_status,
-            name="DynamicTransferQueueControllerProcessUpdateDataStatusThread",
+            name="TransferQueueControllerProcessUpdateDataStatusThread",
             daemon=True,
         )
         self.process_update_data_status_thread.start()
@@ -1068,12 +1069,17 @@ class TransferQueueController:
     def _start_process_request(self):
         """Start the request processing thread."""
         self.process_request_thread = Thread(
-            target=self._process_request, name="DynamicTransferQueueControllerProcessRequestThread", daemon=True
+            target=self._process_request, name="TransferQueueControllerProcessRequestThread", daemon=True
         )
         self.process_request_thread.start()
     def _process_request(self):
         """Main request processing loop - adapted for partition-based operations."""
+        logger.info(f"[{self.controller_id}]: start processing requests...")
+        perf_monitor = IntervalPerfMonitor(caller_name=self.controller_id)
         while True:
             messages = self.request_handle_socket.recv_multipart()
             identity = messages.pop(0)
@@ -1081,88 +1087,96 @@ class TransferQueueController:
             request_msg = ZMQMessage.deserialize(serialized_msg)
             if request_msg.request_type == ZMQRequestType.GET_META:
-                params = request_msg.body
-                metadata = self.get_metadata(
-                    data_fields=params["data_fields"],
-                    batch_size=params["batch_size"],
-                    partition_id=params["partition_id"],
-                    mode=params.get("mode", "fetch"),
-                    task_name=params.get("task_name"),
-                    sampling_config=params.get("sampling_config"),
-                )
-                response_msg = ZMQMessage.create(
-                    request_type=ZMQRequestType.GET_META_RESPONSE,
-                    sender_id=self.controller_id,
-                    receiver_id=request_msg.sender_id,
-                    body={"metadata": metadata},
-                )
-            elif request_msg.request_type == ZMQRequestType.GET_CLEAR_META:
-                params = request_msg.body
-                partition_id = params["partition_id"]
-                metadata = self.get_metadata(
-                    data_fields=[],
-                    partition_id=partition_id,
-                    mode="insert",
-                )
-                response_msg = ZMQMessage.create(
-                    request_type=ZMQRequestType.GET_CLEAR_META_RESPONSE,
-                    sender_id=self.controller_id,
-                    receiver_id=request_msg.sender_id,
-                    body={"metadata": metadata},
-                )
-            elif request_msg.request_type == ZMQRequestType.CLEAR_META:
-                params = request_msg.body
-                partition_id = params["partition_id"]
+                with perf_monitor.measure(op_type="GET_META"):
+                    params = request_msg.body
+                    metadata = self.get_metadata(
+                        data_fields=params["data_fields"],
+                        batch_size=params["batch_size"],
+                        partition_id=params["partition_id"],
+                        mode=params.get("mode", "fetch"),
+                        task_name=params.get("task_name"),
+                        sampling_config=params.get("sampling_config"),
+                    )
-                clear_success = self.clear(partition_id)
-                if clear_success:
                     response_msg = ZMQMessage.create(
-                        request_type=ZMQRequestType.CLEAR_META_RESPONSE,
+                        request_type=ZMQRequestType.GET_META_RESPONSE,
                         sender_id=self.controller_id,
                         receiver_id=request_msg.sender_id,
-                        body={"message": f"Clear operation completed by controller {self.controller_id}"},
+                        body={"metadata": metadata},
+                    )
+            elif request_msg.request_type == ZMQRequestType.GET_CLEAR_META:
+                with perf_monitor.measure(op_type="GET_CLEAR_META"):
+                    params = request_msg.body
+                    partition_id = params["partition_id"]
+                    metadata = self.get_metadata(
+                        data_fields=[],
+                        partition_id=partition_id,
+                        mode="insert",
                     )
-                else:
                     response_msg = ZMQMessage.create(
-                        request_type=ZMQRequestType.CLEAR_META_RESPONSE,
+                        request_type=ZMQRequestType.GET_CLEAR_META_RESPONSE,
                         sender_id=self.controller_id,
                         receiver_id=request_msg.sender_id,
-                        body={"error": f"Clear operation failed for partition {partition_id}"},
+                        body={"metadata": metadata},
                     )
+            elif request_msg.request_type == ZMQRequestType.CLEAR_META:
+                with perf_monitor.measure(op_type="CLEAR_META"):
+                    params = request_msg.body
+                    partition_id = params["partition_id"]
+                    clear_success = self.clear(partition_id)
+                    if clear_success:
+                        response_msg = ZMQMessage.create(
+                            request_type=ZMQRequestType.CLEAR_META_RESPONSE,
+                            sender_id=self.controller_id,
+                            receiver_id=request_msg.sender_id,
+                            body={"message": f"Clear operation completed by controller {self.controller_id}"},
+                        )
+                    else:
+                        response_msg = ZMQMessage.create(
+                            request_type=ZMQRequestType.CLEAR_META_RESPONSE,
+                            sender_id=self.controller_id,
+                            receiver_id=request_msg.sender_id,
+                            body={"error": f"Clear operation failed for partition {partition_id}"},
+                        )
             elif request_msg.request_type == ZMQRequestType.CHECK_CONSUMPTION:
-                # Handle consumption status checks
-                params = request_msg.body
-                consumption_status = self.get_consumption_status(params["partition_id"], params["task_name"])
-                sample_filter = params.get("sample_filter")
-                if consumption_status is not None and sample_filter:
-                    batch_status = consumption_status[sample_filter]
-                    consumed = torch.all(batch_status == 1).item()
-                elif consumption_status is not None:
-                    batch_status = consumption_status
-                    consumed = torch.all(batch_status == 1).item()
-                else:
-                    consumed = False
-                response_msg = ZMQMessage.create(
-                    request_type=ZMQRequestType.CONSUMPTION_RESPONSE,
-                    sender_id=self.controller_id,
-                    receiver_id=request_msg.sender_id,
-                    body={
-                        "partition_id": params["partition_id"],
-                        "consumed": consumed,
-                    },
-                )
+                with perf_monitor.measure(op_type="CHECK_CONSUMPTION"):
+                    # Handle consumption status checks
+                    params = request_msg.body
+                    consumption_status = self.get_consumption_status(params["partition_id"], params["task_name"])
+                    sample_filter = params.get("sample_filter")
+                    if consumption_status is not None and sample_filter:
+                        batch_status = consumption_status[sample_filter]
+                        consumed = torch.all(batch_status == 1).item()
+                    elif consumption_status is not None:
+                        batch_status = consumption_status
+                        consumed = torch.all(batch_status == 1).item()
+                    else:
+                        consumed = False
+                    response_msg = ZMQMessage.create(
+                        request_type=ZMQRequestType.CONSUMPTION_RESPONSE,
+                        sender_id=self.controller_id,
+                        receiver_id=request_msg.sender_id,
+                        body={
+                            "partition_id": params["partition_id"],
+                            "consumed": consumed,
+                        },
+                    )
             self.request_handle_socket.send_multipart([identity, *response_msg.serialize()])
     def _update_data_status(self):
         """Process data status update messages from storage units - adapted for partitions."""
+        logger.info(f"[{self.controller_id}]: start receiving update_data_status requests...")
+        perf_monitor = IntervalPerfMonitor(caller_name=self.controller_id)
         while True:
             messages = self.data_status_update_socket.recv_multipart()
             identity = messages.pop(0)
@@ -1170,32 +1184,33 @@ class TransferQueueController:
             request_msg = ZMQMessage.deserialize(serialized_msg)
             if request_msg.request_type == ZMQRequestType.NOTIFY_DATA_UPDATE:
-                message_data = request_msg.body
-                partition_id = message_data.get("partition_id")
-                # Update production status
-                success = self.update_production_status(
-                    partition_id=partition_id,
-                    global_indexes=message_data.get("global_indexes", []),
-                    field_names=message_data.get("fields", []),
-                    dtypes=message_data.get("dtypes", {}),
-                    shapes=message_data.get("shapes", {}),
-                )
+                with perf_monitor.measure(op_type="NOTIFY_DATA_UPDATE"):
+                    message_data = request_msg.body
+                    partition_id = message_data.get("partition_id")
+                    # Update production status
+                    success = self.update_production_status(
+                        partition_id=partition_id,
+                        global_indexes=message_data.get("global_indexes", []),
+                        field_names=message_data.get("fields", []),
+                        dtypes=message_data.get("dtypes", {}),
+                        shapes=message_data.get("shapes", {}),
+                    )
-                if success:
-                    logger.info(f"Updated production status for partition {partition_id}")
-                # Send acknowledgment
-                response_msg = ZMQMessage.create(
-                    request_type=ZMQRequestType.NOTIFY_DATA_UPDATE_ACK,
-                    sender_id=self.controller_id,
-                    body={
-                        "controller_id": self.controller_id,
-                        "partition_id": partition_id,
-                        "success": success,
-                    },
-                )
-                self.data_status_update_socket.send_multipart([identity, *response_msg.serialize()])
+                    if success:
+                        logger.info(f"Updated production status for partition {partition_id}")
+                    # Send acknowledgment
+                    response_msg = ZMQMessage.create(
+                        request_type=ZMQRequestType.NOTIFY_DATA_UPDATE_ACK,
+                        sender_id=self.controller_id,
+                        body={
+                            "controller_id": self.controller_id,
+                            "partition_id": partition_id,
+                            "success": success,
+                        },
+                    )
+                    self.data_status_update_socket.send_multipart([identity, *response_msg.serialize()])
     def get_zmq_server_info(self) -> ZMQServerInfo:
         """Get ZMQ server connection information."""

transfer_queue/metadata.py CHANGED Viewed

@@ -261,6 +261,28 @@ class BatchMeta:
             object.__setattr__(self, "_is_ready", all(sample.is_ready for sample in self.samples))
         return self
+    def select_samples(self, sample_indices: list[int]) -> "BatchMeta":
+        """
+        Select specific samples from this batch.
+        This will construct a new BatchMeta instance containing only the specified samples.
+        Args:
+            sample_indices (list[int]): List of sample indices to retain.
+        Returns:
+            BatchMeta: A new BatchMeta instance containing only the specified samples.
+        """
+        if any(i < 0 or i >= len(self.samples) for i in sample_indices):
+            raise ValueError(f"Sample indices must be in range [0, {len(self.samples)})")
+        selected_samples = [self.samples[i] for i in sample_indices]
+        # construct new BatchMeta instance
+        selected_batch_meta = BatchMeta(samples=selected_samples, extra_info=self.extra_info.copy())
+        return selected_batch_meta
     def select_fields(self, field_names: list[str]) -> "BatchMeta":
         """
         Select specific fields from all samples in this batch.
@@ -287,7 +309,7 @@ class BatchMeta:
     def __getitem__(self, item):
         if isinstance(item, int | np.integer):
             sample_meta = self.samples[item] if self.samples else []
-            return BatchMeta(samples=[sample_meta], extra_info=self.extra_info)
+            return BatchMeta(samples=[sample_meta], extra_info=self.extra_info.copy())
         else:
             raise TypeError(f"Indexing with {type(item)} is not supported now!")
@@ -508,6 +530,13 @@ class BatchMeta:
             extra_info = {}
         return cls(samples=[], extra_info=extra_info)
+    def __str__(self):
+        sample_strs = ", ".join(str(sample) for sample in self.samples)
+        return (
+            f"BatchMeta(size={self.size}, field_names={self.field_names}, is_ready={self.is_ready}, "
+            f"samples=[{sample_strs}], extra_info={self.extra_info})"
+        )
 def _union_fields(fields1: dict[str, FieldMeta], fields2: dict[str, FieldMeta]) -> dict[str, FieldMeta]:
     """Union two sample's fields. If fields overlap, the fields in fields1 will be replaced by fields2."""

transfer_queue/storage/managers/simple_backend_manager.py CHANGED Viewed

@@ -173,6 +173,8 @@ class AsyncSimpleStorageManager(TransferQueueStorageManager):
             metadata: BatchMeta containing storage location information.
         """
+        logger.info(f"{__class__.__name__}: receive put_data request, putting {metadata.size} samples.")
         # group samples by storage unit
         storage_meta_groups = build_storage_meta_groups(
             metadata, self.global_index_storage_unit_mapping, self.global_index_local_index_mapping
@@ -228,7 +230,8 @@ class AsyncSimpleStorageManager(TransferQueueStorageManager):
                     else NonTensorStack(*transfer_data["field_data"][field])
                 )
                 for field in transfer_data["field_data"]
-            }
+            },
+            batch_size=len(local_indexes),
         )
         request_msg = ZMQMessage.create(
@@ -263,6 +266,8 @@ class AsyncSimpleStorageManager(TransferQueueStorageManager):
             TensorDict containing the retrieved data.
         """
+        logger.info(f"{__class__.__name__}: receive get_data request, getting {metadata.size} samples.")
         # group samples by storage unit
         storage_meta_groups = build_storage_meta_groups(
             metadata, self.global_index_storage_unit_mapping, self.global_index_local_index_mapping

transfer_queue/storage/simple_backend.py CHANGED Viewed

@@ -28,14 +28,14 @@ from ray.util import get_node_ip_address
 from tensordict import NonTensorStack, TensorDict
 from transfer_queue.metadata import SampleMeta
+from transfer_queue.utils.perf_utils import IntervalPerfMonitor
 from transfer_queue.utils.utils import TransferQueueRole
 from transfer_queue.utils.zmq_utils import ZMQMessage, ZMQRequestType, ZMQServerInfo, create_zmq_socket, get_free_port
 logger = logging.getLogger(__name__)
 logger.setLevel(os.getenv("TQ_LOGGING_LEVEL", logging.WARNING))
-# ZMQ timeouts (in seconds) and retry configurations
-TQ_STORAGE_POLLER_TIMEOUT = int(os.environ.get("TQ_STORAGE_POLLER_TIMEOUT", 5))
+TQ_STORAGE_POLLER_TIMEOUT = int(os.environ.get("TQ_STORAGE_POLLER_TIMEOUT", 5))  # in seconds
 class StorageUnitData:
@@ -200,7 +200,7 @@ class SimpleStorageUnit:
     def _start_process_put_get(self) -> None:
         """Create a daemon thread and start put/get process."""
         self.process_put_get_thread = Thread(
-            target=self._process_put_get, name=f"StorageUnitProcessPutGetThread-{self.zmq_server_info.id}", daemon=True
+            target=self._process_put_get, name=f"StorageUnitProcessPutGetThread-{self.storage_unit_id}", daemon=True
         )
         self.process_put_get_thread.start()
@@ -209,6 +209,10 @@ class SimpleStorageUnit:
         poller = zmq.Poller()
         poller.register(self.put_get_socket, zmq.POLLIN)
+        logger.info(f"[{self.storage_unit_id}]: start processing put/get requests...")
+        perf_monitor = IntervalPerfMonitor(caller_name=self.storage_unit_id)
         while True:
             socks = dict(poller.poll(TQ_STORAGE_POLLER_TIMEOUT * 1000))
@@ -219,29 +223,32 @@ class SimpleStorageUnit:
                 request_msg = ZMQMessage.deserialize(serialized_msg)
                 operation = request_msg.request_type
                 try:
-                    logger.debug(f"[{self.zmq_server_info.id}]: receive operation: {operation}, message: {request_msg}")
+                    logger.debug(f"[{self.storage_unit_id}]: receive operation: {operation}, message: {request_msg}")
                     if operation == ZMQRequestType.PUT_DATA:
-                        response_msg = self._handle_put(request_msg)
+                        with perf_monitor.measure(op_type="PUT_DATA"):
+                            response_msg = self._handle_put(request_msg)
                     elif operation == ZMQRequestType.GET_DATA:
-                        response_msg = self._handle_get(request_msg)
+                        with perf_monitor.measure(op_type="GET_DATA"):
+                            response_msg = self._handle_get(request_msg)
                     elif operation == ZMQRequestType.CLEAR_DATA:
-                        response_msg = self._handle_clear(request_msg)
+                        with perf_monitor.measure(op_type="CLEAR_DATA"):
+                            response_msg = self._handle_clear(request_msg)
                     else:
                         response_msg = ZMQMessage.create(
                             request_type=ZMQRequestType.PUT_GET_OPERATION_ERROR,
-                            sender_id=self.zmq_server_info.id,
+                            sender_id=self.storage_unit_id,
                             body={
-                                "message": f"Storage unit id #{self.zmq_server_info.id} "
+                                "message": f"Storage unit id #{self.storage_unit_id} "
                                 f"receive invalid operation: {operation}."
                             },
                         )
                 except Exception as e:
                     response_msg = ZMQMessage.create(
                         request_type=ZMQRequestType.PUT_GET_ERROR,
-                        sender_id=self.zmq_server_info.id,
+                        sender_id=self.storage_unit_id,
                         body={
-                            "message": f"Storage unit id #{self.zmq_server_info.id} occur error in processing "
+                            "message": f"Storage unit id #{self.storage_unit_id} occur error in processing "
                             f"put/get/clear request, detail error message: {str(e)}."
                         },
                     )
@@ -268,17 +275,17 @@ class SimpleStorageUnit:
             # After put operation finish, send a message to the client
             response_msg = ZMQMessage.create(
-                request_type=ZMQRequestType.PUT_DATA_RESPONSE, sender_id=self.zmq_server_info.id, body={}
+                request_type=ZMQRequestType.PUT_DATA_RESPONSE, sender_id=self.storage_unit_id, body={}
             )
             return response_msg
         except Exception as e:
             return ZMQMessage.create(
                 request_type=ZMQRequestType.PUT_ERROR,
-                sender_id=self.zmq_server_info.id,
+                sender_id=self.storage_unit_id,
                 body={
                     "message": f"Failed to put data into storage unit id "
-                    f"#{self.zmq_server_info.id}, detail error message: {str(e)}"
+                    f"#{self.storage_unit_id}, detail error message: {str(e)}"
                 },
             )
@@ -300,7 +307,7 @@ class SimpleStorageUnit:
             response_msg = ZMQMessage.create(
                 request_type=ZMQRequestType.GET_DATA_RESPONSE,
-                sender_id=self.zmq_server_info.id,
+                sender_id=self.storage_unit_id,
                 body={
                     "data": result_data,
                 },
@@ -308,9 +315,9 @@ class SimpleStorageUnit:
         except Exception as e:
             response_msg = ZMQMessage.create(
                 request_type=ZMQRequestType.GET_ERROR,
-                sender_id=self.zmq_server_info.id,
+                sender_id=self.storage_unit_id,
                 body={
-                    "message": f"Failed to get data from storage unit id #{self.zmq_server_info.id}, "
+                    "message": f"Failed to get data from storage unit id #{self.storage_unit_id}, "
                     f"detail error message: {str(e)}"
                 },
             )
@@ -333,15 +340,15 @@ class SimpleStorageUnit:
             response_msg = ZMQMessage.create(
                 request_type=ZMQRequestType.CLEAR_DATA_RESPONSE,
-                sender_id=self.zmq_server_info.id,
-                body={"message": f"Clear data in storage unit id #{self.zmq_server_info.id} successfully."},
+                sender_id=self.storage_unit_id,
+                body={"message": f"Clear data in storage unit id #{self.storage_unit_id} successfully."},
             )
         except Exception as e:
             response_msg = ZMQMessage.create(
                 request_type=ZMQRequestType.CLEAR_DATA_ERROR,
-                sender_id=self.zmq_server_info.id,
+                sender_id=self.storage_unit_id,
                 body={
-                    "message": f"Failed to clear data in storage unit id #{self.zmq_server_info.id}, "
+                    "message": f"Failed to clear data in storage unit id #{self.storage_unit_id}, "
                     f"detail error message: {str(e)}"
                 },
             )

transfer_queue/utils/perf_utils.py ADDED Viewed

@@ -0,0 +1,104 @@
+import logging
+import os
+import time
+from collections import defaultdict
+from contextlib import contextmanager
+logger = logging.getLogger(__name__)
+logger.setLevel(os.getenv("TQ_LOGGING_LEVEL", logging.INFO))
+TQ_PERF_LOG_FLUSH_INTERVAL = float(os.environ.get("TQ_PERF_LOG_FLUSH_INTERVAL", 10))  # in seconds
+class IntervalPerfMonitor:
+    """
+    Monitors and logs performance statistics for operations over configurable time intervals.
+    This class is designed to be used in contexts where you want to track the number of successful
+    operations and their processing times, and periodically log summary statistics such as request
+    counts, rates, and timing metrics (average, max, min) per operation type.
+    Usage:
+        monitor = IntervalPerfMonitor("Your Class")
+        with monitor.measure("method_name"):
+            # perform the operation to be timed
+    Statistics are only flushed from within ``measure``: when a measured block ends and at least
+    TQ_PERF_LOG_FLUSH_INTERVAL seconds have passed since the previous flush, the monitor logs the
+    aggregated statistics and resets its counters.
+    Args:
+        caller_name (str): Name of the component or caller using the monitor, included in logs.
+    """
+    def __init__(self, caller_name: str):
+        self.caller_name = caller_name
+        # perf_counter() timestamp of the last flush (or of construction).
+        self.last_flush_time = time.perf_counter()
+        # Number of successfully completed operations per operation type since the last flush.
+        self.success_counts: dict[str, int] = defaultdict(int)
+        # Individual durations (seconds) per operation type since the last flush.
+        self.process_time: dict[str, list[float]] = defaultdict(list)
+    def _flush_logs(self):
+        """
+        Log the aggregated statistics and reset them once the flush interval has elapsed.
+        Does nothing until at least TQ_PERF_LOG_FLUSH_INTERVAL seconds have passed since the
+        previous flush (or since construction). When it does flush, it logs at INFO level:
+          - Total number of successful requests and requests per minute.
+          - Average processing time across all operations.
+          - For each operation type: request count, requests per minute, average, max, and min processing times.
+        After logging, all statistics are reset and the flush timer is updated.
+        """
+        now = time.perf_counter()
+        elapsed = now - self.last_flush_time
+        # Not due yet. The ``elapsed <= 0`` check keeps the per-minute rates below from dividing
+        # by zero when the interval is configured as 0 (or negative) and the clock has not advanced.
+        if elapsed < TQ_PERF_LOG_FLUSH_INTERVAL or elapsed <= 0:
+            return
+        minutes = elapsed / 60
+        total_requests = sum(self.success_counts.values())
+        total_process_time = sum(sum(time_list) for time_list in self.process_time.values())
+        total_avg_process_time = total_process_time / total_requests if total_requests > 0 else 0.0
+        # max/min/avg time for each operation type
+        op_detail_stats = []
+        for op_type, count in self.success_counts.items():
+            times = self.process_time[op_type]
+            if times:
+                op_avg = sum(times) / len(times)
+                op_max = max(times)
+                op_min = min(times)
+            else:
+                op_avg = op_max = op_min = 0.0
+            op_detail_stats.append(
+                f"{op_type}: req_count={count}, req/min={count / minutes:.2f}, "
+                f"avg_time={op_avg:.6f}s, max_time={op_max:.6f}s, min_time={op_min:.6f}s"
+            )
+        log_msg = (
+            f"{self.caller_name}: [Performance] "
+            f"Total success requests: {total_requests}, "
+            f"Total req/min: {total_requests / minutes:.2f}, "
+            f"Total avg process time: {total_avg_process_time:.4f}s; \n"
+            f"Time range: last {minutes:.2f} minutes; \n"
+            f"Per-operation statistics: {'; '.join(op_detail_stats)}"
+        )
+        logger.info(log_msg)
+        # reset counts
+        self.success_counts.clear()
+        self.process_time.clear()
+        self.last_flush_time = now
+    @contextmanager
+    def measure(self, op_type: str):
+        """
+        Context manager that times the wrapped block and records it under ``op_type``.
+        Only blocks that finish without raising are recorded, so ``success_counts`` and the
+        timing statistics describe successful operations only. An exception raised inside the
+        block propagates unchanged and is not counted. A flush is attempted in both cases.
+        Args:
+            op_type (str): Label under which the operation is counted and timed.
+        """
+        start_time = time.perf_counter()
+        try:
+            yield
+            # Reached only when the wrapped block did not raise.
+            cost = time.perf_counter() - start_time
+            self.success_counts[op_type] += 1
+            self.process_time[op_type].append(cost)
+        finally:
+            # try flush logs
+            self._flush_logs()

transfer_queue/utils/zmq_utils.py CHANGED Viewed

@@ -22,6 +22,7 @@ from dataclasses import dataclass
 from typing import Any, Optional, TypeAlias
 from uuid import uuid4
+import numpy as np
 import psutil
 import torch
 import zmq
@@ -162,15 +163,15 @@ class ZMQMessage:
                     tensor_list = tensor.unbind()
                     tensor_count = len(tensor_list)
                     serialized_tensors = [_encoder.encode(inner_tensor) for inner_tensor in tensor_list]
-                    return tensor_count, serialized_tensors
+                    return tensor_count, serialized_tensors  # tensor_count may equal to 1 for single nested tensor
                 else:
-                    return 1, [_encoder.encode(tensor)]
+                    return -1, [_encoder.encode(tensor)]  # use -1 to indicate regular single tensor
             # Use map to process all tensors in parallel-like fashion
             nested_tensor_info_and_serialized_tensors = list(map(process_tensor, tensors))
             # Extract nested_tensor_info and flatten serialized tensors using itertools
-            nested_tensor_info = [info for info, _ in nested_tensor_info_and_serialized_tensors]
+            nested_tensor_info = np.array([info for info, _ in nested_tensor_info_and_serialized_tensors])
             double_layer_serialized_tensors: list[list[bytestr]] = list(
                 itertools.chain.from_iterable(serialized for _, serialized in nested_tensor_info_and_serialized_tensors)
             )
@@ -209,14 +210,14 @@ class ZMQMessage:
                     f"When TQ_ZERO_COPY_SERIALIZATION is enabled, input data should be a list, but got {type(data)}."
                 )
-            tensor_nums = sum(nested_tensor_info)
+            tensor_nums = np.abs(nested_tensor_info).sum()
             if tensor_nums != len(single_tensors):
                 raise ValueError(f"Expecting {tensor_nums} tensors, but got {len(single_tensors)}.")
             tensors = [None] * len(nested_tensor_info)
             current_idx = 0
             for i, tensor_num in enumerate(nested_tensor_info):
-                if tensor_num == 1:
+                if tensor_num == -1:
                     tensors[i] = single_tensors[current_idx]
                     current_idx += 1
                 else:

transfer_queue/version/version CHANGED Viewed

	@@ -1 +1 @@
1	- 0.1.4.~~dev0~~
1	+ 0.1.4.dev1

{transferqueue-0.1.4.dev0.dist-info → transferqueue-0.1.4.dev1.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: TransferQueue
-Version: 0.1.4.dev0
+Version: 0.1.4.dev1
 Summary: TransferQueue: An Asynchronous Streaming Data Management Module
 Author-email: The TransferQueue Team <hanzy19@tsinghua.org.cn>
 License: Apache-2.0

{transferqueue-0.1.4.dev0.dist-info → transferqueue-0.1.4.dev1.dist-info}/RECORD RENAMED Viewed

@@ -1,6 +1,7 @@
-performance_test.py,sha256=tKM1m-IDlcOv003kaVgvrO37r04OMfCSQ7fbrefqV5A,14451
+performance_test.py,sha256=Yl4k-ln5iHAJ5HIfK--Lx-tlEXQcI71Juzve-lBesUI,14434
 serial_profiling_demo.py,sha256=b4GEoF8bSIawQlWOIK2eg9Tgn_-Q8n_KMsfRHLdK1mc,3961
 serial_profiling_demo_nested_non_continues_test.py,sha256=ahXMGDsRc_bwE3lbVE6L-8_guezv6rpG24pKKIxVKF8,3849
+verify_fix.py,sha256=IcZ52jbYoxFb4MRwZMx0f52E8ATFn7GbUp8fmVY31RU,3522
 recipe/simple_use_case/async_demo.py,sha256=wGQCnXAzElE1D6-POwJVhR4UbT3kEJdCnnF-ZL5jPKg,13709
 recipe/simple_use_case/sync_demo.py,sha256=Rfatf205t-gHaxShk2n8LavMVoMLWZpguHRKpKgMDNo,8680
 tests/test_async_simple_storage_manager.py,sha256=qYmSJV6LYSXAzjnlzWZ5HboOhr4B1gMgNwOc5_esVaU,12366
@@ -8,21 +9,21 @@ tests/test_client.py,sha256=Wj1Eswt9qVfL0-192Mwx5_ICaQO7iso0WMJxKpGEvFg,15546
 tests/test_controller.py,sha256=ZcvFCC3jSnNN_fEerjA37RQv0SSO0Xh8vjcL2mvF03o,11084
 tests/test_controller_data_partitions.py,sha256=RQExMFcuXblpUZE_LaFaoncbwX4-YlPcBi69siRfvzY,19363
 tests/test_kv_storage_manager.py,sha256=j45VZ14H8MY8-c4CJI1MRgljoHijaK-tkQAD_-9lmzw,4012
-tests/test_metadata.py,sha256=iPbYZIyDavtG2QMFPqbKXzRDTgNkcssjnnQkKPisim0,26044
+tests/test_metadata.py,sha256=q0X8UuxTmx-JjZlXR8nqr7cm2YbGFHWNbO9kZHgFFYU,32415
 tests/test_samplers.py,sha256=CvYqfmbHEWWa1RyymztCAn0GcitAPOBbfJ4ud1VvO2o,19168
-tests/test_serial_utils_on_cpu.py,sha256=_-I88rjj_uzWWE_tiT_PcfH5_j15j-Q7iB_Gq-dU6B4,21122
+tests/test_serial_utils_on_cpu.py,sha256=iZII_-oVBu3KQ8Afpf7roqET9mIMegrsQ7cwg1XPXQo,23595
 tests/test_simple_storage_unit.py,sha256=29mrQwIkS63D6-b1lNZRhUlZ2nkmjpXtQVGHPvYq_ug,16595
 tests/test_storage_client_factory.py,sha256=lZr7SRY4rpzQB-ZgG7gbjPF2Pcde55nwweumSJT7Yd0,2363
 transfer_queue/__init__.py,sha256=68c0sBfqHPqTa7OdzO4sAZB52XvwtjpwLqP9BWAh4fA,1535
 transfer_queue/client.py,sha256=vH9stFyDCXtLYujdNVYjz815NjXbSUJOmxcFOZLIU1s,25831
-transfer_queue/controller.py,sha256=pq5OwXbdFILOZD3IKR39KF-Y9B8yVLmfsztzoQ2EvMk,49839
-transfer_queue/metadata.py,sha256=zqxAOrFj2uRCC8jPWxkLjOR7hJuiY3AxF2FKYe3oxNU,21515
+transfer_queue/controller.py,sha256=uIhgSEdHQDfmQicqdPXnyXQuJ6Cn5AH3Wyue0SlYdrY,50853
+transfer_queue/metadata.py,sha256=W71tN_-AVfizFDFM-2mIWt1RYvIkN9J1pwgUwBeDPpo,22664
 transfer_queue/sampler/__init__.py,sha256=1oauDy2Dwb5GXhKi7tl5DWAHv8i4t2MQK1S4U36Sy4g,788
 transfer_queue/sampler/base.py,sha256=wFti4dNJb3YArYpGzxA_YDfyUTdTG8wVz6HclPDyZPw,3299
 transfer_queue/sampler/grpo_group_n_sampler.py,sha256=Kq3hGAz8mboBNvw4Dj0P8lP6Qs8TDojx81fxSh57w28,6566
 transfer_queue/sampler/sequential_sampler.py,sha256=TY0eB-uFLUskwoNMgu3AvuF4G2KDkgjOkrlXZHy4Pls,2780
 transfer_queue/storage/__init__.py,sha256=559q9ZOMLLhHXil5-iY3aLPnACoJLnZnKf-E0lvpQdk,978
-transfer_queue/storage/simple_backend.py,sha256=51M5QOXood6D5Sojh135L3jPHvXgzl_TcGLKR02mIiw,18988
+transfer_queue/storage/simple_backend.py,sha256=3bGqalWk0FX7SYjSxEWQeH1j-XAl_6heFWJvRbTgaII,19358
 transfer_queue/storage/clients/__init__.py,sha256=WCa6pcijAixpopvelkZZ9ZRTwF_P3fYMmSEBb04CQZ4,915
 transfer_queue/storage/clients/base.py,sha256=xXd9JBeTmW8tN4wsPocHhW-ERUEzx2YyYHZrtuQQIdI,690
 transfer_queue/storage/clients/factory.py,sha256=lPOG8oMAgaTbrzkogcOULPJnGywa0F-m4vskkOQZhnU,2137
@@ -30,15 +31,16 @@ transfer_queue/storage/clients/yuanrong_client.py,sha256=MskYioa0BHHGRqzYbAg9DCn
 transfer_queue/storage/managers/__init__.py,sha256=bkgaIN4Xa3IF26JJt4BK4bqct0SESdwA32wX5SLnuY0,959
 transfer_queue/storage/managers/base.py,sha256=iSBethCS-pq0tHcRsolXtGD0V_0PtBmInNF4Gi-flfw,21628
 transfer_queue/storage/managers/factory.py,sha256=58kp2mCKz1K8Ea7RWMsWxdDhN3y4ZhgE-G647AKq7-I,1752
-transfer_queue/storage/managers/simple_backend_manager.py,sha256=vTOfc3Y163km-r-lv4fQ_KlUzTTPD3LBUT1E7dCcPcs,27759
+transfer_queue/storage/managers/simple_backend_manager.py,sha256=8F7fW7Z3SDeR22KCZqnFWPyC4qil5w_sp7QXHfvJsfY,28015
 transfer_queue/storage/managers/yuanrong_manager.py,sha256=RsCmVVDNcTaMU9J9vwN1gu0-srdBCWG-W3Q_Si91uio,1250
 transfer_queue/utils/__init__.py,sha256=vki-5RVaRBKxVc6Q7XPQox3VNPio2DvJYvRz0SZtu-w,586
+transfer_queue/utils/perf_utils.py,sha256=WUl8AW9eHS5P9G3zq8g52MgMsyZqTqFXuBkXyBLmBLc,4100
 transfer_queue/utils/serial_utils.py,sha256=J94wrNKVEJtZg22o7GByMs9e_UuwOgRqt1faC5Sy7DY,6048
 transfer_queue/utils/utils.py,sha256=EE5S8YtyLNduohj1egKLHQlG4K2nrN-yAa8klBx9Nro,4846
-transfer_queue/utils/zmq_utils.py,sha256=gsu9xbfxrEXWihxsyW-3hhPcmUIQL0G0eZgJuzjF8gY,10590
-transfer_queue/version/version,sha256=_j45IFlkEXFXEmud89gRme3qSSoOPxu3Gk5uOxHL9eo,11
-transferqueue-0.1.4.dev0.dist-info/licenses/LICENSE,sha256=WNHhf_5RCaeuKWyq_K39vmp9F28LxKsB4SpomwSZ2L0,11357
-transferqueue-0.1.4.dev0.dist-info/METADATA,sha256=hRPOl4vOhcBv86t4fjMicXkZR5UnrmQwyw4ngvETTmw,19502
-transferqueue-0.1.4.dev0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-transferqueue-0.1.4.dev0.dist-info/top_level.txt,sha256=6qfRszcN5Zyq8fWzDWI_wDo9N3Dg8k-8CsXeMLkwuXo,120
-transferqueue-0.1.4.dev0.dist-info/RECORD,,
+transfer_queue/utils/zmq_utils.py,sha256=ecJO1GV_AEAAKnnts-0t7jl19j9jUpBT07N6ZSR8op0,10730
+transfer_queue/version/version,sha256=BEnC3jt-HrAwaHIIQhet48H4zzl05lM_-XlEH_IAuRc,11
+transferqueue-0.1.4.dev1.dist-info/licenses/LICENSE,sha256=WNHhf_5RCaeuKWyq_K39vmp9F28LxKsB4SpomwSZ2L0,11357
+transferqueue-0.1.4.dev1.dist-info/METADATA,sha256=pUiuMKBnGXQUsrtba_G1VAllyhJr64aCOuxyt3UMyQo,19502
+transferqueue-0.1.4.dev1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+transferqueue-0.1.4.dev1.dist-info/top_level.txt,sha256=4MQO9VzdR-IUYG4xAidtwDNiWECIQZ_zx0G5KflYJkE,131
+transferqueue-0.1.4.dev1.dist-info/RECORD,,

{transferqueue-0.1.4.dev0.dist-info → transferqueue-0.1.4.dev1.dist-info}/top_level.txt RENAMED Viewed

@@ -5,3 +5,4 @@ serial_profiling_demo
 serial_profiling_demo_nested_non_continues_test
 tests
 transfer_queue
+verify_fix

verify_fix.py ADDED Viewed

@@ -0,0 +1,109 @@
+#!/usr/bin/env python
+"""
+Verification script: tests the serialization/deserialization fix for single-element nested tensors.
+This script verifies that, when TQ_ZERO_COPY_SERIALIZATION=True,
+a nested tensor containing only one tensor is still correctly distinguished from a regular tensor
+after being serialized.
+"""
+import os
+import torch
+from tensordict import TensorDict
+# Enable zero-copy serialization. This is set before the transfer_queue import below;
+# NOTE(review): presumably zmq_utils reads the flag at import time, so keep this order - confirm.
+os.environ["TQ_ZERO_COPY_SERIALIZATION"] = "True"
+from transfer_queue.utils.zmq_utils import ZMQMessage, ZMQRequestType
+def test_single_nested_tensor_fix():
+    """Round-trip a single-element nested tensor and a regular tensor through ZMQMessage.
+    Builds a TensorDict holding both tensors, serializes it inside a PUT_DATA message,
+    deserializes it again and checks that each tensor kept its kind (nested vs. regular)
+    and its values. Progress and results are printed to stdout.
+    Returns:
+        bool: True when every check passes, False otherwise.
+    """
+    # NOTE: ``torch`` is the module-level import. A function-local ``import torch`` further
+    # down used to make ``torch`` a local name for the whole function, so the first use
+    # below raised UnboundLocalError before any check could run.
+    print("=" * 80)
+    print("测试：单元素nested tensor序列化/反序列化修复")
+    print("=" * 80)
+    # Build a single-element nested tensor and a regular tensor
+    single_nested = torch.nested.as_nested_tensor([torch.randn(4, 3)], layout=torch.strided)
+    normal_tensor = torch.randn(1, 4, 3)
+    print("\n1. 创建测试数据：")
+    print(f"   - 单元素nested tensor: {single_nested.shape}, is_nested={single_nested.is_nested}")
+    print(f"   - 普通tensor: {normal_tensor.shape}, is_nested={normal_tensor.is_nested}")
+    # Wrap both tensors in a TensorDict
+    td = TensorDict(
+        {
+            "single_nested_tensor": single_nested,
+            "normal_tensor": normal_tensor,
+        },
+        batch_size=1,
+    )
+    print("\n2. 创建ZMQMessage并序列化：")
+    msg = ZMQMessage(
+        request_type=ZMQRequestType.PUT_DATA,
+        sender_id="test_sender",
+        receiver_id="test_receiver",
+        body={"data": td},
+    )
+    # Serialize
+    serialized_data = msg.serialize()
+    print(f"   - 序列化完成，数据列表长度: {len(serialized_data)}")
+    # Deserialize
+    print("\n3. 反序列化数据：")
+    decoded_msg = ZMQMessage.deserialize(serialized_data)
+    decoded_nested = decoded_msg.body["data"]["single_nested_tensor"]
+    decoded_normal = decoded_msg.body["data"]["normal_tensor"]
+    print("   - 反序列化完成")
+    print(f"   - decoded_msg.body['data']['single_nested_tensor'].is_nested = {decoded_nested.is_nested}")
+    print(f"   - decoded_msg.body['data']['normal_tensor'].is_nested = {decoded_normal.is_nested}")
+    # Verify the results
+    print("\n4. 验证结果：")
+    success = True
+    # The single-element nested tensor must still be nested
+    if decoded_nested.is_nested:
+        print("   ✓ 单元素nested tensor正确保持为nested类型")
+    else:
+        print("   ✗ 单元素nested tensor错误地变成了普通tensor类型")
+        success = False
+    # The regular tensor must still be a regular tensor
+    if not decoded_normal.is_nested:
+        print("   ✓ 普通tensor正确保持为普通tensor类型")
+    else:
+        print("   ✗ 普通tensor错误地变成了nested类型")
+        success = False
+    # Compare the tensor values
+    if torch.allclose(decoded_nested[0], single_nested[0]):
+        print("   ✓ 单元素nested tensor数据内容正确")
+    else:
+        print("   ✗ 单元素nested tensor数据内容不正确")
+        success = False
+    if torch.allclose(decoded_normal, normal_tensor):
+        print("   ✓ 普通tensor数据内容正确")
+    else:
+        print("   ✗ 普通tensor数据内容不正确")
+        success = False
+    print("\n" + "=" * 80)
+    if success:
+        print("✓ 所有测试通过！修复有效。")
+    else:
+        print("✗ 测试失败！修复可能存在问题。")
+    print("=" * 80)
+    return success
+if __name__ == "__main__":
+    # Propagate the outcome as the process exit status so a failed check is visible to
+    # the shell / CI instead of always exiting 0.
+    raise SystemExit(0 if test_single_nested_tensor_fix() else 1)

{transferqueue-0.1.4.dev0.dist-info → transferqueue-0.1.4.dev1.dist-info}/WHEEL RENAMED Viewed

File without changes

{transferqueue-0.1.4.dev0.dist-info → transferqueue-0.1.4.dev1.dist-info}/licenses/LICENSE RENAMED Viewed

File without changes

TransferQueue 0.1.4.dev0__py3-none-any.whl → 0.1.4.dev1__py3-none-any.whl

TransferQueue 0.1.4.dev0py3-none-any.whl → 0.1.4.dev1py3-none-any.whl