PyPI - tpu-inference - Versions diffs - 0.11.1.dev202511220812__py3-none-any.whl → 0.13.2.dev20251230__py3-none-any.whl - Mend

tpu-inference 0.11.1.dev202511220812-py3-none-any.whl → 0.13.2.dev20251230-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of tpu-inference might be problematic. Click here for more details.

Files changed (257) hide show

tests/__init__.py +13 -0
tests/core/__init__.py +13 -0
tests/core/test_disagg_utils.py +14 -0
tests/core/test_dp_scheduler.py +650 -768
tests/core/test_init.py +14 -0
tests/distributed/__init__.py +13 -0
tests/distributed/test_distributed_utils.py +120 -0
tests/distributed/test_tpu_connector.py +478 -0
tests/e2e/__init__.py +13 -0
tests/e2e/test_async_scheduler.py +211 -0
tests/e2e/test_data_parallel.py +289 -0
tests/e2e/test_hybrid_kvcache.py +219 -0
tests/e2e/test_local_disagg.py +257 -0
tests/e2e/test_model_loader.py +268 -0
tests/e2e/test_multi_modal_inference.py +111 -0
tests/e2e/test_pipeline_parallel.py +265 -0
tests/e2e/test_runai_model_streamer_loader.py +104 -0
tests/e2e/test_sampling_params.py +269 -0
tests/e2e/test_speculative_decoding.py +311 -0
tests/e2e/test_structured_decoding.py +46 -0
tests/executors/__init__.py +13 -0
tests/executors/test_ray_distributed_executor.py +199 -0
tests/experimental/__init__.py +13 -0
tests/experimental/test_llama3_jax_stashed.py +208 -0
tests/kernels/__init__.py +13 -0
tests/kernels/collectives/__init__.py +13 -0
tests/kernels/collectives/all_gather_matmul_kernel_test.py +69 -0
tests/kernels/fused_moe_v1_test.py +317 -34
tests/kernels/gmm_test.py +205 -0
tests/kernels/mla_v1_test.py +143 -41
tests/kernels/quantized_matmul_kernel_test.py +2 -34
tests/kernels/ragged_kv_cache_update_v2_test.py +14 -0
tests/kernels/ragged_paged_attention_kernel_v2_test.py +14 -0
tests/kernels/ragged_paged_attention_kernel_v3_hd64_test.py +17 -1
tests/kernels/ragged_paged_attention_kernel_v3_test.py +17 -1
tests/layers/__init__.py +13 -0
tests/layers/common/__init__.py +13 -0
tests/layers/common/test_attention_interface.py +156 -0
tests/layers/common/test_quantization.py +149 -0
tests/layers/jax/__init__.py +13 -0
tests/layers/jax/attention/__init__.py +13 -0
tests/layers/jax/attention/test_common_attention.py +103 -0
tests/layers/jax/attention/test_deepseek_v3_attention.py +233 -0
tests/layers/jax/attention/test_llama4_attention.py +135 -0
tests/layers/jax/moe/__init__.py +13 -0
tests/layers/jax/moe/test_deepseek_moe.py +235 -0
tests/layers/jax/sample/__init__.py +13 -0
tests/layers/jax/sample/test_rejection_sampler.py +1624 -0
tests/layers/jax/sample/test_sampling.py +115 -0
tests/layers/jax/sample/test_sampling_metadata.py +254 -0
tests/layers/jax/test_layers.py +155 -0
tests/{test_quantization.py → layers/jax/test_qwix.py} +183 -50
tests/layers/jax/test_rope.py +93 -0
tests/layers/jax/test_sharding.py +159 -0
tests/layers/jax/test_transformer_block.py +152 -0
tests/layers/vllm/__init__.py +13 -0
tests/layers/vllm/test_attention.py +363 -0
tests/layers/vllm/test_awq.py +406 -0
tests/layers/vllm/test_compressed_tensors_moe.py +199 -0
tests/layers/vllm/test_compressed_tensors_w8a8_fp8.py +441 -0
tests/layers/vllm/test_compressed_tensors_w8a8_int8.py +443 -0
tests/layers/vllm/test_fp8.py +17 -0
tests/layers/vllm/test_mxfp4.py +320 -0
tests/layers/vllm/test_unquantized.py +662 -0
tests/layers/vllm/utils.py +87 -0
tests/lora/__init__.py +13 -0
tests/lora/conftest.py +14 -0
tests/lora/test_bgmv.py +14 -0
tests/lora/test_layers.py +26 -6
tests/lora/test_lora.py +15 -1
tests/lora/test_lora_perf.py +67 -0
tests/models/__init__.py +13 -0
tests/models/common/__init__.py +13 -0
tests/models/common/test_model_loader.py +455 -0
tests/models/jax/__init__.py +13 -0
tests/models/jax/test_deepseek_v3.py +401 -0
tests/models/jax/test_llama3.py +184 -0
tests/models/jax/test_llama4.py +298 -0
tests/models/jax/test_llama_eagle3.py +197 -0
tests/models/jax/test_llama_guard_4.py +242 -0
tests/models/jax/test_qwen2.py +172 -0
tests/models/jax/test_qwen2_5_vl.py +605 -0
tests/models/jax/test_qwen3.py +169 -0
tests/models/jax/test_weight_loading.py +180 -0
tests/models/jax/utils/__init__.py +13 -0
tests/models/jax/utils/test_multi_modal_utils.py +212 -0
tests/platforms/__init__.py +13 -0
tests/platforms/test_tpu_platform.py +54 -0
tests/runner/__init__.py +13 -0
tests/runner/test_block_table.py +395 -0
tests/runner/test_input_batch.py +226 -0
tests/runner/test_kv_cache.py +220 -0
tests/runner/test_kv_cache_manager.py +498 -0
tests/runner/test_multimodal_manager.py +429 -0
tests/runner/test_persistent_batch_manager.py +84 -0
tests/runner/test_speculative_decoding_manager.py +368 -0
tests/runner/test_structured_decoding_manager.py +220 -0
tests/runner/test_tpu_runner.py +261 -0
tests/runner/test_tpu_runner_dp.py +1099 -0
tests/runner/test_tpu_runner_mesh.py +200 -0
tests/runner/test_utils.py +411 -0
tests/spec_decode/__init__.py +13 -0
tests/spec_decode/test_eagle3.py +311 -0
tests/test_base.py +14 -0
tests/test_envs.py +110 -12
tests/test_tpu_info.py +14 -0
tests/test_utils.py +2 -45
tests/worker/__init__.py +13 -0
tests/worker/tpu_worker_test.py +414 -0
tpu_inference/__init__.py +14 -0
tpu_inference/core/__init__.py +13 -0
tpu_inference/core/sched/__init__.py +13 -0
tpu_inference/core/sched/dp_scheduler.py +372 -56
tpu_inference/distributed/__init__.py +13 -0
tpu_inference/distributed/jax_parallel_state.py +14 -0
tpu_inference/distributed/tpu_connector.py +15 -10
tpu_inference/distributed/utils.py +56 -4
tpu_inference/envs.py +92 -8
tpu_inference/executors/__init__.py +13 -0
tpu_inference/executors/ray_distributed_executor.py +25 -4
tpu_inference/experimental/__init__.py +13 -0
tpu_inference/experimental/llama3_jax_stashed.py +14 -0
tpu_inference/kernels/__init__.py +13 -0
tpu_inference/kernels/collectives/__init__.py +13 -0
tpu_inference/kernels/collectives/all_gather_matmul.py +12 -6
tpu_inference/kernels/collectives/all_gather_matmul_tuned_block_sizes.py +7 -2
tpu_inference/kernels/flash_attention/__init__.py +13 -0
tpu_inference/kernels/fused_moe/__init__.py +13 -0
tpu_inference/kernels/fused_moe/v1/__init__.py +13 -0
tpu_inference/kernels/fused_moe/v1/kernel.py +807 -230
tpu_inference/kernels/megablox/__init__.py +13 -0
tpu_inference/kernels/megablox/common.py +54 -0
tpu_inference/kernels/megablox/gmm.py +646 -0
tpu_inference/kernels/mla/__init__.py +13 -0
tpu_inference/kernels/mla/v1/__init__.py +13 -0
tpu_inference/kernels/mla/v1/kernel.py +117 -145
tpu_inference/kernels/quantized_matmul/__init__.py +13 -0
tpu_inference/kernels/quantized_matmul/kernel.py +69 -8
tpu_inference/kernels/ragged_paged_attention/__init__.py +13 -0
tpu_inference/kernels/ragged_paged_attention/v2/__init__.py +13 -0
tpu_inference/kernels/ragged_paged_attention/v2/kernel.py +2 -1
tpu_inference/kernels/ragged_paged_attention/v2/ragged_kv_cache_update.py +2 -1
tpu_inference/kernels/ragged_paged_attention/v3/__init__.py +13 -0
tpu_inference/kernels/ragged_paged_attention/v3/kernel.py +194 -101
tpu_inference/kernels/ragged_paged_attention/v3/kernel_hd64.py +218 -137
tpu_inference/kernels/ragged_paged_attention/v3/tuned_block_sizes.py +3817 -3504
tpu_inference/kernels/ragged_paged_attention/v3/tuned_block_sizes_hd64.py +376 -195
tpu_inference/kernels/ragged_paged_attention/v3/util.py +15 -1
tpu_inference/layers/__init__.py +13 -0
tpu_inference/layers/common/__init__.py +13 -0
tpu_inference/layers/common/attention_interface.py +25 -12
tpu_inference/layers/common/attention_metadata.py +14 -0
tpu_inference/layers/common/fused_moe_gmm.py +506 -0
tpu_inference/layers/common/quant_methods.py +15 -0
tpu_inference/layers/common/quantization.py +282 -0
tpu_inference/layers/common/sharding.py +32 -9
tpu_inference/layers/common/utils.py +94 -0
tpu_inference/layers/jax/__init__.py +13 -0
tpu_inference/layers/jax/attention/__init__.py +13 -0
tpu_inference/layers/jax/attention/attention.py +19 -6
tpu_inference/layers/jax/attention/deepseek_v3_attention.py +270 -77
tpu_inference/layers/jax/attention/gpt_oss_attention.py +24 -11
tpu_inference/layers/jax/attention/llama4_attention.py +17 -4
tpu_inference/layers/jax/base.py +14 -0
tpu_inference/layers/jax/constants.py +13 -0
tpu_inference/layers/jax/layers.py +14 -0
tpu_inference/layers/jax/misc.py +14 -0
tpu_inference/layers/jax/moe/__init__.py +13 -0
tpu_inference/layers/jax/moe/deepseek_v3_moe.py +20 -13
tpu_inference/layers/jax/moe/gpt_oss_moe.py +14 -0
tpu_inference/layers/jax/moe/moe.py +43 -3
tpu_inference/layers/jax/pp_utils.py +53 -0
tpu_inference/layers/jax/rope.py +14 -0
tpu_inference/layers/jax/rope_interface.py +14 -0
tpu_inference/layers/jax/sample/__init__.py +13 -0
tpu_inference/layers/jax/sample/rejection_sampler.py +13 -0
tpu_inference/layers/jax/sample/sampling.py +15 -1
tpu_inference/layers/jax/sample/sampling_metadata.py +14 -0
tpu_inference/layers/jax/transformer_block.py +14 -0
tpu_inference/layers/vllm/__init__.py +13 -0
tpu_inference/layers/vllm/attention.py +4 -4
tpu_inference/layers/vllm/fused_moe.py +101 -494
tpu_inference/layers/vllm/linear.py +64 -0
tpu_inference/layers/vllm/process_weights/__init__.py +13 -0
tpu_inference/layers/vllm/{sharding.py → process_weights/cleanup_sharding.py} +24 -15
tpu_inference/layers/vllm/process_weights/fused_moe_weights.py +369 -0
tpu_inference/layers/vllm/process_weights/linear_weights.py +174 -0
tpu_inference/layers/vllm/quantization/__init__.py +19 -3
tpu_inference/layers/vllm/quantization/awq.py +96 -82
tpu_inference/layers/vllm/quantization/compressed_tensors/__init__.py +13 -0
tpu_inference/layers/vllm/quantization/compressed_tensors/compressed_tensors.py +23 -8
tpu_inference/layers/vllm/quantization/compressed_tensors/compressed_tensors_moe.py +172 -176
tpu_inference/layers/vllm/quantization/compressed_tensors/schemes/__init__.py +13 -0
tpu_inference/layers/vllm/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +111 -91
tpu_inference/layers/vllm/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py +79 -43
tpu_inference/layers/vllm/quantization/{common.py → configs.py} +42 -25
tpu_inference/layers/vllm/quantization/fp8.py +119 -0
tpu_inference/layers/vllm/quantization/mxfp4.py +137 -178
tpu_inference/layers/vllm/quantization/unquantized.py +157 -233
tpu_inference/lora/__init__.py +13 -0
tpu_inference/lora/torch_lora_ops.py +8 -13
tpu_inference/models/__init__.py +13 -0
tpu_inference/models/common/__init__.py +13 -0
tpu_inference/models/common/model_loader.py +112 -35
tpu_inference/models/jax/__init__.py +13 -0
tpu_inference/models/jax/deepseek_v3.py +267 -157
tpu_inference/models/jax/gpt_oss.py +26 -10
tpu_inference/models/jax/jax_intermediate_tensor.py +14 -0
tpu_inference/models/jax/llama3.py +99 -36
tpu_inference/models/jax/llama4.py +14 -0
tpu_inference/models/jax/llama_eagle3.py +18 -5
tpu_inference/models/jax/llama_guard_4.py +15 -1
tpu_inference/models/jax/qwen2.py +17 -2
tpu_inference/models/jax/qwen2_5_vl.py +179 -51
tpu_inference/models/jax/qwen3.py +17 -2
tpu_inference/models/jax/utils/__init__.py +13 -0
tpu_inference/models/jax/utils/file_utils.py +14 -0
tpu_inference/models/jax/utils/multi_modal_utils.py +18 -4
tpu_inference/models/jax/utils/qwix/__init__.py +13 -0
tpu_inference/models/jax/utils/{quantization/quantization_utils.py → qwix/qwix_utils.py} +92 -32
tpu_inference/models/jax/utils/weight_utils.py +234 -155
tpu_inference/models/vllm/__init__.py +13 -0
tpu_inference/models/vllm/vllm_model_wrapper.py +32 -8
tpu_inference/models/vllm/vllm_model_wrapper_context.py +14 -0
tpu_inference/platforms/__init__.py +14 -0
tpu_inference/platforms/tpu_platform.py +51 -72
tpu_inference/runner/__init__.py +13 -0
tpu_inference/runner/compilation_manager.py +180 -80
tpu_inference/runner/kv_cache.py +54 -20
tpu_inference/runner/kv_cache_manager.py +55 -33
tpu_inference/runner/lora_utils.py +16 -1
tpu_inference/runner/multimodal_manager.py +16 -2
tpu_inference/runner/persistent_batch_manager.py +54 -2
tpu_inference/runner/speculative_decoding_manager.py +14 -0
tpu_inference/runner/structured_decoding_manager.py +16 -3
tpu_inference/runner/tpu_runner.py +124 -61
tpu_inference/runner/utils.py +2 -2
tpu_inference/spec_decode/__init__.py +13 -0
tpu_inference/spec_decode/jax/__init__.py +13 -0
tpu_inference/spec_decode/jax/eagle3.py +84 -22
tpu_inference/tpu_info.py +14 -0
tpu_inference/utils.py +72 -44
tpu_inference/worker/__init__.py +13 -0
tpu_inference/worker/tpu_worker.py +66 -52
{tpu_inference-0.11.1.dev202511220812.dist-info → tpu_inference-0.13.2.dev20251230.dist-info}/METADATA +8 -9
tpu_inference-0.13.2.dev20251230.dist-info/RECORD +266 -0
tpu_inference/layers/vllm/linear_common.py +0 -186
tpu_inference/models/jax/utils/quantization/__init__.py +0 -0
tpu_inference/models/jax/utils/quantization/configs/fp8_all_modules_w_only.yaml +0 -5
tpu_inference/models/jax/utils/quantization/configs/fp8_default.yaml +0 -6
tpu_inference/models/jax/utils/quantization/configs/int8_all_modules_w_only.yaml +0 -5
tpu_inference/models/jax/utils/quantization/configs/int8_default.yaml +0 -6
tpu_inference/models/jax/utils/quantization/mxfp4_utils.py +0 -105
tpu_inference-0.11.1.dev202511220812.dist-info/RECORD +0 -174
{tpu_inference-0.11.1.dev202511220812.dist-info → tpu_inference-0.13.2.dev20251230.dist-info}/WHEEL +0 -0
{tpu_inference-0.11.1.dev202511220812.dist-info → tpu_inference-0.13.2.dev20251230.dist-info}/licenses/LICENSE +0 -0
{tpu_inference-0.11.1.dev202511220812.dist-info → tpu_inference-0.13.2.dev20251230.dist-info}/top_level.txt +0 -0

tests/core/test_dp_scheduler.py CHANGED Viewed

@@ -1,19 +1,30 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 from unittest.mock import MagicMock, patch
 import pytest
-import torch
 from vllm.config import VllmConfig
-from vllm.v1.core.sched.output import (CachedRequestData, GrammarOutput,
-                                       SchedulerOutput)
+from vllm.v1.core.sched.output import CachedRequestData, SchedulerOutput
 from vllm.v1.core.sched.scheduler import Scheduler
-from vllm.v1.engine import EngineCoreOutputs
 from vllm.v1.kv_cache_interface import KVCacheConfig
 from vllm.v1.metrics.stats import PrefixCacheStats, SchedulerStats
-from vllm.v1.outputs import ModelRunnerOutput
 from vllm.v1.request import Request
 from tpu_inference.core.sched.dp_scheduler import (
-    DPScheduler, DPSchedulerOutput, update_vllm_config_for_dp_scheduler)
+    DPScheduler, DPSchedulerOutput, SchedulerCommand,
+    update_vllm_config_for_dp_scheduler)
 class TestDPScheduler:
@@ -43,387 +54,241 @@ class TestDPScheduler:
         """Create a mock StructuredOutputManager."""
         return MagicMock()
-    def _create_dp_scheduler_with_mocks(self, mock_vllm_config,
-                                        mock_kv_cache_config,
-                                        mock_structured_output_manager,
-                                        **kwargs):
-        """Helper to create a DPScheduler with properly mocked schedulers."""
-        # Create individual mock scheduler instances
-        mock_scheduler_0 = MagicMock()
-        mock_scheduler_1 = MagicMock()
-        # Patch the Scheduler class to return our mock instances
-        with patch.object(
-                mock_vllm_config.scheduler_config, '_original_scheduler_cls',
-                MagicMock(side_effect=[mock_scheduler_0, mock_scheduler_1])):
-            scheduler = DPScheduler(
-                vllm_config=mock_vllm_config,
-                kv_cache_config=mock_kv_cache_config,
-                structured_output_manager=mock_structured_output_manager,
-                block_size=16,
-                **kwargs)
-            return scheduler
-    def test_init_creates_per_rank_schedulers(
+    def test_init_creates_worker_processes(
         self,
         mock_vllm_config,
         mock_kv_cache_config,
         mock_structured_output_manager,
     ):
-        """Test Initialization creates schedulers for each DP rank."""
-        # Mock the scheduler class
-        mock_scheduler_instance = MagicMock()
-        mock_scheduler_cls = MagicMock(return_value=mock_scheduler_instance)
-        with patch.object(mock_vllm_config.scheduler_config,
-                          '_original_scheduler_cls', mock_scheduler_cls):
-            scheduler = DPScheduler(
-                vllm_config=mock_vllm_config,
-                kv_cache_config=mock_kv_cache_config,
-                structured_output_manager=mock_structured_output_manager,
-                block_size=16,
-                log_stats=True,
-            )
-            # Verify schedulers were created
-            assert len(scheduler.schedulers) == 2
-            assert scheduler.dp_size == 2
-            assert scheduler.log_stats is True
-            assert len(scheduler.per_rank_kv_cache_configs) == 2
-            # Verify each rank got the correct config
-            for rank_config in scheduler.per_rank_kv_cache_configs:
-                assert rank_config.num_blocks == 50  # 100 / 2
+        """Test initialization creates worker processes for each DP rank."""
+        with patch(
+                'tpu_inference.core.sched.dp_scheduler._scheduler_worker_process'
+        ):
+            with patch('multiprocessing.get_context') as mock_get_context:
+                # Setup mock context
+                mock_ctx = MagicMock()
+                mock_process = MagicMock()
+                mock_queue = MagicMock()
+                mock_ctx.Queue = MagicMock(return_value=mock_queue)
+                mock_ctx.Process = MagicMock(return_value=mock_process)
+                mock_get_context.return_value = mock_ctx
+                scheduler = DPScheduler(
+                    vllm_config=mock_vllm_config,
+                    kv_cache_config=mock_kv_cache_config,
+                    structured_output_manager=mock_structured_output_manager,
+                    block_size=16,
+                    log_stats=True,
+                )
+                # Verify processes and queues were created
+                assert scheduler.dp_size == 2
+                assert len(scheduler.processes) == 2
+                assert len(scheduler.input_queues) == 2
+                # output_queues is a dict with (rank, command) tuple keys
+                # 2 ranks × 14 commands (SchedulerCommand enum)
+                assert len(scheduler.output_queues) == 28
+                assert scheduler.log_stats is True
+                assert len(scheduler.per_rank_kv_cache_configs) == 2
+                # Verify each rank got the correct config
+                for rank_config in scheduler.per_rank_kv_cache_configs:
+                    assert rank_config.num_blocks == 50  # 100 / 2
+                # Verify processes were started
+                assert mock_process.start.call_count == 2
     def test_get_rank_token_counts(self, mock_vllm_config,
                                    mock_kv_cache_config,
                                    mock_structured_output_manager):
-        """Test _get_rank_token_counts calculates tokens per rank."""
-        scheduler = self._create_dp_scheduler_with_mocks(
-            mock_vllm_config, mock_kv_cache_config,
-            mock_structured_output_manager)
-        # Mock requests on different ranks
-        req1 = MagicMock()
-        req1.num_tokens = 10
-        req2 = MagicMock()
-        req2.num_tokens = 20
-        req3 = MagicMock()
-        req3.num_tokens = 15
-        scheduler.schedulers[0].running = [req1]
-        scheduler.schedulers[0].waiting = [req2]
-        scheduler.schedulers[1].running = [req3]
-        scheduler.schedulers[1].waiting = []
-        rank_tokens = scheduler._get_rank_token_counts()
-        assert rank_tokens[0] == 30  # 10 + 20
-        assert rank_tokens[1] == 15
+        """Test _get_rank_token_counts queries workers and aggregates tokens."""
+        with patch(
+                'tpu_inference.core.sched.dp_scheduler._scheduler_worker_process'
+        ):
+            with patch('multiprocessing.get_context'):
+                scheduler = DPScheduler(
+                    vllm_config=mock_vllm_config,
+                    kv_cache_config=mock_kv_cache_config,
+                    structured_output_manager=mock_structured_output_manager,
+                    block_size=16,
+                )
+                # Mock the queues - need to mock the .get() method to return the value
+                scheduler.input_queues = [MagicMock(), MagicMock()]
+                mock_queue_0 = MagicMock()
+                mock_queue_0.get.return_value = 30
+                mock_queue_1 = MagicMock()
+                mock_queue_1.get.return_value = 15
+                scheduler.output_queues = {
+                    (0, "get_token_count"): mock_queue_0,
+                    (1, "get_token_count"): mock_queue_1,
+                }
+                rank_tokens = scheduler._get_rank_token_counts()
+                # Verify correct commands were sent
+                scheduler.input_queues[0].put.assert_called_with(
+                    (SchedulerCommand.GET_TOKEN_COUNT, None))
+                scheduler.input_queues[1].put.assert_called_with(
+                    (SchedulerCommand.GET_TOKEN_COUNT, None))
+                assert rank_tokens[0] == 30
+                assert rank_tokens[1] == 15
     def test_find_best_rank_with_cache_hit(self, mock_vllm_config,
                                            mock_kv_cache_config,
                                            mock_structured_output_manager):
-        """Test _find_best_rank_for_request with cache hit."""
-        scheduler = self._create_dp_scheduler_with_mocks(
-            mock_vllm_config, mock_kv_cache_config,
-            mock_structured_output_manager)
-        # Mock request
-        mock_request = MagicMock(spec=Request)
-        # Mock KV cache managers with different cache hits
-        scheduler.schedulers[0].kv_cache_manager = MagicMock()
-        scheduler.schedulers[
-            0].kv_cache_manager.get_computed_blocks.return_value = (
-                [],
-                10,
-            )  # 10 cached tokens
-        scheduler.schedulers[1].kv_cache_manager = MagicMock()
-        scheduler.schedulers[
-            1].kv_cache_manager.get_computed_blocks.return_value = (
-                [],
-                20,
-            )  # 20 cached tokens (better)
-        # Mock empty running/waiting queues
-        scheduler.schedulers[0].running = []
-        scheduler.schedulers[0].waiting = []
-        scheduler.schedulers[1].running = []
-        scheduler.schedulers[1].waiting = []
-        rank = scheduler._find_best_rank_for_request(mock_request)
-        # Should choose rank 1 with better cache hit
-        assert rank == 1
+        """Test _find_best_rank_for_request prefers cache hits."""
+        with patch(
+                'tpu_inference.core.sched.dp_scheduler._scheduler_worker_process'
+        ):
+            with patch('multiprocessing.get_context'):
+                scheduler = DPScheduler(
+                    vllm_config=mock_vllm_config,
+                    kv_cache_config=mock_kv_cache_config,
+                    structured_output_manager=mock_structured_output_manager,
+                    block_size=16,
+                )
+                mock_request = MagicMock(spec=Request)
+                # Mock the queues with tuple keys (rank, command)
+                scheduler.input_queues = [MagicMock(), MagicMock()]
+                # Create proper mocks for queue.get() calls
+                mock_queue_get_token_0 = MagicMock()
+                mock_queue_get_token_0.get.return_value = 100
+                mock_queue_get_token_1 = MagicMock()
+                mock_queue_get_token_1.get.return_value = 50
+                mock_queue_computed_0 = MagicMock()
+                mock_queue_computed_0.get.return_value = ([], 10)
+                mock_queue_computed_1 = MagicMock()
+                mock_queue_computed_1.get.return_value = ([], 25)
+                scheduler.output_queues = {
+                    (0, "get_token_count"): mock_queue_get_token_0,
+                    (1, "get_token_count"): mock_queue_get_token_1,
+                    (0, "get_computed_blocks"): mock_queue_computed_0,
+                    (1, "get_computed_blocks"): mock_queue_computed_1,
+                }
+                rank = scheduler._find_best_rank_for_request(mock_request)
+                # Should prefer rank with better cache hit
+                assert rank == 1
     def test_find_best_rank_without_cache_hit(self, mock_vllm_config,
                                               mock_kv_cache_config,
                                               mock_structured_output_manager):
-        """Test _find_best_rank_for_request without cache hit (load balancing)."""
-        scheduler = self._create_dp_scheduler_with_mocks(
-            mock_vllm_config, mock_kv_cache_config,
-            mock_structured_output_manager)
-        # Mock request
-        mock_request = MagicMock(spec=Request)
-        # Mock KV cache managers with no cache hits
-        scheduler.schedulers[0].kv_cache_manager = MagicMock()
-        scheduler.schedulers[
-            0].kv_cache_manager.get_computed_blocks.return_value = ([], 0)
-        scheduler.schedulers[1].kv_cache_manager = MagicMock()
-        scheduler.schedulers[
-            1].kv_cache_manager.get_computed_blocks.return_value = ([], 0)
-        # Mock requests with different token counts
-        req1 = MagicMock()
-        req1.num_tokens = 50
-        req2 = MagicMock()
-        req2.num_tokens = 30
-        scheduler.schedulers[0].running = [req1]
-        scheduler.schedulers[0].waiting = []
-        scheduler.schedulers[1].running = [req2]
-        scheduler.schedulers[1].waiting = []
-        rank = scheduler._find_best_rank_for_request(mock_request)
-        # Should choose rank 1 with fewer tokens
-        assert rank == 1
+        """Test _find_best_rank_for_request uses load balancing without cache hit."""
+        with patch(
+                'tpu_inference.core.sched.dp_scheduler._scheduler_worker_process'
+        ):
+            with patch('multiprocessing.get_context'):
+                scheduler = DPScheduler(
+                    vllm_config=mock_vllm_config,
+                    kv_cache_config=mock_kv_cache_config,
+                    structured_output_manager=mock_structured_output_manager,
+                    block_size=16,
+                )
+                mock_request = MagicMock(spec=Request)
+                # Mock the queues with tuple keys (rank, command)
+                scheduler.input_queues = [MagicMock(), MagicMock()]
+                # Create proper mocks for queue.get() calls
+                mock_queue_get_token_0 = MagicMock()
+                mock_queue_get_token_0.get.return_value = 100
+                mock_queue_get_token_1 = MagicMock()
+                mock_queue_get_token_1.get.return_value = 50
+                mock_queue_computed_0 = MagicMock()
+                mock_queue_computed_0.get.return_value = ([], 0)
+                mock_queue_computed_1 = MagicMock()
+                mock_queue_computed_1.get.return_value = ([], 0)
+                scheduler.output_queues = {
+                    (0, "get_token_count"): mock_queue_get_token_0,
+                    (1, "get_token_count"): mock_queue_get_token_1,
+                    (0, "get_computed_blocks"): mock_queue_computed_0,
+                    (1, "get_computed_blocks"): mock_queue_computed_1,
+                }
+                rank = scheduler._find_best_rank_for_request(mock_request)
+                # Should choose rank with fewer tokens (rank 1)
+                assert rank == 1
     def test_add_request_assigns_to_best_rank(self, mock_vllm_config,
                                               mock_kv_cache_config,
                                               mock_structured_output_manager):
-        """Test add_request assigns and adds request to best rank."""
-        scheduler = self._create_dp_scheduler_with_mocks(
-            mock_vllm_config, mock_kv_cache_config,
-            mock_structured_output_manager)
-        # Mock the rank selection
-        mock_request = MagicMock(spec=Request)
-        mock_request.request_id = "req1"
-        # Mock _find_best_rank_for_request to return rank 1
-        scheduler._find_best_rank_for_request = MagicMock(return_value=1)
-        # Mock schedulers
-        scheduler.schedulers[0].add_request = MagicMock()
-        scheduler.schedulers[1].add_request = MagicMock()
-        scheduler.add_request(mock_request)
-        # Verify request was assigned to rank 1
-        assert scheduler.assigned_dp_rank["req1"] == 1
-        scheduler.schedulers[1].add_request.assert_called_once_with(
-            mock_request)
-        scheduler.schedulers[0].add_request.assert_not_called()
-    def test_schedule_runs_all_schedulers(self, mock_vllm_config,
-                                          mock_kv_cache_config,
-                                          mock_structured_output_manager):
-        """Test schedule runs all schedulers and combines output."""
-        scheduler = self._create_dp_scheduler_with_mocks(
-            mock_vllm_config, mock_kv_cache_config,
-            mock_structured_output_manager)
-        # Mock scheduler outputs
-        mock_output_0 = MagicMock(spec=SchedulerOutput)
-        mock_output_0.scheduled_new_reqs = []
-        mock_output_0.num_scheduled_tokens = {"req1": 10}
-        mock_output_0.total_num_scheduled_tokens = 10
-        mock_output_0.finished_req_ids = set()
-        mock_output_0.scheduled_cached_reqs = CachedRequestData(
-            req_ids=[],
-            resumed_req_ids=[],
-            new_token_ids=[],
-            all_token_ids=[],
-            new_block_ids=[],
-            num_computed_tokens=[],
-            num_output_tokens=[],
-        )
-        mock_output_0.scheduled_spec_decode_tokens = {}
-        mock_output_0.scheduled_encoder_inputs = {}
-        mock_output_0.num_common_prefix_blocks = []
-        mock_output_1 = MagicMock(spec=SchedulerOutput)
-        mock_output_1.scheduled_new_reqs = []
-        mock_output_1.num_scheduled_tokens = {"req2": 20}
-        mock_output_1.total_num_scheduled_tokens = 20
-        mock_output_1.finished_req_ids = set()
-        mock_output_1.scheduled_cached_reqs = CachedRequestData(
-            req_ids=[],
-            resumed_req_ids=[],
-            new_token_ids=[],
-            all_token_ids=[],
-            new_block_ids=[],
-            num_computed_tokens=[],
-            num_output_tokens=[],
-        )
-        mock_output_1.scheduled_spec_decode_tokens = {}
-        mock_output_1.scheduled_encoder_inputs = {}
-        mock_output_1.num_common_prefix_blocks = []
-        scheduler.schedulers[0].schedule = MagicMock(
-            return_value=mock_output_0)
-        scheduler.schedulers[1].schedule = MagicMock(
-            return_value=mock_output_1)
-        scheduler.schedulers[0].running = []
-        scheduler.schedulers[0].waiting = []
-        scheduler.schedulers[1].running = []
-        scheduler.schedulers[1].waiting = []
-        # Assign ranks for requests
-        scheduler.assigned_dp_rank = {"req1": 0, "req2": 1}
-        output = scheduler.schedule()
-        # Verify combined output
-        assert isinstance(output, DPSchedulerOutput)
-        assert output.total_num_scheduled_tokens == 30  # 10 + 20
-        assert "req1" in output.num_scheduled_tokens
-        assert "req2" in output.num_scheduled_tokens
-        assert output.assigned_dp_rank == {"req1": 0, "req2": 1}
-    def test_combine_cached_request_data(self, mock_vllm_config,
-                                         mock_kv_cache_config,
-                                         mock_structured_output_manager):
-        """Test _combine_cached_request_data combines data from all ranks."""
-        mock_scheduler_cls = MagicMock(return_value=MagicMock())
-        with patch.object(mock_vllm_config.scheduler_config,
-                          '_original_scheduler_cls', mock_scheduler_cls):
-            scheduler = DPScheduler(
-                vllm_config=mock_vllm_config,
-                kv_cache_config=mock_kv_cache_config,
-                structured_output_manager=mock_structured_output_manager,
-                block_size=16,
-            )
-            # Create mock rank outputs with different cached request data
-            output_0 = MagicMock(spec=SchedulerOutput)
-            output_0.scheduled_cached_reqs = CachedRequestData(
-                req_ids=["req1"],
-                resumed_req_ids=["req1"],
-                new_token_ids=[[1, 2, 3]],
-                all_token_ids=[[1, 2, 3, 4, 5]],
-                new_block_ids=[[10, 11]],
-                num_computed_tokens=[5],
-                num_output_tokens=[3],
-            )
-            output_1 = MagicMock(spec=SchedulerOutput)
-            output_1.scheduled_cached_reqs = CachedRequestData(
-                req_ids=["req2"],
-                resumed_req_ids=[],
-                new_token_ids=[[6, 7]],
-                all_token_ids=[[6, 7, 8, 9]],
-                new_block_ids=[[20, 21]],
-                num_computed_tokens=[4],
-                num_output_tokens=[2],
-            )
-            rank_outputs = [output_0, output_1]
-            combined = scheduler._combine_cached_request_data(rank_outputs)
-            # Verify combined data
-            assert combined.req_ids == ["req1", "req2"]
-            assert combined.resumed_req_ids == ["req1"]
-            assert combined.new_token_ids == [[1, 2, 3], [6, 7]]
-            assert combined.all_token_ids == [[1, 2, 3, 4, 5], [6, 7, 8, 9]]
-            assert combined.new_block_ids == [[10, 11], [20, 21]]
-            assert combined.num_computed_tokens == [5, 4]
-            assert combined.num_output_tokens == [3, 2]
-    def test_get_grammar_bitmask_with_structured_output(
+        """Test add_request assigns request to best rank."""
+        with patch(
+                'tpu_inference.core.sched.dp_scheduler._scheduler_worker_process'
+        ):
+            with patch('multiprocessing.get_context'):
+                scheduler = DPScheduler(
+                    vllm_config=mock_vllm_config,
+                    kv_cache_config=mock_kv_cache_config,
+                    structured_output_manager=mock_structured_output_manager,
+                    block_size=16,
+                )
+                mock_request = MagicMock(spec=Request)
+                mock_request.request_id = "req1"
+                # Mock the queues with tuple keys
+                scheduler.input_queues = [MagicMock(), MagicMock()]
+                scheduler.output_queues = {
+                    (0, "add_request"): MagicMock(),
+                    (1, "add_request"): MagicMock(),
+                }
+                # Mock _find_best_rank_for_request to return rank 1
+                scheduler._find_best_rank_for_request = MagicMock(
+                    return_value=1)
+                scheduler.add_request(mock_request)
+                # Verify request was assigned to rank 1
+                assert scheduler.assigned_dp_rank["req1"] == 1
+                # Verify ADD_REQUEST command was sent to rank 1
+                scheduler.input_queues[1].put.assert_called_with(
+                    (SchedulerCommand.ADD_REQUEST, mock_request))
+                # Verify we waited for completion
+                scheduler.output_queues[(
+                    1, "add_request")].get.assert_called_once()
+    def test_schedule_sends_commands_and_combines_output(
             self, mock_vllm_config, mock_kv_cache_config,
             mock_structured_output_manager):
-        """Test get_grammar_bitmask combines bitmasks from all ranks."""
-        scheduler = self._create_dp_scheduler_with_mocks(
-            mock_vllm_config, mock_kv_cache_config,
-            mock_structured_output_manager)
-        # Create mock scheduler outputs
-        mock_output_0 = MagicMock()
-        mock_output_1 = MagicMock()
-        # Mock grammar outputs from each rank
-        grammar_output_0 = GrammarOutput(
-            structured_output_request_ids=["req1"],
-            grammar_bitmask=torch.ones((1, 100), dtype=torch.bool),
-        )
-        grammar_output_1 = GrammarOutput(
-            structured_output_request_ids=["req2"],
-            grammar_bitmask=torch.ones((1, 100), dtype=torch.bool) * 0,
-        )
-        scheduler.schedulers[0].get_grammar_bitmask = MagicMock(
-            return_value=grammar_output_0)
-        scheduler.schedulers[1].get_grammar_bitmask = MagicMock(
-            return_value=grammar_output_1)
-        # Cache scheduler outputs
-        scheduler.cached_schedulers_output.append(
-            [mock_output_0, mock_output_1])
-        # Create a DPSchedulerOutput
-        dp_output = DPSchedulerOutput(
-            scheduled_new_reqs=[],
-            scheduled_cached_reqs=CachedRequestData(
-                req_ids=[],
-                resumed_req_ids=[],
-                new_token_ids=[],
-                all_token_ids=[],
-                new_block_ids=[],
-                num_computed_tokens=[],
-                num_output_tokens=[],
-            ),
-            num_scheduled_tokens={},
-            total_num_scheduled_tokens=0,
-            scheduled_spec_decode_tokens={},
-            scheduled_encoder_inputs={},
-            num_common_prefix_blocks=[],
-            finished_req_ids=set(),
-            free_encoder_mm_hashes=set(),
-        )
-        result = scheduler.get_grammar_bitmask(dp_output)
-        assert result is not None
-        assert result.structured_output_request_ids == ["req1", "req2"]
-        assert result.grammar_bitmask.shape == (2, 100)
-    def test_get_grammar_bitmask_no_structured_output(
-            self, mock_vllm_config, mock_kv_cache_config,
-            mock_structured_output_manager):
-        """Test get_grammar_bitmask returns None when no structured output."""
-        mock_scheduler_cls = MagicMock(return_value=MagicMock())
-        with patch.object(mock_vllm_config.scheduler_config,
-                          '_original_scheduler_cls', mock_scheduler_cls):
-            scheduler = DPScheduler(
-                vllm_config=mock_vllm_config,
-                kv_cache_config=mock_kv_cache_config,
-                structured_output_manager=mock_structured_output_manager,
-                block_size=16,
-            )
-            # Mock schedulers returning None
-            scheduler.schedulers[0].get_grammar_bitmask = MagicMock(
-                return_value=None)
-            scheduler.schedulers[1].get_grammar_bitmask = MagicMock(
-                return_value=None)
-            # Cache scheduler outputs
-            mock_output_0 = MagicMock()
-            mock_output_1 = MagicMock()
-            scheduler.cached_schedulers_output.append(
-                [mock_output_0, mock_output_1])
-            dp_output = DPSchedulerOutput(
-                scheduled_new_reqs=[],
-                scheduled_cached_reqs=CachedRequestData(
+        """Test schedule sends SCHEDULE command to all workers and combines output."""
+        with patch(
+                'tpu_inference.core.sched.dp_scheduler._scheduler_worker_process'
+        ):
+            with patch('multiprocessing.get_context'):
+                scheduler = DPScheduler(
+                    vllm_config=mock_vllm_config,
+                    kv_cache_config=mock_kv_cache_config,
+                    structured_output_manager=mock_structured_output_manager,
+                    block_size=16,
+                )
+                # Mock the queues with tuple keys
+                scheduler.input_queues = [MagicMock(), MagicMock()]
+                # Create mock scheduler outputs
+                mock_output_0 = MagicMock(spec=SchedulerOutput)
+                mock_output_0.scheduled_new_reqs = []
+                mock_output_0.num_scheduled_tokens = {"req1": 10}
+                mock_output_0.total_num_scheduled_tokens = 10
+                mock_output_0.finished_req_ids = set()
+                mock_output_0.scheduled_cached_reqs = CachedRequestData(
                     req_ids=[],
                     resumed_req_ids=[],
                     new_token_ids=[],
@@ -431,40 +296,17 @@ class TestDPScheduler:
                     new_block_ids=[],
                     num_computed_tokens=[],
                     num_output_tokens=[],
-                ),
-                num_scheduled_tokens={},
-                total_num_scheduled_tokens=0,
-                scheduled_spec_decode_tokens={},
-                scheduled_encoder_inputs={},
-                num_common_prefix_blocks=[],
-                finished_req_ids=set(),
-                free_encoder_mm_hashes=set(),
-            )
-            result = scheduler.get_grammar_bitmask(dp_output)
-            assert result is None
-    def test_update_from_output_routes_to_schedulers(
-            self, mock_vllm_config, mock_kv_cache_config,
-            mock_structured_output_manager):
-        """Test update_from_output splits output and updates each scheduler."""
-        mock_scheduler_cls = MagicMock(return_value=MagicMock())
-        with patch.object(mock_vllm_config.scheduler_config,
-                          '_original_scheduler_cls', mock_scheduler_cls):
-            scheduler = DPScheduler(
-                vllm_config=mock_vllm_config,
-                kv_cache_config=mock_kv_cache_config,
-                structured_output_manager=mock_structured_output_manager,
-                block_size=16,
-            )
-            # Setup assigned ranks
-            scheduler.assigned_dp_rank = {"req1": 0, "req2": 1, "req3": 0}
-            # Create DPSchedulerOutput
-            dp_output = DPSchedulerOutput(
-                scheduled_new_reqs=[],
-                scheduled_cached_reqs=CachedRequestData(
+                )
+                mock_output_0.scheduled_spec_decode_tokens = {}
+                mock_output_0.scheduled_encoder_inputs = {}
+                mock_output_0.num_common_prefix_blocks = []
+                mock_output_1 = MagicMock(spec=SchedulerOutput)
+                mock_output_1.scheduled_new_reqs = []
+                mock_output_1.num_scheduled_tokens = {"req2": 20}
+                mock_output_1.total_num_scheduled_tokens = 20
+                mock_output_1.finished_req_ids = set()
+                mock_output_1.scheduled_cached_reqs = CachedRequestData(
                     req_ids=[],
                     resumed_req_ids=[],
                     new_token_ids=[],
@@ -472,397 +314,437 @@ class TestDPScheduler:
                     new_block_ids=[],
                     num_computed_tokens=[],
                     num_output_tokens=[],
-                ),
-                num_scheduled_tokens={
-                    "req1": 10,
-                    "req2": 20,
-                    "req3": 15
-                },
-                total_num_scheduled_tokens=45,
-                scheduled_spec_decode_tokens={},
-                scheduled_encoder_inputs={},
-                num_common_prefix_blocks=[],
-                finished_req_ids={"req3"},  # req3 finished
-                free_encoder_mm_hashes=set(),
-                assigned_dp_rank={
-                    "req1": 0,
-                    "req2": 1,
-                    "req3": 0
-                },
-            )
-            # Create mock model runner output
-            model_output = ModelRunnerOutput(
-                req_ids=["req1", "req2", "req3"],
-                req_id_to_index={
-                    "req1": 0,
-                    "req2": 1,
-                    "req3": 2
-                },
-                sampled_token_ids=torch.tensor([100, 200, 300]),
-                logprobs=None,
-                prompt_logprobs_dict={},
-                pooler_output=None,
-                num_nans_in_logits=0,
-                kv_connector_output=None,
-            )
-            # Mock rank scheduler outputs (cached from schedule call)
-            rank_output_0 = MagicMock()
-            rank_output_1 = MagicMock()
-            scheduler.cached_schedulers_output.append(
-                [rank_output_0, rank_output_1])
-            # Mock scheduler update_from_output
-            engine_output_0 = EngineCoreOutputs()
-            engine_output_0.engine_index = 0
-            engine_output_0.outputs = []
-            engine_output_0.finished_requests = {"req3"}
-            engine_output_1 = EngineCoreOutputs()
-            engine_output_1.engine_index = 0
-            engine_output_1.outputs = []
-            engine_output_1.finished_requests = set()
-            scheduler.schedulers[0].update_from_output = MagicMock(
-                return_value={0: engine_output_0})
-            scheduler.schedulers[1].update_from_output = MagicMock(
-                return_value={0: engine_output_1})
-            # Mock make_stats
-            scheduler.make_stats = MagicMock(return_value=None)
-            _ = scheduler.update_from_output(dp_output, model_output)
-            # Verify schedulers were updated
-            assert scheduler.schedulers[0].update_from_output.called
-            assert scheduler.schedulers[1].update_from_output.called
-            # Verify finished request was cleaned up
-            assert "req3" not in scheduler.assigned_dp_rank
-            assert "req1" in scheduler.assigned_dp_rank
-            assert "req2" in scheduler.assigned_dp_rank
-    def test_split_model_output_by_rank(self, mock_vllm_config,
-                                        mock_kv_cache_config,
-                                        mock_structured_output_manager):
-        """Test _split_model_output_by_rank distributes output correctly."""
-        mock_scheduler_cls = MagicMock(return_value=MagicMock())
-        with patch.object(mock_vllm_config.scheduler_config,
-                          '_original_scheduler_cls', mock_scheduler_cls):
-            scheduler = DPScheduler(
-                vllm_config=mock_vllm_config,
-                kv_cache_config=mock_kv_cache_config,
-                structured_output_manager=mock_structured_output_manager,
-                block_size=16,
-            )
-            # Setup assigned ranks
-            scheduler.assigned_dp_rank = {
-                "req1": 0,
-                "req2": 1,
-                "req3": 0,
-                "req4": 1
-            }
-            # Create global model output
-            global_output = ModelRunnerOutput(
-                req_ids=["req1", "req2", "req3", "req4"],
-                req_id_to_index={
-                    "req1": 0,
-                    "req2": 1,
-                    "req3": 2,
-                    "req4": 3
-                },
-                sampled_token_ids=torch.tensor([100, 200, 300, 400]),
-                logprobs=None,
-                prompt_logprobs_dict={},
-                pooler_output=None,
-                num_nans_in_logits=0,
-                kv_connector_output=None,
-            )
-            rank_outputs = scheduler._split_model_output_by_rank(global_output)
-            # Verify split outputs
-            assert len(rank_outputs) == 2
-            assert rank_outputs[0].req_ids == ["req1", "req3"]
-            assert rank_outputs[1].req_ids == ["req2", "req4"]
-    def test_cleanup_finished_requests(self, mock_vllm_config,
-                                       mock_kv_cache_config,
-                                       mock_structured_output_manager):
-        """Test _cleanup_finished_requests removes finished requests."""
-        mock_scheduler_cls = MagicMock(return_value=MagicMock())
-        with patch.object(mock_vllm_config.scheduler_config,
-                          '_original_scheduler_cls', mock_scheduler_cls):
-            scheduler = DPScheduler(
-                vllm_config=mock_vllm_config,
-                kv_cache_config=mock_kv_cache_config,
-                structured_output_manager=mock_structured_output_manager,
-                block_size=16,
-            )
-            # Setup assigned ranks
-            scheduler.assigned_dp_rank = {"req1": 0, "req2": 1, "req3": 0}
-            # Clean up finished requests
-            scheduler._cleanup_finished_requests({"req1", "req3"})
-            # Verify cleanup
-            assert "req1" not in scheduler.assigned_dp_rank
-            assert "req3" not in scheduler.assigned_dp_rank
-            assert "req2" in scheduler.assigned_dp_rank
-    def test_finish_requests_single_and_multiple(
-            self, mock_vllm_config, mock_kv_cache_config,
-            mock_structured_output_manager):
-        """Test finish_requests handles single string and list."""
-        scheduler = self._create_dp_scheduler_with_mocks(
-            mock_vllm_config, mock_kv_cache_config,
-            mock_structured_output_manager)
-        # Setup assigned ranks
-        scheduler.assigned_dp_rank = {"req1": 0, "req2": 1, "req3": 0}
-        # Mock scheduler finish_requests
-        scheduler.schedulers[0].finish_requests = MagicMock()
-        scheduler.schedulers[1].finish_requests = MagicMock()
-        # Test with single string
-        scheduler.finish_requests("req1", finished_status="completed")
-        scheduler.schedulers[0].finish_requests.assert_called_with(["req1"],
-                                                                   "completed")
-        # Test with list
-        scheduler.schedulers[0].finish_requests.reset_mock()
-        scheduler.schedulers[1].finish_requests.reset_mock()
-        scheduler.finish_requests(["req1", "req2"],
-                                  finished_status="completed")
-        scheduler.schedulers[0].finish_requests.assert_called_once_with(
-            ["req1"], "completed")
-        scheduler.schedulers[1].finish_requests.assert_called_once_with(
-            ["req2"], "completed")
+                )
+                mock_output_1.scheduled_spec_decode_tokens = {}
+                mock_output_1.scheduled_encoder_inputs = {}
+                mock_output_1.num_common_prefix_blocks = []
+                # Setup mock queue responses with tuple keys - need to mock .get()
+                mock_queue_0 = MagicMock()
+                mock_queue_0.get.return_value = mock_output_0
+                mock_queue_1 = MagicMock()
+                mock_queue_1.get.return_value = mock_output_1
+                scheduler.output_queues = {
+                    (0, "schedule"): mock_queue_0,
+                    (1, "schedule"): mock_queue_1,
+                }
+                # Setup assigned ranks
+                scheduler.assigned_dp_rank = {"req1": 0, "req2": 1}
+                output = scheduler.schedule()
+                # Verify SCHEDULE commands were sent
+                scheduler.input_queues[0].put.assert_called_with(
+                    (SchedulerCommand.SCHEDULE, None))
+                scheduler.input_queues[1].put.assert_called_with(
+                    (SchedulerCommand.SCHEDULE, None))
+                # Verify combined output
+                assert isinstance(output, DPSchedulerOutput)
+                assert output.total_num_scheduled_tokens == 30
+                assert "req1" in output.num_scheduled_tokens
+                assert "req2" in output.num_scheduled_tokens
+                assert output.assigned_dp_rank == {"req1": 0, "req2": 1}
-    def test_get_num_unfinished_requests(self, mock_vllm_config,
+    def test_combine_cached_request_data(self, mock_vllm_config,
                                          mock_kv_cache_config,
                                          mock_structured_output_manager):
-        """Test get_num_unfinished_requests aggregates across ranks."""
-        scheduler = self._create_dp_scheduler_with_mocks(
-            mock_vllm_config, mock_kv_cache_config,
-            mock_structured_output_manager)
-        scheduler.schedulers[0].get_num_unfinished_requests = MagicMock(
-            return_value=5)
-        scheduler.schedulers[1].get_num_unfinished_requests = MagicMock(
-            return_value=3)
+        """Test _combine_cached_request_data combines data from all ranks."""
+        with patch(
+                'tpu_inference.core.sched.dp_scheduler._scheduler_worker_process'
+        ):
+            with patch('multiprocessing.get_context'):
+                scheduler = DPScheduler(
+                    vllm_config=mock_vllm_config,
+                    kv_cache_config=mock_kv_cache_config,
+                    structured_output_manager=mock_structured_output_manager,
+                    block_size=16,
+                )
+                # Create mock rank outputs
+                output_0 = MagicMock(spec=SchedulerOutput)
+                output_0.scheduled_cached_reqs = CachedRequestData(
+                    req_ids=["req1"],
+                    resumed_req_ids=["req1"],
+                    new_token_ids=[[1, 2, 3]],
+                    all_token_ids=[[1, 2, 3, 4, 5]],
+                    new_block_ids=[[10, 11]],
+                    num_computed_tokens=[5],
+                    num_output_tokens=[3],
+                )
+                output_1 = MagicMock(spec=SchedulerOutput)
+                output_1.scheduled_cached_reqs = CachedRequestData(
+                    req_ids=["req2"],
+                    resumed_req_ids=[],
+                    new_token_ids=[[6, 7]],
+                    all_token_ids=[[6, 7, 8, 9]],
+                    new_block_ids=[[20, 21]],
+                    num_computed_tokens=[4],
+                    num_output_tokens=[2],
+                )
+                combined = scheduler._combine_cached_request_data(
+                    [output_0, output_1])
+                # Verify combined data
+                assert combined.req_ids == ["req1", "req2"]
+                assert combined.resumed_req_ids == ["req1"]
+                assert combined.new_token_ids == [[1, 2, 3], [6, 7]]
+                assert combined.num_computed_tokens == [5, 4]
+                assert combined.num_output_tokens == [3, 2]
+    def test_finish_requests_routes_to_workers(self, mock_vllm_config,
+                                               mock_kv_cache_config,
+                                               mock_structured_output_manager):
+        """Test finish_requests sends FINISH_REQUESTS command to appropriate workers."""
+        with patch(
+                'tpu_inference.core.sched.dp_scheduler._scheduler_worker_process'
+        ):
+            with patch('multiprocessing.get_context'):
+                scheduler = DPScheduler(
+                    vllm_config=mock_vllm_config,
+                    kv_cache_config=mock_kv_cache_config,
+                    structured_output_manager=mock_structured_output_manager,
+                    block_size=16,
+                )
+                scheduler.input_queues = [MagicMock(), MagicMock()]
+                scheduler.output_queues = {
+                    (0, "finish_requests"): MagicMock(),
+                    (1, "finish_requests"): MagicMock(),
+                }
+                scheduler.assigned_dp_rank = {"req1": 0, "req2": 1, "req3": 0}
+                # Test with list of requests
+                scheduler.finish_requests(["req1", "req2"],
+                                          finished_status="completed")
+                # Verify FINISH_REQUESTS commands were sent to correct ranks
+                scheduler.input_queues[0].put.assert_called()
+                scheduler.input_queues[1].put.assert_called()
-        total = scheduler.get_num_unfinished_requests()
-        assert total == 8
+    def test_get_num_unfinished_requests(self, mock_vllm_config,
+                                         mock_kv_cache_config,
+                                         mock_structured_output_manager):
+        """Test get_num_unfinished_requests queries all workers."""
+        with patch(
+                'tpu_inference.core.sched.dp_scheduler._scheduler_worker_process'
+        ):
+            with patch('multiprocessing.get_context'):
+                scheduler = DPScheduler(
+                    vllm_config=mock_vllm_config,
+                    kv_cache_config=mock_kv_cache_config,
+                    structured_output_manager=mock_structured_output_manager,
+                    block_size=16,
+                )
+                scheduler.input_queues = [MagicMock(), MagicMock()]
+                # Create proper mocks for queue.get() calls
+                mock_queue_0 = MagicMock()
+                mock_queue_0.get.return_value = 5
+                mock_queue_1 = MagicMock()
+                mock_queue_1.get.return_value = 3
+                scheduler.output_queues = {
+                    (0, "get_num_unfinished_requests"): mock_queue_0,
+                    (1, "get_num_unfinished_requests"): mock_queue_1,
+                }
+                total = scheduler.get_num_unfinished_requests()
+                # Verify commands were sent
+                scheduler.input_queues[0].put.assert_called_with(
+                    (SchedulerCommand.GET_NUM_UNFINISHED_REQUESTS, None))
+                scheduler.input_queues[1].put.assert_called_with(
+                    (SchedulerCommand.GET_NUM_UNFINISHED_REQUESTS, None))
+                assert total == 8
     def test_has_finished_requests(self, mock_vllm_config,
                                    mock_kv_cache_config,
                                    mock_structured_output_manager):
-        """Test has_finished_requests checks all ranks."""
-        mock_scheduler_cls = MagicMock(return_value=MagicMock())
-        with patch.object(mock_vllm_config.scheduler_config,
-                          '_original_scheduler_cls', mock_scheduler_cls):
-            scheduler = DPScheduler(
-                vllm_config=mock_vllm_config,
-                kv_cache_config=mock_kv_cache_config,
-                structured_output_manager=mock_structured_output_manager,
-                block_size=16,
-            )
-            # Test when one rank has finished requests
-            scheduler.schedulers[0].has_finished_requests = MagicMock(
-                return_value=False)
-            scheduler.schedulers[1].has_finished_requests = MagicMock(
-                return_value=True)
-            assert scheduler.has_finished_requests() is True
-            # Test when no rank has finished requests
-            scheduler.schedulers[1].has_finished_requests = MagicMock(
-                return_value=False)
-            assert scheduler.has_finished_requests() is False
+        """Test has_finished_requests checks all workers."""
+        with patch(
+                'tpu_inference.core.sched.dp_scheduler._scheduler_worker_process'
+        ):
+            with patch('multiprocessing.get_context'):
+                scheduler = DPScheduler(
+                    vllm_config=mock_vllm_config,
+                    kv_cache_config=mock_kv_cache_config,
+                    structured_output_manager=mock_structured_output_manager,
+                    block_size=16,
+                )
+                scheduler.input_queues = [MagicMock(), MagicMock()]
+                # Create proper mocks for queue.get() calls
+                mock_queue_0 = MagicMock()
+                mock_queue_0.get.return_value = False
+                mock_queue_1 = MagicMock()
+                mock_queue_1.get.return_value = True
+                scheduler.output_queues = {
+                    (0, "has_finished_requests"): mock_queue_0,
+                    (1, "has_finished_requests"): mock_queue_1,
+                }
+                result = scheduler.has_finished_requests()
+                assert result is True
+                # Verify commands were sent
+                scheduler.input_queues[0].put.assert_called_with(
+                    (SchedulerCommand.HAS_FINISHED_REQUESTS, None))
+                scheduler.input_queues[1].put.assert_called_with(
+                    (SchedulerCommand.HAS_FINISHED_REQUESTS, None))
     def test_get_request_counts(self, mock_vllm_config, mock_kv_cache_config,
                                 mock_structured_output_manager):
-        """Test get_request_counts aggregates across ranks."""
-        scheduler = self._create_dp_scheduler_with_mocks(
-            mock_vllm_config, mock_kv_cache_config,
-            mock_structured_output_manager)
-        # Mock running and waiting queues
-        scheduler.schedulers[0].running = [MagicMock(),
-                                           MagicMock()]  # 2 running
-        scheduler.schedulers[0].waiting = [MagicMock()]  # 1 waiting
-        scheduler.schedulers[1].running = [MagicMock()]  # 1 running
-        scheduler.schedulers[1].waiting = [
-            MagicMock(), MagicMock(), MagicMock()
-        ]  # 3 waiting
-        running, waiting = scheduler.get_request_counts()
-        assert running == 3  # 2 + 1
-        assert waiting == 4  # 1 + 3
+        """Test get_request_counts queries all workers."""
+        with patch(
+                'tpu_inference.core.sched.dp_scheduler._scheduler_worker_process'
+        ):
+            with patch('multiprocessing.get_context'):
+                scheduler = DPScheduler(
+                    vllm_config=mock_vllm_config,
+                    kv_cache_config=mock_kv_cache_config,
+                    structured_output_manager=mock_structured_output_manager,
+                    block_size=16,
+                )
+                scheduler.input_queues = [MagicMock(), MagicMock()]
+                # Create proper mocks for queue.get() calls
+                mock_queue_0 = MagicMock()
+                mock_queue_0.get.return_value = (2, 1)
+                mock_queue_1 = MagicMock()
+                mock_queue_1.get.return_value = (1, 3)
+                scheduler.output_queues = {
+                    (0, "get_request_counts"): mock_queue_0,
+                    (1, "get_request_counts"): mock_queue_1,
+                }
+                running, waiting = scheduler.get_request_counts()
+                # Verify commands were sent
+                scheduler.input_queues[0].put.assert_called_with(
+                    (SchedulerCommand.GET_REQUEST_COUNTS, None))
+                scheduler.input_queues[1].put.assert_called_with(
+                    (SchedulerCommand.GET_REQUEST_COUNTS, None))
+                assert running == 3
+                assert waiting == 4
     def test_reset_prefix_cache(self, mock_vllm_config, mock_kv_cache_config,
                                 mock_structured_output_manager):
-        """Test reset_prefix_cache resets all ranks."""
-        scheduler = self._create_dp_scheduler_with_mocks(
-            mock_vllm_config, mock_kv_cache_config,
-            mock_structured_output_manager)
-        scheduler.schedulers[0].reset_prefix_cache = MagicMock(
-            return_value=True)
-        scheduler.schedulers[1].reset_prefix_cache = MagicMock(
-            return_value=True)
-        result = scheduler.reset_prefix_cache()
-        assert result is True
-        scheduler.schedulers[0].reset_prefix_cache.assert_called_once()
-        scheduler.schedulers[1].reset_prefix_cache.assert_called_once()
-    def test_make_stats_with_logging_enabled(self, mock_vllm_config,
-                                             mock_kv_cache_config,
-                                             mock_structured_output_manager):
-        """Test make_stats aggregates stats from all ranks."""
-        scheduler = self._create_dp_scheduler_with_mocks(
-            mock_vllm_config,
-            mock_kv_cache_config,
-            mock_structured_output_manager,
-            log_stats=True)
-        # Create mock stats for each rank
-        stats_0 = SchedulerStats(
-            num_running_reqs=3,
-            num_waiting_reqs=2,
-            kv_cache_usage=0.5,
-            prefix_cache_stats=PrefixCacheStats(reset=False,
-                                                requests=10,
-                                                queries=8,
-                                                hits=5),
-            connector_prefix_cache_stats=PrefixCacheStats(reset=False,
-                                                          requests=5,
-                                                          queries=4,
-                                                          hits=2),
-            spec_decoding_stats=None,
-            kv_connector_stats=None,
-        )
-        stats_1 = SchedulerStats(
-            num_running_reqs=4,
-            num_waiting_reqs=1,
-            kv_cache_usage=0.7,
-            prefix_cache_stats=PrefixCacheStats(reset=False,
-                                                requests=15,
-                                                queries=12,
-                                                hits=8),
-            connector_prefix_cache_stats=PrefixCacheStats(reset=False,
-                                                          requests=6,
-                                                          queries=5,
-                                                          hits=3),
-            spec_decoding_stats=None,
-            kv_connector_stats=None,
-        )
-        scheduler.schedulers[0].make_stats = MagicMock(return_value=stats_0)
-        scheduler.schedulers[1].make_stats = MagicMock(return_value=stats_1)
-        combined_stats = scheduler.make_stats()
-        # Verify aggregated stats
-        assert combined_stats.num_running_reqs == 7  # 3 + 4
-        assert combined_stats.num_waiting_reqs == 3  # 2 + 1
-        assert combined_stats.kv_cache_usage == 0.6  # (0.5 + 0.7) / 2
-        # Verify prefix cache stats
-        assert combined_stats.prefix_cache_stats.requests == 25  # 10 + 15
-        assert combined_stats.prefix_cache_stats.queries == 20  # 8 + 12
-        assert combined_stats.prefix_cache_stats.hits == 13  # 5 + 8
-        # Verify connector prefix cache stats
-        assert combined_stats.connector_prefix_cache_stats.requests == 11  # 5 + 6
-        assert combined_stats.connector_prefix_cache_stats.queries == 9  # 4 + 5
-        assert combined_stats.connector_prefix_cache_stats.hits == 5  # 2 + 3
-    def test_make_stats_with_logging_disabled(self, mock_vllm_config,
-                                              mock_kv_cache_config,
-                                              mock_structured_output_manager):
-        """Test make_stats returns None when logging is disabled."""
-        mock_scheduler_cls = MagicMock(return_value=MagicMock())
-        with patch.object(mock_vllm_config.scheduler_config,
-                          '_original_scheduler_cls', mock_scheduler_cls):
-            scheduler = DPScheduler(
-                vllm_config=mock_vllm_config,
-                kv_cache_config=mock_kv_cache_config,
-                structured_output_manager=mock_structured_output_manager,
-                block_size=16,
-                log_stats=False,
-            )
-            stats = scheduler.make_stats()
-            assert stats is None
+        """Test reset_prefix_cache sends command to all workers."""
+        with patch(
+                'tpu_inference.core.sched.dp_scheduler._scheduler_worker_process'
+        ):
+            with patch('multiprocessing.get_context'):
+                scheduler = DPScheduler(
+                    vllm_config=mock_vllm_config,
+                    kv_cache_config=mock_kv_cache_config,
+                    structured_output_manager=mock_structured_output_manager,
+                    block_size=16,
+                )
+                scheduler.input_queues = [MagicMock(), MagicMock()]
+                # Create proper mocks for queue.get() calls
+                mock_queue_0 = MagicMock()
+                mock_queue_0.get.return_value = True
+                mock_queue_1 = MagicMock()
+                mock_queue_1.get.return_value = True
+                scheduler.output_queues = {
+                    (0, "reset_prefix_cache"): mock_queue_0,
+                    (1, "reset_prefix_cache"): mock_queue_1,
+                }
+                result = scheduler.reset_prefix_cache()
+                # Verify commands were sent
+                scheduler.input_queues[0].put.assert_called_with(
+                    (SchedulerCommand.RESET_PREFIX_CACHE, None))
+                scheduler.input_queues[1].put.assert_called_with(
+                    (SchedulerCommand.RESET_PREFIX_CACHE, None))
+                assert result is True
+    def test_make_stats_aggregates_from_workers(
+            self, mock_vllm_config, mock_kv_cache_config,
+            mock_structured_output_manager):
+        """Test make_stats aggregates statistics from all workers."""
+        with patch(
+                'tpu_inference.core.sched.dp_scheduler._scheduler_worker_process'
+        ):
+            with patch('multiprocessing.get_context'):
+                scheduler = DPScheduler(
+                    vllm_config=mock_vllm_config,
+                    kv_cache_config=mock_kv_cache_config,
+                    structured_output_manager=mock_structured_output_manager,
+                    block_size=16,
+                    log_stats=True,
+                )
+                scheduler.input_queues = [MagicMock(), MagicMock()]
+                # Create mock stats
+                stats_0 = SchedulerStats(
+                    num_running_reqs=3,
+                    num_waiting_reqs=2,
+                    kv_cache_usage=0.5,
+                    prefix_cache_stats=PrefixCacheStats(reset=False,
+                                                        requests=10,
+                                                        queries=8,
+                                                        hits=5),
+                    connector_prefix_cache_stats=PrefixCacheStats(reset=False,
+                                                                  requests=5,
+                                                                  queries=4,
+                                                                  hits=2),
+                    spec_decoding_stats=None,
+                    kv_connector_stats=None,
+                )
+                stats_1 = SchedulerStats(
+                    num_running_reqs=4,
+                    num_waiting_reqs=1,
+                    kv_cache_usage=0.7,
+                    prefix_cache_stats=PrefixCacheStats(reset=False,
+                                                        requests=15,
+                                                        queries=12,
+                                                        hits=8),
+                    connector_prefix_cache_stats=PrefixCacheStats(reset=False,
+                                                                  requests=6,
+                                                                  queries=5,
+                                                                  hits=3),
+                    spec_decoding_stats=None,
+                    kv_connector_stats=None,
+                )
+                # Create proper mocks for queue.get() calls
+                mock_queue_0 = MagicMock()
+                mock_queue_0.get.return_value = stats_0
+                mock_queue_1 = MagicMock()
+                mock_queue_1.get.return_value = stats_1
+                scheduler.output_queues = {
+                    (0, "make_stats"): mock_queue_0,
+                    (1, "make_stats"): mock_queue_1,
+                }
+                combined_stats = scheduler.make_stats()
+                # Verify commands were sent
+                scheduler.input_queues[0].put.assert_called_with(
+                    (SchedulerCommand.MAKE_STATS, (None, None)))
+                scheduler.input_queues[1].put.assert_called_with(
+                    (SchedulerCommand.MAKE_STATS, (None, None)))
+                assert combined_stats.num_running_reqs == 7
+                assert combined_stats.num_waiting_reqs == 3
+                assert combined_stats.kv_cache_usage == 0.6
+    def test_make_stats_returns_none_when_disabled(
+            self, mock_vllm_config, mock_kv_cache_config,
+            mock_structured_output_manager):
+        """Test make_stats returns None when logging disabled."""
+        with patch(
+                'tpu_inference.core.sched.dp_scheduler._scheduler_worker_process'
+        ):
+            with patch('multiprocessing.get_context'):
+                scheduler = DPScheduler(
+                    vllm_config=mock_vllm_config,
+                    kv_cache_config=mock_kv_cache_config,
+                    structured_output_manager=mock_structured_output_manager,
+                    block_size=16,
+                    log_stats=False,
+                )
+                stats = scheduler.make_stats()
+                assert stats is None
     def test_update_draft_token_ids(self, mock_vllm_config,
                                     mock_kv_cache_config,
                                     mock_structured_output_manager):
-        """Test update_draft_token_ids routes tokens to correct ranks."""
-        scheduler = self._create_dp_scheduler_with_mocks(
-            mock_vllm_config, mock_kv_cache_config,
-            mock_structured_output_manager)
-        # Setup assigned ranks
-        scheduler.assigned_dp_rank = {"req1": 0, "req2": 1, "req3": 0}
-        # Create mock draft token IDs
-        draft_token_ids = MagicMock()
-        draft_token_ids.req_ids = ["req1", "req2", "req3"]
-        draft_token_ids.draft_token_ids = [
-            [101, 102, 103],
-            [201, 202],
-            [301, 302, 303, 304],
-        ]
-        # Mock scheduler update_draft_token_ids
-        scheduler.schedulers[0].update_draft_token_ids = MagicMock()
-        scheduler.schedulers[1].update_draft_token_ids = MagicMock()
-        scheduler.update_draft_token_ids(draft_token_ids)
-        # Verify each scheduler received correct tokens
-        assert scheduler.schedulers[0].update_draft_token_ids.called
-        assert scheduler.schedulers[1].update_draft_token_ids.called
-        # Check rank 0 got req1 and req3
-        call_args_0 = scheduler.schedulers[0].update_draft_token_ids.call_args[
-            0][0]
-        assert "req1" in call_args_0.req_ids
-        assert "req3" in call_args_0.req_ids
-        # Check rank 1 got req2
-        call_args_1 = scheduler.schedulers[1].update_draft_token_ids.call_args[
-            0][0]
-        assert "req2" in call_args_1.req_ids
+        """Test update_draft_token_ids routes to correct workers."""
+        with patch(
+                'tpu_inference.core.sched.dp_scheduler._scheduler_worker_process'
+        ):
+            with patch('multiprocessing.get_context'):
+                scheduler = DPScheduler(
+                    vllm_config=mock_vllm_config,
+                    kv_cache_config=mock_kv_cache_config,
+                    structured_output_manager=mock_structured_output_manager,
+                    block_size=16,
+                )
+                scheduler.input_queues = [MagicMock(), MagicMock()]
+                scheduler.output_queues = {
+                    (0, "update_draft_token_ids"): MagicMock(),
+                    (1, "update_draft_token_ids"): MagicMock(),
+                }
+                scheduler.assigned_dp_rank = {"req1": 0, "req2": 1, "req3": 0}
+                draft_token_ids = MagicMock()
+                draft_token_ids.req_ids = ["req1", "req2", "req3"]
+                draft_token_ids.draft_token_ids = [
+                    [101, 102, 103],
+                    [201, 202],
+                    [301, 302, 303, 304],
+                ]
+                scheduler.update_draft_token_ids(draft_token_ids)
+                # Verify commands were sent to correct workers
+                scheduler.input_queues[0].put.assert_called()
+                scheduler.input_queues[1].put.assert_called()
     def test_shutdown(self, mock_vllm_config, mock_kv_cache_config,
                       mock_structured_output_manager):
-        """Test shutdown calls shutdown on all schedulers."""
-        scheduler = self._create_dp_scheduler_with_mocks(
-            mock_vllm_config, mock_kv_cache_config,
-            mock_structured_output_manager)
-        scheduler.schedulers[0].shutdown = MagicMock()
-        scheduler.schedulers[1].shutdown = MagicMock()
-        scheduler.shutdown()
-        scheduler.schedulers[0].shutdown.assert_called_once()
-        scheduler.schedulers[1].shutdown.assert_called_once()
+        """Test shutdown sends SHUTDOWN command to all workers."""
+        with patch(
+                'tpu_inference.core.sched.dp_scheduler._scheduler_worker_process'
+        ):
+            with patch('multiprocessing.get_context'):
+                scheduler = DPScheduler(
+                    vllm_config=mock_vllm_config,
+                    kv_cache_config=mock_kv_cache_config,
+                    structured_output_manager=mock_structured_output_manager,
+                    block_size=16,
+                )
+                scheduler.input_queues = [MagicMock(), MagicMock()]
+                scheduler.output_queues = {
+                    (0, "shutdown"): MagicMock(),
+                    (1, "shutdown"): MagicMock(),
+                }
+                mock_process_0 = MagicMock()
+                mock_process_1 = MagicMock()
+                mock_process_0.is_alive = MagicMock(return_value=False)
+                mock_process_1.is_alive = MagicMock(return_value=False)
+                scheduler.processes = [mock_process_0, mock_process_1]
+                scheduler.shutdown()
+                # Verify SHUTDOWN commands were sent
+                scheduler.input_queues[0].put.assert_called_with(
+                    (SchedulerCommand.SHUTDOWN, None))
+                scheduler.input_queues[1].put.assert_called_with(
+                    (SchedulerCommand.SHUTDOWN, None))
+                # Verify processes were joined
+                mock_process_0.join.assert_called()
+                mock_process_1.join.assert_called()
 class TestUpdateVllmConfigForDPScheduler:

tpu-inference 0.11.1.dev202511220812__py3-none-any.whl → 0.13.2.dev20251230__py3-none-any.whl

Potentially problematic release.

tpu-inference 0.11.1.dev202511220812__py3-none-any.whl → 0.13.2.dev20251230__py3-none-any.whl