PyPI - tpu-inference - Versions diffs - 0.11.1__py3-none-any.whl - Mend

tpu-inference 0.11.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of tpu-inference might be problematic. Click here for more details.

Files changed (168)

tests/__init__.py +0 -0
tests/core/__init__.py +0 -0
tests/core/test_adapters.py +83 -0
tests/core/test_core_tpu.py +523 -0
tests/core/test_disagg_executor.py +60 -0
tests/core/test_disagg_utils.py +53 -0
tests/core/test_init.py +49 -0
tests/kernels/__init__.py +0 -0
tests/kernels/quantized_matmul_kernel_test.py +191 -0
tests/kernels/ragged_kv_cache_update_v2_test.py +234 -0
tests/kernels/ragged_paged_attention_kernel_v2_test.py +400 -0
tests/kernels/ragged_paged_attention_kernel_v3_test.py +504 -0
tests/lora/__init__.py +0 -0
tests/lora/test_lora.py +123 -0
tests/test_base.py +201 -0
tests/test_quantization.py +836 -0
tests/test_tpu_info.py +120 -0
tests/test_utils.py +218 -0
tests/tpu_backend_test.py +59 -0
tpu_inference/__init__.py +30 -0
tpu_inference/adapters/__init__.py +0 -0
tpu_inference/adapters/vllm_adapters.py +42 -0
tpu_inference/adapters/vllm_config_adapters.py +134 -0
tpu_inference/backend.py +69 -0
tpu_inference/core/__init__.py +0 -0
tpu_inference/core/adapters.py +153 -0
tpu_inference/core/core_tpu.py +776 -0
tpu_inference/core/disagg_executor.py +117 -0
tpu_inference/core/disagg_utils.py +51 -0
tpu_inference/di/__init__.py +0 -0
tpu_inference/di/abstracts.py +28 -0
tpu_inference/di/host.py +76 -0
tpu_inference/di/interfaces.py +51 -0
tpu_inference/distributed/__init__.py +0 -0
tpu_inference/distributed/tpu_connector.py +699 -0
tpu_inference/distributed/utils.py +59 -0
tpu_inference/executors/__init__.py +0 -0
tpu_inference/executors/ray_distributed_executor.py +346 -0
tpu_inference/experimental/__init__.py +0 -0
tpu_inference/experimental/llama3_jax_stashed.py +258 -0
tpu_inference/interfaces/__init__.py +0 -0
tpu_inference/interfaces/cache.py +31 -0
tpu_inference/interfaces/config.py +47 -0
tpu_inference/interfaces/config_parts.py +117 -0
tpu_inference/interfaces/engine.py +51 -0
tpu_inference/interfaces/outputs.py +22 -0
tpu_inference/interfaces/params.py +21 -0
tpu_inference/interfaces/platform.py +74 -0
tpu_inference/interfaces/request.py +39 -0
tpu_inference/interfaces/scheduler.py +31 -0
tpu_inference/kernels/__init__.py +0 -0
tpu_inference/kernels/collectives/__init__.py +0 -0
tpu_inference/kernels/collectives/all_gather_matmul.py +735 -0
tpu_inference/kernels/collectives/all_gather_matmul_tuned_block_sizes.py +60 -0
tpu_inference/kernels/collectives/util.py +47 -0
tpu_inference/kernels/flash_attention/__init__.py +0 -0
tpu_inference/kernels/flash_attention/kernel.py +772 -0
tpu_inference/kernels/quantized_matmul/__init__.py +0 -0
tpu_inference/kernels/quantized_matmul/kernel.py +395 -0
tpu_inference/kernels/quantized_matmul/tuned_block_sizes.py +609 -0
tpu_inference/kernels/quantized_matmul/util.py +58 -0
tpu_inference/kernels/ragged_paged_attention/__init__.py +0 -0
tpu_inference/kernels/ragged_paged_attention/v2/__init__.py +0 -0
tpu_inference/kernels/ragged_paged_attention/v2/kernel.py +875 -0
tpu_inference/kernels/ragged_paged_attention/v2/ragged_kv_cache_update.py +287 -0
tpu_inference/kernels/ragged_paged_attention/v2/tuned_block_sizes.py +1482 -0
tpu_inference/kernels/ragged_paged_attention/v3/__init__.py +0 -0
tpu_inference/kernels/ragged_paged_attention/v3/kernel.py +1447 -0
tpu_inference/kernels/ragged_paged_attention/v3/tuned_block_sizes.py +3834 -0
tpu_inference/kernels/ragged_paged_attention/v3/util.py +47 -0
tpu_inference/layers/__init__.py +0 -0
tpu_inference/layers/common/__init__.py +0 -0
tpu_inference/layers/common/attention_metadata.py +34 -0
tpu_inference/layers/jax/__init__.py +0 -0
tpu_inference/layers/jax/attention/__init__.py +0 -0
tpu_inference/layers/jax/attention/attention.py +254 -0
tpu_inference/layers/jax/attention/deepseek_v3_attention.py +354 -0
tpu_inference/layers/jax/attention/llama4_attention.py +153 -0
tpu_inference/layers/jax/attention_interface.py +356 -0
tpu_inference/layers/jax/base.py +151 -0
tpu_inference/layers/jax/binary_search.py +295 -0
tpu_inference/layers/jax/constants.py +88 -0
tpu_inference/layers/jax/layers.py +301 -0
tpu_inference/layers/jax/misc.py +16 -0
tpu_inference/layers/jax/moe/__init__.py +0 -0
tpu_inference/layers/jax/moe/deepseek_v3_moe.py +608 -0
tpu_inference/layers/jax/moe/moe.py +209 -0
tpu_inference/layers/jax/rope.py +172 -0
tpu_inference/layers/jax/rope_interface.py +214 -0
tpu_inference/layers/jax/sample/__init__.py +0 -0
tpu_inference/layers/jax/sample/rejection_sampler.py +515 -0
tpu_inference/layers/jax/sample/sampling.py +95 -0
tpu_inference/layers/jax/sample/sampling_metadata.py +69 -0
tpu_inference/layers/jax/sharding.py +406 -0
tpu_inference/layers/jax/transformer_block.py +76 -0
tpu_inference/layers/vllm/__init__.py +0 -0
tpu_inference/layers/vllm/attention.py +184 -0
tpu_inference/layers/vllm/fused_moe.py +399 -0
tpu_inference/layers/vllm/linear_common.py +186 -0
tpu_inference/layers/vllm/quantization/__init__.py +34 -0
tpu_inference/layers/vllm/quantization/awq.py +207 -0
tpu_inference/layers/vllm/quantization/common.py +105 -0
tpu_inference/layers/vllm/quantization/compressed_tensors/__init__.py +0 -0
tpu_inference/layers/vllm/quantization/compressed_tensors/compressed_tensors.py +121 -0
tpu_inference/layers/vllm/quantization/compressed_tensors/schemes/__init__.py +0 -0
tpu_inference/layers/vllm/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +208 -0
tpu_inference/layers/vllm/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py +136 -0
tpu_inference/layers/vllm/quantization/unquantized.py +263 -0
tpu_inference/layers/vllm/sharding.py +151 -0
tpu_inference/logger.py +10 -0
tpu_inference/lora/__init__.py +0 -0
tpu_inference/lora/torch_lora_ops.py +103 -0
tpu_inference/lora/torch_punica_tpu.py +308 -0
tpu_inference/mock/__init__.py +0 -0
tpu_inference/mock/vllm_config_utils.py +28 -0
tpu_inference/mock/vllm_envs.py +1233 -0
tpu_inference/mock/vllm_logger.py +212 -0
tpu_inference/mock/vllm_logging_utils.py +15 -0
tpu_inference/models/__init__.py +0 -0
tpu_inference/models/common/__init__.py +0 -0
tpu_inference/models/common/model_loader.py +433 -0
tpu_inference/models/jax/__init__.py +0 -0
tpu_inference/models/jax/deepseek_v3.py +868 -0
tpu_inference/models/jax/llama3.py +366 -0
tpu_inference/models/jax/llama4.py +473 -0
tpu_inference/models/jax/llama_eagle3.py +333 -0
tpu_inference/models/jax/phi3.py +376 -0
tpu_inference/models/jax/qwen2.py +375 -0
tpu_inference/models/jax/qwen2_5_vl.py +976 -0
tpu_inference/models/jax/qwen3.py +302 -0
tpu_inference/models/jax/utils/__init__.py +0 -0
tpu_inference/models/jax/utils/file_utils.py +96 -0
tpu_inference/models/jax/utils/multi_modal_utils.py +164 -0
tpu_inference/models/jax/utils/quantization/__init__.py +0 -0
tpu_inference/models/jax/utils/quantization/quantization_utils.py +588 -0
tpu_inference/models/jax/utils/weight_utils.py +510 -0
tpu_inference/models/vllm/__init__.py +0 -0
tpu_inference/models/vllm/vllm_model_wrapper.py +272 -0
tpu_inference/models/vllm/vllm_model_wrapper_context.py +45 -0
tpu_inference/platforms/__init__.py +2 -0
tpu_inference/platforms/tpu_jax.py +257 -0
tpu_inference/runner/__init__.py +0 -0
tpu_inference/runner/block_table_jax.py +122 -0
tpu_inference/runner/compilation_manager.py +672 -0
tpu_inference/runner/input_batch_jax.py +435 -0
tpu_inference/runner/kv_cache.py +119 -0
tpu_inference/runner/kv_cache_manager.py +460 -0
tpu_inference/runner/lora_utils.py +92 -0
tpu_inference/runner/multimodal_manager.py +208 -0
tpu_inference/runner/persistent_batch_manager.py +244 -0
tpu_inference/runner/speculative_decoding_manager.py +250 -0
tpu_inference/runner/structured_decoding_manager.py +89 -0
tpu_inference/runner/tpu_jax_runner.py +771 -0
tpu_inference/runner/utils.py +426 -0
tpu_inference/spec_decode/__init__.py +0 -0
tpu_inference/spec_decode/jax/__init__.py +0 -0
tpu_inference/spec_decode/jax/eagle3.py +334 -0
tpu_inference/tpu_info.py +77 -0
tpu_inference/utils.py +294 -0
tpu_inference/worker/__init__.py +0 -0
tpu_inference/worker/_temporary_vllm_compat.py +129 -0
tpu_inference/worker/base.py +100 -0
tpu_inference/worker/tpu_worker_jax.py +321 -0
tpu_inference-0.11.1.dist-info/METADATA +101 -0
tpu_inference-0.11.1.dist-info/RECORD +168 -0
tpu_inference-0.11.1.dist-info/WHEEL +5 -0
tpu_inference-0.11.1.dist-info/licenses/LICENSE +201 -0
tpu_inference-0.11.1.dist-info/top_level.txt +2 -0

tests/__init__.py ADDED Viewed

File without changes

tests/core/__init__.py ADDED Viewed

File without changes

tests/core/test_adapters.py ADDED Viewed

@@ -0,0 +1,83 @@
+# SPDX-License-Identifier: Apache-2.0
+import unittest
+from unittest.mock import MagicMock, PropertyMock
+from tpu_inference.core import adapters
+class TestVllmConfigAdapter(unittest.TestCase):
+    def test_config_adapter(self):
+        mock_vllm_config = MagicMock()
+        type(mock_vllm_config).scheduler_config = PropertyMock(
+            return_value="scheduler")
+        type(mock_vllm_config).cache_config = PropertyMock(
+            return_value="cache")
+        adapter = adapters.VllmConfigAdapter(mock_vllm_config)
+        self.assertEqual(adapter.scheduler_config, "scheduler")
+        self.assertEqual(adapter.cache_config, "cache")
+class TestVllmSchedulerAdapter(unittest.TestCase):
+    def test_add_request(self):
+        mock_scheduler = MagicMock()
+        mock_request = MagicMock()
+        mock_request.vllm_request = "vllm_request"
+        adapter = adapters.VllmSchedulerAdapter(mock_scheduler)
+        adapter.add_request(mock_request)
+        mock_scheduler.add_request.assert_called_once_with("vllm_request")
+    def test_getattr(self):
+        mock_scheduler = MagicMock()
+        adapter = adapters.VllmSchedulerAdapter(mock_scheduler)
+        adapter.schedule()
+        mock_scheduler.schedule.assert_called_once()
+class TestVllmEngineAdapter(unittest.TestCase):
+    def test_engine_adapter(self):
+        mock_engine_core = MagicMock()
+        mock_engine_core.scheduler = "scheduler"
+        type(mock_engine_core).model_executor = PropertyMock(
+            return_value="executor")
+        adapter = adapters.VllmEngineAdapter(mock_engine_core)
+        self.assertIsInstance(adapter.scheduler, adapters.VllmSchedulerAdapter)
+        self.assertEqual(adapter.model_executor, "executor")
+        adapter.execute_model_with_error_logging("arg1", kwarg1="kwarg1")
+        mock_engine_core.execute_model_with_error_logging.assert_called_once_with(
+            "arg1", kwarg1="kwarg1")
+        adapter.shutdown()
+        mock_engine_core.shutdown.assert_called_once()
+class TestVllmRequestAdapter(unittest.TestCase):
+    def test_request_adapter(self):
+        mock_vllm_request = MagicMock()
+        type(mock_vllm_request).request_id = PropertyMock(return_value="123")
+        # Mock properties that can be written to by setting them as attributes
+        # on the mock object.
+        mock_vllm_request.num_computed_tokens = 10
+        mock_vllm_request.status = "COMPLETED"
+        adapter = adapters.VllmRequestAdapter(mock_vllm_request)
+        self.assertEqual(adapter.vllm_request, mock_vllm_request)
+        self.assertEqual(adapter.request_id, "123")
+        self.assertEqual(adapter.num_computed_tokens, 10)
+        self.assertEqual(adapter.status, "COMPLETED")
+        adapter.num_computed_tokens = 20
+        self.assertEqual(mock_vllm_request.num_computed_tokens, 20)
+        adapter.status = "RUNNING"
+        self.assertEqual(mock_vllm_request.status, "RUNNING")

tests/core/test_core_tpu.py ADDED Viewed

@@ -0,0 +1,523 @@
+# SPDX-License-Identifier: Apache-2.0
+import unittest
+from unittest.mock import MagicMock, patch
+from vllm.config import ParallelConfig, VllmConfig
+from vllm.v1.engine import EngineCoreRequest, EngineCoreRequestType
+from vllm.v1.executor.abstract import Executor
+from vllm.v1.request import Request
+from tpu_inference.core.adapters import (VllmConfigAdapter, VllmEngineAdapter,
+                                         VllmRequestAdapter)
+from tpu_inference.core.core_tpu import (DisaggEngineCore,
+                                         DisaggEngineCoreProc,
+                                         _DisaggOrchestrator)
+from tpu_inference.interfaces.config import IConfig
+from tpu_inference.interfaces.engine import IEngineCore
+class TestDisaggEngineCore(unittest.TestCase):
+    def setUp(self):
+        # Patch disagg_utils to control slice configuration.
+        self.mock_disagg_utils_patcher = patch(
+            'tpu_inference.core.core_tpu.disagg_utils')
+        self.mock_disagg_utils = self.mock_disagg_utils_patcher.start()
+        self.mock_disagg_utils.get_prefill_slices.return_value = (
+            4, )  # One prefill engine
+        self.mock_disagg_utils.get_decode_slices.return_value = (
+            2, )  # One decode engine
+        self.addCleanup(self.mock_disagg_utils_patcher.stop)
+        # Patch the orchestrator to test the adapter in isolation
+        self.mock_orchestrator_patcher = patch(
+            'tpu_inference.core.core_tpu._DisaggOrchestrator')
+        self.mock_orchestrator = self.mock_orchestrator_patcher.start()
+        self.addCleanup(self.mock_orchestrator_patcher.stop)
+        # Patch vLLMEngineCore to avoid its complex initialization.
+        self.mock_engine_core_patcher = patch(
+            'tpu_inference.core.core_tpu.vLLMEngineCore')
+        self.mock_vLLMEngineCore = self.mock_engine_core_patcher.start()
+        self.addCleanup(self.mock_engine_core_patcher.stop)
+        # Mock jax.devices
+        self.mock_jax_devices_patcher = patch('jax.devices',
+                                              return_value=[MagicMock()] * 8)
+        self.mock_jax_devices = self.mock_jax_devices_patcher.start()
+        self.addCleanup(self.mock_jax_devices_patcher.stop)
+        # VLLM Config
+        self.mock_vllm_config = MagicMock(spec=VllmConfig)
+        self.mock_vllm_config.parallel_config = MagicMock(spec=ParallelConfig)
+        self.mock_vllm_config.device_config = MagicMock()
+        self.mock_vllm_config.cache_config = MagicMock()
+        self.mock_vllm_config.cache_config.prefix_caching_hash_algo = "builtin"
+        self.mock_vllm_config.cache_config.block_size = 5
+        self.mock_vllm_config.__post_init__ = MagicMock()
+    def test_initialization(self):
+        """Tests that the adapter initializes the orchestrator correctly."""
+        engine = DisaggEngineCore(
+            vllm_config=self.mock_vllm_config,
+            executor_class=MagicMock(spec=Executor),
+            log_stats=False,
+        )
+        self.mock_orchestrator.assert_called_once()
+        args, kwargs = self.mock_orchestrator.call_args
+        self.assertIsInstance(kwargs['config'], VllmConfigAdapter)
+        self.assertEqual(kwargs['config'].vllm_config, self.mock_vllm_config)
+        self.assertEqual(kwargs['output_queue'], engine.output_queue)
+        self.assertEqual(len(kwargs['prefill_engines']), 1)
+        self.assertIsInstance(kwargs['prefill_engines'][0], VllmEngineAdapter)
+        self.assertEqual(len(kwargs['decode_engines']), 1)
+        self.assertIsInstance(kwargs['decode_engines'][0], VllmEngineAdapter)
+        self.assertEqual(kwargs['prefill_slice_sizes'], (4, ))
+        self.assertEqual(kwargs['decode_slice_sizes'], (2, ))
+    def test_add_request(self):
+        """Tests that the adapter correctly delegates add_request to the orchestrator."""
+        engine = DisaggEngineCore(
+            vllm_config=self.mock_vllm_config,
+            executor_class=MagicMock(spec=Executor),
+            log_stats=False,
+        )
+        mock_request = MagicMock(spec=Request)
+        mock_request.request_id = "test_req"
+        mock_request.pooling_params = None
+        mock_request.kv_transfer_params = None
+        engine.add_request(mock_request)
+        self.mock_orchestrator.return_value.add_request.assert_called_once()
+        # Get the argument passed to add_request
+        passed_request_adapter = self.mock_orchestrator.return_value.add_request.call_args[
+            0][0]
+        # Assert it's the correct type and wraps the correct underlying request
+        self.assertIsInstance(passed_request_adapter, VllmRequestAdapter)
+        self.assertIsInstance(passed_request_adapter.vllm_request, Request)
+        self.assertEqual(passed_request_adapter.request_id, "test_req")
+    def test_shutdown(self):
+        """Tests that the adapter correctly delegates shutdown to the orchestrator."""
+        engine = DisaggEngineCore(
+            vllm_config=self.mock_vllm_config,
+            executor_class=MagicMock(spec=Executor),
+            log_stats=False,
+        )
+        engine.shutdown()
+        self.mock_orchestrator.return_value.shutdown.assert_called_once()
+class TestDisaggEngineCoreProc(unittest.TestCase):
+    def setUp(self):
+        # Patch disagg_utils to control slice configuration.
+        self.mock_disagg_utils_patcher = patch(
+            'tpu_inference.core.core_tpu.disagg_utils')
+        self.mock_disagg_utils = self.mock_disagg_utils_patcher.start()
+        self.mock_disagg_utils.get_prefill_slices.return_value = (
+            4, )  # One prefill engine
+        self.mock_disagg_utils.get_decode_slices.return_value = (
+            2, )  # One decode engine
+        self.addCleanup(self.mock_disagg_utils_patcher.stop)
+        # Patch the orchestrator to test the adapter in isolation
+        self.mock_orchestrator_patcher = patch(
+            'tpu_inference.core.core_tpu._DisaggOrchestrator')
+        self.mock_orchestrator = self.mock_orchestrator_patcher.start()
+        self.addCleanup(self.mock_orchestrator_patcher.stop)
+        # Patch vLLMEngineCore to avoid its complex initialization.
+        self.mock_engine_core_patcher = patch(
+            'tpu_inference.core.core_tpu.vLLMEngineCore')
+        self.mock_vLLMEngineCore = self.mock_engine_core_patcher.start()
+        self.addCleanup(self.mock_engine_core_patcher.stop)
+        # Patch the ZMQ handshake to isolate the test.
+        self.mock_handshake_patcher = patch(
+            'tpu_inference.core.core_tpu.DisaggEngineCoreProc._perform_handshake'
+        )
+        self.mock_handshake = self.mock_handshake_patcher.start()
+        self.mock_handshake.return_value.__enter__.return_value = MagicMock(
+            outputs=["output_addr"], coordinator_output=None)
+        self.addCleanup(self.mock_handshake_patcher.stop)
+        # Patch threads to avoid them running in the background.
+        def mock_thread_constructor(*args, **kwargs):
+            mock_thread = MagicMock()
+            def mock_start():
+                # Check if this is the input thread by looking at target and args
+                target = kwargs.get('target')
+                thread_args = kwargs.get('args', ())
+                # If this is the input thread (process_input_sockets), set the ready_event
+                if (target and hasattr(target, '__name__')
+                        and target.__name__ == 'process_input_sockets'):
+                    assert len(
+                        thread_args
+                    ) == 4, "Expected 4 arguments for vllm process_input_sockets function"
+                    ready_event = thread_args[
+                        3]  # ready_event is the 4th argument
+                    ready_event.set()
+            mock_thread.start = mock_start
+            mock_thread.is_alive.return_value = True
+            return mock_thread
+        self.thread_patcher = patch("threading.Thread",
+                                    side_effect=mock_thread_constructor)
+        self.mock_thread = self.thread_patcher.start()
+        self.addCleanup(self.thread_patcher.stop)
+        # Mock jax.devices
+        self.mock_jax_devices_patcher = patch('jax.devices',
+                                              return_value=[MagicMock()] * 8)
+        self.mock_jax_devices = self.mock_jax_devices_patcher.start()
+        self.addCleanup(self.mock_jax_devices_patcher.stop)
+        # VLLM Config
+        self.mock_vllm_config = MagicMock(spec=VllmConfig)
+        self.mock_vllm_config.parallel_config = MagicMock(spec=ParallelConfig)
+        self.mock_vllm_config.device_config = MagicMock()
+        self.mock_vllm_config.cache_config = MagicMock()
+        self.mock_vllm_config.cache_config.prefix_caching_hash_algo = "builtin"
+        self.mock_vllm_config.cache_config.block_size = 5
+        self.mock_vllm_config.__post_init__ = MagicMock()
+    def test_initialization(self):
+        """Tests that the adapter initializes the orchestrator correctly."""
+        proc = DisaggEngineCoreProc(
+            vllm_config=self.mock_vllm_config,
+            local_client=True,
+            handshake_address="dummy_addr",
+            executor_class=MagicMock(spec=Executor),
+            log_stats=False,
+        )
+        self.mock_orchestrator.assert_called_once()
+        args, kwargs = self.mock_orchestrator.call_args
+        self.assertIsInstance(kwargs['config'], VllmConfigAdapter)
+        self.assertEqual(kwargs['config'].vllm_config, self.mock_vllm_config)
+        self.assertEqual(kwargs['output_queue'], proc.output_queue)
+        self.assertEqual(len(kwargs['prefill_engines']), 1)
+        self.assertIsInstance(kwargs['prefill_engines'][0], VllmEngineAdapter)
+        self.assertEqual(len(kwargs['decode_engines']), 1)
+        self.assertIsInstance(kwargs['decode_engines'][0], VllmEngineAdapter)
+        self.assertEqual(kwargs['prefill_slice_sizes'], (4, ))
+        self.assertEqual(kwargs['decode_slice_sizes'], (2, ))
+    def test_add_request(self):
+        """Tests that the adapter correctly delegates add_request to the orchestrator."""
+        proc = DisaggEngineCoreProc(
+            vllm_config=self.mock_vllm_config,
+            local_client=True,
+            handshake_address="dummy_addr",
+            executor_class=MagicMock(spec=Executor),
+            log_stats=False,
+        )
+        mock_request = MagicMock(spec=EngineCoreRequest)
+        mock_request.request_id = "test_req"
+        mock_request.mm_hashes = None
+        mock_request.mm_kwargs = []
+        mock_request.use_structured_output = False
+        mock_request.pooling_params = None
+        mock_request.sampling_params.structured_outputs = None
+        mock_request.block_hashes = []
+        mock_engine_request, _ = proc.preprocess_add_request(mock_request)
+        proc.add_request(mock_engine_request)
+        self.mock_orchestrator.return_value.add_request.assert_called_once()
+        # Get the argument passed to add_request
+        passed_request_adapter = self.mock_orchestrator.return_value.add_request.call_args[
+            0][0]
+        # Assert it's the correct type and wraps the correct underlying request
+        self.assertIsInstance(passed_request_adapter, VllmRequestAdapter)
+        self.assertIsInstance(passed_request_adapter.vllm_request, Request)
+        self.assertEqual(passed_request_adapter.request_id, "test_req")
+    def test_shutdown(self):
+        """Tests that the adapter correctly delegates shutdown to the orchestrator."""
+        proc = DisaggEngineCoreProc(
+            vllm_config=self.mock_vllm_config,
+            local_client=True,
+            handshake_address="dummy_addr",
+            executor_class=MagicMock(spec=Executor),
+            log_stats=False,
+        )
+        proc.shutdown()
+        self.mock_orchestrator.return_value.shutdown.assert_called_once()
+    def test_handle_client_request_add(self):
+        """Tests that the adapter correctly handles an ADD request."""
+        proc = DisaggEngineCoreProc(
+            vllm_config=self.mock_vllm_config,
+            local_client=True,
+            handshake_address="dummy_addr",
+            executor_class=MagicMock(spec=Executor),
+            log_stats=False,
+        )
+        mock_request = MagicMock(spec=EngineCoreRequest)
+        mock_request.request_id = "test_req"
+        mock_request.mm_hashes = None
+        mock_request.mm_kwargs = []
+        mock_request.use_structured_output = False
+        mock_request.pooling_params = None
+        mock_request.sampling_params.structured_outputs = None
+        mock_request.block_hashes = []
+        mock_request = proc.preprocess_add_request(mock_request)
+        proc._handle_client_request(EngineCoreRequestType.ADD, mock_request)
+        self.mock_orchestrator.return_value.add_request.assert_called_once()
+    def test_handle_client_request_abort(self):
+        """Tests that the adapter correctly handles an ABORT request."""
+        proc = DisaggEngineCoreProc(
+            vllm_config=self.mock_vllm_config,
+            local_client=True,
+            handshake_address="dummy_addr",
+            executor_class=MagicMock(spec=Executor),
+            log_stats=False,
+        )
+        # This is currently a no-op, so we just check that it doesn't crash
+        proc._handle_client_request(EngineCoreRequestType.ABORT, "test_req")
+    def test_handle_client_request_utility(self):
+        """Tests that the adapter correctly handles a UTILITY request."""
+        proc = DisaggEngineCoreProc(
+            vllm_config=self.mock_vllm_config,
+            local_client=True,
+            handshake_address="dummy_addr",
+            executor_class=MagicMock(spec=Executor),
+            log_stats=False,
+        )
+        # Mock a method on the prefill engine instance
+        proc._prefill_engines = [MagicMock()]
+        proc._prefill_engines[0].list_loras.return_value = {1, 2, 3}
+        utility_request = (0, "call-id-1", "list_loras", ())
+        proc._handle_client_request(EngineCoreRequestType.UTILITY,
+                                    utility_request)
+        proc._prefill_engines[0].list_loras.assert_called_once()
+        self.assertTrue(proc.output_queue.qsize() > 0)
+class TestDisaggOrchestrator(unittest.TestCase):
+    def setUp(self):
+        self.mock_config = MagicMock(spec=IConfig)
+        self.mock_config.scheduler_config = MagicMock()
+        self.mock_config.scheduler_config.max_num_seqs = 16
+        self.mock_config.cache_config = MagicMock()
+        self.mock_config.cache_config.block_size = 5
+        self.mock_output_queue = MagicMock()
+        self.mock_prefill_engine = MagicMock(spec=IEngineCore)
+        self.mock_decode_engine = MagicMock(spec=IEngineCore)
+        # The orchestrator accesses the scheduler on the engine.
+        self.mock_prefill_engine.scheduler = MagicMock()
+        self.mock_decode_engine.scheduler = MagicMock()
+        # The orchestrator accesses the model_executor on the engine.
+        self.mock_prefill_engine.model_executor = MagicMock()
+        self.mock_decode_engine.model_executor = MagicMock()
+        # Patch threads to avoid them running in the background.
+        self.jet_thread_patcher = patch(
+            "tpu_inference.core.core_tpu.JetThread", MagicMock)
+        self.mock_jet_thread = self.jet_thread_patcher.start()
+        self.addCleanup(self.jet_thread_patcher.stop)
+    def test_initialization(self):
+        """Tests that the orchestrator initializes correctly."""
+        orchestrator = _DisaggOrchestrator(
+            config=self.mock_config,
+            output_queue=self.mock_output_queue,
+            prefill_engines=[self.mock_prefill_engine],
+            decode_engines=[self.mock_decode_engine],
+            prefill_slice_sizes=(4, ),
+            decode_slice_sizes=(2, ),
+        )
+        self.assertEqual(orchestrator._config, self.mock_config)
+        self.assertEqual(orchestrator._output_queue, self.mock_output_queue)
+        self.assertEqual(len(orchestrator._prefill_engines), 1)
+        self.assertEqual(len(orchestrator._decode_engines), 1)
+        self.assertEqual(len(orchestrator._all_threads),
+                         3)  # 1 prefill, 1 transfer, 1 decode
+    def test_add_request(self):
+        """Tests that a new request is added to the prefill engine."""
+        orchestrator = _DisaggOrchestrator(
+            config=self.mock_config,
+            output_queue=self.mock_output_queue,
+            prefill_engines=[self.mock_prefill_engine],
+            decode_engines=[self.mock_decode_engine],
+            prefill_slice_sizes=(4, ),
+            decode_slice_sizes=(2, ),
+        )
+        mock_request = MagicMock()
+        mock_request.vllm_request.request_id = "test_req"
+        orchestrator.add_request(mock_request)
+        self.assertIn("test_req", orchestrator._requests)
+        self.mock_prefill_engine.scheduler.add_request.assert_called_once_with(
+            mock_request)
+    def test_prefill_logic(self):
+        """Tests the prefill logic of the orchestrator."""
+        orchestrator = _DisaggOrchestrator(
+            config=self.mock_config,
+            output_queue=self.mock_output_queue,
+            prefill_engines=[self.mock_prefill_engine],
+            decode_engines=[self.mock_decode_engine],
+            prefill_slice_sizes=(4, ),
+            decode_slice_sizes=(2, ),
+        )
+        orchestrator.live = True
+        # Mock scheduler output
+        mock_scheduler_output = MagicMock()
+        mock_scheduler_output.total_num_scheduled_tokens = 1
+        self.mock_prefill_engine.scheduler.schedule.return_value = mock_scheduler_output
+        # Mock model output
+        mock_model_output = MagicMock()
+        mock_model_output.req_id_to_index = {"test_req": 0}
+        mock_model_output.sampled_token_ids = [[1]]
+        self.mock_prefill_engine.execute_model_with_error_logging.return_value = mock_model_output
+        # Mock request
+        mock_request = MagicMock()
+        orchestrator._requests["test_req"] = mock_request
+        # Mock the side effect of update_from_output to stop the loop
+        def stop_loop(*args, **kwargs):
+            orchestrator.live = False
+            return {}
+        self.mock_prefill_engine.scheduler.update_from_output.side_effect = stop_loop
+        orchestrator._prefill(0)
+        self.mock_prefill_engine.execute_model_with_error_logging.assert_called_once(
+        )
+        self.assertTrue(orchestrator._transfer_backlogs[0].qsize() > 0)
+    def test_transfer_logic(self):
+        """Tests the transfer logic of the orchestrator."""
+        orchestrator = _DisaggOrchestrator(
+            config=self.mock_config,
+            output_queue=self.mock_output_queue,
+            prefill_engines=[self.mock_prefill_engine],
+            decode_engines=[self.mock_decode_engine],
+            prefill_slice_sizes=(4, ),
+            decode_slice_sizes=(2, ),
+        )
+        orchestrator.live = True
+        # Mock kv cache map
+        mock_kv_cache_map = {"test_req": ([MagicMock()], [])}
+        orchestrator._transfer_backlogs[0].put(mock_kv_cache_map)
+        orchestrator._transfer_backlogs[0].put(
+            None)  # Sentinel to stop the loop
+        orchestrator._transfer(0)
+        self.mock_decode_engine.model_executor.driver_worker.model_runner.transfer_kv_cache.assert_called_once(
+        )
+        self.assertTrue(orchestrator._decode_backlogs[0].qsize() > 0)
+    def test_decode_logic(self):
+        """Tests the decode logic of the orchestrator."""
+        orchestrator = _DisaggOrchestrator(
+            config=self.mock_config,
+            output_queue=self.mock_output_queue,
+            prefill_engines=[self.mock_prefill_engine],
+            decode_engines=[self.mock_decode_engine],
+            prefill_slice_sizes=(4, ),
+            decode_slice_sizes=(2, ),
+        )
+        orchestrator.live = True
+        # Mock prefill output
+        mock_prefill_output = {
+            "req_id": "test_req",
+            "cache": [MagicMock()],
+            "block_hashes": []
+        }
+        orchestrator._decode_backlogs[0].put(mock_prefill_output)
+        orchestrator._decode_backlogs[0].put(None)  # Sentinel to stop the loop
+        # Mock request
+        mock_request = MagicMock()
+        mock_request.vllm_request.num_computed_tokens = 10
+        orchestrator._requests["test_req"] = mock_request
+        # Mock scheduler and model runner states for the loop condition
+        self.mock_decode_engine.scheduler.has_requests.return_value = False
+        self.mock_decode_engine.scheduler.get_request_counts.return_value = (0,
+                                                                             0)
+        self.mock_decode_engine.model_executor.driver_worker.model_runner.input_batch.num_reqs = 0
+        self.mock_decode_engine.scheduler.kv_cache_manager.get_block_ids.return_value = (
+            [20, 21], )
+        # Mock scheduler output
+        mock_scheduler_output = MagicMock()
+        mock_scheduler_output.total_num_scheduled_tokens = 1
+        self.mock_decode_engine.scheduler.schedule.return_value = mock_scheduler_output
+        # Mock model output
+        mock_model_output = MagicMock()
+        self.mock_decode_engine.execute_model_with_error_logging.return_value = mock_model_output
+        # Mock the side effect of update_from_output to stop the loop
+        def stop_loop(*args, **kwargs):
+            orchestrator.live = False
+            return {"test_req": MagicMock()}
+        self.mock_decode_engine.scheduler.update_from_output.side_effect = stop_loop
+        orchestrator._decode(0)
+        self.mock_decode_engine.execute_model_with_error_logging.assert_called_once(
+        )
+        self.mock_output_queue.put_nowait.assert_called_once()
+    def test_shutdown(self):
+        """Tests that the orchestrator correctly shuts down its engines."""
+        orchestrator = _DisaggOrchestrator(
+            config=self.mock_config,
+            output_queue=self.mock_output_queue,
+            prefill_engines=[self.mock_prefill_engine],
+            decode_engines=[self.mock_decode_engine],
+            prefill_slice_sizes=(4, ),
+            decode_slice_sizes=(2, ),
+        )
+        orchestrator.shutdown()
+        self.mock_prefill_engine.shutdown.assert_called_once()
+        self.mock_decode_engine.shutdown.assert_called_once()
+if __name__ == '__main__':
+    unittest.main()

tests/core/test_disagg_executor.py ADDED Viewed

@@ -0,0 +1,60 @@
+# SPDX-License-Identifier: Apache-2.0
+import unittest
+from unittest.mock import MagicMock, patch
+from vllm.config import ModelConfig, VllmConfig
+from tpu_inference.core.disagg_executor import DisaggExecutor
+class DisaggExecutorTest(unittest.TestCase):
+    def setUp(self):
+        """Set up the test environment by mocking dependencies."""
+        # Mock configurations
+        self.mock_vllm_config = MagicMock(spec=VllmConfig)
+        self.mock_vllm_config.model_config = ModelConfig(
+            tokenizer_mode="auto",
+            trust_remote_code=False,
+            seed=0,
+            dtype='bfloat16')
+        self.mock_vllm_config.cache_config = MagicMock()
+        self.mock_vllm_config.scheduler_config = MagicMock()
+        self.mock_vllm_config.load_config = MagicMock()
+        self.mock_vllm_config.lora_config = None
+        self.mock_vllm_config.parallel_config = MagicMock()
+        self.mock_vllm_config.device_config = MagicMock()
+        self.mock_vllm_config.speculative_config = None
+        self.mock_vllm_config.prompt_adapter_config = None
+        self.mock_vllm_config.observability_config = MagicMock()
+        # Patch the collective_rpc method to avoid actual RPC calls
+        self.patcher = patch(
+            "tpu_inference.core.disagg_executor.DisaggExecutor.collective_rpc")
+        self.mock_collective_rpc = self.patcher.start()
+        self.addCleanup(self.patcher.stop)
+        # Create a DisaggExecutor instance with the mock config
+        self.executor = DisaggExecutor(vllm_config=self.mock_vllm_config)
+    def test_init_with_devices(self):
+        """Test init_with_devices."""
+        self.executor._init_executor()
+        # Check that collective_rpc was called with the expected arguments
+        self.mock_collective_rpc.assert_called()
+        calls = self.mock_collective_rpc.call_args_list
+        # Asserts for init_worker
+        self.assertEqual(calls[0][0][0], "init_worker")
+        self.assertEqual(calls[1][0][0], "init_device")
+        self.assertEqual(calls[2][0][0], "load_model")
+    def test_check_health(self):
+        """Test check_health."""
+        # Call check_health (it should always pass)
+        self.executor.check_health()
+# Run the tests in this module when the file is executed directly.
+if __name__ == '__main__':
+    unittest.main()