tpu-inference 0.11.1.dev202511270815__py3-none-any.whl → 0.13.0rc2.post7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of tpu-inference might be problematic.
- tests/__init__.py +13 -0
- tests/core/__init__.py +13 -0
- tests/core/test_disagg_utils.py +14 -0
- tests/core/test_dp_scheduler.py +650 -768
- tests/core/test_init.py +14 -0
- tests/distributed/__init__.py +13 -0
- tests/distributed/test_distributed_utils.py +120 -0
- tests/distributed/test_tpu_connector.py +478 -0
- tests/e2e/__init__.py +13 -0
- tests/e2e/test_async_scheduler.py +211 -0
- tests/e2e/test_data_parallel.py +289 -0
- tests/e2e/test_hybrid_kvcache.py +219 -0
- tests/e2e/test_local_disagg.py +257 -0
- tests/e2e/test_model_loader.py +268 -0
- tests/e2e/test_multi_modal_inference.py +111 -0
- tests/e2e/test_pipeline_parallel.py +265 -0
- tests/e2e/test_runai_model_streamer_loader.py +104 -0
- tests/e2e/test_sampling_params.py +269 -0
- tests/e2e/test_speculative_decoding.py +311 -0
- tests/e2e/test_structured_decoding.py +46 -0
- tests/executors/__init__.py +13 -0
- tests/executors/test_ray_distributed_executor.py +199 -0
- tests/experimental/__init__.py +13 -0
- tests/experimental/test_llama3_jax_stashed.py +208 -0
- tests/kernels/__init__.py +13 -0
- tests/kernels/collectives/__init__.py +13 -0
- tests/kernels/collectives/all_gather_matmul_kernel_test.py +69 -0
- tests/kernels/fused_moe_v1_test.py +14 -0
- tests/kernels/gmm_test.py +205 -0
- tests/kernels/mla_v1_test.py +143 -41
- tests/kernels/quantized_matmul_kernel_test.py +2 -34
- tests/kernels/ragged_kv_cache_update_v2_test.py +14 -0
- tests/kernels/ragged_paged_attention_kernel_v2_test.py +14 -0
- tests/kernels/ragged_paged_attention_kernel_v3_hd64_test.py +17 -1
- tests/kernels/ragged_paged_attention_kernel_v3_test.py +17 -1
- tests/layers/__init__.py +13 -0
- tests/layers/common/__init__.py +13 -0
- tests/layers/common/test_attention_interface.py +156 -0
- tests/layers/common/test_quantization.py +149 -0
- tests/layers/jax/__init__.py +13 -0
- tests/layers/jax/attention/__init__.py +13 -0
- tests/layers/jax/attention/test_common_attention.py +103 -0
- tests/layers/jax/attention/test_deepseek_v3_attention.py +233 -0
- tests/layers/jax/attention/test_llama4_attention.py +135 -0
- tests/layers/jax/moe/__init__.py +13 -0
- tests/layers/jax/moe/test_deepseek_moe.py +235 -0
- tests/layers/jax/sample/__init__.py +13 -0
- tests/layers/jax/sample/test_rejection_sampler.py +1624 -0
- tests/layers/jax/sample/test_sampling.py +115 -0
- tests/layers/jax/sample/test_sampling_metadata.py +254 -0
- tests/layers/jax/test_layers.py +155 -0
- tests/{test_quantization.py → layers/jax/test_qwix.py} +183 -50
- tests/layers/jax/test_rope.py +93 -0
- tests/layers/jax/test_sharding.py +159 -0
- tests/layers/jax/test_transformer_block.py +152 -0
- tests/layers/vllm/__init__.py +13 -0
- tests/layers/vllm/test_attention.py +363 -0
- tests/layers/vllm/test_awq.py +405 -0
- tests/layers/vllm/test_compressed_tensors_moe.py +202 -0
- tests/layers/vllm/test_compressed_tensors_w8a8_fp8.py +418 -0
- tests/layers/vllm/test_compressed_tensors_w8a8_int8.py +441 -0
- tests/layers/vllm/test_fp8.py +17 -0
- tests/layers/vllm/test_mxfp4.py +312 -0
- tests/layers/vllm/test_unquantized.py +651 -0
- tests/layers/vllm/utils.py +87 -0
- tests/lora/__init__.py +13 -0
- tests/lora/conftest.py +14 -0
- tests/lora/test_bgmv.py +14 -0
- tests/lora/test_layers.py +21 -3
- tests/lora/test_lora.py +15 -1
- tests/lora/test_lora_perf.py +67 -0
- tests/models/__init__.py +13 -0
- tests/models/common/__init__.py +13 -0
- tests/models/common/test_model_loader.py +455 -0
- tests/models/jax/__init__.py +13 -0
- tests/models/jax/test_deepseek_v3.py +401 -0
- tests/models/jax/test_llama3.py +184 -0
- tests/models/jax/test_llama4.py +298 -0
- tests/models/jax/test_llama_eagle3.py +197 -0
- tests/models/jax/test_llama_guard_4.py +242 -0
- tests/models/jax/test_qwen2.py +172 -0
- tests/models/jax/test_qwen2_5_vl.py +605 -0
- tests/models/jax/test_qwen3.py +169 -0
- tests/models/jax/test_weight_loading.py +180 -0
- tests/models/jax/utils/__init__.py +13 -0
- tests/models/jax/utils/test_multi_modal_utils.py +212 -0
- tests/platforms/__init__.py +13 -0
- tests/platforms/test_tpu_platform.py +54 -0
- tests/runner/__init__.py +13 -0
- tests/runner/test_block_table.py +395 -0
- tests/runner/test_input_batch.py +226 -0
- tests/runner/test_kv_cache.py +220 -0
- tests/runner/test_kv_cache_manager.py +498 -0
- tests/runner/test_multimodal_manager.py +429 -0
- tests/runner/test_persistent_batch_manager.py +84 -0
- tests/runner/test_speculative_decoding_manager.py +368 -0
- tests/runner/test_structured_decoding_manager.py +220 -0
- tests/runner/test_tpu_runner.py +261 -0
- tests/runner/test_tpu_runner_dp.py +1099 -0
- tests/runner/test_tpu_runner_mesh.py +200 -0
- tests/runner/test_utils.py +411 -0
- tests/spec_decode/__init__.py +13 -0
- tests/spec_decode/test_eagle3.py +311 -0
- tests/test_base.py +14 -0
- tests/test_envs.py +110 -12
- tests/test_tpu_info.py +14 -0
- tests/test_utils.py +2 -45
- tests/worker/__init__.py +13 -0
- tests/worker/tpu_worker_test.py +414 -0
- tpu_inference/__init__.py +14 -0
- tpu_inference/core/__init__.py +13 -0
- tpu_inference/core/sched/__init__.py +13 -0
- tpu_inference/core/sched/dp_scheduler.py +372 -56
- tpu_inference/distributed/__init__.py +13 -0
- tpu_inference/distributed/jax_parallel_state.py +14 -0
- tpu_inference/distributed/tpu_connector.py +15 -10
- tpu_inference/distributed/utils.py +56 -4
- tpu_inference/envs.py +92 -8
- tpu_inference/executors/__init__.py +13 -0
- tpu_inference/executors/ray_distributed_executor.py +22 -1
- tpu_inference/experimental/__init__.py +13 -0
- tpu_inference/experimental/llama3_jax_stashed.py +14 -0
- tpu_inference/kernels/__init__.py +13 -0
- tpu_inference/kernels/collectives/__init__.py +13 -0
- tpu_inference/kernels/collectives/all_gather_matmul.py +12 -6
- tpu_inference/kernels/collectives/all_gather_matmul_tuned_block_sizes.py +7 -2
- tpu_inference/kernels/flash_attention/__init__.py +13 -0
- tpu_inference/kernels/fused_moe/__init__.py +13 -0
- tpu_inference/kernels/fused_moe/v1/__init__.py +13 -0
- tpu_inference/kernels/fused_moe/v1/kernel.py +370 -324
- tpu_inference/kernels/megablox/__init__.py +13 -0
- tpu_inference/kernels/megablox/common.py +54 -0
- tpu_inference/kernels/megablox/gmm.py +646 -0
- tpu_inference/kernels/mla/__init__.py +13 -0
- tpu_inference/kernels/mla/v1/__init__.py +13 -0
- tpu_inference/kernels/mla/v1/kernel.py +117 -145
- tpu_inference/kernels/quantized_matmul/__init__.py +13 -0
- tpu_inference/kernels/quantized_matmul/kernel.py +69 -8
- tpu_inference/kernels/ragged_paged_attention/__init__.py +13 -0
- tpu_inference/kernels/ragged_paged_attention/v2/__init__.py +13 -0
- tpu_inference/kernels/ragged_paged_attention/v2/kernel.py +2 -1
- tpu_inference/kernels/ragged_paged_attention/v2/ragged_kv_cache_update.py +2 -1
- tpu_inference/kernels/ragged_paged_attention/v3/__init__.py +13 -0
- tpu_inference/kernels/ragged_paged_attention/v3/kernel.py +194 -101
- tpu_inference/kernels/ragged_paged_attention/v3/kernel_hd64.py +167 -97
- tpu_inference/kernels/ragged_paged_attention/v3/tuned_block_sizes.py +3817 -3504
- tpu_inference/kernels/ragged_paged_attention/v3/tuned_block_sizes_hd64.py +376 -195
- tpu_inference/kernels/ragged_paged_attention/v3/util.py +15 -1
- tpu_inference/layers/__init__.py +13 -0
- tpu_inference/layers/common/__init__.py +13 -0
- tpu_inference/layers/common/attention_interface.py +26 -19
- tpu_inference/layers/common/attention_metadata.py +14 -0
- tpu_inference/layers/common/quant_methods.py +15 -0
- tpu_inference/layers/common/quantization.py +270 -0
- tpu_inference/layers/common/sharding.py +31 -9
- tpu_inference/layers/jax/__init__.py +13 -0
- tpu_inference/layers/jax/attention/__init__.py +13 -0
- tpu_inference/layers/jax/attention/attention.py +19 -6
- tpu_inference/layers/jax/attention/deepseek_v3_attention.py +270 -77
- tpu_inference/layers/jax/attention/gpt_oss_attention.py +24 -11
- tpu_inference/layers/jax/attention/llama4_attention.py +17 -4
- tpu_inference/layers/jax/base.py +14 -0
- tpu_inference/layers/jax/constants.py +13 -0
- tpu_inference/layers/jax/layers.py +14 -0
- tpu_inference/layers/jax/misc.py +14 -0
- tpu_inference/layers/jax/moe/__init__.py +13 -0
- tpu_inference/layers/jax/moe/deepseek_v3_moe.py +20 -13
- tpu_inference/layers/jax/moe/gpt_oss_moe.py +14 -0
- tpu_inference/layers/jax/moe/moe.py +43 -3
- tpu_inference/layers/jax/pp_utils.py +53 -0
- tpu_inference/layers/jax/rope.py +14 -0
- tpu_inference/layers/jax/rope_interface.py +14 -0
- tpu_inference/layers/jax/sample/__init__.py +13 -0
- tpu_inference/layers/jax/sample/rejection_sampler.py +13 -0
- tpu_inference/layers/jax/sample/sampling.py +15 -1
- tpu_inference/layers/jax/sample/sampling_metadata.py +14 -0
- tpu_inference/layers/jax/transformer_block.py +14 -0
- tpu_inference/layers/vllm/__init__.py +13 -0
- tpu_inference/layers/vllm/attention.py +4 -4
- tpu_inference/layers/vllm/fused_moe.py +210 -260
- tpu_inference/layers/vllm/linear_common.py +57 -22
- tpu_inference/layers/vllm/quantization/__init__.py +16 -0
- tpu_inference/layers/vllm/quantization/awq.py +15 -1
- tpu_inference/layers/vllm/quantization/common.py +33 -18
- tpu_inference/layers/vllm/quantization/compressed_tensors/__init__.py +13 -0
- tpu_inference/layers/vllm/quantization/compressed_tensors/compressed_tensors.py +18 -3
- tpu_inference/layers/vllm/quantization/compressed_tensors/compressed_tensors_moe.py +211 -148
- tpu_inference/layers/vllm/quantization/compressed_tensors/schemes/__init__.py +13 -0
- tpu_inference/layers/vllm/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_fp8.py +14 -0
- tpu_inference/layers/vllm/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_int8.py +14 -0
- tpu_inference/layers/vllm/quantization/fp8.py +118 -0
- tpu_inference/layers/vllm/quantization/mxfp4.py +280 -210
- tpu_inference/layers/vllm/quantization/unquantized.py +134 -86
- tpu_inference/layers/vllm/sharding.py +21 -4
- tpu_inference/lora/__init__.py +13 -0
- tpu_inference/lora/torch_lora_ops.py +8 -13
- tpu_inference/models/__init__.py +13 -0
- tpu_inference/models/common/__init__.py +13 -0
- tpu_inference/models/common/model_loader.py +77 -36
- tpu_inference/models/jax/__init__.py +13 -0
- tpu_inference/models/jax/deepseek_v3.py +267 -157
- tpu_inference/models/jax/gpt_oss.py +26 -10
- tpu_inference/models/jax/jax_intermediate_tensor.py +14 -0
- tpu_inference/models/jax/llama3.py +99 -36
- tpu_inference/models/jax/llama4.py +14 -0
- tpu_inference/models/jax/llama_eagle3.py +14 -0
- tpu_inference/models/jax/llama_guard_4.py +15 -1
- tpu_inference/models/jax/qwen2.py +17 -2
- tpu_inference/models/jax/qwen2_5_vl.py +18 -4
- tpu_inference/models/jax/qwen3.py +17 -2
- tpu_inference/models/jax/utils/__init__.py +13 -0
- tpu_inference/models/jax/utils/file_utils.py +14 -0
- tpu_inference/models/jax/utils/multi_modal_utils.py +18 -4
- tpu_inference/models/jax/utils/qwix/__init__.py +13 -0
- tpu_inference/models/jax/utils/{quantization/quantization_utils.py → qwix/qwix_utils.py} +91 -31
- tpu_inference/models/jax/utils/weight_utils.py +39 -2
- tpu_inference/models/vllm/__init__.py +13 -0
- tpu_inference/models/vllm/vllm_model_wrapper.py +20 -4
- tpu_inference/models/vllm/vllm_model_wrapper_context.py +14 -0
- tpu_inference/platforms/__init__.py +14 -0
- tpu_inference/platforms/tpu_platform.py +47 -71
- tpu_inference/runner/__init__.py +13 -0
- tpu_inference/runner/compilation_manager.py +158 -63
- tpu_inference/runner/kv_cache.py +54 -20
- tpu_inference/runner/kv_cache_manager.py +53 -30
- tpu_inference/runner/lora_utils.py +14 -0
- tpu_inference/runner/multimodal_manager.py +15 -1
- tpu_inference/runner/persistent_batch_manager.py +54 -2
- tpu_inference/runner/speculative_decoding_manager.py +14 -0
- tpu_inference/runner/structured_decoding_manager.py +14 -0
- tpu_inference/runner/tpu_runner.py +105 -57
- tpu_inference/runner/utils.py +2 -2
- tpu_inference/spec_decode/__init__.py +13 -0
- tpu_inference/spec_decode/jax/__init__.py +13 -0
- tpu_inference/spec_decode/jax/eagle3.py +65 -19
- tpu_inference/tpu_info.py +14 -0
- tpu_inference/utils.py +72 -44
- tpu_inference/worker/__init__.py +13 -0
- tpu_inference/worker/tpu_worker.py +65 -52
- {tpu_inference-0.11.1.dev202511270815.dist-info → tpu_inference-0.13.0rc2.post7.dist-info}/METADATA +11 -9
- tpu_inference-0.13.0rc2.post7.dist-info/RECORD +261 -0
- tpu_inference/models/jax/utils/quantization/__init__.py +0 -0
- tpu_inference/models/jax/utils/quantization/configs/fp8_all_modules_w_only.yaml +0 -5
- tpu_inference/models/jax/utils/quantization/configs/fp8_default.yaml +0 -6
- tpu_inference/models/jax/utils/quantization/configs/int8_all_modules_w_only.yaml +0 -5
- tpu_inference/models/jax/utils/quantization/configs/int8_default.yaml +0 -6
- tpu_inference/models/jax/utils/quantization/mxfp4_utils.py +0 -105
- tpu_inference-0.11.1.dev202511270815.dist-info/RECORD +0 -174
- {tpu_inference-0.11.1.dev202511270815.dist-info → tpu_inference-0.13.0rc2.post7.dist-info}/WHEEL +0 -0
- {tpu_inference-0.11.1.dev202511270815.dist-info → tpu_inference-0.13.0rc2.post7.dist-info}/licenses/LICENSE +0 -0
- {tpu_inference-0.11.1.dev202511270815.dist-info → tpu_inference-0.13.0rc2.post7.dist-info}/top_level.txt +0 -0
tpu_inference/core/sched/dp_scheduler.py:

```diff
@@ -1,8 +1,27 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 import copy
+import multiprocessing.reduction
 from collections import defaultdict, deque
 from dataclasses import dataclass
+from enum import Enum
+from multiprocessing import Process, Queue
+from time import time
 from typing import Any, Dict, List, Optional, Tuple
 
+import cloudpickle
 import torch
 from vllm.config import VllmConfig
 from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalRegistry
@@ -19,10 +38,186 @@ from vllm.v1.request import Request
 from vllm.v1.structured_output import StructuredOutputManager
 
 from tpu_inference.logger import init_logger
+from tpu_inference.utils import time_function
 
 logger = init_logger(__name__)
 
 
+class SchedulerCommand(Enum):
+    """Enum for scheduler worker process commands."""
+    ADD_REQUEST = "add_request"
+    SCHEDULE = "schedule"
+    FINISH_REQUESTS = "finish_requests"
+    UPDATE_DRAFT_TOKEN_IDS = "update_draft_token_ids"
+    UPDATE_FROM_OUTPUT = "update_from_output"
+    GET_GRAMMAR_BITMASK = "get_grammar_bitmask"
+    MAKE_STATS = "make_stats"
+    RESET_PREFIX_CACHE = "reset_prefix_cache"
+    GET_NUM_UNFINISHED_REQUESTS = "get_num_unfinished_requests"
+    HAS_FINISHED_REQUESTS = "has_finished_requests"
+    GET_REQUEST_COUNTS = "get_request_counts"
+    GET_TOKEN_COUNT = "get_token_count"
+    GET_COMPUTED_BLOCKS = "get_computed_blocks"
+    SHUTDOWN = "shutdown"
+
+
+class SchedulerWorkerError(Exception):
+    """Exception raised when a scheduler worker process encounters an error."""
+
+    def __init__(self, rank: int, message: str):
+        self.rank = rank
+        self.message = message
+        super().__init__(f"Scheduler worker {rank} error: {message}")
+
+
+# Monkey-patch multiprocessing to use cloudpickle
+# Standard pickle fails to serialize the vLLM Request object.
+_original_dumps = multiprocessing.reduction.ForkingPickler.dumps
+_original_loads = multiprocessing.reduction.ForkingPickler.loads
+
+
+def _cloudpickle_dumps(obj, protocol=None):
+    """Use cloudpickle for serialization."""
+    return cloudpickle.dumps(obj, protocol=protocol)
+
+
+def _cloudpickle_loads(data):
+    """Use cloudpickle for deserialization."""
+    return cloudpickle.loads(data)
+
+
+def _enable_cloudpickle():
+    """Enable cloudpickle for multiprocessing queues."""
+    multiprocessing.reduction.ForkingPickler.dumps = staticmethod(
+        _cloudpickle_dumps)
+    multiprocessing.reduction.ForkingPickler.loads = staticmethod(
+        _cloudpickle_loads)
+
+
+def _disable_cloudpickle():
+    """Restore original pickle for multiprocessing."""
+    multiprocessing.reduction.ForkingPickler.dumps = _original_dumps
+    multiprocessing.reduction.ForkingPickler.loads = _original_loads
+
+
+def _scheduler_worker_process(
+    rank: int,
+    input_queue: Queue,
+    output_queues: Dict[str, Queue],
+    vllm_config: Any,
+    kv_cache_config: Any,
+    structured_output_manager: Any,
+    block_size: int,
+    mm_registry: Any,
+    include_finished_set: bool,
+    log_stats: bool,
+    original_scheduler_cls: type,
+):
+    """Worker process that manages a single scheduler instance."""
+    # Initialize the scheduler in this process
+    scheduler = original_scheduler_cls(
+        vllm_config=vllm_config,
+        kv_cache_config=kv_cache_config,
+        structured_output_manager=structured_output_manager,
+        block_size=block_size,
+        mm_registry=mm_registry,
+        include_finished_set=include_finished_set,
+        log_stats=log_stats,
+    )
+
+    logger.debug(f"Scheduler worker process {rank} started")
+
+    # Process commands from the input queue
+    while True:
+        try:
+            command, data = input_queue.get()
+
+            match command:
+                case SchedulerCommand.ADD_REQUEST:
+                    request = data
+                    scheduler.add_request(request)
+                    output_queues[command.value].put(None)  # Signal completion
+
+                case SchedulerCommand.SCHEDULE:
+                    output = scheduler.schedule()
+                    output_queues[command.value].put(output)
+
+                case SchedulerCommand.FINISH_REQUESTS:
+                    request_ids, finished_status = data
+                    scheduler.finish_requests(request_ids, finished_status)
+                    output_queues[command.value].put(None)  # Signal completion
+
+                case SchedulerCommand.UPDATE_DRAFT_TOKEN_IDS:
+                    draft_token_ids = data
+                    scheduler.update_draft_token_ids(draft_token_ids)
+                    output_queues[command.value].put(None)  # Signal completion
+
+                case SchedulerCommand.UPDATE_FROM_OUTPUT:
+                    scheduler_output, model_runner_output = data
+                    result = scheduler.update_from_output(
+                        scheduler_output, model_runner_output)
+                    output_queues[command.value].put(result)
+
+                case SchedulerCommand.GET_GRAMMAR_BITMASK:
+                    scheduler_output = data
+                    result = scheduler.get_grammar_bitmask(scheduler_output)
+                    output_queues[command.value].put(result)
+
+                case SchedulerCommand.MAKE_STATS:
+                    spec_decoding_stats, kv_connector_stats = data
+                    result = scheduler.make_stats(spec_decoding_stats,
+                                                  kv_connector_stats)
+                    output_queues[command.value].put(result)
+
+                case SchedulerCommand.RESET_PREFIX_CACHE:
+                    result = scheduler.reset_prefix_cache()
+                    output_queues[command.value].put(result)
+
+                case SchedulerCommand.GET_NUM_UNFINISHED_REQUESTS:
+                    result = scheduler.get_num_unfinished_requests()
+                    output_queues[command.value].put(result)
+
+                case SchedulerCommand.HAS_FINISHED_REQUESTS:
+                    result = scheduler.has_finished_requests()
+                    output_queues[command.value].put(result)
+
+                case SchedulerCommand.GET_REQUEST_COUNTS:
+                    running = len(scheduler.running)
+                    waiting = len(scheduler.waiting)
+                    output_queues[command.value].put((running, waiting))
+
+                case SchedulerCommand.GET_TOKEN_COUNT:
+                    # Calculate total tokens across running and waiting requests
+                    total_tokens = 0
+                    for req in scheduler.running:
+                        total_tokens += len(req.all_token_ids)
+                    for req in scheduler.waiting:
+                        total_tokens += len(req.all_token_ids)
+                    output_queues[command.value].put(total_tokens)
+
+                case SchedulerCommand.GET_COMPUTED_BLOCKS:
+                    request = data
+                    blocks, cached_tokens = scheduler.kv_cache_manager.get_computed_blocks(
+                        request)
+                    output_queues[command.value].put((blocks, cached_tokens))
+
+                case SchedulerCommand.SHUTDOWN:
+                    scheduler.shutdown()
+                    output_queues[command.value].put(None)  # Signal completion
+                    break
+
+                case _:
+                    error = SchedulerWorkerError(
+                        rank, f"Unknown command: {command}")
+                    output_queues[command.value].put(error)
+                    raise error
+
+        except Exception as e:
+            logger.error(f"Error in scheduler worker {rank}: {e}",
+                         exc_info=True)
+            error = SchedulerWorkerError(rank, str(e))
+            output_queues[command.value].put(error)
+
+
 @dataclass
 class DPSchedulerOutput(SchedulerOutput):
     """Extended SchedulerOutput that includes DP rank assignments."""
```
```diff
@@ -77,22 +272,50 @@ class DPScheduler(SchedulerInterface):
 
         # The original scheduler class could be Scheduler or AsyncScheduler
         original_scheduler_cls = vllm_config.scheduler_config._original_scheduler_cls
-
+
+        # Enable cloudpickle for multiprocessing to handle local functions
+        _enable_cloudpickle()
+
+        # Create worker processes with separate output queues for each command type
+        import multiprocessing
+        ctx = multiprocessing.get_context('fork')
+        self.input_queues: List[Queue] = []
+        self.output_queues: Dict[Tuple[int, str], Queue] = {}
+        self.processes: List[Process] = []
+
         for rank in range(self.dp_size):
-
-
-
-
-
-
-
-
+            input_queue = ctx.Queue()
+            self.input_queues.append(input_queue)
+
+            output_queues_for_rank: Dict[str, Queue] = {}
+            for cmd in SchedulerCommand:
+                output_queues_for_rank[cmd.value] = ctx.Queue()
+                self.output_queues[(
+                    rank, cmd.value)] = output_queues_for_rank[cmd.value]
+
+            process = ctx.Process(
+                target=_scheduler_worker_process,
+                args=(
+                    rank,
+                    input_queue,
+                    output_queues_for_rank,
+                    self.vllm_config,
+                    self.per_rank_kv_cache_configs[rank],
+                    structured_output_manager,
+                    block_size,
+                    mm_registry,
+                    include_finished_set,
+                    log_stats,
+                    original_scheduler_cls,
+                ),
             )
-
+            process.start()
+            self.processes.append(process)
 
         logger.info(
             f"DPScheduler (Async = {self.vllm_config.scheduler_config.async_scheduling}) "
-            f"
+            f"started {self.dp_size} worker processes with cloudpickle. "
+            f"Per-rank limits: max_seqs={self.vllm_config.scheduler_config.max_num_seqs}, "
             f"max_tokens={self.vllm_config.scheduler_config.max_num_batched_tokens}"
         )
 
```
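The constructor above sets up a fan-out/fan-in request-response protocol: one input queue per rank plus one output queue per (rank, command) pair, so a reply to one command can never be consumed by a caller waiting on a different one. Here is a minimal sketch of that round-trip using the same 'fork' start method; the names (`worker`, `ping`, `DP_SIZE`) are invented for illustration and none of this is code from the diff. Note that the 'fork' context is Linux-specific.

```python
from multiprocessing import get_context

ctx = get_context("fork")
DP_SIZE = 2
COMMANDS = ("ping", "stats")


def worker(rank, input_queue, out_queues):
    # Serve commands until told to shut down; reply on the queue
    # dedicated to this command so responses never interleave.
    while True:
        command, data = input_queue.get()
        if command == "shutdown":
            break
        out_queues[command].put((rank, command, data))


input_queues = [ctx.Queue() for _ in range(DP_SIZE)]
output_queues = {(rank, cmd): ctx.Queue()
                 for rank in range(DP_SIZE) for cmd in COMMANDS}

procs = []
for rank in range(DP_SIZE):
    outs = {cmd: output_queues[(rank, cmd)] for cmd in COMMANDS}
    p = ctx.Process(target=worker, args=(rank, input_queues[rank], outs))
    p.start()
    procs.append(p)

# Fan out first so every rank works in parallel, then fan in.
for rank in range(DP_SIZE):
    input_queues[rank].put(("ping", None))
for rank in range(DP_SIZE):
    print(output_queues[(rank, "ping")].get())

for rank in range(DP_SIZE):
    input_queues[rank].put(("shutdown", None))
for p in procs:
    p.join()
```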
```diff
@@ -103,15 +326,39 @@ class DPScheduler(SchedulerInterface):
             rank_config.num_blocks = kv_cache_config.num_blocks // self.dp_size
             self.per_rank_kv_cache_configs.append(rank_config)
 
+    def _get_result_from_queue(self, rank: int,
+                               command: SchedulerCommand) -> Any:
+        """Get result from the output queue for a specific rank and command type."""
+        queue_obj = self.output_queues[(rank, command.value)]
+        try:
+            start_time = time()
+            result = queue_obj.get()
+            end_time = time()
+            if end_time - start_time > 1.0:
+                logger.warning(
+                    f"Long wait time ({end_time - start_time:.2f}s) for rank {rank} "
+                    f"command {command.value} response.")
+        except EOFError as e:
+            raise RuntimeError(
+                f"Queue error for rank {rank} command {command.value}: "
+                "Worker process terminated unexpectedly. "
+                "This may indicate a crash in the scheduler worker process."
+            ) from e
+        if isinstance(result, SchedulerWorkerError):
+            raise result
+        return result
+
     def _get_rank_token_counts(self) -> Dict[int, int]:
         """Calculate total tokens currently assigned to each DP rank."""
-
+        for rank in range(self.dp_size):
+            self.input_queues[rank].put(
+                (SchedulerCommand.GET_TOKEN_COUNT, None))
 
-
-
-
-
-
+        rank_tokens = {}
+        for rank in range(self.dp_size):
+            token_count = self._get_result_from_queue(
+                rank, SchedulerCommand.GET_TOKEN_COUNT)
+            rank_tokens[rank] = token_count
 
         return rank_tokens
 
@@ -120,11 +367,15 @@ class DPScheduler(SchedulerInterface):
         rank_tokens = self._get_rank_token_counts()
 
         # First, try to find a rank with prefix cache hit
+        for rank in range(self.dp_size):
+            self.input_queues[rank].put(
+                (SchedulerCommand.GET_COMPUTED_BLOCKS, request))
+
         best_cache_rank = None
         best_cache_tokens = 0
-        for rank
-        blocks, cached_tokens =
-
+        for rank in range(self.dp_size):
+            blocks, cached_tokens = self._get_result_from_queue(
+                rank, SchedulerCommand.GET_COMPUTED_BLOCKS)
             if cached_tokens > best_cache_tokens:
                 best_cache_tokens = cached_tokens
                 best_cache_rank = rank
@@ -149,26 +400,30 @@ class DPScheduler(SchedulerInterface):
                 f"assigned to rank {self.assigned_dp_rank[request.request_id]})")
         rank = self._find_best_rank_for_request(request)
         self.assigned_dp_rank[request.request_id] = rank
-        self.schedulers[rank].add_request(request)
 
+        self.input_queues[rank].put((SchedulerCommand.ADD_REQUEST, request))
+        self._get_result_from_queue(rank, SchedulerCommand.ADD_REQUEST)
+
+    @time_function
     def schedule(self) -> DPSchedulerOutput:
         """
         Main scheduling method that coordinates all DP rank schedulers.
 
         Process:
         1. Add any new requests to appropriate DP ranks
-        2. Run each scheduler independently
+        2. Run each scheduler independently in parallel
         3. Combine outputs from all schedulers
        4. Return unified scheduling result
         """
         # Run each scheduler independently
+        for rank in range(self.dp_size):
+            self.input_queues[rank].put((SchedulerCommand.SCHEDULE, None))
+
+        # Collect outputs from all workers (blocking)
         rank_outputs = []
-        for rank
-
-
-            f"{len(scheduler.running)} running, {len(scheduler.waiting)} waiting"
-        )
-            output = scheduler.schedule()
+        for rank in range(self.dp_size):
+            output = self._get_result_from_queue(rank,
+                                                 SchedulerCommand.SCHEDULE)
             rank_outputs.append(output)
 
         # Cache scheduler outputs to use in `update_from_output`
@@ -292,10 +547,12 @@ class DPScheduler(SchedulerInterface):
         combined_bitmasks = []
 
         # Get grammar bitmask from each DP rank scheduler
-        for rank
-
-
-
+        for rank in range(self.dp_size):
+            self.input_queues[rank].put((SchedulerCommand.GET_GRAMMAR_BITMASK,
+                                         rank_scheduler_outputs[rank]))
+        for rank in range(self.dp_size):
+            grammar_output = self._get_result_from_queue(
+                rank, SchedulerCommand.GET_GRAMMAR_BITMASK)
             if grammar_output is not None:
                 combined_structured_output_request_ids.extend(
                     grammar_output.structured_output_request_ids)
@@ -328,10 +585,15 @@ class DPScheduler(SchedulerInterface):
                 model_runner_output)
         rank_scheduler_outputs = self.cached_schedulers_output.popleft()
         # Update each scheduler with its portion of the output
+        for rank in range(self.dp_size):
+            self.input_queues[rank].put(
+                (SchedulerCommand.UPDATE_FROM_OUTPUT,
+                 (rank_scheduler_outputs[rank], rank_model_outputs[rank])))
+
         combined_engine_outputs = defaultdict(list)
-        for rank
-        rank_engine_outputs =
-
+        for rank in range(self.dp_size):
+            rank_engine_outputs = self._get_result_from_queue(
+                rank, SchedulerCommand.UPDATE_FROM_OUTPUT)
             for client_idx, engine_output in rank_engine_outputs.items():
                 combined_engine_outputs[client_idx].append(engine_output)
 
@@ -397,30 +659,62 @@ class DPScheduler(SchedulerInterface):
 
         # Forward to each scheduler
         for rank, req_ids in rank_request_ids.items():
-            self.
+            self.input_queues[rank].put(
+                (SchedulerCommand.FINISH_REQUESTS, (req_ids, finished_status)))
+            self._get_result_from_queue(rank, SchedulerCommand.FINISH_REQUESTS)
 
     def get_num_unfinished_requests(self) -> int:
         """Get total number of unfinished requests across all DP ranks."""
-
-
+        for rank in range(self.dp_size):
+            self.input_queues[rank].put(
+                (SchedulerCommand.GET_NUM_UNFINISHED_REQUESTS, None))
+
+        total = 0
+        for rank in range(self.dp_size):
+            count = self._get_result_from_queue(
+                rank, SchedulerCommand.GET_NUM_UNFINISHED_REQUESTS)
+            total += count
+        return total
 
     def has_finished_requests(self) -> bool:
         """Check if any DP rank has finished requests."""
-
-
+        for rank in range(self.dp_size):
+            self.input_queues[rank].put(
+                (SchedulerCommand.HAS_FINISHED_REQUESTS, None))
+
+        has_finished_any = False
+        for rank in range(self.dp_size):
+            has_finished_any |= self._get_result_from_queue(
+                rank, SchedulerCommand.HAS_FINISHED_REQUESTS)
+        return has_finished_any
 
     def get_request_counts(self) -> Tuple[int, int]:
         """Get total (running, waiting) request counts across all DP ranks."""
-
-
-
-
+        for rank in range(self.dp_size):
+            self.input_queues[rank].put(
+                (SchedulerCommand.GET_REQUEST_COUNTS, None))
+
+        total_running = 0
+        total_waiting = 0
+        for rank in range(self.dp_size):
+            running, waiting = self._get_result_from_queue(
+                rank, SchedulerCommand.GET_REQUEST_COUNTS)
+            total_running += running
+            total_waiting += waiting
         return total_running, total_waiting
 
     def reset_prefix_cache(self) -> bool:
         """Reset prefix cache for all DP rank schedulers."""
-
-
+        for rank in range(self.dp_size):
+            self.input_queues[rank].put(
+                (SchedulerCommand.RESET_PREFIX_CACHE, None))
+
+        all_success = True
+        for rank in range(self.dp_size):
+            success = self._get_result_from_queue(
+                rank, SchedulerCommand.RESET_PREFIX_CACHE)
+            all_success &= success
+        return all_success
 
     def make_stats(self,
                    spec_decoding_stats=None,
@@ -438,9 +732,14 @@ class DPScheduler(SchedulerInterface):
         combined_connector_prefix_cache_stats: Optional[
             PrefixCacheStats] = None
 
-        for
-
-
+        for rank in range(self.dp_size):
+            self.input_queues[rank].put(
+                (SchedulerCommand.MAKE_STATS, (spec_decoding_stats,
+                                               kv_connector_stats)))
+
+        for rank in range(self.dp_size):
+            rank_stats = self._get_result_from_queue(
+                rank, SchedulerCommand.MAKE_STATS)
             if rank_stats is None:
                 continue
 
@@ -465,8 +764,7 @@ class DPScheduler(SchedulerInterface):
             combined_connector_prefix_cache_stats.hits += rank_stats.connector_prefix_cache_stats.hits
 
         # Average KV cache usage across ranks
-        avg_kv_cache_usage = total_kv_cache_usage /
-        self.schedulers) if self.schedulers else 0.0
+        avg_kv_cache_usage = total_kv_cache_usage / self.dp_size if self.dp_size else 0.0
 
         return SchedulerStats(
             num_running_reqs=total_running_reqs,
@@ -494,18 +792,36 @@ class DPScheduler(SchedulerInterface):
             rank_draft_tokens[rank]["req_ids"].append(req_id)
             rank_draft_tokens[rank]["draft_token_ids"].append(tokens)
 
-        # Forward to each scheduler
         for rank, draft_data in rank_draft_tokens.items():
             # Create a draft_token_ids object for this rank (mock structure)
             rank_draft_token_ids = type(draft_token_ids)(
                 req_ids=draft_data["req_ids"],
                 draft_token_ids=draft_data["draft_token_ids"])
-            self.
+            self.input_queues[rank].put(
+                (SchedulerCommand.UPDATE_DRAFT_TOKEN_IDS,
+                 rank_draft_token_ids))
+            self._get_result_from_queue(
+                rank, SchedulerCommand.UPDATE_DRAFT_TOKEN_IDS)
 
     def shutdown(self) -> None:
-        """Shutdown all DP rank
-
-
+        """Shutdown all DP rank scheduler worker processes."""
+        # Send shutdown command to all workers
+        for rank in range(self.dp_size):
+            self.input_queues[rank].put((SchedulerCommand.SHUTDOWN, None))
+
+        # Wait for acknowledgment (blocking)
+        for rank in range(self.dp_size):
+            self._get_result_from_queue(rank, SchedulerCommand.SHUTDOWN)
+
+        # Terminate and join all processes
+        for process in self.processes:
+            process.join(timeout=5.0)
+            if process.is_alive():
+                process.terminate()
+                process.join()
+
+        # Restore original pickle
+        _disable_cloudpickle()
 
 
 def update_vllm_config_for_dp_scheduler(vllm_config: Any) -> None:
```
```diff
@@ -0,0 +1,13 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
```
```diff
@@ -1,3 +1,17 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 from typing import Any, Optional
 
 import jax
```
tpu_inference/distributed/tpu_connector.py:

```diff
@@ -88,7 +88,7 @@ if TYPE_CHECKING:
 from tpu_inference import envs
 from tpu_inference.distributed.utils import (get_host_ip, get_kv_ips,
                                              get_kv_ports,
-                                            get_kv_transfer_port,
+                                             get_kv_transfer_port,
                                              get_side_channel_port)
 from tpu_inference.logger import init_logger
 from tpu_inference.runner.tpu_runner import TPUModelRunner
```
```diff
@@ -442,10 +442,10 @@ class TPUConnectorWorker:
         self.runner: TPUModelRunner = None
         self.mesh: Mesh = None
         self.multi_host = envs.TPU_MULTIHOST_BACKEND == "ray"
-        #
-        #
-        #
-        self.node_id =
+        # Default value for the non-distributed scenario.
+        # When the topology is initialized, the runner will update it
+        # based on topology_order_id.
+        self.node_id = 0
 
         # req_id: (kv, expiration_time)
         self.reqs_wait_pull: dict[ReqId, list[list[jax.Array], float]] = {}
```
```diff
@@ -457,7 +457,6 @@ class TPUConnectorWorker:
         self.side_channel_port = get_side_channel_port()
 
         self.kv_transfer_server = None
-        self._maybe_start_p2p_server()
         self.zmq_cxt = zmq.Context()
         if self.is_producer:
             ready_event = threading.Event()
@@ -473,7 +472,7 @@ class TPUConnectorWorker:
         self.pull_conns: dict[str, Any] = {}
         self.notif_sockets: dict[str, zmq.Socket] = {}
 
-        logger.info(f"TPUConnector Worker
+        logger.info(f"TPUConnector Worker --> init | "
                     f"ip={self.host_ip} | "
                     f"kv_transfer_port={self.kv_transfer_port} | "
                     f"side_channel_port={self.side_channel_port}")
@@ -489,6 +488,7 @@ class TPUConnectorWorker:
         self.zmq_cxt.destroy(linger=0)
 
     def register_runner(self, runner: TPUModelRunner):
+        self.node_id = runner.topology_order_id
         self.runner = runner
         self.mesh = runner.mesh
 
@@ -499,6 +499,11 @@ class TPUConnectorWorker:
         self.shape = list(kv_layer.shape)
         self.dtype = kv_layer.dtype
         self.sharding = kv_layer.sharding
+        logger.info(f"TPUConnector Worker --> register_runner | "
+                    f"node_id={self.node_id} | "
+                    f"ip={self.host_ip} | "
+                    f"kv_transfer_port={self.kv_transfer_port}")
+        self._maybe_start_p2p_server()
 
     def _maybe_start_p2p_server(self):
         if self.kv_transfer_server is not None:
```
```diff
@@ -694,9 +699,9 @@ class TPUConnectorWorker:
 
 def get_uuid() -> int:
     int128 = uuid4().int
-    # Must be 64-bit int, otherwise vllm output encoder would raise error.
-
-    return
+    # Must be less than 64 bits, otherwise the vLLM output encoder raises.
+    # Use 50 bits so Go doesn't truncate the int during JSON serialization.
+    return int128 >> 78
 
 
 @jax.jit
```