torchrl-0.11.0-cp314-cp314-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- benchmarks/benchmark_batched_envs.py +104 -0
- benchmarks/conftest.py +91 -0
- benchmarks/ecosystem/gym_env_throughput.py +321 -0
- benchmarks/ecosystem/vmas_rllib_vs_torchrl_sampling_performance.py +231 -0
- benchmarks/requirements.txt +7 -0
- benchmarks/storage/benchmark_sample_latency_over_rpc.py +193 -0
- benchmarks/test_collectors_benchmark.py +240 -0
- benchmarks/test_compressed_storage_benchmark.py +145 -0
- benchmarks/test_envs_benchmark.py +133 -0
- benchmarks/test_llm.py +101 -0
- benchmarks/test_non_tensor_env_benchmark.py +70 -0
- benchmarks/test_objectives_benchmarks.py +1199 -0
- benchmarks/test_replaybuffer_benchmark.py +254 -0
- sota-check/README.md +35 -0
- sota-implementations/README.md +142 -0
- sota-implementations/a2c/README.md +39 -0
- sota-implementations/a2c/a2c_atari.py +291 -0
- sota-implementations/a2c/a2c_mujoco.py +273 -0
- sota-implementations/a2c/utils_atari.py +240 -0
- sota-implementations/a2c/utils_mujoco.py +160 -0
- sota-implementations/bandits/README.md +7 -0
- sota-implementations/bandits/dqn.py +126 -0
- sota-implementations/cql/cql_offline.py +198 -0
- sota-implementations/cql/cql_online.py +249 -0
- sota-implementations/cql/discrete_cql_offline.py +180 -0
- sota-implementations/cql/discrete_cql_online.py +227 -0
- sota-implementations/cql/utils.py +471 -0
- sota-implementations/crossq/crossq.py +271 -0
- sota-implementations/crossq/utils.py +320 -0
- sota-implementations/ddpg/ddpg.py +231 -0
- sota-implementations/ddpg/utils.py +325 -0
- sota-implementations/decision_transformer/dt.py +163 -0
- sota-implementations/decision_transformer/lamb.py +167 -0
- sota-implementations/decision_transformer/online_dt.py +178 -0
- sota-implementations/decision_transformer/utils.py +562 -0
- sota-implementations/discrete_sac/discrete_sac.py +243 -0
- sota-implementations/discrete_sac/utils.py +324 -0
- sota-implementations/dqn/README.md +30 -0
- sota-implementations/dqn/dqn_atari.py +272 -0
- sota-implementations/dqn/dqn_cartpole.py +236 -0
- sota-implementations/dqn/utils_atari.py +132 -0
- sota-implementations/dqn/utils_cartpole.py +90 -0
- sota-implementations/dreamer/README.md +129 -0
- sota-implementations/dreamer/dreamer.py +586 -0
- sota-implementations/dreamer/dreamer_utils.py +1107 -0
- sota-implementations/expert-iteration/README.md +352 -0
- sota-implementations/expert-iteration/ei_utils.py +770 -0
- sota-implementations/expert-iteration/expert-iteration-async.py +512 -0
- sota-implementations/expert-iteration/expert-iteration-sync.py +508 -0
- sota-implementations/expert-iteration/requirements_gsm8k.txt +13 -0
- sota-implementations/expert-iteration/requirements_ifeval.txt +16 -0
- sota-implementations/gail/gail.py +327 -0
- sota-implementations/gail/gail_utils.py +68 -0
- sota-implementations/gail/ppo_utils.py +157 -0
- sota-implementations/grpo/README.md +273 -0
- sota-implementations/grpo/grpo-async.py +437 -0
- sota-implementations/grpo/grpo-sync.py +435 -0
- sota-implementations/grpo/grpo_utils.py +843 -0
- sota-implementations/grpo/requirements_gsm8k.txt +11 -0
- sota-implementations/grpo/requirements_ifeval.txt +16 -0
- sota-implementations/impala/README.md +33 -0
- sota-implementations/impala/impala_multi_node_ray.py +292 -0
- sota-implementations/impala/impala_multi_node_submitit.py +284 -0
- sota-implementations/impala/impala_single_node.py +261 -0
- sota-implementations/impala/utils.py +184 -0
- sota-implementations/iql/discrete_iql.py +230 -0
- sota-implementations/iql/iql_offline.py +164 -0
- sota-implementations/iql/iql_online.py +225 -0
- sota-implementations/iql/utils.py +437 -0
- sota-implementations/multiagent/README.md +74 -0
- sota-implementations/multiagent/iql.py +237 -0
- sota-implementations/multiagent/maddpg_iddpg.py +266 -0
- sota-implementations/multiagent/mappo_ippo.py +267 -0
- sota-implementations/multiagent/qmix_vdn.py +271 -0
- sota-implementations/multiagent/sac.py +337 -0
- sota-implementations/multiagent/utils/__init__.py +4 -0
- sota-implementations/multiagent/utils/logging.py +151 -0
- sota-implementations/multiagent/utils/utils.py +43 -0
- sota-implementations/ppo/README.md +29 -0
- sota-implementations/ppo/ppo_atari.py +305 -0
- sota-implementations/ppo/ppo_mujoco.py +293 -0
- sota-implementations/ppo/utils_atari.py +238 -0
- sota-implementations/ppo/utils_mujoco.py +152 -0
- sota-implementations/ppo_trainer/train.py +21 -0
- sota-implementations/redq/README.md +7 -0
- sota-implementations/redq/redq.py +199 -0
- sota-implementations/redq/utils.py +1060 -0
- sota-implementations/sac/sac-async.py +266 -0
- sota-implementations/sac/sac.py +239 -0
- sota-implementations/sac/utils.py +381 -0
- sota-implementations/sac_trainer/train.py +16 -0
- sota-implementations/td3/td3.py +254 -0
- sota-implementations/td3/utils.py +319 -0
- sota-implementations/td3_bc/td3_bc.py +177 -0
- sota-implementations/td3_bc/utils.py +251 -0
- torchrl/__init__.py +144 -0
- torchrl/_extension.py +74 -0
- torchrl/_torchrl.cp314-win_amd64.pyd +0 -0
- torchrl/_utils.py +1431 -0
- torchrl/collectors/__init__.py +48 -0
- torchrl/collectors/_base.py +1058 -0
- torchrl/collectors/_constants.py +88 -0
- torchrl/collectors/_multi_async.py +324 -0
- torchrl/collectors/_multi_base.py +1805 -0
- torchrl/collectors/_multi_sync.py +464 -0
- torchrl/collectors/_runner.py +581 -0
- torchrl/collectors/_single.py +2009 -0
- torchrl/collectors/_single_async.py +259 -0
- torchrl/collectors/collectors.py +62 -0
- torchrl/collectors/distributed/__init__.py +32 -0
- torchrl/collectors/distributed/default_configs.py +133 -0
- torchrl/collectors/distributed/generic.py +1306 -0
- torchrl/collectors/distributed/ray.py +1092 -0
- torchrl/collectors/distributed/rpc.py +1006 -0
- torchrl/collectors/distributed/sync.py +731 -0
- torchrl/collectors/distributed/utils.py +160 -0
- torchrl/collectors/llm/__init__.py +10 -0
- torchrl/collectors/llm/base.py +494 -0
- torchrl/collectors/llm/ray_collector.py +275 -0
- torchrl/collectors/llm/utils.py +36 -0
- torchrl/collectors/llm/weight_update/__init__.py +10 -0
- torchrl/collectors/llm/weight_update/vllm.py +348 -0
- torchrl/collectors/llm/weight_update/vllm_v2.py +311 -0
- torchrl/collectors/utils.py +433 -0
- torchrl/collectors/weight_update.py +591 -0
- torchrl/csrc/numpy_utils.h +38 -0
- torchrl/csrc/pybind.cpp +27 -0
- torchrl/csrc/segment_tree.h +458 -0
- torchrl/csrc/torch_utils.h +34 -0
- torchrl/csrc/utils.cpp +48 -0
- torchrl/csrc/utils.h +31 -0
- torchrl/data/__init__.py +187 -0
- torchrl/data/datasets/__init__.py +58 -0
- torchrl/data/datasets/atari_dqn.py +878 -0
- torchrl/data/datasets/common.py +281 -0
- torchrl/data/datasets/d4rl.py +489 -0
- torchrl/data/datasets/d4rl_infos.py +187 -0
- torchrl/data/datasets/gen_dgrl.py +375 -0
- torchrl/data/datasets/minari_data.py +643 -0
- torchrl/data/datasets/openml.py +177 -0
- torchrl/data/datasets/openx.py +798 -0
- torchrl/data/datasets/roboset.py +363 -0
- torchrl/data/datasets/utils.py +11 -0
- torchrl/data/datasets/vd4rl.py +432 -0
- torchrl/data/llm/__init__.py +34 -0
- torchrl/data/llm/dataset.py +491 -0
- torchrl/data/llm/history.py +1378 -0
- torchrl/data/llm/prompt.py +198 -0
- torchrl/data/llm/reward.py +225 -0
- torchrl/data/llm/topk.py +186 -0
- torchrl/data/llm/utils.py +543 -0
- torchrl/data/map/__init__.py +21 -0
- torchrl/data/map/hash.py +185 -0
- torchrl/data/map/query.py +204 -0
- torchrl/data/map/tdstorage.py +363 -0
- torchrl/data/map/tree.py +1434 -0
- torchrl/data/map/utils.py +103 -0
- torchrl/data/postprocs/__init__.py +8 -0
- torchrl/data/postprocs/postprocs.py +391 -0
- torchrl/data/replay_buffers/__init__.py +99 -0
- torchrl/data/replay_buffers/checkpointers.py +622 -0
- torchrl/data/replay_buffers/ray_buffer.py +292 -0
- torchrl/data/replay_buffers/replay_buffers.py +2376 -0
- torchrl/data/replay_buffers/samplers.py +2578 -0
- torchrl/data/replay_buffers/scheduler.py +265 -0
- torchrl/data/replay_buffers/storages.py +2412 -0
- torchrl/data/replay_buffers/utils.py +1042 -0
- torchrl/data/replay_buffers/writers.py +781 -0
- torchrl/data/tensor_specs.py +7101 -0
- torchrl/data/utils.py +334 -0
- torchrl/envs/__init__.py +265 -0
- torchrl/envs/async_envs.py +1105 -0
- torchrl/envs/batched_envs.py +3093 -0
- torchrl/envs/common.py +4241 -0
- torchrl/envs/custom/__init__.py +11 -0
- torchrl/envs/custom/chess.py +617 -0
- torchrl/envs/custom/llm.py +214 -0
- torchrl/envs/custom/pendulum.py +401 -0
- torchrl/envs/custom/san_moves.txt +29274 -0
- torchrl/envs/custom/tictactoeenv.py +288 -0
- torchrl/envs/env_creator.py +263 -0
- torchrl/envs/gym_like.py +752 -0
- torchrl/envs/libs/__init__.py +68 -0
- torchrl/envs/libs/_gym_utils.py +326 -0
- torchrl/envs/libs/brax.py +846 -0
- torchrl/envs/libs/dm_control.py +544 -0
- torchrl/envs/libs/envpool.py +447 -0
- torchrl/envs/libs/gym.py +2239 -0
- torchrl/envs/libs/habitat.py +138 -0
- torchrl/envs/libs/isaac_lab.py +87 -0
- torchrl/envs/libs/isaacgym.py +203 -0
- torchrl/envs/libs/jax_utils.py +166 -0
- torchrl/envs/libs/jumanji.py +963 -0
- torchrl/envs/libs/meltingpot.py +599 -0
- torchrl/envs/libs/openml.py +153 -0
- torchrl/envs/libs/openspiel.py +652 -0
- torchrl/envs/libs/pettingzoo.py +1042 -0
- torchrl/envs/libs/procgen.py +351 -0
- torchrl/envs/libs/robohive.py +429 -0
- torchrl/envs/libs/smacv2.py +645 -0
- torchrl/envs/libs/unity_mlagents.py +891 -0
- torchrl/envs/libs/utils.py +147 -0
- torchrl/envs/libs/vmas.py +813 -0
- torchrl/envs/llm/__init__.py +63 -0
- torchrl/envs/llm/chat.py +730 -0
- torchrl/envs/llm/datasets/README.md +4 -0
- torchrl/envs/llm/datasets/__init__.py +17 -0
- torchrl/envs/llm/datasets/gsm8k.py +353 -0
- torchrl/envs/llm/datasets/ifeval.py +274 -0
- torchrl/envs/llm/envs.py +789 -0
- torchrl/envs/llm/libs/README.md +3 -0
- torchrl/envs/llm/libs/__init__.py +8 -0
- torchrl/envs/llm/libs/mlgym.py +869 -0
- torchrl/envs/llm/reward/__init__.py +10 -0
- torchrl/envs/llm/reward/gsm8k.py +324 -0
- torchrl/envs/llm/reward/ifeval/README.md +13 -0
- torchrl/envs/llm/reward/ifeval/__init__.py +10 -0
- torchrl/envs/llm/reward/ifeval/_instructions.py +1667 -0
- torchrl/envs/llm/reward/ifeval/_instructions_main.py +131 -0
- torchrl/envs/llm/reward/ifeval/_instructions_registry.py +100 -0
- torchrl/envs/llm/reward/ifeval/_instructions_util.py +1677 -0
- torchrl/envs/llm/reward/ifeval/_scorer.py +454 -0
- torchrl/envs/llm/transforms/__init__.py +55 -0
- torchrl/envs/llm/transforms/browser.py +292 -0
- torchrl/envs/llm/transforms/dataloading.py +859 -0
- torchrl/envs/llm/transforms/format.py +73 -0
- torchrl/envs/llm/transforms/kl.py +1544 -0
- torchrl/envs/llm/transforms/policy_version.py +189 -0
- torchrl/envs/llm/transforms/reason.py +323 -0
- torchrl/envs/llm/transforms/tokenizer.py +321 -0
- torchrl/envs/llm/transforms/tools.py +1955 -0
- torchrl/envs/model_based/__init__.py +9 -0
- torchrl/envs/model_based/common.py +180 -0
- torchrl/envs/model_based/dreamer.py +112 -0
- torchrl/envs/transforms/__init__.py +147 -0
- torchrl/envs/transforms/functional.py +48 -0
- torchrl/envs/transforms/gym_transforms.py +203 -0
- torchrl/envs/transforms/module.py +341 -0
- torchrl/envs/transforms/r3m.py +372 -0
- torchrl/envs/transforms/ray_service.py +663 -0
- torchrl/envs/transforms/rb_transforms.py +214 -0
- torchrl/envs/transforms/transforms.py +11835 -0
- torchrl/envs/transforms/utils.py +94 -0
- torchrl/envs/transforms/vc1.py +307 -0
- torchrl/envs/transforms/vecnorm.py +845 -0
- torchrl/envs/transforms/vip.py +407 -0
- torchrl/envs/utils.py +1718 -0
- torchrl/envs/vec_envs.py +11 -0
- torchrl/modules/__init__.py +206 -0
- torchrl/modules/distributions/__init__.py +73 -0
- torchrl/modules/distributions/continuous.py +830 -0
- torchrl/modules/distributions/discrete.py +908 -0
- torchrl/modules/distributions/truncated_normal.py +187 -0
- torchrl/modules/distributions/utils.py +233 -0
- torchrl/modules/llm/__init__.py +62 -0
- torchrl/modules/llm/backends/__init__.py +65 -0
- torchrl/modules/llm/backends/vllm/__init__.py +94 -0
- torchrl/modules/llm/backends/vllm/_models.py +46 -0
- torchrl/modules/llm/backends/vllm/base.py +72 -0
- torchrl/modules/llm/backends/vllm/vllm_async.py +2075 -0
- torchrl/modules/llm/backends/vllm/vllm_plugin.py +22 -0
- torchrl/modules/llm/backends/vllm/vllm_sync.py +446 -0
- torchrl/modules/llm/backends/vllm/vllm_utils.py +129 -0
- torchrl/modules/llm/policies/__init__.py +28 -0
- torchrl/modules/llm/policies/common.py +1809 -0
- torchrl/modules/llm/policies/transformers_wrapper.py +2756 -0
- torchrl/modules/llm/policies/vllm_wrapper.py +2241 -0
- torchrl/modules/llm/utils.py +23 -0
- torchrl/modules/mcts/__init__.py +21 -0
- torchrl/modules/mcts/scores.py +579 -0
- torchrl/modules/models/__init__.py +86 -0
- torchrl/modules/models/batchrenorm.py +119 -0
- torchrl/modules/models/decision_transformer.py +179 -0
- torchrl/modules/models/exploration.py +731 -0
- torchrl/modules/models/llm.py +156 -0
- torchrl/modules/models/model_based.py +596 -0
- torchrl/modules/models/models.py +1712 -0
- torchrl/modules/models/multiagent.py +1067 -0
- torchrl/modules/models/recipes/impala.py +185 -0
- torchrl/modules/models/utils.py +162 -0
- torchrl/modules/planners/__init__.py +10 -0
- torchrl/modules/planners/cem.py +228 -0
- torchrl/modules/planners/common.py +73 -0
- torchrl/modules/planners/mppi.py +265 -0
- torchrl/modules/tensordict_module/__init__.py +89 -0
- torchrl/modules/tensordict_module/actors.py +2457 -0
- torchrl/modules/tensordict_module/common.py +529 -0
- torchrl/modules/tensordict_module/exploration.py +814 -0
- torchrl/modules/tensordict_module/probabilistic.py +321 -0
- torchrl/modules/tensordict_module/rnn.py +1639 -0
- torchrl/modules/tensordict_module/sequence.py +132 -0
- torchrl/modules/tensordict_module/world_models.py +34 -0
- torchrl/modules/utils/__init__.py +38 -0
- torchrl/modules/utils/mappings.py +9 -0
- torchrl/modules/utils/utils.py +89 -0
- torchrl/objectives/__init__.py +78 -0
- torchrl/objectives/a2c.py +659 -0
- torchrl/objectives/common.py +753 -0
- torchrl/objectives/cql.py +1346 -0
- torchrl/objectives/crossq.py +710 -0
- torchrl/objectives/ddpg.py +453 -0
- torchrl/objectives/decision_transformer.py +371 -0
- torchrl/objectives/deprecated.py +516 -0
- torchrl/objectives/dqn.py +683 -0
- torchrl/objectives/dreamer.py +488 -0
- torchrl/objectives/functional.py +48 -0
- torchrl/objectives/gail.py +258 -0
- torchrl/objectives/iql.py +996 -0
- torchrl/objectives/llm/__init__.py +30 -0
- torchrl/objectives/llm/grpo.py +846 -0
- torchrl/objectives/llm/sft.py +482 -0
- torchrl/objectives/multiagent/__init__.py +8 -0
- torchrl/objectives/multiagent/qmixer.py +396 -0
- torchrl/objectives/ppo.py +1669 -0
- torchrl/objectives/redq.py +683 -0
- torchrl/objectives/reinforce.py +530 -0
- torchrl/objectives/sac.py +1580 -0
- torchrl/objectives/td3.py +570 -0
- torchrl/objectives/td3_bc.py +625 -0
- torchrl/objectives/utils.py +782 -0
- torchrl/objectives/value/__init__.py +28 -0
- torchrl/objectives/value/advantages.py +1956 -0
- torchrl/objectives/value/functional.py +1459 -0
- torchrl/objectives/value/utils.py +360 -0
- torchrl/record/__init__.py +17 -0
- torchrl/record/loggers/__init__.py +23 -0
- torchrl/record/loggers/common.py +48 -0
- torchrl/record/loggers/csv.py +226 -0
- torchrl/record/loggers/mlflow.py +142 -0
- torchrl/record/loggers/tensorboard.py +139 -0
- torchrl/record/loggers/trackio.py +163 -0
- torchrl/record/loggers/utils.py +78 -0
- torchrl/record/loggers/wandb.py +214 -0
- torchrl/record/recorder.py +554 -0
- torchrl/services/__init__.py +79 -0
- torchrl/services/base.py +109 -0
- torchrl/services/ray_service.py +453 -0
- torchrl/testing/__init__.py +107 -0
- torchrl/testing/assertions.py +179 -0
- torchrl/testing/dist_utils.py +122 -0
- torchrl/testing/env_creators.py +227 -0
- torchrl/testing/env_helper.py +35 -0
- torchrl/testing/gym_helpers.py +156 -0
- torchrl/testing/llm_mocks.py +119 -0
- torchrl/testing/mocking_classes.py +2720 -0
- torchrl/testing/modules.py +295 -0
- torchrl/testing/mp_helpers.py +15 -0
- torchrl/testing/ray_helpers.py +293 -0
- torchrl/testing/utils.py +190 -0
- torchrl/trainers/__init__.py +42 -0
- torchrl/trainers/algorithms/__init__.py +11 -0
- torchrl/trainers/algorithms/configs/__init__.py +705 -0
- torchrl/trainers/algorithms/configs/collectors.py +216 -0
- torchrl/trainers/algorithms/configs/common.py +41 -0
- torchrl/trainers/algorithms/configs/data.py +308 -0
- torchrl/trainers/algorithms/configs/envs.py +104 -0
- torchrl/trainers/algorithms/configs/envs_libs.py +361 -0
- torchrl/trainers/algorithms/configs/logging.py +80 -0
- torchrl/trainers/algorithms/configs/modules.py +570 -0
- torchrl/trainers/algorithms/configs/objectives.py +177 -0
- torchrl/trainers/algorithms/configs/trainers.py +340 -0
- torchrl/trainers/algorithms/configs/transforms.py +955 -0
- torchrl/trainers/algorithms/configs/utils.py +252 -0
- torchrl/trainers/algorithms/configs/weight_sync_schemes.py +191 -0
- torchrl/trainers/algorithms/configs/weight_update.py +159 -0
- torchrl/trainers/algorithms/ppo.py +373 -0
- torchrl/trainers/algorithms/sac.py +308 -0
- torchrl/trainers/helpers/__init__.py +40 -0
- torchrl/trainers/helpers/collectors.py +416 -0
- torchrl/trainers/helpers/envs.py +573 -0
- torchrl/trainers/helpers/logger.py +33 -0
- torchrl/trainers/helpers/losses.py +132 -0
- torchrl/trainers/helpers/models.py +658 -0
- torchrl/trainers/helpers/replay_buffer.py +59 -0
- torchrl/trainers/helpers/trainers.py +301 -0
- torchrl/trainers/trainers.py +2052 -0
- torchrl/weight_update/__init__.py +33 -0
- torchrl/weight_update/_distributed.py +749 -0
- torchrl/weight_update/_mp.py +624 -0
- torchrl/weight_update/_noupdate.py +102 -0
- torchrl/weight_update/_ray.py +1032 -0
- torchrl/weight_update/_rpc.py +284 -0
- torchrl/weight_update/_shared.py +891 -0
- torchrl/weight_update/llm/__init__.py +32 -0
- torchrl/weight_update/llm/vllm_double_buffer.py +370 -0
- torchrl/weight_update/llm/vllm_nccl.py +710 -0
- torchrl/weight_update/utils.py +73 -0
- torchrl/weight_update/weight_sync_schemes.py +1244 -0
- torchrl-0.11.0.dist-info/LICENSE +21 -0
- torchrl-0.11.0.dist-info/METADATA +1307 -0
- torchrl-0.11.0.dist-info/RECORD +394 -0
- torchrl-0.11.0.dist-info/WHEEL +5 -0
- torchrl-0.11.0.dist-info/entry_points.txt +2 -0
- torchrl-0.11.0.dist-info/top_level.txt +7 -0
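The +2075-line hunk below corresponds to torchrl/modules/llm/backends/vllm/vllm_async.py, which adds the AsyncVLLM Ray-based inference service. As a rough orientation before the full diff, here is a minimal usage sketch mirroring the docstring example inside that file; it assumes vllm and ray are installed and at least one GPU is available, and is illustrative only rather than part of the packaged code:

    # Illustrative sketch mirroring the AsyncVLLM docstring example in the diff below.
    # Assumes vllm and ray are installed and a GPU is available.
    from vllm import SamplingParams

    from torchrl.modules.llm import AsyncVLLM

    # Launch one engine replica on a single GPU.
    service = AsyncVLLM.from_pretrained("Qwen/Qwen2.5-3B")

    # Generate with the same call signature as vllm.LLM.generate.
    params = SamplingParams(temperature=0.7, max_tokens=100)
    result = service.generate("Hello, world!", params)
    print(result.outputs[0].text)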
@@ -0,0 +1,2075 @@
|
|
|
1
|
+
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
|
2
|
+
#
|
|
3
|
+
# This source code is licensed under the MIT license found in the
|
|
4
|
+
# LICENSE file in the root directory of this source tree.
|
|
5
|
+
|
|
6
|
+
"""Async vLLM engine implementation for efficient batching and inference.
|
|
7
|
+
|
|
8
|
+
This module provides an async vLLM engine that leverages native vLLM batching
|
|
9
|
+
for better performance and memory efficiency compared to the explicit batching
|
|
10
|
+
approach used in the legacy vLLM backend.
|
|
11
|
+
"""
|
|
12
|
+
|
|
13
|
+
from __future__ import annotations
|
|
14
|
+
|
|
15
|
+
import asyncio
|
|
16
|
+
import os
|
|
17
|
+
import random
|
|
18
|
+
import time
|
|
19
|
+
import uuid
|
|
20
|
+
from collections.abc import Iterator, Sequence
|
|
21
|
+
from concurrent.futures import ThreadPoolExecutor, wait
|
|
22
|
+
from typing import Any, Literal, TYPE_CHECKING
|
|
23
|
+
|
|
24
|
+
import torch
|
|
25
|
+
|
|
26
|
+
from torchrl._utils import logger as torchrl_logger
|
|
27
|
+
|
|
28
|
+
# Import RLvLLMEngine and shared utilities
|
|
29
|
+
from .base import RLvLLMEngine
|
|
30
|
+
from .vllm_utils import stateless_init_process_group
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
_has_vllm = True
|
|
34
|
+
|
|
35
|
+
if TYPE_CHECKING:
|
|
36
|
+
from vllm.engine.async_llm_engine import AsyncEngineArgs
|
|
37
|
+
from vllm.engine.request import RequestOutput
|
|
38
|
+
from vllm.engine.sampling_params import SamplingParams
|
|
39
|
+
|
|
40
|
+
TIMEOUT_SECONDS = os.getenv("TORCHRL_VLLM_TIMEOUT_SECONDS", 300)
|
|
41
|
+
|
|
42
|
+
try:
|
|
43
|
+
import vllm
|
|
44
|
+
|
|
45
|
+
_has_vllm = True
|
|
46
|
+
except ImportError:
|
|
47
|
+
vllm = None
|
|
48
|
+
_has_vllm = False
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
def _get_ray():
|
|
52
|
+
"""Import Ray on demand to avoid global import side-effects.
|
|
53
|
+
|
|
54
|
+
Returns:
|
|
55
|
+
ModuleType: The imported Ray module.
|
|
56
|
+
|
|
57
|
+
Raises:
|
|
58
|
+
ImportError: If Ray is not installed.
|
|
59
|
+
"""
|
|
60
|
+
try:
|
|
61
|
+
import ray # type: ignore
|
|
62
|
+
|
|
63
|
+
return ray
|
|
64
|
+
except Exception as e: # pragma: no cover - surfaced to callers
|
|
65
|
+
raise ImportError(
|
|
66
|
+
"ray is not installed. Please install it with `pip install ray`."
|
|
67
|
+
) from e
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
class _AsyncvLLMWorker:
|
|
71
|
+
"""Async vLLM worker extension for Ray with weight update capabilities."""
|
|
72
|
+
|
|
73
|
+
def init_weight_update_group(
|
|
74
|
+
self,
|
|
75
|
+
master_address: str,
|
|
76
|
+
master_port: str,
|
|
77
|
+
rank_offset: int,
|
|
78
|
+
world_size: int,
|
|
79
|
+
):
|
|
80
|
+
"""Initialize weight update group for this worker (non-blocking).
|
|
81
|
+
|
|
82
|
+
This method starts NCCL initialization in a background thread and returns immediately,
|
|
83
|
+
allowing the RPC to complete. The NCCL collective will complete when the trainer joins.
|
|
84
|
+
|
|
85
|
+
Args:
|
|
86
|
+
master_address (str): The master address for distributed training.
|
|
87
|
+
master_port (str): The master port for distributed training.
|
|
88
|
+
rank_offset (int): Rank offset for this worker in the global weight update group.
|
|
89
|
+
world_size (int): Total number of processes in the weight update group.
|
|
90
|
+
"""
|
|
91
|
+
import threading
|
|
92
|
+
|
|
93
|
+
from vllm.distributed.parallel_state import get_world_group
|
|
94
|
+
|
|
95
|
+
torchrl_logger.info(f"=> in {type(self).__name__}.init_weight_update_group")
|
|
96
|
+
if getattr(self, "model_update_group", None) is not None:
|
|
97
|
+
torchrl_logger.info("Model update group already initialized")
|
|
98
|
+
return
|
|
99
|
+
|
|
100
|
+
# Get the local rank within the tensor parallel group
|
|
101
|
+
tp_group = get_world_group()
|
|
102
|
+
local_rank = tp_group.rank
|
|
103
|
+
torchrl_logger.info(f"Local rank in tensor parallel group: {local_rank}")
|
|
104
|
+
|
|
105
|
+
# Calculate the global rank for weight update group
|
|
106
|
+
rank = local_rank + rank_offset
|
|
107
|
+
torchrl_logger.info(
|
|
108
|
+
f"Starting {type(self).__name__} weight update group init (non-blocking) with "
|
|
109
|
+
f"{master_address=}, {master_port=}, {rank=}, {world_size=}, device={self.device}"
|
|
110
|
+
)
|
|
111
|
+
|
|
112
|
+
# Start NCCL init in a background thread so this RPC can return immediately
|
|
113
|
+
def _init_nccl_background():
|
|
114
|
+
try:
|
|
115
|
+
from .vllm_utils import stateless_init_process_group
|
|
116
|
+
|
|
117
|
+
torchrl_logger.info(
|
|
118
|
+
f"Worker rank {rank}: Starting NCCL init (will block until collective completes)..."
|
|
119
|
+
)
|
|
120
|
+
self.model_update_group = stateless_init_process_group(
|
|
121
|
+
master_address, master_port, rank, world_size, self.device
|
|
122
|
+
)
|
|
123
|
+
torchrl_logger.info(f"Worker rank {rank}: NCCL init complete!")
|
|
124
|
+
except Exception as e:
|
|
125
|
+
torchrl_logger.error(f"Worker rank {rank}: NCCL init failed: {e}")
|
|
126
|
+
raise
|
|
127
|
+
|
|
128
|
+
thread = threading.Thread(target=_init_nccl_background, daemon=False)
|
|
129
|
+
thread.start()
|
|
130
|
+
|
|
131
|
+
# Store thread reference for potential cleanup
|
|
132
|
+
self._nccl_init_thread = thread
|
|
133
|
+
|
|
134
|
+
torchrl_logger.info(
|
|
135
|
+
f"{type(self).__name__}.init_weight_update_group dispatched (non-blocking)"
|
|
136
|
+
)
|
|
137
|
+
|
|
138
|
+
def update_weight(self, name: str, dtype_name: str, shape: tuple[int, ...]):
|
|
139
|
+
"""Update weight via broadcast from master (rank 0) - periodic-mono pattern.
|
|
140
|
+
|
|
141
|
+
Args:
|
|
142
|
+
name (str): Parameter name.
|
|
143
|
+
dtype_name (str): Parameter dtype name (e.g., 'bfloat16').
|
|
144
|
+
shape (tuple[int, ...]): Parameter shape.
|
|
145
|
+
"""
|
|
146
|
+
if self.model_update_group is None:
|
|
147
|
+
raise RuntimeError("Weight update group not initialized")
|
|
148
|
+
|
|
149
|
+
# Convert dtype name to dtype (like periodic-mono)
|
|
150
|
+
dtype = getattr(torch, dtype_name)
|
|
151
|
+
|
|
152
|
+
# Workers receive broadcast from master (rank 0)
|
|
153
|
+
weight = torch.empty(shape, dtype=dtype, device="cuda")
|
|
154
|
+
self.model_update_group.broadcast(
|
|
155
|
+
weight, src=0, stream=torch.cuda.current_stream()
|
|
156
|
+
)
|
|
157
|
+
self.model_runner.model.load_weights(weights=[(name, weight)])
|
|
158
|
+
del weight
|
|
159
|
+
|
|
160
|
+
def check_nccl_group_ready(self):
|
|
161
|
+
"""Check if NCCL group is ready for communication."""
|
|
162
|
+
ready = self.model_update_group is not None
|
|
163
|
+
torchrl_logger.info(f"Worker NCCL group ready: {ready}")
|
|
164
|
+
return ready
|
|
165
|
+
|
|
166
|
+
def load_weights_from_storage(self, storage_path: str, num_threads: int = 1):
|
|
167
|
+
"""Load weights from shared storage (double-buffer approach).
|
|
168
|
+
|
|
169
|
+
This method reads weights from a memory-mapped TensorDict directory
|
|
170
|
+
and loads them into the model. Used for file-based weight synchronization
|
|
171
|
+
as an alternative to NCCL collectives.
|
|
172
|
+
|
|
173
|
+
Args:
|
|
174
|
+
storage_path: Path to the directory containing memory-mapped weights
|
|
175
|
+
num_threads: Number of threads for reading (default: 1)
|
|
176
|
+
"""
|
|
177
|
+
from tensordict import TensorDict
|
|
178
|
+
|
|
179
|
+
torchrl_logger.info(f"Worker loading weights from {storage_path}")
|
|
180
|
+
|
|
181
|
+
# Read weights from shared storage
|
|
182
|
+
weights = TensorDict.load_memmap(storage_path)
|
|
183
|
+
weights = weights.flatten_keys(".")
|
|
184
|
+
|
|
185
|
+
# Convert to list of (name, tensor) tuples
|
|
186
|
+
weights_list = list(weights.items())
|
|
187
|
+
|
|
188
|
+
torchrl_logger.info(f"Worker loading {len(weights_list)} weights into model")
|
|
189
|
+
|
|
190
|
+
with ThreadPoolExecutor(max_workers=num_threads) as executor:
|
|
191
|
+
futures = [
|
|
192
|
+
executor.submit(self.model_runner.model.load_weights, weights)
|
|
193
|
+
for weights in weights_list
|
|
194
|
+
]
|
|
195
|
+
wait(futures)
|
|
196
|
+
|
|
197
|
+
torchrl_logger.info(
|
|
198
|
+
f"Worker successfully loaded {len(weights_list)} weights from storage"
|
|
199
|
+
)
|
|
200
|
+
|
|
201
|
+
|
|
202
|
+
class _AsyncLLMEngine:
|
|
203
|
+
"""Extended AsyncLLMEngine with TorchRL-specific features.
|
|
204
|
+
|
|
205
|
+
This class wraps vLLM's AsyncLLMEngine and adds functionality needed
|
|
206
|
+
for TorchRL integration, including weight updates and batch management.
|
|
207
|
+
|
|
208
|
+
This is a private class and should not be used directly. Use the ray remote actor class :class:`AsyncLLMEngineActor` instead.
|
|
209
|
+
|
|
210
|
+
Keyword Args:
|
|
211
|
+
engine_args (AsyncEngineArgs): Arguments for creating the AsyncLLMEngine instances.
|
|
212
|
+
bundle_indices (list[int], optional): Bundle indices for the engine.
|
|
213
|
+
enable_prefix_caching (bool, optional): Whether to enable prefix caching.
|
|
214
|
+
|
|
215
|
+
.. warning::
|
|
216
|
+
enable_prefix_caching is set to False by default, which is recommended if prompt log probs are needed.
|
|
217
|
+
Set it to True if prompt log probs are not needed.
|
|
218
|
+
See `this issue <https://github.com/vllm-project/vllm/issues/8268>`_ for more details.
|
|
219
|
+
"""
|
|
220
|
+
|
|
221
|
+
def __init__(
|
|
222
|
+
self,
|
|
223
|
+
*,
|
|
224
|
+
engine_args: AsyncEngineArgs,
|
|
225
|
+
bundle_indices: list[int] | None = None,
|
|
226
|
+
enable_prefix_caching: bool = False,
|
|
227
|
+
):
|
|
228
|
+
if not _has_vllm:
|
|
229
|
+
raise ImportError(
|
|
230
|
+
"vllm is not installed. Please install it with `pip install vllm`."
|
|
231
|
+
)
|
|
232
|
+
|
|
233
|
+
from vllm import AsyncLLMEngine
|
|
234
|
+
|
|
235
|
+
if bundle_indices is not None:
|
|
236
|
+
os.environ["VLLM_RAY_BUNDLE_INDICES"] = ",".join(map(str, bundle_indices))
|
|
237
|
+
|
|
238
|
+
engine_args.enable_prefix_caching = enable_prefix_caching
|
|
239
|
+
|
|
240
|
+
# Fix for vLLM issue #19123: Set RAY_ADDRESS so vLLM subprocesses connect
|
|
241
|
+
# to the same Ray cluster instead of starting a new one (causes KeyError: 'bundles')
|
|
242
|
+
try:
|
|
243
|
+
import ray
|
|
244
|
+
|
|
245
|
+
if ray.is_initialized():
|
|
246
|
+
# Get the current Ray address and set it in the environment
|
|
247
|
+
# so vLLM's subprocess connects to the same cluster
|
|
248
|
+
ray_address = ray.get_runtime_context().gcs_address
|
|
249
|
+
if ray_address and "RAY_ADDRESS" not in os.environ:
|
|
250
|
+
os.environ["RAY_ADDRESS"] = ray_address
|
|
251
|
+
torchrl_logger.debug(
|
|
252
|
+
f"Set RAY_ADDRESS={ray_address} for vLLM subprocess"
|
|
253
|
+
)
|
|
254
|
+
except Exception:
|
|
255
|
+
pass # Ray not available or not initialized, let vLLM handle it
|
|
256
|
+
|
|
257
|
+
# Create the engine directly - this is the source of the blocking ray.get issue
|
|
258
|
+
# but we need to handle it differently for multiple replicas
|
|
259
|
+
self.engine = AsyncLLMEngine.from_engine_args(engine_args)
|
|
260
|
+
self.bundle_indices = bundle_indices
|
|
261
|
+
|
|
262
|
+
def ready(self) -> bool:
|
|
263
|
+
"""Check if engine is ready for inference."""
|
|
264
|
+
return True
|
|
265
|
+
|
|
266
|
+
async def generate(
|
|
267
|
+
self,
|
|
268
|
+
prompts: Any = None,
|
|
269
|
+
sampling_params: SamplingParams | None = None,
|
|
270
|
+
*,
|
|
271
|
+
prompt_token_ids: list[int] | list[list[int]] | None = None,
|
|
272
|
+
use_tqdm: bool = True,
|
|
273
|
+
lora_request: Any = None,
|
|
274
|
+
prompt_adapter_request: Any = None,
|
|
275
|
+
guided_options_request: Any = None,
|
|
276
|
+
timeout_seconds: float | None = None,
|
|
277
|
+
) -> RequestOutput | list[RequestOutput]:
|
|
278
|
+
"""Generate text with the same interface as vLLM.LLM.generate.
|
|
279
|
+
|
|
280
|
+
This method mirrors the interface of vLLM.LLM.generate to provide seamless
|
|
281
|
+
compatibility between sync and async engines.
|
|
282
|
+
|
|
283
|
+
Args:
|
|
284
|
+
prompts: String, TokensPrompt, or list of these. Input prompts for generation.
|
|
285
|
+
sampling_params: SamplingParams object for controlling generation behavior.
|
|
286
|
+
prompt_token_ids: Alternative to prompts - token IDs for generation.
|
|
287
|
+
use_tqdm: Whether to show progress bar (not used in async engine).
|
|
288
|
+
lora_request: LoRA request for adapter-based generation.
|
|
289
|
+
guided_options_request: Guided decoding options.
|
|
290
|
+
timeout_seconds: Timeout for generation in seconds.
|
|
291
|
+
|
|
292
|
+
Returns:
|
|
293
|
+
RequestOutput or list of RequestOutput: Generated outputs from vLLM.
|
|
294
|
+
"""
|
|
295
|
+
if not _has_vllm:
|
|
296
|
+
raise ImportError(
|
|
297
|
+
"vllm is not installed. Please install it with `pip install vllm`."
|
|
298
|
+
)
|
|
299
|
+
|
|
300
|
+
from vllm import SamplingParams, TokensPrompt
|
|
301
|
+
|
|
302
|
+
# Track whether input was originally a single prompt
|
|
303
|
+
single_prompt_input = False
|
|
304
|
+
|
|
305
|
+
# Handle prompt_token_ids if provided
|
|
306
|
+
if prompt_token_ids is not None:
|
|
307
|
+
if prompts is not None:
|
|
308
|
+
raise ValueError("Cannot specify both prompts and prompt_token_ids")
|
|
309
|
+
|
|
310
|
+
# Convert token IDs to TokensPrompt objects
|
|
311
|
+
if not prompt_token_ids:
|
|
312
|
+
raise ValueError("prompt_token_ids cannot be empty")
|
|
313
|
+
|
|
314
|
+
# Check if it's a list of lists or a single list
|
|
315
|
+
if prompt_token_ids and isinstance(prompt_token_ids[0], list):
|
|
316
|
+
# List of token ID lists
|
|
317
|
+
prompts = [
|
|
318
|
+
TokensPrompt(prompt_token_ids=tokens) for tokens in prompt_token_ids
|
|
319
|
+
]
|
|
320
|
+
else:
|
|
321
|
+
# Single token ID list - cast to ensure type compatibility
|
|
322
|
+
token_list = list(prompt_token_ids) if prompt_token_ids else []
|
|
323
|
+
prompts = TokensPrompt(prompt_token_ids=token_list)
|
|
324
|
+
single_prompt_input = True
|
|
325
|
+
|
|
326
|
+
elif prompts is None:
|
|
327
|
+
raise ValueError("Must specify either prompts or prompt_token_ids")
|
|
328
|
+
else:
|
|
329
|
+
# prompts was provided directly
|
|
330
|
+
if not isinstance(prompts, (list, tuple)):
|
|
331
|
+
single_prompt_input = True
|
|
332
|
+
|
|
333
|
+
# Default sampling params if not provided
|
|
334
|
+
if sampling_params is None:
|
|
335
|
+
sampling_params = SamplingParams()
|
|
336
|
+
|
|
337
|
+
async def _gen_one(prompt) -> RequestOutput:
|
|
338
|
+
request_id = str(uuid.uuid4())
|
|
339
|
+
final = None
|
|
340
|
+
|
|
341
|
+
# Build kwargs for engine.generate
|
|
342
|
+
gen_kwargs = {
|
|
343
|
+
"prompt": prompt,
|
|
344
|
+
"sampling_params": sampling_params,
|
|
345
|
+
"request_id": request_id,
|
|
346
|
+
}
|
|
347
|
+
|
|
348
|
+
# Add optional parameters if provided
|
|
349
|
+
if lora_request is not None:
|
|
350
|
+
gen_kwargs["lora_request"] = lora_request
|
|
351
|
+
if prompt_adapter_request is not None:
|
|
352
|
+
gen_kwargs["prompt_adapter_request"] = prompt_adapter_request
|
|
353
|
+
if guided_options_request is not None:
|
|
354
|
+
gen_kwargs["guided_options_request"] = guided_options_request
|
|
355
|
+
|
|
356
|
+
async for output in self.engine.generate(**gen_kwargs):
|
|
357
|
+
if output.finished:
|
|
358
|
+
final = output
|
|
359
|
+
assert final is not None
|
|
360
|
+
return final
|
|
361
|
+
|
|
362
|
+
async def _run_generation():
|
|
363
|
+
if single_prompt_input:
|
|
364
|
+
return await _gen_one(prompts)
|
|
365
|
+
|
|
366
|
+
# List of prompts: run concurrently
|
|
367
|
+
tasks = [asyncio.create_task(_gen_one(p)) for p in prompts]
|
|
368
|
+
results = await asyncio.gather(*tasks)
|
|
369
|
+
return results
|
|
370
|
+
|
|
371
|
+
try:
|
|
372
|
+
if timeout_seconds is not None and timeout_seconds > 0:
|
|
373
|
+
return await asyncio.wait_for(
|
|
374
|
+
_run_generation(), timeout=timeout_seconds
|
|
375
|
+
)
|
|
376
|
+
else:
|
|
377
|
+
return await _run_generation()
|
|
378
|
+
except TimeoutError:
|
|
379
|
+
# Best-effort cleanup
|
|
380
|
+
try:
|
|
381
|
+
abort_fn = getattr(self.engine, "abort", None)
|
|
382
|
+
if callable(abort_fn):
|
|
383
|
+
# We can't easily track all request IDs, so this is best-effort
|
|
384
|
+
pass
|
|
385
|
+
except Exception:
|
|
386
|
+
pass
|
|
387
|
+
raise TimeoutError(
|
|
388
|
+
f"vLLM generation timed out after {timeout_seconds} seconds"
|
|
389
|
+
)
|
|
390
|
+
|
|
391
|
+
async def get_tokenizer(self):
|
|
392
|
+
"""Get the tokenizer from the engine."""
|
|
393
|
+
return await self.engine.get_tokenizer()
|
|
394
|
+
|
|
395
|
+
async def collective_rpc_v1(
|
|
396
|
+
self,
|
|
397
|
+
method: str,
|
|
398
|
+
timeout: float | None = None,
|
|
399
|
+
args: tuple = (),
|
|
400
|
+
kwargs: dict | None = None,
|
|
401
|
+
):
|
|
402
|
+
"""Perform a collective RPC call to the given method (vLLM V1).
|
|
403
|
+
|
|
404
|
+
Args:
|
|
405
|
+
method (str): Method name to call.
|
|
406
|
+
timeout (float | None): Timeout for the RPC call.
|
|
407
|
+
args (tuple): Arguments to pass to the method.
|
|
408
|
+
kwargs (dict | None): Keyword arguments to pass to the method.
|
|
409
|
+
"""
|
|
410
|
+
from vllm import envs
|
|
411
|
+
|
|
412
|
+
if envs and envs.VLLM_USE_V1:
|
|
413
|
+
return await self.engine.collective_rpc(method, timeout, args, kwargs)
|
|
414
|
+
else:
|
|
415
|
+
return self.engine.engine.collective_rpc(method, timeout, args, kwargs)
|
|
416
|
+
|
|
417
|
+
def collective_rpc_v0(
|
|
418
|
+
self,
|
|
419
|
+
method: str,
|
|
420
|
+
timeout: float | None = None,
|
|
421
|
+
args: tuple = (),
|
|
422
|
+
kwargs: dict | None = None,
|
|
423
|
+
):
|
|
424
|
+
"""Perform a collective RPC call to the given method (vLLM V0).
|
|
425
|
+
|
|
426
|
+
Args:
|
|
427
|
+
method (str): Method name to call.
|
|
428
|
+
timeout (float | None): Timeout for the RPC call.
|
|
429
|
+
args (tuple): Arguments to pass to the method.
|
|
430
|
+
kwargs (dict | None): Keyword arguments to pass to the method.
|
|
431
|
+
"""
|
|
432
|
+
return self.engine.engine.collective_rpc(method, timeout, args, kwargs)
|
|
433
|
+
|
|
434
|
+
def get_num_unfinished_requests(self) -> int:
|
|
435
|
+
"""Get the number of unfinished requests in the engine.
|
|
436
|
+
|
|
437
|
+
Returns:
|
|
438
|
+
int: Number of unfinished requests.
|
|
439
|
+
"""
|
|
440
|
+
try:
|
|
441
|
+
# Try to access the method directly if available
|
|
442
|
+
if hasattr(self.engine, "get_num_unfinished_requests"):
|
|
443
|
+
return self.engine.get_num_unfinished_requests()
|
|
444
|
+
# Fallback to accessing through engine.engine for v0
|
|
445
|
+
elif hasattr(self.engine, "engine") and hasattr(
|
|
446
|
+
self.engine.engine, "get_num_unfinished_requests"
|
|
447
|
+
):
|
|
448
|
+
return self.engine.engine.get_num_unfinished_requests()
|
|
449
|
+
else:
|
|
450
|
+
# If method not available, return 0 as fallback
|
|
451
|
+
torchrl_logger.warning(
|
|
452
|
+
"get_num_unfinished_requests not available, returning 0"
|
|
453
|
+
)
|
|
454
|
+
return 0
|
|
455
|
+
except Exception as e:
|
|
456
|
+
torchrl_logger.warning(f"Error getting unfinished requests count: {e}")
|
|
457
|
+
return 0
|
|
458
|
+
|
|
459
|
+
def get_cache_usage(self) -> float:
|
|
460
|
+
"""Get the KV cache usage as a fraction between 0 and 1.
|
|
461
|
+
|
|
462
|
+
Returns:
|
|
463
|
+
float: Cache usage fraction (0.0 = empty, 1.0 = full).
|
|
464
|
+
"""
|
|
465
|
+
try:
|
|
466
|
+
# Try to get cache usage from the engine
|
|
467
|
+
if hasattr(self.engine, "engine") and hasattr(
|
|
468
|
+
self.engine.engine, "cache_config"
|
|
469
|
+
):
|
|
470
|
+
# Access the LLM engine's cache information
|
|
471
|
+
cache_config = self.engine.engine.cache_config
|
|
472
|
+
if hasattr(cache_config, "cache_usage"):
|
|
473
|
+
return cache_config.cache_usage
|
|
474
|
+
elif hasattr(self.engine.engine, "scheduler"):
|
|
475
|
+
# Try to get usage from the scheduler
|
|
476
|
+
scheduler = self.engine.engine.scheduler
|
|
477
|
+
if hasattr(scheduler, "get_num_free_gpu_blocks") and hasattr(
|
|
478
|
+
scheduler, "get_num_total_gpu_blocks"
|
|
479
|
+
):
|
|
480
|
+
free_blocks = scheduler.get_num_free_gpu_blocks()
|
|
481
|
+
total_blocks = scheduler.get_num_total_gpu_blocks()
|
|
482
|
+
if total_blocks > 0:
|
|
483
|
+
return 1.0 - (free_blocks / total_blocks)
|
|
484
|
+
# Fallback: return a random value for now (this should be replaced with actual metrics)
|
|
485
|
+
torchrl_logger.warning(
|
|
486
|
+
"Cache usage metrics not available, returning random value"
|
|
487
|
+
)
|
|
488
|
+
return (
|
|
489
|
+
random.random() * 0.5
|
|
490
|
+
) # Return a value between 0 and 0.5 to simulate partial usage
|
|
491
|
+
except Exception as e:
|
|
492
|
+
torchrl_logger.warning(f"Error getting cache usage: {e}")
|
|
493
|
+
return 0.0
|
|
494
|
+
|
|
495
|
+
|
|
496
|
+
def _gpus_per_replica(engine_args: AsyncEngineArgs) -> int:
|
|
497
|
+
"""Get the number of GPUs per replica for the given engine args."""
|
|
498
|
+
return (
|
|
499
|
+
engine_args.tensor_parallel_size
|
|
500
|
+
* getattr(engine_args, "data_parallel_size", 1) # Default to 1 if not present
|
|
501
|
+
* getattr(
|
|
502
|
+
engine_args, "pipeline_parallel_size", 1
|
|
503
|
+
) # Default to 1 if not present
|
|
504
|
+
)
|
|
505
|
+
|
|
506
|
+
|
|
507
|
+
# Ray actor wrapper is created lazily in __init__ to avoid global Ray import.
|
|
508
|
+
|
|
509
|
+
|
|
510
|
+
class AsyncVLLM(RLvLLMEngine):
|
|
511
|
+
"""A service that manages multiple async vLLM engine actors for distributed inference.
|
|
512
|
+
|
|
513
|
+
This is the main entry point for async vLLM inference in TorchRL. It manages multiple
|
|
514
|
+
vLLM engine replicas running as Ray actors, providing load balancing, weight updates,
|
|
515
|
+
and a unified interface for text generation.
|
|
516
|
+
|
|
517
|
+
The service automatically handles Ray actor lifecycle management, GPU allocation through
|
|
518
|
+
placement groups, and provides both synchronous and asynchronous generation interfaces
|
|
519
|
+
that are compatible with the standard vLLM API.
|
|
520
|
+
|
|
521
|
+
Args:
|
|
522
|
+
engine_args (AsyncEngineArgs): Configuration for the vLLM engines.
|
|
523
|
+
num_replicas (int, optional): Number of engine replicas to create. Defaults to 1.
|
|
524
|
+
actor_class (optional): Custom Ray actor class. Defaults to the internal actor implementation.
|
|
525
|
+
enable_prefix_caching (bool, optional): Whether to enable prefix caching. Defaults to False.
|
|
526
|
+
|
|
527
|
+
.. warning::
|
|
528
|
+
enable_prefix_caching is set to False by default, which is recommended if prompt log probs are needed.
|
|
529
|
+
Set it to True if prompt log probs are not needed.
|
|
530
|
+
See `this issue <https://github.com/vllm-project/vllm/issues/8268>`_ for more details.
|
|
531
|
+
|
|
532
|
+
Example:
|
|
533
|
+
>>> from torchrl.modules.llm import AsyncVLLM
|
|
534
|
+
>>> from vllm import SamplingParams
|
|
535
|
+
>>>
|
|
536
|
+
>>> # Simple usage - single GPU, single replica
|
|
537
|
+
>>> service = AsyncVLLM.from_pretrained("Qwen/Qwen2.5-3B")
|
|
538
|
+
>>>
|
|
539
|
+
>>> # Advanced usage - multi-GPU tensor parallel with multiple replicas
|
|
540
|
+
>>> service = AsyncVLLM.from_pretrained(
|
|
541
|
+
... "Qwen/Qwen2.5-7B",
|
|
542
|
+
... num_devices=2, # Use 2 GPUs for tensor parallelism
|
|
543
|
+
... num_replicas=2, # Create 2 replicas for higher throughput
|
|
544
|
+
... max_model_len=4096
|
|
545
|
+
... )
|
|
546
|
+
>>>
|
|
547
|
+
>>> # Generate text
|
|
548
|
+
>>> sampling_params = SamplingParams(temperature=0.7, max_tokens=100)
|
|
549
|
+
>>> result = service.generate("Hello, world!", sampling_params)
|
|
550
|
+
>>> print(result.outputs[0].text)
|
|
551
|
+
>>>
|
|
552
|
+
>>> # Alternative: using AsyncEngineArgs directly for advanced configuration
|
|
553
|
+
>>> from vllm import AsyncEngineArgs
|
|
554
|
+
>>> engine_args = AsyncEngineArgs(
|
|
555
|
+
... model="Qwen/Qwen2.5-3B",
|
|
556
|
+
... tensor_parallel_size=2
|
|
557
|
+
... )
|
|
558
|
+
>>> service = AsyncVLLM.launch(engine_args, num_replicas=2)
|
|
559
|
+
|
|
560
|
+
.. note::
|
|
561
|
+
**Architecture and Design**
|
|
562
|
+
|
|
563
|
+
The AsyncVLLM service implements a distributed inference architecture with the following key components:
|
|
564
|
+
|
|
565
|
+
1. **Ray Actor Management**: Each replica runs as a separate Ray actor with dedicated GPU resources.
|
|
566
|
+
The service creates a placement group to ensure optimal GPU allocation and co-location of
|
|
567
|
+
tensor-parallel workers on the same node when possible.
|
|
568
|
+
|
|
569
|
+
2. **Load Balancing**: Generation requests are distributed across replicas using random selection
|
|
570
|
+
by default, or can target specific replicas using the `actor_index` parameter.
|
|
571
|
+
|
|
572
|
+
3. **Weight Synchronization**: The service supports weight updates across all replicas through
|
|
573
|
+
NCCL communication groups, enabling integration with distributed training workflows.
|
|
574
|
+
|
|
575
|
+
4. **Resource Management**: Automatic GPU allocation and cleanup through Ray placement groups,
|
|
576
|
+
with proper shutdown procedures to prevent resource leaks.
|
|
577
|
+
|
|
578
|
+
5. **API Compatibility**: Provides the same interface as vLLM's synchronous `LLM.generate()`
|
|
579
|
+
method, making it a drop-in replacement for async workloads.
|
|
580
|
+
|
|
581
|
+
**Ray Integration**
|
|
582
|
+
|
|
583
|
+
The service leverages Ray's actor model for distributed execution. Each replica is an independent
|
|
584
|
+
Ray actor that can be scheduled on different nodes. The service handles actor lifecycle,
|
|
585
|
+
monitors readiness, and provides centralized access to all replicas.
|
|
586
|
+
|
|
587
|
+
**Performance Considerations**
|
|
588
|
+
|
|
589
|
+
- Prefix caching is enabled by default for better performance with repeated prompts
|
|
590
|
+
- Tensor parallelism is supported for large models that don't fit on single GPUs
|
|
591
|
+
- Multiple replicas allow concurrent processing of different requests
|
|
592
|
+
- Native vLLM batching is used within each replica for optimal throughput
|
|
593
|
+
|
|
594
|
+
**Error Handling**
|
|
595
|
+
|
|
596
|
+
The service includes timeout support, graceful shutdown procedures, and best-effort
|
|
597
|
+
request cleanup on failures. Ray's fault tolerance mechanisms provide additional
|
|
598
|
+
resilience for long-running inference workloads.
|
|
599
|
+
"""
|
|
600
|
+
|
|
601
|
+
def __init__(
|
|
602
|
+
self,
|
|
603
|
+
engine_args: AsyncEngineArgs,
|
|
604
|
+
num_replicas: int = 1,
|
|
605
|
+
actor_class=None,
|
|
606
|
+
enable_prefix_caching: bool = False,
|
|
607
|
+
):
|
|
608
|
+
if not _has_vllm:
|
|
609
|
+
raise ImportError(
|
|
610
|
+
"vllm is not installed. Please install it with `pip install vllm`."
|
|
611
|
+
)
|
|
612
|
+
# Lazily import ray only when constructing the actor class to avoid global import
|
|
613
|
+
|
|
614
|
+
# Enable prefix caching by default for better performance
|
|
615
|
+
engine_args.enable_prefix_caching = enable_prefix_caching
|
|
616
|
+
|
|
617
|
+
self.engine_args = engine_args
|
|
618
|
+
self.num_replicas = num_replicas
|
|
619
|
+
if actor_class is None:
|
|
620
|
+
ray = _get_ray()
|
|
621
|
+
self.actor_class = ray.remote(num_cpus=0, num_gpus=0)(_AsyncLLMEngine)
|
|
622
|
+
else:
|
|
623
|
+
self.actor_class = actor_class
|
|
624
|
+
self.actors: list = []
|
|
625
|
+
self._launched = False
|
|
626
|
+
self._service_id = uuid.uuid4().hex[
|
|
627
|
+
:8
|
|
628
|
+
] # Unique suffix to avoid name collisions
|
|
629
|
+
self._placement_group = None
|
|
630
|
+
self._load_balancer = None
|
|
631
|
+
|
|
632
|
+
def _launch(self):
|
|
633
|
+
"""Launch all actor replicas."""
|
|
634
|
+
if self._launched:
|
|
635
|
+
torchrl_logger.warning("AsyncVLLMEngineService already launched")
|
|
636
|
+
return
|
|
637
|
+
|
|
638
|
+
# Local imports to avoid global Ray dependency
|
|
639
|
+
ray = _get_ray()
|
|
640
|
+
from ray.util.placement_group import placement_group
|
|
641
|
+
from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy
|
|
642
|
+
|
|
643
|
+
torchrl_logger.info(
|
|
644
|
+
f"Launching {self.num_replicas} async vLLM engine actors..."
|
|
645
|
+
)
|
|
646
|
+
|
|
647
|
+
# Create placement groups - one per replica to avoid conflicts
|
|
648
|
+
self._placement_groups = []
|
|
649
|
+
|
|
650
|
+
# Create actor replicas sequentially to avoid race conditions
|
|
651
|
+
for i in range(self.num_replicas):
|
|
652
|
+
torchrl_logger.info(
|
|
653
|
+
f"Creating async actor replica {i + 1}/{self.num_replicas} ..."
|
|
654
|
+
)
|
|
655
|
+
|
|
656
|
+
# Create individual placement group for this replica
|
|
657
|
+
num_gpus = _gpus_per_replica(self.engine_args)
|
|
658
|
+
bundles = [{"GPU": 1.0, "CPU": 1.0} for _ in range(num_gpus)]
|
|
659
|
+
torchrl_logger.info(
|
|
660
|
+
f"Creating placement group for replica {i + 1} with {len(bundles)} bundles"
|
|
661
|
+
)
|
|
662
|
+
|
|
663
|
+
placement_group_name = f"vllm-replica-{self._service_id}-{i}"
|
|
664
|
+
pg = placement_group(bundles, strategy="PACK", name=placement_group_name)
|
|
665
|
+
self._placement_groups.append(pg)
|
|
666
|
+
torchrl_logger.info(f"Placement group {placement_group_name} created: {pg}")
|
|
667
|
+
|
|
668
|
+
# Wait for placement group to be ready
|
|
669
|
+
ray.get(pg.ready(), timeout=180)
|
|
670
|
+
torchrl_logger.info(f"Placement group {placement_group_name} ready")
|
|
671
|
+
|
|
672
|
+
# Calculate bundle indices for tensor parallelism
|
|
673
|
+
bundle_indices = None
|
|
674
|
+
if num_gpus > 1:
|
|
675
|
+
bundle_indices = list(range(num_gpus))
|
|
676
|
+
bundle_index = 0 # Always use first bundle since each replica has its own placement group
|
|
677
|
+
|
|
678
|
+
scheduling_strategy = PlacementGroupSchedulingStrategy(
|
|
679
|
+
placement_group=pg,
|
|
680
|
+
placement_group_capture_child_tasks=True,
|
|
681
|
+
placement_group_bundle_index=bundle_index,
|
|
682
|
+
)
|
|
683
|
+
|
|
684
|
+
actor = self.actor_class.options(
|
|
685
|
+
name=f"async-vllm-replica-{self._service_id}-{i}",
|
|
686
|
+
namespace="torchrl_vllm",
|
|
687
|
+
scheduling_strategy=scheduling_strategy,
|
|
688
|
+
num_gpus=0,
|
|
689
|
+
num_cpus=0,
|
|
690
|
+
).remote(
|
|
691
|
+
engine_args=self.engine_args,
|
|
692
|
+
bundle_indices=bundle_indices,
|
|
693
|
+
enable_prefix_caching=self.engine_args.enable_prefix_caching,
|
|
694
|
+
)
|
|
695
|
+
self.actors.append(actor)
|
|
696
|
+
|
|
697
|
+
torchrl_logger.info("Waiting for actors to be ready")
|
|
698
|
+
# Wait for this actor to be ready before creating the next one
|
|
699
|
+
ready_futures = [actor.ready.remote() for actor in self.actors]
|
|
700
|
+
try:
|
|
701
|
+
ray.get(
|
|
702
|
+
ready_futures, timeout=TIMEOUT_SECONDS
|
|
703
|
+
) # 5 minute timeout for engine initialization
|
|
704
|
+
torchrl_logger.info("✅ Actors are ready")
|
|
705
|
+
except Exception as e:
|
|
706
|
+
torchrl_logger.error(
|
|
707
|
+
f"❌ Failed to initialize actors within {TIMEOUT_SECONDS} seconds: {e}. You can increase the timeout by setting the TORCHRL_VLLM_TIMEOUT_SECONDS environment variable."
|
|
708
|
+
)
|
|
709
|
+
raise
|
|
710
|
+
|
|
711
|
+
# Store the first placement group for backward compatibility
|
|
712
|
+
self._placement_group = (
|
|
713
|
+
self._placement_groups[0] if self._placement_groups else None
|
|
714
|
+
)
|
|
715
|
+
|
|
716
|
+
self._launched = True
|
|
717
|
+
torchrl_logger.info(
|
|
718
|
+
f"✅ Successfully launched {len(self.actors)} async vLLM engine actors"
|
|
719
|
+
)
|
|
720
|
+
|
|
721
|
+
@classmethod
|
|
722
|
+
def launch(
|
|
723
|
+
cls,
|
|
724
|
+
engine_args: AsyncEngineArgs,
|
|
725
|
+
num_replicas: int = 1,
|
|
726
|
+
) -> AsyncVLLM:
|
|
727
|
+
"""Launch a new AsyncVLLMEngineService.
|
|
728
|
+
|
|
729
|
+
Args:
|
|
730
|
+
engine_args (AsyncEngineArgs): Arguments for creating the AsyncLLMEngine instances.
|
|
731
|
+
num_replicas (int): Number of actor replicas to create.
|
|
732
|
+
|
|
733
|
+
Returns:
|
|
734
|
+
AsyncVLLMEngineService: The launched service.
|
|
735
|
+
"""
|
|
736
|
+
service = cls(engine_args, num_replicas)
|
|
737
|
+
service._launch()
|
|
738
|
+
# create a default load balancer with smart routing
|
|
739
|
+
service.create_load_balancer()
|
|
740
|
+
return service
|
|
741
|
+
|
|
742
|
+
@classmethod
|
|
743
|
+
def from_pretrained(
|
|
744
|
+
cls,
|
|
745
|
+
model_name: str,
|
|
746
|
+
num_devices: int | None = None,
|
|
747
|
+
num_replicas: int = 1,
|
|
748
|
+
verbose: bool = True,
|
|
749
|
+
compile: bool = True,
|
|
750
|
+
enable_fp32_output: bool = False,
|
|
751
|
+
**kwargs,
|
|
752
|
+
) -> AsyncVLLM:
|
|
753
|
+
"""Create an AsyncVLLM instance from a pretrained model.
|
|
754
|
+
|
|
755
|
+
This is a convenience method that combines model loading and service launching
|
|
756
|
+
in a single call, similar to how other ML libraries work.
|
|
757
|
+
|
|
758
|
+
Args:
|
|
759
|
+
model_name (str): The model name to pass to vLLM.
|
|
760
|
+
num_devices (int, optional): Number of devices to use, per replica.
|
|
761
|
+
num_replicas (int): Number of engine replicas to create.
|
|
762
|
+
verbose (bool, optional): Whether to enable verbose logging with throughput statistics. Defaults to True.
|
|
763
|
+
compile (bool, optional): Whether to enable model compilation for better performance. Defaults to True.
|
|
764
|
+
enable_fp32_output (bool, optional): Whether to enable FP32 output for the final layer. Defaults to False.
|
|
765
|
+
**kwargs: Additional arguments passed to AsyncEngineArgs.
|
|
766
|
+
|
|
767
|
+
Returns:
|
|
768
|
+
AsyncVLLM: The launched async vLLM service.
|
|
769
|
+
|
|
770
|
+
Example:
|
|
771
|
+
>>> # Simple usage with defaults
|
|
772
|
+
>>> service = AsyncVLLM.from_pretrained("Qwen/Qwen2.5-3B")
|
|
773
|
+
>>>
|
|
774
|
+
>>> # Multi-GPU tensor parallel with multiple replicas
|
|
775
|
+
>>> service = AsyncVLLM.from_pretrained(
|
|
776
|
+
... "Qwen/Qwen2.5-7B",
|
|
777
|
+
... num_devices=2,
|
|
778
|
+
... num_replicas=2,
|
|
779
|
+
... max_model_len=4096
|
|
780
|
+
... )
|
|
781
|
+
>>>
|
|
782
|
+
>>> # Generate text
|
|
783
|
+
>>> from vllm import SamplingParams
|
|
784
|
+
>>> result = service.generate("Hello, world!", SamplingParams(max_tokens=50))
|
|
785
|
+
>>>
|
|
786
|
+
>>> # Enable FP32 output for better numerical stability
|
|
787
|
+
>>> service = AsyncVLLM.from_pretrained(
|
|
788
|
+
... "Qwen/Qwen2.5-3B",
|
|
789
|
+
... enable_fp32_output=True
|
|
790
|
+
... )
|
|
791
|
+
"""
|
|
792
|
+
return make_async_vllm_engine(
|
|
793
|
+
model_name=model_name,
|
|
794
|
+
num_devices=num_devices,
|
|
795
|
+
num_replicas=num_replicas,
|
|
796
|
+
verbose=verbose,
|
|
797
|
+
compile=compile,
|
|
798
|
+
enable_fp32_output=enable_fp32_output,
|
|
799
|
+
**kwargs,
|
|
800
|
+
)
|
|
801
|
+
|
|
802
|
+
def _is_batch(
|
|
803
|
+
self, prompts: Any, prompt_token_ids: list[int] | list[list[int]] | None = None
|
|
804
|
+
) -> bool:
|
|
805
|
+
"""Check if the input represents a batch of prompts.
|
|
806
|
+
|
|
807
|
+
Args:
|
|
808
|
+
prompts: Input prompts that can be string, TokensPrompt, or list of these
|
|
809
|
+
prompt_token_ids: Alternative token IDs input
|
|
810
|
+
|
|
811
|
+
Returns:
|
|
812
|
+
bool: True if this represents multiple prompts, False for single prompt
|
|
813
|
+
"""
|
|
814
|
+
# If prompts is a list, we need to determine if it's a batch or a single prompt
|
|
815
|
+
if isinstance(prompts, list):
|
|
816
|
+
# Empty list is not a batch
|
|
817
|
+
if len(prompts) == 0:
|
|
818
|
+
return False
|
|
819
|
+
|
|
820
|
+
# If all elements are integers, it's a single prompt represented as token IDs
|
|
821
|
+
# We trust that if one is an int, then all are ints.
|
|
822
|
+
if any(isinstance(item, int) for item in prompts):
|
|
823
|
+
return False
|
|
824
|
+
|
|
825
|
+
# If it contains strings, TokensPrompt objects, or other non-integer types,
|
|
826
|
+
# it's a batch of prompts
|
|
827
|
+
return True
|
|
828
|
+
|
|
829
|
+
# If prompt_token_ids is provided and is a list of lists, it's a batch
|
|
830
|
+
if prompt_token_ids is not None and isinstance(prompt_token_ids, list):
|
|
831
|
+
if len(prompt_token_ids) > 0 and isinstance(prompt_token_ids[0], list):
|
|
832
|
+
return True
|
|
833
|
+
|
|
834
|
+
return False
|
|
835
|
+
|
|
836
|
+
    def _iterate(
        self, prompts: Any, prompt_token_ids: list[int] | list[list[int]] | None = None
    ):
        """Iterate over individual prompts in a batch.

        Args:
            prompts: Input prompts that can be string, TokensPrompt, or list of these
            prompt_token_ids: Alternative token IDs input

        Yields:
            tuple: (individual_prompt, individual_prompt_token_ids) for each item
        """
        if isinstance(prompts, list):
            # Check if this is actually a single prompt represented as token IDs
            if all(isinstance(item, int) for item in prompts):
                # This is a single prompt as token IDs, not a batch
                yield prompts, prompt_token_ids
                return

            # Handle a list of prompts (an actual batch)
            if prompt_token_ids is None:
                for prompt in prompts:
                    yield prompt, None
            elif (
                isinstance(prompt_token_ids, list)
                and len(prompt_token_ids) > 0
                and isinstance(prompt_token_ids[0], list)
            ):
                # Both prompts and prompt_token_ids are lists
                for prompt, token_ids in zip(prompts, prompt_token_ids):
                    yield prompt, token_ids
            else:
                # prompts is a list, but prompt_token_ids is a single list - replicate it
                for prompt in prompts:
                    yield prompt, prompt_token_ids
        else:
            # Single prompt case
            if (
                prompt_token_ids is not None
                and isinstance(prompt_token_ids, list)
                and len(prompt_token_ids) > 0
                and isinstance(prompt_token_ids[0], list)
            ):
                # Single prompt but multiple token_ids - replicate the prompt
                for token_ids in prompt_token_ids:
                    yield prompts, token_ids
            else:
                # Single prompt, single (or no) token_ids
                yield prompts, prompt_token_ids

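    # How _iterate pairs prompts with token IDs (a sketch with made-up values):
    #
    #     list(service._iterate(["a", "b"]))              # [("a", None), ("b", None)]
    #     list(service._iterate(["a", "b"], [[1], [2]]))  # [("a", [1]), ("b", [2])]
    #     list(service._iterate([1, 2, 3]))               # [([1, 2, 3], None)] - single prompt
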
    def _generate_impl(
        self,
        prompt: Any,
        sampling_params: SamplingParams | None = None,
        *,
        prompt_token_ids: list[int] | None = None,
        use_tqdm: bool = True,
        lora_request: Any = None,
        prompt_adapter_request: Any = None,
        guided_options_request: Any = None,
        timeout_seconds: float | None = None,
        actor_index: int | None = None,
    ):
        """Generate text for a single prompt and return a Ray future.

        This is the internal implementation that returns a future instead of the result.
        Used for batched generation to enable parallel execution.

        Args:
            prompt: Single prompt (string, TokensPrompt, etc.)
            sampling_params: SamplingParams object for controlling generation behavior
            prompt_token_ids: Token IDs for a single prompt
            use_tqdm: Whether to show progress bar (not used in async engine)
            lora_request: LoRA request for adapter-based generation
            prompt_adapter_request: Prompt adapter request
            guided_options_request: Guided decoding options
            timeout_seconds: Timeout for generation in seconds
            actor_index: Specific actor to use (random if None)

        Returns:
            Ray ObjectRef: Future that will resolve to RequestOutput
        """
        if actor_index is None:
            if len(self.actors) == 1:
                actor = self.actors[0]
            else:
                if self._load_balancer is None:
                    raise RuntimeError(
                        "LoadBalancer is not created. Create a LoadBalancer using AsyncVLLM.create_load_balancer before calling generate."
                    )
                # Extract single prompt for prefix-aware routing
                single_prompt = self._extract_single_prompt_for_routing(
                    prompt, prompt_token_ids
                )
                actor_index = self._load_balancer.select_actor(prompt=single_prompt)
                actor = self.actors[actor_index]
        else:
            actor = self.actors[actor_index]

        return actor.generate.remote(
            prompt,
            sampling_params,
            prompt_token_ids=prompt_token_ids,
            use_tqdm=use_tqdm,
            lora_request=lora_request,
            prompt_adapter_request=prompt_adapter_request,
            guided_options_request=guided_options_request,
            timeout_seconds=timeout_seconds,
        )

    def generate(
        self,
        prompts: Any = None,
        sampling_params: SamplingParams | None = None,
        *,
        prompt_token_ids: list[int] | list[list[int]] | None = None,
        use_tqdm: bool = True,
        lora_request: Any = None,
        prompt_adapter_request: Any = None,
        guided_options_request: Any = None,
        timeout_seconds: float | None = None,
        actor_index: int | None = None,
    ) -> RequestOutput | list[RequestOutput]:
        """Generate text using one of the actors with the vLLM.LLM.generate interface.

        This method provides the same interface as vLLM.LLM.generate for seamless
        compatibility between sync and async engines. It can be called from multiple
        threads / actors. If `actor_index` is not provided, the load balancer is used
        to select the actor.

        `generate` is a blocking method, so it waits for the generation to complete.

        Args:
            prompts (String, TokensPrompt, or list of these): Input prompts for generation.
            sampling_params (SamplingParams): SamplingParams object for controlling generation behavior.
            prompt_token_ids (list[int] | list[list[int]]): Alternative to prompts - token IDs for generation.
            use_tqdm (bool): Whether to show progress bar (not used in async engine).
            lora_request (Any): LoRA request for adapter-based generation.
            prompt_adapter_request (Any): Prompt adapter request.
            guided_options_request (Any): Guided decoding options.
            timeout_seconds (float | None): Timeout for generation in seconds.
            actor_index (int | None): Specific actor to use (random if None).

        Returns:
            RequestOutput | list[RequestOutput]: Generated outputs from vLLM.
        """
        ray = _get_ray()
        # Check if this is a batch request
        if self._is_batch(prompts, prompt_token_ids):
            # Handle batched input by unbinding and sending individual requests
            futures = []
            for prompt, prompt_token_ids_i in self._iterate(prompts, prompt_token_ids):
                future = self._generate_impl(
                    prompt,
                    sampling_params,
                    prompt_token_ids=prompt_token_ids_i,
                    use_tqdm=use_tqdm,
                    lora_request=lora_request,
                    prompt_adapter_request=prompt_adapter_request,
                    guided_options_request=guided_options_request,
                    timeout_seconds=timeout_seconds,
                    actor_index=actor_index,
                )
                futures.append(future)

            # Collect all results
            results = ray.get(futures)
            return results
        else:
            # Single prompt case - call _generate_impl and get the result directly
            future = self._generate_impl(
                prompts,
                sampling_params,
                prompt_token_ids=prompt_token_ids,
                use_tqdm=use_tqdm,
                lora_request=lora_request,
                prompt_adapter_request=prompt_adapter_request,
                guided_options_request=guided_options_request,
                timeout_seconds=timeout_seconds,
                actor_index=actor_index,
            )
            result = ray.get(future)
            return result

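    # A minimal end-to-end sketch of batched generation with generate(); assumes Ray and a
    # CUDA device are available, and the model name is only a placeholder:
    #
    #     from vllm import SamplingParams
    #
    #     service = AsyncVLLM.from_pretrained("Qwen/Qwen2.5-3B", num_replicas=2)
    #     service.create_load_balancer()  # needed to route when num_replicas > 1
    #     params = SamplingParams(max_tokens=32)
    #     outputs = service.generate(["Hello!", "How are you?"], params)  # list[RequestOutput]
    #     service.shutdown()
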
    def get_random_actor_index(self) -> int:
        """Get a random actor index."""
        return random.randint(0, len(self.actors) - 1)

    def _init_weight_update_group_internal(self, master_address: str, master_port: str):
        """Initialize NCCL weight update group across all actors.

        Args:
            master_address (str): Master address for distributed training.
            master_port (str): Master port for distributed training.

        Returns:
            list: Ray futures for initialization calls.
        """
        gpus_per_replica = _gpus_per_replica(self.engine_args)
        weight_sync_world_size = self.num_replicas * gpus_per_replica + 1
        torchrl_logger.info(
            f"AsyncVLLMEngineService requests weight update group for {self.num_replicas} actors "
            f"with {gpus_per_replica} GPUs per replica and {weight_sync_world_size} world size"
        )

        from vllm import envs

        refs = []
        for i, actor in enumerate(self.actors):
            rank_offset = 1 + i * gpus_per_replica
            if envs and envs.VLLM_USE_V1:
                actor_collective_rpc = actor.collective_rpc_v1
            else:
                actor_collective_rpc = actor.collective_rpc_v0

            refs.append(
                actor_collective_rpc.remote(
                    "init_weight_update_group",
                    args=(
                        master_address,
                        master_port,
                        rank_offset,
                        weight_sync_world_size,
                    ),
                )
            )
            torchrl_logger.info(
                f"AsyncVLLMEngineService args: {master_address=}, {master_port=}, "
                f"{rank_offset=}, {weight_sync_world_size=}"
            )
            torchrl_logger.info(
                f"AsyncVLLMEngineService requests weight update group for actor {i} "
                f"with rank_offset {rank_offset}"
            )
        return refs

    def collective_rpc(
        self,
        method: str,
        timeout: float | None = None,
        args: tuple = (),
        kwargs: dict | None = None,
    ) -> list[Any]:
        """Forward an RPC to all actors.

        Args:
            method (str): Method name to call.
            timeout (float | None): Timeout for the RPC call.
            args (tuple): Arguments to pass to the method.
            kwargs (dict | None): Keyword arguments to pass to the method.

        Returns:
            list[Any]: Ray futures for all RPC calls.
        """
        from vllm import envs

        futures = []
        for actor in self.actors:
            if envs and envs.VLLM_USE_V1:
                actor_collective_rpc = actor.collective_rpc_v1
            else:
                actor_collective_rpc = actor.collective_rpc_v0
            futures.append(actor_collective_rpc.remote(method, timeout, args, kwargs))
        return futures

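    # collective_rpc returns one Ray future per replica; a sketch of fanning out a worker
    # RPC and gathering the results ("some_worker_method" is a hypothetical method name on
    # the worker extension, shown only for illustration):
    #
    #     ray = _get_ray()
    #     futures = service.collective_rpc("some_worker_method", args=(1, 2))
    #     per_replica_results = ray.get(futures)
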
    def shutdown(self):
        """Shutdown all actors and clean up resources."""
        torchrl_logger.info(
            f"Shutting down {len(self.actors)} async vLLM engine actors..."
        )

        ray = _get_ray()
        from ray.util.placement_group import remove_placement_group

        # Kill all actors
        for i, actor in enumerate(self.actors):
            try:
                ray.kill(actor)
                torchrl_logger.info(f"Shutdown async actor {i + 1}/{len(self.actors)}")
            except Exception as e:
                torchrl_logger.warning(f"Error shutting down async actor {i + 1}: {e}")

        # Clear the actors list
        self.actors.clear()

        # Remove placement groups if any
        if hasattr(self, "_placement_groups") and self._placement_groups:
            for i, pg in enumerate(self._placement_groups):
                try:
                    remove_placement_group(pg)
                    torchrl_logger.info(
                        f"Removed placement group {i + 1}/{len(self._placement_groups)}"
                    )
                except Exception as e:
                    torchrl_logger.warning(
                        f"Error removing placement group {i + 1}: {e}"
                    )
            self._placement_groups = []

        # Remove legacy single placement group if any
        if self._placement_group is not None:
            remove_placement_group(self._placement_group)
            self._placement_group = None
        self._launched = False
        torchrl_logger.info("AsyncVLLMEngineService shutdown complete")

    # RLvLLMEngine interface implementation
    def get_tp_size(self) -> int:
        """Get the tensor parallel size."""
        return self.engine_args.tensor_parallel_size

    def get_model_metadata(self) -> dict[str, tuple[torch.dtype, torch.Size]]:
        """Get model parameter metadata.

        Note: This requires the model to be loaded. For now, we return an empty dict
        and expect the metadata to be provided externally during weight updates.
        """
        # TODO: Implement metadata extraction from loaded model
        # This would require accessing the model from one of the actors
        torchrl_logger.warning(
            "AsyncVLLM.get_model_metadata() not yet implemented - returning empty dict"
        )
        return {}

    def get_master_address(self) -> str:
        """Get the master address for weight synchronization."""
        return "localhost"  # Default for now

    def get_master_port(self) -> int:
        """Get the master port for weight synchronization."""
        # Cache the port like V1 does to ensure consistency
        if not hasattr(self, "_cached_master_port"):
            if _has_vllm:
                try:
                    from vllm.utils import get_open_port

                    self._cached_master_port = get_open_port()
                except ImportError:
                    self._cached_master_port = 29500  # Default port if import fails
            else:
                self._cached_master_port = 29500  # Default port
        return self._cached_master_port

    def init_weight_update_group(
        self,
        master_address: str,
        master_port: int | str,
    ) -> list[Any]:
        """Forward the request to init NCCL weight update group to all actors.

        This method initializes the weight update group for all vLLM workers.
        The external trainer should be rank 0, and vLLM workers will be ranks 1+.

        Args:
            master_address: Master address for NCCL communication.
            master_port: Master port for NCCL communication.

        Returns:
            List of Ray futures for the initialization calls.

        Note:
            The caller must wait on the returned futures (ray.get(refs)) to ensure
            all workers have completed initialization before sending weights.
        """
        if not self._launched:
            raise RuntimeError(
                "AsyncVLLM service must be launched before initializing weight update group"
            )

        gpus_per_replica = _gpus_per_replica(self.engine_args)
        weight_sync_world_size = self.num_replicas * gpus_per_replica + 1

        torchrl_logger.info(
            f"Initializing weight update group for {self.num_replicas} replicas "
            f"with {gpus_per_replica} GPUs each (world_size={weight_sync_world_size})"
        )

        from vllm import envs

        refs = []
        for i, actor in enumerate(self.actors):
            rank_offset = 1 + i * gpus_per_replica
            if envs and envs.VLLM_USE_V1:
                actor_collective_rpc = actor.collective_rpc_v1
            else:
                actor_collective_rpc = actor.collective_rpc_v0
            refs.append(
                actor_collective_rpc.remote(
                    "init_weight_update_group",
                    args=(
                        master_address,
                        str(master_port),
                        rank_offset,
                        weight_sync_world_size,
                    ),
                )
            )
            torchrl_logger.info(
                f"Requested init for actor {i} with rank_offset {rank_offset}"
            )

        return refs

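    # Trainer-side sketch of the weight-sync handshake described above (the trainer process
    # is rank 0 of the NCCL group; assumes the service is already launched):
    #
    #     ray = _get_ray()
    #     addr, port = service.get_master_address(), service.get_master_port()
    #     refs = service.init_weight_update_group(addr, port)
    #     # ... the trainer joins the same NCCL group as rank 0 in parallel ...
    #     ray.get(refs)  # wait for every worker before broadcasting any weights
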
    def update_weights(self, weights: Iterator[tuple[str, torch.Tensor]]) -> None:
        """Update model weights across all replicas using NCCL broadcast.

        Args:
            weights: Iterator yielding (parameter_name, tensor) tuples
        """
        if not self._launched:
            raise RuntimeError(
                "AsyncVLLM service must be launched before updating weights"
            )

        # Convert iterator to dict for easier handling
        weights_dict = dict(weights)

        if not weights_dict:
            torchrl_logger.warning("No weights provided for update")
            return

        torchrl_logger.info(
            f"Updating {len(weights_dict)} parameters across {len(self.actors)} replicas using NCCL broadcast"
        )

        self._update_weights_with_nccl_broadcast_simple(weights_dict)

        torchrl_logger.info("AsyncVLLM NCCL weight update completed")

    def _update_weights_with_nccl_broadcast_simple(
        self, weights_dict: dict[str, torch.Tensor]
    ) -> None:
        """Update weights using simple NCCL broadcast like V1.

        This approach follows the V1 pattern:
        1. Training process (master) broadcasts as rank 0
        2. All vLLM workers receive as ranks 1, 2, 3...
        3. Simple and reliable like the working V1 implementation

        Args:
            weights_dict: Dictionary of parameter names to weight tensors
        """
        if not hasattr(self, "_nccl_master_group") or self._nccl_master_group is None:
            raise RuntimeError(
                "NCCL master group not initialized. This is a bug in the setup process."
            )

        t0 = time.time()

        # Move all weights to cuda:0 (matching NCCL communicator device)
        gpu_weights = {}
        for name, weight in weights_dict.items():
            # Ensure weight is on cuda:0 (matching NCCL communicator)
            if weight.device != torch.device("cuda:0"):
                gpu_weights[name] = weight.to("cuda:0", non_blocking=True)
            else:
                gpu_weights[name] = weight

        # Use periodic-mono pattern: individual weight updates with immediate RPC->NCCL
        torchrl_logger.info(
            f"Updating {len(gpu_weights)} weights using periodic-mono pattern..."
        )

        updated_weights = 0
        ray = _get_ray()
        with torch.cuda.device(0):  # Ensure we're on the correct CUDA device
            for name, weight in gpu_weights.items():
                # Convert dtype to string name (like periodic-mono)
                dtype_name = str(weight.dtype).split(".")[
                    -1
                ]  # "torch.bfloat16" -> "bfloat16"

                # Step 1: Send RPC to workers for this weight
                futures = self.collective_rpc(
                    "update_weight", args=(name, dtype_name, tuple(weight.shape))
                )

                # Step 2: Immediately broadcast this weight (like periodic-mono)
                self._nccl_master_group.broadcast(
                    weight, src=0, stream=torch.cuda.current_stream()
                )

                # Step 3: Wait for workers to complete this weight
                ray.get(futures)
                updated_weights += 1

        torch.cuda.synchronize()
        t2 = time.time()
        torchrl_logger.info(
            f"Successfully updated {updated_weights}/{len(gpu_weights)} weights in {t2 - t0:.3f}s"
        )

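    # A minimal sketch of pushing trainer weights to every replica (assumes the NCCL master
    # group has been set up, e.g. via _setup_nccl_master_group below, and that `model` is
    # the torch module being trained):
    #
    #     service.update_weights(
    #         (name, param.data) for name, param in model.named_parameters()
    #     )
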
    def _setup_nccl_master_group(self) -> None:
        """Set up NCCL communication group for the master node (rank 0)."""
        # Calculate world size (should match what workers use)
        gpus_per_replica = _gpus_per_replica(self.engine_args)
        weight_sync_world_size = self.num_replicas * gpus_per_replica + 1

        master_address = self.get_master_address()
        master_port = self.get_master_port()

        torchrl_logger.info(
            f"Setting up NCCL master group: rank=0, world_size={weight_sync_world_size}, "
            f"address={master_address}:{master_port}"
        )

        # Ensure CUDA is available and initialized
        if not torch.cuda.is_available():
            raise RuntimeError("CUDA not available for NCCL communication")

        # Set CUDA device before initializing NCCL
        torch.cuda.set_device(0)

        # Initialize master as rank 0 in the NCCL group (use synchronous version)
        self._nccl_master_group = stateless_init_process_group(
            master_address=master_address,
            master_port=str(master_port),
            rank=0,  # Master is always rank 0
            world_size=weight_sync_world_size,
            device=torch.device("cuda:0"),
        )

        torchrl_logger.info("NCCL master group initialized successfully")

    def get_num_unfinished_requests(
        self, actor_index: int | None = None
    ) -> int | list[int]:
        """Get the number of unfinished requests for one or all actors.

        Args:
            actor_index (int | None): Index of specific actor, or None for all actors.

        Returns:
            int | list[int]: Number of unfinished requests for the specified actor,
                or list of counts for all actors if actor_index is None.
        """
        if not self._launched:
            raise RuntimeError(
                "AsyncVLLM service must be launched before getting request counts"
            )

        ray = _get_ray()
        if actor_index is not None:
            if not (0 <= actor_index < len(self.actors)):
                raise IndexError(
                    f"Actor index {actor_index} out of range [0, {len(self.actors)})"
                )

            actor = self.actors[actor_index]
            return ray.get(actor.get_num_unfinished_requests.remote())
        else:
            # Get counts from all actors
            futures = [
                actor.get_num_unfinished_requests.remote() for actor in self.actors
            ]
            return ray.get(futures)

    def get_cache_usage(self, actor_index: int | None = None) -> float | list[float]:
        """Get the KV cache usage for one or all actors.

        Args:
            actor_index (int | None): Index of specific actor, or None for all actors.

        Returns:
            float | list[float]: Cache usage fraction for the specified actor,
                or list of usage fractions for all actors if actor_index is None.
        """
        if not self._launched:
            raise RuntimeError(
                "AsyncVLLM service must be launched before getting cache usage"
            )

        ray = _get_ray()
        if actor_index is not None:
            if not (0 <= actor_index < len(self.actors)):
                raise IndexError(
                    f"Actor index {actor_index} out of range [0, {len(self.actors)})"
                )

            actor = self.actors[actor_index]
            return ray.get(actor.get_cache_usage.remote())
        else:
            # Get usage from all actors
            futures = [actor.get_cache_usage.remote() for actor in self.actors]
            return ray.get(futures)

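    # Lightweight monitoring sketch built on the two metric helpers above (the values shown
    # in the comments are illustrative):
    #
    #     pending = service.get_num_unfinished_requests()  # e.g. [3, 0, 1]
    #     usage = service.get_cache_usage()                # e.g. [0.42, 0.05, 0.17]
    #     for i, (p, u) in enumerate(zip(pending, usage)):
    #         torchrl_logger.info(f"replica {i}: {p} pending requests, {u:.0%} KV cache used")
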
    def create_load_balancer(
        self,
        strategy: Literal["requests", "kv-cache"]
        | Sequence[Literal["prefix-aware", "requests", "kv-cache", "round-robin"]]
        | None = None,
        **kwargs,
    ) -> LoadBalancer:
        """Create a load balancer for this AsyncVLLM service.

        Args:
            strategy: Load balancing strategy or sequence of strategies in fallback order.
                Default: ["prefix-aware", "requests"] - tries cache-aware routing first,
                then load balancing. Single strategies: "requests", "kv-cache".
                Strategy sequences: ["prefix-aware", "requests", "round-robin"].
            **kwargs: Additional arguments passed to LoadBalancer constructor.

        Returns:
            LoadBalancer: Configured load balancer instance. This is stored in the AsyncVLLM instance.

        Examples:
            >>> service = AsyncVLLM.from_pretrained("Qwen/Qwen2.5-3B", num_replicas=3)

            >>> # Use smart defaults (prefix-aware -> requests)
            >>> lb = service.create_load_balancer()
            >>> selected_actor_index = lb.select_actor(prompt="Hello world")

            >>> # Simple single strategy
            >>> lb = service.create_load_balancer("requests")
            >>> selected_actor_index = lb.select_actor()

            >>> # Custom strategy hierarchy
            >>> lb = service.create_load_balancer(
            ...     ["prefix-aware", "kv-cache", "round-robin"],
            ...     prefix_length=16,
            ...     overload_threshold=2.0
            ... )
            >>> selected_actor_index = lb.select_actor(prompt="Hello world")
        """
        if not self._launched:
            raise RuntimeError(
                "AsyncVLLM service must be launched before creating load balancer"
            )

        load_balancer = LoadBalancer(self, strategy, **kwargs)
        self._load_balancer = load_balancer
        return load_balancer

    def _extract_single_prompt_for_routing(
        self,
        prompts: Any = None,
        prompt_token_ids: list[int] | list[list[int]] | None = None,
    ) -> str | list[int] | None:
        """Extract a single prompt for load balancer routing, if possible.

        Args:
            prompts: The prompts argument passed to generate().
            prompt_token_ids: The prompt_token_ids argument passed to generate().

        Returns:
            str | list[int] | None: Single prompt for routing, or None if multiple prompts.
        """
        try:
            # Handle prompt_token_ids first (takes precedence over prompts)
            if prompt_token_ids is not None:
                if isinstance(prompt_token_ids, list):
                    if len(prompt_token_ids) == 0:
                        return None  # Empty list
                    elif len(prompt_token_ids) == 1:
                        # Single prompt case - could be tokens directly or a nested list
                        if isinstance(prompt_token_ids[0], int):
                            # Single token sequence: [token1, token2, ...]
                            return prompt_token_ids
                        elif isinstance(prompt_token_ids[0], list):
                            # Nested list with a single prompt: [[token1, token2, ...]]
                            return prompt_token_ids[0]
                        else:
                            return None
                    else:
                        # Multiple prompts: [[tokens1...], [tokens2...], ...]
                        return None
                else:
                    # Not a list, invalid format
                    return None

            # Handle the prompts argument
            if prompts is None:
                return None

            # Single string prompt
            if isinstance(prompts, str):
                return prompts

            # TokensPrompt object
            elif hasattr(prompts, "prompt_token_ids"):  # TokensPrompt-like object
                return prompts.prompt_token_ids

            # TextPrompt object
            elif hasattr(prompts, "prompt"):  # TextPrompt-like object
                return prompts.prompt

            # List of prompts
            elif isinstance(prompts, (list, tuple)):
                if len(prompts) == 0:
                    return None  # Empty list
                elif len(prompts) == 1:
                    # Single prompt in a list - recursively extract
                    return self._extract_single_prompt_for_routing(prompts[0], None)
                else:
                    # Multiple prompts - cannot do prefix routing
                    return None

            # Other types (shouldn't happen in normal usage)
            else:
                torchrl_logger.debug(
                    f"Unknown prompt type for routing: {type(prompts)}"
                )
                return None

        except Exception as e:
            torchrl_logger.debug(f"Error extracting single prompt for routing: {e}")
            return None


class LoadBalancer:
    """Load balancer for distributing requests across AsyncVLLM actors with strategy hierarchy.

    This class implements sophisticated load balancing with multiple strategies and intelligent
    fallback mechanisms. Strategies are tried in order until one succeeds, providing robust
    request routing even when some strategies fail.

    Args:
        actors: Either a single AsyncVLLM instance or a list of Ray actors.
        strategy: Single strategy or sequence of strategies in fallback order.
            Available strategies:

            - "prefix-aware": Route based on prompt prefix for cache locality
            - "requests": Select actor with fewest pending requests
            - "kv-cache": Select actor with lowest KV cache utilization
            - "round-robin": Simple round-robin distribution

            Default: ["prefix-aware", "requests"]

        prefix_length: Number of tokens/words to use for prefix routing (default: 8).
        overload_threshold: Multiplier for average load to consider actor overloaded (default: 1.5).

    Examples:
        >>> service = AsyncVLLM.from_pretrained("Qwen/Qwen2.5-3B", num_replicas=3)

        >>> # Simple strategy
        >>> lb = LoadBalancer(service, "requests")
        >>> actor_idx = lb.select_actor()

        >>> # Strategy hierarchy: try prefix-aware first, fall back to requests, then round-robin
        >>> lb = LoadBalancer(service, ["prefix-aware", "requests", "round-robin"])
        >>> actor_idx = lb.select_actor(prompt="Hello world")  # Uses prefix routing
        >>> actor_idx = lb.select_actor()  # Falls back to requests (no prompt)

        >>> # Custom configuration
        >>> lb = LoadBalancer(
        ...     service,
        ...     ["prefix-aware", "kv-cache"],
        ...     prefix_length=16,
        ...     overload_threshold=2.0
        ... )
    """

    def __init__(
        self,
        actors: list[Any] | AsyncVLLM,
        strategy: Literal["requests", "kv-cache"]
        | Sequence[Literal["prefix-aware", "requests", "kv-cache", "round-robin"]]
        | None = None,
        prefix_length: int = 8,
        overload_threshold: float = 1.5,
    ):
        if strategy is None:
            strategy = ["prefix-aware", "requests"]
        # Handle both AsyncVLLM instances and direct actor lists
        if hasattr(actors, "actors"):  # AsyncVLLM instance
            self.actors = actors.actors
            self.async_vllm = actors
        elif isinstance(actors, list):  # Direct list of actors
            self.actors = actors
            self.async_vllm = None
        else:
            raise ValueError(
                "actors must be either an AsyncVLLM instance or a list of actors"
            )

        if not self.actors:
            raise ValueError("No actors provided")

        # Handle both single strategy and strategy hierarchy
        if isinstance(strategy, str):
            self.strategies = [strategy]
        else:
            self.strategies = list(strategy)

        # Validate strategies
        valid_strategies = {"prefix-aware", "requests", "kv-cache", "round-robin"}
        for s in self.strategies:
            if s not in valid_strategies:
                raise ValueError(
                    f"Invalid strategy '{s}'. Must be one of {valid_strategies}"
                )

        if not self.strategies:
            raise ValueError("At least one strategy must be provided")

        self.strategy = self.strategies[
            0
        ]  # Primary strategy for backward compatibility
        self.prefix_length = prefix_length
        self.overload_threshold = overload_threshold
        self._round_robin_index = 0  # For round-robin fallback

    def select_actor(
        self,
        prompt: str | list[int] | None = None,
        request_context: dict[str, Any] | None = None,
    ) -> int:
        """Select the optimal actor index based on the configured strategy hierarchy.

        Args:
            prompt: The input prompt (string or token list) for prefix-aware routing.
            request_context: Additional context for routing decisions.

        Returns:
            int: Index of the selected actor in the actors list.

        Raises:
            RuntimeError: If unable to gather metrics from actors.
            ValueError: If no actors are available.
        """
        if not self.actors:
            raise ValueError("No actors available for selection")

        # Try each strategy in order until one succeeds
        for i, strategy in enumerate(self.strategies):
            try:
                torchrl_logger.debug(
                    f"Trying strategy {i + 1}/{len(self.strategies)}: {strategy}"
                )

                if strategy == "prefix-aware":
                    if prompt is not None:
                        return self._select_by_prefix_aware(prompt)
                    else:
                        torchrl_logger.debug(
                            "No prompt provided for prefix-aware routing, trying next strategy"
                        )
                        continue

                elif strategy == "requests":
                    return self._select_by_requests()

                elif strategy == "kv-cache":
                    return self._select_by_cache_usage()

                elif strategy == "round-robin":
                    return self._select_round_robin()

                else:
                    torchrl_logger.warning(
                        f"Unknown strategy: {strategy}, trying next strategy"
                    )
                    continue

            except Exception as e:
                torchrl_logger.warning(
                    f"Strategy '{strategy}' failed with error: {e}. "
                    f"Trying next strategy..."
                )
                continue

        # All strategies failed, final fallback to random
        torchrl_logger.warning(
            f"All strategies {self.strategies} failed. Falling back to random selection."
        )
        return random.randint(0, len(self.actors) - 1)

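    # Routing sketch: select a replica explicitly and pin the request to it via
    # generate(actor_index=...); assumes `lb = service.create_load_balancer()` was called:
    #
    #     idx = lb.select_actor(prompt="Hello world")
    #     out = service.generate("Hello world", sampling_params, actor_index=idx)
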
    def _select_by_requests(self) -> int:
        """Select actor with fewest pending requests."""
        if self.async_vllm is not None:
            # Use AsyncVLLM's built-in method to get request counts
            request_counts = self.async_vllm.get_num_unfinished_requests()
        else:
            # Query actors directly
            futures = [
                actor.get_num_unfinished_requests.remote() for actor in self.actors
            ]
            ray = _get_ray()
            request_counts = ray.get(futures)

        # Find the actor with minimum pending requests
        min_requests = min(request_counts)
        min_indices = [
            i for i, count in enumerate(request_counts) if count == min_requests
        ]

        # If multiple actors have the same minimum count, choose randomly among them
        selected_index = random.choice(min_indices)

        torchrl_logger.debug(
            f"LoadBalancer (requests): Selected actor {selected_index} "
            f"with {min_requests} pending requests. "
            f"Request counts: {request_counts}"
        )

        return selected_index

    def _select_by_cache_usage(self) -> int:
        """Select actor with lowest KV cache utilization."""
        if self.async_vllm is not None:
            # Use AsyncVLLM's built-in method to get cache usage
            cache_usages = self.async_vllm.get_cache_usage()
        else:
            # Query actors directly
            futures = [actor.get_cache_usage.remote() for actor in self.actors]
            ray = _get_ray()
            cache_usages = ray.get(futures)

        # Find the actor with minimum cache usage
        min_usage = min(cache_usages)
        min_indices = [
            i for i, usage in enumerate(cache_usages) if abs(usage - min_usage) < 1e-6
        ]

        # If multiple actors have similar cache usage, choose randomly among them
        selected_index = random.choice(min_indices)

        torchrl_logger.debug(
            f"LoadBalancer (kv-cache): Selected actor {selected_index} "
            f"with {min_usage:.3f} cache usage. "
            f"Cache usages: {[f'{u:.3f}' for u in cache_usages]}"
        )

        return selected_index

    def _select_by_prefix_aware(self, prompt: str | list[int]) -> int:
        """Select actor based on prompt prefix for cache locality.

        Args:
            prompt: Input prompt as string or token list.

        Returns:
            int: Selected actor index.

        Raises:
            ValueError: If prefix cannot be extracted.
        """
        try:
            # Extract prefix tokens
            prefix_tokens = self._extract_prefix_tokens(prompt)
            if not prefix_tokens:
                raise ValueError("Could not extract meaningful prefix tokens")

            # Create consistent hash from prefix
            prefix_hash = hash(tuple(prefix_tokens))
            preferred_actor = prefix_hash % len(self.actors)

            # Check if preferred actor is overloaded
            if self._is_actor_overloaded(preferred_actor):
                torchrl_logger.debug(
                    f"Preferred actor {preferred_actor} is overloaded "
                    f"(threshold: {self.overload_threshold}), falling back to load-based selection"
                )
                # Fall back to requests-based selection
                return self._select_by_requests()

            torchrl_logger.debug(
                f"LoadBalancer (prefix-aware): Selected actor {preferred_actor} "
                f"for prefix hash {prefix_hash} (tokens: {prefix_tokens[:4]}...)"
            )

            return preferred_actor

        except Exception as e:
            torchrl_logger.warning(f"Prefix-aware routing failed: {e}")
            raise

    def _select_round_robin(self) -> int:
        """Select actor using round-robin strategy."""
        selected = self._round_robin_index % len(self.actors)
        self._round_robin_index = (self._round_robin_index + 1) % len(self.actors)

        torchrl_logger.debug(f"LoadBalancer (round-robin): Selected actor {selected}")
        return selected

    def _extract_prefix_tokens(self, prompt: str | list[int]) -> list[int]:
        """Extract prefix tokens from prompt (string or token list).

        Args:
            prompt: Input prompt.

        Returns:
            list[int]: Prefix tokens (up to self.prefix_length).

        Raises:
            ValueError: If tokenization fails or prompt is invalid.
        """
        if isinstance(prompt, list):
            # Already tokenized
            if not prompt:
                raise ValueError("Empty token list provided")
            return prompt[: self.prefix_length]

        elif isinstance(prompt, str):
            # Need to tokenize - this requires access to tokenizer
            if not prompt.strip():
                raise ValueError("Empty or whitespace-only string provided")

            # Try to get tokenizer from AsyncVLLM instance
            if self.async_vllm is not None:
                try:
                    # This is a simplistic approach - in practice you'd want to cache the tokenizer
                    # For now, use a simple heuristic based on string content
                    return self._simple_string_hash(prompt)
                except Exception as e:
                    torchrl_logger.warning(f"Could not tokenize string: {e}")
                    return self._simple_string_hash(prompt)
            else:
                # Fall back to simple string hashing
                return self._simple_string_hash(prompt)
        else:
            raise ValueError(f"Unsupported prompt type: {type(prompt)}")

    def _simple_string_hash(self, text: str) -> list[int]:
        """Create pseudo-tokens from string for prefix routing.

        This is a fallback when proper tokenization isn't available.
        """
        # Use words as pseudo-tokens, limited to prefix_length
        words = text.strip().split()[: self.prefix_length]
        if not words:
            raise ValueError("No words found in text")

        # Convert words to integers using hash
        pseudo_tokens = [
            abs(hash(word)) % 50000 for word in words
        ]  # Simulate vocab size
        return pseudo_tokens

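    # How the prefix-aware path maps a prompt to a replica when only the word-hash fallback
    # above is available (this mirrors the modulo step in _select_by_prefix_aware):
    #
    #     tokens = lb._simple_string_hash("What is the capital of France?")
    #     replica = hash(tuple(tokens)) % len(lb.actors)
    #
    # Identical prefixes land on the same replica, which keeps vLLM's prefix cache warm.
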
    def _is_actor_overloaded(self, actor_index: int) -> bool:
        """Check if an actor is overloaded compared to average load.

        Args:
            actor_index: Index of actor to check.

        Returns:
            bool: True if actor is overloaded.
        """
        try:
            if self.async_vllm is not None:
                request_counts = self.async_vllm.get_num_unfinished_requests()
            else:
                futures = [
                    actor.get_num_unfinished_requests.remote() for actor in self.actors
                ]
                ray = _get_ray()
                request_counts = ray.get(futures)

            if not request_counts:
                return False

            avg_requests = sum(request_counts) / len(request_counts)
            actor_requests = request_counts[actor_index]

            is_overloaded = actor_requests > avg_requests * self.overload_threshold

            torchrl_logger.debug(
                f"Actor {actor_index}: {actor_requests} requests, "
                f"avg: {avg_requests:.1f}, threshold: {avg_requests * self.overload_threshold:.1f}, "
                f"overloaded: {is_overloaded}"
            )

            return is_overloaded

        except Exception as e:
            torchrl_logger.warning(f"Could not check actor load: {e}")
            return False  # Assume not overloaded if we can't check

    def get_stats(self) -> dict[str, Any]:
        """Get current load balancing statistics for all actors.

        Returns:
            dict: Statistics including request counts and cache usage for all actors.
        """
        stats = {
            "strategies": self.strategies,
            "primary_strategy": self.strategy,  # For backward compatibility
            "num_actors": len(self.actors),
            "prefix_length": self.prefix_length,
            "overload_threshold": self.overload_threshold,
            "round_robin_index": self._round_robin_index,
            "actor_stats": [],
        }

        try:
            if self.async_vllm is not None:
                request_counts = self.async_vllm.get_num_unfinished_requests()
                cache_usages = self.async_vllm.get_cache_usage()
            else:
                request_futures = [
                    actor.get_num_unfinished_requests.remote() for actor in self.actors
                ]
                cache_futures = [
                    actor.get_cache_usage.remote() for actor in self.actors
                ]
                ray = _get_ray()
                request_counts = ray.get(request_futures)
                cache_usages = ray.get(cache_futures)

            for i, (requests, cache_usage) in enumerate(
                zip(request_counts, cache_usages)
            ):
                stats["actor_stats"].append(
                    {
                        "actor_index": i,
                        "pending_requests": requests,
                        "cache_usage": cache_usage,
                    }
                )

        except Exception as e:
            torchrl_logger.warning(f"Error gathering load balancer stats: {e}")
            stats["error"] = str(e)

        return stats

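    # get_stats() snapshot sketch (keys as defined above; the values are illustrative):
    #
    #     stats = lb.get_stats()
    #     # {"strategies": ["prefix-aware", "requests"], "num_actors": 3, ...,
    #     #  "actor_stats": [{"actor_index": 0, "pending_requests": 2, "cache_usage": 0.4}, ...]}
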
def make_async_vllm_engine(
    *,
    model_name: str,
    num_devices: int | None = None,
    num_replicas: int = 1,
    verbose: bool = True,
    compile: bool = True,
    enable_fp32_output: bool = False,
    tensor_parallel_size: int | None = None,
    data_parallel_size: int | None = None,
    pipeline_parallel_size: int | None = None,
    **kwargs,
) -> AsyncVLLM:
    """Create an async vLLM engine service.

    Keyword Args:
        model_name (str): The model name to pass to vLLM.
        num_devices (int, optional): Number of devices to use, per replica.
        num_replicas (int): Number of engine replicas to create.
        verbose (bool, optional): Whether to enable verbose logging with throughput statistics. Defaults to True.
        compile (bool, optional): Whether to enable model compilation for better performance. Defaults to True.
        enable_fp32_output (bool, optional): Whether to enable FP32 output for the final layer. Defaults to False.
            This can help with numerical stability for certain models. Requires model-specific support in
            torchrl.modules.llm.backends._models.
        tensor_parallel_size (int, optional): Number of devices to use, per replica. Defaults to None.
        data_parallel_size (int, optional): Number of data parallel groups to use. Defaults to None.
        pipeline_parallel_size (int, optional): Number of pipeline parallel groups to use. Defaults to None.
        **kwargs: Additional arguments passed to AsyncEngineArgs.

    Returns:
        AsyncVLLM: The launched engine service.

    Raises:
        RuntimeError: If no CUDA devices are available.
        ValueError: If invalid device configuration is provided.

    Example:
        >>> # Create a single-GPU async engine (all arguments are keyword-only)
        >>> service = make_async_vllm_engine(model_name="Qwen/Qwen2.5-3B")
        >>>
        >>> # Create a 2-GPU tensor parallel async engine with 2 replicas
        >>> service = make_async_vllm_engine(model_name="Qwen/Qwen2.5-3B", num_devices=2, num_replicas=2)
        >>> # Generate text
        >>> result = service.generate("Hello, world!", sampling_params)
        >>>
        >>> # Create with FP32 output enabled
        >>> service = make_async_vllm_engine(model_name="Qwen/Qwen2.5-3B", enable_fp32_output=True)
    """
    if not _has_vllm:
        raise ImportError(
            "vllm is not installed. Please install it with `pip install vllm`."
        )

    from vllm import AsyncEngineArgs

    # Set the FP32 output environment variable if requested
    if enable_fp32_output:
        os.environ["VLLM_ENABLE_FP32_OUTPUT"] = "1"
        torchrl_logger.info(
            "Enabled FP32 output for vLLM (VLLM_ENABLE_FP32_OUTPUT=1). "
            "This will use FP32 for the final output layer if the model supports it."
        )

    # Configure verbose logging if requested
    if verbose:
        import logging

        # Enable vLLM's throughput logging by setting the appropriate log level
        logging.getLogger("vllm.engine.metrics").setLevel(logging.INFO)
        logging.getLogger("vllm").setLevel(logging.INFO)

        # vLLM logs throughput stats at INFO level every few seconds.
        # The stats include: prompt throughput, generation throughput, running/pending requests, GPU KV cache usage.
        torchrl_logger.info(
            "Enabled verbose vLLM logging - throughput statistics will be displayed"
        )

    # Set tensor_parallel_size to num_devices if not set
    if tensor_parallel_size is None:
        if num_devices is None:
            tensor_parallel_size = 1
        else:
            tensor_parallel_size = num_devices
    elif num_devices is not None and tensor_parallel_size != num_devices:
        raise ValueError(f"tensor_parallel_size must be set to {num_devices}")

    if data_parallel_size is None:
        data_parallel_size = 1

    if pipeline_parallel_size is None:
        pipeline_parallel_size = 1

    # Create engine args
    kwargs.setdefault("distributed_executor_backend", "ray")
    # Enable prefix caching by default (callers can still override it via kwargs)
    kwargs.setdefault("enable_prefix_caching", True)

    # Set the compilation flag - this controls whether vLLM will compile the model for better performance.
    # Disabled by default in GRPO since it can cause issues during training.
    if "compilation_config" not in kwargs:
        if compile:
            kwargs["compilation_config"] = {"level": 3}  # PIECEWISE compilation
        else:
            kwargs["compilation_config"] = {"level": 0}  # NO_COMPILATION

    engine_args = AsyncEngineArgs(
        model=model_name,
        tensor_parallel_size=tensor_parallel_size,
        data_parallel_size=data_parallel_size,
        pipeline_parallel_size=pipeline_parallel_size,
        worker_extension_cls="torchrl.modules.llm.backends.vllm.vllm_async._AsyncvLLMWorker",
        **kwargs,
    )

    return AsyncVLLM.launch(engine_args, num_replicas)
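

# A minimal usage sketch for make_async_vllm_engine (keyword-only arguments; assumes Ray,
# vLLM and a CUDA device are available, and the model name is only a placeholder):
#
#     from vllm import SamplingParams
#
#     service = make_async_vllm_engine(
#         model_name="Qwen/Qwen2.5-3B",
#         num_devices=2,          # tensor-parallel degree per replica
#         num_replicas=2,
#         compile=False,          # skip vLLM compilation, e.g. during training loops
#         enable_fp32_output=True,
#     )
#     service.create_load_balancer()
#     out = service.generate("Hello, world!", SamplingParams(max_tokens=16))
#     service.shutdown()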
|