sglang 0.1.26__py3-none-any.whl → 0.2.1__py3-none-any.whl

This diff shows the changes between two publicly released versions of this package, as published to one of the supported registries. It is provided for informational purposes only.
sglang/bench_serving.py CHANGED
@@ -369,7 +369,7 @@ def sample_random_requests(
 ) -> List[Tuple[str, int, int]]:
 
     input_lens = np.random.randint(
-        int(input_len * range_ratio),
+        max(int(input_len * range_ratio), 1),
        input_len + 1,
        size=num_prompts,
    )
@@ -415,7 +415,7 @@ def sample_random_requests(
         prompt_token_ids = tokenizer(prompt).input_ids
         prompt_len = len(prompt_token_ids)
 
-        if prompt_len <= input_lens[i]:
+        if prompt_len > input_lens[i]:
             input_ids = prompt_token_ids[: input_lens[i]]
         else:
             ratio = (input_lens[i] + prompt_len - 1) // prompt_len
@@ -935,7 +935,7 @@ if __name__ == "__main__":
     parser.add_argument(
         "--random-range-ratio",
         type=float,
-        default=1.0,
+        default=0.0,
        help="Range of sampled ratio of input/output length, "
        "used only for random dataset.",
    )
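The three hunks above floor the sampled input length at one token, flip the truncate-vs-repeat branch so prompts longer than the target are truncated and shorter ones are repeated, and change the default of `--random-range-ratio` to 0.0. A minimal standalone sketch of that sampling logic (the helper names and the repeat step are reconstructions for illustration, not code copied from the benchmark):

```python
import numpy as np

def sample_lengths(input_len, output_len, num_prompts, range_ratio):
    # max(..., 1) keeps every sampled length at least 1 even when
    # range_ratio == 0.0 (the new default of --random-range-ratio).
    input_lens = np.random.randint(
        max(int(input_len * range_ratio), 1), input_len + 1, size=num_prompts
    )
    output_lens = np.random.randint(
        max(int(output_len * range_ratio), 1), output_len + 1, size=num_prompts
    )
    return input_lens, output_lens

def fit_prompt(prompt_token_ids, target_len):
    # Mirrors the corrected branch: truncate long prompts, repeat short ones.
    if len(prompt_token_ids) > target_len:
        return prompt_token_ids[:target_len]
    ratio = (target_len + len(prompt_token_ids) - 1) // len(prompt_token_ids)
    return (prompt_token_ids * ratio)[:target_len]

print(sample_lengths(1024, 128, num_prompts=4, range_ratio=0.0))
print(fit_prompt([1, 2, 3], 8))  # [1, 2, 3, 1, 2, 3, 1, 2]
```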
sglang/global_config.py CHANGED
@@ -17,7 +17,7 @@ class GlobalConfig:
 
         # Runtime constants: New generation token ratio estimation
         self.init_new_token_ratio = 0.7
-        self.base_min_new_token_ratio = 0.2
+        self.base_min_new_token_ratio = 0.1
         self.new_token_ratio_decay = 0.001
         self.new_token_ratio_recovery = 0.05
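These constants govern the scheduler's running estimate of how many tokens each in-flight request will still generate; lowering `base_min_new_token_ratio` from 0.2 to 0.1 lets that estimate decay further, which roughly translates into more aggressive batch packing. A hypothetical illustration of such a decay schedule, not the scheduler's actual update rule:

```python
# Assumed linear per-step decay from init_new_token_ratio toward the floor;
# the real scheduler also bumps the ratio back up when requests are retracted.
init_new_token_ratio = 0.7
new_token_ratio_decay = 0.001

for base_min_new_token_ratio in (0.2, 0.1):
    ratio, steps = init_new_token_ratio, 0
    while ratio - new_token_ratio_decay >= base_min_new_token_ratio:
        ratio -= new_token_ratio_decay
        steps += 1
    print(f"floor {base_min_new_token_ratio}: reached after {steps} steps")
```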
sglang/srt/managers/controller/model_runner.py CHANGED
@@ -15,7 +15,6 @@ from flashinfer import (
     BatchPrefillWithRaggedKVCacheWrapper,
 )
 from flashinfer.decode import _grouped_size_compiled_for_decode_kernels
-from torch.nn.parameter import Parameter
 from vllm.config import DeviceConfig, LoadConfig
 from vllm.config import ModelConfig as VllmModelConfig
 from vllm.distributed import (
@@ -23,7 +22,6 @@ from vllm.distributed import (
     init_distributed_environment,
     initialize_model_parallel,
 )
-from vllm.model_executor.layers.linear import QKVParallelLinear
 from vllm.model_executor.models import ModelRegistry
 
 from sglang.global_config import global_config
@@ -32,26 +30,16 @@ from sglang.srt.memory_pool import ReqToTokenPool, TokenToKVPool
 from sglang.srt.server_args import ServerArgs
 from sglang.srt.utils import (
     get_available_gpu_memory,
+    is_llama3_405b_fp8,
     is_multimodal_model,
     monkey_patch_vllm_dummy_weight_loader,
     monkey_patch_vllm_p2p_access_check,
+    monkey_patch_vllm_qvk_linear_loader,
 )
 
 logger = logging.getLogger("srt.model_runner")
 
 
-def is_llama3_405b_fp8(model_config):
-    if (
-        model_config.hf_config.architectures[0] == "LlamaForCausalLM"
-        and model_config.hf_config.hidden_size == 16384
-        and model_config.hf_config.intermediate_size == 53248
-        and model_config.hf_config.num_hidden_layers == 126
-        and model_config.hf_config.quantization_config["quant_method"] == "fbgemm_fp8"
-    ):
-        return True
-    return False
-
-
 class ModelRunner:
     def __init__(
         self,
@@ -132,9 +120,13 @@ class ModelRunner:
             seed=42,
             skip_tokenizer_init=True,
         )
-        if is_llama3_405b_fp8(self.model_config):
+
+        if is_llama3_405b_fp8(self.model_config) and self.tp_size <= 8:
+            # A temporary hack to fix the num_heads for meta-llama/Meta-Llama-3.1-405B-FP8 checkpoints
             self.model_config.hf_config.num_key_value_heads = 8
             vllm_model_config.hf_config.num_key_value_heads = 8
+            monkey_patch_vllm_qvk_linear_loader()
+
         self.dtype = vllm_model_config.dtype
         if self.model_config.model_overide_args is not None:
             vllm_model_config.hf_config.update(self.model_config.model_overide_args)
@@ -387,39 +379,5 @@ def load_model_cls_srt(model_arch: str) -> Optional[Type[nn.Module]]:
     return model_arch_name_to_cls[model_arch]
 
 
-def get_original_weight(loaded_weight, head_dim):
-    n_kv_head = loaded_weight.shape[0] // (2 * head_dim)
-    dim = loaded_weight.shape[1]
-    for i in range(n_kv_head):
-        loaded_weight[i * head_dim : (i + 1) * head_dim, :] = loaded_weight[
-            2 * i * head_dim : (2 * i + 1) * head_dim, :
-        ]
-    original_kv_weight = loaded_weight[: n_kv_head * head_dim, :]
-    assert original_kv_weight.shape == (n_kv_head * head_dim, dim)
-    return original_kv_weight
-
-
-def get_weight_loader_srt(weight_loader):
-    def weight_loader_srt(
-        self,
-        param: Parameter,
-        loaded_weight: torch.Tensor,
-        loaded_shard_id: Optional[str] = None,
-    ):
-        if (
-            loaded_shard_id in ["k", "v"]
-            and loaded_weight.shape[0] == self.head_size * self.total_num_kv_heads * 2
-        ):
-            loaded_weight = get_original_weight(loaded_weight, self.head_size)
-
-        weight_loader(self, param, loaded_weight, loaded_shard_id)
-
-    return weight_loader_srt
-
-
 # Monkey patch model loader
 setattr(ModelRegistry, "load_model_cls", load_model_cls_srt)
-original_weight_loader = QKVParallelLinear.weight_loader
-setattr(
-    QKVParallelLinear, "weight_loader", get_weight_loader_srt(original_weight_loader)
-)
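The model_runner.py hunks above move the Meta-Llama-3.1-405B-FP8 checkpoint detection and the vLLM QKV weight-loader patch into `sglang.srt.utils`, and apply them only when `tp_size <= 8`. A small mock, built with `SimpleNamespace` stand-ins rather than sglang's real `ModelConfig`, of the kind of Hugging Face config the detector matches:

```python
from types import SimpleNamespace

# Hypothetical stand-in for the transformers config carried by sglang's
# ModelConfig; the field values match the ones is_llama3_405b_fp8 tests for.
hf_config = SimpleNamespace(
    architectures=["LlamaForCausalLM"],
    hidden_size=16384,
    intermediate_size=53248,
    num_hidden_layers=126,
    num_key_value_heads=16,
    quantization_config={"quant_method": "fbgemm_fp8"},
)

matches = (
    hf_config.architectures[0] == "LlamaForCausalLM"
    and hf_config.hidden_size == 16384
    and hf_config.intermediate_size == 53248
    and hf_config.num_hidden_layers == 126
    and hf_config.num_key_value_heads == 16
    and hasattr(hf_config, "quantization_config")
    and hf_config.quantization_config["quant_method"] == "fbgemm_fp8"
)
# Only when all of these hold (and tp_size <= 8) does the runner force
# num_key_value_heads to 8 and patch vLLM's QKV weight loader.
print(matches)  # True
```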
sglang/srt/managers/io_struct.py CHANGED
@@ -40,7 +40,10 @@ class GenerateReqInput:
             self.text is not None and self.input_ids is not None
         ):
             raise ValueError("Either text or input_ids should be provided.")
-        if self.sampling_params.get("n", 1) != 1:
+        if (
+            isinstance(self.sampling_params, dict)
+            and self.sampling_params.get("n", 1) != 1
+        ):
             is_single = False
         else:
             if self.text is not None:
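The added guard matters because `GenerateReqInput.sampling_params` may be a single dict or a per-request list of dicts for batched input (or be left unset), and calling `.get()` on a list would raise `AttributeError`. A sketch of the single-vs-batch decision under that assumption (an illustrative helper, not the class's actual `post_init` logic):

```python
from typing import List, Optional, Union

def is_single_request(
    text: Optional[Union[str, List[str]]],
    sampling_params: Union[dict, List[dict], None],
) -> bool:
    # Only a plain dict supports .get(); a list of dicts means a batched request.
    if isinstance(sampling_params, dict) and sampling_params.get("n", 1) != 1:
        return False  # n > 1 fans a single prompt out into several generations
    return isinstance(text, str)

print(is_single_request("hello", {"n": 2}))           # False
print(is_single_request(["a", "b"], [{"n": 1}] * 2))  # False (batched)
print(is_single_request("hello", None))               # True
```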
sglang/srt/openai_api/adapter.py CHANGED
@@ -94,9 +94,14 @@ def load_chat_template_for_openai_api(chat_template_arg):
 async def v1_completions(tokenizer_manager, raw_request: Request):
     request_json = await raw_request.json()
     request = CompletionRequest(**request_json)
+    prompt = request.prompt
+    if isinstance(prompt, str) or isinstance(prompt[0], str):
+        prompt_kwargs = {"text": prompt}
+    else:
+        prompt_kwargs = {"input_ids": prompt}
 
     adapted_request = GenerateReqInput(
-        text=request.prompt,
+        **prompt_kwargs,
         sampling_params={
             "temperature": request.temperature,
             "max_new_tokens": request.max_tokens,
sglang/srt/server.py CHANGED
@@ -202,15 +202,12 @@ def launch_server(
         "reinstall the latest version by following the instructions "
         "at https://docs.flashinfer.ai/installation.html.",
     )
-
-    if server_args.tp_size // server_args.dp_size > 1:
+    if server_args.tp_size * server_args.dp_size > 1:
         # FIXME: remove this after https://github.com/triton-lang/triton/pull/4295 is used as a dependency.
         maybe_set_triton_cache_manager()
-
     if server_args.chat_template:
         # TODO: replace this with huggingface transformers template
         load_chat_template_for_openai_api(server_args.chat_template)
-
     if server_args.enable_torch_compile:
         _set_torch_compile_config()
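The launch_server hunk replaces the old `tp_size // dp_size > 1` test with `tp_size * dp_size > 1`, so the Triton cache-manager workaround is applied whenever more than one worker process is launched, presumably because any multi-process launch can race on the shared Triton cache. A quick check of which configurations each condition covers:

```python
# Old vs. new trigger for maybe_set_triton_cache_manager(). With tp=1, dp=2
# two worker processes exist, but the old integer-division test never fired.
for tp, dp in [(1, 1), (1, 2), (2, 1), (4, 2)]:
    old = tp // dp > 1
    new = tp * dp > 1
    print(f"tp={tp} dp={dp}: old={old} new={new}")
```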
sglang/srt/utils.py CHANGED
@@ -21,6 +21,7 @@ import torch.distributed as dist
 from fastapi.responses import JSONResponse
 from packaging import version as pkg_version
 from starlette.middleware.base import BaseHTTPMiddleware
+from torch.nn.parameter import Parameter
 from triton.runtime.cache import (
     FileCacheManager,
     default_cache_dir,
@@ -471,7 +472,7 @@ def maybe_set_triton_cache_manager() -> None:
     cache_manger = os.environ.get("TRITON_CACHE_MANAGER", None)
     if cache_manger is None:
         manager = "sglang.srt.utils:CustomCacheManager"
-        logger.info("Setting Triton cache manager to: %s", manager)
+        logger.debug("Setting Triton cache manager to: %s", manager)
         os.environ["TRITON_CACHE_MANAGER"] = manager
 
 
@@ -615,3 +616,52 @@ def set_ulimit(target_soft_limit=65535):
             resource.setrlimit(resource_type, (target_soft_limit, current_hard))
         except ValueError as e:
             logger.warn(f"Fail to set RLIMIT_NOFILE: {e}")
+
+
+def is_llama3_405b_fp8(model_config):
+    """Return whether the model is meta-llama/Meta-Llama-3.1-405B-FP8 with 16 kv heads."""
+    if (
+        model_config.hf_config.architectures[0] == "LlamaForCausalLM"
+        and model_config.hf_config.hidden_size == 16384
+        and model_config.hf_config.intermediate_size == 53248
+        and model_config.hf_config.num_hidden_layers == 126
+        and model_config.hf_config.num_key_value_heads == 16
+        and hasattr(model_config.hf_config, "quantization_config")
+        and model_config.hf_config.quantization_config["quant_method"] == "fbgemm_fp8"
+    ):
+        return True
+    return False
+
+
+def monkey_patch_vllm_qvk_linear_loader():
+    """A temporary hack to fix the num_heads for meta-llama/Meta-Llama-3.1-405B-FP8 checkpoints."""
+    from vllm.model_executor.layers.linear import QKVParallelLinear
+
+    origin_weight_loader = QKVParallelLinear.weight_loader
+
+    def get_original_weight(loaded_weight, head_dim):
+        n_kv_head = loaded_weight.shape[0] // (2 * head_dim)
+        dim = loaded_weight.shape[1]
+        for i in range(n_kv_head):
+            loaded_weight[i * head_dim : (i + 1) * head_dim, :] = loaded_weight[
+                2 * i * head_dim : (2 * i + 1) * head_dim, :
+            ]
+        original_kv_weight = loaded_weight[: n_kv_head * head_dim, :]
+        assert original_kv_weight.shape == (n_kv_head * head_dim, dim)
+        return original_kv_weight
+
+    def weight_loader_srt(
+        self,
+        param: Parameter,
+        loaded_weight: torch.Tensor,
+        loaded_shard_id: Optional[str] = None,
+    ):
+        if (
+            loaded_shard_id in ["k", "v"]
+            and loaded_weight.shape[0] == self.head_size * self.total_num_kv_heads * 2
+        ):
+            loaded_weight = get_original_weight(loaded_weight, self.head_size)
+
+        origin_weight_loader(self, param, loaded_weight, loaded_shard_id)
+
+    setattr(QKVParallelLinear, "weight_loader", weight_loader_srt)
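The new `monkey_patch_vllm_qvk_linear_loader` wraps vLLM's `QKVParallelLinear.weight_loader` so that K/V shards whose row count is twice `head_size * total_num_kv_heads` are folded back down before loading. A toy numeric check of the `get_original_weight` step, assuming the checkpoint stores each KV head's rows twice in consecutive `head_dim` blocks (as the copy loop implies):

```python
import torch

head_dim, n_kv_head, dim = 2, 2, 3
original = torch.arange(n_kv_head * head_dim * dim, dtype=torch.float32).reshape(-1, dim)
# Duplicate every head: rows become [head0, head0, head1, head1].
duplicated = torch.cat([original[0:2], original[0:2], original[2:4], original[2:4]])

def get_original_weight(loaded_weight, head_dim):
    # Same logic as the patch: keep the first copy of each duplicated head.
    n_kv_head = loaded_weight.shape[0] // (2 * head_dim)
    for i in range(n_kv_head):
        loaded_weight[i * head_dim : (i + 1) * head_dim, :] = loaded_weight[
            2 * i * head_dim : (2 * i + 1) * head_dim, :
        ]
    return loaded_weight[: n_kv_head * head_dim, :]

restored = get_original_weight(duplicated.clone(), head_dim)
assert torch.equal(restored, original)
print("restored shape:", tuple(restored.shape))  # (4, 3) == (n_kv_head * head_dim, dim)
```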
sglang/version.py CHANGED
@@ -1 +1 @@
-__version__ = "0.1.26"
+__version__ = "0.2.1"
sglang-0.1.26.dist-info/METADATA → sglang-0.2.1.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: sglang
-Version: 0.1.26
+Version: 0.2.1
 Summary: SGLang is yet another fast serving framework for large language models and vision language models.
 License: Apache License
                                  Version 2.0, January 2004
@@ -249,7 +249,7 @@ Requires-Dist: outlines >=0.0.44 ; extra == 'srt'
 
 --------------------------------------------------------------------------------
 
-| [**Blog**](https://lmsys.org/blog/2024-01-17-sglang/) | [**Paper**](https://arxiv.org/abs/2312.07104) |
+| [**Blog**](https://lmsys.org/blog/2024-07-25-sglang-llama3/) | [**Paper**](https://arxiv.org/abs/2312.07104) |
 
 SGLang is a fast serving framework for large language models and vision language models.
 It makes your interaction with models faster and more controllable by co-designing the backend runtime and frontend language.
@@ -259,13 +259,14 @@ The core features include:
 - **Flexible Frontend Language**: Enables easy programming of LLM applications with chained generation calls, advanced prompting, control flow, multiple modalities, parallelism, and external interactions.
 
 ## News
-- [2024/04] 🔥 SGLang is used by the official **LLaVA-NeXT (video)** release ([blog](https://llava-vl.github.io/blog/2024-04-30-llava-next-video/)).
-- [2024/02] 🔥 SGLang enables **3x faster JSON decoding** with compressed finite state machine ([blog](https://lmsys.org/blog/2024-02-05-compressed-fsm/)).
-- [2024/01] SGLang provides up to **5x faster inference** with RadixAttention ([blog](https://lmsys.org/blog/2024-01-17-sglang/)).
+- [2024/07] 🔥 Faster Llama3 Serving with SGLang Runtime (vs. TensorRT-LLM, vLLM) ([blog](https://lmsys.org/blog/2024-07-25-sglang-llama3/)).
+- [2024/04] SGLang is used by the official **LLaVA-NeXT (video)** release ([blog](https://llava-vl.github.io/blog/2024-04-30-llava-next-video/)).
+- [2024/02] SGLang enables **3x faster JSON decoding** with compressed finite state machine ([blog](https://lmsys.org/blog/2024-02-05-compressed-fsm/)).
 
 <details>
 <summary>More</summary>
 
+- [2024/01] SGLang provides up to **5x faster inference** with RadixAttention ([blog](https://lmsys.org/blog/2024-01-17-sglang/)).
 - [2024/01] SGLang powers the serving of the official **LLaVA v1.6** release demo ([usage](https://github.com/haotian-liu/LLaVA?tab=readme-ov-file#demo)).
 
 </details>
@@ -302,7 +303,8 @@ pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.3/
 ```
 
 ### Method 3: Using docker
-The docker images are available on Docker Hub as [lmsysorg/sglang](https://hub.docker.com/r/lmsysorg/sglang/tags).
+The docker images are available on Docker Hub as [lmsysorg/sglang](https://hub.docker.com/r/lmsysorg/sglang/tags), built from [Dockerfile](docker).
+Repalce `<secret>` below with your huggingface hub [token](https://huggingface.co/docs/hub/en/security-tokens).
 
 ```bash
 docker run --gpus all \
@@ -311,15 +313,10 @@ docker run --gpus all \
     --env "HUGGING_FACE_HUB_TOKEN=<secret>" \
     --ipc=host \
     lmsysorg/sglang:latest \
-    python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B --host 0.0.0.0 --port 30000
+    python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --host 0.0.0.0 --port 30000
 ```
 
 ### Common Notes
-- If you see errors from the Triton compiler, please install the [Triton Nightly](https://triton-lang.org/main/getting-started/installation.html) by
-```
-pip uninstall -y triton triton-nightly
-pip install -U --index-url https://aiinfra.pkgs.visualstudio.com/PublicPackages/_packaging/Triton-Nightly/pypi/simple/ triton-nightly
-```
 - If you cannot install FlashInfer, check out its [installation](https://docs.flashinfer.ai/installation.html#) page. If you still cannot install it, you can use the slower Triton kernels by adding `--disable-flashinfer` when launching the server.
 - If you only need to use the OpenAI backend, you can avoid installing other dependencies by using `pip install "sglang[openai]"`.
 
@@ -402,6 +399,22 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
 ```
 - If the model does not have a template in the Hugging Face tokenizer, you can specify a [custom chat template](docs/custom_chat_template.md).
 - To enable fp8 quantization, you can add `--quantization fp8` on a fp16 checkpoint or directly load a fp8 checkpoint without specifying any arguments.
+- To enable experimental torch.compile support, you can add `--enable-torch-compile`. It accelerates small models on small batch sizes.
+
+### Run Llama 3.1 405B
+
+```bash
+# 2 nodes run 405B fp16
+# replace the `172.16.4.52:20000` with your own first node ip address and port, disable CUDA Graph temporarily
+# on the first node
+GLOO_SOCKET_IFNAME=eth0 python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instruct --tp 16 --nccl-init-addr 172.16.4.52:20000 --nnodes 2 --node-rank 0 --disable-cuda-graph --mem-frac 0.75
+
+# on the second
+GLOO_SOCKET_IFNAME=eth0 python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instruct --tp 16 --nccl-init-addr 172.16.4.52:20000 --nnodes 2 --node-rank 1 --disable-cuda-graph --mem-frac 0.75
+
+# single node run 405B fp8
+python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-405B-Instruct-FP8 --tp 8
+```
 
 ### Supported Models
 
@@ -660,15 +673,12 @@ for out in state.text_iter():
 - The `choices` argument in `sgl.gen` is implemented by computing the [token-length normalized log probabilities](https://blog.eleuther.ai/multiple-choice-normalization/) of all choices and selecting the one with the highest probability.
 - The `regex` argument in `sgl.gen` is implemented through autoregressive decoding with logit bias masking, according to the constraints set by the regex. It is compatible with `temperature=0` and `temperature != 0`.
 
-## Benchmark And Performance
-- Llama-7B on NVIDIA A10G, FP16, Tensor Parallelism=1
-![llama_7b](assets/llama_7b.jpg)
 
-- Mixtral-8x7B on NVIDIA A10G, FP16, Tensor Parallelism=8
-![mixtral_8x7b](assets/mixtral_8x7b.jpg)
+## Benchmark And Performance
+![8b_throughput](https://lmsys.org/images/blog/sglang_llama3/8b_throughput.svg)
+![70b_fp8_throughput](https://lmsys.org/images/blog/sglang_llama3/70b_fp8_throughput.svg)
 
-- Learn more about the above [results](docs/benchmark_results.md).
-- Synthetic latency and throughput benchmark [scripts](https://github.com/sgl-project/sglang/tree/main/benchmark/latency_throughput).
+Learn more at this [blog](https://lmsys.org/blog/2024-07-25-sglang-llama3/).
 
 ## Roadmap
 [Development Roadmap (2024 Q3)](https://github.com/sgl-project/sglang/issues/634)
sglang-0.1.26.dist-info/RECORD → sglang-0.2.1.dist-info/RECORD CHANGED
@@ -1,13 +1,13 @@
 sglang/__init__.py,sha256=UV7VlXhXrwi00Zg45iNB9KcnmrwLjdMtjMz06AiafY0,1151
 sglang/api.py,sha256=1JARbc1wNYF6tODdUpgmNgTyLOvMnxdTBctLvEwzGTY,5565
 sglang/bench_latency.py,sha256=UPy6WhrddMTDX7HqIeHNhCn5vF0YMOKxJlQRvhMC8zU,10552
-sglang/bench_serving.py,sha256=zKGgVX3S-ggUvOxvEM4AszzXRPRVU6NGNnBG5vAAvRY,34577
+sglang/bench_serving.py,sha256=UWhTENnoATPJo3nk59Ktr73CwZgiY_MGaRY6TQk0ozI,34584
 sglang/check_env.py,sha256=CscuPMlf68dkgZf0m-FiLpUisNNDoihMck4qhLOeV1Q,4124
-sglang/global_config.py,sha256=QG-ABVJksKK_llvUx7fSZcmK4GGCs-hBUVcM4LCr7Nw,1749
+sglang/global_config.py,sha256=CyhGL7PE-KlMcg7IHWykzImU1y4NQlpeIlh9lHA77uo,1749
 sglang/launch_server.py,sha256=Gg8CwNlTCCfg1dF65ZT9ePLxOT9LKtY79GhIPG6PCrU,358
 sglang/launch_server_llavavid.py,sha256=40uaazMsavKuk6YXFa5v37kdUpFGuealgJJeph1g8gU,1025
 sglang/utils.py,sha256=arJuwOAEX445M2NL9SAOi6jBNu0-cfU04PLAr-hIH3U,8168
-sglang/version.py,sha256=3_QdGLpuk_SDY7k9PpNcHpSTjlPdhadPiEgF82wzkqk,23
+sglang/version.py,sha256=HfjVOrpTnmZ-xVFCYSVmX50EXaBQeJteUHG-PD6iQs8,22
 sglang/lang/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 sglang/lang/chat_template.py,sha256=psIlhaDo70twgLrx5Lgln03metLEA3-FZuixeI0Y7Ao,13309
 sglang/lang/compiler.py,sha256=UiXUmPR9wBAPtnORrLcyQX8Uh0ZL0nKeV8ZgBozAJPw,7531
@@ -28,9 +28,9 @@ sglang/srt/memory_pool.py,sha256=FhJk5GtYortO3MJIsMMQ-o49agwDHVX1aEQH2LITq6c,394
 sglang/srt/mm_utils.py,sha256=OptgAHDX-73Bk4jAdr2BOAJtiEXJNzPrMhaM-dy275c,8889
 sglang/srt/model_config.py,sha256=lZu1D-XLVMETHS6FBMoPn8Uowa9QFGe95d3SuWrr2q8,5282
 sglang/srt/sampling_params.py,sha256=OI11asr1Bd_E5soDjih614v4flgWxdMZU9HAF0aBafQ,3062
-sglang/srt/server.py,sha256=DXhcJt0V24a7yhydP1abPrK1qqV3qt7r8cyOMVOAI4M,14611
+sglang/srt/server.py,sha256=IUed6vnXCx7-xbrpEMAaJZ_aa4UubPAQ5pXvcv-xNoY,14607
 sglang/srt/server_args.py,sha256=aF6L35mEB-FU3BL_ooKuCIcOXLhYLxA9-MjpaOTQRCo,13189
-sglang/srt/utils.py,sha256=bUp3SLzbDms0dvuETaccDPAGRHOIGW5A61pqH62XiT0,20370
+sglang/srt/utils.py,sha256=HvKkGbut8sOxMpGIzYsJ9NEZJg48LOnxyGESaGZmANs,22385
 sglang/srt/constrained/__init__.py,sha256=5LB3_mDTMW6wcRkFA5J2Rd5HPHHEKRyiELhe4gtlBYM,1472
 sglang/srt/constrained/base_cache.py,sha256=QQjmFEiT8jlOskJoZobhrDl2TKB-B4b1LPQo9JQCP_w,1405
 sglang/srt/constrained/fsm_cache.py,sha256=P4qNDHHxpKpTnYL_8V1R6OFXlUwbM6ZcBdzddpcBgb4,1135
@@ -45,13 +45,13 @@ sglang/srt/layers/token_attention.py,sha256=EJ4gjbVLfshOZ_vr1iB-Eq8_B-4F26n_wPDj
 sglang/srt/layers/quantization/__init__.py,sha256=PQFzdPpul98DvywBA6YMBOnrMjtHE1LMlMpJ7FM8J3I,1971
 sglang/srt/layers/quantization/fp8.py,sha256=jaqgRFnHC--IL8iqB6Qygi-KXYPYBKKqt_j4Rk55_h4,24946
 sglang/srt/managers/detokenizer_manager.py,sha256=8rN2cdMr61LWy07lingEqLnNy0W5Rebdn14IsTQ9PCs,5049
-sglang/srt/managers/io_struct.py,sha256=Y6jW3p0cNg0jcrEQNki1H8MMEWxwWA4p6Y-xVgUVWaI,5404
+sglang/srt/managers/io_struct.py,sha256=VHy9wdZ3sfZA7fS6iq8lqbxdHL5WkBZNqxpacyZ8_8c,5483
 sglang/srt/managers/tokenizer_manager.py,sha256=SbivhFhZUR9HU9pLTe93MlYprAFAHzOU3KMBA2piQUk,19308
 sglang/srt/managers/controller/cuda_graph_runner.py,sha256=0aRqA1_34oJ557Zn8PjpJecex5bBWJdnCmBlcDVvYO0,8509
 sglang/srt/managers/controller/infer_batch.py,sha256=SKwCwhnZ_CNlG0mVCEc4X0e4HNjJFke-c8zdWP3TzjQ,34186
 sglang/srt/managers/controller/manager_multi.py,sha256=DT8Y9RF5OyTxlrLEZYz4claNWir3UrVztdOZaVPiA6g,6077
 sglang/srt/managers/controller/manager_single.py,sha256=2xO_iWK6tWvc0B31nKbe2N3klxwQBJmPTnFhNjzhVSI,4566
-sglang/srt/managers/controller/model_runner.py,sha256=FwZ7FU7nhJsYhtoTNxYFc4e6oMEwSqOh8ohXOKtFPKc,15828
+sglang/srt/managers/controller/model_runner.py,sha256=4-nBd9_MgIlamjEdLZDepBEykYNR8nL-65Sf1EYsnx0,14371
 sglang/srt/managers/controller/radix_cache.py,sha256=tx8LEQpqLxipw9UUVj4D1YQLMMDmWnjDYv8oDlOl-co,8210
 sglang/srt/managers/controller/schedule_heuristic.py,sha256=SQAGzPS3aB_TPj7rnPBhewwyR6W1sVwW4D3zG3JUY00,2714
 sglang/srt/managers/controller/tp_worker.py,sha256=yjz-Xzl0zEy4QSU-EYneZH5vi3oHtBuXTtYe4VuDp2g,30517
@@ -79,14 +79,14 @@ sglang/srt/models/qwen2.py,sha256=87Tt1Bti-Py3AGudcf7k5ni-OHhtDKPj_Hke44YGw4U,11
 sglang/srt/models/qwen2_moe.py,sha256=oHNoo45myV5kitkls2GWVzuGt1Q4pRHN2nLlXEltFI8,17581
 sglang/srt/models/stablelm.py,sha256=Z_XCDSHY_QMz3lZwwkZdIZjEOizZjLYJU9GDi8o08qQ,10802
 sglang/srt/models/yivl.py,sha256=55KPrQ-dVplI0hh2WCSugjc1luE0J2UAafjZxu_7Xuc,4367
-sglang/srt/openai_api/adapter.py,sha256=eirFYVGIp5D-UrQLqW5dRJOQYKmzF9nmgCzFeUOb2z8,15737
+sglang/srt/openai_api/adapter.py,sha256=A0IG9ZKEMkkYCsLrVEspnVWzZHBUbc1vHv747LrF8ew,15920
 sglang/srt/openai_api/protocol.py,sha256=j7ifIR2SFQxTwaHAd9ksM096vfffcNltzTH4sg7H0RA,5739
 sglang/test/test_conversation.py,sha256=gF_AyOxQgpPQBPnA57-kq-M0p_zFu-rBDMFgAq655Rw,1596
 sglang/test/test_openai_protocol.py,sha256=DVx3r6hrb8oRqbo5AYIleldxbqMBTtb-gtORM6t_Y1c,1661
 sglang/test/test_programs.py,sha256=uefeHUFKT2NJESOujj-CsnPXdw1aQQN2TzUbPCHJjGs,13654
 sglang/test/test_utils.py,sha256=kD_fQe3WroZ9Kc3NBRKPiZOFJ_JD2uEE9XIvPp6AD9Y,11048
-sglang-0.1.26.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
-sglang-0.1.26.dist-info/METADATA,sha256=QnzTK6blFTHKTDw9ULRpaJVvXyg0MuzkdqwYkk0zPb0,30986
-sglang-0.1.26.dist-info/WHEEL,sha256=Wyh-_nZ0DJYolHNn1_hMa4lM7uDedD_RGVwbmTjyItk,91
-sglang-0.1.26.dist-info/top_level.txt,sha256=yxhh3pYQkcnA7v3Bg889C2jZhvtJdEincysO7PEB09M,7
-sglang-0.1.26.dist-info/RECORD,,
+sglang-0.2.1.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+sglang-0.2.1.dist-info/METADATA,sha256=9Ez23PnJHeEmys9eu3mEfe5SASMKR-AZ8nR9hXMqS5A,31706
+sglang-0.2.1.dist-info/WHEEL,sha256=Wyh-_nZ0DJYolHNn1_hMa4lM7uDedD_RGVwbmTjyItk,91
+sglang-0.2.1.dist-info/top_level.txt,sha256=yxhh3pYQkcnA7v3Bg889C2jZhvtJdEincysO7PEB09M,7
+sglang-0.2.1.dist-info/RECORD,,