sglang 0.3.6__py3-none-any.whl → 0.3.6.post1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (102)
  1. sglang/__init__.py +2 -2
  2. sglang/api.py +2 -2
  3. sglang/bench_one_batch.py +2 -4
  4. sglang/bench_serving.py +75 -26
  5. sglang/lang/backend/base_backend.py +1 -1
  6. sglang/lang/backend/runtime_endpoint.py +2 -2
  7. sglang/srt/configs/model_config.py +13 -14
  8. sglang/srt/constrained/__init__.py +13 -14
  9. sglang/srt/constrained/base_grammar_backend.py +13 -15
  10. sglang/srt/constrained/outlines_backend.py +13 -15
  11. sglang/srt/constrained/outlines_jump_forward.py +13 -15
  12. sglang/srt/constrained/xgrammar_backend.py +38 -57
  13. sglang/srt/conversation.py +13 -15
  14. sglang/srt/hf_transformers_utils.py +13 -15
  15. sglang/srt/layers/activation.py +13 -13
  16. sglang/srt/layers/attention/flashinfer_backend.py +13 -6
  17. sglang/srt/layers/attention/triton_ops/decode_attention.py +51 -55
  18. sglang/srt/layers/attention/triton_ops/extend_attention.py +16 -16
  19. sglang/srt/layers/attention/triton_ops/prefill_attention.py +13 -15
  20. sglang/srt/layers/custom_op_util.py +13 -14
  21. sglang/srt/layers/fused_moe_grok/__init__.py +1 -0
  22. sglang/srt/layers/{fused_moe → fused_moe_grok}/layer.py +4 -9
  23. sglang/srt/layers/{fused_moe/patch.py → fused_moe_patch.py} +5 -0
  24. sglang/srt/layers/fused_moe_triton/__init__.py +44 -0
  25. sglang/srt/layers/fused_moe_triton/fused_moe.py +861 -0
  26. sglang/srt/layers/fused_moe_triton/layer.py +633 -0
  27. sglang/srt/layers/layernorm.py +13 -15
  28. sglang/srt/layers/logits_processor.py +13 -15
  29. sglang/srt/layers/quantization/__init__.py +77 -17
  30. sglang/srt/layers/radix_attention.py +13 -15
  31. sglang/srt/layers/rotary_embedding.py +13 -13
  32. sglang/srt/lora/lora.py +13 -14
  33. sglang/srt/lora/lora_config.py +13 -14
  34. sglang/srt/lora/lora_manager.py +22 -24
  35. sglang/srt/managers/data_parallel_controller.py +25 -19
  36. sglang/srt/managers/detokenizer_manager.py +13 -16
  37. sglang/srt/managers/io_struct.py +43 -28
  38. sglang/srt/managers/schedule_batch.py +55 -26
  39. sglang/srt/managers/schedule_policy.py +13 -15
  40. sglang/srt/managers/scheduler.py +89 -70
  41. sglang/srt/managers/session_controller.py +14 -15
  42. sglang/srt/managers/tokenizer_manager.py +29 -22
  43. sglang/srt/managers/tp_worker.py +13 -15
  44. sglang/srt/managers/tp_worker_overlap_thread.py +13 -15
  45. sglang/srt/metrics/collector.py +13 -15
  46. sglang/srt/metrics/func_timer.py +13 -15
  47. sglang/srt/mm_utils.py +13 -14
  48. sglang/srt/model_executor/cuda_graph_runner.py +20 -19
  49. sglang/srt/model_executor/forward_batch_info.py +19 -17
  50. sglang/srt/model_executor/model_runner.py +42 -30
  51. sglang/srt/models/chatglm.py +15 -16
  52. sglang/srt/models/commandr.py +15 -16
  53. sglang/srt/models/dbrx.py +15 -16
  54. sglang/srt/models/deepseek.py +15 -15
  55. sglang/srt/models/deepseek_v2.py +15 -15
  56. sglang/srt/models/exaone.py +14 -15
  57. sglang/srt/models/gemma.py +14 -14
  58. sglang/srt/models/gemma2.py +24 -19
  59. sglang/srt/models/gemma2_reward.py +13 -14
  60. sglang/srt/models/gpt_bigcode.py +14 -14
  61. sglang/srt/models/grok.py +15 -15
  62. sglang/srt/models/internlm2.py +13 -15
  63. sglang/srt/models/internlm2_reward.py +13 -14
  64. sglang/srt/models/llama.py +21 -21
  65. sglang/srt/models/llama_classification.py +13 -14
  66. sglang/srt/models/llama_reward.py +13 -14
  67. sglang/srt/models/llava.py +13 -15
  68. sglang/srt/models/llavavid.py +13 -15
  69. sglang/srt/models/minicpm.py +13 -15
  70. sglang/srt/models/minicpm3.py +13 -15
  71. sglang/srt/models/mistral.py +13 -15
  72. sglang/srt/models/mixtral.py +15 -15
  73. sglang/srt/models/mixtral_quant.py +14 -14
  74. sglang/srt/models/olmo.py +21 -19
  75. sglang/srt/models/olmoe.py +23 -20
  76. sglang/srt/models/qwen.py +14 -14
  77. sglang/srt/models/qwen2.py +22 -19
  78. sglang/srt/models/qwen2_moe.py +17 -18
  79. sglang/srt/models/stablelm.py +18 -16
  80. sglang/srt/models/torch_native_llama.py +15 -17
  81. sglang/srt/models/xverse.py +13 -14
  82. sglang/srt/models/xverse_moe.py +15 -16
  83. sglang/srt/models/yivl.py +13 -15
  84. sglang/srt/openai_api/adapter.py +13 -15
  85. sglang/srt/openai_api/protocol.py +13 -15
  86. sglang/srt/sampling/sampling_batch_info.py +4 -1
  87. sglang/srt/sampling/sampling_params.py +13 -15
  88. sglang/srt/server.py +59 -34
  89. sglang/srt/server_args.py +22 -22
  90. sglang/srt/utils.py +196 -17
  91. sglang/test/few_shot_gsm8k.py +8 -4
  92. sglang/test/runners.py +13 -14
  93. sglang/test/test_utils.py +1 -1
  94. sglang/version.py +1 -1
  95. {sglang-0.3.6.dist-info → sglang-0.3.6.post1.dist-info}/LICENSE +1 -1
  96. {sglang-0.3.6.dist-info → sglang-0.3.6.post1.dist-info}/METADATA +24 -15
  97. sglang-0.3.6.post1.dist-info/RECORD +164 -0
  98. sglang/srt/layers/fused_moe/__init__.py +0 -1
  99. sglang-0.3.6.dist-info/RECORD +0 -161
  100. /sglang/srt/layers/{fused_moe → fused_moe_grok}/fused_moe.py +0 -0
  101. {sglang-0.3.6.dist-info → sglang-0.3.6.post1.dist-info}/WHEEL +0 -0
  102. {sglang-0.3.6.dist-info → sglang-0.3.6.post1.dist-info}/top_level.txt +0 -0
sglang/srt/utils.py CHANGED
@@ -1,22 +1,21 @@
- """
- Copyright 2023-2024 SGLang Team
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
-
-     http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
- """
-
+ # Copyright 2023-2024 SGLang Team
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ # ==============================================================================
  """Common utilities."""
 
  import base64
  import ipaddress
+ import itertools
  import json
  import logging
  import os
@@ -33,7 +32,7 @@ import time
  import warnings
  from importlib.metadata import PackageNotFoundError, version
  from io import BytesIO
- from typing import Any, Dict, List, Optional, Union
+ from typing import Any, Callable, Dict, List, Optional, Protocol, Tuple, Union
 
  import numpy as np
  import psutil
@@ -46,6 +45,8 @@ from fastapi.responses import ORJSONResponse
  from packaging import version as pkg_version
  from starlette.routing import Mount
  from torch import nn
+ from torch.func import functional_call
+ from torch.library import Library
  from torch.profiler import ProfilerActivity, profile, record_function
  from triton.runtime.cache import (
      FileCacheManager,
@@ -192,6 +193,94 @@ def get_available_gpu_memory(device, gpu_id, distributed=False):
      return free_gpu_memory / (1 << 30)
 
 
+ def is_pin_memory_available() -> bool:
+     return torch.cuda.is_available()
+
+
+ _CPU_OFFLOAD_BYTES = 0
+ _CPU_OFFLOAD_MAX_BYTES = 0
+
+
+ def set_cpu_offload_max_bytes(max_bytes: int) -> None:
+     global _CPU_OFFLOAD_MAX_BYTES, _CPU_OFFLOAD_BYTES
+     _CPU_OFFLOAD_BYTES = 0
+     _CPU_OFFLOAD_MAX_BYTES = max_bytes
+
+
+ def maybe_offload_to_cpu(module: torch.nn.Module) -> torch.nn.Module:
+     device = next(module.parameters()).device
+
+     if device == torch.device("cpu"):
+         return module
+
+     global _CPU_OFFLOAD_MAX_BYTES, _CPU_OFFLOAD_BYTES
+     if _CPU_OFFLOAD_BYTES >= _CPU_OFFLOAD_MAX_BYTES:
+         return module
+
+     pin_memory = is_pin_memory_available()
+     # offload parameters to CPU
+     # use pin_memory if possible, which helps cudagraph capture speed
+     offloaded_parameters = False
+     for p in module.parameters():
+         if _CPU_OFFLOAD_BYTES >= _CPU_OFFLOAD_MAX_BYTES:
+             # we use per-parameter offloading
+             # one module might have some parameters offloaded and some not
+             break
+
+         # `torch.empty_like` does not support `pin_memory` argument
+         cpu_data = torch.empty_strided(
+             size=p.data.size(),
+             stride=p.data.stride(),
+             dtype=p.data.dtype,
+             layout=p.data.layout,
+             device="cpu",
+             pin_memory=pin_memory,
+         )
+         cpu_data.copy_(p.data)
+         p.data = cpu_data
+         _CPU_OFFLOAD_BYTES += p.data.numel() * p.data.element_size()
+         offloaded_parameters = True
+
+     if offloaded_parameters:
+         original_forward = module.forward
+
+         def forward(*args, **kwargs):
+             module.forward = original_forward
+             device_state = {
+                 # here we blindly call `to(device)`
+                 # if the parameter is already on the device, it will be a no-op
+                 k: v.to(device, non_blocking=True)
+                 for k, v in module.state_dict().items()
+             }
+             output = functional_call(module, device_state, args=args, kwargs=kwargs)
+             module.forward = forward
+             return output
+
+         module.forward = forward
+
+     return module
+
+
+ class LayerFn(Protocol):
+
+     def __call__(self, layer_id: int, prefix: str) -> torch.nn.Module: ...
+
+
+ def make_layers(
+     num_hidden_layers: int,
+     layer_fn: LayerFn,
+     prefix: str = "",
+ ) -> Tuple[int, int, torch.nn.ModuleList]:
+     """Make a list of layers with the given layer function"""
+     modules = torch.nn.ModuleList(
+         [
+             maybe_offload_to_cpu(layer_fn(idx=idx, prefix=f"{prefix}.{idx}"))
+             for idx in range(num_hidden_layers)
+         ]
+     )
+     return modules
+
+
  def set_random_seed(seed: int) -> None:
      """Set the random seed for all libraries."""
      random.seed(seed)
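The hunk above introduces a per-parameter CPU-offload mechanism plus a `make_layers` helper that wraps every layer it builds with `maybe_offload_to_cpu`. A minimal usage sketch follows, assuming the helpers are imported from `sglang.srt.utils` as added in this diff; the 4 GiB budget and the `torch.nn.Linear` layer factory are illustrative placeholders, not part of the release:

    import torch

    from sglang.srt.utils import make_layers, set_cpu_offload_max_bytes

    # Illustrative budget: keep at most 4 GiB of parameters in (pinned) CPU memory.
    set_cpu_offload_max_bytes(4 * 1024**3)

    # make_layers() wraps each layer produced by the factory with maybe_offload_to_cpu():
    # parameters are moved to CPU until the budget is exhausted, and are copied back to
    # the original device on every forward() call via torch.func.functional_call.
    layers = make_layers(
        num_hidden_layers=32,
        # Placeholder factory; assumes a CUDA device is available (offload is a no-op for
        # modules already on CPU).
        layer_fn=lambda idx, prefix: torch.nn.Linear(4096, 4096, device="cuda"),
        prefix="model.layers",
    )

In the actual release this path is driven by the new `--cpu-offload-gb` style budget plumbed through `model_runner.py` rather than called directly as above.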
@@ -842,4 +931,94 @@ def get_nvgpu_memory_capacity():
 
  def crash_on_warnings():
      # Crash on warning if we are running CI tests
-     return os.getenv("SGLANG_IS_IN_CI", "false") == "true"
+     return os.getenv("SGLANG_IS_IN_CI", "false").lower() == "true"
+
+
+ def get_device_name(device_id: int = 0) -> str:
+     if hasattr(torch, "cuda") and torch.cuda.is_available():
+         return torch.cuda.get_device_name(device_id)
+
+     if hasattr(torch, "hip") and torch.hip.is_available():
+         return torch.hip.get_device_name(device_id)
+
+     if hasattr(torch, "xpu") and torch.xpu.is_available():
+         return torch.xpu.get_device_name(device_id)
+
+     if hasattr(torch, "hpu") and torch.hpu.is_available():
+         return torch.hpu.get_device_name(device_id)
+
+
+ sglang_lib = Library("sglang", "FRAGMENT")  # noqa
+
+
+ def direct_register_custom_op(
+     op_name: str,
+     op_func: Callable,
+     mutates_args: List[str],
+     fake_impl: Optional[Callable] = None,
+     target_lib: Optional[Library] = None,
+ ):
+     """
+     `torch.library.custom_op` can have significant overhead because it
+     needs to consider complicated dispatching logic. This function
+     directly registers a custom op and dispatches it to the CUDA backend.
+     See https://gist.github.com/youkaichao/ecbea9ec9fc79a45d2adce1784d7a9a5
+     for more details.
+
+     By default, the custom op is registered to the vLLM library. If you
+     want to register it to a different library, you can pass the library
+     object to the `target_lib` argument.
+
+     IMPORTANT: the lifetime of the operator is tied to the lifetime of the
+     library object. If you want to bind the operator to a different library,
+     make sure the library object is alive when the operator is used.
+     """
+     import torch.library
+
+     if hasattr(torch.library, "infer_schema"):
+         schema_str = torch.library.infer_schema(op_func, mutates_args=mutates_args)
+     else:
+         # for pytorch 2.4
+         import torch._custom_op.impl
+
+         schema_str = torch._custom_op.impl.infer_schema(op_func, mutates_args)
+
+     my_lib = target_lib or sglang_lib
+     my_lib.define(op_name + schema_str)
+     my_lib.impl(op_name, op_func, "CUDA")
+     if fake_impl is not None:
+         my_lib._register_fake(op_name, fake_impl)
+
+
+ def gpu_proc_affinity(
+     tp_size: int,
+     nnodes: int,
+     gpu_id: int,
+ ):
+     # current process
+     pid = os.getpid()
+     p = psutil.Process(pid)
+
+     tp_size_per_node = tp_size // nnodes
+
+     # total physical cores
+     total_pcores = psutil.cpu_count(logical=False)
+     # physical cores per TP (N.B. more Cores than GPUs on node)
+     num_cores_bind = total_pcores // tp_size_per_node
+
+     # able to handle multiple DP per node
+     start_cpu_id = (gpu_id * num_cores_bind) % total_pcores
+     end_cpu_id = start_cpu_id + num_cores_bind
+
+     if psutil.cpu_count() != psutil.cpu_count(logical=False):
+         # HT on
+         upper_cpu_ids = [id for id in range(start_cpu_id, end_cpu_id)]
+         lower_cpu_ids = [id + total_pcores for id in range(start_cpu_id, end_cpu_id)]
+         bind_cpu_ids = list(itertools.chain(upper_cpu_ids, lower_cpu_ids))
+     else:
+         # HT off
+         bind_cpu_ids = [id for id in range(start_cpu_id, end_cpu_id)]
+
+     # set cpu_affinity to current process
+     p.cpu_affinity(bind_cpu_ids)
+     logger.info(f"Process {pid} gpu_id {gpu_id} is running on CPUs: {p.cpu_affinity()}")
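Note that the docstring of `direct_register_custom_op` is carried over from vLLM and still says the default target is the vLLM library; in this copy the default is the new `sglang_lib` fragment, so registered ops become available under `torch.ops.sglang.*`. A small, hypothetical registration sketch (the op name and both functions are invented for illustration and assume a CUDA-capable PyTorch build):

    import torch

    from sglang.srt.utils import direct_register_custom_op

    def scale_add(x: torch.Tensor, y: torch.Tensor, alpha: float) -> torch.Tensor:
        # CUDA implementation of the (hypothetical) op.
        return x + alpha * y

    def scale_add_fake(x: torch.Tensor, y: torch.Tensor, alpha: float) -> torch.Tensor:
        # Shape/dtype-only version, used when tracing with fake tensors.
        return torch.empty_like(x)

    # Registers torch.ops.sglang.scale_add with a CUDA impl and a fake impl.
    direct_register_custom_op(
        op_name="scale_add",
        op_func=scale_add,
        mutates_args=[],
        fake_impl=scale_add_fake,
    )

    a = torch.ones(8, device="cuda")
    b = torch.ones(8, device="cuda")
    out = torch.ops.sglang.scale_add(a, b, 0.5)

The `gpu_proc_affinity` helper added in the same hunk is unrelated to op registration: it pins each tensor-parallel GPU process to a contiguous block of physical cores (plus their hyper-thread siblings when SMT is on) via `psutil.Process.cpu_affinity`.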
sglang/test/few_shot_gsm8k.py CHANGED
@@ -48,9 +48,13 @@ def run_eval(args):
      # Select backend
      set_default_backend(RuntimeEndpoint(f"{args.host}:{args.port}"))
 
-     # Read data
-     url = "https://raw.githubusercontent.com/openai/grade-school-math/master/grade_school_math/data/test.jsonl"
-     filename = download_and_cache_file(url)
+     if args.data_path is None:
+         # Read data
+         url = "https://raw.githubusercontent.com/openai/grade-school-math/master/grade_school_math/data/test.jsonl"
+         filename = download_and_cache_file(url)
+     else:
+         filename = args.data_path
+
      lines = list(read_jsonl(filename))
 
      # Construct prompts
@@ -131,7 +135,7 @@ def run_eval(args):
  if __name__ == "__main__":
      parser = argparse.ArgumentParser()
      parser.add_argument("--num-shots", type=int, default=5)
-     parser.add_argument("--data-path", type=str, default="test.jsonl")
+     parser.add_argument("--data-path", type=str)
      parser.add_argument("--num-questions", type=int, default=200)
      parser.add_argument("--max-new-tokens", type=int, default=512)
      parser.add_argument("--parallel", type=int, default=128)
sglang/test/runners.py CHANGED
@@ -1,17 +1,16 @@
- """
- Copyright 2023-2024 SGLang Team
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
-
-     http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
- """
+ # Copyright 2023-2024 SGLang Team
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ # ==============================================================================
 
  import json
  import multiprocessing as mp
sglang/test/test_utils.py CHANGED
@@ -44,7 +44,7 @@ DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_QUANT_TP1 = "hugging-quants/Meta-Llama-3.1-8
 
  def is_in_ci():
      """Return whether it is in CI runner."""
-     return os.getenv("SGLANG_IS_IN_CI", "false") == "true"
+     return os.getenv("SGLANG_IS_IN_CI", "false").lower() == "true"
 
 
  if is_in_ci():
sglang/version.py CHANGED
@@ -1 +1 @@
- __version__ = "0.3.6"
+ __version__ = "0.3.6.post1"
{sglang-0.3.6.dist-info → sglang-0.3.6.post1.dist-info}/LICENSE CHANGED
@@ -186,7 +186,7 @@
  same "printed page" as the copyright notice for easier
  identification within third-party archives.
 
- Copyright [yyyy] [name of copyright owner]
+ Copyright 2023-2024 SGLang Team
 
  Licensed under the Apache License, Version 2.0 (the "License");
  you may not use this file except in compliance with the License.
{sglang-0.3.6.dist-info → sglang-0.3.6.post1.dist-info}/METADATA CHANGED
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: sglang
- Version: 0.3.6
+ Version: 0.3.6.post1
  Summary: SGLang is yet another fast serving framework for large language models and vision language models.
  License: Apache License
  Version 2.0, January 2004
@@ -190,7 +190,7 @@ License: Apache License
  same "printed page" as the copyright notice for easier
  identification within third-party archives.
 
- Copyright [yyyy] [name of copyright owner]
+ Copyright 2023-2024 SGLang Team
 
  Licensed under the Apache License, Version 2.0 (the "License");
  you may not use this file except in compliance with the License.
@@ -222,6 +222,7 @@ Requires-Dist: fastapi; extra == "runtime-common"
  Requires-Dist: hf_transfer; extra == "runtime-common"
  Requires-Dist: huggingface_hub; extra == "runtime-common"
  Requires-Dist: interegular; extra == "runtime-common"
+ Requires-Dist: modelscope; extra == "runtime-common"
  Requires-Dist: orjson; extra == "runtime-common"
  Requires-Dist: outlines<0.1.0,>=0.0.44; extra == "runtime-common"
  Requires-Dist: packaging; extra == "runtime-common"
@@ -234,7 +235,7 @@ Requires-Dist: pyzmq>=25.1.2; extra == "runtime-common"
  Requires-Dist: torchao; extra == "runtime-common"
  Requires-Dist: uvicorn; extra == "runtime-common"
  Requires-Dist: uvloop; extra == "runtime-common"
- Requires-Dist: modelscope; extra == "runtime-common"
+ Requires-Dist: xgrammar>=0.1.4; extra == "runtime-common"
  Provides-Extra: srt
  Requires-Dist: sglang[runtime_common]; extra == "srt"
  Requires-Dist: torch; extra == "srt"
@@ -245,6 +246,8 @@ Requires-Dist: torch; extra == "srt-hip"
  Requires-Dist: vllm==0.6.3.dev13; extra == "srt-hip"
  Provides-Extra: srt-xpu
  Requires-Dist: sglang[runtime_common]; extra == "srt-xpu"
+ Provides-Extra: srt-hpu
+ Requires-Dist: sglang[runtime_common]; extra == "srt-hpu"
  Provides-Extra: openai
  Requires-Dist: openai>=1.0; extra == "openai"
  Requires-Dist: tiktoken; extra == "openai"
@@ -274,6 +277,11 @@ Requires-Dist: sglang[srt_xpu]; extra == "all-xpu"
  Requires-Dist: sglang[openai]; extra == "all-xpu"
  Requires-Dist: sglang[anthropic]; extra == "all-xpu"
  Requires-Dist: sglang[litellm]; extra == "all-xpu"
+ Provides-Extra: all-hpu
+ Requires-Dist: sglang[srt_hpu]; extra == "all-hpu"
+ Requires-Dist: sglang[openai]; extra == "all-hpu"
+ Requires-Dist: sglang[anthropic]; extra == "all-hpu"
+ Requires-Dist: sglang[litellm]; extra == "all-hpu"
  Provides-Extra: dev
  Requires-Dist: sglang[all]; extra == "dev"
  Requires-Dist: sglang[test]; extra == "dev"
@@ -283,6 +291,9 @@ Requires-Dist: sglang[test]; extra == "dev-hip"
  Provides-Extra: dev-xpu
  Requires-Dist: sglang[all_xpu]; extra == "dev-xpu"
  Requires-Dist: sglang[test]; extra == "dev-xpu"
+ Provides-Extra: dev-hpu
+ Requires-Dist: sglang[all_hpu]; extra == "dev-hpu"
+ Requires-Dist: sglang[test]; extra == "dev-hpu"
 
  <div align="center" id="sglangtop">
  <img src="https://raw.githubusercontent.com/sgl-project/sglang/main/assets/logo.png" alt="logo" width="400" margin="10px"></img>
@@ -321,21 +332,16 @@ SGLang is a fast serving framework for large language models and vision language
  It makes your interaction with models faster and more controllable by co-designing the backend runtime and frontend language.
  The core features include:
 
- - **Fast Backend Runtime**: Provides efficient serving with RadixAttention for prefix caching, jump-forward constrained decoding, continuous batching, token attention (paged attention), tensor parallelism, FlashInfer kernels, chunked prefill, and quantization (INT4/FP8/AWQ/GPTQ).
+ - **Fast Backend Runtime**: Provides efficient serving with RadixAttention for prefix caching, jump-forward constrained decoding, overhead-free CPU scheduler, continuous batching, token attention (paged attention), tensor parallelism, FlashInfer kernels, chunked prefill, and quantization (FP8/INT4/AWQ/GPTQ).
  - **Flexible Frontend Language**: Offers an intuitive interface for programming LLM applications, including chained generation calls, advanced prompting, control flow, multi-modal inputs, parallelism, and external interactions.
  - **Extensive Model Support**: Supports a wide range of generative models (Llama, Gemma, Mistral, QWen, DeepSeek, LLaVA, etc.), embedding models (e5-mistral, gte, mcdse) and reward models (Skywork), with easy extensibility for integrating new models.
  - **Active Community**: SGLang is open-source and backed by an active community with industry adoption.
 
  ## Getting Started
- Install SGLang: See [https://sgl-project.github.io/start/install.html](https://sgl-project.github.io/start/install.html)
-
- Send requests: See [https://sgl-project.github.io/start/send_request.html](https://sgl-project.github.io/start/send_request.html)
-
- ## Backend: SGLang Runtime (SRT)
- See [https://sgl-project.github.io/backend/backend.html](https://sgl-project.github.io/backend/backend.html)
-
- ## Frontend: Structured Generation Language (SGLang)
- See [https://sgl-project.github.io/frontend/frontend.html](https://sgl-project.github.io/frontend/frontend.html)
+ - [Install SGLang](https://sgl-project.github.io/start/install.html)
+ - [Send requests](https://sgl-project.github.io/start/send_request.html)
+ - [Backend: SGLang Runtime (SRT)](https://sgl-project.github.io/backend/backend.html)
+ - [Frontend: Structured Generation Language (SGLang)](https://sgl-project.github.io/frontend/frontend.html)
 
  ## Benchmark And Performance
  Learn more in our release blogs: [v0.2 blog](https://lmsys.org/blog/2024-07-25-sglang-llama3/), [v0.3 blog](https://lmsys.org/blog/2024-09-04-sglang-v0-3/)
@@ -343,6 +349,9 @@ Learn more in our release blogs: [v0.2 blog](https://lmsys.org/blog/2024-07-25-s
  ## Roadmap
  [Development Roadmap (2024 Q4)](https://github.com/sgl-project/sglang/issues/1487)
 
- ## Citation And Acknowledgment
+ ## Adoption and Sponsorship
+ The project is supported by (alphabetically): AMD, Baseten, Etched, Hyperbolic, Jam & Tea Studios, LinkedIn, NVIDIA, RunPod, Stanford, UC Berkeley, and xAI.
+
+ ## Acknowledgment and Citation
+ We learned from the design and reused code from the following projects: [Guidance](https://github.com/guidance-ai/guidance), [vLLM](https://github.com/vllm-project/vllm), [LightLLM](https://github.com/ModelTC/lightllm), [FlashInfer](https://github.com/flashinfer-ai/flashinfer), [Outlines](https://github.com/outlines-dev/outlines), and [LMQL](https://github.com/eth-sri/lmql).
  Please cite our paper, [SGLang: Efficient Execution of Structured Language Model Programs](https://arxiv.org/abs/2312.07104), if you find the project useful.
- We also learned from the design and reused code from the following projects: [Guidance](https://github.com/guidance-ai/guidance), [vLLM](https://github.com/vllm-project/vllm), [LightLLM](https://github.com/ModelTC/lightllm), [FlashInfer](https://github.com/flashinfer-ai/flashinfer), [Outlines](https://github.com/outlines-dev/outlines), and [LMQL](https://github.com/eth-sri/lmql).
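On the dependency side, this METADATA diff adds `xgrammar>=0.1.4` to the `runtime-common` requirement set (the `modelscope` entry is only re-ordered, not newly added) and introduces Habana extras `srt-hpu`, `all-hpu`, and `dev-hpu`. Installing one of the new groups should follow the usual extras syntax, e.g. `pip install "sglang[all_hpu]==0.3.6.post1"`; pip normalizes the underscore and hyphen, so `all_hpu` and `all-hpu` refer to the same extra. The exact command is an illustration rather than something taken from this diff.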
sglang-0.3.6.post1.dist-info/RECORD ADDED
@@ -0,0 +1,164 @@
+ sglang/__init__.py,sha256=3M0oz0ZA8fULhV5LwQ4hxh-MRdHsOJRD1D63C60pdG4,1616
+ sglang/api.py,sha256=NdO6cYnklnEBQBKqQjlqI8-P1EownKQ71t5ibCGhEVo,6953
+ sglang/bench_latency.py,sha256=oZjSAzX7dUiSu-zdz0dkyUPo-qAX_lsXFH1gf03akgI,76
+ sglang/bench_offline_throughput.py,sha256=z6uA6Gxa_nFZa0cOXi7MJDuX82xcqk5WfqBMavd8a-s,10929
+ sglang/bench_one_batch.py,sha256=WxrQUkMcxz5GV8OEHj0ckHgpC76HgO6YxmDvJFRDeyU,15670
+ sglang/bench_one_batch_server.py,sha256=nzeF_bcaXanQuYLBxAvd3OO4fwbKproMcahXdHIVR6w,5920
+ sglang/bench_serving.py,sha256=hI7FjaERyqKBrYtKewDU6E4rSufKxqsUPyUgtWtTKSI,52545
+ sglang/check_env.py,sha256=nR2m0a9WbQmkimJihUx-Lqi7XjN0jyWTCO2vYyA7R2M,5356
+ sglang/global_config.py,sha256=fnT0U9vlHdGaQFKN9tYTnUF4-eVW4HYQURd5zvPtrg0,1286
+ sglang/launch_server.py,sha256=_XIqBcXArYtHTqilOFkYWKZBYXGCMHAxbYOST08LGj0,415
+ sglang/launch_server_llavavid.py,sha256=tGc17S1vUfLwbi1GB26oOdXxTWr7gjlqpTrPnrMRNO8,1007
+ sglang/utils.py,sha256=eCvD3fZCALr-MuyZxJL7HAeeqqpxAxf4LJrf7OiCbco,11547
+ sglang/version.py,sha256=YrfhKDmn6rTAj_qREKEXk2FahHCqSbHd4BNoD7wlIi0,28
+ sglang/lang/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ sglang/lang/chat_template.py,sha256=jprS3-In2FTUoedKwZg-HYvDwU8RTIYntOlf2zoN2sU,14814
+ sglang/lang/choices.py,sha256=-W1DVw9N9ZliVpvmWrzIXG4cswAah8eMQrHWzkS3D8o,6234
+ sglang/lang/compiler.py,sha256=o1C6G3TzhjSlsH-doTPy5oiVehr57dxNTa5oZw5TTAI,7639
+ sglang/lang/interpreter.py,sha256=SBjejhLhTKzNM0HbjtTg5r17WPJ64WFSk6lcM_SCWKs,30717
+ sglang/lang/ir.py,sha256=zpzzAO1YVldhE95Vwz5hU_TQltu-xt8A6rfFr0PuIDA,18410
+ sglang/lang/tracer.py,sha256=borJmlSJOhg1RUndGRnilnR60eEZz2Y9aU7BpftsOxU,8287
+ sglang/lang/backend/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ sglang/lang/backend/anthropic.py,sha256=EXRX7xJgA5KZszX7toSLVnKzFQ5EO0Loj-YjHFtxSxg,2081
+ sglang/lang/backend/base_backend.py,sha256=tdoh9YF3CyekY1BKiX9n7-aA4srDWIuA4RDJLM7q8qg,1985
+ sglang/lang/backend/litellm.py,sha256=ugmL7sfUxkUHVbHtwNzHgdQAEd4UCjNQboFuE3KThcY,2450
+ sglang/lang/backend/openai.py,sha256=qM7eVH_kMxnDd2rpxOH0v76KxtOJFlAwgLgWIKvFGCI,15060
+ sglang/lang/backend/runtime_endpoint.py,sha256=IWbrAKrUkzNOvwV6V9_y6pkTr2SUYEkKBT-3kirgad0,10514
+ sglang/lang/backend/vertexai.py,sha256=O-iBLD-y3vq80UxnrAoJri7bxpgd-_eakZ88Cf8bEGA,4855
+ sglang/srt/conversation.py,sha256=u9zFU8aMYzwHUbQRKU76B_T-jfLlPoxUcWG_nRbDM2I,21201
+ sglang/srt/hf_transformers_utils.py,sha256=sUUCpjbTHuYDMuwOaz00nH5fataXKjliD8gCxXU64sw,6712
+ sglang/srt/mm_utils.py,sha256=1ScBunw_x4W8ebM_AcJ62-1T2mfT8NlMJqdAhkF1lb0,12367
+ sglang/srt/model_parallel.py,sha256=QR-Alqo0sElDXPJ79N1PhUHHKiEHPQn3dyXduMP-SHQ,3664
+ sglang/srt/server.py,sha256=7PSxAUhiS796yQFeiQxiilRhLQ3FpV0wL53CfDgkCIk,30851
+ sglang/srt/server_args.py,sha256=CfmpU6_EDnxJzpJiRx2n6AhOPCtrHPOf-7wEtTF__L0,30834
+ sglang/srt/utils.py,sha256=APZEUancLC0jRI1JMbv7e5bIZy3OEySGyZspxGA60yQ,33509
+ sglang/srt/configs/__init__.py,sha256=_usVIXHQjft4PAJ1Y-yGQOn2QNOv501GYMlQwpGXbns,208
+ sglang/srt/configs/exaone.py,sha256=Duxd4yQoKy8GWEzZD_kCY_OzmN_67CTJL_Kgn0eXk3g,10731
+ sglang/srt/configs/model_config.py,sha256=dQ58mYKN3M5IwldFZkwIb4CCBa6dREb5Om4Kg2kffOE,9565
+ sglang/srt/configs/qwen2vl.py,sha256=AYHuFgJ0bwhWYkD7S6fvP7yJejJnuhy4xp5Q2W-O6ps,4424
+ sglang/srt/constrained/__init__.py,sha256=UWZNVLvOT5ZBX8M36sONgDmnKtkQ0cSfhQD2jO0ATuk,786
+ sglang/srt/constrained/base_grammar_backend.py,sha256=FhVm7PxhXDl0joV9NP5RjKgz7dR1dZvUAQnh0mdtvVY,2353
+ sglang/srt/constrained/outlines_backend.py,sha256=IDpyzXJS-ydRXYOHHzx1bO9VjiMRF8E5knn4CLFwPU8,6447
+ sglang/srt/constrained/outlines_jump_forward.py,sha256=IGg6mThDepugfez0jnQ6HfLSHtiUl_Mq7bsPFppb3DA,6196
+ sglang/srt/constrained/xgrammar_backend.py,sha256=4ZCQgcjWEY2Lg4r2V9sAiYJJblkQ_uVbEnvsjqhR1Pc,4548
+ sglang/srt/layers/activation.py,sha256=EboMjT9HV2tNHQ6rzpojtlkzev1lAFbhQlxMg9hwxBQ,5471
+ sglang/srt/layers/custom_op_util.py,sha256=0vu-yX2wwonmO1L_o5G7SA6C-8XuhDIh9rPDvNeLhoc,922
+ sglang/srt/layers/fused_moe_patch.py,sha256=dxjcBMY_zAqA0pnmy5KDUZZJSd5Q64Xlxhxyb33cdMk,4240
+ sglang/srt/layers/layernorm.py,sha256=nRQ1w1xSUcU-zlqVC61BnGG6otS5W1w9VaSzeXizrx4,4037
+ sglang/srt/layers/linear.py,sha256=EOdlpAf6srqxzvPpxcv10KFJKedNc22CGP1qEvpRbDg,46131
+ sglang/srt/layers/logits_processor.py,sha256=V8fHxeQK8lzUhGD2Xc7MY1Y9qBhzFyh6hqp31RJVefg,12669
+ sglang/srt/layers/pooler.py,sha256=rj2lygvleBnyLCBZ8I11HGMgpfIDsT0l3PIkshJwdu4,1606
+ sglang/srt/layers/radix_attention.py,sha256=C_mK4mfmKlxMRNeKYP9E5R3PRd3eT-OcE_g3mo36dJM,2058
+ sglang/srt/layers/rotary_embedding.py,sha256=29tx3JNR40AoXqBa2cFGBjva9vU2xgFipETlpMaaZas,3985
+ sglang/srt/layers/sampler.py,sha256=zgNwgUx7fozkWsEJFRKDV9SipHBijfpU9pTroNst6Ho,4552
+ sglang/srt/layers/torchao_utils.py,sha256=v0hyr4hLsM42QwOPCdKb-ftRTjVokBZbqvRj4O4C-Nw,3415
+ sglang/srt/layers/vocab_parallel_embedding.py,sha256=RmaZbgXbFnGKX1eGYxlmiko-6JwaJX6seHupUSCtAm8,21583
+ sglang/srt/layers/attention/__init__.py,sha256=EL1o6Q5vLgViN3pOr2A7F6K9FlNEpMdBypFAVMeq_HA,2445
+ sglang/srt/layers/attention/double_sparsity_backend.py,sha256=BlX7uXteQpnoOnKsdBKh8h20zMVMEiibB5F_PkZSlNI,10706
+ sglang/srt/layers/attention/flashinfer_backend.py,sha256=oblYMbmYzK94H3EA9lMhKWaKdi8HLH5NqAiZmjzj4Es,24875
+ sglang/srt/layers/attention/triton_backend.py,sha256=gjxed2cvc2-8QEHkzyTVv6ui7oYOp2b_vgIUQVD1XuM,6538
+ sglang/srt/layers/attention/triton_ops/decode_attention.py,sha256=BE63WhKiutSNkhJLsRwvfsRy-ExvuAv7FZyoWv73ul8,18744
+ sglang/srt/layers/attention/triton_ops/double_sparsity_attention.py,sha256=1pSXfY3EEaM7iRN_uElHnAfsrJMhTFbu9fj8Z0O2PbE,21480
+ sglang/srt/layers/attention/triton_ops/extend_attention.py,sha256=Gfct-0_l-S2ZrP4F-zkzNiFbmd3C3f7uJovacOuDxaA,11472
+ sglang/srt/layers/attention/triton_ops/prefill_attention.py,sha256=lojFXRZMLWkzS2Y8uxaolnQhXaWKG19mCAWaF5KQeiI,6087
+ sglang/srt/layers/fused_moe_grok/__init__.py,sha256=rj_JBzcP--eaaM6LGQ-u580uQvqLisp5JtGBAs1fVYc,80
+ sglang/srt/layers/fused_moe_grok/fused_moe.py,sha256=bxRcjdALxeY3FDnKivGOoNr6Er1kh6CCPtlAp7pjz50,23844
+ sglang/srt/layers/fused_moe_grok/layer.py,sha256=v-o5YHYEU2HIEZwouyuc3UyfNj7YQrEYOO_BXKELU7Y,23453
+ sglang/srt/layers/fused_moe_triton/__init__.py,sha256=PHKFqd2hPOO-g9kSMseg2g76lpg9OGXQDThWU6bt9vs,902
+ sglang/srt/layers/fused_moe_triton/fused_moe.py,sha256=qwfRBOeY5DT48Q6z71Eh9cjFehvs_K6eLIVWNL044Ug,28363
+ sglang/srt/layers/fused_moe_triton/layer.py,sha256=URDkTt8xEqnqpO5tb_3L7JlhlO53VWfqDDNSRYEu-LY,21545
+ sglang/srt/layers/quantization/__init__.py,sha256=f9tCC_9sHjp7JCPvyZIvuoTB4KooIucGA9S2w7ADevw,4849
+ sglang/srt/layers/quantization/base_config.py,sha256=daK9p0aijMszLUm1W4Pc33FK87MdqYK1NoWFKif-j80,4599
+ sglang/srt/lora/lora.py,sha256=KhhO9aKCyFWvJnhI07lZKANIvNjtt882HrTYFNBZMv0,15065
+ sglang/srt/lora/lora_config.py,sha256=a2fTQESlCbG1xLiBYy4ptZ6c0Burcqyg1_6V1XSok-Y,1506
+ sglang/srt/lora/lora_manager.py,sha256=DHiqdl0_4wQ5PxZBZtlCpP14515mDV2_H9tzL3Rdss8,12886
+ sglang/srt/managers/data_parallel_controller.py,sha256=JxRtJJTVn1FU2iD292rLZPftAsR4_8j4d3yF8j0dvBc,8327
+ sglang/srt/managers/detokenizer_manager.py,sha256=nWBn54pz3aQ8tzVvViwwL2k0V4WATi0qw11H0Bzua-Q,7389
+ sglang/srt/managers/image_processor.py,sha256=Pk_dtXzljTkFt7Acsv1RyDzEqvCvjc7BMngxGhtkpDU,13817
+ sglang/srt/managers/io_struct.py,sha256=WLXz-tyn0jR7zNO9feRBXgyjphVa8qR55OoEOUdzoVI,13751
+ sglang/srt/managers/schedule_batch.py,sha256=-5oYdkStPiYjPWl0tCkUVRjTGB7fjA0wIngK-09da7w,43111
+ sglang/srt/managers/schedule_policy.py,sha256=ayFz4iPLIlG8mx5i1glTCAMHJPGpFedMP9UgRtqkNhA,12526
+ sglang/srt/managers/scheduler.py,sha256=8owHPXG6fxZtsCWSJ6K7EOlFDcPxYinZC1DwKMJcEVM,55930
+ sglang/srt/managers/session_controller.py,sha256=jXoPHxMGh8T1iYWIEjSXoPVwaL6NEjv3QtqlsrvPE1c,2355
+ sglang/srt/managers/tokenizer_manager.py,sha256=zYbKEKNuM1B3PXzA7jnDpxew-0rZXSX-7dHmVLWG3e4,26477
+ sglang/srt/managers/tp_worker.py,sha256=1SQJ60iKS9e5vGY555fT1iZ4OtLumXzeWfB08fSWKbk,6176
+ sglang/srt/managers/tp_worker_overlap_thread.py,sha256=7vhPebaOS4JamaS08CGf_hwxnUO7Gy_SXZXEPwNHKoY,7621
+ sglang/srt/mem_cache/base_prefix_cache.py,sha256=qEQwEkG4E5rab2ZoTqcesf5pR_J4nV2jBxIHsBJHtIM,924
+ sglang/srt/mem_cache/chunk_cache.py,sha256=VcCpyrf5FOQ5xoKeOouCI5ZQLkZo_pgY1SPbDDkagGg,2492
+ sglang/srt/mem_cache/flush_cache.py,sha256=GYcxmNXh4hsMpFfNOuCTpKilW7guZwTtAg_usVeM3J0,979
+ sglang/srt/mem_cache/memory_pool.py,sha256=41fjuj_sD0yfJq-sy-X99cc2djBa6w4dy2y47V0WqNU,10934
+ sglang/srt/mem_cache/radix_cache.py,sha256=DzLCO_gYQ7X_C2NJSEHzzMZhb5HzWjKF9wXJQsnzr8M,10427
+ sglang/srt/metrics/collector.py,sha256=ZWoFx_FKN0sNMSZ8RJWUVQ0RFEYhIHxdw0d4TZTluMU,6861
+ sglang/srt/metrics/func_timer.py,sha256=VFyNRrbnKVCwnQsrlLin1lITJfjQpf9m8sGPqL5LIsQ,3438
+ sglang/srt/model_executor/cuda_graph_runner.py,sha256=4hbCtE3gt5kvMNHrnxkE8YPRFcgmVo0Bwz3lgbYZw_E,14805
+ sglang/srt/model_executor/forward_batch_info.py,sha256=n5yk927COTU0klDAkQuwrFzamMygfkHxmDp1I6bJYD8,12612
+ sglang/srt/model_executor/model_runner.py,sha256=AafFWd_EDWbOe0o5etAyutGum5O8_9tO55KRcaAWDW4,29680
+ sglang/srt/models/baichuan.py,sha256=RyvPQvi7wy9VUGvLwG17XttcTp43yRj6c3zNRImBToA,15005
+ sglang/srt/models/chatglm.py,sha256=OikygdK8Mi6F2QPPhAr2E_P4l2V0yWQjDJOdnBAApPE,13216
+ sglang/srt/models/commandr.py,sha256=XkzpfsdDPDx-W5oOac8nFIe39JJZvmv65K5GIpgJTz0,14212
+ sglang/srt/models/dbrx.py,sha256=ucn3UJ1s4nx2qa5hUb8VhJmfVrDZ59e9oNetMU5EWq8,14624
+ sglang/srt/models/deepseek.py,sha256=B5OuW--kDIPfZesOhvGGUhHQNWh0pMPNCYmdsv9lv5U,15922
+ sglang/srt/models/deepseek_v2.py,sha256=shdHVtZGmLEZMZwGlIPz8NPoSb1c_n6hQxWKG45WahE,32265
+ sglang/srt/models/exaone.py,sha256=6LJ1Mr9MbHOXdH_nK9Dba3SR28LMCJvdH1k53w9M9Vg,13081
+ sglang/srt/models/gemma.py,sha256=079CfoQqBnrLIbW0LWcLp-nmb1aPVN1Tw6PxMQQ3Lsk,12289
+ sglang/srt/models/gemma2.py,sha256=lbfQhQpUhf1MAEB_00Uo6rp20k4Hr353UbPKKuMsxec,15020
+ sglang/srt/models/gemma2_reward.py,sha256=cQawatbsfBuWQTueivYHl_17ZoQUHEelI1sr1y5pvfY,2556
+ sglang/srt/models/gpt2.py,sha256=Th7_Dnkw82GFBOuMOTrHtA44JBPHRUtY3Qd73rQwzMc,9741
+ sglang/srt/models/gpt_bigcode.py,sha256=lYo4ajy49VvvPkaduaFtOaCRT_ItqyNUE158S-BI5QA,10136
+ sglang/srt/models/grok.py,sha256=rDIH_SFzauuEHcL_vCOSrYLjdBC3i3o_AcceL3amsJw,14927
+ sglang/srt/models/internlm2.py,sha256=DxbA15d9QR0tLOczpC6DkB8QyNHXJRdZatY6Nskwv1k,12170
+ sglang/srt/models/internlm2_reward.py,sha256=Lr-JA0vfTQJt9q5oDMiopGuoXAevyEv5PAoDe2rsTJk,2425
+ sglang/srt/models/llama.py,sha256=FSGuM3BamhuT5h2jedh5cSFwFYduOJwkAZJJ672awRw,16423
+ sglang/srt/models/llama_classification.py,sha256=c8WZ1ADa3f6s2IJVoP10ouVgeCwv_ndns_qMgLrC6QI,3413
+ sglang/srt/models/llama_embedding.py,sha256=2ex2jrz31osaAd9V8sJeN0qyxmk-L5NgOBkXL1puGhI,3166
+ sglang/srt/models/llama_reward.py,sha256=prhHDPpf1k6tlQtGE6zq5gx0uSZAD3W5v7W28bdgy4U,4619
+ sglang/srt/models/llava.py,sha256=72DnZXIwu78zYqU8YIElq_AaSIFO_icYOPTHXE0_-YQ,24941
+ sglang/srt/models/llavavid.py,sha256=DeWqGSmXgIYGuLyy2ZrxjM9WqbRjueP4chNmXt7Bnus,12221
+ sglang/srt/models/minicpm.py,sha256=KbiTf-kaDAJxSo9Z4IGMTrs9WrYYji1KXO1kA2iy-as,13816
+ sglang/srt/models/minicpm3.py,sha256=C43mTr2Qjccj4sXuTDgzbfZhvCNbsEHNggMRXQ7SrWs,25108
+ sglang/srt/models/mistral.py,sha256=EYifJUUzN2Z2-iL37eJiNZF_DB0H4pa0mKlgYRIxM70,838
+ sglang/srt/models/mixtral.py,sha256=E3d8I7V3Dp1nCEHRbhh-PKBG8UaVK5XOHwl9QyIjcX0,14043
+ sglang/srt/models/mixtral_quant.py,sha256=o-oTG8BGtWuNu-o6muHSarMNBQwrjQowyBFOQhuclZ8,14065
+ sglang/srt/models/mllama.py,sha256=pET1x8wY04yoS8HMCncKx0tFPqGp78K8rlA7Eq7XioE,37889
+ sglang/srt/models/olmo.py,sha256=DEUPNDM0z83N-Qdhkj2WJMtbiz5JNbSBMIjUaYZN9RM,12068
+ sglang/srt/models/olmoe.py,sha256=jVKrjqQQrWLdlkGSGUaMPdT9PHzNH4X-RVwON29eaGw,15412
+ sglang/srt/models/phi3_small.py,sha256=fxqGU0xphJzTeuBW38SRRYpRb2rcsg53JxuObK0pZig,15141
+ sglang/srt/models/qwen.py,sha256=P9zcFnz_Tsz73tVtLRwZ8uWzCtMxWOrzlv2o9Ys_Gck,9947
+ sglang/srt/models/qwen2.py,sha256=ApFFASNwvrkDXi-KkCNA7fTk4uLMuJWoMg15zCaAKdA,12514
+ sglang/srt/models/qwen2_moe.py,sha256=1oxDsKDq3jlHKx9jMi1SfHOqCRVyN5n76uw3M-CUODE,17048
+ sglang/srt/models/qwen2_vl.py,sha256=G3FNa_N2-CzB56LVrukwBtJazxMrDC_GPNjK6Wqxc4s,26415
+ sglang/srt/models/stablelm.py,sha256=jpmsyWMJo_9JapOESnuV7ObNCh78BRznXY0iFvvIbZE,11354
+ sglang/srt/models/torch_native_llama.py,sha256=vNQxsnbVAY1bdyMCCWDZAtWdbaFIiJXhmVxHjk5BB9Y,19400
+ sglang/srt/models/xverse.py,sha256=LGe0ma0wOir3x-OLBT_cRocw8JEo9d3AYNxgA2OcLrk,13659
+ sglang/srt/models/xverse_moe.py,sha256=YqbzkSsnTFt-8-aI8YobF9qJA70qrBjbS1Kjn1KNqVY,15766
+ sglang/srt/models/yivl.py,sha256=yj4aWsOBVGQBLurSrLmYXVC7zGIPH7EYHHtAaAZ7Liw,4859
+ sglang/srt/openai_api/adapter.py,sha256=MhOcWZjcLv4_OuvLvDMcAu6K_u2joJvhaZxaKm0hi3M,53634
+ sglang/srt/openai_api/protocol.py,sha256=vBgrbTqtECsZ5dG0rgP1FHsTBt4eR9zbDX3FBIN-rz4,10172
+ sglang/srt/sampling/sampling_batch_info.py,sha256=YC-KPyDWyLGNPL4YVcst4xwP8Wlz2zcCNJHB_5zljXQ,8470
+ sglang/srt/sampling/sampling_params.py,sha256=n7RbBg_bS5fYhsiWa8uJYnfoXy_i5DvtTBOkuFnHDNU,5286
+ sglang/srt/sampling/penaltylib/__init__.py,sha256=5vQw0Y5DSzmsoFg1IdMIKLwFVhYZ5ArADHVBYbSmOec,513
+ sglang/srt/sampling/penaltylib/orchestrator.py,sha256=J-DEemZcKm1--o37kf3qDOE8SZ_6H3d5oex49Mgq2ZU,10762
+ sglang/srt/sampling/penaltylib/penalizers/frequency_penalty.py,sha256=1Zp2aL6dD60mwD1tCcSG0x5IYo0v4z9ce-q_YwbJ9f8,2490
+ sglang/srt/sampling/penaltylib/penalizers/min_new_tokens.py,sha256=_Nxv0XgUPirZjw2SEJYp_Cd9ZcLwmt7h6JE6J4hhFq4,3629
+ sglang/srt/sampling/penaltylib/penalizers/presence_penalty.py,sha256=5tOgCg7OvE9kSN9VMCpH1hwqo1YMxt9iS5PVpct9HpU,2468
+ sglang/srt/sampling/penaltylib/penalizers/repetition_penalty.py,sha256=m22Rfn1RuB1HpImBDECsiJ2VooBYpsFADAwnk1EPzk0,2751
+ sglang/test/few_shot_gsm8k.py,sha256=7yDbEQe49gZeJhz2wFFX-gf_59ThDKsCS1xwfogNc7k,4034
+ sglang/test/few_shot_gsm8k_engine.py,sha256=QQbrwOX6-cJDD3RZC_e7zPnt6aSo8JdF8X_lRHSjdDM,3886
+ sglang/test/run_eval.py,sha256=9yO0hXZOcn4abEOs96T-XPguDEklK16Ltco0pGF3zCg,4020
+ sglang/test/runners.py,sha256=ANzjrHkT_1E0G3UcD47O8XEKst3Si4AOfx-uErbFS7o,15129
+ sglang/test/simple_eval_common.py,sha256=joqrGysuLnJFtzDRIgFkMsRyKUSyjVPFWp0_PHAL3Ik,12378
+ sglang/test/simple_eval_gpqa.py,sha256=8Xt9Bw05c7SZTYrCZgB68OZUqUbLo69ywiyx0bTvSUk,3220
+ sglang/test/simple_eval_humaneval.py,sha256=zmV3xWYc2OrpiT9Dy55RTKZL5DEROD1cJ0NA_-cU5zI,5685
+ sglang/test/simple_eval_math.py,sha256=6kGKNwNbLN-Af3Wj8WTimWhH-Xp3enDmSvvSjsgWUpk,2550
+ sglang/test/simple_eval_mgsm.py,sha256=rd7TSUyxdKbrXaVoewo24V8lCo_6kO8zxPhhmvylpw8,10259
+ sglang/test/simple_eval_mmlu.py,sha256=FkwamjGMjueTixymkedF-YiPloSLiy4ftILFUrKZ9XI,4357
+ sglang/test/test_activation.py,sha256=jkdNRzJnbd5OgZliQaIXpxovlcky17UrweomcOcMxoE,1442
+ sglang/test/test_layernorm.py,sha256=IacByD5d-stXjzBz8Ypamc7povlcedpKPbb_4JLgo3c,3720
+ sglang/test/test_programs.py,sha256=1Z0umrsUu9pagzyGH5SrXl_qhKSyTfUv_kWC2mcn0qo,18208
+ sglang/test/test_utils.py,sha256=ULF7C3pLXkMevXgE_Dodt29OBfvvXKUnRvwKhaBg1ys,23470
+ sglang/test/srt/sampling/penaltylib/utils.py,sha256=CjxHgywh0hx_87iynzQt_ztHu6zBVuE-YrZ-XPmW6U4,12906
+ sglang-0.3.6.post1.dist-info/LICENSE,sha256=FJXh51fvTQklojUFY89XVLsjxRcBqOxPs8XNy-2uZ0c,11346
+ sglang-0.3.6.post1.dist-info/METADATA,sha256=XwhCEL8SbEVcT7LQLk26g6tzduS6mByBE7dDqZYpQxo,22073
+ sglang-0.3.6.post1.dist-info/WHEEL,sha256=PZUExdf71Ui_so67QXpySuHtCi3-J3wvF4ORK6k_S8U,91
+ sglang-0.3.6.post1.dist-info/top_level.txt,sha256=yxhh3pYQkcnA7v3Bg889C2jZhvtJdEincysO7PEB09M,7
+ sglang-0.3.6.post1.dist-info/RECORD,,
sglang/srt/layers/fused_moe/__init__.py DELETED
@@ -1 +0,0 @@
- from sglang.srt.layers.fused_moe.layer import FusedMoE, FusedMoEMethodBase