sglang 0.3.1.post3__py3-none-any.whl → 0.3.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (92)
  1. sglang/__init__.py +2 -0
  2. sglang/api.py +23 -1
  3. sglang/bench_latency.py +48 -33
  4. sglang/bench_server_latency.py +0 -6
  5. sglang/bench_serving.py +2 -2
  6. sglang/lang/backend/runtime_endpoint.py +14 -1
  7. sglang/lang/interpreter.py +16 -6
  8. sglang/lang/ir.py +20 -4
  9. sglang/srt/configs/model_config.py +11 -9
  10. sglang/srt/constrained/fsm_cache.py +9 -1
  11. sglang/srt/constrained/jump_forward.py +15 -2
  12. sglang/srt/hf_transformers_utils.py +1 -0
  13. sglang/srt/layers/activation.py +4 -4
  14. sglang/srt/layers/attention/__init__.py +49 -0
  15. sglang/srt/layers/attention/flashinfer_backend.py +277 -0
  16. sglang/srt/layers/{flashinfer_utils.py → attention/flashinfer_utils.py} +82 -80
  17. sglang/srt/layers/attention/triton_backend.py +161 -0
  18. sglang/srt/layers/{triton_attention → attention/triton_ops}/extend_attention.py +3 -1
  19. sglang/srt/layers/fused_moe/patch.py +117 -0
  20. sglang/srt/layers/layernorm.py +4 -4
  21. sglang/srt/layers/logits_processor.py +19 -15
  22. sglang/srt/layers/pooler.py +3 -3
  23. sglang/srt/layers/quantization/__init__.py +0 -2
  24. sglang/srt/layers/radix_attention.py +6 -4
  25. sglang/srt/layers/sampler.py +6 -4
  26. sglang/srt/layers/torchao_utils.py +18 -0
  27. sglang/srt/lora/lora.py +20 -21
  28. sglang/srt/lora/lora_manager.py +97 -25
  29. sglang/srt/managers/detokenizer_manager.py +31 -18
  30. sglang/srt/managers/image_processor.py +187 -0
  31. sglang/srt/managers/io_struct.py +99 -75
  32. sglang/srt/managers/schedule_batch.py +187 -68
  33. sglang/srt/managers/{policy_scheduler.py → schedule_policy.py} +31 -21
  34. sglang/srt/managers/scheduler.py +1021 -0
  35. sglang/srt/managers/tokenizer_manager.py +120 -247
  36. sglang/srt/managers/tp_worker.py +28 -925
  37. sglang/srt/mem_cache/memory_pool.py +34 -52
  38. sglang/srt/mem_cache/radix_cache.py +5 -5
  39. sglang/srt/model_executor/cuda_graph_runner.py +25 -25
  40. sglang/srt/model_executor/forward_batch_info.py +94 -97
  41. sglang/srt/model_executor/model_runner.py +76 -78
  42. sglang/srt/models/baichuan.py +10 -10
  43. sglang/srt/models/chatglm.py +12 -12
  44. sglang/srt/models/commandr.py +10 -10
  45. sglang/srt/models/dbrx.py +12 -12
  46. sglang/srt/models/deepseek.py +10 -10
  47. sglang/srt/models/deepseek_v2.py +14 -15
  48. sglang/srt/models/exaone.py +10 -10
  49. sglang/srt/models/gemma.py +10 -10
  50. sglang/srt/models/gemma2.py +11 -11
  51. sglang/srt/models/gpt_bigcode.py +10 -10
  52. sglang/srt/models/grok.py +10 -10
  53. sglang/srt/models/internlm2.py +10 -10
  54. sglang/srt/models/llama.py +22 -10
  55. sglang/srt/models/llama_classification.py +5 -5
  56. sglang/srt/models/llama_embedding.py +4 -4
  57. sglang/srt/models/llama_reward.py +142 -0
  58. sglang/srt/models/llava.py +39 -33
  59. sglang/srt/models/llavavid.py +31 -28
  60. sglang/srt/models/minicpm.py +10 -10
  61. sglang/srt/models/minicpm3.py +14 -15
  62. sglang/srt/models/mixtral.py +10 -10
  63. sglang/srt/models/mixtral_quant.py +10 -10
  64. sglang/srt/models/olmoe.py +10 -10
  65. sglang/srt/models/qwen.py +10 -10
  66. sglang/srt/models/qwen2.py +11 -11
  67. sglang/srt/models/qwen2_moe.py +10 -10
  68. sglang/srt/models/stablelm.py +10 -10
  69. sglang/srt/models/torch_native_llama.py +506 -0
  70. sglang/srt/models/xverse.py +10 -10
  71. sglang/srt/models/xverse_moe.py +10 -10
  72. sglang/srt/openai_api/adapter.py +7 -0
  73. sglang/srt/sampling/sampling_batch_info.py +36 -27
  74. sglang/srt/sampling/sampling_params.py +3 -1
  75. sglang/srt/server.py +170 -119
  76. sglang/srt/server_args.py +54 -27
  77. sglang/srt/utils.py +101 -128
  78. sglang/test/runners.py +76 -33
  79. sglang/test/test_programs.py +38 -5
  80. sglang/test/test_utils.py +53 -9
  81. sglang/version.py +1 -1
  82. {sglang-0.3.1.post3.dist-info → sglang-0.3.3.dist-info}/METADATA +42 -23
  83. sglang-0.3.3.dist-info/RECORD +139 -0
  84. sglang/srt/layers/attention_backend.py +0 -482
  85. sglang/srt/managers/controller_multi.py +0 -207
  86. sglang/srt/managers/controller_single.py +0 -164
  87. sglang-0.3.1.post3.dist-info/RECORD +0 -134
  88. /sglang/srt/layers/{triton_attention → attention/triton_ops}/decode_attention.py +0 -0
  89. /sglang/srt/layers/{triton_attention → attention/triton_ops}/prefill_attention.py +0 -0
  90. {sglang-0.3.1.post3.dist-info → sglang-0.3.3.dist-info}/LICENSE +0 -0
  91. {sglang-0.3.1.post3.dist-info → sglang-0.3.3.dist-info}/WHEEL +0 -0
  92. {sglang-0.3.1.post3.dist-info → sglang-0.3.3.dist-info}/top_level.txt +0 -0
@@ -72,7 +72,7 @@ def test_select(check_answer):
72
72
  statement="The capital of Germany is Berlin.",
73
73
  )
74
74
  if check_answer:
75
- assert ret["answer"] == "True", ret.text
75
+ assert ret["answer"] == "True", ret.text()
76
76
  else:
77
77
  assert ret["answer"] in ["True", "False", "Unknown"]
78
78
 
@@ -80,7 +80,7 @@ def test_select(check_answer):
80
80
  statement="The capital of Canada is Tokyo.",
81
81
  )
82
82
  if check_answer:
83
- assert ret["answer"] == "False", ret.text
83
+ assert ret["answer"] == "False", ret.text()
84
84
  else:
85
85
  assert ret["answer"] in ["True", "False", "Unknown"]
86
86
 
@@ -88,7 +88,7 @@ def test_select(check_answer):
88
88
  statement="Purple is a better color than green.",
89
89
  )
90
90
  if check_answer:
91
- assert ret["answer"] == "Unknown", ret.text
91
+ assert ret["answer"] == "Unknown", ret.text()
92
92
  else:
93
93
  assert ret["answer"] in ["True", "False", "Unknown"]
94
94
 
@@ -100,8 +100,8 @@ def test_decode_int():
100
100
  s += "The number of days in a year is " + sgl.gen_int("days") + "\n"
101
101
 
102
102
  ret = decode_int.run(temperature=0.1)
103
- assert int(ret["hours"]) == 24, ret.text
104
- assert int(ret["days"]) == 365, ret.text
103
+ assert int(ret["hours"]) == 24, ret.text()
104
+ assert int(ret["days"]) == 365, ret.text()
105
105
 
106
106
 
107
107
  def test_decode_json_regex():
@@ -517,3 +517,36 @@ def test_hellaswag_select():
517
517
  accuracy = np.mean(np.array(preds) == np.array(labels))
518
518
 
519
519
  return accuracy, latency
520
+
521
+
522
+ def test_gen_min_new_tokens():
523
+ """
524
+ Validate sgl.gen(min_tokens) functionality.
525
+
526
+ The test asks a question where, without a min_tokens constraint, the generated answer is expected to be short.
527
+ By enforcing the min_tokens parameter, we ensure the generated answer has at least the specified number of tokens.
528
+ We verify that the number of tokens in the answer is >= the min_tokens threshold.
529
+ """
530
+ import sglang as sgl
531
+ from sglang.srt.hf_transformers_utils import get_tokenizer
532
+
533
+ model_path = sgl.global_config.default_backend.endpoint.get_model_name()
534
+ MIN_TOKENS, MAX_TOKENS = 64, 128
535
+
536
+ @sgl.function
537
+ def convo_1(s):
538
+ s += sgl.user("What is the capital of the United States?")
539
+ s += sgl.assistant(
540
+ sgl.gen("answer", min_tokens=MIN_TOKENS, max_tokens=MAX_TOKENS)
541
+ )
542
+
543
+ def assert_min_tokens(tokenizer, text):
544
+ token_ids = tokenizer.encode(text)
545
+ assert (
546
+ len(token_ids) >= MIN_TOKENS
547
+ ), f"Generated {len(token_ids)} tokens, min required: {MIN_TOKENS}. Text: {text}"
548
+
549
+ tokenizer = get_tokenizer(model_path)
550
+
551
+ state = convo_1.run()
552
+ assert_min_tokens(tokenizer, state["answer"])
sglang/test/test_utils.py CHANGED
@@ -23,12 +23,13 @@ from sglang.srt.utils import kill_child_process
23
23
  from sglang.utils import get_exception_traceback
24
24
 
25
25
  DEFAULT_FP8_MODEL_NAME_FOR_TEST = "neuralmagic/Meta-Llama-3.1-8B-FP8"
26
- DEFAULT_MODEL_NAME_FOR_TEST = "meta-llama/Meta-Llama-3.1-8B-Instruct"
26
+ DEFAULT_MODEL_NAME_FOR_TEST = "meta-llama/Llama-3.1-8B-Instruct"
27
27
  DEFAULT_MOE_MODEL_NAME_FOR_TEST = "mistralai/Mixtral-8x7B-Instruct-v0.1"
28
28
  DEFAULT_MLA_MODEL_NAME_FOR_TEST = "deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct"
29
+ DEFAULT_MLA_FP8_MODEL_NAME_FOR_TEST = "neuralmagic/DeepSeek-Coder-V2-Lite-Instruct-FP8"
29
30
  DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH = 600
30
- DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP1 = "meta-llama/Meta-Llama-3.1-8B-Instruct,mistralai/Mistral-7B-Instruct-v0.3,deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct,google/gemma-2-27b-it"
31
- DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP2 = "meta-llama/Meta-Llama-3.1-70B-Instruct,mistralai/Mixtral-8x7B-Instruct-v0.1,Qwen/Qwen2-57B-A14B-Instruct,deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct"
31
+ DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP1 = "meta-llama/Llama-3.1-8B-Instruct,mistralai/Mistral-7B-Instruct-v0.3,deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct,google/gemma-2-27b-it"
32
+ DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP2 = "meta-llama/Llama-3.1-70B-Instruct,mistralai/Mixtral-8x7B-Instruct-v0.1,Qwen/Qwen2-57B-A14B-Instruct,deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct"
32
33
  DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_FP8_TP1 = "neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8,neuralmagic/Mistral-7B-Instruct-v0.3-FP8,neuralmagic/DeepSeek-Coder-V2-Lite-Instruct-FP8,neuralmagic/gemma-2-2b-it-FP8"
33
34
  DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_FP8_TP2 = "neuralmagic/Meta-Llama-3.1-70B-Instruct-FP8,neuralmagic/Mixtral-8x7B-Instruct-v0.1-FP8,neuralmagic/Qwen2-72B-Instruct-FP8,neuralmagic/Qwen2-57B-A14B-Instruct-FP8,neuralmagic/DeepSeek-Coder-V2-Lite-Instruct-FP8"
34
35
  DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_QUANT_TP1 = "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4,hugging-quants/Meta-Llama-3.1-8B-Instruct-GPTQ-INT4"
@@ -84,7 +85,7 @@ def call_generate_vllm(prompt, temperature, max_tokens, stop=None, n=1, url=None
84
85
 
85
86
 
86
87
  def call_generate_outlines(
87
- prompt, temperature, max_tokens, stop=[], regex=None, n=1, url=None
88
+ prompt, temperature, max_tokens, stop=None, regex=None, n=1, url=None
88
89
  ):
89
90
  assert url is not None
90
91
 
@@ -513,7 +514,16 @@ def get_similarities(vec1, vec2):
513
514
  return F.cosine_similarity(torch.tensor(vec1), torch.tensor(vec2), dim=0)
514
515
 
515
516
 
516
- def run_bench_serving(model, num_prompts, request_rate, other_server_args):
517
+ def run_bench_serving(
518
+ model,
519
+ num_prompts,
520
+ request_rate,
521
+ other_server_args,
522
+ dataset_name="random",
523
+ random_input_len=4096,
524
+ random_output_len=2048,
525
+ disable_stream=False,
526
+ ):
517
527
  # Launch the server
518
528
  base_url = DEFAULT_URL_FOR_TEST
519
529
  process = popen_launch_server(
@@ -529,21 +539,21 @@ def run_bench_serving(model, num_prompts, request_rate, other_server_args):
529
539
  base_url=base_url,
530
540
  host=None,
531
541
  port=None,
532
- dataset_name="random",
542
+ dataset_name=dataset_name,
533
543
  dataset_path="",
534
544
  model=None,
535
545
  tokenizer=None,
536
546
  num_prompts=num_prompts,
537
547
  sharegpt_output_len=None,
538
- random_input_len=4096,
539
- random_output_len=2048,
548
+ random_input_len=random_input_len,
549
+ random_output_len=random_output_len,
540
550
  random_range_ratio=0.0,
541
551
  request_rate=request_rate,
542
552
  multi=None,
543
553
  seed=0,
544
554
  output_file=None,
545
555
  disable_tqdm=False,
546
- disable_stream=False,
556
+ disable_stream=disable_stream,
547
557
  disable_ignore_eos=False,
548
558
  extra_request_body=None,
549
559
  )
@@ -587,3 +597,37 @@ def run_bench_latency(model, other_args):
587
597
  kill_child_process(process.pid)
588
598
 
589
599
  return output_throughput
600
+
601
+
602
+ def lcs(X, Y):
603
+ m = len(X)
604
+ n = len(Y)
605
+ L = [[0] * (n + 1) for _ in range(m + 1)]
606
+
607
+ for i in range(m + 1):
608
+ for j in range(n + 1):
609
+ if i == 0 or j == 0:
610
+ L[i][j] = 0
611
+ elif X[i - 1] == Y[j - 1]:
612
+ L[i][j] = L[i - 1][j - 1] + 1
613
+ else:
614
+ L[i][j] = max(L[i - 1][j], L[i][j - 1])
615
+
616
+ return L[m][n]
617
+
618
+
619
+ def calculate_rouge_l(output_strs_list1, output_strs_list2):
620
+ """calculate the ROUGE-L score"""
621
+ rouge_l_scores = []
622
+
623
+ for s1, s2 in zip(output_strs_list1, output_strs_list2):
624
+ lcs_len = lcs(s1, s2)
625
+ precision = lcs_len / len(s1) if len(s1) > 0 else 0
626
+ recall = lcs_len / len(s2) if len(s2) > 0 else 0
627
+ if precision + recall > 0:
628
+ fmeasure = (2 * precision * recall) / (precision + recall)
629
+ else:
630
+ fmeasure = 0.0
631
+ rouge_l_scores.append(fmeasure)
632
+
633
+ return rouge_l_scores
sglang/version.py CHANGED
@@ -1 +1 @@
1
- __version__ = "0.3.1.post3"
1
+ __version__ = "0.3.3"
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: sglang
3
- Version: 0.3.1.post3
3
+ Version: 0.3.3
4
4
  Summary: SGLang is yet another fast serving framework for large language models and vision language models.
5
5
  License: Apache License
6
6
  Version 2.0, January 2004
@@ -248,6 +248,7 @@ Requires-Dist: uvloop; extra == "srt"
248
248
  Requires-Dist: zmq; extra == "srt"
249
249
  Requires-Dist: vllm==0.5.5; extra == "srt"
250
250
  Requires-Dist: outlines>=0.0.44; extra == "srt"
251
+ Requires-Dist: modelscope; extra == "srt"
251
252
  Provides-Extra: test
252
253
  Requires-Dist: jsonlines; extra == "test"
253
254
  Requires-Dist: matplotlib; extra == "test"
@@ -269,16 +270,11 @@ Requires-Dist: peft; extra == "test"
269
270
 
270
271
  --------------------------------------------------------------------------------
271
272
 
272
- | [**Blog**](https://lmsys.org/blog/2024-07-25-sglang-llama3/) | [**Paper**](https://arxiv.org/abs/2312.07104) | [**Join Slack**](https://join.slack.com/t/sgl-fru7574/shared_invite/zt-2ngly9muu-t37XiH87qvD~6rVBTkTEHw) | [**Join Weekly Development Meeting**](https://calendar.app.google/v2Tw3kuHkKYyp8VV7) |
273
+ | [**Blog**](https://lmsys.org/blog/2024-07-25-sglang-llama3/) | [**Paper**](https://arxiv.org/abs/2312.07104) | [**Join Slack**](https://join.slack.com/t/sgl-fru7574/shared_invite/zt-2ngly9muu-t37XiH87qvD~6rVBTkTEHw) | [**Join Bi-Weekly Development Meeting (Oct. 19)**](https://calendar.app.google/GYW7S8QGoanCuaxW6) |
273
274
 
274
- SGLang is a fast serving framework for large language models and vision language models.
275
- It makes your interaction with models faster and more controllable by co-designing the backend runtime and frontend language.
276
- The core features include:
277
-
278
- - **Fast Backend Runtime**: Provides efficient serving with RadixAttention for prefix caching, jump-forward constrained decoding, continuous batching, token attention (paged attention), tensor parallelism, FlashInfer kernels, chunked prefill, and quantization (INT4/FP8/AWQ/GPTQ).
279
- - **Flexible Frontend Language**: Offers an intuitive interface for programming LLM applications, including chained generation calls, advanced prompting, control flow, multi-modal inputs, parallelism, and external interactions.
280
- - **Extensive Model Support**: Supports a wide range of generative models (Llama 3, Gemma 2, Mistral, QWen, DeepSeek, LLaVA, etc.) and embedding models (e5-mistral), with easy extensibility for integrating new models.
281
- - **Active Community**: SGLang is open-source and backed by an active community with industry adoption.
275
+ ## Upcoming Events
276
+ - [Oct. 11, 2024] Invited talks at [AMD Advancing AI](https://www.amd.com/en/corporate/events/advancing-ai.html) Developer Day.
277
+ - [Oct. 16, 2024] Online meetup for efficient LLM deployment and serving, co-hosted by SGLang, FlashInfer, and MLC LLM! Fill out the [Google form](https://forms.gle/B3YeedLxmrrhL1NM8) to receive the invite link.
282
278
 
283
279
  ## News
284
280
  - [2024/09] 🔥 SGLang v0.3 Release: 7x Faster DeepSeek MLA, 1.5x Faster torch.compile, Multi-Image/Video LLaVA-OneVision ([blog](https://lmsys.org/blog/2024-09-04-sglang-v0-3/)).
@@ -294,6 +290,16 @@ The core features include:
294
290
 
295
291
  </details>
296
292
 
293
+ ## About
294
+ SGLang is a fast serving framework for large language models and vision language models.
295
+ It makes your interaction with models faster and more controllable by co-designing the backend runtime and frontend language.
296
+ The core features include:
297
+
298
+ - **Fast Backend Runtime**: Provides efficient serving with RadixAttention for prefix caching, jump-forward constrained decoding, continuous batching, token attention (paged attention), tensor parallelism, FlashInfer kernels, chunked prefill, and quantization (INT4/FP8/AWQ/GPTQ).
299
+ - **Flexible Frontend Language**: Offers an intuitive interface for programming LLM applications, including chained generation calls, advanced prompting, control flow, multi-modal inputs, parallelism, and external interactions.
300
+ - **Extensive Model Support**: Supports a wide range of generative models (Llama, Gemma, Mistral, QWen, DeepSeek, LLaVA, etc.) and embedding models (e5-mistral), with easy extensibility for integrating new models.
301
+ - **Active Community**: SGLang is open-source and backed by an active community with industry adoption.
302
+
297
303
  ## Contents
298
304
  - [Install](#install)
299
305
  - [Backend: SGLang Runtime (SRT)](#backend-sglang-runtime-srt)
@@ -318,7 +324,7 @@ pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/
318
324
  ### Method 2: From source
319
325
  ```
320
326
  # Use the last release branch
321
- git clone -b v0.3.1.post3 https://github.com/sgl-project/sglang.git
327
+ git clone -b v0.3.3 https://github.com/sgl-project/sglang.git
322
328
  cd sglang
323
329
 
324
330
  pip install --upgrade pip
@@ -339,7 +345,7 @@ docker run --gpus all \
339
345
  --env "HF_TOKEN=<secret>" \
340
346
  --ipc=host \
341
347
  lmsysorg/sglang:latest \
342
- python3 -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-8B-Instruct --host 0.0.0.0 --port 30000
348
+ python3 -m sglang.launch_server --model-path meta-llama/Llama-3.1-8B-Instruct --host 0.0.0.0 --port 30000
343
349
  ```
344
350
 
345
351
  ### Method 4: Using docker compose
@@ -348,9 +354,9 @@ docker run --gpus all \
348
354
  <summary>More</summary>
349
355
 
350
356
  > This method is recommended if you plan to serve it as a service.
351
- > A better approach is to use the [k8s-sglang-service.yaml](./docker/k8s-sglang-service.yaml).
357
+ > A better approach is to use the [k8s-sglang-service.yaml](docker/k8s-sglang-service.yaml).
352
358
 
353
- 1. Copy the [compose.yml](./docker/compose.yaml) to your local machine
359
+ 1. Copy the [compose.yaml](docker/compose.yaml) to your local machine
354
360
  2. Execute the command `docker compose up -d` in your terminal.
355
361
  </details>
356
362
 
@@ -379,7 +385,7 @@ resources:
379
385
  run: |
380
386
  conda deactivate
381
387
  python3 -m sglang.launch_server \
382
- --model-path meta-llama/Meta-Llama-3.1-8B-Instruct \
388
+ --model-path meta-llama/Llama-3.1-8B-Instruct \
383
389
  --host 0.0.0.0 \
384
390
  --port 30000
385
391
  ```
@@ -421,7 +427,8 @@ curl http://localhost:30000/generate \
421
427
  }
422
428
  }'
423
429
  ```
424
- Learn more about the argument format [here](docs/en/sampling_params.md).
430
+
431
+ Learn more about the argument specification, streaming, and multi-modal support [here](docs/en/sampling_params.md).
425
432
 
426
433
  ### OpenAI Compatible API
427
434
  In addition, the server supports OpenAI-compatible APIs.
@@ -460,7 +467,7 @@ response = client.embeddings.create(
460
467
  print(response)
461
468
  ```
462
469
 
463
- It supports streaming, vision, and most features of the Chat/Completions/Models/Batch endpoints specified by the [OpenAI API Reference](https://platform.openai.com/docs/api-reference/).
470
+ It supports streaming, vision, and almost all features of the Chat/Completions/Models/Batch endpoints specified by the [OpenAI API Reference](https://platform.openai.com/docs/api-reference/).
464
471
 
465
472
  ### Additional Server Arguments
466
473
  - To enable multi-GPU tensor parallelism, add `--tp 2`. If it reports the error "peer access is not supported between these two devices", add `--enable-p2p-check` to the server launch command.
@@ -481,10 +488,11 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
481
488
  python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --chunked-prefill-size 4096
482
489
  ```
483
490
  - To enable torch.compile acceleration, add `--enable-torch-compile`. It accelerates small models on small batch sizes.
491
+ - To enable torchao quantization, add `--torchao-config int4wo-128`. It supports various quantization strategies.
484
492
  - To enable fp8 weight quantization, add `--quantization fp8` on a fp16 checkpoint or directly load a fp8 checkpoint without specifying any arguments.
485
493
  - To enable fp8 kv cache quantization, add `--kv-cache-dtype fp8_e5m2`.
486
494
  - If the model does not have a chat template in the Hugging Face tokenizer, you can specify a [custom chat template](docs/en/custom_chat_template.md).
487
- - To run tensor parallelism on multiple nodes, add `--nnodes 2`. If you have two nodes with two GPUs on each node and want to run TP=4, let `sgl-dev-0` be the hostname of the first node and `50000` be an available port.
495
+ - To run tensor parallelism on multiple nodes, add `--nnodes 2`. If you have two nodes with two GPUs on each node and want to run TP=4, let `sgl-dev-0` be the hostname of the first node and `50000` be an available port; you can then use the following commands. If you encounter a deadlock, try adding `--disable-cuda-graph`.
488
496
  ```
489
497
  # Node 0
490
498
  python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --tp 4 --nccl-init sgl-dev-0:50000 --nnodes 2 --node-rank 0
@@ -499,9 +507,9 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
499
507
  - Llama / Llama 2 / Llama 3 / Llama 3.1
500
508
  - Mistral / Mixtral / Mistral NeMo
501
509
  - Gemma / Gemma 2
502
- - OLMoE
503
510
  - Qwen / Qwen 2 / Qwen 2 MoE
504
511
  - DeepSeek / DeepSeek 2
512
+ - OLMoE
505
513
  - [LLaVA-OneVision](https://llava-vl.github.io/blog/2024-08-05-llava-onevision/)
506
514
  - `python3 -m sglang.launch_server --model-path lmms-lab/llava-onevision-qwen2-7b-ov --port=30000 --chat-template=chatml-llava`
507
515
  - `python3 -m sglang.launch_server --model-path lmms-lab/llava-onevision-qwen2-72b-ov --port=30000 --tp-size=8 --chat-template=chatml-llava`
@@ -521,7 +529,7 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
521
529
  - BaiChuan2
522
530
  - MiniCPM / MiniCPM 3
523
531
  - XVERSE / XVERSE MoE
524
-
532
+ - SmolLM
525
533
 
526
534
  **Embedding Models**
527
535
 
@@ -529,7 +537,7 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
529
537
  - gte-Qwen2
530
538
  - `python -m sglang.launch_server --model-path Alibaba-NLP/gte-Qwen2-7B-instruct --is-embedding`
531
539
 
532
- Instructions for supporting a new model are [here](https://github.com/sgl-project/sglang/blob/main/docs/en/model_support.md).
540
+ Instructions for supporting a new model are [here](docs/en/model_support.md).
533
541
 
534
542
  #### Use Models From ModelScope
535
543
  <details>
@@ -543,6 +551,17 @@ Launch [Qwen2-7B-Instruct](https://www.modelscope.cn/models/qwen/qwen2-7b-instru
543
551
  ```
544
552
  SGLANG_USE_MODELSCOPE=true python -m sglang.launch_server --model-path qwen/Qwen2-7B-Instruct --port 30000
545
553
  ```
554
+
555
+ Or start it with Docker.
556
+ ```bash
557
+ docker run --gpus all \
558
+ -p 30000:30000 \
559
+ -v ~/.cache/modelscope:/root/.cache/modelscope \
560
+ --env "SGLANG_USE_MODELSCOPE=true" \
561
+ --ipc=host \
562
+ lmsysorg/sglang:latest \
563
+ python3 -m sglang.launch_server --model-path Qwen/Qwen2.5-7B-Instruct --host 0.0.0.0 --port 30000
564
+ ```
546
565
 
547
566
  </details>
548
567
 
@@ -581,7 +600,7 @@ GLOO_SOCKET_IFNAME=eth0 python3 -m sglang.launch_server --model-path meta-llama/
581
600
  The frontend language can be used with local models or API models. It is an alternative to the OpenAI API. You may find it easier to use for complex prompting workflows.
582
601
 
583
602
  ### Quick Start
584
- The example below shows how to use sglang to answer a mulit-turn question.
603
+ The example below shows how to use sglang to answer a multi-turn question.
585
604
 
586
605
  #### Using Local Models
587
606
  First, launch a server with
@@ -824,7 +843,7 @@ def chat_example(s):
824
843
  Learn more at this [blog](https://lmsys.org/blog/2024-07-25-sglang-llama3/).
825
844
 
826
845
  ## Roadmap
827
- [Development Roadmap (2024 Q3)](https://github.com/sgl-project/sglang/issues/634)
846
+ [Development Roadmap (2024 Q4)](https://github.com/sgl-project/sglang/issues/1487)
828
847
 
829
848
  ## Citation And Acknowledgment
830
849
  Please cite our paper, [SGLang: Efficient Execution of Structured Language Model Programs](https://arxiv.org/abs/2312.07104), if you find the project useful.
@@ -0,0 +1,139 @@
1
+ sglang/__init__.py,sha256=b_pqO9bR2fjK9En_tigfzKTiQzE8b_hUizY0DAKVk1M,1616
2
+ sglang/api.py,sha256=5x591S4rLbmNPs75qPwGKVu1sonVGDyjPAJlHTyWw50,6956
3
+ sglang/bench_latency.py,sha256=NkaL4YFWqDnochwaLd8o2pyZGqu6TeURbFB3TGyZHr4,17893
4
+ sglang/bench_server_latency.py,sha256=rRSDqjJ5jan9AzppOGx75KRUjZCU2dUG2h06CQOdJgk,5377
5
+ sglang/bench_serving.py,sha256=1AQzkQ8ci9-rMZEM7wap8I09oPP4AZd93RfXMQRgVro,36386
6
+ sglang/check_env.py,sha256=rGRABCgt-0SfUrow4px28b2P59aMn8eVTnN5eZc_a8s,5397
7
+ sglang/global_config.py,sha256=38id86i3tRGCSOFZlN1LM01a3xt-V98xuNgKGG9boCk,1058
8
+ sglang/launch_server.py,sha256=UnjNjYuZ8TtvmRtgYEsFImkbvCwvn_tQjk0V7cHy67E,450
9
+ sglang/launch_server_llavavid.py,sha256=olPKyhozi1coCwoRMwBRYWsTFByrgus9CwPSeNmskgc,1002
10
+ sglang/utils.py,sha256=NA_4xUrTI7KICQ3PEACfNWKE3nxSA5QvQZJNd4TQrDc,9395
11
+ sglang/version.py,sha256=8KcCYTXH99C2-gCLuPILJvtT9YftRWJsartIx6TQ2ZY,22
12
+ sglang/lang/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
13
+ sglang/lang/chat_template.py,sha256=uqI_I9zIKXGXg7-W-yjqvx1ZeS_TuwFCms6wkmC2QmY,13411
14
+ sglang/lang/choices.py,sha256=-W1DVw9N9ZliVpvmWrzIXG4cswAah8eMQrHWzkS3D8o,6234
15
+ sglang/lang/compiler.py,sha256=o1C6G3TzhjSlsH-doTPy5oiVehr57dxNTa5oZw5TTAI,7639
16
+ sglang/lang/interpreter.py,sha256=zakc6IkzATaMqVDWKWvqDRrqnRykxFawajA7aUHUDbI,30640
17
+ sglang/lang/ir.py,sha256=F_9ac10OjktxR7KhOV07wiJXV20s79cRfh9d4koExJc,18262
18
+ sglang/lang/tracer.py,sha256=borJmlSJOhg1RUndGRnilnR60eEZz2Y9aU7BpftsOxU,8287
19
+ sglang/lang/backend/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
20
+ sglang/lang/backend/anthropic.py,sha256=EXRX7xJgA5KZszX7toSLVnKzFQ5EO0Loj-YjHFtxSxg,2081
21
+ sglang/lang/backend/base_backend.py,sha256=Q5HdiDtyBewQeoYH0kDtBRVL8KFiEPNq9dw7XmauHQ8,1985
22
+ sglang/lang/backend/litellm.py,sha256=ugmL7sfUxkUHVbHtwNzHgdQAEd4UCjNQboFuE3KThcY,2450
23
+ sglang/lang/backend/openai.py,sha256=qM7eVH_kMxnDd2rpxOH0v76KxtOJFlAwgLgWIKvFGCI,15060
24
+ sglang/lang/backend/runtime_endpoint.py,sha256=iVb7SlrpJ1ic92QG5kQUphZUb2EaVWY43dkmAO5pju4,10514
25
+ sglang/lang/backend/vertexai.py,sha256=O-iBLD-y3vq80UxnrAoJri7bxpgd-_eakZ88Cf8bEGA,4855
26
+ sglang/srt/conversation.py,sha256=S5w5V6G1xigNxa3UQoSxRcMpQLWWDT9EPBoHBvHkSAk,19663
27
+ sglang/srt/hf_transformers_utils.py,sha256=rt6flb6BoYTO8fw7AKCXmQLJx5XuSUuRmZX-VJHmuLQ,6064
28
+ sglang/srt/mm_utils.py,sha256=zox644S3IHUWmADdK4MnIbdTS2DWHOy0_Dq0gCU38QQ,12273
29
+ sglang/srt/server.py,sha256=SKV6IxR8w0AmuwgHSEOfag_t-f6hAEq9Xg49iBioi2U,22224
30
+ sglang/srt/server_args.py,sha256=LI8ehxs0sfI0EDhON-OhNGbDx0-oo9QhfnpYjYwnH54,24405
31
+ sglang/srt/utils.py,sha256=amDWXIu1syU-kvdV8bUkNfYaMfpcN22BKZm_2xp59jI,22202
32
+ sglang/srt/configs/__init__.py,sha256=292SuEorST-lAq2Uvsv2M7yC28uYZlssVvRDsF-bZCQ,86
33
+ sglang/srt/configs/exaone.py,sha256=Duxd4yQoKy8GWEzZD_kCY_OzmN_67CTJL_Kgn0eXk3g,10731
34
+ sglang/srt/configs/model_config.py,sha256=36My-o44trhWY3KYDeSFMGvv9XuUtIVI5e7F8VlOTWo,6723
35
+ sglang/srt/constrained/__init__.py,sha256=ze8awDPvwAzdeMwzJ-25kXOQ4nVWoaP55jBDt5UOS_4,2070
36
+ sglang/srt/constrained/base_tool_cache.py,sha256=5sazBMHHDpHMoqOjuY6itCxwTmIFCflIWEDXMtmrPVs,2006
37
+ sglang/srt/constrained/fsm_cache.py,sha256=9GtliIN55Ov8Q9MSFfQC5rKrz3qTsB7Cm5OkhivKngY,3271
38
+ sglang/srt/constrained/jump_forward.py,sha256=o-CzJu3DEs0eFKlLzsQVYMSo4vBKpffs25sXLOJd6jc,6997
39
+ sglang/srt/layers/activation.py,sha256=7VEkCrx2dvl629Lz0fkJcJfVoZA-ykEdkpTzKEc_drQ,5225
40
+ sglang/srt/layers/layernorm.py,sha256=HCj8Y_X6MNNdtQU2sWKgyjIqVERxl9dqrmjbBbyJjpE,3796
41
+ sglang/srt/layers/linear.py,sha256=9rjCiSb_QOn5RgpVjIhEKdReRvSYVfcTSjbWBEbApLI,45173
42
+ sglang/srt/layers/logits_processor.py,sha256=Fq7VHwjP4iSzl_OBLo8qw_HVbIDbYB-0MGmfiD3Jk_E,12521
43
+ sglang/srt/layers/pooler.py,sha256=rj2lygvleBnyLCBZ8I11HGMgpfIDsT0l3PIkshJwdu4,1606
44
+ sglang/srt/layers/radix_attention.py,sha256=i07VRXPDHj-zJ1TSrXEqCxumQwYSHwAvc8DoIg-Irtg,1964
45
+ sglang/srt/layers/sampler.py,sha256=J5vd0CcLpLfgtLniCoe2VF6hjM_ld76hbDG4p1qoAMc,4010
46
+ sglang/srt/layers/torchao_utils.py,sha256=1nzZkSzbF4qCAMeBKAeeDpMl_mK8imiY2RL3xFEgvAw,3340
47
+ sglang/srt/layers/attention/__init__.py,sha256=zLLwinbYLAQHfVEz0jZiVa_cYNgSYoy4wYD_0y-ErHQ,1798
48
+ sglang/srt/layers/attention/flashinfer_backend.py,sha256=DOvm-d3XLjE6XJDD3a8aCnlpuAJZZ946YFDH_Ec4lqc,10150
49
+ sglang/srt/layers/attention/flashinfer_utils.py,sha256=9YMt7ab6F0gEVkxdVm8vDB0LVBRYRL0XIKVrmndp4n8,7571
50
+ sglang/srt/layers/attention/triton_backend.py,sha256=I_kw0LXdgziHAFC8Qv5n5PDFJRLvZyzVsXwjmFZ0KSc,6041
51
+ sglang/srt/layers/attention/triton_ops/decode_attention.py,sha256=XCQTX0kUttT1AG5FRMgfQbiXgvoempYD0UR2r6D_vJg,16711
52
+ sglang/srt/layers/attention/triton_ops/extend_attention.py,sha256=oyqon1KG5-ICHcCANAbrglXLYKvWHFML-4tIQI9M5VI,11063
53
+ sglang/srt/layers/attention/triton_ops/prefill_attention.py,sha256=QkXPcT02c13zha2M4mBm2S5dh_sS-Gc4FkkrcywRqvc,5377
54
+ sglang/srt/layers/fused_moe/__init__.py,sha256=bWCrDdOy2ANEXTb8CHYO63O3Iu3eZnn0PJbgl0z5vvE,75
55
+ sglang/srt/layers/fused_moe/fused_moe.py,sha256=1WM2cObWXcFWtqh_utGJFPnrT344rORwuQ9hJDaH2s0,23104
56
+ sglang/srt/layers/fused_moe/layer.py,sha256=raFyvPzjYz-Fv8B3IcOxQYKKCWqXis5mXwg1GFE61y4,22243
57
+ sglang/srt/layers/fused_moe/patch.py,sha256=B9cDtHqHfnWE0QqZAffvUi6cVRKcMBMKDGJWGIaKh3U,3898
58
+ sglang/srt/layers/quantization/__init__.py,sha256=QilMNqgu3eOFUkEjXLSDa1NvoNdi_CAvC8a1hprOgN8,2979
59
+ sglang/srt/layers/quantization/base_config.py,sha256=vlpSPvSrFmUe65ETg4SoPocQ9bVNY6As3QuHdr_3Dr4,4023
60
+ sglang/srt/lora/lora.py,sha256=a5j_Yy0s95msVPFgOuH5PCe7sMu0AyZFQ5wL0H-YIg8,14913
61
+ sglang/srt/lora/lora_config.py,sha256=paVB7F7SIuxr_vodvKf8zzAlH2fdVYHhXxcXV62D0Vo,1411
62
+ sglang/srt/lora/lora_manager.py,sha256=gzBwYXZEPYj56PkGTshTbWRfl_370wb6uTcRhDaLiF8,12801
63
+ sglang/srt/managers/detokenizer_manager.py,sha256=iCLPdHkL6lAp_-Qew1u4Tyt3jYRkJ8i-Bj3l8TC-uaA,7278
64
+ sglang/srt/managers/image_processor.py,sha256=9Y9RqyLdbt4uOK7pnJCJIhY77791klskSrEg8U6pyS4,6910
65
+ sglang/srt/managers/io_struct.py,sha256=rPyQk5y-jJu4eyoqUVh4M8B14PifjkE8B3K5yI0NX24,12185
66
+ sglang/srt/managers/schedule_batch.py,sha256=mqdMg1QB6PNLbBjxkXoP_Ld82R1w34g_13YH82DGMh8,31216
67
+ sglang/srt/managers/schedule_policy.py,sha256=PiTKvsAFwoNWNsv_SFkghIHCL452MdboRc2cmN6ITcU,11935
68
+ sglang/srt/managers/scheduler.py,sha256=N9GQnp2SXd8-uN49KmQO-144N27M6h3dxRZuFZ-9AmY,39132
69
+ sglang/srt/managers/tokenizer_manager.py,sha256=BAvLW_cRtIgjL0_cwrvDAb7g740fgEddyqaT3JtofR4,24548
70
+ sglang/srt/managers/tp_worker.py,sha256=fcaW-u7AAX49kQCNn_AEtdRPykRdT6Z6lx1O9LHA15E,4833
71
+ sglang/srt/mem_cache/base_prefix_cache.py,sha256=qEQwEkG4E5rab2ZoTqcesf5pR_J4nV2jBxIHsBJHtIM,924
72
+ sglang/srt/mem_cache/chunk_cache.py,sha256=CjZZYlqQzq7mYOiBMLWA5XNb6HIyh5lIMdY-K0OUZEc,2368
73
+ sglang/srt/mem_cache/flush_cache.py,sha256=pTLKPRB17U6vl5RFJJvuJ4jCL2SyomgkUBNlkDpGRqo,978
74
+ sglang/srt/mem_cache/memory_pool.py,sha256=L-5drUt7vlyvple4OcjH1jJRzt2qhVrpc9klZn-bQfE,7125
75
+ sglang/srt/mem_cache/radix_cache.py,sha256=00bghOihUm7lA1i4gxxMYQLept9LaHg2ZSXZryuFZZI,10121
76
+ sglang/srt/model_executor/cuda_graph_runner.py,sha256=iheZYErwFT_W4kJUE1dgbGoQQx7hyOSKa-Yv8guq0DI,10479
77
+ sglang/srt/model_executor/forward_batch_info.py,sha256=FIQ8XIIP724mIL2l7w7mSEFH452qw-TPpqm43J4YeHM,5822
78
+ sglang/srt/model_executor/model_runner.py,sha256=KyglHFIMb5TC-NszN2D85_k7oVQLhbwhUYa7u3RFkoc,22874
79
+ sglang/srt/models/baichuan.py,sha256=50m43kIVo-YamHFwxyiLGG_pCbF7mzUJfhEyuuSmVC8,15100
80
+ sglang/srt/models/chatglm.py,sha256=XaS_6-ZvRw7X-56sk9xQogqT0NzGEMVpiAdQnC5qbBY,13333
81
+ sglang/srt/models/commandr.py,sha256=2urK7u2FiwPBl60hMmt-wfaJ8V-ilv6l1B37MUlvSxk,14121
82
+ sglang/srt/models/dbrx.py,sha256=qTpyA1Iv56VI-ksPKt4JryX2Pn7T5FXAa0n0ZoT4qbw,14615
83
+ sglang/srt/models/deepseek.py,sha256=4sl4YYoxqe-vif7KJKcMjMA3KgvzYHqpQBgM58lzLHc,15973
84
+ sglang/srt/models/deepseek_v2.py,sha256=dt0FGAgW3jd7OJJnKfH-LIU13U0I9b7R9shYmAEins4,28390
85
+ sglang/srt/models/exaone.py,sha256=9JfFhYbpcHMXIaBNn8rc_GOlkItkIgbGNslNyFD7gvU,13054
86
+ sglang/srt/models/gemma.py,sha256=gui46inEJsrmppEMTUIQuzMxGPEBx_TjiZ5-PacjuSk,12240
87
+ sglang/srt/models/gemma2.py,sha256=V0GjEdTqxyXvBqjgyiyONipohjOqw0pLITmZZRb2kIE,14890
88
+ sglang/srt/models/gpt_bigcode.py,sha256=LgSm-8oxBfnzMAC4Jqqg-RJGge4E_wgJ1br7ylbTPZ0,10162
89
+ sglang/srt/models/grok.py,sha256=lUR_SmD_KhIiZx5OVUPZp8VVdrAga6WWTdMKJ5PCFbw,14896
90
+ sglang/srt/models/internlm2.py,sha256=4SUaeJl2dZlUowahfv7kLbz3jLXtmvdBPGURmhAeX6Q,12169
91
+ sglang/srt/models/llama.py,sha256=5j66LmvFhOKgFZiE75mJ80XBjZ2dNx7e8Yea5lsD0P0,15828
92
+ sglang/srt/models/llama_classification.py,sha256=Yhabu9FuBxjNo74crMsK0FqpD53ehOx_zcHgIXjvlvQ,3379
93
+ sglang/srt/models/llama_embedding.py,sha256=4j3WNLB-x7XQnJvohdRs7VSSEabbhiE2BRHmnG5IZRU,3453
94
+ sglang/srt/models/llama_reward.py,sha256=qQOPfn-9oqhsD0EaffXtk-EXKRdSZL1X7CYAGCDoG9A,5383
95
+ sglang/srt/models/llava.py,sha256=zbJs1P4_Bjh2_dSbyoheJZ1wGXuKHGz6BpV766G7ZUY,25094
96
+ sglang/srt/models/llavavid.py,sha256=qhBGHTxzGAOMgqMiwOc3mUbaK6qeXsEYSlNmlEEIdeM,12198
97
+ sglang/srt/models/minicpm.py,sha256=5vc-Lq7ggHrRxxkciVMdZ5Vq6ThLwnhFS62UCokFC2g,13792
98
+ sglang/srt/models/minicpm3.py,sha256=hhhgZTKQApUZpH_MYQZTk3K1Ox-xpJRxGCemoUw8x4U,25184
99
+ sglang/srt/models/mistral.py,sha256=tiYoKjyYVzlQl52QUZ33odD2yCxj9dxcqln474VuZOw,744
100
+ sglang/srt/models/mixtral.py,sha256=BonqX_rSB_UuBDQe3uy8-NOxB4Q4s2mTxTQItvFB9ZQ,13864
101
+ sglang/srt/models/mixtral_quant.py,sha256=SAHBIiD5O1TnojCpqTLcPy3TEvfSCKeOe3GC47fdFSg,14039
102
+ sglang/srt/models/olmoe.py,sha256=ghhNpZe4SzaZEpw0APYBbAmLb3LBagRC2N724RkOkH4,15312
103
+ sglang/srt/models/qwen.py,sha256=IrOKHS7b4SL2fnJegq811eeHnAQDya2PujIgKQ9URVY,9921
104
+ sglang/srt/models/qwen2.py,sha256=B7hXnW5uYPmpMgSN7tI3tTvMEmmQLpddsw_iNTiaHJI,12398
105
+ sglang/srt/models/qwen2_moe.py,sha256=MK-9W6FJhXoQYayg_jpXjKKq4n5j3s2b2ZaoCBfVJ2I,17120
106
+ sglang/srt/models/stablelm.py,sha256=ldtlRG1XGdYcjwqb48dpMTfbdh8KHUjcWrrUYNJ0MEk,11326
107
+ sglang/srt/models/torch_native_llama.py,sha256=c5GJ_k9zbSOk0PjLCXAK8YebGEy0RUVYZ9_h6_19A3M,19215
108
+ sglang/srt/models/xverse.py,sha256=i11wEKqqVCoVtH7yo9jfpNyGHxhw7NvTPid3ojmg79s,13634
109
+ sglang/srt/models/xverse_moe.py,sha256=JwkBhsyusP7e_hAMnomkP8cEmKNCLJPRtwaTERQ0D0M,15818
110
+ sglang/srt/models/yivl.py,sha256=N3noJ5M-FiZS-E_zfaJs4prQOu_ineRt11MWloYgOR8,4826
111
+ sglang/srt/openai_api/adapter.py,sha256=ULX1lo23r6semogKcbUOXGSgPJi8NJ7IuC0WVvEbVbs,51458
112
+ sglang/srt/openai_api/protocol.py,sha256=rdSwUAoO5-KLemJOE50xwSUagxY4T1QIiNyCYsTtCi0,9868
113
+ sglang/srt/sampling/sampling_batch_info.py,sha256=mtE_kLC6U-X6Q20BVjPWyDOoGc4kcTdIPpcsNeZcRYo,6462
114
+ sglang/srt/sampling/sampling_params.py,sha256=Xwh4_M6PP4SWyGV-zNyIhp4XbRKbeU4251ao8UOlZlI,5704
115
+ sglang/srt/sampling/penaltylib/__init__.py,sha256=5vQw0Y5DSzmsoFg1IdMIKLwFVhYZ5ArADHVBYbSmOec,513
116
+ sglang/srt/sampling/penaltylib/orchestrator.py,sha256=WkTNeDhj9H9rtp2ZZeX6MS2sdKSGlLboE6FcuKrwUo0,10815
117
+ sglang/srt/sampling/penaltylib/penalizers/frequency_penalty.py,sha256=IvYioX53Vq_ji-0Zhcz_r5mUa3T3GaIydVS6K4FhWfE,2557
118
+ sglang/srt/sampling/penaltylib/penalizers/min_new_tokens.py,sha256=XJZP0C4NFyXgcODbIWXxrgVEjmRgqLdZuVAtoN-LveY,3565
119
+ sglang/srt/sampling/penaltylib/penalizers/presence_penalty.py,sha256=0PlANTrR959foTA3Nj5qBE7ndaOZgG-9X6LhzlmEUc8,2533
120
+ sglang/srt/sampling/penaltylib/penalizers/repetition_penalty.py,sha256=v9jOgA0-I31WcrhIydiFbpy2ZJPLytFLGM98NRPd2sU,2820
121
+ sglang/test/few_shot_gsm8k.py,sha256=To7Sdg-DLF8poIQLwiOBYKbkz-1C_gn6H79vIbyPR-o,3860
122
+ sglang/test/run_eval.py,sha256=NWxeLWmInBgkCvC9Jr_QzF7GfAiBve3Gf1JQrEOlNlU,3899
123
+ sglang/test/runners.py,sha256=VCmtH08FsAq_JTAKfKo0zB4o-osNMAxxwe4aKcSxr4c,13515
124
+ sglang/test/simple_eval_common.py,sha256=r0G-9QLycs2ax3RMc44T_61fzMxlpTzv6pececC7lyY,12379
125
+ sglang/test/simple_eval_gpqa.py,sha256=8Xt9Bw05c7SZTYrCZgB68OZUqUbLo69ywiyx0bTvSUk,3220
126
+ sglang/test/simple_eval_humaneval.py,sha256=7lTi841NT58smNOtRwCedrdX9IWWypdLkOtaQOBy-GI,5687
127
+ sglang/test/simple_eval_math.py,sha256=6kGKNwNbLN-Af3Wj8WTimWhH-Xp3enDmSvvSjsgWUpk,2550
128
+ sglang/test/simple_eval_mgsm.py,sha256=wfbqJW9Rkc66vzq2fEMF6jchmoA8mw1OUiGU55cZ2B0,10261
129
+ sglang/test/simple_eval_mmlu.py,sha256=FkwamjGMjueTixymkedF-YiPloSLiy4ftILFUrKZ9XI,4357
130
+ sglang/test/test_activation.py,sha256=jkdNRzJnbd5OgZliQaIXpxovlcky17UrweomcOcMxoE,1442
131
+ sglang/test/test_layernorm.py,sha256=IacByD5d-stXjzBz8Ypamc7povlcedpKPbb_4JLgo3c,3720
132
+ sglang/test/test_programs.py,sha256=1Z0umrsUu9pagzyGH5SrXl_qhKSyTfUv_kWC2mcn0qo,18208
133
+ sglang/test/test_utils.py,sha256=NkJuezjmonjgC3_i_CTBd8KSqWh6W9CLcgoaqvTNK2U,18684
134
+ sglang/test/srt/sampling/penaltylib/utils.py,sha256=-0p0rV-P4lNo7xAe3rQSBHTubc50a-DFyOQmLGAkgkQ,12515
135
+ sglang-0.3.3.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
136
+ sglang-0.3.3.dist-info/METADATA,sha256=zeY2pmiGPJb52zaHqiRHY4OcZqAHPvG_zPyve5KfANc,39063
137
+ sglang-0.3.3.dist-info/WHEEL,sha256=GV9aMThwP_4oNCtvEC2ec3qUYutgWeAzklro_0m4WJQ,91
138
+ sglang-0.3.3.dist-info/top_level.txt,sha256=yxhh3pYQkcnA7v3Bg889C2jZhvtJdEincysO7PEB09M,7
139
+ sglang-0.3.3.dist-info/RECORD,,