sglang 0.4.1.post7__py3-none-any.whl → 0.4.2__py3-none-any.whl

This diff shows the content changes between two publicly released versions of the package, as published to one of the supported registries. It is provided for informational purposes only and reflects the packages exactly as they appear in their public registries.
Files changed (67)
  1. sglang/bench_offline_throughput.py +17 -11
  2. sglang/bench_one_batch.py +14 -6
  3. sglang/bench_serving.py +47 -44
  4. sglang/lang/chat_template.py +31 -0
  5. sglang/srt/configs/load_config.py +1 -0
  6. sglang/srt/distributed/device_communicators/custom_all_reduce.py +5 -2
  7. sglang/srt/entrypoints/engine.py +5 -2
  8. sglang/srt/entrypoints/http_server.py +24 -0
  9. sglang/srt/function_call_parser.py +494 -0
  10. sglang/srt/layers/activation.py +5 -5
  11. sglang/srt/layers/dp_attention.py +3 -1
  12. sglang/srt/layers/layernorm.py +5 -5
  13. sglang/srt/layers/linear.py +24 -9
  14. sglang/srt/layers/logits_processor.py +1 -1
  15. sglang/srt/layers/moe/ep_moe/layer.py +20 -12
  16. sglang/srt/layers/moe/fused_moe_native.py +17 -3
  17. sglang/srt/layers/moe/fused_moe_triton/configs/E=256,N=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  18. sglang/srt/layers/moe/fused_moe_triton/fused_moe.py +18 -1
  19. sglang/srt/layers/moe/fused_moe_triton/layer.py +9 -0
  20. sglang/srt/layers/parameter.py +16 -7
  21. sglang/srt/layers/quantization/configs/N=1536,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  22. sglang/srt/layers/quantization/configs/N=3072,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  23. sglang/srt/layers/quantization/configs/N=4096,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  24. sglang/srt/layers/quantization/configs/N=4608,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  25. sglang/srt/layers/quantization/configs/N=512,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  26. sglang/srt/layers/quantization/configs/N=576,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  27. sglang/srt/layers/quantization/configs/N=7168,K=2048,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  28. sglang/srt/layers/quantization/configs/N=7168,K=2304,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  29. sglang/srt/layers/quantization/configs/N=7168,K=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128, 128].json +164 -0
  30. sglang/srt/layers/quantization/fp8.py +4 -1
  31. sglang/srt/layers/rotary_embedding.py +6 -1
  32. sglang/srt/layers/sampler.py +28 -8
  33. sglang/srt/layers/torchao_utils.py +12 -6
  34. sglang/srt/managers/detokenizer_manager.py +1 -0
  35. sglang/srt/managers/io_struct.py +36 -5
  36. sglang/srt/managers/schedule_batch.py +31 -25
  37. sglang/srt/managers/scheduler.py +61 -35
  38. sglang/srt/managers/tokenizer_manager.py +4 -0
  39. sglang/srt/model_executor/cuda_graph_runner.py +23 -25
  40. sglang/srt/model_executor/forward_batch_info.py +5 -7
  41. sglang/srt/model_executor/model_runner.py +7 -4
  42. sglang/srt/model_loader/loader.py +75 -0
  43. sglang/srt/model_loader/weight_utils.py +91 -5
  44. sglang/srt/models/commandr.py +14 -2
  45. sglang/srt/models/dbrx.py +9 -1
  46. sglang/srt/models/deepseek_v2.py +3 -3
  47. sglang/srt/models/gemma2.py +9 -1
  48. sglang/srt/models/grok.py +1 -0
  49. sglang/srt/models/minicpm3.py +3 -3
  50. sglang/srt/models/torch_native_llama.py +17 -4
  51. sglang/srt/openai_api/adapter.py +139 -37
  52. sglang/srt/openai_api/protocol.py +5 -4
  53. sglang/srt/sampling/penaltylib/penalizers/repetition_penalty.py +11 -14
  54. sglang/srt/sampling/sampling_batch_info.py +4 -14
  55. sglang/srt/server.py +2 -2
  56. sglang/srt/server_args.py +20 -1
  57. sglang/srt/speculative/eagle_utils.py +37 -15
  58. sglang/srt/speculative/eagle_worker.py +11 -13
  59. sglang/srt/utils.py +62 -65
  60. sglang/test/test_programs.py +1 -0
  61. sglang/test/test_utils.py +81 -22
  62. sglang/version.py +1 -1
  63. {sglang-0.4.1.post7.dist-info → sglang-0.4.2.dist-info}/METADATA +7 -7
  64. {sglang-0.4.1.post7.dist-info → sglang-0.4.2.dist-info}/RECORD +67 -56
  65. {sglang-0.4.1.post7.dist-info → sglang-0.4.2.dist-info}/LICENSE +0 -0
  66. {sglang-0.4.1.post7.dist-info → sglang-0.4.2.dist-info}/WHEEL +0 -0
  67. {sglang-0.4.1.post7.dist-info → sglang-0.4.2.dist-info}/top_level.txt +0 -0
sglang/srt/utils.py CHANGED
@@ -14,6 +14,7 @@
 """Common utilities."""
 
 import base64
+import ctypes
 import dataclasses
 import io
 import ipaddress
@@ -29,6 +30,7 @@ import shutil
 import signal
 import socket
 import subprocess
+import sys
 import tempfile
 import time
 import warnings
@@ -59,7 +61,6 @@ from triton.runtime.cache import (
     default_dump_dir,
     default_override_dir,
 )
-from uvicorn.config import LOGGING_CONFIG
 
 logger = logging.getLogger(__name__)
 
@@ -73,7 +74,7 @@ def is_hip() -> bool:
 
 
 def is_cuda():
-    return hasattr(torch, "cuda") and torch.cuda.is_available()
+    return hasattr(torch, "cuda") and torch.version.cuda is not None
 
 
 def is_cuda_alike():
@@ -773,7 +774,7 @@ def get_zmq_socket(
 
 
 def dump_to_file(dirpath, name, value):
-    from vllm.distributed import get_tensor_model_parallel_rank
+    from sglang.srt.distributed import get_tensor_model_parallel_rank
 
     if get_tensor_model_parallel_rank() != 0:
         return
@@ -1242,68 +1243,6 @@ def dataclass_to_string_truncated(data, max_length=2048):
     return str(data)
 
 
-TOOLS_TAG_LIST = ["<|plugin|>", "<function=", "<tool_call>", "<|python_tag|>"]
-
-
-def parse_tool_response(text, tools, **kwargs):
-    """Parse model response containing tool information.
-
-    Args:
-        text(str): model response in string format
-        tools(List): tools from user request
-    """
-    if "<|plugin|>" in text:  # internlm2
-        text, action = text.split("<|action_start|><|plugin|>")
-        action = action.split("<|action_end|>".strip())[0]
-        action = action[action.find("{") :]
-        action = json.loads(action)
-        name, parameters = action["name"], json.dumps(
-            action.get("parameters", action.get("arguments", {})), ensure_ascii=False
-        )
-        call_info_list = [(name, parameters)]
-    elif "<function=" in text:  # llama3.1
-        action, _ = text.split("</function>")
-        parameters = action[action.find("{") :]
-        name = action.split("<function=")[1].split(">{")[0]
-        call_info_list = [(name, parameters)]
-    elif "<tool_call>" in text and "</tool_call>" in text:  # qwen2.5
-        # get tool_call in text
-        pattern = r"<tool_call>(.*?)</tool_call>"
-        match_result_list = re.findall(pattern, text, re.DOTALL)
-        call_info_list = []
-        for match_result in match_result_list:
-            action = json.loads(match_result)
-            call_info_list.append(
-                (action["name"], json.dumps(action["arguments"], ensure_ascii=False))
-            )
-        # get text outside of tags
-        if not text.startswith("<tool_call>"):
-            text = text[: text.find("<tool_call>")]
-        elif not text.endswith("</tool_call>"):
-            text = text[text.rfind("</tool_call>") + len("</tool_call>") :]
-        else:
-            text = ""
-    elif "<|python_tag|>" in text:  # llama3.2
-        _, action = text.split("<|python_tag|>")
-        action = json.loads(action)
-        name, parameters = action["name"], json.dumps(
-            action.get("parameters", action.get("arguments", {})), ensure_ascii=False
-        )
-        call_info_list = [(name, parameters)]
-    else:
-        raise RuntimeError(f"Unexpected model response: {text}")
-
-    call_info_list = [
-        (
-            [tool.function.name for tool in tools].index(call_info[0]),
-            call_info[0],
-            call_info[1],
-        )
-        for call_info in call_info_list
-    ]
-    return text, call_info_list
-
-
 def permute_weight(x: torch.Tensor) -> torch.Tensor:
     b_ = x.shape[0]
     n_ = x.shape[1]
@@ -1366,7 +1305,33 @@ def nullable_str(val: str):
     return val
 
 
+def pyspy_dump_schedulers():
+    """py-spy dump on all scheduler in a local node."""
+    try:
+        pid = psutil.Process().pid
+        # Command to run py-spy with the PID
+        cmd = f"py-spy dump --pid {pid}"
+        result = subprocess.run(
+            cmd, shell=True, capture_output=True, text=True, check=True
+        )
+        logger.info(f"Profile for PID {pid}:\n{result.stdout}")
+    except subprocess.CalledProcessError as e:
+        logger.info(f"Failed to profile PID {pid}. Error: {e.stderr}")
+
+
+def kill_itself_when_parent_died():
+    if sys.platform == "linux":
+        # sigkill this process when parent worker manager dies
+        PR_SET_PDEATHSIG = 1
+        libc = ctypes.CDLL("libc.so.6")
+        libc.prctl(PR_SET_PDEATHSIG, signal.SIGKILL)
+    else:
+        logger.warninig("kill_itself_when_parent_died is only supported in linux.")
+
+
 def set_uvicorn_logging_configs():
+    from uvicorn.config import LOGGING_CONFIG
+
     LOGGING_CONFIG["formatters"]["default"][
         "fmt"
     ] = "[%(asctime)s] %(levelprefix)s %(message)s"
@@ -1442,3 +1407,35 @@ def is_valid_ipv6_address(address: str) -> bool:
         return True
     except ValueError:
         return False
+
+
+def rank0_print(msg: str):
+    from sglang.srt.distributed import get_tensor_model_parallel_rank
+
+    if get_tensor_model_parallel_rank() == 0:
+        print(msg, flush=True)
+
+
+def launch_dummy_health_check_server(host, port):
+    import uvicorn
+    from fastapi import FastAPI, Response
+
+    app = FastAPI()
+
+    @app.get("/health")
+    async def health():
+        """Check the health of the http server."""
+        return Response(status_code=200)
+
+    @app.get("/health_generate")
+    async def health_generate():
+        """Check the health of the http server."""
+        return Response(status_code=200)
+
+    uvicorn.run(
+        app,
+        host=host,
+        port=port,
+        timeout_keep_alive=5,
+        loop="uvloop",
+    )
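
Note (not part of the diff): the two new process-management helpers above appear intended for spawned worker processes. Below is a minimal sketch of that usage, assuming an installed sglang 0.4.2 where both functions are importable from sglang.srt.utils; the entrypoint name, host, and port are hypothetical placeholders.

import multiprocessing as mp


def scheduler_entrypoint(host: str, port: int):
    from sglang.srt.utils import (
        kill_itself_when_parent_died,
        launch_dummy_health_check_server,
    )

    # Linux-only: via prctl(PR_SET_PDEATHSIG), ask the kernel to SIGKILL this
    # child if the launching process dies, so workers never outlive the parent.
    kill_itself_when_parent_died()

    # Blocks forever, answering GET /health and /health_generate with HTTP 200.
    launch_dummy_health_check_server(host, port)


if __name__ == "__main__":
    p = mp.Process(target=scheduler_entrypoint, args=("127.0.0.1", 30000))
    p.start()
    p.join()
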
sglang/test/test_programs.py CHANGED
@@ -535,6 +535,7 @@ def test_hellaswag_select():
 
     # Compute accuracy
     accuracy_gen = np.mean(np.array(preds_gen) == np.array(labels))
+    print(f"{accuracy=}, {accuracy_gen=}")
     assert np.abs(accuracy_gen - accuracy) < 0.05
     assert np.abs(latency_gen - latency) < 1
 
sglang/test/test_utils.py CHANGED
@@ -34,7 +34,7 @@ DEFAULT_SMALL_MOE_MODEL_NAME_FOR_TEST = "Qwen/Qwen1.5-MoE-A2.7B"
 DEFAULT_SMALL_EMBEDDING_MODEL_NAME_FOR_TEST = "Alibaba-NLP/gte-Qwen2-1.5B-instruct"
 DEFAULT_MLA_MODEL_NAME_FOR_TEST = "deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct"
 DEFAULT_MLA_FP8_MODEL_NAME_FOR_TEST = "neuralmagic/DeepSeek-Coder-V2-Lite-Instruct-FP8"
-DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH = 600
+DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH = 1000
 DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP1 = "meta-llama/Llama-3.1-8B-Instruct,mistralai/Mistral-7B-Instruct-v0.3,deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct,google/gemma-2-27b-it"
 DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP2 = "meta-llama/Llama-3.1-70B-Instruct,mistralai/Mixtral-8x7B-Instruct-v0.1,Qwen/Qwen2-57B-A14B-Instruct"
 DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_FP8_TP1 = "neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8,neuralmagic/Mistral-7B-Instruct-v0.3-FP8,neuralmagic/DeepSeek-Coder-V2-Lite-Instruct-FP8,neuralmagic/gemma-2-2b-it-FP8"
@@ -42,6 +42,9 @@ DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_FP8_TP2 = "neuralmagic/Meta-Llama-3.1-70B-In
 DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_QUANT_TP1 = "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4,hugging-quants/Meta-Llama-3.1-8B-Instruct-GPTQ-INT4"
 DEFAULT_SMALL_MODEL_NAME_FOR_TEST_QWEN = "Qwen/Qwen2.5-1.5B-Instruct"
 
+DEFAULT_EAGLE_TARGET_MODEL_FOR_TEST = "meta-llama/Llama-2-7b-chat-hf"
+DEFAULT_EAGLE_DRAFT_MODEL_FOR_TEST = "lmzheng/sglang-EAGLE-llama2-chat-7B"
+
 
 def is_in_ci():
     """Return whether it is in CI runner."""
@@ -132,10 +135,6 @@ def call_generate_srt_raw(prompt, temperature, max_tokens, stop=None, url=None):
     return pred
 
 
-def call_generate_gserver(prompt, temperature, max_tokens, stop=None, url=None):
-    raise NotImplementedError()
-
-
 def call_generate_guidance(
     prompt, temperature, max_tokens, stop=None, n=1, regex=None, model=None
 ):
@@ -527,6 +526,48 @@ def get_similarities(vec1, vec2):
     return F.cosine_similarity(torch.tensor(vec1), torch.tensor(vec2), dim=0)
 
 
+def get_benchmark_args(
+    base_url="",
+    dataset_name="",
+    dataset_path="",
+    tokenizer="",
+    num_prompts=500,
+    random_input_len=4096,
+    random_output_len=2048,
+    request_rate=float("inf"),
+    disable_stream=False,
+    disable_ignore_eos=False,
+):
+    return SimpleNamespace(
+        backend="sglang",
+        base_url=base_url,
+        host=None,
+        port=None,
+        dataset_name=dataset_name,
+        dataset_path=dataset_path,
+        model=None,
+        tokenizer=tokenizer,
+        num_prompts=num_prompts,
+        sharegpt_output_len=None,
+        sharegpt_context_len=None,
+        random_input_len=random_input_len,
+        random_output_len=random_output_len,
+        random_range_ratio=0.0,
+        request_rate=request_rate,
+        multi=None,
+        output_file=None,
+        disable_tqdm=False,
+        disable_stream=disable_stream,
+        return_logprob=False,
+        seed=0,
+        disable_ignore_eos=disable_ignore_eos,
+        extra_request_body=None,
+        apply_chat_template=False,
+        profile=None,
+        lora_name=None,
+    )
+
+
 def run_bench_serving(
     model,
     num_prompts,
@@ -538,6 +579,7 @@
     random_input_len=4096,
     random_output_len=2048,
     disable_stream=False,
+    disable_ignore_eos=False,
     need_warmup=False,
 ):
     # Launch the server
@@ -550,32 +592,17 @@
     )
 
     # Run benchmark
-    args = SimpleNamespace(
-        backend="sglang",
+    args = get_benchmark_args(
         base_url=base_url,
-        host=None,
-        port=None,
         dataset_name=dataset_name,
         dataset_path=dataset_path,
-        model=None,
         tokenizer=tokenizer,
         num_prompts=num_prompts,
-        sharegpt_output_len=None,
-        sharegpt_context_len=None,
         random_input_len=random_input_len,
         random_output_len=random_output_len,
-        random_range_ratio=0.0,
         request_rate=request_rate,
-        multi=None,
-        seed=0,
-        output_file=None,
-        disable_tqdm=False,
         disable_stream=disable_stream,
-        disable_ignore_eos=False,
-        return_logprob=False,
-        lora_name=None,
-        extra_request_body=None,
-        profile=None,
+        disable_ignore_eos=disable_ignore_eos,
     )
 
     try:
@@ -591,6 +618,38 @@
     return res
 
 
+def run_bench_serving_multi(
+    model,
+    base_url,
+    other_server_args,
+    benchmark_args,
+    need_warmup=False,
+):
+    # Launch the server
+    process = popen_launch_server(
+        model,
+        base_url,
+        timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+        other_args=other_server_args,
+    )
+
+    # run benchmark for all
+    res_l = []
+    try:
+        for args in benchmark_args:
+            if need_warmup:
+                warmup_args = copy.deepcopy(args)
+                warmup_args.num_prompts = 16
+                run_benchmark(warmup_args)
+
+            res = run_benchmark(args)
+            res_l.append((args, res))
+    finally:
+        kill_process_tree(process.pid)
+
+    return res_l
+
+
 def run_bench_one_batch(model, other_args):
     command = [
         "python3",
sglang/version.py CHANGED
@@ -1 +1 @@
-__version__ = "0.4.1.post7"
+__version__ = "0.4.2"
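
Note (not part of the diff): a quick post-upgrade check, assuming (as in current releases) that the package re-exports __version__ from sglang/version.py at the top level.

import sglang

# The expected string comes from the version bump shown above.
assert sglang.__version__ == "0.4.2", f"unexpected version: {sglang.__version__}"
print(f"sglang {sglang.__version__} is installed")
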
{sglang-0.4.1.post7.dist-info → sglang-0.4.2.dist-info}/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.2
 Name: sglang
-Version: 0.4.1.post7
+Version: 0.4.2
 Summary: SGLang is yet another fast serving framework for large language models and vision language models.
 License: Apache License
                                  Version 2.0, January 2004
@@ -240,7 +240,7 @@ Requires-Dist: xgrammar>=0.1.10; extra == "runtime-common"
 Provides-Extra: srt
 Requires-Dist: sglang[runtime_common]; extra == "srt"
 Requires-Dist: cuda-python; extra == "srt"
-Requires-Dist: sgl-kernel>=0.0.2.post14; extra == "srt"
+Requires-Dist: sgl-kernel>=0.0.3; extra == "srt"
 Requires-Dist: torch; extra == "srt"
 Requires-Dist: vllm==0.6.4.post1; extra == "srt"
 Requires-Dist: flashinfer==0.1.6; extra == "srt"
@@ -333,16 +333,16 @@ Requires-Dist: sglang[test]; extra == "dev-cpu"
 | [**Slides**](https://github.com/sgl-project/sgl-learning-materials?tab=readme-ov-file#slides) |
 
 ## News
-- [2024/12] 🔥 SGLang v0.4: Zero-Overhead Batch Scheduler, Cache-Aware Load Balancer, Faster Structured Outputs ([blog](https://lmsys.org/blog/2024-12-04-sglang-v0-4/)).
-- [2024/10] 🔥 The First SGLang Online Meetup ([slides](https://github.com/sgl-project/sgl-learning-materials?tab=readme-ov-file#the-first-sglang-online-meetup)).
-- [2024/09] SGLang v0.3 Release: 7x Faster DeepSeek MLA, 1.5x Faster torch.compile, Multi-Image/Video LLaVA-OneVision ([blog](https://lmsys.org/blog/2024-09-04-sglang-v0-3/)).
-- [2024/07] Faster Llama3 Serving with SGLang Runtime (vs. TensorRT-LLM, vLLM) ([blog](https://lmsys.org/blog/2024-07-25-sglang-llama3/)).
+- [2025/01] 🔥 SGLang provides day one support for DeepSeek V3/R1 models on NVIDIA and AMD GPUs with DeekSeek-specific optimizations. ([instructions](https://github.com/sgl-project/sglang/tree/main/benchmark/deepseek_v3), [AMD blog](https://www.amd.com/en/developer/resources/technical-articles/amd-instinct-gpus-power-deepseek-v3-revolutionizing-ai-development-with-sglang.html))
+- [2024/12] 🔥 v0.4 Release: Zero-Overhead Batch Scheduler, Cache-Aware Load Balancer, Faster Structured Outputs ([blog](https://lmsys.org/blog/2024-12-04-sglang-v0-4/)).
+- [2024/09] v0.3 Release: 7x Faster DeepSeek MLA, 1.5x Faster torch.compile, Multi-Image/Video LLaVA-OneVision ([blog](https://lmsys.org/blog/2024-09-04-sglang-v0-3/)).
+- [2024/07] v0.2 Release: Faster Llama3 Serving with SGLang Runtime (vs. TensorRT-LLM, vLLM) ([blog](https://lmsys.org/blog/2024-07-25-sglang-llama3/)).
 
 <details>
 <summary>More</summary>
 
+- [2024/10] The First SGLang Online Meetup ([slides](https://github.com/sgl-project/sgl-learning-materials?tab=readme-ov-file#the-first-sglang-online-meetup)).
 - [2024/02] SGLang enables **3x faster JSON decoding** with compressed finite state machine ([blog](https://lmsys.org/blog/2024-02-05-compressed-fsm/)).
-- [2024/04] SGLang is used by the official **LLaVA-NeXT (video)** release ([blog](https://llava-vl.github.io/blog/2024-04-30-llava-next-video/)).
 - [2024/01] SGLang provides up to **5x faster inference** with RadixAttention ([blog](https://lmsys.org/blog/2024-01-17-sglang/)).
 - [2024/01] SGLang powers the serving of the official **LLaVA v1.6** release demo ([usage](https://github.com/haotian-liu/LLaVA?tab=readme-ov-file#demo)).