sglang 0.3.6__py3-none-any.whl → 0.3.6.post2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (108)
  1. sglang/__init__.py +2 -2
  2. sglang/api.py +2 -2
  3. sglang/bench_one_batch.py +4 -7
  4. sglang/bench_one_batch_server.py +2 -2
  5. sglang/bench_serving.py +75 -26
  6. sglang/check_env.py +7 -1
  7. sglang/lang/backend/base_backend.py +1 -1
  8. sglang/lang/backend/runtime_endpoint.py +2 -2
  9. sglang/lang/tracer.py +1 -1
  10. sglang/launch_server.py +0 -3
  11. sglang/srt/configs/model_config.py +15 -20
  12. sglang/srt/constrained/__init__.py +13 -14
  13. sglang/srt/constrained/base_grammar_backend.py +13 -15
  14. sglang/srt/constrained/outlines_backend.py +13 -15
  15. sglang/srt/constrained/outlines_jump_forward.py +13 -15
  16. sglang/srt/constrained/xgrammar_backend.py +38 -57
  17. sglang/srt/conversation.py +13 -15
  18. sglang/srt/hf_transformers_utils.py +13 -15
  19. sglang/srt/layers/activation.py +13 -13
  20. sglang/srt/layers/attention/flashinfer_backend.py +14 -7
  21. sglang/srt/layers/attention/triton_ops/decode_attention.py +51 -55
  22. sglang/srt/layers/attention/triton_ops/extend_attention.py +16 -16
  23. sglang/srt/layers/attention/triton_ops/prefill_attention.py +13 -15
  24. sglang/srt/layers/custom_op_util.py +13 -14
  25. sglang/srt/layers/fused_moe_grok/__init__.py +1 -0
  26. sglang/srt/layers/{fused_moe → fused_moe_grok}/layer.py +4 -9
  27. sglang/srt/layers/{fused_moe/patch.py → fused_moe_patch.py} +5 -0
  28. sglang/srt/layers/fused_moe_triton/__init__.py +44 -0
  29. sglang/srt/layers/fused_moe_triton/fused_moe.py +861 -0
  30. sglang/srt/layers/fused_moe_triton/layer.py +633 -0
  31. sglang/srt/layers/layernorm.py +13 -15
  32. sglang/srt/layers/logits_processor.py +13 -15
  33. sglang/srt/layers/quantization/__init__.py +77 -17
  34. sglang/srt/layers/radix_attention.py +13 -15
  35. sglang/srt/layers/rotary_embedding.py +13 -13
  36. sglang/srt/layers/sampler.py +1 -1
  37. sglang/srt/lora/lora.py +13 -14
  38. sglang/srt/lora/lora_config.py +13 -14
  39. sglang/srt/lora/lora_manager.py +22 -24
  40. sglang/srt/managers/data_parallel_controller.py +25 -19
  41. sglang/srt/managers/detokenizer_manager.py +13 -18
  42. sglang/srt/managers/image_processor.py +6 -9
  43. sglang/srt/managers/io_struct.py +43 -28
  44. sglang/srt/managers/schedule_batch.py +92 -27
  45. sglang/srt/managers/schedule_policy.py +13 -15
  46. sglang/srt/managers/scheduler.py +94 -72
  47. sglang/srt/managers/session_controller.py +29 -19
  48. sglang/srt/managers/tokenizer_manager.py +29 -22
  49. sglang/srt/managers/tp_worker.py +13 -15
  50. sglang/srt/managers/tp_worker_overlap_thread.py +13 -15
  51. sglang/srt/metrics/collector.py +13 -15
  52. sglang/srt/metrics/func_timer.py +13 -15
  53. sglang/srt/mm_utils.py +13 -14
  54. sglang/srt/model_executor/cuda_graph_runner.py +20 -19
  55. sglang/srt/model_executor/forward_batch_info.py +19 -17
  56. sglang/srt/model_executor/model_runner.py +42 -30
  57. sglang/srt/models/chatglm.py +15 -16
  58. sglang/srt/models/commandr.py +15 -16
  59. sglang/srt/models/dbrx.py +15 -16
  60. sglang/srt/models/deepseek.py +15 -15
  61. sglang/srt/models/deepseek_v2.py +15 -15
  62. sglang/srt/models/exaone.py +14 -15
  63. sglang/srt/models/gemma.py +14 -14
  64. sglang/srt/models/gemma2.py +24 -19
  65. sglang/srt/models/gemma2_reward.py +13 -14
  66. sglang/srt/models/gpt_bigcode.py +14 -14
  67. sglang/srt/models/grok.py +15 -15
  68. sglang/srt/models/internlm2.py +13 -15
  69. sglang/srt/models/internlm2_reward.py +13 -14
  70. sglang/srt/models/llama.py +21 -21
  71. sglang/srt/models/llama_classification.py +13 -14
  72. sglang/srt/models/llama_reward.py +13 -14
  73. sglang/srt/models/llava.py +20 -16
  74. sglang/srt/models/llavavid.py +13 -15
  75. sglang/srt/models/minicpm.py +13 -15
  76. sglang/srt/models/minicpm3.py +13 -15
  77. sglang/srt/models/mistral.py +13 -15
  78. sglang/srt/models/mixtral.py +15 -15
  79. sglang/srt/models/mixtral_quant.py +14 -14
  80. sglang/srt/models/olmo.py +21 -19
  81. sglang/srt/models/olmoe.py +23 -20
  82. sglang/srt/models/qwen.py +14 -14
  83. sglang/srt/models/qwen2.py +22 -19
  84. sglang/srt/models/qwen2_moe.py +17 -18
  85. sglang/srt/models/stablelm.py +18 -16
  86. sglang/srt/models/torch_native_llama.py +15 -17
  87. sglang/srt/models/xverse.py +13 -14
  88. sglang/srt/models/xverse_moe.py +15 -16
  89. sglang/srt/models/yivl.py +13 -15
  90. sglang/srt/openai_api/adapter.py +13 -15
  91. sglang/srt/openai_api/protocol.py +13 -15
  92. sglang/srt/sampling/sampling_batch_info.py +4 -1
  93. sglang/srt/sampling/sampling_params.py +13 -15
  94. sglang/srt/server.py +60 -34
  95. sglang/srt/server_args.py +22 -22
  96. sglang/srt/utils.py +208 -19
  97. sglang/test/few_shot_gsm8k.py +8 -4
  98. sglang/test/runners.py +13 -14
  99. sglang/test/test_utils.py +2 -2
  100. sglang/version.py +1 -1
  101. {sglang-0.3.6.dist-info → sglang-0.3.6.post2.dist-info}/LICENSE +1 -1
  102. {sglang-0.3.6.dist-info → sglang-0.3.6.post2.dist-info}/METADATA +25 -15
  103. sglang-0.3.6.post2.dist-info/RECORD +164 -0
  104. sglang/srt/layers/fused_moe/__init__.py +0 -1
  105. sglang-0.3.6.dist-info/RECORD +0 -161
  106. /sglang/srt/layers/{fused_moe → fused_moe_grok}/fused_moe.py +0 -0
  107. {sglang-0.3.6.dist-info → sglang-0.3.6.post2.dist-info}/WHEEL +0 -0
  108. {sglang-0.3.6.dist-info → sglang-0.3.6.post2.dist-info}/top_level.txt +0 -0
sglang/__init__.py CHANGED
@@ -11,7 +11,7 @@ from sglang.api import (
11
11
  gen,
12
12
  gen_int,
13
13
  gen_string,
14
- get_server_args,
14
+ get_server_info,
15
15
  image,
16
16
  select,
17
17
  set_default_backend,
@@ -41,7 +41,7 @@ __all__ = [
41
41
  "gen",
42
42
  "gen_int",
43
43
  "gen_string",
44
- "get_server_args",
44
+ "get_server_info",
45
45
  "image",
46
46
  "select",
47
47
  "set_default_backend",
sglang/api.py CHANGED
@@ -65,7 +65,7 @@ def flush_cache(backend: Optional[BaseBackend] = None):
65
65
  return backend.flush_cache()
66
66
 
67
67
 
68
- def get_server_args(backend: Optional[BaseBackend] = None):
68
+ def get_server_info(backend: Optional[BaseBackend] = None):
69
69
  backend = backend or global_config.default_backend
70
70
  if backend is None:
71
71
  return None
@@ -73,7 +73,7 @@ def get_server_args(backend: Optional[BaseBackend] = None):
73
73
  # If backend is Runtime
74
74
  if hasattr(backend, "endpoint"):
75
75
  backend = backend.endpoint
76
- return backend.get_server_args()
76
+ return backend.get_server_info()
77
77
 
78
78
 
79
79
  def gen(
sglang/bench_one_batch.py CHANGED
@@ -212,6 +212,7 @@ def extend(reqs, model_runner):
212
212
  token_to_kv_pool=model_runner.token_to_kv_pool,
213
213
  tree_cache=None,
214
214
  model_config=model_runner.model_config,
215
+ enable_overlap=False,
215
216
  )
216
217
  batch.prepare_for_extend()
217
218
  model_worker_batch = batch.get_model_worker_batch()
@@ -278,10 +279,7 @@ def correctness_test(
278
279
 
279
280
 
280
281
  def synchronize(device):
281
- if device == "cuda":
282
- torch.cuda.synchronize()
283
- elif device == "xpu":
284
- torch.xpu.synchronize()
282
+ torch.get_device_module(device).synchronize()
285
283
 
286
284
 
287
285
  def latency_test_run_once(
@@ -468,7 +466,6 @@ if __name__ == "__main__":
468
466
 
469
467
  try:
470
468
  main(server_args, bench_args)
471
- except Exception as e:
472
- raise e
473
469
  finally:
474
- kill_child_process()
470
+ if server_args.tp_size != 1:
471
+ kill_child_process()
sglang/bench_one_batch_server.py CHANGED
@@ -5,9 +5,9 @@ This script launches a server and uses the HTTP interface.
5
5
  It accepts server arguments (the same as launch_server.py) and benchmark arguments (e.g., batch size, input lengths).
6
6
 
7
7
  Usage:
8
- python3 -m sglang.bench_server_latency --model meta-llama/Meta-Llama-3.1-8B --batch-size 1 16 64 --input-len 1024 --output-len 8
8
+ python3 -m sglang.bench_one_batch_server --model meta-llama/Meta-Llama-3.1-8B --batch-size 1 16 64 --input-len 1024 --output-len 8
9
9
 
10
- python3 -m sglang.bench_server_latency --model None --base-url http://localhost:30000 --batch-size 16 --input-len 1024 --output-len 8
10
+ python3 -m sglang.bench_one_batch_server --model None --base-url http://localhost:30000 --batch-size 16 --input-len 1024 --output-len 8
11
11
  """
12
12
 
13
13
  import argparse
sglang/bench_serving.py CHANGED
@@ -25,6 +25,7 @@ import warnings
25
25
  from argparse import ArgumentParser
26
26
  from dataclasses import dataclass, field
27
27
  from datetime import datetime
28
+ from pathlib import Path
28
29
  from typing import Any, AsyncGenerator, Dict, List, Optional, Tuple, Union
29
30
 
30
31
  import aiohttp
@@ -407,7 +408,7 @@ async def async_request_profile(api_url: str) -> RequestFuncOutput:
407
408
 
408
409
 
409
410
  def get_model(pretrained_model_name_or_path: str) -> str:
410
- if os.getenv("SGLANG_USE_MODELSCOPE", "False").lower() == "true":
411
+ if os.getenv("SGLANG_USE_MODELSCOPE", "false").lower() == "true":
411
412
  import huggingface_hub.constants
412
413
  from modelscope import snapshot_download
413
414
 
@@ -693,6 +694,19 @@ def gen_prompt(tokenizer, token_num):
693
694
  return tokenizer.decode(selected_tokens)
694
695
 
695
696
 
697
+ def get_gen_prefix_cache_path(args, tokenizer):
698
+ """Create cache directory under ~/.cache/sglang/benchmark"""
699
+ cache_dir = Path.home() / ".cache" / "sglang" / "benchmark"
700
+
701
+ # Create a unique cache filename based on the generation parameters
702
+ cache_key = (
703
+ f"gen_prefix_{args.gen_num_groups}_{args.gen_prompts_per_group}_"
704
+ f"{args.gen_system_prompt_len}_{args.gen_question_len}_{args.gen_output_len}_"
705
+ f"{tokenizer.__class__.__name__}.pkl"
706
+ )
707
+ return cache_dir / cache_key
708
+
709
+
696
710
  def sample_generated_shared_prefix_requests(
697
711
  num_groups: int,
698
712
  prompts_per_group: int,
@@ -701,12 +715,17 @@ def sample_generated_shared_prefix_requests(
701
715
  output_len: int,
702
716
  tokenizer: PreTrainedTokenizerBase,
703
717
  ) -> List[Tuple[str, int, int]]:
704
- if args.generated_input_path and os.path.exists(args.generated_input_path):
705
- print(f"\nloading generated input data from {args.generated_input_path}")
706
- with open(args.generated_input_path, "rb") as f:
718
+ """Generate benchmark requests with shared system prompts using random tokens and caching."""
719
+ cache_path = get_gen_prefix_cache_path(args, tokenizer)
720
+
721
+ # Try to load from cache first
722
+ if cache_path.exists():
723
+ print(f"\nLoading cached generated input data from {cache_path}")
724
+ with open(cache_path, "rb") as f:
707
725
  return pickle.load(f)
708
726
 
709
- """Generate benchmark requests with shared system prompts using random tokens."""
727
+ print("\nGenerating new input data...")
728
+
710
729
  # Generate system prompts for each group
711
730
  system_prompts = []
712
731
  for _ in range(num_groups):
@@ -719,17 +738,16 @@ def sample_generated_shared_prefix_requests(
719
738
  question = gen_prompt(tokenizer, question_len)
720
739
  questions.append(question)
721
740
 
722
- # Shuffle questions
723
- random.shuffle(questions)
724
-
725
741
  # Combine system prompts with questions
726
742
  input_requests = []
727
743
  total_input_tokens = 0
728
744
  total_output_tokens = 0
729
745
 
730
- for group_idx in range(num_groups):
746
+ for group_idx in tqdm(range(num_groups), desc="Generating system prompt"):
731
747
  system_prompt = system_prompts[group_idx]
732
- for prompt_idx in range(prompts_per_group):
748
+ for prompt_idx in tqdm(
749
+ range(prompts_per_group), desc="Generating questions", leave=False
750
+ ):
733
751
  question = questions[group_idx * prompts_per_group + prompt_idx]
734
752
  full_prompt = f"{system_prompt}\n\n{question}"
735
753
  prompt_len = len(tokenizer.encode(full_prompt))
@@ -738,6 +756,10 @@ def sample_generated_shared_prefix_requests(
738
756
  total_input_tokens += prompt_len
739
757
  total_output_tokens += output_len
740
758
 
759
+ # Shuffle questions
760
+ random.shuffle(input_requests)
761
+
762
+ # Print statistics
741
763
  print(f"\nGenerated shared prefix dataset statistics:")
742
764
  print(f"Number of groups: {num_groups}")
743
765
  print(f"Prompts per group: {prompts_per_group}")
@@ -750,11 +772,12 @@ def sample_generated_shared_prefix_requests(
750
772
  print(
751
773
  f"Average question length: {sum(len(tokenizer.encode(q)) for q in questions) / len(questions):.1f} tokens\n"
752
774
  )
753
- if args.generated_input_save_path:
754
- print(f"Saving generated input data to {args.generated_input_save_path}")
755
- os.makedirs(os.path.dirname(args.generated_input_save_path), exist_ok=True)
756
- with open(args.generated_input_save_path, "wb") as f:
757
- pickle.dump(input_requests, f)
775
+
776
+ # Save to cache
777
+ cache_path.parent.mkdir(parents=True, exist_ok=True)
778
+ print(f"Caching generated input data to {cache_path}")
779
+ with open(cache_path, "wb") as f:
780
+ pickle.dump(input_requests, f)
758
781
 
759
782
  return input_requests
760
783
 
@@ -859,6 +882,7 @@ async def benchmark(
859
882
  tokenizer: PreTrainedTokenizerBase,
860
883
  input_requests: List[Tuple[str, int, int]],
861
884
  request_rate: float,
885
+ max_concurrency: Optional[int],
862
886
  disable_tqdm: bool,
863
887
  extra_request_body: Dict[str, Any],
864
888
  profile: bool,
@@ -868,6 +892,15 @@ async def benchmark(
868
892
  else:
869
893
  raise ValueError(f"Unknown backend: {backend}")
870
894
 
895
+ # From https://github.com/vllm-project/vllm/pull/9390
896
+ semaphore = asyncio.Semaphore(max_concurrency) if max_concurrency else None
897
+
898
+ async def limited_request_func(request_func_input, pbar):
899
+ if semaphore is None:
900
+ return await request_func(request_func_input=request_func_input, pbar=pbar)
901
+ async with semaphore:
902
+ return await request_func(request_func_input=request_func_input, pbar=pbar)
903
+
871
904
  print("Starting initial single prompt test run...")
872
905
  test_prompt, test_prompt_len, test_output_len = input_requests[0]
873
906
  test_input = RequestFuncInput(
@@ -913,7 +946,7 @@ async def benchmark(
913
946
  )
914
947
  tasks.append(
915
948
  asyncio.create_task(
916
- request_func(request_func_input=request_func_input, pbar=pbar)
949
+ limited_request_func(request_func_input=request_func_input, pbar=pbar)
917
950
  )
918
951
  )
919
952
  outputs: List[RequestFuncOutput] = await asyncio.gather(*tasks)
@@ -940,6 +973,12 @@ async def benchmark(
940
973
  print("\n{s:{c}^{n}}".format(s=" Serving Benchmark Result ", n=50, c="="))
941
974
  print("{:<40} {:<10}".format("Backend:", backend))
942
975
  print("{:<40} {:<10}".format("Traffic request rate:", request_rate))
976
+ print(
977
+ "{:<40} {:<10}".format(
978
+ "Max reqeuest concurrency:",
979
+ max_concurrency if max_concurrency else "not set",
980
+ )
981
+ )
943
982
  print("{:<40} {:<10}".format("Successful requests:", metrics.completed))
944
983
  print("{:<40} {:<10.2f}".format("Benchmark duration (s):", benchmark_duration))
945
984
  print("{:<40} {:<10}".format("Total input tokens:", metrics.total_input))
@@ -1003,6 +1042,7 @@ async def benchmark(
1003
1042
  "backend": args.backend,
1004
1043
  "dataset_name": args.dataset_name,
1005
1044
  "request_rate": request_rate,
1045
+ "max_concurrency": max_concurrency,
1006
1046
  "total_input_tokens": metrics.total_input,
1007
1047
  "total_output_tokens": metrics.total_output,
1008
1048
  "total_output_tokens_retokenized": metrics.total_output_retokenized,
@@ -1090,6 +1130,10 @@ def run_benchmark(args_: argparse.Namespace):
1090
1130
  global args
1091
1131
  args = args_
1092
1132
 
1133
+ # Set default value for max_concurrency if not present
1134
+ if not hasattr(args, "max_concurrency"):
1135
+ args.max_concurrency = None
1136
+
1093
1137
  # Set global environments
1094
1138
  set_ulimit()
1095
1139
  random.seed(args.seed)
@@ -1201,6 +1245,7 @@ def run_benchmark(args_: argparse.Namespace):
1201
1245
  tokenizer=tokenizer,
1202
1246
  input_requests=input_requests,
1203
1247
  request_rate=args.request_rate,
1248
+ max_concurrency=args.max_concurrency,
1204
1249
  disable_tqdm=args.disable_tqdm,
1205
1250
  extra_request_body=extra_request_body,
1206
1251
  profile=args.profile,
@@ -1220,6 +1265,7 @@ def run_benchmark(args_: argparse.Namespace):
1220
1265
  tokenizer=tokenizer,
1221
1266
  input_requests=input_requests,
1222
1267
  request_rate=rate,
1268
+ max_concurrency=args.max_concurrency,
1223
1269
  disable_tqdm=args.disable_tqdm,
1224
1270
  extra_request_body=extra_request_body,
1225
1271
  profile=args.profile,
@@ -1319,6 +1365,19 @@ if __name__ == "__main__":
1319
1365
  help="Number of requests per second. If this is inf, then all the requests are sent at time 0. "
1320
1366
  "Otherwise, we use Poisson process to synthesize the request arrival times. Default is inf.",
1321
1367
  )
1368
+ parser.add_argument(
1369
+ "--max-concurrency",
1370
+ type=int,
1371
+ default=None,
1372
+ help="Maximum number of concurrent requests. This can be used "
1373
+ "to help simulate an environment where a higher level component "
1374
+ "is enforcing a maximum number of concurrent requests. While the "
1375
+ "--request-rate argument controls the rate at which requests are "
1376
+ "initiated, this argument will control how many are actually allowed "
1377
+ "to execute at a time. This means that when used in combination, the "
1378
+ "actual request rate may be lower than specified with --request-rate, "
1379
+ "if the server is not processing requests fast enough to keep up.",
1380
+ )
1322
1381
  parser.add_argument("--seed", type=int, default=1, help="The random seed.")
1323
1382
  parser.add_argument(
1324
1383
  "--multi",
@@ -1386,16 +1445,6 @@ if __name__ == "__main__":
1386
1445
  default=256,
1387
1446
  help="Target length in tokens for outputs in generated-shared-prefix dataset",
1388
1447
  )
1389
- parser.add_argument(
1390
- "--generated-input-save-path",
1391
- type=str,
1392
- help="Path to save generated input data",
1393
- )
1394
- parser.add_argument(
1395
- "--generated-input-path",
1396
- type=str,
1397
- help="Path to load previously generated input data",
1398
- )
1399
1448
  parser.add_argument(
1400
1449
  "--profile",
1401
1450
  action="store_true",
sglang/check_env.py CHANGED
@@ -22,18 +22,24 @@ PACKAGE_LIST = [
22
22
  "hf_transfer",
23
23
  "huggingface_hub",
24
24
  "interegular",
25
+ "modelscope",
26
+ "orjson",
27
+ "outlines",
28
+ "packaging",
25
29
  "psutil",
26
30
  "pydantic",
27
31
  "multipart",
28
32
  "zmq",
33
+ "torchao",
29
34
  "uvicorn",
30
35
  "uvloop",
31
36
  "vllm",
32
- "outlines",
37
+ "xgrammar",
33
38
  "openai",
34
39
  "tiktoken",
35
40
  "anthropic",
36
41
  "litellm",
42
+ "decord",
37
43
  ]
38
44
 
39
45
 
sglang/lang/backend/base_backend.py CHANGED
@@ -78,5 +78,5 @@ class BaseBackend:
78
78
  def flush_cache(self):
79
79
  pass
80
80
 
81
- def get_server_args(self):
81
+ def get_server_info(self):
82
82
  pass
sglang/lang/backend/runtime_endpoint.py CHANGED
@@ -58,9 +58,9 @@ class RuntimeEndpoint(BaseBackend):
58
58
  )
59
59
  self._assert_success(res)
60
60
 
61
- def get_server_args(self):
61
+ def get_server_info(self):
62
62
  res = http_request(
63
- self.base_url + "/get_server_args",
63
+ self.base_url + "/get_server_info",
64
64
  api_key=self.api_key,
65
65
  verify=self.verify,
66
66
  )
sglang/lang/tracer.py CHANGED
@@ -278,6 +278,6 @@ class TracingScope:
278
278
 
279
279
  def add_child_state(self, state: TracerProgramState):
280
280
  cur_scope = self
281
- while cur_scope != None:
281
+ while cur_scope is not None:
282
282
  cur_scope.tracer_state.child_states.append(state)
283
283
  cur_scope = cur_scope.last_scope
sglang/launch_server.py CHANGED
@@ -1,6 +1,5 @@
1
1
  """Launch the inference server."""
2
2
 
3
- import os
4
3
  import sys
5
4
 
6
5
  from sglang.srt.server import launch_server
@@ -12,7 +11,5 @@ if __name__ == "__main__":
12
11
 
13
12
  try:
14
13
  launch_server(server_args)
15
- except Exception as e:
16
- raise e
17
14
  finally:
18
15
  kill_child_process()
sglang/srt/configs/model_config.py CHANGED
@@ -1,27 +1,26 @@
1
- """
2
- Copyright 2023-2024 SGLang Team
3
- Licensed under the Apache License, Version 2.0 (the "License");
4
- you may not use this file except in compliance with the License.
5
- You may obtain a copy of the License at
6
-
7
- http://www.apache.org/licenses/LICENSE-2.0
8
-
9
- Unless required by applicable law or agreed to in writing, software
10
- distributed under the License is distributed on an "AS IS" BASIS,
11
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
- See the License for the specific language governing permissions and
13
- limitations under the License.
14
- """
1
+ # Copyright 2023-2024 SGLang Team
2
+ # Licensed under the Apache License, Version 2.0 (the "License");
3
+ # you may not use this file except in compliance with the License.
4
+ # You may obtain a copy of the License at
5
+ #
6
+ # http://www.apache.org/licenses/LICENSE-2.0
7
+ #
8
+ # Unless required by applicable law or agreed to in writing, software
9
+ # distributed under the License is distributed on an "AS IS" BASIS,
10
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11
+ # See the License for the specific language governing permissions and
12
+ # limitations under the License.
13
+ # ==============================================================================
15
14
 
16
15
  import json
17
16
  import logging
18
- import os
19
17
  from enum import IntEnum, auto
20
18
  from typing import List, Optional
21
19
 
22
20
  from transformers import PretrainedConfig
23
21
 
24
22
  from sglang.srt.hf_transformers_utils import get_config, get_context_length
23
+ from sglang.srt.utils import get_bool_env_var
25
24
 
26
25
  logger = logging.getLogger(__name__)
27
26
 
@@ -60,13 +59,9 @@ class ModelConfig:
60
59
 
61
60
  # Derive context length
62
61
  derived_context_len = get_context_length(self.hf_text_config)
63
- allow_long_context = os.environ.get(
64
- "SGLANG_ALLOW_OVERWRITE_LONGER_CONTEXT_LEN", None
65
- )
66
-
67
62
  if context_length is not None:
68
63
  if context_length > derived_context_len:
69
- if allow_long_context:
64
+ if get_bool_env_var("SGLANG_ALLOW_OVERWRITE_LONGER_CONTEXT_LEN"):
70
65
  logger.warning(
71
66
  f"Warning: User-specified context_length ({context_length}) is greater than the derived context_length ({derived_context_len}). "
72
67
  f"This may lead to incorrect model outputs or CUDA errors."
sglang/srt/constrained/__init__.py CHANGED
@@ -1,17 +1,16 @@
1
- """
2
- Copyright 2023-2024 SGLang Team
3
- Licensed under the Apache License, Version 2.0 (the "License");
4
- you may not use this file except in compliance with the License.
5
- You may obtain a copy of the License at
6
-
7
- http://www.apache.org/licenses/LICENSE-2.0
8
-
9
- Unless required by applicable law or agreed to in writing, software
10
- distributed under the License is distributed on an "AS IS" BASIS,
11
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
- See the License for the specific language governing permissions and
13
- limitations under the License.
14
- """
1
+ # Copyright 2023-2024 SGLang Team
2
+ # Licensed under the Apache License, Version 2.0 (the "License");
3
+ # you may not use this file except in compliance with the License.
4
+ # You may obtain a copy of the License at
5
+ #
6
+ # http://www.apache.org/licenses/LICENSE-2.0
7
+ #
8
+ # Unless required by applicable law or agreed to in writing, software
9
+ # distributed under the License is distributed on an "AS IS" BASIS,
10
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11
+ # See the License for the specific language governing permissions and
12
+ # limitations under the License.
13
+ # ==============================================================================
15
14
 
16
15
  # TODO(lmzheng): make this an optional dependency
17
16
  from sglang.srt.constrained.outlines_backend import build_regex_from_object
sglang/srt/constrained/base_grammar_backend.py CHANGED
@@ -1,18 +1,16 @@
1
- """
2
- Copyright 2023-2024 SGLang Team
3
- Licensed under the Apache License, Version 2.0 (the "License");
4
- you may not use this file except in compliance with the License.
5
- You may obtain a copy of the License at
6
-
7
- http://www.apache.org/licenses/LICENSE-2.0
8
-
9
- Unless required by applicable law or agreed to in writing, software
10
- distributed under the License is distributed on an "AS IS" BASIS,
11
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
- See the License for the specific language governing permissions and
13
- limitations under the License.
14
- """
15
-
1
+ # Copyright 2023-2024 SGLang Team
2
+ # Licensed under the Apache License, Version 2.0 (the "License");
3
+ # you may not use this file except in compliance with the License.
4
+ # You may obtain a copy of the License at
5
+ #
6
+ # http://www.apache.org/licenses/LICENSE-2.0
7
+ #
8
+ # Unless required by applicable law or agreed to in writing, software
9
+ # distributed under the License is distributed on an "AS IS" BASIS,
10
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11
+ # See the License for the specific language governing permissions and
12
+ # limitations under the License.
13
+ # ==============================================================================
16
14
  """The baseclass of a backend for grammar-guided constrained decoding."""
17
15
 
18
16
  from concurrent.futures import Future, ThreadPoolExecutor
sglang/srt/constrained/outlines_backend.py CHANGED
@@ -1,18 +1,16 @@
1
- """
2
- Copyright 2023-2024 SGLang Team
3
- Licensed under the Apache License, Version 2.0 (the "License");
4
- you may not use this file except in compliance with the License.
5
- You may obtain a copy of the License at
6
-
7
- http://www.apache.org/licenses/LICENSE-2.0
8
-
9
- Unless required by applicable law or agreed to in writing, software
10
- distributed under the License is distributed on an "AS IS" BASIS,
11
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
- See the License for the specific language governing permissions and
13
- limitations under the License.
14
- """
15
-
1
+ # Copyright 2023-2024 SGLang Team
2
+ # Licensed under the Apache License, Version 2.0 (the "License");
3
+ # you may not use this file except in compliance with the License.
4
+ # You may obtain a copy of the License at
5
+ #
6
+ # http://www.apache.org/licenses/LICENSE-2.0
7
+ #
8
+ # Unless required by applicable law or agreed to in writing, software
9
+ # distributed under the License is distributed on an "AS IS" BASIS,
10
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11
+ # See the License for the specific language governing permissions and
12
+ # limitations under the License.
13
+ # ==============================================================================
16
14
  """Constrained decoding with outlines backend."""
17
15
 
18
16
  import json
sglang/srt/constrained/outlines_jump_forward.py CHANGED
@@ -1,18 +1,16 @@
1
- """
2
- Copyright 2023-2024 SGLang Team
3
- Licensed under the Apache License, Version 2.0 (the "License");
4
- you may not use this file except in compliance with the License.
5
- You may obtain a copy of the License at
6
-
7
- http://www.apache.org/licenses/LICENSE-2.0
8
-
9
- Unless required by applicable law or agreed to in writing, software
10
- distributed under the License is distributed on an "AS IS" BASIS,
11
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
- See the License for the specific language governing permissions and
13
- limitations under the License.
14
- """
15
-
1
+ # Copyright 2023-2024 SGLang Team
2
+ # Licensed under the Apache License, Version 2.0 (the "License");
3
+ # you may not use this file except in compliance with the License.
4
+ # You may obtain a copy of the License at
5
+ #
6
+ # http://www.apache.org/licenses/LICENSE-2.0
7
+ #
8
+ # Unless required by applicable law or agreed to in writing, software
9
+ # distributed under the License is distributed on an "AS IS" BASIS,
10
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11
+ # See the License for the specific language governing permissions and
12
+ # limitations under the License.
13
+ # ==============================================================================
16
14
  """
17
15
  Faster constrained decoding with jump forward decoding / compressed finite state machine.
18
16
  Reference: https://lmsys.org/blog/2024-02-05-compressed-fsm/