sglang 0.3.6__py3-none-any.whl → 0.3.6.post2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/__init__.py +2 -2
- sglang/api.py +2 -2
- sglang/bench_one_batch.py +4 -7
- sglang/bench_one_batch_server.py +2 -2
- sglang/bench_serving.py +75 -26
- sglang/check_env.py +7 -1
- sglang/lang/backend/base_backend.py +1 -1
- sglang/lang/backend/runtime_endpoint.py +2 -2
- sglang/lang/tracer.py +1 -1
- sglang/launch_server.py +0 -3
- sglang/srt/configs/model_config.py +15 -20
- sglang/srt/constrained/__init__.py +13 -14
- sglang/srt/constrained/base_grammar_backend.py +13 -15
- sglang/srt/constrained/outlines_backend.py +13 -15
- sglang/srt/constrained/outlines_jump_forward.py +13 -15
- sglang/srt/constrained/xgrammar_backend.py +38 -57
- sglang/srt/conversation.py +13 -15
- sglang/srt/hf_transformers_utils.py +13 -15
- sglang/srt/layers/activation.py +13 -13
- sglang/srt/layers/attention/flashinfer_backend.py +14 -7
- sglang/srt/layers/attention/triton_ops/decode_attention.py +51 -55
- sglang/srt/layers/attention/triton_ops/extend_attention.py +16 -16
- sglang/srt/layers/attention/triton_ops/prefill_attention.py +13 -15
- sglang/srt/layers/custom_op_util.py +13 -14
- sglang/srt/layers/fused_moe_grok/__init__.py +1 -0
- sglang/srt/layers/{fused_moe → fused_moe_grok}/layer.py +4 -9
- sglang/srt/layers/{fused_moe/patch.py → fused_moe_patch.py} +5 -0
- sglang/srt/layers/fused_moe_triton/__init__.py +44 -0
- sglang/srt/layers/fused_moe_triton/fused_moe.py +861 -0
- sglang/srt/layers/fused_moe_triton/layer.py +633 -0
- sglang/srt/layers/layernorm.py +13 -15
- sglang/srt/layers/logits_processor.py +13 -15
- sglang/srt/layers/quantization/__init__.py +77 -17
- sglang/srt/layers/radix_attention.py +13 -15
- sglang/srt/layers/rotary_embedding.py +13 -13
- sglang/srt/layers/sampler.py +1 -1
- sglang/srt/lora/lora.py +13 -14
- sglang/srt/lora/lora_config.py +13 -14
- sglang/srt/lora/lora_manager.py +22 -24
- sglang/srt/managers/data_parallel_controller.py +25 -19
- sglang/srt/managers/detokenizer_manager.py +13 -18
- sglang/srt/managers/image_processor.py +6 -9
- sglang/srt/managers/io_struct.py +43 -28
- sglang/srt/managers/schedule_batch.py +92 -27
- sglang/srt/managers/schedule_policy.py +13 -15
- sglang/srt/managers/scheduler.py +94 -72
- sglang/srt/managers/session_controller.py +29 -19
- sglang/srt/managers/tokenizer_manager.py +29 -22
- sglang/srt/managers/tp_worker.py +13 -15
- sglang/srt/managers/tp_worker_overlap_thread.py +13 -15
- sglang/srt/metrics/collector.py +13 -15
- sglang/srt/metrics/func_timer.py +13 -15
- sglang/srt/mm_utils.py +13 -14
- sglang/srt/model_executor/cuda_graph_runner.py +20 -19
- sglang/srt/model_executor/forward_batch_info.py +19 -17
- sglang/srt/model_executor/model_runner.py +42 -30
- sglang/srt/models/chatglm.py +15 -16
- sglang/srt/models/commandr.py +15 -16
- sglang/srt/models/dbrx.py +15 -16
- sglang/srt/models/deepseek.py +15 -15
- sglang/srt/models/deepseek_v2.py +15 -15
- sglang/srt/models/exaone.py +14 -15
- sglang/srt/models/gemma.py +14 -14
- sglang/srt/models/gemma2.py +24 -19
- sglang/srt/models/gemma2_reward.py +13 -14
- sglang/srt/models/gpt_bigcode.py +14 -14
- sglang/srt/models/grok.py +15 -15
- sglang/srt/models/internlm2.py +13 -15
- sglang/srt/models/internlm2_reward.py +13 -14
- sglang/srt/models/llama.py +21 -21
- sglang/srt/models/llama_classification.py +13 -14
- sglang/srt/models/llama_reward.py +13 -14
- sglang/srt/models/llava.py +20 -16
- sglang/srt/models/llavavid.py +13 -15
- sglang/srt/models/minicpm.py +13 -15
- sglang/srt/models/minicpm3.py +13 -15
- sglang/srt/models/mistral.py +13 -15
- sglang/srt/models/mixtral.py +15 -15
- sglang/srt/models/mixtral_quant.py +14 -14
- sglang/srt/models/olmo.py +21 -19
- sglang/srt/models/olmoe.py +23 -20
- sglang/srt/models/qwen.py +14 -14
- sglang/srt/models/qwen2.py +22 -19
- sglang/srt/models/qwen2_moe.py +17 -18
- sglang/srt/models/stablelm.py +18 -16
- sglang/srt/models/torch_native_llama.py +15 -17
- sglang/srt/models/xverse.py +13 -14
- sglang/srt/models/xverse_moe.py +15 -16
- sglang/srt/models/yivl.py +13 -15
- sglang/srt/openai_api/adapter.py +13 -15
- sglang/srt/openai_api/protocol.py +13 -15
- sglang/srt/sampling/sampling_batch_info.py +4 -1
- sglang/srt/sampling/sampling_params.py +13 -15
- sglang/srt/server.py +60 -34
- sglang/srt/server_args.py +22 -22
- sglang/srt/utils.py +208 -19
- sglang/test/few_shot_gsm8k.py +8 -4
- sglang/test/runners.py +13 -14
- sglang/test/test_utils.py +2 -2
- sglang/version.py +1 -1
- {sglang-0.3.6.dist-info → sglang-0.3.6.post2.dist-info}/LICENSE +1 -1
- {sglang-0.3.6.dist-info → sglang-0.3.6.post2.dist-info}/METADATA +25 -15
- sglang-0.3.6.post2.dist-info/RECORD +164 -0
- sglang/srt/layers/fused_moe/__init__.py +0 -1
- sglang-0.3.6.dist-info/RECORD +0 -161
- /sglang/srt/layers/{fused_moe → fused_moe_grok}/fused_moe.py +0 -0
- {sglang-0.3.6.dist-info → sglang-0.3.6.post2.dist-info}/WHEEL +0 -0
- {sglang-0.3.6.dist-info → sglang-0.3.6.post2.dist-info}/top_level.txt +0 -0
sglang/__init__.py
CHANGED
@@ -11,7 +11,7 @@ from sglang.api import (
     gen,
     gen_int,
     gen_string,
-    get_server_args,
+    get_server_info,
     image,
     select,
     set_default_backend,
@@ -41,7 +41,7 @@ __all__ = [
    "gen",
    "gen_int",
    "gen_string",
-    "get_server_args",
+    "get_server_info",
    "image",
    "select",
    "set_default_backend",
sglang/api.py
CHANGED
@@ -65,7 +65,7 @@ def flush_cache(backend: Optional[BaseBackend] = None):
     return backend.flush_cache()
 
 
-def get_server_args(backend: Optional[BaseBackend] = None):
+def get_server_info(backend: Optional[BaseBackend] = None):
     backend = backend or global_config.default_backend
     if backend is None:
         return None
@@ -73,7 +73,7 @@ def get_server_args(backend: Optional[BaseBackend] = None):
     # If backend is Runtime
     if hasattr(backend, "endpoint"):
         backend = backend.endpoint
-    return backend.get_server_args()
+    return backend.get_server_info()
 
 
 def gen(
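For orientation, the renamed helper is the client-side entry point for the server's /get_server_info route; a minimal usage sketch (assuming a sglang server is already listening on localhost:30000):

import sglang as sgl

# Minimal sketch: point the frontend at a running server, then query it.
backend = sgl.RuntimeEndpoint("http://localhost:30000")
sgl.set_default_backend(backend)

# Formerly get_server_args(); now proxies to the /get_server_info endpoint.
print(sgl.get_server_info())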
sglang/bench_one_batch.py
CHANGED
@@ -212,6 +212,7 @@ def extend(reqs, model_runner):
         token_to_kv_pool=model_runner.token_to_kv_pool,
         tree_cache=None,
         model_config=model_runner.model_config,
+        enable_overlap=False,
     )
     batch.prepare_for_extend()
     model_worker_batch = batch.get_model_worker_batch()
@@ -278,10 +279,7 @@ def correctness_test(
 
 
 def synchronize(device):
-    if device == "cuda":
-        torch.cuda.synchronize()
-    elif device == "xpu":
-        torch.xpu.synchronize()
+    torch.get_device_module(device).synchronize()
 
 
 def latency_test_run_once(
@@ -468,7 +466,6 @@ if __name__ == "__main__":
 
     try:
         main(server_args, bench_args)
-    except Exception as e:
-        raise e
     finally:
-        kill_child_process()
+        if server_args.tp_size != 1:
+            kill_child_process()
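The synchronize() rewrite relies on torch.get_device_module, which resolves the backend module (torch.cuda, torch.xpu, ...) from a device string, so new device types need no extra branches. A sketch of the equivalence (assumes a PyTorch version that ships torch.get_device_module):

import torch

def synchronize_old(device: str) -> None:
    # Pre-change behavior: one hard-coded branch per device type.
    if device == "cuda":
        torch.cuda.synchronize()
    elif device == "xpu":
        torch.xpu.synchronize()

def synchronize_new(device: str) -> None:
    # Post-change behavior: resolve torch.cuda / torch.xpu / ... dynamically.
    torch.get_device_module(device).synchronize()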
sglang/bench_one_batch_server.py
CHANGED
@@ -5,9 +5,9 @@ This script launches a server and uses the HTTP interface.
 It accepts server arguments (the same as launch_server.py) and benchmark arguments (e.g., batch size, input lengths).
 
 Usage:
-python3 -m sglang.bench_server_latency --model meta-llama/Meta-Llama-3.1-8B --batch-size 1 16 64 --input-len 1024 --output-len 8
+python3 -m sglang.bench_one_batch_server --model meta-llama/Meta-Llama-3.1-8B --batch-size 1 16 64 --input-len 1024 --output-len 8
 
-python3 -m sglang.bench_server_latency --model None --base-url http://localhost:30000 --batch-size 16 --input-len 1024 --output-len 8
+python3 -m sglang.bench_one_batch_server --model None --base-url http://localhost:30000 --batch-size 16 --input-len 1024 --output-len 8
 """
 
 import argparse
sglang/bench_serving.py
CHANGED
@@ -25,6 +25,7 @@ import warnings
 from argparse import ArgumentParser
 from dataclasses import dataclass, field
 from datetime import datetime
+from pathlib import Path
 from typing import Any, AsyncGenerator, Dict, List, Optional, Tuple, Union
 
 import aiohttp
@@ -407,7 +408,7 @@ async def async_request_profile(api_url: str) -> RequestFuncOutput:
 
 
 def get_model(pretrained_model_name_or_path: str) -> str:
-    if os.getenv("SGLANG_USE_MODELSCOPE", "False") == "True":
+    if os.getenv("SGLANG_USE_MODELSCOPE", "false").lower() == "true":
         import huggingface_hub.constants
         from modelscope import snapshot_download
 
@@ -693,6 +694,19 @@ def gen_prompt(tokenizer, token_num):
     return tokenizer.decode(selected_tokens)
 
 
+def get_gen_prefix_cache_path(args, tokenizer):
+    """Create cache directory under ~/.cache/sglang/benchmark"""
+    cache_dir = Path.home() / ".cache" / "sglang" / "benchmark"
+
+    # Create a unique cache filename based on the generation parameters
+    cache_key = (
+        f"gen_prefix_{args.gen_num_groups}_{args.gen_prompts_per_group}_"
+        f"{args.gen_system_prompt_len}_{args.gen_question_len}_{args.gen_output_len}_"
+        f"{tokenizer.__class__.__name__}.pkl"
+    )
+    return cache_dir / cache_key
+
+
 def sample_generated_shared_prefix_requests(
     num_groups: int,
     prompts_per_group: int,
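To illustrate the cache layout (all parameter values below are hypothetical), the helper maps each combination of generation parameters and tokenizer class to one deterministic pickle path:

from pathlib import Path

# Hypothetical values, mirroring the key format in the hunk above.
num_groups, prompts_per_group = 64, 16
system_prompt_len, question_len, output_len = 2048, 128, 256
tokenizer_cls = "LlamaTokenizerFast"

cache_key = (
    f"gen_prefix_{num_groups}_{prompts_per_group}_"
    f"{system_prompt_len}_{question_len}_{output_len}_{tokenizer_cls}.pkl"
)
print(Path.home() / ".cache" / "sglang" / "benchmark" / cache_key)
# e.g. /home/user/.cache/sglang/benchmark/gen_prefix_64_16_2048_128_256_LlamaTokenizerFast.pkl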
@@ -701,12 +715,17 @@ def sample_generated_shared_prefix_requests(
     output_len: int,
     tokenizer: PreTrainedTokenizerBase,
 ) -> List[Tuple[str, int, int]]:
-    if args.generated_input_path:
-        print(f"\nLoading generated input data from {args.generated_input_path}")
-        with open(args.generated_input_path, "rb") as f:
+    """Generate benchmark requests with shared system prompts using random tokens and caching."""
+    cache_path = get_gen_prefix_cache_path(args, tokenizer)
+
+    # Try to load from cache first
+    if cache_path.exists():
+        print(f"\nLoading cached generated input data from {cache_path}")
+        with open(cache_path, "rb") as f:
             return pickle.load(f)
 
-    """Generate benchmark requests with shared system prompts using random tokens."""
+    print("\nGenerating new input data...")
+
     # Generate system prompts for each group
     system_prompts = []
     for _ in range(num_groups):
@@ -719,17 +738,16 @@ def sample_generated_shared_prefix_requests(
         question = gen_prompt(tokenizer, question_len)
         questions.append(question)
 
-    # Shuffle questions
-    random.shuffle(questions)
-
     # Combine system prompts with questions
     input_requests = []
     total_input_tokens = 0
     total_output_tokens = 0
 
-    for group_idx in range(num_groups):
+    for group_idx in tqdm(range(num_groups), desc="Generating system prompt"):
         system_prompt = system_prompts[group_idx]
-        for prompt_idx in range(prompts_per_group):
+        for prompt_idx in tqdm(
+            range(prompts_per_group), desc="Generating questions", leave=False
+        ):
             question = questions[group_idx * prompts_per_group + prompt_idx]
             full_prompt = f"{system_prompt}\n\n{question}"
             prompt_len = len(tokenizer.encode(full_prompt))
@@ -738,6 +756,10 @@ def sample_generated_shared_prefix_requests(
         total_input_tokens += prompt_len
         total_output_tokens += output_len
 
+    # Shuffle questions
+    random.shuffle(input_requests)
+
+    # Print statistics
     print(f"\nGenerated shared prefix dataset statistics:")
     print(f"Number of groups: {num_groups}")
     print(f"Prompts per group: {prompts_per_group}")
@@ -750,11 +772,12 @@ def sample_generated_shared_prefix_requests(
     print(
         f"Average question length: {sum(len(tokenizer.encode(q)) for q in questions) / len(questions):.1f} tokens\n"
     )
-
-    if args.generated_input_save_path:
-        print(f"Saving generated input data to {args.generated_input_save_path}")
-        with open(args.generated_input_save_path, "wb") as f:
-            pickle.dump(input_requests, f)
+
+    # Save to cache
+    cache_path.parent.mkdir(parents=True, exist_ok=True)
+    print(f"Caching generated input data to {cache_path}")
+    with open(cache_path, "wb") as f:
+        pickle.dump(input_requests, f)
 
     return input_requests
 
@@ -859,6 +882,7 @@ async def benchmark(
     tokenizer: PreTrainedTokenizerBase,
     input_requests: List[Tuple[str, int, int]],
     request_rate: float,
+    max_concurrency: Optional[int],
     disable_tqdm: bool,
     extra_request_body: Dict[str, Any],
     profile: bool,
@@ -868,6 +892,15 @@ async def benchmark(
     else:
         raise ValueError(f"Unknown backend: {backend}")
 
+    # From https://github.com/vllm-project/vllm/pull/9390
+    semaphore = asyncio.Semaphore(max_concurrency) if max_concurrency else None
+
+    async def limited_request_func(request_func_input, pbar):
+        if semaphore is None:
+            return await request_func(request_func_input=request_func_input, pbar=pbar)
+        async with semaphore:
+            return await request_func(request_func_input=request_func_input, pbar=pbar)
+
     print("Starting initial single prompt test run...")
     test_prompt, test_prompt_len, test_output_len = input_requests[0]
     test_input = RequestFuncInput(
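The concurrency cap follows the pattern from the referenced vLLM PR: arrival times are still drawn from the --request-rate schedule, but an asyncio.Semaphore bounds how many requests are in flight at once. A self-contained sketch of the same pattern:

import asyncio
from typing import Optional

async def fake_request(i: int) -> int:
    await asyncio.sleep(0.1)  # stand-in for the real HTTP request
    return i

async def main(max_concurrency: Optional[int] = 8) -> None:
    # None means unlimited, matching the benchmark's default.
    semaphore = asyncio.Semaphore(max_concurrency) if max_concurrency else None

    async def limited(i: int) -> int:
        if semaphore is None:
            return await fake_request(i)
        async with semaphore:  # at most max_concurrency bodies run at once
            return await fake_request(i)

    results = await asyncio.gather(*(limited(i) for i in range(100)))
    print(f"completed {len(results)} requests")

asyncio.run(main())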
@@ -913,7 +946,7 @@ async def benchmark(
         )
         tasks.append(
             asyncio.create_task(
-                request_func(request_func_input=request_func_input, pbar=pbar)
+                limited_request_func(request_func_input=request_func_input, pbar=pbar)
             )
         )
     outputs: List[RequestFuncOutput] = await asyncio.gather(*tasks)
@@ -940,6 +973,12 @@ async def benchmark(
     print("\n{s:{c}^{n}}".format(s=" Serving Benchmark Result ", n=50, c="="))
     print("{:<40} {:<10}".format("Backend:", backend))
     print("{:<40} {:<10}".format("Traffic request rate:", request_rate))
+    print(
+        "{:<40} {:<10}".format(
+            "Max reqeuest concurrency:",
+            max_concurrency if max_concurrency else "not set",
+        )
+    )
     print("{:<40} {:<10}".format("Successful requests:", metrics.completed))
     print("{:<40} {:<10.2f}".format("Benchmark duration (s):", benchmark_duration))
     print("{:<40} {:<10}".format("Total input tokens:", metrics.total_input))
@@ -1003,6 +1042,7 @@ async def benchmark(
         "backend": args.backend,
         "dataset_name": args.dataset_name,
         "request_rate": request_rate,
+        "max_concurrency": max_concurrency,
         "total_input_tokens": metrics.total_input,
         "total_output_tokens": metrics.total_output,
         "total_output_tokens_retokenized": metrics.total_output_retokenized,
@@ -1090,6 +1130,10 @@ def run_benchmark(args_: argparse.Namespace):
     global args
     args = args_
 
+    # Set default value for max_concurrency if not present
+    if not hasattr(args, "max_concurrency"):
+        args.max_concurrency = None
+
     # Set global environments
     set_ulimit()
     random.seed(args.seed)
@@ -1201,6 +1245,7 @@ def run_benchmark(args_: argparse.Namespace):
             tokenizer=tokenizer,
             input_requests=input_requests,
             request_rate=args.request_rate,
+            max_concurrency=args.max_concurrency,
             disable_tqdm=args.disable_tqdm,
             extra_request_body=extra_request_body,
             profile=args.profile,
@@ -1220,6 +1265,7 @@ def run_benchmark(args_: argparse.Namespace):
                 tokenizer=tokenizer,
                 input_requests=input_requests,
                 request_rate=rate,
+                max_concurrency=args.max_concurrency,
                 disable_tqdm=args.disable_tqdm,
                 extra_request_body=extra_request_body,
                 profile=args.profile,
@@ -1319,6 +1365,19 @@ if __name__ == "__main__":
         help="Number of requests per second. If this is inf, then all the requests are sent at time 0. "
         "Otherwise, we use Poisson process to synthesize the request arrival times. Default is inf.",
     )
+    parser.add_argument(
+        "--max-concurrency",
+        type=int,
+        default=None,
+        help="Maximum number of concurrent requests. This can be used "
+        "to help simulate an environment where a higher level component "
+        "is enforcing a maximum number of concurrent requests. While the "
+        "--request-rate argument controls the rate at which requests are "
+        "initiated, this argument will control how many are actually allowed "
+        "to execute at a time. This means that when used in combination, the "
+        "actual request rate may be lower than specified with --request-rate, "
+        "if the server is not processing requests fast enough to keep up.",
+    )
     parser.add_argument("--seed", type=int, default=1, help="The random seed.")
     parser.add_argument(
         "--multi",
@@ -1386,16 +1445,6 @@ if __name__ == "__main__":
         default=256,
         help="Target length in tokens for outputs in generated-shared-prefix dataset",
     )
-    parser.add_argument(
-        "--generated-input-save-path",
-        type=str,
-        help="Path to save generated input data",
-    )
-    parser.add_argument(
-        "--generated-input-path",
-        type=str,
-        help="Path to load previously generated input data",
-    )
     parser.add_argument(
         "--profile",
         action="store_true",
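An illustrative invocation combining both knobs (the flag names come from this diff; the numbers are placeholders): requests arrive at roughly 16/s, but at most 32 execute concurrently:

python3 -m sglang.bench_serving --backend sglang --num-prompts 1000 --request-rate 16 --max-concurrency 32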
sglang/check_env.py
CHANGED
@@ -22,18 +22,24 @@ PACKAGE_LIST = [
     "hf_transfer",
     "huggingface_hub",
     "interegular",
+    "modelscope",
+    "orjson",
+    "outlines",
+    "packaging",
     "psutil",
     "pydantic",
     "multipart",
     "zmq",
+    "torchao",
     "uvicorn",
     "uvloop",
     "vllm",
-    "outlines",
+    "xgrammar",
     "openai",
     "tiktoken",
     "anthropic",
     "litellm",
+    "decord",
 ]
 
 
sglang/lang/backend/runtime_endpoint.py
CHANGED
@@ -58,9 +58,9 @@ class RuntimeEndpoint(BaseBackend):
         )
         self._assert_success(res)
 
-    def get_server_args(self):
+    def get_server_info(self):
         res = http_request(
-            self.base_url + "/get_server_args",
+            self.base_url + "/get_server_info",
             api_key=self.api_key,
             verify=self.verify,
         )
sglang/lang/tracer.py
CHANGED
sglang/launch_server.py
CHANGED
@@ -1,6 +1,5 @@
 """Launch the inference server."""
 
-import os
 import sys
 
 from sglang.srt.server import launch_server
@@ -12,7 +11,5 @@ if __name__ == "__main__":
 
     try:
         launch_server(server_args)
-    except Exception as e:
-        raise e
     finally:
         kill_child_process()
sglang/srt/configs/model_config.py
CHANGED
@@ -1,27 +1,26 @@
-"""
-Copyright 2023-2024 SGLang Team
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-"""
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
 
 import json
 import logging
-import os
 from enum import IntEnum, auto
 from typing import List, Optional
 
 from transformers import PretrainedConfig
 
 from sglang.srt.hf_transformers_utils import get_config, get_context_length
+from sglang.srt.utils import get_bool_env_var
 
 logger = logging.getLogger(__name__)
 
@@ -60,13 +59,9 @@ class ModelConfig:
 
         # Derive context length
         derived_context_len = get_context_length(self.hf_text_config)
-        allow_long_context = os.environ.get(
-            "SGLANG_ALLOW_OVERWRITE_LONGER_CONTEXT_LEN", None
-        )
-
         if context_length is not None:
             if context_length > derived_context_len:
-                if allow_long_context:
+                if get_bool_env_var("SGLANG_ALLOW_OVERWRITE_LONGER_CONTEXT_LEN"):
                     logger.warning(
                         f"Warning: User-specified context_length ({context_length}) is greater than the derived context_length ({derived_context_len}). "
                         f"This may lead to incorrect model outputs or CUDA errors."
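Besides removing the direct os.environ access, this fixes a truthiness bug: os.environ.get returns a string, so any non-empty value, including "false", previously enabled the override. A sketch of the intended semantics (an illustrative reimplementation, not the actual sglang.srt.utils.get_bool_env_var source):

import os

def get_bool_env_var(name: str, default: str = "false") -> bool:
    # Assumed semantics: only "true" / "1" (case-insensitive) enable the flag.
    return os.getenv(name, default).lower() in ("true", "1")

os.environ["SGLANG_ALLOW_OVERWRITE_LONGER_CONTEXT_LEN"] = "false"
# Old check: any non-empty string is truthy, so "false" enabled the override.
assert bool(os.environ.get("SGLANG_ALLOW_OVERWRITE_LONGER_CONTEXT_LEN"))
# New check: "false" is correctly treated as disabled.
assert not get_bool_env_var("SGLANG_ALLOW_OVERWRITE_LONGER_CONTEXT_LEN")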
sglang/srt/constrained/__init__.py
CHANGED
@@ -1,17 +1,16 @@
-"""
-Copyright 2023-2024 SGLang Team
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-"""
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
 
 # TODO(lmzheng): make this an optional dependency
 from sglang.srt.constrained.outlines_backend import build_regex_from_object
sglang/srt/constrained/base_grammar_backend.py
CHANGED
@@ -1,18 +1,16 @@
-"""
-Copyright 2023-2024 SGLang Team
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-"""
-
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
 """The baseclass of a backend for grammar-guided constrained decoding."""
 
 from concurrent.futures import Future, ThreadPoolExecutor
sglang/srt/constrained/outlines_backend.py
CHANGED
@@ -1,18 +1,16 @@
-"""
-Copyright 2023-2024 SGLang Team
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-"""
-
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
 """Constrained decoding with outlines backend."""
 
 import json
sglang/srt/constrained/outlines_jump_forward.py
CHANGED
@@ -1,18 +1,16 @@
-"""
-Copyright 2023-2024 SGLang Team
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-"""
-
+# Copyright 2023-2024 SGLang Team
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
 """
 Faster constrained decoding with jump forward decoding / compressed finite state machine.
 Reference: https://lmsys.org/blog/2024-02-05-compressed-fsm/