sglang 0.2.10__py3-none-any.whl → 0.2.11__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sglang/__init__.py +8 -0
- sglang/api.py +10 -2
- sglang/bench_latency.py +145 -36
- sglang/check_env.py +24 -2
- sglang/global_config.py +0 -1
- sglang/lang/backend/base_backend.py +3 -1
- sglang/lang/backend/openai.py +8 -3
- sglang/lang/backend/runtime_endpoint.py +46 -29
- sglang/lang/choices.py +164 -0
- sglang/lang/interpreter.py +6 -13
- sglang/lang/ir.py +11 -2
- sglang/srt/layers/logits_processor.py +1 -1
- sglang/srt/layers/radix_attention.py +2 -5
- sglang/srt/managers/schedule_batch.py +95 -324
- sglang/srt/managers/tokenizer_manager.py +6 -3
- sglang/srt/managers/tp_worker.py +20 -22
- sglang/srt/mem_cache/memory_pool.py +9 -14
- sglang/srt/model_executor/cuda_graph_runner.py +3 -3
- sglang/srt/model_executor/forward_batch_info.py +256 -0
- sglang/srt/model_executor/model_runner.py +6 -10
- sglang/srt/models/chatglm.py +1 -1
- sglang/srt/models/commandr.py +1 -1
- sglang/srt/models/dbrx.py +1 -1
- sglang/srt/models/deepseek.py +1 -1
- sglang/srt/models/deepseek_v2.py +1 -1
- sglang/srt/models/gemma.py +1 -1
- sglang/srt/models/gemma2.py +1 -1
- sglang/srt/models/gpt_bigcode.py +1 -1
- sglang/srt/models/grok.py +1 -1
- sglang/srt/models/internlm2.py +1 -1
- sglang/srt/models/llama2.py +1 -1
- sglang/srt/models/llama_classification.py +1 -1
- sglang/srt/models/llava.py +1 -2
- sglang/srt/models/llavavid.py +1 -2
- sglang/srt/models/minicpm.py +1 -1
- sglang/srt/models/mixtral.py +1 -1
- sglang/srt/models/mixtral_quant.py +1 -1
- sglang/srt/models/qwen.py +1 -1
- sglang/srt/models/qwen2.py +1 -1
- sglang/srt/models/qwen2_moe.py +1 -1
- sglang/srt/models/stablelm.py +1 -1
- sglang/srt/openai_api/adapter.py +34 -12
- sglang/srt/openai_api/protocol.py +6 -0
- sglang/srt/server.py +24 -6
- sglang/srt/server_args.py +4 -0
- sglang/test/test_utils.py +1 -1
- sglang/version.py +1 -1
- {sglang-0.2.10.dist-info → sglang-0.2.11.dist-info}/METADATA +34 -24
- {sglang-0.2.10.dist-info → sglang-0.2.11.dist-info}/RECORD +52 -50
- {sglang-0.2.10.dist-info → sglang-0.2.11.dist-info}/LICENSE +0 -0
- {sglang-0.2.10.dist-info → sglang-0.2.11.dist-info}/WHEEL +0 -0
- {sglang-0.2.10.dist-info → sglang-0.2.11.dist-info}/top_level.txt +0 -0
sglang/lang/choices.py
ADDED
@@ -0,0 +1,164 @@
|
|
1
|
+
from abc import ABC, abstractmethod
|
2
|
+
from dataclasses import dataclass
|
3
|
+
from typing import Any, Dict, List, Optional
|
4
|
+
|
5
|
+
import numpy as np
|
6
|
+
|
7
|
+
|
8
|
+
@dataclass
class ChoicesDecision:
    """Result produced when one option is picked out of a list of choices.

    Instances are built by the sampling methods in this module, which pass the
    selected option as ``decision`` and a dictionary of supporting data as
    ``meta_info``.
    """

    # The option that was selected.
    decision: str
    # Optional supporting data for the selection; None when nothing is supplied.
    meta_info: Optional[Dict[str, Any]] = None
|
12
|
+
|
13
|
+
|
14
|
+
class ChoicesSamplingMethod(ABC):
    """Abstract base for callables that pick one option from a list of choices.

    Subclasses implement ``__call__``; every argument is keyword-only and the
    result is a ``ChoicesDecision``.
    """

    @property
    def requires_unconditional_logprobs(self) -> bool:
        """Whether this method needs ``unconditional_token_logprobs``.

        The base implementation returns False; subclasses may override it.
        """
        return False

    @abstractmethod
    def __call__(
        self,
        *,
        choices: List[str],
        normalized_prompt_logprobs: List[float],
        input_token_logprobs: List[List[Any]],
        output_token_logprobs: List[List[Any]],
        unconditional_token_logprobs: Optional[List[List[Any]]] = None,
    ) -> ChoicesDecision:
        """Choose one entry of ``choices`` and return it as a ``ChoicesDecision``.

        Abstract: concrete sampling methods must override this.
        """
|
30
|
+
|
31
|
+
|
32
|
+
class TokenLengthNormalized(ChoicesSamplingMethod):
    """Sampling method driven solely by ``normalized_prompt_logprobs``."""

    def __call__(
        self,
        *,
        choices: List[str],
        normalized_prompt_logprobs: List[float],
        input_token_logprobs: List[List[Any]],
        output_token_logprobs: List[List[Any]],
        unconditional_token_logprobs: Optional[List[List[Any]]] = None,
    ) -> ChoicesDecision:
        """Select the option with the highest token length normalized prompt logprob.

        ``np.argmax`` supplies the index, so the first maximal entry wins on a
        tie. The logprob inputs are passed through unchanged in ``meta_info``;
        ``unconditional_token_logprobs`` is accepted but not used.
        """
        winner_index = np.argmax(normalized_prompt_logprobs)
        winner = choices[winner_index]
        return ChoicesDecision(
            decision=winner,
            meta_info={
                "normalized_prompt_logprobs": normalized_prompt_logprobs,
                "input_token_logprobs": input_token_logprobs,
                "output_token_logprobs": output_token_logprobs,
            },
        )


# Shared module-level instance of the method.
token_length_normalized = TokenLengthNormalized()
|
54
|
+
|
55
|
+
|
56
|
+
class GreedyTokenSelection(ChoicesSamplingMethod):
    """Sampling method that compares options one token position at a time."""

    def __call__(
        self,
        *,
        choices: List[str],
        normalized_prompt_logprobs: List[float],
        input_token_logprobs: List[List[Any]],
        output_token_logprobs: List[List[Any]],
        unconditional_token_logprobs: Optional[List[List[Any]]] = None,
    ) -> ChoicesDecision:
        """Select the option based on greedy logprob selection.

        Options shorter than the longest one are extended with their own
        average logprob so that every option can be compared at every token
        position. The padded matrix is reported in ``meta_info`` under
        ``"greedy_logprob_matrix"``. ``unconditional_token_logprobs`` is
        accepted but not used.
        """
        option_count = len(choices)
        longest = max(map(len, input_token_logprobs))
        matrix = self._build_logprob_matrix(
            input_token_logprobs, longest, option_count
        )
        survivors = self._greedy_selection(matrix, option_count, longest)

        # When several options survive every position, the first one is taken.
        winner = choices[survivors[0]]
        return ChoicesDecision(
            decision=winner,
            meta_info={
                "normalized_prompt_logprobs": normalized_prompt_logprobs,
                "input_token_logprobs": input_token_logprobs,
                "output_token_logprobs": output_token_logprobs,
                "greedy_logprob_matrix": matrix.tolist(),
            },
        )

    def _build_logprob_matrix(self, input_token_logprobs, max_tokens, num_options):
        """Return a ``(num_options, max_tokens)`` array of per-token logprobs.

        Row ``i`` starts with element 0 of each token entry of option ``i``;
        any remaining columns are filled with the mean of those values.
        """
        matrix = np.zeros((num_options, max_tokens))
        for row, option in enumerate(input_token_logprobs):
            width = len(option)
            values = [entry[0] for entry in option]
            mean_value = np.mean(values)
            matrix[row, :width] = values
            if width < max_tokens:
                # Pad the short option with its own average logprob.
                matrix[row, width:] = mean_value
        return matrix

    def _greedy_selection(self, logprob_matrix, num_options, max_tokens):
        """Narrow the option indices column by column and return the survivors.

        At each column only the options holding that column's maximum value
        are kept; the scan stops as soon as a single option remains.
        """
        survivors = np.arange(num_options)
        for column in range(max_tokens):
            column_values = logprob_matrix[survivors, column]
            survivors = survivors[column_values == np.max(column_values)]
            if len(survivors) == 1:
                break
        return survivors


# Shared module-level instance of the method.
greedy_token_selection = GreedyTokenSelection()
|
108
|
+
|
109
|
+
|
110
|
+
class UnconditionalLikelihoodNormalized(ChoicesSamplingMethod):
    """Sampling method that scores options relative to unconditional logprobs."""

    @property
    def requires_unconditional_logprobs(self) -> bool:
        """Always True: ``__call__`` raises without unconditional logprobs."""
        return True

    def __call__(
        self,
        *,
        choices: List[str],
        normalized_prompt_logprobs: List[float],
        input_token_logprobs: List[List[Any]],
        output_token_logprobs: List[List[Any]],
        unconditional_token_logprobs: Optional[List[List[Any]]] = None,
    ) -> ChoicesDecision:
        """Select the option with the highest average token logprob once normalized
        by the unconditional token logprobs.

        The first unconditional token logprob is assumed to be None. If so, it
        is replaced with 0 for the purposes of normalization.

        Raises:
            ValueError: if ``unconditional_token_logprobs`` is None.
        """
        if unconditional_token_logprobs is None:
            raise ValueError(
                "Unconditional token logprobs are required for this method."
            )

        scores = self._normalize_logprobs(
            input_token_logprobs, unconditional_token_logprobs
        )
        winner = choices[np.argmax(scores)]
        return ChoicesDecision(
            decision=winner,
            meta_info={
                "normalized_prompt_logprobs": normalized_prompt_logprobs,
                "input_token_logprobs": input_token_logprobs,
                "output_token_logprobs": output_token_logprobs,
                "unconditional_token_logprobs": unconditional_token_logprobs,
                "normalized_unconditional_prompt_logprobs": scores,
            },
        )

    def _normalize_logprobs(self, input_token_logprobs, unconditional_token_logprobs):
        """Return one score per option: mean(input logprob - unconditional logprob).

        Element 0 of every token entry is used as the logprob. Options are
        paired positionally via ``zip``.
        """
        scores = []
        for conditional, unconditional in zip(
            input_token_logprobs, unconditional_token_logprobs
        ):
            conditional_values = np.array([entry[0] for entry in conditional])
            unconditional_values = np.array([entry[0] for entry in unconditional])
            # A falsy leading value (e.g. None) is replaced with 0.
            unconditional_values[0] = unconditional_values[0] or 0
            difference = conditional_values - unconditional_values
            scores.append(float(np.mean(difference)))
        return scores


# Shared module-level instance of the method.
unconditional_likelihood_normalized = UnconditionalLikelihoodNormalized()
|
sglang/lang/interpreter.py
CHANGED
@@ -538,24 +538,17 @@ class StreamExecutor:
|
|
538
538
|
self.stream_var_event[name].set()
|
539
539
|
|
540
540
|
def _execute_select(self, expr: SglSelect):
|
541
|
-
(
|
542
|
-
|
543
|
-
|
544
|
-
input_token_logprobs,
|
545
|
-
output_token_logprobs,
|
546
|
-
) = self.backend.select(self, expr.choices, expr.temperature)
|
541
|
+
choices_decision = self.backend.select(
|
542
|
+
self, expr.choices, expr.temperature, expr.choices_method
|
543
|
+
)
|
547
544
|
if expr.name is not None:
|
548
545
|
name = expr.name
|
549
|
-
self.variables[name] = decision
|
550
|
-
self.meta_info[name] = {
|
551
|
-
"normalized_prompt_logprobs": normalized_prompt_logprobs,
|
552
|
-
"input_token_logprobs": input_token_logprobs,
|
553
|
-
"output_token_logprobs": output_token_logprobs,
|
554
|
-
}
|
546
|
+
self.variables[name] = choices_decision.decision
|
547
|
+
self.meta_info[name] = choices_decision.meta_info
|
555
548
|
self.variable_event[name].set()
|
556
549
|
if self.stream_var_event:
|
557
550
|
self.stream_var_event[name].set()
|
558
|
-
self.text_ += decision
|
551
|
+
self.text_ += choices_decision.decision
|
559
552
|
|
560
553
|
def _execute_variable(self, expr: SglVariable):
|
561
554
|
src_executor = expr.source_stream_executor
|
sglang/lang/ir.py
CHANGED
@@ -6,6 +6,7 @@ import warnings
|
|
6
6
|
from typing import List, Optional, Union
|
7
7
|
|
8
8
|
from sglang.global_config import global_config
|
9
|
+
from sglang.lang.choices import ChoicesSamplingMethod
|
9
10
|
|
10
11
|
REGEX_INT = r"[-+]?[0-9]+"
|
11
12
|
REGEX_FLOAT = r"[-+]?[0-9]*\.?[0-9]+"
|
@@ -461,14 +462,22 @@ class SglRoleEnd(SglExpr):
|
|
461
462
|
|
462
463
|
|
463
464
|
class SglSelect(SglExpr):
|
464
|
-
|
465
|
+
|
466
|
+
def __init__(
|
467
|
+
self,
|
468
|
+
name: str,
|
469
|
+
choices: List[str],
|
470
|
+
temperature: float,
|
471
|
+
choices_method: ChoicesSamplingMethod,
|
472
|
+
):
|
465
473
|
super().__init__()
|
466
474
|
self.name = name
|
467
475
|
self.choices = choices
|
468
476
|
self.temperature = temperature
|
477
|
+
self.choices_method = choices_method
|
469
478
|
|
470
479
|
def __repr__(self):
|
471
|
-
return f"Select({self.name}, choices={self.choices})"
|
480
|
+
return f"Select({self.name}, choices={self.choices}, choices_method={self.choices_method})"
|
472
481
|
|
473
482
|
|
474
483
|
class SglFork(SglExpr):
|
@@ -25,7 +25,7 @@ from vllm.distributed import (
|
|
25
25
|
tensor_model_parallel_all_gather,
|
26
26
|
)
|
27
27
|
|
28
|
-
from sglang.srt.model_executor.
|
28
|
+
from sglang.srt.model_executor.forward_batch_info import ForwardMode, InputMetadata
|
29
29
|
|
30
30
|
|
31
31
|
@dataclasses.dataclass
|
@@ -22,11 +22,8 @@ from torch import nn
|
|
22
22
|
from sglang.global_config import global_config
|
23
23
|
from sglang.srt.layers.extend_attention import extend_attention_fwd
|
24
24
|
from sglang.srt.layers.token_attention import token_attention_fwd
|
25
|
-
from sglang.srt.model_executor.
|
26
|
-
|
27
|
-
InputMetadata,
|
28
|
-
global_server_args_dict,
|
29
|
-
)
|
25
|
+
from sglang.srt.model_executor.forward_batch_info import ForwardMode, InputMetadata
|
26
|
+
from sglang.srt.model_executor.model_runner import global_server_args_dict
|
30
27
|
|
31
28
|
|
32
29
|
class RadixAttention(nn.Module):
|