sglang 0.2.10__py3-none-any.whl → 0.2.11__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (52)
  1. sglang/__init__.py +8 -0
  2. sglang/api.py +10 -2
  3. sglang/bench_latency.py +145 -36
  4. sglang/check_env.py +24 -2
  5. sglang/global_config.py +0 -1
  6. sglang/lang/backend/base_backend.py +3 -1
  7. sglang/lang/backend/openai.py +8 -3
  8. sglang/lang/backend/runtime_endpoint.py +46 -29
  9. sglang/lang/choices.py +164 -0
  10. sglang/lang/interpreter.py +6 -13
  11. sglang/lang/ir.py +11 -2
  12. sglang/srt/layers/logits_processor.py +1 -1
  13. sglang/srt/layers/radix_attention.py +2 -5
  14. sglang/srt/managers/schedule_batch.py +95 -324
  15. sglang/srt/managers/tokenizer_manager.py +6 -3
  16. sglang/srt/managers/tp_worker.py +20 -22
  17. sglang/srt/mem_cache/memory_pool.py +9 -14
  18. sglang/srt/model_executor/cuda_graph_runner.py +3 -3
  19. sglang/srt/model_executor/forward_batch_info.py +256 -0
  20. sglang/srt/model_executor/model_runner.py +6 -10
  21. sglang/srt/models/chatglm.py +1 -1
  22. sglang/srt/models/commandr.py +1 -1
  23. sglang/srt/models/dbrx.py +1 -1
  24. sglang/srt/models/deepseek.py +1 -1
  25. sglang/srt/models/deepseek_v2.py +1 -1
  26. sglang/srt/models/gemma.py +1 -1
  27. sglang/srt/models/gemma2.py +1 -1
  28. sglang/srt/models/gpt_bigcode.py +1 -1
  29. sglang/srt/models/grok.py +1 -1
  30. sglang/srt/models/internlm2.py +1 -1
  31. sglang/srt/models/llama2.py +1 -1
  32. sglang/srt/models/llama_classification.py +1 -1
  33. sglang/srt/models/llava.py +1 -2
  34. sglang/srt/models/llavavid.py +1 -2
  35. sglang/srt/models/minicpm.py +1 -1
  36. sglang/srt/models/mixtral.py +1 -1
  37. sglang/srt/models/mixtral_quant.py +1 -1
  38. sglang/srt/models/qwen.py +1 -1
  39. sglang/srt/models/qwen2.py +1 -1
  40. sglang/srt/models/qwen2_moe.py +1 -1
  41. sglang/srt/models/stablelm.py +1 -1
  42. sglang/srt/openai_api/adapter.py +34 -12
  43. sglang/srt/openai_api/protocol.py +6 -0
  44. sglang/srt/server.py +24 -6
  45. sglang/srt/server_args.py +4 -0
  46. sglang/test/test_utils.py +1 -1
  47. sglang/version.py +1 -1
  48. {sglang-0.2.10.dist-info → sglang-0.2.11.dist-info}/METADATA +34 -24
  49. {sglang-0.2.10.dist-info → sglang-0.2.11.dist-info}/RECORD +52 -50
  50. {sglang-0.2.10.dist-info → sglang-0.2.11.dist-info}/LICENSE +0 -0
  51. {sglang-0.2.10.dist-info → sglang-0.2.11.dist-info}/WHEEL +0 -0
  52. {sglang-0.2.10.dist-info → sglang-0.2.11.dist-info}/top_level.txt +0 -0
sglang/lang/choices.py ADDED
@@ -0,0 +1,164 @@
1
+ from abc import ABC, abstractmethod
2
+ from dataclasses import dataclass
3
+ from typing import Any, Dict, List, Optional
4
+
5
+ import numpy as np
6
+
7
+
8
+ @dataclass
9
+ class ChoicesDecision:
10
+ decision: str
11
+ meta_info: Optional[Dict[str, Any]] = None
12
+
13
+
14
+ class ChoicesSamplingMethod(ABC):
15
+
16
+ @property
17
+ def requires_unconditional_logprobs(self) -> bool:
18
+ return False
19
+
20
+ @abstractmethod
21
+ def __call__(
22
+ self,
23
+ *,
24
+ choices: List[str],
25
+ normalized_prompt_logprobs: List[float],
26
+ input_token_logprobs: List[List[Any]],
27
+ output_token_logprobs: List[List[Any]],
28
+ unconditional_token_logprobs: Optional[List[List[Any]]] = None,
29
+ ) -> ChoicesDecision: ...
30
+
31
+
32
+ class TokenLengthNormalized(ChoicesSamplingMethod):
33
+
34
+ def __call__(
35
+ self,
36
+ *,
37
+ choices: List[str],
38
+ normalized_prompt_logprobs: List[float],
39
+ input_token_logprobs: List[List[Any]],
40
+ output_token_logprobs: List[List[Any]],
41
+ unconditional_token_logprobs: Optional[List[List[Any]]] = None,
42
+ ) -> ChoicesDecision:
43
+ """Select the option with the highest token length normalized prompt logprob."""
44
+ best_choice = choices[np.argmax(normalized_prompt_logprobs)]
45
+ meta_info = {
46
+ "normalized_prompt_logprobs": normalized_prompt_logprobs,
47
+ "input_token_logprobs": input_token_logprobs,
48
+ "output_token_logprobs": output_token_logprobs,
49
+ }
50
+ return ChoicesDecision(decision=best_choice, meta_info=meta_info)
51
+
52
+
53
+ token_length_normalized = TokenLengthNormalized()
54
+
55
+
56
+ class GreedyTokenSelection(ChoicesSamplingMethod):
57
+
58
+ def __call__(
59
+ self,
60
+ *,
61
+ choices: List[str],
62
+ normalized_prompt_logprobs: List[float],
63
+ input_token_logprobs: List[List[Any]],
64
+ output_token_logprobs: List[List[Any]],
65
+ unconditional_token_logprobs: Optional[List[List[Any]]] = None,
66
+ ) -> ChoicesDecision:
67
+ """Select the option based on greedy logprob selection. For overlapping options
68
+ where one option is a subset of a longer option, extend the shorter option using
69
+ its average logprob for comparison against the longer option."""
70
+
71
+ num_options = len(choices)
72
+ max_tokens = max(len(option) for option in input_token_logprobs)
73
+ logprob_matrix = self._build_logprob_matrix(
74
+ input_token_logprobs, max_tokens, num_options
75
+ )
76
+ remaining = self._greedy_selection(logprob_matrix, num_options, max_tokens)
77
+
78
+ best_choice = choices[remaining[0]]
79
+ meta_info = {
80
+ "normalized_prompt_logprobs": normalized_prompt_logprobs,
81
+ "input_token_logprobs": input_token_logprobs,
82
+ "output_token_logprobs": output_token_logprobs,
83
+ "greedy_logprob_matrix": logprob_matrix.tolist(),
84
+ }
85
+ return ChoicesDecision(decision=best_choice, meta_info=meta_info)
86
+
87
+ def _build_logprob_matrix(self, input_token_logprobs, max_tokens, num_options):
88
+ logprob_matrix = np.zeros((num_options, max_tokens))
89
+ for i, option in enumerate(input_token_logprobs):
90
+ actual_logprobs = [token[0] for token in option]
91
+ avg_logprob = np.mean(actual_logprobs)
92
+ logprob_matrix[i, : len(option)] = actual_logprobs
93
+ if len(option) < max_tokens:
94
+ logprob_matrix[i, len(option) :] = avg_logprob
95
+ return logprob_matrix
96
+
97
+ def _greedy_selection(self, logprob_matrix, num_options, max_tokens):
98
+ remaining = np.arange(num_options)
99
+ for j in range(max_tokens):
100
+ max_logprob = np.max(logprob_matrix[remaining, j])
101
+ remaining = remaining[logprob_matrix[remaining, j] == max_logprob]
102
+ if len(remaining) == 1:
103
+ break
104
+ return remaining
105
+
106
+
107
+ greedy_token_selection = GreedyTokenSelection()
108
+
109
+
110
+ class UnconditionalLikelihoodNormalized(ChoicesSamplingMethod):
111
+
112
+ @property
113
+ def requires_unconditional_logprobs(self) -> bool:
114
+ return True
115
+
116
+ def __call__(
117
+ self,
118
+ *,
119
+ choices: List[str],
120
+ normalized_prompt_logprobs: List[float],
121
+ input_token_logprobs: List[List[Any]],
122
+ output_token_logprobs: List[List[Any]],
123
+ unconditional_token_logprobs: Optional[List[List[Any]]] = None,
124
+ ) -> ChoicesDecision:
125
+ """Select the option with the highest average token logprob once normalized by
126
+ the unconditional token logprobs.
127
+
128
+ The first unconditional token logprob is assumed to be None. If so, it is
129
+ replaced with 0 for the purposes of normalization."""
130
+
131
+ if unconditional_token_logprobs is None:
132
+ raise ValueError(
133
+ "Unconditional token logprobs are required for this method."
134
+ )
135
+
136
+ normalized_unconditional_prompt_logprobs = self._normalize_logprobs(
137
+ input_token_logprobs, unconditional_token_logprobs
138
+ )
139
+
140
+ best_choice = choices[np.argmax(normalized_unconditional_prompt_logprobs)]
141
+ meta_info = {
142
+ "normalized_prompt_logprobs": normalized_prompt_logprobs,
143
+ "input_token_logprobs": input_token_logprobs,
144
+ "output_token_logprobs": output_token_logprobs,
145
+ "unconditional_token_logprobs": unconditional_token_logprobs,
146
+ "normalized_unconditional_prompt_logprobs": normalized_unconditional_prompt_logprobs,
147
+ }
148
+ return ChoicesDecision(decision=best_choice, meta_info=meta_info)
149
+
150
+ def _normalize_logprobs(self, input_token_logprobs, unconditional_token_logprobs):
151
+ normalized_unconditional_prompt_logprobs = []
152
+ for inputs, unconditionals in zip(
153
+ input_token_logprobs, unconditional_token_logprobs
154
+ ):
155
+ inputs_logprobs = np.array([token[0] for token in inputs])
156
+ unconditionals_logprobs = np.array([token[0] for token in unconditionals])
157
+ unconditionals_logprobs[0] = unconditionals_logprobs[0] or 0
158
+ normalized_unconditional_prompt_logprobs.append(
159
+ float(np.mean(inputs_logprobs - unconditionals_logprobs))
160
+ )
161
+ return normalized_unconditional_prompt_logprobs
162
+
163
+
164
+ unconditional_likelihood_normalized = UnconditionalLikelihoodNormalized()
sglang/lang/interpreter.py CHANGED
@@ -538,24 +538,17 @@ class StreamExecutor:
538
538
  self.stream_var_event[name].set()
539
539
 
540
540
  def _execute_select(self, expr: SglSelect):
541
- (
542
- decision,
543
- normalized_prompt_logprobs,
544
- input_token_logprobs,
545
- output_token_logprobs,
546
- ) = self.backend.select(self, expr.choices, expr.temperature)
541
+ choices_decision = self.backend.select(
542
+ self, expr.choices, expr.temperature, expr.choices_method
543
+ )
547
544
  if expr.name is not None:
548
545
  name = expr.name
549
- self.variables[name] = decision
550
- self.meta_info[name] = {
551
- "normalized_prompt_logprobs": normalized_prompt_logprobs,
552
- "input_token_logprobs": input_token_logprobs,
553
- "output_token_logprobs": output_token_logprobs,
554
- }
546
+ self.variables[name] = choices_decision.decision
547
+ self.meta_info[name] = choices_decision.meta_info
555
548
  self.variable_event[name].set()
556
549
  if self.stream_var_event:
557
550
  self.stream_var_event[name].set()
558
- self.text_ += decision
551
+ self.text_ += choices_decision.decision
559
552
 
560
553
  def _execute_variable(self, expr: SglVariable):
561
554
  src_executor = expr.source_stream_executor
sglang/lang/ir.py CHANGED
@@ -6,6 +6,7 @@ import warnings
6
6
  from typing import List, Optional, Union
7
7
 
8
8
  from sglang.global_config import global_config
9
+ from sglang.lang.choices import ChoicesSamplingMethod
9
10
 
10
11
  REGEX_INT = r"[-+]?[0-9]+"
11
12
  REGEX_FLOAT = r"[-+]?[0-9]*\.?[0-9]+"
@@ -461,14 +462,22 @@ class SglRoleEnd(SglExpr):
461
462
 
462
463
 
463
464
  class SglSelect(SglExpr):
464
- def __init__(self, name: str, choices: List[str], temperature: float):
465
+
466
+ def __init__(
467
+ self,
468
+ name: str,
469
+ choices: List[str],
470
+ temperature: float,
471
+ choices_method: ChoicesSamplingMethod,
472
+ ):
465
473
  super().__init__()
466
474
  self.name = name
467
475
  self.choices = choices
468
476
  self.temperature = temperature
477
+ self.choices_method = choices_method
469
478
 
470
479
  def __repr__(self):
471
- return f"Select({self.name}, choices={self.choices})"
480
+ return f"Select({self.name}, choices={self.choices}, choices_method={self.choices_method})"
472
481
 
473
482
 
474
483
  class SglFork(SglExpr):
sglang/srt/layers/logits_processor.py CHANGED
@@ -25,7 +25,7 @@ from vllm.distributed import (
25
25
  tensor_model_parallel_all_gather,
26
26
  )
27
27
 
28
- from sglang.srt.model_executor.model_runner import ForwardMode, InputMetadata
28
+ from sglang.srt.model_executor.forward_batch_info import ForwardMode, InputMetadata
29
29
 
30
30
 
31
31
  @dataclasses.dataclass
sglang/srt/layers/radix_attention.py CHANGED
@@ -22,11 +22,8 @@ from torch import nn
22
22
  from sglang.global_config import global_config
23
23
  from sglang.srt.layers.extend_attention import extend_attention_fwd
24
24
  from sglang.srt.layers.token_attention import token_attention_fwd
25
- from sglang.srt.model_executor.model_runner import (
26
- ForwardMode,
27
- InputMetadata,
28
- global_server_args_dict,
29
- )
25
+ from sglang.srt.model_executor.forward_batch_info import ForwardMode, InputMetadata
26
+ from sglang.srt.model_executor.model_runner import global_server_args_dict
30
27
 
31
28
 
32
29
  class RadixAttention(nn.Module):