sglang 0.2.14.post2__py3-none-any.whl → 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (64)
  1. sglang/api.py +2 -0
  2. sglang/bench_latency.py +39 -28
  3. sglang/lang/backend/runtime_endpoint.py +8 -4
  4. sglang/lang/interpreter.py +3 -0
  5. sglang/lang/ir.py +5 -0
  6. sglang/launch_server_llavavid.py +12 -12
  7. sglang/srt/configs/__init__.py +5 -0
  8. sglang/srt/configs/exaone.py +195 -0
  9. sglang/srt/constrained/fsm_cache.py +1 -1
  10. sglang/srt/conversation.py +24 -2
  11. sglang/srt/hf_transformers_utils.py +12 -12
  12. sglang/srt/layers/extend_attention.py +13 -8
  13. sglang/srt/layers/logits_processor.py +4 -4
  14. sglang/srt/layers/sampler.py +94 -17
  15. sglang/srt/managers/controller_multi.py +5 -5
  16. sglang/srt/managers/controller_single.py +5 -5
  17. sglang/srt/managers/io_struct.py +6 -1
  18. sglang/srt/managers/schedule_batch.py +26 -11
  19. sglang/srt/managers/tokenizer_manager.py +9 -9
  20. sglang/srt/managers/tp_worker.py +38 -26
  21. sglang/srt/model_config.py +3 -3
  22. sglang/srt/model_executor/cuda_graph_runner.py +26 -9
  23. sglang/srt/model_executor/forward_batch_info.py +68 -23
  24. sglang/srt/model_executor/model_runner.py +15 -22
  25. sglang/srt/models/chatglm.py +9 -15
  26. sglang/srt/models/commandr.py +5 -1
  27. sglang/srt/models/dbrx.py +5 -1
  28. sglang/srt/models/deepseek.py +5 -1
  29. sglang/srt/models/deepseek_v2.py +57 -25
  30. sglang/srt/models/exaone.py +368 -0
  31. sglang/srt/models/gemma.py +5 -1
  32. sglang/srt/models/gemma2.py +5 -1
  33. sglang/srt/models/gpt_bigcode.py +5 -1
  34. sglang/srt/models/grok.py +5 -1
  35. sglang/srt/models/internlm2.py +5 -1
  36. sglang/srt/models/{llama2.py → llama.py} +25 -45
  37. sglang/srt/models/llama_classification.py +34 -41
  38. sglang/srt/models/llama_embedding.py +7 -6
  39. sglang/srt/models/llava.py +8 -11
  40. sglang/srt/models/llavavid.py +5 -6
  41. sglang/srt/models/minicpm.py +5 -1
  42. sglang/srt/models/mistral.py +2 -3
  43. sglang/srt/models/mixtral.py +6 -2
  44. sglang/srt/models/mixtral_quant.py +5 -1
  45. sglang/srt/models/qwen.py +5 -2
  46. sglang/srt/models/qwen2.py +6 -2
  47. sglang/srt/models/qwen2_moe.py +5 -14
  48. sglang/srt/models/stablelm.py +5 -1
  49. sglang/srt/openai_api/adapter.py +16 -1
  50. sglang/srt/openai_api/protocol.py +5 -5
  51. sglang/srt/sampling/sampling_batch_info.py +75 -6
  52. sglang/srt/server.py +6 -6
  53. sglang/srt/utils.py +0 -3
  54. sglang/test/runners.py +1 -1
  55. sglang/test/test_programs.py +68 -0
  56. sglang/test/test_utils.py +4 -0
  57. sglang/utils.py +39 -0
  58. sglang/version.py +1 -1
  59. {sglang-0.2.14.post2.dist-info → sglang-0.3.0.dist-info}/METADATA +9 -8
  60. sglang-0.3.0.dist-info/RECORD +118 -0
  61. {sglang-0.2.14.post2.dist-info → sglang-0.3.0.dist-info}/WHEEL +1 -1
  62. sglang-0.2.14.post2.dist-info/RECORD +0 -115
  63. {sglang-0.2.14.post2.dist-info → sglang-0.3.0.dist-info}/LICENSE +0 -0
  64. {sglang-0.2.14.post2.dist-info → sglang-0.3.0.dist-info}/top_level.txt +0 -0
sglang/api.py CHANGED
@@ -78,6 +78,7 @@ def gen(
     choices: Optional[List[str]] = None,
     choices_method: Optional[ChoicesSamplingMethod] = None,
     regex: Optional[str] = None,
+    json_schema: Optional[str] = None,
 ):
     """Call the model to generate. See the meaning of the arguments in docs/en/sampling_params.md"""
 
@@ -114,6 +115,7 @@ def gen(
         return_text_in_logprobs,
         dtype,
         regex,
+        json_schema,
     )
 
 
sglang/bench_latency.py CHANGED
@@ -11,26 +11,34 @@ python -m sglang.bench_latency --model-path meta-llama/Meta-Llama-3-8B-Instruct
 ## plot the results in series of lines:
 python -m sglang.bench_latency --result-filename out.jsonl --graph-sql="select run_name, batch_size, prefill_throughput from results"
 
-
 # Usage (correctness test):
 python -m sglang.bench_latency --model-path TinyLlama/TinyLlama-1.1B-Chat-v0.4 --correct
 
 ## Reference output (of the correctness test above, can be gpu dependent):
-prefill logits (first half) tensor([[-10.0312, -9.5000, 0.8936, ..., -4.9414, -3.2402, -3.3633],
-        [-10.0312, -9.5000, 0.8936, ..., -4.9414, -3.2402, -3.3633],
-        [ -9.1875, -10.2500, 2.7109, ..., -4.3359, -4.0664, -4.1328]],
-       device='cuda:0', dtype=torch.float16)
-prefill logits (final) tensor([[-8.3203, -7.1211, 3.3379, ..., -4.9570, -4.1328, -3.4141],
-        [-8.9062, -9.0156, 4.1445, ..., -4.9922, -4.4961, -4.0742],
-        [-9.6328, -9.0547, 4.0117, ..., -5.3047, -4.7148, -4.4609]],
-       device='cuda:0', dtype=torch.float16)
-<s> The capital of France is.
+input_ids=[[1, 450, 7483, 310, 3444, 338], [1, 450, 7483, 310, 278, 3303, 13187, 290, 338], [1, 20628, 338, 263, 6575, 1460, 2462, 322, 306, 763]]
+
+prefill logits (first half): tensor([[-10.0312, -9.5000, 0.8931, ..., -4.9414, -3.2422, -3.3633],
+        [-10.0312, -9.5000, 0.8931, ..., -4.9414, -3.2422, -3.3633],
+        [ -9.1875, -10.2500, 2.7129, ..., -4.3359, -4.0664, -4.1328]],
+       device='cuda:0')
+
+prefill logits (final): tensor([[-8.3125, -7.1172, 3.3457, ..., -4.9570, -4.1328, -3.4141],
+        [-8.9141, -9.0156, 4.1445, ..., -4.9922, -4.4961, -4.0781],
+        [-9.6328, -9.0547, 4.0195, ..., -5.3047, -4.7148, -4.4570]],
+       device='cuda:0')
+
+========== Prompt 0 ==========
+<s> The capital of France is Paris.
 The capital of the United States is Washington, D.C.
 
-<s> The capital of the United Kindom is.
+
+========== Prompt 1 ==========
+<s> The capital of the United Kindom is London.
 The capital of the United Kingdom is London.
 The capital of the
-<s> Today is a sunny day and I like go for a walk in the park.
+
+========== Prompt 2 ==========
+<s> Today is a sunny day and I like to go for a walk in the park.
 I'm going to the park
 """
 
@@ -200,16 +208,16 @@ def extend(reqs, model_runner):
         tree_cache=None,
     )
     batch.prepare_for_extend(model_runner.model_config.vocab_size)
-    output = model_runner.forward(batch, ForwardMode.EXTEND)
-    next_token_ids = batch.sample(output.next_token_logits)
-    return next_token_ids, output.next_token_logits, batch
+    sample_output, logits_output = model_runner.forward(batch, ForwardMode.EXTEND)
+    next_token_ids = sample_output.batch_next_token_ids.tolist()
+    return next_token_ids, logits_output.next_token_logits, batch
 
 
 def decode(input_token_ids, batch, model_runner):
-    batch.prepare_for_decode(input_token_ids.cpu().numpy())
-    output = model_runner.forward(batch, ForwardMode.DECODE)
-    next_token_ids = batch.sample(output.next_token_logits)
-    return next_token_ids, output.next_token_logits
+    batch.prepare_for_decode(input_token_ids)
+    sample_output, logits_output = model_runner.forward(batch, ForwardMode.DECODE)
+    next_token_ids = sample_output.batch_next_token_ids.tolist()
+    return next_token_ids, logits_output.next_token_logits
 
 
 @torch.inference_mode()
@@ -225,12 +233,12 @@ def correctness_test(
 
     # Prepare inputs
     input_ids, reqs = prepare_inputs_for_correctness_test(bench_args, tokenizer)
-    rank_print(f"{input_ids=}")
+    rank_print(f"\n{input_ids=}\n")
 
     if bench_args.cut_len > 0:
         # Prefill
         next_token_ids, next_token_logits, batch = extend(reqs, model_runner)
-        rank_print("prefill logits (first half)", next_token_logits)
+        rank_print(f"prefill logits (first half): {next_token_logits} \n")
 
         # Prepare extend inputs
         reqs = prepare_extend_inputs_for_correctness_test(
@@ -239,7 +247,7 @@
 
     # Extend
     next_token_ids, next_token_logits, batch = extend(reqs, model_runner)
-    rank_print("prefill logits (final)", next_token_logits)
+    rank_print(f"prefill logits (final): {next_token_logits} \n")
 
     # Decode
     output_ids = [input_ids[i] + [next_token_ids[i]] for i in range(len(input_ids))]
@@ -250,7 +258,8 @@
 
     # Print
     for i in range(len(reqs)):
-        rank_print(tokenizer.decode(output_ids[i]))
+        rank_print(f"========== Prompt {i} ==========")
+        rank_print(tokenizer.decode(output_ids[i]), "\n")
 
 
 @torch.inference_mode()
@@ -292,6 +301,7 @@ def latency_test_run_once(
     measurement_results["prefill_throughput"] = throughput
 
     # Decode
+    decode_latencies = []
     for i in range(output_len):
         torch.cuda.synchronize()
         tic = time.time()
@@ -300,17 +310,18 @@
         latency = time.time() - tic
         tot_latency += latency
         throughput = batch_size / latency
+        decode_latencies.append(latency)
        if i < 5:
            rank_print(
                f"Decode. latency: {latency:6.5f} s, throughput: {throughput:9.2f} token/s"
            )
-    avg_decode_latency = (tot_latency - prefill_latency) / output_len
-    avg_decode_throughput = batch_size / avg_decode_latency
+    med_decode_latency = np.median(decode_latencies)
+    med_decode_throughput = batch_size / med_decode_latency
    rank_print(
-        f"Decode. avg latency: {avg_decode_latency:6.5f} s, avg throughput: {avg_decode_throughput:9.2f} token/s"
+        f"Decode. median latency: {med_decode_latency:6.5f} s, median throughput: {med_decode_throughput:9.2f} token/s"
    )
-    measurement_results["avg_decode_latency"] = avg_decode_latency
-    measurement_results["avg_decode_throughput"] = avg_decode_throughput
+    measurement_results["median_decode_latency"] = med_decode_latency
+    measurement_results["median_decode_throughput"] = med_decode_throughput
 
    throughput = (input_len + output_len) * batch_size / tot_latency
    rank_print(
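Note on the change above: the decode loop now records every step's latency and reports the median rather than the previous running average, which is less sensitive to occasional slow steps. A standalone sketch of that aggregation (latency values and batch size below are made up):

import numpy as np

decode_latencies = [0.0121, 0.0119, 0.0430, 0.0120, 0.0122]  # seconds per decode step (illustrative)
batch_size = 8
med_decode_latency = float(np.median(decode_latencies))
med_decode_throughput = batch_size / med_decode_latency
print(f"Decode. median latency: {med_decode_latency:6.5f} s, "
      f"median throughput: {med_decode_throughput:9.2f} token/s")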
sglang/lang/backend/runtime_endpoint.py CHANGED
@@ -4,7 +4,7 @@ from typing import List, Optional
 
 from sglang.global_config import global_config
 from sglang.lang.backend.base_backend import BaseBackend
-from sglang.lang.chat_template import get_chat_template_by_model_path
+from sglang.lang.chat_template import get_chat_template, get_chat_template_by_model_path
 from sglang.lang.choices import ChoicesDecision, ChoicesSamplingMethod
 from sglang.lang.interpreter import StreamExecutor
 from sglang.lang.ir import (
@@ -23,6 +23,7 @@ class RuntimeEndpoint(BaseBackend):
         base_url: str,
         api_key: Optional[str] = None,
         verify: Optional[str] = None,
+        chat_template_name: Optional[str] = None,
     ):
         super().__init__()
         self.support_concate_and_append = True
@@ -39,9 +40,12 @@ class RuntimeEndpoint(BaseBackend):
         self._assert_success(res)
         self.model_info = res.json()
 
-        self.chat_template = get_chat_template_by_model_path(
-            self.model_info["model_path"]
-        )
+        if chat_template_name:
+            self.chat_template = get_chat_template(chat_template_name)
+        else:
+            self.chat_template = get_chat_template_by_model_path(
+                self.model_info["model_path"]
+            )
 
     def get_model_name(self):
         return self.model_info["model_path"]
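For context, a minimal sketch of how the new chat_template_name argument might be used to pin the chat template instead of inferring it from the served model path (the URL and template name below are illustrative, not part of this diff):

import sglang as sgl
from sglang import RuntimeEndpoint

# Assumes a running sglang server; "chatml" is one of the templates registered
# in sglang.lang.chat_template (adjust to whatever your model expects).
backend = RuntimeEndpoint("http://localhost:30000", chat_template_name="chatml")
sgl.set_default_backend(backend)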
sglang/lang/interpreter.py CHANGED
@@ -673,6 +673,7 @@ class StreamExecutor:
             "return_text_in_logprobs",
             "dtype",
             "regex",
+            "json_schema",
         ]:
             value = getattr(sampling_params, item, None)
             if value is not None:
@@ -854,6 +855,8 @@ class ProgramState:
         return self.stream_executor.get_meta_info(name)
 
     def __iadd__(self, other):
+        if other is None:
+            raise ValueError("Tried to append None to state.")
         self.stream_executor.submit(other)
         return self
 
sglang/lang/ir.py CHANGED
@@ -30,6 +30,7 @@ class SglSamplingParams:
     logprob_start_len: Optional[int] = (None,)
     top_logprobs_num: Optional[int] = (None,)
     return_text_in_logprobs: Optional[bool] = (None,)
+    json_schema: Optional[str] = None
 
     # for constrained generation, not included in to_xxx_kwargs
     dtype: Optional[str] = None
@@ -51,6 +52,7 @@ class SglSamplingParams:
             self.logprob_start_len,
             self.top_logprobs_num,
             self.return_text_in_logprobs,
+            self.json_schema,
         )
 
     def to_openai_kwargs(self):
@@ -121,6 +123,7 @@ class SglSamplingParams:
             "presence_penalty": self.presence_penalty,
             "ignore_eos": self.ignore_eos,
             "regex": self.regex,
+            "json_schema": self.json_schema,
         }
 
 
@@ -425,6 +428,7 @@ class SglGen(SglExpr):
         return_text_in_logprobs: Optional[bool] = None,
         dtype: Optional[type] = None,
         regex: Optional[str] = None,
+        json_schema: Optional[str] = None,
     ):
         """Call the model to generate. See the meaning of the arguments in docs/en/sampling_params.md"""
         super().__init__()
@@ -446,6 +450,7 @@ class SglGen(SglExpr):
             return_text_in_logprobs=return_text_in_logprobs,
             dtype=dtype,
             regex=regex,
+            json_schema=json_schema,
         )
 
     def __repr__(self):
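The json_schema field threaded through api.py, interpreter.py, and ir.py above enables JSON-schema-constrained generation from the frontend language. A hedged sketch of how it might be called (the schema, prompt, and variable names are illustrative):

import json

import sglang as sgl

person_schema = json.dumps({
    "type": "object",
    "properties": {"name": {"type": "string"}, "age": {"type": "integer"}},
    "required": ["name", "age"],
})

@sgl.function
def describe(s, text):
    s += "Extract the person mentioned in: " + text + "\n"
    # json_schema takes a JSON Schema string; the generated text is constrained to match it.
    s += sgl.gen("person", max_tokens=128, json_schema=person_schema)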
sglang/launch_server_llavavid.py CHANGED
@@ -10,17 +10,17 @@ if __name__ == "__main__":
     args = parser.parse_args()
     server_args = ServerArgs.from_cli_args(args)
 
-    model_overide_args = {}
-    model_overide_args["mm_spatial_pool_stride"] = 2
-    model_overide_args["architectures"] = ["LlavaVidForCausalLM"]
-    model_overide_args["num_frames"] = 16
-    model_overide_args["model_type"] = "llavavid"
-    if model_overide_args["num_frames"] == 32:
-        model_overide_args["rope_scaling"] = {"factor": 2.0, "type": "linear"}
-        model_overide_args["max_sequence_length"] = 4096 * 2
-        model_overide_args["tokenizer_model_max_length"] = 4096 * 2
-        model_overide_args["model_max_length"] = 4096 * 2
+    model_override_args = {}
+    model_override_args["mm_spatial_pool_stride"] = 2
+    model_override_args["architectures"] = ["LlavaVidForCausalLM"]
+    model_override_args["num_frames"] = 16
+    model_override_args["model_type"] = "llavavid"
+    if model_override_args["num_frames"] == 32:
+        model_override_args["rope_scaling"] = {"factor": 2.0, "type": "linear"}
+        model_override_args["max_sequence_length"] = 4096 * 2
+        model_override_args["tokenizer_model_max_length"] = 4096 * 2
+        model_override_args["model_max_length"] = 4096 * 2
     if "34b" in args.model_path.lower():
-        model_overide_args["image_token_index"] = 64002
+        model_override_args["image_token_index"] = 64002
 
-    launch_server(server_args, model_overide_args, None)
+    launch_server(server_args, model_override_args, None)
sglang/srt/configs/__init__.py ADDED
@@ -0,0 +1,5 @@
+from sglang.srt.configs.exaone import ExaoneConfig
+
+__all__ = [
+    "ExaoneConfig",
+]
sglang/srt/configs/exaone.py ADDED
@@ -0,0 +1,195 @@
+# coding=utf-8
+# Copyright 2024 The LG AI Research EXAONE Lab. All rights reserved.
+# Copyright 2024 The LG CNS AI Engineering Team.
+# Copyright 2023-2024 SGLang Team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" EXAONE model configuration """
+from typing import Any, Dict
+
+from transformers.configuration_utils import PretrainedConfig
+from transformers.utils import logging
+
+logger = logging.get_logger(__name__)
+
+EXAONE_PRETRAINED_CONFIG_ARCHIVE_MAP: Dict[str, Any] = {}
+
+
+# ruff: noqa: E501
+class ExaoneConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a :class:`~transformers.ExaoneModel`. It is used to
+    instantiate a EXAONE model according to the specified arguments, defining the model architecture. Instantiating a
+    configuration with the defaults will yield a similar configuration to that of the Exaone
+
+    Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used to control the model
+    outputs. Read the documentation from :class:`~transformers.PretrainedConfig` for more information.
+
+
+    Args:
+        vocab_size (:obj:`int`, `optional`, defaults to 102400):
+            Vocabulary size of the EXAONE model. Defines the number of different tokens that can be represented by the
+            :obj:`inputs_ids` passed when calling :class:`~transformers.ExaoneModel`. Vocabulary size of the model.
+            Defines the different tokens that can be represented by the `inputs_ids` passed to the forward method of
+            :class:`~transformers.EXAONEModel`.
+        max_position_embeddings (:obj:`int`, `optional`, defaults to 2048):
+            The maximum sequence length that this model might ever be used with. Typically set this to something large
+            just in case (e.g., 512 or 1024 or 2048).
+        hidden_size (:obj:`int`, `optional`, defaults to 2048):
+            Dimensionality of the encoder layers and the pooler layer.
+        num_layers (:obj:`int`, `optional`, defaults to 32):
+            Number of hidden layers in the Transformer encoder.
+        num_attention_heads (:obj:`int`, `optional`, defaults to 32):
+            Number of attention heads for each attention layer in the Transformer decoder.
+        num_key_value_heads (:obj:`int`, `optional`):
+            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
+            `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
+            `num_key_value_heads=1 the model will use Multi Query Attention (MQA) otherwise GQA is used. When
+            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
+            by meanpooling all the original heads within that group. For more details checkout [this
+            paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to
+            `num_attention_heads`.
+        intermediate_size (:obj:`int`, `optional`, defaults to `hidden_size * 4`):
+            Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
+        activation_function (:obj:`str` or :obj:`function`, `optional`, defaults to :obj:`"silu"`):
+            The non-linear activation function (function or string) in the decoder.
+        rope_theta (:obj:`float`, `optional`, defaults to 10000.0):
+            The base period of the RoPE embeddings.
+        rope_scaling (:obj:`Dict`, `optional`):
+            Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply new rope type
+            and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value
+            accordingly.
+            Expected contents:
+                `rope_type` (:obj:`str`):
+                    The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope',
+                    'llama3'], with 'default' being the original RoPE implementation.
+                `factor` (:obj:`float`, `optional`):
+                    Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In
+                    most scaling types, a `factor` of x will enable the model to handle sequences of length x *
+                    original maximum pre-trained length.
+                `original_max_position_embeddings` (:obj:`int`, `optional`):
+                    Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during
+                    pretraining.
+                `attention_factor` (:obj:`float`, `optional`):
+                    Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention
+                    computation. If unspecified, it defaults to value recommended by the implementation, using the
+                    `factor` field to infer the suggested value.
+                `beta_fast` (:obj:`float`, `optional`):
+                    Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear
+                    ramp function. If unspecified, it defaults to 32.
+                `beta_slow` (:obj:`float`, `optional`):
+                    Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear
+                    ramp function. If unspecified, it defaults to 1.
+                `short_factor` (:obj:`List[float]`, `optional`):
+                    Only used with 'longrope'. The scaling factor to be applied to short contexts (<
+                    `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
+                    size divided by the number of attention heads divided by 2
+                `long_factor` (:obj:`List[float]`, `optional`):
+                    Only used with 'longrope'. The scaling factor to be applied to long contexts (<
+                    `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
+                    size divided by the number of attention heads divided by 2
+                `low_freq_factor` (:obj:`float`, `optional`):
+                    Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE
+                `high_freq_factor` (:obj:`float`, `optional`):
+                    Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE
+        embed_dropout (:obj:`float`, `optional`, defaults to 0.0):
+            The dropout probabilitiy for all fully connected layers in the embeddings, encoder, and pooler.
+        attention_dropout (:obj:`float`, `optional`, defaults to 0.0):
+            The dropout ratio for the attention probabilities.
+        layer_norm_epsilon (:obj:`float`, `optional`, defaults to 1e-5):
+            The epsilon used by the layer normalization layers.
+        initializer_range (:obj:`float`, `optional`, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        use_cache (:obj:`bool`, `optional`, defaults to :obj:`True`):
+            Whether or not the model should return the last key/values attentions (not used by all models). Only
+            relevant if ``configs.is_decoder=True``.
+        bos_token_id (:obj:`int`, `optional`, defaults to 0):
+            Beginning of stream token id.
+        eos_token_id (:obj:`int`, `optional`, defaults to 2):
+            End of stream token id.
+        tie_word_embeddings (:obj:`bool`, `optional`, defaults to :obj:`True`):
+            Whether to tie weight embeddings
+        gradient_checkpointing (:obj:`bool`, `optional`, defaults to :obj:`False`):
+            If True, use gradient checkpointing to save memory at the expense of slower backward pass.
+
+    Example::
+
+        >>> from transformers import EXAONEModel, ExaoneConfig
+
+        >>> # Initializing a EXAONE configuration
+        >>> configuration = ExaoneConfig()
+
+        >>> # Initializing a model from configuration
+        >>> model = EXAONEModel(configuration)
+
+        >>> # Accessing the model configuration
+        >>> configuration = model.configs
+    """
+
+    model_type = "exaone"
+    keys_to_ignore_at_inference = ["past_key_values"]
+    attribute_map = {"num_hidden_layers": "num_layers"}
+
+    def __init__(
+        self,
+        vocab_size=102400,
+        max_position_embeddings=2048,
+        hidden_size=2048,
+        num_layers=32,
+        num_attention_heads=32,
+        num_key_value_heads=None,
+        intermediate_size=None,
+        activation_function="silu",
+        rope_theta=10000.0,
+        rope_scaling=None,
+        embed_dropout=0.0,
+        attention_dropout=0.0,
+        layer_norm_epsilon=1e-5,
+        initializer_range=0.02,
+        use_cache=True,
+        bos_token_id=0,
+        eos_token_id=2,
+        tie_word_embeddings=True,
+        **kwargs
+    ):
+        self.vocab_size = vocab_size
+        self.max_position_embeddings = max_position_embeddings
+        self.hidden_size = hidden_size
+        self.num_layers = num_layers
+        self.num_attention_heads = num_attention_heads
+        self.num_hidden_layers = num_layers
+        if num_key_value_heads is None:
+            num_key_value_heads = num_attention_heads
+        self.num_key_value_heads = num_key_value_heads
+        if intermediate_size:
+            self.intermediate_size = intermediate_size
+        else:
+            self.intermediate_size = hidden_size * 4
+        self.activation_function = activation_function
+        self.embed_dropout = embed_dropout
+        self.attention_dropout = attention_dropout
+        self.layer_norm_epsilon = layer_norm_epsilon
+        self.initializer_range = initializer_range
+        self.use_cache = use_cache
+        self.rope_theta = rope_theta
+        self.rope_scaling = rope_scaling
+
+        self.bos_token_id = bos_token_id
+        self.eos_token_id = eos_token_id
+
+        super().__init__(
+            bos_token_id=bos_token_id,
+            eos_token_id=eos_token_id,
+            tie_word_embeddings=tie_word_embeddings,
+            **kwargs
+        )
sglang/srt/constrained/fsm_cache.py CHANGED
@@ -79,7 +79,7 @@ class FSMCache(BaseToolCache):
 
     def init_value(self, value):
         if self.json_schema_mode:
-            regex = build_regex_from_schema(value)
+            regex = build_regex_from_schema(value, whitespace_pattern=r"[\n\t ]*")
             return RegexGuide(regex, self.outlines_tokenizer), regex
         else:
            return RegexGuide(value, self.outlines_tokenizer)
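The extra whitespace_pattern argument restricts the whitespace the schema-derived regex accepts between JSON tokens to spaces, tabs, and newlines. A hedged sketch of the same call in isolation (the import path reflects the outlines releases sglang targets; adjust if your version differs, and the schema is illustrative):

import json

from outlines.fsm.json_schema import build_regex_from_schema

schema = json.dumps({"type": "object", "properties": {"answer": {"type": "string"}}})
# Inter-token whitespace in the constrained output is limited to the pattern below.
regex = build_regex_from_schema(schema, whitespace_pattern=r"[\n\t ]*")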
sglang/srt/conversation.py CHANGED
@@ -386,7 +386,16 @@ def generate_chat_conv(
     for message in request.messages:
         msg_role = message.role
         if msg_role == "system":
-            conv.system_message = message.content
+            if isinstance(message.content, str):
+                conv.system_message = message.content
+            elif isinstance(message.content, list):
+                if (
+                    len(message.content) != 1
+                    or getattr(message.content[0], "type", None) != "text"
+                ):
+                    raise ValueError("The system message should be a single text.")
+                else:
+                    conv.system_message = getattr(message.content[0], "text", "")
         elif msg_role == "user":
             # Handle the various types of Chat Request content types here.
             role = conv.roles[0]
@@ -414,7 +423,20 @@
                     conv.append_image(content.image_url.url)
             conv.append_message(conv.roles[0], real_content)
         elif msg_role == "assistant":
-            conv.append_message(conv.roles[1], message.content)
+            parsed_content = ""
+            if isinstance(message.content, str):
+                parsed_content = message.content
+            elif isinstance(message.content, list):
+                if (
+                    len(message.content) != 1
+                    or getattr(message.content[0], "type", None) != "text"
+                ):
+                    raise ValueError(
+                        "The assistant's response should be a single text."
+                    )
+                else:
+                    parsed_content = getattr(message.content[0], "text", "")
+            conv.append_message(conv.roles[1], parsed_content)
         else:
             raise ValueError(f"Unknown role: {msg_role}")
 
sglang/srt/hf_transformers_utils.py CHANGED
@@ -15,6 +15,7 @@ limitations under the License.
 
 """Utilities for Huggingface Transformers."""
 
+import contextlib
 import functools
 import json
 import os
@@ -34,15 +35,20 @@ from transformers import (
 try:
     from vllm.transformers_utils.configs import ChatGLMConfig, DbrxConfig
 
+    from sglang.srt.configs import ExaoneConfig
+
     _CONFIG_REGISTRY: Dict[str, Type[PretrainedConfig]] = {
         ChatGLMConfig.model_type: ChatGLMConfig,
         DbrxConfig.model_type: DbrxConfig,
+        ExaoneConfig.model_type: ExaoneConfig,
     }
 except ImportError:
     # We want this file to run without vllm dependency
     _CONFIG_REGISTRY: Dict[str, Type[PretrainedConfig]] = {}
 
-from sglang.srt.utils import is_multimodal_model
+for name, cls in _CONFIG_REGISTRY.items():
+    with contextlib.suppress(ValueError):
+        AutoConfig.register(name, cls)
 
 
 def download_from_hf(model_path: str):
@@ -52,17 +58,11 @@ def download_from_hf(model_path: str):
     return snapshot_download(model_path, allow_patterns=["*.json", "*.bin", "*.model"])
 
 
-def get_config_json(model_path: str):
-    with open(os.path.join(model_path, "config.json")) as f:
-        config = json.load(f)
-    return config
-
-
 def get_config(
     model: str,
     trust_remote_code: bool,
     revision: Optional[str] = None,
-    model_overide_args: Optional[dict] = None,
+    model_override_args: Optional[dict] = None,
 ):
     config = AutoConfig.from_pretrained(
         model, trust_remote_code=trust_remote_code, revision=revision
@@ -70,8 +70,8 @@ def get_config(
     if config.model_type in _CONFIG_REGISTRY:
         config_class = _CONFIG_REGISTRY[config.model_type]
         config = config_class.from_pretrained(model, revision=revision)
-    if model_overide_args:
-        config.update(model_overide_args)
+    if model_override_args:
+        config.update(model_override_args)
     return config
 
 
@@ -89,10 +89,10 @@ CONTEXT_LENGTH_KEYS = [
 
 
 def get_context_length(config):
-    """Get the context length of a model from a huggingface model config."""
+    """Get the context length of a model from a huggingface model configs."""
     rope_scaling = getattr(config, "rope_scaling", None)
     if rope_scaling:
-        rope_scaling_factor = config.rope_scaling["factor"]
+        rope_scaling_factor = config.rope_scaling.get("factor", 1)
         if "original_max_position_embeddings" in rope_scaling:
             rope_scaling_factor = 1
         if config.rope_scaling.get("rope_type", None) == "llama3":
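The registration loop added above makes AutoConfig aware of the custom config classes, so AutoConfig.from_pretrained can resolve model_type "exaone" to the bundled config; registering the same type twice raises ValueError, which is why it is suppressed. A minimal sketch of the same pattern in isolation:

import contextlib

from transformers import AutoConfig

from sglang.srt.configs import ExaoneConfig

# Registering twice (e.g., on module re-import) raises ValueError, so suppress it.
with contextlib.suppress(ValueError):
    AutoConfig.register(ExaoneConfig.model_type, ExaoneConfig)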
sglang/srt/layers/extend_attention.py CHANGED
@@ -127,8 +127,7 @@ def _fwd_kernel(
         )
         k = tl.load(K_Buffer + offs_buf_k, mask=mask_n[None, :], other=0.0)
 
-        qk = tl.zeros([BLOCK_M, BLOCK_N], dtype=tl.float32)
-        qk += tl.dot(q, k)
+        qk = tl.dot(q.to(k.dtype), k)
         if BLOCK_DPE > 0:
             offs_kpe = (
                 offs_kv_loc[None, :] * stride_buf_kbs
@@ -140,7 +139,7 @@
                 mask=mask_n[None, :],
                 other=0.0,
             )
-            qk += tl.dot(qpe, kpe)
+            qk += tl.dot(qpe.to(kpe.dtype), kpe)
         qk *= sm_scale
 
         if logit_cap > 0:
@@ -179,9 +178,7 @@
         )
         k = tl.load(K_Extend + offs_k, mask=mask_n[None, :], other=0.0)
 
-        qk = tl.zeros([BLOCK_M, BLOCK_N], dtype=tl.float32)
-        qk += tl.dot(q, k)
-
+        qk = tl.dot(q, k, out_dtype=tl.float32)
         if BLOCK_DPE > 0:
             offs_kpe = (
                 (cur_seq_extend_start_contiguous + start_n + offs_n[None, :])
@@ -276,9 +273,17 @@ def extend_attention_fwd(
     BLOCK_DV = Lv
 
     if CUDA_CAPABILITY[0] >= 9:
-        BLOCK_M, BLOCK_N = (128, 64)
+        if Lq <= 256:
+            BLOCK_M, BLOCK_N = (128, 64)
+        else:
+            BLOCK_M, BLOCK_N = (32, 64)
     elif CUDA_CAPABILITY[0] >= 8:
-        BLOCK_M, BLOCK_N = (128, 128) if Lq <= 128 else (64, 64)
+        if Lq <= 128:
+            BLOCK_M, BLOCK_N = (128, 128)
+        elif Lq <= 256:
+            BLOCK_M, BLOCK_N = (64, 64)
+        else:
+            BLOCK_M, BLOCK_N = (32, 64)
     else:
         BLOCK_M, BLOCK_N = (64, 64) if Lq <= 128 else (32, 32)