sglang 0.2.13__py3-none-any.whl → 0.2.14__py3-none-any.whl

This diff compares the contents of two publicly released package versions as published to a supported public registry. It is provided for informational purposes only.
Files changed (74)
  1. sglang/api.py +6 -0
  2. sglang/bench_latency.py +7 -3
  3. sglang/bench_serving.py +50 -26
  4. sglang/check_env.py +15 -0
  5. sglang/lang/chat_template.py +10 -5
  6. sglang/lang/compiler.py +4 -0
  7. sglang/lang/interpreter.py +1 -0
  8. sglang/lang/ir.py +9 -0
  9. sglang/launch_server.py +8 -1
  10. sglang/srt/conversation.py +50 -1
  11. sglang/srt/hf_transformers_utils.py +22 -23
  12. sglang/srt/layers/activation.py +24 -1
  13. sglang/srt/layers/decode_attention.py +338 -50
  14. sglang/srt/layers/fused_moe/layer.py +2 -2
  15. sglang/srt/layers/layernorm.py +3 -0
  16. sglang/srt/layers/logits_processor.py +60 -23
  17. sglang/srt/layers/radix_attention.py +3 -4
  18. sglang/srt/layers/sampler.py +154 -0
  19. sglang/srt/managers/controller_multi.py +2 -8
  20. sglang/srt/managers/controller_single.py +7 -10
  21. sglang/srt/managers/detokenizer_manager.py +20 -9
  22. sglang/srt/managers/io_struct.py +44 -11
  23. sglang/srt/managers/policy_scheduler.py +5 -2
  24. sglang/srt/managers/schedule_batch.py +52 -167
  25. sglang/srt/managers/tokenizer_manager.py +192 -83
  26. sglang/srt/managers/tp_worker.py +130 -43
  27. sglang/srt/mem_cache/memory_pool.py +82 -8
  28. sglang/srt/mm_utils.py +79 -7
  29. sglang/srt/model_executor/cuda_graph_runner.py +49 -11
  30. sglang/srt/model_executor/forward_batch_info.py +59 -27
  31. sglang/srt/model_executor/model_runner.py +210 -61
  32. sglang/srt/models/chatglm.py +4 -12
  33. sglang/srt/models/commandr.py +5 -1
  34. sglang/srt/models/dbrx.py +5 -1
  35. sglang/srt/models/deepseek.py +5 -1
  36. sglang/srt/models/deepseek_v2.py +5 -1
  37. sglang/srt/models/gemma.py +5 -1
  38. sglang/srt/models/gemma2.py +15 -7
  39. sglang/srt/models/gpt_bigcode.py +5 -1
  40. sglang/srt/models/grok.py +16 -2
  41. sglang/srt/models/internlm2.py +5 -1
  42. sglang/srt/models/llama2.py +7 -3
  43. sglang/srt/models/llama_classification.py +2 -2
  44. sglang/srt/models/llama_embedding.py +4 -0
  45. sglang/srt/models/llava.py +176 -59
  46. sglang/srt/models/minicpm.py +5 -1
  47. sglang/srt/models/mixtral.py +5 -1
  48. sglang/srt/models/mixtral_quant.py +5 -1
  49. sglang/srt/models/qwen.py +5 -2
  50. sglang/srt/models/qwen2.py +13 -3
  51. sglang/srt/models/qwen2_moe.py +5 -14
  52. sglang/srt/models/stablelm.py +5 -1
  53. sglang/srt/openai_api/adapter.py +117 -37
  54. sglang/srt/sampling/sampling_batch_info.py +209 -0
  55. sglang/srt/{sampling_params.py → sampling/sampling_params.py} +18 -0
  56. sglang/srt/server.py +84 -56
  57. sglang/srt/server_args.py +43 -15
  58. sglang/srt/utils.py +26 -16
  59. sglang/test/runners.py +23 -31
  60. sglang/test/simple_eval_common.py +9 -10
  61. sglang/test/simple_eval_gpqa.py +2 -1
  62. sglang/test/simple_eval_humaneval.py +2 -2
  63. sglang/test/simple_eval_math.py +2 -1
  64. sglang/test/simple_eval_mmlu.py +2 -1
  65. sglang/test/test_activation.py +55 -0
  66. sglang/test/test_utils.py +36 -53
  67. sglang/version.py +1 -1
  68. {sglang-0.2.13.dist-info → sglang-0.2.14.dist-info}/METADATA +92 -25
  69. sglang-0.2.14.dist-info/RECORD +114 -0
  70. {sglang-0.2.13.dist-info → sglang-0.2.14.dist-info}/WHEEL +1 -1
  71. sglang/launch_server_llavavid.py +0 -29
  72. sglang-0.2.13.dist-info/RECORD +0 -112
  73. {sglang-0.2.13.dist-info → sglang-0.2.14.dist-info}/LICENSE +0 -0
  74. {sglang-0.2.13.dist-info → sglang-0.2.14.dist-info}/top_level.txt +0 -0
sglang/srt/utils.py CHANGED
@@ -224,13 +224,18 @@ def is_multimodal_model(model):
     raise ValueError("unrecognized type")
 
 
-def is_generation_model(model_architectures):
+def is_generation_model(model_architectures, is_embedding: bool = False):
+    # We have two ways to determine whether a model is a generative model.
+    # 1. Check the model architectue
+    # 2. check the `is_embedding` server args
+
     if (
         "LlamaEmbeddingModel" in model_architectures
         or "MistralModel" in model_architectures
     ):
         return False
-    return True
+    else:
+        return not is_embedding
 
 
 def decode_video_base64(video_base64):
@@ -347,7 +352,7 @@ def suppress_other_loggers():
         logging.WARN
     )
     logging.getLogger("vllm.selector").setLevel(logging.WARN)
-    logging.getLogger("vllm.utils").setLevel(logging.WARN)
+    logging.getLogger("vllm.utils").setLevel(logging.ERROR)
 
 
 def assert_pkg_version(pkg: str, min_version: str, message: str):
@@ -369,14 +374,11 @@ def kill_parent_process():
     """Kill the parent process and all children of the parent process."""
     current_process = psutil.Process()
     parent_process = current_process.parent()
-    children = parent_process.children(recursive=True)
-    for child in children:
-        if child.pid != current_process.pid:
-            os.kill(child.pid, 9)
-    os.kill(parent_process.pid, 9)
+    kill_child_process(parent_process.pid, skip_pid=current_process.pid)
 
 
-def kill_child_process(pid, including_parent=True):
+def kill_child_process(pid, including_parent=True, skip_pid=None):
+    """Kill the process and all its children process."""
     try:
         parent = psutil.Process(pid)
     except psutil.NoSuchProcess:
@@ -384,6 +386,8 @@ def kill_child_process(pid, including_parent=True):
 
     children = parent.children(recursive=True)
     for child in children:
+        if child.pid == skip_pid:
+            continue
         try:
             child.kill()
         except psutil.NoSuchProcess:
@@ -452,10 +456,6 @@ def monkey_patch_vllm_dummy_weight_loader():
                 quant_method = getattr(module, "quant_method", None)
                 if quant_method is not None:
                     quant_method.process_weights_after_loading(module)
-                # FIXME: Remove this after Mixtral is updated
-                # to use quant_method.
-                if hasattr(module, "process_weights_after_loading"):
-                    module.process_weights_after_loading()
 
             # NOTE(woosuk): For accurate performance evaluation, we assign
             # random values to the weights.
@@ -692,7 +692,7 @@ def monkey_patch_vllm_qvk_linear_loader():
     setattr(QKVParallelLinear, "weight_loader", weight_loader_srt)
 
 
-def add_api_key_middleware(app, api_key):
+def add_api_key_middleware(app, api_key: str):
     @app.middleware("http")
     async def authentication(request, call_next):
         if request.method == "OPTIONS":
@@ -704,7 +704,7 @@ def add_api_key_middleware(app, api_key):
         return await call_next(request)
 
 
-def prepare_model(model_path):
+def prepare_model(model_path: str):
    if "SGLANG_USE_MODELSCOPE" in os.environ:
        if not os.path.exists(model_path):
            from modelscope import snapshot_download
@@ -713,7 +713,7 @@ def prepare_model(model_path):
     return model_path
 
 
-def prepare_tokenizer(tokenizer_path):
+def prepare_tokenizer(tokenizer_path: str):
     if "SGLANG_USE_MODELSCOPE" in os.environ:
         if not os.path.exists(tokenizer_path):
             from modelscope import snapshot_download
@@ -722,3 +722,13 @@ def prepare_tokenizer(tokenizer_path):
                 tokenizer_path, ignore_patterns=["*.bin", "*.safetensors"]
             )
     return tokenizer_path
+
+
+def configure_logger(server_args, prefix: str = ""):
+    format = f"[%(asctime)s{prefix}] %(message)s"
+    logging.basicConfig(
+        level=getattr(logging, server_args.log_level.upper()),
+        format=format,
+        datefmt="%H:%M:%S",
+        force=True,
+    )
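For reference, a minimal usage sketch of the helpers touched above. Only configure_logger, is_generation_model, and the skip_pid behavior of kill_child_process come from this diff; FakeServerArgs is a hypothetical stand-in for the real ServerArgs object.

# Illustrative sketch; FakeServerArgs stands in for sglang.srt.server_args.ServerArgs,
# which exposes a `log_level` field.
import logging
from dataclasses import dataclass

from sglang.srt.utils import configure_logger, is_generation_model


@dataclass
class FakeServerArgs:
    log_level: str = "info"


# New in 0.2.14: per-process logger setup with an optional prefix,
# producing lines like "[12:34:56 TP0] message".
configure_logger(FakeServerArgs(), prefix=" TP0")
logging.info("logger configured")

# New `is_embedding` override: a decoder architecture can still be served
# as an embedding model when the `is_embedding` server arg is set.
assert is_generation_model(["LlamaForCausalLM"], is_embedding=True) is False

# kill_parent_process() now delegates to
# kill_child_process(parent_pid, skip_pid=current_pid),
# so the calling process is skipped while the rest of the tree is killed.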
sglang/test/runners.py CHANGED
@@ -14,7 +14,7 @@ limitations under the License.
 """
 
 import json
-import multiprocessing
+import multiprocessing as mp
 import os
 from dataclasses import dataclass
 from typing import List, Union
@@ -24,15 +24,15 @@ import torch.nn.functional as F
 from transformers import AutoModelForCausalLM, AutoTokenizer
 
 from sglang.srt.server import Runtime
-from sglang.srt.utils import is_generation_model
+from sglang.test.test_utils import DEFAULT_PORT_FOR_SRT_TEST_RUNNER
 
 DEFAULT_PROMPTS = [
     # the output of gemma-2-2b from SRT is unstable on the commented prompt
     # "The capital of France is",
+    "Apple is red. Banana is Yellow. " * 800 + "Apple is",
     "The capital of the United Kindom is",
     "Today is a sunny day and I like",
     "AI is a field of computer science focused on",
-    "Apple is red. Banana is Yellow. " * 800 + "Apple is",
 ]
 
 dirpath = os.path.dirname(__file__)
@@ -63,44 +63,37 @@ class HFRunner:
     def __init__(
         self,
         model_path,
-        torch_dtype=torch.float16,
-        is_generation_model=None,
+        torch_dtype,
+        is_generation,
     ):
-        self.in_queue = multiprocessing.Queue()
-        self.out_queue = multiprocessing.Queue()
+        self.is_generation = is_generation
+
+        self.in_queue = mp.Queue()
+        self.out_queue = mp.Queue()
 
-        self.model_proc = multiprocessing.Process(
+        self.model_proc = mp.Process(
             target=self.start_model_process,
             args=(
                 self.in_queue,
                 self.out_queue,
                 model_path,
                 torch_dtype,
-                is_generation_model,
             ),
         )
         self.model_proc.start()
 
-    def start_model_process(
-        self, in_queue, out_queue, model_path, torch_dtype, is_generation_model
-    ):
+    def start_model_process(self, in_queue, out_queue, model_path, torch_dtype):
         self.tokenizer = AutoTokenizer.from_pretrained(
             model_path,
             torch_dtype=torch_dtype,
-            trust_remote_code=True,
         )
 
-        self.is_generation_model = (
-            is_generation_model(model_path)
-            if is_generation_model is None
-            else is_generation_model
-        )
-        if self.is_generation_model:
+        if self.is_generation:
             self.model = AutoModelForCausalLM.from_pretrained(
                 model_path,
                 torch_dtype=torch_dtype,
+                trust_remote_code=False,
                 low_cpu_mem_usage=True,
-                trust_remote_code=True,
             ).cuda()
         else:
             from sentence_transformers import SentenceTransformer
@@ -113,7 +106,7 @@ class HFRunner:
         while True:
             prompts, max_new_tokens = in_queue.get()
             if prompts is not None:
-                if self.is_generation_model:
+                if self.is_generation:
                     output_strs = []
                     prefill_logprobs = []
                     for p in prompts:
@@ -176,22 +169,20 @@ class SRTRunner:
     def __init__(
         self,
         model_path,
+        torch_dtype,
+        is_generation,
         tp_size=1,
-        torch_dtype=torch.float16,
-        is_generation_model=None,
-        port=5157,
+        port=DEFAULT_PORT_FOR_SRT_TEST_RUNNER,
     ):
-        self.is_generation_model = (
-            is_generation_model(model_path)
-            if is_generation_model is None
-            else is_generation_model
-        )
+        self.is_generation = is_generation
         self.runtime = Runtime(
             model_path=model_path,
             tp_size=tp_size,
             dtype=get_dtype_str(torch_dtype),
             port=port,
-            mem_fraction_static=0.7,
+            mem_fraction_static=0.69,
+            trust_remote_code=False,
+            is_embedding=not self.is_generation,
         )
 
     def forward(
@@ -199,7 +190,7 @@
         prompts: Union[List[str], List[torch.Tensor]] = DEFAULT_PROMPTS,
         max_new_tokens=8,
     ):
-        if self.is_generation_model:
+        if self.is_generation:
            # the return value contains logprobs from prefill
             output_strs = []
             top_input_logprobs = []
@@ -209,6 +200,7 @@
                     prompt,
                     sampling_params=sampling_params,
                     return_logprob=True,
+                    logprob_start_len=0,
                     top_logprobs_num=NUM_TOP_LOGPROBS,
                 )
                 response = json.loads(response)
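The runner API above became stricter in 0.2.14: torch_dtype and is_generation must be passed explicitly instead of being inferred from the model. A rough usage sketch, assuming a GPU and local model weights; the model name is a placeholder taken from the default test model.

# Rough sketch only; serving the model requires a GPU and downloaded weights.
import torch

from sglang.test.runners import DEFAULT_PROMPTS, HFRunner, SRTRunner

MODEL = "meta-llama/Meta-Llama-3.1-8B-Instruct"  # placeholder model path

hf_runner = HFRunner(MODEL, torch_dtype=torch.float16, is_generation=True)
srt_runner = SRTRunner(MODEL, torch_dtype=torch.float16, is_generation=True)

# Both runners expose forward(); the SRT side now also passes
# logprob_start_len=0 so prefill logprobs cover the whole prompt.
hf_outputs = hf_runner.forward(DEFAULT_PROMPTS, max_new_tokens=8)
srt_outputs = srt_runner.forward(DEFAULT_PROMPTS, max_new_tokens=8)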
sglang/test/simple_eval_common.py CHANGED
@@ -1,13 +1,12 @@
 # Adapted from https://github.com/openai/simple-evals/
 
-import base64
 import os
 import resource
 import time
 from collections import defaultdict
 from dataclasses import dataclass, field
 from multiprocessing.pool import ThreadPool
-from typing import Any, Dict, List, Tuple
+from typing import Any, Dict, List, Optional, Tuple
 
 import httpx
 import jinja2
@@ -44,8 +43,8 @@ class EvalResult:
     Result of running an evaluation (usually consisting of many samples)
     """
 
-    score: float | None  # top-line metric
-    metrics: Dict[str, float] | None  # other metrics
+    score: Optional[float]  # top-line metric
+    metrics: Optional[Dict[str, float]]  # other metrics
     htmls: List[str]  # strings of valid HTML
     convos: List[MessageList]  # sampled conversations
 
@@ -56,10 +55,10 @@ class SingleEvalResult:
     Result of evaluating a single sample
     """
 
-    score: float | None
+    score: Optional[float]
     metrics: Dict[str, float] = field(default_factory=dict)
-    html: str | None = None
-    convo: MessageList | None = None  # sampled conversation
+    html: Optional[str] = None
+    convo: Optional[MessageList] = None  # sampled conversation
 
 
 class Eval:
@@ -89,8 +88,8 @@ class ChatCompletionSampler(SamplerBase):
     def __init__(
         self,
         base_url: str = None,
-        model: str | None = None,
-        system_message: str | None = None,
+        model: Optional[str] = None,
+        system_message: Optional[str] = None,
         temperature: float = 0.0,
         max_tokens: int = 2048,
     ):
@@ -272,7 +271,7 @@ def _compute_stat(values: list, stat: str):
 def aggregate_results(
     single_eval_results: List[SingleEvalResult],
     default_stats: Tuple[str] = ("mean", "std"),
-    name2stats: Dict[str, Tuple[str]] | None = None,
+    name2stats: Optional[Dict[str, Tuple[str]]] = None,
 ) -> EvalResult:
     """
     Aggregate results from multiple evaluations into a single EvalResult.
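The repeated `X | None` → `Optional[X]` edits in the simple_eval modules are not cosmetic: PEP 604 union syntax in annotations is evaluated at import time and raises a TypeError on Python 3.8/3.9 (absent `from __future__ import annotations`), while typing.Optional works on all versions. A small illustration:

from typing import Optional

# On Python 3.8/3.9, `num_examples: int | None` in a signature fails at import
# time with "TypeError: unsupported operand type(s) for |".
# The typing.Optional spelling below is equivalent and works everywhere.
def pick_examples(num_examples: Optional[int] = None) -> str:
    return "all" if num_examples is None else f"first {num_examples}"

print(pick_examples())    # all
print(pick_examples(32))  # first 32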
sglang/test/simple_eval_gpqa.py CHANGED
@@ -8,6 +8,7 @@ https://arxiv.org/abs/2311.12022
 
 import random
 import re
+from typing import Optional
 
 import pandas
 
@@ -28,7 +29,7 @@ class GPQAEval(Eval):
     def __init__(
         self,
         filename: str,
-        num_examples: int | None,
+        num_examples: Optional[int],
         num_threads: int,
         n_repeats: int = 1,
     ):
sglang/test/simple_eval_humaneval.py CHANGED
@@ -9,7 +9,7 @@ https://arxiv.org/abs/2107.03374 https://github.com/openai/human-eval/
 import random
 import re
 from concurrent.futures import ThreadPoolExecutor, as_completed
-from typing import Dict, List
+from typing import Dict, List, Optional
 
 import tqdm
 
@@ -61,7 +61,7 @@ def evaluate_functional_correctness(
 class HumanEval(Eval):
     def __init__(
         self,
-        num_examples: int | None,
+        num_examples: Optional[int],
         num_threads: int,
         num_samples_per_task: int = 5,
         ks_passes: List[int] = [1, 2, 5],
sglang/test/simple_eval_math.py CHANGED
@@ -8,6 +8,7 @@ https://arxiv.org/abs/2103.03874
 
 import random
 import re
+from typing import Optional
 
 import pandas
 
@@ -36,7 +37,7 @@ class MathEval(Eval):
         self,
         filename: str,
         equality_checker: SamplerBase,
-        num_examples: int | None,
+        num_examples: Optional[int],
         num_threads: int,
     ):
         df = pandas.read_csv(filename)
sglang/test/simple_eval_mmlu.py CHANGED
@@ -8,6 +8,7 @@ https://arxiv.org/abs/2009.03300
 
 import random
 import re
+from typing import Optional
 
 import pandas
 
@@ -84,7 +85,7 @@ subject2category = {
 
 
 class MMLUEval(Eval):
-    def __init__(self, filename: str, num_examples: int | None, num_threads: int):
+    def __init__(self, filename: str, num_examples: Optional[int], num_threads: int):
         df = pandas.read_csv(filename)
         examples = [row.to_dict() for _, row in df.iterrows()]
         if num_examples:
sglang/test/test_activation.py ADDED
@@ -0,0 +1,55 @@
+import itertools
+import unittest
+
+import torch
+
+from sglang.srt.layers.activation import GeluAndMul
+
+
+class TestGeluAndMul(unittest.TestCase):
+    DTYPES = [torch.half, torch.bfloat16]
+    NUM_TOKENS = [7, 83, 2048]
+    D = [512, 4096, 5120, 13824]
+    SEEDS = [0]
+
+    @classmethod
+    def setUpClass(cls):
+        if not torch.cuda.is_available():
+            raise unittest.SkipTest("CUDA is not available")
+        torch.set_default_device("cuda")
+
+    def _run_gelu_and_mul_test(self, num_tokens, d, dtype, seed):
+        torch.manual_seed(seed)
+
+        layer = GeluAndMul().to(dtype=dtype)
+        x = torch.randn(num_tokens, 2 * d, dtype=dtype)
+
+        with torch.inference_mode():
+            ref_out = layer.forward_native(x)
+            out = layer.forward_cuda(x)
+
+        if dtype == torch.bfloat16:
+            atol = rtol = 1e-2
+        else:
+            atol = rtol = 1e-3
+
+        self.assertTrue(torch.allclose(out, ref_out, atol=atol, rtol=rtol))
+
+    def test_gelu_and_mul(self):
+        for params in itertools.product(
+            self.NUM_TOKENS,
+            self.D,
+            self.DTYPES,
+            self.SEEDS,
+        ):
+            with self.subTest(
+                num_tokens=params[0],
+                d=params[1],
+                dtype=params[2],
+                seed=params[3],
+            ):
+                self._run_gelu_and_mul_test(*params)
+
+
+if __name__ == "__main__":
+    unittest.main(verbosity=2)
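The new unit test compares GeluAndMul's CUDA kernel against its native PyTorch path. For intuition, here is a plain-PyTorch sketch of the gated-GELU semantics implied by the (num_tokens, 2 * d) input shape; this is an assumption about the layer's math, not the kernel shipped in this release.

# Assumed reference semantics for a "GELU and multiply" gated activation:
# split the last dimension in half, apply GELU to the first half, and use it
# to gate the second half. The real layer may use the tanh approximation.
import torch
import torch.nn.functional as F


def gelu_and_mul_reference(x: torch.Tensor) -> torch.Tensor:
    d = x.shape[-1] // 2
    return F.gelu(x[..., :d]) * x[..., d:]


x = torch.randn(7, 2 * 512)
print(gelu_and_mul_reference(x).shape)  # torch.Size([7, 512])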
sglang/test/test_utils.py CHANGED
@@ -2,11 +2,10 @@
 
 import argparse
 import asyncio
-import multiprocessing
+import os
 import subprocess
 import threading
 import time
-import unittest
 from functools import partial
 from typing import Callable, List, Optional
 
@@ -18,14 +17,19 @@ import torch.nn.functional as F
 from sglang.global_config import global_config
 from sglang.lang.backend.openai import OpenAI
 from sglang.lang.backend.runtime_endpoint import RuntimeEndpoint
+from sglang.srt.utils import kill_child_process
 from sglang.utils import get_exception_traceback
 
 DEFAULT_MODEL_NAME_FOR_TEST = "meta-llama/Meta-Llama-3.1-8B-Instruct"
 DEFAULT_MOE_MODEL_NAME_FOR_TEST = "mistralai/Mixtral-8x7B-Instruct-v0.1"
-DEFAULT_URL_FOR_MOE_TEST = "http://127.0.0.1:6157"
-DEFAULT_URL_FOR_ACCURACY_TEST = "http://127.0.0.1:7157"
-DEFAULT_URL_FOR_UNIT_TEST = "http://127.0.0.1:8157"
-DEFAULT_URL_FOR_E2E_TEST = "http://127.0.0.1:9157"
+DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH = 600
+
+if os.getenv("SGLANG_IS_IN_CI", "false") == "true":
+    DEFAULT_PORT_FOR_SRT_TEST_RUNNER = 5157
+    DEFAULT_URL_FOR_TEST = "http://127.0.0.1:6157"
+else:
+    DEFAULT_PORT_FOR_SRT_TEST_RUNNER = 1157
+    DEFAULT_URL_FOR_TEST = "http://127.0.0.1:2157"
 
 
 def call_generate_lightllm(prompt, temperature, max_tokens, stop=None, url=None):
@@ -104,31 +108,8 @@ def call_generate_srt_raw(prompt, temperature, max_tokens, stop=None, url=None):
     return pred
 
 
-def call_generate_ginfer(prompt, temperature, max_tokens, stop=None, url=None):
-    import grpc
-    from ginfer import sampler_pb2, sampler_pb2_grpc
-
-    sampler_channel = grpc.insecure_channel(url.replace("http://", ""))
-    sampler = sampler_pb2_grpc.SamplerStub(sampler_channel)
-
-    if stop is None:
-        stop_strings = None
-    else:
-        stop_strings = [stop]
-
-    sample_request = sampler_pb2.SampleTextRequest(
-        prompt=prompt,
-        settings=sampler_pb2.SampleSettings(
-            max_len=max_tokens,
-            rng_seed=0,
-            temperature=max(temperature, 1e-7),
-            nucleus_p=1,
-            stop_strings=stop_strings,
-        ),
-    )
-    stream = sampler.SampleText(sample_request)
-    response = "".join([x.text for x in stream])
-    return response
+def call_generate_gserver(prompt, temperature, max_tokens, stop=None, url=None):
+    raise NotImplementedError()
 
 
 def call_generate_guidance(
@@ -271,7 +252,7 @@ def add_common_other_args_and_parse(parser: argparse.ArgumentParser):
             "vllm",
             "outlines",
             "lightllm",
-            "ginfer",
+            "gserver",
             "guidance",
             "lmql",
             "srt-raw",
@@ -292,7 +273,7 @@ def add_common_other_args_and_parse(parser: argparse.ArgumentParser):
         "lightllm": 22000,
         "lmql": 23000,
         "srt-raw": 30000,
-        "ginfer": 9988,
+        "gserver": 9988,
     }
     args.port = default_port.get(args.backend, None)
     return args
@@ -328,8 +309,8 @@ def _get_call_generate(args: argparse.Namespace):
         return partial(call_generate_vllm, url=f"{args.host}:{args.port}/generate")
     elif args.backend == "srt-raw":
         return partial(call_generate_srt_raw, url=f"{args.host}:{args.port}/generate")
-    elif args.backend == "ginfer":
-        return partial(call_generate_ginfer, url=f"{args.host}:{args.port}")
+    elif args.backend == "gserver":
+        return partial(call_generate_gserver, url=f"{args.host}:{args.port}")
     elif args.backend == "outlines":
         return partial(call_generate_outlines, url=f"{args.host}:{args.port}/generate")
     elif args.backend == "guidance":
@@ -480,34 +461,36 @@ def run_unittest_files(files: List[str], timeout_per_file: float):
     success = True
 
     for filename in files:
+        global process
 
-        def func():
-            print(f"\n\nRun {filename}\n\n")
-            ret = unittest.main(module=None, argv=["", "-vb"] + [filename])
-
-        p = multiprocessing.Process(target=func)
-
-        def run_one_file():
-            p.start()
-            p.join()
+        def run_one_file(filename):
+            filename = os.path.join(os.getcwd(), filename)
+            print(f"\n\nRun:\npython3 {filename}\n\n", flush=True)
+            process = subprocess.Popen(
+                ["python3", filename], stdout=None, stderr=None, env=os.environ
+            )
+            process.wait()
+            return process.returncode
 
         try:
-            run_with_timeout(run_one_file, timeout=timeout_per_file)
-            if p.exitcode != 0:
-                success = False
-                break
+            ret_code = run_with_timeout(
+                run_one_file, args=(filename,), timeout=timeout_per_file
+            )
+            assert ret_code == 0
         except TimeoutError:
-            p.terminate()
+            kill_child_process(process.pid)
             time.sleep(5)
             print(
-                f"\nTimeout after {timeout_per_file} seconds when running {filename}\n"
+                f"\nTimeout after {timeout_per_file} seconds when running {filename}\n",
+                flush=True,
            )
-            return False
+            success = False
+            break
 
     if success:
-        print(f"Success. Time elapsed: {time.time() - tic:.2f}s")
+        print(f"Success. Time elapsed: {time.time() - tic:.2f}s", flush=True)
     else:
-        print(f"Fail. Time elapsed: {time.time() - tic:.2f}s")
+        print(f"Fail. Time elapsed: {time.time() - tic:.2f}s", flush=True)
 
     return 0 if success else -1
 
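Two behavioral changes above are worth calling out: each test file now runs as its own python3 subprocess (killed via kill_child_process on timeout), and the test ports and URLs are selected from the SGLANG_IS_IN_CI environment variable. A hedged driver sketch, with placeholder test-file names:

# Driver sketch; the test file names are placeholders.
import os

# Set before importing test_utils so the module-level check sees it and
# selects the CI port range (5157 / 6157).
os.environ["SGLANG_IS_IN_CI"] = "true"

from sglang.test.test_utils import DEFAULT_URL_FOR_TEST, run_unittest_files

print("tests will target:", DEFAULT_URL_FOR_TEST)  # http://127.0.0.1:6157

# Each file is launched as `python3 <file>`; on TimeoutError the child
# process tree is killed via kill_child_process.
exit_code = run_unittest_files(
    ["test_srt_endpoint.py", "test_openai_server.py"],  # placeholders
    timeout_per_file=600,
)
print("exit code:", exit_code)  # 0 on success, -1 on failure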
sglang/version.py CHANGED
@@ -1 +1 @@
-__version__ = "0.2.13"
+__version__ = "0.2.14"