sglang 0.5.3__py3-none-any.whl → 0.5.3.post1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (112)
  1. sglang/bench_one_batch.py +0 -2
  2. sglang/bench_serving.py +224 -127
  3. sglang/compile_deep_gemm.py +3 -0
  4. sglang/launch_server.py +0 -14
  5. sglang/srt/configs/__init__.py +2 -0
  6. sglang/srt/configs/falcon_h1.py +12 -58
  7. sglang/srt/configs/mamba_utils.py +117 -0
  8. sglang/srt/configs/model_config.py +68 -31
  9. sglang/srt/configs/nemotron_h.py +286 -0
  10. sglang/srt/configs/qwen3_next.py +11 -43
  11. sglang/srt/disaggregation/decode.py +7 -18
  12. sglang/srt/disaggregation/decode_kvcache_offload_manager.py +1 -1
  13. sglang/srt/disaggregation/nixl/conn.py +55 -23
  14. sglang/srt/disaggregation/prefill.py +17 -32
  15. sglang/srt/entrypoints/engine.py +2 -2
  16. sglang/srt/entrypoints/grpc_request_manager.py +10 -23
  17. sglang/srt/entrypoints/grpc_server.py +220 -80
  18. sglang/srt/entrypoints/http_server.py +49 -1
  19. sglang/srt/entrypoints/openai/protocol.py +159 -31
  20. sglang/srt/entrypoints/openai/serving_chat.py +13 -71
  21. sglang/srt/entrypoints/openai/serving_tokenize.py +144 -0
  22. sglang/srt/environ.py +4 -0
  23. sglang/srt/function_call/function_call_parser.py +8 -6
  24. sglang/srt/grpc/sglang_scheduler_pb2.py +78 -70
  25. sglang/srt/grpc/sglang_scheduler_pb2.pyi +64 -6
  26. sglang/srt/grpc/sglang_scheduler_pb2_grpc.py +88 -0
  27. sglang/srt/layers/attention/attention_registry.py +31 -22
  28. sglang/srt/layers/attention/fla/layernorm_gated.py +47 -30
  29. sglang/srt/layers/attention/flashattention_backend.py +0 -1
  30. sglang/srt/layers/attention/flashinfer_backend.py +223 -6
  31. sglang/srt/layers/attention/flashinfer_mla_backend.py +1 -1
  32. sglang/srt/layers/attention/hybrid_linear_attn_backend.py +165 -59
  33. sglang/srt/layers/attention/mamba/causal_conv1d.py +1 -1
  34. sglang/srt/layers/attention/mamba/causal_conv1d_triton.py +9 -4
  35. sglang/srt/layers/attention/mamba/mamba.py +189 -241
  36. sglang/srt/layers/attention/mamba/mamba2_metadata.py +211 -0
  37. sglang/srt/layers/attention/mamba/mixer2_rms_norm_gated.py +120 -0
  38. sglang/srt/layers/attention/mamba/ops/ssd_bmm.py +0 -50
  39. sglang/srt/layers/attention/mamba/ops/ssd_chunk_scan.py +0 -60
  40. sglang/srt/layers/attention/mamba/ops/ssd_chunk_state.py +0 -111
  41. sglang/srt/layers/attention/mamba/ops/ssd_state_passing.py +0 -11
  42. sglang/srt/layers/attention/triton_backend.py +1 -1
  43. sglang/srt/layers/logits_processor.py +136 -6
  44. sglang/srt/layers/modelopt_utils.py +11 -0
  45. sglang/srt/layers/moe/cutlass_w4a8_moe.py +18 -21
  46. sglang/srt/layers/moe/ep_moe/kernels.py +31 -452
  47. sglang/srt/layers/moe/ep_moe/layer.py +8 -286
  48. sglang/srt/layers/moe/fused_moe_triton/layer.py +6 -11
  49. sglang/srt/layers/moe/moe_runner/deep_gemm.py +304 -0
  50. sglang/srt/layers/moe/moe_runner/runner.py +3 -0
  51. sglang/srt/layers/moe/utils.py +7 -1
  52. sglang/srt/layers/quantization/__init__.py +1 -1
  53. sglang/srt/layers/quantization/fp8.py +84 -18
  54. sglang/srt/layers/quantization/modelopt_quant.py +1 -1
  55. sglang/srt/layers/quantization/quark/quark.py +3 -1
  56. sglang/srt/layers/quantization/w4afp8.py +2 -16
  57. sglang/srt/lora/lora_manager.py +0 -8
  58. sglang/srt/managers/overlap_utils.py +18 -16
  59. sglang/srt/managers/schedule_batch.py +119 -90
  60. sglang/srt/managers/schedule_policy.py +1 -1
  61. sglang/srt/managers/scheduler.py +213 -126
  62. sglang/srt/managers/scheduler_metrics_mixin.py +1 -1
  63. sglang/srt/managers/scheduler_output_processor_mixin.py +180 -86
  64. sglang/srt/managers/tokenizer_manager.py +270 -53
  65. sglang/srt/managers/tp_worker.py +39 -28
  66. sglang/srt/mem_cache/allocator.py +7 -2
  67. sglang/srt/mem_cache/chunk_cache.py +1 -1
  68. sglang/srt/mem_cache/memory_pool.py +162 -68
  69. sglang/srt/mem_cache/radix_cache.py +8 -3
  70. sglang/srt/mem_cache/swa_radix_cache.py +70 -14
  71. sglang/srt/model_executor/cuda_graph_runner.py +1 -1
  72. sglang/srt/model_executor/forward_batch_info.py +4 -18
  73. sglang/srt/model_executor/model_runner.py +55 -51
  74. sglang/srt/model_loader/__init__.py +1 -1
  75. sglang/srt/model_loader/loader.py +187 -6
  76. sglang/srt/model_loader/weight_utils.py +3 -0
  77. sglang/srt/models/falcon_h1.py +11 -9
  78. sglang/srt/models/gemma3_mm.py +16 -0
  79. sglang/srt/models/grok.py +5 -13
  80. sglang/srt/models/mixtral.py +1 -3
  81. sglang/srt/models/mllama4.py +11 -1
  82. sglang/srt/models/nemotron_h.py +514 -0
  83. sglang/srt/models/utils.py +5 -1
  84. sglang/srt/sampling/sampling_batch_info.py +11 -9
  85. sglang/srt/server_args.py +100 -33
  86. sglang/srt/speculative/eagle_worker.py +11 -13
  87. sglang/srt/speculative/ngram_worker.py +12 -11
  88. sglang/srt/speculative/spec_utils.py +0 -1
  89. sglang/srt/two_batch_overlap.py +1 -0
  90. sglang/srt/utils/common.py +18 -0
  91. sglang/srt/utils/hf_transformers_utils.py +2 -0
  92. sglang/test/longbench_v2/__init__.py +1 -0
  93. sglang/test/longbench_v2/test_longbench_v2_eval.py +238 -0
  94. sglang/test/longbench_v2/validate_longbench_v2.py +337 -0
  95. sglang/test/longbench_v2/validate_longbench_v2_standalone.py +306 -0
  96. sglang/test/run_eval.py +40 -0
  97. sglang/test/simple_eval_longbench_v2.py +332 -0
  98. sglang/test/test_cutlass_w4a8_moe.py +9 -19
  99. sglang/test/test_deterministic.py +18 -2
  100. sglang/test/test_deterministic_utils.py +81 -0
  101. sglang/test/test_disaggregation_utils.py +63 -0
  102. sglang/test/test_utils.py +32 -11
  103. sglang/version.py +1 -1
  104. {sglang-0.5.3.dist-info → sglang-0.5.3.post1.dist-info}/METADATA +4 -4
  105. {sglang-0.5.3.dist-info → sglang-0.5.3.post1.dist-info}/RECORD +109 -98
  106. sglang/srt/layers/attention/mamba/mamba_utils.py +0 -81
  107. sglang/srt/managers/tp_worker_overlap_thread.py +0 -311
  108. sglang/test/test_block_fp8_ep.py +0 -358
  109. /sglang/srt/speculative/{ngram_utils.py → ngram_info.py} +0 -0
  110. {sglang-0.5.3.dist-info → sglang-0.5.3.post1.dist-info}/WHEEL +0 -0
  111. {sglang-0.5.3.dist-info → sglang-0.5.3.post1.dist-info}/licenses/LICENSE +0 -0
  112. {sglang-0.5.3.dist-info → sglang-0.5.3.post1.dist-info}/top_level.txt +0 -0
sglang/srt/entrypoints/grpc_server.py

@@ -5,6 +5,7 @@ Uses GrpcRequestManager for orchestration without tokenization.
 
 import argparse
 import asyncio
+import dataclasses
 import logging
 import multiprocessing as mp
 import os
@@ -14,8 +15,12 @@ from concurrent import futures
 from typing import AsyncIterator, Dict, Optional, Tuple
 
 import grpc
+from google.protobuf.json_format import MessageToDict
+from google.protobuf.struct_pb2 import Struct
+from google.protobuf.timestamp_pb2 import Timestamp
 from grpc_reflection.v1alpha import reflection
 
+import sglang
 from sglang.srt.disaggregation.utils import FAKE_BOOTSTRAP_HOST, DisaggregationMode
 from sglang.srt.entrypoints.grpc_request_manager import GrpcRequestManager
 from sglang.srt.grpc import sglang_scheduler_pb2, sglang_scheduler_pb2_grpc
@@ -172,17 +177,19 @@ class SGLangSchedulerServicer(sglang_scheduler_pb2_grpc.SglangSchedulerServicer)
         request_manager: GrpcRequestManager,
         server_args: ServerArgs,
         model_info: Dict,
+        scheduler_info: Dict,
     ):
         """Initialize the standalone gRPC service."""
         self.request_manager = request_manager
         self.server_args = server_args
         self.model_info = model_info
+        self.scheduler_info = scheduler_info
         self.start_time = time.time()
 
         # Start the request manager's event loop using auto_create_handle_loop
         self.request_manager.auto_create_handle_loop()
 
-        logger.info("Standalone gRPC scheduler service initialized")
+        logger.info("gRPC scheduler servicer initialized")
 
     async def Generate(
         self,
@@ -190,7 +197,7 @@ class SGLangSchedulerServicer(sglang_scheduler_pb2_grpc.SglangSchedulerServicer)
         context: grpc.aio.ServicerContext,
     ) -> AsyncIterator[sglang_scheduler_pb2.GenerateResponse]:
         """Handle generation requests with streaming responses."""
-        logger.info(f"Generation request: {request.request_id}")
+        logger.info(f"Receive generation request: {request.request_id}")
 
         try:
             # Convert gRPC request to internal format
@@ -204,6 +211,13 @@ class SGLangSchedulerServicer(sglang_scheduler_pb2_grpc.SglangSchedulerServicer)
             )
 
             async for output in response_generator:
+                # Check if client cancelled before processing/yielding
+                if context.cancelled():
+                    logger.info(f"Client cancelled request {request.request_id}")
+                    # Explicitly abort the request to notify scheduler
+                    await self.request_manager.abort_request(request.request_id)
+                    break
+
                 # Handle batch responses (for n>1 non-streaming)
                 if isinstance(output, list):
                     for batch_output in output:
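
Note: the cancellation check above only fires when the gRPC context reports that the client dropped or cancelled its streaming call. A rough client-side sketch of triggering it (the SglangSchedulerStub name follows standard grpcio code generation, and the GenerateRequest message name and its payload are assumptions here; only request_id is filled in):

    # Hypothetical client sketch: cancelling a streaming Generate call mid-stream.
    # Real requests also need tokenized input and sampling params, omitted here.
    import asyncio
    import grpc
    from sglang.srt.grpc import sglang_scheduler_pb2, sglang_scheduler_pb2_grpc

    async def cancel_after_first_chunk(addr: str = "localhost:30000") -> None:
        async with grpc.aio.insecure_channel(addr) as channel:
            stub = sglang_scheduler_pb2_grpc.SglangSchedulerStub(channel)
            call = stub.Generate(sglang_scheduler_pb2.GenerateRequest(request_id="req-1"))
            await call.read()  # consume the first streamed chunk
            call.cancel()      # server-side context.cancelled() becomes True, request is aborted

    asyncio.run(cancel_after_first_chunk())
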
@@ -242,7 +256,10 @@ class SGLangSchedulerServicer(sglang_scheduler_pb2_grpc.SglangSchedulerServicer)
                 yield self._create_chunk_response(request.request_id, output)
 
         except Exception as e:
-            logger.error(f"Generate failed: {e}\n{get_exception_traceback()}")
+            logger.error(
+                f"Generate failed for request {request.request_id}: {e}\n"
+                f"{get_exception_traceback()}"
+            )
             yield sglang_scheduler_pb2.GenerateResponse(
                 request_id=request.request_id,
                 error=sglang_scheduler_pb2.GenerateError(
@@ -255,10 +272,10 @@ class SGLangSchedulerServicer(sglang_scheduler_pb2_grpc.SglangSchedulerServicer)
     async def Embed(
         self,
         request: sglang_scheduler_pb2.EmbedRequest,
-        context: grpc.aio.ServicerContext,
+        _context: grpc.aio.ServicerContext,
     ) -> sglang_scheduler_pb2.EmbedResponse:
         """Handle embedding requests."""
-        logger.info(f"Embedding request: {request.request_id}")
+        logger.info(f"Receive embedding request: {request.request_id}")
 
         try:
             # Convert request
@@ -285,7 +302,10 @@ class SGLangSchedulerServicer(sglang_scheduler_pb2_grpc.SglangSchedulerServicer)
             )
 
         except Exception as e:
-            logger.error(f"Embed failed: {e}\n{get_exception_traceback()}")
+            logger.error(
+                f"Embed failed for request {request.request_id}: {e}\n"
+                f"{get_exception_traceback()}"
+            )
             return sglang_scheduler_pb2.EmbedResponse(
                 request_id=request.request_id,
                 error=sglang_scheduler_pb2.EmbedError(
@@ -300,86 +320,95 @@ class SGLangSchedulerServicer(sglang_scheduler_pb2_grpc.SglangSchedulerServicer)
         request: sglang_scheduler_pb2.HealthCheckRequest,
         context: grpc.aio.ServicerContext,
     ) -> sglang_scheduler_pb2.HealthCheckResponse:
-        """Health check by generating from client input."""
-        try:
-            # Check if request manager is shutting down
-            if self.request_manager.gracefully_exit:
-                return sglang_scheduler_pb2.HealthCheckResponse(
-                    healthy=False, message="Server shutting down"
-                )
+        """
+        Check the health of the inference server by sending a special request to generate one token.
+        Similar to HTTP server's /health endpoint.
+        """
+        logger.info("Receive health check request")
 
-            # Extract tokenized input from request
-            if not request.HasField("tokenized"):
-                return sglang_scheduler_pb2.HealthCheckResponse(
-                    healthy=False, message="Tokenized input required for health check"
-                )
-
-            input_text = request.tokenized.original_text
-            input_ids = list(request.tokenized.input_ids)
+        if self.request_manager.gracefully_exit:
+            logger.info(
+                "Health check request received during shutdown. Returning unhealthy."
+            )
+            return sglang_scheduler_pb2.HealthCheckResponse(
+                healthy=False, message="Server is shutting down"
+            )
 
-            # Create health check request
-            rid = f"HEALTH_CHECK_GRPC_{time.time()}"
+        # Create a special health check request
+        rid = f"HEALTH_CHECK_{time.time()}"
+        sampling_params = SGLSamplingParams(max_new_tokens=1, temperature=0.0)
+        sampling_params.normalize(tokenizer=None)
 
-            health_request = TokenizedGenerateReqInput(
+        # Create health check request
+        is_generation = self.scheduler_info.get("is_generation", True)
+        if is_generation:
+            health_req = TokenizedGenerateReqInput(
                 rid=rid,
-                input_text=input_text,
-                input_ids=input_ids,
-                sampling_params=SGLSamplingParams(max_new_tokens=1, temperature=0.0),
-                stream=False,
-                mm_inputs=None,
+                input_text="",
+                input_ids=[0],
+                sampling_params=sampling_params,
                 return_logprob=False,
                 logprob_start_len=-1,
                 top_logprobs_num=0,
+                stream=False,
+                mm_inputs=None,
                 token_ids_logprob=None,
             )
-
+            # Set disaggregation params if needed
             if self.server_args.disaggregation_mode != DisaggregationMode.NULL:
-                health_request.bootstrap_host = FAKE_BOOTSTRAP_HOST
-                health_request.bootstrap_room = 0
-
-            logger.info(f"Sending health check request to request manager...")
-
-            # Submit and wait for response
-            output_generator = self.request_manager.generate_request(
-                health_request, request_id=rid
+                health_req.bootstrap_host = FAKE_BOOTSTRAP_HOST
+                health_req.bootstrap_room = 0
+        else:
+            health_req = TokenizedEmbeddingReqInput(
+                rid=rid,
+                input_text="",
+                input_ids=[0],
             )
 
+        # Submit health check request
+        async def run_health_check():
             try:
-                # Get first response with timeout
-                response = await asyncio.wait_for(
-                    output_generator.__anext__(), timeout=HEALTH_CHECK_TIMEOUT
-                )
-
-                # Clean up
-                if rid in self.request_manager.rid_to_state:
-                    del self.request_manager.rid_to_state[rid]
-
+                async for _ in self.request_manager.generate_request(
+                    obj=health_req,
+                    request_id=rid,
+                ):
+                    # Got at least one response, server is healthy
+                    return True
+            except Exception as e:
+                logger.warning(f"Health check failed: {e}")
+                return False
+            return False
+
+        task = asyncio.create_task(run_health_check())
+
+        # Wait for response with timeout
+        tic = time.time()
+        while time.time() < tic + HEALTH_CHECK_TIMEOUT:
+            await asyncio.sleep(1)
+            # Check if we got a response from scheduler
+            if self.request_manager.last_receive_tstamp > tic:
+                task.cancel()
+                # Clean up health check state
+                self.request_manager._cleanup_request_state(rid)
                 return sglang_scheduler_pb2.HealthCheckResponse(
                     healthy=True, message="Health check passed"
                 )
 
-            except asyncio.TimeoutError:
-                # Clean up on timeout
-                if rid in self.request_manager.rid_to_state:
-                    del self.request_manager.rid_to_state[rid]
-
-                return sglang_scheduler_pb2.HealthCheckResponse(
-                    healthy=False, message="Health check timeout"
-                )
-
-        except Exception as e:
-            logger.error(f"Health check failed: {e}")
-            return sglang_scheduler_pb2.HealthCheckResponse(
-                healthy=False, message=f"Health check error: {str(e)}"
-            )
+        # Timeout - server not responding
+        task.cancel()
+        self.request_manager._cleanup_request_state(rid)
+        logger.warning(f"Health check timeout after {HEALTH_CHECK_TIMEOUT}s")
+        return sglang_scheduler_pb2.HealthCheckResponse(
+            healthy=False, message=f"Health check timeout after {HEALTH_CHECK_TIMEOUT}s"
+        )
 
     async def Abort(
         self,
         request: sglang_scheduler_pb2.AbortRequest,
-        context: grpc.aio.ServicerContext,
+        _context: grpc.aio.ServicerContext,
     ) -> sglang_scheduler_pb2.AbortResponse:
         """Abort an ongoing request."""
-        logger.info(f"Aborting request: {request.request_id}")
+        logger.info(f"Receive abort request: {request.request_id}")
 
         try:
             success = await self.request_manager.abort_request(request.request_id)
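
The rewritten HealthCheck no longer requires a tokenized prompt from the caller: it builds its own one-token request (generate or embedding, depending on scheduler_info["is_generation"]) and then polls last_receive_tstamp until HEALTH_CHECK_TIMEOUT. A probe can therefore send an empty request; a rough client sketch (the SglangSchedulerStub name is assumed from standard grpcio codegen):

    # Hypothetical health probe; HealthCheckRequest needs no fields with the new handler.
    import asyncio
    import grpc
    from sglang.srt.grpc import sglang_scheduler_pb2, sglang_scheduler_pb2_grpc

    async def probe(addr: str = "localhost:30000") -> bool:
        async with grpc.aio.insecure_channel(addr) as channel:
            stub = sglang_scheduler_pb2_grpc.SglangSchedulerStub(channel)
            resp = await stub.HealthCheck(sglang_scheduler_pb2.HealthCheckRequest())
            return resp.healthy

    print(asyncio.run(probe()))
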
@@ -389,12 +418,98 @@ class SGLangSchedulerServicer(sglang_scheduler_pb2_grpc.SglangSchedulerServicer)
                 message=f"Request {request.request_id} {'aborted' if success else 'not found'}",
             )
         except Exception as e:
-            logger.error(f"Abort failed: {e}")
+            logger.error(
+                f"Abort failed for request {request.request_id}: {e}\n"
+                f"{get_exception_traceback()}"
+            )
             return sglang_scheduler_pb2.AbortResponse(
                 success=False,
                 message=str(e),
             )
 
+    async def GetModelInfo(
+        self,
+        _request: sglang_scheduler_pb2.GetModelInfoRequest,
+        _context: grpc.aio.ServicerContext,
+    ) -> sglang_scheduler_pb2.GetModelInfoResponse:
+        """Get model information."""
+        logger.debug("Receive model info request")
+
+        is_generation = self.scheduler_info.get("is_generation")
+        if is_generation is None:
+            is_generation = not self.server_args.is_embedding
+
+        return sglang_scheduler_pb2.GetModelInfoResponse(
+            model_path=self.server_args.model_path,
+            tokenizer_path=self.server_args.tokenizer_path or "",
+            is_generation=is_generation,
+            preferred_sampling_params=(
+                self.server_args.preferred_sampling_params or ""
+            ),
+            weight_version=self.server_args.weight_version or "",
+            served_model_name=self.server_args.served_model_name,
+            max_context_length=self.model_info["max_context_length"],
+            vocab_size=self.model_info["vocab_size"],
+            supports_vision=self.model_info["supports_vision"],
+            model_type=self.model_info["model_type"],
+            eos_token_ids=self.model_info["eos_token_ids"],
+            pad_token_id=self.model_info["pad_token_id"],
+            bos_token_id=self.model_info["bos_token_id"],
+            max_req_input_len=self.model_info["max_req_input_len"],
+        )
+
+    async def GetServerInfo(
+        self,
+        _request: sglang_scheduler_pb2.GetServerInfoRequest,
+        _context: grpc.aio.ServicerContext,
+    ) -> sglang_scheduler_pb2.GetServerInfoResponse:
+        """Get server information."""
+        logger.debug("Receive server info request")
+
+        server_args_dict = dataclasses.asdict(self.server_args)
+        server_args_struct = Struct()
+
+        def make_serializable(obj):
+            if obj is None:
+                return None
+            elif isinstance(obj, (str, int, float, bool)):
+                return obj
+            elif isinstance(obj, (list, tuple, set)):
+                return [make_serializable(item) for item in obj]
+            elif isinstance(obj, dict):
+                return {k: make_serializable(v) for k, v in obj.items()}
+            else:
+                return str(obj)
+
+        serializable_args = make_serializable(server_args_dict)
+        server_args_struct.update(serializable_args)
+
+        # Convert scheduler_info to Struct
+        scheduler_info_struct = Struct()
+        scheduler_info_struct.update(self.scheduler_info)
+
+        # Get runtime state from request manager
+        manager_state = self.request_manager.get_server_info()
+
+        # Calculate uptime
+        uptime = time.time() - self.start_time
+
+        # Create timestamp
+        start_timestamp = Timestamp()
+        start_timestamp.FromSeconds(int(self.start_time))
+
+        return sglang_scheduler_pb2.GetServerInfoResponse(
+            server_args=server_args_struct,
+            scheduler_info=scheduler_info_struct,
+            active_requests=manager_state["active_requests"],
+            is_paused=manager_state["paused"],
+            last_receive_timestamp=manager_state["last_receive_time"],
+            uptime_seconds=uptime,
+            sglang_version=sglang.__version__,
+            server_type="grpc",
+            start_time=start_timestamp,
+        )
+
     # Helper methods for request/response conversion
 
     def _convert_generate_request(
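
GetModelInfo and GetServerInfo expose roughly the same metadata the HTTP server already reports, but over gRPC; server_args and scheduler_info travel as google.protobuf.Struct values. A rough client-side read (stub name again assumed from grpcio codegen):

    # Hypothetical client sketch for the two new info RPCs.
    import asyncio
    import grpc
    from google.protobuf.json_format import MessageToDict
    from sglang.srt.grpc import sglang_scheduler_pb2, sglang_scheduler_pb2_grpc

    async def show_info(addr: str = "localhost:30000") -> None:
        async with grpc.aio.insecure_channel(addr) as channel:
            stub = sglang_scheduler_pb2_grpc.SglangSchedulerStub(channel)
            model = await stub.GetModelInfo(sglang_scheduler_pb2.GetModelInfoRequest())
            server = await stub.GetServerInfo(sglang_scheduler_pb2.GetServerInfoRequest())
            print(model.model_path, model.max_context_length, model.is_generation)
            # server_args is a Struct; convert it back to a plain dict if needed
            print(MessageToDict(server.server_args).get("model_path"))

    asyncio.run(show_info())
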
@@ -411,6 +526,7 @@ class SGLangSchedulerServicer(sglang_scheduler_pb2_grpc.SglangSchedulerServicer)
 
         # Convert sampling params
         sampling_params = self._convert_sampling_params(grpc_req.sampling_params)
+        sampling_params.normalize(tokenizer=None)
 
         # Extract disaggregated params if present
         bootstrap_host = None
@@ -483,28 +599,52 @@ class SGLangSchedulerServicer(sglang_scheduler_pb2_grpc.SglangSchedulerServicer)
         elif grpc_params.HasField("structural_tag"):
             structural_tag = grpc_params.structural_tag
 
+        # Handle optional parameters conversion
+        custom_params = (
+            MessageToDict(grpc_params.custom_params)
+            if grpc_params.HasField("custom_params")
+            else None
+        )
+        max_new_tokens = (
+            grpc_params.max_new_tokens
+            if grpc_params.HasField("max_new_tokens")
+            else None
+        )
+        stream_interval = (
+            grpc_params.stream_interval
+            if grpc_params.HasField("stream_interval")
+            else None
+        )
+        logit_bias = dict(grpc_params.logit_bias) if grpc_params.logit_bias else None
+        stop = list(grpc_params.stop) if grpc_params.stop else None
+        stop_token_ids = (
+            list(grpc_params.stop_token_ids) if grpc_params.stop_token_ids else None
+        )
+
         return SGLSamplingParams(
-            temperature=grpc_params.temperature or 1.0,
-            top_p=grpc_params.top_p or 1.0,
-            top_k=grpc_params.top_k or -1,
-            min_p=grpc_params.min_p or 0.0,
-            frequency_penalty=grpc_params.frequency_penalty or 0.0,
-            presence_penalty=grpc_params.presence_penalty or 0.0,
-            repetition_penalty=grpc_params.repetition_penalty or 1.0,
-            max_new_tokens=grpc_params.max_new_tokens or 128,
-            min_new_tokens=grpc_params.min_new_tokens or 0,
-            stop=list(grpc_params.stop) if grpc_params.stop else [],
-            stop_token_ids=(
-                list(grpc_params.stop_token_ids) if grpc_params.stop_token_ids else []
-            ),
+            temperature=grpc_params.temperature,
+            top_p=grpc_params.top_p,
+            top_k=grpc_params.top_k,
+            min_p=grpc_params.min_p,
+            frequency_penalty=grpc_params.frequency_penalty,
+            presence_penalty=grpc_params.presence_penalty,
+            repetition_penalty=grpc_params.repetition_penalty,
+            max_new_tokens=max_new_tokens,
+            min_new_tokens=grpc_params.min_new_tokens,
+            stop=stop,
+            stop_token_ids=stop_token_ids,
             skip_special_tokens=grpc_params.skip_special_tokens,
             spaces_between_special_tokens=grpc_params.spaces_between_special_tokens,
+            no_stop_trim=grpc_params.no_stop_trim,
             regex=regex,
             json_schema=json_schema,
             ebnf=ebnf_grammar,
             structural_tag=structural_tag,
-            n=grpc_params.n or 1,
+            n=grpc_params.n,
             ignore_eos=grpc_params.ignore_eos,
+            stream_interval=stream_interval,
+            logit_bias=logit_bias,
+            custom_params=custom_params,
         )
 
     def _convert_output_logprobs_to_proto(
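
The rewritten conversion drops the old "or"-style fallbacks, which silently coerced legitimate falsy values (for example, "temperature or 1.0" rewrote an explicit temperature=0.0 as 1.0); optional fields now use HasField to tell "unset" apart from a real zero, and defaulting is deferred to sampling_params.normalize(). A minimal illustration of the pitfall in plain Python, independent of any sglang types:

    # Why "value or default" is wrong for numeric sampling parameters.
    temperature = 0.0                        # caller explicitly asked for greedy decoding
    old_style = temperature or 1.0           # -> 1.0, the caller's 0.0 is silently lost
    new_style = 1.0 if temperature is None else temperature  # -> 0.0 preserved
    print(old_style, new_style)              # 1.0 0.0
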
@@ -731,6 +871,7 @@ async def serve_grpc(
         request_manager=request_manager,
         server_args=server_args,
         model_info=model_info,
+        scheduler_info=scheduler_info,
     )
     sglang_scheduler_pb2_grpc.add_SglangSchedulerServicer_to_server(servicer, server)
 
@@ -745,9 +886,8 @@ async def serve_grpc(
     listen_addr = f"{server_args.host}:{server_args.port}"
     server.add_insecure_port(listen_addr)
 
-    logger.info(f"Starting standalone gRPC server on {listen_addr}")
-
     await server.start()
+    logger.info(f"gRPC server listening on {listen_addr}")
 
     # Handle shutdown signals
     loop = asyncio.get_running_loop()
sglang/srt/entrypoints/http_server.py

@@ -52,12 +52,14 @@ from sglang.srt.entrypoints.engine import _launch_subprocesses
 from sglang.srt.entrypoints.openai.protocol import (
     ChatCompletionRequest,
     CompletionRequest,
+    DetokenizeRequest,
     EmbeddingRequest,
     ErrorResponse,
     ModelCard,
     ModelList,
     ResponsesRequest,
     ScoringRequest,
+    TokenizeRequest,
     V1RerankReqInput,
 )
 from sglang.srt.entrypoints.openai.serving_chat import OpenAIServingChat
@@ -65,6 +67,10 @@ from sglang.srt.entrypoints.openai.serving_completions import OpenAIServingCompl
 from sglang.srt.entrypoints.openai.serving_embedding import OpenAIServingEmbedding
 from sglang.srt.entrypoints.openai.serving_rerank import OpenAIServingRerank
 from sglang.srt.entrypoints.openai.serving_score import OpenAIServingScore
+from sglang.srt.entrypoints.openai.serving_tokenize import (
+    OpenAIServingDetokenize,
+    OpenAIServingTokenize,
+)
 from sglang.srt.function_call.function_call_parser import FunctionCallParser
 from sglang.srt.managers.io_struct import (
     AbortReq,
@@ -229,6 +235,12 @@ async def lifespan(fast_api_app: FastAPI):
     fast_api_app.state.openai_serving_rerank = OpenAIServingRerank(
         _global_state.tokenizer_manager
     )
+    fast_api_app.state.openai_serving_tokenize = OpenAIServingTokenize(
+        _global_state.tokenizer_manager
+    )
+    fast_api_app.state.openai_serving_detokenize = OpenAIServingDetokenize(
+        _global_state.tokenizer_manager
+    )
 
     server_args: ServerArgs = fast_api_app.server_args
 
@@ -494,7 +506,7 @@ async def get_load():
 
 
 # example usage:
-# curl -s -X POST http://localhost:30000/set_internal_state -H "Content-Type: application/json" -d '{"server_args": {"max_micro_batch_size": 8}}'
+# curl -s -X POST http://localhost:30000/set_internal_state -H "Content-Type: application/json" -d '{"server_args": {"pp_max_micro_batch_size": 8}}'
 @app.api_route("/set_internal_state", methods=["POST", "PUT"])
 async def set_internal_state(obj: SetInternalStateReq, request: Request):
     res = await _global_state.tokenizer_manager.set_internal_state(obj)
@@ -1070,6 +1082,42 @@ async def openai_v1_embeddings(request: EmbeddingRequest, raw_request: Request):
     )
 
 
+@app.post(
+    "/v1/tokenize",
+    response_class=ORJSONResponse,
+    dependencies=[Depends(validate_json_request)],
+)
+@app.post(
+    "/tokenize",
+    response_class=ORJSONResponse,
+    dependencies=[Depends(validate_json_request)],
+    include_in_schema=False,
+)
+async def openai_v1_tokenize(request: TokenizeRequest, raw_request: Request):
+    """OpenAI-compatible tokenization endpoint."""
+    return await raw_request.app.state.openai_serving_tokenize.handle_request(
+        request, raw_request
+    )
+
+
+@app.post(
+    "/v1/detokenize",
+    response_class=ORJSONResponse,
+    dependencies=[Depends(validate_json_request)],
+)
+@app.post(
+    "/detokenize",
+    response_class=ORJSONResponse,
+    dependencies=[Depends(validate_json_request)],
+    include_in_schema=False,
+)
+async def openai_v1_detokenize(request: DetokenizeRequest, raw_request: Request):
+    """OpenAI-compatible detokenization endpoint."""
+    return await raw_request.app.state.openai_serving_detokenize.handle_request(
+        request, raw_request
+    )
+
+
 @app.get("/v1/models", response_class=ORJSONResponse)
 async def available_models():
     """Show available models. OpenAI-compatible endpoint."""