sglang 0.5.3__py3-none-any.whl → 0.5.3.post1__py3-none-any.whl
This diff compares the contents of publicly released versions of the package as they appear in their public registries. It is provided for informational purposes only.
- sglang/bench_one_batch.py +0 -2
- sglang/bench_serving.py +224 -127
- sglang/compile_deep_gemm.py +3 -0
- sglang/launch_server.py +0 -14
- sglang/srt/configs/__init__.py +2 -0
- sglang/srt/configs/falcon_h1.py +12 -58
- sglang/srt/configs/mamba_utils.py +117 -0
- sglang/srt/configs/model_config.py +68 -31
- sglang/srt/configs/nemotron_h.py +286 -0
- sglang/srt/configs/qwen3_next.py +11 -43
- sglang/srt/disaggregation/decode.py +7 -18
- sglang/srt/disaggregation/decode_kvcache_offload_manager.py +1 -1
- sglang/srt/disaggregation/nixl/conn.py +55 -23
- sglang/srt/disaggregation/prefill.py +17 -32
- sglang/srt/entrypoints/engine.py +2 -2
- sglang/srt/entrypoints/grpc_request_manager.py +10 -23
- sglang/srt/entrypoints/grpc_server.py +220 -80
- sglang/srt/entrypoints/http_server.py +49 -1
- sglang/srt/entrypoints/openai/protocol.py +159 -31
- sglang/srt/entrypoints/openai/serving_chat.py +13 -71
- sglang/srt/entrypoints/openai/serving_tokenize.py +144 -0
- sglang/srt/environ.py +4 -0
- sglang/srt/function_call/function_call_parser.py +8 -6
- sglang/srt/grpc/sglang_scheduler_pb2.py +78 -70
- sglang/srt/grpc/sglang_scheduler_pb2.pyi +64 -6
- sglang/srt/grpc/sglang_scheduler_pb2_grpc.py +88 -0
- sglang/srt/layers/attention/attention_registry.py +31 -22
- sglang/srt/layers/attention/fla/layernorm_gated.py +47 -30
- sglang/srt/layers/attention/flashattention_backend.py +0 -1
- sglang/srt/layers/attention/flashinfer_backend.py +223 -6
- sglang/srt/layers/attention/flashinfer_mla_backend.py +1 -1
- sglang/srt/layers/attention/hybrid_linear_attn_backend.py +165 -59
- sglang/srt/layers/attention/mamba/causal_conv1d.py +1 -1
- sglang/srt/layers/attention/mamba/causal_conv1d_triton.py +9 -4
- sglang/srt/layers/attention/mamba/mamba.py +189 -241
- sglang/srt/layers/attention/mamba/mamba2_metadata.py +211 -0
- sglang/srt/layers/attention/mamba/mixer2_rms_norm_gated.py +120 -0
- sglang/srt/layers/attention/mamba/ops/ssd_bmm.py +0 -50
- sglang/srt/layers/attention/mamba/ops/ssd_chunk_scan.py +0 -60
- sglang/srt/layers/attention/mamba/ops/ssd_chunk_state.py +0 -111
- sglang/srt/layers/attention/mamba/ops/ssd_state_passing.py +0 -11
- sglang/srt/layers/attention/triton_backend.py +1 -1
- sglang/srt/layers/logits_processor.py +136 -6
- sglang/srt/layers/modelopt_utils.py +11 -0
- sglang/srt/layers/moe/cutlass_w4a8_moe.py +18 -21
- sglang/srt/layers/moe/ep_moe/kernels.py +31 -452
- sglang/srt/layers/moe/ep_moe/layer.py +8 -286
- sglang/srt/layers/moe/fused_moe_triton/layer.py +6 -11
- sglang/srt/layers/moe/moe_runner/deep_gemm.py +304 -0
- sglang/srt/layers/moe/moe_runner/runner.py +3 -0
- sglang/srt/layers/moe/utils.py +7 -1
- sglang/srt/layers/quantization/__init__.py +1 -1
- sglang/srt/layers/quantization/fp8.py +84 -18
- sglang/srt/layers/quantization/modelopt_quant.py +1 -1
- sglang/srt/layers/quantization/quark/quark.py +3 -1
- sglang/srt/layers/quantization/w4afp8.py +2 -16
- sglang/srt/lora/lora_manager.py +0 -8
- sglang/srt/managers/overlap_utils.py +18 -16
- sglang/srt/managers/schedule_batch.py +119 -90
- sglang/srt/managers/schedule_policy.py +1 -1
- sglang/srt/managers/scheduler.py +213 -126
- sglang/srt/managers/scheduler_metrics_mixin.py +1 -1
- sglang/srt/managers/scheduler_output_processor_mixin.py +180 -86
- sglang/srt/managers/tokenizer_manager.py +270 -53
- sglang/srt/managers/tp_worker.py +39 -28
- sglang/srt/mem_cache/allocator.py +7 -2
- sglang/srt/mem_cache/chunk_cache.py +1 -1
- sglang/srt/mem_cache/memory_pool.py +162 -68
- sglang/srt/mem_cache/radix_cache.py +8 -3
- sglang/srt/mem_cache/swa_radix_cache.py +70 -14
- sglang/srt/model_executor/cuda_graph_runner.py +1 -1
- sglang/srt/model_executor/forward_batch_info.py +4 -18
- sglang/srt/model_executor/model_runner.py +55 -51
- sglang/srt/model_loader/__init__.py +1 -1
- sglang/srt/model_loader/loader.py +187 -6
- sglang/srt/model_loader/weight_utils.py +3 -0
- sglang/srt/models/falcon_h1.py +11 -9
- sglang/srt/models/gemma3_mm.py +16 -0
- sglang/srt/models/grok.py +5 -13
- sglang/srt/models/mixtral.py +1 -3
- sglang/srt/models/mllama4.py +11 -1
- sglang/srt/models/nemotron_h.py +514 -0
- sglang/srt/models/utils.py +5 -1
- sglang/srt/sampling/sampling_batch_info.py +11 -9
- sglang/srt/server_args.py +100 -33
- sglang/srt/speculative/eagle_worker.py +11 -13
- sglang/srt/speculative/ngram_worker.py +12 -11
- sglang/srt/speculative/spec_utils.py +0 -1
- sglang/srt/two_batch_overlap.py +1 -0
- sglang/srt/utils/common.py +18 -0
- sglang/srt/utils/hf_transformers_utils.py +2 -0
- sglang/test/longbench_v2/__init__.py +1 -0
- sglang/test/longbench_v2/test_longbench_v2_eval.py +238 -0
- sglang/test/longbench_v2/validate_longbench_v2.py +337 -0
- sglang/test/longbench_v2/validate_longbench_v2_standalone.py +306 -0
- sglang/test/run_eval.py +40 -0
- sglang/test/simple_eval_longbench_v2.py +332 -0
- sglang/test/test_cutlass_w4a8_moe.py +9 -19
- sglang/test/test_deterministic.py +18 -2
- sglang/test/test_deterministic_utils.py +81 -0
- sglang/test/test_disaggregation_utils.py +63 -0
- sglang/test/test_utils.py +32 -11
- sglang/version.py +1 -1
- {sglang-0.5.3.dist-info → sglang-0.5.3.post1.dist-info}/METADATA +4 -4
- {sglang-0.5.3.dist-info → sglang-0.5.3.post1.dist-info}/RECORD +109 -98
- sglang/srt/layers/attention/mamba/mamba_utils.py +0 -81
- sglang/srt/managers/tp_worker_overlap_thread.py +0 -311
- sglang/test/test_block_fp8_ep.py +0 -358
- /sglang/srt/speculative/{ngram_utils.py → ngram_info.py} +0 -0
- {sglang-0.5.3.dist-info → sglang-0.5.3.post1.dist-info}/WHEEL +0 -0
- {sglang-0.5.3.dist-info → sglang-0.5.3.post1.dist-info}/licenses/LICENSE +0 -0
- {sglang-0.5.3.dist-info → sglang-0.5.3.post1.dist-info}/top_level.txt +0 -0
sglang/srt/entrypoints/grpc_server.py

```diff
@@ -5,6 +5,7 @@ Uses GrpcRequestManager for orchestration without tokenization.
 
 import argparse
 import asyncio
+import dataclasses
 import logging
 import multiprocessing as mp
 import os
@@ -14,8 +15,12 @@ from concurrent import futures
 from typing import AsyncIterator, Dict, Optional, Tuple
 
 import grpc
+from google.protobuf.json_format import MessageToDict
+from google.protobuf.struct_pb2 import Struct
+from google.protobuf.timestamp_pb2 import Timestamp
 from grpc_reflection.v1alpha import reflection
 
+import sglang
 from sglang.srt.disaggregation.utils import FAKE_BOOTSTRAP_HOST, DisaggregationMode
 from sglang.srt.entrypoints.grpc_request_manager import GrpcRequestManager
 from sglang.srt.grpc import sglang_scheduler_pb2, sglang_scheduler_pb2_grpc
@@ -172,17 +177,19 @@ class SGLangSchedulerServicer(sglang_scheduler_pb2_grpc.SglangSchedulerServicer)
         request_manager: GrpcRequestManager,
         server_args: ServerArgs,
         model_info: Dict,
+        scheduler_info: Dict,
     ):
         """Initialize the standalone gRPC service."""
         self.request_manager = request_manager
         self.server_args = server_args
         self.model_info = model_info
+        self.scheduler_info = scheduler_info
         self.start_time = time.time()
 
         # Start the request manager's event loop using auto_create_handle_loop
         self.request_manager.auto_create_handle_loop()
 
-        logger.info("
+        logger.info("gRPC scheduler servicer initialized")
 
     async def Generate(
         self,
@@ -190,7 +197,7 @@ class SGLangSchedulerServicer(sglang_scheduler_pb2_grpc.SglangSchedulerServicer)
         context: grpc.aio.ServicerContext,
     ) -> AsyncIterator[sglang_scheduler_pb2.GenerateResponse]:
         """Handle generation requests with streaming responses."""
-        logger.info(f"
+        logger.info(f"Receive generation request: {request.request_id}")
 
         try:
             # Convert gRPC request to internal format
@@ -204,6 +211,13 @@ class SGLangSchedulerServicer(sglang_scheduler_pb2_grpc.SglangSchedulerServicer)
             )
 
             async for output in response_generator:
+                # Check if client cancelled before processing/yielding
+                if context.cancelled():
+                    logger.info(f"Client cancelled request {request.request_id}")
+                    # Explicitly abort the request to notify scheduler
+                    await self.request_manager.abort_request(request.request_id)
+                    break
+
                 # Handle batch responses (for n>1 non-streaming)
                 if isinstance(output, list):
                     for batch_output in output:
@@ -242,7 +256,10 @@ class SGLangSchedulerServicer(sglang_scheduler_pb2_grpc.SglangSchedulerServicer)
                     yield self._create_chunk_response(request.request_id, output)
 
         except Exception as e:
-            logger.error(
+            logger.error(
+                f"Generate failed for request {request.request_id}: {e}\n"
+                f"{get_exception_traceback()}"
+            )
             yield sglang_scheduler_pb2.GenerateResponse(
                 request_id=request.request_id,
                 error=sglang_scheduler_pb2.GenerateError(
@@ -255,10 +272,10 @@ class SGLangSchedulerServicer(sglang_scheduler_pb2_grpc.SglangSchedulerServicer)
     async def Embed(
         self,
         request: sglang_scheduler_pb2.EmbedRequest,
-
+        _context: grpc.aio.ServicerContext,
     ) -> sglang_scheduler_pb2.EmbedResponse:
         """Handle embedding requests."""
-        logger.info(f"
+        logger.info(f"Receive embedding request: {request.request_id}")
 
         try:
             # Convert request
@@ -285,7 +302,10 @@ class SGLangSchedulerServicer(sglang_scheduler_pb2_grpc.SglangSchedulerServicer)
             )
 
         except Exception as e:
-            logger.error(
+            logger.error(
+                f"Embed failed for request {request.request_id}: {e}\n"
+                f"{get_exception_traceback()}"
+            )
             return sglang_scheduler_pb2.EmbedResponse(
                 request_id=request.request_id,
                 error=sglang_scheduler_pb2.EmbedError(
@@ -300,86 +320,95 @@ class SGLangSchedulerServicer(sglang_scheduler_pb2_grpc.SglangSchedulerServicer)
         request: sglang_scheduler_pb2.HealthCheckRequest,
         context: grpc.aio.ServicerContext,
     ) -> sglang_scheduler_pb2.HealthCheckResponse:
-        """
-
-
-
-
-            healthy=False, message="Server shutting down"
-        )
+        """
+        Check the health of the inference server by sending a special request to generate one token.
+        Similar to HTTP server's /health endpoint.
+        """
+        logger.info("Receive health check request")
 
-
-
-
-
-
-
-
-        input_ids = list(request.tokenized.input_ids)
+        if self.request_manager.gracefully_exit:
+            logger.info(
+                "Health check request received during shutdown. Returning unhealthy."
+            )
+            return sglang_scheduler_pb2.HealthCheckResponse(
+                healthy=False, message="Server is shutting down"
+            )
 
-
-
+        # Create a special health check request
+        rid = f"HEALTH_CHECK_{time.time()}"
+        sampling_params = SGLSamplingParams(max_new_tokens=1, temperature=0.0)
+        sampling_params.normalize(tokenizer=None)
 
-
+        # Create health check request
+        is_generation = self.scheduler_info.get("is_generation", True)
+        if is_generation:
+            health_req = TokenizedGenerateReqInput(
                 rid=rid,
-                input_text=
-                input_ids=
-                sampling_params=
-                stream=False,
-                mm_inputs=None,
+                input_text="",
+                input_ids=[0],
+                sampling_params=sampling_params,
                 return_logprob=False,
                 logprob_start_len=-1,
                 top_logprobs_num=0,
+                stream=False,
+                mm_inputs=None,
                 token_ids_logprob=None,
             )
-
+            # Set disaggregation params if needed
             if self.server_args.disaggregation_mode != DisaggregationMode.NULL:
-
-
-
-
-
-
-
-                health_request, request_id=rid
+                health_req.bootstrap_host = FAKE_BOOTSTRAP_HOST
+                health_req.bootstrap_room = 0
+        else:
+            health_req = TokenizedEmbeddingReqInput(
+                rid=rid,
+                input_text="",
+                input_ids=[0],
             )
 
+        # Submit health check request
+        async def run_health_check():
             try:
-
-
-
-            )
-
-
-
-
-
+                async for _ in self.request_manager.generate_request(
+                    obj=health_req,
+                    request_id=rid,
+                ):
+                    # Got at least one response, server is healthy
+                    return True
+            except Exception as e:
+                logger.warning(f"Health check failed: {e}")
+                return False
+            return False
+
+        task = asyncio.create_task(run_health_check())
+
+        # Wait for response with timeout
+        tic = time.time()
+        while time.time() < tic + HEALTH_CHECK_TIMEOUT:
+            await asyncio.sleep(1)
+            # Check if we got a response from scheduler
+            if self.request_manager.last_receive_tstamp > tic:
+                task.cancel()
+                # Clean up health check state
+                self.request_manager._cleanup_request_state(rid)
                 return sglang_scheduler_pb2.HealthCheckResponse(
                     healthy=True, message="Health check passed"
                 )
 
-
-
-
-
-
-
-
-            )
-
-        except Exception as e:
-            logger.error(f"Health check failed: {e}")
-            return sglang_scheduler_pb2.HealthCheckResponse(
-                healthy=False, message=f"Health check error: {str(e)}"
-            )
+        # Timeout - server not responding
+        task.cancel()
+        self.request_manager._cleanup_request_state(rid)
+        logger.warning(f"Health check timeout after {HEALTH_CHECK_TIMEOUT}s")
+        return sglang_scheduler_pb2.HealthCheckResponse(
+            healthy=False, message=f"Health check timeout after {HEALTH_CHECK_TIMEOUT}s"
+        )
 
     async def Abort(
         self,
         request: sglang_scheduler_pb2.AbortRequest,
-
+        _context: grpc.aio.ServicerContext,
    ) -> sglang_scheduler_pb2.AbortResponse:
         """Abort an ongoing request."""
-        logger.info(f"
+        logger.info(f"Receive abort request: {request.request_id}")
 
         try:
             success = await self.request_manager.abort_request(request.request_id)
@@ -389,12 +418,98 @@ class SGLangSchedulerServicer(sglang_scheduler_pb2_grpc.SglangSchedulerServicer)
                 message=f"Request {request.request_id} {'aborted' if success else 'not found'}",
             )
         except Exception as e:
-            logger.error(
+            logger.error(
+                f"Abort failed for request {request.request_id}: {e}\n"
+                f"{get_exception_traceback()}"
+            )
             return sglang_scheduler_pb2.AbortResponse(
                 success=False,
                 message=str(e),
             )
 
+    async def GetModelInfo(
+        self,
+        _request: sglang_scheduler_pb2.GetModelInfoRequest,
+        _context: grpc.aio.ServicerContext,
+    ) -> sglang_scheduler_pb2.GetModelInfoResponse:
+        """Get model information."""
+        logger.debug("Receive model info request")
+
+        is_generation = self.scheduler_info.get("is_generation")
+        if is_generation is None:
+            is_generation = not self.server_args.is_embedding
+
+        return sglang_scheduler_pb2.GetModelInfoResponse(
+            model_path=self.server_args.model_path,
+            tokenizer_path=self.server_args.tokenizer_path or "",
+            is_generation=is_generation,
+            preferred_sampling_params=(
+                self.server_args.preferred_sampling_params or ""
+            ),
+            weight_version=self.server_args.weight_version or "",
+            served_model_name=self.server_args.served_model_name,
+            max_context_length=self.model_info["max_context_length"],
+            vocab_size=self.model_info["vocab_size"],
+            supports_vision=self.model_info["supports_vision"],
+            model_type=self.model_info["model_type"],
+            eos_token_ids=self.model_info["eos_token_ids"],
+            pad_token_id=self.model_info["pad_token_id"],
+            bos_token_id=self.model_info["bos_token_id"],
+            max_req_input_len=self.model_info["max_req_input_len"],
+        )
+
+    async def GetServerInfo(
+        self,
+        _request: sglang_scheduler_pb2.GetServerInfoRequest,
+        _context: grpc.aio.ServicerContext,
+    ) -> sglang_scheduler_pb2.GetServerInfoResponse:
+        """Get server information."""
+        logger.debug("Receive server info request")
+
+        server_args_dict = dataclasses.asdict(self.server_args)
+        server_args_struct = Struct()
+
+        def make_serializable(obj):
+            if obj is None:
+                return None
+            elif isinstance(obj, (str, int, float, bool)):
+                return obj
+            elif isinstance(obj, (list, tuple, set)):
+                return [make_serializable(item) for item in obj]
+            elif isinstance(obj, dict):
+                return {k: make_serializable(v) for k, v in obj.items()}
+            else:
+                return str(obj)
+
+        serializable_args = make_serializable(server_args_dict)
+        server_args_struct.update(serializable_args)
+
+        # Convert scheduler_info to Struct
+        scheduler_info_struct = Struct()
+        scheduler_info_struct.update(self.scheduler_info)
+
+        # Get runtime state from request manager
+        manager_state = self.request_manager.get_server_info()
+
+        # Calculate uptime
+        uptime = time.time() - self.start_time
+
+        # Create timestamp
+        start_timestamp = Timestamp()
+        start_timestamp.FromSeconds(int(self.start_time))
+
+        return sglang_scheduler_pb2.GetServerInfoResponse(
+            server_args=server_args_struct,
+            scheduler_info=scheduler_info_struct,
+            active_requests=manager_state["active_requests"],
+            is_paused=manager_state["paused"],
+            last_receive_timestamp=manager_state["last_receive_time"],
+            uptime_seconds=uptime,
+            sglang_version=sglang.__version__,
+            server_type="grpc",
+            start_time=start_timestamp,
+        )
+
     # Helper methods for request/response conversion
 
     def _convert_generate_request(
@@ -411,6 +526,7 @@ class SGLangSchedulerServicer(sglang_scheduler_pb2_grpc.SglangSchedulerServicer)
 
         # Convert sampling params
         sampling_params = self._convert_sampling_params(grpc_req.sampling_params)
+        sampling_params.normalize(tokenizer=None)
 
         # Extract disaggregated params if present
         bootstrap_host = None
@@ -483,28 +599,52 @@ class SGLangSchedulerServicer(sglang_scheduler_pb2_grpc.SglangSchedulerServicer)
         elif grpc_params.HasField("structural_tag"):
             structural_tag = grpc_params.structural_tag
 
+        # Handle optional parameters conversion
+        custom_params = (
+            MessageToDict(grpc_params.custom_params)
+            if grpc_params.HasField("custom_params")
+            else None
+        )
+        max_new_tokens = (
+            grpc_params.max_new_tokens
+            if grpc_params.HasField("max_new_tokens")
+            else None
+        )
+        stream_interval = (
+            grpc_params.stream_interval
+            if grpc_params.HasField("stream_interval")
+            else None
+        )
+        logit_bias = dict(grpc_params.logit_bias) if grpc_params.logit_bias else None
+        stop = list(grpc_params.stop) if grpc_params.stop else None
+        stop_token_ids = (
+            list(grpc_params.stop_token_ids) if grpc_params.stop_token_ids else None
+        )
+
         return SGLSamplingParams(
-            temperature=grpc_params.temperature
-            top_p=grpc_params.top_p
-            top_k=grpc_params.top_k
-            min_p=grpc_params.min_p
-            frequency_penalty=grpc_params.frequency_penalty
-            presence_penalty=grpc_params.presence_penalty
-            repetition_penalty=grpc_params.repetition_penalty
-            max_new_tokens=
-            min_new_tokens=grpc_params.min_new_tokens
-            stop=
-            stop_token_ids=
-                list(grpc_params.stop_token_ids) if grpc_params.stop_token_ids else []
-            ),
+            temperature=grpc_params.temperature,
+            top_p=grpc_params.top_p,
+            top_k=grpc_params.top_k,
+            min_p=grpc_params.min_p,
+            frequency_penalty=grpc_params.frequency_penalty,
+            presence_penalty=grpc_params.presence_penalty,
+            repetition_penalty=grpc_params.repetition_penalty,
+            max_new_tokens=max_new_tokens,
+            min_new_tokens=grpc_params.min_new_tokens,
+            stop=stop,
+            stop_token_ids=stop_token_ids,
             skip_special_tokens=grpc_params.skip_special_tokens,
             spaces_between_special_tokens=grpc_params.spaces_between_special_tokens,
+            no_stop_trim=grpc_params.no_stop_trim,
             regex=regex,
             json_schema=json_schema,
             ebnf=ebnf_grammar,
             structural_tag=structural_tag,
-            n=grpc_params.n
+            n=grpc_params.n,
             ignore_eos=grpc_params.ignore_eos,
+            stream_interval=stream_interval,
+            logit_bias=logit_bias,
+            custom_params=custom_params,
         )
 
     def _convert_output_logprobs_to_proto(
@@ -731,6 +871,7 @@ async def serve_grpc(
         request_manager=request_manager,
         server_args=server_args,
         model_info=model_info,
+        scheduler_info=scheduler_info,
     )
     sglang_scheduler_pb2_grpc.add_SglangSchedulerServicer_to_server(servicer, server)
 
@@ -745,9 +886,8 @@ async def serve_grpc(
     listen_addr = f"{server_args.host}:{server_args.port}"
    server.add_insecure_port(listen_addr)
 
-    logger.info(f"Starting standalone gRPC server on {listen_addr}")
-
     await server.start()
+    logger.info(f"gRPC server listening on {listen_addr}")
 
     # Handle shutdown signals
     loop = asyncio.get_running_loop()
```
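For orientation, here is a minimal async client sketch exercising the RPCs this release adds or reworks (`GetModelInfo`, `GetServerInfo`, `HealthCheck`). It is a sketch under stated assumptions, not code from the package: the stub class name `SglangSchedulerStub` is inferred from `add_SglangSchedulerServicer_to_server` in the diff, and the address is whatever host/port the gRPC server was launched with.

```python
# Hypothetical client sketch; the stub name and address are assumptions, not from the diff.
import asyncio

import grpc

from sglang.srt.grpc import sglang_scheduler_pb2, sglang_scheduler_pb2_grpc


async def main() -> None:
    # Address is an assumption; use the --host/--port the gRPC server was started with.
    async with grpc.aio.insecure_channel("localhost:30000") as channel:
        stub = sglang_scheduler_pb2_grpc.SglangSchedulerStub(channel)  # assumed stub name

        # New in 0.5.3.post1: model metadata exposed over gRPC.
        model_info = await stub.GetModelInfo(sglang_scheduler_pb2.GetModelInfoRequest())
        print(model_info.model_path, model_info.max_context_length)

        # New in 0.5.3.post1: server args, scheduler info, uptime, and version.
        server_info = await stub.GetServerInfo(sglang_scheduler_pb2.GetServerInfoRequest())
        print(server_info.sglang_version, server_info.uptime_seconds)

        # Health check now generates one token internally instead of echoing input ids.
        health = await stub.HealthCheck(sglang_scheduler_pb2.HealthCheckRequest())
        print(health.healthy, health.message)


if __name__ == "__main__":
    asyncio.run(main())
```

Note that `GetModelInfo` and `GetServerInfo` read from the new `scheduler_info` dict that the servicer now receives at construction time.

sglang/srt/entrypoints/http_server.py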
```diff
@@ -52,12 +52,14 @@ from sglang.srt.entrypoints.engine import _launch_subprocesses
 from sglang.srt.entrypoints.openai.protocol import (
     ChatCompletionRequest,
     CompletionRequest,
+    DetokenizeRequest,
     EmbeddingRequest,
     ErrorResponse,
     ModelCard,
     ModelList,
     ResponsesRequest,
     ScoringRequest,
+    TokenizeRequest,
     V1RerankReqInput,
 )
 from sglang.srt.entrypoints.openai.serving_chat import OpenAIServingChat
@@ -65,6 +67,10 @@ from sglang.srt.entrypoints.openai.serving_completions import OpenAIServingCompl
 from sglang.srt.entrypoints.openai.serving_embedding import OpenAIServingEmbedding
 from sglang.srt.entrypoints.openai.serving_rerank import OpenAIServingRerank
 from sglang.srt.entrypoints.openai.serving_score import OpenAIServingScore
+from sglang.srt.entrypoints.openai.serving_tokenize import (
+    OpenAIServingDetokenize,
+    OpenAIServingTokenize,
+)
 from sglang.srt.function_call.function_call_parser import FunctionCallParser
 from sglang.srt.managers.io_struct import (
     AbortReq,
@@ -229,6 +235,12 @@ async def lifespan(fast_api_app: FastAPI):
     fast_api_app.state.openai_serving_rerank = OpenAIServingRerank(
         _global_state.tokenizer_manager
     )
+    fast_api_app.state.openai_serving_tokenize = OpenAIServingTokenize(
+        _global_state.tokenizer_manager
+    )
+    fast_api_app.state.openai_serving_detokenize = OpenAIServingDetokenize(
+        _global_state.tokenizer_manager
+    )
 
     server_args: ServerArgs = fast_api_app.server_args
 
@@ -494,7 +506,7 @@ async def get_load():
 
 
 # example usage:
-# curl -s -X POST http://localhost:30000/set_internal_state -H "Content-Type: application/json" -d '{"server_args": {"
+# curl -s -X POST http://localhost:30000/set_internal_state -H "Content-Type: application/json" -d '{"server_args": {"pp_max_micro_batch_size": 8}}'
 @app.api_route("/set_internal_state", methods=["POST", "PUT"])
 async def set_internal_state(obj: SetInternalStateReq, request: Request):
     res = await _global_state.tokenizer_manager.set_internal_state(obj)
@@ -1070,6 +1082,42 @@ async def openai_v1_embeddings(request: EmbeddingRequest, raw_request: Request):
     )
 
 
+@app.post(
+    "/v1/tokenize",
+    response_class=ORJSONResponse,
+    dependencies=[Depends(validate_json_request)],
+)
+@app.post(
+    "/tokenize",
+    response_class=ORJSONResponse,
+    dependencies=[Depends(validate_json_request)],
+    include_in_schema=False,
+)
+async def openai_v1_tokenize(request: TokenizeRequest, raw_request: Request):
+    """OpenAI-compatible tokenization endpoint."""
+    return await raw_request.app.state.openai_serving_tokenize.handle_request(
+        request, raw_request
+    )
+
+
+@app.post(
+    "/v1/detokenize",
+    response_class=ORJSONResponse,
+    dependencies=[Depends(validate_json_request)],
+)
+@app.post(
+    "/detokenize",
+    response_class=ORJSONResponse,
+    dependencies=[Depends(validate_json_request)],
+    include_in_schema=False,
+)
+async def openai_v1_detokenize(request: DetokenizeRequest, raw_request: Request):
+    """OpenAI-compatible detokenization endpoint."""
+    return await raw_request.app.state.openai_serving_detokenize.handle_request(
+        request, raw_request
+    )
+
+
 @app.get("/v1/models", response_class=ORJSONResponse)
 async def available_models():
     """Show available models. OpenAI-compatible endpoint."""
```
|