sglang 0.3.5__tar.gz → 0.3.5.post1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {sglang-0.3.5 → sglang-0.3.5.post1}/PKG-INFO +11 -7
- {sglang-0.3.5 → sglang-0.3.5.post1}/README.md +8 -5
- {sglang-0.3.5 → sglang-0.3.5.post1}/pyproject.toml +4 -3
- {sglang-0.3.5 → sglang-0.3.5.post1}/sglang/bench_serving.py +113 -3
- {sglang-0.3.5 → sglang-0.3.5.post1}/sglang/srt/configs/model_config.py +5 -2
- sglang-0.3.5.post1/sglang/srt/constrained/__init__.py +17 -0
- sglang-0.3.5.post1/sglang/srt/constrained/base_grammar_backend.py +72 -0
- sglang-0.3.5.post1/sglang/srt/constrained/outlines_backend.py +165 -0
- sglang-0.3.5.post1/sglang/srt/constrained/outlines_jump_forward.py +182 -0
- sglang-0.3.5.post1/sglang/srt/constrained/xgrammar_backend.py +114 -0
- {sglang-0.3.5 → sglang-0.3.5.post1}/sglang/srt/layers/attention/triton_ops/decode_attention.py +7 -0
- {sglang-0.3.5 → sglang-0.3.5.post1}/sglang/srt/layers/attention/triton_ops/extend_attention.py +6 -0
- {sglang-0.3.5 → sglang-0.3.5.post1}/sglang/srt/layers/fused_moe/fused_moe.py +23 -7
- {sglang-0.3.5 → sglang-0.3.5.post1}/sglang/srt/layers/quantization/base_config.py +4 -6
- {sglang-0.3.5 → sglang-0.3.5.post1}/sglang/srt/layers/vocab_parallel_embedding.py +216 -150
- {sglang-0.3.5 → sglang-0.3.5.post1}/sglang/srt/managers/io_struct.py +5 -3
- {sglang-0.3.5 → sglang-0.3.5.post1}/sglang/srt/managers/schedule_batch.py +14 -20
- {sglang-0.3.5 → sglang-0.3.5.post1}/sglang/srt/managers/scheduler.py +153 -94
- {sglang-0.3.5 → sglang-0.3.5.post1}/sglang/srt/managers/tokenizer_manager.py +81 -17
- sglang-0.3.5.post1/sglang/srt/metrics/collector.py +211 -0
- sglang-0.3.5.post1/sglang/srt/metrics/func_timer.py +108 -0
- {sglang-0.3.5 → sglang-0.3.5.post1}/sglang/srt/mm_utils.py +1 -1
- {sglang-0.3.5 → sglang-0.3.5.post1}/sglang/srt/model_executor/cuda_graph_runner.py +2 -2
- {sglang-0.3.5 → sglang-0.3.5.post1}/sglang/srt/model_executor/forward_batch_info.py +7 -3
- {sglang-0.3.5 → sglang-0.3.5.post1}/sglang/srt/model_executor/model_runner.py +2 -1
- sglang-0.3.5.post1/sglang/srt/models/gemma2_reward.py +69 -0
- {sglang-0.3.5 → sglang-0.3.5.post1}/sglang/srt/models/gpt2.py +31 -37
- sglang-0.3.5.post1/sglang/srt/models/internlm2_reward.py +62 -0
- {sglang-0.3.5 → sglang-0.3.5.post1}/sglang/srt/models/llama.py +11 -6
- {sglang-0.3.5 → sglang-0.3.5.post1}/sglang/srt/models/llama_reward.py +5 -26
- {sglang-0.3.5 → sglang-0.3.5.post1}/sglang/srt/models/olmo.py +0 -0
- {sglang-0.3.5 → sglang-0.3.5.post1}/sglang/srt/models/qwen2_vl.py +5 -7
- {sglang-0.3.5 → sglang-0.3.5.post1}/sglang/srt/openai_api/adapter.py +6 -2
- {sglang-0.3.5 → sglang-0.3.5.post1}/sglang/srt/sampling/sampling_batch_info.py +2 -3
- {sglang-0.3.5 → sglang-0.3.5.post1}/sglang/srt/sampling/sampling_params.py +0 -14
- {sglang-0.3.5 → sglang-0.3.5.post1}/sglang/srt/server.py +58 -16
- {sglang-0.3.5 → sglang-0.3.5.post1}/sglang/srt/server_args.py +42 -22
- {sglang-0.3.5 → sglang-0.3.5.post1}/sglang/srt/utils.py +87 -0
- {sglang-0.3.5 → sglang-0.3.5.post1}/sglang/test/simple_eval_common.py +1 -1
- {sglang-0.3.5 → sglang-0.3.5.post1}/sglang/test/simple_eval_humaneval.py +2 -2
- {sglang-0.3.5 → sglang-0.3.5.post1}/sglang/test/simple_eval_mgsm.py +2 -2
- {sglang-0.3.5 → sglang-0.3.5.post1}/sglang/test/test_utils.py +18 -4
- {sglang-0.3.5 → sglang-0.3.5.post1}/sglang/utils.py +1 -0
- sglang-0.3.5.post1/sglang/version.py +1 -0
- {sglang-0.3.5 → sglang-0.3.5.post1}/sglang.egg-info/PKG-INFO +11 -7
- {sglang-0.3.5 → sglang-0.3.5.post1}/sglang.egg-info/SOURCES.txt +8 -5
- {sglang-0.3.5 → sglang-0.3.5.post1}/sglang.egg-info/requires.txt +2 -1
- sglang-0.3.5/sglang/srt/constrained/__init__.py +0 -81
- sglang-0.3.5/sglang/srt/constrained/base_tool_cache.py +0 -65
- sglang-0.3.5/sglang/srt/constrained/bnf_cache.py +0 -61
- sglang-0.3.5/sglang/srt/constrained/fsm_cache.py +0 -95
- sglang-0.3.5/sglang/srt/constrained/grammar.py +0 -190
- sglang-0.3.5/sglang/srt/constrained/jump_forward.py +0 -203
- sglang-0.3.5/sglang/version.py +0 -1
- {sglang-0.3.5 → sglang-0.3.5.post1}/LICENSE +0 -0
- {sglang-0.3.5 → sglang-0.3.5.post1}/setup.cfg +0 -0
- {sglang-0.3.5 → sglang-0.3.5.post1}/sglang/__init__.py +0 -0
- {sglang-0.3.5 → sglang-0.3.5.post1}/sglang/api.py +0 -0
- {sglang-0.3.5 → sglang-0.3.5.post1}/sglang/bench_latency.py +0 -0
- {sglang-0.3.5 → sglang-0.3.5.post1}/sglang/bench_server_latency.py +0 -0
- {sglang-0.3.5 → sglang-0.3.5.post1}/sglang/check_env.py +0 -0
- {sglang-0.3.5 → sglang-0.3.5.post1}/sglang/global_config.py +0 -0
- {sglang-0.3.5 → sglang-0.3.5.post1}/sglang/lang/__init__.py +0 -0
- {sglang-0.3.5 → sglang-0.3.5.post1}/sglang/lang/backend/__init__.py +0 -0
- {sglang-0.3.5 → sglang-0.3.5.post1}/sglang/lang/backend/anthropic.py +0 -0
- {sglang-0.3.5 → sglang-0.3.5.post1}/sglang/lang/backend/base_backend.py +0 -0
- {sglang-0.3.5 → sglang-0.3.5.post1}/sglang/lang/backend/litellm.py +0 -0
- {sglang-0.3.5 → sglang-0.3.5.post1}/sglang/lang/backend/openai.py +0 -0
- {sglang-0.3.5 → sglang-0.3.5.post1}/sglang/lang/backend/runtime_endpoint.py +0 -0
- {sglang-0.3.5 → sglang-0.3.5.post1}/sglang/lang/backend/vertexai.py +0 -0
- {sglang-0.3.5 → sglang-0.3.5.post1}/sglang/lang/chat_template.py +0 -0
- {sglang-0.3.5 → sglang-0.3.5.post1}/sglang/lang/choices.py +0 -0
- {sglang-0.3.5 → sglang-0.3.5.post1}/sglang/lang/compiler.py +0 -0
- {sglang-0.3.5 → sglang-0.3.5.post1}/sglang/lang/interpreter.py +0 -0
- {sglang-0.3.5 → sglang-0.3.5.post1}/sglang/lang/ir.py +0 -0
- {sglang-0.3.5 → sglang-0.3.5.post1}/sglang/lang/tracer.py +0 -0
- {sglang-0.3.5 → sglang-0.3.5.post1}/sglang/launch_server.py +0 -0
- {sglang-0.3.5 → sglang-0.3.5.post1}/sglang/launch_server_llavavid.py +0 -0
- {sglang-0.3.5 → sglang-0.3.5.post1}/sglang/srt/configs/__init__.py +0 -0
- {sglang-0.3.5 → sglang-0.3.5.post1}/sglang/srt/configs/exaone.py +0 -0
- {sglang-0.3.5 → sglang-0.3.5.post1}/sglang/srt/configs/qwen2vl.py +0 -0
- {sglang-0.3.5 → sglang-0.3.5.post1}/sglang/srt/conversation.py +0 -0
- {sglang-0.3.5 → sglang-0.3.5.post1}/sglang/srt/hf_transformers_utils.py +0 -0
- {sglang-0.3.5 → sglang-0.3.5.post1}/sglang/srt/layers/activation.py +0 -0
- {sglang-0.3.5 → sglang-0.3.5.post1}/sglang/srt/layers/attention/__init__.py +0 -0
- {sglang-0.3.5 → sglang-0.3.5.post1}/sglang/srt/layers/attention/double_sparsity_backend.py +0 -0
- {sglang-0.3.5 → sglang-0.3.5.post1}/sglang/srt/layers/attention/flashinfer_backend.py +0 -0
- {sglang-0.3.5 → sglang-0.3.5.post1}/sglang/srt/layers/attention/triton_backend.py +0 -0
- {sglang-0.3.5 → sglang-0.3.5.post1}/sglang/srt/layers/attention/triton_ops/double_sparsity_attention.py +0 -0
- {sglang-0.3.5 → sglang-0.3.5.post1}/sglang/srt/layers/attention/triton_ops/prefill_attention.py +0 -0
- {sglang-0.3.5 → sglang-0.3.5.post1}/sglang/srt/layers/fused_moe/__init__.py +0 -0
- {sglang-0.3.5 → sglang-0.3.5.post1}/sglang/srt/layers/fused_moe/layer.py +0 -0
- {sglang-0.3.5 → sglang-0.3.5.post1}/sglang/srt/layers/fused_moe/patch.py +0 -0
- {sglang-0.3.5 → sglang-0.3.5.post1}/sglang/srt/layers/layernorm.py +0 -0
- {sglang-0.3.5 → sglang-0.3.5.post1}/sglang/srt/layers/linear.py +0 -0
- {sglang-0.3.5 → sglang-0.3.5.post1}/sglang/srt/layers/logits_processor.py +0 -0
- {sglang-0.3.5 → sglang-0.3.5.post1}/sglang/srt/layers/pooler.py +0 -0
- {sglang-0.3.5 → sglang-0.3.5.post1}/sglang/srt/layers/quantization/__init__.py +0 -0
- {sglang-0.3.5 → sglang-0.3.5.post1}/sglang/srt/layers/radix_attention.py +0 -0
- {sglang-0.3.5 → sglang-0.3.5.post1}/sglang/srt/layers/rotary_embedding.py +0 -0
- {sglang-0.3.5 → sglang-0.3.5.post1}/sglang/srt/layers/sampler.py +0 -0
- {sglang-0.3.5 → sglang-0.3.5.post1}/sglang/srt/layers/torchao_utils.py +0 -0
- {sglang-0.3.5 → sglang-0.3.5.post1}/sglang/srt/lora/lora.py +0 -0
- {sglang-0.3.5 → sglang-0.3.5.post1}/sglang/srt/lora/lora_config.py +0 -0
- {sglang-0.3.5 → sglang-0.3.5.post1}/sglang/srt/lora/lora_manager.py +0 -0
- {sglang-0.3.5 → sglang-0.3.5.post1}/sglang/srt/managers/data_parallel_controller.py +0 -0
- {sglang-0.3.5 → sglang-0.3.5.post1}/sglang/srt/managers/detokenizer_manager.py +0 -0
- {sglang-0.3.5 → sglang-0.3.5.post1}/sglang/srt/managers/image_processor.py +0 -0
- {sglang-0.3.5 → sglang-0.3.5.post1}/sglang/srt/managers/schedule_policy.py +0 -0
- {sglang-0.3.5 → sglang-0.3.5.post1}/sglang/srt/managers/tp_worker.py +0 -0
- {sglang-0.3.5 → sglang-0.3.5.post1}/sglang/srt/managers/tp_worker_overlap_thread.py +0 -0
- {sglang-0.3.5 → sglang-0.3.5.post1}/sglang/srt/mem_cache/base_prefix_cache.py +0 -0
- {sglang-0.3.5 → sglang-0.3.5.post1}/sglang/srt/mem_cache/chunk_cache.py +0 -0
- {sglang-0.3.5 → sglang-0.3.5.post1}/sglang/srt/mem_cache/flush_cache.py +0 -0
- {sglang-0.3.5 → sglang-0.3.5.post1}/sglang/srt/mem_cache/memory_pool.py +0 -0
- {sglang-0.3.5 → sglang-0.3.5.post1}/sglang/srt/mem_cache/radix_cache.py +0 -0
- {sglang-0.3.5 → sglang-0.3.5.post1}/sglang/srt/models/baichuan.py +0 -0
- {sglang-0.3.5 → sglang-0.3.5.post1}/sglang/srt/models/chatglm.py +0 -0
- {sglang-0.3.5 → sglang-0.3.5.post1}/sglang/srt/models/commandr.py +0 -0
- {sglang-0.3.5 → sglang-0.3.5.post1}/sglang/srt/models/dbrx.py +0 -0
- {sglang-0.3.5 → sglang-0.3.5.post1}/sglang/srt/models/deepseek.py +0 -0
- {sglang-0.3.5 → sglang-0.3.5.post1}/sglang/srt/models/deepseek_v2.py +0 -0
- {sglang-0.3.5 → sglang-0.3.5.post1}/sglang/srt/models/exaone.py +0 -0
- {sglang-0.3.5 → sglang-0.3.5.post1}/sglang/srt/models/gemma.py +0 -0
- {sglang-0.3.5 → sglang-0.3.5.post1}/sglang/srt/models/gemma2.py +0 -0
- {sglang-0.3.5 → sglang-0.3.5.post1}/sglang/srt/models/gpt_bigcode.py +0 -0
- {sglang-0.3.5 → sglang-0.3.5.post1}/sglang/srt/models/grok.py +0 -0
- {sglang-0.3.5 → sglang-0.3.5.post1}/sglang/srt/models/internlm2.py +0 -0
- {sglang-0.3.5 → sglang-0.3.5.post1}/sglang/srt/models/llama_classification.py +0 -0
- {sglang-0.3.5 → sglang-0.3.5.post1}/sglang/srt/models/llama_embedding.py +0 -0
- {sglang-0.3.5 → sglang-0.3.5.post1}/sglang/srt/models/llava.py +0 -0
- {sglang-0.3.5 → sglang-0.3.5.post1}/sglang/srt/models/llavavid.py +0 -0
- {sglang-0.3.5 → sglang-0.3.5.post1}/sglang/srt/models/minicpm.py +0 -0
- {sglang-0.3.5 → sglang-0.3.5.post1}/sglang/srt/models/minicpm3.py +0 -0
- {sglang-0.3.5 → sglang-0.3.5.post1}/sglang/srt/models/mistral.py +0 -0
- {sglang-0.3.5 → sglang-0.3.5.post1}/sglang/srt/models/mixtral.py +0 -0
- {sglang-0.3.5 → sglang-0.3.5.post1}/sglang/srt/models/mixtral_quant.py +0 -0
- {sglang-0.3.5 → sglang-0.3.5.post1}/sglang/srt/models/mllama.py +0 -0
- {sglang-0.3.5 → sglang-0.3.5.post1}/sglang/srt/models/olmoe.py +0 -0
- {sglang-0.3.5 → sglang-0.3.5.post1}/sglang/srt/models/qwen.py +0 -0
- {sglang-0.3.5 → sglang-0.3.5.post1}/sglang/srt/models/qwen2.py +0 -0
- {sglang-0.3.5 → sglang-0.3.5.post1}/sglang/srt/models/qwen2_moe.py +0 -0
- {sglang-0.3.5 → sglang-0.3.5.post1}/sglang/srt/models/stablelm.py +0 -0
- {sglang-0.3.5 → sglang-0.3.5.post1}/sglang/srt/models/torch_native_llama.py +0 -0
- {sglang-0.3.5 → sglang-0.3.5.post1}/sglang/srt/models/xverse.py +0 -0
- {sglang-0.3.5 → sglang-0.3.5.post1}/sglang/srt/models/xverse_moe.py +0 -0
- {sglang-0.3.5 → sglang-0.3.5.post1}/sglang/srt/models/yivl.py +0 -0
- {sglang-0.3.5 → sglang-0.3.5.post1}/sglang/srt/openai_api/protocol.py +0 -0
- {sglang-0.3.5 → sglang-0.3.5.post1}/sglang/srt/sampling/penaltylib/__init__.py +0 -0
- {sglang-0.3.5 → sglang-0.3.5.post1}/sglang/srt/sampling/penaltylib/orchestrator.py +0 -0
- {sglang-0.3.5 → sglang-0.3.5.post1}/sglang/srt/sampling/penaltylib/penalizers/frequency_penalty.py +0 -0
- {sglang-0.3.5 → sglang-0.3.5.post1}/sglang/srt/sampling/penaltylib/penalizers/min_new_tokens.py +0 -0
- {sglang-0.3.5 → sglang-0.3.5.post1}/sglang/srt/sampling/penaltylib/penalizers/presence_penalty.py +0 -0
- {sglang-0.3.5 → sglang-0.3.5.post1}/sglang/srt/sampling/penaltylib/penalizers/repetition_penalty.py +0 -0
- {sglang-0.3.5 → sglang-0.3.5.post1}/sglang/test/few_shot_gsm8k.py +0 -0
- {sglang-0.3.5 → sglang-0.3.5.post1}/sglang/test/few_shot_gsm8k_engine.py +0 -0
- {sglang-0.3.5 → sglang-0.3.5.post1}/sglang/test/run_eval.py +0 -0
- {sglang-0.3.5 → sglang-0.3.5.post1}/sglang/test/runners.py +0 -0
- {sglang-0.3.5 → sglang-0.3.5.post1}/sglang/test/simple_eval_gpqa.py +0 -0
- {sglang-0.3.5 → sglang-0.3.5.post1}/sglang/test/simple_eval_math.py +0 -0
- {sglang-0.3.5 → sglang-0.3.5.post1}/sglang/test/simple_eval_mmlu.py +0 -0
- {sglang-0.3.5 → sglang-0.3.5.post1}/sglang/test/srt/sampling/penaltylib/utils.py +0 -0
- {sglang-0.3.5 → sglang-0.3.5.post1}/sglang/test/test_activation.py +0 -0
- {sglang-0.3.5 → sglang-0.3.5.post1}/sglang/test/test_layernorm.py +0 -0
- {sglang-0.3.5 → sglang-0.3.5.post1}/sglang/test/test_programs.py +0 -0
- {sglang-0.3.5 → sglang-0.3.5.post1}/sglang.egg-info/dependency_links.txt +0 -0
- {sglang-0.3.5 → sglang-0.3.5.post1}/sglang.egg-info/top_level.txt +0 -0
{sglang-0.3.5 → sglang-0.3.5.post1}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: sglang
-Version: 0.3.5
+Version: 0.3.5.post1
 Summary: SGLang is yet another fast serving framework for large language models and vision language models.
 License: Apache License
                                  Version 2.0, January 2004
@@ -225,13 +225,14 @@ Requires-Dist: interegular; extra == "runtime-common"
 Requires-Dist: orjson; extra == "runtime-common"
 Requires-Dist: packaging; extra == "runtime-common"
 Requires-Dist: pillow; extra == "runtime-common"
+Requires-Dist: prometheus-client>=0.20.0; extra == "runtime-common"
 Requires-Dist: psutil; extra == "runtime-common"
 Requires-Dist: pydantic; extra == "runtime-common"
 Requires-Dist: python-multipart; extra == "runtime-common"
 Requires-Dist: torchao; extra == "runtime-common"
 Requires-Dist: uvicorn; extra == "runtime-common"
 Requires-Dist: uvloop; extra == "runtime-common"
-Requires-Dist: 
+Requires-Dist: pyzmq>=25.1.2; extra == "runtime-common"
 Requires-Dist: outlines>=0.0.44; extra == "runtime-common"
 Requires-Dist: modelscope; extra == "runtime-common"
 Provides-Extra: srt
@@ -291,13 +292,14 @@ Requires-Dist: sglang[test]; extra == "dev-xpu"
 [](https://github.com/sgl-project/sglang/tree/main/LICENSE)
 [](https://github.com/sgl-project/sglang/issues)
 [](https://github.com/sgl-project/sglang/issues)
+[-006BFF)](https://gurubase.io/g/sglang)
 
 </div>
 
 --------------------------------------------------------------------------------
 
-| [**Blog**](https://lmsys.org/blog/2024-07-25-sglang-llama3/) | [**Documentation**](https://sgl-project.github.io/) | [**Join Slack**](https://join.slack.com/t/sgl-fru7574/shared_invite/zt-
-[**Join Bi-Weekly Development Meeting**](https://docs.google.com/document/d/1xEow4eIM152xNcRxqZz9VEcOiTQo8-CEuuQ5qTmkt-E/edit?usp=sharing) |
+| [**Blog**](https://lmsys.org/blog/2024-07-25-sglang-llama3/) | [**Documentation**](https://sgl-project.github.io/) | [**Join Slack**](https://join.slack.com/t/sgl-fru7574/shared_invite/zt-2tmmp6flg-89dOlJW2TjnBrTRk1I_~GA) |
+[**Join Bi-Weekly Development Meeting**](https://docs.google.com/document/d/1xEow4eIM152xNcRxqZz9VEcOiTQo8-CEuuQ5qTmkt-E/edit?usp=sharing) | [**Slides**](https://github.com/sgl-project/sgl-learning-materials?tab=readme-ov-file#slides) |
 
 ## News
 - [2024/10] 🔥 The First SGLang Online Meetup ([slides](https://github.com/sgl-project/sgl-learning-materials?tab=readme-ov-file#the-first-sglang-online-meetup)).
@@ -321,11 +323,13 @@ The core features include:
 
 - **Fast Backend Runtime**: Provides efficient serving with RadixAttention for prefix caching, jump-forward constrained decoding, continuous batching, token attention (paged attention), tensor parallelism, FlashInfer kernels, chunked prefill, and quantization (INT4/FP8/AWQ/GPTQ).
 - **Flexible Frontend Language**: Offers an intuitive interface for programming LLM applications, including chained generation calls, advanced prompting, control flow, multi-modal inputs, parallelism, and external interactions.
-- **Extensive Model Support**: Supports a wide range of generative models (Llama, Gemma, Mistral, QWen, DeepSeek, LLaVA, etc.)
+- **Extensive Model Support**: Supports a wide range of generative models (Llama, Gemma, Mistral, QWen, DeepSeek, LLaVA, etc.), embedding models (e5-mistral, gte) and reward models (Skywork), with easy extensibility for integrating new models.
 - **Active Community**: SGLang is open-source and backed by an active community with industry adoption.
 
-##
-See [https://sgl-project.github.io/start/install.html](https://sgl-project.github.io/start/install.html)
+## Getting Started
+Install SGLang: See [https://sgl-project.github.io/start/install.html](https://sgl-project.github.io/start/install.html)
+
+Send requests: See [https://sgl-project.github.io/start/send_request.html](https://sgl-project.github.io/start/send_request.html)
 
 ## Backend: SGLang Runtime (SRT)
 See [https://sgl-project.github.io/backend/backend.html](https://sgl-project.github.io/backend/backend.html)
{sglang-0.3.5 → sglang-0.3.5.post1}/README.md

@@ -6,13 +6,14 @@
 [](https://github.com/sgl-project/sglang/tree/main/LICENSE)
 [](https://github.com/sgl-project/sglang/issues)
 [](https://github.com/sgl-project/sglang/issues)
+[-006BFF)](https://gurubase.io/g/sglang)
 
 </div>
 
 --------------------------------------------------------------------------------
 
-| [**Blog**](https://lmsys.org/blog/2024-07-25-sglang-llama3/) | [**Documentation**](https://sgl-project.github.io/) | [**Join Slack**](https://join.slack.com/t/sgl-fru7574/shared_invite/zt-
-[**Join Bi-Weekly Development Meeting**](https://docs.google.com/document/d/1xEow4eIM152xNcRxqZz9VEcOiTQo8-CEuuQ5qTmkt-E/edit?usp=sharing) |
+| [**Blog**](https://lmsys.org/blog/2024-07-25-sglang-llama3/) | [**Documentation**](https://sgl-project.github.io/) | [**Join Slack**](https://join.slack.com/t/sgl-fru7574/shared_invite/zt-2tmmp6flg-89dOlJW2TjnBrTRk1I_~GA) |
+[**Join Bi-Weekly Development Meeting**](https://docs.google.com/document/d/1xEow4eIM152xNcRxqZz9VEcOiTQo8-CEuuQ5qTmkt-E/edit?usp=sharing) | [**Slides**](https://github.com/sgl-project/sgl-learning-materials?tab=readme-ov-file#slides) |
 
 ## News
 - [2024/10] 🔥 The First SGLang Online Meetup ([slides](https://github.com/sgl-project/sgl-learning-materials?tab=readme-ov-file#the-first-sglang-online-meetup)).
@@ -36,11 +37,13 @@ The core features include:
 
 - **Fast Backend Runtime**: Provides efficient serving with RadixAttention for prefix caching, jump-forward constrained decoding, continuous batching, token attention (paged attention), tensor parallelism, FlashInfer kernels, chunked prefill, and quantization (INT4/FP8/AWQ/GPTQ).
 - **Flexible Frontend Language**: Offers an intuitive interface for programming LLM applications, including chained generation calls, advanced prompting, control flow, multi-modal inputs, parallelism, and external interactions.
-- **Extensive Model Support**: Supports a wide range of generative models (Llama, Gemma, Mistral, QWen, DeepSeek, LLaVA, etc.)
+- **Extensive Model Support**: Supports a wide range of generative models (Llama, Gemma, Mistral, QWen, DeepSeek, LLaVA, etc.), embedding models (e5-mistral, gte) and reward models (Skywork), with easy extensibility for integrating new models.
 - **Active Community**: SGLang is open-source and backed by an active community with industry adoption.
 
-##
-See [https://sgl-project.github.io/start/install.html](https://sgl-project.github.io/start/install.html)
+## Getting Started
+Install SGLang: See [https://sgl-project.github.io/start/install.html](https://sgl-project.github.io/start/install.html)
+
+Send requests: See [https://sgl-project.github.io/start/send_request.html](https://sgl-project.github.io/start/send_request.html)
 
 ## Backend: SGLang Runtime (SRT)
 See [https://sgl-project.github.io/backend/backend.html](https://sgl-project.github.io/backend/backend.html)
{sglang-0.3.5 → sglang-0.3.5.post1}/pyproject.toml

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "sglang"
-version = "0.3.5"
+version = "0.3.5.post1"
 description = "SGLang is yet another fast serving framework for large language models and vision language models."
 readme = "README.md"
 requires-python = ">=3.8"
@@ -17,10 +17,11 @@ dependencies = ["requests", "tqdm", "numpy", "IPython"]
 
 [project.optional-dependencies]
 runtime_common = ["aiohttp", "decord", "fastapi", "hf_transfer", "huggingface_hub", "interegular",
-    "orjson", "packaging", "pillow", "psutil", "pydantic", "python-multipart",
-    "torchao", "uvicorn", "uvloop", "
+    "orjson", "packaging", "pillow", "prometheus-client>=0.20.0", "psutil", "pydantic", "python-multipart",
+    "torchao", "uvicorn", "uvloop", "pyzmq>=25.1.2",
    "outlines>=0.0.44", "modelscope"]
 srt = ["sglang[runtime_common]", "torch", "vllm==0.6.3.post1"]
+
 # HIP (Heterogeneous-computing Interface for Portability) for AMD
 # => base docker rocm/vllm-dev:20241022, not from public vllm whl
 srt_hip = ["sglang[runtime_common]", "torch", "vllm==0.6.3.dev13"]
{sglang-0.3.5 → sglang-0.3.5.post1}/sglang/bench_serving.py

@@ -596,12 +596,20 @@ def sample_random_requests(
 
     # Filter out sequences that are too long or too short
     input_requests: List[Tuple[str, int, int]] = []
-    for
+    for data in dataset:
+        i = len(input_requests)
+        if i == num_prompts:
+            break
+
         # Tokenize the prompts and completions.
-        prompt =
+        prompt = data[0]
         prompt_token_ids = tokenizer.encode(prompt)
         prompt_len = len(prompt_token_ids)
 
+        # Skip empty prompt
+        if prompt_len == 0:
+            continue
+
         if prompt_len > input_lens[i]:
            input_ids = prompt_token_ids[: input_lens[i]]
         else:
@@ -627,6 +635,66 @@ def sample_random_requests(
     return input_requests
 
 
+def gen_prompt(tokenizer, token_num):
+    """Generate a random prompt of specified token length using tokenizer vocabulary."""
+    all_available_tokens = list(tokenizer.get_vocab().values())
+    selected_tokens = random.choices(all_available_tokens, k=token_num)
+    return tokenizer.decode(selected_tokens)
+
+
+def sample_generated_shared_prefix_requests(
+    num_groups: int,
+    prompts_per_group: int,
+    system_prompt_len: int,
+    question_len: int,
+    output_len: int,
+    tokenizer: PreTrainedTokenizerBase,
+) -> List[Tuple[str, int, int]]:
+    """Generate benchmark requests with shared system prompts using random tokens."""
+    # Generate system prompts for each group
+    system_prompts = []
+    for _ in range(num_groups):
+        system_prompt = gen_prompt(tokenizer, system_prompt_len)
+        system_prompts.append(system_prompt)
+
+    # Generate questions
+    questions = []
+    for _ in range(num_groups * prompts_per_group):
+        question = gen_prompt(tokenizer, question_len)
+        questions.append(question)
+
+    # Combine system prompts with questions
+    input_requests = []
+    total_input_tokens = 0
+    total_output_tokens = 0
+
+    for group_idx in range(num_groups):
+        system_prompt = system_prompts[group_idx]
+        for prompt_idx in range(prompts_per_group):
+            question = questions[group_idx * prompts_per_group + prompt_idx]
+            full_prompt = f"{system_prompt}\n\n{question}"
+            prompt_len = len(tokenizer.encode(full_prompt))
+
+            input_requests.append((full_prompt, prompt_len, output_len))
+            total_input_tokens += prompt_len
+            total_output_tokens += output_len
+
+    print(f"\nGenerated shared prefix dataset statistics:")
+    print(f"Number of groups: {num_groups}")
+    print(f"Prompts per group: {prompts_per_group}")
+    print(f"Total prompts: {len(input_requests)}")
+    print(f"Total input tokens: {total_input_tokens}")
+    print(f"Total output tokens: {total_output_tokens}")
+    print(
+        f"Average system prompt length: {sum(len(tokenizer.encode(sp)) for sp in system_prompts) / len(system_prompts):.1f} tokens"
+    )
+    print(
+        f"Average question length: {sum(len(tokenizer.encode(q)) for q in questions) / len(questions):.1f} tokens\n"
+    )
+
+    return input_requests
+
+
 async def get_request(
     input_requests: List[Tuple[str, int, int]],
     request_rate: float,
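For context, the new generator can also be driven directly. A minimal sketch, assuming a Hugging Face tokenizer is available locally ("gpt2" is only an illustrative choice, not something this diff prescribes):

```python
# Hypothetical standalone use of sample_generated_shared_prefix_requests;
# the tokenizer name is an assumption for illustration.
from transformers import AutoTokenizer

from sglang.bench_serving import sample_generated_shared_prefix_requests

tokenizer = AutoTokenizer.from_pretrained("gpt2")
requests = sample_generated_shared_prefix_requests(
    num_groups=4,
    prompts_per_group=2,
    system_prompt_len=256,
    question_len=32,
    output_len=64,
    tokenizer=tokenizer,
)
# 4 groups x 2 prompts -> 8 (prompt, prompt_len, output_len) tuples; the two
# prompts in each group share one random system prompt, which is what
# exercises prefix caching in the server under test.
assert len(requests) == 8
```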
{sglang-0.3.5 → sglang-0.3.5.post1}/sglang/bench_serving.py (continued)

@@ -1048,6 +1116,15 @@ def run_benchmark(args_: argparse.Namespace):
             tokenizer=tokenizer,
             dataset_path=args.dataset_path,
         )
+    elif args.dataset_name == "generated-shared-prefix":
+        input_requests = sample_generated_shared_prefix_requests(
+            num_groups=args.gen_num_groups,
+            prompts_per_group=args.gen_prompts_per_group,
+            system_prompt_len=args.gen_system_prompt_len,
+            question_len=args.gen_question_len,
+            output_len=args.gen_output_len,
+            tokenizer=tokenizer,
+        )
     else:
         raise ValueError(f"Unknown dataset: {args.dataset_name}")
 
@@ -1121,7 +1198,7 @@ if __name__ == "__main__":
         "--dataset-name",
         type=str,
         default="sharegpt",
-        choices=["sharegpt", "random"],
+        choices=["sharegpt", "random", "generated-shared-prefix"],
         help="Name of the dataset to benchmark on.",
     )
     parser.add_argument(
@@ -1208,5 +1285,38 @@ if __name__ == "__main__":
         help="Append given JSON object to the request payload. You can use this to specify"
         "additional generate params like sampling params.",
     )
+
+    group = parser.add_argument_group("generated-shared-prefix dataset arguments")
+    group.add_argument(
+        "--gen-num-groups",
+        type=int,
+        default=64,
+        help="Number of system prompt groups for generated-shared-prefix dataset",
+    )
+    group.add_argument(
+        "--gen-prompts-per-group",
+        type=int,
+        default=16,
+        help="Number of prompts per system prompt group for generated-shared-prefix dataset",
+    )
+    group.add_argument(
+        "--gen-system-prompt-len",
+        type=int,
+        default=2048,
+        help="Target length in tokens for system prompts in generated-shared-prefix dataset",
+    )
+    group.add_argument(
+        "--gen-question-len",
+        type=int,
+        default=128,
+        help="Target length in tokens for questions in generated-shared-prefix dataset",
+    )
+    group.add_argument(
+        "--gen-output-len",
+        type=int,
+        default=256,
+        help="Target length in tokens for outputs in generated-shared-prefix dataset",
+    )
+
     args = parser.parse_args()
     run_benchmark(args)
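With the defaults above, the workload size is easy to estimate; a quick sketch (the token counts are targets, since decoding random token ids and re-encoding them does not round-trip exactly):

```python
# Back-of-the-envelope size of the default generated-shared-prefix workload,
# using the argparse defaults shown above.
num_groups, prompts_per_group = 64, 16
system_prompt_len, question_len, output_len = 2048, 128, 256

total_prompts = num_groups * prompts_per_group                             # 1024
target_input_tokens = total_prompts * (system_prompt_len + question_len)   # 2,228,224
target_output_tokens = total_prompts * output_len                          # 262,144
```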
{sglang-0.3.5 → sglang-0.3.5.post1}/sglang/srt/configs/model_config.py

@@ -39,7 +39,7 @@ class ModelConfig:
         revision: Optional[str] = None,
         context_length: Optional[int] = None,
         model_override_args: Optional[dict] = None,
-        is_embedding: Optional[bool] = None
+        is_embedding: Optional[bool] = None,
     ) -> None:
         # Parse args
         self.model_override_args = json.loads(model_override_args)
@@ -52,7 +52,9 @@ class ModelConfig:
         self.hf_text_config = get_hf_text_config(self.hf_config)
 
         # Check model type
-        self.is_generation = is_generation_model(
+        self.is_generation = is_generation_model(
+            self.hf_config.architectures, is_embedding
+        )
         self.is_multimodal = is_multimodal_model(self.hf_config.architectures)
         self.is_encoder_decoder = is_encoder_decoder_model(self.hf_config.architectures)
 
@@ -208,6 +210,7 @@ def is_generation_model(model_architectures: List[str], is_embedding: bool = Fal
         or "MistralModel" in model_architectures
         or "LlamaForSequenceClassification" in model_architectures
         or "LlamaForSequenceClassificationWithNormal_Weights" in model_architectures
+        or "InternLM2ForRewardModel" in model_architectures
     ):
         return False
     else:
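The practical effect of the new entry is that InternLM2's reward model is routed down the non-generation path. A small sketch (the second assertion assumes the function's truncated fall-through branch returns True):

```python
# Sketch of the behavior change in sglang/srt/configs/model_config.py.
from sglang.srt.configs.model_config import is_generation_model

# The reward-model architecture is now classified as non-generation:
assert is_generation_model(["InternLM2ForRewardModel"]) is False
# Ordinary causal-LM architectures stay on the generation path (assumed
# from the else-branch, which this hunk truncates):
assert is_generation_model(["InternLM2ForCausalLM"]) is True
```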
sglang-0.3.5.post1/sglang/srt/constrained/__init__.py (new file)

@@ -0,0 +1,17 @@
+"""
+Copyright 2023-2024 SGLang Team
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+# TODO(lmzheng): make this an optional dependency
+from sglang.srt.constrained.outlines_backend import build_regex_from_object
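A small usage sketch of the re-exported helper (the pydantic model here is made up for illustration; per the fallback shim in outlines_backend.py below, the helper accepts a pydantic model class, a dict, or a JSON-schema string):

```python
# Illustrative only; the Answer model is not part of the diff.
from pydantic import BaseModel

from sglang.srt.constrained import build_regex_from_object


class Answer(BaseModel):
    verdict: bool
    score: int


regex = build_regex_from_object(Answer)  # regex matching JSON documents of this schema
```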
sglang-0.3.5.post1/sglang/srt/constrained/base_grammar_backend.py (new file)

@@ -0,0 +1,72 @@
+"""
+Copyright 2023-2024 SGLang Team
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+"""The baseclass of backends for grammar-guided constrained decoding."""
+
+from concurrent.futures import Future, ThreadPoolExecutor
+from dataclasses import dataclass
+from threading import Event, Lock
+from typing import Any, Optional, Tuple
+
+
+@dataclass
+class CacheEntry:
+    value: Any
+    event: Event
+
+
+class BaseGrammarObject:
+    pass
+
+
+class BaseGrammarBackend:
+    def __init__(self):
+        self.executor = ThreadPoolExecutor()
+        self.cache = {}
+        self.cache_lock = Lock()
+
+    def init_value(self, key: Tuple[str, str]) -> BaseGrammarObject:
+        with self.cache_lock:
+            if key in self.cache:
+                cache_hit = True
+                entry = self.cache[key]
+            else:
+                cache_hit = False
+                entry = CacheEntry(None, Event())
+                self.cache[key] = entry
+
+        if cache_hit:
+            entry.event.wait()
+        else:
+            entry.value = self.init_value_impl(key)
+            entry.event.set()
+        return entry.value.copy()
+
+    def init_value_impl(self, key: Tuple[str, str]) -> BaseGrammarObject:
+        raise NotImplementedError()
+
+    def get_cached_value(self, key: Tuple[str, str]) -> Optional[BaseGrammarObject]:
+        with self.cache_lock:
+            entry = self.cache.get(key)
+            if not entry or not entry.event.is_set():
+                return None
+            return self.cache[key].value.copy()
+
+    def get_future_value(self, key: Tuple[str, str]) -> Future:
+        return self.executor.submit(self.init_value, key)
+
+    def reset(self):
+        with self.cache_lock:
+            self.cache.clear()
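The base class implements a compile-once cache: the first caller for a key runs init_value_impl while concurrent callers block on the per-entry Event, and every caller receives its own copy() so per-request FSM state stays independent. A minimal hypothetical subclass showing the contract (EchoGrammar and EchoBackend are illustrative names, not part of the diff):

```python
# Hypothetical subclass illustrating the BaseGrammarBackend contract.
from sglang.srt.constrained.base_grammar_backend import (
    BaseGrammarBackend,
    BaseGrammarObject,
)


class EchoGrammar(BaseGrammarObject):
    def __init__(self, regex: str):
        self.regex = regex

    def copy(self):
        # init_value()/get_cached_value() hand out copies, so each request
        # mutates its own object, not the cached one.
        return EchoGrammar(self.regex)


class EchoBackend(BaseGrammarBackend):
    def init_value_impl(self, key):
        key_type, key_string = key  # e.g. ("regex", r"\d+")
        return EchoGrammar(key_string)


backend = EchoBackend()
future = backend.get_future_value(("regex", r"\d+"))  # compiled in the thread pool
grammar = future.result()                             # a fresh copy per caller
```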
sglang-0.3.5.post1/sglang/srt/constrained/outlines_backend.py (new file)

@@ -0,0 +1,165 @@
+"""
+Copyright 2023-2024 SGLang Team
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+"""Constrained decoding with outlines backend."""
+
+import json
+import logging
+from typing import Dict, List, Optional, Tuple, Union
+
+import torch
+from outlines.fsm.guide import RegexGuide
+from outlines.models.transformers import TransformerTokenizer
+
+from sglang.srt.constrained.base_grammar_backend import (
+    BaseGrammarBackend,
+    BaseGrammarObject,
+)
+from sglang.srt.constrained.outlines_jump_forward import OutlinesJumpForwardMap
+
+logger = logging.getLogger(__name__)
+
+
+try:
+    from outlines.fsm.json_schema import build_regex_from_object
+except ImportError:
+    # Since outlines 0.0.32, build_regex_from_object is replaced by build_regex_from_schema,
+    # which only accepts string schema as input.
+    from outlines.fsm.json_schema import build_regex_from_schema
+    from pydantic import BaseModel
+
+    def build_regex_from_object(
+        object: Union[str, BaseModel, Dict], whitespace_pattern: Optional[str] = None
+    ):
+        if isinstance(object, type(BaseModel)):
+            schema = json.dumps(object.model_json_schema())
+        elif isinstance(object, Dict):
+            schema = json.dumps(object)
+        else:
+            schema = object
+        return build_regex_from_schema(schema, whitespace_pattern)
+
+
+class OutlinesGrammar(BaseGrammarObject):
+    def __init__(
+        self,
+        guide: RegexGuide,
+        jump_forward_map: Union[OutlinesJumpForwardMap, None],
+    ) -> None:
+        self.guide = guide
+        self.jump_forward_map = jump_forward_map
+        self.state = 0
+
+    def accept_token(self, token: int):
+        self.state = self.guide.get_next_state(self.state, token)
+
+    def try_jump_forward(self, tokenizer) -> Optional[Tuple]:
+        if not self.jump_forward_map:
+            return None
+
+        jump_forward_bytes = self.jump_forward_map.jump_forward_byte(self.state)
+        if jump_forward_bytes is None or len(jump_forward_bytes) <= 1:
+            return None
+
+        # preprocess the jump forward string
+        suffix_bytes = []
+        continuation_range = range(0x80, 0xC0)
+        cur_state = self.state
+        while (
+            len(jump_forward_bytes) and jump_forward_bytes[0][0] in continuation_range
+        ):
+            # continuation bytes
+            byte_edge = jump_forward_bytes.pop(0)
+            suffix_bytes.append(byte_edge[0])
+            cur_state = byte_edge[1]
+
+        suffix_tokens = [f"<0x{hex(b)[2:].upper()}>" for b in suffix_bytes]
+        suffix_ids = tokenizer.convert_tokens_to_ids(suffix_tokens)
+        return suffix_ids, cur_state
+
+    def jump_forward_str_state(self, helper: Tuple[List[int], str]) -> Tuple[str, int]:
+        _, cur_state = helper
+        return self.jump_forward_map.jump_forward_symbol(cur_state)
+
+    def jump_and_retokenize(
+        self, old_output_ids: List[int], new_output_ids: List[int], next_state: int
+    ):
+        self.state = next_state
+
+    def fill_vocab_mask(self, vocab_mask: torch.Tensor):
+        vocab_mask.fill_(1)
+        vocab_mask[self.guide.get_next_instruction(self.state).tokens] = 0
+
+    def copy(self):
+        return OutlinesGrammar(self.guide, self.jump_forward_map)
+
+
+class OutlinesGrammarBackend(BaseGrammarBackend):
+    def __init__(
+        self,
+        tokenizer,
+        whitespace_pattern: bool,
+        allow_jump_forward: bool,
+    ):
+        super().__init__()
+
+        try:
+            self.outlines_tokenizer = TransformerTokenizer(tokenizer)
+        except AttributeError:
+            # FIXME: tmp fix for chatglm2 & chatglm3 (pad_token_id=0)
+            origin_pad_token_id = tokenizer.pad_token_id
+
+            def fset(self, value):
+                self._value = value
+
+            type(tokenizer).pad_token_id = property(
+                fget=type(tokenizer).pad_token_id.fget, fset=fset
+            )
+            self.outlines_tokenizer = TransformerTokenizer(tokenizer)
+            self.outlines_tokenizer.tokenizer.pad_token_id = origin_pad_token_id
+            self.outlines_tokenizer.pad_token_id = origin_pad_token_id
+            self.outlines_tokenizer.pad_token = (
+                self.outlines_tokenizer.tokenizer.pad_token
+            )
+            self.outlines_tokenizer.vocabulary = (
+                self.outlines_tokenizer.tokenizer.get_vocab()
+            )
+        self.allow_jump_forward = allow_jump_forward
+        self.whitespace_pattern = whitespace_pattern
+
+    def init_value_impl(self, key: Tuple[str, str]) -> OutlinesGrammar:
+        key_type, key_string = key
+        if key_type == "json":
+            try:
+                regex = build_regex_from_object(
+                    key_string,
+                    whitespace_pattern=self.whitespace_pattern,
+                )
+            except NotImplementedError as e:
+                logger.warning(
+                    f"skip invalid json schema: json_schema={key_string}, {e=}"
+                )
+                return None, key_string
+        elif key_type == "regex":
+            regex = key_string
+        else:
+            raise ValueError(f"Invalid key_type: {key_type}")
+
+        guide = RegexGuide(regex, self.outlines_tokenizer)
+        if self.allow_jump_forward:
+            jump_forward_map = OutlinesJumpForwardMap(regex)
+        else:
+            jump_forward_map = None
+        return OutlinesGrammar(guide, jump_forward_map)