sglang 0.2.14.post2.tar.gz → 0.3.0.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (126)
  1. {sglang-0.2.14.post2/sglang.egg-info → sglang-0.3.0}/PKG-INFO +9 -8
  2. {sglang-0.2.14.post2 → sglang-0.3.0}/README.md +8 -7
  3. {sglang-0.2.14.post2 → sglang-0.3.0}/pyproject.toml +1 -1
  4. {sglang-0.2.14.post2 → sglang-0.3.0}/sglang/api.py +2 -0
  5. {sglang-0.2.14.post2 → sglang-0.3.0}/sglang/bench_latency.py +39 -28
  6. {sglang-0.2.14.post2 → sglang-0.3.0}/sglang/lang/backend/runtime_endpoint.py +8 -4
  7. {sglang-0.2.14.post2 → sglang-0.3.0}/sglang/lang/interpreter.py +3 -0
  8. {sglang-0.2.14.post2 → sglang-0.3.0}/sglang/lang/ir.py +5 -0
  9. sglang-0.3.0/sglang/launch_server_llavavid.py +26 -0
  10. sglang-0.3.0/sglang/srt/configs/__init__.py +5 -0
  11. sglang-0.3.0/sglang/srt/configs/exaone.py +195 -0
  12. {sglang-0.2.14.post2 → sglang-0.3.0}/sglang/srt/constrained/fsm_cache.py +1 -1
  13. {sglang-0.2.14.post2 → sglang-0.3.0}/sglang/srt/conversation.py +24 -2
  14. {sglang-0.2.14.post2 → sglang-0.3.0}/sglang/srt/hf_transformers_utils.py +12 -12
  15. {sglang-0.2.14.post2 → sglang-0.3.0}/sglang/srt/layers/extend_attention.py +13 -8
  16. {sglang-0.2.14.post2 → sglang-0.3.0}/sglang/srt/layers/logits_processor.py +4 -4
  17. sglang-0.3.0/sglang/srt/layers/sampler.py +178 -0
  18. {sglang-0.2.14.post2 → sglang-0.3.0}/sglang/srt/managers/controller_multi.py +5 -5
  19. {sglang-0.2.14.post2 → sglang-0.3.0}/sglang/srt/managers/controller_single.py +5 -5
  20. {sglang-0.2.14.post2 → sglang-0.3.0}/sglang/srt/managers/io_struct.py +6 -1
  21. {sglang-0.2.14.post2 → sglang-0.3.0}/sglang/srt/managers/schedule_batch.py +26 -11
  22. {sglang-0.2.14.post2 → sglang-0.3.0}/sglang/srt/managers/tokenizer_manager.py +9 -9
  23. {sglang-0.2.14.post2 → sglang-0.3.0}/sglang/srt/managers/tp_worker.py +38 -26
  24. {sglang-0.2.14.post2 → sglang-0.3.0}/sglang/srt/model_config.py +3 -3
  25. {sglang-0.2.14.post2 → sglang-0.3.0}/sglang/srt/model_executor/cuda_graph_runner.py +26 -9
  26. {sglang-0.2.14.post2 → sglang-0.3.0}/sglang/srt/model_executor/forward_batch_info.py +68 -23
  27. {sglang-0.2.14.post2 → sglang-0.3.0}/sglang/srt/model_executor/model_runner.py +15 -22
  28. {sglang-0.2.14.post2 → sglang-0.3.0}/sglang/srt/models/chatglm.py +9 -15
  29. {sglang-0.2.14.post2 → sglang-0.3.0}/sglang/srt/models/commandr.py +5 -1
  30. {sglang-0.2.14.post2 → sglang-0.3.0}/sglang/srt/models/dbrx.py +5 -1
  31. {sglang-0.2.14.post2 → sglang-0.3.0}/sglang/srt/models/deepseek.py +5 -1
  32. {sglang-0.2.14.post2 → sglang-0.3.0}/sglang/srt/models/deepseek_v2.py +57 -25
  33. sglang-0.3.0/sglang/srt/models/exaone.py +368 -0
  34. {sglang-0.2.14.post2 → sglang-0.3.0}/sglang/srt/models/gemma.py +5 -1
  35. {sglang-0.2.14.post2 → sglang-0.3.0}/sglang/srt/models/gemma2.py +5 -1
  36. {sglang-0.2.14.post2 → sglang-0.3.0}/sglang/srt/models/gpt_bigcode.py +5 -1
  37. {sglang-0.2.14.post2 → sglang-0.3.0}/sglang/srt/models/grok.py +5 -1
  38. {sglang-0.2.14.post2 → sglang-0.3.0}/sglang/srt/models/internlm2.py +5 -1
  39. sglang-0.2.14.post2/sglang/srt/models/llama2.py → sglang-0.3.0/sglang/srt/models/llama.py +25 -45
  40. {sglang-0.2.14.post2 → sglang-0.3.0}/sglang/srt/models/llama_classification.py +34 -41
  41. {sglang-0.2.14.post2 → sglang-0.3.0}/sglang/srt/models/llama_embedding.py +7 -6
  42. {sglang-0.2.14.post2 → sglang-0.3.0}/sglang/srt/models/llava.py +8 -11
  43. {sglang-0.2.14.post2 → sglang-0.3.0}/sglang/srt/models/llavavid.py +5 -6
  44. {sglang-0.2.14.post2 → sglang-0.3.0}/sglang/srt/models/minicpm.py +5 -1
  45. {sglang-0.2.14.post2 → sglang-0.3.0}/sglang/srt/models/mistral.py +2 -3
  46. {sglang-0.2.14.post2 → sglang-0.3.0}/sglang/srt/models/mixtral.py +6 -2
  47. {sglang-0.2.14.post2 → sglang-0.3.0}/sglang/srt/models/mixtral_quant.py +5 -1
  48. {sglang-0.2.14.post2 → sglang-0.3.0}/sglang/srt/models/qwen.py +5 -2
  49. {sglang-0.2.14.post2 → sglang-0.3.0}/sglang/srt/models/qwen2.py +6 -2
  50. {sglang-0.2.14.post2 → sglang-0.3.0}/sglang/srt/models/qwen2_moe.py +5 -14
  51. {sglang-0.2.14.post2 → sglang-0.3.0}/sglang/srt/models/stablelm.py +5 -1
  52. {sglang-0.2.14.post2 → sglang-0.3.0}/sglang/srt/openai_api/adapter.py +16 -1
  53. {sglang-0.2.14.post2 → sglang-0.3.0}/sglang/srt/openai_api/protocol.py +5 -5
  54. {sglang-0.2.14.post2 → sglang-0.3.0}/sglang/srt/sampling/sampling_batch_info.py +75 -6
  55. {sglang-0.2.14.post2 → sglang-0.3.0}/sglang/srt/server.py +6 -6
  56. {sglang-0.2.14.post2 → sglang-0.3.0}/sglang/srt/utils.py +0 -3
  57. {sglang-0.2.14.post2 → sglang-0.3.0}/sglang/test/runners.py +1 -1
  58. {sglang-0.2.14.post2 → sglang-0.3.0}/sglang/test/test_programs.py +68 -0
  59. {sglang-0.2.14.post2 → sglang-0.3.0}/sglang/test/test_utils.py +4 -0
  60. {sglang-0.2.14.post2 → sglang-0.3.0}/sglang/utils.py +39 -0
  61. sglang-0.3.0/sglang/version.py +1 -0
  62. {sglang-0.2.14.post2 → sglang-0.3.0/sglang.egg-info}/PKG-INFO +9 -8
  63. {sglang-0.2.14.post2 → sglang-0.3.0}/sglang.egg-info/SOURCES.txt +4 -1
  64. sglang-0.2.14.post2/sglang/launch_server_llavavid.py +0 -26
  65. sglang-0.2.14.post2/sglang/srt/layers/sampler.py +0 -101
  66. sglang-0.2.14.post2/sglang/version.py +0 -1
  67. {sglang-0.2.14.post2 → sglang-0.3.0}/LICENSE +0 -0
  68. {sglang-0.2.14.post2 → sglang-0.3.0}/setup.cfg +0 -0
  69. {sglang-0.2.14.post2 → sglang-0.3.0}/sglang/__init__.py +0 -0
  70. {sglang-0.2.14.post2 → sglang-0.3.0}/sglang/bench_serving.py +0 -0
  71. {sglang-0.2.14.post2 → sglang-0.3.0}/sglang/check_env.py +0 -0
  72. {sglang-0.2.14.post2 → sglang-0.3.0}/sglang/global_config.py +0 -0
  73. {sglang-0.2.14.post2 → sglang-0.3.0}/sglang/lang/__init__.py +0 -0
  74. {sglang-0.2.14.post2 → sglang-0.3.0}/sglang/lang/backend/__init__.py +0 -0
  75. {sglang-0.2.14.post2 → sglang-0.3.0}/sglang/lang/backend/anthropic.py +0 -0
  76. {sglang-0.2.14.post2 → sglang-0.3.0}/sglang/lang/backend/base_backend.py +0 -0
  77. {sglang-0.2.14.post2 → sglang-0.3.0}/sglang/lang/backend/litellm.py +0 -0
  78. {sglang-0.2.14.post2 → sglang-0.3.0}/sglang/lang/backend/openai.py +0 -0
  79. {sglang-0.2.14.post2 → sglang-0.3.0}/sglang/lang/backend/vertexai.py +0 -0
  80. {sglang-0.2.14.post2 → sglang-0.3.0}/sglang/lang/chat_template.py +0 -0
  81. {sglang-0.2.14.post2 → sglang-0.3.0}/sglang/lang/choices.py +0 -0
  82. {sglang-0.2.14.post2 → sglang-0.3.0}/sglang/lang/compiler.py +0 -0
  83. {sglang-0.2.14.post2 → sglang-0.3.0}/sglang/lang/tracer.py +0 -0
  84. {sglang-0.2.14.post2 → sglang-0.3.0}/sglang/launch_server.py +0 -0
  85. {sglang-0.2.14.post2 → sglang-0.3.0}/sglang/srt/constrained/__init__.py +0 -0
  86. {sglang-0.2.14.post2 → sglang-0.3.0}/sglang/srt/constrained/base_tool_cache.py +0 -0
  87. {sglang-0.2.14.post2 → sglang-0.3.0}/sglang/srt/constrained/jump_forward.py +0 -0
  88. {sglang-0.2.14.post2 → sglang-0.3.0}/sglang/srt/layers/activation.py +0 -0
  89. {sglang-0.2.14.post2 → sglang-0.3.0}/sglang/srt/layers/decode_attention.py +0 -0
  90. {sglang-0.2.14.post2 → sglang-0.3.0}/sglang/srt/layers/fused_moe/__init__.py +0 -0
  91. {sglang-0.2.14.post2 → sglang-0.3.0}/sglang/srt/layers/fused_moe/fused_moe.py +0 -0
  92. {sglang-0.2.14.post2 → sglang-0.3.0}/sglang/srt/layers/fused_moe/layer.py +0 -0
  93. {sglang-0.2.14.post2 → sglang-0.3.0}/sglang/srt/layers/layernorm.py +0 -0
  94. {sglang-0.2.14.post2 → sglang-0.3.0}/sglang/srt/layers/pooler.py +0 -0
  95. {sglang-0.2.14.post2 → sglang-0.3.0}/sglang/srt/layers/prefill_attention.py +0 -0
  96. {sglang-0.2.14.post2 → sglang-0.3.0}/sglang/srt/layers/radix_attention.py +0 -0
  97. {sglang-0.2.14.post2 → sglang-0.3.0}/sglang/srt/managers/detokenizer_manager.py +0 -0
  98. {sglang-0.2.14.post2 → sglang-0.3.0}/sglang/srt/managers/policy_scheduler.py +0 -0
  99. {sglang-0.2.14.post2 → sglang-0.3.0}/sglang/srt/mem_cache/base_prefix_cache.py +0 -0
  100. {sglang-0.2.14.post2 → sglang-0.3.0}/sglang/srt/mem_cache/chunk_cache.py +0 -0
  101. {sglang-0.2.14.post2 → sglang-0.3.0}/sglang/srt/mem_cache/flush_cache.py +0 -0
  102. {sglang-0.2.14.post2 → sglang-0.3.0}/sglang/srt/mem_cache/memory_pool.py +0 -0
  103. {sglang-0.2.14.post2 → sglang-0.3.0}/sglang/srt/mem_cache/radix_cache.py +0 -0
  104. {sglang-0.2.14.post2 → sglang-0.3.0}/sglang/srt/mm_utils.py +0 -0
  105. {sglang-0.2.14.post2 → sglang-0.3.0}/sglang/srt/models/yivl.py +0 -0
  106. {sglang-0.2.14.post2 → sglang-0.3.0}/sglang/srt/sampling/penaltylib/__init__.py +0 -0
  107. {sglang-0.2.14.post2 → sglang-0.3.0}/sglang/srt/sampling/penaltylib/orchestrator.py +0 -0
  108. {sglang-0.2.14.post2 → sglang-0.3.0}/sglang/srt/sampling/penaltylib/penalizers/frequency_penalty.py +0 -0
  109. {sglang-0.2.14.post2 → sglang-0.3.0}/sglang/srt/sampling/penaltylib/penalizers/min_new_tokens.py +0 -0
  110. {sglang-0.2.14.post2 → sglang-0.3.0}/sglang/srt/sampling/penaltylib/penalizers/presence_penalty.py +0 -0
  111. {sglang-0.2.14.post2 → sglang-0.3.0}/sglang/srt/sampling/penaltylib/penalizers/repetition_penalty.py +0 -0
  112. {sglang-0.2.14.post2 → sglang-0.3.0}/sglang/srt/sampling/sampling_params.py +0 -0
  113. {sglang-0.2.14.post2 → sglang-0.3.0}/sglang/srt/server_args.py +0 -0
  114. {sglang-0.2.14.post2 → sglang-0.3.0}/sglang/test/run_eval.py +0 -0
  115. {sglang-0.2.14.post2 → sglang-0.3.0}/sglang/test/simple_eval_common.py +0 -0
  116. {sglang-0.2.14.post2 → sglang-0.3.0}/sglang/test/simple_eval_gpqa.py +0 -0
  117. {sglang-0.2.14.post2 → sglang-0.3.0}/sglang/test/simple_eval_humaneval.py +0 -0
  118. {sglang-0.2.14.post2 → sglang-0.3.0}/sglang/test/simple_eval_math.py +0 -0
  119. {sglang-0.2.14.post2 → sglang-0.3.0}/sglang/test/simple_eval_mgsm.py +0 -0
  120. {sglang-0.2.14.post2 → sglang-0.3.0}/sglang/test/simple_eval_mmlu.py +0 -0
  121. {sglang-0.2.14.post2 → sglang-0.3.0}/sglang/test/srt/sampling/penaltylib/utils.py +0 -0
  122. {sglang-0.2.14.post2 → sglang-0.3.0}/sglang/test/test_activation.py +0 -0
  123. {sglang-0.2.14.post2 → sglang-0.3.0}/sglang/test/test_layernorm.py +0 -0
  124. {sglang-0.2.14.post2 → sglang-0.3.0}/sglang.egg-info/dependency_links.txt +0 -0
  125. {sglang-0.2.14.post2 → sglang-0.3.0}/sglang.egg-info/requires.txt +0 -0
  126. {sglang-0.2.14.post2 → sglang-0.3.0}/sglang.egg-info/top_level.txt +0 -0
{sglang-0.2.14.post2/sglang.egg-info → sglang-0.3.0}/PKG-INFO

@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: sglang
- Version: 0.2.14.post2
+ Version: 0.3.0
  Summary: SGLang is yet another fast serving framework for large language models and vision language models.
  License: Apache License
  Version 2.0, January 2004
@@ -312,7 +312,7 @@ pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/
  ### Method 2: From source
  ```
  # Use the last release branch
- git clone -b v0.2.14.post2 https://github.com/sgl-project/sglang.git
+ git clone -b v0.3.0 https://github.com/sgl-project/sglang.git
  cd sglang

  pip install --upgrade pip
@@ -461,7 +461,7 @@ It supports streaming, vision, and most features of the Chat/Completions/Models/
  ```
  python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --port 30000 --tp 2
  ```
- - Add `--dp 2` to enable multi-GPU data parallelism. It can also be used together with tensor parallelism. Data parallelism is better for throughput if there is enough memory.
+ - Add `--dp 2` to enable multi-GPU data parallelism. Data parallelism is better for throughput if there is enough memory. It can also be used together with tensor parallelism. The following command uses 4 GPUs in total.
  ```
  python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --port 30000 --dp 2 --tp 2
  ```
@@ -489,13 +489,13 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
  ### Supported Models

  **Generative Models**
-
  - Llama / Llama 2 / Llama 3 / Llama 3.1
  - Mistral / Mixtral / Mistral NeMo
  - Gemma / Gemma 2
  - Qwen / Qwen 2 / Qwen 2 MoE
  - DeepSeek / DeepSeek 2
  - [LLaVA-OneVision](https://llava-vl.github.io/blog/2024-08-05-llava-onevision/)
+ - `python3 -m sglang.launch_server --model-path lmms-lab/llava-onevision-qwen2-7b-ov --port=30000 --chat-template=chatml-llava`
  - `python3 -m sglang.launch_server --model-path lmms-lab/llava-onevision-qwen2-72b-ov --port=30000 --tp-size=8 --chat-template=chatml-llava`
  - Query the server with the [OpenAI Vision API](https://platform.openai.com/docs/guides/vision). See examples at [test/srt/test_vision_openai_server.py](test/srt/test_vision_openai_server.py)
  - LLaVA 1.5 / 1.6 / NeXT
@@ -509,6 +509,7 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
  - Grok
  - ChatGLM
  - InternLM 2
+ - Exaone 3

  **Embedding Models**

@@ -636,7 +637,7 @@ print(state["answer_1"])
  #### More Examples

  Anthropic and VertexAI (Gemini) models are also supported.
- You can find more examples at [examples/quick_start](examples/quick_start).
+ You can find more examples at [examples/quick_start](examples/frontend_language/quick_start).

  ### Language Feature
  To begin with, import sglang.
@@ -649,7 +650,7 @@ You can implement your prompt flow in a function decorated by `sgl.function`.
  You can then invoke the function with `run` or `run_batch`.
  The system will manage the state, chat template, parallelism and batching for you.

- The complete code for the examples below can be found at [readme_examples.py](examples/usage/readme_examples.py)
+ The complete code for the examples below can be found at [readme_examples.py](examples/frontend_language/usage/readme_examples.py)

  #### Control Flow
  You can use any Python code within the function body, including control flow, nested function calls, and external libraries.
@@ -698,7 +699,7 @@ def image_qa(s, image_file, question):
  s += sgl.assistant(sgl.gen("answer", max_tokens=256)
  ```

- See also [srt_example_llava.py](examples/quick_start/srt_example_llava.py).
+ See also [srt_example_llava.py](examples/frontend_language/quick_start/local_example_llava_next.py).

  #### Constrained Decoding
  Use `regex` to specify a regular expression as a decoding constraint.
@@ -742,7 +743,7 @@ def character_gen(s, name):
  s += sgl.gen("json_output", max_tokens=256, regex=character_regex)
  ```

- See also [json_decode.py](examples/usage/json_decode.py) for an additional example of specifying formats with Pydantic models.
+ See also [json_decode.py](examples/frontend_language/usage/json_decode.py) for an additional example of specifying formats with Pydantic models.

  #### Batching
  Use `run_batch` to run a batch of requests with continuous batching.
{sglang-0.2.14.post2 → sglang-0.3.0}/README.md

@@ -56,7 +56,7 @@ pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.4/
  ### Method 2: From source
  ```
  # Use the last release branch
- git clone -b v0.2.14.post2 https://github.com/sgl-project/sglang.git
+ git clone -b v0.3.0 https://github.com/sgl-project/sglang.git
  cd sglang

  pip install --upgrade pip
@@ -205,7 +205,7 @@ It supports streaming, vision, and most features of the Chat/Completions/Models/
  ```
  python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --port 30000 --tp 2
  ```
- - Add `--dp 2` to enable multi-GPU data parallelism. It can also be used together with tensor parallelism. Data parallelism is better for throughput if there is enough memory.
+ - Add `--dp 2` to enable multi-GPU data parallelism. Data parallelism is better for throughput if there is enough memory. It can also be used together with tensor parallelism. The following command uses 4 GPUs in total.
  ```
  python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct --port 30000 --dp 2 --tp 2
  ```
@@ -233,13 +233,13 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
  ### Supported Models

  **Generative Models**
-
  - Llama / Llama 2 / Llama 3 / Llama 3.1
  - Mistral / Mixtral / Mistral NeMo
  - Gemma / Gemma 2
  - Qwen / Qwen 2 / Qwen 2 MoE
  - DeepSeek / DeepSeek 2
  - [LLaVA-OneVision](https://llava-vl.github.io/blog/2024-08-05-llava-onevision/)
+ - `python3 -m sglang.launch_server --model-path lmms-lab/llava-onevision-qwen2-7b-ov --port=30000 --chat-template=chatml-llava`
  - `python3 -m sglang.launch_server --model-path lmms-lab/llava-onevision-qwen2-72b-ov --port=30000 --tp-size=8 --chat-template=chatml-llava`
  - Query the server with the [OpenAI Vision API](https://platform.openai.com/docs/guides/vision). See examples at [test/srt/test_vision_openai_server.py](test/srt/test_vision_openai_server.py)
  - LLaVA 1.5 / 1.6 / NeXT
@@ -253,6 +253,7 @@ python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3-8B-Instruct
  - Grok
  - ChatGLM
  - InternLM 2
+ - Exaone 3

  **Embedding Models**

@@ -380,7 +381,7 @@ print(state["answer_1"])
  #### More Examples

  Anthropic and VertexAI (Gemini) models are also supported.
- You can find more examples at [examples/quick_start](examples/quick_start).
+ You can find more examples at [examples/quick_start](examples/frontend_language/quick_start).

  ### Language Feature
  To begin with, import sglang.
@@ -393,7 +394,7 @@ You can implement your prompt flow in a function decorated by `sgl.function`.
  You can then invoke the function with `run` or `run_batch`.
  The system will manage the state, chat template, parallelism and batching for you.

- The complete code for the examples below can be found at [readme_examples.py](examples/usage/readme_examples.py)
+ The complete code for the examples below can be found at [readme_examples.py](examples/frontend_language/usage/readme_examples.py)

  #### Control Flow
  You can use any Python code within the function body, including control flow, nested function calls, and external libraries.
@@ -442,7 +443,7 @@ def image_qa(s, image_file, question):
  s += sgl.assistant(sgl.gen("answer", max_tokens=256)
  ```

- See also [srt_example_llava.py](examples/quick_start/srt_example_llava.py).
+ See also [srt_example_llava.py](examples/frontend_language/quick_start/local_example_llava_next.py).

  #### Constrained Decoding
  Use `regex` to specify a regular expression as a decoding constraint.
@@ -486,7 +487,7 @@ def character_gen(s, name):
  s += sgl.gen("json_output", max_tokens=256, regex=character_regex)
  ```

- See also [json_decode.py](examples/usage/json_decode.py) for an additional example of specifying formats with Pydantic models.
+ See also [json_decode.py](examples/frontend_language/usage/json_decode.py) for an additional example of specifying formats with Pydantic models.

  #### Batching
  Use `run_batch` to run a batch of requests with continuous batching.
{sglang-0.2.14.post2 → sglang-0.3.0}/pyproject.toml

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"

  [project]
  name = "sglang"
- version = "0.2.14.post2"
+ version = "0.3.0"
  description = "SGLang is yet another fast serving framework for large language models and vision language models."
  readme = "README.md"
  requires-python = ">=3.8"
{sglang-0.2.14.post2 → sglang-0.3.0}/sglang/api.py

@@ -78,6 +78,7 @@ def gen(
  choices: Optional[List[str]] = None,
  choices_method: Optional[ChoicesSamplingMethod] = None,
  regex: Optional[str] = None,
+ json_schema: Optional[str] = None,
  ):
  """Call the model to generate. See the meaning of the arguments in docs/en/sampling_params.md"""

@@ -114,6 +115,7 @@ def gen(
  return_text_in_logprobs,
  dtype,
  regex,
+ json_schema,
  )

{sglang-0.2.14.post2 → sglang-0.3.0}/sglang/bench_latency.py

@@ -11,26 +11,34 @@ python -m sglang.bench_latency --model-path meta-llama/Meta-Llama-3-8B-Instruct
  ## plot the results in series of lines:
  python -m sglang.bench_latency --result-filename out.jsonl --graph-sql="select run_name, batch_size, prefill_throughput from results"

-
  # Usage (correctness test):
  python -m sglang.bench_latency --model-path TinyLlama/TinyLlama-1.1B-Chat-v0.4 --correct

  ## Reference output (of the correctness test above, can be gpu dependent):
- prefill logits (first half) tensor([[-10.0312, -9.5000, 0.8936, ..., -4.9414, -3.2402, -3.3633],
- [-10.0312, -9.5000, 0.8936, ..., -4.9414, -3.2402, -3.3633],
- [ -9.1875, -10.2500, 2.7109, ..., -4.3359, -4.0664, -4.1328]],
- device='cuda:0', dtype=torch.float16)
- prefill logits (final) tensor([[-8.3203, -7.1211, 3.3379, ..., -4.9570, -4.1328, -3.4141],
- [-8.9062, -9.0156, 4.1445, ..., -4.9922, -4.4961, -4.0742],
- [-9.6328, -9.0547, 4.0117, ..., -5.3047, -4.7148, -4.4609]],
- device='cuda:0', dtype=torch.float16)
- <s> The capital of France is.
+ input_ids=[[1, 450, 7483, 310, 3444, 338], [1, 450, 7483, 310, 278, 3303, 13187, 290, 338], [1, 20628, 338, 263, 6575, 1460, 2462, 322, 306, 763]]
+
+ prefill logits (first half): tensor([[-10.0312, -9.5000, 0.8931, ..., -4.9414, -3.2422, -3.3633],
+ [-10.0312, -9.5000, 0.8931, ..., -4.9414, -3.2422, -3.3633],
+ [ -9.1875, -10.2500, 2.7129, ..., -4.3359, -4.0664, -4.1328]],
+ device='cuda:0')
+
+ prefill logits (final): tensor([[-8.3125, -7.1172, 3.3457, ..., -4.9570, -4.1328, -3.4141],
+ [-8.9141, -9.0156, 4.1445, ..., -4.9922, -4.4961, -4.0781],
+ [-9.6328, -9.0547, 4.0195, ..., -5.3047, -4.7148, -4.4570]],
+ device='cuda:0')
+
+ ========== Prompt 0 ==========
+ <s> The capital of France is Paris.
  The capital of the United States is Washington, D.C.

- <s> The capital of the United Kindom is.
+
+ ========== Prompt 1 ==========
+ <s> The capital of the United Kindom is London.
  The capital of the United Kingdom is London.
  The capital of the
- <s> Today is a sunny day and I like go for a walk in the park.
+
+ ========== Prompt 2 ==========
+ <s> Today is a sunny day and I like to go for a walk in the park.
  I'm going to the park
  """

@@ -200,16 +208,16 @@ def extend(reqs, model_runner):
  tree_cache=None,
  )
  batch.prepare_for_extend(model_runner.model_config.vocab_size)
- output = model_runner.forward(batch, ForwardMode.EXTEND)
- next_token_ids = batch.sample(output.next_token_logits)
- return next_token_ids, output.next_token_logits, batch
+ sample_output, logits_output = model_runner.forward(batch, ForwardMode.EXTEND)
+ next_token_ids = sample_output.batch_next_token_ids.tolist()
+ return next_token_ids, logits_output.next_token_logits, batch


  def decode(input_token_ids, batch, model_runner):
- batch.prepare_for_decode(input_token_ids.cpu().numpy())
- output = model_runner.forward(batch, ForwardMode.DECODE)
- next_token_ids = batch.sample(output.next_token_logits)
- return next_token_ids, output.next_token_logits
+ batch.prepare_for_decode(input_token_ids)
+ sample_output, logits_output = model_runner.forward(batch, ForwardMode.DECODE)
+ next_token_ids = sample_output.batch_next_token_ids.tolist()
+ return next_token_ids, logits_output.next_token_logits


  @torch.inference_mode()
@@ -225,12 +233,12 @@ def correctness_test(

  # Prepare inputs
  input_ids, reqs = prepare_inputs_for_correctness_test(bench_args, tokenizer)
- rank_print(f"{input_ids=}")
+ rank_print(f"\n{input_ids=}\n")

  if bench_args.cut_len > 0:
  # Prefill
  next_token_ids, next_token_logits, batch = extend(reqs, model_runner)
- rank_print("prefill logits (first half)", next_token_logits)
+ rank_print(f"prefill logits (first half): {next_token_logits} \n")

  # Prepare extend inputs
  reqs = prepare_extend_inputs_for_correctness_test(
@@ -239,7 +247,7 @@

  # Extend
  next_token_ids, next_token_logits, batch = extend(reqs, model_runner)
- rank_print("prefill logits (final)", next_token_logits)
+ rank_print(f"prefill logits (final): {next_token_logits} \n")

  # Decode
  output_ids = [input_ids[i] + [next_token_ids[i]] for i in range(len(input_ids))]
@@ -250,7 +258,8 @@

  # Print
  for i in range(len(reqs)):
- rank_print(tokenizer.decode(output_ids[i]))
+ rank_print(f"========== Prompt {i} ==========")
+ rank_print(tokenizer.decode(output_ids[i]), "\n")


  @torch.inference_mode()
@@ -292,6 +301,7 @@ def latency_test_run_once(
  measurement_results["prefill_throughput"] = throughput

  # Decode
+ decode_latencies = []
  for i in range(output_len):
  torch.cuda.synchronize()
  tic = time.time()
@@ -300,17 +310,18 @@
  latency = time.time() - tic
  tot_latency += latency
  throughput = batch_size / latency
+ decode_latencies.append(latency)
  if i < 5:
  rank_print(
  f"Decode. latency: {latency:6.5f} s, throughput: {throughput:9.2f} token/s"
  )
- avg_decode_latency = (tot_latency - prefill_latency) / output_len
- avg_decode_throughput = batch_size / avg_decode_latency
+ med_decode_latency = np.median(decode_latencies)
+ med_decode_throughput = batch_size / med_decode_latency
  rank_print(
- f"Decode. avg latency: {avg_decode_latency:6.5f} s, avg throughput: {avg_decode_throughput:9.2f} token/s"
+ f"Decode. median latency: {med_decode_latency:6.5f} s, median throughput: {med_decode_throughput:9.2f} token/s"
  )
- measurement_results["avg_decode_latency"] = avg_decode_latency
- measurement_results["avg_decode_throughput"] = avg_decode_throughput
+ measurement_results["median_decode_latency"] = med_decode_latency
+ measurement_results["median_decode_throughput"] = med_decode_throughput

  throughput = (input_len + output_len) * batch_size / tot_latency
  rank_print(
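
The last two hunks above replace the mean decode latency with a median over the per-step latencies collected in `decode_latencies`. A self-contained sketch of the new calculation; the latency values are invented for illustration:

```
import numpy as np

batch_size = 8
decode_latencies = [0.021, 0.020, 0.020, 0.019, 0.045]  # seconds per decode step

# 0.3.0: report the median step latency, which is robust to occasional slow steps.
med_decode_latency = float(np.median(decode_latencies))
med_decode_throughput = batch_size / med_decode_latency
print(
    f"Decode. median latency: {med_decode_latency:6.5f} s, "
    f"median throughput: {med_decode_throughput:9.2f} token/s"
)
```
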
{sglang-0.2.14.post2 → sglang-0.3.0}/sglang/lang/backend/runtime_endpoint.py

@@ -4,7 +4,7 @@ from typing import List, Optional

  from sglang.global_config import global_config
  from sglang.lang.backend.base_backend import BaseBackend
- from sglang.lang.chat_template import get_chat_template_by_model_path
+ from sglang.lang.chat_template import get_chat_template, get_chat_template_by_model_path
  from sglang.lang.choices import ChoicesDecision, ChoicesSamplingMethod
  from sglang.lang.interpreter import StreamExecutor
  from sglang.lang.ir import (
@@ -23,6 +23,7 @@ class RuntimeEndpoint(BaseBackend):
  base_url: str,
  api_key: Optional[str] = None,
  verify: Optional[str] = None,
+ chat_template_name: Optional[str] = None,
  ):
  super().__init__()
  self.support_concate_and_append = True
@@ -39,9 +40,12 @@ class RuntimeEndpoint(BaseBackend):
  self._assert_success(res)
  self.model_info = res.json()

- self.chat_template = get_chat_template_by_model_path(
- self.model_info["model_path"]
- )
+ if chat_template_name:
+ self.chat_template = get_chat_template(chat_template_name)
+ else:
+ self.chat_template = get_chat_template_by_model_path(
+ self.model_info["model_path"]
+ )

  def get_model_name(self):
  return self.model_info["model_path"]
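
The new `chat_template_name` parameter lets a client pin a chat template explicitly instead of having it inferred from the server's model path. A short sketch; the URL is a placeholder and `chatml-llava` is one of the template names mentioned elsewhere in this diff:

```
import sglang as sgl
from sglang.lang.backend.runtime_endpoint import RuntimeEndpoint

# Previously the template was always derived from model_info["model_path"];
# now it can be overridden explicitly at connection time.
backend = RuntimeEndpoint("http://localhost:30000", chat_template_name="chatml-llava")
sgl.set_default_backend(backend)
```
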
{sglang-0.2.14.post2 → sglang-0.3.0}/sglang/lang/interpreter.py

@@ -673,6 +673,7 @@ class StreamExecutor:
  "return_text_in_logprobs",
  "dtype",
  "regex",
+ "json_schema",
  ]:
  value = getattr(sampling_params, item, None)
  if value is not None:
@@ -854,6 +855,8 @@ class ProgramState:
  return self.stream_executor.get_meta_info(name)

  def __iadd__(self, other):
+ if other is None:
+ raise ValueError("Tried to append None to state.")
  self.stream_executor.submit(other)
  return self

{sglang-0.2.14.post2 → sglang-0.3.0}/sglang/lang/ir.py

@@ -30,6 +30,7 @@ class SglSamplingParams:
  logprob_start_len: Optional[int] = (None,)
  top_logprobs_num: Optional[int] = (None,)
  return_text_in_logprobs: Optional[bool] = (None,)
+ json_schema: Optional[str] = None

  # for constrained generation, not included in to_xxx_kwargs
  dtype: Optional[str] = None
@@ -51,6 +52,7 @@ class SglSamplingParams:
  self.logprob_start_len,
  self.top_logprobs_num,
  self.return_text_in_logprobs,
+ self.json_schema,
  )

  def to_openai_kwargs(self):
@@ -121,6 +123,7 @@ class SglSamplingParams:
  "presence_penalty": self.presence_penalty,
  "ignore_eos": self.ignore_eos,
  "regex": self.regex,
+ "json_schema": self.json_schema,
  }


@@ -425,6 +428,7 @@ class SglGen(SglExpr):
  return_text_in_logprobs: Optional[bool] = None,
  dtype: Optional[type] = None,
  regex: Optional[str] = None,
+ json_schema: Optional[str] = None,
  ):
  """Call the model to generate. See the meaning of the arguments in docs/en/sampling_params.md"""
  super().__init__()
@@ -446,6 +450,7 @@ class SglGen(SglExpr):
  return_text_in_logprobs=return_text_in_logprobs,
  dtype=dtype,
  regex=regex,
+ json_schema=json_schema,
  )

  def __repr__(self):
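
Together, the `api.py`, `interpreter.py`, and `ir.py` hunks above thread a new `json_schema` argument from the frontend `sgl.gen` call down to the constrained-decoding backend (see the `fsm_cache.py` hunk at the end of this diff). A minimal sketch of how a program might use it; the schema, function name, and endpoint URL are illustrative rather than taken from this release:

```
import json

import sglang as sgl

# Hypothetical schema; any JSON schema string should work here.
character_schema = json.dumps({
    "type": "object",
    "properties": {"name": {"type": "string"}, "house": {"type": "string"}},
    "required": ["name", "house"],
})

@sgl.function
def character_gen(s, name):
    s += name + " is a character in Harry Potter. Fill in the following information.\n"
    # New in 0.3.0: constrain generation with a JSON schema instead of a hand-written regex.
    s += sgl.gen("json_output", max_tokens=256, json_schema=character_schema)

# Assumes an SGLang server is already running locally.
sgl.set_default_backend(sgl.RuntimeEndpoint("http://localhost:30000"))
state = character_gen.run(name="Hermione Granger")
print(state["json_output"])
```
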
sglang-0.3.0/sglang/launch_server_llavavid.py

@@ -0,0 +1,26 @@
+ """Launch the inference server for Llava-video model."""
+
+ import argparse
+
+ from sglang.srt.server import ServerArgs, launch_server
+
+ if __name__ == "__main__":
+ parser = argparse.ArgumentParser()
+ ServerArgs.add_cli_args(parser)
+ args = parser.parse_args()
+ server_args = ServerArgs.from_cli_args(args)
+
+ model_override_args = {}
+ model_override_args["mm_spatial_pool_stride"] = 2
+ model_override_args["architectures"] = ["LlavaVidForCausalLM"]
+ model_override_args["num_frames"] = 16
+ model_override_args["model_type"] = "llavavid"
+ if model_override_args["num_frames"] == 32:
+ model_override_args["rope_scaling"] = {"factor": 2.0, "type": "linear"}
+ model_override_args["max_sequence_length"] = 4096 * 2
+ model_override_args["tokenizer_model_max_length"] = 4096 * 2
+ model_override_args["model_max_length"] = 4096 * 2
+ if "34b" in args.model_path.lower():
+ model_override_args["image_token_index"] = 64002
+
+ launch_server(server_args, model_override_args, None)
sglang-0.3.0/sglang/srt/configs/__init__.py

@@ -0,0 +1,5 @@
+ from sglang.srt.configs.exaone import ExaoneConfig
+
+ __all__ = [
+ "ExaoneConfig",
+ ]
sglang-0.3.0/sglang/srt/configs/exaone.py

@@ -0,0 +1,195 @@
+ # coding=utf-8
+ # Copyright 2024 The LG AI Research EXAONE Lab. All rights reserved.
+ # Copyright 2024 The LG CNS AI Engineering Team.
+ # Copyright 2023-2024 SGLang Team.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ """ EXAONE model configuration """
+ from typing import Any, Dict
+
+ from transformers.configuration_utils import PretrainedConfig
+ from transformers.utils import logging
+
+ logger = logging.get_logger(__name__)
+
+ EXAONE_PRETRAINED_CONFIG_ARCHIVE_MAP: Dict[str, Any] = {}
+
+
+ # ruff: noqa: E501
+ class ExaoneConfig(PretrainedConfig):
+ r"""
+ This is the configuration class to store the configuration of a :class:`~transformers.ExaoneModel`. It is used to
+ instantiate a EXAONE model according to the specified arguments, defining the model architecture. Instantiating a
+ configuration with the defaults will yield a similar configuration to that of the Exaone
+
+ Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used to control the model
+ outputs. Read the documentation from :class:`~transformers.PretrainedConfig` for more information.
+
+
+ Args:
+ vocab_size (:obj:`int`, `optional`, defaults to 102400):
+ Vocabulary size of the EXAONE model. Defines the number of different tokens that can be represented by the
+ :obj:`inputs_ids` passed when calling :class:`~transformers.ExaoneModel`. Vocabulary size of the model.
+ Defines the different tokens that can be represented by the `inputs_ids` passed to the forward method of
+ :class:`~transformers.EXAONEModel`.
+ max_position_embeddings (:obj:`int`, `optional`, defaults to 2048):
+ The maximum sequence length that this model might ever be used with. Typically set this to something large
+ just in case (e.g., 512 or 1024 or 2048).
+ hidden_size (:obj:`int`, `optional`, defaults to 2048):
+ Dimensionality of the encoder layers and the pooler layer.
+ num_layers (:obj:`int`, `optional`, defaults to 32):
+ Number of hidden layers in the Transformer encoder.
+ num_attention_heads (:obj:`int`, `optional`, defaults to 32):
+ Number of attention heads for each attention layer in the Transformer decoder.
+ num_key_value_heads (:obj:`int`, `optional`):
+ This is the number of key_value heads that should be used to implement Grouped Query Attention. If
+ `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
+ `num_key_value_heads=1 the model will use Multi Query Attention (MQA) otherwise GQA is used. When
+ converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
+ by meanpooling all the original heads within that group. For more details checkout [this
+ paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to
+ `num_attention_heads`.
+ intermediate_size (:obj:`int`, `optional`, defaults to `hidden_size * 4`):
+ Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
+ activation_function (:obj:`str` or :obj:`function`, `optional`, defaults to :obj:`"silu"`):
+ The non-linear activation function (function or string) in the decoder.
+ rope_theta (:obj:`float`, `optional`, defaults to 10000.0):
+ The base period of the RoPE embeddings.
+ rope_scaling (:obj:`Dict`, `optional`):
+ Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply new rope type
+ and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value
+ accordingly.
+ Expected contents:
+ `rope_type` (:obj:`str`):
+ The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope',
+ 'llama3'], with 'default' being the original RoPE implementation.
+ `factor` (:obj:`float`, `optional`):
+ Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In
+ most scaling types, a `factor` of x will enable the model to handle sequences of length x *
+ original maximum pre-trained length.
+ `original_max_position_embeddings` (:obj:`int`, `optional`):
+ Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during
+ pretraining.
+ `attention_factor` (:obj:`float`, `optional`):
+ Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention
+ computation. If unspecified, it defaults to value recommended by the implementation, using the
+ `factor` field to infer the suggested value.
+ `beta_fast` (:obj:`float`, `optional`):
+ Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear
+ ramp function. If unspecified, it defaults to 32.
+ `beta_slow` (:obj:`float`, `optional`):
+ Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear
+ ramp function. If unspecified, it defaults to 1.
+ `short_factor` (:obj:`List[float]`, `optional`):
+ Only used with 'longrope'. The scaling factor to be applied to short contexts (<
+ `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
+ size divided by the number of attention heads divided by 2
+ `long_factor` (:obj:`List[float]`, `optional`):
+ Only used with 'longrope'. The scaling factor to be applied to long contexts (<
+ `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
+ size divided by the number of attention heads divided by 2
+ `low_freq_factor` (:obj:`float`, `optional`):
+ Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE
+ `high_freq_factor` (:obj:`float`, `optional`):
+ Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE
+ embed_dropout (:obj:`float`, `optional`, defaults to 0.0):
+ The dropout probabilitiy for all fully connected layers in the embeddings, encoder, and pooler.
+ attention_dropout (:obj:`float`, `optional`, defaults to 0.0):
+ The dropout ratio for the attention probabilities.
+ layer_norm_epsilon (:obj:`float`, `optional`, defaults to 1e-5):
+ The epsilon used by the layer normalization layers.
+ initializer_range (:obj:`float`, `optional`, defaults to 0.02):
+ The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+ use_cache (:obj:`bool`, `optional`, defaults to :obj:`True`):
+ Whether or not the model should return the last key/values attentions (not used by all models). Only
+ relevant if ``configs.is_decoder=True``.
+ bos_token_id (:obj:`int`, `optional`, defaults to 0):
+ Beginning of stream token id.
+ eos_token_id (:obj:`int`, `optional`, defaults to 2):
+ End of stream token id.
+ tie_word_embeddings (:obj:`bool`, `optional`, defaults to :obj:`True`):
+ Whether to tie weight embeddings
+ gradient_checkpointing (:obj:`bool`, `optional`, defaults to :obj:`False`):
+ If True, use gradient checkpointing to save memory at the expense of slower backward pass.
+
+ Example::
+
+ >>> from transformers import EXAONEModel, ExaoneConfig
+
+ >>> # Initializing a EXAONE configuration
+ >>> configuration = ExaoneConfig()
+
+ >>> # Initializing a model from configuration
+ >>> model = EXAONEModel(configuration)
+
+ >>> # Accessing the model configuration
+ >>> configuration = model.configs
+ """
+
+ model_type = "exaone"
+ keys_to_ignore_at_inference = ["past_key_values"]
+ attribute_map = {"num_hidden_layers": "num_layers"}
+
+ def __init__(
+ self,
+ vocab_size=102400,
+ max_position_embeddings=2048,
+ hidden_size=2048,
+ num_layers=32,
+ num_attention_heads=32,
+ num_key_value_heads=None,
+ intermediate_size=None,
+ activation_function="silu",
+ rope_theta=10000.0,
+ rope_scaling=None,
+ embed_dropout=0.0,
+ attention_dropout=0.0,
+ layer_norm_epsilon=1e-5,
+ initializer_range=0.02,
+ use_cache=True,
+ bos_token_id=0,
+ eos_token_id=2,
+ tie_word_embeddings=True,
+ **kwargs
+ ):
+ self.vocab_size = vocab_size
+ self.max_position_embeddings = max_position_embeddings
+ self.hidden_size = hidden_size
+ self.num_layers = num_layers
+ self.num_attention_heads = num_attention_heads
+ self.num_hidden_layers = num_layers
+ if num_key_value_heads is None:
+ num_key_value_heads = num_attention_heads
+ self.num_key_value_heads = num_key_value_heads
+ if intermediate_size:
+ self.intermediate_size = intermediate_size
+ else:
+ self.intermediate_size = hidden_size * 4
+ self.activation_function = activation_function
+ self.embed_dropout = embed_dropout
+ self.attention_dropout = attention_dropout
+ self.layer_norm_epsilon = layer_norm_epsilon
+ self.initializer_range = initializer_range
+ self.use_cache = use_cache
+ self.rope_theta = rope_theta
+ self.rope_scaling = rope_scaling
+
+ self.bos_token_id = bos_token_id
+ self.eos_token_id = eos_token_id
+
+ super().__init__(
+ bos_token_id=bos_token_id,
+ eos_token_id=eos_token_id,
+ tie_word_embeddings=tie_word_embeddings,
+ **kwargs
+ )
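
The constructor above derives a few fields from its arguments in the usual Llama-style way. A brief sketch that exercises those defaults; the sizes are arbitrary and only meant to illustrate the fallbacks in the code above:

```
from sglang.srt.configs import ExaoneConfig

cfg = ExaoneConfig(hidden_size=4096, num_layers=32, num_attention_heads=32)

assert cfg.intermediate_size == 4096 * 4  # falls back to hidden_size * 4
assert cfg.num_key_value_heads == 32      # defaults to num_attention_heads (MHA)
assert cfg.num_hidden_layers == 32        # mirrored from num_layers
```
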
{sglang-0.2.14.post2 → sglang-0.3.0}/sglang/srt/constrained/fsm_cache.py

@@ -79,7 +79,7 @@ class FSMCache(BaseToolCache):

  def init_value(self, value):
  if self.json_schema_mode:
- regex = build_regex_from_schema(value)
+ regex = build_regex_from_schema(value, whitespace_pattern=r"[\n\t ]*")
  return RegexGuide(regex, self.outlines_tokenizer), regex
  else:
  return RegexGuide(value, self.outlines_tokenizer)
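
Passing `whitespace_pattern` restricts the whitespace the schema-derived regex accepts between JSON tokens, which keeps the compiled guide smaller. A rough before/after sketch; the import path assumes the outlines helper that `sglang.srt.constrained` re-exports:

```
import json

from outlines.fsm.json_schema import build_regex_from_schema  # assumed import path

schema = json.dumps({"type": "object", "properties": {"name": {"type": "string"}}})

loose_regex = build_regex_from_schema(schema)  # 0.2.x: default whitespace handling
tight_regex = build_regex_from_schema(
    schema, whitespace_pattern=r"[\n\t ]*"  # 0.3.0: spaces, tabs, newlines only
)
```
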