sglang 0.1.16__py3-none-any.whl → 0.1.18__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (68)
  1. sglang/__init__.py +3 -1
  2. sglang/api.py +7 -7
  3. sglang/backend/anthropic.py +1 -1
  4. sglang/backend/litellm.py +90 -0
  5. sglang/backend/openai.py +158 -11
  6. sglang/backend/runtime_endpoint.py +18 -10
  7. sglang/bench_latency.py +299 -0
  8. sglang/global_config.py +12 -2
  9. sglang/lang/compiler.py +2 -2
  10. sglang/lang/interpreter.py +114 -67
  11. sglang/lang/ir.py +28 -3
  12. sglang/launch_server.py +4 -1
  13. sglang/launch_server_llavavid.py +2 -1
  14. sglang/srt/constrained/__init__.py +13 -6
  15. sglang/srt/constrained/fsm_cache.py +8 -2
  16. sglang/srt/constrained/jump_forward.py +113 -25
  17. sglang/srt/conversation.py +2 -0
  18. sglang/srt/flush_cache.py +3 -1
  19. sglang/srt/hf_transformers_utils.py +130 -1
  20. sglang/srt/layers/extend_attention.py +17 -0
  21. sglang/srt/layers/fused_moe.py +582 -0
  22. sglang/srt/layers/logits_processor.py +65 -32
  23. sglang/srt/layers/radix_attention.py +41 -7
  24. sglang/srt/layers/token_attention.py +16 -1
  25. sglang/srt/managers/controller/dp_worker.py +113 -0
  26. sglang/srt/managers/{router → controller}/infer_batch.py +242 -100
  27. sglang/srt/managers/controller/manager_multi.py +191 -0
  28. sglang/srt/managers/{router/manager.py → controller/manager_single.py} +34 -14
  29. sglang/srt/managers/{router → controller}/model_runner.py +262 -158
  30. sglang/srt/managers/{router → controller}/radix_cache.py +11 -1
  31. sglang/srt/managers/{router/scheduler.py → controller/schedule_heuristic.py} +9 -7
  32. sglang/srt/managers/{router/model_rpc.py → controller/tp_worker.py} +298 -267
  33. sglang/srt/managers/detokenizer_manager.py +42 -46
  34. sglang/srt/managers/io_struct.py +22 -12
  35. sglang/srt/managers/tokenizer_manager.py +151 -87
  36. sglang/srt/model_config.py +83 -5
  37. sglang/srt/models/chatglm.py +399 -0
  38. sglang/srt/models/commandr.py +10 -13
  39. sglang/srt/models/dbrx.py +9 -15
  40. sglang/srt/models/gemma.py +12 -15
  41. sglang/srt/models/grok.py +738 -0
  42. sglang/srt/models/llama2.py +26 -15
  43. sglang/srt/models/llama_classification.py +104 -0
  44. sglang/srt/models/llava.py +86 -19
  45. sglang/srt/models/llavavid.py +11 -20
  46. sglang/srt/models/mixtral.py +282 -103
  47. sglang/srt/models/mixtral_quant.py +372 -0
  48. sglang/srt/models/qwen.py +9 -13
  49. sglang/srt/models/qwen2.py +11 -13
  50. sglang/srt/models/stablelm.py +9 -15
  51. sglang/srt/models/yivl.py +17 -22
  52. sglang/srt/openai_api_adapter.py +150 -95
  53. sglang/srt/openai_protocol.py +11 -2
  54. sglang/srt/server.py +124 -48
  55. sglang/srt/server_args.py +128 -48
  56. sglang/srt/utils.py +234 -67
  57. sglang/test/test_programs.py +65 -3
  58. sglang/test/test_utils.py +32 -1
  59. sglang/utils.py +23 -4
  60. {sglang-0.1.16.dist-info → sglang-0.1.18.dist-info}/METADATA +40 -27
  61. sglang-0.1.18.dist-info/RECORD +78 -0
  62. {sglang-0.1.16.dist-info → sglang-0.1.18.dist-info}/WHEEL +1 -1
  63. sglang/srt/backend_config.py +0 -13
  64. sglang/srt/models/dbrx_config.py +0 -281
  65. sglang/srt/weight_utils.py +0 -417
  66. sglang-0.1.16.dist-info/RECORD +0 -72
  67. {sglang-0.1.16.dist-info → sglang-0.1.18.dist-info}/LICENSE +0 -0
  68. {sglang-0.1.16.dist-info → sglang-0.1.18.dist-info}/top_level.txt +0 -0
@@ -1,3 +1,7 @@
1
+ """
2
+ The radix tree data structure for managing the KV cache.
3
+ """
4
+
1
5
  import heapq
2
6
  import time
3
7
  from collections import defaultdict
@@ -58,7 +62,7 @@ class RadixCache:
58
62
 
59
63
  def insert(self, key, value=None):
60
64
  if self.disable:
61
- return len(key)
65
+ return 0
62
66
 
63
67
  if value is None:
64
68
  value = [x for x in key]
@@ -76,6 +80,12 @@ class RadixCache:
76
80
  indices = self.req_to_token_pool.req_to_token[req_pool_idx, : len(token_ids)]
77
81
  new_prefix_len = self.insert(token_ids, indices.clone())
78
82
 
83
+ if self.disable:
84
+ if del_in_memory_pool:
85
+ self.token_to_kv_pool.dec_refs(indices)
86
+ else:
87
+ return torch.tensor([], dtype=torch.int64), self.root_node
88
+
79
89
  # Radix Cache takes one ref in memory pool
80
90
  self.token_to_kv_pool.dec_refs(indices[last_uncached_pos:new_prefix_len])
81
91
 
@@ -1,20 +1,22 @@
1
+ """Request scheduler heuristic."""
2
+
1
3
  import random
2
4
  from collections import defaultdict
3
5
 
4
6
 
5
- class Scheduler:
7
+ class ScheduleHeuristic:
6
8
  def __init__(
7
9
  self,
8
10
  schedule_heuristic,
9
- max_running_seq,
10
- max_prefill_num_token,
11
- max_total_num_token,
11
+ max_running_seqs,
12
+ max_prefill_num_tokens,
13
+ max_total_num_tokens,
12
14
  tree_cache,
13
15
  ):
14
16
  self.schedule_heuristic = schedule_heuristic
15
- self.max_running_seq = max_running_seq
16
- self.max_prefill_num_token = max_prefill_num_token
17
- self.max_total_num_token = max_total_num_token
17
+ self.max_running_seqs = max_running_seqs
18
+ self.max_prefill_num_tokens = max_prefill_num_tokens
19
+ self.max_total_num_tokens = max_total_num_tokens
18
20
  self.tree_cache = tree_cache
19
21
 
20
22
  def get_priority_queue(self, forward_queue):