sglang 0.1.14__py3-none-any.whl → 0.1.21__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (81) hide show
  1. sglang/__init__.py +59 -2
  2. sglang/api.py +40 -11
  3. sglang/backend/anthropic.py +17 -3
  4. sglang/backend/litellm.py +90 -0
  5. sglang/backend/openai.py +160 -12
  6. sglang/backend/runtime_endpoint.py +62 -27
  7. sglang/backend/vertexai.py +1 -0
  8. sglang/bench_latency.py +320 -0
  9. sglang/global_config.py +24 -3
  10. sglang/lang/chat_template.py +122 -6
  11. sglang/lang/compiler.py +2 -2
  12. sglang/lang/interpreter.py +206 -98
  13. sglang/lang/ir.py +98 -34
  14. sglang/lang/tracer.py +6 -4
  15. sglang/launch_server.py +4 -1
  16. sglang/launch_server_llavavid.py +32 -0
  17. sglang/srt/constrained/__init__.py +14 -6
  18. sglang/srt/constrained/fsm_cache.py +9 -2
  19. sglang/srt/constrained/jump_forward.py +113 -24
  20. sglang/srt/conversation.py +4 -2
  21. sglang/srt/flush_cache.py +18 -0
  22. sglang/srt/hf_transformers_utils.py +144 -3
  23. sglang/srt/layers/context_flashattention_nopad.py +1 -0
  24. sglang/srt/layers/extend_attention.py +20 -1
  25. sglang/srt/layers/fused_moe.py +596 -0
  26. sglang/srt/layers/logits_processor.py +190 -61
  27. sglang/srt/layers/radix_attention.py +62 -53
  28. sglang/srt/layers/token_attention.py +21 -9
  29. sglang/srt/managers/controller/cuda_graph_runner.py +196 -0
  30. sglang/srt/managers/controller/dp_worker.py +113 -0
  31. sglang/srt/managers/controller/infer_batch.py +908 -0
  32. sglang/srt/managers/controller/manager_multi.py +195 -0
  33. sglang/srt/managers/controller/manager_single.py +177 -0
  34. sglang/srt/managers/controller/model_runner.py +359 -0
  35. sglang/srt/managers/{router → controller}/radix_cache.py +102 -53
  36. sglang/srt/managers/controller/schedule_heuristic.py +65 -0
  37. sglang/srt/managers/controller/tp_worker.py +813 -0
  38. sglang/srt/managers/detokenizer_manager.py +42 -40
  39. sglang/srt/managers/io_struct.py +44 -10
  40. sglang/srt/managers/tokenizer_manager.py +224 -82
  41. sglang/srt/memory_pool.py +52 -59
  42. sglang/srt/model_config.py +97 -2
  43. sglang/srt/models/chatglm.py +399 -0
  44. sglang/srt/models/commandr.py +369 -0
  45. sglang/srt/models/dbrx.py +406 -0
  46. sglang/srt/models/gemma.py +34 -38
  47. sglang/srt/models/gemma2.py +436 -0
  48. sglang/srt/models/grok.py +738 -0
  49. sglang/srt/models/llama2.py +47 -37
  50. sglang/srt/models/llama_classification.py +107 -0
  51. sglang/srt/models/llava.py +92 -27
  52. sglang/srt/models/llavavid.py +298 -0
  53. sglang/srt/models/minicpm.py +366 -0
  54. sglang/srt/models/mixtral.py +302 -127
  55. sglang/srt/models/mixtral_quant.py +372 -0
  56. sglang/srt/models/qwen.py +40 -35
  57. sglang/srt/models/qwen2.py +33 -36
  58. sglang/srt/models/qwen2_moe.py +473 -0
  59. sglang/srt/models/stablelm.py +33 -39
  60. sglang/srt/models/yivl.py +19 -26
  61. sglang/srt/openai_api_adapter.py +411 -0
  62. sglang/srt/{managers/openai_protocol.py → openai_protocol.py} +44 -19
  63. sglang/srt/sampling_params.py +2 -0
  64. sglang/srt/server.py +197 -481
  65. sglang/srt/server_args.py +190 -74
  66. sglang/srt/utils.py +460 -95
  67. sglang/test/test_programs.py +73 -10
  68. sglang/test/test_utils.py +226 -7
  69. sglang/utils.py +97 -27
  70. {sglang-0.1.14.dist-info → sglang-0.1.21.dist-info}/METADATA +74 -45
  71. sglang-0.1.21.dist-info/RECORD +82 -0
  72. {sglang-0.1.14.dist-info → sglang-0.1.21.dist-info}/WHEEL +1 -1
  73. sglang/srt/backend_config.py +0 -13
  74. sglang/srt/managers/router/infer_batch.py +0 -503
  75. sglang/srt/managers/router/manager.py +0 -79
  76. sglang/srt/managers/router/model_rpc.py +0 -686
  77. sglang/srt/managers/router/model_runner.py +0 -514
  78. sglang/srt/managers/router/scheduler.py +0 -70
  79. sglang-0.1.14.dist-info/RECORD +0 -64
  80. {sglang-0.1.14.dist-info → sglang-0.1.21.dist-info}/LICENSE +0 -0
  81. {sglang-0.1.14.dist-info → sglang-0.1.21.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,65 @@
1
+ """Request scheduler heuristic."""
2
+
3
+ import random
4
+ from collections import defaultdict
5
+
6
+
7
class ScheduleHeuristic:
    """Reorder the pending request queue according to a named heuristic.

    Recognised heuristic names are ``"lpm"``, ``"random"``, ``"fcfs"`` and
    ``"dfs-weight"``; any other name makes ``get_priority_queue`` raise
    ``ValueError``.
    """

    def __init__(
        self,
        schedule_heuristic,
        max_running_seqs,
        max_prefill_num_tokens,
        max_total_num_tokens,
        tree_cache,
    ):
        """Store the scheduling configuration.

        Args:
            schedule_heuristic: Name of the heuristic to apply.
            max_running_seqs: Stored as-is; not read by this class.
            max_prefill_num_tokens: Stored as-is; not read by this class.
            max_total_num_tokens: Stored as-is; not read by this class.
            tree_cache: Object exposing ``disable`` (truthy when the cache
                is off) and ``root_node`` (root of the prefix tree, whose
                nodes expose a ``children`` mapping).
        """
        if tree_cache.disable and schedule_heuristic == "lpm":
            # LPM is meaningless when the tree cache is disabled.
            schedule_heuristic = "fcfs"

        self.schedule_heuristic = schedule_heuristic
        self.max_running_seqs = max_running_seqs
        self.max_prefill_num_tokens = max_prefill_num_tokens
        self.max_total_num_tokens = max_total_num_tokens
        self.tree_cache = tree_cache

    def get_priority_queue(self, forward_queue):
        """Return the requests of ``forward_queue`` in scheduling order.

        ``"lpm"`` and ``"random"`` reorder ``forward_queue`` in place and
        return it, ``"fcfs"`` returns it untouched, and ``"dfs-weight"``
        returns a new list.

        Args:
            forward_queue: List of requests. ``"lpm"`` reads
                ``prefix_indices`` on each request; ``"dfs-weight"`` reads
                ``last_node``.

        Raises:
            ValueError: If the configured heuristic name is not recognised.
        """
        if self.schedule_heuristic == "lpm":
            # longest prefix match
            forward_queue.sort(key=lambda x: -len(x.prefix_indices))
            return forward_queue
        elif self.schedule_heuristic == "random":
            random.shuffle(forward_queue)
            return forward_queue
        elif self.schedule_heuristic == "fcfs":
            return forward_queue
        elif self.schedule_heuristic == "dfs-weight":
            # Group the requests by the tree node they are attached to.
            last_node_to_reqs = defaultdict(list)
            for req in forward_queue:
                last_node_to_reqs[req.last_node].append(req)

            # Seed each node with its own request count, then fold the
            # counts of all descendants into every ancestor.
            node_to_weight = defaultdict(int)
            for node, reqs in last_node_to_reqs.items():
                node_to_weight[node] = len(reqs)
            self.calc_weight(self.tree_cache.root_node, node_to_weight)

            q = []
            self.get_dfs_priority(
                self.tree_cache.root_node, node_to_weight, last_node_to_reqs, q
            )
            # Every request must hang off a node reachable from the root.
            assert len(q) == len(forward_queue)
            return q
        else:
            raise ValueError(f"Unknown schedule_heuristic: {self.schedule_heuristic}")

    def calc_weight(self, cur_node, node_to_weight):
        """Add the weight of every descendant to each node under ``cur_node``.

        ``node_to_weight`` is updated in place. The walk uses an explicit
        stack rather than recursion so that tree depth is not bounded by
        the interpreter recursion limit.
        """
        # Pre-order listing: every node appears before all its descendants.
        visit_order = []
        stack = [cur_node]
        while stack:
            node = stack.pop()
            visit_order.append(node)
            stack.extend(node.children.values())

        # Walking the listing backwards finalises each child before its
        # parent reads it.
        for node in reversed(visit_order):
            for child in node.children.values():
                node_to_weight[node] += node_to_weight[child]

    def get_dfs_priority(self, cur_node, node_to_priority, last_node_to_reqs, q):
        """Append requests to ``q`` in depth-first, heaviest-child-first order.

        Children of a node are visited in descending ``node_to_priority``
        order (ties keep their ``children`` order); a node's own requests
        are appended after all of its subtrees. ``last_node_to_reqs`` is
        only read, never modified. The walk uses an explicit stack rather
        than recursion so that tree depth is not bounded by the interpreter
        recursion limit.
        """

        def ordered_children(node):
            return iter(
                sorted(node.children.values(), key=lambda x: -node_to_priority[x])
            )

        stack = [(cur_node, ordered_children(cur_node))]
        while stack:
            node, pending = stack[-1]
            for child in pending:
                # Descend into the next unvisited child; the partially
                # consumed iterator stays on the stack for later.
                stack.append((child, ordered_children(child)))
                break
            else:
                # All subtrees are done: emit this node's own requests.
                q.extend(last_node_to_reqs.get(node, ()))
                stack.pop()