guidellm 0.4.0a21__py3-none-any.whl → 0.4.0a155__py3-none-any.whl

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of guidellm might be problematic.

Files changed (116)
  1. guidellm/__init__.py +5 -2
  2. guidellm/__main__.py +451 -252
  3. guidellm/backends/__init__.py +33 -0
  4. guidellm/backends/backend.py +110 -0
  5. guidellm/backends/openai.py +355 -0
  6. guidellm/backends/response_handlers.py +455 -0
  7. guidellm/benchmark/__init__.py +53 -39
  8. guidellm/benchmark/benchmarker.py +148 -317
  9. guidellm/benchmark/entrypoints.py +466 -128
  10. guidellm/benchmark/output.py +517 -771
  11. guidellm/benchmark/profile.py +580 -280
  12. guidellm/benchmark/progress.py +568 -549
  13. guidellm/benchmark/scenarios/__init__.py +40 -0
  14. guidellm/benchmark/scenarios/chat.json +6 -0
  15. guidellm/benchmark/scenarios/rag.json +6 -0
  16. guidellm/benchmark/schemas.py +2085 -0
  17. guidellm/data/__init__.py +28 -4
  18. guidellm/data/collators.py +16 -0
  19. guidellm/data/deserializers/__init__.py +53 -0
  20. guidellm/data/deserializers/deserializer.py +109 -0
  21. guidellm/data/deserializers/file.py +222 -0
  22. guidellm/data/deserializers/huggingface.py +94 -0
  23. guidellm/data/deserializers/memory.py +192 -0
  24. guidellm/data/deserializers/synthetic.py +346 -0
  25. guidellm/data/loaders.py +145 -0
  26. guidellm/data/preprocessors/__init__.py +25 -0
  27. guidellm/data/preprocessors/formatters.py +412 -0
  28. guidellm/data/preprocessors/mappers.py +198 -0
  29. guidellm/data/preprocessors/preprocessor.py +29 -0
  30. guidellm/data/processor.py +30 -0
  31. guidellm/data/schemas.py +13 -0
  32. guidellm/data/utils/__init__.py +10 -0
  33. guidellm/data/utils/dataset.py +94 -0
  34. guidellm/data/utils/functions.py +18 -0
  35. guidellm/extras/__init__.py +4 -0
  36. guidellm/extras/audio.py +215 -0
  37. guidellm/extras/vision.py +242 -0
  38. guidellm/logger.py +2 -2
  39. guidellm/mock_server/__init__.py +8 -0
  40. guidellm/mock_server/config.py +84 -0
  41. guidellm/mock_server/handlers/__init__.py +17 -0
  42. guidellm/mock_server/handlers/chat_completions.py +280 -0
  43. guidellm/mock_server/handlers/completions.py +280 -0
  44. guidellm/mock_server/handlers/tokenizer.py +142 -0
  45. guidellm/mock_server/models.py +510 -0
  46. guidellm/mock_server/server.py +168 -0
  47. guidellm/mock_server/utils.py +302 -0
  48. guidellm/preprocess/dataset.py +23 -26
  49. guidellm/presentation/builder.py +2 -2
  50. guidellm/presentation/data_models.py +25 -21
  51. guidellm/presentation/injector.py +2 -3
  52. guidellm/scheduler/__init__.py +65 -26
  53. guidellm/scheduler/constraints.py +1035 -0
  54. guidellm/scheduler/environments.py +252 -0
  55. guidellm/scheduler/scheduler.py +140 -368
  56. guidellm/scheduler/schemas.py +272 -0
  57. guidellm/scheduler/strategies.py +519 -0
  58. guidellm/scheduler/worker.py +391 -420
  59. guidellm/scheduler/worker_group.py +707 -0
  60. guidellm/schemas/__init__.py +31 -0
  61. guidellm/schemas/info.py +159 -0
  62. guidellm/schemas/request.py +216 -0
  63. guidellm/schemas/response.py +119 -0
  64. guidellm/schemas/stats.py +228 -0
  65. guidellm/{config.py → settings.py} +32 -21
  66. guidellm/utils/__init__.py +95 -8
  67. guidellm/utils/auto_importer.py +98 -0
  68. guidellm/utils/cli.py +46 -2
  69. guidellm/utils/console.py +183 -0
  70. guidellm/utils/encoding.py +778 -0
  71. guidellm/utils/functions.py +134 -0
  72. guidellm/utils/hf_datasets.py +1 -2
  73. guidellm/utils/hf_transformers.py +4 -4
  74. guidellm/utils/imports.py +9 -0
  75. guidellm/utils/messaging.py +1118 -0
  76. guidellm/utils/mixins.py +115 -0
  77. guidellm/utils/pydantic_utils.py +411 -0
  78. guidellm/utils/random.py +3 -4
  79. guidellm/utils/registry.py +220 -0
  80. guidellm/utils/singleton.py +133 -0
  81. guidellm/{objects → utils}/statistics.py +341 -247
  82. guidellm/utils/synchronous.py +159 -0
  83. guidellm/utils/text.py +163 -50
  84. guidellm/utils/typing.py +41 -0
  85. guidellm/version.py +1 -1
  86. {guidellm-0.4.0a21.dist-info → guidellm-0.4.0a155.dist-info}/METADATA +33 -10
  87. guidellm-0.4.0a155.dist-info/RECORD +96 -0
  88. guidellm/backend/__init__.py +0 -23
  89. guidellm/backend/backend.py +0 -259
  90. guidellm/backend/openai.py +0 -705
  91. guidellm/backend/response.py +0 -136
  92. guidellm/benchmark/aggregator.py +0 -760
  93. guidellm/benchmark/benchmark.py +0 -837
  94. guidellm/benchmark/scenario.py +0 -104
  95. guidellm/data/prideandprejudice.txt.gz +0 -0
  96. guidellm/dataset/__init__.py +0 -22
  97. guidellm/dataset/creator.py +0 -213
  98. guidellm/dataset/entrypoints.py +0 -42
  99. guidellm/dataset/file.py +0 -92
  100. guidellm/dataset/hf_datasets.py +0 -62
  101. guidellm/dataset/in_memory.py +0 -132
  102. guidellm/dataset/synthetic.py +0 -287
  103. guidellm/objects/__init__.py +0 -18
  104. guidellm/objects/pydantic.py +0 -89
  105. guidellm/request/__init__.py +0 -18
  106. guidellm/request/loader.py +0 -284
  107. guidellm/request/request.py +0 -79
  108. guidellm/request/types.py +0 -10
  109. guidellm/scheduler/queues.py +0 -25
  110. guidellm/scheduler/result.py +0 -155
  111. guidellm/scheduler/strategy.py +0 -495
  112. guidellm-0.4.0a21.dist-info/RECORD +0 -62
  113. {guidellm-0.4.0a21.dist-info → guidellm-0.4.0a155.dist-info}/WHEEL +0 -0
  114. {guidellm-0.4.0a21.dist-info → guidellm-0.4.0a155.dist-info}/entry_points.txt +0 -0
  115. {guidellm-0.4.0a21.dist-info → guidellm-0.4.0a155.dist-info}/licenses/LICENSE +0 -0
  116. {guidellm-0.4.0a21.dist-info → guidellm-0.4.0a155.dist-info}/top_level.txt +0 -0

guidellm/mock_server/handlers/__init__.py
@@ -0,0 +1,17 @@
+ """
+ HTTP request handlers for the GuideLLM mock server.
+
+ This module exposes request handlers that implement OpenAI-compatible API endpoints
+ for the mock server. The handlers provide realistic LLM simulation capabilities
+ including chat completions, legacy completions, and tokenization services with
+ configurable timing characteristics, token counting, and proper error handling to
+ support comprehensive benchmarking and testing scenarios.
+ """
+
+ from __future__ import annotations
+
+ from .chat_completions import ChatCompletionsHandler
+ from .completions import CompletionsHandler
+ from .tokenizer import TokenizerHandler
+
+ __all__ = ["ChatCompletionsHandler", "CompletionsHandler", "TokenizerHandler"]
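
For orientation, a minimal sketch of how these handlers could be mounted on a Sanic app. This is not part of the diff, and the app name, route paths, host, and port are illustrative assumptions (the packaged guidellm/mock_server/server.py does the real wiring); the MockServerConfig values mirror the Example blocks in the handler docstrings below.

from sanic import Sanic
from sanic.request import Request

from guidellm.mock_server.config import MockServerConfig
from guidellm.mock_server.handlers import ChatCompletionsHandler, CompletionsHandler

# Timing values taken from the handler docstring examples in this diff.
config = MockServerConfig(ttft_ms=100, itl_ms=50)
app = Sanic("guidellm_mock")  # app name is an assumption
chat_handler = ChatCompletionsHandler(config)
text_handler = CompletionsHandler(config)


@app.post("/v1/chat/completions")
async def chat_completions(request: Request):
    # Delegates validation, timing simulation, and response generation to the handler.
    return await chat_handler.handle(request)


@app.post("/v1/completions")
async def completions(request: Request):
    return await text_handler.handle(request)


if __name__ == "__main__":
    app.run(host="127.0.0.1", port=8000)
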
guidellm/mock_server/handlers/chat_completions.py
@@ -0,0 +1,280 @@
+ """
+ OpenAI Chat Completions API endpoint handler for the mock server.
+
+ Provides a complete implementation of the /v1/chat/completions endpoint that simulates
+ realistic LLM behavior with configurable timing characteristics. Supports both streaming
+ and non-streaming responses with proper token counting, latency simulation including
+ TTFT (Time To First Token) and ITL (Inter-Token Latency), and OpenAI-compatible error
+ handling for comprehensive benchmarking scenarios.
+ """
+
+ from __future__ import annotations
+
+ import asyncio
+ import json
+ import math
+ import time
+ import uuid
+
+ from pydantic import ValidationError
+ from sanic import response
+ from sanic.request import Request
+ from sanic.response import HTTPResponse, ResponseStream
+ from transformers import PreTrainedTokenizer
+
+ from guidellm.mock_server.config import MockServerConfig
+ from guidellm.mock_server.models import (
+     ChatCompletionChoice,
+     ChatCompletionsRequest,
+     ChatCompletionsResponse,
+     ChatMessage,
+     ErrorDetail,
+     ErrorResponse,
+     Usage,
+ )
+ from guidellm.mock_server.utils import (
+     MockTokenizer,
+     create_fake_text,
+     create_fake_tokens_str,
+     sample_number,
+     times_generator,
+ )
+
+ __all__ = ["ChatCompletionsHandler"]
+
+
+ class ChatCompletionsHandler:
+     """
+     Handles OpenAI Chat Completions API requests with realistic LLM simulation.
+
+     Implements the /v1/chat/completions endpoint behavior including request validation,
+     response generation, and timing simulation. Supports both streaming and
+     non-streaming modes with configurable latency characteristics for comprehensive
+     benchmarking. Uses either a mock tokenizer or a real tokenizer for accurate token
+     counting and realistic text generation.
+
+     Example:
+     ::
+         config = MockServerConfig(ttft_ms=100, itl_ms=50)
+         handler = ChatCompletionsHandler(config)
+         response = await handler.handle(request)
+     """
+
+     def __init__(self, config: MockServerConfig) -> None:
+         """
+         Initialize the Chat Completions handler with server configuration.
+
+         :param config: Mock server configuration containing timing and behavior settings
+         """
+         self.config = config
+         self.tokenizer = (
+             MockTokenizer()
+             if config.processor is None
+             else PreTrainedTokenizer.from_pretrained(config.processor)
+         )
+
+     async def handle(self, request: Request) -> HTTPResponse:
+         """
+         Process incoming chat completion requests with validation and routing.
+
+         Validates the request payload, handles errors gracefully, and routes to
+         appropriate streaming or non-streaming response handlers based on the
+         request configuration.
+
+         :param request: Sanic HTTP request containing chat completion parameters
+         :return: HTTP response with completion data or error information
+         :raises ValidationError: When request payload fails validation
+         :raises JSONDecodeError: When request contains invalid JSON
+         """
+         try:
+             # Parse and validate request
+             req_data = ChatCompletionsRequest(**request.json)
+         except ValidationError as exc:
+             return response.json(
+                 ErrorResponse(
+                     error=ErrorDetail(
+                         message=f"Invalid request: {str(exc)}",
+                         type="invalid_request_error",
+                         code="invalid_request",
+                     )
+                 ).model_dump(),
+                 status=400,
+             )
+         except (json.JSONDecodeError, TypeError):
+             return response.json(
+                 ErrorResponse(
+                     error=ErrorDetail(
+                         message="Invalid JSON in request body",
+                         type="invalid_request_error",
+                         code="invalid_json",
+                     )
+                 ).model_dump(),
+                 status=400,
+             )
+
+         # Handle streaming vs non-streaming
+         if req_data.stream:
+             return await self._handle_stream(req_data)
+         else:
+             return await self._handle_non_stream(req_data)
+
+     async def _handle_non_stream(self, req: ChatCompletionsRequest) -> HTTPResponse:
+         """
+         Generate complete non-streaming chat completion response.
+
+         Simulates realistic LLM behavior with TTFT and ITL delays, generates
+         appropriate token counts, and returns a complete response with usage
+         statistics and generated content.
+
+         :param req: Validated chat completion request parameters
+         :return: Complete HTTP response with generated completion data
+         """
+         # TTFT delay
+         await asyncio.sleep(
+             sample_number(self.config.ttft_ms, self.config.ttft_ms_std) / 1000.0
+         )
+
+         # Token counts
+         prompt_text = self.tokenizer.apply_chat_template(req.messages)
+         prompt_tokens = len(self.tokenizer(prompt_text))  # type: ignore[arg-type]
+         max_tokens = req.max_completion_tokens or req.max_tokens or math.inf
+         completion_tokens_count = min(
+             sample_number(self.config.output_tokens, self.config.output_tokens_std),
+             max_tokens,
+         )
+
+         # ITL delay
+         itl_delay = 0.0
+         delays_iter = iter(times_generator(self.config.itl_ms, self.config.itl_ms_std))
+         for _ in range(int(completion_tokens_count) - 1):
+             itl_delay += next(delays_iter)
+         await asyncio.sleep(itl_delay / 1000.0)
+
+         # Response
+         chat_response = ChatCompletionsResponse(
+             id=f"chatcmpl-{uuid.uuid4().hex[:29]}",
+             model=req.model,
+             choices=[
+                 ChatCompletionChoice(
+                     index=0,
+                     message=ChatMessage(
+                         role="assistant",
+                         content=create_fake_text(
+                             int(completion_tokens_count), self.tokenizer
+                         ),
+                     ),
+                     finish_reason="stop",
+                 )
+             ],
+             usage=Usage(
+                 prompt_tokens=prompt_tokens,
+                 completion_tokens=int(completion_tokens_count),
+             ),
+             system_fingerprint=f"fp_{uuid.uuid4().hex[:10]}",
+         )
+
+         return response.json(chat_response.model_dump())
+
+     async def _handle_stream(self, req: ChatCompletionsRequest) -> HTTPResponse:
+         """
+         Generate streaming chat completion response with real-time token delivery.
+
+         Creates a streaming response that delivers tokens incrementally with
+         realistic timing delays. Supports optional usage statistics in the final
+         stream chunk when requested via stream_options.
+
+         :param req: Validated chat completion request with streaming enabled
+         :return: Streaming HTTP response delivering tokens with proper timing
+         """
+
+         async def generate_stream(stream_response):
+             completion_id = f"chatcmpl-{uuid.uuid4().hex[:29]}"
+
+             # TTFT delay
+             await asyncio.sleep(
+                 sample_number(self.config.ttft_ms, self.config.ttft_ms_std) / 1000.0
+             )
+
+             # Token counts
+             prompt_text = self.tokenizer.apply_chat_template(req.messages)
+             prompt_tokens = len(self.tokenizer(prompt_text))  # type: ignore[arg-type]
+             max_tokens = req.max_completion_tokens or req.max_tokens or math.inf
+             completion_tokens_count = int(
+                 min(
+                     sample_number(
+                         self.config.output_tokens, self.config.output_tokens_std
+                     ),
+                     max_tokens,
+                 )
+             )
+
+             # Send tokens
+             tokens = create_fake_tokens_str(completion_tokens_count, self.tokenizer)
+             delays_iter = iter(
+                 times_generator(self.config.itl_ms, self.config.itl_ms_std)
+             )
+
+             for index, token in enumerate(tokens):
+                 if index > 0:
+                     itl_delay = next(delays_iter)
+                     await asyncio.sleep(itl_delay / 1000.0)
+
+                 chunk_data = {
+                     "id": completion_id,
+                     "object": "chat.completion.chunk",
+                     "created": int(time.time()),
+                     "model": req.model,
+                     "choices": [
+                         {
+                             "index": 0,
+                             "delta": {"content": token},
+                             "finish_reason": None,
+                         }
+                     ],
+                 }
+                 await stream_response.write(f"data: {json.dumps(chunk_data)}\n\n")
+
+             # Send final chunk with finish reason
+             final_chunk = {
+                 "id": completion_id,
+                 "object": "chat.completion.chunk",
+                 "created": int(time.time()),
+                 "model": req.model,
+                 "choices": [
+                     {
+                         "index": 0,
+                         "delta": {},
+                         "finish_reason": "stop",
+                     }
+                 ],
+             }
+             await stream_response.write(f"data: {json.dumps(final_chunk)}\n\n")
+
+             # Send usage if requested
+             if req.stream_options and req.stream_options.include_usage:
+                 usage_chunk = {
+                     "id": completion_id,
+                     "object": "chat.completion.chunk",
+                     "created": int(time.time()),
+                     "model": req.model,
+                     "choices": [],
+                     "usage": {
+                         "prompt_tokens": prompt_tokens,
+                         "completion_tokens": completion_tokens_count,
+                         "total_tokens": prompt_tokens + completion_tokens_count,
+                     },
+                 }
+                 await stream_response.write(f"data: {json.dumps(usage_chunk)}\n\n")
+
+             # End stream
+             await stream_response.write("data: [DONE]\n\n")
+
+         return ResponseStream(  # type: ignore[return-value]
+             generate_stream,
+             content_type="text/event-stream",
+             headers={
+                 "Cache-Control": "no-cache",
+                 "Connection": "keep-alive",
+                 "X-Accel-Buffering": "no",
+             },
+         )
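
The streaming path above frames its output as server-sent events: one "data: {...}" chunk per generated token, an empty-delta chunk carrying finish_reason "stop", an optional usage chunk, and a terminating "data: [DONE]" line. Below is a minimal client sketch that consumes such a stream; the base URL, model name, and the use of httpx are illustrative assumptions, while the SSE framing follows the handler code above.

import asyncio
import json

import httpx


async def stream_chat(base_url: str = "http://127.0.0.1:8000") -> None:
    # Request a streamed chat completion with usage reporting enabled.
    payload = {
        "model": "mock-model",  # model name is an assumption
        "messages": [{"role": "user", "content": "Hello"}],
        "stream": True,
        "stream_options": {"include_usage": True},
    }
    async with httpx.AsyncClient(timeout=None) as client:
        async with client.stream(
            "POST", f"{base_url}/v1/chat/completions", json=payload
        ) as resp:
            async for line in resp.aiter_lines():
                if not line.startswith("data: "):
                    continue
                data = line[len("data: "):]
                if data == "[DONE]":
                    break
                chunk = json.loads(data)
                # Token chunks carry a delta; the usage chunk has an empty choices list.
                for choice in chunk.get("choices", []):
                    print(choice["delta"].get("content", ""), end="", flush=True)


if __name__ == "__main__":
    asyncio.run(stream_chat())
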
guidellm/mock_server/handlers/completions.py
@@ -0,0 +1,280 @@
+ """
+ Legacy OpenAI Completions API handler for the mock server.
+
+ This module provides the CompletionsHandler class that implements the /v1/completions
+ endpoint for the guidellm mock server. It supports both streaming and non-streaming
+ completions with configurable timing parameters (TTFT, ITL) and token generation to
+ simulate realistic LLM behavior for benchmarking and testing purposes.
+ """
+
+ from __future__ import annotations
+
+ import asyncio
+ import json
+ import math
+ import time
+ import uuid
+
+ from pydantic import ValidationError
+ from sanic import response
+ from sanic.request import Request
+ from sanic.response import HTTPResponse, ResponseStream
+ from transformers import PreTrainedTokenizer
+
+ from guidellm.mock_server.config import MockServerConfig
+ from guidellm.mock_server.models import (
+     CompletionChoice,
+     CompletionsRequest,
+     CompletionsResponse,
+     ErrorDetail,
+     ErrorResponse,
+     Usage,
+ )
+ from guidellm.mock_server.utils import (
+     MockTokenizer,
+     create_fake_text,
+     create_fake_tokens_str,
+     sample_number,
+     times_generator,
+ )
+
+ __all__ = ["CompletionsHandler"]
+
+
+ class CompletionsHandler:
+     """
+     Handler for the OpenAI /v1/completions endpoint in the mock server.
+
+     This handler simulates the legacy OpenAI completions API by processing incoming
+     requests and generating responses with configurable timing and token generation
+     patterns. It supports both streaming and non-streaming modes, applying realistic
+     timing delays (TTFT and ITL) to mimic actual LLM behavior for benchmarking.
+
+     Example:
+     ::
+         config = MockServerConfig(ttft_ms=100, itl_ms=50)
+         handler = CompletionsHandler(config)
+         response = await handler.handle(sanic_request)
+     """
+
+     def __init__(self, config: MockServerConfig) -> None:
+         """
+         Initialize the completions handler with configuration settings.
+
+         :param config: Mock server configuration containing timing parameters
+             and tokenizer settings
+         """
+         self.config = config
+         self.tokenizer = (
+             MockTokenizer()
+             if config.processor is None
+             else PreTrainedTokenizer.from_pretrained(config.processor)
+         )
+
+     async def handle(self, request: Request) -> HTTPResponse:
+         """
+         Process a completions request and return the appropriate response.
+
+         Validates the incoming request, determines whether to use streaming or
+         non-streaming mode, and delegates to the appropriate handler method.
+
+         :param request: Sanic request object containing the completions request data
+         :return: HTTP response with completion data or error information
+         :raises ValidationError: When request validation fails
+         :raises json.JSONDecodeError: When request JSON is malformed
+         """
+         try:
+             # Parse and validate request
+             req_data = CompletionsRequest(**request.json)
+         except ValidationError as e:
+             return response.json(
+                 ErrorResponse(
+                     error=ErrorDetail(
+                         message=f"Invalid request: {str(e)}",
+                         type="invalid_request_error",
+                         code="invalid_request",
+                     )
+                 ).model_dump(),
+                 status=400,
+             )
+         except (json.JSONDecodeError, TypeError):
+             return response.json(
+                 ErrorResponse(
+                     error=ErrorDetail(
+                         message="Invalid JSON in request body",
+                         type="invalid_request_error",
+                         code="invalid_json",
+                     )
+                 ).model_dump(),
+                 status=400,
+             )
+
+         # Handle streaming vs non-streaming
+         if req_data.stream:
+             return await self._handle_stream(req_data)
+         else:
+             return await self._handle_non_stream(req_data)
+
+     async def _handle_non_stream(self, req: CompletionsRequest) -> HTTPResponse:
+         """
+         Generate a non-streaming completion response.
+
+         Simulates TTFT and ITL delays, generates appropriate token counts, and returns
+         a complete response with the generated text and usage statistics.
+
+         :param req: Validated completions request containing prompt and parameters
+         :return: JSON HTTP response with completion text and usage data
+         :raises NotImplementedError: When batch processing is requested
+         """
+         if isinstance(req.prompt, list):
+             raise NotImplementedError("Batch processing is not supported.")
+
+         # TTFT delay
+         await asyncio.sleep(
+             sample_number(self.config.ttft_ms, self.config.ttft_ms_std) / 1000.0
+         )
+
+         # Token counts
+         prompt_tokens = len(self.tokenizer(req.prompt))
+         max_tokens = req.max_tokens or math.inf
+         completion_tokens_count = int(
+             min(
+                 sample_number(self.config.output_tokens, self.config.output_tokens_std),
+                 max_tokens,
+             )
+             if req.stop
+             else max_tokens
+         )
+
+         # ITL delay
+         itl_delay = 0.0
+         delays_iter = iter(times_generator(self.config.itl_ms, self.config.itl_ms_std))
+         for _ in range(int(completion_tokens_count) - 1):
+             itl_delay += next(delays_iter)
+         await asyncio.sleep(itl_delay / 1000.0)
+
+         # Response
+         completion_response = CompletionsResponse(
+             id=f"cmpl-{uuid.uuid4().hex[:29]}",
+             model=req.model,
+             choices=[
+                 CompletionChoice(
+                     text=create_fake_text(completion_tokens_count, self.tokenizer),
+                     index=0,
+                     finish_reason="stop",
+                 )
+             ],
+             usage=Usage(
+                 prompt_tokens=prompt_tokens,
+                 completion_tokens=completion_tokens_count,
+             ),
+             system_fingerprint=f"fp_{uuid.uuid4().hex[:10]}",
+         )
+
+         return response.json(completion_response.model_dump())
+
+     async def _handle_stream(self, req: CompletionsRequest) -> HTTPResponse:
+         """
+         Generate a streaming completion response.
+
+         Creates a server-sent events stream that delivers tokens incrementally with
+         realistic timing delays between each token. Includes usage statistics if
+         requested and properly terminates the stream.
+
+         :param req: Validated completions request containing prompt and streaming
+             options
+         :return: ResponseStream object that generates server-sent events
+         """
+
+         async def generate_stream(stream_response):
+             completion_id = f"cmpl-{uuid.uuid4().hex[:29]}"
+
+             # TTFT delay
+             await asyncio.sleep(
+                 sample_number(self.config.ttft_ms, self.config.ttft_ms_std) / 1000.0
+             )
+
+             # Token counts
+             prompt_tokens = len(self.tokenizer(req.prompt))
+             max_tokens = req.max_tokens or math.inf
+             completion_tokens_count = int(
+                 min(
+                     sample_number(
+                         self.config.output_tokens, self.config.output_tokens_std
+                     ),
+                     max_tokens,
+                 )
+                 if req.stop
+                 else max_tokens
+             )
+
+             # Send tokens
+             tokens = create_fake_tokens_str(completion_tokens_count, self.tokenizer)
+             delays_iter = iter(
+                 times_generator(self.config.itl_ms, self.config.itl_ms_std)
+             )
+
+             for index, token in enumerate(tokens):
+                 if index > 0:
+                     itl_delay = next(delays_iter)
+                     await asyncio.sleep(itl_delay / 1000.0)
+
+                 chunk_data = {
+                     "id": completion_id,
+                     "object": "text_completion",
+                     "created": int(time.time()),
+                     "model": req.model,
+                     "choices": [
+                         {
+                             "text": token,
+                             "index": index,
+                             "finish_reason": None,
+                         }
+                     ],
+                 }
+                 await stream_response.write(f"data: {json.dumps(chunk_data)}\n\n")
+
+             # Send final chunk with finish reason
+             final_chunk = {
+                 "id": completion_id,
+                 "object": "text_completion",
+                 "created": int(time.time()),
+                 "model": req.model,
+                 "choices": [
+                     {
+                         "text": "",
+                         "index": index,
+                         "finish_reason": "stop",
+                     }
+                 ],
+             }
+             await stream_response.write(f"data: {json.dumps(final_chunk)}\n\n")
+
+             # Send usage if requested
+             if req.stream_options and req.stream_options.include_usage:
+                 usage_chunk = {
+                     "id": completion_id,
+                     "object": "text_completion",
+                     "created": int(time.time()),
+                     "model": req.model,
+                     "choices": [],
+                     "usage": {
+                         "prompt_tokens": prompt_tokens,
+                         "completion_tokens": completion_tokens_count,
+                         "total_tokens": prompt_tokens + completion_tokens_count,
+                     },
+                 }
+                 await stream_response.write(f"data: {json.dumps(usage_chunk)}\n\n")
+
+             # End stream
+             await stream_response.write("data: [DONE]\n\n")
+
+         return ResponseStream(  # type: ignore[return-value]
+             generate_stream,
+             content_type="text/event-stream",
+             headers={
+                 "Cache-Control": "no-cache",
+                 "Connection": "keep-alive",
+                 "X-Accel-Buffering": "no",
+             },
+         )
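
Both handlers follow the same timing model: one sampled TTFT delay, then one sampled inter-token delay for each generated token after the first. Below is a back-of-the-envelope estimate of the expected end-to-end latency, under the assumption that sample_number and times_generator are centered on the configured means (their exact behavior lives in guidellm/mock_server/utils.py and is not shown in this diff).

# Rough expected-latency estimate for the mock handlers (assumption: the delay
# samplers are centered on their configured means, so the std terms are ignored).
def expected_latency_ms(ttft_ms: float, itl_ms: float, output_tokens: int) -> float:
    # One TTFT delay, then (output_tokens - 1) inter-token delays.
    return ttft_ms + max(output_tokens - 1, 0) * itl_ms


# Example: ttft_ms=100, itl_ms=50, 256 output tokens -> 100 + 255 * 50 = 12850 ms.
print(expected_latency_ms(100, 50, 256))
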