guidellm 0.1.0__py3-none-any.whl → 0.2.0.dev0__py3-none-any.whl

This diff compares the contents of publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their respective public registries.

Potentially problematic release.

Files changed (69)
  1. guidellm/__init__.py +38 -6
  2. guidellm/__main__.py +294 -0
  3. guidellm/backend/__init__.py +19 -6
  4. guidellm/backend/backend.py +238 -0
  5. guidellm/backend/openai.py +532 -122
  6. guidellm/backend/response.py +132 -0
  7. guidellm/benchmark/__init__.py +73 -0
  8. guidellm/benchmark/aggregator.py +760 -0
  9. guidellm/benchmark/benchmark.py +838 -0
  10. guidellm/benchmark/benchmarker.py +334 -0
  11. guidellm/benchmark/entrypoints.py +141 -0
  12. guidellm/benchmark/output.py +946 -0
  13. guidellm/benchmark/profile.py +409 -0
  14. guidellm/benchmark/progress.py +720 -0
  15. guidellm/config.py +34 -56
  16. guidellm/data/__init__.py +4 -0
  17. guidellm/data/prideandprejudice.txt.gz +0 -0
  18. guidellm/dataset/__init__.py +22 -0
  19. guidellm/dataset/creator.py +213 -0
  20. guidellm/dataset/entrypoints.py +42 -0
  21. guidellm/dataset/file.py +90 -0
  22. guidellm/dataset/hf_datasets.py +62 -0
  23. guidellm/dataset/in_memory.py +132 -0
  24. guidellm/dataset/synthetic.py +262 -0
  25. guidellm/objects/__init__.py +18 -0
  26. guidellm/objects/pydantic.py +60 -0
  27. guidellm/objects/statistics.py +947 -0
  28. guidellm/request/__init__.py +12 -10
  29. guidellm/request/loader.py +281 -0
  30. guidellm/request/request.py +79 -0
  31. guidellm/scheduler/__init__.py +51 -3
  32. guidellm/scheduler/result.py +137 -0
  33. guidellm/scheduler/scheduler.py +382 -0
  34. guidellm/scheduler/strategy.py +493 -0
  35. guidellm/scheduler/types.py +7 -0
  36. guidellm/scheduler/worker.py +511 -0
  37. guidellm/utils/__init__.py +16 -29
  38. guidellm/utils/colors.py +8 -0
  39. guidellm/utils/hf_transformers.py +35 -0
  40. guidellm/utils/random.py +43 -0
  41. guidellm/utils/text.py +118 -357
  42. {guidellm-0.1.0.dist-info → guidellm-0.2.0.dev0.dist-info}/METADATA +96 -79
  43. guidellm-0.2.0.dev0.dist-info/RECORD +48 -0
  44. {guidellm-0.1.0.dist-info → guidellm-0.2.0.dev0.dist-info}/WHEEL +1 -1
  45. guidellm-0.2.0.dev0.dist-info/entry_points.txt +2 -0
  46. guidellm/backend/base.py +0 -320
  47. guidellm/core/__init__.py +0 -24
  48. guidellm/core/distribution.py +0 -190
  49. guidellm/core/report.py +0 -321
  50. guidellm/core/request.py +0 -44
  51. guidellm/core/result.py +0 -545
  52. guidellm/core/serializable.py +0 -169
  53. guidellm/executor/__init__.py +0 -10
  54. guidellm/executor/base.py +0 -213
  55. guidellm/executor/profile_generator.py +0 -343
  56. guidellm/main.py +0 -336
  57. guidellm/request/base.py +0 -194
  58. guidellm/request/emulated.py +0 -391
  59. guidellm/request/file.py +0 -76
  60. guidellm/request/transformers.py +0 -100
  61. guidellm/scheduler/base.py +0 -374
  62. guidellm/scheduler/load_generator.py +0 -196
  63. guidellm/utils/injector.py +0 -70
  64. guidellm/utils/progress.py +0 -196
  65. guidellm/utils/transformers.py +0 -151
  66. guidellm-0.1.0.dist-info/RECORD +0 -35
  67. guidellm-0.1.0.dist-info/entry_points.txt +0 -3
  68. {guidellm-0.1.0.dist-info → guidellm-0.2.0.dev0.dist-info/licenses}/LICENSE +0 -0
  69. {guidellm-0.1.0.dist-info → guidellm-0.2.0.dev0.dist-info}/top_level.txt +0 -0
guidellm/main.py DELETED
@@ -1,336 +0,0 @@
- import asyncio
- from typing import Literal, Optional, get_args
-
- import click
- from loguru import logger
-
- from guidellm.backend import Backend, BackendEnginePublic
- from guidellm.core import GuidanceReport, TextGenerationBenchmarkReport
- from guidellm.executor import Executor, ProfileGenerationMode
- from guidellm.request import (
-     EmulatedRequestGenerator,
-     FileRequestGenerator,
-     TransformersDatasetRequestGenerator,
- )
- from guidellm.request.base import RequestGenerator
- from guidellm.utils import BenchmarkReportProgress
-
- __all__ = ["generate_benchmark_report"]
-
-
- @click.command()
- @click.option(
-     "--target",
-     type=str,
-     required=True,
-     help=(
-         "The target path or url for the backend to evaluate. "
-         "Ex: 'http://localhost:8000/v1'"
-     ),
- )
- @click.option(
-     "--backend",
-     type=click.Choice(get_args(BackendEnginePublic)),
-     default="openai_server",
-     help=(
-         "The backend to use for benchmarking. "
-         "The default is OpenAI Server enabling compatability with any server that "
-         "follows the OpenAI spec including vLLM."
-     ),
- )
- @click.option(
-     "--model",
-     type=str,
-     default=None,
-     help=(
-         "The Model to use for benchmarking. If not provided, it will use "
-         "the first available model provided the backend supports listing models."
-     ),
- )
- @click.option(
-     "--data",
-     type=str,
-     required=True,
-     help=(
-         "The data source to use for benchmarking. "
-         "Depending on the data-type, it should be a "
-         "path to a data file containing prompts to run (ex: data.txt), "
-         "a HuggingFace dataset name (ex: 'neuralmagic/LLM_compression_calibration'), "
-         "or a configuration for emulated data "
-         "(ex: 'prompt_tokens=128,generated_tokens=128')."
-     ),
- )
- @click.option(
-     "--data-type",
-     type=click.Choice(["emulated", "file", "transformers"]),
-     required=True,
-     help=(
-         "The type of data to use for benchmarking. "
-         "Use 'emulated' for synthetic data, 'file' for a file, or 'transformers' "
-         "for a HuggingFace dataset. Specify the data source with the --data flag."
-     ),
- )
- @click.option(
-     "--tokenizer",
-     type=str,
-     default=None,
-     help=(
-         "The tokenizer to use for calculating the number of prompt tokens. "
-         "This should match the tokenizer used by the model."
-         "By default, it will use the --model flag to determine the tokenizer. "
-         "If not provided and the model is not available, will raise an error. "
-         "Ex: 'neuralmagic/Meta-Llama-3.1-8B-quantized.w8a8'"
-     ),
- )
- @click.option(
-     "--rate-type",
-     type=click.Choice(get_args(ProfileGenerationMode)),
-     default="sweep",
-     help=(
-         "The type of request rate to use for benchmarking. "
-         "Use sweep to run a full range from synchronous to throughput (default), "
-         "synchronous for sending requests one after the other, "
-         "throughput to send requests as fast as possible, "
-         "constant for a fixed request rate, "
-         "or poisson for a real-world variable request rate."
-     ),
- )
- @click.option(
-     "--rate",
-     type=float,
-     default=None,
-     help=(
-         "The request rate to use for constant and poisson rate types. "
-         "To run multiple, provide the flag multiple times. "
-     ),
-     multiple=True,
- )
- @click.option(
-     "--max-seconds",
-     type=int,
-     default=120,
-     help=(
-         "The maximum number of seconds for each benchmark run. "
-         "Either max-seconds, max-requests, or both must be set. "
-         "The default is 120 seconds. "
-         "Note, this is the maximum time for each rate supplied, not the total time. "
-         "This value should be large enough to allow for "
-         "the server's performance to stabilize."
-     ),
- )
- @click.option(
-     "--max-requests",
-     type=int,
-     default=None,
-     help=(
-         "The maximum number of requests for each benchmark run. "
-         "Either max-seconds, max-requests, or both must be set. "
-         "Note, this is the maximum number of requests for each rate supplied, "
-         "not the total number of requests. "
-         "This value should be large enough to allow for "
-         "the server's performance to stabilize."
-     ),
- )
- @click.option(
-     "--output-path",
-     type=str,
-     default=None,
-     help=(
-         "The output path to save the output report to for loading later. "
-         "Ex: guidance_report.json. "
-         "The default is None, meaning no output is saved and results are only "
-         "printed to the console."
-     ),
- )
- @click.option(
-     "--enable-continuous-refresh",
-     is_flag=True,
-     default=False,
-     help=(
-         "Enable continual refreshing of the output table in the CLI "
-         "until the user exits. "
-     ),
- )
- def generate_benchmark_report_cli(
-     target: str,
-     backend: BackendEnginePublic,
-     model: Optional[str],
-     data: Optional[str],
-     data_type: Literal["emulated", "file", "transformers"],
-     tokenizer: Optional[str],
-     rate_type: ProfileGenerationMode,
-     rate: Optional[float],
-     max_seconds: Optional[int],
-     max_requests: Optional[int],
-     output_path: str,
-     enable_continuous_refresh: bool,
- ):
-     """
-     Generate a benchmark report for a specified backend and dataset.
-     """
-     generate_benchmark_report(
-         target=target,
-         backend=backend,
-         model=model,
-         data=data,
-         data_type=data_type,
-         tokenizer=tokenizer,
-         rate_type=rate_type,
-         rate=rate,
-         max_seconds=max_seconds,
-         max_requests=max_requests,
-         output_path=output_path,
-         cont_refresh_table=enable_continuous_refresh,
-     )
-
-
- def generate_benchmark_report(
-     target: str,
-     backend: BackendEnginePublic,
-     model: Optional[str],
-     data: Optional[str],
-     data_type: Literal["emulated", "file", "transformers"],
-     tokenizer: Optional[str],
-     rate_type: ProfileGenerationMode,
-     rate: Optional[float],
-     max_seconds: Optional[int],
-     max_requests: Optional[int],
-     output_path: str,
-     cont_refresh_table: bool,
- ) -> GuidanceReport:
-     """
-     Generate a benchmark report for a specified backend and dataset.
-
-     :param target: The target URL or path for the backend to evaluate.
-     :param backend: The backend type to use for benchmarking.
-     :param model: The model to benchmark;
-         defaults to the first available if not specified.
-     :param data: The data source for benchmarking,
-         which may be a path, dataset name, or config.
-     :param data_type: The type of data to use,
-         such as 'emulated', 'file', or 'transformers'.
-     :param tokenizer: The tokenizer to use for token counting,
-         defaulting to Llama 3.1 if not provided.
-     :param rate_type: The rate type for requests during benchmarking.
-     :param rate: The specific request rate for constant and poisson rate types.
-     :param max_seconds: Maximum duration for each benchmark run in seconds.
-     :param max_requests: Maximum number of requests per benchmark run.
-     :param output_path: Path to save the output report file.
-     :param cont_refresh_table: Continually refresh the table in the CLI
-         until the user exits.
-     """
-     logger.info(
-         "Generating benchmark report with target: {}, backend: {}", target, backend
-     )
-
-     # Create backend
-     backend_inst = Backend.create(
-         backend_type=backend,
-         target=target,
-         model=model,
-     )
-
-     request_generator: RequestGenerator
-
-     # Create tokenizer and request generator
-     tokenizer_inst = tokenizer
-     if not tokenizer_inst:
-         try:
-             tokenizer_inst = backend_inst.model_tokenizer()
-         except Exception as err:
-             raise ValueError(
-                 "Could not load model's tokenizer, "
-                 "--tokenizer must be provided for request generation"
-             ) from err
-
-     if data_type == "emulated":
-         request_generator = EmulatedRequestGenerator(
-             config=data, tokenizer=tokenizer_inst
-         )
-     elif data_type == "file":
-         request_generator = FileRequestGenerator(path=data, tokenizer=tokenizer_inst)
-     elif data_type == "transformers":
-         request_generator = TransformersDatasetRequestGenerator(
-             dataset=data, tokenizer=tokenizer_inst
-         )
-     else:
-         raise ValueError(f"Unknown data type: {data_type}")
-
-     # Create executor
-     executor = Executor(
-         backend=backend_inst,
-         request_generator=request_generator,
-         mode=rate_type,
-         rate=rate if rate_type in ("constant", "poisson") else None,
-         max_number=max_requests,
-         max_duration=max_seconds,
-     )
-
-     # Run executor
-     logger.debug(
-         "Running executor with args: {}",
-         {
-             "backend": backend,
-             "request_generator": request_generator,
-             "mode": rate_type,
-             "rate": rate,
-             "max_number": max_requests,
-             "max_duration": max_seconds,
-         },
-     )
-     report = asyncio.run(_run_executor_for_result(executor))
-
-     # Save and print report
-     guidance_report = GuidanceReport()
-     guidance_report.benchmarks.append(report)
-
-     if output_path:
-         guidance_report.save_file(output_path)
-
-     guidance_report.print(
-         save_path=output_path if output_path is not None else "stdout",
-         continual_refresh=cont_refresh_table,
-     )
-
-     return guidance_report
-
-
- async def _run_executor_for_result(executor: Executor) -> TextGenerationBenchmarkReport:
-     report = None
-     progress = BenchmarkReportProgress()
-     started = False
-
-     async for result in executor.run():
-         if not started:
-             progress.start(result.generation_modes)  # type: ignore # noqa: PGH003
-             started = True
-
-         if result.current_index is not None:
-             description = f"{result.current_profile.load_gen_mode}"  # type: ignore # noqa: PGH003
-             if result.current_profile.load_gen_mode in ("constant", "poisson"):  # type: ignore # noqa: PGH003
-                 description += f"@{result.current_profile.load_gen_rate:.2f} req/s"  # type: ignore # noqa: PGH003
-
-             progress.update_benchmark(
-                 index=result.current_index,
-                 description=description,
-                 completed=result.scheduler_result.completed,  # type: ignore # noqa: PGH003
-                 completed_count=result.scheduler_result.count_completed,  # type: ignore # noqa: PGH003
-                 completed_total=result.scheduler_result.count_total,  # type: ignore # noqa: PGH003
-                 start_time=result.scheduler_result.benchmark.start_time,  # type: ignore # noqa: PGH003
-                 req_per_sec=result.scheduler_result.benchmark.completed_request_rate,  # type: ignore # noqa: PGH003
-             )
-
-         if result.completed:
-             report = result.report
-             break
-
-     progress.finish()
-
-     if not report:
-         raise ValueError("No report generated by executor")
-
-     return report
-
-
- if __name__ == "__main__":
-     generate_benchmark_report_cli()
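Taken together, the removed guidellm/main.py was the 0.1.0 CLI: it parsed the click options above, built a Backend and a RequestGenerator for the selected data type, drove an Executor, and saved/printed a GuidanceReport. Since the module also exported generate_benchmark_report, it could be called programmatically; the following is a minimal sketch against the 0.1.0 signature shown above, with illustrative argument values (the target URL, emulated-data config, and output path are examples, not values taken from this diff):

    from guidellm.main import generate_benchmark_report

    report = generate_benchmark_report(
        target="http://localhost:8000/v1",  # OpenAI-compatible server, e.g. vLLM
        backend="openai_server",
        model=None,  # use the first model the backend lists
        data="prompt_tokens=128,generated_tokens=128",  # emulated-data config
        data_type="emulated",
        tokenizer=None,  # fall back to the model's own tokenizer
        rate_type="sweep",  # sweep from synchronous up to max throughput
        rate=None,  # only used for constant/poisson rate types
        max_seconds=120,
        max_requests=None,
        output_path="guidance_report.json",
        cont_refresh_table=False,
    )

In 0.2.0.dev0 this module is deleted; judging from the files-changed list, its responsibilities appear to move into guidellm/__main__.py and the new guidellm/benchmark package.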
guidellm/request/base.py DELETED
@@ -1,194 +0,0 @@
- import contextlib
- import threading
- import time
- from abc import ABC, abstractmethod
- from queue import Empty, Full, Queue
- from typing import Iterator, Literal, Union
-
- from loguru import logger
- from transformers import (  # type: ignore # noqa: PGH003
-     AutoTokenizer,
-     PreTrainedTokenizer,
- )
-
- from guidellm.core.request import TextGenerationRequest
-
- __all__ = ["GenerationMode", "RequestGenerator"]
-
-
- GenerationMode = Literal["async", "sync"]
-
-
- class RequestGenerator(ABC):
-     """
-     A base class for request generators that generate result requests.
-
-     :param type_: The type of the request generator.
-     :type type_: str
-     :param source: The data source for the request generator.
-     :type source: str
-     :param tokenizer: The tokenizer instance or the name/config to use
-         for tokenizing prompts.
-     :type tokenizer: Union[str, PreTrainedTokenizer]
-     :param mode: The generation mode, either 'async' or 'sync'.
-     :type mode: GenerationMode
-     :param async_queue_size: The size of the request queue.
-     :type async_queue_size: int
-     """
-
-     def __init__(
-         self,
-         type_: str,
-         source: str,
-         tokenizer: Union[str, PreTrainedTokenizer],
-         mode: GenerationMode = "async",
-         async_queue_size: int = 50,
-     ):
-         self._type = type_
-         self._source = source
-         self._async_queue_size: int = async_queue_size
-         self._mode: str = mode
-         self._queue: Queue = Queue(maxsize=async_queue_size)
-         self._stop_event: threading.Event = threading.Event()
-
-         if not tokenizer:
-             err = "Tokenizer must be provided for request generation"
-             logger.error(err)
-             raise ValueError(err)
-
-         self._tokenizer = (
-             AutoTokenizer.from_pretrained(tokenizer)
-             if isinstance(tokenizer, str)
-             else tokenizer
-         )
-         logger.info("Tokenizer initialized for request generation: {}", self._tokenizer)
-
-         if self._mode == "async":
-             self._thread = threading.Thread(target=self._populate_queue, daemon=True)
-             self._thread.start()
-             logger.info(
-                 "RequestGenerator started in async mode with queue size: {}",
-                 self._async_queue_size,
-             )
-
-     def __repr__(self) -> str:
-         """
-         Return a string representation of the RequestGenerator.
-
-         :return: String representation of the RequestGenerator.
-         :rtype: str
-         """
-         return (
-             f"RequestGenerator("
-             f"mode={self._mode}, "
-             f"async_queue_size={self._async_queue_size}, "
-             f"tokenizer={self._tokenizer})"
-         )
-
-     def __iter__(self) -> Iterator[TextGenerationRequest]:
-         """
-         Provide an iterator interface to generate new requests.
-
-         :return: An iterator over result requests.
-         :rtype: Iterator[TextGenerationRequest]
-         """
-         if self.mode == "async":
-             while not self._stop_event.is_set():
-                 try:
-                     item = self._queue.get_nowait()
-                     self._queue.task_done()
-                     yield item
-                 except Empty:
-                     time.sleep(0.01)
-                     continue
-         else:
-             while not self._stop_event.is_set():
-                 yield self.create_item()
-
-     @property
-     def type_(self) -> str:
-         """
-         Get the type of the request generator.
-
-         :return: The type of the request generator.
-         :rtype: str
-         """
-         return self._type
-
-     @property
-     def source(self) -> str:
-         """
-         Get the data source for the request generator.
-
-         :return: The data source.
-         :rtype: str
-         """
-         return self._source
-
-     @property
-     def tokenizer(self) -> PreTrainedTokenizer:
-         """
-         Get the tokenizer instance.
-
-         :return: The tokenizer instance.
-         :rtype: PreTrainedTokenizer
-         """
-         return self._tokenizer
-
-     @property
-     def mode(self) -> str:
-         """
-         Get the generation mode.
-
-         :return: The generation mode.
-         :rtype: str
-         """
-         return self._mode
-
-     @property
-     def async_queue_size(self) -> int:
-         """
-         Get the size of the request queue.
-
-         :return: The size of the request queue.
-         :rtype: int
-         """
-         return self._async_queue_size
-
-     @abstractmethod
-     def create_item(self) -> TextGenerationRequest:
-         """
-         Abstract method to create a new result request item.
-
-         :return: A new result request.
-         :rtype: TextGenerationRequest
-         """
-
-     def stop(self):
-         """
-         Stop the background task that populates the queue.
-         """
-         logger.info("Stopping RequestGenerator...")
-         self._stop_event.set()
-         if self._mode == "async":
-             self._thread.join()
-         logger.info("RequestGenerator stopped")
-
-     def _populate_queue(self):
-         """
-         Populate the request queue in the background.
-         """
-
-         while not self._stop_event.is_set():
-             with contextlib.suppress(Full):
-                 if self._queue.qsize() < self._async_queue_size:
-                     item = self.create_item()
-                     self._queue.put(item, timeout=0.1)
-                     logger.debug(
-                         "Item added to queue. Current queue size: {}",
-                         self._queue.qsize(),
-                     )
-                 else:
-                     time.sleep(0.1)
-
-         logger.info("RequestGenerator stopped populating queue")
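The deleted RequestGenerator above is an ABC; the concrete generators removed in this same release (file, emulated, and transformers variants) only had to implement create_item(). The following minimal, hypothetical subclass sketches that contract: the FixedPromptRequestGenerator name is invented for illustration, and the TextGenerationRequest fields (prompt, prompt_token_count) are assumed from the 0.1.0 API, since the body of the removed guidellm/core/request.py is not shown in this diff:

    from guidellm.core.request import TextGenerationRequest
    from guidellm.request.base import RequestGenerator


    class FixedPromptRequestGenerator(RequestGenerator):
        """Illustrative generator that repeats one fixed prompt."""

        def __init__(self, prompt: str, tokenizer):
            self._prompt = prompt  # set before super().__init__ may start a thread
            super().__init__(
                type_="fixed_prompt",
                source="inline",
                tokenizer=tokenizer,  # model name/path or PreTrainedTokenizer
                mode="sync",  # skip the background queue thread in this sketch
            )

        def create_item(self) -> TextGenerationRequest:
            # Count prompt tokens by encoding the prompt with the configured
            # tokenizer, mirroring how the removed generators used it.
            token_count = len(self.tokenizer(self._prompt)["input_ids"])
            return TextGenerationRequest(
                prompt=self._prompt,
                prompt_token_count=token_count,
            )

Iterating such a generator in sync mode yields a fresh request per loop iteration until stop() sets the stop event; in async mode, the base class's daemon thread would instead pre-fill the bounded queue that __iter__ drains.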