guidellm 0.1.0__tar.gz → 0.2.0rc20250418__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (85)
  1. {guidellm-0.1.0/src/guidellm.egg-info → guidellm-0.2.0rc20250418}/PKG-INFO +95 -78
  2. guidellm-0.2.0rc20250418/README.md +201 -0
  3. guidellm-0.2.0rc20250418/pyproject.toml +74 -0
  4. guidellm-0.2.0rc20250418/src/guidellm/__init__.py +51 -0
  5. guidellm-0.2.0rc20250418/src/guidellm/__main__.py +294 -0
  6. guidellm-0.2.0rc20250418/src/guidellm/backend/__init__.py +23 -0
  7. guidellm-0.2.0rc20250418/src/guidellm/backend/backend.py +238 -0
  8. guidellm-0.2.0rc20250418/src/guidellm/backend/openai.py +578 -0
  9. guidellm-0.2.0rc20250418/src/guidellm/backend/response.py +132 -0
  10. guidellm-0.2.0rc20250418/src/guidellm/benchmark/__init__.py +73 -0
  11. guidellm-0.2.0rc20250418/src/guidellm/benchmark/aggregator.py +760 -0
  12. guidellm-0.2.0rc20250418/src/guidellm/benchmark/benchmark.py +838 -0
  13. guidellm-0.2.0rc20250418/src/guidellm/benchmark/benchmarker.py +334 -0
  14. guidellm-0.2.0rc20250418/src/guidellm/benchmark/entrypoints.py +141 -0
  15. guidellm-0.2.0rc20250418/src/guidellm/benchmark/output.py +946 -0
  16. guidellm-0.2.0rc20250418/src/guidellm/benchmark/profile.py +409 -0
  17. guidellm-0.2.0rc20250418/src/guidellm/benchmark/progress.py +720 -0
  18. {guidellm-0.1.0 → guidellm-0.2.0rc20250418}/src/guidellm/config.py +34 -56
  19. guidellm-0.2.0rc20250418/src/guidellm/data/__init__.py +4 -0
  20. guidellm-0.2.0rc20250418/src/guidellm/data/prideandprejudice.txt.gz +0 -0
  21. guidellm-0.2.0rc20250418/src/guidellm/dataset/__init__.py +22 -0
  22. guidellm-0.2.0rc20250418/src/guidellm/dataset/creator.py +213 -0
  23. guidellm-0.2.0rc20250418/src/guidellm/dataset/entrypoints.py +42 -0
  24. guidellm-0.2.0rc20250418/src/guidellm/dataset/file.py +90 -0
  25. guidellm-0.2.0rc20250418/src/guidellm/dataset/hf_datasets.py +62 -0
  26. guidellm-0.2.0rc20250418/src/guidellm/dataset/in_memory.py +132 -0
  27. guidellm-0.2.0rc20250418/src/guidellm/dataset/synthetic.py +262 -0
  28. guidellm-0.2.0rc20250418/src/guidellm/objects/__init__.py +18 -0
  29. guidellm-0.2.0rc20250418/src/guidellm/objects/pydantic.py +60 -0
  30. guidellm-0.2.0rc20250418/src/guidellm/objects/statistics.py +947 -0
  31. guidellm-0.2.0rc20250418/src/guidellm/request/__init__.py +15 -0
  32. guidellm-0.2.0rc20250418/src/guidellm/request/loader.py +281 -0
  33. guidellm-0.2.0rc20250418/src/guidellm/request/request.py +79 -0
  34. guidellm-0.2.0rc20250418/src/guidellm/scheduler/__init__.py +52 -0
  35. guidellm-0.2.0rc20250418/src/guidellm/scheduler/result.py +137 -0
  36. guidellm-0.2.0rc20250418/src/guidellm/scheduler/scheduler.py +382 -0
  37. guidellm-0.2.0rc20250418/src/guidellm/scheduler/strategy.py +493 -0
  38. guidellm-0.2.0rc20250418/src/guidellm/scheduler/types.py +7 -0
  39. guidellm-0.2.0rc20250418/src/guidellm/scheduler/worker.py +511 -0
  40. guidellm-0.2.0rc20250418/src/guidellm/utils/__init__.py +27 -0
  41. guidellm-0.2.0rc20250418/src/guidellm/utils/colors.py +8 -0
  42. guidellm-0.2.0rc20250418/src/guidellm/utils/hf_transformers.py +35 -0
  43. guidellm-0.2.0rc20250418/src/guidellm/utils/random.py +43 -0
  44. guidellm-0.2.0rc20250418/src/guidellm/utils/text.py +216 -0
  45. {guidellm-0.1.0 → guidellm-0.2.0rc20250418/src/guidellm.egg-info}/PKG-INFO +95 -78
  46. guidellm-0.2.0rc20250418/src/guidellm.egg-info/SOURCES.txt +52 -0
  47. guidellm-0.2.0rc20250418/src/guidellm.egg-info/entry_points.txt +2 -0
  48. {guidellm-0.1.0 → guidellm-0.2.0rc20250418}/src/guidellm.egg-info/requires.txt +5 -1
  49. guidellm-0.1.0/README.md +0 -189
  50. guidellm-0.1.0/pyproject.toml +0 -206
  51. guidellm-0.1.0/src/guidellm/__init__.py +0 -19
  52. guidellm-0.1.0/src/guidellm/backend/__init__.py +0 -10
  53. guidellm-0.1.0/src/guidellm/backend/base.py +0 -320
  54. guidellm-0.1.0/src/guidellm/backend/openai.py +0 -168
  55. guidellm-0.1.0/src/guidellm/core/__init__.py +0 -24
  56. guidellm-0.1.0/src/guidellm/core/distribution.py +0 -190
  57. guidellm-0.1.0/src/guidellm/core/report.py +0 -321
  58. guidellm-0.1.0/src/guidellm/core/request.py +0 -44
  59. guidellm-0.1.0/src/guidellm/core/result.py +0 -545
  60. guidellm-0.1.0/src/guidellm/core/serializable.py +0 -169
  61. guidellm-0.1.0/src/guidellm/executor/__init__.py +0 -10
  62. guidellm-0.1.0/src/guidellm/executor/base.py +0 -213
  63. guidellm-0.1.0/src/guidellm/executor/profile_generator.py +0 -343
  64. guidellm-0.1.0/src/guidellm/main.py +0 -336
  65. guidellm-0.1.0/src/guidellm/request/__init__.py +0 -13
  66. guidellm-0.1.0/src/guidellm/request/base.py +0 -194
  67. guidellm-0.1.0/src/guidellm/request/emulated.py +0 -391
  68. guidellm-0.1.0/src/guidellm/request/file.py +0 -76
  69. guidellm-0.1.0/src/guidellm/request/transformers.py +0 -100
  70. guidellm-0.1.0/src/guidellm/scheduler/__init__.py +0 -4
  71. guidellm-0.1.0/src/guidellm/scheduler/base.py +0 -374
  72. guidellm-0.1.0/src/guidellm/scheduler/load_generator.py +0 -196
  73. guidellm-0.1.0/src/guidellm/utils/__init__.py +0 -40
  74. guidellm-0.1.0/src/guidellm/utils/injector.py +0 -70
  75. guidellm-0.1.0/src/guidellm/utils/progress.py +0 -196
  76. guidellm-0.1.0/src/guidellm/utils/text.py +0 -455
  77. guidellm-0.1.0/src/guidellm/utils/transformers.py +0 -151
  78. guidellm-0.1.0/src/guidellm.egg-info/SOURCES.txt +0 -39
  79. guidellm-0.1.0/src/guidellm.egg-info/entry_points.txt +0 -3
  80. {guidellm-0.1.0 → guidellm-0.2.0rc20250418}/LICENSE +0 -0
  81. {guidellm-0.1.0 → guidellm-0.2.0rc20250418}/MANIFEST.in +0 -0
  82. {guidellm-0.1.0 → guidellm-0.2.0rc20250418}/setup.cfg +0 -0
  83. {guidellm-0.1.0 → guidellm-0.2.0rc20250418}/src/guidellm/logger.py +0 -0
  84. {guidellm-0.1.0 → guidellm-0.2.0rc20250418}/src/guidellm.egg-info/dependency_links.txt +0 -0
  85. {guidellm-0.1.0 → guidellm-0.2.0rc20250418}/src/guidellm.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
- Metadata-Version: 2.1
+ Metadata-Version: 2.4
  Name: guidellm
- Version: 0.1.0
+ Version: 0.2.0rc20250418
  Summary: Guidance platform for deploying and managing large language models.
  Author: Neuralmagic, Inc.
  License: Apache License
@@ -206,15 +206,17 @@ License: Apache License
  limitations under the License.

  Project-URL: homepage, https://github.com/neuralmagic/guidellm
- Requires-Python: <4.0,>=3.8.0
+ Requires-Python: <4.0,>=3.9.0
  Description-Content-Type: text/markdown
  License-File: LICENSE
  Requires-Dist: click
  Requires-Dist: datasets
  Requires-Dist: ftfy>=6.0.0
+ Requires-Dist: httpx[http2]<1.0.0
  Requires-Dist: loguru
  Requires-Dist: numpy
- Requires-Dist: openai
+ Requires-Dist: pillow
+ Requires-Dist: protobuf
  Requires-Dist: pydantic>=2.0.0
  Requires-Dist: pydantic-settings>=2.0.0
  Requires-Dist: pyyaml>=6.0.0
@@ -226,12 +228,14 @@ Requires-Dist: pre-commit~=3.5.0; extra == "dev"
  Requires-Dist: scipy~=1.10; extra == "dev"
  Requires-Dist: sphinx~=7.1.2; extra == "dev"
  Requires-Dist: tox~=4.16.0; extra == "dev"
+ Requires-Dist: lorem~=0.1.1; extra == "dev"
  Requires-Dist: pytest~=8.2.2; extra == "dev"
  Requires-Dist: pytest-asyncio~=0.23.8; extra == "dev"
  Requires-Dist: pytest-cov~=5.0.0; extra == "dev"
  Requires-Dist: pytest-mock~=3.14.0; extra == "dev"
  Requires-Dist: pytest-rerunfailures~=14.0; extra == "dev"
  Requires-Dist: requests-mock~=1.12.1; extra == "dev"
+ Requires-Dist: respx~=0.22.0; extra == "dev"
  Requires-Dist: mypy~=1.10.1; extra == "dev"
  Requires-Dist: ruff~=0.5.2; extra == "dev"
  Requires-Dist: mdformat~=0.7.17; extra == "dev"
@@ -242,11 +246,12 @@ Requires-Dist: types-click~=7.1.8; extra == "dev"
  Requires-Dist: types-PyYAML~=6.0.1; extra == "dev"
  Requires-Dist: types-requests~=2.32.0; extra == "dev"
  Requires-Dist: types-toml; extra == "dev"
+ Dynamic: license-file

  <p align="center">
  <picture>
- <source media="(prefers-color-scheme: dark)" srcset="https://github.com/neuralmagic/guidellm/blob/main/docs/assets/guidellm-logo-light.png">
- <img alt="GuideLLM Logo" src="https://github.com/neuralmagic/guidellm/blob/main/docs/assets/guidellm-logo-dark.png" width=55%>
+ <source media="(prefers-color-scheme: dark)" srcset="https://raw.githubusercontent.com/neuralmagic/guidellm/main/docs/assets/guidellm-logo-light.png">
+ <img alt="GuideLLM Logo" src="https://raw.githubusercontent.com/neuralmagic/guidellm/main/docs/assets/guidellm-logo-dark.png" width=55%>
  </picture>
  </p>

@@ -254,25 +259,25 @@ Requires-Dist: types-toml; extra == "dev"
  Scale Efficiently: Evaluate and Optimize Your LLM Deployments for Real-World Inference
  </h3>

- [![GitHub Release](https://img.shields.io/github/release/neuralmagic/guidellm.svg?label=Version)](https://github.com/neuralmagic/guidellm/releases) [![Documentation](https://img.shields.io/badge/Documentation-8A2BE2?logo=read-the-docs&logoColor=%23ffffff&color=%231BC070)](https://github.com/neuralmagic/guidellm/tree/main/docs) [![License](https://img.shields.io/github/license/neuralmagic/guidellm.svg)](https://github.com/neuralmagic/guidellm/blob/main/LICENSE) [![PyPI Release](https://img.shields.io/pypi/v/guidellm.svg?label=PyPI%20Release)](https://pypi.python.org/pypi/guidellm) [![Pypi Release](https://img.shields.io/pypi/v/guidellm-nightly.svg?label=PyPI%20Nightly)](https://pypi.python.org/pypi/guidellm-nightly) [![Python Versions](https://img.shields.io/pypi/pyversions/guidellm.svg?label=Python)](https://pypi.python.org/pypi/guidellm) [![Nightly Build](https://img.shields.io/github/actions/workflow/status/neuralmagic/guidellm/nightly.yml?branch=main&label=Nightly%20Build)](https://github.com/neuralmagic/guidellm/actions/workflows/nightly.yml)
+ [![GitHub Release](https://img.shields.io/github/release/neuralmagic/guidellm.svg?label=Version)](https://github.com/neuralmagic/guidellm/releases) [![Documentation](https://img.shields.io/badge/Documentation-8A2BE2?logo=read-the-docs&logoColor=%23ffffff&color=%231BC070)](https://github.com/neuralmagic/guidellm/tree/main/docs) [![License](https://img.shields.io/github/license/neuralmagic/guidellm.svg)](https://github.com/neuralmagic/guidellm/blob/main/LICENSE) [![PyPI Release](https://img.shields.io/pypi/v/guidellm.svg?label=PyPI%20Release)](https://pypi.python.org/pypi/guidellm) [![Python Versions](https://img.shields.io/badge/Python-3.8--3.12-orange)](https://pypi.python.org/pypi/guidellm) [![Nightly Build](https://img.shields.io/github/actions/workflow/status/neuralmagic/guidellm/nightly.yml?branch=main&label=Nightly%20Build)](https://github.com/neuralmagic/guidellm/actions/workflows/nightly.yml)

  ## Overview

  <p>
  <picture>
- <source media="(prefers-color-scheme: dark)" srcset="https://github.com/neuralmagic/guidellm/blob/main/docs/assets/guidellm-user-flows-dark.png">
- <img alt="GuideLLM User Flows" src="https://github.com/neuralmagic/guidellm/blob/main/docs/assets/guidellm-user-flows-light.png">
+ <source media="(prefers-color-scheme: dark)" srcset="https://raw.githubusercontent.com/neuralmagic/guidellm/main/docs/assets/guidellm-user-flows-dark.png">
+ <img alt="GuideLLM User Flows" src="https://raw.githubusercontent.com/neuralmagic/guidellm/main/docs/assets/guidellm-user-flows-light.png">
  </picture>
  </p>

- **GuideLLM** is a powerful tool for evaluating and optimizing the deployment of large language models (LLMs). By simulating real-world inference workloads, GuideLLM helps users gauge the performance, resource needs, and cost implications of deploying LLMs on various hardware configurations. This approach ensures efficient, scalable, and cost-effective LLM inference serving while maintaining high service quality.
+ **GuideLLM** is a platform for evaluating and optimizing the deployment of large language models (LLMs). By simulating real-world inference workloads, GuideLLM enables users to assess the performance, resource requirements, and cost implications of deploying LLMs on various hardware configurations. This approach ensures efficient, scalable, and cost-effective LLM inference serving while maintaining high service quality.

  ### Key Features

  - **Performance Evaluation:** Analyze LLM inference under different load scenarios to ensure your system meets your service level objectives (SLOs).
  - **Resource Optimization:** Determine the most suitable hardware configurations for running your models effectively.
  - **Cost Estimation:** Understand the financial impact of different deployment strategies and make informed decisions to minimize costs.
- - **Scalability Testing:** Simulate scaling to handle large numbers of concurrent users without degradation in performance.
+ - **Scalability Testing:** Simulate scaling to handle large numbers of concurrent users without performance degradation.

  ## Getting Started

@@ -281,21 +286,27 @@ Scale Efficiently: Evaluate and Optimize Your LLM Deployments for Real-World Inf
  Before installing, ensure you have the following prerequisites:

  - OS: Linux or MacOS
- - Python: 3.8 – 3.12
+ - Python: 3.9 – 3.13

- GuideLLM is available on PyPI and is installed using `pip`:
+ The latest GuideLLM release can be installed using pip:

  ```bash
  pip install guidellm
  ```

+ Or from source code using pip:
+
+ ```bash
+ pip install git+https://github.com/neuralmagic/guidellm.git
+ ```
+
  For detailed installation instructions and requirements, see the [Installation Guide](https://github.com/neuralmagic/guidellm/tree/main/docs/install.md).

  ### Quick Start

  #### 1. Start an OpenAI Compatible Server (vLLM)

- GuideLLM requires an OpenAI-compatible server to run evaluations. [vLLM](https://github.com/vllm-project/vllm) is recommended for this purpose. To start a vLLM server with a Llama 3.1 8B quantized model, run the following command:
+ GuideLLM requires an OpenAI-compatible server to run evaluations. [vLLM](https://github.com/vllm-project/vllm) is recommended for this purpose. After installing vLLM on your desired server (`pip install vllm`), start a vLLM server with a Llama 3.1 8B quantized model by running the following command:

  ```bash
  vllm serve "neuralmagic/Meta-Llama-3.1-8B-Instruct-quantized.w4a16"
@@ -303,107 +314,114 @@ vllm serve "neuralmagic/Meta-Llama-3.1-8B-Instruct-quantized.w4a16"

  For more information on starting a vLLM server, see the [vLLM Documentation](https://docs.vllm.ai/en/latest/serving/openai_compatible_server.html).

- #### 2. Run a GuideLLM Evaluation
+ For information on starting other supported inference servers or platforms, see the [Supported Backends documentation](https://github.com/neuralmagic/guidellm/tree/main/docs/backends.md).
+
+ #### 2. Run a GuideLLM Benchmark

- To run a GuideLLM evaluation, use the `guidellm` command with the appropriate model name and options on the server hosting the model or one with network access to the deployment server. For example, to evaluate the full performance range of the previously deployed Llama 3.1 8B model, run the following command:
+ To run a GuideLLM benchmark, use the `guidellm benchmark` command with the target set to an OpenAI-compatible server. For this example, the target is set to 'http://localhost:8000', assuming that vLLM is active and running on the same server. Otherwise, update it to the appropriate location. By default, GuideLLM automatically determines the model available on the server and uses it. To target a different model, pass the desired name with the `--model` argument. Additionally, the `--rate-type` is set to `sweep`, which automatically runs a range of benchmarks to determine the minimum and maximum rates that the server and model can support. Each benchmark run under the sweep will run for 30 seconds, as set by the `--max-seconds` argument. Finally, `--data` is set to a synthetic dataset with 256 prompt tokens and 128 output tokens per request. For more arguments, supported scenarios, and configurations, jump to the [Configurations Section](#configurations) or run `guidellm benchmark --help`.
+
+ Now, to start benchmarking, run the following command:

  ```bash
- guidellm \
- --target "http://localhost:8000/v1" \
- --model "neuralmagic/Meta-Llama-3.1-8B-Instruct-quantized.w4a16" \
- --data-type emulated \
- --data "prompt_tokens=512,generated_tokens=128"
+ guidellm benchmark \
+ --target "http://localhost:8000" \
+ --rate-type sweep \
+ --max-seconds 30 \
+ --data "prompt_tokens=256,output_tokens=128"
  ```

- The above command will begin the evaluation and output progress updates similar to the following (if running on a different server, be sure to update the target!): <img src= "https://raw.githubusercontent.com/neuralmagic/guidellm/main/docs/assets/sample-benchmarks.gif"/>
+ The above command will begin the evaluation and provide progress updates similar to the following: <img src= "https://raw.githubusercontent.com/neuralmagic/guidellm/main/docs/assets/sample-benchmarks.gif"/>

- Notes:
+ #### 3. Analyze the Results

- - The `--target` flag specifies the server hosting the model. In this case, it is a local vLLM server.
- - The `--model` flag specifies the model to evaluate. The model name should match the name of the model deployed on the server
- - By default, GuideLLM will run a `sweep` of performance evaluations across different request rates, each lasting 120 seconds and the results are printed out to the terminal.
+ After the evaluation is completed, GuideLLM will summarize the results into three sections:

- #### 3. Analyze the Results
+ 1. Benchmarks Metadata: A summary of the benchmark run and the arguments used to create it, including the server, data, profile, and more.
+ 2. Benchmarks Info: A high-level view of each benchmark and the requests that were run, including the type, duration, request statuses, and number of tokens.
+ 3. Benchmarks Stats: A summary of the statistics for each benchmark run, including the request rate, concurrency, latency, and token-level metrics such as TTFT, ITL, and more.

- After the evaluation is completed, GuideLLM will summarize the results, including various performance metrics.
+ The sections will look similar to the following: <img alt="Sample GuideLLM benchmark output" src="https://raw.githubusercontent.com/neuralmagic/guidellm/main/docs/assets/sample-output.png" />

- The output results will start with a summary of the evaluation, followed by the requests data for each benchmark run. For example, the start of the output will look like the following:
+ For more details about the metrics and definitions, please refer to the [Metrics documentation](https://raw.githubusercontent.com/neuralmagic/guidellm/main/docs/metrics.md).

- <img alt="Sample GuideLLM benchmark start output" src="https://github.com/neuralmagic/guidellm/blob/main/docs/assets/sample-output-start.png" />
+ #### 4. Explore the Results File

- The end of the output will include important performance summary metrics such as request latency, time to first token (TTFT), inter-token latency (ITL), and more:
+ By default, the full results, including complete statistics and request data, are saved to a file `benchmarks.json` in the current working directory. This file can be used for further analysis or reporting, and additionally can be reloaded into Python for further analysis using the `guidellm.benchmark.GenerativeBenchmarksReport` class. You can specify a different file name and extension with the `--output-path` argument.

- <img alt="Sample GuideLLM benchmark end output" src="https://github.com/neuralmagic/guidellm/blob/main/docs/assets/sample-output-end.png" />
+ For more details about the supported output file types, please take a look at the [Outputs documentation](https://raw.githubusercontent.com/neuralmagic/guidellm/main/docs/outputs.md).

- #### 4. Use the Results
+ #### 5. Use the Results

  The results from GuideLLM are used to optimize your LLM deployment for performance, resource efficiency, and cost. By analyzing the performance metrics, you can identify bottlenecks, determine the optimal request rate, and select the most cost-effective hardware configuration for your deployment.

- For example, if we deploy a latency-sensitive chat application, we likely want to optimize for low time to first token (TTFT) and inter-token latency (ITL). A reasonable threshold will depend on the application requirements. Still, we may want to ensure time to first token (TTFT) is under 200ms and inter-token latency (ITL) is under 50ms (20 updates per second). From the example results above, we can see that the model can meet these requirements on average at a request rate of 2.37 requests per second for each server. If you'd like to target a higher percentage of requests meeting these requirements, you can use the **Performance Stats by Benchmark** section to determine the rate at which 90% or 95% of requests meet these requirements.
+ For example, when deploying a chat application, we likely want to ensure that our time to first token (TTFT) and inter-token latency (ITL) are under certain thresholds to meet our service level objectives (SLOs) or service level agreements (SLAs). For example, setting TTFT to 200ms and ITL to 25ms for the sample data provided in the example above, we can see that even though the server is capable of handling up to 13 requests per second, we would only be able to meet our SLOs for 99% of users at a request rate of 3.5 requests per second. If we relax our constraints on ITL to 50ms, then we can meet the TTFT SLA for 99% of users at a request rate of approximately 10 requests per second.

- If we deploy a throughput-sensitive summarization application, we likely want to optimize for the maximum requests the server can handle per second. In this case, the throughput benchmark shows that the server maxes out at 4.06 requests per second. If we need to handle more requests, consider adding more servers or upgrading the hardware configuration.
+ For further details on determining the optimal request rate and SLOs, refer to the [SLOs documentation](https://raw.githubusercontent.com/neuralmagic/guidellm/main/docs/service_level_objectives.md).

  ### Configurations

- GuideLLM provides various CLI and environment options to customize evaluations, including setting the duration of each benchmark run, the number of concurrent requests, and the request rate.
+ GuideLLM offers a range of configurations through both the benchmark CLI command and environment variables, which provide default values and more granular controls. The most common configurations are listed below. A complete list is available by running `guidellm benchmark --help` or `guidellm config`, respectively.

- Some typical configurations for the CLI include:
+ #### Benchmark CLI

- - `--rate-type`: The rate to use for benchmarking. Options include `sweep`, `synchronous`, `throughput`, `constant`, and `poisson`.
- - `--rate-type sweep`: (default) Sweep runs through the full range of the server's performance, starting with a `synchronous` rate, then `throughput`, and finally, 10 `constant` rates between the min and max request rate found.
- - `--rate-type synchronous`: Synchronous runs requests synchronously, one after the other.
- - `--rate-type throughput`: Throughput runs requests in a throughput manner, sending requests as fast as possible.
- - `--rate-type constant`: Constant runs requests at a constant rate. Specify the request rate per second with the `--rate` argument. For example, `--rate 10` or multiple rates with `--rate 10 --rate 20 --rate 30`.
- - `--rate-type poisson`: Poisson draws from a Poisson distribution with the mean at the specified rate, adding some real-world variance to the runs. Specify the request rate per second with the `--rate` argument. For example, `--rate 10` or multiple rates with `--rate 10 --rate 20 --rate 30`.
- - `--data-type`: The data to use for the benchmark. Options include `emulated`, `transformers`, and `file`.
- - `--data-type emulated`: Emulated supports an EmulationConfig in string or file format for the `--data` argument to generate fake data. Specify the number of prompt tokens at a minimum and optionally the number of output tokens and other parameters for variance in the length. For example, `--data "prompt_tokens=128"`, `--data "prompt_tokens=128,generated_tokens=128" `, or `--data "prompt_tokens=128,prompt_tokens_variance=10" `.
- - `--data-type file`: File supports a file path or URL to a file for the `--data` argument. The file should contain data encoded as a CSV, JSONL, TXT, or JSON/YAML file with a single prompt per line for CSV, JSONL, and TXT or a list of prompts for JSON/YAML. For example, `--data "data.txt"` where data.txt contents are `"prompt1\nprompt2\nprompt3"`.
- - `--data-type transformers`: Transformers supports a dataset name or file path for the `--data` argument. For example, `--data "neuralmagic/LLM_compression_calibration"`.
- - `--max-seconds`: The maximum number of seconds to run each benchmark. The default is 120 seconds.
- - `--max-requests`: The maximum number of requests to run in each benchmark.
+ The `guidellm benchmark` command is used to run benchmarks against a generative AI backend/server. The command accepts a variety of arguments to customize the benchmark run. The most common arguments include:

- For a complete list of supported CLI arguments, run the following command:
+ - `--target`: Specifies the target path for the backend to run benchmarks against. For example, `http://localhost:8000`. This is required to define the server endpoint.

- ```bash
- guidellm --help
- ```
+ - `--model`: Allows selecting a specific model from the server. If not provided, it defaults to the first model available on the server. Useful when multiple models are hosted on the same server.

- For a full list of configuration options, run the following command:
+ - `--processor`: Used only for synthetic data creation or when the token source configuration is set to local for calculating token metrics locally. It must match the model's processor or tokenizer to ensure compatibility and correctness. This supports either a HuggingFace model ID or a local path to a processor or tokenizer.

- ```bash
- guidellm-config
- ```
+ - `--data`: Specifies the dataset to use. This can be a HuggingFace dataset ID, a local path to a dataset, or standard text files such as CSV, JSONL, and more. Additionally, synthetic data configurations can be provided using JSON or key-value strings. Synthetic data options include:
+
+ - `prompt_tokens`: Average number of tokens for prompts.
+ - `output_tokens`: Average number of tokens for outputs.
+ - `TYPE_stdev`, `TYPE_min`, `TYPE_max`: Standard deviation, minimum, and maximum values for the specified type (e.g., `prompt_tokens`, `output_tokens`). If not provided, will use the provided tokens value only.
+ - `samples`: Number of samples to generate, defaults to 1000.
+ - `source`: Source text data for generation, defaults to a local copy of Pride and Prejudice.
+
+ - `--data-args`: A JSON string used to specify the columns to source data from (e.g., `prompt_column`, `output_tokens_count_column`) and additional arguments to pass into the HuggingFace datasets constructor.
+
+ - `--data-sampler`: Enables applying `random` shuffling or sampling to the dataset. If not set, no sampling is used.
+
+ - `--rate-type`: Defines the type of benchmark to run (default sweep). Supported types include:
+
+ - `synchronous`: Runs a single stream of requests one at a time. `--rate` must not be set for this mode.
+ - `throughput`: Runs all requests in parallel to measure the maximum throughput for the server (bounded by GUIDELLM\_\_MAX_CONCURRENCY config argument). `--rate` must not be set for this mode.
+ - `concurrent`: Runs a fixed number of streams of requests in parallel. `--rate` must be set to the desired concurrency level/number of streams.
+ - `constant`: Sends requests asynchronously at a constant rate set by `--rate`.
+ - `poisson`: Sends requests at a rate following a Poisson distribution with the mean set by `--rate`.
+ - `sweep`: Automatically determines the minimum and maximum rates the server can support by running synchronous and throughput benchmarks, and then runs a series of benchmarks equally spaced between the two rates. The number of benchmarks is set by `--rate` (default is 10).
+
+ - `--max-seconds`: Sets the maximum duration (in seconds) for each benchmark run. If not specified, the benchmark will run until the dataset is exhausted or the `--max-requests` limit is reached.

- See the [GuideLLM Documentation](#Documentation) for further information.
+ - `--max-requests`: Sets the maximum number of requests for each benchmark run. If not provided, the benchmark will run until `--max-seconds` is reached or the dataset is exhausted.
+
+ - `--warmup-percent`: Specifies the percentage of the benchmark to treat as a warmup phase. Requests during this phase are excluded from the final results.
+
+ - `--cooldown-percent`: Specifies the percentage of the benchmark to treat as a cooldown phase. Requests during this phase are excluded from the final results.
+
+ - `--output-path`: Defines the path to save the benchmark results. Supports JSON, YAML, or CSV formats. If a directory is provided, the results will be saved as `benchmarks.json` in that directory. If not set, the results will be saved in the current working directory.

  ## Resources

  ### Documentation

- Our comprehensive documentation provides detailed guides and resources to help you get the most out of GuideLLM. Whether just getting started or looking to dive deeper into advanced topics, you can find what you need in our [full documentation](https://github.com/neuralmagic/guidellm/tree/main/docs).
+ Our comprehensive documentation offers detailed guides and resources to help you maximize the benefits of GuideLLM. Whether just getting started or looking to dive deeper into advanced topics, you can find what you need in our [documentation](https://github.com/neuralmagic/guidellm/tree/main/docs).

  ### Core Docs

  - [**Installation Guide**](https://github.com/neuralmagic/guidellm/tree/main/docs/install.md) - This guide provides step-by-step instructions for installing GuideLLM, including prerequisites and setup tips.
+ - [**Backends Guide**](https://github.com/neuralmagic/guidellm/tree/main/docs/backends.md) - A comprehensive overview of supported backends and how to set them up for use with GuideLLM.
+ - [**Metrics Guide**](https://github.com/neuralmagic/guidellm/tree/main/docs/metrics.md) - Detailed explanations of the metrics used in GuideLLM, including definitions and how to interpret them.
+ - [**Outputs Guide**](https://github.com/neuralmagic/guidellm/tree/main/docs/outputs.md) - Information on the different output formats supported by GuideLLM and how to use them.
  - [**Architecture Overview**](https://github.com/neuralmagic/guidellm/tree/main/docs/architecture.md) - A detailed look at GuideLLM's design, components, and how they interact.
- - [**CLI Guide**](https://github.com/neuralmagic/guidellm/tree/main/docs/guides/cli.md) - Comprehensive usage information for running GuideLLM via the command line, including available commands and options.
- - [**Configuration Guide**](https://github.com/neuralmagic/guidellm/tree/main/docs/guides/configuration.md) - Instructions on configuring GuideLLM to suit various deployment needs and performance goals.

  ### Supporting External Documentation

  - [**vLLM Documentation**](https://vllm.readthedocs.io/en/latest/) - Official vLLM documentation provides insights into installation, usage, and supported models.

- ### Releases
-
- Visit our [GitHub Releases page](https://github.com/neuralmagic/guidellm/releases) and review the release notes to stay updated with the latest releases.
-
- ### License
-
- GuideLLM is licensed under the [Apache License 2.0](https://github.com/neuralmagic/guidellm/blob/main/LICENSE).
-
- ## Community
-
- ### Contribute
+ ### Contribution Docs

  We appreciate contributions to the code, examples, integrations, documentation, bug reports, and feature requests! Your feedback and involvement are crucial in helping GuideLLM grow and improve. Below are some ways you can get involved:

@@ -411,14 +429,13 @@ We appreciate contributions to the code, examples, integrations, documentation,
  - [**CONTRIBUTING.md**](https://github.com/neuralmagic/guidellm/blob/main/CONTRIBUTING.md) - Guidelines for contributing to the project, including code standards, pull request processes, and more.
  - [**CODE_OF_CONDUCT.md**](https://github.com/neuralmagic/guidellm/blob/main/CODE_OF_CONDUCT.md) - Our expectations for community behavior to ensure a welcoming and inclusive environment.

- ### Join
+ ### Releases
+
+ Visit our [GitHub Releases page](https://github.com/neuralmagic/guidellm/releases) and review the release notes to stay updated with the latest releases.

- We invite you to join our growing community of developers, researchers, and enthusiasts passionate about LLMs and optimization. Whether you're looking for help, want to share your own experiences, or stay up to date with the latest developments, there are plenty of ways to get involved:
+ ### License

- - [**Neural Magic Community Slack**](https://neuralmagic.com/community/) - Join our Slack channel to connect with other GuideLLM users and developers. Ask questions, share your work, and get real-time support.
- - [**GitHub Issues**](https://github.com/neuralmagic/guidellm/issues) - Report bugs, request features, or browse existing issues. Your feedback helps us improve GuideLLM.
- - [**Subscribe to Updates**](https://neuralmagic.com/subscribe/) - Sign up for the latest news, announcements, and updates about GuideLLM, webinars, events, and more.
- - [**Contact Us**](http://neuralmagic.com/contact/) - Use our contact form for general questions about Neural Magic or GuideLLM.
+ GuideLLM is licensed under the [Apache License 2.0](https://github.com/neuralmagic/guidellm/blob/main/LICENSE).

  ### Cite

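The PKG-INFO diff above documents how the new `guidellm benchmark` flags compose. As a hedged illustration only, the sketch below pairs the `constant` rate type with the synthetic-data `TYPE_stdev` options described in those added lines; the endpoint and all numeric values are placeholders, not part of the diff:

```bash
# Illustrative values only: a fixed-rate run using flags documented in the
# diff above. --rate sets the requests/second for the constant rate type,
# and the *_stdev keys add variance around the average token counts.
guidellm benchmark \
  --target "http://localhost:8000" \
  --rate-type constant \
  --rate 5 \
  --max-seconds 60 \
  --data "prompt_tokens=256,prompt_tokens_stdev=32,output_tokens=128,output_tokens_stdev=16"
```
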
@@ -0,0 +1,201 @@
+ <p align="center">
+ <picture>
+ <source media="(prefers-color-scheme: dark)" srcset="https://raw.githubusercontent.com/neuralmagic/guidellm/main/docs/assets/guidellm-logo-light.png">
+ <img alt="GuideLLM Logo" src="https://raw.githubusercontent.com/neuralmagic/guidellm/main/docs/assets/guidellm-logo-dark.png" width=55%>
+ </picture>
+ </p>
+
+ <h3 align="center">
+ Scale Efficiently: Evaluate and Optimize Your LLM Deployments for Real-World Inference
+ </h3>
+
+ [![GitHub Release](https://img.shields.io/github/release/neuralmagic/guidellm.svg?label=Version)](https://github.com/neuralmagic/guidellm/releases) [![Documentation](https://img.shields.io/badge/Documentation-8A2BE2?logo=read-the-docs&logoColor=%23ffffff&color=%231BC070)](https://github.com/neuralmagic/guidellm/tree/main/docs) [![License](https://img.shields.io/github/license/neuralmagic/guidellm.svg)](https://github.com/neuralmagic/guidellm/blob/main/LICENSE) [![PyPI Release](https://img.shields.io/pypi/v/guidellm.svg?label=PyPI%20Release)](https://pypi.python.org/pypi/guidellm) [![Python Versions](https://img.shields.io/badge/Python-3.8--3.12-orange)](https://pypi.python.org/pypi/guidellm) [![Nightly Build](https://img.shields.io/github/actions/workflow/status/neuralmagic/guidellm/nightly.yml?branch=main&label=Nightly%20Build)](https://github.com/neuralmagic/guidellm/actions/workflows/nightly.yml)
+
+ ## Overview
+
+ <p>
+ <picture>
+ <source media="(prefers-color-scheme: dark)" srcset="https://raw.githubusercontent.com/neuralmagic/guidellm/main/docs/assets/guidellm-user-flows-dark.png">
+ <img alt="GuideLLM User Flows" src="https://raw.githubusercontent.com/neuralmagic/guidellm/main/docs/assets/guidellm-user-flows-light.png">
+ </picture>
+ </p>
+
+ **GuideLLM** is a platform for evaluating and optimizing the deployment of large language models (LLMs). By simulating real-world inference workloads, GuideLLM enables users to assess the performance, resource requirements, and cost implications of deploying LLMs on various hardware configurations. This approach ensures efficient, scalable, and cost-effective LLM inference serving while maintaining high service quality.
+
+ ### Key Features
+
+ - **Performance Evaluation:** Analyze LLM inference under different load scenarios to ensure your system meets your service level objectives (SLOs).
+ - **Resource Optimization:** Determine the most suitable hardware configurations for running your models effectively.
+ - **Cost Estimation:** Understand the financial impact of different deployment strategies and make informed decisions to minimize costs.
+ - **Scalability Testing:** Simulate scaling to handle large numbers of concurrent users without performance degradation.
+
+ ## Getting Started
+
+ ### Installation
+
+ Before installing, ensure you have the following prerequisites:
+
+ - OS: Linux or MacOS
+ - Python: 3.9 – 3.13
+
+ The latest GuideLLM release can be installed using pip:
+
+ ```bash
+ pip install guidellm
+ ```
+
+ Or from source code using pip:
+
+ ```bash
+ pip install git+https://github.com/neuralmagic/guidellm.git
+ ```
+
+ For detailed installation instructions and requirements, see the [Installation Guide](https://github.com/neuralmagic/guidellm/tree/main/docs/install.md).
+
+ ### Quick Start
+
+ #### 1. Start an OpenAI Compatible Server (vLLM)
+
+ GuideLLM requires an OpenAI-compatible server to run evaluations. [vLLM](https://github.com/vllm-project/vllm) is recommended for this purpose. After installing vLLM on your desired server (`pip install vllm`), start a vLLM server with a Llama 3.1 8B quantized model by running the following command:
+
+ ```bash
+ vllm serve "neuralmagic/Meta-Llama-3.1-8B-Instruct-quantized.w4a16"
+ ```
+
+ For more information on starting a vLLM server, see the [vLLM Documentation](https://docs.vllm.ai/en/latest/serving/openai_compatible_server.html).
+
+ For information on starting other supported inference servers or platforms, see the [Supported Backends documentation](https://github.com/neuralmagic/guidellm/tree/main/docs/backends.md).
+
+ #### 2. Run a GuideLLM Benchmark
+
+ To run a GuideLLM benchmark, use the `guidellm benchmark` command with the target set to an OpenAI-compatible server. For this example, the target is set to 'http://localhost:8000', assuming that vLLM is active and running on the same server. Otherwise, update it to the appropriate location. By default, GuideLLM automatically determines the model available on the server and uses it. To target a different model, pass the desired name with the `--model` argument. Additionally, the `--rate-type` is set to `sweep`, which automatically runs a range of benchmarks to determine the minimum and maximum rates that the server and model can support. Each benchmark run under the sweep will run for 30 seconds, as set by the `--max-seconds` argument. Finally, `--data` is set to a synthetic dataset with 256 prompt tokens and 128 output tokens per request. For more arguments, supported scenarios, and configurations, jump to the [Configurations Section](#configurations) or run `guidellm benchmark --help`.
+
+ Now, to start benchmarking, run the following command:
+
+ ```bash
+ guidellm benchmark \
+ --target "http://localhost:8000" \
+ --rate-type sweep \
+ --max-seconds 30 \
+ --data "prompt_tokens=256,output_tokens=128"
+ ```
+
+ The above command will begin the evaluation and provide progress updates similar to the following: <img src= "https://raw.githubusercontent.com/neuralmagic/guidellm/main/docs/assets/sample-benchmarks.gif"/>
+
+ #### 3. Analyze the Results
+
+ After the evaluation is completed, GuideLLM will summarize the results into three sections:
+
+ 1. Benchmarks Metadata: A summary of the benchmark run and the arguments used to create it, including the server, data, profile, and more.
+ 2. Benchmarks Info: A high-level view of each benchmark and the requests that were run, including the type, duration, request statuses, and number of tokens.
+ 3. Benchmarks Stats: A summary of the statistics for each benchmark run, including the request rate, concurrency, latency, and token-level metrics such as TTFT, ITL, and more.
+
+ The sections will look similar to the following: <img alt="Sample GuideLLM benchmark output" src="https://raw.githubusercontent.com/neuralmagic/guidellm/main/docs/assets/sample-output.png" />
+
+ For more details about the metrics and definitions, please refer to the [Metrics documentation](https://raw.githubusercontent.com/neuralmagic/guidellm/main/docs/metrics.md).
+
+ #### 4. Explore the Results File
+
+ By default, the full results, including complete statistics and request data, are saved to a file `benchmarks.json` in the current working directory. This file can be used for further analysis or reporting, and additionally can be reloaded into Python for further analysis using the `guidellm.benchmark.GenerativeBenchmarksReport` class. You can specify a different file name and extension with the `--output-path` argument.
+
+ For more details about the supported output file types, please take a look at the [Outputs documentation](https://raw.githubusercontent.com/neuralmagic/guidellm/main/docs/outputs.md).
+
+ #### 5. Use the Results
+
+ The results from GuideLLM are used to optimize your LLM deployment for performance, resource efficiency, and cost. By analyzing the performance metrics, you can identify bottlenecks, determine the optimal request rate, and select the most cost-effective hardware configuration for your deployment.
+
+ For example, when deploying a chat application, we likely want to ensure that our time to first token (TTFT) and inter-token latency (ITL) are under certain thresholds to meet our service level objectives (SLOs) or service level agreements (SLAs). For example, setting TTFT to 200ms and ITL to 25ms for the sample data provided in the example above, we can see that even though the server is capable of handling up to 13 requests per second, we would only be able to meet our SLOs for 99% of users at a request rate of 3.5 requests per second. If we relax our constraints on ITL to 50ms, then we can meet the TTFT SLA for 99% of users at a request rate of approximately 10 requests per second.
+
+ For further details on determining the optimal request rate and SLOs, refer to the [SLOs documentation](https://raw.githubusercontent.com/neuralmagic/guidellm/main/docs/service_level_objectives.md).
+
+ ### Configurations
+
+ GuideLLM offers a range of configurations through both the benchmark CLI command and environment variables, which provide default values and more granular controls. The most common configurations are listed below. A complete list is available by running `guidellm benchmark --help` or `guidellm config`, respectively.
+
+ #### Benchmark CLI
+
+ The `guidellm benchmark` command is used to run benchmarks against a generative AI backend/server. The command accepts a variety of arguments to customize the benchmark run. The most common arguments include:
+
+ - `--target`: Specifies the target path for the backend to run benchmarks against. For example, `http://localhost:8000`. This is required to define the server endpoint.
+
+ - `--model`: Allows selecting a specific model from the server. If not provided, it defaults to the first model available on the server. Useful when multiple models are hosted on the same server.
+
+ - `--processor`: Used only for synthetic data creation or when the token source configuration is set to local for calculating token metrics locally. It must match the model's processor or tokenizer to ensure compatibility and correctness. This supports either a HuggingFace model ID or a local path to a processor or tokenizer.
+
+ - `--data`: Specifies the dataset to use. This can be a HuggingFace dataset ID, a local path to a dataset, or standard text files such as CSV, JSONL, and more. Additionally, synthetic data configurations can be provided using JSON or key-value strings. Synthetic data options include:
+
+ - `prompt_tokens`: Average number of tokens for prompts.
+ - `output_tokens`: Average number of tokens for outputs.
+ - `TYPE_stdev`, `TYPE_min`, `TYPE_max`: Standard deviation, minimum, and maximum values for the specified type (e.g., `prompt_tokens`, `output_tokens`). If not provided, will use the provided tokens value only.
+ - `samples`: Number of samples to generate, defaults to 1000.
+ - `source`: Source text data for generation, defaults to a local copy of Pride and Prejudice.
+
+ - `--data-args`: A JSON string used to specify the columns to source data from (e.g., `prompt_column`, `output_tokens_count_column`) and additional arguments to pass into the HuggingFace datasets constructor.
+
+ - `--data-sampler`: Enables applying `random` shuffling or sampling to the dataset. If not set, no sampling is used.
+
+ - `--rate-type`: Defines the type of benchmark to run (default sweep). Supported types include:
+
+ - `synchronous`: Runs a single stream of requests one at a time. `--rate` must not be set for this mode.
+ - `throughput`: Runs all requests in parallel to measure the maximum throughput for the server (bounded by GUIDELLM\_\_MAX_CONCURRENCY config argument). `--rate` must not be set for this mode.
+ - `concurrent`: Runs a fixed number of streams of requests in parallel. `--rate` must be set to the desired concurrency level/number of streams.
+ - `constant`: Sends requests asynchronously at a constant rate set by `--rate`.
+ - `poisson`: Sends requests at a rate following a Poisson distribution with the mean set by `--rate`.
+ - `sweep`: Automatically determines the minimum and maximum rates the server can support by running synchronous and throughput benchmarks, and then runs a series of benchmarks equally spaced between the two rates. The number of benchmarks is set by `--rate` (default is 10).
+
+ - `--max-seconds`: Sets the maximum duration (in seconds) for each benchmark run. If not specified, the benchmark will run until the dataset is exhausted or the `--max-requests` limit is reached.
+
+ - `--max-requests`: Sets the maximum number of requests for each benchmark run. If not provided, the benchmark will run until `--max-seconds` is reached or the dataset is exhausted.
+
+ - `--warmup-percent`: Specifies the percentage of the benchmark to treat as a warmup phase. Requests during this phase are excluded from the final results.
+
+ - `--cooldown-percent`: Specifies the percentage of the benchmark to treat as a cooldown phase. Requests during this phase are excluded from the final results.
+
+ - `--output-path`: Defines the path to save the benchmark results. Supports JSON, YAML, or CSV formats. If a directory is provided, the results will be saved as `benchmarks.json` in that directory. If not set, the results will be saved in the current working directory.
+
+ ## Resources
+
+ ### Documentation
+
+ Our comprehensive documentation offers detailed guides and resources to help you maximize the benefits of GuideLLM. Whether just getting started or looking to dive deeper into advanced topics, you can find what you need in our [documentation](https://github.com/neuralmagic/guidellm/tree/main/docs).
+
+ ### Core Docs
+
+ - [**Installation Guide**](https://github.com/neuralmagic/guidellm/tree/main/docs/install.md) - This guide provides step-by-step instructions for installing GuideLLM, including prerequisites and setup tips.
+ - [**Backends Guide**](https://github.com/neuralmagic/guidellm/tree/main/docs/backends.md) - A comprehensive overview of supported backends and how to set them up for use with GuideLLM.
+ - [**Metrics Guide**](https://github.com/neuralmagic/guidellm/tree/main/docs/metrics.md) - Detailed explanations of the metrics used in GuideLLM, including definitions and how to interpret them.
+ - [**Outputs Guide**](https://github.com/neuralmagic/guidellm/tree/main/docs/outputs.md) - Information on the different output formats supported by GuideLLM and how to use them.
+ - [**Architecture Overview**](https://github.com/neuralmagic/guidellm/tree/main/docs/architecture.md) - A detailed look at GuideLLM's design, components, and how they interact.
+
+ ### Supporting External Documentation
+
+ - [**vLLM Documentation**](https://vllm.readthedocs.io/en/latest/) - Official vLLM documentation provides insights into installation, usage, and supported models.
+
+ ### Contribution Docs
+
+ We appreciate contributions to the code, examples, integrations, documentation, bug reports, and feature requests! Your feedback and involvement are crucial in helping GuideLLM grow and improve. Below are some ways you can get involved:
+
+ - [**DEVELOPING.md**](https://github.com/neuralmagic/guidellm/blob/main/DEVELOPING.md) - Development guide for setting up your environment and making contributions.
+ - [**CONTRIBUTING.md**](https://github.com/neuralmagic/guidellm/blob/main/CONTRIBUTING.md) - Guidelines for contributing to the project, including code standards, pull request processes, and more.
+ - [**CODE_OF_CONDUCT.md**](https://github.com/neuralmagic/guidellm/blob/main/CODE_OF_CONDUCT.md) - Our expectations for community behavior to ensure a welcoming and inclusive environment.
+
+ ### Releases
+
+ Visit our [GitHub Releases page](https://github.com/neuralmagic/guidellm/releases) and review the release notes to stay updated with the latest releases.
+
+ ### License
+
+ GuideLLM is licensed under the [Apache License 2.0](https://github.com/neuralmagic/guidellm/blob/main/LICENSE).
+
+ ### Cite
+
+ If you find GuideLLM helpful in your research or projects, please consider citing it:
+
+ ```bibtex
+ @misc{guidellm2024,
+ title={GuideLLM: Scalable Inference and Optimization for Large Language Models},
+ author={Neural Magic, Inc.},
+ year={2024},
+ howpublished={\url{https://github.com/neuralmagic/guidellm}},
+ }
+ ```
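
Step 4 of the quick start in the README above notes that results land in `benchmarks.json` and that `--output-path` also accepts YAML or CSV. A minimal sketch of redirecting the report, assuming the same local vLLM target as the quick start; the `results/` directory and all values here are hypothetical:

```bash
# Hypothetical paths/values: write the full report as YAML instead of the
# default benchmarks.json, per the --output-path description above.
mkdir -p results  # assumption: the output directory must already exist
guidellm benchmark \
  --target "http://localhost:8000" \
  --rate-type synchronous \
  --max-requests 100 \
  --data "prompt_tokens=256,output_tokens=128" \
  --output-path "results/benchmarks.yaml"
```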
@@ -0,0 +1,74 @@
+ [build-system]
+ requires = [ "setuptools >= 61.0", "wheel", "build",]
+ build-backend = "setuptools.build_meta"
+
+ [project]
+ name = "guidellm"
+ version = "0.2.0.rc20250418"
+ description = "Guidance platform for deploying and managing large language models."
+ requires-python = ">=3.9.0,<4.0"
+ dependencies = [ "click", "datasets", "ftfy>=6.0.0", "httpx[http2]<1.0.0", "loguru", "numpy", "pillow", "protobuf", "pydantic>=2.0.0", "pydantic-settings>=2.0.0", "pyyaml>=6.0.0", "requests", "rich", "transformers",]
+ [[project.authors]]
+ name = "Neuralmagic, Inc."
+
+ [tool.isort]
+ profile = "black"
+
+ [tool.mypy]
+ files = [ "src/guidellm", "tests",]
+ python_version = "3.9"
+ warn_redundant_casts = true
+ warn_unused_ignores = false
+ show_error_codes = true
+ namespace_packages = true
+ exclude = [ "venv", ".tox",]
+ follow_imports = "silent"
+ [[tool.mypy.overrides]]
+ module = [ "datasets.*",]
+ ignore_missing_imports = true
+
+ [tool.ruff]
+ line-length = 88
+ indent-width = 4
+ exclude = [ "build", "dist", "env", ".venv",]
+
+ [project.readme]
+ file = "README.md"
+ content-type = "text/markdown"
+
+ [project.license]
+ file = "LICENSE"
+
+ [project.urls]
+ homepage = "https://github.com/neuralmagic/guidellm"
+
+ [project.optional-dependencies]
+ dev = [ "pre-commit~=3.5.0", "scipy~=1.10", "sphinx~=7.1.2", "tox~=4.16.0", "lorem~=0.1.1", "pytest~=8.2.2", "pytest-asyncio~=0.23.8", "pytest-cov~=5.0.0", "pytest-mock~=3.14.0", "pytest-rerunfailures~=14.0", "requests-mock~=1.12.1", "respx~=0.22.0", "mypy~=1.10.1", "ruff~=0.5.2", "mdformat~=0.7.17", "mdformat-footnote~=0.1.1", "mdformat-frontmatter~=2.0.8", "mdformat-gfm~=0.3.6", "types-click~=7.1.8", "types-PyYAML~=6.0.1", "types-requests~=2.32.0", "types-toml",]
+
+ [tool.setuptools.package-data]
+ "guidellm.data" = [ "*.gz",]
+
+ [tool.ruff.format]
+ quote-style = "double"
+ indent-style = "space"
+
+ [tool.ruff.lint]
+ ignore = [ "PLR0913", "TCH001", "COM812", "ISC001", "TCH002", "PLW1514", "RET505", "RET506", "PD011",]
+ select = [ "E", "W", "A", "C", "COM", "ERA", "I", "ICN", "N", "NPY", "PD", "PT", "PTH", "Q", "TCH", "TID", "RUF022", "C4", "C90", "ISC", "PIE", "R", "SIM", "ARG", "ASYNC", "B", "BLE", "E", "F", "INP", "PGH", "PL", "RSE", "S", "SLF", "T10", "T20", "UP", "W", "YTT", "FIX",]
+
+ [tool.pytest.ini_options]
+ addopts = "-s -vvv --cache-clear"
+ markers = [ "smoke: quick tests to check basic functionality", "sanity: detailed tests to ensure major functions work correctly", "regression: tests to ensure that new changes do not break existing functionality",]
+
+ [project.entry-points.console_scripts]
+ guidellm = "guidellm.__main__:cli"
+
+ [tool.setuptools.packages.find]
+ where = [ "src",]
+ include = [ "*",]
+
+ [tool.ruff.lint.extend-per-file-ignores]
+ "tests/**/*.py" = [ "S101", "ARG", "PLR2004", "TCH002", "SLF001", "S105", "S311", "PT011", "N806", "PGH003", "S106", "PLR0915",]
+
+ [tool.ruff.lint.isort]
+ known-first-party = [ "guidellm", "tests",]
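
The `pyproject.toml` above defines a `dev` extra under `[project.optional-dependencies]` and a `guidellm` console script. For contributors, a standard editable install picks both up; this is generic pip/setuptools usage, not a workflow prescribed by this diff:

```bash
# Generic pip workflow: editable install with the "dev" extra defined above.
git clone https://github.com/neuralmagic/guidellm.git
cd guidellm
pip install -e ".[dev]"
```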