guidellm 0.1.0__tar.gz → 0.2.0rc20250418__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Note: this version of guidellm has been flagged as a potentially problematic release on its registry page.
- {guidellm-0.1.0/src/guidellm.egg-info → guidellm-0.2.0rc20250418}/PKG-INFO +95 -78
- guidellm-0.2.0rc20250418/README.md +201 -0
- guidellm-0.2.0rc20250418/pyproject.toml +74 -0
- guidellm-0.2.0rc20250418/src/guidellm/__init__.py +51 -0
- guidellm-0.2.0rc20250418/src/guidellm/__main__.py +294 -0
- guidellm-0.2.0rc20250418/src/guidellm/backend/__init__.py +23 -0
- guidellm-0.2.0rc20250418/src/guidellm/backend/backend.py +238 -0
- guidellm-0.2.0rc20250418/src/guidellm/backend/openai.py +578 -0
- guidellm-0.2.0rc20250418/src/guidellm/backend/response.py +132 -0
- guidellm-0.2.0rc20250418/src/guidellm/benchmark/__init__.py +73 -0
- guidellm-0.2.0rc20250418/src/guidellm/benchmark/aggregator.py +760 -0
- guidellm-0.2.0rc20250418/src/guidellm/benchmark/benchmark.py +838 -0
- guidellm-0.2.0rc20250418/src/guidellm/benchmark/benchmarker.py +334 -0
- guidellm-0.2.0rc20250418/src/guidellm/benchmark/entrypoints.py +141 -0
- guidellm-0.2.0rc20250418/src/guidellm/benchmark/output.py +946 -0
- guidellm-0.2.0rc20250418/src/guidellm/benchmark/profile.py +409 -0
- guidellm-0.2.0rc20250418/src/guidellm/benchmark/progress.py +720 -0
- {guidellm-0.1.0 → guidellm-0.2.0rc20250418}/src/guidellm/config.py +34 -56
- guidellm-0.2.0rc20250418/src/guidellm/data/__init__.py +4 -0
- guidellm-0.2.0rc20250418/src/guidellm/data/prideandprejudice.txt.gz +0 -0
- guidellm-0.2.0rc20250418/src/guidellm/dataset/__init__.py +22 -0
- guidellm-0.2.0rc20250418/src/guidellm/dataset/creator.py +213 -0
- guidellm-0.2.0rc20250418/src/guidellm/dataset/entrypoints.py +42 -0
- guidellm-0.2.0rc20250418/src/guidellm/dataset/file.py +90 -0
- guidellm-0.2.0rc20250418/src/guidellm/dataset/hf_datasets.py +62 -0
- guidellm-0.2.0rc20250418/src/guidellm/dataset/in_memory.py +132 -0
- guidellm-0.2.0rc20250418/src/guidellm/dataset/synthetic.py +262 -0
- guidellm-0.2.0rc20250418/src/guidellm/objects/__init__.py +18 -0
- guidellm-0.2.0rc20250418/src/guidellm/objects/pydantic.py +60 -0
- guidellm-0.2.0rc20250418/src/guidellm/objects/statistics.py +947 -0
- guidellm-0.2.0rc20250418/src/guidellm/request/__init__.py +15 -0
- guidellm-0.2.0rc20250418/src/guidellm/request/loader.py +281 -0
- guidellm-0.2.0rc20250418/src/guidellm/request/request.py +79 -0
- guidellm-0.2.0rc20250418/src/guidellm/scheduler/__init__.py +52 -0
- guidellm-0.2.0rc20250418/src/guidellm/scheduler/result.py +137 -0
- guidellm-0.2.0rc20250418/src/guidellm/scheduler/scheduler.py +382 -0
- guidellm-0.2.0rc20250418/src/guidellm/scheduler/strategy.py +493 -0
- guidellm-0.2.0rc20250418/src/guidellm/scheduler/types.py +7 -0
- guidellm-0.2.0rc20250418/src/guidellm/scheduler/worker.py +511 -0
- guidellm-0.2.0rc20250418/src/guidellm/utils/__init__.py +27 -0
- guidellm-0.2.0rc20250418/src/guidellm/utils/colors.py +8 -0
- guidellm-0.2.0rc20250418/src/guidellm/utils/hf_transformers.py +35 -0
- guidellm-0.2.0rc20250418/src/guidellm/utils/random.py +43 -0
- guidellm-0.2.0rc20250418/src/guidellm/utils/text.py +216 -0
- {guidellm-0.1.0 → guidellm-0.2.0rc20250418/src/guidellm.egg-info}/PKG-INFO +95 -78
- guidellm-0.2.0rc20250418/src/guidellm.egg-info/SOURCES.txt +52 -0
- guidellm-0.2.0rc20250418/src/guidellm.egg-info/entry_points.txt +2 -0
- {guidellm-0.1.0 → guidellm-0.2.0rc20250418}/src/guidellm.egg-info/requires.txt +5 -1
- guidellm-0.1.0/README.md +0 -189
- guidellm-0.1.0/pyproject.toml +0 -206
- guidellm-0.1.0/src/guidellm/__init__.py +0 -19
- guidellm-0.1.0/src/guidellm/backend/__init__.py +0 -10
- guidellm-0.1.0/src/guidellm/backend/base.py +0 -320
- guidellm-0.1.0/src/guidellm/backend/openai.py +0 -168
- guidellm-0.1.0/src/guidellm/core/__init__.py +0 -24
- guidellm-0.1.0/src/guidellm/core/distribution.py +0 -190
- guidellm-0.1.0/src/guidellm/core/report.py +0 -321
- guidellm-0.1.0/src/guidellm/core/request.py +0 -44
- guidellm-0.1.0/src/guidellm/core/result.py +0 -545
- guidellm-0.1.0/src/guidellm/core/serializable.py +0 -169
- guidellm-0.1.0/src/guidellm/executor/__init__.py +0 -10
- guidellm-0.1.0/src/guidellm/executor/base.py +0 -213
- guidellm-0.1.0/src/guidellm/executor/profile_generator.py +0 -343
- guidellm-0.1.0/src/guidellm/main.py +0 -336
- guidellm-0.1.0/src/guidellm/request/__init__.py +0 -13
- guidellm-0.1.0/src/guidellm/request/base.py +0 -194
- guidellm-0.1.0/src/guidellm/request/emulated.py +0 -391
- guidellm-0.1.0/src/guidellm/request/file.py +0 -76
- guidellm-0.1.0/src/guidellm/request/transformers.py +0 -100
- guidellm-0.1.0/src/guidellm/scheduler/__init__.py +0 -4
- guidellm-0.1.0/src/guidellm/scheduler/base.py +0 -374
- guidellm-0.1.0/src/guidellm/scheduler/load_generator.py +0 -196
- guidellm-0.1.0/src/guidellm/utils/__init__.py +0 -40
- guidellm-0.1.0/src/guidellm/utils/injector.py +0 -70
- guidellm-0.1.0/src/guidellm/utils/progress.py +0 -196
- guidellm-0.1.0/src/guidellm/utils/text.py +0 -455
- guidellm-0.1.0/src/guidellm/utils/transformers.py +0 -151
- guidellm-0.1.0/src/guidellm.egg-info/SOURCES.txt +0 -39
- guidellm-0.1.0/src/guidellm.egg-info/entry_points.txt +0 -3
- {guidellm-0.1.0 → guidellm-0.2.0rc20250418}/LICENSE +0 -0
- {guidellm-0.1.0 → guidellm-0.2.0rc20250418}/MANIFEST.in +0 -0
- {guidellm-0.1.0 → guidellm-0.2.0rc20250418}/setup.cfg +0 -0
- {guidellm-0.1.0 → guidellm-0.2.0rc20250418}/src/guidellm/logger.py +0 -0
- {guidellm-0.1.0 → guidellm-0.2.0rc20250418}/src/guidellm.egg-info/dependency_links.txt +0 -0
- {guidellm-0.1.0 → guidellm-0.2.0rc20250418}/src/guidellm.egg-info/top_level.txt +0 -0
{guidellm-0.1.0/src/guidellm.egg-info → guidellm-0.2.0rc20250418}/PKG-INFO

@@ -1,6 +1,6 @@
-Metadata-Version: 2.
+Metadata-Version: 2.4
 Name: guidellm
-Version: 0.
+Version: 0.2.0rc20250418
 Summary: Guidance platform for deploying and managing large language models.
 Author: Neuralmagic, Inc.
 License: Apache License
@@ -206,15 +206,17 @@ License: Apache License
 limitations under the License.
 
 Project-URL: homepage, https://github.com/neuralmagic/guidellm
-Requires-Python: <4.0,>=3.
+Requires-Python: <4.0,>=3.9.0
 Description-Content-Type: text/markdown
 License-File: LICENSE
 Requires-Dist: click
 Requires-Dist: datasets
 Requires-Dist: ftfy>=6.0.0
+Requires-Dist: httpx[http2]<1.0.0
 Requires-Dist: loguru
 Requires-Dist: numpy
-Requires-Dist:
+Requires-Dist: pillow
+Requires-Dist: protobuf
 Requires-Dist: pydantic>=2.0.0
 Requires-Dist: pydantic-settings>=2.0.0
 Requires-Dist: pyyaml>=6.0.0
@@ -226,12 +228,14 @@ Requires-Dist: pre-commit~=3.5.0; extra == "dev"
 Requires-Dist: scipy~=1.10; extra == "dev"
 Requires-Dist: sphinx~=7.1.2; extra == "dev"
 Requires-Dist: tox~=4.16.0; extra == "dev"
+Requires-Dist: lorem~=0.1.1; extra == "dev"
 Requires-Dist: pytest~=8.2.2; extra == "dev"
 Requires-Dist: pytest-asyncio~=0.23.8; extra == "dev"
 Requires-Dist: pytest-cov~=5.0.0; extra == "dev"
 Requires-Dist: pytest-mock~=3.14.0; extra == "dev"
 Requires-Dist: pytest-rerunfailures~=14.0; extra == "dev"
 Requires-Dist: requests-mock~=1.12.1; extra == "dev"
+Requires-Dist: respx~=0.22.0; extra == "dev"
 Requires-Dist: mypy~=1.10.1; extra == "dev"
 Requires-Dist: ruff~=0.5.2; extra == "dev"
 Requires-Dist: mdformat~=0.7.17; extra == "dev"
@@ -242,11 +246,12 @@ Requires-Dist: types-click~=7.1.8; extra == "dev"
 Requires-Dist: types-PyYAML~=6.0.1; extra == "dev"
 Requires-Dist: types-requests~=2.32.0; extra == "dev"
 Requires-Dist: types-toml; extra == "dev"
+Dynamic: license-file
 
 <p align="center">
 <picture>
-<source media="(prefers-color-scheme: dark)" srcset="https://
-<img alt="GuideLLM Logo" src="https://
+<source media="(prefers-color-scheme: dark)" srcset="https://raw.githubusercontent.com/neuralmagic/guidellm/main/docs/assets/guidellm-logo-light.png">
+<img alt="GuideLLM Logo" src="https://raw.githubusercontent.com/neuralmagic/guidellm/main/docs/assets/guidellm-logo-dark.png" width=55%>
 </picture>
 </p>
@@ -254,25 +259,25 @@ Requires-Dist: types-toml; extra == "dev"
 Scale Efficiently: Evaluate and Optimize Your LLM Deployments for Real-World Inference
 </h3>
 
-[](https://github.com/neuralmagic/guidellm/releases) [](https://github.com/neuralmagic/guidellm/tree/main/docs) [](https://github.com/neuralmagic/guidellm/blob/main/LICENSE) [](https://pypi.python.org/pypi/guidellm)
+[](https://github.com/neuralmagic/guidellm/releases) [](https://github.com/neuralmagic/guidellm/tree/main/docs) [](https://github.com/neuralmagic/guidellm/blob/main/LICENSE) [](https://pypi.python.org/pypi/guidellm) [](https://pypi.python.org/pypi/guidellm) [](https://github.com/neuralmagic/guidellm/actions/workflows/nightly.yml)
 
 ## Overview
 
 <p>
 <picture>
-<source media="(prefers-color-scheme: dark)" srcset="https://
-<img alt="GuideLLM User Flows" src="https://
+<source media="(prefers-color-scheme: dark)" srcset="https://raw.githubusercontent.com/neuralmagic/guidellm/main/docs/assets/guidellm-user-flows-dark.png">
+<img alt="GuideLLM User Flows" src="https://raw.githubusercontent.com/neuralmagic/guidellm/main/docs/assets/guidellm-user-flows-light.png">
 </picture>
 </p>
 
-**GuideLLM** is a
+**GuideLLM** is a platform for evaluating and optimizing the deployment of large language models (LLMs). By simulating real-world inference workloads, GuideLLM enables users to assess the performance, resource requirements, and cost implications of deploying LLMs on various hardware configurations. This approach ensures efficient, scalable, and cost-effective LLM inference serving while maintaining high service quality.
 
 ### Key Features
 
 - **Performance Evaluation:** Analyze LLM inference under different load scenarios to ensure your system meets your service level objectives (SLOs).
 - **Resource Optimization:** Determine the most suitable hardware configurations for running your models effectively.
 - **Cost Estimation:** Understand the financial impact of different deployment strategies and make informed decisions to minimize costs.
-- **Scalability Testing:** Simulate scaling to handle large numbers of concurrent users without degradation
+- **Scalability Testing:** Simulate scaling to handle large numbers of concurrent users without performance degradation.
 
 ## Getting Started
 
@@ -281,21 +286,27 @@ Scale Efficiently: Evaluate and Optimize Your LLM Deployments for Real-World Inf
 Before installing, ensure you have the following prerequisites:
 
 - OS: Linux or MacOS
-- Python: 3.
+- Python: 3.9 – 3.13
 
-
+The latest GuideLLM release can be installed using pip:
 
 ```bash
 pip install guidellm
 ```
 
+Or from source code using pip:
+
+```bash
+pip install git+https://github.com/neuralmagic/guidellm.git
+```
+
 For detailed installation instructions and requirements, see the [Installation Guide](https://github.com/neuralmagic/guidellm/tree/main/docs/install.md).
 
 ### Quick Start
 
 #### 1. Start an OpenAI Compatible Server (vLLM)
 
-GuideLLM requires an OpenAI-compatible server to run evaluations. [vLLM](https://github.com/vllm-project/vllm) is recommended for this purpose.
+GuideLLM requires an OpenAI-compatible server to run evaluations. [vLLM](https://github.com/vllm-project/vllm) is recommended for this purpose. After installing vLLM on your desired server (`pip install vllm`), start a vLLM server with a Llama 3.1 8B quantized model by running the following command:
 
 ```bash
 vllm serve "neuralmagic/Meta-Llama-3.1-8B-Instruct-quantized.w4a16"
@@ -303,107 +314,114 @@ vllm serve "neuralmagic/Meta-Llama-3.1-8B-Instruct-quantized.w4a16"
 
 For more information on starting a vLLM server, see the [vLLM Documentation](https://docs.vllm.ai/en/latest/serving/openai_compatible_server.html).
 
-
+For information on starting other supported inference servers or platforms, see the [Supported Backends documentation](https://github.com/neuralmagic/guidellm/tree/main/docs/backends.md).
+
+#### 2. Run a GuideLLM Benchmark
 
-To run a GuideLLM
+To run a GuideLLM benchmark, use the `guidellm benchmark` command with the target set to an OpenAI-compatible server. For this example, the target is set to 'http://localhost:8000', assuming that vLLM is active and running on the same server. Otherwise, update it to the appropriate location. By default, GuideLLM automatically determines the model available on the server and uses it. To target a different model, pass the desired name with the `--model` argument. Additionally, the `--rate-type` is set to `sweep`, which automatically runs a range of benchmarks to determine the minimum and maximum rates that the server and model can support. Each benchmark run under the sweep will run for 30 seconds, as set by the `--max-seconds` argument. Finally, `--data` is set to a synthetic dataset with 256 prompt tokens and 128 output tokens per request. For more arguments, supported scenarios, and configurations, jump to the [Configurations Section](#configurations) or run `guidellm benchmark --help`.
+
+Now, to start benchmarking, run the following command:
 
 ```bash
-guidellm \
-  --target "http://localhost:8000
-  --
-  --
-  --data "prompt_tokens=
+guidellm benchmark \
+  --target "http://localhost:8000" \
+  --rate-type sweep \
+  --max-seconds 30 \
+  --data "prompt_tokens=256,output_tokens=128"
 ```
 
-The above command will begin the evaluation and
+The above command will begin the evaluation and provide progress updates similar to the following: <img src= "https://raw.githubusercontent.com/neuralmagic/guidellm/main/docs/assets/sample-benchmarks.gif"/>
 
-
+#### 3. Analyze the Results
 
-
-- The `--model` flag specifies the model to evaluate. The model name should match the name of the model deployed on the server
-- By default, GuideLLM will run a `sweep` of performance evaluations across different request rates, each lasting 120 seconds and the results are printed out to the terminal.
+After the evaluation is completed, GuideLLM will summarize the results into three sections:
 
-
+1. Benchmarks Metadata: A summary of the benchmark run and the arguments used to create it, including the server, data, profile, and more.
+2. Benchmarks Info: A high-level view of each benchmark and the requests that were run, including the type, duration, request statuses, and number of tokens.
+3. Benchmarks Stats: A summary of the statistics for each benchmark run, including the request rate, concurrency, latency, and token-level metrics such as TTFT, ITL, and more.
 
-
+The sections will look similar to the following: <img alt="Sample GuideLLM benchmark output" src="https://raw.githubusercontent.com/neuralmagic/guidellm/main/docs/assets/sample-output.png" />
 
-
+For more details about the metrics and definitions, please refer to the [Metrics documentation](https://raw.githubusercontent.com/neuralmagic/guidellm/main/docs/metrics.md).
 
-
+#### 4. Explore the Results File
 
-
+By default, the full results, including complete statistics and request data, are saved to a file `benchmarks.json` in the current working directory. This file can be used for further analysis or reporting, and additionally can be reloaded into Python for further analysis using the `guidellm.benchmark.GenerativeBenchmarksReport` class. You can specify a different file name and extension with the `--output` argument.
 
-
+For more details about the supported output file types, please take a look at the [Outputs documentation](raw.githubusercontent.com/neuralmagic/guidellm/main/docs/outputs.md).
 
-####
+#### 5. Use the Results
 
 The results from GuideLLM are used to optimize your LLM deployment for performance, resource efficiency, and cost. By analyzing the performance metrics, you can identify bottlenecks, determine the optimal request rate, and select the most cost-effective hardware configuration for your deployment.
 
-For example,
+For example, when deploying a chat application, we likely want to ensure that our time to first token (TTFT) and inter-token latency (ITL) are under certain thresholds to meet our service level objectives (SLOs) or service level agreements (SLAs). For example, setting TTFT to 200ms and ITL 25ms for the sample data provided in the example above, we can see that even though the server is capable of handling up to 13 requests per second, we would only be able to meet our SLOs for 99% of users at a request rate of 3.5 requests per second. If we relax our constraints on ITL to 50 ms, then we can meet the TTFT SLA for 99% of users at a request rate of approximately 10 requests per second.
 
-
+For further details on determining the optimal request rate and SLOs, refer to the [SLOs documentation](https://raw.githubusercontent.com/neuralmagic/guidellm/main/docs/service_level_objectives.md).
 
 ### Configurations
 
-GuideLLM
+GuideLLM offers a range of configurations through both the benchmark CLI command and environment variables, which provide default values and more granular controls. The most common configurations are listed below. A complete list is easily accessible, though, by running `guidellm benchmark --help` or `guidellm config` respectively.
 
-
+#### Benchmark CLI
 
-
-- `--rate-type sweep`: (default) Sweep runs through the full range of the server's performance, starting with a `synchronous` rate, then `throughput`, and finally, 10 `constant` rates between the min and max request rate found.
-- `--rate-type synchronous`: Synchronous runs requests synchronously, one after the other.
-- `--rate-type throughput`: Throughput runs requests in a throughput manner, sending requests as fast as possible.
-- `--rate-type constant`: Constant runs requests at a constant rate. Specify the request rate per second with the `--rate` argument. For example, `--rate 10` or multiple rates with `--rate 10 --rate 20 --rate 30`.
-- `--rate-type poisson`: Poisson draws from a Poisson distribution with the mean at the specified rate, adding some real-world variance to the runs. Specify the request rate per second with the `--rate` argument. For example, `--rate 10` or multiple rates with `--rate 10 --rate 20 --rate 30`.
-- `--data-type`: The data to use for the benchmark. Options include `emulated`, `transformers`, and `file`.
-- `--data-type emulated`: Emulated supports an EmulationConfig in string or file format for the `--data` argument to generate fake data. Specify the number of prompt tokens at a minimum and optionally the number of output tokens and other parameters for variance in the length. For example, `--data "prompt_tokens=128"`, `--data "prompt_tokens=128,generated_tokens=128" `, or `--data "prompt_tokens=128,prompt_tokens_variance=10" `.
-- `--data-type file`: File supports a file path or URL to a file for the `--data` argument. The file should contain data encoded as a CSV, JSONL, TXT, or JSON/YAML file with a single prompt per line for CSV, JSONL, and TXT or a list of prompts for JSON/YAML. For example, `--data "data.txt"` where data.txt contents are `"prompt1\nprompt2\nprompt3"`.
-- `--data-type transformers`: Transformers supports a dataset name or file path for the `--data` argument. For example, `--data "neuralmagic/LLM_compression_calibration"`.
-- `--max-seconds`: The maximum number of seconds to run each benchmark. The default is 120 seconds.
-- `--max-requests`: The maximum number of requests to run in each benchmark.
+The `guidellm benchmark` command is used to run benchmarks against a generative AI backend/server. The command accepts a variety of arguments to customize the benchmark run. The most common arguments include:
 
-
+- `--target`: Specifies the target path for the backend to run benchmarks against. For example, `http://localhost:8000`. This is required to define the server endpoint.
 
-
-guidellm --help
-```
+- `--model`: Allows selecting a specific model from the server. If not provided, it defaults to the first model available on the server. Useful when multiple models are hosted on the same server.
 
-
+- `--processor`: Used only for synthetic data creation or when the token source configuration is set to local for calculating token metrics locally. It must match the model's processor or tokenizer to ensure compatibility and correctness. This supports either a HuggingFace model ID or a local path to a processor or tokenizer.
 
-
-
-
+- `--data`: Specifies the dataset to use. This can be a HuggingFace dataset ID, a local path to a dataset, or standard text files such as CSV, JSONL, and more. Additionally, synthetic data configurations can be provided using JSON or key-value strings. Synthetic data options include:
+
+  - `prompt_tokens`: Average number of tokens for prompts.
+  - `output_tokens`: Average number of tokens for outputs.
+  - `TYPE_stdev`, `TYPE_min`, `TYPE_max`: Standard deviation, minimum, and maximum values for the specified type (e.g., `prompt_tokens`, `output_tokens`). If not provided, will use the provided tokens value only.
+  - `samples`: Number of samples to generate, defaults to 1000.
+  - `source`: Source text data for generation, defaults to a local copy of Pride and Prejudice.
+
+- `--data-args`: A JSON string used to specify the columns to source data from (e.g., `prompt_column`, `output_tokens_count_column`) and additional arguments to pass into the HuggingFace datasets constructor.
+
+- `--data-sampler`: Enables applying `random` shuffling or sampling to the dataset. If not set, no sampling is used.
+
+- `--rate-type`: Defines the type of benchmark to run (default sweep). Supported types include:
+
+  - `synchronous`: Runs a single stream of requests one at a time. `--rate` must not be set for this mode.
+  - `throughput`: Runs all requests in parallel to measure the maximum throughput for the server (bounded by GUIDELLM\_\_MAX_CONCURRENCY config argument). `--rate` must not be set for this mode.
+  - `concurrent`: Runs a fixed number of streams of requests in parallel. `--rate` must be set to the desired concurrency level/number of streams.
+  - `constant`: Sends requests asynchronously at a constant rate set by `--rate`.
+  - `poisson`: Sends requests at a rate following a Poisson distribution with the mean set by `--rate`.
+  - `sweep`: Automatically determines the minimum and maximum rates the server can support by running synchronous and throughput benchmarks, and then runs a series of benchmarks equally spaced between the two rates. The number of benchmarks is set by `--rate` (default is 10).
+
+- `--max-seconds`: Sets the maximum duration (in seconds) for each benchmark run. If not specified, the benchmark will run until the dataset is exhausted or the `--max-requests` limit is reached.
 
-
+- `--max-requests`: Sets the maximum number of requests for each benchmark run. If not provided, the benchmark will run until `--max-seconds` is reached or the dataset is exhausted.
+
+- `--warmup-percent`: Specifies the percentage of the benchmark to treat as a warmup phase. Requests during this phase are excluded from the final results.
+
+- `--cooldown-percent`: Specifies the percentage of the benchmark to treat as a cooldown phase. Requests during this phase are excluded from the final results.
+
+- `--output-path`: Defines the path to save the benchmark results. Supports JSON, YAML, or CSV formats. If a directory is provided, the results will be saved as `benchmarks.json` in that directory. If not set, the results will be saved in the current working directory.
 
 ## Resources
 
 ### Documentation
 
-Our comprehensive documentation
+Our comprehensive documentation offers detailed guides and resources to help you maximize the benefits of GuideLLM. Whether just getting started or looking to dive deeper into advanced topics, you can find what you need in our [documentation](https://github.com/neuralmagic/guidellm/tree/main/docs).
 
 ### Core Docs
 
 - [**Installation Guide**](https://github.com/neuralmagic/guidellm/tree/main/docs/install.md) - This guide provides step-by-step instructions for installing GuideLLM, including prerequisites and setup tips.
+- [**Backends Guide**](https://github.com/neuralmagic/guidellm/tree/main/docs/backends.md) - A comprehensive overview of supported backends and how to set them up for use with GuideLLM.
+- [**Metrics Guide**](https://github.com/neuralmagic/guidellm/tree/main/docs/metrics.md) - Detailed explanations of the metrics used in GuideLLM, including definitions and how to interpret them.
+- [**Outputs Guide**](https://github.com/neuralmagic/guidellm/tree/main/docs/outputs.md) - Information on the different output formats supported by GuideLLM and how to use them.
 - [**Architecture Overview**](https://github.com/neuralmagic/guidellm/tree/main/docs/architecture.md) - A detailed look at GuideLLM's design, components, and how they interact.
-- [**CLI Guide**](https://github.com/neuralmagic/guidellm/tree/main/docs/guides/cli.md) - Comprehensive usage information for running GuideLLM via the command line, including available commands and options.
-- [**Configuration Guide**](https://github.com/neuralmagic/guidellm/tree/main/docs/guides/configuration.md) - Instructions on configuring GuideLLM to suit various deployment needs and performance goals.
 
 ### Supporting External Documentation
 
 - [**vLLM Documentation**](https://vllm.readthedocs.io/en/latest/) - Official vLLM documentation provides insights into installation, usage, and supported models.
 
-###
-
-Visit our [GitHub Releases page](https://github.com/neuralmagic/guidellm/releases) and review the release notes to stay updated with the latest releases.
-
-### License
-
-GuideLLM is licensed under the [Apache License 2.0](https://github.com/neuralmagic/guidellm/blob/main/LICENSE).
-
-## Community
-
-### Contribute
+### Contribution Docs
 
 We appreciate contributions to the code, examples, integrations, documentation, bug reports, and feature requests! Your feedback and involvement are crucial in helping GuideLLM grow and improve. Below are some ways you can get involved:
 
@@ -411,14 +429,13 @@ We appreciate contributions to the code, examples, integrations, documentation,
 - [**CONTRIBUTING.md**](https://github.com/neuralmagic/guidellm/blob/main/CONTRIBUTING.md) - Guidelines for contributing to the project, including code standards, pull request processes, and more.
 - [**CODE_OF_CONDUCT.md**](https://github.com/neuralmagic/guidellm/blob/main/CODE_OF_CONDUCT.md) - Our expectations for community behavior to ensure a welcoming and inclusive environment.
 
-###
+### Releases
+
+Visit our [GitHub Releases page](https://github.com/neuralmagic/guidellm/releases) and review the release notes to stay updated with the latest releases.
 
-
+### License
 
-
-- [**GitHub Issues**](https://github.com/neuralmagic/guidellm/issues) - Report bugs, request features, or browse existing issues. Your feedback helps us improve GuideLLM.
-- [**Subscribe to Updates**](https://neuralmagic.com/subscribe/) - Sign up for the latest news, announcements, and updates about GuideLLM, webinars, events, and more.
-- [**Contact Us**](http://neuralmagic.com/contact/) - Use our contact form for general questions about Neural Magic or GuideLLM.
+GuideLLM is licensed under the [Apache License 2.0](https://github.com/neuralmagic/guidellm/blob/main/LICENSE).
 
 ### Cite
 
guidellm-0.2.0rc20250418/README.md

@@ -0,0 +1,201 @@
+<p align="center">
+<picture>
+<source media="(prefers-color-scheme: dark)" srcset="https://raw.githubusercontent.com/neuralmagic/guidellm/main/docs/assets/guidellm-logo-light.png">
+<img alt="GuideLLM Logo" src="https://raw.githubusercontent.com/neuralmagic/guidellm/main/docs/assets/guidellm-logo-dark.png" width=55%>
+</picture>
+</p>
+
+<h3 align="center">
+Scale Efficiently: Evaluate and Optimize Your LLM Deployments for Real-World Inference
+</h3>
+
+[](https://github.com/neuralmagic/guidellm/releases) [](https://github.com/neuralmagic/guidellm/tree/main/docs) [](https://github.com/neuralmagic/guidellm/blob/main/LICENSE) [](https://pypi.python.org/pypi/guidellm) [](https://pypi.python.org/pypi/guidellm) [](https://github.com/neuralmagic/guidellm/actions/workflows/nightly.yml)
+
+## Overview
+
+<p>
+<picture>
+<source media="(prefers-color-scheme: dark)" srcset="https://raw.githubusercontent.com/neuralmagic/guidellm/main/docs/assets/guidellm-user-flows-dark.png">
+<img alt="GuideLLM User Flows" src="https://raw.githubusercontent.com/neuralmagic/guidellm/main/docs/assets/guidellm-user-flows-light.png">
+</picture>
+</p>
+
+**GuideLLM** is a platform for evaluating and optimizing the deployment of large language models (LLMs). By simulating real-world inference workloads, GuideLLM enables users to assess the performance, resource requirements, and cost implications of deploying LLMs on various hardware configurations. This approach ensures efficient, scalable, and cost-effective LLM inference serving while maintaining high service quality.
+
+### Key Features
+
+- **Performance Evaluation:** Analyze LLM inference under different load scenarios to ensure your system meets your service level objectives (SLOs).
+- **Resource Optimization:** Determine the most suitable hardware configurations for running your models effectively.
+- **Cost Estimation:** Understand the financial impact of different deployment strategies and make informed decisions to minimize costs.
+- **Scalability Testing:** Simulate scaling to handle large numbers of concurrent users without performance degradation.
+
+## Getting Started
+
+### Installation
+
+Before installing, ensure you have the following prerequisites:
+
+- OS: Linux or MacOS
+- Python: 3.9 – 3.13
+
+The latest GuideLLM release can be installed using pip:
+
+```bash
+pip install guidellm
+```
+
+Or from source code using pip:
+
+```bash
+pip install git+https://github.com/neuralmagic/guidellm.git
+```
+
+For detailed installation instructions and requirements, see the [Installation Guide](https://github.com/neuralmagic/guidellm/tree/main/docs/install.md).
+
+### Quick Start
+
+#### 1. Start an OpenAI Compatible Server (vLLM)
+
+GuideLLM requires an OpenAI-compatible server to run evaluations. [vLLM](https://github.com/vllm-project/vllm) is recommended for this purpose. After installing vLLM on your desired server (`pip install vllm`), start a vLLM server with a Llama 3.1 8B quantized model by running the following command:
+
+```bash
+vllm serve "neuralmagic/Meta-Llama-3.1-8B-Instruct-quantized.w4a16"
+```
+
+For more information on starting a vLLM server, see the [vLLM Documentation](https://docs.vllm.ai/en/latest/serving/openai_compatible_server.html).
+
+For information on starting other supported inference servers or platforms, see the [Supported Backends documentation](https://github.com/neuralmagic/guidellm/tree/main/docs/backends.md).
+
+#### 2. Run a GuideLLM Benchmark
+
+To run a GuideLLM benchmark, use the `guidellm benchmark` command with the target set to an OpenAI-compatible server. For this example, the target is set to 'http://localhost:8000', assuming that vLLM is active and running on the same server. Otherwise, update it to the appropriate location. By default, GuideLLM automatically determines the model available on the server and uses it. To target a different model, pass the desired name with the `--model` argument. Additionally, the `--rate-type` is set to `sweep`, which automatically runs a range of benchmarks to determine the minimum and maximum rates that the server and model can support. Each benchmark run under the sweep will run for 30 seconds, as set by the `--max-seconds` argument. Finally, `--data` is set to a synthetic dataset with 256 prompt tokens and 128 output tokens per request. For more arguments, supported scenarios, and configurations, jump to the [Configurations Section](#configurations) or run `guidellm benchmark --help`.
+
+Now, to start benchmarking, run the following command:
+
+```bash
+guidellm benchmark \
+  --target "http://localhost:8000" \
+  --rate-type sweep \
+  --max-seconds 30 \
+  --data "prompt_tokens=256,output_tokens=128"
+```
+
+The above command will begin the evaluation and provide progress updates similar to the following: <img src= "https://raw.githubusercontent.com/neuralmagic/guidellm/main/docs/assets/sample-benchmarks.gif"/>
+
+#### 3. Analyze the Results
+
+After the evaluation is completed, GuideLLM will summarize the results into three sections:
+
+1. Benchmarks Metadata: A summary of the benchmark run and the arguments used to create it, including the server, data, profile, and more.
+2. Benchmarks Info: A high-level view of each benchmark and the requests that were run, including the type, duration, request statuses, and number of tokens.
+3. Benchmarks Stats: A summary of the statistics for each benchmark run, including the request rate, concurrency, latency, and token-level metrics such as TTFT, ITL, and more.
+
+The sections will look similar to the following: <img alt="Sample GuideLLM benchmark output" src="https://raw.githubusercontent.com/neuralmagic/guidellm/main/docs/assets/sample-output.png" />
+
+For more details about the metrics and definitions, please refer to the [Metrics documentation](https://raw.githubusercontent.com/neuralmagic/guidellm/main/docs/metrics.md).
+
+#### 4. Explore the Results File
+
+By default, the full results, including complete statistics and request data, are saved to a file `benchmarks.json` in the current working directory. This file can be used for further analysis or reporting, and additionally can be reloaded into Python for further analysis using the `guidellm.benchmark.GenerativeBenchmarksReport` class. You can specify a different file name and extension with the `--output` argument.
+
+For more details about the supported output file types, please take a look at the [Outputs documentation](raw.githubusercontent.com/neuralmagic/guidellm/main/docs/outputs.md).
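
Since the saved `benchmarks.json` is plain JSON, it can also be inspected without GuideLLM installed. Below is a minimal sketch using only the standard library; the top-level structure is an assumption here, since the exact schema comes from GuideLLM's pydantic models and may change between versions, so the `GenerativeBenchmarksReport` class mentioned above remains the supported loading path.

```python
# Minimal sketch: peek at the saved results file with the standard library.
# The shape of the data is illustrative; the authoritative loader is
# guidellm.benchmark.GenerativeBenchmarksReport.
import json
from pathlib import Path

raw = json.loads(Path("benchmarks.json").read_text())
# Print the top-level keys (for a dict) or the entry count (for a list).
print(sorted(raw) if isinstance(raw, dict) else len(raw))
```
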
+
+#### 5. Use the Results
+
+The results from GuideLLM are used to optimize your LLM deployment for performance, resource efficiency, and cost. By analyzing the performance metrics, you can identify bottlenecks, determine the optimal request rate, and select the most cost-effective hardware configuration for your deployment.
+
+For example, when deploying a chat application, we likely want to ensure that our time to first token (TTFT) and inter-token latency (ITL) are under certain thresholds to meet our service level objectives (SLOs) or service level agreements (SLAs). For example, setting TTFT to 200ms and ITL 25ms for the sample data provided in the example above, we can see that even though the server is capable of handling up to 13 requests per second, we would only be able to meet our SLOs for 99% of users at a request rate of 3.5 requests per second. If we relax our constraints on ITL to 50 ms, then we can meet the TTFT SLA for 99% of users at a request rate of approximately 10 requests per second.
+
+For further details on determining the optimal request rate and SLOs, refer to the [SLOs documentation](https://raw.githubusercontent.com/neuralmagic/guidellm/main/docs/service_level_objectives.md).
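
The SLO check described above reduces to a simple filter over per-rate percentile metrics. A minimal sketch follows; the measurements dict holds hypothetical p99 numbers shaped like the example in the text, not real benchmark output.

```python
# Minimal sketch of the SLO reasoning above: find the highest request rate
# whose p99 TTFT and ITL both meet the thresholds from the text.
def max_rate_meeting_slos(measurements, ttft_slo_ms=200.0, itl_slo_ms=25.0):
    """Return the highest rate whose p99 (TTFT, ITL) meet the SLOs, or None."""
    passing = [
        rate
        for rate, (p99_ttft, p99_itl) in measurements.items()
        if p99_ttft <= ttft_slo_ms and p99_itl <= itl_slo_ms
    ]
    return max(passing, default=None)

# Hypothetical p99 (TTFT ms, ITL ms) per request rate, for illustration only:
sweep_p99 = {1.0: (90, 12), 3.5: (160, 24), 7.0: (210, 38), 13.0: (480, 70)}
print(max_rate_meeting_slos(sweep_p99))  # -> 3.5
```
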
+
+### Configurations
+
+GuideLLM offers a range of configurations through both the benchmark CLI command and environment variables, which provide default values and more granular controls. The most common configurations are listed below. A complete list is easily accessible, though, by running `guidellm benchmark --help` or `guidellm config` respectively.
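
As a sketch of the environment-variable side: GuideLLM depends on `pydantic-settings`, and `GUIDELLM__MAX_CONCURRENCY` (named later in this README) suggests a `GUIDELLM__`-prefixed naming scheme. The exact set of variables should be confirmed with `guidellm config`; the value below is illustrative only.

```python
# Minimal sketch: override one GuideLLM setting via the environment and dump
# the effective configuration. GUIDELLM__MAX_CONCURRENCY is the only variable
# named in this README; 256 is an arbitrary illustrative value.
import os
import subprocess

env = dict(os.environ, GUIDELLM__MAX_CONCURRENCY="256")
subprocess.run(["guidellm", "config"], env=env, check=True)
```
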
+
+#### Benchmark CLI
+
+The `guidellm benchmark` command is used to run benchmarks against a generative AI backend/server. The command accepts a variety of arguments to customize the benchmark run. The most common arguments include:
+
+- `--target`: Specifies the target path for the backend to run benchmarks against. For example, `http://localhost:8000`. This is required to define the server endpoint.
+
+- `--model`: Allows selecting a specific model from the server. If not provided, it defaults to the first model available on the server. Useful when multiple models are hosted on the same server.
+
+- `--processor`: Used only for synthetic data creation or when the token source configuration is set to local for calculating token metrics locally. It must match the model's processor or tokenizer to ensure compatibility and correctness. This supports either a HuggingFace model ID or a local path to a processor or tokenizer.
+
+- `--data`: Specifies the dataset to use. This can be a HuggingFace dataset ID, a local path to a dataset, or standard text files such as CSV, JSONL, and more. Additionally, synthetic data configurations can be provided using JSON or key-value strings. Synthetic data options include:
+
+  - `prompt_tokens`: Average number of tokens for prompts.
+  - `output_tokens`: Average number of tokens for outputs.
+  - `TYPE_stdev`, `TYPE_min`, `TYPE_max`: Standard deviation, minimum, and maximum values for the specified type (e.g., `prompt_tokens`, `output_tokens`). If not provided, will use the provided tokens value only.
+  - `samples`: Number of samples to generate, defaults to 1000.
+  - `source`: Source text data for generation, defaults to a local copy of Pride and Prejudice.
+
+- `--data-args`: A JSON string used to specify the columns to source data from (e.g., `prompt_column`, `output_tokens_count_column`) and additional arguments to pass into the HuggingFace datasets constructor.
+
+- `--data-sampler`: Enables applying `random` shuffling or sampling to the dataset. If not set, no sampling is used.
+
+- `--rate-type`: Defines the type of benchmark to run (default sweep). Supported types include:
+
+  - `synchronous`: Runs a single stream of requests one at a time. `--rate` must not be set for this mode.
+  - `throughput`: Runs all requests in parallel to measure the maximum throughput for the server (bounded by GUIDELLM\_\_MAX_CONCURRENCY config argument). `--rate` must not be set for this mode.
+  - `concurrent`: Runs a fixed number of streams of requests in parallel. `--rate` must be set to the desired concurrency level/number of streams.
+  - `constant`: Sends requests asynchronously at a constant rate set by `--rate`.
+  - `poisson`: Sends requests at a rate following a Poisson distribution with the mean set by `--rate`.
+  - `sweep`: Automatically determines the minimum and maximum rates the server can support by running synchronous and throughput benchmarks, and then runs a series of benchmarks equally spaced between the two rates. The number of benchmarks is set by `--rate` (default is 10).
+
+- `--max-seconds`: Sets the maximum duration (in seconds) for each benchmark run. If not specified, the benchmark will run until the dataset is exhausted or the `--max-requests` limit is reached.
+
+- `--max-requests`: Sets the maximum number of requests for each benchmark run. If not provided, the benchmark will run until `--max-seconds` is reached or the dataset is exhausted.
+
+- `--warmup-percent`: Specifies the percentage of the benchmark to treat as a warmup phase. Requests during this phase are excluded from the final results.
+
+- `--cooldown-percent`: Specifies the percentage of the benchmark to treat as a cooldown phase. Requests during this phase are excluded from the final results.
+
+- `--output-path`: Defines the path to save the benchmark results. Supports JSON, YAML, or CSV formats. If a directory is provided, the results will be saved as `benchmarks.json` in that directory. If not set, the results will be saved in the current working directory.
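
To make the `constant` and `poisson` schedules in the list above concrete, here is a minimal sketch (not GuideLLM's actual scheduler) of how the two spacing strategies generate request arrival times for a given `--rate`: constant spacing places requests exactly 1/rate seconds apart, while Poisson scheduling draws exponential inter-arrival times whose mean is 1/rate.

```python
# Minimal sketch of the two request-spacing strategies described above.
import random

def arrival_times(rate_per_s: float, n: int, poisson: bool) -> list[float]:
    """Generate n request arrival timestamps for the given rate."""
    t, times = 0.0, []
    for _ in range(n):
        # Exponential inter-arrival gaps yield a Poisson arrival process;
        # otherwise use a fixed gap of 1/rate seconds.
        t += random.expovariate(rate_per_s) if poisson else 1.0 / rate_per_s
        times.append(t)
    return times

print(arrival_times(10.0, 5, poisson=False))  # evenly spaced 0.1 s apart
print(arrival_times(10.0, 5, poisson=True))   # irregular, mean spacing 0.1 s
```
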
+
+## Resources
+
+### Documentation
+
+Our comprehensive documentation offers detailed guides and resources to help you maximize the benefits of GuideLLM. Whether just getting started or looking to dive deeper into advanced topics, you can find what you need in our [documentation](https://github.com/neuralmagic/guidellm/tree/main/docs).
+
+### Core Docs
+
+- [**Installation Guide**](https://github.com/neuralmagic/guidellm/tree/main/docs/install.md) - This guide provides step-by-step instructions for installing GuideLLM, including prerequisites and setup tips.
+- [**Backends Guide**](https://github.com/neuralmagic/guidellm/tree/main/docs/backends.md) - A comprehensive overview of supported backends and how to set them up for use with GuideLLM.
+- [**Metrics Guide**](https://github.com/neuralmagic/guidellm/tree/main/docs/metrics.md) - Detailed explanations of the metrics used in GuideLLM, including definitions and how to interpret them.
+- [**Outputs Guide**](https://github.com/neuralmagic/guidellm/tree/main/docs/outputs.md) - Information on the different output formats supported by GuideLLM and how to use them.
+- [**Architecture Overview**](https://github.com/neuralmagic/guidellm/tree/main/docs/architecture.md) - A detailed look at GuideLLM's design, components, and how they interact.
+
+### Supporting External Documentation
+
+- [**vLLM Documentation**](https://vllm.readthedocs.io/en/latest/) - Official vLLM documentation provides insights into installation, usage, and supported models.
+
+### Contribution Docs
+
+We appreciate contributions to the code, examples, integrations, documentation, bug reports, and feature requests! Your feedback and involvement are crucial in helping GuideLLM grow and improve. Below are some ways you can get involved:
+
+- [**DEVELOPING.md**](https://github.com/neuralmagic/guidellm/blob/main/DEVELOPING.md) - Development guide for setting up your environment and making contributions.
+- [**CONTRIBUTING.md**](https://github.com/neuralmagic/guidellm/blob/main/CONTRIBUTING.md) - Guidelines for contributing to the project, including code standards, pull request processes, and more.
+- [**CODE_OF_CONDUCT.md**](https://github.com/neuralmagic/guidellm/blob/main/CODE_OF_CONDUCT.md) - Our expectations for community behavior to ensure a welcoming and inclusive environment.
+
+### Releases
+
+Visit our [GitHub Releases page](https://github.com/neuralmagic/guidellm/releases) and review the release notes to stay updated with the latest releases.
+
+### License
+
+GuideLLM is licensed under the [Apache License 2.0](https://github.com/neuralmagic/guidellm/blob/main/LICENSE).
+
+### Cite
+
+If you find GuideLLM helpful in your research or projects, please consider citing it:
+
+```bibtex
+@misc{guidellm2024,
+  title={GuideLLM: Scalable Inference and Optimization for Large Language Models},
+  author={Neural Magic, Inc.},
+  year={2024},
+  howpublished={\url{https://github.com/neuralmagic/guidellm}},
+}
+```
guidellm-0.2.0rc20250418/pyproject.toml

@@ -0,0 +1,74 @@
+[build-system]
+requires = [ "setuptools >= 61.0", "wheel", "build",]
+build-backend = "setuptools.build_meta"
+
+[project]
+name = "guidellm"
+version = "0.2.0.rc20250418"
+description = "Guidance platform for deploying and managing large language models."
+requires-python = ">=3.9.0,<4.0"
+dependencies = [ "click", "datasets", "ftfy>=6.0.0", "httpx[http2]<1.0.0", "loguru", "numpy", "pillow", "protobuf", "pydantic>=2.0.0", "pydantic-settings>=2.0.0", "pyyaml>=6.0.0", "requests", "rich", "transformers",]
+[[project.authors]]
+name = "Neuralmagic, Inc."
+
+[tool.isort]
+profile = "black"
+
+[tool.mypy]
+files = [ "src/guidellm", "tests",]
+python_version = "3.9"
+warn_redundant_casts = true
+warn_unused_ignores = false
+show_error_codes = true
+namespace_packages = true
+exclude = [ "venv", ".tox",]
+follow_imports = "silent"
+[[tool.mypy.overrides]]
+module = [ "datasets.*",]
+ignore_missing_imports = true
+
+[tool.ruff]
+line-length = 88
+indent-width = 4
+exclude = [ "build", "dist", "env", ".venv",]
+
+[project.readme]
+file = "README.md"
+content-type = "text/markdown"
+
+[project.license]
+file = "LICENSE"
+
+[project.urls]
+homepage = "https://github.com/neuralmagic/guidellm"
+
+[project.optional-dependencies]
+dev = [ "pre-commit~=3.5.0", "scipy~=1.10", "sphinx~=7.1.2", "tox~=4.16.0", "lorem~=0.1.1", "pytest~=8.2.2", "pytest-asyncio~=0.23.8", "pytest-cov~=5.0.0", "pytest-mock~=3.14.0", "pytest-rerunfailures~=14.0", "requests-mock~=1.12.1", "respx~=0.22.0", "mypy~=1.10.1", "ruff~=0.5.2", "mdformat~=0.7.17", "mdformat-footnote~=0.1.1", "mdformat-frontmatter~=2.0.8", "mdformat-gfm~=0.3.6", "types-click~=7.1.8", "types-PyYAML~=6.0.1", "types-requests~=2.32.0", "types-toml",]
+
+[tool.setuptools.package-data]
+"guidellm.data" = [ "*.gz",]
+
+[tool.ruff.format]
+quote-style = "double"
+indent-style = "space"
+
+[tool.ruff.lint]
+ignore = [ "PLR0913", "TCH001", "COM812", "ISC001", "TCH002", "PLW1514", "RET505", "RET506", "PD011",]
+select = [ "E", "W", "A", "C", "COM", "ERA", "I", "ICN", "N", "NPY", "PD", "PT", "PTH", "Q", "TCH", "TID", "RUF022", "C4", "C90", "ISC", "PIE", "R", "SIM", "ARG", "ASYNC", "B", "BLE", "E", "F", "INP", "PGH", "PL", "RSE", "S", "SLF", "T10", "T20", "UP", "W", "YTT", "FIX",]
+
+[tool.pytest.ini_options]
+addopts = "-s -vvv --cache-clear"
+markers = [ "smoke: quick tests to check basic functionality", "sanity: detailed tests to ensure major functions work correctly", "regression: tests to ensure that new changes do not break existing functionality",]
+
+[project.entry-points.console_scripts]
+guidellm = "guidellm.__main__:cli"
+
+[tool.setuptools.packages.find]
+where = [ "src",]
+include = [ "*",]
+
+[tool.ruff.lint.extend-per-file-ignores]
+"tests/**/*.py" = [ "S101", "ARG", "PLR2004", "TCH002", "SLF001", "S105", "S311", "PT011", "N806", "PGH003", "S106", "PLR0915",]
+
+[tool.ruff.lint.isort]
+known-first-party = [ "guidellm", "tests",]
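
The `[project]` table above can be read back programmatically, which is a quick way to confirm the packaged version string. Below is a minimal sketch using the standard library's `tomllib` (Python 3.11+); on the 3.9/3.10 interpreters this package also supports, the third-party `tomli` package provides the same API.

```python
# Minimal sketch: load the [project] table from the pyproject.toml shown above.
import tomllib  # use "import tomli as tomllib" on Python < 3.11

with open("pyproject.toml", "rb") as f:
    project = tomllib.load(f)["project"]

print(project["version"])          # 0.2.0.rc20250418
print(project["requires-python"])  # >=3.9.0,<4.0
```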