liger-kernel 0.3.0__tar.gz → 0.4.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- liger_kernel-0.4.0/NOTICE +58 -0
- {liger_kernel-0.3.0/src/liger_kernel.egg-info → liger_kernel-0.4.0}/PKG-INFO +74 -35
- {liger_kernel-0.3.0 → liger_kernel-0.4.0}/README.md +68 -31
- {liger_kernel-0.3.0 → liger_kernel-0.4.0}/pyproject.toml +9 -5
- {liger_kernel-0.3.0 → liger_kernel-0.4.0}/src/liger_kernel/ops/cross_entropy.py +5 -39
- liger_kernel-0.4.0/src/liger_kernel/ops/experimental/mm_int8int2.py +355 -0
- {liger_kernel-0.3.0 → liger_kernel-0.4.0}/src/liger_kernel/ops/fused_linear_cross_entropy.py +13 -10
- liger_kernel-0.4.0/src/liger_kernel/ops/fused_linear_jsd.py +245 -0
- {liger_kernel-0.3.0 → liger_kernel-0.4.0}/src/liger_kernel/ops/geglu.py +2 -2
- liger_kernel-0.4.0/src/liger_kernel/ops/jsd.py +176 -0
- {liger_kernel-0.3.0 → liger_kernel-0.4.0}/src/liger_kernel/ops/kl_div.py +45 -34
- {liger_kernel-0.3.0 → liger_kernel-0.4.0}/src/liger_kernel/ops/rms_norm.py +67 -42
- {liger_kernel-0.3.0 → liger_kernel-0.4.0}/src/liger_kernel/ops/swiglu.py +2 -2
- {liger_kernel-0.3.0 → liger_kernel-0.4.0}/src/liger_kernel/ops/utils.py +62 -1
- {liger_kernel-0.3.0 → liger_kernel-0.4.0}/src/liger_kernel/transformers/__init__.py +3 -0
- {liger_kernel-0.3.0 → liger_kernel-0.4.0}/src/liger_kernel/transformers/auto_model.py +18 -6
- {liger_kernel-0.3.0 → liger_kernel-0.4.0}/src/liger_kernel/transformers/functional.py +4 -0
- liger_kernel-0.4.0/src/liger_kernel/transformers/fused_linear_jsd.py +98 -0
- liger_kernel-0.4.0/src/liger_kernel/transformers/jsd.py +75 -0
- {liger_kernel-0.3.0 → liger_kernel-0.4.0}/src/liger_kernel/transformers/kl_div.py +3 -2
- {liger_kernel-0.3.0 → liger_kernel-0.4.0}/src/liger_kernel/transformers/model/gemma.py +124 -1
- liger_kernel-0.4.0/src/liger_kernel/transformers/model/llama.py +277 -0
- {liger_kernel-0.3.0 → liger_kernel-0.4.0}/src/liger_kernel/transformers/model/mistral.py +3 -0
- liger_kernel-0.4.0/src/liger_kernel/transformers/model/mixtral.py +309 -0
- liger_kernel-0.4.0/src/liger_kernel/transformers/model/mllama.py +274 -0
- liger_kernel-0.4.0/src/liger_kernel/transformers/model/phi3.py +274 -0
- {liger_kernel-0.3.0 → liger_kernel-0.4.0}/src/liger_kernel/transformers/model/qwen2.py +123 -2
- {liger_kernel-0.3.0 → liger_kernel-0.4.0}/src/liger_kernel/transformers/model/qwen2_vl.py +8 -1
- {liger_kernel-0.3.0 → liger_kernel-0.4.0}/src/liger_kernel/transformers/monkey_patch.py +254 -129
- {liger_kernel-0.3.0 → liger_kernel-0.4.0/src/liger_kernel.egg-info}/PKG-INFO +74 -35
- {liger_kernel-0.3.0 → liger_kernel-0.4.0}/src/liger_kernel.egg-info/SOURCES.txt +6 -0
- {liger_kernel-0.3.0 → liger_kernel-0.4.0}/src/liger_kernel.egg-info/requires.txt +6 -3
- liger_kernel-0.3.0/NOTICE +0 -4
- liger_kernel-0.3.0/src/liger_kernel/transformers/model/llama.py +0 -146
- liger_kernel-0.3.0/src/liger_kernel/transformers/model/mixtral.py +0 -158
- liger_kernel-0.3.0/src/liger_kernel/transformers/model/phi3.py +0 -136
- {liger_kernel-0.3.0 → liger_kernel-0.4.0}/LICENSE +0 -0
- {liger_kernel-0.3.0 → liger_kernel-0.4.0}/setup.cfg +0 -0
- {liger_kernel-0.3.0 → liger_kernel-0.4.0}/src/liger_kernel/env_report.py +0 -0
- {liger_kernel-0.3.0 → liger_kernel-0.4.0}/src/liger_kernel/ops/__init__.py +0 -0
- {liger_kernel-0.3.0 → liger_kernel-0.4.0}/src/liger_kernel/ops/experimental/embedding.py +0 -0
- {liger_kernel-0.3.0 → liger_kernel-0.4.0}/src/liger_kernel/ops/layer_norm.py +0 -0
- {liger_kernel-0.3.0 → liger_kernel-0.4.0}/src/liger_kernel/ops/rope.py +0 -0
- {liger_kernel-0.3.0 → liger_kernel-0.4.0}/src/liger_kernel/transformers/cross_entropy.py +0 -0
- {liger_kernel-0.3.0 → liger_kernel-0.4.0}/src/liger_kernel/transformers/experimental/embedding.py +0 -0
- {liger_kernel-0.3.0 → liger_kernel-0.4.0}/src/liger_kernel/transformers/fused_linear_cross_entropy.py +0 -0
- {liger_kernel-0.3.0 → liger_kernel-0.4.0}/src/liger_kernel/transformers/geglu.py +0 -0
- {liger_kernel-0.3.0 → liger_kernel-0.4.0}/src/liger_kernel/transformers/layer_norm.py +0 -0
- {liger_kernel-0.3.0 → liger_kernel-0.4.0}/src/liger_kernel/transformers/model/__init__.py +0 -0
- {liger_kernel-0.3.0 → liger_kernel-0.4.0}/src/liger_kernel/transformers/rms_norm.py +0 -0
- {liger_kernel-0.3.0 → liger_kernel-0.4.0}/src/liger_kernel/transformers/rope.py +0 -0
- {liger_kernel-0.3.0 → liger_kernel-0.4.0}/src/liger_kernel/transformers/swiglu.py +0 -0
- {liger_kernel-0.3.0 → liger_kernel-0.4.0}/src/liger_kernel/transformers/trainer_integration.py +0 -0
- {liger_kernel-0.3.0 → liger_kernel-0.4.0}/src/liger_kernel/triton/__init__.py +0 -0
- {liger_kernel-0.3.0 → liger_kernel-0.4.0}/src/liger_kernel/triton/monkey_patch.py +0 -0
- {liger_kernel-0.3.0 → liger_kernel-0.4.0}/src/liger_kernel.egg-info/dependency_links.txt +0 -0
- {liger_kernel-0.3.0 → liger_kernel-0.4.0}/src/liger_kernel.egg-info/top_level.txt +0 -0
--- /dev/null
+++ liger_kernel-0.4.0/NOTICE
@@ -0,0 +1,58 @@
+Copyright 2024 LinkedIn Corporation
+All Rights Reserved.
+
+Licensed under the BSD 2-Clause License (the "License"). See License in the project root for license information.
+
+This product includes software developed by LinkedIn Corporation.
+
+This product contains code derived from the following open source projects:
+
+1. Unsloth
+Copyright (c) 2023 Unsloth AI
+Licensed under the Apache License, Version 2.0
+Source: https://github.com/unslothai/unsloth
+
+The `calculate_settings` function to determine block size and warp is reused for Norm and MLP operations.
+Modifications and additions were made to the RMS Norm implementation.
+
+2. Triton
+Copyright (c) 2023 OpenAI
+Licensed under the MIT License
+Source: https://github.com/openai/triton
+
+Modifications were made based on Triton tutorials for the RMS Norm implementation.
+
+3. Efficient Cross Entropy
+Copyright (c) 2023 Mohamed Malek
+Licensed under the MIT License
+Source: https://github.com/mgmalek/efficient_cross_entropy
+
+The idea of gradient-in-forward and chunking was used in the Linear Cross Entropy implementation.
+
+4. Flash Attention
+Copyright (c) 2023 Tri Dao, Daniel Y. Fu, Stefano Ermon, Atri Rudra, Christopher Ré
+Licensed under the BSD 3-Clause License
+Source: https://github.com/Dao-AILab/flash-attention
+
+Optimization ideas such as tiling and recomputation were inspired by this work.
+
+5. AutoAWQ
+Copyright (c) 2023 Casper Hansen
+Licensed under the MIT License
+Source: https://github.com/casper-hansen/AutoAWQ
+
+The design of the automodel was referenced from this project.
+
+6. llm.c
+Copyright (c) 2023 Andrej Karpathy
+Licensed under the MIT License
+Source: https://github.com/karpathy/llm.c
+
+The design of end-to-end testing was referenced from this project.
+
+7. Tiny Shakespeare Dataset
+Source: https://huggingface.co/datasets/karpathy/tiny_shakespeare
+
+This dataset is used to conduct convergence tests on mini models.
+
+For full license texts, please refer to the respective project repositories.
--- liger_kernel-0.3.0/src/liger_kernel.egg-info/PKG-INFO
+++ liger_kernel-0.4.0/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: liger_kernel
-Version: 0.3.0
+Version: 0.4.0
 Summary: Efficient Triton kernels for LLM Training
 License: BSD 2-CLAUSE LICENSE
 Copyright 2024 LinkedIn Corporation
@@ -31,18 +31,22 @@ Description-Content-Type: text/markdown
 License-File: LICENSE
 License-File: NOTICE
 Requires-Dist: torch>=2.1.2
-Requires-Dist: triton>=2.3.0
-Requires-Dist: transformers>=4.42.0
+Requires-Dist: triton>=2.3.1
+Provides-Extra: transformers
+Requires-Dist: transformers~=4.0; extra == "transformers"
 Provides-Extra: dev
+Requires-Dist: transformers>=4.44.2; extra == "dev"
 Requires-Dist: matplotlib>=3.7.2; extra == "dev"
 Requires-Dist: flake8>=4.0.1.1; extra == "dev"
 Requires-Dist: black>=24.4.2; extra == "dev"
 Requires-Dist: isort>=5.13.2; extra == "dev"
 Requires-Dist: pytest>=7.1.2; extra == "dev"
 Requires-Dist: datasets>=2.19.2; extra == "dev"
-Requires-Dist:
+Requires-Dist: torchvision>=0.16.2; extra == "dev"
 Requires-Dist: seaborn; extra == "dev"
 
+<a name="readme-top"></a>
+
 # Liger Kernel: Efficient Triton Kernels for LLM Training
 
 
@@ -51,6 +55,7 @@ Requires-Dist: seaborn; extra == "dev"
 <th style="padding: 10px;" colspan="2">Stable</th>
 <th style="padding: 10px;" colspan="2">Nightly</th>
 <th style="padding: 10px;">Discord</th>
+<th style="padding: 10px;">Gurubase (experimental)</th>
 </tr>
 <tr>
 <td style="padding: 10px;">
@@ -74,8 +79,13 @@ Requires-Dist: seaborn; extra == "dev"
 </a>
 </td>
 <td style="padding: 10px;">
-<a href="https://discord.gg/
-<img src="https://dcbadge.vercel.app/api/server/
+<a href="https://discord.gg/gpumode">
+<img src="https://dcbadge.vercel.app/api/server/gpumode?style=flat" alt="Join Our Discord">
+</a>
+</td>
+<td style="padding: 10px;">
+<a href="https://gurubase.io/g/liger-kernel">
+<img src="https://img.shields.io/badge/Gurubase-Ask%20Liger%20Kernel%20Guru-006BFF" alt="Ask Liger Kernel Guru">
 </a>
 </td>
 </tr>
@@ -85,11 +95,12 @@ Requires-Dist: seaborn; extra == "dev"
 
 <img src="https://raw.githubusercontent.com/linkedin/Liger-Kernel/main/docs/images/logo-banner.png">
 
-[Installation](#installation) | [Getting Started](#getting-started) | [Examples](#examples) | [APIs](#apis) | [
+[Installation](#installation) | [Getting Started](#getting-started) | [Examples](#examples) | [APIs](#apis) | [Cite our work](#cite-this-work)
 
 <details>
 <summary>Latest News 🔥</summary>
-
+
+- [2024/10/21] We have released the tech report of Liger Kernel on arXiv: https://arxiv.org/pdf/2410.10989
 - [2024/9/6] We release v0.2.1 ([X post](https://x.com/liger_kernel/status/1832168197002510649)). 2500+ Stars, 10+ New Contributors, 50+ PRs, 50k Downloads in two weeks!
 - [2024/8/31] CUDA MODE talk, [Liger-Kernel: Real-world Triton kernel for LLM Training](https://youtu.be/gWble4FreV4?si=dxPeIchhkJ36Mbns), [Slides](https://github.com/cuda-mode/lectures?tab=readme-ov-file#lecture-28-liger-kernel)
 - [2024/8/23] Official release: check out our [X post](https://x.com/hsu_byron/status/1827072737673982056)
@@ -147,11 +158,21 @@ With one line of code, Liger Kernel can increase throughput by more than 20% and
 
 ## Installation
 
-### Dependencies
+### Dependencies
+
+#### CUDA
 
 - `torch >= 2.1.2`
 - `triton >= 2.3.0`
-
+
+#### ROCm
+
+- `torch >= 2.5.0` Install according to the instructions on the PyTorch official webpage.
+- `triton >= 3.0.0` Install from PyPI (e.g. `pip install triton==3.0.0`).
+
+### Optional Dependencies
+
+- `transformers >= 4.x`: Required if you plan to use the transformers model patching APIs. The specific model you are working with will dictate the minimum version of transformers.
 
 > **Note:**
 > Our kernels inherit the full spectrum of hardware compatibility offered by [Triton](https://github.com/triton-lang/triton).
@@ -174,7 +195,11 @@ To install from source:
 git clone https://github.com/linkedin/Liger-Kernel.git
 cd Liger-Kernel
 pip install -e .
+# or if using transformers
+pip install -e .[transformers]
 ```
+
+
 ## Getting Started
 
 There are a couple of ways to apply Liger kernels, depending on the level of customization required.
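The "Getting Started" path above relies on the model patching APIs listed in the next hunk (e.g. `apply_liger_kernel_to_llama`). A minimal sketch of that flow, assuming the standard Hugging Face `transformers` loading API and an illustrative checkpoint name:

```python
import torch
from transformers import AutoModelForCausalLM

from liger_kernel.transformers import apply_liger_kernel_to_llama

# Patch the Hugging Face LLaMA modeling code in place so that RoPE, RMSNorm,
# SwiGLU, and the cross-entropy losses route through the Liger Triton kernels.
# The patch must be applied before the model is instantiated.
apply_liger_kernel_to_llama()

# Illustrative checkpoint; any LLaMA-architecture model should behave the same way.
model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Meta-Llama-3-8B",
    torch_dtype=torch.bfloat16,
)
```

The same pattern applies to the other `apply_liger_kernel_to_*` entry points; the newly added `apply_liger_kernel_to_mllama` targets the LLaMA 3.2-Vision models.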
@@ -267,13 +292,14 @@ loss.backward()
 | **Model** | **API** | **Supported Operations** |
 |-------------|--------------------------------------------------------------|-------------------------------------------------------------------------|
 | LLaMA 2 & 3 | `liger_kernel.transformers.apply_liger_kernel_to_llama` | RoPE, RMSNorm, SwiGLU, CrossEntropyLoss, FusedLinearCrossEntropy |
+| LLaMA 3.2-Vision | `liger_kernel.transformers.apply_liger_kernel_to_mllama` | RoPE, RMSNorm, SwiGLU, CrossEntropyLoss, FusedLinearCrossEntropy |
 | Mistral | `liger_kernel.transformers.apply_liger_kernel_to_mistral` | RoPE, RMSNorm, SwiGLU, CrossEntropyLoss, FusedLinearCrossEntropy |
 | Mixtral | `liger_kernel.transformers.apply_liger_kernel_to_mixtral` | RoPE, RMSNorm, SwiGLU, CrossEntropyLoss, FusedLinearCrossEntropy |
 | Gemma1 | `liger_kernel.transformers.apply_liger_kernel_to_gemma` | RoPE, RMSNorm, GeGLU, CrossEntropyLoss, FusedLinearCrossEntropy |
 | Gemma2 | `liger_kernel.transformers.apply_liger_kernel_to_gemma2` | RoPE, RMSNorm, GeGLU, CrossEntropyLoss |
-| Qwen2
+| Qwen2 & Qwen2.5 | `liger_kernel.transformers.apply_liger_kernel_to_qwen2` | RoPE, RMSNorm, SwiGLU, CrossEntropyLoss, FusedLinearCrossEntropy |
 | Qwen2-VL | `liger_kernel.transformers.apply_liger_kernel_to_qwen2_vl` | RMSNorm, LayerNorm, SwiGLU, CrossEntropyLoss, FusedLinearCrossEntropy |
-| Phi3
+| Phi3 & Phi3.5 | `liger_kernel.transformers.apply_liger_kernel_to_phi3` | RoPE, RMSNorm, SwiGLU, CrossEntropyLoss, FusedLinearCrossEntropy |
 
 
 
@@ -289,6 +315,8 @@ loss.backward()
 | CrossEntropy | `liger_kernel.transformers.LigerCrossEntropyLoss` |
 | FusedLinearCrossEntropy | `liger_kernel.transformers.LigerFusedLinearCrossEntropyLoss`|
 | KLDivergence | `liger_kernel.transformers.LigerKLDIVLoss` |
+| JSD | `liger_kernel.transformers.LigerJSD` |
+| FusedLinearJSD | `liger_kernel.transformers.LigerFusedLinearJSD` |
 
 - **RMSNorm**: [RMSNorm](https://arxiv.org/pdf/1910.07467), which normalizes activations using their root mean square, is implemented by fusing the normalization and scaling steps into a single Triton kernel, and achieves ~3X speedup with ~3X peak memory reduction.
 - **LayerNorm**: [LayerNorm](https://arxiv.org/pdf/1607.06450), which centers and normalizes activations across the feature dimension, is implemented by fusing the centering, normalization and scaling steps into a single Triton kernel, and achieves ~2X speedup.
@@ -303,35 +331,23 @@ $$\text{GeGLU}(x)=\text{GELU}(xW+b)\otimes(xV+c)$$
 <!-- TODO: verify vocab sizes are accurate -->
 - **FusedLinearCrossEntropy**: Peak memory usage of cross entropy loss is further improved by fusing the model head with the CE loss and chunking the input for block-wise loss and gradient calculation, a technique inspired by [Efficient Cross Entropy](https://github.com/mgmalek/efficient_cross_entropy). It achieves >4X memory reduction for 128k vocab size. **This is highly effective for large batch size, large sequence length, and large vocabulary sizes.** Please refer to the [Medusa example](https://github.com/linkedin/Liger-Kernel/tree/main/examples/medusa) for individual kernel usage.
 - **KLDivergence**: [KL Divergence](https://pytorch.org/docs/stable/generated/torch.nn.KLDivLoss.html) is implemented by fusing the forward into a single triton kernel, with reduction done outside the kernel. It achieves ~1.5X speed and ~15% memory reduction for 128K vocab size.
+- **JSD**: [Generalized JSD](https://arxiv.org/pdf/2306.13649) (Jensen-Shannon divergence) is implemented by computing both the loss and gradient in the forward pass. It achieves ~1.5X speed and ~54% memory reduction for 128k vocab size.
+- **FusedLinearJSD**: Peak memory usage of JSD loss is further improved by fusing the model head with the JSD and chunking the input for block-wise loss and gradient calculation. It achieves ~85% memory reduction for 128k vocab size where batch size $\times$ sequence length is 8192.
+
 
 ### Experimental Kernels
 
 | **Kernel** | **API** |
 |---------------------------------|-------------------------------------------------------------|
 | Embedding | `liger_kernel.transformers.experimental.LigerEmbedding` |
-
+| Matmul int2xint8 | `liger_kernel.transformers.experimental.matmul` |
 
 - **Embedding**: [Embedding](https://pytorch.org/docs/stable/generated/torch.nn.Embedding.html) is implemented by fusing embedding lookup and output operations. It achieves a peak speedup of ~1.5x in the forward pass and an overall speedup of ~1.1x.
-
+- **Matmul int2xint8**: implemented using cache-tiled matrix multiplication and by fusing the matmul with the unpacking process, which achieves a considerable speedup and performs on par with `@torch.compile`.
 <!-- TODO: be more specific about batch size -->
 > **Note:**
 > Reported speedups and memory reductions are with respect to the LLaMA 3-8B Hugging Face layer implementations. All models use 4K hidden size and 4K sequence length and are evaluated based on memory usage and wall time for the forward+backward pass on a single NVIDIA A100 80G GPU using small batch sizes. Liger kernels exhibit more efficient scaling to larger batch sizes, detailed further in the [Benchmark](./benchmark) folder.
 
-## Note on ML Compiler
-
-### Torch Compile
-
-Since Liger Kernel is 100% Triton-based, it works seamlessly with [`torch.compile`](https://pytorch.org/tutorials/intermediate/torch_compile_tutorial.html). In the following example, Liger Kernel can further optimize the model on top of Torch Compile, reducing the memory by more than half.
-
-| Configuration | Throughput (tokens/sec) | Memory Reserved (GB) |
-|--------------------------------|----------------------------|-------------------------|
-| Torch Compile | 3780 | 66.4 |
-| Torch Compile + Liger Kernel | 3702 | 31.0 |
-
-> **Note:**
-> 1. Benchmark conditions: LLaMA 3-8B, Batch Size = 8, Seq Len = 4096, Data Type = `bf16`, Optimizer = AdamW, Gradient Checkpointing = True, Distributed Strategy = FSDP1 on 8 A100s.
-> 2. Tested on torch `2.5.0.dev20240731+cu118`
-
 ## Contributing
 
 [CONTRIBUTING GUIDE](https://github.com/linkedin/Liger-Kernel/blob/main/CONTRIBUTING.md)
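The new `LigerJSD` module in the table above is a drop-in loss for distillation-style training. A minimal sketch follows; the argument order (student log-probabilities first, teacher second) and the symmetric default weighting are assumptions here, mirroring the `torch.nn.KLDivLoss(log_target=True)` convention rather than a signature shown in this diff:

```python
import torch
import torch.nn.functional as F

from liger_kernel.transformers import LigerJSD

# Illustrative shapes: (batch * seq_len, vocab_size) logits on a GPU,
# since Liger kernels are Triton-based and expect CUDA/ROCm tensors.
BT, V = 8, 128
student_logits = torch.randn(BT, V, device="cuda", requires_grad=True)
teacher_logits = torch.randn(BT, V, device="cuda")

jsd = LigerJSD()  # assumed default: symmetric generalized JSD

# Loss and gradient are produced in the forward pass, as described above.
loss = jsd(
    F.log_softmax(student_logits, dim=-1),
    F.log_softmax(teacher_logits, dim=-1),
)
loss.backward()
```

`LigerFusedLinearJSD` follows the same idea but fuses the language-model head into the loss so the full-vocabulary logits are never materialized; presumably it is fed the pre-head hidden states and the student/teacher head weights instead of logits.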
@@ -365,7 +381,14 @@ Many thanks to the contributors to these projects for their invaluable work that
 
 ## License
 
-[BSD 2-CLAUSE](https://github.com/linkedin/Liger-Kernel/blob/main/LICENSE)
+This project is licensed under the [BSD 2-CLAUSE](https://github.com/linkedin/Liger-Kernel/blob/main/LICENSE) License (see `LICENSE` for details).
+It also includes components from projects licensed under:
+
+- Apache License 2.0 (see `LICENSE-APACHE-2.0` for details).
+- MIT License (see `LICENSE-MIT-AutoAWQ` for details).
+- MIT License (see `LICENSE-MIT-Efficient Cross Entropy` for details).
+- MIT License (see `LICENSE-MIT-llmc` for details).
+- MIT License (see `LICENSE-MIT-triton` for details).
 
 ## Contact
 
@@ -376,13 +399,29 @@ Many thanks to the contributors to these projects for their invaluable work that
 
 Biblatex entry:
 ```bib
-@
-
-
-
-
+@article{hsu2024ligerkernelefficienttriton,
+title={Liger Kernel: Efficient Triton Kernels for LLM Training},
+author={Pin-Lun Hsu and Yun Dai and Vignesh Kothapalli and Qingquan Song and Shao Tang and Siyu Zhu and Steven Shimizu and Shivam Sahni and Haowen Ning and Yanning Chen},
+year={2024},
+eprint={2410.10989},
+archivePrefix={arXiv},
+primaryClass={cs.LG},
+url={https://arxiv.org/abs/2410.10989},
+journal={arXiv preprint arXiv:2410.10989},
 }
 ```
 
 ## Star History
 [](https://star-history.com/#linkedin/Liger-Kernel&Date)
+
+## Contributors
+
+<a href="https://github.com/linkedin/Liger-Kernel/graphs/contributors">
+<img alt="contributors" src="https://contrib.rocks/image?repo=linkedin/Liger-Kernel"/>
+</a>
+
+<p align="right" style="font-size: 14px; color: #555; margin-top: 20px;">
+<a href="#readme-top" style="text-decoration: none; color: #007bff; font-weight: bold;">
+↑ Back to Top ↑
+</a>
+</p>
--- liger_kernel-0.3.0/README.md
+++ liger_kernel-0.4.0/README.md
@@ -1,3 +1,5 @@
+<a name="readme-top"></a>
+
 # Liger Kernel: Efficient Triton Kernels for LLM Training
 
 
@@ -6,6 +8,7 @@
 <th style="padding: 10px;" colspan="2">Stable</th>
 <th style="padding: 10px;" colspan="2">Nightly</th>
 <th style="padding: 10px;">Discord</th>
+<th style="padding: 10px;">Gurubase (experimental)</th>
 </tr>
 <tr>
 <td style="padding: 10px;">
@@ -29,8 +32,13 @@
 </a>
 </td>
 <td style="padding: 10px;">
-<a href="https://discord.gg/
-<img src="https://dcbadge.vercel.app/api/server/
+<a href="https://discord.gg/gpumode">
+<img src="https://dcbadge.vercel.app/api/server/gpumode?style=flat" alt="Join Our Discord">
+</a>
+</td>
+<td style="padding: 10px;">
+<a href="https://gurubase.io/g/liger-kernel">
+<img src="https://img.shields.io/badge/Gurubase-Ask%20Liger%20Kernel%20Guru-006BFF" alt="Ask Liger Kernel Guru">
 </a>
 </td>
 </tr>
@@ -40,11 +48,12 @@
 
 <img src="https://raw.githubusercontent.com/linkedin/Liger-Kernel/main/docs/images/logo-banner.png">
 
-[Installation](#installation) | [Getting Started](#getting-started) | [Examples](#examples) | [APIs](#apis) | [
+[Installation](#installation) | [Getting Started](#getting-started) | [Examples](#examples) | [APIs](#apis) | [Cite our work](#cite-this-work)
 
 <details>
 <summary>Latest News 🔥</summary>
-
+
+- [2024/10/21] We have released the tech report of Liger Kernel on arXiv: https://arxiv.org/pdf/2410.10989
 - [2024/9/6] We release v0.2.1 ([X post](https://x.com/liger_kernel/status/1832168197002510649)). 2500+ Stars, 10+ New Contributors, 50+ PRs, 50k Downloads in two weeks!
 - [2024/8/31] CUDA MODE talk, [Liger-Kernel: Real-world Triton kernel for LLM Training](https://youtu.be/gWble4FreV4?si=dxPeIchhkJ36Mbns), [Slides](https://github.com/cuda-mode/lectures?tab=readme-ov-file#lecture-28-liger-kernel)
 - [2024/8/23] Official release: check out our [X post](https://x.com/hsu_byron/status/1827072737673982056)
@@ -102,11 +111,21 @@ With one line of code, Liger Kernel can increase throughput by more than 20% and
 
 ## Installation
 
-### Dependencies
+### Dependencies
+
+#### CUDA
 
 - `torch >= 2.1.2`
 - `triton >= 2.3.0`
-
+
+#### ROCm
+
+- `torch >= 2.5.0` Install according to the instructions on the PyTorch official webpage.
+- `triton >= 3.0.0` Install from PyPI (e.g. `pip install triton==3.0.0`).
+
+### Optional Dependencies
+
+- `transformers >= 4.x`: Required if you plan to use the transformers model patching APIs. The specific model you are working with will dictate the minimum version of transformers.
 
 > **Note:**
 > Our kernels inherit the full spectrum of hardware compatibility offered by [Triton](https://github.com/triton-lang/triton).
@@ -129,7 +148,11 @@ To install from source:
 git clone https://github.com/linkedin/Liger-Kernel.git
 cd Liger-Kernel
 pip install -e .
+# or if using transformers
+pip install -e .[transformers]
 ```
+
+
 ## Getting Started
 
 There are a couple of ways to apply Liger kernels, depending on the level of customization required.
@@ -222,13 +245,14 @@ loss.backward()
 | **Model** | **API** | **Supported Operations** |
 |-------------|--------------------------------------------------------------|-------------------------------------------------------------------------|
 | LLaMA 2 & 3 | `liger_kernel.transformers.apply_liger_kernel_to_llama` | RoPE, RMSNorm, SwiGLU, CrossEntropyLoss, FusedLinearCrossEntropy |
+| LLaMA 3.2-Vision | `liger_kernel.transformers.apply_liger_kernel_to_mllama` | RoPE, RMSNorm, SwiGLU, CrossEntropyLoss, FusedLinearCrossEntropy |
 | Mistral | `liger_kernel.transformers.apply_liger_kernel_to_mistral` | RoPE, RMSNorm, SwiGLU, CrossEntropyLoss, FusedLinearCrossEntropy |
 | Mixtral | `liger_kernel.transformers.apply_liger_kernel_to_mixtral` | RoPE, RMSNorm, SwiGLU, CrossEntropyLoss, FusedLinearCrossEntropy |
 | Gemma1 | `liger_kernel.transformers.apply_liger_kernel_to_gemma` | RoPE, RMSNorm, GeGLU, CrossEntropyLoss, FusedLinearCrossEntropy |
 | Gemma2 | `liger_kernel.transformers.apply_liger_kernel_to_gemma2` | RoPE, RMSNorm, GeGLU, CrossEntropyLoss |
-| Qwen2
+| Qwen2 & Qwen2.5 | `liger_kernel.transformers.apply_liger_kernel_to_qwen2` | RoPE, RMSNorm, SwiGLU, CrossEntropyLoss, FusedLinearCrossEntropy |
 | Qwen2-VL | `liger_kernel.transformers.apply_liger_kernel_to_qwen2_vl` | RMSNorm, LayerNorm, SwiGLU, CrossEntropyLoss, FusedLinearCrossEntropy |
-| Phi3
+| Phi3 & Phi3.5 | `liger_kernel.transformers.apply_liger_kernel_to_phi3` | RoPE, RMSNorm, SwiGLU, CrossEntropyLoss, FusedLinearCrossEntropy |
 
 
 
@@ -244,6 +268,8 @@ loss.backward()
 | CrossEntropy | `liger_kernel.transformers.LigerCrossEntropyLoss` |
 | FusedLinearCrossEntropy | `liger_kernel.transformers.LigerFusedLinearCrossEntropyLoss`|
 | KLDivergence | `liger_kernel.transformers.LigerKLDIVLoss` |
+| JSD | `liger_kernel.transformers.LigerJSD` |
+| FusedLinearJSD | `liger_kernel.transformers.LigerFusedLinearJSD` |
 
 - **RMSNorm**: [RMSNorm](https://arxiv.org/pdf/1910.07467), which normalizes activations using their root mean square, is implemented by fusing the normalization and scaling steps into a single Triton kernel, and achieves ~3X speedup with ~3X peak memory reduction.
 - **LayerNorm**: [LayerNorm](https://arxiv.org/pdf/1607.06450), which centers and normalizes activations across the feature dimension, is implemented by fusing the centering, normalization and scaling steps into a single Triton kernel, and achieves ~2X speedup.
@@ -258,35 +284,23 @@ $$\text{GeGLU}(x)=\text{GELU}(xW+b)\otimes(xV+c)$$
 <!-- TODO: verify vocab sizes are accurate -->
 - **FusedLinearCrossEntropy**: Peak memory usage of cross entropy loss is further improved by fusing the model head with the CE loss and chunking the input for block-wise loss and gradient calculation, a technique inspired by [Efficient Cross Entropy](https://github.com/mgmalek/efficient_cross_entropy). It achieves >4X memory reduction for 128k vocab size. **This is highly effective for large batch size, large sequence length, and large vocabulary sizes.** Please refer to the [Medusa example](https://github.com/linkedin/Liger-Kernel/tree/main/examples/medusa) for individual kernel usage.
 - **KLDivergence**: [KL Divergence](https://pytorch.org/docs/stable/generated/torch.nn.KLDivLoss.html) is implemented by fusing the forward into a single triton kernel, with reduction done outside the kernel. It achieves ~1.5X speed and ~15% memory reduction for 128K vocab size.
+- **JSD**: [Generalized JSD](https://arxiv.org/pdf/2306.13649) (Jensen-Shannon divergence) is implemented by computing both the loss and gradient in the forward pass. It achieves ~1.5X speed and ~54% memory reduction for 128k vocab size.
+- **FusedLinearJSD**: Peak memory usage of JSD loss is further improved by fusing the model head with the JSD and chunking the input for block-wise loss and gradient calculation. It achieves ~85% memory reduction for 128k vocab size where batch size $\times$ sequence length is 8192.
+
 
 ### Experimental Kernels
 
 | **Kernel** | **API** |
 |---------------------------------|-------------------------------------------------------------|
 | Embedding | `liger_kernel.transformers.experimental.LigerEmbedding` |
-
+| Matmul int2xint8 | `liger_kernel.transformers.experimental.matmul` |
 
 - **Embedding**: [Embedding](https://pytorch.org/docs/stable/generated/torch.nn.Embedding.html) is implemented by fusing embedding lookup and output operations. It achieves a peak speedup of ~1.5x in the forward pass and an overall speedup of ~1.1x.
-
+- **Matmul int2xint8**: implemented using cache-tiled matrix multiplication and by fusing the matmul with the unpacking process, which achieves a considerable speedup and performs on par with `@torch.compile`.
 <!-- TODO: be more specific about batch size -->
 > **Note:**
 > Reported speedups and memory reductions are with respect to the LLaMA 3-8B Hugging Face layer implementations. All models use 4K hidden size and 4K sequence length and are evaluated based on memory usage and wall time for the forward+backward pass on a single NVIDIA A100 80G GPU using small batch sizes. Liger kernels exhibit more efficient scaling to larger batch sizes, detailed further in the [Benchmark](./benchmark) folder.
 
-## Note on ML Compiler
-
-### Torch Compile
-
-Since Liger Kernel is 100% Triton-based, it works seamlessly with [`torch.compile`](https://pytorch.org/tutorials/intermediate/torch_compile_tutorial.html). In the following example, Liger Kernel can further optimize the model on top of Torch Compile, reducing the memory by more than half.
-
-| Configuration | Throughput (tokens/sec) | Memory Reserved (GB) |
-|--------------------------------|----------------------------|-------------------------|
-| Torch Compile | 3780 | 66.4 |
-| Torch Compile + Liger Kernel | 3702 | 31.0 |
-
-> **Note:**
-> 1. Benchmark conditions: LLaMA 3-8B, Batch Size = 8, Seq Len = 4096, Data Type = `bf16`, Optimizer = AdamW, Gradient Checkpointing = True, Distributed Strategy = FSDP1 on 8 A100s.
-> 2. Tested on torch `2.5.0.dev20240731+cu118`
-
 ## Contributing
 
 [CONTRIBUTING GUIDE](https://github.com/linkedin/Liger-Kernel/blob/main/CONTRIBUTING.md)
@@ -320,7 +334,14 @@ Many thanks to the contributors to these projects for their invaluable work that
 
 ## License
 
-[BSD 2-CLAUSE](https://github.com/linkedin/Liger-Kernel/blob/main/LICENSE)
+This project is licensed under the [BSD 2-CLAUSE](https://github.com/linkedin/Liger-Kernel/blob/main/LICENSE) License (see `LICENSE` for details).
+It also includes components from projects licensed under:
+
+- Apache License 2.0 (see `LICENSE-APACHE-2.0` for details).
+- MIT License (see `LICENSE-MIT-AutoAWQ` for details).
+- MIT License (see `LICENSE-MIT-Efficient Cross Entropy` for details).
+- MIT License (see `LICENSE-MIT-llmc` for details).
+- MIT License (see `LICENSE-MIT-triton` for details).
 
 ## Contact
 
@@ -331,13 +352,29 @@ Many thanks to the contributors to these projects for their invaluable work that
 
 Biblatex entry:
 ```bib
-@
-
-
-
-
+@article{hsu2024ligerkernelefficienttriton,
+title={Liger Kernel: Efficient Triton Kernels for LLM Training},
+author={Pin-Lun Hsu and Yun Dai and Vignesh Kothapalli and Qingquan Song and Shao Tang and Siyu Zhu and Steven Shimizu and Shivam Sahni and Haowen Ning and Yanning Chen},
+year={2024},
+eprint={2410.10989},
+archivePrefix={arXiv},
+primaryClass={cs.LG},
+url={https://arxiv.org/abs/2410.10989},
+journal={arXiv preprint arXiv:2410.10989},
 }
 ```
 
 ## Star History
 [](https://star-history.com/#linkedin/Liger-Kernel&Date)
+
+## Contributors
+
+<a href="https://github.com/linkedin/Liger-Kernel/graphs/contributors">
+<img alt="contributors" src="https://contrib.rocks/image?repo=linkedin/Liger-Kernel"/>
+</a>
+
+<p align="right" style="font-size: 14px; color: #555; margin-top: 20px;">
+<a href="#readme-top" style="text-decoration: none; color: #007bff; font-weight: bold;">
+↑ Back to Top ↑
+</a>
+</p>
--- liger_kernel-0.3.0/pyproject.toml
+++ liger_kernel-0.4.0/pyproject.toml
@@ -4,26 +4,30 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "liger_kernel"
-version = "0.3.0"
+version = "0.4.0"
 description = "Efficient Triton kernels for LLM Training"
 urls = { "Homepage" = "https://github.com/linkedin/Liger-Kernel" }
 readme = { file = "README.md", content-type = "text/markdown" }
 license = { file = "LICENSE" }
 dependencies = [
 "torch>=2.1.2",
-"triton>=2.3.0",
-"transformers>=4.42.0"
+"triton>=2.3.1",
 ]
 
 [project.optional-dependencies]
+transformers = [
+"transformers~=4.0"
+]
+
 dev = [
+"transformers>=4.44.2",
 "matplotlib>=3.7.2",
 "flake8>=4.0.1.1",
 "black>=24.4.2",
 "isort>=5.13.2",
 "pytest>=7.1.2",
 "datasets>=2.19.2",
-"
+"torchvision>=0.16.2",
 "seaborn",
 ]
 
@@ -33,7 +37,7 @@ include = ["liger_kernel", "liger_kernel.*"]
 
 [tool.pytest.ini_options]
 pythonpath = [
-"src",
+"src",
 "."
 ]
 asyncio_mode = "auto"
--- liger_kernel-0.3.0/src/liger_kernel/ops/cross_entropy.py
+++ liger_kernel-0.4.0/src/liger_kernel/ops/cross_entropy.py
@@ -2,6 +2,8 @@ import torch
 import triton
 import triton.language as tl
 
+from liger_kernel.ops.utils import element_mul_kernel, is_hip
+
 
 @triton.jit
 def liger_cross_entropy_kernel(
@@ -126,7 +128,7 @@ def liger_cross_entropy_kernel(
     # So we can safely calculate log (softmax(X_y)) without overflow
     loss = -(ori_X_y - m - tl.log(d))
 
-    #
+    # Original loss = H(q, p), with label smoothing regularization = H(q', p) and (label_smoothing / V) = eps
     # H(q', p) = (1 - label_smoothing) * H(q, p) + label_smoothing * H(u, p)
     # = (1 - label_smoothing) * H(q, p) + eps * sum(logsoftmax(x_i))
     # By using m (global max of xi) and d (sum of e^(xi-m)), we can simplify as:
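The expanded comment block restates the standard label-smoothing decomposition. Written out, with $\varepsilon$ denoting `label_smoothing`, $V$ the vocabulary size, and `eps` $= \varepsilon / V$ as defined in the new comment line:

$$H(q', p) = (1 - \varepsilon)\,H(q, p) + \varepsilon\,H(u, p), \qquad H(u, p) = -\frac{1}{V}\sum_{i=1}^{V}\log\mathrm{softmax}(x_i)$$

Using the running statistics $m = \max_i x_i$ and $d = \sum_i e^{x_i - m}$, each term is $\log\mathrm{softmax}(x_i) = x_i - m - \log d$, so the smoothing contribution reduces to $\mathrm{eps}\sum_i (m + \log d - x_i)$, which is presumably the simplification the final comment line refers to.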
@@ -159,42 +161,6 @@ def liger_cross_entropy_kernel(
 MAX_FUSED_SIZE = 65536 // 2 # the best size we found by manually tuning
 
 
-@triton.jit
-def element_mul_kernel(
-    X_ptr,
-    X_stride,
-    grad_output_ptr,
-    n_cols,
-    BLOCK_SIZE: tl.constexpr,
-):
-    """
-    This function multiplies each element of the tensor pointed by X_ptr with the value pointed by grad_output_ptr.
-    The multiplication is performed in-place on the tensor pointed by X_ptr.
-
-    Parameters:
-    X_ptr: Pointer to the input tensor.
-    X_stride (int): The stride of the input tensor.
-    grad_output_ptr: Pointer to the gradient output value.
-    n_cols (int): The number of columns in the input tensor.
-    BLOCK_SIZE (int): The block size for Triton operations.
-    """
-
-    # Get the program ID and convert it to int64 to avoid overflow
-    program_id = tl.program_id(0).to(tl.int64)
-
-    # Locate the start index
-    X_ptr += program_id * X_stride
-
-    # Load the gradient output value
-    grad_output = tl.load(grad_output_ptr)
-
-    # Perform the element-wise multiplication
-    for i in range(0, n_cols, BLOCK_SIZE):
-        X_offsets = i + tl.arange(0, BLOCK_SIZE)
-        X_block = tl.load(X_ptr + X_offsets, mask=X_offsets < n_cols)
-        tl.store(X_ptr + X_offsets, X_block * grad_output, mask=X_offsets < n_cols)
-
-
 def cross_entropy_forward(_input, target, ignore_index, label_smoothing, reduction):
     BT, V = _input.shape
     n_rows = BT
@@ -228,7 +194,7 @@ def cross_entropy_forward(_input, target, ignore_index, label_smoothing, reducti
         BLOCK_SIZE=BLOCK_SIZE,
         # TODO: 32 seems to give the best performance
         # Performance is quite sensitive to num_warps
-        num_warps=32,
+        num_warps=32 if not is_hip() else 16,
     )
 
     loss = torch.sum(loss_1d)
@@ -253,7 +219,7 @@ def cross_entropy_backward(_input, grad_output):
         grad_output,
         V,
         BLOCK_SIZE=BLOCK_SIZE,
-        num_warps=32,
+        num_warps=32 if not is_hip() else 16,
     )
 
     return _input