liger-kernel 0.3.1__tar.gz → 0.4.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (63)
  1. liger_kernel-0.4.1/NOTICE +58 -0
  2. {liger_kernel-0.3.1/src/liger_kernel.egg-info → liger_kernel-0.4.1}/PKG-INFO +63 -29
  3. {liger_kernel-0.3.1 → liger_kernel-0.4.1}/README.md +60 -27
  4. {liger_kernel-0.3.1 → liger_kernel-0.4.1}/pyproject.toml +4 -3
  5. {liger_kernel-0.3.1 → liger_kernel-0.4.1}/src/liger_kernel/env_report.py +2 -0
  6. {liger_kernel-0.3.1 → liger_kernel-0.4.1}/src/liger_kernel/ops/cross_entropy.py +144 -65
  7. liger_kernel-0.4.1/src/liger_kernel/ops/experimental/mm_int8int2.py +355 -0
  8. {liger_kernel-0.3.1 → liger_kernel-0.4.1}/src/liger_kernel/ops/fused_linear_cross_entropy.py +31 -11
  9. liger_kernel-0.4.1/src/liger_kernel/ops/fused_linear_jsd.py +245 -0
  10. {liger_kernel-0.3.1 → liger_kernel-0.4.1}/src/liger_kernel/ops/geglu.py +2 -2
  11. liger_kernel-0.4.1/src/liger_kernel/ops/group_norm.py +322 -0
  12. liger_kernel-0.4.1/src/liger_kernel/ops/jsd.py +176 -0
  13. {liger_kernel-0.3.1 → liger_kernel-0.4.1}/src/liger_kernel/ops/kl_div.py +2 -2
  14. {liger_kernel-0.3.1 → liger_kernel-0.4.1}/src/liger_kernel/ops/rms_norm.py +92 -46
  15. {liger_kernel-0.3.1 → liger_kernel-0.4.1}/src/liger_kernel/ops/swiglu.py +2 -2
  16. {liger_kernel-0.3.1 → liger_kernel-0.4.1}/src/liger_kernel/ops/utils.py +62 -1
  17. {liger_kernel-0.3.1 → liger_kernel-0.4.1}/src/liger_kernel/transformers/__init__.py +3 -0
  18. liger_kernel-0.4.1/src/liger_kernel/transformers/cross_entropy.py +53 -0
  19. liger_kernel-0.4.1/src/liger_kernel/transformers/functional.py +56 -0
  20. liger_kernel-0.4.1/src/liger_kernel/transformers/fused_linear_cross_entropy.py +48 -0
  21. liger_kernel-0.4.1/src/liger_kernel/transformers/fused_linear_jsd.py +98 -0
  22. liger_kernel-0.4.1/src/liger_kernel/transformers/group_norm.py +56 -0
  23. liger_kernel-0.4.1/src/liger_kernel/transformers/jsd.py +75 -0
  24. {liger_kernel-0.3.1 → liger_kernel-0.4.1}/src/liger_kernel/transformers/model/gemma.py +124 -1
  25. liger_kernel-0.4.1/src/liger_kernel/transformers/model/gemma2.py +277 -0
  26. liger_kernel-0.4.1/src/liger_kernel/transformers/model/llama.py +277 -0
  27. {liger_kernel-0.3.1 → liger_kernel-0.4.1}/src/liger_kernel/transformers/model/mistral.py +3 -0
  28. liger_kernel-0.4.1/src/liger_kernel/transformers/model/mixtral.py +309 -0
  29. liger_kernel-0.4.1/src/liger_kernel/transformers/model/mllama.py +274 -0
  30. liger_kernel-0.4.1/src/liger_kernel/transformers/model/phi3.py +274 -0
  31. {liger_kernel-0.3.1 → liger_kernel-0.4.1}/src/liger_kernel/transformers/model/qwen2.py +123 -2
  32. {liger_kernel-0.3.1 → liger_kernel-0.4.1}/src/liger_kernel/transformers/model/qwen2_vl.py +8 -1
  33. {liger_kernel-0.3.1 → liger_kernel-0.4.1}/src/liger_kernel/transformers/monkey_patch.py +258 -68
  34. {liger_kernel-0.3.1 → liger_kernel-0.4.1}/src/liger_kernel/transformers/rms_norm.py +11 -3
  35. {liger_kernel-0.3.1 → liger_kernel-0.4.1/src/liger_kernel.egg-info}/PKG-INFO +63 -29
  36. {liger_kernel-0.3.1 → liger_kernel-0.4.1}/src/liger_kernel.egg-info/SOURCES.txt +9 -0
  37. {liger_kernel-0.3.1 → liger_kernel-0.4.1}/src/liger_kernel.egg-info/requires.txt +2 -1
  38. liger_kernel-0.3.1/NOTICE +0 -4
  39. liger_kernel-0.3.1/src/liger_kernel/transformers/cross_entropy.py +0 -21
  40. liger_kernel-0.3.1/src/liger_kernel/transformers/functional.py +0 -19
  41. liger_kernel-0.3.1/src/liger_kernel/transformers/fused_linear_cross_entropy.py +0 -21
  42. liger_kernel-0.3.1/src/liger_kernel/transformers/model/llama.py +0 -146
  43. liger_kernel-0.3.1/src/liger_kernel/transformers/model/mixtral.py +0 -158
  44. liger_kernel-0.3.1/src/liger_kernel/transformers/model/phi3.py +0 -136
  45. {liger_kernel-0.3.1 → liger_kernel-0.4.1}/LICENSE +0 -0
  46. {liger_kernel-0.3.1 → liger_kernel-0.4.1}/setup.cfg +0 -0
  47. {liger_kernel-0.3.1 → liger_kernel-0.4.1}/src/liger_kernel/ops/__init__.py +0 -0
  48. {liger_kernel-0.3.1 → liger_kernel-0.4.1}/src/liger_kernel/ops/experimental/embedding.py +0 -0
  49. {liger_kernel-0.3.1 → liger_kernel-0.4.1}/src/liger_kernel/ops/layer_norm.py +0 -0
  50. {liger_kernel-0.3.1 → liger_kernel-0.4.1}/src/liger_kernel/ops/rope.py +0 -0
  51. {liger_kernel-0.3.1 → liger_kernel-0.4.1}/src/liger_kernel/transformers/auto_model.py +0 -0
  52. {liger_kernel-0.3.1 → liger_kernel-0.4.1}/src/liger_kernel/transformers/experimental/embedding.py +0 -0
  53. {liger_kernel-0.3.1 → liger_kernel-0.4.1}/src/liger_kernel/transformers/geglu.py +0 -0
  54. {liger_kernel-0.3.1 → liger_kernel-0.4.1}/src/liger_kernel/transformers/kl_div.py +0 -0
  55. {liger_kernel-0.3.1 → liger_kernel-0.4.1}/src/liger_kernel/transformers/layer_norm.py +0 -0
  56. {liger_kernel-0.3.1 → liger_kernel-0.4.1}/src/liger_kernel/transformers/model/__init__.py +0 -0
  57. {liger_kernel-0.3.1 → liger_kernel-0.4.1}/src/liger_kernel/transformers/rope.py +0 -0
  58. {liger_kernel-0.3.1 → liger_kernel-0.4.1}/src/liger_kernel/transformers/swiglu.py +0 -0
  59. {liger_kernel-0.3.1 → liger_kernel-0.4.1}/src/liger_kernel/transformers/trainer_integration.py +0 -0
  60. {liger_kernel-0.3.1 → liger_kernel-0.4.1}/src/liger_kernel/triton/__init__.py +0 -0
  61. {liger_kernel-0.3.1 → liger_kernel-0.4.1}/src/liger_kernel/triton/monkey_patch.py +0 -0
  62. {liger_kernel-0.3.1 → liger_kernel-0.4.1}/src/liger_kernel.egg-info/dependency_links.txt +0 -0
  63. {liger_kernel-0.3.1 → liger_kernel-0.4.1}/src/liger_kernel.egg-info/top_level.txt +0 -0
liger_kernel-0.4.1/NOTICE
@@ -0,0 +1,58 @@
+ Copyright 2024 LinkedIn Corporation
+ All Rights Reserved.
+
+ Licensed under the BSD 2-Clause License (the "License"). See License in the project root for license information.
+
+ This product includes software developed by LinkedIn Corporation.
+
+ This product contains code derived from the following open source projects:
+
+ 1. Unsloth
+ Copyright (c) 2023 Unsloth AI
+ Licensed under the Apache License, Version 2.0
+ Source: https://github.com/unslothai/unsloth
+
+ The `calculate_settings` function to determine block size and warp is reused for Norm and MLP operations.
+ Modifications and additions were made to the RMS Norm implementation.
+
+ 2. Triton
+ Copyright (c) 2023 OpenAI
+ Licensed under the MIT License
+ Source: https://github.com/openai/triton
+
+ Modifications were made based on Triton tutorials for the RMS Norm implementation.
+
+ 3. Efficient Cross Entropy
+ Copyright (c) 2023 Mohamed Malek
+ Licensed under the MIT License
+ Source: https://github.com/mgmalek/efficient_cross_entropy
+
+ The idea of gradient-in-forward and chunking was used in the Linear Cross Entropy implementation.
+
+ 4. Flash Attention
+ Copyright (c) 2023 Tri Dao, Daniel Y. Fu, Stefano Ermon, Atri Rudra, Christopher Ré
+ Licensed under the BSD 3-Clause License
+ Source: https://github.com/Dao-AILab/flash-attention
+
+ Optimization ideas such as tiling and recomputation were inspired by this work.
+
+ 5. AutoAWQ
+ Copyright (c) 2023 Casper Hansen
+ Licensed under the MIT License
+ Source: https://github.com/casper-hansen/AutoAWQ
+
+ The design of the automodel was referenced from this project.
+
+ 6. llm.c
+ Copyright (c) 2023 Andrej Karpathy
+ Licensed under the MIT License
+ Source: https://github.com/karpathy/llm.c
+
+ The design of end-to-end testing was referenced from this project.
+
+ 7. Tiny Shakespeare Dataset
+ Source: https://huggingface.co/datasets/karpathy/tiny_shakespeare
+
+ This dataset is used to conduct convergence tests on mini models.
+
+ For full license texts, please refer to the respective project repositories.
PKG-INFO
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: liger_kernel
- Version: 0.3.1
+ Version: 0.4.1
  Summary: Efficient Triton kernels for LLM Training
  License: BSD 2-CLAUSE LICENSE
  Copyright 2024 LinkedIn Corporation
@@ -31,7 +31,7 @@ Description-Content-Type: text/markdown
  License-File: LICENSE
  License-File: NOTICE
  Requires-Dist: torch>=2.1.2
- Requires-Dist: triton>=2.3.0
+ Requires-Dist: triton>=2.3.1
  Provides-Extra: transformers
  Requires-Dist: transformers~=4.0; extra == "transformers"
  Provides-Extra: dev
@@ -42,8 +42,11 @@ Requires-Dist: black>=24.4.2; extra == "dev"
  Requires-Dist: isort>=5.13.2; extra == "dev"
  Requires-Dist: pytest>=7.1.2; extra == "dev"
  Requires-Dist: datasets>=2.19.2; extra == "dev"
+ Requires-Dist: torchvision>=0.16.2; extra == "dev"
  Requires-Dist: seaborn; extra == "dev"

+ <a name="readme-top"></a>
+
  # Liger Kernel: Efficient Triton Kernels for LLM Training


@@ -52,6 +55,7 @@ Requires-Dist: seaborn; extra == "dev"
  <th style="padding: 10px;" colspan="2">Stable</th>
  <th style="padding: 10px;" colspan="2">Nightly</th>
  <th style="padding: 10px;">Discord</th>
+ <th style="padding: 10px;">Gurubase (experimental)</th>
  </tr>
  <tr>
  <td style="padding: 10px;">
@@ -79,6 +83,11 @@ Requires-Dist: seaborn; extra == "dev"
  <img src="https://dcbadge.vercel.app/api/server/gpumode?style=flat" alt="Join Our Discord">
  </a>
  </td>
+ <td style="padding: 10px;">
+ <a href="https://gurubase.io/g/liger-kernel">
+ <img src="https://img.shields.io/badge/Gurubase-Ask%20Liger%20Kernel%20Guru-006BFF" alt="Ask Liger Kernel Guru">
+ </a>
+ </td>
  </tr>
  </table>

@@ -86,11 +95,12 @@ Requires-Dist: seaborn; extra == "dev"

  <img src="https://raw.githubusercontent.com/linkedin/Liger-Kernel/main/docs/images/logo-banner.png">

- [Installation](#installation) | [Getting Started](#getting-started) | [Examples](#examples) | [APIs](#apis) | [Structure](#structure) | [Contributing](#contributing) | [Acknowledgement](#acknowledgement)
+ [Installation](#installation) | [Getting Started](#getting-started) | [Examples](#examples) | [APIs](#apis) | [Cite our work](#cite-this-work)

  <details>
  <summary>Latest News 🔥</summary>
-
+
+ - [2024/10/21] We have released the tech report of Liger Kernel on Arxiv: https://arxiv.org/pdf/2410.10989
  - [2024/9/6] We release v0.2.1 ([X post](https://x.com/liger_kernel/status/1832168197002510649)). 2500+ Stars, 10+ New Contributors, 50+ PRs, 50k Downloads in two weeks!
  - [2024/8/31] CUDA MODE talk, [Liger-Kernel: Real-world Triton kernel for LLM Training](https://youtu.be/gWble4FreV4?si=dxPeIchhkJ36Mbns), [Slides](https://github.com/cuda-mode/lectures?tab=readme-ov-file#lecture-28-liger-kernel)
  - [2024/8/23] Official release: check out our [X post](https://x.com/hsu_byron/status/1827072737673982056)
@@ -148,11 +158,18 @@ With one line of code, Liger Kernel can increase throughput by more than 20% and

  ## Installation

- ### Dependencies
+ ### Dependencies
+
+ #### CUDA

  - `torch >= 2.1.2`
  - `triton >= 2.3.0`

+ #### ROCm
+
+ - `torch >= 2.5.0` Install according to the instruction in Pytorch official webpage.
+ - `triton >= 3.0.0` Install from pypi. (e.g. `pip install triton==3.0.0`)
+
  ### Optional Dependencies

  - `transformers >= 4.x`: Required if you plan to use the transformers models patching APIs. The specific model you are working will dictate the minimum version of transformers.
@@ -182,6 +199,7 @@ pip install -e .
  pip install -e .[transformers]
  ```

+
  ## Getting Started

  There are a couple of ways to apply Liger kernels, depending on the level of customization required.
@@ -274,10 +292,11 @@ loss.backward()
  | **Model** | **API** | **Supported Operations** |
  |-------------|--------------------------------------------------------------|-------------------------------------------------------------------------|
  | LLaMA 2 & 3 | `liger_kernel.transformers.apply_liger_kernel_to_llama` | RoPE, RMSNorm, SwiGLU, CrossEntropyLoss, FusedLinearCrossEntropy |
+ | LLaMA 3.2-Vision | `liger_kernel.transformers.apply_liger_kernel_to_mllama` | RoPE, RMSNorm, SwiGLU, CrossEntropyLoss, FusedLinearCrossEntropy |
  | Mistral | `liger_kernel.transformers.apply_liger_kernel_to_mistral` | RoPE, RMSNorm, SwiGLU, CrossEntropyLoss, FusedLinearCrossEntropy |
  | Mixtral | `liger_kernel.transformers.apply_liger_kernel_to_mixtral` | RoPE, RMSNorm, SwiGLU, CrossEntropyLoss, FusedLinearCrossEntropy |
  | Gemma1 | `liger_kernel.transformers.apply_liger_kernel_to_gemma` | RoPE, RMSNorm, GeGLU, CrossEntropyLoss, FusedLinearCrossEntropy |
- | Gemma2 | `liger_kernel.transformers.apply_liger_kernel_to_gemma2` | RoPE, RMSNorm, GeGLU, CrossEntropyLoss |
+ | Gemma2 | `liger_kernel.transformers.apply_liger_kernel_to_gemma2` | RoPE, RMSNorm, GeGLU, CrossEntropyLoss, FusedLinearCrossEntropy |
  | Qwen2 & Qwen2.5 | `liger_kernel.transformers.apply_liger_kernel_to_qwen2` | RoPE, RMSNorm, SwiGLU, CrossEntropyLoss, FusedLinearCrossEntropy |
  | Qwen2-VL | `liger_kernel.transformers.apply_liger_kernel_to_qwen2_vl` | RMSNorm, LayerNorm, SwiGLU, CrossEntropyLoss, FusedLinearCrossEntropy |
  | Phi3 & Phi3.5 | `liger_kernel.transformers.apply_liger_kernel_to_phi3` | RoPE, RMSNorm, SwiGLU, CrossEntropyLoss, FusedLinearCrossEntropy |
@@ -296,9 +315,12 @@ loss.backward()
  | CrossEntropy | `liger_kernel.transformers.LigerCrossEntropyLoss` |
  | FusedLinearCrossEntropy | `liger_kernel.transformers.LigerFusedLinearCrossEntropyLoss`|
  | KLDivergence | `liger_kernel.transformers.LigerKLDIVLoss` |
+ | JSD | `liger_kernel.transformers.LigerJSD` |
+ | FusedLinearJSD | `liger_kernel.transformers.LigerFusedLinearJSD` |

  - **RMSNorm**: [RMSNorm](https://arxiv.org/pdf/1910.07467), which normalizes activations using their root mean square, is implemented by fusing the normalization and scaling steps into a single Triton kernel, and achieves ~3X speedup with ~3X peak memory reduction.
  - **LayerNorm**: [LayerNorm](https://arxiv.org/pdf/1607.06450), which centers and normalizes activations across the feature dimension, is implemented by fusing the centering, normalization and scaling steps into a single Triton kernel, and achieves ~2X speedup.
+ - **GroupNorm**: [GroupNorm](https://arxiv.org/pdf/1803.08494), which normalizes activations across the group dimension for a given sample. Channels are grouped in K groups over which the normalization is performed, is implemented by fusing the centering, normalization and scaling steps into a single Triton kernel, and can achieve up to ~2X speedup as the number of channels/groups increases.
  - **RoPE**: [Rotary Positional Embedding](https://arxiv.org/pdf/2104.09864) is implemented by fusing the query and key embedding rotary into a single kernel with inplace replacement, and achieves ~3X speedup with ~3X peak memory reduction.
  - **SwiGLU**: [Swish Gated Linear Units](https://arxiv.org/pdf/2002.05202), given by
  $$\text{SwiGLU}(x)=\text{Swish}_{\beta}(xW+b)\otimes(xV+c)$$
@@ -310,35 +332,23 @@ $$\text{GeGLU}(x)=\text{GELU}(xW+b)\otimes(xV+c)$$
  <!-- TODO: verify vocab sizes are accurate -->
  - **FusedLinearCrossEntropy**: Peak memory usage of cross entropy loss is further improved by fusing the model head with the CE loss and chunking the input for block-wise loss and gradient calculation, a technique inspired by [Efficient Cross Entropy](https://github.com/mgmalek/efficient_cross_entropy). It achieves >4X memory reduction for 128k vocab size. **This is highly effective for large batch size, large sequence length, and large vocabulary sizes.** Please refer to the [Medusa example](https://github.com/linkedin/Liger-Kernel/tree/main/examples/medusa) for individual kernel usage.
  - **KLDivergence**: [KL Divergence](https://pytorch.org/docs/stable/generated/torch.nn.KLDivLoss.html) is implemented by fusing the forward into a single triton kernel, with reduction done outside the kernel. It achieves ~1.5X speed and ~15% memory reduction for 128K vocab size.
+ - **JSD**: [Generalized JSD](https://arxiv.org/pdf/2306.13649) (Jensen-Shannon divergence), is implemented by computing both the loss and gradient in the forward pass. It achieves ~1.5X speed and ~54% memory reduction for 128k vocab size.
+ - **FusedLinearJSD**: Peak memory usage of JSD loss is further improved by fusing the model head with the JSD and chunking the input for block-wise loss and gradient calculation. It achieves ~85% memory reduction for 128k vocab size where batch size $\times$ sequence length is 8192.
+

  ### Experimental Kernels

  | **Kernel** | **API** |
  |---------------------------------|-------------------------------------------------------------|
  | Embedding | `liger_kernel.transformers.experimental.LigerEmbedding` |
-
+ | Matmul int2xint8 | `liger_kernel.transformers.experimental.matmul`

  - **Embedding**: [Embedding](https://pytorch.org/docs/stable/generated/torch.nn.Embedding.html) is implemented by fusing embedding lookup and output operations. It achieves a peak speedup of ~1.5x in the forward pass and an overall speedup of ~1.1x.
-
+ - **Matmul int2xint8**: is implemented by using the cache tiled matrix multiplication and by fusing the matmul with the unpacking process which achieves a considerable speed up and performs on par with @torch.compile
  <!-- TODO: be more specific about batch size -->
  > **Note:**
  > Reported speedups and memory reductions are with respect to the LLaMA 3-8B Hugging Face layer implementations. All models use 4K hidden size and 4K sequence length and are evaluated based on memory usage and wall time for the forward+backward pass on a single NVIDIA A100 80G GPU using small batch sizes. Liger kernels exhibit more efficient scaling to larger batch sizes, detailed further in the [Benchmark](./benchmark) folder.

- ## Note on ML Compiler
-
- ### Torch Compile
-
- Since Liger Kernel is 100% Triton-based, it works seamlessly with [`torch.compile`](https://pytorch.org/tutorials/intermediate/torch_compile_tutorial.html). In the following example, Liger Kernel can further optimize the model on top of Torch Compile, reducing the memory by more than half.
-
- | Configuration | Throughput (tokens/sec) | Memory Reserved (GB) |
- |--------------------------------|----------------------------|-------------------------|
- | Torch Compile | 3780 | 66.4 |
- | Torch Compile + Liger Kernel | 3702 | 31.0 |
-
- > **Note:**
- > 1. Benchmark conditions: LLaMA 3-8B, Batch Size = 8, Seq Len = 4096, Data Type = `bf16`, Optimizer = AdamW, Gradient Checkpointing = True, Distributed Strategy = FSDP1 on 8 A100s.
- > 2. Tested on torch `2.5.0.dev20240731+cu118`
-
  ## Contributing

  [CONTRIBUTING GUIDE](https://github.com/linkedin/Liger-Kernel/blob/main/CONTRIBUTING.md)
@@ -372,7 +382,14 @@ Many thanks to the contributors to these projects for their invaluable work that

  ## License

- [BSD 2-CLAUSE](https://github.com/linkedin/Liger-Kernel/blob/main/LICENSE)
+ This project is licensed under the [BSD 2-CLAUSE](https://github.com/linkedin/Liger-Kernel/blob/main/LICENSE) License (see `LICENSE` for details).
+ It also includes components from projects licensed under:
+
+ - Apache License 2.0 (see `LICENSE-APACHE-2.0` for details).
+ - MIT License (see `LICENSE-MIT-AutoAWQ` for details).
+ - MIT License (see `LICENSE-MIT-Efficient Cross Entropy` for details).
+ - MIT License (see `LICENSE-MIT-llmc` for details).
+ - MIT License (see `LICENSE-MIT-triton` for details).

  ## Contact

@@ -383,13 +400,30 @@ Many thanks to the contributors to these projects for their invaluable work that

  Biblatex entry:
  ```bib
- @software{liger2024,
- title = {Liger-Kernel: Efficient Triton Kernels for LLM Training},
- author = {Hsu, Pin-Lun and Dai, Yun and Kothapalli, Vignesh and Song, Qingquan and Tang, Shao and Zhu, Siyu},
- url = {https://github.com/linkedin/Liger-Kernel},
- year = {2024}
+ @article{hsu2024ligerkernelefficienttriton,
+ title={Liger Kernel: Efficient Triton Kernels for LLM Training},
+ author={Pin-Lun Hsu and Yun Dai and Vignesh Kothapalli and Qingquan Song and Shao Tang and Siyu Zhu and Steven Shimizu and Shivam Sahni and Haowen Ning and Yanning Chen},
+ year={2024},
+ eprint={2410.10989},
+ archivePrefix={arXiv},
+ primaryClass={cs.LG},
+ url={https://arxiv.org/abs/2410.10989},
+ journal={arXiv preprint arXiv:2410.10989},
  }
  ```

  ## Star History
  [![Star History Chart](https://api.star-history.com/svg?repos=linkedin/Liger-Kernel&type=Date)](https://star-history.com/#linkedin/Liger-Kernel&Date)
+
+ ## Contributors
+
+ <a href="https://github.com/linkedin/Liger-Kernel/graphs/contributors">
+ <img alt="contributors" src="https://contrib.rocks/image?repo=linkedin/Liger-Kernel"/>
+ </a>
+
+ <p align="right" style="font-size: 14px; color: #555; margin-top: 20px;">
+ <a href="#readme-top" style="text-decoration: none; color: #007bff; font-weight: bold;">
+ ↑ Back to Top ↑
+ </a>
+ </p>
+
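The README text carried in PKG-INFO above documents the one-line patching APIs, including the `apply_liger_kernel_to_mllama` entry added in 0.4.1. Below is a minimal usage sketch, not part of the diff: it assumes the patch is applied before the Hugging Face model is instantiated, and the checkpoint name and dtype are illustrative placeholders.

```python
# Hypothetical sketch of the patching API listed in the model table above.
# Assumption: apply_liger_kernel_to_llama() must run before the model is built
# so the patched modules (RoPE, RMSNorm, SwiGLU, CrossEntropy) take effect.
import torch
import transformers
from liger_kernel.transformers import apply_liger_kernel_to_llama

apply_liger_kernel_to_llama()  # one-line monkey patch for LLaMA 2 & 3

model = transformers.AutoModelForCausalLM.from_pretrained(
    "meta-llama/Meta-Llama-3-8B",  # placeholder checkpoint
    torch_dtype=torch.bfloat16,
)
# ...train as usual; apply_liger_kernel_to_mllama() is the analogous call
# for the newly supported LLaMA 3.2-Vision models.
```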
README.md
@@ -1,3 +1,5 @@
+ <a name="readme-top"></a>
+
  # Liger Kernel: Efficient Triton Kernels for LLM Training


@@ -6,6 +8,7 @@
  <th style="padding: 10px;" colspan="2">Stable</th>
  <th style="padding: 10px;" colspan="2">Nightly</th>
  <th style="padding: 10px;">Discord</th>
+ <th style="padding: 10px;">Gurubase (experimental)</th>
  </tr>
  <tr>
  <td style="padding: 10px;">
@@ -33,6 +36,11 @@
  <img src="https://dcbadge.vercel.app/api/server/gpumode?style=flat" alt="Join Our Discord">
  </a>
  </td>
+ <td style="padding: 10px;">
+ <a href="https://gurubase.io/g/liger-kernel">
+ <img src="https://img.shields.io/badge/Gurubase-Ask%20Liger%20Kernel%20Guru-006BFF" alt="Ask Liger Kernel Guru">
+ </a>
+ </td>
  </tr>
  </table>

@@ -40,11 +48,12 @@

  <img src="https://raw.githubusercontent.com/linkedin/Liger-Kernel/main/docs/images/logo-banner.png">

- [Installation](#installation) | [Getting Started](#getting-started) | [Examples](#examples) | [APIs](#apis) | [Structure](#structure) | [Contributing](#contributing) | [Acknowledgement](#acknowledgement)
+ [Installation](#installation) | [Getting Started](#getting-started) | [Examples](#examples) | [APIs](#apis) | [Cite our work](#cite-this-work)

  <details>
  <summary>Latest News 🔥</summary>
-
+
+ - [2024/10/21] We have released the tech report of Liger Kernel on Arxiv: https://arxiv.org/pdf/2410.10989
  - [2024/9/6] We release v0.2.1 ([X post](https://x.com/liger_kernel/status/1832168197002510649)). 2500+ Stars, 10+ New Contributors, 50+ PRs, 50k Downloads in two weeks!
  - [2024/8/31] CUDA MODE talk, [Liger-Kernel: Real-world Triton kernel for LLM Training](https://youtu.be/gWble4FreV4?si=dxPeIchhkJ36Mbns), [Slides](https://github.com/cuda-mode/lectures?tab=readme-ov-file#lecture-28-liger-kernel)
  - [2024/8/23] Official release: check out our [X post](https://x.com/hsu_byron/status/1827072737673982056)
@@ -102,11 +111,18 @@ With one line of code, Liger Kernel can increase throughput by more than 20% and

  ## Installation

- ### Dependencies
+ ### Dependencies
+
+ #### CUDA

  - `torch >= 2.1.2`
  - `triton >= 2.3.0`

+ #### ROCm
+
+ - `torch >= 2.5.0` Install according to the instruction in Pytorch official webpage.
+ - `triton >= 3.0.0` Install from pypi. (e.g. `pip install triton==3.0.0`)
+
  ### Optional Dependencies

  - `transformers >= 4.x`: Required if you plan to use the transformers models patching APIs. The specific model you are working will dictate the minimum version of transformers.
@@ -136,6 +152,7 @@ pip install -e .
  pip install -e .[transformers]
  ```

+
  ## Getting Started

  There are a couple of ways to apply Liger kernels, depending on the level of customization required.
@@ -228,10 +245,11 @@ loss.backward()
  | **Model** | **API** | **Supported Operations** |
  |-------------|--------------------------------------------------------------|-------------------------------------------------------------------------|
  | LLaMA 2 & 3 | `liger_kernel.transformers.apply_liger_kernel_to_llama` | RoPE, RMSNorm, SwiGLU, CrossEntropyLoss, FusedLinearCrossEntropy |
+ | LLaMA 3.2-Vision | `liger_kernel.transformers.apply_liger_kernel_to_mllama` | RoPE, RMSNorm, SwiGLU, CrossEntropyLoss, FusedLinearCrossEntropy |
  | Mistral | `liger_kernel.transformers.apply_liger_kernel_to_mistral` | RoPE, RMSNorm, SwiGLU, CrossEntropyLoss, FusedLinearCrossEntropy |
  | Mixtral | `liger_kernel.transformers.apply_liger_kernel_to_mixtral` | RoPE, RMSNorm, SwiGLU, CrossEntropyLoss, FusedLinearCrossEntropy |
  | Gemma1 | `liger_kernel.transformers.apply_liger_kernel_to_gemma` | RoPE, RMSNorm, GeGLU, CrossEntropyLoss, FusedLinearCrossEntropy |
- | Gemma2 | `liger_kernel.transformers.apply_liger_kernel_to_gemma2` | RoPE, RMSNorm, GeGLU, CrossEntropyLoss |
+ | Gemma2 | `liger_kernel.transformers.apply_liger_kernel_to_gemma2` | RoPE, RMSNorm, GeGLU, CrossEntropyLoss, FusedLinearCrossEntropy |
  | Qwen2 & Qwen2.5 | `liger_kernel.transformers.apply_liger_kernel_to_qwen2` | RoPE, RMSNorm, SwiGLU, CrossEntropyLoss, FusedLinearCrossEntropy |
  | Qwen2-VL | `liger_kernel.transformers.apply_liger_kernel_to_qwen2_vl` | RMSNorm, LayerNorm, SwiGLU, CrossEntropyLoss, FusedLinearCrossEntropy |
  | Phi3 & Phi3.5 | `liger_kernel.transformers.apply_liger_kernel_to_phi3` | RoPE, RMSNorm, SwiGLU, CrossEntropyLoss, FusedLinearCrossEntropy |
@@ -250,9 +268,12 @@ loss.backward()
  | CrossEntropy | `liger_kernel.transformers.LigerCrossEntropyLoss` |
  | FusedLinearCrossEntropy | `liger_kernel.transformers.LigerFusedLinearCrossEntropyLoss`|
  | KLDivergence | `liger_kernel.transformers.LigerKLDIVLoss` |
+ | JSD | `liger_kernel.transformers.LigerJSD` |
+ | FusedLinearJSD | `liger_kernel.transformers.LigerFusedLinearJSD` |

  - **RMSNorm**: [RMSNorm](https://arxiv.org/pdf/1910.07467), which normalizes activations using their root mean square, is implemented by fusing the normalization and scaling steps into a single Triton kernel, and achieves ~3X speedup with ~3X peak memory reduction.
  - **LayerNorm**: [LayerNorm](https://arxiv.org/pdf/1607.06450), which centers and normalizes activations across the feature dimension, is implemented by fusing the centering, normalization and scaling steps into a single Triton kernel, and achieves ~2X speedup.
+ - **GroupNorm**: [GroupNorm](https://arxiv.org/pdf/1803.08494), which normalizes activations across the group dimension for a given sample. Channels are grouped in K groups over which the normalization is performed, is implemented by fusing the centering, normalization and scaling steps into a single Triton kernel, and can achieve up to ~2X speedup as the number of channels/groups increases.
  - **RoPE**: [Rotary Positional Embedding](https://arxiv.org/pdf/2104.09864) is implemented by fusing the query and key embedding rotary into a single kernel with inplace replacement, and achieves ~3X speedup with ~3X peak memory reduction.
  - **SwiGLU**: [Swish Gated Linear Units](https://arxiv.org/pdf/2002.05202), given by
  $$\text{SwiGLU}(x)=\text{Swish}_{\beta}(xW+b)\otimes(xV+c)$$
@@ -264,35 +285,23 @@ $$\text{GeGLU}(x)=\text{GELU}(xW+b)\otimes(xV+c)$$
  <!-- TODO: verify vocab sizes are accurate -->
  - **FusedLinearCrossEntropy**: Peak memory usage of cross entropy loss is further improved by fusing the model head with the CE loss and chunking the input for block-wise loss and gradient calculation, a technique inspired by [Efficient Cross Entropy](https://github.com/mgmalek/efficient_cross_entropy). It achieves >4X memory reduction for 128k vocab size. **This is highly effective for large batch size, large sequence length, and large vocabulary sizes.** Please refer to the [Medusa example](https://github.com/linkedin/Liger-Kernel/tree/main/examples/medusa) for individual kernel usage.
  - **KLDivergence**: [KL Divergence](https://pytorch.org/docs/stable/generated/torch.nn.KLDivLoss.html) is implemented by fusing the forward into a single triton kernel, with reduction done outside the kernel. It achieves ~1.5X speed and ~15% memory reduction for 128K vocab size.
+ - **JSD**: [Generalized JSD](https://arxiv.org/pdf/2306.13649) (Jensen-Shannon divergence), is implemented by computing both the loss and gradient in the forward pass. It achieves ~1.5X speed and ~54% memory reduction for 128k vocab size.
+ - **FusedLinearJSD**: Peak memory usage of JSD loss is further improved by fusing the model head with the JSD and chunking the input for block-wise loss and gradient calculation. It achieves ~85% memory reduction for 128k vocab size where batch size $\times$ sequence length is 8192.
+

  ### Experimental Kernels

  | **Kernel** | **API** |
  |---------------------------------|-------------------------------------------------------------|
  | Embedding | `liger_kernel.transformers.experimental.LigerEmbedding` |
-
+ | Matmul int2xint8 | `liger_kernel.transformers.experimental.matmul`

  - **Embedding**: [Embedding](https://pytorch.org/docs/stable/generated/torch.nn.Embedding.html) is implemented by fusing embedding lookup and output operations. It achieves a peak speedup of ~1.5x in the forward pass and an overall speedup of ~1.1x.
-
+ - **Matmul int2xint8**: is implemented by using the cache tiled matrix multiplication and by fusing the matmul with the unpacking process which achieves a considerable speed up and performs on par with @torch.compile
  <!-- TODO: be more specific about batch size -->
  > **Note:**
  > Reported speedups and memory reductions are with respect to the LLaMA 3-8B Hugging Face layer implementations. All models use 4K hidden size and 4K sequence length and are evaluated based on memory usage and wall time for the forward+backward pass on a single NVIDIA A100 80G GPU using small batch sizes. Liger kernels exhibit more efficient scaling to larger batch sizes, detailed further in the [Benchmark](./benchmark) folder.

- ## Note on ML Compiler
-
- ### Torch Compile
-
- Since Liger Kernel is 100% Triton-based, it works seamlessly with [`torch.compile`](https://pytorch.org/tutorials/intermediate/torch_compile_tutorial.html). In the following example, Liger Kernel can further optimize the model on top of Torch Compile, reducing the memory by more than half.
-
- | Configuration | Throughput (tokens/sec) | Memory Reserved (GB) |
- |--------------------------------|----------------------------|-------------------------|
- | Torch Compile | 3780 | 66.4 |
- | Torch Compile + Liger Kernel | 3702 | 31.0 |
-
- > **Note:**
- > 1. Benchmark conditions: LLaMA 3-8B, Batch Size = 8, Seq Len = 4096, Data Type = `bf16`, Optimizer = AdamW, Gradient Checkpointing = True, Distributed Strategy = FSDP1 on 8 A100s.
- > 2. Tested on torch `2.5.0.dev20240731+cu118`
-
  ## Contributing

  [CONTRIBUTING GUIDE](https://github.com/linkedin/Liger-Kernel/blob/main/CONTRIBUTING.md)
@@ -326,7 +335,14 @@ Many thanks to the contributors to these projects for their invaluable work that

  ## License

- [BSD 2-CLAUSE](https://github.com/linkedin/Liger-Kernel/blob/main/LICENSE)
+ This project is licensed under the [BSD 2-CLAUSE](https://github.com/linkedin/Liger-Kernel/blob/main/LICENSE) License (see `LICENSE` for details).
+ It also includes components from projects licensed under:
+
+ - Apache License 2.0 (see `LICENSE-APACHE-2.0` for details).
+ - MIT License (see `LICENSE-MIT-AutoAWQ` for details).
+ - MIT License (see `LICENSE-MIT-Efficient Cross Entropy` for details).
+ - MIT License (see `LICENSE-MIT-llmc` for details).
+ - MIT License (see `LICENSE-MIT-triton` for details).

  ## Contact

@@ -337,13 +353,30 @@ Many thanks to the contributors to these projects for their invaluable work that

  Biblatex entry:
  ```bib
- @software{liger2024,
- title = {Liger-Kernel: Efficient Triton Kernels for LLM Training},
- author = {Hsu, Pin-Lun and Dai, Yun and Kothapalli, Vignesh and Song, Qingquan and Tang, Shao and Zhu, Siyu},
- url = {https://github.com/linkedin/Liger-Kernel},
- year = {2024}
+ @article{hsu2024ligerkernelefficienttriton,
+ title={Liger Kernel: Efficient Triton Kernels for LLM Training},
+ author={Pin-Lun Hsu and Yun Dai and Vignesh Kothapalli and Qingquan Song and Shao Tang and Siyu Zhu and Steven Shimizu and Shivam Sahni and Haowen Ning and Yanning Chen},
+ year={2024},
+ eprint={2410.10989},
+ archivePrefix={arXiv},
+ primaryClass={cs.LG},
+ url={https://arxiv.org/abs/2410.10989},
+ journal={arXiv preprint arXiv:2410.10989},
  }
  ```

  ## Star History
  [![Star History Chart](https://api.star-history.com/svg?repos=linkedin/Liger-Kernel&type=Date)](https://star-history.com/#linkedin/Liger-Kernel&Date)
+
+ ## Contributors
+
+ <a href="https://github.com/linkedin/Liger-Kernel/graphs/contributors">
+ <img alt="contributors" src="https://contrib.rocks/image?repo=linkedin/Liger-Kernel"/>
+ </a>
+
+ <p align="right" style="font-size: 14px; color: #555; margin-top: 20px;">
+ <a href="#readme-top" style="text-decoration: none; color: #007bff; font-weight: bold;">
+ ↑ Back to Top ↑
+ </a>
+ </p>
+
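Among the kernel entries the README diff adds, JSD and FusedLinearJSD are new in 0.4.1. The sketch below shows how the `LigerJSD` module might be exercised, assuming (as the JSD bullet above suggests) that it compares student and teacher log-probabilities; the argument order, tensor shapes, and CUDA requirement are assumptions for illustration, not taken from this diff.

```python
# Hedged sketch: compare a "student" distribution against a "teacher" one with
# the new LigerJSD loss. Shapes and argument order are illustrative assumptions.
import torch
from liger_kernel.transformers import LigerJSD

jsd = LigerJSD()  # generalized Jensen-Shannon divergence

batch_tokens, vocab = 16, 4096  # small sizes for illustration
student_logits = torch.randn(batch_tokens, vocab, device="cuda", requires_grad=True)
teacher_logits = torch.randn(batch_tokens, vocab, device="cuda")

loss = jsd(
    torch.log_softmax(student_logits, dim=-1),  # student log-probs
    torch.log_softmax(teacher_logits, dim=-1),  # teacher log-probs
)
loss.backward()
```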
pyproject.toml
@@ -4,14 +4,14 @@ build-backend = "setuptools.build_meta"

  [project]
  name = "liger_kernel"
- version = "0.3.1"
+ version = "0.4.1"
  description = "Efficient Triton kernels for LLM Training"
  urls = { "Homepage" = "https://github.com/linkedin/Liger-Kernel" }
  readme = { file = "README.md", content-type = "text/markdown" }
  license = { file = "LICENSE" }
  dependencies = [
  "torch>=2.1.2",
- "triton>=2.3.0",
+ "triton>=2.3.1",
  ]

  [project.optional-dependencies]
@@ -27,6 +27,7 @@ dev = [
  "isort>=5.13.2",
  "pytest>=7.1.2",
  "datasets>=2.19.2",
+ "torchvision>=0.16.2",
  "seaborn",
  ]

@@ -36,7 +37,7 @@ include = ["liger_kernel", "liger_kernel.*"]

  [tool.pytest.ini_options]
  pythonpath = [
- "src",
+ "src",
  "."
  ]
  asyncio_mode = "auto"
src/liger_kernel/env_report.py
@@ -4,11 +4,13 @@ import sys

  def print_env_report():
  """
+
  Prints a report of the environment. Useful for debugging and reproducibility.
  Usage:
  ```
  python -m liger_kernel.env_report
  ```
+
  """
  print("Environment Report:")
  print("-------------------")