liger-kernel 0.3.0__py3-none-any.whl → 0.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (33)
  1. liger_kernel/ops/cross_entropy.py +5 -39
  2. liger_kernel/ops/experimental/mm_int8int2.py +355 -0
  3. liger_kernel/ops/fused_linear_cross_entropy.py +13 -10
  4. liger_kernel/ops/fused_linear_jsd.py +245 -0
  5. liger_kernel/ops/geglu.py +2 -2
  6. liger_kernel/ops/jsd.py +176 -0
  7. liger_kernel/ops/kl_div.py +45 -34
  8. liger_kernel/ops/rms_norm.py +67 -42
  9. liger_kernel/ops/swiglu.py +2 -2
  10. liger_kernel/ops/utils.py +62 -1
  11. liger_kernel/transformers/__init__.py +3 -0
  12. liger_kernel/transformers/auto_model.py +18 -6
  13. liger_kernel/transformers/functional.py +4 -0
  14. liger_kernel/transformers/fused_linear_jsd.py +98 -0
  15. liger_kernel/transformers/jsd.py +75 -0
  16. liger_kernel/transformers/kl_div.py +3 -2
  17. liger_kernel/transformers/model/gemma.py +124 -1
  18. liger_kernel/transformers/model/llama.py +135 -4
  19. liger_kernel/transformers/model/mistral.py +3 -0
  20. liger_kernel/transformers/model/mixtral.py +153 -2
  21. liger_kernel/transformers/model/mllama.py +274 -0
  22. liger_kernel/transformers/model/phi3.py +140 -2
  23. liger_kernel/transformers/model/qwen2.py +123 -2
  24. liger_kernel/transformers/model/qwen2_vl.py +8 -1
  25. liger_kernel/transformers/monkey_patch.py +254 -129
  26. {liger_kernel-0.3.0.dist-info → liger_kernel-0.4.0.dist-info}/METADATA +74 -35
  27. liger_kernel-0.4.0.dist-info/NOTICE +58 -0
  28. liger_kernel-0.4.0.dist-info/RECORD +48 -0
  29. {liger_kernel-0.3.0.dist-info → liger_kernel-0.4.0.dist-info}/WHEEL +1 -1
  30. liger_kernel-0.3.0.dist-info/NOTICE +0 -4
  31. liger_kernel-0.3.0.dist-info/RECORD +0 -42
  32. {liger_kernel-0.3.0.dist-info → liger_kernel-0.4.0.dist-info}/LICENSE +0 -0
  33. {liger_kernel-0.3.0.dist-info → liger_kernel-0.4.0.dist-info}/top_level.txt +0 -0
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: liger_kernel
- Version: 0.3.0
+ Version: 0.4.0
  Summary: Efficient Triton kernels for LLM Training
  License: BSD 2-CLAUSE LICENSE
  Copyright 2024 LinkedIn Corporation
@@ -31,17 +31,21 @@ Description-Content-Type: text/markdown
  License-File: LICENSE
  License-File: NOTICE
  Requires-Dist: torch>=2.1.2
- Requires-Dist: triton>=2.3.0
- Requires-Dist: transformers>=4.42.0
+ Requires-Dist: triton>=2.3.1
  Provides-Extra: dev
+ Requires-Dist: transformers>=4.44.2; extra == "dev"
  Requires-Dist: matplotlib>=3.7.2; extra == "dev"
  Requires-Dist: flake8>=4.0.1.1; extra == "dev"
  Requires-Dist: black>=24.4.2; extra == "dev"
  Requires-Dist: isort>=5.13.2; extra == "dev"
  Requires-Dist: pytest>=7.1.2; extra == "dev"
  Requires-Dist: datasets>=2.19.2; extra == "dev"
- Requires-Dist: jupyter==1.0.0; extra == "dev"
+ Requires-Dist: torchvision>=0.16.2; extra == "dev"
  Requires-Dist: seaborn; extra == "dev"
+ Provides-Extra: transformers
+ Requires-Dist: transformers~=4.0; extra == "transformers"
+
+ <a name="readme-top"></a>

  # Liger Kernel: Efficient Triton Kernels for LLM Training

@@ -51,6 +55,7 @@ Requires-Dist: seaborn; extra == "dev"
  <th style="padding: 10px;" colspan="2">Stable</th>
  <th style="padding: 10px;" colspan="2">Nightly</th>
  <th style="padding: 10px;">Discord</th>
+ <th style="padding: 10px;">Gurubase (experimental)</th>
  </tr>
  <tr>
  <td style="padding: 10px;">
@@ -74,8 +79,13 @@ Requires-Dist: seaborn; extra == "dev"
  </a>
  </td>
  <td style="padding: 10px;">
- <a href="https://discord.gg/CX2YmNmn">
- <img src="https://dcbadge.vercel.app/api/server/cudamode?style=flat" alt="Join Our Discord">
+ <a href="https://discord.gg/gpumode">
+ <img src="https://dcbadge.vercel.app/api/server/gpumode?style=flat" alt="Join Our Discord">
+ </a>
+ </td>
+ <td style="padding: 10px;">
+ <a href="https://gurubase.io/g/liger-kernel">
+ <img src="https://img.shields.io/badge/Gurubase-Ask%20Liger%20Kernel%20Guru-006BFF" alt="Ask Liger Kernel Guru">
  </a>
  </td>
  </tr>
@@ -85,11 +95,12 @@ Requires-Dist: seaborn; extra == "dev"

  <img src="https://raw.githubusercontent.com/linkedin/Liger-Kernel/main/docs/images/logo-banner.png">

- [Installation](#installation) | [Getting Started](#getting-started) | [Examples](#examples) | [APIs](#apis) | [Structure](#structure) | [Contributing](#contributing) | [Acknowledgement](#acknowledgement)
+ [Installation](#installation) | [Getting Started](#getting-started) | [Examples](#examples) | [APIs](#apis) | [Cite our work](#cite-this-work)

  <details>
  <summary>Latest News 🔥</summary>
-
+
+ - [2024/10/21] We have released the tech report of Liger Kernel on arXiv: https://arxiv.org/pdf/2410.10989
  - [2024/9/6] We release v0.2.1 ([X post](https://x.com/liger_kernel/status/1832168197002510649)). 2500+ Stars, 10+ New Contributors, 50+ PRs, 50k Downloads in two weeks!
  - [2024/8/31] CUDA MODE talk, [Liger-Kernel: Real-world Triton kernel for LLM Training](https://youtu.be/gWble4FreV4?si=dxPeIchhkJ36Mbns), [Slides](https://github.com/cuda-mode/lectures?tab=readme-ov-file#lecture-28-liger-kernel)
  - [2024/8/23] Official release: check out our [X post](https://x.com/hsu_byron/status/1827072737673982056)
@@ -147,11 +158,21 @@ With one line of code, Liger Kernel can increase throughput by more than 20% and

  ## Installation

- ### Dependencies
+ ### Dependencies
+
+ #### CUDA

  - `torch >= 2.1.2`
  - `triton >= 2.3.0`
- - `transformers >= 4.42.0`
+
+ #### ROCm
+
+ - `torch >= 2.5.0` Install according to the instructions on the PyTorch official webpage.
+ - `triton >= 3.0.0` Install from PyPI (e.g. `pip install triton==3.0.0`).
+
+ ### Optional Dependencies
+
+ - `transformers >= 4.x`: Required if you plan to use the transformers models patching APIs. The specific model you are working with will dictate the minimum version of transformers.

  > **Note:**
  > Our kernels inherit the full spectrum of hardware compatibility offered by [Triton](https://github.com/triton-lang/triton).
@@ -174,7 +195,11 @@ To install from source:
  git clone https://github.com/linkedin/Liger-Kernel.git
  cd Liger-Kernel
  pip install -e .
+ # or if using transformers
+ pip install -e .[transformers]
  ```
+
+
  ## Getting Started

  There are a couple of ways to apply Liger kernels, depending on the level of customization required.
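
To make the lowest-effort path concrete, here is a minimal sketch of the model-patching API. It assumes the package is installed with the `transformers` extra and a CUDA (or ROCm) device is available; the checkpoint name is a placeholder. The entry point is the one listed in the APIs table below.

```python
import torch
import transformers

from liger_kernel.transformers import apply_liger_kernel_to_llama

# Patch the Hugging Face Llama modeling code in place so that RoPE, RMSNorm,
# SwiGLU, and the cross-entropy loss are served by Liger's Triton kernels.
# Patch before instantiating the model so the patched classes get constructed.
apply_liger_kernel_to_llama()

model = transformers.AutoModelForCausalLM.from_pretrained(
    "meta-llama/Meta-Llama-3-8B",  # placeholder checkpoint
    torch_dtype=torch.bfloat16,
).to("cuda")
```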
@@ -267,13 +292,14 @@ loss.backward()
  | **Model** | **API** | **Supported Operations** |
  |-------------|--------------------------------------------------------------|-------------------------------------------------------------------------|
  | LLaMA 2 & 3 | `liger_kernel.transformers.apply_liger_kernel_to_llama` | RoPE, RMSNorm, SwiGLU, CrossEntropyLoss, FusedLinearCrossEntropy |
+ | LLaMA 3.2-Vision | `liger_kernel.transformers.apply_liger_kernel_to_mllama` | RoPE, RMSNorm, SwiGLU, CrossEntropyLoss, FusedLinearCrossEntropy |
  | Mistral | `liger_kernel.transformers.apply_liger_kernel_to_mistral` | RoPE, RMSNorm, SwiGLU, CrossEntropyLoss, FusedLinearCrossEntropy |
  | Mixtral | `liger_kernel.transformers.apply_liger_kernel_to_mixtral` | RoPE, RMSNorm, SwiGLU, CrossEntropyLoss, FusedLinearCrossEntropy |
  | Gemma1 | `liger_kernel.transformers.apply_liger_kernel_to_gemma` | RoPE, RMSNorm, GeGLU, CrossEntropyLoss, FusedLinearCrossEntropy |
  | Gemma2 | `liger_kernel.transformers.apply_liger_kernel_to_gemma2` | RoPE, RMSNorm, GeGLU, CrossEntropyLoss |
- | Qwen2 | `liger_kernel.transformers.apply_liger_kernel_to_qwen2` | RoPE, RMSNorm, SwiGLU, CrossEntropyLoss, FusedLinearCrossEntropy |
+ | Qwen2 & Qwen2.5 | `liger_kernel.transformers.apply_liger_kernel_to_qwen2` | RoPE, RMSNorm, SwiGLU, CrossEntropyLoss, FusedLinearCrossEntropy |
  | Qwen2-VL | `liger_kernel.transformers.apply_liger_kernel_to_qwen2_vl` | RMSNorm, LayerNorm, SwiGLU, CrossEntropyLoss, FusedLinearCrossEntropy |
- | Phi3 | `liger_kernel.transformers.apply_liger_kernel_to_phi3` | RoPE, RMSNorm, SwiGLU, CrossEntropyLoss, FusedLinearCrossEntropy |
+ | Phi3 & Phi3.5 | `liger_kernel.transformers.apply_liger_kernel_to_phi3` | RoPE, RMSNorm, SwiGLU, CrossEntropyLoss, FusedLinearCrossEntropy |

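Version 0.4.0 also ships an auto-model wrapper (`liger_kernel/transformers/auto_model.py` in the file list above; its design is credited to AutoAWQ in the NOTICE). A hedged sketch of that path; treat the exact class name and behavior as assumptions to verify against the shipped module:

```python
from liger_kernel.transformers import AutoLigerKernelForCausalLM

# Resolves the model type from the checkpoint config, applies the matching
# apply_liger_kernel_to_* patch from the table above, then delegates to the
# usual AutoModelForCausalLM loading path.
model = AutoLigerKernelForCausalLM.from_pretrained("path/to/checkpoint")  # placeholder path
```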
@@ -289,6 +315,8 @@ loss.backward()
  | CrossEntropy | `liger_kernel.transformers.LigerCrossEntropyLoss` |
  | FusedLinearCrossEntropy | `liger_kernel.transformers.LigerFusedLinearCrossEntropyLoss` |
  | KLDivergence | `liger_kernel.transformers.LigerKLDIVLoss` |
+ | JSD | `liger_kernel.transformers.LigerJSD` |
+ | FusedLinearJSD | `liger_kernel.transformers.LigerFusedLinearJSD` |

  - **RMSNorm**: [RMSNorm](https://arxiv.org/pdf/1910.07467), which normalizes activations using their root mean square, is implemented by fusing the normalization and scaling steps into a single Triton kernel, and achieves ~3X speedup with ~3X peak memory reduction.
  - **LayerNorm**: [LayerNorm](https://arxiv.org/pdf/1607.06450), which centers and normalizes activations across the feature dimension, is implemented by fusing the centering, normalization and scaling steps into a single Triton kernel, and achieves ~2X speedup.
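
To illustrate the drop-in pattern for the modules in the table above, a minimal sketch, assuming `LigerCrossEntropyLoss` mirrors the `torch.nn.CrossEntropyLoss` call convention over flattened `(N, vocab)` logits and that a CUDA device is available for the Triton kernel:

```python
import torch

from liger_kernel.transformers import LigerCrossEntropyLoss

loss_fn = LigerCrossEntropyLoss()

vocab_size = 128256  # e.g. the LLaMA 3 vocabulary
logits = torch.randn(16, vocab_size, device="cuda", requires_grad=True)
targets = torch.randint(0, vocab_size, (16,), device="cuda")

loss = loss_fn(logits, targets)  # same reduction semantics as the torch module
loss.backward()
```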
@@ -303,35 +331,23 @@ $$\text{GeGLU}(x)=\text{GELU}(xW+b)\otimes(xV+c)$$
  <!-- TODO: verify vocab sizes are accurate -->
  - **FusedLinearCrossEntropy**: Peak memory usage of cross entropy loss is further improved by fusing the model head with the CE loss and chunking the input for block-wise loss and gradient calculation, a technique inspired by [Efficient Cross Entropy](https://github.com/mgmalek/efficient_cross_entropy). It achieves >4X memory reduction for 128k vocab size. **This is highly effective for large batch size, large sequence length, and large vocabulary sizes.** Please refer to the [Medusa example](https://github.com/linkedin/Liger-Kernel/tree/main/examples/medusa) for individual kernel usage.
  - **KLDivergence**: [KL Divergence](https://pytorch.org/docs/stable/generated/torch.nn.KLDivLoss.html) is implemented by fusing the forward into a single Triton kernel, with reduction done outside the kernel. It achieves ~1.5X speedup and ~15% memory reduction for 128K vocab size.
+ - **JSD**: [Generalized JSD](https://arxiv.org/pdf/2306.13649) (Jensen-Shannon divergence) is implemented by computing both the loss and gradient in the forward pass. It achieves ~1.5X speedup and ~54% memory reduction for 128k vocab size.
+ - **FusedLinearJSD**: Peak memory usage of JSD loss is further improved by fusing the model head with the JSD and chunking the input for block-wise loss and gradient calculation. It achieves ~85% memory reduction for 128k vocab size where batch size $\times$ sequence length is 8192.
+
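
A distillation-style sketch of the two JSD bullets above. The argument order (student log-probabilities first, then teacher) and the `beta` interpolation weight are assumptions to verify against the shipped `liger_kernel/transformers/jsd.py`:

```python
import torch
import torch.nn.functional as F

from liger_kernel.transformers import LigerJSD

jsd = LigerJSD(beta=0.5)  # beta=0.5 is assumed to recover the symmetric JSD

vocab_size = 128256
student_logits = torch.randn(16, vocab_size, device="cuda", requires_grad=True)
teacher_logits = torch.randn(16, vocab_size, device="cuda")

# Loss and gradient are both produced in the forward pass, per the bullet above.
loss = jsd(
    F.log_softmax(student_logits, dim=-1),  # assumed: student log-probs first
    F.log_softmax(teacher_logits, dim=-1),  # teacher log-probs
)
loss.backward()
```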

  ### Experimental Kernels

  | **Kernel** | **API** |
  |---------------------------------|-------------------------------------------------------------|
  | Embedding | `liger_kernel.transformers.experimental.LigerEmbedding` |
-
+ | Matmul int2xint8 | `liger_kernel.transformers.experimental.matmul` |

  - **Embedding**: [Embedding](https://pytorch.org/docs/stable/generated/torch.nn.Embedding.html) is implemented by fusing embedding lookup and output operations. It achieves a peak speedup of ~1.5x in the forward pass and an overall speedup of ~1.1x.
-
+ - **Matmul int2xint8**: implemented using cache-tiled matrix multiplication and by fusing the matmul with the unpacking process, which achieves a considerable speedup and performs on par with `@torch.compile`.
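
For the experimental embedding kernel, a minimal sketch, assuming `LigerEmbedding` mirrors the `torch.nn.Embedding` constructor (the import path is taken from the table above; the sizes are placeholders):

```python
import torch

from liger_kernel.transformers.experimental import LigerEmbedding

# Drop-in for torch.nn.Embedding with a fused Triton lookup kernel.
emb = LigerEmbedding(num_embeddings=128256, embedding_dim=4096).to("cuda")

token_ids = torch.randint(0, 128256, (2, 1024), device="cuda")
hidden = emb(token_ids)  # -> (2, 1024, 4096), fused lookup + output write
```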
  <!-- TODO: be more specific about batch size -->
  > **Note:**
  > Reported speedups and memory reductions are with respect to the LLaMA 3-8B Hugging Face layer implementations. All models use 4K hidden size and 4K sequence length and are evaluated based on memory usage and wall time for the forward+backward pass on a single NVIDIA A100 80G GPU using small batch sizes. Liger kernels exhibit more efficient scaling to larger batch sizes, detailed further in the [Benchmark](./benchmark) folder.

- ## Note on ML Compiler
-
- ### Torch Compile
-
- Since Liger Kernel is 100% Triton-based, it works seamlessly with [`torch.compile`](https://pytorch.org/tutorials/intermediate/torch_compile_tutorial.html). In the following example, Liger Kernel can further optimize the model on top of Torch Compile, reducing the memory by more than half.
-
- | Configuration | Throughput (tokens/sec) | Memory Reserved (GB) |
- |--------------------------------|----------------------------|-------------------------|
- | Torch Compile | 3780 | 66.4 |
- | Torch Compile + Liger Kernel | 3702 | 31.0 |
-
- > **Note:**
- > 1. Benchmark conditions: LLaMA 3-8B, Batch Size = 8, Seq Len = 4096, Data Type = `bf16`, Optimizer = AdamW, Gradient Checkpointing = True, Distributed Strategy = FSDP1 on 8 A100s.
- > 2. Tested on torch `2.5.0.dev20240731+cu118`
-
  ## Contributing

  [CONTRIBUTING GUIDE](https://github.com/linkedin/Liger-Kernel/blob/main/CONTRIBUTING.md)
@@ -365,7 +381,14 @@ Many thanks to the contributors to these projects for their invaluable work that
365
381
 
366
382
  ## License
367
383
 
368
- [BSD 2-CLAUSE](https://github.com/linkedin/Liger-Kernel/blob/main/LICENSE)
384
+ This project is licensed under the [BSD 2-CLAUSE](https://github.com/linkedin/Liger-Kernel/blob/main/LICENSE) License (see `LICENSE` for details).
385
+ It also includes components from projects licensed under:
386
+
387
+ - Apache License 2.0 (see `LICENSE-APACHE-2.0` for details).
388
+ - MIT License (see `LICENSE-MIT-AutoAWQ` for details).
389
+ - MIT License (see `LICENSE-MIT-Efficient Cross Entropy` for details).
390
+ - MIT License (see `LICENSE-MIT-llmc` for details).
391
+ - MIT License (see `LICENSE-MIT-triton` for details).
369
392
 
370
393
  ## Contact
371
394
 
@@ -376,13 +399,29 @@ Many thanks to the contributors to these projects for their invaluable work that

  BibLaTeX entry:
  ```bib
- @software{liger2024,
- title = {Liger-Kernel: Efficient Triton Kernels for LLM Training},
- author = {Hsu, Pin-Lun and Dai, Yun and Kothapalli, Vignesh and Song, Qingquan and Tang, Shao and Zhu, Siyu},
- url = {https://github.com/linkedin/Liger-Kernel},
- year = {2024}
+ @article{hsu2024ligerkernelefficienttriton,
+ title={Liger Kernel: Efficient Triton Kernels for LLM Training},
+ author={Pin-Lun Hsu and Yun Dai and Vignesh Kothapalli and Qingquan Song and Shao Tang and Siyu Zhu and Steven Shimizu and Shivam Sahni and Haowen Ning and Yanning Chen},
+ year={2024},
+ eprint={2410.10989},
+ archivePrefix={arXiv},
+ primaryClass={cs.LG},
+ url={https://arxiv.org/abs/2410.10989},
+ journal={arXiv preprint arXiv:2410.10989},
  }
  ```

  ## Star History
  [![Star History Chart](https://api.star-history.com/svg?repos=linkedin/Liger-Kernel&type=Date)](https://star-history.com/#linkedin/Liger-Kernel&Date)
+
+ ## Contributors
+
+ <a href="https://github.com/linkedin/Liger-Kernel/graphs/contributors">
+ <img alt="contributors" src="https://contrib.rocks/image?repo=linkedin/Liger-Kernel"/>
+ </a>
+
+ <p align="right" style="font-size: 14px; color: #555; margin-top: 20px;">
+ <a href="#readme-top" style="text-decoration: none; color: #007bff; font-weight: bold;">
+ ↑ Back to Top ↑
+ </a>
+ </p>
@@ -0,0 +1,58 @@
+ Copyright 2024 LinkedIn Corporation
+ All Rights Reserved.
+
+ Licensed under the BSD 2-Clause License (the "License"). See License in the project root for license information.
+
+ This product includes software developed by LinkedIn Corporation.
+
+ This product contains code derived from the following open source projects:
+
+ 1. Unsloth
+ Copyright (c) 2023 Unsloth AI
+ Licensed under the Apache License, Version 2.0
+ Source: https://github.com/unslothai/unsloth
+
+ The `calculate_settings` function to determine block size and warp is reused for Norm and MLP operations.
+ Modifications and additions were made to the RMS Norm implementation.
+
+ 2. Triton
+ Copyright (c) 2023 OpenAI
+ Licensed under the MIT License
+ Source: https://github.com/openai/triton
+
+ Modifications were made based on Triton tutorials for the RMS Norm implementation.
+
+ 3. Efficient Cross Entropy
+ Copyright (c) 2023 Mohamed Malek
+ Licensed under the MIT License
+ Source: https://github.com/mgmalek/efficient_cross_entropy
+
+ The idea of gradient-in-forward and chunking was used in the Linear Cross Entropy implementation.
+
+ 4. Flash Attention
+ Copyright (c) 2023 Tri Dao, Daniel Y. Fu, Stefano Ermon, Atri Rudra, Christopher Ré
+ Licensed under the BSD 3-Clause License
+ Source: https://github.com/Dao-AILab/flash-attention
+
+ Optimization ideas such as tiling and recomputation were inspired by this work.
+
+ 5. AutoAWQ
+ Copyright (c) 2023 Casper Hansen
+ Licensed under the MIT License
+ Source: https://github.com/casper-hansen/AutoAWQ
+
+ The design of the automodel was referenced from this project.
+
+ 6. llm.c
+ Copyright (c) 2023 Andrej Karpathy
+ Licensed under the MIT License
+ Source: https://github.com/karpathy/llm.c
+
+ The design of end-to-end testing was referenced from this project.
+
+ 7. Tiny Shakespeare Dataset
+ Source: https://huggingface.co/datasets/karpathy/tiny_shakespeare
+
+ This dataset is used to conduct convergence tests on mini models.
+
+ For full license texts, please refer to the respective project repositories.
@@ -0,0 +1,48 @@
+ liger_kernel/env_report.py,sha256=LFUJ6UMkFFGPBYXBlqHFGy4bhsemEpSI-_1edSazlHI,1130
+ liger_kernel/ops/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ liger_kernel/ops/cross_entropy.py,sha256=23Di7l0T20OBj8K3-0PYEA5FCJrrbiKs3xMGyLlzbtg,11248
+ liger_kernel/ops/fused_linear_cross_entropy.py,sha256=M-cF4BO-vvso2BIdk7-Q2FleeFPhqSQwZR1EirPC4OE,9456
+ liger_kernel/ops/fused_linear_jsd.py,sha256=5D_obamh08lGGTMyh85kBJD_aNjPhOYf4-TmCZ6m4s4,9626
+ liger_kernel/ops/geglu.py,sha256=MQL4zyzneZqZYUGPvb1QjI_EYT9_pKfSDgR25WD9jrI,4127
+ liger_kernel/ops/jsd.py,sha256=anWfdioucxZy4JQfTvbHBR-IQrZKeH-gBF1MHwwTuTQ,5781
+ liger_kernel/ops/kl_div.py,sha256=03FNXfvCb6M-56hhFepAFV9p6brArPR6KOKkdGD34mw,8374
+ liger_kernel/ops/layer_norm.py,sha256=unGMYMOPqtkM9aTrokhcqgPmsV2AUN7Yzv86isVB9OI,7422
+ liger_kernel/ops/rms_norm.py,sha256=9S9wyZLmzNyJlBxV4vbv4p5es7bGP-m_5wK9JC6JIdA,10911
+ liger_kernel/ops/rope.py,sha256=jrzaA9-6Orn44y_IIam9_YNPQxOFK2FrIRNfFea4EtU,8513
+ liger_kernel/ops/swiglu.py,sha256=Fwxtd76rhHKT9ShQAGca9RsnASplAVxtYKHmiT73_yA,2994
+ liger_kernel/ops/utils.py,sha256=3JSF--O7KT5Wa5BuO70M4h0XetxoZ_e9IoW9GRlxlBg,3777
+ liger_kernel/ops/experimental/embedding.py,sha256=LYR66dB-jhvhtUjeV4PnNro-n77J1mdlmpSLSxB3Y6U,4186
+ liger_kernel/ops/experimental/mm_int8int2.py,sha256=JpGVZCgRC6T8XMUJ_QbZRS2XU1bh0urIZphs5DTc1mY,13358
+ liger_kernel/transformers/__init__.py,sha256=gia-eBxr7TLxU0GdDf8AfCY4WgDlFLqIGSt7EoQGsBA,1336
+ liger_kernel/transformers/auto_model.py,sha256=RMIwQHSiXoksXFTIqFZ4PLBgoqkxJJAT3q1Qh47bGN8,1552
+ liger_kernel/transformers/cross_entropy.py,sha256=gL30VByCSA_iQSkhV6no70x_IUqqFSTMJdytppico_w,804
+ liger_kernel/transformers/functional.py,sha256=zlQ1yKOIZe-ZGmFicRMrlFJfAt8zzBWu8L4GVQbp_e8,1124
+ liger_kernel/transformers/fused_linear_cross_entropy.py,sha256=-07t8YRajZTrJOG2rUzt6Ur7kNuWgarWcqy7ou5Da8k,629
+ liger_kernel/transformers/fused_linear_jsd.py,sha256=MJ-KjmLZnakuoVpnbDGkd95DQgvESniyrRWYzollVZM,4066
+ liger_kernel/transformers/geglu.py,sha256=QcrME_8ooIn0xa59LaC0aoOdRrBIFd11Y0bAyF0NfCw,1130
+ liger_kernel/transformers/jsd.py,sha256=W-5CypO2mx4-bUWOxq1KScfCdoXlLoYbtt5xBnRzMs4,3056
+ liger_kernel/transformers/kl_div.py,sha256=qVhjBg6tjRyue5iZ3NFxo8uySY4JuIFJyv0IM_50F24,431
+ liger_kernel/transformers/layer_norm.py,sha256=fd6o4kSHJWolQMWxh-l1qObfgL08ruNbUoBiANKX1ow,972
+ liger_kernel/transformers/monkey_patch.py,sha256=qetRIZmdHIDxE0TtWP5-rWS91NuGgRYRZBTqzJUojkI,35507
+ liger_kernel/transformers/rms_norm.py,sha256=4XfMQI6dORF7s_5qUqVHKWv-3IUomaimU2dg-NwnpoM,1035
+ liger_kernel/transformers/rope.py,sha256=m-ah8vZBYW8tfplTXCiAPMHJWlB1tdp_JPXJeWE-Boo,943
+ liger_kernel/transformers/swiglu.py,sha256=0-tVJ8xEYfhxnduc16PflXFj8sZPxdx9sHUn3hfwCI4,2468
+ liger_kernel/transformers/trainer_integration.py,sha256=W3ON51O5GkyzNJsItz0y5rKx-uy2f2cFfveZpqbUdhw,123
+ liger_kernel/transformers/experimental/embedding.py,sha256=HpckiAMKM8-SRxKDcGTqortVxnjhwpZsfsp9lfjqfeM,895
+ liger_kernel/transformers/model/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ liger_kernel/transformers/model/gemma.py,sha256=R4huxuR48gkLrdT8KqV7As2v9dZtEmcGVz6YG1ZmuJE,9692
+ liger_kernel/transformers/model/llama.py,sha256=RinsgC_eR-YNvZd2SHPQxZ4eyR3uViaTFCM3SvI5nks,10426
+ liger_kernel/transformers/model/mistral.py,sha256=XpL1rlWg_llvW3z_Hf_d8WQs7uQaH4ds7EZ2SxjQHsU,5144
+ liger_kernel/transformers/model/mixtral.py,sha256=nyDS1dBpsOXYC2DuW59Hgu7ZrGftrHuWPfNqjcNPIxs,11503
+ liger_kernel/transformers/model/mllama.py,sha256=mesNCgj0Ea1O-fqRD4LVxDJ1CR2abY_zAzK_bfVzkiU,11222
+ liger_kernel/transformers/model/phi3.py,sha256=xUZPlaPKwknLjHc3uUW3EPodm1h0vD3G7Qnhh51v-Io,10332
+ liger_kernel/transformers/model/qwen2.py,sha256=EyhSSzQOskGjSnCsKMZpd1s5IAIlHd5PBO3q0MoCs00,9619
+ liger_kernel/transformers/model/qwen2_vl.py,sha256=j6xAhp9AG195dsZK5f8dFYVM9uKtWApZrggT5Y08jn4,7055
+ liger_kernel/triton/__init__.py,sha256=yfRe0zMb47QnqjecZWG7LnanfCTzeku7SgWRAwNVmzU,101
+ liger_kernel/triton/monkey_patch.py,sha256=5BcGKTtdqeYchypBIBopGIWPx1-cFALz7sOKoEsqXJ0,1584
+ liger_kernel-0.4.0.dist-info/LICENSE,sha256=OhzLDHJ0to4a8sodVLELZiCFylZ1NAAYLs-HrjPy0ag,1312
+ liger_kernel-0.4.0.dist-info/METADATA,sha256=DfE4CFCD-OnW5VdfxakEA_dXsYxJemAHNtfc5x8TVOc,27694
+ liger_kernel-0.4.0.dist-info/NOTICE,sha256=njwnoPZLh9AN8SJQzxvCGLHi-8X__AvWRze6joNXIY8,2066
+ liger_kernel-0.4.0.dist-info/WHEEL,sha256=P9jw-gEje8ByB7_hXoICnHtVCrEwMQh-630tKvQWehc,91
+ liger_kernel-0.4.0.dist-info/top_level.txt,sha256=2eghu4hA3LnkM7ElW92tQ8zegWKgSbeo-k-aGe1YnvY,13
+ liger_kernel-0.4.0.dist-info/RECORD,,
@@ -1,5 +1,5 @@
  Wheel-Version: 1.0
- Generator: setuptools (74.1.2)
+ Generator: setuptools (75.3.0)
  Root-Is-Purelib: true
  Tag: py3-none-any

@@ -1,4 +0,0 @@
- Copyright 2024 LinkedIn Corporation
- All Rights Reserved.
-
- Licensed under the BSD 2-Clause License (the "License"). See License in the project root for license information.
@@ -1,42 +0,0 @@
- liger_kernel/env_report.py,sha256=LFUJ6UMkFFGPBYXBlqHFGy4bhsemEpSI-_1edSazlHI,1130
- liger_kernel/ops/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- liger_kernel/ops/cross_entropy.py,sha256=6uoPScKpXJ7gdBlOpSnZcQ5fQe52JHYjUVsr_Bf4kCE,12317
- liger_kernel/ops/fused_linear_cross_entropy.py,sha256=Jf2h_x3X3BZqGrXKSt_MutxfkT5vhPnbdlnioikV-lU,9321
- liger_kernel/ops/geglu.py,sha256=DFdNEgL4GkSE54WKd00rkkf3_3pJgTjHnZHrEd-Y2fM,4101
- liger_kernel/ops/kl_div.py,sha256=Gqy4U5HWonTVqj9AUknfyahiqUpwV5ncG6uiKATcLus,8087
- liger_kernel/ops/layer_norm.py,sha256=unGMYMOPqtkM9aTrokhcqgPmsV2AUN7Yzv86isVB9OI,7422
- liger_kernel/ops/rms_norm.py,sha256=4miEoDSdsc0GuhI3BpBRxt6iieFQcN2QnNp4o8PVB98,9921
- liger_kernel/ops/rope.py,sha256=jrzaA9-6Orn44y_IIam9_YNPQxOFK2FrIRNfFea4EtU,8513
- liger_kernel/ops/swiglu.py,sha256=SYC9KaxvR1514B2RIvCMHWYP6T7cHL4XhPEIds2xgkI,2968
- liger_kernel/ops/utils.py,sha256=Y5sbRuZVoswsMzITTTiFgITJN2QO0K4McAAUncE3UnE,1941
- liger_kernel/ops/experimental/embedding.py,sha256=LYR66dB-jhvhtUjeV4PnNro-n77J1mdlmpSLSxB3Y6U,4186
- liger_kernel/transformers/__init__.py,sha256=UP5NP8yJhkFkjLVTkFRU0w0CA49hwdhqwmIgaBAEcj0,1148
- liger_kernel/transformers/auto_model.py,sha256=WQyaORi2zPIWTLhuAWCRPIzyHd5T4my4yGHQrt1-uBA,1247
- liger_kernel/transformers/cross_entropy.py,sha256=gL30VByCSA_iQSkhV6no70x_IUqqFSTMJdytppico_w,804
- liger_kernel/transformers/functional.py,sha256=gXviuzvWjkSLfNGUWLKDnp4s6ATpvz7309kov6JKp0Y,906
- liger_kernel/transformers/fused_linear_cross_entropy.py,sha256=-07t8YRajZTrJOG2rUzt6Ur7kNuWgarWcqy7ou5Da8k,629
- liger_kernel/transformers/geglu.py,sha256=QcrME_8ooIn0xa59LaC0aoOdRrBIFd11Y0bAyF0NfCw,1130
- liger_kernel/transformers/kl_div.py,sha256=SRwBN-izC3G2P0XWFXtvDazyk_CSoizlb_ZCQS1PsTA,378
- liger_kernel/transformers/layer_norm.py,sha256=fd6o4kSHJWolQMWxh-l1qObfgL08ruNbUoBiANKX1ow,972
- liger_kernel/transformers/monkey_patch.py,sha256=lpQGfALGF7UC1D8ZGz7ONLEdzkF2a7sBUwnFjCctYeQ,30132
- liger_kernel/transformers/rms_norm.py,sha256=4XfMQI6dORF7s_5qUqVHKWv-3IUomaimU2dg-NwnpoM,1035
- liger_kernel/transformers/rope.py,sha256=m-ah8vZBYW8tfplTXCiAPMHJWlB1tdp_JPXJeWE-Boo,943
- liger_kernel/transformers/swiglu.py,sha256=0-tVJ8xEYfhxnduc16PflXFj8sZPxdx9sHUn3hfwCI4,2468
- liger_kernel/transformers/trainer_integration.py,sha256=W3ON51O5GkyzNJsItz0y5rKx-uy2f2cFfveZpqbUdhw,123
- liger_kernel/transformers/experimental/embedding.py,sha256=HpckiAMKM8-SRxKDcGTqortVxnjhwpZsfsp9lfjqfeM,895
- liger_kernel/transformers/model/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- liger_kernel/transformers/model/gemma.py,sha256=EcdkGbSj_qroTDFl0Sc_HLyDyY0xcDhwrgkM_wkXnw8,4987
- liger_kernel/transformers/model/llama.py,sha256=6McXLi_Bt35WuxaJ_0CzEnOtayHXiPw5vjiDsaQKdJU,5323
- liger_kernel/transformers/model/mistral.py,sha256=_MQJrDntlxBO5cJwgTjr2rk2nNd5FAXVnzcTg_PEekQ,5079
- liger_kernel/transformers/model/mixtral.py,sha256=ZwVz7zSD2S2fyyMuJgDE4grvt2VvQL-jsZeJtdwnHFk,5750
- liger_kernel/transformers/model/phi3.py,sha256=zmjOsVV5TjKJ0U2dCm6W-8WCx1toKoh2Wm2PZu3XOIw,4927
- liger_kernel/transformers/model/qwen2.py,sha256=Va4uiZaVzCG2V7XKDfHjZyYTre5vPQM02j83jnnhono,4873
- liger_kernel/transformers/model/qwen2_vl.py,sha256=UajJdi49tUOfa68i2WHQ_2GZBF7d_N_uwOntER3bsl8,6607
- liger_kernel/triton/__init__.py,sha256=yfRe0zMb47QnqjecZWG7LnanfCTzeku7SgWRAwNVmzU,101
- liger_kernel/triton/monkey_patch.py,sha256=5BcGKTtdqeYchypBIBopGIWPx1-cFALz7sOKoEsqXJ0,1584
- liger_kernel-0.3.0.dist-info/LICENSE,sha256=OhzLDHJ0to4a8sodVLELZiCFylZ1NAAYLs-HrjPy0ag,1312
- liger_kernel-0.3.0.dist-info/METADATA,sha256=za6aISK4wPPAxlXPlMIK763uK_ieOJJlDjeoiEj392o,25454
- liger_kernel-0.3.0.dist-info/NOTICE,sha256=BXkXY9aWvEy_7MAB57zDu1z8uMYT1i1l9B6EpHuBa8s,173
- liger_kernel-0.3.0.dist-info/WHEEL,sha256=cVxcB9AmuTcXqmwrtPhNK88dr7IR_b6qagTj0UvIEbY,91
- liger_kernel-0.3.0.dist-info/top_level.txt,sha256=2eghu4hA3LnkM7ElW92tQ8zegWKgSbeo-k-aGe1YnvY,13
- liger_kernel-0.3.0.dist-info/RECORD,,