liger-kernel-nightly 0.3.1.dev20241101044713__tar.gz → 0.3.1.dev20241102065152__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of liger-kernel-nightly might be problematic. Click here for more details.
- {liger_kernel_nightly-0.3.1.dev20241101044713 → liger_kernel_nightly-0.3.1.dev20241102065152}/PKG-INFO +2 -2
- {liger_kernel_nightly-0.3.1.dev20241101044713 → liger_kernel_nightly-0.3.1.dev20241102065152}/pyproject.toml +2 -2
- {liger_kernel_nightly-0.3.1.dev20241101044713 → liger_kernel_nightly-0.3.1.dev20241102065152}/src/liger_kernel/transformers/model/llama.py +22 -19
- {liger_kernel_nightly-0.3.1.dev20241101044713 → liger_kernel_nightly-0.3.1.dev20241102065152}/src/liger_kernel_nightly.egg-info/PKG-INFO +2 -2
- {liger_kernel_nightly-0.3.1.dev20241101044713 → liger_kernel_nightly-0.3.1.dev20241102065152}/src/liger_kernel_nightly.egg-info/requires.txt +1 -1
- {liger_kernel_nightly-0.3.1.dev20241101044713 → liger_kernel_nightly-0.3.1.dev20241102065152}/LICENSE +0 -0
- {liger_kernel_nightly-0.3.1.dev20241101044713 → liger_kernel_nightly-0.3.1.dev20241102065152}/LICENSE-Apache-2.0 +0 -0
- {liger_kernel_nightly-0.3.1.dev20241101044713 → liger_kernel_nightly-0.3.1.dev20241102065152}/LICENSE-MIT-AutoAWQ +0 -0
- {liger_kernel_nightly-0.3.1.dev20241101044713 → liger_kernel_nightly-0.3.1.dev20241102065152}/LICENSE-MIT-Efficient-Cross-Entropy +0 -0
- {liger_kernel_nightly-0.3.1.dev20241101044713 → liger_kernel_nightly-0.3.1.dev20241102065152}/LICENSE-MIT-llmc +0 -0
- {liger_kernel_nightly-0.3.1.dev20241101044713 → liger_kernel_nightly-0.3.1.dev20241102065152}/LICENSE-MIT-triton +0 -0
- {liger_kernel_nightly-0.3.1.dev20241101044713 → liger_kernel_nightly-0.3.1.dev20241102065152}/NOTICE +0 -0
- {liger_kernel_nightly-0.3.1.dev20241101044713 → liger_kernel_nightly-0.3.1.dev20241102065152}/README.md +0 -0
- {liger_kernel_nightly-0.3.1.dev20241101044713 → liger_kernel_nightly-0.3.1.dev20241102065152}/setup.cfg +0 -0
- {liger_kernel_nightly-0.3.1.dev20241101044713 → liger_kernel_nightly-0.3.1.dev20241102065152}/src/liger_kernel/env_report.py +0 -0
- {liger_kernel_nightly-0.3.1.dev20241101044713 → liger_kernel_nightly-0.3.1.dev20241102065152}/src/liger_kernel/ops/__init__.py +0 -0
- {liger_kernel_nightly-0.3.1.dev20241101044713 → liger_kernel_nightly-0.3.1.dev20241102065152}/src/liger_kernel/ops/cross_entropy.py +0 -0
- {liger_kernel_nightly-0.3.1.dev20241101044713 → liger_kernel_nightly-0.3.1.dev20241102065152}/src/liger_kernel/ops/experimental/embedding.py +0 -0
- {liger_kernel_nightly-0.3.1.dev20241101044713 → liger_kernel_nightly-0.3.1.dev20241102065152}/src/liger_kernel/ops/experimental/mm_int8int2.py +0 -0
- {liger_kernel_nightly-0.3.1.dev20241101044713 → liger_kernel_nightly-0.3.1.dev20241102065152}/src/liger_kernel/ops/fused_linear_cross_entropy.py +0 -0
- {liger_kernel_nightly-0.3.1.dev20241101044713 → liger_kernel_nightly-0.3.1.dev20241102065152}/src/liger_kernel/ops/fused_linear_jsd.py +0 -0
- {liger_kernel_nightly-0.3.1.dev20241101044713 → liger_kernel_nightly-0.3.1.dev20241102065152}/src/liger_kernel/ops/geglu.py +0 -0
- {liger_kernel_nightly-0.3.1.dev20241101044713 → liger_kernel_nightly-0.3.1.dev20241102065152}/src/liger_kernel/ops/jsd.py +0 -0
- {liger_kernel_nightly-0.3.1.dev20241101044713 → liger_kernel_nightly-0.3.1.dev20241102065152}/src/liger_kernel/ops/kl_div.py +0 -0
- {liger_kernel_nightly-0.3.1.dev20241101044713 → liger_kernel_nightly-0.3.1.dev20241102065152}/src/liger_kernel/ops/layer_norm.py +0 -0
- {liger_kernel_nightly-0.3.1.dev20241101044713 → liger_kernel_nightly-0.3.1.dev20241102065152}/src/liger_kernel/ops/rms_norm.py +0 -0
- {liger_kernel_nightly-0.3.1.dev20241101044713 → liger_kernel_nightly-0.3.1.dev20241102065152}/src/liger_kernel/ops/rope.py +0 -0
- {liger_kernel_nightly-0.3.1.dev20241101044713 → liger_kernel_nightly-0.3.1.dev20241102065152}/src/liger_kernel/ops/swiglu.py +0 -0
- {liger_kernel_nightly-0.3.1.dev20241101044713 → liger_kernel_nightly-0.3.1.dev20241102065152}/src/liger_kernel/ops/utils.py +0 -0
- {liger_kernel_nightly-0.3.1.dev20241101044713 → liger_kernel_nightly-0.3.1.dev20241102065152}/src/liger_kernel/transformers/__init__.py +0 -0
- {liger_kernel_nightly-0.3.1.dev20241101044713 → liger_kernel_nightly-0.3.1.dev20241102065152}/src/liger_kernel/transformers/auto_model.py +0 -0
- {liger_kernel_nightly-0.3.1.dev20241101044713 → liger_kernel_nightly-0.3.1.dev20241102065152}/src/liger_kernel/transformers/cross_entropy.py +0 -0
- {liger_kernel_nightly-0.3.1.dev20241101044713 → liger_kernel_nightly-0.3.1.dev20241102065152}/src/liger_kernel/transformers/experimental/embedding.py +0 -0
- {liger_kernel_nightly-0.3.1.dev20241101044713 → liger_kernel_nightly-0.3.1.dev20241102065152}/src/liger_kernel/transformers/functional.py +0 -0
- {liger_kernel_nightly-0.3.1.dev20241101044713 → liger_kernel_nightly-0.3.1.dev20241102065152}/src/liger_kernel/transformers/fused_linear_cross_entropy.py +0 -0
- {liger_kernel_nightly-0.3.1.dev20241101044713 → liger_kernel_nightly-0.3.1.dev20241102065152}/src/liger_kernel/transformers/fused_linear_jsd.py +0 -0
- {liger_kernel_nightly-0.3.1.dev20241101044713 → liger_kernel_nightly-0.3.1.dev20241102065152}/src/liger_kernel/transformers/geglu.py +0 -0
- {liger_kernel_nightly-0.3.1.dev20241101044713 → liger_kernel_nightly-0.3.1.dev20241102065152}/src/liger_kernel/transformers/jsd.py +0 -0
- {liger_kernel_nightly-0.3.1.dev20241101044713 → liger_kernel_nightly-0.3.1.dev20241102065152}/src/liger_kernel/transformers/kl_div.py +0 -0
- {liger_kernel_nightly-0.3.1.dev20241101044713 → liger_kernel_nightly-0.3.1.dev20241102065152}/src/liger_kernel/transformers/layer_norm.py +0 -0
- {liger_kernel_nightly-0.3.1.dev20241101044713 → liger_kernel_nightly-0.3.1.dev20241102065152}/src/liger_kernel/transformers/model/__init__.py +0 -0
- {liger_kernel_nightly-0.3.1.dev20241101044713 → liger_kernel_nightly-0.3.1.dev20241102065152}/src/liger_kernel/transformers/model/gemma.py +0 -0
- {liger_kernel_nightly-0.3.1.dev20241101044713 → liger_kernel_nightly-0.3.1.dev20241102065152}/src/liger_kernel/transformers/model/mistral.py +0 -0
- {liger_kernel_nightly-0.3.1.dev20241101044713 → liger_kernel_nightly-0.3.1.dev20241102065152}/src/liger_kernel/transformers/model/mixtral.py +0 -0
- {liger_kernel_nightly-0.3.1.dev20241101044713 → liger_kernel_nightly-0.3.1.dev20241102065152}/src/liger_kernel/transformers/model/mllama.py +0 -0
- {liger_kernel_nightly-0.3.1.dev20241101044713 → liger_kernel_nightly-0.3.1.dev20241102065152}/src/liger_kernel/transformers/model/phi3.py +0 -0
- {liger_kernel_nightly-0.3.1.dev20241101044713 → liger_kernel_nightly-0.3.1.dev20241102065152}/src/liger_kernel/transformers/model/qwen2.py +0 -0
- {liger_kernel_nightly-0.3.1.dev20241101044713 → liger_kernel_nightly-0.3.1.dev20241102065152}/src/liger_kernel/transformers/model/qwen2_vl.py +0 -0
- {liger_kernel_nightly-0.3.1.dev20241101044713 → liger_kernel_nightly-0.3.1.dev20241102065152}/src/liger_kernel/transformers/monkey_patch.py +0 -0
- {liger_kernel_nightly-0.3.1.dev20241101044713 → liger_kernel_nightly-0.3.1.dev20241102065152}/src/liger_kernel/transformers/rms_norm.py +0 -0
- {liger_kernel_nightly-0.3.1.dev20241101044713 → liger_kernel_nightly-0.3.1.dev20241102065152}/src/liger_kernel/transformers/rope.py +0 -0
- {liger_kernel_nightly-0.3.1.dev20241101044713 → liger_kernel_nightly-0.3.1.dev20241102065152}/src/liger_kernel/transformers/swiglu.py +0 -0
- {liger_kernel_nightly-0.3.1.dev20241101044713 → liger_kernel_nightly-0.3.1.dev20241102065152}/src/liger_kernel/transformers/trainer_integration.py +0 -0
- {liger_kernel_nightly-0.3.1.dev20241101044713 → liger_kernel_nightly-0.3.1.dev20241102065152}/src/liger_kernel/triton/__init__.py +0 -0
- {liger_kernel_nightly-0.3.1.dev20241101044713 → liger_kernel_nightly-0.3.1.dev20241102065152}/src/liger_kernel/triton/monkey_patch.py +0 -0
- {liger_kernel_nightly-0.3.1.dev20241101044713 → liger_kernel_nightly-0.3.1.dev20241102065152}/src/liger_kernel_nightly.egg-info/SOURCES.txt +0 -0
- {liger_kernel_nightly-0.3.1.dev20241101044713 → liger_kernel_nightly-0.3.1.dev20241102065152}/src/liger_kernel_nightly.egg-info/dependency_links.txt +0 -0
- {liger_kernel_nightly-0.3.1.dev20241101044713 → liger_kernel_nightly-0.3.1.dev20241102065152}/src/liger_kernel_nightly.egg-info/top_level.txt +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: liger_kernel_nightly
|
|
3
|
-
Version: 0.3.1.dev20241101044713
|
|
3
|
+
Version: 0.3.1.dev20241102065152
|
|
4
4
|
Summary: Efficient Triton kernels for LLM Training
|
|
5
5
|
License: BSD 2-CLAUSE LICENSE
|
|
6
6
|
Copyright 2024 LinkedIn Corporation
|
|
@@ -36,7 +36,7 @@ License-File: LICENSE-MIT-llmc
|
|
|
36
36
|
License-File: LICENSE-MIT-triton
|
|
37
37
|
License-File: NOTICE
|
|
38
38
|
Requires-Dist: torch>=2.1.2
|
|
39
|
-
Requires-Dist: triton>=2.3.0
|
|
39
|
+
Requires-Dist: triton>=2.3.1
|
|
40
40
|
Provides-Extra: transformers
|
|
41
41
|
Requires-Dist: transformers~=4.0; extra == "transformers"
|
|
42
42
|
Provides-Extra: dev
|
|
@@ -4,14 +4,14 @@ build-backend = "setuptools.build_meta"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "liger_kernel_nightly"
|
|
7
|
-
version = "0.3.1.dev20241101044713"
|
|
7
|
+
version = "0.3.1.dev20241102065152"
|
|
8
8
|
description = "Efficient Triton kernels for LLM Training"
|
|
9
9
|
urls = { "Homepage" = "https://github.com/linkedin/Liger-Kernel" }
|
|
10
10
|
readme = { file = "README.md", content-type = "text/markdown" }
|
|
11
11
|
license = { file = "LICENSE" }
|
|
12
12
|
dependencies = [
|
|
13
13
|
"torch>=2.1.2",
|
|
14
|
-
"triton>=2.3.0",
|
|
14
|
+
"triton>=2.3.1",
|
|
15
15
|
]
|
|
16
16
|
|
|
17
17
|
[project.optional-dependencies]
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
from typing import List, Optional, Tuple, Union
|
|
1
|
+
from typing import TYPE_CHECKING, List, Optional, Tuple, Union
|
|
2
2
|
|
|
3
3
|
import torch
|
|
4
4
|
import torch.nn.functional as F
|
|
@@ -18,6 +18,10 @@ from liger_kernel.transformers.fused_linear_cross_entropy import (
|
|
|
18
18
|
)
|
|
19
19
|
|
|
20
20
|
|
|
21
|
+
if TYPE_CHECKING:
|
|
22
|
+
from transformers.cache_utils import Cache
|
|
23
|
+
|
|
24
|
+
|
|
21
25
|
@add_start_docstrings_to_model_forward(LLAMA_INPUTS_DOCSTRING)
|
|
22
26
|
@replace_return_docstrings(
|
|
23
27
|
output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC
|
|
@@ -27,7 +31,7 @@ def lce_forward_deprecated(
|
|
|
27
31
|
input_ids: torch.LongTensor = None,
|
|
28
32
|
attention_mask: Optional[torch.Tensor] = None,
|
|
29
33
|
position_ids: Optional[torch.LongTensor] = None,
|
|
30
|
-
past_key_values: Optional[List[torch.FloatTensor]] = None,
|
|
34
|
+
past_key_values: Optional[Union["Cache", List[torch.FloatTensor]]] = None,
|
|
31
35
|
inputs_embeds: Optional[torch.FloatTensor] = None,
|
|
32
36
|
labels: Optional[torch.LongTensor] = None,
|
|
33
37
|
use_cache: Optional[bool] = None,
|
|
@@ -153,19 +157,19 @@ def lce_forward_deprecated(
|
|
|
153
157
|
)
|
|
154
158
|
def lce_forward(
|
|
155
159
|
self,
|
|
156
|
-
input_ids=None,
|
|
157
|
-
attention_mask=None,
|
|
158
|
-
position_ids=None,
|
|
159
|
-
past_key_values=None,
|
|
160
|
-
inputs_embeds=None,
|
|
161
|
-
labels=None,
|
|
162
|
-
use_cache=None,
|
|
163
|
-
output_attentions=None,
|
|
164
|
-
output_hidden_states=None,
|
|
165
|
-
return_dict=None,
|
|
166
|
-
cache_position=None,
|
|
167
|
-
num_logits_to_keep=0,
|
|
168
|
-
**kwargs,
|
|
160
|
+
input_ids: torch.LongTensor = None,
|
|
161
|
+
attention_mask: Optional[torch.Tensor] = None,
|
|
162
|
+
position_ids: Optional[torch.LongTensor] = None,
|
|
163
|
+
past_key_values: Optional[Union["Cache", List[torch.FloatTensor]]] = None,
|
|
164
|
+
inputs_embeds: Optional[torch.FloatTensor] = None,
|
|
165
|
+
labels: Optional[torch.LongTensor] = None,
|
|
166
|
+
use_cache: Optional[bool] = None,
|
|
167
|
+
output_attentions: Optional[bool] = None,
|
|
168
|
+
output_hidden_states: Optional[bool] = None,
|
|
169
|
+
return_dict: Optional[bool] = None,
|
|
170
|
+
cache_position: Optional[torch.LongTensor] = None,
|
|
171
|
+
num_logits_to_keep: int = 0,
|
|
172
|
+
**loss_kwargs,
|
|
169
173
|
) -> Union[Tuple, CausalLMOutputWithPast]:
|
|
170
174
|
r"""
|
|
171
175
|
Args:
|
|
@@ -224,7 +228,6 @@ def lce_forward(
|
|
|
224
228
|
output_hidden_states=output_hidden_states,
|
|
225
229
|
return_dict=return_dict,
|
|
226
230
|
cache_position=cache_position,
|
|
227
|
-
**kwargs,
|
|
228
231
|
)
|
|
229
232
|
|
|
230
233
|
hidden_states = outputs[0]
|
|
@@ -245,12 +248,12 @@ def lce_forward(
|
|
|
245
248
|
shift_hidden_states = shift_hidden_states.view(-1, self.config.hidden_size)
|
|
246
249
|
shift_labels = shift_labels.view(-1)
|
|
247
250
|
|
|
248
|
-
reduction = "sum" if "num_items_in_batch" in kwargs else "mean"
|
|
251
|
+
reduction = "sum" if "num_items_in_batch" in loss_kwargs else "mean"
|
|
249
252
|
lce = LigerFusedLinearCrossEntropyLoss(reduction=reduction)
|
|
250
253
|
|
|
251
254
|
loss = lce(self.lm_head.weight, shift_hidden_states, shift_labels)
|
|
252
255
|
if reduction == "sum":
|
|
253
|
-
loss /= kwargs["num_items_in_batch"]
|
|
256
|
+
loss /= loss_kwargs["num_items_in_batch"]
|
|
254
257
|
|
|
255
258
|
else: # if in inference mode materialize logits
|
|
256
259
|
logits = self.lm_head(hidden_states[:, -num_logits_to_keep:, :])
|
|
@@ -259,7 +262,7 @@ def lce_forward(
|
|
|
259
262
|
logits=logits,
|
|
260
263
|
labels=labels,
|
|
261
264
|
vocab_size=self.config.vocab_size,
|
|
262
|
-
**kwargs,
|
|
265
|
+
**loss_kwargs,
|
|
263
266
|
)
|
|
264
267
|
|
|
265
268
|
if not return_dict:
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: liger_kernel_nightly
|
|
3
|
-
Version: 0.3.1.dev20241101044713
|
|
3
|
+
Version: 0.3.1.dev20241102065152
|
|
4
4
|
Summary: Efficient Triton kernels for LLM Training
|
|
5
5
|
License: BSD 2-CLAUSE LICENSE
|
|
6
6
|
Copyright 2024 LinkedIn Corporation
|
|
@@ -36,7 +36,7 @@ License-File: LICENSE-MIT-llmc
|
|
|
36
36
|
License-File: LICENSE-MIT-triton
|
|
37
37
|
License-File: NOTICE
|
|
38
38
|
Requires-Dist: torch>=2.1.2
|
|
39
|
-
Requires-Dist: triton>=2.3.0
|
|
39
|
+
Requires-Dist: triton>=2.3.1
|
|
40
40
|
Provides-Extra: transformers
|
|
41
41
|
Requires-Dist: transformers~=4.0; extra == "transformers"
|
|
42
42
|
Provides-Extra: dev
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{liger_kernel_nightly-0.3.1.dev20241101044713 → liger_kernel_nightly-0.3.1.dev20241102065152}/NOTICE
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|