liger-kernel-nightly 0.5.10.dev20250630172023__py3-none-any.whl → 0.5.10.dev20250704061125__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- liger_kernel/chunked_loss/__init__.py +1 -0
- liger_kernel/chunked_loss/cosine_similarity_loss.py +127 -0
- liger_kernel/chunked_loss/functional.py +2 -0
- liger_kernel/ops/geglu.py +1 -1
- liger_kernel/ops/swiglu.py +1 -1
- liger_kernel/transformers/model/gemma.py +9 -1
- liger_kernel/transformers/model/gemma2.py +9 -1
- liger_kernel/transformers/model/llama.py +10 -1
- liger_kernel/transformers/model/mistral.py +0 -3
- liger_kernel/transformers/model/phi3.py +9 -1
- liger_kernel/transformers/model/qwen2.py +8 -0
- liger_kernel/transformers/monkey_patch.py +10 -3
- {liger_kernel_nightly-0.5.10.dev20250630172023.dist-info → liger_kernel_nightly-0.5.10.dev20250704061125.dist-info}/METADATA +1 -1
- {liger_kernel_nightly-0.5.10.dev20250630172023.dist-info → liger_kernel_nightly-0.5.10.dev20250704061125.dist-info}/RECORD +18 -17
- {liger_kernel_nightly-0.5.10.dev20250630172023.dist-info → liger_kernel_nightly-0.5.10.dev20250704061125.dist-info}/LICENSE +0 -0
- {liger_kernel_nightly-0.5.10.dev20250630172023.dist-info → liger_kernel_nightly-0.5.10.dev20250704061125.dist-info}/NOTICE +0 -0
- {liger_kernel_nightly-0.5.10.dev20250630172023.dist-info → liger_kernel_nightly-0.5.10.dev20250704061125.dist-info}/WHEEL +0 -0
- {liger_kernel_nightly-0.5.10.dev20250630172023.dist-info → liger_kernel_nightly-0.5.10.dev20250704061125.dist-info}/top_level.txt +0 -0
liger_kernel/chunked_loss/__init__.py
CHANGED
@@ -1,3 +1,4 @@
+from liger_kernel.chunked_loss.cosine_similarity_loss import LigerFusedLinearCosineSimilarityLoss # noqa:F401
 from liger_kernel.chunked_loss.cpo_loss import LigerFusedLinearCPOLoss  # noqa: F401
 from liger_kernel.chunked_loss.dpo_loss import LigerFusedLinearDPOLoss  # noqa: F401
 from liger_kernel.chunked_loss.grpo_loss import LigerFusedLinearGRPOLoss  # noqa: F401
liger_kernel/chunked_loss/cosine_similarity_loss.py
ADDED
@@ -0,0 +1,127 @@
+import torch
+import torch.nn.functional as F
+
+from liger_kernel.chunked_loss.fused_linear_distillation import LigerFusedLinearDistillationBase
+
+
+class LigerFusedLinearCosineSimilarityFunction(LigerFusedLinearDistillationBase):
+    @staticmethod
+    def distillation_loss_fn(student_logits, teacher_logits, beta=1.0):
+        """
+        Compute Cosine loss (Cosine Similarity Loss).
+        Args:
+            student_logits (torch.Tensor): Logits of student tokens. Shape: (batch_size * seq_len,).
+            teacher_logits (torch.Tensor): Logits of teacher tokens. Shape: (batch_size * seq_len,).
+            beta: Coefficient beta of generalized Cosine Similarity in the interval [0, 1]. Default: `1.0` (float): .
+        Returns:
+            torch.Tensor: cosine similarity loss
+        """
+        student_norm = F.normalize(student_logits, p=2, dim=-1)
+        teacher_norm = F.normalize(teacher_logits, p=2, dim=-1)
+
+        cosine_sim = F.cosine_similarity(student_norm, teacher_norm, dim=-1)
+        loss = beta * (1 - cosine_sim)
+        return loss.sum()
+
+    @classmethod
+    def forward(
+        cls,
+        ctx,
+        student_input: torch.Tensor,
+        student_weight: torch.Tensor,
+        teacher_input: torch.Tensor,
+        teacher_weight: torch.Tensor,
+        true_labels: torch.LongTensor,
+        student_bias: torch.Tensor,
+        teacher_bias: torch.Tensor,
+        weight_hard_loss: float = 0.5,
+        weight_soft_loss: float = 0.5,
+        beta: float = 0.5,
+        ignore_index: int = -100,
+        temperature: float = 1.0,
+        compiled: bool = True,
+        chunk_size: int = 1024,
+    ):
+        return super().forward(
+            cls=cls,
+            ctx=ctx,
+            student_input=student_input,
+            student_weight=student_weight,
+            teacher_input=teacher_input,
+            teacher_weight=teacher_weight,
+            target=true_labels,
+            student_bias=student_bias,
+            teacher_bias=teacher_bias,
+            chunk_size=chunk_size,
+            weight_hard_loss=weight_hard_loss,
+            weight_soft_loss=weight_soft_loss,
+            beta=beta,
+            ignore_index=ignore_index,
+            temperature=temperature,
+            compiled=compiled,
+        )
+
+    @staticmethod
+    def backward(ctx, grad_output):
+        grads = LigerFusedLinearDistillationBase.backward(ctx, grad_output)[:6]
+
+        return (
+            *grads,
+            None,  # teacher_bias
+            None,  # weight_hard_loss
+            None,  # weight_soft_loss
+            None,  # beta
+            None,  # ignore_index
+            None,  # temperature
+            None,  # compiled
+            None,  # chunk_size
+        )
+
+
+class LigerFusedLinearCosineSimilarityLoss(torch.nn.Module):
+    def __init__(
+        self,
+        weight_hard_loss: float = 0.5,
+        weight_soft_loss: float = 0.5,
+        beta: float = 0.5,
+        ignore_index: int = -100,
+        temperature: float = 1.0,
+        compiled: bool = True,
+        chunk_size: int = 1024,
+    ):
+        super().__init__()
+        assert temperature != 0, "Temperature cannot be 0."
+        self.weight_hard_loss = weight_hard_loss
+        self.weight_soft_loss = weight_soft_loss
+        self.ignore_index = ignore_index
+        self.temperature = temperature
+        self.compiled = compiled
+        self.beta = beta
+        self.chunk_size = chunk_size
+
+    def forward(
+        self,
+        student_input: torch.Tensor,
+        student_weight: torch.Tensor,
+        teacher_input: torch.Tensor,
+        teacher_weight: torch.Tensor,
+        true_labels: torch.LongTensor,
+        student_bias: torch.Tensor = None,
+        teacher_bias: torch.Tensor = None,
+    ) -> torch.Tensor:
+        return LigerFusedLinearCosineSimilarityFunction.apply(
+            student_input,
+            student_weight,
+            teacher_input,
+            teacher_weight,
+            true_labels,
+            student_bias,
+            teacher_bias,
+            self.weight_hard_loss,
+            self.weight_soft_loss,
+            self.beta,
+            self.ignore_index,
+            self.temperature,
+            self.compiled,
+            self.chunk_size,
+        )
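
For orientation, a minimal usage sketch of the new module is shown below. The sizes, device handling, and dtype choices are illustrative assumptions, not values taken from this diff; the argument layout follows the forward signature added above.

import torch
from liger_kernel.chunked_loss import LigerFusedLinearCosineSimilarityLoss

# Illustrative sizes only: flattened batch*seq tokens, hidden sizes, shared vocab.
num_tokens, student_hidden, teacher_hidden, vocab_size = 8, 64, 128, 1000
device = "cuda" if torch.cuda.is_available() else "cpu"

loss_fn = LigerFusedLinearCosineSimilarityLoss(
    weight_hard_loss=0.5,
    weight_soft_loss=0.5,
    beta=0.5,
)

student_input = torch.randn(num_tokens, student_hidden, device=device, requires_grad=True)
student_weight = torch.randn(vocab_size, student_hidden, device=device, requires_grad=True)
teacher_input = torch.randn(num_tokens, teacher_hidden, device=device)
teacher_weight = torch.randn(vocab_size, teacher_hidden, device=device)
true_labels = torch.randint(0, vocab_size, (num_tokens,), device=device)

loss = loss_fn(student_input, student_weight, teacher_input, teacher_weight, true_labels)
loss.backward()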
liger_kernel/chunked_loss/functional.py
CHANGED
@@ -1,3 +1,4 @@
+from liger_kernel.chunked_loss.cosine_similarity_loss import LigerFusedLinearCosineSimilarityFunction
 from liger_kernel.chunked_loss.cpo_loss import LigerFusedLinearCPOFunction
 from liger_kernel.chunked_loss.dpo_loss import LigerFusedLinearDPOFunction
 from liger_kernel.chunked_loss.grpo_loss import LigerFusedLinearGRPOFunction
@@ -9,6 +10,7 @@ from liger_kernel.chunked_loss.simpo_loss import LigerFusedLinearSimPOFunction
 liger_fused_linear_orpo = LigerFusedLinearORPOFunction.apply
 liger_fused_linear_dpo = LigerFusedLinearDPOFunction.apply
 liger_fused_linear_jsd = LigerFusedLinearJSDFunction.apply
+liger_fused_linear_cosine = LigerFusedLinearCosineSimilarityFunction.apply
 liger_fused_linear_cpo = LigerFusedLinearCPOFunction.apply
 liger_fused_linear_simpo = LigerFusedLinearSimPOFunction.apply
 liger_fused_linear_kto = LigerFusedLinearKTOFunction.apply
liger_kernel/ops/geglu.py
CHANGED
@@ -40,7 +40,7 @@ def _geglu_tanh_forward_kernel(a, b, c, stride, n_cols: tl.constexpr, BLOCK_SIZE
     tanh_arg = sqrt_2_over_pi * (a_row + 0.044715 * a_cubed)
     tanh_result = tanh(tanh_arg)
     geglu_a = 0.5 * a_row * (1 + tanh_result)
-    c_row = geglu_a * b_row
+    c_row = geglu_a.cast(b_row.dtype) * b_row
     tl.store(c + col_offsets, c_row, mask=mask)
 
 
liger_kernel/ops/swiglu.py
CHANGED
@@ -26,7 +26,7 @@ def _swiglu_forward_kernel(a_ptr, b_ptr, c_ptr, stride, n_cols: tl.constexpr, BL
     # sigmoid requires type float32
     a_row = tl.load(a_ptr + col_offsets, mask=mask, other=0).to(tl.float32)
     b_row = tl.load(b_ptr + col_offsets, mask=mask, other=0)
-    c_row = silu(a_row) * b_row
+    c_row = silu(a_row).cast(b_row.dtype) * b_row
     tl.store(c_ptr + col_offsets, c_row, mask=mask)
 
 
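
Both kernel changes follow the same mixed-precision pattern: the activation is computed in float32 for accuracy and is now cast back to the gate input's dtype before the element-wise product, so the stored output keeps the input precision. A rough PyTorch sketch of the intent (the function name and tensor shapes are illustrative, not part of the kernels):

import torch
import torch.nn.functional as F

def swiglu_forward_reference(a: torch.Tensor, b: torch.Tensor) -> torch.Tensor:
    # Compute SiLU in float32, as the Triton kernel does for numerical accuracy.
    silu_a = F.silu(a.to(torch.float32))
    # Cast back to b's dtype before multiplying, mirroring silu(a_row).cast(b_row.dtype).
    return silu_a.to(b.dtype) * b

a = torch.randn(4, 8, dtype=torch.bfloat16)
b = torch.randn(4, 8, dtype=torch.bfloat16)
out = swiglu_forward_reference(a, b)
assert out.dtype == torch.bfloat16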
liger_kernel/transformers/model/gemma.py
CHANGED
@@ -27,6 +27,7 @@ def lce_forward_deprecated(
     output_hidden_states: Optional[bool] = None,
     return_dict: Optional[bool] = None,
     cache_position: Optional[torch.LongTensor] = None,
+    skip_logits: Optional[bool] = None,
 ) -> Union[Tuple, CausalLMOutputWithPast]:
     r"""
 
@@ -81,7 +82,14 @@ def lce_forward_deprecated(
     loss = None
     logits = None
 
-    if self.training and (labels is not None):
+    if skip_logits and labels is None:
+        raise ValueError("skip_logits is True, but labels is None")
+
+    if skip_logits is None:
+        # By default, if in training mode, don't materialize logits
+        skip_logits = self.training and labels is not None
+
+    if skip_logits:
         shift_hidden_states = hidden_states[..., :-1, :].contiguous()
         shift_labels = labels[..., 1:].contiguous()
 
liger_kernel/transformers/model/gemma2.py
CHANGED
@@ -30,6 +30,7 @@ def lce_forward_deprecated(
     output_hidden_states: Optional[bool] = None,
     return_dict: Optional[bool] = None,
     cache_position: Optional[torch.LongTensor] = None,
+    skip_logits: Optional[bool] = None,
     **kwargs,
 ) -> Union[Tuple, CausalLMOutputWithPast]:
     r"""
@@ -85,7 +86,14 @@ def lce_forward_deprecated(
     loss = None
     logits = None
 
-    if self.training and (labels is not None):
+    if skip_logits and labels is None:
+        raise ValueError("skip_logits is True, but labels is None")
+
+    if skip_logits is None:
+        # By default, if in training mode, don't materialize logits
+        skip_logits = self.training and labels is not None
+
+    if skip_logits:
         shift_hidden_states = hidden_states[..., :-1, :].contiguous()
         shift_labels = labels[..., 1:].contiguous()
 
liger_kernel/transformers/model/llama.py
CHANGED
@@ -37,6 +37,7 @@ def lce_forward_deprecated(
     output_hidden_states: Optional[bool] = None,
     return_dict: Optional[bool] = None,
     cache_position: Optional[torch.LongTensor] = None,
+    skip_logits: Optional[bool] = None,
 ) -> Union[Tuple, CausalLMOutputWithPast]:
     r"""
     Copy paste llama forward but replace torch cross entropy with liger fused linear cross entropy
@@ -91,7 +92,15 @@ def lce_forward_deprecated(
     loss = None
     logits = None
 
-    if self.training and (labels is not None):
+    # if in training mode, don't materialize logits
+    if skip_logits and labels is None:
+        raise ValueError("skip_logits is True, but labels is None")
+
+    if skip_logits is None:
+        # By default, if in training mode, don't materialize logits
+        skip_logits = self.training and labels is not None
+
+    if skip_logits:
         shift_hidden_states = hidden_states[..., :-1, :].contiguous()
         shift_labels = labels[..., 1:].contiguous()
 
liger_kernel/transformers/model/phi3.py
CHANGED
@@ -26,6 +26,7 @@ def lce_forward_deprecated(
     output_hidden_states: Optional[bool] = None,
     return_dict: Optional[bool] = None,
     cache_position: Optional[torch.LongTensor] = None,
+    skip_logits: Optional[bool] = None,
 ) -> Union[Tuple, CausalLMOutputWithPast]:
     r"""
     Copy paste phi3 forward from transfomers v4.44.2 but replace torch cross entropy with liger fused linear cross entropy
@@ -80,7 +81,14 @@ def lce_forward_deprecated(
     loss = None
     logits = None
 
-    if self.training and (labels is not None):
+    if skip_logits and labels is None:
+        raise ValueError("skip_logits is True, but labels is None")
+
+    if skip_logits is None:
+        # By default, if in training mode, don't materialize logits
+        skip_logits = self.training and labels is not None
+
+    if skip_logits:
         shift_hidden_states = hidden_states[..., :-1, :].contiguous()
         shift_labels = labels[..., 1:].contiguous()
 
liger_kernel/transformers/model/qwen2.py
CHANGED
@@ -26,6 +26,7 @@ def lce_forward_deprecated(
     output_hidden_states: Optional[bool] = None,
     return_dict: Optional[bool] = None,
     cache_position: Optional[torch.LongTensor] = None,
+    skip_logits: Optional[bool] = None,
 ) -> Union[Tuple, CausalLMOutputWithPast]:
     r"""
     Copy paste Qwen2's forward but replace torch cross entropy with liger fused linear cross entropy
@@ -80,6 +81,13 @@ def lce_forward_deprecated(
     loss = None
     logits = None
 
+    if skip_logits and labels is None:
+        raise ValueError("skip_logits is True, but labels is None")
+
+    if skip_logits is None:
+        # By default, if in training mode, don't materialize logits
+        skip_logits = self.training and labels is not None
+
     if self.training and (labels is not None):
         shift_hidden_states = hidden_states[..., :-1, :].contiguous()
         shift_labels = labels[..., 1:].contiguous()
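
All five model patches above resolve skip_logits the same way before choosing between the fused path and materialized logits (note that in this version qwen2.py computes skip_logits but still gates the fused path on self.training and (labels is not None)). A standalone restatement of that resolution logic, with training and labels as stand-ins for the module state and forward arguments:

from typing import Optional

def resolve_skip_logits(skip_logits: Optional[bool], training: bool, labels) -> bool:
    # Explicitly requesting skip_logits without labels cannot produce a loss.
    if skip_logits and labels is None:
        raise ValueError("skip_logits is True, but labels is None")
    if skip_logits is None:
        # By default, if in training mode, don't materialize logits
        skip_logits = training and labels is not None
    return bool(skip_logits)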
liger_kernel/transformers/monkey_patch.py
CHANGED
@@ -611,10 +611,17 @@ def apply_liger_kernel_to_mistral(
     if cross_entropy:
         modeling_mistral.CrossEntropyLoss = LigerCrossEntropyLoss
     if fused_linear_cross_entropy:
-        if model is not None:
-            model.forward = MethodType(mistral_lce_forward, model)
+        if transformer_version >= version.parse("4.49.0"):
+            if model is not None:
+                model.forward = MethodType(mistral_lce_forward, model)
+            else:
+                modeling_mistral.MistralForCausalLM.forward = mistral_lce_forward
         else:
-            modeling_mistral.MistralForCausalLM.forward = mistral_lce_forward
+            logger.warning(
+                "The latest version of Liger does not support transformers < 4.49.0 for llava. Please downgrade your liger version or upgrade your transformer version."
+            )
+            logger.warning("LigerFusedLinearCrossEntropy patch is not applied.")
+
     if swiglu:
         modeling_mistral.MistralMLP = LigerSwiGLUMLP
 
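
For context, the fused linear cross entropy patch for Mistral is now applied only when the installed transformers version is at least 4.49.0; older versions fall through to the warnings above. A hedged usage sketch (the keyword values shown are illustrative choices, not defaults taken from this diff):

from liger_kernel.transformers import apply_liger_kernel_to_mistral

# Monkey-patch Mistral modeling code in place; with transformers < 4.49.0 the
# fused-linear-cross-entropy forward is skipped and a warning is logged instead.
apply_liger_kernel_to_mistral(
    cross_entropy=False,
    fused_linear_cross_entropy=True,
    swiglu=True,
)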
{liger_kernel_nightly-0.5.10.dev20250630172023.dist-info → liger_kernel_nightly-0.5.10.dev20250704061125.dist-info}/RECORD
CHANGED
@@ -2,10 +2,11 @@ liger_kernel/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 liger_kernel/env_report.py,sha256=uhdEC8OydxoZlb7B6YYcAaBF3crGFdIck-4cxaW4NJY,1728
 liger_kernel/utils.py,sha256=BQleeZWHSZPNuPcYcoZTOp1kcNEZONZilPP5-AmjgWI,2024
 liger_kernel/chunked_loss/README.md,sha256=0FmkFC3hKBqyoDT5uTlIYmrvRkF-EOCR1y-EBU1LpWU,2248
-liger_kernel/chunked_loss/__init__.py,sha256=
+liger_kernel/chunked_loss/__init__.py,sha256=J5_jNnzZ4gZmA38W5f_4oab7xMoNk1Xy-yh3X_Xlf-s,714
+liger_kernel/chunked_loss/cosine_similarity_loss.py,sha256=pZ07OQ6RI-c8uk96tDRlUXdt31-da7yWhfwircZlKRw,4198
 liger_kernel/chunked_loss/cpo_loss.py,sha256=Gzz1eU4kgcbdubFVRy55e8A1Cr-r45UgNicXwZIjmBU,5454
 liger_kernel/chunked_loss/dpo_loss.py,sha256=tapMiNdI8_ufW55iG0Ud4dmiW39gu1DzlvtoOCHrdGg,6259
-liger_kernel/chunked_loss/functional.py,sha256
+liger_kernel/chunked_loss/functional.py,sha256=-XPDbLml9dHmvoSU2VNTUrBDFehuzvuAGPikVetBMtI,1132
 liger_kernel/chunked_loss/fused_linear_distillation.py,sha256=ooR-qnZCyWJN935oHCSWLaKKKyaYERyhNczRGi1VOiw,11935
 liger_kernel/chunked_loss/fused_linear_ppo.py,sha256=AA19cpv6D8mo5RbSK5GRCcZoOSnpxV_Z1eJlAsC5eic,13434
 liger_kernel/chunked_loss/fused_linear_preference.py,sha256=FIH85uUXAOgYx5Ax8MjFhJHVu-2pKtY7wSegd0zSyyY,18336
@@ -21,7 +22,7 @@ liger_kernel/ops/dyt.py,sha256=gCLz4S8aul8SY9nvIGaoK67aGb7U9MJRQdo3ONqmQYs,5417
 liger_kernel/ops/fused_linear_cross_entropy.py,sha256=5fbGhN85n3zf0uIdJ7PYHWIRzTf0VTFiS0ARtOmqIP0,11020
 liger_kernel/ops/fused_linear_jsd.py,sha256=CSoprxb-YcJy-YUKiTcYkxN8sb9h2kdk_iHuncvSV5c,9683
 liger_kernel/ops/fused_neighborhood_attention.py,sha256=vPi5xbnh6wxyZehaqo6Tuilqo2fN5SGDiONjnNmIKqs,35556
-liger_kernel/ops/geglu.py,sha256=
+liger_kernel/ops/geglu.py,sha256=r0WSq9E93zzynL44Wh8femzOWK07_SseBM_pJUyxT3s,4144
 liger_kernel/ops/group_norm.py,sha256=qD4D4lSjSgVtO52EBNLC2iTseALRgPgqXE50U2woggk,10837
 liger_kernel/ops/grpo_loss.py,sha256=anRnv7k1-AV3pCC6_TqP0GMg78YYUfRAJrbpx6PVhl0,9448
 liger_kernel/ops/jsd.py,sha256=onHp5T3MbvJaVz5Vup7Ww6EQp_HTaZeayTjJk6FgQMY,7042
@@ -33,7 +34,7 @@ liger_kernel/ops/rms_norm.py,sha256=-rcgHwWCxlA-Syec2XhdW4jfOeCDt2r7qwjslgXFYDU,
 liger_kernel/ops/rope.py,sha256=ofmBOkUpZZO-Q8Z5B_LOFYYLD-YT-8WnJ4vGOrDYouI,8943
 liger_kernel/ops/softmax.py,sha256=tgORx6MK1IDDtZKqGarj0IPIVjqAIEUXXYPiinhRdtI,5864
 liger_kernel/ops/sparsemax.py,sha256=AeWe1xgkHJFEKWTj2vu_0hj7LztGvjqXAps-QTpCY0U,5087
-liger_kernel/ops/swiglu.py,sha256=
+liger_kernel/ops/swiglu.py,sha256=D7nd4u_LInwsIRNCDdY77lqnTz8-W5dJrpEAt8zEO_A,3033
 liger_kernel/ops/tvd.py,sha256=FHJtLQI95ijqgg9UtaHpMAjSCiPxB6CduPwPMcGxelc,6405
 liger_kernel/ops/utils.py,sha256=uoFKQqo-34N2TWQNvXMFywqGiOMMXNEVBxVojzlUAa0,3836
 liger_kernel/ops/experimental/embedding.py,sha256=tolj3tItkzpSb30zWqDN2_yX4ectflaQ8HMyKyFIQc8,4172
@@ -53,7 +54,7 @@ liger_kernel/transformers/grpo_loss.py,sha256=uAkUNKSnUGEOqa82L9w2e6AI1kcmG8K45-
 liger_kernel/transformers/jsd.py,sha256=DGqRnxIZxsvxo0_tbbxX3b-sDbDjC_yKufyRIHCcScY,2979
 liger_kernel/transformers/kl_div.py,sha256=WLffFbh1EExD2Eb1F7lN11fo9JJC-0751WJjZAF1Fj8,409
 liger_kernel/transformers/layer_norm.py,sha256=c9pk3PEasOKYR0rhe5e5nNrnYKVCEW4VC8S6LpCq9EQ,906
-liger_kernel/transformers/monkey_patch.py,sha256=
+liger_kernel/transformers/monkey_patch.py,sha256=rXmaVry8hdpnH8HunfJhZmrsdlwAxjMP3x10ZYMnTy4,85554
 liger_kernel/transformers/multi_token_attention.py,sha256=l9VDICK0dfmifUDW668hGscP8AHq2rYcM2oGUa3baRQ,1751
 liger_kernel/transformers/qwen2vl_mrope.py,sha256=5EwSqrMdsL9MYspeBMXBsNJKvH0MOmRrtJXAJlnnlOI,1047
 liger_kernel/transformers/rms_norm.py,sha256=vkekcvTeWY8vL4H6hg3t0XeY0Ew_3OFMPHuzqlxPPVw,2719
@@ -65,21 +66,21 @@ liger_kernel/transformers/trainer_integration.py,sha256=W3ON51O5GkyzNJsItz0y5rKx
 liger_kernel/transformers/tvd.py,sha256=XrRfyJIqN6HFxXk8MYyFVZM1OLz3mtSbRZvWfZ_JerQ,450
 liger_kernel/transformers/experimental/embedding.py,sha256=2P0QYdlFyFrG5OqTzTa1wcRgDSyjBMv5i1a7BrDPDQw,881
 liger_kernel/transformers/model/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-liger_kernel/transformers/model/gemma.py,sha256=
-liger_kernel/transformers/model/gemma2.py,sha256=
+liger_kernel/transformers/model/gemma.py,sha256=mNX-mIwV6jI4zfbrUHp0C468pOmjzsL7mjXipGt-eS0,10007
+liger_kernel/transformers/model/gemma2.py,sha256=R_JFPyWTk7RyA7D05ZiIaNO5pX8gWcvfWf-6rdCRMxs,11296
 liger_kernel/transformers/model/gemma3.py,sha256=JI4jj9K660HeRsofB6cpkCHBQ0OsazElArRtKUehUmw,15945
 liger_kernel/transformers/model/glm4.py,sha256=GlnEhdGJuDIqp2R9qC54biY3HwV1tWmfpJm6ijoAsrM,5257
-liger_kernel/transformers/model/llama.py,sha256=
+liger_kernel/transformers/model/llama.py,sha256=i8jJgyZsMKWQ-zKloETLugtwFpUOdaWxLDceciFXKd4,12832
 liger_kernel/transformers/model/llama4.py,sha256=IgbB8sTh3dlETQnaNNy1bZLuXy-Nt7qmeAjF27ydGpg,4210
 liger_kernel/transformers/model/llava.py,sha256=bLCioday_SOm69ogMDBhy_4UsVkH2-BSl93-EXY6-7I,15076
 liger_kernel/transformers/model/loss_utils.py,sha256=WWAMdiONPaXpIvxyOim_0igLrYh0yyOok5Q9_L9xvZw,1787
-liger_kernel/transformers/model/mistral.py,sha256=
+liger_kernel/transformers/model/mistral.py,sha256=syYNL8dLThX2-4uC13Lu0krEZ5zw3InviDUR3AJmc-I,5500
 liger_kernel/transformers/model/mixtral.py,sha256=VY-y73IyjcCyWyI7ahxXLw0fJrhgjYfr1xwRYtsHX0o,11396
 liger_kernel/transformers/model/mllama.py,sha256=my29NXk-p6ckQaP8qDIN8e318yI_9mQZHt38MV3SqLY,11280
 liger_kernel/transformers/model/olmo2.py,sha256=6L_bo-ZUgO1lYppdJneOtYxNIylQKS6BiGp13g7Uq9E,5259
 liger_kernel/transformers/model/paligemma.py,sha256=xuIx3oOwTgftU3jqLfWOxUxgCLBNJh0yNC21an9qDjo,18773
-liger_kernel/transformers/model/phi3.py,sha256=
-liger_kernel/transformers/model/qwen2.py,sha256=
+liger_kernel/transformers/model/phi3.py,sha256=zAzBVNOA16B16yy2HWsEgOMHhLoYkpWOWPgBT4z95WI,10655
+liger_kernel/transformers/model/qwen2.py,sha256=3fpOTEOkniQmkCfN1KUa3KhseHJVzhj2Ht9FdYPUy-E,9962
 liger_kernel/transformers/model/qwen2_5_vl.py,sha256=zEVVwotCXnAm3RRc8-1Nc8uitSWrwW4B9dYY2uOZDwg,6331
 liger_kernel/transformers/model/qwen2_vl.py,sha256=5vK-vtCDpKZ2w33xYp2BS8kQYWUbKMqaiKvQcI27Mss,5884
 liger_kernel/transformers/model/qwen3.py,sha256=w2jBHuK9kK9EmOr5dnEIXNQXUgUSV_sJUkXSEwxLPHs,4885
@@ -88,9 +89,9 @@ liger_kernel/transformers/trainer/__init__.py,sha256=p7yQfklV8-467qSz_ZMimkbDF7H
 liger_kernel/transformers/trainer/orpo_trainer.py,sha256=tX0h63aOFe3rNqTmk6JpMf75UPo981yzEa6TghnjS0Q,5370
 liger_kernel/triton/__init__.py,sha256=qCiCamzCRv6lpV8IqpAc9YMdNKC7GKurClWceQPnlis,92
 liger_kernel/triton/monkey_patch.py,sha256=Rd0hUHAzDkFfHvnX7-PBaNK5EKnZhtfM_h-fgQH9HPY,1568
-liger_kernel_nightly-0.5.10.
-liger_kernel_nightly-0.5.10.
-liger_kernel_nightly-0.5.10.
-liger_kernel_nightly-0.5.10.
-liger_kernel_nightly-0.5.10.
-liger_kernel_nightly-0.5.10.
+liger_kernel_nightly-0.5.10.dev20250704061125.dist-info/LICENSE,sha256=OhzLDHJ0to4a8sodVLELZiCFylZ1NAAYLs-HrjPy0ag,1312
+liger_kernel_nightly-0.5.10.dev20250704061125.dist-info/METADATA,sha256=7mx4Zgy5kdvnanl50nrzJ9HE6vTou5oeeOLx45V_T1c,24536
+liger_kernel_nightly-0.5.10.dev20250704061125.dist-info/NOTICE,sha256=njwnoPZLh9AN8SJQzxvCGLHi-8X__AvWRze6joNXIY8,2066
+liger_kernel_nightly-0.5.10.dev20250704061125.dist-info/WHEEL,sha256=iAkIy5fosb7FzIOwONchHf19Qu7_1wCWyFNR5gu9nU0,91
+liger_kernel_nightly-0.5.10.dev20250704061125.dist-info/top_level.txt,sha256=2eghu4hA3LnkM7ElW92tQ8zegWKgSbeo-k-aGe1YnvY,13
+liger_kernel_nightly-0.5.10.dev20250704061125.dist-info/RECORD,,