liger-kernel-nightly 0.6.3.dev20251027181634__py3-none-any.whl → 0.6.3.dev20251028143010__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- liger_kernel/chunked_loss/cosine_similarity_loss.py +13 -4
- liger_kernel/chunked_loss/fused_linear_distillation.py +13 -2
- liger_kernel/chunked_loss/jsd_loss.py +18 -5
- liger_kernel/transformers/monkey_patch.py +52 -20
- {liger_kernel_nightly-0.6.3.dev20251027181634.dist-info → liger_kernel_nightly-0.6.3.dev20251028143010.dist-info}/METADATA +1 -1
- {liger_kernel_nightly-0.6.3.dev20251027181634.dist-info → liger_kernel_nightly-0.6.3.dev20251028143010.dist-info}/RECORD +10 -10
- {liger_kernel_nightly-0.6.3.dev20251027181634.dist-info → liger_kernel_nightly-0.6.3.dev20251028143010.dist-info}/LICENSE +0 -0
- {liger_kernel_nightly-0.6.3.dev20251027181634.dist-info → liger_kernel_nightly-0.6.3.dev20251028143010.dist-info}/NOTICE +0 -0
- {liger_kernel_nightly-0.6.3.dev20251027181634.dist-info → liger_kernel_nightly-0.6.3.dev20251028143010.dist-info}/WHEEL +0 -0
- {liger_kernel_nightly-0.6.3.dev20251027181634.dist-info → liger_kernel_nightly-0.6.3.dev20251028143010.dist-info}/top_level.txt +0 -0
liger_kernel/chunked_loss/cosine_similarity_loss.py

@@ -1,3 +1,6 @@
+from typing import Tuple
+from typing import Union
+
 import torch
 import torch.nn.functional as F

@@ -41,7 +44,8 @@ class LigerFusedLinearCosineSimilarityFunction(LigerFusedLinearDistillationBase)
         temperature: float = 1.0,
         compiled: bool = True,
         chunk_size: int = 1024,
-    ):
+        return_soft_hard_loss: bool = False,
+    ) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor, torch.Tensor]]:
         return super().forward(
             cls=cls,
             ctx=ctx,
@@ -59,11 +63,12 @@ class LigerFusedLinearCosineSimilarityFunction(LigerFusedLinearDistillationBase)
             ignore_index=ignore_index,
             temperature=temperature,
             compiled=compiled,
+            return_soft_hard_loss=return_soft_hard_loss,
         )

     @staticmethod
-    def backward(ctx, grad_output):
-        grads = LigerFusedLinearDistillationBase.backward(ctx, grad_output)[:6]
+    def backward(ctx, grad_output, *args):
+        grads = LigerFusedLinearDistillationBase.backward(ctx, grad_output, *args)[:6]

         return (
             *grads,
@@ -75,6 +80,7 @@ class LigerFusedLinearCosineSimilarityFunction(LigerFusedLinearDistillationBase)
             None, # temperature
             None, # compiled
             None, # chunk_size
+            None, # return_soft_hard_loss
         )


@@ -88,6 +94,7 @@ class LigerFusedLinearCosineSimilarityLoss(torch.nn.Module):
         temperature: float = 1.0,
         compiled: bool = True,
         chunk_size: int = 1024,
+        return_soft_hard_loss: bool = False,
     ):
         super().__init__()
         assert temperature != 0, "Temperature cannot be 0."
@@ -98,6 +105,7 @@ class LigerFusedLinearCosineSimilarityLoss(torch.nn.Module):
         self.compiled = compiled
         self.beta = beta
         self.chunk_size = chunk_size
+        self.return_soft_hard_loss = return_soft_hard_loss

     def forward(
         self,
@@ -108,7 +116,7 @@ class LigerFusedLinearCosineSimilarityLoss(torch.nn.Module):
         true_labels: torch.LongTensor,
         student_bias: torch.Tensor = None,
         teacher_bias: torch.Tensor = None,
-    ) -> torch.Tensor:
+    ) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor, torch.Tensor]]:
         return LigerFusedLinearCosineSimilarityFunction.apply(
             student_input,
             student_weight,
@@ -124,4 +132,5 @@ class LigerFusedLinearCosineSimilarityLoss(torch.nn.Module):
             self.temperature,
             self.compiled,
             self.chunk_size,
+            self.return_soft_hard_loss,
         )
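The practical effect of these changes is that LigerFusedLinearCosineSimilarityLoss can now hand back the individual loss components. A minimal usage sketch (not taken from the package) follows; the tensor shapes are toy values, the positional argument order is assumed to follow the usual Liger distillation signature (student_input, student_weight, teacher_input, teacher_weight, true_labels), and compiled=False is passed only so the sketch does not depend on torch.compile.

import torch

from liger_kernel.chunked_loss.cosine_similarity_loss import LigerFusedLinearCosineSimilarityLoss

# Toy sizes (illustrative only): batch*seq_len, hidden dim, vocab size
BT, H, V = 8, 32, 64
student_input = torch.randn(BT, H, requires_grad=True)
student_weight = torch.randn(V, H, requires_grad=True)
teacher_input = torch.randn(BT, H)
teacher_weight = torch.randn(V, H)
true_labels = torch.randint(0, V, (BT,))

loss_fn = LigerFusedLinearCosineSimilarityLoss(return_soft_hard_loss=True, compiled=False)
# With the new flag, forward returns (combined, soft, hard) instead of a single tensor
loss, soft_loss, hard_loss = loss_fn(student_input, student_weight, teacher_input, teacher_weight, true_labels)
loss.backward()  # gradients still flow through the combined loss only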
liger_kernel/chunked_loss/fused_linear_distillation.py

@@ -1,5 +1,7 @@
 from abc import abstractmethod
 from functools import partial
+from typing import Tuple
+from typing import Union

 import torch

@@ -157,8 +159,9 @@ class LigerFusedLinearDistillationBase(torch.autograd.Function):
         compute_ce_loss=True,
         temperature=1.0,
         compiled=True,
+        return_soft_hard_loss=False,
         **loss_kwargs,
-    ):
+    ) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor, torch.Tensor]]:
         """
         Base class for fused linear layer with distillation loss.
         Only need to compute gradients for student model.
@@ -180,6 +183,7 @@ class LigerFusedLinearDistillationBase(torch.autograd.Function):
             compute_ce_loss (bool): Whether to compute CE loss.
             temperature (float): Temperature to control the input probability distribution. Default: `1.0` (i.e. no scale)
             compiled (bool): Whether to use torch compile for chunk accumulation.
+            return_soft_hard_loss (bool): Whether to return soft and hard losses separately. Default: False.
             loss_kwargs (dict): Other possible arguments that a loss function might need
         """
         CHUNK_SIZE = chunk_size
@@ -187,6 +191,8 @@ class LigerFusedLinearDistillationBase(torch.autograd.Function):
         grad_inputs = []
         grad_bias = torch.zeros_like(student_bias) if student_bias is not None else None
         loss_acc = torch.zeros((), device=student_input.device)
+        soft_loss_acc = torch.zeros((), device=student_input.device) if return_soft_hard_loss else None
+        hard_loss_acc = torch.zeros((), device=student_input.device) if return_soft_hard_loss else None

         loss_func_to_call = partial(
             LigerFusedLinearDistillationBase._compute_loss,
@@ -247,6 +253,9 @@ class LigerFusedLinearDistillationBase(torch.autograd.Function):
             )
             grad_weight.add_(chunk_grad_weight)
             loss_acc.add_(chunk_loss)
+            if return_soft_hard_loss:
+                soft_loss_acc.add_(chunk_soft_loss)
+                hard_loss_acc.add_(chunk_hard_loss)
             return chunk_grad_input

         if compiled:
@@ -268,10 +277,12 @@ class LigerFusedLinearDistillationBase(torch.autograd.Function):
             grad_weight,
             grad_bias,
         )
+        if return_soft_hard_loss:
+            return loss_acc, soft_loss_acc, hard_loss_acc
         return loss_acc

     @staticmethod
-    def backward(ctx, grad_output):
+    def backward(ctx, grad_output, *args):
         grad_input, grad_weight, grad_bias = ctx.saved_tensors
         if torch.ne(grad_output, torch.tensor(1.0, device=grad_output.device)):
             grad_input = grad_input * grad_output
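The backward(ctx, grad_output, *args) change follows from a PyTorch autograd rule: a custom Function that returns N outputs from forward receives N gradient arguments in backward. Since forward can now return three tensors, the added *args simply absorbs the gradients for the extra soft and hard outputs, which are not used. A toy illustration of that convention (not liger-kernel code, just the same pattern on a scalar combined output):

import torch

class ToyMultiOutput(torch.autograd.Function):
    # Mirrors the pattern above: three outputs in forward, so backward
    # receives three gradient arguments; only the first one is used.
    @staticmethod
    def forward(ctx, x):
        combined = x.sum()
        soft = x.mean()
        hard = x.amax()
        return combined, soft, hard

    @staticmethod
    def backward(ctx, grad_combined, *extra_grads):
        # extra_grads holds the (ignored) gradients of the soft and hard outputs
        return grad_combined * torch.ones(3)

x = torch.ones(3, requires_grad=True)
combined, soft, hard = ToyMultiOutput.apply(x)
combined.backward()
print(x.grad)  # tensor([1., 1., 1.])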
@@ -1,5 +1,8 @@
|
|
|
1
1
|
import math
|
|
2
2
|
|
|
3
|
+
from typing import Tuple
|
|
4
|
+
from typing import Union
|
|
5
|
+
|
|
3
6
|
import torch
|
|
4
7
|
import torch.nn.functional as F
|
|
5
8
|
|
|
@@ -56,6 +59,7 @@ class LigerFusedLinearJSDFunction(LigerFusedLinearDistillationBase):
|
|
|
56
59
|
temperature: float = 1.0,
|
|
57
60
|
compiled: bool = True,
|
|
58
61
|
chunk_size: int = 1024,
|
|
62
|
+
return_soft_hard_loss: bool = False,
|
|
59
63
|
):
|
|
60
64
|
"""
|
|
61
65
|
Fused linear layer with JSD distillation loss.
|
|
@@ -72,8 +76,9 @@ class LigerFusedLinearJSDFunction(LigerFusedLinearDistillationBase):
|
|
|
72
76
|
temperature (float): Temperature for softening/sharpening distributions
|
|
73
77
|
compiled (bool): Whether to use torch compile
|
|
74
78
|
chunk_size (int): Size of chunks for processing.
|
|
79
|
+
return_soft_hard_loss (bool): Whether to return soft and hard losses separately. Default: False.
|
|
75
80
|
Returns:
|
|
76
|
-
torch.Tensor: Computed loss
|
|
81
|
+
torch.Tensor: Computed loss, or tuple (loss, soft_loss, hard_loss) if return_soft_hard_loss=True
|
|
77
82
|
"""
|
|
78
83
|
return super().forward(
|
|
79
84
|
cls=cls,
|
|
@@ -92,11 +97,12 @@ class LigerFusedLinearJSDFunction(LigerFusedLinearDistillationBase):
|
|
|
92
97
|
ignore_index=ignore_index,
|
|
93
98
|
temperature=temperature,
|
|
94
99
|
compiled=compiled,
|
|
100
|
+
return_soft_hard_loss=return_soft_hard_loss,
|
|
95
101
|
)
|
|
96
102
|
|
|
97
103
|
@staticmethod
|
|
98
|
-
def backward(ctx, grad_output):
|
|
99
|
-
grads = LigerFusedLinearDistillationBase.backward(ctx, grad_output)[:6]
|
|
104
|
+
def backward(ctx, grad_output, *args):
|
|
105
|
+
grads = LigerFusedLinearDistillationBase.backward(ctx, grad_output, *args)[:6]
|
|
100
106
|
|
|
101
107
|
return (
|
|
102
108
|
*grads,
|
|
@@ -108,6 +114,7 @@ class LigerFusedLinearJSDFunction(LigerFusedLinearDistillationBase):
|
|
|
108
114
|
None, # temperature
|
|
109
115
|
None, # compiled
|
|
110
116
|
None, # chunk_size
|
|
117
|
+
None, # return_soft_hard_loss
|
|
111
118
|
)
|
|
112
119
|
|
|
113
120
|
|
|
@@ -125,6 +132,7 @@ class LigerFusedLinearJSDLoss(torch.nn.Module):
|
|
|
125
132
|
temperature: float = 1.0,
|
|
126
133
|
compiled: bool = True,
|
|
127
134
|
chunk_size: int = 1024,
|
|
135
|
+
return_soft_hard_loss: bool = False,
|
|
128
136
|
):
|
|
129
137
|
"""
|
|
130
138
|
Args:
|
|
@@ -135,6 +143,7 @@ class LigerFusedLinearJSDLoss(torch.nn.Module):
|
|
|
135
143
|
compiled (bool): Whether to use torch compile
|
|
136
144
|
beta (float): Coefficient beta of generalized JSD in the interval [0, 1]. Default: `0.5`.
|
|
137
145
|
chunk_size (int): Size of chunks for processing.
|
|
146
|
+
return_soft_hard_loss (bool): Whether to return soft and hard losses separately. Default: False.
|
|
138
147
|
"""
|
|
139
148
|
super().__init__()
|
|
140
149
|
assert temperature != 0, "Temperature cannot be 0."
|
|
@@ -145,6 +154,7 @@ class LigerFusedLinearJSDLoss(torch.nn.Module):
|
|
|
145
154
|
self.compiled = compiled
|
|
146
155
|
self.beta = beta
|
|
147
156
|
self.chunk_size = chunk_size
|
|
157
|
+
self.return_soft_hard_loss = return_soft_hard_loss
|
|
148
158
|
|
|
149
159
|
def forward(
|
|
150
160
|
self,
|
|
@@ -155,7 +165,7 @@ class LigerFusedLinearJSDLoss(torch.nn.Module):
|
|
|
155
165
|
true_labels: torch.LongTensor,
|
|
156
166
|
student_bias: torch.Tensor = None,
|
|
157
167
|
teacher_bias: torch.Tensor = None,
|
|
158
|
-
) -> torch.Tensor:
|
|
168
|
+
) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor, torch.Tensor]]:
|
|
159
169
|
"""
|
|
160
170
|
Compute the JSD distillation loss.
|
|
161
171
|
|
|
@@ -167,7 +177,9 @@ class LigerFusedLinearJSDLoss(torch.nn.Module):
|
|
|
167
177
|
true_labels (torch.LongTensor): Target labels tensor
|
|
168
178
|
|
|
169
179
|
Returns:
|
|
170
|
-
torch.Tensor
|
|
180
|
+
torch.Tensor or Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
|
|
181
|
+
If return_soft_hard_loss is False: Computed combined loss
|
|
182
|
+
If return_soft_hard_loss is True: Tuple of (combined_loss, soft_loss, hard_loss)
|
|
171
183
|
"""
|
|
172
184
|
return LigerFusedLinearJSDFunction.apply(
|
|
173
185
|
student_input,
|
|
@@ -184,4 +196,5 @@ class LigerFusedLinearJSDLoss(torch.nn.Module):
|
|
|
184
196
|
self.temperature,
|
|
185
197
|
self.compiled,
|
|
186
198
|
self.chunk_size,
|
|
199
|
+
self.return_soft_hard_loss,
|
|
187
200
|
)
|
|
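As with the cosine-similarity loss, LigerFusedLinearJSDLoss can now expose the distillation (soft) and label (hard) components separately, which is mainly useful for logging. A small sketch under the same assumptions as before (toy shapes, assumed argument order, compiled=False to sidestep torch.compile):

import torch

from liger_kernel.chunked_loss.jsd_loss import LigerFusedLinearJSDLoss

BT, H, V = 8, 32, 64  # toy sizes
student_input = torch.randn(BT, H, requires_grad=True)
student_weight = torch.randn(V, H, requires_grad=True)
teacher_input = torch.randn(BT, H)
teacher_weight = torch.randn(V, H)
true_labels = torch.randint(0, V, (BT,))

jsd_loss = LigerFusedLinearJSDLoss(return_soft_hard_loss=True, compiled=False)
combined, soft, hard = jsd_loss(student_input, student_weight, teacher_input, teacher_weight, true_labels)
# Separate components make it easy to log distillation vs. label loss
print(f"combined={combined.item():.4f} soft={soft.item():.4f} hard={hard.item():.4f}")
combined.backward()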
liger_kernel/transformers/monkey_patch.py

@@ -2038,6 +2038,7 @@ def apply_liger_kernel_to_internvl(
     cross_entropy: bool = False,
     fused_linear_cross_entropy: bool = True,
     rms_norm: bool = True,
+    layer_norm: bool = True,
     model: Optional[PreTrainedModel] = None,
     **kwargs,
 ) -> None:
@@ -2048,37 +2049,60 @@ def apply_liger_kernel_to_internvl(
     NOTE: InternVL is not available in transformers<4.52.1

     Args:
-        rope (bool): Whether to apply Liger's rotary position embedding. Default is True.
         cross_entropy (bool): Whether to apply Liger's cross entropy loss. Default is False.
         fused_linear_cross_entropy (bool):
             Whether to apply Liger's fused linear cross entropy loss. Default is True.
             `cross_entropy` and `fused_linear_cross_entropy` cannot both be True.
             If `fused_linear_cross_entropy` is True, the logits will not be materialized but more memory efficient.
         rms_norm (bool): Whether to apply Liger's RMSNorm. Default is True.
-
+        layer_norm (bool): Whether to apply Liger's LayerNorm. Default is True.
         model (PreTrainedModel): The model instance to apply Liger kernels to, if the model has already been
             loaded. Default is None.
     """
     assert not (cross_entropy and fused_linear_cross_entropy), (
         "cross_entropy and fused_linear_cross_entropy cannot both be True."
     )
+    import torch.nn as torch_nn

     from transformers.models.internvl import modeling_internvl
+    from transformers.models.internvl.modeling_internvl import InternVLForConditionalGeneration
+    from transformers.models.internvl.modeling_internvl import InternVLModel
+    from transformers.models.internvl.modeling_internvl import InternVLVisionLayer
+    from transformers.models.internvl.modeling_internvl import InternVLVisionModel
+    from transformers.models.internvl.modeling_internvl import InternVLVisionRMSNorm

+    from liger_kernel.transformers.layer_norm import LigerLayerNorm
     from liger_kernel.transformers.model.internvl import lce_forward as internvl_lce_forward
+    from liger_kernel.transformers.rms_norm import LigerRMSNorm
+
+    if layer_norm and model is None:
+        modeling_internvl.nn.LayerNorm = LigerLayerNorm

     if cross_entropy:
-        logger.
-
+        logger.info("Apply liger cross entropy")
+
+        from transformers.loss.loss_utils import nn
+
+        nn.functional.cross_entropy = liger_cross_entropy
     if fused_linear_cross_entropy:
         modeling_internvl.InternVLForConditionalGeneration.forward = internvl_lce_forward
     if rms_norm:
         modeling_internvl.InternVLVisionRMSNorm = LigerRMSNorm

     if model is not None:
-
+        # The model instance already exists, so we need to additionally patch the
+        # instance variables that reference already-instantiated modules
+        if isinstance(model, (InternVLForConditionalGeneration, InternVLModel)):
+            # NOTE: language_model and visual properties can be accessed throught conditional class.
+            text_model = model.language_model
+            vision_model: InternVLVisionModel = model.vision_tower
+        else:
+            raise TypeError(
+                f"Unsupported internvl model type. `model` must be `InternVLForConditionalGeneration`, `InternVLModel`. Got: {type(model)}"
+            )
+
+        text_model_name = model.config.text_config.model_type
         text_liger_fn = MODEL_TYPE_TO_APPLY_LIGER_FN.get(text_model_name, None)
-        vision_liger_fn = MODEL_TYPE_TO_APPLY_LIGER_FN.get(vision_model_name, None)

         kwargs = {"cross_entropy": False, "fused_linear_cross_entropy": False, **kwargs} | {"rms_norm": rms_norm}
         if text_liger_fn:
@@ -2091,25 +2115,33 @@ def apply_liger_kernel_to_internvl(
                 f"These parameters are not supported by {text_model_name}. Enter the remaining {list(text_kwargs.keys())} except for {list(remain_params)}\n"
                 f"Parameters accepted by {text_model_name}: {list(accept_params.keys())}"
             )
-            text_kwargs["model"] =
+            text_kwargs["model"] = text_model
             text_liger_fn(**text_kwargs)
         elif text_model_name not in MODEL_TYPE_TO_APPLY_LIGER_FN:
             logger.warning(f"{text_model_name} is not supported by Liger kernel.")

-
-
-
-
+        # Patch vision model RMSNorm layers
+        if rms_norm:
+            for encoder_layer in vision_model.encoder.layer:
+                encoder_layer: InternVLVisionLayer
+                if isinstance(encoder_layer.attention.q_norm, InternVLVisionRMSNorm):
+                    _patch_rms_norm_module(encoder_layer.attention.q_norm)
+                if isinstance(encoder_layer.attention.k_norm, InternVLVisionRMSNorm):
+                    _patch_rms_norm_module(encoder_layer.attention.k_norm)

-
-
-
-
-        )
-
-
-
-
+        # Patch vision model LayerNorm layers
+        if layer_norm:
+            # Patch layernorm
+            if isinstance(vision_model.layernorm, torch_nn.LayerNorm):
+                _patch_layer_norm_module(vision_model.layernorm)
+
+            # Patch encoder layers
+            for encoder_layer in vision_model.encoder.layer:
+                encoder_layer: InternVLVisionLayer
+                if isinstance(encoder_layer.layernorm_before, torch_nn.LayerNorm):
+                    _patch_layer_norm_module(encoder_layer.layernorm_before)
+                if isinstance(encoder_layer.layernorm_after, torch_nn.LayerNorm):
+                    _patch_layer_norm_module(encoder_layer.layernorm_after)


 def apply_liger_kernel_to_smolvlm(
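For the monkey patch, the new layer_norm flag mirrors the existing rms_norm handling: when no model instance is given, nn.LayerNorm is swapped at the module level before construction; when a model is passed, the already-instantiated LayerNorm modules inside the vision tower are patched in place. A hedged usage sketch (requires transformers>=4.52.1 with InternVL support; the checkpoint name below is a placeholder, not taken from this diff):

from liger_kernel.transformers.monkey_patch import apply_liger_kernel_to_internvl

# Patch at class level before the model is constructed
apply_liger_kernel_to_internvl(
    fused_linear_cross_entropy=True,
    rms_norm=True,
    layer_norm=True,  # new flag introduced in this diff
)

# Or patch an already-loaded model instance (placeholder checkpoint name):
# from transformers import AutoModelForImageTextToText
# model = AutoModelForImageTextToText.from_pretrained("OpenGVLab/InternVL3-1B-hf")
# apply_liger_kernel_to_internvl(model=model, layer_norm=True)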
{liger_kernel_nightly-0.6.3.dev20251027181634.dist-info → liger_kernel_nightly-0.6.3.dev20251028143010.dist-info}/RECORD

@@ -3,16 +3,16 @@ liger_kernel/env_report.py,sha256=uhdEC8OydxoZlb7B6YYcAaBF3crGFdIck-4cxaW4NJY,17
 liger_kernel/utils.py,sha256=BQleeZWHSZPNuPcYcoZTOp1kcNEZONZilPP5-AmjgWI,2024
 liger_kernel/chunked_loss/README.md,sha256=0FmkFC3hKBqyoDT5uTlIYmrvRkF-EOCR1y-EBU1LpWU,2248
 liger_kernel/chunked_loss/__init__.py,sha256=J5_jNnzZ4gZmA38W5f_4oab7xMoNk1Xy-yh3X_Xlf-s,714
-liger_kernel/chunked_loss/cosine_similarity_loss.py,sha256=
+liger_kernel/chunked_loss/cosine_similarity_loss.py,sha256=x2nprTHPraU8Ya2NMZtaDk9r-s-1NKJwCTrzQIdmg-8,4680
 liger_kernel/chunked_loss/cpo_loss.py,sha256=Gzz1eU4kgcbdubFVRy55e8A1Cr-r45UgNicXwZIjmBU,5454
 liger_kernel/chunked_loss/dpo_loss.py,sha256=I83khNs3QQjuhr8U3NIOAACkbse6DNiBV-TulPZ0lXw,9006
 liger_kernel/chunked_loss/functional.py,sha256=-XPDbLml9dHmvoSU2VNTUrBDFehuzvuAGPikVetBMtI,1132
-liger_kernel/chunked_loss/fused_linear_distillation.py,sha256=
+liger_kernel/chunked_loss/fused_linear_distillation.py,sha256=yRtolfFGfKB-SxGQQyF68GYXd11Zlvh1InLdGeWNFIE,12652
 liger_kernel/chunked_loss/fused_linear_ppo.py,sha256=ZjpNP5VC-tXXIKb4AckkQ3iWWQeej-JoG4StJq3N0wg,13650
 liger_kernel/chunked_loss/fused_linear_preference.py,sha256=FIH85uUXAOgYx5Ax8MjFhJHVu-2pKtY7wSegd0zSyyY,18336
 liger_kernel/chunked_loss/fused_linear_unpaired_preference.py,sha256=RiuK3UtRwH9T6jZ36sA8Urj-TVuOLOO2syLg_JOQapY,13437
 liger_kernel/chunked_loss/grpo_loss.py,sha256=SkZuKoW8K94UbWR-OtfopsQkuQ8tFOr_90AGR6_Mhes,12844
-liger_kernel/chunked_loss/jsd_loss.py,sha256=
+liger_kernel/chunked_loss/jsd_loss.py,sha256=G0RghPYYelyZ6DOEiwS8we9TT5MY2iHpiFqzZ2Xy87g,8038
 liger_kernel/chunked_loss/kto_loss.py,sha256=llVCe6DkcpCo57seGWoMikaQVFApx764jsmSbQyqwQY,7529
 liger_kernel/chunked_loss/orpo_loss.py,sha256=nu9UYG16dcMw93lvHi4_hYs3Q0FK1KnlmMRj7OpYU8s,4872
 liger_kernel/chunked_loss/simpo_loss.py,sha256=fy2w8KbhMrBv7b1jdIeH3bBFxY52bPQPZb3KwBvmurM,5385
@@ -59,7 +59,7 @@ liger_kernel/transformers/jsd.py,sha256=DGqRnxIZxsvxo0_tbbxX3b-sDbDjC_yKufyRIHCc
 liger_kernel/transformers/kl_div.py,sha256=WLffFbh1EExD2Eb1F7lN11fo9JJC-0751WJjZAF1Fj8,409
 liger_kernel/transformers/layer_norm.py,sha256=c9pk3PEasOKYR0rhe5e5nNrnYKVCEW4VC8S6LpCq9EQ,906
 liger_kernel/transformers/llama4_rope.py,sha256=kS6PSHEwf3dS7hD7C7p8S0geugx2EMCiP0h0F7LsUoY,3639
-liger_kernel/transformers/monkey_patch.py,sha256=
+liger_kernel/transformers/monkey_patch.py,sha256=3DLFMn2VusVcR6C5YElfpHJBRoJxvho0a2JoVdGqxHA,117266
 liger_kernel/transformers/multi_token_attention.py,sha256=K3NIY9_5TPgZ4_Rahn0xnkMXxD_fmlJHK4CWGYvGQp0,1752
 liger_kernel/transformers/poly_norm.py,sha256=g5tC75i3qy1_N26ZUP-jfpct7ivQAEdJfIfx8IXzeyE,1377
 liger_kernel/transformers/qwen2vl_mrope.py,sha256=5EwSqrMdsL9MYspeBMXBsNJKvH0MOmRrtJXAJlnnlOI,1047
@@ -103,9 +103,9 @@ liger_kernel/transformers/trainer/__init__.py,sha256=p7yQfklV8-467qSz_ZMimkbDF7H
 liger_kernel/transformers/trainer/orpo_trainer.py,sha256=tX0h63aOFe3rNqTmk6JpMf75UPo981yzEa6TghnjS0Q,5370
 liger_kernel/triton/__init__.py,sha256=qCiCamzCRv6lpV8IqpAc9YMdNKC7GKurClWceQPnlis,92
 liger_kernel/triton/monkey_patch.py,sha256=Rd0hUHAzDkFfHvnX7-PBaNK5EKnZhtfM_h-fgQH9HPY,1568
-liger_kernel_nightly-0.6.3.
-liger_kernel_nightly-0.6.3.
-liger_kernel_nightly-0.6.3.
-liger_kernel_nightly-0.6.3.
-liger_kernel_nightly-0.6.3.
-liger_kernel_nightly-0.6.3.
+liger_kernel_nightly-0.6.3.dev20251028143010.dist-info/LICENSE,sha256=OhzLDHJ0to4a8sodVLELZiCFylZ1NAAYLs-HrjPy0ag,1312
+liger_kernel_nightly-0.6.3.dev20251028143010.dist-info/METADATA,sha256=ckNo8u8rwQ-UDuznWIg4v4k6i6eePViOYnkx9cshTd8,24777
+liger_kernel_nightly-0.6.3.dev20251028143010.dist-info/NOTICE,sha256=njwnoPZLh9AN8SJQzxvCGLHi-8X__AvWRze6joNXIY8,2066
+liger_kernel_nightly-0.6.3.dev20251028143010.dist-info/WHEEL,sha256=iAkIy5fosb7FzIOwONchHf19Qu7_1wCWyFNR5gu9nU0,91
+liger_kernel_nightly-0.6.3.dev20251028143010.dist-info/top_level.txt,sha256=2eghu4hA3LnkM7ElW92tQ8zegWKgSbeo-k-aGe1YnvY,13
+liger_kernel_nightly-0.6.3.dev20251028143010.dist-info/RECORD,,