cciwon-code-review-cli 2.0.2 → 2.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/lib/chat-mode.js +7 -2
- package/package.json +1 -1
- package/unsloth_compiled_cache/.locks/.lock.AqlmLoraLinear_peft_forward.py +0 -0
- package/unsloth_compiled_cache/.locks/.lock.AwqLoraLinear_peft_forward.py +0 -0
- package/unsloth_compiled_cache/.locks/.lock.BatchNorm1d.py +0 -0
- package/unsloth_compiled_cache/.locks/.lock.BatchNorm2d.py +0 -0
- package/unsloth_compiled_cache/.locks/.lock.BatchNorm3d.py +0 -0
- package/unsloth_compiled_cache/.locks/.lock.Conv1d.py +0 -0
- package/unsloth_compiled_cache/.locks/.lock.Conv2d.py +0 -0
- package/unsloth_compiled_cache/.locks/.lock.Conv3d.py +0 -0
- package/unsloth_compiled_cache/.locks/.lock.ConvTranspose1d.py +0 -0
- package/unsloth_compiled_cache/.locks/.lock.ConvTranspose2d.py +0 -0
- package/unsloth_compiled_cache/.locks/.lock.ConvTranspose3d.py +0 -0
- package/unsloth_compiled_cache/.locks/.lock.GPTQLoraLinear_peft_forward.py +0 -0
- package/unsloth_compiled_cache/.locks/.lock.GroupNorm.py +0 -0
- package/unsloth_compiled_cache/.locks/.lock.LayerNorm.py +0 -0
- package/unsloth_compiled_cache/.locks/.lock.Linear4bit_peft_forward.py +0 -0
- package/unsloth_compiled_cache/.locks/.lock.Linear8bitLt_peft_forward.py +0 -0
- package/unsloth_compiled_cache/.locks/.lock.Linear_peft_forward.py +0 -0
- package/unsloth_compiled_cache/.locks/.lock.LoraParallelLinear_peft_forward.py +0 -0
- package/unsloth_compiled_cache/.locks/.lock.RMSNorm.py +0 -0
- package/unsloth_compiled_cache/.locks/.lock.UnslothBCOTrainer.py +0 -0
- package/unsloth_compiled_cache/.locks/.lock.UnslothCPOTrainer.py +0 -0
- package/unsloth_compiled_cache/.locks/.lock.UnslothDPOTrainer.py +0 -0
- package/unsloth_compiled_cache/.locks/.lock.UnslothGKDTrainer.py +0 -0
- package/unsloth_compiled_cache/.locks/.lock.UnslothGRPOTrainer.py +0 -0
- package/unsloth_compiled_cache/.locks/.lock.UnslothKTOTrainer.py +0 -0
- package/unsloth_compiled_cache/.locks/.lock.UnslothNashMDTrainer.py +0 -0
- package/unsloth_compiled_cache/.locks/.lock.UnslothORPOTrainer.py +0 -0
- package/unsloth_compiled_cache/.locks/.lock.UnslothOnlineDPOTrainer.py +0 -0
- package/unsloth_compiled_cache/.locks/.lock.UnslothPPOTrainer.py +0 -0
- package/unsloth_compiled_cache/.locks/.lock.UnslothPRMTrainer.py +0 -0
- package/unsloth_compiled_cache/.locks/.lock.UnslothRLOOTrainer.py +0 -0
- package/unsloth_compiled_cache/.locks/.lock.UnslothRewardTrainer.py +0 -0
- package/unsloth_compiled_cache/.locks/.lock.UnslothSFTTrainer.py +0 -0
- package/unsloth_compiled_cache/.locks/.lock.UnslothXPOTrainer.py +0 -0
- package/unsloth_compiled_cache/.locks/.lock.unsloth_compiled_module_qwen3_moe.py +0 -0
- package/unsloth_compiled_cache/.locks/.lock.unsloth_compiled_module_siglip.py +0 -0
- package/unsloth_compiled_cache/AqlmLoraLinear_peft_forward.py +88 -0
- package/unsloth_compiled_cache/AwqLoraLinear_peft_forward.py +87 -0
- package/unsloth_compiled_cache/BatchNorm1d.py +117 -0
- package/unsloth_compiled_cache/BatchNorm2d.py +117 -0
- package/unsloth_compiled_cache/BatchNorm3d.py +117 -0
- package/unsloth_compiled_cache/Conv1d.py +70 -0
- package/unsloth_compiled_cache/Conv2d.py +70 -0
- package/unsloth_compiled_cache/Conv3d.py +70 -0
- package/unsloth_compiled_cache/ConvTranspose1d.py +97 -0
- package/unsloth_compiled_cache/ConvTranspose2d.py +106 -0
- package/unsloth_compiled_cache/ConvTranspose3d.py +98 -0
- package/unsloth_compiled_cache/GPTQLoraLinear_peft_forward.py +95 -0
- package/unsloth_compiled_cache/GroupNorm.py +70 -0
- package/unsloth_compiled_cache/LayerNorm.py +72 -0
- package/unsloth_compiled_cache/Linear4bit_peft_forward.py +115 -0
- package/unsloth_compiled_cache/Linear8bitLt_peft_forward.py +113 -0
- package/unsloth_compiled_cache/Linear_peft_forward.py +104 -0
- package/unsloth_compiled_cache/LoraParallelLinear_peft_forward.py +91 -0
- package/unsloth_compiled_cache/RMSNorm.py +73 -0
- package/unsloth_compiled_cache/UnslothBCOTrainer.py +2026 -0
- package/unsloth_compiled_cache/UnslothCPOTrainer.py +1806 -0
- package/unsloth_compiled_cache/UnslothDPOTrainer.py +2750 -0
- package/unsloth_compiled_cache/UnslothGKDTrainer.py +1157 -0
- package/unsloth_compiled_cache/UnslothGRPOTrainer.py +3607 -0
- package/unsloth_compiled_cache/UnslothKTOTrainer.py +2220 -0
- package/unsloth_compiled_cache/UnslothNashMDTrainer.py +1210 -0
- package/unsloth_compiled_cache/UnslothORPOTrainer.py +1730 -0
- package/unsloth_compiled_cache/UnslothOnlineDPOTrainer.py +2313 -0
- package/unsloth_compiled_cache/UnslothPPOTrainer.py +1504 -0
- package/unsloth_compiled_cache/UnslothPRMTrainer.py +979 -0
- package/unsloth_compiled_cache/UnslothRLOOTrainer.py +2674 -0
- package/unsloth_compiled_cache/UnslothRewardTrainer.py +1197 -0
- package/unsloth_compiled_cache/UnslothSFTTrainer.py +1416 -0
- package/unsloth_compiled_cache/UnslothXPOTrainer.py +1255 -0
- package/unsloth_compiled_cache/__pycache__/AqlmLoraLinear_peft_forward.cpython-312.pyc +0 -0
- package/unsloth_compiled_cache/__pycache__/AwqLoraLinear_peft_forward.cpython-312.pyc +0 -0
- package/unsloth_compiled_cache/__pycache__/BatchNorm1d.cpython-312.pyc +0 -0
- package/unsloth_compiled_cache/__pycache__/BatchNorm2d.cpython-312.pyc +0 -0
- package/unsloth_compiled_cache/__pycache__/BatchNorm3d.cpython-312.pyc +0 -0
- package/unsloth_compiled_cache/__pycache__/Conv1d.cpython-312.pyc +0 -0
- package/unsloth_compiled_cache/__pycache__/Conv2d.cpython-312.pyc +0 -0
- package/unsloth_compiled_cache/__pycache__/Conv3d.cpython-312.pyc +0 -0
- package/unsloth_compiled_cache/__pycache__/ConvTranspose1d.cpython-312.pyc +0 -0
- package/unsloth_compiled_cache/__pycache__/ConvTranspose2d.cpython-312.pyc +0 -0
- package/unsloth_compiled_cache/__pycache__/ConvTranspose3d.cpython-312.pyc +0 -0
- package/unsloth_compiled_cache/__pycache__/GPTQLoraLinear_peft_forward.cpython-312.pyc +0 -0
- package/unsloth_compiled_cache/__pycache__/GroupNorm.cpython-312.pyc +0 -0
- package/unsloth_compiled_cache/__pycache__/LayerNorm.cpython-312.pyc +0 -0
- package/unsloth_compiled_cache/__pycache__/Linear4bit_peft_forward.cpython-312.pyc +0 -0
- package/unsloth_compiled_cache/__pycache__/Linear8bitLt_peft_forward.cpython-312.pyc +0 -0
- package/unsloth_compiled_cache/__pycache__/Linear_peft_forward.cpython-312.pyc +0 -0
- package/unsloth_compiled_cache/__pycache__/LoraParallelLinear_peft_forward.cpython-312.pyc +0 -0
- package/unsloth_compiled_cache/__pycache__/RMSNorm.cpython-312.pyc +0 -0
- package/unsloth_compiled_cache/__pycache__/UnslothBCOTrainer.cpython-312.pyc +0 -0
- package/unsloth_compiled_cache/__pycache__/UnslothCPOTrainer.cpython-312.pyc +0 -0
- package/unsloth_compiled_cache/__pycache__/UnslothDPOTrainer.cpython-312.pyc +0 -0
- package/unsloth_compiled_cache/__pycache__/UnslothGKDTrainer.cpython-312.pyc +0 -0
- package/unsloth_compiled_cache/__pycache__/UnslothGRPOTrainer.cpython-312.pyc +0 -0
- package/unsloth_compiled_cache/__pycache__/UnslothKTOTrainer.cpython-312.pyc +0 -0
- package/unsloth_compiled_cache/__pycache__/UnslothNashMDTrainer.cpython-312.pyc +0 -0
- package/unsloth_compiled_cache/__pycache__/UnslothORPOTrainer.cpython-312.pyc +0 -0
- package/unsloth_compiled_cache/__pycache__/UnslothOnlineDPOTrainer.cpython-312.pyc +0 -0
- package/unsloth_compiled_cache/__pycache__/UnslothPPOTrainer.cpython-312.pyc +0 -0
- package/unsloth_compiled_cache/__pycache__/UnslothPRMTrainer.cpython-312.pyc +0 -0
- package/unsloth_compiled_cache/__pycache__/UnslothRLOOTrainer.cpython-312.pyc +0 -0
- package/unsloth_compiled_cache/__pycache__/UnslothRewardTrainer.cpython-312.pyc +0 -0
- package/unsloth_compiled_cache/__pycache__/UnslothSFTTrainer.cpython-312.pyc +0 -0
- package/unsloth_compiled_cache/__pycache__/UnslothXPOTrainer.cpython-312.pyc +0 -0
- package/unsloth_compiled_cache/__pycache__/unsloth_compiled_module_qwen3_moe.cpython-312.pyc +0 -0
- package/unsloth_compiled_cache/__pycache__/unsloth_compiled_module_siglip.cpython-312.pyc +0 -0
- package/unsloth_compiled_cache/unsloth_compiled_module_qwen3_moe.py +726 -0
- package/unsloth_compiled_cache/unsloth_compiled_module_siglip.py +534 -0

package/unsloth_compiled_cache/LayerNorm.py
@@ -0,0 +1,72 @@
+"""
+2025.12.6
+2025.12.7
+4.57.1
+0.24.0
+__UNSLOTH_VERSIONING__
+"""
+
+# Unsloth auto generated code
+# Copyright 2023-present Daniel Han-Chen, Michael Han-Chen & the Unsloth team. All rights reserved.
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public License
+# along with this program. If not, see <https://www.gnu.org/licenses/>.
+
+
+import os
+import torch
+import importlib.util
+import math
+if importlib.util.find_spec("unsloth_studio") is None:
+    UNSLOTH_STUDIO_ENABLED = False
+else:
+    UNSLOTH_STUDIO_ENABLED = os.environ.get("UNSLOTH_STUDIO_DISABLED", "0") == "0"
+pass
+from typing import Any, List, Optional, Tuple, Union, Dict, Set, Callable
+import math
+
+UNSLOTH_ENABLE_LOGGING = os.environ.get("UNSLOTH_ENABLE_LOGGING", "0") == "1"
+UNSLOTH_ENABLE_CCE = os.environ.get("UNSLOTH_ENABLE_CCE", "1") == "1"
+UNSLOTH_COMPILE_DISABLE = os.environ.get("UNSLOTH_COMPILE_DISABLE", "0") in ("1", "partial",)
+
+import logging
+logger_compiler = logging.getLogger(__name__)
+if UNSLOTH_ENABLE_LOGGING:
+    logger_compiler.setLevel(logging.DEBUG)
+
+global INFERENCE_RUNS
+INFERENCE_RUNS = 0
+
+try:
+    import torch._dynamo.eval_frame as torch_dynamo_eval_frame
+    torch_dynamo_eval_frame._stance.stance
+    torch_compiler_set_stance = torch.compiler.set_stance
+except:
+    torch_dynamo_eval_frame = None
+    torch_compiler_set_stance = None
+pass
+
+from unsloth_zoo import DEVICE_TYPE_TORCH, DEVICE_COUNT
+
+torch_compile_options = {'epilogue_fusion': True, 'max_autotune': False, 'shape_padding': True, 'trace.enabled': False, 'triton.cudagraphs': False, 'debug': False, 'dce': True, 'memory_planning': True, 'coordinate_descent_tuning': False, 'trace.graph_diagram': False, 'compile_threads': 32, 'group_fusion': True, 'disable_progress': True, 'verbose_progress': False, 'triton.multi_kernel': 0, 'triton.use_block_ptr': False, 'triton.enable_persistent_tma_matmul': True, 'triton.autotune_at_compile_time': False, 'triton.cooperative_reductions': False, 'cuda.compile_opt_level': '-O2', 'cuda.enable_cuda_lto': True, 'combo_kernels': False, 'benchmark_combo_kernel': True, 'combo_kernel_foreach_dynamic_shapes': True}
+from torch import Tensor
+import torch
+import torch.nn as nn
+from torch.nn import functional as F
+from typing import Any, List, Optional, Tuple, Union, Dict, Set, Callable
+from transformers.models.qwen3_moe.modeling_qwen3_moe import (F)
+
+def forward(self, input: Tensor) -> Tensor:
+    return F.layer_norm(
+        input, self.normalized_shape, self.weight, self.bias, self.eps
+    ).to(input.dtype).to(input.dtype)
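
For context: the LayerNorm.py hunk above ships a module-level forward(self, input) that mirrors nn.LayerNorm.forward but casts the result back to the input dtype. A minimal sketch of that call pattern, using stand-in names that are not part of the package:

import torch
import torch.nn.functional as F

ln = torch.nn.LayerNorm(8)                 # stock module: fp32 weight, bias, default eps
x = torch.randn(2, 8)
# Same call the patched forward makes, followed by the cast back to the input dtype
y = F.layer_norm(x, ln.normalized_shape, ln.weight, ln.bias, ln.eps).to(x.dtype)
assert torch.allclose(y, ln(x))            # matches the unpatched nn.LayerNorm output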

package/unsloth_compiled_cache/Linear4bit_peft_forward.py
@@ -0,0 +1,115 @@
+"""
+2025.12.6
+2025.12.7
+4.57.1
+0.24.0
+__UNSLOTH_VERSIONING__
+"""
+
+# Unsloth auto generated code
+# Copyright 2023-present Daniel Han-Chen, Michael Han-Chen & the Unsloth team. All rights reserved.
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public License
+# along with this program. If not, see <https://www.gnu.org/licenses/>.
+
+
+torch_compile_options = {'epilogue_fusion': True, 'max_autotune': False, 'shape_padding': True, 'trace.enabled': False, 'triton.cudagraphs': False, 'debug': False, 'dce': True, 'memory_planning': True, 'coordinate_descent_tuning': False, 'trace.graph_diagram': False, 'compile_threads': 32, 'group_fusion': True, 'disable_progress': True, 'verbose_progress': False, 'triton.multi_kernel': 0, 'triton.use_block_ptr': False, 'triton.enable_persistent_tma_matmul': True, 'triton.autotune_at_compile_time': False, 'triton.cooperative_reductions': False, 'cuda.compile_opt_level': '-O2', 'cuda.enable_cuda_lto': True, 'combo_kernels': False, 'benchmark_combo_kernel': True, 'combo_kernel_foreach_dynamic_shapes': True}
+from torch import Tensor
+import torch
+import torch.nn as nn
+from torch.nn import functional as F
+from typing import Any, List, Optional, Tuple, Union, Dict, Set, Callable
+from peft.tuners.lora.bnb import (torch)
+
+
+torch_addmm = torch.addmm
+torch_add = torch.add
+# @torch.compile(fullgraph = False, dynamic = True, options = torch_compile_options)
+def lora_forward(result, lora_A, lora_B, dropout, x, scaling):
+    # Use result.dtype (bfloat16 from base layer) since x may have been cast to float32
+    # by _cast_input_dtype when autocast is disabled
+    target_dtype = result.dtype
+    xA = dropout(x).to(target_dtype) @ lora_A.weight.to(target_dtype).t()
+    # output = result + scaling * xA @ lora_B.weight.t()
+    shape = result.shape
+    output = torch_addmm(
+        result.view(-1, shape[-1]),
+        xA.view(-1, xA.shape[-1]),
+        lora_B.weight.to(target_dtype).t(),
+        alpha = scaling,
+        beta = 1,
+    ).view(shape)
+
+    bias = lora_B.bias
+    if bias is not None:
+        output = torch_add(
+            output,
+            bias.to(target_dtype),
+            alpha = scaling,
+        )
+    return output
+pass
+
+def unsloth_forward(self, x: torch.Tensor, *args, **kwargs) -> torch.Tensor:
+
+    adapter_names = kwargs.pop("adapter_names", None)
+
+
+    if self.disable_adapters:
+        if self.merged:
+            self.unmerge()
+        result = self.base_layer(x, *args, **kwargs)
+    elif adapter_names is not None:
+        result = self._mixed_batch_forward(x, *args, adapter_names=adapter_names, **variant_kwargs, **kwargs)
+    elif self.merged:
+        result = self.base_layer(x, *args, **kwargs)
+    else:
+        result = self.base_layer(x, *args, **kwargs)
+        # As per Tim Dettmers, for 4bit, we need to defensively clone here.
+        # The reason is that in some cases, an error can occur that backprop
+        # does not work on a manipulated view. This issue may be solved with
+        # newer PyTorch versions but this would need extensive testing to be
+        # sure.
+
+
+        for active_adapter in self.active_adapters:
+            if active_adapter not in self.lora_A.keys():
+                continue
+            lora_A = self.lora_A[active_adapter]
+            lora_B = self.lora_B[active_adapter]
+            dropout = self.lora_dropout[active_adapter]
+            scaling = self.scaling[active_adapter]
+
+            requires_conversion = not torch.is_autocast_enabled()
+            if requires_conversion:
+                expected_dtype = result.dtype
+                x = self._cast_input_dtype(x, lora_A.weight.dtype)
+
+            if active_adapter not in self.lora_variant: # vanilla LoRA
+                return lora_forward(result, lora_A, lora_B, dropout, x, scaling)
+                if requires_conversion:
+                    output = output.to(expected_dtype)
+                result = result + output
+            else:
+                result = self.lora_variant[active_adapter].forward(
+                    self,
+                    active_adapter=active_adapter,
+                    x=x,
+                    result=result,
+                    **variant_kwargs,
+                    **kwargs,
+                )
+                if requires_conversion:
+                    result = result.to(expected_dtype)
+
+    return result
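
The lora_forward helper in this hunk fuses the LoRA update into a single torch.addmm call: beta*result + alpha*(xA @ B.T) with beta=1 and alpha=scaling, as noted in its own commented-out formula. A small equivalence check under made-up shapes; the names below are illustrative and not taken from the package:

import torch

torch.manual_seed(0)
batch, d_in, rank, d_out, scaling = 4, 16, 8, 32, 0.5
x = torch.randn(batch, d_in)           # input after dropout
A = torch.randn(rank, d_in)            # stands in for lora_A.weight
B = torch.randn(d_out, rank)           # stands in for lora_B.weight
result = torch.randn(batch, d_out)     # stands in for the base layer output

xA = x @ A.t()
fused = torch.addmm(result, xA, B.t(), alpha=scaling, beta=1)   # as in lora_forward
reference = result + scaling * (xA @ B.t())                     # the commented-out formula
assert torch.allclose(fused, reference, atol=1e-5)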

package/unsloth_compiled_cache/Linear8bitLt_peft_forward.py
@@ -0,0 +1,113 @@
+"""
+2025.12.6
+2025.12.7
+4.57.1
+0.24.0
+__UNSLOTH_VERSIONING__
+"""
+
+# Unsloth auto generated code
+# Copyright 2023-present Daniel Han-Chen, Michael Han-Chen & the Unsloth team. All rights reserved.
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public License
+# along with this program. If not, see <https://www.gnu.org/licenses/>.
+
+
+torch_compile_options = {'epilogue_fusion': True, 'max_autotune': False, 'shape_padding': True, 'trace.enabled': False, 'triton.cudagraphs': False, 'debug': False, 'dce': True, 'memory_planning': True, 'coordinate_descent_tuning': False, 'trace.graph_diagram': False, 'compile_threads': 32, 'group_fusion': True, 'disable_progress': True, 'verbose_progress': False, 'triton.multi_kernel': 0, 'triton.use_block_ptr': False, 'triton.enable_persistent_tma_matmul': True, 'triton.autotune_at_compile_time': False, 'triton.cooperative_reductions': False, 'cuda.compile_opt_level': '-O2', 'cuda.enable_cuda_lto': True, 'combo_kernels': False, 'benchmark_combo_kernel': True, 'combo_kernel_foreach_dynamic_shapes': True}
+
+import torch._dynamo
+@torch._dynamo.disable
+def _call_8bit_base_layer(base_layer, x, *args, **kwargs):
+    return base_layer(x, *args, **kwargs)
+from torch import Tensor
+import torch
+import torch.nn as nn
+from torch.nn import functional as F
+from typing import Any, List, Optional, Tuple, Union, Dict, Set, Callable
+from peft.tuners.lora.bnb import (torch)
+
+
+torch_addmm = torch.addmm
+torch_add = torch.add
+# @torch.compile(fullgraph = False, dynamic = True, options = torch_compile_options)
+def lora_forward(result, lora_A, lora_B, dropout, x, scaling):
+    # Use result.dtype (bfloat16 from base layer) since x may have been cast to float32
+    # by _cast_input_dtype when autocast is disabled
+    target_dtype = result.dtype
+    xA = dropout(x).to(target_dtype) @ lora_A.weight.to(target_dtype).t()
+    # output = result + scaling * xA @ lora_B.weight.t()
+    shape = result.shape
+    output = torch_addmm(
+        result.view(-1, shape[-1]),
+        xA.view(-1, xA.shape[-1]),
+        lora_B.weight.to(target_dtype).t(),
+        alpha = scaling,
+        beta = 1,
+    ).view(shape)
+
+    bias = lora_B.bias
+    if bias is not None:
+        output = torch_add(
+            output,
+            bias.to(target_dtype),
+            alpha = scaling,
+        )
+    return output
+pass
+
+def unsloth_forward(self, x: torch.Tensor, *args, **kwargs) -> torch.Tensor:
+
+    adapter_names = kwargs.pop("adapter_names", None)
+
+
+    if self.disable_adapters:
+        if self.merged:
+            self.unmerge()
+        result = _call_8bit_base_layer(self.base_layer, x, *args, **kwargs)
+    elif adapter_names is not None:
+        result = self._mixed_batch_forward(x, *args, adapter_names=adapter_names, **variant_kwargs, **kwargs)
+    elif self.merged:
+        result = _call_8bit_base_layer(self.base_layer, x, *args, **kwargs)
+    else:
+        result = _call_8bit_base_layer(self.base_layer, x, *args, **kwargs)
+        for active_adapter in self.active_adapters:
+            if active_adapter not in self.lora_A.keys():
+                continue
+            lora_A = self.lora_A[active_adapter]
+            lora_B = self.lora_B[active_adapter]
+            dropout = self.lora_dropout[active_adapter]
+            scaling = self.scaling[active_adapter]
+
+            requires_conversion = not torch.is_autocast_enabled()
+            if requires_conversion:
+                expected_dtype = result.dtype
+                x = self._cast_input_dtype(x, lora_A.weight.dtype)
+
+            if active_adapter not in self.lora_variant: # vanilla LoRA
+                return lora_forward(result, lora_A, lora_B, dropout, x, scaling)
+                if requires_conversion:
+                    output = output.to(expected_dtype)
+                result = result + output
+            else:
+                result = self.lora_variant[active_adapter].forward(
+                    self,
+                    active_adapter=active_adapter,
+                    x=x,
+                    result=result,
+                    **variant_kwargs,
+                    **kwargs,
+                )
+                if requires_conversion:
+                    result = result.to(expected_dtype)
+
+    return result
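
The main difference from the Linear4bit variant above is _call_8bit_base_layer, which wraps the 8-bit base layer call in @torch._dynamo.disable so that it runs eagerly (as a graph break) under torch.compile, presumably because the bitsandbytes int8 path is not traceable. A toy illustration of that decorator, using the "eager" debug backend and placeholder names of my own choosing:

import torch

@torch._dynamo.disable
def run_eagerly(layer, x):
    # kept out of the traced graph, like _call_8bit_base_layer in the hunk above
    return layer(x)

def fn(layer, x):
    return run_eagerly(layer, x) + 1.0

compiled = torch.compile(fn, backend="eager")   # trace-only backend, no codegen needed
layer = torch.nn.Linear(4, 4)
x = torch.randn(2, 4)
assert torch.allclose(compiled(layer, x), fn(layer, x))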

package/unsloth_compiled_cache/Linear_peft_forward.py
@@ -0,0 +1,104 @@
+"""
+2025.12.6
+2025.12.7
+4.57.1
+0.24.0
+__UNSLOTH_VERSIONING__
+"""
+
+# Unsloth auto generated code
+# Copyright 2023-present Daniel Han-Chen, Michael Han-Chen & the Unsloth team. All rights reserved.
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public License
+# along with this program. If not, see <https://www.gnu.org/licenses/>.
+
+
+torch_compile_options = {'epilogue_fusion': True, 'max_autotune': False, 'shape_padding': True, 'trace.enabled': False, 'triton.cudagraphs': False, 'debug': False, 'dce': True, 'memory_planning': True, 'coordinate_descent_tuning': False, 'trace.graph_diagram': False, 'compile_threads': 32, 'group_fusion': True, 'disable_progress': True, 'verbose_progress': False, 'triton.multi_kernel': 0, 'triton.use_block_ptr': False, 'triton.enable_persistent_tma_matmul': True, 'triton.autotune_at_compile_time': False, 'triton.cooperative_reductions': False, 'cuda.compile_opt_level': '-O2', 'cuda.enable_cuda_lto': True, 'combo_kernels': False, 'benchmark_combo_kernel': True, 'combo_kernel_foreach_dynamic_shapes': True}
+from torch import Tensor
+import torch
+import torch.nn as nn
+from torch.nn import functional as F
+from typing import Any, List, Optional, Tuple, Union, Dict, Set, Callable
+from peft.tuners.lora.torchao import (Any, torch)
+
+
+torch_addmm = torch.addmm
+torch_add = torch.add
+# @torch.compile(fullgraph = False, dynamic = True, options = torch_compile_options)
+def lora_forward(result, lora_A, lora_B, dropout, x, scaling):
+    # Use result.dtype (bfloat16 from base layer) since x may have been cast to float32
+    # by _cast_input_dtype when autocast is disabled
+    target_dtype = result.dtype
+    xA = dropout(x).to(target_dtype) @ lora_A.weight.to(target_dtype).t()
+    # output = result + scaling * xA @ lora_B.weight.t()
+    shape = result.shape
+    output = torch_addmm(
+        result.view(-1, shape[-1]),
+        xA.view(-1, xA.shape[-1]),
+        lora_B.weight.to(target_dtype).t(),
+        alpha = scaling,
+        beta = 1,
+    ).view(shape)
+
+    bias = lora_B.bias
+    if bias is not None:
+        output = torch_add(
+            output,
+            bias.to(target_dtype),
+            alpha = scaling,
+        )
+    return output
+pass
+
+def unsloth_forward(self, x: torch.Tensor, *args: Any, **kwargs: Any) -> torch.Tensor:
+
+    adapter_names = kwargs.pop("adapter_names", None)
+
+
+    if self.disable_adapters:
+        if self.merged:
+            self.unmerge()
+        result = self.base_layer(x, *args, **kwargs)
+    elif adapter_names is not None:
+        result = self._mixed_batch_forward(x, *args, adapter_names=adapter_names, **variant_kwargs, **kwargs)
+    elif self.merged:
+        result = self.base_layer(x, *args, **kwargs)
+    else:
+        result = self.base_layer(x, *args, **kwargs)
+        torch_result_dtype = result.dtype
+
+        lora_A_keys = self.lora_A.keys()
+        for active_adapter in self.active_adapters:
+            if active_adapter not in lora_A_keys:
+                continue
+
+            lora_A = self.lora_A[active_adapter]
+            lora_B = self.lora_B[active_adapter]
+            dropout = self.lora_dropout[active_adapter]
+            scaling = self.scaling[active_adapter]
+            if not torch.is_autocast_enabled(): result, x = result.to(lora_A.weight.dtype), x.to(lora_A.weight.dtype)
+            if active_adapter not in self.lora_variant: # vanilla LoRA
+                return lora_forward(result, lora_A, lora_B, dropout, x, scaling)
+            else:
+                result = self.lora_variant[active_adapter].forward(
+                    self,
+                    active_adapter=active_adapter,
+                    x=x,
+                    result=result,
+                    **variant_kwargs,
+                    **kwargs,
+                )
+
+        result = result.to(torch_result_dtype)
+
+    return result

package/unsloth_compiled_cache/LoraParallelLinear_peft_forward.py
@@ -0,0 +1,91 @@
+"""
+2025.12.6
+2025.12.7
+4.57.1
+0.24.0
+__UNSLOTH_VERSIONING__
+"""
+
+# Unsloth auto generated code
+# Copyright 2023-present Daniel Han-Chen, Michael Han-Chen & the Unsloth team. All rights reserved.
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public License
+# along with this program. If not, see <https://www.gnu.org/licenses/>.
+
+
+torch_compile_options = {'epilogue_fusion': True, 'max_autotune': False, 'shape_padding': True, 'trace.enabled': False, 'triton.cudagraphs': False, 'debug': False, 'dce': True, 'memory_planning': True, 'coordinate_descent_tuning': False, 'trace.graph_diagram': False, 'compile_threads': 32, 'group_fusion': True, 'disable_progress': True, 'verbose_progress': False, 'triton.multi_kernel': 0, 'triton.use_block_ptr': False, 'triton.enable_persistent_tma_matmul': True, 'triton.autotune_at_compile_time': False, 'triton.cooperative_reductions': False, 'cuda.compile_opt_level': '-O2', 'cuda.enable_cuda_lto': True, 'combo_kernels': False, 'benchmark_combo_kernel': True, 'combo_kernel_foreach_dynamic_shapes': True}
+from torch import Tensor
+import torch
+import torch.nn as nn
+from torch.nn import functional as F
+from typing import Any, List, Optional, Tuple, Union, Dict, Set, Callable
+from peft.tuners.lora.tp_layer import (Any, __name__, torch)
+
+
+torch_addmm = torch.addmm
+torch_add = torch.add
+# @torch.compile(fullgraph = False, dynamic = True, options = torch_compile_options)
+def lora_forward(result, lora_A, lora_B, dropout, x, scaling):
+    # Use result.dtype (bfloat16 from base layer) since x may have been cast to float32
+    # by _cast_input_dtype when autocast is disabled
+    target_dtype = result.dtype
+    xA = dropout(x).to(target_dtype) @ lora_A.weight.to(target_dtype).t()
+    # output = result + scaling * xA @ lora_B.weight.t()
+    shape = result.shape
+    output = torch_addmm(
+        result.view(-1, shape[-1]),
+        xA.view(-1, xA.shape[-1]),
+        lora_B.weight.to(target_dtype).t(),
+        alpha = scaling,
+        beta = 1,
+    ).view(shape)
+
+    bias = lora_B.bias
+    if bias is not None:
+        output = torch_add(
+            output,
+            bias.to(target_dtype),
+            alpha = scaling,
+        )
+    return output
+pass
+
+def unsloth_forward(self, x: torch.Tensor, *args: Any, **kwargs: Any):
+
+    adapter_names = kwargs.pop("adapter_names", None)
+    # If weight is used for matrix multiplication here, the final aggregation operation of the original
+    # parallel_linear layer will be missing, so we need to directly call its forward function to obtain the
+    # output of the original parallel_linear layer.
+    if self.disable_adapters:
+        if self.merged:
+            self.unmerge()
+        result, bias = self.base_layer(x, *args, **kwargs)
+    elif adapter_names is not None:
+        raise ValueError(f"{self.__class__.__name__} does not support mixed_batch_forward yet.")
+    elif self.merged:
+        result, bias = self.base_layer(x, *args, **kwargs)
+    else:
+        result, bias = self.base_layer(x, *args, **kwargs)
+        torch_result_dtype = result.dtype
+        for active_adapter in self.active_adapters:
+            if active_adapter not in self.lora_A.keys():
+                continue
+            lora_A = self.lora_A[active_adapter]
+            lora_B = self.lora_B[active_adapter]
+            dropout = self.lora_dropout[active_adapter]
+            scaling = self.scaling[active_adapter]
+            if not torch.is_autocast_enabled(): result, x = result.to(lora_A.weight.dtype), x.to(lora_A.weight.dtype)
+            return lora_forward(result, lora_A, lora_B, dropout, x, scaling)
+
+        result = result.to(torch_result_dtype)
+    return result, bias

package/unsloth_compiled_cache/RMSNorm.py
@@ -0,0 +1,73 @@
+"""
+2025.12.6
+2025.12.7
+4.57.1
+0.24.0
+__UNSLOTH_VERSIONING__
+"""
+
+# Unsloth auto generated code
+# Copyright 2023-present Daniel Han-Chen, Michael Han-Chen & the Unsloth team. All rights reserved.
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Lesser General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public License
+# along with this program. If not, see <https://www.gnu.org/licenses/>.
+
+
+import os
+import torch
+import importlib.util
+import math
+if importlib.util.find_spec("unsloth_studio") is None:
+    UNSLOTH_STUDIO_ENABLED = False
+else:
+    UNSLOTH_STUDIO_ENABLED = os.environ.get("UNSLOTH_STUDIO_DISABLED", "0") == "0"
+pass
+from typing import Any, List, Optional, Tuple, Union, Dict, Set, Callable
+import math
+
+UNSLOTH_ENABLE_LOGGING = os.environ.get("UNSLOTH_ENABLE_LOGGING", "0") == "1"
+UNSLOTH_ENABLE_CCE = os.environ.get("UNSLOTH_ENABLE_CCE", "1") == "1"
+UNSLOTH_COMPILE_DISABLE = os.environ.get("UNSLOTH_COMPILE_DISABLE", "0") in ("1", "partial",)
+
+import logging
+logger_compiler = logging.getLogger(__name__)
+if UNSLOTH_ENABLE_LOGGING:
+    logger_compiler.setLevel(logging.DEBUG)
+
+global INFERENCE_RUNS
+INFERENCE_RUNS = 0
+
+try:
+    import torch._dynamo.eval_frame as torch_dynamo_eval_frame
+    torch_dynamo_eval_frame._stance.stance
+    torch_compiler_set_stance = torch.compiler.set_stance
+except:
+    torch_dynamo_eval_frame = None
+    torch_compiler_set_stance = None
+pass
+
+from unsloth_zoo import DEVICE_TYPE_TORCH, DEVICE_COUNT
+
+torch_compile_options = {'epilogue_fusion': True, 'max_autotune': False, 'shape_padding': True, 'trace.enabled': False, 'triton.cudagraphs': False, 'debug': False, 'dce': True, 'memory_planning': True, 'coordinate_descent_tuning': False, 'trace.graph_diagram': False, 'compile_threads': 32, 'group_fusion': True, 'disable_progress': True, 'verbose_progress': False, 'triton.multi_kernel': 0, 'triton.use_block_ptr': False, 'triton.enable_persistent_tma_matmul': True, 'triton.autotune_at_compile_time': False, 'triton.cooperative_reductions': False, 'cuda.compile_opt_level': '-O2', 'cuda.enable_cuda_lto': True, 'combo_kernels': False, 'benchmark_combo_kernel': True, 'combo_kernel_foreach_dynamic_shapes': True}
+from torch import Tensor
+import torch
+import torch.nn as nn
+from torch.nn import functional as F
+from typing import Any, List, Optional, Tuple, Union, Dict, Set, Callable
+from transformers.models.qwen3_moe.modeling_qwen3_moe import (F, torch)
+
+def forward(self, x: torch.Tensor) -> torch.Tensor:
+    """
+    Runs the forward pass.
+    """
+    return F.rms_norm(x, self.normalized_shape, self.weight, self.eps).to(input.dtype).to(input.dtype)