adv-optm 1.2.dev10.tar.gz → 1.2.dev11.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {adv_optm-1.2.dev10 → adv_optm-1.2.dev11}/PKG-INFO +1 -1
- {adv_optm-1.2.dev10 → adv_optm-1.2.dev11}/adv_optm/__init__.py +1 -1
- {adv_optm-1.2.dev10 → adv_optm-1.2.dev11}/adv_optm/optim/AdaMuon_adv.py +3 -65
- {adv_optm-1.2.dev10 → adv_optm-1.2.dev11}/adv_optm/optim/AdamW_adv.py +4 -8
- {adv_optm-1.2.dev10 → adv_optm-1.2.dev11}/adv_optm/optim/Muon_adv.py +2 -65
- {adv_optm-1.2.dev10 → adv_optm-1.2.dev11}/adv_optm/util/Kourkoutas.py +1 -40
- {adv_optm-1.2.dev10 → adv_optm-1.2.dev11}/adv_optm.egg-info/PKG-INFO +1 -1
- {adv_optm-1.2.dev10 → adv_optm-1.2.dev11}/setup.py +1 -1
- {adv_optm-1.2.dev10 → adv_optm-1.2.dev11}/LICENSE +0 -0
- {adv_optm-1.2.dev10 → adv_optm-1.2.dev11}/README.md +0 -0
- {adv_optm-1.2.dev10 → adv_optm-1.2.dev11}/adv_optm/optim/Adopt_adv.py +0 -0
- {adv_optm-1.2.dev10 → adv_optm-1.2.dev11}/adv_optm/optim/Lion_Prodigy_adv.py +0 -0
- {adv_optm-1.2.dev10 → adv_optm-1.2.dev11}/adv_optm/optim/Lion_adv.py +0 -0
- {adv_optm-1.2.dev10 → adv_optm-1.2.dev11}/adv_optm/optim/Prodigy_adv.py +0 -0
- {adv_optm-1.2.dev10 → adv_optm-1.2.dev11}/adv_optm/optim/Simplified_AdEMAMix.py +0 -0
- {adv_optm-1.2.dev10 → adv_optm-1.2.dev11}/adv_optm/optim/__init__.py +0 -0
- {adv_optm-1.2.dev10 → adv_optm-1.2.dev11}/adv_optm/util/BF16_Stochastic_Rounding.py +0 -0
- {adv_optm-1.2.dev10 → adv_optm-1.2.dev11}/adv_optm/util/Effective_Shape.py +0 -0
- {adv_optm-1.2.dev10 → adv_optm-1.2.dev11}/adv_optm/util/NNMF.py +0 -0
- {adv_optm-1.2.dev10 → adv_optm-1.2.dev11}/adv_optm/util/Newton_Schulz.py +0 -0
- {adv_optm-1.2.dev10 → adv_optm-1.2.dev11}/adv_optm/util/One_Bit_Boolean.py +0 -0
- {adv_optm-1.2.dev10 → adv_optm-1.2.dev11}/adv_optm/util/OrthoGrad.py +0 -0
- {adv_optm-1.2.dev10 → adv_optm-1.2.dev11}/adv_optm/util/__init__.py +0 -0
- {adv_optm-1.2.dev10 → adv_optm-1.2.dev11}/adv_optm.egg-info/SOURCES.txt +0 -0
- {adv_optm-1.2.dev10 → adv_optm-1.2.dev11}/adv_optm.egg-info/dependency_links.txt +0 -0
- {adv_optm-1.2.dev10 → adv_optm-1.2.dev11}/adv_optm.egg-info/requires.txt +0 -0
- {adv_optm-1.2.dev10 → adv_optm-1.2.dev11}/adv_optm.egg-info/top_level.txt +0 -0
- {adv_optm-1.2.dev10 → adv_optm-1.2.dev11}/setup.cfg +0 -0

{adv_optm-1.2.dev10 → adv_optm-1.2.dev11}/adv_optm/optim/AdaMuon_adv.py

@@ -11,7 +11,7 @@ from ..util.One_Bit_Boolean import _pack_bools, _unpack_bools
 
 class AdaMuon_adv(torch.optim.Optimizer):
     """
-
+    Implements an advanced AdaMuon optimizer algorithm.
 
     AdaMuon combines the geometry-aware updates of Muon with the element-wise
     adaptivity of Adam. It is designed for 2D parameters (e.g., linear layers)
@@ -25,9 +25,6 @@ class AdaMuon_adv(torch.optim.Optimizer):
     3. An RMS-aligned rescaling strategy to match the update magnitude of Adam,
        allowing for reuse of learning rate schedules.
 
-    Can also operate in a hybrid mode, using an auxiliary AdamW
-    optimizer for specific parameters (e.g., biases, norms, embeddings) as
-    defined by a `layer_key_fn`.
 
     Args:
         params (iterable): iterable of parameters to optimize or dicts defining
@@ -69,12 +66,6 @@ class AdaMuon_adv(torch.optim.Optimizer):
            (default: 128)
        nnmf_factor (bool): whether to use the factorization or disable it to use
            the uncompressed optimizer. (default: False)
-       MuonWithAuxAdam (bool): If True, enables the hybrid optimizer mode.
-           Parameters designated by `layer_key_fn` will be optimized with
-           AdamW_adv instead of Muon. (default: False)
-       adam_kwargs (Optional[dict]): A dictionary of keyword arguments to pass
-           to the auxiliary AdamW_adv optimizer. Only used when
-           `MuonWithAuxAdam` is True. (default: None)
    """
 
    def __init__(
@@ -99,10 +90,6 @@ class AdaMuon_adv(torch.optim.Optimizer):
        low_rank_ortho: bool = False,
        ortho_rank: int = 128,
        nnmf_factor: bool = False,
-       # hybrid optimizer mode
-       MuonWithAuxAdam: bool = False,
-       muon_adam_lr: float = 1e-4,
-       adam_kwargs: Optional[dict] = None,
    ):
        if not (lr >= 0.0):
            raise ValueError(f"Learning-rate should be >= 0.0. Got {lr}")
@@ -114,7 +101,7 @@ class AdaMuon_adv(torch.optim.Optimizer):
            print("Warning: nesterov is incompatible with Simplified_AdEMAMix, Disabling cautious.")
            nesterov = False
 
-       muon_defaults = {
+       defaults = {
            "lr": lr, "betas": betas, "weight_decay": weight_decay,
            "eps": eps, "rms_target": rms_target, "ns_steps": ns_steps,
            "ns_eps": ns_eps, "ns_coeffs": ns_coeffs, "nnmf_factor": nnmf_factor,
@@ -127,34 +114,7 @@ class AdaMuon_adv(torch.optim.Optimizer):
        }
        self.stochastic_rounding = stochastic_rounding
 
-
-       self.aux_adam = None
-
-       if not self.MuonWithAuxAdam:
-           super().__init__(params, muon_defaults)
-           return
-
-       # HYBRID OPTIMIZER LOGIC
-       adam_kwargs = adam_kwargs or {}
-       self.aux_adam = AdamW_adv(
-           [],
-           lr=muon_adam_lr,
-           **adam_kwargs,
-           _is_delegate=True
-       )
-       adam_defaults = self.aux_adam.defaults
-
-       final_param_groups = []
-       for group in params:
-           optim_type = group.get('optim_type', 'muon')
-           defaults_to_use = adam_defaults if optim_type == 'adam' else muon_defaults
-
-           new_group = group.copy()
-           for key, value in defaults_to_use.items():
-               new_group.setdefault(key, value)
-           final_param_groups.append(new_group)
-
-       super().__init__(final_param_groups, muon_defaults)
+       super().__init__(params, defaults)
 
 
    @property
@@ -169,30 +129,8 @@ class AdaMuon_adv(torch.optim.Optimizer):
    def supports_flat_params(self):
        return False
 
-   @property
-   def kourkoutas_helper(self):
-       """
-       Exposes the kourkoutas_helper from the auxiliary AdamW optimizer,
-       if it exists. This allows external access for logging K-b.
-       """
-       if self.aux_adam and hasattr(self.aux_adam, 'kourkoutas_helper'):
-           return self.aux_adam.kourkoutas_helper
-       return None
-
    @torch.no_grad()
    def step_parameter(self, p: torch.Tensor, group: dict, i: int | None = None):
-       if self.MuonWithAuxAdam:
-           optim_type = group.get('optim_type')
-           if optim_type == 'adam':
-               # Delegate to the AdamW_adv optimizer's logic.
-               # We need to temporarily "lend" our state and param_groups
-               # to the delegate so it has the full context to work with,
-               # especially for features like Kourkoutas-beta.
-               self.aux_adam.state = self.state
-               self.aux_adam.param_groups = self.param_groups
-               self.aux_adam.step_parameter(p, group, i)
-               return
-
        if p.grad is None:
            return
 
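For orientation, the removed and retained docstring text above describes the AdaMuon update as Muon-style geometry-aware orthogonalization combined with Adam-style element-wise adaptivity and an RMS-aligned rescaling. The following is a minimal sketch of that style of update in plain PyTorch, not adv_optm's actual implementation; the helper name, Newton-Schulz coefficients, and default hyperparameters are illustrative assumptions.

```python
# Sketch only: the flavor of update the AdaMuon_adv docstring describes,
# not the package's code. Assumes 2D parameters with momentum buffer m
# and second-moment buffer v of the same shape as p.
import torch

def newton_schulz_orthogonalize(G: torch.Tensor, steps: int = 5, eps: float = 1e-7) -> torch.Tensor:
    # Approximate the orthogonal factor of a 2D matrix with the quintic
    # Newton-Schulz iteration popularized by Muon. Coefficients are the
    # commonly cited ones, not necessarily adv_optm's ns_coeffs.
    a, b, c = 3.4445, -4.7750, 2.0315
    X = G / (G.norm() + eps)
    transposed = X.shape[0] > X.shape[1]
    if transposed:
        X = X.T
    for _ in range(steps):
        A = X @ X.T
        X = a * X + (b * A + c * A @ A) @ X
    return X.T if transposed else X

@torch.no_grad()
def adamuon_like_step(p, m, v, lr=1e-3, betas=(0.9, 0.99), eps=1e-8, rms_target=0.2):
    g = p.grad
    m.mul_(betas[0]).add_(g, alpha=1 - betas[0])            # momentum (Muon input)
    o = newton_schulz_orthogonalize(m)                      # geometry-aware direction
    v.mul_(betas[1]).addcmul_(o, o, value=1 - betas[1])     # second moment of the
                                                            # orthogonalized update
    u = o / (v.sqrt() + eps)                                # Adam-style normalization
    u = u * (rms_target / (u.pow(2).mean().sqrt() + eps))   # RMS-aligned rescaling
    p.add_(u, alpha=-lr)
```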
{adv_optm-1.2.dev10 → adv_optm-1.2.dev11}/adv_optm/optim/AdamW_adv.py

@@ -107,7 +107,6 @@ class AdamW_adv(torch.optim.Optimizer):
        k_logging: int = 0,
        layer_key_fn: Optional[Callable] = None,
        nnmf_factor: bool = False,
-       _is_delegate: bool = False,
    ):
        if not (lr >= 0.0):
            raise ValueError(f"Learning-rate should be >= 0.0. Got {lr}")
@@ -138,11 +137,10 @@ class AdamW_adv(torch.optim.Optimizer):
        self.factored = nnmf_factor
        self.kourkoutas_beta = kourkoutas_beta
        self.layer_key_fn = layer_key_fn
-
-
-
-       self.
-       self.kourkoutas_helper = None
+       super().__init__(params, defaults)
+
+       if self.kourkoutas_beta:
+           self.kourkoutas_helper = KourkoutasHelper(self)
 
    @property
    def supports_fused_back_pass(self):
@@ -160,8 +158,6 @@ class AdamW_adv(torch.optim.Optimizer):
    def step_parameter(self, p: torch.Tensor, group: dict, i: int | None = None):
        if p.grad is None:
            return
-       if group.get('kourkoutas_beta', False) and self.kourkoutas_helper is None:
-           self.kourkoutas_helper = KourkoutasHelper(self)
 
        grad = p.grad
        if grad.dtype != torch.float32 and self.factored:
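The hunk above shows that `KourkoutasHelper` is now constructed eagerly in `__init__` when `kourkoutas_beta` is enabled, instead of lazily on the first `step_parameter` call (and the `_is_delegate` hook used by the removed hybrid mode is gone). Below is a hypothetical usage sketch built only from names visible in this diff (`AdamW_adv`, `kourkoutas_beta`, `step_parameter`, `kourkoutas_helper`, `last_beta2_stats`); the import path and any details beyond those names are assumptions, not documented API.

```python
# Hypothetical usage sketch; names are taken from the diff, everything else
# (import path, fused back-pass driving loop) is an assumption.
import torch
from adv_optm import AdamW_adv  # import path assumed

model = torch.nn.Linear(64, 64)
opt = AdamW_adv(model.parameters(), lr=1e-3, kourkoutas_beta=True)

# As of this release the helper exists right after construction when
# kourkoutas_beta=True (previously it was created on the first step).
helper = opt.kourkoutas_helper

loss = model(torch.randn(8, 64)).pow(2).mean()
loss.backward()

# Fused back-pass style: step each parameter individually.
for group in opt.param_groups:
    for i, p in enumerate(group["params"]):
        opt.step_parameter(p, group, i)

# Per the Kourkoutas.py hunk below, only the 'mean' beta2 statistic is kept now.
print(helper.last_beta2_stats)
```

Note that the new code sets `kourkoutas_helper` only inside `if self.kourkoutas_beta:`, whereas the old code initialized it to `None` unconditionally, so callers that probed the attribute with `kourkoutas_beta` disabled may want to use `getattr`.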
{adv_optm-1.2.dev10 → adv_optm-1.2.dev11}/adv_optm/optim/Muon_adv.py

@@ -23,10 +23,6 @@ class Muon_adv(torch.optim.Optimizer):
    This implementation is designed for 2D parameters (e.g., linear layers) and
    can handle other-dimensional parameters (e.g., 1D bias, 4D convolutional layers) by
    flattening/reshaping them.
-
-   Can also operate in a hybrid mode, using an auxiliary AdamW
-   optimizer for specific parameters (e.g., biases, norms, embeddings) as
-   defined by a `layer_key_fn`.
 
    Args:
        params (iterable): iterable of parameters to optimize or dicts defining
@@ -69,12 +65,6 @@ class Muon_adv(torch.optim.Optimizer):
        normuon_lr_scale (float): Scaling factor for the NorMuon learning rate.
            (default: 0.2)
        normuon_atan2 (bool): whether to use the atan2 for NorMuon. (default: False)
-       MuonWithAuxAdam (bool): If True, enables the hybrid optimizer mode.
-           Parameters designated by `layer_key_fn` will be optimized with
-           AdamW_adv instead of Muon. (default: False)
-       adam_kwargs (Optional[dict]): A dictionary of keyword arguments to pass
-           to the auxiliary AdamW_adv optimizer. Only used when
-           `MuonWithAuxAdam` is True. (default: None)
    """
 
    def __init__(
@@ -102,10 +92,6 @@ class Muon_adv(torch.optim.Optimizer):
        normuon_eps: float = 1e-8,
        normuon_lr_scale: float = 0.2,
        normuon_atan2: bool = False,
-       # hybrid optimizer mode
-       MuonWithAuxAdam: bool = False,
-       muon_adam_lr: float = 1e-4,
-       adam_kwargs: Optional[dict] = None,
    ):
        if not (lr >= 0.0):
            raise ValueError(f"Learning-rate should be >= 0.0. Got {lr}")
@@ -121,7 +107,7 @@ class Muon_adv(torch.optim.Optimizer):
            print("Warning: nesterov is incompatible with Simplified_AdEMAMix, Disabling cautious.")
            nesterov = False
 
-       muon_defaults = {
+       defaults = {
            "lr": lr, "beta1": beta1, "weight_decay": weight_decay,
            "nesterov": nesterov, "ns_steps": ns_steps, "ns_eps": ns_eps,
            "ns_coeffs": ns_coeffs, "nnmf_factor": nnmf_factor,
@@ -137,34 +123,7 @@ class Muon_adv(torch.optim.Optimizer):
        }
        self.stochastic_rounding = stochastic_rounding
 
-
-       self.aux_adam = None
-
-       if not self.MuonWithAuxAdam:
-           super().__init__(params, muon_defaults)
-           return
-
-       # HYBRID OPTIMIZER LOGIC
-       adam_kwargs = adam_kwargs or {}
-       self.aux_adam = AdamW_adv(
-           [],
-           lr=muon_adam_lr,
-           **adam_kwargs,
-           _is_delegate=True
-       )
-       adam_defaults = self.aux_adam.defaults
-
-       final_param_groups = []
-       for group in params:
-           optim_type = group.get('optim_type', 'muon')
-           defaults_to_use = adam_defaults if optim_type == 'adam' else muon_defaults
-
-           new_group = group.copy()
-           for key, value in defaults_to_use.items():
-               new_group.setdefault(key, value)
-           final_param_groups.append(new_group)
-
-       super().__init__(final_param_groups, muon_defaults)
+       super().__init__(params, defaults)
 
 
    @property
@@ -179,30 +138,8 @@ class Muon_adv(torch.optim.Optimizer):
    def supports_flat_params(self):
        return False
 
-   @property
-   def kourkoutas_helper(self):
-       """
-       Exposes the kourkoutas_helper from the auxiliary AdamW optimizer,
-       if it exists. This allows external access for logging K-b.
-       """
-       if self.aux_adam and hasattr(self.aux_adam, 'kourkoutas_helper'):
-           return self.aux_adam.kourkoutas_helper
-       return None
-
    @torch.no_grad()
    def step_parameter(self, p: torch.Tensor, group: dict, i: int | None = None):
-       if self.MuonWithAuxAdam:
-           optim_type = group.get('optim_type')
-           if optim_type == 'adam':
-               # Delegate to the AdamW_adv optimizer's logic.
-               # We need to temporarily "lend" our state and param_groups
-               # to the delegate so it has the full context to work with,
-               # especially for features like Kourkoutas-beta.
-               self.aux_adam.state = self.state
-               self.aux_adam.param_groups = self.param_groups
-               self.aux_adam.step_parameter(p, group, i)
-               return
-
        if p.grad is None:
            return
 
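With the built-in `MuonWithAuxAdam` hybrid mode removed from both `Muon_adv` and `AdaMuon_adv` in this release, the usual replacement pattern is to split parameters manually and drive two optimizers side by side. The sketch below illustrates that pattern; `Muon_adv` and `AdamW_adv` are names from this diff, but the import path, learning rates, and the assumption that they expose the standard `step()`/`zero_grad()` interface are not confirmed by the diff.

```python
# Illustrative sketch of a manual Muon/AdamW split, not adv_optm's
# documented API. Constructor arguments beyond lr are assumptions.
import torch
from adv_optm import Muon_adv, AdamW_adv  # import path assumed

model = torch.nn.Sequential(torch.nn.Linear(64, 64), torch.nn.LayerNorm(64))

# 2D weight matrices go to Muon; biases, norms, etc. go to AdamW.
muon_params = [p for p in model.parameters() if p.ndim == 2]
adam_params = [p for p in model.parameters() if p.ndim != 2]

muon_opt = Muon_adv(muon_params, lr=2e-2)    # lr values are placeholders
adam_opt = AdamW_adv(adam_params, lr=1e-4)

loss = model(torch.randn(8, 64)).pow(2).mean()
loss.backward()

muon_opt.step()
adam_opt.step()
muon_opt.zero_grad()
adam_opt.zero_grad()
```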
{adv_optm-1.2.dev10 → adv_optm-1.2.dev11}/adv_optm/util/Kourkoutas.py

@@ -24,9 +24,6 @@ class KourkoutasHelper:
        # making it compatible with fused back pass mechanisms.
        self._build_layer_info_if_needed()
 
-       if self.optimizer.param_groups[0].get('k_logging', 0) > 0:
-           self.print_layer_info()
-
    def _build_layer_info_if_needed(self):
        """Builds a map of layers and the parameters they contain."""
        if self._layer_info_built:
@@ -53,31 +50,9 @@ class KourkoutasHelper:
            if layer_key not in self.layer_info:
                self.layer_info[layer_key] = {'params': [], 'group_ref': group}
            self.layer_info[layer_key]['params'].append(p)
-
-       k_logging_interval = self.optimizer.param_groups[0].get('k_logging', 0)
-       if k_logging_interval > 0:
-           print(f"[Kourkoutas-β Debug] Layer info built. Found {len(self.layer_info)} unique layers/buckets.")
 
        self._layer_info_built = True
 
-   def print_layer_info(self):
-       """Prints the contents of self.layer_info for debugging."""
-       print("\n--- BEGIN self.layer_info DUMP ---")
-       if not self.layer_info:
-           print("Layer info is empty. Make sure the optimizer has parameters.")
-           return
-
-       for layer_key, info in self.layer_info.items():
-           param_count = len(info['params'])
-           first_param_details = ""
-           if param_count > 0:
-               p = info['params'][0]
-               first_param_details = f" (Example param shape: {list(p.shape)}, dtype: {p.dtype})"
-
-           print(f"Key: {layer_key}, Params: {param_count}{first_param_details}")
-
-       print("--- END self.layer_info DUMP ---\n")
-
    def prepare_step(self, current_step: int):
        """
        Calculates dynamic beta2 for all layers using the completed scalar accumulators
@@ -85,9 +60,8 @@ class KourkoutasHelper:
        """
 
        beta2_log = []
-       first_layer_key = next(iter(self.layer_info), None)
        # These are just for the sample log, initialize them
-       sun, pooled_grad_norm,
+       sun, pooled_grad_norm, r_ema_tensor = (torch.tensor(0.0),)*3
 
        # The optimizer that owns this helper holds the master defaults for K-b.
        # This is crucial in hybrid optimizers where some param_groups might not
@@ -124,7 +98,6 @@ class KourkoutasHelper:
            accumulator = self.layer_state[layer_key]['sum_sq_accumulator']
 
            pooled_grad_norm = torch.sqrt(accumulator)
-           prev_r_ema_val = r_ema_tensor.item() # for logging
 
            # Update the persistent EMA tensor in-place.
            r_ema_tensor.mul_(ema_alpha).add_(pooled_grad_norm, alpha=1.0 - ema_alpha)
@@ -150,21 +123,9 @@ class KourkoutasHelper:
        if beta2_log:
            beta2_tensor = torch.tensor(beta2_log, device='cpu')
            self.last_beta2_stats = {
-               'min': beta2_tensor.min().item(),
-               'max': beta2_tensor.max().item(),
                'mean': beta2_tensor.mean().item(),
            }
 
-           # Handle periodic console logging
-           k_logging_interval = self.optimizer.param_groups[0].get('k_logging', 0)
-           is_logging_step = k_logging_interval > 0 and (current_step + 1) % k_logging_interval == 0
-           if is_logging_step and self.last_beta2_stats:
-               if first_layer_key:
-                   print(f"\n[Kourkoutas-β Debug] Step {current_step + 1} - Sample Layer '{first_layer_key}':")
-                   print(f"  - Grad Norm: {pooled_grad_norm.item():.4e}, Prev EMA: {prev_r_ema_val:.4e}, New EMA: {r_ema_tensor.item():.4e}")
-                   print(f"  - Sunspike: {sun.item():.4f}, Dynamic Beta2: {self.layer_state[first_layer_key]['dynamic_beta2']:.4f}")
-               print(f"[Kourkoutas-β Debug] Step {current_step + 1} Overall Beta2 Stats: Min={self.last_beta2_stats['min']:.4f}, Max={self.last_beta2_stats['max']:.4f}, Mean={self.last_beta2_stats['mean']:.4f}")
-
    def maybe_prepare_step(self, current_step: int):
        """
        A universal guard that calls prepare_step() exactly once per training step.
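The Kourkoutas.py hunks above show the per-layer pipeline that `prepare_step` runs: pool a layer's accumulated squared gradients, take the square root, fold it into a persistent EMA, and derive a "sunspike" ratio that sets a dynamic beta2 per layer. The pooled norm and EMA update appear verbatim in the diff; the sunspike-to-beta2 mapping below is one plausible form, shown only to make the mechanism concrete, and is not necessarily the formula adv_optm uses.

```python
# Sketch of a per-layer dynamic-beta2 computation in the Kourkoutas-beta
# style. The first two lines mirror the diff; the mapping from the
# "sunspike" ratio to beta2 is an assumption.
import torch

def dynamic_beta2(sum_sq_accumulator: torch.Tensor,
                  r_ema: torch.Tensor,
                  ema_alpha: float = 0.9,
                  beta2_min: float = 0.88,
                  beta2_max: float = 0.999,
                  eps: float = 1e-8) -> float:
    pooled_grad_norm = torch.sqrt(sum_sq_accumulator)                     # as in the diff
    r_ema.mul_(ema_alpha).add_(pooled_grad_norm, alpha=1.0 - ema_alpha)   # as in the diff
    sun = pooled_grad_norm / (r_ema + eps)        # "sunspike" ratio (assumed form)
    sun = sun / (1.0 + sun)                       # squash to [0, 1) (assumed)
    # Spiky layers (large sun) get a smaller, more reactive beta2;
    # quiet layers stay near beta2_max.
    return (beta2_max - (beta2_max - beta2_min) * sun).item()
```

After this release only the mean of the per-layer beta2 values is retained in `last_beta2_stats`, and the periodic console logging (`k_logging`, `print_layer_info`) has been removed entirely.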