returnn 1.20250204.160236__py3-none-any.whl → 1.20250206.144022__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of returnn might be problematic. Click here for more details.

returnn/PKG-INFO CHANGED
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: returnn
3
- Version: 1.20250204.160236
3
+ Version: 1.20250206.144022
4
4
  Summary: The RWTH extensible training framework for universal recurrent neural networks
5
5
  Home-page: https://github.com/rwth-i6/returnn/
6
6
  Author: Albert Zeyer
@@ -1,2 +1,2 @@
1
- version = '1.20250204.160236'
2
- long_version = '1.20250204.160236+git.e147886'
1
+ version = '1.20250206.144022'
2
+ long_version = '1.20250206.144022+git.550e757'
@@ -35,6 +35,7 @@ class TransformerEncoder(rf.Module):
35
35
  layer: Optional[Union[TransformerEncoderLayer, rf.Module, type, Dict[str, Any], Any]] = None,
36
36
  layer_opts: Optional[Dict[str, Any]] = None,
37
37
  embed_dim: Optional[Dim] = None,
38
+ input_embedding: Union[None, rf.Module, type, Dict[str, Any]] = rf.Embedding,
38
39
  input_embedding_scale: float = None,
39
40
  input_dropout: float = None,
40
41
  sequential=rf.Sequential,
@@ -53,6 +54,7 @@ class TransformerEncoder(rf.Module):
53
54
  :param layer: an instance of :class:`TransformerEncoderLayer` or similar
54
55
  :param layer_opts: options for the encoder layer
55
56
  :param embed_dim: if given, will first have an embedding [vocab,embed] and then a linear [embed,model].
57
+ :param input_embedding:
56
58
  :param input_embedding_scale:
57
59
  :param input_dropout:
58
60
  :param sequential:
@@ -77,9 +79,15 @@ class TransformerEncoder(rf.Module):
77
79
  self.model_dim = model_dim
78
80
  self.embed_dim = embed_dim
79
81
 
80
- # We could make this optional or configurable if we ever need to.
81
- # Or maybe you would just have another separate implementation of this module then...
82
- self.input_embedding = rf.Embedding(vocab_dim, embed_dim or model_dim)
82
+ if input_embedding is None or isinstance(input_embedding, rf.Module):
83
+ pass
84
+ elif isinstance(input_embedding, type):
85
+ input_embedding: rf.Embedding = input_embedding(vocab_dim, embed_dim or model_dim)
86
+ elif isinstance(input_embedding, dict):
87
+ input_embedding = rf.build_from_dict(input_embedding, vocab_dim, embed_dim or model_dim)
88
+ else:
89
+ raise TypeError(f"unexpected input_embedding {input_embedding!r} type {type(input_embedding)}")
90
+ self.input_embedding = input_embedding
83
91
 
84
92
  self.input_embedding_proj = None
85
93
  if embed_dim:
@@ -88,17 +96,13 @@ class TransformerEncoder(rf.Module):
88
96
  if pos_enc is None:
89
97
  pass
90
98
  elif isinstance(pos_enc, dict):
91
- pos_enc = rf.build_from_dict(
92
- pos_enc, feat_dim=embed_dim or model_dim, dtype=self.input_embedding.weight.dtype
93
- )
99
+ pos_enc = rf.build_from_dict(pos_enc, feat_dim=embed_dim or model_dim, dtype=rf.get_default_float_dtype())
94
100
  elif isinstance(pos_enc, rf.Module):
95
101
  pass
96
102
  elif isinstance(pos_enc, FunctionType):
97
- pos_enc = functools.partial(
98
- pos_enc, feat_dim=embed_dim or model_dim, dtype=self.input_embedding.weight.dtype
99
- )
103
+ pos_enc = functools.partial(pos_enc, feat_dim=embed_dim or model_dim, dtype=rf.get_default_float_dtype())
100
104
  else:
101
- raise TypeError(f"unexpected pos_enc type {pos_enc!r}")
105
+ raise TypeError(f"unexpected pos_enc {pos_enc!r} type {type(pos_enc)}")
102
106
  self.pos_enc = pos_enc
103
107
  if input_embedding_scale is None:
104
108
  input_embedding_scale = model_dim.dimension**0.5
@@ -157,7 +161,11 @@ class TransformerEncoder(rf.Module):
157
161
  :param collected_outputs:
158
162
  :return: final encoder output, after final layer norm
159
163
  """
160
- decoded = self.input_embedding(source) * self.input_embedding_scale
164
+ if self.input_embedding is not None:
165
+ decoded = self.input_embedding(source) * self.input_embedding_scale
166
+ else:
167
+ assert self.model_dim in source.dims
168
+ decoded = source
161
169
  if self.pos_enc is not None:
162
170
  decoded = decoded + self.pos_enc(spatial_dim=spatial_dim)
163
171
  decoded = rf.dropout(decoded, self.input_dropout)
@@ -139,7 +139,7 @@ class TensorDict:
139
139
  """
140
140
  visited_dims = set()
141
141
  for key, value in self.data.items():
142
- assert key in raw_tensor_dict
142
+ assert key in raw_tensor_dict, f"key {key} not in raw_tensor_dict {list(raw_tensor_dict.keys())}"
143
143
  value.raw_tensor = raw_tensor_dict[key]
144
144
  for i, dim in enumerate(value.dims):
145
145
  dim: Dim
returnn/torch/engine.py CHANGED
@@ -505,12 +505,35 @@ class Engine(EngineBase):
505
505
  file=log.v1,
506
506
  )
507
507
 
508
+ print("Checking for inf/nan in model parameters...", file=log.v1)
509
+ count_nan_inf_params = 0
510
+ for name, param in self._pt_model.named_parameters():
511
+ got_nan_inf_t = torch.stack([torch.isnan(param).any(), torch.isinf(param).any()]).cpu()
512
+ got_nan = got_nan_inf_t[0].item()
513
+ got_inf = got_nan_inf_t[1].item()
514
+ if got_nan or got_inf:
515
+ s = "/".join([s_ for s_, b in [("nan", got_nan), ("inf", got_inf)] if b])
516
+ print(f" {name} {param}: {s}", file=log.v1)
517
+ count_nan_inf_params += 1
518
+ if count_nan_inf_params == 0:
519
+ print("(No inf/nan in model parameters.)", file=log.v1)
520
+
508
521
  def _debug_func() -> torch.Tensor:
509
522
  self._run_step(extern_data, train_flag=True, train_func=True)
510
- return rf.get_run_ctx().total_loss()
523
+ loss = rf.get_run_ctx().total_loss()
524
+ assert isinstance(loss, Tensor)
525
+ return loss.raw_tensor
511
526
 
512
527
  print("Running debug_inf_nan...", file=log.v1)
513
528
  debug_inf_nan(_debug_func, with_grad=True)
529
+ if count_nan_inf_params > 0 and self.global_train_step == 1:
530
+ print(
531
+ "This was the second step, so likely the first step grad was broken."
532
+ " Try again with reset model...",
533
+ file=log.v1,
534
+ )
535
+ self._load_model()
536
+ debug_inf_nan(_debug_func, with_grad=True)
514
537
  raise Exception(f"Inf/nan score in step {step_idx}.")
515
538
 
516
539
  step_idx += 1
@@ -52,6 +52,7 @@ def debug_inf_nan(
52
52
  *,
53
53
  with_grad: bool = False,
54
54
  report_every_op_call: bool = True,
55
+ stop_reporting_after_first_inf_nan: bool = True,
55
56
  file: Optional[Union[TextIO, TextIOBase]] = None,
56
57
  ):
57
58
  """
@@ -61,6 +62,7 @@ def debug_inf_nan(
61
62
  and we will call `loss = func(); loss.backward()`.
62
63
  :param with_grad: whether to compute and debug gradients for inf/nan.
63
64
  :param report_every_op_call: whether to report every op call.
65
+ :param stop_reporting_after_first_inf_nan: whether to stop reporting after the first inf/nan.
64
66
  :param file: where to write the output to. Default is stdout.
65
67
  """
66
68
 
@@ -69,13 +71,18 @@ def debug_inf_nan(
69
71
 
70
72
  # noinspection PyUnresolvedReferences,PyProtectedMember
71
73
  cur_frame: FrameType = sys._getframe()
72
- trace_ops = _TraceOps(root_frame=cur_frame, file=file, report_every_op_call=report_every_op_call)
74
+ trace_ops = _TraceOps(
75
+ root_frame=cur_frame,
76
+ file=file,
77
+ report_every_op_call=report_every_op_call,
78
+ stop_reporting_after_first_inf_nan=stop_reporting_after_first_inf_nan,
79
+ )
73
80
 
74
81
  if with_grad:
75
-
76
82
  with torch.autograd.detect_anomaly():
77
83
  with trace_ops: # currently only for forward (but we might want to trace the backward too)
78
84
  loss = func()
85
+ file.flush() # the backward detect_anomaly might screw up the output otherwise
79
86
  try:
80
87
  loss.backward()
81
88
  except RuntimeError as exc:
@@ -89,23 +96,46 @@ def debug_inf_nan(
89
96
 
90
97
  # For efficiency, and to be less spammy
91
98
  _TraceFuncNameBlacklist = {
92
- "aten::detach",
93
99
  "aten::zeros_like",
94
100
  "aten::ones_like",
101
+ "aten::full",
102
+ "aten::scalar_tensor", # when we deliberately create a scalar inf tensor
103
+ "aten::_local_scalar_dense",
104
+ "aten::where.self", # when we intentionally mask with inf
105
+ "aten::detach",
106
+ "aten::_to_copy",
107
+ "aten::clone",
108
+ "aten::stack",
109
+ "aten::view",
110
+ "aten::_unsafe_view",
111
+ "aten::permute",
112
+ "aten::t",
113
+ "aten::split_with_sizes",
114
+ "aten::slice.Tensor",
115
+ "aten::select.int",
95
116
  }
96
117
 
97
118
 
98
119
  class _TraceOps(TorchDispatchMode):
99
- def __init__(self, *, root_frame: FrameType, file: Union[TextIO, TextIOBase], report_every_op_call: bool = True):
120
+ def __init__(
121
+ self,
122
+ *,
123
+ root_frame: FrameType,
124
+ file: Union[TextIO, TextIOBase],
125
+ report_every_op_call: bool = True,
126
+ stop_reporting_after_first_inf_nan: bool = True,
127
+ ):
100
128
  super().__init__()
101
129
  self.root_frame = root_frame
102
130
  self.file = file
131
+ self.enabled = True
103
132
  self.report_every_op_call = report_every_op_call
133
+ self.stop_reporting_after_first_inf_nan = stop_reporting_after_first_inf_nan
104
134
 
105
135
  def __torch_dispatch__(self, func, types, args=(), kwargs=None):
106
136
  if kwargs is None:
107
137
  kwargs = {}
108
- if func.name() in _TraceFuncNameBlacklist:
138
+ if not self.enabled or func.name() in _TraceFuncNameBlacklist:
109
139
  return func(*args, **kwargs)
110
140
  if self.report_every_op_call:
111
141
  print(f"--- op {func.name()}", file=self.file)
@@ -121,6 +151,8 @@ class _TraceOps(TorchDispatchMode):
121
151
  traceback.print_list(
122
152
  _extract_stack_up_to(skip_top_num_frames=1, root_frame=self.root_frame), file=self.file
123
153
  )
154
+ if self.stop_reporting_after_first_inf_nan:
155
+ self.enabled = False
124
156
  return out
125
157
 
126
158
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: returnn
3
- Version: 1.20250204.160236
3
+ Version: 1.20250206.144022
4
4
  Summary: The RWTH extensible training framework for universal recurrent neural networks
5
5
  Home-page: https://github.com/rwth-i6/returnn/
6
6
  Author: Albert Zeyer
@@ -1,9 +1,9 @@
1
- returnn/PKG-INFO,sha256=tVaxTG1KNp2EVd4-m0vHijnHu6CbjU8wpugsPQKty_M,5215
1
+ returnn/PKG-INFO,sha256=vBdT0ayV-Q8OjPdp1xlJt0CiopUZKNWKwKuadpiKHDk,5215
2
2
  returnn/__init__.py,sha256=biBtRsM0WZ406vShaeH-9WFoqJ8XwTbn6g0EeFJ7l8E,1012
3
3
  returnn/__main__.py,sha256=qBFbuB1yN3adgVM5pXt2-Yq9vorjRNchNPL8kDKx44M,31752
4
4
  returnn/__old_mod_loader__.py,sha256=nvsNY-xELdS_IPNkv66Q9Rmvg4dbGW0-EBRDcCmctos,7654
5
5
  returnn/__setup__.py,sha256=22kQn2fh11iPM0hLb2Fy5sLmoU1JGvmDxXRYuRgQkwU,4659
6
- returnn/_setup_info_generated.py,sha256=6h1pDgIqJ8MnrBvdiNdCUPZ56ZcnQn3Wg5MSw9gwXGs,77
6
+ returnn/_setup_info_generated.py,sha256=uAkEz6DVwoN42Nh2WLNsoE4lJ0BtlRznPPlXMWKxJQo,77
7
7
  returnn/config.py,sha256=3tmKhB6FnQZaNdtcYsiB61JnEY--iZ2qmJ4yq0b6tE0,29140
8
8
  returnn/forward_iface.py,sha256=A_OJiaXsX4MlXQRzST86ylyxSUZbC402PQL1REcqHjM,911
9
9
  returnn/learning_rate_control.py,sha256=ZvWryAn_tv9DhV8sh1LV3eE34Yltl3On3mYZAG4hR9s,34684
@@ -141,7 +141,7 @@ returnn/frontend/encoder/base.py,sha256=A759EwCYAmSi-kzXz1vaTjR2l59TvNGQlzaNdp3U
141
141
  returnn/frontend/encoder/conformer.py,sha256=YPtH0Clx2QrKOoxbtUSkYR7QiDp-EYmoOcGc_gc2ZEk,19778
142
142
  returnn/frontend/encoder/conformer_v2.py,sha256=vAYdT8m2Zzg3IIZZafeccClFHU1_c9T-EgBOsHadQPA,7701
143
143
  returnn/frontend/encoder/e_branchformer.py,sha256=zEla-iXJciK7bCenlTwsPB8dXo_VPMlFm2xc3op_lPY,12278
144
- returnn/frontend/encoder/transformer.py,sha256=k-tJjp5ymJ7QzKjyQdKVBfHVCw1-mJTfIzhIpGosxDs,11066
144
+ returnn/frontend/encoder/transformer.py,sha256=Jj0mF1D2MohOk-9sGYdsLtVW_86fwoq4pKWCdPMvPR8,11580
145
145
  returnn/import_/__init__.py,sha256=L2dKxWCcn0fz_7H7OS-zw5i5Yrljjjh_d61dEcFP_JY,243
146
146
  returnn/import_/common.py,sha256=0cmvyd7NtMLH55IskEoSDtkcMwChxLhauV2UZ4mK68I,8148
147
147
  returnn/import_/git.py,sha256=IXBVOybQAHf5OlMfVY6oZ-7eiDYPG0OR7MyDJKcVHSM,13961
@@ -162,7 +162,7 @@ returnn/tensor/control_flow_ctx.py,sha256=L9e32AfYDUDgsEDHL07thSFyYFqwhyVSqzE_bM
162
162
  returnn/tensor/dim.py,sha256=652DlcSe6o6l5OyY5xt9Yigij_Xry-ToG9AemMX3roY,4208
163
163
  returnn/tensor/marked_dim.py,sha256=Ae2hQIb5QixRU2gDhQEm0tmYt8TmomWoGERB414jR8o,1884
164
164
  returnn/tensor/tensor.py,sha256=bisF7j3rU5Rvx8C8S57C9hGo2jgWwTaQ6wc_Db7Mwpw,9087
165
- returnn/tensor/tensor_dict.py,sha256=0QLUnIqc0za3bk2ytU4Cdmri2Z732O6BOc6hW1dYE8Q,7078
165
+ returnn/tensor/tensor_dict.py,sha256=WTqMefemeHQG381MVUjvHMmYVd2TV9IQ0qU4i_XJi3c,7146
166
166
  returnn/tensor/utils.py,sha256=B6_XyNTXPIyLxWk061Qo-Md8_DnINGdVwpXJF6pahBk,9772
167
167
  returnn/tf/__init__.py,sha256=X4g2LFCFTl0uiybMRkfBY8AYkgMa6HX0vVxxTk0nMiE,88
168
168
  returnn/tf/compat.py,sha256=NkAkdlR37m2d9qh3i33sIfEGilOaFBeCofAQpQwnZpY,1632
@@ -207,7 +207,7 @@ returnn/tf/util/open_fst.py,sha256=sZRDw4TbxvhGqpGdUJWy1ebvlZm4_RPhygpRw9uLAOQ,1
207
207
  returnn/torch/README.md,sha256=jzJ2FpOHW02vxN69yKaV97C9LI-hmvjBglKfdZXIDdc,85
208
208
  returnn/torch/__init__.py,sha256=MHEUyNHB20Vy89uKAqZoj6FxJKF1Gq3HW-i6ra1pNcI,24
209
209
  returnn/torch/distributed.py,sha256=i13cUVjI7GxpO0TAresrNyCM0ZBAaf-cXNr09Fmg_2k,6266
210
- returnn/torch/engine.py,sha256=eWWHk_wOEV8ysLx8VkQHnA2613uOkEx1-Ibp-YGDncw,73615
210
+ returnn/torch/engine.py,sha256=Zd3ePKFSi5fkvV1FxaYn0QGgu5cag_ocKPwFmKglf3I,75095
211
211
  returnn/torch/updater.py,sha256=GqtBvZpElPVMm0lq84JPl4NVLFFETZAzAbR0rTomSao,28249
212
212
  returnn/torch/data/__init__.py,sha256=6cLNEi8KoGI12PF6akN7mI_mtjlx-0hcQAfMYoExwik,132
213
213
  returnn/torch/data/extern_data.py,sha256=_uT_9_gd5HIh1IoRsrebVG-nufSnb7fgC5jyU05GxJg,7580
@@ -226,7 +226,7 @@ returnn/torch/optim/lion.py,sha256=jV_qfwyyO5HAgqW94caap-ALkVjU688RpRgkZyLNZ5Y,5
226
226
  returnn/torch/util/README.md,sha256=AW-6ueWhgcwDcm57md6sm227QXNkvLnlRLwaH7NlS-w,193
227
227
  returnn/torch/util/__init__.py,sha256=AOXYUjzPm0XrzFJCPAXo9Jj_FvqD1XH3FfKtho80Vl8,26
228
228
  returnn/torch/util/array_.py,sha256=ell3VZvn01SLtF9Pw2fvPzFNO-XDQ7tSB9VCrVSKmSA,2556
229
- returnn/torch/util/debug_inf_nan.py,sha256=UnCU-Yt0UC2vzpbXVs3cDUrR4pa3F6X_CWHEBaKqDcM,5113
229
+ returnn/torch/util/debug_inf_nan.py,sha256=v0IzLy4kRKBWChSV70O4x829QtEuXMwB9mBqAyE4O2o,6223
230
230
  returnn/torch/util/diagnose_gpu.py,sha256=PYMmSk7iQ-jC3RXKKNXlYx1Q744C0LXqz0SB6ympwQg,5844
231
231
  returnn/torch/util/exception_helper.py,sha256=4e7YEf9D42aAUEkM3uSjnOxpNEYgtyPSpNV0-1L6PSU,4319
232
232
  returnn/torch/util/gradient_checkpoint.py,sha256=iLy-FB65DC8O6LxzmMvFjnSdpIVpko87ppIvRKAbtpQ,27995
@@ -253,8 +253,8 @@ returnn/util/sig_proc.py,sha256=Tjz0VOAVyqu2qDCF5HZ1JjALjcFsHcNkcd96WgZeKfE,7265
253
253
  returnn/util/task_system.py,sha256=y4sMVXQ25Qd2z0rx03uOlXlkE-jbCYC1Sjfn-XlraVU,26003
254
254
  returnn/util/train_proc_manager.py,sha256=Pjht28k6uz6BNQ47uW6Gf880iyq5q4wx7P_K2tmoAM8,3266
255
255
  returnn/util/watch_memory.py,sha256=BR5P2kvBN6UI81cE0_1WAA6Hd1SByLbBaiDxvLhPOew,4213
256
- returnn-1.20250204.160236.dist-info/LICENSE,sha256=ywBD_U2aD4vpuoIgNAsjIGBYydl0tVKll3De0Z8s77c,11041
257
- returnn-1.20250204.160236.dist-info/METADATA,sha256=tVaxTG1KNp2EVd4-m0vHijnHu6CbjU8wpugsPQKty_M,5215
258
- returnn-1.20250204.160236.dist-info/WHEEL,sha256=P9jw-gEje8ByB7_hXoICnHtVCrEwMQh-630tKvQWehc,91
259
- returnn-1.20250204.160236.dist-info/top_level.txt,sha256=Lsn4WZc5Pbfk0-xDQOgnFCxOoqxL4CyeM3N1TFbJncw,8
260
- returnn-1.20250204.160236.dist-info/RECORD,,
256
+ returnn-1.20250206.144022.dist-info/LICENSE,sha256=ywBD_U2aD4vpuoIgNAsjIGBYydl0tVKll3De0Z8s77c,11041
257
+ returnn-1.20250206.144022.dist-info/METADATA,sha256=vBdT0ayV-Q8OjPdp1xlJt0CiopUZKNWKwKuadpiKHDk,5215
258
+ returnn-1.20250206.144022.dist-info/WHEEL,sha256=P9jw-gEje8ByB7_hXoICnHtVCrEwMQh-630tKvQWehc,91
259
+ returnn-1.20250206.144022.dist-info/top_level.txt,sha256=Lsn4WZc5Pbfk0-xDQOgnFCxOoqxL4CyeM3N1TFbJncw,8
260
+ returnn-1.20250206.144022.dist-info/RECORD,,